diff --git a/doc/src/index-format.md b/doc/src/index-format.md deleted file mode 100644 index e21025653..000000000 --- a/doc/src/index-format.md +++ /dev/null @@ -1,50 +0,0 @@ - -# Managed files -+----------+-----------+-------------------+ -| content | footer | footer_len: u32 | -+----------+-----------+-------------------+ - -# Term Dictionary (Composite File) - -+---------+---------------------------+------------------------+ -| fst | term_info_store | footer_len: u64 | -+---------+---------------------------+------------------------+ - -During a merge the term info store need to fit in memory. -It has a cost of n bytes per term. - -# term_info_store -+-------------------+---------------------------+------------------------+ -| len_block_meta | block_meta | term_infos | -+-------------------+---------------------------+------------------------+ - -# inverted_index -+------------------------+---------------------------+------------------------+ -| total_num_tokens: u64 | posting_lists.. | term_infos | -+------------------------+---------------------------+------------------------+ - -# postings lists -+------------------------+---------------------------+------------------------+ -| -+ - -# composite file -+----------------+-----+----------------+----------------------+----------------+ -| field file 1 | ... | field field n |composite file footer | footer len: u32| -+----------------+-----+----------------+----------------------+----------------+ - -# composite file footer - -+-----------------+---------------------------------------+ -|num fields: vint | (file_addr, offset_delta: vint) []... | -+-----------------+---------------------------------------+ - -# FileAddr -+--------------+--------------+ -| field: u32 | idx: VInt | -+--------------+--------------+ - -# Posting lists -+-----------------------------------------+ -| skip_reader -+-----------------------------------------+ \ No newline at end of file diff --git a/src/core/inverted_index_reader.rs b/src/core/inverted_index_reader.rs index b9173bbc7..e1d76edd7 100644 --- a/src/core/inverted_index_reader.rs +++ b/src/core/inverted_index_reader.rs @@ -90,9 +90,9 @@ impl InvertedIndexReader { term_info: &TermInfo, block_postings: &mut BlockSegmentPostings, ) -> io::Result<()> { - let offset = term_info.postings_start_offset as usize; - let end_source = term_info.postings_end_offset as usize; - let postings_slice = self.postings_file_slice.slice(offset, end_source); + let start_offset = term_info.postings_start_offset as usize; + let stop_offset = term_info.postings_stop_offset as usize; + let postings_slice = self.postings_file_slice.slice(start_offset, stop_offset); block_postings.reset(term_info.doc_freq, postings_slice.read_bytes()?); Ok(()) } @@ -123,7 +123,7 @@ impl InvertedIndexReader { ) -> io::Result { let postings_data = self.postings_file_slice.slice( term_info.postings_start_offset as usize, - term_info.postings_end_offset as usize, + term_info.postings_stop_offset as usize, ); BlockSegmentPostings::open( term_info.doc_freq, diff --git a/src/fastfield/bytes/mod.rs b/src/fastfield/bytes/mod.rs index c1c9f2229..16b268e64 100644 --- a/src/fastfield/bytes/mod.rs +++ b/src/fastfield/bytes/mod.rs @@ -98,10 +98,9 @@ mod tests { let field = searcher.schema().get_field("string_bytes").unwrap(); let term = Term::from_field_bytes(field, b"lucene".as_ref()); let term_query = TermQuery::new(term, IndexRecordOption::Basic); - let term_weight = term_query.specialized_weight(&searcher, false)?; - let term_scorer_err = term_weight.specialized_scorer(searcher.segment_reader(0), 1.0); + let term_weight_err = term_query.specialized_weight(&searcher, false); assert!(matches!( - term_scorer_err, + term_weight_err, Err(crate::TantivyError::SchemaError(_)) )); Ok(()) diff --git a/src/fieldnorm/reader.rs b/src/fieldnorm/reader.rs index bd8ed6628..b93e79824 100644 --- a/src/fieldnorm/reader.rs +++ b/src/fieldnorm/reader.rs @@ -66,10 +66,21 @@ pub struct FieldNormReader { } impl FieldNormReader { + /// Creates a `FieldNormReader` with a constant fieldnorm. + pub fn constant(num_docs: u32, fieldnorm: u32) -> FieldNormReader { + let fieldnorm_id = fieldnorm_to_id(fieldnorm); + let field_norms_data = OwnedBytes::new(vec![fieldnorm_id; num_docs as usize]); + FieldNormReader::new(field_norms_data) + } + /// Opens a field norm reader given its file. pub fn open(fieldnorm_file: FileSlice) -> crate::Result { let data = fieldnorm_file.read_bytes()?; - Ok(FieldNormReader { data }) + Ok(FieldNormReader::new(data)) + } + + fn new(data: OwnedBytes) -> Self { + FieldNormReader { data } } /// Returns the number of documents in this segment. diff --git a/src/postings/serializer.rs b/src/postings/serializer.rs index e95bfda73..9e8d9c39a 100644 --- a/src/postings/serializer.rs +++ b/src/postings/serializer.rs @@ -177,15 +177,16 @@ impl<'a> FieldSerializer<'a> { } fn current_term_info(&self) -> TermInfo { - let positions_idx = self - .positions_serializer_opt - .as_ref() - .map(PositionSerializer::positions_idx) - .unwrap_or(0u64); + let positions_idx = + if let Some(positions_serializer) = self.positions_serializer_opt.as_ref() { + positions_serializer.positions_idx() + } else { + 0u64 + }; TermInfo { doc_freq: 0, postings_start_offset: self.postings_serializer.addr(), - postings_end_offset: 0u64, + postings_stop_offset: 0u64, positions_idx, } } @@ -241,8 +242,7 @@ impl<'a> FieldSerializer<'a> { if self.term_open { self.postings_serializer .close_term(self.current_term_info.doc_freq)?; - let end_offset = self.postings_serializer.addr(); - self.current_term_info.postings_end_offset = end_offset; + self.current_term_info.postings_stop_offset = self.postings_serializer.addr(); self.term_dictionary_builder .insert_value(&self.current_term_info)?; self.term_open = false; diff --git a/src/postings/term_info.rs b/src/postings/term_info.rs index 59278eabc..4e08f2e9f 100644 --- a/src/postings/term_info.rs +++ b/src/postings/term_info.rs @@ -9,15 +9,16 @@ pub struct TermInfo { pub doc_freq: u32, /// Start offset of the posting list within the postings (`.idx`) file. pub postings_start_offset: u64, - /// End offset of the posting list within the postings (`.idx`) file. - pub postings_end_offset: u64, + /// Stop offset of the posting list within the postings (`.idx`) file. + /// The byte range is `[start_offset..stop_offset)`. + pub postings_stop_offset: u64, /// Start offset of the first block within the position (`.pos`) file. pub positions_idx: u64, } impl TermInfo { pub(crate) fn posting_num_bytes(&self) -> u32 { - let num_bytes = self.postings_end_offset - self.postings_start_offset; + let num_bytes = self.postings_stop_offset - self.postings_start_offset; assert!(num_bytes <= std::u32::MAX as u64); num_bytes as u32 } @@ -44,12 +45,12 @@ impl BinarySerializable for TermInfo { let doc_freq = u32::deserialize(reader)?; let postings_start_offset = u64::deserialize(reader)?; let postings_num_bytes = u32::deserialize(reader)?; - let postings_end_offset = postings_start_offset + u64::from(postings_num_bytes); + let postings_stop_offset = postings_start_offset + u64::from(postings_num_bytes); let positions_idx = u64::deserialize(reader)?; Ok(TermInfo { doc_freq, postings_start_offset, - postings_end_offset, + postings_stop_offset, positions_idx, }) } diff --git a/src/query/term_query/term_query.rs b/src/query/term_query/term_query.rs index a653be561..260170dff 100644 --- a/src/query/term_query/term_query.rs +++ b/src/query/term_query/term_query.rs @@ -93,6 +93,13 @@ impl TermQuery { scoring_enabled: bool, ) -> crate::Result { let term = self.term.clone(); + let field_entry = searcher.schema().get_field_entry(term.field()); + if !field_entry.is_indexed() { + return Err(crate::TantivyError::SchemaError(format!( + "Field {:?} is not indexed", + field_entry.name() + ))); + } let bm25_weight = BM25Weight::for_terms(searcher, &[term])?; let index_record_option = if scoring_enabled { self.index_record_option @@ -103,6 +110,7 @@ impl TermQuery { self.term.clone(), index_record_option, bm25_weight, + scoring_enabled, )) } } diff --git a/src/query/term_query/term_weight.rs b/src/query/term_query/term_weight.rs index 936c6e7ca..fb1e8e0fa 100644 --- a/src/query/term_query/term_weight.rs +++ b/src/query/term_query/term_weight.rs @@ -1,6 +1,7 @@ use super::term_scorer::TermScorer; use crate::core::SegmentReader; use crate::docset::DocSet; +use crate::fieldnorm::FieldNormReader; use crate::postings::SegmentPostings; use crate::query::bm25::BM25Weight; use crate::query::explanation::does_not_match; @@ -15,6 +16,7 @@ pub struct TermWeight { term: Term, index_record_option: IndexRecordOption, similarity_weight: BM25Weight, + scoring_enabled: bool, } impl Weight for TermWeight { @@ -87,11 +89,13 @@ impl TermWeight { term: Term, index_record_option: IndexRecordOption, similarity_weight: BM25Weight, + scoring_enabled: bool, ) -> TermWeight { TermWeight { term, index_record_option, similarity_weight, + scoring_enabled, } } @@ -102,7 +106,11 @@ impl TermWeight { ) -> crate::Result { let field = self.term.field(); let inverted_index = reader.inverted_index(field)?; - let fieldnorm_reader = reader.get_fieldnorms_reader(field)?; + let fieldnorm_reader = if self.scoring_enabled { + reader.get_fieldnorms_reader(field)? + } else { + FieldNormReader::constant(reader.max_doc(), 1) + }; let similarity_weight = self.similarity_weight.boost_by(boost); let postings_opt: Option = inverted_index.read_postings(&self.term, self.index_record_option)?; diff --git a/src/termdict/mod.rs b/src/termdict/mod.rs index d1b2fb564..5a105a17d 100644 --- a/src/termdict/mod.rs +++ b/src/termdict/mod.rs @@ -49,7 +49,7 @@ mod tests { TermInfo { doc_freq: term_ord as u32, postings_start_offset: offset(term_ord), - postings_end_offset: offset(term_ord + 1), + postings_stop_offset: offset(term_ord + 1), positions_idx: offset(term_ord) * 2u64, } } @@ -199,7 +199,6 @@ mod tests { // term requires more than 16bits term_dictionary_builder.insert("abcdefghijklmnopqrstuvwxy", &make_term_info(1))?; term_dictionary_builder.insert("abcdefghijklmnopqrstuvwxyz", &make_term_info(2))?; - term_dictionary_builder.insert("abr", &make_term_info(2))?; term_dictionary_builder.insert("abr", &make_term_info(3))?; term_dictionary_builder.finish()? }; @@ -209,7 +208,6 @@ mod tests { assert!(kv_stream.advance()); assert_eq!(kv_stream.key(), "abcdefghijklmnopqrstuvwxy".as_bytes()); assert_eq!(kv_stream.value(), &make_term_info(1)); - dbg!(make_term_info(1)); assert!(kv_stream.advance()); assert_eq!(kv_stream.key(), "abcdefghijklmnopqrstuvwxyz".as_bytes()); assert_eq!(kv_stream.value(), &make_term_info(2)); diff --git a/src/termdict/term_info_store.rs b/src/termdict/term_info_store.rs index f80773c1a..20b709a2f 100644 --- a/src/termdict/term_info_store.rs +++ b/src/termdict/term_info_store.rs @@ -61,23 +61,26 @@ impl TermInfoBlockMeta { fn deserialize_term_info(&self, data: &[u8], inner_offset: usize) -> TermInfo { assert!(inner_offset < BLOCK_LEN - 1); let num_bits = self.num_bits() as usize; - let mut cursor = num_bits * inner_offset; - let postings_start_offset = extract_bits(data, cursor, self.postings_offset_nbits); - let postings_end_offset = self.ref_term_info.postings_start_offset - + extract_bits(data, cursor + num_bits, self.postings_offset_nbits); - cursor += self.postings_offset_nbits as usize; + let posting_start_addr = num_bits * inner_offset; + // the stop offset is the start offset of the next term info. + let posting_stop_addr = posting_start_addr + num_bits; + let doc_freq_addr = posting_start_addr + self.postings_offset_nbits as usize; + let positions_idx_addr = doc_freq_addr + self.doc_freq_nbits as usize; - let doc_freq = extract_bits(data, cursor, self.doc_freq_nbits) as u32; - cursor += self.doc_freq_nbits as usize; - - let positions_idx = extract_bits(data, cursor, self.positions_idx_nbits); + let postings_start_offset = self.ref_term_info.postings_start_offset + + extract_bits(data, posting_start_addr, self.postings_offset_nbits); + let postings_stop_offset = self.ref_term_info.postings_start_offset + + extract_bits(data, posting_stop_addr, self.postings_offset_nbits); + let doc_freq = extract_bits(data, doc_freq_addr, self.doc_freq_nbits) as u32; + let positions_idx = self.ref_term_info.positions_idx + + extract_bits(data, positions_idx_addr, self.positions_idx_nbits); TermInfo { doc_freq, - postings_start_offset: postings_start_offset + self.ref_term_info.postings_start_offset, - postings_end_offset, - positions_idx: positions_idx + self.ref_term_info.positions_idx, + postings_start_offset, + postings_stop_offset, + positions_idx, } } } @@ -197,15 +200,15 @@ impl TermInfoStoreWriter { } else { return Ok(()); }; - let postings_end_offset = - last_term_info.postings_end_offset - ref_term_info.postings_start_offset; + let postings_stop_offset = + last_term_info.postings_stop_offset - ref_term_info.postings_start_offset; for term_info in &mut self.term_infos[1..] { term_info.postings_start_offset -= ref_term_info.postings_start_offset; term_info.positions_idx -= ref_term_info.positions_idx; } let mut max_doc_freq: u32 = 0u32; - let max_postings_offset: u64 = postings_end_offset; + let max_postings_offset: u64 = postings_stop_offset; let max_positions_idx: u64 = last_term_info.positions_idx; for term_info in &self.term_infos[1..] { @@ -235,7 +238,7 @@ impl TermInfoStoreWriter { } bit_packer.write( - postings_end_offset, + postings_stop_offset, term_info_block_meta.postings_offset_nbits, &mut self.buffer_term_infos, )?; @@ -248,7 +251,7 @@ impl TermInfoStoreWriter { } pub fn write_term_info(&mut self, term_info: &TermInfo) -> io::Result<()> { - assert!(term_info.postings_end_offset >= term_info.postings_start_offset); + assert!(term_info.postings_stop_offset >= term_info.postings_start_offset); self.num_terms += 1u64; self.term_infos.push(term_info.clone()); if self.term_infos.len() >= BLOCK_LEN { @@ -312,7 +315,7 @@ mod tests { ref_term_info: TermInfo { doc_freq: 512, postings_start_offset: 51, - postings_end_offset: 57u64, + postings_stop_offset: 57u64, positions_idx: 3584, }, doc_freq_nbits: 10, @@ -335,7 +338,7 @@ mod tests { let term_info = TermInfo { doc_freq: i as u32, postings_start_offset: offset(i), - postings_end_offset: offset(i + 1), + postings_stop_offset: offset(i + 1), positions_idx: (i * 7) as u64, }; store_writer.write_term_info(&term_info)?;