diff --git a/src/core/inverted_index_reader.rs b/src/core/inverted_index_reader.rs
index dec2bf684..e1d76edd7 100644
--- a/src/core/inverted_index_reader.rs
+++ b/src/core/inverted_index_reader.rs
@@ -90,9 +90,9 @@ impl InvertedIndexReader {
         term_info: &TermInfo,
         block_postings: &mut BlockSegmentPostings,
     ) -> io::Result<()> {
-        let postings_slice = self
-            .postings_file_slice
-            .slice_from(term_info.postings_offset as usize);
+        let start_offset = term_info.postings_start_offset as usize;
+        let stop_offset = term_info.postings_stop_offset as usize;
+        let postings_slice = self.postings_file_slice.slice(start_offset, stop_offset);
         block_postings.reset(term_info.doc_freq, postings_slice.read_bytes()?);
         Ok(())
     }
@@ -121,8 +121,10 @@ impl InvertedIndexReader {
         term_info: &TermInfo,
         requested_option: IndexRecordOption,
     ) -> io::Result<BlockSegmentPostings> {
-        let offset = term_info.postings_offset as usize;
-        let postings_data = self.postings_file_slice.slice_from(offset);
+        let postings_data = self.postings_file_slice.slice(
+            term_info.postings_start_offset as usize,
+            term_info.postings_stop_offset as usize,
+        );
         BlockSegmentPostings::open(
             term_info.doc_freq,
             postings_data,
diff --git a/src/fastfield/bytes/mod.rs b/src/fastfield/bytes/mod.rs
index c1c9f2229..16b268e64 100644
--- a/src/fastfield/bytes/mod.rs
+++ b/src/fastfield/bytes/mod.rs
@@ -98,10 +98,9 @@ mod tests {
         let field = searcher.schema().get_field("string_bytes").unwrap();
         let term = Term::from_field_bytes(field, b"lucene".as_ref());
         let term_query = TermQuery::new(term, IndexRecordOption::Basic);
-        let term_weight = term_query.specialized_weight(&searcher, false)?;
-        let term_scorer_err = term_weight.specialized_scorer(searcher.segment_reader(0), 1.0);
+        let term_weight_err = term_query.specialized_weight(&searcher, false);
         assert!(matches!(
-            term_scorer_err,
+            term_weight_err,
             Err(crate::TantivyError::SchemaError(_))
         ));
         Ok(())
diff --git a/src/fieldnorm/reader.rs b/src/fieldnorm/reader.rs
index bd8ed6628..b93e79824 100644
--- a/src/fieldnorm/reader.rs
+++ b/src/fieldnorm/reader.rs
@@ -66,10 +66,21 @@ pub struct FieldNormReader {
 }
 
 impl FieldNormReader {
+    /// Creates a `FieldNormReader` with a constant fieldnorm.
+    pub fn constant(num_docs: u32, fieldnorm: u32) -> FieldNormReader {
+        let fieldnorm_id = fieldnorm_to_id(fieldnorm);
+        let field_norms_data = OwnedBytes::new(vec![fieldnorm_id; num_docs as usize]);
+        FieldNormReader::new(field_norms_data)
+    }
+
     /// Opens a field norm reader given its file.
     pub fn open(fieldnorm_file: FileSlice) -> crate::Result<FieldNormReader> {
         let data = fieldnorm_file.read_bytes()?;
-        Ok(FieldNormReader { data })
+        Ok(FieldNormReader::new(data))
+    }
+
+    fn new(data: OwnedBytes) -> Self {
+        FieldNormReader { data }
     }
 
     /// Returns the number of documents in this segment.
diff --git a/src/postings/mod.rs b/src/postings/mod.rs
index 57806109c..226adf46d 100644
--- a/src/postings/mod.rs
+++ b/src/postings/mod.rs
@@ -15,18 +15,14 @@ mod stacker;
 mod term_info;
 
 pub(crate) use self::block_search::BlockSearcher;
-
-pub(crate) use self::postings_writer::MultiFieldPostingsWriter;
-pub use self::serializer::{FieldSerializer, InvertedIndexSerializer};
-
-pub use self::postings::Postings;
-pub(crate) use self::skip::{BlockInfo, SkipReader};
-pub use self::term_info::TermInfo;
-
 pub use self::block_segment_postings::BlockSegmentPostings;
+pub use self::postings::Postings;
+pub(crate) use self::postings_writer::MultiFieldPostingsWriter;
 pub use self::segment_postings::SegmentPostings;
-
+pub use self::serializer::{FieldSerializer, InvertedIndexSerializer};
+pub(crate) use self::skip::{BlockInfo, SkipReader};
 pub(crate) use self::stacker::compute_table_size;
+pub use self::term_info::TermInfo;
 
 pub(crate) type UnorderedTermId = u64;
diff --git a/src/postings/serializer.rs b/src/postings/serializer.rs
index f6745e64e..9e8d9c39a 100644
--- a/src/postings/serializer.rs
+++ b/src/postings/serializer.rs
@@ -177,14 +177,16 @@ impl<'a> FieldSerializer<'a> {
     }
 
     fn current_term_info(&self) -> TermInfo {
-        let positions_idx = self
-            .positions_serializer_opt
-            .as_ref()
-            .map(PositionSerializer::positions_idx)
-            .unwrap_or(0u64);
+        let positions_idx =
+            if let Some(positions_serializer) = self.positions_serializer_opt.as_ref() {
+                positions_serializer.positions_idx()
+            } else {
+                0u64
+            };
         TermInfo {
             doc_freq: 0,
-            postings_offset: self.postings_serializer.addr(),
+            postings_start_offset: self.postings_serializer.addr(),
+            postings_stop_offset: 0u64,
             positions_idx,
         }
     }
@@ -238,10 +240,11 @@ impl<'a> FieldSerializer<'a> {
     /// using `VInt` encoding.
     pub fn close_term(&mut self) -> io::Result<()> {
         if self.term_open {
-            self.term_dictionary_builder
-                .insert_value(&self.current_term_info)?;
             self.postings_serializer
                 .close_term(self.current_term_info.doc_freq)?;
+            self.current_term_info.postings_stop_offset = self.postings_serializer.addr();
+            self.term_dictionary_builder
+                .insert_value(&self.current_term_info)?;
             self.term_open = false;
         }
         Ok(())
diff --git a/src/postings/term_info.rs b/src/postings/term_info.rs
index 55a414955..4e08f2e9f 100644
--- a/src/postings/term_info.rs
+++ b/src/postings/term_info.rs
@@ -7,35 +7,50 @@ use std::io;
 pub struct TermInfo {
     /// Number of documents in the segment containing the term
     pub doc_freq: u32,
-    /// Start offset within the postings (`.idx`) file.
-    pub postings_offset: u64,
+    /// Start offset of the posting list within the postings (`.idx`) file.
+    pub postings_start_offset: u64,
+    /// Stop offset of the posting list within the postings (`.idx`) file.
+    /// The byte range is `[start_offset..stop_offset)`.
+    pub postings_stop_offset: u64,
     /// Start offset of the first block within the position (`.pos`) file.
     pub positions_idx: u64,
 }
 
+impl TermInfo {
+    pub(crate) fn posting_num_bytes(&self) -> u32 {
+        let num_bytes = self.postings_stop_offset - self.postings_start_offset;
+        assert!(num_bytes <= std::u32::MAX as u64);
+        num_bytes as u32
+    }
+}
+
 impl FixedSize for TermInfo {
     /// Size required for the binary serialization of a `TermInfo` object.
     /// This is large, but in practise, `TermInfo` are encoded in blocks and
     /// only the first `TermInfo` of a block is serialized uncompressed.
     /// The subsequent `TermInfo` are delta encoded and bitpacked.
-    const SIZE_IN_BYTES: usize = u32::SIZE_IN_BYTES + 2 * u64::SIZE_IN_BYTES;
+    const SIZE_IN_BYTES: usize = 2 * u32::SIZE_IN_BYTES + 2 * u64::SIZE_IN_BYTES;
 }
 
 impl BinarySerializable for TermInfo {
     fn serialize<W: io::Write>(&self, writer: &mut W) -> io::Result<()> {
         self.doc_freq.serialize(writer)?;
-        self.postings_offset.serialize(writer)?;
+        self.postings_start_offset.serialize(writer)?;
+        self.posting_num_bytes().serialize(writer)?;
         self.positions_idx.serialize(writer)?;
         Ok(())
     }
 
     fn deserialize<R: io::Read>(reader: &mut R) -> io::Result<Self> {
         let doc_freq = u32::deserialize(reader)?;
-        let postings_offset = u64::deserialize(reader)?;
+        let postings_start_offset = u64::deserialize(reader)?;
+        let postings_num_bytes = u32::deserialize(reader)?;
+        let postings_stop_offset = postings_start_offset + u64::from(postings_num_bytes);
         let positions_idx = u64::deserialize(reader)?;
         Ok(TermInfo {
             doc_freq,
-            postings_offset,
+            postings_start_offset,
+            postings_stop_offset,
             positions_idx,
         })
     }
diff --git a/src/query/term_query/term_query.rs b/src/query/term_query/term_query.rs
index a653be561..260170dff 100644
--- a/src/query/term_query/term_query.rs
+++ b/src/query/term_query/term_query.rs
@@ -93,6 +93,13 @@ impl TermQuery {
         scoring_enabled: bool,
     ) -> crate::Result<TermWeight> {
         let term = self.term.clone();
+        let field_entry = searcher.schema().get_field_entry(term.field());
+        if !field_entry.is_indexed() {
+            return Err(crate::TantivyError::SchemaError(format!(
+                "Field {:?} is not indexed",
+                field_entry.name()
+            )));
+        }
         let bm25_weight = BM25Weight::for_terms(searcher, &[term])?;
         let index_record_option = if scoring_enabled {
             self.index_record_option
@@ -103,6 +110,7 @@ impl TermQuery {
             self.term.clone(),
             index_record_option,
             bm25_weight,
+            scoring_enabled,
         ))
     }
 }
diff --git a/src/query/term_query/term_weight.rs b/src/query/term_query/term_weight.rs
index 936c6e7ca..fb1e8e0fa 100644
--- a/src/query/term_query/term_weight.rs
+++ b/src/query/term_query/term_weight.rs
@@ -1,6 +1,7 @@
 use super::term_scorer::TermScorer;
 use crate::core::SegmentReader;
 use crate::docset::DocSet;
+use crate::fieldnorm::FieldNormReader;
 use crate::postings::SegmentPostings;
 use crate::query::bm25::BM25Weight;
 use crate::query::explanation::does_not_match;
@@ -15,6 +16,7 @@ pub struct TermWeight {
     term: Term,
     index_record_option: IndexRecordOption,
     similarity_weight: BM25Weight,
+    scoring_enabled: bool,
 }
 
 impl Weight for TermWeight {
@@ -87,11 +89,13 @@ impl TermWeight {
         term: Term,
         index_record_option: IndexRecordOption,
         similarity_weight: BM25Weight,
+        scoring_enabled: bool,
     ) -> TermWeight {
         TermWeight {
             term,
             index_record_option,
             similarity_weight,
+            scoring_enabled,
         }
     }
 
@@ -102,7 +106,11 @@ impl TermWeight {
     ) -> crate::Result<TermScorer> {
         let field = self.term.field();
         let inverted_index = reader.inverted_index(field)?;
-        let fieldnorm_reader = reader.get_fieldnorms_reader(field)?;
+        let fieldnorm_reader = if self.scoring_enabled {
+            reader.get_fieldnorms_reader(field)?
+        } else {
+            FieldNormReader::constant(reader.max_doc(), 1)
+        };
         let similarity_weight = self.similarity_weight.boost_by(boost);
         let postings_opt: Option<SegmentPostings> =
             inverted_index.read_postings(&self.term, self.index_record_option)?;
diff --git a/src/termdict/mod.rs b/src/termdict/mod.rs
index 4a2e4fe2b..5a105a17d 100644
--- a/src/termdict/mod.rs
+++ b/src/termdict/mod.rs
@@ -44,11 +44,13 @@
 
     const BLOCK_SIZE: usize = 1_500;
 
-    fn make_term_info(val: u64) -> TermInfo {
+    fn make_term_info(term_ord: u64) -> TermInfo {
+        let offset = |term_ord: u64| term_ord * 100 + term_ord * term_ord;
         TermInfo {
-            doc_freq: val as u32,
-            positions_idx: val * 2u64,
-            postings_offset: val * 3u64,
+            doc_freq: term_ord as u32,
+            postings_start_offset: offset(term_ord),
+            postings_stop_offset: offset(term_ord + 1),
+            positions_idx: offset(term_ord) * 2u64,
         }
     }
 
@@ -197,7 +199,7 @@
             // term requires more than 16bits
             term_dictionary_builder.insert("abcdefghijklmnopqrstuvwxy", &make_term_info(1))?;
             term_dictionary_builder.insert("abcdefghijklmnopqrstuvwxyz", &make_term_info(2))?;
-            term_dictionary_builder.insert("abr", &make_term_info(2))?;
+            term_dictionary_builder.insert("abr", &make_term_info(3))?;
             term_dictionary_builder.finish()?
         };
         let term_dict_file = FileSlice::from(buffer);
@@ -211,6 +213,7 @@
         assert_eq!(kv_stream.value(), &make_term_info(2));
         assert!(kv_stream.advance());
         assert_eq!(kv_stream.key(), "abr".as_bytes());
+        assert_eq!(kv_stream.value(), &make_term_info(3));
         assert!(!kv_stream.advance());
         Ok(())
     }
diff --git a/src/termdict/term_info_store.rs b/src/termdict/term_info_store.rs
index bf5049d1e..20b709a2f 100644
--- a/src/termdict/term_info_store.rs
+++ b/src/termdict/term_info_store.rs
@@ -55,22 +55,32 @@ impl TermInfoBlockMeta {
         self.doc_freq_nbits + self.postings_offset_nbits + self.positions_idx_nbits
     }
 
+    // Here inner_offset is the offset within the block, WITHOUT the first term_info.
+    // In other words, term_info #1,#2,#3 gets inner_offset 0,1,2..., while term_info #0
+    // is encoded without bitpacking.
     fn deserialize_term_info(&self, data: &[u8], inner_offset: usize) -> TermInfo {
+        assert!(inner_offset < BLOCK_LEN - 1);
         let num_bits = self.num_bits() as usize;
-        let mut cursor = num_bits * inner_offset;
 
-        let doc_freq = extract_bits(data, cursor, self.doc_freq_nbits) as u32;
-        cursor += self.doc_freq_nbits as usize;
+        let posting_start_addr = num_bits * inner_offset;
+        // the stop offset is the start offset of the next term info.
+        let posting_stop_addr = posting_start_addr + num_bits;
+        let doc_freq_addr = posting_start_addr + self.postings_offset_nbits as usize;
+        let positions_idx_addr = doc_freq_addr + self.doc_freq_nbits as usize;
 
-        let postings_offset = extract_bits(data, cursor, self.postings_offset_nbits);
-        cursor += self.postings_offset_nbits as usize;
-
-        let positions_idx = extract_bits(data, cursor, self.positions_idx_nbits);
+        let postings_start_offset = self.ref_term_info.postings_start_offset
+            + extract_bits(data, posting_start_addr, self.postings_offset_nbits);
+        let postings_stop_offset = self.ref_term_info.postings_start_offset
+            + extract_bits(data, posting_stop_addr, self.postings_offset_nbits);
+        let doc_freq = extract_bits(data, doc_freq_addr, self.doc_freq_nbits) as u32;
+        let positions_idx = self.ref_term_info.positions_idx
+            + extract_bits(data, positions_idx_addr, self.positions_idx_nbits);
 
         TermInfo {
             doc_freq,
-            postings_offset: postings_offset + self.ref_term_info.postings_offset,
-            positions_idx: positions_idx + self.ref_term_info.positions_idx,
+            postings_start_offset,
+            postings_stop_offset,
+            positions_idx,
         }
     }
 }
@@ -152,16 +162,17 @@ fn bitpack_serialize<W: io::Write>(
     term_info_block_meta: &TermInfoBlockMeta,
     term_info: &TermInfo,
 ) -> io::Result<()> {
+    bit_packer.write(
+        term_info.postings_start_offset,
+        term_info_block_meta.postings_offset_nbits,
+        write,
+    )?;
     bit_packer.write(
         u64::from(term_info.doc_freq),
         term_info_block_meta.doc_freq_nbits,
         write,
     )?;
-    bit_packer.write(
-        term_info.postings_offset,
-        term_info_block_meta.postings_offset_nbits,
-        write,
-    )?;
+
     bit_packer.write(
         term_info.positions_idx,
         term_info_block_meta.positions_idx_nbits,
@@ -181,23 +192,27 @@ impl TermInfoStoreWriter {
     }
 
     fn flush_block(&mut self) -> io::Result<()> {
-        if self.term_infos.is_empty() {
-            return Ok(());
-        }
         let mut bit_packer = BitPacker::new();
         let ref_term_info = self.term_infos[0].clone();
+
+        let last_term_info = if let Some(last_term_info) = self.term_infos.last().cloned() {
+            last_term_info
+        } else {
+            return Ok(());
+        };
+        let postings_stop_offset =
+            last_term_info.postings_stop_offset - ref_term_info.postings_start_offset;
         for term_info in &mut self.term_infos[1..] {
-            term_info.postings_offset -= ref_term_info.postings_offset;
+            term_info.postings_start_offset -= ref_term_info.postings_start_offset;
             term_info.positions_idx -= ref_term_info.positions_idx;
         }
 
         let mut max_doc_freq: u32 = 0u32;
-        let mut max_postings_offset: u64 = 0u64;
-        let mut max_positions_idx: u64 = 0u64;
+        let max_postings_offset: u64 = postings_stop_offset;
+        let max_positions_idx: u64 = last_term_info.positions_idx;
+
         for term_info in &self.term_infos[1..] {
             max_doc_freq = cmp::max(max_doc_freq, term_info.doc_freq);
-            max_postings_offset = cmp::max(max_postings_offset, term_info.postings_offset);
-            max_positions_idx = cmp::max(max_positions_idx, term_info.positions_idx);
         }
 
         let max_doc_freq_nbits: u8 = compute_num_bits(u64::from(max_doc_freq));
@@ -222,6 +237,12 @@ impl TermInfoStoreWriter {
             )?;
         }
 
+        bit_packer.write(
+            postings_stop_offset,
+            term_info_block_meta.postings_offset_nbits,
+            &mut self.buffer_term_infos,
+        )?;
+
         // Block need end up at the end of a byte.
         bit_packer.flush(&mut self.buffer_term_infos)?;
         self.term_infos.clear();
@@ -230,6 +251,7 @@
     }
 
     pub fn write_term_info(&mut self, term_info: &TermInfo) -> io::Result<()> {
+        assert!(term_info.postings_stop_offset >= term_info.postings_start_offset);
         self.num_terms += 1u64;
         self.term_infos.push(term_info.clone());
         if self.term_infos.len() >= BLOCK_LEN {
@@ -289,10 +311,11 @@ mod tests {
     #[test]
     fn test_term_info_block_meta_serialization() {
         let term_info_block_meta = TermInfoBlockMeta {
-            offset: 2009,
+            offset: 2009u64,
             ref_term_info: TermInfo {
                 doc_freq: 512,
-                postings_offset: 51,
+                postings_start_offset: 51,
+                postings_stop_offset: 57u64,
                 positions_idx: 3584,
             },
             doc_freq_nbits: 10,
@@ -310,10 +333,12 @@
     fn test_pack() -> crate::Result<()> {
         let mut store_writer = TermInfoStoreWriter::new();
         let mut term_infos = vec![];
+        let offset = |i| (i * 13 + i * i) as u64;
        for i in 0..1000 {
             let term_info = TermInfo {
                 doc_freq: i as u32,
-                postings_offset: (i / 10) as u64,
+                postings_start_offset: offset(i),
+                postings_stop_offset: offset(i + 1),
                 positions_idx: (i * 7) as u64,
             };
             store_writer.write_term_info(&term_info)?;
@@ -323,7 +348,12 @@
         store_writer.serialize(&mut buffer)?;
         let term_info_store = TermInfoStore::open(FileSlice::from(buffer))?;
         for i in 0..1000 {
-            assert_eq!(term_info_store.get(i as u64), term_infos[i]);
+            assert_eq!(
+                term_info_store.get(i as u64),
+                term_infos[i],
+                "term info {}",
+                i
+            );
         }
         Ok(())
     }
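Note: the sketch below is a minimal, standalone illustration of the uncompressed `TermInfo` wire format introduced by this patch, for sanity-checking the size arithmetic (`doc_freq` u32 + `postings_start_offset` u64 + posting byte length u32 + `positions_idx` u64, so `SIZE_IN_BYTES` is 2 * 4 + 2 * 8 = 24). It mirrors the field names from the diff but does not use tantivy's `BinarySerializable` trait, its byte order, or the bitpacked block encoding; it is illustrative only.

```rust
// Illustrative sketch only: not tantivy code, and not its actual on-disk byte order.
struct TermInfo {
    doc_freq: u32,
    postings_start_offset: u64,
    postings_stop_offset: u64,
    positions_idx: u64,
}

impl TermInfo {
    // Mirrors `posting_num_bytes()` from the patch: only the byte length is serialized.
    fn posting_num_bytes(&self) -> u32 {
        let num_bytes = self.postings_stop_offset - self.postings_start_offset;
        assert!(num_bytes <= u32::MAX as u64);
        num_bytes as u32
    }

    // doc_freq (4) + start offset (8) + num bytes (4) + positions_idx (8) = 24 bytes.
    fn to_bytes(&self) -> [u8; 24] {
        let mut buf = [0u8; 24];
        buf[0..4].copy_from_slice(&self.doc_freq.to_le_bytes());
        buf[4..12].copy_from_slice(&self.postings_start_offset.to_le_bytes());
        buf[12..16].copy_from_slice(&self.posting_num_bytes().to_le_bytes());
        buf[16..24].copy_from_slice(&self.positions_idx.to_le_bytes());
        buf
    }

    // The stop offset is reconstructed as start + num_bytes, as in `deserialize`.
    fn from_bytes(buf: &[u8; 24]) -> TermInfo {
        let doc_freq = u32::from_le_bytes(buf[0..4].try_into().unwrap());
        let postings_start_offset = u64::from_le_bytes(buf[4..12].try_into().unwrap());
        let num_bytes = u32::from_le_bytes(buf[12..16].try_into().unwrap());
        let positions_idx = u64::from_le_bytes(buf[16..24].try_into().unwrap());
        TermInfo {
            doc_freq,
            postings_start_offset,
            postings_stop_offset: postings_start_offset + u64::from(num_bytes),
            positions_idx,
        }
    }
}

fn main() {
    let info = TermInfo {
        doc_freq: 3,
        postings_start_offset: 100,
        postings_stop_offset: 157,
        positions_idx: 42,
    };
    let decoded = TermInfo::from_bytes(&info.to_bytes());
    // The byte range [start..stop) can be sliced directly out of the postings file,
    // which is what the new `postings_file_slice.slice(start, stop)` calls rely on.
    assert_eq!(decoded.postings_stop_offset - decoded.postings_start_offset, 57);
    assert_eq!(decoded.doc_freq, 3);
}
```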