mirror of
https://github.com/quickwit-oss/tantivy.git
synced 2026-05-26 13:10:41 +00:00
Avoid loading fieldnorms when not necessary
This commit is contained in:
@@ -1,50 +0,0 @@
|
||||
|
||||
# Managed files
|
||||
+----------+-----------+-------------------+
|
||||
| content | footer | footer_len: u32 |
|
||||
+----------+-----------+-------------------+
|
||||
|
||||
# Term Dictionary (Composite File)
|
||||
|
||||
+---------+---------------------------+------------------------+
|
||||
| fst | term_info_store | footer_len: u64 |
|
||||
+---------+---------------------------+------------------------+
|
||||
|
||||
During a merge, the term info store needs to fit in memory.
|
||||
It has a cost of n bytes per term.
|
||||
|
||||
# term_info_store
|
||||
+-------------------+---------------------------+------------------------+
|
||||
| len_block_meta | block_meta | term_infos |
|
||||
+-------------------+---------------------------+------------------------+
|
||||
|
||||
# inverted_index
|
||||
+------------------------+---------------------------+------------------------+
|
||||
| total_num_tokens: u64 | posting_lists.. | term_infos |
|
||||
+------------------------+---------------------------+------------------------+
|
||||
|
||||
# postings lists
|
||||
+------------------------+---------------------------+------------------------+
|
||||
|
|
||||
+
|
||||
|
||||
# composite file
|
||||
+----------------+-----+----------------+----------------------+----------------+
|
||||
| field file 1 | ... | field file n |composite file footer | footer len: u32|
|
||||
+----------------+-----+----------------+----------------------+----------------+
|
||||
|
||||
# composite file footer
|
||||
|
||||
+-----------------+---------------------------------------+
|
||||
|num fields: vint | (file_addr, offset_delta: vint) []... |
|
||||
+-----------------+---------------------------------------+
|
||||
|
||||
# FileAddr
|
||||
+--------------+--------------+
|
||||
| field: u32 | idx: VInt |
|
||||
+--------------+--------------+
|
||||
|
||||
# Posting lists
|
||||
+-----------------------------------------+
|
||||
| skip_reader
|
||||
+-----------------------------------------+
|
||||
@@ -90,9 +90,9 @@ impl InvertedIndexReader {
|
||||
term_info: &TermInfo,
|
||||
block_postings: &mut BlockSegmentPostings,
|
||||
) -> io::Result<()> {
|
||||
let offset = term_info.postings_start_offset as usize;
|
||||
let end_source = term_info.postings_end_offset as usize;
|
||||
let postings_slice = self.postings_file_slice.slice(offset, end_source);
|
||||
let start_offset = term_info.postings_start_offset as usize;
|
||||
let stop_offset = term_info.postings_stop_offset as usize;
|
||||
let postings_slice = self.postings_file_slice.slice(start_offset, stop_offset);
|
||||
block_postings.reset(term_info.doc_freq, postings_slice.read_bytes()?);
|
||||
Ok(())
|
||||
}
|
||||
@@ -123,7 +123,7 @@ impl InvertedIndexReader {
|
||||
) -> io::Result<BlockSegmentPostings> {
|
||||
let postings_data = self.postings_file_slice.slice(
|
||||
term_info.postings_start_offset as usize,
|
||||
term_info.postings_end_offset as usize,
|
||||
term_info.postings_stop_offset as usize,
|
||||
);
|
||||
BlockSegmentPostings::open(
|
||||
term_info.doc_freq,
|
||||
|
||||
@@ -98,10 +98,9 @@ mod tests {
|
||||
let field = searcher.schema().get_field("string_bytes").unwrap();
|
||||
let term = Term::from_field_bytes(field, b"lucene".as_ref());
|
||||
let term_query = TermQuery::new(term, IndexRecordOption::Basic);
|
||||
let term_weight = term_query.specialized_weight(&searcher, false)?;
|
||||
let term_scorer_err = term_weight.specialized_scorer(searcher.segment_reader(0), 1.0);
|
||||
let term_weight_err = term_query.specialized_weight(&searcher, false);
|
||||
assert!(matches!(
|
||||
term_scorer_err,
|
||||
term_weight_err,
|
||||
Err(crate::TantivyError::SchemaError(_))
|
||||
));
|
||||
Ok(())
|
||||
|
||||
@@ -66,10 +66,21 @@ pub struct FieldNormReader {
|
||||
}
|
||||
|
||||
impl FieldNormReader {
|
||||
/// Creates a `FieldNormReader` with a constant fieldnorm.
///
/// The `fieldnorm` value is converted with `fieldnorm_to_id`, and the
/// resulting id is repeated `num_docs` times in an in-memory buffer, so this
/// constructor does not read any fieldnorm file.
|
||||
pub fn constant(num_docs: u32, fieldnorm: u32) -> FieldNormReader {
|
||||
let fieldnorm_id = fieldnorm_to_id(fieldnorm);
|
||||
// One entry per document, every entry holding the same fieldnorm id.
let field_norms_data = OwnedBytes::new(vec![fieldnorm_id; num_docs as usize]);
|
||||
FieldNormReader::new(field_norms_data)
|
||||
}
|
||||
|
||||
/// Opens a field norm reader given its file.
|
||||
pub fn open(fieldnorm_file: FileSlice) -> crate::Result<Self> {
|
||||
let data = fieldnorm_file.read_bytes()?;
|
||||
Ok(FieldNormReader { data })
|
||||
Ok(FieldNormReader::new(data))
|
||||
}
|
||||
|
||||
/// Builds a `FieldNormReader` directly from already-loaded fieldnorm bytes.
fn new(data: OwnedBytes) -> Self {
|
||||
FieldNormReader { data }
|
||||
}
|
||||
|
||||
/// Returns the number of documents in this segment.
|
||||
|
||||
@@ -177,15 +177,16 @@ impl<'a> FieldSerializer<'a> {
|
||||
}
|
||||
|
||||
fn current_term_info(&self) -> TermInfo {
|
||||
let positions_idx = self
|
||||
.positions_serializer_opt
|
||||
.as_ref()
|
||||
.map(PositionSerializer::positions_idx)
|
||||
.unwrap_or(0u64);
|
||||
let positions_idx =
|
||||
if let Some(positions_serializer) = self.positions_serializer_opt.as_ref() {
|
||||
positions_serializer.positions_idx()
|
||||
} else {
|
||||
0u64
|
||||
};
|
||||
TermInfo {
|
||||
doc_freq: 0,
|
||||
postings_start_offset: self.postings_serializer.addr(),
|
||||
postings_end_offset: 0u64,
|
||||
postings_stop_offset: 0u64,
|
||||
positions_idx,
|
||||
}
|
||||
}
|
||||
@@ -241,8 +242,7 @@ impl<'a> FieldSerializer<'a> {
|
||||
if self.term_open {
|
||||
self.postings_serializer
|
||||
.close_term(self.current_term_info.doc_freq)?;
|
||||
let end_offset = self.postings_serializer.addr();
|
||||
self.current_term_info.postings_end_offset = end_offset;
|
||||
self.current_term_info.postings_stop_offset = self.postings_serializer.addr();
|
||||
self.term_dictionary_builder
|
||||
.insert_value(&self.current_term_info)?;
|
||||
self.term_open = false;
|
||||
|
||||
@@ -9,15 +9,16 @@ pub struct TermInfo {
|
||||
pub doc_freq: u32,
|
||||
/// Start offset of the posting list within the postings (`.idx`) file.
|
||||
pub postings_start_offset: u64,
|
||||
/// End offset of the posting list within the postings (`.idx`) file.
|
||||
pub postings_end_offset: u64,
|
||||
/// Stop offset of the posting list within the postings (`.idx`) file.
|
||||
/// The byte range is `[start_offset..stop_offset)`.
|
||||
pub postings_stop_offset: u64,
|
||||
/// Start offset of the first block within the position (`.pos`) file.
|
||||
pub positions_idx: u64,
|
||||
}
|
||||
|
||||
impl TermInfo {
|
||||
pub(crate) fn posting_num_bytes(&self) -> u32 {
|
||||
let num_bytes = self.postings_end_offset - self.postings_start_offset;
|
||||
let num_bytes = self.postings_stop_offset - self.postings_start_offset;
|
||||
assert!(num_bytes <= std::u32::MAX as u64);
|
||||
num_bytes as u32
|
||||
}
|
||||
@@ -44,12 +45,12 @@ impl BinarySerializable for TermInfo {
|
||||
let doc_freq = u32::deserialize(reader)?;
|
||||
let postings_start_offset = u64::deserialize(reader)?;
|
||||
let postings_num_bytes = u32::deserialize(reader)?;
|
||||
let postings_end_offset = postings_start_offset + u64::from(postings_num_bytes);
|
||||
let postings_stop_offset = postings_start_offset + u64::from(postings_num_bytes);
|
||||
let positions_idx = u64::deserialize(reader)?;
|
||||
Ok(TermInfo {
|
||||
doc_freq,
|
||||
postings_start_offset,
|
||||
postings_end_offset,
|
||||
postings_stop_offset,
|
||||
positions_idx,
|
||||
})
|
||||
}
|
||||
|
||||
@@ -93,6 +93,13 @@ impl TermQuery {
|
||||
scoring_enabled: bool,
|
||||
) -> crate::Result<TermWeight> {
|
||||
let term = self.term.clone();
|
||||
let field_entry = searcher.schema().get_field_entry(term.field());
|
||||
if !field_entry.is_indexed() {
|
||||
return Err(crate::TantivyError::SchemaError(format!(
|
||||
"Field {:?} is not indexed",
|
||||
field_entry.name()
|
||||
)));
|
||||
}
|
||||
let bm25_weight = BM25Weight::for_terms(searcher, &[term])?;
|
||||
let index_record_option = if scoring_enabled {
|
||||
self.index_record_option
|
||||
@@ -103,6 +110,7 @@ impl TermQuery {
|
||||
self.term.clone(),
|
||||
index_record_option,
|
||||
bm25_weight,
|
||||
scoring_enabled,
|
||||
))
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
use super::term_scorer::TermScorer;
|
||||
use crate::core::SegmentReader;
|
||||
use crate::docset::DocSet;
|
||||
use crate::fieldnorm::FieldNormReader;
|
||||
use crate::postings::SegmentPostings;
|
||||
use crate::query::bm25::BM25Weight;
|
||||
use crate::query::explanation::does_not_match;
|
||||
@@ -15,6 +16,7 @@ pub struct TermWeight {
|
||||
term: Term,
|
||||
index_record_option: IndexRecordOption,
|
||||
similarity_weight: BM25Weight,
|
||||
scoring_enabled: bool,
|
||||
}
|
||||
|
||||
impl Weight for TermWeight {
|
||||
@@ -87,11 +89,13 @@ impl TermWeight {
|
||||
term: Term,
|
||||
index_record_option: IndexRecordOption,
|
||||
similarity_weight: BM25Weight,
|
||||
scoring_enabled: bool,
|
||||
) -> TermWeight {
|
||||
TermWeight {
|
||||
term,
|
||||
index_record_option,
|
||||
similarity_weight,
|
||||
scoring_enabled,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -102,7 +106,11 @@ impl TermWeight {
|
||||
) -> crate::Result<TermScorer> {
|
||||
let field = self.term.field();
|
||||
let inverted_index = reader.inverted_index(field)?;
|
||||
let fieldnorm_reader = reader.get_fieldnorms_reader(field)?;
|
||||
let fieldnorm_reader = if self.scoring_enabled {
|
||||
reader.get_fieldnorms_reader(field)?
|
||||
} else {
|
||||
FieldNormReader::constant(reader.max_doc(), 1)
|
||||
};
|
||||
let similarity_weight = self.similarity_weight.boost_by(boost);
|
||||
let postings_opt: Option<SegmentPostings> =
|
||||
inverted_index.read_postings(&self.term, self.index_record_option)?;
|
||||
|
||||
@@ -49,7 +49,7 @@ mod tests {
|
||||
TermInfo {
|
||||
doc_freq: term_ord as u32,
|
||||
postings_start_offset: offset(term_ord),
|
||||
postings_end_offset: offset(term_ord + 1),
|
||||
postings_stop_offset: offset(term_ord + 1),
|
||||
positions_idx: offset(term_ord) * 2u64,
|
||||
}
|
||||
}
|
||||
@@ -199,7 +199,6 @@ mod tests {
|
||||
// term requires more than 16bits
|
||||
term_dictionary_builder.insert("abcdefghijklmnopqrstuvwxy", &make_term_info(1))?;
|
||||
term_dictionary_builder.insert("abcdefghijklmnopqrstuvwxyz", &make_term_info(2))?;
|
||||
term_dictionary_builder.insert("abr", &make_term_info(2))?;
|
||||
term_dictionary_builder.insert("abr", &make_term_info(3))?;
|
||||
term_dictionary_builder.finish()?
|
||||
};
|
||||
@@ -209,7 +208,6 @@ mod tests {
|
||||
assert!(kv_stream.advance());
|
||||
assert_eq!(kv_stream.key(), "abcdefghijklmnopqrstuvwxy".as_bytes());
|
||||
assert_eq!(kv_stream.value(), &make_term_info(1));
|
||||
dbg!(make_term_info(1));
|
||||
assert!(kv_stream.advance());
|
||||
assert_eq!(kv_stream.key(), "abcdefghijklmnopqrstuvwxyz".as_bytes());
|
||||
assert_eq!(kv_stream.value(), &make_term_info(2));
|
||||
|
||||
@@ -61,23 +61,26 @@ impl TermInfoBlockMeta {
|
||||
fn deserialize_term_info(&self, data: &[u8], inner_offset: usize) -> TermInfo {
|
||||
assert!(inner_offset < BLOCK_LEN - 1);
|
||||
let num_bits = self.num_bits() as usize;
|
||||
let mut cursor = num_bits * inner_offset;
|
||||
|
||||
let postings_start_offset = extract_bits(data, cursor, self.postings_offset_nbits);
|
||||
let postings_end_offset = self.ref_term_info.postings_start_offset
|
||||
+ extract_bits(data, cursor + num_bits, self.postings_offset_nbits);
|
||||
cursor += self.postings_offset_nbits as usize;
|
||||
let posting_start_addr = num_bits * inner_offset;
|
||||
// the stop offset is the start offset of the next term info.
|
||||
let posting_stop_addr = posting_start_addr + num_bits;
|
||||
let doc_freq_addr = posting_start_addr + self.postings_offset_nbits as usize;
|
||||
let positions_idx_addr = doc_freq_addr + self.doc_freq_nbits as usize;
|
||||
|
||||
let doc_freq = extract_bits(data, cursor, self.doc_freq_nbits) as u32;
|
||||
cursor += self.doc_freq_nbits as usize;
|
||||
|
||||
let positions_idx = extract_bits(data, cursor, self.positions_idx_nbits);
|
||||
let postings_start_offset = self.ref_term_info.postings_start_offset
|
||||
+ extract_bits(data, posting_start_addr, self.postings_offset_nbits);
|
||||
let postings_stop_offset = self.ref_term_info.postings_start_offset
|
||||
+ extract_bits(data, posting_stop_addr, self.postings_offset_nbits);
|
||||
let doc_freq = extract_bits(data, doc_freq_addr, self.doc_freq_nbits) as u32;
|
||||
let positions_idx = self.ref_term_info.positions_idx
|
||||
+ extract_bits(data, positions_idx_addr, self.positions_idx_nbits);
|
||||
|
||||
TermInfo {
|
||||
doc_freq,
|
||||
postings_start_offset: postings_start_offset + self.ref_term_info.postings_start_offset,
|
||||
postings_end_offset,
|
||||
positions_idx: positions_idx + self.ref_term_info.positions_idx,
|
||||
postings_start_offset,
|
||||
postings_stop_offset,
|
||||
positions_idx,
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -197,15 +200,15 @@ impl TermInfoStoreWriter {
|
||||
} else {
|
||||
return Ok(());
|
||||
};
|
||||
let postings_end_offset =
|
||||
last_term_info.postings_end_offset - ref_term_info.postings_start_offset;
|
||||
let postings_stop_offset =
|
||||
last_term_info.postings_stop_offset - ref_term_info.postings_start_offset;
|
||||
for term_info in &mut self.term_infos[1..] {
|
||||
term_info.postings_start_offset -= ref_term_info.postings_start_offset;
|
||||
term_info.positions_idx -= ref_term_info.positions_idx;
|
||||
}
|
||||
|
||||
let mut max_doc_freq: u32 = 0u32;
|
||||
let max_postings_offset: u64 = postings_end_offset;
|
||||
let max_postings_offset: u64 = postings_stop_offset;
|
||||
let max_positions_idx: u64 = last_term_info.positions_idx;
|
||||
|
||||
for term_info in &self.term_infos[1..] {
|
||||
@@ -235,7 +238,7 @@ impl TermInfoStoreWriter {
|
||||
}
|
||||
|
||||
bit_packer.write(
|
||||
postings_end_offset,
|
||||
postings_stop_offset,
|
||||
term_info_block_meta.postings_offset_nbits,
|
||||
&mut self.buffer_term_infos,
|
||||
)?;
|
||||
@@ -248,7 +251,7 @@ impl TermInfoStoreWriter {
|
||||
}
|
||||
|
||||
pub fn write_term_info(&mut self, term_info: &TermInfo) -> io::Result<()> {
|
||||
assert!(term_info.postings_end_offset >= term_info.postings_start_offset);
|
||||
assert!(term_info.postings_stop_offset >= term_info.postings_start_offset);
|
||||
self.num_terms += 1u64;
|
||||
self.term_infos.push(term_info.clone());
|
||||
if self.term_infos.len() >= BLOCK_LEN {
|
||||
@@ -312,7 +315,7 @@ mod tests {
|
||||
ref_term_info: TermInfo {
|
||||
doc_freq: 512,
|
||||
postings_start_offset: 51,
|
||||
postings_end_offset: 57u64,
|
||||
postings_stop_offset: 57u64,
|
||||
positions_idx: 3584,
|
||||
},
|
||||
doc_freq_nbits: 10,
|
||||
@@ -335,7 +338,7 @@ mod tests {
|
||||
let term_info = TermInfo {
|
||||
doc_freq: i as u32,
|
||||
postings_start_offset: offset(i),
|
||||
postings_end_offset: offset(i + 1),
|
||||
postings_stop_offset: offset(i + 1),
|
||||
positions_idx: (i * 7) as u64,
|
||||
};
|
||||
store_writer.write_term_info(&term_info)?;
|
||||
|
||||
Reference in New Issue
Block a user