diff --git a/src/fastfield/delete.rs b/src/fastfield/delete.rs index cc541cc18..9c666504f 100644 --- a/src/fastfield/delete.rs +++ b/src/fastfield/delete.rs @@ -51,6 +51,7 @@ impl DeleteBitSet { } } + /// Returns whether the document has been marked as deleted. pub fn is_deleted(&self, doc: DocId) -> bool { if self.len == 0 { false diff --git a/src/fieldnorm/mod.rs b/src/fieldnorm/mod.rs index c1a28e045..45fa92167 100644 --- a/src/fieldnorm/mod.rs +++ b/src/fieldnorm/mod.rs @@ -1,3 +1,21 @@ +//! The fieldnorm represents the length associated to +//! a given Field of a given document. +//! +//! This metric is important to compute the score of a +//! document: a document having a query word in one of its short fields +//! (e.g. title) is likely to be more relevant than in one of its longer fields +//! (e.g. body). +//! +//! It encodes `fieldnorm` on one byte with some precision loss, +//! using the exact same scheme as Lucene. Each value is placed on a log-scale +//! that takes values from `0` to `255`. +//! +//! A value on this scale is identified by a `fieldnorm_id`. +//! Apart from compression, this scale also makes it possible to +//! precompute computationally expensive functions of the fieldnorm +//! in a very short array. +//! +//! This trick is used by the [BM25 similarity](https://en.wikipedia.org/wiki/Okapi_BM25). mod code; mod serializer; mod writer; @@ -7,4 +25,5 @@ pub use self::reader::FieldNormReader; pub use self::writer::FieldNormsWriter; pub use self::serializer::FieldNormsSerializer; -use self::code::{fieldnorm_to_id, id_to_fieldnorm}; \ No newline at end of file +use self::code::{fieldnorm_to_id, id_to_fieldnorm}; + diff --git a/src/fieldnorm/reader.rs b/src/fieldnorm/reader.rs index a097dd2de..982eb1f4e 100644 --- a/src/fieldnorm/reader.rs +++ b/src/fieldnorm/reader.rs @@ -2,34 +2,66 @@ use super::{id_to_fieldnorm, fieldnorm_to_id}; use directory::ReadOnlySource; use DocId; + +/// Reads the fieldnorm associated to a document. 
+/// The fieldnorm represents the length associated to +/// a given Field of a given document. +/// +/// This metric is important to compute the score of a +/// document: a document having a query word in one of its short fields +/// (e.g. title) is likely to be more relevant than in one of its longer fields +/// (e.g. body). +/// +/// tantivy encodes `fieldnorm` on one byte with some precision loss, +/// using the same scheme as Lucene. Each value is placed on a log-scale +/// that takes values from `0` to `255`. +/// +/// A value on this scale is identified by a `fieldnorm_id`. +/// Apart from compression, this scale also makes it possible to +/// precompute computationally expensive functions of the fieldnorm +/// in a very short array. pub struct FieldNormReader { data: ReadOnlySource } impl FieldNormReader { + /// Opens a field norm reader given its data source. pub fn open(data: ReadOnlySource) -> Self { FieldNormReader { data } } + /// Returns the `fieldnorm` associated to a doc id. + /// The fieldnorm is a value approximating the number + /// of tokens in a given field of the `doc_id`. + /// + /// It is imprecise, and always lower than the actual + /// number of tokens. + /// + /// The fieldnorm is effectively decoded from the + /// `fieldnorm_id` by doing a simple table lookup. pub fn fieldnorm(&self, doc_id: DocId) -> u32 { let fieldnorm_id = self.fieldnorm_id(doc_id); id_to_fieldnorm(fieldnorm_id) } + /// Returns the `fieldnorm_id` associated to a document. #[inline(always)] pub fn fieldnorm_id(&self, doc_id: DocId) -> u8 { let fielnorms_data = self.data.as_slice(); fielnorms_data[doc_id as usize] } + /// Converts a `fieldnorm_id` into a fieldnorm. #[inline(always)] pub fn id_to_fieldnorm(id: u8) -> u32 { id_to_fieldnorm(id) } + /// Converts a `fieldnorm` into a `fieldnorm_id`. + /// (This function is not injective). 
#[inline(always)] pub fn fieldnorm_to_id(fieldnorm: u32) -> u8 { fieldnorm_to_id(fieldnorm) diff --git a/src/indexer/merger.rs b/src/indexer/merger.rs index c9eb84976..dea7b589e 100644 --- a/src/indexer/merger.rs +++ b/src/indexer/merger.rs @@ -321,11 +321,18 @@ impl IndexMerger { for (segment_ord, mut segment_postings) in segment_postings { let old_to_new_doc_id = &merged_doc_id_map[segment_ord]; loop { + let doc = segment_postings.doc(); + // `.advance()` has been called once before the loop. - // Hence we cannot use a `while segment_postings.advance()` loop. - if let Some(remapped_doc_id) = - old_to_new_doc_id[segment_postings.doc() as usize] - { + // + // It was required to make sure we only consider segments + // that effectively contain at least one non-deleted document + // and remove terms that do not have documents associated. + // + // For this reason, we cannot use a `while segment_postings.advance()` loop. + + // deleted docs are skipped as they do not have a `remapped_doc_id`. + if let Some(remapped_doc_id) = old_to_new_doc_id[doc as usize] { // we make sure to only write the term iff // there is at least one document. 
let term_freq = segment_postings.term_freq(); diff --git a/src/lib.rs b/src/lib.rs index 6af44d613..26b45d2d2 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -292,7 +292,7 @@ mod tests { use Postings; use rand::{Rng, SeedableRng, XorShiftRng}; use rand::distributions::{IndependentSample, Range}; - + pub fn assert_nearly_equals(expected: f32, val: f32) { assert!(nearly_equals(val, expected), "Got {}, expected {}.", val, expected); } diff --git a/src/postings/mod.rs b/src/postings/mod.rs index 69db5904e..ab3398774 100644 --- a/src/postings/mod.rs +++ b/src/postings/mod.rs @@ -443,7 +443,6 @@ pub mod tests { last = cur; cur = next; } - assert_eq!(cur, 377); } diff --git a/src/postings/postings.rs b/src/postings/postings.rs index b415860d5..0d4400e51 100644 --- a/src/postings/postings.rs +++ b/src/postings/postings.rs @@ -14,10 +14,12 @@ pub trait Postings: DocSet + 'static { /// Returns the term frequency fn term_freq(&self) -> u32; - /// Returns the list of positions of the term, expressed as a list of - /// token ordinals. + /// Returns the positions offset by a given value. + /// The output vector will be resized to the `term_freq`. fn positions_with_offset(&mut self, offset: u32, output: &mut Vec); + /// Returns the positions of the term in the given document. + /// The output vector will be resized to the `term_freq`. fn positions(&mut self, output: &mut Vec) { self.positions_with_offset(0u32, output); } diff --git a/src/postings/segment_postings.rs b/src/postings/segment_postings.rs index 637420756..5b7cf9216 100644 --- a/src/postings/segment_postings.rs +++ b/src/postings/segment_postings.rs @@ -52,7 +52,7 @@ impl PositionComputer { /// /// As we iterate through the `SegmentPostings`, the frequencies are optionally decoded. /// Positions on the other hand, are optionally entirely decoded upfront. 
-pub struct SegmentPostings { +pub struct SegmentPostings { block_cursor: BlockSegmentPostings, cur: usize, position_computer: Option, diff --git a/src/query/intersection.rs b/src/query/intersection.rs index 874e7229c..0a978c324 100644 --- a/src/query/intersection.rs +++ b/src/query/intersection.rs @@ -78,7 +78,7 @@ impl Intersection { impl Intersection { - pub fn docset_mut_specialized(&mut self, ord: usize) -> &mut TDocSet { + pub(crate) fn docset_mut_specialized(&mut self, ord: usize) -> &mut TDocSet { match ord { 0 => &mut self.left, 1 => &mut self.right, @@ -88,7 +88,7 @@ impl Intersection { } impl Intersection { - pub fn docset_mut(&mut self, ord: usize) -> &mut DocSet { + pub(crate) fn docset_mut(&mut self, ord: usize) -> &mut DocSet { match ord { 0 => &mut self.left, 1 => &mut self.right,