Added comments

2026-06-03 00:50:41 +00:00 · 2018-03-28 08:28:49 +09:00
parent ffa03bad71
commit 8006f1df11
9 changed files with 72 additions and 12 deletions
--- a/src/fastfield/delete.rs
+++ b/src/fastfield/delete.rs
@@ -51,6 +51,7 @@ impl DeleteBitSet {
        }
    }

+    /// Returns whether the document has been marked as deleted.
    pub fn is_deleted(&self, doc: DocId) -> bool {
        if self.len == 0 {
            false
--- a/src/fieldnorm/mod.rs
+++ b/src/fieldnorm/mod.rs
@@ -1,3 +1,21 @@
+//! The fieldnorm represents the length associated to
+//! a given Field of a given document.
+//!
+//! This metric is important to compute the score of a
+//! document: a document having a query word in one of its short fields
+//! (e.g. title) is likely to be more relevant than in one of its longer fields
+//! (e.g. body).
+//!
+//! It encodes `fieldnorm` on one byte with some precision loss,
+//! using the exact same scheme as Lucene. Each value is placed on a log-scale
+//! that takes values from `0` to `255`.
+//!
+//! A value on this scale is identified by a `fieldnorm_id`.
+//! Apart from compression, this scale also makes it possible to
+//! precompute computationally expensive functions of the fieldnorm
+//! in a very short array.
+//!
+//! This trick is used by the BM25 similarity.
 mod code;
 mod serializer;
 mod writer;
@@ -7,4 +25,5 @@ pub use self::reader::FieldNormReader;
 pub use self::writer::FieldNormsWriter;
 pub use self::serializer::FieldNormsSerializer;

-use self::code::{fieldnorm_to_id, id_to_fieldnorm};
+use self::code::{fieldnorm_to_id, id_to_fieldnorm};
+
--- a/src/fieldnorm/reader.rs
+++ b/src/fieldnorm/reader.rs
@@ -2,34 +2,66 @@ use super::{id_to_fieldnorm, fieldnorm_to_id};
 use directory::ReadOnlySource;
 use DocId;

+
+/// Reads the fieldnorm associated to a document.
+/// The fieldnorm represents the length associated to
+/// a given Field of a given document.
+///
+/// This metric is important to compute the score of a
+/// document: a document having a query word in one of its short fields
+/// (e.g. title) is likely to be more relevant than in one of its longer fields
+/// (e.g. body).
+///
+/// tantivy encodes `fieldnorm` on one byte with some precision loss,
+/// using the same scheme as Lucene. Each value is placed on a log-scale
+/// that takes values from `0` to `255`.
+///
+/// A value on this scale is identified by a `fieldnorm_id`.
+/// Apart from compression, this scale also makes it possible to
+/// precompute computationally expensive functions of the fieldnorm
+/// in a very short array.
 pub struct FieldNormReader {
    data: ReadOnlySource
 }

 impl FieldNormReader {

+    /// Opens a field norm reader given its data source.
    pub fn open(data: ReadOnlySource) -> Self {
        FieldNormReader {
            data
        }
    }

+    /// Returns the `fieldnorm` associated to a doc id.
+    /// The fieldnorm is a value approximating the number
+    /// of tokens in a given field of the `doc_id`.
+    ///
+    /// It is imprecise, and always lower than the actual
+    /// number of tokens.
+    ///
+    /// The fieldnorm is effectively decoded from the
+    /// `fieldnorm_id` by doing a simple table lookup.
    pub fn fieldnorm(&self, doc_id: DocId) -> u32 {
        let fieldnorm_id = self.fieldnorm_id(doc_id);
        id_to_fieldnorm(fieldnorm_id)
    }

+    /// Returns the `fieldnorm_id` associated to a document.
    #[inline(always)]
    pub fn fieldnorm_id(&self, doc_id: DocId) -> u8 {
        let fielnorms_data = self.data.as_slice();
        fielnorms_data[doc_id as usize]
    }

+    /// Converts a `fieldnorm_id` into a fieldnorm.
    #[inline(always)]
    pub fn id_to_fieldnorm(id: u8) -> u32 {
        id_to_fieldnorm(id)
    }

+    /// Converts a `fieldnorm` into a `fieldnorm_id`.
+    /// (This function is not injective).
    #[inline(always)]
    pub fn fieldnorm_to_id(fieldnorm: u32) -> u8 {
        fieldnorm_to_id(fieldnorm)
--- a/src/indexer/merger.rs
+++ b/src/indexer/merger.rs
@@ -321,11 +321,18 @@ impl IndexMerger {
                    for (segment_ord, mut segment_postings) in segment_postings {
                        let old_to_new_doc_id = &merged_doc_id_map[segment_ord];
                        loop {
+                            let doc =  segment_postings.doc();
+
                            // `.advance()` has been called once before the loop.
-                            // Hence we cannot use a `while segment_postings.advance()` loop.
-                            if let Some(remapped_doc_id) =
-                                old_to_new_doc_id[segment_postings.doc() as usize]
-                            {
+                            //
+                            // It was required to make sure we only consider segments
+                            // that effectively contain at least one non-deleted document
+                            // and remove terms that do not have documents associated.
+                            //
+                            //  For this reason, we cannot use a `while segment_postings.advance()` loop.
+
+                            // Deleted docs are skipped as they do not have a `remapped_doc_id`.
+                            if let Some(remapped_doc_id) = old_to_new_doc_id[doc as usize] {
                                // we make sure to only write the term iff
                                // there is at least one document.
                                let term_freq = segment_postings.term_freq();
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -292,7 +292,7 @@ mod tests {
    use Postings;
    use rand::{Rng, SeedableRng, XorShiftRng};
    use rand::distributions::{IndependentSample, Range};
-    
+
    pub fn assert_nearly_equals(expected: f32, val: f32) {
        assert!(nearly_equals(val, expected), "Got {}, expected {}.", val, expected);
    }
--- a/src/postings/mod.rs
+++ b/src/postings/mod.rs
@@ -443,7 +443,6 @@ pub mod tests {
                last = cur;
                cur = next;
            }
-
            assert_eq!(cur, 377);
        }

--- a/src/postings/postings.rs
+++ b/src/postings/postings.rs
@@ -14,10 +14,12 @@ pub trait Postings: DocSet + 'static {
    /// Returns the term frequency
    fn term_freq(&self) -> u32;

-    /// Returns the list of positions of the term, expressed as a list of
-    /// token ordinals.
+    /// Returns the positions offset by a given value.
+    /// The output vector will be resized to the `term_freq`.
    fn positions_with_offset(&mut self, offset: u32, output: &mut Vec<u32>);

+    /// Returns the positions of the term in the given document.
+    /// The output vector will be resized to the `term_freq`.
    fn positions(&mut self, output: &mut Vec<u32>) {
        self.positions_with_offset(0u32, output);
    }
--- a/src/postings/segment_postings.rs
+++ b/src/postings/segment_postings.rs
@@ -52,7 +52,7 @@ impl PositionComputer {
 ///
 /// As we iterate through the `SegmentPostings`, the frequencies are optionally decoded.
 /// Positions on the other hand, are optionally entirely decoded upfront.
-pub struct  SegmentPostings {
+pub struct SegmentPostings {
    block_cursor: BlockSegmentPostings,
    cur: usize,
    position_computer: Option<PositionComputer>,
--- a/src/query/intersection.rs
+++ b/src/query/intersection.rs
@@ -78,7 +78,7 @@ impl<TDocSet: DocSet> Intersection<TDocSet, TDocSet> {


 impl<TDocSet: DocSet> Intersection<TDocSet, TDocSet> {
-    pub fn docset_mut_specialized(&mut self, ord: usize) -> &mut TDocSet {
+    pub(crate) fn docset_mut_specialized(&mut self, ord: usize) -> &mut TDocSet {
        match ord {
            0 => &mut self.left,
            1 => &mut self.right,
@@ -88,7 +88,7 @@ impl<TDocSet: DocSet> Intersection<TDocSet, TDocSet> {
 }

 impl<TDocSet: DocSet, TOtherDocSet: DocSet> Intersection<TDocSet, TOtherDocSet> {
-    pub fn docset_mut(&mut self, ord: usize) -> &mut DocSet {
+    pub(crate) fn docset_mut(&mut self, ord: usize) -> &mut DocSet {
        match ord {
            0 => &mut self.left,
            1 => &mut self.right,