Added comments

This commit is contained in:
Paul Masurel
2018-03-28 08:28:49 +09:00
parent ffa03bad71
commit 8006f1df11
9 changed files with 72 additions and 12 deletions

View File

@@ -51,6 +51,7 @@ impl DeleteBitSet {
}
}
/// Returns whether the document has been marked as deleted.
pub fn is_deleted(&self, doc: DocId) -> bool {
if self.len == 0 {
false

View File

@@ -1,3 +1,21 @@
//! The fieldnorm represents the length associated to
//! a given Field of a given document.
//!
//! This metric is important to compute the score of a
//! document: a document having a query word in one of its short fields
//! (e.g. title) is likely to be more relevant than in one of its longer fields
//! (e.g. body).
//!
//! It encodes `fieldnorm` on one byte with some precision loss,
//! using the exact same scheme as Lucene. Each value is placed on a log-scale
//! that takes values from `0` to `255`.
//!
//! A value on this scale is identified by a `fieldnorm_id`.
//! Apart from compression, this scale also makes it possible to
//! precompute computationally expensive functions of the fieldnorm
//! in a very short array.
//!
//! This trick is used by the BM25 similarity.
mod code;
mod serializer;
mod writer;
@@ -7,4 +25,5 @@ pub use self::reader::FieldNormReader;
pub use self::writer::FieldNormsWriter;
pub use self::serializer::FieldNormsSerializer;
use self::code::{fieldnorm_to_id, id_to_fieldnorm};
use self::code::{fieldnorm_to_id, id_to_fieldnorm};

View File

@@ -2,34 +2,66 @@ use super::{id_to_fieldnorm, fieldnorm_to_id};
use directory::ReadOnlySource;
use DocId;
/// Reads the fieldnorm associated to a document.
///
/// The fieldnorm represents the length associated to
/// a given Field of a given document.
///
/// This metric is important to compute the score of a
/// document: a document having a query word in one of its short fields
/// (e.g. title) is likely to be more relevant than in one of its longer fields
/// (e.g. body).
///
/// tantivy encodes `fieldnorm` on one byte with some precision loss,
/// using the same scheme as Lucene. Each value is placed on a log-scale
/// that takes values from `0` to `255`.
///
/// A value on this scale is identified by a `fieldnorm_id`.
/// Apart from compression, this scale also makes it possible to
/// precompute computationally expensive functions of the fieldnorm
/// in a very short array.
pub struct FieldNormReader {
// Raw fieldnorm data; `fieldnorm_id` reads one `u8` per document from it,
// indexed by doc id.
data: ReadOnlySource
}
impl FieldNormReader {
/// Opens a field norm reader given its data source.
pub fn open(data: ReadOnlySource) -> Self {
FieldNormReader {
data
}
}
/// Returns the `fieldnorm` associated to a doc id.
///
/// The fieldnorm is a value approximating the number
/// of tokens in a given field of the `doc_id`.
/// It is imprecise, and always lower than the actual
/// number of tokens.
///
/// Decoding is a two-step process: the `fieldnorm_id` of the
/// document is fetched first, then converted back into a
/// fieldnorm through `id_to_fieldnorm`.
pub fn fieldnorm(&self, doc_id: DocId) -> u32 {
    id_to_fieldnorm(self.fieldnorm_id(doc_id))
}
/// Returns the `fieldnorm_id` associated to a document.
///
/// The id is the byte found at position `doc_id` in the underlying data.
///
/// NOTE(review): assuming `as_slice()` yields a plain byte slice, a `doc_id`
/// beyond the end of the data panics on the index below; confirm.
#[inline(always)]
pub fn fieldnorm_id(&self, doc_id: DocId) -> u8 {
    let doc_index = doc_id as usize;
    self.data.as_slice()[doc_index]
}
/// Converts a `fieldnorm_id` into a fieldnorm.
///
/// This associated function only forwards to the free function of the
/// same name imported from the parent module. The unqualified call below
/// resolves to that free function, not to this associated function, so
/// there is no recursion.
#[inline(always)]
pub fn id_to_fieldnorm(id: u8) -> u32 {
id_to_fieldnorm(id)
}
/// Converts a `fieldnorm` into a `fieldnorm_id`.
/// (This function is not injective).
#[inline(always)]
pub fn fieldnorm_to_id(fieldnorm: u32) -> u8 {
fieldnorm_to_id(fieldnorm)

View File

@@ -321,11 +321,18 @@ impl IndexMerger {
for (segment_ord, mut segment_postings) in segment_postings {
let old_to_new_doc_id = &merged_doc_id_map[segment_ord];
loop {
let doc = segment_postings.doc();
// `.advance()` has been called once before the loop.
// Hence we cannot use a `while segment_postings.advance()` loop.
if let Some(remapped_doc_id) =
old_to_new_doc_id[segment_postings.doc() as usize]
{
//
// It was required to make sure we only consider segments
// that effectively contain at least one non-deleted document
// and remove terms that do not have documents associated.
//
// For this reason, we cannot use a `while segment_postings.advance()` loop.
// Deleted docs are skipped, as they do not have a `remapped_doc_id`.
if let Some(remapped_doc_id) = old_to_new_doc_id[doc as usize] {
// we make sure to only write the term iff
// there is at least one document.
let term_freq = segment_postings.term_freq();

View File

@@ -292,7 +292,7 @@ mod tests {
use Postings;
use rand::{Rng, SeedableRng, XorShiftRng};
use rand::distributions::{IndependentSample, Range};
/// Asserts that `val` is close to `expected`, as judged by `nearly_equals`.
///
/// Panics with a message showing both values when the check fails.
pub fn assert_nearly_equals(expected: f32, val: f32) {
    let is_close = nearly_equals(val, expected);
    assert!(is_close, "Got {}, expected {}.", val, expected);
}

View File

@@ -443,7 +443,6 @@ pub mod tests {
last = cur;
cur = next;
}
assert_eq!(cur, 377);
}

View File

@@ -14,10 +14,12 @@ pub trait Postings: DocSet + 'static {
/// Returns the term frequency
fn term_freq(&self) -> u32;
/// Returns the list of positions of the term, expressed as a list of
/// token ordinals.
/// Returns the positions offset by a given value.
/// The output vector will be resized to the `term_freq`.
fn positions_with_offset(&mut self, offset: u32, output: &mut Vec<u32>);
/// Returns the positions of the term in the given document.
/// The output vector will be resized to the `term_freq`.
///
/// This default implementation is `positions_with_offset` called
/// with an offset of `0`.
fn positions(&mut self, output: &mut Vec<u32>) {
self.positions_with_offset(0u32, output);
}

View File

@@ -52,7 +52,7 @@ impl PositionComputer {
///
/// As we iterate through the `SegmentPostings`, the frequencies are optionally decoded.
/// Positions on the other hand, are optionally entirely decoded upfront.
pub struct SegmentPostings {
pub struct SegmentPostings {
block_cursor: BlockSegmentPostings,
cur: usize,
position_computer: Option<PositionComputer>,

View File

@@ -78,7 +78,7 @@ impl<TDocSet: DocSet> Intersection<TDocSet, TDocSet> {
impl<TDocSet: DocSet> Intersection<TDocSet, TDocSet> {
pub fn docset_mut_specialized(&mut self, ord: usize) -> &mut TDocSet {
pub(crate) fn docset_mut_specialized(&mut self, ord: usize) -> &mut TDocSet {
match ord {
0 => &mut self.left,
1 => &mut self.right,
@@ -88,7 +88,7 @@ impl<TDocSet: DocSet> Intersection<TDocSet, TDocSet> {
}
impl<TDocSet: DocSet, TOtherDocSet: DocSet> Intersection<TDocSet, TOtherDocSet> {
pub fn docset_mut(&mut self, ord: usize) -> &mut DocSet {
pub(crate) fn docset_mut(&mut self, ord: usize) -> &mut DocSet {
match ord {
0 => &mut self.left,
1 => &mut self.right,