mirror of
https://github.com/quickwit-oss/tantivy.git
synced 2026-06-03 00:50:41 +00:00
Added comments
This commit is contained in:
@@ -51,6 +51,7 @@ impl DeleteBitSet {
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns whether the document has been marked as deleted.
|
||||
pub fn is_deleted(&self, doc: DocId) -> bool {
|
||||
if self.len == 0 {
|
||||
false
|
||||
|
||||
@@ -1,3 +1,21 @@
|
||||
//! The fieldnorm represents the length associated to
|
||||
//! a given Field of a given document.
|
||||
//!
|
||||
//! This metric is important to compute the score of a
|
||||
//! document: a document having a query word in one of its short fields
|
||||
//! (e.g. title) is likely to be more relevant than in one of its longer fields
|
||||
//! (e.g. body).
|
||||
//!
|
||||
//! It encodes `fieldnorm` on one byte with some precision loss,
|
||||
//! using the exact same scheme as Lucene. Each value is placed on a log-scale
|
||||
//! that takes values from `0` to `255`.
|
||||
//!
|
||||
//! A value on this scale is identified by a `fieldnorm_id`.
|
||||
//! Apart from compression, this scale also makes it possible to
|
||||
//! precompute computationally expensive functions of the fieldnorm
|
||||
//! in a very short array.
|
||||
//!
|
||||
//! This trick is used by the BM25 similarity.
|
||||
mod code;
|
||||
mod serializer;
|
||||
mod writer;
|
||||
@@ -7,4 +25,5 @@ pub use self::reader::FieldNormReader;
|
||||
pub use self::writer::FieldNormsWriter;
|
||||
pub use self::serializer::FieldNormsSerializer;
|
||||
|
||||
use self::code::{fieldnorm_to_id, id_to_fieldnorm};
|
||||
use self::code::{fieldnorm_to_id, id_to_fieldnorm};
|
||||
|
||||
|
||||
@@ -2,34 +2,66 @@ use super::{id_to_fieldnorm, fieldnorm_to_id};
|
||||
use directory::ReadOnlySource;
|
||||
use DocId;
|
||||
|
||||
|
||||
/// Reads the fieldnorm associated to a document.
|
||||
/// The fieldnorm represents the length associated to
|
||||
/// a given Field of a given document.
|
||||
///
|
||||
/// This metric is important to compute the score of a
|
||||
/// document: a document having a query word in one of its short fields
|
||||
/// (e.g. title) is likely to be more relevant than in one of its longer fields
|
||||
/// (e.g. body).
|
||||
///
|
||||
/// tantivy encodes `fieldnorm` on one byte with some precision loss,
|
||||
/// using the same scheme as Lucene. Each value is placed on a log-scale
|
||||
/// that takes values from `0` to `255`.
|
||||
///
|
||||
/// A value on this scale is identified by a `fieldnorm_id`.
|
||||
/// Apart from compression, this scale also makes it possible to
|
||||
/// precompute computationally expensive functions of the fieldnorm
|
||||
/// in a very short array.
|
||||
pub struct FieldNormReader {
|
||||
data: ReadOnlySource
|
||||
}
|
||||
|
||||
impl FieldNormReader {
|
||||
|
||||
/// Opens a field norm reader given its data source.
|
||||
pub fn open(data: ReadOnlySource) -> Self {
|
||||
FieldNormReader {
|
||||
data
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns the `fieldnorm` associated to a doc id.
|
||||
/// The fieldnorm is a value approximating the number
|
||||
/// of tokens in a given field of the `doc_id`.
|
||||
///
|
||||
/// It is imprecise, and always lower than the actual
|
||||
/// number of tokens.
|
||||
///
|
||||
/// The fieldnorm is effectively decoded from the
|
||||
/// `fieldnorm_id` by doing a simple table lookup.
|
||||
pub fn fieldnorm(&self, doc_id: DocId) -> u32 {
|
||||
let fieldnorm_id = self.fieldnorm_id(doc_id);
|
||||
id_to_fieldnorm(fieldnorm_id)
|
||||
}
|
||||
|
||||
/// Returns the `fieldnorm_id` associated to a document.
|
||||
#[inline(always)]
|
||||
pub fn fieldnorm_id(&self, doc_id: DocId) -> u8 {
|
||||
let fielnorms_data = self.data.as_slice();
|
||||
fielnorms_data[doc_id as usize]
|
||||
}
|
||||
|
||||
/// Converts a `fieldnorm_id` into a fieldnorm.
|
||||
#[inline(always)]
|
||||
pub fn id_to_fieldnorm(id: u8) -> u32 {
|
||||
id_to_fieldnorm(id)
|
||||
}
|
||||
|
||||
/// Converts a `fieldnorm` into a `fieldnorm_id`.
|
||||
/// (This function is not injective).
|
||||
#[inline(always)]
|
||||
pub fn fieldnorm_to_id(fieldnorm: u32) -> u8 {
|
||||
fieldnorm_to_id(fieldnorm)
|
||||
|
||||
@@ -321,11 +321,18 @@ impl IndexMerger {
|
||||
for (segment_ord, mut segment_postings) in segment_postings {
|
||||
let old_to_new_doc_id = &merged_doc_id_map[segment_ord];
|
||||
loop {
|
||||
let doc = segment_postings.doc();
|
||||
|
||||
// `.advance()` has been called once before the loop.
|
||||
// Hence we cannot use a `while segment_postings.advance()` loop.
|
||||
if let Some(remapped_doc_id) =
|
||||
old_to_new_doc_id[segment_postings.doc() as usize]
|
||||
{
|
||||
//
|
||||
// It was required to make sure we only consider segments
|
||||
// that effectively contain at least one non-deleted document
|
||||
// and remove terms that do not have documents associated.
|
||||
//
|
||||
// For this reason, we cannot use a `while segment_postings.advance()` loop.
|
||||
|
||||
// deleted docs are skipped as they do not have a `remapped_doc_id`.
|
||||
if let Some(remapped_doc_id) = old_to_new_doc_id[doc as usize] {
|
||||
// we make sure to only write the term iff
|
||||
// there is at least one document.
|
||||
let term_freq = segment_postings.term_freq();
|
||||
|
||||
@@ -292,7 +292,7 @@ mod tests {
|
||||
use Postings;
|
||||
use rand::{Rng, SeedableRng, XorShiftRng};
|
||||
use rand::distributions::{IndependentSample, Range};
|
||||
|
||||
|
||||
pub fn assert_nearly_equals(expected: f32, val: f32) {
|
||||
assert!(nearly_equals(val, expected), "Got {}, expected {}.", val, expected);
|
||||
}
|
||||
|
||||
@@ -443,7 +443,6 @@ pub mod tests {
|
||||
last = cur;
|
||||
cur = next;
|
||||
}
|
||||
|
||||
assert_eq!(cur, 377);
|
||||
}
|
||||
|
||||
|
||||
@@ -14,10 +14,12 @@ pub trait Postings: DocSet + 'static {
|
||||
/// Returns the term frequency
|
||||
fn term_freq(&self) -> u32;
|
||||
|
||||
/// Returns the list of positions of the term, expressed as a list of
|
||||
/// token ordinals.
|
||||
/// Returns the positions offset by a given value.
|
||||
/// The output vector will be resized to the `term_freq`.
|
||||
fn positions_with_offset(&mut self, offset: u32, output: &mut Vec<u32>);
|
||||
|
||||
/// Returns the positions of the term in the given document.
|
||||
/// The output vector will be resized to the `term_freq`.
|
||||
fn positions(&mut self, output: &mut Vec<u32>) {
|
||||
self.positions_with_offset(0u32, output);
|
||||
}
|
||||
|
||||
@@ -52,7 +52,7 @@ impl PositionComputer {
|
||||
///
|
||||
/// As we iterate through the `SegmentPostings`, the frequencies are optionally decoded.
|
||||
/// Positions on the other hand, are optionally entirely decoded upfront.
|
||||
pub struct SegmentPostings {
|
||||
pub struct SegmentPostings {
|
||||
block_cursor: BlockSegmentPostings,
|
||||
cur: usize,
|
||||
position_computer: Option<PositionComputer>,
|
||||
|
||||
@@ -78,7 +78,7 @@ impl<TDocSet: DocSet> Intersection<TDocSet, TDocSet> {
|
||||
|
||||
|
||||
impl<TDocSet: DocSet> Intersection<TDocSet, TDocSet> {
|
||||
pub fn docset_mut_specialized(&mut self, ord: usize) -> &mut TDocSet {
|
||||
pub(crate) fn docset_mut_specialized(&mut self, ord: usize) -> &mut TDocSet {
|
||||
match ord {
|
||||
0 => &mut self.left,
|
||||
1 => &mut self.right,
|
||||
@@ -88,7 +88,7 @@ impl<TDocSet: DocSet> Intersection<TDocSet, TDocSet> {
|
||||
}
|
||||
|
||||
impl<TDocSet: DocSet, TOtherDocSet: DocSet> Intersection<TDocSet, TOtherDocSet> {
|
||||
pub fn docset_mut(&mut self, ord: usize) -> &mut DocSet {
|
||||
pub(crate) fn docset_mut(&mut self, ord: usize) -> &mut DocSet {
|
||||
match ord {
|
||||
0 => &mut self.left,
|
||||
1 => &mut self.right,
|
||||
|
||||
Reference in New Issue
Block a user