diff --git a/src/postings/term_info.rs b/src/postings/term_info.rs index ab42d7253..6930d6052 100644 --- a/src/postings/term_info.rs +++ b/src/postings/term_info.rs @@ -1,34 +1,25 @@ use common::{BinarySerializable, FixedSize}; use std::io; -/// `TermInfo` contains all of the information -/// associated to terms in the `.term` file. -/// -/// It consists of -/// * `doc_freq` : the number of document in the segment -/// containing this term. It is also the length of the -/// posting list associated to this term -/// * `postings_offset` : an offset in the `.idx` file -/// addressing the start of the posting list associated -/// to this term. +/// `TermInfo` wraps the metadata associated with a `Term`. +/// It is segment-local. #[derive(Debug, Default, Ord, PartialOrd, Eq, PartialEq, Clone)] pub struct TermInfo { /// Number of documents in the segment containing the term pub doc_freq: u32, - /// Offset within the postings (`.idx`) file. + /// Start offset within the postings (`.idx`) file. pub postings_offset: u64, - /// Offset within the position (`.pos`) file. + /// Start offset of the first block within the position (`.pos`) file. pub positions_offset: u64, - /// Offset within the position block. + /// Start offset within this position block. pub positions_inner_offset: u8, } impl FixedSize for TermInfo { - /// Size required for the binary serialization of `TermInfo`. - /// This is large, but in practise, all `TermInfo` but the first one - /// of the block are bitpacked. - /// - /// See `TermInfoStore`. + /// Size required for the binary serialization of a `TermInfo` object. + /// This is large, but in practice, `TermInfo` are encoded in blocks and + /// only the first `TermInfo` of a block is serialized uncompressed. + /// The subsequent `TermInfo` are delta-encoded and bitpacked. 
const SIZE_IN_BYTES: usize = u32::SIZE_IN_BYTES + 2 * u64::SIZE_IN_BYTES + u8::SIZE_IN_BYTES; } diff --git a/src/termdict/mod.rs b/src/termdict/mod.rs index 0cc1010e1..02a331a0a 100644 --- a/src/termdict/mod.rs +++ b/src/termdict/mod.rs @@ -1,50 +1,20 @@ /*! -The term dictionary is one of the key data structures of -tantivy. It associates sorted `terms` to a `TermInfo` struct -that serves as an address to their respective posting list. +The term dictionary's main role is to associate the sorted [`Term`s](../struct.Term.html) to +a [`TermInfo`](../postings/struct.TermInfo.html) struct that contains some meta-information +about the term. -The term dictionary API makes it possible to iterate through -a range of keys in a sorted manner. +Internally, the term dictionary relies on the `fst` crate to store +a sorted mapping that associates each term to its rank in the lexicographical order. +For instance, in a dictionary containing the sorted terms "abba", "bjork", "blur" and "donovan", +the `TermOrdinal` are respectively `0`, `1`, `2`, and `3`. +For `u64`-terms, tantivy explicitly uses a `BigEndian` representation to ensure that the +lexicographical order matches the natural order of integers. -# Implementations +`i64`-terms are transformed to `u64` using a continuous mapping `val ⟶ val - i64::min_value()` +and then treated as a `u64`. -There are currently two implementations of the term dictionary. - -## Default implementation : `fstdict` - -The default one relies heavily on the `fst` crate. -It associate each term's `&[u8]` representation to a `u64` -that is in fact an address in a buffer. The value is then accessible -via deserializing the value at this address. - - -## Stream implementation : `streamdict` - -The `fstdict` is a tiny bit slow when streaming all of -the terms. -For some use case (analytics engine), it is preferrable -to use the `streamdict`, that offers better streaming -performance, to the detriment of `lookup` performance. 
- `streamdict` can be enabled by adding the `streamdict` -feature when compiling `tantivy`. - -`streamdict` encodes each term relatively to the precedent -as follows. - -- number of bytes that needs to be popped. -- number of bytes that needs to be added. -- sequence of bytes that is to be added -- value. - -Because such a structure does not allow for lookups, -it comes with a `fst` that indexes 1 out of `1024` -terms in this structure. - -A `lookup` therefore consists in a lookup in the `fst` -followed by a streaming through at most `1024` elements in the -term `stream`. +A second data structure makes it possible to access a [`TermInfo`](../postings/struct.TermInfo.html). */ /// Position of the term in the sorted list of terms.