Files
tantivy/src/termdict/termdict.rs
2020-04-27 11:56:49 +09:00

206 lines
7.0 KiB
Rust

use super::term_info_store::{TermInfoStore, TermInfoStoreWriter};
use super::{TermStreamer, TermStreamerBuilder};
use crate::common::BinarySerializable;
use crate::common::CountingWriter;
use crate::directory::ReadOnlySource;
use crate::postings::TermInfo;
use crate::termdict::TermOrdinal;
use once_cell::sync::Lazy;
use std::io::{self, Write};
use tantivy_fst::raw::Fst;
use tantivy_fst::Automaton;
fn convert_fst_error(e: tantivy_fst::Error) -> io::Error {
io::Error::new(io::ErrorKind::Other, e)
}
/// Builder for the new term dictionary.
///
/// Inserting must be done in the order of the `keys`.
pub struct TermDictionaryBuilder<W> {
fst_builder: tantivy_fst::MapBuilder<W>,
term_info_store_writer: TermInfoStoreWriter,
term_ord: u64,
}
impl<W> TermDictionaryBuilder<W>
where
W: Write,
{
/// Creates a new `TermDictionaryBuilder`
pub fn create(w: W) -> io::Result<Self> {
let fst_builder = tantivy_fst::MapBuilder::new(w).map_err(convert_fst_error)?;
Ok(TermDictionaryBuilder {
fst_builder,
term_info_store_writer: TermInfoStoreWriter::new(),
term_ord: 0,
})
}
/// Inserts a `(key, value)` pair in the term dictionary.
///
/// *Keys have to be inserted in order.*
pub fn insert<K: AsRef<[u8]>>(&mut self, key_ref: K, value: &TermInfo) -> io::Result<()> {
let key = key_ref.as_ref();
self.insert_key(key)?;
self.insert_value(value)?;
Ok(())
}
/// # Warning
/// Horribly dangerous internal API
///
/// If used, it must be used by systematically alternating calls
/// to insert_key and insert_value.
///
/// Prefer using `.insert(key, value)`
pub(crate) fn insert_key(&mut self, key: &[u8]) -> io::Result<()> {
self.fst_builder
.insert(key, self.term_ord)
.map_err(convert_fst_error)?;
self.term_ord += 1;
Ok(())
}
/// # Warning
///
/// Horribly dangerous internal API. See `.insert_key(...)`.
pub(crate) fn insert_value(&mut self, term_info: &TermInfo) -> io::Result<()> {
self.term_info_store_writer.write_term_info(term_info)?;
Ok(())
}
/// Finalize writing the builder, and returns the underlying
/// `Write` object.
pub fn finish(mut self) -> io::Result<W> {
let mut file = self.fst_builder.into_inner().map_err(convert_fst_error)?;
{
let mut counting_writer = CountingWriter::wrap(&mut file);
self.term_info_store_writer
.serialize(&mut counting_writer)?;
let footer_size = counting_writer.written_bytes();
(footer_size as u64).serialize(&mut counting_writer)?;
counting_writer.flush()?;
}
Ok(file)
}
}
fn open_fst_index(source: ReadOnlySource) -> tantivy_fst::Map<ReadOnlySource> {
let fst = Fst::new(source).expect("FST data is corrupted");
tantivy_fst::Map::from(fst)
}
static EMPTY_DATA_SOURCE: Lazy<ReadOnlySource> = Lazy::new(|| {
let term_dictionary_data: Vec<u8> = TermDictionaryBuilder::create(Vec::<u8>::new())
.expect("Creating a TermDictionaryBuilder in a Vec<u8> should never fail")
.finish()
.expect("Writing in a Vec<u8> should never fail");
ReadOnlySource::from(term_dictionary_data)
});
/// The term dictionary contains all of the terms in
/// `tantivy index` in a sorted manner.
///
/// The `Fst` crate is used to associate terms to their
/// respective `TermOrdinal`. The `TermInfoStore` then makes it
/// possible to fetch the associated `TermInfo`.
pub struct TermDictionary {
fst_index: tantivy_fst::Map<ReadOnlySource>,
term_info_store: TermInfoStore,
}
impl TermDictionary {
/// Opens a `TermDictionary` given a data source.
pub fn from_source(source: &ReadOnlySource) -> Self {
let total_len = source.len();
let length_offset = total_len - 8;
let mut split_len_buffer: &[u8] = &source.as_slice()[length_offset..];
let footer_size = u64::deserialize(&mut split_len_buffer)
.expect("Deserializing 8 bytes should always work") as usize;
let split_len = length_offset - footer_size;
let fst_source = source.slice(0, split_len);
let values_source = source.slice(split_len, length_offset);
let fst_index = open_fst_index(fst_source);
TermDictionary {
fst_index,
term_info_store: TermInfoStore::open(&values_source),
}
}
/// Creates an empty term dictionary which contains no terms.
pub fn empty() -> Self {
TermDictionary::from_source(&*EMPTY_DATA_SOURCE)
}
/// Returns the number of terms in the dictionary.
/// Term ordinals range from 0 to `num_terms() - 1`.
pub fn num_terms(&self) -> usize {
self.term_info_store.num_terms()
}
/// Returns the ordinal associated to a given term.
pub fn term_ord<K: AsRef<[u8]>>(&self, key: K) -> Option<TermOrdinal> {
self.fst_index.get(key)
}
/// Returns the term associated to a given term ordinal.
///
/// Term ordinals are defined as the position of the term in
/// the sorted list of terms.
///
/// Returns true iff the term has been found.
///
/// Regardless of whether the term is found or not,
/// the buffer may be modified.
pub fn ord_to_term(&self, mut ord: TermOrdinal, bytes: &mut Vec<u8>) -> bool {
bytes.clear();
let fst = self.fst_index.as_fst();
let mut node = fst.root();
while ord != 0 || !node.is_final() {
if let Some(transition) = node
.transitions()
.take_while(|transition| transition.out.value() <= ord)
.last()
{
ord -= transition.out.value();
bytes.push(transition.inp);
let new_node_addr = transition.addr;
node = fst.node(new_node_addr);
} else {
return false;
}
}
true
}
/// Returns the number of terms in the dictionary.
pub fn term_info_from_ord(&self, term_ord: TermOrdinal) -> TermInfo {
self.term_info_store.get(term_ord)
}
/// Lookups the value corresponding to the key.
pub fn get<K: AsRef<[u8]>>(&self, key: K) -> Option<TermInfo> {
self.term_ord(key)
.map(|term_ord| self.term_info_from_ord(term_ord))
}
/// Returns a range builder, to stream all of the terms
/// within an interval.
pub fn range(&self) -> TermStreamerBuilder<'_> {
TermStreamerBuilder::new(self, self.fst_index.range())
}
/// A stream of all the sorted terms. [See also `.stream_field()`](#method.stream_field)
pub fn stream(&self) -> TermStreamer<'_> {
self.range().into_stream()
}
/// Returns a search builder, to stream all of the terms
/// within the Automaton
pub fn search<'a, A: Automaton + 'a>(&'a self, automaton: A) -> TermStreamerBuilder<'a, A> {
let stream_builder = self.fst_index.search(automaton);
TermStreamerBuilder::<A>::new(self, stream_builder)
}
}