mirror of
https://github.com/quickwit-oss/tantivy.git
synced 2026-05-27 13:40:49 +00:00
issue/174 Added doc, and made field private
This commit is contained in:
@@ -31,6 +31,11 @@ pub trait HasLen {
|
||||
const HIGHEST_BIT: u64 = 1 << 63;
|
||||
|
||||
|
||||
/// Maps an `i64` to `u64`
|
||||
///
|
||||
/// For simplicity, tantivy internally handles `i64` as `u64`.
|
||||
/// The mapping is defined by this function.
|
||||
///
|
||||
/// Maps `i64` to `u64` so that
|
||||
/// `-2^63 .. 2^63-1` is mapped
|
||||
/// to
|
||||
@@ -43,13 +48,15 @@ const HIGHEST_BIT: u64 = 1 << 63;
|
||||
/// Imagine a list of `i64` ranging from -10 to 10.
|
||||
/// When casting negative values, the negative values are projected
|
||||
/// to values over 2^63, and all values end up requiring 64 bits.
|
||||
///
|
||||
/// # See also
|
||||
/// The [reverse mapping is `u64_to_i64`](./fn.u64_to_i64.html).
|
||||
#[inline(always)]
|
||||
pub fn i64_to_u64(val: i64) -> u64 {
|
||||
(val as u64) ^ HIGHEST_BIT
|
||||
}
|
||||
|
||||
/// Reverse the mapping given by
|
||||
/// `i64_to_u64`.
|
||||
/// Reverse the mapping given by [`i64_to_u64`](./fn.i64_to_u64.html).
|
||||
#[inline(always)]
|
||||
pub fn u64_to_i64(val: u64) -> i64 {
|
||||
(val ^ HIGHEST_BIT) as i64
|
||||
|
||||
@@ -1,9 +1,43 @@
|
||||
/*!
|
||||
Tantivy's store is a compressed, row-oriented storage.
|
||||
|
||||
A field needs to be marked as stored in the schema in
|
||||
order to be handled in the `Store`.
|
||||
|
||||
Internally, documents (or rather their stored fields) are serialized to a buffer.
|
||||
When the buffer exceeds 16K, the buffer is compressed using `LZ4`
|
||||
and the resulting block is written to disk.
|
||||
|
||||
One can then request a specific `DocId`.
|
||||
A skip list helps navigate to the right block,
|
||||
which is then decompressed entirely to return the document within it.
|
||||
|
||||
If the last document requested was in the same block,
|
||||
the reader is smart enough to avoid decompressing
|
||||
the block a second time, but there is no real
|
||||
*uncompressed block* cache.
|
||||
|
||||
A typical use case for the store is, once
|
||||
the search result page has been computed, returning
|
||||
the actual content of the 10 best documents.
|
||||
|
||||
# Usage
|
||||
|
||||
Most users should not access the `StoreReader` directly
|
||||
and should rely on either
|
||||
|
||||
- at the segment level, the [`SegmentReader`'s `doc` method](../struct.SegmentReader.html#method.doc)
|
||||
- at the index level, the [`Searcher`'s `doc` method](../struct.Searcher.html#method.doc)
|
||||
|
||||
!*/
|
||||
|
||||
mod reader;
|
||||
mod writer;
|
||||
pub use self::reader::StoreReader;
|
||||
pub use self::writer::StoreWriter;
|
||||
|
||||
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
|
||||
|
||||
@@ -11,16 +11,20 @@ use std::io::{self, Read};
|
||||
use datastruct::SkipList;
|
||||
use lz4;
|
||||
|
||||
|
||||
/// Reads documents off tantivy's [`Store`](./index.html)
|
||||
#[derive(Clone)]
|
||||
pub struct StoreReader {
|
||||
pub data: ReadOnlySource,
|
||||
pub offset_index_source: ReadOnlySource,
|
||||
data: ReadOnlySource,
|
||||
offset_index_source: ReadOnlySource,
|
||||
current_block_offset: RefCell<usize>,
|
||||
current_block: RefCell<Vec<u8>>,
|
||||
pub max_doc: DocId,
|
||||
max_doc: DocId,
|
||||
}
|
||||
|
||||
impl StoreReader {
|
||||
|
||||
/// Opens a store reader
|
||||
pub fn from_source(data: ReadOnlySource) -> StoreReader {
|
||||
let (data_source, offset_index_source, max_doc) = split_source(data);
|
||||
StoreReader {
|
||||
@@ -55,20 +59,27 @@ impl StoreReader {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Reads a given document.
|
||||
///
|
||||
/// Calling `.get(doc)` is relatively costly as it requires
|
||||
/// decompressing an LZ4-compressed block.
|
||||
///
|
||||
/// It should not be called to score documents
|
||||
/// for instance.
|
||||
pub fn get(&self, doc_id: DocId) -> Result<Document> {
|
||||
let (first_doc_id, block_offset) = self.block_offset(doc_id);
|
||||
try!(self.read_block(block_offset as usize));
|
||||
self.read_block(block_offset as usize)?;
|
||||
let current_block_mut = self.current_block.borrow_mut();
|
||||
let mut cursor = &current_block_mut[..];
|
||||
for _ in first_doc_id..doc_id {
|
||||
let block_length = try!(u32::deserialize(&mut cursor));
|
||||
cursor = &cursor[block_length as usize..];
|
||||
}
|
||||
try!(u32::deserialize(&mut cursor));
|
||||
u32::deserialize(&mut cursor)?;
|
||||
let num_fields = u32::deserialize(&mut cursor)?;
|
||||
let mut field_values = Vec::new();
|
||||
let num_fields = try!(u32::deserialize(&mut cursor));
|
||||
for _ in 0..num_fields {
|
||||
let field_value = try!(FieldValue::deserialize(&mut cursor));
|
||||
let field_value = FieldValue::deserialize(&mut cursor)?;
|
||||
field_values.push(field_value);
|
||||
}
|
||||
Ok(Document::from(field_values))
|
||||
@@ -83,7 +94,5 @@ fn split_source(data: ReadOnlySource) -> (ReadOnlySource, ReadOnlySource, DocId)
|
||||
let offset = u64::deserialize(&mut serialized_offset_buf).unwrap();
|
||||
let offset = offset as usize;
|
||||
let max_doc = u32::deserialize(&mut serialized_offset_buf).unwrap();
|
||||
let res = (data.slice(0, offset), data.slice(offset, footer_offset), max_doc);
|
||||
drop(data);
|
||||
res
|
||||
(data.slice(0, offset), data.slice(offset, footer_offset), max_doc)
|
||||
}
|
||||
|
||||
@@ -8,6 +8,15 @@ use datastruct::SkipListBuilder;
|
||||
|
||||
const BLOCK_SIZE: usize = 16_384;
|
||||
|
||||
|
||||
/// Writes tantivy's [`Store`](./index.html)
|
||||
///
|
||||
/// Contrary to the other components of `tantivy`,
|
||||
/// the store is written to disc as documents are being added,
|
||||
/// as opposed to when the segment is getting finalized.
|
||||
///
|
||||
/// The skip list index, on the other hand, is built in memory.
|
||||
///
|
||||
pub struct StoreWriter {
|
||||
doc: DocId,
|
||||
written: u64,
|
||||
@@ -19,6 +28,11 @@ pub struct StoreWriter {
|
||||
|
||||
|
||||
impl StoreWriter {
|
||||
|
||||
/// Create a store writer.
|
||||
///
|
||||
/// The store writer will write blocks on disc as
|
||||
/// documents are added.
|
||||
pub fn new(writer: WritePtr) -> StoreWriter {
|
||||
StoreWriter {
|
||||
doc: 0,
|
||||
@@ -30,6 +44,11 @@ impl StoreWriter {
|
||||
}
|
||||
}
|
||||
|
||||
/// Store a new document.
|
||||
///
|
||||
/// The document id is implicitly the number of times
|
||||
/// this method has been called.
|
||||
///
|
||||
pub fn store<'a>(&mut self, field_values: &[&'a FieldValue]) -> io::Result<()> {
|
||||
self.intermediary_buffer.clear();
|
||||
try!((field_values.len() as u32).serialize(&mut self.intermediary_buffer));
|
||||
@@ -62,6 +81,11 @@ impl StoreWriter {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
|
||||
/// Finalizes the store writer.
|
||||
///
|
||||
/// Compresses the last unfinished block if any,
|
||||
/// and serializes the skip list index on disc.
|
||||
pub fn close(mut self) -> io::Result<()> {
|
||||
if !self.current_block.is_empty() {
|
||||
try!(self.write_and_compress_block());
|
||||
|
||||
Reference in New Issue
Block a user