mirror of
https://github.com/quickwit-oss/tantivy.git
synced 2026-01-07 17:42:55 +00:00
issue/96 Added unit test, documentation and various tiny improvements.
This commit is contained in:
@@ -3,16 +3,15 @@ mod timer;
|
||||
mod vint;
|
||||
pub mod bitpacker;
|
||||
|
||||
|
||||
pub use self::serialize::BinarySerializable;
|
||||
pub use self::timer::Timing;
|
||||
pub use self::timer::TimerTree;
|
||||
pub use self::timer::OpenTimer;
|
||||
pub use self::vint::VInt;
|
||||
|
||||
|
||||
use std::io;
|
||||
|
||||
/// Creates an `io::Error` from a message.
///
/// The returned error has kind `io::ErrorKind::Other` and carries `msg`
/// as its payload.
|
||||
pub fn make_io_err(msg: String) -> io::Error {
|
||||
io::Error::new(io::ErrorKind::Other, msg)
|
||||
}
|
||||
@@ -30,7 +29,11 @@ pub trait HasLen {
|
||||
}
|
||||
|
||||
|
||||
pub fn create_vec_with_len<T>(capacity: usize) -> Vec<T> {
|
||||
/// Creates an uninitialized `Vec` of the given length.
|
||||
///
|
||||
/// `allocate_vec` does an unsafe call to `set_len`
|
||||
/// as other solutions are extremely slow in debug mode.
|
||||
pub fn allocate_vec<T>(capacity: usize) -> Vec<T> {
|
||||
let mut v = Vec::with_capacity(capacity);
|
||||
unsafe {
|
||||
v.set_len(capacity);
|
||||
|
||||
@@ -153,9 +153,8 @@ impl Index {
|
||||
|
||||
/// Returns the list of segments that are searchable
|
||||
pub fn searchable_segments(&self) -> Result<Vec<Segment>> {
|
||||
let metas = load_metas(self.directory())?;
|
||||
Ok(metas
|
||||
.segments
|
||||
Ok(self
|
||||
.searchable_segment_metas()?
|
||||
.into_iter()
|
||||
.map(|segment_meta| self.segment(segment_meta))
|
||||
.collect())
|
||||
@@ -183,18 +182,17 @@ impl Index {
|
||||
}
|
||||
|
||||
/// Reads the meta.json and returns the list of
|
||||
/// segments in the last commit.
|
||||
pub fn segments(&self) -> Result<Vec<SegmentMeta>> {
|
||||
/// `SegmentMeta` from the last commit.
|
||||
pub fn searchable_segment_metas(&self) -> Result<Vec<SegmentMeta>> {
|
||||
Ok(load_metas(self.directory())?.segments)
|
||||
}
|
||||
|
||||
/// Returns the list of segment ids that are searchable.
|
||||
pub fn searchable_segment_ids(&self) -> Result<Vec<SegmentId>> {
|
||||
Ok(load_metas(self.directory())?
|
||||
.segments
|
||||
.iter()
|
||||
.map(|segment_meta| segment_meta.id())
|
||||
.collect())
|
||||
Ok(self.searchable_segment_metas()?
|
||||
.iter()
|
||||
.map(|segment_meta| segment_meta.id())
|
||||
.collect())
|
||||
}
|
||||
|
||||
/// Creates a new generation of searchers after
|
||||
|
||||
@@ -25,7 +25,20 @@ pub use self::term_iterator::TermIterator;
|
||||
use std::path::PathBuf;
|
||||
|
||||
lazy_static! {
|
||||
/// The meta file contains all the information about the list of segments and the schema
|
||||
/// of the index.
|
||||
pub static ref META_FILEPATH: PathBuf = PathBuf::from("meta.json");
|
||||
|
||||
/// The managed file contains a list of files that were created by tantivy
|
||||
/// and will therefore be garbage collected when they are deemed useless by tantivy.
|
||||
///
|
||||
/// Removing this file is safe, but will prevent the garbage collection of all of the files that
|
||||
/// are currently in the directory
|
||||
pub static ref MANAGED_FILEPATH: PathBuf = PathBuf::from(".managed.json");
|
||||
|
||||
/// Only one process should be able to write tantivy's index at a time.
|
||||
/// This file, when present, is in charge of preventing other processes from opening an `IndexWriter`.
|
||||
///
|
||||
/// If the process is killed and this file remains, it is safe to remove it manually.
|
||||
pub static ref LOCKFILE_FILEPATH: PathBuf = PathBuf::from(".tantivy-indexer.lock");
|
||||
}
|
||||
@@ -43,10 +43,6 @@ impl Segment {
|
||||
self.index.schema()
|
||||
}
|
||||
|
||||
pub fn index(&self,) -> &Index {
|
||||
&self.index
|
||||
}
|
||||
|
||||
/// Returns the segment meta-information
|
||||
pub fn meta(&self) -> &SegmentMeta {
|
||||
&self.meta
|
||||
@@ -70,19 +66,25 @@ impl Segment {
|
||||
self.meta.relative_path(component)
|
||||
}
|
||||
|
||||
|
||||
/// Protects a specific component file from being deleted.
|
||||
///
|
||||
/// Returns a FileProtection object. The file is guaranteed
|
||||
/// to not be garbage collected as long as this `FileProtection` object
|
||||
/// lives.
|
||||
pub fn protect_from_delete(&self, component: SegmentComponent) -> FileProtection {
|
||||
let path = self.relative_path(component);
|
||||
self.index.directory().protect_file_from_delete(&path)
|
||||
}
|
||||
|
||||
/// Open one of the component file for read.
|
||||
/// Opens one of the component files for a *regular* read.
|
||||
pub fn open_read(&self, component: SegmentComponent) -> result::Result<ReadOnlySource, FileError> {
|
||||
let path = self.relative_path(component);
|
||||
let source = try!(self.index.directory().open_read(&path));
|
||||
Ok(source)
|
||||
}
|
||||
|
||||
/// Open one of the component file for write.
|
||||
/// Opens one of the component files for a *regular* write.
|
||||
pub fn open_write(&mut self, component: SegmentComponent) -> result::Result<WritePtr, OpenWriteError> {
|
||||
let path = self.relative_path(component);
|
||||
let write = try!(self.index.directory_mut().open_write(&path));
|
||||
@@ -102,4 +104,65 @@ pub trait SerializableSegment {
|
||||
#[derive(Clone,Debug,RustcDecodable,RustcEncodable)]
|
||||
pub struct SegmentInfo {
|
||||
pub max_doc: DocId,
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
|
||||
use core::SegmentComponent;
|
||||
use std::path::Path;
|
||||
use directory::Directory;
|
||||
use schema::{SchemaBuilder, Document, FieldValue, TEXT, Term};
|
||||
use Index;
|
||||
|
||||
#[test]
|
||||
fn test_segment_protect_component() {
|
||||
let mut schema_builder = SchemaBuilder::default();
|
||||
let text_field = schema_builder.add_text_field("text", TEXT);
|
||||
let schema = schema_builder.build();
|
||||
let index = Index::create_in_ram(schema);
|
||||
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
|
||||
|
||||
{
|
||||
// simply creating two segments
|
||||
// with one delete to create the DELETE file.
|
||||
{
|
||||
let doc1 = doc!(text_field=>"a");
|
||||
index_writer.add_document(doc1);
|
||||
let doc2 = doc!(text_field=>"b");
|
||||
index_writer.add_document(doc2);
|
||||
assert!(index_writer.commit().is_ok());
|
||||
}
|
||||
{
|
||||
index_writer.delete_term(Term::from_field_text(text_field, "a"));
|
||||
assert!(index_writer.commit().is_ok());
|
||||
}
|
||||
}
|
||||
|
||||
let segments = index.searchable_segments().unwrap();
|
||||
let directory = index.directory().clone();
|
||||
assert_eq!(segments.len(), 1);
|
||||
|
||||
|
||||
let delete_file_path = Path::new("00000000000000000000000000000000.4.del");
|
||||
let idx_file_path = Path::new("00000000000000000000000000000000.term");
|
||||
assert!(directory.exists(&*delete_file_path));
|
||||
assert!(directory.exists(&*idx_file_path));
|
||||
|
||||
{
|
||||
let _file_protection = segments[0].protect_from_delete(SegmentComponent::DELETE);
|
||||
index_writer.delete_term(Term::from_field_text(text_field, "b"));
|
||||
index_writer.commit().unwrap();
|
||||
// the delete file is protected, and should not be gc'ed.
|
||||
assert!(directory.exists(&*delete_file_path));
|
||||
}
|
||||
|
||||
index_writer.commit().unwrap();
|
||||
|
||||
// at this point the protection is released.
|
||||
assert!(!directory.exists(&*delete_file_path));
|
||||
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
@@ -1,6 +1,6 @@
|
||||
use std::cell::UnsafeCell;
|
||||
use std::mem;
|
||||
use common::create_vec_with_len;
|
||||
use common::allocate_vec;
|
||||
use std::ptr;
|
||||
|
||||
/// `BytesRef` refers to a slice in tantivy's custom `Heap`.
|
||||
@@ -105,18 +105,11 @@ struct InnerHeap {
|
||||
next_heap: Option<Box<InnerHeap>>,
|
||||
}
|
||||
|
||||
/// initializing a long Vec<u8> is crazy slow in
|
||||
/// debug mode.
|
||||
/// We use this unsafe trick to make unit test
|
||||
/// way faster.
|
||||
fn allocate_fast(num_bytes: usize) -> Vec<u8> {
|
||||
create_vec_with_len(num_bytes)
|
||||
}
|
||||
|
||||
impl InnerHeap {
|
||||
|
||||
pub fn with_capacity(num_bytes: usize) -> InnerHeap {
|
||||
let buffer: Vec<u8> = allocate_fast(num_bytes);
|
||||
let buffer: Vec<u8> = allocate_vec(num_bytes);
|
||||
InnerHeap {
|
||||
buffer: buffer,
|
||||
buffer_len: num_bytes as u32,
|
||||
|
||||
@@ -35,11 +35,26 @@ struct MetaInformation {
|
||||
protected_files: HashMap<PathBuf, usize>,
|
||||
}
|
||||
|
||||
|
||||
/// A `FileProtection` prevents the garbage collection of a file.
|
||||
///
|
||||
/// See `ManagedDirectory.protect_file_from_delete`.
|
||||
pub struct FileProtection {
|
||||
directory: ManagedDirectory,
|
||||
path: PathBuf,
|
||||
}
|
||||
|
||||
/// Releases one protection on `path` in the given `directory`.
///
/// Takes the write lock on the directory's meta informations and, if
/// `path` has an entry in `protected_files`, decrements its counter by one.
/// Paths without an entry are left untouched.
///
/// # Panics
///
/// Panics with "Managed file lock poisoned" if the lock is poisoned.
// NOTE(review): the decrement is unchecked and the entry is not removed
// here; this presumably relies on a matching increment in
// `protect_file_from_delete` — confirm against that method.
fn unprotect_file_from_delete(directory: &ManagedDirectory, path: &Path) {
|
||||
let mut meta_informations_wlock = directory.meta_informations
|
||||
.write()
|
||||
.expect("Managed file lock poisoned");
|
||||
if let Some(counter_ref_mut) = meta_informations_wlock
|
||||
.protected_files
|
||||
.get_mut(path) {
|
||||
(*counter_ref_mut) -= 1;
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
impl fmt::Debug for FileProtection {
|
||||
fn fmt(&self, formatter: &mut fmt::Formatter) -> result::Result<(), fmt::Error> {
|
||||
write!(formatter, "FileProtectionFor({:?})", self.path)
|
||||
@@ -48,7 +63,7 @@ impl fmt::Debug for FileProtection {
|
||||
|
||||
impl Drop for FileProtection {
|
||||
fn drop(&mut self) {
|
||||
self.directory.unprotect_file_from_delete(&self.path);
|
||||
unprotect_file_from_delete(&self.directory, &*self.path);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -152,6 +167,12 @@ impl ManagedDirectory {
|
||||
|
||||
}
|
||||
|
||||
|
||||
/// Protects a file from being garbage collected.
|
||||
///
|
||||
/// The method returns a `FileProtection` object.
|
||||
/// The file will not be garbage collected as long as the
|
||||
/// `FileProtection` object is kept alive.
|
||||
pub fn protect_file_from_delete(&self, path: &Path) -> FileProtection {
|
||||
let mut meta_informations_wlock = self.meta_informations
|
||||
.write()
|
||||
@@ -167,16 +188,6 @@ impl ManagedDirectory {
|
||||
}
|
||||
}
|
||||
|
||||
pub fn unprotect_file_from_delete(&self, path: &Path) {
|
||||
let mut meta_informations_wlock = self.meta_informations
|
||||
.write()
|
||||
.expect("Managed file lock poisoned");
|
||||
if let Some(counter_ref_mut) = meta_informations_wlock
|
||||
.protected_files
|
||||
.get_mut(path) {
|
||||
(*counter_ref_mut) -= 1;
|
||||
}
|
||||
}
|
||||
|
||||
/// Saves the file containing the list of existing files
|
||||
/// that were created by tantivy.
|
||||
@@ -358,4 +369,28 @@ mod tests {
|
||||
|
||||
}
|
||||
|
||||
|
||||
#[test]
|
||||
fn test_managed_directory_protect() {
|
||||
let tempdir = TempDir::new("index").unwrap();
|
||||
let tempdir_path = PathBuf::from(tempdir.path());
|
||||
let living_files = HashSet::new();
|
||||
|
||||
let mmap_directory = MmapDirectory::open(&tempdir_path).unwrap();
|
||||
let mut managed_directory = ManagedDirectory::new(mmap_directory).unwrap();
|
||||
managed_directory.atomic_write(*TEST_PATH1, &vec!(0u8,1u8)).unwrap();
|
||||
assert!(managed_directory.exists(*TEST_PATH1));
|
||||
|
||||
{
|
||||
let _file_protection = managed_directory.protect_file_from_delete(*TEST_PATH1);
|
||||
managed_directory.garbage_collect(living_files.clone());
|
||||
assert!(managed_directory.exists(*TEST_PATH1));
|
||||
}
|
||||
|
||||
managed_directory.garbage_collect(living_files.clone());
|
||||
assert!(!managed_directory.exists(*TEST_PATH1));
|
||||
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@@ -197,9 +197,6 @@ pub fn compute_deleted_bitset(
|
||||
}
|
||||
|
||||
|
||||
// TODO put delete bitset in segment entry
|
||||
// rather than DocToOpstamp.
|
||||
|
||||
// TODO skip delete operation before the
|
||||
// last delete opstamp
|
||||
|
||||
|
||||
@@ -17,9 +17,7 @@ use fastfield::FastFieldSerializer;
|
||||
use store::StoreWriter;
|
||||
use core::SegmentInfo;
|
||||
use std::cmp::{min, max};
|
||||
use std::iter;
|
||||
|
||||
|
||||
use common::allocate_vec;
|
||||
|
||||
pub struct IndexMerger {
|
||||
schema: Schema,
|
||||
@@ -35,9 +33,7 @@ struct DeltaPositionComputer {
|
||||
impl DeltaPositionComputer {
|
||||
fn new() -> DeltaPositionComputer {
|
||||
DeltaPositionComputer {
|
||||
buffer: iter::repeat(0u32)
|
||||
.take(512)
|
||||
.collect::<Vec<u32>>()
|
||||
buffer: allocate_vec(512)
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -171,7 +171,7 @@ impl SegmentUpdater {
|
||||
pub fn new(index: Index,
|
||||
stamper: Stamper,
|
||||
delete_cursor: DeleteCursor) -> Result<SegmentUpdater> {
|
||||
let segments = index.segments()?;
|
||||
let segments = index.searchable_segment_metas()?;
|
||||
let segment_manager = SegmentManager::from_segments(segments, delete_cursor);
|
||||
Ok(
|
||||
SegmentUpdater(Arc::new(InnerSegmentUpdater {
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
use std::fmt;
|
||||
|
||||
use common::BinarySerializable;
|
||||
use common::create_vec_with_len;
|
||||
use common::allocate_vec;
|
||||
use byteorder::{BigEndian, ByteOrder};
|
||||
use super::Field;
|
||||
use std::str;
|
||||
@@ -45,7 +45,7 @@ impl Term {
|
||||
/// The first byte is `1`, and the 4 following bytes are that of the u32.
|
||||
pub fn from_field_u32(field: Field, val: u32) -> Term {
|
||||
const U32_TERM_LEN: usize = 1 + 4;
|
||||
let mut buffer = create_vec_with_len(U32_TERM_LEN);
|
||||
let mut buffer = allocate_vec(U32_TERM_LEN);
|
||||
buffer[0] = field.0;
|
||||
// we want BigEndian here to have lexicographic order
|
||||
// match the natural order of vals.
|
||||
|
||||
Reference in New Issue
Block a user