From e0a39fb273d4db03a637dec31f040971468ff9ff Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Tue, 4 Apr 2017 22:43:35 +0900 Subject: [PATCH] issue/96 Added unit test, documentation and various tiny improvements. --- src/common/mod.rs | 9 ++-- src/core/index.rs | 18 ++++--- src/core/mod.rs | 13 ++++++ src/core/segment.rs | 75 +++++++++++++++++++++++++++--- src/datastruct/stacker/heap.rs | 11 +---- src/directory/managed_directory.rs | 57 ++++++++++++++++++----- src/indexer/index_writer.rs | 3 -- src/indexer/merger.rs | 8 +--- src/indexer/segment_updater.rs | 2 +- src/schema/term.rs | 4 +- 10 files changed, 149 insertions(+), 51 deletions(-) diff --git a/src/common/mod.rs b/src/common/mod.rs index d2d41cef6..c99c37f85 100644 --- a/src/common/mod.rs +++ b/src/common/mod.rs @@ -3,16 +3,15 @@ mod timer; mod vint; pub mod bitpacker; - pub use self::serialize::BinarySerializable; pub use self::timer::Timing; pub use self::timer::TimerTree; pub use self::timer::OpenTimer; pub use self::vint::VInt; - use std::io; +/// Create a default io error given a string. pub fn make_io_err(msg: String) -> io::Error { io::Error::new(io::ErrorKind::Other, msg) } @@ -30,7 +29,11 @@ pub trait HasLen { } -pub fn create_vec_with_len(capacity: usize) -> Vec { +/// Creates an uninitialized Vec of a given usize +/// +/// `allocate_vec` does an unsafe call to `set_len` +/// as other solution are extremely slow in debug mode. +pub fn allocate_vec(capacity: usize) -> Vec { let mut v = Vec::with_capacity(capacity); unsafe { v.set_len(capacity); diff --git a/src/core/index.rs b/src/core/index.rs index c09baba9d..71565f321 100644 --- a/src/core/index.rs +++ b/src/core/index.rs @@ -153,9 +153,8 @@ impl Index { /// Returns the list of segments that are searchable pub fn searchable_segments(&self) -> Result> { - let metas = load_metas(self.directory())?; - Ok(metas - .segments + Ok(self + .searchable_segment_metas()? .into_iter() .map(|segment_meta| self.segment(segment_meta)) .collect()) @@ -183,18 +182,17 @@ impl Index { } /// Reads the meta.json and returns the list of - /// segments in the last commit. - pub fn segments(&self) -> Result> { + /// `SegmentMeta` from the last commit. + pub fn searchable_segment_metas(&self) -> Result> { Ok(load_metas(self.directory())?.segments) } /// Returns the list of segment ids that are searchable. pub fn searchable_segment_ids(&self) -> Result> { - Ok(load_metas(self.directory())? - .segments - .iter() - .map(|segment_meta| segment_meta.id()) - .collect()) + Ok(self.searchable_segment_metas()? + .iter() + .map(|segment_meta| segment_meta.id()) + .collect()) } /// Creates a new generation of searchers after diff --git a/src/core/mod.rs b/src/core/mod.rs index 6b37c5542..d0a3964a4 100644 --- a/src/core/mod.rs +++ b/src/core/mod.rs @@ -25,7 +25,20 @@ pub use self::term_iterator::TermIterator; use std::path::PathBuf; lazy_static! { + /// The meta file contains all the information about the list of segments and the schema + /// of the index. pub static ref META_FILEPATH: PathBuf = PathBuf::from("meta.json"); + + /// The managed file contains a list of files that were created by the tantivy + /// and will therefore be garbage collected when they are deemed useless by tantivy. + /// + /// Removing this file is safe, but will prevent the garbage collection of all of the file that + /// are currently in the directory pub static ref MANAGED_FILEPATH: PathBuf = PathBuf::from(".managed.json"); + + /// Only one process should be able to write tantivy's index at a time. + /// This file, when present, is in charge of preventing other processes to open an IndexWriter. + /// + /// If the process is killed and this file remains, it is safe to remove it manually. pub static ref LOCKFILE_FILEPATH: PathBuf = PathBuf::from(".tantivy-indexer.lock"); } \ No newline at end of file diff --git a/src/core/segment.rs b/src/core/segment.rs index 7baf9516c..49f811f90 100644 --- a/src/core/segment.rs +++ b/src/core/segment.rs @@ -43,10 +43,6 @@ impl Segment { self.index.schema() } - pub fn index(&self,) -> &Index { - &self.index - } - /// Returns the segment meta-information pub fn meta(&self) -> &SegmentMeta { &self.meta @@ -70,19 +66,25 @@ impl Segment { self.meta.relative_path(component) } + + /// Protects a specific component file from being deleted. + /// + /// Returns a FileProtection object. The file is guaranteed + /// to not be garbage collected as long as this `FileProtection` object + /// lives. pub fn protect_from_delete(&self, component: SegmentComponent) -> FileProtection { let path = self.relative_path(component); self.index.directory().protect_file_from_delete(&path) } - /// Open one of the component file for read. + /// Open one of the component file for a *regular* read. pub fn open_read(&self, component: SegmentComponent) -> result::Result { let path = self.relative_path(component); let source = try!(self.index.directory().open_read(&path)); Ok(source) } - /// Open one of the component file for write. + /// Open one of the component file for *regular* write. pub fn open_write(&mut self, component: SegmentComponent) -> result::Result { let path = self.relative_path(component); let write = try!(self.index.directory_mut().open_write(&path)); @@ -102,4 +104,65 @@ pub trait SerializableSegment { #[derive(Clone,Debug,RustcDecodable,RustcEncodable)] pub struct SegmentInfo { pub max_doc: DocId, +} + +#[cfg(test)] +mod tests { + + use core::SegmentComponent; + use std::path::Path; + use directory::Directory; + use schema::{SchemaBuilder, Document, FieldValue, TEXT, Term}; + use Index; + + #[test] + fn test_segment_protect_component() { + let mut schema_builder = SchemaBuilder::default(); + let text_field = schema_builder.add_text_field("text", TEXT); + let schema = schema_builder.build(); + let index = Index::create_in_ram(schema); + let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap(); + + { + // simply creating two segments + // with one delete to create the DELETE file. + { + let doc1 = doc!(text_field=>"a"); + index_writer.add_document(doc1); + let doc2 = doc!(text_field=>"b"); + index_writer.add_document(doc2); + assert!(index_writer.commit().is_ok()); + } + { + index_writer.delete_term(Term::from_field_text(text_field, "a")); + assert!(index_writer.commit().is_ok()); + } + } + + let segments = index.searchable_segments().unwrap(); + let directory = index.directory().clone(); + assert_eq!(segments.len(), 1); + + + let delete_file_path = Path::new("00000000000000000000000000000000.4.del"); + let idx_file_path = Path::new("00000000000000000000000000000000.term"); + assert!(directory.exists(&*delete_file_path)); + assert!(directory.exists(&*idx_file_path)); + + { + let _file_protection = segments[0].protect_from_delete(SegmentComponent::DELETE); + index_writer.delete_term(Term::from_field_text(text_field, "b")); + index_writer.commit().unwrap(); + // the delete file is protected, and should not be gc'ed. + assert!(directory.exists(&*delete_file_path)); + } + + index_writer.commit().unwrap(); + + // at this point the protection is released. + assert!(!directory.exists(&*delete_file_path)); + + } + + } \ No newline at end of file diff --git a/src/datastruct/stacker/heap.rs b/src/datastruct/stacker/heap.rs index cd8d16e89..1ae116f2b 100644 --- a/src/datastruct/stacker/heap.rs +++ b/src/datastruct/stacker/heap.rs @@ -1,6 +1,6 @@ use std::cell::UnsafeCell; use std::mem; -use common::create_vec_with_len; +use common::allocate_vec; use std::ptr; /// `BytesRef` refers to a slice in tantivy's custom `Heap`. @@ -105,18 +105,11 @@ struct InnerHeap { next_heap: Option>, } -/// initializing a long Vec is crazy slow in -/// debug mode. -/// We use this unsafe trick to make unit test -/// way faster. -fn allocate_fast(num_bytes: usize) -> Vec { - create_vec_with_len(num_bytes) -} impl InnerHeap { pub fn with_capacity(num_bytes: usize) -> InnerHeap { - let buffer: Vec = allocate_fast(num_bytes); + let buffer: Vec = allocate_vec(num_bytes); InnerHeap { buffer: buffer, buffer_len: num_bytes as u32, diff --git a/src/directory/managed_directory.rs b/src/directory/managed_directory.rs index ffa1697af..1f5f551ce 100644 --- a/src/directory/managed_directory.rs +++ b/src/directory/managed_directory.rs @@ -35,11 +35,26 @@ struct MetaInformation { protected_files: HashMap, } + +/// A `FileProtection` prevents the garbage collection of a file. +/// +/// See `ManagedDirectory.protect_file_from_delete`. pub struct FileProtection { directory: ManagedDirectory, path: PathBuf, } +fn unprotect_file_from_delete(directory: &ManagedDirectory, path: &Path) { + let mut meta_informations_wlock = directory.meta_informations + .write() + .expect("Managed file lock poisoned"); + if let Some(counter_ref_mut) = meta_informations_wlock + .protected_files + .get_mut(path) { + (*counter_ref_mut) -= 1; + } +} + impl fmt::Debug for FileProtection { fn fmt(&self, formatter: &mut fmt::Formatter) -> result::Result<(), fmt::Error> { write!(formatter, "FileProtectionFor({:?})", self.path) @@ -48,7 +63,7 @@ impl fmt::Debug for FileProtection { impl Drop for FileProtection { fn drop(&mut self) { - self.directory.unprotect_file_from_delete(&self.path); + unprotect_file_from_delete(&self.directory, &*self.path); } } @@ -152,6 +167,12 @@ impl ManagedDirectory { } + + /// Protects a file from being garbage collected. + /// + /// The method returns a `FileProtection` object. + /// The file will not be garbage collected as long as the + /// `FileProtection` object is kept alive. pub fn protect_file_from_delete(&self, path: &Path) -> FileProtection { let mut meta_informations_wlock = self.meta_informations .write() @@ -167,16 +188,6 @@ impl ManagedDirectory { } } - pub fn unprotect_file_from_delete(&self, path: &Path) { - let mut meta_informations_wlock = self.meta_informations - .write() - .expect("Managed file lock poisoned"); - if let Some(counter_ref_mut) = meta_informations_wlock - .protected_files - .get_mut(path) { - (*counter_ref_mut) -= 1; - } - } /// Saves the file containing the list of existing files /// that were created by tantivy. @@ -358,4 +369,28 @@ mod tests { } + + #[test] + fn test_managed_directory_protect() { + let tempdir = TempDir::new("index").unwrap(); + let tempdir_path = PathBuf::from(tempdir.path()); + let living_files = HashSet::new(); + + let mmap_directory = MmapDirectory::open(&tempdir_path).unwrap(); + let mut managed_directory = ManagedDirectory::new(mmap_directory).unwrap(); + managed_directory.atomic_write(*TEST_PATH1, &vec!(0u8,1u8)).unwrap(); + assert!(managed_directory.exists(*TEST_PATH1)); + + { + let _file_protection = managed_directory.protect_file_from_delete(*TEST_PATH1); + managed_directory.garbage_collect(living_files.clone()); + assert!(managed_directory.exists(*TEST_PATH1)); + } + + managed_directory.garbage_collect(living_files.clone()); + assert!(!managed_directory.exists(*TEST_PATH1)); + + + } + } diff --git a/src/indexer/index_writer.rs b/src/indexer/index_writer.rs index fbfe2f70f..48d76a576 100644 --- a/src/indexer/index_writer.rs +++ b/src/indexer/index_writer.rs @@ -197,9 +197,6 @@ pub fn compute_deleted_bitset( } -// TODO put delete bitset in segment entry -// rather than DocToOpstamp. - // TODO skip delete operation before teh // last delete opstamp diff --git a/src/indexer/merger.rs b/src/indexer/merger.rs index ae8679d47..026921111 100644 --- a/src/indexer/merger.rs +++ b/src/indexer/merger.rs @@ -17,9 +17,7 @@ use fastfield::FastFieldSerializer; use store::StoreWriter; use core::SegmentInfo; use std::cmp::{min, max}; -use std::iter; - - +use common::allocate_vec; pub struct IndexMerger { schema: Schema, @@ -35,9 +33,7 @@ struct DeltaPositionComputer { impl DeltaPositionComputer { fn new() -> DeltaPositionComputer { DeltaPositionComputer { - buffer: iter::repeat(0u32) - .take(512) - .collect::>() + buffer: allocate_vec(512) } } diff --git a/src/indexer/segment_updater.rs b/src/indexer/segment_updater.rs index 517400b27..8fb027042 100644 --- a/src/indexer/segment_updater.rs +++ b/src/indexer/segment_updater.rs @@ -171,7 +171,7 @@ impl SegmentUpdater { pub fn new(index: Index, stamper: Stamper, delete_cursor: DeleteCursor) -> Result { - let segments = index.segments()?; + let segments = index.searchable_segment_metas()?; let segment_manager = SegmentManager::from_segments(segments, delete_cursor); Ok( SegmentUpdater(Arc::new(InnerSegmentUpdater { diff --git a/src/schema/term.rs b/src/schema/term.rs index 656c9feeb..f5502e00e 100644 --- a/src/schema/term.rs +++ b/src/schema/term.rs @@ -1,7 +1,7 @@ use std::fmt; use common::BinarySerializable; -use common::create_vec_with_len; +use common::allocate_vec; use byteorder::{BigEndian, ByteOrder}; use super::Field; use std::str; @@ -45,7 +45,7 @@ impl Term { /// The first byte is `1`, and the 4 following bytes are that of the u32. pub fn from_field_u32(field: Field, val: u32) -> Term { const U32_TERM_LEN: usize = 1 + 4; - let mut buffer = create_vec_with_len(U32_TERM_LEN); + let mut buffer = allocate_vec(U32_TERM_LEN); buffer[0] = field.0; // we want BigEndian here to have lexicographic order // match the natural order of vals.