From fe2ddb8844138f9fc5cf8e0f384746f14a2a4d17 Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Mon, 9 Jan 2017 17:50:24 +0900 Subject: [PATCH 001/107] issue43 Added DeleteQueue. --- src/indexer/delete_queue.rs | 86 +++++++++++++++++++++++++++++++++++++ src/indexer/index_writer.rs | 21 ++++++++- src/indexer/mod.rs | 1 + 3 files changed, 106 insertions(+), 2 deletions(-) create mode 100644 src/indexer/delete_queue.rs diff --git a/src/indexer/delete_queue.rs b/src/indexer/delete_queue.rs new file mode 100644 index 000000000..1755819a1 --- /dev/null +++ b/src/indexer/delete_queue.rs @@ -0,0 +1,86 @@ +use schema::Term; +use std::sync::{Arc, RwLock}; +use std::collections::HashMap; +use std::sync::atomic::{AtomicUsize, Ordering}; + + + +pub struct SuscribeHandle { + client_id: usize, + clients: Arc>>, +} + +impl Drop for SuscribeHandle { + fn drop(&mut self) { + self.clients + .write() + .unwrap() + .remove(&self.client_id); + } +} + +struct ClientSuscriptionRegister { + clients: Arc>>, + client_id_autoinc: AtomicUsize, +} + +impl Default for ClientSuscriptionRegister { + fn default() -> ClientSuscriptionRegister { + ClientSuscriptionRegister { + clients: Arc::new(RwLock::new(HashMap::new())), + client_id_autoinc: AtomicUsize::new(0), + } + } +} + +impl ClientSuscriptionRegister { + fn acquire_client_id(&mut self) -> usize { + self.client_id_autoinc.fetch_add(1, Ordering::SeqCst) + } + + fn suscribe(&mut self, opstamp: u64) -> SuscribeHandle { + let client_id = self.acquire_client_id(); + self.clients + .write() + .unwrap() + .insert(client_id, opstamp); + SuscribeHandle { + client_id: client_id, + clients: self.clients.clone(), + } + } + +} + + +pub struct DeleteQueue { + operations: Vec, + client_subscription_register: ClientSuscriptionRegister, +} + +impl DeleteQueue { + pub fn push(&mut self, opstamp: u64, term: Term) { + self.operations.push(DeleteOperation { + opstamp: opstamp, + term: term + }); + } + + pub fn suscribe(&mut self, opstamp: u64) -> SuscribeHandle { 
+ self.client_subscription_register.suscribe(opstamp) + } +} + +impl Default for DeleteQueue { + fn default() -> DeleteQueue { + DeleteQueue { + operations: Vec::new(), + client_subscription_register: ClientSuscriptionRegister::default(), + } + } +} + +struct DeleteOperation { + opstamp: u64, + term: Term, +} diff --git a/src/indexer/index_writer.rs b/src/indexer/index_writer.rs index a7aaaf9b0..15af2f27d 100644 --- a/src/indexer/index_writer.rs +++ b/src/indexer/index_writer.rs @@ -4,6 +4,7 @@ use indexer::SegmentSerializer; use core::SerializableSegment; use core::Index; use core::Segment; +use schema::Term; use std::thread::JoinHandle; use indexer::{MergePolicy, DefaultMergePolicy}; use indexer::SegmentWriter; @@ -19,6 +20,7 @@ use std::mem::swap; use std::sync::{Arc, Mutex}; use chan; use core::SegmentMeta; +use super::delete_queue::DeleteQueue; use super::segment_updater::{SegmentUpdater, SegmentUpdate, SegmentUpdateSender}; use std::time::Duration; use super::super::core::index::get_segment_manager; @@ -68,6 +70,8 @@ pub struct IndexWriter { segment_update_sender: SegmentUpdateSender, segment_update_thread: JoinHandle<()>, + delete_queue: DeleteQueue, + worker_id: usize, num_threads: usize, @@ -230,6 +234,8 @@ impl IndexWriter { workers_join_handle: Vec::new(), num_threads: num_threads, + delete_queue: DeleteQueue::default(), + committed_docstamp: index.docstamp(), uncommitted_docstamp: index.docstamp(), worker_id: 0, @@ -436,8 +442,19 @@ impl IndexWriter { } Ok(self.committed_docstamp) + } + + + pub fn delete_term(&mut self, term: Term) { + let opstamp = self.stamp(); + self.delete_queue.push(opstamp, term); } + fn stamp(&mut self) -> u64 { + let opstamp = self.uncommitted_docstamp; + self.uncommitted_docstamp += 1u64; + opstamp + } /// Adds a document. /// @@ -450,9 +467,9 @@ impl IndexWriter { /// Currently it represents the number of documents that /// have been added since the creation of the index. 
pub fn add_document(&mut self, doc: Document) -> io::Result { + let opstamp = self.stamp(); self.document_sender.send(doc); - self.uncommitted_docstamp += 1; - Ok(self.uncommitted_docstamp) + Ok(opstamp) } } diff --git a/src/indexer/mod.rs b/src/indexer/mod.rs index c71c07fb4..2544fce39 100644 --- a/src/indexer/mod.rs +++ b/src/indexer/mod.rs @@ -7,6 +7,7 @@ mod log_merge_policy; mod segment_register; mod segment_writer; mod segment_manager; +mod delete_queue; pub mod segment_updater; mod directory_lock; From 395cbf39139b1d59b81a5a7aab01a3b7df8a15c6 Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Thu, 12 Jan 2017 10:09:34 +0900 Subject: [PATCH 002/107] issue/43 Change the delete queue datastruct for something cleaner/functional --- src/indexer/delete_queue.rs | 142 ++++++++++++++++++++---------------- 1 file changed, 81 insertions(+), 61 deletions(-) diff --git a/src/indexer/delete_queue.rs b/src/indexer/delete_queue.rs index 1755819a1..19139de9a 100644 --- a/src/indexer/delete_queue.rs +++ b/src/indexer/delete_queue.rs @@ -4,83 +4,103 @@ use std::collections::HashMap; use std::sync::atomic::{AtomicUsize, Ordering}; +const BLOCK_SIZE: usize = 128; -pub struct SuscribeHandle { - client_id: usize, - clients: Arc>>, +struct DeleteOperation { + pub opstamp: u64, + pub term: Term, } -impl Drop for SuscribeHandle { - fn drop(&mut self) { - self.clients - .write() - .unwrap() - .remove(&self.client_id); - } -} - -struct ClientSuscriptionRegister { - clients: Arc>>, - client_id_autoinc: AtomicUsize, -} - -impl Default for ClientSuscriptionRegister { - fn default() -> ClientSuscriptionRegister { - ClientSuscriptionRegister { - clients: Arc::new(RwLock::new(HashMap::new())), - client_id_autoinc: AtomicUsize::new(0), - } - } -} - -impl ClientSuscriptionRegister { - fn acquire_client_id(&mut self) -> usize { - self.client_id_autoinc.fetch_add(1, Ordering::SeqCst) - } - - fn suscribe(&mut self, opstamp: u64) -> SuscribeHandle { - let client_id = self.acquire_client_id(); - 
self.clients - .write() - .unwrap() - .insert(client_id, opstamp); - SuscribeHandle { - client_id: client_id, - clients: self.clients.clone(), - } - } - -} - - -pub struct DeleteQueue { +struct Block { operations: Vec, - client_subscription_register: ClientSuscriptionRegister, + next: Option, } -impl DeleteQueue { - pub fn push(&mut self, opstamp: u64, term: Term) { - self.operations.push(DeleteOperation { - opstamp: opstamp, - term: term - }); +impl Default for Block { + fn default() -> Block { + Block { + operations: Vec::with_capacity(BLOCK_SIZE), + next: None + } } +} - pub fn suscribe(&mut self, opstamp: u64) -> SuscribeHandle { - self.client_subscription_register.suscribe(opstamp) +#[derive(Clone)] +struct SharedBlock { + inner: Arc>, +} + +impl SharedBlock { + // Happens a new element to the block and return + // what the new head is. + fn enqueue(&self, delete_operation: DeleteOperation) -> Option { + let mut writable_block = self.inner.write().expect("Panicked while enqueueing in the delete queue."); + if writable_block.operations.len() >= BLOCK_SIZE { + let next_block = SharedBlock::default(); + next_block.enqueue(delete_operation); + writable_block.next = Some(next_block.clone()); + Some(next_block) + } + else { + writable_block.operations.push(delete_operation); + None + } + } + + fn cursor(&self,) -> DeleteQueueCursor { + let len = self.inner + .read() + .expect("Panicked while reading a block in the delete queue.") + .operations + .len(); + DeleteQueueCursor { + block: self.clone(), + pos: len, + } + } +} + +impl Default for SharedBlock { + fn default() -> SharedBlock { + SharedBlock { + inner: Arc::default() + } } } impl Default for DeleteQueue { fn default() -> DeleteQueue { DeleteQueue { - operations: Vec::new(), - client_subscription_register: ClientSuscriptionRegister::default(), + head: SharedBlock::default(), } } } -struct DeleteOperation { - opstamp: u64, - term: Term, +pub struct DeleteQueueCursor { + block: SharedBlock, + pos: usize, +} + + 
+// ---------------------------------------- + +pub struct DeleteQueue { + head: SharedBlock, +} + +impl DeleteQueue { + + pub fn cursor(&self) -> DeleteQueueCursor { + self.head.cursor() + } + + pub fn push(&mut self, opstamp: u64, term: Term) { + let delete_operation = DeleteOperation { + opstamp: opstamp, + term: term, + }; + if let Some(new_head) = self.head.enqueue(delete_operation) { + self.head = new_head; + } + } } From 5a06f45403c2d9568313143abbbdb3c1b8dc5679 Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Sun, 15 Jan 2017 11:09:58 +0900 Subject: [PATCH 003/107] issue/43 small progress --- src/indexer/delete_queue.rs | 148 +++++++++++++++++++++++++++++----- src/indexer/index_writer.rs | 20 +++-- src/indexer/mod.rs | 3 +- src/indexer/segment_writer.rs | 8 +- src/postings/mod.rs | 5 +- 5 files changed, 156 insertions(+), 28 deletions(-) diff --git a/src/indexer/delete_queue.rs b/src/indexer/delete_queue.rs index 19139de9a..75d9635fb 100644 --- a/src/indexer/delete_queue.rs +++ b/src/indexer/delete_queue.rs @@ -1,16 +1,18 @@ use schema::Term; use std::sync::{Arc, RwLock}; -use std::collections::HashMap; -use std::sync::atomic::{AtomicUsize, Ordering}; - const BLOCK_SIZE: usize = 128; -struct DeleteOperation { +/// Timestamped Delete operation. +#[derive(Clone, Eq, PartialEq, Debug)] +pub struct DeleteOperation { pub opstamp: u64, pub term: Term, } + +/// DeleteQueue are implemented as an unrolled linked list. +/// Block implements a block of this unrolled linked list. struct Block { operations: Vec, next: Option, @@ -25,16 +27,15 @@ impl Default for Block { } } +/// A shared block wraps a block #[derive(Clone)] -struct SharedBlock { - inner: Arc>, -} +struct SharedBlock(Arc>); impl SharedBlock { // Happens a new element to the block and return // what the new head is. 
fn enqueue(&self, delete_operation: DeleteOperation) -> Option { - let mut writable_block = self.inner.write().expect("Panicked while enqueueing in the delete queue."); + let mut writable_block = self.0.write().expect("Panicked while enqueueing in the delete queue."); if writable_block.operations.len() >= BLOCK_SIZE { let next_block = SharedBlock::default(); next_block.enqueue(delete_operation); @@ -46,9 +47,17 @@ impl SharedBlock { None } } + + fn next_block(&self) -> Option { + self.0 + .read() + .unwrap() + .next + .clone() + } fn cursor(&self,) -> DeleteQueueCursor { - let len = self.inner + let len = self.0 .read() .expect("Panicked while reading a block in the delete queue.") .operations @@ -62,45 +71,148 @@ impl SharedBlock { impl Default for SharedBlock { fn default() -> SharedBlock { - SharedBlock { - inner: Arc::default() - } + SharedBlock(Arc::default()) } } impl Default for DeleteQueue { fn default() -> DeleteQueue { DeleteQueue { - head: SharedBlock::default(), + writing_head: SharedBlock::default(), } } } +#[derive(Clone)] pub struct DeleteQueueCursor { block: SharedBlock, pos: usize, } +impl DeleteQueueCursor { + + pub fn peek(&mut self) -> Option { + if self.pos >= BLOCK_SIZE { + self.pos = 0; + match self.block.next_block() { + Some(next_block) => { + self.block = next_block; + self.pos = 0; + } + None => { + // there is no next block. + return None; + } + } + } + let readable_block = self.block.0 + .read() + .unwrap(); + if self.pos >= readable_block.operations.len() { + None + } + else { + Some(readable_block.operations[self.pos].clone()) + } + } + + /// Returns a delete operation if an operation is available, + /// None if the queue is empty. + /// + /// (We are voluntarily not using the `Iterator` trait + /// as a call to `consume` may return None once, and return + /// `Some(...)` ulteriorily. While this is officially + /// compatible with the `Iterator` specification, we judge + /// this confusing.) 
+ pub fn consume(&mut self) -> Option { + let delete_position = self.peek(); + if delete_position.is_some() { + self.pos += 1; + } + delete_position + } +} // ---------------------------------------- pub struct DeleteQueue { - head: SharedBlock, + writing_head: SharedBlock, } impl DeleteQueue { pub fn cursor(&self) -> DeleteQueueCursor { - self.head.cursor() + self.writing_head.cursor() } + pub fn push_op(&mut self, delete_operation: DeleteOperation) { + if let Some(new_head) = self.writing_head.enqueue(delete_operation) { + self.writing_head = new_head; + } + } pub fn push(&mut self, opstamp: u64, term: Term) { let delete_operation = DeleteOperation { opstamp: opstamp, term: term, }; - if let Some(new_head) = self.head.enqueue(delete_operation) { - self.head = new_head; - } + self.push_op(delete_operation); } } + + + +#[cfg(test)] +mod tests { + + use super::{DeleteQueue, DeleteOperation}; + use schema::{Term, Field}; + + #[test] + fn test_deletequeue() { + let mut delete_queue = DeleteQueue::default(); + + let make_op = |i: usize| { + let field = Field(1u8); + DeleteOperation { + opstamp: i as u64, + term: Term::from_field_u32(field, i as u32) + } + }; + + delete_queue.push_op(make_op(1)); + delete_queue.push_op(make_op(2)); + + let mut delete_cursor_3 = delete_queue.cursor(); + let mut delete_cursor_3_b = delete_cursor_3.clone(); + + assert!(delete_cursor_3.consume().is_none()); + assert!(delete_cursor_3.peek().is_none()); + + delete_queue.push_op(make_op(3)); + delete_queue.push_op(make_op(4)); + + assert_eq!(delete_cursor_3_b.peek(), Some(make_op(3))); + let mut delete_cursor_3_c = delete_cursor_3_b.clone(); + + assert_eq!(delete_cursor_3_b.consume(), Some(make_op(3))); + let mut delete_cursor_4 = delete_cursor_3_b.clone(); + + assert_eq!(delete_cursor_3_b.peek(), Some(make_op(4))); + assert_eq!(delete_cursor_3_b.consume(), Some(make_op(4))); + + assert_eq!(delete_cursor_3_c.consume(), Some(make_op(3))); + + assert!(delete_cursor_3_b.consume().is_none()); + 
assert_eq!(delete_cursor_3_c.consume(), Some(make_op(4))); + assert!(delete_cursor_3_c.consume().is_none()); + + assert_eq!(delete_cursor_3.peek(), Some(make_op(3))); + assert_eq!(delete_cursor_3.consume(), Some(make_op(3))); + assert!(delete_cursor_3_b.consume().is_none()); + + assert_eq!(delete_cursor_4.consume(), Some(make_op(4))); + assert!(delete_cursor_4.consume().is_none()); + + + } +} \ No newline at end of file diff --git a/src/indexer/index_writer.rs b/src/indexer/index_writer.rs index 15af2f27d..2e756e8f0 100644 --- a/src/indexer/index_writer.rs +++ b/src/indexer/index_writer.rs @@ -20,7 +20,7 @@ use std::mem::swap; use std::sync::{Arc, Mutex}; use chan; use core::SegmentMeta; -use super::delete_queue::DeleteQueue; +use super::delete_queue::{DeleteQueue, DeleteQueueCursor}; use super::segment_updater::{SegmentUpdater, SegmentUpdate, SegmentUpdateSender}; use std::time::Duration; use super::super::core::index::get_segment_manager; @@ -70,12 +70,12 @@ pub struct IndexWriter { segment_update_sender: SegmentUpdateSender, segment_update_thread: JoinHandle<()>, - delete_queue: DeleteQueue, - worker_id: usize, num_threads: usize, + delete_queue: DeleteQueue, + uncommitted_docstamp: u64, committed_docstamp: u64, } @@ -89,11 +89,12 @@ fn index_documents(heap: &mut Heap, segment: Segment, schema: &Schema, document_iterator: &mut Iterator, - segment_update_sender: &mut SegmentUpdateSender) + segment_update_sender: &mut SegmentUpdateSender, + delete_cursor: DeleteQueueCursor) -> Result<()> { heap.clear(); let segment_id = segment.id(); - let mut segment_writer = try!(SegmentWriter::for_segment(heap, segment, &schema)); + let mut segment_writer = try!(SegmentWriter::for_segment(heap, segment, &schema, delete_cursor)); for doc in document_iterator { try!(segment_writer.add_document(&doc, &schema)); if segment_writer.is_buffer_full() { @@ -152,11 +153,17 @@ impl IndexWriter { let document_receiver_clone = self.document_receiver.clone(); let mut segment_update_sender = 
self.segment_update_sender.clone(); let mut heap = Heap::with_capacity(self.heap_size_in_bytes_per_thread); + + // TODO fix this. the cursor might be too advanced + // at this point. + let delete_cursor = self.delete_queue.cursor(); + let join_handle: JoinHandle> = try!(thread::Builder::new() .name(format!("indexing_thread_{}", self.worker_id)) .spawn(move || { loop { let segment = index.new_segment(); + let mut document_iterator = document_receiver_clone.clone() .into_iter() .peekable(); @@ -168,7 +175,8 @@ impl IndexWriter { segment, &schema, &mut document_iterator, - &mut segment_update_sender)); + &mut segment_update_sender, + delete_cursor.clone())); } else { // No more documents. // Happens when there is a commit, or if the `IndexWriter` diff --git a/src/indexer/mod.rs b/src/indexer/mod.rs index 2544fce39..cb8c78963 100644 --- a/src/indexer/mod.rs +++ b/src/indexer/mod.rs @@ -7,7 +7,7 @@ mod log_merge_policy; mod segment_register; mod segment_writer; mod segment_manager; -mod delete_queue; +pub mod delete_queue; pub mod segment_updater; mod directory_lock; @@ -18,6 +18,5 @@ pub use self::log_merge_policy::LogMergePolicy; pub use self::merge_policy::{NoMergePolicy, MergeCandidate, MergePolicy}; pub use self::segment_manager::SegmentManager; - /// Alias for the default merge policy, which is the LogMergePolicy. 
pub type DefaultMergePolicy = LogMergePolicy; diff --git a/src/indexer/segment_writer.rs b/src/indexer/segment_writer.rs index 17ddf91c4..c0682312a 100644 --- a/src/indexer/segment_writer.rs +++ b/src/indexer/segment_writer.rs @@ -18,6 +18,7 @@ use postings::SpecializedPostingsWriter; use postings::{NothingRecorder, TermFrequencyRecorder, TFAndPositionRecorder}; use indexer::segment_serializer::SegmentSerializer; use datastruct::stacker::Heap; +use super::delete_queue::DeleteQueueCursor; use indexer::index_writer::MARGIN_IN_BYTES; /// A `SegmentWriter` is in charge of creating segment index from a @@ -32,6 +33,7 @@ pub struct SegmentWriter<'a> { segment_serializer: SegmentSerializer, fast_field_writers: U32FastFieldsWriter, fieldnorms_writer: U32FastFieldsWriter, + delete_queue_cursor: DeleteQueueCursor, } @@ -80,7 +82,10 @@ impl<'a> SegmentWriter<'a> { /// the flushing behavior as a buffer limit /// - segment: The segment being written /// - schema - pub fn for_segment(heap: &'a Heap, mut segment: Segment, schema: &Schema) -> Result> { + pub fn for_segment(heap: &'a Heap, + mut segment: Segment, + schema: &Schema, + delete_queue_cursor: DeleteQueueCursor) -> Result> { let segment_serializer = try!(SegmentSerializer::for_segment(&mut segment)); let mut per_field_postings_writers: Vec> = Vec::new(); for field_entry in schema.fields() { @@ -94,6 +99,7 @@ impl<'a> SegmentWriter<'a> { fieldnorms_writer: create_fieldnorms_writer(schema), segment_serializer: segment_serializer, fast_field_writers: U32FastFieldsWriter::from_schema(schema), + delete_queue_cursor: delete_queue_cursor, }) } diff --git a/src/postings/mod.rs b/src/postings/mod.rs index d22b0e9f5..8760b4e71 100644 --- a/src/postings/mod.rs +++ b/src/postings/mod.rs @@ -44,6 +44,7 @@ mod tests { use schema::{Document, TEXT, STRING, SchemaBuilder, Term}; use core::SegmentComponent; use indexer::SegmentWriter; + use indexer::delete_queue::DeleteQueue; use core::SegmentReader; use core::Index; use std::iter; @@ 
-81,9 +82,11 @@ mod tests { let schema = schema_builder.build(); let index = Index::create_in_ram(schema.clone()); let segment = index.new_segment(); + let delete_queue = DeleteQueue::default(); + let delete_cursor = delete_queue.cursor(); let heap = Heap::with_capacity(10_000_000); { - let mut segment_writer = SegmentWriter::for_segment(&heap, segment.clone(), &schema).unwrap(); + let mut segment_writer = SegmentWriter::for_segment(&heap, segment.clone(), &schema, delete_cursor).unwrap(); { let mut doc = Document::default(); doc.add_text(text_field, "a b a c a d a a."); From 183d5221b57f19ddbb9866889dba9f7bd79af445 Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Mon, 16 Jan 2017 09:13:28 +0900 Subject: [PATCH 004/107] issue/43 DeleteQueue. --- Cargo.toml | 3 +++ src/indexer/delete_queue.rs | 8 +------- src/indexer/index_writer.rs | 20 +++++++++++------- src/indexer/mod.rs | 1 + src/indexer/operation.rs | 17 ++++++++++++++++ src/indexer/segment_writer.rs | 38 +++++++++++++++++++++++++++++------ src/lib.rs | 2 +- src/postings/mod.rs | 23 ++++++++++++++++----- 8 files changed, 86 insertions(+), 26 deletions(-) create mode 100644 src/indexer/operation.rs diff --git a/Cargo.toml b/Cargo.toml index bf59afcc2..2c2ec995c 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -24,11 +24,14 @@ rustc-serialize = "0.3" log = "0.3.6" combine = "2.2" tempdir = "0.3" + + bincode = "0.5" libc = {version = "0.2.20", optional=true} num_cpus = "1.2" itertools = "0.5.9" lz4 = "1.20" +bit-set = "0.4.0" time = "0.1" uuid = { version = "0.4", features = ["v4", "rustc-serialize"] } chan = "0.1" diff --git a/src/indexer/delete_queue.rs b/src/indexer/delete_queue.rs index 75d9635fb..23b8e162c 100644 --- a/src/indexer/delete_queue.rs +++ b/src/indexer/delete_queue.rs @@ -1,15 +1,9 @@ use schema::Term; use std::sync::{Arc, RwLock}; +use super::operation::DeleteOperation; const BLOCK_SIZE: usize = 128; -/// Timestamped Delete operation. 
-#[derive(Clone, Eq, PartialEq, Debug)] -pub struct DeleteOperation { - pub opstamp: u64, - pub term: Term, -} - /// DeleteQueue are implemented as an unrolled linked list. /// Block implements a block of this unrolled linked list. diff --git a/src/indexer/index_writer.rs b/src/indexer/index_writer.rs index 2e756e8f0..bda2ac391 100644 --- a/src/indexer/index_writer.rs +++ b/src/indexer/index_writer.rs @@ -1,5 +1,6 @@ use schema::Schema; use schema::Document; +use super::operation::{DeleteOperation, AddOperation}; use indexer::SegmentSerializer; use core::SerializableSegment; use core::Index; @@ -40,8 +41,8 @@ const PIPELINE_MAX_SIZE_IN_DOCS: usize = 10_000; -type DocumentSender = chan::Sender; -type DocumentReceiver = chan::Receiver; +type DocumentSender = chan::Sender; +type DocumentReceiver = chan::Receiver; @@ -88,9 +89,9 @@ impl !Sync for IndexWriter {} fn index_documents(heap: &mut Heap, segment: Segment, schema: &Schema, - document_iterator: &mut Iterator, + document_iterator: &mut Iterator, segment_update_sender: &mut SegmentUpdateSender, - delete_cursor: DeleteQueueCursor) + delete_cursor: &mut DeleteQueueCursor) -> Result<()> { heap.clear(); let segment_id = segment.id(); @@ -161,6 +162,7 @@ impl IndexWriter { let join_handle: JoinHandle> = try!(thread::Builder::new() .name(format!("indexing_thread_{}", self.worker_id)) .spawn(move || { + let mut delete_cursor_clone = delete_cursor.clone(); loop { let segment = index.new_segment(); @@ -176,7 +178,7 @@ impl IndexWriter { &schema, &mut document_iterator, &mut segment_update_sender, - delete_cursor.clone())); + &mut delete_cursor_clone)); } else { // No more documents. // Happens when there is a commit, or if the `IndexWriter` @@ -474,9 +476,13 @@ impl IndexWriter { /// /// Currently it represents the number of documents that /// have been added since the creation of the index. 
- pub fn add_document(&mut self, doc: Document) -> io::Result { + pub fn add_document(&mut self, document: Document) -> io::Result { let opstamp = self.stamp(); - self.document_sender.send(doc); + let add_operation = AddOperation { + opstamp: opstamp, + document: document, + }; + self.document_sender.send(add_operation); Ok(opstamp) } } diff --git a/src/indexer/mod.rs b/src/indexer/mod.rs index cb8c78963..9bcfae1c5 100644 --- a/src/indexer/mod.rs +++ b/src/indexer/mod.rs @@ -10,6 +10,7 @@ mod segment_manager; pub mod delete_queue; pub mod segment_updater; mod directory_lock; +pub mod operation; pub use self::segment_serializer::SegmentSerializer; pub use self::segment_writer::SegmentWriter; diff --git a/src/indexer/operation.rs b/src/indexer/operation.rs new file mode 100644 index 000000000..ecdc2d827 --- /dev/null +++ b/src/indexer/operation.rs @@ -0,0 +1,17 @@ +use schema::Document; +use schema::Term; + + +/// Timestamped Delete operation. +#[derive(Clone, Eq, PartialEq, Debug)] +pub struct DeleteOperation { + pub opstamp: u64, + pub term: Term, +} + +/// Timestamped Add operation. +#[derive(Eq, PartialEq, Debug)] +pub struct AddOperation { + pub opstamp: u64, + pub document: Document, +} diff --git a/src/indexer/segment_writer.rs b/src/indexer/segment_writer.rs index c0682312a..c4ba34602 100644 --- a/src/indexer/segment_writer.rs +++ b/src/indexer/segment_writer.rs @@ -20,6 +20,8 @@ use indexer::segment_serializer::SegmentSerializer; use datastruct::stacker::Heap; use super::delete_queue::DeleteQueueCursor; use indexer::index_writer::MARGIN_IN_BYTES; +use super::operation::{AddOperation, DeleteOperation}; +use bit_set::BitSet; /// A `SegmentWriter` is in charge of creating segment index from a /// documents. 
@@ -33,7 +35,8 @@ pub struct SegmentWriter<'a> { segment_serializer: SegmentSerializer, fast_field_writers: U32FastFieldsWriter, fieldnorms_writer: U32FastFieldsWriter, - delete_queue_cursor: DeleteQueueCursor, + delete_queue_cursor: &'a mut DeleteQueueCursor, + docstamps: Vec, } @@ -85,7 +88,7 @@ impl<'a> SegmentWriter<'a> { pub fn for_segment(heap: &'a Heap, mut segment: Segment, schema: &Schema, - delete_queue_cursor: DeleteQueueCursor) -> Result> { + delete_queue_cursor: &'a mut DeleteQueueCursor) -> Result> { let segment_serializer = try!(SegmentSerializer::for_segment(&mut segment)); let mut per_field_postings_writers: Vec> = Vec::new(); for field_entry in schema.fields() { @@ -100,6 +103,7 @@ impl<'a> SegmentWriter<'a> { segment_serializer: segment_serializer, fast_field_writers: U32FastFieldsWriter::from_schema(schema), delete_queue_cursor: delete_queue_cursor, + docstamps: Vec::with_capacity(1_000), }) } @@ -112,7 +116,8 @@ impl<'a> SegmentWriter<'a> { for per_field_postings_writer in &mut self.per_field_postings_writers { per_field_postings_writer.close(self.heap); } - try!(write(&self.per_field_postings_writers, + try!(write( + &self.per_field_postings_writers, &self.fast_field_writers, &self.fieldnorms_writer, segment_info, @@ -132,11 +137,32 @@ impl<'a> SegmentWriter<'a> { self.heap.num_free_bytes() <= MARGIN_IN_BYTES } + fn compute_delete_mask(&mut self) -> BitSet { + let delete_docs = BitSet::with_capacity(self.max_doc as usize); + loop { + if let Some(delete_operation) = self.delete_queue_cursor.peek() { + let delete_term = delete_operation.term; + let Field(field_id) = delete_term.field(); + let postings_writer = &self.per_field_postings_writers[field_id as usize]; + // TODO add the associated posting list with the correct cut-off. + self.delete_queue_cursor.consume(); + } + else { + break; + } + + } + delete_docs + } + + /// Indexes a new document /// /// As a user, you should rather use `IndexWriter`'s add_document. 
- pub fn add_document(&mut self, doc: &Document, schema: &Schema) -> io::Result<()> { + pub fn add_document(&mut self, add_operation: &AddOperation, schema: &Schema) -> io::Result<()> { let doc_id = self.max_doc; + let doc = &add_operation.document; + self.docstamps.push(add_operation.opstamp); for (field, field_values) in doc.get_sorted_field_values() { let field_posting_writer: &mut Box = &mut self.per_field_postings_writers[field.0 as usize]; let field_options = schema.get_field_entry(field); @@ -171,7 +197,7 @@ impl<'a> SegmentWriter<'a> { } } self.fieldnorms_writer.fill_val_up_to(doc_id); - self.fast_field_writers.add_document(doc); + self.fast_field_writers.add_document(&doc); let stored_fieldvalues: Vec<&FieldValue> = doc .field_values() .iter() @@ -221,7 +247,7 @@ fn write<'a>(per_field_postings_writers: &[Box], segment_info: SegmentInfo, mut serializer: SegmentSerializer, heap: &'a Heap,) -> Result { - for per_field_postings_writer in per_field_postings_writers.iter() { + for per_field_postings_writer in per_field_postings_writers { try!(per_field_postings_writer.serialize(serializer.get_postings_serializer(), heap)); } try!(fast_field_writers.serialize(serializer.get_fast_field_serializer())); diff --git a/src/lib.rs b/src/lib.rs index e82447472..3315e776d 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -47,7 +47,7 @@ extern crate combine; extern crate itertools; extern crate chan; extern crate crossbeam; - +extern crate bit_set; #[cfg(feature="simdcompression")] extern crate libc; diff --git a/src/postings/mod.rs b/src/postings/mod.rs index 8760b4e71..f146c132b 100644 --- a/src/postings/mod.rs +++ b/src/postings/mod.rs @@ -52,6 +52,7 @@ mod tests { use query::TermQuery; use schema::Field; use test::Bencher; + use indexer::operation::AddOperation; use rand::{XorShiftRng, Rng, SeedableRng}; @@ -83,27 +84,39 @@ mod tests { let index = Index::create_in_ram(schema.clone()); let segment = index.new_segment(); let delete_queue = DeleteQueue::default(); - let 
delete_cursor = delete_queue.cursor(); + let mut delete_cursor = delete_queue.cursor(); let heap = Heap::with_capacity(10_000_000); { - let mut segment_writer = SegmentWriter::for_segment(&heap, segment.clone(), &schema, delete_cursor).unwrap(); + let mut segment_writer = SegmentWriter::for_segment(&heap, segment.clone(), &schema, &mut delete_cursor).unwrap(); { let mut doc = Document::default(); doc.add_text(text_field, "a b a c a d a a."); doc.add_text(text_field, "d d d d a"); // checking that position works if the field has two values. - segment_writer.add_document(&doc, &schema).unwrap(); + let op = AddOperation { + opstamp: 0u64, + document: doc, + }; + segment_writer.add_document(&op, &schema).unwrap(); } { let mut doc = Document::default(); doc.add_text(text_field, "b a"); - segment_writer.add_document(&doc, &schema).unwrap(); + let op = AddOperation { + opstamp: 1u64, + document: doc, + }; + segment_writer.add_document(&op, &schema).unwrap(); } for i in 2..1000 { let mut doc = Document::default(); let mut text = iter::repeat("e ").take(i).collect::(); text.push_str(" a"); doc.add_text(text_field, &text); - segment_writer.add_document(&doc, &schema).unwrap(); + let op = AddOperation { + opstamp: 2u64, + document: doc, + }; + segment_writer.add_document(&op, &schema).unwrap(); } segment_writer.finalize().unwrap(); } From d5c161e196bf8bec750d5d9f43b95d47cb96d60d Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Mon, 16 Jan 2017 23:14:28 +0900 Subject: [PATCH 005/107] issue/43 Computing deleted doc bitset --- src/datastruct/stacker/hashmap.rs | 4 ++ src/datastruct/stacker/heap.rs | 5 ++- src/indexer/delete_queue.rs | 18 +++++++++ src/indexer/document_receiver.rs | 5 +++ src/indexer/mod.rs | 1 + src/indexer/segment_writer.rs | 67 +++++++++++++++++++++++-------- src/postings/postings_writer.rs | 18 ++++++++- src/postings/recorder.rs | 42 +++++++++++++++++++ 8 files changed, 140 insertions(+), 20 deletions(-) create mode 100644 src/indexer/document_receiver.rs 
diff --git a/src/datastruct/stacker/hashmap.rs b/src/datastruct/stacker/hashmap.rs index 55a6dc12c..c70c879fc 100644 --- a/src/datastruct/stacker/hashmap.rs +++ b/src/datastruct/stacker/hashmap.rs @@ -125,6 +125,10 @@ impl<'a, V> HashMap<'a, V> where V: HeapAllocable { .map(move |addr: u32| heap.get_mut_ref::(addr)) } + pub fn heap(&self) -> &Heap { + &self.heap + } + pub fn get_or_create>(&mut self, key: S) -> &mut V { let entry = self.lookup(key.as_ref()); match entry { diff --git a/src/datastruct/stacker/heap.rs b/src/datastruct/stacker/heap.rs index 9a43de897..c3b8d0a27 100644 --- a/src/datastruct/stacker/heap.rs +++ b/src/datastruct/stacker/heap.rs @@ -41,7 +41,6 @@ impl Heap { self.inner().clear(); } - /// Return the heap capacity. pub fn capacity(&self,) -> u32 { self.inner().capacity() @@ -91,6 +90,10 @@ impl Heap { pub fn get_mut_ref(&self, addr: u32) -> &mut Item { self.inner().get_mut_ref(addr) } + + pub fn get_ref(&self, addr: u32) -> &Item { + self.inner().get_mut_ref(addr) + } } diff --git a/src/indexer/delete_queue.rs b/src/indexer/delete_queue.rs index 23b8e162c..6ee4980f3 100644 --- a/src/indexer/delete_queue.rs +++ b/src/indexer/delete_queue.rs @@ -85,6 +85,24 @@ pub struct DeleteQueueCursor { impl DeleteQueueCursor { + /// Skips to the first delete operation which has + /// a timestamp that is greater or equal to opstamp. + /// + /// Returns false in the DeleteQueue reaches its end before + /// meeting such an element. 
+ pub fn skip_to(&mut self, opstamp: u64) -> bool { + // TODO optimize + while let Some(delete_operation) = self.peek() { + if delete_operation.opstamp >= opstamp { + return true; + } + else { + self.consume(); + } + } + return false; + } + pub fn peek(&mut self) -> Option { if self.pos >= BLOCK_SIZE { self.pos = 0; diff --git a/src/indexer/document_receiver.rs b/src/indexer/document_receiver.rs new file mode 100644 index 000000000..73bb7b4ec --- /dev/null +++ b/src/indexer/document_receiver.rs @@ -0,0 +1,5 @@ +use DocId; + +pub trait DocumentReceiver { + fn receive(&mut self, doc: DocId); +} \ No newline at end of file diff --git a/src/indexer/mod.rs b/src/indexer/mod.rs index 9bcfae1c5..1f970f72f 100644 --- a/src/indexer/mod.rs +++ b/src/indexer/mod.rs @@ -10,6 +10,7 @@ mod segment_manager; pub mod delete_queue; pub mod segment_updater; mod directory_lock; +pub mod document_receiver; pub mod operation; pub use self::segment_serializer::SegmentSerializer; diff --git a/src/indexer/segment_writer.rs b/src/indexer/segment_writer.rs index c4ba34602..bb8e19fb3 100644 --- a/src/indexer/segment_writer.rs +++ b/src/indexer/segment_writer.rs @@ -22,6 +22,21 @@ use super::delete_queue::DeleteQueueCursor; use indexer::index_writer::MARGIN_IN_BYTES; use super::operation::{AddOperation, DeleteOperation}; use bit_set::BitSet; +use indexer::document_receiver::DocumentReceiver; + + +struct DocumentDeleter<'a> { + limit_doc_id: DocId, + deleted_docs: &'a mut BitSet, +} + +impl<'a> DocumentReceiver for DocumentDeleter<'a> { + fn receive(&mut self, doc: DocId) { + if doc < self.limit_doc_id { + self.deleted_docs.insert(doc as usize); + } + } +} /// A `SegmentWriter` is in charge of creating segment index from a /// documents. 
@@ -36,7 +51,7 @@ pub struct SegmentWriter<'a> { fast_field_writers: U32FastFieldsWriter, fieldnorms_writer: U32FastFieldsWriter, delete_queue_cursor: &'a mut DeleteQueueCursor, - docstamps: Vec, + doc_opstamps: Vec, } @@ -103,7 +118,7 @@ impl<'a> SegmentWriter<'a> { segment_serializer: segment_serializer, fast_field_writers: U32FastFieldsWriter::from_schema(schema), delete_queue_cursor: delete_queue_cursor, - docstamps: Vec::with_capacity(1_000), + doc_opstamps: Vec::with_capacity(1_000), }) } @@ -136,23 +151,41 @@ impl<'a> SegmentWriter<'a> { pub fn is_buffer_full(&self,) -> bool { self.heap.num_free_bytes() <= MARGIN_IN_BYTES } - + + fn compute_doc_limit(&self, opstamp: u64) -> DocId { + let doc_id = match self.doc_opstamps.binary_search(&opstamp) { + Ok(doc_id) => doc_id, + Err(doc_id) => doc_id, + }; + doc_id as DocId + } + fn compute_delete_mask(&mut self) -> BitSet { - let delete_docs = BitSet::with_capacity(self.max_doc as usize); - loop { - if let Some(delete_operation) = self.delete_queue_cursor.peek() { - let delete_term = delete_operation.term; - let Field(field_id) = delete_term.field(); - let postings_writer = &self.per_field_postings_writers[field_id as usize]; - // TODO add the associated posting list with the correct cut-off. - self.delete_queue_cursor.consume(); + if let Some(min_opstamp) = self.doc_opstamps.first() { + if !self.delete_queue_cursor.skip_to(*min_opstamp) { + return BitSet::new(); } - else { - break; - } - } - delete_docs + else { + return BitSet::new(); + } + let mut deleted_docs = BitSet::with_capacity(self.max_doc as usize); + while let Some(delete_operation) = self.delete_queue_cursor.consume() { + // We can skip computing delete operations that + // are older than our oldest document. + // + // They don't belong to this document anyway. 
+ let delete_term = delete_operation.term; + let Field(field_id) = delete_term.field(); + let postings_writer: &Box = &self.per_field_postings_writers[field_id as usize]; + let limit_doc_id = self.compute_doc_limit(delete_operation.opstamp); + let mut document_deleter = DocumentDeleter { + limit_doc_id: limit_doc_id, + deleted_docs: &mut deleted_docs + }; + postings_writer.push_documents(delete_term.value(), &mut document_deleter); + } + deleted_docs } @@ -162,7 +195,7 @@ impl<'a> SegmentWriter<'a> { pub fn add_document(&mut self, add_operation: &AddOperation, schema: &Schema) -> io::Result<()> { let doc_id = self.max_doc; let doc = &add_operation.document; - self.docstamps.push(add_operation.opstamp); + self.doc_opstamps.push(add_operation.opstamp); for (field, field_values) in doc.get_sorted_field_values() { let field_posting_writer: &mut Box = &mut self.per_field_postings_writers[field.0 as usize]; let field_options = schema.get_field_entry(field); diff --git a/src/postings/postings_writer.rs b/src/postings/postings_writer.rs index c3d1f997f..c69218b39 100644 --- a/src/postings/postings_writer.rs +++ b/src/postings/postings_writer.rs @@ -7,7 +7,8 @@ use postings::Recorder; use analyzer::SimpleTokenizer; use schema::Field; use analyzer::StreamingIterator; -use datastruct::stacker::{HashMap, Heap}; +use indexer::document_receiver::DocumentReceiver; +use datastruct::stacker::{HashMap, Entry, Heap}; /// The `PostingsWriter` is in charge of receiving documenting /// and building a `Segment` in anonymous memory. @@ -22,11 +23,15 @@ pub trait PostingsWriter { /// * heap - heap used to store the postings informations as well as the terms /// in the hashmap. fn suscribe(&mut self, doc: DocId, pos: u32, term: &Term, heap: &Heap); - + /// Serializes the postings on disk. /// The actual serialization format is handled by the `PostingsSerializer`. 
fn serialize(&self, serializer: &mut PostingsSerializer, heap: &Heap) -> io::Result<()>; + /// Push all documents associated with a given term to a + /// given DocumentLister. + fn push_documents(&self, term_val: &[u8], document_listener: &mut DocumentReceiver); + /// Closes all of the currently open `Recorder`'s. fn close(&mut self, heap: &Heap); @@ -99,6 +104,15 @@ impl<'a, Rec: Recorder + 'static> PostingsWriter for SpecializedPostingsWriter<' } } + + fn push_documents(&self, term_val: &[u8], document_receiver: &mut DocumentReceiver) { + if let Entry::Occupied(addr) = self.term_index.lookup(term_val) { + let heap = self.term_index.heap(); + let recorder: &Rec = heap.get_ref(addr); + recorder.push_documents(addr, document_receiver, heap); + } + } + #[inline] fn suscribe(&mut self, doc: DocId, position: u32, term: &Term, heap: &Heap) { let mut recorder = self.term_index.get_or_create(term); diff --git a/src/postings/recorder.rs b/src/postings/recorder.rs index 94173720b..970b6f071 100644 --- a/src/postings/recorder.rs +++ b/src/postings/recorder.rs @@ -2,6 +2,7 @@ use DocId; use std::io; use postings::PostingsSerializer; use datastruct::stacker::{ExpUnrolledLinkedList, Heap, HeapAllocable}; +use indexer::document_receiver::DocumentReceiver; const EMPTY_ARRAY: [u32; 0] = [0u32; 0]; const POSITION_END: u32 = 4294967295; @@ -28,6 +29,11 @@ pub trait Recorder: HeapAllocable { fn close_doc(&mut self, heap: &Heap); /// Returns the number of document that have been seen so far fn doc_freq(&self) -> u32; + /// Push all documents to a given DocumentLister. + fn push_documents(&self, + self_addr: u32, + document_receiver: &mut DocumentReceiver, + heap: &Heap); /// Pushes the postings information to the serializer. 
fn serialize(&self, self_addr: u32, @@ -73,6 +79,15 @@ impl Recorder for NothingRecorder { self.doc_freq } + fn push_documents(&self, + self_addr: u32, + document_receiver: &mut DocumentReceiver, + heap: &Heap) { + for doc in self.stack.iter(self_addr, heap) { + document_receiver.receive(doc); + } + } + fn serialize(&self, self_addr: u32, serializer: &mut PostingsSerializer, @@ -130,6 +145,17 @@ impl Recorder for TermFrequencyRecorder { self.doc_freq } + fn push_documents(&self, + self_addr: u32, + document_receiver: &mut DocumentReceiver, + heap: &Heap) { + let mut doc_iter = self.stack.iter(self_addr, heap); + while let Some(doc) = doc_iter.next() { + doc_iter.next().expect("Panicked while trying to read a frequency"); + document_receiver.receive(doc); + } + } + fn serialize(&self, self_addr: u32, serializer: &mut PostingsSerializer, @@ -190,6 +216,22 @@ impl Recorder for TFAndPositionRecorder { self.doc_freq } + fn push_documents(&self, + self_addr: u32, + document_receiver: &mut DocumentReceiver, + heap: &Heap) { + let mut positions_iter = self.stack.iter(self_addr, heap); + while let Some(doc) = positions_iter.next() { + document_receiver.receive(doc); + loop { + let position = positions_iter.next().expect("This should never happen. 
Pleasee report the bug."); + if position == POSITION_END { + break; + } + } + } + } + fn serialize(&self, self_addr: u32, serializer: &mut PostingsSerializer, From 01cf303decc5a607076abea0c312c3b5505a7710 Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Tue, 17 Jan 2017 10:40:08 +0900 Subject: [PATCH 006/107] issue/43 segment writer --- src/indexer/index_writer.rs | 22 +++++++++++++--- src/indexer/segment_writer.rs | 48 +++++++++++++++++++++++++++-------- src/postings/mod.rs | 5 +--- 3 files changed, 57 insertions(+), 18 deletions(-) diff --git a/src/indexer/index_writer.rs b/src/indexer/index_writer.rs index bda2ac391..bc59efcb9 100644 --- a/src/indexer/index_writer.rs +++ b/src/indexer/index_writer.rs @@ -1,6 +1,6 @@ use schema::Schema; use schema::Document; -use super::operation::{DeleteOperation, AddOperation}; +use super::operation::AddOperation; use indexer::SegmentSerializer; use core::SerializableSegment; use core::Index; @@ -95,7 +95,7 @@ fn index_documents(heap: &mut Heap, -> Result<()> { heap.clear(); let segment_id = segment.id(); - let mut segment_writer = try!(SegmentWriter::for_segment(heap, segment, &schema, delete_cursor)); + let mut segment_writer = try!(SegmentWriter::for_segment(heap, segment, &schema)); for doc in document_iterator { try!(segment_writer.add_document(&doc, &schema)); if segment_writer.is_buffer_full() { @@ -105,11 +105,23 @@ fn index_documents(heap: &mut Heap, } } let num_docs = segment_writer.max_doc(); + assert!(num_docs > 0); + let first_opstamp: u64 = segment_writer.first_opstamp(); + let last_opstamp: u64 = segment_writer.last_opstamp(); + + delete_cursor.skip_to(first_opstamp); + + let delete_cursor_clone = delete_cursor.clone(); + + let doc_mapping = segment_writer.compute_doc_mapping_after_delete(delete_cursor_clone); + let segment_meta = SegmentMeta { segment_id: segment_id, num_docs: num_docs, }; + delete_cursor.skip_to(last_opstamp); + try!(segment_writer.finalize()); 
segment_update_sender.send(SegmentUpdate::AddSegment(segment_meta)); Ok(()) @@ -172,7 +184,11 @@ impl IndexWriter { // the peeking here is to avoid // creating a new segment's files // if no document are available. - if document_iterator.peek().is_some() { + // + // this is a valid guarantee as the + // peeked document now belongs to + // our local iterator. + if document_iterator.peek().is_some() { try!(index_documents(&mut heap, segment, &schema, diff --git a/src/indexer/segment_writer.rs b/src/indexer/segment_writer.rs index bb8e19fb3..8a7930a86 100644 --- a/src/indexer/segment_writer.rs +++ b/src/indexer/segment_writer.rs @@ -1,8 +1,7 @@ use Result; use DocId; use std::io; -use schema::Schema; -use schema::Document; +use schema::Schema; use schema::Term; use core::SegmentInfo; use core::Segment; @@ -20,7 +19,7 @@ use indexer::segment_serializer::SegmentSerializer; use datastruct::stacker::Heap; use super::delete_queue::DeleteQueueCursor; use indexer::index_writer::MARGIN_IN_BYTES; -use super::operation::{AddOperation, DeleteOperation}; +use super::operation::AddOperation; use bit_set::BitSet; use indexer::document_receiver::DocumentReceiver; @@ -50,7 +49,6 @@ pub struct SegmentWriter<'a> { segment_serializer: SegmentSerializer, fast_field_writers: U32FastFieldsWriter, fieldnorms_writer: U32FastFieldsWriter, - delete_queue_cursor: &'a mut DeleteQueueCursor, doc_opstamps: Vec, } @@ -102,8 +100,7 @@ impl<'a> SegmentWriter<'a> { /// - schema pub fn for_segment(heap: &'a Heap, mut segment: Segment, - schema: &Schema, - delete_queue_cursor: &'a mut DeleteQueueCursor) -> Result> { + schema: &Schema) -> Result> { let segment_serializer = try!(SegmentSerializer::for_segment(&mut segment)); let mut per_field_postings_writers: Vec> = Vec::new(); for field_entry in schema.fields() { @@ -117,7 +114,6 @@ impl<'a> SegmentWriter<'a> { fieldnorms_writer: create_fieldnorms_writer(schema), segment_serializer: segment_serializer, fast_field_writers: 
U32FastFieldsWriter::from_schema(schema), - delete_queue_cursor: delete_queue_cursor, doc_opstamps: Vec::with_capacity(1_000), }) } @@ -126,7 +122,7 @@ impl<'a> SegmentWriter<'a> { /// /// Finalize consumes the `SegmentWriter`, so that it cannot /// be used afterwards. - pub fn finalize(mut self,) -> Result<()> { + pub fn finalize(mut self) -> Result<()> { let segment_info = self.segment_info(); for per_field_postings_writer in &mut self.per_field_postings_writers { per_field_postings_writer.close(self.heap); @@ -160,9 +156,39 @@ impl<'a> SegmentWriter<'a> { doc_id as DocId } - fn compute_delete_mask(&mut self) -> BitSet { + pub fn compute_doc_mapping_after_delete(&self, mut delete_queue_cursor: DeleteQueueCursor) -> Vec> { + let delete_docs = self.compute_delete_mask(&mut delete_queue_cursor); + let max_doc: usize = self.max_doc as usize; + let mut doc_autoinc = 0u32; + (0..max_doc) + .map(|doc| { + if delete_docs.contains(doc) { + None + } + else { + let new_doc = doc_autoinc; + doc_autoinc += 1; + Some(new_doc) + } + }) + .collect::>() + } + + pub fn first_opstamp(&self) -> u64 { + *(self.doc_opstamps + .first() + .expect("Last doc opstamp called on an empty segment writer")) + } + + pub fn last_opstamp(&self) -> u64 { + *(self.doc_opstamps + .last() + .expect("Last doc opstamp called on an empty segment writer")) + } + + fn compute_delete_mask(&self, delete_queue_cursor: &mut DeleteQueueCursor) -> BitSet { if let Some(min_opstamp) = self.doc_opstamps.first() { - if !self.delete_queue_cursor.skip_to(*min_opstamp) { + if !delete_queue_cursor.skip_to(*min_opstamp) { return BitSet::new(); } } @@ -170,7 +196,7 @@ impl<'a> SegmentWriter<'a> { return BitSet::new(); } let mut deleted_docs = BitSet::with_capacity(self.max_doc as usize); - while let Some(delete_operation) = self.delete_queue_cursor.consume() { + while let Some(delete_operation) = delete_queue_cursor.consume() { // We can skip computing delete operations that // are older than our oldest document. 
// diff --git a/src/postings/mod.rs b/src/postings/mod.rs index f146c132b..ca70512b8 100644 --- a/src/postings/mod.rs +++ b/src/postings/mod.rs @@ -44,7 +44,6 @@ mod tests { use schema::{Document, TEXT, STRING, SchemaBuilder, Term}; use core::SegmentComponent; use indexer::SegmentWriter; - use indexer::delete_queue::DeleteQueue; use core::SegmentReader; use core::Index; use std::iter; @@ -83,11 +82,9 @@ mod tests { let schema = schema_builder.build(); let index = Index::create_in_ram(schema.clone()); let segment = index.new_segment(); - let delete_queue = DeleteQueue::default(); - let mut delete_cursor = delete_queue.cursor(); let heap = Heap::with_capacity(10_000_000); { - let mut segment_writer = SegmentWriter::for_segment(&heap, segment.clone(), &schema, &mut delete_cursor).unwrap(); + let mut segment_writer = SegmentWriter::for_segment(&heap, segment.clone(), &schema).unwrap(); { let mut doc = Document::default(); doc.add_text(text_field, "a b a c a d a a."); From fba44b78b627241c70cacc40ab7fcd5fea5891a4 Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Thu, 19 Jan 2017 23:28:57 +0900 Subject: [PATCH 007/107] issue/43 Added delete doc file --- src/core/index_meta.rs | 2 + src/core/segment.rs | 17 ++++-- src/core/segment_component.rs | 33 +++++------- src/core/segment_id.rs | 2 +- src/directory/directory.rs | 6 ++- src/directory/mmap_directory.rs | 24 +++++++-- src/directory/ram_directory.rs | 19 +++++++ src/fastfield/delete.rs | 91 +++++++++++++++++++++++++++++++++ src/fastfield/mod.rs | 1 + src/indexer/index_writer.rs | 36 ++++++++----- src/indexer/segment_updater.rs | 1 + src/indexer/segment_writer.rs | 55 +++++++++++--------- 12 files changed, 218 insertions(+), 69 deletions(-) create mode 100644 src/fastfield/delete.rs diff --git a/src/core/index_meta.rs b/src/core/index_meta.rs index a2623f9d0..d82f865f2 100644 --- a/src/core/index_meta.rs +++ b/src/core/index_meta.rs @@ -34,6 +34,7 @@ impl IndexMeta { pub struct SegmentMeta { pub segment_id: SegmentId, 
pub num_docs: u32, + pub num_deleted_docs: u32, } #[cfg(test)] @@ -42,6 +43,7 @@ impl SegmentMeta { SegmentMeta { segment_id: segment_id, num_docs: num_docs, + num_deleted_docs: 0, } } } \ No newline at end of file diff --git a/src/core/segment.rs b/src/core/segment.rs index 3e8bc9a42..354ddb34a 100644 --- a/src/core/segment.rs +++ b/src/core/segment.rs @@ -65,11 +65,18 @@ impl Segment { /// # Disclaimer /// If deletion of a file fails (e.g. a file /// was read-only.), the method does not - /// fail and just logs an error - pub fn delete(&self,) { - for component in SegmentComponent::values() { - let rel_path = self.relative_path(component); - if let Err(err) = self.index.directory().delete(&rel_path) { + /// fail and just logs an error when it fails. + pub fn delete(&self) { + info!("Deleting segment {:?}", self.segment_id); + let segment_filepaths_res = self.index.directory().ls_starting_with( + &*self.segment_id.uuid_string() + ); + if segment_filepaths_res.is_err() { + error!("Failed to list files of segment {:?} for deletion.", self.segment_id.uuid_string()); + return; + } + for segment_filepath in &segment_filepaths_res.unwrap() { + if let Err(err) = self.index.directory().delete(&segment_filepath) { match err { FileError::FileDoesNotExist(_) => { // this is normal behavior. diff --git a/src/core/segment_component.rs b/src/core/segment_component.rs index a55ea19dc..62610af3f 100644 --- a/src/core/segment_component.rs +++ b/src/core/segment_component.rs @@ -1,5 +1,3 @@ -use std::vec::IntoIter; - #[derive(Copy, Clone)] pub enum SegmentComponent { INFO, @@ -9,30 +7,23 @@ pub enum SegmentComponent { FIELDNORMS, TERMS, STORE, + DELETE(u64), //< The argument here is an opstamp. + // All of the deletes with an opstamp smaller or equal + // to this opstamp have been taken in account. 
} impl SegmentComponent { - pub fn values() -> IntoIter { - vec!( - SegmentComponent::INFO, - SegmentComponent::POSTINGS, - SegmentComponent::POSITIONS, - SegmentComponent::FASTFIELDS, - SegmentComponent::FIELDNORMS, - SegmentComponent::TERMS, - SegmentComponent::STORE, - ).into_iter() - } - pub fn path_suffix(&self)-> &'static str { + pub fn path_suffix(&self)-> String { match *self { - SegmentComponent::POSITIONS => ".pos", - SegmentComponent::INFO => ".info", - SegmentComponent::POSTINGS => ".idx", - SegmentComponent::TERMS => ".term", - SegmentComponent::STORE => ".store", - SegmentComponent::FASTFIELDS => ".fast", - SegmentComponent::FIELDNORMS => ".fieldnorm", + SegmentComponent::POSITIONS => ".pos".to_string(), + SegmentComponent::INFO => ".info".to_string(), + SegmentComponent::POSTINGS => ".idx".to_string(), + SegmentComponent::TERMS => ".term".to_string(), + SegmentComponent::STORE => ".store".to_string(), + SegmentComponent::FASTFIELDS => ".fast".to_string(), + SegmentComponent::FIELDNORMS => ".fieldnorm".to_string(), + SegmentComponent::DELETE(opstamp) => format!("{}.del", opstamp) } } } diff --git a/src/core/segment_id.rs b/src/core/segment_id.rs index 3d77668b3..a9916cb83 100644 --- a/src/core/segment_id.rs +++ b/src/core/segment_id.rs @@ -50,7 +50,7 @@ impl SegmentId { } pub fn relative_path(&self, component: SegmentComponent) -> PathBuf { - let filename = self.uuid_string() + component.path_suffix(); + let filename = self.uuid_string() + &*component.path_suffix(); PathBuf::from(filename) } } diff --git a/src/directory/directory.rs b/src/directory/directory.rs index 6171ed606..1b29592b3 100644 --- a/src/directory/directory.rs +++ b/src/directory/directory.rs @@ -1,6 +1,6 @@ use std::marker::Send; use std::fmt; -use std::path::Path; +use std::path::{Path, PathBuf}; use directory::error::{FileError, OpenWriteError}; use directory::{ReadOnlySource, WritePtr}; use std::result; @@ -70,6 +70,10 @@ pub trait Directory: fmt::Debug + Send + Sync + 'static { 
/// Clones the directory and boxes the clone fn box_clone(&self) -> Box; + + /// Returns the list of files starting by a given + /// prefix. + fn ls_starting_with(&self, prefix: &str) -> io::Result>; } diff --git a/src/directory/mmap_directory.rs b/src/directory/mmap_directory.rs index e4e595fec..b905bd7f5 100644 --- a/src/directory/mmap_directory.rs +++ b/src/directory/mmap_directory.rs @@ -4,6 +4,7 @@ use std::collections::HashMap; use std::collections::hash_map::Entry as HashMapEntry; use fst::raw::MmapReadOnly; use std::fs::File; +use std::fs::ReadDir; use atomicwrites; use std::sync::RwLock; use std::fmt; @@ -128,8 +129,6 @@ impl Seek for SafeFileWriter { impl Directory for MmapDirectory { - - fn open_read(&self, path: &Path) -> result::Result { debug!("Open Read {:?}", path); let full_path = self.resolve_path(path); @@ -203,7 +202,7 @@ impl Directory for MmapDirectory { } fn delete(&self, path: &Path) -> result::Result<(), FileError> { - debug!("Delete {:?}", path); + debug!("Deleting file {:?}", path); let full_path = self.resolve_path(path); let mut mmap_cache = try!(self.mmap_cache .write() @@ -239,4 +238,23 @@ impl Directory for MmapDirectory { Box::new(self.clone()) } + fn ls_starting_with(&self, prefix: &str) -> io::Result> { + fs::read_dir(&self.root_path) + .map(|paths: ReadDir| { + paths + .filter_map(|dir_entry_res| + dir_entry_res + .ok() + .map(|dir_entry| dir_entry.path()) + ) + .filter(|path| + path.to_str() + .map(|filepath| filepath.starts_with(prefix)) + .unwrap_or(false) + ) + .map(PathBuf::from) + .collect() + }) + + } } diff --git a/src/directory/ram_directory.rs b/src/directory/ram_directory.rs index 6ced0ae7e..cb3c2757a 100644 --- a/src/directory/ram_directory.rs +++ b/src/directory/ram_directory.rs @@ -130,6 +130,20 @@ impl InnerDirectory { .contains_key(path) } + fn ls_starting_with(&self, prefix: &str) -> Vec { + self.0 + .read() + .expect("Failed to get read lock directory.") + .keys() + .filter(|path: &&PathBuf| + path.to_str() + 
.map(|p: &str| p.starts_with(prefix)) + .unwrap_or(false) + ) + .cloned() + .collect() + } + } impl fmt::Debug for RAMDirectory { @@ -198,4 +212,9 @@ impl Directory for RAMDirectory { Box::new(self.clone()) } + + fn ls_starting_with(&self, prefix: &str) -> io::Result> { + Ok(self.fs.ls_starting_with(prefix)) + } + } diff --git a/src/fastfield/delete.rs b/src/fastfield/delete.rs new file mode 100644 index 000000000..e700bc3e7 --- /dev/null +++ b/src/fastfield/delete.rs @@ -0,0 +1,91 @@ +use bit_set::BitSet; +use directory::WritePtr; +use std::io::Write; +use std::io; +use directory::ReadOnlySource; +use DocId; + +pub fn write_delete_bitset(delete_bitset: &BitSet, writer: &mut WritePtr) -> io::Result<()> { + let max_doc = delete_bitset.capacity(); + let mut byte = 0u8; + let mut shift = 0u8; + for doc in 0..max_doc { + if delete_bitset.contains(doc) { + byte |= 1 << shift; + } + if shift == 7 { + writer.write(&[byte])?; + shift = 0; + byte = 0; + } + else { + shift += 1; + } + } + if max_doc % 8 > 0 { + writer.write(&[byte])?; + } + writer.flush() +} + +pub struct DeleteBitSet(ReadOnlySource); + +impl DeleteBitSet { + + pub fn open(data: ReadOnlySource) -> DeleteBitSet { + DeleteBitSet(data) + } + + pub fn is_deleted(&self, doc: DocId) -> bool { + let byte_offset = doc / 8u32; + let b: u8 = (*self.0)[byte_offset as usize]; + let shift = (doc & 7u32) as u8; + b & (1u8 << shift) != 0 + } +} + + + +#[cfg(test)] +mod tests { + use std::path::PathBuf; + use bit_set::BitSet; + use directory::*; + use super::*; + + fn test_delete_bitset_helper(bitset: &BitSet) { + let test_path = PathBuf::from("test"); + let mut directory = RAMDirectory::create(); + { + let mut writer = directory.open_write(&*test_path).unwrap(); + write_delete_bitset(bitset, &mut writer).unwrap(); + } + { + let source = directory.open_read(&test_path).unwrap(); + let delete_bitset = DeleteBitSet::open(source); + let n = bitset.capacity(); + for doc in 0..n { + assert_eq!(bitset.contains(doc), 
delete_bitset.is_deleted(doc as DocId)); + } + } + } + + #[test] + fn test_delete_bitset() { + { + let mut bitset = BitSet::with_capacity(10); + bitset.insert(1); + bitset.insert(9); + test_delete_bitset_helper(&bitset); + } + { + let mut bitset = BitSet::with_capacity(8); + bitset.insert(1); + bitset.insert(2); + bitset.insert(3); + bitset.insert(5); + bitset.insert(7); + test_delete_bitset_helper(&bitset); + } + } +} \ No newline at end of file diff --git a/src/fastfield/mod.rs b/src/fastfield/mod.rs index b51a5d15b..00de208b9 100644 --- a/src/fastfield/mod.rs +++ b/src/fastfield/mod.rs @@ -13,6 +13,7 @@ mod reader; mod writer; mod serializer; +pub mod delete; pub use self::writer::{U32FastFieldsWriter, U32FastFieldWriter}; pub use self::reader::{U32FastFieldsReader, U32FastFieldReader}; diff --git a/src/indexer/index_writer.rs b/src/indexer/index_writer.rs index bc59efcb9..07aa2bd82 100644 --- a/src/indexer/index_writer.rs +++ b/src/indexer/index_writer.rs @@ -9,9 +9,11 @@ use schema::Term; use std::thread::JoinHandle; use indexer::{MergePolicy, DefaultMergePolicy}; use indexer::SegmentWriter; +use core::SegmentComponent; use super::directory_lock::DirectoryLock; use std::clone::Clone; use std::io; +use fastfield::delete; use std::thread; use std::mem; use indexer::merger::IndexMerger; @@ -87,7 +89,7 @@ impl !Sync for IndexWriter {} fn index_documents(heap: &mut Heap, - segment: Segment, + mut segment: Segment, schema: &Schema, document_iterator: &mut Iterator, segment_update_sender: &mut SegmentUpdateSender, @@ -95,7 +97,7 @@ fn index_documents(heap: &mut Heap, -> Result<()> { heap.clear(); let segment_id = segment.id(); - let mut segment_writer = try!(SegmentWriter::for_segment(heap, segment, &schema)); + let mut segment_writer = try!(SegmentWriter::for_segment(heap, segment.clone(), &schema)); for doc in document_iterator { try!(segment_writer.add_document(&doc, &schema)); if segment_writer.is_buffer_full() { @@ -105,23 +107,29 @@ fn index_documents(heap: 
&mut Heap, } } let num_docs = segment_writer.max_doc(); - assert!(num_docs > 0); - let first_opstamp: u64 = segment_writer.first_opstamp(); - let last_opstamp: u64 = segment_writer.last_opstamp(); - - delete_cursor.skip_to(first_opstamp); - - let delete_cursor_clone = delete_cursor.clone(); + assert!(num_docs > 0); + + let deleted_docset_opt = segment_writer.compute_deleted_bitset(delete_cursor); + + let last_opstamp = segment_writer.last_opstamp(); + + let num_deleted_docs; + + if let Some(deleted_docset) = deleted_docset_opt { + let mut delete_write = segment.open_write(SegmentComponent::DELETE(last_opstamp))?; + delete::write_delete_bitset(&deleted_docset, &mut delete_write)?; + num_deleted_docs = deleted_docset.len(); + } + else { + num_deleted_docs = 0; + } - let doc_mapping = segment_writer.compute_doc_mapping_after_delete(delete_cursor_clone); - let segment_meta = SegmentMeta { segment_id: segment_id, num_docs: num_docs, + num_deleted_docs: num_deleted_docs as u32, }; - delete_cursor.skip_to(last_opstamp); - try!(segment_writer.finalize()); segment_update_sender.send(SegmentUpdate::AddSegment(segment_meta)); Ok(()) @@ -330,9 +338,11 @@ impl IndexWriter { let num_docs = try!(merger.write(segment_serializer)); let merged_segment_ids: Vec = segments.iter().map(|segment| segment.id()).collect(); + let segment_meta = SegmentMeta { segment_id: merged_segment.id(), num_docs: num_docs, + num_deleted_docs: 0, }; segment_manager.end_merge(&merged_segment_ids, &segment_meta); diff --git a/src/indexer/segment_updater.rs b/src/indexer/segment_updater.rs index 2a1b3e577..f7141d46d 100644 --- a/src/indexer/segment_updater.rs +++ b/src/indexer/segment_updater.rs @@ -221,6 +221,7 @@ impl SegmentUpdater { let segment_meta = SegmentMeta { segment_id: merged_segment.id(), num_docs: num_docs, + num_deleted_docs: 0u32, }; let segment_update = SegmentUpdate::EndMerge(merging_thread_id, segment_ids.clone(), segment_meta.clone()); 
segment_update_sender_clone.send(segment_update.clone()); diff --git a/src/indexer/segment_writer.rs b/src/indexer/segment_writer.rs index 8a7930a86..074f6e7d1 100644 --- a/src/indexer/segment_writer.rs +++ b/src/indexer/segment_writer.rs @@ -156,25 +156,25 @@ impl<'a> SegmentWriter<'a> { doc_id as DocId } - pub fn compute_doc_mapping_after_delete(&self, mut delete_queue_cursor: DeleteQueueCursor) -> Vec> { - let delete_docs = self.compute_delete_mask(&mut delete_queue_cursor); - let max_doc: usize = self.max_doc as usize; - let mut doc_autoinc = 0u32; - (0..max_doc) - .map(|doc| { - if delete_docs.contains(doc) { - None - } - else { - let new_doc = doc_autoinc; - doc_autoinc += 1; - Some(new_doc) - } - }) - .collect::>() - } + // pub fn compute_doc_mapping_after_delete(&self, mut delete_queue_cursor: DeleteQueueCursor) -> Vec> { + // let delete_docs = self.compute_delete_mask(&mut delete_queue_cursor); + // let max_doc: usize = self.max_doc as usize; + // let mut doc_autoinc = 0u32; + // (0..max_doc) + // .map(|doc| { + // if delete_docs.contains(doc) { + // None + // } + // else { + // let new_doc = doc_autoinc; + // doc_autoinc += 1; + // Some(new_doc) + // } + // }) + // .collect::>() + // } - pub fn first_opstamp(&self) -> u64 { + fn first_opstamp(&self) -> u64 { *(self.doc_opstamps .first() .expect("Last doc opstamp called on an empty segment writer")) @@ -186,17 +186,21 @@ impl<'a> SegmentWriter<'a> { .expect("Last doc opstamp called on an empty segment writer")) } - fn compute_delete_mask(&self, delete_queue_cursor: &mut DeleteQueueCursor) -> BitSet { - if let Some(min_opstamp) = self.doc_opstamps.first() { - if !delete_queue_cursor.skip_to(*min_opstamp) { - return BitSet::new(); + pub fn compute_deleted_bitset(&self, delete_queue_cursor: &mut DeleteQueueCursor) -> Option { + if let Some(first_opstamp) = self.doc_opstamps.first() { + if !delete_queue_cursor.skip_to(*first_opstamp) { + return None; } } else { - return BitSet::new(); + return None; } + let 
last_opstamp = *self.doc_opstamps.last().unwrap(); let mut deleted_docs = BitSet::with_capacity(self.max_doc as usize); - while let Some(delete_operation) = delete_queue_cursor.consume() { + while let Some(delete_operation) = delete_queue_cursor.peek() { + if delete_operation.opstamp > last_opstamp { + break; + } // We can skip computing delete operations that // are older than our oldest document. // @@ -210,8 +214,9 @@ impl<'a> SegmentWriter<'a> { deleted_docs: &mut deleted_docs }; postings_writer.push_documents(delete_term.value(), &mut document_deleter); + delete_queue_cursor.consume(); } - deleted_docs + Some(deleted_docs) } From 093dcbd2537cb64383af4c2d969f180a2c2b9d64 Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Sat, 21 Jan 2017 11:23:29 +0900 Subject: [PATCH 008/107] issue/43 Isolated SegmentMeta --- src/core/index_meta.rs | 22 +--------------------- src/core/mod.rs | 4 +++- src/core/segment_meta.rs | 19 +++++++++++++++++++ src/directory/directory.rs | 3 ++- 4 files changed, 25 insertions(+), 23 deletions(-) create mode 100644 src/core/segment_meta.rs diff --git a/src/core/index_meta.rs b/src/core/index_meta.rs index d82f865f2..c6d7f4bc5 100644 --- a/src/core/index_meta.rs +++ b/src/core/index_meta.rs @@ -1,7 +1,5 @@ - use schema::Schema; -use core::SegmentId; - +use core::SegmentMeta; /// Meta information about the `Index`. 
/// @@ -29,21 +27,3 @@ impl IndexMeta { } } } - -#[derive(Clone, Debug, RustcDecodable,RustcEncodable)] -pub struct SegmentMeta { - pub segment_id: SegmentId, - pub num_docs: u32, - pub num_deleted_docs: u32, -} - -#[cfg(test)] -impl SegmentMeta { - pub fn new(segment_id: SegmentId, num_docs: u32) -> SegmentMeta { - SegmentMeta { - segment_id: segment_id, - num_docs: num_docs, - num_deleted_docs: 0, - } - } -} \ No newline at end of file diff --git a/src/core/mod.rs b/src/core/mod.rs index 2dfac69d1..3111cfadc 100644 --- a/src/core/mod.rs +++ b/src/core/mod.rs @@ -7,6 +7,7 @@ mod segment_component; mod segment; mod index_meta; mod pool; +mod segment_meta; mod term_iterator; use std::path::PathBuf; @@ -18,7 +19,8 @@ pub use self::segment::Segment; pub use self::segment::SegmentInfo; pub use self::segment::SerializableSegment; pub use self::index::Index; -pub use self::index_meta::{IndexMeta, SegmentMeta}; +pub use self::segment_meta::SegmentMeta; +pub use self::index_meta::IndexMeta; pub use self::term_iterator::TermIterator; lazy_static! { diff --git a/src/core/segment_meta.rs b/src/core/segment_meta.rs new file mode 100644 index 000000000..ef7521818 --- /dev/null +++ b/src/core/segment_meta.rs @@ -0,0 +1,19 @@ +use core::SegmentId; + +#[derive(Clone, Debug, RustcDecodable,RustcEncodable)] +pub struct SegmentMeta { + pub segment_id: SegmentId, + pub num_docs: u32, + pub num_deleted_docs: u32, +} + +#[cfg(test)] +impl SegmentMeta { + pub fn new(segment_id: SegmentId, num_docs: u32) -> SegmentMeta { + SegmentMeta { + segment_id: segment_id, + num_docs: num_docs, + num_deleted_docs: 0, + } + } +} \ No newline at end of file diff --git a/src/directory/directory.rs b/src/directory/directory.rs index 1b29592b3..40da74654 100644 --- a/src/directory/directory.rs +++ b/src/directory/directory.rs @@ -7,7 +7,8 @@ use std::result; use std::io; use std::marker::Sync; -/// Write-once read many (WORM) abstraction for where tantivy's index should be stored. 
+/// Write-once read many (WORM) abstraction for where +/// tantivy's data should be stored. /// /// There are currently two implementations of `Directory` /// From d6e7157173c8c986bd81832faf799876cdc370dd Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Sat, 21 Jan 2017 18:17:54 +0900 Subject: [PATCH 009/107] issue/43 Test broken... moved segment manager to the segment updater / segment writer --- src/core/index.rs | 38 +++++++++---------- src/indexer/index_writer.rs | 40 ++++++++++++++------ src/indexer/merger.rs | 2 +- src/indexer/mod.rs | 1 + src/indexer/segment_manager.rs | 36 +++++++++++++----- src/indexer/segment_register.rs | 65 ++++++++++++++++++++++----------- src/indexer/segment_updater.rs | 64 +++++++++++++++++++------------- 7 files changed, 156 insertions(+), 90 deletions(-) diff --git a/src/core/index.rs b/src/core/index.rs index d29cc1fc3..6f9e987ef 100644 --- a/src/core/index.rs +++ b/src/core/index.rs @@ -13,9 +13,9 @@ use num_cpus; use super::segment::Segment; use core::SegmentReader; use super::pool::Pool; +use core::SegmentMeta; use super::pool::LeasedItem; use std::path::Path; -use indexer::SegmentManager; use core::IndexMeta; use core::META_FILEPATH; use super::segment::create_segment; @@ -23,12 +23,6 @@ use indexer::segment_updater::save_new_metas; const NUM_SEARCHERS: usize = 12; -/// Accessor to the index segment manager -/// -/// This method is not part of tantivy's public API -pub fn get_segment_manager(index: &Index) -> Arc { - index.segment_manager.clone() -} fn load_metas(directory: &Directory) -> Result { @@ -40,8 +34,6 @@ fn load_metas(directory: &Directory) -> Result { /// Tantivy's Search Index pub struct Index { - segment_manager: Arc, - directory: Box, schema: Schema, searcher_pool: Arc>, @@ -85,10 +77,8 @@ impl Index { fn create_from_metas(directory: Box, metas: IndexMeta) -> Result { let schema = metas.schema.clone(); let docstamp = metas.docstamp; - let committed_segments = metas.committed_segments; // TODO log somethings is 
uncommitted is not empty. let index = Index { - segment_manager: Arc::new(SegmentManager::from_segments(committed_segments)), directory: directory, schema: schema, searcher_pool: Arc::new(Pool::new()), @@ -151,11 +141,11 @@ impl Index { } /// Returns the list of segments that are searchable - pub fn searchable_segments(&self) -> Vec { - self.searchable_segment_ids() + pub fn searchable_segments(&self) -> Result> { + Ok(self.searchable_segment_ids()? .into_iter() .map(|segment_id| self.segment(segment_id)) - .collect() + .collect()) } /// Remove all of the file associated with the segment. @@ -184,9 +174,21 @@ impl Index { &mut *self.directory } + pub fn committed_segments(&self) -> Result> { + Ok(load_metas(self.directory())? + .committed_segments) + } + /// Returns the list of segment ids that are searchable. - fn searchable_segment_ids(&self) -> Vec { - self.segment_manager.committed_segments() + pub fn searchable_segment_ids(&self) -> Result> { + self.committed_segments() + .map(|commited_segments| { + commited_segments + .iter() + .map(|segment_meta| segment_meta.segment_id) + .collect() + }) + } /// Creates a new segment. @@ -200,7 +202,7 @@ impl Index { /// This needs to be called when a new segment has been /// published or after a merge. 
pub fn load_searchers(&self) -> Result<()> { - let searchable_segments = self.searchable_segments(); + let searchable_segments = self.searchable_segments()?; let mut searchers = Vec::new(); for _ in 0..NUM_SEARCHERS { let searchable_segments_clone = searchable_segments.clone(); @@ -239,8 +241,6 @@ impl fmt::Debug for Index { impl Clone for Index { fn clone(&self) -> Index { Index { - segment_manager: self.segment_manager.clone(), - directory: self.directory.box_clone(), schema: self.schema.clone(), searcher_pool: self.searcher_pool.clone(), diff --git a/src/indexer/index_writer.rs b/src/indexer/index_writer.rs index 07aa2bd82..38cd56e63 100644 --- a/src/indexer/index_writer.rs +++ b/src/indexer/index_writer.rs @@ -6,9 +6,11 @@ use core::SerializableSegment; use core::Index; use core::Segment; use schema::Term; +use indexer::SegmentEntry; use std::thread::JoinHandle; use indexer::{MergePolicy, DefaultMergePolicy}; use indexer::SegmentWriter; +use indexer::SegmentManager; use core::SegmentComponent; use super::directory_lock::DirectoryLock; use std::clone::Clone; @@ -26,7 +28,6 @@ use core::SegmentMeta; use super::delete_queue::{DeleteQueue, DeleteQueueCursor}; use super::segment_updater::{SegmentUpdater, SegmentUpdate, SegmentUpdateSender}; use std::time::Duration; -use super::super::core::index::get_segment_manager; use super::segment_manager::CommitState; use Result; use Error; @@ -63,6 +64,8 @@ pub struct IndexWriter { _merge_policy: Arc>>, index: Index, + segment_manager: Arc, + heap_size_in_bytes_per_thread: usize, workers_join_handle: Vec>>, @@ -130,8 +133,10 @@ fn index_documents(heap: &mut Heap, num_deleted_docs: num_deleted_docs as u32, }; + let segment_entry = SegmentEntry::new(segment_meta, delete_cursor.clone()); + try!(segment_writer.finalize()); - segment_update_sender.send(SegmentUpdate::AddSegment(segment_meta)); + segment_update_sender.send(SegmentUpdate::AddSegment(segment_entry)); Ok(()) } @@ -247,8 +252,14 @@ impl IndexWriter { 
chan::sync(PIPELINE_MAX_SIZE_IN_DOCS); let merge_policy: Arc>> = Arc::new(Mutex::new(box DefaultMergePolicy::default())); + + let delete_queue = DeleteQueue::default(); + + let committed_segments = index.committed_segments()?; - let (segment_update_sender, segment_update_thread) = SegmentUpdater::start_updater(index.clone(), merge_policy.clone()); + let segment_manager = Arc::new(SegmentManager::from_segments(committed_segments, delete_queue.cursor())); + + let (segment_update_sender, segment_update_thread) = SegmentUpdater::start_updater(index.clone(), segment_manager.clone(), merge_policy.clone()); let mut index_writer = IndexWriter { @@ -258,6 +269,7 @@ impl IndexWriter { heap_size_in_bytes_per_thread: heap_size_in_bytes_per_thread, index: index.clone(), + segment_manager: segment_manager, document_receiver: document_receiver, document_sender: document_sender, @@ -268,7 +280,7 @@ impl IndexWriter { workers_join_handle: Vec::new(), num_threads: num_threads, - delete_queue: DeleteQueue::default(), + delete_queue: delete_queue, committed_docstamp: index.docstamp(), uncommitted_docstamp: index.docstamp(), @@ -304,9 +316,7 @@ impl IndexWriter { return Ok(()); } - - let segment_manager = get_segment_manager(&self.index); - + let ref segment_manager = self.segment_manager; { // let's check that all these segments are in the same // committed/uncommited state. @@ -345,7 +355,13 @@ impl IndexWriter { num_deleted_docs: 0, }; - segment_manager.end_merge(&merged_segment_ids, &segment_meta); + // TODO fix this!!! + let delete_queue = DeleteQueue::default(); + let delete_cursor = delete_queue.cursor(); + + let segment_entry = SegmentEntry::new(segment_meta, delete_cursor); + + segment_manager.end_merge(&merged_segment_ids, segment_entry); try!(self.index.load_searchers()); Ok(()) } @@ -411,7 +427,7 @@ impl IndexWriter { // from now on. 
self.segment_update_sender.send(SegmentUpdate::NewGeneration); - let rollbacked_segments = get_segment_manager(&self.index).rollback(); + let rollbacked_segments = self.segment_manager.rollback(); for segment_id in rollbacked_segments { // TODO all delete must happen after saving @@ -472,8 +488,8 @@ impl IndexWriter { self.segment_update_sender.send(SegmentUpdate::Commit(self.committed_docstamp)); // wait for the segment update thread to have processed the info - let segment_manager = get_segment_manager(&self.index); - while segment_manager.docstamp() != self.committed_docstamp { + + while self.segment_manager.docstamp() != self.committed_docstamp { thread::sleep(Duration::from_millis(100)); } @@ -629,7 +645,7 @@ mod tests { index_writer.commit().expect("commit failed"); index_writer.wait_merging_threads().expect("waiting merging thread failed"); assert_eq!(num_docs_containing("a"), 200); - assert_eq!(index.searchable_segments().len(), 1); + assert_eq!(index.searchable_segments().unwrap().len(), 1); } } diff --git a/src/indexer/merger.rs b/src/indexer/merger.rs index 4e616d5f0..865e83437 100644 --- a/src/indexer/merger.rs +++ b/src/indexer/merger.rs @@ -260,7 +260,7 @@ mod tests { } } { - let segments = index.searchable_segments(); + let segments = index.searchable_segments().unwrap(); let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap(); index_writer.merge(&segments).unwrap(); } diff --git a/src/indexer/mod.rs b/src/indexer/mod.rs index 1f970f72f..15e2035f7 100644 --- a/src/indexer/mod.rs +++ b/src/indexer/mod.rs @@ -13,6 +13,7 @@ mod directory_lock; pub mod document_receiver; pub mod operation; +pub use self::segment_register::SegmentEntry; pub use self::segment_serializer::SegmentSerializer; pub use self::segment_writer::SegmentWriter; pub use self::index_writer::IndexWriter; diff --git a/src/indexer/segment_manager.rs b/src/indexer/segment_manager.rs index 3a41adc25..3b9ad78af 100644 --- a/src/indexer/segment_manager.rs +++ 
b/src/indexer/segment_manager.rs @@ -2,6 +2,8 @@ use super::segment_register::SegmentRegister; use std::sync::RwLock; use core::SegmentMeta; use core::SegmentId; +use indexer::SegmentEntry; +use indexer::delete_queue::DeleteQueueCursor; use std::sync::{RwLockReadGuard, RwLockWriteGuard}; use std::fmt::{self, Debug, Formatter}; use std::sync::atomic::{AtomicUsize, Ordering}; @@ -84,12 +86,13 @@ impl SegmentManager { self.read().docstamp } - pub fn from_segments(segment_metas: Vec) -> SegmentManager { + + pub fn from_segments(segment_metas: Vec, delete_cursor: DeleteQueueCursor) -> SegmentManager { SegmentManager { registers: RwLock::new( SegmentRegisters { docstamp: 0u64, // TODO put the actual value uncommitted: SegmentRegister::default(), - committed: SegmentRegister::from(segment_metas), + committed: SegmentRegister::new(segment_metas, delete_cursor), }), generation: AtomicUsize::default(), } @@ -130,11 +133,6 @@ impl SegmentManager { registers_lock.uncommitted.clear(); } - pub fn add_segment(&self, segment_meta: SegmentMeta) { - let mut registers_lock = self.write(); - registers_lock.uncommitted.add_segment(segment_meta); - } - pub fn start_merge(&self, segment_ids: &[SegmentId]) { let mut registers_lock = self.write(); if registers_lock.uncommitted.contains_all(segment_ids) { @@ -148,20 +146,25 @@ impl SegmentManager { } } } + + pub fn add_segment(&self, segment_entry: SegmentEntry) { + let mut registers_lock = self.write(); + registers_lock.uncommitted.add_segment_entry(segment_entry); + } - pub fn end_merge(&self, merged_segment_ids: &[SegmentId], merged_segment_meta: &SegmentMeta) { + pub fn end_merge(&self, merged_segment_ids: &[SegmentId], merged_segment_entry: SegmentEntry) { let mut registers_lock = self.write(); if registers_lock.uncommitted.contains_all(merged_segment_ids) { for segment_id in merged_segment_ids { registers_lock.uncommitted.remove_segment(segment_id); } - registers_lock.uncommitted.add_segment(merged_segment_meta.clone()); + 
registers_lock.uncommitted.add_segment_entry(merged_segment_entry); } else if registers_lock.committed.contains_all(merged_segment_ids) { for segment_id in merged_segment_ids { registers_lock.committed.remove_segment(segment_id); } - registers_lock.committed.add_segment(merged_segment_meta.clone()); + registers_lock.committed.add_segment_entry(merged_segment_entry); } else { warn!("couldn't find segment in SegmentManager"); } @@ -178,3 +181,16 @@ impl SegmentManager { } } + +impl Default for SegmentManager { + fn default() -> SegmentManager { + SegmentManager { + registers: RwLock::new( SegmentRegisters { + docstamp: 0u64, // TODO put the actual value + uncommitted: SegmentRegister::default(), + committed: SegmentRegister::default(), + }), + generation: AtomicUsize::default(), + } + } +} \ No newline at end of file diff --git a/src/indexer/segment_register.rs b/src/indexer/segment_register.rs index 5718a6228..f1dc37d24 100644 --- a/src/indexer/segment_register.rs +++ b/src/indexer/segment_register.rs @@ -3,6 +3,7 @@ use std::collections::HashMap; use core::SegmentMeta; use std::fmt; use std::fmt::{Debug, Formatter}; +use indexer::delete_queue::DeleteQueueCursor; #[derive(Clone, PartialEq, Eq, Debug)] pub enum SegmentState { @@ -23,9 +24,15 @@ impl SegmentState { pub struct SegmentEntry { meta: SegmentMeta, state: SegmentState, + delete_cursor: DeleteQueueCursor, } impl SegmentEntry { + + pub fn segment_id(&self) -> SegmentId { + self.meta.segment_id + } + fn start_merge(&mut self,) { self.state = SegmentState::InMerge; } @@ -33,6 +40,21 @@ impl SegmentEntry { fn is_ready(&self,) -> bool { self.state == SegmentState::Ready } + + pub fn new(segment_meta: SegmentMeta, + delete_cursor: DeleteQueueCursor) -> SegmentEntry { + SegmentEntry { + meta: segment_meta, + state: SegmentState::Ready, + delete_cursor: delete_cursor, + } + } +} + +impl Debug for SegmentEntry { + fn fmt(&self, formatter: &mut fmt::Formatter) -> fmt::Result { + write!(formatter, "SegmentEntry({:?}, 
{:?})", self.meta, self.state) + } } @@ -49,6 +71,7 @@ pub struct SegmentRegister { segment_states: HashMap, } + impl Debug for SegmentRegister { fn fmt(&self, f: &mut Formatter) -> Result<(), fmt::Error> { try!(write!(f, "SegmentRegister(")); @@ -120,13 +143,6 @@ impl SegmentRegister { self.segment_states.insert(segment_id, segment_entry); } - pub fn add_segment(&mut self, segment_meta: SegmentMeta) { - self.add_segment_entry(SegmentEntry { - meta: segment_meta.clone(), - state: SegmentState::Ready, - }); - } - pub fn remove_segment(&mut self, segment_id: &SegmentId) { self.segment_states.remove(segment_id); } @@ -138,20 +154,11 @@ impl SegmentRegister { .start_merge(); } - -} - - -impl From> for SegmentRegister { - fn from(segment_metas: Vec) -> SegmentRegister { + pub fn new(segment_metas: Vec, delete_cursor: DeleteQueueCursor) -> SegmentRegister { let mut segment_states = HashMap::new(); for segment_meta in segment_metas { let segment_id = segment_meta.segment_id; - let segment_entry = SegmentEntry { - meta: segment_meta, - state: SegmentState::Ready, - - }; + let segment_entry = SegmentEntry::new(segment_meta, delete_cursor.clone()); segment_states.insert(segment_id, segment_entry); } SegmentRegister { @@ -173,19 +180,29 @@ mod tests { use core::SegmentId; use core::SegmentMeta; + use indexer::delete_queue::DeleteQueue; use super::*; #[test] fn test_segment_register() { + let delete_queue = DeleteQueue::default(); let mut segment_register = SegmentRegister::default(); let segment_id_a = SegmentId::generate_random(); let segment_id_b = SegmentId::generate_random(); let segment_id_merged = SegmentId::generate_random(); - let segment_meta_merged = SegmentMeta::new(segment_id_merged, 10 + 20); - segment_register.add_segment(SegmentMeta::new(segment_id_a, 10)); + + { + let segment_meta = SegmentMeta::new(segment_id_a, 10); + let segment_entry = SegmentEntry::new(segment_meta, delete_queue.cursor()); + segment_register.add_segment_entry(segment_entry); + } 
assert_eq!(segment_register.segment_entry(&segment_id_a).unwrap().state, SegmentState::Ready); assert_eq!(segment_register.segment_ids(), vec!(segment_id_a)); - segment_register.add_segment(SegmentMeta::new(segment_id_b, 20)); + { + let segment_meta = SegmentMeta::new(segment_id_b, 20); + let segment_entry = SegmentEntry::new(segment_meta, delete_queue.cursor()); + segment_register.add_segment_entry(segment_entry); + } assert_eq!(segment_register.segment_entry(&segment_id_b).unwrap().state, SegmentState::Ready); segment_register.start_merge(&segment_id_a); segment_register.start_merge(&segment_id_b); @@ -193,7 +210,11 @@ mod tests { assert_eq!(segment_register.segment_entry(&segment_id_b).unwrap().state, SegmentState::InMerge); segment_register.remove_segment(&segment_id_a); segment_register.remove_segment(&segment_id_b); - segment_register.add_segment(segment_meta_merged); + { + let segment_meta_merged = SegmentMeta::new(segment_id_merged, 10 + 20); + let segment_entry = SegmentEntry::new(segment_meta_merged, delete_queue.cursor()); + segment_register.add_segment_entry(segment_entry); + } assert_eq!(segment_register.segment_ids(), vec!(segment_id_merged)); } diff --git a/src/indexer/segment_updater.rs b/src/indexer/segment_updater.rs index f7141d46d..44da31c72 100644 --- a/src/indexer/segment_updater.rs +++ b/src/indexer/segment_updater.rs @@ -12,6 +12,7 @@ use indexer::MergePolicy; use indexer::MergeCandidate; use indexer::merger::IndexMerger; use indexer::SegmentSerializer; +use indexer::SegmentEntry; use std::thread; use schema::Schema; use directory::Directory; @@ -19,12 +20,12 @@ use std::thread::JoinHandle; use std::sync::Arc; use std::collections::HashMap; use rustc_serialize::json; +use indexer::delete_queue::DeleteQueue; use Result; use core::IndexMeta; use core::META_FILEPATH; use std::io::Write; use super::segment_manager::{SegmentManager, get_segment_ready_for_commit}; -use super::super::core::index::get_segment_manager; pub type SegmentUpdateSender = 
chan::Sender; pub type SegmentUpdateReceiver = chan::Receiver; @@ -54,7 +55,7 @@ pub fn save_new_metas(schema: Schema, docstamp: u64, directory: &mut Directory) -> Result<()> { - let segment_manager = SegmentManager::from_segments(Vec::new()); + let segment_manager = SegmentManager::default(); save_metas(&segment_manager, schema, docstamp, directory) } @@ -82,17 +83,17 @@ pub fn save_metas(segment_manager: &SegmentManager, } -#[derive(Debug, Clone)] +#[derive(Clone, Debug)] pub enum SegmentUpdate { /// New segment added. /// Created by the indexing worker thread - AddSegment(SegmentMeta), + AddSegment(SegmentEntry), /// A merge is ended. /// Remove the merged segment and record the new /// large merged segment. - EndMerge(usize, Vec, SegmentMeta), + EndMerge(usize, Vec, SegmentEntry), /// Happens when rollback is called. /// The current generation of segments is cancelled. @@ -135,29 +136,33 @@ pub struct SegmentUpdater { is_cancelled_generation: bool, segment_update_receiver: SegmentUpdateReceiver, segment_update_sender: SegmentUpdateSender, - segment_manager_arc: Arc, + segment_manager: Arc, merge_policy: Arc>>, merging_thread_id: usize, - merging_threads: HashMap, SegmentMeta)> >, + merging_threads: HashMap, SegmentEntry)> >, } impl SegmentUpdater { - pub fn start_updater(index: Index, merge_policy: Arc>>) -> (SegmentUpdateSender, JoinHandle<()>) { - let segment_updater = SegmentUpdater::new(index, merge_policy); + pub fn start_updater( + index: Index, + segment_manager: Arc, + merge_policy: Arc>>) -> (SegmentUpdateSender, JoinHandle<()>) { + let segment_updater = SegmentUpdater::new(index, segment_manager, merge_policy); (segment_updater.segment_update_sender.clone(), segment_updater.start()) } - fn new(index: Index, merge_policy: Arc>>) -> SegmentUpdater { - let segment_manager_arc = get_segment_manager(&index); + fn new(index: Index, + segment_manager: Arc, + merge_policy: Arc>>) -> SegmentUpdater { let (segment_update_sender, segment_update_receiver): 
(SegmentUpdateSender, SegmentUpdateReceiver) = chan::async(); SegmentUpdater { index: index, is_cancelled_generation: false, segment_update_sender: segment_update_sender, segment_update_receiver: segment_update_receiver, - segment_manager_arc: segment_manager_arc, + segment_manager: segment_manager, merge_policy: merge_policy, merging_thread_id: 0, merging_threads: HashMap::new(), @@ -173,10 +178,10 @@ impl SegmentUpdater { fn end_merge( &mut self, segment_ids: Vec, - segment_meta: SegmentMeta) { + segment_entry: SegmentEntry) { - let segment_manager = self.segment_manager_arc.clone(); - segment_manager.end_merge(&segment_ids, &segment_meta); + let segment_manager = self.segment_manager.clone(); + segment_manager.end_merge(&segment_ids, segment_entry); save_metas( &*segment_manager, self.index.schema(), @@ -223,9 +228,16 @@ impl SegmentUpdater { num_docs: num_docs, num_deleted_docs: 0u32, }; - let segment_update = SegmentUpdate::EndMerge(merging_thread_id, segment_ids.clone(), segment_meta.clone()); + + + // TODO fix delete cursor + let delete_queue = DeleteQueue::default(); + + let segment_entry = SegmentEntry::new(segment_meta, delete_queue.cursor()); + + let segment_update = SegmentUpdate::EndMerge(merging_thread_id, segment_ids.clone(), segment_entry.clone()); segment_update_sender_clone.send(segment_update.clone()); - (segment_ids, segment_meta) + (segment_ids, segment_entry) }) .expect("Failed to spawn merge thread"); @@ -247,7 +259,7 @@ impl SegmentUpdater { fn segment_manager(&self,) -> &SegmentManager { - &*self.segment_manager_arc + &*self.segment_manager } pub fn start(self,) -> JoinHandle<()> { @@ -261,7 +273,7 @@ impl SegmentUpdater { fn process(mut self,) { - let segment_manager = self.segment_manager_arc.clone(); + let segment_manager = self.segment_manager.clone(); for segment_update in self.segment_update_receiver.clone() { @@ -311,8 +323,8 @@ impl SegmentUpdater { mem::swap(&mut merging_threads, &mut self.merging_threads); for (_, 
merging_thread_handle) in merging_threads { match merging_thread_handle.join() { - Ok((segment_ids, segment_meta)) => { - self.end_merge(segment_ids, segment_meta); + Ok((segment_ids, segment_entry)) => { + self.end_merge(segment_ids, segment_entry); } Err(e) => { error!("Error in merging thread {:?}", e); @@ -333,9 +345,9 @@ impl SegmentUpdater { info!("Segment update: {:?}", segment_update); match segment_update { - SegmentUpdate::AddSegment(segment_meta) => { + SegmentUpdate::AddSegment(segment_entry) => { if !self.is_cancelled_generation { - self.segment_manager().add_segment(segment_meta); + self.segment_manager().add_segment(segment_entry); } else { // rollback has been called and this @@ -343,13 +355,13 @@ impl SegmentUpdater { // documents that have been dropped. // // Let's just remove its files. - self.index.delete_segment(segment_meta.segment_id); + self.index.delete_segment(segment_entry.segment_id()); } } - SegmentUpdate::EndMerge(merging_thread_id, segment_ids, segment_meta) => { + SegmentUpdate::EndMerge(merging_thread_id, segment_ids, segment_entry) => { self.end_merge( segment_ids, - segment_meta); + segment_entry); self.merging_threads.remove(&merging_thread_id); } SegmentUpdate::CancelGeneration => { From bacaabf8571261e08085a96b78e2e238814d361e Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Sun, 22 Jan 2017 14:50:29 +0900 Subject: [PATCH 010/107] issue/43 fixed on unit test. 
need big refactoring of segment updater --- src/core/index.rs | 12 +++++++----- src/core/searcher.rs | 15 +++++++++++++-- src/core/segment.rs | 30 +++++++++++++++++------------- src/core/segment_reader.rs | 4 ++++ src/indexer/index_writer.rs | 27 +++++++++++++++++++-------- src/indexer/merger.rs | 7 ++++--- src/indexer/segment_updater.rs | 20 +++++++++++--------- src/lib.rs | 1 + src/query/boolean_query/mod.rs | 2 ++ 9 files changed, 78 insertions(+), 40 deletions(-) diff --git a/src/core/index.rs b/src/core/index.rs index 6f9e987ef..5b8c73723 100644 --- a/src/core/index.rs +++ b/src/core/index.rs @@ -2,6 +2,7 @@ use Result; use Error; use schema::Schema; use std::sync::Arc; +use std::borrow::BorrowMut; use std::fmt; use rustc_serialize::json; use core::SegmentId; @@ -55,8 +56,7 @@ impl Index { /// /// If a previous index was in this directory, then its meta file will be destroyed. pub fn create(directory_path: &Path, schema: Schema) -> Result { - let mut directory = MmapDirectory::open(directory_path)?; - save_new_metas(schema.clone(), 0, &mut directory)?; + let directory = MmapDirectory::open(directory_path)?; Index::from_directory(box directory, schema) } @@ -88,8 +88,9 @@ impl Index { Ok(index) } - /// Opens a new directory from a directory. - pub fn from_directory(directory: Box, schema: Schema) -> Result { + /// Create a new index from a directory. + pub fn from_directory(mut directory: Box, schema: Schema) -> Result { + save_new_metas(schema.clone(), 0, directory.borrow_mut())?; Index::create_from_metas(directory, IndexMeta::with_schema(schema)) } @@ -142,7 +143,8 @@ impl Index { /// Returns the list of segments that are searchable pub fn searchable_segments(&self) -> Result> { - Ok(self.searchable_segment_ids()? 
+ let searchable_segment_ids = self.searchable_segment_ids()?; + Ok(searchable_segment_ids .into_iter() .map(|segment_id| self.segment(segment_id)) .collect()) diff --git a/src/core/searcher.rs b/src/core/searcher.rs index 0d99a2897..0ea6cf840 100644 --- a/src/core/searcher.rs +++ b/src/core/searcher.rs @@ -8,6 +8,7 @@ use DocId; use DocAddress; use schema::Term; use core::TermIterator; +use std::fmt; /// Holds a list of `SegmentReader`s ready for search. @@ -15,13 +16,23 @@ use core::TermIterator; /// It guarantees that the `Segment` will not be removed before /// the destruction of the `Searcher`. /// -#[derive(Debug)] pub struct Searcher { segment_readers: Vec, } +impl fmt::Debug for Searcher { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + let segment_ids = self.segment_readers + .iter() + .map(|segment_reader| segment_reader.segment_id()) + .collect::>(); + write!(f, "Searcher({:?})", segment_ids); + Ok(()) + } +} + impl Searcher { - + /// Fetches a document from tantivy's store given a `DocAddress`. /// /// The searcher uses the segment ordinal to route the diff --git a/src/core/segment.rs b/src/core/segment.rs index 354ddb34a..aab25574c 100644 --- a/src/core/segment.rs +++ b/src/core/segment.rs @@ -71,22 +71,26 @@ impl Segment { let segment_filepaths_res = self.index.directory().ls_starting_with( &*self.segment_id.uuid_string() ); - if segment_filepaths_res.is_err() { - error!("Failed to list files of segment {:?} for deletion.", self.segment_id.uuid_string()); - return; - } - for segment_filepath in &segment_filepaths_res.unwrap() { - if let Err(err) = self.index.directory().delete(&segment_filepath) { - match err { - FileError::FileDoesNotExist(_) => { - // this is normal behavior. - // the position file for instance may not exists. 
- } - FileError::IOError(err) => { - error!("Failed to remove {:?} : {:?}", self.segment_id, err); + + match segment_filepaths_res { + Ok(segment_filepaths) => { + for segment_filepath in &segment_filepaths { + if let Err(err) = self.index.directory().delete(&segment_filepath) { + match err { + FileError::FileDoesNotExist(_) => { + // this is normal behavior. + // the position file for instance may not exists. + } + FileError::IOError(err) => { + error!("Failed to remove {:?} : {:?}", self.segment_id, err); + } + } } } } + Err(_) => { + error!("Failed to list files of segment {:?} for deletion.", self.segment_id.uuid_string()); + } } } diff --git a/src/core/segment_reader.rs b/src/core/segment_reader.rs index 3860da881..15b873a79 100644 --- a/src/core/segment_reader.rs +++ b/src/core/segment_reader.rs @@ -237,6 +237,10 @@ impl SegmentReader { pub fn get_term_info(&self, term: &Term) -> Option { self.term_infos.get(term.as_slice()) } + + pub fn segment_id(&self) -> SegmentId { + self.segment_id + } } diff --git a/src/indexer/index_writer.rs b/src/indexer/index_writer.rs index 38cd56e63..de2b77fb6 100644 --- a/src/indexer/index_writer.rs +++ b/src/indexer/index_writer.rs @@ -19,7 +19,6 @@ use fastfield::delete; use std::thread; use std::mem; use indexer::merger::IndexMerger; -use core::SegmentId; use datastruct::stacker::Heap; use std::mem::swap; use std::sync::{Arc, Mutex}; @@ -346,8 +345,10 @@ impl IndexWriter { // to merge the two segments. 
let segment_serializer = try!(SegmentSerializer::for_segment(&mut merged_segment)); let num_docs = try!(merger.write(segment_serializer)); - let merged_segment_ids: Vec = - segments.iter().map(|segment| segment.id()).collect(); + let merged_segment_ids = segments + .iter() + .map(|segment| segment.id()) + .collect::>(); let segment_meta = SegmentMeta { segment_id: merged_segment.id(), @@ -360,9 +361,18 @@ impl IndexWriter { let delete_cursor = delete_queue.cursor(); let segment_entry = SegmentEntry::new(segment_meta, delete_cursor); + + let segment_update = SegmentUpdate::EndMerge( + None, + merged_segment_ids, + segment_entry + ); + + self.segment_update_sender.send(segment_update); - segment_manager.end_merge(&merged_segment_ids, segment_entry); - try!(self.index.load_searchers()); + // self.segment_updater.(segment_ids, segment_entry); + //segment_manager.end_merge(&merged_segment_ids, segment_entry); + Ok(()) } @@ -488,10 +498,8 @@ impl IndexWriter { self.segment_update_sender.send(SegmentUpdate::Commit(self.committed_docstamp)); // wait for the segment update thread to have processed the info + // TODO no guarantee that it has been written on disk. 
- while self.segment_manager.docstamp() != self.committed_docstamp { - thread::sleep(Duration::from_millis(100)); - } Ok(self.committed_docstamp) } @@ -529,6 +537,9 @@ impl IndexWriter { } } + + + #[cfg(test)] mod tests { diff --git a/src/indexer/merger.rs b/src/indexer/merger.rs index 865e83437..f3d286434 100644 --- a/src/indexer/merger.rs +++ b/src/indexer/merger.rs @@ -256,13 +256,14 @@ mod tests { doc.add_u32(score_field, 13); index_writer.add_document(doc).unwrap(); } - index_writer.commit().unwrap(); + index_writer.commit().expect("Commit failed"); } } { - let segments = index.searchable_segments().unwrap(); + let segments = index.searchable_segments().expect("Searchable segments failed."); let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap(); - index_writer.merge(&segments).unwrap(); + index_writer.merge(&segments).expect("Merging failed"); + index_writer.wait_merging_threads().unwrap(); } { let searcher = index.searcher(); diff --git a/src/indexer/segment_updater.rs b/src/indexer/segment_updater.rs index 44da31c72..02a28a731 100644 --- a/src/indexer/segment_updater.rs +++ b/src/indexer/segment_updater.rs @@ -82,7 +82,6 @@ pub fn save_metas(segment_manager: &SegmentManager, .map_err(From::from) } - #[derive(Clone, Debug)] pub enum SegmentUpdate { @@ -93,7 +92,7 @@ pub enum SegmentUpdate { /// A merge is ended. /// Remove the merged segment and record the new /// large merged segment. - EndMerge(usize, Vec, SegmentEntry), + EndMerge(Option, Vec, SegmentEntry), /// Happens when rollback is called. /// The current generation of segments is cancelled. 
@@ -179,7 +178,6 @@ impl SegmentUpdater { &mut self, segment_ids: Vec, segment_entry: SegmentEntry) { - let segment_manager = self.segment_manager.clone(); segment_manager.end_merge(&segment_ids, segment_entry); save_metas( @@ -187,13 +185,15 @@ impl SegmentUpdater { self.index.schema(), self.index.docstamp(), self.index.directory_mut()).expect("Could not save metas."); + for segment_id in segment_ids { self.index.delete_segment(segment_id); } + + self.index.load_searchers().unwrap(); } - - fn start_merges(&mut self,) { + fn start_merges(&mut self) { let merge_candidates = self.consider_merge_options(); @@ -235,7 +235,7 @@ impl SegmentUpdater { let segment_entry = SegmentEntry::new(segment_meta, delete_queue.cursor()); - let segment_update = SegmentUpdate::EndMerge(merging_thread_id, segment_ids.clone(), segment_entry.clone()); + let segment_update = SegmentUpdate::EndMerge(Some(merging_thread_id), segment_ids.clone(), segment_entry.clone()); segment_update_sender_clone.send(segment_update.clone()); (segment_ids, segment_entry) }) @@ -262,7 +262,7 @@ impl SegmentUpdater { &*self.segment_manager } - pub fn start(self,) -> JoinHandle<()> { + pub fn start(self) -> JoinHandle<()> { thread::Builder::new() .name("segment_update".to_string()) .spawn(move || { @@ -358,11 +358,13 @@ impl SegmentUpdater { self.index.delete_segment(segment_entry.segment_id()); } } - SegmentUpdate::EndMerge(merging_thread_id, segment_ids, segment_entry) => { + SegmentUpdate::EndMerge(merging_thread_id_opt, segment_ids, segment_entry) => { self.end_merge( segment_ids, segment_entry); - self.merging_threads.remove(&merging_thread_id); + if let Some(merging_thread_id) = merging_thread_id_opt { + self.merging_threads.remove(&merging_thread_id); + } } SegmentUpdate::CancelGeneration => { // Called during rollback. 
The segment diff --git a/src/lib.rs b/src/lib.rs index 3315e776d..6cb2e4b09 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -48,6 +48,7 @@ extern crate itertools; extern crate chan; extern crate crossbeam; extern crate bit_set; +extern crate notify; #[cfg(feature="simdcompression")] extern crate libc; diff --git a/src/query/boolean_query/mod.rs b/src/query/boolean_query/mod.rs index 0c07180d9..c04349687 100644 --- a/src/query/boolean_query/mod.rs +++ b/src/query/boolean_query/mod.rs @@ -62,12 +62,14 @@ mod tests { } assert!(index_writer.commit().is_ok()); } + let make_term_query = |text: &str| { let term_query = TermQuery::new(Term::from_field_text(text_field, text), SegmentPostingsOption::NoFreq); let query: Box = box term_query; query }; + index.load_searchers().unwrap(); let matching_docs = |boolean_query: &Query| { let searcher = index.searcher(); From 926e71a573e564023c19912599689577fcb63785 Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Sun, 22 Jan 2017 17:18:29 +0900 Subject: [PATCH 011/107] issue/43 unit test running. segment updater uses futures. 
--- Cargo.toml | 1 + src/core/index.rs | 6 +-- src/directory/directory.rs | 2 + src/directory/mmap_directory.rs | 10 +++- src/directory/ram_directory.rs | 6 +++ src/indexer/index_writer.rs | 49 ++++++++---------- src/indexer/segment_updater.rs | 89 ++++++++++++++++++++++++--------- src/lib.rs | 1 + 8 files changed, 109 insertions(+), 55 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 2c2ec995c..406b0bda8 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -37,6 +37,7 @@ uuid = { version = "0.4", features = ["v4", "rustc-serialize"] } chan = "0.1" version = "2" crossbeam = "0.2" +eventual = "0.1.7" [dev-dependencies] rand = "0.3" diff --git a/src/core/index.rs b/src/core/index.rs index 5b8c73723..417bbebd5 100644 --- a/src/core/index.rs +++ b/src/core/index.rs @@ -27,9 +27,9 @@ const NUM_SEARCHERS: usize = 12; fn load_metas(directory: &Directory) -> Result { - let meta_file = try!(directory.open_read(&META_FILEPATH)); - let meta_content = String::from_utf8_lossy(meta_file.as_slice()); - json::decode(&meta_content) + let meta_data = directory.atomic_read(&META_FILEPATH)?; + let meta_string = String::from_utf8_lossy(&meta_data); + json::decode(&meta_string) .map_err(|e| Error::CorruptedFile(META_FILEPATH.clone(), Box::new(e))) } diff --git a/src/directory/directory.rs b/src/directory/directory.rs index 40da74654..320cb9f51 100644 --- a/src/directory/directory.rs +++ b/src/directory/directory.rs @@ -28,6 +28,8 @@ pub trait Directory: fmt::Debug + Send + Sync + 'static { /// have no effect on the returned `ReadOnlySource` object. 
fn open_read(&self, path: &Path) -> result::Result; + fn atomic_read(&self, path: &Path) -> Result, FileError>; + /// Removes a file /// /// Removing a file will not affect an eventual diff --git a/src/directory/mmap_directory.rs b/src/directory/mmap_directory.rs index b905bd7f5..192501f07 100644 --- a/src/directory/mmap_directory.rs +++ b/src/directory/mmap_directory.rs @@ -4,6 +4,7 @@ use std::collections::HashMap; use std::collections::hash_map::Entry as HashMapEntry; use fst::raw::MmapReadOnly; use std::fs::File; +use std::io::Read; use std::fs::ReadDir; use atomicwrites; use std::sync::RwLock; @@ -128,7 +129,7 @@ impl Seek for SafeFileWriter { impl Directory for MmapDirectory { - + fn open_read(&self, path: &Path) -> result::Result { debug!("Open Read {:?}", path); let full_path = self.resolve_path(path); @@ -224,6 +225,13 @@ impl Directory for MmapDirectory { full_path.exists() } + fn atomic_read(&self, path: &Path) -> Result, FileError> { + let full_path = self.resolve_path(path); + let mut buffer = Vec::new(); + File::open(&full_path)?.read_to_end(&mut buffer)?; + Ok(buffer) + } + fn atomic_write(&mut self, path: &Path, data: &[u8]) -> io::Result<()> { debug!("Atomic Write {:?}", path); let full_path = self.resolve_path(path); diff --git a/src/directory/ram_directory.rs b/src/directory/ram_directory.rs index cb3c2757a..2a85a735d 100644 --- a/src/directory/ram_directory.rs +++ b/src/directory/ram_directory.rs @@ -199,6 +199,12 @@ impl Directory for RAMDirectory { self.fs.exists(path) } + fn atomic_read(&self, path: &Path) -> Result, FileError> { + let read = self.open_read(path)?; + Ok(read.as_slice() + .to_owned()) + } + fn atomic_write(&mut self, path: &Path, data: &[u8]) -> io::Result<()> { let path_buf = PathBuf::from(path); let mut vec_writer = VecWriter::new(path_buf.clone(), self.fs.clone()); diff --git a/src/indexer/index_writer.rs b/src/indexer/index_writer.rs index de2b77fb6..567cf5f4b 100644 --- a/src/indexer/index_writer.rs +++ 
b/src/indexer/index_writer.rs @@ -13,6 +13,7 @@ use indexer::SegmentWriter; use indexer::SegmentManager; use core::SegmentComponent; use super::directory_lock::DirectoryLock; +use eventual::Async; use std::clone::Clone; use std::io; use fastfield::delete; @@ -25,8 +26,7 @@ use std::sync::{Arc, Mutex}; use chan; use core::SegmentMeta; use super::delete_queue::{DeleteQueue, DeleteQueueCursor}; -use super::segment_updater::{SegmentUpdater, SegmentUpdate, SegmentUpdateSender}; -use std::time::Duration; +use super::segment_updater::{SegmentUpdate, SegmentUpdateManager}; use super::segment_manager::CommitState; use Result; use Error; @@ -72,8 +72,7 @@ pub struct IndexWriter { document_receiver: DocumentReceiver, document_sender: DocumentSender, - segment_update_sender: SegmentUpdateSender, - segment_update_thread: JoinHandle<()>, + segment_update_manager: SegmentUpdateManager, worker_id: usize, @@ -94,7 +93,7 @@ fn index_documents(heap: &mut Heap, mut segment: Segment, schema: &Schema, document_iterator: &mut Iterator, - segment_update_sender: &mut SegmentUpdateSender, + segment_update_manager: &mut SegmentUpdateManager, delete_cursor: &mut DeleteQueueCursor) -> Result<()> { heap.clear(); @@ -135,7 +134,7 @@ fn index_documents(heap: &mut Heap, let segment_entry = SegmentEntry::new(segment_meta, delete_cursor.clone()); try!(segment_writer.finalize()); - segment_update_sender.send(SegmentUpdate::AddSegment(segment_entry)); + let future = segment_update_manager.send(SegmentUpdate::AddSegment(segment_entry)); Ok(()) } @@ -145,10 +144,10 @@ impl IndexWriter { /// The index writer pub fn wait_merging_threads(mut self) -> Result<()> { - self.segment_update_sender.send(SegmentUpdate::Terminate); + let future = self.segment_update_manager.send(SegmentUpdate::Terminate); // this will stop the indexing thread, - // dropping the last reference to the segment_update_sender. + // dropping the last reference to the segment_update_manager. 
drop(self.document_sender); let mut v = Vec::new(); @@ -161,12 +160,9 @@ impl IndexWriter { })); } drop(self.workers_join_handle); - self.segment_update_thread - .join() - .map_err(|err| { - error!("Error in the merging thread {:?}", err); - Error::ErrorInThread(format!("{:?}", err)) - }) + + future.await().unwrap(); // TODO do something with the result. + Ok(()) } /// Spawns a new worker thread for indexing. @@ -176,7 +172,7 @@ impl IndexWriter { let index = self.index.clone(); let schema = self.index.schema(); let document_receiver_clone = self.document_receiver.clone(); - let mut segment_update_sender = self.segment_update_sender.clone(); + let mut segment_update_manager = self.segment_update_manager.clone(); let mut heap = Heap::with_capacity(self.heap_size_in_bytes_per_thread); // TODO fix this. the cursor might be too advanced @@ -205,7 +201,7 @@ impl IndexWriter { segment, &schema, &mut document_iterator, - &mut segment_update_sender, + &mut segment_update_manager, &mut delete_cursor_clone)); } else { // No more documents. 
@@ -258,7 +254,7 @@ impl IndexWriter { let segment_manager = Arc::new(SegmentManager::from_segments(committed_segments, delete_queue.cursor())); - let (segment_update_sender, segment_update_thread) = SegmentUpdater::start_updater(index.clone(), segment_manager.clone(), merge_policy.clone()); + let segment_update_manager = SegmentUpdateManager::new(index.clone(), segment_manager.clone(), merge_policy.clone()); let mut index_writer = IndexWriter { @@ -273,8 +269,7 @@ impl IndexWriter { document_receiver: document_receiver, document_sender: document_sender, - segment_update_sender: segment_update_sender, - segment_update_thread: segment_update_thread, + segment_update_manager: segment_update_manager, workers_join_handle: Vec::new(), num_threads: num_threads, @@ -367,8 +362,8 @@ impl IndexWriter { merged_segment_ids, segment_entry ); - - self.segment_update_sender.send(segment_update); + + self.segment_update_manager.send(segment_update); // self.segment_updater.(segment_ids, segment_entry); //segment_manager.end_merge(&merged_segment_ids, segment_entry); @@ -402,7 +397,7 @@ impl IndexWriter { /// The docstamp at the last commit is returned. pub fn rollback(&mut self) -> Result { - self.segment_update_sender.send(SegmentUpdate::CancelGeneration); + self.segment_update_manager.send(SegmentUpdate::CancelGeneration); // we cannot drop segment ready receiver yet // as it would block the workers. @@ -435,7 +430,7 @@ impl IndexWriter { // // We can now open a new generation and reaccept segments // from now on. 
- self.segment_update_sender.send(SegmentUpdate::NewGeneration); + self.segment_update_manager.send(SegmentUpdate::NewGeneration); let rollbacked_segments = self.segment_manager.rollback(); for segment_id in rollbacked_segments { @@ -477,7 +472,7 @@ impl IndexWriter { let mut former_workers_join_handle = Vec::new(); swap(&mut former_workers_join_handle, &mut self.workers_join_handle); - + for worker_handle in former_workers_join_handle { let indexing_worker_result = try!(worker_handle.join() .map_err(|e| Error::ErrorInThread(format!("{:?}", e)))); @@ -495,11 +490,11 @@ impl IndexWriter { // This will move uncommitted segments to the state of // committed segments. - self.segment_update_sender.send(SegmentUpdate::Commit(self.committed_docstamp)); + let future = self.segment_update_manager.send(SegmentUpdate::Commit(self.committed_docstamp)); // wait for the segment update thread to have processed the info - // TODO no guarantee that it has been written on disk. - + // TODO remove unwrap + future.await().unwrap(); Ok(self.committed_docstamp) } diff --git a/src/indexer/segment_updater.rs b/src/indexer/segment_updater.rs index 02a28a731..a5b5e4ec3 100644 --- a/src/indexer/segment_updater.rs +++ b/src/indexer/segment_updater.rs @@ -25,10 +25,11 @@ use Result; use core::IndexMeta; use core::META_FILEPATH; use std::io::Write; +use eventual::*; use super::segment_manager::{SegmentManager, get_segment_ready_for_commit}; -pub type SegmentUpdateSender = chan::Sender; -pub type SegmentUpdateReceiver = chan::Receiver; +type SegmentUpdateSender = chan::Sender<(Complete<(), &'static str>, SegmentUpdate)>; +type SegmentUpdateReceiver = chan::Receiver<(Complete<(), &'static str>, SegmentUpdate)>; fn create_metas(segment_manager: &SegmentManager, schema: Schema, docstamp: u64) -> IndexMeta { @@ -78,8 +79,10 @@ pub fn save_metas(segment_manager: &SegmentManager, let metas = create_metas(segment_manager, schema, docstamp); let mut w = Vec::new(); try!(write!(&mut w, "{}\n", 
json::as_pretty_json(&metas))); - directory.atomic_write(&META_FILEPATH, &w[..]) + directory + .atomic_write(&META_FILEPATH, &w[..]) .map_err(From::from) + } #[derive(Clone, Debug)] @@ -119,6 +122,41 @@ pub enum SegmentUpdate { +// TODO Rename +#[derive(Clone)] +pub struct SegmentUpdateManager { + channel: SegmentUpdateSender, +} + + +impl SegmentUpdateManager { + + pub fn new( + index: Index, + segment_manager: Arc, + merge_policy: Arc>>) -> SegmentUpdateManager { + let (segment_update_sender, segment_update_receiver): (SegmentUpdateSender, SegmentUpdateReceiver) = chan::async(); + let segment_update_manager = SegmentUpdateManager { + channel: segment_update_sender, + }; + let segment_updater = SegmentUpdater::new( + index, + segment_manager, + merge_policy, + segment_update_manager.clone(), + segment_update_receiver); + segment_updater.start(); + segment_update_manager + } + + pub fn send(&self, segment_update: SegmentUpdate) -> Future<(), &'static str> { + let (fullfiller, future) = Future::<(), &'static str>::pair(); + self.channel.send((fullfiller, segment_update)); + future + } + +} + /// The segment updater is in charge of processing all of the /// `SegmentUpdate`s. 
@@ -134,32 +172,24 @@ pub struct SegmentUpdater { index: Index, is_cancelled_generation: bool, segment_update_receiver: SegmentUpdateReceiver, - segment_update_sender: SegmentUpdateSender, + segment_update_manager: SegmentUpdateManager, segment_manager: Arc, merge_policy: Arc>>, merging_thread_id: usize, merging_threads: HashMap, SegmentEntry)> >, } - impl SegmentUpdater { - - pub fn start_updater( - index: Index, - segment_manager: Arc, - merge_policy: Arc>>) -> (SegmentUpdateSender, JoinHandle<()>) { - let segment_updater = SegmentUpdater::new(index, segment_manager, merge_policy); - (segment_updater.segment_update_sender.clone(), segment_updater.start()) - } - + fn new(index: Index, segment_manager: Arc, - merge_policy: Arc>>) -> SegmentUpdater { - let (segment_update_sender, segment_update_receiver): (SegmentUpdateSender, SegmentUpdateReceiver) = chan::async(); - SegmentUpdater { + merge_policy: Arc>>, + segment_update_manager: SegmentUpdateManager, + segment_update_receiver: SegmentUpdateReceiver) -> SegmentUpdater { + SegmentUpdater { index: index, is_cancelled_generation: false, - segment_update_sender: segment_update_sender, + segment_update_manager: segment_update_manager, segment_update_receiver: segment_update_receiver, segment_manager: segment_manager, merge_policy: merge_policy, @@ -204,7 +234,7 @@ impl SegmentUpdater { self.segment_manager().start_merge(&segment_ids); let index_clone = self.index.clone(); - let segment_update_sender_clone = self.segment_update_sender.clone(); + let segment_update_manager_clone = self.segment_update_manager.clone(); let merge_thread_handle = thread::Builder::new() .name(format!("merge_thread_{:?}", merging_thread_id)) @@ -236,7 +266,7 @@ impl SegmentUpdater { let segment_entry = SegmentEntry::new(segment_meta, delete_queue.cursor()); let segment_update = SegmentUpdate::EndMerge(Some(merging_thread_id), segment_ids.clone(), segment_entry.clone()); - segment_update_sender_clone.send(segment_update.clone()); + 
segment_update_manager_clone.send(segment_update.clone()); (segment_ids, segment_entry) }) .expect("Failed to spawn merge thread"); @@ -271,16 +301,20 @@ impl SegmentUpdater { .expect("Failed to start segment updater thread.") } - fn process(mut self,) { + fn process(mut self) { let segment_manager = self.segment_manager.clone(); - - for segment_update in self.segment_update_receiver.clone() { + + let mut complete_option = None; + + for (complete, segment_update) in self.segment_update_receiver.clone() { if let SegmentUpdate::Terminate = segment_update { + complete_option = Some(complete); break; } - + + // we check the generation number as if it was // dirty-bit. If the value is different // to our generation, then the segment_manager has @@ -317,6 +351,9 @@ impl SegmentUpdater { // - start merges if required self.start_merges(); } + + complete.complete(()); + } let mut merging_threads = HashMap::new(); @@ -332,6 +369,10 @@ impl SegmentUpdater { } } } + + if let Some(complete) = complete_option { + complete.complete(()); + } } diff --git a/src/lib.rs b/src/lib.rs index 6cb2e4b09..9ba528862 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -49,6 +49,7 @@ extern crate chan; extern crate crossbeam; extern crate bit_set; extern crate notify; +extern crate eventual; #[cfg(feature="simdcompression")] extern crate libc; From 6530d43d6a3971f0dd59242f3f8492444bf2f096 Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Mon, 23 Jan 2017 09:37:31 +0900 Subject: [PATCH 012/107] issue/43 Small fixes. 
--- src/core/index.rs | 7 +++--- src/core/searcher.rs | 20 +++++++-------- src/core/segment_reader.rs | 1 + src/indexer/index_writer.rs | 2 +- src/indexer/segment_manager.rs | 12 +-------- src/indexer/segment_updater.rs | 46 ++++++++++++++++------------------ src/indexer/segment_writer.rs | 6 ----- 7 files changed, 39 insertions(+), 55 deletions(-) diff --git a/src/core/index.rs b/src/core/index.rs index 417bbebd5..05296188a 100644 --- a/src/core/index.rs +++ b/src/core/index.rs @@ -176,11 +176,12 @@ impl Index { &mut *self.directory } + /// Reads the meta.json and returns the list of + /// committed segments. pub fn committed_segments(&self) -> Result> { - Ok(load_metas(self.directory())? - .committed_segments) + Ok(load_metas(self.directory())?.committed_segments) } - + /// Returns the list of segment ids that are searchable. pub fn searchable_segment_ids(&self) -> Result> { self.committed_segments() diff --git a/src/core/searcher.rs b/src/core/searcher.rs index 0ea6cf840..839e00172 100644 --- a/src/core/searcher.rs +++ b/src/core/searcher.rs @@ -20,16 +20,6 @@ pub struct Searcher { segment_readers: Vec, } -impl fmt::Debug for Searcher { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - let segment_ids = self.segment_readers - .iter() - .map(|segment_reader| segment_reader.segment_id()) - .collect::>(); - write!(f, "Searcher({:?})", segment_ids); - Ok(()) - } -} impl Searcher { @@ -94,4 +84,14 @@ impl From> for Searcher { segment_readers: segment_readers, } } +} + +impl fmt::Debug for Searcher { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + let segment_ids = self.segment_readers + .iter() + .map(|segment_reader| segment_reader.segment_id()) + .collect::>(); + write!(f, "Searcher({:?})", segment_ids) + } } \ No newline at end of file diff --git a/src/core/segment_reader.rs b/src/core/segment_reader.rs index 15b873a79..d8518f87f 100644 --- a/src/core/segment_reader.rs +++ b/src/core/segment_reader.rs @@ -238,6 +238,7 @@ impl SegmentReader { 
self.term_infos.get(term.as_slice()) } + /// Returns the segment id pub fn segment_id(&self) -> SegmentId { self.segment_id } diff --git a/src/indexer/index_writer.rs b/src/indexer/index_writer.rs index 567cf5f4b..c9c424255 100644 --- a/src/indexer/index_writer.rs +++ b/src/indexer/index_writer.rs @@ -134,7 +134,7 @@ fn index_documents(heap: &mut Heap, let segment_entry = SegmentEntry::new(segment_meta, delete_cursor.clone()); try!(segment_writer.finalize()); - let future = segment_update_manager.send(SegmentUpdate::AddSegment(segment_entry)); + segment_update_manager.send(SegmentUpdate::AddSegment(segment_entry)); Ok(()) } diff --git a/src/indexer/segment_manager.rs b/src/indexer/segment_manager.rs index 3b9ad78af..6fcdb253d 100644 --- a/src/indexer/segment_manager.rs +++ b/src/indexer/segment_manager.rs @@ -82,11 +82,6 @@ impl SegmentManager { } } - pub fn docstamp(&self,) -> u64 { - self.read().docstamp - } - - pub fn from_segments(segment_metas: Vec, delete_cursor: DeleteQueueCursor) -> SegmentManager { SegmentManager { registers: RwLock::new( SegmentRegisters { @@ -169,12 +164,7 @@ impl SegmentManager { warn!("couldn't find segment in SegmentManager"); } } - - pub fn committed_segments(&self,) -> Vec { - let registers_lock = self.read(); - registers_lock.committed.segment_ids() - } - + pub fn segment_metas(&self,) -> (Vec, Vec) { let registers_lock = self.read(); (registers_lock.committed.segment_metas(), registers_lock.uncommitted.segment_metas()) diff --git a/src/indexer/segment_updater.rs b/src/indexer/segment_updater.rs index a5b5e4ec3..060044f95 100644 --- a/src/indexer/segment_updater.rs +++ b/src/indexer/segment_updater.rs @@ -79,10 +79,9 @@ pub fn save_metas(segment_manager: &SegmentManager, let metas = create_metas(segment_manager, schema, docstamp); let mut w = Vec::new(); try!(write!(&mut w, "{}\n", json::as_pretty_json(&metas))); - directory - .atomic_write(&META_FILEPATH, &w[..]) - .map_err(From::from) - + Ok(directory + 
.atomic_write(&META_FILEPATH, &w[..])?) + } #[derive(Clone, Debug)] @@ -139,17 +138,16 @@ impl SegmentUpdateManager { let segment_update_manager = SegmentUpdateManager { channel: segment_update_sender, }; - let segment_updater = SegmentUpdater::new( + SegmentUpdateRunner::new( index, segment_manager, merge_policy, segment_update_manager.clone(), - segment_update_receiver); - segment_updater.start(); + segment_update_receiver).start(); segment_update_manager } - pub fn send(&self, segment_update: SegmentUpdate) -> Future<(), &'static str> { + pub fn send(&self, segment_update: SegmentUpdate) -> impl Async { let (fullfiller, future) = Future::<(), &'static str>::pair(); self.channel.send((fullfiller, segment_update)); future @@ -158,8 +156,8 @@ impl SegmentUpdateManager { } -/// The segment updater is in charge of processing all of the -/// `SegmentUpdate`s. +/// The segment update runner is in charge of processing all +/// of the `SegmentUpdate`s. /// /// All this processing happens on a single thread /// consuming a common queue. 
@@ -168,7 +166,7 @@ impl SegmentUpdateManager { /// - indexing threads are sending new segments /// - merging threads are sending merge operations /// - the index writer sends "terminate" -pub struct SegmentUpdater { +pub struct SegmentUpdateRunner { index: Index, is_cancelled_generation: bool, segment_update_receiver: SegmentUpdateReceiver, @@ -179,14 +177,14 @@ pub struct SegmentUpdater { merging_threads: HashMap, SegmentEntry)> >, } -impl SegmentUpdater { +impl SegmentUpdateRunner { fn new(index: Index, segment_manager: Arc, merge_policy: Arc>>, segment_update_manager: SegmentUpdateManager, - segment_update_receiver: SegmentUpdateReceiver) -> SegmentUpdater { - SegmentUpdater { + segment_update_receiver: SegmentUpdateReceiver) -> SegmentUpdateRunner { + SegmentUpdateRunner { index: index, is_cancelled_generation: false, segment_update_manager: segment_update_manager, @@ -259,7 +257,6 @@ impl SegmentUpdater { num_deleted_docs: 0u32, }; - // TODO fix delete cursor let delete_queue = DeleteQueue::default(); @@ -322,7 +319,7 @@ impl SegmentUpdater { let generation_before_update = segment_manager.generation(); self.process_one(segment_update); - + if generation_before_update != segment_manager.generation() { // The segment manager has changed, we need to // - save meta.json @@ -382,11 +379,12 @@ impl SegmentUpdater { pub fn process_one( &mut self, segment_update: SegmentUpdate) { - - info!("Segment update: {:?}", segment_update); + info!("Segment update: {:?}", segment_update); + + use self::SegmentUpdate::*; match segment_update { - SegmentUpdate::AddSegment(segment_entry) => { + AddSegment(segment_entry) => { if !self.is_cancelled_generation { self.segment_manager().add_segment(segment_entry); } @@ -399,7 +397,7 @@ impl SegmentUpdater { self.index.delete_segment(segment_entry.segment_id()); } } - SegmentUpdate::EndMerge(merging_thread_id_opt, segment_ids, segment_entry) => { + EndMerge(merging_thread_id_opt, segment_ids, segment_entry) => { self.end_merge( 
segment_ids, segment_entry); @@ -407,21 +405,21 @@ impl SegmentUpdater { self.merging_threads.remove(&merging_thread_id); } } - SegmentUpdate::CancelGeneration => { + CancelGeneration => { // Called during rollback. The segment // that will arrive will be ignored // until a NewGeneration is update arrives. self.is_cancelled_generation = true; } - SegmentUpdate::NewGeneration => { + NewGeneration => { // After rollback, we can resume // indexing new documents. self.is_cancelled_generation = false; } - SegmentUpdate::Commit(docstamp) => { + Commit(docstamp) => { self.segment_manager().commit(docstamp); } - SegmentUpdate::Terminate => { + Terminate => { panic!("We should have left the loop before processing it."); } } diff --git a/src/indexer/segment_writer.rs b/src/indexer/segment_writer.rs index 074f6e7d1..204eb9a37 100644 --- a/src/indexer/segment_writer.rs +++ b/src/indexer/segment_writer.rs @@ -174,12 +174,6 @@ impl<'a> SegmentWriter<'a> { // .collect::>() // } - fn first_opstamp(&self) -> u64 { - *(self.doc_opstamps - .first() - .expect("Last doc opstamp called on an empty segment writer")) - } - pub fn last_opstamp(&self) -> u64 { *(self.doc_opstamps .last() From 20eb586660be79163034f79b8828635b1fc4fd44 Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Mon, 23 Jan 2017 12:59:32 +0900 Subject: [PATCH 013/107] issue/43 Rename SegmentUpdater --- src/indexer/index_writer.rs | 8 ++++---- src/indexer/segment_updater.rs | 12 ++++++------ 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/src/indexer/index_writer.rs b/src/indexer/index_writer.rs index c9c424255..52907f778 100644 --- a/src/indexer/index_writer.rs +++ b/src/indexer/index_writer.rs @@ -26,7 +26,7 @@ use std::sync::{Arc, Mutex}; use chan; use core::SegmentMeta; use super::delete_queue::{DeleteQueue, DeleteQueueCursor}; -use super::segment_updater::{SegmentUpdate, SegmentUpdateManager}; +use super::segment_updater::{SegmentUpdate, SegmentUpdater}; use super::segment_manager::CommitState; use 
Result; use Error; @@ -72,7 +72,7 @@ pub struct IndexWriter { document_receiver: DocumentReceiver, document_sender: DocumentSender, - segment_update_manager: SegmentUpdateManager, + segment_update_manager: SegmentUpdater, worker_id: usize, @@ -93,7 +93,7 @@ fn index_documents(heap: &mut Heap, mut segment: Segment, schema: &Schema, document_iterator: &mut Iterator, - segment_update_manager: &mut SegmentUpdateManager, + segment_update_manager: &mut SegmentUpdater, delete_cursor: &mut DeleteQueueCursor) -> Result<()> { heap.clear(); @@ -254,7 +254,7 @@ impl IndexWriter { let segment_manager = Arc::new(SegmentManager::from_segments(committed_segments, delete_queue.cursor())); - let segment_update_manager = SegmentUpdateManager::new(index.clone(), segment_manager.clone(), merge_policy.clone()); + let segment_update_manager = SegmentUpdater::new(index.clone(), segment_manager.clone(), merge_policy.clone()); let mut index_writer = IndexWriter { diff --git a/src/indexer/segment_updater.rs b/src/indexer/segment_updater.rs index 060044f95..89a318a09 100644 --- a/src/indexer/segment_updater.rs +++ b/src/indexer/segment_updater.rs @@ -123,19 +123,19 @@ pub enum SegmentUpdate { // TODO Rename #[derive(Clone)] -pub struct SegmentUpdateManager { +pub struct SegmentUpdater { channel: SegmentUpdateSender, } -impl SegmentUpdateManager { +impl SegmentUpdater { pub fn new( index: Index, segment_manager: Arc, - merge_policy: Arc>>) -> SegmentUpdateManager { + merge_policy: Arc>>) -> SegmentUpdater { let (segment_update_sender, segment_update_receiver): (SegmentUpdateSender, SegmentUpdateReceiver) = chan::async(); - let segment_update_manager = SegmentUpdateManager { + let segment_update_manager = SegmentUpdater { channel: segment_update_sender, }; SegmentUpdateRunner::new( @@ -170,7 +170,7 @@ pub struct SegmentUpdateRunner { index: Index, is_cancelled_generation: bool, segment_update_receiver: SegmentUpdateReceiver, - segment_update_manager: SegmentUpdateManager, + 
segment_update_manager: SegmentUpdater, segment_manager: Arc, merge_policy: Arc>>, merging_thread_id: usize, @@ -182,7 +182,7 @@ impl SegmentUpdateRunner { fn new(index: Index, segment_manager: Arc, merge_policy: Arc>>, - segment_update_manager: SegmentUpdateManager, + segment_update_manager: SegmentUpdater, segment_update_receiver: SegmentUpdateReceiver) -> SegmentUpdateRunner { SegmentUpdateRunner { index: index, From 0ec492dcf26fc1db8e9fe4d1d3276e73eeb3a9a4 Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Wed, 25 Jan 2017 08:33:21 +0900 Subject: [PATCH 014/107] issue/43 refactoring in order to remove the segment updater non sense for simpler futures --- Cargo.toml | 3 + src/indexer/index_writer.rs | 115 ++++----------- src/indexer/merger.rs | 8 +- src/indexer/segment_updater.rs | 246 +++++++++++++++++++-------------- src/lib.rs | 2 + 5 files changed, 181 insertions(+), 193 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 406b0bda8..998d5b48c 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -37,7 +37,10 @@ uuid = { version = "0.4", features = ["v4", "rustc-serialize"] } chan = "0.1" version = "2" crossbeam = "0.2" + eventual = "0.1.7" +futures = "0.1.9" +futures-cpupool = "0.1.2" [dev-dependencies] rand = "0.3" diff --git a/src/indexer/index_writer.rs b/src/indexer/index_writer.rs index 52907f778..4a4377e74 100644 --- a/src/indexer/index_writer.rs +++ b/src/indexer/index_writer.rs @@ -5,6 +5,7 @@ use indexer::SegmentSerializer; use core::SerializableSegment; use core::Index; use core::Segment; +use core::SegmentId; use schema::Term; use indexer::SegmentEntry; use std::thread::JoinHandle; @@ -63,7 +64,6 @@ pub struct IndexWriter { _merge_policy: Arc>>, index: Index, - segment_manager: Arc, heap_size_in_bytes_per_thread: usize, @@ -72,7 +72,7 @@ pub struct IndexWriter { document_receiver: DocumentReceiver, document_sender: DocumentSender, - segment_update_manager: SegmentUpdater, + segment_updater: SegmentUpdater, worker_id: usize, @@ -93,7 +93,7 @@ fn 
index_documents(heap: &mut Heap, mut segment: Segment, schema: &Schema, document_iterator: &mut Iterator, - segment_update_manager: &mut SegmentUpdater, + segment_updater: &mut SegmentUpdater, delete_cursor: &mut DeleteQueueCursor) -> Result<()> { heap.clear(); @@ -134,7 +134,7 @@ fn index_documents(heap: &mut Heap, let segment_entry = SegmentEntry::new(segment_meta, delete_cursor.clone()); try!(segment_writer.finalize()); - segment_update_manager.send(SegmentUpdate::AddSegment(segment_entry)); + segment_updater.add_segment(segment_entry); Ok(()) } @@ -144,10 +144,10 @@ impl IndexWriter { /// The index writer pub fn wait_merging_threads(mut self) -> Result<()> { - let future = self.segment_update_manager.send(SegmentUpdate::Terminate); + let future = self.segment_updater.terminate(); // this will stop the indexing thread, - // dropping the last reference to the segment_update_manager. + // dropping the last reference to the segment_updater. drop(self.document_sender); let mut v = Vec::new(); @@ -172,7 +172,7 @@ impl IndexWriter { let index = self.index.clone(); let schema = self.index.schema(); let document_receiver_clone = self.document_receiver.clone(); - let mut segment_update_manager = self.segment_update_manager.clone(); + let mut segment_updater = self.segment_updater.clone(); let mut heap = Heap::with_capacity(self.heap_size_in_bytes_per_thread); // TODO fix this. the cursor might be too advanced @@ -201,7 +201,7 @@ impl IndexWriter { segment, &schema, &mut document_iterator, - &mut segment_update_manager, + &mut segment_updater, &mut delete_cursor_clone)); } else { // No more documents. 
@@ -250,11 +250,8 @@ impl IndexWriter { let delete_queue = DeleteQueue::default(); - let committed_segments = index.committed_segments()?; - let segment_manager = Arc::new(SegmentManager::from_segments(committed_segments, delete_queue.cursor())); - - let segment_update_manager = SegmentUpdater::new(index.clone(), segment_manager.clone(), merge_policy.clone()); + let segment_updater = SegmentUpdater::create(index.clone(), delete_queue.cursor(), merge_policy.clone())?; let mut index_writer = IndexWriter { @@ -264,12 +261,11 @@ impl IndexWriter { heap_size_in_bytes_per_thread: heap_size_in_bytes_per_thread, index: index.clone(), - segment_manager: segment_manager, document_receiver: document_receiver, document_sender: document_sender, - segment_update_manager: segment_update_manager, + segment_updater: segment_updater, workers_join_handle: Vec::new(), num_threads: num_threads, @@ -303,72 +299,8 @@ impl IndexWriter { } /// Merges a given list of segments - pub fn merge(&mut self, segments: &[Segment]) -> Result<()> { - - if segments.len() < 2 { - // no segments or one segment? nothing to do. - return Ok(()); - } - - let ref segment_manager = self.segment_manager; - { - // let's check that all these segments are in the same - // committed/uncommited state. - let first_commit_state = segment_manager.is_committed(segments[0].id()); - - for segment in segments { - let commit_state = segment_manager.is_committed(segment.id()); - if commit_state == CommitState::Missing { - return Err(Error::InvalidArgument(format!("Segment {:?} is not in the index", - segments[0].id()))); - } - if commit_state != first_commit_state { - return Err(Error::InvalidArgument(String::from("You may not merge segments \ - that are heterogenously in \ - committed and uncommited."))); - } - } - } - - let schema = self.index.schema(); - - // An IndexMerger is like a "view" of our merged segments. 
- let merger = try!(IndexMerger::open(schema, segments)); - let mut merged_segment = self.index.new_segment(); - - // ... we just serialize this index merger in our new segment - // to merge the two segments. - let segment_serializer = try!(SegmentSerializer::for_segment(&mut merged_segment)); - let num_docs = try!(merger.write(segment_serializer)); - let merged_segment_ids = segments - .iter() - .map(|segment| segment.id()) - .collect::>(); - - let segment_meta = SegmentMeta { - segment_id: merged_segment.id(), - num_docs: num_docs, - num_deleted_docs: 0, - }; - - // TODO fix this!!! - let delete_queue = DeleteQueue::default(); - let delete_cursor = delete_queue.cursor(); - - let segment_entry = SegmentEntry::new(segment_meta, delete_cursor); - - let segment_update = SegmentUpdate::EndMerge( - None, - merged_segment_ids, - segment_entry - ); - - self.segment_update_manager.send(segment_update); - - // self.segment_updater.(segment_ids, segment_entry); - //segment_manager.end_merge(&merged_segment_ids, segment_entry); - - Ok(()) + pub fn merge(&mut self, segments: &[SegmentId]) -> impl Async { + self.segment_updater.start_merge(segments.to_vec()) } /// Closes the current document channel send. @@ -397,7 +329,7 @@ impl IndexWriter { /// The docstamp at the last commit is returned. pub fn rollback(&mut self) -> Result { - self.segment_update_manager.send(SegmentUpdate::CancelGeneration); + self.segment_updater.cancel_generation(); // we cannot drop segment ready receiver yet // as it would block the workers. @@ -430,15 +362,20 @@ impl IndexWriter { // // We can now open a new generation and reaccept segments // from now on. - self.segment_update_manager.send(SegmentUpdate::NewGeneration); + self.segment_updater.new_generation(); + + + // TODO Send rollback. 
+ //let rollbacked_segments = self.segment_manager.rollback(); - let rollbacked_segments = self.segment_manager.rollback(); - for segment_id in rollbacked_segments { + // for segment_id in rollbacked_segments { + // // TODO all delete must happen after saving + // // meta.json + // self.index.delete_segment(segment_id); + // } + + panic!("aaaa"); - // TODO all delete must happen after saving - // meta.json - self.index.delete_segment(segment_id); - } // reset the docstamp self.uncommitted_docstamp = self.committed_docstamp; @@ -490,7 +427,7 @@ impl IndexWriter { // This will move uncommitted segments to the state of // committed segments. - let future = self.segment_update_manager.send(SegmentUpdate::Commit(self.committed_docstamp)); + let future = self.segment_updater.commit(self.committed_docstamp); // wait for the segment update thread to have processed the info // TODO remove unwrap diff --git a/src/indexer/merger.rs b/src/indexer/merger.rs index f3d286434..a41f211f0 100644 --- a/src/indexer/merger.rs +++ b/src/indexer/merger.rs @@ -1,6 +1,7 @@ use Result; use core::SegmentReader; use core::Segment; +use core::SegmentId; use DocId; use core::SerializableSegment; use indexer::SegmentSerializer; @@ -205,6 +206,7 @@ mod tests { use collector::tests::TestCollector; use query::BooleanQuery; use schema::TextIndexingOptions; + use eventual::Async; #[test] fn test_index_merger() { @@ -260,9 +262,11 @@ mod tests { } } { - let segments = index.searchable_segments().expect("Searchable segments failed."); + let segment_ids = index.searchable_segment_ids().expect("Searchable segments failed."); let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap(); - index_writer.merge(&segments).expect("Merging failed"); + index_writer.merge(&segment_ids) + .await() + .expect("Merging failed"); index_writer.wait_merging_threads().unwrap(); } { diff --git a/src/indexer/segment_updater.rs b/src/indexer/segment_updater.rs index 89a318a09..53ab821c6 100644 --- 
a/src/indexer/segment_updater.rs +++ b/src/indexer/segment_updater.rs @@ -20,7 +20,7 @@ use std::thread::JoinHandle; use std::sync::Arc; use std::collections::HashMap; use rustc_serialize::json; -use indexer::delete_queue::DeleteQueue; +use indexer::delete_queue::{DeleteQueueCursor, DeleteQueue}; use Result; use core::IndexMeta; use core::META_FILEPATH; @@ -91,6 +91,9 @@ pub enum SegmentUpdate { /// Created by the indexing worker thread AddSegment(SegmentEntry), + + StartMerge(Vec), + /// A merge is ended. /// Remove the merged segment and record the new /// large merged segment. @@ -119,8 +122,6 @@ pub enum SegmentUpdate { } - - // TODO Rename #[derive(Clone)] pub struct SegmentUpdater { @@ -130,21 +131,56 @@ pub struct SegmentUpdater { impl SegmentUpdater { - pub fn new( + pub fn create( index: Index, - segment_manager: Arc, - merge_policy: Arc>>) -> SegmentUpdater { + delete_cursor: DeleteQueueCursor, + merge_policy: Arc>>) -> Result { let (segment_update_sender, segment_update_receiver): (SegmentUpdateSender, SegmentUpdateReceiver) = chan::async(); - let segment_update_manager = SegmentUpdater { + let segment_updater = SegmentUpdater { channel: segment_update_sender, }; + let committed_segments = index.committed_segments()?; + let segment_manager = SegmentManager::from_segments(committed_segments, delete_cursor); SegmentUpdateRunner::new( index, segment_manager, merge_policy, - segment_update_manager.clone(), + segment_updater.clone(), segment_update_receiver).start(); - segment_update_manager + Ok(segment_updater) + } + + pub fn add_segment(&self, segment_entry: SegmentEntry) -> impl Async { + self.send(SegmentUpdate::AddSegment(segment_entry)) + } + + pub fn commit(&self, committed_docstamp: u64) -> impl Async { + self.send(SegmentUpdate::Commit(committed_docstamp)) + } + + pub fn start_merge(&self, segment_ids: Vec) -> impl Async { + self.send(SegmentUpdate::StartMerge(segment_ids)) + } + + pub fn new_generation(&self) -> impl Async { + 
self.send(SegmentUpdate::NewGeneration) + } + + pub fn cancel_generation(&self) -> impl Async { + self.send(SegmentUpdate::CancelGeneration) + } + + + pub fn end_merge(&self, + merge_thread_id: Option, + merged_segment_ids: Vec, + resulting_segment_entry: SegmentEntry) -> impl Async { + let segment_update = SegmentUpdate::EndMerge(merge_thread_id, merged_segment_ids, resulting_segment_entry); + self.send(segment_update) + } + + pub fn terminate(&self) -> impl Async { + self.send(SegmentUpdate::Terminate) } pub fn send(&self, segment_update: SegmentUpdate) -> impl Async { @@ -170,8 +206,8 @@ pub struct SegmentUpdateRunner { index: Index, is_cancelled_generation: bool, segment_update_receiver: SegmentUpdateReceiver, - segment_update_manager: SegmentUpdater, - segment_manager: Arc, + segment_updater: SegmentUpdater, + segment_manager: SegmentManager, merge_policy: Arc>>, merging_thread_id: usize, merging_threads: HashMap, SegmentEntry)> >, @@ -180,14 +216,14 @@ pub struct SegmentUpdateRunner { impl SegmentUpdateRunner { fn new(index: Index, - segment_manager: Arc, + segment_manager: SegmentManager, merge_policy: Arc>>, - segment_update_manager: SegmentUpdater, + segment_updater: SegmentUpdater, segment_update_receiver: SegmentUpdateReceiver) -> SegmentUpdateRunner { SegmentUpdateRunner { index: index, is_cancelled_generation: false, - segment_update_manager: segment_update_manager, + segment_updater: segment_updater, segment_update_receiver: segment_update_receiver, segment_manager: segment_manager, merge_policy: merge_policy, @@ -206,10 +242,10 @@ impl SegmentUpdateRunner { &mut self, segment_ids: Vec, segment_entry: SegmentEntry) { - let segment_manager = self.segment_manager.clone(); - segment_manager.end_merge(&segment_ids, segment_entry); + + self.segment_manager.end_merge(&segment_ids, segment_entry); save_metas( - &*segment_manager, + &self.segment_manager, self.index.schema(), self.index.docstamp(), self.index.directory_mut()).expect("Could not save metas."); 
@@ -221,60 +257,64 @@ impl SegmentUpdateRunner { self.index.load_searchers().unwrap(); } + + fn start_merge(&mut self, segment_ids: Vec, complete_opt: Option>) { + + let merging_thread_id = self.new_merging_thread_id(); + self.segment_manager.start_merge(&segment_ids); + + let index_clone = self.index.clone(); + let segment_updater_clone = self.segment_updater.clone(); + + let merge_thread_handle = thread::Builder::new() + .name(format!("merge_thread_{:?}", merging_thread_id)) + .spawn(move || { + info!("Start merge: {:?}", segment_ids); + let schema = index_clone.schema(); + let segments: Vec = segment_ids + .iter() + .map(|&segment_id| index_clone.segment(segment_id)) + .collect(); + // An IndexMerger is like a "view" of our merged segments. + // TODO unwrap + let merger: IndexMerger = IndexMerger::open(schema, &segments[..]).expect("Creating index merger failed"); + let mut merged_segment = index_clone.new_segment(); + // ... we just serialize this index merger in our new segment + // to merge the two segments. 
+ let segment_serializer = SegmentSerializer::for_segment(&mut merged_segment).expect("Creating index serializer failed"); + let num_docs = merger.write(segment_serializer).expect("Serializing merged index failed"); + let segment_meta = SegmentMeta { + segment_id: merged_segment.id(), + num_docs: num_docs, + num_deleted_docs: 0u32, + }; + + // TODO fix delete cursor + let delete_queue = DeleteQueue::default(); + + let segment_entry = SegmentEntry::new(segment_meta, delete_queue.cursor()); + + let segment_update = SegmentUpdate::EndMerge(Some(merging_thread_id), segment_ids.clone(), segment_entry.clone()); + segment_updater_clone.send(segment_update.clone()); + if let Some(complete) = complete_opt { + complete.complete(()); + } + (segment_ids, segment_entry) + }) + .expect("Failed to spawn merge thread"); + + self.merging_threads.insert(merging_thread_id, merge_thread_handle); + } + fn start_merges(&mut self) { - let merge_candidates = self.consider_merge_options(); - for MergeCandidate(segment_ids) in merge_candidates { - - let merging_thread_id = self.new_merging_thread_id(); - - self.segment_manager().start_merge(&segment_ids); - - let index_clone = self.index.clone(); - let segment_update_manager_clone = self.segment_update_manager.clone(); - - let merge_thread_handle = thread::Builder::new() - .name(format!("merge_thread_{:?}", merging_thread_id)) - .spawn(move || { - info!("Start merge: {:?}", segment_ids); - let schema = index_clone.schema(); - let segments: Vec = segment_ids - .iter() - .map(|&segment_id| index_clone.segment(segment_id)) - .collect(); - // An IndexMerger is like a "view" of our merged segments. - // TODO unwrap - let merger: IndexMerger = IndexMerger::open(schema, &segments[..]).expect("Creating index merger failed"); - let mut merged_segment = index_clone.new_segment(); - // ... we just serialize this index merger in our new segment - // to merge the two segments. 
- let segment_serializer = SegmentSerializer::for_segment(&mut merged_segment).expect("Creating index serializer failed"); - let num_docs = merger.write(segment_serializer).expect("Serializing merged index failed"); - let segment_meta = SegmentMeta { - segment_id: merged_segment.id(), - num_docs: num_docs, - num_deleted_docs: 0u32, - }; - - // TODO fix delete cursor - let delete_queue = DeleteQueue::default(); - - let segment_entry = SegmentEntry::new(segment_meta, delete_queue.cursor()); - - let segment_update = SegmentUpdate::EndMerge(Some(merging_thread_id), segment_ids.clone(), segment_entry.clone()); - segment_update_manager_clone.send(segment_update.clone()); - (segment_ids, segment_entry) - }) - .expect("Failed to spawn merge thread"); - - self.merging_threads.insert(merging_thread_id, merge_thread_handle); + self.start_merge(segment_ids, None); } } fn consider_merge_options(&self,) -> Vec { - let segment_manager = self.segment_manager(); - let (committed_segments, uncommitted_segments) = get_segment_ready_for_commit(segment_manager); + let (committed_segments, uncommitted_segments) = get_segment_ready_for_commit(&self.segment_manager); // Committed segments cannot be merged with uncommitted_segments. // We therefore consider merges using these two sets of segments independantly. let merge_policy_lock = self.merge_policy.lock().unwrap(); @@ -284,11 +324,6 @@ impl SegmentUpdateRunner { merge_candidates } - - fn segment_manager(&self,) -> &SegmentManager { - &*self.segment_manager - } - pub fn start(self) -> JoinHandle<()> { thread::Builder::new() .name("segment_update".to_string()) @@ -299,9 +334,7 @@ impl SegmentUpdateRunner { } fn process(mut self) { - - let segment_manager = self.segment_manager.clone(); - + let mut complete_option = None; for (complete, segment_update) in self.segment_update_receiver.clone() { @@ -316,40 +349,46 @@ impl SegmentUpdateRunner { // dirty-bit. 
If the value is different // to our generation, then the segment_manager has // been update updated. - let generation_before_update = segment_manager.generation(); + let generation_before_update = self.segment_manager.generation(); - self.process_one(segment_update); - - if generation_before_update != segment_manager.generation() { - // The segment manager has changed, we need to - // - save meta.json - save_metas( - &*segment_manager, - self.index.schema(), - self.index.docstamp(), - self.index.directory_mut()).expect("Could not save metas."); + if let SegmentUpdate::StartMerge(segment_ids) = segment_update { + self.start_merge(segment_ids, Some(complete)); + } + else { + self.process_one(segment_update); + if generation_before_update != self.segment_manager.generation() { + // The segment manager has changed, we need to + // - save meta.json + save_metas( + &self.segment_manager, + self.index.schema(), + self.index.docstamp(), + self.index.directory_mut()).expect("Could not save metas."); - // - update the searchers - - // update the searchers so that they eventually will - // use the new segments. - // TODO eventually have this work through watching meta.json - // so that an external process stays up to date as well. - match self.index.load_searchers() { - Ok(()) => { - } - Err(e) => { - error!("Failure while loading new searchers {:?}", e); - panic!(format!("Failure while loading new searchers {:?}", e)); + // - update the searchers + + // update the searchers so that they eventually will + // use the new segments. + // TODO eventually have this work through watching meta.json + // so that an external process stays up to date as well. 
+ match self.index.load_searchers() { + Ok(()) => {} + Err(e) => { + error!("Failure while loading new searchers {:?}", e); + panic!(format!("Failure while loading new searchers {:?}", e)); + } } + + // - start merges if required + self.start_merges(); } - - // - start merges if required - self.start_merges(); + complete.complete(()); } - complete.complete(()); + + + } @@ -381,12 +420,12 @@ impl SegmentUpdateRunner { segment_update: SegmentUpdate) { info!("Segment update: {:?}", segment_update); - + use self::SegmentUpdate::*; match segment_update { AddSegment(segment_entry) => { if !self.is_cancelled_generation { - self.segment_manager().add_segment(segment_entry); + self.segment_manager.add_segment(segment_entry); } else { // rollback has been called and this @@ -397,6 +436,9 @@ impl SegmentUpdateRunner { self.index.delete_segment(segment_entry.segment_id()); } } + StartMerge(segment_ids) => { + panic!("this should have been handled somewhere else"); + } EndMerge(merging_thread_id_opt, segment_ids, segment_entry) => { self.end_merge( segment_ids, @@ -417,7 +459,7 @@ impl SegmentUpdateRunner { self.is_cancelled_generation = false; } Commit(docstamp) => { - self.segment_manager().commit(docstamp); + self.segment_manager.commit(docstamp); } Terminate => { panic!("We should have left the loop before processing it."); diff --git a/src/lib.rs b/src/lib.rs index 9ba528862..ed0d726fa 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -50,6 +50,8 @@ extern crate crossbeam; extern crate bit_set; extern crate notify; extern crate eventual; +extern crate futures; +extern crate futures_cpupool; #[cfg(feature="simdcompression")] extern crate libc; From e8ecb68f00eacc10023cfb0efc0a1802878bce73 Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Thu, 26 Jan 2017 00:06:07 +0900 Subject: [PATCH 015/107] issue/43 switching for futures --- Cargo.toml | 1 + src/indexer/index_writer.rs | 51 ++- src/indexer/merge_policy.rs | 2 +- src/indexer/merger.rs | 4 +- src/indexer/segment_manager.rs | 13 
- src/indexer/segment_updater.rs | 622 ++++++++++++++++----------------- 6 files changed, 340 insertions(+), 353 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 998d5b48c..6941ae63d 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -39,6 +39,7 @@ version = "2" crossbeam = "0.2" eventual = "0.1.7" + futures = "0.1.9" futures-cpupool = "0.1.2" diff --git a/src/indexer/index_writer.rs b/src/indexer/index_writer.rs index 4a4377e74..298d1b6d6 100644 --- a/src/indexer/index_writer.rs +++ b/src/indexer/index_writer.rs @@ -14,7 +14,7 @@ use indexer::SegmentWriter; use indexer::SegmentManager; use core::SegmentComponent; use super::directory_lock::DirectoryLock; -use eventual::Async; +use futures::Future; use std::clone::Clone; use std::io; use fastfield::delete; @@ -27,7 +27,7 @@ use std::sync::{Arc, Mutex}; use chan; use core::SegmentMeta; use super::delete_queue::{DeleteQueue, DeleteQueueCursor}; -use super::segment_updater::{SegmentUpdate, SegmentUpdater}; +use super::segment_updater::SegmentUpdater; use super::segment_manager::CommitState; use Result; use Error; @@ -61,8 +61,6 @@ pub struct IndexWriter { // lifetime of the lock with that of the IndexWriter. 
_directory_lock: DirectoryLock, - _merge_policy: Arc>>, - index: Index, heap_size_in_bytes_per_thread: usize, @@ -78,6 +76,8 @@ pub struct IndexWriter { num_threads: usize, + generation: usize, + delete_queue: DeleteQueue, uncommitted_docstamp: u64, @@ -92,6 +92,7 @@ impl !Sync for IndexWriter {} fn index_documents(heap: &mut Heap, mut segment: Segment, schema: &Schema, + generation: usize, document_iterator: &mut Iterator, segment_updater: &mut SegmentUpdater, delete_cursor: &mut DeleteQueueCursor) @@ -134,7 +135,7 @@ fn index_documents(heap: &mut Heap, let segment_entry = SegmentEntry::new(segment_meta, delete_cursor.clone()); try!(segment_writer.finalize()); - segment_updater.add_segment(segment_entry); + segment_updater.add_segment(generation, segment_entry); Ok(()) } @@ -161,7 +162,7 @@ impl IndexWriter { } drop(self.workers_join_handle); - future.await().unwrap(); // TODO do something with the result. + future.wait().unwrap(); // TODO do something with the result. Ok(()) } @@ -179,9 +180,12 @@ impl IndexWriter { // at this point. 
let delete_cursor = self.delete_queue.cursor(); + let generation = self.generation; + let join_handle: JoinHandle> = try!(thread::Builder::new() - .name(format!("indexing_thread_{}", self.worker_id)) + .name(format!("indexing thread {} for gen {}", self.worker_id, generation)) .spawn(move || { + let mut delete_cursor_clone = delete_cursor.clone(); loop { let segment = index.new_segment(); @@ -200,6 +204,7 @@ impl IndexWriter { try!(index_documents(&mut heap, segment, &schema, + generation, &mut document_iterator, &mut segment_updater, &mut delete_cursor_clone)); @@ -245,20 +250,17 @@ impl IndexWriter { let (document_sender, document_receiver): (DocumentSender, DocumentReceiver) = chan::sync(PIPELINE_MAX_SIZE_IN_DOCS); - - let merge_policy: Arc>> = Arc::new(Mutex::new(box DefaultMergePolicy::default())); + let delete_queue = DeleteQueue::default(); - - let segment_updater = SegmentUpdater::create(index.clone(), delete_queue.cursor(), merge_policy.clone())?; + let merge_policy = box DefaultMergePolicy::default(); + let segment_updater = SegmentUpdater::new(index.clone(), delete_queue.cursor(), merge_policy)?; let mut index_writer = IndexWriter { _directory_lock: directory_lock, - _merge_policy: merge_policy, - heap_size_in_bytes_per_thread: heap_size_in_bytes_per_thread, index: index.clone(), @@ -274,21 +276,18 @@ impl IndexWriter { committed_docstamp: index.docstamp(), uncommitted_docstamp: index.docstamp(), + + generation: 0, + worker_id: 0, }; try!(index_writer.start_workers()); Ok(index_writer) } - - /// Returns a clone of the index_writer merge policy. - pub fn get_merge_policy(&self) -> Box { - self._merge_policy.lock().unwrap().box_clone() - } - /// Set the merge policy. 
pub fn set_merge_policy(&self, merge_policy: Box) { - *self._merge_policy.lock().unwrap() = merge_policy; + // *self._merge_policy.lock().unwrap() = merge_policy; } fn start_workers(&mut self) -> Result<()> { @@ -299,7 +298,7 @@ impl IndexWriter { } /// Merges a given list of segments - pub fn merge(&mut self, segments: &[SegmentId]) -> impl Async { + pub fn merge(&mut self, segments: &[SegmentId]) -> impl Future { self.segment_updater.start_merge(segments.to_vec()) } @@ -431,7 +430,7 @@ impl IndexWriter { // wait for the segment update thread to have processed the info // TODO remove unwrap - future.await().unwrap(); + future.wait().unwrap(); Ok(self.committed_docstamp) } @@ -498,10 +497,10 @@ mod tests { let schema_builder = schema::SchemaBuilder::default(); let index = Index::create_in_ram(schema_builder.build()); let index_writer = index.writer(40_000_000).unwrap(); - assert_eq!(format!("{:?}", index_writer.get_merge_policy()), "LogMergePolicy { min_merge_size: 8, min_layer_size: 10000, level_log_size: 0.75 }"); - let merge_policy = box NoMergePolicy::default(); - index_writer.set_merge_policy(merge_policy); - assert_eq!(format!("{:?}", index_writer.get_merge_policy()), "NoMergePolicy"); + // assert_eq!(format!("{:?}", index_writer.get_merge_policy()), "LogMergePolicy { min_merge_size: 8, min_layer_size: 10000, level_log_size: 0.75 }"); + // let merge_policy = box NoMergePolicy::default(); + // index_writer.set_merge_policy(merge_policy); + // assert_eq!(format!("{:?}", index_writer.get_merge_policy()), "NoMergePolicy"); } #[test] diff --git a/src/indexer/merge_policy.rs b/src/indexer/merge_policy.rs index 22a767042..ae1064355 100644 --- a/src/indexer/merge_policy.rs +++ b/src/indexer/merge_policy.rs @@ -13,7 +13,7 @@ pub struct MergeCandidate(pub Vec); /// /// Every time a the list of segments changes, the segment updater /// asks the merge policy if some segments should be merged. 
-pub trait MergePolicy: marker::Send + Debug { +pub trait MergePolicy: marker::Send + marker::Sync + Debug { /// Given the list of segment metas, returns the list of merge candidates. /// /// This call happens on the segment updater thread, and will block diff --git a/src/indexer/merger.rs b/src/indexer/merger.rs index a41f211f0..28068e880 100644 --- a/src/indexer/merger.rs +++ b/src/indexer/merger.rs @@ -14,6 +14,7 @@ use fastfield::FastFieldSerializer; use store::StoreWriter; use postings::ChainedPostings; use postings::HasLen; +use futures::Future; use postings::OffsetPostings; use core::SegmentInfo; use std::cmp::{min, max}; @@ -207,6 +208,7 @@ mod tests { use query::BooleanQuery; use schema::TextIndexingOptions; use eventual::Async; + use futures::Future; #[test] fn test_index_merger() { @@ -265,7 +267,7 @@ mod tests { let segment_ids = index.searchable_segment_ids().expect("Searchable segments failed."); let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap(); index_writer.merge(&segment_ids) - .await() + .wait() .expect("Merging failed"); index_writer.wait_merging_threads().unwrap(); } diff --git a/src/indexer/segment_manager.rs b/src/indexer/segment_manager.rs index 6fcdb253d..61f7b6fc1 100644 --- a/src/indexer/segment_manager.rs +++ b/src/indexer/segment_manager.rs @@ -39,12 +39,6 @@ impl Default for SegmentRegisters { /// changes (merges especially) pub struct SegmentManager { registers: RwLock, - // generation is an ever increasing counter that - // is incremented whenever we modify - // the segment manager. It can be useful for debugging - // purposes, and it also acts as a "dirty" marker, - // to detect when the `meta.json` should be written. 
- generation: AtomicUsize, } impl Debug for SegmentManager { @@ -89,7 +83,6 @@ impl SegmentManager { uncommitted: SegmentRegister::default(), committed: SegmentRegister::new(segment_metas, delete_cursor), }), - generation: AtomicUsize::default(), } } @@ -101,14 +94,9 @@ impl SegmentManager { } fn write(&self,) -> RwLockWriteGuard { - self.generation.fetch_add(1, Ordering::Release); self.registers.write().expect("Failed to acquire write lock on SegmentManager.") } - pub fn generation(&self,) -> usize { - self.generation.load(Ordering::Acquire) - } - /// Removes all of the uncommitted segments /// and returns them. pub fn rollback(&self,) -> Vec { @@ -180,7 +168,6 @@ impl Default for SegmentManager { uncommitted: SegmentRegister::default(), committed: SegmentRegister::default(), }), - generation: AtomicUsize::default(), } } } \ No newline at end of file diff --git a/src/indexer/segment_updater.rs b/src/indexer/segment_updater.rs index 53ab821c6..14394d3b4 100644 --- a/src/indexer/segment_updater.rs +++ b/src/indexer/segment_updater.rs @@ -7,6 +7,8 @@ use core::Segment; use core::SegmentId; use core::SegmentMeta; use std::mem; +use futures::Future; +use std::sync::atomic::AtomicUsize; use core::SerializableSegment; use indexer::MergePolicy; use indexer::MergeCandidate; @@ -22,15 +24,13 @@ use std::collections::HashMap; use rustc_serialize::json; use indexer::delete_queue::{DeleteQueueCursor, DeleteQueue}; use Result; +use futures_cpupool::CpuPool; use core::IndexMeta; use core::META_FILEPATH; use std::io::Write; use eventual::*; use super::segment_manager::{SegmentManager, get_segment_ready_for_commit}; -type SegmentUpdateSender = chan::Sender<(Complete<(), &'static str>, SegmentUpdate)>; -type SegmentUpdateReceiver = chan::Receiver<(Complete<(), &'static str>, SegmentUpdate)>; - fn create_metas(segment_manager: &SegmentManager, schema: Schema, docstamp: u64) -> IndexMeta { let (committed_segments, uncommitted_segments) = segment_manager.segment_metas(); @@ -84,386 
+84,384 @@ pub fn save_metas(segment_manager: &SegmentManager, } -#[derive(Clone, Debug)] -pub enum SegmentUpdate { +// #[derive(Clone, Debug)] +// pub enum SegmentUpdate { - /// New segment added. - /// Created by the indexing worker thread - AddSegment(SegmentEntry), +// /// New segment added. +// /// Created by the indexing worker thread +// AddSegment(usize, SegmentEntry), - StartMerge(Vec), +// StartMerge(Vec), - /// A merge is ended. - /// Remove the merged segment and record the new - /// large merged segment. - EndMerge(Option, Vec, SegmentEntry), +// /// A merge is ended. +// /// Remove the merged segment and record the new +// /// large merged segment. +// EndMerge(Option, Vec, SegmentEntry), - /// Happens when rollback is called. - /// The current generation of segments is cancelled. - CancelGeneration, +// /// Happens when rollback is called. +// /// The current generation of segments is cancelled. +// CancelGeneration, - /// Starts a new generation... This - /// happens at the end of Rollback. - NewGeneration, +// /// Starts a new generation... This +// /// happens at the end of Rollback. +// NewGeneration, - /// Just dropping the Segment updater object - /// is safe, but some merge might be happening in - /// the background and the user may want to wait for these - /// threads to terminate. - /// - /// When receiving the Terminate signal, the segment updater stops - /// receiving segment updates and just waits for the merging threads - /// to terminate. - Terminate, +// /// Just dropping the Segment updater object +// /// is safe, but some merge might be happening in +// /// the background and the user may want to wait for these +// /// threads to terminate. +// /// +// /// When receiving the Terminate signal, the segment updater stops +// /// receiving segment updates and just waits for the merging threads +// /// to terminate. +// Terminate, - /// Commit marks uncommmitted segments as committed. 
- Commit(u64), -} +// /// Commit marks uncommmitted segments as committed. +// Commit(u64), +// } - -// TODO Rename #[derive(Clone)] -pub struct SegmentUpdater { - channel: SegmentUpdateSender, -} +pub struct SegmentUpdater(Arc); +struct InnerSegmentUpdater { + pool: CpuPool, + segment_manager: SegmentManager, + merge_policy: Box, + merging_thread_id: AtomicUsize, + merging_threads: HashMap, SegmentEntry)> >, +} + impl SegmentUpdater { - pub fn create( + pub fn new( index: Index, delete_cursor: DeleteQueueCursor, - merge_policy: Arc>>) -> Result { - let (segment_update_sender, segment_update_receiver): (SegmentUpdateSender, SegmentUpdateReceiver) = chan::async(); - let segment_updater = SegmentUpdater { - channel: segment_update_sender, - }; + merge_policy: Box) + -> Result + { let committed_segments = index.committed_segments()?; let segment_manager = SegmentManager::from_segments(committed_segments, delete_cursor); - SegmentUpdateRunner::new( - index, - segment_manager, - merge_policy, - segment_updater.clone(), - segment_update_receiver).start(); - Ok(segment_updater) + Ok( + SegmentUpdater(Arc::new(InnerSegmentUpdater { + pool: CpuPool::new(1), + segment_manager: segment_manager, + merge_policy: merge_policy, + merging_thread_id: AtomicUsize::new(0), + merging_threads: HashMap::new(), + })) + ) } - pub fn add_segment(&self, segment_entry: SegmentEntry) -> impl Async { - self.send(SegmentUpdate::AddSegment(segment_entry)) + pub fn add_segment(&self, generation: usize, segment_entry: SegmentEntry) { } - pub fn commit(&self, committed_docstamp: u64) -> impl Async { - self.send(SegmentUpdate::Commit(committed_docstamp)) + pub fn commit(&self, committed_docstamp: u64) -> impl Future { + self.0.pool.spawn_fn(|| { + Ok(()) + }) } - pub fn start_merge(&self, segment_ids: Vec) -> impl Async { - self.send(SegmentUpdate::StartMerge(segment_ids)) + pub fn start_merge(&self, segment_ids: Vec) -> impl Future { + self.0.pool.spawn_fn(|| { + Ok(()) + }) } - pub fn 
new_generation(&self) -> impl Async { - self.send(SegmentUpdate::NewGeneration) + pub fn new_generation(&self) { } - pub fn cancel_generation(&self) -> impl Async { - self.send(SegmentUpdate::CancelGeneration) + pub fn cancel_generation(&self) { } - pub fn end_merge(&self, merge_thread_id: Option, merged_segment_ids: Vec, - resulting_segment_entry: SegmentEntry) -> impl Async { - let segment_update = SegmentUpdate::EndMerge(merge_thread_id, merged_segment_ids, resulting_segment_entry); - self.send(segment_update) + resulting_segment_entry: SegmentEntry) { } - pub fn terminate(&self) -> impl Async { - self.send(SegmentUpdate::Terminate) - } - - pub fn send(&self, segment_update: SegmentUpdate) -> impl Async { - let (fullfiller, future) = Future::<(), &'static str>::pair(); - self.channel.send((fullfiller, segment_update)); - future + pub fn terminate(&self) -> impl Future { + self.0.pool.spawn_fn(|| { + Ok(()) + }) } } -/// The segment update runner is in charge of processing all -/// of the `SegmentUpdate`s. -/// -/// All this processing happens on a single thread -/// consuming a common queue. 
-/// -/// The segment updates producers are : -/// - indexing threads are sending new segments -/// - merging threads are sending merge operations -/// - the index writer sends "terminate" -pub struct SegmentUpdateRunner { - index: Index, - is_cancelled_generation: bool, - segment_update_receiver: SegmentUpdateReceiver, - segment_updater: SegmentUpdater, - segment_manager: SegmentManager, - merge_policy: Arc>>, - merging_thread_id: usize, - merging_threads: HashMap, SegmentEntry)> >, -} +// impl SegmentUpdater { -impl SegmentUpdateRunner { +// pub fn create( +// index: Index, +// delete_cursor: DeleteQueueCursor, +// merge_policy: Box) -> Result { +// let (segment_update_sender, segment_update_receiver): (SegmentUpdateSender, SegmentUpdateReceiver) = chan::async(); +// let segment_updater = SegmentUpdater { +// channel: segment_update_sender, +// }; +// let committed_segments = index.committed_segments()?; +// let segment_manager = SegmentManager::from_segments(committed_segments, delete_cursor); +// SegmentUpdateRunner::new( +// index, +// segment_manager, +// merge_policy, +// segment_updater.clone(), +// segment_update_receiver).start(); +// Ok(segment_updater) +// } + + +// } + + +// The segment update runner is in charge of processing all +// of the `SegmentUpdate`s. +// +// All this processing happens on a single thread +// consuming a common queue. 
+// +// The segment updates producers are : +// - indexing threads are sending new segments +// - merging threads are sending merge operations +// - the index writer sends "terminate" +// pub struct SegmentUpdateRunner { +// index: Index, +// is_cancelled_generation: bool, +// segment_update_receiver: SegmentUpdateReceiver, +// segment_updater: SegmentUpdater, +// segment_manager: SegmentManager, +// merge_policy: Box, +// merging_thread_id: usize, +// merging_threads: HashMap, SegmentEntry)> >, +// } + +// impl SegmentUpdateRunner { - fn new(index: Index, - segment_manager: SegmentManager, - merge_policy: Arc>>, - segment_updater: SegmentUpdater, - segment_update_receiver: SegmentUpdateReceiver) -> SegmentUpdateRunner { - SegmentUpdateRunner { - index: index, - is_cancelled_generation: false, - segment_updater: segment_updater, - segment_update_receiver: segment_update_receiver, - segment_manager: segment_manager, - merge_policy: merge_policy, - merging_thread_id: 0, - merging_threads: HashMap::new(), - } - } +// fn new(index: Index, +// segment_manager: SegmentManager, +// merge_policy: Box, +// segment_updater: SegmentUpdater, +// segment_update_receiver: SegmentUpdateReceiver) -> SegmentUpdateRunner { +// SegmentUpdateRunner { +// index: index, +// is_cancelled_generation: false, +// segment_updater: segment_updater, +// segment_update_receiver: segment_update_receiver, +// segment_manager: segment_manager, +// merge_policy: merge_policy, +// merging_thread_id: 0, +// merging_threads: HashMap::new(), +// } +// } - fn new_merging_thread_id(&mut self,) -> usize { - self.merging_thread_id += 1; - self.merging_thread_id - } +// fn new_merging_thread_id(&mut self,) -> usize { +// self.merging_thread_id += 1; +// self.merging_thread_id +// } - fn end_merge( - &mut self, - segment_ids: Vec, - segment_entry: SegmentEntry) { +// fn end_merge( +// &mut self, +// segment_ids: Vec, +// segment_entry: SegmentEntry) { - self.segment_manager.end_merge(&segment_ids, 
segment_entry); - save_metas( - &self.segment_manager, - self.index.schema(), - self.index.docstamp(), - self.index.directory_mut()).expect("Could not save metas."); +// self.segment_manager.end_merge(&segment_ids, segment_entry); +// save_metas( +// &self.segment_manager, +// self.index.schema(), +// self.index.docstamp(), +// self.index.directory_mut()).expect("Could not save metas."); - for segment_id in segment_ids { - self.index.delete_segment(segment_id); - } +// for segment_id in segment_ids { +// self.index.delete_segment(segment_id); +// } - self.index.load_searchers().unwrap(); - } +// self.index.load_searchers().unwrap(); +// } - fn start_merge(&mut self, segment_ids: Vec, complete_opt: Option>) { +// fn start_merge(&mut self, segment_ids: Vec, complete_opt: Option>) { - let merging_thread_id = self.new_merging_thread_id(); - self.segment_manager.start_merge(&segment_ids); +// let merging_thread_id = self.new_merging_thread_id(); +// self.segment_manager.start_merge(&segment_ids); - let index_clone = self.index.clone(); - let segment_updater_clone = self.segment_updater.clone(); +// let index_clone = self.index.clone(); +// let segment_updater_clone = self.segment_updater.clone(); - let merge_thread_handle = thread::Builder::new() - .name(format!("merge_thread_{:?}", merging_thread_id)) - .spawn(move || { - info!("Start merge: {:?}", segment_ids); - let schema = index_clone.schema(); - let segments: Vec = segment_ids - .iter() - .map(|&segment_id| index_clone.segment(segment_id)) - .collect(); - // An IndexMerger is like a "view" of our merged segments. - // TODO unwrap - let merger: IndexMerger = IndexMerger::open(schema, &segments[..]).expect("Creating index merger failed"); - let mut merged_segment = index_clone.new_segment(); - // ... we just serialize this index merger in our new segment - // to merge the two segments. 
- let segment_serializer = SegmentSerializer::for_segment(&mut merged_segment).expect("Creating index serializer failed"); - let num_docs = merger.write(segment_serializer).expect("Serializing merged index failed"); - let segment_meta = SegmentMeta { - segment_id: merged_segment.id(), - num_docs: num_docs, - num_deleted_docs: 0u32, - }; +// let merge_thread_handle = thread::Builder::new() +// .name(format!("merge_thread_{:?}", merging_thread_id)) +// .spawn(move || { +// info!("Start merge: {:?}", segment_ids); +// let schema = index_clone.schema(); +// let segments: Vec = segment_ids +// .iter() +// .map(|&segment_id| index_clone.segment(segment_id)) +// .collect(); +// // An IndexMerger is like a "view" of our merged segments. +// // TODO unwrap +// let merger: IndexMerger = IndexMerger::open(schema, &segments[..]).expect("Creating index merger failed"); +// let mut merged_segment = index_clone.new_segment(); +// // ... we just serialize this index merger in our new segment +// // to merge the two segments. 
+// let segment_serializer = SegmentSerializer::for_segment(&mut merged_segment).expect("Creating index serializer failed"); +// let num_docs = merger.write(segment_serializer).expect("Serializing merged index failed"); +// let segment_meta = SegmentMeta { +// segment_id: merged_segment.id(), +// num_docs: num_docs, +// num_deleted_docs: 0u32, +// }; - // TODO fix delete cursor - let delete_queue = DeleteQueue::default(); +// // TODO fix delete cursor +// let delete_queue = DeleteQueue::default(); - let segment_entry = SegmentEntry::new(segment_meta, delete_queue.cursor()); +// let segment_entry = SegmentEntry::new(segment_meta, delete_queue.cursor()); - let segment_update = SegmentUpdate::EndMerge(Some(merging_thread_id), segment_ids.clone(), segment_entry.clone()); - segment_updater_clone.send(segment_update.clone()); - if let Some(complete) = complete_opt { - complete.complete(()); - } - (segment_ids, segment_entry) - }) - .expect("Failed to spawn merge thread"); +// let segment_update = SegmentUpdate::EndMerge(Some(merging_thread_id), segment_ids.clone(), segment_entry.clone()); +// // segment_updater_clone.send(segment_update.clone()); +// if let Some(complete) = complete_opt { +// complete.complete(()); +// } +// (segment_ids, segment_entry) +// }) +// .expect("Failed to spawn merge thread"); - self.merging_threads.insert(merging_thread_id, merge_thread_handle); - } +// self.merging_threads.insert(merging_thread_id, merge_thread_handle); +// } - fn start_merges(&mut self) { - let merge_candidates = self.consider_merge_options(); - for MergeCandidate(segment_ids) in merge_candidates { - self.start_merge(segment_ids, None); - } - } +// fn start_merges(&mut self) { +// let merge_candidates = self.consider_merge_options(); +// for MergeCandidate(segment_ids) in merge_candidates { +// self.start_merge(segment_ids, None); +// } +// } - fn consider_merge_options(&self,) -> Vec { - let (committed_segments, uncommitted_segments) = 
get_segment_ready_for_commit(&self.segment_manager); - // Committed segments cannot be merged with uncommitted_segments. - // We therefore consider merges using these two sets of segments independantly. - let merge_policy_lock = self.merge_policy.lock().unwrap(); - let mut merge_candidates = merge_policy_lock.compute_merge_candidates(&uncommitted_segments); - let committed_merge_candidates = merge_policy_lock.compute_merge_candidates(&committed_segments); - merge_candidates.extend_from_slice(&committed_merge_candidates[..]); - merge_candidates - } +// fn consider_merge_options(&self,) -> Vec { +// let (committed_segments, uncommitted_segments) = get_segment_ready_for_commit(&self.segment_manager); +// // Committed segments cannot be merged with uncommitted_segments. +// // We therefore consider merges using these two sets of segments independantly. +// let mut merge_candidates = self.merge_policy.compute_merge_candidates(&uncommitted_segments); +// let committed_merge_candidates = self.merge_policy.compute_merge_candidates(&committed_segments); +// merge_candidates.extend_from_slice(&committed_merge_candidates[..]); +// merge_candidates +// } - pub fn start(self) -> JoinHandle<()> { - thread::Builder::new() - .name("segment_update".to_string()) - .spawn(move || { - self.process(); - }) - .expect("Failed to start segment updater thread.") - } +// pub fn start(self) -> JoinHandle<()> { +// thread::Builder::new() +// .name("segment_update".to_string()) +// .spawn(move || { +// self.process(); +// }) +// .expect("Failed to start segment updater thread.") +// } - fn process(mut self) { +// fn process(mut self) { - let mut complete_option = None; +// let mut complete_option = None; - for (complete, segment_update) in self.segment_update_receiver.clone() { +// for (complete, segment_update) in self.segment_update_receiver.clone() { - if let SegmentUpdate::Terminate = segment_update { - complete_option = Some(complete); - break; - } +// if let SegmentUpdate::Terminate = 
segment_update { +// complete_option = Some(complete); +// break; +// } - - // we check the generation number as if it was - // dirty-bit. If the value is different - // to our generation, then the segment_manager has - // been update updated. - let generation_before_update = self.segment_manager.generation(); - - if let SegmentUpdate::StartMerge(segment_ids) = segment_update { - self.start_merge(segment_ids, Some(complete)); - } - else { - self.process_one(segment_update); - if generation_before_update != self.segment_manager.generation() { - // The segment manager has changed, we need to - // - save meta.json - save_metas( - &self.segment_manager, - self.index.schema(), - self.index.docstamp(), - self.index.directory_mut()).expect("Could not save metas."); - - - // - update the searchers - - // update the searchers so that they eventually will - // use the new segments. - // TODO eventually have this work through watching meta.json - // so that an external process stays up to date as well. 
- match self.index.load_searchers() { - Ok(()) => {} - Err(e) => { - error!("Failure while loading new searchers {:?}", e); - panic!(format!("Failure while loading new searchers {:?}", e)); - } - } +// if let SegmentUpdate::StartMerge(segment_ids) = segment_update { +// self.start_merge(segment_ids, Some(complete)); +// } +// else { +// self.process_one(segment_update); - // - start merges if required - self.start_merges(); - } - complete.complete(()); - } - - - - - - } +// // - start merges if required +// self.start_merges(); +// complete.complete(()); +// } +// } - let mut merging_threads = HashMap::new(); - mem::swap(&mut merging_threads, &mut self.merging_threads); - for (_, merging_thread_handle) in merging_threads { - match merging_thread_handle.join() { - Ok((segment_ids, segment_entry)) => { - self.end_merge(segment_ids, segment_entry); - } - Err(e) => { - error!("Error in merging thread {:?}", e); - break; - } - } - } +// let mut merging_threads = HashMap::new(); +// mem::swap(&mut merging_threads, &mut self.merging_threads); +// for (_, merging_thread_handle) in merging_threads { +// match merging_thread_handle.join() { +// Ok((segment_ids, segment_entry)) => { +// self.end_merge(segment_ids, segment_entry); +// } +// Err(e) => { +// error!("Error in merging thread {:?}", e); +// break; +// } +// } +// } - if let Some(complete) = complete_option { - complete.complete(()); - } - } +// if let Some(complete) = complete_option { +// complete.complete(()); +// } +// } - // Process a single segment update. - pub fn process_one( - &mut self, - segment_update: SegmentUpdate) { +// // Process a single segment update. 
+// pub fn process_one( +// &mut self, +// segment_update: SegmentUpdate) { - info!("Segment update: {:?}", segment_update); +// info!("Segment update: {:?}", segment_update); - use self::SegmentUpdate::*; - match segment_update { - AddSegment(segment_entry) => { - if !self.is_cancelled_generation { - self.segment_manager.add_segment(segment_entry); - } - else { - // rollback has been called and this - // segment actually belong to the - // documents that have been dropped. - // - // Let's just remove its files. - self.index.delete_segment(segment_entry.segment_id()); - } - } - StartMerge(segment_ids) => { - panic!("this should have been handled somewhere else"); - } - EndMerge(merging_thread_id_opt, segment_ids, segment_entry) => { - self.end_merge( - segment_ids, - segment_entry); - if let Some(merging_thread_id) = merging_thread_id_opt { - self.merging_threads.remove(&merging_thread_id); - } - } - CancelGeneration => { - // Called during rollback. The segment - // that will arrive will be ignored - // until a NewGeneration is update arrives. - self.is_cancelled_generation = true; - } - NewGeneration => { - // After rollback, we can resume - // indexing new documents. - self.is_cancelled_generation = false; - } - Commit(docstamp) => { - self.segment_manager.commit(docstamp); - } - Terminate => { - panic!("We should have left the loop before processing it."); - } - } - } -} +// use self::SegmentUpdate::*; +// match segment_update { +// AddSegment(generation, segment_entry) => { +// if !self.is_cancelled_generation { +// self.segment_manager.add_segment(segment_entry); +// } +// else { +// // rollback has been called and this +// // segment actually belong to the +// // documents that have been dropped. +// // +// // Let's just remove its files. 
+// self.index.delete_segment(segment_entry.segment_id()); +// } +// } +// StartMerge(segment_ids) => { +// panic!("this should have been handled somewhere else"); +// } +// EndMerge(merging_thread_id_opt, segment_ids, segment_entry) => { +// self.end_merge( +// segment_ids, +// segment_entry); +// if let Some(merging_thread_id) = merging_thread_id_opt { +// self.merging_threads.remove(&merging_thread_id); +// } +// } +// CancelGeneration => { +// // Called during rollback. The segment +// // that will arrive will be ignored +// // until a NewGeneration is update arrives. +// self.is_cancelled_generation = true; +// } +// NewGeneration => { +// // After rollback, we can resume +// // indexing new documents. +// self.is_cancelled_generation = false; +// } +// Commit(docstamp) => { +// self.segment_manager.commit(docstamp); +// save_metas( +// &self.segment_manager, +// self.index.schema(), +// self.index.docstamp(), +// self.index.directory_mut()).expect("Could not save metas."); +// match self.index.load_searchers() { +// Ok(()) => {} +// Err(e) => { +// error!("Failure while loading new searchers {:?}", e); +// panic!(format!("Failure while loading new searchers {:?}", e)); +// } +// } +// } +// Terminate => { +// panic!("We should have left the loop before processing it."); +// } +// } +// } +// } From ca977fb17b0072af63cd0a398d9bb87134e93c47 Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Mon, 30 Jan 2017 23:47:56 +0900 Subject: [PATCH 016/107] issue/43 Refactoring of SegmentUpdater --- Cargo.toml | 2 - src/core/term_iterator.rs | 1 + src/indexer/index_writer.rs | 91 +++--- src/indexer/log_merge_policy.rs | 8 +- src/indexer/merger.rs | 6 +- src/indexer/segment_manager.rs | 28 +- src/indexer/segment_register.rs | 9 +- src/indexer/segment_updater.rs | 555 +++++++++++--------------------- src/lib.rs | 6 +- src/postings/mod.rs | 3 +- src/query/phrase_query/mod.rs | 1 + 11 files changed, 250 insertions(+), 460 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 
6941ae63d..e65a5b608 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -38,8 +38,6 @@ chan = "0.1" version = "2" crossbeam = "0.2" -eventual = "0.1.7" - futures = "0.1.9" futures-cpupool = "0.1.2" diff --git a/src/core/term_iterator.rs b/src/core/term_iterator.rs index 3a5e259f7..53311e1cd 100644 --- a/src/core/term_iterator.rs +++ b/src/core/term_iterator.rs @@ -170,6 +170,7 @@ mod tests { index_writer.commit().unwrap(); } } + index.load_searchers().unwrap(); let searcher = index.searcher(); let mut term_it = searcher.terms(); let mut terms = String::new(); diff --git a/src/indexer/index_writer.rs b/src/indexer/index_writer.rs index 298d1b6d6..e4578c51f 100644 --- a/src/indexer/index_writer.rs +++ b/src/indexer/index_writer.rs @@ -1,17 +1,14 @@ use schema::Schema; use schema::Document; use super::operation::AddOperation; -use indexer::SegmentSerializer; -use core::SerializableSegment; use core::Index; use core::Segment; use core::SegmentId; use schema::Term; use indexer::SegmentEntry; use std::thread::JoinHandle; -use indexer::{MergePolicy, DefaultMergePolicy}; +use indexer::MergePolicy; use indexer::SegmentWriter; -use indexer::SegmentManager; use core::SegmentComponent; use super::directory_lock::DirectoryLock; use futures::Future; @@ -19,16 +16,14 @@ use std::clone::Clone; use std::io; use fastfield::delete; use std::thread; +use futures::Canceled; use std::mem; -use indexer::merger::IndexMerger; use datastruct::stacker::Heap; use std::mem::swap; -use std::sync::{Arc, Mutex}; use chan; use core::SegmentMeta; use super::delete_queue::{DeleteQueue, DeleteQueueCursor}; use super::segment_updater::SegmentUpdater; -use super::segment_manager::CommitState; use Result; use Error; @@ -96,7 +91,7 @@ fn index_documents(heap: &mut Heap, document_iterator: &mut Iterator, segment_updater: &mut SegmentUpdater, delete_cursor: &mut DeleteQueueCursor) - -> Result<()> { + -> Result { heap.clear(); let segment_id = segment.id(); let mut segment_writer = 
try!(SegmentWriter::for_segment(heap, segment.clone(), &schema)); @@ -133,10 +128,13 @@ fn index_documents(heap: &mut Heap, }; let segment_entry = SegmentEntry::new(segment_meta, delete_cursor.clone()); - + try!(segment_writer.finalize()); - segment_updater.add_segment(generation, segment_entry); - Ok(()) + + segment_updater + .add_segment(generation, segment_entry) + .wait() + .map_err(|_| Error::ErrorInThread("Could not add segment.".to_string())) } @@ -145,7 +143,7 @@ impl IndexWriter { /// The index writer pub fn wait_merging_threads(mut self) -> Result<()> { - let future = self.segment_updater.terminate(); + // let future = self.segment_updater.terminate(); // this will stop the indexing thread, // dropping the last reference to the segment_updater. @@ -162,7 +160,12 @@ impl IndexWriter { } drop(self.workers_join_handle); - future.wait().unwrap(); // TODO do something with the result. + self.segment_updater + .wait_merging_thread() + .map_err(|_| + Error::ErrorInThread("Failed to join merging thread.".to_string()) + )?; + // future.wait().unwrap(); // TODO do something with the result. Ok(()) } @@ -201,13 +204,16 @@ impl IndexWriter { // peeked document now belongs to // our local iterator. if document_iterator.peek().is_some() { - try!(index_documents(&mut heap, + let valid_generation = try!(index_documents(&mut heap, segment, &schema, generation, &mut document_iterator, &mut segment_updater, &mut delete_cursor_clone)); + if valid_generation { + return Ok(()); + } } else { // No more documents. 
// Happens when there is a commit, or if the `IndexWriter` @@ -254,8 +260,7 @@ impl IndexWriter { let delete_queue = DeleteQueue::default(); - let merge_policy = box DefaultMergePolicy::default(); - let segment_updater = SegmentUpdater::new(index.clone(), delete_queue.cursor(), merge_policy)?; + let segment_updater = SegmentUpdater::new(index.clone(), delete_queue.cursor())?; let mut index_writer = IndexWriter { @@ -285,9 +290,14 @@ impl IndexWriter { Ok(index_writer) } + + pub fn get_merge_policy(&self) -> Box { + self.segment_updater.get_merge_policy() + } + /// Set the merge policy. pub fn set_merge_policy(&self, merge_policy: Box) { - // *self._merge_policy.lock().unwrap() = merge_policy; + self.segment_updater.set_merge_policy(merge_policy); } fn start_workers(&mut self) -> Result<()> { @@ -298,7 +308,7 @@ impl IndexWriter { } /// Merges a given list of segments - pub fn merge(&mut self, segments: &[SegmentId]) -> impl Future { + pub fn merge(&mut self, segments: &[SegmentId]) -> impl Future { self.segment_updater.start_merge(segments.to_vec()) } @@ -328,8 +338,11 @@ impl IndexWriter { /// The docstamp at the last commit is returned. pub fn rollback(&mut self) -> Result { - self.segment_updater.cancel_generation(); - + // by updating the generation in the segment updater, + // pending add segment commands will be dismissed. + self.generation += 1; + let rollback_future = self.segment_updater.new_generation(self.generation); + // we cannot drop segment ready receiver yet // as it would block the workers. 
let document_receiver = self.recreate_document_channel(); @@ -341,7 +354,7 @@ impl IndexWriter { let mut former_workers_join_handle = Vec::new(); swap(&mut former_workers_join_handle, &mut self.workers_join_handle); - + // wait for all the worker to finish their work // (it should be fast since we consumed all pending documents) for worker_handle in former_workers_join_handle { @@ -355,27 +368,15 @@ impl IndexWriter { // All of our indexing workers for the rollbacked generation have // been terminated. + // // Our document receiver pipe was drained. // No new document have been added in the meanwhile because `IndexWriter` // is not shared by different threads. - // - // We can now open a new generation and reaccept segments - // from now on. - self.segment_updater.new_generation(); - - // TODO Send rollback. - //let rollbacked_segments = self.segment_manager.rollback(); + rollback_future.wait().map_err(|_| + Error::ErrorInThread("Error while waiting for rollback.".to_string()) + )?; - // for segment_id in rollbacked_segments { - // // TODO all delete must happen after saving - // // meta.json - // self.index.delete_segment(segment_id); - // } - - panic!("aaaa"); - - // reset the docstamp self.uncommitted_docstamp = self.committed_docstamp; Ok(self.committed_docstamp) @@ -474,12 +475,11 @@ impl IndexWriter { #[cfg(test)] mod tests { - + use indexer::NoMergePolicy; use schema::{self, Document}; use Index; use Term; use Error; - use indexer::NoMergePolicy; #[test] fn test_lockfile_stops_duplicates() { @@ -497,10 +497,10 @@ mod tests { let schema_builder = schema::SchemaBuilder::default(); let index = Index::create_in_ram(schema_builder.build()); let index_writer = index.writer(40_000_000).unwrap(); - // assert_eq!(format!("{:?}", index_writer.get_merge_policy()), "LogMergePolicy { min_merge_size: 8, min_layer_size: 10000, level_log_size: 0.75 }"); - // let merge_policy = box NoMergePolicy::default(); - // index_writer.set_merge_policy(merge_policy); - // 
assert_eq!(format!("{:?}", index_writer.get_merge_policy()), "NoMergePolicy"); + assert_eq!(format!("{:?}", index_writer.get_merge_policy()), "LogMergePolicy { min_merge_size: 8, min_layer_size: 10000, level_log_size: 0.75 }"); + let merge_policy = box NoMergePolicy::default(); + index_writer.set_merge_policy(merge_policy); + assert_eq!(format!("{:?}", index_writer.get_merge_policy()), "NoMergePolicy"); } #[test] @@ -521,7 +521,6 @@ mod tests { let text_field = schema_builder.add_text_field("text", schema::TEXT); let index = Index::create_in_ram(schema_builder.build()); - let num_docs_containing = |s: &str| { let searcher = index.searcher(); let term_a = Term::from_field_text(text_field, s); @@ -550,11 +549,12 @@ mod tests { index_writer.add_document(doc).unwrap(); } assert_eq!(index_writer.commit().unwrap(), 2u64); - + index.load_searchers().unwrap(); assert_eq!(num_docs_containing("a"), 0); assert_eq!(num_docs_containing("b"), 1); assert_eq!(num_docs_containing("c"), 1); } + index.load_searchers().unwrap(); index.searcher(); } @@ -586,6 +586,7 @@ mod tests { // this should create 8 segments and trigger a merge. 
index_writer.commit().expect("commit failed"); index_writer.wait_merging_threads().expect("waiting merging thread failed"); + index.load_searchers().unwrap(); assert_eq!(num_docs_containing("a"), 200); assert_eq!(index.searchable_segments().unwrap().len(), 1); } diff --git a/src/indexer/log_merge_policy.rs b/src/indexer/log_merge_policy.rs index 5ca049e4c..3eebdf78f 100644 --- a/src/indexer/log_merge_policy.rs +++ b/src/indexer/log_merge_policy.rs @@ -48,7 +48,6 @@ impl LogMergePolicy { impl MergePolicy for LogMergePolicy { fn compute_merge_candidates(&self, segments: &[SegmentMeta]) -> Vec { - if segments.is_empty() { return Vec::new(); } @@ -75,16 +74,15 @@ impl MergePolicy for LogMergePolicy { levels.last_mut().unwrap().push(ind); } - let result = levels.iter() + levels + .iter() .filter(|level| level.len() >= self.min_merge_size) .map(|ind_vec| { MergeCandidate(ind_vec.iter() .map(|&ind| segments[ind].segment_id) .collect()) }) - .collect(); - - result + .collect() } fn box_clone(&self) -> Box { diff --git a/src/indexer/merger.rs b/src/indexer/merger.rs index 28068e880..0f51d0652 100644 --- a/src/indexer/merger.rs +++ b/src/indexer/merger.rs @@ -1,7 +1,6 @@ use Result; use core::SegmentReader; use core::Segment; -use core::SegmentId; use DocId; use core::SerializableSegment; use indexer::SegmentSerializer; @@ -14,7 +13,6 @@ use fastfield::FastFieldSerializer; use store::StoreWriter; use postings::ChainedPostings; use postings::HasLen; -use futures::Future; use postings::OffsetPostings; use core::SegmentInfo; use std::cmp::{min, max}; @@ -207,7 +205,6 @@ mod tests { use collector::tests::TestCollector; use query::BooleanQuery; use schema::TextIndexingOptions; - use eventual::Async; use futures::Future; #[test] @@ -243,7 +240,7 @@ mod tests { doc.add_u32(score_field, 7); index_writer.add_document(doc).unwrap(); } - index_writer.commit().unwrap(); + index_writer.commit().expect("committed"); } { @@ -272,6 +269,7 @@ mod tests { 
index_writer.wait_merging_threads().unwrap(); } { + index.load_searchers().unwrap(); let searcher = index.searcher(); let get_doc_ids = |terms: Vec| { let mut collector = TestCollector::default(); diff --git a/src/indexer/segment_manager.rs b/src/indexer/segment_manager.rs index 61f7b6fc1..20ccfcdfa 100644 --- a/src/indexer/segment_manager.rs +++ b/src/indexer/segment_manager.rs @@ -6,7 +6,6 @@ use indexer::SegmentEntry; use indexer::delete_queue::DeleteQueueCursor; use std::sync::{RwLockReadGuard, RwLockWriteGuard}; use std::fmt::{self, Debug, Formatter}; -use std::sync::atomic::{AtomicUsize, Ordering}; struct SegmentRegisters { docstamp: u64, @@ -14,13 +13,6 @@ struct SegmentRegisters { committed: SegmentRegister, } -#[derive(Eq, PartialEq)] -pub enum CommitState { - Committed, - Uncommitted, - Missing, -} - impl Default for SegmentRegisters { fn default() -> SegmentRegisters { SegmentRegisters { @@ -54,28 +46,14 @@ impl Debug for SegmentManager { /// /// For instance, a segment will not appear in both committed and uncommitted /// segments -pub fn get_segment_ready_for_commit(segment_manager: &SegmentManager,) -> (Vec, Vec) { +pub fn get_segments(segment_manager: &SegmentManager,) -> (Vec, Vec) { let registers_lock = segment_manager.read(); - (registers_lock.committed.get_segment_ready_for_commit(), - registers_lock.uncommitted.get_segment_ready_for_commit()) + (registers_lock.committed.get_segments(), + registers_lock.uncommitted.get_segments()) } impl SegmentManager { - /// Returns whether a segment is committed, uncommitted or missing. 
- pub fn is_committed(&self, segment_id: SegmentId) -> CommitState { - let lock = self.read(); - if lock.uncommitted.contains(segment_id) { - CommitState::Uncommitted - } - else if lock.committed.contains(segment_id) { - CommitState::Committed - } - else { - CommitState::Missing - } - } - pub fn from_segments(segment_metas: Vec, delete_cursor: DeleteQueueCursor) -> SegmentManager { SegmentManager { registers: RwLock::new( SegmentRegisters { diff --git a/src/indexer/segment_register.rs b/src/indexer/segment_register.rs index f1dc37d24..8f8f20611 100644 --- a/src/indexer/segment_register.rs +++ b/src/indexer/segment_register.rs @@ -89,7 +89,7 @@ impl SegmentRegister { self.segment_states.clear(); } - pub fn get_segment_ready_for_commit(&self,) -> Vec { + pub fn get_segments(&self,) -> Vec { self.segment_states .values() .filter(|segment_entry| segment_entry.is_ready()) @@ -126,12 +126,7 @@ impl SegmentRegister { .get(&segment_id) .map(|segment_entry| segment_entry.clone()) } - - pub fn contains(&self, segment_id: SegmentId) -> bool { - self.segment_states.contains_key(&segment_id) - } - - + pub fn contains_all(&mut self, segment_ids: &[SegmentId]) -> bool { segment_ids .iter() diff --git a/src/indexer/segment_updater.rs b/src/indexer/segment_updater.rs index 14394d3b4..2622d97a1 100644 --- a/src/indexer/segment_updater.rs +++ b/src/indexer/segment_updater.rs @@ -1,21 +1,25 @@ #![allow(for_kv_map)] -use chan; use core::Index; -use std::sync::Mutex; use core::Segment; +use indexer::{MergePolicy, DefaultMergePolicy}; use core::SegmentId; use core::SegmentMeta; use std::mem; -use futures::Future; +use std::sync::atomic::Ordering; +use std::ops::DerefMut; +use futures::{Future, future}; +use futures::oneshot; +use futures::Canceled; +use std::thread; use std::sync::atomic::AtomicUsize; +use std::sync::RwLock; use core::SerializableSegment; -use indexer::MergePolicy; use indexer::MergeCandidate; use indexer::merger::IndexMerger; +use std::borrow::BorrowMut; use 
indexer::SegmentSerializer; use indexer::SegmentEntry; -use std::thread; use schema::Schema; use directory::Directory; use std::thread::JoinHandle; @@ -28,8 +32,7 @@ use futures_cpupool::CpuPool; use core::IndexMeta; use core::META_FILEPATH; use std::io::Write; -use eventual::*; -use super::segment_manager::{SegmentManager, get_segment_ready_for_commit}; +use super::segment_manager::{SegmentManager, get_segments}; fn create_metas(segment_manager: &SegmentManager, schema: Schema, docstamp: u64) -> IndexMeta { @@ -84,135 +87,6 @@ pub fn save_metas(segment_manager: &SegmentManager, } -// #[derive(Clone, Debug)] -// pub enum SegmentUpdate { - -// /// New segment added. -// /// Created by the indexing worker thread -// AddSegment(usize, SegmentEntry), - - -// StartMerge(Vec), - -// /// A merge is ended. -// /// Remove the merged segment and record the new -// /// large merged segment. -// EndMerge(Option, Vec, SegmentEntry), - -// /// Happens when rollback is called. -// /// The current generation of segments is cancelled. -// CancelGeneration, - -// /// Starts a new generation... This -// /// happens at the end of Rollback. -// NewGeneration, - -// /// Just dropping the Segment updater object -// /// is safe, but some merge might be happening in -// /// the background and the user may want to wait for these -// /// threads to terminate. -// /// -// /// When receiving the Terminate signal, the segment updater stops -// /// receiving segment updates and just waits for the merging threads -// /// to terminate. -// Terminate, - -// /// Commit marks uncommmitted segments as committed. 
-// Commit(u64), -// } - -#[derive(Clone)] -pub struct SegmentUpdater(Arc); - - -struct InnerSegmentUpdater { - pool: CpuPool, - segment_manager: SegmentManager, - merge_policy: Box, - merging_thread_id: AtomicUsize, - merging_threads: HashMap, SegmentEntry)> >, -} - -impl SegmentUpdater { - - pub fn new( - index: Index, - delete_cursor: DeleteQueueCursor, - merge_policy: Box) - -> Result - { - let committed_segments = index.committed_segments()?; - let segment_manager = SegmentManager::from_segments(committed_segments, delete_cursor); - Ok( - SegmentUpdater(Arc::new(InnerSegmentUpdater { - pool: CpuPool::new(1), - segment_manager: segment_manager, - merge_policy: merge_policy, - merging_thread_id: AtomicUsize::new(0), - merging_threads: HashMap::new(), - })) - ) - } - - pub fn add_segment(&self, generation: usize, segment_entry: SegmentEntry) { - } - - pub fn commit(&self, committed_docstamp: u64) -> impl Future { - self.0.pool.spawn_fn(|| { - Ok(()) - }) - } - - pub fn start_merge(&self, segment_ids: Vec) -> impl Future { - self.0.pool.spawn_fn(|| { - Ok(()) - }) - } - - pub fn new_generation(&self) { - } - - pub fn cancel_generation(&self) { - } - - pub fn end_merge(&self, - merge_thread_id: Option, - merged_segment_ids: Vec, - resulting_segment_entry: SegmentEntry) { - } - - pub fn terminate(&self) -> impl Future { - self.0.pool.spawn_fn(|| { - Ok(()) - }) - } - -} - - -// impl SegmentUpdater { - -// pub fn create( -// index: Index, -// delete_cursor: DeleteQueueCursor, -// merge_policy: Box) -> Result { -// let (segment_update_sender, segment_update_receiver): (SegmentUpdateSender, SegmentUpdateReceiver) = chan::async(); -// let segment_updater = SegmentUpdater { -// channel: segment_update_sender, -// }; -// let committed_segments = index.committed_segments()?; -// let segment_manager = SegmentManager::from_segments(committed_segments, delete_cursor); -// SegmentUpdateRunner::new( -// index, -// segment_manager, -// merge_policy, -// segment_updater.clone(), 
-// segment_update_receiver).start(); -// Ok(segment_updater) -// } - - -// } // The segment update runner is in charge of processing all @@ -220,248 +94,191 @@ impl SegmentUpdater { // // All this processing happens on a single thread // consuming a common queue. -// -// The segment updates producers are : -// - indexing threads are sending new segments -// - merging threads are sending merge operations -// - the index writer sends "terminate" -// pub struct SegmentUpdateRunner { -// index: Index, -// is_cancelled_generation: bool, -// segment_update_receiver: SegmentUpdateReceiver, -// segment_updater: SegmentUpdater, -// segment_manager: SegmentManager, -// merge_policy: Box, -// merging_thread_id: usize, -// merging_threads: HashMap, SegmentEntry)> >, -// } - -// impl SegmentUpdateRunner { - -// fn new(index: Index, -// segment_manager: SegmentManager, -// merge_policy: Box, -// segment_updater: SegmentUpdater, -// segment_update_receiver: SegmentUpdateReceiver) -> SegmentUpdateRunner { -// SegmentUpdateRunner { -// index: index, -// is_cancelled_generation: false, -// segment_updater: segment_updater, -// segment_update_receiver: segment_update_receiver, -// segment_manager: segment_manager, -// merge_policy: merge_policy, -// merging_thread_id: 0, -// merging_threads: HashMap::new(), -// } -// } - -// fn new_merging_thread_id(&mut self,) -> usize { -// self.merging_thread_id += 1; -// self.merging_thread_id -// } - - -// fn end_merge( -// &mut self, -// segment_ids: Vec, -// segment_entry: SegmentEntry) { - -// self.segment_manager.end_merge(&segment_ids, segment_entry); -// save_metas( -// &self.segment_manager, -// self.index.schema(), -// self.index.docstamp(), -// self.index.directory_mut()).expect("Could not save metas."); - -// for segment_id in segment_ids { -// self.index.delete_segment(segment_id); -// } - -// self.index.load_searchers().unwrap(); -// } +#[derive(Clone)] +pub struct SegmentUpdater(Arc); -// fn start_merge(&mut self, segment_ids: Vec, 
complete_opt: Option>) { - -// let merging_thread_id = self.new_merging_thread_id(); -// self.segment_manager.start_merge(&segment_ids); - -// let index_clone = self.index.clone(); -// let segment_updater_clone = self.segment_updater.clone(); - -// let merge_thread_handle = thread::Builder::new() -// .name(format!("merge_thread_{:?}", merging_thread_id)) -// .spawn(move || { -// info!("Start merge: {:?}", segment_ids); -// let schema = index_clone.schema(); -// let segments: Vec = segment_ids -// .iter() -// .map(|&segment_id| index_clone.segment(segment_id)) -// .collect(); -// // An IndexMerger is like a "view" of our merged segments. -// // TODO unwrap -// let merger: IndexMerger = IndexMerger::open(schema, &segments[..]).expect("Creating index merger failed"); -// let mut merged_segment = index_clone.new_segment(); -// // ... we just serialize this index merger in our new segment -// // to merge the two segments. -// let segment_serializer = SegmentSerializer::for_segment(&mut merged_segment).expect("Creating index serializer failed"); -// let num_docs = merger.write(segment_serializer).expect("Serializing merged index failed"); -// let segment_meta = SegmentMeta { -// segment_id: merged_segment.id(), -// num_docs: num_docs, -// num_deleted_docs: 0u32, -// }; +struct InnerSegmentUpdater { + pool: CpuPool, + index: Index, + segment_manager: SegmentManager, + merge_policy: RwLock>, + merging_thread_id: AtomicUsize, + merging_threads: RwLock>>, + generation: AtomicUsize, +} -// // TODO fix delete cursor -// let delete_queue = DeleteQueue::default(); - -// let segment_entry = SegmentEntry::new(segment_meta, delete_queue.cursor()); +impl SegmentUpdater { -// let segment_update = SegmentUpdate::EndMerge(Some(merging_thread_id), segment_ids.clone(), segment_entry.clone()); -// // segment_updater_clone.send(segment_update.clone()); -// if let Some(complete) = complete_opt { -// complete.complete(()); -// } -// (segment_ids, segment_entry) -// }) -// .expect("Failed to 
spawn merge thread"); + pub fn new( + index: Index, + delete_cursor: DeleteQueueCursor) + -> Result + { + let committed_segments = index.committed_segments()?; + let segment_manager = SegmentManager::from_segments(committed_segments, delete_cursor); + Ok( + SegmentUpdater(Arc::new(InnerSegmentUpdater { + pool: CpuPool::new(1), + index: index, + segment_manager: segment_manager, + merge_policy: RwLock::new(box DefaultMergePolicy::default()), + merging_thread_id: AtomicUsize::default(), + merging_threads: RwLock::new(HashMap::new()), + generation: AtomicUsize::default(), + })) + ) + } + + pub fn get_merge_policy(&self) -> Box { + self.0.merge_policy.read().unwrap().box_clone() + } + + pub fn set_merge_policy(&self, merge_policy: Box) { + *self.0.merge_policy.write().unwrap()= merge_policy; + } + + fn get_merging_thread_id(&self) -> usize { + self.0.merging_thread_id.fetch_add(1, Ordering::SeqCst) + } + + + fn run_async T>(&self, f: F) -> impl Future { + let me_clone = self.clone(); + self.0.pool.spawn_fn(move || { + Ok(f(me_clone)) + }) + } + + pub fn new_generation(&mut self, generation: usize) -> impl Future { + self.0.generation.store(generation, Ordering::Release); + self.run_async(|segment_updater| { + segment_updater.0.segment_manager.rollback(); + }) + } + + pub fn add_segment(&self, generation: usize, segment_entry: SegmentEntry) -> impl Future { + if generation >= self.0.generation.load(Ordering::Acquire) { + future::Either::A(self.run_async(|segment_updater| { + segment_updater.0.segment_manager.add_segment(segment_entry); + segment_updater.consider_merge_options(); + true + })) + } + else { + future::Either::B(future::ok(false)) + } + } + + pub fn commit(&self, opstamp: u64) -> impl Future { + self.run_async(move |segment_updater| { + segment_updater.0.segment_manager.commit(opstamp); + let mut directory = segment_updater.0.index.directory().box_clone(); + save_metas( + &segment_updater.0.segment_manager, + segment_updater.0.index.schema(), + 
segment_updater.0.index.docstamp(), + directory.borrow_mut()).expect("Could not save metas."); + segment_updater.consider_merge_options(); + }) + } + + + pub fn start_merge(&self, segment_ids: Vec) -> impl Future { -// self.merging_threads.insert(merging_thread_id, merge_thread_handle); -// } + self.0.segment_manager.start_merge(&segment_ids); + let segment_updater_clone = self.clone(); -// fn start_merges(&mut self) { -// let merge_candidates = self.consider_merge_options(); -// for MergeCandidate(segment_ids) in merge_candidates { -// self.start_merge(segment_ids, None); -// } -// } - -// fn consider_merge_options(&self,) -> Vec { -// let (committed_segments, uncommitted_segments) = get_segment_ready_for_commit(&self.segment_manager); -// // Committed segments cannot be merged with uncommitted_segments. -// // We therefore consider merges using these two sets of segments independantly. -// let mut merge_candidates = self.merge_policy.compute_merge_candidates(&uncommitted_segments); -// let committed_merge_candidates = self.merge_policy.compute_merge_candidates(&committed_segments); -// merge_candidates.extend_from_slice(&committed_merge_candidates[..]); -// merge_candidates -// } - -// pub fn start(self) -> JoinHandle<()> { -// thread::Builder::new() -// .name("segment_update".to_string()) -// .spawn(move || { -// self.process(); -// }) -// .expect("Failed to start segment updater thread.") -// } - -// fn process(mut self) { - -// let mut complete_option = None; - -// for (complete, segment_update) in self.segment_update_receiver.clone() { + let merging_thread_id = self.get_merging_thread_id(); + let (merging_future_send, merging_future_recv) = oneshot(); + let merging_join_handle = thread::spawn(move || { -// if let SegmentUpdate::Terminate = segment_update { -// complete_option = Some(complete); -// break; -// } + info!("Start merge: {:?}", segment_ids); -// if let SegmentUpdate::StartMerge(segment_ids) = segment_update { -// self.start_merge(segment_ids, 
Some(complete)); -// } -// else { -// self.process_one(segment_update); - -// // - start merges if required -// self.start_merges(); -// complete.complete(()); -// } -// } - -// let mut merging_threads = HashMap::new(); -// mem::swap(&mut merging_threads, &mut self.merging_threads); -// for (_, merging_thread_handle) in merging_threads { -// match merging_thread_handle.join() { -// Ok((segment_ids, segment_entry)) => { -// self.end_merge(segment_ids, segment_entry); -// } -// Err(e) => { -// error!("Error in merging thread {:?}", e); -// break; -// } -// } -// } + let ref index = segment_updater_clone.0.index; + let schema = index.schema(); + let segments: Vec = segment_ids + .iter() + .map(|&segment_id| index.segment(segment_id)) + .collect(); + + // An IndexMerger is like a "view" of our merged segments. + // TODO unwrap + let merger: IndexMerger = IndexMerger::open(schema, &segments[..]).expect("Creating index merger failed"); + let mut merged_segment = index.new_segment(); + + // ... we just serialize this index merger in our new segment + // to merge the two segments. 
+ let segment_serializer = SegmentSerializer::for_segment(&mut merged_segment).expect("Creating index serializer failed"); + let num_docs = merger.write(segment_serializer).expect("Serializing merged index failed"); + let segment_meta = SegmentMeta { + segment_id: merged_segment.id(), + num_docs: num_docs, + num_deleted_docs: 0u32, + }; -// if let Some(complete) = complete_option { -// complete.complete(()); -// } -// } + // TODO fix delete cursor + let delete_queue = DeleteQueue::default(); + + let segment_entry = SegmentEntry::new(segment_meta, delete_queue.cursor()); + segment_updater_clone + .end_merge(segment_ids.clone(), segment_entry.clone()) + .wait() + .unwrap(); + merging_future_send.complete(segment_entry.clone()); + segment_updater_clone.0.merging_threads.write().unwrap().remove(&merging_thread_id); + segment_entry + }); + self.0.merging_threads.write().unwrap().insert(merging_thread_id, merging_join_handle); + merging_future_recv + } + + + fn consider_merge_options(&self) { + let (committed_segments, uncommitted_segments) = get_segments(&self.0.segment_manager); + // Committed segments cannot be merged with uncommitted_segments. + // We therefore consider merges using these two sets of segments independently. 
+ let merge_policy = self.get_merge_policy(); + let mut merge_candidates = merge_policy.compute_merge_candidates(&uncommitted_segments); + let committed_merge_candidates = merge_policy.compute_merge_candidates(&committed_segments); + merge_candidates.extend_from_slice(&committed_merge_candidates[..]); + for MergeCandidate(segment_ids) in merge_candidates { + self.start_merge(segment_ids); + } + } + fn end_merge(&self, + merged_segment_ids: Vec, + resulting_segment_entry: SegmentEntry) -> impl Future { + + self.run_async(move |segment_updater| { + segment_updater.0.segment_manager.end_merge(&merged_segment_ids, resulting_segment_entry); + let mut directory = segment_updater.0.index.directory().box_clone(); + save_metas( + &segment_updater.0.segment_manager, + segment_updater.0.index.schema(), + segment_updater.0.index.docstamp(), + directory.borrow_mut()).expect("Could not save metas."); + for segment_id in merged_segment_ids { + segment_updater.0.index.delete_segment(segment_id); + } + }) + + } - -// // Process a single segment update. -// pub fn process_one( -// &mut self, -// segment_update: SegmentUpdate) { - -// info!("Segment update: {:?}", segment_update); - -// use self::SegmentUpdate::*; -// match segment_update { -// AddSegment(generation, segment_entry) => { -// if !self.is_cancelled_generation { -// self.segment_manager.add_segment(segment_entry); -// } -// else { -// // rollback has been called and this -// // segment actually belong to the -// // documents that have been dropped. -// // -// // Let's just remove its files. 
-// self.index.delete_segment(segment_entry.segment_id()); -// } -// } -// StartMerge(segment_ids) => { -// panic!("this should have been handled somewhere else"); -// } -// EndMerge(merging_thread_id_opt, segment_ids, segment_entry) => { -// self.end_merge( -// segment_ids, -// segment_entry); -// if let Some(merging_thread_id) = merging_thread_id_opt { -// self.merging_threads.remove(&merging_thread_id); -// } -// } -// CancelGeneration => { -// // Called during rollback. The segment -// // that will arrive will be ignored -// // until a NewGeneration is update arrives. -// self.is_cancelled_generation = true; -// } -// NewGeneration => { -// // After rollback, we can resume -// // indexing new documents. -// self.is_cancelled_generation = false; -// } -// Commit(docstamp) => { -// self.segment_manager.commit(docstamp); -// save_metas( -// &self.segment_manager, -// self.index.schema(), -// self.index.docstamp(), -// self.index.directory_mut()).expect("Could not save metas."); -// match self.index.load_searchers() { -// Ok(()) => {} -// Err(e) => { -// error!("Failure while loading new searchers {:?}", e); -// panic!(format!("Failure while loading new searchers {:?}", e)); -// } -// } -// } -// Terminate => { -// panic!("We should have left the loop before processing it."); -// } -// } -// } -// } + pub fn wait_merging_thread(&self) -> thread::Result<()> { + let mut new_merging_threads = HashMap::new(); + { + let mut merging_threads = self.0.merging_threads.write().unwrap(); + mem::swap(&mut new_merging_threads, merging_threads.deref_mut()); + } + for (_, merging_thread_handle) in new_merging_threads { + merging_thread_handle + .join() + .map(|_| ())? 
+ } + Ok(()) + } + +} diff --git a/src/lib.rs b/src/lib.rs index ed0d726fa..96d65e639 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -49,7 +49,6 @@ extern crate chan; extern crate crossbeam; extern crate bit_set; extern crate notify; -extern crate eventual; extern crate futures; extern crate futures_cpupool; @@ -245,6 +244,7 @@ mod tests { index_writer.commit().unwrap(); } { + index.load_searchers().unwrap(); let searcher = index.searcher(); let term_a = Term::from_field_text(text_field, "a"); assert_eq!(searcher.doc_freq(&term_a), 3); @@ -280,7 +280,7 @@ mod tests { index_writer.commit().unwrap(); } { - + index.load_searchers().unwrap(); let searcher = index.searcher(); let segment_reader: &SegmentReader = searcher.segment_reader(0); let fieldnorms_reader = segment_reader.get_fieldnorms_reader(text_field).unwrap(); @@ -306,6 +306,7 @@ mod tests { index_writer.commit().unwrap(); } { + index.load_searchers().unwrap(); let searcher = index.searcher(); let reader = searcher.segment_reader(0); assert!(reader.read_postings_all_info(&Term::from_field_text(text_field, "abcd")).is_none()); @@ -342,6 +343,7 @@ mod tests { index_writer.commit().unwrap(); } { + index.load_searchers().unwrap(); let searcher = index.searcher(); let get_doc_ids = |terms: Vec| { let query = BooleanQuery::new_multiterms_query(terms); diff --git a/src/postings/mod.rs b/src/postings/mod.rs index ca70512b8..f9898b9fc 100644 --- a/src/postings/mod.rs +++ b/src/postings/mod.rs @@ -189,6 +189,7 @@ mod tests { } assert!(index_writer.commit().is_ok()); } + index.load_searchers().unwrap(); let term_query = TermQuery::new(Term::from_field_text(text_field, "a"), SegmentPostingsOption::NoFreq); let searcher = index.searcher(); let mut term_weight = term_query.specialized_weight(&*searcher); @@ -256,6 +257,7 @@ mod tests { } assert!(index_writer.commit().is_ok()); } + index.load_searchers().unwrap(); index }; } @@ -275,7 +277,6 @@ mod tests { fn bench_segment_intersection(b: &mut Bencher) { let searcher = 
INDEX.searcher(); let segment_reader = searcher.segment_reader(0); - b.iter(|| { let segment_postings_a = segment_reader.read_postings(&*TERM_A, SegmentPostingsOption::NoFreq).unwrap(); let segment_postings_b = segment_reader.read_postings(&*TERM_B, SegmentPostingsOption::NoFreq).unwrap(); diff --git a/src/query/phrase_query/mod.rs b/src/query/phrase_query/mod.rs index 6983ff65e..e01743eb3 100644 --- a/src/query/phrase_query/mod.rs +++ b/src/query/phrase_query/mod.rs @@ -48,6 +48,7 @@ mod tests { assert!(index_writer.commit().is_ok()); } + index.load_searchers().unwrap(); let searcher = index.searcher(); let test_query = |texts: Vec<&str>| { let mut test_collector = TestCollector::default(); From 09782858daea57d08b4ea3da620e629d01b4a84a Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Wed, 1 Feb 2017 10:06:32 +0900 Subject: [PATCH 017/107] issue/43 Segment have a commit opstamp --- src/core/index.rs | 78 ++++++++++++++++++++++++++------- src/core/index_meta.rs | 4 +- src/core/segment.rs | 42 ++---------------- src/core/segment_meta.rs | 2 + src/error.rs | 2 +- src/indexer/index_writer.rs | 41 +++++++++-------- src/indexer/segment_manager.rs | 19 +++++--- src/indexer/segment_register.rs | 5 ++- src/indexer/segment_updater.rs | 66 +++++++++++++++++++--------- src/postings/mod.rs | 4 +- 10 files changed, 157 insertions(+), 106 deletions(-) diff --git a/src/core/index.rs b/src/core/index.rs index 05296188a..6b62763bd 100644 --- a/src/core/index.rs +++ b/src/core/index.rs @@ -21,6 +21,7 @@ use core::IndexMeta; use core::META_FILEPATH; use super::segment::create_segment; use indexer::segment_updater::save_new_metas; +use directory::error::{FileError, OpenWriteError}; const NUM_SEARCHERS: usize = 12; @@ -41,6 +42,45 @@ pub struct Index { docstamp: u64, } + + + +/// Deletes all of the document of the segment. +/// This is called when there is a merge or a rollback. +/// +/// # Disclaimer +/// If deletion of a file fails (e.g. 
a file +/// was read-only.), the method does not +/// fail and just logs an error when it fails. +pub fn delete_segment(directory: &Directory, segment_id: SegmentId) { + info!("Deleting segment {:?}", segment_id); + let segment_filepaths_res = directory.ls_starting_with( + &*segment_id.uuid_string() + ); + + match segment_filepaths_res { + Ok(segment_filepaths) => { + for segment_filepath in &segment_filepaths { + if let Err(err) = directory.delete(&segment_filepath) { + match err { + FileError::FileDoesNotExist(_) => { + // this is normal behavior. + // the position file for instance may not exists. + } + FileError::IOError(err) => { + error!("Failed to remove {:?} : {:?}", segment_id, err); + } + } + } + } + } + Err(_) => { + error!("Failed to list files of segment {:?} for deletion.", segment_id.uuid_string()); + } + } +} + + impl Index { /// Creates a new index using the `RAMDirectory`. /// @@ -76,7 +116,7 @@ impl Index { /// Creates a new index given a directory and an `IndexMeta`. fn create_from_metas(directory: Box, metas: IndexMeta) -> Result { let schema = metas.schema.clone(); - let docstamp = metas.docstamp; + let docstamp = metas.opstamp; // TODO log somethings is uncommitted is not empty. let index = Index { directory: directory, @@ -143,27 +183,33 @@ impl Index { /// Returns the list of segments that are searchable pub fn searchable_segments(&self) -> Result> { - let searchable_segment_ids = self.searchable_segment_ids()?; + let metas = load_metas(self.directory())?; + let searchable_segment_ids = metas + .committed_segments + .iter() + .map(|segment_meta| segment_meta.segment_id) + .collect::>(); + let commit_opstamp = metas.opstamp; Ok(searchable_segment_ids .into_iter() - .map(|segment_id| self.segment(segment_id)) + .map(|segment_id| self.segment(segment_id, commit_opstamp)) .collect()) } - + /// Remove all of the file associated with the segment. /// /// This method cannot fail. If a problem occurs, /// some files may end up never being removed. 
/// The error will only be logged. pub fn delete_segment(&self, segment_id: SegmentId) { - self.segment(segment_id).delete(); + delete_segment(self.directory(), segment_id); } /// Return a segment object given a `segment_id` /// /// The segment may or may not exist. - pub fn segment(&self, segment_id: SegmentId) -> Segment { - create_segment(self.clone(), segment_id) + pub fn segment(&self, segment_id: SegmentId, commit_opstamp: u64) -> Segment { + create_segment(self.clone(), segment_id, commit_opstamp) } /// Return a reference to the index directory. @@ -179,24 +225,22 @@ impl Index { /// Reads the meta.json and returns the list of /// committed segments. pub fn committed_segments(&self) -> Result> { + Ok(load_metas(self.directory())?.committed_segments) } /// Returns the list of segment ids that are searchable. pub fn searchable_segment_ids(&self) -> Result> { - self.committed_segments() - .map(|commited_segments| { - commited_segments - .iter() - .map(|segment_meta| segment_meta.segment_id) - .collect() - }) - + Ok(load_metas(self.directory())? + .committed_segments + .iter() + .map(|segment_meta| segment_meta.segment_id) + .collect()) } /// Creates a new segment. 
- pub fn new_segment(&self) -> Segment { - self.segment(SegmentId::generate_random()) + pub fn new_segment(&self, opstamp: u64) -> Segment { + self.segment(SegmentId::generate_random(), opstamp) } /// Creates a new generation of searchers after diff --git a/src/core/index_meta.rs b/src/core/index_meta.rs index c6d7f4bc5..c97ed9570 100644 --- a/src/core/index_meta.rs +++ b/src/core/index_meta.rs @@ -14,7 +14,7 @@ pub struct IndexMeta { pub committed_segments: Vec, pub uncommitted_segments: Vec, pub schema: Schema, - pub docstamp: u64, + pub opstamp: u64, } impl IndexMeta { @@ -23,7 +23,7 @@ impl IndexMeta { committed_segments: Vec::new(), uncommitted_segments: Vec::new(), schema: schema, - docstamp: 0u64, + opstamp: 0u64, } } } diff --git a/src/core/segment.rs b/src/core/segment.rs index aab25574c..4e719c0e3 100644 --- a/src/core/segment.rs +++ b/src/core/segment.rs @@ -11,13 +11,12 @@ use core::Index; use std::result; use directory::error::{FileError, OpenWriteError}; - - /// A segment is a piece of the index. #[derive(Clone)] pub struct Segment { index: Index, segment_id: SegmentId, + commit_opstamp: u64, } impl fmt::Debug for Segment { @@ -30,10 +29,11 @@ impl fmt::Debug for Segment { /// Creates a new segment given an `Index` and a `SegmentId` /// /// The function is here to make it private outside `tantivy`. -pub fn create_segment(index: Index, segment_id: SegmentId) -> Segment { +pub fn create_segment(index: Index, segment_id: SegmentId, commit_opstamp: u64) -> Segment { Segment { index: index, segment_id: segment_id, + commit_opstamp: commit_opstamp, } } @@ -59,42 +59,6 @@ impl Segment { self.segment_id.relative_path(component) } - /// Deletes all of the document of the segment. - /// This is called when there is a merge or a rollback. - /// - /// # Disclaimer - /// If deletion of a file fails (e.g. a file - /// was read-only.), the method does not - /// fail and just logs an error when it fails. 
- pub fn delete(&self) { - info!("Deleting segment {:?}", self.segment_id); - let segment_filepaths_res = self.index.directory().ls_starting_with( - &*self.segment_id.uuid_string() - ); - - match segment_filepaths_res { - Ok(segment_filepaths) => { - for segment_filepath in &segment_filepaths { - if let Err(err) = self.index.directory().delete(&segment_filepath) { - match err { - FileError::FileDoesNotExist(_) => { - // this is normal behavior. - // the position file for instance may not exists. - } - FileError::IOError(err) => { - error!("Failed to remove {:?} : {:?}", self.segment_id, err); - } - } - } - } - } - Err(_) => { - error!("Failed to list files of segment {:?} for deletion.", self.segment_id.uuid_string()); - } - } - } - - /// Open one of the component file for read. pub fn open_read(&self, component: SegmentComponent) -> result::Result { let path = self.relative_path(component); diff --git a/src/core/segment_meta.rs b/src/core/segment_meta.rs index ef7521818..3d001e896 100644 --- a/src/core/segment_meta.rs +++ b/src/core/segment_meta.rs @@ -5,6 +5,7 @@ pub struct SegmentMeta { pub segment_id: SegmentId, pub num_docs: u32, pub num_deleted_docs: u32, + pub opstamp: u64, } #[cfg(test)] @@ -14,6 +15,7 @@ impl SegmentMeta { segment_id: segment_id, num_docs: num_docs, num_deleted_docs: 0, + opstamp: 0u64, } } } \ No newline at end of file diff --git a/src/error.rs b/src/error.rs index 6699c7134..6a82c239d 100644 --- a/src/error.rs +++ b/src/error.rs @@ -32,7 +32,7 @@ pub enum Error { /// The data within is corrupted. /// /// For instance, it contains invalid JSON. - CorruptedFile(PathBuf, Box), + CorruptedFile(PathBuf, Box), /// Invalid argument was passed by the user. 
InvalidArgument(String), /// An Error happened in one of the thread diff --git a/src/indexer/index_writer.rs b/src/indexer/index_writer.rs index e4578c51f..84e375acf 100644 --- a/src/indexer/index_writer.rs +++ b/src/indexer/index_writer.rs @@ -125,6 +125,7 @@ fn index_documents(heap: &mut Heap, segment_id: segment_id, num_docs: num_docs, num_deleted_docs: num_deleted_docs as u32, + opstamp: last_opstamp, }; let segment_entry = SegmentEntry::new(segment_meta, delete_cursor.clone()); @@ -185,14 +186,13 @@ impl IndexWriter { let generation = self.generation; - let join_handle: JoinHandle> = try!(thread::Builder::new() + let join_handle: JoinHandle> = + thread::Builder::new() .name(format!("indexing thread {} for gen {}", self.worker_id, generation)) .spawn(move || { let mut delete_cursor_clone = delete_cursor.clone(); loop { - let segment = index.new_segment(); - let mut document_iterator = document_receiver_clone.clone() .into_iter() .peekable(); @@ -203,25 +203,28 @@ impl IndexWriter { // this is a valid guarantee as the // peeked document now belongs to // our local iterator. - if document_iterator.peek().is_some() { - let valid_generation = try!(index_documents(&mut heap, - segment, - &schema, - generation, - &mut document_iterator, - &mut segment_updater, - &mut delete_cursor_clone)); - if valid_generation { - return Ok(()); - } - } else { + let opstamp: u64; + if let Some(operation) = document_iterator.peek() { + opstamp = operation.opstamp; + } + else { // No more documents. // Happens when there is a commit, or if the `IndexWriter` // was dropped. 
- return Ok(()); + opstamp = 0u64; + return Ok(()) } + + let segment = index.new_segment(opstamp); + let valid_generation = index_documents(&mut heap, + segment, + &schema, + generation, + &mut document_iterator, + &mut segment_updater, + &mut delete_cursor_clone)?; } - })); + })?; self.worker_id += 1; self.workers_join_handle.push(join_handle); Ok(()) @@ -308,8 +311,8 @@ impl IndexWriter { } /// Merges a given list of segments - pub fn merge(&mut self, segments: &[SegmentId]) -> impl Future { - self.segment_updater.start_merge(segments.to_vec()) + pub fn merge(&mut self, segment_ids: &[SegmentId]) -> impl Future { + self.segment_updater.start_merge(segment_ids) } /// Closes the current document channel send. diff --git a/src/indexer/segment_manager.rs b/src/indexer/segment_manager.rs index 20ccfcdfa..41d0ded70 100644 --- a/src/indexer/segment_manager.rs +++ b/src/indexer/segment_manager.rs @@ -63,6 +63,14 @@ impl SegmentManager { }), } } + + pub fn segment_entry(&self, segment_id: &SegmentId) -> Option { + let registers = self.read(); + registers + .committed + .segment_entry(segment_id) + .or_else(|| registers.uncommitted.segment_entry(segment_id)) + } // Lock poisoning should never happen : // The lock is acquired and released within this class, @@ -113,16 +121,17 @@ impl SegmentManager { registers_lock.uncommitted.add_segment_entry(segment_entry); } - pub fn end_merge(&self, merged_segment_ids: &[SegmentId], merged_segment_entry: SegmentEntry) { + pub fn end_merge(&self, merged_segment_metas: &[SegmentMeta], merged_segment_entry: SegmentEntry) { let mut registers_lock = self.write(); - if registers_lock.uncommitted.contains_all(merged_segment_ids) { - for segment_id in merged_segment_ids { + let merged_segment_ids: Vec = merged_segment_metas.iter().map(|meta| meta.segment_id).collect(); + if registers_lock.uncommitted.contains_all(&merged_segment_ids) { + for segment_id in &merged_segment_ids { registers_lock.uncommitted.remove_segment(segment_id); } 
registers_lock.uncommitted.add_segment_entry(merged_segment_entry); } - else if registers_lock.committed.contains_all(merged_segment_ids) { - for segment_id in merged_segment_ids { + else if registers_lock.committed.contains_all(&merged_segment_ids) { + for segment_id in &merged_segment_ids { registers_lock.committed.remove_segment(segment_id); } registers_lock.committed.add_segment_entry(merged_segment_entry); diff --git a/src/indexer/segment_register.rs b/src/indexer/segment_register.rs index 8f8f20611..982e320c0 100644 --- a/src/indexer/segment_register.rs +++ b/src/indexer/segment_register.rs @@ -32,6 +32,10 @@ impl SegmentEntry { pub fn segment_id(&self) -> SegmentId { self.meta.segment_id } + + pub fn meta(&self) -> &SegmentMeta { + &self.meta + } fn start_merge(&mut self,) { self.state = SegmentState::InMerge; @@ -120,7 +124,6 @@ impl SegmentRegister { .collect() } - #[cfg(test)] pub fn segment_entry(&self, segment_id: &SegmentId) -> Option { self.segment_states .get(&segment_id) diff --git a/src/indexer/segment_updater.rs b/src/indexer/segment_updater.rs index 2622d97a1..d3e274d69 100644 --- a/src/indexer/segment_updater.rs +++ b/src/indexer/segment_updater.rs @@ -1,6 +1,7 @@ #![allow(for_kv_map)] use core::Index; +use Error; use core::Segment; use indexer::{MergePolicy, DefaultMergePolicy}; use core::SegmentId; @@ -35,13 +36,13 @@ use std::io::Write; use super::segment_manager::{SegmentManager, get_segments}; -fn create_metas(segment_manager: &SegmentManager, schema: Schema, docstamp: u64) -> IndexMeta { +fn create_metas(segment_manager: &SegmentManager, schema: Schema, opstamp: u64) -> IndexMeta { let (committed_segments, uncommitted_segments) = segment_manager.segment_metas(); IndexMeta { committed_segments: committed_segments, uncommitted_segments: uncommitted_segments, schema: schema, - docstamp: docstamp, + opstamp: opstamp, } } @@ -104,7 +105,7 @@ struct InnerSegmentUpdater { segment_manager: SegmentManager, merge_policy: RwLock>, merging_thread_id: 
AtomicUsize, - merging_threads: RwLock>>, + merging_threads: RwLock>>>, generation: AtomicUsize, } @@ -184,28 +185,52 @@ impl SegmentUpdater { } - pub fn start_merge(&self, segment_ids: Vec) -> impl Future { + pub fn start_merge(&self, segment_ids: &[SegmentId]) -> impl Future { - self.0.segment_manager.start_merge(&segment_ids); + self.0.segment_manager.start_merge(segment_ids); let segment_updater_clone = self.clone(); - + + let segment_ids_vec = segment_ids.to_vec(); + let merging_thread_id = self.get_merging_thread_id(); let (merging_future_send, merging_future_recv) = oneshot(); + + if segment_ids.is_empty() { + return merging_future_recv; + } + let merging_join_handle = thread::spawn(move || { - info!("Start merge: {:?}", segment_ids); + info!("Start merge: {:?}", segment_ids_vec); let ref index = segment_updater_clone.0.index; let schema = index.schema(); - let segments: Vec = segment_ids + let segment_metas: Vec = segment_ids_vec .iter() - .map(|&segment_id| index.segment(segment_id)) + .map(|segment_id| + segment_updater_clone.0.segment_manager + .segment_entry(segment_id) + .map(|segment_entry| segment_entry.meta().clone()) + .ok_or(Error::InvalidArgument(format!("Segment({:?}) does not exist anymore", segment_id))) + ) + .collect::>()?; + + let segments: Vec = segment_metas + .iter() + .map(|ref segment_metas| index.segment(segment_metas.segment_id, segment_metas.opstamp)) .collect(); // An IndexMerger is like a "view" of our merged segments. // TODO unwrap let merger: IndexMerger = IndexMerger::open(schema, &segments[..]).expect("Creating index merger failed"); - let mut merged_segment = index.new_segment(); + + let opstamp = segment_metas + .iter() + .map(|meta| meta.opstamp) + .max() + .unwrap(); + + let mut merged_segment = index.new_segment(opstamp); // ... we just serialize this index merger in our new segment // to merge the two segments. 
@@ -215,19 +240,20 @@ impl SegmentUpdater { segment_id: merged_segment.id(), num_docs: num_docs, num_deleted_docs: 0u32, + opstamp: opstamp, }; - + // TODO fix delete cursor let delete_queue = DeleteQueue::default(); let segment_entry = SegmentEntry::new(segment_meta, delete_queue.cursor()); segment_updater_clone - .end_merge(segment_ids.clone(), segment_entry.clone()) + .end_merge(segment_metas.clone(), segment_entry.clone()) .wait() .unwrap(); merging_future_send.complete(segment_entry.clone()); segment_updater_clone.0.merging_threads.write().unwrap().remove(&merging_thread_id); - segment_entry + Ok(segment_entry) }); self.0.merging_threads.write().unwrap().insert(merging_thread_id, merging_join_handle); merging_future_recv @@ -242,26 +268,26 @@ impl SegmentUpdater { let mut merge_candidates = merge_policy.compute_merge_candidates(&uncommitted_segments); let committed_merge_candidates = merge_policy.compute_merge_candidates(&committed_segments); merge_candidates.extend_from_slice(&committed_merge_candidates[..]); - for MergeCandidate(segment_ids) in merge_candidates { - self.start_merge(segment_ids); + for MergeCandidate(segment_metas) in merge_candidates { + self.start_merge(&segment_metas); } } - fn end_merge(&self, - merged_segment_ids: Vec, + fn end_merge(&self, + merged_segment_metas: Vec, resulting_segment_entry: SegmentEntry) -> impl Future { self.run_async(move |segment_updater| { - segment_updater.0.segment_manager.end_merge(&merged_segment_ids, resulting_segment_entry); + segment_updater.0.segment_manager.end_merge(&merged_segment_metas, resulting_segment_entry); let mut directory = segment_updater.0.index.directory().box_clone(); save_metas( &segment_updater.0.segment_manager, segment_updater.0.index.schema(), segment_updater.0.index.docstamp(), directory.borrow_mut()).expect("Could not save metas."); - for segment_id in merged_segment_ids { - segment_updater.0.index.delete_segment(segment_id); + for segment_meta in merged_segment_metas { + 
segment_updater.0.index.delete_segment(segment_meta.segment_id); } }) diff --git a/src/postings/mod.rs b/src/postings/mod.rs index f9898b9fc..b7676710f 100644 --- a/src/postings/mod.rs +++ b/src/postings/mod.rs @@ -61,7 +61,7 @@ mod tests { let text_field = schema_builder.add_text_field("text", TEXT); let schema = schema_builder.build(); let index = Index::create_in_ram(schema); - let mut segment = index.new_segment(); + let mut segment = index.new_segment(0u64); let mut posting_serializer = PostingsSerializer::open(&mut segment).unwrap(); let term = Term::from_field_text(text_field, "abc"); posting_serializer.new_term(&term, 3).unwrap(); @@ -81,7 +81,7 @@ mod tests { let text_field = schema_builder.add_text_field("text", TEXT); let schema = schema_builder.build(); let index = Index::create_in_ram(schema.clone()); - let segment = index.new_segment(); + let segment = index.new_segment(0u64); let heap = Heap::with_capacity(10_000_000); { let mut segment_writer = SegmentWriter::for_segment(&heap, segment.clone(), &schema).unwrap(); From 0820992141e77d9c2a535ae6ee48b750da9ef9f3 Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Sat, 4 Feb 2017 16:21:24 +0900 Subject: [PATCH 018/107] issue/43 docstamp -> opstamp --- src/core/index.rs | 16 ++++----- src/core/segment.rs | 9 ++--- src/core/segment_component.rs | 4 ++- src/core/segment_reader.rs | 26 ++++++++++++-- src/fastfield/delete.rs | 45 ++++++++++++++++++++---- src/indexer/index_writer.rs | 34 +++++++++--------- src/indexer/merger.rs | 2 +- src/indexer/segment_updater.rs | 12 +++---- src/indexer/segment_writer.rs | 33 ++++++++++++++++++ src/lib.rs | 59 ++++++++++++++++++++++++++++++++ src/postings/docset.rs | 4 +++ src/postings/segment_postings.rs | 27 ++++++++++----- 12 files changed, 216 insertions(+), 55 deletions(-) diff --git a/src/core/index.rs b/src/core/index.rs index 6b62763bd..85005c6e9 100644 --- a/src/core/index.rs +++ b/src/core/index.rs @@ -39,7 +39,7 @@ pub struct Index { directory: Box, schema: Schema, 
searcher_pool: Arc>, - docstamp: u64, + opstamp: u64, } @@ -116,13 +116,13 @@ impl Index { /// Creates a new index given a directory and an `IndexMeta`. fn create_from_metas(directory: Box, metas: IndexMeta) -> Result { let schema = metas.schema.clone(); - let docstamp = metas.opstamp; + let opstamp = metas.opstamp; // TODO log somethings is uncommitted is not empty. let index = Index { directory: directory, schema: schema, searcher_pool: Arc::new(Pool::new()), - docstamp: docstamp, + opstamp: opstamp, }; try!(index.load_searchers()); Ok(index) @@ -141,12 +141,12 @@ impl Index { Index::create_from_metas(directory.box_clone(), metas) } - /// Returns the index docstamp. + /// Returns the index opstamp. /// - /// The docstamp is the number of documents that have been added + /// The opstamp is the number of documents that have been added /// from the beginning of time, and until the moment of the last commit. - pub fn docstamp(&self) -> u64 { - self.docstamp + pub fn opstamp(&self) -> u64 { + self.opstamp } /// Creates a multithreaded writer. @@ -291,7 +291,7 @@ impl Clone for Index { directory: self.directory.box_clone(), schema: self.schema.clone(), searcher_pool: self.searcher_pool.clone(), - docstamp: self.docstamp, + opstamp: self.opstamp, } } } diff --git a/src/core/segment.rs b/src/core/segment.rs index 4e719c0e3..57e87f7e8 100644 --- a/src/core/segment.rs +++ b/src/core/segment.rs @@ -25,7 +25,6 @@ impl fmt::Debug for Segment { } } - /// Creates a new segment given an `Index` and a `SegmentId` /// /// The function is here to make it private outside `tantivy`. @@ -38,18 +37,20 @@ pub fn create_segment(index: Index, segment_id: SegmentId, commit_opstamp: u64) } impl Segment { - - + /// Returns our index's schema. pub fn schema(&self,) -> Schema { self.index.schema() } + pub fn commit_opstamp(&self) -> u64 { + self.commit_opstamp + } + /// Returns the segment's id. 
pub fn id(&self,) -> SegmentId { self.segment_id } - /// Returns the relative path of a component of our segment. /// diff --git a/src/core/segment_component.rs b/src/core/segment_component.rs index 62610af3f..57994ddf9 100644 --- a/src/core/segment_component.rs +++ b/src/core/segment_component.rs @@ -23,7 +23,9 @@ impl SegmentComponent { SegmentComponent::STORE => ".store".to_string(), SegmentComponent::FASTFIELDS => ".fast".to_string(), SegmentComponent::FIELDNORMS => ".fieldnorm".to_string(), - SegmentComponent::DELETE(opstamp) => format!("{}.del", opstamp) + SegmentComponent::DELETE(opstamp) => { + format!(".{}.del", opstamp) + } } } } diff --git a/src/core/segment_reader.rs b/src/core/segment_reader.rs index d8518f87f..c1e44e754 100644 --- a/src/core/segment_reader.rs +++ b/src/core/segment_reader.rs @@ -3,9 +3,13 @@ use core::Segment; use core::SegmentId; use core::SegmentComponent; use schema::Term; +use bit_set::BitSet; +use common::HasLen; +use fastfield::delete::DeleteBitSet; use store::StoreReader; use schema::Document; use directory::ReadOnlySource; +use directory::error::FileError; use DocId; use std::io; use std::str; @@ -44,6 +48,7 @@ pub struct SegmentReader { store_reader: StoreReader, fast_fields_reader: U32FastFieldsReader, fieldnorms_reader: U32FastFieldsReader, + delete_bitset: DeleteBitSet, positions_data: ReadOnlySource, schema: Schema, } @@ -63,9 +68,13 @@ impl SegmentReader { /// Today, `tantivy` does not handle deletes so max doc and /// num_docs are the same. pub fn num_docs(&self) -> DocId { - self.segment_info.max_doc + self.segment_info.max_doc - self.num_deleted_docs() } + pub fn num_deleted_docs(&self) -> DocId { + self.delete_bitset.len() as DocId + } + /// Accessor to a segment's fast field reader given a field. 
pub fn get_fast_field_reader(&self, field: Field) -> io::Result { let field_entry = self.schema.get_field_entry(field); @@ -137,6 +146,15 @@ impl SegmentReader { .open_read(SegmentComponent::POSITIONS) .unwrap_or_else(|_| ReadOnlySource::empty()); + // TODO 0u64 + let delete_data_res = segment.open_read(SegmentComponent::DELETE(segment.commit_opstamp())); + let delete_bitset; + if let Err(FileError::FileDoesNotExist(_)) = delete_data_res { + delete_bitset = DeleteBitSet::empty(); + } + else { + delete_bitset = DeleteBitSet::open(delete_data_res?); + } let schema = segment.schema(); Ok(SegmentReader { segment_info: segment_info, @@ -146,6 +164,7 @@ impl SegmentReader { store_reader: store_reader, fast_fields_reader: fast_fields_reader, fieldnorms_reader: fieldnorms_reader, + delete_bitset: delete_bitset, positions_data: positions_data, schema: schema, }) @@ -214,9 +233,10 @@ impl SegmentReader { FreqHandler::new_without_freq() } }; - Some(SegmentPostings::from_data(term_info.doc_freq, postings_data, freq_handler)) + Some(SegmentPostings::from_data(term_info.doc_freq, postings_data, &self.delete_bitset, freq_handler)) } - + + /// Returns the posting list associated with a term. 
pub fn read_postings_all_info(&self, term: &Term) -> Option { let field_entry = self.schema.get_field_entry(term.field()); diff --git a/src/fastfield/delete.rs b/src/fastfield/delete.rs index e700bc3e7..03f3b13b4 100644 --- a/src/fastfield/delete.rs +++ b/src/fastfield/delete.rs @@ -4,6 +4,7 @@ use std::io::Write; use std::io; use directory::ReadOnlySource; use DocId; +use common::HasLen; pub fn write_delete_bitset(delete_bitset: &BitSet, writer: &mut WritePtr) -> io::Result<()> { let max_doc = delete_bitset.capacity(); @@ -28,23 +29,54 @@ pub fn write_delete_bitset(delete_bitset: &BitSet, writer: &mut WritePtr) -> io: writer.flush() } -pub struct DeleteBitSet(ReadOnlySource); +#[derive(Clone)] +pub struct DeleteBitSet { + data: ReadOnlySource, + len: usize, +} impl DeleteBitSet { pub fn open(data: ReadOnlySource) -> DeleteBitSet { - DeleteBitSet(data) + let num_deleted: usize = data + .as_slice() + .iter() + .map(|b| b.count_ones() as usize) + .sum(); + DeleteBitSet { + data: data, + len: num_deleted, + } + } + + pub fn empty() -> DeleteBitSet { + DeleteBitSet { + data: ReadOnlySource::empty(), + len: 0, + } } pub fn is_deleted(&self, doc: DocId) -> bool { - let byte_offset = doc / 8u32; - let b: u8 = (*self.0)[byte_offset as usize]; - let shift = (doc & 7u32) as u8; - b & (1u8 << shift) != 0 + if self.len == 0 { + false + } + else { + let byte_offset = doc / 8u32; + let b: u8 = (*self.data)[byte_offset as usize]; + let shift = (doc & 7u32) as u8; + b & (1u8 << shift) != 0 + } } + } +impl HasLen for DeleteBitSet { + + fn len(&self) -> usize { + self.len + } +} #[cfg(test)] mod tests { @@ -67,6 +99,7 @@ mod tests { for doc in 0..n { assert_eq!(bitset.contains(doc), delete_bitset.is_deleted(doc as DocId)); } + assert_eq!(delete_bitset.len(), bitset.len()); } } diff --git a/src/indexer/index_writer.rs b/src/indexer/index_writer.rs index 84e375acf..f33ba5773 100644 --- a/src/indexer/index_writer.rs +++ b/src/indexer/index_writer.rs @@ -75,8 +75,8 @@ pub struct 
IndexWriter { delete_queue: DeleteQueue, - uncommitted_docstamp: u64, - committed_docstamp: u64, + uncommitted_opstamp: u64, + committed_opstamp: u64, } // IndexWriter cannot be sent to another thread. @@ -211,7 +211,6 @@ impl IndexWriter { // No more documents. // Happens when there is a commit, or if the `IndexWriter` // was dropped. - opstamp = 0u64; return Ok(()) } @@ -282,8 +281,8 @@ impl IndexWriter { delete_queue: delete_queue, - committed_docstamp: index.docstamp(), - uncommitted_docstamp: index.docstamp(), + committed_opstamp: index.opstamp(), + uncommitted_opstamp: index.opstamp(), generation: 0, @@ -338,7 +337,7 @@ impl IndexWriter { /// After calling rollback, the index is in the same /// state as it was after the last commit. /// - /// The docstamp at the last commit is returned. + /// The opstamp at the last commit is returned. pub fn rollback(&mut self) -> Result { // by updating the generation in the segment updater, @@ -380,9 +379,9 @@ impl IndexWriter { Error::ErrorInThread("Error while waiting for rollback.".to_string()) )?; - // reset the docstamp - self.uncommitted_docstamp = self.committed_docstamp; - Ok(self.committed_docstamp) + // reset the opstamp + self.uncommitted_opstamp = self.committed_opstamp; + Ok(self.committed_opstamp) } @@ -397,7 +396,7 @@ impl IndexWriter { /// long as the hard disk is spared), it will be possible /// to resume indexing from this point. /// - /// Commit returns the `docstamp` of the last document + /// Commit returns the `opstamp` of the last document /// that made it in the commit. /// pub fn commit(&mut self) -> Result { @@ -406,9 +405,6 @@ impl IndexWriter { // and recreate a new one channels. self.recreate_document_channel(); - // Docstamp of the last document in this commit. 
- self.committed_docstamp = self.uncommitted_docstamp; - let mut former_workers_join_handle = Vec::new(); swap(&mut former_workers_join_handle, &mut self.workers_join_handle); @@ -430,13 +426,15 @@ impl IndexWriter { // This will move uncommitted segments to the state of // committed segments. - let future = self.segment_updater.commit(self.committed_docstamp); + + self.committed_opstamp = self.stamp(); + let future = self.segment_updater.commit(self.committed_opstamp); // wait for the segment update thread to have processed the info // TODO remove unwrap future.wait().unwrap(); - Ok(self.committed_docstamp) + Ok(self.committed_opstamp) } @@ -446,8 +444,8 @@ impl IndexWriter { } fn stamp(&mut self) -> u64 { - let opstamp = self.uncommitted_docstamp; - self.uncommitted_docstamp += 1u64; + let opstamp = self.uncommitted_opstamp; + self.uncommitted_opstamp += 1u64; opstamp } @@ -455,7 +453,7 @@ impl IndexWriter { /// /// If the indexing pipeline is full, this call may block. /// - /// The docstamp is an increasing `u64` that can + /// The opstamp is an increasing `u64` that can /// be used by the client to align commits with its own /// document queue. 
/// diff --git a/src/indexer/merger.rs b/src/indexer/merger.rs index 0f51d0652..15bc174f5 100644 --- a/src/indexer/merger.rs +++ b/src/indexer/merger.rs @@ -53,7 +53,7 @@ impl IndexMerger { let mut max_doc = 0; for segment in segments { let reader = try!(SegmentReader::open(segment.clone())); - max_doc += reader.max_doc(); + max_doc += reader.num_docs(); readers.push(reader); } Ok(IndexMerger { diff --git a/src/indexer/segment_updater.rs b/src/indexer/segment_updater.rs index d3e274d69..bb79b2721 100644 --- a/src/indexer/segment_updater.rs +++ b/src/indexer/segment_updater.rs @@ -57,11 +57,11 @@ fn create_metas(segment_manager: &SegmentManager, schema: Schema, opstamp: u64) /// /// This method is not part of tantivy's public API pub fn save_new_metas(schema: Schema, - docstamp: u64, + opstamp: u64, directory: &mut Directory) -> Result<()> { let segment_manager = SegmentManager::default(); - save_metas(&segment_manager, schema, docstamp, directory) + save_metas(&segment_manager, schema, opstamp, directory) } @@ -77,10 +77,10 @@ pub fn save_new_metas(schema: Schema, /// This method is not part of tantivy's public API pub fn save_metas(segment_manager: &SegmentManager, schema: Schema, - docstamp: u64, + opstamp: u64, directory: &mut Directory) -> Result<()> { - let metas = create_metas(segment_manager, schema, docstamp); + let metas = create_metas(segment_manager, schema, opstamp); let mut w = Vec::new(); try!(write!(&mut w, "{}\n", json::as_pretty_json(&metas))); Ok(directory @@ -178,7 +178,7 @@ impl SegmentUpdater { save_metas( &segment_updater.0.segment_manager, segment_updater.0.index.schema(), - segment_updater.0.index.docstamp(), + opstamp, directory.borrow_mut()).expect("Could not save metas."); segment_updater.consider_merge_options(); }) @@ -284,7 +284,7 @@ impl SegmentUpdater { save_metas( &segment_updater.0.segment_manager, segment_updater.0.index.schema(), - segment_updater.0.index.docstamp(), + segment_updater.0.index.opstamp(), 
directory.borrow_mut()).expect("Could not save metas."); for segment_meta in merged_segment_metas { segment_updater.0.index.delete_segment(segment_meta.segment_id); diff --git a/src/indexer/segment_writer.rs b/src/indexer/segment_writer.rs index 204eb9a37..d64bd77bd 100644 --- a/src/indexer/segment_writer.rs +++ b/src/indexer/segment_writer.rs @@ -22,8 +22,40 @@ use indexer::index_writer::MARGIN_IN_BYTES; use super::operation::AddOperation; use bit_set::BitSet; use indexer::document_receiver::DocumentReceiver; +use core::SegmentReader; +use postings::SegmentPostingsOption; +use postings::DocSet; + +fn update_deleted_bitset( + segment_reader: &SegmentReader, + bitset: &mut BitSet, + delete_cursor: &mut DeleteQueueCursor, + limit_opstamp_opt: Option) -> bool { + let mut has_changed = false; + let limit_opstamp = limit_opstamp_opt.unwrap_or(u64::max_value()); + loop { + if let Some(delete_op) = delete_cursor.peek() { + if delete_op.opstamp > limit_opstamp { + break; + } + if let Some(mut docset) = segment_reader.read_postings(&delete_op.term, SegmentPostingsOption::NoFreq) { + while docset.advance() { + has_changed = true; + let deleted_doc = docset.doc(); + bitset.insert(deleted_doc as usize); + } + } + } + else { + break; + } + delete_cursor.consume(); + } + has_changed +} + struct DocumentDeleter<'a> { limit_doc_id: DocId, deleted_docs: &'a mut BitSet, @@ -180,6 +212,7 @@ impl<'a> SegmentWriter<'a> { .expect("Last doc opstamp called on an empty segment writer")) } + /// TODO compute the bitset using the segment reader directly. 
pub fn compute_deleted_bitset(&self, delete_queue_cursor: &mut DeleteQueueCursor) -> Option { if let Some(first_opstamp) = self.doc_opstamps.first() { if !delete_queue_cursor.skip_to(*first_opstamp) { diff --git a/src/lib.rs b/src/lib.rs index 96d65e639..b2b3b36d8 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -290,6 +290,65 @@ mod tests { } } + + #[test] + fn test_delete_postings() { + let mut schema_builder = SchemaBuilder::default(); + let text_field = schema_builder.add_text_field("text", TEXT); + let schema = schema_builder.build(); + let index = Index::create_in_ram(schema); + { + // writing the segment + let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap(); + { + let doc = doc!(text_field=>"a b"); + index_writer.add_document(doc).unwrap(); + } + { + let doc = doc!(text_field=>" a c"); + index_writer.add_document(doc).unwrap(); + } + { + let doc = doc!(text_field=>" b c"); + index_writer.add_document(doc).unwrap(); + } + { + let doc = doc!(text_field=>" b d"); + index_writer.add_document(doc).unwrap(); + } + { + index_writer.delete_term(Term::from_field_text(text_field, "c")); + } + { + index_writer.delete_term(Term::from_field_text(text_field, "a")); + } + { + let doc = doc!(text_field=>" b c"); + index_writer.add_document(doc).unwrap(); + } + { + let doc = doc!(text_field=>" a"); + index_writer.add_document(doc).unwrap(); + } + index_writer.commit().unwrap(); + } + { + index.load_searchers().unwrap(); + let searcher = index.searcher(); + let reader = searcher.segment_reader(0); + assert!(reader.read_postings_all_info(&Term::from_field_text(text_field, "abcd")).is_none()); + let mut postings = reader.read_postings_all_info(&Term::from_field_text(text_field, "a")).unwrap(); + assert!(postings.advance()); + assert_eq!(postings.doc(), 2); + assert!(postings.advance()); + assert_eq!(postings.doc(), 3); + assert!(postings.advance()); + assert_eq!(postings.doc(), 5); + assert!(!postings.advance()); + } + } + + #[test] fn test_termfreq() { let 
mut schema_builder = SchemaBuilder::default(); diff --git a/src/postings/docset.rs b/src/postings/docset.rs index 9dda32559..6698c5a2b 100644 --- a/src/postings/docset.rs +++ b/src/postings/docset.rs @@ -55,6 +55,9 @@ pub trait DocSet { /// Returns the current document fn doc(&self) -> DocId; + /// TODO can impl trait for trait? + + /// Advances the cursor to the next document /// None is returned if the iterator has `DocSet` /// has already been entirely consumed. @@ -67,6 +70,7 @@ pub trait DocSet { } } + impl DocSet for Box { fn advance(&mut self) -> bool { let unboxed: &mut TDocSet = self.borrow_mut(); diff --git a/src/postings/segment_postings.rs b/src/postings/segment_postings.rs index fb313e76e..a886df372 100644 --- a/src/postings/segment_postings.rs +++ b/src/postings/segment_postings.rs @@ -2,6 +2,7 @@ use compression::{NUM_DOCS_PER_BLOCK, BlockDecoder, VIntDecoder}; use DocId; use postings::{Postings, FreqHandler, DocSet, HasLen}; use std::num::Wrapping; +use fastfield::delete::DeleteBitSet; const EMPTY_DATA: [u8; 0] = [0u8; 0]; @@ -18,6 +19,7 @@ pub struct SegmentPostings<'a> { freq_handler: FreqHandler, remaining_data: &'a [u8], cur: Wrapping, + delete_bitset: DeleteBitSet, } impl<'a> SegmentPostings<'a> { @@ -41,7 +43,10 @@ impl<'a> SegmentPostings<'a> { /// * `data` - data array. The complete data is not necessarily used. 
/// * `freq_handler` - the freq handler is in charge of decoding /// frequencies and/or positions - pub fn from_data(len: u32, data: &'a [u8], freq_handler: FreqHandler) -> SegmentPostings<'a> { + pub fn from_data(len: u32, + data: &'a [u8], + delete_bitset: &'a DeleteBitSet, + freq_handler: FreqHandler) -> SegmentPostings<'a> { SegmentPostings { len: len as usize, doc_offset: 0, @@ -49,6 +54,7 @@ impl<'a> SegmentPostings<'a> { freq_handler: freq_handler, remaining_data: data, cur: Wrapping(usize::max_value()), + delete_bitset: delete_bitset.clone(), } } @@ -60,6 +66,7 @@ impl<'a> SegmentPostings<'a> { block_decoder: BlockDecoder::new(), freq_handler: FreqHandler::new_without_freq(), remaining_data: &EMPTY_DATA, + delete_bitset: DeleteBitSet::empty(), cur: Wrapping(usize::max_value()), } } @@ -77,14 +84,18 @@ impl<'a> DocSet for SegmentPostings<'a> { // next needs to be called a first time to point to the correct element. #[inline] fn advance(&mut self) -> bool { - self.cur += Wrapping(1); - if self.cur.0 >= self.len { - return false; + loop { + self.cur += Wrapping(1); + if self.cur.0 >= self.len { + return false; + } + if self.index_within_block() == 0 { + self.load_next_block(); + } + if !self.delete_bitset.is_deleted(self.doc()) { + return true; + } } - if self.index_within_block() == 0 { - self.load_next_block(); - } - true } #[inline] From e12fc4bb09ddf52b0f6b3da561a18fccd1a23752 Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Sun, 5 Feb 2017 19:01:06 +0900 Subject: [PATCH 019/107] issue/43 deletes merge not working only updating uncommitted --- src/core/index.rs | 4 +- src/core/segment.rs | 21 +++++-- src/core/segment_component.rs | 10 +--- src/core/segment_id.rs | 5 -- src/core/segment_reader.rs | 6 +- src/fastfield/mod.rs | 3 - src/indexer/delete_queue.rs | 38 ++++++------ src/indexer/document_receiver.rs | 5 -- src/indexer/index_writer.rs | 100 ++++++++++++++++++++++++------- src/indexer/mod.rs | 1 - src/indexer/segment_manager.rs | 17 ++++-- 
src/indexer/segment_register.rs | 4 ++ src/indexer/segment_updater.rs | 21 +++++++ src/indexer/segment_writer.rs | 100 ++----------------------------- src/lib.rs | 34 ++++++----- src/postings/postings_writer.rs | 15 +---- src/postings/recorder.rs | 42 ------------- 17 files changed, 185 insertions(+), 241 deletions(-) delete mode 100644 src/indexer/document_receiver.rs diff --git a/src/core/index.rs b/src/core/index.rs index 85005c6e9..50740281f 100644 --- a/src/core/index.rs +++ b/src/core/index.rs @@ -208,8 +208,8 @@ impl Index { /// Return a segment object given a `segment_id` /// /// The segment may or may not exist. - pub fn segment(&self, segment_id: SegmentId, commit_opstamp: u64) -> Segment { - create_segment(self.clone(), segment_id, commit_opstamp) + pub fn segment(&self, segment_id: SegmentId, opstamp: u64) -> Segment { + create_segment(self.clone(), segment_id, opstamp) } /// Return a reference to the index directory. diff --git a/src/core/segment.rs b/src/core/segment.rs index 57e87f7e8..82891e6e8 100644 --- a/src/core/segment.rs +++ b/src/core/segment.rs @@ -16,7 +16,7 @@ use directory::error::{FileError, OpenWriteError}; pub struct Segment { index: Index, segment_id: SegmentId, - commit_opstamp: u64, + opstamp: u64, } impl fmt::Debug for Segment { @@ -28,11 +28,11 @@ impl fmt::Debug for Segment { /// Creates a new segment given an `Index` and a `SegmentId` /// /// The function is here to make it private outside `tantivy`. -pub fn create_segment(index: Index, segment_id: SegmentId, commit_opstamp: u64) -> Segment { +pub fn create_segment(index: Index, segment_id: SegmentId, opstamp: u64) -> Segment { Segment { index: index, segment_id: segment_id, - commit_opstamp: commit_opstamp, + opstamp: opstamp, } } @@ -43,8 +43,8 @@ impl Segment { self.index.schema() } - pub fn commit_opstamp(&self) -> u64 { - self.commit_opstamp + pub fn opstamp(&self) -> u64 { + self.opstamp } /// Returns the segment's id. 
@@ -52,12 +52,21 @@ impl Segment { self.segment_id } + pub fn with_opstamp(&self, opstamp: u64) -> Segment { + Segment { + index: self.index.clone(), + segment_id: self.segment_id.clone(), + opstamp: opstamp, + } + } + /// Returns the relative path of a component of our segment. /// /// It just joins the segment id with the extension /// associated to a segment component. pub fn relative_path(&self, component: SegmentComponent) -> PathBuf { - self.segment_id.relative_path(component) + let path_suffix = component.path_suffix(self.opstamp); + PathBuf::from(self.segment_id.uuid_string() + &*path_suffix) } /// Open one of the component file for read. diff --git a/src/core/segment_component.rs b/src/core/segment_component.rs index 57994ddf9..9b7bea9be 100644 --- a/src/core/segment_component.rs +++ b/src/core/segment_component.rs @@ -7,14 +7,12 @@ pub enum SegmentComponent { FIELDNORMS, TERMS, STORE, - DELETE(u64), //< The argument here is an opstamp. - // All of the deletes with an opstamp smaller or equal - // to this opstamp have been taken in account. 
+ DELETE } impl SegmentComponent { - pub fn path_suffix(&self)-> String { + pub fn path_suffix(&self, opstamp: u64)-> String { match *self { SegmentComponent::POSITIONS => ".pos".to_string(), SegmentComponent::INFO => ".info".to_string(), @@ -23,9 +21,7 @@ impl SegmentComponent { SegmentComponent::STORE => ".store".to_string(), SegmentComponent::FASTFIELDS => ".fast".to_string(), SegmentComponent::FIELDNORMS => ".fieldnorm".to_string(), - SegmentComponent::DELETE(opstamp) => { - format!(".{}.del", opstamp) - } + SegmentComponent::DELETE => {format!(".{}.del", opstamp)}, } } } diff --git a/src/core/segment_id.rs b/src/core/segment_id.rs index a9916cb83..263d94c58 100644 --- a/src/core/segment_id.rs +++ b/src/core/segment_id.rs @@ -48,11 +48,6 @@ impl SegmentId { pub fn uuid_string(&self,) -> String { self.0.simple().to_string() } - - pub fn relative_path(&self, component: SegmentComponent) -> PathBuf { - let filename = self.uuid_string() + &*component.path_suffix(); - PathBuf::from(filename) - } } impl Encodable for SegmentId { diff --git a/src/core/segment_reader.rs b/src/core/segment_reader.rs index c1e44e754..f2ed4e7a4 100644 --- a/src/core/segment_reader.rs +++ b/src/core/segment_reader.rs @@ -147,7 +147,7 @@ impl SegmentReader { .unwrap_or_else(|_| ReadOnlySource::empty()); // TODO 0u64 - let delete_data_res = segment.open_read(SegmentComponent::DELETE(segment.commit_opstamp())); + let delete_data_res = segment.open_read(SegmentComponent::DELETE); let delete_bitset; if let Err(FileError::FileDoesNotExist(_)) = delete_data_res { delete_bitset = DeleteBitSet::empty(); @@ -262,6 +262,10 @@ impl SegmentReader { pub fn segment_id(&self) -> SegmentId { self.segment_id } + + pub fn is_deleted(&self, doc: DocId) -> bool { + self.delete_bitset.is_deleted(doc) + } } diff --git a/src/fastfield/mod.rs b/src/fastfield/mod.rs index 00de208b9..ae3d83f88 100644 --- a/src/fastfield/mod.rs +++ b/src/fastfield/mod.rs @@ -54,9 +54,6 @@ mod tests { #[test] pub fn test_fastfield() { 
let test_fastfield = U32FastFieldReader::from(vec!(100,200,300)); - println!("{}", test_fastfield.get(0)); - println!("{}", test_fastfield.get(1)); - println!("{}", test_fastfield.get(2)); assert_eq!(test_fastfield.get(0), 100); assert_eq!(test_fastfield.get(1), 200); assert_eq!(test_fastfield.get(2), 300); diff --git a/src/indexer/delete_queue.rs b/src/indexer/delete_queue.rs index 6ee4980f3..fd1e4f59b 100644 --- a/src/indexer/delete_queue.rs +++ b/src/indexer/delete_queue.rs @@ -97,7 +97,7 @@ impl DeleteQueueCursor { return true; } else { - self.consume(); + self.next(); } } return false; @@ -128,15 +128,17 @@ impl DeleteQueueCursor { } } +} + +impl Iterator for DeleteQueueCursor { + type Item = DeleteOperation; + /// Returns a delete operation if an operation is available, /// None if the queue is empty. /// - /// (We are voluntarily not using the `Iterator` trait - /// as a call to `consume` may return None once, and return - /// `Some(...)` ulteriorily. While this is officially - /// compatible with the `Iterator` specification, we judge - /// this confusing.) - pub fn consume(&mut self) -> Option { + /// This iterator may return None once, and return + /// `Some(...)` ulteriorily. 
+ fn next(&mut self) -> Option { let delete_position = self.peek(); if delete_position.is_some() { self.pos += 1; @@ -197,7 +199,7 @@ mod tests { let mut delete_cursor_3 = delete_queue.cursor(); let mut delete_cursor_3_b = delete_cursor_3.clone(); - assert!(delete_cursor_3.consume().is_none()); + assert!(delete_cursor_3.next().is_none()); assert!(delete_cursor_3.peek().is_none()); delete_queue.push_op(make_op(3)); @@ -206,24 +208,24 @@ mod tests { assert_eq!(delete_cursor_3_b.peek(), Some(make_op(3))); let mut delete_cursor_3_c = delete_cursor_3_b.clone(); - assert_eq!(delete_cursor_3_b.consume(), Some(make_op(3))); + assert_eq!(delete_cursor_3_b.next(), Some(make_op(3))); let mut delete_cursor_4 = delete_cursor_3_b.clone(); assert_eq!(delete_cursor_3_b.peek(), Some(make_op(4))); - assert_eq!(delete_cursor_3_b.consume(), Some(make_op(4))); + assert_eq!(delete_cursor_3_b.next(), Some(make_op(4))); - assert_eq!(delete_cursor_3_c.consume(), Some(make_op(3))); + assert_eq!(delete_cursor_3_c.next(), Some(make_op(3))); - assert!(delete_cursor_3_b.consume().is_none()); - assert_eq!(delete_cursor_3_c.consume(), Some(make_op(4))); - assert!(delete_cursor_3_c.consume().is_none()); + assert!(delete_cursor_3_b.next().is_none()); + assert_eq!(delete_cursor_3_c.next(), Some(make_op(4))); + assert!(delete_cursor_3_c.next().is_none()); assert_eq!(delete_cursor_3.peek(), Some(make_op(3))); - assert_eq!(delete_cursor_3.consume(), Some(make_op(3))); - assert!(delete_cursor_3_b.consume().is_none()); + assert_eq!(delete_cursor_3.next(), Some(make_op(3))); + assert!(delete_cursor_3_b.next().is_none()); - assert_eq!(delete_cursor_4.consume(), Some(make_op(4))); - assert!(delete_cursor_4.consume().is_none()); + assert_eq!(delete_cursor_4.next(), Some(make_op(4))); + assert!(delete_cursor_4.next().is_none()); } diff --git a/src/indexer/document_receiver.rs b/src/indexer/document_receiver.rs deleted file mode 100644 index 73bb7b4ec..000000000 --- a/src/indexer/document_receiver.rs +++ 
/dev/null @@ -1,5 +0,0 @@ -use DocId; - -pub trait DocumentReceiver { - fn receive(&mut self, doc: DocId); -} \ No newline at end of file diff --git a/src/indexer/index_writer.rs b/src/indexer/index_writer.rs index f33ba5773..dcc9126af 100644 --- a/src/indexer/index_writer.rs +++ b/src/indexer/index_writer.rs @@ -9,6 +9,11 @@ use indexer::SegmentEntry; use std::thread::JoinHandle; use indexer::MergePolicy; use indexer::SegmentWriter; +use DocId; +use bit_set::BitSet; +use fastfield::delete::write_delete_bitset; +use postings::SegmentPostingsOption; +use postings::DocSet; use core::SegmentComponent; use super::directory_lock::DirectoryLock; use futures::Future; @@ -19,6 +24,7 @@ use std::thread; use futures::Canceled; use std::mem; use datastruct::stacker::Heap; +use core::SegmentReader; use std::mem::swap; use chan; use core::SegmentMeta; @@ -84,6 +90,64 @@ impl !Send for IndexWriter {} impl !Sync for IndexWriter {} +pub enum DocToOpstampMapping { + WithMap(Vec), + None +} + +impl DocToOpstampMapping { + fn compute_doc_limit(&self, opstamp: u64) -> DocId { + match *self { + DocToOpstampMapping::WithMap(ref doc_opstamps) => { + match doc_opstamps.binary_search(&opstamp) { + Ok(doc_id) => doc_id as DocId, + Err(doc_id) => doc_id as DocId, + } + } + DocToOpstampMapping::None => DocId::max_value(), + } + } +} + + + +/// TODO +/// work on SegmentMeta +pub fn advance_deletes( + segment: &Segment, + delete_cursor: &mut DeleteQueueCursor, + doc_opstamps: DocToOpstampMapping) -> Result<(u64, BitSet)> { + let segment_reader = SegmentReader::open(segment.clone())?; + let mut delete_bitset = BitSet::new(); + for doc in 0u32..segment_reader.max_doc() { + if segment_reader.is_deleted(doc) { + delete_bitset.insert(doc as usize); + } + } + let mut has_changed = false; + let mut last_opstamp = segment.opstamp();//segment + for delete_op in delete_cursor { + // A delete operation should only affect + // document that were inserted after it. 
+ // + // Limit doc helps identify the first document + // that may be affected by the delete operation. + let limit_doc = doc_opstamps.compute_doc_limit(delete_op.opstamp); + if let Some(mut docset) = segment_reader.read_postings(&delete_op.term, SegmentPostingsOption::NoFreq) { + while docset.advance() { + has_changed = true; + let deleted_doc = docset.doc(); + if deleted_doc < limit_doc { + has_changed = true; + delete_bitset.insert(deleted_doc as usize); + } + } + } + last_opstamp = delete_op.opstamp; + } + Ok((last_opstamp, delete_bitset)) +} + fn index_documents(heap: &mut Heap, mut segment: Segment, schema: &Schema, @@ -106,32 +170,28 @@ fn index_documents(heap: &mut Heap, let num_docs = segment_writer.max_doc(); assert!(num_docs > 0); - let deleted_docset_opt = segment_writer.compute_deleted_bitset(delete_cursor); let last_opstamp = segment_writer.last_opstamp(); + + let doc_opstamps: Vec = segment_writer.finalize()?; - let num_deleted_docs; + let (last_opstamp_after_deletes, deleted_docset) = advance_deletes(&segment, delete_cursor, DocToOpstampMapping::WithMap(doc_opstamps))?; - if let Some(deleted_docset) = deleted_docset_opt { - let mut delete_write = segment.open_write(SegmentComponent::DELETE(last_opstamp))?; - delete::write_delete_bitset(&deleted_docset, &mut delete_write)?; - num_deleted_docs = deleted_docset.len(); - } - else { - num_deleted_docs = 0; + { + let mut delete_file = segment.with_opstamp(last_opstamp_after_deletes).open_write(SegmentComponent::DELETE)?; + write_delete_bitset(&deleted_docset, &mut delete_file)?; } + let num_deleted_docs = deleted_docset.len() as DocId; let segment_meta = SegmentMeta { segment_id: segment_id, num_docs: num_docs, - num_deleted_docs: num_deleted_docs as u32, - opstamp: last_opstamp, + num_deleted_docs: num_deleted_docs, + opstamp: last_opstamp_after_deletes, }; let segment_entry = SegmentEntry::new(segment_meta, delete_cursor.clone()); - - try!(segment_writer.finalize()); - + segment_updater 
.add_segment(generation, segment_entry) .wait() @@ -143,8 +203,6 @@ fn index_documents(heap: &mut Heap, impl IndexWriter { /// The index writer pub fn wait_merging_threads(mut self) -> Result<()> { - - // let future = self.segment_updater.terminate(); // this will stop the indexing thread, // dropping the last reference to the segment_updater. @@ -165,9 +223,7 @@ impl IndexWriter { .wait_merging_thread() .map_err(|_| Error::ErrorInThread("Failed to join merging thread.".to_string()) - )?; - // future.wait().unwrap(); // TODO do something with the result. - Ok(()) + ) } /// Spawns a new worker thread for indexing. @@ -384,7 +440,6 @@ impl IndexWriter { Ok(self.committed_opstamp) } - /// Commits all of the pending changes /// /// A call to commit blocks. @@ -408,7 +463,7 @@ impl IndexWriter { let mut former_workers_join_handle = Vec::new(); swap(&mut former_workers_join_handle, &mut self.workers_join_handle); - + for worker_handle in former_workers_join_handle { let indexing_worker_result = try!(worker_handle.join() .map_err(|e| Error::ErrorInThread(format!("{:?}", e)))); @@ -416,6 +471,7 @@ impl IndexWriter { // add a new worker for the next generation. try!(self.add_indexing_worker()); } + // here, because we join all of the worker threads, // all of the segment update for this commit have been // sent. @@ -426,8 +482,8 @@ impl IndexWriter { // This will move uncommitted segments to the state of // committed segments. 
- self.committed_opstamp = self.stamp(); + let future = self.segment_updater.commit(self.committed_opstamp); // wait for the segment update thread to have processed the info diff --git a/src/indexer/mod.rs b/src/indexer/mod.rs index 15e2035f7..aea0965f5 100644 --- a/src/indexer/mod.rs +++ b/src/indexer/mod.rs @@ -10,7 +10,6 @@ mod segment_manager; pub mod delete_queue; pub mod segment_updater; mod directory_lock; -pub mod document_receiver; pub mod operation; pub use self::segment_register::SegmentEntry; diff --git a/src/indexer/segment_manager.rs b/src/indexer/segment_manager.rs index 41d0ded70..10ee89486 100644 --- a/src/indexer/segment_manager.rs +++ b/src/indexer/segment_manager.rs @@ -8,7 +8,6 @@ use std::sync::{RwLockReadGuard, RwLockWriteGuard}; use std::fmt::{self, Debug, Formatter}; struct SegmentRegisters { - docstamp: u64, uncommitted: SegmentRegister, committed: SegmentRegister, } @@ -16,7 +15,6 @@ struct SegmentRegisters { impl Default for SegmentRegisters { fn default() -> SegmentRegisters { SegmentRegisters { - docstamp: 0u64, uncommitted: SegmentRegister::default(), committed: SegmentRegister::default() } @@ -57,12 +55,23 @@ impl SegmentManager { pub fn from_segments(segment_metas: Vec, delete_cursor: DeleteQueueCursor) -> SegmentManager { SegmentManager { registers: RwLock::new( SegmentRegisters { - docstamp: 0u64, // TODO put the actual value uncommitted: SegmentRegister::default(), committed: SegmentRegister::new(segment_metas, delete_cursor), }), } } + + pub fn segment_entries(&self,) -> Vec { + let mut segment_entries = self.read() + .uncommitted + .segment_entries(); + segment_entries.extend( + self.read() + .committed + .segment_entries() + ); + segment_entries + } pub fn segment_entry(&self, segment_id: &SegmentId) -> Option { let registers = self.read(); @@ -98,7 +107,6 @@ impl SegmentManager { for segment_entry in segment_entries { registers_lock.committed.add_segment_entry(segment_entry); } - registers_lock.docstamp = docstamp; 
registers_lock.uncommitted.clear(); } @@ -151,7 +159,6 @@ impl Default for SegmentManager { fn default() -> SegmentManager { SegmentManager { registers: RwLock::new( SegmentRegisters { - docstamp: 0u64, // TODO put the actual value uncommitted: SegmentRegister::default(), committed: SegmentRegister::default(), }), diff --git a/src/indexer/segment_register.rs b/src/indexer/segment_register.rs index 982e320c0..4c838bf63 100644 --- a/src/indexer/segment_register.rs +++ b/src/indexer/segment_register.rs @@ -32,6 +32,10 @@ impl SegmentEntry { pub fn segment_id(&self) -> SegmentId { self.meta.segment_id } + + pub fn delete_cursor(&mut self) -> &mut DeleteQueueCursor { + &mut self.delete_cursor + } pub fn meta(&self) -> &SegmentMeta { &self.meta diff --git a/src/indexer/segment_updater.rs b/src/indexer/segment_updater.rs index bb79b2721..9482f9aaa 100644 --- a/src/indexer/segment_updater.rs +++ b/src/indexer/segment_updater.rs @@ -10,9 +10,11 @@ use std::mem; use std::sync::atomic::Ordering; use std::ops::DerefMut; use futures::{Future, future}; +use fastfield::delete::write_delete_bitset; use futures::oneshot; use futures::Canceled; use std::thread; +use core::SegmentComponent; use std::sync::atomic::AtomicUsize; use std::sync::RwLock; use core::SerializableSegment; @@ -22,6 +24,7 @@ use std::borrow::BorrowMut; use indexer::SegmentSerializer; use indexer::SegmentEntry; use schema::Schema; +use indexer::index_writer::{advance_deletes, DocToOpstampMapping}; use directory::Directory; use std::thread::JoinHandle; use std::sync::Arc; @@ -171,8 +174,26 @@ impl SegmentUpdater { } } + fn purge_deletes(&self, target_opstamp: u64) -> Result<()> { + let uncommitted = self.0.segment_manager.segment_entries(); + for mut segment_entry in uncommitted { + let mut segment = self.0.index.segment(segment_entry.meta().segment_id, segment_entry.meta().opstamp); + let (_, deleted_docset) = advance_deletes( + &segment, + segment_entry.delete_cursor(), + DocToOpstampMapping::None).unwrap(); + { 
+ let mut delete_file = segment.with_opstamp(target_opstamp).open_write(SegmentComponent::DELETE)?; + write_delete_bitset(&deleted_docset, &mut delete_file)?; + } + + } + Ok(()) + } + pub fn commit(&self, opstamp: u64) -> impl Future { self.run_async(move |segment_updater| { + segment_updater.purge_deletes(opstamp).expect("Failed purge deletes"); segment_updater.0.segment_manager.commit(opstamp); let mut directory = segment_updater.0.index.directory().box_clone(); save_metas( diff --git a/src/indexer/segment_writer.rs b/src/indexer/segment_writer.rs index d64bd77bd..d2de2b946 100644 --- a/src/indexer/segment_writer.rs +++ b/src/indexer/segment_writer.rs @@ -17,58 +17,10 @@ use postings::SpecializedPostingsWriter; use postings::{NothingRecorder, TermFrequencyRecorder, TFAndPositionRecorder}; use indexer::segment_serializer::SegmentSerializer; use datastruct::stacker::Heap; -use super::delete_queue::DeleteQueueCursor; use indexer::index_writer::MARGIN_IN_BYTES; use super::operation::AddOperation; -use bit_set::BitSet; -use indexer::document_receiver::DocumentReceiver; -use core::SegmentReader; -use postings::SegmentPostingsOption; -use postings::DocSet; - -fn update_deleted_bitset( - segment_reader: &SegmentReader, - bitset: &mut BitSet, - delete_cursor: &mut DeleteQueueCursor, - limit_opstamp_opt: Option) -> bool { - let mut has_changed = false; - let limit_opstamp = limit_opstamp_opt.unwrap_or(u64::max_value()); - loop { - if let Some(delete_op) = delete_cursor.peek() { - if delete_op.opstamp > limit_opstamp { - break; - } - if let Some(mut docset) = segment_reader.read_postings(&delete_op.term, SegmentPostingsOption::NoFreq) { - while docset.advance() { - has_changed = true; - let deleted_doc = docset.doc(); - bitset.insert(deleted_doc as usize); - } - } - } - else { - break; - } - delete_cursor.consume(); - } - has_changed -} - -struct DocumentDeleter<'a> { - limit_doc_id: DocId, - deleted_docs: &'a mut BitSet, -} - -impl<'a> DocumentReceiver for 
DocumentDeleter<'a> { - fn receive(&mut self, doc: DocId) { - if doc < self.limit_doc_id { - self.deleted_docs.insert(doc as usize); - } - } -} - /// A `SegmentWriter` is in charge of creating segment index from a /// documents. /// @@ -154,19 +106,18 @@ impl<'a> SegmentWriter<'a> { /// /// Finalize consumes the `SegmentWriter`, so that it cannot /// be used afterwards. - pub fn finalize(mut self) -> Result<()> { + pub fn finalize(mut self) -> Result> { let segment_info = self.segment_info(); for per_field_postings_writer in &mut self.per_field_postings_writers { per_field_postings_writer.close(self.heap); } - try!(write( - &self.per_field_postings_writers, + write(&self.per_field_postings_writers, &self.fast_field_writers, &self.fieldnorms_writer, segment_info, self.segment_serializer, - self.heap)); - Ok(()) + self.heap)?; + Ok(self.doc_opstamps) } /// Returns true iff the segment writer's buffer has reached capacity. @@ -180,14 +131,6 @@ impl<'a> SegmentWriter<'a> { self.heap.num_free_bytes() <= MARGIN_IN_BYTES } - fn compute_doc_limit(&self, opstamp: u64) -> DocId { - let doc_id = match self.doc_opstamps.binary_search(&opstamp) { - Ok(doc_id) => doc_id, - Err(doc_id) => doc_id, - }; - doc_id as DocId - } - // pub fn compute_doc_mapping_after_delete(&self, mut delete_queue_cursor: DeleteQueueCursor) -> Vec> { // let delete_docs = self.compute_delete_mask(&mut delete_queue_cursor); // let max_doc: usize = self.max_doc as usize; @@ -211,41 +154,6 @@ impl<'a> SegmentWriter<'a> { .last() .expect("Last doc opstamp called on an empty segment writer")) } - - /// TODO compute the bitset using the segment reader directly. 
- pub fn compute_deleted_bitset(&self, delete_queue_cursor: &mut DeleteQueueCursor) -> Option { - if let Some(first_opstamp) = self.doc_opstamps.first() { - if !delete_queue_cursor.skip_to(*first_opstamp) { - return None; - } - } - else { - return None; - } - let last_opstamp = *self.doc_opstamps.last().unwrap(); - let mut deleted_docs = BitSet::with_capacity(self.max_doc as usize); - while let Some(delete_operation) = delete_queue_cursor.peek() { - if delete_operation.opstamp > last_opstamp { - break; - } - // We can skip computing delete operations that - // are older than our oldest document. - // - // They don't belong to this document anyway. - let delete_term = delete_operation.term; - let Field(field_id) = delete_term.field(); - let postings_writer: &Box = &self.per_field_postings_writers[field_id as usize]; - let limit_doc_id = self.compute_doc_limit(delete_operation.opstamp); - let mut document_deleter = DocumentDeleter { - limit_doc_id: limit_doc_id, - deleted_docs: &mut deleted_docs - }; - postings_writer.push_documents(delete_term.value(), &mut document_deleter); - delete_queue_cursor.consume(); - } - Some(deleted_docs) - } - /// Indexes a new document /// diff --git a/src/lib.rs b/src/lib.rs index b2b3b36d8..09219195c 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -300,19 +300,19 @@ mod tests { { // writing the segment let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap(); - { + { // 0 let doc = doc!(text_field=>"a b"); index_writer.add_document(doc).unwrap(); } - { + { // 1 let doc = doc!(text_field=>" a c"); index_writer.add_document(doc).unwrap(); } - { + { // 2 let doc = doc!(text_field=>" b c"); index_writer.add_document(doc).unwrap(); } - { + { // 3 let doc = doc!(text_field=>" b d"); index_writer.add_document(doc).unwrap(); } @@ -322,11 +322,11 @@ mod tests { { index_writer.delete_term(Term::from_field_text(text_field, "a")); } - { + { // 4 let doc = doc!(text_field=>" b c"); index_writer.add_document(doc).unwrap(); } - { + { 
// 5 let doc = doc!(text_field=>" a"); index_writer.add_document(doc).unwrap(); } @@ -337,14 +337,20 @@ mod tests { let searcher = index.searcher(); let reader = searcher.segment_reader(0); assert!(reader.read_postings_all_info(&Term::from_field_text(text_field, "abcd")).is_none()); - let mut postings = reader.read_postings_all_info(&Term::from_field_text(text_field, "a")).unwrap(); - assert!(postings.advance()); - assert_eq!(postings.doc(), 2); - assert!(postings.advance()); - assert_eq!(postings.doc(), 3); - assert!(postings.advance()); - assert_eq!(postings.doc(), 5); - assert!(!postings.advance()); + { + let mut postings = reader.read_postings_all_info(&Term::from_field_text(text_field, "a")).unwrap(); + assert!(postings.advance()); + assert_eq!(postings.doc(), 5); + assert!(!postings.advance()); + } + { + let mut postings = reader.read_postings_all_info(&Term::from_field_text(text_field, "b")).unwrap(); + assert!(postings.advance()); + assert_eq!(postings.doc(), 3); + assert!(postings.advance()); + assert_eq!(postings.doc(), 4); + assert!(!postings.advance()); + } } } diff --git a/src/postings/postings_writer.rs b/src/postings/postings_writer.rs index c69218b39..a3c0194f1 100644 --- a/src/postings/postings_writer.rs +++ b/src/postings/postings_writer.rs @@ -7,8 +7,7 @@ use postings::Recorder; use analyzer::SimpleTokenizer; use schema::Field; use analyzer::StreamingIterator; -use indexer::document_receiver::DocumentReceiver; -use datastruct::stacker::{HashMap, Entry, Heap}; +use datastruct::stacker::{HashMap, Heap}; /// The `PostingsWriter` is in charge of receiving documenting /// and building a `Segment` in anonymous memory. @@ -28,10 +27,6 @@ pub trait PostingsWriter { /// The actual serialization format is handled by the `PostingsSerializer`. fn serialize(&self, serializer: &mut PostingsSerializer, heap: &Heap) -> io::Result<()>; - /// Push all documents associated with a given term to a - /// given DocumentLister. 
- fn push_documents(&self, term_val: &[u8], document_listener: &mut DocumentReceiver); - /// Closes all of the currently open `Recorder`'s. fn close(&mut self, heap: &Heap); @@ -105,14 +100,6 @@ impl<'a, Rec: Recorder + 'static> PostingsWriter for SpecializedPostingsWriter<' } - fn push_documents(&self, term_val: &[u8], document_receiver: &mut DocumentReceiver) { - if let Entry::Occupied(addr) = self.term_index.lookup(term_val) { - let heap = self.term_index.heap(); - let recorder: &Rec = heap.get_ref(addr); - recorder.push_documents(addr, document_receiver, heap); - } - } - #[inline] fn suscribe(&mut self, doc: DocId, position: u32, term: &Term, heap: &Heap) { let mut recorder = self.term_index.get_or_create(term); diff --git a/src/postings/recorder.rs b/src/postings/recorder.rs index 970b6f071..94173720b 100644 --- a/src/postings/recorder.rs +++ b/src/postings/recorder.rs @@ -2,7 +2,6 @@ use DocId; use std::io; use postings::PostingsSerializer; use datastruct::stacker::{ExpUnrolledLinkedList, Heap, HeapAllocable}; -use indexer::document_receiver::DocumentReceiver; const EMPTY_ARRAY: [u32; 0] = [0u32; 0]; const POSITION_END: u32 = 4294967295; @@ -29,11 +28,6 @@ pub trait Recorder: HeapAllocable { fn close_doc(&mut self, heap: &Heap); /// Returns the number of document that have been seen so far fn doc_freq(&self) -> u32; - /// Push all documents to a given DocumentLister. - fn push_documents(&self, - self_addr: u32, - document_receiver: &mut DocumentReceiver, - heap: &Heap); /// Pushes the postings information to the serializer. 
fn serialize(&self, self_addr: u32, @@ -79,15 +73,6 @@ impl Recorder for NothingRecorder { self.doc_freq } - fn push_documents(&self, - self_addr: u32, - document_receiver: &mut DocumentReceiver, - heap: &Heap) { - for doc in self.stack.iter(self_addr, heap) { - document_receiver.receive(doc); - } - } - fn serialize(&self, self_addr: u32, serializer: &mut PostingsSerializer, @@ -145,17 +130,6 @@ impl Recorder for TermFrequencyRecorder { self.doc_freq } - fn push_documents(&self, - self_addr: u32, - document_receiver: &mut DocumentReceiver, - heap: &Heap) { - let mut doc_iter = self.stack.iter(self_addr, heap); - while let Some(doc) = doc_iter.next() { - doc_iter.next().expect("Panicked while trying to read a frequency"); - document_receiver.receive(doc); - } - } - fn serialize(&self, self_addr: u32, serializer: &mut PostingsSerializer, @@ -216,22 +190,6 @@ impl Recorder for TFAndPositionRecorder { self.doc_freq } - fn push_documents(&self, - self_addr: u32, - document_receiver: &mut DocumentReceiver, - heap: &Heap) { - let mut positions_iter = self.stack.iter(self_addr, heap); - while let Some(doc) = positions_iter.next() { - document_receiver.receive(doc); - loop { - let position = positions_iter.next().expect("This should never happen. 
Pleasee report the bug."); - if position == POSITION_END { - break; - } - } - } - } - fn serialize(&self, self_addr: u32, serializer: &mut PostingsSerializer, From 64fee11bc0263fe02f9d4b0fcd8db1fa644909c9 Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Mon, 6 Feb 2017 09:29:49 +0900 Subject: [PATCH 020/107] issue/43 Clean up --- src/core/index.rs | 28 +++++++++++++--------------- src/indexer/segment_manager.rs | 2 +- src/indexer/segment_register.rs | 15 ++++++++------- src/indexer/segment_updater.rs | 7 +++---- 4 files changed, 25 insertions(+), 27 deletions(-) diff --git a/src/core/index.rs b/src/core/index.rs index 50740281f..936087dd5 100644 --- a/src/core/index.rs +++ b/src/core/index.rs @@ -184,15 +184,10 @@ impl Index { /// Returns the list of segments that are searchable pub fn searchable_segments(&self) -> Result> { let metas = load_metas(self.directory())?; - let searchable_segment_ids = metas + Ok(metas .committed_segments .iter() - .map(|segment_meta| segment_meta.segment_id) - .collect::>(); - let commit_opstamp = metas.opstamp; - Ok(searchable_segment_ids - .into_iter() - .map(|segment_id| self.segment(segment_id, commit_opstamp)) + .map(|segment_meta| self.segment(segment_meta)) .collect()) } @@ -208,8 +203,17 @@ impl Index { /// Return a segment object given a `segment_id` /// /// The segment may or may not exist. - pub fn segment(&self, segment_id: SegmentId, opstamp: u64) -> Segment { - create_segment(self.clone(), segment_id, opstamp) + // pub fn segment(&self, segment_id: SegmentId, opstamp: u64) -> Segment { + // (self.clone(), segment_id, opstamp) + // } + + pub fn segment(&self, segment_meta: &SegmentMeta) -> Segment { + create_segment(self.clone(), segment_meta.segment_id, segment_meta.opstamp) + } + + /// Creates a new segment. + pub fn new_segment(&self, opstamp: u64) -> Segment { + create_segment(self.clone(), SegmentId::generate_random(), opstamp) } /// Return a reference to the index directory. 
@@ -225,7 +229,6 @@ impl Index { /// Reads the meta.json and returns the list of /// committed segments. pub fn committed_segments(&self) -> Result> { - Ok(load_metas(self.directory())?.committed_segments) } @@ -238,11 +241,6 @@ impl Index { .collect()) } - /// Creates a new segment. - pub fn new_segment(&self, opstamp: u64) -> Segment { - self.segment(SegmentId::generate_random(), opstamp) - } - /// Creates a new generation of searchers after /// a change of the set of searchable indexes. /// diff --git a/src/indexer/segment_manager.rs b/src/indexer/segment_manager.rs index 10ee89486..af180e2b3 100644 --- a/src/indexer/segment_manager.rs +++ b/src/indexer/segment_manager.rs @@ -101,7 +101,7 @@ impl SegmentManager { segment_ids } - pub fn commit(&self, docstamp: u64) { + pub fn commit(&self) { let mut registers_lock = self.write(); let segment_entries = registers_lock.uncommitted.segment_entries(); for segment_entry in segment_entries { diff --git a/src/indexer/segment_register.rs b/src/indexer/segment_register.rs index 4c838bf63..eb28ae5c4 100644 --- a/src/indexer/segment_register.rs +++ b/src/indexer/segment_register.rs @@ -157,14 +157,15 @@ impl SegmentRegister { } pub fn new(segment_metas: Vec, delete_cursor: DeleteQueueCursor) -> SegmentRegister { - let mut segment_states = HashMap::new(); - for segment_meta in segment_metas { - let segment_id = segment_meta.segment_id; - let segment_entry = SegmentEntry::new(segment_meta, delete_cursor.clone()); - segment_states.insert(segment_id, segment_entry); - } SegmentRegister { - segment_states: segment_states, + segment_states: segment_metas + .into_iter() + .map(|segment_meta| { + let segment_id = segment_meta.segment_id; + let segment_entry = SegmentEntry::new(segment_meta, delete_cursor.clone()); + (segment_id, segment_entry) + }) + .collect(), } } } diff --git a/src/indexer/segment_updater.rs b/src/indexer/segment_updater.rs index 9482f9aaa..1c518dc98 100644 --- a/src/indexer/segment_updater.rs +++ 
b/src/indexer/segment_updater.rs @@ -177,7 +177,7 @@ impl SegmentUpdater { fn purge_deletes(&self, target_opstamp: u64) -> Result<()> { let uncommitted = self.0.segment_manager.segment_entries(); for mut segment_entry in uncommitted { - let mut segment = self.0.index.segment(segment_entry.meta().segment_id, segment_entry.meta().opstamp); + let mut segment = self.0.index.segment(segment_entry.meta()); let (_, deleted_docset) = advance_deletes( &segment, segment_entry.delete_cursor(), @@ -186,7 +186,6 @@ impl SegmentUpdater { let mut delete_file = segment.with_opstamp(target_opstamp).open_write(SegmentComponent::DELETE)?; write_delete_bitset(&deleted_docset, &mut delete_file)?; } - } Ok(()) } @@ -194,7 +193,7 @@ impl SegmentUpdater { pub fn commit(&self, opstamp: u64) -> impl Future { self.run_async(move |segment_updater| { segment_updater.purge_deletes(opstamp).expect("Failed purge deletes"); - segment_updater.0.segment_manager.commit(opstamp); + segment_updater.0.segment_manager.commit(); let mut directory = segment_updater.0.index.directory().box_clone(); save_metas( &segment_updater.0.segment_manager, @@ -238,7 +237,7 @@ impl SegmentUpdater { let segments: Vec = segment_metas .iter() - .map(|ref segment_metas| index.segment(segment_metas.segment_id, segment_metas.opstamp)) + .map(|ref segment_meta| index.segment(segment_meta)) .collect(); // An IndexMerger is like a "view" of our merged segments. From 0c318339b024255f3ecde94736b9c712a11feab7 Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Tue, 7 Feb 2017 19:19:41 +0900 Subject: [PATCH 021/107] issue/43 Path logic in segment. 
--- src/core/segment.rs | 15 +++++++++++++-- src/core/segment_component.rs | 16 ---------------- 2 files changed, 13 insertions(+), 18 deletions(-) diff --git a/src/core/segment.rs b/src/core/segment.rs index 82891e6e8..2eb750c58 100644 --- a/src/core/segment.rs +++ b/src/core/segment.rs @@ -65,8 +65,19 @@ impl Segment { /// It just joins the segment id with the extension /// associated to a segment component. pub fn relative_path(&self, component: SegmentComponent) -> PathBuf { - let path_suffix = component.path_suffix(self.opstamp); - PathBuf::from(self.segment_id.uuid_string() + &*path_suffix) + use self::SegmentComponent::*; + let mut path = self.segment_id.uuid_string(); + path.push_str(&*match component { + POSITIONS => ".pos".to_string(), + INFO => ".info".to_string(), + POSTINGS => ".idx".to_string(), + TERMS => ".term".to_string(), + STORE => ".store".to_string(), + FASTFIELDS => ".fast".to_string(), + FIELDNORMS => ".fieldnorm".to_string(), + DELETE => {format!(".{}.del", self.opstamp)}, + }); + PathBuf::from(path) } /// Open one of the component file for read. 
diff --git a/src/core/segment_component.rs b/src/core/segment_component.rs index 9b7bea9be..93aacd506 100644 --- a/src/core/segment_component.rs +++ b/src/core/segment_component.rs @@ -10,21 +10,5 @@ pub enum SegmentComponent { DELETE } -impl SegmentComponent { - - pub fn path_suffix(&self, opstamp: u64)-> String { - match *self { - SegmentComponent::POSITIONS => ".pos".to_string(), - SegmentComponent::INFO => ".info".to_string(), - SegmentComponent::POSTINGS => ".idx".to_string(), - SegmentComponent::TERMS => ".term".to_string(), - SegmentComponent::STORE => ".store".to_string(), - SegmentComponent::FASTFIELDS => ".fast".to_string(), - SegmentComponent::FIELDNORMS => ".fieldnorm".to_string(), - SegmentComponent::DELETE => {format!(".{}.del", opstamp)}, - } - } -} - \ No newline at end of file From e337c35721098d889affb1555ec7513b31fc3786 Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Tue, 7 Feb 2017 22:42:05 +0900 Subject: [PATCH 022/107] issue/43 SegmentMeta refactoring --- src/core/index.rs | 18 +++------ src/core/segment.rs | 30 +++++++-------- src/core/segment_meta.rs | 14 ++++--- src/core/segment_reader.rs | 17 +++++---- src/indexer/index_writer.rs | 68 +++++++++++++++++++-------------- src/indexer/log_merge_policy.rs | 51 +++++++++++++++---------- src/indexer/segment_register.rs | 6 +-- src/indexer/segment_updater.rs | 22 +++++------ src/postings/mod.rs | 4 +- 9 files changed, 121 insertions(+), 109 deletions(-) diff --git a/src/core/index.rs b/src/core/index.rs index 936087dd5..b287bd429 100644 --- a/src/core/index.rs +++ b/src/core/index.rs @@ -186,7 +186,7 @@ impl Index { let metas = load_metas(self.directory())?; Ok(metas .committed_segments - .iter() + .into_iter() .map(|segment_meta| self.segment(segment_meta)) .collect()) } @@ -200,20 +200,14 @@ impl Index { delete_segment(self.directory(), segment_id); } - /// Return a segment object given a `segment_id` - /// - /// The segment may or may not exist. 
- // pub fn segment(&self, segment_id: SegmentId, opstamp: u64) -> Segment { - // (self.clone(), segment_id, opstamp) - // } - - pub fn segment(&self, segment_meta: &SegmentMeta) -> Segment { - create_segment(self.clone(), segment_meta.segment_id, segment_meta.opstamp) + pub fn segment(&self, segment_meta: SegmentMeta) -> Segment { + create_segment(self.clone(), segment_meta) } /// Creates a new segment. - pub fn new_segment(&self, opstamp: u64) -> Segment { - create_segment(self.clone(), SegmentId::generate_random(), opstamp) + pub fn new_segment(&self) -> Segment { + let segment_meta = SegmentMeta::new(SegmentId::generate_random()); + create_segment(self.clone(), segment_meta) } /// Return a reference to the index directory. diff --git a/src/core/segment.rs b/src/core/segment.rs index 2eb750c58..2dcb774ba 100644 --- a/src/core/segment.rs +++ b/src/core/segment.rs @@ -9,30 +9,29 @@ use indexer::segment_serializer::SegmentSerializer; use super::SegmentComponent; use core::Index; use std::result; +use core::SegmentMeta; use directory::error::{FileError, OpenWriteError}; /// A segment is a piece of the index. #[derive(Clone)] pub struct Segment { index: Index, - segment_id: SegmentId, - opstamp: u64, + meta: SegmentMeta, } impl fmt::Debug for Segment { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - write!(f, "Segment({:?})", self.segment_id.uuid_string()) + write!(f, "Segment({:?})", self.id().uuid_string()) } } /// Creates a new segment given an `Index` and a `SegmentId` /// /// The function is here to make it private outside `tantivy`. 
-pub fn create_segment(index: Index, segment_id: SegmentId, opstamp: u64) -> Segment { +pub fn create_segment(index: Index, meta: SegmentMeta) -> Segment { Segment { index: index, - segment_id: segment_id, - opstamp: opstamp, + meta: meta, } } @@ -43,20 +42,21 @@ impl Segment { self.index.schema() } - pub fn opstamp(&self) -> u64 { - self.opstamp + pub fn meta(&self,) -> &SegmentMeta { + &self.meta } /// Returns the segment's id. pub fn id(&self,) -> SegmentId { - self.segment_id + self.meta.segment_id } - pub fn with_opstamp(&self, opstamp: u64) -> Segment { + pub fn with_delete_opstamp(self, opstamp: u64) -> Segment { + let mut meta = self.meta; + meta.delete_opstamp = Some(opstamp); Segment { - index: self.index.clone(), - segment_id: self.segment_id.clone(), - opstamp: opstamp, + index: self.index, + meta: meta, } } @@ -66,7 +66,7 @@ impl Segment { /// associated to a segment component. pub fn relative_path(&self, component: SegmentComponent) -> PathBuf { use self::SegmentComponent::*; - let mut path = self.segment_id.uuid_string(); + let mut path = self.id().uuid_string(); path.push_str(&*match component { POSITIONS => ".pos".to_string(), INFO => ".info".to_string(), @@ -75,7 +75,7 @@ impl Segment { STORE => ".store".to_string(), FASTFIELDS => ".fast".to_string(), FIELDNORMS => ".fieldnorm".to_string(), - DELETE => {format!(".{}.del", self.opstamp)}, + DELETE => {format!(".{}.del", self.meta.delete_opstamp.unwrap_or(0))}, }); PathBuf::from(path) } diff --git a/src/core/segment_meta.rs b/src/core/segment_meta.rs index 3d001e896..d8e9f8e6d 100644 --- a/src/core/segment_meta.rs +++ b/src/core/segment_meta.rs @@ -1,21 +1,23 @@ use core::SegmentId; + +// TODO Option + #[derive(Clone, Debug, RustcDecodable,RustcEncodable)] pub struct SegmentMeta { pub segment_id: SegmentId, pub num_docs: u32, pub num_deleted_docs: u32, - pub opstamp: u64, + pub delete_opstamp: Option, } -#[cfg(test)] impl SegmentMeta { - pub fn new(segment_id: SegmentId, num_docs: u32) -> 
SegmentMeta { + pub fn new(segment_id: SegmentId) -> SegmentMeta { SegmentMeta { segment_id: segment_id, - num_docs: num_docs, + num_docs: 0, num_deleted_docs: 0, - opstamp: 0u64, + delete_opstamp: None, } } -} \ No newline at end of file +} diff --git a/src/core/segment_reader.rs b/src/core/segment_reader.rs index f2ed4e7a4..af0e648a8 100644 --- a/src/core/segment_reader.rs +++ b/src/core/segment_reader.rs @@ -147,14 +147,15 @@ impl SegmentReader { .unwrap_or_else(|_| ReadOnlySource::empty()); // TODO 0u64 - let delete_data_res = segment.open_read(SegmentComponent::DELETE); - let delete_bitset; - if let Err(FileError::FileDoesNotExist(_)) = delete_data_res { - delete_bitset = DeleteBitSet::empty(); - } - else { - delete_bitset = DeleteBitSet::open(delete_data_res?); - } + let delete_bitset = + if segment.meta().delete_opstamp.is_some() { + let delete_data = segment.open_read(SegmentComponent::DELETE)?; + DeleteBitSet::open(delete_data) + } + else { + DeleteBitSet::empty() + }; + let schema = segment.schema(); Ok(SegmentReader { segment_info: segment_info, diff --git a/src/indexer/index_writer.rs b/src/indexer/index_writer.rs index dcc9126af..efc20d171 100644 --- a/src/indexer/index_writer.rs +++ b/src/indexer/index_writer.rs @@ -116,16 +116,11 @@ impl DocToOpstampMapping { pub fn advance_deletes( segment: &Segment, delete_cursor: &mut DeleteQueueCursor, - doc_opstamps: DocToOpstampMapping) -> Result<(u64, BitSet)> { + doc_opstamps: DocToOpstampMapping) -> Result> { let segment_reader = SegmentReader::open(segment.clone())?; - let mut delete_bitset = BitSet::new(); - for doc in 0u32..segment_reader.max_doc() { - if segment_reader.is_deleted(doc) { - delete_bitset.insert(doc as usize); - } - } - let mut has_changed = false; - let mut last_opstamp = segment.opstamp();//segment + let mut delete_bitset = BitSet::with_capacity(segment_reader.max_doc() as usize); + + let mut last_opstamp_opt: Option = None; for delete_op in delete_cursor { // A delete operation should 
only affect // document that were inserted after it. @@ -135,17 +130,26 @@ pub fn advance_deletes( let limit_doc = doc_opstamps.compute_doc_limit(delete_op.opstamp); if let Some(mut docset) = segment_reader.read_postings(&delete_op.term, SegmentPostingsOption::NoFreq) { while docset.advance() { - has_changed = true; let deleted_doc = docset.doc(); if deleted_doc < limit_doc { - has_changed = true; delete_bitset.insert(deleted_doc as usize); } } + last_opstamp_opt = Some(delete_op.opstamp); } - last_opstamp = delete_op.opstamp; } - Ok((last_opstamp, delete_bitset)) + + if let Some(last_opstamp) = last_opstamp_opt { + for doc in 0u32..segment_reader.max_doc() { + if segment_reader.is_deleted(doc) { + delete_bitset.insert(doc as usize); + } + } + Ok(Some((last_opstamp, delete_bitset))) + } + else { + Ok(None) + } } fn index_documents(heap: &mut Heap, @@ -175,20 +179,25 @@ fn index_documents(heap: &mut Heap, let doc_opstamps: Vec = segment_writer.finalize()?; - let (last_opstamp_after_deletes, deleted_docset) = advance_deletes(&segment, delete_cursor, DocToOpstampMapping::WithMap(doc_opstamps))?; - - { - let mut delete_file = segment.with_opstamp(last_opstamp_after_deletes).open_write(SegmentComponent::DELETE)?; - write_delete_bitset(&deleted_docset, &mut delete_file)?; - } - let num_deleted_docs = deleted_docset.len() as DocId; - - let segment_meta = SegmentMeta { - segment_id: segment_id, - num_docs: num_docs, - num_deleted_docs: num_deleted_docs, - opstamp: last_opstamp_after_deletes, - }; + let segment_meta = + if let Some((last_opstamp_after_deletes, deleted_docset)) = advance_deletes(&segment, delete_cursor, DocToOpstampMapping::WithMap(doc_opstamps))? 
{ + let mut delete_file = segment.with_delete_opstamp(last_opstamp_after_deletes).open_write(SegmentComponent::DELETE)?; + write_delete_bitset(&deleted_docset, &mut delete_file)?; + SegmentMeta { + segment_id: segment_id, + num_docs: num_docs, + num_deleted_docs: deleted_docset.len() as DocId, + delete_opstamp: Some(last_opstamp_after_deletes), + } + } + else { + SegmentMeta { + segment_id: segment_id, + num_docs: num_docs, + num_deleted_docs: 0, + delete_opstamp: None, + } + }; let segment_entry = SegmentEntry::new(segment_meta, delete_cursor.clone()); @@ -252,6 +261,7 @@ impl IndexWriter { let mut document_iterator = document_receiver_clone.clone() .into_iter() .peekable(); + // the peeking here is to avoid // creating a new segment's files // if no document are available. @@ -269,8 +279,8 @@ impl IndexWriter { // was dropped. return Ok(()) } - - let segment = index.new_segment(opstamp); + + let segment = index.new_segment(); let valid_generation = index_documents(&mut heap, segment, &schema, diff --git a/src/indexer/log_merge_policy.rs b/src/indexer/log_merge_policy.rs index 3eebdf78f..cc4cd56bc 100644 --- a/src/indexer/log_merge_policy.rs +++ b/src/indexer/log_merge_policy.rs @@ -120,11 +120,20 @@ mod tests { assert!(result_list.is_empty()); } + fn seg_meta(num_docs: u32) -> SegmentMeta { + SegmentMeta { + segment_id: SegmentId::generate_random(), + num_docs: num_docs, + num_deleted_docs: 0u32, + delete_opstamp: None, + } + } + #[test] fn test_log_merge_policy_pair() { - let test_input = vec![SegmentMeta::new(SegmentId::generate_random(), 10), - SegmentMeta::new(SegmentId::generate_random(), 10), - SegmentMeta::new(SegmentId::generate_random(), 10)]; + let test_input = vec![seg_meta(10), + seg_meta(10), + seg_meta(10)]; let result_list = test_merge_policy().compute_merge_candidates(&test_input); assert_eq!(result_list.len(), 1); } @@ -132,12 +141,12 @@ mod tests { #[test] fn test_log_merge_policy_levels() { // multiple levels all get merged correctly - let 
test_input = vec![SegmentMeta::new(SegmentId::generate_random(), 10), - SegmentMeta::new(SegmentId::generate_random(), 10), - SegmentMeta::new(SegmentId::generate_random(), 10), - SegmentMeta::new(SegmentId::generate_random(), 1000), - SegmentMeta::new(SegmentId::generate_random(), 1000), - SegmentMeta::new(SegmentId::generate_random(), 1000)]; + let test_input = vec![seg_meta(10), + seg_meta(10), + seg_meta(10), + seg_meta(1000), + seg_meta(1000), + seg_meta(1000)]; let result_list = test_merge_policy().compute_merge_candidates(&test_input); assert_eq!(result_list.len(), 2); } @@ -145,24 +154,24 @@ mod tests { #[test] fn test_log_merge_policy_within_levels() { // multiple levels all get merged correctly - let test_input = vec![SegmentMeta::new(SegmentId::generate_random(), 10), - SegmentMeta::new(SegmentId::generate_random(), 11), - SegmentMeta::new(SegmentId::generate_random(), 12), - SegmentMeta::new(SegmentId::generate_random(), 1000), - SegmentMeta::new(SegmentId::generate_random(), 1000), - SegmentMeta::new(SegmentId::generate_random(), 1000)]; + let test_input = vec![seg_meta(10), + seg_meta(11), + seg_meta(12), + seg_meta(1000), + seg_meta(1000), + seg_meta(1000)]; let result_list = test_merge_policy().compute_merge_candidates(&test_input); assert_eq!(result_list.len(), 2); } #[test] fn test_log_merge_policy_small_segments() { // multiple levels all get merged correctly - let test_input = vec![SegmentMeta::new(SegmentId::generate_random(), 1), - SegmentMeta::new(SegmentId::generate_random(), 1), - SegmentMeta::new(SegmentId::generate_random(), 1), - SegmentMeta::new(SegmentId::generate_random(), 2), - SegmentMeta::new(SegmentId::generate_random(), 2), - SegmentMeta::new(SegmentId::generate_random(), 2)]; + let test_input = vec![seg_meta(1), + seg_meta(1), + seg_meta(1), + seg_meta(2), + seg_meta(2), + seg_meta(2)]; let result_list = test_merge_policy().compute_merge_candidates(&test_input); assert_eq!(result_list.len(), 1); } diff --git 
a/src/indexer/segment_register.rs b/src/indexer/segment_register.rs index eb28ae5c4..ce3271efa 100644 --- a/src/indexer/segment_register.rs +++ b/src/indexer/segment_register.rs @@ -195,14 +195,14 @@ mod tests { let segment_id_merged = SegmentId::generate_random(); { - let segment_meta = SegmentMeta::new(segment_id_a, 10); + let segment_meta = SegmentMeta::new(segment_id_a); let segment_entry = SegmentEntry::new(segment_meta, delete_queue.cursor()); segment_register.add_segment_entry(segment_entry); } assert_eq!(segment_register.segment_entry(&segment_id_a).unwrap().state, SegmentState::Ready); assert_eq!(segment_register.segment_ids(), vec!(segment_id_a)); { - let segment_meta = SegmentMeta::new(segment_id_b, 20); + let segment_meta = SegmentMeta::new(segment_id_b); let segment_entry = SegmentEntry::new(segment_meta, delete_queue.cursor()); segment_register.add_segment_entry(segment_entry); } @@ -214,7 +214,7 @@ mod tests { segment_register.remove_segment(&segment_id_a); segment_register.remove_segment(&segment_id_b); { - let segment_meta_merged = SegmentMeta::new(segment_id_merged, 10 + 20); + let segment_meta_merged = SegmentMeta::new(segment_id_merged); let segment_entry = SegmentEntry::new(segment_meta_merged, delete_queue.cursor()); segment_register.add_segment_entry(segment_entry); } diff --git a/src/indexer/segment_updater.rs b/src/indexer/segment_updater.rs index 1c518dc98..b6f129323 100644 --- a/src/indexer/segment_updater.rs +++ b/src/indexer/segment_updater.rs @@ -177,13 +177,13 @@ impl SegmentUpdater { fn purge_deletes(&self, target_opstamp: u64) -> Result<()> { let uncommitted = self.0.segment_manager.segment_entries(); for mut segment_entry in uncommitted { - let mut segment = self.0.index.segment(segment_entry.meta()); - let (_, deleted_docset) = advance_deletes( + let mut segment = self.0.index.segment(segment_entry.meta().clone()); + if let Some((_, deleted_docset)) = advance_deletes( &segment, segment_entry.delete_cursor(), - 
DocToOpstampMapping::None).unwrap(); - { - let mut delete_file = segment.with_opstamp(target_opstamp).open_write(SegmentComponent::DELETE)?; + DocToOpstampMapping::None).unwrap() + { + let mut delete_file = segment.with_delete_opstamp(target_opstamp).open_write(SegmentComponent::DELETE)?; write_delete_bitset(&deleted_docset, &mut delete_file)?; } } @@ -237,20 +237,16 @@ impl SegmentUpdater { let segments: Vec = segment_metas .iter() - .map(|ref segment_meta| index.segment(segment_meta)) + .cloned() + .map(|segment_meta| index.segment(segment_meta)) .collect(); // An IndexMerger is like a "view" of our merged segments. // TODO unwrap let merger: IndexMerger = IndexMerger::open(schema, &segments[..]).expect("Creating index merger failed"); - let opstamp = segment_metas - .iter() - .map(|meta| meta.opstamp) - .max() - .unwrap(); - let mut merged_segment = index.new_segment(opstamp); + let mut merged_segment = index.new_segment(); // ... we just serialize this index merger in our new segment // to merge the two segments. 
@@ -260,7 +256,7 @@ impl SegmentUpdater { segment_id: merged_segment.id(), num_docs: num_docs, num_deleted_docs: 0u32, - opstamp: opstamp, + delete_opstamp: None, // TODO fix delete_opstamp }; // TODO fix delete cursor diff --git a/src/postings/mod.rs b/src/postings/mod.rs index b7676710f..f9898b9fc 100644 --- a/src/postings/mod.rs +++ b/src/postings/mod.rs @@ -61,7 +61,7 @@ mod tests { let text_field = schema_builder.add_text_field("text", TEXT); let schema = schema_builder.build(); let index = Index::create_in_ram(schema); - let mut segment = index.new_segment(0u64); + let mut segment = index.new_segment(); let mut posting_serializer = PostingsSerializer::open(&mut segment).unwrap(); let term = Term::from_field_text(text_field, "abc"); posting_serializer.new_term(&term, 3).unwrap(); @@ -81,7 +81,7 @@ mod tests { let text_field = schema_builder.add_text_field("text", TEXT); let schema = schema_builder.build(); let index = Index::create_in_ram(schema.clone()); - let segment = index.new_segment(0u64); + let segment = index.new_segment(); let heap = Heap::with_capacity(10_000_000); { let mut segment_writer = SegmentWriter::for_segment(&heap, segment.clone(), &schema).unwrap(); From 2fc3a505bc319f2d127755afe8ff50094484851c Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Tue, 7 Feb 2017 23:35:48 +0900 Subject: [PATCH 023/107] issue/43 refactoring segment meta --- src/core/segment.rs | 17 +++++-------- src/core/segment_meta.rs | 43 ++++++++++++++++++++++++++++----- src/core/segment_reader.rs | 2 +- src/indexer/index_writer.rs | 35 +++++++++++---------------- src/indexer/log_merge_policy.rs | 11 +++------ src/indexer/segment_updater.rs | 14 +++++------ 6 files changed, 69 insertions(+), 53 deletions(-) diff --git a/src/core/segment.rs b/src/core/segment.rs index 2dcb774ba..0ed88d3bc 100644 --- a/src/core/segment.rs +++ b/src/core/segment.rs @@ -41,25 +41,20 @@ impl Segment { pub fn schema(&self,) -> Schema { self.index.schema() } - + pub fn meta(&self,) -> &SegmentMeta 
{ &self.meta } + pub fn meta_mut(&mut self,) -> &mut SegmentMeta { + &mut self.meta + } + /// Returns the segment's id. pub fn id(&self,) -> SegmentId { self.meta.segment_id } - pub fn with_delete_opstamp(self, opstamp: u64) -> Segment { - let mut meta = self.meta; - meta.delete_opstamp = Some(opstamp); - Segment { - index: self.index, - meta: meta, - } - } - /// Returns the relative path of a component of our segment. /// /// It just joins the segment id with the extension @@ -75,7 +70,7 @@ impl Segment { STORE => ".store".to_string(), FASTFIELDS => ".fast".to_string(), FIELDNORMS => ".fieldnorm".to_string(), - DELETE => {format!(".{}.del", self.meta.delete_opstamp.unwrap_or(0))}, + DELETE => {format!(".{}.del", self.meta.delete_opstamp().unwrap_or(0))}, }); PathBuf::from(path) } diff --git a/src/core/segment_meta.rs b/src/core/segment_meta.rs index d8e9f8e6d..2f5cb9b97 100644 --- a/src/core/segment_meta.rs +++ b/src/core/segment_meta.rs @@ -1,14 +1,17 @@ use core::SegmentId; -// TODO Option +#[derive(Clone, Debug, RustcDecodable,RustcEncodable)] +struct DeleteMeta { + num_deleted_docs: u32, + opstamp: u64, +} #[derive(Clone, Debug, RustcDecodable,RustcEncodable)] pub struct SegmentMeta { pub segment_id: SegmentId, - pub num_docs: u32, - pub num_deleted_docs: u32, - pub delete_opstamp: Option, + num_docs: u32, + deletes: Option, } impl SegmentMeta { @@ -16,8 +19,36 @@ impl SegmentMeta { SegmentMeta { segment_id: segment_id, num_docs: 0, - num_deleted_docs: 0, - delete_opstamp: None, + deletes: None, } } + + pub fn id(&self) -> SegmentId { + self.segment_id + } + + pub fn num_docs(&self) -> u32 { + self.num_docs + } + + pub fn delete_opstamp(&self) -> Option { + self.deletes + .as_ref() + .map(|delete_meta| delete_meta.opstamp) + } + + pub fn has_deletes(&self) -> bool { + self.deletes.is_some() + } + + pub fn set_num_docs(&mut self, num_docs: u32) { + self.num_docs = num_docs; + } + + pub fn set_deletes(&mut self, num_deleted_docs: u32, opstamp: u64) { + 
self.deletes = Some(DeleteMeta { + num_deleted_docs: num_deleted_docs, + opstamp: opstamp, + }); + } } diff --git a/src/core/segment_reader.rs b/src/core/segment_reader.rs index af0e648a8..cb5936b8e 100644 --- a/src/core/segment_reader.rs +++ b/src/core/segment_reader.rs @@ -148,7 +148,7 @@ impl SegmentReader { // TODO 0u64 let delete_bitset = - if segment.meta().delete_opstamp.is_some() { + if segment.meta().has_deletes() { let delete_data = segment.open_read(SegmentComponent::DELETE)?; DeleteBitSet::open(delete_data) } diff --git a/src/indexer/index_writer.rs b/src/indexer/index_writer.rs index efc20d171..9b20ee2fa 100644 --- a/src/indexer/index_writer.rs +++ b/src/indexer/index_writer.rs @@ -172,34 +172,27 @@ fn index_documents(heap: &mut Heap, } } let num_docs = segment_writer.max_doc(); + + // this is ensured by the call to peek before starting + // the worker thread. assert!(num_docs > 0); - + + segment + .meta_mut() + .set_num_docs(num_docs); let last_opstamp = segment_writer.last_opstamp(); let doc_opstamps: Vec = segment_writer.finalize()?; - let segment_meta = - if let Some((last_opstamp_after_deletes, deleted_docset)) = advance_deletes(&segment, delete_cursor, DocToOpstampMapping::WithMap(doc_opstamps))? { - let mut delete_file = segment.with_delete_opstamp(last_opstamp_after_deletes).open_write(SegmentComponent::DELETE)?; - write_delete_bitset(&deleted_docset, &mut delete_file)?; - SegmentMeta { - segment_id: segment_id, - num_docs: num_docs, - num_deleted_docs: deleted_docset.len() as DocId, - delete_opstamp: Some(last_opstamp_after_deletes), - } - } - else { - SegmentMeta { - segment_id: segment_id, - num_docs: num_docs, - num_deleted_docs: 0, - delete_opstamp: None, - } - }; + if let Some((last_opstamp_after_deletes, deleted_docset)) = advance_deletes(&segment, delete_cursor, DocToOpstampMapping::WithMap(doc_opstamps))? 
{ + let num_deleted_docs = deleted_docset.len(); + segment.meta_mut().set_deletes(num_deleted_docs as u32, last_opstamp_after_deletes); + let mut delete_file = segment.open_write(SegmentComponent::DELETE)?; + write_delete_bitset(&deleted_docset, &mut delete_file)?; + } - let segment_entry = SegmentEntry::new(segment_meta, delete_cursor.clone()); + let segment_entry = SegmentEntry::new(segment.meta().clone(), delete_cursor.clone()); segment_updater .add_segment(generation, segment_entry) diff --git a/src/indexer/log_merge_policy.rs b/src/indexer/log_merge_policy.rs index cc4cd56bc..210d82ed3 100644 --- a/src/indexer/log_merge_policy.rs +++ b/src/indexer/log_merge_policy.rs @@ -53,7 +53,7 @@ impl MergePolicy for LogMergePolicy { } let mut size_sorted_tuples = segments.iter() - .map(|x| x.num_docs) + .map(|x| x.num_docs()) .enumerate() .collect::>(); @@ -121,12 +121,9 @@ mod tests { } fn seg_meta(num_docs: u32) -> SegmentMeta { - SegmentMeta { - segment_id: SegmentId::generate_random(), - num_docs: num_docs, - num_deleted_docs: 0u32, - delete_opstamp: None, - } + let mut segment_metas = SegmentMeta::new(SegmentId::generate_random()); + segment_metas.set_num_docs(num_docs); + segment_metas } #[test] diff --git a/src/indexer/segment_updater.rs b/src/indexer/segment_updater.rs index b6f129323..ed097e317 100644 --- a/src/indexer/segment_updater.rs +++ b/src/indexer/segment_updater.rs @@ -183,7 +183,11 @@ impl SegmentUpdater { segment_entry.delete_cursor(), DocToOpstampMapping::None).unwrap() { - let mut delete_file = segment.with_delete_opstamp(target_opstamp).open_write(SegmentComponent::DELETE)?; + let num_deleted_docs = deleted_docset.len(); + // TODO previous mask? 
+ // TODO save the resulting segment_entry + segment.meta_mut().set_deletes(num_deleted_docs as u32, target_opstamp); + let mut delete_file = segment.open_write(SegmentComponent::DELETE)?; write_delete_bitset(&deleted_docset, &mut delete_file)?; } } @@ -252,12 +256,8 @@ impl SegmentUpdater { // to merge the two segments. let segment_serializer = SegmentSerializer::for_segment(&mut merged_segment).expect("Creating index serializer failed"); let num_docs = merger.write(segment_serializer).expect("Serializing merged index failed"); - let segment_meta = SegmentMeta { - segment_id: merged_segment.id(), - num_docs: num_docs, - num_deleted_docs: 0u32, - delete_opstamp: None, // TODO fix delete_opstamp - }; + let mut segment_meta = SegmentMeta::new(merged_segment.id()); + segment_meta.set_num_docs(num_docs); // TODO fix delete cursor let delete_queue = DeleteQueue::default(); From 72afbb28c7535bf4ee099ad98f6852ee2084f450 Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Wed, 8 Feb 2017 17:45:14 +0900 Subject: [PATCH 024/107] issue/43 test passing --- src/core/index.rs | 12 +++---- src/core/index_meta.rs | 6 ++-- src/core/segment_id.rs | 3 -- src/core/segment_reader.rs | 2 -- src/indexer/index_writer.rs | 62 +++++++++++++++----------------- src/indexer/segment_manager.rs | 10 +++--- src/indexer/segment_register.rs | 2 +- src/indexer/segment_updater.rs | 64 +++++++++++++++------------------ src/indexer/segment_writer.rs | 7 +--- src/schema/schema.rs | 3 -- 10 files changed, 72 insertions(+), 99 deletions(-) diff --git a/src/core/index.rs b/src/core/index.rs index b287bd429..483d0eaba 100644 --- a/src/core/index.rs +++ b/src/core/index.rs @@ -21,7 +21,7 @@ use core::IndexMeta; use core::META_FILEPATH; use super::segment::create_segment; use indexer::segment_updater::save_new_metas; -use directory::error::{FileError, OpenWriteError}; +use directory::error::FileError; const NUM_SEARCHERS: usize = 12; @@ -185,7 +185,7 @@ impl Index { pub fn searchable_segments(&self) -> Result> 
{ let metas = load_metas(self.directory())?; Ok(metas - .committed_segments + .segments .into_iter() .map(|segment_meta| self.segment(segment_meta)) .collect()) @@ -221,15 +221,15 @@ impl Index { } /// Reads the meta.json and returns the list of - /// committed segments. - pub fn committed_segments(&self) -> Result> { - Ok(load_metas(self.directory())?.committed_segments) + /// segments in the last commit. + pub fn segments(&self) -> Result> { + Ok(load_metas(self.directory())?.segments) } /// Returns the list of segment ids that are searchable. pub fn searchable_segment_ids(&self) -> Result> { Ok(load_metas(self.directory())? - .committed_segments + .segments .iter() .map(|segment_meta| segment_meta.segment_id) .collect()) diff --git a/src/core/index_meta.rs b/src/core/index_meta.rs index c97ed9570..8a0274b4e 100644 --- a/src/core/index_meta.rs +++ b/src/core/index_meta.rs @@ -11,8 +11,7 @@ use core::SegmentMeta; /// #[derive(Clone,Debug,RustcDecodable,RustcEncodable)] pub struct IndexMeta { - pub committed_segments: Vec, - pub uncommitted_segments: Vec, + pub segments: Vec, pub schema: Schema, pub opstamp: u64, } @@ -20,8 +19,7 @@ pub struct IndexMeta { impl IndexMeta { pub fn with_schema(schema: Schema) -> IndexMeta { IndexMeta { - committed_segments: Vec::new(), - uncommitted_segments: Vec::new(), + segments: vec!(), schema: schema, opstamp: 0u64, } diff --git a/src/core/segment_id.rs b/src/core/segment_id.rs index 263d94c58..db8a3d822 100644 --- a/src/core/segment_id.rs +++ b/src/core/segment_id.rs @@ -1,11 +1,8 @@ use uuid::Uuid; use std::fmt; use rustc_serialize::{Encoder, Decoder, Encodable, Decodable}; -use core::SegmentComponent; -use std::path::PathBuf; use std::cmp::{Ordering, Ord}; - #[cfg(test)] use std::sync::atomic; diff --git a/src/core/segment_reader.rs b/src/core/segment_reader.rs index cb5936b8e..a953a6810 100644 --- a/src/core/segment_reader.rs +++ b/src/core/segment_reader.rs @@ -3,13 +3,11 @@ use core::Segment; use core::SegmentId; use 
core::SegmentComponent; use schema::Term; -use bit_set::BitSet; use common::HasLen; use fastfield::delete::DeleteBitSet; use store::StoreReader; use schema::Document; use directory::ReadOnlySource; -use directory::error::FileError; use DocId; use std::io; use std::str; diff --git a/src/indexer/index_writer.rs b/src/indexer/index_writer.rs index 9b20ee2fa..14a9344d3 100644 --- a/src/indexer/index_writer.rs +++ b/src/indexer/index_writer.rs @@ -19,7 +19,6 @@ use super::directory_lock::DirectoryLock; use futures::Future; use std::clone::Clone; use std::io; -use fastfield::delete; use std::thread; use futures::Canceled; use std::mem; @@ -27,7 +26,6 @@ use datastruct::stacker::Heap; use core::SegmentReader; use std::mem::swap; use chan; -use core::SegmentMeta; use super::delete_queue::{DeleteQueue, DeleteQueueCursor}; use super::segment_updater::SegmentUpdater; use Result; @@ -110,18 +108,17 @@ impl DocToOpstampMapping { } - /// TODO /// work on SegmentMeta pub fn advance_deletes( - segment: &Segment, + segment: &mut Segment, delete_cursor: &mut DeleteQueueCursor, - doc_opstamps: DocToOpstampMapping) -> Result> { + doc_opstamps: DocToOpstampMapping) -> Result { let segment_reader = SegmentReader::open(segment.clone())?; let mut delete_bitset = BitSet::with_capacity(segment_reader.max_doc() as usize); let mut last_opstamp_opt: Option = None; - for delete_op in delete_cursor { + while let Some(delete_op) = delete_cursor.next() { // A delete operation should only affect // document that were inserted after it. 
// @@ -145,11 +142,14 @@ pub fn advance_deletes( delete_bitset.insert(doc as usize); } } - Ok(Some((last_opstamp, delete_bitset))) - } - else { - Ok(None) + let num_deleted_docs = delete_bitset.len(); + segment.meta_mut().set_deletes(num_deleted_docs as u32, last_opstamp); + let mut delete_file = segment.open_write(SegmentComponent::DELETE)?; + write_delete_bitset(&delete_bitset, &mut delete_file)?; } + + Ok(SegmentEntry::new(segment.meta().clone(), delete_cursor.clone())) + } fn index_documents(heap: &mut Heap, @@ -161,7 +161,6 @@ fn index_documents(heap: &mut Heap, delete_cursor: &mut DeleteQueueCursor) -> Result { heap.clear(); - let segment_id = segment.id(); let mut segment_writer = try!(SegmentWriter::for_segment(heap, segment.clone(), &schema)); for doc in document_iterator { try!(segment_writer.add_document(&doc, &schema)); @@ -180,19 +179,10 @@ fn index_documents(heap: &mut Heap, segment .meta_mut() .set_num_docs(num_docs); - - let last_opstamp = segment_writer.last_opstamp(); let doc_opstamps: Vec = segment_writer.finalize()?; - if let Some((last_opstamp_after_deletes, deleted_docset)) = advance_deletes(&segment, delete_cursor, DocToOpstampMapping::WithMap(doc_opstamps))? { - let num_deleted_docs = deleted_docset.len(); - segment.meta_mut().set_deletes(num_deleted_docs as u32, last_opstamp_after_deletes); - let mut delete_file = segment.open_write(SegmentComponent::DELETE)?; - write_delete_bitset(&deleted_docset, &mut delete_file)?; - } - - let segment_entry = SegmentEntry::new(segment.meta().clone(), delete_cursor.clone()); + let segment_entry = advance_deletes(&mut segment, delete_cursor, DocToOpstampMapping::WithMap(doc_opstamps))?; segment_updater .add_segment(generation, segment_entry) @@ -262,9 +252,15 @@ impl IndexWriter { // this is a valid guarantee as the // peeked document now belongs to // our local iterator. 
- let opstamp: u64; - if let Some(operation) = document_iterator.peek() { - opstamp = operation.opstamp; + if document_iterator.peek().is_some() { + let segment = index.new_segment(); + index_documents(&mut heap, + segment, + &schema, + generation, + &mut document_iterator, + &mut segment_updater, + &mut delete_cursor_clone)?; } else { // No more documents. @@ -273,14 +269,7 @@ impl IndexWriter { return Ok(()) } - let segment = index.new_segment(); - let valid_generation = index_documents(&mut heap, - segment, - &schema, - generation, - &mut document_iterator, - &mut segment_updater, - &mut delete_cursor_clone)?; + } })?; self.worker_id += 1; @@ -402,7 +391,9 @@ impl IndexWriter { // by updating the generation in the segment updater, // pending add segment commands will be dismissed. self.generation += 1; - let rollback_future = self.segment_updater.new_generation(self.generation); + + // TODO requires a new delete queue... + let rollback_future = self.segment_updater.rollback(self.generation); // we cannot drop segment ready receiver yet // as it would block the workers. @@ -487,12 +478,15 @@ impl IndexWriter { // committed segments. 
self.committed_opstamp = self.stamp(); - let future = self.segment_updater.commit(self.committed_opstamp); + let new_delete_queue = DeleteQueue::default(); + + let future = self.segment_updater.commit(self.committed_opstamp, new_delete_queue.cursor()); // wait for the segment update thread to have processed the info // TODO remove unwrap future.wait().unwrap(); + self.delete_queue = new_delete_queue; Ok(self.committed_opstamp) } diff --git a/src/indexer/segment_manager.rs b/src/indexer/segment_manager.rs index af180e2b3..550b75382 100644 --- a/src/indexer/segment_manager.rs +++ b/src/indexer/segment_manager.rs @@ -101,13 +101,13 @@ impl SegmentManager { segment_ids } - pub fn commit(&self) { + pub fn commit(&self, segment_entries: Vec) { let mut registers_lock = self.write(); - let segment_entries = registers_lock.uncommitted.segment_entries(); + registers_lock.committed.clear(); + registers_lock.uncommitted.clear(); for segment_entry in segment_entries { registers_lock.committed.add_segment_entry(segment_entry); } - registers_lock.uncommitted.clear(); } pub fn start_merge(&self, segment_ids: &[SegmentId]) { @@ -148,9 +148,9 @@ impl SegmentManager { } } - pub fn segment_metas(&self,) -> (Vec, Vec) { + pub fn committed_segment_metas(&self,) -> Vec { let registers_lock = self.read(); - (registers_lock.committed.segment_metas(), registers_lock.uncommitted.segment_metas()) + registers_lock.committed.segment_metas() } } diff --git a/src/indexer/segment_register.rs b/src/indexer/segment_register.rs index ce3271efa..ac1205b0c 100644 --- a/src/indexer/segment_register.rs +++ b/src/indexer/segment_register.rs @@ -105,7 +105,7 @@ impl SegmentRegister { .collect() } - pub fn segment_entries(&self,) -> Vec{ + pub fn segment_entries(&self,) -> Vec { self.segment_states .values() .cloned() diff --git a/src/indexer/segment_updater.rs b/src/indexer/segment_updater.rs index ed097e317..c77e3fc24 100644 --- a/src/indexer/segment_updater.rs +++ b/src/indexer/segment_updater.rs @@ 
-10,11 +10,9 @@ use std::mem; use std::sync::atomic::Ordering; use std::ops::DerefMut; use futures::{Future, future}; -use fastfield::delete::write_delete_bitset; use futures::oneshot; use futures::Canceled; use std::thread; -use core::SegmentComponent; use std::sync::atomic::AtomicUsize; use std::sync::RwLock; use core::SerializableSegment; @@ -39,11 +37,9 @@ use std::io::Write; use super::segment_manager::{SegmentManager, get_segments}; -fn create_metas(segment_manager: &SegmentManager, schema: Schema, opstamp: u64) -> IndexMeta { - let (committed_segments, uncommitted_segments) = segment_manager.segment_metas(); +fn create_metas(metas: Vec, schema: Schema, opstamp: u64) -> IndexMeta { IndexMeta { - committed_segments: committed_segments, - uncommitted_segments: uncommitted_segments, + segments: metas, schema: schema, opstamp: opstamp, } @@ -63,8 +59,7 @@ pub fn save_new_metas(schema: Schema, opstamp: u64, directory: &mut Directory) -> Result<()> { - let segment_manager = SegmentManager::default(); - save_metas(&segment_manager, schema, opstamp, directory) + save_metas(vec!(), schema, opstamp, directory) } @@ -78,12 +73,12 @@ pub fn save_new_metas(schema: Schema, /// and flushed. 
/// /// This method is not part of tantivy's public API -pub fn save_metas(segment_manager: &SegmentManager, +pub fn save_metas(segment_metas: Vec, schema: Schema, opstamp: u64, directory: &mut Directory) -> Result<()> { - let metas = create_metas(segment_manager, schema, opstamp); + let metas = create_metas(segment_metas, schema, opstamp); let mut w = Vec::new(); try!(write!(&mut w, "{}\n", json::as_pretty_json(&metas))); Ok(directory @@ -119,8 +114,8 @@ impl SegmentUpdater { delete_cursor: DeleteQueueCursor) -> Result { - let committed_segments = index.committed_segments()?; - let segment_manager = SegmentManager::from_segments(committed_segments, delete_cursor); + let segments = index.segments()?; + let segment_manager = SegmentManager::from_segments(segments, delete_cursor); Ok( SegmentUpdater(Arc::new(InnerSegmentUpdater { pool: CpuPool::new(1), @@ -174,33 +169,31 @@ impl SegmentUpdater { } } - fn purge_deletes(&self, target_opstamp: u64) -> Result<()> { - let uncommitted = self.0.segment_manager.segment_entries(); - for mut segment_entry in uncommitted { - let mut segment = self.0.index.segment(segment_entry.meta().clone()); - if let Some((_, deleted_docset)) = advance_deletes( - &segment, - segment_entry.delete_cursor(), - DocToOpstampMapping::None).unwrap() - { - let num_deleted_docs = deleted_docset.len(); - // TODO previous mask? 
- // TODO save the resulting segment_entry - segment.meta_mut().set_deletes(num_deleted_docs as u32, target_opstamp); - let mut delete_file = segment.open_write(SegmentComponent::DELETE)?; - write_delete_bitset(&deleted_docset, &mut delete_file)?; - } - } - Ok(()) + fn purge_deletes(&self) -> Result> { + let segment_entries = self.0.segment_manager.segment_entries(); + segment_entries + .into_iter() + .map(|mut segment_entry| { + let mut segment = self.0.index.segment(segment_entry.meta().clone()); + advance_deletes(&mut segment, segment_entry.delete_cursor(), DocToOpstampMapping::None) + .map(|entry| entry.meta().clone()) + }) + .collect() } - pub fn commit(&self, opstamp: u64) -> impl Future { + pub fn commit(&self, opstamp: u64, new_delete_queue: DeleteQueueCursor) -> impl Future { self.run_async(move |segment_updater| { - segment_updater.purge_deletes(opstamp).expect("Failed purge deletes"); - segment_updater.0.segment_manager.commit(); + let segment_metas = segment_updater.purge_deletes().expect("Failed purge deletes"); + + let segment_entries = segment_metas.into_iter() + .map(|segment_meta| + SegmentEntry::new(segment_meta, new_delete_queue.clone()) + ) + .collect::>(); + segment_updater.0.segment_manager.commit(segment_entries); let mut directory = segment_updater.0.index.directory().box_clone(); save_metas( - &segment_updater.0.segment_manager, + segment_updater.0.segment_manager.committed_segment_metas(), segment_updater.0.index.schema(), opstamp, directory.borrow_mut()).expect("Could not save metas."); @@ -297,8 +290,9 @@ impl SegmentUpdater { self.run_async(move |segment_updater| { segment_updater.0.segment_manager.end_merge(&merged_segment_metas, resulting_segment_entry); let mut directory = segment_updater.0.index.directory().box_clone(); + let segment_metas = segment_updater.0.segment_manager.committed_segment_metas(); save_metas( - &segment_updater.0.segment_manager, + segment_metas, segment_updater.0.index.schema(), 
segment_updater.0.index.opstamp(), directory.borrow_mut()).expect("Could not save metas."); diff --git a/src/indexer/segment_writer.rs b/src/indexer/segment_writer.rs index d2de2b946..feae0e5e5 100644 --- a/src/indexer/segment_writer.rs +++ b/src/indexer/segment_writer.rs @@ -148,12 +148,7 @@ impl<'a> SegmentWriter<'a> { // }) // .collect::>() // } - - pub fn last_opstamp(&self) -> u64 { - *(self.doc_opstamps - .last() - .expect("Last doc opstamp called on an empty segment writer")) - } + /// Indexes a new document /// diff --git a/src/schema/schema.rs b/src/schema/schema.rs index bdd699eea..8b35aff01 100644 --- a/src/schema/schema.rs +++ b/src/schema/schema.rs @@ -329,7 +329,6 @@ mod tests { schema_builder.add_u32_field("count", count_options); let schema = schema_builder.build(); let schema_json: String = format!("{}", json::as_pretty_json(&schema)); - println!("{}", schema_json); let expected = r#"[ { "name": "title", @@ -456,7 +455,6 @@ mod tests { "author": "fulmicoton", "count": -5 }"#); - println!("{:?}", json_err); match json_err { Err(DocParsingError::ValueError(_, ValueParsingError::TypeError(_))) => { assert!(true); @@ -472,7 +470,6 @@ mod tests { "author": "fulmicoton", "count": 5000000000 }"#); - println!("{:?}", json_err); match json_err { Err(DocParsingError::ValueError(_, ValueParsingError::OverflowError(_))) => { assert!(true); From d007cf3435f2f247ae05bd564755daddddcea03c Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Thu, 9 Feb 2017 21:49:25 +0900 Subject: [PATCH 025/107] issue/43 simplification. removed the notion of delete cursor. 
--- src/core/index.rs | 2 +- src/core/segment.rs | 2 +- src/core/segment_meta.rs | 2 +- src/core/segment_reader.rs | 4 + src/directory/mod.rs | 4 +- src/indexer/delete_queue.rs | 219 ++++++-------------------------- src/indexer/index_writer.rs | 61 +++++---- src/indexer/log_merge_policy.rs | 2 +- src/indexer/segment_manager.rs | 9 +- src/indexer/segment_register.rs | 55 ++++---- src/indexer/segment_updater.rs | 57 ++++----- src/lib.rs | 72 +++++++++++ 12 files changed, 210 insertions(+), 279 deletions(-) diff --git a/src/core/index.rs b/src/core/index.rs index 483d0eaba..b4e04d0c8 100644 --- a/src/core/index.rs +++ b/src/core/index.rs @@ -231,7 +231,7 @@ impl Index { Ok(load_metas(self.directory())? .segments .iter() - .map(|segment_meta| segment_meta.segment_id) + .map(|segment_meta| segment_meta.id()) .collect()) } diff --git a/src/core/segment.rs b/src/core/segment.rs index 0ed88d3bc..dcf5ec116 100644 --- a/src/core/segment.rs +++ b/src/core/segment.rs @@ -52,7 +52,7 @@ impl Segment { /// Returns the segment's id. pub fn id(&self,) -> SegmentId { - self.meta.segment_id + self.meta.id() } /// Returns the relative path of a component of our segment. diff --git a/src/core/segment_meta.rs b/src/core/segment_meta.rs index 2f5cb9b97..40142c1a4 100644 --- a/src/core/segment_meta.rs +++ b/src/core/segment_meta.rs @@ -9,7 +9,7 @@ struct DeleteMeta { #[derive(Clone, Debug, RustcDecodable,RustcEncodable)] pub struct SegmentMeta { - pub segment_id: SegmentId, + segment_id: SegmentId, num_docs: u32, deletes: Option, } diff --git a/src/core/segment_reader.rs b/src/core/segment_reader.rs index a953a6810..020db8715 100644 --- a/src/core/segment_reader.rs +++ b/src/core/segment_reader.rs @@ -237,6 +237,10 @@ impl SegmentReader { /// Returns the posting list associated with a term. + /// + /// If the term is not found, return None. + /// Even when non-null, because of deletes, the posting object + /// returned by this method may contain no documents. 
pub fn read_postings_all_info(&self, term: &Term) -> Option { let field_entry = self.schema.get_field_entry(term.field()); let segment_posting_option = match *field_entry.field_type() { diff --git a/src/directory/mod.rs b/src/directory/mod.rs index d6873ecc8..e03435199 100644 --- a/src/directory/mod.rs +++ b/src/directory/mod.rs @@ -111,7 +111,7 @@ mod tests { } } - fn test_delete(directory: &mut Directory) { + fn test_directory_delete(directory: &mut Directory) { assert!(directory.open_read(*TEST_PATH).is_err()); let mut write_file = directory.open_write(*TEST_PATH).unwrap(); write_file.write_all(&[1, 2, 3, 4]).unwrap(); @@ -131,7 +131,7 @@ mod tests { test_seek(directory); test_rewrite_forbidden(directory); test_write_create_the_file(directory); - test_delete(directory); + test_directory_delete(directory); } } diff --git a/src/indexer/delete_queue.rs b/src/indexer/delete_queue.rs index fd1e4f59b..2a2d88abd 100644 --- a/src/indexer/delete_queue.rs +++ b/src/indexer/delete_queue.rs @@ -1,175 +1,26 @@ -use schema::Term; -use std::sync::{Arc, RwLock}; use super::operation::DeleteOperation; -const BLOCK_SIZE: usize = 128; - - -/// DeleteQueue are implemented as an unrolled linked list. -/// Block implements a block of this unrolled linked list. -struct Block { - operations: Vec, - next: Option, -} - -impl Default for Block { - fn default() -> Block { - Block { - operations: Vec::with_capacity(BLOCK_SIZE), - next: None - } - } -} - -/// A shared block wraps a block +// TODO remove clone #[derive(Clone)] -struct SharedBlock(Arc>); - -impl SharedBlock { - // Happens a new element to the block and return - // what the new head is. 
- fn enqueue(&self, delete_operation: DeleteOperation) -> Option { - let mut writable_block = self.0.write().expect("Panicked while enqueueing in the delete queue."); - if writable_block.operations.len() >= BLOCK_SIZE { - let next_block = SharedBlock::default(); - next_block.enqueue(delete_operation); - writable_block.next = Some(next_block.clone()); - Some(next_block) - } - else { - writable_block.operations.push(delete_operation); - None - } - } - - fn next_block(&self) -> Option { - self.0 - .read() - .unwrap() - .next - .clone() - } - - fn cursor(&self,) -> DeleteQueueCursor { - let len = self.0 - .read() - .expect("Panicked while reading a block in the delete queue.") - .operations - .len(); - DeleteQueueCursor { - block: self.clone(), - pos: len, - } - } -} - -impl Default for SharedBlock { - fn default() -> SharedBlock { - SharedBlock(Arc::default()) - } -} - -impl Default for DeleteQueue { - fn default() -> DeleteQueue { - DeleteQueue { - writing_head: SharedBlock::default(), - } - } -} - -#[derive(Clone)] -pub struct DeleteQueueCursor { - block: SharedBlock, - pos: usize, -} - -impl DeleteQueueCursor { - - /// Skips to the first delete operation which has - /// a timestamp that is greater or equal to opstamp. - /// - /// Returns false in the DeleteQueue reaches its end before - /// meeting such an element. - pub fn skip_to(&mut self, opstamp: u64) -> bool { - // TODO optimize - while let Some(delete_operation) = self.peek() { - if delete_operation.opstamp >= opstamp { - return true; - } - else { - self.next(); - } - } - return false; - } - - pub fn peek(&mut self) -> Option { - if self.pos >= BLOCK_SIZE { - self.pos = 0; - match self.block.next_block() { - Some(next_block) => { - self.block = next_block; - self.pos = 0; - } - None => { - // there is no next block. 
- return None; - } - } - } - let readable_block = self.block.0 - .read() - .unwrap(); - if self.pos >= readable_block.operations.len() { - None - } - else { - Some(readable_block.operations[self.pos].clone()) - } - } - -} - -impl Iterator for DeleteQueueCursor { - type Item = DeleteOperation; - - /// Returns a delete operation if an operation is available, - /// None if the queue is empty. - /// - /// This iterator may return None once, and return - /// `Some(...)` ulteriorily. - fn next(&mut self) -> Option { - let delete_position = self.peek(); - if delete_position.is_some() { - self.pos += 1; - } - delete_position - } -} - -// ---------------------------------------- - pub struct DeleteQueue { - writing_head: SharedBlock, + delete_operations: Vec, } impl DeleteQueue { - - pub fn cursor(&self) -> DeleteQueueCursor { - self.writing_head.cursor() + + pub fn new() -> DeleteQueue { + DeleteQueue { + delete_operations: vec!(), + } } pub fn push_op(&mut self, delete_operation: DeleteOperation) { - if let Some(new_head) = self.writing_head.enqueue(delete_operation) { - self.writing_head = new_head; - } + self.delete_operations.push(delete_operation); } - pub fn push(&mut self, opstamp: u64, term: Term) { - let delete_operation = DeleteOperation { - opstamp: opstamp, - term: term, - }; - self.push_op(delete_operation); + + pub fn operations(&self,) -> impl Iterator { + // TODO fix iterator + self.delete_operations.clone().into_iter() } } @@ -183,7 +34,7 @@ mod tests { #[test] fn test_deletequeue() { - let mut delete_queue = DeleteQueue::default(); + let mut delete_queue = DeleteQueue::new(); let make_op = |i: usize| { let field = Field(1u8); @@ -196,36 +47,38 @@ mod tests { delete_queue.push_op(make_op(1)); delete_queue.push_op(make_op(2)); - let mut delete_cursor_3 = delete_queue.cursor(); - let mut delete_cursor_3_b = delete_cursor_3.clone(); + // TODO unit tests + + // let mut delete_cursor_3 = delete_queue.cursor(); + // let mut delete_cursor_3_b = 
delete_cursor_3.clone(); - assert!(delete_cursor_3.next().is_none()); - assert!(delete_cursor_3.peek().is_none()); + // assert!(delete_cursor_3.next().is_none()); + // assert!(delete_cursor_3.peek().is_none()); - delete_queue.push_op(make_op(3)); - delete_queue.push_op(make_op(4)); + // delete_queue.push_op(make_op(3)); + // delete_queue.push_op(make_op(4)); - assert_eq!(delete_cursor_3_b.peek(), Some(make_op(3))); - let mut delete_cursor_3_c = delete_cursor_3_b.clone(); + // assert_eq!(delete_cursor_3_b.peek(), Some(make_op(3))); + // let mut delete_cursor_3_c = delete_cursor_3_b.clone(); - assert_eq!(delete_cursor_3_b.next(), Some(make_op(3))); - let mut delete_cursor_4 = delete_cursor_3_b.clone(); + // assert_eq!(delete_cursor_3_b.next(), Some(make_op(3))); + // let mut delete_cursor_4 = delete_cursor_3_b.clone(); - assert_eq!(delete_cursor_3_b.peek(), Some(make_op(4))); - assert_eq!(delete_cursor_3_b.next(), Some(make_op(4))); + // assert_eq!(delete_cursor_3_b.peek(), Some(make_op(4))); + // assert_eq!(delete_cursor_3_b.next(), Some(make_op(4))); - assert_eq!(delete_cursor_3_c.next(), Some(make_op(3))); + // assert_eq!(delete_cursor_3_c.next(), Some(make_op(3))); - assert!(delete_cursor_3_b.next().is_none()); - assert_eq!(delete_cursor_3_c.next(), Some(make_op(4))); - assert!(delete_cursor_3_c.next().is_none()); + // assert!(delete_cursor_3_b.next().is_none()); + // assert_eq!(delete_cursor_3_c.next(), Some(make_op(4))); + // assert!(delete_cursor_3_c.next().is_none()); - assert_eq!(delete_cursor_3.peek(), Some(make_op(3))); - assert_eq!(delete_cursor_3.next(), Some(make_op(3))); - assert!(delete_cursor_3_b.next().is_none()); + // assert_eq!(delete_cursor_3.peek(), Some(make_op(3))); + // assert_eq!(delete_cursor_3.next(), Some(make_op(3))); + // assert!(delete_cursor_3_b.next().is_none()); - assert_eq!(delete_cursor_4.next(), Some(make_op(4))); - assert!(delete_cursor_4.next().is_none()); + // assert_eq!(delete_cursor_4.next(), Some(make_op(4))); + // 
assert!(delete_cursor_4.next().is_none()); } diff --git a/src/indexer/index_writer.rs b/src/indexer/index_writer.rs index 14a9344d3..bdbe39f29 100644 --- a/src/indexer/index_writer.rs +++ b/src/indexer/index_writer.rs @@ -3,7 +3,10 @@ use schema::Document; use super::operation::AddOperation; use core::Index; use core::Segment; +use core::SegmentMeta; +use std::sync::Arc; use core::SegmentId; +use indexer::operation::DeleteOperation; use schema::Term; use indexer::SegmentEntry; use std::thread::JoinHandle; @@ -18,15 +21,15 @@ use core::SegmentComponent; use super::directory_lock::DirectoryLock; use futures::Future; use std::clone::Clone; +use indexer::delete_queue::DeleteQueue; use std::io; use std::thread; use futures::Canceled; use std::mem; use datastruct::stacker::Heap; use core::SegmentReader; -use std::mem::swap; +use std::mem::swap; use chan; -use super::delete_queue::{DeleteQueue, DeleteQueueCursor}; use super::segment_updater::SegmentUpdater; use Result; use Error; @@ -88,11 +91,19 @@ impl !Send for IndexWriter {} impl !Sync for IndexWriter {} +// TODO move doc to opstamp mapping to its own file +#[derive(Clone)] pub enum DocToOpstampMapping { - WithMap(Vec), + WithMap(Arc>), None } +impl From> for DocToOpstampMapping { + fn from(opstamps: Vec) -> DocToOpstampMapping { + DocToOpstampMapping::WithMap(Arc::new(opstamps)) + } +} + impl DocToOpstampMapping { fn compute_doc_limit(&self, opstamp: u64) -> DocId { match *self { @@ -112,13 +123,14 @@ impl DocToOpstampMapping { /// work on SegmentMeta pub fn advance_deletes( segment: &mut Segment, - delete_cursor: &mut DeleteQueueCursor, - doc_opstamps: DocToOpstampMapping) -> Result { + delete_queue: &DeleteQueue, + doc_opstamps: &DocToOpstampMapping) -> Result { let segment_reader = SegmentReader::open(segment.clone())?; let mut delete_bitset = BitSet::with_capacity(segment_reader.max_doc() as usize); - + let mut last_opstamp_opt: Option = None; - while let Some(delete_op) = delete_cursor.next() { + + for delete_op 
in delete_queue.operations() { // A delete operation should only affect // document that were inserted after it. // @@ -147,9 +159,7 @@ pub fn advance_deletes( let mut delete_file = segment.open_write(SegmentComponent::DELETE)?; write_delete_bitset(&delete_bitset, &mut delete_file)?; } - - Ok(SegmentEntry::new(segment.meta().clone(), delete_cursor.clone())) - + Ok(SegmentEntry::new(segment.meta().clone())) } fn index_documents(heap: &mut Heap, @@ -157,8 +167,7 @@ fn index_documents(heap: &mut Heap, schema: &Schema, generation: usize, document_iterator: &mut Iterator, - segment_updater: &mut SegmentUpdater, - delete_cursor: &mut DeleteQueueCursor) + segment_updater: &mut SegmentUpdater) -> Result { heap.clear(); let mut segment_writer = try!(SegmentWriter::for_segment(heap, segment.clone(), &schema)); @@ -182,7 +191,10 @@ fn index_documents(heap: &mut Heap, let doc_opstamps: Vec = segment_writer.finalize()?; - let segment_entry = advance_deletes(&mut segment, delete_cursor, DocToOpstampMapping::WithMap(doc_opstamps))?; + // let segment_entry = advance_deletes(&mut segment, delete_queue, delete_position, )?; + + let mut segment_entry = SegmentEntry::new(SegmentMeta::new(segment.id())); + segment_entry.set_doc_to_opstamp(DocToOpstampMapping::from(doc_opstamps)); segment_updater .add_segment(generation, segment_entry) @@ -230,8 +242,6 @@ impl IndexWriter { // TODO fix this. the cursor might be too advanced // at this point. 
- let delete_cursor = self.delete_queue.cursor(); - let generation = self.generation; let join_handle: JoinHandle> = @@ -239,7 +249,6 @@ impl IndexWriter { .name(format!("indexing thread {} for gen {}", self.worker_id, generation)) .spawn(move || { - let mut delete_cursor_clone = delete_cursor.clone(); loop { let mut document_iterator = document_receiver_clone.clone() .into_iter() @@ -259,8 +268,7 @@ impl IndexWriter { &schema, generation, &mut document_iterator, - &mut segment_updater, - &mut delete_cursor_clone)?; + &mut segment_updater)?; } else { // No more documents. @@ -308,9 +316,9 @@ impl IndexWriter { chan::sync(PIPELINE_MAX_SIZE_IN_DOCS); - let delete_queue = DeleteQueue::default(); + let delete_queue = DeleteQueue::new(); - let segment_updater = SegmentUpdater::new(index.clone(), delete_queue.cursor())?; + let segment_updater = SegmentUpdater::new(index.clone())?; let mut index_writer = IndexWriter { @@ -429,6 +437,8 @@ impl IndexWriter { Error::ErrorInThread("Error while waiting for rollback.".to_string()) )?; + self.delete_queue = DeleteQueue::new(); + // reset the opstamp self.uncommitted_opstamp = self.committed_opstamp; Ok(self.committed_opstamp) @@ -478,9 +488,10 @@ impl IndexWriter { // committed segments. 
self.committed_opstamp = self.stamp(); - let new_delete_queue = DeleteQueue::default(); + let new_delete_queue = DeleteQueue::new(); - let future = self.segment_updater.commit(self.committed_opstamp, new_delete_queue.cursor()); + // TODO remove clone + let future = self.segment_updater.commit(self.delete_queue.clone(), self.committed_opstamp); // wait for the segment update thread to have processed the info // TODO remove unwrap @@ -493,7 +504,11 @@ impl IndexWriter { pub fn delete_term(&mut self, term: Term) { let opstamp = self.stamp(); - self.delete_queue.push(opstamp, term); + let delete_operation = DeleteOperation { + opstamp: opstamp, + term: term, + }; + self.delete_queue.push_op(delete_operation); } fn stamp(&mut self) -> u64 { diff --git a/src/indexer/log_merge_policy.rs b/src/indexer/log_merge_policy.rs index 210d82ed3..413964767 100644 --- a/src/indexer/log_merge_policy.rs +++ b/src/indexer/log_merge_policy.rs @@ -79,7 +79,7 @@ impl MergePolicy for LogMergePolicy { .filter(|level| level.len() >= self.min_merge_size) .map(|ind_vec| { MergeCandidate(ind_vec.iter() - .map(|&ind| segments[ind].segment_id) + .map(|&ind| segments[ind].id()) .collect()) }) .collect() diff --git a/src/indexer/segment_manager.rs b/src/indexer/segment_manager.rs index 550b75382..8d01dee1a 100644 --- a/src/indexer/segment_manager.rs +++ b/src/indexer/segment_manager.rs @@ -3,7 +3,6 @@ use std::sync::RwLock; use core::SegmentMeta; use core::SegmentId; use indexer::SegmentEntry; -use indexer::delete_queue::DeleteQueueCursor; use std::sync::{RwLockReadGuard, RwLockWriteGuard}; use std::fmt::{self, Debug, Formatter}; @@ -52,11 +51,11 @@ pub fn get_segments(segment_manager: &SegmentManager,) -> (Vec, Vec impl SegmentManager { - pub fn from_segments(segment_metas: Vec, delete_cursor: DeleteQueueCursor) -> SegmentManager { + pub fn from_segments(segment_metas: Vec) -> SegmentManager { SegmentManager { - registers: RwLock::new( SegmentRegisters { + registers: RwLock::new(SegmentRegisters { 
uncommitted: SegmentRegister::default(), - committed: SegmentRegister::new(segment_metas, delete_cursor), + committed: SegmentRegister::new(segment_metas), }), } } @@ -131,7 +130,7 @@ impl SegmentManager { pub fn end_merge(&self, merged_segment_metas: &[SegmentMeta], merged_segment_entry: SegmentEntry) { let mut registers_lock = self.write(); - let merged_segment_ids: Vec = merged_segment_metas.iter().map(|meta| meta.segment_id).collect(); + let merged_segment_ids: Vec = merged_segment_metas.iter().map(|meta| meta.id()).collect(); if registers_lock.uncommitted.contains_all(&merged_segment_ids) { for segment_id in &merged_segment_ids { registers_lock.uncommitted.remove_segment(segment_id); diff --git a/src/indexer/segment_register.rs b/src/indexer/segment_register.rs index ac1205b0c..fedb53ac1 100644 --- a/src/indexer/segment_register.rs +++ b/src/indexer/segment_register.rs @@ -1,9 +1,10 @@ use core::SegmentId; use std::collections::HashMap; use core::SegmentMeta; +use indexer::index_writer::DocToOpstampMapping; use std::fmt; use std::fmt::{Debug, Formatter}; -use indexer::delete_queue::DeleteQueueCursor; + #[derive(Clone, PartialEq, Eq, Debug)] pub enum SegmentState { @@ -24,17 +25,29 @@ impl SegmentState { pub struct SegmentEntry { meta: SegmentMeta, state: SegmentState, - delete_cursor: DeleteQueueCursor, + doc_to_opstamp: DocToOpstampMapping, } impl SegmentEntry { - pub fn segment_id(&self) -> SegmentId { - self.meta.segment_id + pub fn new(segment_meta: SegmentMeta) -> SegmentEntry { + SegmentEntry { + meta: segment_meta, + state: SegmentState::Ready, + doc_to_opstamp: DocToOpstampMapping::None, + } } - pub fn delete_cursor(&mut self) -> &mut DeleteQueueCursor { - &mut self.delete_cursor + pub fn doc_to_opstamp(&self) -> &DocToOpstampMapping { + &self.doc_to_opstamp + } + + pub fn set_doc_to_opstamp(&mut self, doc_to_opstamp: DocToOpstampMapping) { + self.doc_to_opstamp = doc_to_opstamp; + } + + pub fn segment_id(&self) -> SegmentId { + self.meta.id() } pub fn 
meta(&self) -> &SegmentMeta { @@ -48,15 +61,6 @@ impl SegmentEntry { fn is_ready(&self,) -> bool { self.state == SegmentState::Ready } - - pub fn new(segment_meta: SegmentMeta, - delete_cursor: DeleteQueueCursor) -> SegmentEntry { - SegmentEntry { - meta: segment_meta, - state: SegmentState::Ready, - delete_cursor: delete_cursor, - } - } } impl Debug for SegmentEntry { @@ -117,14 +121,14 @@ impl SegmentRegister { .values() .map(|segment_entry| segment_entry.meta.clone()) .collect(); - segment_ids.sort_by_key(|meta| meta.segment_id); + segment_ids.sort_by_key(|meta| meta.id()); segment_ids } pub fn segment_ids(&self,) -> Vec { self.segment_metas() .into_iter() - .map(|segment_meta| segment_meta.segment_id) + .map(|segment_meta| segment_meta.id()) .collect() } @@ -141,7 +145,7 @@ impl SegmentRegister { } pub fn add_segment_entry(&mut self, segment_entry: SegmentEntry) { - let segment_id = segment_entry.meta.segment_id; + let segment_id = segment_entry.meta.id(); self.segment_states.insert(segment_id, segment_entry); } @@ -156,13 +160,13 @@ impl SegmentRegister { .start_merge(); } - pub fn new(segment_metas: Vec, delete_cursor: DeleteQueueCursor) -> SegmentRegister { + pub fn new(segment_metas: Vec) -> SegmentRegister { SegmentRegister { segment_states: segment_metas .into_iter() .map(|segment_meta| { - let segment_id = segment_meta.segment_id; - let segment_entry = SegmentEntry::new(segment_meta, delete_cursor.clone()); + let segment_id = segment_meta.id(); + let segment_entry = SegmentEntry::new(segment_meta ); (segment_id, segment_entry) }) .collect(), @@ -180,15 +184,12 @@ impl Default for SegmentRegister { #[cfg(test)] mod tests { - use core::SegmentId; use core::SegmentMeta; - use indexer::delete_queue::DeleteQueue; use super::*; #[test] fn test_segment_register() { - let delete_queue = DeleteQueue::default(); let mut segment_register = SegmentRegister::default(); let segment_id_a = SegmentId::generate_random(); let segment_id_b = SegmentId::generate_random(); 
@@ -196,14 +197,14 @@ mod tests { { let segment_meta = SegmentMeta::new(segment_id_a); - let segment_entry = SegmentEntry::new(segment_meta, delete_queue.cursor()); + let segment_entry = SegmentEntry::new(segment_meta); segment_register.add_segment_entry(segment_entry); } assert_eq!(segment_register.segment_entry(&segment_id_a).unwrap().state, SegmentState::Ready); assert_eq!(segment_register.segment_ids(), vec!(segment_id_a)); { let segment_meta = SegmentMeta::new(segment_id_b); - let segment_entry = SegmentEntry::new(segment_meta, delete_queue.cursor()); + let segment_entry = SegmentEntry::new(segment_meta); segment_register.add_segment_entry(segment_entry); } assert_eq!(segment_register.segment_entry(&segment_id_b).unwrap().state, SegmentState::Ready); @@ -215,7 +216,7 @@ mod tests { segment_register.remove_segment(&segment_id_b); { let segment_meta_merged = SegmentMeta::new(segment_id_merged); - let segment_entry = SegmentEntry::new(segment_meta_merged, delete_queue.cursor()); + let segment_entry = SegmentEntry::new(segment_meta_merged); segment_register.add_segment_entry(segment_entry); } assert_eq!(segment_register.segment_ids(), vec!(segment_id_merged)); diff --git a/src/indexer/segment_updater.rs b/src/indexer/segment_updater.rs index c77e3fc24..28c9b0c84 100644 --- a/src/indexer/segment_updater.rs +++ b/src/indexer/segment_updater.rs @@ -22,13 +22,13 @@ use std::borrow::BorrowMut; use indexer::SegmentSerializer; use indexer::SegmentEntry; use schema::Schema; -use indexer::index_writer::{advance_deletes, DocToOpstampMapping}; +use indexer::index_writer::advance_deletes; use directory::Directory; use std::thread::JoinHandle; use std::sync::Arc; use std::collections::HashMap; use rustc_serialize::json; -use indexer::delete_queue::{DeleteQueueCursor, DeleteQueue}; +use indexer::delete_queue::DeleteQueue; use Result; use futures_cpupool::CpuPool; use core::IndexMeta; @@ -37,13 +37,6 @@ use std::io::Write; use super::segment_manager::{SegmentManager, 
get_segments}; -fn create_metas(metas: Vec, schema: Schema, opstamp: u64) -> IndexMeta { - IndexMeta { - segments: metas, - schema: schema, - opstamp: opstamp, - } -} /// Save the index meta file. @@ -78,7 +71,11 @@ pub fn save_metas(segment_metas: Vec, opstamp: u64, directory: &mut Directory) -> Result<()> { - let metas = create_metas(segment_metas, schema, opstamp); + let metas = IndexMeta { + segments: segment_metas, + schema: schema, + opstamp: opstamp, + }; let mut w = Vec::new(); try!(write!(&mut w, "{}\n", json::as_pretty_json(&metas))); Ok(directory @@ -109,13 +106,10 @@ struct InnerSegmentUpdater { impl SegmentUpdater { - pub fn new( - index: Index, - delete_cursor: DeleteQueueCursor) - -> Result + pub fn new(index: Index) -> Result { let segments = index.segments()?; - let segment_manager = SegmentManager::from_segments(segments, delete_cursor); + let segment_manager = SegmentManager::from_segments(segments); Ok( SegmentUpdater(Arc::new(InnerSegmentUpdater { pool: CpuPool::new(1), @@ -149,7 +143,7 @@ impl SegmentUpdater { }) } - pub fn new_generation(&mut self, generation: usize) -> impl Future { + pub fn rollback(&mut self, generation: usize) -> impl Future { self.0.generation.store(generation, Ordering::Release); self.run_async(|segment_updater| { segment_updater.0.segment_manager.rollback(); @@ -169,26 +163,24 @@ impl SegmentUpdater { } } - fn purge_deletes(&self) -> Result> { - let segment_entries = self.0.segment_manager.segment_entries(); - segment_entries + fn purge_deletes(&self, delete_queue: &DeleteQueue) -> Result> { + self.0.segment_manager + .segment_entries() .into_iter() - .map(|mut segment_entry| { + .map(|segment_entry| { let mut segment = self.0.index.segment(segment_entry.meta().clone()); - advance_deletes(&mut segment, segment_entry.delete_cursor(), DocToOpstampMapping::None) + advance_deletes(&mut segment, delete_queue, segment_entry.doc_to_opstamp()) .map(|entry| entry.meta().clone()) }) .collect() } - pub fn commit(&self, opstamp: 
u64, new_delete_queue: DeleteQueueCursor) -> impl Future { + pub fn commit(&self, delete_queue: DeleteQueue, opstamp: u64) -> impl Future { self.run_async(move |segment_updater| { - let segment_metas = segment_updater.purge_deletes().expect("Failed purge deletes"); - - let segment_entries = segment_metas.into_iter() - .map(|segment_meta| - SegmentEntry::new(segment_meta, new_delete_queue.clone()) - ) + let segment_metas = segment_updater.purge_deletes(&delete_queue).expect("Failed purge deletes"); + let segment_entries = segment_metas + .into_iter() + .map(SegmentEntry::new) .collect::>(); segment_updater.0.segment_manager.commit(segment_entries); let mut directory = segment_updater.0.index.directory().box_clone(); @@ -241,8 +233,6 @@ impl SegmentUpdater { // An IndexMerger is like a "view" of our merged segments. // TODO unwrap let merger: IndexMerger = IndexMerger::open(schema, &segments[..]).expect("Creating index merger failed"); - - let mut merged_segment = index.new_segment(); // ... 
we just serialize this index merger in our new segment @@ -252,10 +242,7 @@ impl SegmentUpdater { let mut segment_meta = SegmentMeta::new(merged_segment.id()); segment_meta.set_num_docs(num_docs); - // TODO fix delete cursor - let delete_queue = DeleteQueue::default(); - - let segment_entry = SegmentEntry::new(segment_meta, delete_queue.cursor()); + let segment_entry = SegmentEntry::new(segment_meta); segment_updater_clone .end_merge(segment_metas.clone(), segment_entry.clone()) .wait() @@ -297,7 +284,7 @@ impl SegmentUpdater { segment_updater.0.index.opstamp(), directory.borrow_mut()).expect("Could not save metas."); for segment_meta in merged_segment_metas { - segment_updater.0.index.delete_segment(segment_meta.segment_id); + segment_updater.0.index.delete_segment(segment_meta.id()); } }) diff --git a/src/lib.rs b/src/lib.rs index 09219195c..acb4ecba1 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -352,6 +352,78 @@ mod tests { assert!(!postings.advance()); } } + { + // writing the segment + let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap(); + { // 0 + let doc = doc!(text_field=>"a b"); + index_writer.add_document(doc).unwrap(); + } + { // 1 + index_writer.delete_term(Term::from_field_text(text_field, "c")); + } + index_writer.rollback().unwrap(); + } + { + index.load_searchers().unwrap(); + let searcher = index.searcher(); + let reader = searcher.segment_reader(0); + assert!(reader.read_postings_all_info(&Term::from_field_text(text_field, "abcd")).is_none()); + { + let mut postings = reader.read_postings_all_info(&Term::from_field_text(text_field, "a")).unwrap(); + assert!(postings.advance()); + assert_eq!(postings.doc(), 5); + assert!(!postings.advance()); + } + { + let mut postings = reader.read_postings_all_info(&Term::from_field_text(text_field, "b")).unwrap(); + assert!(postings.advance()); + assert_eq!(postings.doc(), 3); + assert!(postings.advance()); + assert_eq!(postings.doc(), 4); + assert!(!postings.advance()); + } + } + { + // 
writing the segment + let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap(); + { + let doc = doc!(text_field=>"a b"); + index_writer.add_document(doc).unwrap(); + } + { + index_writer.delete_term(Term::from_field_text(text_field, "c")); + } + index_writer.rollback().unwrap(); + { + index_writer.delete_term(Term::from_field_text(text_field, "a")); + } + index_writer.commit().unwrap(); + } + { + index.load_searchers().unwrap(); + let searcher = index.searcher(); + let reader = searcher.segment_reader(0); + assert!(reader.read_postings_all_info(&Term::from_field_text(text_field, "abcd")).is_none()); + { + let mut postings = reader.read_postings_all_info(&Term::from_field_text(text_field, "a")).unwrap(); + assert!(!postings.advance()); + } + { + let mut postings = reader.read_postings_all_info(&Term::from_field_text(text_field, "b")).unwrap(); + assert!(postings.advance()); + assert_eq!(postings.doc(), 3); + assert!(postings.advance()); + assert_eq!(postings.doc(), 4); + assert!(!postings.advance()); + } + { + let mut postings = reader.read_postings_all_info(&Term::from_field_text(text_field, "c")).unwrap(); + assert!(postings.advance()); + assert_eq!(postings.doc(), 4); + assert!(!postings.advance()); + } + } } From 8b68f22be1249bc08d1bffe0d24d3dd2bfbbb92c Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Wed, 15 Feb 2017 11:52:31 +0900 Subject: [PATCH 026/107] issue/43 made the delete queue shareable --- src/indexer/delete_queue.rs | 70 ++++++++++++++++++++++++++----------- src/indexer/index_writer.rs | 11 +++--- src/lib.rs | 2 -- 3 files changed, 55 insertions(+), 28 deletions(-) diff --git a/src/indexer/delete_queue.rs b/src/indexer/delete_queue.rs index 2a2d88abd..5ae89e9b5 100644 --- a/src/indexer/delete_queue.rs +++ b/src/indexer/delete_queue.rs @@ -1,31 +1,59 @@ use super::operation::DeleteOperation; +use std::sync::{Arc, RwLock}; +use std::mem; -// TODO remove clone -#[derive(Clone)] -pub struct DeleteQueue { - delete_operations: Vec, 
+/// This implementation assumes that we +/// have a lot more write operation than read operations. + +#[derive(Default)] +struct InnerDeleteQueue { + ro_chunks: ReadOnlyDeletes, + last_chunk: Vec, } +impl InnerDeleteQueue { + pub fn push(&mut self, delete_operation: DeleteOperation) { + self.last_chunk.push(delete_operation); + } + + pub fn operations(&mut self,) -> ReadOnlyDeletes { + if self.last_chunk.len() > 0 { + let new_operations = vec!(); + let new_ro_chunk = mem::replace(&mut self.last_chunk, new_operations); + self.ro_chunks.push(new_ro_chunk) + } + self.ro_chunks.clone() + } +} + +#[derive(Default, Clone)] +pub struct ReadOnlyDeletes(Vec>>); + +impl ReadOnlyDeletes { + fn push(&mut self, operations: Vec) { + self.0.push(Arc::new(operations)); + } + + pub fn iter<'a>(&'a self) -> impl Iterator { + self.0 + .iter() + .flat_map(|chunk| chunk.iter()) + } +} + +#[derive(Clone, Default)] +pub struct DeleteQueue(Arc>); + impl DeleteQueue { - - pub fn new() -> DeleteQueue { - DeleteQueue { - delete_operations: vec!(), - } - } - - pub fn push_op(&mut self, delete_operation: DeleteOperation) { - self.delete_operations.push(delete_operation); + pub fn push(&self, delete_operation: DeleteOperation) { + self.0.write().unwrap().push(delete_operation); } - pub fn operations(&self,) -> impl Iterator { - // TODO fix iterator - self.delete_operations.clone().into_iter() + pub fn operations(&self) -> ReadOnlyDeletes { + self.0.write().unwrap().operations() } } - - #[cfg(test)] mod tests { @@ -34,7 +62,7 @@ mod tests { #[test] fn test_deletequeue() { - let mut delete_queue = DeleteQueue::new(); + let delete_queue = DeleteQueue::default(); let make_op = |i: usize| { let field = Field(1u8); @@ -44,8 +72,8 @@ mod tests { } }; - delete_queue.push_op(make_op(1)); - delete_queue.push_op(make_op(2)); + delete_queue.push(make_op(1)); + delete_queue.push(make_op(2)); // TODO unit tests diff --git a/src/indexer/index_writer.rs b/src/indexer/index_writer.rs index bdbe39f29..15a25ca3d 
100644 --- a/src/indexer/index_writer.rs +++ b/src/indexer/index_writer.rs @@ -130,7 +130,8 @@ pub fn advance_deletes( let mut last_opstamp_opt: Option = None; - for delete_op in delete_queue.operations() { + let delete_operations = delete_queue.operations(); + for delete_op in delete_operations.iter() { // A delete operation should only affect // document that were inserted after it. // @@ -316,7 +317,7 @@ impl IndexWriter { chan::sync(PIPELINE_MAX_SIZE_IN_DOCS); - let delete_queue = DeleteQueue::new(); + let delete_queue = DeleteQueue::default(); let segment_updater = SegmentUpdater::new(index.clone())?; @@ -437,7 +438,7 @@ impl IndexWriter { Error::ErrorInThread("Error while waiting for rollback.".to_string()) )?; - self.delete_queue = DeleteQueue::new(); + self.delete_queue = DeleteQueue::default(); // reset the opstamp self.uncommitted_opstamp = self.committed_opstamp; @@ -488,7 +489,7 @@ impl IndexWriter { // committed segments. self.committed_opstamp = self.stamp(); - let new_delete_queue = DeleteQueue::new(); + let new_delete_queue = DeleteQueue::default(); // TODO remove clone let future = self.segment_updater.commit(self.delete_queue.clone(), self.committed_opstamp); @@ -508,7 +509,7 @@ impl IndexWriter { opstamp: opstamp, term: term, }; - self.delete_queue.push_op(delete_operation); + self.delete_queue.push(delete_operation); } fn stamp(&mut self) -> u64 { diff --git a/src/lib.rs b/src/lib.rs index acb4ecba1..288d95e7c 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -28,8 +28,6 @@ extern crate log; #[macro_use] extern crate version; - -#[macro_use] extern crate fst; extern crate byteorder; extern crate memmap; From 1c03d98a11e4ef8cd8f34efd0314d76469c9d15a Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Thu, 16 Feb 2017 10:20:29 +0900 Subject: [PATCH 027/107] issue/43 added delete_queue right in the segment updater --- src/indexer/delete_queue.rs | 82 +++++++++++++++++++--------------- src/indexer/index_writer.rs | 12 +++-- src/indexer/segment_updater.rs | 
14 +++--- 3 files changed, 59 insertions(+), 49 deletions(-) diff --git a/src/indexer/delete_queue.rs b/src/indexer/delete_queue.rs index 5ae89e9b5..425d290ce 100644 --- a/src/indexer/delete_queue.rs +++ b/src/indexer/delete_queue.rs @@ -16,7 +16,7 @@ impl InnerDeleteQueue { self.last_chunk.push(delete_operation); } - pub fn operations(&mut self,) -> ReadOnlyDeletes { + pub fn snapshot(&mut self,) -> ReadOnlyDeletes { if self.last_chunk.len() > 0 { let new_operations = vec!(); let new_ro_chunk = mem::replace(&mut self.last_chunk, new_operations); @@ -24,6 +24,11 @@ impl InnerDeleteQueue { } self.ro_chunks.clone() } + + pub fn clear(&mut self) { + self.ro_chunks.clear(); + self.last_chunk.clear(); + } } #[derive(Default, Clone)] @@ -39,6 +44,10 @@ impl ReadOnlyDeletes { .iter() .flat_map(|chunk| chunk.iter()) } + + pub fn clear(&mut self) { + self.0.clear(); + } } #[derive(Clone, Default)] @@ -49,8 +58,12 @@ impl DeleteQueue { self.0.write().unwrap().push(delete_operation); } - pub fn operations(&self) -> ReadOnlyDeletes { - self.0.write().unwrap().operations() + pub fn snapshot(&self) -> ReadOnlyDeletes { + self.0.write().unwrap().snapshot() + } + + pub fn clear(&self) { + self.0.write().unwrap().clear(); } } @@ -74,40 +87,35 @@ mod tests { delete_queue.push(make_op(1)); delete_queue.push(make_op(2)); - - // TODO unit tests - // let mut delete_cursor_3 = delete_queue.cursor(); - // let mut delete_cursor_3_b = delete_cursor_3.clone(); - - // assert!(delete_cursor_3.next().is_none()); - // assert!(delete_cursor_3.peek().is_none()); - - // delete_queue.push_op(make_op(3)); - // delete_queue.push_op(make_op(4)); - - // assert_eq!(delete_cursor_3_b.peek(), Some(make_op(3))); - // let mut delete_cursor_3_c = delete_cursor_3_b.clone(); - - // assert_eq!(delete_cursor_3_b.next(), Some(make_op(3))); - // let mut delete_cursor_4 = delete_cursor_3_b.clone(); - - // assert_eq!(delete_cursor_3_b.peek(), Some(make_op(4))); - // assert_eq!(delete_cursor_3_b.next(), 
Some(make_op(4))); - - // assert_eq!(delete_cursor_3_c.next(), Some(make_op(3))); - - // assert!(delete_cursor_3_b.next().is_none()); - // assert_eq!(delete_cursor_3_c.next(), Some(make_op(4))); - // assert!(delete_cursor_3_c.next().is_none()); - - // assert_eq!(delete_cursor_3.peek(), Some(make_op(3))); - // assert_eq!(delete_cursor_3.next(), Some(make_op(3))); - // assert!(delete_cursor_3_b.next().is_none()); - - // assert_eq!(delete_cursor_4.next(), Some(make_op(4))); - // assert!(delete_cursor_4.next().is_none()); - - + let snapshot = delete_queue.snapshot(); + { + let mut operations_it = snapshot.iter(); + assert_eq!(operations_it.next().unwrap().opstamp, 1); + assert_eq!(operations_it.next().unwrap().opstamp, 2); + assert!(operations_it.next().is_none()); + } + { // iterating does not consume results. + let mut operations_it = snapshot.iter(); + assert_eq!(operations_it.next().unwrap().opstamp, 1); + assert_eq!(operations_it.next().unwrap().opstamp, 2); + assert!(operations_it.next().is_none()); + } + // operations does not own a lock on the queue. + delete_queue.push(make_op(3)); + let snapshot2 = delete_queue.snapshot(); + { + // operations is not affected by + // the push that occurs after. 
+ let mut operations_it = snapshot.iter(); + let mut operations2_it = snapshot2.iter(); + assert_eq!(operations_it.next().unwrap().opstamp, 1); + assert_eq!(operations2_it.next().unwrap().opstamp, 1); + assert_eq!(operations_it.next().unwrap().opstamp, 2); + assert_eq!(operations2_it.next().unwrap().opstamp, 2); + assert!(operations_it.next().is_none()); + assert_eq!(operations2_it.next().unwrap().opstamp, 3); + assert!(operations2_it.next().is_none()); + } } } \ No newline at end of file diff --git a/src/indexer/index_writer.rs b/src/indexer/index_writer.rs index 15a25ca3d..03b936229 100644 --- a/src/indexer/index_writer.rs +++ b/src/indexer/index_writer.rs @@ -130,7 +130,7 @@ pub fn advance_deletes( let mut last_opstamp_opt: Option = None; - let delete_operations = delete_queue.operations(); + let delete_operations = delete_queue.snapshot(); for delete_op in delete_operations.iter() { // A delete operation should only affect // document that were inserted after it. @@ -319,7 +319,7 @@ impl IndexWriter { let delete_queue = DeleteQueue::default(); - let segment_updater = SegmentUpdater::new(index.clone())?; + let segment_updater = SegmentUpdater::new(index.clone(), delete_queue.clone())?; let mut index_writer = IndexWriter { @@ -438,7 +438,7 @@ impl IndexWriter { Error::ErrorInThread("Error while waiting for rollback.".to_string()) )?; - self.delete_queue = DeleteQueue::default(); + self.delete_queue.clear(); // reset the opstamp self.uncommitted_opstamp = self.committed_opstamp; @@ -489,16 +489,14 @@ impl IndexWriter { // committed segments. 
self.committed_opstamp = self.stamp(); - let new_delete_queue = DeleteQueue::default(); - // TODO remove clone - let future = self.segment_updater.commit(self.delete_queue.clone(), self.committed_opstamp); + let future = self.segment_updater.commit(self.committed_opstamp); // wait for the segment update thread to have processed the info // TODO remove unwrap future.wait().unwrap(); - self.delete_queue = new_delete_queue; + self.delete_queue.clear(); Ok(self.committed_opstamp) } diff --git a/src/indexer/segment_updater.rs b/src/indexer/segment_updater.rs index 28c9b0c84..d72efd313 100644 --- a/src/indexer/segment_updater.rs +++ b/src/indexer/segment_updater.rs @@ -102,11 +102,12 @@ struct InnerSegmentUpdater { merging_thread_id: AtomicUsize, merging_threads: RwLock>>>, generation: AtomicUsize, + delete_queue: DeleteQueue, } impl SegmentUpdater { - pub fn new(index: Index) -> Result + pub fn new(index: Index, delete_queue: DeleteQueue) -> Result { let segments = index.segments()?; let segment_manager = SegmentManager::from_segments(segments); @@ -119,6 +120,7 @@ impl SegmentUpdater { merging_thread_id: AtomicUsize::default(), merging_threads: RwLock::new(HashMap::new()), generation: AtomicUsize::default(), + delete_queue: delete_queue, })) ) } @@ -163,21 +165,21 @@ impl SegmentUpdater { } } - fn purge_deletes(&self, delete_queue: &DeleteQueue) -> Result> { + fn purge_deletes(&self) -> Result> { self.0.segment_manager .segment_entries() .into_iter() .map(|segment_entry| { let mut segment = self.0.index.segment(segment_entry.meta().clone()); - advance_deletes(&mut segment, delete_queue, segment_entry.doc_to_opstamp()) + advance_deletes(&mut segment, &self.0.delete_queue, segment_entry.doc_to_opstamp()) .map(|entry| entry.meta().clone()) }) .collect() } - pub fn commit(&self, delete_queue: DeleteQueue, opstamp: u64) -> impl Future { + pub fn commit(&self, opstamp: u64) -> impl Future { self.run_async(move |segment_updater| { - let segment_metas = 
segment_updater.purge_deletes(&delete_queue).expect("Failed purge deletes"); + let segment_metas = segment_updater.purge_deletes().expect("Failed purge deletes"); let segment_entries = segment_metas .into_iter() .map(SegmentEntry::new) @@ -210,6 +212,8 @@ impl SegmentUpdater { let merging_join_handle = thread::spawn(move || { + + // first we need to apply deletes to our segment. info!("Start merge: {:?}", segment_ids_vec); let ref index = segment_updater_clone.0.index; From e3d2fca8449d6bb7008a43db3228ac7d4c44af61 Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Sun, 19 Feb 2017 15:01:25 +0900 Subject: [PATCH 028/107] issue/43 Isolated segment_entry / doc_opstamp_mapping --- src/indexer/delete_queue.rs | 1 + src/indexer/doc_opstamp_mapping.rs | 30 ++++++++++ src/indexer/index_writer.rs | 91 +++++++++--------------------- src/indexer/mod.rs | 4 +- src/indexer/segment_entry.rs | 71 +++++++++++++++++++++++ src/indexer/segment_register.rs | 82 +++------------------------ 6 files changed, 142 insertions(+), 137 deletions(-) create mode 100644 src/indexer/doc_opstamp_mapping.rs create mode 100644 src/indexer/segment_entry.rs diff --git a/src/indexer/delete_queue.rs b/src/indexer/delete_queue.rs index 425d290ce..8543f7104 100644 --- a/src/indexer/delete_queue.rs +++ b/src/indexer/delete_queue.rs @@ -31,6 +31,7 @@ impl InnerDeleteQueue { } } + #[derive(Default, Clone)] pub struct ReadOnlyDeletes(Vec>>); diff --git a/src/indexer/doc_opstamp_mapping.rs b/src/indexer/doc_opstamp_mapping.rs new file mode 100644 index 000000000..843002416 --- /dev/null +++ b/src/indexer/doc_opstamp_mapping.rs @@ -0,0 +1,30 @@ +use std::sync::Arc; +use DocId; + +#[derive(Clone)] +pub enum DocToOpstampMapping { + WithMap(Arc>), + None +} + +impl From> for DocToOpstampMapping { + fn from(opstamps: Vec) -> DocToOpstampMapping { + DocToOpstampMapping::WithMap(Arc::new(opstamps)) + } +} + +impl DocToOpstampMapping { + // TODO Unit test + pub fn compute_doc_limit(&self, opstamp: u64) -> DocId { + 
match *self { + DocToOpstampMapping::WithMap(ref doc_opstamps) => { + match doc_opstamps.binary_search(&opstamp) { + Ok(doc_id) => doc_id as DocId, + Err(doc_id) => doc_id as DocId, + } + } + DocToOpstampMapping::None => DocId::max_value(), + } + } +} + diff --git a/src/indexer/index_writer.rs b/src/indexer/index_writer.rs index 03b936229..b05d4f324 100644 --- a/src/indexer/index_writer.rs +++ b/src/indexer/index_writer.rs @@ -1,38 +1,36 @@ -use schema::Schema; -use schema::Document; -use super::operation::AddOperation; +use bit_set::BitSet; +use chan; use core::Index; use core::Segment; -use core::SegmentMeta; -use std::sync::Arc; -use core::SegmentId; -use indexer::operation::DeleteOperation; -use schema::Term; -use indexer::SegmentEntry; -use std::thread::JoinHandle; -use indexer::MergePolicy; -use indexer::SegmentWriter; -use DocId; -use bit_set::BitSet; -use fastfield::delete::write_delete_bitset; -use postings::SegmentPostingsOption; -use postings::DocSet; use core::SegmentComponent; -use super::directory_lock::DirectoryLock; -use futures::Future; -use std::clone::Clone; -use indexer::delete_queue::DeleteQueue; -use std::io; -use std::thread; -use futures::Canceled; -use std::mem; -use datastruct::stacker::Heap; +use core::SegmentId; +use core::SegmentMeta; use core::SegmentReader; -use std::mem::swap; -use chan; -use super::segment_updater::SegmentUpdater; -use Result; +use datastruct::stacker::Heap; use Error; +use fastfield::delete::write_delete_bitset; +use futures::Canceled; +use futures::Future; +use indexer::delete_queue::DeleteQueue; +use indexer::doc_opstamp_mapping::DocToOpstampMapping; +use indexer::MergePolicy; +use indexer::operation::DeleteOperation; +use indexer::SegmentEntry; +use indexer::SegmentWriter; +use postings::DocSet; +use postings::SegmentPostingsOption; +use Result; +use schema::Document; +use schema::Schema; +use schema::Term; +use std::io; +use std::mem; +use std::mem::swap; +use std::thread; +use std::thread::JoinHandle; +use 
super::directory_lock::DirectoryLock; +use super::operation::AddOperation; +use super::segment_updater::SegmentUpdater; // Size of the margin for the heap. A segment is closed when the remaining memory // in the heap goes below MARGIN_IN_BYTES. @@ -44,13 +42,9 @@ pub const HEAP_SIZE_LIMIT: u32 = MARGIN_IN_BYTES * 3u32; // Add document will block if the number of docs waiting in the queue to be indexed reaches PIPELINE_MAX_SIZE_IN_DOCS const PIPELINE_MAX_SIZE_IN_DOCS: usize = 10_000; - - type DocumentSender = chan::Sender; type DocumentReceiver = chan::Receiver; - - /// `IndexWriter` is the user entry-point to add document to an index. /// /// It manages a small number of indexing thread, as well as a shared @@ -90,35 +84,6 @@ pub struct IndexWriter { impl !Send for IndexWriter {} impl !Sync for IndexWriter {} - -// TODO move doc to opstamp mapping to its own file -#[derive(Clone)] -pub enum DocToOpstampMapping { - WithMap(Arc>), - None -} - -impl From> for DocToOpstampMapping { - fn from(opstamps: Vec) -> DocToOpstampMapping { - DocToOpstampMapping::WithMap(Arc::new(opstamps)) - } -} - -impl DocToOpstampMapping { - fn compute_doc_limit(&self, opstamp: u64) -> DocId { - match *self { - DocToOpstampMapping::WithMap(ref doc_opstamps) => { - match doc_opstamps.binary_search(&opstamp) { - Ok(doc_id) => doc_id as DocId, - Err(doc_id) => doc_id as DocId, - } - } - DocToOpstampMapping::None => DocId::max_value(), - } - } -} - - /// TODO /// work on SegmentMeta pub fn advance_deletes( diff --git a/src/indexer/mod.rs b/src/indexer/mod.rs index aea0965f5..478e851da 100644 --- a/src/indexer/mod.rs +++ b/src/indexer/mod.rs @@ -10,9 +10,11 @@ mod segment_manager; pub mod delete_queue; pub mod segment_updater; mod directory_lock; +mod segment_entry; +mod doc_opstamp_mapping; pub mod operation; -pub use self::segment_register::SegmentEntry; +pub use self::segment_entry::SegmentEntry; pub use self::segment_serializer::SegmentSerializer; pub use self::segment_writer::SegmentWriter; 
pub use self::index_writer::IndexWriter; diff --git a/src/indexer/segment_entry.rs b/src/indexer/segment_entry.rs new file mode 100644 index 000000000..c8a917665 --- /dev/null +++ b/src/indexer/segment_entry.rs @@ -0,0 +1,71 @@ +use indexer::doc_opstamp_mapping::DocToOpstampMapping; +use core::SegmentMeta; +use core::SegmentId; +use std::fmt; + +#[derive(Clone, Copy, PartialEq, Eq, Debug)] +pub enum SegmentState { + Ready, + InMerge, +} + +impl SegmentState { + pub fn letter_code(&self,) -> char { + match *self { + SegmentState::InMerge => 'M', + SegmentState::Ready => 'R', + } + } +} + +#[derive(Clone)] +pub struct SegmentEntry { + meta: SegmentMeta, + state: SegmentState, + doc_to_opstamp: DocToOpstampMapping, +} + +impl SegmentEntry { + + pub fn new(segment_meta: SegmentMeta) -> SegmentEntry { + SegmentEntry { + meta: segment_meta, + state: SegmentState::Ready, + doc_to_opstamp: DocToOpstampMapping::None, + } + } + + pub fn doc_to_opstamp(&self) -> &DocToOpstampMapping { + &self.doc_to_opstamp + } + + pub fn state(&self) -> SegmentState { + self.state + } + + pub fn set_doc_to_opstamp(&mut self, doc_to_opstamp: DocToOpstampMapping) { + self.doc_to_opstamp = doc_to_opstamp; + } + + pub fn segment_id(&self) -> SegmentId { + self.meta.id() + } + + pub fn meta(&self) -> &SegmentMeta { + &self.meta + } + + pub fn start_merge(&mut self,) { + self.state = SegmentState::InMerge; + } + + pub fn is_ready(&self,) -> bool { + self.state == SegmentState::Ready + } +} + +impl fmt::Debug for SegmentEntry { + fn fmt(&self, formatter: &mut fmt::Formatter) -> fmt::Result { + write!(formatter, "SegmentEntry({:?}, {:?})", self.meta, self.state) + } +} diff --git a/src/indexer/segment_register.rs b/src/indexer/segment_register.rs index fedb53ac1..87b93be4c 100644 --- a/src/indexer/segment_register.rs +++ b/src/indexer/segment_register.rs @@ -1,73 +1,9 @@ use core::SegmentId; use std::collections::HashMap; use core::SegmentMeta; -use indexer::index_writer::DocToOpstampMapping; use 
std::fmt; use std::fmt::{Debug, Formatter}; - - -#[derive(Clone, PartialEq, Eq, Debug)] -pub enum SegmentState { - Ready, - InMerge, -} - -impl SegmentState { - fn letter_code(&self,) -> char { - match *self { - SegmentState::InMerge => 'M', - SegmentState::Ready => 'R', - } - } -} - -#[derive(Clone)] -pub struct SegmentEntry { - meta: SegmentMeta, - state: SegmentState, - doc_to_opstamp: DocToOpstampMapping, -} - -impl SegmentEntry { - - pub fn new(segment_meta: SegmentMeta) -> SegmentEntry { - SegmentEntry { - meta: segment_meta, - state: SegmentState::Ready, - doc_to_opstamp: DocToOpstampMapping::None, - } - } - - pub fn doc_to_opstamp(&self) -> &DocToOpstampMapping { - &self.doc_to_opstamp - } - - pub fn set_doc_to_opstamp(&mut self, doc_to_opstamp: DocToOpstampMapping) { - self.doc_to_opstamp = doc_to_opstamp; - } - - pub fn segment_id(&self) -> SegmentId { - self.meta.id() - } - - pub fn meta(&self) -> &SegmentMeta { - &self.meta - } - - fn start_merge(&mut self,) { - self.state = SegmentState::InMerge; - } - - fn is_ready(&self,) -> bool { - self.state == SegmentState::Ready - } -} - -impl Debug for SegmentEntry { - fn fmt(&self, formatter: &mut fmt::Formatter) -> fmt::Result { - write!(formatter, "SegmentEntry({:?}, {:?})", self.meta, self.state) - } -} +use indexer::segment_entry::SegmentEntry; @@ -88,7 +24,7 @@ impl Debug for SegmentRegister { fn fmt(&self, f: &mut Formatter) -> Result<(), fmt::Error> { try!(write!(f, "SegmentRegister(")); for (k, v) in &self.segment_states { - try!(write!(f, "{}:{}, ", k.short_uuid_string(), v.state.letter_code())); + try!(write!(f, "{}:{}, ", k.short_uuid_string(), v.state().letter_code())); } try!(write!(f, ")")); Ok(()) @@ -105,7 +41,7 @@ impl SegmentRegister { self.segment_states .values() .filter(|segment_entry| segment_entry.is_ready()) - .map(|segment_entry| segment_entry.meta.clone()) + .map(|segment_entry| segment_entry.meta().clone()) .collect() } @@ -119,7 +55,7 @@ impl SegmentRegister { pub fn 
segment_metas(&self,) -> Vec { let mut segment_ids: Vec = self.segment_states .values() - .map(|segment_entry| segment_entry.meta.clone()) + .map(|segment_entry| segment_entry.meta().clone()) .collect(); segment_ids.sort_by_key(|meta| meta.id()); segment_ids @@ -145,7 +81,7 @@ impl SegmentRegister { } pub fn add_segment_entry(&mut self, segment_entry: SegmentEntry) { - let segment_id = segment_entry.meta.id(); + let segment_id = segment_entry.segment_id(); self.segment_states.insert(segment_id, segment_entry); } @@ -200,18 +136,18 @@ mod tests { let segment_entry = SegmentEntry::new(segment_meta); segment_register.add_segment_entry(segment_entry); } - assert_eq!(segment_register.segment_entry(&segment_id_a).unwrap().state, SegmentState::Ready); + assert_eq!(segment_register.segment_entry(&segment_id_a).unwrap().state(), SegmentState::Ready); assert_eq!(segment_register.segment_ids(), vec!(segment_id_a)); { let segment_meta = SegmentMeta::new(segment_id_b); let segment_entry = SegmentEntry::new(segment_meta); segment_register.add_segment_entry(segment_entry); } - assert_eq!(segment_register.segment_entry(&segment_id_b).unwrap().state, SegmentState::Ready); + assert_eq!(segment_register.segment_entry(&segment_id_b).unwrap().state(), SegmentState::Ready); segment_register.start_merge(&segment_id_a); segment_register.start_merge(&segment_id_b); - assert_eq!(segment_register.segment_entry(&segment_id_a).unwrap().state, SegmentState::InMerge); - assert_eq!(segment_register.segment_entry(&segment_id_b).unwrap().state, SegmentState::InMerge); + assert_eq!(segment_register.segment_entry(&segment_id_a).unwrap().state(), SegmentState::InMerge); + assert_eq!(segment_register.segment_entry(&segment_id_b).unwrap().state(), SegmentState::InMerge); segment_register.remove_segment(&segment_id_a); segment_register.remove_segment(&segment_id_b); { From 7315000fd45c199fc5ab07bed7ddab2cb988c5af Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Sun, 19 Feb 2017 17:43:05 +0900 Subject: 
[PATCH 029/107] issue/43 Merging ok for postings / fastfields. --- src/core/segment_reader.rs | 4 + src/datastruct/fstmap.rs | 16 ++- src/fastfield/delete.rs | 5 +- src/indexer/delete_queue.rs | 2 + src/indexer/index_writer.rs | 17 ++- src/indexer/merger.rs | 226 +++++++++++++++++++++----------- src/indexer/mod.rs | 5 +- src/indexer/segment_register.rs | 1 + src/indexer/segment_updater.rs | 96 +++++++------- src/postings/mod.rs | 2 +- src/postings/postings_writer.rs | 2 +- src/postings/serializer.rs | 15 ++- src/postings/term_info.rs | 7 +- 13 files changed, 262 insertions(+), 136 deletions(-) diff --git a/src/core/segment_reader.rs b/src/core/segment_reader.rs index 020db8715..2795d87e3 100644 --- a/src/core/segment_reader.rs +++ b/src/core/segment_reader.rs @@ -266,6 +266,10 @@ impl SegmentReader { self.segment_id } + pub fn delete_bitset(&self) -> &DeleteBitSet { + &self.delete_bitset + } + pub fn is_deleted(&self, doc: DocId) -> bool { self.delete_bitset.is_deleted(doc) } diff --git a/src/datastruct/fstmap.rs b/src/datastruct/fstmap.rs index 1d4a420ed..7bd5b23fa 100644 --- a/src/datastruct/fstmap.rs +++ b/src/datastruct/fstmap.rs @@ -30,7 +30,21 @@ impl FstMapBuilder { }) } - pub fn insert(&mut self, key: &[u8], value: &V) -> io::Result<()>{ + /// Horribly unsafe, nobody should ever do that... except me :) + pub fn insert_key(&mut self, key: &[u8]) -> io::Result<()> { + try!(self.fst_builder + .insert(key, self.data.len() as u64) + .map_err(convert_fst_error)); + Ok(()) + } + + /// Horribly unsafe, nobody should ever do that... 
except me :) + pub fn insert_value(&mut self, value: &V) -> io::Result<()> { + try!(value.serialize(&mut self.data)); + Ok(()) + } + + pub fn insert(&mut self, key: &[u8], value: &V) -> io::Result<()> { try!(self.fst_builder .insert(key, self.data.len() as u64) .map_err(convert_fst_error)); diff --git a/src/fastfield/delete.rs b/src/fastfield/delete.rs index 03f3b13b4..a899af963 100644 --- a/src/fastfield/delete.rs +++ b/src/fastfield/delete.rs @@ -56,6 +56,10 @@ impl DeleteBitSet { } } + pub fn has_deletes(&self) -> bool { + self.len() > 0 + } + pub fn is_deleted(&self, doc: DocId) -> bool { if self.len == 0 { false @@ -72,7 +76,6 @@ impl DeleteBitSet { impl HasLen for DeleteBitSet { - fn len(&self) -> usize { self.len } diff --git a/src/indexer/delete_queue.rs b/src/indexer/delete_queue.rs index 8543f7104..498bff51f 100644 --- a/src/indexer/delete_queue.rs +++ b/src/indexer/delete_queue.rs @@ -32,6 +32,8 @@ impl InnerDeleteQueue { } + +// TODO Rename to DeleteQueueSnapshot #[derive(Default, Clone)] pub struct ReadOnlyDeletes(Vec>>); diff --git a/src/indexer/index_writer.rs b/src/indexer/index_writer.rs index b05d4f324..e2a569552 100644 --- a/src/indexer/index_writer.rs +++ b/src/indexer/index_writer.rs @@ -9,6 +9,7 @@ use core::SegmentReader; use datastruct::stacker::Heap; use Error; use fastfield::delete::write_delete_bitset; +use indexer::delete_queue::ReadOnlyDeletes; use futures::Canceled; use futures::Future; use indexer::delete_queue::DeleteQueue; @@ -84,18 +85,22 @@ pub struct IndexWriter { impl !Send for IndexWriter {} impl !Sync for IndexWriter {} -/// TODO -/// work on SegmentMeta + +// TODO put delete bitset in segment entry +// rather than DocToOpstamp. 
+ +// TODO skip delete operation before teh +// last delete opstamp + pub fn advance_deletes( segment: &mut Segment, - delete_queue: &DeleteQueue, - doc_opstamps: &DocToOpstampMapping) -> Result { + delete_operations: &ReadOnlyDeletes, + doc_opstamps: &DocToOpstampMapping) -> Result { let segment_reader = SegmentReader::open(segment.clone())?; let mut delete_bitset = BitSet::with_capacity(segment_reader.max_doc() as usize); let mut last_opstamp_opt: Option = None; - let delete_operations = delete_queue.snapshot(); for delete_op in delete_operations.iter() { // A delete operation should only affect // document that were inserted after it. @@ -125,7 +130,7 @@ pub fn advance_deletes( let mut delete_file = segment.open_write(SegmentComponent::DELETE)?; write_delete_bitset(&delete_bitset, &mut delete_file)?; } - Ok(SegmentEntry::new(segment.meta().clone())) + Ok(segment.meta().clone()) } fn index_documents(heap: &mut Heap, diff --git a/src/indexer/merger.rs b/src/indexer/merger.rs index 15bc174f5..d0be34fda 100644 --- a/src/indexer/merger.rs +++ b/src/indexer/merger.rs @@ -5,18 +5,19 @@ use DocId; use core::SerializableSegment; use indexer::SegmentSerializer; use postings::PostingsSerializer; +use fastfield::U32FastFieldReader; +use itertools::Itertools; use postings::Postings; use postings::DocSet; use core::TermIterator; +use fastfield::delete::DeleteBitSet; use schema::{Schema, Field}; use fastfield::FastFieldSerializer; use store::StoreWriter; -use postings::ChainedPostings; -use postings::HasLen; -use postings::OffsetPostings; use core::SegmentInfo; use std::cmp::{min, max}; use std::iter; +use std::io; pub struct IndexMerger { schema: Schema, @@ -47,12 +48,40 @@ impl DeltaPositionComputer { } } + +fn compute_min_max_val(u32_reader: &U32FastFieldReader, max_doc: DocId, delete_bitset: &DeleteBitSet) -> Option<(u32, u32)> { + if max_doc == 0 { + None + } + else if !delete_bitset.has_deletes() { + // no deleted documents, + // we can use the previous min_val, max_val. 
+ Some((u32_reader.min_val(), u32_reader.max_val())) + } + else { + // some deleted documents, + // we need to recompute the max / min + (0..max_doc) + .filter(|doc_id| !delete_bitset.is_deleted(*doc_id)) + .minmax() + .into_option() + } +} + +fn extract_fieldnorm_reader(segment_reader: &SegmentReader, field: Field) -> io::Result { + segment_reader.get_fieldnorms_reader(field) +} + +fn extract_fast_field_reader(segment_reader: &SegmentReader, field: Field) -> io::Result { + segment_reader.get_fast_field_reader(field) +} + impl IndexMerger { pub fn open(schema: Schema, segments: &[Segment]) -> Result { - let mut readers = Vec::new(); + let mut readers = vec!(); let mut max_doc = 0; for segment in segments { - let reader = try!(SegmentReader::open(segment.clone())); + let reader = SegmentReader::open(segment.clone())?; max_doc += reader.num_docs(); readers.push(reader); } @@ -63,74 +92,104 @@ impl IndexMerger { }) } - - fn write_fieldnorms(&self, fast_field_serializer: &mut FastFieldSerializer) -> Result<()> { - // TODO make sure that works even if the field is never here. 
- for field in self.schema + fn write_fieldnorms(&self, + fast_field_serializer: &mut FastFieldSerializer) -> Result<()> { + let fieldnorm_fastfields: Vec = self.schema .fields() .iter() .enumerate() .filter(|&(_, field_entry)| field_entry.is_indexed()) - .map(|(field_id, _)| Field(field_id as u8)) { - let mut u32_readers = Vec::new(); - let mut min_val = u32::min_value(); - let mut max_val = 0; - for reader in &self.readers { - let u32_reader = try!(reader.get_fieldnorms_reader(field)); - min_val = min(min_val, u32_reader.min_val()); - max_val = max(max_val, u32_reader.max_val()); - u32_readers.push((reader.max_doc(), u32_reader)); - } - try!(fast_field_serializer.new_u32_fast_field(field, min_val, max_val)); - for (max_doc, u32_reader) in u32_readers { - for doc_id in 0..max_doc { - let val = u32_reader.get(doc_id); - try!(fast_field_serializer.add_val(val)); - } - } - try!(fast_field_serializer.close_field()); - } - Ok(()) + .map(|(field_id, _)| Field(field_id as u8)) + .collect(); + self.generic_write_fast_field(fieldnorm_fastfields, &extract_fieldnorm_reader, fast_field_serializer) } fn write_fast_fields(&self, fast_field_serializer: &mut FastFieldSerializer) -> Result<()> { - for field in self.schema + let fast_fields: Vec = self.schema .fields() .iter() .enumerate() .filter(|&(_, field_entry)| field_entry.is_u32_fast()) - .map(|(field_id, _)| Field(field_id as u8)) { - let mut u32_readers = Vec::new(); - let mut min_val = u32::min_value(); - let mut max_val = 0; + .map(|(field_id, _)| Field(field_id as u8)) + .collect(); + self.generic_write_fast_field(fast_fields, &extract_fast_field_reader, fast_field_serializer) + } + + + // used both to merge field norms and regular u32 fast fields. 
+ fn generic_write_fast_field(&self, + fields: Vec, + field_reader_extractor: &Fn(&SegmentReader, Field) -> io::Result, + fast_field_serializer: &mut FastFieldSerializer) -> Result<()> { + + for field in fields { + + let mut u32_readers = vec!(); + let mut min_val = u32::max_value(); + let mut max_val = u32::min_value(); + for reader in &self.readers { - let u32_reader = try!(reader.get_fast_field_reader(field)); - min_val = min(min_val, u32_reader.min_val()); - max_val = max(max_val, u32_reader.max_val()); - u32_readers.push((reader.max_doc(), u32_reader)); - } - try!(fast_field_serializer.new_u32_fast_field(field, min_val, max_val)); - for (max_doc, u32_reader) in u32_readers { - for doc_id in 0..max_doc { - let val = u32_reader.get(doc_id); - try!(fast_field_serializer.add_val(val)); + let u32_reader = field_reader_extractor(reader, field)?; + if let Some((seg_min_val, seg_max_val)) = compute_min_max_val(&u32_reader, reader.max_doc(), reader.delete_bitset()) { + // the segment has some non-deleted documents + min_val = min(min_val, seg_min_val); + max_val = max(max_val, seg_max_val); + u32_readers.push((reader.max_doc(), u32_reader, reader.delete_bitset())); } } + + if u32_readers.is_empty() { + // we have actually zero documents. + min_val = 0; + max_val = 0; + } + + assert!(min_val <= max_val); + + // TODO test deleting all documents off the index. 
+ + try!(fast_field_serializer.new_u32_fast_field(field, min_val, max_val)); + for (max_doc, u32_reader, delete_bitset) in u32_readers { + for doc_id in 0..max_doc { + if !delete_bitset.is_deleted(doc_id) { + let val = u32_reader.get(doc_id); + try!(fast_field_serializer.add_val(val)); + } + } + } + try!(fast_field_serializer.close_field()); } Ok(()) } - fn write_postings(&self, postings_serializer: &mut PostingsSerializer) -> Result<()> { + fn write_postings(&self, + + postings_serializer: &mut PostingsSerializer) -> Result<()> { + let mut merged_terms = TermIterator::from(&self.readers[..]); let mut delta_position_computer = DeltaPositionComputer::new(); - let mut offsets: Vec = Vec::new(); + + let mut max_doc = 0; - for reader in &self.readers { - offsets.push(max_doc); - max_doc += reader.max_doc(); - } + // map from segment doc ids to the resulting merged segment doc id. + let mut merged_doc_id_map: Vec>> = Vec::with_capacity(self.readers.len()); + + for reader in &self.readers { + let mut segment_local_map = Vec::with_capacity(reader.max_doc() as usize); + for doc_id in 0..reader.max_doc() { + if reader.is_deleted(doc_id) { + segment_local_map.push(None); + } + else { + segment_local_map.push(Some(max_doc)); + max_doc += 1u32; + } + } + merged_doc_id_map.push(segment_local_map); + } + while merged_terms.advance() { // Create the total list of doc ids // by stacking the doc ids from the different segment. @@ -142,34 +201,51 @@ impl IndexMerger { // - Segment 2's doc ids become [seg0.max_doc + seg1.max_doc, seg0.max_doc + seg1.max_doc + seg2.max_doc] // ... 
let term = merged_terms.term(); - let mut merged_postings = - ChainedPostings::from( - merged_terms - .segment_ords() - .iter() - .cloned() - .flat_map(|segment_ord| { - let offset = offsets[segment_ord]; - self.readers[segment_ord] - .read_postings_all_info(&term) - .map(|segment_postings| OffsetPostings::new(segment_postings, offset)) - }) - .collect::>() - ); + let mut term_written = false; + let segment_postings = merged_terms + .segment_ords() + .iter() + .cloned() + .flat_map(|segment_ord| { + self.readers[segment_ord] + .read_postings_all_info(&term) + .map(|segment_postings| (segment_ord, segment_postings)) + }) + .collect::>(); - // We can now serialize this postings, by pushing each document to the - // postings serializer. - try!(postings_serializer.new_term(&term, merged_postings.len() as DocId)); - while merged_postings.advance() { - let delta_positions: &[u32] = - delta_position_computer.compute_delta_positions(merged_postings.positions()); - try!(postings_serializer.write_doc(merged_postings.doc(), - merged_postings.term_freq(), - delta_positions)); + // We can remove the term if all documents which + // contained it have been deleted. + if segment_postings.len() > 0 { + + // We can now serialize this postings, by pushing each document to the + // postings serializer. + + for (segment_ord, mut segment_postings) in segment_postings { + let old_to_new_doc_id = &merged_doc_id_map[segment_ord]; + while segment_postings.advance() { + if let Some(remapped_doc_id) = old_to_new_doc_id[segment_postings.doc() as usize] { + if !term_written { + // we make sure to only write the term iff + // there is at least one document. 
+ postings_serializer.new_term(&term)?; + term_written = true; + } + let delta_positions: &[u32] = + delta_position_computer.compute_delta_positions(segment_postings.positions()); + try!(postings_serializer.write_doc( + remapped_doc_id, + segment_postings.term_freq(), + delta_positions)); + } + } + } + + if term_written { + try!(postings_serializer.close_term()); + } } - try!(postings_serializer.close_term()); + } - Ok(()) } diff --git a/src/indexer/mod.rs b/src/indexer/mod.rs index 478e851da..b2a71fa02 100644 --- a/src/indexer/mod.rs +++ b/src/indexer/mod.rs @@ -14,7 +14,10 @@ mod segment_entry; mod doc_opstamp_mapping; pub mod operation; -pub use self::segment_entry::SegmentEntry; + +// TODO avoid exposing SegmentState / SegmentEntry if it does not have to be public API + +pub use self::segment_entry::{SegmentEntry, SegmentState}; pub use self::segment_serializer::SegmentSerializer; pub use self::segment_writer::SegmentWriter; pub use self::index_writer::IndexWriter; diff --git a/src/indexer/segment_register.rs b/src/indexer/segment_register.rs index 87b93be4c..5f2216342 100644 --- a/src/indexer/segment_register.rs +++ b/src/indexer/segment_register.rs @@ -120,6 +120,7 @@ impl Default for SegmentRegister { #[cfg(test)] mod tests { + use indexer::SegmentState; use core::SegmentId; use core::SegmentMeta; use super::*; diff --git a/src/indexer/segment_updater.rs b/src/indexer/segment_updater.rs index d72efd313..b9c4e47d2 100644 --- a/src/indexer/segment_updater.rs +++ b/src/indexer/segment_updater.rs @@ -1,44 +1,42 @@ #![allow(for_kv_map)] use core::Index; -use Error; -use core::Segment; -use indexer::{MergePolicy, DefaultMergePolicy}; -use core::SegmentId; -use core::SegmentMeta; -use std::mem; -use std::sync::atomic::Ordering; -use std::ops::DerefMut; -use futures::{Future, future}; -use futures::oneshot; -use futures::Canceled; -use std::thread; -use std::sync::atomic::AtomicUsize; -use std::sync::RwLock; -use core::SerializableSegment; -use 
indexer::MergeCandidate; -use indexer::merger::IndexMerger; -use std::borrow::BorrowMut; -use indexer::SegmentSerializer; -use indexer::SegmentEntry; -use schema::Schema; -use indexer::index_writer::advance_deletes; -use directory::Directory; -use std::thread::JoinHandle; -use std::sync::Arc; -use std::collections::HashMap; -use rustc_serialize::json; -use indexer::delete_queue::DeleteQueue; -use Result; -use futures_cpupool::CpuPool; use core::IndexMeta; use core::META_FILEPATH; +use core::Segment; +use core::SegmentId; +use core::SegmentMeta; +use core::SerializableSegment; +use directory::Directory; +use Error; +use futures_cpupool::CpuPool; +use futures::{Future, future}; +use futures::Canceled; +use futures::oneshot; +use indexer::{MergePolicy, DefaultMergePolicy}; +use indexer::delete_queue::DeleteQueue; +use indexer::index_writer::advance_deletes; +use indexer::MergeCandidate; +use indexer::merger::IndexMerger; +use indexer::SegmentEntry; +use indexer::SegmentSerializer; +use Result; +use rustc_serialize::json; +use schema::Schema; +use std::borrow::BorrowMut; +use std::collections::HashMap; use std::io::Write; +use std::mem; +use std::ops::DerefMut; +use std::sync::Arc; +use std::sync::atomic::AtomicUsize; +use std::sync::atomic::Ordering; +use std::sync::RwLock; +use std::thread; +use std::thread::JoinHandle; use super::segment_manager::{SegmentManager, get_segments}; - - /// Save the index meta file. 
/// This operation is atomic : /// Either @@ -171,8 +169,7 @@ impl SegmentUpdater { .into_iter() .map(|segment_entry| { let mut segment = self.0.index.segment(segment_entry.meta().clone()); - advance_deletes(&mut segment, &self.0.delete_queue, segment_entry.doc_to_opstamp()) - .map(|entry| entry.meta().clone()) + advance_deletes(&mut segment, &self.0.delete_queue.snapshot(), segment_entry.doc_to_opstamp()) }) .collect() } @@ -206,27 +203,37 @@ impl SegmentUpdater { let merging_thread_id = self.get_merging_thread_id(); let (merging_future_send, merging_future_recv) = oneshot(); + let delete_operations = self.0.delete_queue.snapshot(); + if segment_ids.is_empty() { return merging_future_recv; } let merging_join_handle = thread::spawn(move || { - // first we need to apply deletes to our segment. info!("Start merge: {:?}", segment_ids_vec); let ref index = segment_updater_clone.0.index; let schema = index.schema(); - let segment_metas: Vec = segment_ids_vec - .iter() - .map(|segment_id| - segment_updater_clone.0.segment_manager - .segment_entry(segment_id) - .map(|segment_entry| segment_entry.meta().clone()) - .ok_or(Error::InvalidArgument(format!("Segment({:?}) does not exist anymore", segment_id))) - ) - .collect::>()?; + + let mut segment_metas = vec!(); + for segment_id in &segment_ids_vec { + if let Some(segment_entry) = segment_updater_clone.0 + .segment_manager + .segment_entry(segment_id) { + let mut segment = index.segment(segment_entry.meta().clone()); + let segment_meta = advance_deletes( + &mut segment, + &delete_operations, + segment_entry.doc_to_opstamp())?; + segment_metas.push(segment_meta); + } + else { + error!("Error, had to abort merge as some of the segment is not managed anymore.a"); + return Err(Error::InvalidArgument(format!("Segment {:?} requested for merge is not managed.", segment_id))); + } + } let segments: Vec = segment_metas .iter() @@ -251,6 +258,7 @@ impl SegmentUpdater { .end_merge(segment_metas.clone(), segment_entry.clone()) .wait() 
.unwrap(); + merging_future_send.complete(segment_entry.clone()); segment_updater_clone.0.merging_threads.write().unwrap().remove(&merging_thread_id); Ok(segment_entry) diff --git a/src/postings/mod.rs b/src/postings/mod.rs index f9898b9fc..5e9b28414 100644 --- a/src/postings/mod.rs +++ b/src/postings/mod.rs @@ -64,7 +64,7 @@ mod tests { let mut segment = index.new_segment(); let mut posting_serializer = PostingsSerializer::open(&mut segment).unwrap(); let term = Term::from_field_text(text_field, "abc"); - posting_serializer.new_term(&term, 3).unwrap(); + posting_serializer.new_term(&term).unwrap(); for doc_id in 0u32..3u32 { let positions = vec!(1,2,3,2); posting_serializer.write_doc(doc_id, 2, &positions).unwrap(); diff --git a/src/postings/postings_writer.rs b/src/postings/postings_writer.rs index a3c0194f1..667e67458 100644 --- a/src/postings/postings_writer.rs +++ b/src/postings/postings_writer.rs @@ -122,7 +122,7 @@ impl<'a, Rec: Recorder + 'static> PostingsWriter for SpecializedPostingsWriter<' for (term_bytes, (addr, recorder)) in term_offsets { // TODO remove copy term.set_content(term_bytes); - try!(serializer.new_term(&term, recorder.doc_freq())); + try!(serializer.new_term(&term)); try!(recorder.serialize(addr, serializer, heap)); try!(serializer.close_term()); } diff --git a/src/postings/serializer.rs b/src/postings/serializer.rs index afbfbe948..508df95b6 100644 --- a/src/postings/serializer.rs +++ b/src/postings/serializer.rs @@ -63,6 +63,7 @@ pub struct PostingsSerializer { schema: Schema, text_indexing_options: TextIndexingOptions, term_open: bool, + current_term_info: TermInfo, } impl PostingsSerializer { @@ -88,6 +89,7 @@ impl PostingsSerializer { schema: schema, text_indexing_options: TextIndexingOptions::Unindexed, term_open: false, + current_term_info: TermInfo::default(), }) } @@ -121,7 +123,7 @@ impl PostingsSerializer { /// * term - the term. It needs to come after the previous term according /// to the lexicographical order. 
/// * doc_freq - return the number of document containing the term. - pub fn new_term(&mut self, term: &Term, doc_freq: DocId) -> io::Result<()> { + pub fn new_term(&mut self, term: &Term) -> io::Result<()> { if self.term_open { panic!("Called new_term, while the previous term was not closed."); } @@ -131,13 +133,12 @@ impl PostingsSerializer { self.last_doc_id_encoded = 0; self.term_freqs.clear(); self.position_deltas.clear(); - let term_info = TermInfo { - doc_freq: doc_freq, + self.current_term_info = TermInfo { + doc_freq: 0, postings_offset: self.written_bytes_postings as u32, positions_offset: self.written_bytes_positions as u32, }; - self.terms_fst_builder - .insert(term.as_slice(), &term_info) + self.terms_fst_builder.insert_key(term.as_slice()) } /// Finish the serialization for this term postings. @@ -146,6 +147,9 @@ impl PostingsSerializer { /// using `VInt` encoding. pub fn close_term(&mut self) -> io::Result<()> { if self.term_open { + + self.terms_fst_builder.insert_value(&self.current_term_info)?; + if !self.doc_ids.is_empty() { // we have doc ids waiting to be written // this happens when the number of doc ids is @@ -202,6 +206,7 @@ impl PostingsSerializer { term_freq: u32, position_deltas: &[u32]) -> io::Result<()> { + self.current_term_info.inc_doc_freq(); self.doc_ids.push(doc_id); if self.text_indexing_options.is_termfreq_enabled() { self.term_freqs.push(term_freq as u32); diff --git a/src/postings/term_info.rs b/src/postings/term_info.rs index ac7edf591..20bc0b1b8 100644 --- a/src/postings/term_info.rs +++ b/src/postings/term_info.rs @@ -12,7 +12,7 @@ use std::io; /// * `postings_offset` : an offset in the `.idx` file /// addressing the start of the posting list associated /// to this term. 
-#[derive(Debug,Ord,PartialOrd,Eq,PartialEq,Clone)] +#[derive(Debug,Default,Ord,PartialOrd,Eq,PartialEq,Clone)] pub struct TermInfo { /// Number of documents in the segment containing the term pub doc_freq: u32, @@ -22,6 +22,11 @@ pub struct TermInfo { pub positions_offset: u32, } +impl TermInfo { + pub fn inc_doc_freq(&mut self) { + self.doc_freq += 1; + } +} impl BinarySerializable for TermInfo { fn serialize(&self, writer: &mut io::Write) -> io::Result { From 1b45539f32f80f1f843a246d9236452c25c18992 Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Sun, 19 Feb 2017 17:57:19 +0900 Subject: [PATCH 030/107] issue/43 Added support for delete in merged index --- src/datastruct/fstmap.rs | 6 +++ src/indexer/merger.rs | 12 +++++- src/postings/chained_postings.rs | 71 -------------------------------- src/postings/mod.rs | 5 --- src/postings/offset_postings.rs | 59 -------------------------- src/postings/serializer.rs | 2 +- src/postings/term_info.rs | 5 --- src/store/writer.rs | 16 +------ 8 files changed, 18 insertions(+), 158 deletions(-) delete mode 100644 src/postings/chained_postings.rs delete mode 100644 src/postings/offset_postings.rs diff --git a/src/datastruct/fstmap.rs b/src/datastruct/fstmap.rs index 7bd5b23fa..428e9b710 100644 --- a/src/datastruct/fstmap.rs +++ b/src/datastruct/fstmap.rs @@ -31,6 +31,12 @@ impl FstMapBuilder { } /// Horribly unsafe, nobody should ever do that... except me :) + /// + /// If used, it must be used by systematically alternating calls + /// to insert_key and insert_value. + /// + /// TODO see if I can bend Rust typesystem to enforce that + /// in a nice way. 
pub fn insert_key(&mut self, key: &[u8]) -> io::Result<()> { try!(self.fst_builder .insert(key, self.data.len() as u64) diff --git a/src/indexer/merger.rs b/src/indexer/merger.rs index d0be34fda..c48409b37 100644 --- a/src/indexer/merger.rs +++ b/src/indexer/merger.rs @@ -3,6 +3,7 @@ use core::SegmentReader; use core::Segment; use DocId; use core::SerializableSegment; +use schema::FieldValue; use indexer::SegmentSerializer; use postings::PostingsSerializer; use fastfield::U32FastFieldReader; @@ -170,7 +171,6 @@ impl IndexMerger { let mut merged_terms = TermIterator::from(&self.readers[..]); let mut delta_position_computer = DeltaPositionComputer::new(); - let mut max_doc = 0; // map from segment doc ids to the resulting merged segment doc id. @@ -252,7 +252,15 @@ impl IndexMerger { fn write_storable_fields(&self, store_writer: &mut StoreWriter) -> Result<()> { for reader in &self.readers { let store_reader = reader.get_store_reader(); - try!(store_writer.stack_reader(store_reader)); + for doc_id in 0..reader.max_doc() { + if !reader.is_deleted(doc_id) { + let doc = try!(store_reader.get(doc_id)); + let field_values: Vec<&FieldValue> = doc.field_values() + .iter() + .collect(); + try!(store_writer.store(&field_values)); + } + } } Ok(()) } diff --git a/src/postings/chained_postings.rs b/src/postings/chained_postings.rs deleted file mode 100644 index f07185918..000000000 --- a/src/postings/chained_postings.rs +++ /dev/null @@ -1,71 +0,0 @@ -use DocId; -use postings::Postings; -use postings::OffsetPostings; -use postings::DocSet; -use postings::HasLen; - -/// Creates a posting object that chains two postings -/// together. -/// -/// When iterating over the chained postings, -/// it will consume all of the documents of the first postings, -/// and then iterate over the documents over the second postings. -/// -/// The chained postings is used when merging segments. 
-pub struct ChainedPostings<'a> { - chained_postings: Vec>, - posting_id: usize, - len: usize, -} - -impl<'a> From>> for ChainedPostings<'a> { - fn from(chained_postings: Vec>) -> ChainedPostings { - let len: usize = chained_postings - .iter() - .map(|segment_postings| segment_postings.len()) - .sum(); - ChainedPostings { - chained_postings: chained_postings, - posting_id: 0, - len: len, - } - } -} - -impl<'a> DocSet for ChainedPostings<'a> { - - fn advance(&mut self,) -> bool { - if self.posting_id == self.chained_postings.len() { - return false; - } - while !self.chained_postings[self.posting_id].advance() { - self.posting_id += 1; - if self.posting_id == self.chained_postings.len() { - return false; - } - } - true - } - - fn doc(&self,) -> DocId { - self.chained_postings[self.posting_id].doc() - } -} - -impl<'a> HasLen for ChainedPostings<'a> { - fn len(&self,) -> usize { - self.len - } -} - -impl<'a> Postings for ChainedPostings<'a> { - - fn term_freq(&self,) -> u32 { - self.chained_postings[self.posting_id].term_freq() - } - - fn positions(&self) -> &[u32] { - self.chained_postings[self.posting_id].positions() - } - -} diff --git a/src/postings/mod.rs b/src/postings/mod.rs index 5e9b28414..82b3fbf61 100644 --- a/src/postings/mod.rs +++ b/src/postings/mod.rs @@ -9,17 +9,14 @@ mod recorder; mod serializer; mod postings_writer; mod term_info; -mod chained_postings; mod vec_postings; mod segment_postings; mod intersection; -mod offset_postings; mod freq_handler; mod docset; mod segment_postings_option; pub use self::docset::{SkipResult, DocSet}; -pub use self::offset_postings::OffsetPostings; pub use self::recorder::{Recorder, NothingRecorder, TermFrequencyRecorder, TFAndPositionRecorder}; pub use self::serializer::PostingsSerializer; pub use self::postings_writer::PostingsWriter; @@ -29,11 +26,9 @@ pub use self::postings::Postings; #[cfg(test)] pub use self::vec_postings::VecPostings; -pub use self::chained_postings::ChainedPostings; pub use 
self::segment_postings::SegmentPostings; pub use self::intersection::IntersectionDocSet; pub use self::freq_handler::FreqHandler; - pub use self::segment_postings_option::SegmentPostingsOption; pub use common::HasLen; diff --git a/src/postings/offset_postings.rs b/src/postings/offset_postings.rs deleted file mode 100644 index 1410ef922..000000000 --- a/src/postings/offset_postings.rs +++ /dev/null @@ -1,59 +0,0 @@ -use postings::Postings; -use postings::SegmentPostings; -use postings::SkipResult; -use postings::DocSet; -use postings::HasLen; -use DocId; - -/// Wraps a posting object and offset all of the doc id with a given offset. -/// -/// Assuming the original posting list is `0, 5, 7, 8...`, and the offset is `3` -/// the `OffsetPostings` becomes `3, 8, 10, 11...`. -pub struct OffsetPostings<'a> { - underlying: SegmentPostings<'a>, - offset: DocId, -} - -impl<'a> OffsetPostings<'a> { - /// Constructor - pub fn new(underlying: SegmentPostings<'a>, offset: DocId) -> OffsetPostings { - OffsetPostings { - underlying: underlying, - offset: offset, - } - } -} - -impl<'a> DocSet for OffsetPostings<'a> { - fn advance(&mut self) -> bool { - self.underlying.advance() - } - - fn doc(&self) -> DocId { - self.underlying.doc() + self.offset - } - - fn skip_next(&mut self, target: DocId) -> SkipResult { - if target >= self.offset { - SkipResult::OverStep - } else { - self.underlying.skip_next(target - self.offset) - } - } -} - -impl<'a> HasLen for OffsetPostings<'a> { - fn len(&self) -> usize { - self.underlying.len() - } -} - -impl<'a> Postings for OffsetPostings<'a> { - fn term_freq(&self) -> u32 { - self.underlying.term_freq() - } - - fn positions(&self) -> &[u32] { - self.underlying.positions() - } -} \ No newline at end of file diff --git a/src/postings/serializer.rs b/src/postings/serializer.rs index 508df95b6..67c077429 100644 --- a/src/postings/serializer.rs +++ b/src/postings/serializer.rs @@ -206,7 +206,7 @@ impl PostingsSerializer { term_freq: u32, position_deltas: 
&[u32]) -> io::Result<()> { - self.current_term_info.inc_doc_freq(); + self.current_term_info.doc_freq += 1; self.doc_ids.push(doc_id); if self.text_indexing_options.is_termfreq_enabled() { self.term_freqs.push(term_freq as u32); diff --git a/src/postings/term_info.rs b/src/postings/term_info.rs index 20bc0b1b8..c268ca850 100644 --- a/src/postings/term_info.rs +++ b/src/postings/term_info.rs @@ -22,11 +22,6 @@ pub struct TermInfo { pub positions_offset: u32, } -impl TermInfo { - pub fn inc_doc_freq(&mut self) { - self.doc_freq += 1; - } -} impl BinarySerializable for TermInfo { fn serialize(&self, writer: &mut io::Write) -> io::Result { diff --git a/src/store/writer.rs b/src/store/writer.rs index 569d8f509..426648381 100644 --- a/src/store/writer.rs +++ b/src/store/writer.rs @@ -2,12 +2,9 @@ use directory::WritePtr; use DocId; use schema::FieldValue; use common::BinarySerializable; -use std::io::Write; -use std::io; -use error::Result; +use std::io::{self, Write}; use lz4; use datastruct::SkipListBuilder; -use super::StoreReader; const BLOCK_SIZE: usize = 16_384; @@ -33,17 +30,6 @@ impl StoreWriter { } } - pub fn stack_reader(&mut self, reader: &StoreReader) -> Result<()> { - for doc_id in 0..reader.max_doc { - let doc = try!(reader.get(doc_id)); - let field_values: Vec<&FieldValue> = doc.field_values() - .iter() - .collect(); - try!(self.store(&field_values)); - } - Ok(()) - } - pub fn store<'a>(&mut self, field_values: &[&'a FieldValue]) -> io::Result<()> { self.intermediary_buffer.clear(); try!((field_values.len() as u32).serialize(&mut self.intermediary_buffer)); From 0f332d1fd3fc33bb53c329b717bf3a370980f890 Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Sun, 19 Feb 2017 19:25:40 +0900 Subject: [PATCH 031/107] issue/43 Removed doc freq from recorders. 
--- src/datastruct/fstmap.rs | 4 ++-- src/postings/recorder.rs | 23 ----------------------- 2 files changed, 2 insertions(+), 25 deletions(-) diff --git a/src/datastruct/fstmap.rs b/src/datastruct/fstmap.rs index 428e9b710..adb3e6e35 100644 --- a/src/datastruct/fstmap.rs +++ b/src/datastruct/fstmap.rs @@ -20,7 +20,7 @@ pub struct FstMapBuilder { } impl FstMapBuilder { - + pub fn new(w: W) -> io::Result> { let fst_builder = try!(fst::MapBuilder::new(w).map_err(convert_fst_error)); Ok(FstMapBuilder { @@ -50,6 +50,7 @@ impl FstMapBuilder { Ok(()) } + #[cfg(test)] pub fn insert(&mut self, key: &[u8], value: &V) -> io::Result<()> { try!(self.fst_builder .insert(key, self.data.len() as u64) @@ -146,7 +147,6 @@ mod tests { assert_eq!(keys.next().unwrap(), "abc".as_bytes()); assert_eq!(keys.next().unwrap(), "abcd".as_bytes()); assert_eq!(keys.next(), None); - } } diff --git a/src/postings/recorder.rs b/src/postings/recorder.rs index 94173720b..05586858a 100644 --- a/src/postings/recorder.rs +++ b/src/postings/recorder.rs @@ -26,8 +26,6 @@ pub trait Recorder: HeapAllocable { fn record_position(&mut self, position: u32, heap: &Heap); /// Close the document. It will help record the term frequency. fn close_doc(&mut self, heap: &Heap); - /// Returns the number of document that have been seen so far - fn doc_freq(&self) -> u32; /// Pushes the postings information to the serializer. 
fn serialize(&self, self_addr: u32, @@ -41,7 +39,6 @@ pub trait Recorder: HeapAllocable { pub struct NothingRecorder { stack: ExpUnrolledLinkedList, current_doc: DocId, - doc_freq: u32, } impl HeapAllocable for NothingRecorder { @@ -49,7 +46,6 @@ impl HeapAllocable for NothingRecorder { NothingRecorder { stack: ExpUnrolledLinkedList::with_addr(addr), current_doc: u32::max_value(), - doc_freq: 0u32, } } } @@ -62,17 +58,12 @@ impl Recorder for NothingRecorder { fn new_doc(&mut self, doc: DocId, heap: &Heap) { self.current_doc = doc; self.stack.push(doc, heap); - self.doc_freq += 1; } fn record_position(&mut self, _position: u32, _heap: &Heap) {} fn close_doc(&mut self, _heap: &Heap) {} - fn doc_freq(&self) -> u32 { - self.doc_freq - } - fn serialize(&self, self_addr: u32, serializer: &mut PostingsSerializer, @@ -91,7 +82,6 @@ pub struct TermFrequencyRecorder { stack: ExpUnrolledLinkedList, current_doc: DocId, current_tf: u32, - doc_freq: u32, } impl HeapAllocable for TermFrequencyRecorder { @@ -100,7 +90,6 @@ impl HeapAllocable for TermFrequencyRecorder { stack: ExpUnrolledLinkedList::with_addr(addr), current_doc: u32::max_value(), current_tf: 0u32, - doc_freq: 0u32, } } } @@ -111,7 +100,6 @@ impl Recorder for TermFrequencyRecorder { } fn new_doc(&mut self, doc: DocId, heap: &Heap) { - self.doc_freq += 1u32; self.current_doc = doc; self.stack.push(doc, heap); } @@ -126,10 +114,6 @@ impl Recorder for TermFrequencyRecorder { self.current_tf = 0; } - fn doc_freq(&self) -> u32 { - self.doc_freq - } - fn serialize(&self, self_addr: u32, serializer: &mut PostingsSerializer, @@ -154,7 +138,6 @@ impl Recorder for TermFrequencyRecorder { pub struct TFAndPositionRecorder { stack: ExpUnrolledLinkedList, current_doc: DocId, - doc_freq: u32, } impl HeapAllocable for TFAndPositionRecorder { @@ -162,7 +145,6 @@ impl HeapAllocable for TFAndPositionRecorder { TFAndPositionRecorder { stack: ExpUnrolledLinkedList::with_addr(addr), current_doc: u32::max_value(), - doc_freq: 0u32, } } } 
@@ -173,7 +155,6 @@ impl Recorder for TFAndPositionRecorder { } fn new_doc(&mut self, doc: DocId, heap: &Heap) { - self.doc_freq += 1; self.current_doc = doc; self.stack.push(doc, heap); } @@ -186,10 +167,6 @@ impl Recorder for TFAndPositionRecorder { self.stack.push(POSITION_END, heap); } - fn doc_freq(&self) -> u32 { - self.doc_freq - } - fn serialize(&self, self_addr: u32, serializer: &mut PostingsSerializer, From c677eb9f137270a65667d345fe8692fe04553292 Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Sun, 19 Feb 2017 22:41:45 +0900 Subject: [PATCH 032/107] issue/43 Removed notify --- src/lib.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/src/lib.rs b/src/lib.rs index 288d95e7c..45d9ca4fc 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -46,7 +46,6 @@ extern crate itertools; extern crate chan; extern crate crossbeam; extern crate bit_set; -extern crate notify; extern crate futures; extern crate futures_cpupool; From 4a8eb3cb05e4dbfedfe7b676b8dcfe051f8a93bc Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Wed, 22 Feb 2017 21:05:21 +0900 Subject: [PATCH 033/107] issue/43 Added unit test for deletes including merging. 
--- src/collector/mod.rs | 4 +- src/indexer/merger.rs | 135 +++++++++++++++++++++++++++++++++++++++++- 2 files changed, 136 insertions(+), 3 deletions(-) diff --git a/src/collector/mod.rs b/src/collector/mod.rs index 84bc38485..584b16714 100644 --- a/src/collector/mod.rs +++ b/src/collector/mod.rs @@ -140,8 +140,8 @@ pub mod tests { } } - pub fn vals(&self,) -> &Vec { - &self.vals + pub fn vals(self,) -> Vec { + self.vals } } diff --git a/src/indexer/merger.rs b/src/indexer/merger.rs index c48409b37..071d362d3 100644 --- a/src/indexer/merger.rs +++ b/src/indexer/merger.rs @@ -64,6 +64,7 @@ fn compute_min_max_val(u32_reader: &U32FastFieldReader, max_doc: DocId, delete_b // we need to recompute the max / min (0..max_doc) .filter(|doc_id| !delete_bitset.is_deleted(*doc_id)) + .map(|doc_id| u32_reader.get(doc_id)) .minmax() .into_option() } @@ -283,11 +284,15 @@ mod tests { use schema; use schema::Document; use schema::Term; + use query::TermQuery; + use schema::{Field, FieldValue}; use core::Index; + use Searcher; use DocAddress; use collector::tests::FastFieldTestCollector; use collector::tests::TestCollector; use query::BooleanQuery; + use postings::SegmentPostingsOption; use schema::TextIndexingOptions; use futures::Future; @@ -396,11 +401,139 @@ mod tests { let query = BooleanQuery::new_multiterms_query(terms); let mut collector = FastFieldTestCollector::for_field(score_field); assert!(searcher.search(&query, &mut collector).is_ok()); - collector.vals().clone() + collector.vals() }; assert_eq!(get_fast_vals(vec![Term::from_field_text(text_field, "a")]), vec!(5, 7, 13,)); } } } + + fn search_term(searcher: &Searcher, term: Term) -> Vec { + let mut collector = FastFieldTestCollector::for_field(Field(1)); + let term_query = TermQuery::new(term, SegmentPostingsOption::NoFreq); + searcher.search(&term_query, &mut collector).unwrap(); + collector.vals() + } + + #[test] + fn test_index_merger_with_deletes() { + let mut schema_builder = schema::SchemaBuilder::default(); + 
let text_fieldtype = schema::TextOptions::default() + .set_indexing_options(TextIndexingOptions::TokenizedWithFreq) + .set_stored(); + let text_field = schema_builder.add_text_field("text", text_fieldtype); + let score_fieldtype = schema::U32Options::default().set_fast(); + let score_field = schema_builder.add_u32_field("score", score_fieldtype); + let index = Index::create_in_ram(schema_builder.build()); + let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap(); + + { // a first commit + index_writer.add_document( + doc!( + text_field => "a b d", + score_field => 1 + )).unwrap(); + index_writer.add_document( + doc!( + text_field => "b c", + score_field => 2 + )).unwrap(); + index_writer.delete_term(Term::from_field_text(text_field, "c")); + index_writer.add_document( + doc!( + text_field => "c d", + score_field => 3 + )).unwrap(); + index_writer.commit().expect("committed"); + index.load_searchers().unwrap(); + let ref searcher = *index.searcher(); + assert_eq!(searcher.num_docs(), 2); + assert_eq!(searcher.segment_readers()[0].num_docs(), 2); + assert_eq!(searcher.segment_readers()[0].max_doc(), 3); + assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "a")), vec!(1)); + assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "b")), vec!(1)); + assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "c")), vec!(3)); + assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "d")), vec!(1, 3)); + } + { // a second commit + index_writer.add_document( + doc!( + text_field => "a d e", + score_field => 4_000 + )).unwrap(); + index_writer.add_document( + doc!( + text_field => "e f", + score_field => 5_000 + )).unwrap(); + index_writer.delete_term(Term::from_field_text(text_field, "a")); + index_writer.delete_term(Term::from_field_text(text_field, "f")); + index_writer.add_document( + doc!( + text_field => "f g", + score_field => 6_000 + )).unwrap(); + index_writer.add_document( + doc!( + text_field 
=> "g h", + score_field => 7_000 + )).unwrap(); + index_writer.commit().expect("committed"); + index.load_searchers().unwrap(); + let searcher = index.searcher(); + assert_eq!(searcher.segment_readers().len(), 2); + assert_eq!(searcher.num_docs(), 3); + assert_eq!(searcher.segment_readers()[0].num_docs(), 1); + assert_eq!(searcher.segment_readers()[0].max_doc(), 3); + assert_eq!(searcher.segment_readers()[1].num_docs(), 2); + assert_eq!(searcher.segment_readers()[1].max_doc(), 4); + assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "a")), vec!()); + assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "b")), vec!()); + assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "c")), vec!(3)); + assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "d")), vec!(3)); + assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "e")), vec!()); + assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "f")), vec!(6_000)); + assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "g")), vec!(6_000, 7_000)); + + let score_field_reader = searcher.segment_reader(0).get_fast_field_reader(score_field).unwrap(); + assert_eq!(score_field_reader.min_val(), 1); + assert_eq!(score_field_reader.max_val(), 3); + + let score_field_reader = searcher.segment_reader(1).get_fast_field_reader(score_field).unwrap(); + assert_eq!(score_field_reader.min_val(), 4000); + assert_eq!(score_field_reader.max_val(), 7000); + } + { // merging the segments + let segment_ids = index.searchable_segment_ids().expect("Searchable segments failed."); + index_writer.merge(&segment_ids) + .wait() + .expect("Merging failed"); + index_writer.wait_merging_threads().unwrap(); + + let segment_ids = index.searchable_segment_ids().expect("Searchable segments failed."); + let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap(); + index_writer.merge(&segment_ids) + .wait() + .expect("Merging failed"); + 
index_writer.wait_merging_threads().unwrap(); + index.load_searchers().unwrap(); + let ref searcher = *index.searcher(); + assert_eq!(searcher.segment_readers().len(), 1); + assert_eq!(searcher.num_docs(), 3); + assert_eq!(searcher.segment_readers()[0].num_docs(), 3); + assert_eq!(searcher.segment_readers()[0].max_doc(), 3); + assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "a")), vec!()); + assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "b")), vec!()); + assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "c")), vec!(3)); + assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "d")), vec!(3)); + assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "e")), vec!()); + assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "f")), vec!(6_000)); + assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "g")), vec!(6_000, 7_000)); + let score_field_reader = searcher.segment_reader(0).get_fast_field_reader(score_field).unwrap(); + assert_eq!(score_field_reader.min_val(), 3); + assert_eq!(score_field_reader.max_val(), 7000); + } + + } } From df9090cb0b0b77ecf1a02d09ec769e19fca9bc79 Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Wed, 22 Feb 2017 21:59:53 +0900 Subject: [PATCH 034/107] NOBUG TODO hunt, and cleanups --- src/collector/chained_collector.rs | 6 +-- src/collector/count_collector.rs | 4 +- src/collector/mod.rs | 11 ++--- src/collector/multi_collector.rs | 4 +- src/collector/top_collector.rs | 4 +- src/common/serialize.rs | 1 - src/core/index.rs | 4 +- src/core/segment_reader.rs | 19 ++++---- src/indexer/merger.rs | 77 +++++++++++++++++++++++++----- src/lib.rs | 14 +++--- 10 files changed, 98 insertions(+), 46 deletions(-) diff --git a/src/collector/chained_collector.rs b/src/collector/chained_collector.rs index 5840eb775..524ffec58 100644 --- a/src/collector/chained_collector.rs +++ b/src/collector/chained_collector.rs @@ -1,7 +1,7 @@ +use Result; use 
collector::Collector; use SegmentLocalId; use SegmentReader; -use std::io; use DocId; use Score; @@ -12,7 +12,7 @@ use Score; pub struct DoNothingCollector; impl Collector for DoNothingCollector { #[inline] - fn set_segment(&mut self, _: SegmentLocalId, _: &SegmentReader) -> io::Result<()> { + fn set_segment(&mut self, _: SegmentLocalId, _: &SegmentReader) -> Result<()> { Ok(()) } #[inline] @@ -38,7 +38,7 @@ impl ChainedCollector { } impl Collector for ChainedCollector { - fn set_segment(&mut self, segment_local_id: SegmentLocalId, segment: &SegmentReader) -> io::Result<()> { + fn set_segment(&mut self, segment_local_id: SegmentLocalId, segment: &SegmentReader) -> Result<()> { try!(self.left.set_segment(segment_local_id, segment)); try!(self.right.set_segment(segment_local_id, segment)); Ok(()) diff --git a/src/collector/count_collector.rs b/src/collector/count_collector.rs index 8a9014a25..ff15abd73 100644 --- a/src/collector/count_collector.rs +++ b/src/collector/count_collector.rs @@ -1,7 +1,7 @@ -use std::io; use super::Collector; use DocId; use Score; +use Result; use SegmentReader; use SegmentLocalId; @@ -28,7 +28,7 @@ impl Default for CountCollector { impl Collector for CountCollector { - fn set_segment(&mut self, _: SegmentLocalId, _: &SegmentReader) -> io::Result<()> { + fn set_segment(&mut self, _: SegmentLocalId, _: &SegmentReader) -> Result<()> { Ok(()) } diff --git a/src/collector/mod.rs b/src/collector/mod.rs index 584b16714..ff856ad08 100644 --- a/src/collector/mod.rs +++ b/src/collector/mod.rs @@ -2,7 +2,7 @@ use SegmentReader; use SegmentLocalId; use DocId; use Score; -use std::io; +use Result; mod count_collector; pub use self::count_collector::CountCollector; @@ -48,14 +48,14 @@ pub use self::chained_collector::chain; pub trait Collector { /// `set_segment` is called before beginning to enumerate /// on this segment. 
- fn set_segment(&mut self, segment_local_id: SegmentLocalId, segment: &SegmentReader) -> io::Result<()>; + fn set_segment(&mut self, segment_local_id: SegmentLocalId, segment: &SegmentReader) -> Result<()>; /// The query pushes the scored document to the collector via this method. fn collect(&mut self, doc: DocId, score: Score); } impl<'a, C: Collector> Collector for &'a mut C { - fn set_segment(&mut self, segment_local_id: SegmentLocalId, segment: &SegmentReader) -> io::Result<()> { + fn set_segment(&mut self, segment_local_id: SegmentLocalId, segment: &SegmentReader) -> Result<()> { (*self).set_segment(segment_local_id, segment) } /// The query pushes the scored document to the collector via this method. @@ -73,7 +73,6 @@ pub mod tests { use DocId; use Score; use core::SegmentReader; - use std::io; use SegmentLocalId; use fastfield::U32FastFieldReader; use schema::Field; @@ -107,7 +106,7 @@ pub mod tests { impl Collector for TestCollector { - fn set_segment(&mut self, _: SegmentLocalId, reader: &SegmentReader) -> io::Result<()> { + fn set_segment(&mut self, _: SegmentLocalId, reader: &SegmentReader) -> Result<()> { self.offset += self.segment_max_doc; self.segment_max_doc = reader.max_doc(); Ok(()) @@ -146,7 +145,7 @@ pub mod tests { } impl Collector for FastFieldTestCollector { - fn set_segment(&mut self, _: SegmentLocalId, reader: &SegmentReader) -> io::Result<()> { + fn set_segment(&mut self, _: SegmentLocalId, reader: &SegmentReader) -> Result<()> { self.ff_reader = Some(try!(reader.get_fast_field_reader(self.field))); Ok(()) } diff --git a/src/collector/multi_collector.rs b/src/collector/multi_collector.rs index 6ce999e80..e5eddc7f4 100644 --- a/src/collector/multi_collector.rs +++ b/src/collector/multi_collector.rs @@ -1,7 +1,7 @@ -use std::io; use super::Collector; use DocId; use Score; +use Result; use SegmentReader; use SegmentLocalId; @@ -25,7 +25,7 @@ impl<'a> MultiCollector<'a> { impl<'a> Collector for MultiCollector<'a> { - fn set_segment(&mut self, 
segment_local_id: SegmentLocalId, segment: &SegmentReader) -> io::Result<()> { + fn set_segment(&mut self, segment_local_id: SegmentLocalId, segment: &SegmentReader) -> Result<()> { for collector in &mut self.collectors { try!(collector.set_segment(segment_local_id, segment)); } diff --git a/src/collector/top_collector.rs b/src/collector/top_collector.rs index 21c023caf..6425eb300 100644 --- a/src/collector/top_collector.rs +++ b/src/collector/top_collector.rs @@ -1,8 +1,8 @@ -use std::io; use super::Collector; use SegmentReader; use SegmentLocalId; use DocAddress; +use Result; use std::collections::BinaryHeap; use std::cmp::Ordering; use DocId; @@ -105,7 +105,7 @@ impl TopCollector { impl Collector for TopCollector { - fn set_segment(&mut self, segment_id: SegmentLocalId, _: &SegmentReader) -> io::Result<()> { + fn set_segment(&mut self, segment_id: SegmentLocalId, _: &SegmentReader) -> Result<()> { self.segment_id = segment_id; Ok(()) } diff --git a/src/common/serialize.rs b/src/common/serialize.rs index b1ffab6cd..6bd1426fe 100644 --- a/src/common/serialize.rs +++ b/src/common/serialize.rs @@ -74,7 +74,6 @@ impl BinarySerializable for u64 { impl BinarySerializable for u8 { fn serialize(&self, writer: &mut Write) -> io::Result { - // TODO error try!(writer.write_u8(*self)); Ok(1) } diff --git a/src/core/index.rs b/src/core/index.rs index b4e04d0c8..06a6bc744 100644 --- a/src/core/index.rs +++ b/src/core/index.rs @@ -26,7 +26,6 @@ use directory::error::FileError; const NUM_SEARCHERS: usize = 12; - fn load_metas(directory: &Directory) -> Result { let meta_data = directory.atomic_read(&META_FILEPATH)?; let meta_string = String::from_utf8_lossy(&meta_data); @@ -117,7 +116,6 @@ impl Index { fn create_from_metas(directory: Box, metas: IndexMeta) -> Result { let schema = metas.schema.clone(); let opstamp = metas.opstamp; - // TODO log somethings is uncommitted is not empty. 
let index = Index { directory: directory, schema: schema, @@ -137,7 +135,7 @@ impl Index { /// Opens a new directory from an index path. pub fn open(directory_path: &Path) -> Result { let directory = try!(MmapDirectory::open(directory_path)); - let metas = try!(load_metas(&directory)); //< TODO does the directory already exists? + let metas = try!(load_metas(&directory)); Index::create_from_metas(directory.box_clone(), metas) } diff --git a/src/core/segment_reader.rs b/src/core/segment_reader.rs index 2795d87e3..5e8676882 100644 --- a/src/core/segment_reader.rs +++ b/src/core/segment_reader.rs @@ -74,16 +74,19 @@ impl SegmentReader { } /// Accessor to a segment's fast field reader given a field. - pub fn get_fast_field_reader(&self, field: Field) -> io::Result { + pub fn get_fast_field_reader(&self, field: Field) -> Result { let field_entry = self.schema.get_field_entry(field); - match *field_entry.field_type() { - FieldType::Str(_) => { - Err(io::Error::new(io::ErrorKind::Other, "fast field are not yet supported for text fields.")) + match field_entry.field_type() { + &FieldType::Str(_) => { + Err(Error::InvalidArgument(format!("Field <{}> is not a fast field. It is a text field, and fast text fields are not supported yet.", field_entry.name()))) }, - FieldType::U32(_) => { - // TODO check that the schema allows that - //Err(io::Error::new(io::ErrorKind::Other, "fast field are not yet supported for text fields.")) - self.fast_fields_reader.get_field(field) + &FieldType::U32(ref u32_options) => { + if u32_options.is_fast() { + Ok(self.fast_fields_reader.get_field(field)?) 
+ } + else { + Err(Error::InvalidArgument(format!("Field <{}> is not defined as a fast field.", field_entry.name()))) + } }, } } diff --git a/src/indexer/merger.rs b/src/indexer/merger.rs index 071d362d3..879b278e0 100644 --- a/src/indexer/merger.rs +++ b/src/indexer/merger.rs @@ -18,7 +18,7 @@ use store::StoreWriter; use core::SegmentInfo; use std::cmp::{min, max}; use std::iter; -use std::io; + pub struct IndexMerger { schema: Schema, @@ -70,11 +70,11 @@ fn compute_min_max_val(u32_reader: &U32FastFieldReader, max_doc: DocId, delete_b } } -fn extract_fieldnorm_reader(segment_reader: &SegmentReader, field: Field) -> io::Result { - segment_reader.get_fieldnorms_reader(field) +fn extract_fieldnorm_reader(segment_reader: &SegmentReader, field: Field) -> Result { + Ok(segment_reader.get_fieldnorms_reader(field)?) } -fn extract_fast_field_reader(segment_reader: &SegmentReader, field: Field) -> io::Result { +fn extract_fast_field_reader(segment_reader: &SegmentReader, field: Field) -> Result { segment_reader.get_fast_field_reader(field) } @@ -121,7 +121,7 @@ impl IndexMerger { // used both to merge field norms and regular u32 fast fields. 
fn generic_write_fast_field(&self, fields: Vec, - field_reader_extractor: &Fn(&SegmentReader, Field) -> io::Result, + field_reader_extractor: &Fn(&SegmentReader, Field) -> Result, fast_field_serializer: &mut FastFieldSerializer) -> Result<()> { for field in fields { @@ -509,14 +509,6 @@ mod tests { index_writer.merge(&segment_ids) .wait() .expect("Merging failed"); - index_writer.wait_merging_threads().unwrap(); - - let segment_ids = index.searchable_segment_ids().expect("Searchable segments failed."); - let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap(); - index_writer.merge(&segment_ids) - .wait() - .expect("Merging failed"); - index_writer.wait_merging_threads().unwrap(); index.load_searchers().unwrap(); let ref searcher = *index.searcher(); assert_eq!(searcher.segment_readers().len(), 1); @@ -534,6 +526,65 @@ mod tests { assert_eq!(score_field_reader.min_val(), 3); assert_eq!(score_field_reader.max_val(), 7000); } + { + // test a commit with only deletes + index_writer.delete_term(Term::from_field_text(text_field, "c")); + index_writer.commit().unwrap(); + + index.load_searchers().unwrap(); + let ref searcher = *index.searcher(); + assert_eq!(searcher.segment_readers().len(), 1); + assert_eq!(searcher.num_docs(), 2); + assert_eq!(searcher.segment_readers()[0].num_docs(), 2); + assert_eq!(searcher.segment_readers()[0].max_doc(), 3); + assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "a")), vec!()); + assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "b")), vec!()); + assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "c")), vec!()); + assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "d")), vec!()); + assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "e")), vec!()); + assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "f")), vec!(6_000)); + assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "g")), vec!(6_000, 
7_000)); + let score_field_reader = searcher.segment_reader(0).get_fast_field_reader(score_field).unwrap(); + assert_eq!(score_field_reader.min_val(), 3); + assert_eq!(score_field_reader.max_val(), 7000); + } + { // Test merging a single segment in order to remove deletes. + let segment_ids = index.searchable_segment_ids().expect("Searchable segments failed."); + index_writer.merge(&segment_ids) + .wait() + .expect("Merging failed"); + index.load_searchers().unwrap(); + + let ref searcher = *index.searcher(); + assert_eq!(searcher.segment_readers().len(), 1); + assert_eq!(searcher.num_docs(), 2); + assert_eq!(searcher.segment_readers()[0].num_docs(), 2); + assert_eq!(searcher.segment_readers()[0].max_doc(), 2); + assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "a")), vec!()); + assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "b")), vec!()); + assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "c")), vec!()); + assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "d")), vec!()); + assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "e")), vec!()); + assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "f")), vec!(6_000)); + assert_eq!(search_term(&searcher, Term::from_field_text(text_field, "g")), vec!(6_000, 7_000)); + let score_field_reader = searcher.segment_reader(0).get_fast_field_reader(score_field).unwrap(); + assert_eq!(score_field_reader.min_val(), 6000); + assert_eq!(score_field_reader.max_val(), 7000); + } + + { // Test removing all docs + index_writer.delete_term(Term::from_field_text(text_field, "g")); + let segment_ids = index.searchable_segment_ids().expect("Searchable segments failed."); + index_writer.merge(&segment_ids) + .wait() + .expect("Merging failed"); + index.load_searchers().unwrap(); + + let ref searcher = *index.searcher(); + assert_eq!(searcher.segment_readers().len(), 1); + assert_eq!(searcher.num_docs(), 0); + } + } } diff --git 
a/src/lib.rs b/src/lib.rs index 45d9ca4fc..2333987e4 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -117,14 +117,16 @@ pub use postings::SegmentPostingsOption; pub use core::TermIterator; -#[cfg(feature="simdcompression")] -pub fn version() -> &'static str { - concat!(version!(), "-simd") -} -#[cfg(not(feature="simdcompression"))] +/// Expose the current version of tantivy, as well +/// whether it was compiled with the simd compression. pub fn version() -> &'static str { - concat!(version!(), "-nosimd") + if cfg!(feature="simdcompression") { + concat!(version!(), "-simd") + } + else { + concat!(version!(), "-nosimd") + } } /// Tantivy's makes it possible to personalize when From 7f78d1f4ca8ffb3ab939fd344476f2e4d498c06f Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Thu, 23 Feb 2017 08:33:59 +0900 Subject: [PATCH 035/107] Fixes #82 Renamed and commented the function to create Term from &[u8] --- src/core/term_iterator.rs | 2 +- src/schema/term.rs | 16 ++++++++-------- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/src/core/term_iterator.rs b/src/core/term_iterator.rs index 3a5e259f7..ab2f125c7 100644 --- a/src/core/term_iterator.rs +++ b/src/core/term_iterator.rs @@ -100,7 +100,7 @@ impl<'a> TermIterator<'a> { for segment_ord in self.current_segment_ords.drain(..) { if let Some(term) = self.key_streams[segment_ord].next() { self.heap.push(HeapItem { - term: Term::from(term), + term: Term::from_bytes(term), segment_ord: segment_ord, }); } diff --git a/src/schema/term.rs b/src/schema/term.rs index 305aac6b6..c26fcc99f 100644 --- a/src/schema/term.rs +++ b/src/schema/term.rs @@ -11,7 +11,6 @@ use std::str; #[derive(Clone, PartialEq, PartialOrd, Ord, Eq, Hash)] pub struct Term(Vec); - impl Term { /// Pre-allocate a term buffer. @@ -64,6 +63,14 @@ impl Term { Term(buffer) } + /// Builds a term from its byte representation. + /// + /// If you want to build a field for a given `str`, + /// you want to use `from_field_text`. 
+ pub fn from_bytes(data: &[u8]) -> Term { + Term(Vec::from(data)) + } + /// Returns the serialized value of the term. /// (this does not include the field.) /// @@ -96,12 +103,6 @@ impl Term { } } -impl<'a> From<&'a [u8]> for Term { - fn from(data: &[u8]) -> Term { - Term(Vec::from(data)) - } -} - impl AsRef<[u8]> for Term { fn as_ref(&self) -> &[u8] { &self.0 @@ -115,7 +116,6 @@ impl fmt::Debug for Term { } - #[cfg(test)] mod tests { From eb39db44fcf4dfcd0ffda5961a5267386d4a813a Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Thu, 23 Feb 2017 09:20:30 +0900 Subject: [PATCH 036/107] issue/43 Avoid keeping segments with 0 documents. --- src/core/index.rs | 3 ++- src/indexer/index_writer.rs | 10 ++++------ src/indexer/merger.rs | 8 +++++--- 3 files changed, 11 insertions(+), 10 deletions(-) diff --git a/src/core/index.rs b/src/core/index.rs index 06a6bc744..c3753514d 100644 --- a/src/core/index.rs +++ b/src/core/index.rs @@ -243,7 +243,8 @@ impl Index { let mut searchers = Vec::new(); for _ in 0..NUM_SEARCHERS { let searchable_segments_clone = searchable_segments.clone(); - let segment_readers: Vec = try!(searchable_segments_clone.into_iter() + let segment_readers: Vec = try!(searchable_segments_clone + .into_iter() .map(SegmentReader::open) .collect()); let searcher = Searcher::from(segment_readers); diff --git a/src/indexer/index_writer.rs b/src/indexer/index_writer.rs index e2a569552..78fe1e811 100644 --- a/src/indexer/index_writer.rs +++ b/src/indexer/index_writer.rs @@ -155,16 +155,14 @@ fn index_documents(heap: &mut Heap, // this is ensured by the call to peek before starting // the worker thread. 
assert!(num_docs > 0); - - segment - .meta_mut() - .set_num_docs(num_docs); - + let doc_opstamps: Vec = segment_writer.finalize()?; // let segment_entry = advance_deletes(&mut segment, delete_queue, delete_position, )?; + let mut segment_meta = SegmentMeta::new(segment.id()); + segment_meta.set_num_docs(num_docs); - let mut segment_entry = SegmentEntry::new(SegmentMeta::new(segment.id())); + let mut segment_entry = SegmentEntry::new(segment_meta); segment_entry.set_doc_to_opstamp(DocToOpstampMapping::from(doc_opstamps)); segment_updater diff --git a/src/indexer/merger.rs b/src/indexer/merger.rs index 879b278e0..0c28a4d79 100644 --- a/src/indexer/merger.rs +++ b/src/indexer/merger.rs @@ -83,9 +83,11 @@ impl IndexMerger { let mut readers = vec!(); let mut max_doc = 0; for segment in segments { - let reader = SegmentReader::open(segment.clone())?; - max_doc += reader.num_docs(); - readers.push(reader); + if segment.meta().num_docs() > 0 { + let reader = SegmentReader::open(segment.clone())?; + max_doc += reader.num_docs(); + readers.push(reader); + } } Ok(IndexMerger { schema: schema, From 503d0295cb3bfa9ef6966cd675a77de00fcd9f03 Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Thu, 23 Feb 2017 09:48:08 +0900 Subject: [PATCH 037/107] issue/43 TODO hunt --- src/core/segment_reader.rs | 1 - src/datastruct/skip/skiplist_builder.rs | 2 +- src/indexer/doc_opstamp_mapping.rs | 71 +++++++++++++++++++++++-- src/indexer/index_writer.rs | 13 ++--- src/indexer/merger.rs | 4 +- src/indexer/segment_updater.rs | 13 +++-- src/query/boolean_query/mod.rs | 2 - 7 files changed, 79 insertions(+), 27 deletions(-) diff --git a/src/core/segment_reader.rs b/src/core/segment_reader.rs index 5e8676882..f1713f931 100644 --- a/src/core/segment_reader.rs +++ b/src/core/segment_reader.rs @@ -147,7 +147,6 @@ impl SegmentReader { .open_read(SegmentComponent::POSITIONS) .unwrap_or_else(|_| ReadOnlySource::empty()); - // TODO 0u64 let delete_bitset = if segment.meta().has_deletes() { let delete_data 
= segment.open_read(SegmentComponent::DELETE)?; diff --git a/src/datastruct/skip/skiplist_builder.rs b/src/datastruct/skip/skiplist_builder.rs index b83406029..9806a69af 100644 --- a/src/datastruct/skip/skiplist_builder.rs +++ b/src/datastruct/skip/skiplist_builder.rs @@ -36,7 +36,7 @@ impl LayerBuilder { fn insert(&mut self, doc_id: DocId, value: &T) -> io::Result> { self.remaining -= 1; self.len += 1; - let offset = self.written_size() as u32; // TODO not sure if we want after or here + let offset = self.written_size() as u32; try!(doc_id.serialize(&mut self.buffer)); try!(value.serialize(&mut self.buffer)); Ok(if self.remaining == 0 { diff --git a/src/indexer/doc_opstamp_mapping.rs b/src/indexer/doc_opstamp_mapping.rs index 843002416..16eb1ff28 100644 --- a/src/indexer/doc_opstamp_mapping.rs +++ b/src/indexer/doc_opstamp_mapping.rs @@ -1,6 +1,22 @@ use std::sync::Arc; use DocId; + +// Doc to opstamp is used to identify which +// document should be deleted. +// +// Since the docset matching the query of a delete operation +// is not computed right when the delete operation is received, +// we need to find a way to evaluate, for each document, +// whether the document was added before or after +// the delete operation. This anteriority is used by comparing +// the docstamp of the document. +// +// The doc to opstamp mapping stores precisely an array +// indexed by doc id and storing the opstamp of the document. +// +// This mapping is (for the moment) stricly increasing +// because of the way document id are allocated. #[derive(Clone)] pub enum DocToOpstampMapping { WithMap(Arc>), @@ -13,12 +29,18 @@ impl From> for DocToOpstampMapping { } } -impl DocToOpstampMapping { - // TODO Unit test - pub fn compute_doc_limit(&self, opstamp: u64) -> DocId { +impl DocToOpstampMapping { + + /// Given an opstamp return the limit doc id L + /// such that all doc id D such that + // D >= L iff opstamp(D) >= than `target_opstamp`. 
+ // + // The edge case opstamp = some doc opstamp is in practise + // never called. + pub fn compute_doc_limit(&self, target_opstamp: u64) -> DocId { match *self { DocToOpstampMapping::WithMap(ref doc_opstamps) => { - match doc_opstamps.binary_search(&opstamp) { + match doc_opstamps.binary_search(&target_opstamp) { Ok(doc_id) => doc_id as DocId, Err(doc_id) => doc_id as DocId, } @@ -28,3 +50,44 @@ impl DocToOpstampMapping { } } +#[cfg(test)] +mod tests { + + use super::DocToOpstampMapping; + + #[test] + fn test_doc_to_opstamp_mapping_none() { + let doc_to_opstamp_mapping = DocToOpstampMapping::None; + assert_eq!(doc_to_opstamp_mapping.compute_doc_limit(1), u32::max_value()); + } + + #[test] + fn test_doc_to_opstamp_mapping_complex() { + { + let doc_to_opstamp_mapping = DocToOpstampMapping::from(vec!()); + assert_eq!(doc_to_opstamp_mapping.compute_doc_limit(0u64), 0); + assert_eq!(doc_to_opstamp_mapping.compute_doc_limit(2u64), 0); + } + { + let doc_to_opstamp_mapping = DocToOpstampMapping::from(vec!(1u64)); + assert_eq!(doc_to_opstamp_mapping.compute_doc_limit(0u64), 0); + assert_eq!(doc_to_opstamp_mapping.compute_doc_limit(2u64), 1); + } + { + let doc_to_opstamp_mapping = DocToOpstampMapping::from(vec!(1u64, 12u64, 17u64, 23u64)); + assert_eq!(doc_to_opstamp_mapping.compute_doc_limit(0u64), 0); + for i in 2u64..13u64 { + assert_eq!(doc_to_opstamp_mapping.compute_doc_limit(i), 1); + } + for i in 13u64..18u64 { + assert_eq!(doc_to_opstamp_mapping.compute_doc_limit(i), 2); + } + for i in 18u64..24u64 { + assert_eq!(doc_to_opstamp_mapping.compute_doc_limit(i), 3); + } + for i in 24u64..30u64 { + assert_eq!(doc_to_opstamp_mapping.compute_doc_limit(i), 4); + } + } + } +} \ No newline at end of file diff --git a/src/indexer/index_writer.rs b/src/indexer/index_writer.rs index 78fe1e811..549de17c1 100644 --- a/src/indexer/index_writer.rs +++ b/src/indexer/index_writer.rs @@ -209,8 +209,6 @@ impl IndexWriter { let mut segment_updater = self.segment_updater.clone(); let mut 
heap = Heap::with_capacity(self.heap_size_in_bytes_per_thread); - // TODO fix this. the cursor might be too advanced - // at this point. let generation = self.generation; let join_handle: JoinHandle> = @@ -369,7 +367,6 @@ impl IndexWriter { // pending add segment commands will be dismissed. self.generation += 1; - // TODO requires a new delete queue... let rollback_future = self.segment_updater.rollback(self.generation); // we cannot drop segment ready receiver yet @@ -457,13 +454,11 @@ impl IndexWriter { // committed segments. self.committed_opstamp = self.stamp(); - // TODO remove clone - let future = self.segment_updater.commit(self.committed_opstamp); - // wait for the segment update thread to have processed the info - // TODO remove unwrap - future.wait().unwrap(); - + self.segment_updater + .commit(self.committed_opstamp) + .wait()?; + self.delete_queue.clear(); Ok(self.committed_opstamp) } diff --git a/src/indexer/merger.rs b/src/indexer/merger.rs index 0c28a4d79..fa8c94c99 100644 --- a/src/indexer/merger.rs +++ b/src/indexer/merger.rs @@ -149,9 +149,7 @@ impl IndexMerger { } assert!(min_val <= max_val); - - // TODO test deleting all documents off the index. 
- + try!(fast_field_serializer.new_u32_fast_field(field, min_val, max_val)); for (max_doc, u32_reader, delete_bitset) in u32_readers { for doc_id in 0..max_doc { diff --git a/src/indexer/segment_updater.rs b/src/indexer/segment_updater.rs index b9c4e47d2..f7f9ea868 100644 --- a/src/indexer/segment_updater.rs +++ b/src/indexer/segment_updater.rs @@ -136,21 +136,21 @@ impl SegmentUpdater { } - fn run_async T>(&self, f: F) -> impl Future { + fn run_async T>(&self, f: F) -> impl Future { let me_clone = self.clone(); self.0.pool.spawn_fn(move || { Ok(f(me_clone)) }) } - pub fn rollback(&mut self, generation: usize) -> impl Future { + pub fn rollback(&mut self, generation: usize) -> impl Future { self.0.generation.store(generation, Ordering::Release); self.run_async(|segment_updater| { segment_updater.0.segment_manager.rollback(); }) } - pub fn add_segment(&self, generation: usize, segment_entry: SegmentEntry) -> impl Future { + pub fn add_segment(&self, generation: usize, segment_entry: SegmentEntry) -> impl Future { if generation >= self.0.generation.load(Ordering::Acquire) { future::Either::A(self.run_async(|segment_updater| { segment_updater.0.segment_manager.add_segment(segment_entry); @@ -174,7 +174,7 @@ impl SegmentUpdater { .collect() } - pub fn commit(&self, opstamp: u64) -> impl Future { + pub fn commit(&self, opstamp: u64) -> impl Future { self.run_async(move |segment_updater| { let segment_metas = segment_updater.purge_deletes().expect("Failed purge deletes"); let segment_entries = segment_metas @@ -242,8 +242,7 @@ impl SegmentUpdater { .collect(); // An IndexMerger is like a "view" of our merged segments. - // TODO unwrap - let merger: IndexMerger = IndexMerger::open(schema, &segments[..]).expect("Creating index merger failed"); + let merger: IndexMerger = IndexMerger::open(schema, &segments[..])?; let mut merged_segment = index.new_segment(); // ... 
we just serialize this index merger in our new segment @@ -284,7 +283,7 @@ impl SegmentUpdater { fn end_merge(&self, merged_segment_metas: Vec, - resulting_segment_entry: SegmentEntry) -> impl Future { + resulting_segment_entry: SegmentEntry) -> impl Future { self.run_async(move |segment_updater| { segment_updater.0.segment_manager.end_merge(&merged_segment_metas, resulting_segment_entry); diff --git a/src/query/boolean_query/mod.rs b/src/query/boolean_query/mod.rs index c04349687..a38d72f81 100644 --- a/src/query/boolean_query/mod.rs +++ b/src/query/boolean_query/mod.rs @@ -102,8 +102,6 @@ mod tests { } { let boolean_query = BooleanQuery::from(vec![(Occur::MustNot, make_term_query("d")),]); - // TODO optimize this use case : only MustNot subqueries... no need - // to read any postings. assert_eq!(matching_docs(&boolean_query), Vec::new()); } } From 78228ece73ee6c16807953b38338153f5633a769 Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Fri, 24 Feb 2017 23:41:46 +0900 Subject: [PATCH 038/107] Closes #92. ByteOrder of u32 terms. --- src/schema/term.rs | 24 +++++++++++++++++++----- 1 file changed, 19 insertions(+), 5 deletions(-) diff --git a/src/schema/term.rs b/src/schema/term.rs index c26fcc99f..cc727999a 100644 --- a/src/schema/term.rs +++ b/src/schema/term.rs @@ -1,6 +1,7 @@ use std::fmt; use common::BinarySerializable; +use byteorder::{BigEndian, ByteOrder}; use super::Field; use std::str; @@ -42,10 +43,13 @@ impl Term { /// the Term will have 5 bytes. /// The first byte is `1`, and the 4 following bytes are that of the u32. 
pub fn from_field_u32(field: Field, val: u32) -> Term { - let mut buffer = Vec::with_capacity(1 + 4); - buffer.clear(); - field.serialize(&mut buffer).unwrap(); - val.serialize(&mut buffer).unwrap(); + const U32_TERM_LEN: usize = 1 + 4; + let mut buffer = Vec::with_capacity(U32_TERM_LEN); + unsafe { buffer.set_len(U32_TERM_LEN) }; + buffer[0] = field.0; + // we want BigEndian here to have lexicographic order + // match the natural order of vals. + BigEndian::write_u32(&mut buffer[1..5], val); Term(buffer) } @@ -62,6 +66,13 @@ impl Term { buffer.extend(text.as_bytes()); Term(buffer) } + + /// Assume the term is a u32 field. + /// + /// Panics if the term is not a u32 field. + pub fn get_u32(&self) -> u32 { + BigEndian::read_u32(&self.0[1..]) + } /// Builds a term from its byte representation. /// @@ -138,7 +149,10 @@ mod tests { assert_eq!(term.field(), count_field); assert_eq!(term.as_slice()[0], 2u8); assert_eq!(term.as_slice().len(), 5); - assert_eq!(term.as_slice()[1], (983u32 % 256u32) as u8); + assert_eq!(term.as_slice()[1], 0u8); + assert_eq!(term.as_slice()[2], 0u8); + assert_eq!(term.as_slice()[3], (933u32 / 256u32) as u8); + assert_eq!(term.as_slice()[4], (983u32 % 256u32) as u8); } } From f326a2dafe932d671b8904791b6027bba4428a0d Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Sat, 25 Feb 2017 15:26:33 +0900 Subject: [PATCH 039/107] TODO hunt --- src/error.rs | 2 +- src/indexer/delete_queue.rs | 11 +++++------ src/indexer/index_writer.rs | 11 ++++++----- src/schema/field_entry.rs | 2 +- 4 files changed, 13 insertions(+), 13 deletions(-) diff --git a/src/error.rs b/src/error.rs index 6a82c239d..3404e9911 100644 --- a/src/error.rs +++ b/src/error.rs @@ -36,7 +36,7 @@ pub enum Error { /// Invalid argument was passed by the user. 
InvalidArgument(String), /// An Error happened in one of the thread - ErrorInThread(String), // TODO investigate better solution + ErrorInThread(String), } impl From for Error { diff --git a/src/indexer/delete_queue.rs b/src/indexer/delete_queue.rs index 498bff51f..0c5dedba1 100644 --- a/src/indexer/delete_queue.rs +++ b/src/indexer/delete_queue.rs @@ -7,7 +7,7 @@ use std::mem; #[derive(Default)] struct InnerDeleteQueue { - ro_chunks: ReadOnlyDeletes, + ro_chunks: DeleteQueueSnapshot, last_chunk: Vec, } @@ -16,7 +16,7 @@ impl InnerDeleteQueue { self.last_chunk.push(delete_operation); } - pub fn snapshot(&mut self,) -> ReadOnlyDeletes { + pub fn snapshot(&mut self,) -> DeleteQueueSnapshot { if self.last_chunk.len() > 0 { let new_operations = vec!(); let new_ro_chunk = mem::replace(&mut self.last_chunk, new_operations); @@ -33,11 +33,10 @@ impl InnerDeleteQueue { -// TODO Rename to DeleteQueueSnapshot #[derive(Default, Clone)] -pub struct ReadOnlyDeletes(Vec>>); +pub struct DeleteQueueSnapshot(Vec>>); -impl ReadOnlyDeletes { +impl DeleteQueueSnapshot { fn push(&mut self, operations: Vec) { self.0.push(Arc::new(operations)); } @@ -61,7 +60,7 @@ impl DeleteQueue { self.0.write().unwrap().push(delete_operation); } - pub fn snapshot(&self) -> ReadOnlyDeletes { + pub fn snapshot(&self) -> DeleteQueueSnapshot { self.0.write().unwrap().snapshot() } diff --git a/src/indexer/index_writer.rs b/src/indexer/index_writer.rs index 549de17c1..8e6a9351e 100644 --- a/src/indexer/index_writer.rs +++ b/src/indexer/index_writer.rs @@ -9,7 +9,7 @@ use core::SegmentReader; use datastruct::stacker::Heap; use Error; use fastfield::delete::write_delete_bitset; -use indexer::delete_queue::ReadOnlyDeletes; +use indexer::delete_queue::DeleteQueueSnapshot; use futures::Canceled; use futures::Future; use indexer::delete_queue::DeleteQueue; @@ -94,7 +94,7 @@ impl !Sync for IndexWriter {} pub fn advance_deletes( segment: &mut Segment, - delete_operations: &ReadOnlyDeletes, + delete_operations: 
&DeleteQueueSnapshot, doc_opstamps: &DocToOpstampMapping) -> Result { let segment_reader = SegmentReader::open(segment.clone())?; let mut delete_bitset = BitSet::with_capacity(segment_reader.max_doc() as usize); @@ -134,14 +134,15 @@ pub fn advance_deletes( } fn index_documents(heap: &mut Heap, - mut segment: Segment, + segment: Segment, schema: &Schema, generation: usize, document_iterator: &mut Iterator, segment_updater: &mut SegmentUpdater) -> Result { heap.clear(); - let mut segment_writer = try!(SegmentWriter::for_segment(heap, segment.clone(), &schema)); + let segment_id = segment.id(); + let mut segment_writer = try!(SegmentWriter::for_segment(heap, segment, &schema)); for doc in document_iterator { try!(segment_writer.add_document(&doc, &schema)); if segment_writer.is_buffer_full() { @@ -159,7 +160,7 @@ fn index_documents(heap: &mut Heap, let doc_opstamps: Vec = segment_writer.finalize()?; // let segment_entry = advance_deletes(&mut segment, delete_queue, delete_position, )?; - let mut segment_meta = SegmentMeta::new(segment.id()); + let mut segment_meta = SegmentMeta::new(segment_id); segment_meta.set_num_docs(num_docs); let mut segment_entry = SegmentEntry::new(segment_meta); diff --git a/src/schema/field_entry.rs b/src/schema/field_entry.rs index 253d2ebb0..99f3cb42c 100644 --- a/src/schema/field_entry.rs +++ b/src/schema/field_entry.rs @@ -54,7 +54,7 @@ impl FieldEntry { pub fn is_indexed(&self,) -> bool { match self.field_type { FieldType::Str(ref options) => options.get_indexing_options().is_indexed(), - _ => false, // TODO handle u32 indexed + FieldType::U32(ref options) => options.is_indexed(), } } From 1d9924ee9067c1a354399128a6b0f53f201150a6 Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Sat, 25 Feb 2017 15:57:05 +0900 Subject: [PATCH 040/107] Closes #43. 
--- src/fastfield/writer.rs | 1 - src/postings/docset.rs | 3 --- src/postings/intersection.rs | 2 -- src/postings/postings_writer.rs | 2 +- src/postings/serializer.rs | 2 +- 5 files changed, 2 insertions(+), 8 deletions(-) diff --git a/src/fastfield/writer.rs b/src/fastfield/writer.rs index 4527533df..715182f36 100644 --- a/src/fastfield/writer.rs +++ b/src/fastfield/writer.rs @@ -98,7 +98,6 @@ impl U32FastFieldWriter { } }, None => { - // TODO make default value configurable 0u32 } } diff --git a/src/postings/docset.rs b/src/postings/docset.rs index 6698c5a2b..e28319f42 100644 --- a/src/postings/docset.rs +++ b/src/postings/docset.rs @@ -55,9 +55,6 @@ pub trait DocSet { /// Returns the current document fn doc(&self) -> DocId; - /// TODO can impl trait for trait? - - /// Advances the cursor to the next document /// None is returned if the iterator has `DocSet` /// has already been entirely consumed. diff --git a/src/postings/intersection.rs b/src/postings/intersection.rs index d6fc20545..e4e4c2308 100644 --- a/src/postings/intersection.rs +++ b/src/postings/intersection.rs @@ -2,8 +2,6 @@ use postings::DocSet; use postings::SkipResult; use DocId; -// TODO Find a way to specialize `IntersectionDocSet` - /// Creates a `DocSet` that iterator through the intersection of two `DocSet`s. 
pub struct IntersectionDocSet { docsets: Vec, diff --git a/src/postings/postings_writer.rs b/src/postings/postings_writer.rs index 667e67458..0ec5559dd 100644 --- a/src/postings/postings_writer.rs +++ b/src/postings/postings_writer.rs @@ -120,7 +120,7 @@ impl<'a, Rec: Recorder + 'static> PostingsWriter for SpecializedPostingsWriter<' term_offsets.sort_by_key(|&(k, _v)| k); let mut term = Term::allocate(Field(0), 100); for (term_bytes, (addr, recorder)) in term_offsets { - // TODO remove copy + // sadly we are required to copy the data term.set_content(term_bytes); try!(serializer.new_term(&term)); try!(recorder.serialize(addr, serializer, heap)); diff --git a/src/postings/serializer.rs b/src/postings/serializer.rs index 67c077429..4dcfe1851 100644 --- a/src/postings/serializer.rs +++ b/src/postings/serializer.rs @@ -49,7 +49,7 @@ use common::BinarySerializable; /// A description of the serialization format is /// [available here](https://fulmicoton.gitbooks.io/tantivy-doc/content/inverted-index.html). pub struct PostingsSerializer { - terms_fst_builder: FstMapBuilder, /* TODO find an alternative to work around the "move" */ + terms_fst_builder: FstMapBuilder, postings_write: WritePtr, positions_write: WritePtr, written_bytes_postings: usize, From ca1617d3cd51427a59346ed777b734bd4d69dd4e Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Sat, 25 Feb 2017 20:32:26 +0900 Subject: [PATCH 041/107] Fixes #91 --- src/schema/schema.rs | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/schema/schema.rs b/src/schema/schema.rs index 8b35aff01..287ad40a0 100644 --- a/src/schema/schema.rs +++ b/src/schema/schema.rs @@ -12,7 +12,7 @@ use std::sync::Arc; use super::*; use std::fmt; - +const MAX_NUM_FIELDS: usize = 255; /// Tantivy has a very strict schema. 
/// You need to specify in advance whether a field is indexed or not, @@ -94,10 +94,13 @@ impl SchemaBuilder { /// Finalize the creation of a `Schema` /// This will consume your `SchemaBuilder` pub fn build(self,) -> Schema { + if self.fields.len() > MAX_NUM_FIELDS { + panic!("There may be at most 255 fields."); + } Schema(Arc::new(InnerSchema { fields: self.fields, fields_map: self.fields_map, - })) + })) } } From 3a86fc00a2c84134395168cc854f57b326987eaa Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Sat, 25 Feb 2017 20:40:39 +0900 Subject: [PATCH 042/107] Closes #64 - Improve Index creationg API / documentation --- src/core/index.rs | 18 +++-- src/error.rs | 5 +- src/indexer/index_writer.rs | 128 ++++++++++++++++++------------------ src/indexer/mod.rs | 3 +- src/lib.rs | 8 ++- 5 files changed, 86 insertions(+), 76 deletions(-) diff --git a/src/core/index.rs b/src/core/index.rs index c3753514d..e51fcb006 100644 --- a/src/core/index.rs +++ b/src/core/index.rs @@ -7,7 +7,7 @@ use std::fmt; use rustc_serialize::json; use core::SegmentId; use directory::{Directory, MmapDirectory, RAMDirectory}; -use indexer::IndexWriter; +use indexer::index_writer::open_index_writer; use core::searcher::Searcher; use std::convert::From; use num_cpus; @@ -18,6 +18,7 @@ use core::SegmentMeta; use super::pool::LeasedItem; use std::path::Path; use core::IndexMeta; +use IndexWriter; use core::META_FILEPATH; use super::segment::create_segment; use indexer::segment_updater::save_new_metas; @@ -147,8 +148,16 @@ impl Index { self.opstamp } - /// Creates a multithreaded writer. - /// Each writer produces an independent segment. + /// Open a new index writer. Attempts to acquire a lockfile. + /// + /// The lockfile should be deleted on drop, but it is possible + /// that due to a panic or other error, a stale lockfile will be + /// left in the index directory. 
If you are sure that no other + /// `IndexWriter` on the system is accessing the index directory, + /// it is safe to manually delete the lockfile. + /// + /// num_threads specifies the number of indexing workers that + /// should work at the same time. /// /// # Errors /// If the lockfile already exists, returns `Error::FileAlreadyExists`. @@ -158,12 +167,13 @@ impl Index { num_threads: usize, heap_size_in_bytes: usize) -> Result { - IndexWriter::open(self, num_threads, heap_size_in_bytes) + open_index_writer(self, num_threads, heap_size_in_bytes) } /// Creates a multithreaded writer /// It just calls `writer_with_num_threads` with the number of cores as `num_threads` + /// /// # Errors /// If the lockfile already exists, returns `Error::FileAlreadyExists`. /// # Panics diff --git a/src/error.rs b/src/error.rs index 3404e9911..2b405357f 100644 --- a/src/error.rs +++ b/src/error.rs @@ -3,7 +3,7 @@ /// Definition of Tantivy's error and result. use std::io; -use std::result; + use std::path::PathBuf; use std::error; use std::sync::PoisonError; @@ -12,8 +12,7 @@ use query; use schema; -/// Tantivy result. -pub type Result = result::Result; + /// Generic tantivy error. diff --git a/src/indexer/index_writer.rs b/src/indexer/index_writer.rs index 8e6a9351e..92c19e619 100644 --- a/src/indexer/index_writer.rs +++ b/src/indexer/index_writer.rs @@ -86,6 +86,70 @@ impl !Send for IndexWriter {} impl !Sync for IndexWriter {} + +/// Open a new index writer. Attempts to acquire a lockfile. +/// +/// The lockfile should be deleted on drop, but it is possible +/// that due to a panic or other error, a stale lockfile will be +/// left in the index directory. If you are sure that no other +/// `IndexWriter` on the system is accessing the index directory, +/// it is safe to manually delete the lockfile. +/// +/// num_threads specifies the number of indexing workers that +/// should work at the same time. 
+/// # Errors +/// If the lockfile already exists, returns `Error::FileAlreadyExists`. +/// # Panics +/// If the heap size per thread is too small, panics. +pub fn open_index_writer(index: &Index, + num_threads: usize, + heap_size_in_bytes_per_thread: usize) + -> Result { + + if heap_size_in_bytes_per_thread <= HEAP_SIZE_LIMIT as usize { + panic!(format!("The heap size per thread needs to be at least {}.", + HEAP_SIZE_LIMIT)); + } + + let directory_lock = try!(DirectoryLock::lock(index.directory().box_clone())); + + let (document_sender, document_receiver): (DocumentSender, DocumentReceiver) = + chan::sync(PIPELINE_MAX_SIZE_IN_DOCS); + + + let delete_queue = DeleteQueue::default(); + + let segment_updater = SegmentUpdater::new(index.clone(), delete_queue.clone())?; + + let mut index_writer = IndexWriter { + + _directory_lock: directory_lock, + + heap_size_in_bytes_per_thread: heap_size_in_bytes_per_thread, + index: index.clone(), + + document_receiver: document_receiver, + document_sender: document_sender, + + segment_updater: segment_updater, + + workers_join_handle: Vec::new(), + num_threads: num_threads, + + delete_queue: delete_queue, + + committed_opstamp: index.opstamp(), + uncommitted_opstamp: index.opstamp(), + + generation: 0, + + worker_id: 0, + }; + try!(index_writer.start_workers()); + Ok(index_writer) +} + + // TODO put delete bitset in segment entry // rather than DocToOpstamp. @@ -253,70 +317,6 @@ impl IndexWriter { Ok(()) } - - /// Open a new index writer. Attempts to acquire a lockfile. - /// - /// The lockfile should be deleted on drop, but it is possible - /// that due to a panic or other error, a stale lockfile will be - /// left in the index directory. If you are sure that no other - /// `IndexWriter` on the system is accessing the index directory, - /// it is safe to manually delete the lockfile. - /// - /// num_threads specifies the number of indexing workers that - /// should work at the same time. 
- /// # Errors - /// If the lockfile already exists, returns `Error::FileAlreadyExists`. - /// # Panics - /// If the heap size per thread is too small, panics. - pub fn open(index: &Index, - num_threads: usize, - heap_size_in_bytes_per_thread: usize) - -> Result { - - if heap_size_in_bytes_per_thread <= HEAP_SIZE_LIMIT as usize { - panic!(format!("The heap size per thread needs to be at least {}.", - HEAP_SIZE_LIMIT)); - } - - let directory_lock = try!(DirectoryLock::lock(index.directory().box_clone())); - - let (document_sender, document_receiver): (DocumentSender, DocumentReceiver) = - chan::sync(PIPELINE_MAX_SIZE_IN_DOCS); - - - let delete_queue = DeleteQueue::default(); - - let segment_updater = SegmentUpdater::new(index.clone(), delete_queue.clone())?; - - let mut index_writer = IndexWriter { - - _directory_lock: directory_lock, - - heap_size_in_bytes_per_thread: heap_size_in_bytes_per_thread, - index: index.clone(), - - document_receiver: document_receiver, - document_sender: document_sender, - - segment_updater: segment_updater, - - workers_join_handle: Vec::new(), - num_threads: num_threads, - - delete_queue: delete_queue, - - committed_opstamp: index.opstamp(), - uncommitted_opstamp: index.opstamp(), - - generation: 0, - - worker_id: 0, - }; - try!(index_writer.start_workers()); - Ok(index_writer) - } - - pub fn get_merge_policy(&self) -> Box { self.segment_updater.get_merge_policy() } diff --git a/src/indexer/mod.rs b/src/indexer/mod.rs index b2a71fa02..8380332af 100644 --- a/src/indexer/mod.rs +++ b/src/indexer/mod.rs @@ -1,5 +1,4 @@ - -mod index_writer; +pub mod index_writer; pub mod segment_serializer; pub mod merger; mod merge_policy; diff --git a/src/lib.rs b/src/lib.rs index 2333987e4..5c8ca7ece 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -73,6 +73,11 @@ mod macros { ); } +pub use error::Error; + +/// Tantivy result. 
+pub type Result = std::result::Result; + mod core; mod compression; mod fastfield; @@ -80,9 +85,6 @@ mod store; mod indexer; mod common; mod error; - -pub use error::{Result, Error}; - mod analyzer; mod datastruct; From 6a002bcc762be67ab86ba0c6a898085dd9121752 Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Sat, 25 Feb 2017 21:20:55 +0900 Subject: [PATCH 043/107] NOBUGwq --- src/collector/mod.rs | 2 +- src/core/segment_reader.rs | 23 ++++++++--- src/error.rs | 3 ++ src/fastfield/reader.rs | 39 ++++++++++-------- src/indexer/merger.rs | 32 ++++++++++----- src/query/boolean_query/mod.rs | 18 +------- src/query/term_query/mod.rs | 64 +++++++++++++++++++++++++++++ src/query/term_query/term_scorer.rs | 14 +++++-- src/query/term_query/term_weight.rs | 7 ++-- 9 files changed, 145 insertions(+), 57 deletions(-) diff --git a/src/collector/mod.rs b/src/collector/mod.rs index ff856ad08..75c22aded 100644 --- a/src/collector/mod.rs +++ b/src/collector/mod.rs @@ -146,7 +146,7 @@ pub mod tests { impl Collector for FastFieldTestCollector { fn set_segment(&mut self, _: SegmentLocalId, reader: &SegmentReader) -> Result<()> { - self.ff_reader = Some(try!(reader.get_fast_field_reader(self.field))); + self.ff_reader = reader.get_fast_field_reader(self.field); Ok(()) } diff --git a/src/core/segment_reader.rs b/src/core/segment_reader.rs index f1713f931..5ff59af14 100644 --- a/src/core/segment_reader.rs +++ b/src/core/segment_reader.rs @@ -9,7 +9,6 @@ use store::StoreReader; use schema::Document; use directory::ReadOnlySource; use DocId; -use std::io; use std::str; use postings::TermInfo; use datastruct::FstMap; @@ -69,23 +68,35 @@ impl SegmentReader { self.segment_info.max_doc - self.num_deleted_docs() } + /// Return the number of documents that have been + /// deleted in the segment. pub fn num_deleted_docs(&self) -> DocId { self.delete_bitset.len() as DocId } /// Accessor to a segment's fast field reader given a field. 
- pub fn get_fast_field_reader(&self, field: Field) -> Result { + pub fn get_fast_field_reader(&self, field: Field) -> Option { + /// Returns the u32 fast value reader if the field + /// is a u32 field indexed as "fast". + /// + /// Return None if the field is not a u32 field + /// indexed with the fast option. + /// + /// # Panics + /// May panic if the index is corrupted. let field_entry = self.schema.get_field_entry(field); match field_entry.field_type() { &FieldType::Str(_) => { - Err(Error::InvalidArgument(format!("Field <{}> is not a fast field. It is a text field, and fast text fields are not supported yet.", field_entry.name()))) + warn!("Field <{}> is not a fast field. It is a text field, and fast text fields are not supported yet.", field_entry.name()); + None }, &FieldType::U32(ref u32_options) => { if u32_options.is_fast() { - Ok(self.fast_fields_reader.get_field(field)?) + self.fast_fields_reader.get_field(field) } else { - Err(Error::InvalidArgument(format!("Field <{}> is not defined as a fast field.", field_entry.name()))) + warn!("Field <{}> is not defined as a fast field.", field_entry.name()); + None } }, } @@ -98,7 +109,7 @@ impl SegmentReader { /// /// They are simply stored as a fast field, serialized in /// the `.fieldnorm` file of the segment. - pub fn get_fieldnorms_reader(&self, field: Field) -> io::Result { + pub fn get_fieldnorms_reader(&self, field: Field) -> Option { self.fieldnorms_reader.get_field(field) } diff --git a/src/error.rs b/src/error.rs index 2b405357f..8eae79513 100644 --- a/src/error.rs +++ b/src/error.rs @@ -36,6 +36,9 @@ pub enum Error { InvalidArgument(String), /// An Error happened in one of the thread ErrorInThread(String), + /// An Error appeared related to the lack of a field. 
+ SchemaError(String), + } impl From for Error { diff --git a/src/fastfield/reader.rs b/src/fastfield/reader.rs index dcf2f2d6a..1ceac8552 100644 --- a/src/fastfield/reader.rs +++ b/src/fastfield/reader.rs @@ -32,7 +32,7 @@ pub struct U32FastFieldReader { impl U32FastFieldReader { pub fn empty() -> U32FastFieldReader { - U32FastFieldReader::open(U32_FAST_FIELD_EMPTY.clone()).expect("should always work.") + U32FastFieldReader::open(U32_FAST_FIELD_EMPTY.clone()) } pub fn min_val(&self,) -> u32 { @@ -43,14 +43,18 @@ impl U32FastFieldReader { self.max_val } - pub fn open(data: ReadOnlySource) -> io::Result { + /// Opens a new fast field reader given a read only source. + /// + /// # Panics + /// Panics if the data is corrupted. + pub fn open(data: ReadOnlySource) -> U32FastFieldReader { let min_val; let amplitude; let max_val; { let mut cursor = data.as_slice(); - min_val = try!(u32::deserialize(&mut cursor)); - amplitude = try!(u32::deserialize(&mut cursor)); + min_val = u32::deserialize(&mut cursor).unwrap(); + amplitude = u32::deserialize(&mut cursor).unwrap(); max_val = min_val + amplitude; } let num_bits = compute_num_bits(amplitude); @@ -58,12 +62,12 @@ impl U32FastFieldReader { let data_arr = &(data.deref()[8..]); BitUnpacker::new(data_arr, num_bits as usize) }; - Ok(U32FastFieldReader { + U32FastFieldReader { _data: data, bit_unpacker: bit_unpacker, min_val: min_val, max_val: max_val, - }) + } } pub fn get(&self, doc: DocId) -> u32 { @@ -132,17 +136,20 @@ impl U32FastFieldsReader { }) } - pub fn get_field(&self, field: Field) -> io::Result { - match self.field_offsets.get(&field) { - Some(&(start, stop)) => { + /// Returns the u32 fast value reader if the field + /// is a u32 field indexed as "fast". + /// + /// Return None if the field is not a u32 field + /// indexed with the fast option. + /// + /// # Panics + /// May panic if the index is corrupted. 
+ pub fn get_field(&self, field: Field) -> Option { + self.field_offsets + .get(&field) + .map(|&(start, stop)| { let field_source = self.source.slice(start as usize, stop as usize); U32FastFieldReader::open(field_source) - } - None => { - Err(io::Error::new(io::ErrorKind::InvalidInput, "Could not find field")) - } - - } - + }) } } diff --git a/src/indexer/merger.rs b/src/indexer/merger.rs index fa8c94c99..6f47f0365 100644 --- a/src/indexer/merger.rs +++ b/src/indexer/merger.rs @@ -1,4 +1,4 @@ -use Result; +use {Error, Result}; use core::SegmentReader; use core::Segment; use DocId; @@ -20,6 +20,7 @@ use std::cmp::{min, max}; use std::iter; + pub struct IndexMerger { schema: Schema, readers: Vec, @@ -70,11 +71,11 @@ fn compute_min_max_val(u32_reader: &U32FastFieldReader, max_doc: DocId, delete_b } } -fn extract_fieldnorm_reader(segment_reader: &SegmentReader, field: Field) -> Result { - Ok(segment_reader.get_fieldnorms_reader(field)?) +fn extract_fieldnorm_reader(segment_reader: &SegmentReader, field: Field) -> Option { + segment_reader.get_fieldnorms_reader(field) } -fn extract_fast_field_reader(segment_reader: &SegmentReader, field: Field) -> Result { +fn extract_fast_field_reader(segment_reader: &SegmentReader, field: Field) -> Option { segment_reader.get_fast_field_reader(field) } @@ -123,7 +124,7 @@ impl IndexMerger { // used both to merge field norms and regular u32 fast fields. 
fn generic_write_fast_field(&self, fields: Vec, - field_reader_extractor: &Fn(&SegmentReader, Field) -> Result, + field_reader_extractor: &Fn(&SegmentReader, Field) -> Option, fast_field_serializer: &mut FastFieldSerializer) -> Result<()> { for field in fields { @@ -133,13 +134,22 @@ impl IndexMerger { let mut max_val = u32::min_value(); for reader in &self.readers { - let u32_reader = field_reader_extractor(reader, field)?; - if let Some((seg_min_val, seg_max_val)) = compute_min_max_val(&u32_reader, reader.max_doc(), reader.delete_bitset()) { - // the segment has some non-deleted documents - min_val = min(min_val, seg_min_val); - max_val = max(max_val, seg_max_val); - u32_readers.push((reader.max_doc(), u32_reader, reader.delete_bitset())); + match field_reader_extractor(reader, field) { + Some(u32_reader) => { + if let Some((seg_min_val, seg_max_val)) = compute_min_max_val(&u32_reader, reader.max_doc(), reader.delete_bitset()) { + // the segment has some non-deleted documents + min_val = min(min_val, seg_min_val); + max_val = max(max_val, seg_max_val); + u32_readers.push((reader.max_doc(), u32_reader, reader.delete_bitset())); + } + } + None => { + let error_msg = format!("Failed to find a u32_reader for field {:?}", field); + error!("{}", error_msg); + return Err(Error::SchemaError(error_msg)) + } } + } if u32_readers.is_empty() { diff --git a/src/query/boolean_query/mod.rs b/src/query/boolean_query/mod.rs index a38d72f81..c17ea2303 100644 --- a/src/query/boolean_query/mod.rs +++ b/src/query/boolean_query/mod.rs @@ -116,7 +116,7 @@ mod tests { let left = VecPostings::from(vec!(1, 2, 3)); let left_scorer = TermScorer { idf: 1f32, - fieldnorm_reader: left_fieldnorms, + fieldnorm_reader_opt: Some(left_fieldnorms), postings: left, }; @@ -125,7 +125,7 @@ mod tests { let right_scorer = TermScorer { idf: 4f32, - fieldnorm_reader: right_fieldnorms, + fieldnorm_reader_opt: Some(right_fieldnorms), postings: right, }; @@ -141,19 +141,5 @@ mod tests { } - #[test] - pub fn 
test_term_scorer() { - let left_fieldnorms = U32FastFieldReader::from(vec!(10, 4)); - assert_eq!(left_fieldnorms.get(0), 10); - assert_eq!(left_fieldnorms.get(1), 4); - let left = VecPostings::from(vec!(1)); - let mut left_scorer = TermScorer { - idf: 0.30685282, - fieldnorm_reader: left_fieldnorms, - postings: left, - }; - left_scorer.advance(); - assert!(abs_diff(left_scorer.score(), 0.15342641) < 0.001f32); - } } diff --git a/src/query/term_query/mod.rs b/src/query/term_query/mod.rs index e8be286c1..747fbe62e 100644 --- a/src/query/term_query/mod.rs +++ b/src/query/term_query/mod.rs @@ -5,3 +5,67 @@ mod term_scorer; pub use self::term_query::TermQuery; pub use self::term_weight::TermWeight; pub use self::term_scorer::TermScorer; + + +#[cfg(test)] +mod tests { + + use postings::{DocSet, VecPostings}; + use query::Scorer; + use query::term_query::TermScorer; + use query::Query; + use fastfield::U32FastFieldReader; + use query::TermQuery; + use Index; + use schema::*; + use postings::SegmentPostingsOption; + + fn abs_diff(left: f32, right: f32) -> f32 { + (right - left).abs() + } + + + #[test] + pub fn test_term_query_no_freq() { + let mut schema_builder = SchemaBuilder::default(); + let text_field = schema_builder.add_text_field("text", STRING); + let schema = schema_builder.build(); + let index = Index::create_from_tempdir(schema).unwrap(); + { + // writing the segment + let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap(); + { + let doc = doc!(text_field => "a"); + index_writer.add_document(doc).unwrap(); + } + assert!(index_writer.commit().is_ok()); + } + + index.load_searchers().unwrap(); + let searcher = index.searcher(); + let term_query = TermQuery::new(Term::from_field_text(text_field, "a"), SegmentPostingsOption::NoFreq); + let term_weight = term_query.weight(&searcher).unwrap(); + let segment_reader = searcher.segment_reader(0); + let mut term_scorer = term_weight.scorer(segment_reader).unwrap(); + assert!(term_scorer.advance()); 
+ assert_eq!(term_scorer.doc(), 0); + assert_eq!(term_scorer.score(), 0.30685282); + } + + + #[test] + pub fn test_term_scorer() { + let left_fieldnorms = U32FastFieldReader::from(vec!(10, 4)); + assert_eq!(left_fieldnorms.get(0), 10); + assert_eq!(left_fieldnorms.get(1), 4); + let left = VecPostings::from(vec!(1)); + let mut left_scorer = TermScorer { + idf: 0.30685282, + fieldnorm_reader_opt: Some(left_fieldnorms), + postings: left, + }; + left_scorer.advance(); + assert!(abs_diff(left_scorer.score(), 0.15342641) < 0.001f32); + } + +} \ No newline at end of file diff --git a/src/query/term_query/term_scorer.rs b/src/query/term_query/term_scorer.rs index c12b24174..81b683c99 100644 --- a/src/query/term_query/term_scorer.rs +++ b/src/query/term_query/term_scorer.rs @@ -7,7 +7,7 @@ use postings::Postings; pub struct TermScorer where TPostings: Postings { pub idf: Score, - pub fieldnorm_reader: U32FastFieldReader, + pub fieldnorm_reader_opt: Option, pub postings: TPostings, } @@ -30,8 +30,16 @@ impl DocSet for TermScorer where TPostings: Postings { impl Scorer for TermScorer where TPostings: Postings { fn score(&self,) -> Score { let doc = self.postings.doc(); - let field_norm = self.fieldnorm_reader.get(doc); - self.idf * (self.postings.term_freq() as f32 / field_norm as f32).sqrt() + let tf = match self.fieldnorm_reader_opt { + Some(ref fieldnorm_reader) => { + let field_norm = fieldnorm_reader.get(doc); + (self.postings.term_freq() as f32 / field_norm as f32) + } + None => { + self.postings.term_freq() as f32 + } + }; + self.idf * tf.sqrt() } } diff --git a/src/query/term_query/term_weight.rs b/src/query/term_query/term_weight.rs index 9d7bac3ee..48a000755 100644 --- a/src/query/term_query/term_weight.rs +++ b/src/query/term_query/term_weight.rs @@ -4,7 +4,6 @@ use core::SegmentReader; use query::Scorer; use postings::SegmentPostingsOption; use postings::SegmentPostings; -use fastfield::U32FastFieldReader; use super::term_scorer::TermScorer; use Result; @@ -33,21 
+32,21 @@ impl TermWeight { pub fn specialized_scorer<'a>(&'a self, reader: &'a SegmentReader) -> Result>> { let field = self.term.field(); - let fieldnorm_reader = try!(reader.get_fieldnorms_reader(field)); + let fieldnorm_reader_opt = reader.get_fieldnorms_reader(field); Ok( reader .read_postings(&self.term, self.segment_postings_options) .map(|segment_postings| TermScorer { idf: self.idf(), - fieldnorm_reader: fieldnorm_reader, + fieldnorm_reader_opt: fieldnorm_reader_opt, postings: segment_postings, } ) .unwrap_or( TermScorer { idf: 1f32, - fieldnorm_reader: U32FastFieldReader::empty(), + fieldnorm_reader_opt: None, postings: SegmentPostings::empty() }) ) From 597dac9cb6e3fdd5af4c20829671dd8917b66cf5 Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Sat, 25 Feb 2017 23:39:02 +0900 Subject: [PATCH 044/107] NOBUG Adding doc. --- src/core/index.rs | 3 ++- src/core/segment.rs | 11 +++++++---- src/core/segment_meta.rs | 2 +- src/core/segment_reader.rs | 5 +++++ src/datastruct/stacker/hashmap.rs | 4 ---- src/directory/directory.rs | 8 ++++++-- src/directory/mmap_directory.rs | 16 +++++++++++++--- src/indexer/index_writer.rs | 14 +++++++++++--- src/lib.rs | 4 ++-- 9 files changed, 47 insertions(+), 20 deletions(-) diff --git a/src/core/index.rs b/src/core/index.rs index e51fcb006..556e8cd37 100644 --- a/src/core/index.rs +++ b/src/core/index.rs @@ -207,7 +207,8 @@ impl Index { pub fn delete_segment(&self, segment_id: SegmentId) { delete_segment(self.directory(), segment_id); } - + + /// Creates a segment object given an index and a segment_meta. pub fn segment(&self, segment_meta: SegmentMeta) -> Segment { create_segment(self.clone(), segment_meta) } diff --git a/src/core/segment.rs b/src/core/segment.rs index dcf5ec116..6fd81fb78 100644 --- a/src/core/segment.rs +++ b/src/core/segment.rs @@ -41,13 +41,16 @@ impl Segment { pub fn schema(&self,) -> Schema { self.index.schema() } - - pub fn meta(&self,) -> &SegmentMeta { + + /// Returns the segment meta-information. 
+ /// See [SegmentMeta](SegmentMeta.html). + pub fn meta(&self) -> &SegmentMeta { &self.meta } - pub fn meta_mut(&mut self,) -> &mut SegmentMeta { - &mut self.meta + /// Set the delete meta data + pub fn set_delete_meta(&mut self, num_deleted_docs: u32, opstamp: u64) { + self.meta.set_delete_meta(num_deleted_docs, opstamp); } /// Returns the segment's id. diff --git a/src/core/segment_meta.rs b/src/core/segment_meta.rs index 40142c1a4..4ecfc21ea 100644 --- a/src/core/segment_meta.rs +++ b/src/core/segment_meta.rs @@ -45,7 +45,7 @@ impl SegmentMeta { self.num_docs = num_docs; } - pub fn set_deletes(&mut self, num_deleted_docs: u32, opstamp: u64) { + pub fn set_delete_meta(&mut self, num_deleted_docs: u32, opstamp: u64) { self.deletes = Some(DeleteMeta { num_deleted_docs: num_deleted_docs, opstamp: opstamp, diff --git a/src/core/segment_reader.rs b/src/core/segment_reader.rs index 5ff59af14..43aa6b50e 100644 --- a/src/core/segment_reader.rs +++ b/src/core/segment_reader.rs @@ -279,10 +279,15 @@ impl SegmentReader { self.segment_id } + /// Returns the bitset representing + /// the documents that have been deleted. pub fn delete_bitset(&self) -> &DeleteBitSet { &self.delete_bitset } + + /// Returns true iff the `doc` is marked + /// as deleted. 
pub fn is_deleted(&self, doc: DocId) -> bool { self.delete_bitset.is_deleted(doc) } diff --git a/src/datastruct/stacker/hashmap.rs b/src/datastruct/stacker/hashmap.rs index c70c879fc..55a6dc12c 100644 --- a/src/datastruct/stacker/hashmap.rs +++ b/src/datastruct/stacker/hashmap.rs @@ -125,10 +125,6 @@ impl<'a, V> HashMap<'a, V> where V: HeapAllocable { .map(move |addr: u32| heap.get_mut_ref::(addr)) } - pub fn heap(&self) -> &Heap { - &self.heap - } - pub fn get_or_create>(&mut self, key: S) -> &mut V { let entry = self.lookup(key.as_ref()); match entry { diff --git a/src/directory/directory.rs b/src/directory/directory.rs index 320cb9f51..b3ef71016 100644 --- a/src/directory/directory.rs +++ b/src/directory/directory.rs @@ -27,8 +27,6 @@ pub trait Directory: fmt::Debug + Send + Sync + 'static { /// Specifically, subsequent writes or flushes should /// have no effect on the returned `ReadOnlySource` object. fn open_read(&self, path: &Path) -> result::Result; - - fn atomic_read(&self, path: &Path) -> Result, FileError>; /// Removes a file /// @@ -63,6 +61,12 @@ pub trait Directory: fmt::Debug + Send + Sync + 'static { /// The file may not previously exist. fn open_write(&mut self, path: &Path) -> Result; + /// Reads the full content file that has been written using + /// atomic_write. + /// + /// This should only be used for small files. + fn atomic_read(&self, path: &Path) -> Result, FileError>; + /// Atomically replace the content of a file with data. 
/// /// This calls ensure that reads can never *observe* diff --git a/src/directory/mmap_directory.rs b/src/directory/mmap_directory.rs index 85609c191..3b18144b3 100644 --- a/src/directory/mmap_directory.rs +++ b/src/directory/mmap_directory.rs @@ -46,9 +46,14 @@ fn open_mmap(full_path: &PathBuf) -> result::Result>, FileError #[derive(Default,Clone,Debug,RustcDecodable,RustcEncodable)] pub struct CacheCounters { - hit: usize, - miss_empty: usize, - miss_weak: usize, + // Number of time the cache prevents to call `mmap` + pub hit: usize, + // Number of time tantivy had to call `mmap` + // as no entry was in the cache. + pub miss_empty: usize, + // Number of time tantivy had to call `mmap` + // as the entry in the cache was evinced. + pub miss_weak: usize, } #[derive(Clone,Debug,RustcDecodable,RustcEncodable)] @@ -210,6 +215,11 @@ impl MmapDirectory { Ok(()) } + /// Returns some statistical information + /// about the Mmap cache. + /// + /// The `MmapDirectory` embeds a `MmapDirectory` + /// to avoid multiplying the `mmap` system calls. pub fn get_cache_info(&mut self) -> CacheInfo { self.mmap_cache .write() diff --git a/src/indexer/index_writer.rs b/src/indexer/index_writer.rs index 92c19e619..7b9633ea3 100644 --- a/src/indexer/index_writer.rs +++ b/src/indexer/index_writer.rs @@ -183,14 +183,14 @@ pub fn advance_deletes( } } - if let Some(last_opstamp) = last_opstamp_opt { + if let Some(last_opstamp) = last_opstamp_opt { for doc in 0u32..segment_reader.max_doc() { if segment_reader.is_deleted(doc) { delete_bitset.insert(doc as usize); } } let num_deleted_docs = delete_bitset.len(); - segment.meta_mut().set_deletes(num_deleted_docs as u32, last_opstamp); + segment.set_delete_meta(num_deleted_docs as u32, last_opstamp); let mut delete_file = segment.open_write(SegmentComponent::DELETE)?; write_delete_bitset(&delete_bitset, &mut delete_file)?; } @@ -317,6 +317,7 @@ impl IndexWriter { Ok(()) } + /// Accessor to the merge policy. 
pub fn get_merge_policy(&self) -> Box { self.segment_updater.get_merge_policy() } @@ -464,7 +465,14 @@ impl IndexWriter { Ok(self.committed_opstamp) } - + /// Delete all documents containing a given term. + /// + /// Delete operation only affects documents that + /// were added in previous commits, and documents + /// that were added previously in the same commit. + /// + /// Like adds, the deletion itself will be visible + /// only after calling `commit()`. pub fn delete_term(&mut self, term: Term) { let opstamp = self.stamp(); let delete_operation = DeleteOperation { diff --git a/src/lib.rs b/src/lib.rs index 5c8ca7ece..e9febbbd8 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -89,6 +89,7 @@ mod analyzer; mod datastruct; + /// Query module pub mod query; /// Directory module @@ -107,8 +108,7 @@ pub use core::searcher::Searcher; pub use core::Segment; pub use core::Index; pub use indexer::IndexWriter; -pub use schema::Term; -pub use schema::Document; +pub use schema::{Term, Document}; pub use core::SegmentReader; pub use self::common::TimerTree; From a7f10f055d1880006317f843af86daeeee76df88 Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Sun, 26 Feb 2017 00:11:32 +0900 Subject: [PATCH 045/107] Nobug hidding doc, filling doc --- src/core/index.rs | 5 +++-- src/core/mod.rs | 7 ++++--- src/core/segment.rs | 5 ++--- src/core/segment_id.rs | 8 ++++++++ src/core/segment_meta.rs | 4 ++++ src/lib.rs | 5 +---- 6 files changed, 22 insertions(+), 12 deletions(-) diff --git a/src/core/index.rs b/src/core/index.rs index 556e8cd37..100714c89 100644 --- a/src/core/index.rs +++ b/src/core/index.rs @@ -45,13 +45,14 @@ pub struct Index { -/// Deletes all of the document of the segment. +/// Deletes all of the files of the segment. /// This is called when there is a merge or a rollback. /// /// # Disclaimer /// If deletion of a file fails (e.g. a file /// was read-only.), the method does not /// fail and just logs an error when it fails. 
+#[doc(hidden)] pub fn delete_segment(directory: &Directory, segment_id: SegmentId) { info!("Deleting segment {:?}", segment_id); let segment_filepaths_res = directory.ls_starting_with( @@ -208,7 +209,7 @@ impl Index { delete_segment(self.directory(), segment_id); } - /// Creates a segment object given an index and a segment_meta. + #[doc(hidden)] pub fn segment(&self, segment_meta: SegmentMeta) -> Segment { create_segment(self.clone(), segment_meta) } diff --git a/src/core/mod.rs b/src/core/mod.rs index 3111cfadc..d6238c48c 100644 --- a/src/core/mod.rs +++ b/src/core/mod.rs @@ -1,5 +1,4 @@ pub mod searcher; - pub mod index; mod segment_reader; mod segment_id; @@ -10,8 +9,7 @@ mod pool; mod segment_meta; mod term_iterator; -use std::path::PathBuf; - +pub use self::searcher::Searcher; pub use self::segment_component::SegmentComponent; pub use self::segment_id::SegmentId; pub use self::segment_reader::SegmentReader; @@ -23,6 +21,9 @@ pub use self::segment_meta::SegmentMeta; pub use self::index_meta::IndexMeta; pub use self::term_iterator::TermIterator; + +use std::path::PathBuf; + lazy_static! { pub static ref META_FILEPATH: PathBuf = PathBuf::from("meta.json"); } \ No newline at end of file diff --git a/src/core/segment.rs b/src/core/segment.rs index 6fd81fb78..22f157420 100644 --- a/src/core/segment.rs +++ b/src/core/segment.rs @@ -42,13 +42,12 @@ impl Segment { self.index.schema() } - /// Returns the segment meta-information. - /// See [SegmentMeta](SegmentMeta.html). 
+ /// Returns the segment meta-information pub fn meta(&self) -> &SegmentMeta { &self.meta } - /// Set the delete meta data + #[doc(hidden)] pub fn set_delete_meta(&mut self, num_deleted_docs: u32, opstamp: u64) { self.meta.set_delete_meta(num_deleted_docs, opstamp); } diff --git a/src/core/segment_id.rs b/src/core/segment_id.rs index db8a3d822..92920c6cb 100644 --- a/src/core/segment_id.rs +++ b/src/core/segment_id.rs @@ -6,6 +6,14 @@ use std::cmp::{Ordering, Ord}; #[cfg(test)] use std::sync::atomic; +/// Tantivy SegmentId. +/// +/// Tantivy's segment are identified +/// by a UUID which is used to prefix the filenames +/// of all of the file associated with the segment. +/// +/// In unit test, for reproducability, the SegmentId are +/// simply generated in an autoincrement fashion. #[derive(Clone, Copy, PartialEq, Eq, Hash)] pub struct SegmentId(Uuid); diff --git a/src/core/segment_meta.rs b/src/core/segment_meta.rs index 4ecfc21ea..ee2c9593b 100644 --- a/src/core/segment_meta.rs +++ b/src/core/segment_meta.rs @@ -7,6 +7,10 @@ struct DeleteMeta { opstamp: u64, } +/// SegmentMeta contains simple meta information about a segment. +/// +/// For instance the number of docs it contains, +/// how many are deleted, etc. #[derive(Clone, Debug, RustcDecodable,RustcEncodable)] pub struct SegmentMeta { segment_id: SegmentId, diff --git a/src/lib.rs b/src/lib.rs index e9febbbd8..61285583a 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -103,10 +103,7 @@ pub mod schema; pub use directory::Directory; -pub use core::searcher::Searcher; - -pub use core::Segment; -pub use core::Index; +pub use core::{Index, Segment, SegmentId, SegmentMeta, Searcher}; pub use indexer::IndexWriter; pub use schema::{Term, Document}; pub use core::SegmentReader; From 8bcfdb8e80060c55086b2ea8e8c8ae3a916a8dbc Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Sun, 26 Feb 2017 21:35:18 +0900 Subject: [PATCH 046/107] NOBUG misc ... 
--- src/core/segment_id.rs | 10 +++++++++ src/indexer/segment_writer.rs | 19 ------------------ src/schema/document.rs | 15 ++++++-------- src/schema/mod.rs | 2 ++ src/schema/text_options.rs | 2 +- src/schema/u32_options.rs | 38 ++++++++++++++++++++++++++++++++++- 6 files changed, 56 insertions(+), 30 deletions(-) diff --git a/src/core/segment_id.rs b/src/core/segment_id.rs index 92920c6cb..6515ab423 100644 --- a/src/core/segment_id.rs +++ b/src/core/segment_id.rs @@ -42,14 +42,24 @@ fn create_uuid() -> Uuid { } impl SegmentId { + #[doc(hidden)] pub fn generate_random() -> SegmentId { SegmentId(create_uuid()) } + + /// Returns a shorter identifier of the segment. + /// + /// We are using UUID4, so only 6 bits are fixed, + /// and the rest is random. + /// + /// Picking the first 8 chars is ok to identify + /// segments in a display message. pub fn short_uuid_string(&self,) -> String { (&self.0.simple().to_string()[..8]).to_string() } + /// Returns a segment uuid string. pub fn uuid_string(&self,) -> String { self.0.simple().to_string() } diff --git a/src/indexer/segment_writer.rs b/src/indexer/segment_writer.rs index feae0e5e5..bd324bd97 100644 --- a/src/indexer/segment_writer.rs +++ b/src/indexer/segment_writer.rs @@ -130,25 +130,6 @@ impl<'a> SegmentWriter<'a> { pub fn is_buffer_full(&self,) -> bool { self.heap.num_free_bytes() <= MARGIN_IN_BYTES } - - // pub fn compute_doc_mapping_after_delete(&self, mut delete_queue_cursor: DeleteQueueCursor) -> Vec> { - // let delete_docs = self.compute_delete_mask(&mut delete_queue_cursor); - // let max_doc: usize = self.max_doc as usize; - // let mut doc_autoinc = 0u32; - // (0..max_doc) - // .map(|doc| { - // if delete_docs.contains(doc) { - // None - // } - // else { - // let new_doc = doc_autoinc; - // doc_autoinc += 1; - // Some(new_doc) - // } - // }) - // .collect::>() - // } - /// Indexes a new document /// diff --git a/src/schema/document.rs b/src/schema/document.rs index 671237155..87d0b46f7 100644 --- 
a/src/schema/document.rs +++ b/src/schema/document.rs @@ -11,7 +11,7 @@ use itertools::Itertools; /// Documents are really just a list of couple `(field, value)`. /// In this list, one field may appear more than once. -#[derive(Debug, RustcEncodable, RustcDecodable)] +#[derive(Debug, RustcEncodable, RustcDecodable, Default)] pub struct Document { field_values: Vec, } @@ -31,6 +31,11 @@ impl Eq for Document {} impl Document { + /// Creates a new, empty document object + pub fn new() -> Document { + Document::default() + } + /// Returns the number of `(field, value)` pairs. pub fn len(&self,) -> usize { self.field_values.len() @@ -97,14 +102,6 @@ impl Document { } } -impl Default for Document { - - fn default() -> Document { - Document { - field_values: Vec::new(), - } - } -} impl From> for Document { fn from(field_values: Vec) -> Document { diff --git a/src/schema/mod.rs b/src/schema/mod.rs index 224d9f47d..b1802d1e5 100644 --- a/src/schema/mod.rs +++ b/src/schema/mod.rs @@ -131,6 +131,8 @@ pub use self::text_options::STORED; pub use self::u32_options::U32Options; pub use self::u32_options::FAST; +pub use self::u32_options::U32_INDEXED; +pub use self::u32_options::U32_STORED; use regex::Regex; diff --git a/src/schema/text_options.rs b/src/schema/text_options.rs index 91000de81..c718e2a87 100644 --- a/src/schema/text_options.rs +++ b/src/schema/text_options.rs @@ -206,7 +206,7 @@ impl BitOr for TextOptions { fn bitor(self, other: TextOptions) -> TextOptions { let mut res = TextOptions::default(); res.indexing = self.indexing | other.indexing; - res.stored = self.stored || other.stored; + res.stored = self.stored | other.stored; res } } diff --git a/src/schema/u32_options.rs b/src/schema/u32_options.rs index 39be904ed..5f29f63b5 100644 --- a/src/schema/u32_options.rs +++ b/src/schema/u32_options.rs @@ -1,3 +1,5 @@ +use std::ops::BitOr; + /// Define how a U32 field should be handled by tantivy. 
#[derive(Clone,Debug,PartialEq,Eq, RustcDecodable, RustcEncodable)] pub struct U32Options { @@ -65,9 +67,43 @@ impl Default for U32Options { } -/// Shortcut for +/// Shortcut for a u32 fast field. +/// +/// Such a shortcut can be composed as follows `STORED | FAST | U32_INDEXED` pub const FAST: U32Options = U32Options { indexed: false, stored: false, fast: true, }; + +/// Shortcut for a u32 indexed field. +/// +/// Such a shortcut can be composed as follows `STORED | FAST | U32_INDEXED` +pub const U32_INDEXED: U32Options = U32Options { + indexed: true, + stored: false, + fast: false, +}; + +/// Shortcut for a u32 stored field. +/// +/// Such a shortcut can be composed as follows `STORED | FAST | U32_INDEXED` +pub const U32_STORED: U32Options = U32Options { + indexed: false, + stored: true, + fast: false, +}; + + +impl BitOr for U32Options { + + type Output = U32Options; + + fn bitor(self, other: U32Options) -> U32Options { + let mut res = U32Options::default(); + res.indexed = self.indexed | other.indexed; + res.stored = self.stored | other.stored; + res.fast = self.fast | other.fast; + res + } +} \ No newline at end of file From 7a07144c68d734c233550060502fe467c3c8b4eb Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Mon, 27 Feb 2017 01:42:25 +0900 Subject: [PATCH 047/107] Bugfix related with deletes, rollback and the index opstamp. 
--- src/core/index.rs | 6 +-- src/core/segment_id.rs | 2 +- src/functional_test.rs | 61 ++++++++++++++++++++++++++++ src/indexer/index_writer.rs | 50 ++++++++++++++++------- src/indexer/segment_entry.rs | 4 ++ src/indexer/segment_manager.rs | 50 ++++++++++++----------- src/indexer/segment_register.rs | 8 +--- src/indexer/segment_updater.rs | 14 +++---- src/lib.rs | 71 ++++++++++++++++++++++++++++++--- 9 files changed, 201 insertions(+), 65 deletions(-) create mode 100644 src/functional_test.rs diff --git a/src/core/index.rs b/src/core/index.rs index 100714c89..afce57a21 100644 --- a/src/core/index.rs +++ b/src/core/index.rs @@ -39,7 +39,6 @@ pub struct Index { directory: Box, schema: Schema, searcher_pool: Arc>, - opstamp: u64, } @@ -117,12 +116,10 @@ impl Index { /// Creates a new index given a directory and an `IndexMeta`. fn create_from_metas(directory: Box, metas: IndexMeta) -> Result { let schema = metas.schema.clone(); - let opstamp = metas.opstamp; let index = Index { directory: directory, schema: schema, searcher_pool: Arc::new(Pool::new()), - opstamp: opstamp, }; try!(index.load_searchers()); Ok(index) @@ -146,7 +143,7 @@ impl Index { /// The opstamp is the number of documents that have been added /// from the beginning of time, and until the moment of the last commit. pub fn opstamp(&self) -> u64 { - self.opstamp + load_metas(self.directory()).unwrap().opstamp } /// Open a new index writer. Attempts to acquire a lockfile. 
@@ -294,7 +291,6 @@ impl Clone for Index { directory: self.directory.box_clone(), schema: self.schema.clone(), searcher_pool: self.searcher_pool.clone(), - opstamp: self.opstamp, } } } diff --git a/src/core/segment_id.rs b/src/core/segment_id.rs index 6515ab423..9e3a75d3d 100644 --- a/src/core/segment_id.rs +++ b/src/core/segment_id.rs @@ -79,7 +79,7 @@ impl Decodable for SegmentId { impl fmt::Debug for SegmentId { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - write!(f, "SegmentId({:?})", self.uuid_string()) + write!(f, "Seg({:?})", self.short_uuid_string()) } } diff --git a/src/functional_test.rs b/src/functional_test.rs new file mode 100644 index 000000000..dd713d896 --- /dev/null +++ b/src/functional_test.rs @@ -0,0 +1,61 @@ +use std::collections::HashSet; +use rand::{thread_rng, Rng}; + +use schema::*; +use Index; +use Searcher; +use rand::distributions::{IndependentSample, Range}; + +fn check_index_content(searcher: &Searcher, vals: &HashSet) { + assert!(searcher.segment_readers().len() < 20); + assert_eq!(searcher.num_docs() as usize, vals.len()); +} + +#[test] +fn test_indexing() { + + let mut schema_builder = SchemaBuilder::default(); + + let id_field = schema_builder.add_u32_field("id", U32_INDEXED); + let multiples_field = schema_builder.add_u32_field("multiples", U32_INDEXED); + let schema = schema_builder.build(); + + let index = Index::create_from_tempdir(schema).unwrap(); + + let universe = Range::new(0u32, 20u32); + let mut rng = thread_rng(); + + let mut index_writer = index.writer_with_num_threads(3, 120_000_000).unwrap(); + + let mut committed_docs: HashSet = HashSet::new(); + let mut uncommitted_docs: HashSet = HashSet::new(); + + for n in 0..200 { + let random_val = universe.ind_sample(&mut rng); + if random_val == 0 { + index_writer.commit(); + committed_docs.extend(&uncommitted_docs); + uncommitted_docs.clear(); + index.load_searchers().unwrap(); + let searcher = index.searcher(); + // check that everything is correct. 
+ check_index_content(&searcher, &committed_docs); + } + else { + if committed_docs.remove(&random_val) || + uncommitted_docs.remove(&random_val) { + let doc_id_term = Term::from_field_u32(id_field, random_val); + index_writer.delete_term(doc_id_term); + } + else { + uncommitted_docs.insert(random_val); + let mut doc = Document::new(); + doc.add_u32(id_field, random_val); + for i in 1u32..10u32 { + doc.add_u32(multiples_field, random_val * i); + } + index_writer.add_document(doc); + } + } + } +} diff --git a/src/indexer/index_writer.rs b/src/indexer/index_writer.rs index 7b9633ea3..f2cc58df1 100644 --- a/src/indexer/index_writer.rs +++ b/src/indexer/index_writer.rs @@ -160,12 +160,25 @@ pub fn advance_deletes( segment: &mut Segment, delete_operations: &DeleteQueueSnapshot, doc_opstamps: &DocToOpstampMapping) -> Result { + + let segment_reader = SegmentReader::open(segment.clone())?; + let mut delete_bitset = BitSet::with_capacity(segment_reader.max_doc() as usize); let mut last_opstamp_opt: Option = None; + let previous_delete_opstamp_opt = segment.meta().delete_opstamp(); + for delete_op in delete_operations.iter() { + + // let's skip operations that have already been deleted.0u32 + if let Some(previous_delete_opstamp) = previous_delete_opstamp_opt { + if delete_op.opstamp <= previous_delete_opstamp { + continue; + } + } + // A delete operation should only affect // document that were inserted after it. 
// @@ -179,11 +192,11 @@ pub fn advance_deletes( delete_bitset.insert(deleted_doc as usize); } } - last_opstamp_opt = Some(delete_op.opstamp); } + last_opstamp_opt = Some(delete_op.opstamp); } - if let Some(last_opstamp) = last_opstamp_opt { + if let Some(last_opstamp) = last_opstamp_opt { for doc in 0u32..segment_reader.max_doc() { if segment_reader.is_deleted(doc) { delete_bitset.insert(doc as usize); @@ -194,6 +207,7 @@ pub fn advance_deletes( let mut delete_file = segment.open_write(SegmentComponent::DELETE)?; write_delete_bitset(&delete_bitset, &mut delete_file)?; } + Ok(segment.meta().clone()) } @@ -365,6 +379,8 @@ impl IndexWriter { /// The opstamp at the last commit is returned. pub fn rollback(&mut self) -> Result { + info!("Rolling back to opstamp {}", self.committed_opstamp); + // by updating the generation in the segment updater, // pending add segment commands will be dismissed. self.generation += 1; @@ -428,6 +444,19 @@ impl IndexWriter { /// pub fn commit(&mut self) -> Result { + // here, because we join all of the worker threads, + // all of the segment updates for this commit have been + // sent. + // + // No document belonging to the next generation has been + // pushed either, because add_document can only happen + // on this thread. + + // This will move uncommitted segments to the state of + // committed segments. + self.committed_opstamp = self.stamp(); + info!("committing {}", self.committed_opstamp); + // this will drop the current document channel // and recreate a new one channels. self.recreate_document_channel(); @@ -444,17 +473,7 @@ impl IndexWriter { try!(self.add_indexing_worker()); } - // here, because we join all of the worker threads, - // all of the segment update for this commit have been - // sent. - // - // No document belonging to the next generation have been - // pushed too, because add_document can only happen - // on this thread. - - // This will move uncommitted segments to the state of - // committed segments. 
- self.committed_opstamp = self.stamp(); + // wait for the segment update thread to have processed the info self.segment_updater @@ -473,13 +492,14 @@ impl IndexWriter { /// /// Like adds, the deletion itself will be visible /// only after calling `commit()`. - pub fn delete_term(&mut self, term: Term) { + pub fn delete_term(&mut self, term: Term) -> u64 { let opstamp = self.stamp(); let delete_operation = DeleteOperation { opstamp: opstamp, term: term, }; self.delete_queue.push(delete_operation); + opstamp } fn stamp(&mut self) -> u64 { @@ -498,6 +518,8 @@ impl IndexWriter { /// /// Currently it represents the number of documents that /// have been added since the creation of the index. + + // TODO remove return without Result<> pub fn add_document(&mut self, document: Document) -> io::Result { let opstamp = self.stamp(); let add_operation = AddOperation { diff --git a/src/indexer/segment_entry.rs b/src/indexer/segment_entry.rs index c8a917665..0c12e72ba 100644 --- a/src/indexer/segment_entry.rs +++ b/src/indexer/segment_entry.rs @@ -43,6 +43,10 @@ impl SegmentEntry { self.state } + pub fn set_state(&mut self, state: SegmentState) { + self.state = state; + } + pub fn set_doc_to_opstamp(&mut self, doc_to_opstamp: DocToOpstampMapping) { self.doc_to_opstamp = doc_to_opstamp; } diff --git a/src/indexer/segment_manager.rs b/src/indexer/segment_manager.rs index 8d01dee1a..250830a84 100644 --- a/src/indexer/segment_manager.rs +++ b/src/indexer/segment_manager.rs @@ -2,23 +2,17 @@ use super::segment_register::SegmentRegister; use std::sync::RwLock; use core::SegmentMeta; use core::SegmentId; -use indexer::SegmentEntry; +use indexer::{SegmentEntry, SegmentState}; + use std::sync::{RwLockReadGuard, RwLockWriteGuard}; use std::fmt::{self, Debug, Formatter}; +#[derive(Default)] struct SegmentRegisters { uncommitted: SegmentRegister, committed: SegmentRegister, } -impl Default for SegmentRegisters { - fn default() -> SegmentRegisters { - SegmentRegisters { - uncommitted: 
SegmentRegister::default(), - committed: SegmentRegister::default() - } - } -} /// The segment manager stores the list of segments @@ -26,6 +20,7 @@ impl Default for SegmentRegisters { /// /// It guarantees the atomicity of the /// changes (merges especially) +#[derive(Default)] pub struct SegmentManager { registers: RwLock, } @@ -71,7 +66,12 @@ impl SegmentManager { ); segment_entries } - + + pub fn segment_state(&self, segment_id: &SegmentId) -> Option { + self.segment_entry(segment_id) + .map(|segment_entry| segment_entry.state()) + } + pub fn segment_entry(&self, segment_id: &SegmentId) -> Option { let registers = self.read(); registers @@ -100,11 +100,22 @@ impl SegmentManager { segment_ids } - pub fn commit(&self, segment_entries: Vec) { + pub fn commit(&self, segment_metas: Vec) { + let committed_segment_entries = segment_metas + .into_iter() + .map(|segment_meta| { + let segment_id = segment_meta.id(); + let mut segment_entry = SegmentEntry::new(segment_meta); + if let Some(state) = self.segment_state(&segment_id) { + segment_entry.set_state(state); + } + segment_entry + }) + .collect::>(); let mut registers_lock = self.write(); registers_lock.committed.clear(); registers_lock.uncommitted.clear(); - for segment_entry in segment_entries { + for segment_entry in committed_segment_entries { registers_lock.committed.add_segment_entry(segment_entry); } } @@ -121,6 +132,9 @@ impl SegmentManager { registers_lock.committed.start_merge(segment_id); } } + else { + error!("Merge operation sent for segments that are not all uncommited or commited."); + } } pub fn add_segment(&self, segment_entry: SegmentEntry) { @@ -152,15 +166,3 @@ impl SegmentManager { registers_lock.committed.segment_metas() } } - - -impl Default for SegmentManager { - fn default() -> SegmentManager { - SegmentManager { - registers: RwLock::new( SegmentRegisters { - uncommitted: SegmentRegister::default(), - committed: SegmentRegister::default(), - }), - } - } -} \ No newline at end of file diff 
--git a/src/indexer/segment_register.rs b/src/indexer/segment_register.rs index 5f2216342..418d92ef7 100644 --- a/src/indexer/segment_register.rs +++ b/src/indexer/segment_register.rs @@ -15,6 +15,7 @@ use indexer::segment_entry::SegmentEntry; /// segments that are currently searchable, /// and by the index merger to identify /// merge candidates. +#[derive(Default)] pub struct SegmentRegister { segment_states: HashMap, } @@ -110,13 +111,6 @@ impl SegmentRegister { } } -impl Default for SegmentRegister { - fn default() -> SegmentRegister { - SegmentRegister { - segment_states: HashMap::new(), - } - } -} #[cfg(test)] mod tests { diff --git a/src/indexer/segment_updater.rs b/src/indexer/segment_updater.rs index f7f9ea868..768fe7ea0 100644 --- a/src/indexer/segment_updater.rs +++ b/src/indexer/segment_updater.rs @@ -105,8 +105,7 @@ struct InnerSegmentUpdater { impl SegmentUpdater { - pub fn new(index: Index, delete_queue: DeleteQueue) -> Result - { + pub fn new(index: Index, delete_queue: DeleteQueue) -> Result { let segments = index.segments()?; let segment_manager = SegmentManager::from_segments(segments); Ok( @@ -177,11 +176,7 @@ impl SegmentUpdater { pub fn commit(&self, opstamp: u64) -> impl Future { self.run_async(move |segment_updater| { let segment_metas = segment_updater.purge_deletes().expect("Failed purge deletes"); - let segment_entries = segment_metas - .into_iter() - .map(SegmentEntry::new) - .collect::>(); - segment_updater.0.segment_manager.commit(segment_entries); + segment_updater.0.segment_manager.commit(segment_metas); let mut directory = segment_updater.0.index.directory().box_clone(); save_metas( segment_updater.0.segment_manager.committed_segment_metas(), @@ -241,13 +236,15 @@ impl SegmentUpdater { .map(|segment_meta| index.segment(segment_meta)) .collect(); - // An IndexMerger is like a "view" of our merged segments. + // An IndexMerger is like a "view" of our merged segments. 
let merger: IndexMerger = IndexMerger::open(schema, &segments[..])?; let mut merged_segment = index.new_segment(); // ... we just serialize this index merger in our new segment // to merge the two segments. + let segment_serializer = SegmentSerializer::for_segment(&mut merged_segment).expect("Creating index serializer failed"); + let num_docs = merger.write(segment_serializer).expect("Serializing merged index failed"); let mut segment_meta = SegmentMeta::new(merged_segment.id()); segment_meta.set_num_docs(num_docs); @@ -257,7 +254,6 @@ impl SegmentUpdater { .end_merge(segment_metas.clone(), segment_entry.clone()) .wait() .unwrap(); - merging_future_send.complete(segment_entry.clone()); segment_updater_clone.0.merging_threads.write().unwrap().remove(&merging_thread_id); Ok(segment_entry) diff --git a/src/lib.rs b/src/lib.rs index 61285583a..a34a8e93b 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -55,6 +55,10 @@ extern crate libc; #[cfg(test)] extern crate test; #[cfg(test)] extern crate rand; + +#[cfg(test)] +mod functional_test; + #[macro_use] mod macros { macro_rules! 
get( @@ -185,8 +189,10 @@ mod tests { use Index; use core::SegmentReader; use query::BooleanQuery; + use postings::SegmentPostingsOption; use schema::*; use DocSet; + use IndexWriter; use Postings; #[test] @@ -290,7 +296,7 @@ mod tests { #[test] - fn test_delete_postings() { + fn test_delete_postings1() { let mut schema_builder = SchemaBuilder::default(); let text_field = schema_builder.add_text_field("text", TEXT); let schema = schema_builder.build(); @@ -392,10 +398,8 @@ mod tests { { index_writer.delete_term(Term::from_field_text(text_field, "c")); } - index_writer.rollback().unwrap(); - { - index_writer.delete_term(Term::from_field_text(text_field, "a")); - } + index_writer.rollback().unwrap(); + index_writer.delete_term(Term::from_field_text(text_field, "a")); index_writer.commit().unwrap(); } { @@ -425,6 +429,63 @@ mod tests { } + #[test] + fn test_indexed_u32() { + let mut schema_builder = SchemaBuilder::default(); + let field = schema_builder.add_u32_field("text", U32_INDEXED); + let schema = schema_builder.build(); + + let index = Index::create_in_ram(schema); + let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap(); + index_writer.add_document( + doc!(field=>1) + ); + index_writer.commit().unwrap(); + index.load_searchers().unwrap(); + let searcher = index.searcher(); + let term = Term::from_field_u32(field, 1u32); + let mut postings = searcher.segment_reader(0).read_postings(&term, SegmentPostingsOption::NoFreq).unwrap(); + assert!(postings.advance()); + assert_eq!(postings.doc(), 0); + assert!(!postings.advance()); + } + + #[test] + fn test_delete_postings2() { + let mut schema_builder = SchemaBuilder::default(); + let text_field = schema_builder.add_text_field("text", TEXT); + let schema = schema_builder.build(); + let index = Index::create_in_ram(schema); + + // writing the segment + let mut index_writer = index.writer_with_num_threads(2, 40_000_000).unwrap(); + + let add_document = |index_writer: &mut IndexWriter, val: &'static 
str| { + let doc = doc!(text_field=>val); + index_writer.add_document(doc); + }; + + let remove_document = |index_writer: &mut IndexWriter, val: &'static str| { + let delterm = Term::from_field_text(text_field, val); + index_writer.delete_term(delterm); + }; + + add_document(&mut index_writer, "63"); + add_document(&mut index_writer, "70"); + add_document(&mut index_writer, "34"); + add_document(&mut index_writer, "1"); + add_document(&mut index_writer, "38"); + add_document(&mut index_writer, "33"); + add_document(&mut index_writer, "40"); + add_document(&mut index_writer, "17"); + remove_document(&mut index_writer, "38"); + remove_document(&mut index_writer, "34"); + index_writer.commit().unwrap(); + index.load_searchers().unwrap(); + let searcher = index.searcher(); + assert_eq!(searcher.num_docs(), 6); + } + #[test] fn test_termfreq() { let mut schema_builder = SchemaBuilder::default(); From 15b60d72cccdd17c0375c2363039d9b1db2acb87 Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Mon, 27 Feb 2017 09:36:41 +0900 Subject: [PATCH 048/107] NOBUG add_document does not return result --- examples/simple_search.rs | 6 ++-- src/core/term_iterator.rs | 6 ++-- src/functional_test.rs | 4 +-- src/indexer/index_writer.rs | 21 +++++++------- src/indexer/merger.rs | 24 ++++++++-------- src/lib.rs | 50 +++++++++++++++++----------------- src/postings/mod.rs | 6 ++-- src/query/boolean_query/mod.rs | 10 +++---- src/query/phrase_query/mod.rs | 10 +++---- src/query/term_query/mod.rs | 2 +- 10 files changed, 70 insertions(+), 69 deletions(-) diff --git a/examples/simple_search.rs b/examples/simple_search.rs index cff539b9d..2f26ba1fb 100644 --- a/examples/simple_search.rs +++ b/examples/simple_search.rs @@ -95,7 +95,7 @@ fn run_example(index_path: &Path) -> tantivy::Result<()> { old_man_doc.add_text(body, "He was an old man who fished alone in a skiff in the Gulf Stream and he had gone eighty-four days now without taking a fish."); // ... and add it to the `IndexWriter`. 
- try!(index_writer.add_document(old_man_doc)); + index_writer.add_document(old_man_doc); // ### Create a document directly from json. // @@ -107,7 +107,7 @@ fn run_example(index_path: &Path) -> tantivy::Result<()> { "body": "few miles south of Soledad, the Salinas River drops in close to the hillside bank and runs deep and green. The water is warm too, for it has slipped twinkling over the yellow sands in the sunlight before reaching the narrow pool. On one side of the river the golden foothill slopes curve up to the strong and rocky Gabilan Mountains, but on the valley side the water is lined with trees—willows fresh and green with every spring, carrying in their lower leaf junctures the debris of the winter’s flooding; and sycamores with mottled, white,recumbent limbs and branches that arch over the pool" }"#)); - try!(index_writer.add_document(mice_and_men_doc)); + index_writer.add_document(mice_and_men_doc); // Multi-valued field are allowed, they are // expressed in JSON by an array. @@ -116,7 +116,7 @@ fn run_example(index_path: &Path) -> tantivy::Result<()> { "title": ["Frankenstein", "The Modern Promotheus"], "body": "You will rejoice to hear that no disaster has accompanied the commencement of an enterprise which you have regarded with such evil forebodings. I arrived here yesterday, and my first task is to assure my dear sister of my welfare and increasing confidence in the success of my undertaking." }"#)); - try!(index_writer.add_document(frankenstein_doc)); + index_writer.add_document(frankenstein_doc); // This is an example, so we will only index 3 documents // here. 
You can check out tantivy's tutorial to index diff --git a/src/core/term_iterator.rs b/src/core/term_iterator.rs index b8d77dac5..b54377fad 100644 --- a/src/core/term_iterator.rs +++ b/src/core/term_iterator.rs @@ -149,7 +149,7 @@ mod tests { { let mut doc = Document::default(); doc.add_text(text_field, "a b d f"); - index_writer.add_document(doc).unwrap(); + index_writer.add_document(doc); } index_writer.commit().unwrap(); } @@ -157,7 +157,7 @@ mod tests { { let mut doc = Document::default(); doc.add_text(text_field, "a b c d f"); - index_writer.add_document(doc).unwrap(); + index_writer.add_document(doc); } index_writer.commit().unwrap(); } @@ -165,7 +165,7 @@ mod tests { { let mut doc = Document::default(); doc.add_text(text_field, "e f"); - index_writer.add_document(doc).unwrap(); + index_writer.add_document(doc); } index_writer.commit().unwrap(); } diff --git a/src/functional_test.rs b/src/functional_test.rs index dd713d896..754eac395 100644 --- a/src/functional_test.rs +++ b/src/functional_test.rs @@ -1,5 +1,5 @@ use std::collections::HashSet; -use rand::{thread_rng, Rng}; +use rand::thread_rng; use schema::*; use Index; @@ -30,7 +30,7 @@ fn test_indexing() { let mut committed_docs: HashSet = HashSet::new(); let mut uncommitted_docs: HashSet = HashSet::new(); - for n in 0..200 { + for _ in 0..200 { let random_val = universe.ind_sample(&mut rng); if random_val == 0 { index_writer.commit(); diff --git a/src/indexer/index_writer.rs b/src/indexer/index_writer.rs index f2cc58df1..104e2a9f1 100644 --- a/src/indexer/index_writer.rs +++ b/src/indexer/index_writer.rs @@ -24,7 +24,6 @@ use Result; use schema::Document; use schema::Schema; use schema::Term; -use std::io; use std::mem; use std::mem::swap; use std::thread; @@ -196,6 +195,10 @@ pub fn advance_deletes( last_opstamp_opt = Some(delete_op.opstamp); } + // we only write the result different + // iff we ended up increasing the delete opstamp + // + // TODO just move the file if there was no new delete? 
if let Some(last_opstamp) = last_opstamp_opt { for doc in 0u32..segment_reader.max_doc() { if segment_reader.is_deleted(doc) { @@ -518,16 +521,14 @@ impl IndexWriter { /// /// Currently it represents the number of documents that /// have been added since the creation of the index. - - // TODO remove return without Result<> - pub fn add_document(&mut self, document: Document) -> io::Result { + pub fn add_document(&mut self, document: Document) -> u64 { let opstamp = self.stamp(); let add_operation = AddOperation { opstamp: opstamp, document: document, }; self.document_sender.send(add_operation); - Ok(opstamp) + opstamp } } @@ -595,7 +596,7 @@ mod tests { { let mut doc = Document::default(); doc.add_text(text_field, "a"); - index_writer.add_document(doc).unwrap(); + index_writer.add_document(doc); } assert_eq!(index_writer.rollback().unwrap(), 0u64); assert_eq!(num_docs_containing("a"), 0); @@ -603,12 +604,12 @@ mod tests { { let mut doc = Document::default(); doc.add_text(text_field, "b"); - index_writer.add_document(doc).unwrap(); + index_writer.add_document(doc); } { let mut doc = Document::default(); doc.add_text(text_field, "c"); - index_writer.add_document(doc).unwrap(); + index_writer.add_document(doc); } assert_eq!(index_writer.commit().unwrap(), 2u64); index.load_searchers().unwrap(); @@ -637,13 +638,13 @@ mod tests { for _doc in 0..100 { let mut doc = Document::default(); doc.add_text(text_field, "a"); - index_writer.add_document(doc).unwrap(); + index_writer.add_document(doc); } index_writer.commit().expect("commit failed"); for _doc in 0..100 { let mut doc = Document::default(); doc.add_text(text_field, "a"); - index_writer.add_document(doc).unwrap(); + index_writer.add_document(doc); } // this should create 8 segments and trigger a merge. 
index_writer.commit().expect("commit failed"); diff --git a/src/indexer/merger.rs b/src/indexer/merger.rs index 6f47f0365..b7284af96 100644 --- a/src/indexer/merger.rs +++ b/src/indexer/merger.rs @@ -325,19 +325,19 @@ mod tests { let mut doc = Document::default(); doc.add_text(text_field, "af b"); doc.add_u32(score_field, 3); - index_writer.add_document(doc).unwrap(); + index_writer.add_document(doc); } { let mut doc = Document::default(); doc.add_text(text_field, "a b c"); doc.add_u32(score_field, 5); - index_writer.add_document(doc).unwrap(); + index_writer.add_document(doc); } { let mut doc = Document::default(); doc.add_text(text_field, "a b c d"); doc.add_u32(score_field, 7); - index_writer.add_document(doc).unwrap(); + index_writer.add_document(doc); } index_writer.commit().expect("committed"); } @@ -348,13 +348,13 @@ mod tests { let mut doc = Document::default(); doc.add_text(text_field, "af b"); doc.add_u32(score_field, 11); - index_writer.add_document(doc).unwrap(); + index_writer.add_document(doc); } { let mut doc = Document::default(); doc.add_text(text_field, "a b c g"); doc.add_u32(score_field, 13); - index_writer.add_document(doc).unwrap(); + index_writer.add_document(doc); } index_writer.commit().expect("Commit failed"); } @@ -443,18 +443,18 @@ mod tests { doc!( text_field => "a b d", score_field => 1 - )).unwrap(); + )); index_writer.add_document( doc!( text_field => "b c", score_field => 2 - )).unwrap(); + )); index_writer.delete_term(Term::from_field_text(text_field, "c")); index_writer.add_document( doc!( text_field => "c d", score_field => 3 - )).unwrap(); + )); index_writer.commit().expect("committed"); index.load_searchers().unwrap(); let ref searcher = *index.searcher(); @@ -471,24 +471,24 @@ mod tests { doc!( text_field => "a d e", score_field => 4_000 - )).unwrap(); + )); index_writer.add_document( doc!( text_field => "e f", score_field => 5_000 - )).unwrap(); + )); index_writer.delete_term(Term::from_field_text(text_field, "a")); 
index_writer.delete_term(Term::from_field_text(text_field, "f")); index_writer.add_document( doc!( text_field => "f g", score_field => 6_000 - )).unwrap(); + )); index_writer.add_document( doc!( text_field => "g h", score_field => 7_000 - )).unwrap(); + )); index_writer.commit().expect("committed"); index.load_searchers().unwrap(); let searcher = index.searcher(); diff --git a/src/lib.rs b/src/lib.rs index a34a8e93b..9aca5e967 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -206,15 +206,15 @@ mod tests { let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap(); { let doc = doc!(text_field=>"af b"); - index_writer.add_document(doc).unwrap(); + index_writer.add_document(doc); } { let doc = doc!(text_field=>"a b c"); - index_writer.add_document(doc).unwrap(); + index_writer.add_document(doc); } { let doc = doc!(text_field=>"a b c d"); - index_writer.add_document(doc).unwrap(); + index_writer.add_document(doc); } assert!(index_writer.commit().is_ok()); } @@ -228,23 +228,23 @@ mod tests { let index = Index::create_in_ram(schema_builder.build()); let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap(); { - index_writer.add_document(doc!(text_field=>"a b c")).unwrap(); + index_writer.add_document(doc!(text_field=>"a b c")); index_writer.commit().unwrap(); } { { let doc = doc!(text_field=>"a"); - index_writer.add_document(doc).unwrap(); + index_writer.add_document(doc); } { let doc = doc!(text_field=>"a a"); - index_writer.add_document(doc).unwrap(); + index_writer.add_document(doc); } index_writer.commit().unwrap(); } { let doc = doc!(text_field=>"c"); - index_writer.add_document(doc).unwrap(); + index_writer.add_document(doc); index_writer.commit().unwrap(); } { @@ -271,15 +271,15 @@ mod tests { let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap(); { let doc = doc!(text_field=>"a b c"); - index_writer.add_document(doc).unwrap(); + index_writer.add_document(doc); } { let doc = doc!(); - 
index_writer.add_document(doc).unwrap(); + index_writer.add_document(doc); } { let doc = doc!(text_field=>"a b"); - index_writer.add_document(doc).unwrap(); + index_writer.add_document(doc); } index_writer.commit().unwrap(); } @@ -306,19 +306,19 @@ mod tests { let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap(); { // 0 let doc = doc!(text_field=>"a b"); - index_writer.add_document(doc).unwrap(); + index_writer.add_document(doc); } { // 1 let doc = doc!(text_field=>" a c"); - index_writer.add_document(doc).unwrap(); + index_writer.add_document(doc); } { // 2 let doc = doc!(text_field=>" b c"); - index_writer.add_document(doc).unwrap(); + index_writer.add_document(doc); } { // 3 let doc = doc!(text_field=>" b d"); - index_writer.add_document(doc).unwrap(); + index_writer.add_document(doc); } { index_writer.delete_term(Term::from_field_text(text_field, "c")); @@ -328,11 +328,11 @@ mod tests { } { // 4 let doc = doc!(text_field=>" b c"); - index_writer.add_document(doc).unwrap(); + index_writer.add_document(doc); } { // 5 let doc = doc!(text_field=>" a"); - index_writer.add_document(doc).unwrap(); + index_writer.add_document(doc); } index_writer.commit().unwrap(); } @@ -361,7 +361,7 @@ mod tests { let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap(); { // 0 let doc = doc!(text_field=>"a b"); - index_writer.add_document(doc).unwrap(); + index_writer.add_document(doc); } { // 1 index_writer.delete_term(Term::from_field_text(text_field, "c")); @@ -393,7 +393,7 @@ mod tests { let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap(); { let doc = doc!(text_field=>"a b"); - index_writer.add_document(doc).unwrap(); + index_writer.add_document(doc); } { index_writer.delete_term(Term::from_field_text(text_field, "c")); @@ -497,7 +497,7 @@ mod tests { let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap(); { let doc = doc!(text_field=>"af af af bc bc"); - 
index_writer.add_document(doc).unwrap(); + index_writer.add_document(doc); } index_writer.commit().unwrap(); } @@ -526,15 +526,15 @@ mod tests { let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap(); { let doc = doc!(text_field=>"af af af b"); - index_writer.add_document(doc).unwrap(); + index_writer.add_document(doc); } { let doc = doc!(text_field=>"a b c"); - index_writer.add_document(doc).unwrap(); + index_writer.add_document(doc); } { let doc = doc!(text_field=>"a b c d"); - index_writer.add_document(doc).unwrap(); + index_writer.add_document(doc); } index_writer.commit().unwrap(); } @@ -593,15 +593,15 @@ mod tests { let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap(); { let doc = doc!(text_field=>"af b"); - index_writer.add_document(doc).unwrap(); + index_writer.add_document(doc); } { let doc = doc!(text_field=>"a b c"); - index_writer.add_document(doc).unwrap(); + index_writer.add_document(doc); } { let doc = doc!(text_field=>"a b c d"); - index_writer.add_document(doc).unwrap(); + index_writer.add_document(doc); } index_writer.commit().unwrap(); } diff --git a/src/postings/mod.rs b/src/postings/mod.rs index 0cdef4b4e..bd0c39e60 100644 --- a/src/postings/mod.rs +++ b/src/postings/mod.rs @@ -175,12 +175,12 @@ mod tests { { let mut doc = Document::default(); doc.add_text(text_field, "g b b d c g c"); - index_writer.add_document(doc).unwrap(); + index_writer.add_document(doc); } { let mut doc = Document::default(); doc.add_text(text_field, "g a b b a d c g c"); - index_writer.add_document(doc).unwrap(); + index_writer.add_document(doc); } assert!(index_writer.commit().is_ok()); } @@ -257,7 +257,7 @@ mod tests { count_b += 1; doc.add_text(text_field, "b"); } - index_writer.add_document(doc).unwrap(); + index_writer.add_document(doc); } assert!(index_writer.commit().is_ok()); } diff --git a/src/query/boolean_query/mod.rs b/src/query/boolean_query/mod.rs index c17ea2303..1b41a8996 100644 --- 
a/src/query/boolean_query/mod.rs +++ b/src/query/boolean_query/mod.rs @@ -42,23 +42,23 @@ mod tests { let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap(); { let doc = doc!(text_field => "a b c"); - index_writer.add_document(doc).unwrap(); + index_writer.add_document(doc); } { let doc = doc!(text_field => "a c"); - index_writer.add_document(doc).unwrap(); + index_writer.add_document(doc); } { let doc = doc!(text_field => "b c"); - index_writer.add_document(doc).unwrap(); + index_writer.add_document(doc); } { let doc = doc!(text_field => "a b c d"); - index_writer.add_document(doc).unwrap(); + index_writer.add_document(doc); } { let doc = doc!(text_field => "d"); - index_writer.add_document(doc).unwrap(); + index_writer.add_document(doc); } assert!(index_writer.commit().is_ok()); } diff --git a/src/query/phrase_query/mod.rs b/src/query/phrase_query/mod.rs index e01743eb3..4c9d1dca0 100644 --- a/src/query/phrase_query/mod.rs +++ b/src/query/phrase_query/mod.rs @@ -27,23 +27,23 @@ mod tests { let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap(); { // 0 let doc = doc!(text_field=>"b b b d c g c"); - index_writer.add_document(doc).unwrap(); + index_writer.add_document(doc); } { // 1 let doc = doc!(text_field=>"a b b d c g c"); - index_writer.add_document(doc).unwrap(); + index_writer.add_document(doc); } { // 2 let doc = doc!(text_field=>"a b a b c"); - index_writer.add_document(doc).unwrap(); + index_writer.add_document(doc); } { // 3 let doc = doc!(text_field=>"c a b a d ga a"); - index_writer.add_document(doc).unwrap(); + index_writer.add_document(doc); } { // 4 let doc = doc!(text_field=>"a b c"); - index_writer.add_document(doc).unwrap(); + index_writer.add_document(doc); } assert!(index_writer.commit().is_ok()); } diff --git a/src/query/term_query/mod.rs b/src/query/term_query/mod.rs index 747fbe62e..8aa56484a 100644 --- a/src/query/term_query/mod.rs +++ b/src/query/term_query/mod.rs @@ -36,7 +36,7 @@ mod tests { let 
mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap(); { let doc = doc!(text_field => "a"); - index_writer.add_document(doc).unwrap(); + index_writer.add_document(doc); } assert!(index_writer.commit().is_ok()); } From ec5fb2eaa9412ba14101a848a886324aa2e2dedf Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Mon, 27 Feb 2017 09:52:28 +0900 Subject: [PATCH 049/107] NOBUG cleanup --- src/core/segment_meta.rs | 42 +++++++++++++++++++++++++++------ src/functional_test.rs | 2 +- src/indexer/index_writer.rs | 2 +- src/indexer/log_merge_policy.rs | 2 +- src/indexer/segment_updater.rs | 2 +- 5 files changed, 39 insertions(+), 11 deletions(-) diff --git a/src/core/segment_meta.rs b/src/core/segment_meta.rs index ee2c9593b..ffcd2f6b9 100644 --- a/src/core/segment_meta.rs +++ b/src/core/segment_meta.rs @@ -14,41 +14,69 @@ struct DeleteMeta { #[derive(Clone, Debug, RustcDecodable,RustcEncodable)] pub struct SegmentMeta { segment_id: SegmentId, - num_docs: u32, + max_doc: u32, deletes: Option, } impl SegmentMeta { + + /// Creates a new segment meta for + /// a segment with no deletes and no documents. pub fn new(segment_id: SegmentId) -> SegmentMeta { SegmentMeta { segment_id: segment_id, - num_docs: 0, + max_doc: 0, deletes: None, } } + /// Returns the segment id. pub fn id(&self) -> SegmentId { self.segment_id } - pub fn num_docs(&self) -> u32 { - self.num_docs - } + /// Returns the number of deleted documents. + pub fn num_deleted_docs(&self) -> u32 { + self.deletes + .as_ref() + .map(|delete_meta| delete_meta.num_deleted_docs) + .unwrap_or(0u32) + } + /// Return the highest doc id + 1 + /// + /// If there are no deletes, then num_docs = max_doc + /// and all the doc ids contained in this segment + /// are exactly (0..max_doc). + pub fn max_doc(&self) -> u32 { + self.max_doc + } + + /// Return the number of documents in the segment. 
+ pub fn num_docs(&self) -> u32 { + self.max_doc() - self.num_deleted_docs() + } + + /// Returns the opstamp of the last delete operation + /// taken into account in this segment. pub fn delete_opstamp(&self) -> Option { self.deletes .as_ref() .map(|delete_meta| delete_meta.opstamp) } + /// Returns true iff the segment meta contains + /// delete information. pub fn has_deletes(&self) -> bool { self.deletes.is_some() } - pub fn set_num_docs(&mut self, num_docs: u32) { - self.num_docs = num_docs; + #[doc(hidden)] + pub fn set_max_doc(&mut self, max_doc: u32) { + self.max_doc = max_doc; } + #[doc(hidden)] pub fn set_delete_meta(&mut self, num_deleted_docs: u32, opstamp: u64) { self.deletes = Some(DeleteMeta { num_deleted_docs: num_deleted_docs, diff --git a/src/functional_test.rs b/src/functional_test.rs index 754eac395..53d8fcd11 100644 --- a/src/functional_test.rs +++ b/src/functional_test.rs @@ -33,7 +33,7 @@ fn test_indexing() { for _ in 0..200 { let random_val = universe.ind_sample(&mut rng); if random_val == 0 { - index_writer.commit(); + index_writer.commit().expect("Commit failed"); committed_docs.extend(&uncommitted_docs); uncommitted_docs.clear(); index.load_searchers().unwrap(); diff --git a/src/indexer/index_writer.rs b/src/indexer/index_writer.rs index 104e2a9f1..b7a60db68 100644 --- a/src/indexer/index_writer.rs +++ b/src/indexer/index_writer.rs @@ -242,7 +242,7 @@ fn index_documents(heap: &mut Heap, // let segment_entry = advance_deletes(&mut segment, delete_queue, delete_position, )?; let mut segment_meta = SegmentMeta::new(segment_id); - segment_meta.set_num_docs(num_docs); + segment_meta.set_max_doc(num_docs); let mut segment_entry = SegmentEntry::new(segment_meta); segment_entry.set_doc_to_opstamp(DocToOpstampMapping::from(doc_opstamps)); diff --git a/src/indexer/log_merge_policy.rs b/src/indexer/log_merge_policy.rs index 413964767..951af8c31 100644 --- a/src/indexer/log_merge_policy.rs +++ b/src/indexer/log_merge_policy.rs @@ -122,7 +122,7 @@ mod 
tests { fn seg_meta(num_docs: u32) -> SegmentMeta { let mut segment_metas = SegmentMeta::new(SegmentId::generate_random()); - segment_metas.set_num_docs(num_docs); + segment_metas.set_max_doc(num_docs); segment_metas } diff --git a/src/indexer/segment_updater.rs b/src/indexer/segment_updater.rs index 768fe7ea0..6c5b9b451 100644 --- a/src/indexer/segment_updater.rs +++ b/src/indexer/segment_updater.rs @@ -247,7 +247,7 @@ impl SegmentUpdater { let num_docs = merger.write(segment_serializer).expect("Serializing merged index failed"); let mut segment_meta = SegmentMeta::new(merged_segment.id()); - segment_meta.set_num_docs(num_docs); + segment_meta.set_max_doc(num_docs); let segment_entry = SegmentEntry::new(segment_meta); segment_updater_clone From ab3440f925a1d28bae7ba30efac99d5c4067b0aa Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Mon, 27 Feb 2017 12:39:59 +0900 Subject: [PATCH 050/107] NOBUG Bypass github cache for coveralls badge --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index a58345a4b..a6863b2db 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,7 @@ ![Tantivy](https://tantivy-search.github.io/logo/tantivy-logo.png) [![Build Status](https://travis-ci.org/tantivy-search/tantivy.svg?branch=master)](https://travis-ci.org/tantivy-search/tantivy) -[![Coverage Status](https://coveralls.io/repos/github/tantivy-search/tantivy/badge.svg?branch=master)](https://coveralls.io/github/tantivy-search/tantivy?branch=master) +[![Coverage Status](https://coveralls.io/repos/github/tantivy-search/tantivy/badge.svg?branch=master&refresh1)](https://coveralls.io/github/tantivy-search/tantivy?branch=master) [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT) [![Join the chat at 
https://gitter.im/tantivy-search/tantivy](https://badges.gitter.im/tantivy-search/tantivy.svg)](https://gitter.im/tantivy-search/tantivy?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge) ![beacon for google analytics](https://ga-beacon.appspot.com/UA-88834340-1/tantivy/README) From 590a8582c9e0891f006cbaeb47b5eb2196dd372b Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Tue, 28 Feb 2017 21:17:19 +0900 Subject: [PATCH 051/107] The reference doc should not point to the schema page. --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index a6863b2db..054f62f47 100644 --- a/README.md +++ b/README.md @@ -32,7 +32,7 @@ Tantivy supports Linux and MacOS. Windows is not supported. - [tantivy's usage example](http://fulmicoton.com/tantivy-examples/simple_search.html) - [tantivy-cli and its tutorial](https://github.com/tantivy-search/tantivy-cli). It will walk you through getting a wikipedia search engine up and running in a few minutes. -- [reference doc](https://tantivy-search.github.io/tantivy/tantivy/schema/index.html). +- [reference doc](https://tantivy-search.github.io/tantivy/tantivy/index.html). 
# Compiling From 4b7afa2ae71a3ba2b4fc6369736f0e121bb8be21 Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Fri, 3 Mar 2017 22:41:30 +0900 Subject: [PATCH 052/107] issue/77 Added managed directory --- src/core/index.rs | 76 ++++++---------------------- src/core/mod.rs | 1 + src/core/segment.rs | 14 +----- src/core/segment_component.rs | 19 ++++++- src/core/segment_meta.rs | 31 ++++++++++++ src/directory/directory.rs | 3 -- src/directory/managed_directory.rs | 81 ++++++++++++++++++++++++++++++ src/directory/mmap_directory.rs | 20 -------- src/directory/mod.rs | 2 + src/directory/ram_directory.rs | 19 ------- src/indexer/segment_manager.rs | 13 ++++- src/indexer/segment_register.rs | 1 + src/indexer/segment_updater.rs | 12 +++-- 13 files changed, 168 insertions(+), 124 deletions(-) create mode 100644 src/directory/managed_directory.rs diff --git a/src/core/index.rs b/src/core/index.rs index afce57a21..cc7d51cf3 100644 --- a/src/core/index.rs +++ b/src/core/index.rs @@ -19,6 +19,7 @@ use super::pool::LeasedItem; use std::path::Path; use core::IndexMeta; use IndexWriter; +use directory::ManagedDirectory; use core::META_FILEPATH; use super::segment::create_segment; use indexer::segment_updater::save_new_metas; @@ -26,7 +27,6 @@ use directory::error::FileError; const NUM_SEARCHERS: usize = 12; - fn load_metas(directory: &Directory) -> Result { let meta_data = directory.atomic_read(&META_FILEPATH)?; let meta_string = String::from_utf8_lossy(&meta_data); @@ -36,58 +36,19 @@ fn load_metas(directory: &Directory) -> Result { /// Tantivy's Search Index pub struct Index { - directory: Box, + directory: ManagedDirectory, schema: Schema, searcher_pool: Arc>, } - - -/// Deletes all of the files of the segment. -/// This is called when there is a merge or a rollback. -/// -/// # Disclaimer -/// If deletion of a file fails (e.g. a file -/// was read-only.), the method does not -/// fail and just logs an error when it fails. 
-#[doc(hidden)] -pub fn delete_segment(directory: &Directory, segment_id: SegmentId) { - info!("Deleting segment {:?}", segment_id); - let segment_filepaths_res = directory.ls_starting_with( - &*segment_id.uuid_string() - ); - - match segment_filepaths_res { - Ok(segment_filepaths) => { - for segment_filepath in &segment_filepaths { - if let Err(err) = directory.delete(&segment_filepath) { - match err { - FileError::FileDoesNotExist(_) => { - // this is normal behavior. - // the position file for instance may not exists. - } - FileError::IOError(err) => { - error!("Failed to remove {:?} : {:?}", segment_id, err); - } - } - } - } - } - Err(_) => { - error!("Failed to list files of segment {:?} for deletion.", segment_id.uuid_string()); - } - } -} - - impl Index { /// Creates a new index using the `RAMDirectory`. /// /// The index will be allocated in anonymous memory. /// This should only be used for unit tests. pub fn create_in_ram(schema: Schema) -> Index { - let directory = Box::new(RAMDirectory::create()); + let directory = ManagedDirectory::new(RAMDirectory::create()); Index::from_directory(directory, schema).expect("Creating a RAMDirectory should never fail") // unwrap is ok here } @@ -96,8 +57,8 @@ impl Index { /// /// If a previous index was in this directory, then its meta file will be destroyed. pub fn create(directory_path: &Path, schema: Schema) -> Result { - let directory = MmapDirectory::open(directory_path)?; - Index::from_directory(box directory, schema) + let directory = ManagedDirectory::new(MmapDirectory::open(directory_path)?); + Index::from_directory(directory, schema) } /// Creates a new index in a temp directory. @@ -109,12 +70,12 @@ impl Index { /// The temp directory is only used for testing the `MmapDirectory`. /// For other unit tests, prefer the `RAMDirectory`, see: `create_in_ram`. 
pub fn create_from_tempdir(schema: Schema) -> Result { - let directory = Box::new(try!(MmapDirectory::create_from_tempdir())); + let directory = ManagedDirectory::new(MmapDirectory::create_from_tempdir()?); Index::from_directory(directory, schema) } /// Creates a new index given a directory and an `IndexMeta`. - fn create_from_metas(directory: Box, metas: IndexMeta) -> Result { + fn create_from_metas(directory: ManagedDirectory, metas: IndexMeta) -> Result { let schema = metas.schema.clone(); let index = Index { directory: directory, @@ -126,16 +87,16 @@ impl Index { } /// Create a new index from a directory. - pub fn from_directory(mut directory: Box, schema: Schema) -> Result { + pub fn from_directory(mut directory: ManagedDirectory, schema: Schema) -> Result { save_new_metas(schema.clone(), 0, directory.borrow_mut())?; Index::create_from_metas(directory, IndexMeta::with_schema(schema)) } /// Opens a new directory from an index path. pub fn open(directory_path: &Path) -> Result { - let directory = try!(MmapDirectory::open(directory_path)); + let directory = ManagedDirectory::new(MmapDirectory::open(directory_path)?); let metas = try!(load_metas(&directory)); - Index::create_from_metas(directory.box_clone(), metas) + Index::create_from_metas(directory, metas) } /// Returns the index opstamp. @@ -196,16 +157,7 @@ impl Index { .map(|segment_meta| self.segment(segment_meta)) .collect()) } - - /// Remove all of the file associated with the segment. - /// - /// This method cannot fail. If a problem occurs, - /// some files may end up never being removed. - /// The error will only be logged. - pub fn delete_segment(&self, segment_id: SegmentId) { - delete_segment(self.directory(), segment_id); - } - + #[doc(hidden)] pub fn segment(&self, segment_meta: SegmentMeta) -> Segment { create_segment(self.clone(), segment_meta) @@ -219,12 +171,12 @@ impl Index { /// Return a reference to the index directory. 
pub fn directory(&self) -> &Directory { - &*self.directory + &self.directory } /// Return a mutable reference to the index directory. pub fn directory_mut(&mut self) -> &mut Directory { - &mut *self.directory + &mut self.directory } /// Reads the meta.json and returns the list of @@ -288,7 +240,7 @@ impl fmt::Debug for Index { impl Clone for Index { fn clone(&self) -> Index { Index { - directory: self.directory.box_clone(), + directory: self.directory.clone(), schema: self.schema.clone(), searcher_pool: self.searcher_pool.clone(), } diff --git a/src/core/mod.rs b/src/core/mod.rs index d6238c48c..4e11428e0 100644 --- a/src/core/mod.rs +++ b/src/core/mod.rs @@ -26,4 +26,5 @@ use std::path::PathBuf; lazy_static! { pub static ref META_FILEPATH: PathBuf = PathBuf::from("meta.json"); + pub static ref MANAGED_FILEPATH: PathBuf = PathBuf::from(".managed.json"); } \ No newline at end of file diff --git a/src/core/segment.rs b/src/core/segment.rs index 22f157420..cd78743b9 100644 --- a/src/core/segment.rs +++ b/src/core/segment.rs @@ -62,19 +62,7 @@ impl Segment { /// It just joins the segment id with the extension /// associated to a segment component. pub fn relative_path(&self, component: SegmentComponent) -> PathBuf { - use self::SegmentComponent::*; - let mut path = self.id().uuid_string(); - path.push_str(&*match component { - POSITIONS => ".pos".to_string(), - INFO => ".info".to_string(), - POSTINGS => ".idx".to_string(), - TERMS => ".term".to_string(), - STORE => ".store".to_string(), - FASTFIELDS => ".fast".to_string(), - FIELDNORMS => ".fieldnorm".to_string(), - DELETE => {format!(".{}.del", self.meta.delete_opstamp().unwrap_or(0))}, - }); - PathBuf::from(path) + self.meta.relative_path(component) } /// Open one of the component file for read. 
diff --git a/src/core/segment_component.rs b/src/core/segment_component.rs index 93aacd506..5e380c597 100644 --- a/src/core/segment_component.rs +++ b/src/core/segment_component.rs @@ -10,5 +10,20 @@ pub enum SegmentComponent { DELETE } - - \ No newline at end of file +impl SegmentComponent { + + pub fn iterator() -> impl Iterator { + static SEGMENT_COMPONENTS: [SegmentComponent; 8] = [ + SegmentComponent::INFO, + SegmentComponent::POSTINGS, + SegmentComponent::POSITIONS, + SegmentComponent::FASTFIELDS, + SegmentComponent::FIELDNORMS, + SegmentComponent::TERMS, + SegmentComponent::STORE, + SegmentComponent::DELETE + ]; + SEGMENT_COMPONENTS.into_iter() + } + +} \ No newline at end of file diff --git a/src/core/segment_meta.rs b/src/core/segment_meta.rs index ffcd2f6b9..b342a18d0 100644 --- a/src/core/segment_meta.rs +++ b/src/core/segment_meta.rs @@ -1,4 +1,6 @@ use core::SegmentId; +use super::SegmentComponent; +use std::path::PathBuf; #[derive(Clone, Debug, RustcDecodable,RustcEncodable)] @@ -43,6 +45,35 @@ impl SegmentMeta { .unwrap_or(0u32) } + pub fn alive_files(&self) -> Vec { + SegmentComponent::iterator() + .map(|component| { + self.relative_path(*component) + }) + .collect::>() + + } + + /// Returns the relative path of a component of our segment. + /// + /// It just joins the segment id with the extension + /// associated to a segment component. 
+ pub fn relative_path(&self, component: SegmentComponent) -> PathBuf { + use self::SegmentComponent::*; + let mut path = self.id().uuid_string(); + path.push_str(&*match component { + POSITIONS => ".pos".to_string(), + INFO => ".info".to_string(), + POSTINGS => ".idx".to_string(), + TERMS => ".term".to_string(), + STORE => ".store".to_string(), + FASTFIELDS => ".fast".to_string(), + FIELDNORMS => ".fieldnorm".to_string(), + DELETE => {format!(".{}.del", self.delete_opstamp().unwrap_or(0))}, + }); + PathBuf::from(path) + } + /// Return the highest doc id + 1 /// /// If there are no deletes, then num_docs = max_docs diff --git a/src/directory/directory.rs b/src/directory/directory.rs index b3ef71016..3f41f5011 100644 --- a/src/directory/directory.rs +++ b/src/directory/directory.rs @@ -78,9 +78,6 @@ pub trait Directory: fmt::Debug + Send + Sync + 'static { /// Clones the directory and boxes the clone fn box_clone(&self) -> Box; - /// Returns the list of files starting by a given - /// prefix. 
- fn ls_starting_with(&self, prefix: &str) -> io::Result>; } diff --git a/src/directory/managed_directory.rs b/src/directory/managed_directory.rs new file mode 100644 index 000000000..ba93a405e --- /dev/null +++ b/src/directory/managed_directory.rs @@ -0,0 +1,81 @@ +use Result; +use std::path::{Path, PathBuf}; +use directory::error::{FileError, OpenWriteError}; +use directory::{ReadOnlySource, WritePtr}; +use std::result; +use std::io; +use Directory; +use std::sync::{Arc, RwLock}; +use std::collections::HashSet; +use std::io::Write; +use rustc_serialize::json; +use core::MANAGED_FILEPATH; + + + +#[derive(Debug)] +pub struct ManagedDirectory { + directory: Box, + managed_paths: Arc>>, +} + + +impl ManagedDirectory { + pub fn new(directory: Dir) -> ManagedDirectory { + ManagedDirectory { + directory: box directory, + managed_paths: Arc::default(), + } + } + + fn register_file_as_managed(&mut self, filepath: PathBuf) -> Result<()> { + let mut managed_files_lock = self.managed_paths.write()?; + if managed_files_lock.insert(filepath) { + let mut w = vec!(); + try!(write!(&mut w, "{}\n", json::as_pretty_json(&*managed_files_lock))); + self.directory.atomic_write(&MANAGED_FILEPATH, &w[..])?; + } + Ok(()) + } +} + +impl Directory for ManagedDirectory { + + fn open_read(&self, path: &Path) -> result::Result { + self.directory.open_read(path) + } + + fn open_write(&mut self, path: &Path) -> result::Result { + self.directory.open_write(path) + } + + fn atomic_write(&mut self, path: &Path, data: &[u8]) -> io::Result<()> { + self.directory.atomic_write(path, data) + } + + fn atomic_read(&self, path: &Path) -> result::Result, FileError> { + self.directory.atomic_read(path) + } + + fn delete(&self, path: &Path) -> result::Result<(), FileError> { + self.directory.delete(path) + } + + fn exists(&self, path: &Path) -> bool { + self.directory.exists(path) + } + + fn box_clone(&self) -> Box { + box self.clone() + } + +} + +impl Clone for ManagedDirectory { + fn clone(&self) -> 
ManagedDirectory { + ManagedDirectory { + directory: self.directory.box_clone(), + managed_paths: self.managed_paths.clone(), + } + } +} \ No newline at end of file diff --git a/src/directory/mmap_directory.rs b/src/directory/mmap_directory.rs index 3b18144b3..0a8e6f4ac 100644 --- a/src/directory/mmap_directory.rs +++ b/src/directory/mmap_directory.rs @@ -352,26 +352,6 @@ impl Directory for MmapDirectory { fn box_clone(&self,) -> Box { Box::new(self.clone()) } - - fn ls_starting_with(&self, prefix: &str) -> io::Result> { - fs::read_dir(&self.root_path) - .map(|paths: ReadDir| { - paths - .filter_map(|dir_entry_res| - dir_entry_res - .ok() - .map(|dir_entry| dir_entry.path()) - ) - .filter(|path| - path.to_str() - .map(|filepath| filepath.starts_with(prefix)) - .unwrap_or(false) - ) - .map(PathBuf::from) - .collect() - }) - - } } diff --git a/src/directory/mod.rs b/src/directory/mod.rs index e03435199..760c2c0d5 100644 --- a/src/directory/mod.rs +++ b/src/directory/mod.rs @@ -3,6 +3,7 @@ mod ram_directory; mod directory; mod read_only_source; mod shared_vec_slice; +mod managed_directory; /// Errors specific to the directory module. 
pub mod error; @@ -14,6 +15,7 @@ pub use self::read_only_source::ReadOnlySource; pub use self::directory::Directory; pub use self::ram_directory::RAMDirectory; pub use self::mmap_directory::MmapDirectory; +pub use self::managed_directory::ManagedDirectory; /// Synonym of Seek + Write pub trait SeekableWrite: Seek + Write {} diff --git a/src/directory/ram_directory.rs b/src/directory/ram_directory.rs index 2a85a735d..3f798485d 100644 --- a/src/directory/ram_directory.rs +++ b/src/directory/ram_directory.rs @@ -130,20 +130,6 @@ impl InnerDirectory { .contains_key(path) } - fn ls_starting_with(&self, prefix: &str) -> Vec { - self.0 - .read() - .expect("Failed to get read lock directory.") - .keys() - .filter(|path: &&PathBuf| - path.to_str() - .map(|p: &str| p.starts_with(prefix)) - .unwrap_or(false) - ) - .cloned() - .collect() - } - } impl fmt::Debug for RAMDirectory { @@ -218,9 +204,4 @@ impl Directory for RAMDirectory { Box::new(self.clone()) } - - fn ls_starting_with(&self, prefix: &str) -> io::Result> { - Ok(self.fs.ls_starting_with(prefix)) - } - } diff --git a/src/indexer/segment_manager.rs b/src/indexer/segment_manager.rs index 250830a84..2533eeeda 100644 --- a/src/indexer/segment_manager.rs +++ b/src/indexer/segment_manager.rs @@ -3,7 +3,7 @@ use std::sync::RwLock; use core::SegmentMeta; use core::SegmentId; use indexer::{SegmentEntry, SegmentState}; - +use std::path::PathBuf; use std::sync::{RwLockReadGuard, RwLockWriteGuard}; use std::fmt::{self, Debug, Formatter}; @@ -67,6 +67,17 @@ impl SegmentManager { segment_entries } + pub fn alive_files(&self) -> Vec { + let mut files = vec!(); + let (segment_meta_uncommitted, segment_meta_committed) = get_segments(self); + for segment_meta in segment_meta_uncommitted + .into_iter() + .chain(segment_meta_committed.into_iter()) { + files.extend(segment_meta.alive_files()); + } + files + } + pub fn segment_state(&self, segment_id: &SegmentId) -> Option { self.segment_entry(segment_id) .map(|segment_entry| 
segment_entry.state()) diff --git a/src/indexer/segment_register.rs b/src/indexer/segment_register.rs index 418d92ef7..288b7a95f 100644 --- a/src/indexer/segment_register.rs +++ b/src/indexer/segment_register.rs @@ -4,6 +4,7 @@ use core::SegmentMeta; use std::fmt; use std::fmt::{Debug, Formatter}; use indexer::segment_entry::SegmentEntry; +use std::path::PathBuf; diff --git a/src/indexer/segment_updater.rs b/src/indexer/segment_updater.rs index 6c5b9b451..b2d1b1ba5 100644 --- a/src/indexer/segment_updater.rs +++ b/src/indexer/segment_updater.rs @@ -21,6 +21,7 @@ use indexer::merger::IndexMerger; use indexer::SegmentEntry; use indexer::SegmentSerializer; use Result; +use std::path::PathBuf; use rustc_serialize::json; use schema::Schema; use std::borrow::BorrowMut; @@ -74,13 +75,16 @@ pub fn save_metas(segment_metas: Vec, schema: schema, opstamp: opstamp, }; - let mut w = Vec::new(); + let mut w = vec!(); try!(write!(&mut w, "{}\n", json::as_pretty_json(&metas))); Ok(directory .atomic_write(&META_FILEPATH, &w[..])?) 
} +fn garbage_collect_files(directory: &Directory, alive_files: Vec) { + // +} // The segment update runner is in charge of processing all @@ -183,7 +187,10 @@ impl SegmentUpdater { segment_updater.0.index.schema(), opstamp, directory.borrow_mut()).expect("Could not save metas."); + let useful_files = segment_updater.0.segment_manager.alive_files(); + garbage_collect_files(&*directory, useful_files); segment_updater.consider_merge_options(); + }) } @@ -290,9 +297,6 @@ impl SegmentUpdater { segment_updater.0.index.schema(), segment_updater.0.index.opstamp(), directory.borrow_mut()).expect("Could not save metas."); - for segment_meta in merged_segment_metas { - segment_updater.0.index.delete_segment(segment_meta.id()); - } }) } From c59507444f134e86be231c5347212e38e2b20e7d Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Sat, 4 Mar 2017 14:57:08 +0900 Subject: [PATCH 053/107] issue/77 ManagedDirectory working Closes #77 --- src/core/index.rs | 17 +-- src/core/segment.rs | 1 + src/core/segment_meta.rs | 6 +- src/directory/directory.rs | 2 +- src/directory/error.rs | 6 -- src/directory/managed_directory.rs | 165 ++++++++++++++++++++++++++--- src/directory/mmap_directory.rs | 60 ++++++++--- src/indexer/index_writer.rs | 4 +- src/indexer/segment_manager.rs | 34 ++++-- src/indexer/segment_register.rs | 3 - src/indexer/segment_updater.rs | 29 +++-- src/indexer/segment_writer.rs | 1 - 12 files changed, 260 insertions(+), 68 deletions(-) diff --git a/src/core/index.rs b/src/core/index.rs index cc7d51cf3..c09baba9d 100644 --- a/src/core/index.rs +++ b/src/core/index.rs @@ -23,7 +23,6 @@ use directory::ManagedDirectory; use core::META_FILEPATH; use super::segment::create_segment; use indexer::segment_updater::save_new_metas; -use directory::error::FileError; const NUM_SEARCHERS: usize = 12; @@ -48,7 +47,8 @@ impl Index { /// The index will be allocated in anonymous memory. /// This should only be used for unit tests. 
pub fn create_in_ram(schema: Schema) -> Index { - let directory = ManagedDirectory::new(RAMDirectory::create()); + let ram_directory = RAMDirectory::create(); + let directory = ManagedDirectory::new(ram_directory).expect("Creating a managed directory from a brand new RAM directory should never fail."); Index::from_directory(directory, schema).expect("Creating a RAMDirectory should never fail") // unwrap is ok here } @@ -57,7 +57,8 @@ impl Index { /// /// If a previous index was in this directory, then its meta file will be destroyed. pub fn create(directory_path: &Path, schema: Schema) -> Result { - let directory = ManagedDirectory::new(MmapDirectory::open(directory_path)?); + let mmap_directory = MmapDirectory::open(directory_path)?; + let directory = ManagedDirectory::new(mmap_directory)?; Index::from_directory(directory, schema) } @@ -70,7 +71,8 @@ impl Index { /// The temp directory is only used for testing the `MmapDirectory`. /// For other unit tests, prefer the `RAMDirectory`, see: `create_in_ram`. pub fn create_from_tempdir(schema: Schema) -> Result { - let directory = ManagedDirectory::new(MmapDirectory::create_from_tempdir()?); + let mmap_directory = MmapDirectory::create_from_tempdir()?; + let directory = ManagedDirectory::new(mmap_directory)?; Index::from_directory(directory, schema) } @@ -94,7 +96,8 @@ impl Index { /// Opens a new directory from an index path. pub fn open(directory_path: &Path) -> Result { - let directory = ManagedDirectory::new(MmapDirectory::open(directory_path)?); + let mmap_directory = MmapDirectory::open(directory_path)?; + let directory = ManagedDirectory::new(mmap_directory)?; let metas = try!(load_metas(&directory)); Index::create_from_metas(directory, metas) } @@ -170,12 +173,12 @@ impl Index { } /// Return a reference to the index directory. - pub fn directory(&self) -> &Directory { + pub fn directory(&self) -> &ManagedDirectory { &self.directory } /// Return a mutable reference to the index directory. 
- pub fn directory_mut(&mut self) -> &mut Directory { + pub fn directory_mut(&mut self) -> &mut ManagedDirectory { &mut self.directory } diff --git a/src/core/segment.rs b/src/core/segment.rs index cd78743b9..99fad2591 100644 --- a/src/core/segment.rs +++ b/src/core/segment.rs @@ -9,6 +9,7 @@ use indexer::segment_serializer::SegmentSerializer; use super::SegmentComponent; use core::Index; use std::result; +use directory::Directory; use core::SegmentMeta; use directory::error::{FileError, OpenWriteError}; diff --git a/src/core/segment_meta.rs b/src/core/segment_meta.rs index b342a18d0..387617f39 100644 --- a/src/core/segment_meta.rs +++ b/src/core/segment_meta.rs @@ -1,7 +1,7 @@ use core::SegmentId; use super::SegmentComponent; use std::path::PathBuf; - +use std::collections::HashSet; #[derive(Clone, Debug, RustcDecodable,RustcEncodable)] struct DeleteMeta { @@ -45,12 +45,12 @@ impl SegmentMeta { .unwrap_or(0u32) } - pub fn alive_files(&self) -> Vec { + pub fn living_files(&self) -> HashSet { SegmentComponent::iterator() .map(|component| { self.relative_path(*component) }) - .collect::>() + .collect::>() } diff --git a/src/directory/directory.rs b/src/directory/directory.rs index 3f41f5011..2f3cb4146 100644 --- a/src/directory/directory.rs +++ b/src/directory/directory.rs @@ -1,6 +1,6 @@ use std::marker::Send; use std::fmt; -use std::path::{Path, PathBuf}; +use std::path::Path; use directory::error::{FileError, OpenWriteError}; use directory::{ReadOnlySource, WritePtr}; use std::result; diff --git a/src/directory/error.rs b/src/directory/error.rs index a49ea23b7..aacfe62d3 100644 --- a/src/directory/error.rs +++ b/src/directory/error.rs @@ -36,9 +36,3 @@ pub enum FileError { /// interacting with the underlying IO device. 
IOError(io::Error), } - -impl From for FileError { - fn from(err: io::Error) -> FileError { - FileError::IOError(err) - } -} diff --git a/src/directory/managed_directory.rs b/src/directory/managed_directory.rs index ba93a405e..2dc540229 100644 --- a/src/directory/managed_directory.rs +++ b/src/directory/managed_directory.rs @@ -1,4 +1,3 @@ -use Result; use std::path::{Path, PathBuf}; use directory::error::{FileError, OpenWriteError}; use directory::{ReadOnlySource, WritePtr}; @@ -10,8 +9,8 @@ use std::collections::HashSet; use std::io::Write; use rustc_serialize::json; use core::MANAGED_FILEPATH; - - +use Result; +use Error; #[derive(Debug)] pub struct ManagedDirectory { @@ -19,21 +18,88 @@ pub struct ManagedDirectory { managed_paths: Arc>>, } - impl ManagedDirectory { - pub fn new(directory: Dir) -> ManagedDirectory { - ManagedDirectory { - directory: box directory, - managed_paths: Arc::default(), + pub fn new(directory: Dir) -> Result { + match directory.atomic_read(&MANAGED_FILEPATH) { + Ok(data) => { + let managed_files_json = String::from_utf8_lossy(&data); + let managed_files: HashSet = json::decode(&managed_files_json) + .map_err(|e| Error::CorruptedFile(MANAGED_FILEPATH.clone(), Box::new(e)))?; + Ok(ManagedDirectory { + directory: box directory, + managed_paths: Arc::new(RwLock::new(managed_files)), + }) + } + Err(FileError::FileDoesNotExist(_)) => { + Ok(ManagedDirectory { + directory: box directory, + managed_paths: Arc::default(), + }) + } + Err(FileError::IOError(e)) => { + Err(From::from(e)) + } } } - fn register_file_as_managed(&mut self, filepath: PathBuf) -> Result<()> { - let mut managed_files_lock = self.managed_paths.write()?; - if managed_files_lock.insert(filepath) { - let mut w = vec!(); - try!(write!(&mut w, "{}\n", json::as_pretty_json(&*managed_files_lock))); - self.directory.atomic_write(&MANAGED_FILEPATH, &w[..])?; + pub fn garbage_collect(&mut self, living_files: HashSet) { + let mut managed_has_changed: bool = false; + { + let mut 
files_to_delete = vec!(); + let mut managed_paths_write = self.managed_paths.write().unwrap(); + for managed_path in managed_paths_write.iter() { + if !living_files.contains(managed_path) { + files_to_delete.push(managed_path.clone()); + } + } + for file_to_delete in files_to_delete { + match self.directory.delete(&file_to_delete) { + Ok(_) => { + info!("Deleted {:?}", file_to_delete); + managed_has_changed |= managed_paths_write.remove(&file_to_delete); + } + Err(file_error) => { + match file_error { + FileError::FileDoesNotExist(_) => { + managed_has_changed |= managed_paths_write.remove(&file_to_delete); + } + FileError::IOError(_) => { + error!("Failed to delete {:?}", file_to_delete); + } + + } + + } + } + } + } + if managed_has_changed { + if let Err(_) = self.save_managed_paths() { + error!("Failed to save the list of managed files."); + } + } + } + + fn save_managed_paths(&mut self,) -> io::Result<()> { + let managed_files_lock = self.managed_paths + .read() + .expect("Managed file lock poisoned"); + let mut w = vec!(); + try!(write!(&mut w, "{}\n", json::as_pretty_json(&*managed_files_lock))); + self.directory.atomic_write(&MANAGED_FILEPATH, &w[..])?; + Ok(()) + } + + fn register_file_as_managed(&mut self, filepath: &Path) -> io::Result<()> { + let has_changed = { + let mut managed_files_lock = self + .managed_paths + .write() + .expect("Managed file lock poisoned"); + managed_files_lock.insert(filepath.to_owned()) + }; + if has_changed { + self.save_managed_paths()?; } Ok(()) } @@ -46,10 +112,12 @@ impl Directory for ManagedDirectory { } fn open_write(&mut self, path: &Path) -> result::Result { + self.register_file_as_managed(path)?; self.directory.open_write(path) } fn atomic_write(&mut self, path: &Path, data: &[u8]) -> io::Result<()> { + self.register_file_as_managed(path)?; self.directory.atomic_write(path, data) } @@ -78,4 +146,71 @@ impl Clone for ManagedDirectory { managed_paths: self.managed_paths.clone(), } } -} \ No newline at end of file +} + + 
+ + +#[cfg(test)] +mod tests { + + use super::*; + use directory::MmapDirectory; + use std::path::Path; + use std::io::Write; + use tempdir::TempDir; + + lazy_static! { + static ref TEST_PATH1: &'static Path = Path::new("some_path_for_test"); + static ref TEST_PATH2: &'static Path = Path::new("some_path_for_test2"); + } + + #[test] + fn test_managed_directory() { + let tempdir = TempDir::new("index").unwrap(); + let tempdir_path = PathBuf::from(tempdir.path()); + { + let mmap_directory = MmapDirectory::open(&tempdir_path).unwrap(); + let mut managed_directory = ManagedDirectory::new(mmap_directory).unwrap(); + { + let mut write_file = managed_directory.open_write(*TEST_PATH1).unwrap(); + write_file.flush().unwrap(); + } + { + managed_directory.atomic_write(*TEST_PATH2, &vec!(0u8,1u8)).unwrap(); + } + { + assert!(managed_directory.exists(*TEST_PATH1)); + assert!(managed_directory.exists(*TEST_PATH2)); + } + { + let living_files: HashSet = [TEST_PATH1.to_owned()] + .into_iter() + .cloned() + .collect(); + managed_directory.garbage_collect(living_files); + } + { + assert!(managed_directory.exists(*TEST_PATH1)); + assert!(!managed_directory.exists(*TEST_PATH2)); + } + } + { + let mmap_directory = MmapDirectory::open(&tempdir_path).unwrap(); + let mut managed_directory = ManagedDirectory::new(mmap_directory).unwrap(); + { + assert!(managed_directory.exists(*TEST_PATH1)); + assert!(!managed_directory.exists(*TEST_PATH2)); + } + { + let living_files: HashSet = HashSet::new(); + managed_directory.garbage_collect(living_files); + } + { + assert!(!managed_directory.exists(*TEST_PATH1)); + assert!(!managed_directory.exists(*TEST_PATH2)); + } + } + } + +} diff --git a/src/directory/mmap_directory.rs b/src/directory/mmap_directory.rs index 0a8e6f4ac..5015f3bb6 100644 --- a/src/directory/mmap_directory.rs +++ b/src/directory/mmap_directory.rs @@ -13,7 +13,6 @@ use std::convert::From; use std::fmt; use std::fs::{self, File}; use std::fs::OpenOptions; -use std::fs::ReadDir; use 
std::io::{self, Seek, SeekFrom}; use std::io::{BufWriter, Read, Write}; use std::mem; @@ -35,13 +34,24 @@ fn open_mmap(full_path: &PathBuf) -> result::Result>, FileError } }; let file = File::open(&full_path).map_err(convert_file_error)?; - if try!(file.metadata()).len() == 0 { + let meta_data = file + .metadata() + .map_err(|e| FileError::IOError(e))?; + if meta_data.len() == 0 { // if the file size is 0, it will not be possible // to mmap the file, so we return an anonymous mmap_cache // instead. return Ok(None) } - Ok(Some(Arc::new(Mmap::open(&file, Protection::Read)?))) + match Mmap::open(&file, Protection::Read) { + Ok(mmap) => { + Ok(Some(Arc::new(mmap))) + } + Err(e) => { + Err(FileError::IOError(e)) + } + } + } #[derive(Default,Clone,Debug,RustcDecodable,RustcEncodable)] @@ -267,9 +277,9 @@ impl Directory for MmapDirectory { let mut mmap_cache = self.mmap_cache .write() - .map_err(|_| { + .map_err(|_| FileError::IOError( make_io_err(format!("Failed to acquired write lock on mmap cache while reading {:?}", path)) - })?; + ))?; Ok(mmap_cache.get_mmap(full_path)? .map(MmapReadOnly::from) @@ -314,17 +324,27 @@ impl Directory for MmapDirectory { let full_path = self.resolve_path(path); let mut mmap_cache = try!(self.mmap_cache .write() - .map_err(|_| { - make_io_err(format!("Failed to acquired write lock on mmap cache while deleting {:?}", path)) - }) + .map_err(|_| + FileError::IOError(make_io_err(format!("Failed to acquired write lock on mmap cache while deleting {:?}", path)))) ); // Removing the entry in the MMap cache. // The munmap will appear on Drop, // when the last reference is gone. 
mmap_cache.cache.remove(&full_path); - try!(fs::remove_file(&full_path)); - try!(self.sync_directory()); - Ok(()) + match fs::remove_file(&full_path) { + Ok(_) => { + self.sync_directory() + .map_err(|e| FileError::IOError(e)) + } + Err(e) => { + if e.kind() == io::ErrorKind::NotFound { + Err(FileError::FileDoesNotExist(path.to_owned())) + } + else { + Err(FileError::IOError(e)) + } + } + } } fn exists(&self, path: &Path) -> bool { @@ -335,8 +355,22 @@ impl Directory for MmapDirectory { fn atomic_read(&self, path: &Path) -> Result, FileError> { let full_path = self.resolve_path(path); let mut buffer = Vec::new(); - File::open(&full_path)?.read_to_end(&mut buffer)?; - Ok(buffer) + match File::open(&full_path) { + Ok(mut file) => { + file.read_to_end(&mut buffer) + .map_err(|e| FileError::IOError(e))?; + Ok(buffer) + } + Err(e) => { + if e.kind() == io::ErrorKind::NotFound { + Err(FileError::FileDoesNotExist(path.to_owned())) + } + else { + Err(FileError::IOError(e)) + } + } + } + } fn atomic_write(&mut self, path: &Path, data: &[u8]) -> io::Result<()> { diff --git a/src/indexer/index_writer.rs b/src/indexer/index_writer.rs index b7a60db68..0ba779ec3 100644 --- a/src/indexer/index_writer.rs +++ b/src/indexer/index_writer.rs @@ -8,6 +8,7 @@ use core::SegmentMeta; use core::SegmentReader; use datastruct::stacker::Heap; use Error; +use Directory; use fastfield::delete::write_delete_bitset; use indexer::delete_queue::DeleteQueueSnapshot; use futures::Canceled; @@ -285,7 +286,6 @@ impl IndexWriter { /// The thread consumes documents from the pipeline. /// fn add_indexing_worker(&mut self) -> Result<()> { - let index = self.index.clone(); let schema = self.index.schema(); let document_receiver_clone = self.document_receiver.clone(); let mut segment_updater = self.segment_updater.clone(); @@ -311,7 +311,7 @@ impl IndexWriter { // peeked document now belongs to // our local iterator. 
if document_iterator.peek().is_some() { - let segment = index.new_segment(); + let segment = segment_updater.new_segment(); index_documents(&mut heap, segment, &schema, diff --git a/src/indexer/segment_manager.rs b/src/indexer/segment_manager.rs index 2533eeeda..5ba5dd220 100644 --- a/src/indexer/segment_manager.rs +++ b/src/indexer/segment_manager.rs @@ -1,9 +1,11 @@ use super::segment_register::SegmentRegister; use std::sync::RwLock; use core::SegmentMeta; +use core::META_FILEPATH; use core::SegmentId; use indexer::{SegmentEntry, SegmentState}; use std::path::PathBuf; +use std::collections::hash_set::HashSet; use std::sync::{RwLockReadGuard, RwLockWriteGuard}; use std::fmt::{self, Debug, Formatter}; @@ -11,6 +13,7 @@ use std::fmt::{self, Debug, Formatter}; struct SegmentRegisters { uncommitted: SegmentRegister, committed: SegmentRegister, + writing: HashSet, } @@ -51,6 +54,7 @@ impl SegmentManager { registers: RwLock::new(SegmentRegisters { uncommitted: SegmentRegister::default(), committed: SegmentRegister::new(segment_metas), + writing: HashSet::new(), }), } } @@ -67,13 +71,25 @@ impl SegmentManager { segment_entries } - pub fn alive_files(&self) -> Vec { - let mut files = vec!(); - let (segment_meta_uncommitted, segment_meta_committed) = get_segments(self); - for segment_meta in segment_meta_uncommitted + pub fn living_files(&self) -> HashSet { + let registers_lock = self.read(); + let mut files = HashSet::new(); + files.insert(META_FILEPATH.clone()); + + let segment_metas = + registers_lock.committed + .get_segments() .into_iter() - .chain(segment_meta_committed.into_iter()) { - files.extend(segment_meta.alive_files()); + .chain(registers_lock.uncommitted + .get_segments() + .into_iter()) + .chain(registers_lock.writing + .iter() + .cloned() + .map(SegmentMeta::new)); + + for segment_meta in segment_metas { + files.extend(segment_meta.living_files()); } files } @@ -148,8 +164,14 @@ impl SegmentManager { } } + pub fn write_segment(&self, segment_id: SegmentId) 
{ + let mut registers_lock = self.write(); + registers_lock.writing.insert(segment_id); + } + pub fn add_segment(&self, segment_entry: SegmentEntry) { let mut registers_lock = self.write(); + registers_lock.writing.remove(&segment_entry.segment_id()); registers_lock.uncommitted.add_segment_entry(segment_entry); } diff --git a/src/indexer/segment_register.rs b/src/indexer/segment_register.rs index 288b7a95f..b618cade2 100644 --- a/src/indexer/segment_register.rs +++ b/src/indexer/segment_register.rs @@ -4,9 +4,6 @@ use core::SegmentMeta; use std::fmt; use std::fmt::{Debug, Formatter}; use indexer::segment_entry::SegmentEntry; -use std::path::PathBuf; - - /// The segment register keeps track /// of the list of segment, their size as well diff --git a/src/indexer/segment_updater.rs b/src/indexer/segment_updater.rs index b2d1b1ba5..347212c83 100644 --- a/src/indexer/segment_updater.rs +++ b/src/indexer/segment_updater.rs @@ -21,7 +21,6 @@ use indexer::merger::IndexMerger; use indexer::SegmentEntry; use indexer::SegmentSerializer; use Result; -use std::path::PathBuf; use rustc_serialize::json; use schema::Schema; use std::borrow::BorrowMut; @@ -82,10 +81,6 @@ pub fn save_metas(segment_metas: Vec, } -fn garbage_collect_files(directory: &Directory, alive_files: Vec) { - // -} - // The segment update runner is in charge of processing all // of the `SegmentUpdate`s. 
@@ -126,6 +121,15 @@ impl SegmentUpdater { ) } + pub fn new_segment(&self) -> Segment { + let new_segment = self.0.index.new_segment(); + let segment_id = new_segment.id(); + self.run_async(move |segment_updater| { + segment_updater.0.segment_manager.write_segment(segment_id); + }); + new_segment + } + pub fn get_merge_policy(&self) -> Box { self.0.merge_policy.read().unwrap().box_clone() } @@ -181,14 +185,17 @@ impl SegmentUpdater { self.run_async(move |segment_updater| { let segment_metas = segment_updater.purge_deletes().expect("Failed purge deletes"); segment_updater.0.segment_manager.commit(segment_metas); - let mut directory = segment_updater.0.index.directory().box_clone(); - save_metas( + let mut index = segment_updater.0.index.clone(); + { + let directory = index.directory(); + save_metas( segment_updater.0.segment_manager.committed_segment_metas(), - segment_updater.0.index.schema(), + index.schema(), opstamp, - directory.borrow_mut()).expect("Could not save metas."); - let useful_files = segment_updater.0.segment_manager.alive_files(); - garbage_collect_files(&*directory, useful_files); + directory.box_clone().borrow_mut()).expect("Could not save metas."); + } + let living_files = segment_updater.0.segment_manager.living_files(); + index.directory_mut().garbage_collect(living_files); segment_updater.consider_merge_options(); }) diff --git a/src/indexer/segment_writer.rs b/src/indexer/segment_writer.rs index bd324bd97..82df64a84 100644 --- a/src/indexer/segment_writer.rs +++ b/src/indexer/segment_writer.rs @@ -72,7 +72,6 @@ fn posting_from_field_entry<'a>(field_entry: &FieldEntry, heap: &'a Heap) -> Box impl<'a> SegmentWriter<'a> { - /// Creates a new `SegmentWriter` /// /// The arguments are defined as follows From 3a472914ce67870c34f72ccd5cab171ee90fae3d Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Mon, 6 Mar 2017 16:28:30 +0900 Subject: [PATCH 054/107] Fix .write -> .write_all --- src/directory/ram_directory.rs | 2 +- src/fastfield/delete.rs | 4 
++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/directory/ram_directory.rs b/src/directory/ram_directory.rs index 3f798485d..32d44e184 100644 --- a/src/directory/ram_directory.rs +++ b/src/directory/ram_directory.rs @@ -55,7 +55,7 @@ impl Seek for VecWriter { impl Write for VecWriter { fn write(&mut self, buf: &[u8]) -> io::Result { self.is_flushed = false; - try!(self.data.write(buf)); + try!(self.data.write_all(buf)); Ok(buf.len()) } diff --git a/src/fastfield/delete.rs b/src/fastfield/delete.rs index a899af963..83a631a38 100644 --- a/src/fastfield/delete.rs +++ b/src/fastfield/delete.rs @@ -15,7 +15,7 @@ pub fn write_delete_bitset(delete_bitset: &BitSet, writer: &mut WritePtr) -> io: byte |= 1 << shift; } if shift == 7 { - writer.write(&[byte])?; + writer.write_all(&[byte])?; shift = 0; byte = 0; } @@ -24,7 +24,7 @@ pub fn write_delete_bitset(delete_bitset: &BitSet, writer: &mut WritePtr) -> io: } } if max_doc % 8 > 0 { - writer.write(&[byte])?; + writer.write_all(&[byte])?; } writer.flush() } From ebca90476786062cc0bb3030bcf2c301ee1568a7 Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Tue, 7 Mar 2017 09:58:51 +0900 Subject: [PATCH 055/107] NOBUG added rustdoc --- src/core/segment_meta.rs | 8 +++++++- src/directory/managed_directory.rs | 31 ++++++++++++++++++++++++++++++ src/indexer/segment_manager.rs | 4 ++-- src/indexer/segment_updater.rs | 2 +- 4 files changed, 41 insertions(+), 4 deletions(-) diff --git a/src/core/segment_meta.rs b/src/core/segment_meta.rs index 387617f39..a12428b07 100644 --- a/src/core/segment_meta.rs +++ b/src/core/segment_meta.rs @@ -45,7 +45,13 @@ impl SegmentMeta { .unwrap_or(0u32) } - pub fn living_files(&self) -> HashSet { + /// Returns the list of files that + /// are required for the segment meta. + /// + /// This is useful as the way tantivy removes files + /// is by removing all files that have been created by tantivy + /// and are not used by any segment anymore. 
+ pub fn list_files(&self) -> HashSet { SegmentComponent::iterator() .map(|component| { self.relative_path(*component) diff --git a/src/directory/managed_directory.rs b/src/directory/managed_directory.rs index 2dc540229..3eefcf996 100644 --- a/src/directory/managed_directory.rs +++ b/src/directory/managed_directory.rs @@ -12,6 +12,15 @@ use core::MANAGED_FILEPATH; use Result; use Error; +/// Wrapper of directories that keeps track of files created by Tantivy. +/// +/// A managed directory is just a wrapper of a directory +/// that keeps a (persisted) list of the files that +/// have been created (and not deleted) by tantivy so far. +/// +/// Thanks to this list, it implements a `garbage_collect` method +/// that removes the files that were created by tantivy and are not +/// useful anymore. #[derive(Debug)] pub struct ManagedDirectory { directory: Box, @@ -19,6 +28,8 @@ pub struct ManagedDirectory { } impl ManagedDirectory { + + /// Wraps a directory as managed directory. pub fn new(directory: Dir) -> Result { match directory.atomic_read(&MANAGED_FILEPATH) { Ok(data) => { @@ -42,6 +53,17 @@ impl ManagedDirectory { } } + /// Garbage collect unused files. + /// + /// Removes the files that were created by `tantivy` and are not + /// used by any segment anymore. + /// + /// * `living_files` - List of files that are still used by the index. + /// + /// This method does not panick nor returns errors. + /// If a file cannot be deleted (for permission reasons for instance) + /// an error is simply logged, and the file remains in the list of managed + /// files. pub fn garbage_collect(&mut self, living_files: HashSet) { let mut managed_has_changed: bool = false; { @@ -80,6 +102,8 @@ impl ManagedDirectory { } } + /// Saves the file containing the list of existing files + /// that were created by tantivy. 
fn save_managed_paths(&mut self,) -> io::Result<()> { let managed_files_lock = self.managed_paths .read() @@ -90,6 +114,13 @@ impl ManagedDirectory { Ok(()) } + /// Registers a file as managed + /// + /// This method must be called before the file is + /// actually created to ensure that a failure between + /// registering the filepath and creating the file + /// will not lead to garbage files that will + /// never get removed. fn register_file_as_managed(&mut self, filepath: &Path) -> io::Result<()> { let has_changed = { let mut managed_files_lock = self diff --git a/src/indexer/segment_manager.rs b/src/indexer/segment_manager.rs index 5ba5dd220..ad35702ef 100644 --- a/src/indexer/segment_manager.rs +++ b/src/indexer/segment_manager.rs @@ -71,7 +71,7 @@ impl SegmentManager { segment_entries } - pub fn living_files(&self) -> HashSet { + pub fn list_files(&self) -> HashSet { let registers_lock = self.read(); let mut files = HashSet::new(); files.insert(META_FILEPATH.clone()); @@ -89,7 +89,7 @@ impl SegmentManager { .map(SegmentMeta::new)); for segment_meta in segment_metas { - files.extend(segment_meta.living_files()); + files.extend(segment_meta.list_files()); } files } diff --git a/src/indexer/segment_updater.rs b/src/indexer/segment_updater.rs index 347212c83..663e36609 100644 --- a/src/indexer/segment_updater.rs +++ b/src/indexer/segment_updater.rs @@ -194,7 +194,7 @@ impl SegmentUpdater { opstamp, directory.box_clone().borrow_mut()).expect("Could not save metas."); } - let living_files = segment_updater.0.segment_manager.living_files(); + let living_files = segment_updater.0.segment_manager.list_files(); index.directory_mut().garbage_collect(living_files); segment_updater.consider_merge_options(); From a397537ed88f7638216a9efa1eed610a41b11f59 Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Tue, 7 Mar 2017 09:58:51 +0900 Subject: [PATCH 056/107] NOBUG added rustdoc --- src/lib.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/src/lib.rs b/src/lib.rs index 
9aca5e967..81d9acc24 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -8,7 +8,6 @@ #![cfg_attr(test, feature(test))] #![cfg_attr(test, feature(step_by))] #![doc(test(attr(allow(unused_variables), deny(warnings))))] -#![feature(conservative_impl_trait)] #![warn(missing_docs)] From 3d1196d53e9874eb4af4a653af94b43a830193b7 Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Tue, 7 Mar 2017 10:13:54 +0900 Subject: [PATCH 057/107] NOBUG added doc link. --- README.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 054f62f47..75b24b428 100644 --- a/README.md +++ b/README.md @@ -32,7 +32,9 @@ Tantivy supports Linux and MacOS. Windows is not supported. - [tantivy's usage example](http://fulmicoton.com/tantivy-examples/simple_search.html) - [tantivy-cli and its tutorial](https://github.com/tantivy-search/tantivy-cli). It will walk you through getting a wikipedia search engine up and running in a few minutes. -- [reference doc](https://tantivy-search.github.io/tantivy/tantivy/index.html). 
+- [reference doc] + - [For the last released version](https://docs.rs/tantivy/) + - [For the last master branch](https://tantivy-search.github.io/tantivy/tantivy/index.html) # Compiling From 69b3de43f63960b03af24c9809290496e6921318 Mon Sep 17 00:00:00 2001 From: Ashley Mannix Date: Wed, 8 Mar 2017 14:02:48 +1000 Subject: [PATCH 058/107] convert simd wrapper to c --- Cross.toml | 5 +++ build.rs | 5 +-- cpp/simdcomp_wrapper.c | 42 +++++++++++++++++++++++++ cpp/simdcomp_wrapper.cpp | 48 ----------------------------- src/compression/compression_simd.rs | 45 +++++++++++++++------------ 5 files changed, 73 insertions(+), 72 deletions(-) create mode 100644 Cross.toml create mode 100644 cpp/simdcomp_wrapper.c delete mode 100644 cpp/simdcomp_wrapper.cpp diff --git a/Cross.toml b/Cross.toml new file mode 100644 index 000000000..33dc0402d --- /dev/null +++ b/Cross.toml @@ -0,0 +1,5 @@ +[target.x86_64-unknown-linux-gnu] +image = "tantivy-cli-x86_64-unknown-linux-gnu:latest" + +[target.x86_64-unknown-linux-musl] +image = "tantivy-cli-x86_64-unknown-linux-musl:latest" \ No newline at end of file diff --git a/build.rs b/build.rs index ceba328cb..fb7ba8110 100644 --- a/build.rs +++ b/build.rs @@ -10,8 +10,6 @@ mod build { .output() .unwrap_or_else(|e| { panic!("Failed to make simdcomp: {}", e) }); gcc::Config::new() - .cpp(true) - .flag("-std=c++11") .flag("-O3") .flag("-mssse3") .include("./cpp/simdcomp/include") @@ -22,9 +20,8 @@ mod build { .object("cpp/simdcomp/simdcomputil.o") .object("cpp/simdcomp/simdpackedselect.o") .object("cpp/simdcomp/simdfor.o") - .file("cpp/simdcomp_wrapper.cpp") + .file("cpp/simdcomp_wrapper.c") .compile("libsimdcomp.a"); - println!("cargo:rustc-flags=-l dylib=stdc++"); } } diff --git a/cpp/simdcomp_wrapper.c b/cpp/simdcomp_wrapper.c new file mode 100644 index 000000000..5223ea227 --- /dev/null +++ b/cpp/simdcomp_wrapper.c @@ -0,0 +1,42 @@ +#include "simdcomp.h" +#include "simdcomputil.h" + +// assumes datain has a size of 128 uint32 +// and that 
buffer is large enough to host the data. +size_t compress_sorted( + const uint32_t* datain, + uint8_t* output, + const uint32_t offset) { + const uint32_t b = simdmaxbitsd1(offset, datain); + *output++ = b; + simdpackwithoutmaskd1(offset, datain, (__m128i *) output, b); + return 1 + b * sizeof(__m128i);; +} + +// assumes datain has a size of 128 uint32 +// and that buffer is large enough to host the data. +size_t uncompress_sorted( + const uint8_t* compressed_data, + uint32_t* output, + uint32_t offset) { + const uint32_t b = *compressed_data++; + simdunpackd1(offset, (__m128i *)compressed_data, output, b); + return 1 + b * sizeof(__m128i); +} + +size_t compress_unsorted( + const uint32_t* datain, + uint8_t* output) { + const uint32_t b = maxbits(datain); + *output++ = b; + simdpackwithoutmask(datain, (__m128i *) output, b); + return 1 + b * sizeof(__m128i);; +} + +size_t uncompress_unsorted( + const uint8_t* compressed_data, + uint32_t* output) { + const uint32_t b = *compressed_data++; + simdunpack((__m128i *)compressed_data, output, b); + return 1 + b * sizeof(__m128i); +} \ No newline at end of file diff --git a/cpp/simdcomp_wrapper.cpp b/cpp/simdcomp_wrapper.cpp deleted file mode 100644 index bfcaf72ef..000000000 --- a/cpp/simdcomp_wrapper.cpp +++ /dev/null @@ -1,48 +0,0 @@ -#include -#include -#include -#include "simdcomp.h" -#include "simdcomputil.h" - -extern "C" { - - // assumes datain has a size of 128 uint32 - // and that buffer is large enough to host the data. - size_t compress_sorted_cpp( - const uint32_t* datain, - uint8_t* output, - const uint32_t offset) { - const uint32_t b = simdmaxbitsd1(offset, datain); - *output++ = b; - simdpackwithoutmaskd1(offset, datain, (__m128i *) output, b); - return 1 + b * sizeof(__m128i);; - } - - // assumes datain has a size of 128 uint32 - // and that buffer is large enough to host the data. 
- size_t uncompress_sorted_cpp( - const uint8_t* compressed_data, - uint32_t* output, - uint32_t offset) { - const uint32_t b = *compressed_data++; - simdunpackd1(offset, (__m128i *)compressed_data, output, b); - return 1 + b * sizeof(__m128i); - } - - size_t compress_unsorted_cpp( - const uint32_t* datain, - uint8_t* output) { - const uint32_t b = maxbits(datain); - *output++ = b; - simdpackwithoutmask(datain, (__m128i *) output, b); - return 1 + b * sizeof(__m128i);; - } - - size_t uncompress_unsorted_cpp( - const uint8_t* compressed_data, - uint32_t* output) { - const uint32_t b = *compressed_data++; - simdunpack((__m128i *)compressed_data, output, b); - return 1 + b * sizeof(__m128i); - } -} \ No newline at end of file diff --git a/src/compression/compression_simd.rs b/src/compression/compression_simd.rs index 605d0ec03..c12b6455d 100644 --- a/src/compression/compression_simd.rs +++ b/src/compression/compression_simd.rs @@ -5,40 +5,45 @@ use libc::size_t; const COMPRESSED_BLOCK_MAX_SIZE: usize = NUM_DOCS_PER_BLOCK * 4 + 1; -extern { - fn compress_sorted_cpp( - data: *const u32, - output: *mut u8, - offset: u32) -> size_t; +mod simdcomp { + use libc::size_t; - fn uncompress_sorted_cpp( - compressed_data: *const u8, - output: *mut u32, - offset: u32) -> size_t; - - fn compress_unsorted_cpp( - data: *const u32, - output: *mut u8) -> size_t; + #[link(name = "simdcomp")] + extern { + pub fn compress_sorted( + data: *const u32, + output: *mut u8, + offset: u32) -> size_t; - fn uncompress_unsorted_cpp( - compressed_data: *const u8, - output: *mut u32) -> size_t; + pub fn uncompress_sorted( + compressed_data: *const u8, + output: *mut u32, + offset: u32) -> size_t; + + pub fn compress_unsorted( + data: *const u32, + output: *mut u8) -> size_t; + + pub fn uncompress_unsorted( + compressed_data: *const u8, + output: *mut u32) -> size_t; + } } fn compress_sorted(vals: &[u32], output: &mut [u8], offset: u32) -> usize { - unsafe { compress_sorted_cpp(vals.as_ptr(), 
output.as_mut_ptr(), offset) } + unsafe { simdcomp::compress_sorted(vals.as_ptr(), output.as_mut_ptr(), offset) } } fn uncompress_sorted(compressed_data: &[u8], output: &mut [u32], offset: u32) -> usize { - unsafe { uncompress_sorted_cpp(compressed_data.as_ptr(), output.as_mut_ptr(), offset) } + unsafe { simdcomp::uncompress_sorted(compressed_data.as_ptr(), output.as_mut_ptr(), offset) } } fn compress_unsorted(vals: &[u32], output: &mut [u8]) -> usize { - unsafe { compress_unsorted_cpp(vals.as_ptr(), output.as_mut_ptr()) } + unsafe { simdcomp::compress_unsorted(vals.as_ptr(), output.as_mut_ptr()) } } fn uncompress_unsorted(compressed_data: &[u8], output: &mut [u32]) -> usize { - unsafe { uncompress_unsorted_cpp(compressed_data.as_ptr(), output.as_mut_ptr()) } + unsafe { simdcomp::uncompress_unsorted(compressed_data.as_ptr(), output.as_mut_ptr()) } } From ac3890f93c9624fda8fbaa26e747db41b9a59550 Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Wed, 8 Mar 2017 19:08:29 +0900 Subject: [PATCH 059/107] NOBUG Marked the functional test as ignore --- src/functional_test.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/src/functional_test.rs b/src/functional_test.rs index 53d8fcd11..e8f8797dd 100644 --- a/src/functional_test.rs +++ b/src/functional_test.rs @@ -12,6 +12,7 @@ fn check_index_content(searcher: &Searcher, vals: &HashSet) { } #[test] +#[ignore] fn test_indexing() { let mut schema_builder = SchemaBuilder::default(); From 324b56a60cc3054e4f71229296b6b59eff30ee57 Mon Sep 17 00:00:00 2001 From: Ashley Mannix Date: Thu, 9 Mar 2017 06:54:48 +1000 Subject: [PATCH 060/107] fix warnings --- Cross.toml | 5 ----- src/compression/compression_simd.rs | 3 --- 2 files changed, 8 deletions(-) delete mode 100644 Cross.toml diff --git a/Cross.toml b/Cross.toml deleted file mode 100644 index 33dc0402d..000000000 --- a/Cross.toml +++ /dev/null @@ -1,5 +0,0 @@ -[target.x86_64-unknown-linux-gnu] -image = "tantivy-cli-x86_64-unknown-linux-gnu:latest" - 
-[target.x86_64-unknown-linux-musl] -image = "tantivy-cli-x86_64-unknown-linux-musl:latest" \ No newline at end of file diff --git a/src/compression/compression_simd.rs b/src/compression/compression_simd.rs index c12b6455d..9b8802927 100644 --- a/src/compression/compression_simd.rs +++ b/src/compression/compression_simd.rs @@ -1,8 +1,5 @@ - use super::NUM_DOCS_PER_BLOCK; -use libc::size_t; - const COMPRESSED_BLOCK_MAX_SIZE: usize = NUM_DOCS_PER_BLOCK * 4 + 1; mod simdcomp { From 7532c4a4409faf912852650ca9dc1795fe9f7e6e Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Thu, 9 Mar 2017 10:57:30 +0900 Subject: [PATCH 061/107] Removed double ; --- cpp/simdcomp_wrapper.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cpp/simdcomp_wrapper.c b/cpp/simdcomp_wrapper.c index 5223ea227..4530e3f3b 100644 --- a/cpp/simdcomp_wrapper.c +++ b/cpp/simdcomp_wrapper.c @@ -10,7 +10,7 @@ size_t compress_sorted( const uint32_t b = simdmaxbitsd1(offset, datain); *output++ = b; simdpackwithoutmaskd1(offset, datain, (__m128i *) output, b); - return 1 + b * sizeof(__m128i);; + return 1 + b * sizeof(__m128i); } // assumes datain has a size of 128 uint32 @@ -30,7 +30,7 @@ size_t compress_unsorted( const uint32_t b = maxbits(datain); *output++ = b; simdpackwithoutmask(datain, (__m128i *) output, b); - return 1 + b * sizeof(__m128i);; + return 1 + b * sizeof(__m128i); } size_t uncompress_unsorted( @@ -39,4 +39,4 @@ size_t uncompress_unsorted( const uint32_t b = *compressed_data++; simdunpack((__m128i *)compressed_data, output, b); return 1 + b * sizeof(__m128i); -} \ No newline at end of file +} From cc2f78184bf3e298e2efd70175e67bff66f7b7c7 Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Tue, 28 Feb 2017 20:43:16 +0900 Subject: [PATCH 062/107] Added unit test for #96 --- src/indexer/merge_policy.rs | 25 +++++++++++++ src/indexer/segment_updater.rs | 65 ++++++++++++++++++++++++++++++++++ 2 files changed, 90 insertions(+) diff --git a/src/indexer/merge_policy.rs 
b/src/indexer/merge_policy.rs index ae1064355..5e3adcfb8 100644 --- a/src/indexer/merge_policy.rs +++ b/src/indexer/merge_policy.rs @@ -43,3 +43,28 @@ impl MergePolicy for NoMergePolicy { } } + +#[cfg(test)] +pub mod tests { + + use super::*; + use core::SegmentId; + use core::SegmentMeta; + + #[derive(Debug)] + pub struct MergeWheneverPossible; + + impl MergePolicy for MergeWheneverPossible { + fn compute_merge_candidates(&self, segment_metas: &[SegmentMeta]) -> Vec { + let segment_ids = segment_metas + .iter() + .map(|segment_meta| segment_meta.id()) + .collect::>(); + vec!(MergeCandidate(segment_ids)) + } + + fn box_clone(&self) -> Box { + box MergeWheneverPossible + } + } +} \ No newline at end of file diff --git a/src/indexer/segment_updater.rs b/src/indexer/segment_updater.rs index 663e36609..1c300ef5c 100644 --- a/src/indexer/segment_updater.rs +++ b/src/indexer/segment_updater.rs @@ -323,3 +323,68 @@ impl SegmentUpdater { } } + + + + +#[cfg(test)] +mod tests { + + use Index; + use schema::*; + use indexer::merge_policy::tests::MergeWheneverPossible; + + #[test] + fn test_delete_during_merge() { + let mut schema_builder = SchemaBuilder::default(); + let text_field = schema_builder.add_text_field("text", TEXT); + let schema = schema_builder.build(); + + let index = Index::create_in_ram(schema); + + // writing the segment + let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap(); + index_writer.set_merge_policy(box MergeWheneverPossible); + + { + for i in 0..100 { + index_writer.add_document(doc!(text_field=>"a")); + index_writer.add_document(doc!(text_field=>"b")); + } + assert!(index_writer.commit().is_ok()); + } + + { + for i in 0..100 { + index_writer.add_document(doc!(text_field=>"c")); + index_writer.add_document(doc!(text_field=>"d")); + } + assert!(index_writer.commit().is_ok()); + } + + { + index_writer.add_document(doc!(text_field=>"e")); + index_writer.add_document(doc!(text_field=>"f")); + 
assert!(index_writer.commit().is_ok()); + } + + { + let term = Term::from_field_text(text_field, "a"); + index_writer.delete_term(term); + assert!(index_writer.commit().is_ok()); + } + + index.load_searchers(); + assert_eq!(index.searcher().segment_readers().len(), 3); + assert_eq!(index.searcher().num_docs(), 302); + + { + index_writer.wait_merging_threads() + .expect("waiting for merging threads"); + } + + index.load_searchers(); + assert_eq!(index.searcher().segment_readers().len(), 2); + assert_eq!(index.searcher().num_docs(), 302); + } +} \ No newline at end of file From b7f026bab943daeda550b5c182a9cb4d0e70cc1d Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Tue, 28 Feb 2017 23:39:18 +0900 Subject: [PATCH 063/107] Merger returns a SegmentMeta --- src/indexer/index_writer.rs | 2 +- src/indexer/segment_manager.rs | 3 ++- src/indexer/segment_updater.rs | 23 +++++++++++------------ 3 files changed, 14 insertions(+), 14 deletions(-) diff --git a/src/indexer/index_writer.rs b/src/indexer/index_writer.rs index 0ba779ec3..8285a132f 100644 --- a/src/indexer/index_writer.rs +++ b/src/indexer/index_writer.rs @@ -352,7 +352,7 @@ impl IndexWriter { } /// Merges a given list of segments - pub fn merge(&mut self, segment_ids: &[SegmentId]) -> impl Future { + pub fn merge(&mut self, segment_ids: &[SegmentId]) -> impl Future { self.segment_updater.start_merge(segment_ids) } diff --git a/src/indexer/segment_manager.rs b/src/indexer/segment_manager.rs index ad35702ef..1991ebda4 100644 --- a/src/indexer/segment_manager.rs +++ b/src/indexer/segment_manager.rs @@ -175,9 +175,10 @@ impl SegmentManager { registers_lock.uncommitted.add_segment_entry(segment_entry); } - pub fn end_merge(&self, merged_segment_metas: &[SegmentMeta], merged_segment_entry: SegmentEntry) { + pub fn end_merge(&self, merged_segment_metas: &[SegmentMeta], merged_segment_meta: SegmentMeta) { let mut registers_lock = self.write(); let merged_segment_ids: Vec = merged_segment_metas.iter().map(|meta| 
meta.id()).collect(); + let merged_segment_entry = SegmentEntry::new(merged_segment_meta); if registers_lock.uncommitted.contains_all(&merged_segment_ids) { for segment_id in &merged_segment_ids { registers_lock.uncommitted.remove_segment(segment_id); diff --git a/src/indexer/segment_updater.rs b/src/indexer/segment_updater.rs index 1c300ef5c..4d3675a26 100644 --- a/src/indexer/segment_updater.rs +++ b/src/indexer/segment_updater.rs @@ -97,7 +97,7 @@ struct InnerSegmentUpdater { segment_manager: SegmentManager, merge_policy: RwLock>, merging_thread_id: AtomicUsize, - merging_threads: RwLock>>>, + merging_threads: RwLock>>>, generation: AtomicUsize, delete_queue: DeleteQueue, } @@ -202,7 +202,7 @@ impl SegmentUpdater { } - pub fn start_merge(&self, segment_ids: &[SegmentId]) -> impl Future { + pub fn start_merge(&self, segment_ids: &[SegmentId]) -> impl Future { self.0.segment_manager.start_merge(segment_ids); let segment_updater_clone = self.clone(); @@ -263,14 +263,13 @@ impl SegmentUpdater { let mut segment_meta = SegmentMeta::new(merged_segment.id()); segment_meta.set_max_doc(num_docs); - let segment_entry = SegmentEntry::new(segment_meta); segment_updater_clone - .end_merge(segment_metas.clone(), segment_entry.clone()) + .end_merge(segment_metas.clone(), segment_meta.clone()) .wait() .unwrap(); - merging_future_send.complete(segment_entry.clone()); + merging_future_send.complete(segment_meta); segment_updater_clone.0.merging_threads.write().unwrap().remove(&merging_thread_id); - Ok(segment_entry) + Ok(()) }); self.0.merging_threads.write().unwrap().insert(merging_thread_id, merging_join_handle); merging_future_recv @@ -293,10 +292,10 @@ impl SegmentUpdater { fn end_merge(&self, merged_segment_metas: Vec, - resulting_segment_entry: SegmentEntry) -> impl Future { + segment_meta: SegmentMeta) -> impl Future { self.run_async(move |segment_updater| { - segment_updater.0.segment_manager.end_merge(&merged_segment_metas, resulting_segment_entry); + 
segment_updater.0.segment_manager.end_merge(&merged_segment_metas, segment_meta); let mut directory = segment_updater.0.index.directory().box_clone(); let segment_metas = segment_updater.0.segment_manager.committed_segment_metas(); save_metas( @@ -347,7 +346,7 @@ mod tests { index_writer.set_merge_policy(box MergeWheneverPossible); { - for i in 0..100 { + for _ in 0..100 { index_writer.add_document(doc!(text_field=>"a")); index_writer.add_document(doc!(text_field=>"b")); } @@ -355,7 +354,7 @@ mod tests { } { - for i in 0..100 { + for _ in 0..100 { index_writer.add_document(doc!(text_field=>"c")); index_writer.add_document(doc!(text_field=>"d")); } @@ -374,7 +373,7 @@ mod tests { assert!(index_writer.commit().is_ok()); } - index.load_searchers(); + index.load_searchers().unwrap(); assert_eq!(index.searcher().segment_readers().len(), 3); assert_eq!(index.searcher().num_docs(), 302); @@ -383,7 +382,7 @@ mod tests { .expect("waiting for merging threads"); } - index.load_searchers(); + index.load_searchers().unwrap(); assert_eq!(index.searcher().segment_readers().len(), 2); assert_eq!(index.searcher().num_docs(), 302); } From 77c61ddab249b960f551ccbe677d24c886d841a0 Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Sat, 11 Mar 2017 14:20:46 +0900 Subject: [PATCH 064/107] Baby step1 --- src/indexer/delete_queue.rs | 119 ++++++++++++++------------------- src/indexer/index_writer.rs | 10 +-- src/indexer/segment_updater.rs | 47 +++++++------ 3 files changed, 83 insertions(+), 93 deletions(-) diff --git a/src/indexer/delete_queue.rs b/src/indexer/delete_queue.rs index 0c5dedba1..634a81dc6 100644 --- a/src/indexer/delete_queue.rs +++ b/src/indexer/delete_queue.rs @@ -1,71 +1,55 @@ use super::operation::DeleteOperation; use std::sync::{Arc, RwLock}; -use std::mem; /// This implementation assumes that we /// have a lot more write operation than read operations. 
-#[derive(Default)] -struct InnerDeleteQueue { - ro_chunks: DeleteQueueSnapshot, - last_chunk: Vec, + +type InnerDeleteQueue = Arc>>; + +// TODO very inefficient. +// fix this once the refactoring/bugfix is done +#[derive(Clone)] +pub struct DeleteCursor { + cursor: usize, + operations: InnerDeleteQueue, } -impl InnerDeleteQueue { - pub fn push(&mut self, delete_operation: DeleteOperation) { - self.last_chunk.push(delete_operation); - } - - pub fn snapshot(&mut self,) -> DeleteQueueSnapshot { - if self.last_chunk.len() > 0 { - let new_operations = vec!(); - let new_ro_chunk = mem::replace(&mut self.last_chunk, new_operations); - self.ro_chunks.push(new_ro_chunk) +// TODO remove copy +impl Iterator for DeleteCursor { + + type Item=DeleteOperation; + + fn next(&mut self) -> Option{ + let read = self.operations.read().unwrap(); + if self.cursor >= read.len() { + None + } + else { + let operation = read[self.cursor].clone(); + self.cursor += 1; + Some(operation) } - self.ro_chunks.clone() - } - - pub fn clear(&mut self) { - self.ro_chunks.clear(); - self.last_chunk.clear(); - } -} - - - -#[derive(Default, Clone)] -pub struct DeleteQueueSnapshot(Vec>>); - -impl DeleteQueueSnapshot { - fn push(&mut self, operations: Vec) { - self.0.push(Arc::new(operations)); - } - - pub fn iter<'a>(&'a self) -> impl Iterator { - self.0 - .iter() - .flat_map(|chunk| chunk.iter()) - } - - pub fn clear(&mut self) { - self.0.clear(); } } #[derive(Clone, Default)] -pub struct DeleteQueue(Arc>); +pub struct DeleteQueue(InnerDeleteQueue); impl DeleteQueue { pub fn push(&self, delete_operation: DeleteOperation) { self.0.write().unwrap().push(delete_operation); } - pub fn snapshot(&self) -> DeleteQueueSnapshot { - self.0.write().unwrap().snapshot() + pub fn clear(&mut self) { + self.0.write().unwrap().clear(); } - pub fn clear(&self) { - self.0.write().unwrap().clear(); + pub fn cursor(&self) -> DeleteCursor { + DeleteCursor { + cursor: 0, + operations: self.0.clone(), + } } } @@ -90,34 +74,35 
@@ mod tests { delete_queue.push(make_op(1)); delete_queue.push(make_op(2)); - let snapshot = delete_queue.snapshot(); + let snapshot = delete_queue.cursor(); { - let mut operations_it = snapshot.iter(); + let mut operations_it = snapshot.clone(); assert_eq!(operations_it.next().unwrap().opstamp, 1); assert_eq!(operations_it.next().unwrap().opstamp, 2); assert!(operations_it.next().is_none()); } - { // iterating does not consume results. - let mut operations_it = snapshot.iter(); + { + let mut operations_it = snapshot.clone(); assert_eq!(operations_it.next().unwrap().opstamp, 1); assert_eq!(operations_it.next().unwrap().opstamp, 2); assert!(operations_it.next().is_none()); } - // operations does not own a lock on the queue. - delete_queue.push(make_op(3)); - let snapshot2 = delete_queue.snapshot(); - { - // operations is not affected by - // the push that occurs after. - let mut operations_it = snapshot.iter(); - let mut operations2_it = snapshot2.iter(); - assert_eq!(operations_it.next().unwrap().opstamp, 1); - assert_eq!(operations2_it.next().unwrap().opstamp, 1); - assert_eq!(operations_it.next().unwrap().opstamp, 2); - assert_eq!(operations2_it.next().unwrap().opstamp, 2); - assert!(operations_it.next().is_none()); - assert_eq!(operations2_it.next().unwrap().opstamp, 3); - assert!(operations2_it.next().is_none()); - } + + // // operations does not own a lock on the queue. + // delete_queue.push(make_op(3)); + // let snapshot2 = delete_queue.snapshot(); + // { + // // operations is not affected by + // // the push that occurs after. 
+ // let mut operations_it = snapshot.iter(); + // let mut operations2_it = snapshot2.iter(); + // assert_eq!(operations_it.next().unwrap().opstamp, 1); + // assert_eq!(operations2_it.next().unwrap().opstamp, 1); + // assert_eq!(operations_it.next().unwrap().opstamp, 2); + // assert_eq!(operations2_it.next().unwrap().opstamp, 2); + // assert!(operations_it.next().is_none()); + // assert_eq!(operations2_it.next().unwrap().opstamp, 3); + // assert!(operations2_it.next().is_none()); + // } } } \ No newline at end of file diff --git a/src/indexer/index_writer.rs b/src/indexer/index_writer.rs index 8285a132f..732b49bcb 100644 --- a/src/indexer/index_writer.rs +++ b/src/indexer/index_writer.rs @@ -10,7 +10,7 @@ use datastruct::stacker::Heap; use Error; use Directory; use fastfield::delete::write_delete_bitset; -use indexer::delete_queue::DeleteQueueSnapshot; +use indexer::delete_queue::DeleteCursor; use futures::Canceled; use futures::Future; use indexer::delete_queue::DeleteQueue; @@ -119,7 +119,7 @@ pub fn open_index_writer(index: &Index, let delete_queue = DeleteQueue::default(); - let segment_updater = SegmentUpdater::new(index.clone(), delete_queue.clone())?; + let segment_updater = SegmentUpdater::new(index.clone())?; let mut index_writer = IndexWriter { @@ -158,7 +158,7 @@ pub fn open_index_writer(index: &Index, pub fn advance_deletes( segment: &mut Segment, - delete_operations: &DeleteQueueSnapshot, + delete_cursor: DeleteCursor, doc_opstamps: &DocToOpstampMapping) -> Result { @@ -170,7 +170,7 @@ pub fn advance_deletes( let previous_delete_opstamp_opt = segment.meta().delete_opstamp(); - for delete_op in delete_operations.iter() { + for delete_op in delete_cursor { // let's skip operations that have already been deleted.0u32 if let Some(previous_delete_opstamp) = previous_delete_opstamp_opt { @@ -480,7 +480,7 @@ impl IndexWriter { // wait for the segment update thread to have processed the info self.segment_updater - .commit(self.committed_opstamp) + 
.commit(self.committed_opstamp, self.delete_queue.cursor()) .wait()?; self.delete_queue.clear(); diff --git a/src/indexer/segment_updater.rs b/src/indexer/segment_updater.rs index 4d3675a26..e57c0fece 100644 --- a/src/indexer/segment_updater.rs +++ b/src/indexer/segment_updater.rs @@ -14,7 +14,6 @@ use futures::{Future, future}; use futures::Canceled; use futures::oneshot; use indexer::{MergePolicy, DefaultMergePolicy}; -use indexer::delete_queue::DeleteQueue; use indexer::index_writer::advance_deletes; use indexer::MergeCandidate; use indexer::merger::IndexMerger; @@ -22,6 +21,7 @@ use indexer::SegmentEntry; use indexer::SegmentSerializer; use Result; use rustc_serialize::json; +use indexer::delete_queue::DeleteCursor; use schema::Schema; use std::borrow::BorrowMut; use std::collections::HashMap; @@ -99,12 +99,11 @@ struct InnerSegmentUpdater { merging_thread_id: AtomicUsize, merging_threads: RwLock>>>, generation: AtomicUsize, - delete_queue: DeleteQueue, } impl SegmentUpdater { - pub fn new(index: Index, delete_queue: DeleteQueue) -> Result { + pub fn new(index: Index) -> Result { let segments = index.segments()?; let segment_manager = SegmentManager::from_segments(segments); Ok( @@ -116,7 +115,6 @@ impl SegmentUpdater { merging_thread_id: AtomicUsize::default(), merging_threads: RwLock::new(HashMap::new()), generation: AtomicUsize::default(), - delete_queue: delete_queue, })) ) } @@ -170,20 +168,22 @@ impl SegmentUpdater { } } - fn purge_deletes(&self) -> Result> { - self.0.segment_manager - .segment_entries() - .into_iter() - .map(|segment_entry| { - let mut segment = self.0.index.segment(segment_entry.meta().clone()); - advance_deletes(&mut segment, &self.0.delete_queue.snapshot(), segment_entry.doc_to_opstamp()) - }) - .collect() + fn purge_deletes(&self, delete_cursor: DeleteCursor) -> Result> { + let mut segment_metas = vec!(); + for segment_entry in self.0.segment_manager.segment_entries() { + let mut segment = 
self.0.index.segment(segment_entry.meta().clone()); + let delete_cursor = delete_cursor.clone(); + // TODO delete cursor skip... + let segment_meta = advance_deletes(&mut segment, delete_cursor, segment_entry.doc_to_opstamp())?; + segment_metas.push(segment_meta); + } + Ok(segment_metas) + } - pub fn commit(&self, opstamp: u64) -> impl Future { + pub fn commit(&self, opstamp: u64, delete_cursor: DeleteCursor) -> impl Future { self.run_async(move |segment_updater| { - let segment_metas = segment_updater.purge_deletes().expect("Failed purge deletes"); + let segment_metas = segment_updater.purge_deletes(delete_cursor).expect("Failed purge deletes"); segment_updater.0.segment_manager.commit(segment_metas); let mut index = segment_updater.0.index.clone(); { @@ -212,7 +212,7 @@ impl SegmentUpdater { let merging_thread_id = self.get_merging_thread_id(); let (merging_future_send, merging_future_recv) = oneshot(); - let delete_operations = self.0.delete_queue.snapshot(); + // let delete_operations = self.0.delete_queue.snapshot(); if segment_ids.is_empty() { return merging_future_recv; @@ -231,11 +231,16 @@ impl SegmentUpdater { if let Some(segment_entry) = segment_updater_clone.0 .segment_manager .segment_entry(segment_id) { - let mut segment = index.segment(segment_entry.meta().clone()); - let segment_meta = advance_deletes( - &mut segment, - &delete_operations, - segment_entry.doc_to_opstamp())?; + + // TODOS make sure that the segment are in the same + // position with regard to deletes. 
+ + // let mut segment = index.segment(segment_entry.meta().clone()); + // let segment_meta = advance_deletes( + // &mut segment, + // &delete_operations, + // segment_entry.doc_to_opstamp())?; + let segment_meta = segment_entry.meta().clone(); segment_metas.push(segment_meta); } else { From 7c971b5d3b79b601ba69a0725cb6002b79b06de0 Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Sat, 11 Mar 2017 16:14:20 +0900 Subject: [PATCH 065/107] baby step 2 --- src/indexer/delete_queue.rs | 14 ++++++- src/indexer/index_writer.rs | 42 ++++++++++++------- src/indexer/merger.rs | 1 + src/indexer/segment_entry.rs | 20 +++++++-- src/indexer/segment_manager.rs | 48 +++++++++++----------- src/indexer/segment_register.rs | 27 ++++++------ src/indexer/segment_updater.rs | 73 +++++++++++++++++++-------------- 7 files changed, 139 insertions(+), 86 deletions(-) diff --git a/src/indexer/delete_queue.rs b/src/indexer/delete_queue.rs index 634a81dc6..beb026654 100644 --- a/src/indexer/delete_queue.rs +++ b/src/indexer/delete_queue.rs @@ -15,6 +15,13 @@ pub struct DeleteCursor { operations: InnerDeleteQueue, } +impl DeleteCursor { + pub fn go_to_tail(&mut self,) { + let read = self.operations.read().unwrap(); + self.cursor = read.len(); + } +} + // TODO remove copy impl Iterator for DeleteCursor { @@ -37,6 +44,11 @@ impl Iterator for DeleteCursor { pub struct DeleteQueue(InnerDeleteQueue); impl DeleteQueue { + + pub fn new() -> DeleteQueue { + DeleteQueue::default() + } + pub fn push(&self, delete_operation: DeleteOperation) { self.0.write().unwrap().push(delete_operation); } @@ -61,7 +73,7 @@ mod tests { #[test] fn test_deletequeue() { - let delete_queue = DeleteQueue::default(); + let delete_queue = DeleteQueue::new(); let make_op = |i: usize| { let field = Field(1u8); diff --git a/src/indexer/index_writer.rs b/src/indexer/index_writer.rs index 732b49bcb..3b77cbd13 100644 --- a/src/indexer/index_writer.rs +++ b/src/indexer/index_writer.rs @@ -10,10 +10,9 @@ use 
datastruct::stacker::Heap; use Error; use Directory; use fastfield::delete::write_delete_bitset; -use indexer::delete_queue::DeleteCursor; +use indexer::delete_queue::{DeleteCursor, DeleteQueue}; use futures::Canceled; use futures::Future; -use indexer::delete_queue::DeleteQueue; use indexer::doc_opstamp_mapping::DocToOpstampMapping; use indexer::MergePolicy; use indexer::operation::DeleteOperation; @@ -117,9 +116,9 @@ pub fn open_index_writer(index: &Index, chan::sync(PIPELINE_MAX_SIZE_IN_DOCS); - let delete_queue = DeleteQueue::default(); + let delete_queue = DeleteQueue::new(); - let segment_updater = SegmentUpdater::new(index.clone())?; + let segment_updater = SegmentUpdater::new(index.clone(), delete_queue.cursor())?; let mut index_writer = IndexWriter { @@ -156,12 +155,12 @@ pub fn open_index_writer(index: &Index, // TODO skip delete operation before teh // last delete opstamp -pub fn advance_deletes( - segment: &mut Segment, - delete_cursor: DeleteCursor, - doc_opstamps: &DocToOpstampMapping) -> Result { +pub fn advance_deletes(mut segment: Segment, segment_entry: &mut SegmentEntry) -> Result<()> { + + { + let doc_opstamps = segment_entry.reset_doc_to_stamp(); + let delete_cursor = segment_entry.delete_cursor(); - let segment_reader = SegmentReader::open(segment.clone())?; let mut delete_bitset = BitSet::with_capacity(segment_reader.max_doc() as usize); @@ -172,6 +171,8 @@ pub fn advance_deletes( for delete_op in delete_cursor { + println!("opstamp {:?}", delete_op.opstamp); + // let's skip operations that have already been deleted.0u32 if let Some(previous_delete_opstamp) = previous_delete_opstamp_opt { if delete_op.opstamp <= previous_delete_opstamp { @@ -211,8 +212,10 @@ pub fn advance_deletes( let mut delete_file = segment.open_write(SegmentComponent::DELETE)?; write_delete_bitset(&delete_bitset, &mut delete_file)?; } + } + segment_entry.set_meta(segment.meta().clone()); - Ok(segment.meta().clone()) + Ok(()) } fn index_documents(heap: &mut Heap, @@ 
-220,7 +223,8 @@ fn index_documents(heap: &mut Heap, schema: &Schema, generation: usize, document_iterator: &mut Iterator, - segment_updater: &mut SegmentUpdater) + segment_updater: &mut SegmentUpdater, + delete_cursor: DeleteCursor) -> Result { heap.clear(); let segment_id = segment.id(); @@ -245,9 +249,9 @@ fn index_documents(heap: &mut Heap, let mut segment_meta = SegmentMeta::new(segment_id); segment_meta.set_max_doc(num_docs); - let mut segment_entry = SegmentEntry::new(segment_meta); + let mut segment_entry = SegmentEntry::new(segment_meta, delete_cursor); segment_entry.set_doc_to_opstamp(DocToOpstampMapping::from(doc_opstamps)); - + segment_updater .add_segment(generation, segment_entry) .wait() @@ -292,6 +296,8 @@ impl IndexWriter { let mut heap = Heap::with_capacity(self.heap_size_in_bytes_per_thread); let generation = self.generation; + + let mut delete_cursor = self.delete_queue.cursor(); let join_handle: JoinHandle> = thread::Builder::new() @@ -299,9 +305,14 @@ impl IndexWriter { .spawn(move || { loop { + + let mut document_iterator = document_receiver_clone.clone() .into_iter() .peekable(); + + // we consume all previous delete operations. + delete_cursor.go_to_tail(); // the peeking here is to avoid // creating a new segment's files @@ -317,7 +328,8 @@ impl IndexWriter { &schema, generation, &mut document_iterator, - &mut segment_updater)?; + &mut segment_updater, + delete_cursor.clone())?; } else { // No more documents. 
@@ -480,7 +492,7 @@ impl IndexWriter { // wait for the segment update thread to have processed the info self.segment_updater - .commit(self.committed_opstamp, self.delete_queue.cursor()) + .commit(self.committed_opstamp) .wait()?; self.delete_queue.clear(); diff --git a/src/indexer/merger.rs b/src/indexer/merger.rs index b7284af96..0a4c21342 100644 --- a/src/indexer/merger.rs +++ b/src/indexer/merger.rs @@ -494,6 +494,7 @@ mod tests { let searcher = index.searcher(); assert_eq!(searcher.segment_readers().len(), 2); assert_eq!(searcher.num_docs(), 3); + assert_eq!(searcher.segment_readers()[0].num_docs(), 1); assert_eq!(searcher.segment_readers()[0].max_doc(), 3); assert_eq!(searcher.segment_readers()[1].num_docs(), 2); diff --git a/src/indexer/segment_entry.rs b/src/indexer/segment_entry.rs index 0c12e72ba..e18e84d47 100644 --- a/src/indexer/segment_entry.rs +++ b/src/indexer/segment_entry.rs @@ -1,7 +1,9 @@ use indexer::doc_opstamp_mapping::DocToOpstampMapping; use core::SegmentMeta; +use indexer::delete_queue::DeleteCursor; use core::SegmentId; use std::fmt; +use std::mem; #[derive(Clone, Copy, PartialEq, Eq, Debug)] pub enum SegmentState { @@ -23,20 +25,32 @@ pub struct SegmentEntry { meta: SegmentMeta, state: SegmentState, doc_to_opstamp: DocToOpstampMapping, + delete_cursor: DeleteCursor, + } impl SegmentEntry { - pub fn new(segment_meta: SegmentMeta) -> SegmentEntry { + pub fn new(segment_meta: SegmentMeta, + delete_cursor: DeleteCursor) -> SegmentEntry { SegmentEntry { meta: segment_meta, state: SegmentState::Ready, doc_to_opstamp: DocToOpstampMapping::None, + delete_cursor: delete_cursor, } } - pub fn doc_to_opstamp(&self) -> &DocToOpstampMapping { - &self.doc_to_opstamp + pub fn reset_doc_to_stamp(&mut self,) -> DocToOpstampMapping { + mem::replace(&mut self.doc_to_opstamp, DocToOpstampMapping::None) + } + + pub fn set_meta(&mut self, segment_meta: SegmentMeta) { + self.meta = segment_meta; + } + + pub fn delete_cursor(&mut self) -> &mut DeleteCursor { + 
&mut self.delete_cursor } pub fn state(&self) -> SegmentState { diff --git a/src/indexer/segment_manager.rs b/src/indexer/segment_manager.rs index 1991ebda4..7ac353fb5 100644 --- a/src/indexer/segment_manager.rs +++ b/src/indexer/segment_manager.rs @@ -8,6 +8,7 @@ use std::path::PathBuf; use std::collections::hash_set::HashSet; use std::sync::{RwLockReadGuard, RwLockWriteGuard}; use std::fmt::{self, Debug, Formatter}; +use indexer::delete_queue::DeleteCursor; #[derive(Default)] struct SegmentRegisters { @@ -49,11 +50,11 @@ pub fn get_segments(segment_manager: &SegmentManager,) -> (Vec, Vec impl SegmentManager { - pub fn from_segments(segment_metas: Vec) -> SegmentManager { + pub fn from_segments(segment_metas: Vec, delete_cursor: DeleteCursor) -> SegmentManager { SegmentManager { registers: RwLock::new(SegmentRegisters { uncommitted: SegmentRegister::default(), - committed: SegmentRegister::new(segment_metas), + committed: SegmentRegister::new(segment_metas, delete_cursor), writing: HashSet::new(), }), } @@ -127,22 +128,19 @@ impl SegmentManager { segment_ids } - pub fn commit(&self, segment_metas: Vec) { - let committed_segment_entries = segment_metas - .into_iter() - .map(|segment_meta| { - let segment_id = segment_meta.id(); - let mut segment_entry = SegmentEntry::new(segment_meta); - if let Some(state) = self.segment_state(&segment_id) { - segment_entry.set_state(state); - } - segment_entry - }) - .collect::>(); + pub fn commit(&self, mut segment_entries: Vec) { + // TODO is still relevant!? 
+ // restore the state of the segment_entries + for segment_entry in &mut segment_entries { + let segment_id = segment_entry.segment_id(); + if let Some(state) = self.segment_state(&segment_id) { + segment_entry.set_state(state); + } + } let mut registers_lock = self.write(); registers_lock.committed.clear(); registers_lock.uncommitted.clear(); - for segment_entry in committed_segment_entries { + for segment_entry in segment_entries { registers_lock.committed.add_segment_entry(segment_entry); } } @@ -175,21 +173,23 @@ impl SegmentManager { registers_lock.uncommitted.add_segment_entry(segment_entry); } - pub fn end_merge(&self, merged_segment_metas: &[SegmentMeta], merged_segment_meta: SegmentMeta) { + pub fn end_merge(&self, + before_merge_segment_ids: &[SegmentId], + after_merge_segment_entry: SegmentEntry) { + let mut registers_lock = self.write(); - let merged_segment_ids: Vec = merged_segment_metas.iter().map(|meta| meta.id()).collect(); - let merged_segment_entry = SegmentEntry::new(merged_segment_meta); - if registers_lock.uncommitted.contains_all(&merged_segment_ids) { - for segment_id in &merged_segment_ids { + + if registers_lock.uncommitted.contains_all(&before_merge_segment_ids) { + for segment_id in before_merge_segment_ids { registers_lock.uncommitted.remove_segment(segment_id); } - registers_lock.uncommitted.add_segment_entry(merged_segment_entry); + registers_lock.uncommitted.add_segment_entry(after_merge_segment_entry); } - else if registers_lock.committed.contains_all(&merged_segment_ids) { - for segment_id in &merged_segment_ids { + else if registers_lock.committed.contains_all(&before_merge_segment_ids) { + for segment_id in before_merge_segment_ids { registers_lock.committed.remove_segment(segment_id); } - registers_lock.committed.add_segment_entry(merged_segment_entry); + registers_lock.committed.add_segment_entry(after_merge_segment_entry); } else { warn!("couldn't find segment in SegmentManager"); } diff --git 
a/src/indexer/segment_register.rs b/src/indexer/segment_register.rs index b618cade2..367babbb8 100644 --- a/src/indexer/segment_register.rs +++ b/src/indexer/segment_register.rs @@ -4,6 +4,7 @@ use core::SegmentMeta; use std::fmt; use std::fmt::{Debug, Formatter}; use indexer::segment_entry::SegmentEntry; +use indexer::delete_queue::DeleteCursor; /// The segment register keeps track /// of the list of segment, their size as well @@ -95,16 +96,15 @@ impl SegmentRegister { .start_merge(); } - pub fn new(segment_metas: Vec) -> SegmentRegister { + pub fn new(segment_metas: Vec, delete_cursor: DeleteCursor) -> SegmentRegister { + let mut segment_states = HashMap::new(); + for segment_meta in segment_metas { + let segment_id = segment_meta.id(); + let segment_entry = SegmentEntry::new(segment_meta, delete_cursor.clone()); + segment_states.insert(segment_id, segment_entry); + } SegmentRegister { - segment_states: segment_metas - .into_iter() - .map(|segment_meta| { - let segment_id = segment_meta.id(); - let segment_entry = SegmentEntry::new(segment_meta ); - (segment_id, segment_entry) - }) - .collect(), + segment_states: segment_states } } } @@ -115,10 +115,13 @@ mod tests { use indexer::SegmentState; use core::SegmentId; use core::SegmentMeta; + use indexer::delete_queue::*; use super::*; #[test] fn test_segment_register() { + let delete_queue = DeleteQueue::new(); + let mut segment_register = SegmentRegister::default(); let segment_id_a = SegmentId::generate_random(); let segment_id_b = SegmentId::generate_random(); @@ -126,14 +129,14 @@ mod tests { { let segment_meta = SegmentMeta::new(segment_id_a); - let segment_entry = SegmentEntry::new(segment_meta); + let segment_entry = SegmentEntry::new(segment_meta, delete_queue.cursor()); segment_register.add_segment_entry(segment_entry); } assert_eq!(segment_register.segment_entry(&segment_id_a).unwrap().state(), SegmentState::Ready); assert_eq!(segment_register.segment_ids(), vec!(segment_id_a)); { let segment_meta = 
SegmentMeta::new(segment_id_b); - let segment_entry = SegmentEntry::new(segment_meta); + let segment_entry = SegmentEntry::new(segment_meta, delete_queue.cursor()); segment_register.add_segment_entry(segment_entry); } assert_eq!(segment_register.segment_entry(&segment_id_b).unwrap().state(), SegmentState::Ready); @@ -145,7 +148,7 @@ mod tests { segment_register.remove_segment(&segment_id_b); { let segment_meta_merged = SegmentMeta::new(segment_id_merged); - let segment_entry = SegmentEntry::new(segment_meta_merged); + let segment_entry = SegmentEntry::new(segment_meta_merged, delete_queue.cursor()); segment_register.add_segment_entry(segment_entry); } assert_eq!(segment_register.segment_ids(), vec!(segment_id_merged)); diff --git a/src/indexer/segment_updater.rs b/src/indexer/segment_updater.rs index e57c0fece..3984b8e86 100644 --- a/src/indexer/segment_updater.rs +++ b/src/indexer/segment_updater.rs @@ -103,9 +103,9 @@ struct InnerSegmentUpdater { impl SegmentUpdater { - pub fn new(index: Index) -> Result { + pub fn new(index: Index, delete_cursor: DeleteCursor) -> Result { let segments = index.segments()?; - let segment_manager = SegmentManager::from_segments(segments); + let segment_manager = SegmentManager::from_segments(segments, delete_cursor); Ok( SegmentUpdater(Arc::new(InnerSegmentUpdater { pool: CpuPool::new(1), @@ -168,23 +168,22 @@ impl SegmentUpdater { } } - fn purge_deletes(&self, delete_cursor: DeleteCursor) -> Result> { - let mut segment_metas = vec!(); - for segment_entry in self.0.segment_manager.segment_entries() { - let mut segment = self.0.index.segment(segment_entry.meta().clone()); - let delete_cursor = delete_cursor.clone(); - // TODO delete cursor skip... 
- let segment_meta = advance_deletes(&mut segment, delete_cursor, segment_entry.doc_to_opstamp())?; - segment_metas.push(segment_meta); + fn purge_deletes(&self) -> Result> { + let mut segment_entries = self.0.segment_manager.segment_entries(); + for segment_entry in &mut segment_entries { + let segment = self.0.index.segment(segment_entry.meta().clone()); + advance_deletes(segment, segment_entry)?; } - Ok(segment_metas) + Ok(segment_entries) } - pub fn commit(&self, opstamp: u64, delete_cursor: DeleteCursor) -> impl Future { + pub fn commit(&self, opstamp: u64) -> impl Future { self.run_async(move |segment_updater| { - let segment_metas = segment_updater.purge_deletes(delete_cursor).expect("Failed purge deletes"); - segment_updater.0.segment_manager.commit(segment_metas); + let segment_entries = segment_updater + .purge_deletes() + .expect("Failed purge deletes"); + segment_updater.0.segment_manager.commit(segment_entries); let mut index = segment_updater.0.index.clone(); { let directory = index.directory(); @@ -226,22 +225,20 @@ impl SegmentUpdater { let ref index = segment_updater_clone.0.index; let schema = index.schema(); - let mut segment_metas = vec!(); + let mut segment_entries = vec!(); + for segment_id in &segment_ids_vec { - if let Some(segment_entry) = segment_updater_clone.0 + if let Some(mut segment_entry) = segment_updater_clone.0 .segment_manager .segment_entry(segment_id) { // TODOS make sure that the segment are in the same // position with regard to deletes. 
- // let mut segment = index.segment(segment_entry.meta().clone()); - // let segment_meta = advance_deletes( - // &mut segment, - // &delete_operations, - // segment_entry.doc_to_opstamp())?; - let segment_meta = segment_entry.meta().clone(); - segment_metas.push(segment_meta); + let segment = index.segment(segment_entry.meta().clone()); + advance_deletes(segment, &mut segment_entry)?; + + segment_entries.push(segment_entry); } else { error!("Error, had to abort merge as some of the segment is not managed anymore.a"); @@ -249,10 +246,13 @@ impl SegmentUpdater { } } - let segments: Vec = segment_metas + let delete_cursor = segment_entries[0].delete_cursor().clone(); + + let segments: Vec = segment_entries .iter() - .cloned() - .map(|segment_meta| index.segment(segment_meta)) + .map(|segment_entry| { + index.segment(segment_entry.meta().clone()) + }) .collect(); // An IndexMerger is like a "view" of our merged segments. @@ -262,16 +262,27 @@ impl SegmentUpdater { // ... we just serialize this index merger in our new segment // to merge the two segments. 
- let segment_serializer = SegmentSerializer::for_segment(&mut merged_segment).expect("Creating index serializer failed"); + let segment_serializer = + SegmentSerializer + ::for_segment(&mut merged_segment) + .expect("Creating index serializer failed"); let num_docs = merger.write(segment_serializer).expect("Serializing merged index failed"); let mut segment_meta = SegmentMeta::new(merged_segment.id()); segment_meta.set_max_doc(num_docs); + let before_merged_segment_ids = segment_entries + .iter() + .map(|segment_entry| segment_entry.segment_id()) + .collect::>(); + + let after_merge_segment_entry = SegmentEntry::new(segment_meta.clone(), delete_cursor); + segment_updater_clone - .end_merge(segment_metas.clone(), segment_meta.clone()) + .end_merge(before_merged_segment_ids, after_merge_segment_entry) .wait() .unwrap(); + merging_future_send.complete(segment_meta); segment_updater_clone.0.merging_threads.write().unwrap().remove(&merging_thread_id); Ok(()) @@ -296,11 +307,11 @@ impl SegmentUpdater { fn end_merge(&self, - merged_segment_metas: Vec, - segment_meta: SegmentMeta) -> impl Future { + before_merge_segment_ids: Vec, + after_merge_segment_entry: SegmentEntry) -> impl Future { self.run_async(move |segment_updater| { - segment_updater.0.segment_manager.end_merge(&merged_segment_metas, segment_meta); + segment_updater.0.segment_manager.end_merge(&before_merge_segment_ids, after_merge_segment_entry); let mut directory = segment_updater.0.index.directory().box_clone(); let segment_metas = segment_updater.0.segment_manager.committed_segment_metas(); save_metas( From 202dda98ba6fe4ca68d333e537d5553486143547 Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Sun, 12 Mar 2017 19:00:57 +0900 Subject: [PATCH 066/107] baby step 3 --- src/core/mod.rs | 1 + src/indexer/delete_queue.rs | 33 ++++-- src/indexer/directory_lock.rs | 9 +- src/indexer/index_writer.rs | 180 +++++++++++++++------------------ src/indexer/merger.rs | 11 +- src/indexer/mod.rs | 1 + 
src/indexer/segment_manager.rs | 12 +-- src/indexer/segment_updater.rs | 99 ++++++++++-------- src/indexer/stamper.rs | 17 ++++ src/lib.rs | 4 +- 10 files changed, 202 insertions(+), 165 deletions(-) create mode 100644 src/indexer/stamper.rs diff --git a/src/core/mod.rs b/src/core/mod.rs index 4e11428e0..6b37c5542 100644 --- a/src/core/mod.rs +++ b/src/core/mod.rs @@ -27,4 +27,5 @@ use std::path::PathBuf; lazy_static! { pub static ref META_FILEPATH: PathBuf = PathBuf::from("meta.json"); pub static ref MANAGED_FILEPATH: PathBuf = PathBuf::from(".managed.json"); + pub static ref LOCKFILE_FILEPATH: PathBuf = PathBuf::from(".tantivy-indexer.lock"); } \ No newline at end of file diff --git a/src/indexer/delete_queue.rs b/src/indexer/delete_queue.rs index beb026654..0286454c7 100644 --- a/src/indexer/delete_queue.rs +++ b/src/indexer/delete_queue.rs @@ -16,9 +16,32 @@ pub struct DeleteCursor { } impl DeleteCursor { - pub fn go_to_tail(&mut self,) { + + pub fn skip_to(&mut self, target_opstamp: u64) { + while let Some(operation) = self.peek() { + if operation.opstamp >= target_opstamp { + break; + } + self.advance() + } + } + + pub fn advance(&mut self) { let read = self.operations.read().unwrap(); - self.cursor = read.len(); + if self.cursor < read.len() { + self.cursor += 1; + } + } + + pub fn peek(&self,) -> Option { + let read = self.operations.read().unwrap(); + if self.cursor >= read.len() { + None + } + else { + let operation = read[self.cursor].clone(); + Some(operation) + } } } @@ -40,6 +63,7 @@ impl Iterator for DeleteCursor { } } + #[derive(Clone, Default)] pub struct DeleteQueue(InnerDeleteQueue); @@ -53,10 +77,6 @@ impl DeleteQueue { self.0.write().unwrap().push(delete_operation); } - pub fn clear(&mut self) { - self.0.write().unwrap().clear(); - } - pub fn cursor(&self) -> DeleteCursor { DeleteCursor { cursor: 0, @@ -65,6 +85,7 @@ impl DeleteQueue { } } + #[cfg(test)] mod tests { diff --git a/src/indexer/directory_lock.rs b/src/indexer/directory_lock.rs 
index 0b29e127a..db149a297 100644 --- a/src/indexer/directory_lock.rs +++ b/src/indexer/directory_lock.rs @@ -1,8 +1,7 @@ use Directory; -use std::path::Path; use directory::error::OpenWriteError; +use core::LOCKFILE_FILEPATH; -pub const LOCKFILE_NAME: &'static str = ".tantivy-indexer.lock"; /// The directory lock is a mechanism used to @@ -16,16 +15,14 @@ pub struct DirectoryLock { impl DirectoryLock { pub fn lock(mut directory: Box) -> Result { - let lockfile_path = Path::new(LOCKFILE_NAME); - try!(directory.open_write(lockfile_path)); + try!(directory.open_write(&*LOCKFILE_FILEPATH)); Ok(DirectoryLock { directory: directory }) } } impl Drop for DirectoryLock { fn drop(&mut self) { - let lockfile_path = Path::new(LOCKFILE_NAME); - if let Err(e) = self.directory.delete(lockfile_path) { + if let Err(e) = self.directory.delete(&*LOCKFILE_FILEPATH) { error!("Failed to remove the lock file. {:?}", e); } } diff --git a/src/indexer/index_writer.rs b/src/indexer/index_writer.rs index 3b77cbd13..56e9a41f8 100644 --- a/src/indexer/index_writer.rs +++ b/src/indexer/index_writer.rs @@ -6,6 +6,7 @@ use core::SegmentComponent; use core::SegmentId; use core::SegmentMeta; use core::SegmentReader; +use indexer::stamper::Stamper; use datastruct::stacker::Heap; use Error; use Directory; @@ -26,11 +27,11 @@ use schema::Schema; use schema::Term; use std::mem; use std::mem::swap; -use std::thread; use std::thread::JoinHandle; use super::directory_lock::DirectoryLock; use super::operation::AddOperation; use super::segment_updater::SegmentUpdater; +use std::thread; // Size of the margin for the heap. A segment is closed when the remaining memory // in the heap goes below MARGIN_IN_BYTES. @@ -76,7 +77,7 @@ pub struct IndexWriter { delete_queue: DeleteQueue, - uncommitted_opstamp: u64, + stamper: Stamper, committed_opstamp: u64, } @@ -100,10 +101,10 @@ impl !Sync for IndexWriter {} /// If the lockfile already exists, returns `Error::FileAlreadyExists`. 
/// # Panics /// If the heap size per thread is too small, panics. -pub fn open_index_writer(index: &Index, - num_threads: usize, - heap_size_in_bytes_per_thread: usize) - -> Result { +pub fn open_index_writer( + index: &Index, + num_threads: usize, + heap_size_in_bytes_per_thread: usize) -> Result { if heap_size_in_bytes_per_thread <= HEAP_SIZE_LIMIT as usize { panic!(format!("The heap size per thread needs to be at least {}.", @@ -118,7 +119,11 @@ pub fn open_index_writer(index: &Index, let delete_queue = DeleteQueue::new(); - let segment_updater = SegmentUpdater::new(index.clone(), delete_queue.cursor())?; + let stamper = Stamper::new(index.opstamp()); + + let segment_updater = SegmentUpdater::new(index.clone(), + stamper.clone(), + delete_queue.cursor())?; let mut index_writer = IndexWriter { @@ -132,13 +137,13 @@ pub fn open_index_writer(index: &Index, segment_updater: segment_updater, - workers_join_handle: Vec::new(), + workers_join_handle: vec!(), num_threads: num_threads, delete_queue: delete_queue, committed_opstamp: index.opstamp(), - uncommitted_opstamp: index.opstamp(), + stamper: stamper, generation: 0, @@ -155,7 +160,12 @@ pub fn open_index_writer(index: &Index, // TODO skip delete operation before teh // last delete opstamp -pub fn advance_deletes(mut segment: Segment, segment_entry: &mut SegmentEntry) -> Result<()> { +/// Advance delete for the given segment up +/// to the target opstamp. 
+pub fn advance_deletes( + mut segment: Segment, + segment_entry: &mut SegmentEntry, + target_opstamp: u64) -> Result<()> { { let doc_opstamps = segment_entry.reset_doc_to_stamp(); @@ -168,33 +178,43 @@ pub fn advance_deletes(mut segment: Segment, segment_entry: &mut SegmentEntry) - let mut last_opstamp_opt: Option = None; let previous_delete_opstamp_opt = segment.meta().delete_opstamp(); - - for delete_op in delete_cursor { - println!("opstamp {:?}", delete_op.opstamp); + loop { - // let's skip operations that have already been deleted.0u32 - if let Some(previous_delete_opstamp) = previous_delete_opstamp_opt { - if delete_op.opstamp <= previous_delete_opstamp { - continue; + if let Some(delete_op) = delete_cursor.peek() { + if delete_op.opstamp > target_opstamp { + break; } - } - - // A delete operation should only affect - // document that were inserted after it. - // - // Limit doc helps identify the first document - // that may be affected by the delete operation. - let limit_doc = doc_opstamps.compute_doc_limit(delete_op.opstamp); - if let Some(mut docset) = segment_reader.read_postings(&delete_op.term, SegmentPostingsOption::NoFreq) { - while docset.advance() { - let deleted_doc = docset.doc(); - if deleted_doc < limit_doc { - delete_bitset.insert(deleted_doc as usize); + else { + // let's skip operations that have already been deleted.0u32 + if let Some(previous_delete_opstamp) = previous_delete_opstamp_opt { + if delete_op.opstamp <= previous_delete_opstamp { + continue; + } } + + // A delete operation should only affect + // document that were inserted after it. + // + // Limit doc helps identify the first document + // that may be affected by the delete operation. 
+ let limit_doc = doc_opstamps.compute_doc_limit(delete_op.opstamp); + if let Some(mut docset) = segment_reader.read_postings(&delete_op.term, SegmentPostingsOption::NoFreq) { + while docset.advance() { + let deleted_doc = docset.doc(); + if deleted_doc < limit_doc { + delete_bitset.insert(deleted_doc as usize); + } + } + } + last_opstamp_opt = Some(delete_op.opstamp); } } - last_opstamp_opt = Some(delete_op.opstamp); + else { + break; + } + + delete_cursor.advance(); } // we only write the result different @@ -245,7 +265,6 @@ fn index_documents(heap: &mut Heap, let doc_opstamps: Vec = segment_writer.finalize()?; - // let segment_entry = advance_deletes(&mut segment, delete_queue, delete_position, )?; let mut segment_meta = SegmentMeta::new(segment_id); segment_meta.set_max_doc(num_docs); @@ -311,9 +330,6 @@ impl IndexWriter { .into_iter() .peekable(); - // we consume all previous delete operations. - delete_cursor.go_to_tail(); - // the peeking here is to avoid // creating a new segment's files // if no document are available. @@ -321,15 +337,8 @@ impl IndexWriter { // this is a valid guarantee as the // peeked document now belongs to // our local iterator. - if document_iterator.peek().is_some() { - let segment = segment_updater.new_segment(); - index_documents(&mut heap, - segment, - &schema, - generation, - &mut document_iterator, - &mut segment_updater, - delete_cursor.clone())?; + if let Some(operation) = document_iterator.peek() { + delete_cursor.skip_to(operation.opstamp); } else { // No more documents. @@ -337,7 +346,14 @@ impl IndexWriter { // was dropped. return Ok(()) } - + let segment = segment_updater.new_segment(); + index_documents(&mut heap, + segment, + &schema, + generation, + &mut document_iterator, + &mut segment_updater, + delete_cursor.clone())?; } })?; @@ -392,57 +408,29 @@ impl IndexWriter { /// state as it was after the last commit. /// /// The opstamp at the last commit is returned. 
- pub fn rollback(&mut self) -> Result { - + pub fn rollback(mut self) -> Result { info!("Rolling back to opstamp {}", self.committed_opstamp); - // by updating the generation in the segment updater, - // pending add segment commands will be dismissed. - self.generation += 1; - - let rollback_future = self.segment_updater.rollback(self.generation); - - // we cannot drop segment ready receiver yet - // as it would block the workers. - let document_receiver = self.recreate_document_channel(); + self.segment_updater.kill(); // Drains the document receiver pipeline : // Workers don't need to index the pending documents. - for _ in document_receiver {} - - let mut former_workers_join_handle = Vec::new(); - swap(&mut former_workers_join_handle, - &mut self.workers_join_handle); + let receiver_clone = self.document_receiver.clone(); + let index = self.index.clone(); + let num_threads = self.num_threads; + let heap_size_in_bytes_per_thread = self.heap_size_in_bytes_per_thread; + drop(self); + for _ in receiver_clone {} - // wait for all the worker to finish their work - // (it should be fast since we consumed all pending documents) - for worker_handle in former_workers_join_handle { - // we stop one worker at a time ... - try!(try!(worker_handle.join() - .map_err(|e| Error::ErrorInThread(format!("{:?}", e))))); - // ... and recreate a new one right away - // to work on the next generation. - try!(self.add_indexing_worker()); - } - - // All of our indexing workers for the rollbacked generation have - // been terminated. - // - // Our document receiver pipe was drained. - // No new document have been added in the meanwhile because `IndexWriter` - // is not shared by different threads. 
+ let index_writer = open_index_writer( + &index, + num_threads, + heap_size_in_bytes_per_thread)?; - rollback_future.wait().map_err(|_| - Error::ErrorInThread("Error while waiting for rollback.".to_string()) - )?; - - self.delete_queue.clear(); - - // reset the opstamp - self.uncommitted_opstamp = self.committed_opstamp; - Ok(self.committed_opstamp) + Ok(index_writer) } + /// Commits all of the pending changes /// /// A call to commit blocks. @@ -469,7 +457,7 @@ impl IndexWriter { // This will move uncommitted segments to the state of // committed segments. - self.committed_opstamp = self.stamp(); + self.committed_opstamp = self.stamper.stamp(); info!("committing {}", self.committed_opstamp); // this will drop the current document channel @@ -495,7 +483,6 @@ impl IndexWriter { .commit(self.committed_opstamp) .wait()?; - self.delete_queue.clear(); Ok(self.committed_opstamp) } @@ -508,7 +495,7 @@ impl IndexWriter { /// Like adds, the deletion itself will be visible /// only after calling `commit()`. pub fn delete_term(&mut self, term: Term) -> u64 { - let opstamp = self.stamp(); + let opstamp = self.stamper.stamp(); let delete_operation = DeleteOperation { opstamp: opstamp, term: term, @@ -517,10 +504,8 @@ impl IndexWriter { opstamp } - fn stamp(&mut self) -> u64 { - let opstamp = self.uncommitted_opstamp; - self.uncommitted_opstamp += 1u64; - opstamp + pub fn commit_opstamp(&self) -> u64 { + self.committed_opstamp } /// Adds a document. @@ -534,7 +519,7 @@ impl IndexWriter { /// Currently it represents the number of documents that /// have been added since the creation of the index. 
pub fn add_document(&mut self, document: Document) -> u64 { - let opstamp = self.stamp(); + let opstamp = self.stamper.stamp(); let add_operation = AddOperation { opstamp: opstamp, document: document, @@ -610,7 +595,10 @@ mod tests { doc.add_text(text_field, "a"); index_writer.add_document(doc); } - assert_eq!(index_writer.rollback().unwrap(), 0u64); + + index_writer = index_writer.rollback().unwrap(); + + assert_eq!(index_writer.commit_opstamp(), 0u64); assert_eq!(num_docs_containing("a"), 0); { diff --git a/src/indexer/merger.rs b/src/indexer/merger.rs index 0a4c21342..607860859 100644 --- a/src/indexer/merger.rs +++ b/src/indexer/merger.rs @@ -429,9 +429,10 @@ mod tests { #[test] fn test_index_merger_with_deletes() { let mut schema_builder = schema::SchemaBuilder::default(); - let text_fieldtype = schema::TextOptions::default() - .set_indexing_options(TextIndexingOptions::TokenizedWithFreq) - .set_stored(); + let text_fieldtype = schema::TextOptions + ::default() + .set_indexing_options(TextIndexingOptions::TokenizedWithFreq) + .set_stored(); let text_field = schema_builder.add_text_field("text", text_fieldtype); let score_fieldtype = schema::U32Options::default().set_fast(); let score_field = schema_builder.add_u32_field("score", score_fieldtype); @@ -492,9 +493,11 @@ mod tests { index_writer.commit().expect("committed"); index.load_searchers().unwrap(); let searcher = index.searcher(); + for segment_reader in searcher.segment_readers() { + println!("segment reader {}", segment_reader.num_docs()); + } assert_eq!(searcher.segment_readers().len(), 2); assert_eq!(searcher.num_docs(), 3); - assert_eq!(searcher.segment_readers()[0].num_docs(), 1); assert_eq!(searcher.segment_readers()[0].max_doc(), 3); assert_eq!(searcher.segment_readers()[1].num_docs(), 2); diff --git a/src/indexer/mod.rs b/src/indexer/mod.rs index 8380332af..af6e6a21d 100644 --- a/src/indexer/mod.rs +++ b/src/indexer/mod.rs @@ -12,6 +12,7 @@ mod directory_lock; mod segment_entry; mod 
doc_opstamp_mapping; pub mod operation; +mod stamper; // TODO avoid exposing SegmentState / SegmentEntry if it does not have to be public API diff --git a/src/indexer/segment_manager.rs b/src/indexer/segment_manager.rs index 7ac353fb5..9f618b1ad 100644 --- a/src/indexer/segment_manager.rs +++ b/src/indexer/segment_manager.rs @@ -1,7 +1,7 @@ use super::segment_register::SegmentRegister; use std::sync::RwLock; use core::SegmentMeta; -use core::META_FILEPATH; +use core::{META_FILEPATH, LOCKFILE_FILEPATH}; use core::SegmentId; use indexer::{SegmentEntry, SegmentState}; use std::path::PathBuf; @@ -76,6 +76,7 @@ impl SegmentManager { let registers_lock = self.read(); let mut files = HashSet::new(); files.insert(META_FILEPATH.clone()); + files.insert(LOCKFILE_FILEPATH.clone()); let segment_metas = registers_lock.committed @@ -119,15 +120,6 @@ impl SegmentManager { self.registers.write().expect("Failed to acquire write lock on SegmentManager.") } - /// Removes all of the uncommitted segments - /// and returns them. - pub fn rollback(&self,) -> Vec { - let mut registers_lock = self.write(); - let segment_ids = registers_lock.uncommitted.segment_ids(); - registers_lock.uncommitted.clear(); - segment_ids - } - pub fn commit(&self, mut segment_entries: Vec) { // TODO is still relevant!? 
// restore the state of the segment_entries diff --git a/src/indexer/segment_updater.rs b/src/indexer/segment_updater.rs index 3984b8e86..0fa463391 100644 --- a/src/indexer/segment_updater.rs +++ b/src/indexer/segment_updater.rs @@ -8,6 +8,7 @@ use core::SegmentId; use core::SegmentMeta; use core::SerializableSegment; use directory::Directory; +use indexer::stamper::Stamper; use Error; use futures_cpupool::CpuPool; use futures::{Future, future}; @@ -29,7 +30,7 @@ use std::io::Write; use std::mem; use std::ops::DerefMut; use std::sync::Arc; -use std::sync::atomic::AtomicUsize; +use std::sync::atomic::{AtomicUsize, AtomicBool}; use std::sync::atomic::Ordering; use std::sync::RwLock; use std::thread; @@ -99,11 +100,15 @@ struct InnerSegmentUpdater { merging_thread_id: AtomicUsize, merging_threads: RwLock>>>, generation: AtomicUsize, + killed: AtomicBool, + stamper: Stamper, } impl SegmentUpdater { - pub fn new(index: Index, delete_cursor: DeleteCursor) -> Result { + pub fn new(index: Index, + stamper: Stamper, + delete_cursor: DeleteCursor) -> Result { let segments = index.segments()?; let segment_manager = SegmentManager::from_segments(segments, delete_cursor); Ok( @@ -115,6 +120,8 @@ impl SegmentUpdater { merging_thread_id: AtomicUsize::default(), merging_threads: RwLock::new(HashMap::new()), generation: AtomicUsize::default(), + killed: AtomicBool::new(false), + stamper: stamper, })) ) } @@ -148,15 +155,8 @@ impl SegmentUpdater { }) } - pub fn rollback(&mut self, generation: usize) -> impl Future { - self.0.generation.store(generation, Ordering::Release); - self.run_async(|segment_updater| { - segment_updater.0.segment_manager.rollback(); - }) - } - pub fn add_segment(&self, generation: usize, segment_entry: SegmentEntry) -> impl Future { - if generation >= self.0.generation.load(Ordering::Acquire) { + if self.is_alive() && generation >= self.0.generation.load(Ordering::Acquire) { future::Either::A(self.run_async(|segment_updater| { 
segment_updater.0.segment_manager.add_segment(segment_entry); segment_updater.consider_merge_options(); @@ -168,35 +168,49 @@ impl SegmentUpdater { } } - fn purge_deletes(&self) -> Result> { + pub fn kill(&mut self,) { + self.0.killed.store(true, Ordering::Release); + } + + fn is_alive(&self,) -> bool { + !self.0.killed.load(Ordering::Acquire) + } + + fn purge_deletes(&self, target_opstamp: u64) -> Result> { let mut segment_entries = self.0.segment_manager.segment_entries(); for segment_entry in &mut segment_entries { let segment = self.0.index.segment(segment_entry.meta().clone()); - advance_deletes(segment, segment_entry)?; + advance_deletes(segment, segment_entry, target_opstamp)?; } Ok(segment_entries) } + pub fn save_metas(&self, opstamp: u64) { + if self.is_alive() { + let index = &self.0.index; + let directory = index.directory(); + save_metas( + self.0.segment_manager.committed_segment_metas(), + index.schema(), + opstamp, + directory.box_clone().borrow_mut()).expect("Could not save metas."); + } + } + pub fn commit(&self, opstamp: u64) -> impl Future { self.run_async(move |segment_updater| { - let segment_entries = segment_updater - .purge_deletes() - .expect("Failed purge deletes"); - segment_updater.0.segment_manager.commit(segment_entries); - let mut index = segment_updater.0.index.clone(); - { - let directory = index.directory(); - save_metas( - segment_updater.0.segment_manager.committed_segment_metas(), - index.schema(), - opstamp, - directory.box_clone().borrow_mut()).expect("Could not save metas."); + if segment_updater.is_alive() { + let segment_entries = segment_updater + .purge_deletes(opstamp) + .expect("Failed purge deletes"); + segment_updater.0.segment_manager.commit(segment_entries); + let mut index = segment_updater.0.index.clone(); + segment_updater.save_metas(opstamp); + let living_files = segment_updater.0.segment_manager.list_files(); + index.directory_mut().garbage_collect(living_files); + segment_updater.consider_merge_options(); } - 
let living_files = segment_updater.0.segment_manager.list_files(); - index.directory_mut().garbage_collect(living_files); - segment_updater.consider_merge_options(); - }) } @@ -217,6 +231,7 @@ impl SegmentUpdater { return merging_future_recv; } + let target_opstamp = self.0.stamper.stamp(); let merging_join_handle = thread::spawn(move || { // first we need to apply deletes to our segment. @@ -226,18 +241,16 @@ impl SegmentUpdater { let schema = index.schema(); let mut segment_entries = vec!(); + for segment_id in &segment_ids_vec { if let Some(mut segment_entry) = segment_updater_clone.0 .segment_manager .segment_entry(segment_id) { - // TODOS make sure that the segment are in the same - // position with regard to deletes. - let segment = index.segment(segment_entry.meta().clone()); - advance_deletes(segment, &mut segment_entry)?; - + // TODO unwrap + advance_deletes(segment, &mut segment_entry, target_opstamp).unwrap(); segment_entries.push(segment_entry); } else { @@ -308,17 +321,19 @@ impl SegmentUpdater { fn end_merge(&self, before_merge_segment_ids: Vec, - after_merge_segment_entry: SegmentEntry) -> impl Future { + mut after_merge_segment_entry: SegmentEntry) -> impl Future { self.run_async(move |segment_updater| { + if let Some(delete_operation) = after_merge_segment_entry.delete_cursor().peek() { + let committed_opstamp = segment_updater.0.index.opstamp(); + if delete_operation.opstamp < committed_opstamp { + let segment = segment_updater.0.index.segment(after_merge_segment_entry.meta().clone()); + // TODO check unwrap + advance_deletes(segment, &mut after_merge_segment_entry, committed_opstamp).unwrap(); + } + } segment_updater.0.segment_manager.end_merge(&before_merge_segment_ids, after_merge_segment_entry); - let mut directory = segment_updater.0.index.directory().box_clone(); - let segment_metas = segment_updater.0.segment_manager.committed_segment_metas(); - save_metas( - segment_metas, - segment_updater.0.index.schema(), - 
segment_updater.0.index.opstamp(), - directory.borrow_mut()).expect("Could not save metas."); + segment_updater.save_metas(segment_updater.0.index.opstamp()); }) } @@ -395,7 +410,7 @@ mod tests { { index_writer.wait_merging_threads() - .expect("waiting for merging threads"); + .expect( "waiting for merging threads"); } index.load_searchers().unwrap(); diff --git a/src/indexer/stamper.rs b/src/indexer/stamper.rs new file mode 100644 index 000000000..816eb6dc4 --- /dev/null +++ b/src/indexer/stamper.rs @@ -0,0 +1,17 @@ +use std::sync::atomic::{AtomicU64, Ordering}; +use std::sync::Arc; + + +#[derive(Clone, Default)] +pub struct Stamper(Arc); + +impl Stamper { + + pub fn new(first_opstamp: u64) -> Stamper { + Stamper(Arc::new(AtomicU64::new(first_opstamp))) + } + + pub fn stamp(&self,) -> u64 { + self.0.fetch_add(1u64, Ordering::SeqCst) + } +} \ No newline at end of file diff --git a/src/lib.rs b/src/lib.rs index 81d9acc24..69a63cb2b 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -5,6 +5,8 @@ #![feature(box_syntax)] #![feature(optin_builtin_traits)] #![feature(conservative_impl_trait)] +#![feature(integer_atomics)] + #![cfg_attr(test, feature(test))] #![cfg_attr(test, feature(step_by))] #![doc(test(attr(allow(unused_variables), deny(warnings))))] @@ -397,7 +399,7 @@ mod tests { { index_writer.delete_term(Term::from_field_text(text_field, "c")); } - index_writer.rollback().unwrap(); + index_writer = index_writer.rollback().unwrap(); index_writer.delete_term(Term::from_field_text(text_field, "a")); index_writer.commit().unwrap(); } From 5932278e00526e2cff0d49b971176b249d139b0c Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Mon, 13 Mar 2017 10:00:19 +0900 Subject: [PATCH 067/107] test passing --- src/indexer/merge_policy.rs | 7 ++++++- src/indexer/merger.rs | 10 ++++------ src/indexer/segment_manager.rs | 22 ++++++++++++++-------- src/indexer/segment_register.rs | 9 ++++++++- src/indexer/segment_updater.rs | 27 ++++++++++++++------------- 5 files changed, 46 
insertions(+), 29 deletions(-) diff --git a/src/indexer/merge_policy.rs b/src/indexer/merge_policy.rs index 5e3adcfb8..dfd9dfcec 100644 --- a/src/indexer/merge_policy.rs +++ b/src/indexer/merge_policy.rs @@ -60,7 +60,12 @@ pub mod tests { .iter() .map(|segment_meta| segment_meta.id()) .collect::>(); - vec!(MergeCandidate(segment_ids)) + if segment_ids.len() > 1 { + vec!(MergeCandidate(segment_ids)) + } + else { + vec!() + } } fn box_clone(&self) -> Box { diff --git a/src/indexer/merger.rs b/src/indexer/merger.rs index 607860859..c087b93f4 100644 --- a/src/indexer/merger.rs +++ b/src/indexer/merger.rs @@ -175,9 +175,9 @@ impl IndexMerger { Ok(()) } - fn write_postings(&self, - - postings_serializer: &mut PostingsSerializer) -> Result<()> { + fn write_postings( + &self, + postings_serializer: &mut PostingsSerializer) -> Result<()> { let mut merged_terms = TermIterator::from(&self.readers[..]); let mut delta_position_computer = DeltaPositionComputer::new(); @@ -493,9 +493,7 @@ mod tests { index_writer.commit().expect("committed"); index.load_searchers().unwrap(); let searcher = index.searcher(); - for segment_reader in searcher.segment_readers() { - println!("segment reader {}", segment_reader.num_docs()); - } + assert_eq!(searcher.segment_readers().len(), 2); assert_eq!(searcher.num_docs(), 3); assert_eq!(searcher.segment_readers()[0].num_docs(), 1); diff --git a/src/indexer/segment_manager.rs b/src/indexer/segment_manager.rs index 9f618b1ad..4528a412f 100644 --- a/src/indexer/segment_manager.rs +++ b/src/indexer/segment_manager.rs @@ -42,10 +42,16 @@ impl Debug for SegmentManager { /// /// For instance, a segment will not appear in both committed and uncommitted /// segments -pub fn get_segments(segment_manager: &SegmentManager,) -> (Vec, Vec) { +pub fn get_all_segments(segment_manager: &SegmentManager,) -> (Vec, Vec) { let registers_lock = segment_manager.read(); - (registers_lock.committed.get_segments(), - registers_lock.uncommitted.get_segments()) + 
(registers_lock.committed.get_all_segments(), + registers_lock.uncommitted.get_all_segments()) +} + +pub fn get_mergeable_segments(segment_manager: &SegmentManager,) -> (Vec, Vec) { + let registers_lock = segment_manager.read(); + (registers_lock.committed.get_mergeable_segments(), + registers_lock.uncommitted.get_mergeable_segments()) } impl SegmentManager { @@ -78,18 +84,18 @@ impl SegmentManager { files.insert(META_FILEPATH.clone()); files.insert(LOCKFILE_FILEPATH.clone()); - let segment_metas = + let segment_metas: Vec = registers_lock.committed - .get_segments() + .get_all_segments() .into_iter() .chain(registers_lock.uncommitted - .get_segments() + .get_all_segments() .into_iter()) .chain(registers_lock.writing .iter() .cloned() - .map(SegmentMeta::new)); - + .map(SegmentMeta::new)) + .collect(); for segment_meta in segment_metas { files.extend(segment_meta.list_files()); } diff --git a/src/indexer/segment_register.rs b/src/indexer/segment_register.rs index 367babbb8..76f200735 100644 --- a/src/indexer/segment_register.rs +++ b/src/indexer/segment_register.rs @@ -36,8 +36,15 @@ impl SegmentRegister { pub fn clear(&mut self,) { self.segment_states.clear(); } + + pub fn get_all_segments(&self,) -> Vec { + self.segment_states + .values() + .map(|segment_entry| segment_entry.meta().clone()) + .collect() + } - pub fn get_segments(&self,) -> Vec { + pub fn get_mergeable_segments(&self,) -> Vec { self.segment_states .values() .filter(|segment_entry| segment_entry.is_ready()) diff --git a/src/indexer/segment_updater.rs b/src/indexer/segment_updater.rs index 0fa463391..cb1033521 100644 --- a/src/indexer/segment_updater.rs +++ b/src/indexer/segment_updater.rs @@ -35,7 +35,7 @@ use std::sync::atomic::Ordering; use std::sync::RwLock; use std::thread; use std::thread::JoinHandle; -use super::segment_manager::{SegmentManager, get_segments}; +use super::segment_manager::{SegmentManager, get_mergeable_segments}; /// Save the index meta file. 
@@ -129,9 +129,7 @@ impl SegmentUpdater { pub fn new_segment(&self) -> Segment { let new_segment = self.0.index.new_segment(); let segment_id = new_segment.id(); - self.run_async(move |segment_updater| { - segment_updater.0.segment_manager.write_segment(segment_id); - }); + self.0.segment_manager.write_segment(segment_id); new_segment } @@ -147,7 +145,8 @@ impl SegmentUpdater { self.0.merging_thread_id.fetch_add(1, Ordering::SeqCst) } - + /// TODO check that we use this correctly taking + /// the laziness in account. fn run_async T>(&self, f: F) -> impl Future { let me_clone = self.clone(); self.0.pool.spawn_fn(move || { @@ -157,11 +156,13 @@ impl SegmentUpdater { pub fn add_segment(&self, generation: usize, segment_entry: SegmentEntry) -> impl Future { if self.is_alive() && generation >= self.0.generation.load(Ordering::Acquire) { - future::Either::A(self.run_async(|segment_updater| { - segment_updater.0.segment_manager.add_segment(segment_entry); - segment_updater.consider_merge_options(); - true - })) + future::Either::A({ + self.run_async(|segment_updater| { + segment_updater.0.segment_manager.add_segment(segment_entry); + segment_updater.consider_merge_options(); + true + }) + }) } else { future::Either::B(future::ok(false)) @@ -306,7 +307,7 @@ impl SegmentUpdater { fn consider_merge_options(&self) { - let (committed_segments, uncommitted_segments) = get_segments(&self.0.segment_manager); + let (committed_segments, uncommitted_segments) = get_mergeable_segments(&self.0.segment_manager); // Committed segments cannot be merged with uncommitted_segments. // We therefore consider merges using these two sets of segments independently. 
let merge_policy = self.get_merge_policy(); @@ -405,7 +406,7 @@ mod tests { } index.load_searchers().unwrap(); - assert_eq!(index.searcher().segment_readers().len(), 3); + assert_eq!(index.searcher().segment_readers().len(), 2); assert_eq!(index.searcher().num_docs(), 302); { @@ -414,7 +415,7 @@ mod tests { } index.load_searchers().unwrap(); - assert_eq!(index.searcher().segment_readers().len(), 2); + assert_eq!(index.searcher().segment_readers().len(), 1); assert_eq!(index.searcher().num_docs(), 302); } } \ No newline at end of file From 37e71f7c634939fd521153f4b6854bc1cde3c105 Mon Sep 17 00:00:00 2001 From: Claus Matzinger Date: Sun, 12 Mar 2017 22:59:38 -0400 Subject: [PATCH 068/107] fixes #100 and improves #99 --- examples/simple_search.rs | 120 ++++++++++++++++++++------------------ 1 file changed, 62 insertions(+), 58 deletions(-) diff --git a/examples/simple_search.rs b/examples/simple_search.rs index 2f26ba1fb..d422b461b 100644 --- a/examples/simple_search.rs +++ b/examples/simple_search.rs @@ -10,105 +10,107 @@ use tantivy::collector::TopCollector; use tantivy::query::QueryParser; fn main() { - // Let's create a temporary directory for the + // Let's create a temporary directory for the // sake of this example if let Ok(dir) = TempDir::new("tantivy_example_dir") { run_example(dir.path()).unwrap(); dir.close().unwrap(); - } + } } fn run_example(index_path: &Path) -> tantivy::Result<()> { - - + + // # Defining the schema // // The Tantivy index requires a very strict schema. // The schema declares which fields are in the index, - // and for each field, its type and "the way it should + // and for each field, its type and "the way it should // be indexed". - - + + // first we need to define a schema ... let mut schema_builder = SchemaBuilder::default(); - + // Our first field is title. // We want full-text search for it, and we want to be able // to retrieve the document after the search. // // TEXT | STORED is some syntactic sugar to describe - // that. 
- // + // that. + // // `TEXT` means the field should be tokenized and indexed, // along with its term frequency and term positions. // // `STORED` means that the field will also be saved // in a compressed, row-oriented key-value store. - // This store is useful to reconstruct the + // This store is useful to reconstruct the // documents that were selected during the search phase. schema_builder.add_text_field("title", TEXT | STORED); - + // Our first field is body. // We want full-text search for it, and we want to be able // to retrieve the body after the search. schema_builder.add_text_field("body", TEXT); - - let schema = schema_builder.build(); + + let schema = schema_builder.build(); // # Indexing documents // // Let's create a brand new index. - // + // // This will actually just save a meta.json // with our schema in the directory. let index = try!(Index::create(index_path, schema.clone())); - - + + // To insert document we need an index writer. // There must be only one writer at a time. // This single `IndexWriter` is already // multithreaded. // - // Here we use a buffer of 1 GB. Using a bigger + // Here we use a buffer of 50MB. Using a bigger // heap for the indexer can increase its throughput. // This buffer will be split between the indexing // threads. - let mut index_writer = try!(index.writer(1_000_000_000)); + let mut index_writer = try!(index.writer(50_000_000)); // Let's index our documents! // We first need a handle on the title and the body field. - - + + // ### Create a document "manually". // // We can create a document manually, by setting the fields // one by one in a Document object. 
let title = schema.get_field("title").unwrap(); let body = schema.get_field("body").unwrap(); - + let mut old_man_doc = Document::default(); old_man_doc.add_text(title, "The Old Man and the Sea"); - old_man_doc.add_text(body, "He was an old man who fished alone in a skiff in the Gulf Stream and he had gone eighty-four days now without taking a fish."); - + old_man_doc.add_text(body, + "He was an old man who fished alone in a skiff in the Gulf Stream and \ + he had gone eighty-four days now without taking a fish."); + // ... and add it to the `IndexWriter`. index_writer.add_document(old_man_doc); - + // ### Create a document directly from json. // // Alternatively, we can use our schema to parse // a document object directly from json. - + let mice_and_men_doc = try!(schema.parse_document(r#"{ "title": "Of Mice and Men", "body": "few miles south of Soledad, the Salinas River drops in close to the hillside bank and runs deep and green. The water is warm too, for it has slipped twinkling over the yellow sands in the sunlight before reaching the narrow pool. On one side of the river the golden foothill slopes curve up to the strong and rocky Gabilan Mountains, but on the valley side the water is lined with trees—willows fresh and green with every spring, carrying in their lower leaf junctures the debris of the winter’s flooding; and sycamores with mottled, white,recumbent limbs and branches that arch over the pool" }"#)); - + index_writer.add_document(mice_and_men_doc); - + // Multi-valued field are allowed, they are // expressed in JSON by an array. // The following document has two titles. @@ -117,19 +119,19 @@ fn run_example(index_path: &Path) -> tantivy::Result<()> { "body": "You will rejoice to hear that no disaster has accompanied the commencement of an enterprise which you have regarded with such evil forebodings. I arrived here yesterday, and my first task is to assure my dear sister of my welfare and increasing confidence in the success of my undertaking." 
}"#)); index_writer.add_document(frankenstein_doc); - + // This is an example, so we will only index 3 documents // here. You can check out tantivy's tutorial to index - // the English wikipedia. Tantivy's indexing is rather fast. + // the English wikipedia. Tantivy's indexing is rather fast. // Indexing 5 million articles of the English wikipedia takes // around 4 minutes on my computer! - - + + // ### Committing - // + // // At this point our documents are not searchable. // - // + // // We need to call .commit() explicitly to force the // index_writer to finish processing the documents in the queue, // flush the current index to the disk, and advertise @@ -137,22 +139,25 @@ fn run_example(index_path: &Path) -> tantivy::Result<()> { // // This call is blocking. try!(index_writer.commit()); - + // If `.commit()` returns correctly, then all of the // documents that have been added are guaranteed to be // persistently indexed. - // + // // In the scenario of a crash or a power failure, // tantivy behaves as if has rolled back to its last // commit. - - + + // # Searching // - // Let's search our index. We start - // by creating a searcher. There can be more - // than one searcher at a time. - // + // Let's search our index. Start by reloading + // searchers in the index. This should be done + // after every commit(). + try!(index.load_searchers()); + + // Afterwards create one (or more) searchers. + // // You should create a searcher // every time you start a "search query". let searcher = index.searcher(); @@ -161,46 +166,45 @@ fn run_example(index_path: &Path) -> tantivy::Result<()> { // Here, if the user does not specify which // field they want to search, tantivy will search // in both title and body. - let query_parser = QueryParser::new(index.schema(), vec!(title, body)); - + let query_parser = QueryParser::new(index.schema(), vec![title, body]); + // QueryParser may fail if the query is not in the right // format. 
For user facing applications, this can be a problem. // A ticket has been opened regarding this problem. let query = try!(query_parser.parse_query("sea whale")); - - + + // A query defines a set of documents, as // well as the way they should be scored. - // + // // A query created by the query parser is scored according // to a metric called Tf-Idf, and will consider // any document matching at least one of our terms. - - // ### Collectors + + // ### Collectors // - // We are not interested in all of the documents but + // We are not interested in all of the documents but // only in the top 10. Keeping track of our top 10 best documents // is the role of the TopCollector. - let mut top_collector = TopCollector::with_limit(10); - + // We can now perform our query. try!(searcher.search(&*query, &mut top_collector)); - // Our top collector now contains the 10 + // Our top collector now contains the 10 // most relevant doc ids... let doc_addresses = top_collector.docs(); - // The actual documents still need to be + // The actual documents still need to be // retrieved from Tantivy's store. - // + // // Since the body field was not configured as stored, // the document returned will only contain // a title. 
- + for doc_address in doc_addresses { - let retrieved_doc = try!(searcher.doc(&doc_address)); - println!("{}", schema.to_json(&retrieved_doc)); + let retrieved_doc = try!(searcher.doc(&doc_address)); + println!("{}", schema.to_json(&retrieved_doc)); } Ok(()) From 292dd6dcb65d993fc59704290ae0a2ead5fc2a03 Mon Sep 17 00:00:00 2001 From: Claus Matzinger Date: Mon, 13 Mar 2017 00:24:54 -0400 Subject: [PATCH 069/107] fixup --- examples/simple_search.rs | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/examples/simple_search.rs b/examples/simple_search.rs index d422b461b..430d7abf0 100644 --- a/examples/simple_search.rs +++ b/examples/simple_search.rs @@ -73,10 +73,8 @@ fn run_example(index_path: &Path) -> tantivy::Result<()> { // This single `IndexWriter` is already // multithreaded. // - // Here we use a buffer of 50MB. Using a bigger + // Here we use a buffer of 50MB per thread. Using a bigger // heap for the indexer can increase its throughput. - // This buffer will be split between the indexing - // threads. let mut index_writer = try!(index.writer(50_000_000)); // Let's index our documents! From da10fe3b4dfdaa03450fbcb966f278306c7ef4bf Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Mon, 13 Mar 2017 22:01:55 +0900 Subject: [PATCH 070/107] Various fixes. --- src/indexer/index_writer.rs | 7 ++ src/indexer/segment_entry.rs | 4 + src/indexer/segment_manager.rs | 74 ++++++++++++------ src/indexer/segment_register.rs | 26 ++++--- src/indexer/segment_updater.rs | 134 ++++++++++++++++++-------------- 5 files changed, 154 insertions(+), 91 deletions(-) diff --git a/src/indexer/index_writer.rs b/src/indexer/index_writer.rs index 56e9a41f8..daa97ffa7 100644 --- a/src/indexer/index_writer.rs +++ b/src/indexer/index_writer.rs @@ -504,6 +504,13 @@ impl IndexWriter { opstamp } + /// Returns the opstamp of the last successful commit. + /// + /// This is, for instance, the opstamp the index will + /// rollback to if there is a failure like a power surge. 
+ /// + /// This is also the opstamp of the commit that is currently + /// available for searchers. pub fn commit_opstamp(&self) -> u64 { self.committed_opstamp } diff --git a/src/indexer/segment_entry.rs b/src/indexer/segment_entry.rs index e18e84d47..74b45f7d8 100644 --- a/src/indexer/segment_entry.rs +++ b/src/indexer/segment_entry.rs @@ -77,6 +77,10 @@ impl SegmentEntry { self.state = SegmentState::InMerge; } + pub fn cancel_merge(&mut self,) { + self.state = SegmentState::Ready; + } + pub fn is_ready(&self,) -> bool { self.state == SegmentState::Ready } diff --git a/src/indexer/segment_manager.rs b/src/indexer/segment_manager.rs index 4528a412f..22302d942 100644 --- a/src/indexer/segment_manager.rs +++ b/src/indexer/segment_manager.rs @@ -36,18 +36,6 @@ impl Debug for SegmentManager { } } - -/// Returns the `SegmentMeta`s for (committed segment, uncommitted segments). -/// The result is consistent with other transactions. -/// -/// For instance, a segment will not appear in both committed and uncommitted -/// segments -pub fn get_all_segments(segment_manager: &SegmentManager,) -> (Vec, Vec) { - let registers_lock = segment_manager.read(); - (registers_lock.committed.get_all_segments(), - registers_lock.uncommitted.get_all_segments()) -} - pub fn get_mergeable_segments(segment_manager: &SegmentManager,) -> (Vec, Vec) { let registers_lock = segment_manager.read(); (registers_lock.committed.get_mergeable_segments(), @@ -160,6 +148,40 @@ impl SegmentManager { } } + + pub fn cancel_merge(&self, + before_merge_segment_ids: &[SegmentId], + after_merge_segment_id: SegmentId) { + + let mut registers_lock = self.write(); + + // we mark all segments are ready for merge. 
+ { + let target_segment_register: &mut SegmentRegister; + target_segment_register = { + if registers_lock.uncommitted.contains_all(&before_merge_segment_ids) { + &mut registers_lock.uncommitted + } + else if registers_lock.committed.contains_all(&before_merge_segment_ids) { + &mut registers_lock.committed + } + else { + warn!("couldn't find segment in SegmentManager"); + return; + } + }; + for segment_id in before_merge_segment_ids { + target_segment_register.cancel_merge(segment_id); + } + } + + // ... and we make sure the target segment entry + // can be garbage collected. + registers_lock.writing.remove(&after_merge_segment_id); + + } + + pub fn write_segment(&self, segment_id: SegmentId) { let mut registers_lock = self.write(); registers_lock.writing.insert(segment_id); @@ -176,21 +198,27 @@ impl SegmentManager { after_merge_segment_entry: SegmentEntry) { let mut registers_lock = self.write(); + registers_lock.writing.remove(&after_merge_segment_entry.segment_id()); - if registers_lock.uncommitted.contains_all(&before_merge_segment_ids) { - for segment_id in before_merge_segment_ids { - registers_lock.uncommitted.remove_segment(segment_id); + let mut target_register: &mut SegmentRegister = { + if registers_lock.uncommitted.contains_all(&before_merge_segment_ids) { + &mut registers_lock.uncommitted } - registers_lock.uncommitted.add_segment_entry(after_merge_segment_entry); - } - else if registers_lock.committed.contains_all(&before_merge_segment_ids) { - for segment_id in before_merge_segment_ids { - registers_lock.committed.remove_segment(segment_id); + else if registers_lock.committed.contains_all(&before_merge_segment_ids) { + &mut registers_lock.committed + } else { + warn!("couldn't find segment in SegmentManager"); + return; } - registers_lock.committed.add_segment_entry(after_merge_segment_entry); - } else { - warn!("couldn't find segment in SegmentManager"); + }; + for segment_id in before_merge_segment_ids { + 
target_register.remove_segment(segment_id); } + target_register.add_segment_entry(after_merge_segment_entry); + + + + } pub fn committed_segment_metas(&self,) -> Vec { diff --git a/src/indexer/segment_register.rs b/src/indexer/segment_register.rs index 76f200735..9c6eec651 100644 --- a/src/indexer/segment_register.rs +++ b/src/indexer/segment_register.rs @@ -68,13 +68,6 @@ impl SegmentRegister { segment_ids } - pub fn segment_ids(&self,) -> Vec { - self.segment_metas() - .into_iter() - .map(|segment_meta| segment_meta.id()) - .collect() - } - pub fn segment_entry(&self, segment_id: &SegmentId) -> Option { self.segment_states .get(&segment_id) @@ -96,6 +89,13 @@ impl SegmentRegister { self.segment_states.remove(segment_id); } + pub fn cancel_merge(&mut self, segment_id: &SegmentId) { + self.segment_states + .get_mut(segment_id) + .expect("Received a merge notification for a segment that is not registered") + .cancel_merge(); + } + pub fn start_merge(&mut self, segment_id: &SegmentId) { self.segment_states .get_mut(segment_id) @@ -124,6 +124,14 @@ mod tests { use core::SegmentMeta; use indexer::delete_queue::*; use super::*; + + fn segment_ids(segment_register: &SegmentRegister) -> Vec { + segment_register + .segment_metas() + .into_iter() + .map(|segment_meta| segment_meta.id()) + .collect() + } #[test] fn test_segment_register() { @@ -140,7 +148,7 @@ mod tests { segment_register.add_segment_entry(segment_entry); } assert_eq!(segment_register.segment_entry(&segment_id_a).unwrap().state(), SegmentState::Ready); - assert_eq!(segment_register.segment_ids(), vec!(segment_id_a)); + assert_eq!(segment_ids(&segment_register), vec!(segment_id_a)); { let segment_meta = SegmentMeta::new(segment_id_b); let segment_entry = SegmentEntry::new(segment_meta, delete_queue.cursor()); @@ -158,7 +166,7 @@ mod tests { let segment_entry = SegmentEntry::new(segment_meta_merged, delete_queue.cursor()); segment_register.add_segment_entry(segment_entry); } - 
assert_eq!(segment_register.segment_ids(), vec!(segment_id_merged)); + assert_eq!(segment_ids(&segment_register), vec!(segment_id_merged)); } } \ No newline at end of file diff --git a/src/indexer/segment_updater.rs b/src/indexer/segment_updater.rs index cb1033521..9eb1aa329 100644 --- a/src/indexer/segment_updater.rs +++ b/src/indexer/segment_updater.rs @@ -92,6 +92,59 @@ pub fn save_metas(segment_metas: Vec, pub struct SegmentUpdater(Arc); + +fn perform_merge(segment_ids: &[SegmentId], + segment_updater: &SegmentUpdater, + mut merged_segment: Segment, + target_opstamp: u64) -> Result { + // first we need to apply deletes to our segment. + info!("Start merge: {:?}", segment_ids); + + let ref index = segment_updater.0.index; + let schema = index.schema(); + let mut segment_entries = vec!(); + for segment_id in segment_ids { + if let Some(mut segment_entry) = segment_updater.0 + .segment_manager + .segment_entry(segment_id) { + let segment = index.segment(segment_entry.meta().clone()); + advance_deletes(segment, &mut segment_entry, target_opstamp)?; + segment_entries.push(segment_entry); + } + else { + error!("Error, had to abort merge as some of the segment is not managed anymore.a"); + return Err(Error::InvalidArgument(format!("Segment {:?} requested for merge is not managed.", segment_id))); + } + } + + let delete_cursor = segment_entries[0].delete_cursor().clone(); + + let segments: Vec = segment_entries + .iter() + .map(|segment_entry| { + index.segment(segment_entry.meta().clone()) + }) + .collect(); + + // An IndexMerger is like a "view" of our merged segments. + let merger: IndexMerger = IndexMerger::open(schema, &segments[..])?; + + // ... we just serialize this index merger in our new segment + // to merge the two segments. 
+ + let segment_serializer = + SegmentSerializer::for_segment(&mut merged_segment) + .expect("Creating index serializer failed"); + + let num_docs = merger.write(segment_serializer).expect("Serializing merged index failed"); + let mut segment_meta = SegmentMeta::new(merged_segment.id()); + segment_meta.set_max_doc(num_docs); + + let after_merge_segment_entry = SegmentEntry::new(segment_meta.clone(), delete_cursor); + Ok(after_merge_segment_entry) +} + + struct InnerSegmentUpdater { pool: CpuPool, index: Index, @@ -145,8 +198,6 @@ impl SegmentUpdater { self.0.merging_thread_id.fetch_add(1, Ordering::SeqCst) } - /// TODO check that we use this correctly taking - /// the laziness in account. fn run_async T>(&self, f: F) -> impl Future { let me_clone = self.clone(); self.0.pool.spawn_fn(move || { @@ -238,66 +289,26 @@ impl SegmentUpdater { // first we need to apply deletes to our segment. info!("Start merge: {:?}", segment_ids_vec); - let ref index = segment_updater_clone.0.index; - let schema = index.schema(); + let merged_segment = segment_updater_clone.new_segment(); + let merged_segment_id = merged_segment.id(); + let merge_result = perform_merge(&segment_ids_vec, &segment_updater_clone, merged_segment, target_opstamp); - let mut segment_entries = vec!(); - - - for segment_id in &segment_ids_vec { - if let Some(mut segment_entry) = segment_updater_clone.0 - .segment_manager - .segment_entry(segment_id) { - - let segment = index.segment(segment_entry.meta().clone()); - // TODO unwrap - advance_deletes(segment, &mut segment_entry, target_opstamp).unwrap(); - segment_entries.push(segment_entry); + match merge_result { + Ok(after_merge_segment_entry) => { + let merged_segment_meta = after_merge_segment_entry.meta().clone(); + segment_updater_clone + .end_merge(segment_ids_vec, after_merge_segment_entry) + .wait() + .expect("Segment updater thread is corrupted."); + merging_future_send.complete(merged_segment_meta); } - else { - error!("Error, had to abort merge as some 
of the segment is not managed anymore.a"); - return Err(Error::InvalidArgument(format!("Segment {:?} requested for merge is not managed.", segment_id))); + Err(_) => { + // ... cancel merge + warn!("Merge of {:?} was cancelled", segment_ids_vec); + segment_updater_clone.cancel_merge(&segment_ids_vec, merged_segment_id); + // merging_future_send will be dropped, sending an error to the future. } } - - let delete_cursor = segment_entries[0].delete_cursor().clone(); - - let segments: Vec = segment_entries - .iter() - .map(|segment_entry| { - index.segment(segment_entry.meta().clone()) - }) - .collect(); - - // An IndexMerger is like a "view" of our merged segments. - let merger: IndexMerger = IndexMerger::open(schema, &segments[..])?; - let mut merged_segment = index.new_segment(); - - // ... we just serialize this index merger in our new segment - // to merge the two segments. - - let segment_serializer = - SegmentSerializer - ::for_segment(&mut merged_segment) - .expect("Creating index serializer failed"); - - let num_docs = merger.write(segment_serializer).expect("Serializing merged index failed"); - let mut segment_meta = SegmentMeta::new(merged_segment.id()); - segment_meta.set_max_doc(num_docs); - - let before_merged_segment_ids = segment_entries - .iter() - .map(|segment_entry| segment_entry.segment_id()) - .collect::>(); - - let after_merge_segment_entry = SegmentEntry::new(segment_meta.clone(), delete_cursor); - - segment_updater_clone - .end_merge(before_merged_segment_ids, after_merge_segment_entry) - .wait() - .unwrap(); - - merging_future_send.complete(segment_meta); segment_updater_clone.0.merging_threads.write().unwrap().remove(&merging_thread_id); Ok(()) }); @@ -319,6 +330,12 @@ impl SegmentUpdater { } } + fn cancel_merge(&self, + before_merge_segment_ids: &[SegmentId], + after_merge_segment_entry: SegmentId) { + self.0.segment_manager.cancel_merge(&before_merge_segment_ids, after_merge_segment_entry); + } + fn end_merge(&self, 
before_merge_segment_ids: Vec, @@ -336,7 +353,6 @@ impl SegmentUpdater { segment_updater.0.segment_manager.end_merge(&before_merge_segment_ids, after_merge_segment_entry); segment_updater.save_metas(segment_updater.0.index.opstamp()); }) - } pub fn wait_merging_thread(&self) -> thread::Result<()> { From 50659147d12da30ff805740374744e5eaa8d24c6 Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Tue, 14 Mar 2017 12:04:21 +0900 Subject: [PATCH 071/107] NOBUG updated simple_search.html --- examples/html/simple_search.html | 134 +++++++++++++++++-------------- 1 file changed, 74 insertions(+), 60 deletions(-) diff --git a/examples/html/simple_search.html b/examples/html/simple_search.html index 1cfc7ac6e..1aa6b63ab 100644 --- a/examples/html/simple_search.html +++ b/examples/html/simple_search.html @@ -52,7 +52,7 @@
-

Let’s create a temporary directory for the +

Let’s create a temporary directory for the sake of this example

@@ -60,7 +60,7 @@ sake of this example

    if let Ok(dir) = TempDir::new("tantivy_example_dir") {
         run_example(dir.path()).unwrap();
         dir.close().unwrap();
-    }   
+    }
 }
 
 
@@ -78,7 +78,7 @@ sake of this example

Defining the schema

The Tantivy index requires a very strict schema. The schema declares which fields are in the index, -and for each field, its type and “the way it should +and for each field, its type and “the way it should be indexed”.

@@ -111,12 +111,12 @@ be indexed”.

We want full-text search for it, and we want to be able to retrieve the document after the search.

TEXT | STORED is some syntactic sugar to describe -that.

+that.

TEXT means the field should be tokenized and indexed, along with its term frequency and term positions.

STORED means that the field will also be saved in a compressed, row-oriented key-value store. -This store is useful to reconstruct the +This store is useful to reconstruct the documents that were selected during the search phase.

@@ -139,7 +139,7 @@ to retrieve the body after the search.

    schema_builder.add_text_field("body", TEXT);
-    
+
     let schema = schema_builder.build();
@@ -173,14 +173,12 @@ with our schema in the directory.

There must be only one writer at a time. This single IndexWriter is already multithreaded.

-

Here we use a buffer of 1 GB. Using a bigger -heap for the indexer can increase its throughput. -This buffer will be split between the indexing -threads.

+

Here we use a buffer of 50MB per thread. Using a bigger +heap for the indexer can increase its throughput.

-
    let mut index_writer = try!(index.writer(1_000_000_000));
+
    let mut index_writer = try!(index.writer(50_000_000));
@@ -213,10 +211,12 @@ one by one in a Document object.

    let title = schema.get_field("title").unwrap();
     let body = schema.get_field("body").unwrap();
-     
+
     let mut old_man_doc = Document::default();
     old_man_doc.add_text(title, "The Old Man and the Sea");
-    old_man_doc.add_text(body, "He was an old man who fished alone in a skiff in the Gulf Stream and he had gone eighty-four days now without taking a fish.");
+ old_man_doc.add_text(body, + "He was an old man who fished alone in a skiff in the Gulf Stream and \ + he had gone eighty-four days now without taking a fish."); @@ -231,7 +231,7 @@ one by one in a Document object.

-
    try!(index_writer.add_document(old_man_doc));
+
    index_writer.add_document(old_man_doc);
@@ -248,13 +248,13 @@ a document object directly from json.

-
    
+            
     let mice_and_men_doc = try!(schema.parse_document(r#"{
        "title": "Of Mice and Men",
        "body": "few miles south of Soledad, the Salinas River drops in close to the hillside bank and runs deep and green. The water is warm too, for it has slipped twinkling over the yellow sands in the sunlight before reaching the narrow pool. On one side of the river the golden foothill slopes curve up to the strong and rocky Gabilan Mountains, but on the valley side the water is lined with trees—willows fresh and green with every spring, carrying in their lower leaf junctures the debris of the winter’s flooding; and sycamores with mottled, white,recumbent limbs and branches that arch over the pool"  
     }"#));
-    
-    try!(index_writer.add_document(mice_and_men_doc));
+ + index_writer.add_document(mice_and_men_doc);
@@ -275,7 +275,7 @@ The following document has two titles.

"title": ["Frankenstein", "The Modern Promotheus"], "body": "You will rejoice to hear that no disaster has accompanied the commencement of an enterprise which you have regarded with such evil forebodings. I arrived here yesterday, and my first task is to assure my dear sister of my welfare and increasing confidence in the success of my undertaking." }"#)); - try!(index_writer.add_document(frankenstein_doc)); + index_writer.add_document(frankenstein_doc); @@ -288,7 +288,7 @@ The following document has two titles.

This is an example, so we will only index 3 documents here. You can check out tantivy’s tutorial to index -the English wikipedia. Tantivy’s indexing is rather fast. +the English wikipedia. Tantivy’s indexing is rather fast. Indexing 5 million articles of the English wikipedia takes around 4 minutes on my computer!

@@ -343,15 +343,13 @@ commit.

Searching

-

Let’s search our index. We start -by creating a searcher. There can be more -than one searcher at a time.

-

You should create a searcher -every time you start a “search query”.

+

Let’s search our index. Start by reloading +searchers in the index. This should be done +after every commit().

-
    let searcher = index.searcher();
+
    try!(index.load_searchers());
@@ -362,14 +360,13 @@ every time you start a “search query”.

-

The query parser can interpret human queries. -Here, if the user does not specify which -field they want to search, tantivy will search -in both title and body.

+

Afterwards create one (or more) searchers.

+

You should create a searcher +every time you start a “search query”.

-
    let query_parser = QueryParser::new(index.schema(), vec!(title, body));
+
    let searcher = index.searcher();
@@ -380,6 +377,24 @@ in both title and body.

+

The query parser can interpret human queries. +Here, if the user does not specify which +field they want to search, tantivy will search +in both title and body.

+ + + +
    let query_parser = QueryParser::new(index.schema(), vec![title, body]);
+ + + + +
  • +
    + +
    + +

    QueryParser may fail if the query is not in the right format. For user facing applications, this can be a problem. A ticket has been opened regarding this problem.

    @@ -391,11 +406,11 @@ A ticket has been opened regarding this problem.

  • -
  • +
  • - +

    A query defines a set of documents, as well as the way they should be scored.

    @@ -408,36 +423,20 @@ any document matching at least one of our terms.

  • -
  • -
    - -
    - -
    -

    Collectors

    -

    We are not interested in all of the documents but -only in the top 10. Keeping track of our top 10 best documents -is the role of the TopCollector.

    - -
    - -
        
    -    let mut top_collector = TopCollector::with_limit(10);
    - -
  • - -
  • -

    We can now perform our query.

    +

    Collectors

    +

    We are not interested in all of the documents but +only in the top 10. Keeping track of our top 10 best documents +is the role of the TopCollector.

    -
        try!(searcher.search(&query, &mut top_collector)));
    +
        let mut top_collector = TopCollector::with_limit(10);
  • @@ -448,12 +447,11 @@ is the role of the TopCollector.

    -

    Our top collector now contains the 10 -most relevant doc ids…

    +

    We can now perform our query.

    -
        let doc_addresses = top_collector.docs();
    +
        try!(searcher.search(&*query, &mut top_collector));
    @@ -464,7 +462,23 @@ most relevant doc ids…

    -

    The actual documents still need to be +

    Our top collector now contains the 10 +most relevant doc ids…

    + + + +
        let doc_addresses = top_collector.docs();
    + + + + +
  • +
    + +
    + +
    +

    The actual documents still need to be retrieved from Tantivy’s store.

    Since the body field was not configured as stored, the document returned will only contain @@ -472,10 +486,10 @@ a title.

    -
        
    +            
         for doc_address in doc_addresses {
    -         let retrieved_doc = try!(searcher.doc(&doc_address));
    -         println!("{}", schema.to_json(&retrieved_doc));
    +        let retrieved_doc = try!(searcher.doc(&doc_address));
    +        println!("{}", schema.to_json(&retrieved_doc));
         }
     
         Ok(())
    
    From 7c114b602de85c7772b2ecb43793e1ff9fedb15a Mon Sep 17 00:00:00 2001
    From: Laurentiu Nicola 
    Date: Sat, 18 Mar 2017 16:11:10 +0200
    Subject: [PATCH 072/107] Make directory syncing work on Windows
    
    ---
     src/directory/mmap_directory.rs | 22 +++++++++++++++++++---
     1 file changed, 19 insertions(+), 3 deletions(-)
    
    diff --git a/src/directory/mmap_directory.rs b/src/directory/mmap_directory.rs
    index 5015f3bb6..9b466f0dc 100644
    --- a/src/directory/mmap_directory.rs
    +++ b/src/directory/mmap_directory.rs
    @@ -219,12 +219,28 @@ impl MmapDirectory {
         /// Sync the root directory.
         /// In certain FS, this is required to persistently create
         /// a file.
    -    fn sync_directory(&self,) -> Result<(), io::Error> {
    -        let fd = try!(File::open(&self.root_path));
    +    fn sync_directory(&self) -> Result<(), io::Error> {
    +        let mut open_opts = OpenOptions::new();
    +
    +        // Linux needs read to be set, or otherwise returns EINVAL
    +        // and fails with EISDIR if write is set
    +        open_opts.read(true);
    +
    +        // On Windows, opening a directory requires FILE_FLAG_BACKUP_SEMANTICS
    +        // and calling sync_all() only works if write access is requested.
    +        #[cfg(windows)]
    +        {
    +            use std::os::windows::fs::OpenOptionsExt;
    +            const FILE_FLAG_BACKUP_SEMANTICS: u32 = 0x02000000;
    +
    +            open_opts.write(true)
    +                .custom_flags(FILE_FLAG_BACKUP_SEMANTICS);
    +        };
    +
    +        let fd = try!(open_opts.open(&self.root_path));
             try!(fd.sync_all());
             Ok(())
         }
    -
         /// Returns some statistical information
         /// about the Mmap cache.
         /// 
    
    From 30075176cbff25d34f738c51c5185cb1574889f2 Mon Sep 17 00:00:00 2001
    From: Paul Masurel 
    Date: Sun, 19 Mar 2017 10:52:54 +0900
    Subject: [PATCH 073/107] blop
    
    ---
     src/indexer/index_writer.rs     | 144 +++++++++++++++++++-------------
     src/indexer/segment_entry.rs    |  23 +++--
     src/indexer/segment_manager.rs  |   6 ++
     src/indexer/segment_register.rs |  12 ++-
     src/indexer/segment_updater.rs  |   2 +-
     5 files changed, 112 insertions(+), 75 deletions(-)
    
    diff --git a/src/indexer/index_writer.rs b/src/indexer/index_writer.rs
    index daa97ffa7..6b7a6419d 100644
    --- a/src/indexer/index_writer.rs
    +++ b/src/indexer/index_writer.rs
    @@ -154,6 +154,52 @@ pub fn open_index_writer(
     }
     
     
    +
    +pub fn compute_deleted_bitset(
    +    delete_bitset: &mut BitSet,
    +    segment_reader: &SegmentReader,
    +    delete_cursor: &mut DeleteCursor,
    +    doc_opstamps: DocToOpstampMapping,
    +    target_opstamp: u64) -> Result<(Option)> {
    +    
    +    
    +    loop {
    +        if let Some(delete_op) = delete_cursor.peek() {
    +            if delete_op.opstamp > target_opstamp {
    +                break;
    +            }
    +            else {
    +                // A delete operation should only affect
    +                // documents that were inserted before it.
    +                // 
    +                // Limit doc is the first document that is
    +                // not affected by the delete operation.
    +                let limit_doc = doc_opstamps.compute_doc_limit(delete_op.opstamp);
    +                if let Some(mut docset) = segment_reader.read_postings(&delete_op.term, SegmentPostingsOption::NoFreq) {
    +                    while docset.advance() {
    +                        let deleted_doc = docset.doc();
    +                        if deleted_doc < limit_doc {
    +                            delete_bitset.insert(deleted_doc as usize);
    +                        }
    +                    }
    +                }
    +            }
    +        }
    +        else {
    +            break;
    +        }
    +        delete_cursor.advance();
    +    }
    +
    +    if !delete_bitset.is_empty() {
    +        Ok(Some(delete_bitset))
    +    }
    +    else {
    +        Ok(None)
    +    }
    +}
    +
    +
     // TODO put delete bitset in segment entry
     // rather than DocToOpstamp.
     
    @@ -168,67 +214,40 @@ pub fn advance_deletes(
         target_opstamp: u64) -> Result<()> {
     
         {
    -        let doc_opstamps = segment_entry.reset_doc_to_stamp();
    +        
    +        let segment_reader = SegmentReader::open(segment.clone())?;
    +
    +        let mut delete_bitset: BitSet =
    +            match segment_entry.reset_delete_bitset() {
    +                Some(previous_delete_bitset) => {
    +                    previous_delete_bitset
    +                },
    +                None => {
    +                    BitSet::with_capacity(segment_reader.max_doc() as usize)
    +                }
    +            };
    +        
             let delete_cursor = segment_entry.delete_cursor();
     
    -        let segment_reader = SegmentReader::open(segment.clone())?;
    +        let new_deleted_bitset = compute_deleted_bitset(
    +            &segment_reader,
    +            delete_cursor,
    +            &mut delete_bitset,
    +            DocToOpstampMapping::None,
    +            target_opstamp)?;
             
    -        let mut delete_bitset = BitSet::with_capacity(segment_reader.max_doc() as usize);
    -        
    -        let mut last_opstamp_opt: Option = None;
    -
    -        let previous_delete_opstamp_opt = segment.meta().delete_opstamp();
    -
    -        loop {
    -
    -            if let Some(delete_op) = delete_cursor.peek() {
    -                if delete_op.opstamp > target_opstamp {
    -                    break;
    -                }
    -                else {
    -                    // let's skip operations that have already been deleted.0u32
    -                    if let Some(previous_delete_opstamp) = previous_delete_opstamp_opt {
    -                        if delete_op.opstamp <= previous_delete_opstamp {
    -                            continue;
    -                        }
    -                    }
    -
    -                    // A delete operation should only affect
    -                    // document that were inserted after it.
    -                    // 
    -                    // Limit doc helps identify the first document
    -                    // that may be affected by the delete operation.
    -                    let limit_doc = doc_opstamps.compute_doc_limit(delete_op.opstamp);
    -                    if let Some(mut docset) = segment_reader.read_postings(&delete_op.term, SegmentPostingsOption::NoFreq) {
    -                        while docset.advance() {
    -                            let deleted_doc = docset.doc();
    -                            if deleted_doc < limit_doc {
    -                                delete_bitset.insert(deleted_doc as usize);
    -                            }
    -                        }
    -                    }
    -                    last_opstamp_opt = Some(delete_op.opstamp);
    -                }
    -            }
    -            else {
    -                break;
    -            }
    -
    -            delete_cursor.advance();
    -        }
    -
             // we only write the result different
             // iff we ended ended up increasing the delete opstamp
             //
             // TODO just move the file if there was no new delete?
    -        if let Some(last_opstamp) = last_opstamp_opt {
    +        if let Some(mut delete_bitset) = new_deleted_bitset {
                 for doc in 0u32..segment_reader.max_doc() {
                     if segment_reader.is_deleted(doc) {
                         delete_bitset.insert(doc as usize);
                     }
                 }
                 let num_deleted_docs = delete_bitset.len();
    -            segment.set_delete_meta(num_deleted_docs as u32, last_opstamp);
    +            segment.set_delete_meta(num_deleted_docs as u32, target_opstamp);
                 let mut delete_file = segment.open_write(SegmentComponent::DELETE)?;    
                 write_delete_bitset(&delete_bitset, &mut delete_file)?;
             }
    @@ -244,11 +263,11 @@ fn index_documents(heap: &mut Heap,
                        generation: usize,
                        document_iterator: &mut Iterator,
                        segment_updater: &mut SegmentUpdater,
    -                   delete_cursor: DeleteCursor)
    +                   mut delete_cursor: DeleteCursor)
                        -> Result {
         heap.clear();
         let segment_id = segment.id();
    -    let mut segment_writer = try!(SegmentWriter::for_segment(heap, segment, &schema));
    +    let mut segment_writer = SegmentWriter::for_segment(heap, segment.clone(), &schema)?;
         for doc in document_iterator {
             try!(segment_writer.add_document(&doc, &schema));
             if segment_writer.is_buffer_full() {
    @@ -268,15 +287,28 @@ fn index_documents(heap: &mut Heap,
         let mut segment_meta = SegmentMeta::new(segment_id);
         segment_meta.set_max_doc(num_docs);
     
    -    let mut segment_entry = SegmentEntry::new(segment_meta, delete_cursor);
    -    segment_entry.set_doc_to_opstamp(DocToOpstampMapping::from(doc_opstamps));
    +    let last_docstamp: u64 = *(doc_opstamps.last().unwrap());
    +    
    +    let doc_to_opstamps = DocToOpstampMapping::from(doc_opstamps);
    +    let segment_reader = SegmentReader::open(segment)?;
    +    let delete_bitset = compute_deleted_bitset(
    +        &segment_reader,
    +        &mut delete_cursor,
    +        doc_to_opstamps,
    +        last_docstamp,
    +    )?;
    +
    +    let segment_entry = SegmentEntry::new(
    +        segment_meta,
    +        delete_cursor,
    +        delete_bitset);
         
         segment_updater
             .add_segment(generation, segment_entry)
             .wait()
             .map_err(|_| Error::ErrorInThread("Could not add segment.".to_string()))
     
    -}
    +}   
     
     
     impl IndexWriter {
    @@ -287,9 +319,8 @@ impl IndexWriter {
             // dropping the last reference to the segment_updater.
             drop(self.document_sender);
             
    -        let mut v = Vec::new();
    -        mem::swap(&mut v, &mut self.workers_join_handle);
    -        for join_handle in v {
    +        let former_workers_handles = mem::replace(&mut self.workers_join_handle, vec!());
    +        for join_handle in former_workers_handles {
                 try!(join_handle.join()
                     .expect("Indexing Worker thread panicked")
                     .map_err(|e| {
    @@ -325,7 +356,6 @@ impl IndexWriter {
                     
                     loop {
     
    -                    
                         let mut document_iterator = document_receiver_clone.clone()
                             .into_iter()
                             .peekable();
    diff --git a/src/indexer/segment_entry.rs b/src/indexer/segment_entry.rs
    index 74b45f7d8..ac44f33e8 100644
    --- a/src/indexer/segment_entry.rs
    +++ b/src/indexer/segment_entry.rs
    @@ -1,14 +1,14 @@
    -use indexer::doc_opstamp_mapping::DocToOpstampMapping;
     use core::SegmentMeta;
    +use bit_set::BitSet;
     use indexer::delete_queue::DeleteCursor;
     use core::SegmentId;
     use std::fmt;
    -use std::mem;
    +
     
     #[derive(Clone, Copy, PartialEq, Eq, Debug)]
     pub enum SegmentState {
         Ready,
    -    InMerge,    
    +    InMerge,        
     }
     
     impl SegmentState {
    @@ -24,7 +24,7 @@ impl SegmentState {
     pub struct SegmentEntry {
         meta: SegmentMeta,
         state: SegmentState,
    -    doc_to_opstamp: DocToOpstampMapping,
    +    delete_bitset: Option,
         delete_cursor: DeleteCursor,
     
     }
    @@ -32,17 +32,18 @@ pub struct SegmentEntry {
     impl SegmentEntry {
     
         pub fn new(segment_meta: SegmentMeta, 
    -               delete_cursor: DeleteCursor) -> SegmentEntry {
    +               delete_cursor: DeleteCursor,
    +               delete_bitset: Option) -> SegmentEntry {
             SegmentEntry {
                 meta: segment_meta,
                 state: SegmentState::Ready,
    -            doc_to_opstamp: DocToOpstampMapping::None,
    +            delete_bitset: delete_bitset,
                 delete_cursor: delete_cursor,
             }
         }
     
    -    pub fn reset_doc_to_stamp(&mut self,) -> DocToOpstampMapping {
    -        mem::replace(&mut self.doc_to_opstamp, DocToOpstampMapping::None)
    +    pub fn reset_delete_bitset(&mut self,) -> Option {
    +        self.delete_bitset.take()
         }
     
         pub fn set_meta(&mut self, segment_meta: SegmentMeta) {
    @@ -57,14 +58,10 @@ impl SegmentEntry {
             self.state
         }
     
    -     pub fn set_state(&mut self, state: SegmentState) {
    +    pub fn set_state(&mut self, state: SegmentState) {
             self.state = state;
         }
     
    -    pub fn set_doc_to_opstamp(&mut self, doc_to_opstamp: DocToOpstampMapping) {
    -        self.doc_to_opstamp = doc_to_opstamp;
    -    }
    -
         pub fn segment_id(&self) -> SegmentId {
             self.meta.id()
         }
    diff --git a/src/indexer/segment_manager.rs b/src/indexer/segment_manager.rs
    index 22302d942..232c88f15 100644
    --- a/src/indexer/segment_manager.rs
    +++ b/src/indexer/segment_manager.rs
    @@ -72,6 +72,12 @@ impl SegmentManager {
             files.insert(META_FILEPATH.clone());
             files.insert(LOCKFILE_FILEPATH.clone());
             
    +        // TODO do new segments
    +        // really never have a delete file
    +        // that might get garbage collected?
    +        // 
    +        // Consider matching new segments by prefix.
    +        
             let segment_metas: Vec =
                 registers_lock.committed
                     .get_all_segments()
    diff --git a/src/indexer/segment_register.rs b/src/indexer/segment_register.rs
    index 9c6eec651..902680164 100644
    --- a/src/indexer/segment_register.rs
    +++ b/src/indexer/segment_register.rs
    @@ -31,6 +31,7 @@ impl Debug for SegmentRegister {
         }
     }
     
    +
     impl SegmentRegister {
         
         pub fn clear(&mut self,) {
    @@ -107,7 +108,10 @@ impl SegmentRegister {
             let mut segment_states = HashMap::new();
             for segment_meta in segment_metas {
                 let segment_id = segment_meta.id();
    -            let segment_entry = SegmentEntry::new(segment_meta, delete_cursor.clone());
    +            let segment_entry = SegmentEntry::new(
    +                segment_meta,
    +                delete_cursor.clone(),
    +                None);
                 segment_states.insert(segment_id, segment_entry);
             }
             SegmentRegister {
    @@ -144,14 +148,14 @@ mod tests {
             
             {
                 let segment_meta = SegmentMeta::new(segment_id_a);
    -            let segment_entry = SegmentEntry::new(segment_meta,  delete_queue.cursor());
    +            let segment_entry = SegmentEntry::new(segment_meta,  delete_queue.cursor(), None);
                 segment_register.add_segment_entry(segment_entry);
             }
             assert_eq!(segment_register.segment_entry(&segment_id_a).unwrap().state(), SegmentState::Ready);
             assert_eq!(segment_ids(&segment_register), vec!(segment_id_a));
             {
                 let segment_meta = SegmentMeta::new(segment_id_b);
    -            let segment_entry = SegmentEntry::new(segment_meta,  delete_queue.cursor());
    +            let segment_entry = SegmentEntry::new(segment_meta,  delete_queue.cursor(), None);
                 segment_register.add_segment_entry(segment_entry);
             }
             assert_eq!(segment_register.segment_entry(&segment_id_b).unwrap().state(), SegmentState::Ready);
    @@ -163,7 +167,7 @@ mod tests {
             segment_register.remove_segment(&segment_id_b);
             {
                 let segment_meta_merged = SegmentMeta::new(segment_id_merged);
    -            let segment_entry = SegmentEntry::new(segment_meta_merged,  delete_queue.cursor());
    +            let segment_entry = SegmentEntry::new(segment_meta_merged,  delete_queue.cursor(), None);
                 segment_register.add_segment_entry(segment_entry);        
             }
             assert_eq!(segment_ids(&segment_register), vec!(segment_id_merged));        
    diff --git a/src/indexer/segment_updater.rs b/src/indexer/segment_updater.rs
    index 9eb1aa329..3d325d904 100644
    --- a/src/indexer/segment_updater.rs
    +++ b/src/indexer/segment_updater.rs
    @@ -140,7 +140,7 @@ fn perform_merge(segment_ids: &[SegmentId],
         let mut segment_meta = SegmentMeta::new(merged_segment.id());
         segment_meta.set_max_doc(num_docs);
         
    -    let after_merge_segment_entry = SegmentEntry::new(segment_meta.clone(), delete_cursor);
    +    let after_merge_segment_entry = SegmentEntry::new(segment_meta.clone(), delete_cursor, None);
         Ok(after_merge_segment_entry)
     }
     
    
    From ebcea0128c83e6ace0fe98ff6a01a315f0cde325 Mon Sep 17 00:00:00 2001
    From: Paul Masurel 
    Date: Sun, 19 Mar 2017 11:09:15 +0900
    Subject: [PATCH 074/107] Getting the FLAG from the winapi module.
    
    ---
     Cargo.toml                      | 6 +++---
     src/directory/mmap_directory.rs | 8 +++++---
     src/lib.rs                      | 3 +++
     3 files changed, 11 insertions(+), 6 deletions(-)
    
    diff --git a/Cargo.toml b/Cargo.toml
    index 7df8882db..f6fd20cc2 100644
    --- a/Cargo.toml
    +++ b/Cargo.toml
    @@ -24,8 +24,6 @@ rustc-serialize = "0.3"
     log = "0.3.6"
     combine = "2.2"
     tempdir = "0.3"
    -
    -
     bincode = "0.5"
     libc = {version = "0.2.20", optional=true}
     num_cpus = "1.2"
    @@ -37,10 +35,12 @@ uuid = { version = "0.4", features = ["v4", "rustc-serialize"] }
     chan = "0.1"
     version = "2"
     crossbeam = "0.2"
    -
     futures = "0.1.9"
     futures-cpupool = "0.1.2"
     
    +[target.'cfg(windows)'.dependencies]
    +winapi = "*"
    +
     [dev-dependencies]
     rand = "0.3"
     
    diff --git a/src/directory/mmap_directory.rs b/src/directory/mmap_directory.rs
    index 9b466f0dc..2f6224185 100644
    --- a/src/directory/mmap_directory.rs
    +++ b/src/directory/mmap_directory.rs
    @@ -23,6 +23,10 @@ use std::sync::RwLock;
     use std::sync::Weak;
     use tempdir::TempDir;
     
    +#[cfg(windows)]
    +use winapi::winbase::FILE_FLAG_BACKUP_SEMANTICS;
    +
    +
     
     fn open_mmap(full_path: &PathBuf) -> result::Result>, FileError> {
         let convert_file_error = |err: io::Error| {
    @@ -231,11 +235,9 @@ impl MmapDirectory {
             #[cfg(windows)]
             {
                 use std::os::windows::fs::OpenOptionsExt;
    -            const FILE_FLAG_BACKUP_SEMANTICS: u32 = 0x02000000;
    -
                 open_opts.write(true)
                     .custom_flags(FILE_FLAG_BACKUP_SEMANTICS);
    -        };
    +        }
     
             let fd = try!(open_opts.open(&self.root_path));
             try!(fd.sync_all());
    diff --git a/src/lib.rs b/src/lib.rs
    index 81d9acc24..46fb75abc 100644
    --- a/src/lib.rs
    +++ b/src/lib.rs
    @@ -51,6 +51,9 @@ extern crate futures_cpupool;
     #[cfg(feature="simdcompression")]
     extern crate libc;
     
    +#[cfg(windows)]
    +extern crate winapi;
    +
     #[cfg(test)] extern crate test;
     #[cfg(test)] extern crate rand;
     
    
    From 1e0ac31e11426d71238fcb86d43febf360f79e06 Mon Sep 17 00:00:00 2001
    From: Laurentiu Nicola 
    Date: Mon, 20 Mar 2017 23:12:48 +0200
    Subject: [PATCH 075/107] Clarify comment and use qualified import for the flag
    
    ---
     src/directory/mmap_directory.rs | 13 +++++--------
     1 file changed, 5 insertions(+), 8 deletions(-)
    
    diff --git a/src/directory/mmap_directory.rs b/src/directory/mmap_directory.rs
    index 2f6224185..6b2730a4b 100644
    --- a/src/directory/mmap_directory.rs
    +++ b/src/directory/mmap_directory.rs
    @@ -23,11 +23,6 @@ use std::sync::RwLock;
     use std::sync::Weak;
     use tempdir::TempDir;
     
    -#[cfg(windows)]
    -use winapi::winbase::FILE_FLAG_BACKUP_SEMANTICS;
    -
    -
    -
     fn open_mmap(full_path: &PathBuf) -> result::Result>, FileError> {
         let convert_file_error = |err: io::Error| {
             if err.kind() == io::ErrorKind::NotFound {
    @@ -226,8 +221,8 @@ impl MmapDirectory {
         fn sync_directory(&self) -> Result<(), io::Error> {
             let mut open_opts = OpenOptions::new();
     
    -        // Linux needs read to be set, or otherwise returns EINVAL
    -        // and fails with EISDIR if write is set
    +        // Linux needs read to be set, otherwise returns EINVAL
    +        // write must not be set, or it fails with EISDIR
             open_opts.read(true);
     
             // On Windows, opening a directory requires FILE_FLAG_BACKUP_SEMANTICS
    @@ -235,8 +230,10 @@ impl MmapDirectory {
             #[cfg(windows)]
             {
                 use std::os::windows::fs::OpenOptionsExt;
    +            use winapi::winbase;
    +
                 open_opts.write(true)
    -                .custom_flags(FILE_FLAG_BACKUP_SEMANTICS);
    +                .custom_flags(winbase::FILE_FLAG_BACKUP_SEMANTICS);
             }
     
             let fd = try!(open_opts.open(&self.root_path));
    
    From 92ce9b906b555074f9b730b4ed300c824b974c43 Mon Sep 17 00:00:00 2001
    From: Laurentiu Nicola 
    Date: Tue, 21 Mar 2017 00:25:04 +0200
    Subject: [PATCH 076/107] Avoid using make for building simdcomp
    
    ---
     build.rs                            | 61 +++++++++++++++++------------
     src/compression/compression_simd.rs |  1 -
     2 files changed, 37 insertions(+), 25 deletions(-)
    
    diff --git a/build.rs b/build.rs
    index fb7ba8110..639bbe93f 100644
    --- a/build.rs
    +++ b/build.rs
    @@ -1,37 +1,50 @@
    -#[cfg(feature= "simdcompression")]
    +#[cfg(feature = "simdcompression")]
     mod build {
         extern crate gcc;
     
    -    use std::process::Command;
    -
         pub fn build() {
    -        Command::new("make")
    -            .current_dir("cpp/simdcomp")
    -            .output()
    -            .unwrap_or_else(|e| { panic!("Failed to make simdcomp: {}", e) });
    -        gcc::Config::new()
    -                    .flag("-O3")
    -                    .flag("-mssse3")
    -                    .include("./cpp/simdcomp/include")
    -                    .object("cpp/simdcomp/avxbitpacking.o")
    -                    .object("cpp/simdcomp/simdintegratedbitpacking.o")
    -                    .object("cpp/simdcomp/simdbitpacking.o")
    -                    .object("cpp/simdcomp/simdpackedsearch.o")
    -                    .object("cpp/simdcomp/simdcomputil.o")
    -                    .object("cpp/simdcomp/simdpackedselect.o")
    -                    .object("cpp/simdcomp/simdfor.o")
    -                    .file("cpp/simdcomp_wrapper.c")
    -                    .compile("libsimdcomp.a");
    +        let mut config = gcc::Config::new();
    +        config.include("./cpp/simdcomp/include")
    +            .file("cpp/simdcomp/src/avxbitpacking.c")
    +            .file("cpp/simdcomp/src/simdintegratedbitpacking.c")
    +            .file("cpp/simdcomp/src/simdbitpacking.c")
    +            .file("cpp/simdcomp/src/simdpackedsearch.c")
    +            .file("cpp/simdcomp/src/simdcomputil.c")
    +            .file("cpp/simdcomp/src/simdpackedselect.c")
    +            .file("cpp/simdcomp/src/simdfor.c")
    +            .file("cpp/simdcomp_wrapper.c");
    +
    +        if !cfg!(debug_assertions) {
    +            config.opt_level(3);
    +
    +            if cfg!(target_env = "msvc") {
    +                config.define("NDEBUG", None)
    +                    .flag("/Gm-")
    +                    .flag("/GS-")
    +                    .flag("/Gy")
    +                    .flag("/Oi")
    +                    .flag("/GL");
    +            } else {
    +                config.flag("-msse4.1")
    +                    .flag("-march=native");
    +            }
    +        }
    +
    +        config.compile("libsimdcomp.a");
    +
    +        // Workaround for linking static libraries built with /GL
    +        // https://github.com/rust-lang/rust/issues/26003
    +        if !cfg!(debug_assertions) && cfg!(target_env = "msvc") {
    +            println!("cargo:rustc-link-lib=dylib=simdcomp");
    +        }
         }
     }
     
    -#[cfg(not(feature= "simdcompression"))]
    +#[cfg(not(feature = "simdcompression"))]
     mod build {
    -    pub fn build() {
    -    }
    +    pub fn build() {}
     }
     
    -
     fn main() {
         build::build();
     }
    diff --git a/src/compression/compression_simd.rs b/src/compression/compression_simd.rs
    index 9b8802927..308e13445 100644
    --- a/src/compression/compression_simd.rs
    +++ b/src/compression/compression_simd.rs
    @@ -5,7 +5,6 @@ const COMPRESSED_BLOCK_MAX_SIZE: usize = NUM_DOCS_PER_BLOCK * 4 + 1;
     mod simdcomp {
         use libc::size_t;
     
    -    #[link(name = "simdcomp")]
         extern {
             pub fn compress_sorted(
                 data: *const u32,
    
    From 2d169c44542667af913fbaf1f9a4b66d205f48ce Mon Sep 17 00:00:00 2001
    From: Laurentiu Nicola 
    Date: Tue, 21 Mar 2017 07:37:28 +0200
    Subject: [PATCH 077/107] Delay deleting the files in the test suite to make it
     work on Windows
    
    ---
     src/directory/mod.rs | 60 ++++++++++++++++++++++++++++----------------
     1 file changed, 38 insertions(+), 22 deletions(-)
    
    diff --git a/src/directory/mod.rs b/src/directory/mod.rs
    index 760c2c0d5..66c3ce7c7 100644
    --- a/src/directory/mod.rs
    +++ b/src/directory/mod.rs
    @@ -60,31 +60,37 @@ mod tests {
     
         fn test_simple(directory: &mut Directory) {
             {
    -            let mut write_file = directory.open_write(*TEST_PATH).unwrap();
    -            assert!(directory.exists(*TEST_PATH));
    -            write_file.write_all(&[4]).unwrap();
    -            write_file.write_all(&[3]).unwrap();
    -            write_file.write_all(&[7,3,5]).unwrap();
    -            write_file.flush().unwrap();
    +            {
    +                let mut write_file = directory.open_write(*TEST_PATH).unwrap();
    +                assert!(directory.exists(*TEST_PATH));
    +                write_file.write_all(&[4]).unwrap();
    +                write_file.write_all(&[3]).unwrap();
    +                write_file.write_all(&[7,3,5]).unwrap();
    +                write_file.flush().unwrap();
    +            }
    +            let read_file = directory.open_read(*TEST_PATH).unwrap();
    +            let data: &[u8] = &*read_file;
    +            assert_eq!(data, &[4u8, 3u8, 7u8, 3u8, 5u8]);
             }
    -        let read_file = directory.open_read(*TEST_PATH).unwrap();
    -        let data: &[u8] = &*read_file;
    -        assert_eq!(data, &[4u8, 3u8, 7u8, 3u8, 5u8]);
    +
             assert!(directory.delete(*TEST_PATH).is_ok());
             assert!(!directory.exists(*TEST_PATH));
         }
     
         fn test_seek(directory: &mut Directory) {
             {
    -            let mut write_file = directory.open_write(*TEST_PATH).unwrap();
    -            write_file.write_all(&[4, 3, 7,3,5]).unwrap();
    -            write_file.seek(SeekFrom::Start(0)).unwrap();
    -            write_file.write_all(&[3,1]).unwrap();
    -            write_file.flush().unwrap();
    +            {
    +                let mut write_file = directory.open_write(*TEST_PATH).unwrap();
    +                write_file.write_all(&[4, 3, 7,3,5]).unwrap();
    +                write_file.seek(SeekFrom::Start(0)).unwrap();
    +                write_file.write_all(&[3,1]).unwrap();
    +                write_file.flush().unwrap();
    +            }
    +            let read_file = directory.open_read(*TEST_PATH).unwrap();
    +            let data: &[u8] = &*read_file;
    +            assert_eq!(data, &[3u8, 1u8, 7u8, 3u8, 5u8]);
             }
    -        let read_file = directory.open_read(*TEST_PATH).unwrap();
    -        let data: &[u8] = &*read_file;
    -        assert_eq!(data, &[3u8, 1u8, 7u8, 3u8, 5u8]);
    +
             assert!(directory.delete(*TEST_PATH).is_ok());
         }
     
    @@ -118,14 +124,24 @@ mod tests {
             let mut write_file = directory.open_write(*TEST_PATH).unwrap();
             write_file.write_all(&[1, 2, 3, 4]).unwrap();
             write_file.flush().unwrap();
    -        let read_handle = directory.open_read(*TEST_PATH).unwrap();  
             {
    -            assert_eq!(&*read_handle, &[1u8, 2u8, 3u8, 4u8]);
    -            assert!(directory.delete(*TEST_PATH).is_ok());
    -            assert!(directory.delete(Path::new("SomeOtherPath")).is_err());
    -            assert_eq!(&*read_handle, &[1u8, 2u8, 3u8, 4u8]);
    +            let read_handle = directory.open_read(*TEST_PATH).unwrap();
    +            {
    +                assert_eq!(&*read_handle, &[1u8, 2u8, 3u8, 4u8]);
    +
    +                // Mapped files can't be deleted on Windows
    +                if !cfg!(windows) {
    +                    assert!(directory.delete(*TEST_PATH).is_ok());
    +                    assert_eq!(&*read_handle, &[1u8, 2u8, 3u8, 4u8]);
    +                }
    +
    +                assert!(directory.delete(Path::new("SomeOtherPath")).is_err());
    +            }
             }
    +
    +        assert!(directory.delete(*TEST_PATH).is_ok());
             assert!(directory.open_read(*TEST_PATH).is_err());
    +        assert!(directory.delete(*TEST_PATH).is_err());
         }
     
         fn test_directory(directory: &mut Directory) {
    
    From 2b5a4bbde2f181ad7a6a42f922038b0ec4dfee0c Mon Sep 17 00:00:00 2001
    From: Laurentiu Nicola 
    Date: Tue, 21 Mar 2017 07:45:35 +0200
    Subject: [PATCH 078/107] Don't delete twice on not(windows)
    
    ---
     src/directory/mod.rs | 5 ++++-
     1 file changed, 4 insertions(+), 1 deletion(-)
    
    diff --git a/src/directory/mod.rs b/src/directory/mod.rs
    index 66c3ce7c7..f0bf91101 100644
    --- a/src/directory/mod.rs
    +++ b/src/directory/mod.rs
    @@ -139,7 +139,10 @@ mod tests {
                 }
             }
     
    -        assert!(directory.delete(*TEST_PATH).is_ok());
    +        if cfg!(windows) {
    +            assert!(directory.delete(*TEST_PATH).is_ok());
    +        }
    +
             assert!(directory.open_read(*TEST_PATH).is_err());
             assert!(directory.delete(*TEST_PATH).is_err());
         }
    
    From b12a97abe4558b46be7291f2c0b37bc4c42f4ae7 Mon Sep 17 00:00:00 2001
    From: Paul Masurel 
    Date: Wed, 22 Mar 2017 08:57:09 +0900
    Subject: [PATCH 079/107] Add unit test for when deleting fails
    
    Test that when delete fails, we still keep
    the file as managed.
    
    Remove the error log for windows, as failing
    to delete is expected.
    ---
     src/directory/managed_directory.rs | 33 +++++++++++++++++++++++++++++-
     1 file changed, 32 insertions(+), 1 deletion(-)
    
    diff --git a/src/directory/managed_directory.rs b/src/directory/managed_directory.rs
    index 3eefcf996..f0cf5657e 100644
    --- a/src/directory/managed_directory.rs
    +++ b/src/directory/managed_directory.rs
    @@ -86,7 +86,9 @@ impl ManagedDirectory {
                                     managed_has_changed |= managed_paths_write.remove(&file_to_delete);
                                 }
                                 FileError::IOError(_) => {
    -                                error!("Failed to delete {:?}", file_to_delete);
    +                                if !cfg!(target_os = "windows") {
    +                                    error!("Failed to delete {:?}", file_to_delete);
    +                                }
                                 }
                                 
                             }
    @@ -244,4 +246,33 @@ mod tests {
             }   
         }
     
    +    #[test]
    +    fn test_managed_directory_gc_while_mmapped() {
    +        let tempdir = TempDir::new("index").unwrap();
    +        let tempdir_path = PathBuf::from(tempdir.path());
    +        let living_files = HashSet::new();
    +
    +        let mmap_directory = MmapDirectory::open(&tempdir_path).unwrap();
    +        let mut managed_directory = ManagedDirectory::new(mmap_directory).unwrap();
    +        managed_directory.atomic_write(*TEST_PATH1, &vec!(0u8,1u8)).unwrap();
    +        assert!(managed_directory.exists(*TEST_PATH1));
    +
    +        let _mmap_read = managed_directory.open_read(*TEST_PATH1).unwrap();            
    +        managed_directory.garbage_collect(living_files.clone());
    +        if cfg!(target_os = "windows") {
    +            // On Windows, gc should try and fail the file as it is mmapped.
    +            assert!(managed_directory.exists(*TEST_PATH1));
    +            // unmap should happen here.
    +            drop(_mmap_read);
    +            // The file should still be in the list of managed file and
    +            // eventually be deleted once mmap is released.
    +            managed_directory.garbage_collect(living_files);
    +            assert!(!managed_directory.exists(*TEST_PATH1));
    +        }
    +        else {
    +            assert!(!managed_directory.exists(*TEST_PATH1));
    +        }
    +
    +    }
    +
     }
    
    From b44a9cb89d456bc0addac092bcd463e386337d3f Mon Sep 17 00:00:00 2001
    From: Ashley Mannix 
    Date: Fri, 24 Mar 2017 16:11:51 +1000
    Subject: [PATCH 080/107] add appveyor config
    
    ---
     appveyor.yml | 23 +++++++++++++++++++++++
     1 file changed, 23 insertions(+)
     create mode 100644 appveyor.yml
    
    diff --git a/appveyor.yml b/appveyor.yml
    new file mode 100644
    index 000000000..d9b6472d6
    --- /dev/null
    +++ b/appveyor.yml
    @@ -0,0 +1,23 @@
    +# Appveyor configuration template for Rust using rustup for Rust installation
    +# https://github.com/starkat99/appveyor-rust
    +
    +os: Visual Studio 2017
    +
    +environment:
    +  matrix:
    +    - channel: nightly
    +      target: x86_64-pc-windows-msvc
    +    - channel: nightly
    +      target: x86_64-pc-windows-gnu
    +
    +install:
    +  - appveyor DownloadFile https://win.rustup.rs/ -FileName rustup-init.exe
    +  - rustup-init -yv --default-toolchain %channel% --default-host %target%
    +  - set PATH=%PATH%;%USERPROFILE%\.cargo\bin
    +  - rustc -vV
    +  - cargo -vV
    +
    +build: false
    +
    +test_script:
    +  - cargo test --verbose
    \ No newline at end of file
    
    From c8e12b6847295a61b0a8ea9225b2a1381cc9dc0b Mon Sep 17 00:00:00 2001
    From: Ashley Mannix 
    Date: Fri, 24 Mar 2017 16:22:32 +1000
    Subject: [PATCH 081/107] try set mingw path
    
    ---
     appveyor.yml | 2 ++
     1 file changed, 2 insertions(+)
    
    diff --git a/appveyor.yml b/appveyor.yml
    index d9b6472d6..f95671e9a 100644
    --- a/appveyor.yml
    +++ b/appveyor.yml
    @@ -9,11 +9,13 @@ environment:
           target: x86_64-pc-windows-msvc
         - channel: nightly
           target: x86_64-pc-windows-gnu
    +      msys_bits: 64
     
     install:
       - appveyor DownloadFile https://win.rustup.rs/ -FileName rustup-init.exe
       - rustup-init -yv --default-toolchain %channel% --default-host %target%
       - set PATH=%PATH%;%USERPROFILE%\.cargo\bin
    +  - if defined msys_bits set PATH=%PATH%;C:\msys64\mingw%msys_bits%\bin
       - rustc -vV
       - cargo -vV
     
    
    From 51cab391861045b200c4941d0e188a0757ed8bee Mon Sep 17 00:00:00 2001
    From: Ashley Mannix 
    Date: Fri, 24 Mar 2017 16:37:30 +1000
    Subject: [PATCH 082/107] drop to vs2015 image
    
    ---
     appveyor.yml | 5 ++---
     1 file changed, 2 insertions(+), 3 deletions(-)
    
    diff --git a/appveyor.yml b/appveyor.yml
    index f95671e9a..d3812cc52 100644
    --- a/appveyor.yml
    +++ b/appveyor.yml
    @@ -1,8 +1,7 @@
     # Appveyor configuration template for Rust using rustup for Rust installation
     # https://github.com/starkat99/appveyor-rust
     
    -os: Visual Studio 2017
    -
    +os: Visual Studio 2015
     environment:
       matrix:
         - channel: nightly
    @@ -22,4 +21,4 @@ install:
     build: false
     
     test_script:
    -  - cargo test --verbose
    \ No newline at end of file
    +  - cargo test --verbose
    
    From daa19b770a38f0e8cf8427b3231cd51dc8eca7eb Mon Sep 17 00:00:00 2001
    From: Paul Masurel 
    Date: Fri, 24 Mar 2017 17:42:24 +0900
    Subject: [PATCH 083/107] (hopefully) bugfix race condition on wait merging
     threads.
    
    ---
     src/indexer/segment_updater.rs | 10 +++++++---
     1 file changed, 7 insertions(+), 3 deletions(-)
    
    diff --git a/src/indexer/segment_updater.rs b/src/indexer/segment_updater.rs
    index 663e36609..eb9f11578 100644
    --- a/src/indexer/segment_updater.rs
    +++ b/src/indexer/segment_updater.rs
    @@ -308,7 +308,7 @@ impl SegmentUpdater {
             
         }
     
    -    pub fn wait_merging_thread(&self) -> thread::Result<()> {
    +    pub fn wait_merging_thread(&self) -> Result<()> {
             let mut new_merging_threads = HashMap::new();
             {
                 let mut merging_threads = self.0.merging_threads.write().unwrap();
    @@ -317,9 +317,13 @@ impl SegmentUpdater {
             for (_, merging_thread_handle) in new_merging_threads {
                 merging_thread_handle
                     .join()
    -                .map(|_| ())?
    +                .map(|_| ())
    +                .map_err(|_| {
    +                    Error::ErrorInThread("Merging thread failed.".to_string())
    +                })?
             }
    -        Ok(())
    +        // Our merging thread may have queued their completed
    +        self.run_async(move |_| {}).wait()
         }
     
     }
    
    From f50f557cfc0b12d5619cb99d6323daa238423cfa Mon Sep 17 00:00:00 2001
    From: Paul Masurel 
    Date: Sat, 25 Mar 2017 19:35:58 +0900
    Subject: [PATCH 084/107] issue/109 Remove futures from most of segment_updater
     API.
    
    ---
     src/indexer/index_writer.rs    | 10 +++-----
     src/indexer/segment_updater.rs | 46 ++++++++++++++++++----------------
     2 files changed, 28 insertions(+), 28 deletions(-)
    
    diff --git a/src/indexer/index_writer.rs b/src/indexer/index_writer.rs
    index 0ba779ec3..e75861d35 100644
    --- a/src/indexer/index_writer.rs
    +++ b/src/indexer/index_writer.rs
    @@ -248,10 +248,7 @@ fn index_documents(heap: &mut Heap,
         let mut segment_entry = SegmentEntry::new(segment_meta);
         segment_entry.set_doc_to_opstamp(DocToOpstampMapping::from(doc_opstamps));
     
    -    segment_updater
    -        .add_segment(generation, segment_entry)
    -        .wait()
    -        .map_err(|_| Error::ErrorInThread("Could not add segment.".to_string()))
    +    Ok(segment_updater.add_segment(generation, segment_entry))
     
     }
     
    @@ -420,7 +417,7 @@ impl IndexWriter {
             // No new document have been added in the meanwhile because `IndexWriter`
             // is not shared by different threads.
             
    -        rollback_future.wait().map_err(|_|
    +        rollback_future.map_err(|_|
                 Error::ErrorInThread("Error while waiting for rollback.".to_string())
             )?;
     
    @@ -480,8 +477,7 @@ impl IndexWriter {
     
             // wait for the segment update thread to have processed the info
             self.segment_updater
    -            .commit(self.committed_opstamp)
    -            .wait()?;
    +            .commit(self.committed_opstamp)?;
             
             self.delete_queue.clear();
             Ok(self.committed_opstamp)
    diff --git a/src/indexer/segment_updater.rs b/src/indexer/segment_updater.rs
    index eb9f11578..5a4de3ed1 100644
    --- a/src/indexer/segment_updater.rs
    +++ b/src/indexer/segment_updater.rs
    @@ -9,8 +9,9 @@ use core::SegmentMeta;
     use core::SerializableSegment;
     use directory::Directory;
     use Error;
    +use std::result;
     use futures_cpupool::CpuPool;
    -use futures::{Future, future};
    +use futures::Future;
     use futures::Canceled;
     use futures::oneshot;
     use indexer::{MergePolicy, DefaultMergePolicy};
    @@ -21,6 +22,7 @@ use indexer::merger::IndexMerger;
     use indexer::SegmentEntry;
     use indexer::SegmentSerializer;
     use Result;
    +use futures_cpupool::CpuFuture;
     use rustc_serialize::json;
     use schema::Schema;
     use std::borrow::BorrowMut;
    @@ -124,9 +126,7 @@ impl SegmentUpdater {
         pub fn new_segment(&self) -> Segment {
             let new_segment = self.0.index.new_segment();
             let segment_id = new_segment.id();
    -        self.run_async(move |segment_updater| {
    -            segment_updater.0.segment_manager.write_segment(segment_id);
    -        });
    +        self.0.segment_manager.write_segment(segment_id);
             new_segment
         }
     
    @@ -141,32 +141,32 @@ impl SegmentUpdater {
         fn get_merging_thread_id(&self) -> usize {
             self.0.merging_thread_id.fetch_add(1, Ordering::SeqCst)
         }
    -
    -
    -    fn run_async T>(&self, f: F) -> impl Future {
    +    
    +    fn run_async T>(&self, f: F) -> CpuFuture {
             let me_clone = self.clone();
             self.0.pool.spawn_fn(move || {
                 Ok(f(me_clone))
             })
         }
     
    -    pub fn rollback(&mut self, generation: usize) -> impl Future {
    +    pub fn rollback(&mut self, generation: usize) -> result::Result<(), Error> {
             self.0.generation.store(generation, Ordering::Release);
             self.run_async(|segment_updater| {
                 segment_updater.0.segment_manager.rollback();
    -        })
    +        }).wait()
         }
     
    -    pub fn add_segment(&self, generation: usize, segment_entry: SegmentEntry) -> impl Future {
    +    pub fn add_segment(&self, generation: usize, segment_entry: SegmentEntry) -> bool {
             if generation >= self.0.generation.load(Ordering::Acquire) {
    -            future::Either::A(self.run_async(|segment_updater| {
    +            self.run_async(|segment_updater| {
                     segment_updater.0.segment_manager.add_segment(segment_entry);
                     segment_updater.consider_merge_options();
                     true
    -            }))
    +            }).forget();
    +            true
             }
             else {
    -            future::Either::B(future::ok(false))
    +            false
             }
         }
     
    @@ -181,7 +181,7 @@ impl SegmentUpdater {
                 .collect()
         }
     
    -    pub fn commit(&self, opstamp: u64) -> impl Future {
    +    pub fn commit(&self, opstamp: u64) -> Result<()> {
             self.run_async(move |segment_updater| {
                 let segment_metas = segment_updater.purge_deletes().expect("Failed purge deletes");
                 segment_updater.0.segment_manager.commit(segment_metas);
    @@ -197,8 +197,7 @@ impl SegmentUpdater {
                 let living_files = segment_updater.0.segment_manager.list_files();
                 index.directory_mut().garbage_collect(living_files);
                 segment_updater.consider_merge_options();
    -            
    -        })
    +        }).wait()
         }
     
     
    @@ -266,9 +265,15 @@ impl SegmentUpdater {
                 let segment_entry = SegmentEntry::new(segment_meta);
                 segment_updater_clone
                     .end_merge(segment_metas.clone(), segment_entry.clone())
    -                .wait()
                     .unwrap();
    -            merging_future_send.complete(segment_entry.clone());
    +            
    +            // Send will fail if nobody is waiting for the result and
    +            // the receiver side got destroyed.
    +            //
    +            // This is not a problem.
    +            let _send_result = merging_future_send
    +                .send(segment_entry.clone());
    +            
                 segment_updater_clone.0.merging_threads.write().unwrap().remove(&merging_thread_id);
                 Ok(segment_entry)
             });
    @@ -293,7 +298,7 @@ impl SegmentUpdater {
         
         fn end_merge(&self, 
             merged_segment_metas: Vec,
    -        resulting_segment_entry: SegmentEntry) -> impl Future {
    +        resulting_segment_entry: SegmentEntry) -> Result<()> {
             
             self.run_async(move |segment_updater| {
                 segment_updater.0.segment_manager.end_merge(&merged_segment_metas, resulting_segment_entry);
    @@ -304,8 +309,7 @@ impl SegmentUpdater {
                     segment_updater.0.index.schema(),
                     segment_updater.0.index.opstamp(),
                     directory.borrow_mut()).expect("Could not save metas.");
    -        })
    -        
    +        }).wait()
         }
     
         pub fn wait_merging_thread(&self) -> Result<()> {
    
    From 68a956c6e75a03405e468f241715afd917eb32bc Mon Sep 17 00:00:00 2001
    From: Paul Masurel 
    Date: Sat, 25 Mar 2017 21:54:17 +0900
    Subject: [PATCH 085/107] issue/109  Showing debug! if test fails
    
    ---
     Cargo.toml                     |  1 +
     appveyor.yml                   |  2 ++
     src/indexer/index_writer.rs    | 12 ++++++++++--
     src/indexer/segment_updater.rs |  6 ++++--
     src/lib.rs                     |  3 +++
     5 files changed, 20 insertions(+), 4 deletions(-)
    
    diff --git a/Cargo.toml b/Cargo.toml
    index f6fd20cc2..85fd68405 100644
    --- a/Cargo.toml
    +++ b/Cargo.toml
    @@ -43,6 +43,7 @@ winapi = "*"
     
     [dev-dependencies]
     rand = "0.3"
    +env_logger = "0.4"
     
     [build-dependencies]
     gcc = {version = "0.3", optional=true}
    diff --git a/appveyor.yml b/appveyor.yml
    index d3812cc52..f7c19e093 100644
    --- a/appveyor.yml
    +++ b/appveyor.yml
    @@ -21,4 +21,6 @@ install:
     build: false
     
     test_script:
    +  - cargo test --no-run
    +  - SET RUST_LOG=debug
       - cargo test --verbose
    diff --git a/src/indexer/index_writer.rs b/src/indexer/index_writer.rs
    index e75861d35..24753654b 100644
    --- a/src/indexer/index_writer.rs
    +++ b/src/indexer/index_writer.rs
    @@ -263,6 +263,8 @@ impl IndexWriter {
             
             let mut v = Vec::new();
             mem::swap(&mut v, &mut self.workers_join_handle);
    +
    +        debug!("wait {} merging threads START", v.len());
             for join_handle in v {
                 try!(join_handle.join()
                     .expect("Indexing Worker thread panicked")
    @@ -272,11 +274,14 @@ impl IndexWriter {
             }
             drop(self.workers_join_handle);
     
    -        self.segment_updater
    +        let result = self.segment_updater
                 .wait_merging_thread()
                 .map_err(|_| 
                     Error::ErrorInThread("Failed to join merging thread.".to_string())
    -            )
    +            );
    +        
    +        debug!("wait merging threads DONE");
    +        result
         }
     
         /// Spawns a new worker thread for indexing.
    @@ -539,6 +544,7 @@ mod tests {
         use Index;
         use Term;
         use Error;
    +    use env_logger;
     
         #[test]
         fn test_lockfile_stops_duplicates() {
    @@ -619,6 +625,7 @@ mod tests {
     
         #[test]
         fn test_with_merges() {
    +        let _ = env_logger::init();
             let mut schema_builder = schema::SchemaBuilder::default();
             let text_field = schema_builder.add_text_field("text", schema::TEXT);
             let index = Index::create_in_ram(schema_builder.build());
    @@ -646,6 +653,7 @@ mod tests {
                 index_writer.commit().expect("commit failed");
                 index_writer.wait_merging_threads().expect("waiting merging thread failed");
                 index.load_searchers().unwrap();
    +            
                 assert_eq!(num_docs_containing("a"), 200);
                 assert_eq!(index.searchable_segments().unwrap().len(), 1);
             }
    diff --git a/src/indexer/segment_updater.rs b/src/indexer/segment_updater.rs
    index 5a4de3ed1..8dbc7eca5 100644
    --- a/src/indexer/segment_updater.rs
    +++ b/src/indexer/segment_updater.rs
    @@ -78,8 +78,9 @@ pub fn save_metas(segment_metas: Vec,
         };
         let mut w = vec!();
         try!(write!(&mut w, "{}\n", json::as_pretty_json(&metas)));
    -    Ok(directory
    -        .atomic_write(&META_FILEPATH, &w[..])?)
    +    let res = directory.atomic_write(&META_FILEPATH, &w[..])?;
    +    debug!("Saved metas {}", json::as_pretty_json(&metas));
    +    Ok(res)
             
     }
     
    @@ -301,6 +302,7 @@ impl SegmentUpdater {
             resulting_segment_entry: SegmentEntry) -> Result<()> {
             
             self.run_async(move |segment_updater| {
    +            debug!("End merge {:?}", merged_segment_metas);
                 segment_updater.0.segment_manager.end_merge(&merged_segment_metas, resulting_segment_entry);
                 let mut directory = segment_updater.0.index.directory().box_clone();
                 let segment_metas = segment_updater.0.segment_manager.committed_segment_metas();
    diff --git a/src/lib.rs b/src/lib.rs
    index 46fb75abc..f16305500 100644
    --- a/src/lib.rs
    +++ b/src/lib.rs
    @@ -48,6 +48,9 @@ extern crate bit_set;
     extern crate futures;
     extern crate futures_cpupool;
     
    +#[cfg(test)]
    +extern crate env_logger;
    +
     #[cfg(feature="simdcompression")]
     extern crate libc;
     
    
    From 84a060552da0837902ddd770a00de5ff19989261 Mon Sep 17 00:00:00 2001
    From: Paul Masurel 
    Date: Sat, 25 Mar 2017 22:34:34 +0900
    Subject: [PATCH 086/107] issue/109 trying to get proper logging in appveyor
    
    ---
     appveyor.yml | 4 +---
     1 file changed, 1 insertion(+), 3 deletions(-)
    
    diff --git a/appveyor.yml b/appveyor.yml
    index f7c19e093..fe19f58e5 100644
    --- a/appveyor.yml
    +++ b/appveyor.yml
    @@ -21,6 +21,4 @@ install:
     build: false
     
     test_script:
    -  - cargo test --no-run
    -  - SET RUST_LOG=debug
    -  - cargo test --verbose
    +  - REM SET RUST_LOG=tantivy,test & SET RUST_TEST_THREADS=1 & cargo test --verbose
    \ No newline at end of file
    
    From 45806951b1f00c7a66fd8c72232a17a1bb3b322b Mon Sep 17 00:00:00 2001
    From: Paul Masurel 
    Date: Sat, 25 Mar 2017 22:43:07 +0900
    Subject: [PATCH 087/107] added quotation mark
    
    ---
     appveyor.yml | 2 +-
     1 file changed, 1 insertion(+), 1 deletion(-)
    
    diff --git a/appveyor.yml b/appveyor.yml
    index fe19f58e5..789e24400 100644
    --- a/appveyor.yml
    +++ b/appveyor.yml
    @@ -21,4 +21,4 @@ install:
     build: false
     
     test_script:
    -  - REM SET RUST_LOG=tantivy,test & SET RUST_TEST_THREADS=1 & cargo test --verbose
    \ No newline at end of file
    +  - REM SET RUST_LOG=tantivy,test & cargo test --verbose
    \ No newline at end of file
    
    From ddb2b8d807b1405f220fe69699fdbae6e6f9804e Mon Sep 17 00:00:00 2001
    From: Paul Masurel 
    Date: Sun, 26 Mar 2017 17:33:42 +0900
    Subject: [PATCH 088/107] test passing.
    
    SegmentWriter creates a SegmentEntry which contains a delete_bitset
    ---
     src/indexer/index_writer.rs    | 59 +++++++++++++++++-----------------
     src/indexer/merger.rs          |  2 +-
     src/indexer/segment_updater.rs |  8 ++++-
     3 files changed, 38 insertions(+), 31 deletions(-)
    
    diff --git a/src/indexer/index_writer.rs b/src/indexer/index_writer.rs
    index 6b7a6419d..6a373f568 100644
    --- a/src/indexer/index_writer.rs
    +++ b/src/indexer/index_writer.rs
    @@ -160,8 +160,9 @@ pub fn compute_deleted_bitset(
         segment_reader: &SegmentReader,
         delete_cursor: &mut DeleteCursor,
         doc_opstamps: DocToOpstampMapping,
    -    target_opstamp: u64) -> Result<(Option)> {
    +    target_opstamp: u64) -> Result {
         
    +    let mut might_have_changed = false;
         
         loop {
             if let Some(delete_op) = delete_cursor.peek() {
    @@ -180,6 +181,7 @@ pub fn compute_deleted_bitset(
                             let deleted_doc = docset.doc();
                             if deleted_doc < limit_doc {
                                 delete_bitset.insert(deleted_doc as usize);
    +                            might_have_changed = true;
                             }
                         }
                     }
    @@ -190,13 +192,7 @@ pub fn compute_deleted_bitset(
             }
             delete_cursor.advance();
         }
    -
    -    if !delete_bitset.is_empty() {
    -        Ok(Some(delete_bitset))
    -    }
    -    else {
    -        Ok(None)
    -    }
    +    Ok(might_have_changed)
     }
     
     
    @@ -213,44 +209,45 @@ pub fn advance_deletes(
         segment_entry: &mut SegmentEntry,
         target_opstamp: u64) -> Result<()> {
     
    -    {
    -        
    +    {   
    +        if let Some(previous_opstamp) = segment_entry.meta().delete_opstamp() {
    +            // We are already up-to-date here.
    +            if target_opstamp == previous_opstamp {
    +                return Ok(());
    +            }
    +        }
             let segment_reader = SegmentReader::open(segment.clone())?;
    -
    +        let max_doc = segment_reader.max_doc();
    +        
             let mut delete_bitset: BitSet =
                 match segment_entry.reset_delete_bitset() {
                     Some(previous_delete_bitset) => {
                         previous_delete_bitset
                     },
                     None => {
    -                    BitSet::with_capacity(segment_reader.max_doc() as usize)
    +                    BitSet::with_capacity(max_doc as usize)
                     }
                 };
             
             let delete_cursor = segment_entry.delete_cursor();
     
    -        let new_deleted_bitset = compute_deleted_bitset(
    +        compute_deleted_bitset(
    +            &mut delete_bitset,
                 &segment_reader,
                 delete_cursor,
    -            &mut delete_bitset,
                 DocToOpstampMapping::None,
                 target_opstamp)?;
             
    -        // we only write the result different
    -        // iff we ended ended up increasing the delete opstamp
    -        //
    -        // TODO just move the file if there was no new delete?
    -        if let Some(mut delete_bitset) = new_deleted_bitset {
    -            for doc in 0u32..segment_reader.max_doc() {
    -                if segment_reader.is_deleted(doc) {
    -                    delete_bitset.insert(doc as usize);
    -                }
    +        for doc in 0u32..max_doc {
    +            if segment_reader.is_deleted(doc) {
    +                delete_bitset.insert(doc as usize);
                 }
    -            let num_deleted_docs = delete_bitset.len();
    -            segment.set_delete_meta(num_deleted_docs as u32, target_opstamp);
    -            let mut delete_file = segment.open_write(SegmentComponent::DELETE)?;    
    -            write_delete_bitset(&delete_bitset, &mut delete_file)?;
             }
    +
    +        let num_deleted_docs = delete_bitset.len();
    +        segment.set_delete_meta(num_deleted_docs as u32, target_opstamp);
    +        let mut delete_file = segment.open_write(SegmentComponent::DELETE)?;
    +        write_delete_bitset(&delete_bitset, &mut delete_file)?;
         }
         segment_entry.set_meta(segment.meta().clone());
     
    @@ -291,7 +288,9 @@ fn index_documents(heap: &mut Heap,
         
         let doc_to_opstamps = DocToOpstampMapping::from(doc_opstamps);
         let segment_reader = SegmentReader::open(segment)?;
    -    let delete_bitset = compute_deleted_bitset(
    +    let mut deleted_bitset = BitSet::with_capacity(num_docs as usize);
    +    let may_have_deletes = compute_deleted_bitset(
    +        &mut deleted_bitset,
             &segment_reader,
             &mut delete_cursor,
             doc_to_opstamps,
    @@ -301,7 +300,9 @@ fn index_documents(heap: &mut Heap,
         let segment_entry = SegmentEntry::new(
             segment_meta,
             delete_cursor,
    -        delete_bitset);
    +        { if may_have_deletes { Some(deleted_bitset) }
    +          else { None } }
    +    );
         
         segment_updater
             .add_segment(generation, segment_entry)
    diff --git a/src/indexer/merger.rs b/src/indexer/merger.rs
    index c087b93f4..596c4d696 100644
    --- a/src/indexer/merger.rs
    +++ b/src/indexer/merger.rs
    @@ -307,7 +307,7 @@ mod tests {
         use futures::Future;
     
         #[test]
    -    fn test_index_merger() {
    +    fn test_index_merger_no_deletes() {
             let mut schema_builder = schema::SchemaBuilder::default();
             let text_fieldtype = schema::TextOptions::default()
                                      .set_indexing_options(TextIndexingOptions::TokenizedWithFreq)
    diff --git a/src/indexer/segment_updater.rs b/src/indexer/segment_updater.rs
    index 3d325d904..c12ac59e7 100644
    --- a/src/indexer/segment_updater.rs
    +++ b/src/indexer/segment_updater.rs
    @@ -300,7 +300,13 @@ impl SegmentUpdater {
                             .end_merge(segment_ids_vec, after_merge_segment_entry)
                             .wait()
                             .expect("Segment updater thread is corrupted.");
    -                    merging_future_send.complete(merged_segment_meta);
    +                    
    +                    // the future may fail if the listener of the oneshot future 
    +                    // has been destroyed.
    +                    //
    +                    // This is not a problem here, so we just ignore any 
    +                    // possible error.
    +                    let _merging_future_res = merging_future_send.send(merged_segment_meta);
                     }
                     Err(_) => {
                         // ... cancel merge
    
    From 456dd3a60da67d24afe774312a64a77a3b616c98 Mon Sep 17 00:00:00 2001
    From: Paul Masurel 
    Date: Tue, 28 Mar 2017 16:49:48 +0900
    Subject: [PATCH 089/107] issue/96 merge
    
    ---
     src/directory/managed_directory.rs |  1 +
     src/indexer/index_writer.rs        | 22 +++++++------
     src/indexer/segment_updater.rs     | 53 +++++++-----------------------
     3 files changed, 24 insertions(+), 52 deletions(-)
    
    diff --git a/src/directory/managed_directory.rs b/src/directory/managed_directory.rs
    index f0cf5657e..e764471fc 100644
    --- a/src/directory/managed_directory.rs
    +++ b/src/directory/managed_directory.rs
    @@ -81,6 +81,7 @@ impl ManagedDirectory {
                             managed_has_changed |= managed_paths_write.remove(&file_to_delete);
                         }
                         Err(file_error) => {
    +                        error!("Failed to delete {:?}", file_to_delete);
                             match file_error {
                                 FileError::FileDoesNotExist(_) => {
                                     managed_has_changed |= managed_paths_write.remove(&file_to_delete);
    diff --git a/src/indexer/index_writer.rs b/src/indexer/index_writer.rs
    index 7a14fcfdf..33afe7108 100644
    --- a/src/indexer/index_writer.rs
    +++ b/src/indexer/index_writer.rs
    @@ -209,7 +209,7 @@ pub fn advance_deletes(
         segment_entry: &mut SegmentEntry,
         target_opstamp: u64) -> Result<()> {
     
    -    {   
    +    {
             if let Some(previous_opstamp) = segment_entry.meta().delete_opstamp() {
                 // We are already up-to-date here.
                 if target_opstamp == previous_opstamp {
    @@ -221,12 +221,10 @@ pub fn advance_deletes(
             
             let mut delete_bitset: BitSet =
                 match segment_entry.reset_delete_bitset() {
    -                Some(previous_delete_bitset) => {
    -                    previous_delete_bitset
    -                },
    -                None => {
    +                Some(previous_delete_bitset) =>
    +                    previous_delete_bitset,
    +                None =>
                         BitSet::with_capacity(max_doc as usize)
    -                }
                 };
             
             let delete_cursor = segment_entry.delete_cursor();
    @@ -245,9 +243,11 @@ pub fn advance_deletes(
             }
     
             let num_deleted_docs = delete_bitset.len();
    -        segment.set_delete_meta(num_deleted_docs as u32, target_opstamp);
    -        let mut delete_file = segment.open_write(SegmentComponent::DELETE)?;
    -        write_delete_bitset(&delete_bitset, &mut delete_file)?;
    +        if num_deleted_docs > 0 {
    +            segment.set_delete_meta(num_deleted_docs as u32, target_opstamp);
    +            let mut delete_file = segment.open_write(SegmentComponent::DELETE)?;
    +            write_delete_bitset(&delete_bitset, &mut delete_file)?;
    +        }
         }
         segment_entry.set_meta(segment.meta().clone());
     
    @@ -665,6 +665,7 @@ mod tests {
             index.searcher();
         }
     
    +
         #[test]
         fn test_with_merges() {
             let _ = env_logger::init();
    @@ -679,7 +680,7 @@ mod tests {
             {
                 // writing the segment
                 let mut index_writer = index.writer_with_num_threads(4, 4 * 30_000_000).unwrap();
    -            // create 10 segments with 100 tiny docs
    +            // create 8 segments with 100 tiny docs
                 for _doc in 0..100 {
                     let mut doc = Document::default();
                     doc.add_text(text_field, "a");
    @@ -698,6 +699,7 @@ mod tests {
                 
                 assert_eq!(num_docs_containing("a"), 200);
                 assert_eq!(index.searchable_segments().unwrap().len(), 1);
    +            
             }
         }
     
    diff --git a/src/indexer/segment_updater.rs b/src/indexer/segment_updater.rs
    index 4fe26dad8..74ea6ca2f 100644
    --- a/src/indexer/segment_updater.rs
    +++ b/src/indexer/segment_updater.rs
    @@ -265,12 +265,10 @@ impl SegmentUpdater {
                     let living_files = segment_updater.0.segment_manager.list_files();
                     index.directory_mut().garbage_collect(living_files);
                     segment_updater.consider_merge_options();
    +                
    +                // See #112
    +                // index.directory_mut().garbage_collect(living_files);
                 }
    -            
    -            let living_files = segment_updater.0.segment_manager.list_files();
    -            index.directory_mut().garbage_collect(living_files);
    -            
    -            segment_updater.consider_merge_options();
             }).wait()
         }
     
    @@ -315,48 +313,18 @@ impl SegmentUpdater {
                         // possible error.
                         let _merging_future_res = merging_future_send.send(merged_segment_meta);
                     }
    -                Err(_) => {
    +                Err(e) => {
                         // ... cancel merge
    -                    warn!("Merge of {:?} was cancelled", segment_ids_vec);
    +                    if cfg!(test) {
    +                        panic!("Merge failed.");
    +                    }
    +                    else {
    +                        error!("Merge of {:?} was cancelled: {:?}", segment_ids_vec, e);
    +                    }
                         segment_updater_clone.cancel_merge(&segment_ids_vec, merged_segment_id);
                         // merging_future_send will be dropped, sending an error to the future.
                     }
                 }
    -// <<<<<<< HEAD
    -// =======
    -            
    -//             let segments: Vec = segment_metas
    -//                 .iter()
    -//                 .cloned()
    -//                 .map(|segment_meta| index.segment(segment_meta))
    -//                 .collect();
    -            
    -//             // An IndexMerger is like a "view" of our merged segments.
    -//             let merger: IndexMerger = IndexMerger::open(schema, &segments[..])?;
    -//             let mut merged_segment = index.new_segment(); 
    -            
    -//             // ... we just serialize this index merger in our new segment
    -//             // to merge the two segments.
    -
    -//             let segment_serializer = SegmentSerializer::for_segment(&mut merged_segment).expect("Creating index serializer failed");
    -
    -//             let num_docs = merger.write(segment_serializer).expect("Serializing merged index failed");
    -//             let mut segment_meta = SegmentMeta::new(merged_segment.id());
    -//             segment_meta.set_max_doc(num_docs);
    -            
    -//             let segment_entry = SegmentEntry::new(segment_meta);
    -//             segment_updater_clone
    -//                 .end_merge(segment_metas.clone(), segment_entry.clone())
    -//                 .unwrap();
    -            
    -//             // Send will fail if nobody is waiting for the result and
    -//             // the receiver side got destroyed.
    -//             //
    -//             // This is not a problem.
    -//             let _send_result = merging_future_send
    -//                 .send(segment_entry.clone());
    -            
    -// >>>>>>> master
                 segment_updater_clone.0.merging_threads.write().unwrap().remove(&merging_thread_id);
                 Ok(())
             });
    @@ -410,6 +378,7 @@ impl SegmentUpdater {
                 let mut merging_threads = self.0.merging_threads.write().unwrap();
                 mem::swap(&mut new_merging_threads, merging_threads.deref_mut());
             }
    +        debug!("wait merging thread {}", new_merging_threads.len());
             for (_, merging_thread_handle) in new_merging_threads {
                 merging_thread_handle
                     .join()
    
    From f0dc0de4b79d2d99db4f59bf2be1c6267b25e7e5 Mon Sep 17 00:00:00 2001
    From: Paul Masurel 
    Date: Wed, 29 Mar 2017 11:26:18 +0900
    Subject: [PATCH 090/107] Added helper to create Vec with a given size
    
    ---
     src/common/mod.rs              | 7 +++++++
     src/datastruct/stacker/heap.rs | 7 ++-----
     src/schema/term.rs             | 4 ++--
     3 files changed, 11 insertions(+), 7 deletions(-)
    
    diff --git a/src/common/mod.rs b/src/common/mod.rs
    index 0ab876a81..d2d41cef6 100644
    --- a/src/common/mod.rs
    +++ b/src/common/mod.rs
    @@ -30,3 +30,10 @@ pub trait HasLen {
     }
     
     
    +pub fn create_vec_with_len(capacity: usize) -> Vec {
    +    let mut v = Vec::with_capacity(capacity);
    +    unsafe {
    +        v.set_len(capacity);
    +    }
    +    v
    +}
    \ No newline at end of file
    diff --git a/src/datastruct/stacker/heap.rs b/src/datastruct/stacker/heap.rs
    index c3b8d0a27..cd8d16e89 100644
    --- a/src/datastruct/stacker/heap.rs
    +++ b/src/datastruct/stacker/heap.rs
    @@ -1,5 +1,6 @@
     use std::cell::UnsafeCell;
     use std::mem;
    +use common::create_vec_with_len;
     use std::ptr;
     
     /// `BytesRef` refers to a slice in tantivy's custom `Heap`.
    @@ -109,11 +110,7 @@ struct InnerHeap {
     /// We use this unsafe trick to make unit test
     /// way faster.
     fn allocate_fast(num_bytes: usize) -> Vec {
    -    let mut buffer = Vec::with_capacity(num_bytes);
    -    unsafe {
    -        buffer.set_len(num_bytes);
    -    }
    -    buffer
    +    create_vec_with_len(num_bytes)
     }
     
     impl InnerHeap {
    diff --git a/src/schema/term.rs b/src/schema/term.rs
    index cc727999a..656c9feeb 100644
    --- a/src/schema/term.rs
    +++ b/src/schema/term.rs
    @@ -1,6 +1,7 @@
     use std::fmt;
     
     use common::BinarySerializable;
    +use common::create_vec_with_len;
     use byteorder::{BigEndian, ByteOrder};
     use super::Field;
     use std::str;
    @@ -44,8 +45,7 @@ impl Term {
         /// The first byte is `1`, and the 4 following bytes are that of the u32.
         pub fn from_field_u32(field: Field, val: u32) -> Term {
             const U32_TERM_LEN: usize = 1 + 4;
    -        let mut buffer = Vec::with_capacity(U32_TERM_LEN);
    -        unsafe { buffer.set_len(U32_TERM_LEN) };
    +        let mut buffer = create_vec_with_len(U32_TERM_LEN);
             buffer[0] = field.0;
             // we want BigEndian here to have lexicographic order
             // match the natural order of vals.
    
    From b22c6b86c72c6e934f525a1786197fe1328a21ad Mon Sep 17 00:00:00 2001
    From: Karl Hobley 
    Date: Thu, 30 Mar 2017 13:43:03 +0100
    Subject: [PATCH 091/107] Mark "cpp" folder as linguist-vendored in
     .gitattributes
    
    This repo is currently being detected as a C project because of some vendored libraries in the "cpp" folder.
    
    According to https://github.com/github/linguist#using-gitattributes you can use ``.gitattributes`` to tell GitHub not to count this folder when detecting the language.
    ---
     .gitattributes | 1 +
     1 file changed, 1 insertion(+)
     create mode 100644 .gitattributes
    
    diff --git a/.gitattributes b/.gitattributes
    new file mode 100644
    index 000000000..58317d305
    --- /dev/null
    +++ b/.gitattributes
    @@ -0,0 +1 @@
    +cpp/* linguist-vendored
    
    From 4fc7bc5f09cc5ee538d088aa64a8231c2098630b Mon Sep 17 00:00:00 2001
    From: Paul Masurel 
    Date: Fri, 31 Mar 2017 18:54:23 +0900
    Subject: [PATCH 092/107] Added helper to create Vec with a given size
    
    ---
     src/indexer/delete_queue2.rs | 206 +++++++++++++++++++++++++++++++++++
     1 file changed, 206 insertions(+)
     create mode 100644 src/indexer/delete_queue2.rs
    
    diff --git a/src/indexer/delete_queue2.rs b/src/indexer/delete_queue2.rs
    new file mode 100644
    index 000000000..e2b8230d1
    --- /dev/null
    +++ b/src/indexer/delete_queue2.rs
    @@ -0,0 +1,206 @@
    +use super::operation::DeleteOperation;
    +use std::sync::{Arc, RwLock};
    +use std::mem;
    +use std::ops::DerefMut;
    +
    +
    +#[derive(Clone, Default)]
    +struct DeleteQueue {
    +    writer: Arc>>,
    +    next_block: Option,
    +}
    +
    +impl DeleteQueue {
    +
    +    pub fn new() -> Arc {
    +        let mut delete_queue = Arc::new(DeleteQueue::default());
    +        delete_queue.next_block = Some(
    +            NextBlock::from(delete_queue)
    +        );
    +        delete_queue
    +    }
    +
    +    pub fn cursor(&self) -> Cursor {
    +        
    +        Cursor {
    +            current_block: Arc,
    +            pos: 0,
    +        }
    +    }
    +
    +    pub fn push(&self, delete_operation: DeleteOperation) {
    +        let mut write_lock = self.writer
    +            .write()
    +            .expect("Failed to acquire write lock on delete queue writer")
    +            .push(delete_operation);
    +    }
    +
    +    fn flush(&self) -> Option> {
    +        let mut write_lock = self
    +            .writer
    +            .write()
    +            .expect("Failed to acquire write lock on delete queue writer");
    +        if write_lock.is_empty() {
    +            return None;
    +        }
    +        Some(mem::replace(write_lock.deref_mut(), vec!()))
    +    }
    +}
    +
    +enum InnerNextBlock {
    +    Writer(Arc),
    +    Closed(Arc),
    +    Terminated,
    +}
    +
    +struct NextBlock(RwLock);
    +
    +impl From> for NextBlock {
    +    fn from(writer_arc: Arc) -> NextBlock {
    +        NextBlock(RwLock::new(InnerNextBlock::Writer(writer_arc)))
    +    }
    +}
    +
    +impl NextBlock {   
    +    pub fn next_block(&self) -> Option> {
    +        {
    +            let next_read_lock = self.0
    +                .read()
    +                .expect("Failed to acquire write lock in delete queue");
    +            match *next_read_lock {
    +                InnerNextBlock::Terminated => {
    +                    return None;
    +                }
    +                InnerNextBlock::Closed(ref block) => {
    +                    return Some(block.clone());
    +                }
    +                _ => {}
    +            }
    +        }
    +        let delete_operations;
    +        let writer_arc;
    +        {
    +            let mut next_write_lock = self.0
    +                .write()
    +                .expect("Failed to acquire write lock in delete queue");
    +            match *next_write_lock {
    +                InnerNextBlock::Terminated => {
    +                    return None;
    +                }
    +                InnerNextBlock::Closed(ref block) => {
    +                    return Some(block.clone());
    +                }
    +                InnerNextBlock::Writer(ref writer) => {
    +                    match writer.flush() {
    +                        Some(flushed_delete_operations) => {
    +                            delete_operations = flushed_delete_operations;
    +                        }
    +                        None => {
    +                            return None;
    +                        }
    +                    }
    +                    writer_arc = writer.clone();
    +                }
    +            }
    +            let next_block = Arc::new(Block {
    +                operations: Arc::new(delete_operations),
    +                next: NextBlock::from(writer_arc),
    +            });
    +            *next_write_lock.deref_mut() = InnerNextBlock::Closed(next_block.clone()); // TODO fix
    +            return Some(next_block)
    +        }
    +    }
    +}
    +
    +struct Block {
    +    operations: Arc>,
    +    next: NextBlock,
    +}
    +
    +
    +#[derive(Clone)]
    +struct Cursor {
    +    current_block: Arc,
    +    pos: usize,
    +}
    +
    +impl Cursor {   
    +    fn next<'a>(&'a mut self) -> Option<&'a DeleteOperation> {
    +        if self.pos >= self.current_block.operations.len() {
    +            // we have consumed our operations entirely.
    +            // let's ask our writer if he has more for us.
    +            // self.go_next_block();
    +            match self.current_block.next.next_block() {
    +                Some(block) => {
    +                    self.current_block = block;
    +                    self.pos = 0;
    +                }
    +                None => {
    +                    return None;
    +                }
    +            }
    +        }
    +        let operation = &self.current_block.operations[self.pos];
    +        self.pos += 1;
    +        return Some(operation);
    +    }
    +}
    +
    +
    +
    +
    +
    +
    +#[cfg(test)]
    +mod tests {
    +
    +    use super::{DeleteQueue, DeleteOperation};
    +    use schema::{Term, Field};
    +
    +    #[test]
    +    fn test_deletequeue() {
    +        let delete_queue = DeleteQueue::new();
    +        
    +        let make_op = |i: usize| {
    +            let field = Field(1u8);
    +            DeleteOperation {
    +                opstamp: i as u64,
    +                term: Term::from_field_u32(field, i as u32)
    +            }
    +        };
    +
    +        delete_queue.push(make_op(1));
    +        delete_queue.push(make_op(2));
    +
    +        let snapshot = delete_queue.cursor();
    +        {
    +            let mut operations_it = snapshot.clone();
    +            assert_eq!(operations_it.next().unwrap().opstamp, 1);
    +            assert_eq!(operations_it.next().unwrap().opstamp, 2);
    +            assert!(operations_it.next().is_none());
    +        }
    +        {   
    +            let mut operations_it = snapshot.clone();
    +            assert_eq!(operations_it.next().unwrap().opstamp, 1);
    +            assert_eq!(operations_it.next().unwrap().opstamp, 2);
    +            assert!(operations_it.next().is_none());
    +        }
    +        
    +        // // operations does not own a lock on the queue.
    +        // delete_queue.push(make_op(3));
    +        // let snapshot2 = delete_queue.snapshot();
    +        // {
    +        //     // operations is not affected by
    +        //     // the push that occurs after.
    +        //     let mut operations_it = snapshot.iter();
    +        //     let mut operations2_it = snapshot2.iter();
    +        //     assert_eq!(operations_it.next().unwrap().opstamp, 1);
    +        //     assert_eq!(operations2_it.next().unwrap().opstamp, 1);
    +        //     assert_eq!(operations_it.next().unwrap().opstamp, 2);
    +        //     assert_eq!(operations2_it.next().unwrap().opstamp, 2);
    +        //     assert!(operations_it.next().is_none());
    +        //     assert_eq!(operations2_it.next().unwrap().opstamp, 3);
    +        //     assert!(operations2_it.next().is_none());
    +        // }
    +    }
    +}
    \ No newline at end of file
    
    From afd08a7bbc8308b614d5e2efc7b68cd524bba0be Mon Sep 17 00:00:00 2001
    From: Paul Masurel 
    Date: Sat, 1 Apr 2017 21:01:10 +0900
    Subject: [PATCH 093/107] issue/96 Changed datastruct for the delete queue.
    
    ---
     src/indexer/delete_queue.rs    | 362 +++++++++++++++++++++++++--------
     src/indexer/delete_queue2.rs   | 206 -------------------
     src/indexer/index_writer.rs    |   2 +-
     src/indexer/mod.rs             |   3 -
     src/indexer/segment_updater.rs |   3 +-
     5 files changed, 276 insertions(+), 300 deletions(-)
     delete mode 100644 src/indexer/delete_queue2.rs
    
    diff --git a/src/indexer/delete_queue.rs b/src/indexer/delete_queue.rs
    index 0286454c7..1f476dad6 100644
    --- a/src/indexer/delete_queue.rs
    +++ b/src/indexer/delete_queue.rs
    @@ -1,90 +1,273 @@
     use super::operation::DeleteOperation;
     use std::sync::{Arc, RwLock};
    -
    -/// This implementation assumes that we
    -/// have a lot more write operation than read operations.
    +use std::mem;
    +use std::ops::DerefMut;
     
     
    -type InnerDeleteQueue = Arc>>;
    -
    -// TODO very inefficient.
    -// fix this once the refactoring/bugfix is done
    -#[derive(Clone)]
    -pub struct DeleteCursor {
    -    cursor: usize,
    -    operations: InnerDeleteQueue,
    +// The DeleteQueue is similar in conceptually to a multiple
    +// consumer single producer broadcast channel.
    +// 
    +// All consumer will receive all messages.
    +// 
    +// Consumer of the delete queue are holding a `DeleteCursor`,
    +// which points to a specific place of the `DeleteQueue`.
    +// 
    +// New consumer can be created in two ways
    +// - calling `delete_queue.cursor()` returns a cursor, that 
    +//   will include all future delete operation (and no past operations).
    +// - cloning an existing cursor returns a new cursor, that
    +//   is at the exact same position, and can now advance independantly 
    +//   from the original cursor.
    +#[derive(Default)]
    +struct InnerDeleteQueue {
    +    writer: Vec,
    +    last_block: Option>, // TODO last block... is that ok.
     }
     
    -impl DeleteCursor {
    -
    -    pub fn skip_to(&mut self, target_opstamp: u64) {
    -        while let Some(operation) = self.peek() {
    -            if operation.opstamp >= target_opstamp {
    -                break;
    -            }
    -            self.advance()
    -        }
    -    }
    -
    -    pub fn advance(&mut self) {
    -        let read = self.operations.read().unwrap();
    -        if self.cursor < read.len()  {
    -            self.cursor += 1;
    -        }
    -    }
    -
    -    pub fn peek(&self,) -> Option {
    -        let read = self.operations.read().unwrap();
    -        if self.cursor >= read.len() {
    -            None
    -        }
    -        else {
    -            let operation = read[self.cursor].clone();
    -            Some(operation)
    -        }
    -    }
    -}
    -
    -// TODO remove copy
    -impl Iterator for DeleteCursor {
    -    
    -    type Item=DeleteOperation;
    -    
    -    fn next(&mut self) -> Option{
    -        let read = self.operations.read().unwrap();
    -        if self.cursor >= read.len() {
    -            None
    -        }
    -        else {
    -            let operation = read[self.cursor].clone();
    -            self.cursor += 1;
    -            Some(operation)
    -        }
    -    }
    -}
    -
    -
     #[derive(Clone, Default)]
    -pub struct DeleteQueue(InnerDeleteQueue);
    +pub struct DeleteQueue {
    +    inner: Arc>,
    +}
    +
     
     impl DeleteQueue {
     
    +    // Creates a new delete queue.
         pub fn new() -> DeleteQueue {
    -        DeleteQueue::default()
    -    }
    +        
    +        let delete_queue = DeleteQueue {
    +            inner: Arc::new(RwLock::new(InnerDeleteQueue::default()))
    +        };
    +            
    +        let next_block = NextBlock::from(delete_queue.clone());
    +        {
    +            let mut delete_queue_wlock = delete_queue.inner.write().unwrap();
    +            delete_queue_wlock.last_block = Some(
    +                Arc::new(Block {
    +                    operations: Arc::default(),
    +                    next: next_block,
    +                })
    +            );
    +        }
     
    -    pub fn push(&self, delete_operation: DeleteOperation) {
    -        self.0.write().unwrap().push(delete_operation);
    +        delete_queue
         }
    +    
     
    +    // Creates a new cursor that makes it possible to 
    +    // consume future delete operations.
    +    // 
    +    // Past delete operations are not accessible.
         pub fn cursor(&self) -> DeleteCursor {
    +        let last_block = self.inner
    +            .read()
    +            .unwrap()
    +            .last_block
    +            .clone()
    +            .expect("Failed to unwrap last_block. This should never happen
    +                as the Option<> is only here to make
    +                initialization possible");
    +        let operations_len = last_block.operations.len();
             DeleteCursor {
    -            cursor: 0,
    -            operations: self.0.clone(),
    +            block: last_block,
    +            pos: operations_len,
    +        }
    +    }
    +
    +    // Appends a new delete operations.
    +    pub fn push(&self, delete_operation: DeleteOperation) {
    +        self.inner
    +            .write()
    +            .expect("Failed to acquire write lock on delete queue writer")
    +            .writer
    +            .push(delete_operation);
    +    }
    +
    +    // DeleteQueue is a linked list of blocks of
    +    // delete operations.
    +    // 
    +    // Writing happens by simply appending to a vec.
    +    // `.flush()` takes this pending delete operations vec
    +    // creates a new read-only block from it, 
    +    // and appends it to the linked list.
    +    // 
    +    // `.flush()` happens when, for instance, 
    +    // a consumer reaches the last read-only operations.
    +    // It then ask the delete queue if there happen to 
    +    // be some unflushed operations.
    +    //
    +    fn flush(&self) -> Option> {
    +        let mut self_wlock = self
    +            .inner
    +            .write()
    +            .expect("Failed to acquire write lock on delete queue writer");
    +        
    +        let delete_operations;
    +        {
    +            let writer: &mut Vec = &mut self_wlock.writer;
    +            if writer.is_empty() {
    +                return None;
    +            }
    +            delete_operations = mem::replace(writer, vec!());
    +        }
    +
    +        let next_block = NextBlock::from(self.clone());
    +        {
    +            self_wlock.last_block = Some(
    +                Arc::new(Block {
    +                    operations: Arc::new(delete_operations),
    +                    next: next_block,
    +                })
    +            );
    +        }
    +        self_wlock.last_block.clone()
    +    }
    +}
    +
    +enum InnerNextBlock {
    +    Writer(DeleteQueue),
    +    Closed(Arc),
    +}
    +
    +struct NextBlock(RwLock);
    +
    +impl From for NextBlock {
    +    fn from(delete_queue: DeleteQueue) -> NextBlock {
    +        NextBlock(RwLock::new(InnerNextBlock::Writer(delete_queue)))
    +    }
    +}
    +
    +impl NextBlock {   
    +    fn next_block(&self) -> Option> {
    +        {
    +            let next_read_lock = self.0
    +                .read()
    +                .expect("Failed to acquire write lock in delete queue");
    +            match *next_read_lock {
    +                InnerNextBlock::Closed(ref block) => {
    +                    return Some(block.clone());
    +                }
    +                _ => {}
    +            }
    +        }
    +        let next_block;
    +        {
    +            let mut next_write_lock = self.0
    +                .write()
    +                .expect("Failed to acquire write lock in delete queue");
    +            match *next_write_lock {
    +                InnerNextBlock::Closed(ref block) => {
    +                    return Some(block.clone());
    +                }
    +                InnerNextBlock::Writer(ref writer) => {
    +                    match writer.flush() {
    +                        Some(flushed_next_block) => {
    +                            next_block = flushed_next_block;
    +                        }
    +                        None => {
    +                            return None;
    +                        }
    +                    }
    +                }
    +            }
    +            *next_write_lock.deref_mut() = InnerNextBlock::Closed(next_block.clone()); // TODO fix
    +            return Some(next_block)
             }
         }
     }
     
    +struct Block {
    +    operations: Arc>,
    +    next: NextBlock,
    +}
    +
    +
    +#[derive(Clone)]
    +pub struct DeleteCursor {
    +    block: Arc,
    +    pos: usize,
    +}
    +
    +
    +impl DeleteCursor {  
    +
    +    /// Skips operations and position it so that
    +    /// - either all of the delete operation currently in the 
    +    ///   queue are consume and the next get will return None.
    +    /// - the next get will return the first operation with an
    +    /// `opstamp >= target_opstamp`.
    +    pub fn skip_to(&mut self, target_opstamp: u64) {
    +        // TODO Can be optimize as we work with block.
    +        loop {
    +            if let Some(operation) = self.get() {
    +                if operation.opstamp >= target_opstamp {
    +                    break;
    +                }
    +            }
    +            else {
    +                break;
    +            }
    +            self.advance();
    +        }
    +    }
    +
    +    /// If the current block has been entirely 
    +    /// consumed, try to load the next one.
    +    /// 
    +    /// Return `true`, if after this attempt, 
    +    /// the cursor is on a block that has not
    +    /// been entirely consumed.
    +    /// Return `false`, if we have reached the end of the queue.
    +    fn load_block_if_required(&mut self) -> bool {
    +        if self.pos >= self.block.operations.len() {
    +            // we have consumed our operations entirely.
    +            // let's ask our writer if he has more for us.
    +            // self.go_next_block();
    +            match self.block.next.next_block() {
    +                Some(block) => {
    +                    self.block = block;
    +                    self.pos = 0;
    +                    true
    +                }
    +                None => {
    +                    false
    +                }
    +            }
    +        }
    +        else {
    +            true
    +        }
    +    }
    +    
    +    /// Advance past the current delete operation.
    +    /// Returns true iff there was a current operation to step past.
    +    pub fn advance(&mut self) -> bool {
    +        if self.load_block_if_required() {
    +            self.pos += 1;
    +            true
    +        }
    +        else {
    +            false
    +        }
    +    }
    +
    +    /// Get the current delete operation.
    +    /// Calling `.get` does not advance the cursor.
    +    pub fn get<'a>(&'a mut self) -> Option<&'a DeleteOperation> {
    +        if self.load_block_if_required() {
    +            Some(&self.block.operations[self.pos])
    +        }
    +        else {
    +            None
    +        }
    +    }
    +
    +}
    +
    +
    +
    +
    +
     
     #[cfg(test)]
     mod tests {
    @@ -110,32 +293,33 @@ mod tests {
             let snapshot = delete_queue.cursor();
             {
                 let mut operations_it = snapshot.clone();
    -            assert_eq!(operations_it.next().unwrap().opstamp, 1);
    -            assert_eq!(operations_it.next().unwrap().opstamp, 2);
    -            assert!(operations_it.next().is_none());
    +            assert_eq!(operations_it.get().unwrap().opstamp, 1);
    +            operations_it.advance();
    +            assert_eq!(operations_it.get().unwrap().opstamp, 2);
    +            operations_it.advance();
    +            assert!(operations_it.get().is_none());
    +            operations_it.advance();
    +        
    +            let mut snapshot2 = delete_queue.cursor();
    +            assert!(snapshot2.get().is_none());
    +            delete_queue.push(make_op(3));
    +            assert_eq!(snapshot2.get().unwrap().opstamp, 3);
    +            assert_eq!(operations_it.get().unwrap().opstamp, 3);
    +            assert_eq!(operations_it.get().unwrap().opstamp, 3);
    +            operations_it.advance();
    +            assert!(operations_it.get().is_none());
    +            operations_it.advance();
             }
             {   
                 let mut operations_it = snapshot.clone();
    -            assert_eq!(operations_it.next().unwrap().opstamp, 1);
    -            assert_eq!(operations_it.next().unwrap().opstamp, 2);
    -            assert!(operations_it.next().is_none());
    +            assert_eq!(operations_it.get().unwrap().opstamp, 1);
    +            operations_it.advance();
    +            assert_eq!(operations_it.get().unwrap().opstamp, 2);
    +            operations_it.advance();
    +            assert_eq!(operations_it.get().unwrap().opstamp, 3);
    +            operations_it.advance();
    +            assert!(operations_it.get().is_none());
             }
             
    -        // // operations does not own a lock on the queue.
    -        // delete_queue.push(make_op(3));
    -        // let snapshot2 = delete_queue.snapshot();
    -        // {
    -        //     // operations is not affected by
    -        //     // the push that occurs after.
    -        //     let mut operations_it = snapshot.iter();
    -        //     let mut operations2_it = snapshot2.iter();
    -        //     assert_eq!(operations_it.next().unwrap().opstamp, 1);
    -        //     assert_eq!(operations2_it.next().unwrap().opstamp, 1);
    -        //     assert_eq!(operations_it.next().unwrap().opstamp, 2);
    -        //     assert_eq!(operations2_it.next().unwrap().opstamp, 2);
    -        //     assert!(operations_it.next().is_none());
    -        //     assert_eq!(operations2_it.next().unwrap().opstamp, 3);
    -        //     assert!(operations2_it.next().is_none());
    -        // }
         }
     }
    \ No newline at end of file
    diff --git a/src/indexer/delete_queue2.rs b/src/indexer/delete_queue2.rs
    deleted file mode 100644
    index e2b8230d1..000000000
    --- a/src/indexer/delete_queue2.rs
    +++ /dev/null
    @@ -1,206 +0,0 @@
    -use super::operation::DeleteOperation;
    -use std::sync::{Arc, RwLock};
    -use std::mem;
    -use std::ops::DerefMut;
    -
    -
    -#[derive(Clone, Default)]
    -struct DeleteQueue {
    -    writer: Arc>>,
    -    next_block: Option,
    -}
    -
    -impl DeleteQueue {
    -
    -    pub fn new() -> Arc {
    -        let mut delete_queue = Arc::new(DeleteQueue::default());
    -        delete_queue.next_block = Some(
    -            NextBlock::from(delete_queue)
    -        );
    -        delete_queue
    -    }
    -
    -    pub fn cursor(&self) -> Cursor {
    -        
    -        Cursor {
    -            current_block: Arc,
    -            pos: 0,
    -        }
    -    }
    -
    -    pub fn push(&self, delete_operation: DeleteOperation) {
    -        let mut write_lock = self.writer
    -            .write()
    -            .expect("Failed to acquire write lock on delete queue writer")
    -            .push(delete_operation);
    -    }
    -
    -    fn flush(&self) -> Option> {
    -        let mut write_lock = self
    -            .writer
    -            .write()
    -            .expect("Failed to acquire write lock on delete queue writer");
    -        if write_lock.is_empty() {
    -            return None;
    -        }
    -        Some(mem::replace(write_lock.deref_mut(), vec!()))
    -    }
    -}
    -
    -enum InnerNextBlock {
    -    Writer(Arc),
    -    Closed(Arc),
    -    Terminated,
    -}
    -
    -struct NextBlock(RwLock);
    -
    -impl From> for NextBlock {
    -    fn from(writer_arc: Arc) -> NextBlock {
    -        NextBlock(RwLock::new(InnerNextBlock::Writer(writer_arc)))
    -    }
    -}
    -
    -impl NextBlock {   
    -    pub fn next_block(&self) -> Option> {
    -        {
    -            let next_read_lock = self.0
    -                .read()
    -                .expect("Failed to acquire write lock in delete queue");
    -            match *next_read_lock {
    -                InnerNextBlock::Terminated => {
    -                    return None;
    -                }
    -                InnerNextBlock::Closed(ref block) => {
    -                    return Some(block.clone());
    -                }
    -                _ => {}
    -            }
    -        }
    -        let delete_operations;
    -        let writer_arc;
    -        {
    -            let mut next_write_lock = self.0
    -                .write()
    -                .expect("Failed to acquire write lock in delete queue");
    -            match *next_write_lock {
    -                InnerNextBlock::Terminated => {
    -                    return None;
    -                }
    -                InnerNextBlock::Closed(ref block) => {
    -                    return Some(block.clone());
    -                }
    -                InnerNextBlock::Writer(ref writer) => {
    -                    match writer.flush() {
    -                        Some(flushed_delete_operations) => {
    -                            delete_operations = flushed_delete_operations;
    -                        }
    -                        None => {
    -                            return None;
    -                        }
    -                    }
    -                    writer_arc = writer.clone();
    -                }
    -            }
    -            let next_block = Arc::new(Block {
    -                operations: Arc::new(delete_operations),
    -                next: NextBlock::from(writer_arc),
    -            });
    -            *next_write_lock.deref_mut() = InnerNextBlock::Closed(next_block.clone()); // TODO fix
    -            return Some(next_block)
    -        }
    -    }
    -}
    -
    -struct Block {
    -    operations: Arc>,
    -    next: NextBlock,
    -}
    -
    -
    -#[derive(Clone)]
    -struct Cursor {
    -    current_block: Arc,
    -    pos: usize,
    -}
    -
    -impl Cursor {   
    -    fn next<'a>(&'a mut self) -> Option<&'a DeleteOperation> {
    -        if self.pos >= self.current_block.operations.len() {
    -            // we have consumed our operations entirely.
    -            // let's ask our writer if he has more for us.
    -            // self.go_next_block();
    -            match self.current_block.next.next_block() {
    -                Some(block) => {
    -                    self.current_block = block;
    -                    self.pos = 0;
    -                }
    -                None => {
    -                    return None;
    -                }
    -            }
    -        }
    -        let operation = &self.current_block.operations[self.pos];
    -        self.pos += 1;
    -        return Some(operation);
    -    }
    -}
    -
    -
    -
    -
    -
    -
    -#[cfg(test)]
    -mod tests {
    -
    -    use super::{DeleteQueue, DeleteOperation};
    -    use schema::{Term, Field};
    -
    -    #[test]
    -    fn test_deletequeue() {
    -        let delete_queue = DeleteQueue::new();
    -        
    -        let make_op = |i: usize| {
    -            let field = Field(1u8);
    -            DeleteOperation {
    -                opstamp: i as u64,
    -                term: Term::from_field_u32(field, i as u32)
    -            }
    -        };
    -
    -        delete_queue.push(make_op(1));
    -        delete_queue.push(make_op(2));
    -
    -        let snapshot = delete_queue.cursor();
    -        {
    -            let mut operations_it = snapshot.clone();
    -            assert_eq!(operations_it.next().unwrap().opstamp, 1);
    -            assert_eq!(operations_it.next().unwrap().opstamp, 2);
    -            assert!(operations_it.next().is_none());
    -        }
    -        {   
    -            let mut operations_it = snapshot.clone();
    -            assert_eq!(operations_it.next().unwrap().opstamp, 1);
    -            assert_eq!(operations_it.next().unwrap().opstamp, 2);
    -            assert!(operations_it.next().is_none());
    -        }
    -        
    -        // // operations does not own a lock on the queue.
    -        // delete_queue.push(make_op(3));
    -        // let snapshot2 = delete_queue.snapshot();
    -        // {
    -        //     // operations is not affected by
    -        //     // the push that occurs after.
    -        //     let mut operations_it = snapshot.iter();
    -        //     let mut operations2_it = snapshot2.iter();
    -        //     assert_eq!(operations_it.next().unwrap().opstamp, 1);
    -        //     assert_eq!(operations2_it.next().unwrap().opstamp, 1);
    -        //     assert_eq!(operations_it.next().unwrap().opstamp, 2);
    -        //     assert_eq!(operations2_it.next().unwrap().opstamp, 2);
    -        //     assert!(operations_it.next().is_none());
    -        //     assert_eq!(operations2_it.next().unwrap().opstamp, 3);
    -        //     assert!(operations2_it.next().is_none());
    -        // }
    -    }
    -}
    \ No newline at end of file
    diff --git a/src/indexer/index_writer.rs b/src/indexer/index_writer.rs
    index 33afe7108..0ee8d716c 100644
    --- a/src/indexer/index_writer.rs
    +++ b/src/indexer/index_writer.rs
    @@ -165,7 +165,7 @@ pub fn compute_deleted_bitset(
         let mut might_have_changed = false;
         
         loop {
    -        if let Some(delete_op) = delete_cursor.peek() {
    +        if let Some(delete_op) = delete_cursor.get() {
                 if delete_op.opstamp > target_opstamp {
                     break;
                 }
    diff --git a/src/indexer/mod.rs b/src/indexer/mod.rs
    index af6e6a21d..4dcd9fe12 100644
    --- a/src/indexer/mod.rs
    +++ b/src/indexer/mod.rs
    @@ -14,9 +14,6 @@ mod doc_opstamp_mapping;
     pub mod operation;
     mod stamper;
     
    -
    -// TODO avoid exposing SegmentState / SegmentEntry if it does not have to be public API
    -
     pub use self::segment_entry::{SegmentEntry, SegmentState};
     pub use self::segment_serializer::SegmentSerializer;
     pub use self::segment_writer::SegmentWriter;
    diff --git a/src/indexer/segment_updater.rs b/src/indexer/segment_updater.rs
    index 74ea6ca2f..f00923e37 100644
    --- a/src/indexer/segment_updater.rs
    +++ b/src/indexer/segment_updater.rs
    @@ -359,7 +359,8 @@ impl SegmentUpdater {
             
             self.run_async(move |segment_updater| {
                 debug!("End merge {:?}", after_merge_segment_entry.meta());
    -            if let Some(delete_operation) = after_merge_segment_entry.delete_cursor().peek() {
    +            let mut delete_cursor = after_merge_segment_entry.delete_cursor().clone();
    +            if let Some(delete_operation) = delete_cursor.get() {
                     let committed_opstamp = segment_updater.0.index.opstamp();
                     if delete_operation.opstamp < committed_opstamp {
                         let segment = segment_updater.0.index.segment(after_merge_segment_entry.meta().clone());
    
    From 9eb2d3e8c59c2f3e0faf50177f5e5c6649bc6703 Mon Sep 17 00:00:00 2001
    From: Paul Masurel 
    Date: Sun, 2 Apr 2017 16:26:28 +0900
    Subject: [PATCH 094/107] issue/96 avoid removing the bitset from
     segment_entry.
    
    ---
     src/core/segment.rs            |  4 ++++
     src/indexer/index_writer.rs    |  6 +++---
     src/indexer/merger.rs          |  8 ++++++--
     src/indexer/segment_entry.rs   |  4 ++--
     src/indexer/segment_manager.rs |  8 +-------
     src/indexer/segment_updater.rs | 13 ++++++++-----
     6 files changed, 24 insertions(+), 19 deletions(-)
    
    diff --git a/src/core/segment.rs b/src/core/segment.rs
    index 99fad2591..75f4d2596 100644
    --- a/src/core/segment.rs
    +++ b/src/core/segment.rs
    @@ -43,6 +43,10 @@ impl Segment {
             self.index.schema()
         }
     
    +    pub fn index(&self,) -> &Index {
    +        &self.index
    +    }
    +
         /// Returns the segment meta-information
         pub fn meta(&self) -> &SegmentMeta {
             &self.meta
    diff --git a/src/indexer/index_writer.rs b/src/indexer/index_writer.rs
    index 0ee8d716c..756d37242 100644
    --- a/src/indexer/index_writer.rs
    +++ b/src/indexer/index_writer.rs
    @@ -220,9 +220,9 @@ pub fn advance_deletes(
             let max_doc = segment_reader.max_doc();
             
             let mut delete_bitset: BitSet =
    -            match segment_entry.reset_delete_bitset() {
    -                Some(previous_delete_bitset) =>
    -                    previous_delete_bitset,
    +            match segment_entry.delete_bitset() {
    +                Some(ref previous_delete_bitset) =>
    +                    (*previous_delete_bitset).clone(),
                     None =>
                         BitSet::with_capacity(max_doc as usize)
                 };
    diff --git a/src/indexer/merger.rs b/src/indexer/merger.rs
    index 596c4d696..ae8679d47 100644
    --- a/src/indexer/merger.rs
    +++ b/src/indexer/merger.rs
    @@ -34,7 +34,11 @@ struct DeltaPositionComputer {
     
     impl DeltaPositionComputer {
         fn new() -> DeltaPositionComputer {
    -        DeltaPositionComputer { buffer: iter::repeat(0u32).take(512).collect::>() }
    +        DeltaPositionComputer { 
    +            buffer: iter::repeat(0u32)
    +                .take(512)
    +                .collect::>()
    +        }
         }
     
         fn compute_delta_positions(&mut self, positions: &[u32]) -> &[u32] {
    @@ -227,7 +231,7 @@ impl IndexMerger {
                 // We can remove the term if all documents which
                 // contained it have been deleted.
                 if segment_postings.len() > 0 {
    -
    +                
                     // We can now serialize this postings, by pushing each document to the
                     // postings serializer.                
                     
    diff --git a/src/indexer/segment_entry.rs b/src/indexer/segment_entry.rs
    index ac44f33e8..c93bcfe64 100644
    --- a/src/indexer/segment_entry.rs
    +++ b/src/indexer/segment_entry.rs
    @@ -42,8 +42,8 @@ impl SegmentEntry {
             }
         }
     
    -    pub fn reset_delete_bitset(&mut self,) -> Option {
    -        self.delete_bitset.take()
    +    pub fn delete_bitset(&self,) -> Option<&BitSet> {
    +        self.delete_bitset.as_ref()
         }
     
         pub fn set_meta(&mut self, segment_meta: SegmentMeta) {
    diff --git a/src/indexer/segment_manager.rs b/src/indexer/segment_manager.rs
    index 232c88f15..4881cb76e 100644
    --- a/src/indexer/segment_manager.rs
    +++ b/src/indexer/segment_manager.rs
    @@ -71,13 +71,7 @@ impl SegmentManager {
             let mut files = HashSet::new();
             files.insert(META_FILEPATH.clone());
             files.insert(LOCKFILE_FILEPATH.clone());
    -        
    -        // TODO do new segment
    -        // really have at no point a delete file?
    -        // that might get garbage collected?
    -        // 
    -        // Consider have new segment matched as a prefix.
    -        
    +                
             let segment_metas: Vec =
                 registers_lock.committed
                     .get_all_segments()
    diff --git a/src/indexer/segment_updater.rs b/src/indexer/segment_updater.rs
    index f00923e37..ca3c61684 100644
    --- a/src/indexer/segment_updater.rs
    +++ b/src/indexer/segment_updater.rs
    @@ -119,6 +119,14 @@ fn perform_merge(segment_ids: &[SegmentId],
             }
         }
         
    +    // TODO REMOVEEEEE THIIIIIS
    +    {
    +    let living_files = segment_updater.0.segment_manager.list_files();
    +    let mut index = merged_segment.index().clone();
    +    index.directory_mut().garbage_collect(living_files);
    +    }
    +
    +
         let delete_cursor = segment_entries[0].delete_cursor().clone();
     
         let segments: Vec = segment_entries
    @@ -265,9 +273,6 @@ impl SegmentUpdater {
                     let living_files = segment_updater.0.segment_manager.list_files();
                     index.directory_mut().garbage_collect(living_files);
                     segment_updater.consider_merge_options();
    -                
    -                // See #112
    -                // index.directory_mut().garbage_collect(living_files);
                 }
             }).wait()
         }
    @@ -283,8 +288,6 @@ impl SegmentUpdater {
             let merging_thread_id = self.get_merging_thread_id();
             let (merging_future_send, merging_future_recv) = oneshot();
             
    -        // let delete_operations = self.0.delete_queue.snapshot();
    -
             if segment_ids.is_empty() {
                 return merging_future_recv;
             }
    
    From 17631ed8664646c86ae406e8fd4ec468ce5cae2e Mon Sep 17 00:00:00 2001
    From: Paul Masurel 
    Date: Sun, 2 Apr 2017 18:48:20 +0900
    Subject: [PATCH 095/107] issue/96 Added functionality to protect files from
     deletion
    
    Hopefully fixed the race condition happening when merging files.
    ---
     src/core/segment.rs                |   7 +-
     src/directory/directory.rs         |   4 +-
     src/directory/error.rs             |  16 +++-
     src/directory/managed_directory.rs | 126 ++++++++++++++++++++++++-----
     src/directory/mmap_directory.rs    |  12 +--
     src/directory/mod.rs               |   2 +-
     src/directory/ram_directory.rs     |  10 +--
     src/indexer/index_writer.rs        |  10 ++-
     src/indexer/segment_updater.rs     |  23 +++---
     9 files changed, 156 insertions(+), 54 deletions(-)
    
    diff --git a/src/core/segment.rs b/src/core/segment.rs
    index 75f4d2596..7baf9516c 100644
    --- a/src/core/segment.rs
    +++ b/src/core/segment.rs
    @@ -4,7 +4,7 @@ use schema::Schema;
     use DocId;
     use std::fmt;
     use core::SegmentId;
    -use directory::{ReadOnlySource, WritePtr};
    +use directory::{ReadOnlySource, WritePtr, FileProtection};
     use indexer::segment_serializer::SegmentSerializer;
     use super::SegmentComponent;
     use core::Index;
    @@ -70,6 +70,11 @@ impl Segment {
             self.meta.relative_path(component)
         }
     
    +    pub fn protect_from_delete(&self, component: SegmentComponent) -> FileProtection {
    +        let path = self.relative_path(component);
    +        self.index.directory().protect_file_from_delete(&path)
    +    }
    +
         /// Open one of the component file for read.
         pub fn open_read(&self, component: SegmentComponent) -> result::Result {
             let path = self.relative_path(component);
    diff --git a/src/directory/directory.rs b/src/directory/directory.rs
    index 2f3cb4146..c4a98b237 100644
    --- a/src/directory/directory.rs
    +++ b/src/directory/directory.rs
    @@ -1,7 +1,7 @@
     use std::marker::Send;
     use std::fmt;
     use std::path::Path;
    -use directory::error::{FileError, OpenWriteError};
    +use directory::error::{FileError, DeleteError, OpenWriteError};
     use directory::{ReadOnlySource, WritePtr};
     use std::result;
     use std::io;
    @@ -35,7 +35,7 @@ pub trait Directory: fmt::Debug + Send + Sync + 'static {
         /// 
         /// Removing a nonexistent file, yields a
         /// `FileError::DoesNotExist`.
    -    fn delete(&self, path: &Path) -> result::Result<(), FileError>;
    +    fn delete(&self, path: &Path) -> result::Result<(), DeleteError>;
     
         /// Returns true iff the file exists
         fn exists(&self, path: &Path) -> bool;
    diff --git a/src/directory/error.rs b/src/directory/error.rs
    index aacfe62d3..711f4a3df 100644
    --- a/src/directory/error.rs
    +++ b/src/directory/error.rs
    @@ -27,7 +27,7 @@ impl From for OpenWriteError {
         }
     }
     
    -/// Error that may occur when accessing a file (read, or delete)
    +/// Error that may occur when accessing a file for reading
     #[derive(Debug)]
     pub enum FileError {
         /// The file does not exists.
    @@ -36,3 +36,17 @@ pub enum FileError {
         /// interacting with the underlying IO device.
         IOError(io::Error),
     }
    +
    +
    +/// Error that may occur when trying to delete a file
    +#[derive(Debug)]
    +pub enum DeleteError {
    +    /// The file does not exist.
    +    FileDoesNotExist(PathBuf),
    +    /// Any kind of IO error that happens when 
    +    /// interacting with the underlying IO device.
    +    IOError(io::Error),
    +    /// The file may not be deleted because it is 
    +    /// protected.
    +    FileProtected(PathBuf),
    +}
    diff --git a/src/directory/managed_directory.rs b/src/directory/managed_directory.rs
    index e764471fc..ffa1697af 100644
    --- a/src/directory/managed_directory.rs
    +++ b/src/directory/managed_directory.rs
    @@ -1,5 +1,5 @@
     use std::path::{Path, PathBuf};
    -use directory::error::{FileError, OpenWriteError};
    +use directory::error::{FileError, DeleteError, OpenWriteError};
     use directory::{ReadOnlySource, WritePtr};
     use std::result;
     use std::io;
    @@ -9,6 +9,8 @@ use std::collections::HashSet;
     use std::io::Write;
     use rustc_serialize::json;
     use core::MANAGED_FILEPATH;
    +use std::collections::HashMap;
    +use std::fmt;
     use Result;
     use Error;
     
    @@ -24,7 +26,30 @@ use Error;
     #[derive(Debug)]
     pub struct ManagedDirectory {
         directory: Box,
    -    managed_paths: Arc>>,
    +    meta_informations: Arc<RwLock<MetaInformation>>,
    +}
    +
    +#[derive(Debug, Default)]
    +struct MetaInformation {
    +    managed_paths: HashSet<PathBuf>,
    +    protected_files: HashMap,
    +}
    +
    +pub struct FileProtection {
    +    directory: ManagedDirectory,
    +    path: PathBuf,
    +}
    +
    +impl fmt::Debug for FileProtection {
    +    fn fmt(&self, formatter: &mut fmt::Formatter) -> result::Result<(), fmt::Error> {
    +        write!(formatter, "FileProtectionFor({:?})", self.path)    
    +    }
    +}
    +
    +impl Drop for FileProtection {
    +    fn drop(&mut self) {
    +        self.directory.unprotect_file_from_delete(&self.path);
    +    }
     }
     
     impl ManagedDirectory {
    @@ -38,13 +63,17 @@ impl ManagedDirectory {
                         .map_err(|e| Error::CorruptedFile(MANAGED_FILEPATH.clone(), Box::new(e)))?;
                     Ok(ManagedDirectory {
                         directory: box directory,
    -                    managed_paths: Arc::new(RwLock::new(managed_files)),
    +                    meta_informations: Arc::new(RwLock::new(
    +                        MetaInformation {
    +                            managed_paths: managed_files,
    +                            protected_files: HashMap::default()
    +                        })),
                     })
                 }
                 Err(FileError::FileDoesNotExist(_)) => {
                     Ok(ManagedDirectory {
                         directory: box directory,
    -                    managed_paths: Arc::default(),
    +                    meta_informations: Arc::default(),
                     })
                 }
                 Err(FileError::IOError(e)) => {
    @@ -65,54 +94,98 @@ impl ManagedDirectory {
         /// an error is simply logged, and the file remains in the list of managed
         /// files.
         pub fn garbage_collect(&mut self, living_files: HashSet) {
    -        let mut managed_has_changed: bool = false;
    -        {
    -            let mut files_to_delete = vec!();
    -            let mut managed_paths_write = self.managed_paths.write().unwrap();
    +        let mut files_to_delete = vec!();
    +        {   // releasing the lock as .delete() will use it too.
    +            let mut meta_informations_wlock = self.meta_informations.write().unwrap();
    +            let managed_paths_write = &mut meta_informations_wlock.managed_paths;
                 for managed_path in managed_paths_write.iter() {
                     if !living_files.contains(managed_path) {
                         files_to_delete.push(managed_path.clone());
                     }
                 }
    +        }
    +        
    +        let mut deleted_files = vec!();
    +        {
                 for file_to_delete in files_to_delete {
    -                match self.directory.delete(&file_to_delete) {
    +                match self.delete(&file_to_delete) {
                         Ok(_) => {
                             info!("Deleted {:?}", file_to_delete);
    -                        managed_has_changed |= managed_paths_write.remove(&file_to_delete);
    +                        deleted_files.push(file_to_delete);
                         }
                         Err(file_error) => {
                             error!("Failed to delete {:?}", file_to_delete);
                             match file_error {
    -                            FileError::FileDoesNotExist(_) => {
    -                                managed_has_changed |= managed_paths_write.remove(&file_to_delete);
    +                            DeleteError::FileDoesNotExist(_) => {
    +                                deleted_files.push(file_to_delete);
                                 }
    -                            FileError::IOError(_) => {
    +                            DeleteError::IOError(_) => {
                                     if !cfg!(target_os = "windows") {
                                         error!("Failed to delete {:?}", file_to_delete);
                                     }
                                 }
    -                            
    +                            DeleteError::FileProtected(_) => {
    +                                // this is expected.
    +                            }
                             }
                             
                         }
                     }
                 }
             }
    -        if managed_has_changed {
    +
    +
    +        if !deleted_files.is_empty() {
    +            // update the list of managed files by removing
    +            // the files that were deleted.
    +            {
    +                let mut meta_informations_wlock = self.meta_informations.write().unwrap();
    +                let managed_paths_write = &mut meta_informations_wlock.managed_paths;
    +                for delete_file in &deleted_files {
    +                    managed_paths_write.remove(delete_file);
    +                }
    +            }
                 if let Err(_) = self.save_managed_paths() {
                     error!("Failed to save the list of managed files.");
                 }
             }
    +
    +    }
    +
    +    pub fn protect_file_from_delete(&self, path: &Path) -> FileProtection {
    +        let mut meta_informations_wlock = self.meta_informations
    +            .write()
    +            .expect("Managed file lock poisoned");
    +        let pathbuf = path.to_owned();
    +        *meta_informations_wlock
    +            .protected_files
    +            .entry(pathbuf.clone())
    +            .or_insert(0) += 1;
    +        FileProtection {
    +            directory: self.clone(),
    +            path: pathbuf.clone(),
    +        }
    +    }
    +
    +    pub fn unprotect_file_from_delete(&self, path: &Path) {
    +        let mut meta_informations_wlock = self.meta_informations
    +            .write()
    +            .expect("Managed file lock poisoned");
    +        if let Some(counter_ref_mut) = meta_informations_wlock
    +            .protected_files
    +            .get_mut(path) {
    +            (*counter_ref_mut) -= 1;
    +        }
         }
     
         /// Saves the file containing the list of existing files
         /// that were created by tantivy.
         fn save_managed_paths(&mut self,) -> io::Result<()> {
    -        let managed_files_lock = self.managed_paths
    +        let meta_informations_rlock = self.meta_informations
                 .read()
                 .expect("Managed file lock poisoned");
             let mut w = vec!();
    -        try!(write!(&mut w, "{}\n", json::as_pretty_json(&*managed_files_lock)));
    +        try!(write!(&mut w, "{}\n", json::as_pretty_json(&meta_informations_rlock.managed_paths)));
             self.directory.atomic_write(&MANAGED_FILEPATH, &w[..])?;
             Ok(())
         }
    @@ -126,11 +199,10 @@ impl ManagedDirectory {
         /// never get removed.
         fn register_file_as_managed(&mut self, filepath: &Path) -> io::Result<()> {
             let has_changed = {
    -            let mut managed_files_lock = self
    -                .managed_paths
    +            let mut meta_wlock = self.meta_informations
                     .write()
                     .expect("Managed file lock poisoned");
    -            managed_files_lock.insert(filepath.to_owned())
    +            meta_wlock.managed_paths.insert(filepath.to_owned())
             };
             if has_changed {
                 self.save_managed_paths()?;
    @@ -159,7 +231,17 @@ impl Directory for ManagedDirectory {
             self.directory.atomic_read(path)
         }
     
    -    fn delete(&self, path: &Path) -> result::Result<(), FileError> {
    +    fn delete(&self, path: &Path) -> result::Result<(), DeleteError> {
    +        {
    +            let metas_rlock = self.meta_informations
    +                .read()
    +                .expect("poisoned lock in managed directory meta");
    +            if let Some(counter) = metas_rlock.protected_files.get(path) {
    +                if *counter > 0 {
    +                    return Err(DeleteError::FileProtected(path.to_owned()))
    +                }
    +            }
    +        }
             self.directory.delete(path)
         }
     
    @@ -177,7 +259,7 @@ impl Clone for ManagedDirectory {
         fn clone(&self) -> ManagedDirectory {
             ManagedDirectory {
                 directory: self.directory.box_clone(),
    -            managed_paths: self.managed_paths.clone(),   
    +            meta_informations: self.meta_informations.clone(),
             }
         }
     }
    diff --git a/src/directory/mmap_directory.rs b/src/directory/mmap_directory.rs
    index 6b2730a4b..9a5670b7f 100644
    --- a/src/directory/mmap_directory.rs
    +++ b/src/directory/mmap_directory.rs
    @@ -1,7 +1,7 @@
     use atomicwrites;
     use common::make_io_err;
     use directory::Directory;
    -use directory::error::{OpenWriteError, FileError, OpenDirectoryError};
    +use directory::error::{OpenWriteError, FileError, DeleteError, OpenDirectoryError};
     use directory::ReadOnlySource;
     use directory::shared_vec_slice::SharedVecSlice;
     use directory::WritePtr;
    @@ -334,13 +334,13 @@ impl Directory for MmapDirectory {
             Ok(BufWriter::new(Box::new(writer)))
         }
     
    -    fn delete(&self, path: &Path) -> result::Result<(), FileError> {
    +    fn delete(&self, path: &Path) -> result::Result<(), DeleteError> {
             debug!("Deleting file {:?}", path);
             let full_path = self.resolve_path(path);
             let mut mmap_cache = try!(self.mmap_cache
                 .write()
                 .map_err(|_| 
    -                 FileError::IOError(make_io_err(format!("Failed to acquired write lock on mmap cache while deleting {:?}", path))))
    +                 DeleteError::IOError(make_io_err(format!("Failed to acquired write lock on mmap cache while deleting {:?}", path))))
             );
             // Removing the entry in the MMap cache.
             // The munmap will appear on Drop,
    @@ -349,14 +349,14 @@ impl Directory for MmapDirectory {
             match fs::remove_file(&full_path) {
                 Ok(_) => {
                     self.sync_directory()
    -                    .map_err(|e| FileError::IOError(e))
    +                    .map_err(|e| DeleteError::IOError(e))
                 }
                 Err(e) => {
                     if e.kind() == io::ErrorKind::NotFound {
    -                    Err(FileError::FileDoesNotExist(path.to_owned()))
    +                    Err(DeleteError::FileDoesNotExist(path.to_owned()))
                     }
                     else {
    -                    Err(FileError::IOError(e))
    +                    Err(DeleteError::IOError(e))
                     }
                 }
             }
    diff --git a/src/directory/mod.rs b/src/directory/mod.rs
    index f0bf91101..09f61da3e 100644
    --- a/src/directory/mod.rs
    +++ b/src/directory/mod.rs
    @@ -15,7 +15,7 @@ pub use self::read_only_source::ReadOnlySource;
     pub use self::directory::Directory;
     pub use self::ram_directory::RAMDirectory;
     pub use self::mmap_directory::MmapDirectory;
    -pub use self::managed_directory::ManagedDirectory;
    +pub use self::managed_directory::{ManagedDirectory, FileProtection};
     
     /// Synonym of Seek + Write
     pub trait SeekableWrite: Seek + Write {}
    diff --git a/src/directory/ram_directory.rs b/src/directory/ram_directory.rs
    index 32d44e184..b8a3b5c95 100644
    --- a/src/directory/ram_directory.rs
    +++ b/src/directory/ram_directory.rs
    @@ -6,7 +6,7 @@ use std::result;
     use std::sync::{Arc, RwLock};
     use common::make_io_err;
     use directory::{Directory, ReadOnlySource};
    -use directory::error::{OpenWriteError, FileError};
    +use directory::error::{OpenWriteError, FileError, DeleteError};
     use directory::WritePtr;
     use super::shared_vec_slice::SharedVecSlice;
     
    @@ -104,12 +104,12 @@ impl InnerDirectory {
                 })
         }
     
    -    fn delete(&self, path: &Path) -> result::Result<(), FileError> {
    +    fn delete(&self, path: &Path) -> result::Result<(), DeleteError> {
             self.0
                 .write()
                 .map_err(|_| {
                     let io_err = make_io_err(format!("Failed to acquire write lock for the directory, when trying to delete {:?}", path));
    -                FileError::IOError(io_err)
    +                DeleteError::IOError(io_err)
                 })
                 .and_then(|mut writable_map| {
                     match writable_map.remove(path) {
    @@ -117,7 +117,7 @@ impl InnerDirectory {
                             Ok(())
                         },
                         None => {
    -                        Err(FileError::FileDoesNotExist(PathBuf::from(path)))
    +                        Err(DeleteError::FileDoesNotExist(PathBuf::from(path)))
                         }
                     }
                 })
    @@ -176,7 +176,7 @@ impl Directory for RAMDirectory {
             }
         }
     
    -    fn delete(&self, path: &Path) -> result::Result<(), FileError> {
    +    fn delete(&self, path: &Path) -> result::Result<(), DeleteError> {
             self.fs.delete(path)
         }
     
    diff --git a/src/indexer/index_writer.rs b/src/indexer/index_writer.rs
    index 756d37242..a050dff4d 100644
    --- a/src/indexer/index_writer.rs
    +++ b/src/indexer/index_writer.rs
    @@ -8,6 +8,7 @@ use core::SegmentMeta;
     use core::SegmentReader;
     use indexer::stamper::Stamper;
     use datastruct::stacker::Heap;
    +use directory::FileProtection;
     use Error;
     use Directory;
     use fastfield::delete::write_delete_bitset;
    @@ -207,13 +208,15 @@ pub fn compute_deleted_bitset(
     pub fn advance_deletes(
         mut segment: Segment,
         segment_entry: &mut SegmentEntry,
    -    target_opstamp: u64) -> Result<()> {
    +    target_opstamp: u64) -> Result<Option<FileProtection>> {
    +
    +    let mut file_protect: Option<FileProtection> = None;
     
         {
             if let Some(previous_opstamp) = segment_entry.meta().delete_opstamp() {
                 // We are already up-to-date here.
                 if target_opstamp == previous_opstamp {
    -                return Ok(());
    +                return Ok(file_protect);
                 }
             }
             let segment_reader = SegmentReader::open(segment.clone())?;
    @@ -245,13 +248,14 @@ pub fn advance_deletes(
             let num_deleted_docs = delete_bitset.len();
             if num_deleted_docs > 0 {
                 segment.set_delete_meta(num_deleted_docs as u32, target_opstamp);
    +            file_protect = Some(segment.protect_from_delete(SegmentComponent::DELETE));
                 let mut delete_file = segment.open_write(SegmentComponent::DELETE)?;
                 write_delete_bitset(&delete_bitset, &mut delete_file)?;
             }
         }
         segment_entry.set_meta(segment.meta().clone());
     
    -    Ok(())
    +    Ok(file_protect)
     }
     
     fn index_documents(heap: &mut Heap,
    diff --git a/src/indexer/segment_updater.rs b/src/indexer/segment_updater.rs
    index ca3c61684..651e9077b 100644
    --- a/src/indexer/segment_updater.rs
    +++ b/src/indexer/segment_updater.rs
    @@ -14,6 +14,7 @@ use futures_cpupool::CpuPool;
     use futures::Future;
     use futures::Canceled;
     use futures::oneshot;
    +use directory::FileProtection;
     use indexer::{MergePolicy, DefaultMergePolicy};
     use indexer::index_writer::advance_deletes;
     use indexer::MergeCandidate;
    @@ -105,12 +106,17 @@ fn perform_merge(segment_ids: &[SegmentId],
         let ref index = segment_updater.0.index;
         let schema = index.schema();
         let mut segment_entries = vec!();
    +
    +    let mut file_protections: Vec<FileProtection> = vec!();
    +
         for segment_id in segment_ids {
             if let Some(mut segment_entry) = segment_updater.0
                 .segment_manager
                 .segment_entry(segment_id) {
                 let segment = index.segment(segment_entry.meta().clone());
    -            advance_deletes(segment, &mut segment_entry, target_opstamp)?;
    +            if let Some(file_protection) = advance_deletes(segment, &mut segment_entry, target_opstamp)? {
    +                file_protections.push(file_protection);
    +            }
                 segment_entries.push(segment_entry);
             }
             else {
    @@ -119,14 +125,6 @@ fn perform_merge(segment_ids: &[SegmentId],
             }
         }
         
    -    // TODO REMOVEEEEE THIIIIIS
    -    {
    -    let living_files = segment_updater.0.segment_manager.list_files();
    -    let mut index = merged_segment.index().clone();
    -    index.directory_mut().garbage_collect(living_files);
    -    }
    -
    -
         let delete_cursor = segment_entries[0].delete_cursor().clone();
     
         let segments: Vec<Segment> = segment_entries
    @@ -135,10 +133,11 @@ fn perform_merge(segment_ids: &[SegmentId],
                 index.segment(segment_entry.meta().clone())
             })
             .collect();
    +        
         
         // An IndexMerger is like a "view" of our merged segments.
         let merger: IndexMerger = IndexMerger::open(schema, &segments[..])?;
    -    
    +
         // ... we just serialize this index merger in our new segment
         // to merge the two segments.
     
    @@ -317,13 +316,11 @@ impl SegmentUpdater {
                         let _merging_future_res = merging_future_send.send(merged_segment_meta);
                     }
                     Err(e) => {
    +                    error!("Merge of {:?} was cancelled: {:?}", segment_ids_vec, e);
                         // ... cancel merge
                         if cfg!(test) {
                             panic!("Merge failed.");
                         }
    -                    else {
    -                        error!("Merge of {:?} was cancelled: {:?}", segment_ids_vec, e);
    -                    }
                         segment_updater_clone.cancel_merge(&segment_ids_vec, merged_segment_id);
                         // merging_future_send will be dropped, sending an error to the future.
                     }
    
    From d4f2e475ff24754e58d2743ba4a3e47700637403 Mon Sep 17 00:00:00 2001
    From: Paul Masurel 
    Date: Sun, 2 Apr 2017 19:21:20 +0900
    Subject: [PATCH 096/107] issue/96 removed faulty assert
    
    ---
     src/indexer/segment_updater.rs | 1 -
     1 file changed, 1 deletion(-)
    
    diff --git a/src/indexer/segment_updater.rs b/src/indexer/segment_updater.rs
    index 651e9077b..4d2c6be20 100644
    --- a/src/indexer/segment_updater.rs
    +++ b/src/indexer/segment_updater.rs
    @@ -445,7 +445,6 @@ mod tests {
             }
     
             index.load_searchers().unwrap();
    -        assert_eq!(index.searcher().segment_readers().len(), 2);
             assert_eq!(index.searcher().num_docs(), 302);
     
             {
    
    From ea3349644c41677e02c2812d8dc899143d3b9367 Mon Sep 17 00:00:00 2001
    From: Paul Masurel 
    Date: Sun, 2 Apr 2017 21:58:38 +0900
    Subject: [PATCH 097/107] issue/96 Fixed unit test condition to something
     reasonable
    
    ---
     src/indexer/index_writer.rs | 2 +-
     1 file changed, 1 insertion(+), 1 deletion(-)
    
    diff --git a/src/indexer/index_writer.rs b/src/indexer/index_writer.rs
    index a050dff4d..fa1cfd721 100644
    --- a/src/indexer/index_writer.rs
    +++ b/src/indexer/index_writer.rs
    @@ -702,7 +702,7 @@ mod tests {
                 index.load_searchers().unwrap();
                 
                 assert_eq!(num_docs_containing("a"), 200);
    -            assert_eq!(index.searchable_segments().unwrap().len(), 1);
    +            assert!(index.searchable_segments().unwrap().len() < 8);
                 
             }
         }
    
    From b5bf9bb13c3d744262732754cc44ede769302518 Mon Sep 17 00:00:00 2001
    From: Paul Masurel 
    Date: Mon, 3 Apr 2017 08:39:18 +0900
    Subject: [PATCH 098/107] issue/96 Looping over wait_merging_thread.
    
    ---
     src/indexer/index_writer.rs     |  8 +++---
     src/indexer/segment_manager.rs  |  5 ++++
     src/indexer/segment_register.rs |  4 +++
     src/indexer/segment_updater.rs  | 46 +++++++++++++++++++++------------
     4 files changed, 44 insertions(+), 19 deletions(-)
    
    diff --git a/src/indexer/index_writer.rs b/src/indexer/index_writer.rs
    index fa1cfd721..fbfe2f70f 100644
    --- a/src/indexer/index_writer.rs
    +++ b/src/indexer/index_writer.rs
    @@ -324,9 +324,8 @@ impl IndexWriter {
             // dropping the last reference to the segment_updater.
             drop(self.document_sender);
             
    -
    +        
             let former_workers_handles = mem::replace(&mut self.workers_join_handle, vec!());
    -        debug!("wait {} merging threads START", former_workers_handles.len());
             for join_handle in former_workers_handles {
                 try!(join_handle.join()
                     .expect("Indexing Worker thread panicked")
    @@ -342,7 +341,10 @@ impl IndexWriter {
                     Error::ErrorInThread("Failed to join merging thread.".to_string())
                 );
             
    -        debug!("wait merging threads DONE");
    +        if let &Err(ref e) = &result {
    +            error!("Some merging thread failed {:?}", e);
    +        }
    +
             result
         }
     
    diff --git a/src/indexer/segment_manager.rs b/src/indexer/segment_manager.rs
    index 4881cb76e..4961cb148 100644
    --- a/src/indexer/segment_manager.rs
    +++ b/src/indexer/segment_manager.rs
    @@ -66,6 +66,11 @@ impl SegmentManager {
             segment_entries
         }
     
    +    pub fn num_segments(&self,) -> usize {
    +        let registers_lock = self.read();
    +        registers_lock.committed.len() + registers_lock.uncommitted.len()
    +    }
    +
         pub fn list_files(&self) -> HashSet {
             let registers_lock = self.read();
             let mut files = HashSet::new();
    diff --git a/src/indexer/segment_register.rs b/src/indexer/segment_register.rs
    index 902680164..f8f7c7d64 100644
    --- a/src/indexer/segment_register.rs
    +++ b/src/indexer/segment_register.rs
    @@ -38,6 +38,10 @@ impl SegmentRegister {
             self.segment_states.clear();
         }
     
    +    pub fn len(&self) -> usize {
    +        self.segment_states.len()
    +    }
    +
         pub fn get_all_segments(&self,) -> Vec {
             self.segment_states
                 .values()
    diff --git a/src/indexer/segment_updater.rs b/src/indexer/segment_updater.rs
    index 4d2c6be20..52a68911a 100644
    --- a/src/indexer/segment_updater.rs
    +++ b/src/indexer/segment_updater.rs
    @@ -374,22 +374,36 @@ impl SegmentUpdater {
         }
     
         pub fn wait_merging_thread(&self) -> Result<()> {
    -        let mut new_merging_threads = HashMap::new();
    -        {
    -            let mut merging_threads = self.0.merging_threads.write().unwrap();
    -            mem::swap(&mut new_merging_threads, merging_threads.deref_mut());
    -        }
    -        debug!("wait merging thread {}", new_merging_threads.len());
    -        for (_, merging_thread_handle) in new_merging_threads {
    -            merging_thread_handle
    -                .join()
    -                .map(|_| ())
    -                .map_err(|_| {
    -                    Error::ErrorInThread("Merging thread failed.".to_string())
    -                })?
    -        }
    -        // Our merging thread may have queued their completed
    -        self.run_async(move |_| {}).wait()
    +
    +        let mut num_segments: usize;
    +        loop {
    +            
    +            num_segments = self.0.segment_manager.num_segments();
    +
    +            let mut new_merging_threads = HashMap::new();
    +            {
    +                let mut merging_threads = self.0.merging_threads.write().unwrap();
    +                mem::swap(&mut new_merging_threads, merging_threads.deref_mut());
    +            }
    +            debug!("wait merging thread {}", new_merging_threads.len());
    +            for (_, merging_thread_handle) in new_merging_threads {
    +                merging_thread_handle
    +                    .join()
    +                    .map(|_| ())
    +                    .map_err(|_| {
    +                        Error::ErrorInThread("Merging thread failed.".to_string())
    +                    })?
    +            }
    +            // Our merging threads may have queued their completed merges.
    +            self.run_async(move |_| {}).wait()?;
    +
    +            let new_num_segments = self.0.segment_manager.num_segments();
    +
    +            if new_num_segments >= num_segments {
    +                break;
    +            }
    +        }      
    +        Ok(())
         }
     
     }
    
    From 35203378ef0981d24e5ccd1b73e22c97a06580bc Mon Sep 17 00:00:00 2001
    From: Paul Masurel 
    Date: Mon, 3 Apr 2017 17:26:21 +0900
    Subject: [PATCH 099/107] Considering merge options after calling end_merge
    
    ---
     src/indexer/segment_updater.rs | 1 +
     1 file changed, 1 insertion(+)
    
    diff --git a/src/indexer/segment_updater.rs b/src/indexer/segment_updater.rs
    index 52a68911a..517400b27 100644
    --- a/src/indexer/segment_updater.rs
    +++ b/src/indexer/segment_updater.rs
    @@ -369,6 +369,7 @@ impl SegmentUpdater {
                     }
                 }
                 segment_updater.0.segment_manager.end_merge(&before_merge_segment_ids, after_merge_segment_entry);
    +            segment_updater.consider_merge_options();
                 segment_updater.save_metas(segment_updater.0.index.opstamp());
             }).wait()
         }
    
    From e0a39fb273d4db03a637dec31f040971468ff9ff Mon Sep 17 00:00:00 2001
    From: Paul Masurel 
    Date: Tue, 4 Apr 2017 22:43:35 +0900
    Subject: [PATCH 100/107] issue/96 Added unit test, documentation and various
     tiny improvements.
    
    ---
     src/common/mod.rs                  |  9 ++--
     src/core/index.rs                  | 18 ++++---
     src/core/mod.rs                    | 13 ++++++
     src/core/segment.rs                | 75 +++++++++++++++++++++++++++---
     src/datastruct/stacker/heap.rs     | 11 +----
     src/directory/managed_directory.rs | 57 ++++++++++++++++++-----
     src/indexer/index_writer.rs        |  3 --
     src/indexer/merger.rs              |  8 +---
     src/indexer/segment_updater.rs     |  2 +-
     src/schema/term.rs                 |  4 +-
     10 files changed, 149 insertions(+), 51 deletions(-)
    
    diff --git a/src/common/mod.rs b/src/common/mod.rs
    index d2d41cef6..c99c37f85 100644
    --- a/src/common/mod.rs
    +++ b/src/common/mod.rs
    @@ -3,16 +3,15 @@ mod timer;
     mod vint;
     pub mod bitpacker;
     
    -
     pub use self::serialize::BinarySerializable;
     pub use self::timer::Timing;
     pub use self::timer::TimerTree;
     pub use self::timer::OpenTimer;
     pub use self::vint::VInt;
     
    -
     use std::io;
     
    +/// Create a default io error given a string.
     pub fn make_io_err(msg: String) -> io::Error {
         io::Error::new(io::ErrorKind::Other, msg)
     }
    @@ -30,7 +29,11 @@ pub trait HasLen {
     }
     
     
    -pub fn create_vec_with_len<T>(capacity: usize) -> Vec<T> {
    +/// Creates an uninitialized Vec of a given length.
    +///
    +/// `allocate_vec` does an unsafe call to `set_len`
    +/// as other solutions are extremely slow in debug mode.
    +pub fn allocate_vec<T>(capacity: usize) -> Vec<T> {
         let mut v = Vec::with_capacity(capacity);
         unsafe {
             v.set_len(capacity);
    diff --git a/src/core/index.rs b/src/core/index.rs
    index c09baba9d..71565f321 100644
    --- a/src/core/index.rs
    +++ b/src/core/index.rs
    @@ -153,9 +153,8 @@ impl Index {
     
         /// Returns the list of segments that are searchable
         pub fn searchable_segments(&self) -> Result<Vec<Segment>> {
    -        let metas = load_metas(self.directory())?; 
    -        Ok(metas
    -            .segments
    +        Ok(self
    +            .searchable_segment_metas()?
                 .into_iter()
                 .map(|segment_meta| self.segment(segment_meta))
                 .collect())
    @@ -183,18 +182,17 @@ impl Index {
         }
     
         /// Reads the meta.json and returns the list of
    -    /// segments in the last commit.
    -    pub fn segments(&self) -> Result<Vec<SegmentMeta>> {
    +    /// `SegmentMeta` from the last commit.
    +    pub fn searchable_segment_metas(&self) -> Result<Vec<SegmentMeta>> {
             Ok(load_metas(self.directory())?.segments)
         }
         
         /// Returns the list of segment ids that are searchable.
         pub fn searchable_segment_ids(&self) -> Result<Vec<SegmentId>> {
    -        Ok(load_metas(self.directory())?
    -            .segments
    -            .iter()
    -            .map(|segment_meta| segment_meta.id())
    -            .collect())           
    +        Ok(self.searchable_segment_metas()?
    +               .iter()
    +               .map(|segment_meta| segment_meta.id())
    +               .collect())          
         }
     
         /// Creates a new generation of searchers after
    diff --git a/src/core/mod.rs b/src/core/mod.rs
    index 6b37c5542..d0a3964a4 100644
    --- a/src/core/mod.rs
    +++ b/src/core/mod.rs
    @@ -25,7 +25,20 @@ pub use self::term_iterator::TermIterator;
     use std::path::PathBuf;
     
     lazy_static! {
    +    /// The meta file contains all the information about the list of segments and the schema
    +    /// of the index.
         pub static ref META_FILEPATH: PathBuf = PathBuf::from("meta.json");
    +    
    +    /// The managed file contains a list of files that were created by tantivy
    +    /// and will therefore be garbage collected when they are deemed useless by tantivy.
    +    ///
    +    /// Removing this file is safe, but will prevent the garbage collection of all of the files that
    +    /// are currently in the directory.
         pub static ref MANAGED_FILEPATH: PathBuf = PathBuf::from(".managed.json");
    +
    +    /// Only one process should be able to write tantivy's index at a time.
    +    /// This file, when present, is in charge of preventing other processes from opening an `IndexWriter`.
    +    ///
    +    /// If the process is killed and this file remains, it is safe to remove it manually.
         pub static ref LOCKFILE_FILEPATH: PathBuf = PathBuf::from(".tantivy-indexer.lock");
     }
    \ No newline at end of file
    diff --git a/src/core/segment.rs b/src/core/segment.rs
    index 7baf9516c..49f811f90 100644
    --- a/src/core/segment.rs
    +++ b/src/core/segment.rs
    @@ -43,10 +43,6 @@ impl Segment {
             self.index.schema()
         }
     
    -    pub fn index(&self,) -> &Index {
    -        &self.index
    -    }
    -
         /// Returns the segment meta-information
         pub fn meta(&self) -> &SegmentMeta {
             &self.meta
    @@ -70,19 +66,25 @@ impl Segment {
             self.meta.relative_path(component)
         }
     
    +
    +    /// Protects a specific component file from being deleted.
    +    ///
    +    /// Returns a FileProtection object. The file is guaranteed
    +    /// to not be garbage collected as long as this `FileProtection`  object
    +    /// lives.
         pub fn protect_from_delete(&self, component: SegmentComponent) -> FileProtection {
             let path = self.relative_path(component);
             self.index.directory().protect_file_from_delete(&path)
         }
     
    -    /// Open one of the component file for read.
    +    /// Opens one of the component files for a *regular* read.
         pub fn open_read(&self, component: SegmentComponent) -> result::Result {
             let path = self.relative_path(component);
             let source = try!(self.index.directory().open_read(&path));
             Ok(source)
         }
     
    -    /// Open one of the component file for write.
    +    /// Opens one of the component files for a *regular* write.
         pub fn open_write(&mut self, component: SegmentComponent) -> result::Result {
             let path = self.relative_path(component);
             let write = try!(self.index.directory_mut().open_write(&path));
    @@ -102,4 +104,65 @@ pub trait SerializableSegment {
     #[derive(Clone,Debug,RustcDecodable,RustcEncodable)]
     pub struct SegmentInfo {
     	pub max_doc: DocId,
    +}
    +
    +#[cfg(test)]
    +mod tests {
    +
    +    use core::SegmentComponent;
    +    use std::path::Path;
    +    use directory::Directory;
    +    use schema::{SchemaBuilder, Document, FieldValue, TEXT, Term};
    +    use Index;
    +
    +    #[test]
    +    fn test_segment_protect_component() {
    +        let mut schema_builder = SchemaBuilder::default();
    +        let text_field = schema_builder.add_text_field("text", TEXT);
    +        let schema = schema_builder.build();
    +        let index = Index::create_in_ram(schema);
    +        let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
    +        
    +        {
    +            // simply creating one segment holding two documents,
    +            // then one delete to create the DELETE file.
    +            {
    +                let doc1 = doc!(text_field=>"a");
    +                index_writer.add_document(doc1);
    +                let doc2 = doc!(text_field=>"b");
    +                index_writer.add_document(doc2);
    +                assert!(index_writer.commit().is_ok());
    +            }
    +            {
    +                index_writer.delete_term(Term::from_field_text(text_field, "a"));
    +                assert!(index_writer.commit().is_ok());
    +            }
    +        }
    +
    +        let segments = index.searchable_segments().unwrap();
    +        let directory = index.directory().clone();
    +        assert_eq!(segments.len(), 1);
    +
    +        
    +        let delete_file_path = Path::new("00000000000000000000000000000000.4.del");
    +        let idx_file_path = Path::new("00000000000000000000000000000000.term");
    +        assert!(directory.exists(&*delete_file_path));
    +        assert!(directory.exists(&*idx_file_path));
    +
    +        {
    +            let _file_protection = segments[0].protect_from_delete(SegmentComponent::DELETE);
    +            index_writer.delete_term(Term::from_field_text(text_field, "b"));
    +            index_writer.commit().unwrap();
    +            // the delete file is protected, and should not be gc'ed.
    +            assert!(directory.exists(&*delete_file_path));
    +        }
    +
    +        index_writer.commit().unwrap();
    +        
    +        // at this point the protection is released.
    +        assert!(!directory.exists(&*delete_file_path));
    +
    +    }
    +        
    +
     }
    \ No newline at end of file
    diff --git a/src/datastruct/stacker/heap.rs b/src/datastruct/stacker/heap.rs
    index cd8d16e89..1ae116f2b 100644
    --- a/src/datastruct/stacker/heap.rs
    +++ b/src/datastruct/stacker/heap.rs
    @@ -1,6 +1,6 @@
     use std::cell::UnsafeCell;
     use std::mem;
    -use common::create_vec_with_len;
    +use common::allocate_vec;
     use std::ptr;
     
     /// `BytesRef` refers to a slice in tantivy's custom `Heap`.
    @@ -105,18 +105,11 @@ struct InnerHeap {
         next_heap: Option>,
     }
     
    -/// initializing a long Vec is crazy slow in 
    -/// debug mode.
    -/// We use this unsafe trick to make unit test
    -/// way faster.
    -fn allocate_fast(num_bytes: usize) -> Vec<u8> {
    -    create_vec_with_len(num_bytes)
    -}
     
     impl InnerHeap {
     
         pub fn with_capacity(num_bytes: usize) -> InnerHeap {
    -        let buffer: Vec<u8> = allocate_fast(num_bytes);
    +        let buffer: Vec<u8> = allocate_vec(num_bytes);
             InnerHeap {
                 buffer: buffer,
                 buffer_len: num_bytes as u32,
    diff --git a/src/directory/managed_directory.rs b/src/directory/managed_directory.rs
    index ffa1697af..1f5f551ce 100644
    --- a/src/directory/managed_directory.rs
    +++ b/src/directory/managed_directory.rs
    @@ -35,11 +35,26 @@ struct MetaInformation {
         protected_files: HashMap,
     }
     
    +
    +/// A `FileProtection` prevents the garbage collection of a file.
    +///
    +/// See `ManagedDirectory.protect_file_from_delete`.
     pub struct FileProtection {
         directory: ManagedDirectory,
         path: PathBuf,
     }
     
    +fn unprotect_file_from_delete(directory: &ManagedDirectory, path: &Path) {
    +    let mut meta_informations_wlock = directory.meta_informations
    +        .write()
    +        .expect("Managed file lock poisoned");
    +    if let Some(counter_ref_mut) = meta_informations_wlock
    +        .protected_files
    +        .get_mut(path) {
    +        (*counter_ref_mut) -= 1;
    +    }
    +}
    +
     impl fmt::Debug for FileProtection {
         fn fmt(&self, formatter: &mut fmt::Formatter) -> result::Result<(), fmt::Error> {
             write!(formatter, "FileProtectionFor({:?})", self.path)    
    @@ -48,7 +63,7 @@ impl fmt::Debug for FileProtection {
     
     impl Drop for FileProtection {
         fn drop(&mut self) {
    -        self.directory.unprotect_file_from_delete(&self.path);
    +        unprotect_file_from_delete(&self.directory, &*self.path);
         }
     }
     
    @@ -152,6 +167,12 @@ impl ManagedDirectory {
     
         }
     
    +
    +    /// Protects a file from being garbage collected.
    +    ///
    +    /// The method returns a `FileProtection` object.
    +    /// The file will not be garbage collected as long as the
    +    /// `FileProtection` object is kept alive. 
         pub fn protect_file_from_delete(&self, path: &Path) -> FileProtection {
             let mut meta_informations_wlock = self.meta_informations
                 .write()
    @@ -167,16 +188,6 @@ impl ManagedDirectory {
             }
         }
     
    -    pub fn unprotect_file_from_delete(&self, path: &Path) {
    -        let mut meta_informations_wlock = self.meta_informations
    -            .write()
    -            .expect("Managed file lock poisoned");
    -        if let Some(counter_ref_mut) = meta_informations_wlock
    -            .protected_files
    -            .get_mut(path) {
    -            (*counter_ref_mut) -= 1;
    -        }
    -    }
     
         /// Saves the file containing the list of existing files
         /// that were created by tantivy.
    @@ -358,4 +369,28 @@ mod tests {
     
         }
     
    +
    +    #[test]
    +    fn test_managed_directory_protect() {
    +        let tempdir = TempDir::new("index").unwrap();
    +        let tempdir_path = PathBuf::from(tempdir.path());
    +        let living_files = HashSet::new();
    +
    +        let mmap_directory = MmapDirectory::open(&tempdir_path).unwrap();
    +        let mut managed_directory = ManagedDirectory::new(mmap_directory).unwrap();
    +        managed_directory.atomic_write(*TEST_PATH1, &vec!(0u8,1u8)).unwrap();
    +        assert!(managed_directory.exists(*TEST_PATH1));
    +
    +        {
    +            let _file_protection = managed_directory.protect_file_from_delete(*TEST_PATH1);
    +            managed_directory.garbage_collect(living_files.clone());
    +            assert!(managed_directory.exists(*TEST_PATH1));
    +        }
    +
    +        managed_directory.garbage_collect(living_files.clone());
    +        assert!(!managed_directory.exists(*TEST_PATH1));
    +        
    +
    +    }
    +
     }
    diff --git a/src/indexer/index_writer.rs b/src/indexer/index_writer.rs
    index fbfe2f70f..48d76a576 100644
    --- a/src/indexer/index_writer.rs
    +++ b/src/indexer/index_writer.rs
    @@ -197,9 +197,6 @@ pub fn compute_deleted_bitset(
     }
     
     
    -// TODO put delete bitset in segment entry
    -// rather than DocToOpstamp.
    -
     // TODO skip delete operation before teh 
     // last delete opstamp
     
    diff --git a/src/indexer/merger.rs b/src/indexer/merger.rs
    index ae8679d47..026921111 100644
    --- a/src/indexer/merger.rs
    +++ b/src/indexer/merger.rs
    @@ -17,9 +17,7 @@ use fastfield::FastFieldSerializer;
     use store::StoreWriter;
     use core::SegmentInfo;
     use std::cmp::{min, max};
    -use std::iter;
    -
    -
    +use common::allocate_vec;
     
     pub struct IndexMerger {
         schema: Schema,
    @@ -35,9 +33,7 @@ struct DeltaPositionComputer {
     impl DeltaPositionComputer {
         fn new() -> DeltaPositionComputer {
             DeltaPositionComputer { 
    -            buffer: iter::repeat(0u32)
    -                .take(512)
    -                .collect::>()
    +            buffer: allocate_vec(512)
             }
         }
     
    diff --git a/src/indexer/segment_updater.rs b/src/indexer/segment_updater.rs
    index 517400b27..8fb027042 100644
    --- a/src/indexer/segment_updater.rs
    +++ b/src/indexer/segment_updater.rs
    @@ -171,7 +171,7 @@ impl SegmentUpdater {
         pub fn new(index: Index,
                    stamper: Stamper,
                    delete_cursor: DeleteCursor) -> Result {
    -        let segments = index.segments()?;
    +        let segments = index.searchable_segment_metas()?;
             let segment_manager = SegmentManager::from_segments(segments, delete_cursor);
             Ok(
                 SegmentUpdater(Arc::new(InnerSegmentUpdater {
    diff --git a/src/schema/term.rs b/src/schema/term.rs
    index 656c9feeb..f5502e00e 100644
    --- a/src/schema/term.rs
    +++ b/src/schema/term.rs
    @@ -1,7 +1,7 @@
     use std::fmt;
     
     use common::BinarySerializable;
    -use common::create_vec_with_len;
    +use common::allocate_vec;
     use byteorder::{BigEndian, ByteOrder};
     use super::Field;
     use std::str;
    @@ -45,7 +45,7 @@ impl Term {
         /// The first byte is `1`, and the 4 following bytes are that of the u32.
         pub fn from_field_u32(field: Field, val: u32) -> Term {
             const U32_TERM_LEN: usize = 1 + 4;
    -        let mut buffer = create_vec_with_len(U32_TERM_LEN);
    +        let mut buffer = allocate_vec(U32_TERM_LEN);
             buffer[0] = field.0;
             // we want BigEndian here to have lexicographic order
             // match the natural order of vals.
    
    From a84871468bd7f7ed9a02a6918749a9cde50d2868 Mon Sep 17 00:00:00 2001
    From: Paul Masurel 
    Date: Wed, 5 Apr 2017 10:01:49 +0900
    Subject: [PATCH 101/107] issue/96 Rename FileError -> OpenReadError
    
    ---
     src/core/segment.rs                | 63 ++++++++----------------------
     src/directory/directory.rs         |  8 ++--
     src/directory/error.rs             |  2 +-
     src/directory/managed_directory.rs | 10 ++---
     src/directory/mmap_directory.rs    | 26 ++++++------
     src/directory/ram_directory.rs     | 12 +++---
     src/error.rs                       | 10 ++---
     src/schema/schema.rs               |  6 +++
     8 files changed, 57 insertions(+), 80 deletions(-)
    
    diff --git a/src/core/segment.rs b/src/core/segment.rs
    index 49f811f90..c9d94cacc 100644
    --- a/src/core/segment.rs
    +++ b/src/core/segment.rs
    @@ -11,7 +11,7 @@ use core::Index;
     use std::result;
     use directory::Directory;
     use core::SegmentMeta;
    -use directory::error::{FileError, OpenWriteError};
    +use directory::error::{OpenReadError, OpenWriteError};
     
     /// A segment is a piece of the index.
     #[derive(Clone)]
    @@ -78,7 +78,7 @@ impl Segment {
         }
     
         /// Open one of the component file for a *regular* read.
    -    pub fn open_read(&self, component: SegmentComponent) -> result::Result {
    +    pub fn open_read(&self, component: SegmentComponent) -> result::Result {
             let path = self.relative_path(component);
             let source = try!(self.index.directory().open_read(&path));
             Ok(source)
    @@ -110,59 +110,30 @@ pub struct SegmentInfo {
     mod tests {
     
         use core::SegmentComponent;
    -    use std::path::Path;
         use directory::Directory;
    -    use schema::{SchemaBuilder, Document, FieldValue, TEXT, Term};
    +    use std::collections::HashSet;
    +    use schema::SchemaBuilder;
         use Index;
     
         #[test]
         fn test_segment_protect_component() {
    -        let mut schema_builder = SchemaBuilder::default();
    -        let text_field = schema_builder.add_text_field("text", TEXT);
    -        let schema = schema_builder.build();
    -        let index = Index::create_in_ram(schema);
    -        let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
    +        let mut index = Index::create_in_ram(SchemaBuilder::new().build());
    +        let segment = index.new_segment();
    +        let path = segment.relative_path(SegmentComponent::POSTINGS);
             
    +        let directory = index.directory_mut();
    +        directory.atomic_write(&*path, &vec!(0u8)).unwrap();
    +        
    +        let living_files = HashSet::new();
             {
    -            // simply creating two segments
    -            // with one delete to create the DELETE file.
    -            {
    -                let doc1 = doc!(text_field=>"a");
    -                index_writer.add_document(doc1);
    -                let doc2 = doc!(text_field=>"b");
    -                index_writer.add_document(doc2);
    -                assert!(index_writer.commit().is_ok());
    -            }
    -            {
    -                index_writer.delete_term(Term::from_field_text(text_field, "a"));
    -                assert!(index_writer.commit().is_ok());
    -            }
    +            let _file_protection = segment.protect_from_delete(SegmentComponent::POSTINGS);
    +            assert!(directory.exists(&*path));
    +            directory.garbage_collect(living_files.clone());
    +            assert!(directory.exists(&*path));
             }
     
    -        let segments = index.searchable_segments().unwrap();
    -        let directory = index.directory().clone();
    -        assert_eq!(segments.len(), 1);
    -
    -        
    -        let delete_file_path = Path::new("00000000000000000000000000000000.4.del");
    -        let idx_file_path = Path::new("00000000000000000000000000000000.term");
    -        assert!(directory.exists(&*delete_file_path));
    -        assert!(directory.exists(&*idx_file_path));
    -
    -        {
    -            let _file_protection = segments[0].protect_from_delete(SegmentComponent::DELETE);
    -            index_writer.delete_term(Term::from_field_text(text_field, "b"));
    -            index_writer.commit().unwrap();
    -            // the delete file is protected, and should not be gc'ed.
    -            assert!(directory.exists(&*delete_file_path));
    -        }
    -
    -        index_writer.commit().unwrap();
    -        
    -        // at this point the protection is released.
    -        assert!(!directory.exists(&*delete_file_path));
    -
    +        directory.garbage_collect(living_files);
    +        assert!(!directory.exists(&*path));
         }
    -        
     
     }
    \ No newline at end of file
    diff --git a/src/directory/directory.rs b/src/directory/directory.rs
    index c4a98b237..b555efbcc 100644
    --- a/src/directory/directory.rs
    +++ b/src/directory/directory.rs
    @@ -1,7 +1,7 @@
     use std::marker::Send;
     use std::fmt;
     use std::path::Path;
    -use directory::error::{FileError, DeleteError, OpenWriteError};
    +use directory::error::{OpenReadError, DeleteError, OpenWriteError};
     use directory::{ReadOnlySource, WritePtr};
     use std::result;
     use std::io;
    @@ -26,7 +26,7 @@ pub trait Directory: fmt::Debug + Send + Sync + 'static {
         ///
         /// Specifically, subsequent writes or flushes should
         /// have no effect on the returned `ReadOnlySource` object. 
    -    fn open_read(&self, path: &Path) -> result::Result;
    +    fn open_read(&self, path: &Path) -> result::Result;
     
         /// Removes a file
         ///
    @@ -34,7 +34,7 @@ pub trait Directory: fmt::Debug + Send + Sync + 'static {
         /// existing ReadOnlySource pointing to it.
         /// 
         /// Removing a nonexistent file, yields a
    -    /// `FileError::DoesNotExist`.
    +    /// `DeleteError::DoesNotExist`.
         fn delete(&self, path: &Path) -> result::Result<(), DeleteError>;
     
         /// Returns true iff the file exists
    @@ -65,7 +65,7 @@ pub trait Directory: fmt::Debug + Send + Sync + 'static {
         /// atomic_write.
         ///
         /// This should only be used for small files.
    -    fn atomic_read(&self, path: &Path) -> Result, FileError>;
    +    fn atomic_read(&self, path: &Path) -> Result, OpenReadError>;
     
         /// Atomically replace the content of a file with data.
         /// 
    diff --git a/src/directory/error.rs b/src/directory/error.rs
    index 711f4a3df..b2a7f24f1 100644
    --- a/src/directory/error.rs
    +++ b/src/directory/error.rs
    @@ -29,7 +29,7 @@ impl From for OpenWriteError {
     
     /// Error that may occur when accessing a file read
     #[derive(Debug)]
    -pub enum FileError {
    +pub enum OpenReadError {
         /// The file does not exists.
         FileDoesNotExist(PathBuf),
         /// Any kind of IO error that happens when 
    diff --git a/src/directory/managed_directory.rs b/src/directory/managed_directory.rs
    index 1f5f551ce..6c56625d2 100644
    --- a/src/directory/managed_directory.rs
    +++ b/src/directory/managed_directory.rs
    @@ -1,5 +1,5 @@
     use std::path::{Path, PathBuf};
    -use directory::error::{FileError, DeleteError, OpenWriteError};
    +use directory::error::{OpenReadError, DeleteError, OpenWriteError};
     use directory::{ReadOnlySource, WritePtr};
     use std::result;
     use std::io;
    @@ -85,13 +85,13 @@ impl ManagedDirectory {
                             })),
                     })
                 }
    -            Err(FileError::FileDoesNotExist(_)) => {
    +            Err(OpenReadError::FileDoesNotExist(_)) => {
                     Ok(ManagedDirectory {
                         directory: box directory,
                         meta_informations: Arc::default(),
                     })
                 }
    -            Err(FileError::IOError(e)) => {
    +            Err(OpenReadError::IOError(e)) => {
                     Err(From::from(e))
                 }
             }
    @@ -224,7 +224,7 @@ impl ManagedDirectory {
     
     impl Directory for ManagedDirectory {
         
    -    fn open_read(&self, path: &Path) -> result::Result {
    +    fn open_read(&self, path: &Path) -> result::Result {
             self.directory.open_read(path)
         }
     
    @@ -238,7 +238,7 @@ impl Directory for ManagedDirectory {
             self.directory.atomic_write(path, data)
         }
     
    -    fn atomic_read(&self, path: &Path) -> result::Result, FileError> {
    +    fn atomic_read(&self, path: &Path) -> result::Result, OpenReadError> {
             self.directory.atomic_read(path)
         }
     
    diff --git a/src/directory/mmap_directory.rs b/src/directory/mmap_directory.rs
    index 9a5670b7f..2437ceb35 100644
    --- a/src/directory/mmap_directory.rs
    +++ b/src/directory/mmap_directory.rs
    @@ -1,7 +1,7 @@
     use atomicwrites;
     use common::make_io_err;
     use directory::Directory;
    -use directory::error::{OpenWriteError, FileError, DeleteError, OpenDirectoryError};
    +use directory::error::{OpenWriteError, OpenReadError, DeleteError, OpenDirectoryError};
     use directory::ReadOnlySource;
     use directory::shared_vec_slice::SharedVecSlice;
     use directory::WritePtr;
    @@ -23,19 +23,19 @@ use std::sync::RwLock;
     use std::sync::Weak;
     use tempdir::TempDir;
     
    -fn open_mmap(full_path: &PathBuf) -> result::Result>, FileError> {
    +fn open_mmap(full_path: &PathBuf) -> result::Result>, OpenReadError> {
         let convert_file_error = |err: io::Error| {
             if err.kind() == io::ErrorKind::NotFound {
    -            FileError::FileDoesNotExist(full_path.clone())
    +            OpenReadError::FileDoesNotExist(full_path.clone())
             }
             else {
    -            FileError::IOError(err)
    +            OpenReadError::IOError(err)
             }
         };
         let file = File::open(&full_path).map_err(convert_file_error)?;
         let meta_data = file
             .metadata()
    -        .map_err(|e| FileError::IOError(e))?;
    +        .map_err(|e| OpenReadError::IOError(e))?;
         if meta_data.len() == 0 {
             // if the file size is 0, it will not be possible 
             // to mmap the file, so we return an anonymous mmap_cache
    @@ -47,7 +47,7 @@ fn open_mmap(full_path: &PathBuf) -> result::Result>, FileError
                 Ok(Some(Arc::new(mmap)))
             }
             Err(e) => {
    -            Err(FileError::IOError(e))
    +            Err(OpenReadError::IOError(e))
             }
         }
         
    @@ -116,7 +116,7 @@ impl MmapCache {
             }
         }
     
    -    fn get_mmap(&mut self, full_path: PathBuf) -> Result>, FileError> {
    +    fn get_mmap(&mut self, full_path: PathBuf) -> Result>, OpenReadError> {
             // if we exceed this limit, then we go through the weak
             // and remove those that are obsolete.
             if self.cache.len() > self.purge_weak_limit {
    @@ -286,13 +286,13 @@ impl Seek for SafeFileWriter {
     
     impl Directory for MmapDirectory {
         
    -    fn open_read(&self, path: &Path) -> result::Result {
    +    fn open_read(&self, path: &Path) -> result::Result {
             debug!("Open Read {:?}", path);
             let full_path = self.resolve_path(path);
             
             let mut mmap_cache = self.mmap_cache
                 .write()
    -            .map_err(|_| FileError::IOError(
    +            .map_err(|_| OpenReadError::IOError(
                     make_io_err(format!("Failed to acquired write lock on mmap cache while reading {:?}", path))
                 ))?;
             
    @@ -367,21 +367,21 @@ impl Directory for MmapDirectory {
             full_path.exists()
         }
     
    -    fn atomic_read(&self, path: &Path) -> Result, FileError> {
    +    fn atomic_read(&self, path: &Path) -> Result, OpenReadError> {
             let full_path = self.resolve_path(path);
             let mut buffer = Vec::new();
             match File::open(&full_path) {
                 Ok(mut file) => {
                     file.read_to_end(&mut buffer)
    -                    .map_err(|e| FileError::IOError(e))?;
    +                    .map_err(|e| OpenReadError::IOError(e))?;
                     Ok(buffer)
                 }
                 Err(e) => {
                     if e.kind() == io::ErrorKind::NotFound {
    -                    Err(FileError::FileDoesNotExist(path.to_owned()))
    +                    Err(OpenReadError::FileDoesNotExist(path.to_owned()))
                     }
                     else {
    -                    Err(FileError::IOError(e))
    +                    Err(OpenReadError::IOError(e))
                     }
                 }
             }
    diff --git a/src/directory/ram_directory.rs b/src/directory/ram_directory.rs
    index b8a3b5c95..ddb97080b 100644
    --- a/src/directory/ram_directory.rs
    +++ b/src/directory/ram_directory.rs
    @@ -6,7 +6,7 @@ use std::result;
     use std::sync::{Arc, RwLock};
     use common::make_io_err;
     use directory::{Directory, ReadOnlySource};
    -use directory::error::{OpenWriteError, FileError, DeleteError};
    +use directory::error::{OpenWriteError, OpenReadError, DeleteError};
     use directory::WritePtr;
     use super::shared_vec_slice::SharedVecSlice;
     
    @@ -87,17 +87,17 @@ impl InnerDirectory {
             Ok(prev_value.is_some())
         }
     
    -    fn open_read(&self, path: &Path) -> Result { 
    +    fn open_read(&self, path: &Path) -> Result { 
             self.0
                 .read()
                 .map_err(|_| {
                     let io_err = make_io_err(format!("Failed to acquire read lock for the directory, when trying to read {:?}", path));
    -                FileError::IOError(io_err)
    +                OpenReadError::IOError(io_err)
                 })
                 .and_then(|readable_map| {
                     readable_map
                     .get(path)
    -                .ok_or_else(|| FileError::FileDoesNotExist(PathBuf::from(path)))
    +                .ok_or_else(|| OpenReadError::FileDoesNotExist(PathBuf::from(path)))
                     .map(|data| {
                         ReadOnlySource::Anonymous(SharedVecSlice::new(data.clone()))
                     })
    @@ -160,7 +160,7 @@ impl RAMDirectory {
     }
     
     impl Directory for RAMDirectory {
    -    fn open_read(&self, path: &Path) -> result::Result {
    +    fn open_read(&self, path: &Path) -> result::Result {
             self.fs.open_read(path)
         }
         
    @@ -185,7 +185,7 @@ impl Directory for RAMDirectory {
             self.fs.exists(path)
         }
     
    -    fn atomic_read(&self, path: &Path) -> Result, FileError> {
    +    fn atomic_read(&self, path: &Path) -> Result, OpenReadError> {
             let read = self.open_read(path)?;
             Ok(read.as_slice()
                    .to_owned())
    diff --git a/src/error.rs b/src/error.rs
    index 8eae79513..5eaa50a9a 100644
    --- a/src/error.rs
    +++ b/src/error.rs
    @@ -7,7 +7,7 @@ use std::io;
     use std::path::PathBuf;
     use std::error;
     use std::sync::PoisonError;
    -use directory::error::{FileError, OpenWriteError, OpenDirectoryError};
    +use directory::error::{OpenReadError, OpenWriteError, OpenDirectoryError};
     use query;
     use schema;
     
    @@ -59,11 +59,11 @@ impl From> for Error {
         }
     }
     
    -impl From for Error {
    -    fn from(error: FileError) -> Error {
    +impl From for Error {
    +    fn from(error: OpenReadError) -> Error {
             match error {
    -            FileError::FileDoesNotExist(filepath) => Error::PathDoesNotExist(filepath),
    -            FileError::IOError(io_error) => Error::IOError(io_error),
    +            OpenReadError::FileDoesNotExist(filepath) => Error::PathDoesNotExist(filepath),
    +            OpenReadError::IOError(io_error) => Error::IOError(io_error),
             }
         }
     }
    diff --git a/src/schema/schema.rs b/src/schema/schema.rs
    index 287ad40a0..3ed8b6c9d 100644
    --- a/src/schema/schema.rs
    +++ b/src/schema/schema.rs
    @@ -42,6 +42,12 @@ pub struct SchemaBuilder {
     
     impl SchemaBuilder {
         
    +
    +    /// Create a new `SchemaBuilder`
    +    pub fn new() -> SchemaBuilder {
    +        SchemaBuilder::default()
    +    }
    +
         /// Adds a new u32 field.
         /// Returns the associated field handle
         ///
    
    From 4bef6c99eefc52e2cefcc5b8c2e5b17280c61732 Mon Sep 17 00:00:00 2001
    From: Paul Masurel 
    Date: Wed, 5 Apr 2017 10:12:39 +0900
    Subject: [PATCH 102/107] issue/96 Cleaning up some lock management
    
    ---
     src/directory/managed_directory.rs | 39 ++++++++++++++++++------------
     1 file changed, 24 insertions(+), 15 deletions(-)
    
    diff --git a/src/directory/managed_directory.rs b/src/directory/managed_directory.rs
    index 6c56625d2..49c41fa49 100644
    --- a/src/directory/managed_directory.rs
    +++ b/src/directory/managed_directory.rs
    @@ -111,9 +111,10 @@ impl ManagedDirectory {
         pub fn garbage_collect(&mut self, living_files: HashSet) {
             let mut files_to_delete = vec!();
             {   // releasing the lock as .delete() will use it too.
    -            let mut meta_informations_wlock = self.meta_informations.write().unwrap();
    -            let managed_paths_write = &mut meta_informations_wlock.managed_paths;
    -            for managed_path in managed_paths_write.iter() {
    +            let meta_informations_rlock = self.meta_informations
    +                .read()
    +                .expect("Managed directory rlock poisoned in garbage collect.");
    +            for managed_path in &meta_informations_rlock.managed_paths {
                     if !living_files.contains(managed_path) {
                         files_to_delete.push(managed_path.clone());
                     }
    @@ -154,7 +155,9 @@ impl ManagedDirectory {
                 // update the list of managed files by removing 
                 // the file that were removed.
                 {
    -                let mut meta_informations_wlock = self.meta_informations.write().unwrap();
    +                let mut meta_informations_wlock = self.meta_informations
    +                    .write()
    +                    .expect("Managed directory wlock poisoned (2).");
                     let managed_paths_write = &mut meta_informations_wlock.managed_paths;
                     for delete_file in &deleted_files {
                         managed_paths_write.remove(delete_file);
    @@ -174,14 +177,16 @@ impl ManagedDirectory {
         /// The file will not be garbage collected as long as the
         /// `FileProtection` object is kept alive. 
         pub fn protect_file_from_delete(&self, path: &Path) -> FileProtection {
    -        let mut meta_informations_wlock = self.meta_informations
    -            .write()
    -            .expect("Managed file lock poisoned");
             let pathbuf = path.to_owned();
    -        *meta_informations_wlock
    -            .protected_files
    -            .entry(pathbuf.clone())
    -            .or_insert(0) += 1;
    +        {
    +            let mut meta_informations_wlock = self.meta_informations
    +                .write()
    +                .expect("Managed file lock poisoned on protect");
    +            *meta_informations_wlock
    +                .protected_files
    +                .entry(pathbuf.clone())
    +                .or_insert(0) += 1;
    +        }
             FileProtection {
                 directory: self.clone(),
                 path: pathbuf.clone(),
    @@ -192,11 +197,15 @@ impl ManagedDirectory {
         /// Saves the file containing the list of existing files
         /// that were created by tantivy.
         fn save_managed_paths(&mut self,) -> io::Result<()> {
    -        let meta_informations_rlock = self.meta_informations
    -            .read()
    -            .expect("Managed file lock poisoned");
    +        let managed_paths;
    +        {
    +            let meta_informations_rlock = self.meta_informations
    +                .read()
    +                .expect("Managed file lock poisoned");
    +            managed_paths = meta_informations_rlock.managed_paths.clone();
    +        }
             let mut w = vec!();
    -        try!(write!(&mut w, "{}\n", json::as_pretty_json(&meta_informations_rlock.managed_paths)));
    +        try!(write!(&mut w, "{}\n", json::as_pretty_json(&managed_paths)));
             self.directory.atomic_write(&MANAGED_FILEPATH, &w[..])?;
             Ok(())
         }
    
    From a4ba20eea3a0edec6a28bdbba914d0c6ac34415e Mon Sep 17 00:00:00 2001
    From: Paul Masurel 
    Date: Sat, 8 Apr 2017 17:30:25 +0900
    Subject: [PATCH 103/107] issue/96 code clean up, adding comments.wq
    
    ---
     src/indexer/delete_queue.rs    |  8 +++---
     src/indexer/merge_policy.rs    |  5 ++++
     src/indexer/segment_entry.rs   | 47 +++++++++++++++++++++++++++++++---
     src/indexer/segment_manager.rs | 19 +++-----------
     src/indexer/segment_updater.rs | 21 +++++++++++++++
     5 files changed, 77 insertions(+), 23 deletions(-)
    
    diff --git a/src/indexer/delete_queue.rs b/src/indexer/delete_queue.rs
    index 1f476dad6..a031c63d3 100644
    --- a/src/indexer/delete_queue.rs
    +++ b/src/indexer/delete_queue.rs
    @@ -21,7 +21,7 @@ use std::ops::DerefMut;
     #[derive(Default)]
     struct InnerDeleteQueue {
         writer: Vec,
    -    last_block: Option>, // TODO last block... is that ok.
    +    last_block: Option>,
     }
     
     #[derive(Clone, Default)]
    @@ -36,7 +36,7 @@ impl DeleteQueue {
         pub fn new() -> DeleteQueue {
             
             let delete_queue = DeleteQueue {
    -            inner: Arc::new(RwLock::new(InnerDeleteQueue::default()))
    +            inner: Arc::default(),
             };
                 
             let next_block = NextBlock::from(delete_queue.clone());
    @@ -61,7 +61,7 @@ impl DeleteQueue {
         pub fn cursor(&self) -> DeleteCursor {
             let last_block = self.inner
                 .read()
    -            .unwrap()
    +            .expect("Read lock poisoned when opening delete queue cursor")
                 .last_block
                 .clone()
                 .expect("Failed to unwrap last_block. This should never happen
    @@ -253,7 +253,7 @@ impl DeleteCursor {
     
         /// Get the current delete operation.
         /// Calling `.get` does not advance the cursor.
    -    pub fn get<'a>(&'a mut self) -> Option<&'a DeleteOperation> {
    +    pub fn get(&mut self) -> Option<&DeleteOperation> {
             if self.load_block_if_required() {
                 Some(&self.block.operations[self.pos])
             }
    diff --git a/src/indexer/merge_policy.rs b/src/indexer/merge_policy.rs
    index dfd9dfcec..ecab510d7 100644
    --- a/src/indexer/merge_policy.rs
    +++ b/src/indexer/merge_policy.rs
    @@ -51,6 +51,11 @@ pub mod tests {
         use core::SegmentId;
         use core::SegmentMeta;
     
    +
    +    /// Merge policy useful for test purposes.
    +    ///
    +    /// Every time there is more than one segment,
    +    /// it will suggest to merge them.
         #[derive(Debug)]
         pub struct MergeWheneverPossible;
     
    diff --git a/src/indexer/segment_entry.rs b/src/indexer/segment_entry.rs
    index c93bcfe64..86673d517 100644
    --- a/src/indexer/segment_entry.rs
    +++ b/src/indexer/segment_entry.rs
    @@ -20,6 +20,20 @@ impl SegmentState {
         }
     }
     
    +
    +/// A segment entry describes the state of 
    +/// a given segment, at a given instant.
    +///
    +/// In addition to segment meta,
    +/// it contains a few transient states
    +/// - state expresses whether the segment is already in the 
    +/// middle of a merge
    +/// - delete_bitset is a bitset describing
    +/// documents that were deleted during the commit
    +/// itself.
    +/// - delete_cursor is the position in the delete queue.
    +/// Deletes happening before the cursor are reflected either
    +/// in the .del file or in the delete_bitset.
     #[derive(Clone)]
     pub struct SegmentEntry {
         meta: SegmentMeta,
    @@ -31,6 +45,8 @@ pub struct SegmentEntry {
     
     impl SegmentEntry {
     
    +
    +    /// Create a new `SegmentEntry`
         pub fn new(segment_meta: SegmentMeta, 
                    delete_cursor: DeleteCursor,
                    delete_bitset: Option) -> SegmentEntry {
    @@ -42,42 +58,65 @@ impl SegmentEntry {
             }
         }
     
    +
    +    /// Return a reference to the segment entry deleted bitset.
    +    ///
    +    /// `DocId` in this bitset are flagged as deleted.
         pub fn delete_bitset(&self,) -> Option<&BitSet> {
             self.delete_bitset.as_ref()
         }
     
    +    /// Set the `SegmentMeta` for this segment.
         pub fn set_meta(&mut self, segment_meta: SegmentMeta) {
             self.meta = segment_meta;
         }
     
    +
    +    /// Return a mutable reference to the segment entry's delete cursor.
         pub fn delete_cursor(&mut self) -> &mut DeleteCursor {
             &mut self.delete_cursor
         }
     
    +    /// Return the `SegmentState` of this segment entry.
    +    ///
    +    /// The state describes whether the segment is available for
    +    /// a merge or not.
         pub fn state(&self) -> SegmentState {
             self.state
         }
     
    -    pub fn set_state(&mut self, state: SegmentState) {
    -        self.state = state;
    -    }
    -
    +    /// Returns the segment id.
         pub fn segment_id(&self) -> SegmentId {
             self.meta.id()
         }
         
    +
    +    /// Accessor to the `SegmentMeta`
         pub fn meta(&self) -> &SegmentMeta {
             &self.meta
         }
     
    +
    +    /// Mark the `SegmentEntry` as in merge.
    +    ///
    +    /// Only segments that are not already 
    +    /// in a merge are eligible for a future merge.
         pub fn start_merge(&mut self,) {
             self.state = SegmentState::InMerge;
         }
     
    +    /// Cancel a merge
    +    ///
    +    /// If a merge fails, it is important to switch
    +    /// the segment back to an idle state, so that it
    +    /// may be eligible for future merges.
         pub fn cancel_merge(&mut self,) {
             self.state = SegmentState::Ready;
         }
     
    +
    +    /// Returns true iff a segment should
    +    /// be considered for a merge.
         pub fn is_ready(&self,) -> bool {
             self.state == SegmentState::Ready
         }
    diff --git a/src/indexer/segment_manager.rs b/src/indexer/segment_manager.rs
    index 4961cb148..66f6fe569 100644
    --- a/src/indexer/segment_manager.rs
    +++ b/src/indexer/segment_manager.rs
    @@ -3,7 +3,7 @@ use std::sync::RwLock;
     use core::SegmentMeta;
     use core::{META_FILEPATH, LOCKFILE_FILEPATH};
     use core::SegmentId;
    -use indexer::{SegmentEntry, SegmentState};
    +use indexer::SegmentEntry;
     use std::path::PathBuf;
     use std::collections::hash_set::HashSet;
     use std::sync::{RwLockReadGuard, RwLockWriteGuard};
    @@ -54,6 +54,7 @@ impl SegmentManager {
             }
         }
     
    +    /// Returns all of the segment entries (committed or uncommitted)
         pub fn segment_entries(&self,) -> Vec {
             let mut segment_entries = self.read()
                 .uncommitted
    @@ -66,6 +67,7 @@ impl SegmentManager {
             segment_entries
         }
     
    +    /// Returns the overall number of segments in the `SegmentManager`
         pub fn num_segments(&self,) -> usize {
             let registers_lock = self.read();
             registers_lock.committed.len() + registers_lock.uncommitted.len()
    @@ -95,11 +97,6 @@ impl SegmentManager {
             files
         }
     
    -    pub fn segment_state(&self, segment_id: &SegmentId) -> Option {
    -        self.segment_entry(segment_id)
    -            .map(|segment_entry| segment_entry.state())
    -    }
    -
         pub fn segment_entry(&self, segment_id: &SegmentId) -> Option {
             let registers = self.read();
             registers
    @@ -119,15 +116,7 @@ impl SegmentManager {
             self.registers.write().expect("Failed to acquire write lock on SegmentManager.")
         }
     
    -    pub fn commit(&self, mut segment_entries: Vec) {
    -        // TODO is still relevant!?
    -        // restore the state of the segment_entries
    -        for segment_entry in &mut segment_entries {
    -            let segment_id = segment_entry.segment_id();
    -            if let Some(state) = self.segment_state(&segment_id) {
    -                segment_entry.set_state(state);
    -            }
    -        }
    +    pub fn commit(&self, segment_entries: Vec) {
             let mut registers_lock = self.write();
             registers_lock.committed.clear();
             registers_lock.uncommitted.clear();
    diff --git a/src/indexer/segment_updater.rs b/src/indexer/segment_updater.rs
    index 8fb027042..4ac7ddea7 100644
    --- a/src/indexer/segment_updater.rs
    +++ b/src/indexer/segment_updater.rs
    @@ -237,6 +237,11 @@ impl SegmentUpdater {
             !self.0.killed.load(Ordering::Acquire)
         }
     
    +
    +    /// Apply deletes up to the target opstamp to all segments.
    +    ///
    +    /// The method returns copies of the segment entries,
    +    /// updated with the delete information.
         fn purge_deletes(&self, target_opstamp: u64) -> Result> {
             let mut segment_entries = self.0.segment_manager.segment_entries(); 
             for segment_entry in &mut segment_entries {
    @@ -374,6 +379,22 @@ impl SegmentUpdater {
             }).wait()
         }
     
    +
    +    /// Wait for current merging threads.
    +    ///
    +    /// Upon termination of the current merging threads,
    +    /// merge opportunities may appear.
    +    ///
    +    /// We keep waiting until the merge policy judges that
    +    /// no opportunity is available.
    +    ///
    +    /// Note that it is not required to call this 
    +    /// method in your application.
    +    /// Terminating your application without letting 
    +    /// merge terminate is perfectly safe.
    +    /// 
    +    /// Obsolete files will eventually be cleaned up
    +    /// by the directory garbage collector.
         pub fn wait_merging_thread(&self) -> Result<()> {
     
             let mut num_segments: usize;
    
    From ce022e5f06ed6d1f19e7ea7e1e3de32d194b995c Mon Sep 17 00:00:00 2001
    From: Paul Masurel 
    Date: Sat, 8 Apr 2017 17:50:59 +0900
    Subject: [PATCH 104/107] issue/54 Clone segment reader rather than reload.
    
    Closes #54.
    ---
     src/core/index.rs          | 11 ++++-------
     src/core/segment_reader.rs | 14 ++++++++------
     src/store/reader.rs        |  1 +
     3 files changed, 13 insertions(+), 13 deletions(-)
    
    diff --git a/src/core/index.rs b/src/core/index.rs
    index 71565f321..cb97ba569 100644
    --- a/src/core/index.rs
    +++ b/src/core/index.rs
    @@ -202,16 +202,13 @@ impl Index {
         /// published or after a merge.
         pub fn load_searchers(&self) -> Result<()> {
             let searchable_segments = self.searchable_segments()?;
    -        let mut searchers = Vec::new();
    -        for _ in 0..NUM_SEARCHERS {
    -            let searchable_segments_clone = searchable_segments.clone();
    -            let segment_readers: Vec = try!(searchable_segments_clone
    +        let segment_readers: Vec = try!(searchable_segments
                     .into_iter()
                     .map(SegmentReader::open)
                     .collect());
    -            let searcher = Searcher::from(segment_readers);
    -            searchers.push(searcher);
    -        }
    +        let searchers = (0..NUM_SEARCHERS)
    +            .map(|_| Searcher::from(segment_readers.clone()))
    +            .collect();
             self.searcher_pool.publish_new_generation(searchers);
             Ok(())
         }
    diff --git a/src/core/segment_reader.rs b/src/core/segment_reader.rs
    index 43aa6b50e..3e2c46164 100644
    --- a/src/core/segment_reader.rs
    +++ b/src/core/segment_reader.rs
    @@ -12,6 +12,7 @@ use DocId;
     use std::str;
     use postings::TermInfo;
     use datastruct::FstMap;
    +use std::sync::Arc;
     use std::fmt;
     use rustc_serialize::json;
     use core::SegmentInfo;
    @@ -37,14 +38,15 @@ use error::Error;
     /// The segment reader has a very low memory footprint,
     /// as close to all of the memory data is mmapped.
     ///
    +#[derive(Clone)]
     pub struct SegmentReader {
         segment_info: SegmentInfo,
         segment_id: SegmentId,
    -    term_infos: FstMap,
    +    term_infos: Arc>,
         postings_data: ReadOnlySource,
         store_reader: StoreReader,
    -    fast_fields_reader: U32FastFieldsReader,
    -    fieldnorms_reader: U32FastFieldsReader,
    +    fast_fields_reader: Arc,
    +    fieldnorms_reader: Arc,
         delete_bitset: DeleteBitSet,
         positions_data: ReadOnlySource,
         schema: Schema,
    @@ -171,11 +173,11 @@ impl SegmentReader {
             Ok(SegmentReader {
                 segment_info: segment_info,
                 postings_data: postings_shared_mmap,
    -            term_infos: term_infos,
    +            term_infos: Arc::new(term_infos),
                 segment_id: segment.id(),
                 store_reader: store_reader,
    -            fast_fields_reader: fast_fields_reader,
    -            fieldnorms_reader: fieldnorms_reader,
    +            fast_fields_reader: Arc::new(fast_fields_reader),
    +            fieldnorms_reader: Arc::new(fieldnorms_reader),
                 delete_bitset: delete_bitset,
                 positions_data: positions_data,
                 schema: schema,
    diff --git a/src/store/reader.rs b/src/store/reader.rs
    index c3bfddfb7..3a9918f9d 100644
    --- a/src/store/reader.rs
    +++ b/src/store/reader.rs
    @@ -11,6 +11,7 @@ use std::io::{self, Read};
     use datastruct::SkipList;
     use lz4;
     
    +#[derive(Clone)]
     pub struct StoreReader {
         pub data: ReadOnlySource,
         pub offset_index_source: ReadOnlySource,
    
    From dc43135fe0f0fa4abf63dcf787b365b49925ae62 Mon Sep 17 00:00:00 2001
    From: Paul Masurel 
    Date: Sat, 8 Apr 2017 18:49:37 +0900
    Subject: [PATCH 105/107] NOBUG Remove .info
    
    ---
     src/core/mod.rs                   |  1 -
     src/core/segment.rs               |  6 ------
     src/core/segment_component.rs     |  4 +---
     src/core/segment_meta.rs          | 16 +++++++---------
     src/core/segment_reader.rs        | 29 ++++++-----------------------
     src/indexer/merger.rs             | 10 ++++------
     src/indexer/segment_serializer.rs | 15 ---------------
     src/indexer/segment_writer.rs     | 22 +++++-----------------
     8 files changed, 23 insertions(+), 80 deletions(-)
    
    diff --git a/src/core/mod.rs b/src/core/mod.rs
    index d0a3964a4..6f7cb9edc 100644
    --- a/src/core/mod.rs
    +++ b/src/core/mod.rs
    @@ -14,7 +14,6 @@ pub use self::segment_component::SegmentComponent;
     pub use self::segment_id::SegmentId;
     pub use self::segment_reader::SegmentReader;
     pub use self::segment::Segment;
    -pub use self::segment::SegmentInfo;
     pub use self::segment::SerializableSegment;
     pub use self::index::Index;
     pub use self::segment_meta::SegmentMeta;
    diff --git a/src/core/segment.rs b/src/core/segment.rs
    index c9d94cacc..c99d36e85 100644
    --- a/src/core/segment.rs
    +++ b/src/core/segment.rs
    @@ -1,7 +1,6 @@
     use Result;
     use std::path::PathBuf;
     use schema::Schema;
    -use DocId;
     use std::fmt;
     use core::SegmentId;
     use directory::{ReadOnlySource, WritePtr, FileProtection};
    @@ -101,11 +100,6 @@ pub trait SerializableSegment {
         fn write(&self, serializer: SegmentSerializer) -> Result;
     }
     
    -#[derive(Clone,Debug,RustcDecodable,RustcEncodable)]
    -pub struct SegmentInfo {
    -	pub max_doc: DocId,
    -}
    -
     #[cfg(test)]
     mod tests {
     
    diff --git a/src/core/segment_component.rs b/src/core/segment_component.rs
    index 5e380c597..6f85c4031 100644
    --- a/src/core/segment_component.rs
    +++ b/src/core/segment_component.rs
    @@ -1,6 +1,5 @@
     #[derive(Copy, Clone)]
     pub enum SegmentComponent {
    -    INFO,
         POSTINGS,
         POSITIONS,
         FASTFIELDS,
    @@ -13,8 +12,7 @@ pub enum SegmentComponent {
     impl SegmentComponent {
         
         pub fn iterator() -> impl Iterator {
    -        static SEGMENT_COMPONENTS: [SegmentComponent;  8] = [
    -            SegmentComponent::INFO,
    +        static SEGMENT_COMPONENTS: [SegmentComponent;  7] = [
                 SegmentComponent::POSTINGS,
                 SegmentComponent::POSITIONS,
                 SegmentComponent::FASTFIELDS,
    diff --git a/src/core/segment_meta.rs b/src/core/segment_meta.rs
    index a12428b07..9716c348b 100644
    --- a/src/core/segment_meta.rs
    +++ b/src/core/segment_meta.rs
    @@ -65,17 +65,15 @@ impl SegmentMeta {
         /// It just joins the segment id with the extension 
         /// associated to a segment component.
         pub fn relative_path(&self, component: SegmentComponent) -> PathBuf {
    -        use self::SegmentComponent::*;
             let mut path = self.id().uuid_string();
             path.push_str(&*match component {
    -            POSITIONS => ".pos".to_string(),
    -            INFO => ".info".to_string(),
    -            POSTINGS => ".idx".to_string(),
    -            TERMS => ".term".to_string(),
    -            STORE => ".store".to_string(),
    -            FASTFIELDS => ".fast".to_string(),
    -            FIELDNORMS => ".fieldnorm".to_string(),
    -            DELETE => {format!(".{}.del", self.delete_opstamp().unwrap_or(0))},
    +            SegmentComponent::POSITIONS => ".pos".to_string(),
    +            SegmentComponent::POSTINGS => ".idx".to_string(),
    +            SegmentComponent::TERMS => ".term".to_string(),
    +            SegmentComponent::STORE => ".store".to_string(),
    +            SegmentComponent::FASTFIELDS => ".fast".to_string(),
    +            SegmentComponent::FIELDNORMS => ".fieldnorm".to_string(),
    +            SegmentComponent::DELETE => {format!(".{}.del", self.delete_opstamp().unwrap_or(0))},
             });
             PathBuf::from(path)
         }
    diff --git a/src/core/segment_reader.rs b/src/core/segment_reader.rs
    index 3e2c46164..ad79f6a42 100644
    --- a/src/core/segment_reader.rs
    +++ b/src/core/segment_reader.rs
    @@ -4,6 +4,7 @@ use core::SegmentId;
     use core::SegmentComponent;
     use schema::Term;
     use common::HasLen;
    +use core::SegmentMeta;
     use fastfield::delete::DeleteBitSet;
     use store::StoreReader;
     use schema::Document;
    @@ -14,8 +15,6 @@ use postings::TermInfo;
     use datastruct::FstMap;
     use std::sync::Arc;
     use std::fmt;
    -use rustc_serialize::json;
    -use core::SegmentInfo;
     use schema::Field;
     use postings::SegmentPostingsOption;
     use postings::SegmentPostings;
    @@ -24,8 +23,6 @@ use schema::Schema;
     use schema::FieldType;
     use postings::FreqHandler;
     use schema::TextIndexingOptions;
    -use error::Error;
    -
     
     /// Entry point to access all of the datastructures of the `Segment`
     ///
    @@ -40,8 +37,8 @@ use error::Error;
     ///
     #[derive(Clone)]
     pub struct SegmentReader {
    -    segment_info: SegmentInfo,
         segment_id: SegmentId,
    +    segment_meta: SegmentMeta,
         term_infos: Arc>,
         postings_data: ReadOnlySource,
         store_reader: StoreReader,
    @@ -58,7 +55,7 @@ impl SegmentReader {
         /// Today, `tantivy` does not handle deletes, so it happens
         /// to also be the number of documents in the index.
         pub fn max_doc(&self) -> DocId {
    -        self.segment_info.max_doc
    +        self.segment_meta.max_doc()
         }
         
         /// Returns the number of documents.
    @@ -67,7 +64,7 @@ impl SegmentReader {
         /// Today, `tantivy` does not handle deletes so max doc and
         /// num_docs are the same.
         pub fn num_docs(&self) -> DocId {
    -        self.segment_info.max_doc - self.num_deleted_docs()
    +        self.segment_meta.num_docs()
         }
         
         /// Return the number of documents that have been
    @@ -130,21 +127,7 @@ impl SegmentReader {
     
         /// Open a new segment for reading.
         pub fn open(segment: Segment) -> Result {
    -        let segment_info_reader = try!(segment.open_read(SegmentComponent::INFO));
    -        let segment_info_data = try!(
    -            str::from_utf8(&*segment_info_reader)
    -                .map_err(|err| {
    -                    let segment_info_filepath = segment.relative_path(SegmentComponent::INFO);
    -                    Error::CorruptedFile(segment_info_filepath, Box::new(err))
    -                })
    -         );
    -        let segment_info: SegmentInfo = try!(
    -            json::decode(&segment_info_data)
    -            .map_err(|err| {
    -                let file_path = segment.relative_path(SegmentComponent::INFO);
    -                Error::CorruptedFile(file_path, Box::new(err))
    -            })
    -        );
    +
             let source = try!(segment.open_read(SegmentComponent::TERMS));
             let term_infos = try!(FstMap::from_source(source));
             let store_reader = StoreReader::from(try!(segment.open_read(SegmentComponent::STORE)));
    @@ -171,7 +154,7 @@ impl SegmentReader {
             
             let schema = segment.schema();
             Ok(SegmentReader {
    -            segment_info: segment_info,
    +            segment_meta: segment.meta().clone(),
                 postings_data: postings_shared_mmap,
                 term_infos: Arc::new(term_infos),
                 segment_id: segment.id(),
    diff --git a/src/indexer/merger.rs b/src/indexer/merger.rs
    index 026921111..bb0bb7ff7 100644
    --- a/src/indexer/merger.rs
    +++ b/src/indexer/merger.rs
    @@ -15,14 +15,13 @@ use fastfield::delete::DeleteBitSet;
     use schema::{Schema, Field};
     use fastfield::FastFieldSerializer;
     use store::StoreWriter;
    -use core::SegmentInfo;
     use std::cmp::{min, max};
     use common::allocate_vec;
     
     pub struct IndexMerger {
         schema: Schema,
         readers: Vec,
    -    segment_info: SegmentInfo,
    +    max_doc: u32,
     }
     
     
    @@ -82,7 +81,7 @@ fn extract_fast_field_reader(segment_reader: &SegmentReader, field: Field) -> Op
     impl IndexMerger {
         pub fn open(schema: Schema, segments: &[Segment]) -> Result {
             let mut readers = vec!();
    -        let mut max_doc = 0;
    +        let mut max_doc: u32 = 0u32;
             for segment in segments {
                 if segment.meta().num_docs() > 0 {
                     let reader = SegmentReader::open(segment.clone())?;
    @@ -93,7 +92,7 @@ impl IndexMerger {
             Ok(IndexMerger {
                 schema: schema,
                 readers: readers,
    -            segment_info: SegmentInfo { max_doc: max_doc },
    +            max_doc: max_doc,
             })
         }
     
    @@ -283,9 +282,8 @@ impl SerializableSegment for IndexMerger {
             try!(self.write_fieldnorms(serializer.get_fieldnorms_serializer()));
             try!(self.write_fast_fields(serializer.get_fast_field_serializer()));
             try!(self.write_storable_fields(serializer.get_store_writer()));
    -        try!(serializer.write_segment_info(&self.segment_info));
             try!(serializer.close());
    -        Ok(self.segment_info.max_doc)
    +        Ok(self.max_doc)
         }
     }
     
    diff --git a/src/indexer/segment_serializer.rs b/src/indexer/segment_serializer.rs
    index bfbca0faf..e84a24c33 100644
    --- a/src/indexer/segment_serializer.rs
    +++ b/src/indexer/segment_serializer.rs
    @@ -1,9 +1,6 @@
     use Result;
     
    -use std::io::Write;
    -use rustc_serialize::json;
     use core::Segment;
    -use core::SegmentInfo;
     use core::SegmentComponent;
     use fastfield::FastFieldSerializer;
     use store::StoreWriter;
    @@ -13,7 +10,6 @@ use postings::PostingsSerializer;
     /// Segment serializer is in charge of laying out on disk
     /// the data accumulated and sorted by the `SegmentWriter`.
     pub struct SegmentSerializer {
    -    segment: Segment,
         store_writer: StoreWriter,
         fast_field_serializer: FastFieldSerializer,
         fieldnorms_serializer: FastFieldSerializer,
    @@ -33,7 +29,6 @@ impl SegmentSerializer {
     
             let postings_serializer = try!(PostingsSerializer::open(segment));
             Ok(SegmentSerializer {
    -            segment: segment.clone(),
                 postings_serializer: postings_serializer,
                 store_writer: StoreWriter::new(store_write),
                 fast_field_serializer: fast_field_serializer,
    @@ -61,16 +56,6 @@ impl SegmentSerializer {
             &mut self.store_writer
         }
     
    -    /// Write the `SegmentInfo`
    -    pub fn write_segment_info(&mut self, segment_info: &SegmentInfo) -> Result<()> {
    -        let mut write = try!(self.segment.open_write(SegmentComponent::INFO));
    -        let json_data = json::encode(segment_info)
    -            .expect("Encoding to segment_info to JSON failed. This should never happen");
    -        try!(write.write_all(json_data.as_bytes()));
    -        try!(write.flush());
    -        Ok(())
    -    }
    -
         /// Finalize the segment serialization.
         pub fn close(self) -> Result<()> {
             try!(self.fast_field_serializer.close());
    diff --git a/src/indexer/segment_writer.rs b/src/indexer/segment_writer.rs
    index 82df64a84..63842f9f1 100644
    --- a/src/indexer/segment_writer.rs
    +++ b/src/indexer/segment_writer.rs
    @@ -3,7 +3,6 @@ use DocId;
     use std::io;
     use schema::Schema;	
     use schema::Term;
    -use core::SegmentInfo;
     use core::Segment;
     use core::SerializableSegment;
     use postings::PostingsWriter;
    @@ -106,14 +105,12 @@ impl<'a> SegmentWriter<'a> {
     	/// Finalize consumes the `SegmentWriter`, so that it cannot 
     	/// be used afterwards.
     	pub fn finalize(mut self) -> Result> {
    -		let segment_info = self.segment_info();
     		for per_field_postings_writer in &mut self.per_field_postings_writers {
     			per_field_postings_writer.close(self.heap);
     		}
     		write(&self.per_field_postings_writers,
     			  &self.fast_field_writers,
     			  &self.fieldnorms_writer,
    -			  segment_info,
     			  self.segment_serializer,
     			  self.heap)?;
     		Ok(self.doc_opstamps)
    @@ -183,14 +180,6 @@ impl<'a> SegmentWriter<'a> {
     		Ok(())
         }
     	
    -	/// Creates the `SegmentInfo` that will be serialized along
    -	/// with the index in JSON format.  
    - 	fn segment_info(&self,) -> SegmentInfo {
    -		SegmentInfo {
    -			max_doc: self.max_doc
    -		}
    -	}
    -	
     	
     	/// Max doc is 
     	/// - the number of documents in the segment assuming there is no deletes
    @@ -218,26 +207,25 @@ impl<'a> SegmentWriter<'a> {
     fn write<'a>(per_field_postings_writers: &[Box],
     		 fast_field_writers: &U32FastFieldsWriter,
     		 fieldnorms_writer: &U32FastFieldsWriter,
    -		 segment_info: SegmentInfo,
     	  	 mut serializer: SegmentSerializer,
    -		 heap: &'a Heap,) -> Result {
    +		 heap: &'a Heap,) -> Result<()> {
     		for per_field_postings_writer in per_field_postings_writers {
     			try!(per_field_postings_writer.serialize(serializer.get_postings_serializer(), heap));
     		}
     		try!(fast_field_writers.serialize(serializer.get_fast_field_serializer()));
     		try!(fieldnorms_writer.serialize(serializer.get_fieldnorms_serializer()));
    -		try!(serializer.write_segment_info(&segment_info));
     		try!(serializer.close());
    -		Ok(segment_info.max_doc)
    +		Ok(())
     }
     
     impl<'a> SerializableSegment for SegmentWriter<'a> {
     	fn write(&self, serializer: SegmentSerializer) -> Result {
    +		let max_doc = self.max_doc;
     		write(&self.per_field_postings_writers,
     		      &self.fast_field_writers,
     			  &self.fieldnorms_writer,
    -			  self.segment_info(),
     		      serializer,
    -			  self.heap)
    +			  self.heap)?;
    +		Ok(max_doc)
     	}
     }
    
    From 60279a03b60ab6783630aa06148b17ef7f527ec0 Mon Sep 17 00:00:00 2001
    From: Paul Masurel 
    Date: Sat, 8 Apr 2017 18:52:43 +0900
    Subject: [PATCH 106/107] RELEASE Tantivy 0.3. See Changelog
    
    ---
     CHANGELOG.md | 47 +++++++++++++++++++++++++++++++++++++++++++++++
     Cargo.toml   |  2 +-
     README.md    |  6 ++++--
     3 files changed, 52 insertions(+), 3 deletions(-)
     create mode 100644 CHANGELOG.md
    
    diff --git a/CHANGELOG.md b/CHANGELOG.md
    new file mode 100644
    index 000000000..82459e469
    --- /dev/null
    +++ b/CHANGELOG.md
    @@ -0,0 +1,47 @@
    +Tantivy 0.3
    +==========================
    +
    +
    +Special thanks to @KodrAus @lnicola @Ameobea @manuel-woelker @celaus
    +for their contribution to this release.
    +
    +Thanks also to everyone in tantivy gitter chat 
    +for their advice and company :)
    +
    +https://gitter.im/tantivy-search/tantivy
    +
    +
    +Warning:
    +
    +Tantivy 0.3 is NOT backward compatible with tantivy 0.2 
    +code and index format.
    +You should not expect backward compatibility before 
    +tantivy 1.0.
    +
    +
    +New Features
    +------------
    +
    +- Delete. You can now delete documents from an index.
    +- Support for windows (Thanks to @lnicola)
    +
    +
    +Various Bugfixes & small improvements
    +----------------------------------------
    +
    +- Added CI for Windows (https://ci.appveyor.com/project/fulmicoton/tantivy)
    +Thanks to @KodrAus ! (#108)
    +- Various dependency version updates (Thanks to @Ameobea) #76
    +- Fixed several race conditions in `Index.wait_merge_threads`
    +- Fixed #72. Mmap were never released.
    +- Fixed #80. Fast field used to take an amplitude of 32 bits after a merge. (Ouch!)
    +- Fixed #92. u32 are now encoded using big endian in the fst
    +  in order to make their enumeration consistent with
    +  the natural ordering.
    +- Building binary targets for tantivy-cli (Thanks to @KodrAus)
    +- Misc invisible bug fixes, and code cleanup.
    +- Use 
    +
    +
    +
    +
    diff --git a/Cargo.toml b/Cargo.toml
    index 85fd68405..760a51b40 100644
    --- a/Cargo.toml
    +++ b/Cargo.toml
    @@ -1,6 +1,6 @@
     [package]
     name = "tantivy"
    -version = "0.2.0"
    +version = "0.3.0"
     authors = ["Paul Masurel "]
     build = "build.rs"
     license = "MIT"
    diff --git a/README.md b/README.md
    index 75b24b428..2fcbe7a1c 100644
    --- a/README.md
    +++ b/README.md
    @@ -2,8 +2,9 @@
     
     [![Build Status](https://travis-ci.org/tantivy-search/tantivy.svg?branch=master)](https://travis-ci.org/tantivy-search/tantivy)
     [![Coverage Status](https://coveralls.io/repos/github/tantivy-search/tantivy/badge.svg?branch=master&refresh1)](https://coveralls.io/github/tantivy-search/tantivy?branch=master)
    -[![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
     [![Join the chat at https://gitter.im/tantivy-search/tantivy](https://badges.gitter.im/tantivy-search/tantivy.svg)](https://gitter.im/tantivy-search/tantivy?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge)
    +[![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
    +[![Build status](https://ci.appveyor.com/api/projects/status/r7nb13kj23u8m9pj?svg=true)](https://ci.appveyor.com/project/fulmicoton/tantivy)
     ![beacon for google analytics](https://ga-beacon.appspot.com/UA-88834340-1/tantivy/README)
     
     **Tantivy** is a **full text search engine library** written in rust.
    @@ -25,7 +26,8 @@ It is strongly inspired by Lucene's design.
     - LZ4 compressed document store
     - Cheesy logo with a horse
     
    -Tantivy supports Linux and MacOS. Windows is not supported.
    +Tantivy supports Linux, MacOS and Windows.
    +
     
     # Getting started
     
    
    From 44c684af5cc44703a56289ea3c40cd344b2509d7 Mon Sep 17 00:00:00 2001
    From: Paul Masurel 
    Date: Sat, 8 Apr 2017 19:01:31 +0900
    Subject: [PATCH 107/107] NOBUG Fixes winapi version
    
    ---
     Cargo.toml | 2 +-
     1 file changed, 1 insertion(+), 1 deletion(-)
    
    diff --git a/Cargo.toml b/Cargo.toml
    index 760a51b40..7adb1fa47 100644
    --- a/Cargo.toml
    +++ b/Cargo.toml
    @@ -39,7 +39,7 @@ futures = "0.1.9"
     futures-cpupool = "0.1.2"
     
     [target.'cfg(windows)'.dependencies]
    -winapi = "*"
    +winapi = "0.2"
     
     [dev-dependencies]
     rand = "0.3"