NOBUG Added setting merge policy.
@@ -5,6 +5,7 @@ use core::SerializableSegment;
use core::Index;
use core::Segment;
use std::thread::JoinHandle;
+use indexer::{MergePolicy, DefaultMergePolicy};
use indexer::SegmentWriter;
use super::directory_lock::DirectoryLock;
use std::clone::Clone;

@@ -15,6 +16,7 @@ use indexer::merger::IndexMerger;
use core::SegmentId;
use datastruct::stacker::Heap;
use std::mem::swap;
+use std::sync::{Arc, Mutex};
use chan;
use core::SegmentMeta;
use super::segment_updater::{SegmentUpdater, SegmentUpdate, SegmentUpdateSender};

@@ -53,6 +55,8 @@ pub struct IndexWriter {
// lifetime of the lock with that of the IndexWriter.
_directory_lock: DirectoryLock,

+_merge_policy: Arc<Mutex<Box<MergePolicy>>>,
+
index: Index,
heap_size_in_bytes_per_thread: usize,

@@ -204,12 +208,16 @@ impl IndexWriter {
let (document_sender, document_receiver): (DocumentSender, DocumentReceiver) =
chan::sync(PIPELINE_MAX_SIZE_IN_DOCS);

-let (segment_update_sender, segment_update_thread) = SegmentUpdater::start_updater(index.clone());
+let merge_policy: Arc<Mutex<Box<MergePolicy>>> = Arc::new(Mutex::new(box DefaultMergePolicy::default()));
+
+let (segment_update_sender, segment_update_thread) = SegmentUpdater::start_updater(index.clone(), merge_policy.clone());

let mut index_writer = IndexWriter {

_directory_lock: directory_lock,

+_merge_policy: merge_policy,
+
heap_size_in_bytes_per_thread: heap_size_in_bytes_per_thread,
index: index.clone(),

@@ -229,7 +237,18 @@ impl IndexWriter {
try!(index_writer.start_workers());
Ok(index_writer)
}

+
+/// Returns a clone of the index_writer merge policy.
+pub fn get_merge_policy(&self) -> Box<MergePolicy> {
+self._merge_policy.lock().unwrap().box_clone()
+}
+
+/// Set the merge policy.
+pub fn set_merge_policy(&self, merge_policy: Box<MergePolicy>) {
+*self._merge_policy.lock().unwrap() = merge_policy;
+}
+
fn start_workers(&mut self) -> Result<()> {
for _ in 0..self.num_threads {
try!(self.add_indexing_worker());

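The writer and the segment updater now share a single Arc<Mutex<Box<MergePolicy>>> handle: start_updater receives a clone of it, and set_merge_policy above swaps the boxed policy behind that mutex, so the updater thread sees the new policy on its next merge check. A minimal standalone sketch of the sharing pattern, with illustrative stand-in types (Policy, LogLikePolicy and NoMergeLikePolicy are not tantivy names):

use std::sync::{Arc, Mutex};
use std::thread;

// Stand-in for a boxed, swappable merge-policy trait object.
trait Policy: Send {
    fn name(&self) -> &'static str;
}

struct LogLikePolicy;
impl Policy for LogLikePolicy {
    fn name(&self) -> &'static str { "LogLikePolicy" }
}

struct NoMergeLikePolicy;
impl Policy for NoMergeLikePolicy {
    fn name(&self) -> &'static str { "NoMergeLikePolicy" }
}

fn main() {
    // Writer side: the shared handle starts out with a default policy installed.
    let policy: Arc<Mutex<Box<dyn Policy>>> = Arc::new(Mutex::new(Box::new(LogLikePolicy)));

    // Writer side: the equivalent of set_merge_policy replaces the boxed value in place.
    *policy.lock().unwrap() = Box::new(NoMergeLikePolicy);

    // Updater side: owns its own clone of the handle, like start_updater does,
    // and observes whatever policy is currently installed.
    let updater_handle = policy.clone();
    let updater = thread::spawn(move || updater_handle.lock().unwrap().name());
    assert_eq!(updater.join().unwrap(), "NoMergeLikePolicy");
}
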
@@ -445,6 +464,7 @@ mod tests {
use Index;
use Term;
use Error;
+use indexer::NoMergePolicy;

#[test]
fn test_lockfile_stops_duplicates() {

@@ -456,6 +476,17 @@ mod tests {
_ => panic!("Expected FileAlreadyExists error"),
}
}

+#[test]
+fn test_set_merge_policy() {
+let schema_builder = schema::SchemaBuilder::default();
+let index = Index::create_in_ram(schema_builder.build());
+let index_writer = index.writer(40_000_000).unwrap();
+assert_eq!(format!("{:?}", index_writer.get_merge_policy()), "LogMergePolicy { min_merge_size: 8, min_layer_size: 10000, level_log_size: 0.75 }");
+let merge_policy = box NoMergePolicy::default();
+index_writer.set_merge_policy(merge_policy);
+assert_eq!(format!("{:?}", index_writer.get_merge_policy()), "NoMergePolicy");
+}
+
#[test]
fn test_lockfile_released_on_drop() {

@@ -8,6 +8,10 @@ const DEFAULT_LEVEL_LOG_SIZE: f64 = 0.75;
const DEFAULT_MIN_LAYER_SIZE: u32 = 10_000;
const DEFAULT_MIN_MERGE_SIZE: usize = 8;

+
+/// LogMergePolicy tries to merge segments that have a similar number of
+/// documents.
+#[derive(Debug, Clone)]
pub struct LogMergePolicy {
min_merge_size: usize,
min_layer_size: u32,

@@ -20,7 +24,7 @@ impl LogMergePolicy {
}

/// Set the minimum number of segments that may be merged together.
pub fn set_min_merge_size(&mut self, min_merge_size: usize) {
self.min_merge_size = min_merge_size;
}

@@ -30,7 +34,6 @@ impl LogMergePolicy {
self.min_layer_size = min_layer_size;
}

-
/// Set the ratio between two consecutive levels.
///
/// Segments are grouped in levels according to their sizes.

@@ -83,6 +86,10 @@ impl MergePolicy for LogMergePolicy {

result
}
+
+fn box_clone(&self) -> Box<MergePolicy> {
+box self.clone()
+}
}

impl Default for LogMergePolicy {

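The setters above expose LogMergePolicy's knobs: segments are grouped into levels by size, and a level needs at least min_merge_size segments before a merge is suggested. A hedged sketch of tuning it and installing it on a writer, mirroring the setup of the new test_set_merge_policy test (the values 3 and 5_000 are arbitrary examples, and the paths assume the re-exports added in src/lib.rs below):

use tantivy::schema::SchemaBuilder;
use tantivy::Index;
use tantivy::merge_policy::LogMergePolicy;

fn main() {
    // Same index/writer setup as test_set_merge_policy.
    let schema_builder = SchemaBuilder::default();
    let index = Index::create_in_ram(schema_builder.build());
    let index_writer = index.writer(40_000_000).unwrap();

    let mut merge_policy = LogMergePolicy::default();
    // Suggest a merge only once at least 3 segments share a level.
    merge_policy.set_min_merge_size(3);
    // Segments below this document count are all grouped into the smallest level.
    merge_policy.set_min_layer_size(5_000);

    index_writer.set_merge_policy(Box::new(merge_policy));
}
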
@@ -1,15 +1,30 @@
use core::SegmentId;
use core::SegmentMeta;
use std::marker;
+use std::fmt::Debug;

/// Set of segments suggested for a merge.
#[derive(Debug, Clone)]
pub struct MergeCandidate(pub Vec<SegmentId>);

-pub trait MergePolicy: marker::Send {

+/// The merge policy defines which segments should be merged.
+///
+/// Every time the list of segments changes, the segment updater
+/// asks the merge policy if some segments should be merged.
+pub trait MergePolicy: marker::Send + Debug {
+/// Given the list of segment metas, returns the list of merge candidates.
+///
+/// This call happens on the segment updater thread, and will block
+/// other segment updates, so all implementations should happen rapidly.
fn compute_merge_candidates(&self, segments: &[SegmentMeta]) -> Vec<MergeCandidate>;
+/// Returns a boxed clone of the MergePolicy.
+fn box_clone(&self) -> Box<MergePolicy>;
}

/// Never merge segments.
#[derive(Debug)]
pub struct NoMergePolicy;

impl Default for NoMergePolicy {

@@ -18,10 +33,13 @@ impl Default for NoMergePolicy {
}
}

impl MergePolicy for NoMergePolicy {
fn compute_merge_candidates(&self, _segments: &[SegmentMeta]) -> Vec<MergeCandidate> {
Vec::new()
}

+fn box_clone(&self) -> Box<MergePolicy> {
+box NoMergePolicy
+}
}

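NoMergePolicy above is the smallest possible implementation of the trait. For contrast, here is a hedged sketch of a custom policy as it could be written inside the crate next to it: the MergeEverythingPolicy name and its threshold are made up for illustration, and it assumes SegmentMeta exposes a public segment_id field.

use core::SegmentId;
use core::SegmentMeta;
use indexer::{MergeCandidate, MergePolicy};

/// Illustrative policy: once more than `max_segments` segments exist,
/// suggest merging all of them into a single segment.
#[derive(Debug, Clone)]
pub struct MergeEverythingPolicy {
    max_segments: usize,
}

impl MergePolicy for MergeEverythingPolicy {
    fn compute_merge_candidates(&self, segments: &[SegmentMeta]) -> Vec<MergeCandidate> {
        if segments.len() <= self.max_segments {
            return Vec::new();
        }
        // Keep this cheap: it runs on the segment updater thread and blocks
        // other segment updates while it executes.
        let segment_ids: Vec<SegmentId> = segments
            .iter()
            .map(|meta| meta.segment_id.clone()) // assumed public field on SegmentMeta
            .collect();
        vec![MergeCandidate(segment_ids)]
    }

    fn box_clone(&self) -> Box<MergePolicy> {
        Box::new(self.clone())
    }
}
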
@@ -17,4 +17,6 @@ pub use self::log_merge_policy::LogMergePolicy;
pub use self::merge_policy::{NoMergePolicy, MergeCandidate, MergePolicy};
pub use self::segment_manager::SegmentManager;

+/// Alias for the default merge policy, which is the LogMergePolicy.
+pub type DefaultMergePolicy = LogMergePolicy;

@@ -2,12 +2,13 @@

use chan;
use core::Index;
+use std::sync::Mutex;
use core::Segment;
use core::SegmentId;
use core::SegmentMeta;
use std::mem;
use core::SerializableSegment;
-use indexer::{DefaultMergePolicy, MergePolicy};
+use indexer::MergePolicy;
use indexer::MergeCandidate;
use indexer::merger::IndexMerger;
use indexer::SegmentSerializer;

@@ -135,7 +136,7 @@ pub struct SegmentUpdater {
segment_update_receiver: SegmentUpdateReceiver,
segment_update_sender: SegmentUpdateSender,
segment_manager_arc: Arc<SegmentManager>,
-merge_policy: Box<MergePolicy>,
+merge_policy: Arc<Mutex<Box<MergePolicy>>>,
merging_thread_id: usize,
merging_threads: HashMap<usize, JoinHandle<(Vec<SegmentId>, SegmentMeta)> >,
}

@@ -143,12 +144,12 @@ pub struct SegmentUpdater {

impl SegmentUpdater {

-pub fn start_updater(index: Index) -> (SegmentUpdateSender, JoinHandle<()>) {
-let segment_updater = SegmentUpdater::new(index);
+pub fn start_updater(index: Index, merge_policy: Arc<Mutex<Box<MergePolicy>>>) -> (SegmentUpdateSender, JoinHandle<()>) {
+let segment_updater = SegmentUpdater::new(index, merge_policy);
(segment_updater.segment_update_sender.clone(), segment_updater.start())
}

-fn new(index: Index) -> SegmentUpdater {
+fn new(index: Index, merge_policy: Arc<Mutex<Box<MergePolicy>>>) -> SegmentUpdater {
let segment_manager_arc = get_segment_manager(&index);
let (segment_update_sender, segment_update_receiver): (SegmentUpdateSender, SegmentUpdateReceiver) = chan::async();
SegmentUpdater {

@@ -157,7 +158,7 @@ impl SegmentUpdater {
segment_update_sender: segment_update_sender,
segment_update_receiver: segment_update_receiver,
segment_manager_arc: segment_manager_arc,
-merge_policy: Box::new(DefaultMergePolicy::default()), // TODO make that configurable
+merge_policy: merge_policy,
merging_thread_id: 0,
merging_threads: HashMap::new(),
}

@@ -236,8 +237,9 @@ impl SegmentUpdater {
let (committed_segments, uncommitted_segments) = get_segment_ready_for_commit(segment_manager);
// Committed segments cannot be merged with uncommitted_segments.
// We therefore consider merges using these two sets of segments independently.
-let mut merge_candidates = self.merge_policy.compute_merge_candidates(&uncommitted_segments);
-let committed_merge_candidates = self.merge_policy.compute_merge_candidates(&committed_segments);
+let merge_policy_lock = self.merge_policy.lock().unwrap();
+let mut merge_candidates = merge_policy_lock.compute_merge_candidates(&uncommitted_segments);
+let committed_merge_candidates = merge_policy_lock.compute_merge_candidates(&committed_segments);
merge_candidates.extend_from_slice(&committed_merge_candidates[..]);
merge_candidates
}

src/lib.rs
@@ -112,6 +112,16 @@ pub use postings::Postings;
pub use postings::SegmentPostingsOption;

+
+/// Tantivy makes it possible to customize when
+/// the indexer should merge its segments.
+pub mod merge_policy {
+pub use indexer::MergePolicy;
+pub use indexer::LogMergePolicy;
+pub use indexer::NoMergePolicy;
+pub use indexer::DefaultMergePolicy;
+}
+
/// u32 identifying a document within a segment.
/// Documents have their doc id assigned incrementally,
/// as they are added in the segment.

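With these re-exports, the whole feature is reachable from the public API. A short hedged usage sketch, mirroring the new test_set_merge_policy test but switching merging off through the public merge_policy module:

use tantivy::schema::SchemaBuilder;
use tantivy::Index;
use tantivy::merge_policy::NoMergePolicy;

fn main() {
    let index = Index::create_in_ram(SchemaBuilder::default().build());
    let index_writer = index.writer(40_000_000).unwrap();

    // The writer starts out with DefaultMergePolicy, an alias for LogMergePolicy.
    println!("default policy: {:?}", index_writer.get_merge_policy());

    // Switch to a policy that never suggests any merge.
    index_writer.set_merge_policy(Box::new(NoMergePolicy::default()));
}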