mirror of
https://github.com/quickwit-oss/tantivy.git
synced 2026-05-28 14:10:42 +00:00
perf: various optimizations around arenas (#60)
- Use a bitset to track used buckets in the `SharedArenaHashmap`, allowing for more efficient iteration - Create a global pool for both `MemoryArena` and `IndexingContext` - Reduce the MemoryArea page size by half (it's now 512KB instead of 1MB) - Centralize thread pool instances in `SegmentUpdater` so we can elide making them if all nthread sizes are zero
This commit is contained in:
@@ -588,7 +588,7 @@ impl Index {
|
||||
num_threads: usize,
|
||||
overall_memory_budget_in_bytes: usize,
|
||||
) -> crate::Result<IndexWriter<D>> {
|
||||
let memory_arena_in_bytes_per_thread = overall_memory_budget_in_bytes / num_threads;
|
||||
let memory_arena_in_bytes_per_thread = overall_memory_budget_in_bytes / num_threads.max(1);
|
||||
let options = IndexWriterOptions::builder()
|
||||
.num_worker_threads(num_threads)
|
||||
.memory_budget_per_thread(memory_arena_in_bytes_per_thread)
|
||||
|
||||
@@ -5,8 +5,9 @@ use std::io::Write;
|
||||
use std::ops::Deref;
|
||||
use std::path::PathBuf;
|
||||
use std::sync::atomic::{AtomicBool, Ordering};
|
||||
use std::sync::{Arc, RwLock};
|
||||
use std::sync::Arc;
|
||||
|
||||
use parking_lot::RwLock;
|
||||
use rayon::{ThreadPool, ThreadPoolBuilder};
|
||||
|
||||
use super::segment_manager::SegmentManager;
|
||||
@@ -317,6 +318,12 @@ pub fn merge_filtered_segments<T: Into<Box<dyn Directory>>>(
|
||||
Ok(merged_index)
|
||||
}
|
||||
|
||||
struct Pools {
|
||||
pool: ThreadPool,
|
||||
merge_thread_pool: ThreadPool,
|
||||
merge_errors: Arc<RwLock<Vec<TantivyError>>>,
|
||||
}
|
||||
|
||||
pub(crate) struct InnerSegmentUpdater {
|
||||
// we keep a copy of the current active IndexMeta to
|
||||
// avoid loading the file every time we need it in the
|
||||
@@ -325,10 +332,7 @@ pub(crate) struct InnerSegmentUpdater {
|
||||
// This should be up to date as all update happen through
|
||||
// the unique active `SegmentUpdater`.
|
||||
active_index_meta: RwLock<Arc<IndexMeta>>,
|
||||
pool: ThreadPool,
|
||||
merge_thread_pool: ThreadPool,
|
||||
merge_errors: Arc<RwLock<Vec<TantivyError>>>,
|
||||
|
||||
pools: Option<Pools>,
|
||||
index: Index,
|
||||
segment_manager: SegmentManager,
|
||||
merge_policy: RwLock<Arc<dyn MergePolicy>>,
|
||||
@@ -348,40 +352,56 @@ impl SegmentUpdater {
|
||||
) -> crate::Result<SegmentUpdater> {
|
||||
let segments = index.searchable_segment_metas()?;
|
||||
let segment_manager = SegmentManager::from_segments(segments, delete_cursor);
|
||||
let mut builder = ThreadPoolBuilder::new()
|
||||
.thread_name(|_| "segment_updater".to_string())
|
||||
.num_threads(1);
|
||||
|
||||
if let Some(panic_handler) = panic_handler.as_ref() {
|
||||
let panic_handler = panic_handler.clone();
|
||||
builder = builder.panic_handler(move |any| {
|
||||
panic_handler(any);
|
||||
});
|
||||
}
|
||||
|
||||
let pool = builder.build().map_err(|_| {
|
||||
crate::TantivyError::SystemError("Failed to spawn segment updater thread".to_string())
|
||||
})?;
|
||||
let mut builder = ThreadPoolBuilder::new()
|
||||
.thread_name(|i| format!("merge_thread_{i}"))
|
||||
.num_threads(num_merge_threads);
|
||||
if let Some(panic_handler) = panic_handler {
|
||||
let panic_handler = panic_handler.clone();
|
||||
builder = builder.panic_handler(move |any| {
|
||||
panic_handler(any);
|
||||
});
|
||||
}
|
||||
|
||||
let merge_thread_pool = builder.build().map_err(|_| {
|
||||
crate::TantivyError::SystemError("Failed to spawn segment merging thread".to_string())
|
||||
})?;
|
||||
let index_meta = index.load_metas()?;
|
||||
Ok(SegmentUpdater {
|
||||
inner: Arc::new(InnerSegmentUpdater {
|
||||
active_index_meta: RwLock::new(Arc::new(index_meta)),
|
||||
pool,
|
||||
merge_thread_pool,
|
||||
merge_errors: Default::default(),
|
||||
pools: (num_merge_threads > 0).then(|| {
|
||||
let mut builder = ThreadPoolBuilder::new()
|
||||
.thread_name(|_| "segment_updater".to_string())
|
||||
.num_threads(1);
|
||||
|
||||
if let Some(panic_handler) = panic_handler.as_ref() {
|
||||
let panic_handler = panic_handler.clone();
|
||||
builder = builder.panic_handler(move |any| {
|
||||
panic_handler(any);
|
||||
});
|
||||
}
|
||||
|
||||
let pool = builder
|
||||
.build()
|
||||
.map_err(|_| {
|
||||
crate::TantivyError::SystemError(
|
||||
"Failed to spawn segment updater thread".to_string(),
|
||||
)
|
||||
})
|
||||
.unwrap();
|
||||
|
||||
let mut builder = ThreadPoolBuilder::new()
|
||||
.thread_name(|i| format!("merge_thread_{i}"))
|
||||
.num_threads(num_merge_threads);
|
||||
if let Some(panic_handler) = panic_handler {
|
||||
let panic_handler = panic_handler.clone();
|
||||
builder = builder.panic_handler(move |any| {
|
||||
panic_handler(any);
|
||||
});
|
||||
}
|
||||
let merge_thread_pool = builder
|
||||
.build()
|
||||
.map_err(|_| {
|
||||
crate::TantivyError::SystemError(
|
||||
"Failed to spawn segment merging thread".to_string(),
|
||||
)
|
||||
})
|
||||
.unwrap();
|
||||
|
||||
Pools {
|
||||
pool,
|
||||
merge_thread_pool,
|
||||
merge_errors: Default::default(),
|
||||
}
|
||||
}),
|
||||
index,
|
||||
segment_manager,
|
||||
merge_policy: RwLock::new(Arc::new(DefaultMergePolicy::default())),
|
||||
@@ -394,12 +414,12 @@ impl SegmentUpdater {
|
||||
}
|
||||
|
||||
pub fn get_merge_policy(&self) -> Arc<dyn MergePolicy> {
|
||||
self.merge_policy.read().unwrap().clone()
|
||||
self.merge_policy.read().clone()
|
||||
}
|
||||
|
||||
pub fn set_merge_policy(&self, merge_policy: Box<dyn MergePolicy>) {
|
||||
let arc_merge_policy = Arc::from(merge_policy);
|
||||
*self.merge_policy.write().unwrap() = arc_merge_policy;
|
||||
*self.merge_policy.write() = arc_merge_policy;
|
||||
}
|
||||
|
||||
fn schedule_task<T: 'static + Send, F: FnOnce() -> crate::Result<T> + 'static + Send>(
|
||||
@@ -412,10 +432,14 @@ impl SegmentUpdater {
|
||||
let (scheduled_result, sender) = FutureResult::create(
|
||||
"A segment_updater future did not succeed. This should never happen.",
|
||||
);
|
||||
self.pool.spawn(|| {
|
||||
let task_result = task();
|
||||
let _ = sender.send(task_result);
|
||||
});
|
||||
self.pools
|
||||
.as_ref()
|
||||
.expect("thread pools should have been configured")
|
||||
.pool
|
||||
.spawn(|| {
|
||||
let task_result = task();
|
||||
let _ = sender.send(task_result);
|
||||
});
|
||||
scheduled_result
|
||||
}
|
||||
|
||||
@@ -538,11 +562,11 @@ impl SegmentUpdater {
|
||||
}
|
||||
|
||||
fn store_meta(&self, index_meta: &IndexMeta) {
|
||||
*self.active_index_meta.write().unwrap() = Arc::new(index_meta.clone());
|
||||
*self.active_index_meta.write() = Arc::new(index_meta.clone());
|
||||
}
|
||||
|
||||
fn load_meta(&self) -> Arc<IndexMeta> {
|
||||
self.active_index_meta.read().unwrap().clone()
|
||||
self.active_index_meta.read().clone()
|
||||
}
|
||||
|
||||
pub(crate) fn make_merge_operation(
|
||||
@@ -605,38 +629,48 @@ impl SegmentUpdater {
|
||||
FutureResult::create("Merge operation failed.");
|
||||
|
||||
let cancel = self.cancel.box_clone();
|
||||
let merge_errors = self.merge_errors.clone();
|
||||
self.merge_thread_pool.spawn(move || {
|
||||
// The fact that `merge_operation` is moved here is important.
|
||||
// Its lifetime is used to track how many merging thread are currently running,
|
||||
// as well as which segment is currently in merge and therefore should not be
|
||||
// candidate for another merge.
|
||||
match merge(
|
||||
&segment_updater.index,
|
||||
segment_entries,
|
||||
merge_operation.target_opstamp(),
|
||||
cancel,
|
||||
false,
|
||||
) {
|
||||
Ok(after_merge_segment_entry) => {
|
||||
let res = segment_updater.end_merge(merge_operation, after_merge_segment_entry);
|
||||
let _send_result = merging_future_send.send(res);
|
||||
}
|
||||
Err(merge_error) => {
|
||||
warn!(
|
||||
"Merge of {:?} was cancelled: {:?}",
|
||||
merge_operation.segment_ids().to_vec(),
|
||||
merge_error
|
||||
);
|
||||
if cfg!(test) {
|
||||
panic!("{merge_error:?}");
|
||||
let merge_errors = self
|
||||
.pools
|
||||
.as_ref()
|
||||
.expect("thread pools should have been configured")
|
||||
.merge_errors
|
||||
.clone();
|
||||
self.pools
|
||||
.as_ref()
|
||||
.expect("thread pools should have been configured")
|
||||
.merge_thread_pool
|
||||
.spawn(move || {
|
||||
// The fact that `merge_operation` is moved here is important.
|
||||
// Its lifetime is used to track how many merging thread are currently running,
|
||||
// as well as which segment is currently in merge and therefore should not be
|
||||
// candidate for another merge.
|
||||
match merge(
|
||||
&segment_updater.index,
|
||||
segment_entries,
|
||||
merge_operation.target_opstamp(),
|
||||
cancel,
|
||||
false,
|
||||
) {
|
||||
Ok(after_merge_segment_entry) => {
|
||||
let res =
|
||||
segment_updater.end_merge(merge_operation, after_merge_segment_entry);
|
||||
let _send_result = merging_future_send.send(res);
|
||||
}
|
||||
Err(merge_error) => {
|
||||
warn!(
|
||||
"Merge of {:?} was cancelled: {:?}",
|
||||
merge_operation.segment_ids().to_vec(),
|
||||
merge_error
|
||||
);
|
||||
if cfg!(test) {
|
||||
panic!("{merge_error:?}");
|
||||
}
|
||||
|
||||
merge_errors.write().unwrap().push(merge_error.clone());
|
||||
let _send_result = merging_future_send.send(Err(merge_error));
|
||||
merge_errors.write().push(merge_error.clone());
|
||||
let _send_result = merging_future_send.send(Err(merge_error));
|
||||
}
|
||||
}
|
||||
}
|
||||
});
|
||||
});
|
||||
|
||||
scheduled_result
|
||||
}
|
||||
@@ -679,7 +713,11 @@ impl SegmentUpdater {
|
||||
}
|
||||
|
||||
pub(crate) fn get_merge_errors(&self) -> Vec<TantivyError> {
|
||||
self.merge_errors.read().unwrap().clone()
|
||||
if let Some(pools) = self.pools.as_ref() {
|
||||
pools.merge_errors.read().clone()
|
||||
} else {
|
||||
Vec::new()
|
||||
}
|
||||
}
|
||||
|
||||
fn consider_merge_options(&self) {
|
||||
|
||||
@@ -1,7 +1,13 @@
|
||||
use std::cell::RefCell;
|
||||
|
||||
use stacker::{ArenaHashMap, MemoryArena};
|
||||
|
||||
use crate::indexer::path_to_unordered_id::PathToUnorderedId;
|
||||
|
||||
thread_local! {
|
||||
static CONTEXT_POOL: RefCell<Vec<IndexingContext>> = RefCell::new(Vec::new());
|
||||
}
|
||||
|
||||
/// IndexingContext contains all of the transient memory arenas
|
||||
/// required for building the inverted index.
|
||||
pub(crate) struct IndexingContext {
|
||||
@@ -13,9 +19,27 @@ pub(crate) struct IndexingContext {
|
||||
pub path_to_unordered_id: PathToUnorderedId,
|
||||
}
|
||||
|
||||
impl Default for IndexingContext {
|
||||
fn default() -> Self {
|
||||
Self::create(1)
|
||||
}
|
||||
}
|
||||
|
||||
impl IndexingContext {
|
||||
/// Create a new IndexingContext given the size of the term hash map.
|
||||
/// Gets an IndexingContext from the pool or creates a new one
|
||||
pub(crate) fn new(table_size: usize) -> IndexingContext {
|
||||
CONTEXT_POOL
|
||||
.with(|pool| pool.borrow_mut().pop())
|
||||
.unwrap_or_else(|| Self::create(table_size))
|
||||
}
|
||||
|
||||
/// Returns the memory usage for the inverted index memory arenas, in bytes.
|
||||
pub(crate) fn mem_usage(&self) -> usize {
|
||||
self.term_index.mem_usage() + self.arena.mem_usage()
|
||||
}
|
||||
|
||||
/// Create a new IndexingContext given the size of the term hash map.
|
||||
fn create(table_size: usize) -> IndexingContext {
|
||||
let term_index = ArenaHashMap::with_capacity(table_size);
|
||||
IndexingContext {
|
||||
arena: MemoryArena::default(),
|
||||
@@ -24,8 +48,12 @@ impl IndexingContext {
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns the memory usage for the inverted index memory arenas, in bytes.
|
||||
pub(crate) fn mem_usage(&self) -> usize {
|
||||
self.term_index.mem_usage() + self.arena.mem_usage()
|
||||
pub fn checkin(mut ctx: IndexingContext) {
|
||||
CONTEXT_POOL.with(|pool| {
|
||||
ctx.term_index.reset();
|
||||
ctx.arena.reset();
|
||||
ctx.path_to_unordered_id = PathToUnorderedId::default();
|
||||
pool.borrow_mut().push(ctx);
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
@@ -91,6 +91,8 @@ pub(crate) fn serialize_postings(
|
||||
field_serializer.close()?;
|
||||
}
|
||||
|
||||
IndexingContext::checkin(ctx);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user