1mod handle_alter;
18mod handle_apply_staging;
19mod handle_bulk_insert;
20mod handle_catchup;
21mod handle_close;
22mod handle_compaction;
23mod handle_copy_region;
24mod handle_create;
25mod handle_drop;
26mod handle_enter_staging;
27mod handle_flush;
28mod handle_manifest;
29mod handle_open;
30mod handle_rebuild_index;
31mod handle_remap;
32mod handle_truncate;
33mod handle_write;
34
35use std::collections::HashMap;
36use std::path::Path;
37use std::sync::Arc;
38use std::sync::atomic::{AtomicBool, Ordering};
39use std::time::Duration;
40
41use common_base::Plugins;
42use common_error::ext::BoxedError;
43use common_meta::key::SchemaMetadataManagerRef;
44use common_runtime::JoinHandle;
45use common_stat::get_total_memory_bytes;
46use common_telemetry::{error, info, warn};
47use futures::future::try_join_all;
48use object_store::manager::ObjectStoreManagerRef;
49use prometheus::{Histogram, IntGauge};
50use rand::{Rng, rng};
51use snafu::{ResultExt, ensure};
52use store_api::logstore::LogStore;
53use store_api::region_engine::{
54 SetRegionRoleStateResponse, SetRegionRoleStateSuccess, SettableRegionRoleState,
55};
56use store_api::storage::{FileId, RegionId};
57use tokio::sync::mpsc::{Receiver, Sender};
58use tokio::sync::{Mutex, Semaphore, mpsc, oneshot, watch};
59
60use crate::cache::write_cache::{WriteCache, WriteCacheRef};
61use crate::cache::{CacheManager, CacheManagerRef};
62use crate::compaction::CompactionScheduler;
63use crate::compaction::memory_manager::{CompactionMemoryManager, new_compaction_memory_manager};
64use crate::config::MitoConfig;
65use crate::error::{self, CreateDirSnafu, JoinSnafu, Result, WorkerStoppedSnafu};
66use crate::flush::{FlushScheduler, WriteBufferManagerImpl, WriteBufferManagerRef};
67use crate::gc::{GcLimiter, GcLimiterRef};
68use crate::memtable::MemtableBuilderProvider;
69use crate::metrics::{REGION_COUNT, REQUEST_WAIT_TIME, WRITE_STALLING};
70use crate::region::opener::PartitionExprFetcherRef;
71use crate::region::{
72 CatchupRegions, CatchupRegionsRef, MitoRegionRef, OpeningRegions, OpeningRegionsRef, RegionMap,
73 RegionMapRef,
74};
75use crate::request::{
76 BackgroundNotify, BulkInsertRequest, DdlRequest, SenderBulkRequest, SenderDdlRequest,
77 SenderWriteRequest, WorkerRequest, WorkerRequestWithTime,
78};
79use crate::schedule::scheduler::{LocalScheduler, SchedulerRef};
80use crate::sst::file::RegionFileId;
81use crate::sst::file_ref::FileReferenceManagerRef;
82use crate::sst::index::IndexBuildScheduler;
83use crate::sst::index::intermediate::IntermediateManager;
84use crate::sst::index::puffin_manager::PuffinManagerFactory;
85use crate::time_provider::{StdTimeProvider, TimeProviderRef};
86use crate::wal::Wal;
87use crate::worker::handle_manifest::RegionEditQueues;
88
/// Identifier of a region worker; workers are numbered from 0 when the group starts.
pub(crate) type WorkerId = u32;

/// File name of the marker associated with regions that are being dropped.
// NOTE(review): where the marker is written and removed is not visible in this part of the
// file — confirm against the drop handler.
pub(crate) const DROPPING_MARKER_FILE: &str = ".dropping";

/// Interval between two rounds of periodical tasks in a worker loop.
pub(crate) const CHECK_REGION_INTERVAL: Duration = Duration::from_secs(60);
/// Exclusive upper bound, in seconds, of the random delay returned by
/// [`worker_init_check_delay`].
pub(crate) const MAX_INITIAL_CHECK_DELAY_SECS: u64 = 60 * 3;
98
/// A group of region workers plus the background resources they share.
///
/// A region id always maps to the same worker: [`WorkerGroup::worker`] derives the worker
/// index from the table id and the region number of the region.
#[cfg_attr(doc, aquamarine::aquamarine)]
pub(crate) struct WorkerGroup {
    /// Workers of the group.
    workers: Vec<RegionWorker>,
    /// Scheduler for flush jobs, shared by all workers.
    flush_job_pool: SchedulerRef,
    /// Scheduler for compaction jobs, shared by all workers.
    compact_job_pool: SchedulerRef,
    /// Scheduler for index build jobs, shared by all workers.
    index_build_job_pool: SchedulerRef,
    /// Scheduler for purge jobs, shared by all workers.
    purge_scheduler: SchedulerRef,
    /// Cache manager shared by all workers.
    cache_manager: CacheManagerRef,
    /// File reference manager shared by all workers.
    file_ref_manager: FileReferenceManagerRef,
    /// Limiter built from `config.gc.max_concurrent_gc_job`.
    gc_limiter: GcLimiterRef,
    /// Object store manager shared by all workers.
    object_store_manager: ObjectStoreManagerRef,
    /// Puffin manager factory shared by all workers.
    puffin_manager_factory: PuffinManagerFactory,
    /// Intermediate manager shared by all workers.
    intermediate_manager: IntermediateManager,
    /// Schema metadata manager shared by all workers.
    schema_metadata_manager: SchemaMetadataManagerRef,
}
162
163impl WorkerGroup {
164 pub(crate) async fn start<S: LogStore>(
168 config: Arc<MitoConfig>,
169 log_store: Arc<S>,
170 object_store_manager: ObjectStoreManagerRef,
171 schema_metadata_manager: SchemaMetadataManagerRef,
172 file_ref_manager: FileReferenceManagerRef,
173 partition_expr_fetcher: PartitionExprFetcherRef,
174 plugins: Plugins,
175 ) -> Result<WorkerGroup> {
176 let (flush_sender, flush_receiver) = watch::channel(());
177 let write_buffer_manager = Arc::new(
178 WriteBufferManagerImpl::new(config.global_write_buffer_size.as_bytes() as usize)
179 .with_notifier(flush_sender.clone()),
180 );
181 let puffin_manager_factory = PuffinManagerFactory::new(
182 &config.index.aux_path,
183 config.index.staging_size.as_bytes(),
184 Some(config.index.write_buffer_size.as_bytes() as _),
185 config.index.staging_ttl,
186 )
187 .await?;
188 let intermediate_manager = IntermediateManager::init_fs(&config.index.aux_path)
189 .await?
190 .with_buffer_size(Some(config.index.write_buffer_size.as_bytes() as _));
191 let index_build_job_pool =
192 Arc::new(LocalScheduler::new(config.max_background_index_builds));
193 let flush_job_pool = Arc::new(LocalScheduler::new(config.max_background_flushes));
194 let compact_job_pool = Arc::new(LocalScheduler::new(config.max_background_compactions));
195 let flush_semaphore = Arc::new(Semaphore::new(config.max_background_flushes));
196 let purge_scheduler = Arc::new(LocalScheduler::new(config.max_background_purges));
198 let write_cache = write_cache_from_config(
199 &config,
200 puffin_manager_factory.clone(),
201 intermediate_manager.clone(),
202 )
203 .await?;
204 let cache_manager = Arc::new(
205 CacheManager::builder()
206 .sst_meta_cache_size(config.sst_meta_cache_size.as_bytes())
207 .vector_cache_size(config.vector_cache_size.as_bytes())
208 .page_cache_size(config.page_cache_size.as_bytes())
209 .selector_result_cache_size(config.selector_result_cache_size.as_bytes())
210 .range_result_cache_size(config.range_result_cache_size.as_bytes())
211 .prefilter_result_cache_size(config.prefilter_result_cache_size.as_bytes())
212 .index_metadata_size(config.index.metadata_cache_size.as_bytes())
213 .index_content_size(config.index.content_cache_size.as_bytes())
214 .index_content_page_size(config.index.content_cache_page_size.as_bytes())
215 .index_result_cache_size(config.index.result_cache_size.as_bytes())
216 .puffin_metadata_size(config.index.metadata_cache_size.as_bytes())
217 .write_cache(write_cache)
218 .build(),
219 );
220 let time_provider = Arc::new(StdTimeProvider);
221 let total_memory = get_total_memory_bytes();
222 let total_memory = if total_memory > 0 {
223 total_memory as u64
224 } else {
225 0
226 };
227 let compaction_limit_bytes = config
228 .experimental_compaction_memory_limit
229 .resolve(total_memory);
230 let compaction_memory_manager =
231 Arc::new(new_compaction_memory_manager(compaction_limit_bytes));
232 let gc_limiter = Arc::new(GcLimiter::new(config.gc.max_concurrent_gc_job));
233
234 let workers = (0..config.num_workers)
235 .map(|id| {
236 WorkerStarter {
237 id: id as WorkerId,
238 config: config.clone(),
239 log_store: log_store.clone(),
240 object_store_manager: object_store_manager.clone(),
241 write_buffer_manager: write_buffer_manager.clone(),
242 index_build_job_pool: index_build_job_pool.clone(),
243 flush_job_pool: flush_job_pool.clone(),
244 compact_job_pool: compact_job_pool.clone(),
245 purge_scheduler: purge_scheduler.clone(),
246 listener: WorkerListener::default(),
247 cache_manager: cache_manager.clone(),
248 compaction_memory_manager: compaction_memory_manager.clone(),
249 puffin_manager_factory: puffin_manager_factory.clone(),
250 intermediate_manager: intermediate_manager.clone(),
251 time_provider: time_provider.clone(),
252 flush_sender: flush_sender.clone(),
253 flush_receiver: flush_receiver.clone(),
254 plugins: plugins.clone(),
255 schema_metadata_manager: schema_metadata_manager.clone(),
256 file_ref_manager: file_ref_manager.clone(),
257 partition_expr_fetcher: partition_expr_fetcher.clone(),
258 flush_semaphore: flush_semaphore.clone(),
259 }
260 .start()
261 })
262 .collect::<Result<Vec<_>>>()?;
263
264 Ok(WorkerGroup {
265 workers,
266 flush_job_pool,
267 compact_job_pool,
268 index_build_job_pool,
269 purge_scheduler,
270 cache_manager,
271 file_ref_manager,
272 gc_limiter,
273 object_store_manager,
274 puffin_manager_factory,
275 intermediate_manager,
276 schema_metadata_manager,
277 })
278 }
279
280 pub(crate) async fn stop(&self) -> Result<()> {
282 info!("Stop region worker group");
283
284 self.compact_job_pool.stop(true).await?;
287 self.flush_job_pool.stop(true).await?;
289 self.purge_scheduler.stop(true).await?;
291 self.index_build_job_pool.stop(true).await?;
293
294 try_join_all(self.workers.iter().map(|worker| worker.stop())).await?;
295
296 Ok(())
297 }
298
299 pub(crate) async fn submit_to_worker(
301 &self,
302 region_id: RegionId,
303 request: WorkerRequest,
304 ) -> Result<()> {
305 self.worker(region_id).submit_request(request).await
306 }
307
308 pub(crate) fn is_region_exists(&self, region_id: RegionId) -> bool {
310 self.worker(region_id).is_region_exists(region_id)
311 }
312
313 pub(crate) fn is_region_opening(&self, region_id: RegionId) -> bool {
315 self.worker(region_id).is_region_opening(region_id)
316 }
317
318 pub(crate) fn is_region_catching_up(&self, region_id: RegionId) -> bool {
320 self.worker(region_id).is_region_catching_up(region_id)
321 }
322
323 pub(crate) fn get_region(&self, region_id: RegionId) -> Option<MitoRegionRef> {
327 self.worker(region_id).get_region(region_id)
328 }
329
330 pub(crate) fn cache_manager(&self) -> CacheManagerRef {
332 self.cache_manager.clone()
333 }
334
335 pub(crate) fn file_ref_manager(&self) -> FileReferenceManagerRef {
336 self.file_ref_manager.clone()
337 }
338
339 pub(crate) fn gc_limiter(&self) -> GcLimiterRef {
340 self.gc_limiter.clone()
341 }
342
343 pub(crate) fn worker(&self, region_id: RegionId) -> &RegionWorker {
345 let index = region_id_to_index(region_id, self.workers.len());
346
347 &self.workers[index]
348 }
349
350 pub(crate) fn all_regions(&self) -> impl Iterator<Item = MitoRegionRef> + use<'_> {
351 self.workers
352 .iter()
353 .flat_map(|worker| worker.regions.list_regions())
354 }
355
356 pub(crate) fn object_store_manager(&self) -> &ObjectStoreManagerRef {
357 &self.object_store_manager
358 }
359
360 pub(crate) fn puffin_manager_factory(&self) -> &PuffinManagerFactory {
361 &self.puffin_manager_factory
362 }
363
364 pub(crate) fn intermediate_manager(&self) -> &IntermediateManager {
365 &self.intermediate_manager
366 }
367
368 pub(crate) fn schema_metadata_manager(&self) -> &SchemaMetadataManagerRef {
369 &self.schema_metadata_manager
370 }
371}
372
// Test-only helpers.
#[cfg(any(test, feature = "test"))]
impl WorkerGroup {
    /// Starts a worker group for tests.
    ///
    /// Differs from [`WorkerGroup::start`] in that the caller may inject a write buffer
    /// manager, an event listener and a time provider, and that workers get empty plugins.
    /// When `write_buffer_manager` is `None`, one is built from
    /// `config.global_write_buffer_size`.
    ///
    /// # Errors
    ///
    /// Fails if the puffin manager factory, the intermediate manager or the write cache
    /// cannot be created, or if any worker fails to start.
    #[allow(clippy::too_many_arguments)]
    pub(crate) async fn start_for_test<S: LogStore>(
        config: Arc<MitoConfig>,
        log_store: Arc<S>,
        object_store_manager: ObjectStoreManagerRef,
        write_buffer_manager: Option<WriteBufferManagerRef>,
        listener: Option<crate::engine::listener::EventListenerRef>,
        schema_metadata_manager: SchemaMetadataManagerRef,
        file_ref_manager: FileReferenceManagerRef,
        time_provider: TimeProviderRef,
        partition_expr_fetcher: PartitionExprFetcherRef,
    ) -> Result<WorkerGroup> {
        let (flush_sender, flush_receiver) = watch::channel(());
        let write_buffer_manager = write_buffer_manager.unwrap_or_else(|| {
            Arc::new(
                WriteBufferManagerImpl::new(config.global_write_buffer_size.as_bytes() as usize)
                    .with_notifier(flush_sender.clone()),
            )
        });
        let index_build_job_pool =
            Arc::new(LocalScheduler::new(config.max_background_index_builds));
        let flush_job_pool = Arc::new(LocalScheduler::new(config.max_background_flushes));
        let compact_job_pool = Arc::new(LocalScheduler::new(config.max_background_compactions));
        let flush_semaphore = Arc::new(Semaphore::new(config.max_background_flushes));
        // Sized from `max_background_purges`, the same setting `WorkerGroup::start` uses, so
        // tests exercise the purge scheduler with the production sizing rule.
        let purge_scheduler = Arc::new(LocalScheduler::new(config.max_background_purges));
        let puffin_manager_factory = PuffinManagerFactory::new(
            &config.index.aux_path,
            config.index.staging_size.as_bytes(),
            Some(config.index.write_buffer_size.as_bytes() as _),
            config.index.staging_ttl,
        )
        .await?;
        let intermediate_manager = IntermediateManager::init_fs(&config.index.aux_path)
            .await?
            .with_buffer_size(Some(config.index.write_buffer_size.as_bytes() as _));
        let write_cache = write_cache_from_config(
            &config,
            puffin_manager_factory.clone(),
            intermediate_manager.clone(),
        )
        .await?;
        // Unlike `start`, the index-related cache sizes are left at the builder defaults.
        let cache_manager = Arc::new(
            CacheManager::builder()
                .sst_meta_cache_size(config.sst_meta_cache_size.as_bytes())
                .vector_cache_size(config.vector_cache_size.as_bytes())
                .page_cache_size(config.page_cache_size.as_bytes())
                .selector_result_cache_size(config.selector_result_cache_size.as_bytes())
                .range_result_cache_size(config.range_result_cache_size.as_bytes())
                .prefilter_result_cache_size(config.prefilter_result_cache_size.as_bytes())
                .write_cache(write_cache)
                .build(),
        );
        let total_memory = get_total_memory_bytes();
        // A non-positive total memory is treated as 0 when resolving the limit.
        let total_memory = if total_memory > 0 {
            total_memory as u64
        } else {
            0
        };
        let compaction_limit_bytes = config
            .experimental_compaction_memory_limit
            .resolve(total_memory);
        let compaction_memory_manager =
            Arc::new(new_compaction_memory_manager(compaction_limit_bytes));
        let gc_limiter = Arc::new(GcLimiter::new(config.gc.max_concurrent_gc_job));
        let workers = (0..config.num_workers)
            .map(|id| {
                WorkerStarter {
                    id: id as WorkerId,
                    config: config.clone(),
                    log_store: log_store.clone(),
                    object_store_manager: object_store_manager.clone(),
                    write_buffer_manager: write_buffer_manager.clone(),
                    index_build_job_pool: index_build_job_pool.clone(),
                    flush_job_pool: flush_job_pool.clone(),
                    compact_job_pool: compact_job_pool.clone(),
                    purge_scheduler: purge_scheduler.clone(),
                    listener: WorkerListener::new(listener.clone()),
                    cache_manager: cache_manager.clone(),
                    compaction_memory_manager: compaction_memory_manager.clone(),
                    puffin_manager_factory: puffin_manager_factory.clone(),
                    intermediate_manager: intermediate_manager.clone(),
                    time_provider: time_provider.clone(),
                    flush_sender: flush_sender.clone(),
                    flush_receiver: flush_receiver.clone(),
                    plugins: Plugins::new(),
                    schema_metadata_manager: schema_metadata_manager.clone(),
                    file_ref_manager: file_ref_manager.clone(),
                    partition_expr_fetcher: partition_expr_fetcher.clone(),
                    flush_semaphore: flush_semaphore.clone(),
                }
                .start()
            })
            .collect::<Result<Vec<_>>>()?;

        Ok(WorkerGroup {
            workers,
            flush_job_pool,
            compact_job_pool,
            index_build_job_pool,
            purge_scheduler,
            cache_manager,
            file_ref_manager,
            gc_limiter,
            object_store_manager,
            puffin_manager_factory,
            intermediate_manager,
            schema_metadata_manager,
        })
    }

    /// Returns the purge scheduler.
    pub(crate) fn purge_scheduler(&self) -> &SchedulerRef {
        &self.purge_scheduler
    }
}
494
495fn region_id_to_index(id: RegionId, num_workers: usize) -> usize {
496 ((id.table_id() as usize % num_workers) + (id.region_number() as usize % num_workers))
497 % num_workers
498}
499
500pub async fn write_cache_from_config(
501 config: &MitoConfig,
502 puffin_manager_factory: PuffinManagerFactory,
503 intermediate_manager: IntermediateManager,
504) -> Result<Option<WriteCacheRef>> {
505 if !config.enable_write_cache {
506 return Ok(None);
507 }
508
509 tokio::fs::create_dir_all(Path::new(&config.write_cache_path))
510 .await
511 .context(CreateDirSnafu {
512 dir: &config.write_cache_path,
513 })?;
514
515 let cache = WriteCache::new_fs(
516 &config.write_cache_path,
517 config.write_cache_size,
518 config.write_cache_ttl,
519 Some(config.index_cache_percent),
520 config.enable_refill_cache_on_read,
521 puffin_manager_factory,
522 intermediate_manager,
523 config.manifest_cache_size,
524 )
525 .await?;
526 Ok(Some(Arc::new(cache)))
527}
528
529pub(crate) fn worker_init_check_delay() -> Duration {
531 let init_check_delay = rng().random_range(0..MAX_INITIAL_CHECK_DELAY_SECS);
532 Duration::from_secs(init_check_delay)
533}
534
/// Everything needed to start one region worker; consumed by [`WorkerStarter::start`].
struct WorkerStarter<S> {
    /// Id assigned to the worker.
    id: WorkerId,
    /// Engine configuration.
    config: Arc<MitoConfig>,
    /// Log store wrapped into the worker's WAL.
    log_store: Arc<S>,
    /// Object store manager handed to the worker loop.
    object_store_manager: ObjectStoreManagerRef,
    /// Write buffer manager, also given to the memtable builder provider.
    write_buffer_manager: WriteBufferManagerRef,
    /// Job pool backing the compaction scheduler.
    compact_job_pool: SchedulerRef,
    /// Job pool backing the index build scheduler.
    index_build_job_pool: SchedulerRef,
    /// Job pool backing the flush scheduler.
    flush_job_pool: SchedulerRef,
    /// Scheduler for purge jobs.
    purge_scheduler: SchedulerRef,
    /// Listener handed to the worker loop and the compaction scheduler.
    listener: WorkerListener,
    /// Cache manager handed to the worker loop and the compaction scheduler.
    cache_manager: CacheManagerRef,
    /// Memory manager handed to the compaction scheduler.
    compaction_memory_manager: Arc<CompactionMemoryManager>,
    /// Puffin manager factory handed to the worker loop.
    puffin_manager_factory: PuffinManagerFactory,
    /// Intermediate manager handed to the worker loop.
    intermediate_manager: IntermediateManager,
    /// Clock used for the worker's periodical checks.
    time_provider: TimeProviderRef,
    /// Sending half of the flush notification channel.
    flush_sender: watch::Sender<()>,
    /// Receiving half of the flush notification channel.
    flush_receiver: watch::Receiver<()>,
    /// Plugins handed to the compaction scheduler.
    plugins: Plugins,
    /// Schema metadata manager handed to the worker loop.
    schema_metadata_manager: SchemaMetadataManagerRef,
    /// File reference manager handed to the worker loop.
    file_ref_manager: FileReferenceManagerRef,
    /// Partition expression fetcher handed to the worker loop.
    partition_expr_fetcher: PartitionExprFetcherRef,
    /// Semaphore created with `max_background_flushes` permits and shared by all workers.
    flush_semaphore: Arc<Semaphore>,
}
562
563impl<S: LogStore> WorkerStarter<S> {
564 fn start(self) -> Result<RegionWorker> {
566 let regions = Arc::new(RegionMap::default());
567 let opening_regions = Arc::new(OpeningRegions::default());
568 let catchup_regions = Arc::new(CatchupRegions::default());
569 let (sender, receiver) = mpsc::channel(self.config.worker_channel_size);
570
571 let running = Arc::new(AtomicBool::new(true));
572 let now = self.time_provider.current_time_millis();
573 let id_string = self.id.to_string();
574 let mut worker_thread = RegionWorkerLoop {
575 id: self.id,
576 config: self.config.clone(),
577 regions: regions.clone(),
578 catchup_regions: catchup_regions.clone(),
579 dropping_regions: Arc::new(RegionMap::default()),
580 opening_regions: opening_regions.clone(),
581 sender: sender.clone(),
582 receiver,
583 wal: Wal::new(self.log_store),
584 object_store_manager: self.object_store_manager.clone(),
585 running: running.clone(),
586 memtable_builder_provider: MemtableBuilderProvider::new(
587 Some(self.write_buffer_manager.clone()),
588 self.config.clone(),
589 ),
590 purge_scheduler: self.purge_scheduler.clone(),
591 write_buffer_manager: self.write_buffer_manager,
592 index_build_scheduler: IndexBuildScheduler::new(
593 self.index_build_job_pool,
594 self.config.max_background_index_builds,
595 ),
596 flush_scheduler: FlushScheduler::new(self.flush_job_pool),
597 compaction_scheduler: CompactionScheduler::new(
598 self.compact_job_pool,
599 sender.clone(),
600 self.cache_manager.clone(),
601 self.config.clone(),
602 self.listener.clone(),
603 self.plugins.clone(),
604 self.compaction_memory_manager.clone(),
605 self.config.experimental_compaction_on_exhausted,
606 ),
607 stalled_requests: StalledRequests::default(),
608 listener: self.listener,
609 cache_manager: self.cache_manager,
610 puffin_manager_factory: self.puffin_manager_factory,
611 intermediate_manager: self.intermediate_manager,
612 time_provider: self.time_provider,
613 last_periodical_check_millis: now,
614 flush_sender: self.flush_sender,
615 flush_receiver: self.flush_receiver,
616 stalling_count: WRITE_STALLING.with_label_values(&[&id_string]),
617 region_count: REGION_COUNT.with_label_values(&[&id_string]),
618 request_wait_time: REQUEST_WAIT_TIME.with_label_values(&[&id_string]),
619 region_edit_queues: RegionEditQueues::default(),
620 schema_metadata_manager: self.schema_metadata_manager,
621 file_ref_manager: self.file_ref_manager.clone(),
622 partition_expr_fetcher: self.partition_expr_fetcher,
623 flush_semaphore: self.flush_semaphore,
624 };
625 let handle = common_runtime::spawn_global(async move {
626 worker_thread.run().await;
627 });
628
629 Ok(RegionWorker {
630 id: self.id,
631 regions,
632 opening_regions,
633 catchup_regions,
634 sender,
635 handle: Mutex::new(Some(handle)),
636 running,
637 })
638 }
639}
640
/// Handle to a running region worker loop.
pub(crate) struct RegionWorker {
    /// Id of the worker.
    id: WorkerId,
    /// Region map shared with the worker loop.
    regions: RegionMapRef,
    /// Opening regions, shared with the worker loop.
    opening_regions: OpeningRegionsRef,
    /// Catching-up regions, shared with the worker loop.
    catchup_regions: CatchupRegionsRef,
    /// Channel used to send requests to the worker loop.
    sender: Sender<WorkerRequestWithTime>,
    /// Join handle of the spawned loop; taken (left as `None`) by the first `stop` call.
    handle: Mutex<Option<JoinHandle<()>>>,
    /// Running flag shared with the worker loop.
    running: Arc<AtomicBool>,
}
658
659impl RegionWorker {
660 async fn submit_request(&self, request: WorkerRequest) -> Result<()> {
662 ensure!(self.is_running(), WorkerStoppedSnafu { id: self.id });
663 let request_with_time = WorkerRequestWithTime::new(request);
664 if self.sender.send(request_with_time).await.is_err() {
665 warn!(
666 "Worker {} is already exited but the running flag is still true",
667 self.id
668 );
669 self.set_running(false);
671 return WorkerStoppedSnafu { id: self.id }.fail();
672 }
673
674 Ok(())
675 }
676
677 async fn stop(&self) -> Result<()> {
681 let handle = self.handle.lock().await.take();
682 if let Some(handle) = handle {
683 info!("Stop region worker {}", self.id);
684
685 self.set_running(false);
686 if self
687 .sender
688 .send(WorkerRequestWithTime::new(WorkerRequest::Stop))
689 .await
690 .is_err()
691 {
692 warn!("Worker {} is already exited before stop", self.id);
693 }
694
695 handle.await.context(JoinSnafu)?;
696 }
697
698 Ok(())
699 }
700
701 fn is_running(&self) -> bool {
703 self.running.load(Ordering::Relaxed)
704 }
705
706 fn set_running(&self, value: bool) {
708 self.running.store(value, Ordering::Relaxed)
709 }
710
711 fn is_region_exists(&self, region_id: RegionId) -> bool {
713 self.regions.is_region_exists(region_id)
714 }
715
716 fn is_region_opening(&self, region_id: RegionId) -> bool {
718 self.opening_regions.is_region_exists(region_id)
719 }
720
721 fn is_region_catching_up(&self, region_id: RegionId) -> bool {
723 self.catchup_regions.is_region_exists(region_id)
724 }
725
726 fn get_region(&self, region_id: RegionId) -> Option<MitoRegionRef> {
728 self.regions.get_region(region_id)
729 }
730
731 #[cfg(test)]
732 pub(crate) fn opening_regions(&self) -> &OpeningRegionsRef {
734 &self.opening_regions
735 }
736
737 #[cfg(test)]
738 pub(crate) fn catchup_regions(&self) -> &CatchupRegionsRef {
740 &self.catchup_regions
741 }
742}
743
744impl Drop for RegionWorker {
745 fn drop(&mut self) {
746 if self.is_running() {
747 self.set_running(false);
748 }
750 }
751}
752
/// Buffer of worker requests that are neither write, DDL nor bulk insert requests.
type RequestBuffer = Vec<WorkerRequest>;
754
/// Write and bulk requests held back per region, with their estimated sizes.
#[derive(Default)]
pub(crate) struct StalledRequests {
    /// Per region: (sum of estimated sizes, write requests, bulk requests).
    pub(crate) requests:
        HashMap<RegionId, (usize, Vec<SenderWriteRequest>, Vec<SenderBulkRequest>)>,
    /// Sum of the estimated sizes of all held requests across regions.
    pub(crate) estimated_size: usize,
}
771
772impl StalledRequests {
773 pub(crate) fn append(
775 &mut self,
776 requests: &mut Vec<SenderWriteRequest>,
777 bulk_requests: &mut Vec<SenderBulkRequest>,
778 ) {
779 for req in requests.drain(..) {
780 self.push(req);
781 }
782 for req in bulk_requests.drain(..) {
783 self.push_bulk(req);
784 }
785 }
786
787 pub(crate) fn push(&mut self, req: SenderWriteRequest) {
789 let (size, requests, _) = self.requests.entry(req.request.region_id).or_default();
790 let req_size = req.request.estimated_size();
791 *size += req_size;
792 self.estimated_size += req_size;
793 requests.push(req);
794 }
795
796 pub(crate) fn push_bulk(&mut self, req: SenderBulkRequest) {
797 let region_id = req.region_id;
798 let (size, _, requests) = self.requests.entry(region_id).or_default();
799 let req_size = req.request.estimated_size();
800 *size += req_size;
801 self.estimated_size += req_size;
802 requests.push(req);
803 }
804
805 pub(crate) fn remove(
807 &mut self,
808 region_id: &RegionId,
809 ) -> (Vec<SenderWriteRequest>, Vec<SenderBulkRequest>) {
810 if let Some((size, write_reqs, bulk_reqs)) = self.requests.remove(region_id) {
811 self.estimated_size -= size;
812 (write_reqs, bulk_reqs)
813 } else {
814 (vec![], vec![])
815 }
816 }
817
818 pub(crate) fn stalled_count(&self) -> usize {
820 self.requests
821 .values()
822 .map(|(_, reqs, bulk_reqs)| reqs.len() + bulk_reqs.len())
823 .sum()
824 }
825}
826
/// State owned by the background loop of one region worker.
struct RegionWorkerLoop<S> {
    /// Id of the worker.
    id: WorkerId,
    /// Engine configuration.
    config: Arc<MitoConfig>,
    /// Regions bound to the worker; shared with the `RegionWorker` handle.
    regions: RegionMapRef,
    /// Regions being dropped; created empty when the worker starts.
    dropping_regions: RegionMapRef,
    /// Regions being opened; shared with the `RegionWorker` handle.
    opening_regions: OpeningRegionsRef,
    /// Regions catching up; shared with the `RegionWorker` handle.
    catchup_regions: CatchupRegionsRef,
    /// Sender of the worker's own request channel.
    sender: Sender<WorkerRequestWithTime>,
    /// Receiver the loop reads requests from.
    receiver: Receiver<WorkerRequestWithTime>,
    /// WAL built on the worker's log store.
    wal: Wal<S>,
    /// Object store manager.
    object_store_manager: ObjectStoreManagerRef,
    /// Running flag; the loop exits once it reads false.
    running: Arc<AtomicBool>,
    /// Provider of memtable builders.
    memtable_builder_provider: MemtableBuilderProvider,
    /// Scheduler for purge jobs.
    purge_scheduler: SchedulerRef,
    /// Write buffer manager.
    write_buffer_manager: WriteBufferManagerRef,
    /// Scheduler for index build jobs.
    index_build_scheduler: IndexBuildScheduler,
    /// Scheduler for flush jobs.
    flush_scheduler: FlushScheduler,
    /// Scheduler for compaction jobs.
    compaction_scheduler: CompactionScheduler,
    /// Write and bulk requests that are currently stalled.
    stalled_requests: StalledRequests,
    /// Listener notified about worker events, e.g. the size of each received batch.
    listener: WorkerListener,
    /// Cache manager.
    cache_manager: CacheManagerRef,
    /// Puffin manager factory.
    puffin_manager_factory: PuffinManagerFactory,
    /// Intermediate manager.
    intermediate_manager: IntermediateManager,
    /// Clock used for wait durations and periodical checks.
    time_provider: TimeProviderRef,
    /// Timestamp in milliseconds the next periodical check is measured from.
    last_periodical_check_millis: i64,
    /// Sending half of the flush notification channel.
    flush_sender: watch::Sender<()>,
    /// Receiving half of the flush notification channel; a change wakes the loop.
    flush_receiver: watch::Receiver<()>,
    /// `WRITE_STALLING` gauge labelled with the worker id.
    stalling_count: IntGauge,
    /// `REGION_COUNT` gauge labelled with the worker id.
    region_count: IntGauge,
    /// `REQUEST_WAIT_TIME` histogram labelled with the worker id; records, in seconds, how
    /// long a request waited before the loop received it.
    request_wait_time: Histogram,
    /// Queues of pending region edits.
    region_edit_queues: RegionEditQueues,
    /// Schema metadata manager.
    schema_metadata_manager: SchemaMetadataManagerRef,
    /// File reference manager.
    file_ref_manager: FileReferenceManagerRef,
    /// Partition expression fetcher.
    partition_expr_fetcher: PartitionExprFetcherRef,
    /// Semaphore created with `max_background_flushes` permits and shared by all workers.
    flush_semaphore: Arc<Semaphore>,
}
898
899impl<S: LogStore> RegionWorkerLoop<S> {
900 async fn run(&mut self) {
902 let init_check_delay = worker_init_check_delay();
903 info!(
904 "Start region worker thread {}, init_check_delay: {:?}",
905 self.id, init_check_delay
906 );
907 self.last_periodical_check_millis += init_check_delay.as_millis() as i64;
908
909 let mut write_req_buffer: Vec<SenderWriteRequest> =
911 Vec::with_capacity(self.config.worker_request_batch_size);
912 let mut bulk_req_buffer: Vec<SenderBulkRequest> =
913 Vec::with_capacity(self.config.worker_request_batch_size);
914 let mut ddl_req_buffer: Vec<SenderDdlRequest> =
915 Vec::with_capacity(self.config.worker_request_batch_size);
916 let mut general_req_buffer: Vec<WorkerRequest> =
917 RequestBuffer::with_capacity(self.config.worker_request_batch_size);
918
919 while self.running.load(Ordering::Relaxed) {
920 write_req_buffer.clear();
922 ddl_req_buffer.clear();
923 general_req_buffer.clear();
924 let mut bulk_insert_req_num = 0;
925
926 let max_wait_time = self.time_provider.wait_duration(CHECK_REGION_INTERVAL);
927 let sleep = tokio::time::sleep(max_wait_time);
928 tokio::pin!(sleep);
929
930 tokio::select! {
931 request_opt = self.receiver.recv() => {
932 match request_opt {
933 Some(request_with_time) => {
934 let wait_time = request_with_time.created_at.elapsed();
936 self.request_wait_time.observe(wait_time.as_secs_f64());
937
938 match request_with_time.request {
939 WorkerRequest::Write(sender_req) => write_req_buffer.push(sender_req),
940 WorkerRequest::Ddl(sender_req) => ddl_req_buffer.push(sender_req),
941 WorkerRequest::BulkInserts(bulk_insert) => {
942 bulk_insert_req_num += 1;
943 self.buffer_bulk_insert_request(bulk_insert, &mut bulk_req_buffer)
944 .await;
945 }
946 req => general_req_buffer.push(req),
947 }
948 },
949 None => break,
951 }
952 }
953 recv_res = self.flush_receiver.changed() => {
954 if recv_res.is_err() {
955 break;
957 } else {
958 self.maybe_flush_worker();
963 self.handle_stalled_requests().await;
965 continue;
966 }
967 }
968 _ = &mut sleep => {
969 self.handle_periodical_tasks();
971 continue;
972 }
973 }
974
975 if self.flush_receiver.has_changed().unwrap_or(false) {
976 self.handle_stalled_requests().await;
980 }
981
982 for _ in 1..self.config.worker_request_batch_size {
984 match self.receiver.try_recv() {
986 Ok(request_with_time) => {
987 let wait_time = request_with_time.created_at.elapsed();
989 self.request_wait_time.observe(wait_time.as_secs_f64());
990
991 match request_with_time.request {
992 WorkerRequest::Write(sender_req) => write_req_buffer.push(sender_req),
993 WorkerRequest::Ddl(sender_req) => ddl_req_buffer.push(sender_req),
994 WorkerRequest::BulkInserts(bulk_insert) => {
995 bulk_insert_req_num += 1;
996 self.buffer_bulk_insert_request(bulk_insert, &mut bulk_req_buffer)
997 .await
998 }
999 req => general_req_buffer.push(req),
1000 }
1001 }
1002 Err(_) => break,
1004 }
1005 }
1006
1007 self.listener.on_recv_requests(
1008 write_req_buffer.len()
1009 + ddl_req_buffer.len()
1010 + general_req_buffer.len()
1011 + bulk_insert_req_num,
1012 );
1013
1014 self.handle_requests(
1015 &mut write_req_buffer,
1016 &mut ddl_req_buffer,
1017 &mut general_req_buffer,
1018 &mut bulk_req_buffer,
1019 )
1020 .await;
1021
1022 self.handle_periodical_tasks();
1023 }
1024
1025 self.clean().await;
1026
1027 info!("Exit region worker thread {}", self.id);
1028 }
1029
1030 async fn buffer_bulk_insert_request(
1031 &mut self,
1032 bulk_insert: BulkInsertRequest,
1033 bulk_requests: &mut Vec<SenderBulkRequest>,
1034 ) {
1035 let BulkInsertRequest {
1036 metadata,
1037 request,
1038 sender,
1039 } = bulk_insert;
1040
1041 if let Some(region_metadata) = metadata {
1042 self.handle_bulk_insert_batch(region_metadata, request, bulk_requests, sender)
1043 .await;
1044 } else {
1045 error!("Cannot find region metadata for {}", request.region_id);
1046 sender.send(
1047 error::RegionNotFoundSnafu {
1048 region_id: request.region_id,
1049 }
1050 .fail(),
1051 );
1052 }
1053 }
1054
1055 async fn handle_requests(
1059 &mut self,
1060 write_requests: &mut Vec<SenderWriteRequest>,
1061 ddl_requests: &mut Vec<SenderDdlRequest>,
1062 general_requests: &mut Vec<WorkerRequest>,
1063 bulk_requests: &mut Vec<SenderBulkRequest>,
1064 ) {
1065 for worker_req in general_requests.drain(..) {
1066 match worker_req {
1067 WorkerRequest::Write(_) | WorkerRequest::Ddl(_) => {
1068 continue;
1070 }
1071 WorkerRequest::BulkInserts(_) => unreachable!("bulk inserts are buffered"),
1072 WorkerRequest::Background { region_id, notify } => {
1073 if matches!(
1074 ¬ify,
1075 BackgroundNotify::RegionEdit(edit_result)
1076 if edit_result.update_region_state
1077 ) {
1078 self.handle_buffered_region_write_requests(
1087 ®ion_id,
1088 write_requests,
1089 bulk_requests,
1090 )
1091 .await;
1092 }
1093 self.handle_background_notify(region_id, notify).await;
1095 }
1096 WorkerRequest::SetRegionRoleStateGracefully {
1097 region_id,
1098 region_role_state,
1099 sender,
1100 } => {
1101 self.set_role_state_gracefully(region_id, region_role_state, sender)
1102 .await;
1103 }
1104 WorkerRequest::EditRegion(request) => {
1105 self.handle_region_edit(request);
1106 }
1107 WorkerRequest::Stop => {
1108 debug_assert!(!self.running.load(Ordering::Relaxed));
1109 }
1110 WorkerRequest::SyncRegion(req) => {
1111 self.handle_region_sync(req).await;
1112 }
1113 WorkerRequest::RemapManifests(req) => {
1114 self.handle_remap_manifests_request(req);
1115 }
1116 WorkerRequest::CopyRegionFrom(req) => {
1117 self.handle_copy_region_from_request(req);
1118 }
1119 }
1120 }
1121
1122 self.handle_write_requests(write_requests, bulk_requests, true)
1125 .await;
1126
1127 self.handle_ddl_requests(ddl_requests).await;
1128 }
1129
1130 async fn handle_ddl_requests(&mut self, ddl_requests: &mut Vec<SenderDdlRequest>) {
1132 if ddl_requests.is_empty() {
1133 return;
1134 }
1135
1136 for ddl in ddl_requests.drain(..) {
1137 let res = match ddl.request {
1138 DdlRequest::Create(req) => self.handle_create_request(ddl.region_id, req).await,
1139 DdlRequest::Drop(req) => {
1140 self.handle_drop_request(ddl.region_id, req.partial_drop)
1141 .await
1142 }
1143 DdlRequest::Open((req, wal_entry_receiver)) => {
1144 self.handle_open_request(ddl.region_id, req, wal_entry_receiver, ddl.sender)
1145 .await;
1146 continue;
1147 }
1148 DdlRequest::Close(_) => {
1149 self.handle_close_request(ddl.region_id, ddl.sender).await;
1150 continue;
1151 }
1152 DdlRequest::Alter(req) => {
1153 self.handle_alter_request(ddl.region_id, req, ddl.sender)
1154 .await;
1155 continue;
1156 }
1157 DdlRequest::Flush(req) => {
1158 self.handle_flush_request(ddl.region_id, req, ddl.sender);
1159 continue;
1160 }
1161 DdlRequest::Compact(req) => {
1162 self.handle_compaction_request(ddl.region_id, req, ddl.sender)
1163 .await;
1164 continue;
1165 }
1166 DdlRequest::BuildIndex(req) => {
1167 self.handle_build_index_request(ddl.region_id, req, ddl.sender)
1168 .await;
1169 continue;
1170 }
1171 DdlRequest::Truncate(req) => {
1172 self.handle_truncate_request(ddl.region_id, req, ddl.sender)
1173 .await;
1174 continue;
1175 }
1176 DdlRequest::Catchup((req, wal_entry_receiver)) => {
1177 self.handle_catchup_request(ddl.region_id, req, wal_entry_receiver, ddl.sender)
1178 .await;
1179 continue;
1180 }
1181 DdlRequest::EnterStaging(req) => {
1182 self.handle_enter_staging_request(
1183 ddl.region_id,
1184 req.partition_directive,
1185 ddl.sender,
1186 )
1187 .await;
1188 continue;
1189 }
1190 DdlRequest::ApplyStagingManifest(req) => {
1191 self.handle_apply_staging_manifest_request(ddl.region_id, req, ddl.sender)
1192 .await;
1193 continue;
1194 }
1195 };
1196
1197 ddl.sender.send(res);
1198 }
1199 }
1200
1201 fn handle_periodical_tasks(&mut self) {
1203 let interval = CHECK_REGION_INTERVAL.as_millis() as i64;
1204 if self
1205 .time_provider
1206 .elapsed_since(self.last_periodical_check_millis)
1207 < interval
1208 {
1209 return;
1210 }
1211
1212 self.last_periodical_check_millis = self.time_provider.current_time_millis();
1213
1214 if let Err(e) = self.flush_periodically() {
1215 error!(e; "Failed to flush regions periodically");
1216 }
1217 }
1218
1219 async fn handle_background_notify(&mut self, region_id: RegionId, notify: BackgroundNotify) {
1221 match notify {
1222 BackgroundNotify::FlushFinished(req) => {
1223 self.handle_flush_finished(region_id, req).await
1224 }
1225 BackgroundNotify::FlushFailed(req) => self.handle_flush_failed(region_id, req).await,
1226 BackgroundNotify::IndexBuildFinished(req) => {
1227 self.handle_index_build_finished(region_id, req).await
1228 }
1229 BackgroundNotify::IndexBuildStopped(req) => {
1230 self.handle_index_build_stopped(region_id, req).await
1231 }
1232 BackgroundNotify::IndexBuildFailed(req) => {
1233 self.handle_index_build_failed(region_id, req).await
1234 }
1235 BackgroundNotify::CompactionFinished(req) => {
1236 self.handle_compaction_finished(region_id, req).await
1237 }
1238 BackgroundNotify::CompactionCancelled(req) => {
1239 self.handle_compaction_cancelled(region_id, req).await
1240 }
1241 BackgroundNotify::CompactionFailed(req) => self.handle_compaction_failure(req).await,
1242 BackgroundNotify::Truncate(req) => self.handle_truncate_result(req).await,
1243 BackgroundNotify::RegionChange(req) => {
1244 self.handle_manifest_region_change_result(req).await
1245 }
1246 BackgroundNotify::EnterStaging(req) => self.handle_enter_staging_result(req).await,
1247 BackgroundNotify::RegionEdit(req) => self.handle_region_edit_result(req).await,
1248 BackgroundNotify::CopyRegionFromFinished(req) => {
1249 self.handle_copy_region_from_finished(req)
1250 }
1251 }
1252 }
1253
1254 async fn set_role_state_gracefully(
1256 &mut self,
1257 region_id: RegionId,
1258 region_role_state: SettableRegionRoleState,
1259 sender: oneshot::Sender<SetRegionRoleStateResponse>,
1260 ) {
1261 if let Some(region) = self.regions.get_region(region_id) {
1262 common_runtime::spawn_global(async move {
1264 match region.set_role_state_gracefully(region_role_state).await {
1265 Ok(()) => {
1266 let last_entry_id = region.version_control.current().last_entry_id;
1267 let _ = sender.send(SetRegionRoleStateResponse::success(
1268 SetRegionRoleStateSuccess::mito(last_entry_id),
1269 ));
1270 }
1271 Err(e) => {
1272 error!(e; "Failed to set region {} role state to {:?}", region_id, region_role_state);
1273 let _ = sender.send(SetRegionRoleStateResponse::invalid_transition(
1274 BoxedError::new(e),
1275 ));
1276 }
1277 }
1278 });
1279 } else {
1280 let _ = sender.send(SetRegionRoleStateResponse::NotFound);
1281 }
1282 }
1283}
1284
1285impl<S> RegionWorkerLoop<S> {
1286 async fn clean(&self) {
1288 let regions = self.regions.list_regions();
1290 for region in regions {
1291 region.stop().await;
1292 }
1293
1294 self.regions.clear();
1295 }
1296
1297 fn notify_group(&mut self) {
1300 let _ = self.flush_sender.send(());
1302 self.flush_receiver.borrow_and_update();
1304 }
1305}
1306
/// Hook point through which the worker reports events to an optional listener.
///
/// The `listener` field exists only when compiled for tests or with the `test`
/// feature; in every other build the struct has no fields.
#[derive(Clone, Default)]
pub(crate) struct WorkerListener {
    /// Event listener to forward to, if one was provided.
    #[cfg(any(test, feature = "test"))]
    listener: Option<crate::engine::listener::EventListenerRef>,
}
1313
1314impl WorkerListener {
1315 #[cfg(any(test, feature = "test"))]
1316 pub(crate) fn new(
1317 listener: Option<crate::engine::listener::EventListenerRef>,
1318 ) -> WorkerListener {
1319 WorkerListener { listener }
1320 }
1321
1322 pub(crate) fn on_flush_success(&self, region_id: RegionId) {
1324 #[cfg(any(test, feature = "test"))]
1325 if let Some(listener) = &self.listener {
1326 listener.on_flush_success(region_id);
1327 }
1328 let _ = region_id;
1330 }
1331
1332 pub(crate) fn on_write_stall(&self) {
1334 #[cfg(any(test, feature = "test"))]
1335 if let Some(listener) = &self.listener {
1336 listener.on_write_stall();
1337 }
1338 }
1339
1340 pub(crate) async fn on_flush_begin(&self, region_id: RegionId) {
1341 #[cfg(any(test, feature = "test"))]
1342 if let Some(listener) = &self.listener {
1343 listener.on_flush_begin(region_id).await;
1344 }
1345 let _ = region_id;
1347 }
1348
1349 pub(crate) fn on_later_drop_begin(&self, region_id: RegionId) -> Option<Duration> {
1350 #[cfg(any(test, feature = "test"))]
1351 if let Some(listener) = &self.listener {
1352 return listener.on_later_drop_begin(region_id);
1353 }
1354 let _ = region_id;
1356 None
1357 }
1358
1359 pub(crate) fn on_later_drop_end(&self, region_id: RegionId, removed: bool) {
1361 #[cfg(any(test, feature = "test"))]
1362 if let Some(listener) = &self.listener {
1363 listener.on_later_drop_end(region_id, removed);
1364 }
1365 let _ = region_id;
1367 let _ = removed;
1368 }
1369
1370 pub(crate) async fn on_merge_ssts_finished(&self, region_id: RegionId) {
1371 #[cfg(any(test, feature = "test"))]
1372 if let Some(listener) = &self.listener {
1373 listener.on_merge_ssts_finished(region_id).await;
1374 }
1375 let _ = region_id;
1377 }
1378
1379 pub(crate) fn on_recv_requests(&self, request_num: usize) {
1380 #[cfg(any(test, feature = "test"))]
1381 if let Some(listener) = &self.listener {
1382 listener.on_recv_requests(request_num);
1383 }
1384 let _ = request_num;
1386 }
1387
1388 pub(crate) fn on_file_cache_filled(&self, _file_id: FileId) {
1389 #[cfg(any(test, feature = "test"))]
1390 if let Some(listener) = &self.listener {
1391 listener.on_file_cache_filled(_file_id);
1392 }
1393 }
1394
1395 pub(crate) fn on_compaction_scheduled(&self, _region_id: RegionId) {
1396 #[cfg(any(test, feature = "test"))]
1397 if let Some(listener) = &self.listener {
1398 listener.on_compaction_scheduled(_region_id);
1399 }
1400 }
1401
1402 pub(crate) async fn on_notify_region_change_result_begin(&self, _region_id: RegionId) {
1403 #[cfg(any(test, feature = "test"))]
1404 if let Some(listener) = &self.listener {
1405 listener
1406 .on_notify_region_change_result_begin(_region_id)
1407 .await;
1408 }
1409 }
1410
1411 pub(crate) async fn on_enter_staging_result_begin(&self, _region_id: RegionId) {
1412 #[cfg(any(test, feature = "test"))]
1413 if let Some(listener) = &self.listener {
1414 listener.on_enter_staging_result_begin(_region_id).await;
1415 }
1416 }
1417
1418 pub(crate) async fn on_index_build_finish(&self, _region_file_id: RegionFileId) {
1419 #[cfg(any(test, feature = "test"))]
1420 if let Some(listener) = &self.listener {
1421 listener.on_index_build_finish(_region_file_id).await;
1422 }
1423 }
1424
1425 pub(crate) async fn on_index_build_begin(&self, _region_file_id: RegionFileId) {
1426 #[cfg(any(test, feature = "test"))]
1427 if let Some(listener) = &self.listener {
1428 listener.on_index_build_begin(_region_file_id).await;
1429 }
1430 }
1431
1432 pub(crate) async fn on_index_build_abort(&self, _region_file_id: RegionFileId) {
1433 #[cfg(any(test, feature = "test"))]
1434 if let Some(listener) = &self.listener {
1435 listener.on_index_build_abort(_region_file_id).await;
1436 }
1437 }
1438}
1439
#[cfg(test)]
mod tests {
    use super::*;
    use crate::test_util::TestEnv;

    /// Checks the worker index computed for known region ids with four workers.
    #[test]
    fn test_region_id_to_index() {
        let num_workers = 4;

        // (region id, expected worker index)
        let cases = [(RegionId::new(1, 2), 3), (RegionId::new(2, 3), 1)];
        for (region_id, expected) in cases {
            let index = region_id_to_index(region_id, num_workers);
            assert_eq!(index, expected);
        }
    }

    /// A worker group with four workers can be created and stopped without error.
    #[tokio::test]
    async fn test_worker_group_start_stop() {
        let env = TestEnv::with_prefix("group-stop").await;
        let config = MitoConfig {
            num_workers: 4,
            ..Default::default()
        };
        let group = env.create_worker_group(config).await;

        group.stop().await.unwrap();
    }
}