Skip to main content

mito2/
region.rs

1// Copyright 2023 Greptime Team
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7//     http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15//! Mito region.
16
17pub mod catchup;
18pub mod opener;
19pub mod options;
20pub mod utils;
21pub(crate) mod version;
22
23use std::collections::hash_map::Entry;
24use std::collections::{HashMap, HashSet};
25use std::sync::atomic::{AtomicI64, AtomicU64, Ordering};
26use std::sync::{Arc, Mutex, RwLock};
27
28use common_base::hash::partition_expr_version;
29use common_telemetry::{error, info, warn};
30use crossbeam_utils::atomic::AtomicCell;
31use partition::expr::PartitionExpr;
32use snafu::{OptionExt, ResultExt, ensure};
33use store_api::ManifestVersion;
34use store_api::codec::PrimaryKeyEncoding;
35use store_api::logstore::provider::Provider;
36use store_api::metadata::RegionMetadataRef;
37use store_api::region_engine::{
38    RegionManifestInfo, RegionRole, RegionStatistic, SettableRegionRoleState,
39};
40use store_api::region_info::RegionInfoEntry;
41use store_api::region_request::{PathType, StagingPartitionDirective};
42use store_api::sst_entry::ManifestSstEntry;
43use store_api::storage::{FileId, RegionId, SequenceNumber};
44use tokio::sync::RwLockWriteGuard;
45pub use utils::*;
46
47use crate::access_layer::AccessLayerRef;
48use crate::error::{
49    FlushableRegionStateSnafu, InvalidPartitionExprSnafu, RegionNotFoundSnafu, RegionStateSnafu,
50    RegionTruncatedSnafu, Result, UnexpectedSnafu, UpdateManifestSnafu,
51};
52use crate::manifest::action::{
53    RegionChange, RegionManifest, RegionMetaAction, RegionMetaActionList,
54};
55use crate::manifest::manager::RegionManifestManager;
56use crate::region::version::{VersionControlRef, VersionRef};
57use crate::request::{OnFailure, OptionOutputTx};
58use crate::sst::file::FileMeta;
59use crate::sst::file_purger::FilePurgerRef;
60use crate::sst::location::{index_file_path, sst_file_path};
61use crate::time_provider::TimeProviderRef;
62
/// Approximate factor used to estimate the size of the WAL.
///
/// The estimate is computed as `memtable usage * ESTIMATED_WAL_FACTOR`.
const ESTIMATED_WAL_FACTOR: f32 = 0.42825;
65
66/// Region status include region id, memtable usage, sst usage, wal usage and manifest usage.
67#[derive(Debug)]
68pub struct RegionUsage {
69    pub region_id: RegionId,
70    pub wal_usage: u64,
71    pub sst_usage: u64,
72    pub manifest_usage: u64,
73}
74
75impl RegionUsage {
76    pub fn disk_usage(&self) -> u64 {
77        self.wal_usage + self.sst_usage + self.manifest_usage
78    }
79}
80
/// States a region can be in while it is a leader.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum RegionLeaderState {
    /// The region is opened and is writable.
    Writable,
    /// The region is in staging mode: writable, but without checkpoint/compaction.
    Staging,
    /// The region is entering staging mode; write requests will be stalled.
    EnteringStaging,
    /// The region is altering.
    Altering,
    /// The region is dropping.
    Dropping,
    /// The region is truncating.
    Truncating,
    /// The region is handling a region edit.
    Editing,
    /// The region is stepping down.
    Downgrading,
}

/// Role of a region, including the leader state when the region is a leader.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum RegionRoleState {
    /// The region is a leader in the given state.
    Leader(RegionLeaderState),
    /// The region is a follower.
    Follower,
}

impl RegionRoleState {
    /// Returns the inner leader state, or `None` if the region is a follower.
    pub fn into_leader_state(self) -> Option<RegionLeaderState> {
        if let RegionRoleState::Leader(leader_state) = self {
            Some(leader_state)
        } else {
            None
        }
    }

    /// Returns a static, human readable name of the state.
    pub(crate) fn as_str(&self) -> &'static str {
        let leader_state = match self {
            RegionRoleState::Follower => return "Follower",
            RegionRoleState::Leader(leader_state) => leader_state,
        };

        match leader_state {
            RegionLeaderState::Writable => "Leader(Writable)",
            RegionLeaderState::Staging => "Leader(Staging)",
            RegionLeaderState::EnteringStaging => "Leader(EnteringStaging)",
            RegionLeaderState::Altering => "Leader(Altering)",
            RegionLeaderState::Dropping => "Leader(Dropping)",
            RegionLeaderState::Truncating => "Leader(Truncating)",
            RegionLeaderState::Editing => "Leader(Editing)",
            RegionLeaderState::Downgrading => "Leader(Downgrading)",
        }
    }
}
132
/// Metadata and runtime status of a region.
///
/// Writing and reading a region follow a single-writer-multi-reader rule:
/// - Only the region worker thread this region belongs to can modify the metadata.
/// - Multiple reader threads are allowed to read a specific `version` of a region.
#[derive(Debug)]
pub struct MitoRegion {
    /// Id of this region.
    ///
    /// Accessing region id from the version control is inconvenient so
    /// we also store it here.
    pub(crate) region_id: RegionId,

    /// Version controller for this region.
    ///
    /// We MUST update the version control inside the write lock of the region manifest manager.
    pub(crate) version_control: VersionControlRef,
    /// SSTs accessor for this region.
    pub(crate) access_layer: AccessLayerRef,
    /// Context to maintain manifest for this region. It also holds the role state of the region.
    pub(crate) manifest_ctx: ManifestContextRef,
    /// SST file purger.
    pub(crate) file_purger: FilePurgerRef,
    /// The provider of log store.
    pub(crate) provider: Provider,
    /// Last flush time in millis.
    last_flush_millis: AtomicI64,
    /// Last schedule compaction time in millis.
    last_schedule_compaction_millis: AtomicI64,
    /// Provider to get current time.
    time_provider: TimeProviderRef,
    /// The topic's latest entry id since the region's last flushing.
    /// **Only used for remote WAL pruning.**
    ///
    /// The value will be updated to the latest offset of the topic
    /// if region receives a flush request or schedules a periodic flush task
    /// and the region's memtable is empty.
    ///
    /// There are no WAL entries in range [flushed_entry_id, topic_latest_entry_id] for current region,
    /// which means these WAL entries may be able to be pruned up to `topic_latest_entry_id`.
    pub(crate) topic_latest_entry_id: AtomicU64,
    /// The total bytes written to the region.
    pub(crate) written_bytes: Arc<AtomicU64>,
    /// Manifest statistics: provides the total manifest size, the manifest version
    /// and the count of removed files.
    stats: ManifestStats,
}
179
/// Shared, reference-counted handle to a [`MitoRegion`].
pub type MitoRegionRef = Arc<MitoRegion>;
181
182#[derive(Debug, Clone)]
183pub(crate) struct StagingPartitionInfo {
184    pub(crate) partition_directive: StagingPartitionDirective,
185    pub(crate) partition_rule_version: u64,
186}
187
188impl StagingPartitionInfo {
189    /// Returns the partition expression carried by the staging directive, if any.
190    pub(crate) fn partition_expr(&self) -> Option<&str> {
191        self.partition_directive.partition_expr()
192    }
193
194    /// Builds staging partition info from a directive and derives its version marker.
195    pub(crate) fn from_partition_directive(partition_directive: StagingPartitionDirective) -> Self {
196        let partition_rule_version = match &partition_directive {
197            StagingPartitionDirective::UpdatePartitionExpr(expr) => {
198                partition_expr_version(Some(expr))
199            }
200            StagingPartitionDirective::RejectAllWrites => 0,
201        };
202        Self {
203            partition_directive,
204            partition_rule_version,
205        }
206    }
207}
208
209impl MitoRegion {
210    /// Stop background managers for this region.
211    pub(crate) async fn stop(&self) {
212        self.manifest_ctx
213            .manifest_manager
214            .write()
215            .await
216            .stop()
217            .await;
218
219        info!(
220            "Stopped region manifest manager, region_id: {}",
221            self.region_id
222        );
223    }
224
225    /// Returns current metadata of the region.
226    pub fn metadata(&self) -> RegionMetadataRef {
227        let version_data = self.version_control.current();
228        version_data.version.metadata.clone()
229    }
230
231    /// Returns primary key encoding of the region.
232    pub(crate) fn primary_key_encoding(&self) -> PrimaryKeyEncoding {
233        let version_data = self.version_control.current();
234        version_data.version.metadata.primary_key_encoding
235    }
236
237    /// Returns current version of the region.
238    pub(crate) fn version(&self) -> VersionRef {
239        let version_data = self.version_control.current();
240        version_data.version
241    }
242
    /// Returns the last flush timestamp in millis.
    ///
    /// This is the value stored by the latest call to `update_flush_millis()`;
    /// it is read with relaxed ordering.
    pub(crate) fn last_flush_millis(&self) -> i64 {
        self.last_flush_millis.load(Ordering::Relaxed)
    }
247
248    /// Update flush time to current time.
249    pub(crate) fn update_flush_millis(&self) {
250        let now = self.time_provider.current_time_millis();
251        self.last_flush_millis.store(now, Ordering::Relaxed);
252    }
253
    /// Returns the last schedule compaction timestamp in millis.
    ///
    /// This is the value stored by the latest call to
    /// `update_schedule_compaction_millis()`; it is read with relaxed ordering.
    pub(crate) fn last_schedule_compaction_millis(&self) -> i64 {
        self.last_schedule_compaction_millis.load(Ordering::Relaxed)
    }
258
259    /// Update schedule compaction time to current time.
260    pub(crate) fn update_schedule_compaction_millis(&self) {
261        let now = self.time_provider.current_time_millis();
262        self.last_schedule_compaction_millis
263            .store(now, Ordering::Relaxed);
264    }
265
    /// Returns the table dir, as reported by the access layer of the region.
    pub(crate) fn table_dir(&self) -> &str {
        self.access_layer.table_dir()
    }
270
    /// Returns the path type of the region, as reported by the access layer.
    pub(crate) fn path_type(&self) -> PathType {
        self.access_layer.path_type()
    }
275
276    /// Returns whether the region is writable.
277    pub(crate) fn is_writable(&self) -> bool {
278        matches!(
279            self.manifest_ctx.state.load(),
280            RegionRoleState::Leader(RegionLeaderState::Writable)
281                | RegionRoleState::Leader(RegionLeaderState::Staging)
282        )
283    }
284
285    /// Returns whether the region is flushable.
286    pub(crate) fn is_flushable(&self) -> bool {
287        matches!(
288            self.manifest_ctx.state.load(),
289            RegionRoleState::Leader(RegionLeaderState::Writable)
290                | RegionRoleState::Leader(RegionLeaderState::Staging)
291                | RegionRoleState::Leader(RegionLeaderState::Downgrading)
292        )
293    }
294
295    /// Returns whether the region should abort index building.
296    pub(crate) fn should_abort_index(&self) -> bool {
297        matches!(
298            self.manifest_ctx.state.load(),
299            RegionRoleState::Follower
300                | RegionRoleState::Leader(RegionLeaderState::Dropping)
301                | RegionRoleState::Leader(RegionLeaderState::Truncating)
302                | RegionRoleState::Leader(RegionLeaderState::Downgrading)
303                | RegionRoleState::Leader(RegionLeaderState::Staging)
304        )
305    }
306
307    /// Returns whether the region is downgrading.
308    pub(crate) fn is_downgrading(&self) -> bool {
309        matches!(
310            self.manifest_ctx.state.load(),
311            RegionRoleState::Leader(RegionLeaderState::Downgrading)
312        )
313    }
314
315    /// Returns whether the region is in staging mode.
316    pub(crate) fn is_staging(&self) -> bool {
317        self.manifest_ctx.state.load() == RegionRoleState::Leader(RegionLeaderState::Staging)
318    }
319
320    /// Returns whether the region is entering staging mode.
321    pub(crate) fn is_enter_staging(&self) -> bool {
322        self.manifest_ctx.state.load()
323            == RegionRoleState::Leader(RegionLeaderState::EnteringStaging)
324    }
325
    /// Returns the id of this region.
    pub fn region_id(&self) -> RegionId {
        self.region_id
    }
329
    /// Returns the committed sequence reported by the version control of the region.
    pub fn find_committed_sequence(&self) -> SequenceNumber {
        self.version_control.committed_sequence()
    }
333
334    /// Returns the latest sequence that has already been persisted into SSTs.
335    ///
336    /// Incremental memtable-only reads must use a cursor greater than or equal to
337    /// this boundary; older cursors are stale because the corresponding updates may
338    /// already have been flushed out of memtables.
339    pub fn flushed_sequence(&self) -> SequenceNumber {
340        self.version_control.current().version.flushed_sequence
341    }
342
343    /// Returns whether the region is readonly.
344    pub fn is_follower(&self) -> bool {
345        self.manifest_ctx.state.load() == RegionRoleState::Follower
346    }
347
    /// Returns the current role state of the region, loaded from the manifest context.
    pub(crate) fn state(&self) -> RegionRoleState {
        self.manifest_ctx.state.load()
    }
352
    /// Sets the region role state.
    ///
    /// Delegates to the manifest context, passing the id of this region along
    /// with the requested role.
    pub(crate) fn set_role(&self, next_role: RegionRole) {
        self.manifest_ctx.set_role(next_role, self.region_id);
    }
357
358    pub(crate) fn region_role(&self) -> RegionRole {
359        match self.state() {
360            RegionRoleState::Follower => RegionRole::Follower,
361            RegionRoleState::Leader(RegionLeaderState::Staging) => RegionRole::StagingLeader,
362            RegionRoleState::Leader(RegionLeaderState::Downgrading) => {
363                RegionRole::DowngradingLeader
364            }
365            RegionRoleState::Leader(_) => RegionRole::Leader,
366        }
367    }
368
369    /// Sets the altering state.
370    /// You should call this method in the worker loop.
371    pub(crate) fn set_altering(&self) -> Result<()> {
372        self.compare_exchange_state(
373            RegionLeaderState::Writable,
374            RegionRoleState::Leader(RegionLeaderState::Altering),
375        )
376    }
377
378    /// Sets the dropping state.
379    /// You should call this method in the worker loop.
380    pub(crate) fn set_dropping(&self, expect: RegionLeaderState) -> Result<()> {
381        self.compare_exchange_state(expect, RegionRoleState::Leader(RegionLeaderState::Dropping))
382    }
383
384    /// Sets the truncating state.
385    /// You should call this method in the worker loop.
386    pub(crate) fn set_truncating(&self) -> Result<()> {
387        self.compare_exchange_state(
388            RegionLeaderState::Writable,
389            RegionRoleState::Leader(RegionLeaderState::Truncating),
390        )
391    }
392
393    /// Sets the editing state.
394    /// You should call this method in the worker loop.
395    pub(crate) fn set_editing(&self, expect: RegionLeaderState) -> Result<()> {
396        self.compare_exchange_state(expect, RegionRoleState::Leader(RegionLeaderState::Editing))
397    }
398
399    /// Sets the staging state.
400    ///
401    /// You should call this method in the worker loop.
402    /// Transitions from Writable to Staging state.
403    /// Cleans any existing staging manifests before entering staging mode.
404    pub(crate) async fn set_staging(
405        &self,
406        manager: &mut RwLockWriteGuard<'_, RegionManifestManager>,
407    ) -> Result<()> {
408        manager.store().clear_staging_manifests().await?;
409
410        self.compare_exchange_state(
411            RegionLeaderState::Writable,
412            RegionRoleState::Leader(RegionLeaderState::Staging),
413        )
414    }
415
416    /// Sets the entering staging state.
417    pub(crate) fn set_entering_staging(&self) -> Result<()> {
418        self.compare_exchange_state(
419            RegionLeaderState::Writable,
420            RegionRoleState::Leader(RegionLeaderState::EnteringStaging),
421        )
422    }
423
424    /// Exits the staging state back to writable.
425    ///
426    /// You should call this method in the worker loop.
427    /// Transitions from Staging to Writable state.
428    pub fn exit_staging(&self) -> Result<()> {
429        self.manifest_ctx.exit_staging(
430            self.region_id,
431            RegionRoleState::Leader(RegionLeaderState::Writable),
432        )
433    }
434
    /// Sets the region role state gracefully. This acquires the manifest write lock.
    ///
    /// The write lock is held for the whole call and the current state is read after
    /// the lock is acquired. Handling per requested `state`:
    /// - `Leader`: from `Staging`, exits staging through the success path that merges
    ///   staged manifests; from `Writable` it is a no-op; any other state is an error.
    /// - `StagingLeader`: from `Writable`, enters staging; from `Staging` it is a no-op;
    ///   any other state is an error.
    /// - `Follower`: from `Staging`, exits staging first and then becomes a follower;
    ///   any other leader state becomes a follower directly; a follower is a no-op.
    /// - `DowngradingLeader`: from `Staging`, exits staging first and then downgrades;
    ///   from `Writable` it downgrades directly; `Downgrading` is a no-op; any other state
    ///   only logs a warning and the call still returns `Ok(())`.
    ///
    /// If the region is `Leader(Writable)` afterwards and the manifest metadata lacks a
    /// partition expr that the current metadata has, the current metadata is written to
    /// the manifest. A failure of that write is only logged.
    ///
    /// # Errors
    /// Returns a region state error for the disallowed transitions listed above, and
    /// propagates errors from entering or exiting staging.
    pub(crate) async fn set_role_state_gracefully(
        &self,
        state: SettableRegionRoleState,
    ) -> Result<()> {
        // Hold the manifest write lock until the end of the method.
        let mut manager: RwLockWriteGuard<'_, RegionManifestManager> =
            self.manifest_ctx.manifest_manager.write().await;
        // Snapshot of the state taken under the lock; all branches below match on it.
        let current_state = self.state();

        match state {
            SettableRegionRoleState::Leader => {
                // Exit staging mode and return to normal writable leader
                // Only allowed from staging state
                match current_state {
                    RegionRoleState::Leader(RegionLeaderState::Staging) => {
                        info!("Exiting staging mode for region {}", self.region_id);
                        // Use the success exit path that merges all staged manifests
                        self.exit_staging_on_success(&mut manager).await?;
                    }
                    RegionRoleState::Leader(RegionLeaderState::Writable) => {
                        // Already in desired state - no-op
                        info!("Region {} already in normal leader mode", self.region_id);
                    }
                    _ => {
                        // Only staging -> leader transition is allowed
                        return Err(RegionStateSnafu {
                            region_id: self.region_id,
                            state: current_state,
                            expect: RegionRoleState::Leader(RegionLeaderState::Staging),
                        }
                        .build());
                    }
                }
            }

            SettableRegionRoleState::StagingLeader => {
                // Enter staging mode from normal writable leader
                // Only allowed from writable leader state
                match current_state {
                    RegionRoleState::Leader(RegionLeaderState::Writable) => {
                        info!("Entering staging mode for region {}", self.region_id);
                        self.set_staging(&mut manager).await?;
                    }
                    RegionRoleState::Leader(RegionLeaderState::Staging) => {
                        // Already in desired state - no-op
                        info!("Region {} already in staging mode", self.region_id);
                    }
                    _ => {
                        // Only writable -> staging transition is allowed
                        return Err(RegionStateSnafu {
                            region_id: self.region_id,
                            state: current_state,
                            expect: RegionRoleState::Leader(RegionLeaderState::Writable),
                        }
                        .build());
                    }
                }
            }

            SettableRegionRoleState::Follower => {
                // Make this region a follower
                match current_state {
                    RegionRoleState::Leader(RegionLeaderState::Staging) => {
                        info!(
                            "Exiting staging and demoting region {} to follower",
                            self.region_id
                        );
                        // This uses `exit_staging()`, not the merging success path used
                        // for the `Leader` target.
                        self.exit_staging()?;
                        self.set_role(RegionRole::Follower);
                    }
                    RegionRoleState::Leader(_) => {
                        info!("Demoting region {} from leader to follower", self.region_id);
                        self.set_role(RegionRole::Follower);
                    }
                    RegionRoleState::Follower => {
                        // Already in desired state - no-op
                        info!("Region {} already in follower mode", self.region_id);
                    }
                }
            }

            SettableRegionRoleState::DowngradingLeader => {
                // downgrade this region to downgrading leader
                match current_state {
                    RegionRoleState::Leader(RegionLeaderState::Staging) => {
                        info!(
                            "Exiting staging and entering downgrade for region {}",
                            self.region_id
                        );
                        self.exit_staging()?;
                        self.set_role(RegionRole::DowngradingLeader);
                    }
                    RegionRoleState::Leader(RegionLeaderState::Writable) => {
                        info!("Starting downgrade for region {}", self.region_id);
                        self.set_role(RegionRole::DowngradingLeader);
                    }
                    RegionRoleState::Leader(RegionLeaderState::Downgrading) => {
                        // Already in desired state - no-op
                        info!("Region {} already in downgrading mode", self.region_id);
                    }
                    _ => {
                        // Not an error: the request is ignored and the method still
                        // returns `Ok(())` for states that cannot start a downgrade.
                        warn!(
                            "Cannot start downgrade for region {} from state {:?}",
                            self.region_id, current_state
                        );
                    }
                }
            }
        }

        // Hack(zhongzc): If we have just become leader (writable), persist any backfilled metadata.
        // The state is re-read here because the branches above may have changed it.
        if self.state() == RegionRoleState::Leader(RegionLeaderState::Writable) {
            // Persist backfilled metadata if manifest is missing fields (e.g., partition_expr)
            let manifest_meta = &manager.manifest().metadata;
            let current_version = self.version();
            let current_meta = &current_version.metadata;
            if manifest_meta.partition_expr.is_none() && current_meta.partition_expr.is_some() {
                let action = RegionMetaAction::Change(RegionChange {
                    metadata: current_meta.clone(),
                    sst_format: current_version.options.sst_format.unwrap_or_default(),
                    append_mode: None,
                });
                let result = manager
                    .update(RegionMetaActionList::with_action(action), false)
                    .await;

                // Best effort: a failure to persist is logged and does not fail the role change.
                match result {
                    Ok(version) => {
                        info!(
                            "Successfully persisted backfilled metadata for region {}, version: {}",
                            self.region_id, version
                        );
                    }
                    Err(e) => {
                        warn!(e; "Failed to persist backfilled metadata for region {}", self.region_id);
                    }
                }
            }
        }

        // Release the manifest write lock explicitly.
        drop(manager);

        Ok(())
    }
578
579    /// Switches the region state to `RegionRoleState::Leader(RegionLeaderState::Writable)` if the current state is `expect`.
580    /// Otherwise, logs an error.
581    pub(crate) fn switch_state_to_writable(&self, expect: RegionLeaderState) {
582        if let Err(e) = self
583            .compare_exchange_state(expect, RegionRoleState::Leader(RegionLeaderState::Writable))
584        {
585            error!(e; "failed to switch region state to writable, expect state is {:?}", expect);
586        }
587    }
588
589    /// Switches the region state to `RegionRoleState::Leader(RegionLeaderState::Staging)` if the current state is `expect`.
590    /// Otherwise, logs an error.
591    pub(crate) fn switch_state_to_staging(&self, expect: RegionLeaderState) {
592        if let Err(e) =
593            self.compare_exchange_state(expect, RegionRoleState::Leader(RegionLeaderState::Staging))
594        {
595            error!(e; "failed to switch region state to staging, expect state is {:?}", expect);
596        }
597    }
598
599    /// Returns the region statistic.
600    pub(crate) fn region_statistic(&self) -> RegionStatistic {
601        let version = self.version();
602        let memtables = &version.memtables;
603        let memtable_usage = (memtables.mutable_usage() + memtables.immutables_usage()) as u64;
604
605        let sst_usage = version.ssts.owned_sst_usage(self.region_id);
606        let index_usage = version.ssts.owned_index_usage(self.region_id);
607        let flushed_entry_id = version.flushed_entry_id;
608
609        let wal_usage = self.estimated_wal_usage(memtable_usage);
610        let manifest_usage = self.stats.total_manifest_size();
611        let num_rows = version.ssts.owned_num_rows(self.region_id) + version.memtables.num_rows();
612        let num_files = version.ssts.owned_num_files(self.region_id);
613        let manifest_version = self.stats.manifest_version();
614        let file_removed_cnt = self.stats.file_removed_cnt();
615
616        let topic_latest_entry_id = self.topic_latest_entry_id.load(Ordering::Relaxed);
617        let written_bytes = self.written_bytes.load(Ordering::Relaxed);
618
619        RegionStatistic {
620            num_rows,
621            memtable_size: memtable_usage,
622            wal_size: wal_usage,
623            manifest_size: manifest_usage,
624            sst_size: sst_usage,
625            sst_num: num_files,
626            index_size: index_usage,
627            manifest: RegionManifestInfo::Mito {
628                manifest_version,
629                flushed_entry_id,
630                file_removed_cnt,
631            },
632            data_topic_latest_entry_id: topic_latest_entry_id,
633            metadata_topic_latest_entry_id: topic_latest_entry_id,
634            written_bytes,
635        }
636    }
637
638    /// Estimated WAL size in bytes.
639    /// Use the memtables size to estimate the size of wal.
640    fn estimated_wal_usage(&self, memtable_usage: u64) -> u64 {
641        ((memtable_usage as f32) * ESTIMATED_WAL_FACTOR) as u64
642    }
643
644    /// Sets the state of the region to given state if the current state equals to
645    /// the expected.
646    fn compare_exchange_state(
647        &self,
648        expect: RegionLeaderState,
649        state: RegionRoleState,
650    ) -> Result<()> {
651        self.manifest_ctx
652            .state
653            .compare_exchange(RegionRoleState::Leader(expect), state)
654            .map_err(|actual| {
655                RegionStateSnafu {
656                    region_id: self.region_id,
657                    state: actual,
658                    expect: RegionRoleState::Leader(expect),
659                }
660                .build()
661            })?;
662        Ok(())
663    }
664
    /// Returns a clone of the access layer reference of this region.
    pub fn access_layer(&self) -> AccessLayerRef {
        self.access_layer.clone()
    }
668
669    /// Returns the region info entry of the region.
670    pub(crate) fn region_info_entry(&self, node_id: Option<u64>) -> RegionInfoEntry {
671        let region_id = self.region_id;
672        let version = self.version();
673        let state = self.state();
674        let role = self.region_role();
675        let region_options = serde_json::to_string(&version.options)
676            .unwrap_or_else(|err| serde_json::json!({ "error": err.to_string() }).to_string());
677        let sst_format = match version.options.sst_format.unwrap_or_default() {
678            crate::sst::FormatType::PrimaryKey => "primary_key",
679            crate::sst::FormatType::Flat => "flat",
680        }
681        .to_string();
682
683        RegionInfoEntry {
684            region_id,
685            table_id: region_id.table_id(),
686            region_number: region_id.region_number(),
687            region_group: region_id.region_group(),
688            region_sequence: region_id.region_sequence(),
689            state: state.as_str().to_string(),
690            role: role.to_string(),
691            writable: self.is_writable(),
692            committed_sequence: self.find_committed_sequence(),
693            flushed_sequence: Some(self.flushed_sequence()).filter(|sequence| *sequence > 0),
694            manifest_version: self.stats.manifest_version(),
695            compaction_time_window: version
696                .compaction_time_window
697                .map(|duration| humantime::format_duration(duration).to_string()),
698            region_options,
699            sst_format,
700            node_id,
701        }
702    }
703
704    /// Returns the SST entries of the region.
705    pub async fn manifest_sst_entries(&self) -> Vec<ManifestSstEntry> {
706        let table_dir = self.table_dir();
707        let path_type = self.access_layer.path_type();
708
709        let visible_ssts = self
710            .version()
711            .ssts
712            .levels()
713            .iter()
714            .flat_map(|level| level.files().map(|file| file.file_id().file_id()))
715            .collect::<HashSet<_>>();
716
717        let manifest_files = self.manifest_ctx.manifest().await.files.clone();
718        let staging_files = self
719            .manifest_ctx
720            .staging_manifest()
721            .await
722            .map(|m| m.files.clone())
723            .unwrap_or_default();
724        let files = manifest_files
725            .into_iter()
726            .chain(staging_files)
727            .collect::<HashMap<_, _>>();
728
729        files
730            .values()
731            .map(|meta| {
732                let region_id = self.region_id;
733                let origin_region_id = meta.region_id;
734                let (index_version, index_file_path, index_file_size) = if meta.index_file_size > 0
735                {
736                    let index_file_path = index_file_path(table_dir, meta.index_id(), path_type);
737                    (
738                        meta.index_version,
739                        Some(index_file_path),
740                        Some(meta.index_file_size),
741                    )
742                } else {
743                    (0, None, None)
744                };
745                let visible = visible_ssts.contains(&meta.file_id);
746                ManifestSstEntry {
747                    table_dir: table_dir.to_string(),
748                    region_id,
749                    table_id: region_id.table_id(),
750                    region_number: region_id.region_number(),
751                    region_group: region_id.region_group(),
752                    region_sequence: region_id.region_sequence(),
753                    file_id: meta.file_id.to_string(),
754                    index_version,
755                    level: meta.level,
756                    file_path: sst_file_path(table_dir, meta.file_id(), path_type),
757                    file_size: meta.file_size,
758                    index_file_path,
759                    index_file_size,
760                    num_rows: meta.num_rows,
761                    num_row_groups: meta.num_row_groups,
762                    num_series: Some(meta.num_series),
763                    min_ts: meta.time_range.0,
764                    max_ts: meta.time_range.1,
765                    sequence: meta.sequence.map(|s| s.get()),
766                    origin_region_id,
767                    node_id: None,
768                    visible,
769                    primary_key_min: meta.primary_key_min.clone(),
770                    primary_key_max: meta.primary_key_max.clone(),
771                }
772            })
773            .collect()
774    }
775
776    /// Returns the file metas of the region by file ids.
777    pub async fn file_metas(&self, file_ids: &[FileId]) -> Vec<Option<FileMeta>> {
778        let manifest_files = self.manifest_ctx.manifest().await.files.clone();
779
780        file_ids
781            .iter()
782            .map(|file_id| manifest_files.get(file_id).cloned())
783            .collect::<Vec<_>>()
784    }
785
    /// Exit staging mode successfully by merging all staged manifests and making them visible.
    ///
    /// The caller passes in the write guard of the manifest manager; all manifest operations
    /// in this function go through that guard.
    ///
    /// Steps:
    /// 1. Verify the region is currently a staging leader.
    /// 2. Merge the staged manifest actions. If there is nothing to merge, only the staging
    ///    state is exited.
    /// 3. Validate the merged actions: exactly one of a change action or a partition expr
    ///    change action, plus an edit action. A change action must not alter column metadata.
    /// 4. Write the merged actions to the normal (non-staging) manifest.
    /// 5. Apply the metadata change and the edit to the in-memory version control.
    /// 6. Clear the staging manifest and exit the staging state.
    ///
    /// # Errors
    ///
    /// Returns an error if the region is not in the staging state, if merging or updating the
    /// manifest fails, if the merged actions do not satisfy the checks above, or if exiting the
    /// staging state fails. A failure to clear the staging manifest directory is only logged.
    pub(crate) async fn exit_staging_on_success(
        &self,
        manager: &mut RwLockWriteGuard<'_, RegionManifestManager>,
    ) -> Result<()> {
        let current_state = self.manifest_ctx.current_state();
        ensure!(
            current_state == RegionRoleState::Leader(RegionLeaderState::Staging),
            RegionStateSnafu {
                region_id: self.region_id,
                state: current_state,
                expect: RegionRoleState::Leader(RegionLeaderState::Staging),
            }
        );

        // Merge all staged manifest actions
        let merged_actions = match manager.merge_staged_actions(current_state).await? {
            Some(actions) => actions,
            None => {
                info!(
                    "No staged manifests to merge for region {}, exiting staging mode without changes",
                    self.region_id
                );
                // Even if no manifests to merge, we still need to exit staging mode
                self.exit_staging()?;
                return Ok(());
            }
        };
        // Classify the merged actions before consuming them, so the shape of the list can be
        // validated up front.
        let expect_change = merged_actions.actions.iter().any(|a| a.is_change());
        let expect_partition_expr_change = merged_actions
            .actions
            .iter()
            .any(|a| a.is_partition_expr_change());
        let expect_edit = merged_actions.actions.iter().any(|a| a.is_edit());
        // A change and a partition expr change are mutually exclusive ...
        ensure!(
            !(expect_change && expect_partition_expr_change),
            UnexpectedSnafu {
                reason: "unexpected both change and partition expr change actions in merged actions"
            }
        );
        // ... but one of them must be present ...
        ensure!(
            expect_change || expect_partition_expr_change,
            UnexpectedSnafu {
                reason: "expect a change or partition expr change action in merged actions"
            }
        );
        // ... together with an edit.
        ensure!(
            expect_edit,
            UnexpectedSnafu {
                reason: "expect an edit action in merged actions"
            }
        );

        // Split a clone: the original list is still needed for the manifest update below.
        let (merged_partition_expr_change, merged_change, merged_edit) =
            merged_actions.clone().split_region_change_and_edit();
        if let Some(change) = &merged_change {
            // In staging exit we only allow metadata-only updates. A `Change`
            // action is accepted only when column definitions are unchanged;
            // otherwise it is treated as a schema change and rejected.
            let current_column_metadatas = &self.version().metadata.column_metadatas;
            ensure!(
                change.metadata.column_metadatas == *current_column_metadatas,
                UnexpectedSnafu {
                    reason: "change action alters column metadata in staging exit"
                }
            );
        }

        // Submit merged actions using the manifest manager's update method
        // Pass the `false` so it saves to normal directory, not staging
        let new_version = manager.update(merged_actions, false).await?;
        info!(
            "Successfully submitted merged staged manifests for region {}, new version: {}",
            self.region_id, new_version
        );

        // Apply the merged changes to in-memory version control
        if let Some(change) = merged_partition_expr_change {
            // Only the partition expr differs: rebuild the metadata from the current one.
            let mut new_metadata = self.version().metadata.as_ref().clone();
            new_metadata.set_partition_expr(change.partition_expr);
            self.version_control.alter_metadata(new_metadata.into());
        }
        if let Some(change) = merged_change {
            self.version_control.alter_metadata(change.metadata);
        }
        self.version_control
            .apply_edit(Some(merged_edit), &[], self.file_purger.clone());

        // Clear all staging manifests and transit state
        // A cleanup failure is logged only; the merged manifest has already been written.
        if let Err(e) = manager.clear_staging_manifest_and_dir().await {
            error!(e; "Failed to clear staging manifest dir for region {}", self.region_id);
        }
        self.exit_staging()?;

        Ok(())
    }
882
883    /// Returns the partition expression string for this region.
884    ///
885    /// If the region is currently in staging state, this returns the partition expression held in
886    /// the staging partition field. Otherwise, it returns the partition expression from the primary
887    /// region metadata (current committed version).
888    pub fn maybe_staging_partition_expr_str(&self) -> Option<String> {
889        let is_staging = self.is_staging();
890        if is_staging {
891            let staging_partition_info = self.manifest_ctx.staging_partition_info();
892            if staging_partition_info.is_none() {
893                warn!(
894                    "Staging partition expr is none for region {} in staging state",
895                    self.region_id
896                );
897            }
898            staging_partition_info
899                .as_ref()
900                .and_then(|info| info.partition_expr().map(ToString::to_string))
901        } else {
902            let version = self.version();
903            version.metadata.partition_expr.clone()
904        }
905    }
906
907    pub fn expected_partition_expr_version(&self) -> u64 {
908        if self.is_staging() {
909            self.manifest_ctx
910                .staging_partition_info()
911                .as_ref()
912                .map(|info| info.partition_rule_version)
913                .unwrap_or_default()
914        } else {
915            self.version().metadata.partition_expr_version
916        }
917    }
918
919    /// Returns whether writes should be rejected for this region in staging mode.
920    pub(crate) fn reject_all_writes_in_staging(&self) -> bool {
921        if !self.is_staging() {
922            return false;
923        }
924        self.manifest_ctx
925            .staging_partition_info()
926            .as_ref()
927            .map(|info| {
928                matches!(
929                    info.partition_directive,
930                    StagingPartitionDirective::RejectAllWrites
931                )
932            })
933            .unwrap_or(false)
934    }
935}
936
/// Context to update the region manifest.
///
/// It keeps the manifest manager together with the region role state, so the state can be
/// checked while the manager's write lock is held.
#[derive(Debug)]
pub(crate) struct ManifestContext {
    /// Manager to maintain manifest for this region, guarded by an async read-write lock.
    pub(crate) manifest_manager: tokio::sync::RwLock<RegionManifestManager>,
    /// The state of the region. The region checks the state before updating
    /// manifest.
    state: AtomicCell<RegionRoleState>,
    /// Partition info of the region in staging mode.
    ///
    /// During the staging mode, the region metadata in [`VersionControlRef`] is not updated,
    /// so we need to store the partition info separately.
    ///
    /// It starts as `None` and is reset to `None` whenever the staging state is exited
    /// through [`ManifestContext::exit_staging`].
    staging_partition_info: Mutex<Option<StagingPartitionInfo>>,
}
951
952impl ManifestContext {
953    pub(crate) fn new(manager: RegionManifestManager, state: RegionRoleState) -> Self {
954        ManifestContext {
955            manifest_manager: tokio::sync::RwLock::new(manager),
956            state: AtomicCell::new(state),
957            staging_partition_info: Mutex::new(None),
958        }
959    }
960
961    pub(crate) fn staging_partition_info(&self) -> Option<StagingPartitionInfo> {
962        self.staging_partition_info.lock().unwrap().clone()
963    }
964
965    pub(crate) fn set_staging_partition_info(&self, staging_partition_info: StagingPartitionInfo) {
966        let mut current = self.staging_partition_info.lock().unwrap();
967        debug_assert!(current.is_none());
968        *current = Some(staging_partition_info);
969    }
970
971    fn clear_staging_partition_info(&self) {
972        *self.staging_partition_info.lock().unwrap() = None;
973    }
974
975    pub(crate) fn exit_staging(
976        &self,
977        region_id: RegionId,
978        next_state: RegionRoleState,
979    ) -> Result<()> {
980        self.state
981            .compare_exchange(
982                RegionRoleState::Leader(RegionLeaderState::Staging),
983                next_state,
984            )
985            .map_err(|actual| {
986                RegionStateSnafu {
987                    region_id,
988                    state: actual,
989                    expect: RegionRoleState::Leader(RegionLeaderState::Staging),
990                }
991                .build()
992            })?;
993        self.clear_staging_partition_info();
994        Ok(())
995    }
996
997    pub(crate) async fn manifest_version(&self) -> ManifestVersion {
998        self.manifest_manager
999            .read()
1000            .await
1001            .manifest()
1002            .manifest_version
1003    }
1004
1005    pub(crate) async fn has_update(&self) -> Result<bool> {
1006        self.manifest_manager.read().await.has_update().await
1007    }
1008
    /// Returns the current region role state.
    ///
    /// This is a single atomic load; the state may be changed by another caller right
    /// after it is read.
    pub(crate) fn current_state(&self) -> RegionRoleState {
        self.state.load()
    }
1013
1014    /// Installs the manifest changes from the current version to the target version (inclusive).
1015    ///
1016    /// Returns installed [RegionManifest].
1017    /// **Note**: This function is not guaranteed to install the target version strictly.
1018    /// The installed version may be greater than the target version.
1019    pub(crate) async fn install_manifest_to(
1020        &self,
1021        version: ManifestVersion,
1022    ) -> Result<Arc<RegionManifest>> {
1023        let mut manager = self.manifest_manager.write().await;
1024        manager.install_manifest_to(version).await?;
1025
1026        Ok(manager.manifest())
1027    }
1028
1029    /// Updates the manifest if current state is `expect_state`.
1030    pub(crate) async fn update_manifest(
1031        &self,
1032        expect_state: RegionLeaderState,
1033        action_list: RegionMetaActionList,
1034        is_staging: bool,
1035    ) -> Result<ManifestVersion> {
1036        self.update_manifest_with_state_check(action_list, is_staging, |current_state, region_id| {
1037            // If expect_state is not downgrading, the current state must be either `expect_state` or downgrading.
1038            //
1039            // A downgrading leader rejects user writes but still allows
1040            // flushing the memtable and updating the manifest.
1041            if expect_state != RegionLeaderState::Downgrading {
1042                if current_state == RegionRoleState::Leader(RegionLeaderState::Downgrading) {
1043                    info!(
1044                        "Region {} is in downgrading leader state, updating manifest. Expect state is {:?}",
1045                        region_id, expect_state
1046                    );
1047                }
1048                ensure!(
1049                    current_state == RegionRoleState::Leader(expect_state)
1050                        || current_state == RegionRoleState::Leader(RegionLeaderState::Downgrading),
1051                    UpdateManifestSnafu {
1052                        region_id,
1053                        state: current_state,
1054                    }
1055                );
1056            } else {
1057                ensure!(
1058                    current_state == RegionRoleState::Leader(expect_state),
1059                    RegionStateSnafu {
1060                        region_id,
1061                        state: current_state,
1062                        expect: RegionRoleState::Leader(expect_state),
1063                    }
1064                );
1065            }
1066
1067            Ok(())
1068        })
1069        .await
1070    }
1071
1072    /// Updates the manifest for compaction.
1073    ///
1074    /// Compaction may finish while a direct external region edit is in the transient
1075    /// `Editing` state. Direct external edits can remove files both when followers
1076    /// apply sync-region metadata and when a writable leader performs a direct edit
1077    /// such as `edit_region()`. Allowing compaction to publish in `Editing` is still
1078    /// safe because publication happens under the manifest write lock and compaction
1079    /// rechecks that its input files are still valid before committing.
1080    ///
1081    /// This intentionally writes to the normal manifest path (`is_staging = false`).
1082    /// Entering staging cancels or waits for active compactions before switching the
1083    /// region to `Staging`, so a compaction that started before staging still finishes
1084    /// against the normal manifest. Even if a manual compaction is requested while the
1085    /// region is already staging, compaction only sees SSTs in the normal visible
1086    /// region version; SSTs from staging manifests are not applied to region version
1087    /// control until staging exits successfully.
1088    pub(crate) async fn update_manifest_for_compaction(
1089        &self,
1090        action_list: RegionMetaActionList,
1091    ) -> Result<ManifestVersion> {
1092        self.update_manifest_with_state_check(action_list, false, |current_state, region_id| {
1093            ensure!(
1094                matches!(
1095                    current_state,
1096                    RegionRoleState::Leader(RegionLeaderState::Writable)
1097                        | RegionRoleState::Leader(RegionLeaderState::Editing)
1098                        | RegionRoleState::Leader(RegionLeaderState::Downgrading)
1099                ),
1100                UpdateManifestSnafu {
1101                    region_id,
1102                    state: current_state,
1103                }
1104            );
1105
1106            Ok(())
1107        })
1108        .await
1109    }
1110
    /// Updates the manifest with `action_list` after `check_state` accepts the current state.
    ///
    /// Everything runs under the write lock of the manifest manager:
    /// 1. `check_state` is called with the current role state and the region id taken from
    ///    the manifest; its error aborts the update.
    /// 2. If the manifest records a truncation, every edit action is validated against it so
    ///    that stale flush or compaction results are rejected with a region-truncated error.
    /// 3. The action list is written through the manager; `is_staging` is forwarded to it.
    ///
    /// Returns the manifest version reported by the manager. A warning is logged when the
    /// region turns out to be a follower after the update.
    async fn update_manifest_with_state_check(
        &self,
        action_list: RegionMetaActionList,
        is_staging: bool,
        check_state: impl FnOnce(RegionRoleState, RegionId) -> Result<()>,
    ) -> Result<ManifestVersion> {
        // Acquires the write lock of the manifest manager.
        let mut manager = self.manifest_manager.write().await;
        // Gets current manifest.
        let manifest = manager.manifest();
        // Checks state inside the lock. This is to ensure that we won't update the manifest
        // after `set_readonly_gracefully()` is called.
        let current_state = self.state.load();
        check_state(current_state, manifest.metadata.region_id)?;

        for action in &action_list.actions {
            // Checks whether the edit is still applicable.
            let RegionMetaAction::Edit(edit) = &action else {
                continue;
            };

            // Checks whether the region is truncated.
            // NOTE(review): when the region was never truncated, the input-file check for
            // compaction edits below is skipped as well — confirm this is intended.
            let Some(truncated_entry_id) = manifest.truncated_entry_id else {
                continue;
            };

            // This is an edit from flush.
            if let Some(flushed_entry_id) = edit.flushed_entry_id {
                // A flush edit is valid after truncate in two cases:
                // 1. `flushed_entry_id` moves past `truncated_entry_id`, meaning it definitely
                //    flushed data newer than the truncate point.
                // 2. `flushed_entry_id` equals `truncated_entry_id`, but `flushed_sequence`
                //    increases. This happens in skip-WAL tables where entry id can stay at 0,
                //    while sequence still advances for post-truncate writes.
                //
                // We still reject stale flushes from before truncate:
                // if entry id is equal and sequence does not advance, the flush is outdated.
                let is_newer_entry = truncated_entry_id < flushed_entry_id;
                let is_same_entry_with_newer_sequence = truncated_entry_id == flushed_entry_id
                    && edit.flushed_sequence.is_some_and(|flushed_sequence| {
                        manifest.flushed_sequence < flushed_sequence
                    });

                ensure!(
                    is_newer_entry || is_same_entry_with_newer_sequence,
                    RegionTruncatedSnafu {
                        region_id: manifest.metadata.region_id,
                    }
                );
            }

            // This is an edit from compaction.
            if !edit.files_to_remove.is_empty() {
                // Input files of the compaction task has been truncated.
                // Every file the edit wants to remove must still be in the manifest.
                for file in &edit.files_to_remove {
                    ensure!(
                        manifest.files.contains_key(&file.file_id),
                        RegionTruncatedSnafu {
                            region_id: manifest.metadata.region_id,
                        }
                    );
                }
            }
        }

        // Now we can update the manifest.
        let version = manager.update(action_list, is_staging).await.inspect_err(
            |e| error!(e; "Failed to update manifest, region_id: {}", manifest.metadata.region_id),
        )?;

        // The state is re-read after the update: the role may have changed while the
        // manifest was being written. This is reported but does not fail the update.
        if self.state.load() == RegionRoleState::Follower {
            warn!(
                "Region {} becomes follower while updating manifest which may cause inconsistency, manifest version: {version}",
                manifest.metadata.region_id
            );
        }

        Ok(version)
    }
1190
    /// Sets the [`RegionRole`].
    ///
    /// ```text
    ///                  +---------------------+
    ///                  |   Staging Leader    |
    ///                  +----------+----------+
    ///                             |
    ///                             v
    ///     +----------+     +------+-------+     +-------------+
    ///     | Follower | <-> |    Leader    | <-> | Downgrading |
    ///     +-----+----+     +------+-------+     +------+------+
    ///           ^                 ^                    |
    ///           +-----------------+--------------------+
    ///
    /// ```
    ///
    /// # State Transitions
    ///
    /// From `Follower`:
    /// - `Follower -> Leader`
    ///
    /// From `Leader`:
    /// - `Leader -> Follower`
    /// - `Leader -> Downgrading Leader`
    ///
    /// From `Staging Leader`:
    /// - `Staging Leader -> Leader`
    /// - `Staging Leader -> Follower`
    /// - `Staging Leader -> Downgrading Leader`
    ///
    /// From `Downgrading Leader`:
    /// - `Downgrading Leader -> Leader`
    /// - `Downgrading Leader -> Follower`
    ///
    /// # Behavior
    ///
    /// For every target role except `StagingLeader`, the function first tries to leave the
    /// staging state through [`ManifestContext::exit_staging`], which also clears the staging
    /// partition info. If the region is not staging, it falls back to a direct atomic
    /// transition. A request for `StagingLeader` is ignored and only logged.
    ///
    /// The function never returns an error: a transition that is not allowed from the
    /// current state is reported with a warning, and a request for the state the region is
    /// already in is silently ignored.
    pub(crate) fn set_role(&self, next_role: RegionRole, region_id: RegionId) {
        match next_role {
            RegionRole::Follower => {
                // Staging leader: leave staging and become a follower in one step.
                if self
                    .exit_staging(region_id, RegionRoleState::Follower)
                    .is_ok()
                {
                    info!(
                        "Convert region {} to follower, previous role state: {:?}",
                        region_id,
                        RegionRoleState::Leader(RegionLeaderState::Staging)
                    );
                    return;
                }
                // Any state other than follower can be converted to follower.
                match self.state.fetch_update(|state| {
                    if !matches!(state, RegionRoleState::Follower) {
                        Some(RegionRoleState::Follower)
                    } else {
                        None
                    }
                }) {
                    Ok(state) => info!(
                        "Convert region {} to follower, previous role state: {:?}",
                        region_id, state
                    ),
                    Err(state) => {
                        if state != RegionRoleState::Follower {
                            warn!(
                                "Failed to convert region {} to follower, current role state: {:?}",
                                region_id, state
                            )
                        }
                    }
                }
            }
            RegionRole::Leader => {
                // Staging leader: leave staging and become a writable leader in one step.
                if self
                    .exit_staging(
                        region_id,
                        RegionRoleState::Leader(RegionLeaderState::Writable),
                    )
                    .is_ok()
                {
                    info!(
                        "Convert region {} to leader, previous role state: {:?}",
                        region_id,
                        RegionRoleState::Leader(RegionLeaderState::Staging)
                    );
                    return;
                }
                // Only a follower or a downgrading leader is converted to a writable leader.
                match self.state.fetch_update(|state| {
                    if matches!(
                        state,
                        RegionRoleState::Follower
                            | RegionRoleState::Leader(RegionLeaderState::Downgrading)
                    ) {
                        Some(RegionRoleState::Leader(RegionLeaderState::Writable))
                    } else {
                        None
                    }
                }) {
                    Ok(state) => info!(
                        "Convert region {} to leader, previous role state: {:?}",
                        region_id, state
                    ),
                    Err(state) => {
                        // Already writable: nothing to report.
                        if state != RegionRoleState::Leader(RegionLeaderState::Writable) {
                            warn!(
                                "Failed to convert region {} to leader, current role state: {:?}",
                                region_id, state
                            )
                        }
                    }
                }
            }
            RegionRole::StagingLeader => {
                info!(
                    "Ignore direct conversion of region {} to staging leader; staging requires the dedicated workflow",
                    region_id
                );
            }
            RegionRole::DowngradingLeader => {
                // Staging leader: leave staging and start downgrading in one step.
                if self
                    .exit_staging(
                        region_id,
                        RegionRoleState::Leader(RegionLeaderState::Downgrading),
                    )
                    .is_ok()
                {
                    info!(
                        "Convert region {} to downgrading region, previous role state: {:?}",
                        region_id,
                        RegionRoleState::Leader(RegionLeaderState::Staging)
                    );
                    return;
                }
                // Only a writable leader is converted to a downgrading leader.
                match self.state.compare_exchange(
                    RegionRoleState::Leader(RegionLeaderState::Writable),
                    RegionRoleState::Leader(RegionLeaderState::Downgrading),
                ) {
                    Ok(state) => info!(
                        "Convert region {} to downgrading region, previous role state: {:?}",
                        region_id, state
                    ),
                    Err(state) => {
                        // Already downgrading: nothing to report.
                        if state != RegionRoleState::Leader(RegionLeaderState::Downgrading) {
                            warn!(
                                "Failed to convert region {} to downgrading leader, current role state: {:?}",
                                region_id, state
                            )
                        }
                    }
                }
            }
        }
    }
1340
1341    /// Returns the normal manifest of the region.
1342    pub(crate) async fn manifest(&self) -> Arc<crate::manifest::action::RegionManifest> {
1343        self.manifest_manager.read().await.manifest()
1344    }
1345
1346    /// Returns the staging manifest of the region.
1347    pub(crate) async fn staging_manifest(
1348        &self,
1349    ) -> Option<Arc<crate::manifest::action::RegionManifest>> {
1350        self.manifest_manager.read().await.staging_manifest()
1351    }
1352}
1353
/// Shared, reference-counted handle to a [`ManifestContext`].
pub(crate) type ManifestContextRef = Arc<ManifestContext>;
1355
/// Regions indexed by ids.
///
/// The map is guarded by a blocking [`RwLock`]; its accessors unwrap the lock result and
/// therefore panic if the lock is poisoned.
#[derive(Debug, Default)]
pub(crate) struct RegionMap {
    /// Region id to region handle.
    regions: RwLock<HashMap<RegionId, MitoRegionRef>>,
}
1361
1362impl RegionMap {
1363    /// Returns true if the region exists.
1364    pub(crate) fn is_region_exists(&self, region_id: RegionId) -> bool {
1365        let regions = self.regions.read().unwrap();
1366        regions.contains_key(&region_id)
1367    }
1368
1369    /// Inserts a new region into the map.
1370    pub(crate) fn insert_region(&self, region: MitoRegionRef) {
1371        let mut regions = self.regions.write().unwrap();
1372        regions.insert(region.region_id, region);
1373    }
1374
1375    /// Gets region by region id.
1376    pub(crate) fn get_region(&self, region_id: RegionId) -> Option<MitoRegionRef> {
1377        let regions = self.regions.read().unwrap();
1378        regions.get(&region_id).cloned()
1379    }
1380
1381    /// Gets writable region by region id.
1382    ///
1383    /// Returns error if the region does not exist or is readonly.
1384    pub(crate) fn writable_region(&self, region_id: RegionId) -> Result<MitoRegionRef> {
1385        let region = self
1386            .get_region(region_id)
1387            .context(RegionNotFoundSnafu { region_id })?;
1388        ensure!(
1389            region.is_writable(),
1390            RegionStateSnafu {
1391                region_id,
1392                state: region.state(),
1393                expect: RegionRoleState::Leader(RegionLeaderState::Writable),
1394            }
1395        );
1396        Ok(region)
1397    }
1398
1399    /// Gets readonly region by region id.
1400    ///
1401    /// Returns error if the region does not exist or is writable.
1402    pub(crate) fn follower_region(&self, region_id: RegionId) -> Result<MitoRegionRef> {
1403        let region = self
1404            .get_region(region_id)
1405            .context(RegionNotFoundSnafu { region_id })?;
1406        ensure!(
1407            region.is_follower(),
1408            RegionStateSnafu {
1409                region_id,
1410                state: region.state(),
1411                expect: RegionRoleState::Follower,
1412            }
1413        );
1414
1415        Ok(region)
1416    }
1417
1418    /// Gets region by region id.
1419    ///
1420    /// Calls the callback if the region does not exist.
1421    pub(crate) fn get_region_or<F: OnFailure>(
1422        &self,
1423        region_id: RegionId,
1424        cb: &mut F,
1425    ) -> Option<MitoRegionRef> {
1426        match self
1427            .get_region(region_id)
1428            .context(RegionNotFoundSnafu { region_id })
1429        {
1430            Ok(region) => Some(region),
1431            Err(e) => {
1432                cb.on_failure(e);
1433                None
1434            }
1435        }
1436    }
1437
1438    /// Gets writable region by region id.
1439    ///
1440    /// Calls the callback if the region does not exist or is readonly.
1441    pub(crate) fn writable_region_or<F: OnFailure>(
1442        &self,
1443        region_id: RegionId,
1444        cb: &mut F,
1445    ) -> Option<MitoRegionRef> {
1446        match self.writable_region(region_id) {
1447            Ok(region) => Some(region),
1448            Err(e) => {
1449                cb.on_failure(e);
1450                None
1451            }
1452        }
1453    }
1454
1455    /// Gets writable non-staging region by region id.
1456    ///
1457    /// Returns error if the region does not exist, is readonly, or is in staging mode.
1458    pub(crate) fn writable_non_staging_region(&self, region_id: RegionId) -> Result<MitoRegionRef> {
1459        let region = self.writable_region(region_id)?;
1460        if region.is_staging() {
1461            return Err(crate::error::RegionStateSnafu {
1462                region_id,
1463                state: region.state(),
1464                expect: RegionRoleState::Leader(RegionLeaderState::Writable),
1465            }
1466            .build());
1467        }
1468        Ok(region)
1469    }
1470
1471    /// Gets staging region by region id.
1472    ///
1473    /// Returns error if the region does not exist or is not in staging state.
1474    pub(crate) fn staging_region(&self, region_id: RegionId) -> Result<MitoRegionRef> {
1475        let region = self
1476            .get_region(region_id)
1477            .context(RegionNotFoundSnafu { region_id })?;
1478        ensure!(
1479            region.is_staging(),
1480            RegionStateSnafu {
1481                region_id,
1482                state: region.state(),
1483                expect: RegionRoleState::Leader(RegionLeaderState::Staging),
1484            }
1485        );
1486        Ok(region)
1487    }
1488
1489    /// Gets flushable region by region id.
1490    ///
1491    /// Returns error if the region does not exist or not flushable.
1492    pub(crate) fn flushable_region(&self, region_id: RegionId) -> Result<MitoRegionRef> {
1493        let region = self
1494            .get_region(region_id)
1495            .context(RegionNotFoundSnafu { region_id })?;
1496        ensure!(
1497            region.is_flushable(),
1498            FlushableRegionStateSnafu {
1499                region_id,
1500                state: region.state(),
1501            }
1502        );
1503        Ok(region)
1504    }
1505
1506    /// Remove region by id.
1507    pub(crate) fn remove_region(&self, region_id: RegionId) -> Option<MitoRegionRef> {
1508        let mut regions = self.regions.write().unwrap();
1509        regions.remove(&region_id)
1510    }
1511
1512    /// List all regions.
1513    pub(crate) fn list_regions(&self) -> Vec<MitoRegionRef> {
1514        let regions = self.regions.read().unwrap();
1515        regions.values().cloned().collect()
1516    }
1517
1518    /// Clear the map.
1519    pub(crate) fn clear(&self) {
1520        self.regions.write().unwrap().clear();
1521    }
1522}
1523
/// Shared, reference-counted handle to a [`RegionMap`].
pub(crate) type RegionMapRef = Arc<RegionMap>;
1525
/// Opening regions
///
/// Each region id maps to the list of senders registered for that region.
#[derive(Debug, Default)]
pub(crate) struct OpeningRegions {
    /// Region id to the senders registered for it.
    regions: RwLock<HashMap<RegionId, Vec<OptionOutputTx>>>,
}
1531
1532impl OpeningRegions {
1533    /// Registers `sender` for an opening region; Otherwise, it returns `None`.
1534    pub(crate) fn wait_for_opening_region(
1535        &self,
1536        region_id: RegionId,
1537        sender: OptionOutputTx,
1538    ) -> Option<OptionOutputTx> {
1539        let mut regions = self.regions.write().unwrap();
1540        match regions.entry(region_id) {
1541            Entry::Occupied(mut senders) => {
1542                senders.get_mut().push(sender);
1543                None
1544            }
1545            Entry::Vacant(_) => Some(sender),
1546        }
1547    }
1548
1549    /// Returns true if the region exists.
1550    pub(crate) fn is_region_exists(&self, region_id: RegionId) -> bool {
1551        let regions = self.regions.read().unwrap();
1552        regions.contains_key(&region_id)
1553    }
1554
1555    /// Inserts a new region into the map.
1556    pub(crate) fn insert_sender(&self, region: RegionId, sender: OptionOutputTx) {
1557        let mut regions = self.regions.write().unwrap();
1558        regions.insert(region, vec![sender]);
1559    }
1560
1561    /// Remove region by id.
1562    pub(crate) fn remove_sender(&self, region_id: RegionId) -> Vec<OptionOutputTx> {
1563        let mut regions = self.regions.write().unwrap();
1564        regions.remove(&region_id).unwrap_or_default()
1565    }
1566
1567    #[cfg(test)]
1568    pub(crate) fn sender_len(&self, region_id: RegionId) -> usize {
1569        let regions = self.regions.read().unwrap();
1570        if let Some(senders) = regions.get(&region_id) {
1571            senders.len()
1572        } else {
1573            0
1574        }
1575    }
1576}
1577
1578pub(crate) type OpeningRegionsRef = Arc<OpeningRegions>;
1579
1580/// The regions that are catching up.
1581#[derive(Debug, Default)]
1582pub(crate) struct CatchupRegions {
1583    regions: RwLock<HashSet<RegionId>>,
1584}
1585
1586impl CatchupRegions {
1587    /// Returns true if the region exists.
1588    pub(crate) fn is_region_exists(&self, region_id: RegionId) -> bool {
1589        let regions = self.regions.read().unwrap();
1590        regions.contains(&region_id)
1591    }
1592
1593    /// Inserts a new region into the set.
1594    pub(crate) fn insert_region(&self, region_id: RegionId) {
1595        let mut regions = self.regions.write().unwrap();
1596        regions.insert(region_id);
1597    }
1598
1599    /// Remove region by id.
1600    pub(crate) fn remove_region(&self, region_id: RegionId) {
1601        let mut regions = self.regions.write().unwrap();
1602        regions.remove(&region_id);
1603    }
1604}
1605
1606pub(crate) type CatchupRegionsRef = Arc<CatchupRegions>;
1607
/// Manifest-related counters.
///
/// Every field is an `Arc<AtomicU64>`, so a clone of `ManifestStats` refers to the same
/// underlying counters instead of taking a snapshot.
#[derive(Default, Debug, Clone)]
pub struct ManifestStats {
    /// Counter exposed through `total_manifest_size()`.
    pub(crate) total_manifest_size: Arc<AtomicU64>,
    /// Counter exposed through `manifest_version()`.
    pub(crate) manifest_version: Arc<AtomicU64>,
    /// Counter exposed through `file_removed_cnt()`.
    pub(crate) file_removed_cnt: Arc<AtomicU64>,
}

impl ManifestStats {
    /// Reads `counter` with relaxed ordering.
    fn read(counter: &AtomicU64) -> u64 {
        counter.load(Ordering::Relaxed)
    }

    /// Current value of the `total_manifest_size` counter.
    fn total_manifest_size(&self) -> u64 {
        Self::read(&self.total_manifest_size)
    }

    /// Current value of the `manifest_version` counter.
    fn manifest_version(&self) -> u64 {
        Self::read(&self.manifest_version)
    }

    /// Current value of the `file_removed_cnt` counter.
    fn file_removed_cnt(&self) -> u64 {
        Self::read(&self.file_removed_cnt)
    }
}
1629
1630/// Parses the partition expression from a JSON string.
1631pub fn parse_partition_expr(partition_expr_str: Option<&str>) -> Result<Option<PartitionExpr>> {
1632    match partition_expr_str {
1633        None => Ok(None),
1634        Some("") => Ok(None),
1635        Some(json_str) => {
1636            let expr = partition::expr::PartitionExpr::from_json_str(json_str)
1637                .with_context(|_| InvalidPartitionExprSnafu { expr: json_str })?;
1638            Ok(expr)
1639        }
1640    }
1641}
1642
1643#[cfg(test)]
1644mod tests {
1645    use std::sync::Arc;
1646    use std::sync::atomic::AtomicU64;
1647
1648    use common_datasource::compression::CompressionType;
1649    use common_test_util::temp_dir::create_temp_dir;
1650    use crossbeam_utils::atomic::AtomicCell;
1651    use object_store::ObjectStore;
1652    use object_store::services::Fs;
1653    use store_api::logstore::provider::Provider;
1654    use store_api::region_engine::RegionRole;
1655    use store_api::region_request::PathType;
1656    use store_api::storage::{FileId, RegionId};
1657
1658    use crate::access_layer::AccessLayer;
1659    use crate::error::Error;
1660    use crate::manifest::action::{
1661        RegionChange, RegionEdit, RegionMetaAction, RegionMetaActionList, RegionPartitionExprChange,
1662    };
1663    use crate::manifest::manager::{RegionManifestManager, RegionManifestOptions};
1664    use crate::region::{
1665        ManifestContext, ManifestStats, MitoRegion, RegionLeaderState, RegionRoleState,
1666    };
1667    use crate::sst::FormatType;
1668    use crate::sst::index::intermediate::IntermediateManager;
1669    use crate::sst::index::puffin_manager::PuffinManagerFactory;
1670    use crate::test_util::scheduler_util::SchedulerEnv;
1671    use crate::test_util::version_util::VersionControlBuilder;
1672    use crate::time_provider::StdTimeProvider;
1673
1674    #[test]
1675    fn test_region_state_lock_free() {
1676        assert!(AtomicCell::<RegionRoleState>::is_lock_free());
1677    }
1678
1679    #[test]
1680    fn test_region_role_state_as_str() {
1681        assert_eq!("Follower", RegionRoleState::Follower.as_str());
1682        assert_eq!(
1683            "Leader(Writable)",
1684            RegionRoleState::Leader(RegionLeaderState::Writable).as_str()
1685        );
1686        assert_eq!(
1687            "Leader(Staging)",
1688            RegionRoleState::Leader(RegionLeaderState::Staging).as_str()
1689        );
1690        assert_eq!(
1691            "Leader(Downgrading)",
1692            RegionRoleState::Leader(RegionLeaderState::Downgrading).as_str()
1693        );
1694    }
1695
1696    async fn build_test_region(env: &SchedulerEnv) -> MitoRegion {
1697        let builder = VersionControlBuilder::new();
1698        let version_control = Arc::new(builder.build());
1699        let metadata = version_control.current().version.metadata.clone();
1700
1701        let manager = RegionManifestManager::new(
1702            metadata.clone(),
1703            0,
1704            RegionManifestOptions {
1705                manifest_dir: "".to_string(),
1706                object_store: env.access_layer.object_store().clone(),
1707                compress_type: CompressionType::Uncompressed,
1708                checkpoint_distance: 10,
1709                remove_file_options: Default::default(),
1710                manifest_cache: None,
1711            },
1712            FormatType::PrimaryKey,
1713            &Default::default(),
1714        )
1715        .await
1716        .unwrap();
1717
1718        let manifest_ctx = Arc::new(ManifestContext::new(
1719            manager,
1720            RegionRoleState::Leader(RegionLeaderState::Writable),
1721        ));
1722
1723        MitoRegion {
1724            region_id: metadata.region_id,
1725            version_control,
1726            access_layer: env.access_layer.clone(),
1727            manifest_ctx,
1728            file_purger: crate::test_util::new_noop_file_purger(),
1729            provider: Provider::noop_provider(),
1730            last_flush_millis: Default::default(),
1731            last_schedule_compaction_millis: Default::default(),
1732            time_provider: Arc::new(StdTimeProvider),
1733            topic_latest_entry_id: Default::default(),
1734            written_bytes: Arc::new(AtomicU64::new(0)),
1735            stats: ManifestStats::default(),
1736        }
1737    }
1738
1739    fn empty_edit() -> RegionEdit {
1740        RegionEdit {
1741            files_to_add: Vec::new(),
1742            files_to_remove: Vec::new(),
1743            timestamp_ms: None,
1744            compaction_time_window: None,
1745            flushed_entry_id: None,
1746            flushed_sequence: None,
1747            committed_sequence: None,
1748        }
1749    }
1750
1751    #[tokio::test]
1752    async fn test_compaction_update_manifest_allows_editing_state() {
1753        let env = SchedulerEnv::new().await;
1754        let region = build_test_region(&env).await;
1755        region.set_editing(RegionLeaderState::Writable).unwrap();
1756
1757        let file_id = FileId::random();
1758        let action_list = RegionMetaActionList::with_action(RegionMetaAction::Edit(RegionEdit {
1759            files_to_add: vec![crate::sst::file::FileMeta {
1760                region_id: region.region_id,
1761                file_id,
1762                level: 1,
1763                ..Default::default()
1764            }],
1765            files_to_remove: Vec::new(),
1766            timestamp_ms: None,
1767            compaction_time_window: None,
1768            flushed_entry_id: None,
1769            flushed_sequence: None,
1770            committed_sequence: None,
1771        }));
1772
1773        region
1774            .manifest_ctx
1775            .update_manifest_for_compaction(action_list)
1776            .await
1777            .unwrap();
1778
1779        assert!(
1780            region
1781                .manifest_ctx
1782                .manifest()
1783                .await
1784                .files
1785                .contains_key(&file_id)
1786        );
1787    }
1788
1789    #[tokio::test]
1790    async fn test_exit_staging_partition_expr_change_and_edit_success() {
1791        let env = SchedulerEnv::new().await;
1792        let region = build_test_region(&env).await;
1793
1794        let mut manager = region.manifest_ctx.manifest_manager.write().await;
1795        region.set_staging(&mut manager).await.unwrap();
1796        manager
1797            .update(
1798                RegionMetaActionList::new(vec![
1799                    RegionMetaAction::PartitionExprChange(RegionPartitionExprChange {
1800                        partition_expr: Some("expr_a".to_string()),
1801                    }),
1802                    RegionMetaAction::Edit(empty_edit()),
1803                ]),
1804                true,
1805            )
1806            .await
1807            .unwrap();
1808
1809        region.exit_staging_on_success(&mut manager).await.unwrap();
1810        drop(manager);
1811
1812        assert_eq!(
1813            region.version().metadata.partition_expr.as_deref(),
1814            Some("expr_a")
1815        );
1816        assert_eq!(
1817            region.state(),
1818            RegionRoleState::Leader(RegionLeaderState::Writable)
1819        );
1820    }
1821
1822    #[tokio::test]
1823    async fn test_exit_staging_change_with_same_columns_success() {
1824        let env = SchedulerEnv::new().await;
1825        let region = build_test_region(&env).await;
1826
1827        let mut manager = region.manifest_ctx.manifest_manager.write().await;
1828        region.set_staging(&mut manager).await.unwrap();
1829
1830        let mut changed_metadata = region.version().metadata.as_ref().clone();
1831        changed_metadata.set_partition_expr(Some("expr_b".to_string()));
1832
1833        manager
1834            .update(
1835                RegionMetaActionList::new(vec![
1836                    RegionMetaAction::Change(RegionChange {
1837                        metadata: Arc::new(changed_metadata),
1838                        sst_format: FormatType::PrimaryKey,
1839                        append_mode: None,
1840                    }),
1841                    RegionMetaAction::Edit(empty_edit()),
1842                ]),
1843                true,
1844            )
1845            .await
1846            .unwrap();
1847
1848        region.exit_staging_on_success(&mut manager).await.unwrap();
1849        drop(manager);
1850
1851        assert_eq!(
1852            region.version().metadata.partition_expr.as_deref(),
1853            Some("expr_b")
1854        );
1855        assert_eq!(
1856            region.state(),
1857            RegionRoleState::Leader(RegionLeaderState::Writable)
1858        );
1859    }
1860
1861    #[tokio::test]
1862    async fn test_exit_staging_change_with_different_columns_fails() {
1863        let env = SchedulerEnv::new().await;
1864        let region = build_test_region(&env).await;
1865
1866        let mut manager = region.manifest_ctx.manifest_manager.write().await;
1867        region.set_staging(&mut manager).await.unwrap();
1868
1869        let mut changed_metadata = region.version().metadata.as_ref().clone();
1870        changed_metadata.column_metadatas.rotate_left(1);
1871
1872        manager
1873            .update(
1874                RegionMetaActionList::new(vec![
1875                    RegionMetaAction::Change(RegionChange {
1876                        metadata: Arc::new(changed_metadata),
1877                        sst_format: FormatType::PrimaryKey,
1878                        append_mode: None,
1879                    }),
1880                    RegionMetaAction::Edit(empty_edit()),
1881                ]),
1882                true,
1883            )
1884            .await
1885            .unwrap();
1886
1887        let result = region.exit_staging_on_success(&mut manager).await;
1888        assert!(matches!(result, Err(Error::Unexpected { .. })));
1889    }
1890
1891    #[tokio::test]
1892    async fn test_exit_staging_partition_expr_change_and_change_conflict_fails() {
1893        let env = SchedulerEnv::new().await;
1894        let region = build_test_region(&env).await;
1895
1896        let mut manager = region.manifest_ctx.manifest_manager.write().await;
1897        region.set_staging(&mut manager).await.unwrap();
1898
1899        let mut changed_metadata = region.version().metadata.as_ref().clone();
1900        changed_metadata.set_partition_expr(Some("expr_c".to_string()));
1901
1902        manager
1903            .update(
1904                RegionMetaActionList::new(vec![
1905                    RegionMetaAction::PartitionExprChange(RegionPartitionExprChange {
1906                        partition_expr: Some("expr_c".to_string()),
1907                    }),
1908                    RegionMetaAction::Change(RegionChange {
1909                        metadata: Arc::new(changed_metadata),
1910                        sst_format: FormatType::PrimaryKey,
1911                        append_mode: None,
1912                    }),
1913                    RegionMetaAction::Edit(empty_edit()),
1914                ]),
1915                true,
1916            )
1917            .await
1918            .unwrap();
1919
1920        let result = region.exit_staging_on_success(&mut manager).await;
1921        assert!(matches!(result, Err(Error::Unexpected { .. })));
1922    }
1923
1924    #[tokio::test]
1925    async fn test_set_region_state() {
1926        let env = SchedulerEnv::new().await;
1927        let builder = VersionControlBuilder::new();
1928        let version_control = Arc::new(builder.build());
1929        let manifest_ctx = env
1930            .mock_manifest_context(version_control.current().version.metadata.clone())
1931            .await;
1932
1933        let region_id = RegionId::new(1024, 0);
1934        // Leader -> Follower
1935        manifest_ctx.set_role(RegionRole::Follower, region_id);
1936        assert_eq!(manifest_ctx.state.load(), RegionRoleState::Follower);
1937
1938        // Follower -> Leader
1939        manifest_ctx.set_role(RegionRole::Leader, region_id);
1940        assert_eq!(
1941            manifest_ctx.state.load(),
1942            RegionRoleState::Leader(RegionLeaderState::Writable)
1943        );
1944
1945        // Direct Leader -> StagingLeader should be ignored.
1946        manifest_ctx.set_role(RegionRole::StagingLeader, region_id);
1947        assert_eq!(
1948            manifest_ctx.state.load(),
1949            RegionRoleState::Leader(RegionLeaderState::Writable)
1950        );
1951
1952        // Leader -> Downgrading Leader
1953        manifest_ctx.set_role(RegionRole::DowngradingLeader, region_id);
1954        assert_eq!(
1955            manifest_ctx.state.load(),
1956            RegionRoleState::Leader(RegionLeaderState::Downgrading)
1957        );
1958
1959        // Downgrading Leader -> Follower
1960        manifest_ctx.set_role(RegionRole::Follower, region_id);
1961        assert_eq!(manifest_ctx.state.load(), RegionRoleState::Follower);
1962
1963        // Can't downgrade from follower (Follower -> Downgrading Leader)
1964        manifest_ctx.set_role(RegionRole::DowngradingLeader, region_id);
1965        assert_eq!(manifest_ctx.state.load(), RegionRoleState::Follower);
1966
1967        // Set region role too Downgrading Leader
1968        manifest_ctx.set_role(RegionRole::Leader, region_id);
1969        manifest_ctx.set_role(RegionRole::DowngradingLeader, region_id);
1970        assert_eq!(
1971            manifest_ctx.state.load(),
1972            RegionRoleState::Leader(RegionLeaderState::Downgrading)
1973        );
1974
1975        // Downgrading Leader -> Leader
1976        manifest_ctx.set_role(RegionRole::Leader, region_id);
1977        assert_eq!(
1978            manifest_ctx.state.load(),
1979            RegionRoleState::Leader(RegionLeaderState::Writable)
1980        );
1981    }
1982
1983    #[tokio::test]
1984    async fn test_staging_state_validation() {
1985        let env = SchedulerEnv::new().await;
1986        let builder = VersionControlBuilder::new();
1987        let version_control = Arc::new(builder.build());
1988
1989        // Create context with staging state using the correct pattern from SchedulerEnv
1990        let staging_ctx = {
1991            let manager = RegionManifestManager::new(
1992                version_control.current().version.metadata.clone(),
1993                0,
1994                RegionManifestOptions {
1995                    manifest_dir: "".to_string(),
1996                    object_store: env.access_layer.object_store().clone(),
1997                    compress_type: CompressionType::Uncompressed,
1998                    checkpoint_distance: 10,
1999                    remove_file_options: Default::default(),
2000                    manifest_cache: None,
2001                },
2002                FormatType::PrimaryKey,
2003                &Default::default(),
2004            )
2005            .await
2006            .unwrap();
2007            Arc::new(ManifestContext::new(
2008                manager,
2009                RegionRoleState::Leader(RegionLeaderState::Staging),
2010            ))
2011        };
2012
2013        // Test staging state behavior
2014        assert_eq!(
2015            staging_ctx.current_state(),
2016            RegionRoleState::Leader(RegionLeaderState::Staging)
2017        );
2018
2019        // Test writable context for comparison
2020        let writable_ctx = env
2021            .mock_manifest_context(version_control.current().version.metadata.clone())
2022            .await;
2023
2024        assert_eq!(
2025            writable_ctx.current_state(),
2026            RegionRoleState::Leader(RegionLeaderState::Writable)
2027        );
2028    }
2029
2030    #[tokio::test]
2031    async fn test_staging_state_transitions() {
2032        let builder = VersionControlBuilder::new();
2033        let version_control = Arc::new(builder.build());
2034        let metadata = version_control.current().version.metadata.clone();
2035
2036        // Create MitoRegion for testing state transitions
2037        let temp_dir = create_temp_dir("");
2038        let path_str = temp_dir.path().display().to_string();
2039        let fs_builder = Fs::default().root(&path_str);
2040        let object_store = ObjectStore::new(fs_builder).unwrap().finish();
2041
2042        let index_aux_path = temp_dir.path().join("index_aux");
2043        let puffin_mgr = PuffinManagerFactory::new(&index_aux_path, 4096, None, None)
2044            .await
2045            .unwrap();
2046        let intm_mgr = IntermediateManager::init_fs(index_aux_path.to_str().unwrap())
2047            .await
2048            .unwrap();
2049
2050        let access_layer = Arc::new(AccessLayer::new(
2051            "",
2052            PathType::Bare,
2053            object_store,
2054            puffin_mgr,
2055            intm_mgr,
2056        ));
2057
2058        let manager = RegionManifestManager::new(
2059            metadata.clone(),
2060            0,
2061            RegionManifestOptions {
2062                manifest_dir: "".to_string(),
2063                object_store: access_layer.object_store().clone(),
2064                compress_type: CompressionType::Uncompressed,
2065                checkpoint_distance: 10,
2066                remove_file_options: Default::default(),
2067                manifest_cache: None,
2068            },
2069            FormatType::PrimaryKey,
2070            &Default::default(),
2071        )
2072        .await
2073        .unwrap();
2074
2075        let manifest_ctx = Arc::new(ManifestContext::new(
2076            manager,
2077            RegionRoleState::Leader(RegionLeaderState::Writable),
2078        ));
2079
2080        let region = MitoRegion {
2081            region_id: metadata.region_id,
2082            version_control,
2083            access_layer,
2084            manifest_ctx: manifest_ctx.clone(),
2085            file_purger: crate::test_util::new_noop_file_purger(),
2086            provider: Provider::noop_provider(),
2087            last_flush_millis: Default::default(),
2088            last_schedule_compaction_millis: Default::default(),
2089            time_provider: Arc::new(StdTimeProvider),
2090            topic_latest_entry_id: Default::default(),
2091            written_bytes: Arc::new(AtomicU64::new(0)),
2092            stats: ManifestStats::default(),
2093        };
2094
2095        // Test initial state
2096        assert_eq!(
2097            region.state(),
2098            RegionRoleState::Leader(RegionLeaderState::Writable)
2099        );
2100        assert!(!region.is_staging());
2101
2102        // Test transition to staging
2103        let mut manager = manifest_ctx.manifest_manager.write().await;
2104        region.set_staging(&mut manager).await.unwrap();
2105        drop(manager);
2106        assert_eq!(
2107            region.state(),
2108            RegionRoleState::Leader(RegionLeaderState::Staging)
2109        );
2110        assert!(region.is_staging());
2111
2112        // Test transition back to writable
2113        region.exit_staging().unwrap();
2114        assert_eq!(
2115            region.state(),
2116            RegionRoleState::Leader(RegionLeaderState::Writable)
2117        );
2118        assert!(!region.is_staging());
2119
2120        // Test staging directory cleanup: Create dirty staging files before entering staging mode
2121        {
2122            // Create some dummy staging manifest files to simulate interrupted session
2123            let manager = manifest_ctx.manifest_manager.write().await;
2124            let dummy_actions = RegionMetaActionList::new(vec![]);
2125            let dummy_bytes = dummy_actions.encode().unwrap();
2126
2127            // Create dirty staging files with versions 100 and 101
2128            manager.store().save(100, &dummy_bytes, true).await.unwrap();
2129            manager.store().save(101, &dummy_bytes, true).await.unwrap();
2130            drop(manager);
2131
2132            // Verify dirty files exist before entering staging
2133            let manager = manifest_ctx.manifest_manager.read().await;
2134            let dirty_manifests = manager.store().fetch_staging_manifests().await.unwrap();
2135            assert_eq!(
2136                dirty_manifests.len(),
2137                2,
2138                "Should have 2 dirty staging files"
2139            );
2140            drop(manager);
2141
2142            // Enter staging mode - this should clean up the dirty files
2143            let mut manager = manifest_ctx.manifest_manager.write().await;
2144            region.set_staging(&mut manager).await.unwrap();
2145            drop(manager);
2146
2147            // Verify dirty files are cleaned up after entering staging
2148            let manager = manifest_ctx.manifest_manager.read().await;
2149            let cleaned_manifests = manager.store().fetch_staging_manifests().await.unwrap();
2150            assert_eq!(
2151                cleaned_manifests.len(),
2152                0,
2153                "Dirty staging files should be cleaned up"
2154            );
2155            drop(manager);
2156
2157            // Exit staging to restore normal state for remaining tests
2158            region.exit_staging().unwrap();
2159        }
2160
2161        // Test invalid transitions
2162        let mut manager = manifest_ctx.manifest_manager.write().await;
2163        assert!(region.set_staging(&mut manager).await.is_ok()); // Writable -> Staging should work
2164        drop(manager);
2165        let mut manager = manifest_ctx.manifest_manager.write().await;
2166        assert!(region.set_staging(&mut manager).await.is_err()); // Staging -> Staging should fail
2167        drop(manager);
2168        assert!(region.exit_staging().is_ok()); // Staging -> Writable should work
2169        assert!(region.exit_staging().is_err()); // Writable -> Writable should fail
2170    }
2171}