1use std::sync::Arc;
16use std::time::{Duration, Instant};
17
18use async_stream::try_stream;
19use common_time::Timestamp;
20use either::Either;
21use futures::{Stream, TryStreamExt};
22use object_store::services::Fs;
23use object_store::util::{join_dir, with_instrument_layers};
24use object_store::{ATOMIC_WRITE_DIR, ErrorKind, OLD_ATOMIC_WRITE_DIR, ObjectStore};
25use smallvec::SmallVec;
26use snafu::ResultExt;
27use store_api::metadata::RegionMetadataRef;
28use store_api::region_request::PathType;
29use store_api::sst_entry::StorageSstEntry;
30use store_api::storage::{FileId, RegionId, SequenceNumber};
31
32use crate::cache::CacheManagerRef;
33use crate::cache::file_cache::{FileCacheRef, FileType, IndexKey};
34use crate::cache::write_cache::SstUploadRequest;
35use crate::config::{BloomFilterConfig, FulltextIndexConfig, IndexConfig, InvertedIndexConfig};
36use crate::error::{
37 CleanDirSnafu, DeleteIndexSnafu, DeleteIndexesSnafu, DeleteSstsSnafu, OpenDalSnafu, Result,
38};
39use crate::metrics::{COMPACTION_STAGE_ELAPSED, FLUSH_ELAPSED};
40use crate::read::{FlatSource, Source};
41use crate::region::options::IndexOptions;
42use crate::sst::file::{FileHandle, RegionFileId, RegionIndexId};
43use crate::sst::index::IndexerBuilderImpl;
44use crate::sst::index::intermediate::IntermediateManager;
45use crate::sst::index::puffin_manager::{PuffinManagerFactory, SstPuffinManager};
46use crate::sst::location::{self, region_dir_from_table_dir};
47use crate::sst::parquet::reader::ParquetReaderBuilder;
48use crate::sst::parquet::writer::ParquetWriter;
49use crate::sst::parquet::{SstInfo, WriteOptions};
50use crate::sst::{DEFAULT_WRITE_BUFFER_SIZE, DEFAULT_WRITE_CONCURRENCY};
51
/// Shared, reference-counted handle to an [`AccessLayer`].
pub type AccessLayerRef = Arc<AccessLayer>;
/// SST info results from a write; inline-allocated for the common 1-2 file case.
pub type SstInfoArray = SmallVec<[SstInfo; 2]>;

/// The origin of an SST write, used to pick the metrics family in
/// [`Metrics::observe`].
#[derive(Eq, PartialEq, Debug)]
pub enum WriteType {
    /// SST produced by flushing memtables.
    Flush,
    /// SST produced by compacting existing files.
    Compaction,
}
64
/// Stage timings collected while writing an SST (and its index) out.
///
/// Durations are accumulated per stage and reported to Prometheus histograms
/// via [`Metrics::observe`].
#[derive(Debug)]
pub struct Metrics {
    // Which histogram family (flush vs. compaction) to report into.
    pub(crate) write_type: WriteType,
    // Time spent pulling batches from the source iterator.
    pub(crate) iter_source: Duration,
    // Time spent encoding/writing batches to the SST file.
    pub(crate) write_batch: Duration,
    // Time spent updating index structures.
    pub(crate) update_index: Duration,
    // Time spent uploading the parquet file to the remote store.
    pub(crate) upload_parquet: Duration,
    // Time spent uploading the puffin (index) file to the remote store.
    pub(crate) upload_puffin: Duration,
    // Time spent compacting memtables (flush only; may stay zero).
    pub(crate) compact_memtable: Duration,
}
75
76impl Metrics {
77 pub fn new(write_type: WriteType) -> Self {
78 Self {
79 write_type,
80 iter_source: Default::default(),
81 write_batch: Default::default(),
82 update_index: Default::default(),
83 upload_parquet: Default::default(),
84 upload_puffin: Default::default(),
85 compact_memtable: Default::default(),
86 }
87 }
88
89 pub(crate) fn merge(mut self, other: Self) -> Self {
90 assert_eq!(self.write_type, other.write_type);
91 self.iter_source += other.iter_source;
92 self.write_batch += other.write_batch;
93 self.update_index += other.update_index;
94 self.upload_parquet += other.upload_parquet;
95 self.upload_puffin += other.upload_puffin;
96 self.compact_memtable += other.compact_memtable;
97 self
98 }
99
100 pub(crate) fn observe(self) {
101 match self.write_type {
102 WriteType::Flush => {
103 FLUSH_ELAPSED
104 .with_label_values(&["iter_source"])
105 .observe(self.iter_source.as_secs_f64());
106 FLUSH_ELAPSED
107 .with_label_values(&["write_batch"])
108 .observe(self.write_batch.as_secs_f64());
109 FLUSH_ELAPSED
110 .with_label_values(&["update_index"])
111 .observe(self.update_index.as_secs_f64());
112 FLUSH_ELAPSED
113 .with_label_values(&["upload_parquet"])
114 .observe(self.upload_parquet.as_secs_f64());
115 FLUSH_ELAPSED
116 .with_label_values(&["upload_puffin"])
117 .observe(self.upload_puffin.as_secs_f64());
118 if !self.compact_memtable.is_zero() {
119 FLUSH_ELAPSED
120 .with_label_values(&["compact_memtable"])
121 .observe(self.upload_puffin.as_secs_f64());
122 }
123 }
124 WriteType::Compaction => {
125 COMPACTION_STAGE_ELAPSED
126 .with_label_values(&["iter_source"])
127 .observe(self.iter_source.as_secs_f64());
128 COMPACTION_STAGE_ELAPSED
129 .with_label_values(&["write_batch"])
130 .observe(self.write_batch.as_secs_f64());
131 COMPACTION_STAGE_ELAPSED
132 .with_label_values(&["update_index"])
133 .observe(self.update_index.as_secs_f64());
134 COMPACTION_STAGE_ELAPSED
135 .with_label_values(&["upload_parquet"])
136 .observe(self.upload_parquet.as_secs_f64());
137 COMPACTION_STAGE_ELAPSED
138 .with_label_values(&["upload_puffin"])
139 .observe(self.upload_puffin.as_secs_f64());
140 }
141 };
142 }
143}
144
/// Access layer for reading and writing a table's SST and index files on an
/// object store, rooted at `table_dir`.
pub struct AccessLayer {
    // Root directory of the table on the object store.
    table_dir: String,
    // Path layout used when composing file paths under `table_dir`.
    path_type: PathType,
    // Backing object store for all file operations.
    object_store: ObjectStore,
    // Factory for puffin (index file) managers.
    puffin_manager_factory: PuffinManagerFactory,
    // Manager for intermediate files produced during index creation.
    intermediate_manager: IntermediateManager,
}
157
158impl std::fmt::Debug for AccessLayer {
159 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
160 f.debug_struct("AccessLayer")
161 .field("table_dir", &self.table_dir)
162 .finish()
163 }
164}
165
impl AccessLayer {
    /// Creates a new access layer rooted at `table_dir` on `object_store`.
    pub fn new(
        table_dir: impl Into<String>,
        path_type: PathType,
        object_store: ObjectStore,
        puffin_manager_factory: PuffinManagerFactory,
        intermediate_manager: IntermediateManager,
    ) -> AccessLayer {
        AccessLayer {
            table_dir: table_dir.into(),
            path_type,
            object_store,
            puffin_manager_factory,
            intermediate_manager,
        }
    }

    /// Returns the table directory this layer operates under.
    pub fn table_dir(&self) -> &str {
        &self.table_dir
    }

    /// Returns the underlying object store.
    pub fn object_store(&self) -> &ObjectStore {
        &self.object_store
    }

    /// Returns the path layout used to compose file paths.
    pub fn path_type(&self) -> PathType {
        self.path_type
    }

    /// Returns the factory used to build puffin (index) managers.
    pub fn puffin_manager_factory(&self) -> &PuffinManagerFactory {
        &self.puffin_manager_factory
    }

    /// Returns the manager for intermediate index files.
    pub fn intermediate_manager(&self) -> &IntermediateManager {
        &self.intermediate_manager
    }

    /// Builds a puffin manager whose paths resolve against this layer's
    /// table dir and path type.
    pub(crate) fn build_puffin_manager(&self) -> SstPuffinManager {
        let store = self.object_store.clone();
        let path_provider =
            RegionFilePathFactory::new(self.table_dir().to_string(), self.path_type());
        self.puffin_manager_factory.build(store, path_provider)
    }

    /// Deletes a single index (puffin) file identified by `index_file_id`.
    pub(crate) async fn delete_index(
        &self,
        index_file_id: RegionIndexId,
    ) -> Result<(), crate::error::Error> {
        let path = location::index_file_path(
            &self.table_dir,
            RegionIndexId::new(index_file_id.file_id, index_file_id.version),
            self.path_type,
        );
        self.object_store
            .delete(&path)
            .await
            .context(DeleteIndexSnafu {
                file_id: index_file_id.file_id(),
            })?;
        Ok(())
    }

    /// Batch-deletes the SST files of `file_ids` in `region_id` via the
    /// object store's deleter. No-op when `file_ids` is empty.
    pub(crate) async fn delete_ssts(
        &self,
        region_id: RegionId,
        file_ids: &[FileId],
    ) -> Result<(), crate::error::Error> {
        if file_ids.is_empty() {
            return Ok(());
        }

        // Keep a copy of the attempted ids so every error carries the full set.
        let attempted_files = file_ids.to_vec();
        let paths: Vec<_> = file_ids
            .iter()
            .map(|file_id| {
                location::sst_file_path(
                    &self.table_dir,
                    RegionFileId::new(region_id, *file_id),
                    self.path_type,
                )
            })
            .collect();

        let mut deleter = self
            .object_store
            .deleter()
            .await
            .with_context(|_| DeleteSstsSnafu {
                region_id,
                file_ids: attempted_files.clone(),
            })?;
        deleter
            .delete_iter(paths.iter().map(String::as_str))
            .await
            .with_context(|_| DeleteSstsSnafu {
                region_id,
                file_ids: attempted_files.clone(),
            })?;
        // Closing the deleter flushes any buffered deletions.
        deleter.close().await.with_context(|_| DeleteSstsSnafu {
            region_id,
            file_ids: attempted_files,
        })?;

        Ok(())
    }

    /// Batch-deletes the index (puffin) files of `index_ids`. No-op when
    /// `index_ids` is empty.
    pub(crate) async fn delete_indexes(
        &self,
        index_ids: &[RegionIndexId],
    ) -> Result<(), crate::error::Error> {
        if index_ids.is_empty() {
            return Ok(());
        }

        let file_ids: Vec<_> = index_ids
            .iter()
            .map(|index_id| index_id.file_id())
            .collect();
        let paths: Vec<_> = index_ids
            .iter()
            .map(|index_id| location::index_file_path(&self.table_dir, *index_id, self.path_type))
            .collect();

        let mut deleter = self
            .object_store
            .deleter()
            .await
            .context(DeleteIndexesSnafu {
                file_ids: file_ids.clone(),
            })?;
        deleter
            .delete_iter(paths.iter().map(String::as_str))
            .await
            .context(DeleteIndexesSnafu {
                file_ids: file_ids.clone(),
            })?;
        // Closing the deleter flushes any buffered deletions.
        deleter
            .close()
            .await
            .context(DeleteIndexesSnafu { file_ids })?;

        Ok(())
    }

    /// Composes the region directory for `region_id` under this table dir.
    pub fn build_region_dir(&self, region_id: RegionId) -> String {
        region_dir_from_table_dir(&self.table_dir, region_id, self.path_type)
    }

    /// Returns a reader builder for the given SST file handle.
    pub(crate) fn read_sst(&self, file: FileHandle) -> ParquetReaderBuilder {
        ParquetReaderBuilder::new(
            self.table_dir.clone(),
            self.path_type,
            file,
            self.object_store.clone(),
        )
    }

    /// Writes an SST (and its indexes) from `request.source`.
    ///
    /// When a write cache is configured, writing and uploading are delegated
    /// to it; otherwise the file is written directly to the object store,
    /// with a [`TempFileCleaner`] attached to remove partial output on error.
    /// Parquet metadata of the produced files is inserted into the cache.
    /// Stage timings are accumulated into `metrics`.
    pub async fn write_sst(
        &self,
        request: SstWriteRequest,
        write_opts: &WriteOptions,
        metrics: &mut Metrics,
    ) -> Result<SstInfoArray> {
        let region_id = request.metadata.region_id;
        let cache_manager = request.cache_manager.clone();

        let sst_info = if let Some(write_cache) = cache_manager.write_cache() {
            // Write to the local cache first, then upload to the remote store.
            write_cache
                .write_and_upload_sst(
                    request,
                    SstUploadRequest {
                        dest_path_provider: RegionFilePathFactory::new(
                            self.table_dir.clone(),
                            self.path_type,
                        ),
                        remote_store: self.object_store.clone(),
                    },
                    write_opts,
                    metrics,
                )
                .await?
        } else {
            // No write cache: write straight to the (possibly remote) store.
            let store = self.object_store.clone();
            let path_provider = RegionFilePathFactory::new(self.table_dir.clone(), self.path_type);
            let indexer_builder = IndexerBuilderImpl {
                build_type: request.op_type.into(),
                metadata: request.metadata.clone(),
                row_group_size: write_opts.row_group_size,
                puffin_manager: self
                    .puffin_manager_factory
                    .build(store, path_provider.clone()),
                write_cache_enabled: false,
                intermediate_manager: self.intermediate_manager.clone(),
                index_options: request.index_options,
                inverted_index_config: request.inverted_index_config,
                fulltext_index_config: request.fulltext_index_config,
                bloom_filter_index_config: request.bloom_filter_index_config,
                #[cfg(feature = "vector_index")]
                vector_index_config: request.vector_index_config,
            };
            // The cleaner removes leftover temp files if the write fails.
            let cleaner = TempFileCleaner::new(region_id, self.object_store.clone());
            let mut writer = ParquetWriter::new_with_object_store(
                self.object_store.clone(),
                request.metadata,
                request.index_config,
                indexer_builder,
                path_provider,
                metrics,
            )
            .await
            .with_file_cleaner(cleaner);
            match request.source {
                Either::Left(source) => {
                    writer
                        .write_all(source, request.max_sequence, write_opts)
                        .await?
                }
                Either::Right(flat_source) => {
                    writer
                        .write_all_flat(flat_source, request.max_sequence, write_opts)
                        .await?
                }
            }
        };

        // Cache parquet metadata of the newly written files so readers avoid
        // refetching the footer.
        if !sst_info.is_empty() {
            for sst in &sst_info {
                if let Some(parquet_metadata) = &sst.file_metadata {
                    cache_manager.put_parquet_meta_data(
                        RegionFileId::new(region_id, sst.file_id),
                        parquet_metadata.clone(),
                    )
                }
            }
        }

        Ok(sst_info)
    }

    /// Writes pre-encoded SST bytes (`data`) for `sst_info` into the region.
    ///
    /// Delegates to the write cache when present; otherwise streams the bytes
    /// directly to the object store, cleaning up temp files on failure.
    /// Returns the write metrics (reported as a flush).
    pub(crate) async fn put_sst(
        &self,
        data: &bytes::Bytes,
        region_id: RegionId,
        sst_info: &SstInfo,
        cache_manager: &CacheManagerRef,
    ) -> Result<Metrics> {
        if let Some(write_cache) = cache_manager.write_cache() {
            let upload_request = SstUploadRequest {
                dest_path_provider: RegionFilePathFactory::new(
                    self.table_dir.clone(),
                    self.path_type,
                ),
                remote_store: self.object_store.clone(),
            };
            write_cache
                .put_and_upload_sst(data, region_id, sst_info, upload_request)
                .await
        } else {
            let start = Instant::now();
            let cleaner = TempFileCleaner::new(region_id, self.object_store.clone());
            let path_provider = RegionFilePathFactory::new(self.table_dir.clone(), self.path_type);
            let sst_file_path =
                path_provider.build_sst_file_path(RegionFileId::new(region_id, sst_info.file_id));
            let mut writer = self
                .object_store
                .writer_with(&sst_file_path)
                .chunk(DEFAULT_WRITE_BUFFER_SIZE.as_bytes() as usize)
                .concurrent(DEFAULT_WRITE_CONCURRENCY)
                .await
                .context(OpenDalSnafu)?;
            // On any failure, remove temp files for this id before returning.
            if let Err(err) = writer.write(data.clone()).await.context(OpenDalSnafu) {
                cleaner.clean_by_file_id(sst_info.file_id).await;
                return Err(err);
            }
            if let Err(err) = writer.close().await.context(OpenDalSnafu) {
                cleaner.clean_by_file_id(sst_info.file_id).await;
                return Err(err);
            }
            let mut metrics = Metrics::new(WriteType::Flush);
            metrics.write_batch = start.elapsed();
            Ok(metrics)
        }
    }

    /// Streams `.parquet`/`.puffin` entries found by recursively listing the
    /// table dir on the object store, as [`StorageSstEntry`] values.
    pub fn storage_sst_entries(&self) -> impl Stream<Item = Result<StorageSstEntry>> + use<> {
        let object_store = self.object_store.clone();
        let table_dir = self.table_dir.clone();

        try_stream! {
            let mut lister = object_store
                .lister_with(table_dir.as_str())
                .recursive(true)
                .await
                .context(OpenDalSnafu)?;

            while let Some(entry) = lister.try_next().await.context(OpenDalSnafu)? {
                let metadata = entry.metadata();
                if metadata.is_dir() {
                    continue;
                }

                let path = entry.path();
                // Only SST data files and index files are of interest.
                if !path.ends_with(".parquet") && !path.ends_with(".puffin") {
                    continue;
                }

                // A zero content length is treated as "size unknown".
                let file_size = metadata.content_length();
                let file_size = if file_size == 0 { None } else { Some(file_size) };
                let last_modified_ms = metadata
                    .last_modified()
                    .map(|ts| Timestamp::new_millisecond(ts.timestamp_millis()));

                let entry = StorageSstEntry {
                    file_path: path.to_string(),
                    file_size,
                    last_modified_ms,
                    node_id: None,
                };

                yield entry;
            }
        }
    }
}
511
/// The operation that produced an SST write request.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum OperationType {
    /// Request originates from a memtable flush.
    Flush,
    /// Request originates from a compaction.
    Compact,
}
518
/// Everything needed to write one SST: the data source, region metadata,
/// caches, and per-index configuration.
pub struct SstWriteRequest {
    // Whether this write comes from a flush or a compaction.
    pub op_type: OperationType,
    // Metadata of the region being written.
    pub metadata: RegionMetadataRef,
    // Row source: `Left` for the batch-based source, `Right` for the flat one.
    pub source: Either<Source, FlatSource>,
    // Cache manager; its write cache (if any) takes over the write path.
    pub cache_manager: CacheManagerRef,
    #[allow(dead_code)]
    pub storage: Option<String>,
    // Max sequence number contained in the source, if known.
    pub max_sequence: Option<SequenceNumber>,

    // Configs for the indexes built alongside the SST.
    pub index_options: IndexOptions,
    pub index_config: IndexConfig,
    pub inverted_index_config: InvertedIndexConfig,
    pub fulltext_index_config: FulltextIndexConfig,
    pub bloom_filter_index_config: BloomFilterConfig,
    #[cfg(feature = "vector_index")]
    pub vector_index_config: crate::config::VectorIndexConfig,
}
538
/// Best-effort cleaner that removes leftover temp files from the atomic
/// write dir after a failed SST/index write in a region.
pub(crate) struct TempFileCleaner {
    // Region whose temp files this cleaner targets.
    region_id: RegionId,
    // Store holding the atomic write dir.
    object_store: ObjectStore,
}
544
545impl TempFileCleaner {
546 pub(crate) fn new(region_id: RegionId, object_store: ObjectStore) -> Self {
548 Self {
549 region_id,
550 object_store,
551 }
552 }
553
554 pub(crate) async fn clean_by_file_id(&self, file_id: FileId) {
557 let sst_key = IndexKey::new(self.region_id, file_id, FileType::Parquet).to_string();
558 let index_key = IndexKey::new(self.region_id, file_id, FileType::Puffin(0)).to_string();
559
560 Self::clean_atomic_dir_files(&self.object_store, &[&sst_key, &index_key]).await;
561 }
562
563 pub(crate) async fn clean_atomic_dir_files(
565 local_store: &ObjectStore,
566 names_to_remove: &[&str],
567 ) {
568 let Ok(entries) = local_store.list(ATOMIC_WRITE_DIR).await.inspect_err(|e| {
571 if e.kind() != ErrorKind::NotFound {
572 common_telemetry::error!(e; "Failed to list tmp files for {:?}", names_to_remove)
573 }
574 }) else {
575 return;
576 };
577
578 let actual_files: Vec<_> = entries
581 .into_iter()
582 .filter_map(|entry| {
583 if entry.metadata().is_dir() {
584 return None;
585 }
586
587 let should_remove = names_to_remove
589 .iter()
590 .any(|file| entry.name().starts_with(file));
591 if should_remove {
592 Some(entry.path().to_string())
593 } else {
594 None
595 }
596 })
597 .collect();
598
599 common_telemetry::warn!(
600 "Clean files {:?} under atomic write dir for {:?}",
601 actual_files,
602 names_to_remove
603 );
604
605 if let Err(e) = local_store.delete_iter(actual_files).await {
606 common_telemetry::error!(e; "Failed to delete tmp file for {:?}", names_to_remove);
607 }
608 }
609}
610
611pub(crate) async fn new_fs_cache_store(root: &str) -> Result<ObjectStore> {
612 let atomic_write_dir = join_dir(root, ATOMIC_WRITE_DIR);
613 clean_dir(&atomic_write_dir).await?;
614
615 let old_atomic_temp_dir = join_dir(root, OLD_ATOMIC_WRITE_DIR);
617 clean_dir(&old_atomic_temp_dir).await?;
618
619 let builder = Fs::default().root(root).atomic_write_dir(&atomic_write_dir);
620 let store = ObjectStore::new(builder).context(OpenDalSnafu)?.finish();
621
622 Ok(with_instrument_layers(store, false))
623}
624
625async fn clean_dir(dir: &str) -> Result<()> {
627 if tokio::fs::try_exists(dir)
628 .await
629 .context(CleanDirSnafu { dir })?
630 {
631 tokio::fs::remove_dir_all(dir)
632 .await
633 .context(CleanDirSnafu { dir })?;
634 }
635
636 Ok(())
637}
638
/// Resolves on-store paths for SST data files and their index files.
pub trait FilePathProvider: Send + Sync {
    /// Path of the index (puffin) file for `file_id` (legacy, unversioned).
    fn build_index_file_path(&self, file_id: RegionFileId) -> String;

    /// Path of the index (puffin) file for `index_id`, honoring its version.
    fn build_index_file_path_with_version(&self, index_id: RegionIndexId) -> String;

    /// Path of the SST (parquet) file for `file_id`.
    fn build_sst_file_path(&self, file_id: RegionFileId) -> String;
}
650
/// Path provider that resolves files to their locations in the local write
/// (file) cache instead of the remote store.
#[derive(Clone)]
pub(crate) struct WriteCachePathProvider {
    // File cache that owns the local path layout.
    file_cache: FileCacheRef,
}

impl WriteCachePathProvider {
    /// Creates a provider backed by `file_cache`.
    pub fn new(file_cache: FileCacheRef) -> Self {
        Self { file_cache }
    }
}
663
664impl FilePathProvider for WriteCachePathProvider {
665 fn build_index_file_path(&self, file_id: RegionFileId) -> String {
666 let puffin_key = IndexKey::new(file_id.region_id(), file_id.file_id(), FileType::Puffin(0));
667 self.file_cache.cache_file_path(puffin_key)
668 }
669
670 fn build_index_file_path_with_version(&self, index_id: RegionIndexId) -> String {
671 let puffin_key = IndexKey::new(
672 index_id.region_id(),
673 index_id.file_id(),
674 FileType::Puffin(index_id.version),
675 );
676 self.file_cache.cache_file_path(puffin_key)
677 }
678
679 fn build_sst_file_path(&self, file_id: RegionFileId) -> String {
680 let parquet_file_key =
681 IndexKey::new(file_id.region_id(), file_id.file_id(), FileType::Parquet);
682 self.file_cache.cache_file_path(parquet_file_key)
683 }
684}
685
/// Path provider that resolves files to their canonical locations under a
/// table directory on the (remote) object store.
#[derive(Clone, Debug)]
pub(crate) struct RegionFilePathFactory {
    // Root directory of the table.
    pub(crate) table_dir: String,
    // Path layout used when composing paths.
    pub(crate) path_type: PathType,
}

impl RegionFilePathFactory {
    /// Creates a factory for `table_dir` using the given path layout.
    pub fn new(table_dir: String, path_type: PathType) -> Self {
        Self {
            table_dir,
            path_type,
        }
    }
}
702
impl FilePathProvider for RegionFilePathFactory {
    /// Legacy (unversioned) index file path under the table dir.
    fn build_index_file_path(&self, file_id: RegionFileId) -> String {
        location::index_file_path_legacy(&self.table_dir, file_id, self.path_type)
    }

    /// Versioned index file path under the table dir.
    fn build_index_file_path_with_version(&self, index_id: RegionIndexId) -> String {
        location::index_file_path(&self.table_dir, index_id, self.path_type)
    }

    /// SST (parquet) file path under the table dir.
    fn build_sst_file_path(&self, file_id: RegionFileId) -> String {
        location::sst_file_path(&self.table_dir, file_id, self.path_type)
    }
}