use std::sync::Arc;

use common_recordbatch::DfRecordBatch;
use common_time::Timestamp;
use common_time::timestamp::TimeUnit;
use datafusion_common::DataFusionError;
use datafusion_expr::{LogicalPlan, LogicalPlanBuilder, LogicalTableSource};
use datatypes::arrow::array::{
    ArrayRef, BooleanArray, TimestampMillisecondArray, TimestampNanosecondArray, UInt8Array,
    UInt32Array, UInt64Array,
};
use datatypes::arrow::error::ArrowError;
use datatypes::arrow_array::StringArray;
use datatypes::schema::{ColumnSchema, Schema, SchemaRef};
use serde::{Deserialize, Serialize};

use crate::storage::{RegionGroup, RegionId, RegionNumber, RegionSeq, ScanRequest, TableId};
/// An SST file entry recorded in a region manifest.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
pub struct ManifestSstEntry {
    /// Directory of the table that owns the file.
    pub table_dir: String,
    /// Id of the region that owns the file.
    pub region_id: RegionId,
    /// Id of the table that owns the file.
    pub table_id: TableId,
    /// Number of the region that owns the file.
    pub region_number: RegionNumber,
    /// Group part of the region number.
    pub region_group: RegionGroup,
    /// Sequence part of the region number.
    pub region_sequence: RegionSeq,
    /// Id of the SST file.
    pub file_id: String,
    /// Version of the index associated with the file.
    pub index_version: u64,
    /// Level of the SST file.
    pub level: u8,
    /// Full path of the SST file.
    pub file_path: String,
    /// Size of the SST file in bytes.
    pub file_size: u64,
    /// Path of the associated index file, if any.
    pub index_file_path: Option<String>,
    /// Size of the associated index file in bytes, if any.
    pub index_file_size: Option<u64>,
    /// Number of rows in the file.
    pub num_rows: u64,
    /// Number of row groups in the file.
    pub num_row_groups: u64,
    /// Number of series in the file, if known.
    pub num_series: Option<u64>,
    /// Minimal timestamp in the file.
    pub min_ts: Timestamp,
    /// Maximal timestamp in the file.
    pub max_ts: Timestamp,
    /// Sequence number associated with the file, if any.
    pub sequence: Option<u64>,
    /// Id of the region that originally created the file.
    pub origin_region_id: RegionId,
    /// Id of the node that reported the entry, if known.
    pub node_id: Option<u64>,
    /// Whether the file is visible.
    pub visible: bool,
}

impl ManifestSstEntry {
    /// Returns the schema of the manifest SST entries.
    pub fn schema() -> SchemaRef {
        use datatypes::prelude::ConcreteDataType as Ty;
        Arc::new(Schema::new(vec![
            ColumnSchema::new("table_dir", Ty::string_datatype(), false),
            ColumnSchema::new("region_id", Ty::uint64_datatype(), false),
            ColumnSchema::new("table_id", Ty::uint32_datatype(), false),
            ColumnSchema::new("region_number", Ty::uint32_datatype(), false),
            ColumnSchema::new("region_group", Ty::uint8_datatype(), false),
            ColumnSchema::new("region_sequence", Ty::uint32_datatype(), false),
            ColumnSchema::new("file_id", Ty::string_datatype(), false),
            ColumnSchema::new("index_version", Ty::uint64_datatype(), false),
            ColumnSchema::new("level", Ty::uint8_datatype(), false),
            ColumnSchema::new("file_path", Ty::string_datatype(), false),
            ColumnSchema::new("file_size", Ty::uint64_datatype(), false),
            ColumnSchema::new("index_file_path", Ty::string_datatype(), true),
            ColumnSchema::new("index_file_size", Ty::uint64_datatype(), true),
            ColumnSchema::new("num_rows", Ty::uint64_datatype(), false),
            ColumnSchema::new("num_row_groups", Ty::uint64_datatype(), false),
            ColumnSchema::new("num_series", Ty::uint64_datatype(), true),
            ColumnSchema::new("min_ts", Ty::timestamp_nanosecond_datatype(), true),
            ColumnSchema::new("max_ts", Ty::timestamp_nanosecond_datatype(), true),
            ColumnSchema::new("sequence", Ty::uint64_datatype(), true),
            ColumnSchema::new("origin_region_id", Ty::uint64_datatype(), false),
            ColumnSchema::new("node_id", Ty::uint64_datatype(), true),
            ColumnSchema::new("visible", Ty::boolean_datatype(), false),
        ]))
    }

    /// Converts the given entries to a record batch matching [`Self::schema`].
    pub fn to_record_batch(entries: &[Self]) -> std::result::Result<DfRecordBatch, ArrowError> {
        let schema = Self::schema();
        let table_dirs = entries.iter().map(|e| e.table_dir.as_str());
        let region_ids = entries.iter().map(|e| e.region_id.as_u64());
        let table_ids = entries.iter().map(|e| e.table_id);
        let region_numbers = entries.iter().map(|e| e.region_number);
        let region_groups = entries.iter().map(|e| e.region_group);
        let region_sequences = entries.iter().map(|e| e.region_sequence);
        let file_ids = entries.iter().map(|e| e.file_id.as_str());
        let index_versions = entries.iter().map(|e| e.index_version);
        let levels = entries.iter().map(|e| e.level);
        let file_paths = entries.iter().map(|e| e.file_path.as_str());
        let file_sizes = entries.iter().map(|e| e.file_size);
        let index_file_paths = entries.iter().map(|e| e.index_file_path.as_ref());
        let index_file_sizes = entries.iter().map(|e| e.index_file_size);
        let num_rows = entries.iter().map(|e| e.num_rows);
        let num_row_groups = entries.iter().map(|e| e.num_row_groups);
        let num_series = entries.iter().map(|e| e.num_series);
        // Timestamps are normalized to nanoseconds; values that cannot be
        // converted become null.
        let min_ts = entries.iter().map(|e| {
            e.min_ts
                .convert_to(TimeUnit::Nanosecond)
                .map(|ts| ts.value())
        });
        let max_ts = entries.iter().map(|e| {
            e.max_ts
                .convert_to(TimeUnit::Nanosecond)
                .map(|ts| ts.value())
        });
        let sequences = entries.iter().map(|e| e.sequence);
        let origin_region_ids = entries.iter().map(|e| e.origin_region_id.as_u64());
        let node_ids = entries.iter().map(|e| e.node_id);
        let visible_flags = entries.iter().map(|e| Some(e.visible));

        let columns: Vec<ArrayRef> = vec![
            Arc::new(StringArray::from_iter_values(table_dirs)),
            Arc::new(UInt64Array::from_iter_values(region_ids)),
            Arc::new(UInt32Array::from_iter_values(table_ids)),
            Arc::new(UInt32Array::from_iter_values(region_numbers)),
            Arc::new(UInt8Array::from_iter_values(region_groups)),
            Arc::new(UInt32Array::from_iter_values(region_sequences)),
            Arc::new(StringArray::from_iter_values(file_ids)),
            Arc::new(UInt64Array::from_iter_values(index_versions)),
            Arc::new(UInt8Array::from_iter_values(levels)),
            Arc::new(StringArray::from_iter_values(file_paths)),
            Arc::new(UInt64Array::from_iter_values(file_sizes)),
            Arc::new(StringArray::from_iter(index_file_paths)),
            Arc::new(UInt64Array::from_iter(index_file_sizes)),
            Arc::new(UInt64Array::from_iter_values(num_rows)),
            Arc::new(UInt64Array::from_iter_values(num_row_groups)),
            Arc::new(UInt64Array::from_iter(num_series)),
            Arc::new(TimestampNanosecondArray::from_iter(min_ts)),
            Arc::new(TimestampNanosecondArray::from_iter(max_ts)),
            Arc::new(UInt64Array::from_iter(sequences)),
            Arc::new(UInt64Array::from_iter_values(origin_region_ids)),
            Arc::new(UInt64Array::from_iter(node_ids)),
            Arc::new(BooleanArray::from_iter(visible_flags)),
        ];

        DfRecordBatch::try_new(schema.arrow_schema().clone(), columns)
    }

    /// Reserved table name for inspecting SST entries recorded in region
    /// manifests.
    pub fn reserved_table_name_for_inspection() -> &'static str {
        "__inspect/__mito/__sst_manifest"
    }

    /// Builds a logical plan that scans the manifest SST entries with the
    /// given scan request.
    pub fn build_plan(scan_request: ScanRequest) -> Result<LogicalPlan, DataFusionError> {
        build_plan_helper(
            scan_request,
            Self::reserved_table_name_for_inspection(),
            Self::schema(),
        )
    }
}

/// An SST file entry discovered by listing the storage layer.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
pub struct StorageSstEntry {
    /// Full path of the SST file.
    pub file_path: String,
    /// Size of the SST file in bytes, if known.
    pub file_size: Option<u64>,
    /// Last modified time of the file, if known.
    pub last_modified_ms: Option<Timestamp>,
    /// Id of the node that reported the entry, if known.
    pub node_id: Option<u64>,
}

impl StorageSstEntry {
    /// Returns the schema of the storage SST entries.
    pub fn schema() -> SchemaRef {
        use datatypes::prelude::ConcreteDataType as Ty;
        Arc::new(Schema::new(vec![
            ColumnSchema::new("file_path", Ty::string_datatype(), false),
            ColumnSchema::new("file_size", Ty::uint64_datatype(), true),
            ColumnSchema::new(
                "last_modified_ms",
                Ty::timestamp_millisecond_datatype(),
                true,
            ),
            ColumnSchema::new("node_id", Ty::uint64_datatype(), true),
        ]))
    }

    /// Converts the given entries to a record batch matching [`Self::schema`].
    pub fn to_record_batch(entries: &[Self]) -> std::result::Result<DfRecordBatch, ArrowError> {
        let schema = Self::schema();
        let file_paths = entries.iter().map(|e| e.file_path.as_str());
        let file_sizes = entries.iter().map(|e| e.file_size);
        // Timestamps are normalized to milliseconds; values that cannot be
        // converted become null.
        let last_modified_ms = entries.iter().map(|e| {
            e.last_modified_ms
                .and_then(|ts| ts.convert_to(TimeUnit::Millisecond).map(|ts| ts.value()))
        });
        let node_ids = entries.iter().map(|e| e.node_id);

        let columns: Vec<ArrayRef> = vec![
            Arc::new(StringArray::from_iter_values(file_paths)),
            Arc::new(UInt64Array::from_iter(file_sizes)),
            Arc::new(TimestampMillisecondArray::from_iter(last_modified_ms)),
            Arc::new(UInt64Array::from_iter(node_ids)),
        ];

        DfRecordBatch::try_new(schema.arrow_schema().clone(), columns)
    }

    /// Reserved table name for inspecting SST entries discovered in storage.
    pub fn reserved_table_name_for_inspection() -> &'static str {
        "__inspect/__mito/__sst_storage"
    }

    /// Builds a logical plan that scans the storage SST entries with the given
    /// scan request.
    pub fn build_plan(scan_request: ScanRequest) -> Result<LogicalPlan, DataFusionError> {
        build_plan_helper(
            scan_request,
            Self::reserved_table_name_for_inspection(),
            Self::schema(),
        )
    }
}

/// A puffin index metadata entry describing one index blob in an index file.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
pub struct PuffinIndexMetaEntry {
    /// Directory of the table that owns the index file.
    pub table_dir: String,
    /// Full path of the index file.
    pub index_file_path: String,
    /// Id of the region that owns the index file.
    pub region_id: RegionId,
    /// Id of the table that owns the index file.
    pub table_id: TableId,
    /// Number of the region that owns the index file.
    pub region_number: RegionNumber,
    /// Group part of the region number.
    pub region_group: RegionGroup,
    /// Sequence part of the region number.
    pub region_sequence: RegionSeq,
    /// Id of the SST file the index belongs to.
    pub file_id: String,
    /// Size of the index file in bytes, if known.
    pub index_file_size: Option<u64>,
    /// Type of the index (e.g. bloom filter or inverted index).
    pub index_type: String,
    /// Type of the index target.
    pub target_type: String,
    /// Key of the index target.
    pub target_key: String,
    /// JSON representation of the index target.
    pub target_json: String,
    /// Size of the index blob in bytes.
    pub blob_size: u64,
    /// Index metadata encoded as JSON, if any.
    pub meta_json: Option<String>,
    /// Id of the node that reported the entry, if known.
    pub node_id: Option<u64>,
}

impl PuffinIndexMetaEntry {
    /// Returns the schema of the puffin index metadata entries.
    pub fn schema() -> SchemaRef {
        use datatypes::prelude::ConcreteDataType as Ty;
        Arc::new(Schema::new(vec![
            ColumnSchema::new("table_dir", Ty::string_datatype(), false),
            ColumnSchema::new("index_file_path", Ty::string_datatype(), false),
            ColumnSchema::new("region_id", Ty::uint64_datatype(), false),
            ColumnSchema::new("table_id", Ty::uint32_datatype(), false),
            ColumnSchema::new("region_number", Ty::uint32_datatype(), false),
            ColumnSchema::new("region_group", Ty::uint8_datatype(), false),
            ColumnSchema::new("region_sequence", Ty::uint32_datatype(), false),
            ColumnSchema::new("file_id", Ty::string_datatype(), false),
            ColumnSchema::new("index_file_size", Ty::uint64_datatype(), true),
            ColumnSchema::new("index_type", Ty::string_datatype(), false),
            ColumnSchema::new("target_type", Ty::string_datatype(), false),
            ColumnSchema::new("target_key", Ty::string_datatype(), false),
            ColumnSchema::new("target_json", Ty::string_datatype(), false),
            ColumnSchema::new("blob_size", Ty::uint64_datatype(), false),
            ColumnSchema::new("meta_json", Ty::string_datatype(), true),
            ColumnSchema::new("node_id", Ty::uint64_datatype(), true),
        ]))
    }

    /// Converts the given entries to a record batch matching [`Self::schema`].
    pub fn to_record_batch(entries: &[Self]) -> std::result::Result<DfRecordBatch, ArrowError> {
        let schema = Self::schema();
        let table_dirs = entries.iter().map(|e| e.table_dir.as_str());
        let index_file_paths = entries.iter().map(|e| e.index_file_path.as_str());
        let region_ids = entries.iter().map(|e| e.region_id.as_u64());
        let table_ids = entries.iter().map(|e| e.table_id);
        let region_numbers = entries.iter().map(|e| e.region_number);
        let region_groups = entries.iter().map(|e| e.region_group);
        let region_sequences = entries.iter().map(|e| e.region_sequence);
        let file_ids = entries.iter().map(|e| e.file_id.as_str());
        let index_file_sizes = entries.iter().map(|e| e.index_file_size);
        let index_types = entries.iter().map(|e| e.index_type.as_str());
        let target_types = entries.iter().map(|e| e.target_type.as_str());
        let target_keys = entries.iter().map(|e| e.target_key.as_str());
        let target_jsons = entries.iter().map(|e| e.target_json.as_str());
        let blob_sizes = entries.iter().map(|e| e.blob_size);
        let meta_jsons = entries.iter().map(|e| e.meta_json.as_deref());
        let node_ids = entries.iter().map(|e| e.node_id);

        let columns: Vec<ArrayRef> = vec![
            Arc::new(StringArray::from_iter_values(table_dirs)),
            Arc::new(StringArray::from_iter_values(index_file_paths)),
            Arc::new(UInt64Array::from_iter_values(region_ids)),
            Arc::new(UInt32Array::from_iter_values(table_ids)),
            Arc::new(UInt32Array::from_iter_values(region_numbers)),
            Arc::new(UInt8Array::from_iter_values(region_groups)),
            Arc::new(UInt32Array::from_iter_values(region_sequences)),
            Arc::new(StringArray::from_iter_values(file_ids)),
            Arc::new(UInt64Array::from_iter(index_file_sizes)),
            Arc::new(StringArray::from_iter_values(index_types)),
            Arc::new(StringArray::from_iter_values(target_types)),
            Arc::new(StringArray::from_iter_values(target_keys)),
            Arc::new(StringArray::from_iter_values(target_jsons)),
            Arc::new(UInt64Array::from_iter_values(blob_sizes)),
            Arc::new(StringArray::from_iter(meta_jsons)),
            Arc::new(UInt64Array::from_iter(node_ids)),
        ];

        DfRecordBatch::try_new(schema.arrow_schema().clone(), columns)
    }

    /// Reserved table name for inspecting puffin index metadata.
    pub fn reserved_table_name_for_inspection() -> &'static str {
        "__inspect/__mito/__puffin_index_meta"
    }

    /// Builds a logical plan that scans the puffin index metadata entries with
    /// the given scan request.
    pub fn build_plan(scan_request: ScanRequest) -> Result<LogicalPlan, DataFusionError> {
        build_plan_helper(
            scan_request,
            Self::reserved_table_name_for_inspection(),
            Self::schema(),
        )
    }
}

/// Builds a `TableScan`-based logical plan over the given schema, applying the
/// projection, filters, and limit from the scan request.
fn build_plan_helper(
    scan_request: ScanRequest,
    table_name: &str,
    schema: SchemaRef,
) -> Result<LogicalPlan, DataFusionError> {
    let table_source = LogicalTableSource::new(schema.arrow_schema().clone());

    let projection = scan_request.projection_input.map(|input| input.projection);
    let mut builder = LogicalPlanBuilder::scan(table_name, Arc::new(table_source), projection)?;

    for filter in scan_request.filters {
        builder = builder.filter(filter)?;
    }

    if let Some(limit) = scan_request.limit {
        builder = builder.limit(0, Some(limit))?;
    }

    builder.build()
}

#[cfg(test)]
mod tests {
    use datafusion_common::TableReference;
    use datafusion_expr::{LogicalPlan, Operator, binary_expr, col, lit};
    use datatypes::arrow::array::{
        Array, TimestampMillisecondArray, TimestampNanosecondArray, UInt8Array, UInt32Array,
        UInt64Array,
    };
    use datatypes::arrow_array::StringArray;

    use super::*;

    #[test]
    fn test_sst_entry_manifest_to_record_batch() {
        let table_id1: TableId = 1;
        let region_group1: RegionGroup = 2;
        let region_seq1: RegionSeq = 3;
        let region_number1: RegionNumber = ((region_group1 as u32) << 24) | region_seq1;
        let region_id1 = RegionId::with_group_and_seq(table_id1, region_group1, region_seq1);

        let table_id2: TableId = 5;
        let region_group2: RegionGroup = 1;
        let region_seq2: RegionSeq = 42;
        let region_number2: RegionNumber = ((region_group2 as u32) << 24) | region_seq2;
        let region_id2 = RegionId::with_group_and_seq(table_id2, region_group2, region_seq2);

        let entries = vec![
            ManifestSstEntry {
                table_dir: "tdir1".to_string(),
                region_id: region_id1,
                table_id: table_id1,
                region_number: region_number1,
                region_group: region_group1,
                region_sequence: region_seq1,
                file_id: "f1".to_string(),
                index_version: 0,
                level: 1,
                file_path: "/p1".to_string(),
                file_size: 100,
                index_file_path: None,
                index_file_size: None,
                num_rows: 10,
                num_row_groups: 2,
                num_series: Some(5),
                min_ts: Timestamp::new_millisecond(1000),
                max_ts: Timestamp::new_second(2),
                sequence: None,
                origin_region_id: region_id1,
                node_id: Some(1),
                visible: false,
            },
            ManifestSstEntry {
                table_dir: "tdir2".to_string(),
                region_id: region_id2,
                table_id: table_id2,
                region_number: region_number2,
                region_group: region_group2,
                region_sequence: region_seq2,
                file_id: "f2".to_string(),
                index_version: 1,
                level: 3,
                file_path: "/p2".to_string(),
                file_size: 200,
                index_file_path: Some("idx".to_string()),
                index_file_size: Some(11),
                num_rows: 20,
                num_row_groups: 4,
                num_series: None,
                min_ts: Timestamp::new_nanosecond(5),
                max_ts: Timestamp::new_microsecond(2000),
                sequence: Some(9),
                origin_region_id: region_id2,
                node_id: None,
                visible: true,
            },
        ];

        let schema = ManifestSstEntry::schema();
        let batch = ManifestSstEntry::to_record_batch(&entries).unwrap();

        // The batch must match the declared schema field by field.
        assert_eq!(schema.arrow_schema().fields().len(), batch.num_columns());
        assert_eq!(2, batch.num_rows());
        for (i, f) in schema.arrow_schema().fields().iter().enumerate() {
            assert_eq!(f.name(), batch.schema().field(i).name());
            assert_eq!(f.is_nullable(), batch.schema().field(i).is_nullable());
            assert_eq!(f.data_type(), batch.schema().field(i).data_type());
        }

        let table_dirs = batch
            .column(0)
            .as_any()
            .downcast_ref::<StringArray>()
            .unwrap();
        assert_eq!("tdir1", table_dirs.value(0));
        assert_eq!("tdir2", table_dirs.value(1));

        let region_ids = batch
            .column(1)
            .as_any()
            .downcast_ref::<UInt64Array>()
            .unwrap();
        assert_eq!(region_id1.as_u64(), region_ids.value(0));
        assert_eq!(region_id2.as_u64(), region_ids.value(1));

        let table_ids = batch
            .column(2)
            .as_any()
            .downcast_ref::<UInt32Array>()
            .unwrap();
        assert_eq!(table_id1, table_ids.value(0));
        assert_eq!(table_id2, table_ids.value(1));

        let region_numbers = batch
            .column(3)
            .as_any()
            .downcast_ref::<UInt32Array>()
            .unwrap();
        assert_eq!(region_number1, region_numbers.value(0));
        assert_eq!(region_number2, region_numbers.value(1));

        let region_groups = batch
            .column(4)
            .as_any()
            .downcast_ref::<UInt8Array>()
            .unwrap();
        assert_eq!(region_group1, region_groups.value(0));
        assert_eq!(region_group2, region_groups.value(1));

        let region_sequences = batch
            .column(5)
            .as_any()
            .downcast_ref::<UInt32Array>()
            .unwrap();
        assert_eq!(region_seq1, region_sequences.value(0));
        assert_eq!(region_seq2, region_sequences.value(1));

        let file_ids = batch
            .column(6)
            .as_any()
            .downcast_ref::<StringArray>()
            .unwrap();
        assert_eq!("f1", file_ids.value(0));
        assert_eq!("f2", file_ids.value(1));

        let index_versions = batch
            .column(7)
            .as_any()
            .downcast_ref::<UInt64Array>()
            .unwrap();
        assert_eq!(0, index_versions.value(0));
        assert_eq!(1, index_versions.value(1));

        let levels = batch
            .column(8)
            .as_any()
            .downcast_ref::<UInt8Array>()
            .unwrap();
        assert_eq!(1, levels.value(0));
        assert_eq!(3, levels.value(1));

        let file_paths = batch
            .column(9)
            .as_any()
            .downcast_ref::<StringArray>()
            .unwrap();
        assert_eq!("/p1", file_paths.value(0));
        assert_eq!("/p2", file_paths.value(1));

        let file_sizes = batch
            .column(10)
            .as_any()
            .downcast_ref::<UInt64Array>()
            .unwrap();
        assert_eq!(100, file_sizes.value(0));
        assert_eq!(200, file_sizes.value(1));

        let index_file_paths = batch
            .column(11)
            .as_any()
            .downcast_ref::<StringArray>()
            .unwrap();
        assert!(index_file_paths.is_null(0));
        assert_eq!("idx", index_file_paths.value(1));

        let index_file_sizes = batch
            .column(12)
            .as_any()
            .downcast_ref::<UInt64Array>()
            .unwrap();
        assert!(index_file_sizes.is_null(0));
        assert_eq!(11, index_file_sizes.value(1));

        let num_rows = batch
            .column(13)
            .as_any()
            .downcast_ref::<UInt64Array>()
            .unwrap();
        assert_eq!(10, num_rows.value(0));
        assert_eq!(20, num_rows.value(1));

        let num_row_groups = batch
            .column(14)
            .as_any()
            .downcast_ref::<UInt64Array>()
            .unwrap();
        assert_eq!(2, num_row_groups.value(0));
        assert_eq!(4, num_row_groups.value(1));

        let num_series = batch
            .column(15)
            .as_any()
            .downcast_ref::<UInt64Array>()
            .unwrap();
        assert_eq!(5, num_series.value(0));
        assert!(num_series.is_null(1));

        let min_ts = batch
            .column(16)
            .as_any()
            .downcast_ref::<TimestampNanosecondArray>()
            .unwrap();
        // 1000 ms and 5 ns, both normalized to nanoseconds.
        assert_eq!(1_000_000_000, min_ts.value(0));
        assert_eq!(5, min_ts.value(1));

        let max_ts = batch
            .column(17)
            .as_any()
            .downcast_ref::<TimestampNanosecondArray>()
            .unwrap();
        // 2 s and 2000 us, both normalized to nanoseconds.
        assert_eq!(2_000_000_000, max_ts.value(0));
        assert_eq!(2_000_000, max_ts.value(1));

        let sequences = batch
            .column(18)
            .as_any()
            .downcast_ref::<UInt64Array>()
            .unwrap();
        assert!(sequences.is_null(0));
        assert_eq!(9, sequences.value(1));

        let origin_region_ids = batch
            .column(19)
            .as_any()
            .downcast_ref::<UInt64Array>()
            .unwrap();
        assert_eq!(region_id1.as_u64(), origin_region_ids.value(0));
        assert_eq!(region_id2.as_u64(), origin_region_ids.value(1));

        let node_ids = batch
            .column(20)
            .as_any()
            .downcast_ref::<UInt64Array>()
            .unwrap();
        assert_eq!(1, node_ids.value(0));
        assert!(node_ids.is_null(1));

        let visible = batch
            .column(21)
            .as_any()
            .downcast_ref::<BooleanArray>()
            .unwrap();
        assert!(!visible.value(0));
        assert!(visible.value(1));
    }
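
    // Added sketch: the nullable columns of the schema should correspond
    // exactly to the `Option`-typed fields of `ManifestSstEntry`, plus the two
    // timestamp columns whose unit conversion may yield null. Only APIs
    // exercised above are used here.
    #[test]
    fn test_manifest_schema_nullability() {
        let schema = ManifestSstEntry::schema();
        let nullable: Vec<_> = schema
            .arrow_schema()
            .fields()
            .iter()
            .filter(|f| f.is_nullable())
            .map(|f| f.name().as_str())
            .collect();
        assert_eq!(
            nullable,
            vec![
                "index_file_path",
                "index_file_size",
                "num_series",
                "min_ts",
                "max_ts",
                "sequence",
                "node_id"
            ]
        );
    }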

    #[test]
    fn test_sst_entry_storage_to_record_batch() {
        let entries = vec![
            StorageSstEntry {
                file_path: "/s1".to_string(),
                file_size: None,
                last_modified_ms: None,
                node_id: Some(1),
            },
            StorageSstEntry {
                file_path: "/s2".to_string(),
                file_size: Some(123),
                last_modified_ms: Some(Timestamp::new_millisecond(456)),
                node_id: None,
            },
        ];

        let schema = StorageSstEntry::schema();
        let batch = StorageSstEntry::to_record_batch(&entries).unwrap();

        assert_eq!(schema.arrow_schema().fields().len(), batch.num_columns());
        assert_eq!(2, batch.num_rows());

        let file_paths = batch
            .column(0)
            .as_any()
            .downcast_ref::<StringArray>()
            .unwrap();
        assert_eq!("/s1", file_paths.value(0));
        assert_eq!("/s2", file_paths.value(1));

        let file_sizes = batch
            .column(1)
            .as_any()
            .downcast_ref::<UInt64Array>()
            .unwrap();
        assert!(file_sizes.is_null(0));
        assert_eq!(123, file_sizes.value(1));

        let last_modified = batch
            .column(2)
            .as_any()
            .downcast_ref::<TimestampMillisecondArray>()
            .unwrap();
        assert!(last_modified.is_null(0));
        assert_eq!(456, last_modified.value(1));

        let node_ids = batch
            .column(3)
            .as_any()
            .downcast_ref::<UInt64Array>()
            .unwrap();
        assert_eq!(1, node_ids.value(0));
        assert!(node_ids.is_null(1));
    }
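
    // Added sketch: `to_record_batch` should also accept an empty slice,
    // producing a zero-row batch that still carries the full schema. This only
    // relies on the iterator-based construction shown above.
    #[test]
    fn test_storage_to_record_batch_empty() {
        let schema = StorageSstEntry::schema();
        let batch = StorageSstEntry::to_record_batch(&[]).unwrap();
        assert_eq!(schema.arrow_schema().fields().len(), batch.num_columns());
        assert_eq!(0, batch.num_rows());
    }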

    #[test]
    fn test_puffin_index_meta_to_record_batch() {
        let entries = vec![
            PuffinIndexMetaEntry {
                table_dir: "table1".to_string(),
                index_file_path: "index1".to_string(),
                region_id: RegionId::with_group_and_seq(10, 0, 20),
                table_id: 10,
                region_number: 20,
                region_group: 0,
                region_sequence: 20,
                file_id: "file1".to_string(),
                index_file_size: Some(1024),
                index_type: "bloom_filter".to_string(),
                target_type: "column".to_string(),
                target_key: "1".to_string(),
                target_json: "{\"column\":1}".to_string(),
                blob_size: 256,
                meta_json: Some("{\"bloom\":{}}".to_string()),
                node_id: Some(42),
            },
            PuffinIndexMetaEntry {
                table_dir: "table2".to_string(),
                index_file_path: "index2".to_string(),
                region_id: RegionId::with_group_and_seq(11, 0, 21),
                table_id: 11,
                region_number: 21,
                region_group: 0,
                region_sequence: 21,
                file_id: "file2".to_string(),
                index_file_size: None,
                index_type: "inverted".to_string(),
                target_type: "unknown".to_string(),
                target_key: "legacy".to_string(),
                target_json: "{}".to_string(),
                blob_size: 0,
                meta_json: None,
                node_id: None,
            },
        ];

        let schema = PuffinIndexMetaEntry::schema();
        let batch = PuffinIndexMetaEntry::to_record_batch(&entries).unwrap();

        assert_eq!(schema.arrow_schema().fields().len(), batch.num_columns());
        assert_eq!(2, batch.num_rows());

        let table_dirs = batch
            .column(0)
            .as_any()
            .downcast_ref::<StringArray>()
            .unwrap();
        assert_eq!("table1", table_dirs.value(0));
        assert_eq!("table2", table_dirs.value(1));

        let index_file_paths = batch
            .column(1)
            .as_any()
            .downcast_ref::<StringArray>()
            .unwrap();
        assert_eq!("index1", index_file_paths.value(0));
        assert_eq!("index2", index_file_paths.value(1));

        let region_ids = batch
            .column(2)
            .as_any()
            .downcast_ref::<UInt64Array>()
            .unwrap();
        assert_eq!(
            RegionId::with_group_and_seq(10, 0, 20).as_u64(),
            region_ids.value(0)
        );
        assert_eq!(
            RegionId::with_group_and_seq(11, 0, 21).as_u64(),
            region_ids.value(1)
        );

        let table_ids = batch
            .column(3)
            .as_any()
            .downcast_ref::<UInt32Array>()
            .unwrap();
        assert_eq!(10, table_ids.value(0));
        assert_eq!(11, table_ids.value(1));

        let region_numbers = batch
            .column(4)
            .as_any()
            .downcast_ref::<UInt32Array>()
            .unwrap();
        assert_eq!(20, region_numbers.value(0));
        assert_eq!(21, region_numbers.value(1));

        let region_groups = batch
            .column(5)
            .as_any()
            .downcast_ref::<UInt8Array>()
            .unwrap();
        assert_eq!(0, region_groups.value(0));
        assert_eq!(0, region_groups.value(1));

        let region_sequences = batch
            .column(6)
            .as_any()
            .downcast_ref::<UInt32Array>()
            .unwrap();
        assert_eq!(20, region_sequences.value(0));
        assert_eq!(21, region_sequences.value(1));

        let file_ids = batch
            .column(7)
            .as_any()
            .downcast_ref::<StringArray>()
            .unwrap();
        assert_eq!("file1", file_ids.value(0));
        assert_eq!("file2", file_ids.value(1));

        let index_file_sizes = batch
            .column(8)
            .as_any()
            .downcast_ref::<UInt64Array>()
            .unwrap();
        assert_eq!(1024, index_file_sizes.value(0));
        assert!(index_file_sizes.is_null(1));

        let index_types = batch
            .column(9)
            .as_any()
            .downcast_ref::<StringArray>()
            .unwrap();
        assert_eq!("bloom_filter", index_types.value(0));
        assert_eq!("inverted", index_types.value(1));

        let target_types = batch
            .column(10)
            .as_any()
            .downcast_ref::<StringArray>()
            .unwrap();
        assert_eq!("column", target_types.value(0));
        assert_eq!("unknown", target_types.value(1));

        let target_keys = batch
            .column(11)
            .as_any()
            .downcast_ref::<StringArray>()
            .unwrap();
        assert_eq!("1", target_keys.value(0));
        assert_eq!("legacy", target_keys.value(1));

        let target_json = batch
            .column(12)
            .as_any()
            .downcast_ref::<StringArray>()
            .unwrap();
        assert_eq!("{\"column\":1}", target_json.value(0));
        assert_eq!("{}", target_json.value(1));

        let blob_sizes = batch
            .column(13)
            .as_any()
            .downcast_ref::<UInt64Array>()
            .unwrap();
        assert_eq!(256, blob_sizes.value(0));
        assert_eq!(0, blob_sizes.value(1));

        let meta_jsons = batch
            .column(14)
            .as_any()
            .downcast_ref::<StringArray>()
            .unwrap();
        assert_eq!("{\"bloom\":{}}", meta_jsons.value(0));
        assert!(meta_jsons.is_null(1));

        let node_ids = batch
            .column(15)
            .as_any()
            .downcast_ref::<UInt64Array>()
            .unwrap();
        assert_eq!(42, node_ids.value(0));
        assert!(node_ids.is_null(1));
    }
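
    // Added sketch: the reserved inspection table names are plain constants,
    // so we can pin down that they are distinct and share the `__inspect/`
    // prefix.
    #[test]
    fn test_reserved_table_names() {
        let names = [
            ManifestSstEntry::reserved_table_name_for_inspection(),
            StorageSstEntry::reserved_table_name_for_inspection(),
            PuffinIndexMetaEntry::reserved_table_name_for_inspection(),
        ];
        for (i, name) in names.iter().enumerate() {
            assert!(name.starts_with("__inspect/"));
            for other in &names[i + 1..] {
                assert_ne!(name, other);
            }
        }
    }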

    #[test]
    fn test_manifest_build_plan() {
        let projection_input = Some(vec![0, 1, 2].into());
        let request = ScanRequest {
            projection_input,
            filters: vec![binary_expr(col("table_id"), Operator::Gt, lit(0))],
            limit: Some(5),
            ..Default::default()
        };

        let plan = ManifestSstEntry::build_plan(request).unwrap();

        // The builder applies the filter and then the limit on top of the
        // scan, so the plan is `Limit -> Filter -> TableScan`.
        let (scan, has_filter, has_limit) = extract_scan(&plan);

        assert!(has_filter);
        assert!(has_limit);
        assert_eq!(
            scan.table_name,
            TableReference::bare(ManifestSstEntry::reserved_table_name_for_inspection())
        );
        assert_eq!(scan.projection, Some(vec![0, 1, 2]));

        // The projected schema keeps only the first three columns.
        let fields = scan.projected_schema.fields();
        assert_eq!(fields.len(), 3);
        assert_eq!(fields[0].name(), "table_dir");
        assert_eq!(fields[1].name(), "region_id");
        assert_eq!(fields[2].name(), "table_id");
    }
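
    // Added sketch: with a default `ScanRequest` (assuming its `Default` impl
    // leaves projection, filters, and limit empty, as the tests above
    // suggest), `build_plan` should produce a bare `TableScan` over the full
    // schema.
    #[test]
    fn test_manifest_build_plan_default() {
        let plan = ManifestSstEntry::build_plan(ScanRequest::default()).unwrap();
        let (scan, has_filter, has_limit) = extract_scan(&plan);
        assert!(!has_filter);
        assert!(!has_limit);
        assert_eq!(scan.projection, None);
        assert_eq!(
            scan.projected_schema.fields().len(),
            ManifestSstEntry::schema().arrow_schema().fields().len()
        );
    }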

    #[test]
    fn test_storage_build_plan() {
        let projection_input = Some(vec![0, 2].into());
        let request = ScanRequest {
            projection_input,
            filters: vec![binary_expr(col("file_path"), Operator::Eq, lit("/a"))],
            limit: Some(1),
            ..Default::default()
        };

        let plan = StorageSstEntry::build_plan(request).unwrap();
        let (scan, has_filter, has_limit) = extract_scan(&plan);
        assert!(has_filter);
        assert!(has_limit);
        assert_eq!(
            scan.table_name,
            TableReference::bare(StorageSstEntry::reserved_table_name_for_inspection())
        );
        assert_eq!(scan.projection, Some(vec![0, 2]));

        let fields = scan.projected_schema.fields();
        assert_eq!(fields.len(), 2);
        assert_eq!(fields[0].name(), "file_path");
        assert_eq!(fields[1].name(), "last_modified_ms");
    }
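
    // Added sketch: `PuffinIndexMetaEntry::build_plan` goes through the same
    // `build_plan_helper` as the other entries, so it should behave
    // identically; this mirrors the two tests above.
    #[test]
    fn test_puffin_build_plan() {
        let request = ScanRequest {
            projection_input: Some(vec![0, 1].into()),
            filters: vec![binary_expr(col("blob_size"), Operator::Gt, lit(0))],
            limit: Some(2),
            ..Default::default()
        };

        let plan = PuffinIndexMetaEntry::build_plan(request).unwrap();
        let (scan, has_filter, has_limit) = extract_scan(&plan);
        assert!(has_filter);
        assert!(has_limit);
        assert_eq!(
            scan.table_name,
            TableReference::bare(PuffinIndexMetaEntry::reserved_table_name_for_inspection())
        );
        assert_eq!(scan.projection, Some(vec![0, 1]));

        let fields = scan.projected_schema.fields();
        assert_eq!(fields.len(), 2);
        assert_eq!(fields[0].name(), "table_dir");
        assert_eq!(fields[1].name(), "index_file_path");
    }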

    /// Walks the plan down to the underlying `TableScan`, recording whether a
    /// filter or a limit node was seen along the way.
    fn extract_scan(plan: &LogicalPlan) -> (&datafusion_expr::logical_plan::TableScan, bool, bool) {
        use datafusion_expr::logical_plan::Limit;

        match plan {
            LogicalPlan::Filter(f) => {
                let (scan, _, has_limit) = extract_scan(&f.input);
                (scan, true, has_limit)
            }
            LogicalPlan::Limit(Limit { input, .. }) => {
                let (scan, has_filter, _) = extract_scan(input);
                (scan, has_filter, true)
            }
            LogicalPlan::TableScan(scan) => (scan, false, false),
            other => panic!("unexpected plan: {other:?}"),
        }
    }
}