1use std::sync::Arc;
16
17use bytes::Bytes;
18use common_recordbatch::DfRecordBatch;
19use common_time::Timestamp;
20use common_time::timestamp::TimeUnit;
21use datafusion_common::DataFusionError;
22use datafusion_expr::{LogicalPlan, LogicalPlanBuilder, LogicalTableSource};
23use datatypes::arrow::array::{
24 ArrayRef, BinaryArray, BooleanArray, TimestampMillisecondArray, TimestampNanosecondArray,
25 UInt8Array, UInt32Array, UInt64Array,
26};
27use datatypes::arrow::error::ArrowError;
28use datatypes::arrow_array::StringArray;
29use datatypes::schema::{ColumnSchema, Schema, SchemaRef};
30use serde::{Deserialize, Serialize};
31
32use crate::storage::{RegionGroup, RegionId, RegionNumber, RegionSeq, ScanRequest, TableId};
33
/// One row of the SST manifest inspection table.
///
/// A slice of entries is converted into an Arrow record batch by
/// [`ManifestSstEntry::to_record_batch`]; each field becomes the column of the same name, in
/// the order declared by [`ManifestSstEntry::schema`].
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
pub struct ManifestSstEntry {
    /// Table directory; exported as the non-nullable string column `table_dir`.
    pub table_dir: String,
    /// Region id; exported through `RegionId::as_u64` as a non-nullable `u64` column.
    pub region_id: RegionId,
    /// Table id; exported as a non-nullable `u32` column.
    pub table_id: TableId,
    /// Region number; exported as a non-nullable `u32` column.
    ///
    /// NOTE(review): the unit tests build this as `(region_group << 24) | region_sequence`;
    /// confirm that this is the general encoding.
    pub region_number: RegionNumber,
    /// Region group; exported as a non-nullable `u8` column.
    pub region_group: RegionGroup,
    /// Region sequence; exported as a non-nullable `u32` column.
    pub region_sequence: RegionSeq,
    /// File id in string form; exported as a non-nullable string column.
    pub file_id: String,
    /// Index version; exported as a non-nullable `u64` column.
    pub index_version: u64,
    /// Level of the file (presumably the compaction level; confirm).
    pub level: u8,
    /// Path of the file; exported as a non-nullable string column.
    pub file_path: String,
    /// Size of the file (presumably in bytes; confirm).
    pub file_size: u64,
    /// Path of the index file; `None` is exported as null.
    pub index_file_path: Option<String>,
    /// Size of the index file; `None` is exported as null.
    pub index_file_size: Option<u64>,
    /// Number of rows; exported as a non-nullable `u64` column.
    pub num_rows: u64,
    /// Number of row groups; exported as a non-nullable `u64` column.
    pub num_row_groups: u64,
    /// Number of series; `None` is exported as null.
    pub num_series: Option<u64>,
    /// Minimum timestamp, in any time unit.
    ///
    /// Converted to nanoseconds on export; the column is null when the conversion yields `None`.
    pub min_ts: Timestamp,
    /// Maximum timestamp, in any time unit.
    ///
    /// Converted to nanoseconds on export; the column is null when the conversion yields `None`.
    pub max_ts: Timestamp,
    /// Sequence value; `None` is exported as null.
    pub sequence: Option<u64>,
    /// Origin region id; exported through `RegionId::as_u64` as a non-nullable `u64` column.
    pub origin_region_id: RegionId,
    /// Node id; `None` is exported as null.
    pub node_id: Option<u64>,
    /// Visibility flag; exported as a non-nullable boolean column.
    pub visible: bool,
    /// Minimum primary key as raw bytes; `None` is exported as null.
    pub primary_key_min: Option<Bytes>,
    /// Maximum primary key as raw bytes; `None` is exported as null.
    pub primary_key_max: Option<Bytes>,
}
86
87impl ManifestSstEntry {
88 pub fn schema() -> SchemaRef {
90 use datatypes::prelude::ConcreteDataType as Ty;
91 Arc::new(Schema::new(vec![
92 ColumnSchema::new("table_dir", Ty::string_datatype(), false),
93 ColumnSchema::new("region_id", Ty::uint64_datatype(), false),
94 ColumnSchema::new("table_id", Ty::uint32_datatype(), false),
95 ColumnSchema::new("region_number", Ty::uint32_datatype(), false),
96 ColumnSchema::new("region_group", Ty::uint8_datatype(), false),
97 ColumnSchema::new("region_sequence", Ty::uint32_datatype(), false),
98 ColumnSchema::new("file_id", Ty::string_datatype(), false),
99 ColumnSchema::new("index_version", Ty::uint64_datatype(), false),
100 ColumnSchema::new("level", Ty::uint8_datatype(), false),
101 ColumnSchema::new("file_path", Ty::string_datatype(), false),
102 ColumnSchema::new("file_size", Ty::uint64_datatype(), false),
103 ColumnSchema::new("index_file_path", Ty::string_datatype(), true),
104 ColumnSchema::new("index_file_size", Ty::uint64_datatype(), true),
105 ColumnSchema::new("num_rows", Ty::uint64_datatype(), false),
106 ColumnSchema::new("num_row_groups", Ty::uint64_datatype(), false),
107 ColumnSchema::new("num_series", Ty::uint64_datatype(), true),
108 ColumnSchema::new("min_ts", Ty::timestamp_nanosecond_datatype(), true),
109 ColumnSchema::new("max_ts", Ty::timestamp_nanosecond_datatype(), true),
110 ColumnSchema::new("sequence", Ty::uint64_datatype(), true),
111 ColumnSchema::new("origin_region_id", Ty::uint64_datatype(), false),
112 ColumnSchema::new("node_id", Ty::uint64_datatype(), true),
113 ColumnSchema::new("visible", Ty::boolean_datatype(), false),
114 ColumnSchema::new("primary_key_min", Ty::binary_datatype(), true),
115 ColumnSchema::new("primary_key_max", Ty::binary_datatype(), true),
116 ]))
117 }
118
119 pub fn to_record_batch(entries: &[Self]) -> std::result::Result<DfRecordBatch, ArrowError> {
121 let schema = Self::schema();
122 let table_dirs = entries.iter().map(|e| e.table_dir.as_str());
123 let region_ids = entries.iter().map(|e| e.region_id.as_u64());
124 let table_ids = entries.iter().map(|e| e.table_id);
125 let region_numbers = entries.iter().map(|e| e.region_number);
126 let region_groups = entries.iter().map(|e| e.region_group);
127 let region_sequences = entries.iter().map(|e| e.region_sequence);
128 let file_ids = entries.iter().map(|e| e.file_id.as_str());
129 let index_versions = entries.iter().map(|e| e.index_version);
130 let levels = entries.iter().map(|e| e.level);
131 let file_paths = entries.iter().map(|e| e.file_path.as_str());
132 let file_sizes = entries.iter().map(|e| e.file_size);
133 let index_file_paths = entries.iter().map(|e| e.index_file_path.as_ref());
134 let index_file_sizes = entries.iter().map(|e| e.index_file_size);
135 let num_rows = entries.iter().map(|e| e.num_rows);
136 let num_row_groups = entries.iter().map(|e| e.num_row_groups);
137 let num_series = entries.iter().map(|e| e.num_series);
138 let min_ts = entries.iter().map(|e| {
139 e.min_ts
140 .convert_to(TimeUnit::Nanosecond)
141 .map(|ts| ts.value())
142 });
143 let max_ts = entries.iter().map(|e| {
144 e.max_ts
145 .convert_to(TimeUnit::Nanosecond)
146 .map(|ts| ts.value())
147 });
148 let sequences = entries.iter().map(|e| e.sequence);
149 let origin_region_ids = entries.iter().map(|e| e.origin_region_id.as_u64());
150 let node_ids = entries.iter().map(|e| e.node_id);
151 let visible_flags = entries.iter().map(|e| Some(e.visible));
152 let primary_key_min = entries.iter().map(|e| e.primary_key_min.as_deref());
153 let primary_key_max = entries.iter().map(|e| e.primary_key_max.as_deref());
154
155 let columns: Vec<ArrayRef> = vec![
156 Arc::new(StringArray::from_iter_values(table_dirs)),
157 Arc::new(UInt64Array::from_iter_values(region_ids)),
158 Arc::new(UInt32Array::from_iter_values(table_ids)),
159 Arc::new(UInt32Array::from_iter_values(region_numbers)),
160 Arc::new(UInt8Array::from_iter_values(region_groups)),
161 Arc::new(UInt32Array::from_iter_values(region_sequences)),
162 Arc::new(StringArray::from_iter_values(file_ids)),
163 Arc::new(UInt64Array::from_iter(index_versions)),
164 Arc::new(UInt8Array::from_iter_values(levels)),
165 Arc::new(StringArray::from_iter_values(file_paths)),
166 Arc::new(UInt64Array::from_iter_values(file_sizes)),
167 Arc::new(StringArray::from_iter(index_file_paths)),
168 Arc::new(UInt64Array::from_iter(index_file_sizes)),
169 Arc::new(UInt64Array::from_iter_values(num_rows)),
170 Arc::new(UInt64Array::from_iter_values(num_row_groups)),
171 Arc::new(UInt64Array::from_iter(num_series)),
172 Arc::new(TimestampNanosecondArray::from_iter(min_ts)),
173 Arc::new(TimestampNanosecondArray::from_iter(max_ts)),
174 Arc::new(UInt64Array::from_iter(sequences)),
175 Arc::new(UInt64Array::from_iter_values(origin_region_ids)),
176 Arc::new(UInt64Array::from_iter(node_ids)),
177 Arc::new(BooleanArray::from_iter(visible_flags)),
178 Arc::new(BinaryArray::from_iter(primary_key_min)),
179 Arc::new(BinaryArray::from_iter(primary_key_max)),
180 ];
181
182 DfRecordBatch::try_new(schema.arrow_schema().clone(), columns)
183 }
184
185 pub fn reserved_table_name_for_inspection() -> &'static str {
191 "__inspect/__mito/__sst_manifest"
192 }
193
194 pub fn build_plan(scan_request: ScanRequest) -> Result<LogicalPlan, DataFusionError> {
196 build_plan_helper(
197 scan_request,
198 Self::reserved_table_name_for_inspection(),
199 Self::schema(),
200 )
201 }
202}
203
/// One row of the SST storage inspection table.
///
/// A slice of entries is converted into an Arrow record batch by
/// [`StorageSstEntry::to_record_batch`]; each field becomes the column of the same name, in
/// the order declared by [`StorageSstEntry::schema`].
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
pub struct StorageSstEntry {
    /// Path of the file; exported as a non-nullable string column.
    pub file_path: String,
    /// Size of the file (presumably in bytes; confirm); `None` is exported as null.
    pub file_size: Option<u64>,
    /// Last-modified time.
    ///
    /// Converted to milliseconds on export; the column is null when the value is `None` or
    /// when the conversion yields `None`.
    pub last_modified_ms: Option<Timestamp>,
    /// Node id; `None` is exported as null.
    pub node_id: Option<u64>,
}
216
217impl StorageSstEntry {
218 pub fn schema() -> SchemaRef {
220 use datatypes::prelude::ConcreteDataType as Ty;
221 Arc::new(Schema::new(vec![
222 ColumnSchema::new("file_path", Ty::string_datatype(), false),
223 ColumnSchema::new("file_size", Ty::uint64_datatype(), true),
224 ColumnSchema::new(
225 "last_modified_ms",
226 Ty::timestamp_millisecond_datatype(),
227 true,
228 ),
229 ColumnSchema::new("node_id", Ty::uint64_datatype(), true),
230 ]))
231 }
232
233 pub fn to_record_batch(entries: &[Self]) -> std::result::Result<DfRecordBatch, ArrowError> {
235 let schema = Self::schema();
236 let file_paths = entries.iter().map(|e| e.file_path.as_str());
237 let file_sizes = entries.iter().map(|e| e.file_size);
238 let last_modified_ms = entries.iter().map(|e| {
239 e.last_modified_ms
240 .and_then(|ts| ts.convert_to(TimeUnit::Millisecond).map(|ts| ts.value()))
241 });
242 let node_ids = entries.iter().map(|e| e.node_id);
243
244 let columns: Vec<ArrayRef> = vec![
245 Arc::new(StringArray::from_iter_values(file_paths)),
246 Arc::new(UInt64Array::from_iter(file_sizes)),
247 Arc::new(TimestampMillisecondArray::from_iter(last_modified_ms)),
248 Arc::new(UInt64Array::from_iter(node_ids)),
249 ];
250
251 DfRecordBatch::try_new(schema.arrow_schema().clone(), columns)
252 }
253
254 pub fn reserved_table_name_for_inspection() -> &'static str {
260 "__inspect/__mito/__sst_storage"
261 }
262
263 pub fn build_plan(scan_request: ScanRequest) -> Result<LogicalPlan, DataFusionError> {
265 build_plan_helper(
266 scan_request,
267 Self::reserved_table_name_for_inspection(),
268 Self::schema(),
269 )
270 }
271}
272
/// One row of the puffin index metadata inspection table.
///
/// A slice of entries is converted into an Arrow record batch by
/// [`PuffinIndexMetaEntry::to_record_batch`]; each field becomes the column of the same name,
/// in the order declared by [`PuffinIndexMetaEntry::schema`].
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
pub struct PuffinIndexMetaEntry {
    /// Table directory; exported as a non-nullable string column.
    pub table_dir: String,
    /// Path of the index file; exported as a non-nullable string column.
    pub index_file_path: String,
    /// Region id; exported through `RegionId::as_u64` as a non-nullable `u64` column.
    pub region_id: RegionId,
    /// Table id; exported as a non-nullable `u32` column.
    pub table_id: TableId,
    /// Region number; exported as a non-nullable `u32` column.
    pub region_number: RegionNumber,
    /// Region group; exported as a non-nullable `u8` column.
    pub region_group: RegionGroup,
    /// Region sequence; exported as a non-nullable `u32` column.
    pub region_sequence: RegionSeq,
    /// File id in string form; exported as a non-nullable string column.
    pub file_id: String,
    /// Size of the index file; `None` is exported as null.
    pub index_file_size: Option<u64>,
    /// Index type name (the unit tests use `"bloom_filter"` and `"inverted"`).
    pub index_type: String,
    /// Kind of target the index applies to (the unit tests use `"column"` and `"unknown"`).
    pub target_type: String,
    /// Key identifying the target; exported as a non-nullable string column.
    pub target_key: String,
    /// Description of the target as a string (JSON text in the unit tests).
    pub target_json: String,
    /// Size of the blob (presumably in bytes; confirm).
    pub blob_size: u64,
    /// Additional metadata as a string (JSON text in the unit tests); `None` is exported as
    /// null.
    pub meta_json: Option<String>,
    /// Node id; `None` is exported as null.
    pub node_id: Option<u64>,
}
309
310impl PuffinIndexMetaEntry {
311 pub fn schema() -> SchemaRef {
313 use datatypes::prelude::ConcreteDataType as Ty;
314 Arc::new(Schema::new(vec![
315 ColumnSchema::new("table_dir", Ty::string_datatype(), false),
316 ColumnSchema::new("index_file_path", Ty::string_datatype(), false),
317 ColumnSchema::new("region_id", Ty::uint64_datatype(), false),
318 ColumnSchema::new("table_id", Ty::uint32_datatype(), false),
319 ColumnSchema::new("region_number", Ty::uint32_datatype(), false),
320 ColumnSchema::new("region_group", Ty::uint8_datatype(), false),
321 ColumnSchema::new("region_sequence", Ty::uint32_datatype(), false),
322 ColumnSchema::new("file_id", Ty::string_datatype(), false),
323 ColumnSchema::new("index_file_size", Ty::uint64_datatype(), true),
324 ColumnSchema::new("index_type", Ty::string_datatype(), false),
325 ColumnSchema::new("target_type", Ty::string_datatype(), false),
326 ColumnSchema::new("target_key", Ty::string_datatype(), false),
327 ColumnSchema::new("target_json", Ty::string_datatype(), false),
328 ColumnSchema::new("blob_size", Ty::uint64_datatype(), false),
329 ColumnSchema::new("meta_json", Ty::string_datatype(), true),
330 ColumnSchema::new("node_id", Ty::uint64_datatype(), true),
331 ]))
332 }
333
334 pub fn to_record_batch(entries: &[Self]) -> std::result::Result<DfRecordBatch, ArrowError> {
336 let schema = Self::schema();
337 let table_dirs = entries.iter().map(|e| e.table_dir.as_str());
338 let index_file_paths = entries.iter().map(|e| e.index_file_path.as_str());
339 let region_ids = entries.iter().map(|e| e.region_id.as_u64());
340 let table_ids = entries.iter().map(|e| e.table_id);
341 let region_numbers = entries.iter().map(|e| e.region_number);
342 let region_groups = entries.iter().map(|e| e.region_group);
343 let region_sequences = entries.iter().map(|e| e.region_sequence);
344 let file_ids = entries.iter().map(|e| e.file_id.as_str());
345 let index_file_sizes = entries.iter().map(|e| e.index_file_size);
346 let index_types = entries.iter().map(|e| e.index_type.as_str());
347 let target_types = entries.iter().map(|e| e.target_type.as_str());
348 let target_keys = entries.iter().map(|e| e.target_key.as_str());
349 let target_jsons = entries.iter().map(|e| e.target_json.as_str());
350 let blob_sizes = entries.iter().map(|e| e.blob_size);
351 let meta_jsons = entries.iter().map(|e| e.meta_json.as_deref());
352 let node_ids = entries.iter().map(|e| e.node_id);
353
354 let columns: Vec<ArrayRef> = vec![
355 Arc::new(StringArray::from_iter_values(table_dirs)),
356 Arc::new(StringArray::from_iter_values(index_file_paths)),
357 Arc::new(UInt64Array::from_iter_values(region_ids)),
358 Arc::new(UInt32Array::from_iter_values(table_ids)),
359 Arc::new(UInt32Array::from_iter_values(region_numbers)),
360 Arc::new(UInt8Array::from_iter_values(region_groups)),
361 Arc::new(UInt32Array::from_iter_values(region_sequences)),
362 Arc::new(StringArray::from_iter_values(file_ids)),
363 Arc::new(UInt64Array::from_iter(index_file_sizes)),
364 Arc::new(StringArray::from_iter_values(index_types)),
365 Arc::new(StringArray::from_iter_values(target_types)),
366 Arc::new(StringArray::from_iter_values(target_keys)),
367 Arc::new(StringArray::from_iter_values(target_jsons)),
368 Arc::new(UInt64Array::from_iter_values(blob_sizes)),
369 Arc::new(StringArray::from_iter(meta_jsons)),
370 Arc::new(UInt64Array::from_iter(node_ids)),
371 ];
372
373 DfRecordBatch::try_new(schema.arrow_schema().clone(), columns)
374 }
375
376 pub fn reserved_table_name_for_inspection() -> &'static str {
378 "__inspect/__mito/__puffin_index_meta"
379 }
380
381 pub fn build_plan(scan_request: ScanRequest) -> Result<LogicalPlan, DataFusionError> {
383 build_plan_helper(
384 scan_request,
385 Self::reserved_table_name_for_inspection(),
386 Self::schema(),
387 )
388 }
389}
390
391fn build_plan_helper(
392 scan_request: ScanRequest,
393 table_name: &str,
394 schema: SchemaRef,
395) -> Result<LogicalPlan, DataFusionError> {
396 let table_source = LogicalTableSource::new(schema.arrow_schema().clone());
397
398 let projection = scan_request.projection_input.map(|input| input.projection);
399 let mut builder = LogicalPlanBuilder::scan(table_name, Arc::new(table_source), projection)?;
400
401 for filter in scan_request.filters {
402 builder = builder.filter(filter)?;
403 }
404
405 if let Some(limit) = scan_request.limit {
406 builder = builder.limit(0, Some(limit))?;
407 }
408
409 builder.build()
410}
411
412#[cfg(test)]
413mod tests {
414 use datafusion_common::TableReference;
415 use datafusion_expr::{LogicalPlan, Operator, binary_expr, col, lit};
416 use datatypes::arrow::array::{
417 Array, BinaryArray, TimestampMillisecondArray, TimestampNanosecondArray, UInt8Array,
418 UInt32Array, UInt64Array,
419 };
420 use datatypes::arrow_array::StringArray;
421
422 use super::*;
423
424 #[test]
425 fn test_sst_entry_manifest_to_record_batch() {
426 let table_id1: TableId = 1;
428 let region_group1: RegionGroup = 2;
429 let region_seq1: RegionSeq = 3;
430 let region_number1: RegionNumber = ((region_group1 as u32) << 24) | region_seq1;
431 let region_id1 = RegionId::with_group_and_seq(table_id1, region_group1, region_seq1);
432
433 let table_id2: TableId = 5;
434 let region_group2: RegionGroup = 1;
435 let region_seq2: RegionSeq = 42;
436 let region_number2: RegionNumber = ((region_group2 as u32) << 24) | region_seq2;
437 let region_id2 = RegionId::with_group_and_seq(table_id2, region_group2, region_seq2);
438
439 let entries = vec![
440 ManifestSstEntry {
441 table_dir: "tdir1".to_string(),
442 region_id: region_id1,
443 table_id: table_id1,
444 region_number: region_number1,
445 region_group: region_group1,
446 region_sequence: region_seq1,
447 file_id: "f1".to_string(),
448 index_version: 0,
449 level: 1,
450 file_path: "/p1".to_string(),
451 file_size: 100,
452 index_file_path: None,
453 index_file_size: None,
454 num_rows: 10,
455 num_row_groups: 2,
456 num_series: Some(5),
457 min_ts: Timestamp::new_millisecond(1000), max_ts: Timestamp::new_second(2), sequence: None,
460 origin_region_id: region_id1,
461 node_id: Some(1),
462 visible: false,
463 primary_key_min: Some(Bytes::from_static(b"aaa")),
464 primary_key_max: Some(Bytes::from_static(b"zzz")),
465 },
466 ManifestSstEntry {
467 table_dir: "tdir2".to_string(),
468 region_id: region_id2,
469 table_id: table_id2,
470 region_number: region_number2,
471 region_group: region_group2,
472 region_sequence: region_seq2,
473 file_id: "f2".to_string(),
474 index_version: 1,
475 level: 3,
476 file_path: "/p2".to_string(),
477 file_size: 200,
478 index_file_path: Some("idx".to_string()),
479 index_file_size: Some(11),
480 num_rows: 20,
481 num_row_groups: 4,
482 num_series: None,
483 min_ts: Timestamp::new_nanosecond(5), max_ts: Timestamp::new_microsecond(2000), sequence: Some(9),
486 origin_region_id: region_id2,
487 node_id: None,
488 visible: true,
489 primary_key_min: None,
490 primary_key_max: None,
491 },
492 ];
493
494 let schema = ManifestSstEntry::schema();
495 let batch = ManifestSstEntry::to_record_batch(&entries).unwrap();
496
497 assert_eq!(schema.arrow_schema().fields().len(), batch.num_columns());
499 assert_eq!(2, batch.num_rows());
500 for (i, f) in schema.arrow_schema().fields().iter().enumerate() {
501 assert_eq!(f.name(), batch.schema().field(i).name());
502 assert_eq!(f.is_nullable(), batch.schema().field(i).is_nullable());
503 assert_eq!(f.data_type(), batch.schema().field(i).data_type());
504 }
505
506 let table_dirs = batch
508 .column(0)
509 .as_any()
510 .downcast_ref::<StringArray>()
511 .unwrap();
512 assert_eq!("tdir1", table_dirs.value(0));
513 assert_eq!("tdir2", table_dirs.value(1));
514
515 let region_ids = batch
516 .column(1)
517 .as_any()
518 .downcast_ref::<UInt64Array>()
519 .unwrap();
520 assert_eq!(region_id1.as_u64(), region_ids.value(0));
521 assert_eq!(region_id2.as_u64(), region_ids.value(1));
522
523 let table_ids = batch
524 .column(2)
525 .as_any()
526 .downcast_ref::<UInt32Array>()
527 .unwrap();
528 assert_eq!(table_id1, table_ids.value(0));
529 assert_eq!(table_id2, table_ids.value(1));
530
531 let region_numbers = batch
532 .column(3)
533 .as_any()
534 .downcast_ref::<UInt32Array>()
535 .unwrap();
536 assert_eq!(region_number1, region_numbers.value(0));
537 assert_eq!(region_number2, region_numbers.value(1));
538
539 let region_groups = batch
540 .column(4)
541 .as_any()
542 .downcast_ref::<UInt8Array>()
543 .unwrap();
544 assert_eq!(region_group1, region_groups.value(0));
545 assert_eq!(region_group2, region_groups.value(1));
546
547 let region_sequences = batch
548 .column(5)
549 .as_any()
550 .downcast_ref::<UInt32Array>()
551 .unwrap();
552 assert_eq!(region_seq1, region_sequences.value(0));
553 assert_eq!(region_seq2, region_sequences.value(1));
554
555 let file_ids = batch
556 .column(6)
557 .as_any()
558 .downcast_ref::<StringArray>()
559 .unwrap();
560 assert_eq!("f1", file_ids.value(0));
561 assert_eq!("f2", file_ids.value(1));
562
563 let index_versions = batch
564 .column(7)
565 .as_any()
566 .downcast_ref::<UInt64Array>()
567 .unwrap();
568 assert_eq!(0, index_versions.value(0));
569 assert_eq!(1, index_versions.value(1));
570
571 let levels = batch
572 .column(8)
573 .as_any()
574 .downcast_ref::<UInt8Array>()
575 .unwrap();
576 assert_eq!(1, levels.value(0));
577 assert_eq!(3, levels.value(1));
578
579 let file_paths = batch
580 .column(9)
581 .as_any()
582 .downcast_ref::<StringArray>()
583 .unwrap();
584 assert_eq!("/p1", file_paths.value(0));
585 assert_eq!("/p2", file_paths.value(1));
586
587 let file_sizes = batch
588 .column(10)
589 .as_any()
590 .downcast_ref::<UInt64Array>()
591 .unwrap();
592 assert_eq!(100, file_sizes.value(0));
593 assert_eq!(200, file_sizes.value(1));
594
595 let index_file_paths = batch
596 .column(11)
597 .as_any()
598 .downcast_ref::<StringArray>()
599 .unwrap();
600 assert!(index_file_paths.is_null(0));
601 assert_eq!("idx", index_file_paths.value(1));
602
603 let index_file_sizes = batch
604 .column(12)
605 .as_any()
606 .downcast_ref::<UInt64Array>()
607 .unwrap();
608 assert!(index_file_sizes.is_null(0));
609 assert_eq!(11, index_file_sizes.value(1));
610
611 let num_rows = batch
612 .column(13)
613 .as_any()
614 .downcast_ref::<UInt64Array>()
615 .unwrap();
616 assert_eq!(10, num_rows.value(0));
617 assert_eq!(20, num_rows.value(1));
618
619 let num_row_groups = batch
620 .column(14)
621 .as_any()
622 .downcast_ref::<UInt64Array>()
623 .unwrap();
624 assert_eq!(2, num_row_groups.value(0));
625 assert_eq!(4, num_row_groups.value(1));
626
627 let num_series = batch
628 .column(15)
629 .as_any()
630 .downcast_ref::<UInt64Array>()
631 .unwrap();
632 assert_eq!(5, num_series.value(0));
633 assert!(num_series.is_null(1));
634
635 let min_ts = batch
636 .column(16)
637 .as_any()
638 .downcast_ref::<TimestampNanosecondArray>()
639 .unwrap();
640 assert_eq!(1_000_000_000, min_ts.value(0));
641 assert_eq!(5, min_ts.value(1));
642
643 let max_ts = batch
644 .column(17)
645 .as_any()
646 .downcast_ref::<TimestampNanosecondArray>()
647 .unwrap();
648 assert_eq!(2_000_000_000, max_ts.value(0));
649 assert_eq!(2_000_000, max_ts.value(1));
650
651 let sequences = batch
652 .column(18)
653 .as_any()
654 .downcast_ref::<UInt64Array>()
655 .unwrap();
656 assert!(sequences.is_null(0));
657 assert_eq!(9, sequences.value(1));
658
659 let origin_region_ids = batch
660 .column(19)
661 .as_any()
662 .downcast_ref::<UInt64Array>()
663 .unwrap();
664 assert_eq!(region_id1.as_u64(), origin_region_ids.value(0));
665 assert_eq!(region_id2.as_u64(), origin_region_ids.value(1));
666
667 let node_ids = batch
668 .column(20)
669 .as_any()
670 .downcast_ref::<UInt64Array>()
671 .unwrap();
672 assert_eq!(1, node_ids.value(0));
673 assert!(node_ids.is_null(1));
674
675 let visible = batch
676 .column(21)
677 .as_any()
678 .downcast_ref::<BooleanArray>()
679 .unwrap();
680 assert!(!visible.value(0));
681 assert!(visible.value(1));
682
683 let primary_key_min = batch
684 .column(22)
685 .as_any()
686 .downcast_ref::<BinaryArray>()
687 .unwrap();
688 assert_eq!(b"aaa", primary_key_min.value(0));
689 assert!(primary_key_min.is_null(1));
690
691 let primary_key_max = batch
692 .column(23)
693 .as_any()
694 .downcast_ref::<BinaryArray>()
695 .unwrap();
696 assert_eq!(b"zzz", primary_key_max.value(0));
697 assert!(primary_key_max.is_null(1));
698 }
699
700 #[test]
701 fn test_sst_entry_storage_to_record_batch() {
702 let entries = vec![
703 StorageSstEntry {
704 file_path: "/s1".to_string(),
705 file_size: None,
706 last_modified_ms: None,
707 node_id: Some(1),
708 },
709 StorageSstEntry {
710 file_path: "/s2".to_string(),
711 file_size: Some(123),
712 last_modified_ms: Some(Timestamp::new_millisecond(456)),
713 node_id: None,
714 },
715 ];
716
717 let schema = StorageSstEntry::schema();
718 let batch = StorageSstEntry::to_record_batch(&entries).unwrap();
719
720 assert_eq!(schema.arrow_schema().fields().len(), batch.num_columns());
721 assert_eq!(2, batch.num_rows());
722
723 let file_paths = batch
724 .column(0)
725 .as_any()
726 .downcast_ref::<StringArray>()
727 .unwrap();
728 assert_eq!("/s1", file_paths.value(0));
729 assert_eq!("/s2", file_paths.value(1));
730
731 let file_sizes = batch
732 .column(1)
733 .as_any()
734 .downcast_ref::<UInt64Array>()
735 .unwrap();
736 assert!(file_sizes.is_null(0));
737 assert_eq!(123, file_sizes.value(1));
738
739 let last_modified = batch
740 .column(2)
741 .as_any()
742 .downcast_ref::<TimestampMillisecondArray>()
743 .unwrap();
744 assert!(last_modified.is_null(0));
745 assert_eq!(456, last_modified.value(1));
746
747 let node_ids = batch
748 .column(3)
749 .as_any()
750 .downcast_ref::<UInt64Array>()
751 .unwrap();
752 assert_eq!(1, node_ids.value(0));
753 assert!(node_ids.is_null(1));
754 }
755
756 #[test]
757 fn test_puffin_index_meta_to_record_batch() {
758 let entries = vec![
759 PuffinIndexMetaEntry {
760 table_dir: "table1".to_string(),
761 index_file_path: "index1".to_string(),
762 region_id: RegionId::with_group_and_seq(10, 0, 20),
763 table_id: 10,
764 region_number: 20,
765 region_group: 0,
766 region_sequence: 20,
767 file_id: "file1".to_string(),
768 index_file_size: Some(1024),
769 index_type: "bloom_filter".to_string(),
770 target_type: "column".to_string(),
771 target_key: "1".to_string(),
772 target_json: "{\"column\":1}".to_string(),
773 blob_size: 256,
774 meta_json: Some("{\"bloom\":{}}".to_string()),
775 node_id: Some(42),
776 },
777 PuffinIndexMetaEntry {
778 table_dir: "table2".to_string(),
779 index_file_path: "index2".to_string(),
780 region_id: RegionId::with_group_and_seq(11, 0, 21),
781 table_id: 11,
782 region_number: 21,
783 region_group: 0,
784 region_sequence: 21,
785 file_id: "file2".to_string(),
786 index_file_size: None,
787 index_type: "inverted".to_string(),
788 target_type: "unknown".to_string(),
789 target_key: "legacy".to_string(),
790 target_json: "{}".to_string(),
791 blob_size: 0,
792 meta_json: None,
793 node_id: None,
794 },
795 ];
796
797 let schema = PuffinIndexMetaEntry::schema();
798 let batch = PuffinIndexMetaEntry::to_record_batch(&entries).unwrap();
799
800 assert_eq!(schema.arrow_schema().fields().len(), batch.num_columns());
801 assert_eq!(2, batch.num_rows());
802
803 let table_dirs = batch
804 .column(0)
805 .as_any()
806 .downcast_ref::<StringArray>()
807 .unwrap();
808 assert_eq!("table1", table_dirs.value(0));
809 assert_eq!("table2", table_dirs.value(1));
810
811 let index_file_paths = batch
812 .column(1)
813 .as_any()
814 .downcast_ref::<StringArray>()
815 .unwrap();
816 assert_eq!("index1", index_file_paths.value(0));
817 assert_eq!("index2", index_file_paths.value(1));
818
819 let region_ids = batch
820 .column(2)
821 .as_any()
822 .downcast_ref::<UInt64Array>()
823 .unwrap();
824 assert_eq!(
825 RegionId::with_group_and_seq(10, 0, 20).as_u64(),
826 region_ids.value(0)
827 );
828 assert_eq!(
829 RegionId::with_group_and_seq(11, 0, 21).as_u64(),
830 region_ids.value(1)
831 );
832
833 let table_ids = batch
834 .column(3)
835 .as_any()
836 .downcast_ref::<UInt32Array>()
837 .unwrap();
838 assert_eq!(10, table_ids.value(0));
839 assert_eq!(11, table_ids.value(1));
840
841 let region_numbers = batch
842 .column(4)
843 .as_any()
844 .downcast_ref::<UInt32Array>()
845 .unwrap();
846 assert_eq!(20, region_numbers.value(0));
847 assert_eq!(21, region_numbers.value(1));
848
849 let region_groups = batch
850 .column(5)
851 .as_any()
852 .downcast_ref::<UInt8Array>()
853 .unwrap();
854 assert_eq!(0, region_groups.value(0));
855 assert_eq!(0, region_groups.value(1));
856
857 let region_sequences = batch
858 .column(6)
859 .as_any()
860 .downcast_ref::<UInt32Array>()
861 .unwrap();
862 assert_eq!(20, region_sequences.value(0));
863 assert_eq!(21, region_sequences.value(1));
864
865 let file_ids = batch
866 .column(7)
867 .as_any()
868 .downcast_ref::<StringArray>()
869 .unwrap();
870 assert_eq!("file1", file_ids.value(0));
871 assert_eq!("file2", file_ids.value(1));
872
873 let index_file_sizes = batch
874 .column(8)
875 .as_any()
876 .downcast_ref::<UInt64Array>()
877 .unwrap();
878 assert_eq!(1024, index_file_sizes.value(0));
879 assert!(index_file_sizes.is_null(1));
880
881 let index_types = batch
882 .column(9)
883 .as_any()
884 .downcast_ref::<StringArray>()
885 .unwrap();
886 assert_eq!("bloom_filter", index_types.value(0));
887 assert_eq!("inverted", index_types.value(1));
888
889 let target_types = batch
890 .column(10)
891 .as_any()
892 .downcast_ref::<StringArray>()
893 .unwrap();
894 assert_eq!("column", target_types.value(0));
895 assert_eq!("unknown", target_types.value(1));
896
897 let target_keys = batch
898 .column(11)
899 .as_any()
900 .downcast_ref::<StringArray>()
901 .unwrap();
902 assert_eq!("1", target_keys.value(0));
903 assert_eq!("legacy", target_keys.value(1));
904
905 let target_json = batch
906 .column(12)
907 .as_any()
908 .downcast_ref::<StringArray>()
909 .unwrap();
910 assert_eq!("{\"column\":1}", target_json.value(0));
911 assert_eq!("{}", target_json.value(1));
912
913 let blob_sizes = batch
914 .column(13)
915 .as_any()
916 .downcast_ref::<UInt64Array>()
917 .unwrap();
918 assert_eq!(256, blob_sizes.value(0));
919 assert_eq!(0, blob_sizes.value(1));
920
921 let meta_jsons = batch
922 .column(14)
923 .as_any()
924 .downcast_ref::<StringArray>()
925 .unwrap();
926 assert_eq!("{\"bloom\":{}}", meta_jsons.value(0));
927 assert!(meta_jsons.is_null(1));
928
929 let node_ids = batch
930 .column(15)
931 .as_any()
932 .downcast_ref::<UInt64Array>()
933 .unwrap();
934 assert_eq!(42, node_ids.value(0));
935 assert!(node_ids.is_null(1));
936 }
937
938 #[test]
939 fn test_manifest_build_plan() {
940 let projection_input = Some(vec![0, 1, 2].into());
942 let request = ScanRequest {
943 projection_input,
944 filters: vec![binary_expr(col("table_id"), Operator::Gt, lit(0))],
945 limit: Some(5),
946 ..Default::default()
947 };
948
949 let plan = ManifestSstEntry::build_plan(request).unwrap();
950
951 let (scan, has_filter, has_limit) = extract_scan(&plan);
954
955 assert!(has_filter);
956 assert!(has_limit);
957 assert_eq!(
958 scan.table_name,
959 TableReference::bare(ManifestSstEntry::reserved_table_name_for_inspection())
960 );
961 assert_eq!(scan.projection, Some(vec![0, 1, 2]));
962
963 let fields = scan.projected_schema.fields();
965 assert_eq!(fields.len(), 3);
966 assert_eq!(fields[0].name(), "table_dir");
967 assert_eq!(fields[1].name(), "region_id");
968 assert_eq!(fields[2].name(), "table_id");
969 }
970
971 #[test]
972 fn test_storage_build_plan() {
973 let projection_input = Some(vec![0, 2].into());
974 let request = ScanRequest {
975 projection_input,
976 filters: vec![binary_expr(col("file_path"), Operator::Eq, lit("/a"))],
977 limit: Some(1),
978 ..Default::default()
979 };
980
981 let plan = StorageSstEntry::build_plan(request).unwrap();
982 let (scan, has_filter, has_limit) = extract_scan(&plan);
983 assert!(has_filter);
984 assert!(has_limit);
985 assert_eq!(
986 scan.table_name,
987 TableReference::bare(StorageSstEntry::reserved_table_name_for_inspection())
988 );
989 assert_eq!(scan.projection, Some(vec![0, 2]));
990
991 let fields = scan.projected_schema.fields();
992 assert_eq!(fields.len(), 2);
993 assert_eq!(fields[0].name(), "file_path");
994 assert_eq!(fields[1].name(), "last_modified_ms");
995 }
996
997 fn extract_scan(plan: &LogicalPlan) -> (&datafusion_expr::logical_plan::TableScan, bool, bool) {
999 use datafusion_expr::logical_plan::Limit;
1000
1001 match plan {
1002 LogicalPlan::Filter(f) => {
1003 let (scan, _, has_limit) = extract_scan(&f.input);
1004 (scan, true, has_limit)
1005 }
1006 LogicalPlan::Limit(Limit { input, .. }) => {
1007 let (scan, has_filter, _) = extract_scan(input);
1008 (scan, has_filter, true)
1009 }
1010 LogicalPlan::TableScan(scan) => (scan, false, false),
1011 other => panic!("unexpected plan: {other:?}"),
1012 }
1013 }
1014}