1use std::sync::Arc;
18
19use async_trait::async_trait;
20use datatypes::arrow::array::{Array, BinaryArray};
21use datatypes::arrow::compute::concat_batches;
22use datatypes::arrow::record_batch::RecordBatch;
23use futures::{Stream, TryStreamExt};
24use snafu::ResultExt;
25use store_api::storage::{FileId, TimeSeriesRowSelector};
26
27use crate::cache::{
28 CacheStrategy, SelectorResult, SelectorResultKey, SelectorResultValue,
29 selector_result_cache_hit, selector_result_cache_miss,
30};
31use crate::error::{ComputeArrowSnafu, Result};
32use crate::read::{
33 Batch, BatchReader, BoxedBatchReader, BoxedRecordBatchStream, timestamp_array_to_i64_slice,
34};
35use crate::sst::parquet::DEFAULT_READ_BATCH_SIZE;
36use crate::sst::parquet::flat_format::{primary_key_column_index, time_index_column_index};
37use crate::sst::parquet::format::{PrimaryKeyArray, primary_key_offsets};
38use crate::sst::parquet::read_columns::ParquetReadColumns;
39use crate::sst::parquet::reader::FlatRowGroupReader;
40
41#[allow(dead_code)]
50pub(crate) struct LastRowReader {
51 reader: BoxedBatchReader,
53 selector: LastRowSelector,
55}
56
57#[allow(dead_code)]
58impl LastRowReader {
59 pub(crate) fn new(reader: BoxedBatchReader) -> Self {
61 Self {
62 reader,
63 selector: LastRowSelector::default(),
64 }
65 }
66
67 pub(crate) async fn next_last_row(&mut self) -> Result<Option<Batch>> {
69 while let Some(batch) = self.reader.next_batch().await? {
70 if let Some(yielded) = self.selector.on_next(batch) {
71 return Ok(Some(yielded));
72 }
73 }
74 Ok(self.selector.finish())
75 }
76}
77
78#[async_trait]
79impl BatchReader for LastRowReader {
80 async fn next_batch(&mut self) -> Result<Option<Batch>> {
81 self.next_last_row().await
82 }
83}
84
85#[derive(Default)]
87pub struct LastRowSelector {
88 last_batch: Option<Batch>,
89}
90
91impl LastRowSelector {
92 pub fn on_next(&mut self, batch: Batch) -> Option<Batch> {
94 if let Some(last) = &self.last_batch {
95 if last.primary_key() == batch.primary_key() {
96 self.last_batch = Some(batch);
98 None
99 } else {
100 debug_assert!(!last.is_empty());
103 let last_row = last.slice(last.num_rows() - 1, 1);
104 self.last_batch = Some(batch);
105 Some(last_row)
106 }
107 } else {
108 self.last_batch = Some(batch);
109 None
110 }
111 }
112
113 pub fn finish(&mut self) -> Option<Batch> {
115 if let Some(last) = self.last_batch.take() {
116 let last_row = last.slice(last.num_rows() - 1, 1);
118 return Some(last_row);
119 }
120 None
121 }
122}
123
124pub(crate) enum FlatRowGroupLastRowCachedReader {
128 Hit(FlatLastRowCacheReader),
130 Miss(FlatRowGroupLastRowReader),
132}
133
134impl FlatRowGroupLastRowCachedReader {
135 pub(crate) fn new(
136 file_id: FileId,
137 row_group_idx: usize,
138 cache_strategy: CacheStrategy,
139 read_cols: &ParquetReadColumns,
140 reader: FlatRowGroupReader,
141 ) -> Self {
142 let key = SelectorResultKey {
143 file_id,
144 row_group_idx,
145 selector: TimeSeriesRowSelector::LastRow,
146 };
147
148 if let Some(value) = cache_strategy.get_selector_result(&key) {
149 let is_flat = matches!(&value.result, SelectorResult::Flat(_));
150 let schema_matches = value.read_cols == *read_cols;
151 if is_flat && schema_matches {
152 Self::new_hit(value)
153 } else {
154 Self::new_miss(key, read_cols, reader, cache_strategy)
155 }
156 } else {
157 Self::new_miss(key, read_cols, reader, cache_strategy)
158 }
159 }
160
161 pub(crate) async fn next_batch(&mut self) -> Result<Option<RecordBatch>> {
163 match self {
164 FlatRowGroupLastRowCachedReader::Hit(r) => r.next_batch(),
165 FlatRowGroupLastRowCachedReader::Miss(r) => r.next_batch().await,
166 }
167 }
168
169 fn new_hit(value: Arc<SelectorResultValue>) -> Self {
170 selector_result_cache_hit();
171 Self::Hit(FlatLastRowCacheReader { value, idx: 0 })
172 }
173
174 fn new_miss(
175 key: SelectorResultKey,
176 read_cols: &ParquetReadColumns,
177 reader: FlatRowGroupReader,
178 cache_strategy: CacheStrategy,
179 ) -> Self {
180 selector_result_cache_miss();
181 Self::Miss(FlatRowGroupLastRowReader::new(
182 key,
183 read_cols.clone(),
184 reader,
185 cache_strategy,
186 ))
187 }
188}
189
190pub(crate) struct FlatLastRowCacheReader {
192 value: Arc<SelectorResultValue>,
193 idx: usize,
194}
195
196impl FlatLastRowCacheReader {
197 fn next_batch(&mut self) -> Result<Option<RecordBatch>> {
198 let batches = match &self.value.result {
199 SelectorResult::Flat(batches) => batches,
200 SelectorResult::PrimaryKey(_) => unreachable!(),
201 };
202 if self.idx < batches.len() {
203 let res = Ok(Some(batches[self.idx].clone()));
204 self.idx += 1;
205 res
206 } else {
207 Ok(None)
208 }
209 }
210}
211
212pub(crate) struct BatchBuffer {
214 batches: Vec<RecordBatch>,
215 num_rows: usize,
216}
217
218impl BatchBuffer {
219 fn new() -> Self {
220 Self {
221 batches: Vec::new(),
222 num_rows: 0,
223 }
224 }
225
226 fn is_full(&self) -> bool {
228 self.num_rows >= DEFAULT_READ_BATCH_SIZE
229 }
230
231 fn extend_from_slice(&mut self, batches: &[RecordBatch]) {
233 for batch in batches {
234 self.num_rows += batch.num_rows();
235 }
236 self.batches.extend_from_slice(batches);
237 }
238
239 fn is_empty(&self) -> bool {
241 self.batches.is_empty()
242 }
243
244 fn concat(&mut self) -> Result<RecordBatch> {
246 debug_assert!(!self.batches.is_empty());
247 let schema = self.batches[0].schema();
248 let merged = concat_batches(&schema, &self.batches).context(ComputeArrowSnafu)?;
249 self.batches.clear();
250 self.num_rows = 0;
251 Ok(merged)
252 }
253}
254
255pub(crate) struct FlatRowGroupLastRowReader {
257 key: SelectorResultKey,
258 reader: FlatRowGroupReader,
259 selector: FlatLastTimestampSelector,
260 yielded_batches: Vec<RecordBatch>,
261 cache_strategy: CacheStrategy,
262 read_cols: ParquetReadColumns,
263 pending: BatchBuffer,
265}
266
267impl FlatRowGroupLastRowReader {
268 fn new(
269 key: SelectorResultKey,
270 read_cols: ParquetReadColumns,
271 reader: FlatRowGroupReader,
272 cache_strategy: CacheStrategy,
273 ) -> Self {
274 Self {
275 key,
276 reader,
277 selector: FlatLastTimestampSelector::default(),
278 yielded_batches: vec![],
279 cache_strategy,
280 read_cols,
281 pending: BatchBuffer::new(),
282 }
283 }
284
285 fn flush_pending(&mut self) -> Result<Option<RecordBatch>> {
287 if self.pending.is_empty() {
288 return Ok(None);
289 }
290 let merged = self.pending.concat()?;
291 self.yielded_batches.push(merged.clone());
292 Ok(Some(merged))
293 }
294
295 async fn next_batch(&mut self) -> Result<Option<RecordBatch>> {
296 if self.pending.is_full() {
297 return self.flush_pending();
298 }
299
300 while let Some(batch) = self.reader.next_batch().await? {
301 self.selector.on_next(batch, &mut self.pending)?;
302 if self.pending.is_full() {
303 return self.flush_pending();
304 }
305 }
306
307 self.selector.finish(&mut self.pending)?;
309 if !self.pending.is_empty() {
310 let result = self.flush_pending();
311 self.maybe_update_cache();
313 return result;
314 }
315
316 self.maybe_update_cache();
318 Ok(None)
319 }
320
321 fn maybe_update_cache(&mut self) {
322 if self.yielded_batches.is_empty() {
323 return;
324 }
325 let batches = std::mem::take(&mut self.yielded_batches);
326 let value = Arc::new(SelectorResultValue::new_flat(
327 batches,
328 self.read_cols.clone(),
329 ));
330 self.cache_strategy.put_selector_result(self.key, value);
331 }
332}
333
334#[derive(Default)]
339pub(crate) struct FlatLastTimestampSelector {
340 current_key: Option<LastKeyState>,
342}
343
344#[derive(Debug)]
345struct LastKeyState {
346 key: Vec<u8>,
347 last_timestamp: i64,
348 slices: Vec<RecordBatch>,
349}
350
351impl LastKeyState {
352 fn new(key: Vec<u8>, last_timestamp: i64, first_slice: RecordBatch) -> Self {
353 Self {
354 key,
355 last_timestamp,
356 slices: vec![first_slice],
357 }
358 }
359}
360
361impl FlatLastTimestampSelector {
362 pub(crate) fn on_next(
364 &mut self,
365 batch: RecordBatch,
366 output_buffer: &mut BatchBuffer,
367 ) -> Result<()> {
368 if batch.num_rows() == 0 {
369 return Ok(());
370 }
371
372 let num_columns = batch.num_columns();
373 let pk_col_idx = primary_key_column_index(num_columns);
374 let ts_col_idx = time_index_column_index(num_columns);
375
376 let pk_array = batch
377 .column(pk_col_idx)
378 .as_any()
379 .downcast_ref::<PrimaryKeyArray>()
380 .unwrap();
381 let offsets = primary_key_offsets(pk_array)?;
382 if offsets.is_empty() {
383 return Ok(());
384 }
385
386 let ts_values = timestamp_array_to_i64_slice(batch.column(ts_col_idx));
387 for i in 0..offsets.len() - 1 {
388 let range_start = offsets[i];
389 let range_end = offsets[i + 1];
390 let range_key = primary_key_bytes_at(&batch, pk_col_idx, range_start);
391 let range_last_ts = ts_values[range_end - 1];
392 let range_last_ts_start = last_timestamp_start(ts_values, range_start, range_end);
393 let range_slice = batch.slice(range_last_ts_start, range_end - range_last_ts_start);
394
395 match self.current_key.as_mut() {
396 Some(state) if state.key.as_slice() == range_key => {
397 if range_last_ts > state.last_timestamp {
398 state.last_timestamp = range_last_ts;
399 state.slices.clear();
400 state.slices.push(range_slice);
401 } else if range_last_ts == state.last_timestamp {
402 state.slices.push(range_slice);
403 }
404 }
405 Some(_) => {
406 self.flush_current_key(output_buffer);
407 self.current_key = Some(LastKeyState::new(
408 range_key.to_vec(),
409 range_last_ts,
410 range_slice,
411 ));
412 }
413 None => {
414 self.current_key = Some(LastKeyState::new(
415 range_key.to_vec(),
416 range_last_ts,
417 range_slice,
418 ));
419 }
420 }
421 }
422
423 Ok(())
424 }
425
426 pub(crate) fn finish(&mut self, output_buffer: &mut BatchBuffer) -> Result<()> {
428 self.flush_current_key(output_buffer);
429 Ok(())
430 }
431
432 fn flush_current_key(&mut self, output_buffer: &mut BatchBuffer) {
433 let Some(state) = self.current_key.take() else {
434 return;
435 };
436 output_buffer.extend_from_slice(&state.slices);
437 }
438}
439
440pub(crate) struct FlatLastRowReader {
443 stream: BoxedRecordBatchStream,
444 selector: FlatLastTimestampSelector,
445 pending: BatchBuffer,
446}
447
448impl FlatLastRowReader {
449 pub(crate) fn new(stream: BoxedRecordBatchStream) -> Self {
451 Self {
452 stream,
453 selector: FlatLastTimestampSelector::default(),
454 pending: BatchBuffer::new(),
455 }
456 }
457
458 pub(crate) fn into_stream(mut self) -> impl Stream<Item = Result<RecordBatch>> {
460 async_stream::try_stream! {
461 while let Some(batch) = self.stream.try_next().await? {
462 self.selector.on_next(batch, &mut self.pending)?;
463 if self.pending.is_full() {
464 yield self.pending.concat()?;
465 }
466 }
467 self.selector.finish(&mut self.pending)?;
468 if !self.pending.is_empty() {
469 yield self.pending.concat()?;
470 }
471 }
472 }
473}
474
475fn primary_key_bytes_at(batch: &RecordBatch, pk_col_idx: usize, index: usize) -> &[u8] {
477 let pk_dict = batch
478 .column(pk_col_idx)
479 .as_any()
480 .downcast_ref::<PrimaryKeyArray>()
481 .unwrap();
482 let key = pk_dict.keys().value(index);
483 let binary_values = pk_dict
484 .values()
485 .as_any()
486 .downcast_ref::<BinaryArray>()
487 .unwrap();
488 binary_values.value(key as usize)
489}
490
/// Returns the index of the first row in the trailing run of rows that share
/// the timestamp of the last row of `ts_values[range_start..range_end]`.
///
/// The result is always within `range_start..range_end`; it equals
/// `range_end - 1` when only the last row carries that timestamp.
/// `range_start` must be smaller than `range_end`.
fn last_timestamp_start(ts_values: &[i64], range_start: usize, range_end: usize) -> usize {
    debug_assert!(range_start < range_end);

    let tail = range_end - 1;
    let last_ts = ts_values[tail];
    // Count how many rows directly before the last one repeat its timestamp,
    // never looking further back than `range_start`.
    let repeats = ts_values.get(range_start..tail).map_or(0, |preceding| {
        preceding
            .iter()
            .rev()
            .take_while(|&&ts| ts == last_ts)
            .count()
    });
    tail - repeats
}
503
#[cfg(test)]
mod tests {
    use std::sync::Arc;

    use api::v1::OpType;
    use datatypes::arrow::array::{
        ArrayRef, BinaryDictionaryBuilder, Int64Array, TimestampMillisecondArray, UInt8Array,
        UInt64Array,
    };
    use datatypes::arrow::datatypes::{DataType, Field, Schema, SchemaRef, TimeUnit, UInt32Type};
    use datatypes::arrow::record_batch::RecordBatch;

    use super::*;
    use crate::test_util::{VecBatchReader, check_reader_result, new_batch};

    /// A single input batch is reduced to its final row.
    #[tokio::test]
    async fn test_last_row_one_batch() {
        // Two rows: only the second one survives.
        let source = [new_batch(
            b"k1",
            &[1, 2],
            &[11, 11],
            &[OpType::Put, OpType::Put],
            &[21, 22],
        )];
        let expected = [new_batch(b"k1", &[2], &[11], &[OpType::Put], &[22])];
        let mut last_rows = LastRowReader::new(Box::new(VecBatchReader::new(&source)));
        check_reader_result(&mut last_rows, &expected).await;

        // One row: it is returned unchanged.
        let source = [new_batch(b"k1", &[1], &[11], &[OpType::Put], &[21])];
        let expected = [new_batch(b"k1", &[1], &[11], &[OpType::Put], &[21])];
        let mut last_rows = LastRowReader::new(Box::new(VecBatchReader::new(&source)));
        check_reader_result(&mut last_rows, &expected).await;
    }

    /// A key spread over several batches yields the last row of its last
    /// batch; a following key yields its own last row.
    #[tokio::test]
    async fn test_last_row_multi_batch() {
        let source = [
            new_batch(
                b"k1",
                &[1, 2],
                &[11, 11],
                &[OpType::Put, OpType::Put],
                &[21, 22],
            ),
            new_batch(
                b"k1",
                &[3, 4],
                &[11, 11],
                &[OpType::Put, OpType::Put],
                &[23, 24],
            ),
            new_batch(
                b"k2",
                &[1, 2],
                &[11, 11],
                &[OpType::Put, OpType::Put],
                &[31, 32],
            ),
        ];
        let expected = [
            new_batch(b"k1", &[4], &[11], &[OpType::Put], &[24]),
            new_batch(b"k2", &[2], &[11], &[OpType::Put], &[32]),
        ];
        let mut last_rows = LastRowReader::new(Box::new(VecBatchReader::new(&source)));
        check_reader_result(&mut last_rows, &expected).await;
    }

    /// Builds a flat-format batch following [`test_flat_schema`]. All three
    /// slices must have the same length; sequence and op type are filled
    /// with `1`.
    fn new_flat_batch(primary_keys: &[&[u8]], timestamps: &[i64], fields: &[i64]) -> RecordBatch {
        let num_rows = timestamps.len();
        assert_eq!(primary_keys.len(), num_rows);
        assert_eq!(fields.len(), num_rows);

        let field_col: ArrayRef = Arc::new(Int64Array::from_iter_values(fields.iter().copied()));
        let ts_col: ArrayRef = Arc::new(TimestampMillisecondArray::from_iter_values(
            timestamps.iter().copied(),
        ));

        let mut pk_builder = BinaryDictionaryBuilder::<UInt32Type>::new();
        for &pk in primary_keys {
            pk_builder.append(pk).unwrap();
        }
        let pk_col: ArrayRef = Arc::new(pk_builder.finish());

        let sequence_col: ArrayRef = Arc::new(UInt64Array::from_iter_values(
            std::iter::repeat(1u64).take(num_rows),
        ));
        let op_type_col: ArrayRef = Arc::new(UInt8Array::from_iter_values(
            std::iter::repeat(1u8).take(num_rows),
        ));

        let columns = vec![field_col, ts_col, pk_col, sequence_col, op_type_col];
        RecordBatch::try_new(test_flat_schema(), columns).unwrap()
    }

    /// Schema used by the flat tests: one field column, the time index, the
    /// dictionary-encoded primary key, the sequence and the op type.
    fn test_flat_schema() -> SchemaRef {
        let ts_type = DataType::Timestamp(TimeUnit::Millisecond, None);
        let pk_type = DataType::Dictionary(Box::new(DataType::UInt32), Box::new(DataType::Binary));

        Arc::new(Schema::new(vec![
            Field::new("field0", DataType::Int64, false),
            Field::new("ts", ts_type, false),
            Field::new("__primary_key", pk_type, false),
            Field::new("__sequence", DataType::UInt64, false),
            Field::new("__op_type", DataType::UInt8, false),
        ]))
    }

    /// Shorthand for an expected `(primary key, timestamp)` pair.
    fn row(pk: &[u8], ts: i64) -> (Vec<u8>, i64) {
        (pk.to_vec(), ts)
    }

    /// Moves every buffered batch out of `buffer`, appending its rows to
    /// `out`, and resets the buffer's row count.
    fn drain_rows(buffer: &mut BatchBuffer, out: &mut Vec<(Vec<u8>, i64)>) {
        for batch in buffer.batches.drain(..) {
            extract_flat_rows(&batch, out);
        }
        buffer.num_rows = 0;
    }

    /// Runs `batches` through `selector`, finishes it, and returns every
    /// emitted row as a `(primary key, timestamp)` pair in emission order.
    fn collect_flat_results(
        selector: &mut FlatLastTimestampSelector,
        batches: Vec<RecordBatch>,
    ) -> Vec<(Vec<u8>, i64)> {
        let mut output_buffer = BatchBuffer::new();
        let mut results = Vec::new();

        for batch in batches {
            selector.on_next(batch, &mut output_buffer).unwrap();
            drain_rows(&mut output_buffer, &mut results);
        }
        selector.finish(&mut output_buffer).unwrap();
        drain_rows(&mut output_buffer, &mut results);

        results
    }

    /// Appends the `(primary key, timestamp)` pair of every row in `batch`
    /// to `out`, reading column 1 as timestamps and column 2 as primary keys.
    fn extract_flat_rows(batch: &RecordBatch, out: &mut Vec<(Vec<u8>, i64)>) {
        let timestamps = batch
            .column(1)
            .as_any()
            .downcast_ref::<TimestampMillisecondArray>()
            .unwrap();
        let pk_dict = batch
            .column(2)
            .as_any()
            .downcast_ref::<PrimaryKeyArray>()
            .unwrap();
        let pk_values = pk_dict
            .values()
            .as_any()
            .downcast_ref::<BinaryArray>()
            .unwrap();

        out.extend((0..batch.num_rows()).map(|row_idx| {
            let value_idx = pk_dict.keys().value(row_idx) as usize;
            (
                pk_values.value(value_idx).to_vec(),
                timestamps.value(row_idx),
            )
        }));
    }

    /// One key in one batch keeps only the row with the greatest timestamp.
    #[test]
    fn test_flat_single_batch_one_key() {
        let mut selector = FlatLastTimestampSelector::default();
        let input = vec![new_flat_batch(
            &[b"k1", b"k1", b"k1"],
            &[1, 2, 3],
            &[10, 20, 30],
        )];

        let results = collect_flat_results(&mut selector, input);

        let expected = vec![row(b"k1", 3)];
        assert_eq!(expected, results);
    }

    /// Several keys in one batch each keep their own last row.
    #[test]
    fn test_flat_single_batch_multiple_keys() {
        let mut selector = FlatLastTimestampSelector::default();
        let input = vec![new_flat_batch(
            &[b"k1", b"k1", b"k2", b"k2", b"k3"],
            &[1, 2, 3, 4, 5],
            &[10, 20, 30, 40, 50],
        )];

        let results = collect_flat_results(&mut selector, input);

        let expected = vec![row(b"k1", 2), row(b"k2", 4), row(b"k3", 5)];
        assert_eq!(expected, results);
    }

    /// A key continued in the next batch is resolved across the boundary.
    #[test]
    fn test_flat_key_spans_batches() {
        let mut selector = FlatLastTimestampSelector::default();
        let input = vec![
            new_flat_batch(&[b"k1", b"k1"], &[1, 2], &[10, 20]),
            new_flat_batch(&[b"k1", b"k2"], &[3, 4], &[30, 40]),
            new_flat_batch(&[b"k2", b"k3"], &[5, 6], &[50, 60]),
        ];

        let results = collect_flat_results(&mut selector, input);

        let expected = vec![row(b"k1", 3), row(b"k2", 5), row(b"k3", 6)];
        assert_eq!(expected, results);
    }

    /// Rows sharing the greatest timestamp of a key are all kept.
    #[test]
    fn test_flat_duplicate_last_timestamps() {
        let mut selector = FlatLastTimestampSelector::default();
        let input = vec![new_flat_batch(
            &[b"k1", b"k1", b"k1", b"k2"],
            &[1, 3, 3, 5],
            &[10, 20, 30, 40],
        )];

        let results = collect_flat_results(&mut selector, input);

        let expected = vec![row(b"k1", 3), row(b"k1", 3), row(b"k2", 5)];
        assert_eq!(expected, results);
    }

    /// Rows sharing the greatest timestamp are kept even when they sit in
    /// different batches.
    #[test]
    fn test_flat_duplicate_last_timestamps_across_batches() {
        let mut selector = FlatLastTimestampSelector::default();
        let input = vec![
            new_flat_batch(&[b"k1", b"k1"], &[1, 3], &[10, 20]),
            new_flat_batch(&[b"k1", b"k2"], &[3, 5], &[30, 40]),
        ];

        let results = collect_flat_results(&mut selector, input);

        let expected = vec![row(b"k1", 3), row(b"k1", 3), row(b"k2", 5)];
        assert_eq!(expected, results);
    }

    /// Retained duplicates are discarded once a greater timestamp appears
    /// for the same key.
    #[test]
    fn test_flat_pending_chain_dropped_by_higher_timestamp() {
        let mut selector = FlatLastTimestampSelector::default();
        let input = vec![
            new_flat_batch(&[b"k1", b"k1"], &[1, 3], &[10, 20]),
            new_flat_batch(&[b"k1", b"k1"], &[3, 3], &[21, 22]),
            new_flat_batch(&[b"k1", b"k1"], &[4, 4], &[23, 24]),
        ];

        let results = collect_flat_results(&mut selector, input);

        let expected = vec![row(b"k1", 4), row(b"k1", 4)];
        assert_eq!(expected, results);
    }

    /// `finish` releases the tracked key exactly once.
    #[test]
    fn test_flat_finish_is_one_shot() {
        let mut selector = FlatLastTimestampSelector::default();
        let mut output_buffer = BatchBuffer::new();
        let input = new_flat_batch(&[b"k1", b"k1", b"k2"], &[1, 2, 3], &[10, 20, 30]);

        // `k1` is complete as soon as `k2` shows up in the same batch.
        selector.on_next(input, &mut output_buffer).unwrap();
        let mut pre_finish = Vec::new();
        drain_rows(&mut output_buffer, &mut pre_finish);
        assert_eq!(vec![row(b"k1", 2)], pre_finish);

        // The first finish releases `k2`.
        selector.finish(&mut output_buffer).unwrap();
        assert!(!output_buffer.is_empty());
        output_buffer.batches.clear();
        output_buffer.num_rows = 0;

        // A second finish has nothing left to release.
        selector.finish(&mut output_buffer).unwrap();
        assert!(output_buffer.is_empty());
    }
}