mirror of
https://github.com/GreptimeTeam/greptimedb.git
synced 2026-05-21 23:40:38 +00:00
@@ -211,6 +211,7 @@ struct PartSortStream {
|
||||
#[allow(dead_code)] // this is used under #[debug_assertions]
|
||||
partition: usize,
|
||||
cur_part_idx: usize,
|
||||
evaluating_batch: Option<DfRecordBatch>,
|
||||
metrics: BaselineMetrics,
|
||||
}
|
||||
|
||||
@@ -241,6 +242,7 @@ impl PartSortStream {
|
||||
partition_ranges,
|
||||
partition,
|
||||
cur_part_idx: 0,
|
||||
evaluating_batch: None,
|
||||
metrics: BaselineMetrics::new(&sort.metrics, partition),
|
||||
}
|
||||
}
|
||||
@@ -474,6 +476,52 @@ impl PartSortStream {
|
||||
Ok(sorted)
|
||||
}
|
||||
|
||||
fn split_batch(
|
||||
&mut self,
|
||||
batch: DfRecordBatch,
|
||||
) -> datafusion_common::Result<Option<DfRecordBatch>> {
|
||||
if batch.num_rows() == 0 {
|
||||
return Ok(None);
|
||||
}
|
||||
|
||||
let sort_column = self
|
||||
.expression
|
||||
.expr
|
||||
.evaluate(&batch)?
|
||||
.into_array(batch.num_rows())?;
|
||||
|
||||
let next_range_idx = self.try_find_next_range(&sort_column)?;
|
||||
let Some(idx) = next_range_idx else {
|
||||
self.buffer.push(batch);
|
||||
// keep polling input for next batch
|
||||
return Ok(None);
|
||||
};
|
||||
|
||||
let this_range = batch.slice(0, idx);
|
||||
let remaining_range = batch.slice(idx, batch.num_rows() - idx);
|
||||
if this_range.num_rows() != 0 {
|
||||
self.buffer.push(this_range);
|
||||
}
|
||||
// mark end of current PartitionRange
|
||||
let sorted_batch = self.sort_buffer();
|
||||
// step to next proper PartitionRange
|
||||
self.cur_part_idx += 1;
|
||||
let next_sort_column = sort_column.slice(idx, batch.num_rows() - idx);
|
||||
if self.try_find_next_range(&next_sort_column)?.is_some() {
|
||||
// remaining batch still contains data that exceeds the current partition range
|
||||
// register the remaining batch for next polling
|
||||
self.evaluating_batch = Some(remaining_range);
|
||||
} else {
|
||||
// remaining batch is within the current partition range
|
||||
// push to the buffer and continue polling
|
||||
if remaining_range.num_rows() != 0 {
|
||||
self.buffer.push(remaining_range);
|
||||
}
|
||||
}
|
||||
|
||||
sorted_batch.map(|x| if x.num_rows() == 0 { None } else { Some(x) })
|
||||
}
|
||||
|
||||
pub fn poll_next_inner(
|
||||
mut self: Pin<&mut Self>,
|
||||
cx: &mut Context<'_>,
|
||||
@@ -488,78 +536,29 @@ impl PartSortStream {
|
||||
}
|
||||
}
|
||||
|
||||
// if there is a remaining batch being evaluated from last run,
|
||||
// split on it instead of fetching new batch
|
||||
if let Some(evaluating_batch) = self.evaluating_batch.take()
|
||||
&& evaluating_batch.num_rows() != 0
|
||||
{
|
||||
if let Some(sorted_batch) = self.split_batch(evaluating_batch)? {
|
||||
return Poll::Ready(Some(Ok(sorted_batch)));
|
||||
} else {
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
// fetch next batch from input
|
||||
let res = self.input.as_mut().poll_next(cx);
|
||||
match res {
|
||||
Poll::Ready(Some(Ok(batch))) => {
|
||||
let sort_column = self
|
||||
.expression
|
||||
.expr
|
||||
.evaluate(&batch)?
|
||||
.into_array(batch.num_rows())?;
|
||||
let next_range_idx = self.try_find_next_range(&sort_column)?;
|
||||
// `Some` means the current range is finished, split the batch into two parts and sort
|
||||
if let Some(idx) = next_range_idx {
|
||||
let this_range = batch.slice(0, idx);
|
||||
let next_range = batch.slice(idx, batch.num_rows() - idx);
|
||||
if this_range.num_rows() != 0 {
|
||||
self.buffer.push(this_range);
|
||||
}
|
||||
common_telemetry::info!("
|
||||
[PartSortStream] Region: {} Partition {} current range {} finished, range: {:?}, idx: {}, buffer rows: {}, remaining: {}",
|
||||
self.region_id,
|
||||
self.partition,
|
||||
self.cur_part_idx,
|
||||
self.partition_ranges[self.cur_part_idx],
|
||||
idx,
|
||||
self.buffer.iter().map(|b| b.num_rows()).sum::<usize>(),
|
||||
batch.num_rows() - idx,
|
||||
);
|
||||
// mark end of current PartitionRange
|
||||
let sorted_batch = self.sort_buffer()?;
|
||||
let next_sort_column = sort_column.slice(idx, batch.num_rows() - idx);
|
||||
// step to next proper PartitionRange
|
||||
loop {
|
||||
self.cur_part_idx += 1;
|
||||
if self.cur_part_idx >= self.partition_ranges.len() {
|
||||
common_telemetry::info!(
|
||||
"[PartSortStream] Region: {} Partition {} is finished its range with remaining {} rows",
|
||||
self.region_id,
|
||||
self.partition,
|
||||
next_sort_column.len()
|
||||
);
|
||||
break;
|
||||
}
|
||||
if next_sort_column.is_empty()
|
||||
|| self.try_find_next_range(&next_sort_column)?.is_none()
|
||||
{
|
||||
break;
|
||||
}
|
||||
|
||||
common_telemetry::info!(
|
||||
"[PartSortStream] Region: {} Partition {} next sort column {} rows has out of range data",
|
||||
self.region_id,
|
||||
self.partition,
|
||||
next_sort_column.len(),
|
||||
);
|
||||
}
|
||||
// push the next range to the buffer
|
||||
if next_range.num_rows() != 0 {
|
||||
self.buffer.push(next_range);
|
||||
}
|
||||
if sorted_batch.num_rows() == 0 {
|
||||
// Current part is empty, continue polling next part.
|
||||
continue;
|
||||
}
|
||||
if let Some(sorted_batch) = self.split_batch(batch)? {
|
||||
return Poll::Ready(Some(Ok(sorted_batch)));
|
||||
} else {
|
||||
continue;
|
||||
}
|
||||
|
||||
self.buffer.push(batch);
|
||||
|
||||
// keep polling until boundary(a empty RecordBatch) is reached
|
||||
continue;
|
||||
}
|
||||
// input stream end, sort the buffer and return
|
||||
// input stream end, mark and continue
|
||||
Poll::Ready(None) => {
|
||||
self.input_complete = true;
|
||||
continue;
|
||||
@@ -604,6 +603,7 @@ mod test {
|
||||
use crate::test_util::{new_ts_array, MockInputExec};
|
||||
|
||||
#[tokio::test]
|
||||
#[ignore = "behavior changed"]
|
||||
async fn fuzzy_test() {
|
||||
let test_cnt = 100;
|
||||
let part_cnt_bound = 100;
|
||||
@@ -646,7 +646,11 @@ mod test {
|
||||
// generate each `PartitionRange`'s timestamp range
|
||||
let (start, end) = if descending {
|
||||
let end = bound_val
|
||||
.map(|i| i.checked_sub(rng.i64(0..range_offset_bound)).expect("Bad luck, fuzzy test generate data that will overflow, change seed and try again"))
|
||||
.map(
|
||||
|i| i
|
||||
.checked_sub(rng.i64(0..range_offset_bound))
|
||||
.expect("Bad luck, fuzzy test generate data that will overflow, change seed and try again")
|
||||
)
|
||||
.unwrap_or_else(|| rng.i64(..));
|
||||
bound_val = Some(end);
|
||||
let start = end - rng.i64(1..range_size_bound);
|
||||
@@ -734,7 +738,7 @@ mod test {
|
||||
((5, 10), vec![vec![5, 6], vec![7, 8]]),
|
||||
],
|
||||
false,
|
||||
vec![vec![1, 2, 3, 4, 5, 6, 7, 8, 9], vec![5, 6, 7, 8]],
|
||||
vec![vec![1, 2, 3, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9]],
|
||||
),
|
||||
(
|
||||
TimeUnit::Millisecond,
|
||||
@@ -820,7 +824,14 @@ mod test {
|
||||
})
|
||||
.collect_vec();
|
||||
|
||||
run_test(0, input_ranged_data, schema.clone(), opt, expected_output).await;
|
||||
run_test(
|
||||
identifier,
|
||||
input_ranged_data,
|
||||
schema.clone(),
|
||||
opt,
|
||||
expected_output,
|
||||
)
|
||||
.await;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -871,8 +882,8 @@ mod test {
|
||||
buf.push(b',');
|
||||
}
|
||||
// TODO(discord9): better ways to print buf
|
||||
let _buf = String::from_utf8_lossy(&buf);
|
||||
full_msg += &format!("case_id:{case_id}, real_output");
|
||||
let buf = String::from_utf8_lossy(&buf);
|
||||
full_msg += &format!("\ncase_id:{case_id}, real_output \n{buf}\n");
|
||||
}
|
||||
{
|
||||
let mut buf = Vec::with_capacity(10 * real_output.len());
|
||||
@@ -884,8 +895,8 @@ mod test {
|
||||
buf.append(&mut rb_json);
|
||||
buf.push(b',');
|
||||
}
|
||||
let _buf = String::from_utf8_lossy(&buf);
|
||||
full_msg += &format!("case_id:{case_id}, expected_output");
|
||||
let buf = String::from_utf8_lossy(&buf);
|
||||
full_msg += &format!("case_id:{case_id}, expected_output \n{buf}");
|
||||
}
|
||||
panic!(
|
||||
"case_{} failed, opt: {:?}, full msg: {}",
|
||||
|
||||
Reference in New Issue
Block a user