Merge branch 'main' into transform-count-min-max

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>
This commit is contained in:
Ruihang Xia
2024-11-01 17:45:18 +08:00
16 changed files with 472 additions and 227 deletions

View File

@@ -0,0 +1,16 @@
# Change Log Level on the Fly
## HTTP API
example:
```bash
curl --data "trace;flow=debug" 127.0.0.1:4000/debug/log_level
```
The database will reply with something like:
```bash
Log Level changed from Some("info") to "trace;flow=debug"
```
The data is a string in the format `global_level;module1=level1;module2=level2;...`, following the same rules as `RUST_LOG`.
Each module is the module path whose log level is being set, and each level is the log level to apply to it. The log level can be one of the following: `trace`, `debug`, `info`, `warn`, `error`, `off` (case-insensitive).

View File

@@ -18,7 +18,6 @@ use common_error::status_code::StatusCode;
use common_macro::stack_trace_debug;
use common_meta::DatanodeId;
use common_runtime::JoinError;
use rand::distributions::WeightedError;
use snafu::{Location, Snafu};
use store_api::storage::RegionId;
use table::metadata::TableId;
@@ -32,6 +31,14 @@ use crate::pubsub::Message;
#[snafu(visibility(pub))]
#[stack_trace_debug]
pub enum Error {
#[snafu(display("Failed to choose items"))]
ChooseItems {
#[snafu(implicit)]
location: Location,
#[snafu(source)]
error: rand::distributions::WeightedError,
},
#[snafu(display("Exceeded deadline, operation: {}", operation))]
ExceededDeadline {
#[snafu(implicit)]
@@ -643,20 +650,6 @@ pub enum Error {
location: Location,
},
#[snafu(display("Failed to set weight array"))]
WeightArray {
#[snafu(source)]
error: WeightedError,
#[snafu(implicit)]
location: Location,
},
#[snafu(display("Weight array is not set"))]
NotSetWeightArray {
#[snafu(implicit)]
location: Location,
},
#[snafu(display("Unexpected table route type: {}", err_msg))]
UnexpectedLogicalRouteTable {
#[snafu(implicit)]
@@ -759,10 +752,9 @@ impl ErrorExt for Error {
| Error::NoEnoughAvailableNode { .. }
| Error::PublishMessage { .. }
| Error::Join { .. }
| Error::WeightArray { .. }
| Error::NotSetWeightArray { .. }
| Error::PeerUnavailable { .. }
| Error::ExceededDeadline { .. } => StatusCode::Internal,
| Error::ExceededDeadline { .. }
| Error::ChooseItems { .. } => StatusCode::Internal,
Error::Unsupported { .. } => StatusCode::Unsupported,

View File

@@ -12,29 +12,23 @@
// See the License for the specific language governing permissions and
// limitations under the License.
use std::collections::HashSet;
use common_meta::peer::Peer;
use snafu::ensure;
use super::weighted_choose::{WeightedChoose, WeightedItem};
use super::weighted_choose::WeightedChoose;
use crate::error;
use crate::error::Result;
use crate::metasrv::SelectTarget;
use crate::selector::SelectorOptions;
/// According to the `opts`, choose peers from the `weight_array` through `weighted_choose`.
pub fn choose_peers<W>(
mut weight_array: Vec<WeightedItem<Peer>>,
opts: &SelectorOptions,
weighted_choose: &mut W,
) -> Result<Vec<Peer>>
pub fn choose_peers<W>(opts: &SelectorOptions, weighted_choose: &mut W) -> Result<Vec<Peer>>
where
W: WeightedChoose<Peer>,
{
let min_required_items = opts.min_required_items;
ensure!(
!weight_array.is_empty(),
!weighted_choose.is_empty(),
error::NoEnoughAvailableNodeSnafu {
required: min_required_items,
available: 0_usize,
@@ -43,12 +37,11 @@ where
);
if opts.allow_duplication {
weighted_choose.set_weight_array(weight_array)?;
(0..min_required_items)
.map(|_| weighted_choose.choose_one())
.collect::<Result<_>>()
} else {
let weight_array_len = weight_array.len();
let weight_array_len = weighted_choose.len();
// When opts.allow_duplication is false, we need to check that the length of the weighted array is greater than
// or equal to min_required_items, otherwise it may cause an infinite loop.
@@ -61,33 +54,7 @@ where
}
);
if weight_array_len == min_required_items {
return Ok(weight_array.into_iter().map(|item| item.item).collect());
}
weighted_choose.set_weight_array(weight_array.clone())?;
// Assume min_required_items is 3, weight_array_len is 100, then we can choose 3 items from the weight array
// and return. But assume min_required_items is 99, weight_array_len is 100. It's not cheap to choose 99 items
// from the weight array. So we can reverse choose 1 item from the weight array, and return the remaining 99
// items.
if min_required_items * 2 > weight_array_len {
let select_num = weight_array_len - min_required_items;
let mut selected = HashSet::with_capacity(select_num);
while selected.len() < select_num {
let item = weighted_choose.reverse_choose_one()?;
selected.insert(item);
}
weight_array.retain(|item| !selected.contains(&item.item));
Ok(weight_array.into_iter().map(|item| item.item).collect())
} else {
let mut selected = HashSet::with_capacity(min_required_items);
while selected.len() < min_required_items {
let item = weighted_choose.choose_one()?;
selected.insert(item);
}
Ok(selected.into_iter().collect())
}
weighted_choose.choose_multiple(min_required_items)
}
}
@@ -110,7 +77,6 @@ mod tests {
addr: "127.0.0.1:3001".to_string(),
},
weight: 1,
reverse_weight: 1,
},
WeightedItem {
item: Peer {
@@ -118,7 +84,6 @@ mod tests {
addr: "127.0.0.1:3001".to_string(),
},
weight: 1,
reverse_weight: 1,
},
WeightedItem {
item: Peer {
@@ -126,7 +91,6 @@ mod tests {
addr: "127.0.0.1:3001".to_string(),
},
weight: 1,
reverse_weight: 1,
},
WeightedItem {
item: Peer {
@@ -134,7 +98,6 @@ mod tests {
addr: "127.0.0.1:3001".to_string(),
},
weight: 1,
reverse_weight: 1,
},
WeightedItem {
item: Peer {
@@ -142,7 +105,6 @@ mod tests {
addr: "127.0.0.1:3001".to_string(),
},
weight: 1,
reverse_weight: 1,
},
];
@@ -152,14 +114,11 @@ mod tests {
allow_duplication: false,
};
let selected_peers: HashSet<_> = choose_peers(
weight_array.clone(),
&opts,
&mut RandomWeightedChoose::default(),
)
.unwrap()
.into_iter()
.collect();
let selected_peers: HashSet<_> =
choose_peers(&opts, &mut RandomWeightedChoose::new(weight_array.clone()))
.unwrap()
.into_iter()
.collect();
assert_eq!(i, selected_peers.len());
}
@@ -169,11 +128,8 @@ mod tests {
allow_duplication: false,
};
let selected_result = choose_peers(
weight_array.clone(),
&opts,
&mut RandomWeightedChoose::default(),
);
let selected_result =
choose_peers(&opts, &mut RandomWeightedChoose::new(weight_array.clone()));
assert!(selected_result.is_err());
for i in 1..=50 {
@@ -182,12 +138,8 @@ mod tests {
allow_duplication: true,
};
let selected_peers = choose_peers(
weight_array.clone(),
&opts,
&mut RandomWeightedChoose::default(),
)
.unwrap();
let selected_peers =
choose_peers(&opts, &mut RandomWeightedChoose::new(weight_array.clone())).unwrap();
assert_eq!(i, selected_peers.len());
}

View File

@@ -48,13 +48,12 @@ impl Selector for LeaseBasedSelector {
addr: v.node_addr.clone(),
},
weight: 1,
reverse_weight: 1,
})
.collect();
// 3. choose peers by weight_array.
let weighted_choose = &mut RandomWeightedChoose::default();
let selected = choose_peers(weight_array, &opts, weighted_choose)?;
let mut weighted_choose = RandomWeightedChoose::new(weight_array);
let selected = choose_peers(&opts, &mut weighted_choose)?;
Ok(selected)
}

View File

@@ -19,7 +19,6 @@ use common_meta::key::TableMetadataManager;
use common_meta::peer::Peer;
use common_meta::rpc::router::find_leaders;
use common_telemetry::{debug, info};
use parking_lot::RwLock;
use snafu::ResultExt;
use table::metadata::TableId;
@@ -29,36 +28,30 @@ use crate::lease;
use crate::metasrv::SelectorContext;
use crate::selector::common::choose_peers;
use crate::selector::weight_compute::{RegionNumsBasedWeightCompute, WeightCompute};
use crate::selector::weighted_choose::{RandomWeightedChoose, WeightedChoose};
use crate::selector::weighted_choose::RandomWeightedChoose;
use crate::selector::{Namespace, Selector, SelectorOptions};
pub struct LoadBasedSelector<W, C> {
weighted_choose: RwLock<W>,
pub struct LoadBasedSelector<C> {
weight_compute: C,
}
impl<W, C> LoadBasedSelector<W, C> {
pub fn new(weighted_choose: W, weight_compute: C) -> Self {
Self {
weighted_choose: RwLock::new(weighted_choose),
weight_compute,
}
impl<C> LoadBasedSelector<C> {
pub fn new(weight_compute: C) -> Self {
Self { weight_compute }
}
}
impl Default for LoadBasedSelector<RandomWeightedChoose<Peer>, RegionNumsBasedWeightCompute> {
impl Default for LoadBasedSelector<RegionNumsBasedWeightCompute> {
fn default() -> Self {
Self {
weighted_choose: RwLock::new(RandomWeightedChoose::default()),
weight_compute: RegionNumsBasedWeightCompute,
}
}
}
#[async_trait::async_trait]
impl<W, C> Selector for LoadBasedSelector<W, C>
impl<C> Selector for LoadBasedSelector<C>
where
W: WeightedChoose<Peer>,
C: WeightCompute<Source = HashMap<DatanodeStatKey, DatanodeStatValue>>,
{
type Context = SelectorContext;
@@ -100,8 +93,8 @@ where
let weight_array = self.weight_compute.compute(&stat_kvs);
// 5. choose peers by weight_array.
let mut weighted_choose = self.weighted_choose.write();
let selected = choose_peers(weight_array, &opts, &mut *weighted_choose)?;
let mut weighted_choose = RandomWeightedChoose::new(weight_array);
let selected = choose_peers(&opts, &mut weighted_choose)?;
debug!(
"LoadBasedSelector select peers: {:?}, namespace: {}, opts: {:?}.",

View File

@@ -85,7 +85,6 @@ impl WeightCompute for RegionNumsBasedWeightCompute {
.map(|(peer, region_num)| WeightedItem {
item: peer,
weight: (max_weight - region_num + base_weight) as usize,
reverse_weight: (region_num - min_weight + base_weight) as usize,
})
.collect()
}
@@ -181,10 +180,6 @@ mod tests {
},
4,
);
for weight in weight_array.iter() {
assert_eq!(weight.reverse_weight, *expected.get(&weight.item).unwrap());
}
}
fn mock_stat_1() -> Stat {

View File

@@ -12,41 +12,37 @@
// See the License for the specific language governing permissions and
// limitations under the License.
use rand::distributions::WeightedIndex;
use rand::prelude::Distribution;
use rand::seq::SliceRandom;
use rand::thread_rng;
use snafu::{ensure, ResultExt};
use snafu::ResultExt;
use crate::error;
use crate::error::Result;
/// A common trait for weighted balance algorithm.
pub trait WeightedChoose<Item>: Send + Sync {
/// The method will re-set weight array.
///
/// Note:
/// 1. make sure weight_array is not empty.
/// 2. the total weight is greater than 0.
///
/// Otherwise an error will be returned.
fn set_weight_array(&mut self, weight_array: Vec<WeightedItem<Item>>) -> Result<()>;
/// The method will choose one item.
///
/// If not set weight_array before, an error will be returned.
fn choose_one(&mut self) -> Result<Item>;
/// The method will reverse choose one item.
/// The method will choose multiple items.
///
/// If not set weight_array before, an error will be returned.
fn reverse_choose_one(&mut self) -> Result<Item>;
/// Returns less than `amount` items if the weight_array is not enough.
fn choose_multiple(&mut self, amount: usize) -> Result<Vec<Item>>;
/// Returns the length of the weight_array.
fn len(&self) -> usize;
/// Returns whether the weight_array is empty.
fn is_empty(&self) -> bool {
self.len() == 0
}
}
/// The struct represents a weighted item.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct WeightedItem<Item> {
pub item: Item,
pub weight: usize,
pub reverse_weight: usize,
}
/// A implementation of weighted balance: random weighted choose.
@@ -64,16 +60,18 @@ pub struct WeightedItem<Item> {
/// ```
pub struct RandomWeightedChoose<Item> {
items: Vec<WeightedItem<Item>>,
weighted_index: Option<WeightedIndex<usize>>,
reverse_weighted_index: Option<WeightedIndex<usize>>,
}
impl<Item> RandomWeightedChoose<Item> {
pub fn new(items: Vec<WeightedItem<Item>>) -> Self {
Self { items }
}
}
impl<Item> Default for RandomWeightedChoose<Item> {
fn default() -> Self {
Self {
items: Vec::default(),
weighted_index: None,
reverse_weighted_index: None,
}
}
}
@@ -82,48 +80,29 @@ impl<Item> WeightedChoose<Item> for RandomWeightedChoose<Item>
where
Item: Clone + Send + Sync,
{
fn set_weight_array(&mut self, weight_array: Vec<WeightedItem<Item>>) -> Result<()> {
self.weighted_index = Some(
WeightedIndex::new(weight_array.iter().map(|item| item.weight))
.context(error::WeightArraySnafu)?,
);
self.reverse_weighted_index = Some(
WeightedIndex::new(weight_array.iter().map(|item| item.reverse_weight))
.context(error::WeightArraySnafu)?,
);
self.items = weight_array;
Ok(())
}
fn choose_one(&mut self) -> Result<Item> {
ensure!(
!self.items.is_empty() && self.weighted_index.is_some(),
error::NotSetWeightArraySnafu
);
// unwrap safety: whether weighted_index is none has been checked before.
let weighted_index = self.weighted_index.as_ref().unwrap();
Ok(self.items[weighted_index.sample(&mut thread_rng())]
let item = self
.items
.choose_weighted(&mut thread_rng(), |item| item.weight as f64)
.context(error::ChooseItemsSnafu)?
.item
.clone())
.clone();
Ok(item)
}
fn reverse_choose_one(&mut self) -> Result<Item> {
ensure!(
!self.items.is_empty() && self.reverse_weighted_index.is_some(),
error::NotSetWeightArraySnafu
);
fn choose_multiple(&mut self, amount: usize) -> Result<Vec<Item>> {
Ok(self
.items
.choose_multiple_weighted(&mut thread_rng(), amount, |item| item.weight as f64)
.context(error::ChooseItemsSnafu)?
.cloned()
.map(|item| item.item)
.collect::<Vec<_>>())
}
// unwrap safety: whether reverse_weighted_index is none has been checked before.
let reverse_weighted_index = self.reverse_weighted_index.as_ref().unwrap();
Ok(self.items[reverse_weighted_index.sample(&mut thread_rng())]
.item
.clone())
fn len(&self) -> usize {
self.items.len()
}
}
@@ -133,45 +112,22 @@ mod tests {
#[test]
fn test_random_weighted_choose() {
let mut choose = RandomWeightedChoose::default();
choose
.set_weight_array(vec![
WeightedItem {
item: 1,
weight: 100,
reverse_weight: 0,
},
WeightedItem {
item: 2,
weight: 0,
reverse_weight: 100,
},
])
.unwrap();
let mut choose = RandomWeightedChoose::new(vec![
WeightedItem {
item: 1,
weight: 100,
},
WeightedItem { item: 2, weight: 0 },
]);
for _ in 0..100 {
let ret = choose.choose_one().unwrap();
assert_eq!(1, ret);
}
for _ in 0..100 {
let ret = choose.reverse_choose_one().unwrap();
assert_eq!(2, ret);
let ret = choose.choose_multiple(3).unwrap();
assert_eq!(vec![1, 2], ret);
}
}
#[test]
#[should_panic]
fn test_random_weighted_choose_should_panic() {
let mut choose: RandomWeightedChoose<u32> = RandomWeightedChoose::default();
choose.set_weight_array(vec![]).unwrap();
let _ = choose.choose_one().unwrap();
}
#[test]
#[should_panic]
fn test_random_reverse_weighted_choose_should_panic() {
let mut choose: RandomWeightedChoose<u32> = RandomWeightedChoose::default();
choose.set_weight_array(vec![]).unwrap();
let _ = choose.reverse_choose_one().unwrap();
}
}

View File

@@ -80,18 +80,15 @@ impl CacheManager {
CacheManagerBuilder::default()
}
/// Gets cached [ParquetMetaData].
/// Gets cached [ParquetMetaData] from in-memory cache first.
/// If not found, tries to get it from write cache and fill the in-memory cache.
pub async fn get_parquet_meta_data(
&self,
region_id: RegionId,
file_id: FileId,
) -> Option<Arc<ParquetMetaData>> {
// Try to get metadata from sst meta cache
let metadata = self.sst_meta_cache.as_ref().and_then(|sst_meta_cache| {
let value = sst_meta_cache.get(&SstMetaKey(region_id, file_id));
update_hit_miss(value, SST_META_TYPE)
});
let metadata = self.get_parquet_meta_data_from_mem_cache(region_id, file_id);
if metadata.is_some() {
return metadata;
}
@@ -110,6 +107,20 @@ impl CacheManager {
None
}
/// Gets cached [ParquetMetaData] from in-memory cache.
/// This method does not perform I/O.
pub fn get_parquet_meta_data_from_mem_cache(
&self,
region_id: RegionId,
file_id: FileId,
) -> Option<Arc<ParquetMetaData>> {
// Try to get metadata from sst meta cache
self.sst_meta_cache.as_ref().and_then(|sst_meta_cache| {
let value = sst_meta_cache.get(&SstMetaKey(region_id, file_id));
update_hit_miss(value, SST_META_TYPE)
})
}
/// Puts [ParquetMetaData] into the cache.
pub fn put_parquet_meta_data(
&self,

View File

@@ -18,15 +18,17 @@ use common_time::Timestamp;
use smallvec::{smallvec, SmallVec};
use store_api::region_engine::PartitionRange;
use crate::cache::CacheManager;
use crate::memtable::MemtableRef;
use crate::read::scan_region::ScanInput;
use crate::sst::file::{overlaps, FileHandle, FileTimeRange};
use crate::sst::parquet::format::parquet_row_group_time_range;
use crate::sst::parquet::DEFAULT_ROW_GROUP_SIZE;
const ALL_ROW_GROUPS: i64 = -1;
/// Index to access a row group.
#[derive(Clone, Copy, PartialEq)]
#[derive(Debug, Clone, Copy, PartialEq)]
pub(crate) struct RowGroupIndex {
/// Index to the memtable/file.
pub(crate) index: usize,
@@ -38,6 +40,7 @@ pub(crate) struct RowGroupIndex {
/// Meta data of a partition range.
/// If the scanner is [UnorderedScan], each meta only has one row group or memtable.
/// If the scanner is [SeqScan], each meta may have multiple row groups and memtables.
#[derive(Debug, PartialEq)]
pub(crate) struct RangeMeta {
/// The time range of the range.
pub(crate) time_range: FileTimeRange,
@@ -84,7 +87,12 @@ impl RangeMeta {
pub(crate) fn unordered_scan_ranges(input: &ScanInput) -> Vec<RangeMeta> {
let mut ranges = Vec::with_capacity(input.memtables.len() + input.files.len());
Self::push_unordered_mem_ranges(&input.memtables, &mut ranges);
Self::push_unordered_file_ranges(input.memtables.len(), &input.files, &mut ranges);
Self::push_unordered_file_ranges(
input.memtables.len(),
&input.files,
input.cache_manager.as_deref(),
&mut ranges,
);
ranges
}
@@ -164,12 +172,36 @@ impl RangeMeta {
fn push_unordered_file_ranges(
num_memtables: usize,
files: &[FileHandle],
cache: Option<&CacheManager>,
ranges: &mut Vec<RangeMeta>,
) {
// For append mode, we can parallelize reading row groups.
for (i, file) in files.iter().enumerate() {
let file_index = num_memtables + i;
if file.meta_ref().num_row_groups > 0 {
// Get parquet meta from the cache.
let parquet_meta = cache.and_then(|c| {
c.get_parquet_meta_data_from_mem_cache(file.region_id(), file.file_id())
});
if let Some(parquet_meta) = parquet_meta {
// Scans each row group.
for row_group_index in 0..file.meta_ref().num_row_groups {
let time_range = parquet_row_group_time_range(
file.meta_ref(),
&parquet_meta,
row_group_index as usize,
);
let num_rows = parquet_meta.row_group(row_group_index as usize).num_rows();
ranges.push(RangeMeta {
time_range: time_range.unwrap_or_else(|| file.time_range()),
indices: smallvec![file_index],
row_group_indices: smallvec![RowGroupIndex {
index: file_index,
row_group_index: row_group_index as i64,
}],
num_rows: num_rows as usize,
});
}
} else if file.meta_ref().num_row_groups > 0 {
// Scans each row group.
for row_group_index in 0..file.meta_ref().num_row_groups {
ranges.push(RangeMeta {
@@ -217,7 +249,6 @@ impl RangeMeta {
}
}
// TODO(yingwen): Support multiple row groups in a range so we can split them later.
fn push_seq_file_ranges(
num_memtables: usize,
files: &[FileHandle],
@@ -226,15 +257,31 @@ impl RangeMeta {
// For non append-only mode, each range only contains one file.
for (i, file) in files.iter().enumerate() {
let file_index = num_memtables + i;
ranges.push(RangeMeta {
time_range: file.time_range(),
indices: smallvec![file_index],
row_group_indices: smallvec![RowGroupIndex {
index: file_index,
row_group_index: ALL_ROW_GROUPS,
}],
num_rows: file.meta_ref().num_rows as usize,
});
if file.meta_ref().num_row_groups > 0 {
// All row groups share the same time range.
let row_group_indices = (0..file.meta_ref().num_row_groups)
.map(|row_group_index| RowGroupIndex {
index: file_index,
row_group_index: row_group_index as i64,
})
.collect();
ranges.push(RangeMeta {
time_range: file.time_range(),
indices: smallvec![file_index],
row_group_indices,
num_rows: file.meta_ref().num_rows as usize,
});
} else {
ranges.push(RangeMeta {
time_range: file.time_range(),
indices: smallvec![file_index],
row_group_indices: smallvec![RowGroupIndex {
index: file_index,
row_group_index: ALL_ROW_GROUPS,
}],
num_rows: file.meta_ref().num_rows as usize,
});
}
}
}
}
@@ -366,4 +413,212 @@ mod tests {
&[(vec![3], 0, 1000), (vec![1, 2], 3000, 6000)],
);
}
#[test]
fn test_merge_range() {
let mut left = RangeMeta {
time_range: (Timestamp::new_second(1000), Timestamp::new_second(2000)),
indices: smallvec![1],
row_group_indices: smallvec![
RowGroupIndex {
index: 1,
row_group_index: 1
},
RowGroupIndex {
index: 1,
row_group_index: 2
}
],
num_rows: 5,
};
let right = RangeMeta {
time_range: (Timestamp::new_second(800), Timestamp::new_second(1200)),
indices: smallvec![2],
row_group_indices: smallvec![
RowGroupIndex {
index: 2,
row_group_index: 1
},
RowGroupIndex {
index: 2,
row_group_index: 2
}
],
num_rows: 4,
};
left.merge(right);
assert_eq!(
left,
RangeMeta {
time_range: (Timestamp::new_second(800), Timestamp::new_second(2000)),
indices: smallvec![1, 2],
row_group_indices: smallvec![
RowGroupIndex {
index: 1,
row_group_index: 1
},
RowGroupIndex {
index: 1,
row_group_index: 2
},
RowGroupIndex {
index: 2,
row_group_index: 1
},
RowGroupIndex {
index: 2,
row_group_index: 2
},
],
num_rows: 9,
}
);
}
#[test]
fn test_split_range() {
let range = RangeMeta {
time_range: (Timestamp::new_second(1000), Timestamp::new_second(2000)),
indices: smallvec![1],
row_group_indices: smallvec![
RowGroupIndex {
index: 1,
row_group_index: 1
},
RowGroupIndex {
index: 1,
row_group_index: 2
}
],
num_rows: 5,
};
assert!(range.can_split_preserve_order());
let mut output = Vec::new();
range.maybe_split(&mut output);
assert_eq!(
output,
&[
RangeMeta {
time_range: (Timestamp::new_second(1000), Timestamp::new_second(2000)),
indices: smallvec![1],
row_group_indices: smallvec![RowGroupIndex {
index: 1,
row_group_index: 1
},],
num_rows: 2,
},
RangeMeta {
time_range: (Timestamp::new_second(1000), Timestamp::new_second(2000)),
indices: smallvec![1],
row_group_indices: smallvec![RowGroupIndex {
index: 1,
row_group_index: 2
}],
num_rows: 2,
}
]
);
}
#[test]
fn test_not_split_range() {
let range = RangeMeta {
time_range: (Timestamp::new_second(1000), Timestamp::new_second(2000)),
indices: smallvec![1, 2],
row_group_indices: smallvec![
RowGroupIndex {
index: 1,
row_group_index: 1
},
RowGroupIndex {
index: 2,
row_group_index: 1
}
],
num_rows: 5,
};
assert!(!range.can_split_preserve_order());
let mut output = Vec::new();
range.maybe_split(&mut output);
assert_eq!(1, output.len());
}
#[test]
fn test_maybe_split_ranges() {
let ranges = vec![
RangeMeta {
time_range: (Timestamp::new_second(1000), Timestamp::new_second(2000)),
indices: smallvec![1],
row_group_indices: smallvec![
RowGroupIndex {
index: 1,
row_group_index: 0
},
RowGroupIndex {
index: 1,
row_group_index: 1
}
],
num_rows: 4,
},
RangeMeta {
time_range: (Timestamp::new_second(3000), Timestamp::new_second(4000)),
indices: smallvec![2, 3],
row_group_indices: smallvec![
RowGroupIndex {
index: 2,
row_group_index: 0
},
RowGroupIndex {
index: 3,
row_group_index: 0
}
],
num_rows: 5,
},
];
let output = maybe_split_ranges_for_seq_scan(ranges);
assert_eq!(
output,
vec![
RangeMeta {
time_range: (Timestamp::new_second(1000), Timestamp::new_second(2000)),
indices: smallvec![1],
row_group_indices: smallvec![RowGroupIndex {
index: 1,
row_group_index: 0
},],
num_rows: 2,
},
RangeMeta {
time_range: (Timestamp::new_second(1000), Timestamp::new_second(2000)),
indices: smallvec![1],
row_group_indices: smallvec![RowGroupIndex {
index: 1,
row_group_index: 1
}],
num_rows: 2,
},
RangeMeta {
time_range: (Timestamp::new_second(3000), Timestamp::new_second(4000)),
indices: smallvec![2, 3],
row_group_indices: smallvec![
RowGroupIndex {
index: 2,
row_group_index: 0
},
RowGroupIndex {
index: 3,
row_group_index: 0
}
],
num_rows: 5,
},
]
)
}
}

View File

@@ -111,7 +111,8 @@ pub struct FileMeta {
pub region_id: RegionId,
/// Compared to normal file names, FileId ignore the extension
pub file_id: FileId,
/// Timestamp range of file.
/// Timestamp range of file. The timestamps have the same time unit as the
/// data in the SST.
pub time_range: FileTimeRange,
/// SST level of the file.
pub level: Level,

View File

@@ -62,7 +62,8 @@ impl Default for WriteOptions {
/// Parquet SST info returned by the writer.
pub struct SstInfo {
/// Time range of the SST.
/// Time range of the SST. The timestamps have the same time unit as the
/// data in the SST.
pub time_range: FileTimeRange,
/// File size in bytes.
pub file_size: u64,

View File

@@ -31,13 +31,14 @@ use std::collections::{HashMap, VecDeque};
use std::sync::Arc;
use api::v1::SemanticType;
use common_time::Timestamp;
use datafusion_common::ScalarValue;
use datatypes::arrow::array::{ArrayRef, BinaryArray, DictionaryArray, UInt32Array, UInt64Array};
use datatypes::arrow::datatypes::{SchemaRef, UInt32Type};
use datatypes::arrow::record_batch::RecordBatch;
use datatypes::prelude::DataType;
use datatypes::vectors::{Helper, Vector};
use parquet::file::metadata::RowGroupMetaData;
use parquet::file::metadata::{ParquetMetaData, RowGroupMetaData};
use parquet::file::statistics::Statistics;
use snafu::{ensure, OptionExt, ResultExt};
use store_api::metadata::{ColumnMetadata, RegionMetadataRef};
@@ -48,6 +49,7 @@ use crate::error::{
};
use crate::read::{Batch, BatchBuilder, BatchColumn};
use crate::row_converter::{McmpRowCodec, RowCodec, SortField};
use crate::sst::file::{FileMeta, FileTimeRange};
use crate::sst::to_sst_arrow_schema;
/// Arrow array type for the primary key dictionary.
@@ -558,6 +560,50 @@ fn new_primary_key_array(primary_key: &[u8], num_rows: usize) -> ArrayRef {
Arc::new(DictionaryArray::new(keys, values))
}
/// Gets the min/max time index of the row group from the parquet meta.
/// It assumes the parquet is created by the mito engine.
pub(crate) fn parquet_row_group_time_range(
file_meta: &FileMeta,
parquet_meta: &ParquetMetaData,
row_group_idx: usize,
) -> Option<FileTimeRange> {
let row_group_meta = parquet_meta.row_group(row_group_idx);
let num_columns = parquet_meta.file_metadata().schema_descr().num_columns();
assert!(
num_columns >= FIXED_POS_COLUMN_NUM,
"file only has {} columns",
num_columns
);
let time_index_pos = num_columns - FIXED_POS_COLUMN_NUM;
let stats = row_group_meta.column(time_index_pos).statistics()?;
if stats.has_min_max_set() {
// The physical type for the timestamp should be i64.
let (min, max) = match stats {
Statistics::Int64(value_stats) => (*value_stats.min(), *value_stats.max()),
Statistics::Int32(_)
| Statistics::Boolean(_)
| Statistics::Int96(_)
| Statistics::Float(_)
| Statistics::Double(_)
| Statistics::ByteArray(_)
| Statistics::FixedLenByteArray(_) => return None,
};
debug_assert!(
min >= file_meta.time_range.0.value() && min <= file_meta.time_range.1.value()
);
debug_assert!(
max >= file_meta.time_range.0.value() && max <= file_meta.time_range.1.value()
);
let unit = file_meta.time_range.0.unit();
Some((Timestamp::new(min, unit), Timestamp::new(max, unit)))
} else {
None
}
}
#[cfg(test)]
mod tests {
use api::v1::OpType;

View File

@@ -101,6 +101,7 @@ impl WindowedSortPhysicalRule {
} else {
Arc::new(PartSortExec::new(
first_sort_expr.clone(),
sort_exec.fetch(),
scanner_info.partition_ranges.clone(),
sort_exec.input().clone(),
))

View File

@@ -47,6 +47,7 @@ use crate::downcast_ts_array;
pub struct PartSortExec {
/// Physical sort expressions(that is, sort by timestamp)
expression: PhysicalSortExpr,
limit: Option<usize>,
input: Arc<dyn ExecutionPlan>,
/// Execution metrics
metrics: ExecutionPlanMetricsSet,
@@ -57,6 +58,7 @@ pub struct PartSortExec {
impl PartSortExec {
pub fn new(
expression: PhysicalSortExpr,
limit: Option<usize>,
partition_ranges: Vec<Vec<PartitionRange>>,
input: Arc<dyn ExecutionPlan>,
) -> Self {
@@ -69,6 +71,7 @@ impl PartSortExec {
Self {
expression,
limit,
input,
metrics,
partition_ranges,
@@ -95,6 +98,7 @@ impl PartSortExec {
let df_stream = Box::pin(PartSortStream::new(
context,
self,
self.limit,
input_stream,
self.partition_ranges[partition].clone(),
partition,
@@ -106,7 +110,16 @@ impl PartSortExec {
impl DisplayAs for PartSortExec {
fn fmt_as(&self, _t: DisplayFormatType, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "PartSortExec {}", self.expression)
write!(
f,
"PartSortExec: expr={} num_ranges={}",
self.expression,
self.partition_ranges.len(),
)?;
if let Some(limit) = self.limit {
write!(f, " limit={}", limit)?;
}
Ok(())
}
}
@@ -138,6 +151,7 @@ impl ExecutionPlan for PartSortExec {
};
Ok(Arc::new(Self::new(
self.expression.clone(),
self.limit,
self.partition_ranges.clone(),
new_input.clone(),
)))
@@ -170,6 +184,7 @@ struct PartSortStream {
reservation: MemoryReservation,
buffer: Vec<DfRecordBatch>,
expression: PhysicalSortExpr,
limit: Option<usize>,
produced: usize,
input: DfSendableRecordBatchStream,
input_complete: bool,
@@ -185,6 +200,7 @@ impl PartSortStream {
fn new(
context: Arc<TaskContext>,
sort: &PartSortExec,
limit: Option<usize>,
input: DfSendableRecordBatchStream,
partition_ranges: Vec<PartitionRange>,
partition: usize,
@@ -194,6 +210,7 @@ impl PartSortStream {
.register(&context.runtime_env().memory_pool),
buffer: Vec::new(),
expression: sort.expression.clone(),
limit,
produced: 0,
input,
input_complete: false,
@@ -294,7 +311,7 @@ impl PartSortStream {
)
})?;
let indices = sort_to_indices(&sort_column, opt, None).map_err(|e| {
let indices = sort_to_indices(&sort_column, opt, self.limit).map_err(|e| {
DataFusionError::ArrowError(
e,
Some(format!("Fail to sort to indices at {}", location!())),
@@ -674,6 +691,7 @@ mod test {
expr: Arc::new(Column::new("ts", 0)),
options: opt,
},
None,
vec![ranges],
Arc::new(mock_input),
);

View File

@@ -169,7 +169,16 @@ impl WindowedSortExec {
impl DisplayAs for WindowedSortExec {
fn fmt_as(&self, _t: DisplayFormatType, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "WindowedSortExec")
write!(
f,
"WindowedSortExec: expr={} num_ranges={}",
self.expression,
self.ranges.len()
)?;
if let Some(fetch) = self.fetch {
write!(f, " fetch={}", fetch)?;
}
Ok(())
}
}

View File

@@ -69,7 +69,7 @@ EXPLAIN ANALYZE SELECT * FROM test ORDER BY t LIMIT 5;
|_|_|_|
| 1_| 0_|_GlobalLimitExec: skip=0, fetch=5 REDACTED
|_|_|_SortPreservingMergeExec: [t@1 ASC NULLS LAST] REDACTED
|_|_|_WindowedSortExec REDACTED
|_|_|_WindowedSortExec: expr=t@1 ASC NULLS LAST num_ranges=2 fetch=5 REDACTED
|_|_|_SeqScan: region=REDACTED, partition_count=2 (1 memtable ranges, 1 file 1 ranges) REDACTED
|_|_|_|
|_|_| Total rows: 5_|
@@ -101,8 +101,8 @@ EXPLAIN ANALYZE SELECT * FROM test ORDER BY t DESC LIMIT 5;
|_|_|_|
| 1_| 0_|_GlobalLimitExec: skip=0, fetch=5 REDACTED
|_|_|_SortPreservingMergeExec: [t@1 DESC] REDACTED
|_|_|_WindowedSortExec REDACTED
|_|_|_PartSortExec t@1 DESC REDACTED
|_|_|_WindowedSortExec: expr=t@1 DESC num_ranges=2 fetch=5 REDACTED
|_|_|_PartSortExec: expr=t@1 DESC num_ranges=2 limit=5 REDACTED
|_|_|_SeqScan: region=REDACTED, partition_count=2 (1 memtable ranges, 1 file 1 ranges) REDACTED
|_|_|_|
|_|_| Total rows: 5_|
@@ -183,8 +183,8 @@ EXPLAIN ANALYZE SELECT * FROM test_pk ORDER BY t LIMIT 5;
|_|_|_|
| 1_| 0_|_GlobalLimitExec: skip=0, fetch=5 REDACTED
|_|_|_SortPreservingMergeExec: [t@2 ASC NULLS LAST] REDACTED
|_|_|_WindowedSortExec REDACTED
|_|_|_PartSortExec t@2 ASC NULLS LAST REDACTED
|_|_|_WindowedSortExec: expr=t@2 ASC NULLS LAST num_ranges=2 fetch=5 REDACTED
|_|_|_PartSortExec: expr=t@2 ASC NULLS LAST num_ranges=2 limit=5 REDACTED
|_|_|_SeqScan: region=REDACTED, partition_count=2 (1 memtable ranges, 1 file 1 ranges) REDACTED
|_|_|_|
|_|_| Total rows: 5_|
@@ -216,8 +216,8 @@ EXPLAIN ANALYZE SELECT * FROM test_pk ORDER BY t DESC LIMIT 5;
|_|_|_|
| 1_| 0_|_GlobalLimitExec: skip=0, fetch=5 REDACTED
|_|_|_SortPreservingMergeExec: [t@2 DESC] REDACTED
|_|_|_WindowedSortExec REDACTED
|_|_|_PartSortExec t@2 DESC REDACTED
|_|_|_WindowedSortExec: expr=t@2 DESC num_ranges=2 fetch=5 REDACTED
|_|_|_PartSortExec: expr=t@2 DESC num_ranges=2 limit=5 REDACTED
|_|_|_SeqScan: region=REDACTED, partition_count=2 (1 memtable ranges, 1 file 1 ranges) REDACTED
|_|_|_|
|_|_| Total rows: 5_|