Merge branch 'main' into chore/rust-0321

This commit is contained in:
Ning Sun
2026-03-25 09:58:41 +08:00
65 changed files with 3223 additions and 699 deletions

View File

@@ -33,7 +33,7 @@ function upload_artifacts() {
# └── greptime-darwin-amd64-v0.2.0.tar.gz
find "$ARTIFACTS_DIR" -type f \( -name "*.tar.gz" -o -name "*.sha256sum" \) | while IFS= read -r file; do
filename=$(basename "$file")
TARGET_URL="$PROXY_URL/$RELEASE_DIRS/$VERSION/$filename"
TARGET_URL="$PROXY_URL/$RELEASE_DIRS/$VERSION"
curl -X PUT \
-u "$PROXY_USERNAME:$PROXY_PASSWORD" \
@@ -49,7 +49,7 @@ function update_version_info() {
if [[ "$VERSION" =~ ^v[0-9]+\.[0-9]+\.[0-9]+$ ]]; then
echo "Updating latest-version.txt"
echo "$VERSION" > latest-version.txt
TARGET_URL="$PROXY_URL/$RELEASE_DIRS/latest-version.txt"
TARGET_URL="$PROXY_URL/$RELEASE_DIRS"
curl -X PUT \
-u "$PROXY_USERNAME:$PROXY_PASSWORD" \
@@ -62,7 +62,7 @@ function update_version_info() {
echo "Updating latest-nightly-version.txt"
echo "$VERSION" > latest-nightly-version.txt
TARGET_URL="$PROXY_URL/$RELEASE_DIRS/latest-nightly-version.txt"
TARGET_URL="$PROXY_URL/$RELEASE_DIRS"
curl -X PUT \
-u "$PROXY_USERNAME:$PROXY_PASSWORD" \
-F "file=@latest-nightly-version.txt" \

10
Cargo.lock generated
View File

@@ -7301,7 +7301,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "07033963ba89ebaf1584d767badaa2e8fcec21aedea6b8c0346d487d49c28667"
dependencies = [
"cfg-if",
"windows-targets 0.52.6",
"windows-targets 0.48.5",
]
[[package]]
@@ -11635,9 +11635,9 @@ dependencies = [
[[package]]
name = "rustls-webpki"
version = "0.103.3"
version = "0.103.10"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e4a72fe2bcf7a6ac6fd7d0b9e5cb68aeb7d4c0a0271730218b3e92d43b4eb435"
checksum = "df33b2b81ac578cabaf06b89b0631153a3f416b0a886e8a7a1707fb51abbd1ef"
dependencies = [
"ring",
"rustls-pki-types",
@@ -13404,9 +13404,9 @@ checksum = "55937e1799185b12863d447f42597ed69d9928686b8d88a1df17376a097d8369"
[[package]]
name = "tar"
version = "0.4.44"
version = "0.4.45"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1d863878d212c87a19c1a610eb53bb01fe12951c0501cf5a0d65f724914a667a"
checksum = "22692a6476a21fa75fdfc11d452fda482af402c008cdbaf3476414e122040973"
dependencies = [
"filetime",
"libc",

View File

@@ -65,11 +65,13 @@ fn init_factory(
fn invalidator<'a>(
cache: &'a Cache<TableName, TableRef>,
ident: &'a CacheIdent,
idents: &'a [&CacheIdent],
) -> BoxFuture<'a, MetaResult<()>> {
Box::pin(async move {
if let CacheIdent::TableName(table_name) = ident {
cache.invalidate(table_name).await
for ident in idents {
if let CacheIdent::TableName(table_name) = ident {
cache.invalidate(table_name).await
}
}
Ok(())
})

View File

@@ -8,7 +8,6 @@ license.workspace = true
testing = []
pg_kvbackend = [
"dep:tokio-postgres",
"dep:backon",
"dep:deadpool-postgres",
"dep:deadpool",
"dep:tokio-postgres-rustls",
@@ -16,7 +15,7 @@ pg_kvbackend = [
"dep:rustls-native-certs",
"dep:rustls",
]
mysql_kvbackend = ["dep:sqlx", "dep:backon"]
mysql_kvbackend = ["dep:sqlx"]
enterprise = ["prost-types"]
[lints]
@@ -28,7 +27,7 @@ api.workspace = true
async-recursion = "1.0"
async-stream.workspace = true
async-trait.workspace = true
backon = { workspace = true, optional = true }
backon.workspace = true
base64.workspace = true
bytes.workspace = true
chrono.workspace = true

View File

@@ -15,10 +15,14 @@
use std::borrow::Borrow;
use std::hash::Hash;
use std::sync::Arc;
use std::sync::atomic::{AtomicUsize, Ordering};
use std::time::Duration;
use futures::future::{BoxFuture, join_all};
use backon::{BackoffBuilder, ExponentialBuilder};
use futures::future::BoxFuture;
use moka::future::Cache;
use snafu::{OptionExt, ResultExt};
use tokio::time::sleep;
use crate::cache_invalidator::{CacheInvalidator, Context};
use crate::error::{self, Error, Result};
@@ -29,12 +33,29 @@ use crate::metrics;
pub type TokenFilter<CacheToken> = Box<dyn Fn(&CacheToken) -> bool + Send + Sync>;
/// Invalidates cached values by [CacheToken]s.
pub type Invalidator<K, V, CacheToken> =
Box<dyn for<'a> Fn(&'a Cache<K, V>, &'a CacheToken) -> BoxFuture<'a, Result<()>> + Send + Sync>;
pub type Invalidator<K, V, CacheToken> = Box<
dyn for<'a> Fn(&'a Cache<K, V>, &'a [&CacheToken]) -> BoxFuture<'a, Result<()>> + Send + Sync,
>;
/// Initializes value (i.e., fetches from remote).
pub type Initializer<K, V> = Arc<dyn Fn(&'_ K) -> BoxFuture<'_, Result<Option<V>>> + Send + Sync>;
/// Initialization strategy for cache-miss loading.
///
/// The strategy is chosen when a [CacheContainer] is built and stays fixed for
/// the lifetime of that container instance.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum InitStrategy {
    /// Fast path: load once, without retrying on version conflicts.
    ///
    /// Under concurrent invalidation, callers may observe a stale/dirty value.
    Unchecked,
    /// Strict path: retry the load when the version changes during initialization.
    ///
    /// This avoids returning a dirty value under invalidate/load races.
    VersionChecked,
}
/// [CacheContainer] provides ability to:
/// - Cache value loaded by [Initializer].
/// - Invalidate caches by [Invalidator].
@@ -44,6 +65,16 @@ pub struct CacheContainer<K, V, CacheToken> {
invalidator: Invalidator<K, V, CacheToken>,
initializer: Initializer<K, V>,
token_filter: fn(&CacheToken) -> bool,
version: Arc<AtomicUsize>,
init_strategy: InitStrategy,
}
/// Builds the backoff schedule used between version-checked load attempts.
///
/// The schedule is an exponential backoff configured with a 10 ms minimum
/// delay, a 100 ms maximum delay and a maximum of 3 retries.
fn latest_get_backoff() -> impl Iterator<Item = Duration> {
    let min_delay = Duration::from_millis(10);
    let max_delay = Duration::from_millis(100);

    let builder = ExponentialBuilder::default()
        .with_min_delay(min_delay)
        .with_max_delay(max_delay)
        .with_max_times(3);
    builder.build()
}
impl<K, V, CacheToken> CacheContainer<K, V, CacheToken>
@@ -52,13 +83,37 @@ where
V: Send + Sync,
CacheToken: Send + Sync,
{
/// Constructs an [CacheContainer].
/// Builds a [CacheContainer] that uses [InitStrategy::Unchecked].
///
/// This preserves the historical behavior, which means a stale/dirty value may
/// be returned under concurrent invalidation.
pub fn new(
    name: String,
    cache: Cache<K, V>,
    invalidator: Invalidator<K, V, CacheToken>,
    initializer: Initializer<K, V>,
    token_filter: fn(&CacheToken) -> bool,
) -> Self {
    // Delegate to the explicit constructor so both entry points share one
    // initialization path.
    let init_strategy = InitStrategy::Unchecked;
    Self::with_strategy(name, cache, invalidator, initializer, token_filter, init_strategy)
}
/// Constructs an [CacheContainer] with explicit [InitStrategy].
///
/// The strategy is fixed at construction time and cannot be changed later.
pub fn with_strategy(
name: String,
cache: Cache<K, V>,
invalidator: Invalidator<K, V, CacheToken>,
initializer: Initializer<K, V>,
token_filter: fn(&CacheToken) -> bool,
init_strategy: InitStrategy,
) -> Self {
Self {
name,
@@ -66,6 +121,8 @@ where
invalidator,
initializer,
token_filter,
version: Arc::new(AtomicUsize::new(0)),
init_strategy,
}
}
@@ -75,6 +132,67 @@ where
}
}
impl<K, V, CacheToken> CacheContainer<K, V, CacheToken> {
    /// Bumps the container's version counter by one.
    ///
    /// The counter is compared before and after a version-checked load to
    /// detect that an invalidation happened while the value was being loaded.
    fn inc_version(&self) {
        let _previous = self.version.fetch_add(1, Ordering::Relaxed);
    }
}
async fn init<'a, K, V>(init: Initializer<K, V>, key: K, cache_name: &'a str) -> Result<V>
where
K: Send + Sync + 'a,
V: Send + 'a,
{
metrics::CACHE_CONTAINER_CACHE_MISS
.with_label_values(&[cache_name])
.inc();
let _timer = metrics::CACHE_CONTAINER_LOAD_CACHE
.with_label_values(&[cache_name])
.start_timer();
init(&key)
.await
.transpose()
.context(error::ValueNotExistSnafu)?
}
async fn init_with_retry<'a, K, V>(
init: Initializer<K, V>,
key: K,
mut backoff: impl Iterator<Item = Duration> + 'a,
version: Arc<AtomicUsize>,
cache_name: &'a str,
) -> Result<V>
where
K: Send + Sync + 'a,
V: Send + 'a,
{
let mut attempts = 1usize;
loop {
let pre_version = version.load(Ordering::Relaxed);
metrics::CACHE_CONTAINER_CACHE_MISS
.with_label_values(&[cache_name])
.inc();
let _timer = metrics::CACHE_CONTAINER_LOAD_CACHE
.with_label_values(&[cache_name])
.start_timer();
let value = init(&key)
.await
.transpose()
.context(error::ValueNotExistSnafu)??;
if pre_version == version.load(Ordering::Relaxed) {
return Ok(value);
}
if let Some(duration) = backoff.next() {
sleep(duration).await;
attempts += 1;
} else {
return error::GetLatestCacheRetryExceededSnafu { attempts }.fail();
}
}
}
#[async_trait::async_trait]
impl<K, V> CacheInvalidator for CacheContainer<K, V, CacheIdent>
where
@@ -82,14 +200,15 @@ where
V: Send + Sync,
{
async fn invalidate(&self, _ctx: &Context, caches: &[CacheIdent]) -> Result<()> {
let tasks = caches
let idents = caches
.iter()
.filter(|token| (self.token_filter)(token))
.map(|token| (self.invalidator)(&self.cache, token));
join_all(tasks)
.await
.into_iter()
.collect::<Result<Vec<_>>>()?;
.collect::<Vec<_>>();
if !idents.is_empty() {
self.inc_version();
(self.invalidator)(&self.cache, &idents).await?;
}
Ok(())
}
}
@@ -99,27 +218,39 @@ where
K: Copy + Hash + Eq + Send + Sync + 'static,
V: Clone + Send + Sync + 'static,
{
/// Returns a _clone_ of the value corresponding to the key.
/// Returns a value from cache for copyable keys.
///
/// With [InitStrategy::Unchecked], this method prioritizes latency and may
/// return stale/dirty value. With [InitStrategy::VersionChecked], this method
/// retries initialization on version change and avoids dirty returns.
pub async fn get(&self, key: K) -> Result<Option<V>> {
metrics::CACHE_CONTAINER_CACHE_GET
.with_label_values(&[&self.name])
.inc();
let moved_init = self.initializer.clone();
let moved_key = key;
let init = async move {
metrics::CACHE_CONTAINER_CACHE_MISS
.with_label_values(&[&self.name])
.inc();
let _timer = metrics::CACHE_CONTAINER_LOAD_CACHE
.with_label_values(&[&self.name])
.start_timer();
moved_init(&moved_key)
.await
.transpose()
.context(error::ValueNotExistSnafu)?
let result = match self.init_strategy {
InitStrategy::Unchecked => {
self.cache
.try_get_with(key, init(self.initializer.clone(), key, &self.name))
.await
}
InitStrategy::VersionChecked => {
self.cache
.try_get_with(
key,
init_with_retry(
self.initializer.clone(),
key,
latest_get_backoff(),
self.version.clone(),
&self.name,
),
)
.await
}
};
match self.cache.try_get_with(key, init).await {
match result {
Ok(value) => Ok(Some(value)),
Err(err) => match err.as_ref() {
Error::ValueNotExist { .. } => Ok(None),
@@ -136,14 +267,15 @@ where
{
/// Invalidates cache by [CacheToken].
pub async fn invalidate(&self, caches: &[CacheToken]) -> Result<()> {
let tasks = caches
let idents = caches
.iter()
.filter(|token| (self.token_filter)(token))
.map(|token| (self.invalidator)(&self.cache, token));
join_all(tasks)
.await
.into_iter()
.collect::<Result<Vec<_>>>()?;
.collect::<Vec<_>>();
if !idents.is_empty() {
self.inc_version();
(self.invalidator)(&self.cache, &idents).await?;
}
Ok(())
}
@@ -156,7 +288,11 @@ where
self.cache.contains_key(key)
}
/// Returns a _clone_ of the value corresponding to the key.
/// Returns a value from cache by key reference.
///
/// With [InitStrategy::Unchecked], this method prioritizes latency and may
/// return stale/dirty value. With [InitStrategy::VersionChecked], this method
/// retries initialization on version change and avoids dirty returns.
pub async fn get_by_ref<Q>(&self, key: &Q) -> Result<Option<V>>
where
K: Borrow<Q>,
@@ -165,24 +301,32 @@ where
metrics::CACHE_CONTAINER_CACHE_GET
.with_label_values(&[&self.name])
.inc();
let moved_init = self.initializer.clone();
let moved_key = key.to_owned();
let init = async move {
metrics::CACHE_CONTAINER_CACHE_MISS
.with_label_values(&[&self.name])
.inc();
let _timer = metrics::CACHE_CONTAINER_LOAD_CACHE
.with_label_values(&[&self.name])
.start_timer();
moved_init(&moved_key)
.await
.transpose()
.context(error::ValueNotExistSnafu)?
let result = match self.init_strategy {
InitStrategy::Unchecked => {
self.cache
.try_get_with_by_ref(
key,
init(self.initializer.clone(), key.to_owned(), &self.name),
)
.await
}
InitStrategy::VersionChecked => {
self.cache
.try_get_with_by_ref(
key,
init_with_retry(
self.initializer.clone(),
key.to_owned(),
latest_get_backoff(),
self.version.clone(),
&self.name,
),
)
.await
}
};
match self.cache.try_get_with_by_ref(key, init).await {
match result {
Ok(value) => Ok(Some(value)),
Err(err) => match err.as_ref() {
Error::ValueNotExist { .. } => Ok(None),
@@ -296,9 +440,11 @@ mod tests {
moved_counter.fetch_add(1, Ordering::Relaxed);
Box::pin(async { Ok(Some("hi".to_string())) })
});
let invalidator: Invalidator<String, String, String> = Box::new(|cache, key| {
let invalidator: Invalidator<String, String, String> = Box::new(|cache, keys| {
Box::pin(async move {
cache.invalidate(key).await;
for key in keys {
cache.invalidate(*key).await;
}
Ok(())
})
});
@@ -323,4 +469,46 @@ mod tests {
assert_eq!(value, "hi");
assert_eq!(counter.load(Ordering::Relaxed), 2);
}
// Checks that a `VersionChecked` container does not return a value whose load
// overlapped with an invalidation: the load is retried and the second value
// is returned.
#[tokio::test(flavor = "multi_thread")]
async fn test_get_by_ref_returns_fresh_value_after_invalidate() {
let cache: Cache<String, String> = CacheBuilder::new(128).build();
let counter = Arc::new(AtomicI32::new(0));
let moved_counter = counter.clone();
// Each initializer call takes 100 ms and returns "v{n}", where `n` is the
// 1-based number of calls made so far.
let init: Initializer<String, String> = Arc::new(move |_| {
let counter = moved_counter.clone();
Box::pin(async move {
let n = counter.fetch_add(1, Ordering::Relaxed) + 1;
sleep(Duration::from_millis(100)).await;
Ok(Some(format!("v{n}")))
})
});
// Invalidates every key in the batch, one after another.
let invalidator: Invalidator<String, String, String> = Box::new(|cache, keys| {
Box::pin(async move {
for key in keys {
cache.invalidate(*key).await;
}
Ok(())
})
});
let adv_cache = Arc::new(CacheContainer::with_strategy(
"test".to_string(),
cache,
invalidator,
init,
always_true_filter,
InitStrategy::VersionChecked,
));
// Start the load in the background, then invalidate after 50 ms. This is
// intended to land while the first 100 ms load is still in flight.
let moved_cache = adv_cache.clone();
let get_task = tokio::spawn(async move { moved_cache.get_by_ref("foo").await });
sleep(Duration::from_millis(50)).await;
adv_cache.invalidate(&["foo".to_string()]).await.unwrap();
// The first load ("v1") must be discarded, so the initializer runs exactly
// twice and the caller sees "v2".
let value = get_task.await.unwrap().unwrap().unwrap();
assert_eq!(value, "v2");
assert_eq!(counter.load(Ordering::Relaxed), 2);
}
}

View File

@@ -170,20 +170,22 @@ async fn handle_drop_flow(
fn invalidator<'a>(
cache: &'a Cache<TableId, FlownodeFlowSet>,
ident: &'a CacheIdent,
idents: &'a [&CacheIdent],
) -> BoxFuture<'a, Result<()>> {
Box::pin(async move {
match ident {
CacheIdent::CreateFlow(create_flow) => handle_create_flow(cache, create_flow).await,
CacheIdent::DropFlow(drop_flow) => handle_drop_flow(cache, drop_flow).await,
CacheIdent::FlowNodeAddressChange(node_id) => {
info!(
"Invalidate flow node cache for node_id in table_flownode: {}",
node_id
);
cache.invalidate_all();
for ident in idents {
match ident {
CacheIdent::CreateFlow(create_flow) => handle_create_flow(cache, create_flow).await,
CacheIdent::DropFlow(drop_flow) => handle_drop_flow(cache, drop_flow).await,
CacheIdent::FlowNodeAddressChange(node_id) => {
info!(
"Invalidate flow node cache for node_id in table_flownode: {}",
node_id
);
cache.invalidate_all();
}
_ => {}
}
_ => {}
}
Ok(())
})

View File

@@ -58,11 +58,13 @@ fn init_factory(schema_manager: SchemaManager) -> Initializer<SchemaName, Arc<Sc
fn invalidator<'a>(
cache: &'a Cache<SchemaName, Arc<SchemaNameValue>>,
ident: &'a CacheIdent,
idents: &'a [&CacheIdent],
) -> BoxFuture<'a, crate::error::Result<()>> {
Box::pin(async move {
if let CacheIdent::SchemaName(schema_name) = ident {
cache.invalidate(schema_name).await
for ident in idents {
if let CacheIdent::SchemaName(schema_name) = ident {
cache.invalidate(schema_name).await
}
}
Ok(())
})

View File

@@ -61,11 +61,13 @@ fn init_factory(table_info_manager: TableInfoManagerRef) -> Initializer<TableId,
fn invalidator<'a>(
cache: &'a Cache<TableId, Arc<TableInfo>>,
ident: &'a CacheIdent,
idents: &'a [&CacheIdent],
) -> BoxFuture<'a, Result<()>> {
Box::pin(async move {
if let CacheIdent::TableId(table_id) = ident {
cache.invalidate(table_id).await
for ident in idents {
if let CacheIdent::TableId(table_id) = ident {
cache.invalidate(table_id).await
}
}
Ok(())
})

View File

@@ -71,11 +71,13 @@ fn init_factory(table_name_manager: TableNameManagerRef) -> Initializer<TableNam
fn invalidator<'a>(
cache: &'a Cache<TableName, TableId>,
ident: &'a CacheIdent,
idents: &'a [&CacheIdent],
) -> BoxFuture<'a, Result<()>> {
Box::pin(async move {
if let CacheIdent::TableName(table_name) = ident {
cache.invalidate(table_name).await
for ident in idents {
if let CacheIdent::TableName(table_name) = ident {
cache.invalidate(table_name).await
}
}
Ok(())
})

View File

@@ -19,6 +19,7 @@ use moka::future::Cache;
use snafu::OptionExt;
use store_api::storage::TableId;
use crate::cache::container::InitStrategy;
use crate::cache::{CacheContainer, Initializer};
use crate::error;
use crate::error::Result;
@@ -65,7 +66,14 @@ pub fn new_table_route_cache(
let table_info_manager = Arc::new(TableRouteManager::new(kv_backend));
let init = init_factory(table_info_manager);
CacheContainer::new(name, cache, Box::new(invalidator), init, filter)
CacheContainer::with_strategy(
name,
cache,
Box::new(invalidator),
init,
filter,
InitStrategy::VersionChecked,
)
}
fn init_factory(
@@ -92,11 +100,13 @@ fn init_factory(
fn invalidator<'a>(
cache: &'a Cache<TableId, Arc<TableRoute>>,
ident: &'a CacheIdent,
idents: &'a [&CacheIdent],
) -> BoxFuture<'a, Result<()>> {
Box::pin(async move {
if let CacheIdent::TableId(table_id) = ident {
cache.invalidate(table_id).await
for ident in idents {
if let CacheIdent::TableId(table_id) = ident {
cache.invalidate(table_id).await
}
}
Ok(())
})

View File

@@ -65,7 +65,7 @@ fn init_factory(table_info_manager: TableInfoManager) -> Initializer<TableId, Ar
/// Never invalidates table id schema cache.
fn invalidator<'a>(
_cache: &'a Cache<TableId, Arc<SchemaName>>,
_ident: &'a CacheIdent,
_idents: &'a [&CacheIdent],
) -> BoxFuture<'a, error::Result<()>> {
Box::pin(std::future::ready(Ok(())))
}

View File

@@ -60,11 +60,13 @@ fn init_factory(view_info_manager: ViewInfoManagerRef) -> Initializer<TableId, A
fn invalidator<'a>(
cache: &'a Cache<TableId, Arc<ViewInfoValue>>,
ident: &'a CacheIdent,
idents: &'a [&CacheIdent],
) -> BoxFuture<'a, Result<()>> {
Box::pin(async move {
if let CacheIdent::TableId(view_id) = ident {
cache.invalidate(view_id).await
for ident in idents {
if let CacheIdent::TableId(view_id) = ident {
cache.invalidate(view_id).await
}
}
Ok(())
})

View File

@@ -21,15 +21,85 @@ use std::sync::Arc;
use std::sync::atomic::{AtomicBool, Ordering};
use common_telemetry::{error, info, warn};
use serde::{Deserialize, Serialize};
use tokio::sync::broadcast::error::RecvError;
use tokio::sync::broadcast::{self, Receiver, Sender};
use crate::error::Result;
use crate::metasrv::MetasrvNodeInfo;
pub(crate) const CANDIDATE_LEASE_SECS: u64 = 600;
pub const CANDIDATE_LEASE_SECS: u64 = 600;
const KEEP_ALIVE_INTERVAL_SECS: u64 = CANDIDATE_LEASE_SECS / 2;
/// Value stored for the elected leader; it carries the leader's address.
pub struct LeaderValue(pub String);

impl<T: AsRef<[u8]>> From<T> for LeaderValue {
    /// Builds a [`LeaderValue`] from raw bytes, replacing invalid UTF-8
    /// sequences with U+FFFD.
    fn from(value: T) -> Self {
        let bytes = value.as_ref();
        let decoded = String::from_utf8_lossy(bytes);
        Self(decoded.into_owned())
    }
}
/// Information describing a metasrv node.
///
/// Fields marked `#[serde(default)]` fall back to the type's default value
/// when they are missing during deserialization.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct MetasrvNodeInfo {
/// The metasrv's address.
pub addr: String,
/// The node's build version.
pub version: String,
/// The git commit hash the node was built from.
pub git_commit: String,
/// The node's start timestamp in milliseconds.
pub start_time_ms: u64,
/// The node's total CPU in millicores.
#[serde(default)]
pub total_cpu_millicores: i64,
/// The node's total memory in bytes.
#[serde(default)]
pub total_memory_bytes: i64,
/// The node's CPU usage in millicores.
#[serde(default)]
pub cpu_usage_millicores: i64,
/// The node's memory usage in bytes.
#[serde(default)]
pub memory_usage_bytes: i64,
/// The node's hostname.
#[serde(default)]
pub hostname: String,
}
// TODO(zyy17): Allow deprecated fields for backward compatibility. Remove this when the deprecated top-level fields are removed from the proto.
#[allow(deprecated)]
impl From<MetasrvNodeInfo> for api::v1::meta::MetasrvNodeInfo {
    /// Converts the node info into its protobuf representation.
    ///
    /// The deprecated top-level fields and the canonical `info` message are
    /// both filled from the same source values. The unsigned `cpus` and
    /// `memory_bytes` fields are derived from signed totals; out-of-range
    /// values are clamped to the target range instead of wrapping.
    fn from(node_info: MetasrvNodeInfo) -> Self {
        // Clamp before casting so a negative or oversized total cannot wrap
        // into a bogus value; after the clamp the `as` casts are lossless.
        let cpus = node_info
            .total_cpu_millicores
            .clamp(0, i64::from(u32::MAX)) as u32;
        let memory_bytes = node_info.total_memory_bytes.max(0) as u64;

        Self {
            peer: Some(api::v1::meta::Peer {
                addr: node_info.addr,
                ..Default::default()
            }),
            // TODO(zyy17): The following top-level fields are deprecated. They are kept for backward compatibility and will be removed in a future version.
            // New code should use the fields in `info.NodeInfo` instead.
            version: node_info.version.clone(),
            git_commit: node_info.git_commit.clone(),
            start_time_ms: node_info.start_time_ms,
            cpus,
            memory_bytes,
            // The canonical location for node information.
            info: Some(api::v1::meta::NodeInfo {
                version: node_info.version,
                git_commit: node_info.git_commit,
                start_time_ms: node_info.start_time_ms,
                total_cpu_millicores: node_info.total_cpu_millicores,
                total_memory_bytes: node_info.total_memory_bytes,
                cpu_usage_millicores: node_info.cpu_usage_millicores,
                memory_usage_bytes: node_info.memory_usage_bytes,
                cpus,
                memory_bytes,
                hostname: node_info.hostname,
            }),
        }
    }
}
/// Messages sent when the leader changes.
#[derive(Debug, Clone)]
pub enum LeaderChangeMessage {
@@ -168,3 +238,5 @@ pub trait Election: Send + Sync {
fn subscribe_leader_change(&self) -> Receiver<LeaderChangeMessage>;
}
pub type ElectionRef = Arc<dyn Election<Leader = LeaderValue>>;

View File

@@ -16,8 +16,6 @@ use std::sync::Arc;
use std::sync::atomic::{AtomicBool, Ordering};
use std::time::Duration;
use common_meta::distributed_time_constants::{META_KEEP_ALIVE_INTERVAL_SECS, META_LEASE_SECS};
use common_meta::key::{CANDIDATES_ROOT, ELECTION_KEY};
use common_telemetry::{error, info, warn};
use etcd_client::{
Client, GetOptions, LeaderKey as EtcdLeaderKey, LeaseKeepAliveStream, LeaseKeeper, PutOptions,
@@ -27,13 +25,15 @@ use tokio::sync::broadcast;
use tokio::sync::broadcast::Receiver;
use tokio::time::{MissedTickBehavior, timeout};
use crate::distributed_time_constants::{META_KEEP_ALIVE_INTERVAL_SECS, META_LEASE_SECS};
use crate::election::{
CANDIDATE_LEASE_SECS, Election, KEEP_ALIVE_INTERVAL_SECS, LeaderChangeMessage, LeaderKey,
listen_leader_change, send_leader_change_and_set_flags,
CANDIDATE_LEASE_SECS, Election, ElectionRef, KEEP_ALIVE_INTERVAL_SECS, LeaderChangeMessage,
LeaderKey, LeaderValue, MetasrvNodeInfo, listen_leader_change,
send_leader_change_and_set_flags,
};
use crate::error;
use crate::error::Result;
use crate::metasrv::{ElectionRef, LeaderValue, MetasrvNodeInfo};
use crate::key::{CANDIDATES_ROOT, ELECTION_KEY};
impl LeaderKey for EtcdLeaderKey {
fn name(&self) -> &[u8] {
@@ -253,7 +253,7 @@ impl Election for EtcdElection {
.leader(self.election_key())
.await
.context(error::EtcdFailedSnafu)?;
let leader_value = res.kv().context(error::NoLeaderSnafu)?.value();
let leader_value = res.kv().context(error::ElectionNoLeaderSnafu)?.value();
Ok(leader_value.into())
}
}
@@ -279,7 +279,7 @@ impl EtcdElection {
ensure!(
res.ttl() > 0,
error::UnexpectedSnafu {
violated: "Failed to refresh the lease",
err_msg: "Failed to refresh the lease".to_string(),
}
);

View File

@@ -36,7 +36,7 @@ fn parse_value_and_expire_time(value: &str) -> Result<(String, Timestamp)> {
.split(LEASE_SEP)
.collect_tuple()
.with_context(|| UnexpectedSnafu {
violated: format!(
err_msg: format!(
"Invalid value {}, expect node info || {} || expire time",
value, LEASE_SEP
),
@@ -45,7 +45,7 @@ fn parse_value_and_expire_time(value: &str) -> Result<(String, Timestamp)> {
let expire_time = match Timestamp::from_str(expire_time, None) {
Ok(ts) => ts,
Err(_) => UnexpectedSnafu {
violated: format!("Invalid timestamp: {}", expire_time),
err_msg: format!("Invalid timestamp: {}", expire_time),
}
.fail()?,
};

View File

@@ -16,7 +16,6 @@ use std::sync::Arc;
use std::sync::atomic::{AtomicBool, Ordering};
use std::time::Duration;
use common_meta::key::{CANDIDATES_ROOT, ELECTION_KEY};
use common_telemetry::{error, info, warn};
use common_time::Timestamp;
use snafu::{OptionExt, ResultExt, ensure};
@@ -29,14 +28,15 @@ use tokio::time::MissedTickBehavior;
use crate::election::rds::{LEASE_SEP, Lease, RdsLeaderKey, parse_value_and_expire_time};
use crate::election::{
Election, LeaderChangeMessage, listen_leader_change, send_leader_change_and_set_flags,
Election, ElectionRef, LeaderChangeMessage, LeaderValue, MetasrvNodeInfo, listen_leader_change,
send_leader_change_and_set_flags,
};
use crate::error::{
AcquireMySqlClientSnafu, DecodeSqlValueSnafu, DeserializeFromJsonSnafu,
LeaderLeaseChangedSnafu, LeaderLeaseExpiredSnafu, MySqlExecutionSnafu, NoLeaderSnafu, Result,
SerializeToJsonSnafu, SqlExecutionTimeoutSnafu, UnexpectedSnafu,
ElectionLeaderLeaseChangedSnafu, ElectionLeaderLeaseExpiredSnafu, ElectionNoLeaderSnafu,
MySqlExecutionSnafu, Result, SerializeToJsonSnafu, SqlExecutionTimeoutSnafu, UnexpectedSnafu,
};
use crate::metasrv::{ElectionRef, LeaderValue, MetasrvNodeInfo};
use crate::key::{CANDIDATES_ROOT, ELECTION_KEY};
struct ElectionSqlFactory<'a> {
table_name: &'a str,
@@ -592,7 +592,7 @@ impl Election for MySqlElection {
ensure!(
lease.expire_time > lease.current,
UnexpectedSnafu {
violated: format!(
err_msg: format!(
"Candidate lease expired at {:?} (current time: {:?}), key: {:?}",
lease.expire_time,
lease.current,
@@ -667,10 +667,10 @@ impl Election for MySqlElection {
let client = self.client.lock().await;
let mut executor = Executor::Default(client);
if let Some(lease) = self.get_value_with_lease(&key, &mut executor).await? {
ensure!(lease.expire_time > lease.current, NoLeaderSnafu);
ensure!(lease.expire_time > lease.current, ElectionNoLeaderSnafu);
Ok(lease.leader_value.as_bytes().into())
} else {
NoLeaderSnafu.fail()
ElectionNoLeaderSnafu.fail()
}
}
}
@@ -705,7 +705,7 @@ impl MySqlElection {
let current_time = match Timestamp::from_str(&current_time_str, None) {
Ok(ts) => ts,
Err(_) => UnexpectedSnafu {
violated: format!("Invalid timestamp: {}", current_time_str),
err_msg: format!("Invalid timestamp: {}", current_time_str),
}
.fail()?,
};
@@ -740,7 +740,7 @@ impl MySqlElection {
current = match Timestamp::from_str(current_time_str, None) {
Ok(ts) => ts,
Err(_) => UnexpectedSnafu {
violated: format!("Invalid timestamp: {}", current_time_str),
err_msg: format!("Invalid timestamp: {}", current_time_str),
}
.fail()?,
};
@@ -777,7 +777,7 @@ impl MySqlElection {
ensure!(
res == 1,
UnexpectedSnafu {
violated: format!("Failed to update key: {}", String::from_utf8_lossy(key)),
err_msg: format!("Failed to update key: {}", String::from_utf8_lossy(key)),
}
);
@@ -920,9 +920,12 @@ impl MySqlElection {
/// will be released.
/// - **Case 2**: If all checks pass, the function returns without performing any actions.
fn lease_check(&self, lease: &Option<Lease>) -> Result<Lease> {
let lease = lease.as_ref().context(NoLeaderSnafu)?;
let lease = lease.as_ref().context(ElectionNoLeaderSnafu)?;
// Case 1: Lease expired
ensure!(lease.expire_time > lease.current, LeaderLeaseExpiredSnafu);
ensure!(
lease.expire_time > lease.current,
ElectionLeaderLeaseExpiredSnafu
);
// Case 2: Everything is fine
Ok(lease.clone())
}
@@ -960,7 +963,7 @@ impl MySqlElection {
let remote_lease = self.get_value_with_lease(&key, &mut executor).await?;
ensure!(
expected_lease.map(|lease| lease.origin) == remote_lease.map(|lease| lease.origin),
LeaderLeaseChangedSnafu
ElectionLeaderLeaseChangedSnafu
);
self.delete_value(&key, &mut executor).await?;
self.put_value_with_lease(
@@ -986,12 +989,11 @@ impl MySqlElection {
mod tests {
use std::{assert_matches, env};
use common_meta::maybe_skip_mysql_integration_test;
use common_telemetry::init_default_ut_logging;
use sqlx::MySqlPool;
use super::*;
use crate::error;
use crate::utils::mysql::create_mysql_pool;
use crate::{error, maybe_skip_mysql_integration_test};
async fn create_mysql_client(
table_name: Option<&str>,
@@ -1002,11 +1004,11 @@ mod tests {
let endpoint = env::var("GT_MYSQL_ENDPOINTS").unwrap_or_default();
if endpoint.is_empty() {
return UnexpectedSnafu {
violated: "MySQL endpoint is empty".to_string(),
err_msg: "MySQL endpoint is empty".to_string(),
}
.fail();
}
let pool = create_mysql_pool(&[endpoint], None).await.unwrap();
let pool = MySqlPool::connect(&endpoint).await.unwrap();
let mut client = ElectionMysqlClient::new(
pool,
execution_timeout,
@@ -1301,7 +1303,7 @@ mod tests {
let err = elected(&leader_mysql_election, table_name, Some(incorrect_lease))
.await
.unwrap_err();
assert_matches!(err, error::Error::LeaderLeaseChanged { .. });
assert_matches!(err, error::Error::ElectionLeaderLeaseChanged { .. });
let lease = get_lease(&leader_mysql_election).await;
assert!(lease.is_none());
drop_table(&leader_mysql_election.client, table_name).await;

View File

@@ -16,7 +16,6 @@ use std::sync::Arc;
use std::sync::atomic::{AtomicBool, Ordering};
use std::time::Duration;
use common_meta::key::{CANDIDATES_ROOT, ELECTION_KEY};
use common_telemetry::{error, info, warn};
use common_time::Timestamp;
use deadpool_postgres::{Manager, Pool};
@@ -28,13 +27,15 @@ use tokio_postgres::types::ToSql;
use crate::election::rds::{LEASE_SEP, Lease, RdsLeaderKey, parse_value_and_expire_time};
use crate::election::{
Election, LeaderChangeMessage, listen_leader_change, send_leader_change_and_set_flags,
Election, ElectionRef, LeaderChangeMessage, LeaderValue, MetasrvNodeInfo, listen_leader_change,
send_leader_change_and_set_flags,
};
use crate::error::{
DeserializeFromJsonSnafu, GetPostgresClientSnafu, NoLeaderSnafu, PostgresExecutionSnafu,
Result, SerializeToJsonSnafu, SqlExecutionTimeoutSnafu, UnexpectedSnafu,
DeserializeFromJsonSnafu, ElectionNoLeaderSnafu, GetPostgresClientSnafu,
PostgresExecutionSnafu, Result, SerializeToJsonSnafu, SqlExecutionTimeoutSnafu,
UnexpectedSnafu,
};
use crate::metasrv::{ElectionRef, LeaderValue, MetasrvNodeInfo};
use crate::key::{CANDIDATES_ROOT, ELECTION_KEY};
struct ElectionSqlFactory<'a> {
lock_id: u64,
@@ -404,13 +405,13 @@ impl Election for PgElection {
.get_value_with_lease(&key)
.await?
.context(UnexpectedSnafu {
violated: format!("Failed to get lease for key: {:?}", key),
err_msg: format!("Failed to get lease for key: {:?}", key),
})?;
ensure!(
lease.expire_time > lease.current,
UnexpectedSnafu {
violated: format!(
err_msg: format!(
"Candidate lease expired at {:?} (current time {:?}), key: {:?}",
lease.expire_time, lease.current, key
),
@@ -464,11 +465,11 @@ impl Election for PgElection {
.query(&self.sql_set.campaign, &[])
.await?;
let row = res.first().context(UnexpectedSnafu {
violated: "Failed to get the result of acquiring advisory lock",
err_msg: "Failed to get the result of acquiring advisory lock".to_string(),
})?;
let is_leader = row.try_get(0).map_err(|_| {
UnexpectedSnafu {
violated: "Failed to get the result of get lock",
err_msg: "Failed to get the result of get lock".to_string(),
}
.build()
})?;
@@ -500,10 +501,10 @@ impl Election for PgElection {
} else {
let key = self.election_key();
if let Some(lease) = self.get_value_with_lease(&key).await? {
ensure!(lease.expire_time > lease.current, NoLeaderSnafu);
ensure!(lease.expire_time > lease.current, ElectionNoLeaderSnafu);
Ok(lease.leader_value.as_bytes().into())
} else {
NoLeaderSnafu.fail()
ElectionNoLeaderSnafu.fail()
}
}
}
@@ -537,7 +538,7 @@ impl PgElection {
let current_time = match Timestamp::from_str(current_time_str, None) {
Ok(ts) => ts,
Err(_) => UnexpectedSnafu {
violated: format!("Invalid timestamp: {}", current_time_str),
err_msg: format!("Invalid timestamp: {}", current_time_str),
}
.fail()?,
};
@@ -576,7 +577,7 @@ impl PgElection {
current = match Timestamp::from_str(current_time_str, None) {
Ok(ts) => ts,
Err(_) => UnexpectedSnafu {
violated: format!("Invalid timestamp: {}", current_time_str),
err_msg: format!("Invalid timestamp: {}", current_time_str),
}
.fail()?,
};
@@ -613,7 +614,7 @@ impl PgElection {
ensure!(
res == 1,
UnexpectedSnafu {
violated: format!("Failed to update key: {}", String::from_utf8_lossy(key)),
err_msg: format!("Failed to update key: {}", String::from_utf8_lossy(key)),
}
);
@@ -742,9 +743,9 @@ impl PgElection {
let lease = self
.get_value_with_lease(&key)
.await?
.context(NoLeaderSnafu)?;
.context(ElectionNoLeaderSnafu)?;
// Case 2
ensure!(lease.expire_time > lease.current, NoLeaderSnafu);
ensure!(lease.expire_time > lease.current, ElectionNoLeaderSnafu);
// Case 3
Ok(())
}
@@ -830,11 +831,11 @@ impl PgElection {
mod tests {
use std::{assert_matches, env};
use common_meta::maybe_skip_postgres_integration_test;
use deadpool_postgres::{Config, Runtime};
use tokio_postgres::NoTls;
use super::*;
use crate::error;
use crate::utils::postgres::create_postgres_pool;
use crate::{error, maybe_skip_postgres_integration_test};
async fn create_postgres_client(
table_name: Option<&str>,
@@ -845,11 +846,13 @@ mod tests {
let endpoint = env::var("GT_POSTGRES_ENDPOINTS").unwrap_or_default();
if endpoint.is_empty() {
return UnexpectedSnafu {
violated: "Postgres endpoint is empty".to_string(),
err_msg: "Postgres endpoint is empty".to_string(),
}
.fail();
}
let pool = create_postgres_pool(&[endpoint], None, None).await.unwrap();
let mut cfg = Config::new();
cfg.url = Some(endpoint);
let pool = cfg.create_pool(Some(Runtime::Tokio1), NoTls).unwrap();
let mut pg_client = ElectionPgClient::new(
pool,
execution_timeout,

View File

@@ -338,6 +338,24 @@ pub enum Error {
location: Location,
},
#[snafu(display("Metasrv election has no leader at this moment"))]
ElectionNoLeader {
#[snafu(implicit)]
location: Location,
},
#[snafu(display("Metasrv election leader lease expired"))]
ElectionLeaderLeaseExpired {
#[snafu(implicit)]
location: Location,
},
#[snafu(display("Metasrv election leader lease changed during election"))]
ElectionLeaderLeaseChanged {
#[snafu(implicit)]
location: Location,
},
#[snafu(display("Table already exists, table: {}", table_name))]
TableAlreadyExists {
table_name: String,
@@ -714,6 +732,16 @@ pub enum Error {
#[snafu(display("Failed to get cache"))]
GetCache { source: Arc<Error> },
#[snafu(display(
"Failed to get latest cache value after {} attempts due to concurrent invalidation",
attempts
))]
GetLatestCacheRetryExceeded {
attempts: usize,
#[snafu(implicit)]
location: Location,
},
#[cfg(feature = "pg_kvbackend")]
#[snafu(display("Failed to execute via Postgres, sql: {}", sql))]
PostgresExecution {
@@ -741,6 +769,15 @@ pub enum Error {
location: Location,
},
#[cfg(feature = "pg_kvbackend")]
#[snafu(display("Failed to get Postgres client"))]
GetPostgresClient {
#[snafu(source)]
error: deadpool::managed::PoolError<tokio_postgres::Error>,
#[snafu(implicit)]
location: Location,
},
#[cfg(feature = "pg_kvbackend")]
#[snafu(display("Failed to {} Postgres transaction", operation))]
PostgresTransaction {
@@ -795,6 +832,24 @@ pub enum Error {
location: Location,
},
#[cfg(feature = "mysql_kvbackend")]
#[snafu(display("Failed to decode sql value"))]
DecodeSqlValue {
#[snafu(source)]
error: sqlx::error::Error,
#[snafu(implicit)]
location: Location,
},
#[cfg(feature = "mysql_kvbackend")]
#[snafu(display("Failed to acquire mysql client from pool"))]
AcquireMySqlClient {
#[snafu(source)]
error: sqlx::Error,
#[snafu(implicit)]
location: Location,
},
#[cfg(feature = "mysql_kvbackend")]
#[snafu(display("Failed to {} MySql transaction", operation))]
MySqlTransaction {
@@ -812,6 +867,15 @@ pub enum Error {
location: Location,
},
#[cfg(any(feature = "pg_kvbackend", feature = "mysql_kvbackend"))]
#[snafu(display("Sql execution timeout, sql: {}, duration: {:?}", sql, duration))]
SqlExecutionTimeout {
sql: String,
duration: std::time::Duration,
#[snafu(implicit)]
location: Location,
},
#[snafu(display(
"Datanode table info not found, table id: {}, datanode id: {}",
table_id,
@@ -1063,8 +1127,12 @@ impl ErrorExt for Error {
| ConnectEtcd { .. }
| MoveValues { .. }
| GetCache { .. }
| GetLatestCacheRetryExceeded { .. }
| SerializeToJson { .. }
| DeserializeFromJson { .. } => StatusCode::Internal,
| DeserializeFromJson { .. }
| ElectionNoLeader { .. }
| ElectionLeaderLeaseExpired { .. }
| ElectionLeaderLeaseChanged { .. } => StatusCode::Internal,
NoLeader { .. } => StatusCode::TableUnavailable,
ValueNotExist { .. }
@@ -1187,15 +1255,18 @@ impl ErrorExt for Error {
PostgresExecution { .. }
| CreatePostgresPool { .. }
| GetPostgresConnection { .. }
| GetPostgresClient { .. }
| PostgresTransaction { .. }
| PostgresTlsConfig { .. }
| InvalidTlsConfig { .. } => StatusCode::Internal,
#[cfg(feature = "mysql_kvbackend")]
MySqlExecution { .. } | CreateMySqlPool { .. } | MySqlTransaction { .. } => {
StatusCode::Internal
}
MySqlExecution { .. }
| CreateMySqlPool { .. }
| DecodeSqlValue { .. }
| AcquireMySqlClient { .. }
| MySqlTransaction { .. } => StatusCode::Internal,
#[cfg(any(feature = "pg_kvbackend", feature = "mysql_kvbackend"))]
RdsTransactionRetryFailed { .. } => StatusCode::Internal,
RdsTransactionRetryFailed { .. } | SqlExecutionTimeout { .. } => StatusCode::Internal,
DatanodeTableInfoNotFound { .. } => StatusCode::Internal,
}
}
@@ -1243,7 +1314,10 @@ impl Error {
/// Determine whether it is a retry later type through [StatusCode]
pub fn is_retry_later(&self) -> bool {
matches!(self, Error::RetryLater { .. })
matches!(
self,
Error::RetryLater { .. } | Error::GetLatestCacheRetryExceeded { .. }
)
}
/// Determine whether it needs to clean poisons.

View File

@@ -19,6 +19,7 @@ pub mod datanode;
pub mod ddl;
pub mod ddl_manager;
pub mod distributed_time_constants;
pub mod election;
pub mod error;
pub mod flow_name;
pub mod heartbeat;

View File

@@ -27,7 +27,6 @@ use api::v1::{
use async_stream::try_stream;
use async_trait::async_trait;
use auth::{PermissionChecker, PermissionCheckerRef, PermissionReq};
use common_base::AffectedRows;
use common_error::ext::BoxedError;
use common_grpc::flight::do_put::DoPutResponse;
use common_query::Output;
@@ -260,62 +259,6 @@ impl GrpcQueryHandler for Instance {
.context(server_error::ExecuteGrpcQuerySnafu)
}
/// Handles a single bulk-insert (`PutRecordBatchRequest`) for one table.
///
/// The target table is taken from `table_ref` when it is already populated;
/// otherwise it is looked up through the catalog manager and stored back
/// into `table_ref` for the caller. The request then goes through the gRPC
/// query interceptor's `pre_bulk_insert` hook and the permission checker
/// (`PermissionReq::BulkInsert`) before being handed to the inserter.
///
/// Any failure is boxed and reported as `ExecuteGrpcRequest`.
async fn put_record_batch(
    &self,
    request: servers::grpc::flight::PutRecordBatchRequest,
    table_ref: &mut Option<TableRef>,
    ctx: QueryContextRef,
) -> server_error::Result<AffectedRows> {
    let outcome: Result<AffectedRows> = async {
        // Reuse the table handle supplied by the caller, or resolve it once
        // and hand it back through `table_ref`.
        let table = match table_ref.as_ref() {
            Some(cached) => cached.clone(),
            None => {
                let name = &request.table_name;
                let resolved = self
                    .catalog_manager()
                    .table(
                        &name.catalog_name,
                        &name.schema_name,
                        &name.table_name,
                        None,
                    )
                    .await
                    .context(CatalogSnafu)?
                    .with_context(|| TableNotFoundSnafu {
                        table_name: request.table_name.to_string(),
                    })?;
                *table_ref = Some(resolved.clone());
                resolved
            }
        };

        // The interceptor hook runs first, then the permission check.
        let interceptor_ref = self.plugins.get::<GrpcQueryInterceptorRef<Error>>();
        interceptor_ref
            .as_ref()
            .pre_bulk_insert(table.clone(), ctx.clone())?;

        let permission_checker = self.plugins.get::<PermissionCheckerRef>();
        permission_checker
            .as_ref()
            .check_permission(ctx.current_user(), PermissionReq::BulkInsert)
            .context(PermissionSnafu)?;

        // do we check limit for bulk insert?

        self.inserter
            .handle_bulk_insert(
                table,
                request.flight_data,
                request.record_batch,
                request.schema_bytes,
            )
            .await
            .context(TableOperationSnafu)
    }
    .await;

    outcome
        .map_err(BoxedError::new)
        .context(server_error::ExecuteGrpcRequestSnafu)
}
fn handle_put_record_batch_stream(
&self,
stream: servers::grpc::flight::PutRecordBatchRequestStream,

View File

@@ -24,6 +24,8 @@ use common_base::Plugins;
use common_config::Configurable;
#[cfg(any(feature = "pg_kvbackend", feature = "mysql_kvbackend"))]
use common_meta::distributed_time_constants::META_LEASE_SECS;
use common_meta::election::CANDIDATE_LEASE_SECS;
use common_meta::election::etcd::EtcdElection;
use common_meta::kv_backend::chroot::ChrootKvBackend;
use common_meta::kv_backend::etcd::EtcdStore;
use common_meta::kv_backend::memory::MemoryKvBackend;
@@ -42,9 +44,6 @@ use tonic::codec::CompressionEncoding;
use tonic::transport::server::{Router, TcpIncoming};
use crate::cluster::{MetaPeerClientBuilder, MetaPeerClientRef};
#[cfg(any(feature = "pg_kvbackend", feature = "mysql_kvbackend"))]
use crate::election::CANDIDATE_LEASE_SECS;
use crate::election::etcd::EtcdElection;
use crate::error::OtherSnafu;
use crate::metasrv::builder::MetasrvBuilder;
use crate::metasrv::{
@@ -281,7 +280,8 @@ pub async fn metasrv_builder(
etcd_client,
opts.store_key_prefix.clone(),
)
.await?;
.await
.context(error::KvBackendSnafu)?;
(kv_backend, Some(election))
}
@@ -290,10 +290,10 @@ pub async fn metasrv_builder(
use std::time::Duration;
use common_meta::distributed_time_constants::POSTGRES_KEEP_ALIVE_SECS;
use common_meta::election::rds::postgres::{ElectionPgClient, PgElection};
use common_meta::kv_backend::rds::PgStore;
use deadpool_postgres::{Config, ManagerConfig, RecyclingMethod};
use crate::election::rds::postgres::{ElectionPgClient, PgElection};
use crate::utils::postgres::create_postgres_pool;
let candidate_lease_ttl = Duration::from_secs(CANDIDATE_LEASE_SECS);
@@ -321,7 +321,8 @@ pub async fn metasrv_builder(
execution_timeout,
idle_session_timeout,
statement_timeout,
)?;
)
.context(error::KvBackendSnafu)?;
let election = PgElection::with_pg_client(
opts.grpc.server_addr.clone(),
election_client,
@@ -332,7 +333,8 @@ pub async fn metasrv_builder(
&opts.meta_table_name,
opts.meta_election_lock_id,
)
.await?;
.await
.context(error::KvBackendSnafu)?;
let pool = create_postgres_pool(&opts.store_addrs, Some(cfg), opts.backend_tls.clone())
.await?;
@@ -352,9 +354,9 @@ pub async fn metasrv_builder(
(None, BackendImpl::MysqlStore) => {
use std::time::Duration;
use common_meta::election::rds::mysql::{ElectionMysqlClient, MySqlElection};
use common_meta::kv_backend::rds::MySqlStore;
use crate::election::rds::mysql::{ElectionMysqlClient, MySqlElection};
use crate::utils::mysql::create_mysql_pool;
let pool = create_mysql_pool(&opts.store_addrs, opts.backend_tls.as_ref()).await?;
@@ -389,7 +391,8 @@ pub async fn metasrv_builder(
meta_lease_ttl,
&election_table_name,
)
.await?;
.await
.context(error::KvBackendSnafu)?;
(kv_backend, Some(election))
}
};

View File

@@ -247,7 +247,7 @@ impl MetaPeerClient {
// Safety: when self.is_leader() == false, election must not empty.
let election = self.election.as_ref().unwrap();
let leader_addr = election.leader().await?.0;
let leader_addr = election.leader().await.context(error::KvBackendSnafu)?.0;
let channel = self
.channel_manager
@@ -279,7 +279,7 @@ impl MetaPeerClient {
// Safety: when self.is_leader() == false, election must not empty.
let election = self.election.as_ref().unwrap();
let leader_addr = election.leader().await?.0;
let leader_addr = election.leader().await.context(error::KvBackendSnafu)?.0;
let channel = self
.channel_manager

View File

@@ -19,7 +19,6 @@ pub mod bootstrap;
pub mod cache_invalidator;
pub mod cluster;
pub mod discovery;
pub mod election;
pub mod error;
pub mod events;
mod failure_detector;

View File

@@ -32,6 +32,8 @@ use common_meta::ddl_manager::DdlManagerRef;
use common_meta::distributed_time_constants::{
self, BASE_HEARTBEAT_INTERVAL, default_distributed_time_constants, frontend_heartbeat_interval,
};
use common_meta::election::LeaderChangeMessage;
pub use common_meta::election::{ElectionRef, MetasrvNodeInfo};
use common_meta::key::TableMetadataManagerRef;
use common_meta::key::runtime_switch::RuntimeSwitchManagerRef;
use common_meta::kv_backend::{KvBackendRef, ResettableKvBackend, ResettableKvBackendRef};
@@ -64,7 +66,6 @@ use tokio::sync::broadcast::error::RecvError;
use crate::cluster::MetaPeerClientRef;
use crate::discovery;
use crate::election::{Election, LeaderChangeMessage};
use crate::error::{
self, InitMetadataSnafu, KvBackendSnafu, Result, StartProcedureManagerSnafu,
StartTelemetryTaskSnafu, StopProcedureManagerSnafu,
@@ -459,76 +460,6 @@ impl Context {
}
}
/// The value of the leader. It is used to store the leader's address.
pub struct LeaderValue(pub String);

/// Builds a [`LeaderValue`] from anything viewable as bytes.
///
/// The bytes are decoded as UTF-8 lossily: invalid sequences are replaced
/// with U+FFFD instead of failing the conversion.
impl<T: AsRef<[u8]>> From<T> for LeaderValue {
    fn from(value: T) -> Self {
        // `into_owned` reuses the allocation when the lossy decoder already
        // produced an owned `String`, and copies only for a borrowed `Cow`.
        Self(String::from_utf8_lossy(value.as_ref()).into_owned())
    }
}
/// Node information of a metasrv instance, (de)serializable via serde.
///
/// Fields marked `#[serde(default)]` fall back to their type's default value
/// when they are missing from the serialized input.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct MetasrvNodeInfo {
/// The metasrv's address.
pub addr: String,
/// The node build version.
pub version: String,
/// The node build git commit hash.
pub git_commit: String,
/// The node start timestamp in milliseconds.
pub start_time_ms: u64,
/// The node's total CPU in millicores.
#[serde(default)]
pub total_cpu_millicores: i64,
/// The node's total memory in bytes.
#[serde(default)]
pub total_memory_bytes: i64,
/// The node's CPU usage in millicores.
#[serde(default)]
pub cpu_usage_millicores: i64,
/// The node's memory usage in bytes.
#[serde(default)]
pub memory_usage_bytes: i64,
/// The node hostname.
#[serde(default)]
pub hostname: String,
}
// TODO(zyy17): Allow deprecated fields for backward compatibility. Remove this when the deprecated top-level fields are removed from the proto.
#[allow(deprecated)]
impl From<MetasrvNodeInfo> for api::v1::meta::MetasrvNodeInfo {
fn from(node_info: MetasrvNodeInfo) -> Self {
Self {
peer: Some(api::v1::meta::Peer {
addr: node_info.addr,
..Default::default()
}),
// TODO(zyy17): The following top-level fields are deprecated. They are kept for backward compatibility and will be removed in a future version.
// New code should use the fields in `info.NodeInfo` instead.
version: node_info.version.clone(),
git_commit: node_info.git_commit.clone(),
start_time_ms: node_info.start_time_ms,
cpus: node_info.total_cpu_millicores as u32,
memory_bytes: node_info.total_memory_bytes as u64,
// The canonical location for node information.
info: Some(api::v1::meta::NodeInfo {
version: node_info.version,
git_commit: node_info.git_commit,
start_time_ms: node_info.start_time_ms,
total_cpu_millicores: node_info.total_cpu_millicores,
total_memory_bytes: node_info.total_memory_bytes,
cpu_usage_millicores: node_info.cpu_usage_millicores,
memory_usage_bytes: node_info.memory_usage_bytes,
cpus: node_info.total_cpu_millicores as u32,
memory_bytes: node_info.total_memory_bytes as u64,
hostname: node_info.hostname,
}),
}
}
}
#[derive(Clone, Copy)]
pub enum SelectTarget {
Datanode,
@@ -552,7 +483,6 @@ pub struct SelectorContext {
pub type SelectorRef = Arc<dyn Selector<Context = SelectorContext, Output = Vec<Peer>>>;
pub type RegionStatAwareSelectorRef =
Arc<dyn RegionStatAwareSelector<Context = SelectorContext, Output = Vec<(RegionId, Peer)>>>;
pub type ElectionRef = Arc<dyn Election<Leader = LeaderValue>>;
pub struct MetaStateHandler {
subscribe_manager: Option<SubscriptionManagerRef>,

View File

@@ -32,7 +32,7 @@ pub struct LeaderHandler {
impl LeaderHandler {
async fn get_leader(&self) -> Result<Option<String>> {
if let Some(election) = &self.election {
let leader_addr = election.leader().await?.0;
let leader_addr = election.leader().await.context(error::KvBackendSnafu)?.0;
return Ok(Some(leader_addr));
}
Ok(None)

View File

@@ -63,7 +63,10 @@ impl cluster_server::Cluster for Metasrv {
let leader_addr = &self.options().grpc.server_addr;
let (leader, followers) = match self.election() {
Some(election) => {
let nodes = election.all_candidates().await?;
let nodes = election
.all_candidates()
.await
.context(error::KvBackendSnafu)?;
let followers = nodes
.into_iter()
.filter(|node_info| &node_info.addr != leader_addr)

View File

@@ -23,7 +23,7 @@ use api::v1::meta::{
use common_telemetry::{debug, error, info, warn};
use futures::StreamExt;
use once_cell::sync::OnceCell;
use snafu::OptionExt;
use snafu::{OptionExt, ResultExt};
use tokio::sync::mpsc;
use tokio::sync::mpsc::Sender;
use tokio_stream::wrappers::ReceiverStream;
@@ -148,7 +148,7 @@ async fn handle_ask_leader(_req: AskLeaderRequest, ctx: Context) -> Result<AskLe
if election.is_leader() {
ctx.server_addr
} else {
election.leader().await?.0
election.leader().await.context(error::KvBackendSnafu)?.0
}
}
None => ctx.server_addr,

View File

@@ -108,6 +108,11 @@ name = "memtable_bench"
harness = false
required-features = ["test"]
[[bench]]
name = "bench_cache_stream"
harness = false
required-features = ["test"]
[[bench]]
name = "bench_filter_time_partition"
harness = false

View File

@@ -0,0 +1,126 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//! Benchmarks for `cache_flat_range_stream` overhead.
//!
//! Compares consuming batches from a plain stream vs through the caching wrapper
//! that clones batches for the range cache.
//!
//! Run with:
//! ```sh
//! cargo bench -p mito2 --features test --bench bench_cache_stream
//! ```
use std::collections::VecDeque;
use std::sync::Arc;
use criterion::{Criterion, criterion_group, criterion_main};
use futures::TryStreamExt;
use mito_codec::row_converter::DensePrimaryKeyCodec;
use mito2::memtable::bulk::context::BulkIterContext;
use mito2::memtable::bulk::part::{BulkPartConverter, BulkPartEncoder};
use mito2::memtable::bulk::part_reader::EncodedBulkPartIter;
use mito2::read::range_cache::bench_cache_flat_range_stream;
use mito2::sst::parquet::DEFAULT_ROW_GROUP_SIZE;
use mito2::sst::{FlatSchemaOptions, to_flat_sst_arrow_schema};
use mito2::test_util::bench_util::{CpuDataGenerator, cpu_metadata};
/// Benchmarks draining an encoded bulk part as a record batch stream, once
/// directly and once wrapped by `bench_cache_flat_range_stream`.
///
/// Both cases decode the same encoded part, so the difference between them
/// is the overhead added by the wrapper.
fn cache_flat_range_stream_bench(c: &mut Criterion) {
    let metadata = Arc::new(cpu_metadata());
    let region_id = metadata.region_id;

    // 2000 hosts × 51 steps = 102,000 rows ≈ DEFAULT_ROW_GROUP_SIZE
    let num_hosts = 2000;
    let start_sec = 1710043200;
    let end_sec = start_sec + 510;
    let generator = CpuDataGenerator::new(metadata.clone(), num_hosts, start_sec, end_sec);

    // Convert all generated rows into a single BulkPart.
    let mut converter = BulkPartConverter::new(
        &metadata,
        to_flat_sst_arrow_schema(&metadata, &FlatSchemaOptions::default()),
        DEFAULT_ROW_GROUP_SIZE,
        Arc::new(DensePrimaryKeyCodec::new(&metadata)),
        true, // store_pk_columns
    );
    generator.iter().for_each(|kvs| {
        converter.append_key_values(&kvs).unwrap();
    });
    let bulk_part = converter.convert().unwrap();

    // Encode the part to parquet.
    let encoded_part = BulkPartEncoder::new(metadata.clone(), DEFAULT_ROW_GROUP_SIZE)
        .unwrap()
        .encode_part(&bulk_part)
        .unwrap()
        .unwrap();

    // Read every row group, with no projection and no predicate.
    let num_row_groups = encoded_part.metadata().parquet_metadata.num_row_groups();
    let row_groups: VecDeque<usize> = (0..num_row_groups).collect();
    let context = Arc::new(
        BulkIterContext::new(
            metadata.clone(),
            None, // No projection
            None, // No predicate
            false,
        )
        .unwrap(),
    );

    // Opens a fresh stream over the encoded part; called once per iteration.
    let open_stream = || -> mito2::read::BoxedRecordBatchStream {
        let part_iter = EncodedBulkPartIter::try_new(
            &encoded_part,
            context.clone(),
            row_groups.clone(),
            None,
            None,
        )
        .unwrap();
        Box::pin(futures::stream::iter(part_iter))
    };

    let rt = tokio::runtime::Runtime::new().unwrap();

    let mut group = c.benchmark_group("cache_flat_range_stream");
    group.sample_size(10);

    group.bench_function("baseline_iter_stream", |b| {
        b.iter(|| {
            rt.block_on(async {
                let mut plain = open_stream();
                while let Some(_batch) = plain.try_next().await.unwrap() {}
            })
        });
    });

    group.bench_function("cache_flat_range_stream", |b| {
        b.iter(|| {
            rt.block_on(async {
                let mut wrapped =
                    bench_cache_flat_range_stream(open_stream(), 64 * 1024 * 1024, region_id);
                while let Some(_batch) = wrapped.try_next().await.unwrap() {}
            })
        });
    });
}
criterion_group!(benches, cache_flat_range_stream_bench);
criterion_main!(benches);

View File

@@ -12,15 +12,17 @@
// See the License for the specific language governing permissions and
// limitations under the License.
//! Benchmarks for memtable operations: writes, full scans, filtered scans,
//! bulk part conversion, record batch iteration with filters, and flat merge.
//!
//! Run with:
//! ```sh
//! cargo bench -p mito2 --features test --bench memtable_bench
//! ```
use std::sync::Arc;
use api::v1::value::ValueData;
use api::v1::{Row, Rows, SemanticType};
use criterion::{Criterion, criterion_group, criterion_main};
use datafusion_common::Column;
use datafusion_expr::{Expr, lit};
use datatypes::data_type::ConcreteDataType;
use datatypes::schema::ColumnSchema;
use mito_codec::row_converter::DensePrimaryKeyCodec;
use mito2::memtable::bulk::context::BulkIterContext;
use mito2::memtable::bulk::part::BulkPartConverter;
@@ -28,20 +30,13 @@ use mito2::memtable::bulk::part_reader::BulkPartBatchIter;
use mito2::memtable::bulk::{BulkMemtable, BulkMemtableConfig};
use mito2::memtable::partition_tree::{PartitionTreeConfig, PartitionTreeMemtable};
use mito2::memtable::time_series::TimeSeriesMemtable;
use mito2::memtable::{IterBuilder, KeyValues, Memtable, RangesOptions};
use mito2::memtable::{IterBuilder, Memtable, RangesOptions};
use mito2::read::flat_merge::FlatMergeIterator;
use mito2::read::scan_region::PredicateGroup;
use mito2::region::options::MergeMode;
use mito2::sst::{FlatSchemaOptions, to_flat_sst_arrow_schema};
use mito2::test_util::memtable_util::{self, region_metadata_to_row_schema};
use rand::Rng;
use rand::rngs::ThreadRng;
use rand::seq::IndexedRandom;
use store_api::metadata::{
ColumnMetadata, RegionMetadata, RegionMetadataBuilder, RegionMetadataRef,
};
use store_api::storage::RegionId;
use table::predicate::Predicate;
use mito2::test_util::bench_util::{CpuDataGenerator, cpu_metadata};
use mito2::test_util::memtable_util;
/// Writes rows.
fn write_rows(c: &mut Criterion) {
@@ -216,224 +211,6 @@ fn filter_1_host(c: &mut Criterion) {
});
}
/// Tag values describing one generated host.
struct Host {
    hostname: String,
    region: String,
    datacenter: String,
    rack: String,
    os: String,
    arch: String,
    team: String,
    service: String,
    service_version: String,
    service_environment: String,
}

impl Host {
    /// Creates a host named `host_{id}` whose region, datacenter, rack,
    /// service and service version are random; the other tags are fixed.
    fn random_with_id(id: usize) -> Host {
        let mut rng = rand::rng();
        // Draw the random values in a fixed order: region, datacenter
        // suffix, rack, service, service version.
        let region = format!("ap-southeast-{}", rng.random_range(0..10));
        let zone = ['a', 'b', 'c', 'd', 'e'].choose(&mut rng).unwrap();
        let datacenter = format!("{}{}", region, zone);
        let rack = rng.random_range(0..100).to_string();
        let service = rng.random_range(0..100).to_string();
        let service_version = rng.random_range(0..10).to_string();

        Host {
            hostname: format!("host_{id}"),
            region,
            datacenter,
            rack,
            os: "Ubuntu16.04LTS".to_string(),
            arch: "x86".to_string(),
            team: "CHI".to_string(),
            service,
            service_version,
            service_environment: "test".to_string(),
        }
    }

    /// Appends this host's ten tags to `values` as string values, in
    /// declaration order.
    fn fill_values(&self, values: &mut Vec<api::v1::Value>) {
        let tag_columns = [
            &self.hostname,
            &self.region,
            &self.datacenter,
            &self.rack,
            &self.os,
            &self.arch,
            &self.team,
            &self.service,
            &self.service_version,
            &self.service_environment,
        ];
        values.extend(tag_columns.iter().map(|tag| api::v1::Value {
            value_data: Some(ValueData::StringValue(String::clone(tag))),
        }));
    }
}
/// Generates cpu-like rows for a fixed set of hosts over a time range given
/// in seconds.
struct CpuDataGenerator {
    metadata: RegionMetadataRef,
    column_schemas: Vec<api::v1::ColumnSchema>,
    hosts: Vec<Host>,
    start_sec: i64,
    end_sec: i64,
}

impl CpuDataGenerator {
    /// Creates a generator with `num_hosts` random hosts covering
    /// `start_sec..end_sec`.
    fn new(metadata: RegionMetadataRef, num_hosts: usize, start_sec: i64, end_sec: i64) -> Self {
        let column_schemas = region_metadata_to_row_schema(&metadata);
        let hosts = Self::generate_hosts(num_hosts);
        Self {
            metadata,
            column_schemas,
            hosts,
            start_sec,
            end_sec,
        }
    }

    /// Yields one [`KeyValues`] per 10-second step; the step index is used
    /// as the mutation sequence.
    fn iter(&self) -> impl Iterator<Item = KeyValues> + '_ {
        // point per 10s.
        let timestamps = (self.start_sec..self.end_sec).step_by(10);
        timestamps
            .enumerate()
            .map(|(seq, ts)| self.build_key_values(seq, ts))
    }

    /// Builds a put mutation with one row per host at `current_sec`.
    ///
    /// Each row holds the timestamp in milliseconds, the host's tags and
    /// ten random f64 values.
    fn build_key_values(&self, seq: usize, current_sec: i64) -> KeyValues {
        let mut rows = Vec::with_capacity(self.hosts.len());
        for host in &self.hosts {
            let mut rng = rand::rng();
            let mut values = Vec::with_capacity(21);
            values.push(api::v1::Value {
                value_data: Some(ValueData::TimestampMillisecondValue(current_sec * 1000)),
            });
            host.fill_values(&mut values);
            values.extend((0..10).map(|_| api::v1::Value {
                value_data: Some(ValueData::F64Value(Self::random_f64(&mut rng))),
            }));
            rows.push(Row { values });
        }

        let mutation = api::v1::Mutation {
            op_type: api::v1::OpType::Put as i32,
            sequence: seq as u64,
            rows: Some(Rows {
                schema: self.column_schemas.clone(),
                rows,
            }),
            write_hint: None,
        };
        KeyValues::new(&self.metadata, mutation).unwrap()
    }

    /// Returns a predicate `hostname = <random host>`.
    fn random_host_filter(&self) -> Predicate {
        Predicate::new(self.random_host_filter_exprs())
    }

    /// Returns the expression list `[hostname = <random host>]`.
    fn random_host_filter_exprs(&self) -> Vec<Expr> {
        let hostname = self.random_hostname();
        let hostname_column = Expr::Column(Column::from_name("hostname"));
        vec![hostname_column.eq(lit(hostname))]
    }

    /// Picks the hostname of a random generated host.
    fn random_hostname(&self) -> String {
        let mut rng = rand::rng();
        let picked = self.hosts.choose(&mut rng).unwrap();
        picked.hostname.clone()
    }

    /// Returns a random whole number in `30..95` as `f64`.
    fn random_f64(rng: &mut ThreadRng) -> f64 {
        let whole: u32 = rng.random_range(30..95);
        f64::from(whole)
    }

    /// Creates `num_hosts` random hosts with ids `0..num_hosts`.
    fn generate_hosts(num_hosts: usize) -> Vec<Host> {
        let mut hosts = Vec::with_capacity(num_hosts);
        hosts.extend((0..num_hosts).map(Host::random_with_id));
        hosts
    }
}
/// Creates a metadata for TSBS cpu-like table.
///
/// Column 0 is the millisecond timestamp `ts`, columns 1-10 are nullable
/// string tags that also form the primary key, and columns 11-20 are
/// nullable float64 fields.
fn cpu_metadata() -> RegionMetadata {
    const TAGS: [&str; 10] = [
        "hostname",
        "region",
        "datacenter",
        "rack",
        "os",
        "arch",
        "team",
        "service",
        "service_version",
        "service_environment",
    ];
    const FIELDS: [&str; 10] = [
        "usage_user",
        "usage_system",
        "usage_idle",
        "usage_nice",
        "usage_iowait",
        "usage_irq",
        "usage_softirq",
        "usage_steal",
        "usage_guest",
        "usage_guest_nice",
    ];

    let mut builder = RegionMetadataBuilder::new(RegionId::new(1, 1));
    builder.push_column_metadata(ColumnMetadata {
        column_schema: ColumnSchema::new(
            "ts",
            ConcreteDataType::timestamp_millisecond_datatype(),
            false,
        ),
        semantic_type: SemanticType::Timestamp,
        column_id: 0,
    });

    // Tags come first, then fields; ids are assigned consecutively from 1.
    let tag_columns = TAGS.iter().map(|name| {
        (
            *name,
            ConcreteDataType::string_datatype(),
            SemanticType::Tag,
        )
    });
    let field_columns = FIELDS.iter().map(|name| {
        (
            *name,
            ConcreteDataType::float64_datatype(),
            SemanticType::Field,
        )
    });
    for (column_id, (name, data_type, semantic_type)) in
        (1..).zip(tag_columns.chain(field_columns))
    {
        builder.push_column_metadata(ColumnMetadata {
            column_schema: ColumnSchema::new(name, data_type, true),
            semantic_type,
            column_id,
        });
    }

    builder.primary_key(vec![1, 2, 3, 4, 5, 6, 7, 8, 9, 10]);
    builder.build().unwrap()
}
fn bulk_part_converter(c: &mut Criterion) {
let metadata = Arc::new(cpu_metadata());
let start_sec = 1710043200;

View File

@@ -350,7 +350,7 @@ impl CacheStrategy {
/// Calls [CacheManager::get_range_result()].
/// It returns None if the strategy is [CacheStrategy::Compaction] or [CacheStrategy::Disabled].
#[cfg_attr(not(test), allow(dead_code))]
#[allow(dead_code)]
pub(crate) fn get_range_result(
&self,
key: &RangeScanCacheKey,
@@ -363,7 +363,6 @@ impl CacheStrategy {
/// Calls [CacheManager::put_range_result()].
/// It does nothing if the strategy isn't [CacheStrategy::EnableAll].
#[cfg_attr(not(test), allow(dead_code))]
pub(crate) fn put_range_result(
&self,
key: RangeScanCacheKey,
@@ -476,7 +475,6 @@ pub struct CacheManager {
/// Cache for time series selectors.
selector_result_cache: Option<SelectorResultCache>,
/// Cache for range scan outputs in flat format.
#[cfg_attr(not(test), allow(dead_code))]
range_result_cache: Option<RangeResultCache>,
/// Cache for index result.
index_result_cache: Option<IndexResultCache>,
@@ -713,7 +711,7 @@ impl CacheManager {
}
/// Gets cached result for range scan.
#[cfg_attr(not(test), allow(dead_code))]
#[allow(dead_code)]
pub(crate) fn get_range_result(
&self,
key: &RangeScanCacheKey,
@@ -723,8 +721,7 @@ impl CacheManager {
.and_then(|cache| update_hit_miss(cache.get(key), RANGE_RESULT_TYPE))
}
/// Puts range scan result into the cache.
#[cfg_attr(not(test), allow(dead_code))]
/// Puts range scan result into cache.
pub(crate) fn put_range_result(
&self,
key: RangeScanCacheKey,
@@ -949,7 +946,7 @@ impl CacheManagerBuilder {
Cache::builder()
.max_capacity(self.range_result_cache_size)
.weigher(range_result_cache_weight)
.eviction_listener(|k, v, cause| {
.eviction_listener(move |k, v, cause| {
let size = range_result_cache_weight(&k, &v);
CACHE_BYTES
.with_label_values(&[RANGE_RESULT_TYPE])
@@ -1361,7 +1358,7 @@ mod tests {
}
.build(),
};
let value = Arc::new(RangeScanCacheValue::new(Vec::new()));
let value = Arc::new(RangeScanCacheValue::new(Vec::new(), 0));
assert!(cache.get_range_result(&key).is_none());
cache.put_range_result(key.clone(), value.clone());

View File

@@ -116,6 +116,8 @@ pub struct MitoConfig {
pub page_cache_size: ReadableSize,
/// Cache size for time series selector (e.g. `last_value()`). Setting it to 0 to disable the cache.
pub selector_result_cache_size: ReadableSize,
/// Cache size for flat range scan results. Setting it to 0 to disable the cache.
pub range_result_cache_size: ReadableSize,
/// Whether to enable the write cache.
pub enable_write_cache: bool,
/// File system path for write cache dir's root, defaults to `{data_home}`.
@@ -200,6 +202,7 @@ impl Default for MitoConfig {
vector_cache_size: ReadableSize::mb(512),
page_cache_size: ReadableSize::mb(512),
selector_result_cache_size: ReadableSize::mb(512),
range_result_cache_size: ReadableSize::mb(512),
enable_write_cache: false,
write_cache_path: String::new(),
write_cache_size: ReadableSize::gb(5),
@@ -336,6 +339,7 @@ impl MitoConfig {
self.vector_cache_size = mem_cache_size;
self.page_cache_size = page_cache_size;
self.selector_result_cache_size = mem_cache_size;
self.range_result_cache_size = mem_cache_size;
self.index.adjust_buffer_and_cache_size(sys_memory);
}

View File

@@ -537,11 +537,15 @@ pub trait IterBuilder: Send + Sync {
}
/// Returns the record batch iterator to read the range.
/// ## Note
/// Implementations should ensure the iterator yields data within given time range.
fn build_record_batch(
&self,
time_range: Option<(Timestamp, Timestamp)>,
metrics: Option<MemScanMetrics>,
) -> Result<BoxedRecordBatchIterator> {
let _metrics = metrics;
let _ = time_range;
UnsupportedOperationSnafu {
err_msg: "Record batch iterator is not supported by this memtable",
}
@@ -700,7 +704,7 @@ impl MemtableRange {
metrics: Option<MemScanMetrics>,
) -> Result<BoxedRecordBatchIterator> {
if self.context.builder.is_record_batch() {
return self.context.builder.build_record_batch(metrics);
return self.context.builder.build_record_batch(time_range, metrics);
}
if let Some(context) = self.context.batch_to_record_batch.as_ref() {

View File

@@ -34,6 +34,7 @@ fn env_usize(name: &str, default: usize) -> usize {
.unwrap_or(default)
}
use common_time::Timestamp;
use datatypes::arrow::datatypes::SchemaRef;
use mito_codec::key_values::KeyValue;
use rayon::prelude::*;
@@ -792,6 +793,7 @@ impl IterBuilder for BulkRangeIterBuilder {
fn build_record_batch(
&self,
_time_range: Option<(Timestamp, Timestamp)>,
metrics: Option<MemScanMetrics>,
) -> Result<BoxedRecordBatchIterator> {
let series_count = self.part.estimated_series_count();
@@ -825,6 +827,7 @@ impl IterBuilder for MultiBulkRangeIterBuilder {
fn build_record_batch(
&self,
_time_range: Option<(Timestamp, Timestamp)>,
metrics: Option<MemScanMetrics>,
) -> Result<BoxedRecordBatchIterator> {
self.part
@@ -864,6 +867,7 @@ impl IterBuilder for EncodedBulkRangeIterBuilder {
fn build_record_batch(
&self,
_time_range: Option<(Timestamp, Timestamp)>,
metrics: Option<MemScanMetrics>,
) -> Result<BoxedRecordBatchIterator> {
if let Some(iter) = self

View File

@@ -967,7 +967,7 @@ impl EncodedBulkPart {
Self { data, metadata }
}
pub(crate) fn metadata(&self) -> &BulkPartMeta {
pub fn metadata(&self) -> &BulkPartMeta {
&self.metadata
}
@@ -977,7 +977,7 @@ impl EncodedBulkPart {
}
/// Returns the encoded data.
pub(crate) fn data(&self) -> &Bytes {
pub fn data(&self) -> &Bytes {
&self.data
}
@@ -1121,10 +1121,7 @@ pub struct BulkPartEncoder {
}
impl BulkPartEncoder {
pub(crate) fn new(
metadata: RegionMetadataRef,
row_group_size: usize,
) -> Result<BulkPartEncoder> {
pub fn new(metadata: RegionMetadataRef, row_group_size: usize) -> Result<BulkPartEncoder> {
// TODO(yingwen): Skip arrow schema if needed.
let json = metadata.to_json().context(InvalidMetadataSnafu)?;
let key_value_meta =
@@ -1216,7 +1213,7 @@ impl BulkPartEncoder {
}
/// Encodes a bulk part into an [EncodedBulkPart] and returns the encoded data.
fn encode_part(&self, part: &BulkPart) -> Result<Option<EncodedBulkPart>> {
pub fn encode_part(&self, part: &BulkPart) -> Result<Option<EncodedBulkPart>> {
if part.batch.num_rows() == 0 {
return Ok(None);
}

View File

@@ -50,7 +50,7 @@ pub struct EncodedBulkPartIter {
impl EncodedBulkPartIter {
/// Creates a new [EncodedBulkPartIter].
pub(crate) fn try_new(
pub fn try_new(
encoded_part: &EncodedBulkPart,
context: BulkIterContextRef,
mut row_groups_to_read: VecDeque<usize>,

View File

@@ -51,15 +51,18 @@ use crate::memtable::bulk::part::BulkPart;
use crate::memtable::simple_bulk_memtable::SimpleBulkMemtable;
use crate::memtable::stats::WriteMetrics;
use crate::memtable::{
AllocTracker, BatchToRecordBatchContext, BoxedBatchIterator, IterBuilder, KeyValues,
MemScanMetrics, Memtable, MemtableBuilder, MemtableId, MemtableRange, MemtableRangeContext,
MemtableRanges, MemtableRef, MemtableStats, RangesOptions, read_column_ids_from_projection,
AllocTracker, BatchToRecordBatchContext, BoxedBatchIterator, BoxedRecordBatchIterator,
IterBuilder, KeyValues, MemScanMetrics, Memtable, MemtableBuilder, MemtableId, MemtableRange,
MemtableRangeContext, MemtableRanges, MemtableRef, MemtableStats, RangesOptions,
read_column_ids_from_projection,
};
use crate::metrics::{
MEMTABLE_ACTIVE_FIELD_BUILDER_COUNT, MEMTABLE_ACTIVE_SERIES_COUNT, READ_ROWS_TOTAL,
READ_STAGE_ELAPSED,
};
use crate::read::dedup::LastNonNullIter;
use crate::read::prune::PruneTimeIterator;
use crate::read::scan_region::PredicateGroup;
use crate::read::{Batch, BatchBuilder, BatchColumn};
use crate::region::options::MergeMode;
@@ -283,25 +286,20 @@ impl Memtable for TimeSeriesMemtable {
.map(|c| c.column_id)
.collect()
};
let builder = Box::new(TimeSeriesIterBuilder {
series_set: self.series_set.clone(),
projection,
predicate: predicate.predicate().cloned(),
dedup: self.dedup,
merge_mode: self.merge_mode,
sequence,
});
let adapter_context = Arc::new(BatchToRecordBatchContext::new(
let batch_to_record_batch = Arc::new(BatchToRecordBatchContext::new(
self.region_metadata.clone(),
read_column_ids,
));
let context = Arc::new(MemtableRangeContext::new_with_batch_to_record_batch(
self.id,
builder,
predicate,
Some(adapter_context),
));
let builder = Box::new(TimeSeriesIterBuilder {
series_set: self.series_set.clone(),
projection,
predicate: predicate.clone(),
dedup: self.dedup,
merge_mode: self.merge_mode,
sequence,
batch_to_record_batch,
});
let context = Arc::new(MemtableRangeContext::new(self.id, builder, predicate));
let range_stats = self.stats();
let range = MemtableRange::new(context, range_stats);
Ok(MemtableRanges {
@@ -443,7 +441,7 @@ impl SeriesSet {
fn iter_series(
&self,
projection: HashSet<ColumnId>,
predicate: Option<Predicate>,
predicate: PredicateGroup,
dedup: bool,
merge_mode: MergeMode,
sequence: Option<SequenceRange>,
@@ -460,7 +458,7 @@ impl SeriesSet {
self.region_metadata.clone(),
self.series.clone(),
projection,
predicate,
predicate.predicate().cloned(),
primary_key_schema,
primary_key_datatypes,
self.codec.clone(),
@@ -1245,10 +1243,11 @@ impl From<ValueBuilder> for Values {
struct TimeSeriesIterBuilder {
series_set: SeriesSet,
projection: HashSet<ColumnId>,
predicate: Option<Predicate>,
predicate: PredicateGroup,
dedup: bool,
sequence: Option<SequenceRange>,
merge_mode: MergeMode,
batch_to_record_batch: Arc<BatchToRecordBatchContext>,
}
impl IterBuilder for TimeSeriesIterBuilder {
@@ -1268,6 +1267,25 @@ impl IterBuilder for TimeSeriesIterBuilder {
Ok(Box::new(iter))
}
}
/// Always returns `true`: this builder provides its own `build_record_batch`
/// implementation.
fn is_record_batch(&self) -> bool {
true
}
/// Builds a record batch iterator over the time series data.
///
/// The batch iterator produced by `build` is wrapped in a [PruneTimeIterator]
/// when a `time_range` is given, and is then adapted into record batches
/// through the builder's `batch_to_record_batch` context.
fn build_record_batch(
    &self,
    time_range: Option<(Timestamp, Timestamp)>,
    metrics: Option<MemScanMetrics>,
) -> Result<BoxedRecordBatchIterator> {
    let batches = self.build(metrics)?;
    let batches: BoxedBatchIterator = match time_range {
        // Prune rows by the requested range and the predicate's time filters.
        Some(range) => Box::new(PruneTimeIterator::new(
            batches,
            range,
            self.predicate.time_filters(),
        )),
        None => batches,
    };
    Ok(self.batch_to_record_batch.adapt_iter(batches))
}
}
#[cfg(test)]
@@ -2014,4 +2032,265 @@ mod tests {
all_timestamps.sort();
assert_eq!(vec![3, 4, 5, 6, 7], all_timestamps);
}
/// Test helper: assembles a [TimeSeriesIterBuilder] that reads from `memtable`.
///
/// Without a `projection`, every field column of `schema` is read. The
/// builder always uses the default (empty) predicate group.
fn build_iter_builder(
    schema: &RegionMetadataRef,
    memtable: &TimeSeriesMemtable,
    projection: Option<&[ColumnId]>,
    dedup: bool,
    merge_mode: MergeMode,
    sequence: Option<SequenceRange>,
) -> TimeSeriesIterBuilder {
    let batch_to_record_batch = Arc::new(BatchToRecordBatchContext::new(
        schema.clone(),
        read_column_ids_from_projection(schema, projection),
    ));
    let projected_fields = match projection {
        Some(column_ids) => column_ids.iter().copied().collect(),
        None => schema
            .field_columns()
            .map(|column| column.column_id)
            .collect(),
    };

    TimeSeriesIterBuilder {
        series_set: memtable.series_set.clone(),
        projection: projected_fields,
        predicate: PredicateGroup::default(),
        dedup,
        merge_mode,
        sequence,
        batch_to_record_batch,
    }
}
#[test]
fn test_iter_builder_build_record_batch_basic() {
    let metadata = schema_for_test();
    let memtable = TimeSeriesMemtable::new(metadata.clone(), 1, None, true, MergeMode::LastRow);
    memtable
        .write(&build_key_values(&metadata, "hello".to_string(), 42, 10))
        .unwrap();

    let builder = build_iter_builder(&metadata, &memtable, None, true, MergeMode::LastRow, None);
    let mut batches = builder.build_record_batch(None, None).unwrap();

    // A single series is expected to come back as exactly one record batch.
    let first = batches.next().transpose().unwrap().unwrap();
    assert_eq!(10, first.num_rows());

    let arrow_schema = first.schema();
    let mut actual_names = Vec::new();
    for field in arrow_schema.fields().iter() {
        actual_names.push(field.name().as_str());
    }
    let expected_names = vec![
        "k0",
        "k1",
        "v0",
        "v1",
        "ts",
        "__primary_key",
        "__sequence",
        "__op_type",
    ];
    assert_eq!(actual_names, expected_names);

    assert!(batches.next().is_none());
}
#[test]
fn test_iter_builder_build_record_batch_with_projection() {
    let metadata = schema_for_test();
    let memtable = TimeSeriesMemtable::new(metadata.clone(), 1, None, true, MergeMode::LastRow);
    memtable
        .write(&build_key_values(&metadata, "test".to_string(), 1, 5))
        .unwrap();

    // Project only field v0 (column_id=3) and ts (column_id=2).
    let projected_ids = vec![2, 3];
    let builder = build_iter_builder(
        &metadata,
        &memtable,
        Some(&projected_ids),
        true,
        MergeMode::LastRow,
        None,
    );
    let mut batches = builder.build_record_batch(None, None).unwrap();

    let first = batches.next().transpose().unwrap().unwrap();
    assert_eq!(5, first.num_rows());

    let arrow_schema = first.schema();
    let mut actual_names = Vec::new();
    for field in arrow_schema.fields().iter() {
        actual_names.push(field.name().as_str());
    }
    // Only projected columns + internal columns.
    let expected_names = vec!["v0", "ts", "__primary_key", "__sequence", "__op_type"];
    assert_eq!(actual_names, expected_names);

    assert!(batches.next().is_none());
}
#[test]
fn test_iter_builder_build_record_batch_multiple_series() {
    let metadata = schema_for_test();
    let memtable = TimeSeriesMemtable::new(metadata.clone(), 1, None, true, MergeMode::LastRow);

    // Two distinct series: 3 rows for "aaa" and 4 rows for "bbb".
    for (key, k1, rows) in [("aaa", 1, 3), ("bbb", 2, 4)] {
        let kvs = build_key_values(&metadata, key.to_string(), k1, rows);
        memtable.write(&kvs).unwrap();
    }

    let builder = build_iter_builder(&metadata, &memtable, None, true, MergeMode::LastRow, None);
    let batches = builder.build_record_batch(None, None).unwrap();

    let total_rows: usize = batches
        .map(|batch| {
            let batch = batch.unwrap();
            assert_eq!(8, batch.num_columns());
            batch.num_rows()
        })
        .sum();
    assert_eq!(7, total_rows);
}
#[test]
fn test_iter_builder_build_record_batch_dedup() {
    let metadata = schema_for_test();
    let memtable = TimeSeriesMemtable::new(metadata.clone(), 1, None, true, MergeMode::LastRow);

    // Write same data twice — dedup should keep only one copy per timestamp.
    let kvs = build_key_values(&metadata, "dup".to_string(), 10, 5);
    for _ in 0..2 {
        memtable.write(&kvs).unwrap();
    }

    let builder = build_iter_builder(&metadata, &memtable, None, true, MergeMode::LastRow, None);
    let mut total_rows = 0usize;
    for batch in builder.build_record_batch(None, None).unwrap() {
        total_rows += batch.unwrap().num_rows();
    }
    assert_eq!(5, total_rows);
}
#[test]
fn test_iter_builder_build_record_batch_no_dedup() {
    let metadata = schema_for_test();
    let memtable = TimeSeriesMemtable::new(metadata.clone(), 1, None, false, MergeMode::LastRow);

    // Same rows written twice; with dedup disabled both copies must be returned.
    let kvs = build_key_values(&metadata, "dup".to_string(), 10, 5);
    for _ in 0..2 {
        memtable.write(&kvs).unwrap();
    }

    let builder = build_iter_builder(&metadata, &memtable, None, false, MergeMode::LastRow, None);
    let mut total_rows = 0usize;
    for batch in builder.build_record_batch(None, None).unwrap() {
        total_rows += batch.unwrap().num_rows();
    }
    assert_eq!(10, total_rows);
}
#[test]
fn test_iter_builder_build_record_batch_with_sequence_filter() {
    let metadata = schema_for_test();
    let memtable = TimeSeriesMemtable::new(metadata.clone(), 1, None, true, MergeMode::LastRow);

    // build_key_values creates a mutation with base sequence=0.
    // Each row gets sequence = base + row_index, so 5 rows get sequences 0,1,2,3,4.
    let kvs = build_key_values(&metadata, "seq".to_string(), 1, 5);
    memtable.write(&kvs).unwrap();

    // Counts the rows visible through the given sequence filter.
    let count_rows = |sequence: SequenceRange| -> usize {
        let builder = build_iter_builder(
            &metadata,
            &memtable,
            None,
            true,
            MergeMode::LastRow,
            Some(sequence),
        );
        let batches = builder.build_record_batch(None, None).unwrap();
        let total: usize = batches.map(|batch| batch.unwrap().num_rows()).sum();
        total
    };

    // Filter to sequence > 4 — should yield no rows.
    assert_eq!(0, count_rows(SequenceRange::Gt { min: 4 }));

    // Filter to sequence <= 2 — should yield 3 rows (sequences 0, 1, 2).
    assert_eq!(3, count_rows(SequenceRange::LtEq { max: 2 }));
}
#[test]
fn test_iter_builder_build_record_batch_data_correctness() {
    use datatypes::arrow::array::{
        Float64Array, Int64Array, TimestampMillisecondArray, UInt8Array,
    };

    let metadata = schema_for_test();
    let memtable = TimeSeriesMemtable::new(metadata.clone(), 1, None, true, MergeMode::LastRow);
    memtable
        .write(&build_key_values(&metadata, "check".to_string(), 7, 3))
        .unwrap();

    let builder = build_iter_builder(&metadata, &memtable, None, true, MergeMode::LastRow, None);
    let mut batches = builder.build_record_batch(None, None).unwrap();
    let batch = batches.next().transpose().unwrap().unwrap();
    assert_eq!(3, batch.num_rows());

    // Verify timestamp values.
    let ts_array = batch.column_by_name("ts").unwrap();
    let ts_array = ts_array
        .as_any()
        .downcast_ref::<TimestampMillisecondArray>()
        .unwrap();
    let mut timestamps = Vec::with_capacity(ts_array.len());
    for row in 0..ts_array.len() {
        timestamps.push(ts_array.value(row));
    }
    assert_eq!(vec![0, 1, 2], timestamps);

    // Verify field v0 values.
    let v0_array = batch.column_by_name("v0").unwrap();
    let v0_array = v0_array.as_any().downcast_ref::<Int64Array>().unwrap();
    let mut v0_values = Vec::with_capacity(v0_array.len());
    for row in 0..v0_array.len() {
        v0_values.push(v0_array.value(row));
    }
    assert_eq!(vec![0, 1, 2], v0_values);

    // Verify field v1 values.
    let v1_array = batch.column_by_name("v1").unwrap();
    let v1_array = v1_array.as_any().downcast_ref::<Float64Array>().unwrap();
    let mut v1_values = Vec::with_capacity(v1_array.len());
    for row in 0..v1_array.len() {
        v1_values.push(v1_array.value(row));
    }
    assert_eq!(vec![0.0, 1.0, 2.0], v1_values);

    // Verify op_type is all Put (1).
    let op_array = batch.column_by_name("__op_type").unwrap();
    let op_array = op_array.as_any().downcast_ref::<UInt8Array>().unwrap();
    let expected_op = OpType::Put as u8;
    for row in 0..op_array.len() {
        assert_eq!(expected_op, op_array.value(row));
    }

    assert!(batches.next().is_none());
}
}

View File

@@ -27,6 +27,9 @@ pub mod projection;
pub(crate) mod prune;
pub(crate) mod pruner;
pub mod range;
#[cfg(feature = "test")]
pub mod range_cache;
#[cfg(not(feature = "test"))]
pub(crate) mod range_cache;
pub mod scan_region;
pub mod scan_util;

View File

@@ -18,18 +18,21 @@ use std::sync::Arc;
use api::v1::SemanticType;
use common_error::ext::BoxedError;
use common_recordbatch::error::{ArrowComputeSnafu, ExternalSnafu};
use common_recordbatch::error::{ArrowComputeSnafu, ExternalSnafu, NewDfRecordBatchSnafu};
use common_recordbatch::{DfRecordBatch, RecordBatch};
use datatypes::arrow::datatypes::Field;
use datatypes::arrow::array::Array;
use datatypes::arrow::datatypes::{DataType as ArrowDataType, Field};
use datatypes::prelude::{ConcreteDataType, DataType};
use datatypes::schema::{Schema, SchemaRef};
use datatypes::value::Value;
use datatypes::vectors::Helper;
use snafu::{OptionExt, ResultExt};
use store_api::metadata::{RegionMetadata, RegionMetadataRef};
use store_api::storage::ColumnId;
use crate::cache::CacheStrategy;
use crate::error::{InvalidRequestSnafu, RecordBatchSnafu, Result};
use crate::read::projection::read_column_ids_from_projection;
use crate::read::projection::{read_column_ids_from_projection, repeated_vector_with_cache};
use crate::sst::parquet::flat_format::sst_column_id_indices;
use crate::sst::parquet::format::FormatProjection;
use crate::sst::{
@@ -248,12 +251,55 @@ impl FlatProjectionMapper {
pub(crate) fn convert(
&self,
batch: &datatypes::arrow::record_batch::RecordBatch,
cache_strategy: &CacheStrategy,
) -> common_recordbatch::error::Result<RecordBatch> {
if self.is_empty_projection {
return RecordBatch::new_with_count(self.output_schema.clone(), batch.num_rows());
}
let columns = self.project_vectors(batch)?;
RecordBatch::new(self.output_schema.clone(), columns)
// Construct output record batch directly from Arrow arrays to avoid
// Arrow -> Vector -> Arrow roundtrips in the hot path.
let mut arrays = Vec::with_capacity(self.output_schema.num_columns());
for (output_idx, index) in self.batch_indices.iter().enumerate() {
let mut array = batch.column(*index).clone();
// Cast dictionary values to the target type.
if let ArrowDataType::Dictionary(_key_type, value_type) = array.data_type() {
// When a string dictionary column contains only a single value, reuse a cached
// repeated vector to avoid repeatedly expanding the dictionary.
if let Some(dict_array) = single_value_string_dictionary(
&array,
&self.output_schema.column_schemas()[output_idx].data_type,
value_type.as_ref(),
) {
let dict_values = dict_array.values();
let value = if dict_values.is_null(0) {
Value::Null
} else {
Value::from(datatypes::arrow_array::string_array_value(dict_values, 0))
};
let repeated = repeated_vector_with_cache(
&self.output_schema.column_schemas()[output_idx].data_type,
&value,
batch.num_rows(),
cache_strategy,
)?;
array = repeated.to_arrow_array();
} else {
let casted = datatypes::arrow::compute::cast(&array, value_type)
.context(ArrowComputeSnafu)?;
array = casted;
}
}
arrays.push(array);
}
let df_record_batch =
DfRecordBatch::try_new(self.output_schema.arrow_schema().clone(), arrays)
.context(NewDfRecordBatchSnafu)?;
Ok(RecordBatch::from_df_record_batch(
self.output_schema.clone(),
df_record_batch,
))
}
/// Projects columns from the input batch and converts them into vectors.
@@ -281,6 +327,28 @@ impl FlatProjectionMapper {
}
}
/// Returns `array` as a `UInt32`-keyed dictionary array when it is a string
/// dictionary holding exactly one dictionary value and no null entries.
///
/// Returns `None` when `value_type` is not an Arrow string type, when
/// `output_type` is not a string type, when `array` is not a
/// `DictionaryArray<UInt32Type>`, or when the single-value condition fails.
fn single_value_string_dictionary<'a>(
    array: &'a Arc<dyn Array>,
    output_type: &ConcreteDataType,
    value_type: &ArrowDataType,
) -> Option<&'a datatypes::arrow::array::DictionaryArray<datatypes::arrow::datatypes::UInt32Type>> {
    use datatypes::arrow::array::DictionaryArray;
    use datatypes::arrow::datatypes::UInt32Type;

    let has_string_values = matches!(
        value_type,
        ArrowDataType::Utf8 | ArrowDataType::LargeUtf8 | ArrowDataType::Utf8View
    );
    if !(has_string_values && output_type.is_string()) {
        return None;
    }

    let dictionary = array
        .as_any()
        .downcast_ref::<DictionaryArray<UInt32Type>>()?;
    if dictionary.values().len() == 1 && dictionary.null_count() == 0 {
        Some(dictionary)
    } else {
        None
    }
}
/// Returns ids and datatypes of columns of the output batch after applying the `projection`.
///
/// It adds the time index column if it isn't present in the projection.

View File

@@ -21,7 +21,7 @@ use std::sync::Arc;
use api::v1::SemanticType;
use common_error::ext::BoxedError;
use common_recordbatch::RecordBatch;
use common_recordbatch::error::ExternalSnafu;
use common_recordbatch::error::{DataTypesSnafu, ExternalSnafu};
use datatypes::prelude::{ConcreteDataType, DataType};
use datatypes::schema::{Schema, SchemaRef};
use datatypes::value::Value;
@@ -37,7 +37,7 @@ use crate::read::Batch;
use crate::read::flat_projection::FlatProjectionMapper;
/// Only cache vector when its length `<=` this value.
const MAX_VECTOR_LENGTH_TO_CACHE: usize = 16384;
pub(crate) const MAX_VECTOR_LENGTH_TO_CACHE: usize = 16384;
/// Wrapper enum for different projection mapper implementations.
pub enum ProjectionMapper {
@@ -423,7 +423,7 @@ enum BatchIndex {
}
/// Gets a vector with repeated values from specific cache or creates a new one.
fn repeated_vector_with_cache(
pub(crate) fn repeated_vector_with_cache(
data_type: &ConcreteDataType,
value: &Value,
num_rows: usize,
@@ -450,7 +450,7 @@ fn repeated_vector_with_cache(
}
/// Returns a vector with repeated values.
fn new_repeated_vector(
pub(crate) fn new_repeated_vector(
data_type: &ConcreteDataType,
value: &Value,
num_rows: usize,
@@ -458,8 +458,7 @@ fn new_repeated_vector(
let mut mutable_vector = data_type.create_mutable_vector(1);
mutable_vector
.try_push_value_ref(&value.as_value_ref())
.map_err(BoxedError::new)
.context(ExternalSnafu)?;
.context(DataTypesSnafu)?;
// This requires an additional allocation.
let base_vector = mutable_vector.to_vector();
Ok(base_vector.replicate(&[num_rows]))
@@ -809,6 +808,7 @@ mod tests {
.num_fields(2)
.build(),
);
let cache = CacheStrategy::Disabled;
let mapper = ProjectionMapper::all(&metadata, true).unwrap();
assert_eq!([0, 1, 2, 3, 4], mapper.column_ids());
assert_eq!(
@@ -823,7 +823,7 @@ mod tests {
);
let batch = new_flat_batch(Some(0), &[(1, 1), (2, 2)], &[(3, 3), (4, 4)], 3);
let record_batch = mapper.as_flat().unwrap().convert(&batch).unwrap();
let record_batch = mapper.as_flat().unwrap().convert(&batch, &cache).unwrap();
let expect = "\
+---------------------+----+----+----+----+
| ts | k0 | k1 | v0 | v1 |
@@ -843,6 +843,7 @@ mod tests {
.num_fields(2)
.build(),
);
let cache = CacheStrategy::Disabled;
// Columns v1, k0
let mapper = ProjectionMapper::new(&metadata, [4, 1].into_iter(), true).unwrap();
assert_eq!([4, 1], mapper.column_ids());
@@ -856,7 +857,7 @@ mod tests {
);
let batch = new_flat_batch(None, &[(1, 1)], &[(4, 4)], 3);
let record_batch = mapper.as_flat().unwrap().convert(&batch).unwrap();
let record_batch = mapper.as_flat().unwrap().convert(&batch, &cache).unwrap();
let expect = "\
+----+----+
| v1 | k0 |
@@ -876,6 +877,7 @@ mod tests {
.num_fields(2)
.build(),
);
let cache = CacheStrategy::Disabled;
// Output columns v1, k0. Read also includes v0.
let mapper = ProjectionMapper::new_with_read_columns(
&metadata,
@@ -887,7 +889,7 @@ mod tests {
assert_eq!([4, 1, 3], mapper.column_ids());
let batch = new_flat_batch(None, &[(1, 1)], &[(3, 3), (4, 4)], 3);
let record_batch = mapper.as_flat().unwrap().convert(&batch).unwrap();
let record_batch = mapper.as_flat().unwrap().convert(&batch, &cache).unwrap();
let expect = "\
+----+----+
| v1 | k0 |
@@ -907,6 +909,7 @@ mod tests {
.num_fields(2)
.build(),
);
let cache = CacheStrategy::Disabled;
// Empty projection
let mapper = ProjectionMapper::new(&metadata, [].into_iter(), true).unwrap();
assert_eq!([0], mapper.column_ids()); // Should still read the time index column
@@ -918,7 +921,7 @@ mod tests {
);
let batch = new_flat_batch(Some(0), &[], &[], 3);
let record_batch = flat_mapper.convert(&batch).unwrap();
let record_batch = flat_mapper.convert(&batch, &cache).unwrap();
assert_eq!(3, record_batch.num_rows());
assert_eq!(0, record_batch.num_columns());
assert!(record_batch.schema.is_empty());

View File

@@ -17,12 +17,23 @@
use std::mem;
use std::sync::Arc;
use async_stream::try_stream;
use common_time::range::TimestampRange;
use datatypes::arrow::array::{Array, AsArray, DictionaryArray};
use datatypes::arrow::datatypes::UInt32Type;
use datatypes::arrow::record_batch::RecordBatch;
use datatypes::prelude::ConcreteDataType;
use futures::TryStreamExt;
use store_api::region_engine::PartitionRange;
use store_api::storage::{ColumnId, FileId, RegionId, TimeSeriesRowSelector};
use crate::memtable::record_batch_estimated_size;
use crate::cache::CacheStrategy;
use crate::read::BoxedRecordBatchStream;
use crate::read::scan_region::StreamContext;
use crate::read::scan_util::PartitionMetrics;
use crate::region::options::MergeMode;
use crate::sst::file::FileTimeRange;
use crate::sst::parquet::flat_format::primary_key_column_index;
/// Fingerprint of the scan request fields that affect partition range cache reuse.
///
@@ -124,7 +135,6 @@ impl ScanRequestFingerprint {
.unwrap_or(&[])
}
#[cfg(test)]
pub(crate) fn without_time_filters(&self) -> Self {
Self {
inner: Arc::clone(&self.inner),
@@ -163,7 +173,7 @@ impl ScanRequestFingerprint {
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub(crate) struct RangeScanCacheKey {
pub(crate) region_id: RegionId,
/// Sorted (file_id, row_group_index) pairs that uniquely identify the covered data.
/// Sorted (file_id, row_group_index) pairs that uniquely identify the data this range covers.
pub(crate) row_groups: Vec<(FileId, i64)>,
pub(crate) scan: ScanRequestFingerprint,
}
@@ -179,30 +189,458 @@ impl RangeScanCacheKey {
/// Cached result for one range scan.
pub(crate) struct RangeScanCacheValue {
pub(crate) batches: Vec<RecordBatch>,
/// Precomputed size of all batches, accounting for shared dictionary values.
estimated_batches_size: usize,
}
impl RangeScanCacheValue {
#[cfg_attr(not(test), allow(dead_code))]
pub(crate) fn new(batches: Vec<RecordBatch>) -> Self {
Self { batches }
pub(crate) fn new(batches: Vec<RecordBatch>, estimated_batches_size: usize) -> Self {
Self {
batches,
estimated_batches_size,
}
}
pub(crate) fn estimated_size(&self) -> usize {
mem::size_of::<Self>()
+ self.batches.capacity() * mem::size_of::<RecordBatch>()
+ self
.batches
.iter()
.map(record_batch_estimated_size)
.sum::<usize>()
+ self.estimated_batches_size
}
}
/// Row groups and whether all sources are file-only for a partition range.
///
/// Produced by `collect_partition_range_row_groups`.
#[allow(dead_code)]
pub(crate) struct PartitionRangeRowGroups {
/// Sorted (file_id, row_group_index) pairs.
/// Ordered by the file id bytes first, then by the row group index.
pub(crate) row_groups: Vec<(FileId, i64)>,
/// `true` when every row group index of the range refers to a file range;
/// `false` as soon as one index is not a file range index.
pub(crate) only_file_sources: bool,
}
/// Gathers the `(file_id, row_group_index)` pairs covered by `part_range`.
///
/// Indices that do not refer to a file range are not collected; their presence
/// is reported through `only_file_sources == false`. The returned pairs are
/// sorted by file id bytes, then by row group index.
#[allow(dead_code)]
pub(crate) fn collect_partition_range_row_groups(
    stream_ctx: &StreamContext,
    part_range: &PartitionRange,
) -> PartitionRangeRowGroups {
    let indices = &stream_ctx.ranges[part_range.identifier].row_group_indices;

    let mut only_file_sources = true;
    let mut row_groups = Vec::new();
    for rg_index in indices {
        if !stream_ctx.is_file_range_index(*rg_index) {
            only_file_sources = false;
            continue;
        }
        let file_id = stream_ctx
            .input
            .file_from_index(*rg_index)
            .file_id()
            .file_id();
        row_groups.push((file_id, rg_index.row_group_index));
    }

    row_groups.sort_unstable_by(|lhs, rhs| {
        let by_file = lhs.0.as_bytes().cmp(rhs.0.as_bytes());
        by_file.then(lhs.1.cmp(&rhs.1))
    });

    PartitionRangeRowGroups {
        row_groups,
        only_file_sources,
    }
}
/// Builds a cache key for the given partition range if it is eligible for caching.
#[allow(dead_code)]
pub(crate) fn build_range_cache_key(
stream_ctx: &StreamContext,
part_range: &PartitionRange,
) -> Option<RangeScanCacheKey> {
let fingerprint = stream_ctx.scan_fingerprint.as_ref()?;
// Dyn filters can change at runtime, so we can't cache when they're present.
let has_dyn_filters = stream_ctx
.input
.predicate_group()
.predicate_without_region()
.is_some_and(|p| !p.dyn_filters().is_empty());
if has_dyn_filters {
return None;
}
let rg = collect_partition_range_row_groups(stream_ctx, part_range);
if !rg.only_file_sources || rg.row_groups.is_empty() {
return None;
}
let range_meta = &stream_ctx.ranges[part_range.identifier];
let scan = if query_time_range_covers_partition_range(
stream_ctx.input.time_range.as_ref(),
range_meta.time_range,
) {
fingerprint.without_time_filters()
} else {
fingerprint.clone()
};
Some(RangeScanCacheKey {
region_id: stream_ctx.input.region_metadata().region_id,
row_groups: rg.row_groups,
scan,
})
}
/// Returns whether the query time range contains both ends of the partition
/// time range. A query without a time range covers every partition range.
#[allow(dead_code)]
fn query_time_range_covers_partition_range(
    query_time_range: Option<&TimestampRange>,
    partition_time_range: FileTimeRange,
) -> bool {
    match query_time_range {
        None => true,
        Some(query_range) => {
            let (part_start, part_end) = partition_time_range;
            query_range.contains(&part_start) && query_range.contains(&part_end)
        }
    }
}
/// Creates a stream that yields a copy of every cached record batch, in the
/// order they were stored.
#[allow(dead_code)]
pub(crate) fn cached_flat_range_stream(value: Arc<RangeScanCacheValue>) -> BoxedRecordBatchStream {
    let batches = value.batches.clone();
    Box::pin(futures::stream::iter(batches.into_iter().map(Ok)))
}
/// Checks by pointer comparison whether two primary key dictionary arrays are
/// backed by the same values buffer, offsets buffer and null buffer.
///
/// The primary key column is always `DictionaryArray<UInt32Type>` with `Binary` values.
fn pk_values_ptr_eq(a: &DictionaryArray<UInt32Type>, b: &DictionaryArray<UInt32Type>) -> bool {
    let lhs = a.values().as_binary::<i32>();
    let rhs = b.values().as_binary::<i32>();

    let same_data = lhs.values().ptr_eq(rhs.values()) && lhs.offsets().ptr_eq(rhs.offsets());
    if !same_data {
        return false;
    }

    // Null buffers must either both be absent or both point to the same buffer.
    match (lhs.nulls(), rhs.nulls()) {
        (Some(lhs_nulls), Some(rhs_nulls)) => lhs_nulls.inner().ptr_eq(rhs_nulls.inner()),
        (None, None) => true,
        _ => false,
    }
}
/// Buffers record batches for caching, tracking memory size while deduplicating
/// shared dictionary values across batches.
///
/// Uses the primary key column as a proxy to detect dictionary sharing: if the PK
/// column's dictionary values are pointer-equal across batches, we assume all
/// dictionary columns share their values and deduct the total dictionary values size.
struct CacheBatchBuffer {
    /// Batches in the order they were pushed.
    batches: Vec<RecordBatch>,
    /// Running total of batch memory, with shared dictionary values deducted
    /// from every batch after the first.
    total_size: usize,
    /// The first batch's PK dictionary array, for pointer comparison.
    /// `None` if no dictionary PK column exists or no batch has been added yet.
    first_pk_dict: Option<DictionaryArray<UInt32Type>>,
    /// Sum of `get_array_memory_size()` of all dictionary value arrays from the first batch.
    total_dict_values_size: usize,
    /// Whether the PK dictionary is still shared across all batches seen so far.
    shared: bool,
}

impl CacheBatchBuffer {
    /// Creates an empty buffer.
    fn new() -> Self {
        Self {
            batches: Vec::new(),
            total_size: 0,
            first_pk_dict: None,
            total_dict_values_size: 0,
            shared: true,
        }
    }

    /// Appends `batch` and updates the size estimate.
    fn push(&mut self, batch: RecordBatch) {
        if self.batches.is_empty() {
            self.init_first_batch(&batch);
        } else {
            self.add_subsequent_batch(&batch);
        }
        self.batches.push(batch);
    }

    /// Accounts for the first batch in full and records its dictionary state
    /// for comparison with later batches.
    fn init_first_batch(&mut self, batch: &RecordBatch) {
        self.total_size += batch.get_array_memory_size();

        let pk_col_idx = primary_key_column_index(batch.num_columns());
        let mut total_dict_values_size = 0;
        for col_idx in 0..batch.num_columns() {
            let col = batch.column(col_idx);
            if let Some(dict) = col.as_any().downcast_ref::<DictionaryArray<UInt32Type>>() {
                total_dict_values_size += dict.values().get_array_memory_size();
                if col_idx == pk_col_idx {
                    self.first_pk_dict = Some(dict.clone());
                }
            }
        }
        self.total_dict_values_size = total_dict_values_size;
    }

    /// Accounts for a batch after the first one.
    ///
    /// While the PK dictionary is still shared with the first batch, the
    /// dictionary values size recorded from the first batch is deducted so the
    /// shared values are counted once. Once a batch diverges, this and every
    /// later batch is counted in full.
    fn add_subsequent_batch(&mut self, batch: &RecordBatch) {
        let batch_size = batch.get_array_memory_size();

        if self.shared
            && let Some(first_pk_dict) = &self.first_pk_dict
        {
            let pk_col_idx = primary_key_column_index(batch.num_columns());
            let col = batch.column(pk_col_idx);
            if let Some(dict) = col.as_any().downcast_ref::<DictionaryArray<UInt32Type>>()
                && pk_values_ptr_eq(first_pk_dict, dict)
            {
                // PK dict is shared, deduct all dict values sizes.
                // Only the PK dictionary is verified to be shared; the other
                // dictionary columns of this batch may be smaller than in the
                // first batch, so saturate instead of underflowing.
                self.total_size += batch_size.saturating_sub(self.total_dict_values_size);
                return;
            }
            // Dictionary diverged.
            self.shared = false;
        }

        self.total_size += batch_size;
    }

    /// Returns the accumulated size estimate of all buffered batches.
    fn estimated_batches_size(&self) -> usize {
        self.total_size
    }

    /// Consumes the buffer and returns the batches in push order.
    fn into_batches(self) -> Vec<RecordBatch> {
        self.batches
    }
}
/// Wraps `stream` so that every batch it yields is also buffered; once the
/// inner stream is exhausted, the buffered batches are stored in the range
/// cache under `key`.
///
/// Nothing is stored if the inner stream returns an error, because the error
/// is propagated before the cache is written.
#[allow(dead_code)]
pub(crate) fn cache_flat_range_stream(
    mut stream: BoxedRecordBatchStream,
    cache_strategy: CacheStrategy,
    key: RangeScanCacheKey,
    part_metrics: PartitionMetrics,
) -> BoxedRecordBatchStream {
    Box::pin(try_stream! {
        let mut buffered = CacheBatchBuffer::new();
        while let Some(batch) = stream.try_next().await? {
            buffered.push(batch.clone());
            yield batch;
        }

        let batches_size = buffered.estimated_batches_size();
        let value = Arc::new(RangeScanCacheValue::new(
            buffered.into_batches(),
            batches_size,
        ));
        let cached_bytes = key.estimated_size() + value.estimated_size();
        part_metrics.inc_range_cache_size(cached_bytes);
        cache_strategy.put_range_result(key, value);
    })
}
/// Builds a `cache_flat_range_stream` around `stream` using placeholder
/// internals, for benchmarking.
///
/// A fresh cache manager with `cache_size_bytes` of range result cache, an
/// empty scan fingerprint and an empty row group list are created on every
/// call. This avoids exposing `RangeScanCacheKey`, `ScanRequestFingerprint`,
/// and `PartitionMetrics` publicly.
#[cfg(feature = "test")]
pub fn bench_cache_flat_range_stream(
    stream: BoxedRecordBatchStream,
    cache_size_bytes: u64,
    region_id: RegionId,
) -> BoxedRecordBatchStream {
    use std::time::Instant;

    use datafusion::physical_plan::metrics::ExecutionPlanMetricsSet;

    use crate::region::options::MergeMode;

    let scan = ScanRequestFingerprintBuilder {
        read_column_ids: vec![],
        read_column_types: vec![],
        filters: vec![],
        time_filters: vec![],
        series_row_selector: None,
        append_mode: false,
        filter_deleted: false,
        merge_mode: MergeMode::LastRow,
        partition_expr_version: 0,
    }
    .build();
    let key = RangeScanCacheKey {
        region_id,
        row_groups: vec![],
        scan,
    };

    let manager = crate::cache::CacheManager::builder()
        .range_result_cache_size(cache_size_bytes)
        .build();
    let cache_strategy = CacheStrategy::EnableAll(Arc::new(manager));

    let metrics_set = ExecutionPlanMetricsSet::new();
    let part_metrics =
        PartitionMetrics::new(region_id, 0, "bench", Instant::now(), false, &metrics_set);

    cache_flat_range_stream(stream, cache_strategy, key, part_metrics)
}
#[cfg(test)]
mod tests {
use store_api::storage::TimeSeriesRowSelector;
use std::sync::Arc;
use std::time::Instant;
use common_time::Timestamp;
use common_time::range::TimestampRange;
use common_time::timestamp::TimeUnit;
use datafusion_common::ScalarValue;
use datafusion_expr::{Expr, col, lit};
use smallvec::smallvec;
use store_api::storage::FileId;
use super::*;
use crate::cache::CacheManager;
use crate::read::projection::ProjectionMapper;
use crate::read::range::{RangeMeta, RowGroupIndex, SourceIndex};
use crate::read::scan_region::{PredicateGroup, ScanInput};
use crate::test_util::memtable_util::metadata_with_primary_key;
use crate::test_util::scheduler_util::SchedulerEnv;
use crate::test_util::sst_util::sst_file_handle_with_file_id;
/// Returns a cache strategy backed by a cache manager whose range result
/// cache size is set to 1024.
fn test_cache_strategy() -> CacheStrategy {
    let manager = CacheManager::builder()
        .range_result_cache_size(1024)
        .build();
    CacheStrategy::EnableAll(Arc::new(manager))
}
/// Builds a [StreamContext] holding one SST file with a single row group, and
/// the partition range that refers to it.
///
/// The file and the range meta both span `partition_time_range`; `filters`
/// and `query_time_range` configure the scan input.
async fn new_stream_context(
    filters: Vec<Expr>,
    query_time_range: Option<TimestampRange>,
    partition_time_range: FileTimeRange,
) -> (StreamContext, PartitionRange) {
    let env = SchedulerEnv::new().await;
    let metadata = Arc::new(metadata_with_primary_key(vec![0, 1], false));
    let mapper = ProjectionMapper::new(&metadata, [0, 2, 3].into_iter(), true).unwrap();
    let predicate = PredicateGroup::new(metadata.as_ref(), &filters).unwrap();

    let (range_start, range_end) = partition_time_range;
    let file = sst_file_handle_with_file_id(
        FileId::random(),
        range_start.value(),
        range_end.value(),
    );

    let input = ScanInput::new(env.access_layer.clone(), mapper)
        .with_predicate(predicate)
        .with_time_range(query_time_range)
        .with_files(vec![file])
        .with_cache(test_cache_strategy())
        .with_flat_format(true);
    let scan_fingerprint = crate::read::scan_region::build_scan_fingerprint(&input);

    let range_meta = RangeMeta {
        time_range: partition_time_range,
        indices: smallvec![SourceIndex {
            index: 0,
            num_row_groups: 1,
        }],
        row_group_indices: smallvec![RowGroupIndex {
            index: 0,
            row_group_index: 0,
        }],
        num_rows: 10,
    };
    let partition_range = range_meta.new_partition_range(0);

    let stream_ctx = StreamContext {
        input,
        ranges: vec![range_meta],
        scan_fingerprint,
        query_start: Instant::now(),
    };
    (stream_ctx, partition_range)
}
/// Builds a literal expression holding a millisecond timestamp (without a
/// time zone) of value `val`.
fn ts_lit(val: i64) -> Expr {
    let scalar = ScalarValue::TimestampMillisecond(Some(val), None);
    lit(scalar)
}
#[tokio::test]
async fn strips_time_only_filters_when_query_covers_partition_range() {
    let filters = vec![
        col("ts").gt_eq(ts_lit(1000)),
        col("ts").lt(ts_lit(2001)),
        col("ts").is_not_null(),
        col("k0").eq(lit("foo")),
    ];
    let query_range = TimestampRange::with_unit(1000, 2002, TimeUnit::Millisecond);
    let partition_range = (
        Timestamp::new_millisecond(1000),
        Timestamp::new_millisecond(2000),
    );
    let (stream_ctx, part_range) =
        new_stream_context(filters, query_range, partition_range).await;

    let key = build_range_cache_key(&stream_ctx, &part_range).unwrap();

    // Range-reducible time filters should be cleared when query covers partition range.
    assert!(key.scan.time_filters().is_empty());

    // Non-range time predicates stay in filters.
    let mut remaining = [
        col("k0").eq(lit("foo")).to_string(),
        col("ts").is_not_null().to_string(),
    ];
    remaining.sort_unstable();
    assert_eq!(key.scan.filters(), remaining.as_slice());
}
/// When the query time range ends before the partition range does, the time
/// filter must remain part of the cache key.
#[tokio::test]
async fn preserves_time_filters_when_query_does_not_cover_partition_range() {
    let time_filter = col("ts").gt_eq(ts_lit(1000));
    let tag_filter = col("k0").eq(lit("foo"));
    let expected_time_filters = [time_filter.to_string()];
    let expected_filters = [tag_filter.to_string()];

    let (stream_ctx, part_range) = new_stream_context(
        vec![time_filter, tag_filter],
        TimestampRange::with_unit(1000, 1500, TimeUnit::Millisecond),
        (
            Timestamp::new_millisecond(1000),
            Timestamp::new_millisecond(2000),
        ),
    )
    .await;

    let key = build_range_cache_key(&stream_ctx, &part_range).unwrap();

    // The query only reaches 1500 while the partition reaches 2000, so nothing is stripped.
    assert_eq!(key.scan.time_filters(), expected_time_filters.as_slice());
    assert_eq!(key.scan.filters(), expected_filters.as_slice());
}
/// Without any query time range, range bounds on `ts` are dropped from the
/// cache key while the remaining predicates are kept.
#[tokio::test]
async fn strips_time_only_filters_when_query_has_no_time_range_limit() {
    let filters = vec![
        col("ts").gt_eq(ts_lit(1000)),
        col("ts").is_not_null(),
        col("k0").eq(lit("foo")),
    ];
    let partition_time_range = (
        Timestamp::new_millisecond(1000),
        Timestamp::new_millisecond(2000),
    );
    let (stream_ctx, part_range) = new_stream_context(filters, None, partition_time_range).await;

    let key = build_range_cache_key(&stream_ctx, &part_range).unwrap();

    // The lower bound on `ts` must not survive as a time filter.
    assert!(key.scan.time_filters().is_empty());

    // `ts IS NOT NULL` is not a range bound, so it stays next to the tag filter.
    let mut expected_filters = [
        col("k0").eq(lit("foo")).to_string(),
        col("ts").is_not_null().to_string(),
    ];
    expected_filters.sort_unstable();
    assert_eq!(key.scan.filters(), expected_filters.as_slice());
}
#[test]
fn normalizes_and_clears_time_filters() {
@@ -249,4 +687,170 @@ mod tests {
fingerprint.partition_expr_version
);
}
/// Builds the five-column schema used by the `CacheBatchBuffer` tests.
///
/// The primary key dictionary column sits at index 2 (`num_columns - 3`),
/// matching the flat format layout:
/// `[field0: Int64, field1: Int64, pk: Dictionary<UInt32,Binary>, ts: Int64, seq: Int64]`.
/// Every column is non-nullable.
fn dict_test_schema() -> Arc<datatypes::arrow::datatypes::Schema> {
    use datatypes::arrow::datatypes::{DataType as ArrowDataType, Field, Schema};

    let int64_field = |name: &str| Field::new(name, ArrowDataType::Int64, false);
    let pk_type = ArrowDataType::Dictionary(
        Box::new(ArrowDataType::UInt32),
        Box::new(ArrowDataType::Binary),
    );
    let fields = vec![
        int64_field("field0"),
        int64_field("field1"),
        Field::new("pk", pk_type, false),
        int64_field("ts"),
        int64_field("seq"),
    ];
    Arc::new(Schema::new(fields))
}
/// Builds a record batch whose column 2 is a dictionary over `dict_values`.
///
/// `keys` become the dictionary keys, `int_values` fill column 1, and the
/// three remaining columns are zero-filled with `int_values.len()` rows.
/// Panics if the columns do not fit `schema`.
fn make_dict_batch(
    schema: Arc<datatypes::arrow::datatypes::Schema>,
    dict_values: &datatypes::arrow::array::BinaryArray,
    keys: &[u32],
    int_values: &[i64],
) -> RecordBatch {
    use datatypes::arrow::array::{Int64Array, UInt32Array};

    let zeros = Int64Array::from(vec![0i64; int_values.len()]);
    let values = Int64Array::from(int_values.to_vec());
    // Cloning the values array lets batches built from the same `dict_values` share it.
    let primary_keys: DictionaryArray<UInt32Type> = DictionaryArray::new(
        UInt32Array::from(keys.to_vec()),
        Arc::new(dict_values.clone()),
    );

    RecordBatch::try_new(
        schema,
        vec![
            Arc::new(zeros.clone()),
            Arc::new(values),
            Arc::new(primary_keys),
            Arc::new(zeros.clone()),
            Arc::new(zeros),
        ],
    )
    .unwrap()
}
/// Sums `get_array_memory_size()` over the values arrays of every
/// `DictionaryArray<UInt32Type>` column in `batch`; other columns are ignored.
fn compute_total_dict_values_size(batch: &RecordBatch) -> usize {
    let mut total = 0;
    for column in batch.columns() {
        let dictionary = column
            .as_any()
            .downcast_ref::<DictionaryArray<UInt32Type>>();
        if let Some(dictionary) = dictionary {
            total += dictionary.values().get_array_memory_size();
        }
    }
    total
}
/// A fresh buffer reports a size of zero and yields no batches.
#[test]
fn cache_batch_buffer_empty() {
    let empty = CacheBatchBuffer::new();
    let estimated = empty.estimated_batches_size();
    assert_eq!(estimated, 0);
    let batches = empty.into_batches();
    assert!(batches.is_empty());
}
/// A single pushed batch is accounted for at its full array memory size.
#[test]
fn cache_batch_buffer_single_batch() {
    use datatypes::arrow::array::BinaryArray;

    let dict_values = BinaryArray::from_vec(vec![b"a", b"b", b"c"]);
    let batch = make_dict_batch(dict_test_schema(), &dict_values, &[0, 1, 2], &[10, 20, 30]);
    let expected_size = batch.get_array_memory_size();

    let mut buffer = CacheBatchBuffer::new();
    buffer.push(batch);

    assert_eq!(buffer.estimated_batches_size(), expected_size);
    let batches = buffer.into_batches();
    assert_eq!(batches.len(), 1);
}
/// Two batches backed by the same dictionary values array: the values are
/// counted once, so the second batch contributes its size minus the dictionary values.
#[test]
fn cache_batch_buffer_shared_dictionary() {
    use datatypes::arrow::array::BinaryArray;

    let schema = dict_test_schema();
    let dict_values = BinaryArray::from_vec(vec![b"alpha", b"beta", b"gamma"]);
    let first = make_dict_batch(schema.clone(), &dict_values, &[0, 1], &[10, 20]);
    let second = make_dict_batch(schema, &dict_values, &[1, 2], &[30, 40]);

    // Size of the dictionary values that must not be counted again for `second`.
    let shared_size = compute_total_dict_values_size(&second);
    let expected_size =
        first.get_array_memory_size() + second.get_array_memory_size() - shared_size;

    let mut buffer = CacheBatchBuffer::new();
    buffer.push(first);
    buffer.push(second);

    assert_eq!(buffer.estimated_batches_size(), expected_size);
    assert_eq!(buffer.into_batches().len(), 2);
}
/// Two batches with distinct dictionary values arrays are both counted in full.
#[test]
fn cache_batch_buffer_non_shared_dictionary() {
    use datatypes::arrow::array::BinaryArray;

    let schema = dict_test_schema();
    let first_values = BinaryArray::from_vec(vec![b"a", b"b"]);
    let second_values = BinaryArray::from_vec(vec![b"x", b"y"]);
    let first = make_dict_batch(schema.clone(), &first_values, &[0, 1], &[10, 20]);
    let second = make_dict_batch(schema, &second_values, &[0, 1], &[30, 40]);

    // No dictionary is shared, so nothing is deducted.
    let expected_size = first.get_array_memory_size() + second.get_array_memory_size();

    let mut buffer = CacheBatchBuffer::new();
    buffer.push(first);
    buffer.push(second);

    assert_eq!(buffer.estimated_batches_size(), expected_size);
}
/// The second batch shares the first batch's dictionary and the third brings a new
/// one: only the second batch's dictionary values are deducted from the estimate.
#[test]
fn cache_batch_buffer_shared_then_diverged() {
    use datatypes::arrow::array::BinaryArray;

    let schema = dict_test_schema();
    let shared_values = BinaryArray::from_vec(vec![b"a", b"b", b"c"]);
    let different_values = BinaryArray::from_vec(vec![b"x", b"y"]);

    let first = make_dict_batch(schema.clone(), &shared_values, &[0], &[1]);
    let second = make_dict_batch(schema.clone(), &shared_values, &[1], &[2]);
    let third = make_dict_batch(schema, &different_values, &[0], &[3]);

    let first_size = first.get_array_memory_size();
    let second_size = second.get_array_memory_size();
    let third_size = third.get_array_memory_size();
    let shared_size = compute_total_dict_values_size(&second);

    let mut buffer = CacheBatchBuffer::new();
    for batch in [first, second, third] {
        buffer.push(batch);
    }

    // `second` is deduplicated against `first`; `third` is counted in full.
    assert_eq!(
        buffer.estimated_batches_size(),
        first_size + (second_size - shared_size) + third_size
    );
}
}

View File

@@ -40,7 +40,7 @@ use store_api::region_engine::{PartitionRange, RegionScannerRef};
use store_api::storage::{
ColumnId, RegionId, ScanRequest, SequenceRange, TimeSeriesDistribution, TimeSeriesRowSelector,
};
use table::predicate::{Predicate, build_time_range_predicate};
use table::predicate::{Predicate, build_time_range_predicate, extract_time_range_from_expr};
use tokio::sync::{Semaphore, mpsc};
use tokio_stream::wrappers::ReceiverStream;
@@ -1420,7 +1420,6 @@ fn pre_filter_mode(append_mode: bool, merge_mode: MergeMode) -> PreFilterMode {
/// Builds a [ScanRequestFingerprint] from a [ScanInput] if the scan is eligible
/// for partition range caching.
#[cfg_attr(not(test), allow(dead_code))]
pub(crate) fn build_scan_fingerprint(input: &ScanInput) -> Option<ScanRequestFingerprint> {
let eligible = input.flat_format
&& !input.compaction
@@ -1439,7 +1438,14 @@ pub(crate) fn build_scan_fingerprint(input: &ScanInput) -> Option<ScanRequestFin
.map(|col| col.column_schema.name.as_str())
.collect();
let time_index_name = metadata.time_index_column().column_schema.name.clone();
let time_index = metadata.time_index_column();
let time_index_name = time_index.column_schema.name.clone();
let ts_col_unit = time_index
.column_schema
.data_type
.as_timestamp()
.expect("Time index must have timestamp-compatible type")
.unit();
let exprs = input
.predicate_group()
@@ -1464,9 +1470,16 @@ pub(crate) fn build_scan_fingerprint(input: &ScanInput) -> Option<ScanRequestFin
_ => false,
};
if is_time_only {
if is_time_only
&& extract_time_range_from_expr(&time_index_name, ts_col_unit, expr).is_some()
{
// Range-reducible time predicates can be safely dropped from the
// cache key when the query time range covers the partition range.
time_filters.push(expr.to_string());
} else {
// Non-time filters and non-range time predicates (those that
// extract_time_range_from_expr cannot convert to a TimestampRange)
// always stay in the cache key.
filters.push(expr.to_string());
}
}
@@ -1511,6 +1524,10 @@ pub struct StreamContext {
pub input: ScanInput,
/// Metadata for partition ranges.
pub(crate) ranges: Vec<RangeMeta>,
/// Precomputed scan fingerprint for partition range caching.
/// `None` when the scan is not eligible for caching.
#[allow(dead_code)]
pub(crate) scan_fingerprint: Option<ScanRequestFingerprint>,
// Metrics:
/// The start time of the query.
@@ -1523,10 +1540,12 @@ impl StreamContext {
let query_start = input.query_start.unwrap_or_else(Instant::now);
let ranges = RangeMeta::seq_scan_ranges(&input);
READ_SST_COUNT.observe(input.num_files() as f64);
let scan_fingerprint = build_scan_fingerprint(&input);
Self {
input,
ranges,
scan_fingerprint,
query_start,
}
}
@@ -1536,10 +1555,12 @@ impl StreamContext {
let query_start = input.query_start.unwrap_or_else(Instant::now);
let ranges = RangeMeta::unordered_scan_ranges(&input);
READ_SST_COUNT.observe(input.num_files() as f64);
let scan_fingerprint = build_scan_fingerprint(&input);
Self {
input,
ranges,
scan_fingerprint,
query_start,
}
}
@@ -1849,6 +1870,7 @@ mod tests {
use std::sync::Arc;
use datafusion::physical_plan::expressions::lit as physical_lit;
use datafusion_common::ScalarValue;
use datafusion_expr::{col, lit};
use datatypes::value::Value;
use partition::expr::col as partition_col;
@@ -2035,13 +2057,18 @@ mod tests {
assert!(scan_region.use_flat_format());
}
/// Wraps `val` in a `TimestampMillisecond` literal expression without a time zone.
fn ts_lit(val: i64) -> datafusion_expr::Expr {
    let scalar = ScalarValue::TimestampMillisecond(Some(val), None);
    lit(scalar)
}
#[tokio::test]
async fn test_build_scan_fingerprint_for_eligible_scan() {
let metadata = Arc::new(metadata_with_primary_key(vec![0, 1], false));
let input = new_scan_input(
metadata.clone(),
vec![
col("ts").gt_eq(lit(1000)),
col("ts").gt_eq(ts_lit(1000)),
col("k0").eq(lit("foo")),
col("v0").gt(lit(1)),
],
@@ -2071,7 +2098,7 @@ mod tests {
col("k0").eq(lit("foo")).to_string(),
col("v0").gt(lit(1)).to_string(),
],
time_filters: vec![col("ts").gt_eq(lit(1000)).to_string()],
time_filters: vec![col("ts").gt_eq(ts_lit(1000)).to_string()],
series_row_selector: Some(TimeSeriesRowSelector::LastRow),
append_mode: false,
filter_deleted: false,

View File

@@ -247,6 +247,12 @@ pub(crate) struct ScanMetricsSet {
num_range_builders: isize,
/// Peak number of file range builders.
num_peak_range_builders: isize,
/// Total bytes added to the range cache during this scan.
range_cache_size: usize,
/// Number of range cache hits during this scan.
range_cache_hit: usize,
/// Number of range cache misses during this scan.
range_cache_miss: usize,
}
/// Wrapper for file metrics that compares by total cost in reverse order.
@@ -345,6 +351,9 @@ impl fmt::Debug for ScanMetricsSet {
build_ranges_peak_mem_size,
num_range_builders: _,
num_peak_range_builders,
range_cache_size,
range_cache_hit,
range_cache_miss,
} = self;
// Write core metrics
@@ -590,6 +599,16 @@ impl fmt::Debug for ScanMetricsSet {
write!(f, "}}")?;
}
if *range_cache_size > 0 {
write!(f, ", \"range_cache_size\":{range_cache_size}")?;
}
if *range_cache_hit > 0 {
write!(f, ", \"range_cache_hit\":{range_cache_hit}")?;
}
if *range_cache_miss > 0 {
write!(f, ", \"range_cache_miss\":{range_cache_miss}")?;
}
write!(
f,
", \"build_ranges_peak_mem_size\":{build_ranges_peak_mem_size}, \
@@ -1097,6 +1116,27 @@ impl PartitionMetrics {
pub(crate) fn dedup_metrics_reporter(&self) -> Arc<dyn DedupMetricsReport> {
self.0.clone()
}
/// Adds `size` to the total bytes added to the range cache.
///
/// Panics if the metrics mutex is poisoned.
#[allow(dead_code)]
pub(crate) fn inc_range_cache_size(&self, size: usize) {
    // The guard is a temporary, so the lock is released at the end of the statement.
    self.0.metrics.lock().unwrap().range_cache_size += size;
}
/// Records one range cache hit.
///
/// Panics if the metrics mutex is poisoned.
#[allow(dead_code)]
pub(crate) fn inc_range_cache_hit(&self) {
    // The guard is a temporary, so the lock is released at the end of the statement.
    self.0.metrics.lock().unwrap().range_cache_hit += 1;
}
/// Records one range cache miss.
///
/// Panics if the metrics mutex is poisoned.
#[allow(dead_code)]
pub(crate) fn inc_range_cache_miss(&self) {
    // The guard is a temporary, so the lock is released at the end of the statement.
    self.0.metrics.lock().unwrap().range_cache_miss += 1;
}
}
impl fmt::Debug for PartitionMetrics {

View File

@@ -99,7 +99,8 @@ impl ConvertBatchStream {
let mapper = self.projection_mapper.as_flat().unwrap();
for batch in flat_batch.batches {
self.pending.push_back(mapper.convert(&batch)?);
self.pending
.push_back(mapper.convert(&batch, &self.cache_strategy)?);
}
}
}
@@ -114,7 +115,7 @@ impl ConvertBatchStream {
// Safety: Only flat format returns this batch.
let mapper = self.projection_mapper.as_flat().unwrap();
mapper.convert(&df_record_batch)
mapper.convert(&df_record_batch, &self.cache_strategy)
}
}
}

View File

@@ -29,6 +29,7 @@ pub mod flat_format;
pub mod format;
pub(crate) mod helper;
pub(crate) mod metadata;
pub mod prefilter;
pub mod reader;
pub mod row_group;
pub mod row_selection;

View File

@@ -0,0 +1,528 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//! Helpers for parquet prefiltering.
use std::ops::Range;
use api::v1::SemanticType;
use common_recordbatch::filter::SimpleFilterEvaluator;
use datatypes::arrow::array::{BinaryArray, BooleanArray};
use datatypes::arrow::record_batch::RecordBatch;
use mito_codec::primary_key_filter::is_partition_column;
use mito_codec::row_converter::PrimaryKeyFilter;
use snafu::{OptionExt, ResultExt};
use store_api::metadata::{RegionMetadata, RegionMetadataRef};
use crate::error::{ComputeArrowSnafu, Result, UnexpectedSnafu};
use crate::sst::parquet::flat_format::primary_key_column_index;
use crate::sst::parquet::format::PrimaryKeyArray;
/// Returns the row ranges of `input` whose primary key is accepted by `pk_filter`.
///
/// The primary key column is read as a dictionary array. Each run of consecutive
/// rows with the same dictionary key is evaluated with a single call to
/// `pk_filter`, and matching runs that touch each other are merged into one range.
/// If the key array is empty, the single range `0..input.num_rows()` is returned.
///
/// # Errors
///
/// Returns an `Unexpected` error if the primary key column is not a
/// [`PrimaryKeyArray`] or its values are not a [`BinaryArray`].
#[cfg_attr(not(test), allow(dead_code))]
pub(crate) fn matching_row_ranges_by_primary_key(
    input: &RecordBatch,
    pk_filter: &mut dyn PrimaryKeyFilter,
) -> Result<Vec<Range<usize>>> {
    let pk_column = input.column(primary_key_column_index(input.num_columns()));
    let pk_dict_array = pk_column
        .as_any()
        .downcast_ref::<PrimaryKeyArray>()
        .context(UnexpectedSnafu {
            reason: "Primary key column is not a dictionary array",
        })?;
    let pk_values = pk_dict_array
        .values()
        .as_any()
        .downcast_ref::<BinaryArray>()
        .context(UnexpectedSnafu {
            reason: "Primary key values are not binary array",
        })?;

    let key_values = pk_dict_array.keys().values();
    if key_values.is_empty() {
        return Ok(std::iter::once(0..input.num_rows()).collect());
    }

    let mut matched: Vec<Range<usize>> = Vec::new();
    let mut run_start = 0;
    // Each chunk is a maximal run of consecutive rows with the same dictionary key.
    for run in key_values.chunk_by(|lhs, rhs| lhs == rhs) {
        let run_end = run_start + run.len();
        if pk_filter.matches(pk_values.value(run[0] as usize)) {
            match matched.last_mut() {
                // Extend the previous range when this run starts right where it ends.
                Some(previous) if previous.end == run_start => previous.end = run_end,
                _ => matched.push(run_start..run_end),
            }
        }
        run_start = run_end;
    }

    Ok(matched)
}
/// Keeps only the rows of `input` whose primary key is accepted by `pk_filter`.
///
/// Returns `Ok(None)` when no row matches. An empty batch and a batch in which
/// every row matches are returned unchanged; a single contiguous match is
/// returned as a slice of `input`; scattered matches go through a boolean mask.
///
/// # Errors
///
/// Propagates errors from [`matching_row_ranges_by_primary_key`] and from the
/// arrow filter kernel.
#[cfg_attr(not(test), allow(dead_code))]
pub(crate) fn prefilter_flat_batch_by_primary_key(
    input: RecordBatch,
    pk_filter: &mut dyn PrimaryKeyFilter,
) -> Result<Option<RecordBatch>> {
    let num_rows = input.num_rows();
    if num_rows == 0 {
        return Ok(Some(input));
    }

    let ranges = matching_row_ranges_by_primary_key(&input, pk_filter)?;
    match ranges.as_slice() {
        [] => return Ok(None),
        // Everything matched: hand the batch back untouched.
        [only] if only.start == 0 && only.end == num_rows => return Ok(Some(input)),
        // One contiguous match: slice the batch instead of building a mask.
        [only] => return Ok(Some(input.slice(only.start, only.end - only.start))),
        _ => {}
    }

    let mut keep = vec![false; num_rows];
    for range in ranges {
        keep[range].fill(true);
    }

    let filtered =
        datatypes::arrow::compute::filter_record_batch(&input, &BooleanArray::from(keep))
            .context(ComputeArrowSnafu)?;
    Ok((filtered.num_rows() > 0).then_some(filtered))
}
/// Removes from `filters` every filter for which
/// [`is_usable_primary_key_filter`] returns `false`, keeping the rest in order.
#[cfg_attr(not(test), allow(dead_code))]
pub(crate) fn retain_usable_primary_key_filters(
    sst_metadata: &RegionMetadataRef,
    expected_metadata: Option<&RegionMetadata>,
    filters: &mut Vec<SimpleFilterEvaluator>,
) {
    filters.retain(|candidate| {
        is_usable_primary_key_filter(sst_metadata, expected_metadata, candidate)
    });
}
/// Tells whether `filter` can be evaluated against the primary key of an SST.
///
/// The filter is usable when its column is not the partition column, resolves to
/// a column of `sst_metadata`, and that column is a tag that is part of the SST's
/// primary key. With `expected_metadata`, the column is resolved by name there and
/// then by column id in the SST; the two definitions must have the same name,
/// semantic type and data type. Without it, the column is resolved by name in the SST.
#[cfg_attr(not(test), allow(dead_code))]
pub(crate) fn is_usable_primary_key_filter(
    sst_metadata: &RegionMetadataRef,
    expected_metadata: Option<&RegionMetadata>,
    filter: &SimpleFilterEvaluator,
) -> bool {
    let column_name = filter.column_name();

    // TODO(yingwen): The primary key filter always skips the partition column. Consider using a flag
    // to control this behavior. We can remove this behavior after we remove the PartitionTreeMemtable.
    if is_partition_column(column_name) {
        return false;
    }

    let sst_column = if let Some(expected_metadata) = expected_metadata {
        let Some(expected_column) = expected_metadata.column_by_name(column_name) else {
            return false;
        };
        let Some(sst_column) = sst_metadata.column_by_id(expected_column.column_id) else {
            return false;
        };
        // The column id must refer to the same definition on both sides.
        let same_definition = sst_column.column_schema.name == expected_column.column_schema.name
            && sst_column.semantic_type == expected_column.semantic_type
            && sst_column.column_schema.data_type == expected_column.column_schema.data_type;
        if !same_definition {
            return false;
        }
        sst_column
    } else {
        let Some(sst_column) = sst_metadata.column_by_name(column_name) else {
            return false;
        };
        sst_column
    };

    if sst_column.semantic_type != SemanticType::Tag {
        return false;
    }
    sst_metadata
        .primary_key_index(sst_column.column_id)
        .is_some()
}
/// A [`PrimaryKeyFilter`] wrapper that remembers the most recently evaluated
/// primary key and its result, so that asking about the same key again does not
/// call the wrapped filter.
#[cfg_attr(not(test), allow(dead_code))]
pub(crate) struct CachedPrimaryKeyFilter {
/// The wrapped filter that performs the actual evaluation.
inner: Box<dyn PrimaryKeyFilter>,
/// Bytes of the primary key passed to the last evaluation.
last_primary_key: Vec<u8>,
/// Result of the last evaluation; `None` until the first evaluation.
last_match: Option<bool>,
}
impl CachedPrimaryKeyFilter {
/// Wraps `inner` with an empty cache.
#[cfg_attr(not(test), allow(dead_code))]
pub(crate) fn new(inner: Box<dyn PrimaryKeyFilter>) -> Self {
Self {
inner,
last_primary_key: Vec::new(),
last_match: None,
}
}
}
impl PrimaryKeyFilter for CachedPrimaryKeyFilter {
    /// Returns the cached result when `pk` equals the previously evaluated key;
    /// otherwise evaluates the wrapped filter and caches `pk` with its result.
    fn matches(&mut self, pk: &[u8]) -> bool {
        match self.last_match {
            Some(cached) if self.last_primary_key.as_slice() == pk => cached,
            _ => {
                let verdict = self.inner.matches(pk);
                // Reuse the existing buffer for the new key.
                self.last_primary_key.clear();
                self.last_primary_key.extend_from_slice(pk);
                self.last_match = Some(verdict);
                verdict
            }
        }
    }
}
/// Returns the primary key of `batch` when its first and last rows use the same
/// dictionary key, and `None` when they differ or the batch has no keys.
///
/// Only the first and last keys are compared.
/// NOTE(review): this presumably relies on rows being grouped by primary key — confirm with callers.
///
/// # Errors
///
/// Returns an `Unexpected` error if the primary key column is not a
/// [`PrimaryKeyArray`] or its values are not a [`BinaryArray`].
#[cfg_attr(not(test), allow(dead_code))]
pub(crate) fn batch_single_primary_key(batch: &RecordBatch) -> Result<Option<&[u8]>> {
    let pk_column = batch.column(primary_key_column_index(batch.num_columns()));
    let pk_dict_array = pk_column
        .as_any()
        .downcast_ref::<PrimaryKeyArray>()
        .context(UnexpectedSnafu {
            reason: "Primary key column is not a dictionary array",
        })?;
    let pk_values = pk_dict_array
        .values()
        .as_any()
        .downcast_ref::<BinaryArray>()
        .context(UnexpectedSnafu {
            reason: "Primary key values are not binary array",
        })?;

    let key_values = pk_dict_array.keys().values();
    match (key_values.first(), key_values.last()) {
        (Some(&first_key), Some(&last_key)) if first_key == last_key => {
            Ok(Some(pk_values.value(first_key as usize)))
        }
        _ => Ok(None),
    }
}
#[cfg(test)]
mod tests {
use std::sync::Arc;
use std::sync::atomic::{AtomicUsize, Ordering};
use api::v1::SemanticType;
use common_recordbatch::filter::SimpleFilterEvaluator;
use datafusion_expr::{col, lit};
use datatypes::arrow::array::{
ArrayRef, BinaryArray, DictionaryArray, TimestampMillisecondArray, UInt8Array, UInt32Array,
UInt64Array,
};
use datatypes::arrow::datatypes::{Schema, UInt32Type};
use datatypes::arrow::record_batch::RecordBatch;
use datatypes::prelude::ConcreteDataType;
use mito_codec::row_converter::{PrimaryKeyFilter, build_primary_key_codec};
use store_api::codec::PrimaryKeyEncoding;
use store_api::metadata::{ColumnMetadata, RegionMetadata, RegionMetadataBuilder};
use store_api::storage::ColumnSchema;
use super::*;
use crate::sst::internal_fields;
use crate::sst::parquet::format::ReadFormat;
use crate::test_util::sst_util::{
new_primary_key, sst_region_metadata, sst_region_metadata_with_encoding,
};
/// Converts `exprs` into simple filter evaluators, silently dropping every
/// expression for which `SimpleFilterEvaluator::try_new` returns `None`.
fn new_test_filters(exprs: &[datafusion_expr::Expr]) -> Vec<SimpleFilterEvaluator> {
    let mut evaluators = Vec::new();
    for expr in exprs {
        if let Some(evaluator) = SimpleFilterEvaluator::try_new(expr) {
            evaluators.push(evaluator);
        }
    }
    evaluators
}
/// Builds region metadata for `old_metadata`'s region in which the name `tag_0`
/// belongs to column id 10, next to `tag_1` (id 1), `field_0` (id 2) and `ts`
/// (id 3). The primary key is `[10, 1]`.
fn expected_metadata_with_reused_tag_name(
    old_metadata: &RegionMetadata,
) -> Arc<RegionMetadata> {
    let column = |name: &str, data_type, is_nullable, semantic_type, column_id| ColumnMetadata {
        column_schema: ColumnSchema::new(name.to_string(), data_type, is_nullable),
        semantic_type,
        column_id,
    };

    let mut builder = RegionMetadataBuilder::new(old_metadata.region_id);
    builder
        .push_column_metadata(column(
            "tag_0",
            ConcreteDataType::string_datatype(),
            true,
            SemanticType::Tag,
            10,
        ))
        .push_column_metadata(column(
            "tag_1",
            ConcreteDataType::string_datatype(),
            true,
            SemanticType::Tag,
            1,
        ))
        .push_column_metadata(column(
            "field_0",
            ConcreteDataType::uint64_datatype(),
            true,
            SemanticType::Field,
            2,
        ))
        .push_column_metadata(column(
            "ts",
            ConcreteDataType::timestamp_millisecond_datatype(),
            false,
            SemanticType::Timestamp,
            3,
        ))
        .primary_key(vec![10, 1]);

    let metadata = builder.build().unwrap();
    Arc::new(metadata)
}
/// Builds a record batch with the columns `field_0`, `ts`, followed by the
/// internal fields, taking the field definitions from `metadata`.
///
/// Row `i` carries `field_values[i]`, timestamp `i`, and a dictionary-encoded
/// `primary_keys[i]`; the two remaining columns are filled with `1`. Distinct
/// primary keys get dictionary entries in order of first appearance.
/// Panics if the two slices differ in length.
fn new_raw_batch_with_metadata(
    metadata: Arc<RegionMetadata>,
    primary_keys: &[&[u8]],
    field_values: &[u64],
) -> RecordBatch {
    let num_rows = primary_keys.len();
    assert_eq!(num_rows, field_values.len());

    let arrow_schema = metadata.schema.arrow_schema();
    let field_named =
        |name: &str| arrow_schema.field(arrow_schema.index_of(name).unwrap()).clone();
    let fields: Vec<_> = [field_named("field_0"), field_named("ts")]
        .into_iter()
        .chain(
            internal_fields()
                .into_iter()
                .map(|field| field.as_ref().clone()),
        )
        .collect();
    let schema = Arc::new(Schema::new(fields));

    // Deduplicate primary keys into dictionary values plus per-row keys.
    let mut dict_values: Vec<&[u8]> = Vec::new();
    let mut keys = Vec::with_capacity(num_rows);
    for &pk in primary_keys {
        let key = match dict_values.iter().position(|existing| *existing == pk) {
            Some(index) => index,
            None => {
                dict_values.push(pk);
                dict_values.len() - 1
            }
        };
        keys.push(key as u32);
    }

    let pk_array: ArrayRef = Arc::new(DictionaryArray::<UInt32Type>::new(
        UInt32Array::from(keys),
        Arc::new(BinaryArray::from_iter_values(dict_values.iter().copied())),
    ));

    RecordBatch::try_new(
        schema,
        vec![
            Arc::new(UInt64Array::from(field_values.to_vec())),
            Arc::new(TimestampMillisecondArray::from_iter_values(
                0..num_rows as i64,
            )),
            pk_array,
            Arc::new(UInt64Array::from(vec![1; num_rows])),
            Arc::new(UInt8Array::from(vec![1; num_rows])),
        ],
    )
    .unwrap()
}
/// Same as [`new_raw_batch_with_metadata`], using `sst_region_metadata()`.
fn new_raw_batch(primary_keys: &[&[u8]], field_values: &[u64]) -> RecordBatch {
    let metadata = Arc::new(sst_region_metadata());
    new_raw_batch_with_metadata(metadata, primary_keys, field_values)
}
/// Copies column 0 of `batch` out as `u64` values. Panics if that column is not
/// a `UInt64Array`.
fn field_values(batch: &RecordBatch) -> Vec<u64> {
    let first_column = batch
        .column(0)
        .as_any()
        .downcast_ref::<UInt64Array>()
        .unwrap();
    first_column.values().iter().copied().collect()
}
/// Filters on a field column and on the time index are both discarded.
#[test]
fn test_retain_usable_primary_key_filters_skips_non_tag_filters() {
    let metadata = Arc::new(sst_region_metadata());
    let field_filter = col("field_0").eq(lit(1_u64));
    let time_filter = col("ts").gt(lit(0_i64));
    let mut filters = new_test_filters(&[field_filter, time_filter]);

    retain_usable_primary_key_filters(&metadata, None, &mut filters);

    assert!(filters.is_empty());
}
/// A filter on `tag_0` is discarded when the expected metadata assigns that name
/// to column id 10.
#[test]
fn test_retain_usable_primary_key_filters_skips_reused_expected_tag_name() {
    let metadata = Arc::new(sst_region_metadata());
    let expected_metadata = expected_metadata_with_reused_tag_name(&metadata);
    let tag_filter = col("tag_0").eq(lit("b"));
    let mut filters = new_test_filters(&[tag_filter]);

    let expected = Some(expected_metadata.as_ref());
    retain_usable_primary_key_filters(&metadata, expected, &mut filters);

    assert!(filters.is_empty());
}
/// With sparse primary key encoding and a flat read format, a filter on `tag_0`
/// is reported as usable.
///
/// NOTE(review): the test name says "skips", yet the assertion expects the filter
/// to be usable — confirm which of the two is intended.
#[test]
fn test_is_usable_primary_key_filter_skips_legacy_primary_key_batches() {
    let metadata = Arc::new(sst_region_metadata_with_encoding(
        PrimaryKeyEncoding::Sparse,
    ));

    // Sanity check: this metadata can be read with the flat format.
    let column_ids = metadata.column_metadatas.iter().map(|c| c.column_id);
    let read_format =
        ReadFormat::new_flat(metadata.clone(), column_ids, None, "test", true).unwrap();
    assert!(read_format.as_flat().is_some());

    let tag_expr = col("tag_0").eq(lit("b"));
    let filter = SimpleFilterEvaluator::try_new(&tag_expr).unwrap();
    assert!(is_usable_primary_key_filter(&metadata, None, &filter));
}
/// A batch holding a single primary key that fails the filter is dropped entirely.
#[test]
fn test_prefilter_primary_key_drops_single_dictionary_batch() {
    let metadata = Arc::new(sst_region_metadata());
    let filters = Arc::new(new_test_filters(&[col("tag_0").eq(lit("b"))]));
    let codec = build_primary_key_codec(metadata.as_ref());
    let mut primary_key_filter = codec.primary_key_filter(&metadata, filters);

    // Every row uses tag_0 = "a", which the filter rejects.
    let pk_a = new_primary_key(&["a", "x"]);
    let batch = new_raw_batch(&[pk_a.as_slice(), pk_a.as_slice()], &[10, 11]);

    let filtered =
        prefilter_flat_batch_by_primary_key(batch, primary_key_filter.as_mut()).unwrap();
    assert!(filtered.is_none());
}
/// Matches that are not contiguous (keys `a` and `c` out of `a, b, c, d`) are
/// gathered into one batch holding only the matching rows.
#[test]
fn test_prefilter_primary_key_builds_mask_for_fragmented_matches() {
    let metadata = Arc::new(sst_region_metadata());
    let tag_filter = col("tag_0")
        .eq(lit("a"))
        .or(col("tag_0").eq(lit("c")));
    let filters = Arc::new(new_test_filters(&[tag_filter]));
    let codec = build_primary_key_codec(metadata.as_ref());
    let mut primary_key_filter = codec.primary_key_filter(&metadata, filters);

    let pk_a = new_primary_key(&["a", "x"]);
    let pk_b = new_primary_key(&["b", "x"]);
    let pk_c = new_primary_key(&["c", "x"]);
    let pk_d = new_primary_key(&["d", "x"]);
    // Two rows per key, in key order.
    let primary_keys = [
        pk_a.as_slice(),
        pk_a.as_slice(),
        pk_b.as_slice(),
        pk_b.as_slice(),
        pk_c.as_slice(),
        pk_c.as_slice(),
        pk_d.as_slice(),
        pk_d.as_slice(),
    ];
    let batch = new_raw_batch(&primary_keys, &[10, 11, 12, 13, 14, 15, 16, 17]);

    let filtered = prefilter_flat_batch_by_primary_key(batch, primary_key_filter.as_mut())
        .unwrap()
        .unwrap();

    assert_eq!(filtered.num_rows(), 4);
    assert_eq!(field_values(&filtered), vec![10, 11, 14, 15]);
}
/// Test filter that counts how often it is invoked and accepts exactly one key.
struct CountingPrimaryKeyFilter {
    /// Number of `matches` calls, shared with the test.
    hits: Arc<AtomicUsize>,
    /// The only primary key that is accepted.
    expected: Vec<u8>,
}

impl PrimaryKeyFilter for CountingPrimaryKeyFilter {
    /// Bumps the call counter, then reports whether `pk` equals the expected key.
    fn matches(&mut self, pk: &[u8]) -> bool {
        let _previous = self.hits.fetch_add(1, Ordering::Relaxed);
        self.expected.as_slice() == pk
    }
}
/// Asking twice about the same key reaches the wrapped filter once; a different
/// key reaches it again, giving two calls in total.
#[test]
fn test_cached_primary_key_filter_reuses_previous_result() {
    let expected = new_primary_key(&["a", "x"]);
    let other = new_primary_key(&["b", "x"]);
    let hits = Arc::new(AtomicUsize::new(0));
    let counting = CountingPrimaryKeyFilter {
        hits: Arc::clone(&hits),
        expected: expected.clone(),
    };
    let mut filter = CachedPrimaryKeyFilter::new(Box::new(counting));

    assert!(filter.matches(expected.as_slice()));
    // Served from the cache: no additional call to the wrapped filter.
    assert!(filter.matches(expected.as_slice()));
    assert!(!filter.matches(other.as_slice()));

    assert_eq!(hits.load(Ordering::Relaxed), 2);
}
/// A batch with one primary key yields that key; a batch with two keys yields `None`.
#[test]
fn test_batch_single_primary_key() {
    let pk_a = new_primary_key(&["a", "x"]);
    let pk_b = new_primary_key(&["b", "x"]);

    let single_key_batch = new_raw_batch(&[pk_a.as_slice(), pk_a.as_slice()], &[10, 11]);
    let single = batch_single_primary_key(&single_key_batch).unwrap();
    assert_eq!(single, Some(pk_a.as_slice()));

    let two_key_batch = new_raw_batch(&[pk_a.as_slice(), pk_b.as_slice()], &[10, 11]);
    let mixed = batch_single_primary_key(&two_key_batch).unwrap();
    assert_eq!(mixed, None);
}
}

View File

@@ -15,6 +15,7 @@
//! Utilities for testing.
pub mod batch_util;
pub mod bench_util;
pub mod memtable_util;
pub mod scheduler_util;
pub mod sst_util;

View File

@@ -0,0 +1,259 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//! Shared utilities for mito2 benchmarks.
//!
//! Provides a TSBS cpu-like data generator ([`CpuDataGenerator`]) and schema
//! ([`cpu_metadata`]) used by multiple benchmark binaries in this directory.
use api::v1::value::ValueData;
use api::v1::{Row, Rows, SemanticType};
use datafusion_common::Column;
use datafusion_expr::{Expr, lit};
use datatypes::data_type::ConcreteDataType;
use datatypes::schema::ColumnSchema;
use rand::Rng;
use rand::rngs::ThreadRng;
use rand::seq::IndexedRandom;
use store_api::metadata::{
ColumnMetadata, RegionMetadata, RegionMetadataBuilder, RegionMetadataRef,
};
use store_api::storage::RegionId;
use table::predicate::Predicate;
use crate::memtable::KeyValues;
use crate::test_util::memtable_util::region_metadata_to_row_schema;
/// Tag values describing one host in the generated cpu-like data set.
///
/// Every field is emitted as a string tag value by `fill_values`, in
/// declaration order.
pub struct Host {
/// Host name; `random_with_id` sets it to `host_{id}`.
pub hostname: String,
/// Region name; `random_with_id` uses `ap-southeast-N` with `N` in `0..10`.
pub region: String,
/// Datacenter name; `random_with_id` appends a letter from `a` to `e` to the region.
pub datacenter: String,
/// Rack identifier; `random_with_id` draws it from `0..100`.
pub rack: String,
/// Operating system name.
pub os: String,
/// CPU architecture name.
pub arch: String,
/// Team name.
pub team: String,
/// Service identifier; `random_with_id` draws it from `0..100`.
pub service: String,
/// Service version; `random_with_id` draws it from `0..10`.
pub service_version: String,
/// Service environment name.
pub service_environment: String,
}
impl Host {
    /// Creates a host named `host_{id}` with randomly chosen region, datacenter,
    /// rack, service and service version; the remaining tags are fixed strings.
    pub fn random_with_id(id: usize) -> Host {
        let mut rng = rand::rng();
        let region = format!("ap-southeast-{}", rng.random_range(0..10));
        let zone = ['a', 'b', 'c', 'd', 'e'].choose(&mut rng).unwrap();
        let datacenter = format!("{}{}", region, zone);
        // The remaining random fields are drawn in this order: rack, service, version.
        let rack = rng.random_range(0..100).to_string();
        let service = rng.random_range(0..100).to_string();
        let service_version = rng.random_range(0..10).to_string();

        Host {
            hostname: format!("host_{id}"),
            region,
            datacenter,
            rack,
            os: "Ubuntu16.04LTS".to_string(),
            arch: "x86".to_string(),
            team: "CHI".to_string(),
            service,
            service_version,
            service_environment: "test".to_string(),
        }
    }

    /// Appends the ten tag values of this host to `values` as string values, in
    /// field declaration order.
    pub fn fill_values(&self, values: &mut Vec<api::v1::Value>) {
        let tags = [
            &self.hostname,
            &self.region,
            &self.datacenter,
            &self.rack,
            &self.os,
            &self.arch,
            &self.team,
            &self.service,
            &self.service_version,
            &self.service_environment,
        ];
        values.extend(tags.into_iter().map(|tag| api::v1::Value {
            value_data: Some(ValueData::StringValue(tag.clone())),
        }));
    }
}
/// Generates cpu-like rows for a fixed set of random hosts over a time span.
pub struct CpuDataGenerator {
/// Region metadata used to build the generated `KeyValues`.
pub metadata: RegionMetadataRef,
/// Row schema derived from `metadata`, attached to every generated mutation.
column_schemas: Vec<api::v1::ColumnSchema>,
/// Hosts for which one row is produced per generated timestamp.
hosts: Vec<Host>,
/// Start of the generated time span in seconds (inclusive).
start_sec: i64,
/// End of the generated time span in seconds (exclusive).
end_sec: i64,
}
impl CpuDataGenerator {
    /// Creates a generator for `num_hosts` random hosts covering
    /// `start_sec..end_sec`; the row schema is derived from `metadata`.
    pub fn new(
        metadata: RegionMetadataRef,
        num_hosts: usize,
        start_sec: i64,
        end_sec: i64,
    ) -> Self {
        let column_schemas = region_metadata_to_row_schema(&metadata);
        let hosts = Self::generate_hosts(num_hosts);
        Self {
            metadata,
            column_schemas,
            hosts,
            start_sec,
            end_sec,
        }
    }

    /// Yields one `KeyValues` per 10-second step in `start_sec..end_sec`. The
    /// position of the step is used as the sequence number.
    pub fn iter(&self) -> impl Iterator<Item = KeyValues> + '_ {
        // point per 10s.
        let timestamps = (self.start_sec..self.end_sec).step_by(10);
        timestamps
            .enumerate()
            .map(|(seq, ts)| self.build_key_values(seq, ts))
    }

    /// Builds a put mutation with one row per host at `current_sec`.
    ///
    /// Each row holds the timestamp in milliseconds, the host's ten tag values,
    /// and ten random field values. Panics if `KeyValues::new` returns `None`.
    pub fn build_key_values(&self, seq: usize, current_sec: i64) -> KeyValues {
        let mut rows = Vec::with_capacity(self.hosts.len());
        for host in &self.hosts {
            let mut rng = rand::rng();
            // 1 timestamp + 10 tags + 10 fields.
            let mut values = Vec::with_capacity(21);
            values.push(api::v1::Value {
                value_data: Some(ValueData::TimestampMillisecondValue(current_sec * 1000)),
            });
            host.fill_values(&mut values);
            values.extend((0..10).map(|_| api::v1::Value {
                value_data: Some(ValueData::F64Value(Self::random_f64(&mut rng))),
            }));
            rows.push(Row { values });
        }

        let mutation = api::v1::Mutation {
            op_type: api::v1::OpType::Put as i32,
            sequence: seq as u64,
            rows: Some(Rows {
                schema: self.column_schemas.clone(),
                rows,
            }),
            write_hint: None,
        };
        KeyValues::new(&self.metadata, mutation).unwrap()
    }

    /// Returns a predicate matching `hostname` against one randomly chosen host.
    pub fn random_host_filter(&self) -> Predicate {
        Predicate::new(self.random_host_filter_exprs())
    }

    /// Returns a single `hostname = <host>` expression for one randomly chosen host.
    pub fn random_host_filter_exprs(&self) -> Vec<Expr> {
        let hostname_column = Expr::Column(Column::from_name("hostname"));
        let host = self.random_hostname();
        vec![hostname_column.eq(lit(host))]
    }

    /// Returns the name of one randomly chosen host. Panics if there are no hosts.
    pub fn random_hostname(&self) -> String {
        let mut rng = rand::rng();
        let host = self.hosts.choose(&mut rng).unwrap();
        host.hostname.clone()
    }

    /// Returns a random whole number in `30..95` as `f64`.
    pub fn random_f64(rng: &mut ThreadRng) -> f64 {
        f64::from(rng.random_range(30..95u32))
    }

    /// Creates `num_hosts` random hosts with ids `0..num_hosts`.
    pub fn generate_hosts(num_hosts: usize) -> Vec<Host> {
        let mut hosts = Vec::with_capacity(num_hosts);
        hosts.extend((0..num_hosts).map(Host::random_with_id));
        hosts
    }
}
/// Creates a metadata for TSBS cpu-like table.
///
/// Column ids are sequential: `ts` is 0, the ten string tags are 1..=10 and the ten
/// float64 fields are 11..=20. The primary key is column ids 1..=10, i.e. the tags.
pub fn cpu_metadata() -> RegionMetadata {
    const TAG_NAMES: [&str; 10] = [
        "hostname",
        "region",
        "datacenter",
        "rack",
        "os",
        "arch",
        "team",
        "service",
        "service_version",
        "service_environment",
    ];
    const FIELD_NAMES: [&str; 10] = [
        "usage_user",
        "usage_system",
        "usage_idle",
        "usage_nice",
        "usage_iowait",
        "usage_irq",
        "usage_softirq",
        "usage_steal",
        "usage_guest",
        "usage_guest_nice",
    ];

    let mut builder = RegionMetadataBuilder::new(RegionId::new(1, 1));

    // The time index is the only non-nullable column and always gets id 0.
    let time_index = ColumnSchema::new(
        "ts",
        ConcreteDataType::timestamp_millisecond_datatype(),
        false,
    );
    builder.push_column_metadata(ColumnMetadata {
        column_schema: time_index,
        semantic_type: SemanticType::Timestamp,
        column_id: 0,
    });

    // Tags come first, then fields; ids continue from 1 across both groups.
    let tag_columns = TAG_NAMES
        .iter()
        .map(|&name| (name, ConcreteDataType::string_datatype(), SemanticType::Tag));
    let field_columns = FIELD_NAMES
        .iter()
        .map(|&name| (name, ConcreteDataType::float64_datatype(), SemanticType::Field));
    for (column_id, (name, data_type, semantic_type)) in
        (1..).zip(tag_columns.chain(field_columns))
    {
        builder.push_column_metadata(ColumnMetadata {
            column_schema: ColumnSchema::new(name, data_type, true),
            semantic_type,
            column_id,
        });
    }

    builder.primary_key(vec![1, 2, 3, 4, 5, 6, 7, 8, 9, 10]);
    builder.build().unwrap()
}

View File

@@ -30,8 +30,7 @@ use mito_codec::row_converter::{DensePrimaryKeyCodec, PrimaryKeyCodecExt, SortFi
use store_api::metadata::{
ColumnMetadata, RegionMetadata, RegionMetadataBuilder, RegionMetadataRef,
};
use store_api::storage::{ColumnId, RegionId, SequenceNumber, SequenceRange};
use table::predicate::Predicate;
use store_api::storage::{ColumnId, RegionId, SequenceNumber};
use crate::error::Result;
use crate::memtable::bulk::part::BulkPart;

View File

@@ -207,6 +207,7 @@ impl WorkerGroup {
.vector_cache_size(config.vector_cache_size.as_bytes())
.page_cache_size(config.page_cache_size.as_bytes())
.selector_result_cache_size(config.selector_result_cache_size.as_bytes())
.range_result_cache_size(config.range_result_cache_size.as_bytes())
.index_metadata_size(config.index.metadata_cache_size.as_bytes())
.index_content_size(config.index.content_cache_size.as_bytes())
.index_content_page_size(config.index.content_cache_page_size.as_bytes())
@@ -421,6 +422,7 @@ impl WorkerGroup {
.vector_cache_size(config.vector_cache_size.as_bytes())
.page_cache_size(config.page_cache_size.as_bytes())
.selector_result_cache_size(config.selector_result_cache_size.as_bytes())
.range_result_cache_size(config.range_result_cache_size.as_bytes())
.write_cache(write_cache)
.build(),
);

View File

@@ -121,10 +121,12 @@ pub fn new_partition_info_cache(
CacheContainer::new(
name,
cache,
Box::new(|cache, ident| {
Box::new(|cache, idents| {
Box::pin(async move {
if let CacheIdent::TableId(table_id) = ident {
cache.invalidate(table_id).await
for ident in idents {
if let CacheIdent::TableId(table_id) = ident {
cache.invalidate(table_id).await
}
}
Ok(())
})

View File

@@ -3315,28 +3315,55 @@ impl PromPlanner {
fn prom_token_to_binary_expr_builder(
token: TokenType,
) -> Result<Box<dyn Fn(DfExpr, DfExpr) -> Result<DfExpr>>> {
let cast_float = |expr| {
if matches!(
&expr,
DfExpr::Cast(Cast {
data_type: ArrowDataType::Float64,
..
})
) || matches!(&expr, DfExpr::Literal(ScalarValue::Float64(_), _))
{
expr
} else {
DfExpr::Cast(Cast {
expr: Box::new(expr),
data_type: ArrowDataType::Float64,
})
}
};
match token.id() {
token::T_ADD => Ok(Box::new(|lhs, rhs| Ok(lhs + rhs))),
token::T_SUB => Ok(Box::new(|lhs, rhs| Ok(lhs - rhs))),
token::T_MUL => Ok(Box::new(|lhs, rhs| Ok(lhs * rhs))),
token::T_DIV => Ok(Box::new(|lhs, rhs| Ok(lhs / rhs))),
token::T_MOD => Ok(Box::new(|lhs: DfExpr, rhs| Ok(lhs % rhs))),
token::T_ADD => Ok(Box::new(move |lhs, rhs| {
Ok(cast_float(lhs) + cast_float(rhs))
})),
token::T_SUB => Ok(Box::new(move |lhs, rhs| {
Ok(cast_float(lhs) - cast_float(rhs))
})),
token::T_MUL => Ok(Box::new(move |lhs, rhs| {
Ok(cast_float(lhs) * cast_float(rhs))
})),
token::T_DIV => Ok(Box::new(move |lhs, rhs| {
Ok(cast_float(lhs) / cast_float(rhs))
})),
token::T_MOD => Ok(Box::new(move |lhs: DfExpr, rhs| {
Ok(cast_float(lhs) % cast_float(rhs))
})),
token::T_EQLC => Ok(Box::new(|lhs, rhs| Ok(lhs.eq(rhs)))),
token::T_NEQ => Ok(Box::new(|lhs, rhs| Ok(lhs.not_eq(rhs)))),
token::T_GTR => Ok(Box::new(|lhs, rhs| Ok(lhs.gt(rhs)))),
token::T_LSS => Ok(Box::new(|lhs, rhs| Ok(lhs.lt(rhs)))),
token::T_GTE => Ok(Box::new(|lhs, rhs| Ok(lhs.gt_eq(rhs)))),
token::T_LTE => Ok(Box::new(|lhs, rhs| Ok(lhs.lt_eq(rhs)))),
token::T_POW => Ok(Box::new(|lhs, rhs| {
token::T_POW => Ok(Box::new(move |lhs, rhs| {
Ok(DfExpr::ScalarFunction(ScalarFunction {
func: datafusion_functions::math::power(),
args: vec![lhs, rhs],
args: vec![cast_float(lhs), cast_float(rhs)],
}))
})),
token::T_ATAN2 => Ok(Box::new(|lhs, rhs| {
token::T_ATAN2 => Ok(Box::new(move |lhs, rhs| {
Ok(DfExpr::ScalarFunction(ScalarFunction {
func: datafusion_functions::math::atan2(),
args: vec![lhs, rhs],
args: vec![cast_float(lhs), cast_float(rhs)],
}))
})),
_ => UnexpectedTokenSnafu { token }.fail(),
@@ -5161,7 +5188,7 @@ mod test {
.unwrap();
let expected = String::from(
"Projection: rhs.tag_0, rhs.timestamp, lhs.field_0 + rhs.field_0 AS lhs.field_0 + rhs.field_0 [tag_0:Utf8, timestamp:Timestamp(ms), lhs.field_0 + rhs.field_0:Float64;N]\
"Projection: rhs.tag_0, rhs.timestamp, CAST(lhs.field_0 AS Float64) + CAST(rhs.field_0 AS Float64) AS lhs.field_0 + rhs.field_0 [tag_0:Utf8, timestamp:Timestamp(ms), lhs.field_0 + rhs.field_0:Float64;N]\
\n Inner Join: lhs.tag_0 = rhs.tag_0, lhs.timestamp = rhs.timestamp [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N, tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]\
\n SubqueryAlias: lhs [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]\
\n PromInstantManipulate: range=[0..100000000], lookback=[1000], interval=[5000], time index=[timestamp] [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]\
@@ -5216,7 +5243,7 @@ mod test {
async fn binary_op_literal_column() {
let query = r#"1 + some_metric{tag_0="bar"}"#;
let expected = String::from(
"Projection: some_metric.tag_0, some_metric.timestamp, Float64(1) + some_metric.field_0 AS Float64(1) + field_0 [tag_0:Utf8, timestamp:Timestamp(ms), Float64(1) + field_0:Float64;N]\
"Projection: some_metric.tag_0, some_metric.timestamp, Float64(1) + CAST(some_metric.field_0 AS Float64) AS Float64(1) + field_0 [tag_0:Utf8, timestamp:Timestamp(ms), Float64(1) + field_0:Float64;N]\
\n PromInstantManipulate: range=[0..100000000], lookback=[1000], interval=[5000], time index=[timestamp] [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]\
\n PromSeriesDivide: tags=[\"tag_0\"] [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]\
\n Sort: some_metric.tag_0 ASC NULLS FIRST, some_metric.timestamp ASC NULLS FIRST [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]\
@@ -5254,7 +5281,7 @@ mod test {
async fn bool_with_additional_arithmetic() {
let query = "some_metric + (1 == bool 2)";
let expected = String::from(
"Projection: some_metric.tag_0, some_metric.timestamp, some_metric.field_0 + CAST(Float64(1) = Float64(2) AS Float64) AS field_0 + Float64(1) = Float64(2) [tag_0:Utf8, timestamp:Timestamp(ms), field_0 + Float64(1) = Float64(2):Float64;N]\
"Projection: some_metric.tag_0, some_metric.timestamp, CAST(some_metric.field_0 AS Float64) + CAST(Float64(1) = Float64(2) AS Float64) AS field_0 + Float64(1) = Float64(2) [tag_0:Utf8, timestamp:Timestamp(ms), field_0 + Float64(1) = Float64(2):Float64;N]\
\n PromInstantManipulate: range=[0..100000000], lookback=[1000], interval=[5000], time index=[timestamp] [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]\
\n PromSeriesDivide: tags=[\"tag_0\"] [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]\
\n Sort: some_metric.tag_0 ASC NULLS FIRST, some_metric.timestamp ASC NULLS FIRST [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]\
@@ -5364,7 +5391,7 @@ mod test {
PromPlanner::stmt_to_plan(table_provider, &eval_stmt, &build_query_engine_state())
.await
.unwrap();
let expected = "Projection: http_server_requests_seconds_count.uri, http_server_requests_seconds_count.kubernetes_namespace, http_server_requests_seconds_count.kubernetes_pod_name, http_server_requests_seconds_count.greptime_timestamp, http_server_requests_seconds_sum.greptime_value / http_server_requests_seconds_count.greptime_value AS http_server_requests_seconds_sum.greptime_value / http_server_requests_seconds_count.greptime_value\
let expected = "Projection: http_server_requests_seconds_count.uri, http_server_requests_seconds_count.kubernetes_namespace, http_server_requests_seconds_count.kubernetes_pod_name, http_server_requests_seconds_count.greptime_timestamp, CAST(http_server_requests_seconds_sum.greptime_value AS Float64) / CAST(http_server_requests_seconds_count.greptime_value AS Float64) AS http_server_requests_seconds_sum.greptime_value / http_server_requests_seconds_count.greptime_value\
\n Inner Join: http_server_requests_seconds_sum.greptime_timestamp = http_server_requests_seconds_count.greptime_timestamp, http_server_requests_seconds_sum.uri = http_server_requests_seconds_count.uri\
\n SubqueryAlias: http_server_requests_seconds_sum\
\n PromInstantManipulate: range=[0..100000000], lookback=[1000], interval=[5000], time index=[greptime_timestamp]\
@@ -5755,7 +5782,7 @@ mod test {
let query = "some_alt_metric{__schema__=\"greptime_private\"} / some_metric";
let expected = String::from(
"Projection: some_metric.tag_0, some_metric.timestamp, greptime_private.some_alt_metric.field_0 / some_metric.field_0 AS greptime_private.some_alt_metric.field_0 / some_metric.field_0 [tag_0:Utf8, timestamp:Timestamp(ms), greptime_private.some_alt_metric.field_0 / some_metric.field_0:Float64;N]\
"Projection: some_metric.tag_0, some_metric.timestamp, CAST(greptime_private.some_alt_metric.field_0 AS Float64) / CAST(some_metric.field_0 AS Float64) AS greptime_private.some_alt_metric.field_0 / some_metric.field_0 [tag_0:Utf8, timestamp:Timestamp(ms), greptime_private.some_alt_metric.field_0 / some_metric.field_0:Float64;N]\
\n Inner Join: greptime_private.some_alt_metric.tag_0 = some_metric.tag_0, greptime_private.some_alt_metric.timestamp = some_metric.timestamp [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N, tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]\
\n SubqueryAlias: greptime_private.some_alt_metric [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]\
\n PromInstantManipulate: range=[0..100000000], lookback=[1000], interval=[5000], time index=[timestamp] [tag_0:Utf8, timestamp:Timestamp(ms), field_0:Float64;N]\

View File

@@ -235,7 +235,7 @@ pub(super) fn type_gt_to_pg(origin: &ConcreteDataType) -> Result<Type> {
match origin {
&ConcreteDataType::Null(_) => Ok(Type::UNKNOWN),
&ConcreteDataType::Boolean(_) => Ok(Type::BOOL),
&ConcreteDataType::Int8(_) => Ok(Type::CHAR),
&ConcreteDataType::Int8(_) => Ok(Type::INT2),
&ConcreteDataType::Int16(_) | &ConcreteDataType::UInt8(_) => Ok(Type::INT2),
&ConcreteDataType::Int32(_) | &ConcreteDataType::UInt16(_) => Ok(Type::INT4),
&ConcreteDataType::Int64(_) | &ConcreteDataType::UInt32(_) => Ok(Type::INT8),
@@ -253,7 +253,7 @@ pub(super) fn type_gt_to_pg(origin: &ConcreteDataType) -> Result<Type> {
ConcreteDataType::List(list) => match list.item_type() {
&ConcreteDataType::Null(_) => Ok(Type::UNKNOWN),
&ConcreteDataType::Boolean(_) => Ok(Type::BOOL_ARRAY),
&ConcreteDataType::Int8(_) => Ok(Type::CHAR_ARRAY),
&ConcreteDataType::Int8(_) => Ok(Type::INT2_ARRAY),
&ConcreteDataType::Int16(_) | &ConcreteDataType::UInt8(_) => Ok(Type::INT2_ARRAY),
&ConcreteDataType::Int32(_) | &ConcreteDataType::UInt16(_) => Ok(Type::INT4_ARRAY),
&ConcreteDataType::Int64(_) | &ConcreteDataType::UInt32(_) => Ok(Type::INT8_ARRAY),
@@ -1151,7 +1151,7 @@ mod test {
let pg_field_info = vec![
FieldInfo::new("nulls".into(), None, None, Type::UNKNOWN, FieldFormat::Text),
FieldInfo::new("bools".into(), None, None, Type::BOOL, FieldFormat::Text),
FieldInfo::new("int8s".into(), None, None, Type::CHAR, FieldFormat::Text),
FieldInfo::new("int8s".into(), None, None, Type::INT2, FieldFormat::Text),
FieldInfo::new("int16s".into(), None, None, Type::INT2, FieldFormat::Text),
FieldInfo::new("int32s".into(), None, None, Type::INT4, FieldFormat::Text),
FieldInfo::new("int64s".into(), None, None, Type::INT8, FieldFormat::Text),
@@ -1230,7 +1230,7 @@ mod test {
Type::NUMERIC,
FieldFormat::Text,
),
FieldInfo::new("int8s".into(), None, None, Type::CHAR, FieldFormat::Text),
FieldInfo::new("int8s".into(), None, None, Type::INT2, FieldFormat::Text),
FieldInfo::new("int16s".into(), None, None, Type::INT2, FieldFormat::Text),
FieldInfo::new("int32s".into(), None, None, Type::INT4, FieldFormat::Text),
FieldInfo::new("int64s".into(), None, None, Type::INT8, FieldFormat::Text),

View File

@@ -17,15 +17,13 @@ use std::sync::Arc;
use api::v1::greptime_request::Request;
use async_trait::async_trait;
use common_base::AffectedRows;
use common_grpc::flight::do_put::DoPutResponse;
use common_query::Output;
use futures::Stream;
use session::context::QueryContextRef;
use table::TableRef;
use crate::error::Result;
use crate::grpc::flight::{PutRecordBatchRequest, PutRecordBatchRequestStream};
use crate::grpc::flight::PutRecordBatchRequestStream;
pub type ServerGrpcQueryHandlerRef = Arc<dyn GrpcQueryHandler + Send + Sync>;
@@ -35,13 +33,6 @@ pub type RawRecordBatch = bytes::Bytes;
pub trait GrpcQueryHandler {
async fn do_query(&self, query: Request, ctx: QueryContextRef) -> Result<Output>;
async fn put_record_batch(
&self,
request: PutRecordBatchRequest,
table_ref: &mut Option<TableRef>,
ctx: QueryContextRef,
) -> Result<AffectedRows>;
fn handle_put_record_batch_stream(
&self,
stream: PutRecordBatchRequestStream,

View File

@@ -18,7 +18,6 @@ use api::v1::greptime_request::Request;
use api::v1::query_request::Query;
use async_trait::async_trait;
use catalog::memory::MemoryCatalogManager;
use common_base::AffectedRows;
use common_catalog::consts::{DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME};
use common_grpc::flight::do_put::DoPutResponse;
use common_query::Output;
@@ -149,15 +148,6 @@ impl GrpcQueryHandler for DummyInstance {
Ok(output)
}
async fn put_record_batch(
&self,
_request: servers::grpc::flight::PutRecordBatchRequest,
_table_ref: &mut Option<TableRef>,
_ctx: QueryContextRef,
) -> Result<AffectedRows> {
unimplemented!()
}
fn handle_put_record_batch_stream(
&self,
_stream: servers::grpc::flight::PutRecordBatchRequestStream,

View File

@@ -203,7 +203,7 @@ pub fn build_time_range_predicate(
/// Extract time range filter from `WHERE`/`IN (...)`/`BETWEEN` clauses.
/// Return None if no time range can be found in expr.
fn extract_time_range_from_expr(
pub fn extract_time_range_from_expr(
ts_col_name: &str,
ts_col_unit: TimeUnit,
expr: &Expr,

View File

@@ -36,8 +36,9 @@ use store_api::metric_engine_consts::{
LOGICAL_TABLE_METADATA_KEY, PHYSICAL_TABLE_METADATA_KEY, is_metric_engine_option_key,
};
use store_api::mito_engine_options::{
APPEND_MODE_KEY, COMPACTION_TYPE, MEMTABLE_TYPE, MERGE_MODE_KEY, TWCS_FALLBACK_TO_LOCAL,
TWCS_MAX_OUTPUT_FILE_SIZE, TWCS_TIME_WINDOW, TWCS_TRIGGER_FILE_NUM, is_mito_engine_option_key,
APPEND_MODE_KEY, COMPACTION_TYPE, MEMTABLE_TYPE, MERGE_MODE_KEY, SST_FORMAT_KEY,
TWCS_FALLBACK_TO_LOCAL, TWCS_MAX_OUTPUT_FILE_SIZE, TWCS_TIME_WINDOW, TWCS_TRIGGER_FILE_NUM,
is_mito_engine_option_key,
};
use store_api::region_request::{SetRegionOption, UnsetRegionOption};
@@ -56,13 +57,14 @@ pub const TABLE_DATA_MODEL_TRACE_V1: &str = "greptime_trace_v1";
pub const OTLP_METRIC_COMPAT_KEY: &str = "otlp_metric_compat";
pub const OTLP_METRIC_COMPAT_PROM: &str = "prom";
pub const VALID_TABLE_OPTION_KEYS: [&str; 12] = [
pub const VALID_TABLE_OPTION_KEYS: [&str; 13] = [
// common keys:
WRITE_BUFFER_SIZE_KEY,
TTL_KEY,
STORAGE_KEY,
COMMENT_KEY,
SKIP_WAL_KEY,
SST_FORMAT_KEY,
// file engine keys:
FILE_TABLE_LOCATION_KEY,
FILE_TABLE_FORMAT_KEY,
@@ -94,6 +96,7 @@ static VALID_DB_OPT_KEYS: Lazy<HashSet<&str>> = Lazy::new(|| {
set.insert(TWCS_TIME_WINDOW);
set.insert(TWCS_TRIGGER_FILE_NUM);
set.insert(TWCS_MAX_OUTPUT_FILE_SIZE);
set.insert(SST_FORMAT_KEY);
set
});

View File

@@ -15,7 +15,9 @@
use std::sync::Arc;
use std::time::{Duration, SystemTime, UNIX_EPOCH};
use common_query::Output;
use common_query::{Output, OutputData};
use common_recordbatch::util::collect_batches;
use datatypes::arrow::array::{Float64Array, Int64Array};
use frontend::instance::Instance;
use query::parser::{PromQuery, QueryLanguageParser, QueryStatement};
use rstest::rstest;
@@ -151,6 +153,103 @@ async fn create_insert_tql_assert(
check_unordered_output_stream(query_output, expected).await;
}
/// Passes `sql` to `do_query` on `instance` and unwraps every returned result,
/// panicking on the first error.
async fn execute_all(instance: &Arc<Instance>, sql: &str, query_ctx: Arc<QueryContext>) {
    let results = instance.do_query(sql, query_ctx).await;
    for result in results {
        let _ = result.unwrap();
    }
}
/// Runs `promql_query` with the given arguments and returns the result as
/// `RecordBatches`, collecting the stream first when the output is a stream.
///
/// Panics if the query fails, if the stream cannot be collected, or if the output is
/// neither a stream nor record batches.
#[allow(clippy::too_many_arguments)]
async fn promql_query_as_batches(
    ins: Arc<Instance>,
    promql: &str,
    alias: Option<String>,
    query_ctx: Arc<QueryContext>,
    start: SystemTime,
    end: SystemTime,
    interval: Duration,
    lookback: Duration,
) -> common_recordbatch::RecordBatches {
    let query_output = promql_query(
        ins, promql, alias, query_ctx, start, end, interval, lookback,
    )
    .await
    .unwrap();

    match query_output.data {
        OutputData::RecordBatches(batches) => batches,
        OutputData::Stream(batch_stream) => collect_batches(batch_stream).await.unwrap(),
        _ => unreachable!(),
    }
}
/// Database created by the ratio repro test; the PromQL selectors below reference it
/// through their `__schema__` matcher.
const ANON_PROMQL_RATIO_REPRO_DB: &str = "repro_db";
/// DDL for the repro: a physical metric-engine table `phy` and two logical tables on it.
/// `metric_a` is keyed by labels `l1`..`l5`; `metric_b` is keyed by `l6` and `l1`..`l4`.
const ANON_PROMQL_RATIO_REPRO_CREATE: &str = r#"
CREATE TABLE phy (
t TIMESTAMP TIME INDEX,
v DOUBLE
) ENGINE=metric WITH ("physical_metric_table" = "");
CREATE TABLE metric_a (
l1 STRING NULL,
l2 STRING NULL,
l3 STRING NULL,
l4 STRING NULL,
l5 STRING NULL,
t TIMESTAMP NOT NULL,
v DOUBLE NULL,
TIME INDEX (t),
PRIMARY KEY (l1, l2, l3, l4, l5)
) ENGINE=metric WITH (on_physical_table = 'phy');
CREATE TABLE metric_b (
l6 STRING NULL,
l1 STRING NULL,
l2 STRING NULL,
l3 STRING NULL,
l4 STRING NULL,
t TIMESTAMP NOT NULL,
v DOUBLE NULL,
TIME INDEX (t),
PRIMARY KEY (l6, l1, l2, l3, l4)
) ENGINE=metric WITH (on_physical_table = 'phy');
"#;
/// Sample data: three `metric_a` series and two `metric_b` series, each with points at
/// t = 1, 180000 and 360000.
const ANON_PROMQL_RATIO_REPRO_INSERT: &str = r#"
INSERT INTO metric_a (l1, l2, l3, l4, l5, t, v) VALUES
('v1', 'v2', 'v3', 'v4a', 'v5a', 1, 0),
('v1', 'v2', 'v3', 'v4a', 'v5a', 180000, 120),
('v1', 'v2', 'v3', 'v4a', 'v5a', 360000, 240),
('v1', 'v2', 'v3', 'v4a', 'v5b', 1, 0),
('v1', 'v2', 'v3', 'v4a', 'v5b', 180000, 30),
('v1', 'v2', 'v3', 'v4a', 'v5b', 360000, 60),
('v1', 'v2', 'v3-b', 'v4b', 'v5c', 1, 0),
('v1', 'v2', 'v3-b', 'v4b', 'v5c', 180000, 60),
('v1', 'v2', 'v3-b', 'v4b', 'v5c', 360000, 120);
INSERT INTO metric_b (l6, l1, l2, l3, l4, t, v) VALUES
('v6', 'v1', 'v2', 'v3', 'v4a', 1, 1),
('v6', 'v1', 'v2', 'v3', 'v4a', 180000, 1),
('v6', 'v1', 'v2', 'v3', 'v4a', 360000, 1),
('v6', 'v1', 'v2', 'v3-b', 'v4b', 1, 2),
('v6', 'v1', 'v2', 'v3-b', 'v4b', 180000, 2),
('v6', 'v1', 'v2', 'v3-b', 'v4b', 360000, 2);
"#;
/// Numerator: counts the `rate(metric_a[3m]) / metric_b` results (matched on `l3`, `l4`)
/// that are greater than 0.50.
const ANON_PROMQL_RATIO_REPRO_NUMERATOR: &str = r#"count(((rate(metric_a{l1="v1",l2="v2",l3=~"v3(|-a|-b)",__schema__="repro_db"}[3m]) / on(l3,l4) group_left metric_b{l6="v6",l1="v1",l2="v2",l3=~"v3(|-a|-b)",__schema__="repro_db"}) > 0.50))"#;
/// Denominator: counts all selected `rate(metric_a[3m])` series.
const ANON_PROMQL_RATIO_REPRO_DENOMINATOR: &str =
r#"count(rate(metric_a{l1="v1",l2="v2",l3=~"v3(|-a|-b)",__schema__="repro_db"}[3m]))"#;
/// The full expression: numerator divided by denominator, times 100.
const ANON_PROMQL_RATIO_REPRO_WHOLE: &str = r#"(count(((rate(metric_a{l1="v1",l2="v2",l3=~"v3(|-a|-b)",__schema__="repro_db"}[3m]) / on(l3,l4) group_left metric_b{l6="v6",l1="v1",l2="v2",l3=~"v3(|-a|-b)",__schema__="repro_db"}) > 0.50)) / count(rate(metric_a{l1="v1",l2="v2",l3=~"v3(|-a|-b)",__schema__="repro_db"}[3m]))) * 100"#;
/// The denominator count divided by the scalar literal 2.
const ANON_PROMQL_RATIO_REPRO_SCALAR_DIV: &str =
r#"count(rate(metric_a{l1="v1",l2="v2",l3=~"v3(|-a|-b)",__schema__="repro_db"}[3m])) / 2"#;
#[apply(both_instances_cases)]
async fn sql_insert_tql_query_ceil(instance: Arc<dyn MockInstance>) {
let instance = instance.frontend();
@@ -709,3 +808,140 @@ async fn cross_schema_query(instance: Arc<dyn MockInstance>) {
check_unordered_output_stream(query_output, expected).await;
}
/// Checks that dividing two `count(...)` results in PromQL yields a fractional value.
///
/// The test runs the numerator, the denominator, the whole ratio and a count-by-scalar
/// division separately, then asserts 1, 3, 100/3 and 1.5 respectively.
#[apply(both_instances_cases)]
async fn anon_promql_ratio_repro(instance: Arc<dyn MockInstance>) {
let ins = instance.frontend();
// Create the database with the default context, then run the DDL and inserts inside it.
execute_all(
&ins,
&format!("CREATE DATABASE {ANON_PROMQL_RATIO_REPRO_DB}"),
QueryContext::arc(),
)
.await;
let repro_ctx: Arc<QueryContext> =
QueryContext::with_db_name(Some(ANON_PROMQL_RATIO_REPRO_DB)).into();
execute_all(&ins, ANON_PROMQL_RATIO_REPRO_CREATE, repro_ctx.clone()).await;
execute_all(&ins, ANON_PROMQL_RATIO_REPRO_INSERT, repro_ctx).await;
// Evaluate from 180s to 360s with a 180s step and a 1s lookback.
let start = UNIX_EPOCH.checked_add(Duration::from_secs(180)).unwrap();
let end = UNIX_EPOCH.checked_add(Duration::from_secs(360)).unwrap();
let interval = Duration::from_secs(180);
let lookback = Duration::from_secs(1);
// The queries use the default context; each PromQL selector names the database
// through its `__schema__` matcher. Every query aliases its value column.
let numerator = promql_query_as_batches(
ins.clone(),
ANON_PROMQL_RATIO_REPRO_NUMERATOR,
Some("num".to_string()),
QueryContext::arc(),
start,
end,
interval,
lookback,
)
.await;
let denominator = promql_query_as_batches(
ins.clone(),
ANON_PROMQL_RATIO_REPRO_DENOMINATOR,
Some("den".to_string()),
QueryContext::arc(),
start,
end,
interval,
lookback,
)
.await;
let whole = promql_query_as_batches(
ins.clone(),
ANON_PROMQL_RATIO_REPRO_WHOLE,
Some("pct".to_string()),
QueryContext::arc(),
start,
end,
interval,
lookback,
)
.await;
let scalar_div = promql_query_as_batches(
ins,
ANON_PROMQL_RATIO_REPRO_SCALAR_DIV,
Some("half_den".to_string()),
QueryContext::arc(),
start,
end,
interval,
lookback,
)
.await;
let numerator = numerator.iter().collect::<Vec<_>>();
let denominator = denominator.iter().collect::<Vec<_>>();
let whole = whole.iter().collect::<Vec<_>>();
let scalar_div = scalar_div.iter().collect::<Vec<_>>();
// Only the first batch of each result is inspected. The plain counts are read as
// Int64 columns, while both divisions are read as Float64 columns.
let numerator_values = numerator[0]
.column_by_name("num")
.unwrap()
.as_any()
.downcast_ref::<Int64Array>()
.unwrap();
let denominator_values = denominator[0]
.column_by_name("den")
.unwrap()
.as_any()
.downcast_ref::<Int64Array>()
.unwrap();
let percentage_values = whole[0]
.column_by_name("pct")
.unwrap()
.as_any()
.downcast_ref::<Float64Array>()
.unwrap();
let scalar_div_values = scalar_div[0]
.column_by_name("half_den")
.unwrap()
.as_any()
.downcast_ref::<Float64Array>()
.unwrap();
// Each query must produce exactly one row; the batch is printed on failure.
assert_eq!(numerator_values.len(), 1, "{}", numerator[0].pretty_print());
assert_eq!(
denominator_values.len(),
1,
"{}",
denominator[0].pretty_print()
);
assert_eq!(percentage_values.len(), 1, "{}", whole[0].pretty_print());
assert_eq!(
scalar_div_values.len(),
1,
"{}",
scalar_div[0].pretty_print()
);
assert_eq!(
numerator_values.value(0),
1,
"{}",
numerator[0].pretty_print()
);
assert_eq!(
denominator_values.value(0),
3,
"{}",
denominator[0].pretty_print()
);
// 3 / 2 must be 1.5, not a truncated integer quotient.
assert!(
(scalar_div_values.value(0) - 1.5).abs() < 1e-9,
"{}",
scalar_div[0].pretty_print()
);
// 1 / 3 * 100, compared with a small tolerance because the result is a float.
let expected = 100.0 / 3.0;
assert!(
(percentage_values.value(0) - expected).abs() < 1e-9,
"{}",
whole[0].pretty_print()
);
}

View File

@@ -148,6 +148,7 @@ macro_rules! http_tests {
test_jaeger_query_api_for_trace_v1,
test_influxdb_write,
test_influxdb_write_with_hints,
test_http_memory_limit,
);
)*
@@ -1641,6 +1642,7 @@ fn drop_lines_with_inconsistent_results(input: String) -> String {
"metadata_cache_size =",
"content_cache_size =",
"result_cache_size =",
"range_result_cache_size =",
"name =",
"recovery_parallelism =",
"max_background_index_builds =",
@@ -3638,6 +3640,43 @@ transform:
guard.remove_all().await;
}
/// Writes one InfluxDB line with an `x-greptime-hints` header, then checks that
/// `SHOW CREATE TABLE` for the written table lists the hinted options.
pub async fn test_influxdb_write_with_hints(storage_type: StorageType) {
    common_telemetry::init_default_ut_logging();
    let (app, mut guard) =
        setup_test_http_app_with_frontend(storage_type, "test_influxdb_write_with_hints").await;
    let client = TestClient::new(app).await;

    let write_resp = client
        .post("/v1/influxdb/write?db=public")
        .header("x-greptime-hints", "sst_format=flat,ttl=30d,skip_wal=true")
        .body("sst_fmt_table,host=host1 cpu=1.2 1664370459457010101")
        .send()
        .await;
    assert_eq!(write_resp.status(), 204);

    let show_resp = client
        .get("/v1/sql?sql=show create table sst_fmt_table")
        .send()
        .await;
    assert_eq!(show_resp.status(), StatusCode::OK);
    let resp = show_resp.text().await;

    // Each hint must appear as a table option; note that `30d` is rendered as `30days`.
    let expected_options = ["sst_format = 'flat'", "ttl = '30days'", "skip_wal = 'true'"];
    for option in expected_options {
        assert!(
            resp.contains(option),
            "expected {option} in SHOW CREATE TABLE output, got: {resp}"
        );
    }

    guard.remove_all().await;
}
/// Test one-to-many VRL pipeline expansion.
/// This test verifies that a VRL processor can return an array, which results in
/// multiple output rows from a single input row.

View File

@@ -442,54 +442,54 @@ Affected Rows: 0
-- SQLNESS REPLACE (Hash.*) REDACTED
tql explain (1752591864, 1752592164, '30s') sum by (a, b, c) (rate(aggr_optimize_not [2m])) / sum by (a, b, c) (rate(aggr_optimize_not_count [2m]));
+---------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| plan_type | plan |
+---------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| logical_plan | Projection: aggr_optimize_not_count.a, aggr_optimize_not_count.b, aggr_optimize_not_count.c, aggr_optimize_not_count.greptime_timestamp, aggr_optimize_not.sum(prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000))) / aggr_optimize_not_count.sum(prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000))) AS aggr_optimize_not.sum(prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000))) / aggr_optimize_not_count.sum(prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000))) |
| | Inner Join: aggr_optimize_not.a = aggr_optimize_not_count.a, aggr_optimize_not.b = aggr_optimize_not_count.b, aggr_optimize_not.c = aggr_optimize_not_count.c, aggr_optimize_not.greptime_timestamp = aggr_optimize_not_count.greptime_timestamp |
| | MergeSort: aggr_optimize_not.a ASC NULLS LAST, aggr_optimize_not.b ASC NULLS LAST, aggr_optimize_not.c ASC NULLS LAST, aggr_optimize_not.greptime_timestamp ASC NULLS LAST |
| | MergeScan [is_placeholder=false, remote_input=[ |
| | SubqueryAlias: aggr_optimize_not |
| | Sort: aggr_optimize_not.a ASC NULLS LAST, aggr_optimize_not.b ASC NULLS LAST, aggr_optimize_not.c ASC NULLS LAST, aggr_optimize_not.greptime_timestamp ASC NULLS LAST |
| | Aggregate: groupBy=[[aggr_optimize_not.a, aggr_optimize_not.b, aggr_optimize_not.c, aggr_optimize_not.greptime_timestamp]], aggr=[[sum(prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000)))]] |
| | Filter: prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000)) IS NOT NULL |
| | Projection: aggr_optimize_not.greptime_timestamp, prom_rate(greptime_timestamp_range, greptime_value, aggr_optimize_not.greptime_timestamp, Int64(120000)) AS prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000)), aggr_optimize_not.a, aggr_optimize_not.b, aggr_optimize_not.c, aggr_optimize_not.d |
| | PromRangeManipulate: req range=[1752591864000..1752592164000], interval=[30000], eval range=[120000], time index=[greptime_timestamp], values=["greptime_value"] |
| | PromSeriesNormalize: offset=[0], time index=[greptime_timestamp], filter NaN: [true] |
| | PromSeriesDivide: tags=["a", "b", "c", "d"] |
| | Sort: aggr_optimize_not.a ASC NULLS FIRST, aggr_optimize_not.b ASC NULLS FIRST, aggr_optimize_not.c ASC NULLS FIRST, aggr_optimize_not.d ASC NULLS FIRST, aggr_optimize_not.greptime_timestamp ASC NULLS FIRST |
| | Filter: aggr_optimize_not.greptime_timestamp >= TimestampMillisecond(1752591744001, None) AND aggr_optimize_not.greptime_timestamp <= TimestampMillisecond(1752592164000, None) |
| | TableScan: aggr_optimize_not |
| | ]] |
| | SubqueryAlias: aggr_optimize_not_count |
| | Sort: aggr_optimize_not_count.a ASC NULLS LAST, aggr_optimize_not_count.b ASC NULLS LAST, aggr_optimize_not_count.c ASC NULLS LAST, aggr_optimize_not_count.greptime_timestamp ASC NULLS LAST |
| | Aggregate: groupBy=[[aggr_optimize_not_count.a, aggr_optimize_not_count.b, aggr_optimize_not_count.c, aggr_optimize_not_count.greptime_timestamp]], aggr=[[sum(prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000)))]] |
| | Filter: prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000)) IS NOT NULL |
| | Projection: aggr_optimize_not_count.greptime_timestamp, prom_rate(greptime_timestamp_range, greptime_value, aggr_optimize_not_count.greptime_timestamp, Int64(120000)) AS prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000)), aggr_optimize_not_count.a, aggr_optimize_not_count.b, aggr_optimize_not_count.c |
| | PromRangeManipulate: req range=[1752591864000..1752592164000], interval=[30000], eval range=[120000], time index=[greptime_timestamp], values=["greptime_value"] |
| | PromSeriesNormalize: offset=[0], time index=[greptime_timestamp], filter NaN: [true] |
| | PromSeriesDivide: tags=["a", "b", "c", "d"] |
| | Sort: aggr_optimize_not_count.a ASC NULLS FIRST, aggr_optimize_not_count.b ASC NULLS FIRST, aggr_optimize_not_count.c ASC NULLS FIRST, aggr_optimize_not_count.d ASC NULLS FIRST, aggr_optimize_not_count.greptime_timestamp ASC NULLS FIRST |
| | MergeScan [is_placeholder=false, remote_input=[ |
| | Filter: aggr_optimize_not_count.greptime_timestamp >= TimestampMillisecond(1752591744001, None) AND aggr_optimize_not_count.greptime_timestamp <= TimestampMillisecond(1752592164000, None) |
| | TableScan: aggr_optimize_not_count |
| | ]] |
| physical_plan | ProjectionExec: expr=[a@0 as a, b@1 as b, c@2 as c, greptime_timestamp@3 as greptime_timestamp, sum(prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000)))@5 / sum(prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000)))@4 as aggr_optimize_not.sum(prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000))) / aggr_optimize_not_count.sum(prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000)))] |
+---------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| plan_type | plan |
+---------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| logical_plan | Projection: aggr_optimize_not_count.a, aggr_optimize_not_count.b, aggr_optimize_not_count.c, aggr_optimize_not_count.greptime_timestamp, CAST(aggr_optimize_not.sum(prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000))) AS Float64) / CAST(aggr_optimize_not_count.sum(prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000))) AS Float64) AS aggr_optimize_not.sum(prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000))) / aggr_optimize_not_count.sum(prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000))) |
| | Inner Join: aggr_optimize_not.a = aggr_optimize_not_count.a, aggr_optimize_not.b = aggr_optimize_not_count.b, aggr_optimize_not.c = aggr_optimize_not_count.c, aggr_optimize_not.greptime_timestamp = aggr_optimize_not_count.greptime_timestamp |
| | MergeSort: aggr_optimize_not.a ASC NULLS LAST, aggr_optimize_not.b ASC NULLS LAST, aggr_optimize_not.c ASC NULLS LAST, aggr_optimize_not.greptime_timestamp ASC NULLS LAST |
| | MergeScan [is_placeholder=false, remote_input=[ |
| | SubqueryAlias: aggr_optimize_not |
| | Sort: aggr_optimize_not.a ASC NULLS LAST, aggr_optimize_not.b ASC NULLS LAST, aggr_optimize_not.c ASC NULLS LAST, aggr_optimize_not.greptime_timestamp ASC NULLS LAST |
| | Aggregate: groupBy=[[aggr_optimize_not.a, aggr_optimize_not.b, aggr_optimize_not.c, aggr_optimize_not.greptime_timestamp]], aggr=[[sum(prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000)))]] |
| | Filter: prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000)) IS NOT NULL |
| | Projection: aggr_optimize_not.greptime_timestamp, prom_rate(greptime_timestamp_range, greptime_value, aggr_optimize_not.greptime_timestamp, Int64(120000)) AS prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000)), aggr_optimize_not.a, aggr_optimize_not.b, aggr_optimize_not.c, aggr_optimize_not.d |
| | PromRangeManipulate: req range=[1752591864000..1752592164000], interval=[30000], eval range=[120000], time index=[greptime_timestamp], values=["greptime_value"] |
| | PromSeriesNormalize: offset=[0], time index=[greptime_timestamp], filter NaN: [true] |
| | PromSeriesDivide: tags=["a", "b", "c", "d"] |
| | Sort: aggr_optimize_not.a ASC NULLS FIRST, aggr_optimize_not.b ASC NULLS FIRST, aggr_optimize_not.c ASC NULLS FIRST, aggr_optimize_not.d ASC NULLS FIRST, aggr_optimize_not.greptime_timestamp ASC NULLS FIRST |
| | Filter: aggr_optimize_not.greptime_timestamp >= TimestampMillisecond(1752591744001, None) AND aggr_optimize_not.greptime_timestamp <= TimestampMillisecond(1752592164000, None) |
| | TableScan: aggr_optimize_not |
| | ]] |
| | SubqueryAlias: aggr_optimize_not_count |
| | Sort: aggr_optimize_not_count.a ASC NULLS LAST, aggr_optimize_not_count.b ASC NULLS LAST, aggr_optimize_not_count.c ASC NULLS LAST, aggr_optimize_not_count.greptime_timestamp ASC NULLS LAST |
| | Aggregate: groupBy=[[aggr_optimize_not_count.a, aggr_optimize_not_count.b, aggr_optimize_not_count.c, aggr_optimize_not_count.greptime_timestamp]], aggr=[[sum(prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000)))]] |
| | Filter: prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000)) IS NOT NULL |
| | Projection: aggr_optimize_not_count.greptime_timestamp, prom_rate(greptime_timestamp_range, greptime_value, aggr_optimize_not_count.greptime_timestamp, Int64(120000)) AS prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000)), aggr_optimize_not_count.a, aggr_optimize_not_count.b, aggr_optimize_not_count.c |
| | PromRangeManipulate: req range=[1752591864000..1752592164000], interval=[30000], eval range=[120000], time index=[greptime_timestamp], values=["greptime_value"] |
| | PromSeriesNormalize: offset=[0], time index=[greptime_timestamp], filter NaN: [true] |
| | PromSeriesDivide: tags=["a", "b", "c", "d"] |
| | Sort: aggr_optimize_not_count.a ASC NULLS FIRST, aggr_optimize_not_count.b ASC NULLS FIRST, aggr_optimize_not_count.c ASC NULLS FIRST, aggr_optimize_not_count.d ASC NULLS FIRST, aggr_optimize_not_count.greptime_timestamp ASC NULLS FIRST |
| | MergeScan [is_placeholder=false, remote_input=[ |
| | Filter: aggr_optimize_not_count.greptime_timestamp >= TimestampMillisecond(1752591744001, None) AND aggr_optimize_not_count.greptime_timestamp <= TimestampMillisecond(1752592164000, None) |
| | TableScan: aggr_optimize_not_count |
| | ]] |
| physical_plan | ProjectionExec: expr=[a@0 as a, b@1 as b, c@2 as c, greptime_timestamp@3 as greptime_timestamp, sum(prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000)))@5 / sum(prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000)))@4 as aggr_optimize_not.sum(prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000))) / aggr_optimize_not_count.sum(prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000)))] |
| | REDACTED
| | CoalescePartitionsExec |
| | AggregateExec: mode=SinglePartitioned, gby=[a@2 as a, b@3 as b, c@4 as c, greptime_timestamp@0 as greptime_timestamp], aggr=[sum(prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000)))] |
| | FilterExec: prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000))@1 IS NOT NULL |
| | ProjectionExec: expr=[greptime_timestamp@4 as greptime_timestamp, prom_rate(greptime_timestamp_range@6, greptime_value@5, greptime_timestamp@4, 120000) as prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000)), a@0 as a, b@1 as b, c@2 as c] |
| | PromRangeManipulateExec: req range=[1752591864000..1752592164000], interval=[30000], eval range=[120000], time index=[greptime_timestamp] |
| | PromSeriesNormalizeExec: offset=[0], time index=[greptime_timestamp], filter NaN: [true] |
| | PromSeriesDivideExec: tags=["a", "b", "c", "d"] |
| | SortExec: expr=[a@0 ASC, b@1 ASC, c@2 ASC, d@3 ASC, greptime_timestamp@4 ASC], preserve_partitioning=[true] |
| | CoalescePartitionsExec |
| | AggregateExec: mode=SinglePartitioned, gby=[a@2 as a, b@3 as b, c@4 as c, greptime_timestamp@0 as greptime_timestamp], aggr=[sum(prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000)))] |
| | FilterExec: prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000))@1 IS NOT NULL |
| | ProjectionExec: expr=[greptime_timestamp@4 as greptime_timestamp, prom_rate(greptime_timestamp_range@6, greptime_value@5, greptime_timestamp@4, 120000) as prom_rate(greptime_timestamp_range,greptime_value,greptime_timestamp,Int64(120000)), a@0 as a, b@1 as b, c@2 as c] |
| | PromRangeManipulateExec: req range=[1752591864000..1752592164000], interval=[30000], eval range=[120000], time index=[greptime_timestamp] |
| | PromSeriesNormalizeExec: offset=[0], time index=[greptime_timestamp], filter NaN: [true] |
| | PromSeriesDivideExec: tags=["a", "b", "c", "d"] |
| | SortExec: expr=[a@0 ASC, b@1 ASC, c@2 ASC, d@3 ASC, greptime_timestamp@4 ASC], preserve_partitioning=[true] |
| | MergeScanExec: REDACTED
| | SortExec: expr=[a@0 ASC NULLS LAST, b@1 ASC NULLS LAST, c@2 ASC NULLS LAST, greptime_timestamp@3 ASC NULLS LAST], preserve_partitioning=[true] |
| | CooperativeExec |
| | SortExec: expr=[a@0 ASC NULLS LAST, b@1 ASC NULLS LAST, c@2 ASC NULLS LAST, greptime_timestamp@3 ASC NULLS LAST], preserve_partitioning=[true] |
| | CooperativeExec |
| | MergeScanExec: REDACTED
| | |
+---------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| | |
+---------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
-- SQLNESS REPLACE (metrics.*) REDACTED
-- SQLNESS REPLACE (RoundRobinBatch.*) REDACTED

View File

@@ -314,6 +314,85 @@ SHOW CREATE DATABASE alter_database;
| | ) |
+----------------+----------------------------------------------+
-- Test sst_format option
ALTER DATABASE alter_database SET 'sst_format'='flat';
Affected Rows: 0
SHOW CREATE DATABASE alter_database;
+----------------+----------------------------------------------+
| Database | Create Database |
+----------------+----------------------------------------------+
| alter_database | CREATE DATABASE IF NOT EXISTS alter_database |
| | WITH( |
| | 'compaction.twcs.time_window' = '30m', |
| | 'compaction.type' = 'twcs', |
| | sst_format = 'flat' |
| | ) |
+----------------+----------------------------------------------+
USE alter_database;
Affected Rows: 0
CREATE TABLE monitor(ts TIMESTAMP TIME INDEX);
Affected Rows: 0
SHOW CREATE TABLE monitor;
+---------+----------------------------------------+
| Table | Create Table |
+---------+----------------------------------------+
| monitor | CREATE TABLE IF NOT EXISTS "monitor" ( |
| | "ts" TIMESTAMP(3) NOT NULL, |
| | TIME INDEX ("ts") |
| | ) |
| | |
| | ENGINE=mito |
| | WITH( |
| | sst_format = 'flat' |
| | ) |
+---------+----------------------------------------+
USE public;
Affected Rows: 0
ALTER DATABASE alter_database SET 'sst_format'='primary_key';
Affected Rows: 0
SHOW CREATE DATABASE alter_database;
+----------------+----------------------------------------------+
| Database | Create Database |
+----------------+----------------------------------------------+
| alter_database | CREATE DATABASE IF NOT EXISTS alter_database |
| | WITH( |
| | 'compaction.twcs.time_window' = '30m', |
| | 'compaction.type' = 'twcs', |
| | sst_format = 'primary_key' |
| | ) |
+----------------+----------------------------------------------+
ALTER DATABASE alter_database UNSET 'sst_format';
Affected Rows: 0
SHOW CREATE DATABASE alter_database;
+----------------+----------------------------------------------+
| Database | Create Database |
+----------------+----------------------------------------------+
| alter_database | CREATE DATABASE IF NOT EXISTS alter_database |
| | WITH( |
| | 'compaction.twcs.time_window' = '30m', |
| | 'compaction.type' = 'twcs' |
| | ) |
+----------------+----------------------------------------------+
DROP DATABASE alter_database;
Affected Rows: 0

View File

@@ -90,5 +90,25 @@ ALTER DATABASE alter_database UNSET 'ttl';
SHOW CREATE DATABASE alter_database;
DROP DATABASE alter_database;
-- Test sst_format option
ALTER DATABASE alter_database SET 'sst_format'='flat';
SHOW CREATE DATABASE alter_database;
USE alter_database;
CREATE TABLE monitor(ts TIMESTAMP TIME INDEX);
SHOW CREATE TABLE monitor;
USE public;
ALTER DATABASE alter_database SET 'sst_format'='primary_key';
SHOW CREATE DATABASE alter_database;
ALTER DATABASE alter_database UNSET 'sst_format';
SHOW CREATE DATABASE alter_database;
DROP DATABASE alter_database;

View File

@@ -0,0 +1,106 @@
CREATE TABLE phy (
t TIMESTAMP TIME INDEX,
v DOUBLE
) ENGINE=metric WITH ("physical_metric_table" = "");
Affected Rows: 0
CREATE TABLE metric_a (
l1 STRING NULL,
l2 STRING NULL,
l3 STRING NULL,
l4 STRING NULL,
l5 STRING NULL,
t TIMESTAMP NOT NULL,
v DOUBLE NULL,
TIME INDEX (t),
PRIMARY KEY (l1, l2, l3, l4, l5)
) ENGINE=metric WITH (on_physical_table = 'phy');
Affected Rows: 0
CREATE TABLE metric_b (
l6 STRING NULL,
l1 STRING NULL,
l2 STRING NULL,
l3 STRING NULL,
l4 STRING NULL,
t TIMESTAMP NOT NULL,
v DOUBLE NULL,
TIME INDEX (t),
PRIMARY KEY (l6, l1, l2, l3, l4)
) ENGINE=metric WITH (on_physical_table = 'phy');
Affected Rows: 0
INSERT INTO metric_a (l1, l2, l3, l4, l5, t, v) VALUES
('v1', 'v2', 'v3', 'v4a', 'v5a', 1, 0),
('v1', 'v2', 'v3', 'v4a', 'v5a', 180000, 120),
('v1', 'v2', 'v3', 'v4a', 'v5a', 360000, 240),
('v1', 'v2', 'v3', 'v4a', 'v5b', 1, 0),
('v1', 'v2', 'v3', 'v4a', 'v5b', 180000, 30),
('v1', 'v2', 'v3', 'v4a', 'v5b', 360000, 60),
('v1', 'v2', 'v3-b', 'v4b', 'v5c', 1, 0),
('v1', 'v2', 'v3-b', 'v4b', 'v5c', 180000, 60),
('v1', 'v2', 'v3-b', 'v4b', 'v5c', 360000, 120);
Affected Rows: 9
INSERT INTO metric_b (l6, l1, l2, l3, l4, t, v) VALUES
('v6', 'v1', 'v2', 'v3', 'v4a', 1, 1),
('v6', 'v1', 'v2', 'v3', 'v4a', 180000, 1),
('v6', 'v1', 'v2', 'v3', 'v4a', 360000, 1),
('v6', 'v1', 'v2', 'v3-b', 'v4b', 1, 2),
('v6', 'v1', 'v2', 'v3-b', 'v4b', 180000, 2),
('v6', 'v1', 'v2', 'v3-b', 'v4b', 360000, 2);
Affected Rows: 6
-- SQLNESS SORT_RESULT 3 1
TQL EVAL (180, 360, '180s') count(((rate(metric_a{l1="v1",l2="v2",l3=~"v3(|-a|-b)"}[3m]) / on(l3,l4) group_left metric_b{l6="v6",l1="v1",l2="v2",l3=~"v3(|-a|-b)"}) > 0.50));
+---------------------+-------------------------------------------------------------------+
| t | count(metric_a.prom_rate(t_range,v,t,Int64(180000)) / metric_b.v) |
+---------------------+-------------------------------------------------------------------+
| 1970-01-01T00:03:00 | 1 |
+---------------------+-------------------------------------------------------------------+
-- SQLNESS SORT_RESULT 3 1
TQL EVAL (180, 360, '180s') count(rate(metric_a{l1="v1",l2="v2",l3=~"v3(|-a|-b)"}[3m]));
+---------------------+---------------------------------------------+
| t | count(prom_rate(t_range,v,t,Int64(180000))) |
+---------------------+---------------------------------------------+
| 1970-01-01T00:03:00 | 3 |
+---------------------+---------------------------------------------+
-- SQLNESS SORT_RESULT 3 1
TQL EVAL (180, 360, '180s') count(rate(metric_a{l1="v1",l2="v2",l3=~"v3(|-a|-b)"}[3m])) / 2;
+---------------------+----------------------------------------------------------+
| t | count(prom_rate(t_range,v,t,Int64(180000))) / Float64(2) |
+---------------------+----------------------------------------------------------+
| 1970-01-01T00:03:00 | 1.5 |
+---------------------+----------------------------------------------------------+
-- SQLNESS SORT_RESULT 3 1
TQL EVAL (180, 360, '180s') (count(((rate(metric_a{l1="v1",l2="v2",l3=~"v3(|-a|-b)"}[3m]) / on(l3,l4) group_left metric_b{l6="v6",l1="v1",l2="v2",l3=~"v3(|-a|-b)"}) > 0.50)) / count(rate(metric_a{l1="v1",l2="v2",l3=~"v3(|-a|-b)"}[3m]))) * 100;
+---------------------+--------------------------------------------------------------------------------------------------------------------------------------------------+
| t | metric_b.count(metric_a.prom_rate(t_range,v,t,Int64(180000)) / metric_b.v) / metric_a.count(prom_rate(t_range,v,t,Int64(180000))) * Float64(100) |
+---------------------+--------------------------------------------------------------------------------------------------------------------------------------------------+
| 1970-01-01T00:03:00 | 33.33333333333333 |
+---------------------+--------------------------------------------------------------------------------------------------------------------------------------------------+
DROP TABLE metric_a;
Affected Rows: 0
DROP TABLE metric_b;
Affected Rows: 0
DROP TABLE phy;
Affected Rows: 0

View File

@@ -0,0 +1,63 @@
CREATE TABLE phy (
t TIMESTAMP TIME INDEX,
v DOUBLE
) ENGINE=metric WITH ("physical_metric_table" = "");
CREATE TABLE metric_a (
l1 STRING NULL,
l2 STRING NULL,
l3 STRING NULL,
l4 STRING NULL,
l5 STRING NULL,
t TIMESTAMP NOT NULL,
v DOUBLE NULL,
TIME INDEX (t),
PRIMARY KEY (l1, l2, l3, l4, l5)
) ENGINE=metric WITH (on_physical_table = 'phy');
CREATE TABLE metric_b (
l6 STRING NULL,
l1 STRING NULL,
l2 STRING NULL,
l3 STRING NULL,
l4 STRING NULL,
t TIMESTAMP NOT NULL,
v DOUBLE NULL,
TIME INDEX (t),
PRIMARY KEY (l6, l1, l2, l3, l4)
) ENGINE=metric WITH (on_physical_table = 'phy');
INSERT INTO metric_a (l1, l2, l3, l4, l5, t, v) VALUES
('v1', 'v2', 'v3', 'v4a', 'v5a', 1, 0),
('v1', 'v2', 'v3', 'v4a', 'v5a', 180000, 120),
('v1', 'v2', 'v3', 'v4a', 'v5a', 360000, 240),
('v1', 'v2', 'v3', 'v4a', 'v5b', 1, 0),
('v1', 'v2', 'v3', 'v4a', 'v5b', 180000, 30),
('v1', 'v2', 'v3', 'v4a', 'v5b', 360000, 60),
('v1', 'v2', 'v3-b', 'v4b', 'v5c', 1, 0),
('v1', 'v2', 'v3-b', 'v4b', 'v5c', 180000, 60),
('v1', 'v2', 'v3-b', 'v4b', 'v5c', 360000, 120);
INSERT INTO metric_b (l6, l1, l2, l3, l4, t, v) VALUES
('v6', 'v1', 'v2', 'v3', 'v4a', 1, 1),
('v6', 'v1', 'v2', 'v3', 'v4a', 180000, 1),
('v6', 'v1', 'v2', 'v3', 'v4a', 360000, 1),
('v6', 'v1', 'v2', 'v3-b', 'v4b', 1, 2),
('v6', 'v1', 'v2', 'v3-b', 'v4b', 180000, 2),
('v6', 'v1', 'v2', 'v3-b', 'v4b', 360000, 2);
-- SQLNESS SORT_RESULT 3 1
TQL EVAL (180, 360, '180s') count(((rate(metric_a{l1="v1",l2="v2",l3=~"v3(|-a|-b)"}[3m]) / on(l3,l4) group_left metric_b{l6="v6",l1="v1",l2="v2",l3=~"v3(|-a|-b)"}) > 0.50));
-- SQLNESS SORT_RESULT 3 1
TQL EVAL (180, 360, '180s') count(rate(metric_a{l1="v1",l2="v2",l3=~"v3(|-a|-b)"}[3m]));
-- SQLNESS SORT_RESULT 3 1
TQL EVAL (180, 360, '180s') count(rate(metric_a{l1="v1",l2="v2",l3=~"v3(|-a|-b)"}[3m])) / 2;
-- SQLNESS SORT_RESULT 3 1
TQL EVAL (180, 360, '180s') (count(((rate(metric_a{l1="v1",l2="v2",l3=~"v3(|-a|-b)"}[3m]) / on(l3,l4) group_left metric_b{l6="v6",l1="v1",l2="v2",l3=~"v3(|-a|-b)"}) > 0.50)) / count(rate(metric_a{l1="v1",l2="v2",l3=~"v3(|-a|-b)"}[3m]))) * 100;
DROP TABLE metric_a;
DROP TABLE metric_b;
DROP TABLE phy;

View File

@@ -427,8 +427,8 @@ SELECT min(val) as min_computed, max(val) as max_computed FROM computed;
| | Aggregate: groupBy=[[]], aggr=[[min(computed.val), max(computed.val)]] |
| | SubqueryAlias: computed |
| | Projection: metric.ts AS ts, val * Float64(2) + Float64(1) AS val |
| | Projection: metric.ts, val * Float64(2) + Float64(1) AS val * Float64(2) + Float64(1) |
| | Projection: metric.ts, metric.val * Float64(2) AS val * Float64(2) |
| | Projection: metric.ts, CAST(val * Float64(2) AS Float64) + Float64(1) AS val * Float64(2) + Float64(1) |
| | Projection: metric.ts, CAST(metric.val AS Float64) * Float64(2) AS val * Float64(2) |
| | PromInstantManipulate: range=[0..40000], lookback=[300000], interval=[10000], time index=[ts] |
| | PromSeriesDivide: tags=[] |
| | Filter: metric.ts >= TimestampMillisecond(-299999, None) AND metric.ts <= TimestampMillisecond(40000, None) |