From e23979df9f92ef0a31f88296fc86a46a3812708e Mon Sep 17 00:00:00 2001
From: LFC <990479+MichaelScofield@users.noreply.github.com>
Date: Thu, 10 Apr 2025 10:05:50 +0800
Subject: [PATCH 01/82] chore: un-allow clippy's "readonly_write_lock" (#5862)
---
Cargo.toml | 1 -
src/log-store/src/raft_engine/backend.rs | 6 ++++++
2 files changed, 6 insertions(+), 1 deletion(-)
diff --git a/Cargo.toml b/Cargo.toml
index 9c36a76805..38b749e7b0 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -77,7 +77,6 @@ clippy.print_stdout = "warn"
clippy.print_stderr = "warn"
clippy.dbg_macro = "warn"
clippy.implicit_clone = "warn"
-clippy.readonly_write_lock = "allow"
rust.unknown_lints = "deny"
rust.unexpected_cfgs = { level = "warn", check-cfg = ['cfg(tokio_unstable)'] }
diff --git a/src/log-store/src/raft_engine/backend.rs b/src/log-store/src/raft_engine/backend.rs
index 3d41e5298d..8d27994f8b 100644
--- a/src/log-store/src/raft_engine/backend.rs
+++ b/src/log-store/src/raft_engine/backend.rs
@@ -114,7 +114,13 @@ impl TxnService for RaftEngineBackend {
} = txn.into();
let mut succeeded = true;
+
+ // Here the write lock guards the whole "txn" against concurrent "get"s, "put"s, etc. from
+ // outside; it is not taken to mutate any Rust data, which is why the guard binding is not
+ // "mut". Suppress clippy's "readonly_write_lock" warning for this reason.
+ #[allow(clippy::readonly_write_lock)]
let engine = self.engine.write().unwrap();
+
for cmp in compare {
let existing_value = engine_get(&engine, &cmp.key)?.map(|kv| kv.value);
if !cmp.compare_value(existing_value.as_ref()) {
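A minimal, standalone sketch of the pattern the new comment above describes, assuming a hypothetical `Store` type (this is not GreptimeDB code): a write guard is acquired purely for mutual exclusion and never used for mutation, which is exactly what `clippy::readonly_write_lock` flags now that the crate-wide allow has been removed.

    use std::collections::HashMap;
    use std::sync::RwLock;

    // Hypothetical key-value store, used only to illustrate the lint.
    struct Store {
        inner: RwLock<HashMap<String, String>>,
    }

    impl Store {
        fn txn_get(&self, key: &str) -> Option<String> {
            // The write guard is taken only to serialize this call against other
            // operations on the store; nothing is mutated through it, so the
            // binding is not `mut` and the lint is silenced locally, not crate-wide.
            #[allow(clippy::readonly_write_lock)]
            let guard = self.inner.write().unwrap();
            guard.get(key).cloned()
        }
    }

    fn main() {
        let store = Store { inner: RwLock::new(HashMap::new()) };
        assert_eq!(store.txn_get("missing"), None);
    }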
From e052c65a5856226b1d0b72dfc215454720e81999 Mon Sep 17 00:00:00 2001
From: LFC <990479+MichaelScofield@users.noreply.github.com>
Date: Thu, 10 Apr 2025 14:30:29 +0800
Subject: [PATCH 02/82] chore: remove repl (#5860)
---
Cargo.lock | 1 -
src/cli/src/error.rs | 63 +-------
src/cli/src/lib.rs | 3 -
src/cli/src/repl.rs | 299 --------------------------------------
src/cmd/Cargo.toml | 1 -
src/cmd/src/error.rs | 63 +-------
src/cmd/tests/cli.rs | 148 -------------------
src/frontend/src/error.rs | 10 +-
8 files changed, 3 insertions(+), 585 deletions(-)
delete mode 100644 src/cli/src/repl.rs
delete mode 100644 src/cmd/tests/cli.rs
diff --git a/Cargo.lock b/Cargo.lock
index 188f9ce144..b19e1bb75f 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1832,7 +1832,6 @@ dependencies = [
"regex",
"reqwest",
"rexpect",
- "rustyline",
"serde",
"serde_json",
"servers",
diff --git a/src/cli/src/error.rs b/src/cli/src/error.rs
index be852e7d73..2c18531aaa 100644
--- a/src/cli/src/error.rs
+++ b/src/cli/src/error.rs
@@ -17,7 +17,6 @@ use std::any::Any;
use common_error::ext::{BoxedError, ErrorExt};
use common_error::status_code::StatusCode;
use common_macro::stack_trace_debug;
-use rustyline::error::ReadlineError;
use snafu::{Location, Snafu};
#[derive(Snafu)]
@@ -105,52 +104,6 @@ pub enum Error {
#[snafu(display("Invalid REPL command: {reason}"))]
InvalidReplCommand { reason: String },
- #[snafu(display("Cannot create REPL"))]
- ReplCreation {
- #[snafu(source)]
- error: ReadlineError,
- #[snafu(implicit)]
- location: Location,
- },
-
- #[snafu(display("Error reading command"))]
- Readline {
- #[snafu(source)]
- error: ReadlineError,
- #[snafu(implicit)]
- location: Location,
- },
-
- #[snafu(display("Failed to request database, sql: {sql}"))]
- RequestDatabase {
- sql: String,
- #[snafu(source)]
- source: client::Error,
- #[snafu(implicit)]
- location: Location,
- },
-
- #[snafu(display("Failed to collect RecordBatches"))]
- CollectRecordBatches {
- #[snafu(implicit)]
- location: Location,
- source: common_recordbatch::error::Error,
- },
-
- #[snafu(display("Failed to pretty print Recordbatches"))]
- PrettyPrintRecordBatches {
- #[snafu(implicit)]
- location: Location,
- source: common_recordbatch::error::Error,
- },
-
- #[snafu(display("Failed to start Meta client"))]
- StartMetaClient {
- #[snafu(implicit)]
- location: Location,
- source: meta_client::error::Error,
- },
-
#[snafu(display("Failed to parse SQL: {}", sql))]
ParseSql {
sql: String,
@@ -166,13 +119,6 @@ pub enum Error {
source: query::error::Error,
},
- #[snafu(display("Failed to encode logical plan in substrait"))]
- SubstraitEncodeLogicalPlan {
- #[snafu(implicit)]
- location: Location,
- source: substrait::error::Error,
- },
-
#[snafu(display("Failed to load layered config"))]
LoadLayeredConfig {
#[snafu(source(from(common_config::error::Error, Box::new)))]
@@ -318,17 +264,10 @@ impl ErrorExt for Error {
Error::StartProcedureManager { source, .. }
| Error::StopProcedureManager { source, .. } => source.status_code(),
Error::StartWalOptionsAllocator { source, .. } => source.status_code(),
- Error::ReplCreation { .. } | Error::Readline { .. } | Error::HttpQuerySql { .. } => {
- StatusCode::Internal
- }
- Error::RequestDatabase { source, .. } => source.status_code(),
- Error::CollectRecordBatches { source, .. }
- | Error::PrettyPrintRecordBatches { source, .. } => source.status_code(),
- Error::StartMetaClient { source, .. } => source.status_code(),
+ Error::HttpQuerySql { .. } => StatusCode::Internal,
Error::ParseSql { source, .. } | Error::PlanStatement { source, .. } => {
source.status_code()
}
- Error::SubstraitEncodeLogicalPlan { source, .. } => source.status_code(),
Error::SerdeJson { .. }
| Error::FileIo { .. }
diff --git a/src/cli/src/lib.rs b/src/cli/src/lib.rs
index 3991f3a666..113e88f1c1 100644
--- a/src/cli/src/lib.rs
+++ b/src/cli/src/lib.rs
@@ -23,15 +23,12 @@ mod helper;
// Wait for https://github.com/GreptimeTeam/greptimedb/issues/2373
mod database;
mod import;
-#[allow(unused)]
-mod repl;
use async_trait::async_trait;
use clap::Parser;
use common_error::ext::BoxedError;
pub use database::DatabaseClient;
use error::Result;
-pub use repl::Repl;
pub use crate::bench::BenchTableMetadataCommand;
pub use crate::export::ExportCommand;
diff --git a/src/cli/src/repl.rs b/src/cli/src/repl.rs
deleted file mode 100644
index 8b5e3aa389..0000000000
--- a/src/cli/src/repl.rs
+++ /dev/null
@@ -1,299 +0,0 @@
-// Copyright 2023 Greptime Team
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-use std::path::PathBuf;
-use std::sync::Arc;
-use std::time::Instant;
-
-use cache::{
- build_fundamental_cache_registry, with_default_composite_cache_registry, TABLE_CACHE_NAME,
- TABLE_ROUTE_CACHE_NAME,
-};
-use catalog::information_extension::DistributedInformationExtension;
-use catalog::kvbackend::{
- CachedKvBackend, CachedKvBackendBuilder, KvBackendCatalogManager, MetaKvBackend,
-};
-use client::{Client, Database, OutputData, DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME};
-use common_base::Plugins;
-use common_config::Mode;
-use common_error::ext::ErrorExt;
-use common_meta::cache::{CacheRegistryBuilder, LayeredCacheRegistryBuilder};
-use common_meta::kv_backend::KvBackendRef;
-use common_query::Output;
-use common_recordbatch::RecordBatches;
-use common_telemetry::debug;
-use either::Either;
-use meta_client::client::{ClusterKvBackend, MetaClientBuilder};
-use query::datafusion::DatafusionQueryEngine;
-use query::parser::QueryLanguageParser;
-use query::query_engine::{DefaultSerializer, QueryEngineState};
-use query::QueryEngine;
-use rustyline::error::ReadlineError;
-use rustyline::Editor;
-use session::context::QueryContext;
-use snafu::{OptionExt, ResultExt};
-use substrait::{DFLogicalSubstraitConvertor, SubstraitPlan};
-
-use crate::cmd::ReplCommand;
-use crate::error::{
- CollectRecordBatchesSnafu, ParseSqlSnafu, PlanStatementSnafu, PrettyPrintRecordBatchesSnafu,
- ReadlineSnafu, ReplCreationSnafu, RequestDatabaseSnafu, Result, StartMetaClientSnafu,
- SubstraitEncodeLogicalPlanSnafu,
-};
-use crate::helper::RustylineHelper;
-use crate::{error, AttachCommand};
-
-/// Captures the state of the repl, gathers commands and executes them one by one
-pub struct Repl {
- /// Rustyline editor for interacting with user on command line
- rl: Editor<RustylineHelper>,
-
- /// Current prompt
- prompt: String,
-
- /// Client for interacting with GreptimeDB
- database: Database,
-
- query_engine: Option<DatafusionQueryEngine>,
-}
-
-#[allow(clippy::print_stdout)]
-impl Repl {
- fn print_help(&self) {
- println!("{}", ReplCommand::help())
- }
-
- pub(crate) async fn try_new(cmd: &AttachCommand) -> Result<Self> {
- let mut rl = Editor::new().context(ReplCreationSnafu)?;
-
- if !cmd.disable_helper {
- rl.set_helper(Some(RustylineHelper::default()));
-
- let history_file = history_file();
- if let Err(e) = rl.load_history(&history_file) {
- debug!(
- "failed to load history file on {}, error: {e}",
- history_file.display()
- );
- }
- }
-
- let client = Client::with_urls([&cmd.grpc_addr]);
- let database = Database::new(DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME, client);
-
- let query_engine = if let Some(meta_addr) = &cmd.meta_addr {
- create_query_engine(meta_addr).await.map(Some)?
- } else {
- None
- };
-
- Ok(Self {
- rl,
- prompt: "> ".to_string(),
- database,
- query_engine,
- })
- }
-
- /// Parse the next command
- fn next_command(&mut self) -> Result<ReplCommand> {
- match self.rl.readline(&self.prompt) {
- Ok(ref line) => {
- let request = line.trim();
-
- let _ = self.rl.add_history_entry(request.to_string());
-
- request.try_into()
- }
- Err(ReadlineError::Eof) | Err(ReadlineError::Interrupted) => Ok(ReplCommand::Exit),
- // Some sort of real underlying error
- Err(e) => Err(e).context(ReadlineSnafu),
- }
- }
-
- /// Read Evaluate Print Loop (interactive command line) for GreptimeDB
- ///
- /// Inspired / based on repl.rs from InfluxDB IOX
- pub(crate) async fn run(&mut self) -> Result<()> {
- println!("Ready for commands. (Hint: try 'help')");
-
- loop {
- match self.next_command()? {
- ReplCommand::Help => {
- self.print_help();
- }
- ReplCommand::UseDatabase { db_name } => {
- if self.execute_sql(format!("USE {db_name}")).await {
- println!("Using {db_name}");
- self.database.set_schema(&db_name);
- self.prompt = format!("[{db_name}] > ");
- }
- }
- ReplCommand::Sql { sql } => {
- let _ = self.execute_sql(sql).await;
- }
- ReplCommand::Exit => {
- return Ok(());
- }
- }
- }
- }
-
- async fn execute_sql(&self, sql: String) -> bool {
- self.do_execute_sql(sql)
- .await
- .map_err(|e| {
- let status_code = e.status_code();
- let root_cause = e.output_msg();
- println!("Error: {}({status_code}), {root_cause}", status_code as u32)
- })
- .is_ok()
- }
-
- async fn do_execute_sql(&self, sql: String) -> Result<()> {
- let start = Instant::now();
-
- let output = if let Some(query_engine) = &self.query_engine {
- let query_ctx = Arc::new(QueryContext::with(
- self.database.catalog(),
- self.database.schema(),
- ));
-
- let stmt = QueryLanguageParser::parse_sql(&sql, &query_ctx)
- .with_context(|_| ParseSqlSnafu { sql: sql.clone() })?;
-
- let plan = query_engine
- .planner()
- .plan(&stmt, query_ctx.clone())
- .await
- .context(PlanStatementSnafu)?;
-
- let plan = query_engine
- .optimize(&query_engine.engine_context(query_ctx), &plan)
- .context(PlanStatementSnafu)?;
-
- let plan = DFLogicalSubstraitConvertor {}
- .encode(&plan, DefaultSerializer)
- .context(SubstraitEncodeLogicalPlanSnafu)?;
-
- self.database.logical_plan(plan.to_vec()).await
- } else {
- self.database.sql(&sql).await
- }
- .context(RequestDatabaseSnafu { sql: &sql })?;
-
- let either = match output.data {
- OutputData::Stream(s) => {
- let x = RecordBatches::try_collect(s)
- .await
- .context(CollectRecordBatchesSnafu)?;
- Either::Left(x)
- }
- OutputData::RecordBatches(x) => Either::Left(x),
- OutputData::AffectedRows(rows) => Either::Right(rows),
- };
-
- let end = Instant::now();
-
- match either {
- Either::Left(recordbatches) => {
- let total_rows: usize = recordbatches.iter().map(|x| x.num_rows()).sum();
- if total_rows > 0 {
- println!(
- "{}",
- recordbatches
- .pretty_print()
- .context(PrettyPrintRecordBatchesSnafu)?
- );
- }
- println!("Total Rows: {total_rows}")
- }
- Either::Right(rows) => println!("Affected Rows: {rows}"),
- };
-
- println!("Cost {} ms", (end - start).as_millis());
- Ok(())
- }
-}
-
-impl Drop for Repl {
- fn drop(&mut self) {
- if self.rl.helper().is_some() {
- let history_file = history_file();
- if let Err(e) = self.rl.save_history(&history_file) {
- debug!(
- "failed to save history file on {}, error: {e}",
- history_file.display()
- );
- }
- }
- }
-}
-
-/// Return the location of the history file (defaults to $HOME/".greptimedb_cli_history")
-fn history_file() -> PathBuf {
- let mut buf = match std::env::var("HOME") {
- Ok(home) => PathBuf::from(home),
- Err(_) => PathBuf::new(),
- };
- buf.push(".greptimedb_cli_history");
- buf
-}
-
-async fn create_query_engine(meta_addr: &str) -> Result<DatafusionQueryEngine> {
- let mut meta_client = MetaClientBuilder::default().enable_store().build();
- meta_client
- .start([meta_addr])
- .await
- .context(StartMetaClientSnafu)?;
- let meta_client = Arc::new(meta_client);
-
- let cached_meta_backend = Arc::new(
- CachedKvBackendBuilder::new(Arc::new(MetaKvBackend::new(meta_client.clone()))).build(),
- );
- let layered_cache_builder = LayeredCacheRegistryBuilder::default().add_cache_registry(
- CacheRegistryBuilder::default()
- .add_cache(cached_meta_backend.clone())
- .build(),
- );
- let fundamental_cache_registry =
- build_fundamental_cache_registry(Arc::new(MetaKvBackend::new(meta_client.clone())));
- let layered_cache_registry = Arc::new(
- with_default_composite_cache_registry(
- layered_cache_builder.add_cache_registry(fundamental_cache_registry),
- )
- .context(error::BuildCacheRegistrySnafu)?
- .build(),
- );
-
- let information_extension = Arc::new(DistributedInformationExtension::new(meta_client.clone()));
- let catalog_manager = KvBackendCatalogManager::new(
- information_extension,
- cached_meta_backend.clone(),
- layered_cache_registry,
- None,
- );
- let plugins: Plugins = Default::default();
- let state = Arc::new(QueryEngineState::new(
- catalog_manager,
- None,
- None,
- None,
- None,
- false,
- plugins.clone(),
- ));
-
- Ok(DatafusionQueryEngine::new(state, plugins))
-}
diff --git a/src/cmd/Cargo.toml b/src/cmd/Cargo.toml
index c3328fbc8d..b3ffd479a6 100644
--- a/src/cmd/Cargo.toml
+++ b/src/cmd/Cargo.toml
@@ -68,7 +68,6 @@ query.workspace = true
rand.workspace = true
regex.workspace = true
reqwest.workspace = true
-rustyline = "10.1"
serde.workspace = true
serde_json.workspace = true
servers.workspace = true
diff --git a/src/cmd/src/error.rs b/src/cmd/src/error.rs
index 8697710985..a671290503 100644
--- a/src/cmd/src/error.rs
+++ b/src/cmd/src/error.rs
@@ -17,7 +17,6 @@ use std::any::Any;
use common_error::ext::{BoxedError, ErrorExt};
use common_error::status_code::StatusCode;
use common_macro::stack_trace_debug;
-use rustyline::error::ReadlineError;
use snafu::{Location, Snafu};
#[derive(Snafu)]
@@ -181,52 +180,6 @@ pub enum Error {
#[snafu(display("Invalid REPL command: {reason}"))]
InvalidReplCommand { reason: String },
- #[snafu(display("Cannot create REPL"))]
- ReplCreation {
- #[snafu(source)]
- error: ReadlineError,
- #[snafu(implicit)]
- location: Location,
- },
-
- #[snafu(display("Error reading command"))]
- Readline {
- #[snafu(source)]
- error: ReadlineError,
- #[snafu(implicit)]
- location: Location,
- },
-
- #[snafu(display("Failed to request database, sql: {sql}"))]
- RequestDatabase {
- sql: String,
- #[snafu(source)]
- source: client::Error,
- #[snafu(implicit)]
- location: Location,
- },
-
- #[snafu(display("Failed to collect RecordBatches"))]
- CollectRecordBatches {
- #[snafu(implicit)]
- location: Location,
- source: common_recordbatch::error::Error,
- },
-
- #[snafu(display("Failed to pretty print Recordbatches"))]
- PrettyPrintRecordBatches {
- #[snafu(implicit)]
- location: Location,
- source: common_recordbatch::error::Error,
- },
-
- #[snafu(display("Failed to start Meta client"))]
- StartMetaClient {
- #[snafu(implicit)]
- location: Location,
- source: meta_client::error::Error,
- },
-
#[snafu(display("Failed to parse SQL: {}", sql))]
ParseSql {
sql: String,
@@ -242,13 +195,6 @@ pub enum Error {
source: query::error::Error,
},
- #[snafu(display("Failed to encode logical plan in substrait"))]
- SubstraitEncodeLogicalPlan {
- #[snafu(implicit)]
- location: Location,
- source: substrait::error::Error,
- },
-
#[snafu(display("Failed to load layered config"))]
LoadLayeredConfig {
#[snafu(source(from(common_config::error::Error, Box::new)))]
@@ -395,17 +341,10 @@ impl ErrorExt for Error {
| Error::StopProcedureManager { source, .. } => source.status_code(),
Error::BuildWalOptionsAllocator { source, .. }
| Error::StartWalOptionsAllocator { source, .. } => source.status_code(),
- Error::ReplCreation { .. } | Error::Readline { .. } | Error::HttpQuerySql { .. } => {
- StatusCode::Internal
- }
- Error::RequestDatabase { source, .. } => source.status_code(),
- Error::CollectRecordBatches { source, .. }
- | Error::PrettyPrintRecordBatches { source, .. } => source.status_code(),
- Error::StartMetaClient { source, .. } => source.status_code(),
+ Error::HttpQuerySql { .. } => StatusCode::Internal,
Error::ParseSql { source, .. } | Error::PlanStatement { source, .. } => {
source.status_code()
}
- Error::SubstraitEncodeLogicalPlan { source, .. } => source.status_code(),
Error::SerdeJson { .. }
| Error::FileIo { .. }
diff --git a/src/cmd/tests/cli.rs b/src/cmd/tests/cli.rs
deleted file mode 100644
index dfea9afc3e..0000000000
--- a/src/cmd/tests/cli.rs
+++ /dev/null
@@ -1,148 +0,0 @@
-// Copyright 2023 Greptime Team
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#[cfg(target_os = "macos")]
-mod tests {
- use std::path::PathBuf;
- use std::process::{Command, Stdio};
- use std::time::Duration;
-
- use common_test_util::temp_dir::create_temp_dir;
- use rexpect::session::PtyReplSession;
-
- struct Repl {
- repl: PtyReplSession,
- }
-
- impl Repl {
- fn send_line(&mut self, line: &str) {
- let _ = self.repl.send_line(line).unwrap();
-
- // read a line to consume the prompt
- let _ = self.read_line();
- }
-
- fn read_line(&mut self) -> String {
- self.repl.read_line().unwrap()
- }
-
- fn read_expect(&mut self, expect: &str) {
- assert_eq!(self.read_line(), expect);
- }
-
- fn read_contains(&mut self, pat: &str) {
- assert!(self.read_line().contains(pat));
- }
- }
-
- // TODO(LFC): Un-ignore this REPL test.
- // Ignore this REPL test because some logical plans like create database are not supported yet in Datanode.
- #[ignore]
- #[test]
- fn test_repl() {
- let data_home = create_temp_dir("data");
- let wal_dir = create_temp_dir("wal");
-
- let mut bin_path = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
- bin_path.push("../../target/debug");
- let bin_path = bin_path.to_str().unwrap();
-
- let mut datanode = Command::new("./greptime")
- .current_dir(bin_path)
- .args([
- "datanode",
- "start",
- "--rpc-bind-addr=0.0.0.0:4321",
- "--node-id=1",
- &format!("--data-home={}", data_home.path().display()),
- &format!("--wal-dir={}", wal_dir.path().display()),
- ])
- .stdout(Stdio::null())
- .spawn()
- .unwrap();
-
- // wait for Datanode actually started
- std::thread::sleep(Duration::from_secs(3));
-
- let mut repl_cmd = Command::new("./greptime");
- let _ = repl_cmd.current_dir(bin_path).args([
- "--log-level=off",
- "cli",
- "attach",
- "--grpc-bind-addr=0.0.0.0:4321",
- // history commands can sneaky into stdout and mess up our tests, so disable it
- "--disable-helper",
- ]);
- let pty_session = rexpect::session::spawn_command(repl_cmd, Some(5_000)).unwrap();
- let repl = PtyReplSession {
- prompt: "> ".to_string(),
- pty_session,
- quit_command: None,
- echo_on: false,
- };
- let repl = &mut Repl { repl };
- repl.read_expect("Ready for commands. (Hint: try 'help')");
-
- test_create_database(repl);
-
- test_use_database(repl);
-
- test_create_table(repl);
-
- test_insert(repl);
-
- test_select(repl);
-
- datanode.kill().unwrap();
- let _ = datanode.wait().unwrap();
- }
-
- fn test_create_database(repl: &mut Repl) {
- repl.send_line("CREATE DATABASE db;");
- repl.read_expect("Affected Rows: 1");
- repl.read_contains("Cost");
- }
-
- fn test_use_database(repl: &mut Repl) {
- repl.send_line("USE db");
- repl.read_expect("Total Rows: 0");
- repl.read_contains("Cost");
- repl.read_expect("Using db");
- }
-
- fn test_create_table(repl: &mut Repl) {
- repl.send_line("CREATE TABLE t(x STRING, ts TIMESTAMP TIME INDEX);");
- repl.read_expect("Affected Rows: 0");
- repl.read_contains("Cost");
- }
-
- fn test_insert(repl: &mut Repl) {
- repl.send_line("INSERT INTO t(x, ts) VALUES ('hello', 1676895812239);");
- repl.read_expect("Affected Rows: 1");
- repl.read_contains("Cost");
- }
-
- fn test_select(repl: &mut Repl) {
- repl.send_line("SELECT * FROM t;");
-
- repl.read_expect("+-------+-------------------------+");
- repl.read_expect("| x | ts |");
- repl.read_expect("+-------+-------------------------+");
- repl.read_expect("| hello | 2023-02-20T12:23:32.239 |");
- repl.read_expect("+-------+-------------------------+");
- repl.read_expect("Total Rows: 1");
-
- repl.read_contains("Cost");
- }
-}
diff --git a/src/frontend/src/error.rs b/src/frontend/src/error.rs
index 7d599cb0ce..99edbdbc62 100644
--- a/src/frontend/src/error.rs
+++ b/src/frontend/src/error.rs
@@ -128,13 +128,6 @@ pub enum Error {
source: catalog::error::Error,
},
- #[snafu(display("Failed to start Meta client"))]
- StartMetaClient {
- #[snafu(implicit)]
- location: Location,
- source: meta_client::error::Error,
- },
-
#[snafu(display("Failed to create heartbeat stream to Metasrv"))]
CreateMetaHeartbeatStream {
source: meta_client::error::Error,
@@ -415,8 +408,7 @@ impl ErrorExt for Error {
Error::Catalog { source, .. } => source.status_code(),
- Error::StartMetaClient { source, .. }
- | Error::CreateMetaHeartbeatStream { source, .. } => source.status_code(),
+ Error::CreateMetaHeartbeatStream { source, .. } => source.status_code(),
Error::PlanStatement { source, .. }
| Error::ReadTable { source, .. }
From 54ef29f3949f2ceed2bd8f401498a18c0ca9fe1e Mon Sep 17 00:00:00 2001
From: Weny Xu
Date: Thu, 10 Apr 2025 14:55:46 +0800
Subject: [PATCH 03/82] feat: add `catalog_manager` to
`ProcedureServiceHandler` (#5873)
---
Cargo.lock | 1 +
src/common/function/Cargo.toml | 1 +
src/common/function/src/handlers.rs | 4 ++++
src/common/function/src/state.rs | 5 +++++
src/frontend/src/instance/builder.rs | 1 +
src/operator/src/procedure.rs | 16 ++++++++++++++--
6 files changed, 26 insertions(+), 2 deletions(-)
diff --git a/Cargo.lock b/Cargo.lock
index b19e1bb75f..cba1fa8793 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -2014,6 +2014,7 @@ dependencies = [
"arc-swap",
"async-trait",
"bincode",
+ "catalog",
"chrono",
"common-base",
"common-catalog",
diff --git a/src/common/function/Cargo.toml b/src/common/function/Cargo.toml
index 7a4c968a3e..73821a896a 100644
--- a/src/common/function/Cargo.toml
+++ b/src/common/function/Cargo.toml
@@ -18,6 +18,7 @@ api.workspace = true
arc-swap = "1.0"
async-trait.workspace = true
bincode = "1.3"
+catalog.workspace = true
chrono.workspace = true
common-base.workspace = true
common-catalog.workspace = true
diff --git a/src/common/function/src/handlers.rs b/src/common/function/src/handlers.rs
index 1d994731d5..bcb6ce5460 100644
--- a/src/common/function/src/handlers.rs
+++ b/src/common/function/src/handlers.rs
@@ -15,6 +15,7 @@
use std::sync::Arc;
use async_trait::async_trait;
+use catalog::CatalogManagerRef;
use common_base::AffectedRows;
use common_meta::rpc::procedure::{
AddRegionFollowerRequest, MigrateRegionRequest, ProcedureStateResponse,
@@ -72,6 +73,9 @@ pub trait ProcedureServiceHandler: Send + Sync {
/// Remove a region follower from a region.
async fn remove_region_follower(&self, request: RemoveRegionFollowerRequest) -> Result<()>;
+
+ /// Get the catalog manager
+ fn catalog_manager(&self) -> &CatalogManagerRef;
}
/// This flow service handler is only use for flush flow for now.
diff --git a/src/common/function/src/state.rs b/src/common/function/src/state.rs
index 66f5463fa2..211f7e1438 100644
--- a/src/common/function/src/state.rs
+++ b/src/common/function/src/state.rs
@@ -34,6 +34,7 @@ impl FunctionState {
use api::v1::meta::ProcedureStatus;
use async_trait::async_trait;
+ use catalog::CatalogManagerRef;
use common_base::AffectedRows;
use common_meta::rpc::procedure::{
AddRegionFollowerRequest, MigrateRegionRequest, ProcedureStateResponse,
@@ -80,6 +81,10 @@ impl FunctionState {
) -> Result<()> {
Ok(())
}
+
+ fn catalog_manager(&self) -> &CatalogManagerRef {
+ unimplemented!()
+ }
}
#[async_trait]
diff --git a/src/frontend/src/instance/builder.rs b/src/frontend/src/instance/builder.rs
index 52b2463503..8503999b2c 100644
--- a/src/frontend/src/instance/builder.rs
+++ b/src/frontend/src/instance/builder.rs
@@ -152,6 +152,7 @@ impl FrontendBuilder {
let procedure_service_handler = Arc::new(ProcedureServiceOperator::new(
self.procedure_executor.clone(),
+ self.catalog_manager.clone(),
));
let flow_metadata_manager = Arc::new(FlowMetadataManager::new(kv_backend.clone()));
diff --git a/src/operator/src/procedure.rs b/src/operator/src/procedure.rs
index e2c27c024f..87f805acb1 100644
--- a/src/operator/src/procedure.rs
+++ b/src/operator/src/procedure.rs
@@ -13,6 +13,7 @@
// limitations under the License.
use async_trait::async_trait;
+use catalog::CatalogManagerRef;
use common_error::ext::BoxedError;
use common_function::handlers::ProcedureServiceHandler;
use common_meta::ddl::{ExecutorContext, ProcedureExecutorRef};
@@ -28,11 +29,18 @@ use snafu::ResultExt;
#[derive(Clone)]
pub struct ProcedureServiceOperator {
procedure_executor: ProcedureExecutorRef,
+ catalog_manager: CatalogManagerRef,
}
impl ProcedureServiceOperator {
- pub fn new(procedure_executor: ProcedureExecutorRef) -> Self {
- Self { procedure_executor }
+ pub fn new(
+ procedure_executor: ProcedureExecutorRef,
+ catalog_manager: CatalogManagerRef,
+ ) -> Self {
+ Self {
+ procedure_executor,
+ catalog_manager,
+ }
}
}
@@ -75,4 +83,8 @@ impl ProcedureServiceHandler for ProcedureServiceOperator {
.map_err(BoxedError::new)
.context(query_error::ProcedureServiceSnafu)
}
+
+ fn catalog_manager(&self) -> &CatalogManagerRef {
+ &self.catalog_manager
+ }
}
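Reduced to its essentials, the change above is an accessor-delegation pattern: the operator owns a shared handle and the handler trait exposes it by reference, so callers clone the `Arc` only when they need to keep it. A standalone sketch with stand-in types (the definitions here are illustrative, not the real `catalog` / `common_function` items):

    use std::sync::Arc;

    // Stand-ins for catalog::CatalogManager and its Arc'd handle.
    trait CatalogManager: Send + Sync {}
    type CatalogManagerRef = Arc<dyn CatalogManager>;

    trait ProcedureServiceHandler: Send + Sync {
        /// Get the catalog manager.
        fn catalog_manager(&self) -> &CatalogManagerRef;
    }

    struct ProcedureServiceOperator {
        catalog_manager: CatalogManagerRef,
    }

    impl ProcedureServiceHandler for ProcedureServiceOperator {
        fn catalog_manager(&self) -> &CatalogManagerRef {
            // Borrowed handle: callers clone the Arc only if they must hold it
            // across an await point or store it elsewhere.
            &self.catalog_manager
        }
    }

    struct NoopCatalogManager;
    impl CatalogManager for NoopCatalogManager {}

    fn main() {
        let op = ProcedureServiceOperator {
            catalog_manager: Arc::new(NoopCatalogManager),
        };
        let _shared: CatalogManagerRef = op.catalog_manager().clone();
    }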
From dce5e35d7c3c2908bfa251c5db26abe8caec9f22 Mon Sep 17 00:00:00 2001
From: Zhenchi
Date: Thu, 10 Apr 2025 15:32:15 +0800
Subject: [PATCH 04/82] feat: apply terms with fulltext tantivy backend (#5869)
* feat: apply terms with fulltext tantivy backend
Signed-off-by: Zhenchi
* fix test
Signed-off-by: Zhenchi
* address comments
Signed-off-by: Zhenchi
---------
Signed-off-by: Zhenchi
---
src/index/src/fulltext_index.rs | 41 ++-
.../src/fulltext_index/create/bloom_filter.rs | 4 +-
.../src/fulltext_index/create/tantivy.rs | 62 ++--
.../src/fulltext_index/search/tantivy.rs | 6 +-
src/index/src/fulltext_index/tests.rs | 43 ++-
.../src/sst/index/fulltext_index/applier.rs | 36 ++-
.../index/fulltext_index/applier/builder.rs | 115 ++++++-
.../src/sst/index/fulltext_index/creator.rs | 299 ++++++++++++++----
src/mito2/src/sst/index/puffin_manager.rs | 4 +-
src/puffin/src/puffin_manager.rs | 75 +++--
.../fs_puffin_manager/reader.rs | 84 ++---
.../fs_puffin_manager/writer.rs | 10 +-
src/puffin/src/puffin_manager/tests.rs | 8 +-
13 files changed, 591 insertions(+), 196 deletions(-)
diff --git a/src/index/src/fulltext_index.rs b/src/index/src/fulltext_index.rs
index 3a7f58c8ab..4cbbbdf477 100644
--- a/src/index/src/fulltext_index.rs
+++ b/src/index/src/fulltext_index.rs
@@ -12,18 +12,25 @@
// See the License for the specific language governing permissions and
// limitations under the License.
+use puffin::blob_metadata::BlobMetadata;
use serde::{Deserialize, Serialize};
-
+use snafu::ResultExt;
+use tantivy::tokenizer::{LowerCaser, SimpleTokenizer, TextAnalyzer, TokenizerManager};
+use tantivy_jieba::JiebaTokenizer;
pub mod create;
pub mod error;
pub mod search;
pub mod tokenizer;
+pub const KEY_FULLTEXT_CONFIG: &str = "fulltext_config";
+
+use crate::fulltext_index::error::{DeserializeFromJsonSnafu, Result};
+
#[cfg(test)]
mod tests;
/// Configuration for fulltext index.
-#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize, Default)]
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize, Default)]
pub struct Config {
/// Analyzer to use for tokenization.
pub analyzer: Analyzer,
@@ -33,10 +40,38 @@ pub struct Config {
}
/// Analyzer to use for tokenization.
-#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize, Default)]
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize, Default)]
pub enum Analyzer {
#[default]
English,
Chinese,
}
+
+impl Config {
+ fn build_tantivy_tokenizer(&self) -> TokenizerManager {
+ let mut builder = match self.analyzer {
+ Analyzer::English => TextAnalyzer::builder(SimpleTokenizer::default()).dynamic(),
+ Analyzer::Chinese => TextAnalyzer::builder(JiebaTokenizer {}).dynamic(),
+ };
+
+ if !self.case_sensitive {
+ builder = builder.filter_dynamic(LowerCaser);
+ }
+
+ let tokenizer = builder.build();
+ let tokenizer_manager = TokenizerManager::new();
+ tokenizer_manager.register("default", tokenizer);
+ tokenizer_manager
+ }
+
+ /// Extracts the fulltext index configuration from the blob metadata.
+ pub fn from_blob_metadata(metadata: &BlobMetadata) -> Result<Self> {
+ if let Some(config) = metadata.properties.get(KEY_FULLTEXT_CONFIG) {
+ let config = serde_json::from_str(config).context(DeserializeFromJsonSnafu)?;
+ return Ok(config);
+ }
+
+ Ok(Self::default())
+ }
+}
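A standalone sketch of the round trip the hunks in this patch set up: the index creator serializes `Config` as JSON under the `fulltext_config` property when writing the puffin blob, and the searcher later restores it from the blob metadata, falling back to the default when the property is missing. Types are reduced to a plain string map here; `write_props`/`read_config` are illustrative names, not the puffin API.

    use std::collections::HashMap;

    use serde::{Deserialize, Serialize};

    #[derive(Debug, Default, PartialEq, Serialize, Deserialize)]
    struct Config {
        case_sensitive: bool,
    }

    const KEY_FULLTEXT_CONFIG: &str = "fulltext_config";

    // Writer side: store the config as a JSON property alongside the index dir.
    fn write_props(config: &Config) -> HashMap<String, String> {
        HashMap::from([(
            KEY_FULLTEXT_CONFIG.to_string(),
            serde_json::to_string(config).unwrap(),
        )])
    }

    // Reader side: restore the config, defaulting when the property is absent.
    fn read_config(props: &HashMap<String, String>) -> Config {
        props
            .get(KEY_FULLTEXT_CONFIG)
            .map(|s| serde_json::from_str(s).unwrap())
            .unwrap_or_default()
    }

    fn main() {
        let config = Config { case_sensitive: true };
        assert_eq!(read_config(&write_props(&config)), config);
    }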
diff --git a/src/index/src/fulltext_index/create/bloom_filter.rs b/src/index/src/fulltext_index/create/bloom_filter.rs
index 970f89d65d..127464db71 100644
--- a/src/index/src/fulltext_index/create/bloom_filter.rs
+++ b/src/index/src/fulltext_index/create/bloom_filter.rs
@@ -30,12 +30,10 @@ use crate::fulltext_index::error::{
SerializeToJsonSnafu,
};
use crate::fulltext_index::tokenizer::{Analyzer, ChineseTokenizer, EnglishTokenizer};
-use crate::fulltext_index::Config;
+use crate::fulltext_index::{Config, KEY_FULLTEXT_CONFIG};
const PIPE_BUFFER_SIZE_FOR_SENDING_BLOB: usize = 8192;
-pub const KEY_FULLTEXT_CONFIG: &str = "fulltext_config";
-
/// `BloomFilterFulltextIndexCreator` is for creating a fulltext index using a bloom filter.
pub struct BloomFilterFulltextIndexCreator {
inner: Option,
diff --git a/src/index/src/fulltext_index/create/tantivy.rs b/src/index/src/fulltext_index/create/tantivy.rs
index 6b09c1f0fb..274fea596e 100644
--- a/src/index/src/fulltext_index/create/tantivy.rs
+++ b/src/index/src/fulltext_index/create/tantivy.rs
@@ -12,6 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
+use std::collections::HashMap;
use std::path::{Path, PathBuf};
use async_trait::async_trait;
@@ -21,15 +22,13 @@ use snafu::{OptionExt, ResultExt};
use tantivy::indexer::NoMergePolicy;
use tantivy::schema::{Schema, STORED, TEXT};
use tantivy::store::{Compressor, ZstdCompressor};
-use tantivy::tokenizer::{LowerCaser, SimpleTokenizer, TextAnalyzer, TokenizerManager};
use tantivy::{doc, Index, IndexWriter};
-use tantivy_jieba::JiebaTokenizer;
use crate::fulltext_index::create::FulltextIndexCreator;
use crate::fulltext_index::error::{
- ExternalSnafu, FinishedSnafu, IoSnafu, JoinSnafu, Result, TantivySnafu,
+ ExternalSnafu, FinishedSnafu, IoSnafu, JoinSnafu, Result, SerializeToJsonSnafu, TantivySnafu,
};
-use crate::fulltext_index::{Analyzer, Config};
+use crate::fulltext_index::{Config, KEY_FULLTEXT_CONFIG};
pub const TEXT_FIELD_NAME: &str = "greptime_fulltext_text";
pub const ROWID_FIELD_NAME: &str = "greptime_fulltext_rowid";
@@ -50,6 +49,9 @@ pub struct TantivyFulltextIndexCreator {
/// The directory path in filesystem to store the index.
path: PathBuf,
+
+ /// The configuration of the fulltext index.
+ config: Config,
}
impl TantivyFulltextIndexCreator {
@@ -68,7 +70,7 @@ impl TantivyFulltextIndexCreator {
let mut index = Index::create_in_dir(&path, schema).context(TantivySnafu)?;
index.settings_mut().docstore_compression = Compressor::Zstd(ZstdCompressor::default());
- index.set_tokenizers(Self::build_tokenizer(&config));
+ index.set_tokenizers(config.build_tantivy_tokenizer());
let memory_limit = Self::sanitize_memory_limit(memory_limit);
@@ -84,25 +86,10 @@ impl TantivyFulltextIndexCreator {
rowid_field,
max_rowid: 0,
path: path.as_ref().to_path_buf(),
+ config,
})
}
- fn build_tokenizer(config: &Config) -> TokenizerManager {
- let mut builder = match config.analyzer {
- Analyzer::English => TextAnalyzer::builder(SimpleTokenizer::default()).dynamic(),
- Analyzer::Chinese => TextAnalyzer::builder(JiebaTokenizer {}).dynamic(),
- };
-
- if !config.case_sensitive {
- builder = builder.filter_dynamic(LowerCaser);
- }
-
- let tokenizer = builder.build();
- let tokenizer_manager = TokenizerManager::new();
- tokenizer_manager.register("default", tokenizer);
- tokenizer_manager
- }
-
fn sanitize_memory_limit(memory_limit: usize) -> usize {
// Port from tantivy::indexer::index_writer::{MEMORY_BUDGET_NUM_BYTES_MIN, MEMORY_BUDGET_NUM_BYTES_MAX}
const MARGIN_IN_BYTES: usize = 1_000_000;
@@ -137,8 +124,16 @@ impl FulltextIndexCreator for TantivyFulltextIndexCreator {
.await
.context(JoinSnafu)??;
+ let property_key = KEY_FULLTEXT_CONFIG.to_string();
+ let property_value = serde_json::to_string(&self.config).context(SerializeToJsonSnafu)?;
+
puffin_writer
- .put_dir(blob_key, self.path.clone(), put_options)
+ .put_dir(
+ blob_key,
+ self.path.clone(),
+ put_options,
+ HashMap::from([(property_key, property_value)]),
+ )
.await
.map_err(BoxedError::new)
.context(ExternalSnafu)
@@ -174,6 +169,7 @@ mod tests {
use tantivy::TantivyDocument;
use super::*;
+ use crate::fulltext_index::Analyzer;
struct MockPuffinWriter;
@@ -197,6 +193,7 @@ mod tests {
_key: &str,
_dir: PathBuf,
_options: PutOptions,
+ _properties: HashMap<String, String>,
) -> puffin::error::Result<u64> {
Ok(0)
}
@@ -226,7 +223,7 @@ mod tests {
("foo", vec![3]),
("bar", vec![4]),
];
- query_and_check(temp_dir.path(), &cases).await;
+ query_and_check(temp_dir.path(), config, &cases).await;
}
}
@@ -248,9 +245,13 @@ mod tests {
("hello", vec![0u32, 2]),
("world", vec![1, 2]),
("foo", vec![3]),
+ ("Foo", vec![]),
+ ("FOO", vec![]),
("bar", vec![]),
+ ("Bar", vec![4]),
+ ("BAR", vec![]),
];
- query_and_check(temp_dir.path(), &cases).await;
+ query_and_check(temp_dir.path(), config, &cases).await;
}
}
@@ -274,7 +275,7 @@ mod tests {
("foo", vec![4]),
("bar", vec![5]),
];
- query_and_check(temp_dir.path(), &cases).await;
+ query_and_check(temp_dir.path(), config, &cases).await;
}
}
@@ -297,8 +298,12 @@ mod tests {
("世界", vec![1, 2, 3]),
("foo", vec![4]),
("bar", vec![]),
+ ("Foo", vec![]),
+ ("FOO", vec![]),
+ ("Bar", vec![5]),
+ ("BAR", vec![]),
];
- query_and_check(temp_dir.path(), &cases).await;
+ query_and_check(temp_dir.path(), config, &cases).await;
}
}
@@ -315,8 +320,9 @@ mod tests {
.unwrap();
}
- async fn query_and_check(path: &Path, cases: &[(&str, Vec<u32>)]) {
- let index = Index::open_in_dir(path).unwrap();
+ async fn query_and_check(path: &Path, config: Config, cases: &[(&str, Vec<u32>)]) {
+ let mut index = Index::open_in_dir(path).unwrap();
+ index.set_tokenizers(config.build_tantivy_tokenizer());
let reader = index.reader().unwrap();
let searcher = reader.searcher();
for (query, expected) in cases {
diff --git a/src/index/src/fulltext_index/search/tantivy.rs b/src/index/src/fulltext_index/search/tantivy.rs
index 61c87e863f..a55b599d21 100644
--- a/src/index/src/fulltext_index/search/tantivy.rs
+++ b/src/index/src/fulltext_index/search/tantivy.rs
@@ -29,6 +29,7 @@ use crate::fulltext_index::error::{
Result, TantivyDocNotFoundSnafu, TantivyParserSnafu, TantivySnafu,
};
use crate::fulltext_index::search::{FulltextIndexSearcher, RowId};
+use crate::fulltext_index::Config;
/// `TantivyFulltextIndexSearcher` is a searcher using Tantivy.
pub struct TantivyFulltextIndexSearcher {
@@ -42,10 +43,11 @@ pub struct TantivyFulltextIndexSearcher {
impl TantivyFulltextIndexSearcher {
/// Creates a new `TantivyFulltextIndexSearcher`.
- pub fn new(path: impl AsRef<Path>) -> Result<Self> {
+ pub fn new(path: impl AsRef<Path>, config: Config) -> Result<Self> {
let now = Instant::now();
- let index = Index::open_in_dir(path.as_ref()).context(TantivySnafu)?;
+ let mut index = Index::open_in_dir(path.as_ref()).context(TantivySnafu)?;
+ index.set_tokenizers(config.build_tantivy_tokenizer());
let reader = index
.reader_builder()
.reload_policy(ReloadPolicy::Manual)
diff --git a/src/index/src/fulltext_index/tests.rs b/src/index/src/fulltext_index/tests.rs
index d3491a7e9d..a2a87a645a 100644
--- a/src/index/src/fulltext_index/tests.rs
+++ b/src/index/src/fulltext_index/tests.rs
@@ -19,7 +19,7 @@ use common_test_util::temp_dir::{create_temp_dir, TempDir};
use puffin::puffin_manager::file_accessor::MockFileAccessor;
use puffin::puffin_manager::fs_puffin_manager::FsPuffinManager;
use puffin::puffin_manager::stager::BoundedStager;
-use puffin::puffin_manager::{DirGuard, PuffinManager, PuffinReader, PuffinWriter, PutOptions};
+use puffin::puffin_manager::{PuffinManager, PuffinReader, PuffinWriter, PutOptions};
use crate::fulltext_index::create::{FulltextIndexCreator, TantivyFulltextIndexCreator};
use crate::fulltext_index::search::{FulltextIndexSearcher, RowId, TantivyFulltextIndexSearcher};
@@ -61,8 +61,7 @@ async fn test_search(
prefix: &str,
config: Config,
texts: Vec<&str>,
- query: &str,
- expected: impl IntoIterator<Item = RowId>,
+ query_expected: Vec<(&str, impl IntoIterator<Item = RowId>)>,
) {
let (_staging_dir, stager) = new_bounded_stager(prefix).await;
let file_accessor = Arc::new(MockFileAccessor::new(prefix));
@@ -72,14 +71,16 @@ async fn test_search(
let blob_key = "fulltext_index".to_string();
let mut writer = puffin_manager.writer(&file_name).await.unwrap();
create_index(prefix, &mut writer, &blob_key, texts, config).await;
+ writer.finish().await.unwrap();
let reader = puffin_manager.reader(&file_name).await.unwrap();
let index_dir = reader.dir(&blob_key).await.unwrap();
- let searcher = TantivyFulltextIndexSearcher::new(index_dir.path()).unwrap();
- let results = searcher.search(query).await.unwrap();
-
- let expected = expected.into_iter().collect::<BTreeSet<_>>();
- assert_eq!(results, expected);
+ let searcher = TantivyFulltextIndexSearcher::new(index_dir.path(), config).unwrap();
+ for (query, expected) in query_expected {
+ let results = searcher.search(query).await.unwrap();
+ let expected = expected.into_iter().collect::<BTreeSet<_>>();
+ assert_eq!(results, expected);
+ }
}
#[tokio::test]
@@ -91,8 +92,7 @@ async fn test_simple_term() {
"This is a sample text containing Barack Obama",
"Another document mentioning Barack",
],
- "Barack Obama",
- [0, 1],
+ vec![("Barack Obama", [0, 1])],
)
.await;
}
@@ -103,8 +103,7 @@ async fn test_negative_term() {
"test_negative_term_",
Config::default(),
vec!["apple is a fruit", "I like apple", "fruit is healthy"],
- "apple -fruit",
- [1],
+ vec![("apple -fruit", [1])],
)
.await;
}
@@ -119,8 +118,7 @@ async fn test_must_term() {
"I love apples and fruits",
"apple and fruit are good",
],
- "+apple +fruit",
- [2],
+ vec![("+apple +fruit", [2])],
)
.await;
}
@@ -131,8 +129,7 @@ async fn test_boolean_operators() {
"test_boolean_operators_",
Config::default(),
vec!["a b c", "a b", "b c", "c"],
- "a AND b OR c",
- [0, 1, 2, 3],
+ vec![("a AND b OR c", [0, 1, 2, 3])],
)
.await;
}
@@ -146,8 +143,7 @@ async fn test_phrase_term() {
"This is a sample text containing Barack Obama",
"Another document mentioning Barack",
],
- "\"Barack Obama\"",
- [0],
+ vec![("\"Barack Obama\"", [0])],
)
.await;
}
@@ -161,8 +157,7 @@ async fn test_config_english_analyzer_case_insensitive() {
..Config::default()
},
vec!["Banana is a fruit", "I like apple", "Fruit is healthy"],
- "banana",
- [0],
+ vec![("banana", [0]), ("Banana", [0]), ("BANANA", [0])],
)
.await;
}
@@ -175,9 +170,8 @@ async fn test_config_english_analyzer_case_sensitive() {
case_sensitive: true,
..Config::default()
},
- vec!["Banana is a fruit", "I like apple", "Fruit is healthy"],
- "banana",
- [],
+ vec!["Banana is a fruit", "I like banana", "Fruit is healthy"],
+ vec![("banana", [1]), ("Banana", [0])],
)
.await;
}
@@ -191,8 +185,7 @@ async fn test_config_chinese_analyzer() {
..Default::default()
},
vec!["苹果是一种水果", "我喜欢苹果", "水果很健康"],
- "苹果",
- [0, 1],
+ vec![("苹果", [0, 1])],
)
.await;
}
diff --git a/src/mito2/src/sst/index/fulltext_index/applier.rs b/src/mito2/src/sst/index/fulltext_index/applier.rs
index e463bd0ee8..94ceda6891 100644
--- a/src/mito2/src/sst/index/fulltext_index/applier.rs
+++ b/src/mito2/src/sst/index/fulltext_index/applier.rs
@@ -17,9 +17,10 @@ use std::sync::Arc;
use common_telemetry::warn;
use index::fulltext_index::search::{FulltextIndexSearcher, RowId, TantivyFulltextIndexSearcher};
+use index::fulltext_index::Config;
use object_store::ObjectStore;
use puffin::puffin_manager::cache::PuffinMetadataCacheRef;
-use puffin::puffin_manager::{BlobWithMetadata, DirGuard, PuffinManager, PuffinReader};
+use puffin::puffin_manager::{GuardWithMetadata, PuffinManager, PuffinReader};
use snafu::ResultExt;
use store_api::storage::{ColumnId, RegionId};
@@ -93,7 +94,7 @@ impl FulltextIndexApplier {
let mut row_ids: Option<BTreeSet<RowId>> = None;
for (column_id, request) in &self.requests {
- if request.queries.is_empty() {
+ if request.queries.is_empty() && request.terms.is_empty() {
continue;
}
@@ -133,15 +134,21 @@ impl FulltextIndexApplier {
.dir(file_id, &blob_key, file_size_hint)
.await?;
- let path = match &dir {
- Some(dir) => dir.path(),
+ let dir = match &dir {
+ Some(dir) => dir,
None => {
return Ok(None);
}
};
- let searcher = TantivyFulltextIndexSearcher::new(path).context(ApplyFulltextIndexSnafu)?;
+ let config = Config::from_blob_metadata(dir.metadata()).context(ApplyFulltextIndexSnafu)?;
+ let path = dir.path();
+
+ let searcher =
+ TantivyFulltextIndexSearcher::new(path, config).context(ApplyFulltextIndexSnafu)?;
let mut row_ids: Option<BTreeSet<RowId>> = None;
+
+ // 1. Apply queries
for query in &request.queries {
let result = searcher
.search(&query.0)
@@ -161,6 +168,21 @@ impl FulltextIndexApplier {
}
}
+ // 2. Apply terms
+ let query = request.terms_as_query(config.case_sensitive);
+ if !query.0.is_empty() {
+ let result = searcher
+ .search(&query.0)
+ .await
+ .context(ApplyFulltextIndexSnafu)?;
+
+ if let Some(ids) = row_ids.as_mut() {
+ ids.retain(|id| result.contains(id));
+ } else {
+ row_ids = Some(result);
+ }
+ }
+
Ok(row_ids)
}
}
@@ -217,7 +239,7 @@ impl IndexSource {
file_id: FileId,
key: &str,
file_size_hint: Option<u64>,
- ) -> Result<Option<BlobWithMetadata<SstPuffinBlob>>> {
+ ) -> Result<Option<GuardWithMetadata<SstPuffinBlob>>> {
let (reader, fallbacked) = self.ensure_reader(file_id, file_size_hint).await?;
let res = reader.blob(key).await;
match res {
@@ -248,7 +270,7 @@ impl IndexSource {
file_id: FileId,
key: &str,
file_size_hint: Option<u64>,
- ) -> Result<Option<SstPuffinDir>> {
+ ) -> Result<Option<GuardWithMetadata<SstPuffinDir>>> {
let (reader, fallbacked) = self.ensure_reader(file_id, file_size_hint).await?;
let res = reader.dir(key).await;
match res {
diff --git a/src/mito2/src/sst/index/fulltext_index/applier/builder.rs b/src/mito2/src/sst/index/fulltext_index/applier/builder.rs
index e5cb6cf765..14f5936a01 100644
--- a/src/mito2/src/sst/index/fulltext_index/applier/builder.rs
+++ b/src/mito2/src/sst/index/fulltext_index/applier/builder.rs
@@ -30,12 +30,37 @@ use crate::sst::index::puffin_manager::PuffinManagerFactory;
/// A request for fulltext index.
///
/// It contains all the queries and terms for a column.
-#[derive(Default)]
+#[derive(Default, Debug)]
pub struct FulltextRequest {
pub queries: Vec<FulltextQuery>,
pub terms: Vec<FulltextTerm>,
}
+impl FulltextRequest {
+ /// Convert terms to a query string.
+ ///
+ /// For example, if the terms are ["foo", "bar"], the query string will be `r#"+"foo" +"bar""#`.
+ /// Need to escape the `"` in the term.
+ ///
+ /// `skip_lowercased` skips terms that are already lowercased, for the case where lowercased terms are not indexed.
+ pub fn terms_as_query(&self, skip_lowercased: bool) -> FulltextQuery {
+ let mut query = String::new();
+ for term in &self.terms {
+ if skip_lowercased && term.col_lowered {
+ continue;
+ }
+ // Escape the `"` in the term.
+ let escaped_term = term.term.replace("\"", "\\\"");
+ if query.is_empty() {
+ query = format!("+\"{escaped_term}\"");
+ } else {
+ query.push_str(&format!(" +\"{escaped_term}\""));
+ }
+ }
+ FulltextQuery(query)
+ }
+}
+
/// A query to be matched in fulltext index.
///
/// `query` is the query to be matched, e.g. "+foo -bar" in `SELECT * FROM t WHERE matches(text, "+foo -bar")`.
@@ -543,4 +568,92 @@ mod tests {
}
);
}
+
+ #[test]
+ fn test_terms_as_query() {
+ // Test with empty terms
+ let request = FulltextRequest::default();
+ assert_eq!(request.terms_as_query(false), FulltextQuery(String::new()));
+ assert_eq!(request.terms_as_query(true), FulltextQuery(String::new()));
+
+ // Test with a single term (not lowercased)
+ let mut request = FulltextRequest::default();
+ request.terms.push(FulltextTerm {
+ col_lowered: false,
+ term: "foo".to_string(),
+ });
+ assert_eq!(
+ request.terms_as_query(false),
+ FulltextQuery("+\"foo\"".to_string())
+ );
+ assert_eq!(
+ request.terms_as_query(true),
+ FulltextQuery("+\"foo\"".to_string())
+ );
+
+ // Test with a single lowercased term and skip_lowercased=true
+ let mut request = FulltextRequest::default();
+ request.terms.push(FulltextTerm {
+ col_lowered: true,
+ term: "foo".to_string(),
+ });
+ assert_eq!(
+ request.terms_as_query(false),
+ FulltextQuery("+\"foo\"".to_string())
+ );
+ assert_eq!(request.terms_as_query(true), FulltextQuery(String::new())); // Should skip lowercased term
+
+ // Test with multiple terms, mix of lowercased and not
+ let mut request = FulltextRequest::default();
+ request.terms.push(FulltextTerm {
+ col_lowered: false,
+ term: "foo".to_string(),
+ });
+ request.terms.push(FulltextTerm {
+ col_lowered: true,
+ term: "bar".to_string(),
+ });
+ assert_eq!(
+ request.terms_as_query(false),
+ FulltextQuery("+\"foo\" +\"bar\"".to_string())
+ );
+ assert_eq!(
+ request.terms_as_query(true),
+ FulltextQuery("+\"foo\"".to_string()) // Only the non-lowercased term
+ );
+
+ // Test with term containing quotes that need escaping
+ let mut request = FulltextRequest::default();
+ request.terms.push(FulltextTerm {
+ col_lowered: false,
+ term: "foo\"bar".to_string(),
+ });
+ assert_eq!(
+ request.terms_as_query(false),
+ FulltextQuery("+\"foo\\\"bar\"".to_string())
+ );
+
+ // Test with a complex mix of terms
+ let mut request = FulltextRequest::default();
+ request.terms.push(FulltextTerm {
+ col_lowered: false,
+ term: "foo".to_string(),
+ });
+ request.terms.push(FulltextTerm {
+ col_lowered: true,
+ term: "bar\"quoted\"".to_string(),
+ });
+ request.terms.push(FulltextTerm {
+ col_lowered: false,
+ term: "baz\\escape".to_string(),
+ });
+ assert_eq!(
+ request.terms_as_query(false),
+ FulltextQuery("+\"foo\" +\"bar\\\"quoted\\\"\" +\"baz\\escape\"".to_string())
+ );
+ assert_eq!(
+ request.terms_as_query(true),
+ FulltextQuery("+\"foo\" +\"baz\\escape\"".to_string()) // Skips the lowercased term
+ );
+ }
}
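The escaping rule documented on `terms_as_query` above boils down to wrapping each kept term in `+"..."` with embedded quotes escaped and joining the pieces with spaces. A standalone sketch of just that transformation (the free function name is illustrative, not the mito2 API):

    /// Build a conjunctive query string like `+"foo" +"bar"` from raw terms,
    /// escaping any embedded `"` so each term stays a single quoted token.
    fn terms_to_query(terms: &[&str]) -> String {
        terms
            .iter()
            .map(|t| format!("+\"{}\"", t.replace('"', "\\\"")))
            .collect::<Vec<_>>()
            .join(" ")
    }

    fn main() {
        assert_eq!(terms_to_query(&["foo", "bar"]), r#"+"foo" +"bar""#);
        assert_eq!(terms_to_query(&[r#"foo"bar"#]), r#"+"foo\"bar""#);
    }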
diff --git a/src/mito2/src/sst/index/fulltext_index/creator.rs b/src/mito2/src/sst/index/fulltext_index/creator.rs
index b6eab05bfa..12b83e39d0 100644
--- a/src/mito2/src/sst/index/fulltext_index/creator.rs
+++ b/src/mito2/src/sst/index/fulltext_index/creator.rs
@@ -376,7 +376,9 @@ mod tests {
use crate::access_layer::RegionFilePathFactory;
use crate::read::{Batch, BatchColumn};
use crate::sst::file::FileId;
- use crate::sst::index::fulltext_index::applier::builder::{FulltextQuery, FulltextRequest};
+ use crate::sst::index::fulltext_index::applier::builder::{
+ FulltextQuery, FulltextRequest, FulltextTerm,
+ };
use crate::sst::index::fulltext_index::applier::FulltextIndexApplier;
use crate::sst::index::puffin_manager::PuffinManagerFactory;
@@ -510,14 +512,25 @@ mod tests {
.unwrap()
}
- async fn build_applier_factory(
+ /// Applier factory that can handle both queries and terms.
+ ///
+ /// It builds a fulltext index with the given data rows, and returns a function
+ /// that can handle both queries and terms in a single request.
+ ///
+ /// The function takes two parameters:
+ /// - `queries`: A list of (ColumnId, query_string) pairs for fulltext queries
+ /// - `terms`: A list of (ColumnId, [(bool, String)]) for fulltext terms, where bool indicates if term is lowercased
+ async fn build_fulltext_applier_factory(
prefix: &str,
rows: &[(
Option<&str>, // text_english_case_sensitive
Option<&str>, // text_english_case_insensitive
Option<&str>, // text_chinese
)],
- ) -> impl Fn(Vec<(ColumnId, &str)>) -> BoxFuture<'static, BTreeSet<RowId>> {
+ ) -> impl Fn(
+ Vec<(ColumnId, &str)>,
+ Vec<(ColumnId, Vec<(bool, &str)>)>,
+ ) -> BoxFuture<'static, Option<BTreeSet<RowId>>> {
let (d, factory) = PuffinManagerFactory::new_for_test_async(prefix).await;
let region_dir = "region0".to_string();
let sst_file_id = FileId::random();
@@ -549,74 +562,253 @@ mod tests {
let _ = indexer.finish(&mut writer).await.unwrap();
writer.finish().await.unwrap();
- move |queries| {
+ move |queries: Vec<(ColumnId, &str)>, terms_requests: Vec<(ColumnId, Vec<(bool, &str)>)>| {
let _d = &d;
- let applier = FulltextIndexApplier::new(
- region_dir.clone(),
- region_metadata.region_id,
- object_store.clone(),
- queries
+ let region_dir = region_dir.clone();
+ let object_store = object_store.clone();
+ let factory = factory.clone();
+
+ let mut requests: HashMap<ColumnId, FulltextRequest> = HashMap::new();
+
+ // Add queries
+ for (column_id, query) in queries {
+ requests
+ .entry(column_id)
+ .or_default()
+ .queries
+ .push(FulltextQuery(query.to_string()));
+ }
+
+ // Add terms
+ for (column_id, terms) in terms_requests {
+ let fulltext_terms = terms
.into_iter()
- .map(|(a, b)| {
- (
- a,
- FulltextRequest {
- queries: vec![FulltextQuery(b.to_string())],
- terms: vec![],
- },
- )
+ .map(|(col_lowered, term)| FulltextTerm {
+ col_lowered,
+ term: term.to_string(),
})
- .collect(),
- factory.clone(),
+ .collect::<Vec<_>>();
+
+ requests
+ .entry(column_id)
+ .or_default()
+ .terms
+ .extend(fulltext_terms);
+ }
+
+ let applier = FulltextIndexApplier::new(
+ region_dir,
+ region_metadata.region_id,
+ object_store,
+ requests,
+ factory,
);
- async move { applier.apply(sst_file_id, None).await.unwrap().unwrap() }.boxed()
+ async move { applier.apply(sst_file_id, None).await.unwrap() }.boxed()
}
}
+ fn rows(row_ids: impl IntoIterator<Item = RowId>) -> BTreeSet<RowId> {
+ row_ids.into_iter().collect()
+ }
+
#[tokio::test]
- async fn test_fulltext_index_basic() {
- let applier_factory = build_applier_factory(
- "test_fulltext_index_basic_",
+ async fn test_fulltext_index_basic_case_sensitive() {
+ let applier_factory = build_fulltext_applier_factory(
+ "test_fulltext_index_basic_case_sensitive_",
&[
- (Some("hello"), None, Some("你好")),
- (Some("world"), Some("world"), None),
- (None, Some("World"), Some("世界")),
- (
- Some("Hello, World"),
- Some("Hello, World"),
- Some("你好,世界"),
- ),
+ (Some("hello"), None, None),
+ (Some("world"), None, None),
+ (None, None, None),
+ (Some("Hello, World"), None, None),
],
)
.await;
- let row_ids = applier_factory(vec![(1, "hello")]).await;
- assert_eq!(row_ids, vec![0].into_iter().collect());
+ let row_ids = applier_factory(vec![(1, "hello")], vec![]).await;
+ assert_eq!(row_ids, Some(rows([0])));
- let row_ids = applier_factory(vec![(1, "world")]).await;
- assert_eq!(row_ids, vec![1].into_iter().collect());
+ let row_ids = applier_factory(vec![(1, "world")], vec![]).await;
+ assert_eq!(row_ids, Some(rows([1])));
- let row_ids = applier_factory(vec![(2, "hello")]).await;
- assert_eq!(row_ids, vec![3].into_iter().collect());
+ let row_ids = applier_factory(vec![(1, "Hello")], vec![]).await;
+ assert_eq!(row_ids, Some(rows([3])));
- let row_ids = applier_factory(vec![(2, "world")]).await;
- assert_eq!(row_ids, vec![1, 2, 3].into_iter().collect());
+ let row_ids = applier_factory(vec![(1, "World")], vec![]).await;
+ assert_eq!(row_ids, Some(rows([3])));
- let row_ids = applier_factory(vec![(3, "你好")]).await;
- assert_eq!(row_ids, vec![0, 3].into_iter().collect());
+ let row_ids = applier_factory(vec![], vec![(1, vec![(false, "hello")])]).await;
+ assert_eq!(row_ids, Some(rows([0])));
- let row_ids = applier_factory(vec![(3, "世界")]).await;
- assert_eq!(row_ids, vec![2, 3].into_iter().collect());
+ let row_ids = applier_factory(vec![], vec![(1, vec![(true, "hello")])]).await;
+ assert_eq!(row_ids, None);
+
+ let row_ids = applier_factory(vec![], vec![(1, vec![(false, "world")])]).await;
+ assert_eq!(row_ids, Some(rows([1])));
+
+ let row_ids = applier_factory(vec![], vec![(1, vec![(true, "world")])]).await;
+ assert_eq!(row_ids, None);
+
+ let row_ids = applier_factory(vec![], vec![(1, vec![(false, "Hello")])]).await;
+ assert_eq!(row_ids, Some(rows([3])));
+
+ let row_ids = applier_factory(vec![], vec![(1, vec![(true, "Hello")])]).await;
+ assert_eq!(row_ids, None);
+
+ let row_ids = applier_factory(vec![], vec![(1, vec![(false, "Hello, World")])]).await;
+ assert_eq!(row_ids, Some(rows([3])));
+
+ let row_ids = applier_factory(vec![], vec![(1, vec![(true, "Hello, World")])]).await;
+ assert_eq!(row_ids, None);
+ }
+
+ #[tokio::test]
+ async fn test_fulltext_index_basic_case_insensitive() {
+ let applier_factory = build_fulltext_applier_factory(
+ "test_fulltext_index_basic_case_insensitive_",
+ &[
+ (None, Some("hello"), None),
+ (None, None, None),
+ (None, Some("world"), None),
+ (None, Some("Hello, World"), None),
+ ],
+ )
+ .await;
+
+ let row_ids = applier_factory(vec![(2, "hello")], vec![]).await;
+ assert_eq!(row_ids, Some(rows([0, 3])));
+
+ let row_ids = applier_factory(vec![(2, "world")], vec![]).await;
+ assert_eq!(row_ids, Some(rows([2, 3])));
+
+ let row_ids = applier_factory(vec![(2, "Hello")], vec![]).await;
+ assert_eq!(row_ids, Some(rows([0, 3])));
+
+ let row_ids = applier_factory(vec![(2, "World")], vec![]).await;
+ assert_eq!(row_ids, Some(rows([2, 3])));
+
+ let row_ids = applier_factory(vec![], vec![(2, vec![(false, "hello")])]).await;
+ assert_eq!(row_ids, Some(rows([0, 3])));
+
+ let row_ids = applier_factory(vec![], vec![(2, vec![(true, "hello")])]).await;
+ assert_eq!(row_ids, Some(rows([0, 3])));
+
+ let row_ids = applier_factory(vec![], vec![(2, vec![(false, "world")])]).await;
+ assert_eq!(row_ids, Some(rows([2, 3])));
+
+ let row_ids = applier_factory(vec![], vec![(2, vec![(true, "world")])]).await;
+ assert_eq!(row_ids, Some(rows([2, 3])));
+
+ let row_ids = applier_factory(vec![], vec![(2, vec![(false, "Hello")])]).await;
+ assert_eq!(row_ids, Some(rows([0, 3])));
+
+ let row_ids = applier_factory(vec![], vec![(2, vec![(true, "Hello")])]).await;
+ assert_eq!(row_ids, Some(rows([0, 3])));
+
+ let row_ids = applier_factory(vec![], vec![(2, vec![(false, "World")])]).await;
+ assert_eq!(row_ids, Some(rows([2, 3])));
+
+ let row_ids = applier_factory(vec![], vec![(2, vec![(true, "World")])]).await;
+ assert_eq!(row_ids, Some(rows([2, 3])));
+ }
+
+ #[tokio::test]
+ async fn test_fulltext_index_basic_chinese() {
+ let applier_factory = build_fulltext_applier_factory(
+ "test_fulltext_index_basic_chinese_",
+ &[
+ (None, None, Some("你好")),
+ (None, None, None),
+ (None, None, Some("世界")),
+ (None, None, Some("你好,世界")),
+ ],
+ )
+ .await;
+
+ let row_ids = applier_factory(vec![(3, "你好")], vec![]).await;
+ assert_eq!(row_ids, Some(rows([0, 3])));
+
+ let row_ids = applier_factory(vec![(3, "世界")], vec![]).await;
+ assert_eq!(row_ids, Some(rows([2, 3])));
+
+ let row_ids = applier_factory(vec![], vec![(3, vec![(false, "你好")])]).await;
+ assert_eq!(row_ids, Some(rows([0, 3])));
+
+ let row_ids = applier_factory(vec![], vec![(3, vec![(false, "世界")])]).await;
+ assert_eq!(row_ids, Some(rows([2, 3])));
+ }
+
+ #[tokio::test]
+ async fn test_fulltext_index_multi_terms_case_sensitive() {
+ let applier_factory = build_fulltext_applier_factory(
+ "test_fulltext_index_multi_terms_case_sensitive_",
+ &[
+ (Some("Hello"), None, None),
+ (Some("World"), None, None),
+ (None, None, None),
+ (Some("Hello, World"), None, None),
+ ],
+ )
+ .await;
+
+ let row_ids =
+ applier_factory(vec![], vec![(1, vec![(false, "hello"), (false, "world")])]).await;
+ assert_eq!(row_ids, Some(rows([])));
+
+ let row_ids =
+ applier_factory(vec![], vec![(1, vec![(false, "Hello"), (false, "World")])]).await;
+ assert_eq!(row_ids, Some(rows([3])));
+
+ let row_ids =
+ applier_factory(vec![], vec![(1, vec![(true, "Hello"), (false, "World")])]).await;
+ assert_eq!(row_ids, Some(rows([1, 3])));
+
+ let row_ids =
+ applier_factory(vec![], vec![(1, vec![(false, "Hello"), (true, "World")])]).await;
+ assert_eq!(row_ids, Some(rows([0, 3])));
+
+ let row_ids =
+ applier_factory(vec![], vec![(1, vec![(true, "Hello"), (true, "World")])]).await;
+ assert_eq!(row_ids, None);
+ }
+
+ #[tokio::test]
+ async fn test_fulltext_index_multi_terms_case_insensitive() {
+ let applier_factory = build_fulltext_applier_factory(
+ "test_fulltext_index_multi_terms_case_insensitive_",
+ &[
+ (None, Some("hello"), None),
+ (None, None, None),
+ (None, Some("world"), None),
+ (None, Some("Hello, World"), None),
+ ],
+ )
+ .await;
+
+ let row_ids =
+ applier_factory(vec![], vec![(2, vec![(false, "hello"), (false, "world")])]).await;
+ assert_eq!(row_ids, Some(rows([3])));
+
+ let row_ids =
+ applier_factory(vec![], vec![(2, vec![(true, "hello"), (false, "world")])]).await;
+ assert_eq!(row_ids, Some(rows([3])));
+
+ let row_ids =
+ applier_factory(vec![], vec![(2, vec![(false, "hello"), (true, "world")])]).await;
+ assert_eq!(row_ids, Some(rows([3])));
+
+ let row_ids =
+ applier_factory(vec![], vec![(2, vec![(true, "hello"), (true, "world")])]).await;
+ assert_eq!(row_ids, Some(rows([3])));
}
#[tokio::test]
async fn test_fulltext_index_multi_columns() {
- let applier_factory = build_applier_factory(
+ let applier_factory = build_fulltext_applier_factory(
"test_fulltext_index_multi_columns_",
&[
- (Some("hello"), None, Some("你好")),
- (Some("world"), Some("world"), None),
+ (Some("Hello"), None, Some("你好")),
+ (Some("World"), Some("world"), None),
(None, Some("World"), Some("世界")),
(
Some("Hello, World"),
@@ -627,13 +819,14 @@ mod tests {
)
.await;
- let row_ids = applier_factory(vec![(1, "hello"), (3, "你好")]).await;
- assert_eq!(row_ids, vec![0].into_iter().collect());
+ let row_ids = applier_factory(
+ vec![(1, "Hello"), (3, "你好")],
+ vec![(2, vec![(false, "world")])],
+ )
+ .await;
+ assert_eq!(row_ids, Some(rows([3])));
- let row_ids = applier_factory(vec![(1, "world"), (3, "世界")]).await;
- assert_eq!(row_ids, vec![].into_iter().collect());
-
- let row_ids = applier_factory(vec![(2, "world"), (3, "世界")]).await;
- assert_eq!(row_ids, vec![2, 3].into_iter().collect());
+ let row_ids = applier_factory(vec![(2, "World")], vec![(1, vec![(false, "World")])]).await;
+ assert_eq!(row_ids, Some(rows([1, 3])));
}
}
diff --git a/src/mito2/src/sst/index/puffin_manager.rs b/src/mito2/src/sst/index/puffin_manager.rs
index 9f288ecd16..cf4bb185a8 100644
--- a/src/mito2/src/sst/index/puffin_manager.rs
+++ b/src/mito2/src/sst/index/puffin_manager.rs
@@ -174,12 +174,13 @@ impl PuffinFileAccessor for ObjectStorePuffinFileAccessor {
#[cfg(test)]
mod tests {
+
use common_base::range_read::RangeReader;
use common_test_util::temp_dir::create_temp_dir;
use futures::io::Cursor;
use object_store::services::Memory;
use puffin::blob_metadata::CompressionCodec;
- use puffin::puffin_manager::{DirGuard, PuffinManager, PuffinReader, PuffinWriter, PutOptions};
+ use puffin::puffin_manager::{PuffinManager, PuffinReader, PuffinWriter, PutOptions};
use super::*;
@@ -229,6 +230,7 @@ mod tests {
PutOptions {
compression: Some(CompressionCodec::Zstd),
},
+ Default::default(),
)
.await
.unwrap();
diff --git a/src/puffin/src/puffin_manager.rs b/src/puffin/src/puffin_manager.rs
index 1dfec58f5b..9f287128c1 100644
--- a/src/puffin/src/puffin_manager.rs
+++ b/src/puffin/src/puffin_manager.rs
@@ -65,7 +65,13 @@ pub trait PuffinWriter {
/// Returns the number of bytes written.
///
/// The specified `dir` should be accessible from the filesystem.
- async fn put_dir(&mut self, key: &str, dir: PathBuf, options: PutOptions) -> Result<u64>;
+ async fn put_dir(
+ &mut self,
+ key: &str,
+ dir: PathBuf,
+ options: PutOptions,
+ properties: HashMap<String, String>,
+ ) -> Result<u64>;
/// Sets whether the footer should be LZ4 compressed.
fn set_footer_lz4_compressed(&mut self, lz4_compressed: bool);
@@ -94,15 +100,15 @@ pub trait PuffinReader {
/// Reads a blob from the Puffin file.
///
- /// The returned `BlobWithMetadata` is used to access the blob data and its metadata.
- /// Users should hold the `BlobWithMetadata` until they are done with the blob data.
- async fn blob(&self, key: &str) -> Result<BlobWithMetadata<Self::Blob>>;
+ /// The returned `GuardWithMetadata` is used to access the blob data and its metadata.
+ /// Users should hold the `GuardWithMetadata` until they are done with the blob data.
+ async fn blob(&self, key: &str) -> Result<GuardWithMetadata<Self::Blob>>;
/// Reads a directory from the Puffin file.
///
- /// The returned `DirGuard` is used to access the directory in the filesystem.
- /// The caller is responsible for holding the `DirGuard` until they are done with the directory.
- async fn dir(&self, key: &str) -> Result<Self::Dir>;
+ /// The returned `GuardWithMetadata` is used to access the directory data and its metadata.
+ /// Users should hold the `GuardWithMetadata` until they are done with the directory data.
+ async fn dir(&self, key: &str) -> Result<GuardWithMetadata<Self::Dir>>;
}
/// `BlobGuard` is provided by the `PuffinReader` to access the blob data.
@@ -114,32 +120,41 @@ pub trait BlobGuard {
async fn reader(&self) -> Result<Self::Reader>;
}
-/// `BlobWithMetadata` provides access to the blob data and its metadata.
-pub struct BlobWithMetadata<B> {
- blob: B,
- metadata: BlobMetadata,
-}
-
-impl<B: BlobGuard> BlobWithMetadata<B> {
- /// Creates a new `BlobWithMetadata` instance.
- pub fn new(blob: B, metadata: BlobMetadata) -> Self {
- Self { blob, metadata }
- }
-
- /// Returns the reader for the blob data.
- pub async fn reader(&self) -> Result<B::Reader> {
- self.blob.reader().await
- }
-
- /// Returns the metadata of the blob.
- pub fn metadata(&self) -> &BlobMetadata {
- &self.metadata
- }
-}
-
/// `DirGuard` is provided by the `PuffinReader` to access the directory in the filesystem.
/// Users should hold the `DirGuard` until they are done with the directory.
#[auto_impl::auto_impl(Arc)]
pub trait DirGuard {
fn path(&self) -> &PathBuf;
}
+
+/// `GuardWithMetadata` provides access to the blob or directory data and its metadata.
+pub struct GuardWithMetadata<G> {
+ guard: G,
+ metadata: BlobMetadata,
+}
+
+impl<G> GuardWithMetadata<G> {
+ /// Creates a new `GuardWithMetadata` instance.
+ pub fn new(guard: G, metadata: BlobMetadata) -> Self {
+ Self { guard, metadata }
+ }
+
+ /// Returns the metadata of the directory.
+ pub fn metadata(&self) -> &BlobMetadata {
+ &self.metadata
+ }
+}
+
+impl<G: BlobGuard> GuardWithMetadata<G> {
+ /// Returns the reader for the blob data.
+ pub async fn reader(&self) -> Result<G::Reader> {
+ self.guard.reader().await
+ }
+}
+
+impl<G: DirGuard> GuardWithMetadata<G> {
+ /// Returns the path of the directory.
+ pub fn path(&self) -> &PathBuf {
+ self.guard.path()
+ }
+}
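For readers skimming the refactor above: the shape is one generic wrapper whose blob-only and directory-only methods are gated behind the guard's capabilities. A standalone sketch of that shape follows; the trait and type names are simplified stand-ins, not the crate's real API.

    use std::path::PathBuf;

    struct Metadata {
        properties: Vec<(String, String)>,
    }

    trait BlobLike {
        fn bytes(&self) -> &[u8];
    }

    trait DirLike {
        fn path(&self) -> &PathBuf;
    }

    // One wrapper for both kinds of guard, mirroring `GuardWithMetadata<G>`.
    struct WithMetadata<G> {
        guard: G,
        metadata: Metadata,
    }

    impl<G> WithMetadata<G> {
        fn new(guard: G, metadata: Metadata) -> Self {
            Self { guard, metadata }
        }

        // Metadata is reachable no matter what kind of guard is inside.
        fn metadata(&self) -> &Metadata {
            &self.metadata
        }
    }

    // Blob-specific access exists only when the guard is blob-like ...
    impl<G: BlobLike> WithMetadata<G> {
        fn bytes(&self) -> &[u8] {
            self.guard.bytes()
        }
    }

    // ... and directory-specific access only when it is dir-like.
    impl<G: DirLike> WithMetadata<G> {
        fn path(&self) -> &PathBuf {
            self.guard.path()
        }
    }

    struct InMemoryBlob(Vec<u8>);

    impl BlobLike for InMemoryBlob {
        fn bytes(&self) -> &[u8] {
            &self.0
        }
    }

    fn main() {
        let guarded = WithMetadata::new(
            InMemoryBlob(b"hello".to_vec()),
            Metadata { properties: vec![("k".into(), "v".into())] },
        );
        assert_eq!(guarded.bytes(), b"hello".as_slice());
        assert_eq!(guarded.metadata().properties.len(), 1);
    }

This is what lets the patch collapse the old blob-only `BlobWithMetadata` into a single wrapper serving both `blob()` and `dir()`.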
diff --git a/src/puffin/src/puffin_manager/fs_puffin_manager/reader.rs b/src/puffin/src/puffin_manager/fs_puffin_manager/reader.rs
index 5d2033e2e9..2c616578f6 100644
--- a/src/puffin/src/puffin_manager/fs_puffin_manager/reader.rs
+++ b/src/puffin/src/puffin_manager/fs_puffin_manager/reader.rs
@@ -36,7 +36,7 @@ use crate::puffin_manager::file_accessor::PuffinFileAccessor;
use crate::puffin_manager::fs_puffin_manager::dir_meta::DirMetadata;
use crate::puffin_manager::fs_puffin_manager::PuffinMetadataCacheRef;
use crate::puffin_manager::stager::{BoxWriter, DirWriterProviderRef, Stager};
-use crate::puffin_manager::{BlobGuard, BlobWithMetadata, PuffinReader};
+use crate::puffin_manager::{BlobGuard, GuardWithMetadata, PuffinReader};
/// `FsPuffinReader` is a `PuffinReader` that provides fs readers for puffin files.
pub struct FsPuffinReader
@@ -96,26 +96,13 @@ where
}
async fn metadata(&self) -> Result> {
- let reader = self.puffin_file_accessor.reader(&self.handle).await?;
- let mut file = PuffinFileReader::new(reader);
+ let mut file = self.puffin_reader().await?;
self.get_puffin_file_metadata(&mut file).await
}
- async fn blob(&self, key: &str) -> Result<BlobWithMetadata<Self::Blob>> {
- let mut reader = self.puffin_file_accessor.reader(&self.handle).await?;
- if let Some(file_size_hint) = self.file_size_hint {
- reader.with_file_size_hint(file_size_hint);
- }
- let mut file = PuffinFileReader::new(reader);
-
- let metadata = self.get_puffin_file_metadata(&mut file).await?;
- let blob_metadata = metadata
- .blobs
- .iter()
- .find(|m| m.blob_type == key)
- .context(BlobNotFoundSnafu { blob: key })?
- .clone();
-
+ async fn blob(&self, key: &str) -> Result<GuardWithMetadata<Self::Blob>> {
+ let mut file = self.puffin_reader().await?;
+ let blob_metadata = self.get_blob_metadata(key, &mut file).await?;
let blob = if blob_metadata.compression_codec.is_none() {
// If the blob is not compressed, we can directly read it from the puffin file.
Either::L(RandomReadBlob {
@@ -140,28 +127,33 @@ where
Either::R(staged_blob)
};
- Ok(BlobWithMetadata::new(blob, blob_metadata))
+ Ok(GuardWithMetadata::new(blob, blob_metadata))
}
- async fn dir(&self, key: &str) -> Result<Self::Dir> {
- self.stager
+ async fn dir(&self, key: &str) -> Result<GuardWithMetadata<Self::Dir>> {
+ let mut file = self.puffin_reader().await?;
+ let blob_metadata = self.get_blob_metadata(key, &mut file).await?;
+ let dir = self
+ .stager
.get_dir(
&self.handle,
key,
Box::new(|writer_provider| {
let accessor = self.puffin_file_accessor.clone();
let handle = self.handle.clone();
- let key = key.to_string();
+ let blob_metadata = blob_metadata.clone();
Box::pin(Self::init_dir_to_stager(
+ file,
+ blob_metadata,
handle,
- key,
writer_provider,
accessor,
- self.file_size_hint,
))
}),
)
- .await
+ .await?;
+
+ Ok(GuardWithMetadata::new(dir, blob_metadata))
}
}
@@ -188,6 +180,30 @@ where
Ok(metadata)
}
+ async fn get_blob_metadata(
+ &self,
+ key: &str,
+ file: &mut PuffinFileReader,
+ ) -> Result {
+ let metadata = self.get_puffin_file_metadata(file).await?;
+ let blob_metadata = metadata
+ .blobs
+ .iter()
+ .find(|m| m.blob_type == key)
+ .context(BlobNotFoundSnafu { blob: key })?
+ .clone();
+
+ Ok(blob_metadata)
+ }
+
+ async fn puffin_reader(&self) -> Result> {
+ let mut reader = self.puffin_file_accessor.reader(&self.handle).await?;
+ if let Some(file_size_hint) = self.file_size_hint {
+ reader.with_file_size_hint(file_size_hint);
+ }
+ Ok(PuffinFileReader::new(reader))
+ }
+
async fn init_blob_to_stager(
reader: PuffinFileReader,
blob_metadata: BlobMetadata,
@@ -201,26 +217,14 @@ where
}
async fn init_dir_to_stager(
+ mut file: PuffinFileReader,
+ blob_metadata: BlobMetadata,
handle: F::FileHandle,
- key: String,
writer_provider: DirWriterProviderRef,
accessor: F,
- file_size_hint: Option,
) -> Result {
- let mut reader = accessor.reader(&handle).await?;
- if let Some(file_size_hint) = file_size_hint {
- reader.with_file_size_hint(file_size_hint);
- }
- let mut file = PuffinFileReader::new(reader);
-
let puffin_metadata = file.metadata().await?;
- let blob_metadata = puffin_metadata
- .blobs
- .iter()
- .find(|m| m.blob_type == key.as_str())
- .context(BlobNotFoundSnafu { blob: key })?;
-
- let reader = file.blob_reader(blob_metadata)?;
+ let reader = file.blob_reader(&blob_metadata)?;
let meta = reader.metadata().await.context(MetadataSnafu)?;
let buf = reader
.read(0..meta.content_length)
diff --git a/src/puffin/src/puffin_manager/fs_puffin_manager/writer.rs b/src/puffin/src/puffin_manager/fs_puffin_manager/writer.rs
index 61d9df52f0..feb7678756 100644
--- a/src/puffin/src/puffin_manager/fs_puffin_manager/writer.rs
+++ b/src/puffin/src/puffin_manager/fs_puffin_manager/writer.rs
@@ -88,7 +88,13 @@ where
Ok(written_bytes)
}
- async fn put_dir(&mut self, key: &str, dir_path: PathBuf, options: PutOptions) -> Result<u64> {
+ async fn put_dir(
+ &mut self,
+ key: &str,
+ dir_path: PathBuf,
+ options: PutOptions,
+ properties: HashMap<String, String>,
+ ) -> Result<u64> {
ensure!(
!self.blob_keys.contains(key),
DuplicateBlobSnafu { blob: key }
@@ -150,7 +156,7 @@ where
blob_type: key.to_string(),
compressed_data: encoded.as_slice(),
compression_codec: None,
- properties: Default::default(),
+ properties,
};
written_bytes += self.puffin_file_writer.add_blob(dir_meta_blob).await?;
diff --git a/src/puffin/src/puffin_manager/tests.rs b/src/puffin/src/puffin_manager/tests.rs
index e2f32e9498..bd3ec9d5a5 100644
--- a/src/puffin/src/puffin_manager/tests.rs
+++ b/src/puffin/src/puffin_manager/tests.rs
@@ -23,7 +23,7 @@ use crate::blob_metadata::CompressionCodec;
use crate::puffin_manager::file_accessor::MockFileAccessor;
use crate::puffin_manager::fs_puffin_manager::FsPuffinManager;
use crate::puffin_manager::stager::BoundedStager;
-use crate::puffin_manager::{DirGuard, PuffinManager, PuffinReader, PuffinWriter, PutOptions};
+use crate::puffin_manager::{PuffinManager, PuffinReader, PuffinWriter, PutOptions};
async fn new_bounded_stager(prefix: &str, capacity: u64) -> (TempDir, Arc>) {
let staging_dir = create_temp_dir(prefix);
@@ -343,6 +343,7 @@ async fn put_dir(
PutOptions {
compression: compression_codec,
},
+ HashMap::from_iter([("test_key".to_string(), "test_value".to_string())]),
)
.await
.unwrap();
@@ -356,6 +357,11 @@ async fn check_dir(
puffin_reader: &impl PuffinReader,
) {
let res_dir = puffin_reader.dir(key).await.unwrap();
+ let metadata = res_dir.metadata();
+ assert_eq!(
+ metadata.properties,
+ HashMap::from_iter([("test_key".to_string(), "test_value".to_string())])
+ );
for (file_name, raw_data) in files_in_dir {
let file_path = if cfg!(windows) {
res_dir.path().join(file_name.replace('/', "\\"))
From 74d8fd00a4ecb026825cb0f26378d5dff77fd01d Mon Sep 17 00:00:00 2001
From: "Lei, HUANG" <6406592+v0y4g3r@users.noreply.github.com>
Date: Thu, 10 Apr 2025 16:07:04 +0800
Subject: [PATCH 05/82] fix: remove metadata region options (#5852)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
* fix/remove-metadata-region-options:
### Add `SKIP_WAL_KEY` Option to Metric Engine
- **Enhancements**:
- Introduced `SKIP_WAL_KEY` to the metric engine options in `create.rs` and `mito_engine_options.rs`.
- Updated test cases in `create.rs` to include `skip_wal` option and ensure it is removed for metadata regions.
- **Refactoring**:
- Updated `requests.rs` to use `SKIP_WAL_KEY` from `store_api::mito_engine_options`.
These changes enhance the metric engine by allowing the option to skip Write-Ahead Logging (WAL) and ensure consistent usage of option keys across modules.
* fix/remove-metadata-region-options: Add note for new options in mito_engine_options.rs
• Introduce a comment to remind developers to check if new options should be removed in region_options_for_metadata_region within metric_engine::engine::create.
* empty
---
src/metric-engine/src/engine/create.rs | 14 ++++++++++----
src/store-api/src/mito_engine_options.rs | 3 +++
src/table/src/requests.rs | 2 +-
3 files changed, 14 insertions(+), 5 deletions(-)
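The gist of this commit: options that only make sense for the physical (data) region must be stripped before the metadata region is created, and `skip_wal` now joins that list. A rough, illustrative sketch of the idea; only the option keys and the expected results mirror the diff below, the function itself is not the engine's code.

    use std::collections::HashMap;

    // Illustrative stand-in for `region_options_for_metadata_region`.
    fn options_for_metadata_region(
        mut original: HashMap<String, String>,
    ) -> HashMap<String, String> {
        // The metadata region never expires and must always keep its WAL.
        original.insert("ttl".to_string(), "forever".to_string());
        original.remove("skip_wal");
        original
    }

    fn main() {
        let physical: HashMap<_, _> = [
            ("ttl".to_string(), "60m".to_string()),
            ("skip_wal".to_string(), "true".to_string()),
        ]
        .into_iter()
        .collect();

        let metadata = options_for_metadata_region(physical);
        assert_eq!(metadata.get("ttl").unwrap(), "forever");
        assert!(!metadata.contains_key("skip_wal"));
    }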
diff --git a/src/metric-engine/src/engine/create.rs b/src/metric-engine/src/engine/create.rs
index 856bdb7b72..bfb7737df7 100644
--- a/src/metric-engine/src/engine/create.rs
+++ b/src/metric-engine/src/engine/create.rs
@@ -34,7 +34,7 @@ use store_api::metric_engine_consts::{
METADATA_SCHEMA_VALUE_COLUMN_INDEX, METADATA_SCHEMA_VALUE_COLUMN_NAME,
};
use store_api::mito_engine_options::{
- APPEND_MODE_KEY, MEMTABLE_PARTITION_TREE_PRIMARY_KEY_ENCODING, TTL_KEY,
+ APPEND_MODE_KEY, MEMTABLE_PARTITION_TREE_PRIMARY_KEY_ENCODING, SKIP_WAL_KEY, TTL_KEY,
};
use store_api::region_engine::RegionEngine;
use store_api::region_request::{AffectedRows, RegionCreateRequest, RegionRequest};
@@ -549,6 +549,7 @@ pub(crate) fn region_options_for_metadata_region(
// Don't allow to set primary key encoding for metadata region.
original.remove(MEMTABLE_PARTITION_TREE_PRIMARY_KEY_ENCODING);
original.insert(TTL_KEY.to_string(), FOREVER.to_string());
+ original.remove(SKIP_WAL_KEY);
original
}
@@ -685,8 +686,12 @@ mod test {
#[tokio::test]
async fn test_create_request_for_physical_regions() {
// original request
- let mut ttl_options = HashMap::new();
- ttl_options.insert("ttl".to_string(), "60m".to_string());
+ let options: HashMap<_, _> = [
+ ("ttl".to_string(), "60m".to_string()),
+ ("skip_wal".to_string(), "true".to_string()),
+ ]
+ .into_iter()
+ .collect();
let request = RegionCreateRequest {
engine: METRIC_ENGINE_NAME.to_string(),
column_metadatas: vec![
@@ -710,7 +715,7 @@ mod test {
},
],
primary_key: vec![0],
- options: ttl_options,
+ options,
region_dir: "/test_dir".to_string(),
};
@@ -742,5 +747,6 @@ mod test {
metadata_region_request.options.get("ttl").unwrap(),
"forever"
);
+ assert!(!metadata_region_request.options.contains_key("skip_wal"));
}
}
diff --git a/src/store-api/src/mito_engine_options.rs b/src/store-api/src/mito_engine_options.rs
index e73060469f..aa6fd8984d 100644
--- a/src/store-api/src/mito_engine_options.rs
+++ b/src/store-api/src/mito_engine_options.rs
@@ -59,6 +59,9 @@ pub const MEMTABLE_PARTITION_TREE_DATA_FREEZE_THRESHOLD: &str =
/// Option key for memtable partition tree fork dictionary bytes.
pub const MEMTABLE_PARTITION_TREE_FORK_DICTIONARY_BYTES: &str =
"memtable.partition_tree.fork_dictionary_bytes";
+/// Option key for skipping WAL.
+pub const SKIP_WAL_KEY: &str = "skip_wal";
+// Note: when adding a new option here, also check whether it should be removed for metadata regions in [metric_engine::engine::create::region_options_for_metadata_region].
/// Returns true if the `key` is a valid option key for the mito engine.
pub fn is_mito_engine_option_key(key: &str) -> bool {
diff --git a/src/table/src/requests.rs b/src/table/src/requests.rs
index 5b7ad566c5..75a4ab64d6 100644
--- a/src/table/src/requests.rs
+++ b/src/table/src/requests.rs
@@ -99,7 +99,7 @@ pub const TTL_KEY: &str = store_api::mito_engine_options::TTL_KEY;
pub const STORAGE_KEY: &str = "storage";
pub const COMMENT_KEY: &str = "comment";
pub const AUTO_CREATE_TABLE_KEY: &str = "auto_create_table";
-pub const SKIP_WAL_KEY: &str = "skip_wal";
+pub const SKIP_WAL_KEY: &str = store_api::mito_engine_options::SKIP_WAL_KEY;
impl TableOptions {
pub fn try_from_iter>(
From 382eacdc131ec00d74c17841f0fe65e9ff06d5de Mon Sep 17 00:00:00 2001
From: Weny Xu
Date: Thu, 10 Apr 2025 17:19:32 +0800
Subject: [PATCH 06/82] fix: include follower peers in region distribution
(#5844)
---
src/common/meta/src/rpc/router.rs | 44 +++++++++++++++++++++++++++++++
1 file changed, 44 insertions(+)
diff --git a/src/common/meta/src/rpc/router.rs b/src/common/meta/src/rpc/router.rs
index 0e700cc6da..03a1e0bfa0 100644
--- a/src/common/meta/src/rpc/router.rs
+++ b/src/common/meta/src/rpc/router.rs
@@ -32,6 +32,10 @@ use crate::key::RegionDistribution;
use crate::peer::Peer;
use crate::DatanodeId;
+/// Returns the distribution of regions to datanodes.
+///
+/// The distribution is a map of datanode id to a list of region ids.
+/// The list of region ids is sorted in ascending order.
pub fn region_distribution(region_routes: &[RegionRoute]) -> RegionDistribution {
let mut regions_id_map = RegionDistribution::new();
for route in region_routes.iter() {
@@ -39,6 +43,10 @@ pub fn region_distribution(region_routes: &[RegionRoute]) -> RegionDistribution
let region_id = route.region.id.region_number();
regions_id_map.entry(peer.id).or_default().push(region_id);
}
+ for peer in route.follower_peers.iter() {
+ let region_id = route.region.id.region_number();
+ regions_id_map.entry(peer.id).or_default().push(region_id);
+ }
}
for (_, regions) in regions_id_map.iter_mut() {
// id asc
@@ -550,4 +558,40 @@ mod tests {
assert_eq!(got, p);
}
+
+ #[test]
+ fn test_region_distribution() {
+ let region_routes = vec![
+ RegionRoute {
+ region: Region {
+ id: RegionId::new(1, 1),
+ name: "r1".to_string(),
+ partition: None,
+ attrs: BTreeMap::new(),
+ },
+ leader_peer: Some(Peer::new(1, "a1")),
+ follower_peers: vec![Peer::new(2, "a2"), Peer::new(3, "a3")],
+ leader_state: None,
+ leader_down_since: None,
+ },
+ RegionRoute {
+ region: Region {
+ id: RegionId::new(1, 2),
+ name: "r2".to_string(),
+ partition: None,
+ attrs: BTreeMap::new(),
+ },
+ leader_peer: Some(Peer::new(2, "a2")),
+ follower_peers: vec![Peer::new(1, "a1"), Peer::new(3, "a3")],
+ leader_state: None,
+ leader_down_since: None,
+ },
+ ];
+
+ let distribution = region_distribution(&region_routes);
+ assert_eq!(distribution.len(), 3);
+ assert_eq!(distribution[&1], vec![1, 2]);
+ assert_eq!(distribution[&2], vec![1, 2]);
+ assert_eq!(distribution[&3], vec![1, 2]);
+ }
}
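To restate the fixed behaviour outside the test above: every region number is credited to its leader datanode and to each follower datanode, and each per-datanode list is sorted ascending. A small standalone sketch; the tuple-based routes and the plain `BTreeMap` are simplified stand-ins for `RegionRoute` and `RegionDistribution`.

    use std::collections::BTreeMap;

    // routes: (region_number, leader_datanode, follower_datanodes)
    fn distribution(routes: &[(u32, u64, Vec<u64>)]) -> BTreeMap<u64, Vec<u32>> {
        let mut map: BTreeMap<u64, Vec<u32>> = BTreeMap::new();
        for (region, leader, followers) in routes {
            // Credit the leader ...
            map.entry(*leader).or_default().push(*region);
            // ... and, since this fix, every follower as well.
            for follower in followers {
                map.entry(*follower).or_default().push(*region);
            }
        }
        for regions in map.values_mut() {
            regions.sort_unstable(); // region ids ascending, as documented
        }
        map
    }

    fn main() {
        let routes = vec![(1, 1, vec![2, 3]), (2, 2, vec![1, 3])];
        let dist = distribution(&routes);
        assert_eq!(dist[&1], vec![1, 2]);
        assert_eq!(dist[&2], vec![1, 2]);
        assert_eq!(dist[&3], vec![1, 2]);
    }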
From 71255b3cbdfa5580b85e4f38453a28a25c12ffa0 Mon Sep 17 00:00:00 2001
From: LFC <990479+MichaelScofield@users.noreply.github.com>
Date: Thu, 10 Apr 2025 18:08:45 +0800
Subject: [PATCH 07/82] refactor: avoid empty display in errors (#5858)
* refactor: avoid empty display in errors
* fix: resolve PR comments
---
Cargo.lock | 1 +
src/common/error/Cargo.toml | 3 +
src/common/error/src/ext.rs | 10 +-
src/common/error/tests/ext.rs | 111 +++++++++++++++++++++
src/common/macro/src/stack_trace_debug.rs | 63 +++++++++++-
src/common/recordbatch/src/adapter.rs | 2 +-
src/common/recordbatch/src/error.rs | 2 +-
src/query/src/datafusion.rs | 9 +-
src/query/src/datafusion/planner.rs | 6 +-
src/query/src/error.rs | 2 +-
src/query/src/plan.rs | 5 +-
src/query/src/planner.rs | 14 +--
src/query/src/range_select/plan.rs | 37 ++++---
src/query/src/range_select/plan_rewrite.rs | 38 ++-----
src/query/src/sql.rs | 8 +-
15 files changed, 227 insertions(+), 84 deletions(-)
create mode 100644 src/common/error/tests/ext.rs
diff --git a/Cargo.lock b/Cargo.lock
index cba1fa8793..da4306f527 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1988,6 +1988,7 @@ dependencies = [
name = "common-error"
version = "0.14.0"
dependencies = [
+ "common-macro",
"http 1.1.0",
"snafu 0.8.5",
"strum 0.27.1",
diff --git a/src/common/error/Cargo.toml b/src/common/error/Cargo.toml
index 148e2c6633..031f944dbd 100644
--- a/src/common/error/Cargo.toml
+++ b/src/common/error/Cargo.toml
@@ -12,3 +12,6 @@ http.workspace = true
snafu.workspace = true
strum.workspace = true
tonic.workspace = true
+
+[dev-dependencies]
+common-macro.workspace = true
diff --git a/src/common/error/src/ext.rs b/src/common/error/src/ext.rs
index 3b4d15a835..3f95c5fe1a 100644
--- a/src/common/error/src/ext.rs
+++ b/src/common/error/src/ext.rs
@@ -42,7 +42,7 @@ pub trait ErrorExt: StackError {
if let Some(external_error) = error.source() {
let external_root = external_error.sources().last().unwrap();
- if error.to_string().is_empty() {
+ if error.transparent() {
format!("{external_root}")
} else {
format!("{error}: {external_root}")
@@ -86,6 +86,14 @@ pub trait StackError: std::error::Error {
}
result
}
+
+ /// Indicates whether this error is "transparent", that it delegates its "display" and "source"
+ /// to the underlying error. Could be useful when you are just wrapping some external error,
+ /// **AND** can not or would not provide meaningful contextual info. For example, the
+ /// `DataFusionError`.
+ fn transparent(&self) -> bool {
+ false
+ }
}
impl StackError for Arc {
diff --git a/src/common/error/tests/ext.rs b/src/common/error/tests/ext.rs
new file mode 100644
index 0000000000..0a39ed51c6
--- /dev/null
+++ b/src/common/error/tests/ext.rs
@@ -0,0 +1,111 @@
+// Copyright 2023 Greptime Team
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+use std::any::Any;
+
+use common_error::ext::{ErrorExt, PlainError, StackError};
+use common_error::status_code::StatusCode;
+use common_macro::stack_trace_debug;
+use snafu::{Location, ResultExt, Snafu};
+
+#[derive(Snafu)]
+#[stack_trace_debug]
+enum MyError {
+ #[snafu(display(r#"A normal error with "display" attribute, message "{}""#, message))]
+ Normal {
+ message: String,
+ #[snafu(source)]
+ error: PlainError,
+ #[snafu(implicit)]
+ location: Location,
+ },
+
+ #[snafu(transparent)]
+ Transparent {
+ #[snafu(source)]
+ error: PlainError,
+ #[snafu(implicit)]
+ location: Location,
+ },
+}
+
+impl ErrorExt for MyError {
+ fn status_code(&self) -> StatusCode {
+ StatusCode::Unexpected
+ }
+
+ fn as_any(&self) -> &dyn Any {
+ self
+ }
+}
+
+fn normal_error() -> Result<(), MyError> {
+ let plain_error = PlainError::new("".to_string(), StatusCode::Unexpected);
+ Err(plain_error).context(NormalSnafu { message: "blabla" })
+}
+
+fn transparent_error() -> Result<(), MyError> {
+ let plain_error = PlainError::new("".to_string(), StatusCode::Unexpected);
+ Err(plain_error)?
+}
+
+#[test]
+fn test_output_msg() {
+ let result = normal_error();
+ assert_eq!(
+ result.unwrap_err().output_msg(),
+ r#"A normal error with "display" attribute, message "blabla": "#
+ );
+
+ let result = transparent_error();
+ assert_eq!(result.unwrap_err().output_msg(), "");
+}
+
+#[test]
+fn test_to_string() {
+ let result = normal_error();
+ assert_eq!(
+ result.unwrap_err().to_string(),
+ r#"A normal error with "display" attribute, message "blabla""#
+ );
+
+ let result = transparent_error();
+ assert_eq!(result.unwrap_err().to_string(), "");
+}
+
+#[test]
+fn test_debug_format() {
+ let result = normal_error();
+ assert_eq!(
+ format!("{:?}", result.unwrap_err()),
+ r#"0: A normal error with "display" attribute, message "blabla", at src/common/error/tests/ext.rs:55:22
+1: PlainError { msg: "", status_code: Unexpected }"#
+ );
+
+ let result = transparent_error();
+ assert_eq!(
+ format!("{:?}", result.unwrap_err()),
+ r#"0: , at src/common/error/tests/ext.rs:60:5
+1: PlainError { msg: "", status_code: Unexpected }"#
+ );
+}
+
+#[test]
+fn test_transparent_flag() {
+ let result = normal_error();
+ assert!(!result.unwrap_err().transparent());
+
+ let result = transparent_error();
+ assert!(result.unwrap_err().transparent());
+}
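One consequence of `#[snafu(transparent)]` worth spelling out, since the query-side hunks below rely on it: as well as delegating `Display` and `source`, snafu stops requiring a context selector for such variants, so the wrapped error converts via the generated `From` impl and a bare `?` suffices. That is why this patch can drop the `.context(DataFusionSnafu)` calls. A minimal sketch, with an illustrative error standing in for `DataFusionError`:

    use snafu::Snafu;

    #[derive(Debug, Snafu)]
    enum QueryError {
        // Transparent: no display text of its own; the wrapped error converts
        // into `QueryError` through the generated `From` impl.
        #[snafu(transparent)]
        Parse { source: std::num::ParseIntError },
    }

    // Stand-in for a fallible call into an external library.
    fn external_call(input: &str) -> Result<i64, std::num::ParseIntError> {
        input.parse()
    }

    fn run(input: &str) -> Result<i64, QueryError> {
        // No `.context(...)` needed: `?` converts the source error directly.
        Ok(external_call(input)?)
    }

    fn main() {
        assert!(run("42").is_ok());
        assert!(run("not a number").is_err());
    }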
diff --git a/src/common/macro/src/stack_trace_debug.rs b/src/common/macro/src/stack_trace_debug.rs
index fbc24260f1..f82f4746d3 100644
--- a/src/common/macro/src/stack_trace_debug.rs
+++ b/src/common/macro/src/stack_trace_debug.rs
@@ -14,7 +14,7 @@
//! implement `::common_error::ext::StackError`
-use proc_macro2::{Span, TokenStream as TokenStream2};
+use proc_macro2::{Literal, Span, TokenStream as TokenStream2, TokenTree};
use quote::{quote, quote_spanned};
use syn::spanned::Spanned;
use syn::{parenthesized, Attribute, Ident, ItemEnum, Variant};
@@ -32,6 +32,7 @@ pub fn stack_trace_style_impl(args: TokenStream2, input: TokenStream2) -> TokenS
variants.push(variant);
}
+ let transparent_fn = build_transparent_fn(enum_name.clone(), &variants);
let debug_fmt_fn = build_debug_fmt_impl(enum_name.clone(), variants.clone());
let next_fn = build_next_impl(enum_name.clone(), variants);
let debug_impl = build_debug_impl(enum_name.clone());
@@ -43,6 +44,7 @@ pub fn stack_trace_style_impl(args: TokenStream2, input: TokenStream2) -> TokenS
impl ::common_error::ext::StackError for #enum_name {
#debug_fmt_fn
#next_fn
+ #transparent_fn
}
#debug_impl
@@ -115,6 +117,7 @@ struct ErrorVariant {
has_source: bool,
has_external_cause: bool,
display: TokenStream2,
+ transparent: bool,
span: Span,
cfg_attr: Option,
}
@@ -140,6 +143,7 @@ impl ErrorVariant {
}
let mut display = None;
+ let mut transparent = false;
let mut cfg_attr = None;
for attr in variant.attrs {
if attr.path().is_ident("snafu") {
@@ -150,17 +154,29 @@ impl ErrorVariant {
let display_ts: TokenStream2 = content.parse()?;
display = Some(display_ts);
Ok(())
+ } else if meta.path.is_ident("transparent") {
+ display = Some(TokenStream2::from(TokenTree::Literal(Literal::string(
+ "",
+ ))));
+ transparent = true;
+ Ok(())
} else {
Err(meta.error("unrecognized repr"))
}
})
- .expect("Each error should contains a display attribute");
+ .unwrap_or_else(|e| panic!("{e}"));
}
if attr.path().is_ident("cfg") {
cfg_attr = Some(attr);
}
}
+ let display = display.unwrap_or_else(|| {
+ panic!(
+ r#"Error "{}" must be annotated with attribute "display" or "transparent"."#,
+ variant.ident,
+ )
+ });
let field_ident = variant
.fields
@@ -174,7 +190,8 @@ impl ErrorVariant {
has_location,
has_source,
has_external_cause,
- display: display.unwrap(),
+ display,
+ transparent,
span,
cfg_attr,
}
@@ -275,4 +292,44 @@ impl ErrorVariant {
}
}
}
+
+ fn build_transparent_match_arm(&self) -> TokenStream2 {
+ let cfg = if let Some(cfg) = &self.cfg_attr {
+ quote_spanned!(cfg.span() => #cfg)
+ } else {
+ quote! {}
+ };
+ let name = &self.name;
+ let fields = &self.fields;
+
+ if self.transparent {
+ quote_spanned! {
+ self.span => #cfg #[allow(unused_variables)] #name { #(#fields),* } => {
+ true
+ },
+ }
+ } else {
+ quote_spanned! {
+ self.span => #cfg #[allow(unused_variables)] #name { #(#fields),* } =>{
+ false
+ }
+ }
+ }
+ }
+}
+
+fn build_transparent_fn(enum_name: Ident, variants: &[ErrorVariant]) -> TokenStream2 {
+ let match_arms = variants
+ .iter()
+ .map(|v| v.build_transparent_match_arm())
+ .collect::>();
+
+ quote! {
+ fn transparent(&self) -> bool {
+ use #enum_name::*;
+ match self {
+ #(#match_arms)*
+ }
+ }
+ }
}
diff --git a/src/common/recordbatch/src/adapter.rs b/src/common/recordbatch/src/adapter.rs
index d342aa8129..3d27d7120f 100644
--- a/src/common/recordbatch/src/adapter.rs
+++ b/src/common/recordbatch/src/adapter.rs
@@ -298,7 +298,7 @@ impl Stream for RecordBatchStreamAdapter {
match Pin::new(&mut self.stream).poll_next(cx) {
Poll::Pending => Poll::Pending,
Poll::Ready(Some(df_record_batch)) => {
- let df_record_batch = df_record_batch.context(error::PollStreamSnafu)?;
+ let df_record_batch = df_record_batch?;
Poll::Ready(Some(RecordBatch::try_from_df_record_batch(
self.schema(),
df_record_batch,
diff --git a/src/common/recordbatch/src/error.rs b/src/common/recordbatch/src/error.rs
index 6a1c61c0a0..dfd85e4aa1 100644
--- a/src/common/recordbatch/src/error.rs
+++ b/src/common/recordbatch/src/error.rs
@@ -65,7 +65,7 @@ pub enum Error {
location: Location,
},
- #[snafu(display(""))]
+ #[snafu(transparent)]
PollStream {
#[snafu(source)]
error: datafusion::error::DataFusionError,
diff --git a/src/query/src/datafusion.rs b/src/query/src/datafusion.rs
index e0f020cd3a..dba7d0215a 100644
--- a/src/query/src/datafusion.rs
+++ b/src/query/src/datafusion.rs
@@ -50,9 +50,9 @@ use crate::dataframe::DataFrame;
pub use crate::datafusion::planner::DfContextProviderAdapter;
use crate::dist_plan::MergeScanLogicalPlan;
use crate::error::{
- CatalogSnafu, ConvertSchemaSnafu, CreateRecordBatchSnafu, DataFusionSnafu,
- MissingTableMutationHandlerSnafu, MissingTimestampColumnSnafu, QueryExecutionSnafu, Result,
- TableMutationSnafu, TableNotFoundSnafu, TableReadOnlySnafu, UnsupportedExprSnafu,
+ CatalogSnafu, ConvertSchemaSnafu, CreateRecordBatchSnafu, MissingTableMutationHandlerSnafu,
+ MissingTimestampColumnSnafu, QueryExecutionSnafu, Result, TableMutationSnafu,
+ TableNotFoundSnafu, TableReadOnlySnafu, UnsupportedExprSnafu,
};
use crate::executor::QueryExecutor;
use crate::metrics::{OnDone, QUERY_STAGE_ELAPSED};
@@ -308,8 +308,7 @@ impl DatafusionQueryEngine {
let physical_plan = state
.query_planner()
.create_physical_plan(&optimized_plan, state)
- .await
- .context(DataFusionSnafu)?;
+ .await?;
Ok(physical_plan)
}
diff --git a/src/query/src/datafusion/planner.rs b/src/query/src/datafusion/planner.rs
index 912393690d..0ad531541f 100644
--- a/src/query/src/datafusion/planner.rs
+++ b/src/query/src/datafusion/planner.rs
@@ -43,7 +43,7 @@ use datafusion_sql::parser::Statement as DfStatement;
use session::context::QueryContextRef;
use snafu::{Location, ResultExt};
-use crate::error::{CatalogSnafu, DataFusionSnafu, Result};
+use crate::error::{CatalogSnafu, Result};
use crate::query_engine::{DefaultPlanDecoder, QueryEngineState};
pub struct DfContextProviderAdapter {
@@ -70,9 +70,7 @@ impl DfContextProviderAdapter {
query_ctx: QueryContextRef,
) -> Result {
let table_names = if let Some(df_stmt) = df_stmt {
- session_state
- .resolve_table_references(df_stmt)
- .context(DataFusionSnafu)?
+ session_state.resolve_table_references(df_stmt)?
} else {
vec![]
};
diff --git a/src/query/src/error.rs b/src/query/src/error.rs
index 1ebba8de5d..c2a2e960b0 100644
--- a/src/query/src/error.rs
+++ b/src/query/src/error.rs
@@ -126,7 +126,7 @@ pub enum Error {
location: Location,
},
- #[snafu(display(""))]
+ #[snafu(transparent)]
DataFusion {
#[snafu(source)]
error: DataFusionError,
diff --git a/src/query/src/plan.rs b/src/query/src/plan.rs
index e94c073c70..8d5586607e 100644
--- a/src/query/src/plan.rs
+++ b/src/query/src/plan.rs
@@ -19,12 +19,11 @@ use datafusion_common::tree_node::{Transformed, TreeNode, TreeNodeRewriter};
use datafusion_common::TableReference;
use datafusion_expr::{BinaryExpr, Expr, Join, LogicalPlan, Operator};
use session::context::QueryContextRef;
-use snafu::ResultExt;
pub use table::metadata::TableType;
use table::table::adapter::DfTableProviderAdapter;
use table::table_name::TableName;
-use crate::error::{DataFusionSnafu, Result};
+use crate::error::Result;
struct TableNamesExtractAndRewriter {
pub(crate) table_names: HashSet,
@@ -119,7 +118,7 @@ pub fn extract_and_rewrite_full_table_names(
query_ctx: QueryContextRef,
) -> Result<(HashSet, LogicalPlan)> {
let mut extractor = TableNamesExtractAndRewriter::new(query_ctx);
- let plan = plan.rewrite(&mut extractor).context(DataFusionSnafu)?;
+ let plan = plan.rewrite(&mut extractor)?;
Ok((extractor.table_names, plan.data))
}
diff --git a/src/query/src/planner.rs b/src/query/src/planner.rs
index b0d0063d70..e3ee3904b4 100644
--- a/src/query/src/planner.rs
+++ b/src/query/src/planner.rs
@@ -31,7 +31,7 @@ use snafu::ResultExt;
use sql::ast::Expr as SqlExpr;
use sql::statements::statement::Statement;
-use crate::error::{DataFusionSnafu, PlanSqlSnafu, QueryPlanSnafu, Result, SqlSnafu};
+use crate::error::{PlanSqlSnafu, QueryPlanSnafu, Result, SqlSnafu};
use crate::log_query::planner::LogQueryPlanner;
use crate::parser::QueryStatement;
use crate::promql::planner::PromPlanner;
@@ -118,8 +118,7 @@ impl DfLogicalPlanner {
let context = QueryEngineContext::new(self.session_state.clone(), query_ctx);
let plan = self
.engine_state
- .optimize_by_extension_rules(plan, &context)
- .context(DataFusionSnafu)?;
+ .optimize_by_extension_rules(plan, &context)?;
common_telemetry::debug!("Logical planner, optimize result: {plan}");
Ok(plan)
@@ -154,9 +153,7 @@ impl DfLogicalPlanner {
let sql_to_rel = SqlToRel::new_with_options(&context_provider, parser_options);
- sql_to_rel
- .sql_to_expr(sql.into(), schema, &mut PlannerContext::new())
- .context(DataFusionSnafu)
+ Ok(sql_to_rel.sql_to_expr(sql.into(), schema, &mut PlannerContext::new())?)
}
#[tracing::instrument(skip_all)]
@@ -183,10 +180,7 @@ impl DfLogicalPlanner {
#[tracing::instrument(skip_all)]
fn optimize_logical_plan(&self, plan: LogicalPlan) -> Result {
- self.engine_state
- .optimize_logical_plan(plan)
- .context(DataFusionSnafu)
- .map(Into::into)
+ Ok(self.engine_state.optimize_logical_plan(plan)?)
}
}
diff --git a/src/query/src/range_select/plan.rs b/src/query/src/range_select/plan.rs
index eb28aacf1e..f2a25997bc 100644
--- a/src/query/src/range_select/plan.rs
+++ b/src/query/src/range_select/plan.rs
@@ -55,9 +55,9 @@ use datatypes::arrow::record_batch::RecordBatch;
use datatypes::arrow::row::{OwnedRow, RowConverter, SortField};
use futures::{ready, Stream};
use futures_util::StreamExt;
-use snafu::{ensure, ResultExt};
+use snafu::ensure;
-use crate::error::{DataFusionSnafu, RangeQuerySnafu, Result};
+use crate::error::{RangeQuerySnafu, Result};
type Millisecond = ::Native;
@@ -373,25 +373,22 @@ impl RangeSelect {
Ok((None, Arc::new(field)))
},
)
- .collect::>>()
- .context(DataFusionSnafu)?;
+ .collect::>>()?;
// add align_ts
- let ts_field = time_index
- .to_field(input.schema().as_ref())
- .context(DataFusionSnafu)?;
+ let ts_field = time_index.to_field(input.schema().as_ref())?;
let time_index_name = ts_field.1.name().clone();
fields.push(ts_field);
// add by
- let by_fields = exprlist_to_fields(&by, &input).context(DataFusionSnafu)?;
+ let by_fields = exprlist_to_fields(&by, &input)?;
fields.extend(by_fields.clone());
- let schema_before_project = Arc::new(
- DFSchema::new_with_metadata(fields, input.schema().metadata().clone())
- .context(DataFusionSnafu)?,
- );
- let by_schema = Arc::new(
- DFSchema::new_with_metadata(by_fields, input.schema().metadata().clone())
- .context(DataFusionSnafu)?,
- );
+ let schema_before_project = Arc::new(DFSchema::new_with_metadata(
+ fields,
+ input.schema().metadata().clone(),
+ )?);
+ let by_schema = Arc::new(DFSchema::new_with_metadata(
+ by_fields,
+ input.schema().metadata().clone(),
+ )?);
// If the results of project plan can be obtained directly from range plan without any additional
// calculations, no project plan is required. We can simply project the final output of the range
// plan to produce the final result.
@@ -421,10 +418,10 @@ impl RangeSelect {
(f.0.cloned(), Arc::new(f.1.clone()))
})
.collect();
- Arc::new(
- DFSchema::new_with_metadata(project_field, input.schema().metadata().clone())
- .context(DataFusionSnafu)?,
- )
+ Arc::new(DFSchema::new_with_metadata(
+ project_field,
+ input.schema().metadata().clone(),
+ )?)
} else {
schema_before_project.clone()
};
diff --git a/src/query/src/range_select/plan_rewrite.rs b/src/query/src/range_select/plan_rewrite.rs
index ff05a26706..b53e1079b8 100644
--- a/src/query/src/range_select/plan_rewrite.rs
+++ b/src/query/src/range_select/plan_rewrite.rs
@@ -43,8 +43,7 @@ use snafu::{ensure, OptionExt, ResultExt};
use table::table::adapter::DfTableProviderAdapter;
use crate::error::{
- CatalogSnafu, DataFusionSnafu, RangeQuerySnafu, Result, TimeIndexNotFoundSnafu,
- UnknownTableSnafu,
+ CatalogSnafu, RangeQuerySnafu, Result, TimeIndexNotFoundSnafu, UnknownTableSnafu,
};
use crate::plan::ExtractExpr;
use crate::range_select::plan::{Fill, RangeFn, RangeSelect};
@@ -385,8 +384,7 @@ impl RangePlanRewriter {
let new_expr = expr
.iter()
.map(|expr| expr.clone().rewrite(&mut range_rewriter).map(|x| x.data))
- .collect::>>()
- .context(DataFusionSnafu)?;
+ .collect::>>()?;
if range_rewriter.by.is_empty() {
range_rewriter.by = default_by;
}
@@ -408,9 +406,7 @@ impl RangePlanRewriter {
} else {
let project_plan = LogicalPlanBuilder::from(range_plan)
.project(new_expr)
- .context(DataFusionSnafu)?
- .build()
- .context(DataFusionSnafu)?;
+ .and_then(|x| x.build())?;
Ok(Some(project_plan))
}
}
@@ -436,8 +432,7 @@ impl RangePlanRewriter {
}
);
LogicalPlanBuilder::from(inputs[0].clone())
- .explain(*verbose, true)
- .context(DataFusionSnafu)?
+ .explain(*verbose, true)?
.build()
}
LogicalPlan::Explain(Explain { verbose, .. }) => {
@@ -448,8 +443,7 @@ impl RangePlanRewriter {
}
);
LogicalPlanBuilder::from(inputs[0].clone())
- .explain(*verbose, false)
- .context(DataFusionSnafu)?
+ .explain(*verbose, false)?
.build()
}
LogicalPlan::Distinct(Distinct::On(DistinctOn {
@@ -470,13 +464,11 @@ impl RangePlanRewriter {
on_expr.clone(),
select_expr.clone(),
sort_expr.clone(),
- )
- .context(DataFusionSnafu)?
+ )?
.build()
}
_ => plan.with_new_exprs(plan.expressions_consider_join(), inputs),
- }
- .context(DataFusionSnafu)?;
+ }?;
Ok(Some(plan))
} else {
Ok(None)
@@ -606,8 +598,6 @@ fn interval_only_in_expr(expr: &Expr) -> bool {
#[cfg(test)]
mod test {
- use std::error::Error;
-
use arrow::datatypes::IntervalUnit;
use catalog::memory::MemoryCatalogManager;
use catalog::RegisterTableRequest;
@@ -825,12 +815,7 @@ mod test {
/// the right argument is `range_fn(avg(field_0), '5m', 'NULL', '0', '1h')`
async fn range_argument_err_1() {
let query = r#"SELECT range_fn('5m', avg(field_0), 'NULL', '1', tag_0, '1h') FROM test group by tag_0;"#;
- let error = do_query(query)
- .await
- .unwrap_err()
- .source()
- .unwrap()
- .to_string();
+ let error = do_query(query).await.unwrap_err().to_string();
assert_eq!(
error,
"Error during planning: Illegal argument `Utf8(\"5m\")` in range select query"
@@ -840,12 +825,7 @@ mod test {
#[tokio::test]
async fn range_argument_err_2() {
let query = r#"SELECT range_fn(avg(field_0), 5, 'NULL', '1', tag_0, '1h') FROM test group by tag_0;"#;
- let error = do_query(query)
- .await
- .unwrap_err()
- .source()
- .unwrap()
- .to_string();
+ let error = do_query(query).await.unwrap_err().to_string();
assert_eq!(
error,
"Error during planning: Illegal argument `Int64(5)` in range select query"
diff --git a/src/query/src/sql.rs b/src/query/src/sql.rs
index fbda344427..b62289fb6b 100644
--- a/src/query/src/sql.rs
+++ b/src/query/src/sql.rs
@@ -301,8 +301,7 @@ async fn query_from_information_schema_table(
.state()
.clone(),
)
- .read_table(view)
- .context(error::DataFusionSnafu)?;
+ .read_table(view)?;
let planner = query_engine.planner();
let planner = planner
@@ -319,10 +318,7 @@ async fn query_from_information_schema_table(
}
};
- let stream = dataframe
- .execute_stream()
- .await
- .context(error::DataFusionSnafu)?;
+ let stream = dataframe.execute_stream().await?;
Ok(Output::new_with_stream(Box::pin(
RecordBatchStreamAdapter::try_new(stream).context(error::CreateRecordBatchSnafu)?,
From 84e2bc52c228e8f552127a9b4a6afdb8081973a8 Mon Sep 17 00:00:00 2001
From: fys <40801205+fengys1996@users.noreply.github.com>
Date: Fri, 11 Apr 2025 13:54:28 +0800
Subject: [PATCH 08/82] fix: gRPC connection pool leak (#5876)
* fix: gRPC connection pool leak
* use .config() instead of .inner.config
* cancel the bg task if it is running
* fix: cr
* add unit test for pool release
* Avoid potential data races
---
Cargo.lock | 1 +
src/common/grpc/Cargo.toml | 1 +
src/common/grpc/src/channel_manager.rs | 186 ++++++++++++++++++-------
3 files changed, 140 insertions(+), 48 deletions(-)
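The shape of the leak and of the fix, in prose: the background recycle task held a clone of the pool, so the pool (and the task) outlived every `ChannelManager` handle. The fix moves the shared state behind `Arc<Inner>` and cancels the task from `Inner::drop` through a `CancellationToken`. A simplified sketch of that pattern; the names and the recycle body are stand-ins, not the real channel manager.

    use std::sync::Arc;
    use std::time::Duration;
    use tokio_util::sync::CancellationToken;

    #[derive(Clone)]
    struct Manager {
        inner: Arc<Inner>,
    }

    struct Inner {
        cancel: CancellationToken,
    }

    impl Drop for Inner {
        fn drop(&mut self) {
            // Runs when the last `Manager` clone is gone: stop the background
            // loop so it no longer keeps the pool (or anything else) alive.
            self.cancel.cancel();
        }
    }

    impl Manager {
        fn new() -> Self {
            let inner = Arc::new(Inner {
                cancel: CancellationToken::new(),
            });
            let cancel = inner.cancel.clone();
            tokio::spawn(async move {
                let mut interval = tokio::time::interval(Duration::from_secs(60));
                loop {
                    tokio::select! {
                        _ = cancel.cancelled() => break,
                        _ = interval.tick() => { /* recycle idle channels here */ }
                    }
                }
            });
            Self { inner }
        }
    }

    #[tokio::main]
    async fn main() {
        let mgr = Manager::new();
        drop(mgr); // dropping the last handle cancels the background task
    }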
diff --git a/Cargo.lock b/Cargo.lock
index da4306f527..1b108a7546 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -2099,6 +2099,7 @@ dependencies = [
"rand 0.9.0",
"snafu 0.8.5",
"tokio",
+ "tokio-util",
"tonic 0.12.3",
"tower 0.5.2",
]
diff --git a/src/common/grpc/Cargo.toml b/src/common/grpc/Cargo.toml
index d20e751e41..4dadf0571b 100644
--- a/src/common/grpc/Cargo.toml
+++ b/src/common/grpc/Cargo.toml
@@ -25,6 +25,7 @@ lazy_static.workspace = true
prost.workspace = true
snafu.workspace = true
tokio.workspace = true
+tokio-util.workspace = true
tonic.workspace = true
tower.workspace = true
diff --git a/src/common/grpc/src/channel_manager.rs b/src/common/grpc/src/channel_manager.rs
index 0127829567..713ad58d81 100644
--- a/src/common/grpc/src/channel_manager.rs
+++ b/src/common/grpc/src/channel_manager.rs
@@ -22,6 +22,7 @@ use dashmap::mapref::entry::Entry;
use dashmap::DashMap;
use lazy_static::lazy_static;
use snafu::{OptionExt, ResultExt};
+use tokio_util::sync::CancellationToken;
use tonic::transport::{
Certificate, Channel as InnerChannel, ClientTlsConfig, Endpoint, Identity, Uri,
};
@@ -39,18 +40,48 @@ lazy_static! {
static ref ID: AtomicU64 = AtomicU64::new(0);
}
-#[derive(Clone, Debug)]
+#[derive(Clone, Debug, Default)]
pub struct ChannelManager {
+ inner: Arc<Inner>,
+}
+
+#[derive(Debug)]
+struct Inner {
id: u64,
config: ChannelConfig,
client_tls_config: Option<ClientTlsConfig>,
pool: Arc<Pool>,
- channel_recycle_started: Arc<AtomicBool>,
+ channel_recycle_started: AtomicBool,
+ cancel: CancellationToken,
}
-impl Default for ChannelManager {
+impl Default for Inner {
fn default() -> Self {
- ChannelManager::with_config(ChannelConfig::default())
+ Self::with_config(ChannelConfig::default())
+ }
+}
+
+impl Drop for Inner {
+ fn drop(&mut self) {
+ // Cancel the channel recycle task.
+ self.cancel.cancel();
+ }
+}
+
+impl Inner {
+ fn with_config(config: ChannelConfig) -> Self {
+ let id = ID.fetch_add(1, Ordering::Relaxed);
+ let pool = Arc::new(Pool::default());
+ let cancel = CancellationToken::new();
+
+ Self {
+ id,
+ config,
+ client_tls_config: None,
+ pool,
+ channel_recycle_started: AtomicBool::new(false),
+ cancel,
+ }
}
}
@@ -60,19 +91,14 @@ impl ChannelManager {
}
pub fn with_config(config: ChannelConfig) -> Self {
- let id = ID.fetch_add(1, Ordering::Relaxed);
- let pool = Arc::new(Pool::default());
+ let inner = Inner::with_config(config);
Self {
- id,
- config,
- client_tls_config: None,
- pool,
- channel_recycle_started: Arc::new(AtomicBool::new(false)),
+ inner: Arc::new(inner),
}
}
pub fn with_tls_config(config: ChannelConfig) -> Result {
- let mut cm = Self::with_config(config.clone());
+ let mut inner = Inner::with_config(config.clone());
// setup tls
let path_config = config.client_tls.context(InvalidTlsConfigSnafu {
@@ -88,17 +114,23 @@ impl ChannelManager {
.context(InvalidConfigFilePathSnafu)?;
let client_identity = Identity::from_pem(client_cert, client_key);
- cm.client_tls_config = Some(
+ inner.client_tls_config = Some(
ClientTlsConfig::new()
.ca_certificate(server_root_ca_cert)
.identity(client_identity),
);
- Ok(cm)
+ Ok(Self {
+ inner: Arc::new(inner),
+ })
}
pub fn config(&self) -> &ChannelConfig {
- &self.config
+ &self.inner.config
+ }
+
+ fn pool(&self) -> &Arc {
+ &self.inner.pool
}
pub fn get(&self, addr: impl AsRef) -> Result {
@@ -106,12 +138,12 @@ impl ChannelManager {
let addr = addr.as_ref();
// It will acquire the read lock.
- if let Some(inner_ch) = self.pool.get(addr) {
+ if let Some(inner_ch) = self.pool().get(addr) {
return Ok(inner_ch);
}
// It will acquire the write lock.
- let entry = match self.pool.entry(addr.to_string()) {
+ let entry = match self.pool().entry(addr.to_string()) {
Entry::Occupied(entry) => {
entry.get().increase_access();
entry.into_ref()
@@ -150,7 +182,7 @@ impl ChannelManager {
access: AtomicUsize::new(1),
use_default_connector: false,
};
- self.pool.put(addr, channel);
+ self.pool().put(addr, channel);
Ok(inner_channel)
}
@@ -159,11 +191,11 @@ impl ChannelManager {
where
F: FnMut(&String, &mut Channel) -> bool,
{
- self.pool.retain_channel(f);
+ self.pool().retain_channel(f);
}
fn build_endpoint(&self, addr: &str) -> Result {
- let http_prefix = if self.client_tls_config.is_some() {
+ let http_prefix = if self.inner.client_tls_config.is_some() {
"https"
} else {
"http"
@@ -172,51 +204,52 @@ impl ChannelManager {
let mut endpoint =
Endpoint::new(format!("{http_prefix}://{addr}")).context(CreateChannelSnafu)?;
- if let Some(dur) = self.config.timeout {
+ if let Some(dur) = self.config().timeout {
endpoint = endpoint.timeout(dur);
}
- if let Some(dur) = self.config.connect_timeout {
+ if let Some(dur) = self.config().connect_timeout {
endpoint = endpoint.connect_timeout(dur);
}
- if let Some(limit) = self.config.concurrency_limit {
+ if let Some(limit) = self.config().concurrency_limit {
endpoint = endpoint.concurrency_limit(limit);
}
- if let Some((limit, dur)) = self.config.rate_limit {
+ if let Some((limit, dur)) = self.config().rate_limit {
endpoint = endpoint.rate_limit(limit, dur);
}
- if let Some(size) = self.config.initial_stream_window_size {
+ if let Some(size) = self.config().initial_stream_window_size {
endpoint = endpoint.initial_stream_window_size(size);
}
- if let Some(size) = self.config.initial_connection_window_size {
+ if let Some(size) = self.config().initial_connection_window_size {
endpoint = endpoint.initial_connection_window_size(size);
}
- if let Some(dur) = self.config.http2_keep_alive_interval {
+ if let Some(dur) = self.config().http2_keep_alive_interval {
endpoint = endpoint.http2_keep_alive_interval(dur);
}
- if let Some(dur) = self.config.http2_keep_alive_timeout {
+ if let Some(dur) = self.config().http2_keep_alive_timeout {
endpoint = endpoint.keep_alive_timeout(dur);
}
- if let Some(enabled) = self.config.http2_keep_alive_while_idle {
+ if let Some(enabled) = self.config().http2_keep_alive_while_idle {
endpoint = endpoint.keep_alive_while_idle(enabled);
}
- if let Some(enabled) = self.config.http2_adaptive_window {
+ if let Some(enabled) = self.config().http2_adaptive_window {
endpoint = endpoint.http2_adaptive_window(enabled);
}
- if let Some(tls_config) = &self.client_tls_config {
+ if let Some(tls_config) = &self.inner.client_tls_config {
endpoint = endpoint
.tls_config(tls_config.clone())
.context(CreateChannelSnafu)?;
}
endpoint = endpoint
- .tcp_keepalive(self.config.tcp_keepalive)
- .tcp_nodelay(self.config.tcp_nodelay);
+ .tcp_keepalive(self.config().tcp_keepalive)
+ .tcp_nodelay(self.config().tcp_nodelay);
Ok(endpoint)
}
fn trigger_channel_recycling(&self) {
if self
+ .inner
.channel_recycle_started
.compare_exchange(false, true, Ordering::Relaxed, Ordering::Relaxed)
.is_err()
@@ -224,13 +257,15 @@ impl ChannelManager {
return;
}
- let pool = self.pool.clone();
- let _handle = common_runtime::spawn_global(async {
- recycle_channel_in_loop(pool, RECYCLE_CHANNEL_INTERVAL_SECS).await;
+ let pool = self.pool().clone();
+ let cancel = self.inner.cancel.clone();
+ let id = self.inner.id;
+ let _handle = common_runtime::spawn_global(async move {
+ recycle_channel_in_loop(pool, id, cancel, RECYCLE_CHANNEL_INTERVAL_SECS).await;
});
info!(
"ChannelManager: {}, channel recycle is started, running in the background!",
- self.id
+ self.inner.id
);
}
}
@@ -443,11 +478,23 @@ impl Pool {
}
}
-async fn recycle_channel_in_loop(pool: Arc, interval_secs: u64) {
+async fn recycle_channel_in_loop(
+ pool: Arc,
+ id: u64,
+ cancel: CancellationToken,
+ interval_secs: u64,
+) {
let mut interval = tokio::time::interval(Duration::from_secs(interval_secs));
loop {
- let _ = interval.tick().await;
+ tokio::select! {
+ _ = cancel.cancelled() => {
+ info!("Stop channel recycle, ChannelManager id: {}", id);
+ break;
+ },
+ _ = interval.tick() => {}
+ }
+
pool.retain_channel(|_, c| c.access.swap(0, Ordering::Relaxed) != 0)
}
}
@@ -461,11 +508,7 @@ mod tests {
#[should_panic]
#[test]
fn test_invalid_addr() {
- let pool = Arc::new(Pool::default());
- let mgr = ChannelManager {
- pool,
- ..Default::default()
- };
+ let mgr = ChannelManager::default();
let addr = "http://test";
let _ = mgr.get(addr).unwrap();
@@ -475,7 +518,9 @@ mod tests {
async fn test_access_count() {
let mgr = ChannelManager::new();
// Do not start recycle
- mgr.channel_recycle_started.store(true, Ordering::Relaxed);
+ mgr.inner
+ .channel_recycle_started
+ .store(true, Ordering::Relaxed);
let mgr = Arc::new(mgr);
let addr = "test_uri";
@@ -493,12 +538,12 @@ mod tests {
join.await.unwrap();
}
- assert_eq!(1000, mgr.pool.get_access(addr).unwrap());
+ assert_eq!(1000, mgr.pool().get_access(addr).unwrap());
- mgr.pool
+ mgr.pool()
.retain_channel(|_, c| c.access.swap(0, Ordering::Relaxed) != 0);
- assert_eq!(0, mgr.pool.get_access(addr).unwrap());
+ assert_eq!(0, mgr.pool().get_access(addr).unwrap());
}
#[test]
@@ -624,4 +669,49 @@ mod tests {
true
});
}
+
+ #[tokio::test]
+ async fn test_pool_release_with_channel_recycle() {
+ let mgr = ChannelManager::new();
+
+ let pool_holder = mgr.pool().clone();
+
+ // start channel recycle task
+ let addr = "test_addr";
+ let _ = mgr.get(addr);
+
+ let mgr_clone_1 = mgr.clone();
+ let mgr_clone_2 = mgr.clone();
+ assert_eq!(3, Arc::strong_count(mgr.pool()));
+
+ drop(mgr_clone_1);
+ drop(mgr_clone_2);
+ assert_eq!(3, Arc::strong_count(mgr.pool()));
+
+ drop(mgr);
+
+ // wait for the channel recycle task to finish
+ tokio::time::sleep(Duration::from_millis(10)).await;
+
+ assert_eq!(1, Arc::strong_count(&pool_holder));
+ }
+
+ #[tokio::test]
+ async fn test_pool_release_without_channel_recycle() {
+ let mgr = ChannelManager::new();
+
+ let pool_holder = mgr.pool().clone();
+
+ let mgr_clone_1 = mgr.clone();
+ let mgr_clone_2 = mgr.clone();
+ assert_eq!(2, Arc::strong_count(mgr.pool()));
+
+ drop(mgr_clone_1);
+ drop(mgr_clone_2);
+ assert_eq!(2, Arc::strong_count(mgr.pool()));
+
+ drop(mgr);
+
+ assert_eq!(1, Arc::strong_count(&pool_holder));
+ }
}
From 5a36fa5e18f979a97ebc3adacbc698852a4bc87a Mon Sep 17 00:00:00 2001
From: Weny Xu
Date: Fri, 11 Apr 2025 14:42:41 +0800
Subject: [PATCH 09/82] fix: always reject writes while downgrading region
(#5842)
* fix: always reject writes while downgrading region
* chore: apply suggestions from CR
---
.../function/src/admin/migrate_region.rs | 5 +-
src/common/meta/src/instruction.rs | 10 +-
src/datanode/src/heartbeat/handler.rs | 4 +-
.../src/heartbeat/handler/downgrade_region.rs | 119 ++++++++++--------
.../src/heartbeat/handler/upgrade_region.rs | 9 ++
src/file-engine/src/engine.rs | 7 +-
.../src/procedure/region_migration.rs | 7 ++
.../close_downgraded_region.rs | 5 +-
.../downgrade_leader_region.rs | 68 ++--------
.../upgrade_candidate_region.rs | 2 +
src/meta-srv/src/procedure/test_util.rs | 1 +
src/metric-engine/src/engine.rs | 32 ++++-
src/metric-engine/src/engine/catchup.rs | 4 +-
src/mito2/src/engine/catchup_test.rs | 13 +-
src/mito2/src/engine/set_role_state_test.rs | 19 +--
src/mito2/src/worker.rs | 8 +-
src/store-api/src/region_engine.rs | 81 ++++++++++--
src/store-api/src/region_request.rs | 4 +
18 files changed, 244 insertions(+), 154 deletions(-)
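In short: `DowngradeRegion` loses its `reject_write` flag because a downgrading region now always rejects writes (it still serves reads and flush requests, per the new handler comment). A sketch of the instruction after the change; `region_id` is reduced to a bare integer here and the serde/`RegionId` details are left out.

    use std::time::Duration;

    // Simplified view of the instruction after this patch: no `reject_write` any more.
    #[derive(Debug)]
    struct DowngradeRegion {
        region_id: u64,
        // `None` means: don't flush before downgrading.
        flush_timeout: Option<Duration>,
    }

    fn main() {
        // Callers only say what to downgrade and how long to wait for the flush;
        // rejecting writes during the downgrade is no longer optional.
        let instruction = DowngradeRegion {
            region_id: 1,
            flush_timeout: Some(Duration::from_secs(1)),
        };
        println!("{instruction:?}");
    }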
diff --git a/src/common/function/src/admin/migrate_region.rs b/src/common/function/src/admin/migrate_region.rs
index 0a487973d3..b1f79c0c07 100644
--- a/src/common/function/src/admin/migrate_region.rs
+++ b/src/common/function/src/admin/migrate_region.rs
@@ -25,12 +25,13 @@ use session::context::QueryContextRef;
use crate::handlers::ProcedureServiceHandlerRef;
use crate::helper::cast_u64;
-const DEFAULT_TIMEOUT_SECS: u64 = 30;
+/// The default timeout for migrate region procedure.
+const DEFAULT_TIMEOUT_SECS: u64 = 300;
/// A function to migrate a region from source peer to target peer.
/// Returns the submitted procedure id if success. Only available in cluster mode.
///
-/// - `migrate_region(region_id, from_peer, to_peer)`, with timeout(30 seconds).
+/// - `migrate_region(region_id, from_peer, to_peer)`, with timeout(300 seconds).
/// - `migrate_region(region_id, from_peer, to_peer, timeout(secs))`.
///
/// The parameters:
diff --git a/src/common/meta/src/instruction.rs b/src/common/meta/src/instruction.rs
index afdc14dff0..5e00437332 100644
--- a/src/common/meta/src/instruction.rs
+++ b/src/common/meta/src/instruction.rs
@@ -57,6 +57,8 @@ impl Display for RegionIdent {
pub struct DowngradeRegionReply {
/// Returns the `last_entry_id` if available.
pub last_entry_id: Option,
+ /// Returns the `metadata_last_entry_id` if available (Only available for metric engine).
+ pub metadata_last_entry_id: Option,
/// Indicates whether the region exists.
pub exists: bool,
/// Return error if any during the operation.
@@ -136,16 +138,14 @@ pub struct DowngradeRegion {
/// `None` stands for don't flush before downgrading the region.
#[serde(default)]
pub flush_timeout: Option,
- /// Rejects all write requests after flushing.
- pub reject_write: bool,
}
impl Display for DowngradeRegion {
fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
write!(
f,
- "DowngradeRegion(region_id={}, flush_timeout={:?}, rejct_write={})",
- self.region_id, self.flush_timeout, self.reject_write
+ "DowngradeRegion(region_id={}, flush_timeout={:?})",
+ self.region_id, self.flush_timeout,
)
}
}
@@ -157,6 +157,8 @@ pub struct UpgradeRegion {
pub region_id: RegionId,
/// The `last_entry_id` of old leader region.
pub last_entry_id: Option,
+ /// The `last_entry_id` of old leader metadata region (Only used for metric engine).
+ pub metadata_last_entry_id: Option,
/// The timeout of waiting for a wal replay.
///
/// `None` stands for no wait,
diff --git a/src/datanode/src/heartbeat/handler.rs b/src/datanode/src/heartbeat/handler.rs
index bf93d15128..17950847ed 100644
--- a/src/datanode/src/heartbeat/handler.rs
+++ b/src/datanode/src/heartbeat/handler.rs
@@ -220,7 +220,6 @@ mod tests {
let instruction = Instruction::DowngradeRegion(DowngradeRegion {
region_id: RegionId::new(2048, 1),
flush_timeout: Some(Duration::from_secs(1)),
- reject_write: false,
});
assert!(heartbeat_handler
.is_acceptable(&heartbeat_env.create_handler_ctx((meta.clone(), instruction))));
@@ -229,6 +228,7 @@ mod tests {
let instruction = Instruction::UpgradeRegion(UpgradeRegion {
region_id,
last_entry_id: None,
+ metadata_last_entry_id: None,
replay_timeout: None,
location_id: None,
});
@@ -419,7 +419,6 @@ mod tests {
let instruction = Instruction::DowngradeRegion(DowngradeRegion {
region_id,
flush_timeout: Some(Duration::from_secs(1)),
- reject_write: false,
});
let mut ctx = heartbeat_env.create_handler_ctx((meta, instruction));
@@ -442,7 +441,6 @@ mod tests {
let instruction = Instruction::DowngradeRegion(DowngradeRegion {
region_id: RegionId::new(2048, 1),
flush_timeout: Some(Duration::from_secs(1)),
- reject_write: false,
});
let mut ctx = heartbeat_env.create_handler_ctx((meta, instruction));
let control = heartbeat_handler.handle(&mut ctx).await.unwrap();
diff --git a/src/datanode/src/heartbeat/handler/downgrade_region.rs b/src/datanode/src/heartbeat/handler/downgrade_region.rs
index 216a460921..d82e4e065b 100644
--- a/src/datanode/src/heartbeat/handler/downgrade_region.rs
+++ b/src/datanode/src/heartbeat/handler/downgrade_region.rs
@@ -14,7 +14,7 @@
use common_meta::instruction::{DowngradeRegion, DowngradeRegionReply, InstructionReply};
use common_telemetry::tracing::info;
-use common_telemetry::warn;
+use common_telemetry::{error, warn};
use futures_util::future::BoxFuture;
use store_api::region_engine::{SetRegionRoleStateResponse, SettableRegionRoleState};
use store_api::region_request::{RegionFlushRequest, RegionRequest};
@@ -33,25 +33,32 @@ impl HandlerContext {
.set_region_role_state_gracefully(region_id, SettableRegionRoleState::Follower)
.await
{
- Ok(SetRegionRoleStateResponse::Success { last_entry_id }) => {
+ Ok(SetRegionRoleStateResponse::Success(success)) => {
Some(InstructionReply::DowngradeRegion(DowngradeRegionReply {
- last_entry_id,
+ last_entry_id: success.last_entry_id(),
+ metadata_last_entry_id: success.metadata_last_entry_id(),
exists: true,
error: None,
}))
}
Ok(SetRegionRoleStateResponse::NotFound) => {
+ warn!("Region: {region_id} is not found");
Some(InstructionReply::DowngradeRegion(DowngradeRegionReply {
last_entry_id: None,
+ metadata_last_entry_id: None,
exists: false,
error: None,
}))
}
- Err(err) => Some(InstructionReply::DowngradeRegion(DowngradeRegionReply {
- last_entry_id: None,
- exists: true,
- error: Some(format!("{err:?}")),
- })),
+ Err(err) => {
+ error!(err; "Failed to convert region to {}", SettableRegionRoleState::Follower);
+ Some(InstructionReply::DowngradeRegion(DowngradeRegionReply {
+ last_entry_id: None,
+ metadata_last_entry_id: None,
+ exists: true,
+ error: Some(format!("{err:?}")),
+ }))
+ }
}
}
@@ -60,7 +67,6 @@ impl HandlerContext {
DowngradeRegion {
region_id,
flush_timeout,
- reject_write,
}: DowngradeRegion,
) -> BoxFuture<'static, Option<InstructionReply>> {
Box::pin(async move {
@@ -68,6 +74,7 @@ impl HandlerContext {
warn!("Region: {region_id} is not found");
return Some(InstructionReply::DowngradeRegion(DowngradeRegionReply {
last_entry_id: None,
+ metadata_last_entry_id: None,
exists: false,
error: None,
}));
@@ -89,33 +96,35 @@ impl HandlerContext {
return self.downgrade_to_follower_gracefully(region_id).await;
};
- if reject_write {
- // Sets region to downgrading, the downgrading region will reject all write requests.
- match self
- .region_server
- .set_region_role_state_gracefully(
- region_id,
- SettableRegionRoleState::DowngradingLeader,
- )
- .await
- {
- Ok(SetRegionRoleStateResponse::Success { .. }) => {}
- Ok(SetRegionRoleStateResponse::NotFound) => {
- warn!("Region: {region_id} is not found");
- return Some(InstructionReply::DowngradeRegion(DowngradeRegionReply {
- last_entry_id: None,
- exists: false,
- error: None,
- }));
- }
- Err(err) => {
- warn!(err; "Failed to convert region to downgrading leader");
- return Some(InstructionReply::DowngradeRegion(DowngradeRegionReply {
- last_entry_id: None,
- exists: true,
- error: Some(format!("{err:?}")),
- }));
- }
+ // Sets region to downgrading,
+ // the downgrading region will reject all write requests.
+ // However, the downgrading region will still accept read, flush requests.
+ match self
+ .region_server
+ .set_region_role_state_gracefully(
+ region_id,
+ SettableRegionRoleState::DowngradingLeader,
+ )
+ .await
+ {
+ Ok(SetRegionRoleStateResponse::Success { .. }) => {}
+ Ok(SetRegionRoleStateResponse::NotFound) => {
+ warn!("Region: {region_id} is not found");
+ return Some(InstructionReply::DowngradeRegion(DowngradeRegionReply {
+ last_entry_id: None,
+ metadata_last_entry_id: None,
+ exists: false,
+ error: None,
+ }));
+ }
+ Err(err) => {
+ error!(err; "Failed to convert region to downgrading leader");
+ return Some(InstructionReply::DowngradeRegion(DowngradeRegionReply {
+ last_entry_id: None,
+ metadata_last_entry_id: None,
+ exists: true,
+ error: Some(format!("{err:?}")),
+ }));
}
}
@@ -144,20 +153,25 @@ impl HandlerContext {
}
let mut watcher = register_result.into_watcher();
- let result = self.catchup_tasks.wait(&mut watcher, flush_timeout).await;
+ let result = self.downgrade_tasks.wait(&mut watcher, flush_timeout).await;
match result {
WaitResult::Timeout => {
Some(InstructionReply::DowngradeRegion(DowngradeRegionReply {
last_entry_id: None,
+ metadata_last_entry_id: None,
exists: true,
- error: Some(format!("Flush region: {region_id} is timeout")),
+ error: Some(format!(
+ "Flush region timeout, region: {region_id}, timeout: {:?}",
+ flush_timeout
+ )),
}))
}
WaitResult::Finish(Ok(_)) => self.downgrade_to_follower_gracefully(region_id).await,
WaitResult::Finish(Err(err)) => {
Some(InstructionReply::DowngradeRegion(DowngradeRegionReply {
last_entry_id: None,
+ metadata_last_entry_id: None,
exists: true,
error: Some(format!("{err:?}")),
}))
@@ -174,7 +188,9 @@ mod tests {
use common_meta::instruction::{DowngradeRegion, InstructionReply};
use mito2::engine::MITO_ENGINE_NAME;
- use store_api::region_engine::{RegionRole, SetRegionRoleStateResponse};
+ use store_api::region_engine::{
+ RegionRole, SetRegionRoleStateResponse, SetRegionRoleStateSuccess,
+ };
use store_api::region_request::RegionRequest;
use store_api::storage::RegionId;
use tokio::time::Instant;
@@ -198,7 +214,6 @@ mod tests {
.handle_downgrade_region_instruction(DowngradeRegion {
region_id,
flush_timeout,
- reject_write: false,
})
.await;
assert_matches!(reply, Some(InstructionReply::DowngradeRegion(_)));
@@ -227,7 +242,9 @@ mod tests {
Ok(0)
}));
region_engine.handle_set_readonly_gracefully_mock_fn = Some(Box::new(|_| {
- Ok(SetRegionRoleStateResponse::success(Some(1024)))
+ Ok(SetRegionRoleStateResponse::success(
+ SetRegionRoleStateSuccess::mito(1024),
+ ))
}))
});
mock_region_server.register_test_region(region_id, mock_engine);
@@ -240,7 +257,6 @@ mod tests {
.handle_downgrade_region_instruction(DowngradeRegion {
region_id,
flush_timeout,
- reject_write: false,
})
.await;
assert_matches!(reply, Some(InstructionReply::DowngradeRegion(_)));
@@ -262,7 +278,9 @@ mod tests {
region_engine.mock_role = Some(Some(RegionRole::Leader));
region_engine.handle_request_delay = Some(Duration::from_secs(100));
region_engine.handle_set_readonly_gracefully_mock_fn = Some(Box::new(|_| {
- Ok(SetRegionRoleStateResponse::success(Some(1024)))
+ Ok(SetRegionRoleStateResponse::success(
+ SetRegionRoleStateSuccess::mito(1024),
+ ))
}))
});
mock_region_server.register_test_region(region_id, mock_engine);
@@ -274,7 +292,6 @@ mod tests {
.handle_downgrade_region_instruction(DowngradeRegion {
region_id,
flush_timeout: Some(flush_timeout),
- reject_write: false,
})
.await;
assert_matches!(reply, Some(InstructionReply::DowngradeRegion(_)));
@@ -295,7 +312,9 @@ mod tests {
region_engine.mock_role = Some(Some(RegionRole::Leader));
region_engine.handle_request_delay = Some(Duration::from_millis(300));
region_engine.handle_set_readonly_gracefully_mock_fn = Some(Box::new(|_| {
- Ok(SetRegionRoleStateResponse::success(Some(1024)))
+ Ok(SetRegionRoleStateResponse::success(
+ SetRegionRoleStateSuccess::mito(1024),
+ ))
}))
});
mock_region_server.register_test_region(region_id, mock_engine);
@@ -312,7 +331,6 @@ mod tests {
.handle_downgrade_region_instruction(DowngradeRegion {
region_id,
flush_timeout,
- reject_write: false,
})
.await;
assert_matches!(reply, Some(InstructionReply::DowngradeRegion(_)));
@@ -327,7 +345,6 @@ mod tests {
.handle_downgrade_region_instruction(DowngradeRegion {
region_id,
flush_timeout: Some(Duration::from_millis(500)),
- reject_write: false,
})
.await;
assert_matches!(reply, Some(InstructionReply::DowngradeRegion(_)));
@@ -356,7 +373,9 @@ mod tests {
.fail()
}));
region_engine.handle_set_readonly_gracefully_mock_fn = Some(Box::new(|_| {
- Ok(SetRegionRoleStateResponse::success(Some(1024)))
+ Ok(SetRegionRoleStateResponse::success(
+ SetRegionRoleStateSuccess::mito(1024),
+ ))
}))
});
mock_region_server.register_test_region(region_id, mock_engine);
@@ -373,7 +392,6 @@ mod tests {
.handle_downgrade_region_instruction(DowngradeRegion {
region_id,
flush_timeout,
- reject_write: false,
})
.await;
assert_matches!(reply, Some(InstructionReply::DowngradeRegion(_)));
@@ -388,7 +406,6 @@ mod tests {
.handle_downgrade_region_instruction(DowngradeRegion {
region_id,
flush_timeout: Some(Duration::from_millis(500)),
- reject_write: false,
})
.await;
assert_matches!(reply, Some(InstructionReply::DowngradeRegion(_)));
@@ -419,7 +436,6 @@ mod tests {
.handle_downgrade_region_instruction(DowngradeRegion {
region_id,
flush_timeout: None,
- reject_write: false,
})
.await;
assert_matches!(reply, Some(InstructionReply::DowngradeRegion(_)));
@@ -451,7 +467,6 @@ mod tests {
.handle_downgrade_region_instruction(DowngradeRegion {
region_id,
flush_timeout: None,
- reject_write: false,
})
.await;
assert_matches!(reply, Some(InstructionReply::DowngradeRegion(_)));
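The reply-building pattern used throughout this handler fits in one small helper; the sketch below is illustrative only (the function name is made up) and assumes the `common_meta` and `store_api` types introduced by this patch:

use common_meta::instruction::{DowngradeRegionReply, InstructionReply};
use store_api::region_engine::SetRegionRoleStateSuccess;

// Builds the reply for a region that was successfully downgraded to follower.
// `last_entry_id()` is `Some` for mito and metric regions and `None` for file
// regions; only the metric engine reports a metadata entry id.
fn reply_from_success(success: SetRegionRoleStateSuccess) -> InstructionReply {
    InstructionReply::DowngradeRegion(DowngradeRegionReply {
        last_entry_id: success.last_entry_id(),
        metadata_last_entry_id: success.metadata_last_entry_id(),
        exists: true,
        error: None,
    })
}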
diff --git a/src/datanode/src/heartbeat/handler/upgrade_region.rs b/src/datanode/src/heartbeat/handler/upgrade_region.rs
index a23ae71a3d..b9a324c197 100644
--- a/src/datanode/src/heartbeat/handler/upgrade_region.rs
+++ b/src/datanode/src/heartbeat/handler/upgrade_region.rs
@@ -26,6 +26,7 @@ impl HandlerContext {
UpgradeRegion {
region_id,
last_entry_id,
+ metadata_last_entry_id,
replay_timeout,
location_id,
}: UpgradeRegion,
@@ -63,6 +64,7 @@ impl HandlerContext {
RegionRequest::Catchup(RegionCatchupRequest {
set_writable: true,
entry_id: last_entry_id,
+ metadata_entry_id: metadata_last_entry_id,
location_id,
}),
)
@@ -147,6 +149,7 @@ mod tests {
.handle_upgrade_region_instruction(UpgradeRegion {
region_id,
last_entry_id: None,
+ metadata_last_entry_id: None,
replay_timeout,
location_id: None,
})
@@ -185,6 +188,7 @@ mod tests {
.handle_upgrade_region_instruction(UpgradeRegion {
region_id,
last_entry_id: None,
+ metadata_last_entry_id: None,
replay_timeout,
location_id: None,
})
@@ -224,6 +228,7 @@ mod tests {
.handle_upgrade_region_instruction(UpgradeRegion {
region_id,
last_entry_id: None,
+ metadata_last_entry_id: None,
replay_timeout,
location_id: None,
})
@@ -267,6 +272,7 @@ mod tests {
region_id,
replay_timeout,
last_entry_id: None,
+ metadata_last_entry_id: None,
location_id: None,
})
.await;
@@ -284,6 +290,7 @@ mod tests {
.handle_upgrade_region_instruction(UpgradeRegion {
region_id,
last_entry_id: None,
+ metadata_last_entry_id: None,
replay_timeout: Some(Duration::from_millis(500)),
location_id: None,
})
@@ -326,6 +333,7 @@ mod tests {
.handle_upgrade_region_instruction(UpgradeRegion {
region_id,
last_entry_id: None,
+ metadata_last_entry_id: None,
replay_timeout: None,
location_id: None,
})
@@ -344,6 +352,7 @@ mod tests {
.handle_upgrade_region_instruction(UpgradeRegion {
region_id,
last_entry_id: None,
+ metadata_last_entry_id: None,
replay_timeout: Some(Duration::from_millis(200)),
location_id: None,
})
diff --git a/src/file-engine/src/engine.rs b/src/file-engine/src/engine.rs
index 9e9d1aa405..cf5e5c7576 100644
--- a/src/file-engine/src/engine.rs
+++ b/src/file-engine/src/engine.rs
@@ -27,7 +27,8 @@ use snafu::{ensure, OptionExt};
use store_api::metadata::RegionMetadataRef;
use store_api::region_engine::{
RegionEngine, RegionManifestInfo, RegionRole, RegionScannerRef, RegionStatistic,
- SetRegionRoleStateResponse, SettableRegionRoleState, SinglePartitionScanner,
+ SetRegionRoleStateResponse, SetRegionRoleStateSuccess, SettableRegionRoleState,
+ SinglePartitionScanner,
};
use store_api::region_request::{
AffectedRows, RegionCloseRequest, RegionCreateRequest, RegionDropRequest, RegionOpenRequest,
@@ -132,7 +133,9 @@ impl RegionEngine for FileRegionEngine {
let exists = self.inner.get_region(region_id).await.is_some();
if exists {
- Ok(SetRegionRoleStateResponse::success(None))
+ Ok(SetRegionRoleStateResponse::success(
+ SetRegionRoleStateSuccess::file(),
+ ))
} else {
Ok(SetRegionRoleStateResponse::NotFound)
}
diff --git a/src/meta-srv/src/procedure/region_migration.rs b/src/meta-srv/src/procedure/region_migration.rs
index f13939195b..a98320f9e7 100644
--- a/src/meta-srv/src/procedure/region_migration.rs
+++ b/src/meta-srv/src/procedure/region_migration.rs
@@ -127,6 +127,8 @@ pub struct VolatileContext {
leader_region_lease_deadline: Option,
/// The last_entry_id of leader region.
leader_region_last_entry_id: Option<u64>,
+ /// The last_entry_id of leader metadata region (Only used for metric engine).
+ leader_region_metadata_last_entry_id: Option<u64>,
/// Elapsed time of downgrading region and upgrading region.
operations_elapsed: Duration,
}
@@ -148,6 +150,11 @@ impl VolatileContext {
pub fn set_last_entry_id(&mut self, last_entry_id: u64) {
self.leader_region_last_entry_id = Some(last_entry_id)
}
+
+ /// Sets the `leader_region_metadata_last_entry_id`.
+ pub fn set_metadata_last_entry_id(&mut self, last_entry_id: u64) {
+ self.leader_region_metadata_last_entry_id = Some(last_entry_id);
+ }
}
/// Used to generate new [Context].
diff --git a/src/meta-srv/src/procedure/region_migration/close_downgraded_region.rs b/src/meta-srv/src/procedure/region_migration/close_downgraded_region.rs
index 4e09e421d0..94256ba5ec 100644
--- a/src/meta-srv/src/procedure/region_migration/close_downgraded_region.rs
+++ b/src/meta-srv/src/procedure/region_migration/close_downgraded_region.rs
@@ -16,7 +16,7 @@ use std::any::Any;
use std::time::Duration;
use api::v1::meta::MailboxMessage;
-use common_meta::distributed_time_constants::MAILBOX_RTT_SECS;
+use common_meta::distributed_time_constants::REGION_LEASE_SECS;
use common_meta::instruction::{Instruction, InstructionReply, SimpleReply};
use common_meta::key::datanode_table::RegionInfo;
use common_meta::RegionIdent;
@@ -31,7 +31,8 @@ use crate::procedure::region_migration::migration_end::RegionMigrationEnd;
use crate::procedure::region_migration::{Context, State};
use crate::service::mailbox::Channel;
-const CLOSE_DOWNGRADED_REGION_TIMEOUT: Duration = Duration::from_secs(MAILBOX_RTT_SECS);
+/// Uses lease time of a region as the timeout of closing a downgraded region.
+const CLOSE_DOWNGRADED_REGION_TIMEOUT: Duration = Duration::from_secs(REGION_LEASE_SECS);
#[derive(Debug, Serialize, Deserialize)]
pub struct CloseDowngradedRegion;
diff --git a/src/meta-srv/src/procedure/region_migration/downgrade_leader_region.rs b/src/meta-srv/src/procedure/region_migration/downgrade_leader_region.rs
index 41d437d466..02b7216fe7 100644
--- a/src/meta-srv/src/procedure/region_migration/downgrade_leader_region.rs
+++ b/src/meta-srv/src/procedure/region_migration/downgrade_leader_region.rs
@@ -22,10 +22,8 @@ use common_meta::instruction::{
};
use common_procedure::Status;
use common_telemetry::{error, info, warn};
-use common_wal::options::WalOptions;
use serde::{Deserialize, Serialize};
use snafu::{OptionExt, ResultExt};
-use store_api::storage::RegionId;
use tokio::time::{sleep, Instant};
use crate::error::{self, Result};
@@ -97,32 +95,15 @@ impl DowngradeLeaderRegion {
&self,
ctx: &Context,
flush_timeout: Duration,
- reject_write: bool,
) -> Instruction {
let pc = &ctx.persistent_ctx;
let region_id = pc.region_id;
Instruction::DowngradeRegion(DowngradeRegion {
region_id,
flush_timeout: Some(flush_timeout),
- reject_write,
})
}
- async fn should_reject_write(ctx: &mut Context, region_id: RegionId) -> Result<bool> {
- let datanode_table_value = ctx.get_from_peer_datanode_table_value().await?;
- if let Some(wal_option) = datanode_table_value
- .region_info
- .region_wal_options
- .get(®ion_id.region_number())
- {
- let options: WalOptions = serde_json::from_str(wal_option)
- .with_context(|_| error::DeserializeFromJsonSnafu { input: wal_option })?;
- return Ok(matches!(options, WalOptions::RaftEngine));
- }
-
- Ok(true)
- }
-
/// Tries to downgrade a leader region.
///
/// Retry:
@@ -143,9 +124,7 @@ impl DowngradeLeaderRegion {
.context(error::ExceededDeadlineSnafu {
operation: "Downgrade region",
})?;
- let reject_write = Self::should_reject_write(ctx, region_id).await?;
- let downgrade_instruction =
- self.build_downgrade_region_instruction(ctx, operation_timeout, reject_write);
+ let downgrade_instruction = self.build_downgrade_region_instruction(ctx, operation_timeout);
let leader = &ctx.persistent_ctx.from_peer;
let msg = MailboxMessage::json_message(
@@ -174,6 +153,7 @@ impl DowngradeLeaderRegion {
);
let InstructionReply::DowngradeRegion(DowngradeRegionReply {
last_entry_id,
+ metadata_last_entry_id,
exists,
error,
}) = reply
@@ -202,9 +182,10 @@ impl DowngradeLeaderRegion {
);
} else {
info!(
- "Region {} leader is downgraded, last_entry_id: {:?}, elapsed: {:?}",
+ "Region {} leader is downgraded, last_entry_id: {:?}, metadata_last_entry_id: {:?}, elapsed: {:?}",
region_id,
last_entry_id,
+ metadata_last_entry_id,
now.elapsed()
);
}
@@ -213,6 +194,11 @@ impl DowngradeLeaderRegion {
ctx.volatile_ctx.set_last_entry_id(last_entry_id);
}
+ if let Some(metadata_last_entry_id) = metadata_last_entry_id {
+ ctx.volatile_ctx
+ .set_metadata_last_entry_id(metadata_last_entry_id);
+ }
+
Ok(())
}
Err(error::Error::MailboxTimeout { .. }) => {
@@ -276,7 +262,6 @@ mod tests {
use common_meta::key::test_utils::new_test_table_info;
use common_meta::peer::Peer;
use common_meta::rpc::router::{Region, RegionRoute};
- use common_wal::options::KafkaWalOptions;
use store_api::storage::RegionId;
use tokio::time::Instant;
@@ -331,41 +316,6 @@ mod tests {
assert!(!err.is_retryable());
}
- #[tokio::test]
- async fn test_should_reject_writes() {
- let persistent_context = new_persistent_context();
- let region_id = persistent_context.region_id;
- let env = TestingEnv::new();
- let mut ctx = env.context_factory().new_context(persistent_context);
- let wal_options =
- HashMap::from([(1, serde_json::to_string(&WalOptions::RaftEngine).unwrap())]);
- prepare_table_metadata(&ctx, wal_options).await;
-
- let reject_write = DowngradeLeaderRegion::should_reject_write(&mut ctx, region_id)
- .await
- .unwrap();
- assert!(reject_write);
-
- // Remote WAL
- let persistent_context = new_persistent_context();
- let region_id = persistent_context.region_id;
- let env = TestingEnv::new();
- let mut ctx = env.context_factory().new_context(persistent_context);
- let wal_options = HashMap::from([(
- 1,
- serde_json::to_string(&WalOptions::Kafka(KafkaWalOptions {
- topic: "my_topic".to_string(),
- }))
- .unwrap(),
- )]);
- prepare_table_metadata(&ctx, wal_options).await;
-
- let reject_write = DowngradeLeaderRegion::should_reject_write(&mut ctx, region_id)
- .await
- .unwrap();
- assert!(!reject_write);
- }
-
#[tokio::test]
async fn test_pusher_dropped() {
let state = DowngradeLeaderRegion::default();
diff --git a/src/meta-srv/src/procedure/region_migration/upgrade_candidate_region.rs b/src/meta-srv/src/procedure/region_migration/upgrade_candidate_region.rs
index 6f42c670dd..552b9d3863 100644
--- a/src/meta-srv/src/procedure/region_migration/upgrade_candidate_region.rs
+++ b/src/meta-srv/src/procedure/region_migration/upgrade_candidate_region.rs
@@ -76,10 +76,12 @@ impl UpgradeCandidateRegion {
let pc = &ctx.persistent_ctx;
let region_id = pc.region_id;
let last_entry_id = ctx.volatile_ctx.leader_region_last_entry_id;
+ let metadata_last_entry_id = ctx.volatile_ctx.leader_region_metadata_last_entry_id;
Instruction::UpgradeRegion(UpgradeRegion {
region_id,
last_entry_id,
+ metadata_last_entry_id,
replay_timeout: Some(replay_timeout),
location_id: Some(ctx.persistent_ctx.from_peer.id),
})
diff --git a/src/meta-srv/src/procedure/test_util.rs b/src/meta-srv/src/procedure/test_util.rs
index 61254b133b..34ce23abd4 100644
--- a/src/meta-srv/src/procedure/test_util.rs
+++ b/src/meta-srv/src/procedure/test_util.rs
@@ -135,6 +135,7 @@ pub fn new_downgrade_region_reply(
payload: Some(Payload::Json(
serde_json::to_string(&InstructionReply::DowngradeRegion(DowngradeRegionReply {
last_entry_id,
+ metadata_last_entry_id: None,
exists: exist,
error,
}))
diff --git a/src/metric-engine/src/engine.rs b/src/metric-engine/src/engine.rs
index d1e837d078..74978fda78 100644
--- a/src/metric-engine/src/engine.rs
+++ b/src/metric-engine/src/engine.rs
@@ -40,7 +40,7 @@ use store_api::metadata::RegionMetadataRef;
use store_api::metric_engine_consts::METRIC_ENGINE_NAME;
use store_api::region_engine::{
RegionEngine, RegionManifestInfo, RegionRole, RegionScannerRef, RegionStatistic,
- SetRegionRoleStateResponse, SettableRegionRoleState,
+ SetRegionRoleStateResponse, SetRegionRoleStateSuccess, SettableRegionRoleState,
};
use store_api::region_request::{BatchRegionDdlRequest, RegionRequest};
use store_api::storage::{RegionId, ScanRequest, SequenceNumber};
@@ -352,17 +352,39 @@ impl RegionEngine for MetricEngine {
region_id: RegionId,
region_role_state: SettableRegionRoleState,
) -> std::result::Result<SetRegionRoleStateResponse, BoxedError> {
- self.inner
+ let metadata_result = match self
+ .inner
.mito
.set_region_role_state_gracefully(
utils::to_metadata_region_id(region_id),
region_role_state,
)
- .await?;
- self.inner
+ .await?
+ {
+ SetRegionRoleStateResponse::Success(success) => success,
+ SetRegionRoleStateResponse::NotFound => {
+ return Ok(SetRegionRoleStateResponse::NotFound)
+ }
+ };
+
+ let data_result = match self
+ .inner
.mito
.set_region_role_state_gracefully(region_id, region_role_state)
- .await
+ .await?
+ {
+ SetRegionRoleStateResponse::Success(success) => success,
+ SetRegionRoleStateResponse::NotFound => {
+ return Ok(SetRegionRoleStateResponse::NotFound)
+ }
+ };
+
+ Ok(SetRegionRoleStateResponse::success(
+ SetRegionRoleStateSuccess::metric(
+ data_result.last_entry_id().unwrap_or_default(),
+ metadata_result.last_entry_id().unwrap_or_default(),
+ ),
+ ))
}
/// Returns the physical region role.
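The new metric-engine path above downgrades the metadata region first and only then the data region, so both replay positions are captured in one response. A small test-style sketch (not part of the patch; the entry ids are hypothetical) of the combined result:

use store_api::region_engine::{SetRegionRoleStateResponse, SetRegionRoleStateSuccess};

#[test]
fn metric_success_carries_both_entry_ids() {
    // Hypothetical entry ids: 1024 for the data region, 8 for the metadata region.
    let resp = SetRegionRoleStateResponse::success(SetRegionRoleStateSuccess::metric(1024, 8));
    let SetRegionRoleStateResponse::Success(success) = resp else {
        unreachable!();
    };
    assert_eq!(success.last_entry_id(), Some(1024));
    assert_eq!(success.metadata_last_entry_id(), Some(8));
}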
diff --git a/src/metric-engine/src/engine/catchup.rs b/src/metric-engine/src/engine/catchup.rs
index 44713f0bc4..d2e92f6e0e 100644
--- a/src/metric-engine/src/engine/catchup.rs
+++ b/src/metric-engine/src/engine/catchup.rs
@@ -56,7 +56,8 @@ impl MetricEngineInner {
metadata_region_id,
RegionRequest::Catchup(RegionCatchupRequest {
set_writable: req.set_writable,
- entry_id: None,
+ entry_id: req.metadata_entry_id,
+ metadata_entry_id: None,
location_id: req.location_id,
}),
)
@@ -70,6 +71,7 @@ impl MetricEngineInner {
RegionRequest::Catchup(RegionCatchupRequest {
set_writable: req.set_writable,
entry_id: req.entry_id,
+ metadata_entry_id: None,
location_id: req.location_id,
}),
)
diff --git a/src/mito2/src/engine/catchup_test.rs b/src/mito2/src/engine/catchup_test.rs
index a9de0d6008..8a24fecf3a 100644
--- a/src/mito2/src/engine/catchup_test.rs
+++ b/src/mito2/src/engine/catchup_test.rs
@@ -35,8 +35,8 @@ use crate::test_util::{
use crate::wal::EntryId;
fn get_last_entry_id(resp: SetRegionRoleStateResponse) -> Option<EntryId> {
- if let SetRegionRoleStateResponse::Success { last_entry_id } = resp {
- last_entry_id
+ if let SetRegionRoleStateResponse::Success(success) = resp {
+ success.last_entry_id()
} else {
unreachable!();
}
@@ -118,6 +118,7 @@ async fn test_catchup_with_last_entry_id(factory: Option) {
RegionRequest::Catchup(RegionCatchupRequest {
set_writable: false,
entry_id: last_entry_id,
+ metadata_entry_id: None,
location_id: None,
}),
)
@@ -150,6 +151,7 @@ async fn test_catchup_with_last_entry_id(factory: Option) {
RegionRequest::Catchup(RegionCatchupRequest {
set_writable: true,
entry_id: last_entry_id,
+ metadata_entry_id: None,
location_id: None,
}),
)
@@ -237,6 +239,7 @@ async fn test_catchup_with_incorrect_last_entry_id(factory: Option) {
RegionRequest::Catchup(RegionCatchupRequest {
set_writable: false,
entry_id: None,
+ metadata_entry_id: None,
location_id: None,
}),
)
@@ -353,6 +358,7 @@ async fn test_catchup_without_last_entry_id(factory: Option) {
RegionRequest::Catchup(RegionCatchupRequest {
set_writable: true,
entry_id: None,
+ metadata_entry_id: None,
location_id: None,
}),
)
@@ -442,6 +448,7 @@ async fn test_catchup_with_manifest_update(factory: Option) {
RegionRequest::Catchup(RegionCatchupRequest {
set_writable: false,
entry_id: None,
+ metadata_entry_id: None,
location_id: None,
}),
)
@@ -479,6 +486,7 @@ async fn test_catchup_with_manifest_update(factory: Option) {
RegionRequest::Catchup(RegionCatchupRequest {
set_writable: true,
entry_id: None,
+ metadata_entry_id: None,
location_id: None,
}),
)
@@ -501,6 +509,7 @@ async fn test_catchup_not_exist() {
RegionRequest::Catchup(RegionCatchupRequest {
set_writable: true,
entry_id: None,
+ metadata_entry_id: None,
location_id: None,
}),
)
diff --git a/src/mito2/src/engine/set_role_state_test.rs b/src/mito2/src/engine/set_role_state_test.rs
index 2a4cb9f9ca..93dbc69407 100644
--- a/src/mito2/src/engine/set_role_state_test.rs
+++ b/src/mito2/src/engine/set_role_state_test.rs
@@ -16,7 +16,8 @@ use api::v1::Rows;
use common_error::ext::ErrorExt;
use common_error::status_code::StatusCode;
use store_api::region_engine::{
- RegionEngine, RegionRole, SetRegionRoleStateResponse, SettableRegionRoleState,
+ RegionEngine, RegionRole, SetRegionRoleStateResponse, SetRegionRoleStateSuccess,
+ SettableRegionRoleState,
};
use store_api::region_request::{RegionPutRequest, RegionRequest};
use store_api::storage::RegionId;
@@ -48,9 +49,7 @@ async fn test_set_role_state_gracefully() {
.await
.unwrap();
assert_eq!(
- SetRegionRoleStateResponse::Success {
- last_entry_id: Some(0)
- },
+ SetRegionRoleStateResponse::success(SetRegionRoleStateSuccess::mito(0)),
result
);
@@ -60,9 +59,7 @@ async fn test_set_role_state_gracefully() {
.await
.unwrap();
assert_eq!(
- SetRegionRoleStateResponse::Success {
- last_entry_id: Some(0)
- },
+ SetRegionRoleStateResponse::success(SetRegionRoleStateSuccess::mito(0)),
result
);
@@ -96,9 +93,7 @@ async fn test_set_role_state_gracefully() {
.unwrap();
assert_eq!(
- SetRegionRoleStateResponse::Success {
- last_entry_id: Some(1)
- },
+ SetRegionRoleStateResponse::success(SetRegionRoleStateSuccess::mito(1)),
result
);
}
@@ -144,9 +139,7 @@ async fn test_write_downgrading_region() {
.await
.unwrap();
assert_eq!(
- SetRegionRoleStateResponse::Success {
- last_entry_id: Some(1)
- },
+ SetRegionRoleStateResponse::success(SetRegionRoleStateSuccess::mito(1)),
result
);
diff --git a/src/mito2/src/worker.rs b/src/mito2/src/worker.rs
index 0eb5abff41..7aee65b651 100644
--- a/src/mito2/src/worker.rs
+++ b/src/mito2/src/worker.rs
@@ -41,7 +41,9 @@ use prometheus::IntGauge;
use rand::{rng, Rng};
use snafu::{ensure, ResultExt};
use store_api::logstore::LogStore;
-use store_api::region_engine::{SetRegionRoleStateResponse, SettableRegionRoleState};
+use store_api::region_engine::{
+ SetRegionRoleStateResponse, SetRegionRoleStateSuccess, SettableRegionRoleState,
+};
use store_api::storage::RegionId;
use tokio::sync::mpsc::{Receiver, Sender};
use tokio::sync::{mpsc, oneshot, watch, Mutex};
@@ -931,7 +933,9 @@ impl RegionWorkerLoop {
region.set_role_state_gracefully(region_role_state).await;
let last_entry_id = region.version_control.current().last_entry_id;
- let _ = sender.send(SetRegionRoleStateResponse::success(Some(last_entry_id)));
+ let _ = sender.send(SetRegionRoleStateResponse::success(
+ SetRegionRoleStateSuccess::mito(last_entry_id),
+ ));
});
} else {
let _ = sender.send(SetRegionRoleStateResponse::NotFound);
diff --git a/src/store-api/src/region_engine.rs b/src/store-api/src/region_engine.rs
index d4df5216f4..0a38700f1d 100644
--- a/src/store-api/src/region_engine.rs
+++ b/src/store-api/src/region_engine.rs
@@ -47,6 +47,15 @@ pub enum SettableRegionRoleState {
DowngradingLeader,
}
+impl Display for SettableRegionRoleState {
+ fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+ match self {
+ SettableRegionRoleState::Follower => write!(f, "Follower"),
+ SettableRegionRoleState::DowngradingLeader => write!(f, "Leader(Downgrading)"),
+ }
+ }
+}
+
impl From<SettableRegionRoleState> for RegionRole {
fn from(value: SettableRegionRoleState) -> Self {
match value {
@@ -63,20 +72,78 @@ pub struct SetRegionRoleStateRequest {
region_role_state: SettableRegionRoleState,
}
+/// The success response of setting region role state.
+#[derive(Debug, PartialEq, Eq)]
+pub enum SetRegionRoleStateSuccess {
+ File,
+ Mito {
+ last_entry_id: entry::Id,
+ },
+ Metric {
+ last_entry_id: entry::Id,
+ metadata_last_entry_id: entry::Id,
+ },
+}
+
+impl SetRegionRoleStateSuccess {
+ /// Returns a [SetRegionRoleStateSuccess::File].
+ pub fn file() -> Self {
+ Self::File
+ }
+
+ /// Returns a [SetRegionRoleStateSuccess::Mito] with the `last_entry_id`.
+ pub fn mito(last_entry_id: entry::Id) -> Self {
+ SetRegionRoleStateSuccess::Mito { last_entry_id }
+ }
+
+ /// Returns a [SetRegionRoleStateSuccess::Metric] with the `last_entry_id` and `metadata_last_entry_id`.
+ pub fn metric(last_entry_id: entry::Id, metadata_last_entry_id: entry::Id) -> Self {
+ SetRegionRoleStateSuccess::Metric {
+ last_entry_id,
+ metadata_last_entry_id,
+ }
+ }
+}
+
+impl SetRegionRoleStateSuccess {
+ /// Returns the last entry id of the region.
+ pub fn last_entry_id(&self) -> Option<entry::Id> {
+ match self {
+ SetRegionRoleStateSuccess::File => None,
+ SetRegionRoleStateSuccess::Mito { last_entry_id } => Some(*last_entry_id),
+ SetRegionRoleStateSuccess::Metric { last_entry_id, .. } => Some(*last_entry_id),
+ }
+ }
+
+ /// Returns the last entry id of the metadata of the region.
+ pub fn metadata_last_entry_id(&self) -> Option<entry::Id> {
+ match self {
+ SetRegionRoleStateSuccess::File => None,
+ SetRegionRoleStateSuccess::Mito { .. } => None,
+ SetRegionRoleStateSuccess::Metric {
+ metadata_last_entry_id,
+ ..
+ } => Some(*metadata_last_entry_id),
+ }
+ }
+}
+
/// The response of setting region role state.
#[derive(Debug, PartialEq, Eq)]
pub enum SetRegionRoleStateResponse {
- Success {
- /// Returns `last_entry_id` of the region if available(e.g., It's not available in file engine).
- last_entry_id: Option<entry::Id>,
- },
+ Success(SetRegionRoleStateSuccess),
NotFound,
}
impl SetRegionRoleStateResponse {
- /// Returns a [SetRegionRoleStateResponse::Success] with the `last_entry_id`.
- pub fn success(last_entry_id: Option<entry::Id>) -> Self {
- Self::Success { last_entry_id }
+ /// Returns a [SetRegionRoleStateResponse::Success] wrapping the given `success`.
+ pub fn success(success: SetRegionRoleStateSuccess) -> Self {
+ Self::Success(success)
+ }
+
+ /// Returns true if the response is a [SetRegionRoleStateResponse::NotFound].
+ pub fn is_not_found(&self) -> bool {
+ matches!(self, SetRegionRoleStateResponse::NotFound)
}
}
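To make the intended usage of the new API concrete, here is a short illustrative snippet (not part of the patch) exercising the `File` and `Mito` variants and the `is_not_found` helper:

use store_api::region_engine::{SetRegionRoleStateResponse, SetRegionRoleStateSuccess};

fn demo() {
    // File regions have no WAL, so neither entry id is available.
    let file = SetRegionRoleStateSuccess::file();
    assert_eq!(file.last_entry_id(), None);
    assert_eq!(file.metadata_last_entry_id(), None);

    // Mito regions report only the data entry id.
    let mito = SetRegionRoleStateSuccess::mito(1024);
    assert_eq!(mito.last_entry_id(), Some(1024));
    assert_eq!(mito.metadata_last_entry_id(), None);

    // `is_not_found` replaces ad-hoc pattern matches at call sites.
    assert!(SetRegionRoleStateResponse::NotFound.is_not_found());
    assert!(!SetRegionRoleStateResponse::success(mito).is_not_found());
}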
diff --git a/src/store-api/src/region_request.rs b/src/store-api/src/region_request.rs
index d11963987e..638f2ee114 100644
--- a/src/store-api/src/region_request.rs
+++ b/src/store-api/src/region_request.rs
@@ -1105,6 +1105,10 @@ pub struct RegionCatchupRequest {
/// The `entry_id` that was expected to reply to.
/// `None` stands replaying to latest.
pub entry_id: Option<entry::Id>,
+ /// Used for metrics metadata region.
+ /// The `entry_id` that was expected to reply to.
+ /// `None` stands replaying to latest.
+ pub metadata_entry_id: Option<entry::Id>,
/// The hint for replaying memtable.
pub location_id: Option,
}
From 5b0c75c85f7847e689860cda60132578c9304fc1 Mon Sep 17 00:00:00 2001
From: liyang
Date: Mon, 14 Apr 2025 09:22:40 +0800
Subject: [PATCH 10/82] ci: not push latest image when schedule release (#5883)
* ci: delete the scheduled release
* do not push the latest image on scheduled releases
* check ref type and name
* check not schedule
---
.github/scripts/create-version.sh | 12 ++++++------
.github/workflows/release.yml | 4 ++--
2 files changed, 8 insertions(+), 8 deletions(-)
diff --git a/.github/scripts/create-version.sh b/.github/scripts/create-version.sh
index e87c74cffb..1de37df190 100755
--- a/.github/scripts/create-version.sh
+++ b/.github/scripts/create-version.sh
@@ -25,7 +25,7 @@ function create_version() {
fi
# Reuse $NEXT_RELEASE_VERSION to identify whether it's a nightly build.
- # It will be like 'nigtly-20230808-7d0d8dc6'.
+ # It will be like 'nightly-20230808-7d0d8dc6'.
if [ "$NEXT_RELEASE_VERSION" = nightly ]; then
echo "$NIGHTLY_RELEASE_PREFIX-$(date "+%Y%m%d")-$(git rev-parse --short HEAD)"
exit 0
@@ -60,9 +60,9 @@ function create_version() {
}
# You can run as following examples:
-# GITHUB_EVENT_NAME=push NEXT_RELEASE_VERSION=v0.4.0 NIGHTLY_RELEASE_PREFIX=nigtly GITHUB_REF_NAME=v0.3.0 ./create-version.sh
-# GITHUB_EVENT_NAME=workflow_dispatch NEXT_RELEASE_VERSION=v0.4.0 NIGHTLY_RELEASE_PREFIX=nigtly ./create-version.sh
-# GITHUB_EVENT_NAME=schedule NEXT_RELEASE_VERSION=v0.4.0 NIGHTLY_RELEASE_PREFIX=nigtly ./create-version.sh
-# GITHUB_EVENT_NAME=schedule NEXT_RELEASE_VERSION=nightly NIGHTLY_RELEASE_PREFIX=nigtly ./create-version.sh
-# GITHUB_EVENT_NAME=workflow_dispatch COMMIT_SHA=f0e7216c4bb6acce9b29a21ec2d683be2e3f984a NEXT_RELEASE_VERSION=dev NIGHTLY_RELEASE_PREFIX=nigtly ./create-version.sh
+# GITHUB_EVENT_NAME=push NEXT_RELEASE_VERSION=v0.4.0 NIGHTLY_RELEASE_PREFIX=nightly GITHUB_REF_NAME=v0.3.0 ./create-version.sh
+# GITHUB_EVENT_NAME=workflow_dispatch NEXT_RELEASE_VERSION=v0.4.0 NIGHTLY_RELEASE_PREFIX=nightly ./create-version.sh
+# GITHUB_EVENT_NAME=schedule NEXT_RELEASE_VERSION=v0.4.0 NIGHTLY_RELEASE_PREFIX=nightly ./create-version.sh
+# GITHUB_EVENT_NAME=schedule NEXT_RELEASE_VERSION=nightly NIGHTLY_RELEASE_PREFIX=nightly ./create-version.sh
+# GITHUB_EVENT_NAME=workflow_dispatch COMMIT_SHA=f0e7216c4bb6acce9b29a21ec2d683be2e3f984a NEXT_RELEASE_VERSION=dev NIGHTLY_RELEASE_PREFIX=nightly ./create-version.sh
create_version
diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
index 52b61320be..fe85a6f2c8 100644
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@@ -317,7 +317,7 @@ jobs:
image-registry-username: ${{ secrets.DOCKERHUB_USERNAME }}
image-registry-password: ${{ secrets.DOCKERHUB_TOKEN }}
version: ${{ needs.allocate-runners.outputs.version }}
- push-latest-tag: true
+ push-latest-tag: ${{ github.ref_type == 'tag' && !contains(github.ref_name, 'nightly') && github.event_name != 'schedule' }}
- name: Set build image result
id: set-build-image-result
@@ -364,7 +364,7 @@ jobs:
dev-mode: false
upload-to-s3: true
update-version-info: true
- push-latest-tag: true
+ push-latest-tag: ${{ github.ref_type == 'tag' && !contains(github.ref_name, 'nightly') && github.event_name != 'schedule' }}
publish-github-release:
name: Create GitHub release and upload artifacts
From be837ddc2428cb8ca7a055ffd37b2ad03cc00495 Mon Sep 17 00:00:00 2001
From: Ning Sun
Date: Mon, 14 Apr 2025 11:13:46 +0800
Subject: [PATCH 11/82] test: add tests to ensure nested data structure for
identity pipeline (#5888)
---
tests-integration/tests/http.rs | 40 ++++++++++++++++-----------------
1 file changed, 20 insertions(+), 20 deletions(-)
diff --git a/tests-integration/tests/http.rs b/tests-integration/tests/http.rs
index b4690cf9e1..ffb74e1b16 100644
--- a/tests-integration/tests/http.rs
+++ b/tests-integration/tests/http.rs
@@ -95,9 +95,9 @@ macro_rules! http_tests {
test_pipeline_api,
test_test_pipeline_api,
test_plain_text_ingestion,
- test_identify_pipeline,
- test_identify_pipeline_with_flatten,
- test_identify_pipeline_with_custom_ts,
+ test_identity_pipeline,
+ test_identity_pipeline_with_flatten,
+ test_identity_pipeline_with_custom_ts,
test_pipeline_dispatcher,
test_pipeline_suffix_template,
@@ -1413,15 +1413,15 @@ transform:
guard.remove_all().await;
}
-pub async fn test_identify_pipeline(store_type: StorageType) {
+pub async fn test_identity_pipeline(store_type: StorageType) {
common_telemetry::init_default_ut_logging();
let (app, mut guard) =
- setup_test_http_app_with_frontend(store_type, "test_identify_pipeline").await;
+ setup_test_http_app_with_frontend(store_type, "test_identity_pipeline").await;
// handshake
let client = TestClient::new(app).await;
- let body = r#"{"__time__":1453809242,"__topic__":"","__source__":"10.170.***.***","ip":"10.200.**.***","time":"26/Jan/2016:19:54:02 +0800","url":"POST/PutData?Category=YunOsAccountOpLog&AccessKeyId=&Date=Fri%2C%2028%20Jun%202013%2006%3A53%3A30%20GMT&Topic=raw&Signature=HTTP/1.1","status":"200","user-agent":"aliyun-sdk-java"}
-{"__time__":1453809242,"__topic__":"","__source__":"10.170.***.***","ip":"10.200.**.***","time":"26/Jan/2016:19:54:02 +0800","url":"POST/PutData?Category=YunOsAccountOpLog&AccessKeyId=&Date=Fri%2C%2028%20Jun%202013%2006%3A53%3A30%20GMT&Topic=raw&Signature=HTTP/1.1","status":"200","user-agent":"aliyun-sdk-java","hasagei":"hasagei","dongdongdong":"guaguagua"}"#;
+ let body = r#"{"__time__":1453809242,"__topic__":"","__source__":"10.170.***.***","ip":"10.200.**.***","time":"26/Jan/2016:19:54:02 +0800","url":"POST/PutData?Category=YunOsAccountOpLog&AccessKeyId=&Date=Fri%2C%2028%20Jun%202013%2006%3A53%3A30%20GMT&Topic=raw&Signature=HTTP/1.1","status":"200","user-agent":"aliyun-sdk-java", "json_object": {"a":1,"b":2}, "json_array":[1,2,3]}
+{"__time__":1453809242,"__topic__":"","__source__":"10.170.***.***","ip":"10.200.**.***","time":"26/Jan/2016:19:54:02 +0800","url":"POST/PutData?Category=YunOsAccountOpLog&AccessKeyId=&Date=Fri%2C%2028%20Jun%202013%2006%3A53%3A30%20GMT&Topic=raw&Signature=HTTP/1.1","status":"200","user-agent":"aliyun-sdk-java","hasagei":"hasagei","dongdongdong":"guaguagua", "json_object": {"a":1,"b":2}, "json_array":[1,2,3]}"#;
let res = client
.post("/v1/ingest?db=public&table=logs&pipeline_name=greptime_identity")
.header("Content-Type", "application/json")
@@ -1440,8 +1440,8 @@ pub async fn test_identify_pipeline(store_type: StorageType) {
assert_eq!(res.status(), StatusCode::OK);
- let line1_expected = r#"[null,"10.170.***.***",1453809242,"","10.200.**.***","200","26/Jan/2016:19:54:02 +0800","POST/PutData?Category=YunOsAccountOpLog&AccessKeyId=&Date=Fri%2C%2028%20Jun%202013%2006%3A53%3A30%20GMT&Topic=raw&Signature=HTTP/1.1","aliyun-sdk-java",null,null]"#;
- let line2_expected = r#"[null,"10.170.***.***",1453809242,"","10.200.**.***","200","26/Jan/2016:19:54:02 +0800","POST/PutData?Category=YunOsAccountOpLog&AccessKeyId=&Date=Fri%2C%2028%20Jun%202013%2006%3A53%3A30%20GMT&Topic=raw&Signature=HTTP/1.1","aliyun-sdk-java","guaguagua","hasagei"]"#;
+ let line1_expected = r#"[null,"10.170.***.***",1453809242,"","10.200.**.***",[1,2,3],{"a":1,"b":2},"200","26/Jan/2016:19:54:02 +0800","POST/PutData?Category=YunOsAccountOpLog&AccessKeyId=&Date=Fri%2C%2028%20Jun%202013%2006%3A53%3A30%20GMT&Topic=raw&Signature=HTTP/1.1","aliyun-sdk-java",null,null]"#;
+ let line2_expected = r#"[null,"10.170.***.***",1453809242,"","10.200.**.***",[1,2,3],{"a":1,"b":2},"200","26/Jan/2016:19:54:02 +0800","POST/PutData?Category=YunOsAccountOpLog&AccessKeyId=&Date=Fri%2C%2028%20Jun%202013%2006%3A53%3A30%20GMT&Topic=raw&Signature=HTTP/1.1","aliyun-sdk-java","guaguagua","hasagei"]"#;
let res = client.get("/v1/sql?sql=select * from logs").send().await;
assert_eq!(res.status(), StatusCode::OK);
let resp: serde_json::Value = res.json().await;
@@ -1464,7 +1464,7 @@ pub async fn test_identify_pipeline(store_type: StorageType) {
serde_json::from_str::>(line2_expected).unwrap()
);
- let expected = r#"[["greptime_timestamp","TimestampNanosecond","PRI","NO","","TIMESTAMP"],["__source__","String","","YES","","FIELD"],["__time__","Int64","","YES","","FIELD"],["__topic__","String","","YES","","FIELD"],["ip","String","","YES","","FIELD"],["status","String","","YES","","FIELD"],["time","String","","YES","","FIELD"],["url","String","","YES","","FIELD"],["user-agent","String","","YES","","FIELD"],["dongdongdong","String","","YES","","FIELD"],["hasagei","String","","YES","","FIELD"]]"#;
+ let expected = r#"[["greptime_timestamp","TimestampNanosecond","PRI","NO","","TIMESTAMP"],["__source__","String","","YES","","FIELD"],["__time__","Int64","","YES","","FIELD"],["__topic__","String","","YES","","FIELD"],["ip","String","","YES","","FIELD"],["json_array","Json","","YES","","FIELD"],["json_object","Json","","YES","","FIELD"],["status","String","","YES","","FIELD"],["time","String","","YES","","FIELD"],["url","String","","YES","","FIELD"],["user-agent","String","","YES","","FIELD"],["dongdongdong","String","","YES","","FIELD"],["hasagei","String","","YES","","FIELD"]]"#;
validate_data("identity_schema", &client, "desc logs", expected).await;
guard.remove_all().await;
@@ -1792,10 +1792,10 @@ table_suffix: _${type}
guard.remove_all().await;
}
-pub async fn test_identify_pipeline_with_flatten(store_type: StorageType) {
+pub async fn test_identity_pipeline_with_flatten(store_type: StorageType) {
common_telemetry::init_default_ut_logging();
let (app, mut guard) =
- setup_test_http_app_with_frontend(store_type, "test_identify_pipeline_with_flatten").await;
+ setup_test_http_app_with_frontend(store_type, "test_identity_pipeline_with_flatten").await;
let client = TestClient::new(app).await;
let body = r#"{"__time__":1453809242,"__topic__":"","__source__":"10.170.***.***","ip":"10.200.**.***","time":"26/Jan/2016:19:54:02 +0800","url":"POST/PutData?Category=YunOsAccountOpLog&AccessKeyId=&Date=Fri%2C%2028%20Jun%202013%2006%3A53%3A30%20GMT&Topic=raw&Signature=HTTP/1.1","status":"200","user-agent":"aliyun-sdk-java","custom_map":{"value_a":["a","b","c"],"value_b":"b"}}"#;
@@ -1822,7 +1822,7 @@ pub async fn test_identify_pipeline_with_flatten(store_type: StorageType) {
let expected = r#"[["greptime_timestamp","TimestampNanosecond","PRI","NO","","TIMESTAMP"],["__source__","String","","YES","","FIELD"],["__time__","Int64","","YES","","FIELD"],["__topic__","String","","YES","","FIELD"],["custom_map.value_a","Json","","YES","","FIELD"],["custom_map.value_b","String","","YES","","FIELD"],["ip","String","","YES","","FIELD"],["status","String","","YES","","FIELD"],["time","String","","YES","","FIELD"],["url","String","","YES","","FIELD"],["user-agent","String","","YES","","FIELD"]]"#;
validate_data(
- "test_identify_pipeline_with_flatten_desc_logs",
+ "test_identity_pipeline_with_flatten_desc_logs",
&client,
"desc logs",
expected,
@@ -1831,7 +1831,7 @@ pub async fn test_identify_pipeline_with_flatten(store_type: StorageType) {
let expected = "[[[\"a\",\"b\",\"c\"]]]";
validate_data(
- "test_identify_pipeline_with_flatten_select_json",
+ "test_identity_pipeline_with_flatten_select_json",
&client,
"select `custom_map.value_a` from logs",
expected,
@@ -1841,10 +1841,10 @@ pub async fn test_identify_pipeline_with_flatten(store_type: StorageType) {
guard.remove_all().await;
}
-pub async fn test_identify_pipeline_with_custom_ts(store_type: StorageType) {
+pub async fn test_identity_pipeline_with_custom_ts(store_type: StorageType) {
common_telemetry::init_default_ut_logging();
let (app, mut guard) =
- setup_test_http_app_with_frontend(store_type, "test_identify_pipeline_with_custom_ts")
+ setup_test_http_app_with_frontend(store_type, "test_identity_pipeline_with_custom_ts")
.await;
let client = TestClient::new(app).await;
@@ -1868,7 +1868,7 @@ pub async fn test_identify_pipeline_with_custom_ts(store_type: StorageType) {
let expected = r#"[["__time__","TimestampSecond","PRI","NO","","TIMESTAMP"],["__name__","String","","YES","","FIELD"],["__source__","String","","YES","","FIELD"]]"#;
validate_data(
- "test_identify_pipeline_with_custom_ts_desc_logs",
+ "test_identity_pipeline_with_custom_ts_desc_logs",
&client,
"desc logs",
expected,
@@ -1877,7 +1877,7 @@ pub async fn test_identify_pipeline_with_custom_ts(store_type: StorageType) {
let expected = r#"[[1453809242,"hello","10.170.***.***"],[1453809252,null,"10.170.***.***"]]"#;
validate_data(
- "test_identify_pipeline_with_custom_ts_data",
+ "test_identity_pipeline_with_custom_ts_data",
&client,
"select * from logs",
expected,
@@ -1908,7 +1908,7 @@ pub async fn test_identify_pipeline_with_custom_ts(store_type: StorageType) {
let expected = r#"[["__time__","TimestampNanosecond","PRI","NO","","TIMESTAMP"],["__source__","String","","YES","","FIELD"],["__name__","String","","YES","","FIELD"]]"#;
validate_data(
- "test_identify_pipeline_with_custom_ts_desc_logs",
+ "test_identity_pipeline_with_custom_ts_desc_logs",
&client,
"desc logs",
expected,
@@ -1917,7 +1917,7 @@ pub async fn test_identify_pipeline_with_custom_ts(store_type: StorageType) {
let expected = r#"[[1547577721000000000,"10.170.***.***",null],[1547577724000000000,"10.170.***.***","hello"]]"#;
validate_data(
- "test_identify_pipeline_with_custom_ts_data",
+ "test_identity_pipeline_with_custom_ts_data",
&client,
"select * from logs",
expected,
From 7cd6b0f04bc5e8353897701a59363efd1ec2d1fc Mon Sep 17 00:00:00 2001
From: dennis zhuang
Date: Mon, 14 Apr 2025 14:45:24 +0800
Subject: [PATCH 12/82] docs: update readme (#5891)
* docs: update readme
* chore: format
* docs: shorten
* chore: title
* fix: blank
Co-authored-by: Lei, HUANG <6406592+v0y4g3r@users.noreply.github.com>
---------
Co-authored-by: Lei, HUANG <6406592+v0y4g3r@users.noreply.github.com>
---
README.md | 28 +++++++++++++++-------------
1 file changed, 15 insertions(+), 13 deletions(-)
diff --git a/README.md b/README.md
index 07da173117..912183f5bb 100644
--- a/README.md
+++ b/README.md
@@ -6,7 +6,7 @@
-Unified & Cost-Effective Observerability Database for Metrics, Logs, and Events
+Unified & Cost-Effective Observability Database for Metrics, Logs, and Events
@@ -62,7 +62,7 @@
## Introduction
-**GreptimeDB** is an open-source unified & cost-effective observerability database for **Metrics**, **Logs**, and **Events** (also **Traces** in plan). You can gain real-time insights from Edge to Cloud at Any Scale.
+**GreptimeDB** is an open-source, cloud-native, unified & cost-effective observability database for **Metrics**, **Logs**, and **Traces**. You can gain real-time insights from Edge to Cloud at Any Scale.
## News
@@ -70,27 +70,27 @@
## Why GreptimeDB
-Our core developers have been building observerability data platforms for years. Based on our best practices, GreptimeDB was born to give you:
+Our core developers have been building observability data platforms for years. Based on our best practices, GreptimeDB was born to give you:
-* **Unified Processing of Metrics, Logs, and Events**
+* **Unified Processing of Observability Data**
- GreptimeDB unifies observerability data processing by treating all data - whether metrics, logs, or events - as timestamped events with context. Users can analyze this data using either [SQL](https://docs.greptime.com/user-guide/query-data/sql) or [PromQL](https://docs.greptime.com/user-guide/query-data/promql) and leverage stream processing ([Flow](https://docs.greptime.com/user-guide/flow-computation/overview)) to enable continuous aggregation. [Read more](https://docs.greptime.com/user-guide/concepts/data-model).
+ A unified database that treats metrics, logs, and traces as timestamped wide events with context, supporting [SQL](https://docs.greptime.com/user-guide/query-data/sql)/[PromQL](https://docs.greptime.com/user-guide/query-data/promql) queries and [stream processing](https://docs.greptime.com/user-guide/flow-computation/overview) to simplify complex data stacks.
+
+* **High Performance and Cost-effective**
+
+ Written in Rust, combines a distributed query engine with [rich indexing](https://docs.greptime.com/user-guide/manage-data/data-index) (inverted, fulltext, skip data, and vector) and optimized columnar storage to deliver sub-second responses on petabyte-scale data and high-cost efficiency.
* **Cloud-native Distributed Database**
Built for [Kubernetes](https://docs.greptime.com/user-guide/deployments/deploy-on-kubernetes/greptimedb-operator-management). GreptimeDB achieves seamless scalability with its [cloud-native architecture](https://docs.greptime.com/user-guide/concepts/architecture) of separated compute and storage, built on object storage (AWS S3, Azure Blob Storage, etc.) while enabling cross-cloud deployment through a unified data access layer.
-* **Performance and Cost-effective**
+* **Developer-Friendly**
- Written in pure Rust for superior performance and reliability. GreptimeDB features a distributed query engine with intelligent indexing to handle high cardinality data efficiently. Its optimized columnar storage achieves 50x cost efficiency on cloud object storage through advanced compression. [Benchmark reports](https://www.greptime.com/blogs/2024-09-09-report-summary).
+ Access standardized SQL/PromQL interfaces through built-in web dashboard, REST API, and MySQL/PostgreSQL protocols. Supports widely adopted data ingestion [protocols](https://docs.greptime.com/user-guide/protocols/overview) for seamless migration and integration.
-* **Cloud-Edge Collaboration**
+* **Flexible Deployment Options**
- GreptimeDB seamlessly operates across cloud and edge (ARM/Android/Linux), providing consistent APIs and control plane for unified data management and efficient synchronization. [Learn how to run on Android](https://docs.greptime.com/user-guide/deployments/run-on-android/).
-
-* **Multi-protocol Ingestion, SQL & PromQL Ready**
-
- Widely adopted database protocols and APIs, including MySQL, PostgreSQL, InfluxDB, OpenTelemetry, Loki and Prometheus, etc. Effortless Adoption & Seamless Migration. [Supported Protocols Overview](https://docs.greptime.com/user-guide/protocols/overview).
+ Deploy GreptimeDB anywhere from ARM-based edge devices to cloud environments with unified APIs and bandwidth-efficient data synchronization. Query edge and cloud data seamlessly through identical APIs. [Learn how to run on Android](https://docs.greptime.com/user-guide/deployments/run-on-android/).
For more detailed info please read [Why GreptimeDB](https://docs.greptime.com/user-guide/concepts/why-greptimedb).
@@ -233,3 +233,5 @@ Special thanks to all the contributors who have propelled GreptimeDB forward. Fo
- GreptimeDB's query engine is powered by [Apache Arrow DataFusion™](https://arrow.apache.org/datafusion/).
- [Apache OpenDAL™](https://opendal.apache.org) gives GreptimeDB a very general and elegant data access abstraction layer.
- GreptimeDB's meta service is based on [etcd](https://etcd.io/).
+
+
\ No newline at end of file
From e3675494b492978bd1a941e49b350d7d87353f55 Mon Sep 17 00:00:00 2001
From: Zhenchi
Date: Mon, 14 Apr 2025 15:08:59 +0800
Subject: [PATCH 13/82] feat: apply terms with fulltext bloom backend (#5884)
* feat: apply terms with fulltext bloom backend
Signed-off-by: Zhenchi
* perf: preload jieba
Signed-off-by: Zhenchi
* polish doc
Signed-off-by: Zhenchi
---------
Signed-off-by: Zhenchi
---
Cargo.lock | 1 +
src/index/Cargo.toml | 1 +
src/index/src/fulltext_index/tokenizer.rs | 9 +-
.../src/cache/index/bloom_filter_index.rs | 37 +-
src/mito2/src/read/scan_region.rs | 3 +-
src/mito2/src/read/scan_util.rs | 1 +
.../src/sst/index/bloom_filter/applier.rs | 9 +-
.../src/sst/index/fulltext_index/applier.rs | 237 ++++++-
.../index/fulltext_index/applier/builder.rs | 13 +
.../src/sst/index/fulltext_index/creator.rs | 647 ++++++++++++++++--
src/mito2/src/sst/parquet/reader.rs | 66 +-
11 files changed, 930 insertions(+), 94 deletions(-)
diff --git a/Cargo.lock b/Cargo.lock
index 1b108a7546..2ab3200029 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -5591,6 +5591,7 @@ dependencies = [
"greptime-proto",
"itertools 0.14.0",
"jieba-rs",
+ "lazy_static",
"mockall",
"pin-project",
"prost 0.13.5",
diff --git a/src/index/Cargo.toml b/src/index/Cargo.toml
index dc6e394ef4..c4b7057895 100644
--- a/src/index/Cargo.toml
+++ b/src/index/Cargo.toml
@@ -23,6 +23,7 @@ futures.workspace = true
greptime-proto.workspace = true
itertools.workspace = true
jieba-rs = "0.7"
+lazy_static.workspace = true
mockall.workspace = true
pin-project.workspace = true
prost.workspace = true
diff --git a/src/index/src/fulltext_index/tokenizer.rs b/src/index/src/fulltext_index/tokenizer.rs
index 721ffdd3b9..b00e7fda9c 100644
--- a/src/index/src/fulltext_index/tokenizer.rs
+++ b/src/index/src/fulltext_index/tokenizer.rs
@@ -12,11 +12,13 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-use jieba_rs::Jieba;
-
use crate::fulltext_index::error::Result;
use crate::Bytes;
+lazy_static::lazy_static! {
+ static ref JIEBA: jieba_rs::Jieba = jieba_rs::Jieba::new();
+}
+
/// `Tokenizer` tokenizes a text into a list of tokens.
pub trait Tokenizer: Send {
fn tokenize<'a>(&self, text: &'a str) -> Vec<&'a str>;
@@ -44,8 +46,7 @@ pub struct ChineseTokenizer;
impl Tokenizer for ChineseTokenizer {
fn tokenize<'a>(&self, text: &'a str) -> Vec<&'a str> {
- let jieba = Jieba::new();
- jieba.cut(text, false)
+ JIEBA.cut(text, false)
}
}
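Rebuilding the Jieba dictionary on every call was the expensive part; the lazy_static above constructs it once per process. The same effect can be had with the standard library alone (Rust 1.80+), shown here only as an illustrative alternative, not as what the patch does:

use std::sync::LazyLock;

use jieba_rs::Jieba;

// Built once, on first use, and shared by every ChineseTokenizer afterwards.
static JIEBA: LazyLock<Jieba> = LazyLock::new(Jieba::new);

fn tokenize_zh(text: &str) -> Vec<&str> {
    // `false` disables HMM-based new-word discovery, matching the tokenizer above.
    JIEBA.cut(text, false)
}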
diff --git a/src/mito2/src/cache/index/bloom_filter_index.rs b/src/mito2/src/cache/index/bloom_filter_index.rs
index 097acd6367..61df853573 100644
--- a/src/mito2/src/cache/index/bloom_filter_index.rs
+++ b/src/mito2/src/cache/index/bloom_filter_index.rs
@@ -29,8 +29,15 @@ use crate::sst::file::FileId;
const INDEX_TYPE_BLOOM_FILTER_INDEX: &str = "bloom_filter_index";
+/// Tag for bloom filter index cache.
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
+pub enum Tag {
+ Skipping,
+ Fulltext,
+}
+
/// Cache for bloom filter index.
-pub type BloomFilterIndexCache = IndexCache<(FileId, ColumnId), BloomFilterMeta>;
+pub type BloomFilterIndexCache = IndexCache<(FileId, ColumnId, Tag), BloomFilterMeta>;
pub type BloomFilterIndexCacheRef = Arc<BloomFilterIndexCache>;
impl BloomFilterIndexCache {
@@ -48,14 +55,20 @@ impl BloomFilterIndexCache {
}
/// Calculates weight for bloom filter index metadata.
-fn bloom_filter_index_metadata_weight(k: &(FileId, ColumnId), _: &Arc<BloomFilterMeta>) -> u32 {
+fn bloom_filter_index_metadata_weight(
+ k: &(FileId, ColumnId, Tag),
+ _: &Arc<BloomFilterMeta>,
+) -> u32 {
(k.0.as_bytes().len()
+ std::mem::size_of::()
+ std::mem::size_of::()) as u32
}
/// Calculates weight for bloom filter index content.
-fn bloom_filter_index_content_weight((k, _): &((FileId, ColumnId), PageKey), v: &Bytes) -> u32 {
+fn bloom_filter_index_content_weight(
+ (k, _): &((FileId, ColumnId, Tag), PageKey),
+ v: &Bytes,
+) -> u32 {
(k.0.as_bytes().len() + std::mem::size_of::() + v.len()) as u32
}
@@ -63,6 +76,7 @@ fn bloom_filter_index_content_weight((k, _): &((FileId, ColumnId), PageKey), v:
pub struct CachedBloomFilterIndexBlobReader {
file_id: FileId,
column_id: ColumnId,
+ tag: Tag,
blob_size: u64,
inner: R,
cache: BloomFilterIndexCacheRef,
@@ -73,6 +87,7 @@ impl CachedBloomFilterIndexBlobReader {
pub fn new(
file_id: FileId,
column_id: ColumnId,
+ tag: Tag,
blob_size: u64,
inner: R,
cache: BloomFilterIndexCacheRef,
@@ -80,6 +95,7 @@ impl CachedBloomFilterIndexBlobReader {
Self {
file_id,
column_id,
+ tag,
blob_size,
inner,
cache,
@@ -93,7 +109,7 @@ impl BloomFilterReader for CachedBloomFilterIndexBlobReader
let inner = &self.inner;
self.cache
.get_or_load(
- (self.file_id, self.column_id),
+ (self.file_id, self.column_id, self.tag),
self.blob_size,
offset,
size,
@@ -107,7 +123,7 @@ impl BloomFilterReader for CachedBloomFilterIndexBlobReader
let fetch = ranges.iter().map(|range| {
let inner = &self.inner;
self.cache.get_or_load(
- (self.file_id, self.column_id),
+ (self.file_id, self.column_id, self.tag),
self.blob_size,
range.start,
(range.end - range.start) as u32,
@@ -123,13 +139,18 @@ impl BloomFilterReader for CachedBloomFilterIndexBl
/// Reads the meta information of the bloom filter.
async fn metadata(&self) -> Result<BloomFilterMeta> {
- if let Some(cached) = self.cache.get_metadata((self.file_id, self.column_id)) {
+ if let Some(cached) = self
+ .cache
+ .get_metadata((self.file_id, self.column_id, self.tag))
+ {
CACHE_HIT.with_label_values(&[INDEX_METADATA_TYPE]).inc();
Ok((*cached).clone())
} else {
let meta = self.inner.metadata().await?;
- self.cache
- .put_metadata((self.file_id, self.column_id), Arc::new(meta.clone()));
+ self.cache.put_metadata(
+ (self.file_id, self.column_id, self.tag),
+ Arc::new(meta.clone()),
+ );
CACHE_MISS.with_label_values(&[INDEX_METADATA_TYPE]).inc();
Ok(meta)
}
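Adding `Tag` to the cache key is what lets the skipping-index and fulltext bloom filters for the same column of the same SST coexist in one cache. A small illustrative helper (not part of the patch, and assuming these items are importable as shown) makes the key shape explicit:

use mito2::cache::index::bloom_filter_index::Tag;
use mito2::sst::file::FileId;
use store_api::storage::ColumnId;

// One (file, column) pair now yields two distinct cache keys, one per index
// kind, so the two bloom filters can never evict or shadow each other.
fn bloom_filter_cache_keys(file_id: FileId, column_id: ColumnId) -> [(FileId, ColumnId, Tag); 2] {
    [
        (file_id, column_id, Tag::Skipping),
        (file_id, column_id, Tag::Fulltext),
    ]
}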
diff --git a/src/mito2/src/read/scan_region.rs b/src/mito2/src/read/scan_region.rs
index 855455453f..8b9efb9ae8 100644
--- a/src/mito2/src/read/scan_region.rs
+++ b/src/mito2/src/read/scan_region.rs
@@ -502,7 +502,7 @@ impl ScanRegion {
let file_cache = self.cache_strategy.write_cache().map(|w| w.file_cache());
let puffin_metadata_cache = self.cache_strategy.puffin_metadata_cache().cloned();
-
+ let bloom_filter_index_cache = self.cache_strategy.bloom_filter_index_cache().cloned();
FulltextIndexApplierBuilder::new(
self.access_layer.region_dir().to_string(),
self.version.metadata.region_id,
@@ -512,6 +512,7 @@ impl ScanRegion {
)
.with_file_cache(file_cache)
.with_puffin_metadata_cache(puffin_metadata_cache)
+ .with_bloom_filter_cache(bloom_filter_index_cache)
.build(&self.request.filters)
.inspect_err(|err| warn!(err; "Failed to build fulltext index applier"))
.ok()
diff --git a/src/mito2/src/read/scan_util.rs b/src/mito2/src/read/scan_util.rs
index 4a211f7117..36899796b0 100644
--- a/src/mito2/src/read/scan_util.rs
+++ b/src/mito2/src/read/scan_util.rs
@@ -505,6 +505,7 @@ pub(crate) fn scan_file_ranges(
// Reports metrics.
reader_metrics.observe_rows(read_type);
+ reader_metrics.filter_metrics.observe();
part_metrics.merge_reader_metrics(&reader_metrics);
}
}
diff --git a/src/mito2/src/sst/index/bloom_filter/applier.rs b/src/mito2/src/sst/index/bloom_filter/applier.rs
index afd5cc16cd..fac5db5405 100644
--- a/src/mito2/src/sst/index/bloom_filter/applier.rs
+++ b/src/mito2/src/sst/index/bloom_filter/applier.rs
@@ -31,7 +31,7 @@ use store_api::storage::{ColumnId, RegionId};
use crate::access_layer::{RegionFilePathFactory, WriteCachePathProvider};
use crate::cache::file_cache::{FileCacheRef, FileType, IndexKey};
use crate::cache::index::bloom_filter_index::{
- BloomFilterIndexCacheRef, CachedBloomFilterIndexBlobReader,
+ BloomFilterIndexCacheRef, CachedBloomFilterIndexBlobReader, Tag,
};
use crate::error::{
ApplyBloomFilterIndexSnafu, Error, MetadataSnafu, PuffinBuildReaderSnafu, PuffinReadBlobSnafu,
@@ -165,6 +165,7 @@ impl BloomFilterIndexApplier {
let reader = CachedBloomFilterIndexBlobReader::new(
file_id,
*column_id,
+ Tag::Skipping,
blob_size,
BloomFilterReaderImpl::new(blob),
bloom_filter_cache.clone(),
@@ -308,13 +309,13 @@ impl BloomFilterIndexApplier {
) -> std::result::Result<(), index::bloom_filter::error::Error> {
let mut applier = BloomFilterApplier::new(Box::new(reader)).await?;
- for (_, output) in output.iter_mut() {
+ for (_, row_group_output) in output.iter_mut() {
// All rows are filtered out, skip the search
- if output.is_empty() {
+ if row_group_output.is_empty() {
continue;
}
- *output = applier.search(predicates, output).await?;
+ *row_group_output = applier.search(predicates, row_group_output).await?;
}
Ok(())
diff --git a/src/mito2/src/sst/index/fulltext_index/applier.rs b/src/mito2/src/sst/index/fulltext_index/applier.rs
index 94ceda6891..063227a89f 100644
--- a/src/mito2/src/sst/index/fulltext_index/applier.rs
+++ b/src/mito2/src/sst/index/fulltext_index/applier.rs
@@ -12,10 +12,15 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-use std::collections::{BTreeSet, HashMap};
+use std::collections::{BTreeSet, HashMap, HashSet};
+use std::iter;
+use std::ops::Range;
use std::sync::Arc;
+use common_base::range_read::RangeReader;
use common_telemetry::warn;
+use index::bloom_filter::applier::{BloomFilterApplier, InListPredicate};
+use index::bloom_filter::reader::BloomFilterReaderImpl;
use index::fulltext_index::search::{FulltextIndexSearcher, RowId, TantivyFulltextIndexSearcher};
use index::fulltext_index::Config;
use object_store::ObjectStore;
@@ -26,11 +31,17 @@ use store_api::storage::{ColumnId, RegionId};
use crate::access_layer::{RegionFilePathFactory, WriteCachePathProvider};
use crate::cache::file_cache::{FileCacheRef, FileType, IndexKey};
-use crate::error::{ApplyFulltextIndexSnafu, PuffinBuildReaderSnafu, PuffinReadBlobSnafu, Result};
+use crate::cache::index::bloom_filter_index::{
+ BloomFilterIndexCacheRef, CachedBloomFilterIndexBlobReader, Tag,
+};
+use crate::error::{
+ ApplyBloomFilterIndexSnafu, ApplyFulltextIndexSnafu, MetadataSnafu, PuffinBuildReaderSnafu,
+ PuffinReadBlobSnafu, Result,
+};
use crate::metrics::INDEX_APPLY_ELAPSED;
use crate::sst::file::FileId;
-use crate::sst::index::fulltext_index::applier::builder::FulltextRequest;
-use crate::sst::index::fulltext_index::INDEX_BLOB_TYPE_TANTIVY;
+use crate::sst::index::fulltext_index::applier::builder::{FulltextRequest, FulltextTerm};
+use crate::sst::index::fulltext_index::{INDEX_BLOB_TYPE_BLOOM, INDEX_BLOB_TYPE_TANTIVY};
use crate::sst::index::puffin_manager::{
PuffinManagerFactory, SstPuffinBlob, SstPuffinDir, SstPuffinReader,
};
@@ -45,6 +56,9 @@ pub struct FulltextIndexApplier {
/// The source of the index.
index_source: IndexSource,
+
+ /// Cache for bloom filter index.
+ bloom_filter_index_cache: Option,
}
pub type FulltextIndexApplierRef = Arc<FulltextIndexApplier>;
@@ -63,6 +77,7 @@ impl FulltextIndexApplier {
Self {
requests,
index_source,
+ bloom_filter_index_cache: None,
}
}
@@ -82,13 +97,25 @@ impl FulltextIndexApplier {
self
}
- /// Applies the queries to the fulltext index of the specified SST file.
- pub async fn apply(
+ /// Sets the bloom filter cache.
+ pub fn with_bloom_filter_cache(
+ mut self,
+ bloom_filter_index_cache: Option<BloomFilterIndexCacheRef>,
+ ) -> Self {
+ self.bloom_filter_index_cache = bloom_filter_index_cache;
+ self
+ }
+}
+
+impl FulltextIndexApplier {
+ /// Applies fine-grained fulltext index to the specified SST file.
+ /// Returns the row ids that match the queries.
+ pub async fn apply_fine(
&self,
file_id: FileId,
file_size_hint: Option<u64>,
) -> Result<Option<BTreeSet<RowId>>> {
- let _timer = INDEX_APPLY_ELAPSED
+ let timer = INDEX_APPLY_ELAPSED
.with_label_values(&[TYPE_FULLTEXT_INDEX])
.start_timer();
@@ -99,7 +126,7 @@ impl FulltextIndexApplier {
}
let Some(result) = self
- .apply_one_column(file_size_hint, file_id, *column_id, request)
+ .apply_fine_one_column(file_size_hint, file_id, *column_id, request)
.await?
else {
continue;
@@ -118,10 +145,13 @@ impl FulltextIndexApplier {
}
}
+ if row_ids.is_none() {
+ timer.stop_and_discard();
+ }
Ok(row_ids)
}
- async fn apply_one_column(
+ async fn apply_fine_one_column(
&self,
file_size_hint: Option<u64>,
file_id: FileId,
@@ -187,6 +217,195 @@ impl FulltextIndexApplier {
}
}
+impl FulltextIndexApplier {
+ /// Applies coarse-grained fulltext index to the specified SST file.
+ /// Returns (row group id -> ranges) that match the queries.
+ pub async fn apply_coarse(
+ &self,
+ file_id: FileId,
+ file_size_hint: Option<u64>,
+ row_groups: impl Iterator<Item = (usize, bool)>,
+ ) -> Result<Option<Vec<(usize, Vec<Range<usize>>)>>> {
+ let timer = INDEX_APPLY_ELAPSED
+ .with_label_values(&[TYPE_FULLTEXT_INDEX])
+ .start_timer();
+
+ let (input, mut output) = Self::init_coarse_output(row_groups);
+ let mut applied = false;
+
+ for (column_id, request) in &self.requests {
+ if request.terms.is_empty() {
+ // Only term requests can be applied to the coarse index; skip requests without terms.
+ continue;
+ }
+
+ applied |= self
+ .apply_coarse_one_column(
+ file_id,
+ file_size_hint,
+ *column_id,
+ &request.terms,
+ &mut output,
+ )
+ .await?;
+ }
+
+ if !applied {
+ timer.stop_and_discard();
+ return Ok(None);
+ }
+
+ Self::adjust_coarse_output(input, &mut output);
+ Ok(Some(output))
+ }
+
+ async fn apply_coarse_one_column(
+ &self,
+ file_id: FileId,
+ file_size_hint: Option<u64>,
+ column_id: ColumnId,
+ terms: &[FulltextTerm],
+ output: &mut [(usize, Vec<Range<usize>>)],
+ ) -> Result<bool> {
+ let blob_key = format!("{INDEX_BLOB_TYPE_BLOOM}-{column_id}");
+ let Some(reader) = self
+ .index_source
+ .blob(file_id, &blob_key, file_size_hint)
+ .await?
+ else {
+ return Ok(false);
+ };
+ let config =
+ Config::from_blob_metadata(reader.metadata()).context(ApplyFulltextIndexSnafu)?;
+
+ let predicates = Self::terms_to_predicates(terms, &config);
+ if predicates.is_empty() {
+ return Ok(false);
+ }
+
+ let range_reader = reader.reader().await.context(PuffinBuildReaderSnafu)?;
+ let reader = if let Some(bloom_filter_cache) = &self.bloom_filter_index_cache {
+ let blob_size = range_reader
+ .metadata()
+ .await
+ .context(MetadataSnafu)?
+ .content_length;
+ let reader = CachedBloomFilterIndexBlobReader::new(
+ file_id,
+ column_id,
+ Tag::Fulltext,
+ blob_size,
+ BloomFilterReaderImpl::new(range_reader),
+ bloom_filter_cache.clone(),
+ );
+ Box::new(reader) as _
+ } else {
+ Box::new(BloomFilterReaderImpl::new(range_reader)) as _
+ };
+
+ let mut applier = BloomFilterApplier::new(reader)
+ .await
+ .context(ApplyBloomFilterIndexSnafu)?;
+ for (_, row_group_output) in output.iter_mut() {
+ // All rows are filtered out, skip the search
+ if row_group_output.is_empty() {
+ continue;
+ }
+
+ *row_group_output = applier
+ .search(&predicates, row_group_output)
+ .await
+ .context(ApplyBloomFilterIndexSnafu)?;
+ }
+
+ Ok(true)
+ }
+
+ /// Initializes the coarse output. Must call `adjust_coarse_output` after applying bloom filters.
+ ///
+ /// `row_groups` is a list of (row group length, whether to search).
+ ///
+ /// Returns (`input`, `output`):
+ /// * `input` is a list of (row group index to search, row group range based on start of the file).
+ /// * `output` is a list of (row group index to search, row group ranges based on start of the file).
+ #[allow(clippy::type_complexity)]
+ fn init_coarse_output(
+ row_groups: impl Iterator<Item = (usize, bool)>,
+ ) -> (Vec<(usize, Range<usize>)>, Vec<(usize, Vec<Range<usize>>)>) {
+ // Calculates row groups' ranges based on start of the file.
+ let mut input = Vec::with_capacity(row_groups.size_hint().0);
+ let mut start = 0;
+ for (i, (len, to_search)) in row_groups.enumerate() {
+ let end = start + len;
+ if to_search {
+ input.push((i, start..end));
+ }
+ start = end;
+ }
+
+ // Initializes output with the input ranges. These ranges are based on the start of the file,
+ // not the row group, so they need to be adjusted later.
+ let output = input
+ .iter()
+ .map(|(i, range)| (*i, vec![range.clone()]))
+ .collect::<Vec<_>>();
+
+ (input, output)
+ }
+
+ /// Adjusts the coarse output. Makes the output ranges based on row group start.
+ fn adjust_coarse_output(
+ input: Vec<(usize, Range<usize>)>,
+ output: &mut Vec<(usize, Vec<Range<usize>>)>,
+ ) {
+ // adjust ranges to be based on row group
+ for ((_, output), (_, input)) in output.iter_mut().zip(input) {
+ let start = input.start;
+ for range in output.iter_mut() {
+ range.start -= start;
+ range.end -= start;
+ }
+ }
+ output.retain(|(_, ranges)| !ranges.is_empty());
+ }
+
+ /// Converts terms to predicates.
+ ///
+ /// Splits terms by non-alphanumeric characters and converts them to lowercase if the index is case-insensitive.
+ /// Multiple terms are combined with AND semantics.
+ fn terms_to_predicates(terms: &[FulltextTerm], config: &Config) -> Vec<InListPredicate> {
+ let mut probes = HashSet::new();
+ for term in terms {
+ if config.case_sensitive && term.col_lowered {
+ // lowercased terms are not indexed
+ continue;
+ }
+
+ let ts = term
+ .term
+ .split(|c: char| !c.is_alphanumeric())
+ .filter(|&t| !t.is_empty())
+ .map(|t| {
+ if !config.case_sensitive {
+ t.to_lowercase()
+ } else {
+ t.to_string()
+ }
+ .into_bytes()
+ });
+
+ probes.extend(ts);
+ }
+
+ probes
+ .into_iter()
+ .map(|p| InListPredicate {
+ list: iter::once(p).collect(),
+ })
+ .collect::<Vec<_>>()
+ }
+}
+
/// The source of the index.
struct IndexSource {
region_dir: String,
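A small standalone sketch of the coarse-output bookkeeping added above (init_coarse_output / adjust_coarse_output): row groups are laid out back to back, only the searched ones get an initial file-based range, and after the bloom filter narrows those ranges they are rebased to the start of their row group. It mirrors the patch code with simplified error handling.

    use std::ops::Range;

    // Row groups are laid out back to back; only searched ones get an initial range
    // covering the whole group, expressed in file-based row offsets.
    fn init_coarse_output(
        row_groups: impl Iterator<Item = (usize, bool)>,
    ) -> (Vec<(usize, Range<usize>)>, Vec<(usize, Vec<Range<usize>>)>) {
        let mut input = Vec::new();
        let mut start = 0;
        for (i, (len, to_search)) in row_groups.enumerate() {
            let end = start + len;
            if to_search {
                input.push((i, start..end));
            }
            start = end;
        }
        let output = input.iter().map(|(i, r)| (*i, vec![r.clone()])).collect();
        (input, output)
    }

    // After filtering, rebase the surviving ranges onto the start of their row group
    // and drop row groups that were filtered out entirely.
    fn adjust_coarse_output(
        input: Vec<(usize, Range<usize>)>,
        output: &mut Vec<(usize, Vec<Range<usize>>)>,
    ) {
        for ((_, ranges), (_, file_range)) in output.iter_mut().zip(input) {
            for range in ranges.iter_mut() {
                range.start -= file_range.start;
                range.end -= file_range.start;
            }
        }
        output.retain(|(_, ranges)| !ranges.is_empty());
    }

    fn main() {
        // Two row groups of 100 rows each; only the second one is searched.
        let (input, mut output) = init_coarse_output([(100, false), (100, true)].into_iter());
        // Pretend the bloom filter narrowed the match down to file rows 150..160.
        output[0].1 = vec![150..160];
        adjust_coarse_output(input, &mut output);
        assert_eq!(output, vec![(1, vec![50..60])]);
    }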
diff --git a/src/mito2/src/sst/index/fulltext_index/applier/builder.rs b/src/mito2/src/sst/index/fulltext_index/applier/builder.rs
index 14f5936a01..3297275f26 100644
--- a/src/mito2/src/sst/index/fulltext_index/applier/builder.rs
+++ b/src/mito2/src/sst/index/fulltext_index/applier/builder.rs
@@ -23,6 +23,7 @@ use store_api::metadata::RegionMetadata;
use store_api::storage::{ColumnId, ConcreteDataType, RegionId};
use crate::cache::file_cache::FileCacheRef;
+use crate::cache::index::bloom_filter_index::BloomFilterIndexCacheRef;
use crate::error::Result;
use crate::sst::index::fulltext_index::applier::FulltextIndexApplier;
use crate::sst::index::puffin_manager::PuffinManagerFactory;
@@ -86,6 +87,7 @@ pub struct FulltextIndexApplierBuilder<'a> {
metadata: &'a RegionMetadata,
file_cache: Option<FileCacheRef>,
puffin_metadata_cache: Option,
+ bloom_filter_cache: Option<BloomFilterIndexCacheRef>,
}
impl<'a> FulltextIndexApplierBuilder<'a> {
@@ -105,6 +107,7 @@ impl<'a> FulltextIndexApplierBuilder<'a> {
metadata,
file_cache: None,
puffin_metadata_cache: None,
+ bloom_filter_cache: None,
}
}
@@ -123,6 +126,15 @@ impl<'a> FulltextIndexApplierBuilder<'a> {
self
}
+ /// Sets the bloom filter cache to be used by the `FulltextIndexApplier`.
+ pub fn with_bloom_filter_cache(
+ mut self,
+ bloom_filter_cache: Option<BloomFilterIndexCacheRef>,
+ ) -> Self {
+ self.bloom_filter_cache = bloom_filter_cache;
+ self
+ }
+
/// Builds `SstIndexApplier` from the given expressions.
pub fn build(self, exprs: &[Expr]) -> Result<Option<FulltextIndexApplier>> {
let mut requests = HashMap::new();
@@ -145,6 +157,7 @@ impl<'a> FulltextIndexApplierBuilder<'a> {
)
.with_file_cache(self.file_cache)
.with_puffin_metadata_cache(self.puffin_metadata_cache)
+ .with_bloom_filter_cache(self.bloom_filter_cache)
}))
}
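For illustration only, a minimal sketch of the consuming-builder setter pattern the builder above follows: each with_* method takes self by value, stores the optional cache handle, and returns self so the calls can be chained (as ScanRegion does). The fields here are placeholder Strings, not the real cache handles.

    // Placeholder builder demonstrating the chained with_* setters; the real builder
    // carries the optional file cache, puffin metadata cache, and bloom filter index cache.
    #[derive(Default, Debug)]
    struct ApplierBuilder {
        file_cache: Option<String>,
        puffin_metadata_cache: Option<String>,
        bloom_filter_cache: Option<String>,
    }

    impl ApplierBuilder {
        fn with_file_cache(mut self, cache: Option<String>) -> Self {
            self.file_cache = cache;
            self
        }

        fn with_puffin_metadata_cache(mut self, cache: Option<String>) -> Self {
            self.puffin_metadata_cache = cache;
            self
        }

        fn with_bloom_filter_cache(mut self, cache: Option<String>) -> Self {
            self.bloom_filter_cache = cache;
            self
        }
    }

    fn main() {
        let builder = ApplierBuilder::default()
            .with_file_cache(None)
            .with_puffin_metadata_cache(None)
            .with_bloom_filter_cache(Some("bloom-filter-index-cache".to_string()));
        println!("{builder:?}");
    }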
diff --git a/src/mito2/src/sst/index/fulltext_index/creator.rs b/src/mito2/src/sst/index/fulltext_index/creator.rs
index 12b83e39d0..1d884ac3a5 100644
--- a/src/mito2/src/sst/index/fulltext_index/creator.rs
+++ b/src/mito2/src/sst/index/fulltext_index/creator.rs
@@ -360,6 +360,7 @@ mod tests {
use std::sync::Arc;
use api::v1::SemanticType;
+ use common_base::BitVec;
use datatypes::data_type::DataType;
use datatypes::schema::{ColumnSchema, FulltextAnalyzer, FulltextOptions};
use datatypes::vectors::{UInt64Vector, UInt8Vector};
@@ -390,7 +391,7 @@ mod tests {
IntermediateManager::init_fs(path).await.unwrap()
}
- fn mock_region_metadata() -> RegionMetadataRef {
+ fn mock_region_metadata(backend: FulltextBackend) -> RegionMetadataRef {
let mut builder = RegionMetadataBuilder::new(RegionId::new(1, 2));
builder
.push_column_metadata(ColumnMetadata {
@@ -403,7 +404,7 @@ mod tests {
enable: true,
analyzer: FulltextAnalyzer::English,
case_sensitive: true,
- backend: FulltextBackend::Tantivy,
+ backend: backend.clone(),
})
.unwrap(),
semantic_type: SemanticType::Field,
@@ -419,7 +420,7 @@ mod tests {
enable: true,
analyzer: FulltextAnalyzer::English,
case_sensitive: false,
- backend: FulltextBackend::Tantivy,
+ backend: backend.clone(),
})
.unwrap(),
semantic_type: SemanticType::Field,
@@ -435,7 +436,7 @@ mod tests {
enable: true,
analyzer: FulltextAnalyzer::Chinese,
case_sensitive: false,
- backend: FulltextBackend::Tantivy,
+ backend: backend.clone(),
})
.unwrap(),
semantic_type: SemanticType::Field,
@@ -522,6 +523,7 @@ mod tests {
/// - `terms`: A list of (ColumnId, [(bool, String)]) for fulltext terms, where bool indicates if term is lowercased
async fn build_fulltext_applier_factory(
prefix: &str,
+ backend: FulltextBackend,
rows: &[(
Option<&str>, // text_english_case_sensitive
Option<&str>, // text_english_case_insensitive
@@ -530,12 +532,13 @@ mod tests {
) -> impl Fn(
Vec<(ColumnId, &str)>,
Vec<(ColumnId, Vec<(bool, &str)>)>,
+ Option<BitVec>,
) -> BoxFuture<'static, Option<BTreeSet<RowId>>> {
let (d, factory) = PuffinManagerFactory::new_for_test_async(prefix).await;
let region_dir = "region0".to_string();
let sst_file_id = FileId::random();
let object_store = mock_object_store();
- let region_metadata = mock_region_metadata();
+ let region_metadata = mock_region_metadata(backend.clone());
let intm_mgr = new_intm_mgr(d.path().to_string_lossy()).await;
let mut indexer = FulltextIndexer::new(
@@ -544,7 +547,7 @@ mod tests {
&intm_mgr,
®ion_metadata,
true,
- 8096,
+ 1,
1024,
)
.await
@@ -562,7 +565,9 @@ mod tests {
let _ = indexer.finish(&mut writer).await.unwrap();
writer.finish().await.unwrap();
- move |queries: Vec<(ColumnId, &str)>, terms_requests: Vec<(ColumnId, Vec<(bool, &str)>)>| {
+ move |queries: Vec<(ColumnId, &str)>,
+ terms_requests: Vec<(ColumnId, Vec<(bool, &str)>)>,
+ coarse_mask: Option<BitVec>| {
let _d = &d;
let region_dir = region_dir.clone();
let object_store = object_store.clone();
@@ -604,7 +609,29 @@ mod tests {
factory,
);
- async move { applier.apply(sst_file_id, None).await.unwrap() }.boxed()
+ let backend = backend.clone();
+ async move {
+ match backend {
+ FulltextBackend::Tantivy => {
+ applier.apply_fine(sst_file_id, None).await.unwrap()
+ }
+ FulltextBackend::Bloom => {
+ let coarse_mask = coarse_mask.unwrap_or_default();
+ let row_groups = (0..coarse_mask.len()).map(|i| (1, coarse_mask[i]));
+ // row group id == row id
+ let resp = applier
+ .apply_coarse(sst_file_id, None, row_groups)
+ .await
+ .unwrap();
+ resp.map(|r| {
+ r.into_iter()
+ .map(|(row_group_id, _)| row_group_id as RowId)
+ .collect()
+ })
+ }
+ }
+ }
+ .boxed()
}
}
@@ -613,9 +640,10 @@ mod tests {
}
#[tokio::test]
- async fn test_fulltext_index_basic_case_sensitive() {
+ async fn test_fulltext_index_basic_case_sensitive_tantivy() {
let applier_factory = build_fulltext_applier_factory(
- "test_fulltext_index_basic_case_sensitive_",
+ "test_fulltext_index_basic_case_sensitive_tantivy_",
+ FulltextBackend::Tantivy,
&[
(Some("hello"), None, None),
(Some("world"), None, None),
@@ -625,47 +653,159 @@ mod tests {
)
.await;
- let row_ids = applier_factory(vec![(1, "hello")], vec![]).await;
+ let row_ids = applier_factory(vec![(1, "hello")], vec![], None).await;
assert_eq!(row_ids, Some(rows([0])));
- let row_ids = applier_factory(vec![(1, "world")], vec![]).await;
+ let row_ids = applier_factory(vec![(1, "world")], vec![], None).await;
assert_eq!(row_ids, Some(rows([1])));
- let row_ids = applier_factory(vec![(1, "Hello")], vec![]).await;
+ let row_ids = applier_factory(vec![(1, "Hello")], vec![], None).await;
assert_eq!(row_ids, Some(rows([3])));
- let row_ids = applier_factory(vec![(1, "World")], vec![]).await;
+ let row_ids = applier_factory(vec![(1, "World")], vec![], None).await;
assert_eq!(row_ids, Some(rows([3])));
- let row_ids = applier_factory(vec![], vec![(1, vec![(false, "hello")])]).await;
+ let row_ids = applier_factory(vec![], vec![(1, vec![(false, "hello")])], None).await;
assert_eq!(row_ids, Some(rows([0])));
- let row_ids = applier_factory(vec![], vec![(1, vec![(true, "hello")])]).await;
+ let row_ids = applier_factory(vec![], vec![(1, vec![(true, "hello")])], None).await;
assert_eq!(row_ids, None);
- let row_ids = applier_factory(vec![], vec![(1, vec![(false, "world")])]).await;
+ let row_ids = applier_factory(vec![], vec![(1, vec![(false, "world")])], None).await;
assert_eq!(row_ids, Some(rows([1])));
- let row_ids = applier_factory(vec![], vec![(1, vec![(true, "world")])]).await;
+ let row_ids = applier_factory(vec![], vec![(1, vec![(true, "world")])], None).await;
assert_eq!(row_ids, None);
- let row_ids = applier_factory(vec![], vec![(1, vec![(false, "Hello")])]).await;
+ let row_ids = applier_factory(vec![], vec![(1, vec![(false, "Hello")])], None).await;
assert_eq!(row_ids, Some(rows([3])));
- let row_ids = applier_factory(vec![], vec![(1, vec![(true, "Hello")])]).await;
+ let row_ids = applier_factory(vec![], vec![(1, vec![(true, "Hello")])], None).await;
assert_eq!(row_ids, None);
- let row_ids = applier_factory(vec![], vec![(1, vec![(false, "Hello, World")])]).await;
+ let row_ids = applier_factory(vec![], vec![(1, vec![(false, "Hello, World")])], None).await;
assert_eq!(row_ids, Some(rows([3])));
- let row_ids = applier_factory(vec![], vec![(1, vec![(true, "Hello, World")])]).await;
+ let row_ids = applier_factory(vec![], vec![(1, vec![(true, "Hello, World")])], None).await;
assert_eq!(row_ids, None);
}
#[tokio::test]
- async fn test_fulltext_index_basic_case_insensitive() {
+ async fn test_fulltext_index_basic_case_sensitive_bloom() {
let applier_factory = build_fulltext_applier_factory(
- "test_fulltext_index_basic_case_insensitive_",
+ "test_fulltext_index_basic_case_sensitive_bloom_",
+ FulltextBackend::Bloom,
+ &[
+ (Some("hello"), None, None),
+ (Some("world"), None, None),
+ (None, None, None),
+ (Some("Hello, World"), None, None),
+ ],
+ )
+ .await;
+
+ let row_ids = applier_factory(
+ vec![],
+ vec![(1, vec![(false, "hello")])],
+ Some(BitVec::from_slice(&[0b1111])),
+ )
+ .await;
+ assert_eq!(row_ids, Some(rows([0])));
+
+ let row_ids = applier_factory(
+ vec![],
+ vec![(1, vec![(false, "hello")])],
+ Some(BitVec::from_slice(&[0b1110])), // row 0 is filtered out
+ )
+ .await;
+ assert_eq!(row_ids, Some(rows([])));
+
+ let row_ids = applier_factory(
+ vec![],
+ vec![(1, vec![(true, "hello")])],
+ Some(BitVec::from_slice(&[0b1111])),
+ )
+ .await;
+ assert_eq!(row_ids, None);
+
+ let row_ids = applier_factory(
+ vec![],
+ vec![(1, vec![(false, "world")])],
+ Some(BitVec::from_slice(&[0b1111])),
+ )
+ .await;
+ assert_eq!(row_ids, Some(rows([1])));
+
+ let row_ids = applier_factory(
+ vec![],
+ vec![(1, vec![(false, "world")])],
+ Some(BitVec::from_slice(&[0b1101])), // row 1 is filtered out
+ )
+ .await;
+ assert_eq!(row_ids, Some(rows([])));
+
+ let row_ids = applier_factory(
+ vec![],
+ vec![(1, vec![(true, "world")])],
+ Some(BitVec::from_slice(&[0b1111])),
+ )
+ .await;
+ assert_eq!(row_ids, None);
+
+ let row_ids = applier_factory(
+ vec![],
+ vec![(1, vec![(false, "Hello")])],
+ Some(BitVec::from_slice(&[0b1111])),
+ )
+ .await;
+ assert_eq!(row_ids, Some(rows([3])));
+
+ let row_ids = applier_factory(
+ vec![],
+ vec![(1, vec![(false, "Hello")])],
+ Some(BitVec::from_slice(&[0b0111])), // row 3 is filtered out
+ )
+ .await;
+ assert_eq!(row_ids, Some(rows([])));
+
+ let row_ids = applier_factory(
+ vec![],
+ vec![(1, vec![(true, "Hello")])],
+ Some(BitVec::from_slice(&[0b1111])),
+ )
+ .await;
+ assert_eq!(row_ids, None);
+
+ let row_ids = applier_factory(
+ vec![],
+ vec![(1, vec![(false, "Hello, World")])],
+ Some(BitVec::from_slice(&[0b1111])),
+ )
+ .await;
+ assert_eq!(row_ids, Some(rows([3])));
+
+ let row_ids = applier_factory(
+ vec![],
+ vec![(1, vec![(false, "Hello, World")])],
+ Some(BitVec::from_slice(&[0b0111])), // row 3 is filtered out
+ )
+ .await;
+ assert_eq!(row_ids, Some(rows([])));
+
+ let row_ids = applier_factory(
+ vec![],
+ vec![(1, vec![(true, "Hello, World")])],
+ Some(BitVec::from_slice(&[0b1111])),
+ )
+ .await;
+ assert_eq!(row_ids, None);
+ }
+
+ #[tokio::test]
+ async fn test_fulltext_index_basic_case_insensitive_tantivy() {
+ let applier_factory = build_fulltext_applier_factory(
+ "test_fulltext_index_basic_case_insensitive_tantivy_",
+ FulltextBackend::Tantivy,
&[
(None, Some("hello"), None),
(None, None, None),
@@ -675,47 +815,191 @@ mod tests {
)
.await;
- let row_ids = applier_factory(vec![(2, "hello")], vec![]).await;
+ let row_ids = applier_factory(vec![(2, "hello")], vec![], None).await;
assert_eq!(row_ids, Some(rows([0, 3])));
- let row_ids = applier_factory(vec![(2, "world")], vec![]).await;
+ let row_ids = applier_factory(vec![(2, "world")], vec![], None).await;
assert_eq!(row_ids, Some(rows([2, 3])));
- let row_ids = applier_factory(vec![(2, "Hello")], vec![]).await;
+ let row_ids = applier_factory(vec![(2, "Hello")], vec![], None).await;
assert_eq!(row_ids, Some(rows([0, 3])));
- let row_ids = applier_factory(vec![(2, "World")], vec![]).await;
+ let row_ids = applier_factory(vec![(2, "World")], vec![], None).await;
assert_eq!(row_ids, Some(rows([2, 3])));
- let row_ids = applier_factory(vec![], vec![(2, vec![(false, "hello")])]).await;
+ let row_ids = applier_factory(vec![], vec![(2, vec![(false, "hello")])], None).await;
assert_eq!(row_ids, Some(rows([0, 3])));
- let row_ids = applier_factory(vec![], vec![(2, vec![(true, "hello")])]).await;
+ let row_ids = applier_factory(vec![], vec![(2, vec![(true, "hello")])], None).await;
assert_eq!(row_ids, Some(rows([0, 3])));
- let row_ids = applier_factory(vec![], vec![(2, vec![(false, "world")])]).await;
+ let row_ids = applier_factory(vec![], vec![(2, vec![(false, "world")])], None).await;
assert_eq!(row_ids, Some(rows([2, 3])));
- let row_ids = applier_factory(vec![], vec![(2, vec![(true, "world")])]).await;
+ let row_ids = applier_factory(vec![], vec![(2, vec![(true, "world")])], None).await;
assert_eq!(row_ids, Some(rows([2, 3])));
- let row_ids = applier_factory(vec![], vec![(2, vec![(false, "Hello")])]).await;
+ let row_ids = applier_factory(vec![], vec![(2, vec![(false, "Hello")])], None).await;
assert_eq!(row_ids, Some(rows([0, 3])));
- let row_ids = applier_factory(vec![], vec![(2, vec![(true, "Hello")])]).await;
+ let row_ids = applier_factory(vec![], vec![(2, vec![(true, "Hello")])], None).await;
assert_eq!(row_ids, Some(rows([0, 3])));
- let row_ids = applier_factory(vec![], vec![(2, vec![(false, "World")])]).await;
+ let row_ids = applier_factory(vec![], vec![(2, vec![(false, "World")])], None).await;
assert_eq!(row_ids, Some(rows([2, 3])));
- let row_ids = applier_factory(vec![], vec![(2, vec![(true, "World")])]).await;
+ let row_ids = applier_factory(vec![], vec![(2, vec![(true, "World")])], None).await;
assert_eq!(row_ids, Some(rows([2, 3])));
}
#[tokio::test]
- async fn test_fulltext_index_basic_chinese() {
+ async fn test_fulltext_index_basic_case_insensitive_bloom() {
let applier_factory = build_fulltext_applier_factory(
- "test_fulltext_index_basic_chinese_",
+ "test_fulltext_index_basic_case_insensitive_bloom_",
+ FulltextBackend::Bloom,
+ &[
+ (None, Some("hello"), None),
+ (None, None, None),
+ (None, Some("world"), None),
+ (None, Some("Hello, World"), None),
+ ],
+ )
+ .await;
+
+ let row_ids = applier_factory(
+ vec![],
+ vec![(2, vec![(false, "hello")])],
+ Some(BitVec::from_slice(&[0b1111])),
+ )
+ .await;
+ assert_eq!(row_ids, Some(rows([0, 3])));
+
+ let row_ids = applier_factory(
+ vec![],
+ vec![(2, vec![(false, "hello")])],
+ Some(BitVec::from_slice(&[0b1110])), // row 0 is filtered out
+ )
+ .await;
+ assert_eq!(row_ids, Some(rows([3])));
+
+ let row_ids = applier_factory(
+ vec![],
+ vec![(2, vec![(true, "hello")])],
+ Some(BitVec::from_slice(&[0b1111])),
+ )
+ .await;
+ assert_eq!(row_ids, Some(rows([0, 3])));
+
+ let row_ids = applier_factory(
+ vec![],
+ vec![(2, vec![(true, "hello")])],
+ Some(BitVec::from_slice(&[0b1110])), // row 0 is filtered out
+ )
+ .await;
+ assert_eq!(row_ids, Some(rows([3])));
+
+ let row_ids = applier_factory(
+ vec![],
+ vec![(2, vec![(false, "world")])],
+ Some(BitVec::from_slice(&[0b1111])),
+ )
+ .await;
+ assert_eq!(row_ids, Some(rows([2, 3])));
+
+ let row_ids = applier_factory(
+ vec![],
+ vec![(2, vec![(false, "world")])],
+ Some(BitVec::from_slice(&[0b1011])), // row 2 is filtered out
+ )
+ .await;
+ assert_eq!(row_ids, Some(rows([3])));
+
+ let row_ids = applier_factory(
+ vec![],
+ vec![(2, vec![(true, "world")])],
+ Some(BitVec::from_slice(&[0b1111])),
+ )
+ .await;
+ assert_eq!(row_ids, Some(rows([2, 3])));
+
+ let row_ids = applier_factory(
+ vec![],
+ vec![(2, vec![(true, "world")])],
+ Some(BitVec::from_slice(&[0b1011])), // row 2 is filtered out
+ )
+ .await;
+ assert_eq!(row_ids, Some(rows([3])));
+
+ let row_ids = applier_factory(
+ vec![],
+ vec![(2, vec![(false, "Hello")])],
+ Some(BitVec::from_slice(&[0b1111])),
+ )
+ .await;
+ assert_eq!(row_ids, Some(rows([0, 3])));
+
+ let row_ids = applier_factory(
+ vec![],
+ vec![(2, vec![(false, "Hello")])],
+ Some(BitVec::from_slice(&[0b0111])), // row 3 is filtered out
+ )
+ .await;
+ assert_eq!(row_ids, Some(rows([0])));
+
+ let row_ids = applier_factory(
+ vec![],
+ vec![(2, vec![(true, "Hello")])],
+ Some(BitVec::from_slice(&[0b1111])),
+ )
+ .await;
+ assert_eq!(row_ids, Some(rows([0, 3])));
+
+ let row_ids = applier_factory(
+ vec![],
+ vec![(2, vec![(true, "Hello")])],
+ Some(BitVec::from_slice(&[0b1110])), // row 0 is filtered out
+ )
+ .await;
+ assert_eq!(row_ids, Some(rows([3])));
+
+ let row_ids = applier_factory(
+ vec![],
+ vec![(2, vec![(false, "World")])],
+ Some(BitVec::from_slice(&[0b1111])),
+ )
+ .await;
+ assert_eq!(row_ids, Some(rows([2, 3])));
+
+ let row_ids = applier_factory(
+ vec![],
+ vec![(2, vec![(false, "World")])],
+ Some(BitVec::from_slice(&[0b0111])), // row 3 is filtered out
+ )
+ .await;
+ assert_eq!(row_ids, Some(rows([2])));
+
+ let row_ids = applier_factory(
+ vec![],
+ vec![(2, vec![(true, "World")])],
+ Some(BitVec::from_slice(&[0b1111])),
+ )
+ .await;
+ assert_eq!(row_ids, Some(rows([2, 3])));
+
+ let row_ids = applier_factory(
+ vec![],
+ vec![(2, vec![(true, "World")])],
+ Some(BitVec::from_slice(&[0b1011])), // row 2 is filtered out
+ )
+ .await;
+ assert_eq!(row_ids, Some(rows([3])));
+ }
+
+ #[tokio::test]
+ async fn test_fulltext_index_basic_chinese_tantivy() {
+ let applier_factory = build_fulltext_applier_factory(
+ "test_fulltext_index_basic_chinese_tantivy_",
+ FulltextBackend::Tantivy,
&[
(None, None, Some("你好")),
(None, None, None),
@@ -725,23 +1009,71 @@ mod tests {
)
.await;
- let row_ids = applier_factory(vec![(3, "你好")], vec![]).await;
+ let row_ids = applier_factory(vec![(3, "你好")], vec![], None).await;
assert_eq!(row_ids, Some(rows([0, 3])));
- let row_ids = applier_factory(vec![(3, "世界")], vec![]).await;
+ let row_ids = applier_factory(vec![(3, "世界")], vec![], None).await;
assert_eq!(row_ids, Some(rows([2, 3])));
- let row_ids = applier_factory(vec![], vec![(3, vec![(false, "你好")])]).await;
+ let row_ids = applier_factory(vec![], vec![(3, vec![(false, "你好")])], None).await;
assert_eq!(row_ids, Some(rows([0, 3])));
- let row_ids = applier_factory(vec![], vec![(3, vec![(false, "世界")])]).await;
+ let row_ids = applier_factory(vec![], vec![(3, vec![(false, "世界")])], None).await;
assert_eq!(row_ids, Some(rows([2, 3])));
}
#[tokio::test]
- async fn test_fulltext_index_multi_terms_case_sensitive() {
+ async fn test_fulltext_index_basic_chinese_bloom() {
let applier_factory = build_fulltext_applier_factory(
- "test_fulltext_index_multi_terms_case_sensitive_",
+ "test_fulltext_index_basic_chinese_bloom_",
+ FulltextBackend::Bloom,
+ &[
+ (None, None, Some("你好")),
+ (None, None, None),
+ (None, None, Some("世界")),
+ (None, None, Some("你好,世界")),
+ ],
+ )
+ .await;
+
+ let row_ids = applier_factory(
+ vec![],
+ vec![(3, vec![(false, "你好")])],
+ Some(BitVec::from_slice(&[0b1111])),
+ )
+ .await;
+ assert_eq!(row_ids, Some(rows([0, 3])));
+
+ let row_ids = applier_factory(
+ vec![],
+ vec![(3, vec![(false, "你好")])],
+ Some(BitVec::from_slice(&[0b1110])), // row 0 is filtered out
+ )
+ .await;
+ assert_eq!(row_ids, Some(rows([3])));
+
+ let row_ids = applier_factory(
+ vec![],
+ vec![(3, vec![(false, "世界")])],
+ Some(BitVec::from_slice(&[0b1111])),
+ )
+ .await;
+ assert_eq!(row_ids, Some(rows([2, 3])));
+
+ let row_ids = applier_factory(
+ vec![],
+ vec![(3, vec![(false, "世界")])],
+ Some(BitVec::from_slice(&[0b1011])), // row 2 is filtered out
+ )
+ .await;
+ assert_eq!(row_ids, Some(rows([3])));
+ }
+
+ #[tokio::test]
+ async fn test_fulltext_index_multi_terms_case_sensitive_tantivy() {
+ let applier_factory = build_fulltext_applier_factory(
+ "test_fulltext_index_multi_terms_case_sensitive_tantivy_",
+ FulltextBackend::Tantivy,
&[
(Some("Hello"), None, None),
(Some("World"), None, None),
@@ -751,31 +1083,107 @@ mod tests {
)
.await;
- let row_ids =
- applier_factory(vec![], vec![(1, vec![(false, "hello"), (false, "world")])]).await;
+ let row_ids = applier_factory(
+ vec![],
+ vec![(1, vec![(false, "hello"), (false, "world")])],
+ None,
+ )
+ .await;
assert_eq!(row_ids, Some(rows([])));
- let row_ids =
- applier_factory(vec![], vec![(1, vec![(false, "Hello"), (false, "World")])]).await;
+ let row_ids = applier_factory(
+ vec![],
+ vec![(1, vec![(false, "Hello"), (false, "World")])],
+ None,
+ )
+ .await;
assert_eq!(row_ids, Some(rows([3])));
- let row_ids =
- applier_factory(vec![], vec![(1, vec![(true, "Hello"), (false, "World")])]).await;
+ let row_ids = applier_factory(
+ vec![],
+ vec![(1, vec![(true, "Hello"), (false, "World")])],
+ None,
+ )
+ .await;
assert_eq!(row_ids, Some(rows([1, 3])));
- let row_ids =
- applier_factory(vec![], vec![(1, vec![(false, "Hello"), (true, "World")])]).await;
+ let row_ids = applier_factory(
+ vec![],
+ vec![(1, vec![(false, "Hello"), (true, "World")])],
+ None,
+ )
+ .await;
assert_eq!(row_ids, Some(rows([0, 3])));
- let row_ids =
- applier_factory(vec![], vec![(1, vec![(true, "Hello"), (true, "World")])]).await;
+ let row_ids = applier_factory(
+ vec![],
+ vec![(1, vec![(true, "Hello"), (true, "World")])],
+ None,
+ )
+ .await;
assert_eq!(row_ids, None);
}
#[tokio::test]
- async fn test_fulltext_index_multi_terms_case_insensitive() {
+ async fn test_fulltext_index_multi_terms_case_sensitive_bloom() {
let applier_factory = build_fulltext_applier_factory(
- "test_fulltext_index_multi_terms_case_insensitive_",
+ "test_fulltext_index_multi_terms_case_sensitive_bloom_",
+ FulltextBackend::Bloom,
+ &[
+ (Some("Hello"), None, None),
+ (Some("World"), None, None),
+ (None, None, None),
+ (Some("Hello, World"), None, None),
+ ],
+ )
+ .await;
+
+ let row_ids = applier_factory(
+ vec![],
+ vec![(1, vec![(false, "hello"), (false, "world")])],
+ Some(BitVec::from_slice(&[0b1111])),
+ )
+ .await;
+ assert_eq!(row_ids, Some(rows([])));
+
+ let row_ids = applier_factory(
+ vec![],
+ vec![(1, vec![(false, "Hello"), (false, "World")])],
+ Some(BitVec::from_slice(&[0b1111])),
+ )
+ .await;
+ assert_eq!(row_ids, Some(rows([3])));
+
+ let row_ids = applier_factory(
+ vec![],
+ vec![(1, vec![(true, "Hello"), (false, "World")])],
+ Some(BitVec::from_slice(&[0b1111])),
+ )
+ .await;
+ assert_eq!(row_ids, Some(rows([1, 3])));
+
+ let row_ids = applier_factory(
+ vec![],
+ vec![(1, vec![(false, "Hello"), (true, "World")])],
+ Some(BitVec::from_slice(&[0b1111])),
+ )
+ .await;
+ assert_eq!(row_ids, Some(rows([0, 3])));
+
+ let row_ids = applier_factory(
+ vec![],
+ vec![(1, vec![(true, "Hello"), (true, "World")])],
+ Some(BitVec::from_slice(&[0b1111])),
+ )
+ .await;
+ assert_eq!(row_ids, None);
+ }
+
+ #[tokio::test]
+ async fn test_fulltext_index_multi_terms_case_insensitive_tantivy() {
+ let applier_factory = build_fulltext_applier_factory(
+ "test_fulltext_index_multi_terms_case_insensitive_tantivy_",
+ FulltextBackend::Tantivy,
&[
(None, Some("hello"), None),
(None, None, None),
@@ -785,27 +1193,91 @@ mod tests {
)
.await;
- let row_ids =
- applier_factory(vec![], vec![(2, vec![(false, "hello"), (false, "world")])]).await;
+ let row_ids = applier_factory(
+ vec![],
+ vec![(2, vec![(false, "hello"), (false, "world")])],
+ None,
+ )
+ .await;
assert_eq!(row_ids, Some(rows([3])));
- let row_ids =
- applier_factory(vec![], vec![(2, vec![(true, "hello"), (false, "world")])]).await;
+ let row_ids = applier_factory(
+ vec![],
+ vec![(2, vec![(true, "hello"), (false, "world")])],
+ None,
+ )
+ .await;
assert_eq!(row_ids, Some(rows([3])));
- let row_ids =
- applier_factory(vec![], vec![(2, vec![(false, "hello"), (true, "world")])]).await;
+ let row_ids = applier_factory(
+ vec![],
+ vec![(2, vec![(false, "hello"), (true, "world")])],
+ None,
+ )
+ .await;
assert_eq!(row_ids, Some(rows([3])));
- let row_ids =
- applier_factory(vec![], vec![(2, vec![(true, "hello"), (true, "world")])]).await;
+ let row_ids = applier_factory(
+ vec![],
+ vec![(2, vec![(true, "hello"), (true, "world")])],
+ None,
+ )
+ .await;
assert_eq!(row_ids, Some(rows([3])));
}
#[tokio::test]
- async fn test_fulltext_index_multi_columns() {
+ async fn test_fulltext_index_multi_terms_case_insensitive_bloom() {
let applier_factory = build_fulltext_applier_factory(
- "test_fulltext_index_multi_columns_",
+ "test_fulltext_index_multi_terms_case_insensitive_bloom_",
+ FulltextBackend::Bloom,
+ &[
+ (None, Some("hello"), None),
+ (None, None, None),
+ (None, Some("world"), None),
+ (None, Some("Hello, World"), None),
+ ],
+ )
+ .await;
+
+ let row_ids = applier_factory(
+ vec![],
+ vec![(2, vec![(false, "hello"), (false, "world")])],
+ Some(BitVec::from_slice(&[0b1111])),
+ )
+ .await;
+ assert_eq!(row_ids, Some(rows([3])));
+
+ let row_ids = applier_factory(
+ vec![],
+ vec![(2, vec![(true, "hello"), (false, "world")])],
+ Some(BitVec::from_slice(&[0b1111])),
+ )
+ .await;
+ assert_eq!(row_ids, Some(rows([3])));
+
+ let row_ids = applier_factory(
+ vec![],
+ vec![(2, vec![(false, "hello"), (true, "world")])],
+ Some(BitVec::from_slice(&[0b1111])),
+ )
+ .await;
+ assert_eq!(row_ids, Some(rows([3])));
+
+ let row_ids = applier_factory(
+ vec![],
+ vec![(2, vec![(true, "hello"), (true, "world")])],
+ Some(BitVec::from_slice(&[0b1111])),
+ )
+ .await;
+ assert_eq!(row_ids, Some(rows([3])));
+ }
+
+ #[tokio::test]
+ async fn test_fulltext_index_multi_columns_tantivy() {
+ let applier_factory = build_fulltext_applier_factory(
+ "test_fulltext_index_multi_columns_tantivy_",
+ FulltextBackend::Tantivy,
&[
(Some("Hello"), None, Some("你好")),
(Some("World"), Some("world"), None),
@@ -822,11 +1294,52 @@ mod tests {
let row_ids = applier_factory(
vec![(1, "Hello"), (3, "你好")],
vec![(2, vec![(false, "world")])],
+ None,
)
.await;
assert_eq!(row_ids, Some(rows([3])));
- let row_ids = applier_factory(vec![(2, "World")], vec![(1, vec![(false, "World")])]).await;
+ let row_ids =
+ applier_factory(vec![(2, "World")], vec![(1, vec![(false, "World")])], None).await;
+ assert_eq!(row_ids, Some(rows([1, 3])));
+ }
+
+ #[tokio::test]
+ async fn test_fulltext_index_multi_columns_bloom() {
+ let applier_factory = build_fulltext_applier_factory(
+ "test_fulltext_index_multi_columns_bloom_",
+ FulltextBackend::Bloom,
+ &[
+ (Some("Hello"), None, Some("你好")),
+ (Some("World"), Some("world"), None),
+ (None, Some("World"), Some("世界")),
+ (
+ Some("Hello, World"),
+ Some("Hello, World"),
+ Some("你好,世界"),
+ ),
+ ],
+ )
+ .await;
+
+ let row_ids = applier_factory(
+ vec![],
+ vec![
+ (1, vec![(false, "Hello")]),
+ (2, vec![(false, "world")]),
+ (3, vec![(false, "你好")]),
+ ],
+ Some(BitVec::from_slice(&[0b1111])),
+ )
+ .await;
+ assert_eq!(row_ids, Some(rows([3])));
+
+ let row_ids = applier_factory(
+ vec![],
+ vec![(1, vec![(false, "World")]), (2, vec![(false, "World")])],
+ Some(BitVec::from_slice(&[0b1111])),
+ )
+ .await;
assert_eq!(row_ids, Some(rows([1, 3])));
}
}
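A sketch of how the bloom-backend tests above map a per-row mask onto apply_coarse input: the index is built with one row per row group, so bit i of coarse_mask decides whether row group i (i.e. row i) is searched. This assumes common_base::BitVec behaves like the bitvec crate's BitVec<u8, Lsb0>, which matches the "0b1110 => row 0 is filtered out" comments in the tests.

    // Expanding the per-row mask used by the bloom-backend tests into the
    // (row_group_length, to_search) pairs fed to apply_coarse. Every row group holds
    // exactly one row, so bit i selects row i.
    use bitvec::prelude::*;

    fn mask_to_row_groups(mask: &BitVec<u8, Lsb0>) -> Vec<(usize, bool)> {
        (0..mask.len()).map(|i| (1, mask[i])).collect()
    }

    fn main() {
        let mask = BitVec::<u8, Lsb0>::from_slice(&[0b1110u8]);
        let row_groups = mask_to_row_groups(&mask);
        // Row 0 is excluded from the search; rows 1..=3 are searched.
        assert_eq!(
            row_groups[..4].to_vec(),
            vec![(1, false), (1, true), (1, true), (1, true)]
        );
    }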
diff --git a/src/mito2/src/sst/parquet/reader.rs b/src/mito2/src/sst/parquet/reader.rs
index 069e10344c..5c2ab17591 100644
--- a/src/mito2/src/sst/parquet/reader.rs
+++ b/src/mito2/src/sst/parquet/reader.rs
@@ -369,6 +369,9 @@ impl ParquetReaderBuilder {
self.prune_row_groups_by_bloom_filter(parquet_meta, &mut output, metrics)
.await;
+ self.prune_row_groups_by_fulltext_bloom(parquet_meta, &mut output, metrics)
+ .await;
+
output
}
@@ -389,7 +392,7 @@ impl ParquetReaderBuilder {
let file_size_hint = self.file_handle.meta_ref().index_file_size();
let apply_res = match index_applier
- .apply(self.file_handle.file_id(), Some(file_size_hint))
+ .apply_fine(self.file_handle.file_id(), Some(file_size_hint))
.await
{
Ok(Some(res)) => res,
@@ -631,6 +634,67 @@ impl ParquetReaderBuilder {
true
}
+ async fn prune_row_groups_by_fulltext_bloom(
+ &self,
+ parquet_meta: &ParquetMetaData,
+ output: &mut BTreeMap>,
+ metrics: &mut ReaderFilterMetrics,
+ ) -> bool {
+ let Some(index_applier) = &self.fulltext_index_applier else {
+ return false;
+ };
+
+ if !self.file_handle.meta_ref().fulltext_index_available() {
+ return false;
+ }
+
+ let file_size_hint = self.file_handle.meta_ref().index_file_size();
+ let apply_output = match index_applier
+ .apply_coarse(
+ self.file_handle.file_id(),
+ Some(file_size_hint),
+ parquet_meta
+ .row_groups()
+ .iter()
+ .enumerate()
+ .map(|(i, rg)| (rg.num_rows() as usize, output.contains_key(&i))),
+ )
+ .await
+ {
+ Ok(Some(apply_output)) => apply_output,
+ Ok(None) => return false,
+ Err(err) => {
+ if cfg!(any(test, feature = "test")) {
+ panic!(
+ "Failed to apply fulltext index, region_id: {}, file_id: {}, err: {:?}",
+ self.file_handle.region_id(),
+ self.file_handle.file_id(),
+ err
+ );
+ } else {
+ warn!(
+ err; "Failed to apply fulltext index, region_id: {}, file_id: {}",
+ self.file_handle.region_id(), self.file_handle.file_id()
+ );
+ }
+
+ return false;
+ }
+ };
+
+ Self::prune_row_groups_by_ranges(
+ parquet_meta,
+ apply_output
+ .into_iter()
+ .map(|(rg, ranges)| (rg, ranges.into_iter())),
+ output,
+ &mut metrics.rg_fulltext_filtered,
+ &mut metrics.rows_fulltext_filtered,
+ );
+
+ true
+ }
+
/// Prunes row groups by rows. The `rows_in_row_groups` is like a map from row group to
/// a list of row ids to keep.
fn prune_row_groups_by_rows(
From 7ddd7a988855d759c69ab6b6a375556ec162cb46 Mon Sep 17 00:00:00 2001
From: yihong
Date: Mon, 14 Apr 2025 15:13:40 +0800
Subject: [PATCH 14/82] fix: flaky test on windows (#5890)
Signed-off-by: yihong0618
---
src/common/error/tests/ext.rs | 8 ++++++--
1 file changed, 6 insertions(+), 2 deletions(-)
diff --git a/src/common/error/tests/ext.rs b/src/common/error/tests/ext.rs
index 0a39ed51c6..26eaf2d5e2 100644
--- a/src/common/error/tests/ext.rs
+++ b/src/common/error/tests/ext.rs
@@ -87,15 +87,19 @@ fn test_to_string() {
#[test]
fn test_debug_format() {
let result = normal_error();
+ let debug_output = format!("{:?}", result.unwrap_err());
+ let normalized_output = debug_output.replace('\\', "/");
assert_eq!(
- format!("{:?}", result.unwrap_err()),
+ normalized_output,
r#"0: A normal error with "display" attribute, message "blabla", at src/common/error/tests/ext.rs:55:22
1: PlainError { msg: "", status_code: Unexpected }"#
);
let result = transparent_error();
+ let debug_output = format!("{:?}", result.unwrap_err());
+ let normalized_output = debug_output.replace('\\', "/");
assert_eq!(
- format!("{:?}", result.unwrap_err()),
+ normalized_output,
r#"0: , at src/common/error/tests/ext.rs:60:5
1: PlainError { msg: "", status_code: Unexpected }"#
);
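A minimal sketch of the normalization the test fix above relies on: replace Windows path separators in the debug output before comparing it against a fixed, slash-separated expectation.

    // Normalize Windows path separators so path-bearing strings compare equally on all platforms.
    fn normalize_separators(s: &str) -> String {
        s.replace('\\', "/")
    }

    fn main() {
        let windows_style = r"src\common\error\tests\ext.rs:55:22";
        assert_eq!(
            normalize_separators(windows_style),
            "src/common/error/tests/ext.rs:55:22"
        );
    }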
From c522893552e8660e383a04d4d5919e9dbb876fd0 Mon Sep 17 00:00:00 2001
From: Weny Xu
Date: Mon, 14 Apr 2025 20:37:31 +0800
Subject: [PATCH 15/82] fix: ensure logical regions are synced during region
sync (#5878)
* fix: ensure logical regions are synced during region sync
* chore: apply suggestions from CR
* chore: apply suggestions from CR
---
src/datanode/src/region_server.rs | 33 ++-
src/datanode/src/tests.rs | 4 +-
src/file-engine/src/engine.rs | 6 +-
src/metric-engine/src/engine.rs | 42 +--
src/metric-engine/src/engine/alter.rs | 2 +-
src/metric-engine/src/engine/create.rs | 13 +-
src/metric-engine/src/engine/open.rs | 20 +-
.../src/engine/region_metadata.rs | 2 +-
src/metric-engine/src/engine/sync.rs | 261 ++++++++++++++++++
src/metric-engine/src/error.rs | 11 +-
src/metric-engine/src/metadata_region.rs | 19 +-
src/metric-engine/src/test_util.rs | 31 ++-
src/mito2/src/engine.rs | 14 +-
src/mito2/src/manifest/manager.rs | 7 +-
src/mito2/src/manifest/storage.rs | 10 +-
src/mito2/src/request.rs | 5 +-
src/mito2/src/worker/handle_manifest.rs | 4 +-
src/query/src/optimizer/test_util.rs | 4 +-
src/store-api/src/region_engine.rs | 58 +++-
19 files changed, 470 insertions(+), 76 deletions(-)
create mode 100644 src/metric-engine/src/engine/sync.rs
diff --git a/src/datanode/src/region_server.rs b/src/datanode/src/region_server.rs
index c05291050c..14ccfa2816 100644
--- a/src/datanode/src/region_server.rs
+++ b/src/datanode/src/region_server.rs
@@ -308,20 +308,36 @@ impl RegionServer {
.with_context(|_| HandleRegionRequestSnafu { region_id })
}
+ /// Syncs the region manifest and registers newly opened logical regions.
pub async fn sync_region_manifest(
&self,
region_id: RegionId,
manifest_info: RegionManifestInfo,
) -> Result<()> {
- let engine = self
+ let engine_with_status = self
.inner
.region_map
.get(®ion_id)
.with_context(|| RegionNotFoundSnafu { region_id })?;
- engine
+
+ let Some(new_opened_regions) = engine_with_status
.sync_region(region_id, manifest_info)
.await
- .with_context(|_| HandleRegionRequestSnafu { region_id })
+ .with_context(|_| HandleRegionRequestSnafu { region_id })?
+ .new_opened_logical_region_ids()
+ else {
+ return Ok(());
+ };
+
+ for region in new_opened_regions {
+ self.inner.region_map.insert(
+ region,
+ RegionEngineWithStatus::Ready(engine_with_status.engine().clone()),
+ );
+ info!("Logical region {} is registered!", region);
+ }
+
+ Ok(())
}
/// Set region role state gracefully.
@@ -526,6 +542,15 @@ impl RegionEngineWithStatus {
RegionEngineWithStatus::Ready(engine) => engine,
}
}
+
+ /// Returns [RegionEngineRef] reference.
+ pub fn engine(&self) -> &RegionEngineRef {
+ match self {
+ RegionEngineWithStatus::Registering(engine) => engine,
+ RegionEngineWithStatus::Deregistering(engine) => engine,
+ RegionEngineWithStatus::Ready(engine) => engine,
+ }
+ }
}
impl Deref for RegionEngineWithStatus {
@@ -1029,7 +1054,7 @@ impl RegionServerInner {
for region in logical_regions {
self.region_map
.insert(region, RegionEngineWithStatus::Ready(engine.clone()));
- debug!("Logical region {} is registered!", region);
+ info!("Logical region {} is registered!", region);
}
Ok(())
}
diff --git a/src/datanode/src/tests.rs b/src/datanode/src/tests.rs
index b349024cc9..f182e1c423 100644
--- a/src/datanode/src/tests.rs
+++ b/src/datanode/src/tests.rs
@@ -33,7 +33,7 @@ use session::context::QueryContextRef;
use store_api::metadata::RegionMetadataRef;
use store_api::region_engine::{
RegionEngine, RegionManifestInfo, RegionRole, RegionScannerRef, RegionStatistic,
- SetRegionRoleStateResponse, SettableRegionRoleState,
+ SetRegionRoleStateResponse, SettableRegionRoleState, SyncManifestResponse,
};
use store_api::region_request::{AffectedRows, RegionRequest};
use store_api::storage::{RegionId, ScanRequest, SequenceNumber};
@@ -250,7 +250,7 @@ impl RegionEngine for MockRegionEngine {
&self,
_region_id: RegionId,
_manifest_info: RegionManifestInfo,
- ) -> Result<(), BoxedError> {
+ ) -> Result<SyncManifestResponse, BoxedError> {
unimplemented!()
}
diff --git a/src/file-engine/src/engine.rs b/src/file-engine/src/engine.rs
index cf5e5c7576..09a373caad 100644
--- a/src/file-engine/src/engine.rs
+++ b/src/file-engine/src/engine.rs
@@ -28,7 +28,7 @@ use store_api::metadata::RegionMetadataRef;
use store_api::region_engine::{
RegionEngine, RegionManifestInfo, RegionRole, RegionScannerRef, RegionStatistic,
SetRegionRoleStateResponse, SetRegionRoleStateSuccess, SettableRegionRoleState,
- SinglePartitionScanner,
+ SinglePartitionScanner, SyncManifestResponse,
};
use store_api::region_request::{
AffectedRows, RegionCloseRequest, RegionCreateRequest, RegionDropRequest, RegionOpenRequest,
@@ -145,9 +145,9 @@ impl RegionEngine for FileRegionEngine {
&self,
_region_id: RegionId,
_manifest_info: RegionManifestInfo,
- ) -> Result<(), BoxedError> {
+ ) -> Result<SyncManifestResponse, BoxedError> {
// File engine doesn't need to sync region manifest.
- Ok(())
+ Ok(SyncManifestResponse::NotSupported)
}
fn role(&self, region_id: RegionId) -> Option<RegionRole> {
diff --git a/src/metric-engine/src/engine.rs b/src/metric-engine/src/engine.rs
index 74978fda78..509438b4b2 100644
--- a/src/metric-engine/src/engine.rs
+++ b/src/metric-engine/src/engine.rs
@@ -24,6 +24,7 @@ mod put;
mod read;
mod region_metadata;
mod state;
+mod sync;
use std::any::Any;
use std::collections::HashMap;
@@ -41,6 +42,7 @@ use store_api::metric_engine_consts::METRIC_ENGINE_NAME;
use store_api::region_engine::{
RegionEngine, RegionManifestInfo, RegionRole, RegionScannerRef, RegionStatistic,
SetRegionRoleStateResponse, SetRegionRoleStateSuccess, SettableRegionRoleState,
+ SyncManifestResponse,
};
use store_api::region_request::{BatchRegionDdlRequest, RegionRequest};
use store_api::storage::{RegionId, ScanRequest, SequenceNumber};
@@ -48,7 +50,7 @@ use store_api::storage::{RegionId, ScanRequest, SequenceNumber};
use self::state::MetricEngineState;
use crate::config::EngineConfig;
use crate::data_region::DataRegion;
-use crate::error::{self, MetricManifestInfoSnafu, Result, UnsupportedRegionRequestSnafu};
+use crate::error::{self, Result, UnsupportedRegionRequestSnafu};
use crate::metadata_region::MetadataRegion;
use crate::row_modifier::RowModifier;
use crate::utils;
@@ -311,40 +313,11 @@ impl RegionEngine for MetricEngine {
&self,
region_id: RegionId,
manifest_info: RegionManifestInfo,
- ) -> Result<(), BoxedError> {
- if !manifest_info.is_metric() {
- return Err(BoxedError::new(
- MetricManifestInfoSnafu { region_id }.build(),
- ));
- }
-
- let metadata_region_id = utils::to_metadata_region_id(region_id);
- // checked by ensure above
- let metadata_manifest_version = manifest_info
- .metadata_manifest_version()
- .unwrap_or_default();
- let metadata_flushed_entry_id = manifest_info
- .metadata_flushed_entry_id()
- .unwrap_or_default();
- let metadata_region_manifest =
- RegionManifestInfo::mito(metadata_manifest_version, metadata_flushed_entry_id);
+ ) -> Result<SyncManifestResponse, BoxedError> {
self.inner
- .mito
- .sync_region(metadata_region_id, metadata_region_manifest)
- .await?;
-
- let data_region_id = utils::to_data_region_id(region_id);
- let data_manifest_version = manifest_info.data_manifest_version();
- let data_flushed_entry_id = manifest_info.data_flushed_entry_id();
- let data_region_manifest =
- RegionManifestInfo::mito(data_manifest_version, data_flushed_entry_id);
-
- self.inner
- .mito
- .sync_region(data_region_id, data_region_manifest)
- .await?;
-
- Ok(())
+ .sync_region(region_id, manifest_info)
+ .await
+ .map_err(BoxedError::new)
}
async fn set_region_role_state_gracefully(
@@ -423,6 +396,7 @@ impl MetricEngine {
self.inner.mito.clone()
}
+ /// Returns all logical regions associated with the physical region.
pub async fn logical_regions(&self, physical_region_id: RegionId) -> Result<Vec<RegionId>> {
self.inner
.metadata_region
diff --git a/src/metric-engine/src/engine/alter.rs b/src/metric-engine/src/engine/alter.rs
index 0b23a80bfd..1d82149a7d 100644
--- a/src/metric-engine/src/engine/alter.rs
+++ b/src/metric-engine/src/engine/alter.rs
@@ -145,7 +145,7 @@ impl MetricEngineInner {
let _write_guard = self
.metadata_region
.write_lock_logical_region(*region_id)
- .await;
+ .await?;
write_guards.insert(*region_id, _write_guard);
}
diff --git a/src/metric-engine/src/engine/create.rs b/src/metric-engine/src/engine/create.rs
index bfb7737df7..1ceb20d206 100644
--- a/src/metric-engine/src/engine/create.rs
+++ b/src/metric-engine/src/engine/create.rs
@@ -279,9 +279,16 @@ impl MetricEngineInner {
.add_logical_regions(physical_region_id, true, logical_region_columns)
.await?;
- let mut state = self.state.write().unwrap();
- state.add_physical_columns(data_region_id, new_add_columns);
- state.add_logical_regions(physical_region_id, logical_regions);
+ {
+ let mut state = self.state.write().unwrap();
+ state.add_physical_columns(data_region_id, new_add_columns);
+ state.add_logical_regions(physical_region_id, logical_regions.clone());
+ }
+ for logical_region_id in logical_regions {
+ self.metadata_region
+ .open_logical_region(logical_region_id)
+ .await;
+ }
Ok(())
}
diff --git a/src/metric-engine/src/engine/open.rs b/src/metric-engine/src/engine/open.rs
index eb9f266be2..4b25cf38f2 100644
--- a/src/metric-engine/src/engine/open.rs
+++ b/src/metric-engine/src/engine/open.rs
@@ -132,12 +132,14 @@ impl MetricEngineInner {
/// Includes:
/// - Record physical region's column names
/// - Record the mapping between logical region id and physical region id
+ ///
+ /// Returns the newly opened logical region ids.
pub(crate) async fn recover_states(
&self,
physical_region_id: RegionId,
primary_key_encoding: PrimaryKeyEncoding,
physical_region_options: PhysicalRegionOptions,
- ) -> Result<()> {
+ ) -> Result<Vec<RegionId>> {
// load logical regions and physical column names
let logical_regions = self
.metadata_region
@@ -147,7 +149,6 @@ impl MetricEngineInner {
.data_region
.physical_columns(physical_region_id)
.await?;
- let logical_region_num = logical_regions.len();
{
let mut state = self.state.write().unwrap();
@@ -168,15 +169,22 @@ impl MetricEngineInner {
}
}
+ let mut opened_logical_region_ids = Vec::new();
+ // `recover_states` may be called multiple times, so we only count the logical regions
+ // that are opened for the first time.
for logical_region_id in logical_regions {
- self.metadata_region
+ if self
+ .metadata_region
.open_logical_region(logical_region_id)
- .await;
+ .await
+ {
+ opened_logical_region_ids.push(logical_region_id);
+ }
}
- LOGICAL_REGION_COUNT.add(logical_region_num as i64);
+ LOGICAL_REGION_COUNT.add(opened_logical_region_ids.len() as i64);
- Ok(())
+ Ok(opened_logical_region_ids)
}
}
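A standalone sketch of the first-time-only bookkeeping that recover_states relies on: open_logical_region reports whether a logical region was opened for the first time (see the Entry-based change to MetadataRegion later in this patch), so repeated recovery passes count each region only once. Types are simplified; the real code stores a lock per region.

    use std::collections::hash_map::Entry;
    use std::collections::HashMap;

    type RegionId = u64;

    #[derive(Default)]
    struct LogicalRegions {
        // The real code stores a lock per logical region; () is enough for the sketch.
        opened: HashMap<RegionId, ()>,
    }

    impl LogicalRegions {
        /// Returns true only on the first open of `region_id`.
        fn open(&mut self, region_id: RegionId) -> bool {
            match self.opened.entry(region_id) {
                Entry::Occupied(_) => false,
                Entry::Vacant(vacant) => {
                    vacant.insert(());
                    true
                }
            }
        }
    }

    fn main() {
        let mut regions = LogicalRegions::default();
        let first_pass: Vec<RegionId> = [1, 2].into_iter().filter(|&r| regions.open(r)).collect();
        let second_pass: Vec<RegionId> = [1, 2, 3].into_iter().filter(|&r| regions.open(r)).collect();
        assert_eq!(first_pass, vec![1, 2]); // both are newly opened
        assert_eq!(second_pass, vec![3]); // only the new region is reported
    }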
diff --git a/src/metric-engine/src/engine/region_metadata.rs b/src/metric-engine/src/engine/region_metadata.rs
index 9f00235e96..f8e0dd8dc3 100644
--- a/src/metric-engine/src/engine/region_metadata.rs
+++ b/src/metric-engine/src/engine/region_metadata.rs
@@ -46,7 +46,7 @@ impl MetricEngineInner {
let _read_guard = self
.metadata_region
.read_lock_logical_region(logical_region_id)
- .await;
+ .await?;
// Load logical and physical columns, and intersect them to get logical column metadata.
let logical_column_metadata = self
.metadata_region
diff --git a/src/metric-engine/src/engine/sync.rs b/src/metric-engine/src/engine/sync.rs
new file mode 100644
index 0000000000..fe0d8ef6d0
--- /dev/null
+++ b/src/metric-engine/src/engine/sync.rs
@@ -0,0 +1,261 @@
+// Copyright 2023 Greptime Team
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+use std::time::Instant;
+
+use common_telemetry::info;
+use snafu::{ensure, OptionExt, ResultExt};
+use store_api::region_engine::{RegionEngine, RegionManifestInfo, SyncManifestResponse};
+use store_api::storage::RegionId;
+
+use crate::engine::MetricEngineInner;
+use crate::error::{
+ MetricManifestInfoSnafu, MitoSyncOperationSnafu, PhysicalRegionNotFoundSnafu, Result,
+};
+use crate::utils;
+
+impl MetricEngineInner {
+ pub async fn sync_region(
+ &self,
+ region_id: RegionId,
+ manifest_info: RegionManifestInfo,
+ ) -> Result<SyncManifestResponse> {
+ ensure!(
+ manifest_info.is_metric(),
+ MetricManifestInfoSnafu { region_id }
+ );
+
+ let metadata_region_id = utils::to_metadata_region_id(region_id);
+ // checked by ensure above
+ let metadata_manifest_version = manifest_info
+ .metadata_manifest_version()
+ .unwrap_or_default();
+ let metadata_flushed_entry_id = manifest_info
+ .metadata_flushed_entry_id()
+ .unwrap_or_default();
+ let metadata_region_manifest =
+ RegionManifestInfo::mito(metadata_manifest_version, metadata_flushed_entry_id);
+ let metadata_synced = self
+ .mito
+ .sync_region(metadata_region_id, metadata_region_manifest)
+ .await
+ .context(MitoSyncOperationSnafu)?
+ .is_data_synced();
+
+ let data_region_id = utils::to_data_region_id(region_id);
+ let data_manifest_version = manifest_info.data_manifest_version();
+ let data_flushed_entry_id = manifest_info.data_flushed_entry_id();
+ let data_region_manifest =
+ RegionManifestInfo::mito(data_manifest_version, data_flushed_entry_id);
+
+ let data_synced = self
+ .mito
+ .sync_region(data_region_id, data_region_manifest)
+ .await
+ .context(MitoSyncOperationSnafu)?
+ .is_data_synced();
+
+ if !metadata_synced {
+ return Ok(SyncManifestResponse::Metric {
+ metadata_synced,
+ data_synced,
+ new_opened_logical_region_ids: vec![],
+ });
+ }
+
+ let now = Instant::now();
+ // Recovers the states from the metadata region
+ // if the metadata manifest version is updated.
+ let physical_region_options = *self
+ .state
+ .read()
+ .unwrap()
+ .physical_region_states()
+ .get(&data_region_id)
+ .context(PhysicalRegionNotFoundSnafu {
+ region_id: data_region_id,
+ })?
+ .options();
+ let primary_key_encoding = self.mito.get_primary_key_encoding(data_region_id).context(
+ PhysicalRegionNotFoundSnafu {
+ region_id: data_region_id,
+ },
+ )?;
+ let new_opened_logical_region_ids = self
+ .recover_states(
+ data_region_id,
+ primary_key_encoding,
+ physical_region_options,
+ )
+ .await?;
+ info!(
+ "Sync metadata region for physical region {}, cost: {:?}, new opened logical region ids: {:?}",
+ data_region_id,
+ now.elapsed(),
+ new_opened_logical_region_ids
+ );
+
+ Ok(SyncManifestResponse::Metric {
+ metadata_synced,
+ data_synced,
+ new_opened_logical_region_ids,
+ })
+ }
+}
+
+#[cfg(test)]
+mod tests {
+ use std::collections::HashMap;
+
+ use api::v1::SemanticType;
+ use common_telemetry::info;
+ use datatypes::data_type::ConcreteDataType;
+ use datatypes::schema::ColumnSchema;
+ use store_api::metadata::ColumnMetadata;
+ use store_api::region_engine::{RegionEngine, RegionManifestInfo};
+ use store_api::region_request::{
+ AddColumn, AlterKind, RegionAlterRequest, RegionFlushRequest, RegionRequest,
+ };
+ use store_api::storage::RegionId;
+
+ use crate::metadata_region::MetadataRegion;
+ use crate::test_util::TestEnv;
+
+ #[tokio::test]
+ async fn test_sync_region_with_new_created_logical_regions() {
+ common_telemetry::init_default_ut_logging();
+ let mut env = TestEnv::with_prefix("sync_with_new_created_logical_regions").await;
+ env.init_metric_region().await;
+
+ info!("creating follower engine");
+ // Create a follower engine.
+ let (_follower_mito, follower_metric) = env.create_follower_engine().await;
+
+ let physical_region_id = env.default_physical_region_id();
+
+ // Flushes the physical region
+ let metric_engine = env.metric();
+ metric_engine
+ .handle_request(
+ env.default_physical_region_id(),
+ RegionRequest::Flush(RegionFlushRequest::default()),
+ )
+ .await
+ .unwrap();
+
+ let response = follower_metric
+ .sync_region(physical_region_id, RegionManifestInfo::metric(1, 0, 1, 0))
+ .await
+ .unwrap();
+ assert!(response.is_metric());
+ let new_opened_logical_region_ids = response.new_opened_logical_region_ids().unwrap();
+ assert_eq!(new_opened_logical_region_ids, vec![RegionId::new(3, 2)]);
+
+ // Sync again, no new logical region should be opened
+ let response = follower_metric
+ .sync_region(physical_region_id, RegionManifestInfo::metric(1, 0, 1, 0))
+ .await
+ .unwrap();
+ assert!(response.is_metric());
+ let new_opened_logical_region_ids = response.new_opened_logical_region_ids().unwrap();
+ assert!(new_opened_logical_region_ids.is_empty());
+ }
+
+ fn test_alter_logical_region_request() -> RegionAlterRequest {
+ RegionAlterRequest {
+ kind: AlterKind::AddColumns {
+ columns: vec![AddColumn {
+ column_metadata: ColumnMetadata {
+ column_id: 0,
+ semantic_type: SemanticType::Tag,
+ column_schema: ColumnSchema::new(
+ "tag1",
+ ConcreteDataType::string_datatype(),
+ false,
+ ),
+ },
+ location: None,
+ }],
+ },
+ }
+ }
+
+ #[tokio::test]
+ async fn test_sync_region_alter_alter_logical_region() {
+ common_telemetry::init_default_ut_logging();
+ let mut env = TestEnv::with_prefix("sync_region_alter_alter_logical_region").await;
+ env.init_metric_region().await;
+
+ info!("creating follower engine");
+ let physical_region_id = env.default_physical_region_id();
+ // Flushes the physical region
+ let metric_engine = env.metric();
+ metric_engine
+ .handle_request(
+ env.default_physical_region_id(),
+ RegionRequest::Flush(RegionFlushRequest::default()),
+ )
+ .await
+ .unwrap();
+
+ // Create a follower engine.
+ let (follower_mito, follower_metric) = env.create_follower_engine().await;
+ let metric_engine = env.metric();
+ let engine_inner = env.metric().inner;
+ let region_id = env.default_logical_region_id();
+ let request = test_alter_logical_region_request();
+
+ engine_inner
+ .alter_logical_regions(
+ physical_region_id,
+ vec![(region_id, request)],
+ &mut HashMap::new(),
+ )
+ .await
+ .unwrap();
+
+ // Flushes the physical region
+ metric_engine
+ .handle_request(
+ env.default_physical_region_id(),
+ RegionRequest::Flush(RegionFlushRequest::default()),
+ )
+ .await
+ .unwrap();
+
+ // Sync the follower engine
+ let response = follower_metric
+ .sync_region(physical_region_id, RegionManifestInfo::metric(2, 0, 2, 0))
+ .await
+ .unwrap();
+ assert!(response.is_metric());
+ let new_opened_logical_region_ids = response.new_opened_logical_region_ids().unwrap();
+ assert!(new_opened_logical_region_ids.is_empty());
+
+ let logical_region_id = env.default_logical_region_id();
+ let metadata_region = MetadataRegion::new(follower_mito.clone());
+ let semantic_type = metadata_region
+ .column_semantic_type(physical_region_id, logical_region_id, "tag1")
+ .await
+ .unwrap()
+ .unwrap();
+ assert_eq!(semantic_type, SemanticType::Tag);
+ let timestamp_index = metadata_region
+ .column_semantic_type(physical_region_id, logical_region_id, "greptime_timestamp")
+ .await
+ .unwrap()
+ .unwrap();
+ assert_eq!(timestamp_index, SemanticType::Timestamp);
+ }
+}
diff --git a/src/metric-engine/src/error.rs b/src/metric-engine/src/error.rs
index 8be535ec9f..5f853037e1 100644
--- a/src/metric-engine/src/error.rs
+++ b/src/metric-engine/src/error.rs
@@ -118,6 +118,7 @@ pub enum Error {
#[snafu(implicit)]
location: Location,
},
+
#[snafu(display("Mito delete operation fails"))]
MitoDeleteOperation {
source: BoxedError,
@@ -132,6 +133,13 @@ pub enum Error {
location: Location,
},
+ #[snafu(display("Mito sync operation fails"))]
+ MitoSyncOperation {
+ source: BoxedError,
+ #[snafu(implicit)]
+ location: Location,
+ },
+
#[snafu(display("Failed to collect record batch stream"))]
CollectRecordBatchStream {
source: common_recordbatch::error::Error,
@@ -311,7 +319,8 @@ impl ErrorExt for Error {
| MitoWriteOperation { source, .. }
| MitoCatchupOperation { source, .. }
| MitoFlushOperation { source, .. }
- | MitoDeleteOperation { source, .. } => source.status_code(),
+ | MitoDeleteOperation { source, .. }
+ | MitoSyncOperation { source, .. } => source.status_code(),
EncodePrimaryKey { source, .. } => source.status_code(),
diff --git a/src/metric-engine/src/metadata_region.rs b/src/metric-engine/src/metadata_region.rs
index 2b066a0bde..7e7bae095f 100644
--- a/src/metric-engine/src/metadata_region.rs
+++ b/src/metric-engine/src/metadata_region.rs
@@ -12,6 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
+use std::collections::hash_map::Entry;
use std::collections::HashMap;
use std::sync::Arc;
@@ -76,11 +77,22 @@ impl MetadataRegion {
}
}
- pub async fn open_logical_region(&self, logical_region_id: RegionId) {
- self.logical_region_lock
+ /// Open a logical region.
+ ///
+ /// Returns true if the logical region is opened for the first time.
+ pub async fn open_logical_region(&self, logical_region_id: RegionId) -> bool {
+ match self
+ .logical_region_lock
.write()
.await
- .insert(logical_region_id, Arc::new(RwLock::new(())));
+ .entry(logical_region_id)
+ {
+ Entry::Occupied(_) => false,
+ Entry::Vacant(vacant_entry) => {
+ vacant_entry.insert(Arc::new(RwLock::new(())));
+ true
+ }
+ }
}
/// Retrieve a read lock guard of given logical region id.
@@ -178,6 +190,7 @@ impl MetadataRegion {
Ok(columns)
}
+ /// Return all logical regions associated with the physical region.
pub async fn logical_regions(&self, physical_region_id: RegionId) -> Result<Vec<RegionId>> {
let metadata_region_id = utils::to_metadata_region_id(physical_region_id);
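[Editor's note] A minimal, self-contained sketch of the entry-based idempotence that `open_logical_region` now relies on; the map type and key here are simplified stand-ins, not the engine's real types. The first caller to register an id gets `true`, later callers get `false`:

```rust
use std::collections::hash_map::Entry;
use std::collections::HashMap;

/// Returns true only on the first registration of `id`, mirroring the
/// "opened for the first time" contract of `open_logical_region` above.
fn open_once(opened: &mut HashMap<u64, ()>, id: u64) -> bool {
    match opened.entry(id) {
        Entry::Occupied(_) => false,
        Entry::Vacant(vacant) => {
            vacant.insert(());
            true
        }
    }
}

fn main() {
    let mut opened = HashMap::new();
    assert!(open_once(&mut opened, 2)); // first open
    assert!(!open_once(&mut opened, 2)); // already opened, not counted again
}
```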
diff --git a/src/metric-engine/src/test_util.rs b/src/metric-engine/src/test_util.rs
index 284834a029..6bcc002908 100644
--- a/src/metric-engine/src/test_util.rs
+++ b/src/metric-engine/src/test_util.rs
@@ -16,6 +16,7 @@
use api::v1::value::ValueData;
use api::v1::{ColumnDataType, ColumnSchema as PbColumnSchema, Row, SemanticType, Value};
+use common_telemetry::debug;
use datatypes::prelude::ConcreteDataType;
use datatypes::schema::ColumnSchema;
use mito2::config::MitoConfig;
@@ -28,7 +29,7 @@ use store_api::metric_engine_consts::{
};
use store_api::region_engine::RegionEngine;
use store_api::region_request::{
- AddColumn, AlterKind, RegionAlterRequest, RegionCreateRequest, RegionRequest,
+ AddColumn, AlterKind, RegionAlterRequest, RegionCreateRequest, RegionOpenRequest, RegionRequest,
};
use store_api::storage::{ColumnId, RegionId};
@@ -77,6 +78,34 @@ impl TestEnv {
self.metric.clone()
}
+ /// Creates a new follower engine with the same config as the leader engine.
+ pub async fn create_follower_engine(&mut self) -> (MitoEngine, MetricEngine) {
+ let mito = self
+ .mito_env
+ .create_follower_engine(MitoConfig::default())
+ .await;
+ let metric = MetricEngine::new(mito.clone(), EngineConfig::default());
+
+ let region_id = self.default_physical_region_id();
+ debug!("opening default physical region: {region_id}");
+ let physical_region_option = [(PHYSICAL_TABLE_METADATA_KEY.to_string(), String::new())]
+ .into_iter()
+ .collect();
+ metric
+ .handle_request(
+ region_id,
+ RegionRequest::Open(RegionOpenRequest {
+ engine: METRIC_ENGINE_NAME.to_string(),
+ region_dir: self.default_region_dir(),
+ options: physical_region_option,
+ skip_wal_replay: true,
+ }),
+ )
+ .await
+ .unwrap();
+ (mito, metric)
+ }
+
/// Create regions in [MetricEngine] under [`default_region_id`]
/// and region dir `"test_metric_region"`.
///
diff --git a/src/mito2/src/engine.rs b/src/mito2/src/engine.rs
index 110f79b875..7b3c7352da 100644
--- a/src/mito2/src/engine.rs
+++ b/src/mito2/src/engine.rs
@@ -82,7 +82,7 @@ use store_api::manifest::ManifestVersion;
use store_api::metadata::RegionMetadataRef;
use store_api::region_engine::{
BatchResponses, RegionEngine, RegionManifestInfo, RegionRole, RegionScannerRef,
- RegionStatistic, SetRegionRoleStateResponse, SettableRegionRoleState,
+ RegionStatistic, SetRegionRoleStateResponse, SettableRegionRoleState, SyncManifestResponse,
};
use store_api::region_request::{AffectedRows, RegionOpenRequest, RegionRequest};
use store_api::storage::{RegionId, ScanRequest, SequenceNumber};
@@ -496,7 +496,7 @@ impl EngineInner {
&self,
region_id: RegionId,
manifest_info: RegionManifestInfo,
-    ) -> Result<ManifestVersion> {
+ ) -> Result<(ManifestVersion, bool)> {
ensure!(manifest_info.is_mito(), MitoManifestInfoSnafu);
let manifest_version = manifest_info.data_manifest_version();
let (request, receiver) =
@@ -631,12 +631,14 @@ impl RegionEngine for MitoEngine {
&self,
region_id: RegionId,
manifest_info: RegionManifestInfo,
- ) -> Result<(), BoxedError> {
- self.inner
+    ) -> Result<SyncManifestResponse, BoxedError> {
+ let (_, synced) = self
+ .inner
.sync_region(region_id, manifest_info)
.await
- .map_err(BoxedError::new)
- .map(|_| ())
+ .map_err(BoxedError::new)?;
+
+ Ok(SyncManifestResponse::Mito { synced })
}
fn role(&self, region_id: RegionId) -> Option<RegionRole> {
diff --git a/src/mito2/src/manifest/manager.rs b/src/mito2/src/manifest/manager.rs
index d60f018b4e..2590d7ae6c 100644
--- a/src/mito2/src/manifest/manager.rs
+++ b/src/mito2/src/manifest/manager.rs
@@ -313,11 +313,12 @@ impl RegionManifestManager {
}
);
+ let region_id = self.manifest.metadata.region_id;
// Fetches manifests from the last version strictly.
let mut manifests = self
.store
// Invariant: last_version < target_version.
- .fetch_manifests_strict_from(last_version + 1, target_version + 1)
+ .fetch_manifests_strict_from(last_version + 1, target_version + 1, region_id)
.await?;
// Case 2: No manifests in range: [current_version+1, target_version+1)
@@ -327,7 +328,7 @@ impl RegionManifestManager {
// [Current Version]......[Target Version]
// [Follower region]
if manifests.is_empty() {
- debug!(
+ info!(
"Manifests are not strict from {}, region: {}, tries to install the last checkpoint",
last_version, self.manifest.metadata.region_id
);
@@ -341,7 +342,7 @@ impl RegionManifestManager {
manifests = self
.store
// Invariant: last_version < target_version.
- .fetch_manifests_strict_from(last_version + 1, target_version + 1)
+ .fetch_manifests_strict_from(last_version + 1, target_version + 1, region_id)
.await?;
}
diff --git a/src/mito2/src/manifest/storage.rs b/src/mito2/src/manifest/storage.rs
index c0ee01ba60..89e23e2cd4 100644
--- a/src/mito2/src/manifest/storage.rs
+++ b/src/mito2/src/manifest/storage.rs
@@ -29,6 +29,7 @@ use regex::Regex;
use serde::{Deserialize, Serialize};
use snafu::{ensure, ResultExt};
use store_api::manifest::ManifestVersion;
+use store_api::storage::RegionId;
use tokio::sync::Semaphore;
use crate::error::{
@@ -243,12 +244,17 @@ impl ManifestObjectStore {
&self,
start_version: ManifestVersion,
end_version: ManifestVersion,
+ region_id: RegionId,
) -> Result)>> {
let mut manifests = self.fetch_manifests(start_version, end_version).await?;
let start_index = manifests.iter().position(|(v, _)| *v == start_version);
debug!(
- "fetches manifests in range [{},{}), start_index: {:?}",
- start_version, end_version, start_index
+ "Fetches manifests in range [{},{}), start_index: {:?}, region_id: {}, manifests: {:?}",
+ start_version,
+ end_version,
+ start_index,
+ region_id,
+            manifests.iter().map(|(v, _)| *v).collect::<Vec<_>>()
);
if let Some(start_index) = start_index {
Ok(manifests.split_off(start_index))
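[Editor's note] The "strict from" behavior of `fetch_manifests_strict_from` can be reduced to a small standalone sketch, with plain `u64` versions and byte payloads standing in for the real types: the fetched list is kept only when it really starts at `start_version`; otherwise the caller gets an empty list and falls back to installing the last checkpoint, as the manager code above does.

```rust
/// Keep the manifests only if the range actually begins at `start_version`;
/// an empty result signals that the caller must fall back to a checkpoint.
fn strict_from(mut manifests: Vec<(u64, Vec<u8>)>, start_version: u64) -> Vec<(u64, Vec<u8>)> {
    match manifests.iter().position(|(version, _)| *version == start_version) {
        Some(start_index) => manifests.split_off(start_index),
        None => Vec::new(),
    }
}

fn main() {
    // Versions 11 and 12 are present but 10 is missing, so a strict fetch from 10 yields nothing.
    let fetched = vec![(11u64, vec![]), (12u64, vec![])];
    assert!(strict_from(fetched, 10).is_empty());

    // When the range is contiguous from the requested version, everything from it onward is kept.
    let fetched = vec![(10u64, vec![]), (11u64, vec![])];
    assert_eq!(strict_from(fetched, 10).len(), 2);
}
```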
diff --git a/src/mito2/src/request.rs b/src/mito2/src/request.rs
index 18ef260abe..33a8f13f07 100644
--- a/src/mito2/src/request.rs
+++ b/src/mito2/src/request.rs
@@ -692,7 +692,7 @@ impl WorkerRequest {
pub(crate) fn new_sync_region_request(
region_id: RegionId,
manifest_version: ManifestVersion,
-    ) -> (WorkerRequest, Receiver<Result<ManifestVersion>>) {
+    ) -> (WorkerRequest, Receiver<Result<(ManifestVersion, bool)>>) {
let (sender, receiver) = oneshot::channel();
(
WorkerRequest::SyncRegion(RegionSyncRequest {
@@ -892,7 +892,8 @@ pub(crate) struct RegionEditResult {
pub(crate) struct RegionSyncRequest {
pub(crate) region_id: RegionId,
pub(crate) manifest_version: ManifestVersion,
-    pub(crate) sender: Sender<Result<ManifestVersion>>,
+    /// Returns the latest manifest version and a boolean indicating whether a new manifest was installed.
+    pub(crate) sender: Sender<Result<(ManifestVersion, bool)>>,
}
#[cfg(test)]
diff --git a/src/mito2/src/worker/handle_manifest.rs b/src/mito2/src/worker/handle_manifest.rs
index f1bec95514..4fd0de0d7b 100644
--- a/src/mito2/src/worker/handle_manifest.rs
+++ b/src/mito2/src/worker/handle_manifest.rs
@@ -136,6 +136,7 @@ impl RegionWorkerLoop {
}
};
+ let original_manifest_version = region.manifest_ctx.manifest_version().await;
let manifest = match region
.manifest_ctx
.install_manifest_to(request.manifest_version)
@@ -173,7 +174,8 @@ impl RegionWorkerLoop {
.build();
region.version_control.overwrite_current(Arc::new(version));
- let _ = sender.send(Ok(manifest.manifest_version));
+ let updated = manifest.manifest_version > original_manifest_version;
+ let _ = sender.send(Ok((manifest.manifest_version, updated)));
}
}
diff --git a/src/query/src/optimizer/test_util.rs b/src/query/src/optimizer/test_util.rs
index 72e4ad093a..2b3b473770 100644
--- a/src/query/src/optimizer/test_util.rs
+++ b/src/query/src/optimizer/test_util.rs
@@ -29,7 +29,7 @@ use store_api::metadata::{
};
use store_api::region_engine::{
RegionEngine, RegionManifestInfo, RegionRole, RegionScannerRef, RegionStatistic,
- SetRegionRoleStateResponse, SettableRegionRoleState,
+ SetRegionRoleStateResponse, SettableRegionRoleState, SyncManifestResponse,
};
use store_api::region_request::RegionRequest;
use store_api::storage::{ConcreteDataType, RegionId, ScanRequest, SequenceNumber};
@@ -113,7 +113,7 @@ impl RegionEngine for MetaRegionEngine {
&self,
_region_id: RegionId,
_manifest_info: RegionManifestInfo,
- ) -> Result<(), BoxedError> {
+    ) -> Result<SyncManifestResponse, BoxedError> {
unimplemented!()
}
diff --git a/src/store-api/src/region_engine.rs b/src/store-api/src/region_engine.rs
index 0a38700f1d..8522a2e1ca 100644
--- a/src/store-api/src/region_engine.rs
+++ b/src/store-api/src/region_engine.rs
@@ -583,6 +583,62 @@ impl RegionStatistic {
}
}
+/// The response of syncing the manifest.
+#[derive(Debug)]
+pub enum SyncManifestResponse {
+ NotSupported,
+ Mito {
+ /// Indicates if the data region was synced.
+ synced: bool,
+ },
+ Metric {
+ /// Indicates if the metadata region was synced.
+ metadata_synced: bool,
+ /// Indicates if the data region was synced.
+ data_synced: bool,
+ /// The logical regions that were newly opened during the sync operation.
+ /// This only occurs after the metadata region has been successfully synced.
+        new_opened_logical_region_ids: Vec<RegionId>,
+ },
+}
+
+impl SyncManifestResponse {
+ /// Returns true if data region is synced.
+ pub fn is_data_synced(&self) -> bool {
+ match self {
+ SyncManifestResponse::NotSupported => false,
+ SyncManifestResponse::Mito { synced } => *synced,
+ SyncManifestResponse::Metric { data_synced, .. } => *data_synced,
+ }
+ }
+
+    /// Returns true if the engine supports the sync operation.
+    pub fn is_supported(&self) -> bool {
+        !matches!(self, SyncManifestResponse::NotSupported)
+    }
+
+ /// Returns true if the engine is a mito2 engine.
+ pub fn is_mito(&self) -> bool {
+ matches!(self, SyncManifestResponse::Mito { .. })
+ }
+
+ /// Returns true if the engine is a metric engine.
+ pub fn is_metric(&self) -> bool {
+ matches!(self, SyncManifestResponse::Metric { .. })
+ }
+
+    /// Returns the newly opened logical region ids.
+    pub fn new_opened_logical_region_ids(self) -> Option<Vec<RegionId>> {
+ match self {
+ SyncManifestResponse::Metric {
+ new_opened_logical_region_ids,
+ ..
+ } => Some(new_opened_logical_region_ids),
+ _ => None,
+ }
+ }
+}
+
#[async_trait]
pub trait RegionEngine: Send + Sync {
/// Name of this engine
@@ -689,7 +745,7 @@ pub trait RegionEngine: Send + Sync {
&self,
region_id: RegionId,
manifest_info: RegionManifestInfo,
- ) -> Result<(), BoxedError>;
+    ) -> Result<SyncManifestResponse, BoxedError>;
/// Sets region role state gracefully.
///
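[Editor's note] For context, a follower-side caller would typically branch on the new response type rather than on the engine name. The sketch below uses a local, simplified copy of the enum (with `u64` ids standing in for `RegionId`) purely to illustrate the intended consumption pattern; it is not the crate's API.

```rust
/// Simplified local copy of `SyncManifestResponse`, for illustration only.
#[derive(Debug)]
enum SyncManifestResponse {
    NotSupported,
    Mito { synced: bool },
    Metric {
        metadata_synced: bool,
        data_synced: bool,
        new_opened_logical_region_ids: Vec<u64>,
    },
}

fn handle_sync_result(response: SyncManifestResponse) {
    match response {
        SyncManifestResponse::NotSupported => println!("engine does not support manifest sync"),
        SyncManifestResponse::Mito { synced } => println!("data region synced: {synced}"),
        SyncManifestResponse::Metric {
            metadata_synced,
            data_synced,
            new_opened_logical_region_ids,
        } => {
            println!("metadata synced: {metadata_synced}, data synced: {data_synced}");
            // Logical regions discovered during the sync may need follow-up registration.
            for region_id in new_opened_logical_region_ids {
                println!("newly opened logical region: {region_id}");
            }
        }
    }
}

fn main() {
    handle_sync_result(SyncManifestResponse::Metric {
        metadata_synced: true,
        data_synced: true,
        new_opened_logical_region_ids: vec![2],
    });
}
```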
From 747b71bf74b4ba5ead90b8a0278db16d9d63993e Mon Sep 17 00:00:00 2001
From: Ruihang Xia
Date: Mon, 14 Apr 2025 21:12:37 +0800
Subject: [PATCH 16/82] feat: add query engine options (#5895)
* feat: add query engine options
Signed-off-by: Ruihang Xia
* update example
Signed-off-by: Ruihang Xia
---------
Signed-off-by: Ruihang Xia
---
config/config.md | 6 ++++
config/datanode.example.toml | 6 ++++
config/frontend.example.toml | 6 ++++
config/standalone.example.toml | 6 ++++
src/datanode/src/config.rs | 3 ++
src/datanode/src/datanode.rs | 1 +
src/flow/src/adapter.rs | 3 ++
src/flow/src/server.rs | 1 +
src/flow/src/test_utils.rs | 11 ++++++-
src/flow/src/transform.rs | 11 ++++++-
src/frontend/src/frontend.rs | 3 ++
src/frontend/src/instance/builder.rs | 1 +
src/query/src/datafusion.rs | 12 +++++++-
src/query/src/lib.rs | 1 +
src/query/src/options.rs | 30 +++++++++++++++++++
src/query/src/query_engine.rs | 16 +++++++++-
src/query/src/query_engine/context.rs | 2 ++
.../src/query_engine/default_serializer.rs | 11 ++++++-
src/query/src/query_engine/state.rs | 6 ++++
src/query/src/range_select/plan_rewrite.rs | 12 +++++++-
src/query/src/tests.rs | 12 +++++++-
src/query/src/tests/query_engine_test.rs | 23 ++++++++++++--
src/query/src/tests/time_range_filter_test.rs | 13 ++++++--
src/servers/tests/mod.rs | 13 ++++++--
24 files changed, 195 insertions(+), 14 deletions(-)
create mode 100644 src/query/src/options.rs
diff --git a/config/config.md b/config/config.md
index ba2540f2c6..d0d7582db5 100644
--- a/config/config.md
+++ b/config/config.md
@@ -96,6 +96,8 @@
| `procedure.max_running_procedures` | Integer | `128` | Max running procedures. The maximum number of procedures that can be running at the same time. If the number of running procedures exceeds this limit, the procedure will be rejected. |
| `flow` | -- | -- | flow engine options. |
| `flow.num_workers` | Integer | `0` | The number of flow worker in flownode. Not setting(or set to 0) this value will use the number of CPU cores divided by 2. |
+| `query` | -- | -- | The query engine options. |
+| `query.parallelism` | Integer | `0` | Parallelism of the query engine. Defaults to 0, which means the number of CPU cores. |
| `storage` | -- | -- | The data storage options. |
| `storage.data_home` | String | `./greptimedb_data/` | The working home directory. |
| `storage.type` | String | `File` | The storage type used to store the data. - `File`: the data is stored in the local file system. - `S3`: the data is stored in the S3 object storage. - `Gcs`: the data is stored in the Google Cloud Storage. - `Azblob`: the data is stored in the Azure Blob Storage. - `Oss`: the data is stored in the Aliyun OSS. |
@@ -270,6 +272,8 @@
| `meta_client.metadata_cache_max_capacity` | Integer | `100000` | The configuration about the cache of the metadata. |
| `meta_client.metadata_cache_ttl` | String | `10m` | TTL of the metadata cache. |
| `meta_client.metadata_cache_tti` | String | `5m` | -- |
+| `query` | -- | -- | The query engine options. |
+| `query.parallelism` | Integer | `0` | Parallelism of the query engine. Defaults to 0, which means the number of CPU cores. |
| `datanode` | -- | -- | Datanode options. |
| `datanode.client` | -- | -- | Datanode client options. |
| `datanode.client.connect_timeout` | String | `10s` | -- |
@@ -429,6 +433,8 @@
| `wal.create_index` | Bool | `true` | Whether to enable WAL index creation. **It's only used when the provider is `kafka`**. |
| `wal.dump_index_interval` | String | `60s` | The interval for dumping WAL indexes. **It's only used when the provider is `kafka`**. |
| `wal.overwrite_entry_start_id` | Bool | `false` | Ignore missing entries during read WAL. **It's only used when the provider is `kafka`**. This option ensures that when Kafka messages are deleted, the system can still successfully replay memtable data without throwing an out-of-range error. However, enabling this option might lead to unexpected data loss, as the system will skip over missing entries instead of treating them as critical errors. |
+| `query` | -- | -- | The query engine options. |
+| `query.parallelism` | Integer | `0` | Parallelism of the query engine. Defaults to 0, which means the number of CPU cores. |
| `storage` | -- | -- | The data storage options. |
| `storage.data_home` | String | `./greptimedb_data/` | The working home directory. |
| `storage.type` | String | `File` | The storage type used to store the data. - `File`: the data is stored in the local file system. - `S3`: the data is stored in the S3 object storage. - `Gcs`: the data is stored in the Google Cloud Storage. - `Azblob`: the data is stored in the Azure Blob Storage. - `Oss`: the data is stored in the Aliyun OSS. |
diff --git a/config/datanode.example.toml b/config/datanode.example.toml
index af6b5571d2..46beb51a23 100644
--- a/config/datanode.example.toml
+++ b/config/datanode.example.toml
@@ -243,6 +243,12 @@ overwrite_entry_start_id = false
# credential = "base64-credential"
# endpoint = "https://storage.googleapis.com"
+## The query engine options.
+[query]
+## Parallelism of the query engine.
+## Defaults to 0, which means the number of CPU cores.
+parallelism = 0
+
## The data storage options.
[storage]
## The working home directory.
diff --git a/config/frontend.example.toml b/config/frontend.example.toml
index 3d4cd78144..2e3ee4a69d 100644
--- a/config/frontend.example.toml
+++ b/config/frontend.example.toml
@@ -179,6 +179,12 @@ metadata_cache_ttl = "10m"
# TTI of the metadata cache.
metadata_cache_tti = "5m"
+## The query engine options.
+[query]
+## Parallelism of the query engine.
+## Defaults to 0, which means the number of CPU cores.
+parallelism = 0
+
## Datanode options.
[datanode]
## Datanode client options.
diff --git a/config/standalone.example.toml b/config/standalone.example.toml
index bdef754712..0e72cfcc7e 100644
--- a/config/standalone.example.toml
+++ b/config/standalone.example.toml
@@ -334,6 +334,12 @@ max_running_procedures = 128
# credential = "base64-credential"
# endpoint = "https://storage.googleapis.com"
+## The query engine options.
+[query]
+## Parallelism of the query engine.
+## Defaults to 0, which means the number of CPU cores.
+parallelism = 0
+
## The data storage options.
[storage]
## The working home directory.
diff --git a/src/datanode/src/config.rs b/src/datanode/src/config.rs
index 322f337ba3..7d63057a72 100644
--- a/src/datanode/src/config.rs
+++ b/src/datanode/src/config.rs
@@ -26,6 +26,7 @@ use file_engine::config::EngineConfig as FileEngineConfig;
use meta_client::MetaClientOptions;
use metric_engine::config::EngineConfig as MetricEngineConfig;
use mito2::config::MitoConfig;
+use query::options::QueryOptions;
use serde::{Deserialize, Serialize};
use servers::export_metrics::ExportMetricsOption;
use servers::grpc::GrpcOptions;
@@ -375,6 +376,7 @@ pub struct DatanodeOptions {
pub enable_telemetry: bool,
pub export_metrics: ExportMetricsOption,
pub tracing: TracingOptions,
+ pub query: QueryOptions,
/// Deprecated options, please use the new options instead.
#[deprecated(note = "Please use `grpc.addr` instead.")]
@@ -412,6 +414,7 @@ impl Default for DatanodeOptions {
enable_telemetry: true,
export_metrics: ExportMetricsOption::default(),
tracing: TracingOptions::default(),
+ query: QueryOptions::default(),
// Deprecated options
rpc_addr: None,
diff --git a/src/datanode/src/datanode.rs b/src/datanode/src/datanode.rs
index b32a1668c6..4b1e720032 100644
--- a/src/datanode/src/datanode.rs
+++ b/src/datanode/src/datanode.rs
@@ -359,6 +359,7 @@ impl DatanodeBuilder {
None,
false,
self.plugins.clone(),
+ opts.query.clone(),
);
let query_engine = query_engine_factory.query_engine();
diff --git a/src/flow/src/adapter.rs b/src/flow/src/adapter.rs
index 1dd3e7e40e..8fd62ee2a0 100644
--- a/src/flow/src/adapter.rs
+++ b/src/flow/src/adapter.rs
@@ -32,6 +32,7 @@ use datatypes::value::Value;
use greptime_proto::v1;
use itertools::{EitherOrBoth, Itertools};
use meta_client::MetaClientOptions;
+use query::options::QueryOptions;
use query::QueryEngine;
use serde::{Deserialize, Serialize};
use servers::grpc::GrpcOptions;
@@ -109,6 +110,7 @@ pub struct FlownodeOptions {
pub logging: LoggingOptions,
pub tracing: TracingOptions,
pub heartbeat: HeartbeatOptions,
+ pub query: QueryOptions,
}
impl Default for FlownodeOptions {
@@ -122,6 +124,7 @@ impl Default for FlownodeOptions {
logging: LoggingOptions::default(),
tracing: TracingOptions::default(),
heartbeat: HeartbeatOptions::default(),
+ query: QueryOptions::default(),
}
}
}
diff --git a/src/flow/src/server.rs b/src/flow/src/server.rs
index f347ac369e..d0038e6ba1 100644
--- a/src/flow/src/server.rs
+++ b/src/flow/src/server.rs
@@ -332,6 +332,7 @@ impl FlownodeBuilder {
None,
false,
Default::default(),
+ self.opts.query.clone(),
);
let manager = Arc::new(
self.build_manager(query_engine_factory.query_engine())
diff --git a/src/flow/src/test_utils.rs b/src/flow/src/test_utils.rs
index 4d269a80c0..ecaabae32d 100644
--- a/src/flow/src/test_utils.rs
+++ b/src/flow/src/test_utils.rs
@@ -23,6 +23,7 @@ use datatypes::timestamp::TimestampMillisecond;
use datatypes::vectors::{TimestampMillisecondVectorBuilder, VectorRef};
use itertools::Itertools;
use prost::Message;
+use query::options::QueryOptions;
use query::parser::QueryLanguageParser;
use query::query_engine::DefaultSerializer;
use query::QueryEngine;
@@ -146,7 +147,15 @@ pub fn create_test_query_engine() -> Arc {
};
catalog_list.register_table_sync(req_with_ts).unwrap();
- let factory = query::QueryEngineFactory::new(catalog_list, None, None, None, None, false);
+ let factory = query::QueryEngineFactory::new(
+ catalog_list,
+ None,
+ None,
+ None,
+ None,
+ false,
+ QueryOptions::default(),
+ );
let engine = factory.query_engine();
register_function_to_query_engine(&engine);
diff --git a/src/flow/src/transform.rs b/src/flow/src/transform.rs
index 15da89b21f..04c7f40e68 100644
--- a/src/flow/src/transform.rs
+++ b/src/flow/src/transform.rs
@@ -171,6 +171,7 @@ mod test {
use datatypes::vectors::{TimestampMillisecondVectorBuilder, VectorRef};
use itertools::Itertools;
use prost::Message;
+ use query::options::QueryOptions;
use query::parser::QueryLanguageParser;
use query::query_engine::DefaultSerializer;
use query::QueryEngine;
@@ -263,7 +264,15 @@ mod test {
};
catalog_list.register_table_sync(req_with_ts).unwrap();
- let factory = query::QueryEngineFactory::new(catalog_list, None, None, None, None, false);
+ let factory = query::QueryEngineFactory::new(
+ catalog_list,
+ None,
+ None,
+ None,
+ None,
+ false,
+ QueryOptions::default(),
+ );
let engine = factory.query_engine();
register_function_to_query_engine(&engine);
diff --git a/src/frontend/src/frontend.rs b/src/frontend/src/frontend.rs
index 983550d0e7..ba795730c4 100644
--- a/src/frontend/src/frontend.rs
+++ b/src/frontend/src/frontend.rs
@@ -19,6 +19,7 @@ use common_config::config::Configurable;
use common_options::datanode::DatanodeClientOptions;
use common_telemetry::logging::{LoggingOptions, TracingOptions};
use meta_client::MetaClientOptions;
+use query::options::QueryOptions;
use serde::{Deserialize, Serialize};
use servers::export_metrics::{ExportMetricsOption, ExportMetricsTask};
use servers::grpc::GrpcOptions;
@@ -58,6 +59,7 @@ pub struct FrontendOptions {
pub user_provider: Option,
pub export_metrics: ExportMetricsOption,
pub tracing: TracingOptions,
+ pub query: QueryOptions,
pub max_in_flight_write_bytes: Option,
}
@@ -82,6 +84,7 @@ impl Default for FrontendOptions {
user_provider: None,
export_metrics: ExportMetricsOption::default(),
tracing: TracingOptions::default(),
+ query: QueryOptions::default(),
max_in_flight_write_bytes: None,
}
}
diff --git a/src/frontend/src/instance/builder.rs b/src/frontend/src/instance/builder.rs
index 8503999b2c..ffbfeabca1 100644
--- a/src/frontend/src/instance/builder.rs
+++ b/src/frontend/src/instance/builder.rs
@@ -166,6 +166,7 @@ impl FrontendBuilder {
Some(Arc::new(flow_service)),
true,
plugins.clone(),
+ self.options.query.clone(),
)
.query_engine();
diff --git a/src/query/src/datafusion.rs b/src/query/src/datafusion.rs
index dba7d0215a..db4207fd8a 100644
--- a/src/query/src/datafusion.rs
+++ b/src/query/src/datafusion.rs
@@ -567,6 +567,7 @@ mod tests {
use table::table::numbers::{NumbersTable, NUMBERS_TABLE_NAME};
use super::*;
+ use crate::options::QueryOptions;
use crate::parser::QueryLanguageParser;
use crate::query_engine::{QueryEngineFactory, QueryEngineRef};
@@ -581,7 +582,16 @@ mod tests {
};
catalog_manager.register_table_sync(req).unwrap();
- QueryEngineFactory::new(catalog_manager, None, None, None, None, false).query_engine()
+ QueryEngineFactory::new(
+ catalog_manager,
+ None,
+ None,
+ None,
+ None,
+ false,
+ QueryOptions::default(),
+ )
+ .query_engine()
}
#[tokio::test]
diff --git a/src/query/src/lib.rs b/src/query/src/lib.rs
index 6e1fbfae0a..26fbfb27cd 100644
--- a/src/query/src/lib.rs
+++ b/src/query/src/lib.rs
@@ -29,6 +29,7 @@ pub mod executor;
pub mod log_query;
pub mod metrics;
mod optimizer;
+pub mod options;
pub mod parser;
mod part_sort;
pub mod physical_wrapper;
diff --git a/src/query/src/options.rs b/src/query/src/options.rs
new file mode 100644
index 0000000000..441e9f161f
--- /dev/null
+++ b/src/query/src/options.rs
@@ -0,0 +1,30 @@
+// Copyright 2023 Greptime Team
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+use serde::{Deserialize, Serialize};
+
+/// Query engine config
+#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
+#[serde(default)]
+pub struct QueryOptions {
+    /// Parallelism of the query engine. Defaults to 0, which means the number of logical CPUs.
+ pub parallelism: usize,
+}
+
+#[allow(clippy::derivable_impls)]
+impl Default for QueryOptions {
+ fn default() -> Self {
+ Self { parallelism: 0 }
+ }
+}
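[Editor's note] Because the struct is marked `#[serde(default)]`, an empty or missing `[query]` section deserializes to `parallelism = 0`. A self-contained sketch of that behavior, assuming the `serde` (with the `derive` feature) and `toml` crates are available:

```rust
use serde::Deserialize;

/// Local stand-in for the `QueryOptions` defined above, kept minimal on purpose.
#[derive(Debug, Deserialize, PartialEq, Eq)]
#[serde(default)]
struct QueryOptions {
    parallelism: usize,
}

impl Default for QueryOptions {
    fn default() -> Self {
        Self { parallelism: 0 }
    }
}

fn main() {
    // An empty document falls back to the default, i.e. "use the number of CPU cores".
    let defaults: QueryOptions = toml::from_str("").unwrap();
    assert_eq!(defaults, QueryOptions { parallelism: 0 });

    // An explicit value overrides the default.
    let configured: QueryOptions = toml::from_str("parallelism = 8").unwrap();
    assert_eq!(configured.parallelism, 8);
}
```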
diff --git a/src/query/src/query_engine.rs b/src/query/src/query_engine.rs
index c4e8aee7d1..8b0c091054 100644
--- a/src/query/src/query_engine.rs
+++ b/src/query/src/query_engine.rs
@@ -38,6 +38,7 @@ use table::TableRef;
use crate::dataframe::DataFrame;
use crate::datafusion::DatafusionQueryEngine;
use crate::error::Result;
+use crate::options::QueryOptions;
use crate::planner::LogicalPlanner;
pub use crate::query_engine::context::QueryEngineContext;
pub use crate::query_engine::state::QueryEngineState;
@@ -106,6 +107,7 @@ impl QueryEngineFactory {
procedure_service_handler: Option,
flow_service_handler: Option,
with_dist_planner: bool,
+ options: QueryOptions,
) -> Self {
Self::new_with_plugins(
catalog_manager,
@@ -115,9 +117,11 @@ impl QueryEngineFactory {
flow_service_handler,
with_dist_planner,
Default::default(),
+ options,
)
}
+ #[allow(clippy::too_many_arguments)]
pub fn new_with_plugins(
catalog_manager: CatalogManagerRef,
region_query_handler: Option,
@@ -126,6 +130,7 @@ impl QueryEngineFactory {
flow_service_handler: Option,
with_dist_planner: bool,
plugins: Plugins,
+ options: QueryOptions,
) -> Self {
let state = Arc::new(QueryEngineState::new(
catalog_manager,
@@ -135,6 +140,7 @@ impl QueryEngineFactory {
flow_service_handler,
with_dist_planner,
plugins.clone(),
+ options,
));
let query_engine = Arc::new(DatafusionQueryEngine::new(state, plugins));
register_functions(&query_engine);
@@ -166,7 +172,15 @@ mod tests {
#[test]
fn test_query_engine_factory() {
let catalog_list = catalog::memory::new_memory_catalog_manager().unwrap();
- let factory = QueryEngineFactory::new(catalog_list, None, None, None, None, false);
+ let factory = QueryEngineFactory::new(
+ catalog_list,
+ None,
+ None,
+ None,
+ None,
+ false,
+ QueryOptions::default(),
+ );
let engine = factory.query_engine();
diff --git a/src/query/src/query_engine/context.rs b/src/query/src/query_engine/context.rs
index d8c110d2f2..df20a70a42 100644
--- a/src/query/src/query_engine/context.rs
+++ b/src/query/src/query_engine/context.rs
@@ -75,6 +75,7 @@ impl QueryEngineContext {
use common_base::Plugins;
use session::context::QueryContext;
+ use crate::options::QueryOptions;
use crate::query_engine::QueryEngineState;
let state = Arc::new(QueryEngineState::new(
@@ -85,6 +86,7 @@ impl QueryEngineContext {
None,
false,
Plugins::default(),
+ QueryOptions::default(),
));
QueryEngineContext::new(state.session_state(), QueryContext::arc())
diff --git a/src/query/src/query_engine/default_serializer.rs b/src/query/src/query_engine/default_serializer.rs
index 23d6789866..c3feed1d55 100644
--- a/src/query/src/query_engine/default_serializer.rs
+++ b/src/query/src/query_engine/default_serializer.rs
@@ -159,6 +159,7 @@ mod tests {
use super::*;
use crate::dummy_catalog::DummyCatalogList;
use crate::optimizer::test_util::mock_table_provider;
+ use crate::options::QueryOptions;
use crate::QueryEngineFactory;
fn mock_plan(schema: SchemaRef) -> LogicalPlan {
@@ -177,7 +178,15 @@ mod tests {
#[tokio::test]
async fn test_serializer_decode_plan() {
let catalog_list = catalog::memory::new_memory_catalog_manager().unwrap();
- let factory = QueryEngineFactory::new(catalog_list, None, None, None, None, false);
+ let factory = QueryEngineFactory::new(
+ catalog_list,
+ None,
+ None,
+ None,
+ None,
+ false,
+ QueryOptions::default(),
+ );
let engine = factory.query_engine();
diff --git a/src/query/src/query_engine/state.rs b/src/query/src/query_engine/state.rs
index 812fc2c2af..75e1ed84a7 100644
--- a/src/query/src/query_engine/state.rs
+++ b/src/query/src/query_engine/state.rs
@@ -54,6 +54,7 @@ use crate::optimizer::string_normalization::StringNormalizationRule;
use crate::optimizer::type_conversion::TypeConversionRule;
use crate::optimizer::windowed_sort::WindowedSortPhysicalRule;
use crate::optimizer::ExtensionAnalyzerRule;
+use crate::options::QueryOptions as QueryOptionsNew;
use crate::query_engine::options::QueryOptions;
use crate::query_engine::DefaultSerializer;
use crate::range_select::planner::RangeSelectPlanner;
@@ -81,6 +82,7 @@ impl fmt::Debug for QueryEngineState {
}
impl QueryEngineState {
+ #[allow(clippy::too_many_arguments)]
pub fn new(
catalog_list: CatalogManagerRef,
region_query_handler: Option,
@@ -89,9 +91,13 @@ impl QueryEngineState {
flow_service_handler: Option,
with_dist_planner: bool,
plugins: Plugins,
+ options: QueryOptionsNew,
) -> Self {
let runtime_env = Arc::new(RuntimeEnv::default());
let mut session_config = SessionConfig::new().with_create_default_catalog_and_schema(false);
+ if options.parallelism > 0 {
+ session_config = session_config.with_target_partitions(options.parallelism);
+ }
// todo(hl): This serves as a workaround for https://github.com/GreptimeTeam/greptimedb/issues/5659
// and we can add that check back once we upgrade datafusion.
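[Editor's note] The hunk above only overrides DataFusion's target partitions when `parallelism` is non-zero; with the default of 0, DataFusion keeps its own default, which is the number of CPU cores. A hedged sketch of that mapping, assuming a DataFusion version where `SessionConfig::with_target_partitions` and the `target_partitions` getter exist:

```rust
use datafusion::prelude::SessionConfig;

/// Apply the configured parallelism to a DataFusion session config;
/// zero means "leave DataFusion's CPU-count default untouched".
fn apply_parallelism(parallelism: usize) -> SessionConfig {
    let mut config = SessionConfig::new().with_create_default_catalog_and_schema(false);
    if parallelism > 0 {
        config = config.with_target_partitions(parallelism);
    }
    config
}

fn main() {
    assert_eq!(apply_parallelism(4).target_partitions(), 4);
    // apply_parallelism(0) keeps whatever DataFusion defaults to on this machine.
}
```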
diff --git a/src/query/src/range_select/plan_rewrite.rs b/src/query/src/range_select/plan_rewrite.rs
index b53e1079b8..5e0f223663 100644
--- a/src/query/src/range_select/plan_rewrite.rs
+++ b/src/query/src/range_select/plan_rewrite.rs
@@ -611,6 +611,7 @@ mod test {
use table::test_util::EmptyTable;
use super::*;
+ use crate::options::QueryOptions;
use crate::parser::QueryLanguageParser;
use crate::{QueryEngineFactory, QueryEngineRef};
@@ -663,7 +664,16 @@ mod test {
table,
})
.is_ok());
- QueryEngineFactory::new(catalog_list, None, None, None, None, false).query_engine()
+ QueryEngineFactory::new(
+ catalog_list,
+ None,
+ None,
+ None,
+ None,
+ false,
+ QueryOptions::default(),
+ )
+ .query_engine()
}
async fn do_query(sql: &str) -> Result {
diff --git a/src/query/src/tests.rs b/src/query/src/tests.rs
index f2f2e40bf3..7c004e5229 100644
--- a/src/query/src/tests.rs
+++ b/src/query/src/tests.rs
@@ -18,6 +18,7 @@ use common_recordbatch::{util, RecordBatch};
use session::context::QueryContext;
use table::TableRef;
+use crate::options::QueryOptions;
use crate::parser::QueryLanguageParser;
use crate::{QueryEngineFactory, QueryEngineRef};
@@ -46,5 +47,14 @@ async fn exec_selection(engine: QueryEngineRef, sql: &str) -> Vec {
pub fn new_query_engine_with_table(table: TableRef) -> QueryEngineRef {
let catalog_manager = MemoryCatalogManager::new_with_table(table);
- QueryEngineFactory::new(catalog_manager, None, None, None, None, false).query_engine()
+ QueryEngineFactory::new(
+ catalog_manager,
+ None,
+ None,
+ None,
+ None,
+ false,
+ QueryOptions::default(),
+ )
+ .query_engine()
}
diff --git a/src/query/src/tests/query_engine_test.rs b/src/query/src/tests/query_engine_test.rs
index 0f3f817703..07bac1363a 100644
--- a/src/query/src/tests/query_engine_test.rs
+++ b/src/query/src/tests/query_engine_test.rs
@@ -33,6 +33,7 @@ use table::table::numbers::{NumbersTable, NUMBERS_TABLE_NAME};
use table::test_util::MemTable;
use crate::error::{QueryExecutionSnafu, Result};
+use crate::options::QueryOptions as QueryOptionsNew;
use crate::parser::QueryLanguageParser;
use crate::query_engine::options::QueryOptions;
use crate::query_engine::QueryEngineFactory;
@@ -43,7 +44,15 @@ async fn test_datafusion_query_engine() -> Result<()> {
let catalog_list = catalog::memory::new_memory_catalog_manager()
.map_err(BoxedError::new)
.context(QueryExecutionSnafu)?;
- let factory = QueryEngineFactory::new(catalog_list, None, None, None, None, false);
+ let factory = QueryEngineFactory::new(
+ catalog_list,
+ None,
+ None,
+ None,
+ None,
+ false,
+ QueryOptionsNew::default(),
+ );
let engine = factory.query_engine();
let column_schemas = vec![ColumnSchema::new(
@@ -122,8 +131,16 @@ async fn test_query_validate() -> Result<()> {
disallow_cross_catalog_query: true,
});
- let factory =
- QueryEngineFactory::new_with_plugins(catalog_list, None, None, None, None, false, plugins);
+ let factory = QueryEngineFactory::new_with_plugins(
+ catalog_list,
+ None,
+ None,
+ None,
+ None,
+ false,
+ plugins,
+ QueryOptionsNew::default(),
+ );
let engine = factory.query_engine();
let stmt =
diff --git a/src/query/src/tests/time_range_filter_test.rs b/src/query/src/tests/time_range_filter_test.rs
index e141c99fa5..84bdd8cb18 100644
--- a/src/query/src/tests/time_range_filter_test.rs
+++ b/src/query/src/tests/time_range_filter_test.rs
@@ -33,6 +33,7 @@ use table::predicate::build_time_range_predicate;
use table::test_util::MemTable;
use table::{Table, TableRef};
+use crate::options::QueryOptions;
use crate::tests::exec_selection;
use crate::{QueryEngineFactory, QueryEngineRef};
@@ -102,8 +103,16 @@ fn create_test_engine() -> TimeRangeTester {
};
let _ = catalog_manager.register_table_sync(req).unwrap();
- let engine =
- QueryEngineFactory::new(catalog_manager, None, None, None, None, false).query_engine();
+ let engine = QueryEngineFactory::new(
+ catalog_manager,
+ None,
+ None,
+ None,
+ None,
+ false,
+ QueryOptions::default(),
+ )
+ .query_engine();
TimeRangeTester { engine, filter }
}
diff --git a/src/servers/tests/mod.rs b/src/servers/tests/mod.rs
index aa07980240..13c78a293f 100644
--- a/src/servers/tests/mod.rs
+++ b/src/servers/tests/mod.rs
@@ -21,6 +21,7 @@ use catalog::memory::MemoryCatalogManager;
use common_catalog::consts::{DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME};
use common_query::Output;
use datafusion_expr::LogicalPlan;
+use query::options::QueryOptions;
use query::parser::{PromQuery, QueryLanguageParser, QueryStatement};
use query::query_engine::DescribeResult;
use query::{QueryEngineFactory, QueryEngineRef};
@@ -158,8 +159,16 @@ impl GrpcQueryHandler for DummyInstance {
fn create_testing_instance(table: TableRef) -> DummyInstance {
let catalog_manager = MemoryCatalogManager::new_with_table(table);
- let query_engine =
- QueryEngineFactory::new(catalog_manager, None, None, None, None, false).query_engine();
+ let query_engine = QueryEngineFactory::new(
+ catalog_manager,
+ None,
+ None,
+ None,
+ None,
+ false,
+ QueryOptions::default(),
+ )
+ .query_engine();
DummyInstance::new(query_engine)
}
From 6a50d719207088c9d77f634cac7b966057e5dc01 Mon Sep 17 00:00:00 2001
From: "Lei, HUANG" <6406592+v0y4g3r@users.noreply.github.com>
Date: Mon, 14 Apr 2025 21:15:56 +0800
Subject: [PATCH 17/82] fix: memtable panic (#5894)
* fix: memtable panic
* fix: ci
---
src/mito2/src/memtable/time_series.rs | 196 ++++++++++++++++++++++----
1 file changed, 171 insertions(+), 25 deletions(-)
diff --git a/src/mito2/src/memtable/time_series.rs b/src/mito2/src/memtable/time_series.rs
index 82758a542b..44bce1ec74 100644
--- a/src/mito2/src/memtable/time_series.rs
+++ b/src/mito2/src/memtable/time_series.rs
@@ -161,18 +161,15 @@ impl TimeSeriesMemtable {
let primary_key_encoded = self.row_codec.encode(kv.primary_keys())?;
- let (series, series_allocated) = self.series_set.get_or_add_series(primary_key_encoded);
- stats.key_bytes += series_allocated;
+ let (key_allocated, value_allocated) =
+ self.series_set.push_to_series(primary_key_encoded, &kv);
+ stats.key_bytes += key_allocated;
+ stats.value_bytes += value_allocated;
// safety: timestamp of kv must be both present and a valid timestamp value.
let ts = kv.timestamp().as_timestamp().unwrap().unwrap().value();
stats.min_ts = stats.min_ts.min(ts);
stats.max_ts = stats.max_ts.max(ts);
-
- let mut guard = series.write().unwrap();
- let size = guard.push(kv.timestamp(), kv.sequence(), kv.op_type(), kv.fields());
- stats.value_bytes += size;
-
Ok(())
}
}
@@ -368,25 +365,46 @@ impl SeriesSet {
}
impl SeriesSet {
- /// Returns the series for given primary key, or create a new series if not already exist,
- /// along with the allocated memory footprint for primary keys.
-    fn get_or_add_series(&self, primary_key: Vec<u8>) -> (Arc<RwLock<Series>>, usize) {
+    /// Pushes a [KeyValue] to the [SeriesSet] under the given primary key and
+    /// returns the allocated memory sizes for the key and the value.
+    fn push_to_series(&self, primary_key: Vec<u8>, kv: &KeyValue) -> (usize, usize) {
if let Some(series) = self.series.read().unwrap().get(&primary_key) {
- return (series.clone(), 0);
+ let value_allocated = series.write().unwrap().push(
+ kv.timestamp(),
+ kv.sequence(),
+ kv.op_type(),
+ kv.fields(),
+ );
+ return (0, value_allocated);
};
- let s = Arc::new(RwLock::new(Series::new(&self.region_metadata)));
+
let mut indices = self.series.write().unwrap();
match indices.entry(primary_key) {
Entry::Vacant(v) => {
let key_len = v.key().len();
- v.insert(s.clone());
- (s, key_len)
+ let mut series = Series::new(&self.region_metadata);
+ let value_allocated =
+ series.push(kv.timestamp(), kv.sequence(), kv.op_type(), kv.fields());
+ v.insert(Arc::new(RwLock::new(series)));
+ (key_len, value_allocated)
}
// safety: series must exist at given index.
- Entry::Occupied(v) => (v.get().clone(), 0),
+ Entry::Occupied(v) => {
+ let value_allocated = v.get().write().unwrap().push(
+ kv.timestamp(),
+ kv.sequence(),
+ kv.op_type(),
+ kv.fields(),
+ );
+ (0, value_allocated)
+ }
}
}
+ #[cfg(test)]
+    fn get_series(&self, primary_key: &[u8]) -> Option<Arc<RwLock<Series>>> {
+ self.series.read().unwrap().get(primary_key).cloned()
+ }
+
/// Iterates all series in [SeriesSet].
fn iter_series(
&self,
@@ -948,7 +966,7 @@ mod tests {
use api::helper::ColumnDataTypeWrapper;
use api::v1::value::ValueData;
- use api::v1::{Row, Rows, SemanticType};
+ use api::v1::{Mutation, Row, Rows, SemanticType};
use common_time::Timestamp;
use datatypes::prelude::{ConcreteDataType, ScalarVector};
use datatypes::schema::ColumnSchema;
@@ -959,6 +977,7 @@ mod tests {
use super::*;
use crate::row_converter::SortField;
+ use crate::test_util::column_metadata_to_column_schema;
fn schema_for_test() -> RegionMetadataRef {
let mut builder = RegionMetadataBuilder::new(RegionId::new(123, 456));
@@ -1242,18 +1261,54 @@ mod tests {
let mut handles = Vec::with_capacity(concurrency);
for i in 0..concurrency {
let set = set.clone();
+ let schema = schema.clone();
+ let column_schemas = schema
+ .column_metadatas
+ .iter()
+ .map(column_metadata_to_column_schema)
+                .collect::<Vec<_>>();
let handle = std::thread::spawn(move || {
for j in i * 100..(i + 1) * 100 {
let pk = j % pk_num;
let primary_key = format!("pk-{}", pk).as_bytes().to_vec();
- let (series, _) = set.get_or_add_series(primary_key);
- let mut guard = series.write().unwrap();
- guard.push(
- ts_value_ref(j as i64),
- j as u64,
- OpType::Put,
- field_value_ref(j as i64, j as f64),
- );
+
+ let kvs = KeyValues::new(
+ &schema,
+ Mutation {
+ op_type: OpType::Put as i32,
+ sequence: j as u64,
+ rows: Some(Rows {
+ schema: column_schemas.clone(),
+ rows: vec![Row {
+ values: vec![
+ api::v1::Value {
+ value_data: Some(ValueData::StringValue(format!(
+ "{}",
+ j
+ ))),
+ },
+ api::v1::Value {
+ value_data: Some(ValueData::I64Value(j as i64)),
+ },
+ api::v1::Value {
+ value_data: Some(ValueData::TimestampMillisecondValue(
+ j as i64,
+ )),
+ },
+ api::v1::Value {
+ value_data: Some(ValueData::I64Value(j as i64)),
+ },
+ api::v1::Value {
+ value_data: Some(ValueData::F64Value(j as f64)),
+ },
+ ],
+ }],
+ }),
+ write_hint: None,
+ },
+ )
+ .unwrap();
+ set.push_to_series(primary_key, &kvs.iter().next().unwrap());
}
});
handles.push(handle);
@@ -1269,7 +1324,7 @@ mod tests {
for i in 0..pk_num {
let pk = format!("pk-{}", i).as_bytes().to_vec();
- let (series, _) = set.get_or_add_series(pk);
+ let series = set.get_series(&pk).unwrap();
let mut guard = series.write().unwrap();
let values = guard.compact(&schema).unwrap();
timestamps.extend(values.sequence.iter_data().map(|v| v.unwrap() as i64));
@@ -1385,4 +1440,95 @@ mod tests {
}
assert_eq!((0..100i64).collect::<Vec<_>>(), v0_all);
}
+
+ #[test]
+ fn test_memtable_concurrent_write_read() {
+ common_telemetry::init_default_ut_logging();
+ let schema = schema_for_test();
+ let memtable = Arc::new(TimeSeriesMemtable::new(
+ schema.clone(),
+ 42,
+ None,
+ true,
+ MergeMode::LastRow,
+ ));
+
+ // Number of writer threads
+ let num_writers = 10;
+ // Number of reader threads
+ let num_readers = 5;
+ // Number of series per writer
+ let series_per_writer = 100;
+ // Number of rows per series
+ let rows_per_series = 10;
+ // Total number of series
+ let total_series = num_writers * series_per_writer;
+
+ // Create a barrier to synchronize the start of all threads
+ let barrier = Arc::new(std::sync::Barrier::new(num_writers + num_readers + 1));
+
+ // Spawn writer threads
+ let mut writer_handles = Vec::with_capacity(num_writers);
+ for writer_id in 0..num_writers {
+ let memtable = memtable.clone();
+ let schema = schema.clone();
+ let barrier = barrier.clone();
+
+ let handle = std::thread::spawn(move || {
+ // Wait for all threads to be ready
+ barrier.wait();
+
+ // Create and write series
+ for series_id in 0..series_per_writer {
+ let series_key = format!("writer-{}-series-{}", writer_id, series_id);
+ let kvs =
+ build_key_values(&schema, series_key, series_id as i64, rows_per_series);
+ memtable.write(&kvs).unwrap();
+ }
+ });
+
+ writer_handles.push(handle);
+ }
+
+ // Spawn reader threads
+ let mut reader_handles = Vec::with_capacity(num_readers);
+ for _ in 0..num_readers {
+ let memtable = memtable.clone();
+ let barrier = barrier.clone();
+
+ let handle = std::thread::spawn(move || {
+ barrier.wait();
+
+ for _ in 0..10 {
+ let iter = memtable.iter(None, None, None).unwrap();
+ for batch_result in iter {
+ let _ = batch_result.unwrap();
+ }
+ }
+ });
+
+ reader_handles.push(handle);
+ }
+
+ barrier.wait();
+
+ for handle in writer_handles {
+ handle.join().unwrap();
+ }
+ for handle in reader_handles {
+ handle.join().unwrap();
+ }
+
+ let iter = memtable.iter(None, None, None).unwrap();
+ let mut series_count = 0;
+ let mut row_count = 0;
+
+ for batch_result in iter {
+ let batch = batch_result.unwrap();
+ series_count += 1;
+ row_count += batch.num_rows();
+ }
+ assert_eq!(total_series, series_count);
+ assert_eq!(total_series * rows_per_series, row_count);
+ }
}
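[Editor's note] The panic fix boils down to pushing the value under the per-series lock in the same call that may create the series, so no reader can observe a series that is registered but still empty. A reduced, self-contained sketch of that pattern, with a `HashMap` and `Vec<i64>` standing in for the real series map and `Series`:

```rust
use std::collections::hash_map::Entry;
use std::collections::HashMap;
use std::sync::{Arc, RwLock};

struct SeriesSetSketch {
    series: RwLock<HashMap<Vec<u8>, Arc<RwLock<Vec<i64>>>>>,
}

impl SeriesSetSketch {
    /// Returns (key bytes allocated, value bytes allocated), like `push_to_series` above.
    fn push(&self, primary_key: Vec<u8>, value: i64) -> (usize, usize) {
        // Fast path: the series exists, only the map's read lock is needed.
        if let Some(series) = self.series.read().unwrap().get(&primary_key) {
            series.write().unwrap().push(value);
            return (0, std::mem::size_of::<i64>());
        }
        // Slow path: create-or-push atomically under the map's write lock.
        let mut map = self.series.write().unwrap();
        match map.entry(primary_key) {
            Entry::Vacant(vacant) => {
                let key_len = vacant.key().len();
                vacant.insert(Arc::new(RwLock::new(vec![value])));
                (key_len, std::mem::size_of::<i64>())
            }
            Entry::Occupied(occupied) => {
                occupied.get().write().unwrap().push(value);
                (0, std::mem::size_of::<i64>())
            }
        }
    }
}

fn main() {
    let set = SeriesSetSketch { series: RwLock::new(HashMap::new()) };
    assert_eq!(set.push(b"pk-0".to_vec(), 1).0, 4); // first push accounts for the key bytes
    assert_eq!(set.push(b"pk-0".to_vec(), 2).0, 0); // later pushes do not
}
```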
From 8d485e9be0da767db281af758de4859a20ffab0e Mon Sep 17 00:00:00 2001
From: Zhenchi
Date: Tue, 15 Apr 2025 14:36:06 +0800
Subject: [PATCH 18/82] feat: support altering fulltext backend (#5896)
* feat: add `greptime_index_type` to `information_schema.key_column_usage`
Signed-off-by: Zhenchi
* fix: show create
Signed-off-by: Zhenchi
---------
Signed-off-by: Zhenchi
---
.../information_schema/key_column_usage.rs | 69 +++++++--
.../information_schema/table_constraints.rs | 6 +-
src/datatypes/src/schema/column_schema.rs | 2 +-
src/query/src/sql.rs | 34 +---
src/query/src/sql/show_create_table.rs | 10 +-
src/store-api/src/metadata.rs | 16 +-
src/table/src/metadata.rs | 16 +-
tests-integration/tests/http.rs | 2 +-
.../alter/change_col_fulltext_options.result | 146 +++++++++++++-----
.../alter/change_col_fulltext_options.sql | 14 ++
.../common/create/create_with_fulltext.result | 74 ++++-----
.../standalone/common/show/show_create.result | 28 ++--
.../standalone/common/show/show_index.result | 36 ++---
.../common/system/information_schema.result | 32 ++--
14 files changed, 294 insertions(+), 191 deletions(-)
diff --git a/src/catalog/src/system_schema/information_schema/key_column_usage.rs b/src/catalog/src/system_schema/information_schema/key_column_usage.rs
index 9f08839303..ffcd5eaaa5 100644
--- a/src/catalog/src/system_schema/information_schema/key_column_usage.rs
+++ b/src/catalog/src/system_schema/information_schema/key_column_usage.rs
@@ -24,7 +24,7 @@ use datafusion::physical_plan::stream::RecordBatchStreamAdapter as DfRecordBatch
use datafusion::physical_plan::streaming::PartitionStream as DfPartitionStream;
use datafusion::physical_plan::SendableRecordBatchStream as DfSendableRecordBatchStream;
use datatypes::prelude::{ConcreteDataType, MutableVector, ScalarVectorBuilder, VectorRef};
-use datatypes::schema::{ColumnSchema, Schema, SchemaRef};
+use datatypes::schema::{ColumnSchema, FulltextBackend, Schema, SchemaRef};
use datatypes::value::Value;
use datatypes::vectors::{ConstantVector, StringVector, StringVectorBuilder, UInt32VectorBuilder};
use futures_util::TryStreamExt;
@@ -47,20 +47,38 @@ pub const TABLE_SCHEMA: &str = "table_schema";
pub const TABLE_NAME: &str = "table_name";
pub const COLUMN_NAME: &str = "column_name";
pub const ORDINAL_POSITION: &str = "ordinal_position";
+/// The type of the index.
+pub const GREPTIME_INDEX_TYPE: &str = "greptime_index_type";
const INIT_CAPACITY: usize = 42;
-/// Primary key constraint name
-pub(crate) const PRI_CONSTRAINT_NAME: &str = "PRIMARY";
/// Time index constraint name
-pub(crate) const TIME_INDEX_CONSTRAINT_NAME: &str = "TIME INDEX";
+pub(crate) const CONSTRAINT_NAME_TIME_INDEX: &str = "TIME INDEX";
+
+/// Primary key constraint name
+pub(crate) const CONSTRAINT_NAME_PRI: &str = "PRIMARY";
+/// Primary key index type
+pub(crate) const INDEX_TYPE_PRI: &str = "greptime-primary-key-v1";
+
/// Inverted index constraint name
-pub(crate) const INVERTED_INDEX_CONSTRAINT_NAME: &str = "INVERTED INDEX";
+pub(crate) const CONSTRAINT_NAME_INVERTED_INDEX: &str = "INVERTED INDEX";
+/// Inverted index type
+pub(crate) const INDEX_TYPE_INVERTED_INDEX: &str = "greptime-inverted-index-v1";
+
/// Fulltext index constraint name
-pub(crate) const FULLTEXT_INDEX_CONSTRAINT_NAME: &str = "FULLTEXT INDEX";
+pub(crate) const CONSTRAINT_NAME_FULLTEXT_INDEX: &str = "FULLTEXT INDEX";
+/// Fulltext index v1 type
+pub(crate) const INDEX_TYPE_FULLTEXT_TANTIVY: &str = "greptime-fulltext-index-v1";
+/// Fulltext index bloom type
+pub(crate) const INDEX_TYPE_FULLTEXT_BLOOM: &str = "greptime-fulltext-index-bloom";
+
/// Skipping index constraint name
-pub(crate) const SKIPPING_INDEX_CONSTRAINT_NAME: &str = "SKIPPING INDEX";
+pub(crate) const CONSTRAINT_NAME_SKIPPING_INDEX: &str = "SKIPPING INDEX";
+/// Skipping index type
+pub(crate) const INDEX_TYPE_SKIPPING_INDEX: &str = "greptime-bloom-filter-v1";
/// The virtual table implementation for `information_schema.KEY_COLUMN_USAGE`.
+///
+/// Provides an extra column `greptime_index_type` for the index type of the key column.
#[derive(Debug)]
pub(super) struct InformationSchemaKeyColumnUsage {
schema: SchemaRef,
@@ -120,6 +138,11 @@ impl InformationSchemaKeyColumnUsage {
ConcreteDataType::string_datatype(),
true,
),
+ ColumnSchema::new(
+ GREPTIME_INDEX_TYPE,
+ ConcreteDataType::string_datatype(),
+ true,
+ ),
]))
}
@@ -184,6 +207,7 @@ struct InformationSchemaKeyColumnUsageBuilder {
column_name: StringVectorBuilder,
ordinal_position: UInt32VectorBuilder,
position_in_unique_constraint: UInt32VectorBuilder,
+ greptime_index_type: StringVectorBuilder,
}
impl InformationSchemaKeyColumnUsageBuilder {
@@ -206,6 +230,7 @@ impl InformationSchemaKeyColumnUsageBuilder {
column_name: StringVectorBuilder::with_capacity(INIT_CAPACITY),
ordinal_position: UInt32VectorBuilder::with_capacity(INIT_CAPACITY),
position_in_unique_constraint: UInt32VectorBuilder::with_capacity(INIT_CAPACITY),
+ greptime_index_type: StringVectorBuilder::with_capacity(INIT_CAPACITY),
}
}
@@ -229,34 +254,47 @@ impl InformationSchemaKeyColumnUsageBuilder {
for (idx, column) in schema.column_schemas().iter().enumerate() {
let mut constraints = vec![];
+ let mut greptime_index_type = vec![];
if column.is_time_index() {
self.add_key_column_usage(
&predicates,
&schema_name,
- TIME_INDEX_CONSTRAINT_NAME,
+ CONSTRAINT_NAME_TIME_INDEX,
&catalog_name,
&schema_name,
table_name,
&column.name,
1, //always 1 for time index
+ "",
);
}
// TODO(dimbtp): foreign key constraint not supported yet
if keys.contains(&idx) {
- constraints.push(PRI_CONSTRAINT_NAME);
+ constraints.push(CONSTRAINT_NAME_PRI);
+ greptime_index_type.push(INDEX_TYPE_PRI);
}
if column.is_inverted_indexed() {
- constraints.push(INVERTED_INDEX_CONSTRAINT_NAME);
+ constraints.push(CONSTRAINT_NAME_INVERTED_INDEX);
+ greptime_index_type.push(INDEX_TYPE_INVERTED_INDEX);
}
- if column.is_fulltext_indexed() {
- constraints.push(FULLTEXT_INDEX_CONSTRAINT_NAME);
+ if let Ok(Some(options)) = column.fulltext_options() {
+ if options.enable {
+ constraints.push(CONSTRAINT_NAME_FULLTEXT_INDEX);
+ let index_type = match options.backend {
+ FulltextBackend::Bloom => INDEX_TYPE_FULLTEXT_BLOOM,
+ FulltextBackend::Tantivy => INDEX_TYPE_FULLTEXT_TANTIVY,
+ };
+ greptime_index_type.push(index_type);
+ }
}
if column.is_skipping_indexed() {
- constraints.push(SKIPPING_INDEX_CONSTRAINT_NAME);
+ constraints.push(CONSTRAINT_NAME_SKIPPING_INDEX);
+ greptime_index_type.push(INDEX_TYPE_SKIPPING_INDEX);
}
if !constraints.is_empty() {
let aggregated_constraints = constraints.join(", ");
+ let aggregated_index_types = greptime_index_type.join(", ");
self.add_key_column_usage(
&predicates,
&schema_name,
@@ -266,6 +304,7 @@ impl InformationSchemaKeyColumnUsageBuilder {
table_name,
&column.name,
idx as u32 + 1,
+ &aggregated_index_types,
);
}
}
@@ -288,6 +327,7 @@ impl InformationSchemaKeyColumnUsageBuilder {
table_name: &str,
column_name: &str,
ordinal_position: u32,
+ index_types: &str,
) {
let row = [
(CONSTRAINT_SCHEMA, &Value::from(constraint_schema)),
@@ -297,6 +337,7 @@ impl InformationSchemaKeyColumnUsageBuilder {
(TABLE_NAME, &Value::from(table_name)),
(COLUMN_NAME, &Value::from(column_name)),
(ORDINAL_POSITION, &Value::from(ordinal_position)),
+ (GREPTIME_INDEX_TYPE, &Value::from(index_types)),
];
if !predicates.eval(&row) {
@@ -313,6 +354,7 @@ impl InformationSchemaKeyColumnUsageBuilder {
self.column_name.push(Some(column_name));
self.ordinal_position.push(Some(ordinal_position));
self.position_in_unique_constraint.push(None);
+ self.greptime_index_type.push(Some(index_types));
}
fn finish(&mut self) -> Result {
@@ -336,6 +378,7 @@ impl InformationSchemaKeyColumnUsageBuilder {
null_string_vector.clone(),
null_string_vector.clone(),
null_string_vector,
+ Arc::new(self.greptime_index_type.finish()),
];
RecordBatch::new(self.schema.clone(), columns).context(CreateRecordBatchSnafu)
}
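[Editor's note] A small standalone sketch of the aggregation logic above: the constraint names and the new `greptime_index_type` values are collected in lockstep and joined with ", " so both columns stay aligned per key column. The string constants mirror the ones defined in this file; the boolean flags are simplifications of the real schema checks.

```rust
fn aggregate_index_info(
    is_primary_key: bool,
    inverted_indexed: bool,
    fulltext_bloom: bool,
    skipping_indexed: bool,
) -> (String, String) {
    let mut constraints = Vec::new();
    let mut index_types = Vec::new();
    if is_primary_key {
        constraints.push("PRIMARY");
        index_types.push("greptime-primary-key-v1");
    }
    if inverted_indexed {
        constraints.push("INVERTED INDEX");
        index_types.push("greptime-inverted-index-v1");
    }
    if fulltext_bloom {
        constraints.push("FULLTEXT INDEX");
        index_types.push("greptime-fulltext-index-bloom");
    }
    if skipping_indexed {
        constraints.push("SKIPPING INDEX");
        index_types.push("greptime-bloom-filter-v1");
    }
    (constraints.join(", "), index_types.join(", "))
}

fn main() {
    let (constraints, index_types) = aggregate_index_info(true, false, true, false);
    assert_eq!(constraints, "PRIMARY, FULLTEXT INDEX");
    assert_eq!(index_types, "greptime-primary-key-v1, greptime-fulltext-index-bloom");
}
```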
diff --git a/src/catalog/src/system_schema/information_schema/table_constraints.rs b/src/catalog/src/system_schema/information_schema/table_constraints.rs
index a1f9d899f4..77ac93632f 100644
--- a/src/catalog/src/system_schema/information_schema/table_constraints.rs
+++ b/src/catalog/src/system_schema/information_schema/table_constraints.rs
@@ -36,7 +36,7 @@ use crate::error::{
CreateRecordBatchSnafu, InternalSnafu, Result, UpgradeWeakCatalogManagerRefSnafu,
};
use crate::information_schema::key_column_usage::{
- PRI_CONSTRAINT_NAME, TIME_INDEX_CONSTRAINT_NAME,
+ CONSTRAINT_NAME_PRI, CONSTRAINT_NAME_TIME_INDEX,
};
use crate::information_schema::Predicates;
use crate::system_schema::information_schema::{InformationTable, TABLE_CONSTRAINTS};
@@ -188,7 +188,7 @@ impl InformationSchemaTableConstraintsBuilder {
self.add_table_constraint(
&predicates,
&schema_name,
- TIME_INDEX_CONSTRAINT_NAME,
+ CONSTRAINT_NAME_TIME_INDEX,
&schema_name,
&table.table_info().name,
TIME_INDEX_CONSTRAINT_TYPE,
@@ -199,7 +199,7 @@ impl InformationSchemaTableConstraintsBuilder {
self.add_table_constraint(
&predicates,
&schema_name,
- PRI_CONSTRAINT_NAME,
+ CONSTRAINT_NAME_PRI,
&schema_name,
&table.table_info().name,
PRI_KEY_CONSTRAINT_TYPE,
diff --git a/src/datatypes/src/schema/column_schema.rs b/src/datatypes/src/schema/column_schema.rs
index 9a975c4008..376c9e6de0 100644
--- a/src/datatypes/src/schema/column_schema.rs
+++ b/src/datatypes/src/schema/column_schema.rs
@@ -537,8 +537,8 @@ impl fmt::Display for FulltextOptions {
#[serde(rename_all = "kebab-case")]
pub enum FulltextBackend {
#[default]
+ Bloom,
Tantivy,
- Bloom, // TODO(zhongzc): when bloom is ready, use it as default
}
impl fmt::Display for FulltextBackend {
diff --git a/src/query/src/sql.rs b/src/query/src/sql.rs
index b62289fb6b..8f823fe809 100644
--- a/src/query/src/sql.rs
+++ b/src/query/src/sql.rs
@@ -40,7 +40,7 @@ use common_recordbatch::RecordBatches;
use common_time::timezone::get_timezone;
use common_time::Timestamp;
use datafusion::common::ScalarValue;
-use datafusion::prelude::{concat_ws, SessionContext};
+use datafusion::prelude::SessionContext;
use datafusion_expr::expr::WildcardOptions;
use datafusion_expr::{case, col, lit, Expr, SortExpr};
use datatypes::prelude::*;
@@ -399,23 +399,6 @@ pub async fn show_index(
query_ctx.current_schema()
};
- let primary_key_expr = case(col("constraint_name").like(lit("%PRIMARY%")))
- .when(lit(true), lit("greptime-primary-key-v1"))
- .otherwise(null())
- .context(error::PlanSqlSnafu)?;
- let inverted_index_expr = case(col("constraint_name").like(lit("%INVERTED INDEX%")))
- .when(lit(true), lit("greptime-inverted-index-v1"))
- .otherwise(null())
- .context(error::PlanSqlSnafu)?;
- let fulltext_index_expr = case(col("constraint_name").like(lit("%FULLTEXT INDEX%")))
- .when(lit(true), lit("greptime-fulltext-index-v1"))
- .otherwise(null())
- .context(error::PlanSqlSnafu)?;
- let skipping_index_expr = case(col("constraint_name").like(lit("%SKIPPING INDEX%")))
- .when(lit(true), lit("greptime-bloom-filter-v1"))
- .otherwise(null())
- .context(error::PlanSqlSnafu)?;
-
let select = vec![
// 1 as `Non_unique`: contain duplicates
lit(1).alias(INDEX_NONT_UNIQUE_COLUMN),
@@ -433,16 +416,6 @@ pub async fn show_index(
.otherwise(lit(YES_STR))
.context(error::PlanSqlSnafu)?
.alias(COLUMN_NULLABLE_COLUMN),
- concat_ws(
- lit(", "),
- vec![
- primary_key_expr,
- inverted_index_expr,
- fulltext_index_expr,
- skipping_index_expr,
- ],
- )
- .alias(INDEX_INDEX_TYPE_COLUMN),
lit("").alias(COLUMN_COMMENT_COLUMN),
lit("").alias(INDEX_COMMENT_COLUMN),
lit(YES_STR).alias(INDEX_VISIBLE_COLUMN),
@@ -467,7 +440,10 @@ pub async fn show_index(
(INDEX_SUB_PART_COLUMN, INDEX_SUB_PART_COLUMN),
(INDEX_PACKED_COLUMN, INDEX_PACKED_COLUMN),
(COLUMN_NULLABLE_COLUMN, COLUMN_NULLABLE_COLUMN),
- (INDEX_INDEX_TYPE_COLUMN, INDEX_INDEX_TYPE_COLUMN),
+ (
+ key_column_usage::GREPTIME_INDEX_TYPE,
+ INDEX_INDEX_TYPE_COLUMN,
+ ),
(COLUMN_COMMENT_COLUMN, COLUMN_COMMENT_COLUMN),
(INDEX_COMMENT_COLUMN, INDEX_COMMENT_COLUMN),
(INDEX_VISIBLE_COLUMN, INDEX_VISIBLE_COLUMN),
diff --git a/src/query/src/sql/show_create_table.rs b/src/query/src/sql/show_create_table.rs
index 3eebfbc03e..bc004f514e 100644
--- a/src/query/src/sql/show_create_table.rs
+++ b/src/query/src/sql/show_create_table.rs
@@ -19,8 +19,8 @@ use std::collections::HashMap;
use common_meta::SchemaOptions;
use datatypes::schema::{
ColumnDefaultConstraint, ColumnSchema, SchemaRef, COLUMN_FULLTEXT_OPT_KEY_ANALYZER,
- COLUMN_FULLTEXT_OPT_KEY_CASE_SENSITIVE, COLUMN_SKIPPING_INDEX_OPT_KEY_GRANULARITY,
- COLUMN_SKIPPING_INDEX_OPT_KEY_TYPE, COMMENT_KEY,
+ COLUMN_FULLTEXT_OPT_KEY_BACKEND, COLUMN_FULLTEXT_OPT_KEY_CASE_SENSITIVE,
+ COLUMN_SKIPPING_INDEX_OPT_KEY_GRANULARITY, COLUMN_SKIPPING_INDEX_OPT_KEY_TYPE, COMMENT_KEY,
};
use snafu::ResultExt;
use sql::ast::{ColumnDef, ColumnOption, ColumnOptionDef, Expr, Ident, ObjectName};
@@ -113,6 +113,10 @@ fn create_column(column_schema: &ColumnSchema, quote_style: char) -> Result,
) -> Result<()> {
if let Some(current_options) = current_options {
- ensure!(
- !current_options.enable,
- InvalidColumnOptionSnafu {
- column_name,
- msg: "FULLTEXT index already enabled".to_string(),
- }
- );
-
ensure!(
current_options.analyzer == options.analyzer
&& current_options.case_sensitive == options.case_sensitive,
diff --git a/src/table/src/metadata.rs b/src/table/src/metadata.rs
index a457afe107..4235588ff0 100644
--- a/src/table/src/metadata.rs
+++ b/src/table/src/metadata.rs
@@ -1149,6 +1149,14 @@ impl TryFrom<RawTableInfo> for TableInfo {
}
}
+/// Set the column's fulltext options if they pass validation.
+///
+/// Options allowed to modify:
+/// * backend
+///
+/// Options not allowed to modify:
+/// * analyzer
+/// * case_sensitive
fn set_column_fulltext_options(
column_schema: &mut ColumnSchema,
column_name: &str,
@@ -1156,14 +1164,6 @@ fn set_column_fulltext_options(
current_options: Option<FulltextOptions>,
) -> Result<()> {
if let Some(current_options) = current_options {
- ensure!(
- !current_options.enable,
- error::InvalidColumnOptionSnafu {
- column_name,
- msg: "FULLTEXT index already enabled",
- }
- );
-
ensure!(
current_options.analyzer == options.analyzer
&& current_options.case_sensitive == options.case_sensitive,
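
The removed `ensure!` blocks relax the alter path as described in the new doc comment: an existing FULLTEXT index no longer blocks a second `SET FULLTEXT INDEX`; only a change of `analyzer` or `case_sensitive` does, while `backend` may be switched freely. A simplified sketch of that rule (hypothetical types, not the actual `set_column_fulltext_options`):

```rust
// Simplified sketch of the relaxed validation (hypothetical types): re-issuing
// `SET FULLTEXT INDEX` only fails when `analyzer` or `case_sensitive` differ from the
// previously stored options; `backend` is free to change.
#[derive(Debug, Clone, PartialEq)]
struct FulltextOptions {
    analyzer: String,
    case_sensitive: bool,
    backend: String,
}

fn validate_fulltext_change(
    current: Option<&FulltextOptions>,
    new: &FulltextOptions,
) -> Result<(), String> {
    if let Some(current) = current {
        if current.analyzer != new.analyzer || current.case_sensitive != new.case_sensitive {
            return Err(format!(
                "Cannot change analyzer or case_sensitive if FULLTEXT index is set before. \
                 Previous analyzer: {}, previous case_sensitive: {}",
                current.analyzer, current.case_sensitive
            ));
        }
        // `backend` is intentionally not compared, so switching between
        // bloom and tantivy is allowed.
    }
    Ok(())
}

fn main() {
    let current = FulltextOptions {
        analyzer: "Chinese".into(),
        case_sensitive: true,
        backend: "bloom".into(),
    };
    // Changing only the backend is accepted.
    let to_tantivy = FulltextOptions { backend: "tantivy".into(), ..current.clone() };
    assert!(validate_fulltext_change(Some(&current), &to_tantivy).is_ok());
    // Changing the analyzer is rejected, matching the error asserted in the tests below.
    let to_english = FulltextOptions { analyzer: "English".into(), ..current.clone() };
    assert!(validate_fulltext_change(Some(&current), &to_english).is_err());
}
```
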
diff --git a/tests-integration/tests/http.rs b/tests-integration/tests/http.rs
index ffb74e1b16..6eb4d10562 100644
--- a/tests-integration/tests/http.rs
+++ b/tests-integration/tests/http.rs
@@ -1371,7 +1371,7 @@ transform:
assert_eq!(res.status(), StatusCode::OK);
// 3. check schema
- let expected_schema = "[[\"logs1\",\"CREATE TABLE IF NOT EXISTS \\\"logs1\\\" (\\n \\\"id1\\\" INT NULL INVERTED INDEX,\\n \\\"id2\\\" INT NULL INVERTED INDEX,\\n \\\"logger\\\" STRING NULL,\\n \\\"type\\\" STRING NULL SKIPPING INDEX WITH(granularity = '10240', type = 'BLOOM'),\\n \\\"log\\\" STRING NULL FULLTEXT INDEX WITH(analyzer = 'English', case_sensitive = 'false'),\\n \\\"time\\\" TIMESTAMP(9) NOT NULL,\\n TIME INDEX (\\\"time\\\"),\\n PRIMARY KEY (\\\"type\\\", \\\"log\\\")\\n)\\n\\nENGINE=mito\\nWITH(\\n append_mode = 'true'\\n)\"]]";
+ let expected_schema = "[[\"logs1\",\"CREATE TABLE IF NOT EXISTS \\\"logs1\\\" (\\n \\\"id1\\\" INT NULL INVERTED INDEX,\\n \\\"id2\\\" INT NULL INVERTED INDEX,\\n \\\"logger\\\" STRING NULL,\\n \\\"type\\\" STRING NULL SKIPPING INDEX WITH(granularity = '10240', type = 'BLOOM'),\\n \\\"log\\\" STRING NULL FULLTEXT INDEX WITH(analyzer = 'English', backend = 'bloom', case_sensitive = 'false'),\\n \\\"time\\\" TIMESTAMP(9) NOT NULL,\\n TIME INDEX (\\\"time\\\"),\\n PRIMARY KEY (\\\"type\\\", \\\"log\\\")\\n)\\n\\nENGINE=mito\\nWITH(\\n append_mode = 'true'\\n)\"]]";
validate_data(
"pipeline_schema",
&client,
diff --git a/tests/cases/standalone/common/alter/change_col_fulltext_options.result b/tests/cases/standalone/common/alter/change_col_fulltext_options.result
index ee400593cc..13202ae12c 100644
--- a/tests/cases/standalone/common/alter/change_col_fulltext_options.result
+++ b/tests/cases/standalone/common/alter/change_col_fulltext_options.result
@@ -79,29 +79,29 @@ SELECT * FROM test WHERE MATCHES(message, 'hello') ORDER BY message;
-- SQLNESS ARG restart=true
SHOW CREATE TABLE test;
-+-------+---------------------------------------------------------------------------------------------+
-| Table | Create Table |
-+-------+---------------------------------------------------------------------------------------------+
-| test | CREATE TABLE IF NOT EXISTS "test" ( |
-| | "message" STRING NULL FULLTEXT INDEX WITH(analyzer = 'Chinese', case_sensitive = 'true'), |
-| | "time" TIMESTAMP(3) NOT NULL, |
-| | TIME INDEX ("time") |
-| | ) |
-| | |
-| | ENGINE=mito |
-| | WITH( |
-| | append_mode = 'true' |
-| | ) |
-+-------+---------------------------------------------------------------------------------------------+
++-------+----------------------------------------------------------------------------------------------------------------+
+| Table | Create Table |
++-------+----------------------------------------------------------------------------------------------------------------+
+| test | CREATE TABLE IF NOT EXISTS "test" ( |
+| | "message" STRING NULL FULLTEXT INDEX WITH(analyzer = 'Chinese', backend = 'bloom', case_sensitive = 'true'), |
+| | "time" TIMESTAMP(3) NOT NULL, |
+| | TIME INDEX ("time") |
+| | ) |
+| | |
+| | ENGINE=mito |
+| | WITH( |
+| | append_mode = 'true' |
+| | ) |
++-------+----------------------------------------------------------------------------------------------------------------+
SHOW INDEX FROM test;
-+-------+------------+----------------+--------------+-------------+-----------+-------------+----------+--------+------+----------------------------+---------+---------------+---------+------------+
-| Table | Non_unique | Key_name | Seq_in_index | Column_name | Collation | Cardinality | Sub_part | Packed | Null | Index_type | Comment | Index_comment | Visible | Expression |
-+-------+------------+----------------+--------------+-------------+-----------+-------------+----------+--------+------+----------------------------+---------+---------------+---------+------------+
-| test | 1 | FULLTEXT INDEX | 1 | message | A | | | | YES | greptime-fulltext-index-v1 | | | YES | |
-| test | 1 | TIME INDEX | 1 | time | A | | | | NO | | | | YES | |
-+-------+------------+----------------+--------------+-------------+-----------+-------------+----------+--------+------+----------------------------+---------+---------------+---------+------------+
++-------+------------+----------------+--------------+-------------+-----------+-------------+----------+--------+------+-------------------------------+---------+---------------+---------+------------+
+| Table | Non_unique | Key_name | Seq_in_index | Column_name | Collation | Cardinality | Sub_part | Packed | Null | Index_type | Comment | Index_comment | Visible | Expression |
++-------+------------+----------------+--------------+-------------+-----------+-------------+----------+--------+------+-------------------------------+---------+---------------+---------+------------+
+| test | 1 | FULLTEXT INDEX | 1 | message | A | | | | YES | greptime-fulltext-index-bloom | | | YES | |
+| test | 1 | TIME INDEX | 1 | time | A | | | | NO | | | | YES | |
++-------+------------+----------------+--------------+-------------+-----------+-------------+----------+--------+------+-------------------------------+---------+---------------+---------+------------+
ALTER TABLE test MODIFY COLUMN message UNSET FULLTEXT INDEX;
@@ -138,33 +138,33 @@ Affected Rows: 0
SHOW CREATE TABLE test;
-+-------+---------------------------------------------------------------------------------------------+
-| Table | Create Table |
-+-------+---------------------------------------------------------------------------------------------+
-| test | CREATE TABLE IF NOT EXISTS "test" ( |
-| | "message" STRING NULL FULLTEXT INDEX WITH(analyzer = 'Chinese', case_sensitive = 'true'), |
-| | "time" TIMESTAMP(3) NOT NULL, |
-| | TIME INDEX ("time") |
-| | ) |
-| | |
-| | ENGINE=mito |
-| | WITH( |
-| | append_mode = 'true' |
-| | ) |
-+-------+---------------------------------------------------------------------------------------------+
++-------+----------------------------------------------------------------------------------------------------------------+
+| Table | Create Table |
++-------+----------------------------------------------------------------------------------------------------------------+
+| test | CREATE TABLE IF NOT EXISTS "test" ( |
+| | "message" STRING NULL FULLTEXT INDEX WITH(analyzer = 'Chinese', backend = 'bloom', case_sensitive = 'true'), |
+| | "time" TIMESTAMP(3) NOT NULL, |
+| | TIME INDEX ("time") |
+| | ) |
+| | |
+| | ENGINE=mito |
+| | WITH( |
+| | append_mode = 'true' |
+| | ) |
++-------+----------------------------------------------------------------------------------------------------------------+
SHOW INDEX FROM test;
-+-------+------------+----------------+--------------+-------------+-----------+-------------+----------+--------+------+----------------------------+---------+---------------+---------+------------+
-| Table | Non_unique | Key_name | Seq_in_index | Column_name | Collation | Cardinality | Sub_part | Packed | Null | Index_type | Comment | Index_comment | Visible | Expression |
-+-------+------------+----------------+--------------+-------------+-----------+-------------+----------+--------+------+----------------------------+---------+---------------+---------+------------+
-| test | 1 | FULLTEXT INDEX | 1 | message | A | | | | YES | greptime-fulltext-index-v1 | | | YES | |
-| test | 1 | TIME INDEX | 1 | time | A | | | | NO | | | | YES | |
-+-------+------------+----------------+--------------+-------------+-----------+-------------+----------+--------+------+----------------------------+---------+---------------+---------+------------+
++-------+------------+----------------+--------------+-------------+-----------+-------------+----------+--------+------+-------------------------------+---------+---------------+---------+------------+
+| Table | Non_unique | Key_name | Seq_in_index | Column_name | Collation | Cardinality | Sub_part | Packed | Null | Index_type | Comment | Index_comment | Visible | Expression |
++-------+------------+----------------+--------------+-------------+-----------+-------------+----------+--------+------+-------------------------------+---------+---------------+---------+------------+
+| test | 1 | FULLTEXT INDEX | 1 | message | A | | | | YES | greptime-fulltext-index-bloom | | | YES | |
+| test | 1 | TIME INDEX | 1 | time | A | | | | NO | | | | YES | |
++-------+------------+----------------+--------------+-------------+-----------+-------------+----------+--------+------+-------------------------------+---------+---------------+---------+------------+
ALTER TABLE test MODIFY COLUMN message SET FULLTEXT INDEX WITH(analyzer = 'Chinese', case_sensitive = 'false');
-Error: 1004(InvalidArguments), Invalid column option, column name: message, error: FULLTEXT index already enabled
+Error: 1004(InvalidArguments), Invalid column option, column name: message, error: Cannot change analyzer or case_sensitive if FULLTEXT index is set before. Previous analyzer: Chinese, previous case_sensitive: true
ALTER TABLE test MODIFY COLUMN message UNSET FULLTEXT INDEX;
@@ -195,6 +195,66 @@ SHOW INDEX FROM test;
| test | 1 | TIME INDEX | 1 | time | A | | | | NO | | | | YES | |
+-------+------------+------------+--------------+-------------+-----------+-------------+----------+--------+------+------------+---------+---------------+---------+------------+
+ALTER TABLE test MODIFY COLUMN message SET FULLTEXT INDEX WITH(analyzer = 'Chinese', case_sensitive = 'true', backend = 'bloom');
+
+Affected Rows: 0
+
+SHOW CREATE TABLE test;
+
++-------+----------------------------------------------------------------------------------------------------------------+
+| Table | Create Table |
++-------+----------------------------------------------------------------------------------------------------------------+
+| test | CREATE TABLE IF NOT EXISTS "test" ( |
+| | "message" STRING NULL FULLTEXT INDEX WITH(analyzer = 'Chinese', backend = 'bloom', case_sensitive = 'true'), |
+| | "time" TIMESTAMP(3) NOT NULL, |
+| | TIME INDEX ("time") |
+| | ) |
+| | |
+| | ENGINE=mito |
+| | WITH( |
+| | append_mode = 'true' |
+| | ) |
++-------+----------------------------------------------------------------------------------------------------------------+
+
+SHOW INDEX FROM test;
+
++-------+------------+----------------+--------------+-------------+-----------+-------------+----------+--------+------+-------------------------------+---------+---------------+---------+------------+
+| Table | Non_unique | Key_name | Seq_in_index | Column_name | Collation | Cardinality | Sub_part | Packed | Null | Index_type | Comment | Index_comment | Visible | Expression |
++-------+------------+----------------+--------------+-------------+-----------+-------------+----------+--------+------+-------------------------------+---------+---------------+---------+------------+
+| test | 1 | FULLTEXT INDEX | 1 | message | A | | | | YES | greptime-fulltext-index-bloom | | | YES | |
+| test | 1 | TIME INDEX | 1 | time | A | | | | NO | | | | YES | |
++-------+------------+----------------+--------------+-------------+-----------+-------------+----------+--------+------+-------------------------------+---------+---------------+---------+------------+
+
+ALTER TABLE test MODIFY COLUMN message SET FULLTEXT INDEX WITH(analyzer = 'Chinese', case_sensitive = 'true', backend = 'tantivy');
+
+Affected Rows: 0
+
+SHOW CREATE TABLE test;
+
++-------+------------------------------------------------------------------------------------------------------------------+
+| Table | Create Table |
++-------+------------------------------------------------------------------------------------------------------------------+
+| test | CREATE TABLE IF NOT EXISTS "test" ( |
+| | "message" STRING NULL FULLTEXT INDEX WITH(analyzer = 'Chinese', backend = 'tantivy', case_sensitive = 'true'), |
+| | "time" TIMESTAMP(3) NOT NULL, |
+| | TIME INDEX ("time") |
+| | ) |
+| | |
+| | ENGINE=mito |
+| | WITH( |
+| | append_mode = 'true' |
+| | ) |
++-------+------------------------------------------------------------------------------------------------------------------+
+
+SHOW INDEX FROM test;
+
++-------+------------+----------------+--------------+-------------+-----------+-------------+----------+--------+------+----------------------------+---------+---------------+---------+------------+
+| Table | Non_unique | Key_name | Seq_in_index | Column_name | Collation | Cardinality | Sub_part | Packed | Null | Index_type | Comment | Index_comment | Visible | Expression |
++-------+------------+----------------+--------------+-------------+-----------+-------------+----------+--------+------+----------------------------+---------+---------------+---------+------------+
+| test | 1 | FULLTEXT INDEX | 1 | message | A | | | | YES | greptime-fulltext-index-v1 | | | YES | |
+| test | 1 | TIME INDEX | 1 | time | A | | | | NO | | | | YES | |
++-------+------------+----------------+--------------+-------------+-----------+-------------+----------+--------+------+----------------------------+---------+---------------+---------+------------+
+
ALTER TABLE test MODIFY COLUMN message SET FULLTEXT INDEX WITH(analyzer = 'Chinglish', case_sensitive = 'false');
Error: 1002(Unexpected), Invalid fulltext option: Chinglish, expected: 'English' | 'Chinese'
@@ -211,6 +271,10 @@ ALTER TABLE test MODIFY COLUMN message SET FULLTEXT INDEX WITH(analyzer = 'Engli
Error: 1004(InvalidArguments), Invalid column option, column name: message, error: Cannot change analyzer or case_sensitive if FULLTEXT index is set before. Previous analyzer: Chinese, previous case_sensitive: true
+ALTER TABLE test MODIFY COLUMN message SET FULLTEXT INDEX WITH(backend = 'xor');
+
+Error: 1002(Unexpected), Invalid fulltext option: xor, expected: 'bloom' | 'tantivy'
+
DROP TABLE test;
Affected Rows: 0
diff --git a/tests/cases/standalone/common/alter/change_col_fulltext_options.sql b/tests/cases/standalone/common/alter/change_col_fulltext_options.sql
index b5ead6e610..df56e8179e 100644
--- a/tests/cases/standalone/common/alter/change_col_fulltext_options.sql
+++ b/tests/cases/standalone/common/alter/change_col_fulltext_options.sql
@@ -51,6 +51,18 @@ SHOW CREATE TABLE test;
SHOW INDEX FROM test;
+ALTER TABLE test MODIFY COLUMN message SET FULLTEXT INDEX WITH(analyzer = 'Chinese', case_sensitive = 'true', backend = 'bloom');
+
+SHOW CREATE TABLE test;
+
+SHOW INDEX FROM test;
+
+ALTER TABLE test MODIFY COLUMN message SET FULLTEXT INDEX WITH(analyzer = 'Chinese', case_sensitive = 'true', backend = 'tantivy');
+
+SHOW CREATE TABLE test;
+
+SHOW INDEX FROM test;
+
ALTER TABLE test MODIFY COLUMN message SET FULLTEXT INDEX WITH(analyzer = 'Chinglish', case_sensitive = 'false');
ALTER TABLE test MODIFY COLUMN message SET FULLTEXT INDEX WITH(analyzer = 'Chinese', case_sensitive = 'no');
@@ -59,4 +71,6 @@ ALTER TABLE test MODIFY COLUMN time SET FULLTEXT INDEX WITH(analyzer = 'Chinese'
ALTER TABLE test MODIFY COLUMN message SET FULLTEXT INDEX WITH(analyzer = 'English', case_sensitive = 'true');
+ALTER TABLE test MODIFY COLUMN message SET FULLTEXT INDEX WITH(backend = 'xor');
+
DROP TABLE test;
diff --git a/tests/cases/standalone/common/create/create_with_fulltext.result b/tests/cases/standalone/common/create/create_with_fulltext.result
index 3ab0435780..d5ae9ee2dd 100644
--- a/tests/cases/standalone/common/create/create_with_fulltext.result
+++ b/tests/cases/standalone/common/create/create_with_fulltext.result
@@ -7,18 +7,18 @@ Affected Rows: 0
SHOW CREATE TABLE log;
-+-------+------------------------------------------------------------------------------------------+
-| Table | Create Table |
-+-------+------------------------------------------------------------------------------------------+
-| log | CREATE TABLE IF NOT EXISTS "log" ( |
-| | "ts" TIMESTAMP(3) NOT NULL, |
-| | "msg" STRING NULL FULLTEXT INDEX WITH(analyzer = 'English', case_sensitive = 'false'), |
-| | TIME INDEX ("ts") |
-| | ) |
-| | |
-| | ENGINE=mito |
-| | |
-+-------+------------------------------------------------------------------------------------------+
++-------+-------------------------------------------------------------------------------------------------------------+
+| Table | Create Table |
++-------+-------------------------------------------------------------------------------------------------------------+
+| log | CREATE TABLE IF NOT EXISTS "log" ( |
+| | "ts" TIMESTAMP(3) NOT NULL, |
+| | "msg" STRING NULL FULLTEXT INDEX WITH(analyzer = 'English', backend = 'bloom', case_sensitive = 'false'), |
+| | TIME INDEX ("ts") |
+| | ) |
+| | |
+| | ENGINE=mito |
+| | |
++-------+-------------------------------------------------------------------------------------------------------------+
DROP TABLE log;
@@ -33,18 +33,18 @@ Affected Rows: 0
SHOW CREATE TABLE log_with_opts;
-+---------------+-----------------------------------------------------------------------------------------+
-| Table | Create Table |
-+---------------+-----------------------------------------------------------------------------------------+
-| log_with_opts | CREATE TABLE IF NOT EXISTS "log_with_opts" ( |
-| | "ts" TIMESTAMP(3) NOT NULL, |
-| | "msg" STRING NULL FULLTEXT INDEX WITH(analyzer = 'English', case_sensitive = 'true'), |
-| | TIME INDEX ("ts") |
-| | ) |
-| | |
-| | ENGINE=mito |
-| | |
-+---------------+-----------------------------------------------------------------------------------------+
++---------------+------------------------------------------------------------------------------------------------------------+
+| Table | Create Table |
++---------------+------------------------------------------------------------------------------------------------------------+
+| log_with_opts | CREATE TABLE IF NOT EXISTS "log_with_opts" ( |
+| | "ts" TIMESTAMP(3) NOT NULL, |
+| | "msg" STRING NULL FULLTEXT INDEX WITH(analyzer = 'English', backend = 'bloom', case_sensitive = 'true'), |
+| | TIME INDEX ("ts") |
+| | ) |
+| | |
+| | ENGINE=mito |
+| | |
++---------------+------------------------------------------------------------------------------------------------------------+
DROP TABLE log_with_opts;
@@ -60,19 +60,19 @@ Affected Rows: 0
SHOW CREATE TABLE log_multi_fulltext_cols;
-+-------------------------+-------------------------------------------------------------------------------------------+
-| Table | Create Table |
-+-------------------------+-------------------------------------------------------------------------------------------+
-| log_multi_fulltext_cols | CREATE TABLE IF NOT EXISTS "log_multi_fulltext_cols" ( |
-| | "ts" TIMESTAMP(3) NOT NULL, |
-| | "msg" STRING NULL FULLTEXT INDEX WITH(analyzer = 'English', case_sensitive = 'false'), |
-| | "msg2" STRING NULL FULLTEXT INDEX WITH(analyzer = 'English', case_sensitive = 'false'), |
-| | TIME INDEX ("ts") |
-| | ) |
-| | |
-| | ENGINE=mito |
-| | |
-+-------------------------+-------------------------------------------------------------------------------------------+
++-------------------------+--------------------------------------------------------------------------------------------------------------+
+| Table | Create Table |
++-------------------------+--------------------------------------------------------------------------------------------------------------+
+| log_multi_fulltext_cols | CREATE TABLE IF NOT EXISTS "log_multi_fulltext_cols" ( |
+| | "ts" TIMESTAMP(3) NOT NULL, |
+| | "msg" STRING NULL FULLTEXT INDEX WITH(analyzer = 'English', backend = 'bloom', case_sensitive = 'false'), |
+| | "msg2" STRING NULL FULLTEXT INDEX WITH(analyzer = 'English', backend = 'bloom', case_sensitive = 'false'), |
+| | TIME INDEX ("ts") |
+| | ) |
+| | |
+| | ENGINE=mito |
+| | |
++-------------------------+--------------------------------------------------------------------------------------------------------------+
DROP TABLE log_multi_fulltext_cols;
diff --git a/tests/cases/standalone/common/show/show_create.result b/tests/cases/standalone/common/show/show_create.result
index ddbdd4179a..5d7019265a 100644
--- a/tests/cases/standalone/common/show/show_create.result
+++ b/tests/cases/standalone/common/show/show_create.result
@@ -373,20 +373,20 @@ Affected Rows: 0
show create table test_column_constrain_composite_indexes;
-+-----------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------+
-| Table | Create Table |
-+-----------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------+
-| test_column_constrain_composite_indexes | CREATE TABLE IF NOT EXISTS "test_column_constrain_composite_indexes" ( |
-| | "id" INT NULL SKIPPING INDEX WITH(granularity = '10240', type = 'BLOOM') INVERTED INDEX, |
-| | "host" STRING NULL FULLTEXT INDEX WITH(analyzer = 'English', case_sensitive = 'false') SKIPPING INDEX WITH(granularity = '10240', type = 'BLOOM') INVERTED INDEX, |
-| | "ts" TIMESTAMP(3) NOT NULL, |
-| | TIME INDEX ("ts"), |
-| | PRIMARY KEY ("host") |
-| | ) |
-| | |
-| | ENGINE=mito |
-| | |
-+-----------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------+
++-----------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
+| Table | Create Table |
++-----------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
+| test_column_constrain_composite_indexes | CREATE TABLE IF NOT EXISTS "test_column_constrain_composite_indexes" ( |
+| | "id" INT NULL SKIPPING INDEX WITH(granularity = '10240', type = 'BLOOM') INVERTED INDEX, |
+| | "host" STRING NULL FULLTEXT INDEX WITH(analyzer = 'English', backend = 'bloom', case_sensitive = 'false') SKIPPING INDEX WITH(granularity = '10240', type = 'BLOOM') INVERTED INDEX, |
+| | "ts" TIMESTAMP(3) NOT NULL, |
+| | TIME INDEX ("ts"), |
+| | PRIMARY KEY ("host") |
+| | ) |
+| | |
+| | ENGINE=mito |
+| | |
++-----------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
drop table test_column_constrain_composite_indexes;
diff --git a/tests/cases/standalone/common/show/show_index.result b/tests/cases/standalone/common/show/show_index.result
index 0376443746..80010b5331 100644
--- a/tests/cases/standalone/common/show/show_index.result
+++ b/tests/cases/standalone/common/show/show_index.result
@@ -80,27 +80,27 @@ SHOW INDEX FROM test_no_inverted_index;
SHOW INDEX FROM system_metrics;
-+----------------+------------+-----------------------------------------+--------------+-------------+-----------+-------------+----------+--------+------+---------------------------------------------------------------------------------+---------+---------------+---------+------------+
-| Table | Non_unique | Key_name | Seq_in_index | Column_name | Collation | Cardinality | Sub_part | Packed | Null | Index_type | Comment | Index_comment | Visible | Expression |
-+----------------+------------+-----------------------------------------+--------------+-------------+-----------+-------------+----------+--------+------+---------------------------------------------------------------------------------+---------+---------------+---------+------------+
-| system_metrics | 1 | FULLTEXT INDEX | 7 | desc2 | A | | | | YES | greptime-fulltext-index-v1 | | | YES | |
-| system_metrics | 1 | FULLTEXT INDEX | 8 | desc3 | A | | | | YES | greptime-fulltext-index-v1 | | | YES | |
-| system_metrics | 1 | PRIMARY | 1 | host | A | | | | YES | greptime-primary-key-v1 | | | YES | |
-| system_metrics | 1 | PRIMARY, INVERTED INDEX, FULLTEXT INDEX | 2 | idc | A | | | | YES | greptime-primary-key-v1, greptime-inverted-index-v1, greptime-fulltext-index-v1 | | | YES | |
-| system_metrics | 1 | TIME INDEX | 1 | ts | A | | | | NO | | | | YES | |
-+----------------+------------+-----------------------------------------+--------------+-------------+-----------+-------------+----------+--------+------+---------------------------------------------------------------------------------+---------+---------------+---------+------------+
++----------------+------------+-----------------------------------------+--------------+-------------+-----------+-------------+----------+--------+------+------------------------------------------------------------------------------------+---------+---------------+---------+------------+
+| Table | Non_unique | Key_name | Seq_in_index | Column_name | Collation | Cardinality | Sub_part | Packed | Null | Index_type | Comment | Index_comment | Visible | Expression |
++----------------+------------+-----------------------------------------+--------------+-------------+-----------+-------------+----------+--------+------+------------------------------------------------------------------------------------+---------+---------------+---------+------------+
+| system_metrics | 1 | FULLTEXT INDEX | 7 | desc2 | A | | | | YES | greptime-fulltext-index-bloom | | | YES | |
+| system_metrics | 1 | FULLTEXT INDEX | 8 | desc3 | A | | | | YES | greptime-fulltext-index-bloom | | | YES | |
+| system_metrics | 1 | PRIMARY | 1 | host | A | | | | YES | greptime-primary-key-v1 | | | YES | |
+| system_metrics | 1 | PRIMARY, INVERTED INDEX, FULLTEXT INDEX | 2 | idc | A | | | | YES | greptime-primary-key-v1, greptime-inverted-index-v1, greptime-fulltext-index-bloom | | | YES | |
+| system_metrics | 1 | TIME INDEX | 1 | ts | A | | | | NO | | | | YES | |
++----------------+------------+-----------------------------------------+--------------+-------------+-----------+-------------+----------+--------+------+------------------------------------------------------------------------------------+---------+---------------+---------+------------+
SHOW INDEX FROM system_metrics in public;
-+----------------+------------+-----------------------------------------+--------------+-------------+-----------+-------------+----------+--------+------+---------------------------------------------------------------------------------+---------+---------------+---------+------------+
-| Table | Non_unique | Key_name | Seq_in_index | Column_name | Collation | Cardinality | Sub_part | Packed | Null | Index_type | Comment | Index_comment | Visible | Expression |
-+----------------+------------+-----------------------------------------+--------------+-------------+-----------+-------------+----------+--------+------+---------------------------------------------------------------------------------+---------+---------------+---------+------------+
-| system_metrics | 1 | FULLTEXT INDEX | 7 | desc2 | A | | | | YES | greptime-fulltext-index-v1 | | | YES | |
-| system_metrics | 1 | FULLTEXT INDEX | 8 | desc3 | A | | | | YES | greptime-fulltext-index-v1 | | | YES | |
-| system_metrics | 1 | PRIMARY | 1 | host | A | | | | YES | greptime-primary-key-v1 | | | YES | |
-| system_metrics | 1 | PRIMARY, INVERTED INDEX, FULLTEXT INDEX | 2 | idc | A | | | | YES | greptime-primary-key-v1, greptime-inverted-index-v1, greptime-fulltext-index-v1 | | | YES | |
-| system_metrics | 1 | TIME INDEX | 1 | ts | A | | | | NO | | | | YES | |
-+----------------+------------+-----------------------------------------+--------------+-------------+-----------+-------------+----------+--------+------+---------------------------------------------------------------------------------+---------+---------------+---------+------------+
++----------------+------------+-----------------------------------------+--------------+-------------+-----------+-------------+----------+--------+------+------------------------------------------------------------------------------------+---------+---------------+---------+------------+
+| Table | Non_unique | Key_name | Seq_in_index | Column_name | Collation | Cardinality | Sub_part | Packed | Null | Index_type | Comment | Index_comment | Visible | Expression |
++----------------+------------+-----------------------------------------+--------------+-------------+-----------+-------------+----------+--------+------+------------------------------------------------------------------------------------+---------+---------------+---------+------------+
+| system_metrics | 1 | FULLTEXT INDEX | 7 | desc2 | A | | | | YES | greptime-fulltext-index-bloom | | | YES | |
+| system_metrics | 1 | FULLTEXT INDEX | 8 | desc3 | A | | | | YES | greptime-fulltext-index-bloom | | | YES | |
+| system_metrics | 1 | PRIMARY | 1 | host | A | | | | YES | greptime-primary-key-v1 | | | YES | |
+| system_metrics | 1 | PRIMARY, INVERTED INDEX, FULLTEXT INDEX | 2 | idc | A | | | | YES | greptime-primary-key-v1, greptime-inverted-index-v1, greptime-fulltext-index-bloom | | | YES | |
+| system_metrics | 1 | TIME INDEX | 1 | ts | A | | | | NO | | | | YES | |
++----------------+------------+-----------------------------------------+--------------+-------------+-----------+-------------+----------+--------+------+------------------------------------------------------------------------------------+---------+---------------+---------+------------+
SHOW INDEX FROM system_metrics like '%util%';
diff --git a/tests/cases/standalone/common/system/information_schema.result b/tests/cases/standalone/common/system/information_schema.result
index bfcf39b69e..a19e7944c0 100644
--- a/tests/cases/standalone/common/system/information_schema.result
+++ b/tests/cases/standalone/common/system/information_schema.result
@@ -208,6 +208,7 @@ select * from information_schema.columns order by table_schema, table_name, colu
| greptime | information_schema | key_column_usage | constraint_catalog | 1 | 2147483647 | 2147483647 | | | | utf8 | utf8_bin | | | select,insert | | String | string | FIELD | | No | string | | |
| greptime | information_schema | key_column_usage | constraint_name | 3 | 2147483647 | 2147483647 | | | | utf8 | utf8_bin | | | select,insert | | String | string | FIELD | | No | string | | |
| greptime | information_schema | key_column_usage | constraint_schema | 2 | 2147483647 | 2147483647 | | | | utf8 | utf8_bin | | | select,insert | | String | string | FIELD | | No | string | | |
+| greptime | information_schema | key_column_usage | greptime_index_type | 14 | 2147483647 | 2147483647 | | | | utf8 | utf8_bin | | | select,insert | | String | string | FIELD | | Yes | string | | |
| greptime | information_schema | key_column_usage | ordinal_position | 9 | | | 10 | 0 | | | | | | select,insert | | UInt32 | int unsigned | FIELD | | No | int unsigned | | |
| greptime | information_schema | key_column_usage | position_in_unique_constraint | 10 | | | 10 | 0 | | | | | | select,insert | | UInt32 | int unsigned | FIELD | | Yes | int unsigned | | |
| greptime | information_schema | key_column_usage | real_table_catalog | 5 | 2147483647 | 2147483647 | | | | utf8 | utf8_bin | | | select,insert | | String | string | FIELD | | No | string | | |
@@ -593,11 +594,11 @@ select * from KEY_COLUMN_USAGE where CONSTRAINT_NAME = 'TIME INDEX';
select * from KEY_COLUMN_USAGE where CONSTRAINT_NAME != 'TIME INDEX';
-+--------------------+-------------------+-----------------+---------------+--------------------+--------------+------------+-------------+------------------+-------------------------------+-------------------------+-----------------------+------------------------+
-| constraint_catalog | constraint_schema | constraint_name | table_catalog | real_table_catalog | table_schema | table_name | column_name | ordinal_position | position_in_unique_constraint | referenced_table_schema | referenced_table_name | referenced_column_name |
-+--------------------+-------------------+-----------------+---------------+--------------------+--------------+------------+-------------+------------------+-------------------------------+-------------------------+-----------------------+------------------------+
-| def | public | PRIMARY | def | greptime | public | numbers | number | 1 | | | | |
-+--------------------+-------------------+-----------------+---------------+--------------------+--------------+------------+-------------+------------------+-------------------------------+-------------------------+-----------------------+------------------------+
++--------------------+-------------------+-----------------+---------------+--------------------+--------------+------------+-------------+------------------+-------------------------------+-------------------------+-----------------------+------------------------+-------------------------+
+| constraint_catalog | constraint_schema | constraint_name | table_catalog | real_table_catalog | table_schema | table_name | column_name | ordinal_position | position_in_unique_constraint | referenced_table_schema | referenced_table_name | referenced_column_name | greptime_index_type |
++--------------------+-------------------+-----------------+---------------+--------------------+--------------+------------+-------------+------------------+-------------------------------+-------------------------+-----------------------+------------------------+-------------------------+
+| def | public | PRIMARY | def | greptime | public | numbers | number | 1 | | | | | greptime-primary-key-v1 |
++--------------------+-------------------+-----------------+---------------+--------------------+--------------+------------+-------------+------------------+-------------------------------+-------------------------+-----------------------+------------------------+-------------------------+
select * from KEY_COLUMN_USAGE where CONSTRAINT_NAME LIKE '%INDEX';
@@ -606,11 +607,11 @@ select * from KEY_COLUMN_USAGE where CONSTRAINT_NAME LIKE '%INDEX';
select * from KEY_COLUMN_USAGE where CONSTRAINT_NAME NOT LIKE '%INDEX';
-+--------------------+-------------------+-----------------+---------------+--------------------+--------------+------------+-------------+------------------+-------------------------------+-------------------------+-----------------------+------------------------+
-| constraint_catalog | constraint_schema | constraint_name | table_catalog | real_table_catalog | table_schema | table_name | column_name | ordinal_position | position_in_unique_constraint | referenced_table_schema | referenced_table_name | referenced_column_name |
-+--------------------+-------------------+-----------------+---------------+--------------------+--------------+------------+-------------+------------------+-------------------------------+-------------------------+-----------------------+------------------------+
-| def | public | PRIMARY | def | greptime | public | numbers | number | 1 | | | | |
-+--------------------+-------------------+-----------------+---------------+--------------------+--------------+------------+-------------+------------------+-------------------------------+-------------------------+-----------------------+------------------------+
++--------------------+-------------------+-----------------+---------------+--------------------+--------------+------------+-------------+------------------+-------------------------------+-------------------------+-----------------------+------------------------+-------------------------+
+| constraint_catalog | constraint_schema | constraint_name | table_catalog | real_table_catalog | table_schema | table_name | column_name | ordinal_position | position_in_unique_constraint | referenced_table_schema | referenced_table_name | referenced_column_name | greptime_index_type |
++--------------------+-------------------+-----------------+---------------+--------------------+--------------+------------+-------------+------------------+-------------------------------+-------------------------+-----------------------+------------------------+-------------------------+
+| def | public | PRIMARY | def | greptime | public | numbers | number | 1 | | | | | greptime-primary-key-v1 |
++--------------------+-------------------+-----------------+---------------+--------------------+--------------+------------+-------------+------------------+-------------------------------+-------------------------+-----------------------+------------------------+-------------------------+
select * from KEY_COLUMN_USAGE where CONSTRAINT_NAME == 'TIME INDEX' AND CONSTRAINT_SCHEMA != 'my_db';
@@ -688,15 +689,16 @@ desc table key_column_usage;
| referenced_table_schema | String | | YES | | FIELD |
| referenced_table_name | String | | YES | | FIELD |
| referenced_column_name | String | | YES | | FIELD |
+| greptime_index_type | String | | YES | | FIELD |
+-------------------------------+--------+-----+------+---------+---------------+
select * from key_column_usage;
-+--------------------+-------------------+-----------------+---------------+--------------------+--------------+------------+-------------+------------------+-------------------------------+-------------------------+-----------------------+------------------------+
-| constraint_catalog | constraint_schema | constraint_name | table_catalog | real_table_catalog | table_schema | table_name | column_name | ordinal_position | position_in_unique_constraint | referenced_table_schema | referenced_table_name | referenced_column_name |
-+--------------------+-------------------+-----------------+---------------+--------------------+--------------+------------+-------------+------------------+-------------------------------+-------------------------+-----------------------+------------------------+
-| def | public | PRIMARY | def | greptime | public | numbers | number | 1 | | | | |
-+--------------------+-------------------+-----------------+---------------+--------------------+--------------+------------+-------------+------------------+-------------------------------+-------------------------+-----------------------+------------------------+
++--------------------+-------------------+-----------------+---------------+--------------------+--------------+------------+-------------+------------------+-------------------------------+-------------------------+-----------------------+------------------------+-------------------------+
+| constraint_catalog | constraint_schema | constraint_name | table_catalog | real_table_catalog | table_schema | table_name | column_name | ordinal_position | position_in_unique_constraint | referenced_table_schema | referenced_table_name | referenced_column_name | greptime_index_type |
++--------------------+-------------------+-----------------+---------------+--------------------+--------------+------------+-------------+------------------+-------------------------------+-------------------------+-----------------------+------------------------+-------------------------+
+| def | public | PRIMARY | def | greptime | public | numbers | number | 1 | | | | | greptime-primary-key-v1 |
++--------------------+-------------------+-----------------+---------------+--------------------+--------------+------------+-------------+------------------+-------------------------------+-------------------------+-----------------------+------------------------+-------------------------+
-- tables not implemented
DESC TABLE COLUMN_PRIVILEGES;
From 96fbce1797c3fd0ab166c4f14f25028ecb2b1630 Mon Sep 17 00:00:00 2001
From: Ruihang Xia
Date: Tue, 15 Apr 2025 14:45:00 +0800
Subject: [PATCH 19/82] feat: report per-region metrics on region server
(#5893)
* feat: report per-region metrics on region server
Signed-off-by: Ruihang Xia
* rename Change to Ingest
Signed-off-by: Ruihang Xia
---------
Signed-off-by: Ruihang Xia
---
src/datanode/src/metrics.rs | 12 +++++++--
src/datanode/src/region_server.rs | 45 ++++++++++++++++++-------------
2 files changed, 37 insertions(+), 20 deletions(-)
diff --git a/src/datanode/src/metrics.rs b/src/datanode/src/metrics.rs
index d11e8af9fe..12ac482826 100644
--- a/src/datanode/src/metrics.rs
+++ b/src/datanode/src/metrics.rs
@@ -20,13 +20,21 @@ pub const REGION_REQUEST_TYPE: &str = "datanode_region_request_type";
pub const REGION_ROLE: &str = "region_role";
pub const REGION_ID: &str = "region_id";
+pub const RESULT_TYPE: &str = "result";
lazy_static! {
/// The elapsed time of handling a request in the region_server.
pub static ref HANDLE_REGION_REQUEST_ELAPSED: HistogramVec = register_histogram_vec!(
"greptime_datanode_handle_region_request_elapsed",
"datanode handle region request elapsed",
- &[REGION_REQUEST_TYPE]
+ &[REGION_ID, REGION_REQUEST_TYPE]
+ )
+ .unwrap();
+ /// The number of rows changed by region requests (puts and deletes), labeled with region id and request type.
+ pub static ref REGION_CHANGED_ROW_COUNT: IntCounterVec = register_int_counter_vec!(
+ "greptime_datanode_region_changed_row_count",
+ "datanode region changed row count",
+ &[REGION_ID, REGION_REQUEST_TYPE]
)
.unwrap();
/// The elapsed time since the last received heartbeat.
@@ -64,7 +72,7 @@ lazy_static! {
pub static ref HEARTBEAT_RECV_COUNT: IntCounterVec = register_int_counter_vec!(
"greptime_datanode_heartbeat_recv_count",
"datanode heartbeat received",
- &["result"]
+ &[RESULT_TYPE]
)
.unwrap();
}
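
The histogram and the new row counter are both keyed by `(region_id, request_type)`. Reduced to a self-contained sketch (using the prometheus and lazy_static crates directly; placeholder metric names rather than the ones registered above):

```rust
// Standalone sketch of the labeling scheme (placeholder metric names, hypothetical
// handler): latency is observed per (region_id, request_type) pair, and ingested rows
// are counted with the same labels.
use lazy_static::lazy_static;
use prometheus::{register_histogram_vec, register_int_counter_vec, HistogramVec, IntCounterVec};

lazy_static! {
    static ref HANDLE_REQUEST_ELAPSED: HistogramVec = register_histogram_vec!(
        "example_handle_region_request_elapsed",
        "per-region request handling latency",
        &["region_id", "request_type"]
    )
    .unwrap();
    static ref CHANGED_ROW_COUNT: IntCounterVec = register_int_counter_vec!(
        "example_region_changed_row_count",
        "per-region ingested row count",
        &["region_id", "request_type"]
    )
    .unwrap();
}

fn handle_put(region_id: u64, affected_rows: u64) {
    let region_id = region_id.to_string();
    // The timer records the elapsed time into the (region_id, "Put") bucket when dropped.
    let _timer = HANDLE_REQUEST_ELAPSED
        .with_label_values(&[&region_id, "Put"])
        .start_timer();
    // Only ingest-style requests (Put/Delete) bump the row counter.
    CHANGED_ROW_COUNT
        .with_label_values(&[&region_id, "Put"])
        .inc_by(affected_rows);
}

fn main() {
    handle_put(4398046511104, 42);
}
```

Each distinct region id produces its own label value, so the per-region visibility comes at the cost of higher series cardinality.
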
diff --git a/src/datanode/src/region_server.rs b/src/datanode/src/region_server.rs
index 14ccfa2816..bff28c109b 100644
--- a/src/datanode/src/region_server.rs
+++ b/src/datanode/src/region_server.rs
@@ -690,18 +690,20 @@ impl RegionServerInner {
},
None => return Ok(CurrentEngine::EarlyReturn(0)),
},
- RegionChange::None | RegionChange::Catchup => match current_region_status {
- Some(status) => match status.clone() {
- RegionEngineWithStatus::Registering(_) => {
- return error::RegionNotReadySnafu { region_id }.fail()
- }
- RegionEngineWithStatus::Deregistering(_) => {
- return error::RegionNotFoundSnafu { region_id }.fail()
- }
- RegionEngineWithStatus::Ready(engine) => engine,
- },
- None => return error::RegionNotFoundSnafu { region_id }.fail(),
- },
+ RegionChange::None | RegionChange::Catchup | RegionChange::Ingest => {
+ match current_region_status {
+ Some(status) => match status.clone() {
+ RegionEngineWithStatus::Registering(_) => {
+ return error::RegionNotReadySnafu { region_id }.fail()
+ }
+ RegionEngineWithStatus::Deregistering(_) => {
+ return error::RegionNotFoundSnafu { region_id }.fail()
+ }
+ RegionEngineWithStatus::Ready(engine) => engine,
+ },
+ None => return error::RegionNotFoundSnafu { region_id }.fail(),
+ }
+ }
};
Ok(CurrentEngine::Engine(engine))
@@ -885,8 +887,9 @@ impl RegionServerInner {
request: RegionRequest,
) -> Result {
let request_type = request.request_type();
+ let region_id_str = region_id.to_string();
let _timer = crate::metrics::HANDLE_REGION_REQUEST_ELAPSED
- .with_label_values(&[request_type])
+ .with_label_values(&[®ion_id_str, request_type])
.start_timer();
let region_change = match &request {
@@ -899,9 +902,8 @@ impl RegionServerInner {
RegionChange::Register(attribute)
}
RegionRequest::Close(_) | RegionRequest::Drop(_) => RegionChange::Deregisters,
- RegionRequest::Put(_)
- | RegionRequest::Delete(_)
- | RegionRequest::Alter(_)
+ RegionRequest::Put(_) | RegionRequest::Delete(_) => RegionChange::Ingest,
+ RegionRequest::Alter(_)
| RegionRequest::Flush(_)
| RegionRequest::Compact(_)
| RegionRequest::Truncate(_) => RegionChange::None,
@@ -922,6 +924,12 @@ impl RegionServerInner {
.with_context(|_| HandleRegionRequestSnafu { region_id })
{
Ok(result) => {
+ // Update metrics
+ if matches!(region_change, RegionChange::Ingest) {
+ crate::metrics::REGION_CHANGED_ROW_COUNT
+ .with_label_values(&[®ion_id_str, request_type])
+ .inc_by(result.affected_rows as u64);
+ }
// Sets corresponding region status to ready.
self.set_region_status_ready(region_id, engine, region_change)
.await?;
@@ -968,7 +976,7 @@ impl RegionServerInner {
region_change: RegionChange,
) {
match region_change {
- RegionChange::None => {}
+ RegionChange::None | RegionChange::Ingest => {}
RegionChange::Register(_) => {
self.region_map.remove(®ion_id);
}
@@ -988,7 +996,7 @@ impl RegionServerInner {
) -> Result<()> {
let engine_type = engine.name();
match region_change {
- RegionChange::None => {}
+ RegionChange::None | RegionChange::Ingest => {}
RegionChange::Register(attribute) => {
info!(
"Region {region_id} is registered to engine {}",
@@ -1129,6 +1137,7 @@ enum RegionChange {
Register(RegionAttribute),
Deregisters,
Catchup,
+ Ingest,
}
fn is_metric_engine(engine: &str) -> bool {
From 2189631efd54fa0ead07aaf9a4865b9ef371ba80 Mon Sep 17 00:00:00 2001
From: Zhenchi
Date: Tue, 15 Apr 2025 14:45:56 +0800
Subject: [PATCH 20/82] feat: optimize `matches_term` with constant term
pre-compilation (#5886)
* feat: precompile finder for `matches_term`
Signed-off-by: Zhenchi
* fix sqlness
Signed-off-by: Zhenchi
---------
Signed-off-by: Zhenchi
---
src/query/src/optimizer.rs | 1 +
src/query/src/optimizer/constant_term.rs | 454 ++++++++++++++++++
src/query/src/query_engine/state.rs | 4 +
.../common/tql-explain-analyze/explain.result | 1 +
4 files changed, 460 insertions(+)
create mode 100644 src/query/src/optimizer/constant_term.rs
diff --git a/src/query/src/optimizer.rs b/src/query/src/optimizer.rs
index c98ad0c634..e6596e923a 100644
--- a/src/query/src/optimizer.rs
+++ b/src/query/src/optimizer.rs
@@ -12,6 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
+pub mod constant_term;
pub mod count_wildcard;
pub mod parallelize_scan;
pub mod pass_distribution;
diff --git a/src/query/src/optimizer/constant_term.rs b/src/query/src/optimizer/constant_term.rs
new file mode 100644
index 0000000000..60e5b76d9d
--- /dev/null
+++ b/src/query/src/optimizer/constant_term.rs
@@ -0,0 +1,454 @@
+// Copyright 2023 Greptime Team
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+use std::fmt;
+use std::hash::{Hash, Hasher};
+use std::sync::Arc;
+
+use arrow::array::{AsArray, BooleanArray};
+use common_function::scalars::matches_term::MatchesTermFinder;
+use datafusion::config::ConfigOptions;
+use datafusion::error::Result as DfResult;
+use datafusion::physical_optimizer::PhysicalOptimizerRule;
+use datafusion::physical_plan::filter::FilterExec;
+use datafusion::physical_plan::ExecutionPlan;
+use datafusion_common::tree_node::{Transformed, TreeNode};
+use datafusion_common::ScalarValue;
+use datafusion_expr::ColumnarValue;
+use datafusion_physical_expr::expressions::Literal;
+use datafusion_physical_expr::{PhysicalExpr, ScalarFunctionExpr};
+
+/// A physical expression that uses a pre-compiled term finder for the `matches_term` function.
+///
+/// This expression optimizes the `matches_term` function by pre-compiling the term
+/// when the term is a constant value. This avoids recompiling the term for each row
+/// during execution.
+#[derive(Debug)]
+pub struct PreCompiledMatchesTermExpr {
+ /// The text column expression to search in
+ text: Arc<dyn PhysicalExpr>,
+ /// The constant term to search for
+ term: String,
+ /// The pre-compiled term finder
+ finder: MatchesTermFinder,
+}
+
+impl fmt::Display for PreCompiledMatchesTermExpr {
+ fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+ write!(f, "MatchesConstTerm({}, \"{}\")", self.text, self.term)
+ }
+}
+
+impl Hash for PreCompiledMatchesTermExpr {
+ fn hash<H: Hasher>(&self, state: &mut H) {
+ self.text.hash(state);
+ self.term.hash(state);
+ }
+}
+
+impl PartialEq for PreCompiledMatchesTermExpr {
+ fn eq(&self, other: &Self) -> bool {
+ self.text.eq(&other.text) && self.term.eq(&other.term)
+ }
+}
+
+impl Eq for PreCompiledMatchesTermExpr {}
+
+impl PhysicalExpr for PreCompiledMatchesTermExpr {
+ fn as_any(&self) -> &dyn std::any::Any {
+ self
+ }
+
+ fn data_type(
+ &self,
+ _input_schema: &arrow_schema::Schema,
+ ) -> datafusion::error::Result<arrow_schema::DataType> {
+ Ok(arrow_schema::DataType::Boolean)
+ }
+
+ fn nullable(&self, input_schema: &arrow_schema::Schema) -> datafusion::error::Result<bool> {
+ self.text.nullable(input_schema)
+ }
+
+ fn evaluate(
+ &self,
+ batch: &common_recordbatch::DfRecordBatch,
+ ) -> datafusion::error::Result<ColumnarValue> {
+ let num_rows = batch.num_rows();
+
+ let text_value = self.text.evaluate(batch)?;
+ let array = text_value.into_array(num_rows)?;
+ let str_array = array.as_string::<i32>();
+
+ let mut result = BooleanArray::builder(num_rows);
+ for text in str_array {
+ match text {
+ Some(text) => {
+ result.append_value(self.finder.find(text));
+ }
+ None => {
+ result.append_null();
+ }
+ }
+ }
+
+ Ok(ColumnarValue::Array(Arc::new(result.finish())))
+ }
+
+ fn children(&self) -> Vec<&Arc<dyn PhysicalExpr>> {
+ vec![&self.text]
+ }
+
+ fn with_new_children(
+ self: Arc<Self>,
+ children: Vec<Arc<dyn PhysicalExpr>>,
+ ) -> datafusion::error::Result<Arc<dyn PhysicalExpr>> {
+ Ok(Arc::new(PreCompiledMatchesTermExpr {
+ text: children[0].clone(),
+ term: self.term.clone(),
+ finder: self.finder.clone(),
+ }))
+ }
+}
+
+/// Optimizer rule that pre-compiles constant term in `matches_term` function.
+///
+/// This optimizer looks for `matches_term` function calls where the second argument
+/// (the term to match) is a constant value. When found, it replaces the function
+/// call with a specialized `PreCompiledMatchesTermExpr` that uses a pre-compiled
+/// term finder.
+///
+/// Example:
+/// ```sql
+/// -- Before optimization:
+/// matches_term(text_column, 'constant_term')
+///
+/// -- After optimization:
+/// PreCompiledMatchesTermExpr(text_column, 'constant_term')
+/// ```
+///
+/// This optimization improves performance by:
+/// 1. Pre-compiling the term once instead of for each row
+/// 2. Using a specialized expression that avoids function call overhead
+#[derive(Debug)]
+pub struct MatchesConstantTermOptimizer;
+
+impl PhysicalOptimizerRule for MatchesConstantTermOptimizer {
+ fn optimize(
+ &self,
+ plan: Arc<dyn ExecutionPlan>,
+ _config: &ConfigOptions,
+ ) -> DfResult<Arc<dyn ExecutionPlan>> {
+ let res = plan
+ .transform_down(&|plan: Arc<dyn ExecutionPlan>| {
+ if let Some(filter) = plan.as_any().downcast_ref::<FilterExec>() {
+ let pred = filter.predicate().clone();
+ let new_pred = pred.transform_down(&|expr: Arc<dyn PhysicalExpr>| {
+ if let Some(func) = expr.as_any().downcast_ref::<ScalarFunctionExpr>() {
+ if !func.name().eq_ignore_ascii_case("matches_term") {
+ return Ok(Transformed::no(expr));
+ }
+ let args = func.args();
+ if args.len() != 2 {
+ return Ok(Transformed::no(expr));
+ }
+
+ if let Some(lit) = args[1].as_any().downcast_ref::<Literal>() {
+ if let ScalarValue::Utf8(Some(term)) = lit.value() {
+ let finder = MatchesTermFinder::new(term);
+ let expr = PreCompiledMatchesTermExpr {
+ text: args[0].clone(),
+ term: term.to_string(),
+ finder,
+ };
+
+ return Ok(Transformed::yes(Arc::new(expr)));
+ }
+ }
+ }
+
+ Ok(Transformed::no(expr))
+ })?;
+
+ if new_pred.transformed {
+ let exec = FilterExec::try_new(new_pred.data, filter.input().clone())?
+ .with_default_selectivity(filter.default_selectivity())?
+ .with_projection(filter.projection().cloned())?;
+ return Ok(Transformed::yes(Arc::new(exec) as _));
+ }
+ }
+
+ Ok(Transformed::no(plan))
+ })?
+ .data;
+
+ Ok(res)
+ }
+
+ fn name(&self) -> &str {
+ "MatchesConstantTerm"
+ }
+
+ fn schema_check(&self) -> bool {
+ false
+ }
+}
+
+#[cfg(test)]
+mod tests {
+ use std::sync::Arc;
+
+ use arrow::array::{ArrayRef, StringArray};
+ use arrow::datatypes::{DataType, Field, Schema};
+ use arrow::record_batch::RecordBatch;
+ use catalog::memory::MemoryCatalogManager;
+ use catalog::RegisterTableRequest;
+ use common_catalog::consts::{DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME};
+ use common_function::scalars::matches_term::MatchesTermFunction;
+ use common_function::scalars::udf::create_udf;
+ use common_function::state::FunctionState;
+ use datafusion::physical_optimizer::PhysicalOptimizerRule;
+ use datafusion::physical_plan::filter::FilterExec;
+ use datafusion::physical_plan::get_plan_string;
+ use datafusion::physical_plan::memory::MemoryExec;
+ use datafusion_common::{Column, DFSchema, ScalarValue};
+ use datafusion_expr::expr::ScalarFunction;
+ use datafusion_expr::{Expr, ScalarUDF};
+ use datafusion_physical_expr::{create_physical_expr, ScalarFunctionExpr};
+ use datatypes::prelude::ConcreteDataType;
+ use datatypes::schema::ColumnSchema;
+ use session::context::QueryContext;
+ use table::metadata::{TableInfoBuilder, TableMetaBuilder};
+ use table::test_util::EmptyTable;
+
+ use super::*;
+ use crate::parser::QueryLanguageParser;
+ use crate::{QueryEngineFactory, QueryEngineRef};
+
+ fn create_test_batch() -> RecordBatch {
+ let schema = Schema::new(vec![Field::new("text", DataType::Utf8, true)]);
+
+ let text_array = StringArray::from(vec![
+ Some("hello world"),
+ Some("greeting"),
+ Some("hello there"),
+ None,
+ ]);
+
+ RecordBatch::try_new(Arc::new(schema), vec![Arc::new(text_array) as ArrayRef]).unwrap()
+ }
+
+ fn create_test_engine() -> QueryEngineRef {
+ let table_name = "test".to_string();
+ let columns = vec![
+ ColumnSchema::new(
+ "text".to_string(),
+ ConcreteDataType::string_datatype(),
+ false,
+ ),
+ ColumnSchema::new(
+ "timestamp".to_string(),
+ ConcreteDataType::timestamp_millisecond_datatype(),
+ false,
+ )
+ .with_time_index(true),
+ ];
+
+ let schema = Arc::new(datatypes::schema::Schema::new(columns));
+ let table_meta = TableMetaBuilder::empty()
+ .schema(schema)
+ .primary_key_indices(vec![])
+ .value_indices(vec![0])
+ .next_column_id(2)
+ .build()
+ .unwrap();
+ let table_info = TableInfoBuilder::default()
+ .name(&table_name)
+ .meta(table_meta)
+ .build()
+ .unwrap();
+ let table = EmptyTable::from_table_info(&table_info);
+ let catalog_list = MemoryCatalogManager::with_default_setup();
+ assert!(catalog_list
+ .register_table_sync(RegisterTableRequest {
+ catalog: DEFAULT_CATALOG_NAME.to_string(),
+ schema: DEFAULT_SCHEMA_NAME.to_string(),
+ table_name,
+ table_id: 1024,
+ table,
+ })
+ .is_ok());
+ QueryEngineFactory::new(
+ catalog_list,
+ None,
+ None,
+ None,
+ None,
+ false,
+ Default::default(),
+ )
+ .query_engine()
+ }
+
+    fn matches_term_udf() -> Arc<ScalarUDF> {
+ Arc::new(create_udf(
+ Arc::new(MatchesTermFunction),
+ QueryContext::arc(),
+ Arc::new(FunctionState::default()),
+ ))
+ }
+
+ #[test]
+ fn test_matches_term_optimization() {
+ let batch = create_test_batch();
+
+ // Create a predicate with a constant pattern
+ let predicate = create_physical_expr(
+ &Expr::ScalarFunction(ScalarFunction::new_udf(
+ matches_term_udf(),
+ vec![
+ Expr::Column(Column::from_name("text")),
+ Expr::Literal(ScalarValue::Utf8(Some("hello".to_string()))),
+ ],
+ )),
+ &DFSchema::try_from(batch.schema().clone()).unwrap(),
+ &Default::default(),
+ )
+ .unwrap();
+
+ let input =
+ Arc::new(MemoryExec::try_new(&[vec![batch.clone()]], batch.schema(), None).unwrap());
+ let filter = FilterExec::try_new(predicate, input).unwrap();
+
+ // Apply the optimizer
+ let optimizer = MatchesConstantTermOptimizer;
+ let optimized_plan = optimizer
+ .optimize(Arc::new(filter), &Default::default())
+ .unwrap();
+
+ let optimized_filter = optimized_plan
+ .as_any()
+            .downcast_ref::<FilterExec>()
+ .unwrap();
+ let predicate = optimized_filter.predicate();
+
+ // The predicate should be a PreCompiledMatchesTermExpr
+ assert!(
+            std::any::TypeId::of::<PreCompiledMatchesTermExpr>() == predicate.as_any().type_id()
+ );
+ }
+
+ #[test]
+ fn test_matches_term_no_optimization() {
+ let batch = create_test_batch();
+
+ // Create a predicate with a non-constant pattern
+ let predicate = create_physical_expr(
+ &Expr::ScalarFunction(ScalarFunction::new_udf(
+ matches_term_udf(),
+ vec![
+ Expr::Column(Column::from_name("text")),
+ Expr::Column(Column::from_name("text")),
+ ],
+ )),
+ &DFSchema::try_from(batch.schema().clone()).unwrap(),
+ &Default::default(),
+ )
+ .unwrap();
+
+ let input =
+ Arc::new(MemoryExec::try_new(&[vec![batch.clone()]], batch.schema(), None).unwrap());
+ let filter = FilterExec::try_new(predicate, input).unwrap();
+
+ let optimizer = MatchesConstantTermOptimizer;
+ let optimized_plan = optimizer
+ .optimize(Arc::new(filter), &Default::default())
+ .unwrap();
+
+ let optimized_filter = optimized_plan
+ .as_any()
+            .downcast_ref::<FilterExec>()
+ .unwrap();
+ let predicate = optimized_filter.predicate();
+
+ // The predicate should still be a ScalarFunctionExpr
+        assert!(std::any::TypeId::of::<ScalarFunctionExpr>() == predicate.as_any().type_id());
+ }
+
+ #[tokio::test]
+ async fn test_matches_term_optimization_from_sql() {
+ let sql = "WITH base AS (
+ SELECT text, timestamp FROM test
+ WHERE MATCHES_TERM(text, 'hello')
+ AND timestamp > '2025-01-01 00:00:00'
+ ),
+ subquery1 AS (
+ SELECT * FROM base
+ WHERE MATCHES_TERM(text, 'world')
+ ),
+ subquery2 AS (
+ SELECT * FROM test
+ WHERE MATCHES_TERM(text, 'greeting')
+ AND timestamp < '2025-01-02 00:00:00'
+ ),
+ union_result AS (
+ SELECT * FROM subquery1
+ UNION ALL
+ SELECT * FROM subquery2
+ ),
+ joined_data AS (
+ SELECT a.text, a.timestamp, b.text as other_text
+ FROM union_result a
+ JOIN test b ON a.timestamp = b.timestamp
+ WHERE MATCHES_TERM(a.text, 'there')
+ )
+ SELECT text, other_text
+ FROM joined_data
+ WHERE MATCHES_TERM(text, '42')
+ AND MATCHES_TERM(other_text, 'foo')";
+
+ let query_ctx = QueryContext::arc();
+
+ let stmt = QueryLanguageParser::parse_sql(sql, &query_ctx).unwrap();
+ let engine = create_test_engine();
+ let logical_plan = engine
+ .planner()
+ .plan(&stmt, query_ctx.clone())
+ .await
+ .unwrap();
+
+ let engine_ctx = engine.engine_context(query_ctx);
+ let state = engine_ctx.state();
+
+ let analyzed_plan = state
+ .analyzer()
+ .execute_and_check(logical_plan.clone(), state.config_options(), |_, _| {})
+ .unwrap();
+
+ let optimized_plan = state
+ .optimizer()
+ .optimize(analyzed_plan, state, |_, _| {})
+ .unwrap();
+
+ let physical_plan = state
+ .query_planner()
+ .create_physical_plan(&optimized_plan, state)
+ .await
+ .unwrap();
+
+ let plan_str = get_plan_string(&physical_plan).join("\n");
+ assert!(plan_str.contains("MatchesConstTerm"));
+ assert!(!plan_str.contains("matches_term"))
+ }
+}
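
The optimizer above only rewrites `matches_term` calls whose second argument is a string literal: the literal is compiled once into a `MatchesTermFinder` and embedded in a `PreCompiledMatchesTermExpr`, so per-row evaluation is reduced to running the already-built matcher. The snippet below is a minimal, self-contained model of that idea; `PrecompiledMatcher` and its lowercase-containment check are illustrative stand-ins and do not reproduce GreptimeDB's actual term-matching semantics.

```rust
// Simplified stand-in for MatchesTermFinder: the term is normalized once,
// and the per-row call only does the cheap containment check.
struct PrecompiledMatcher {
    term: String,
}

impl PrecompiledMatcher {
    fn new(term: &str) -> Self {
        // Imagine any expensive preprocessing (tokenization, normalization,
        // automaton construction) happening here, exactly once per query.
        Self { term: term.to_ascii_lowercase() }
    }

    fn matches(&self, text: &str) -> bool {
        text.to_ascii_lowercase().contains(&self.term)
    }
}

fn main() {
    let rows = ["hello world", "greeting", "hello there"];

    // Naive path: re-interpret the constant term for every row.
    let naive: Vec<bool> = rows
        .iter()
        .map(|row| row.to_ascii_lowercase().contains(&"hello".to_ascii_lowercase()))
        .collect();

    // Optimized path: compile once, then evaluate each row against the compiled matcher.
    let matcher = PrecompiledMatcher::new("hello");
    let optimized: Vec<bool> = rows.iter().map(|row| matcher.matches(row)).collect();

    assert_eq!(naive, optimized);
}
```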
diff --git a/src/query/src/query_engine/state.rs b/src/query/src/query_engine/state.rs
index 75e1ed84a7..03f3a2a13d 100644
--- a/src/query/src/query_engine/state.rs
+++ b/src/query/src/query_engine/state.rs
@@ -45,6 +45,7 @@ use table::table::adapter::DfTableProviderAdapter;
use table::TableRef;
use crate::dist_plan::{DistExtensionPlanner, DistPlannerAnalyzer, MergeSortExtensionPlanner};
+use crate::optimizer::constant_term::MatchesConstantTermOptimizer;
use crate::optimizer::count_wildcard::CountWildcardToTimeIndexRule;
use crate::optimizer::parallelize_scan::ParallelizeScan;
use crate::optimizer::pass_distribution::PassDistribution;
@@ -143,6 +144,9 @@ impl QueryEngineState {
physical_optimizer
.rules
.push(Arc::new(WindowedSortPhysicalRule));
+ physical_optimizer
+ .rules
+ .push(Arc::new(MatchesConstantTermOptimizer));
// Add rule to remove duplicate nodes generated by other rules. Run this in the last.
physical_optimizer.rules.push(Arc::new(RemoveDuplicate));
// Place SanityCheckPlan at the end of the list to ensure that it runs after all other rules.
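
The new rule is registered just before `RemoveDuplicate` and `SanityCheckPlan`, so any nodes it introduces still pass through the final cleanup and sanity checks. Below is a minimal sketch of applying such a rule list strictly in order, under a simplified `Rule` trait (not DataFusion's real `PhysicalOptimizerRule` API):

```rust
// Each rule takes the current plan (a plain string tag here for brevity)
// and returns a possibly rewritten plan; rules run strictly in registration order.
trait Rule {
    fn name(&self) -> &str;
    fn optimize(&self, plan: String) -> String;
}

struct MatchesConstantTerm;
impl Rule for MatchesConstantTerm {
    fn name(&self) -> &str { "MatchesConstantTerm" }
    fn optimize(&self, plan: String) -> String {
        plan.replace("matches_term", "PreCompiledMatchesTermExpr")
    }
}

struct RemoveDuplicate;
impl Rule for RemoveDuplicate {
    fn name(&self) -> &str { "RemoveDuplicateRule" }
    fn optimize(&self, plan: String) -> String { plan }
}

fn apply_rules(rules: &[Box<dyn Rule>], mut plan: String) -> String {
    for rule in rules {
        plan = rule.optimize(plan);
        println!("physical_plan after {}: {}", rule.name(), plan);
    }
    plan
}

fn main() {
    let rules: Vec<Box<dyn Rule>> =
        vec![Box::new(MatchesConstantTerm), Box::new(RemoveDuplicate)];
    apply_rules(&rules, "FilterExec: matches_term(text, 'hello')".to_string());
}
```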
diff --git a/tests/cases/standalone/common/tql-explain-analyze/explain.result b/tests/cases/standalone/common/tql-explain-analyze/explain.result
index e1bbaa89e3..8b4952ed3d 100644
--- a/tests/cases/standalone/common/tql-explain-analyze/explain.result
+++ b/tests/cases/standalone/common/tql-explain-analyze/explain.result
@@ -167,6 +167,7 @@ TQL EXPLAIN VERBOSE (0, 10, '5s') test;
| physical_plan after ProjectionPushdown_| SAME TEXT AS ABOVE_|
| physical_plan after LimitPushdown_| SAME TEXT AS ABOVE_|
| physical_plan after WindowedSortRule_| SAME TEXT AS ABOVE_|
+| physical_plan after MatchesConstantTerm_| SAME TEXT AS ABOVE_|
| physical_plan after RemoveDuplicateRule_| SAME TEXT AS ABOVE_|
| physical_plan after SanityCheckPlan_| SAME TEXT AS ABOVE_|
| physical_plan_| PromInstantManipulateExec: range=[0..0], lookback=[300000], interval=[300000], time index=[j]_|
From 7b13376239abf0ed491e98e334469bb2d6ad1f88 Mon Sep 17 00:00:00 2001
From: zyy17
Date: Tue, 15 Apr 2025 14:46:31 +0800
Subject: [PATCH 21/82] refactor: add `partition_rules_for_uuid()` (#5743)
* refactor: add partition_rules_for_uuid()
* refactor: support up to 65536 partitions for partition_rules_for_uuid()
---
Cargo.lock | 1 +
src/operator/src/error.rs | 11 +-
src/operator/src/insert.rs | 7 +-
src/sql/Cargo.toml | 1 +
src/sql/src/error.rs | 11 ++
src/sql/src/partition.rs | 268 ++++++++++++++++++++++----------
tests-integration/tests/http.rs | 2 +-
7 files changed, 213 insertions(+), 88 deletions(-)
diff --git a/Cargo.lock b/Cargo.lock
index 2ab3200029..f1bf4eb3d0 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -11011,6 +11011,7 @@ dependencies = [
"sqlparser_derive 0.1.1",
"store-api",
"table",
+ "uuid",
]
[[package]]
diff --git a/src/operator/src/error.rs b/src/operator/src/error.rs
index c0c102ceda..900a3b4310 100644
--- a/src/operator/src/error.rs
+++ b/src/operator/src/error.rs
@@ -799,6 +799,14 @@ pub enum Error {
#[snafu(implicit)]
location: Location,
},
+
+ #[snafu(display("Failed to create partition rules"))]
+ CreatePartitionRules {
+ #[snafu(source)]
+ source: sql::error::Error,
+ #[snafu(implicit)]
+ location: Location,
+ },
}
pub type Result<T> = std::result::Result<T, Error>;
@@ -840,7 +848,8 @@ impl ErrorExt for Error {
| Error::PhysicalExpr { .. }
| Error::InvalidJsonFormat { .. }
| Error::CursorNotFound { .. }
- | Error::CursorExists { .. } => StatusCode::InvalidArguments,
+ | Error::CursorExists { .. }
+ | Error::CreatePartitionRules { .. } => StatusCode::InvalidArguments,
Error::TableAlreadyExists { .. } | Error::ViewAlreadyExists { .. } => {
StatusCode::TableAlreadyExists
diff --git a/src/operator/src/insert.rs b/src/operator/src/insert.rs
index b2742685fb..2f7c30ccd0 100644
--- a/src/operator/src/insert.rs
+++ b/src/operator/src/insert.rs
@@ -63,8 +63,8 @@ use table::table_reference::TableReference;
use table::TableRef;
use crate::error::{
- CatalogSnafu, ColumnOptionsSnafu, FindRegionLeaderSnafu, InvalidInsertRequestSnafu,
- JoinTaskSnafu, RequestInsertsSnafu, Result, TableNotFoundSnafu,
+ CatalogSnafu, ColumnOptionsSnafu, CreatePartitionRulesSnafu, FindRegionLeaderSnafu,
+ InvalidInsertRequestSnafu, JoinTaskSnafu, RequestInsertsSnafu, Result, TableNotFoundSnafu,
};
use crate::expr_helper;
use crate::region_req_factory::RegionRequestFactory;
@@ -591,7 +591,8 @@ impl Inserter {
} else {
// prebuilt partition rules for uuid data: see the function
// for more information
- let partitions = partition_rule_for_hexstring(TRACE_ID_COLUMN);
+ let partitions = partition_rule_for_hexstring(TRACE_ID_COLUMN)
+ .context(CreatePartitionRulesSnafu)?;
// add skip index to
// - trace_id: when searching by trace id
// - parent_span_id: when searching root span
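
`partition_rule_for_hexstring` is now fallible, and the insert path wraps its error into the operator's `CreatePartitionRules` variant via snafu's `context`. Here is a minimal sketch of that error-wrapping pattern, using an illustrative `Error` enum and `parse_num` helper rather than the crate's real types:

```rust
use snafu::{ResultExt, Snafu};

#[derive(Debug, Snafu)]
enum Error {
    /// Wraps an underlying failure while creating partition rules.
    #[snafu(display("Failed to create partition rules"))]
    CreatePartitionRules { source: std::num::ParseIntError },
}

// The `?`-compatible `.context(...)` converts the inner error into the outer
// `Error::CreatePartitionRules` variant while preserving the source error.
fn parse_num(raw: &str) -> Result<u32, Error> {
    raw.parse::<u32>().context(CreatePartitionRulesSnafu)
}

fn main() {
    assert!(parse_num("16").is_ok());
    assert!(matches!(parse_num("oops"), Err(Error::CreatePartitionRules { .. })));
}
```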
diff --git a/src/sql/Cargo.toml b/src/sql/Cargo.toml
index 3cb81d6dd4..812fe42709 100644
--- a/src/sql/Cargo.toml
+++ b/src/sql/Cargo.toml
@@ -37,6 +37,7 @@ sqlparser.workspace = true
sqlparser_derive = "0.1"
store-api.workspace = true
table.workspace = true
+uuid.workspace = true
[dev-dependencies]
common-datasource.workspace = true
diff --git a/src/sql/src/error.rs b/src/sql/src/error.rs
index e7253d6c46..e07efdbe6c 100644
--- a/src/sql/src/error.rs
+++ b/src/sql/src/error.rs
@@ -345,6 +345,16 @@ pub enum Error {
#[snafu(implicit)]
location: Location,
},
+
+ #[snafu(display(
+ "Invalid partition number: {}, should be in range [2, 65536]",
+ partition_num
+ ))]
+ InvalidPartitionNumber {
+ partition_num: u32,
+ #[snafu(implicit)]
+ location: Location,
+ },
}
impl ErrorExt for Error {
@@ -380,6 +390,7 @@ impl ErrorExt for Error {
| Simplification { .. }
| InvalidInterval { .. }
| InvalidUnaryOp { .. }
+ | InvalidPartitionNumber { .. }
| UnsupportedUnaryOp { .. } => StatusCode::InvalidArguments,
SerializeColumnDefaultConstraint { source, .. } => source.status_code(),
diff --git a/src/sql/src/partition.rs b/src/sql/src/partition.rs
index 4979bf702f..a1fd8e642e 100644
--- a/src/sql/src/partition.rs
+++ b/src/sql/src/partition.rs
@@ -12,10 +12,18 @@
// See the License for the specific language governing permissions and
// limitations under the License.
+use snafu::ensure;
use sqlparser::ast::{BinaryOperator, Expr, Ident, Value};
+use crate::error::{InvalidPartitionNumberSnafu, Result};
use crate::statements::create::Partitions;
+/// The default number of partitions for OpenTelemetry traces.
+const DEFAULT_PARTITION_NUM_FOR_TRACES: u32 = 16;
+
+/// The maximum number of partitions for OpenTelemetry traces.
+const MAX_PARTITION_NUM_FOR_TRACES: u32 = 65536;
+
macro_rules! between_string {
($col: expr, $left_incl: expr, $right_excl: expr) => {
Expr::BinaryOp {
@@ -38,98 +46,105 @@ macro_rules! between_string {
};
}
-macro_rules! or {
- ($left: expr, $right: expr) => {
- Expr::BinaryOp {
- op: BinaryOperator::Or,
- left: Box::new($left),
- right: Box::new($right),
- }
- };
+pub fn partition_rule_for_hexstring(ident: &str) -> Result<Partitions> {
+ Ok(Partitions {
+ column_list: vec![Ident::new(ident)],
+ exprs: partition_rules_for_uuid(DEFAULT_PARTITION_NUM_FOR_TRACES, ident)?,
+ })
}
-pub fn partition_rule_for_hexstring(ident: &str) -> Partitions {
- let ident = Ident::new(ident);
- let ident_expr = Expr::Identifier(ident.clone());
+// partition_rules_for_uuid can create partition rules for up to 65536 partitions.
+fn partition_rules_for_uuid(partition_num: u32, ident: &str) -> Result<Vec<Expr>> {
+ ensure!(
+ partition_num.is_power_of_two() && (2..=65536).contains(&partition_num),
+ InvalidPartitionNumberSnafu { partition_num }
+ );
- // rules are like:
- //
- // "trace_id < '1'",
- // "trace_id >= '1' AND trace_id < '2'",
- // "trace_id >= '2' AND trace_id < '3'",
- // "trace_id >= '3' AND trace_id < '4'",
- // "trace_id >= '4' AND trace_id < '5'",
- // "trace_id >= '5' AND trace_id < '6'",
- // "trace_id >= '6' AND trace_id < '7'",
- // "trace_id >= '7' AND trace_id < '8'",
- // "trace_id >= '8' AND trace_id < '9'",
- // "trace_id >= '9' AND trace_id < 'A'",
- // "trace_id >= 'A' AND trace_id < 'B' OR trace_id >= 'a' AND trace_id < 'b'",
- // "trace_id >= 'B' AND trace_id < 'C' OR trace_id >= 'b' AND trace_id < 'c'",
- // "trace_id >= 'C' AND trace_id < 'D' OR trace_id >= 'c' AND trace_id < 'd'",
- // "trace_id >= 'D' AND trace_id < 'E' OR trace_id >= 'd' AND trace_id < 'e'",
- // "trace_id >= 'E' AND trace_id < 'F' OR trace_id >= 'e' AND trace_id < 'f'",
- // "trace_id >= 'F' AND trace_id < 'a' OR trace_id >= 'f'",
- let rules = vec![
- Expr::BinaryOp {
- left: Box::new(ident_expr.clone()),
- op: BinaryOperator::Lt,
- right: Box::new(Expr::Value(Value::SingleQuotedString("1".to_string()))),
- },
- // [left, right)
- between_string!(ident_expr, "1", "2"),
- between_string!(ident_expr, "2", "3"),
- between_string!(ident_expr, "3", "4"),
- between_string!(ident_expr, "4", "5"),
- between_string!(ident_expr, "5", "6"),
- between_string!(ident_expr, "6", "7"),
- between_string!(ident_expr, "7", "8"),
- between_string!(ident_expr, "8", "9"),
- between_string!(ident_expr, "9", "A"),
- or!(
- between_string!(ident_expr, "A", "B"),
- between_string!(ident_expr, "a", "b")
- ),
- or!(
- between_string!(ident_expr, "B", "C"),
- between_string!(ident_expr, "b", "c")
- ),
- or!(
- between_string!(ident_expr, "C", "D"),
- between_string!(ident_expr, "c", "d")
- ),
- or!(
- between_string!(ident_expr, "D", "E"),
- between_string!(ident_expr, "d", "e")
- ),
- or!(
- between_string!(ident_expr, "E", "F"),
- between_string!(ident_expr, "e", "f")
- ),
- or!(
- between_string!(ident_expr, "F", "a"),
- Expr::BinaryOp {
+ let ident_expr = Expr::Identifier(Ident::new(ident).clone());
+
+ let (total_partitions, hex_length) = {
+ match partition_num {
+ 2..=16 => (16, 1),
+ 17..=256 => (256, 2),
+ 257..=4096 => (4096, 3),
+ 4097..=MAX_PARTITION_NUM_FOR_TRACES => (MAX_PARTITION_NUM_FOR_TRACES, 4),
+ _ => unreachable!(),
+ }
+ };
+
+ let partition_size = total_partitions / partition_num;
+ let remainder = total_partitions % partition_num;
+
+ let mut rules = Vec::new();
+ let mut current_boundary = 0;
+ for i in 0..partition_num {
+ let mut size = partition_size;
+ if i < remainder {
+ size += 1;
+ }
+ let start = current_boundary;
+ let end = current_boundary + size;
+
+ if i == 0 {
+ // Create the leftmost rule, for example: trace_id < '1'.
+ rules.push(Expr::BinaryOp {
+ left: Box::new(ident_expr.clone()),
+ op: BinaryOperator::Lt,
+ right: Box::new(Expr::Value(Value::SingleQuotedString(format!(
+ "{:0hex_length$x}",
+ end
+ )))),
+ });
+ } else if i == partition_num - 1 {
+ // Create the rightmost rule, for example: trace_id >= 'f'.
+ rules.push(Expr::BinaryOp {
left: Box::new(ident_expr.clone()),
op: BinaryOperator::GtEq,
- right: Box::new(Expr::Value(Value::SingleQuotedString("f".to_string()))),
- }
- ),
- ];
+ right: Box::new(Expr::Value(Value::SingleQuotedString(format!(
+ "{:0hex_length$x}",
+ start
+ )))),
+ });
+ } else {
+ // Create the middle rules, for example: trace_id >= '1' AND trace_id < '2'.
+ rules.push(between_string!(
+ ident_expr,
+ format!("{:0hex_length$x}", start),
+ format!("{:0hex_length$x}", end)
+ ));
+ }
- Partitions {
- column_list: vec![ident],
- exprs: rules,
+ current_boundary = end;
}
+
+ Ok(rules)
}
#[cfg(test)]
mod tests {
+ use std::collections::HashMap;
+
use sqlparser::ast::Expr;
use sqlparser::dialect::GenericDialect;
use sqlparser::parser::Parser;
+ use uuid::Uuid;
use super::*;
+ #[test]
+ fn test_partition_rules_for_uuid() {
+ // NOTE: We only test a subset of partitions to keep the test execution time reasonable.
+ // As the number of partitions increases, we need to increase the number of test samples to ensure uniform distribution.
+ assert!(check_distribution(2, 10_000)); // 2^1
+ assert!(check_distribution(4, 10_000)); // 2^2
+ assert!(check_distribution(8, 10_000)); // 2^3
+ assert!(check_distribution(16, 10_000)); // 2^4
+ assert!(check_distribution(32, 10_000)); // 2^5
+ assert!(check_distribution(64, 100_000)); // 2^6
+ assert!(check_distribution(128, 100_000)); // 2^7
+ assert!(check_distribution(256, 100_000)); // 2^8
+ }
+
#[test]
fn test_rules() {
let expr = vec![
@@ -142,13 +157,13 @@ mod tests {
"trace_id >= '6' AND trace_id < '7'",
"trace_id >= '7' AND trace_id < '8'",
"trace_id >= '8' AND trace_id < '9'",
- "trace_id >= '9' AND trace_id < 'A'",
- "trace_id >= 'A' AND trace_id < 'B' OR trace_id >= 'a' AND trace_id < 'b'",
- "trace_id >= 'B' AND trace_id < 'C' OR trace_id >= 'b' AND trace_id < 'c'",
- "trace_id >= 'C' AND trace_id < 'D' OR trace_id >= 'c' AND trace_id < 'd'",
- "trace_id >= 'D' AND trace_id < 'E' OR trace_id >= 'd' AND trace_id < 'e'",
- "trace_id >= 'E' AND trace_id < 'F' OR trace_id >= 'e' AND trace_id < 'f'",
- "trace_id >= 'F' AND trace_id < 'a' OR trace_id >= 'f'",
+ "trace_id >= '9' AND trace_id < 'a'",
+ "trace_id >= 'a' AND trace_id < 'b'",
+ "trace_id >= 'b' AND trace_id < 'c'",
+ "trace_id >= 'c' AND trace_id < 'd'",
+ "trace_id >= 'd' AND trace_id < 'e'",
+ "trace_id >= 'e' AND trace_id < 'f'",
+ "trace_id >= 'f'",
];
let dialect = GenericDialect {};
@@ -160,6 +175,93 @@ mod tests {
})
         .collect::<Vec<_>>();
- assert_eq!(results, partition_rule_for_hexstring("trace_id").exprs);
+ assert_eq!(
+ results,
+ partition_rule_for_hexstring("trace_id").unwrap().exprs
+ );
+ }
+
+ fn check_distribution(test_partition: u32, test_uuid_num: usize) -> bool {
+ // Generate test_uuid_num random uuids.
+ let uuids = (0..test_uuid_num)
+ .map(|_| Uuid::new_v4().to_string().replace("-", "").to_lowercase())
+            .collect::<Vec<_>>();
+
+ // Generate the partition rules.
+ let rules = partition_rules_for_uuid(test_partition, "test_trace_id").unwrap();
+
+ // Collect the number of partitions for each uuid.
+ let mut stats = HashMap::new();
+ for uuid in uuids {
+ let partition = allocate_partition_for_uuid(uuid.clone(), &rules);
+ // Count the number of uuids in each partition.
+ *stats.entry(partition).or_insert(0) += 1;
+ }
+
+ // Check if the partition distribution is uniform.
+ let expected_ratio = 100.0 / test_partition as f64;
+
+ // tolerance is the allowed deviation from the expected ratio.
+ let tolerance = 100.0 / test_partition as f64 * 0.30;
+
+ // For each partition, its ratio should be as close as possible to the expected ratio.
+ for (_, count) in stats {
+ let ratio = (count as f64 / test_uuid_num as f64) * 100.0;
+ if (ratio - expected_ratio).abs() >= tolerance {
+ return false;
+ }
+ }
+
+ true
+ }
+
+ fn allocate_partition_for_uuid(uuid: String, rules: &[Expr]) -> usize {
+ for (i, rule) in rules.iter().enumerate() {
+ if let Expr::BinaryOp { left, op: _, right } = rule {
+ if i == 0 {
+ // Hit the leftmost rule.
+ if let Expr::Value(Value::SingleQuotedString(leftmost)) = *right.clone() {
+ if uuid < leftmost {
+ return i;
+ }
+ }
+ } else if i == rules.len() - 1 {
+ // Hit the rightmost rule.
+ if let Expr::Value(Value::SingleQuotedString(rightmost)) = *right.clone() {
+ if uuid >= rightmost {
+ return i;
+ }
+ }
+ } else {
+ // Hit the middle rules.
+ if let Expr::BinaryOp {
+ left: _,
+ op: _,
+ right: inner_right,
+ } = *left.clone()
+ {
+ if let Expr::Value(Value::SingleQuotedString(lower)) = *inner_right.clone()
+ {
+ if let Expr::BinaryOp {
+ left: _,
+ op: _,
+ right: inner_right,
+ } = *right.clone()
+ {
+ if let Expr::Value(Value::SingleQuotedString(upper)) =
+ *inner_right.clone()
+ {
+ if uuid >= lower && uuid < upper {
+ return i;
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+
+ panic!("No partition found for uuid: {}, rules: {:?}", uuid, rules);
}
}
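
To make the boundary arithmetic concrete: for `partition_num = 4` the function selects `hex_length = 1` (16 hex buckets), each partition covers 4 buckets, and the generated rules are `trace_id < '4'`, `trace_id >= '4' AND trace_id < '8'`, `trace_id >= '8' AND trace_id < 'c'`, and `trace_id >= 'c'`. The sketch below reproduces only that boundary computation, emitting rule strings instead of `sqlparser` expressions and skipping the remainder distribution (power-of-two counts always divide the bucket space evenly):

```rust
// Recompute the partition boundaries the same way partition_rules_for_uuid does,
// but emit plain strings instead of sqlparser AST nodes.
fn boundary_rules(partition_num: u32, ident: &str) -> Vec<String> {
    assert!(partition_num.is_power_of_two() && (2..=65536).contains(&partition_num));

    let (total, hex_length) = match partition_num {
        2..=16 => (16u32, 1usize),
        17..=256 => (256, 2),
        257..=4096 => (4096, 3),
        _ => (65536, 4),
    };
    let size = total / partition_num;

    (0..partition_num)
        .map(|i| {
            let start = format!("{:0hex_length$x}", i * size);
            let end = format!("{:0hex_length$x}", (i + 1) * size);
            if i == 0 {
                format!("{ident} < '{end}'")
            } else if i == partition_num - 1 {
                format!("{ident} >= '{start}'")
            } else {
                format!("{ident} >= '{start}' AND {ident} < '{end}'")
            }
        })
        .collect()
}

fn main() {
    for rule in boundary_rules(4, "trace_id") {
        println!("{rule}");
    }
    // trace_id < '4'
    // trace_id >= '4' AND trace_id < '8'
    // trace_id >= '8' AND trace_id < 'c'
    // trace_id >= 'c'
}
```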
diff --git a/tests-integration/tests/http.rs b/tests-integration/tests/http.rs
index 6eb4d10562..d20fe50241 100644
--- a/tests-integration/tests/http.rs
+++ b/tests-integration/tests/http.rs
@@ -2488,7 +2488,7 @@ pub async fn test_otlp_traces_v1(store_type: StorageType) {
let expected = r#"[[1736480942444376000,1736480942444499000,123000,null,"c05d7a4ec8e1f231f02ed6e8da8655b4","d24f921c75f68e23","SPAN_KIND_CLIENT","lets-go","STATUS_CODE_UNSET","","","telemetrygen","","telemetrygen","1.2.3.4","telemetrygen-server",[],[]],[1736480942444376000,1736480942444499000,123000,"d24f921c75f68e23","c05d7a4ec8e1f231f02ed6e8da8655b4","9630f2916e2f7909","SPAN_KIND_SERVER","okey-dokey-0","STATUS_CODE_UNSET","","","telemetrygen","","telemetrygen","1.2.3.4","telemetrygen-client",[],[]],[1736480942444589000,1736480942444712000,123000,null,"cc9e0991a2e63d274984bd44ee669203","eba7be77e3558179","SPAN_KIND_CLIENT","lets-go","STATUS_CODE_UNSET","","","telemetrygen","","telemetrygen","1.2.3.4","telemetrygen-server",[],[]],[1736480942444589000,1736480942444712000,123000,"eba7be77e3558179","cc9e0991a2e63d274984bd44ee669203","8f847259b0f6e1ab","SPAN_KIND_SERVER","okey-dokey-0","STATUS_CODE_UNSET","","","telemetrygen","","telemetrygen","1.2.3.4","telemetrygen-client",[],[]]]"#;
validate_data("otlp_traces", &client, "select * from mytable;", expected).await;
- let expected_ddl = r#"[["mytable","CREATE TABLE IF NOT EXISTS \"mytable\" (\n \"timestamp\" TIMESTAMP(9) NOT NULL,\n \"timestamp_end\" TIMESTAMP(9) NULL,\n \"duration_nano\" BIGINT UNSIGNED NULL,\n \"parent_span_id\" STRING NULL SKIPPING INDEX WITH(granularity = '10240', type = 'BLOOM'),\n \"trace_id\" STRING NULL SKIPPING INDEX WITH(granularity = '10240', type = 'BLOOM'),\n \"span_id\" STRING NULL,\n \"span_kind\" STRING NULL,\n \"span_name\" STRING NULL,\n \"span_status_code\" STRING NULL,\n \"span_status_message\" STRING NULL,\n \"trace_state\" STRING NULL,\n \"scope_name\" STRING NULL,\n \"scope_version\" STRING NULL,\n \"service_name\" STRING NULL SKIPPING INDEX WITH(granularity = '10240', type = 'BLOOM'),\n \"span_attributes.net.peer.ip\" STRING NULL,\n \"span_attributes.peer.service\" STRING NULL,\n \"span_events\" JSON NULL,\n \"span_links\" JSON NULL,\n TIME INDEX (\"timestamp\"),\n PRIMARY KEY (\"service_name\")\n)\nPARTITION ON COLUMNS (\"trace_id\") (\n trace_id < '1',\n trace_id >= '1' AND trace_id < '2',\n trace_id >= '2' AND trace_id < '3',\n trace_id >= '3' AND trace_id < '4',\n trace_id >= '4' AND trace_id < '5',\n trace_id >= '5' AND trace_id < '6',\n trace_id >= '6' AND trace_id < '7',\n trace_id >= '7' AND trace_id < '8',\n trace_id >= '8' AND trace_id < '9',\n trace_id >= '9' AND trace_id < 'A',\n trace_id >= 'A' AND trace_id < 'B' OR trace_id >= 'a' AND trace_id < 'b',\n trace_id >= 'B' AND trace_id < 'C' OR trace_id >= 'b' AND trace_id < 'c',\n trace_id >= 'C' AND trace_id < 'D' OR trace_id >= 'c' AND trace_id < 'd',\n trace_id >= 'D' AND trace_id < 'E' OR trace_id >= 'd' AND trace_id < 'e',\n trace_id >= 'E' AND trace_id < 'F' OR trace_id >= 'e' AND trace_id < 'f',\n trace_id >= 'F' AND trace_id < 'a' OR trace_id >= 'f'\n)\nENGINE=mito\nWITH(\n append_mode = 'true',\n table_data_model = 'greptime_trace_v1'\n)"]]"#;
+ let expected_ddl = r#"[["mytable","CREATE TABLE IF NOT EXISTS \"mytable\" (\n \"timestamp\" TIMESTAMP(9) NOT NULL,\n \"timestamp_end\" TIMESTAMP(9) NULL,\n \"duration_nano\" BIGINT UNSIGNED NULL,\n \"parent_span_id\" STRING NULL SKIPPING INDEX WITH(granularity = '10240', type = 'BLOOM'),\n \"trace_id\" STRING NULL SKIPPING INDEX WITH(granularity = '10240', type = 'BLOOM'),\n \"span_id\" STRING NULL,\n \"span_kind\" STRING NULL,\n \"span_name\" STRING NULL,\n \"span_status_code\" STRING NULL,\n \"span_status_message\" STRING NULL,\n \"trace_state\" STRING NULL,\n \"scope_name\" STRING NULL,\n \"scope_version\" STRING NULL,\n \"service_name\" STRING NULL SKIPPING INDEX WITH(granularity = '10240', type = 'BLOOM'),\n \"span_attributes.net.peer.ip\" STRING NULL,\n \"span_attributes.peer.service\" STRING NULL,\n \"span_events\" JSON NULL,\n \"span_links\" JSON NULL,\n TIME INDEX (\"timestamp\"),\n PRIMARY KEY (\"service_name\")\n)\nPARTITION ON COLUMNS (\"trace_id\") (\n trace_id < '1',\n trace_id >= 'f',\n trace_id >= '1' AND trace_id < '2',\n trace_id >= '2' AND trace_id < '3',\n trace_id >= '3' AND trace_id < '4',\n trace_id >= '4' AND trace_id < '5',\n trace_id >= '5' AND trace_id < '6',\n trace_id >= '6' AND trace_id < '7',\n trace_id >= '7' AND trace_id < '8',\n trace_id >= '8' AND trace_id < '9',\n trace_id >= '9' AND trace_id < 'a',\n trace_id >= 'a' AND trace_id < 'b',\n trace_id >= 'b' AND trace_id < 'c',\n trace_id >= 'c' AND trace_id < 'd',\n trace_id >= 'd' AND trace_id < 'e',\n trace_id >= 'e' AND trace_id < 'f'\n)\nENGINE=mito\nWITH(\n append_mode = 'true',\n table_data_model = 'greptime_trace_v1'\n)"]]"#;
validate_data(
"otlp_traces",
&client,
From 032df4c5330a2b94515d38764c56c7da4fe4ee77 Mon Sep 17 00:00:00 2001
From: discord9 <55937128+discord9@users.noreply.github.com>
Date: Tue, 15 Apr 2025 15:03:12 +0800
Subject: [PATCH 22/82] feat(flow): dual engine (#5881)
* feat: partial use batch mode(WIP)
* feat: add flow engine trait
* refactor: more trait method
* dual engine
* feat: dual engine
* refactor: flow map cache
* chore: per review
* chore: per review
---
Cargo.toml | 5 +
src/cmd/src/flownode.rs | 6 +-
src/cmd/src/standalone.rs | 8 +-
src/common/meta/src/ddl/create_flow.rs | 2 +-
src/flow/src/adapter.rs | 53 +--
src/flow/src/adapter/flownode_impl.rs | 437 +++++++++++++++++++++----
src/flow/src/batching_mode.rs | 4 +-
src/flow/src/batching_mode/engine.rs | 48 ++-
src/flow/src/batching_mode/state.rs | 3 +-
src/flow/src/batching_mode/task.rs | 4 +-
src/flow/src/engine.rs | 57 ++++
src/flow/src/error.rs | 2 +-
src/flow/src/lib.rs | 5 +-
src/flow/src/server.rs | 10 +-
src/meta-client/src/client.rs | 1 +
tests-integration/src/standalone.rs | 5 +-
16 files changed, 534 insertions(+), 116 deletions(-)
create mode 100644 src/flow/src/engine.rs
diff --git a/Cargo.toml b/Cargo.toml
index 38b749e7b0..f3bd54a661 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -307,3 +307,8 @@ strip = true
[profile.dev.package.tests-fuzz]
debug = false
strip = true
+
+[profile.dev]
+opt-level = 1
+[profile.dev.package."*"]
+opt-level = 3
diff --git a/src/cmd/src/flownode.rs b/src/cmd/src/flownode.rs
index fc23d37c23..a7b530e558 100644
--- a/src/cmd/src/flownode.rs
+++ b/src/cmd/src/flownode.rs
@@ -32,7 +32,9 @@ use common_meta::key::TableMetadataManager;
use common_telemetry::info;
use common_telemetry::logging::TracingOptions;
use common_version::{short_version, version};
-use flow::{FlownodeBuilder, FlownodeInstance, FlownodeServiceBuilder, FrontendInvoker};
+use flow::{
+ FlownodeBuilder, FlownodeInstance, FlownodeServiceBuilder, FrontendClient, FrontendInvoker,
+};
use meta_client::{MetaClientOptions, MetaClientType};
use snafu::{ensure, OptionExt, ResultExt};
use tracing_appender::non_blocking::WorkerGuard;
@@ -313,12 +315,14 @@ impl StartCommand {
);
let flow_metadata_manager = Arc::new(FlowMetadataManager::new(cached_meta_backend.clone()));
+ let frontend_client = FrontendClient::from_meta_client(meta_client.clone());
let flownode_builder = FlownodeBuilder::new(
opts.clone(),
Plugins::new(),
table_metadata_manager,
catalog_manager.clone(),
flow_metadata_manager,
+ Arc::new(frontend_client),
)
.with_heartbeat_task(heartbeat_task);
diff --git a/src/cmd/src/standalone.rs b/src/cmd/src/standalone.rs
index 4504927cc8..3177a2446f 100644
--- a/src/cmd/src/standalone.rs
+++ b/src/cmd/src/standalone.rs
@@ -57,7 +57,7 @@ use datanode::region_server::RegionServer;
use file_engine::config::EngineConfig as FileEngineConfig;
use flow::{
FlowConfig, FlowWorkerManager, FlownodeBuilder, FlownodeInstance, FlownodeOptions,
- FrontendInvoker,
+ FrontendClient, FrontendInvoker,
};
use frontend::frontend::{Frontend, FrontendOptions};
use frontend::instance::builder::FrontendBuilder;
@@ -523,12 +523,18 @@ impl StartCommand {
flow: opts.flow.clone(),
..Default::default()
};
+
+        // TODO(discord9): for standalone, avoid going through grpc; instead somehow get a handle to the
+        // frontend grpc client without actually making a connection
+ let fe_server_addr = fe_opts.grpc.bind_addr.clone();
+ let frontend_client = FrontendClient::from_static_grpc_addr(fe_server_addr);
let flow_builder = FlownodeBuilder::new(
flownode_options,
plugins.clone(),
table_metadata_manager.clone(),
catalog_manager.clone(),
flow_metadata_manager.clone(),
+ Arc::new(frontend_client),
);
let flownode = flow_builder
.build()
diff --git a/src/common/meta/src/ddl/create_flow.rs b/src/common/meta/src/ddl/create_flow.rs
index 4e7d661c1d..8b1c0354d4 100644
--- a/src/common/meta/src/ddl/create_flow.rs
+++ b/src/common/meta/src/ddl/create_flow.rs
@@ -324,7 +324,7 @@ pub enum CreateFlowState {
}
/// The type of flow.
-#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub enum FlowType {
/// The flow is a batching task.
Batching,
diff --git a/src/flow/src/adapter.rs b/src/flow/src/adapter.rs
index 8fd62ee2a0..516254ae55 100644
--- a/src/flow/src/adapter.rs
+++ b/src/flow/src/adapter.rs
@@ -16,7 +16,7 @@
//! and communicating with other parts of the database
#![warn(unused_imports)]
-use std::collections::{BTreeMap, HashMap};
+use std::collections::BTreeMap;
use std::sync::Arc;
use std::time::{Duration, Instant, SystemTime};
@@ -56,6 +56,7 @@ use crate::error::{EvalSnafu, ExternalSnafu, InternalSnafu, InvalidQuerySnafu, U
use crate::expr::Batch;
use crate::metrics::{METRIC_FLOW_INSERT_ELAPSED, METRIC_FLOW_ROWS, METRIC_FLOW_RUN_INTERVAL_MS};
use crate::repr::{self, DiffRow, RelationDesc, Row, BATCH_SIZE};
+use crate::{CreateFlowArgs, FlowId, TableName};
mod flownode_impl;
mod parse_expr;
@@ -78,11 +79,6 @@ pub const AUTO_CREATED_PLACEHOLDER_TS_COL: &str = "__ts_placeholder";
pub const AUTO_CREATED_UPDATE_AT_TS_COL: &str = "update_at";
-// TODO(discord9): refactor common types for flow to a separate module
-/// FlowId is a unique identifier for a flow task
-pub type FlowId = u64;
-pub type TableName = [String; 3];
-
/// Flow config that exists both in standalone&distributed mode
#[derive(Clone, Debug, Serialize, Deserialize, PartialEq)]
#[serde(default)]
@@ -731,25 +727,10 @@ impl FlowWorkerManager {
}
}
-/// The arguments to create a flow in [`FlowWorkerManager`].
-#[derive(Debug, Clone)]
-pub struct CreateFlowArgs {
- pub flow_id: FlowId,
- pub sink_table_name: TableName,
-    pub source_table_ids: Vec<TableId>,
-    pub create_if_not_exists: bool,
-    pub or_replace: bool,
-    pub expire_after: Option<i64>,
-    pub comment: Option<String>,
-    pub sql: String,
-    pub flow_options: HashMap<String, String>,
-    pub query_ctx: Option<QueryContext>,
-}
-
/// Create&Remove flow
impl FlowWorkerManager {
/// remove a flow by it's id
- pub async fn remove_flow(&self, flow_id: FlowId) -> Result<(), Error> {
+ pub async fn remove_flow_inner(&self, flow_id: FlowId) -> Result<(), Error> {
for handle in self.worker_handles.iter() {
if handle.contains_flow(flow_id).await? {
handle.remove_flow(flow_id).await?;
@@ -766,7 +747,7 @@ impl FlowWorkerManager {
/// 1. parse query into typed plan(and optional parse expire_after expr)
/// 2. render source/sink with output table id and used input table id
#[allow(clippy::too_many_arguments)]
-    pub async fn create_flow(&self, args: CreateFlowArgs) -> Result<Option<FlowId>, Error> {
+    pub async fn create_flow_inner(&self, args: CreateFlowArgs) -> Result<Option<FlowId>, Error> {
let CreateFlowArgs {
flow_id,
sink_table_name,
@@ -905,6 +886,32 @@ impl FlowWorkerManager {
info!("Successfully create flow with id={}", flow_id);
Ok(Some(flow_id))
}
+
+    pub async fn flush_flow_inner(&self, flow_id: FlowId) -> Result<usize, Error> {
+ debug!("Starting to flush flow_id={:?}", flow_id);
+ // lock to make sure writes before flush are written to flow
+ // and immediately drop to prevent following writes to be blocked
+ drop(self.flush_lock.write().await);
+ let flushed_input_rows = self.node_context.read().await.flush_all_sender().await?;
+ let rows_send = self.run_available(true).await?;
+ let row = self.send_writeback_requests().await?;
+        debug!(
+            "Done to flush flow_id={:?} with {} input rows flushed, {} rows sent and {} output rows flushed",
+ flow_id, flushed_input_rows, rows_send, row
+ );
+ Ok(row)
+ }
+
+    pub async fn flow_exist_inner(&self, flow_id: FlowId) -> Result<bool, Error> {
+ let mut exist = false;
+ for handle in self.worker_handles.iter() {
+ if handle.contains_flow(flow_id).await? {
+ exist = true;
+ break;
+ }
+ }
+ Ok(exist)
+ }
}
/// FlowTickManager is a manager for flow tick, which tracks flow execution progress
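
`flush_flow_inner` above takes the `flush_lock` write guard and drops it on the spot: the write lock can only be acquired once every in-flight insert, which holds a read guard, has finished, so the lock acts as a write barrier rather than protecting data. A minimal sketch of that pattern with a bare `tokio::sync::RwLock` (task names and sleep durations are illustrative):

```rust
use std::sync::Arc;
use tokio::sync::RwLock;

#[tokio::main]
async fn main() {
    // The lock guards no data; it only orders flushes after in-flight writes.
    let flush_lock = Arc::new(RwLock::new(()));

    // A writer holds a read guard while it appends data.
    let writer_lock = flush_lock.clone();
    let writer = tokio::spawn(async move {
        let _guard = writer_lock.read().await;
        tokio::time::sleep(std::time::Duration::from_millis(50)).await;
        println!("write finished");
    });

    tokio::time::sleep(std::time::Duration::from_millis(10)).await;

    // Acquire (and immediately drop) the write guard: this only proceeds once every
    // outstanding read guard -- i.e. every in-flight write -- has been released,
    // and it does not block writes that start afterwards.
    drop(flush_lock.write().await);
    println!("all writes before the flush are visible; flushing now");

    writer.await.unwrap();
}
```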
diff --git a/src/flow/src/adapter/flownode_impl.rs b/src/flow/src/adapter/flownode_impl.rs
index 1daec77fbd..b7d218ef21 100644
--- a/src/flow/src/adapter/flownode_impl.rs
+++ b/src/flow/src/adapter/flownode_impl.rs
@@ -13,40 +13,228 @@
// limitations under the License.
//! impl `FlowNode` trait for FlowNodeManager so standalone can call them
-use std::collections::HashMap;
+use std::collections::{HashMap, HashSet};
+use std::sync::Arc;
use api::v1::flow::{
flow_request, CreateRequest, DropRequest, FlowRequest, FlowResponse, FlushFlow,
};
use api::v1::region::InsertRequests;
use common_error::ext::BoxedError;
-use common_meta::error::{ExternalSnafu, Result, UnexpectedSnafu};
-use common_telemetry::{debug, trace};
+use common_meta::ddl::create_flow::FlowType;
+use common_meta::error::{Result as MetaResult, UnexpectedSnafu};
+use common_runtime::JoinHandle;
+use common_telemetry::{trace, warn};
use datatypes::value::Value;
use itertools::Itertools;
use snafu::{IntoError, OptionExt, ResultExt};
-use store_api::storage::RegionId;
+use store_api::storage::{RegionId, TableId};
use crate::adapter::{CreateFlowArgs, FlowWorkerManager};
-use crate::error::{CreateFlowSnafu, InsertIntoFlowSnafu, InternalSnafu};
+use crate::batching_mode::engine::BatchingEngine;
+use crate::engine::FlowEngine;
+use crate::error::{CreateFlowSnafu, FlowNotFoundSnafu, InsertIntoFlowSnafu, InternalSnafu};
use crate::metrics::METRIC_FLOW_TASK_COUNT;
use crate::repr::{self, DiffRow};
+use crate::{Error, FlowId};
-/// return a function to convert `crate::error::Error` to `common_meta::error::Error`
-fn to_meta_err(
- location: snafu::Location,
-) -> impl FnOnce(crate::error::Error) -> common_meta::error::Error {
- move |err: crate::error::Error| -> common_meta::error::Error {
- common_meta::error::Error::External {
- location,
- source: BoxedError::new(err),
+/// Manages both the streaming mode engine and the batching mode engine,
+///
+/// including flow create/drop/flush operations,
+/// and redirecting insert requests to the appropriate engine.
+pub struct FlowDualEngine {
+    streaming_engine: Arc<FlowWorkerManager>,
+    batching_engine: Arc<BatchingEngine>,
+    /// Helper struct for quickly looking up flows by source table id, and vice versa.
+    src_table2flow: std::sync::RwLock<SrcTableToFlow>,
+}
+
+struct SrcTableToFlow {
+ /// mapping of table ids to flow ids for streaming mode
+    stream: HashMap<TableId, HashSet<FlowId>>,
+    /// mapping of table ids to flow ids for batching mode
+    batch: HashMap<TableId, HashSet<FlowId>>,
+    /// mapping of flow ids to (flow type, source table ids)
+    flow_infos: HashMap<FlowId, (FlowType, Vec<TableId>)>,
+}
+
+impl SrcTableToFlow {
+ fn in_stream(&self, table_id: TableId) -> bool {
+ self.stream.contains_key(&table_id)
+ }
+ fn in_batch(&self, table_id: TableId) -> bool {
+ self.batch.contains_key(&table_id)
+ }
+    fn add_flow(&mut self, flow_id: FlowId, flow_type: FlowType, src_table_ids: Vec<TableId>) {
+ let mapping = match flow_type {
+ FlowType::Streaming => &mut self.stream,
+ FlowType::Batching => &mut self.batch,
+ };
+
+ for src_table in src_table_ids.clone() {
+ mapping
+ .entry(src_table)
+ .and_modify(|flows| {
+ flows.insert(flow_id);
+ })
+ .or_insert_with(|| {
+ let mut set = HashSet::new();
+ set.insert(flow_id);
+ set
+ });
}
+ self.flow_infos.insert(flow_id, (flow_type, src_table_ids));
+ }
+
+ fn remove_flow(&mut self, flow_id: FlowId) {
+ let mapping = match self.get_flow_type(flow_id) {
+ Some(FlowType::Streaming) => &mut self.stream,
+ Some(FlowType::Batching) => &mut self.batch,
+ None => return,
+ };
+ if let Some((_, src_table_ids)) = self.flow_infos.remove(&flow_id) {
+ for src_table in src_table_ids {
+ if let Some(flows) = mapping.get_mut(&src_table) {
+ flows.remove(&flow_id);
+ }
+ }
+ }
+ }
+
+    fn get_flow_type(&self, flow_id: FlowId) -> Option<FlowType> {
+ self.flow_infos
+ .get(&flow_id)
+ .map(|(flow_type, _)| flow_type)
+ .cloned()
+ }
+}
+
+impl FlowEngine for FlowDualEngine {
+    async fn create_flow(&self, args: CreateFlowArgs) -> Result<Option<FlowId>, Error> {
+ let flow_type = args
+ .flow_options
+ .get(FlowType::FLOW_TYPE_KEY)
+ .map(|s| s.as_str());
+
+ let flow_type = match flow_type {
+ Some(FlowType::BATCHING) => FlowType::Batching,
+ Some(FlowType::STREAMING) => FlowType::Streaming,
+ None => FlowType::Batching,
+ Some(flow_type) => {
+ return InternalSnafu {
+ reason: format!("Invalid flow type: {}", flow_type),
+ }
+ .fail()
+ }
+ };
+
+ let flow_id = args.flow_id;
+ let src_table_ids = args.source_table_ids.clone();
+
+ let res = match flow_type {
+ FlowType::Batching => self.batching_engine.create_flow(args).await,
+ FlowType::Streaming => self.streaming_engine.create_flow(args).await,
+ }?;
+
+ self.src_table2flow
+ .write()
+ .unwrap()
+ .add_flow(flow_id, flow_type, src_table_ids);
+
+ Ok(res)
+ }
+
+ async fn remove_flow(&self, flow_id: FlowId) -> Result<(), Error> {
+ let flow_type = self.src_table2flow.read().unwrap().get_flow_type(flow_id);
+ match flow_type {
+ Some(FlowType::Batching) => self.batching_engine.remove_flow(flow_id).await,
+ Some(FlowType::Streaming) => self.streaming_engine.remove_flow(flow_id).await,
+ None => FlowNotFoundSnafu { id: flow_id }.fail(),
+ }?;
+ // remove mapping
+ self.src_table2flow.write().unwrap().remove_flow(flow_id);
+ Ok(())
+ }
+
+    async fn flush_flow(&self, flow_id: FlowId) -> Result<usize, Error> {
+ let flow_type = self.src_table2flow.read().unwrap().get_flow_type(flow_id);
+ match flow_type {
+ Some(FlowType::Batching) => self.batching_engine.flush_flow(flow_id).await,
+ Some(FlowType::Streaming) => self.streaming_engine.flush_flow(flow_id).await,
+ None => FlowNotFoundSnafu { id: flow_id }.fail(),
+ }
+ }
+
+    async fn flow_exist(&self, flow_id: FlowId) -> Result<bool, Error> {
+        let flow_type = self.src_table2flow.read().unwrap().get_flow_type(flow_id);
+        // Not using `flow_type.is_some()` here, to make sure the flow actually exists in the underlying engine
+ match flow_type {
+ Some(FlowType::Batching) => self.batching_engine.flow_exist(flow_id).await,
+ Some(FlowType::Streaming) => self.streaming_engine.flow_exist(flow_id).await,
+ None => Ok(false),
+ }
+ }
+
+ async fn handle_flow_inserts(
+ &self,
+ request: api::v1::region::InsertRequests,
+ ) -> Result<(), Error> {
+ // TODO(discord9): make as little clone as possible
+ let mut to_stream_engine = Vec::with_capacity(request.requests.len());
+ let mut to_batch_engine = request.requests;
+
+ {
+ let src_table2flow = self.src_table2flow.read().unwrap();
+ to_batch_engine.retain(|req| {
+ let region_id = RegionId::from(req.region_id);
+ let table_id = region_id.table_id();
+ let is_in_stream = src_table2flow.in_stream(table_id);
+ let is_in_batch = src_table2flow.in_batch(table_id);
+ if is_in_stream {
+ to_stream_engine.push(req.clone());
+ }
+ if is_in_batch {
+ return true;
+ }
+ if !is_in_batch && !is_in_stream {
+                    // TODO(discord9): also put this into centralized logging for flow once it is implemented
+ warn!("Table {} is not any flow's source table", table_id)
+ }
+ false
+ });
+ // drop(src_table2flow);
+ // can't use drop due to https://github.com/rust-lang/rust/pull/128846
+ }
+
+ let streaming_engine = self.streaming_engine.clone();
+        let stream_handler: JoinHandle<Result<(), Error>> =
+ common_runtime::spawn_global(async move {
+ streaming_engine
+ .handle_flow_inserts(api::v1::region::InsertRequests {
+ requests: to_stream_engine,
+ })
+ .await?;
+ Ok(())
+ });
+ self.batching_engine
+ .handle_flow_inserts(api::v1::region::InsertRequests {
+ requests: to_batch_engine,
+ })
+ .await?;
+ stream_handler.await.map_err(|e| {
+ crate::error::UnexpectedSnafu {
+                reason: format!("JoinError when handling inserts for flow stream engine: {e:?}"),
+ }
+ .build()
+ })??;
+
+ Ok(())
}
}
#[async_trait::async_trait]
-impl common_meta::node_manager::Flownode for FlowWorkerManager {
-    async fn handle(&self, request: FlowRequest) -> Result<FlowResponse> {
+impl common_meta::node_manager::Flownode for FlowDualEngine {
+    async fn handle(&self, request: FlowRequest) -> MetaResult<FlowResponse> {
let query_ctx = request
.header
.and_then(|h| h.query_context)
@@ -109,31 +297,10 @@ impl common_meta::node_manager::Flownode for FlowWorkerManager {
Some(flow_request::Body::Flush(FlushFlow {
flow_id: Some(flow_id),
})) => {
- // TODO(discord9): impl individual flush
- debug!("Starting to flush flow_id={:?}", flow_id);
- // lock to make sure writes before flush are written to flow
- // and immediately drop to prevent following writes to be blocked
- drop(self.flush_lock.write().await);
- let flushed_input_rows = self
- .node_context
- .read()
- .await
- .flush_all_sender()
- .await
- .map_err(to_meta_err(snafu::location!()))?;
- let rows_send = self
- .run_available(true)
- .await
- .map_err(to_meta_err(snafu::location!()))?;
let row = self
- .send_writeback_requests()
+ .flush_flow(flow_id.id as u64)
.await
.map_err(to_meta_err(snafu::location!()))?;
-
- debug!(
- "Done to flush flow_id={:?} with {} input rows flushed, {} rows sended and {} output rows flushed",
- flow_id, flushed_input_rows, rows_send, row
- );
Ok(FlowResponse {
affected_flows: vec![flow_id],
affected_rows: row as u64,
@@ -151,7 +318,167 @@ impl common_meta::node_manager::Flownode for FlowWorkerManager {
}
}
-    async fn handle_inserts(&self, request: InsertRequests) -> Result<FlowResponse> {
+    async fn handle_inserts(&self, request: InsertRequests) -> MetaResult<FlowResponse> {
+ FlowEngine::handle_flow_inserts(self, request)
+ .await
+ .map(|_| Default::default())
+ .map_err(to_meta_err(snafu::location!()))
+ }
+}
+
+/// return a function to convert `crate::error::Error` to `common_meta::error::Error`
+fn to_meta_err(
+ location: snafu::Location,
+) -> impl FnOnce(crate::error::Error) -> common_meta::error::Error {
+ move |err: crate::error::Error| -> common_meta::error::Error {
+ common_meta::error::Error::External {
+ location,
+ source: BoxedError::new(err),
+ }
+ }
+}
+
+#[async_trait::async_trait]
+impl common_meta::node_manager::Flownode for FlowWorkerManager {
+    async fn handle(&self, request: FlowRequest) -> MetaResult<FlowResponse> {
+ let query_ctx = request
+ .header
+ .and_then(|h| h.query_context)
+ .map(|ctx| ctx.into());
+ match request.body {
+ Some(flow_request::Body::Create(CreateRequest {
+ flow_id: Some(task_id),
+ source_table_ids,
+ sink_table_name: Some(sink_table_name),
+ create_if_not_exists,
+ expire_after,
+ comment,
+ sql,
+ flow_options,
+ or_replace,
+ })) => {
+ let source_table_ids = source_table_ids.into_iter().map(|id| id.id).collect_vec();
+ let sink_table_name = [
+ sink_table_name.catalog_name,
+ sink_table_name.schema_name,
+ sink_table_name.table_name,
+ ];
+ let expire_after = expire_after.map(|e| e.value);
+ let args = CreateFlowArgs {
+ flow_id: task_id.id as u64,
+ sink_table_name,
+ source_table_ids,
+ create_if_not_exists,
+ or_replace,
+ expire_after,
+ comment: Some(comment),
+ sql: sql.clone(),
+ flow_options,
+ query_ctx,
+ };
+ let ret = self
+ .create_flow(args)
+ .await
+ .map_err(BoxedError::new)
+ .with_context(|_| CreateFlowSnafu { sql: sql.clone() })
+ .map_err(to_meta_err(snafu::location!()))?;
+ METRIC_FLOW_TASK_COUNT.inc();
+ Ok(FlowResponse {
+ affected_flows: ret
+ .map(|id| greptime_proto::v1::FlowId { id: id as u32 })
+ .into_iter()
+ .collect_vec(),
+ ..Default::default()
+ })
+ }
+ Some(flow_request::Body::Drop(DropRequest {
+ flow_id: Some(flow_id),
+ })) => {
+ self.remove_flow(flow_id.id as u64)
+ .await
+ .map_err(to_meta_err(snafu::location!()))?;
+ METRIC_FLOW_TASK_COUNT.dec();
+ Ok(Default::default())
+ }
+ Some(flow_request::Body::Flush(FlushFlow {
+ flow_id: Some(flow_id),
+ })) => {
+ let row = self
+ .flush_flow_inner(flow_id.id as u64)
+ .await
+ .map_err(to_meta_err(snafu::location!()))?;
+ Ok(FlowResponse {
+ affected_flows: vec![flow_id],
+ affected_rows: row as u64,
+ ..Default::default()
+ })
+ }
+ None => UnexpectedSnafu {
+ err_msg: "Missing request body",
+ }
+ .fail(),
+ _ => UnexpectedSnafu {
+ err_msg: "Invalid request body.",
+ }
+ .fail(),
+ }
+ }
+
+    async fn handle_inserts(&self, request: InsertRequests) -> MetaResult<FlowResponse> {
+ self.handle_inserts_inner(request)
+ .await
+ .map(|_| Default::default())
+ .map_err(to_meta_err(snafu::location!()))
+ }
+}
+
+impl FlowEngine for FlowWorkerManager {
+    async fn create_flow(&self, args: CreateFlowArgs) -> Result<Option<FlowId>, Error> {
+ self.create_flow_inner(args).await
+ }
+
+ async fn remove_flow(&self, flow_id: FlowId) -> Result<(), Error> {
+ self.remove_flow_inner(flow_id).await
+ }
+
+    async fn flush_flow(&self, flow_id: FlowId) -> Result<usize, Error> {
+ self.flush_flow_inner(flow_id).await
+ }
+
+    async fn flow_exist(&self, flow_id: FlowId) -> Result<bool, Error> {
+ self.flow_exist_inner(flow_id).await
+ }
+
+ async fn handle_flow_inserts(
+ &self,
+ request: api::v1::region::InsertRequests,
+ ) -> Result<(), Error> {
+ self.handle_inserts_inner(request).await
+ }
+}
+
+/// Simple helper enum for fetching value from row with default value
+#[derive(Debug, Clone)]
+enum FetchFromRow {
+ Idx(usize),
+ Default(Value),
+}
+
+impl FetchFromRow {
+ /// Panic if idx is out of bound
+ fn fetch(&self, row: &repr::Row) -> Value {
+ match self {
+ FetchFromRow::Idx(idx) => row.get(*idx).unwrap().clone(),
+ FetchFromRow::Default(v) => v.clone(),
+ }
+ }
+}
+
+impl FlowWorkerManager {
+ async fn handle_inserts_inner(
+ &self,
+ request: InsertRequests,
+ ) -> std::result::Result<(), Error> {
// using try_read to ensure two things:
// 1. flush wouldn't happen until inserts before it is inserted
// 2. inserts happening concurrently with flush wouldn't be block by flush
@@ -172,11 +499,7 @@ impl common_meta::node_manager::Flownode for FlowWorkerManager {
let ctx = self.node_context.read().await;
// TODO(discord9): also check schema version so that altered table can be reported
- let table_schema = ctx
- .table_source
- .table_from_id(&table_id)
- .await
- .map_err(to_meta_err(snafu::location!()))?;
+ let table_schema = ctx.table_source.table_from_id(&table_id).await?;
let default_vals = table_schema
.default_values
.iter()
@@ -210,9 +533,9 @@ impl common_meta::node_manager::Flownode for FlowWorkerManager {
None => InternalSnafu {
reason: format!("Expect column {idx} of table id={table_id} to have name in table schema, found None"),
}
- .fail().map_err(BoxedError::new).context(ExternalSnafu),
+ .fail(),
})
-            .collect::<Result<Vec<_>>>()?;
+            .collect::<Result<Vec<_>, _>>()?;
let name_to_col = HashMap::<_, _>::from_iter(
insert_schema
.iter()
@@ -229,8 +552,8 @@ impl common_meta::node_manager::Flownode for FlowWorkerManager {
.copied()
.map(FetchFromRow::Idx)
.or_else(|| col_default_val.clone().map(FetchFromRow::Default))
- .with_context(|| UnexpectedSnafu {
- err_msg: format!(
+ .with_context(|| crate::error::UnexpectedSnafu {
+ reason: format!(
"Column not found: {}, default_value: {:?}",
col_name, col_default_val
),
@@ -272,27 +595,9 @@ impl common_meta::node_manager::Flownode for FlowWorkerManager {
}
.into_error(err);
common_telemetry::error!(err; "Failed to handle write request");
- let err = to_meta_err(snafu::location!())(err);
return Err(err);
}
}
- Ok(Default::default())
- }
-}
-
-/// Simple helper enum for fetching value from row with default value
-#[derive(Debug, Clone)]
-enum FetchFromRow {
- Idx(usize),
- Default(Value),
-}
-
-impl FetchFromRow {
- /// Panic if idx is out of bound
- fn fetch(&self, row: &repr::Row) -> Value {
- match self {
- FetchFromRow::Idx(idx) => row.get(*idx).unwrap().clone(),
- FetchFromRow::Default(v) => v.clone(),
- }
+ Ok(())
}
}
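
`FlowDualEngine::handle_flow_inserts` above splits incoming requests by whether the target table feeds a streaming flow, a batching flow, both, or neither. The sketch below models only that routing decision with plain collections; `Request`, the table-id extraction, and the warning text are simplified stand-ins for the real `api::v1::region::InsertRequests` handling:

```rust
use std::collections::{HashMap, HashSet};

type TableId = u32;
type FlowId = u64;

// A trimmed-down request: only the table id matters for routing.
#[derive(Clone, Debug, PartialEq)]
struct Request {
    table_id: TableId,
}

struct SrcTableToFlow {
    stream: HashMap<TableId, HashSet<FlowId>>,
    batch: HashMap<TableId, HashSet<FlowId>>,
}

/// Split requests into (for the streaming engine, for the batching engine).
/// A table feeding flows in both modes goes to both engines; a table feeding
/// neither is dropped with a warning, mirroring the dual engine's behavior.
fn route(requests: Vec<Request>, map: &SrcTableToFlow) -> (Vec<Request>, Vec<Request>) {
    let mut to_stream = Vec::new();
    let mut to_batch = Vec::new();
    for req in requests {
        let in_stream = map.stream.contains_key(&req.table_id);
        let in_batch = map.batch.contains_key(&req.table_id);
        if in_stream {
            to_stream.push(req.clone());
        }
        if in_batch {
            to_batch.push(req);
        } else if !in_stream {
            eprintln!("Table {} is not any flow's source table", req.table_id);
        }
    }
    (to_stream, to_batch)
}

fn main() {
    let map = SrcTableToFlow {
        stream: HashMap::from([(1, HashSet::from([10]))]),
        batch: HashMap::from([(1, HashSet::from([30])), (2, HashSet::from([20]))]),
    };
    let reqs = vec![Request { table_id: 1 }, Request { table_id: 2 }, Request { table_id: 3 }];
    let (stream, batch) = route(reqs, &map);
    assert_eq!(stream, vec![Request { table_id: 1 }]);
    assert_eq!(batch, vec![Request { table_id: 1 }, Request { table_id: 2 }]);
}
```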
diff --git a/src/flow/src/batching_mode.rs b/src/flow/src/batching_mode.rs
index 138a44b633..152ad5781c 100644
--- a/src/flow/src/batching_mode.rs
+++ b/src/flow/src/batching_mode.rs
@@ -16,8 +16,8 @@
use std::time::Duration;
-mod engine;
-mod frontend_client;
+pub(crate) mod engine;
+pub(crate) mod frontend_client;
mod state;
mod task;
mod time_window;
diff --git a/src/flow/src/batching_mode/engine.rs b/src/flow/src/batching_mode/engine.rs
index 72ab7042d2..c53107f695 100644
--- a/src/flow/src/batching_mode/engine.rs
+++ b/src/flow/src/batching_mode/engine.rs
@@ -12,10 +12,11 @@
// See the License for the specific language governing permissions and
// limitations under the License.
+//! Batching mode engine
+
use std::collections::{BTreeMap, HashMap};
use std::sync::Arc;
-use api::v1::flow::FlowResponse;
use common_error::ext::BoxedError;
use common_meta::ddl::create_flow::FlowType;
use common_meta::key::flow::FlowMetadataManagerRef;
@@ -30,13 +31,13 @@ use store_api::storage::RegionId;
use table::metadata::TableId;
use tokio::sync::{oneshot, RwLock};
-use crate::adapter::{CreateFlowArgs, FlowId, TableName};
use crate::batching_mode::frontend_client::FrontendClient;
use crate::batching_mode::task::BatchingTask;
use crate::batching_mode::time_window::{find_time_window_expr, TimeWindowExpr};
use crate::batching_mode::utils::sql_to_df_plan;
+use crate::engine::FlowEngine;
use crate::error::{ExternalSnafu, FlowAlreadyExistSnafu, TableNotFoundMetaSnafu, UnexpectedSnafu};
-use crate::Error;
+use crate::{CreateFlowArgs, Error, FlowId, TableName};
/// Batching mode Engine, responsible for driving all the batching mode tasks
///
@@ -67,10 +68,10 @@ impl BatchingEngine {
}
}
- pub async fn handle_inserts(
+ pub async fn handle_inserts_inner(
&self,
request: api::v1::region::InsertRequests,
-    ) -> Result<FlowResponse, Error> {
+ ) -> Result<(), Error> {
let table_info_mgr = self.table_meta.table_info_manager();
let mut group_by_table_id: HashMap> = HashMap::new();
@@ -170,7 +171,7 @@ impl BatchingEngine {
}
drop(tasks);
- Ok(Default::default())
+ Ok(())
}
}
@@ -191,7 +192,7 @@ async fn get_table_name(
}
impl BatchingEngine {
-    pub async fn create_flow(&self, args: CreateFlowArgs) -> Result<Option<FlowId>, Error> {
+    pub async fn create_flow_inner(&self, args: CreateFlowArgs) -> Result<Option<FlowId>, Error> {
let CreateFlowArgs {
flow_id,
sink_table_name,
@@ -308,7 +309,7 @@ impl BatchingEngine {
Ok(Some(flow_id))
}
- pub async fn remove_flow(&self, flow_id: FlowId) -> Result<(), Error> {
+ pub async fn remove_flow_inner(&self, flow_id: FlowId) -> Result<(), Error> {
if self.tasks.write().await.remove(&flow_id).is_none() {
warn!("Flow {flow_id} not found in tasks")
}
@@ -324,19 +325,42 @@ impl BatchingEngine {
Ok(())
}
- pub async fn flush_flow(&self, flow_id: FlowId) -> Result<(), Error> {
+    pub async fn flush_flow_inner(&self, flow_id: FlowId) -> Result<usize, Error> {
let task = self.tasks.read().await.get(&flow_id).cloned();
let task = task.with_context(|| UnexpectedSnafu {
reason: format!("Can't found task for flow {flow_id}"),
})?;
- task.gen_exec_once(&self.query_engine, &self.frontend_client)
+ let res = task
+ .gen_exec_once(&self.query_engine, &self.frontend_client)
.await?;
- Ok(())
+ let affected_rows = res.map(|(r, _)| r).unwrap_or_default() as usize;
+ Ok(affected_rows)
}
/// Determine if the batching mode flow task exists with given flow id
- pub async fn flow_exist(&self, flow_id: FlowId) -> bool {
+ pub async fn flow_exist_inner(&self, flow_id: FlowId) -> bool {
self.tasks.read().await.contains_key(&flow_id)
}
}
+
+impl FlowEngine for BatchingEngine {
+    async fn create_flow(&self, args: CreateFlowArgs) -> Result<Option<FlowId>, Error> {
+ self.create_flow_inner(args).await
+ }
+ async fn remove_flow(&self, flow_id: FlowId) -> Result<(), Error> {
+ self.remove_flow_inner(flow_id).await
+ }
+    async fn flush_flow(&self, flow_id: FlowId) -> Result<usize, Error> {
+ self.flush_flow_inner(flow_id).await
+ }
+    async fn flow_exist(&self, flow_id: FlowId) -> Result<bool, Error> {
+ Ok(self.flow_exist_inner(flow_id).await)
+ }
+ async fn handle_flow_inserts(
+ &self,
+ request: api::v1::region::InsertRequests,
+ ) -> Result<(), Error> {
+ self.handle_inserts_inner(request).await
+ }
+}
diff --git a/src/flow/src/batching_mode/state.rs b/src/flow/src/batching_mode/state.rs
index a406dae798..3a9802713c 100644
--- a/src/flow/src/batching_mode/state.rs
+++ b/src/flow/src/batching_mode/state.rs
@@ -26,11 +26,10 @@ use snafu::ResultExt;
use tokio::sync::oneshot;
use tokio::time::Instant;
-use crate::adapter::FlowId;
use crate::batching_mode::task::BatchingTask;
use crate::batching_mode::MIN_REFRESH_DURATION;
use crate::error::{DatatypesSnafu, InternalSnafu, TimeSnafu};
-use crate::Error;
+use crate::{Error, FlowId};
/// The state of the [`BatchingTask`].
#[derive(Debug)]
diff --git a/src/flow/src/batching_mode/task.rs b/src/flow/src/batching_mode/task.rs
index 44312509d4..f4280f54bd 100644
--- a/src/flow/src/batching_mode/task.rs
+++ b/src/flow/src/batching_mode/task.rs
@@ -43,7 +43,7 @@ use tokio::sync::oneshot;
use tokio::sync::oneshot::error::TryRecvError;
use tokio::time::Instant;
-use crate::adapter::{FlowId, AUTO_CREATED_PLACEHOLDER_TS_COL, AUTO_CREATED_UPDATE_AT_TS_COL};
+use crate::adapter::{AUTO_CREATED_PLACEHOLDER_TS_COL, AUTO_CREATED_UPDATE_AT_TS_COL};
use crate::batching_mode::frontend_client::FrontendClient;
use crate::batching_mode::state::TaskState;
use crate::batching_mode::time_window::TimeWindowExpr;
@@ -60,7 +60,7 @@ use crate::error::{
use crate::metrics::{
METRIC_FLOW_BATCHING_ENGINE_QUERY_TIME, METRIC_FLOW_BATCHING_ENGINE_SLOW_QUERY,
};
-use crate::Error;
+use crate::{Error, FlowId};
/// The task's config, immutable once created
#[derive(Clone)]
diff --git a/src/flow/src/engine.rs b/src/flow/src/engine.rs
new file mode 100644
index 0000000000..33da5252d7
--- /dev/null
+++ b/src/flow/src/engine.rs
@@ -0,0 +1,57 @@
+// Copyright 2023 Greptime Team
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+//! Define a trait for flow engine, which is used by both streaming engine and batch engine
+
+use std::collections::HashMap;
+
+use session::context::QueryContext;
+use table::metadata::TableId;
+
+use crate::Error;
+// TODO(discord9): refactor common types for flow to a separate module
+/// FlowId is a unique identifier for a flow task
+pub type FlowId = u64;
+pub type TableName = [String; 3];
+
+/// The arguments to create a flow
+#[derive(Debug, Clone)]
+pub struct CreateFlowArgs {
+ pub flow_id: FlowId,
+ pub sink_table_name: TableName,
+    pub source_table_ids: Vec<TableId>,
+    pub create_if_not_exists: bool,
+    pub or_replace: bool,
+    pub expire_after: Option<i64>,
+    pub comment: Option<String>,
+    pub sql: String,
+    pub flow_options: HashMap<String, String>,
+    pub query_ctx: Option<QueryContext>,
+}
+
+pub trait FlowEngine {
+    /// Create a flow using the provided arguments; return the previous flow id if it exists and is replaced
+    async fn create_flow(&self, args: CreateFlowArgs) -> Result<Option<FlowId>, Error>;
+    /// Remove a flow by its ID
+    async fn remove_flow(&self, flow_id: FlowId) -> Result<(), Error>;
+    /// Flush the flow, return the number of rows flushed
+    async fn flush_flow(&self, flow_id: FlowId) -> Result<usize, Error>;
+    /// Check if the flow exists
+    async fn flow_exist(&self, flow_id: FlowId) -> Result<bool, Error>;
+ /// Handle the insert requests for the flow
+ async fn handle_flow_inserts(
+ &self,
+ request: api::v1::region::InsertRequests,
+ ) -> Result<(), Error>;
+}
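
Because `FlowEngine` uses native `async fn` in a trait (stable since Rust 1.75), generic code can drive `FlowWorkerManager`, `BatchingEngine`, or `FlowDualEngine` through the same bound without `#[async_trait]` boxing. Below is a minimal, self-contained sketch of that shape; the trimmed-down trait, `NoopEngine`, and string error type are illustrative, not the crate's definitions:

```rust
// A trimmed-down version of the trait: just enough to show generic dispatch.
trait FlowEngine {
    async fn flush_flow(&self, flow_id: u64) -> Result<usize, String>;
}

struct NoopEngine;

impl FlowEngine for NoopEngine {
    async fn flush_flow(&self, _flow_id: u64) -> Result<usize, String> {
        // A real engine would drain pending data here and report affected rows.
        Ok(0)
    }
}

// Generic code only needs the trait bound; it works for the streaming engine,
// the batching engine, or the dual engine alike.
async fn flush_and_report<E: FlowEngine>(engine: &E, flow_id: u64) {
    match engine.flush_flow(flow_id).await {
        Ok(rows) => println!("flow {flow_id}: flushed {rows} rows"),
        Err(e) => eprintln!("flow {flow_id}: flush failed: {e}"),
    }
}

fn main() {
    // Drive the async sketch with a tiny executor from the futures crate.
    futures::executor::block_on(flush_and_report(&NoopEngine, 42));
}
```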
diff --git a/src/flow/src/error.rs b/src/flow/src/error.rs
index 2488b0a677..1741f8cb1b 100644
--- a/src/flow/src/error.rs
+++ b/src/flow/src/error.rs
@@ -25,8 +25,8 @@ use common_telemetry::common_error::status_code::StatusCode;
use snafu::{Location, ResultExt, Snafu};
use tonic::metadata::MetadataMap;
-use crate::adapter::FlowId;
use crate::expr::EvalError;
+use crate::FlowId;
/// This error is used to represent all possible errors that can occur in the flow module.
#[derive(Snafu)]
diff --git a/src/flow/src/lib.rs b/src/flow/src/lib.rs
index 8ec464730b..5dc3c67491 100644
--- a/src/flow/src/lib.rs
+++ b/src/flow/src/lib.rs
@@ -26,9 +26,10 @@
// allow unused for now because it should be use later
mod adapter;
-mod batching_mode;
+pub(crate) mod batching_mode;
mod compute;
mod df_optimizer;
+pub(crate) mod engine;
pub mod error;
mod expr;
pub mod heartbeat;
@@ -43,6 +44,8 @@ mod utils;
mod test_utils;
pub use adapter::{FlowConfig, FlowWorkerManager, FlowWorkerManagerRef, FlownodeOptions};
+pub use batching_mode::frontend_client::FrontendClient;
+pub(crate) use engine::{CreateFlowArgs, FlowId, TableName};
pub use error::{Error, Result};
pub use server::{
FlownodeBuilder, FlownodeInstance, FlownodeServer, FlownodeServiceBuilder, FrontendInvoker,
diff --git a/src/flow/src/server.rs b/src/flow/src/server.rs
index d0038e6ba1..53712ffb67 100644
--- a/src/flow/src/server.rs
+++ b/src/flow/src/server.rs
@@ -50,7 +50,7 @@ use tonic::codec::CompressionEncoding;
use tonic::transport::server::TcpIncoming;
use tonic::{Request, Response, Status};
-use crate::adapter::{create_worker, CreateFlowArgs, FlowWorkerManagerRef};
+use crate::adapter::{create_worker, FlowWorkerManagerRef};
use crate::error::{
to_status_with_last_err, CacheRequiredSnafu, CreateFlowSnafu, ExternalSnafu, FlowNotFoundSnafu,
ListFlowsSnafu, ParseAddrSnafu, ShutdownServerSnafu, StartServerSnafu, UnexpectedSnafu,
@@ -59,12 +59,13 @@ use crate::heartbeat::HeartbeatTask;
use crate::metrics::{METRIC_FLOW_PROCESSING_TIME, METRIC_FLOW_ROWS};
use crate::transform::register_function_to_query_engine;
use crate::utils::{SizeReportSender, StateReportHandler};
-use crate::{Error, FlowWorkerManager, FlownodeOptions};
+use crate::{CreateFlowArgs, Error, FlowWorkerManager, FlownodeOptions, FrontendClient};
pub const FLOW_NODE_SERVER_NAME: &str = "FLOW_NODE_SERVER";
/// wrapping flow node manager to avoid orphan rule with Arc<...>
#[derive(Clone)]
pub struct FlowService {
+ /// TODO(discord9): replace with dual engine
pub manager: FlowWorkerManagerRef,
}
@@ -290,6 +291,7 @@ pub struct FlownodeBuilder {
     heartbeat_task: Option<HeartbeatTask>,
     /// receive a oneshot sender to send state size report
     state_report_handler: Option<StateReportHandler>,
+    frontend_client: Arc<FrontendClient>,
}
impl FlownodeBuilder {
@@ -300,6 +302,7 @@ impl FlownodeBuilder {
table_meta: TableMetadataManagerRef,
catalog_manager: CatalogManagerRef,
flow_metadata_manager: FlowMetadataManagerRef,
+        frontend_client: Arc<FrontendClient>,
) -> Self {
Self {
opts,
@@ -309,6 +312,7 @@ impl FlownodeBuilder {
flow_metadata_manager,
heartbeat_task: None,
state_report_handler: None,
+ frontend_client,
}
}
@@ -432,7 +436,7 @@ impl FlownodeBuilder {
),
};
manager
- .create_flow(args)
+ .create_flow_inner(args)
.await
.map_err(BoxedError::new)
.with_context(|_| CreateFlowSnafu {
diff --git a/src/meta-client/src/client.rs b/src/meta-client/src/client.rs
index 3829ae2273..e022717ea1 100644
--- a/src/meta-client/src/client.rs
+++ b/src/meta-client/src/client.rs
@@ -117,6 +117,7 @@ impl MetaClientBuilder {
.enable_store()
.enable_heartbeat()
.enable_procedure()
+ .enable_access_cluster_info()
}
pub fn enable_heartbeat(self) -> Self {
diff --git a/tests-integration/src/standalone.rs b/tests-integration/src/standalone.rs
index 2d6c9bcf97..b85c848c88 100644
--- a/tests-integration/src/standalone.rs
+++ b/tests-integration/src/standalone.rs
@@ -41,7 +41,7 @@ use common_procedure::options::ProcedureConfig;
use common_procedure::ProcedureManagerRef;
use common_wal::config::{DatanodeWalConfig, MetasrvWalConfig};
use datanode::datanode::DatanodeBuilder;
-use flow::FlownodeBuilder;
+use flow::{FlownodeBuilder, FrontendClient};
use frontend::frontend::Frontend;
use frontend::instance::builder::FrontendBuilder;
use frontend::instance::{Instance, StandaloneDatanodeManager};
@@ -174,12 +174,15 @@ impl GreptimeDbStandaloneBuilder {
Some(procedure_manager.clone()),
);
+ let fe_server_addr = opts.frontend_options().grpc.bind_addr.clone();
+ let frontend_client = FrontendClient::from_static_grpc_addr(fe_server_addr);
let flow_builder = FlownodeBuilder::new(
Default::default(),
plugins.clone(),
table_metadata_manager.clone(),
catalog_manager.clone(),
flow_metadata_manager.clone(),
+ Arc::new(frontend_client),
);
let flownode = Arc::new(flow_builder.build().await.unwrap());
From 6700c0762d4937160bfa392d221dd320268eb8d7 Mon Sep 17 00:00:00 2001
From: "Lei, HUANG" <6406592+v0y4g3r@users.noreply.github.com>
Date: Tue, 15 Apr 2025 18:42:07 +0800
Subject: [PATCH 23/82] feat: Column-wise partition rule implementation (#5804)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
* wip: naive impl
* feat/column-partition:
### Add support for DataFusion physical expressions
- **`Cargo.lock` & `Cargo.toml`**: Added `datafusion-physical-expr` as a dependency to support physical expression creation.
- **`expr.rs`**: Implemented conversion methods `try_as_logical_expr` and `try_as_physical_expr` for `Operand` and `PartitionExpr` to facilitate logical and physical expression handling.
- **`multi_dim.rs`**: Enhanced `MultiDimPartitionRule` to utilize physical expressions for partitioning logic, including new methods for evaluating record batches.
- **Tests**: Added unit tests for logical and physical expression conversions and partitioning logic in `expr.rs` and `multi_dim.rs`.
* feat/column-partition:
### Refactor and Enhance Partition Handling
- **Refactor Partition Parsing Logic**: Moved partition parsing logic from `src/operator/src/statement/ddl.rs` to a new utility module `src/partition/src/utils.rs`. This includes functions like `parse_partitions`, `find_partition_bounds`, and `convert_one_expr`.
- **Error Handling Improvements**: Added new error variants `ColumnNotFound`, `InvalidPartitionRule`, and `ParseSqlValue` in `src/partition/src/error.rs` to improve error reporting for partition-related operations.
- **Dependency Updates**: Updated `Cargo.lock` and `Cargo.toml` to include new dependencies `common-time` and `session`.
- **Code Cleanup**: Removed redundant partition parsing functions from `src/operator/src/error.rs` and `src/operator/src/statement/ddl.rs`.
* feat/column-partition:
## Refactor and Enhance SQL and Table Handling
- **Refactor Column Definitions and Error Handling**
- Made `FULLTEXT_GRPC_KEY`, `INVERTED_INDEX_GRPC_KEY`, and `SKIPPING_INDEX_GRPC_KEY` public in `column_def.rs`.
- Removed `IllegalPrimaryKeysDef` error from `error.rs` and moved it to `sql/src/error.rs`.
- Updated error handling in `fill_impure_default.rs` and `expr_helper.rs`.
- **Enhance SQL Utility Functions**
- Moved and refactored functions like `create_to_expr`, `find_primary_keys`, and `validate_create_expr` to `sql/src/util.rs`.
- Added new utility functions for SQL parsing and validation in `sql/src/util.rs`.
- **Improve Partition Handling**
- Added `parse_partition_columns_and_exprs` function in `partition/src/utils.rs`.
- Updated partition rule tests in `partition/src/multi_dim.rs` to use SQL-based partitioning.
- **Simplify Table Name Handling**
- Re-exported `table_idents_to_full_name` from `sql::util` in `session/src/table_name.rs`.
- **Test Enhancements**
- Updated tests in `partition/src/multi_dim.rs` to use SQL for partition rule creation.
* feat/column-partition:
**Add Benchmarking and Enhance Partitioning Logic**
- **Benchmarking**: Introduced a new benchmark for `split_record_batch` in `bench_split_record_batch.rs` using `criterion` and `rand` as development dependencies in `Cargo.toml`.
- **Partitioning Logic**: Enhanced `MultiDimPartitionRule` in `multi_dim.rs` to include a default region for unmatched partition expressions and optimized the `split_record_batch` method.
- **Refactoring**: Moved `sql_to_partition_rule` function to a public scope for reuse in `multi_dim.rs`.
- **Testing**: Added new test module `test_split_record_batch` to validate the partitioning logic.
* Revert "feat/column-partition: ### Refactor and Enhance Partition Handling"
This reverts commit 183fa19f
* fix: revert refactoring parse_partition
* revert some refactor
* feat/column-partition:
### Enhance Partitioning and Error Handling
- **Benchmark Enhancements**: Added new benchmark `bench_split_record_batch_vs_row` in `bench_split_record_batch.rs` to compare row and column-based splitting.
- **Error Handling Improvements**: Introduced new error variants in `error.rs` for better error reporting related to record batch evaluation and arrow kernel computation.
- **Expression Handling**: Updated `expr.rs` to improve error context when converting schemas and creating physical expressions.
- **Partition Rule Enhancements**: Made `row_at` and `record_batch_to_cols` methods public in `multi_dim.rs` and improved error handling for physical expression evaluation and boolean operations.
* feat/column-partition:
### Add `eq` Method and Optimize Expression Caching
- **`expr.rs`**: Added a new `eq` method to the `Operand` struct for equality comparisons.
- **`multi_dim.rs`**: Introduced a caching mechanism for physical expressions using `RwLock` to improve performance in `MultiDimPartitionRule`.
- **`lib.rs`**: Enabled the `let_chains` feature for more concise code.
- **`multi_dim.rs` Tests**: Enhanced test coverage with new test cases for multi-dimensional partitioning, including random record batch generation and default region handling.
* feat/column-partition:
### Add `split_record_batch` Method to `PartitionRule` Trait
- **Files Modified**:
- `src/partition/src/multi_dim.rs`
- `src/partition/src/partition.rs`
- `src/partition/src/splitter.rs`
Added a new method `split_record_batch` to the `PartitionRule` trait, allowing record batches to be split into multiple regions based on partition values. Implemented this method in `MultiDimPartitionRule` and provided unimplemented stubs in test modules.
### Dependency Update
- **File Modified**:
- `src/operator/src/expr_helper.rs`
Removed unused import `ColumnDataType` and `Timezone` from the test module.
### Miscellaneous
- **File Modified**:
- `src/partition/Cargo.toml`
No functional changes; only minor formatting adjustments.
* chore: add license header
* chore: remove useless files
* feat/column-partition:
Add support for handling unsupported partition expression values
- **`error.rs`**: Introduced a new error variant `UnsupportedPartitionExprValue` to handle unsupported partition expression values, and updated `ErrorExt` to map this error to `StatusCode::InvalidArguments`.
- **`expr.rs`**: Modified the `Operand` implementation to return the new error when encountering unsupported partition expression values.
- **`multi_dim.rs`**: Added a fast path to optimize the selection process when all rows are selected.
* feat/column-partition: Add validation for expression and region length in MultiDimPartitionRule constructor
• Ensure the lengths of exprs and regions match to prevent mismatches.
• Introduce error handling for length discrepancies with a descriptive error message.
* chore: add debug log
* feat/column-partition: Removed the validation check for matching lengths between exprs and regions in MultiDimPartitionRule constructor, simplifying the initialization process.
* fix: unit tests
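
For reference, a minimal usage sketch of the new column-wise splitting API, pieced together from the unit tests and benchmark added in this patch (the `host`/`value` schema and the literal values are illustrative only):

```rust
use std::sync::Arc;

use datatypes::arrow::array::{Int64Array, StringArray};
use datatypes::arrow::datatypes::{DataType, Field, Schema};
use datatypes::arrow::record_batch::RecordBatch;
use datatypes::value::Value;
use partition::expr::col;
use partition::multi_dim::MultiDimPartitionRule;

fn main() {
    // Two regions keyed on "host"; rows that match no expression fall back to
    // the default region (region number 0).
    let rule = MultiDimPartitionRule::try_new(
        vec!["host".to_string(), "value".to_string()],
        vec![0, 1],
        vec![
            col("host").lt(Value::String("server1".into())),
            col("host").gt_eq(Value::String("server1".into())),
        ],
    )
    .unwrap();

    let schema = Arc::new(Schema::new(vec![
        Field::new("host", DataType::Utf8, false),
        Field::new("value", DataType::Int64, false),
    ]));
    let batch = RecordBatch::try_new(
        schema,
        vec![
            Arc::new(StringArray::from(vec!["server0", "server1", "server2"])),
            Arc::new(Int64Array::from(vec![1, 2, 3])),
        ],
    )
    .unwrap();

    // Column-wise split: one boolean selection mask per region, evaluated with
    // cached DataFusion physical expressions instead of per-row find_region calls.
    let masks = rule.split_record_batch(&batch).unwrap();
    for (region, mask) in &masks {
        println!("region {}: {} rows selected", region, mask.true_count());
    }
}
```

The per-region `BooleanArray` masks can then be fed to arrow's filter kernel to materialize each region's rows, mirroring what the existing row-by-row splitter produces.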
---
Cargo.lock | 3 +
src/partition/Cargo.toml | 9 +
.../benches/bench_split_record_batch.rs | 226 +++++++++++++
src/partition/src/error.rs | 62 ++++
src/partition/src/expr.rs | 117 ++++++-
src/partition/src/lib.rs | 2 +-
src/partition/src/multi_dim.rs | 307 +++++++++++++++++-
src/partition/src/partition.rs | 9 +
src/partition/src/splitter.rs | 23 +-
9 files changed, 753 insertions(+), 5 deletions(-)
create mode 100644 src/partition/benches/bench_split_record_batch.rs
diff --git a/Cargo.lock b/Cargo.lock
index f1bf4eb3d0..5182582e82 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -8138,10 +8138,13 @@ dependencies = [
"common-macro",
"common-meta",
"common-query",
+ "criterion 0.5.1",
"datafusion-common",
"datafusion-expr",
+ "datafusion-physical-expr",
"datatypes",
"itertools 0.14.0",
+ "rand 0.8.5",
"serde",
"serde_json",
"session",
diff --git a/src/partition/Cargo.toml b/src/partition/Cargo.toml
index ebb7d68f8d..6a0904f8f2 100644
--- a/src/partition/Cargo.toml
+++ b/src/partition/Cargo.toml
@@ -16,6 +16,7 @@ common-meta.workspace = true
common-query.workspace = true
datafusion-common.workspace = true
datafusion-expr.workspace = true
+datafusion-physical-expr.workspace = true
datatypes.workspace = true
itertools.workspace = true
serde.workspace = true
@@ -26,3 +27,11 @@ sql.workspace = true
sqlparser.workspace = true
store-api.workspace = true
table.workspace = true
+
+[dev-dependencies]
+criterion = "0.5"
+rand = "0.8"
+
+[[bench]]
+name = "bench_split_record_batch"
+harness = false
diff --git a/src/partition/benches/bench_split_record_batch.rs b/src/partition/benches/bench_split_record_batch.rs
new file mode 100644
index 0000000000..f6c1bd69d4
--- /dev/null
+++ b/src/partition/benches/bench_split_record_batch.rs
@@ -0,0 +1,226 @@
+// Copyright 2023 Greptime Team
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+use std::sync::Arc;
+use std::vec;
+
+use criterion::{black_box, criterion_group, criterion_main, Criterion};
+use datatypes::arrow::array::{ArrayRef, Int32Array, StringArray, TimestampMillisecondArray};
+use datatypes::arrow::datatypes::{DataType, Field, Schema, TimeUnit};
+use datatypes::arrow::record_batch::RecordBatch;
+use datatypes::value::Value;
+use partition::expr::{col, Operand};
+use partition::multi_dim::MultiDimPartitionRule;
+use partition::PartitionRule;
+use rand::Rng;
+use store_api::storage::RegionNumber;
+
+fn table_schema() -> Arc<Schema> {
+ Arc::new(Schema::new(vec![
+ Field::new("a0", DataType::Int32, false),
+ Field::new("a1", DataType::Utf8, false),
+ Field::new("a2", DataType::Int32, false),
+ Field::new(
+ "ts",
+ DataType::Timestamp(TimeUnit::Millisecond, None),
+ false,
+ ),
+ ]))
+}
+
+fn create_test_rule(num_columns: usize) -> MultiDimPartitionRule {
+ let (columns, exprs) = match num_columns {
+ 1 => {
+ let exprs = vec![
+ col("a0").lt(Value::Int32(50)),
+ col("a0").gt_eq(Value::Int32(50)),
+ ];
+ (vec!["a0".to_string()], exprs)
+ }
+ 2 => {
+ let exprs = vec![
+ col("a0")
+ .lt(Value::Int32(50))
+ .and(col("a1").lt(Value::String("server50".into()))),
+ col("a0")
+ .lt(Value::Int32(50))
+ .and(col("a1").gt_eq(Value::String("server50".into()))),
+ col("a0")
+ .gt_eq(Value::Int32(50))
+ .and(col("a1").lt(Value::String("server50".into()))),
+ col("a0")
+ .gt_eq(Value::Int32(50))
+ .and(col("a1").gt_eq(Value::String("server50".into()))),
+ ];
+ (vec!["a0".to_string(), "a1".to_string()], exprs)
+ }
+ 3 => {
+ let expr = vec![
+ col("a0")
+ .lt(Value::Int32(50))
+ .and(col("a1").lt(Value::String("server50".into())))
+ .and(col("a2").lt(Value::Int32(50))),
+ col("a0")
+ .lt(Operand::Value(Value::Int32(50)))
+ .and(col("a1").lt(Value::String("server50".into())))
+ .and(col("a2").gt_eq(Value::Int32(50))),
+ col("a0")
+ .lt(Value::Int32(50))
+ .and(col("a1").gt_eq(Value::String("server50".into())))
+ .and(col("a2").lt(Value::Int32(50))),
+ col("a0")
+ .lt(Value::Int32(50))
+ .and(col("a1").gt_eq(Value::String("server50".into())))
+ .and(col("a2").gt_eq(Value::Int32(50))),
+ col("a0")
+ .gt_eq(Value::Int32(50))
+ .and(col("a1").lt(Value::String("server50".into())))
+ .and(col("a2").lt(Value::Int32(50))),
+ col("a0")
+ .gt_eq(Operand::Value(Value::Int32(50)))
+ .and(col("a1").lt(Value::String("server50".into())))
+ .and(col("a2").gt_eq(Value::Int32(50))),
+ col("a0")
+ .gt_eq(Value::Int32(50))
+ .and(col("a1").gt_eq(Value::String("server50".into())))
+ .and(col("a2").lt(Value::Int32(50))),
+ col("a0")
+ .gt_eq(Value::Int32(50))
+ .and(col("a1").gt_eq(Value::String("server50".into())))
+ .and(col("a2").gt_eq(Value::Int32(50))),
+ ];
+
+ (
+ vec!["a0".to_string(), "a1".to_string(), "a2".to_string()],
+ expr,
+ )
+ }
+ _ => {
+ panic!("invalid number of columns, only 1-3 are supported");
+ }
+ };
+
+ let regions = (0..exprs.len()).map(|v| v as u32).collect();
+ MultiDimPartitionRule::try_new(columns, regions, exprs).unwrap()
+}
+
+fn create_test_batch(size: usize) -> RecordBatch {
+ let mut rng = rand::thread_rng();
+
+ let schema = table_schema();
+    let arrays: Vec<ArrayRef> = (0..3)
+ .map(|col_idx| {
+ if col_idx % 2 == 0 {
+ // Integer columns (a0, a2)
+ Arc::new(Int32Array::from_iter_values(
+ (0..size).map(|_| rng.gen_range(0..100)),
+ )) as ArrayRef
+ } else {
+ // String columns (a1)
+                let values: Vec<String> = (0..size)
+ .map(|_| {
+ let server_id: i32 = rng.gen_range(0..100);
+ format!("server{}", server_id)
+ })
+ .collect();
+ Arc::new(StringArray::from(values)) as ArrayRef
+ }
+ })
+ .chain(std::iter::once({
+ // Timestamp column (ts)
+ Arc::new(TimestampMillisecondArray::from_iter_values(
+ (0..size).map(|idx| idx as i64),
+ )) as ArrayRef
+ }))
+ .collect();
+ RecordBatch::try_new(schema, arrays).unwrap()
+}
+
+fn bench_split_record_batch_naive_vs_optimized(c: &mut Criterion) {
+ let mut group = c.benchmark_group("split_record_batch");
+
+ for num_columns in [1, 2, 3].iter() {
+ for num_rows in [100, 1000, 10000, 100000].iter() {
+ let rule = create_test_rule(*num_columns);
+ let batch = create_test_batch(*num_rows);
+
+ group.bench_function(format!("naive_{}_{}", num_columns, num_rows), |b| {
+ b.iter(|| {
+ black_box(rule.split_record_batch_naive(black_box(&batch))).unwrap();
+ });
+ });
+ group.bench_function(format!("optimized_{}_{}", num_columns, num_rows), |b| {
+ b.iter(|| {
+ black_box(rule.split_record_batch(black_box(&batch))).unwrap();
+ });
+ });
+ }
+ }
+
+ group.finish();
+}
+
+fn record_batch_to_rows(
+ rule: &MultiDimPartitionRule,
+ record_batch: &RecordBatch,
+) -> Vec<Vec<Value>> {
+ let num_rows = record_batch.num_rows();
+ let vectors = rule.record_batch_to_cols(record_batch).unwrap();
+ let mut res = Vec::with_capacity(num_rows);
+ let mut current_row = vec![Value::Null; vectors.len()];
+
+ for row in 0..num_rows {
+ rule.row_at(&vectors, row, &mut current_row).unwrap();
+ res.push(current_row.clone());
+ }
+ res
+}
+
+fn find_all_regions(rule: &MultiDimPartitionRule, rows: &[Vec<Value>]) -> Vec<RegionNumber> {
+ rows.iter()
+ .map(|row| rule.find_region(row).unwrap())
+ .collect()
+}
+
+fn bench_split_record_batch_vs_row(c: &mut Criterion) {
+ let mut group = c.benchmark_group("bench_split_record_batch_vs_row");
+
+ for num_columns in [1, 2, 3].iter() {
+ for num_rows in [100, 1000, 10000, 100000].iter() {
+ let rule = create_test_rule(*num_columns);
+ let batch = create_test_batch(*num_rows);
+ let rows = record_batch_to_rows(&rule, &batch);
+
+ group.bench_function(format!("split_by_row_{}_{}", num_columns, num_rows), |b| {
+ b.iter(|| {
+ black_box(find_all_regions(&rule, &rows));
+ });
+ });
+ group.bench_function(format!("split_by_col_{}_{}", num_columns, num_rows), |b| {
+ b.iter(|| {
+ black_box(rule.split_record_batch(black_box(&batch))).unwrap();
+ });
+ });
+ }
+ }
+
+ group.finish();
+}
+
+criterion_group!(
+ benches,
+ bench_split_record_batch_naive_vs_optimized,
+ bench_split_record_batch_vs_row
+);
+criterion_main!(benches);
diff --git a/src/partition/src/error.rs b/src/partition/src/error.rs
index 2487fa0974..2194583f40 100644
--- a/src/partition/src/error.rs
+++ b/src/partition/src/error.rs
@@ -18,6 +18,8 @@ use common_error::ext::ErrorExt;
use common_error::status_code::StatusCode;
use common_macro::stack_trace_debug;
use datafusion_common::ScalarValue;
+use datatypes::arrow;
+use datatypes::prelude::Value;
use snafu::{Location, Snafu};
use store_api::storage::RegionId;
use table::metadata::TableId;
@@ -173,6 +175,59 @@ pub enum Error {
#[snafu(implicit)]
location: Location,
},
+
+ #[snafu(display("Failed to convert to vector"))]
+ ConvertToVector {
+ source: datatypes::error::Error,
+ #[snafu(implicit)]
+ location: Location,
+ },
+
+ #[snafu(display("Failed to evaluate record batch"))]
+ EvaluateRecordBatch {
+ #[snafu(source)]
+ error: datafusion_common::error::DataFusionError,
+ #[snafu(implicit)]
+ location: Location,
+ },
+
+ #[snafu(display("Failed to compute arrow kernel"))]
+ ComputeArrowKernel {
+ #[snafu(source)]
+ error: arrow::error::ArrowError,
+ #[snafu(implicit)]
+ location: Location,
+ },
+
+ #[snafu(display("Unexpected evaluation result column type: {}", data_type))]
+ UnexpectedColumnType {
+ data_type: arrow::datatypes::DataType,
+ #[snafu(implicit)]
+ location: Location,
+ },
+
+ #[snafu(display("Failed to convert to DataFusion's Schema"))]
+ ToDFSchema {
+ #[snafu(source)]
+ error: datafusion_common::error::DataFusionError,
+ #[snafu(implicit)]
+ location: Location,
+ },
+
+ #[snafu(display("Failed to create physical expression"))]
+ CreatePhysicalExpr {
+ #[snafu(source)]
+ error: datafusion_common::error::DataFusionError,
+ #[snafu(implicit)]
+ location: Location,
+ },
+
+ #[snafu(display("Partition expr value is not supported: {:?}", value))]
+ UnsupportedPartitionExprValue {
+ value: Value,
+ #[snafu(implicit)]
+ location: Location,
+ },
}
impl ErrorExt for Error {
@@ -201,6 +256,13 @@ impl ErrorExt for Error {
Error::TableRouteNotFound { .. } => StatusCode::TableNotFound,
Error::TableRouteManager { source, .. } => source.status_code(),
Error::UnexpectedLogicalRouteTable { source, .. } => source.status_code(),
+ Error::ConvertToVector { source, .. } => source.status_code(),
+ Error::EvaluateRecordBatch { .. } => StatusCode::Internal,
+ Error::ComputeArrowKernel { .. } => StatusCode::Internal,
+ Error::UnexpectedColumnType { .. } => StatusCode::Internal,
+ Error::ToDFSchema { .. } => StatusCode::Internal,
+ Error::CreatePhysicalExpr { .. } => StatusCode::Internal,
+ Error::UnsupportedPartitionExprValue { .. } => StatusCode::InvalidArguments,
}
}
diff --git a/src/partition/src/expr.rs b/src/partition/src/expr.rs
index bec9543e72..b758d6dcba 100644
--- a/src/partition/src/expr.rs
+++ b/src/partition/src/expr.rs
@@ -13,12 +13,23 @@
// limitations under the License.
use std::fmt::{Debug, Display, Formatter};
+use std::sync::Arc;
-use datatypes::value::Value;
+use datafusion_common::{ScalarValue, ToDFSchema};
+use datafusion_expr::execution_props::ExecutionProps;
+use datafusion_expr::Expr;
+use datafusion_physical_expr::{create_physical_expr, PhysicalExpr};
+use datatypes::arrow;
+use datatypes::value::{
+ duration_to_scalar_value, time_to_scalar_value, timestamp_to_scalar_value, Value,
+};
use serde::{Deserialize, Serialize};
+use snafu::ResultExt;
use sql::statements::value_to_sql_value;
use sqlparser::ast::{BinaryOperator as ParserBinaryOperator, Expr as ParserExpr, Ident};
+use crate::error;
+
/// Struct for partition expression. This can be converted back to sqlparser's [Expr].
/// by [`Self::to_parser_expr`].
///
@@ -37,6 +48,75 @@ pub enum Operand {
Expr(PartitionExpr),
}
+pub fn col(column_name: impl Into<String>) -> Operand {
+ Operand::Column(column_name.into())
+}
+
+impl From<Value> for Operand {
+ fn from(value: Value) -> Self {
+ Operand::Value(value)
+ }
+}
+
+impl Operand {
+    pub fn try_as_logical_expr(&self) -> error::Result<Expr> {
+ match self {
+ Self::Column(c) => Ok(datafusion_expr::col(c)),
+ Self::Value(v) => {
+ let scalar_value = match v {
+ Value::Boolean(v) => ScalarValue::Boolean(Some(*v)),
+ Value::UInt8(v) => ScalarValue::UInt8(Some(*v)),
+ Value::UInt16(v) => ScalarValue::UInt16(Some(*v)),
+ Value::UInt32(v) => ScalarValue::UInt32(Some(*v)),
+ Value::UInt64(v) => ScalarValue::UInt64(Some(*v)),
+ Value::Int8(v) => ScalarValue::Int8(Some(*v)),
+ Value::Int16(v) => ScalarValue::Int16(Some(*v)),
+ Value::Int32(v) => ScalarValue::Int32(Some(*v)),
+ Value::Int64(v) => ScalarValue::Int64(Some(*v)),
+ Value::Float32(v) => ScalarValue::Float32(Some(v.0)),
+ Value::Float64(v) => ScalarValue::Float64(Some(v.0)),
+ Value::String(v) => ScalarValue::Utf8(Some(v.as_utf8().to_string())),
+ Value::Binary(v) => ScalarValue::Binary(Some(v.to_vec())),
+ Value::Date(v) => ScalarValue::Date32(Some(v.val())),
+ Value::Null => ScalarValue::Null,
+ Value::Timestamp(t) => timestamp_to_scalar_value(t.unit(), Some(t.value())),
+ Value::Time(t) => time_to_scalar_value(*t.unit(), Some(t.value())).unwrap(),
+ Value::IntervalYearMonth(v) => ScalarValue::IntervalYearMonth(Some(v.to_i32())),
+ Value::IntervalDayTime(v) => ScalarValue::IntervalDayTime(Some((*v).into())),
+ Value::IntervalMonthDayNano(v) => {
+ ScalarValue::IntervalMonthDayNano(Some((*v).into()))
+ }
+ Value::Duration(d) => duration_to_scalar_value(d.unit(), Some(d.value())),
+ Value::Decimal128(d) => {
+ let (v, p, s) = d.to_scalar_value();
+ ScalarValue::Decimal128(v, p, s)
+ }
+ other => {
+ return error::UnsupportedPartitionExprValueSnafu {
+ value: other.clone(),
+ }
+ .fail()
+ }
+ };
+ Ok(datafusion_expr::lit(scalar_value))
+ }
+ Self::Expr(e) => e.try_as_logical_expr(),
+ }
+ }
+
+    pub fn lt(self, rhs: impl Into<Operand>) -> PartitionExpr {
+        PartitionExpr::new(self, RestrictedOp::Lt, rhs.into())
+    }
+
+    pub fn gt_eq(self, rhs: impl Into<Operand>) -> PartitionExpr {
+        PartitionExpr::new(self, RestrictedOp::GtEq, rhs.into())
+    }
+
+    pub fn eq(self, rhs: impl Into<Operand>) -> PartitionExpr {
+ PartitionExpr::new(self, RestrictedOp::Eq, rhs.into())
+ }
+}
+
impl Display for Operand {
fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
match self {
@@ -140,6 +220,41 @@ impl PartitionExpr {
right: Box::new(rhs),
}
}
+
+    pub fn try_as_logical_expr(&self) -> error::Result<Expr> {
+ let lhs = self.lhs.try_as_logical_expr()?;
+ let rhs = self.rhs.try_as_logical_expr()?;
+
+ let expr = match &self.op {
+ RestrictedOp::And => datafusion_expr::and(lhs, rhs),
+ RestrictedOp::Or => datafusion_expr::or(lhs, rhs),
+ RestrictedOp::Gt => lhs.gt(rhs),
+ RestrictedOp::GtEq => lhs.gt_eq(rhs),
+ RestrictedOp::Lt => lhs.lt(rhs),
+ RestrictedOp::LtEq => lhs.lt_eq(rhs),
+ RestrictedOp::Eq => lhs.eq(rhs),
+ RestrictedOp::NotEq => lhs.not_eq(rhs),
+ };
+ Ok(expr)
+ }
+
+ pub fn try_as_physical_expr(
+ &self,
+ schema: &arrow::datatypes::SchemaRef,
+    ) -> error::Result<Arc<dyn PhysicalExpr>> {
+ let df_schema = schema
+ .clone()
+ .to_dfschema_ref()
+ .context(error::ToDFSchemaSnafu)?;
+ let execution_props = &ExecutionProps::default();
+ let expr = self.try_as_logical_expr()?;
+ create_physical_expr(&expr, &df_schema, execution_props)
+ .context(error::CreatePhysicalExprSnafu)
+ }
+
+ pub fn and(self, rhs: PartitionExpr) -> PartitionExpr {
+ PartitionExpr::new(Operand::Expr(self), RestrictedOp::And, Operand::Expr(rhs))
+ }
}
impl Display for PartitionExpr {
diff --git a/src/partition/src/lib.rs b/src/partition/src/lib.rs
index b1843a1093..bc56edc584 100644
--- a/src/partition/src/lib.rs
+++ b/src/partition/src/lib.rs
@@ -13,7 +13,7 @@
// limitations under the License.
#![feature(assert_matches)]
-
+#![feature(let_chains)]
//! Structs and traits for partitioning rule.
pub mod error;
diff --git a/src/partition/src/multi_dim.rs b/src/partition/src/multi_dim.rs
index f47d71f98b..551fb6a8de 100644
--- a/src/partition/src/multi_dim.rs
+++ b/src/partition/src/multi_dim.rs
@@ -15,10 +15,18 @@
use std::any::Any;
use std::cmp::Ordering;
use std::collections::HashMap;
+use std::sync::{Arc, RwLock};
+use datafusion_expr::ColumnarValue;
+use datafusion_physical_expr::PhysicalExpr;
+use datatypes::arrow;
+use datatypes::arrow::array::{BooleanArray, BooleanBufferBuilder, RecordBatch};
+use datatypes::arrow::buffer::BooleanBuffer;
+use datatypes::arrow::datatypes::Schema;
use datatypes::prelude::Value;
+use datatypes::vectors::{Helper, VectorRef};
use serde::{Deserialize, Serialize};
-use snafu::{ensure, OptionExt};
+use snafu::{ensure, OptionExt, ResultExt};
use store_api::storage::RegionNumber;
use crate::error::{
@@ -28,6 +36,11 @@ use crate::error::{
use crate::expr::{Operand, PartitionExpr, RestrictedOp};
use crate::PartitionRule;
+/// The default region number when no partition exprs are matched.
+const DEFAULT_REGION: RegionNumber = 0;
+
+type PhysicalExprCache = Option<(Vec<Arc<dyn PhysicalExpr>>, Arc<Schema>)>;
+
/// Multi-Dimiension partition rule. RFC [here](https://github.com/GreptimeTeam/greptimedb/blob/main/docs/rfcs/2024-02-21-multi-dimension-partition-rule/rfc.md)
///
/// This partition rule is defined by a set of simple expressions on the partition
@@ -44,6 +57,9 @@ pub struct MultiDimPartitionRule {
     regions: Vec<RegionNumber>,
     /// Partition expressions.
     exprs: Vec<PartitionExpr>,
+    /// Cache of physical expressions.
+    #[serde(skip)]
+    physical_expr_cache: RwLock<PhysicalExprCache>,
}
impl MultiDimPartitionRule {
@@ -63,6 +79,7 @@ impl MultiDimPartitionRule {
name_to_index,
regions,
exprs,
+ physical_expr_cache: RwLock::new(None),
};
let mut checker = RuleChecker::new(&rule);
@@ -87,7 +104,7 @@ impl MultiDimPartitionRule {
}
// return the default region number
- Ok(0)
+ Ok(DEFAULT_REGION)
}
     fn evaluate_expr(&self, expr: &PartitionExpr, values: &[Value]) -> Result<bool> {
@@ -134,6 +151,133 @@ impl MultiDimPartitionRule {
Ok(result)
}
+
+ pub fn row_at(&self, cols: &[VectorRef], index: usize, row: &mut [Value]) -> Result<()> {
+ for (col_idx, col) in cols.iter().enumerate() {
+ row[col_idx] = col.get(index);
+ }
+ Ok(())
+ }
+
+    pub fn record_batch_to_cols(&self, record_batch: &RecordBatch) -> Result<Vec<VectorRef>> {
+ self.partition_columns
+ .iter()
+ .map(|col_name| {
+ record_batch
+ .column_by_name(col_name)
+ .context(error::UndefinedColumnSnafu { column: col_name })
+ .and_then(|array| {
+ Helper::try_into_vector(array).context(error::ConvertToVectorSnafu)
+ })
+ })
+            .collect::<Result<Vec<_>>>()
+ }
+
+ pub fn split_record_batch_naive(
+ &self,
+ record_batch: &RecordBatch,
+    ) -> Result<HashMap<RegionNumber, BooleanArray>> {
+ let num_rows = record_batch.num_rows();
+
+ let mut result = self
+ .regions
+ .iter()
+ .map(|region| {
+ let mut builder = BooleanBufferBuilder::new(num_rows);
+ builder.append_n(num_rows, false);
+ (*region, builder)
+ })
+            .collect::<HashMap<_, _>>();
+
+ let cols = self.record_batch_to_cols(record_batch)?;
+ let mut current_row = vec![Value::Null; self.partition_columns.len()];
+ for row_idx in 0..num_rows {
+ self.row_at(&cols, row_idx, &mut current_row)?;
+            let current_region = self.find_region(&current_row)?;
+            let region_mask = result
+                .get_mut(&current_region)
+ .unwrap_or_else(|| panic!("Region {} must be initialized", current_region));
+ region_mask.set_bit(row_idx, true);
+ }
+
+ Ok(result
+ .into_iter()
+ .map(|(region, mut mask)| (region, BooleanArray::new(mask.finish(), None)))
+ .collect())
+ }
+
+ pub fn split_record_batch(
+ &self,
+ record_batch: &RecordBatch,
+    ) -> Result<HashMap<RegionNumber, BooleanArray>> {
+ let num_rows = record_batch.num_rows();
+ let physical_exprs = {
+ let cache_read_guard = self.physical_expr_cache.read().unwrap();
+ if let Some((cached_exprs, schema)) = cache_read_guard.as_ref()
+ && schema == record_batch.schema_ref()
+ {
+ cached_exprs.clone()
+ } else {
+ drop(cache_read_guard); // Release the read lock before acquiring write lock
+
+ let schema = record_batch.schema();
+ let new_cache = self
+ .exprs
+ .iter()
+ .map(|e| e.try_as_physical_expr(&schema))
+                    .collect::<Result<Vec<_>>>()?;
+
+ let mut cache_write_guard = self.physical_expr_cache.write().unwrap();
+ cache_write_guard.replace((new_cache.clone(), schema));
+ new_cache
+ }
+ };
+
+        let mut result: HashMap<RegionNumber, BooleanArray> = physical_exprs
+ .iter()
+ .zip(self.regions.iter())
+ .map(|(expr, region_num)| {
+ let ColumnarValue::Array(column) = expr
+ .evaluate(record_batch)
+ .context(error::EvaluateRecordBatchSnafu)?
+ else {
+ unreachable!("Expected an array")
+ };
+ Ok((
+ *region_num,
+ column
+ .as_any()
+                    .downcast_ref::<BooleanArray>()
+ .with_context(|| error::UnexpectedColumnTypeSnafu {
+ data_type: column.data_type().clone(),
+ })?
+ .clone(),
+ ))
+ })
+            .collect::<Result<HashMap<_, _>>>()?;
+
+ let mut selected = BooleanArray::new(BooleanBuffer::new_unset(num_rows), None);
+ for region_selection in result.values() {
+ selected = arrow::compute::kernels::boolean::or(&selected, region_selection)
+ .context(error::ComputeArrowKernelSnafu)?;
+ }
+
+ // fast path: all rows are selected
+ if selected.true_count() == num_rows {
+ return Ok(result);
+ }
+
+ // find unselected rows and assign to default region
+ let unselected = arrow::compute::kernels::boolean::not(&selected)
+ .context(error::ComputeArrowKernelSnafu)?;
+ let default_region_selection = result
+ .entry(DEFAULT_REGION)
+ .or_insert_with(|| unselected.clone());
+ *default_region_selection =
+ arrow::compute::kernels::boolean::or(default_region_selection, &unselected)
+ .context(error::ComputeArrowKernelSnafu)?;
+ Ok(result)
+ }
}
impl PartitionRule for MultiDimPartitionRule {
@@ -148,6 +292,13 @@ impl PartitionRule for MultiDimPartitionRule {
     fn find_region(&self, values: &[Value]) -> Result<RegionNumber> {
self.find_region(values)
}
+
+ fn split_record_batch(
+ &self,
+ record_batch: &RecordBatch,
+    ) -> Result<HashMap<RegionNumber, BooleanArray>> {
+ self.split_record_batch(record_batch)
+ }
}
/// Helper for [RuleChecker]
@@ -633,3 +784,155 @@ mod tests {
assert!(rule.is_err());
}
}
+
+#[cfg(test)]
+mod test_split_record_batch {
+ use std::sync::Arc;
+
+ use datatypes::arrow::array::{Int64Array, StringArray};
+ use datatypes::arrow::datatypes::{DataType, Field, Schema};
+ use datatypes::arrow::record_batch::RecordBatch;
+ use rand::Rng;
+
+ use super::*;
+ use crate::expr::col;
+
+    fn test_schema() -> Arc<Schema> {
+ Arc::new(Schema::new(vec![
+ Field::new("host", DataType::Utf8, false),
+ Field::new("value", DataType::Int64, false),
+ ]))
+ }
+
+ fn generate_random_record_batch(num_rows: usize) -> RecordBatch {
+ let schema = test_schema();
+ let mut rng = rand::thread_rng();
+ let mut host_array = Vec::with_capacity(num_rows);
+ let mut value_array = Vec::with_capacity(num_rows);
+ for _ in 0..num_rows {
+ host_array.push(format!("server{}", rng.gen_range(0..20)));
+ value_array.push(rng.gen_range(0..20));
+ }
+ let host_array = StringArray::from(host_array);
+ let value_array = Int64Array::from(value_array);
+ RecordBatch::try_new(schema, vec![Arc::new(host_array), Arc::new(value_array)]).unwrap()
+ }
+
+ #[test]
+ fn test_split_record_batch_by_one_column() {
+ // Create a simple MultiDimPartitionRule
+ let rule = MultiDimPartitionRule::try_new(
+ vec!["host".to_string(), "value".to_string()],
+ vec![0, 1],
+ vec![
+ col("host").lt(Value::String("server1".into())),
+ col("host").gt_eq(Value::String("server1".into())),
+ ],
+ )
+ .unwrap();
+
+ let batch = generate_random_record_batch(1000);
+ // Split the batch
+ let result = rule.split_record_batch(&batch).unwrap();
+ let expected = rule.split_record_batch_naive(&batch).unwrap();
+ assert_eq!(result.len(), expected.len());
+ for (region, value) in &result {
+ assert_eq!(
+ value,
+ expected.get(region).unwrap(),
+ "failed on region: {}",
+ region
+ );
+ }
+ }
+
+ #[test]
+ fn test_split_record_batch_empty() {
+ // Create a simple MultiDimPartitionRule
+ let rule = MultiDimPartitionRule::try_new(
+ vec!["host".to_string()],
+ vec![1],
+ vec![PartitionExpr::new(
+ Operand::Column("host".to_string()),
+ RestrictedOp::Eq,
+ Operand::Value(Value::String("server1".into())),
+ )],
+ )
+ .unwrap();
+
+ let schema = test_schema();
+ let host_array = StringArray::from(Vec::<&str>::new());
+        let value_array = Int64Array::from(Vec::<i64>::new());
+ let batch = RecordBatch::try_new(schema, vec![Arc::new(host_array), Arc::new(value_array)])
+ .unwrap();
+
+ let result = rule.split_record_batch(&batch).unwrap();
+ assert_eq!(result.len(), 1);
+ }
+
+ #[test]
+ fn test_split_record_batch_by_two_columns() {
+ let rule = MultiDimPartitionRule::try_new(
+ vec!["host".to_string(), "value".to_string()],
+ vec![0, 1, 2, 3],
+ vec![
+ col("host")
+ .lt(Value::String("server10".into()))
+ .and(col("value").lt(Value::Int64(10))),
+ col("host")
+ .lt(Value::String("server10".into()))
+ .and(col("value").gt_eq(Value::Int64(10))),
+ col("host")
+ .gt_eq(Value::String("server10".into()))
+ .and(col("value").lt(Value::Int64(10))),
+ col("host")
+ .gt_eq(Value::String("server10".into()))
+ .and(col("value").gt_eq(Value::Int64(10))),
+ ],
+ )
+ .unwrap();
+
+ let batch = generate_random_record_batch(1000);
+ let result = rule.split_record_batch(&batch).unwrap();
+ let expected = rule.split_record_batch_naive(&batch).unwrap();
+ assert_eq!(result.len(), expected.len());
+ for (region, value) in &result {
+ assert_eq!(value, expected.get(region).unwrap());
+ }
+ }
+
+ #[test]
+ fn test_default_region() {
+ let rule = MultiDimPartitionRule::try_new(
+ vec!["host".to_string(), "value".to_string()],
+ vec![0, 1, 2, 3],
+ vec![
+ col("host")
+ .lt(Value::String("server10".into()))
+ .and(col("value").eq(Value::Int64(10))),
+ col("host")
+ .lt(Value::String("server10".into()))
+ .and(col("value").eq(Value::Int64(20))),
+ col("host")
+ .gt_eq(Value::String("server10".into()))
+ .and(col("value").eq(Value::Int64(10))),
+ col("host")
+ .gt_eq(Value::String("server10".into()))
+ .and(col("value").eq(Value::Int64(20))),
+ ],
+ )
+ .unwrap();
+
+ let schema = test_schema();
+ let host_array = StringArray::from(vec!["server1", "server1", "server1", "server100"]);
+ let value_array = Int64Array::from(vec![10, 20, 30, 10]);
+ let batch = RecordBatch::try_new(schema, vec![Arc::new(host_array), Arc::new(value_array)])
+ .unwrap();
+ let result = rule.split_record_batch(&batch).unwrap();
+ let expected = rule.split_record_batch_naive(&batch).unwrap();
+ assert_eq!(result.len(), expected.len());
+ for (region, value) in &result {
+ assert_eq!(value, expected.get(region).unwrap());
+ }
+ }
+}
diff --git a/src/partition/src/partition.rs b/src/partition/src/partition.rs
index ac965034c6..a190d33eca 100644
--- a/src/partition/src/partition.rs
+++ b/src/partition/src/partition.rs
@@ -13,11 +13,13 @@
// limitations under the License.
use std::any::Any;
+use std::collections::HashMap;
use std::fmt::{Debug, Display, Formatter};
use std::sync::Arc;
use common_meta::rpc::router::Partition as MetaPartition;
use datafusion_expr::Operator;
+use datatypes::arrow::array::{BooleanArray, RecordBatch};
use datatypes::prelude::Value;
use itertools::Itertools;
use serde::{Deserialize, Serialize};
@@ -37,6 +39,13 @@ pub trait PartitionRule: Sync + Send {
///
/// Note that the `values` should have the same length as the `partition_columns`.
     fn find_region(&self, values: &[Value]) -> Result<RegionNumber>;
+
+ /// Split the record batch into multiple regions by the partition values.
+ /// The result is a map from region number to a boolean array, where the boolean array is true for the rows that match the partition values.
+ fn split_record_batch(
+ &self,
+ record_batch: &RecordBatch,
+    ) -> Result<HashMap<RegionNumber, BooleanArray>>;
}
/// The right bound(exclusive) of partition range.
diff --git a/src/partition/src/splitter.rs b/src/partition/src/splitter.rs
index f62210a6b5..87c04a4942 100644
--- a/src/partition/src/splitter.rs
+++ b/src/partition/src/splitter.rs
@@ -136,6 +136,7 @@ mod tests {
use api::v1::value::ValueData;
use api::v1::{ColumnDataType, SemanticType};
+ use datatypes::arrow::array::BooleanArray;
use serde::{Deserialize, Serialize};
use super::*;
@@ -209,6 +210,13 @@ mod tests {
             Ok(val.parse::<u32>().unwrap() % 2)
}
+
+ fn split_record_batch(
+ &self,
+ _record_batch: &datatypes::arrow::array::RecordBatch,
+        ) -> Result<HashMap<RegionNumber, BooleanArray>> {
+ unimplemented!()
+ }
}
#[derive(Debug, Serialize, Deserialize)]
@@ -232,6 +240,13 @@ mod tests {
Ok(val)
}
+
+ fn split_record_batch(
+ &self,
+ _record_batch: &datatypes::arrow::array::RecordBatch,
+        ) -> Result<HashMap<RegionNumber, BooleanArray>> {
+ unimplemented!()
+ }
}
#[derive(Debug, Serialize, Deserialize)]
@@ -249,8 +264,14 @@ mod tests {
         fn find_region(&self, _values: &[Value]) -> Result<RegionNumber> {
Ok(0)
}
- }
+ fn split_record_batch(
+ &self,
+ _record_batch: &datatypes::arrow::array::RecordBatch,
+        ) -> Result<HashMap<RegionNumber, BooleanArray>> {
+ unimplemented!()
+ }
+ }
#[test]
fn test_writer_splitter() {
let rows = mock_rows();
From dcf1a486f68d91ef1f9308109206291f92959698 Mon Sep 17 00:00:00 2001
From: Ruihang Xia
Date: Tue, 15 Apr 2025 19:05:17 +0800
Subject: [PATCH 24/82] feat: support `@@` (AtAt) operator for term matching
(#5902)
* update dep and sqlness case
Signed-off-by: Ruihang Xia
* implement transcribe rule
Signed-off-by: Ruihang Xia
* more tests
Signed-off-by: Ruihang Xia
* update sqlness result
Signed-off-by: Ruihang Xia
---------
Signed-off-by: Ruihang Xia
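
For context, a condensed Rust sketch of what the new analyzer rule does, adapted from the unit tests in this patch (the `query::optimizer::transcribe_atat` import path and the `t`/`log_message` names are assumptions made for illustration):

```rust
use std::sync::Arc;

use arrow_schema::SchemaRef;
use datafusion::config::ConfigOptions;
use datafusion::datasource::{provider_as_source, MemTable};
use datafusion::logical_expr::{col, lit, LogicalPlanBuilder};
use datafusion_expr::{BinaryExpr, Expr, Operator};
use datafusion_optimizer::analyzer::AnalyzerRule;
use datatypes::arrow::datatypes::{DataType, Field, Schema};
use query::optimizer::transcribe_atat::TranscribeAtatRule;

fn main() {
    let schema = Schema::new(vec![Field::new("log_message", DataType::Utf8, false)]);
    let table = MemTable::try_new(SchemaRef::from(schema), vec![]).unwrap();

    // `log_message @@ '/start'` reaches the planner as a BinaryExpr with Operator::AtAt.
    let plan = LogicalPlanBuilder::scan("t", provider_as_source(Arc::new(table)), None)
        .unwrap()
        .filter(Expr::BinaryExpr(BinaryExpr {
            left: Box::new(col("log_message")),
            op: Operator::AtAt,
            right: Box::new(lit("/start")),
        }))
        .unwrap()
        .build()
        .unwrap();

    // The analyzer rewrites the operator into a matches_term UDF call, roughly:
    //   Filter: matches_term(t.log_message, Utf8("/start"))
    //     TableScan: t
    let rewritten = TranscribeAtatRule
        .analyze(plan, &ConfigOptions::default())
        .unwrap();
    println!("{rewritten}");
}
```

End users just write `message @@ 'term'` in SQL; after this rewrite the query executes through the existing `matches_term` function.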
---
Cargo.lock | 48 +--
Cargo.toml | 18 +-
src/query/src/optimizer.rs | 1 +
src/query/src/optimizer/transcribe_atat.rs | 230 +++++++++++++
src/query/src/query_engine/state.rs | 2 +
.../cases/standalone/common/expr/atat.result | 315 ++++++++++++++++++
tests/cases/standalone/common/expr/atat.sql | 144 ++++++++
.../common/tql-explain-analyze/explain.result | 1 +
8 files changed, 726 insertions(+), 33 deletions(-)
create mode 100644 src/query/src/optimizer/transcribe_atat.rs
create mode 100644 tests/cases/standalone/common/expr/atat.result
create mode 100644 tests/cases/standalone/common/expr/atat.sql
diff --git a/Cargo.lock b/Cargo.lock
index 5182582e82..2035b5090c 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -2897,7 +2897,7 @@ checksum = "e8566979429cf69b49a5c740c60791108e86440e8be149bbea4fe54d2c32d6e2"
[[package]]
name = "datafusion"
version = "45.0.0"
-source = "git+https://github.com/apache/datafusion.git?rev=8ebed674dd71f8a466f658626877944cd16a4375#8ebed674dd71f8a466f658626877944cd16a4375"
+source = "git+https://github.com/waynexia/arrow-datafusion.git?rev=5bbedc6704162afb03478f56ffb629405a4e1220#5bbedc6704162afb03478f56ffb629405a4e1220"
dependencies = [
"arrow",
"arrow-array",
@@ -2948,7 +2948,7 @@ dependencies = [
[[package]]
name = "datafusion-catalog"
version = "45.0.0"
-source = "git+https://github.com/apache/datafusion.git?rev=8ebed674dd71f8a466f658626877944cd16a4375#8ebed674dd71f8a466f658626877944cd16a4375"
+source = "git+https://github.com/waynexia/arrow-datafusion.git?rev=5bbedc6704162afb03478f56ffb629405a4e1220#5bbedc6704162afb03478f56ffb629405a4e1220"
dependencies = [
"arrow",
"async-trait",
@@ -2968,7 +2968,7 @@ dependencies = [
[[package]]
name = "datafusion-catalog-listing"
version = "45.0.0"
-source = "git+https://github.com/apache/datafusion.git?rev=8ebed674dd71f8a466f658626877944cd16a4375#8ebed674dd71f8a466f658626877944cd16a4375"
+source = "git+https://github.com/waynexia/arrow-datafusion.git?rev=5bbedc6704162afb03478f56ffb629405a4e1220#5bbedc6704162afb03478f56ffb629405a4e1220"
dependencies = [
"arrow",
"arrow-schema",
@@ -2991,7 +2991,7 @@ dependencies = [
[[package]]
name = "datafusion-common"
version = "45.0.0"
-source = "git+https://github.com/apache/datafusion.git?rev=8ebed674dd71f8a466f658626877944cd16a4375#8ebed674dd71f8a466f658626877944cd16a4375"
+source = "git+https://github.com/waynexia/arrow-datafusion.git?rev=5bbedc6704162afb03478f56ffb629405a4e1220#5bbedc6704162afb03478f56ffb629405a4e1220"
dependencies = [
"ahash 0.8.11",
"arrow",
@@ -3016,7 +3016,7 @@ dependencies = [
[[package]]
name = "datafusion-common-runtime"
version = "45.0.0"
-source = "git+https://github.com/apache/datafusion.git?rev=8ebed674dd71f8a466f658626877944cd16a4375#8ebed674dd71f8a466f658626877944cd16a4375"
+source = "git+https://github.com/waynexia/arrow-datafusion.git?rev=5bbedc6704162afb03478f56ffb629405a4e1220#5bbedc6704162afb03478f56ffb629405a4e1220"
dependencies = [
"log",
"tokio",
@@ -3025,12 +3025,12 @@ dependencies = [
[[package]]
name = "datafusion-doc"
version = "45.0.0"
-source = "git+https://github.com/apache/datafusion.git?rev=8ebed674dd71f8a466f658626877944cd16a4375#8ebed674dd71f8a466f658626877944cd16a4375"
+source = "git+https://github.com/waynexia/arrow-datafusion.git?rev=5bbedc6704162afb03478f56ffb629405a4e1220#5bbedc6704162afb03478f56ffb629405a4e1220"
[[package]]
name = "datafusion-execution"
version = "45.0.0"
-source = "git+https://github.com/apache/datafusion.git?rev=8ebed674dd71f8a466f658626877944cd16a4375#8ebed674dd71f8a466f658626877944cd16a4375"
+source = "git+https://github.com/waynexia/arrow-datafusion.git?rev=5bbedc6704162afb03478f56ffb629405a4e1220#5bbedc6704162afb03478f56ffb629405a4e1220"
dependencies = [
"arrow",
"dashmap",
@@ -3048,7 +3048,7 @@ dependencies = [
[[package]]
name = "datafusion-expr"
version = "45.0.0"
-source = "git+https://github.com/apache/datafusion.git?rev=8ebed674dd71f8a466f658626877944cd16a4375#8ebed674dd71f8a466f658626877944cd16a4375"
+source = "git+https://github.com/waynexia/arrow-datafusion.git?rev=5bbedc6704162afb03478f56ffb629405a4e1220#5bbedc6704162afb03478f56ffb629405a4e1220"
dependencies = [
"arrow",
"chrono",
@@ -3068,7 +3068,7 @@ dependencies = [
[[package]]
name = "datafusion-expr-common"
version = "45.0.0"
-source = "git+https://github.com/apache/datafusion.git?rev=8ebed674dd71f8a466f658626877944cd16a4375#8ebed674dd71f8a466f658626877944cd16a4375"
+source = "git+https://github.com/waynexia/arrow-datafusion.git?rev=5bbedc6704162afb03478f56ffb629405a4e1220#5bbedc6704162afb03478f56ffb629405a4e1220"
dependencies = [
"arrow",
"datafusion-common",
@@ -3079,7 +3079,7 @@ dependencies = [
[[package]]
name = "datafusion-functions"
version = "45.0.0"
-source = "git+https://github.com/apache/datafusion.git?rev=8ebed674dd71f8a466f658626877944cd16a4375#8ebed674dd71f8a466f658626877944cd16a4375"
+source = "git+https://github.com/waynexia/arrow-datafusion.git?rev=5bbedc6704162afb03478f56ffb629405a4e1220#5bbedc6704162afb03478f56ffb629405a4e1220"
dependencies = [
"arrow",
"arrow-buffer",
@@ -3108,7 +3108,7 @@ dependencies = [
[[package]]
name = "datafusion-functions-aggregate"
version = "45.0.0"
-source = "git+https://github.com/apache/datafusion.git?rev=8ebed674dd71f8a466f658626877944cd16a4375#8ebed674dd71f8a466f658626877944cd16a4375"
+source = "git+https://github.com/waynexia/arrow-datafusion.git?rev=5bbedc6704162afb03478f56ffb629405a4e1220#5bbedc6704162afb03478f56ffb629405a4e1220"
dependencies = [
"ahash 0.8.11",
"arrow",
@@ -3129,7 +3129,7 @@ dependencies = [
[[package]]
name = "datafusion-functions-aggregate-common"
version = "45.0.0"
-source = "git+https://github.com/apache/datafusion.git?rev=8ebed674dd71f8a466f658626877944cd16a4375#8ebed674dd71f8a466f658626877944cd16a4375"
+source = "git+https://github.com/waynexia/arrow-datafusion.git?rev=5bbedc6704162afb03478f56ffb629405a4e1220#5bbedc6704162afb03478f56ffb629405a4e1220"
dependencies = [
"ahash 0.8.11",
"arrow",
@@ -3141,7 +3141,7 @@ dependencies = [
[[package]]
name = "datafusion-functions-nested"
version = "45.0.0"
-source = "git+https://github.com/apache/datafusion.git?rev=8ebed674dd71f8a466f658626877944cd16a4375#8ebed674dd71f8a466f658626877944cd16a4375"
+source = "git+https://github.com/waynexia/arrow-datafusion.git?rev=5bbedc6704162afb03478f56ffb629405a4e1220#5bbedc6704162afb03478f56ffb629405a4e1220"
dependencies = [
"arrow",
"arrow-array",
@@ -3163,7 +3163,7 @@ dependencies = [
[[package]]
name = "datafusion-functions-table"
version = "45.0.0"
-source = "git+https://github.com/apache/datafusion.git?rev=8ebed674dd71f8a466f658626877944cd16a4375#8ebed674dd71f8a466f658626877944cd16a4375"
+source = "git+https://github.com/waynexia/arrow-datafusion.git?rev=5bbedc6704162afb03478f56ffb629405a4e1220#5bbedc6704162afb03478f56ffb629405a4e1220"
dependencies = [
"arrow",
"async-trait",
@@ -3178,7 +3178,7 @@ dependencies = [
[[package]]
name = "datafusion-functions-window"
version = "45.0.0"
-source = "git+https://github.com/apache/datafusion.git?rev=8ebed674dd71f8a466f658626877944cd16a4375#8ebed674dd71f8a466f658626877944cd16a4375"
+source = "git+https://github.com/waynexia/arrow-datafusion.git?rev=5bbedc6704162afb03478f56ffb629405a4e1220#5bbedc6704162afb03478f56ffb629405a4e1220"
dependencies = [
"datafusion-common",
"datafusion-doc",
@@ -3194,7 +3194,7 @@ dependencies = [
[[package]]
name = "datafusion-functions-window-common"
version = "45.0.0"
-source = "git+https://github.com/apache/datafusion.git?rev=8ebed674dd71f8a466f658626877944cd16a4375#8ebed674dd71f8a466f658626877944cd16a4375"
+source = "git+https://github.com/waynexia/arrow-datafusion.git?rev=5bbedc6704162afb03478f56ffb629405a4e1220#5bbedc6704162afb03478f56ffb629405a4e1220"
dependencies = [
"datafusion-common",
"datafusion-physical-expr-common",
@@ -3203,7 +3203,7 @@ dependencies = [
[[package]]
name = "datafusion-macros"
version = "45.0.0"
-source = "git+https://github.com/apache/datafusion.git?rev=8ebed674dd71f8a466f658626877944cd16a4375#8ebed674dd71f8a466f658626877944cd16a4375"
+source = "git+https://github.com/waynexia/arrow-datafusion.git?rev=5bbedc6704162afb03478f56ffb629405a4e1220#5bbedc6704162afb03478f56ffb629405a4e1220"
dependencies = [
"datafusion-expr",
"quote",
@@ -3213,7 +3213,7 @@ dependencies = [
[[package]]
name = "datafusion-optimizer"
version = "45.0.0"
-source = "git+https://github.com/apache/datafusion.git?rev=8ebed674dd71f8a466f658626877944cd16a4375#8ebed674dd71f8a466f658626877944cd16a4375"
+source = "git+https://github.com/waynexia/arrow-datafusion.git?rev=5bbedc6704162afb03478f56ffb629405a4e1220#5bbedc6704162afb03478f56ffb629405a4e1220"
dependencies = [
"arrow",
"chrono",
@@ -3231,7 +3231,7 @@ dependencies = [
[[package]]
name = "datafusion-physical-expr"
version = "45.0.0"
-source = "git+https://github.com/apache/datafusion.git?rev=8ebed674dd71f8a466f658626877944cd16a4375#8ebed674dd71f8a466f658626877944cd16a4375"
+source = "git+https://github.com/waynexia/arrow-datafusion.git?rev=5bbedc6704162afb03478f56ffb629405a4e1220#5bbedc6704162afb03478f56ffb629405a4e1220"
dependencies = [
"ahash 0.8.11",
"arrow",
@@ -3254,7 +3254,7 @@ dependencies = [
[[package]]
name = "datafusion-physical-expr-common"
version = "45.0.0"
-source = "git+https://github.com/apache/datafusion.git?rev=8ebed674dd71f8a466f658626877944cd16a4375#8ebed674dd71f8a466f658626877944cd16a4375"
+source = "git+https://github.com/waynexia/arrow-datafusion.git?rev=5bbedc6704162afb03478f56ffb629405a4e1220#5bbedc6704162afb03478f56ffb629405a4e1220"
dependencies = [
"ahash 0.8.11",
"arrow",
@@ -3267,7 +3267,7 @@ dependencies = [
[[package]]
name = "datafusion-physical-optimizer"
version = "45.0.0"
-source = "git+https://github.com/apache/datafusion.git?rev=8ebed674dd71f8a466f658626877944cd16a4375#8ebed674dd71f8a466f658626877944cd16a4375"
+source = "git+https://github.com/waynexia/arrow-datafusion.git?rev=5bbedc6704162afb03478f56ffb629405a4e1220#5bbedc6704162afb03478f56ffb629405a4e1220"
dependencies = [
"arrow",
"arrow-schema",
@@ -3288,7 +3288,7 @@ dependencies = [
[[package]]
name = "datafusion-physical-plan"
version = "45.0.0"
-source = "git+https://github.com/apache/datafusion.git?rev=8ebed674dd71f8a466f658626877944cd16a4375#8ebed674dd71f8a466f658626877944cd16a4375"
+source = "git+https://github.com/waynexia/arrow-datafusion.git?rev=5bbedc6704162afb03478f56ffb629405a4e1220#5bbedc6704162afb03478f56ffb629405a4e1220"
dependencies = [
"ahash 0.8.11",
"arrow",
@@ -3318,7 +3318,7 @@ dependencies = [
[[package]]
name = "datafusion-sql"
version = "45.0.0"
-source = "git+https://github.com/apache/datafusion.git?rev=8ebed674dd71f8a466f658626877944cd16a4375#8ebed674dd71f8a466f658626877944cd16a4375"
+source = "git+https://github.com/waynexia/arrow-datafusion.git?rev=5bbedc6704162afb03478f56ffb629405a4e1220#5bbedc6704162afb03478f56ffb629405a4e1220"
dependencies = [
"arrow",
"arrow-array",
@@ -3336,7 +3336,7 @@ dependencies = [
[[package]]
name = "datafusion-substrait"
version = "45.0.0"
-source = "git+https://github.com/apache/datafusion.git?rev=8ebed674dd71f8a466f658626877944cd16a4375#8ebed674dd71f8a466f658626877944cd16a4375"
+source = "git+https://github.com/waynexia/arrow-datafusion.git?rev=5bbedc6704162afb03478f56ffb629405a4e1220#5bbedc6704162afb03478f56ffb629405a4e1220"
dependencies = [
"async-recursion",
"async-trait",
diff --git a/Cargo.toml b/Cargo.toml
index f3bd54a661..05835d88f3 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -112,15 +112,15 @@ clap = { version = "4.4", features = ["derive"] }
config = "0.13.0"
crossbeam-utils = "0.8"
dashmap = "6.1"
-datafusion = { git = "https://github.com/apache/datafusion.git", rev = "8ebed674dd71f8a466f658626877944cd16a4375" }
-datafusion-common = { git = "https://github.com/apache/datafusion.git", rev = "8ebed674dd71f8a466f658626877944cd16a4375" }
-datafusion-expr = { git = "https://github.com/apache/datafusion.git", rev = "8ebed674dd71f8a466f658626877944cd16a4375" }
-datafusion-functions = { git = "https://github.com/apache/datafusion.git", rev = "8ebed674dd71f8a466f658626877944cd16a4375" }
-datafusion-optimizer = { git = "https://github.com/apache/datafusion.git", rev = "8ebed674dd71f8a466f658626877944cd16a4375" }
-datafusion-physical-expr = { git = "https://github.com/apache/datafusion.git", rev = "8ebed674dd71f8a466f658626877944cd16a4375" }
-datafusion-physical-plan = { git = "https://github.com/apache/datafusion.git", rev = "8ebed674dd71f8a466f658626877944cd16a4375" }
-datafusion-sql = { git = "https://github.com/apache/datafusion.git", rev = "8ebed674dd71f8a466f658626877944cd16a4375" }
-datafusion-substrait = { git = "https://github.com/apache/datafusion.git", rev = "8ebed674dd71f8a466f658626877944cd16a4375" }
+datafusion = { git = "https://github.com/waynexia/arrow-datafusion.git", rev = "5bbedc6704162afb03478f56ffb629405a4e1220" }
+datafusion-common = { git = "https://github.com/waynexia/arrow-datafusion.git", rev = "5bbedc6704162afb03478f56ffb629405a4e1220" }
+datafusion-expr = { git = "https://github.com/waynexia/arrow-datafusion.git", rev = "5bbedc6704162afb03478f56ffb629405a4e1220" }
+datafusion-functions = { git = "https://github.com/waynexia/arrow-datafusion.git", rev = "5bbedc6704162afb03478f56ffb629405a4e1220" }
+datafusion-optimizer = { git = "https://github.com/waynexia/arrow-datafusion.git", rev = "5bbedc6704162afb03478f56ffb629405a4e1220" }
+datafusion-physical-expr = { git = "https://github.com/waynexia/arrow-datafusion.git", rev = "5bbedc6704162afb03478f56ffb629405a4e1220" }
+datafusion-physical-plan = { git = "https://github.com/waynexia/arrow-datafusion.git", rev = "5bbedc6704162afb03478f56ffb629405a4e1220" }
+datafusion-sql = { git = "https://github.com/waynexia/arrow-datafusion.git", rev = "5bbedc6704162afb03478f56ffb629405a4e1220" }
+datafusion-substrait = { git = "https://github.com/waynexia/arrow-datafusion.git", rev = "5bbedc6704162afb03478f56ffb629405a4e1220" }
deadpool = "0.12"
deadpool-postgres = "0.14"
derive_builder = "0.20"
diff --git a/src/query/src/optimizer.rs b/src/query/src/optimizer.rs
index e6596e923a..52a33029e2 100644
--- a/src/query/src/optimizer.rs
+++ b/src/query/src/optimizer.rs
@@ -21,6 +21,7 @@ pub mod scan_hint;
pub mod string_normalization;
#[cfg(test)]
pub(crate) mod test_util;
+pub mod transcribe_atat;
pub mod type_conversion;
pub mod windowed_sort;
diff --git a/src/query/src/optimizer/transcribe_atat.rs b/src/query/src/optimizer/transcribe_atat.rs
new file mode 100644
index 0000000000..3292f19f08
--- /dev/null
+++ b/src/query/src/optimizer/transcribe_atat.rs
@@ -0,0 +1,230 @@
+// Copyright 2023 Greptime Team
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+use std::sync::Arc;
+
+use common_function::scalars::matches_term::MatchesTermFunction;
+use common_function::scalars::udf::create_udf;
+use common_function::state::FunctionState;
+use datafusion::config::ConfigOptions;
+use datafusion_common::tree_node::{Transformed, TreeNode, TreeNodeRewriter};
+use datafusion_common::Result;
+use datafusion_expr::expr::ScalarFunction;
+use datafusion_expr::{Expr, LogicalPlan};
+use datafusion_optimizer::analyzer::AnalyzerRule;
+use session::context::QueryContext;
+
+use crate::plan::ExtractExpr;
+
+/// TranscribeAtatRule is an analyzer rule that transcribes `@@` operator
+/// to `matches_term` function.
+///
+/// Example:
+/// ```sql
+/// SELECT 'cat!' @@ 'cat' as result;
+///
+/// SELECT `log_message` @@ '/start' as `matches_start` FROM t;
+/// ```
+///
+/// to
+///
+/// ```sql
+/// SELECT matches_term('cat!', 'cat') as result;
+///
+/// SELECT matches_term(`log_message`, '/start') as `matches_start` FROM t;
+/// ```
+#[derive(Debug)]
+pub struct TranscribeAtatRule;
+
+impl AnalyzerRule for TranscribeAtatRule {
+    fn analyze(&self, plan: LogicalPlan, _config: &ConfigOptions) -> Result<LogicalPlan> {
+ plan.transform(Self::do_analyze).map(|x| x.data)
+ }
+
+ fn name(&self) -> &str {
+ "TranscribeAtatRule"
+ }
+}
+
+impl TranscribeAtatRule {
+    fn do_analyze(plan: LogicalPlan) -> Result<Transformed<LogicalPlan>> {
+ let mut rewriter = TranscribeAtatRewriter::default();
+ let new_expr = plan
+ .expressions_consider_join()
+ .into_iter()
+ .map(|e| e.rewrite(&mut rewriter).map(|x| x.data))
+            .collect::<Result<Vec<_>>>()?;
+
+ if rewriter.transcribed {
+            let inputs = plan.inputs().into_iter().cloned().collect::<Vec<_>>();
+ plan.with_new_exprs(new_expr, inputs).map(Transformed::yes)
+ } else {
+ Ok(Transformed::no(plan))
+ }
+ }
+}
+
+#[derive(Default)]
+struct TranscribeAtatRewriter {
+ transcribed: bool,
+}
+
+impl TreeNodeRewriter for TranscribeAtatRewriter {
+ type Node = Expr;
+
+    fn f_up(&mut self, expr: Expr) -> Result<Transformed<Expr>> {
+ if let Expr::BinaryExpr(binary_expr) = &expr
+ && matches!(binary_expr.op, datafusion_expr::Operator::AtAt)
+ {
+ self.transcribed = true;
+ let scalar_udf = create_udf(
+ Arc::new(MatchesTermFunction),
+ QueryContext::arc(),
+ Arc::new(FunctionState::default()),
+ );
+ let exprs = vec![
+ binary_expr.left.as_ref().clone(),
+ binary_expr.right.as_ref().clone(),
+ ];
+ Ok(Transformed::yes(Expr::ScalarFunction(
+ ScalarFunction::new_udf(Arc::new(scalar_udf), exprs),
+ )))
+ } else {
+ Ok(Transformed::no(expr))
+ }
+ }
+}
+#[cfg(test)]
+mod tests {
+
+ use arrow_schema::SchemaRef;
+ use datafusion::datasource::{provider_as_source, MemTable};
+ use datafusion::logical_expr::{col, lit, LogicalPlan, LogicalPlanBuilder};
+ use datafusion_expr::{BinaryExpr, Operator};
+ use datatypes::arrow::datatypes::{DataType, Field, Schema};
+
+ use super::*;
+
+    fn optimize(plan: LogicalPlan) -> Result<LogicalPlan> {
+ TranscribeAtatRule.analyze(plan, &ConfigOptions::default())
+ }
+
+ fn prepare_test_plan_builder() -> LogicalPlanBuilder {
+ let schema = Schema::new(vec![
+ Field::new("a", DataType::Utf8, false),
+ Field::new("b", DataType::Utf8, false),
+ ]);
+ let table = MemTable::try_new(SchemaRef::from(schema), vec![]).unwrap();
+ LogicalPlanBuilder::scan("t", provider_as_source(Arc::new(table)), None).unwrap()
+ }
+
+ #[test]
+ fn test_multiple_atat() {
+ let plan = prepare_test_plan_builder()
+ .filter(
+ Expr::BinaryExpr(BinaryExpr {
+ left: Box::new(col("a")),
+ op: Operator::AtAt,
+ right: Box::new(lit("foo")),
+ })
+ .and(Expr::BinaryExpr(BinaryExpr {
+ left: Box::new(col("b")),
+ op: Operator::AtAt,
+ right: Box::new(lit("bar")),
+ })),
+ )
+ .unwrap()
+ .project(vec![
+ Expr::BinaryExpr(BinaryExpr {
+ left: Box::new(col("a")),
+ op: Operator::AtAt,
+ right: Box::new(col("b")),
+ }),
+ col("b"),
+ ])
+ .unwrap()
+ .build()
+ .unwrap();
+
+ let expected = r#"Projection: matches_term(t.a, t.b), t.b
+ Filter: matches_term(t.a, Utf8("foo")) AND matches_term(t.b, Utf8("bar"))
+ TableScan: t"#;
+
+ let optimized_plan = optimize(plan).unwrap();
+ let formatted = optimized_plan.to_string();
+
+ assert_eq!(formatted, expected);
+ }
+
+ #[test]
+ fn test_nested_atat() {
+ let plan = prepare_test_plan_builder()
+ .filter(
+ Expr::BinaryExpr(BinaryExpr {
+ left: Box::new(col("a")),
+ op: Operator::AtAt,
+ right: Box::new(lit("foo")),
+ })
+ .and(
+ Expr::BinaryExpr(BinaryExpr {
+ left: Box::new(col("b")),
+ op: Operator::AtAt,
+ right: Box::new(lit("bar")),
+ })
+ .or(Expr::BinaryExpr(BinaryExpr {
+ left: Box::new(
+ // Nested case in function argument
+ Expr::BinaryExpr(BinaryExpr {
+ left: Box::new(col("a")),
+ op: Operator::AtAt,
+ right: Box::new(lit("nested")),
+ }),
+ ),
+ op: Operator::Eq,
+ right: Box::new(lit(true)),
+ })),
+ ),
+ )
+ .unwrap()
+ .project(vec![
+ col("a"),
+ // Complex nested expression with multiple @@ operators
+ Expr::BinaryExpr(BinaryExpr {
+ left: Box::new(Expr::BinaryExpr(BinaryExpr {
+ left: Box::new(col("a")),
+ op: Operator::AtAt,
+ right: Box::new(lit("foo")),
+ })),
+ op: Operator::And,
+ right: Box::new(Expr::BinaryExpr(BinaryExpr {
+ left: Box::new(col("b")),
+ op: Operator::AtAt,
+ right: Box::new(lit("bar")),
+ })),
+ }),
+ ])
+ .unwrap()
+ .build()
+ .unwrap();
+
+ let expected = r#"Projection: t.a, matches_term(t.a, Utf8("foo")) AND matches_term(t.b, Utf8("bar"))
+ Filter: matches_term(t.a, Utf8("foo")) AND (matches_term(t.b, Utf8("bar")) OR matches_term(t.a, Utf8("nested")) = Boolean(true))
+ TableScan: t"#;
+
+ let optimized_plan = optimize(plan).unwrap();
+ let formatted = optimized_plan.to_string();
+
+ assert_eq!(formatted, expected);
+ }
+}
diff --git a/src/query/src/query_engine/state.rs b/src/query/src/query_engine/state.rs
index 03f3a2a13d..d55ab471f9 100644
--- a/src/query/src/query_engine/state.rs
+++ b/src/query/src/query_engine/state.rs
@@ -52,6 +52,7 @@ use crate::optimizer::pass_distribution::PassDistribution;
use crate::optimizer::remove_duplicate::RemoveDuplicate;
use crate::optimizer::scan_hint::ScanHintRule;
use crate::optimizer::string_normalization::StringNormalizationRule;
+use crate::optimizer::transcribe_atat::TranscribeAtatRule;
use crate::optimizer::type_conversion::TypeConversionRule;
use crate::optimizer::windowed_sort::WindowedSortPhysicalRule;
use crate::optimizer::ExtensionAnalyzerRule;
@@ -115,6 +116,7 @@ impl QueryEngineState {
// Apply the datafusion rules
let mut analyzer = Analyzer::new();
+ analyzer.rules.insert(0, Arc::new(TranscribeAtatRule));
analyzer.rules.insert(0, Arc::new(StringNormalizationRule));
// Use our custom rule instead to optimize the count(*) query
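
For readers unfamiliar with DataFusion's analyzer pipeline, the snippet below is a minimal, self-contained sketch of the registration pattern used above: a custom `AnalyzerRule` inserted at index 0 of `Analyzer::rules` runs ahead of DataFusion's built-in analyzer passes. `NoopRule` and `build_analyzer` are hypothetical names for this sketch only; the crate paths mirror the imports in `transcribe_atat.rs` above.

```rust
use std::sync::Arc;

use datafusion::config::ConfigOptions;
use datafusion_common::Result;
use datafusion_expr::LogicalPlan;
use datafusion_optimizer::analyzer::{Analyzer, AnalyzerRule};

/// A do-nothing rule, used only to illustrate registration order.
#[derive(Debug)]
struct NoopRule;

impl AnalyzerRule for NoopRule {
    fn analyze(&self, plan: LogicalPlan, _config: &ConfigOptions) -> Result<LogicalPlan> {
        // Pass the plan through unchanged.
        Ok(plan)
    }

    fn name(&self) -> &str {
        "NoopRule"
    }
}

fn build_analyzer() -> Analyzer {
    let mut analyzer = Analyzer::new();
    // Inserting at index 0 places the custom rule ahead of DataFusion's
    // built-in analyzer rules, so it sees the plan before they rewrite it.
    analyzer.rules.insert(0, Arc::new(NoopRule));
    analyzer
}
```

Because `StringNormalizationRule` is inserted at index 0 after `TranscribeAtatRule`, it ends up running first, which matches the pass order recorded in the `explain.result` change later in this patch.
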
diff --git a/tests/cases/standalone/common/expr/atat.result b/tests/cases/standalone/common/expr/atat.result
new file mode 100644
index 0000000000..6beec6347a
--- /dev/null
+++ b/tests/cases/standalone/common/expr/atat.result
@@ -0,0 +1,315 @@
+-- Derived from matches_term cases
+-- Test basic term matching
+-- Expect: true
+SELECT 'cat!' @@ 'cat' as result;
+
++--------+
+| result |
++--------+
+| true |
++--------+
+
+-- Test phrase matching with spaces
+-- Expect: true
+SELECT 'warning:hello world!' @@ 'hello world' as result;
+
++--------+
+| result |
++--------+
+| true |
++--------+
+
+-- Test numbers in term
+SELECT 'v1.0!' @@ 'v1.0' as result;
+
++--------+
+| result |
++--------+
+| true |
++--------+
+
+-- Test case sensitivity
+-- Expect: true
+SELECT 'Cat' @@ 'Cat' as result;
+
++--------+
+| result |
++--------+
+| true |
++--------+
+
+-- Expect: false
+SELECT 'cat' @@ 'Cat' as result;
+
++--------+
+| result |
++--------+
+| false |
++--------+
+
+-- Test empty string handling
+-- Expect: true
+SELECT '' @@ '' as result;
+
++--------+
+| result |
++--------+
+| true |
++--------+
+
+-- Expect: false
+SELECT 'any' @@ '' as result;
+
++--------+
+| result |
++--------+
+| false |
++--------+
+
+-- Expect: false
+SELECT '' @@ 'any' as result;
+
++--------+
+| result |
++--------+
+| false |
++--------+
+
+-- Test partial matches (should fail)
+-- Expect: false
+SELECT 'category' @@ 'cat' as result;
+
++--------+
+| result |
++--------+
+| false |
++--------+
+
+-- Expect: false
+SELECT 'rebooted' @@ 'boot' as result;
+
++--------+
+| result |
++--------+
+| false |
++--------+
+
+-- Test adjacent alphanumeric characters
+SELECT 'cat5' @@ 'cat' as result;
+
++--------+
+| result |
++--------+
+| false |
++--------+
+
+SELECT 'dogcat' @@ 'dog' as result;
+
++--------+
+| result |
++--------+
+| false |
++--------+
+
+-- Test leading non-alphanumeric
+-- Expect: true
+SELECT 'dog/cat' @@ '/cat' as result;
+
++--------+
+| result |
++--------+
+| true |
++--------+
+
+-- Expect: true
+SELECT 'dog/cat' @@ 'dog/' as result;
+
++--------+
+| result |
++--------+
+| true |
++--------+
+
+-- Expect: true
+SELECT 'dog/cat' @@ 'dog/cat' as result;
+
++--------+
+| result |
++--------+
+| true |
++--------+
+
+-- Test unicode characters
+-- Expect: true
+SELECT 'café>' @@ 'café' as result;
+
++--------+
+| result |
++--------+
+| true |
++--------+
+
+-- Expect: true
+SELECT 'русский!' @@ 'русский' as result;
+
++--------+
+| result |
++--------+
+| true |
++--------+
+
+-- Test complete word matching
+CREATE TABLE logs (
+ `id` TIMESTAMP TIME INDEX,
+ `log_message` STRING
+);
+
+Affected Rows: 0
+
+INSERT INTO logs VALUES
+ (1, 'An error occurred!'),
+ (2, 'Critical error: system failure'),
+ (3, 'error-prone'),
+ (4, 'errors'),
+ (5, 'error123'),
+ (6, 'errorLogs'),
+ (7, 'Version v1.0 released'),
+ (8, 'v1.0!'),
+ (9, 'v1.0a'),
+ (10, 'v1.0beta'),
+ (11, 'GET /app/start'),
+ (12, 'Command: /start-prosess'),
+ (13, 'Command: /start'),
+ (14, 'start'),
+ (15, 'start/stop'),
+ (16, 'Alert: system failure detected'),
+ (17, 'system failure!'),
+ (18, 'system-failure'),
+ (19, 'system failure2023'),
+ (20, 'critical error: system failure'),
+ (21, 'critical failure detected'),
+ (22, 'critical issue'),
+ (23, 'failure imminent'),
+ (24, 'Warning: high temperature'),
+ (25, 'WARNING: system overload'),
+ (26, 'warned'),
+ (27, 'warnings');
+
+Affected Rows: 27
+
+-- Test complete word matching for 'error'
+-- Expect:
+-- 1|An error occurred!|true
+-- 2|Critical error: system failure|true
+-- 3|error-prone|true
+-- 4|errors|false
+-- 5|error123|false
+-- 6|errorLogs|false
+SELECT `id`, `log_message`, `log_message` @@ 'error' as `matches_error` FROM logs WHERE `id` <= 6 ORDER BY `id`;
+
++-------------------------+--------------------------------+---------------+
+| id | log_message | matches_error |
++-------------------------+--------------------------------+---------------+
+| 1970-01-01T00:00:00.001 | An error occurred! | true |
+| 1970-01-01T00:00:00.002 | Critical error: system failure | true |
+| 1970-01-01T00:00:00.003 | error-prone | true |
+| 1970-01-01T00:00:00.004 | errors | false |
+| 1970-01-01T00:00:00.005 | error123 | false |
+| 1970-01-01T00:00:00.006 | errorLogs | false |
++-------------------------+--------------------------------+---------------+
+
+-- Test complete word matching for 'v1.0'
+-- Expect:
+-- 7|Version v1.0 released|true
+-- 8|v1.0!|true
+-- 9|v1.0a|false
+-- 10|v1.0beta|false
+SELECT `id`, `log_message`, `log_message` @@ 'v1.0' as `matches_version` FROM logs WHERE `id` BETWEEN 7 AND 10 ORDER BY `id`;
+
++-------------------------+-----------------------+-----------------+
+| id | log_message | matches_version |
++-------------------------+-----------------------+-----------------+
+| 1970-01-01T00:00:00.007 | Version v1.0 released | true |
+| 1970-01-01T00:00:00.008 | v1.0! | true |
+| 1970-01-01T00:00:00.009 | v1.0a | false |
+| 1970-01-01T00:00:00.010 | v1.0beta | false |
++-------------------------+-----------------------+-----------------+
+
+-- Test complete word matching for '/start'
+-- Expect:
+-- 11|GET /app/start|true
+-- 12|Command: /start-prosess|true
+-- 13|Command: /start|true
+-- 14|start|false
+-- 15|start/stop|false
+SELECT `id`, `log_message`, `log_message` @@ '/start' as `matches_start` FROM logs WHERE `id` BETWEEN 11 AND 15 ORDER BY `id`;
+
++-------------------------+-------------------------+---------------+
+| id | log_message | matches_start |
++-------------------------+-------------------------+---------------+
+| 1970-01-01T00:00:00.011 | GET /app/start | true |
+| 1970-01-01T00:00:00.012 | Command: /start-prosess | true |
+| 1970-01-01T00:00:00.013 | Command: /start | true |
+| 1970-01-01T00:00:00.014 | start | false |
+| 1970-01-01T00:00:00.015 | start/stop | false |
++-------------------------+-------------------------+---------------+
+
+-- Test phrase matching for 'system failure'
+-- Expect:
+-- 16|Alert: system failure detected|true
+-- 17|system failure!|true
+-- 18|system-failure|false
+-- 19|system failure2023|false
+SELECT `id`, `log_message`, `log_message` @@ 'system failure' as `matches_phrase` FROM logs WHERE `id` BETWEEN 16 AND 19 ORDER BY `id`;
+
++-------------------------+--------------------------------+----------------+
+| id | log_message | matches_phrase |
++-------------------------+--------------------------------+----------------+
+| 1970-01-01T00:00:00.016 | Alert: system failure detected | true |
+| 1970-01-01T00:00:00.017 | system failure! | true |
+| 1970-01-01T00:00:00.018 | system-failure | false |
+| 1970-01-01T00:00:00.019 | system failure2023 | false |
++-------------------------+--------------------------------+----------------+
+
+-- Test multi-word matching using AND
+-- Expect:
+-- 20|critical error: system failure|true|true|true
+-- 21|critical failure detected|true|true|true
+-- 22|critical issue|true|false|false
+-- 23|failure imminent|false|true|false
+SELECT `id`, `log_message`,
+ `log_message` @@ 'critical' as `matches_critical`,
+ `log_message` @@ 'failure' as `matches_failure`,
+ `log_message` @@ 'critical' AND `log_message` @@ 'failure' as `matches_both`
+FROM logs WHERE `id` BETWEEN 20 AND 23 ORDER BY `id`;
+
++-------------------------+--------------------------------+------------------+-----------------+--------------+
+| id | log_message | matches_critical | matches_failure | matches_both |
++-------------------------+--------------------------------+------------------+-----------------+--------------+
+| 1970-01-01T00:00:00.020 | critical error: system failure | true | true | true |
+| 1970-01-01T00:00:00.021 | critical failure detected | true | true | true |
+| 1970-01-01T00:00:00.022 | critical issue | true | false | false |
+| 1970-01-01T00:00:00.023 | failure imminent | false | true | false |
++-------------------------+--------------------------------+------------------+-----------------+--------------+
+
+-- Test case-insensitive matching using lower()
+-- Expect:
+-- 24|Warning: high temperature|true
+-- 25|WARNING: system overload|true
+-- 26|warned|false
+-- 27|warnings|false
+SELECT `id`, `log_message`, lower(`log_message`) @@ 'warning' as `matches_warning` FROM logs WHERE `id` >= 24 ORDER BY `id`;
+
++-------------------------+---------------------------+-----------------+
+| id | log_message | matches_warning |
++-------------------------+---------------------------+-----------------+
+| 1970-01-01T00:00:00.024 | Warning: high temperature | true |
+| 1970-01-01T00:00:00.025 | WARNING: system overload | true |
+| 1970-01-01T00:00:00.026 | warned | false |
+| 1970-01-01T00:00:00.027 | warnings | false |
++-------------------------+---------------------------+-----------------+
+
+DROP TABLE logs;
+
+Affected Rows: 0
+
diff --git a/tests/cases/standalone/common/expr/atat.sql b/tests/cases/standalone/common/expr/atat.sql
new file mode 100644
index 0000000000..da32dcf0bf
--- /dev/null
+++ b/tests/cases/standalone/common/expr/atat.sql
@@ -0,0 +1,144 @@
+-- Derived from matches_term cases
+
+-- Test basic term matching
+-- Expect: true
+SELECT 'cat!' @@ 'cat' as result;
+
+-- Test phrase matching with spaces
+-- Expect: true
+SELECT 'warning:hello world!' @@ 'hello world' as result;
+
+-- Test numbers in term
+SELECT 'v1.0!' @@ 'v1.0' as result;
+
+-- Test case sensitivity
+-- Expect: true
+SELECT 'Cat' @@ 'Cat' as result;
+-- Expect: false
+SELECT 'cat' @@ 'Cat' as result;
+
+-- Test empty string handling
+-- Expect: true
+SELECT '' @@ '' as result;
+-- Expect: false
+SELECT 'any' @@ '' as result;
+-- Expect: false
+SELECT '' @@ 'any' as result;
+
+-- Test partial matches (should fail)
+-- Expect: false
+SELECT 'category' @@ 'cat' as result;
+-- Expect: false
+SELECT 'rebooted' @@ 'boot' as result;
+
+-- Test adjacent alphanumeric characters
+SELECT 'cat5' @@ 'cat' as result;
+SELECT 'dogcat' @@ 'dog' as result;
+
+-- Test leading non-alphanumeric
+-- Expect: true
+SELECT 'dog/cat' @@ '/cat' as result;
+-- Expect: true
+SELECT 'dog/cat' @@ 'dog/' as result;
+-- Expect: true
+SELECT 'dog/cat' @@ 'dog/cat' as result;
+
+-- Test unicode characters
+-- Expect: true
+SELECT 'café>' @@ 'café' as result;
+-- Expect: true
+SELECT 'русский!' @@ 'русский' as result;
+
+-- Test complete word matching
+CREATE TABLE logs (
+ `id` TIMESTAMP TIME INDEX,
+ `log_message` STRING
+);
+
+INSERT INTO logs VALUES
+ (1, 'An error occurred!'),
+ (2, 'Critical error: system failure'),
+ (3, 'error-prone'),
+ (4, 'errors'),
+ (5, 'error123'),
+ (6, 'errorLogs'),
+ (7, 'Version v1.0 released'),
+ (8, 'v1.0!'),
+ (9, 'v1.0a'),
+ (10, 'v1.0beta'),
+ (11, 'GET /app/start'),
+ (12, 'Command: /start-prosess'),
+ (13, 'Command: /start'),
+ (14, 'start'),
+ (15, 'start/stop'),
+ (16, 'Alert: system failure detected'),
+ (17, 'system failure!'),
+ (18, 'system-failure'),
+ (19, 'system failure2023'),
+ (20, 'critical error: system failure'),
+ (21, 'critical failure detected'),
+ (22, 'critical issue'),
+ (23, 'failure imminent'),
+ (24, 'Warning: high temperature'),
+ (25, 'WARNING: system overload'),
+ (26, 'warned'),
+ (27, 'warnings');
+
+-- Test complete word matching for 'error'
+-- Expect:
+-- 1|An error occurred!|true
+-- 2|Critical error: system failure|true
+-- 3|error-prone|true
+-- 4|errors|false
+-- 5|error123|false
+-- 6|errorLogs|false
+SELECT `id`, `log_message`, `log_message` @@ 'error' as `matches_error` FROM logs WHERE `id` <= 6 ORDER BY `id`;
+
+
+-- Test complete word matching for 'v1.0'
+-- Expect:
+-- 7|Version v1.0 released|true
+-- 8|v1.0!|true
+-- 9|v1.0a|false
+-- 10|v1.0beta|false
+SELECT `id`, `log_message`, `log_message` @@ 'v1.0' as `matches_version` FROM logs WHERE `id` BETWEEN 7 AND 10 ORDER BY `id`;
+
+-- Test complete word matching for '/start'
+-- Expect:
+-- 11|GET /app/start|true
+-- 12|Command: /start-prosess|true
+-- 13|Command: /start|true
+-- 14|start|false
+-- 15|start/stop|false
+SELECT `id`, `log_message`, `log_message` @@ '/start' as `matches_start` FROM logs WHERE `id` BETWEEN 11 AND 15 ORDER BY `id`;
+
+-- Test phrase matching for 'system failure'
+-- Expect:
+-- 16|Alert: system failure detected|true
+-- 17|system failure!|true
+-- 18|system-failure|false
+-- 19|system failure2023|false
+SELECT `id`, `log_message`, `log_message` @@ 'system failure' as `matches_phrase` FROM logs WHERE `id` BETWEEN 16 AND 19 ORDER BY `id`;
+
+
+-- Test multi-word matching using AND
+-- Expect:
+-- 20|critical error: system failure|true|true|true
+-- 21|critical failure detected|true|true|true
+-- 22|critical issue|true|false|false
+-- 23|failure imminent|false|true|false
+SELECT `id`, `log_message`,
+ `log_message` @@ 'critical' as `matches_critical`,
+ `log_message` @@ 'failure' as `matches_failure`,
+ `log_message` @@ 'critical' AND `log_message` @@ 'failure' as `matches_both`
+FROM logs WHERE `id` BETWEEN 20 AND 23 ORDER BY `id`;
+
+-- Test case-insensitive matching using lower()
+-- Expect:
+-- 24|Warning: high temperature|true
+-- 25|WARNING: system overload|true
+-- 26|warned|false
+-- 27|warnings|false
+SELECT `id`, `log_message`, lower(`log_message`) @@ 'warning' as `matches_warning` FROM logs WHERE `id` >= 24 ORDER BY `id`;
+
+DROP TABLE logs;
diff --git a/tests/cases/standalone/common/tql-explain-analyze/explain.result b/tests/cases/standalone/common/tql-explain-analyze/explain.result
index 8b4952ed3d..200ec5c814 100644
--- a/tests/cases/standalone/common/tql-explain-analyze/explain.result
+++ b/tests/cases/standalone/common/tql-explain-analyze/explain.result
@@ -80,6 +80,7 @@ TQL EXPLAIN VERBOSE (0, 10, '5s') test;
|_|_TableScan: test_|
| logical_plan after count_wildcard_to_time_index_rule_| SAME TEXT AS ABOVE_|
| logical_plan after StringNormalizationRule_| SAME TEXT AS ABOVE_|
+| logical_plan after TranscribeAtatRule_| SAME TEXT AS ABOVE_|
| logical_plan after inline_table_scan_| SAME TEXT AS ABOVE_|
| logical_plan after expand_wildcard_rule_| SAME TEXT AS ABOVE_|
| logical_plan after resolve_grouping_function_| SAME TEXT AS ABOVE_|
From 799c7cbfa97721ca6e3e16d29df65b95bb453b58 Mon Sep 17 00:00:00 2001
From: "Lei, HUANG" <6406592+v0y4g3r@users.noreply.github.com>
Date: Tue, 15 Apr 2025 22:11:50 +0800
Subject: [PATCH 25/82] feat(mito): bulk insert request handling on datanode
(#5831)
* wip: implement basic request handling
* feat/bulk-insert:
### Add Error Handling and Enhance Bulk Insert Functionality
- **Error Handling**: Introduced a new error variant `ConvertDataType` in `error.rs` to handle conversion failures from `ConcreteDataType` to `ColumnDataType`.
- **Bulk Insert Enhancements**:
- Updated `WorkerRequest::BulkInserts` in `request.rs` to include metadata and sender.
- Implemented `handle_bulk_inserts` in `worker.rs` to process bulk insert requests with region metadata.
- Added functions `region_metadata_to_column_schema` and `record_batch_to_rows` in `handle_bulk_insert.rs` for schema conversion and row processing.
- **API Changes**: Modified `RegionBulkInsertsRequest` in `region_request.rs` to include `region_id`.
Files affected: `error.rs`, `request.rs`, `worker.rs`, `handle_bulk_insert.rs`, `region_request.rs`.
* feat/bulk-insert:
**Enhance Error Handling and Add Unit Tests**
- Improved error handling in the `record_batch_to_rows` function within `handle_bulk_insert.rs` by returning `Result` and propagating errors with `context`.
- Added unit tests for `region_metadata_to_column_schema` and `record_batch_to_rows` functions in `handle_bulk_insert.rs` to ensure correct functionality and error handling.
* chore: update proto version
* feat/bulk-insert:
- **Refactor Error Handling**: Updated the `ConvertDataType` error handling in `error.rs`.
- **Improve Logging and Error Reporting**: Enhanced logging and error reporting in `worker.rs` by adding error messages for missing region metadata.
- **Add New Error Type**: Introduced `DecodeArrowIpc` error in `metadata.rs` to handle Arrow IPC decoding failures.
- **Handle Arrow IPC Decoding**: Updated `region_request.rs` to handle Arrow IPC decoding errors using the new `DecodeArrowIpc` error type.
* chore: update proto version
* feat/bulk-insert:
Refactor `handle_bulk_insert.rs` to simplify row construction
- Removed the mutable `current_row` vector and refactored the `row_at` function to return a new vector directly.
- Updated `record_batch_to_rows` to utilize the refactored `row_at` function for constructing rows.
* feat/bulk-insert:
### Commit Summary
**Enhancements in Region Server Request Handling**
- Updated `region_server.rs` to include `RegionRequest::BulkInserts(_)` in the `RegionChange::Ingest` category, improving the handling of bulk insert operations.
- Refined the categorization of region requests to ensure accurate mapping to `RegionChange` actions.
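
To make the row-construction path easier to picture, here is a small standalone sketch of the column-major to row-major walk that `row_at` and `record_batch_to_rows` are described as doing. The `Value` enum and both function signatures are simplified stand-ins for illustration, not the actual mito2 code, which converts an Arrow `RecordBatch` into greptime-proto row types using the region metadata; only the arrow-rs calls (`RecordBatch`, `Int64Array`, `StringArray`) are real APIs.

```rust
use arrow::array::{Array, ArrayRef, Int64Array, StringArray};
use arrow::datatypes::DataType;
use arrow::record_batch::RecordBatch;

/// Simplified stand-in for the row values used on the write path.
#[derive(Debug, PartialEq)]
enum Value {
    Int64(i64),
    String(String),
    Null,
}

/// Build one row by visiting every column at `row_idx`, returning a fresh
/// vector per row (mirroring the refactored `row_at` described above).
fn row_at(batch: &RecordBatch, row_idx: usize) -> Vec<Value> {
    (0..batch.num_columns())
        .map(|col_idx| {
            let array = batch.column(col_idx);
            if array.is_null(row_idx) {
                return Value::Null;
            }
            match array.data_type() {
                DataType::Int64 => {
                    let a = array.as_any().downcast_ref::<Int64Array>().unwrap();
                    Value::Int64(a.value(row_idx))
                }
                DataType::Utf8 => {
                    let a = array.as_any().downcast_ref::<StringArray>().unwrap();
                    Value::String(a.value(row_idx).to_string())
                }
                other => unimplemented!("type not covered in this sketch: {other:?}"),
            }
        })
        .collect()
}

/// Turn a whole record batch into rows, one `Vec<Value>` per row.
fn record_batch_to_rows(batch: &RecordBatch) -> Vec<Vec<Value>> {
    (0..batch.num_rows()).map(|i| row_at(batch, i)).collect()
}

fn main() {
    use std::sync::Arc;

    use arrow::datatypes::{Field, Schema};

    let schema = Arc::new(Schema::new(vec![
        Field::new("id", DataType::Int64, false),
        Field::new("msg", DataType::Utf8, true),
    ]));
    let batch = RecordBatch::try_new(
        schema,
        vec![
            Arc::new(Int64Array::from(vec![1, 2])) as ArrayRef,
            Arc::new(StringArray::from(vec![Some("an error occurred"), None])),
        ],
    )
    .unwrap();

    let rows = record_batch_to_rows(&batch);
    assert_eq!(rows.len(), 2);
    assert_eq!(rows[1][1], Value::Null);
}
```
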
---
Cargo.lock | 2 +-
Cargo.toml | 2 +-
src/datanode/src/region_server.rs | 4 +-
src/metric-engine/src/engine.rs | 4 +
src/mito2/src/error.rs | 12 +
src/mito2/src/request.rs | 18 +-
src/mito2/src/worker.rs | 25 ++-
src/mito2/src/worker/handle_bulk_insert.rs | 247 +++++++++++++++++++++
src/mito2/src/worker/handle_write.rs | 2 +-
src/store-api/src/lib.rs | 4 +-
src/store-api/src/metadata.rs | 9 +
src/store-api/src/region_request.rs | 76 ++++++-
12 files changed, 387 insertions(+), 18 deletions(-)
create mode 100644 src/mito2/src/worker/handle_bulk_insert.rs
diff --git a/Cargo.lock b/Cargo.lock
index 2035b5090c..62d14792ad 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -4722,7 +4722,7 @@ dependencies = [
[[package]]
name = "greptime-proto"
version = "0.1.0"
-source = "git+https://github.com/GreptimeTeam/greptime-proto.git?rev=dd4a1996982534636734674db66e44464b0c0d83#dd4a1996982534636734674db66e44464b0c0d83"
+source = "git+https://github.com/GreptimeTeam/greptime-proto.git?rev=583daa3fbbbe39c90b7b92d13646bc3291d9c941#583daa3fbbbe39c90b7b92d13646bc3291d9c941"
dependencies = [
"prost 0.13.5",
"serde",
diff --git a/Cargo.toml b/Cargo.toml
index 05835d88f3..6a0e59c0a1 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -129,7 +129,7 @@ etcd-client = "0.14"
fst = "0.4.7"
futures = "0.3"
futures-util = "0.3"
-greptime-proto = { git = "https://github.com/GreptimeTeam/greptime-proto.git", rev = "dd4a1996982534636734674db66e44464b0c0d83" }
+greptime-proto = { git = "https://github.com/GreptimeTeam/greptime-proto.git", rev = "583daa3fbbbe39c90b7b92d13646bc3291d9c941" }
hex = "0.4"
http = "1"
humantime = "2.1"
diff --git a/src/datanode/src/region_server.rs b/src/datanode/src/region_server.rs
index bff28c109b..072c1682cc 100644
--- a/src/datanode/src/region_server.rs
+++ b/src/datanode/src/region_server.rs
@@ -902,7 +902,9 @@ impl RegionServerInner {
RegionChange::Register(attribute)
}
RegionRequest::Close(_) | RegionRequest::Drop(_) => RegionChange::Deregisters,
- RegionRequest::Put(_) | RegionRequest::Delete(_) => RegionChange::Ingest,
+ RegionRequest::Put(_) | RegionRequest::Delete(_) | RegionRequest::BulkInserts(_) => {
+ RegionChange::Ingest
+ }
RegionRequest::Alter(_)
| RegionRequest::Flush(_)
| RegionRequest::Compact(_)
diff --git a/src/metric-engine/src/engine.rs b/src/metric-engine/src/engine.rs
index 509438b4b2..71cb843ab1 100644
--- a/src/metric-engine/src/engine.rs
+++ b/src/metric-engine/src/engine.rs
@@ -221,6 +221,10 @@ impl RegionEngine for MetricEngine {
}
}
RegionRequest::Catchup(req) => self.inner.catchup_region(region_id, req).await,
+ RegionRequest::BulkInserts(_) => {
+ // todo(hl): find a way to support bulk inserts in metric engine.
+ UnsupportedRegionRequestSnafu { request }.fail()
+ }
};
result.map_err(BoxedError::new).map(|rows| RegionResponse {
diff --git a/src/mito2/src/error.rs b/src/mito2/src/error.rs
index 9c9c78f07e..0ac04e8b3e 100644
--- a/src/mito2/src/error.rs
+++ b/src/mito2/src/error.rs
@@ -1021,6 +1021,17 @@ pub enum Error {
#[snafu(implicit)]
location: Location,
},
+
+ #[snafu(display(
+ "Failed to convert ConcreteDataType to ColumnDataType: {:?}",
+ data_type
+ ))]
+ ConvertDataType {
+ data_type: ConcreteDataType,
+ source: api::error::Error,
+ #[snafu(implicit)]
+ location: Location,
+ },
}
 pub type Result<T> = std::result::Result<T, Error>;
@@ -1172,6 +1183,7 @@ impl ErrorExt for Error {
ManualCompactionOverride {} => StatusCode::Cancelled,
IncompatibleWalProviderChange { .. } => StatusCode::InvalidArguments,
+ ConvertDataType { .. } => StatusCode::Internal,
}
}
diff --git a/src/mito2/src/request.rs b/src/mito2/src/request.rs
index 33a8f13f07..5331ba6fdc 100644
--- a/src/mito2/src/request.rs
+++ b/src/mito2/src/request.rs
@@ -35,9 +35,9 @@ use store_api::manifest::ManifestVersion;
use store_api::metadata::{ColumnMetadata, RegionMetadata, RegionMetadataRef};
use store_api::region_engine::{SetRegionRoleStateResponse, SettableRegionRoleState};
use store_api::region_request::{
- AffectedRows, RegionAlterRequest, RegionCatchupRequest, RegionCloseRequest,
- RegionCompactRequest, RegionCreateRequest, RegionFlushRequest, RegionOpenRequest,
- RegionRequest, RegionTruncateRequest,
+ AffectedRows, RegionAlterRequest, RegionBulkInsertsRequest, RegionCatchupRequest,
+ RegionCloseRequest, RegionCompactRequest, RegionCreateRequest, RegionFlushRequest,
+ RegionOpenRequest, RegionRequest, RegionTruncateRequest,
};
use store_api::storage::{RegionId, SequenceNumber};
use tokio::sync::oneshot::{self, Receiver, Sender};
@@ -569,6 +569,13 @@ pub(crate) enum WorkerRequest {
/// Keep the manifest of a region up to date.
SyncRegion(RegionSyncRequest),
+
+ /// Bulk inserts request and region metadata.
+ BulkInserts {
+ metadata: Option