Compare commits

..

1 Commit

Author: luofucong
SHA1: 48c5a9cdaf
Message: impl json_get_string with new json type
Signed-off-by: luofucong <luofc@foxmail.com>
Date: 2025-12-26 19:30:34 +08:00
88 changed files with 1761 additions and 7276 deletions

.gitignore (vendored): 3 changed lines
View File

@@ -67,6 +67,3 @@ greptimedb_data
# Claude code
CLAUDE.md
-# AGENTS.md
-AGENTS.md

View File

@@ -104,14 +104,14 @@ All commit messages SHOULD adhere to the [Conventional Commits specification](ht
## AI-Assisted contributions
-We have the following policy for AI-assisted PRs:
We has the following policy for AI-assisted PRs:
- The PR author should **understand the core ideas** behind the implementation **end-to-end**, and be able to justify the design and code during review.
- **Calls out unknowns and assumptions**. It's okay to not fully understand some bits of AI generated code. You should comment on these cases and point them out to reviewers so that they can use their knowledge of the codebase to clear up any concerns. For example, you might comment "calling this function here seems to work but I'm not familiar with how it works internally, I wonder if there's a race condition if it is called concurrently".
### Why fully AI-generated PRs without understanding are not helpful
-Today, AI tools cannot reliably make complex changes to GreptimeDB on their own, which is why we rely on pull requests and code review.
Today, AI tools cannot reliably make complex changes to DataFusion on their own, which is why we rely on pull requests and code review.
The purposes of code review are:

Cargo.lock (generated): 5 changed lines
View File

@@ -2190,6 +2190,7 @@ dependencies = [
"approx 0.5.1", "approx 0.5.1",
"arc-swap", "arc-swap",
"arrow", "arrow",
"arrow-cast",
"arrow-schema", "arrow-schema",
"async-trait", "async-trait",
"bincode", "bincode",
@@ -2220,6 +2221,7 @@ dependencies = [
"h3o", "h3o",
"hyperloglogplus", "hyperloglogplus",
"jsonb", "jsonb",
"jsonpath-rust 0.7.5",
"memchr", "memchr",
"mito-codec", "mito-codec",
"nalgebra", "nalgebra",
@@ -5464,7 +5466,7 @@ dependencies = [
[[package]]
name = "greptime-proto"
version = "0.1.0"
-source = "git+https://github.com/GreptimeTeam/greptime-proto.git?rev=520fa524f9d590752ea327683e82ffd65721b27c#520fa524f9d590752ea327683e82ffd65721b27c"
source = "git+https://github.com/GreptimeTeam/greptime-proto.git?rev=173efe5ec62722089db7c531c0b0d470a072b915#173efe5ec62722089db7c531c0b0d470a072b915"
dependencies = [
"prost 0.13.5",
"prost-types 0.13.5",
@@ -7622,7 +7624,6 @@ dependencies = [
"async-trait", "async-trait",
"base64 0.22.1", "base64 0.22.1",
"bytes", "bytes",
"chrono",
"common-base", "common-base",
"common-error", "common-error",
"common-macro", "common-macro",

View File

@@ -103,6 +103,7 @@ aquamarine = "0.6"
arrow = { version = "56.2", features = ["prettyprint"] }
arrow-array = { version = "56.2", default-features = false, features = ["chrono-tz"] }
arrow-buffer = "56.2"
arrow-cast = "56.2"
arrow-flight = "56.2"
arrow-ipc = { version = "56.2", default-features = false, features = ["lz4", "zstd"] }
arrow-schema = { version = "56.2", features = ["serde"] }
@@ -150,7 +151,7 @@ etcd-client = { version = "0.16.1", features = [
fst = "0.4.7"
futures = "0.3"
futures-util = "0.3"
-greptime-proto = { git = "https://github.com/GreptimeTeam/greptime-proto.git", rev = "520fa524f9d590752ea327683e82ffd65721b27c" }
greptime-proto = { git = "https://github.com/GreptimeTeam/greptime-proto.git", rev = "173efe5ec62722089db7c531c0b0d470a072b915" }
hex = "0.4"
http = "1"
humantime = "2.1"

View File

@@ -14,7 +14,6 @@ BUILDX_BUILDER_NAME ?= gtbuilder
BASE_IMAGE ?= ubuntu
RUST_TOOLCHAIN ?= $(shell cat rust-toolchain.toml | grep channel | cut -d'"' -f2)
CARGO_REGISTRY_CACHE ?= ${HOME}/.cargo/registry
-CARGO_GIT_CACHE ?= ${HOME}/.cargo/git
ARCH := $(shell uname -m | sed 's/x86_64/amd64/' | sed 's/aarch64/arm64/')
OUTPUT_DIR := $(shell if [ "$(RELEASE)" = "true" ]; then echo "release"; elif [ ! -z "$(CARGO_PROFILE)" ]; then echo "$(CARGO_PROFILE)" ; else echo "debug"; fi)
SQLNESS_OPTS ?=
@@ -87,7 +86,7 @@ build: ## Build debug version greptime.
build-by-dev-builder: ## Build greptime by dev-builder.
docker run --network=host \
${ASSEMBLED_EXTRA_BUILD_ENV} \
-v ${PWD}:/greptimedb -v ${CARGO_REGISTRY_CACHE}:/root/.cargo/registry -v ${CARGO_GIT_CACHE}:/root/.cargo/git \
-v ${PWD}:/greptimedb -v ${CARGO_REGISTRY_CACHE}:/root/.cargo/registry \
-w /greptimedb ${IMAGE_REGISTRY}/${IMAGE_NAMESPACE}/dev-builder-${BASE_IMAGE}:${DEV_BUILDER_IMAGE_TAG} \
make build \
CARGO_EXTENSION="${CARGO_EXTENSION}" \
@@ -101,7 +100,7 @@ build-by-dev-builder: ## Build greptime by dev-builder.
.PHONY: build-android-bin
build-android-bin: ## Build greptime binary for android.
docker run --network=host \
-v ${PWD}:/greptimedb -v ${CARGO_REGISTRY_CACHE}:/root/.cargo/registry -v ${CARGO_GIT_CACHE}:/root/.cargo/git \
-v ${PWD}:/greptimedb -v ${CARGO_REGISTRY_CACHE}:/root/.cargo/registry \
-w /greptimedb ${IMAGE_REGISTRY}/${IMAGE_NAMESPACE}/dev-builder-android:${DEV_BUILDER_IMAGE_TAG} \
make build \
CARGO_EXTENSION="ndk --platform 23 -t aarch64-linux-android" \
@@ -207,7 +206,7 @@ fix-udeps: ## Remove unused dependencies automatically.
@cargo udeps --workspace --all-targets --output json > udeps-report.json || true
@echo "Removing unused dependencies..."
@python3 scripts/fix-udeps.py udeps-report.json

.PHONY: fmt-check
fmt-check: ## Check code format.
cargo fmt --all -- --check
@@ -225,7 +224,7 @@ stop-etcd: ## Stop single node etcd for testing purpose.
.PHONY: run-it-in-container
run-it-in-container: start-etcd ## Run integration tests in dev-builder.
docker run --network=host \
-v ${PWD}:/greptimedb -v ${CARGO_REGISTRY_CACHE}:/root/.cargo/registry -v ${CARGO_GIT_CACHE}:/root/.cargo/git -v /tmp:/tmp \
-v ${PWD}:/greptimedb -v ${CARGO_REGISTRY_CACHE}:/root/.cargo/registry -v /tmp:/tmp \
-w /greptimedb ${IMAGE_REGISTRY}/${IMAGE_NAMESPACE}/dev-builder-${BASE_IMAGE}:${DEV_BUILDER_IMAGE_TAG} \
make test sqlness-test BUILD_JOBS=${BUILD_JOBS}

View File

@@ -17,6 +17,7 @@ ahash.workspace = true
api.workspace = true
arc-swap = "1.0"
arrow.workspace = true
arrow-cast.workspace = true
arrow-schema.workspace = true
async-trait.workspace = true
bincode = "=1.3.3"
@@ -46,6 +47,7 @@ geohash = { version = "0.13", optional = true }
h3o = { version = "0.6", optional = true }
hyperloglogplus = "0.4"
jsonb.workspace = true
jsonpath-rust = "0.7.5"
memchr = "2.7"
mito-codec.workspace = true
nalgebra.workspace = true

View File

@@ -13,17 +13,24 @@
// limitations under the License.

use std::fmt::{self, Display};
use std::str::FromStr;
use std::sync::Arc;

use arrow::array::{ArrayRef, BinaryViewArray, StringViewArray, StructArray};
use arrow::compute;
-use datafusion_common::DataFusionError;
use arrow::datatypes::{Float64Type, Int64Type, UInt64Type};
use datafusion_common::arrow::array::{
Array, AsArray, BinaryViewBuilder, BooleanBuilder, Float64Builder, Int64Builder,
StringViewBuilder,
};
use datafusion_common::arrow::datatypes::DataType;
use datafusion_common::{DataFusionError, Result};
use datafusion_expr::type_coercion::aggregates::STRINGS;
-use datafusion_expr::{ColumnarValue, ScalarFunctionArgs, Signature};
use datafusion_expr::{ColumnarValue, ScalarFunctionArgs, Signature, Volatility};
use datatypes::arrow_array::string_array_value_at_index;
use datatypes::json::JsonStructureSettings;
use jsonpath_rust::JsonPath;
use serde_json::Value;

use crate::function::{Function, extract_args};
use crate::helper;
@@ -158,11 +165,7 @@ impl JsonGetString {
impl Default for JsonGetString {
fn default() -> Self {
Self {
-// TODO(LFC): Use a more clear type here instead of "Binary" for Json input, once we have a "Json" type.
-signature: helper::one_of_sigs2(
-vec![DataType::Binary, DataType::BinaryView],
-vec![DataType::Utf8, DataType::Utf8View],
-),
signature: Signature::any(2, Volatility::Immutable),
}
}
}
@@ -172,7 +175,7 @@ impl Function for JsonGetString {
Self::NAME
}

-fn return_type(&self, _: &[DataType]) -> datafusion_common::Result<DataType> {
fn return_type(&self, _: &[DataType]) -> Result<DataType> {
Ok(DataType::Utf8View)
}
@@ -180,33 +183,203 @@ impl Function for JsonGetString {
&self.signature
}

-fn invoke_with_args(
-&self,
-args: ScalarFunctionArgs,
-) -> datafusion_common::Result<ColumnarValue> {
-let [arg0, arg1] = extract_args(self.name(), &args)?;
-let arg0 = compute::cast(&arg0, &DataType::BinaryView)?;
-let jsons = arg0.as_binary_view();
-let arg1 = compute::cast(&arg1, &DataType::Utf8View)?;
-let paths = arg1.as_string_view();
-let size = jsons.len();
-let mut builder = StringViewBuilder::with_capacity(size);
-for i in 0..size {
-let json = jsons.is_valid(i).then(|| jsons.value(i));
-let path = paths.is_valid(i).then(|| paths.value(i));
-let result = match (json, path) {
-(Some(json), Some(path)) => {
-get_json_by_path(json, path).and_then(|json| jsonb::to_str(&json).ok())
-}
-_ => None,
-};
-builder.append_option(result);
-}
-Ok(ColumnarValue::Array(Arc::new(builder.finish())))
-}
-}
fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
let [arg0, arg1] = extract_args(self.name(), &args)?;
let arg1 = compute::cast(&arg1, &DataType::Utf8View)?;
let paths = arg1.as_string_view();
let result = match arg0.data_type() {
DataType::Binary | DataType::LargeBinary | DataType::BinaryView => {
let arg0 = compute::cast(&arg0, &DataType::BinaryView)?;
let jsons = arg0.as_binary_view();
jsonb_get_string(jsons, paths)?
}
DataType::Struct(_) => {
let jsons = arg0.as_struct();
json_struct_get_string(jsons, paths)?
}
_ => {
return Err(DataFusionError::Execution(format!(
"{} not supported argument type {}",
Self::NAME,
arg0.data_type(),
)));
}
};
Ok(ColumnarValue::Array(result))
}
}

fn jsonb_get_string(jsons: &BinaryViewArray, paths: &StringViewArray) -> Result<ArrayRef> {
let size = jsons.len();
let mut builder = StringViewBuilder::with_capacity(size);
for i in 0..size {
let json = jsons.is_valid(i).then(|| jsons.value(i));
let path = paths.is_valid(i).then(|| paths.value(i));
let result = match (json, path) {
(Some(json), Some(path)) => {
get_json_by_path(json, path).and_then(|json| jsonb::to_str(&json).ok())
}
_ => None,
};
builder.append_option(result);
}
Ok(Arc::new(builder.finish()))
}
fn json_struct_get_string(jsons: &StructArray, paths: &StringViewArray) -> Result<ArrayRef> {
let size = jsons.len();
let mut builder = StringViewBuilder::with_capacity(size);
for i in 0..size {
if jsons.is_null(i) || paths.is_null(i) {
builder.append_null();
continue;
}
let path = paths.value(i);
// naively assume the JSON path is our kind of indexing to the field, by removing its "root"
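// e.g. the JSON path "$.payload.code" is looked up directly as the struct column named "payload.code"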
let field_path = path.replace("$.", "");
let column = jsons.column_by_name(&field_path);
if let Some(column) = column {
if let Some(v) = string_array_value_at_index(column, i) {
builder.append_value(v);
} else {
builder.append_value(arrow_cast::display::array_value_to_string(column, i)?);
}
} else {
let Some(raw) = jsons
.column_by_name(JsonStructureSettings::RAW_FIELD)
.and_then(|x| string_array_value_at_index(x, i))
else {
builder.append_null();
continue;
};
let path: JsonPath<Value> = JsonPath::try_from(path).map_err(|e| {
DataFusionError::Execution(format!("{path} is not a valid JSON path: {e}"))
})?;
// the wanted field is not retrievable from the JSON struct columns directly, we have
// to combine everything (columns and the "_raw") into a complete JSON value to find it
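// (typed columns such as "payload.code" are merged back into the JSON parsed from the "_raw" field before the path lookup)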
let value = json_struct_to_value(raw, jsons, i)?;
match path.find(&value) {
Value::Null => builder.append_null(),
Value::Array(values) => match values.as_slice() {
[] => builder.append_null(),
[x] => {
if let Some(s) = x.as_str() {
builder.append_value(s)
} else {
builder.append_value(x.to_string())
}
}
x => builder.append_value(
x.iter()
.map(|v| v.to_string())
.collect::<Vec<_>>()
.join(", "),
),
},
// Safety: guarded by the returns of `path.find` as documented
_ => unreachable!(),
}
}
}
Ok(Arc::new(builder.finish()))
}
fn json_struct_to_value(raw: &str, jsons: &StructArray, i: usize) -> Result<Value> {
let Ok(mut json) = Value::from_str(raw) else {
return Err(DataFusionError::Internal(format!(
"inner field '{}' is not a valid JSON string",
JsonStructureSettings::RAW_FIELD
)));
};
for (column_name, column) in jsons.column_names().into_iter().zip(jsons.columns()) {
if column_name == JsonStructureSettings::RAW_FIELD {
continue;
}
let (json_pointer, field) = if let Some((json_object, field)) = column_name.rsplit_once(".")
{
let json_pointer = format!("/{}", json_object.replace(".", "/"));
(json_pointer, field)
} else {
("".to_string(), column_name)
};
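// e.g. the column name "payload.result.time_cost" yields the JSON pointer "/payload/result" and the field "time_cost"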
let Some(json_object) = json
.pointer_mut(&json_pointer)
.and_then(|x| x.as_object_mut())
else {
return Err(DataFusionError::Internal(format!(
"value at JSON pointer '{}' is not an object",
json_pointer
)));
};
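// Insert the column value at row $i (if valid) into $json_object under $field as a serde_json::Value.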
macro_rules! insert {
($column: ident, $i: ident, $json_object: ident, $field: ident) => {{
if let Some(value) = $column
.is_valid($i)
.then(|| serde_json::Value::from($column.value($i)))
{
$json_object.insert($field.to_string(), value);
}
}};
}
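// Only scalar column types are converted here; any other data type falls through to the NotImplemented error below.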
match column.data_type() {
// boolean => Value::Bool
DataType::Boolean => {
let column = column.as_boolean();
insert!(column, i, json_object, field);
}
// int => Value::Number
DataType::Int64 => {
let column = column.as_primitive::<Int64Type>();
insert!(column, i, json_object, field);
}
DataType::UInt64 => {
let column = column.as_primitive::<UInt64Type>();
insert!(column, i, json_object, field);
}
DataType::Float64 => {
let column = column.as_primitive::<Float64Type>();
insert!(column, i, json_object, field);
}
// string => Value::String
DataType::Utf8 => {
let column = column.as_string::<i32>();
insert!(column, i, json_object, field);
}
DataType::LargeUtf8 => {
let column = column.as_string::<i64>();
insert!(column, i, json_object, field);
}
DataType::Utf8View => {
let column = column.as_string_view();
insert!(column, i, json_object, field);
}
// other => Value::Array and Value::Object
_ => {
return Err(DataFusionError::NotImplemented(format!(
"{} is not yet supported to be executed with field {} of datatype {}",
JsonGetString::NAME,
column_name,
column.data_type()
)));
}
}
}
Ok(json)
}

impl Display for JsonGetString {
@@ -296,11 +469,13 @@ impl Display for JsonGetObject {
mod tests {
use std::sync::Arc;

use arrow::array::{Float64Array, Int64Array, StructArray};
use arrow_schema::Field;
use datafusion_common::ScalarValue;
use datafusion_common::arrow::array::{BinaryArray, BinaryViewArray, StringArray};
use datafusion_common::arrow::datatypes::{Float64Type, Int64Type};
use datatypes::types::parse_string_to_jsonb;
use serde_json::json;

use super::*;
@@ -474,42 +649,123 @@ mod tests {
r#"{"a": "d", "b": {"c": "e"}, "c": "f"}"#, r#"{"a": "d", "b": {"c": "e"}, "c": "f"}"#,
r#"{"a": "g", "b": "h", "c": {"a": "g"}}"#, r#"{"a": "g", "b": "h", "c": {"a": "g"}}"#,
]; ];
-let paths = vec!["$.a.b", "$.a", ""];
-let results = [Some("a"), Some("d"), None];
-let jsonbs = json_strings
// complete JSON is:
// {
// "kind": "foo",
// "payload": {
// "code": 404,
// "success": false,
// "result": {
// "error": "not found",
// "time_cost": 1.234
// }
// }
// }
let json_struct: ArrayRef = Arc::new(StructArray::new(
vec![
Field::new("kind", DataType::Utf8, true),
Field::new("payload.code", DataType::Int64, true),
Field::new("payload.result.time_cost", DataType::Float64, true),
Field::new(JsonStructureSettings::RAW_FIELD, DataType::Utf8View, true),
]
.into(),
vec![
Arc::new(StringArray::from_iter([Some("foo")])) as ArrayRef,
Arc::new(Int64Array::from_iter([Some(404)])),
Arc::new(Float64Array::from_iter([Some(1.234)])),
Arc::new(StringViewArray::from_iter([Some(
json! ({
"payload": {
"success": false,
"result": {
"error": "not found"
}
}
})
.to_string(),
)])),
],
None,
));
let paths = vec![
"$.a.b",
"$.a",
"",
"$.kind",
"$.payload.code",
"$.payload.result.time_cost",
"$.payload",
"$.payload.success",
"$.payload.result",
"$.payload.result.error",
"$.payload.result.not-exists",
"$.payload.not-exists",
"$.not-exists",
"$",
];
let expects = [
Some("a"),
Some("d"),
None,
Some("foo"),
Some("404"),
Some("1.234"),
Some(
r#"{"code":404,"result":{"error":"not found","time_cost":1.234},"success":false}"#,
),
Some("false"),
Some(r#"{"error":"not found","time_cost":1.234}"#),
Some("not found"),
None,
None,
None,
Some(
r#"{"kind":"foo","payload":{"code":404,"result":{"error":"not found","time_cost":1.234},"success":false}}"#,
),
];
let mut jsons = json_strings
.iter()
.map(|s| {
let value = jsonb::parse_value(s.as_bytes()).unwrap();
-value.to_vec()
Arc::new(BinaryArray::from_iter_values([value.to_vec()])) as ArrayRef
})
.collect::<Vec<_>>();
let json_struct_arrays =
std::iter::repeat_n(json_struct, expects.len() - jsons.len()).collect::<Vec<_>>();
jsons.extend(json_struct_arrays);
-let args = ScalarFunctionArgs {
-args: vec![
-ColumnarValue::Array(Arc::new(BinaryArray::from_iter_values(jsonbs))),
-ColumnarValue::Array(Arc::new(StringArray::from_iter_values(paths))),
-],
-arg_fields: vec![],
-number_rows: 3,
-return_field: Arc::new(Field::new("x", DataType::Utf8View, false)),
-config_options: Arc::new(Default::default()),
-};
-let result = json_get_string
-.invoke_with_args(args)
-.and_then(|x| x.to_array(3))
-.unwrap();
-let vector = result.as_string_view();
-assert_eq!(3, vector.len());
-for (i, gt) in results.iter().enumerate() {
-let result = vector.is_valid(i).then(|| vector.value(i));
-assert_eq!(*gt, result);
-}
for i in 0..jsons.len() {
let json = &jsons[i];
let path = paths[i];
let expect = expects[i];

let args = ScalarFunctionArgs {
args: vec![
ColumnarValue::Array(json.clone()),
ColumnarValue::Scalar(path.into()),
],
arg_fields: vec![],
number_rows: 1,
return_field: Arc::new(Field::new("x", DataType::Utf8View, false)),
config_options: Arc::new(Default::default()),
};
let result = json_get_string
.invoke_with_args(args)
.and_then(|x| x.to_array(1))
.unwrap();
let result = result.as_string_view();
assert_eq!(1, result.len());
let actual = result.is_valid(0).then(|| result.value(0));
assert_eq!(actual, expect);
}
}

#[test]
-fn test_json_get_object() -> datafusion_common::Result<()> {
fn test_json_get_object() -> Result<()> {
let udf = JsonGetObject::default();
assert_eq!("json_get_object", udf.name());
assert_eq!(

View File

@@ -14,31 +14,13 @@
//! String scalar functions

-mod elt;
-mod field;
-mod format;
-mod insert;
-mod locate;
mod regexp_extract;
-mod space;

-pub(crate) use elt::EltFunction;
-pub(crate) use field::FieldFunction;
-pub(crate) use format::FormatFunction;
-pub(crate) use insert::InsertFunction;
-pub(crate) use locate::LocateFunction;
pub(crate) use regexp_extract::RegexpExtractFunction;
-pub(crate) use space::SpaceFunction;

use crate::function_registry::FunctionRegistry;

/// Register all string functions
pub fn register_string_functions(registry: &FunctionRegistry) {
-EltFunction::register(registry);
-FieldFunction::register(registry);
-FormatFunction::register(registry);
-InsertFunction::register(registry);
-LocateFunction::register(registry);
RegexpExtractFunction::register(registry);
-SpaceFunction::register(registry);
}

View File

@@ -1,252 +0,0 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//! MySQL-compatible ELT function implementation.
//!
//! ELT(N, str1, str2, str3, ...) - Returns the Nth string from the list.
//! Returns NULL if N < 1 or N > number of strings.
use std::fmt;
use std::sync::Arc;
use datafusion_common::DataFusionError;
use datafusion_common::arrow::array::{Array, ArrayRef, AsArray, LargeStringBuilder};
use datafusion_common::arrow::compute::cast;
use datafusion_common::arrow::datatypes::DataType;
use datafusion_expr::{ColumnarValue, ScalarFunctionArgs, Signature, Volatility};
use crate::function::Function;
use crate::function_registry::FunctionRegistry;
const NAME: &str = "elt";
/// MySQL-compatible ELT function.
///
/// Syntax: ELT(N, str1, str2, str3, ...)
/// Returns the Nth string argument. N is 1-based.
/// Returns NULL if N is NULL, N < 1, or N > number of string arguments.
#[derive(Debug)]
pub struct EltFunction {
signature: Signature,
}
impl EltFunction {
pub fn register(registry: &FunctionRegistry) {
registry.register_scalar(EltFunction::default());
}
}
impl Default for EltFunction {
fn default() -> Self {
Self {
// ELT takes a variable number of arguments: (Int64, String, String, ...)
signature: Signature::variadic_any(Volatility::Immutable),
}
}
}
impl fmt::Display for EltFunction {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
write!(f, "{}", NAME.to_ascii_uppercase())
}
}
impl Function for EltFunction {
fn name(&self) -> &str {
NAME
}
fn return_type(&self, _: &[DataType]) -> datafusion_common::Result<DataType> {
Ok(DataType::LargeUtf8)
}
fn signature(&self) -> &Signature {
&self.signature
}
fn invoke_with_args(
&self,
args: ScalarFunctionArgs,
) -> datafusion_common::Result<ColumnarValue> {
if args.args.len() < 2 {
return Err(DataFusionError::Execution(
"ELT requires at least 2 arguments: ELT(N, str1, ...)".to_string(),
));
}
let arrays = ColumnarValue::values_to_arrays(&args.args)?;
let len = arrays[0].len();
let num_strings = arrays.len() - 1;
// First argument is the index (N) - try to cast to Int64
let index_array = if arrays[0].data_type() == &DataType::Null {
// All NULLs - return all NULLs
let mut builder = LargeStringBuilder::with_capacity(len, 0);
for _ in 0..len {
builder.append_null();
}
return Ok(ColumnarValue::Array(Arc::new(builder.finish())));
} else {
cast(arrays[0].as_ref(), &DataType::Int64).map_err(|e| {
DataFusionError::Execution(format!("ELT: index argument cast failed: {}", e))
})?
};
// Cast string arguments to LargeUtf8
let string_arrays: Vec<ArrayRef> = arrays[1..]
.iter()
.enumerate()
.map(|(i, arr)| {
cast(arr.as_ref(), &DataType::LargeUtf8).map_err(|e| {
DataFusionError::Execution(format!(
"ELT: string argument {} cast failed: {}",
i + 1,
e
))
})
})
.collect::<datafusion_common::Result<Vec<_>>>()?;
let mut builder = LargeStringBuilder::with_capacity(len, len * 32);
for i in 0..len {
if index_array.is_null(i) {
builder.append_null();
continue;
}
let n = index_array
.as_primitive::<datafusion_common::arrow::datatypes::Int64Type>()
.value(i);
// N is 1-based, check bounds
if n < 1 || n as usize > num_strings {
builder.append_null();
continue;
}
let str_idx = (n - 1) as usize;
let str_array = string_arrays[str_idx].as_string::<i64>();
if str_array.is_null(i) {
builder.append_null();
} else {
builder.append_value(str_array.value(i));
}
}
Ok(ColumnarValue::Array(Arc::new(builder.finish())))
}
}
#[cfg(test)]
mod tests {
use std::sync::Arc;
use datafusion_common::arrow::array::{Int64Array, StringArray};
use datafusion_common::arrow::datatypes::Field;
use datafusion_expr::ScalarFunctionArgs;
use super::*;
fn create_args(arrays: Vec<ArrayRef>) -> ScalarFunctionArgs {
let arg_fields: Vec<_> = arrays
.iter()
.enumerate()
.map(|(i, arr)| {
Arc::new(Field::new(
format!("arg_{}", i),
arr.data_type().clone(),
true,
))
})
.collect();
ScalarFunctionArgs {
args: arrays.iter().cloned().map(ColumnarValue::Array).collect(),
arg_fields,
return_field: Arc::new(Field::new("result", DataType::LargeUtf8, true)),
number_rows: arrays[0].len(),
config_options: Arc::new(datafusion_common::config::ConfigOptions::default()),
}
}
#[test]
fn test_elt_basic() {
let function = EltFunction::default();
let n = Arc::new(Int64Array::from(vec![1, 2, 3]));
let s1 = Arc::new(StringArray::from(vec!["a", "a", "a"]));
let s2 = Arc::new(StringArray::from(vec!["b", "b", "b"]));
let s3 = Arc::new(StringArray::from(vec!["c", "c", "c"]));
let args = create_args(vec![n, s1, s2, s3]);
let result = function.invoke_with_args(args).unwrap();
if let ColumnarValue::Array(array) = result {
let str_array = array.as_string::<i64>();
assert_eq!(str_array.value(0), "a");
assert_eq!(str_array.value(1), "b");
assert_eq!(str_array.value(2), "c");
} else {
panic!("Expected array result");
}
}
#[test]
fn test_elt_out_of_bounds() {
let function = EltFunction::default();
let n = Arc::new(Int64Array::from(vec![0, 4, -1]));
let s1 = Arc::new(StringArray::from(vec!["a", "a", "a"]));
let s2 = Arc::new(StringArray::from(vec!["b", "b", "b"]));
let s3 = Arc::new(StringArray::from(vec!["c", "c", "c"]));
let args = create_args(vec![n, s1, s2, s3]);
let result = function.invoke_with_args(args).unwrap();
if let ColumnarValue::Array(array) = result {
let str_array = array.as_string::<i64>();
assert!(str_array.is_null(0)); // 0 is out of bounds
assert!(str_array.is_null(1)); // 4 is out of bounds
assert!(str_array.is_null(2)); // -1 is out of bounds
} else {
panic!("Expected array result");
}
}
#[test]
fn test_elt_with_nulls() {
let function = EltFunction::default();
// Row 0: n=1, select s1="a" -> "a"
// Row 1: n=NULL -> NULL
// Row 2: n=1, select s1=NULL -> NULL
let n = Arc::new(Int64Array::from(vec![Some(1), None, Some(1)]));
let s1 = Arc::new(StringArray::from(vec![Some("a"), Some("a"), None]));
let s2 = Arc::new(StringArray::from(vec![Some("b"), Some("b"), Some("b")]));
let args = create_args(vec![n, s1, s2]);
let result = function.invoke_with_args(args).unwrap();
if let ColumnarValue::Array(array) = result {
let str_array = array.as_string::<i64>();
assert_eq!(str_array.value(0), "a");
assert!(str_array.is_null(1)); // N is NULL
assert!(str_array.is_null(2)); // Selected string is NULL
} else {
panic!("Expected array result");
}
}
}

View File

@@ -1,224 +0,0 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//! MySQL-compatible FIELD function implementation.
//!
//! FIELD(str, str1, str2, str3, ...) - Returns the 1-based index of str in the list.
//! Returns 0 if str is not found or is NULL.
use std::fmt;
use std::sync::Arc;
use datafusion_common::DataFusionError;
use datafusion_common::arrow::array::{Array, ArrayRef, AsArray, Int64Builder};
use datafusion_common::arrow::compute::cast;
use datafusion_common::arrow::datatypes::DataType;
use datafusion_expr::{ColumnarValue, ScalarFunctionArgs, Signature, Volatility};
use crate::function::Function;
use crate::function_registry::FunctionRegistry;
const NAME: &str = "field";
/// MySQL-compatible FIELD function.
///
/// Syntax: FIELD(str, str1, str2, str3, ...)
/// Returns the 1-based index of str in the argument list (str1, str2, str3, ...).
/// Returns 0 if str is not found or is NULL.
#[derive(Debug)]
pub struct FieldFunction {
signature: Signature,
}
impl FieldFunction {
pub fn register(registry: &FunctionRegistry) {
registry.register_scalar(FieldFunction::default());
}
}
impl Default for FieldFunction {
fn default() -> Self {
Self {
// FIELD takes a variable number of arguments: (String, String, String, ...)
signature: Signature::variadic_any(Volatility::Immutable),
}
}
}
impl fmt::Display for FieldFunction {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
write!(f, "{}", NAME.to_ascii_uppercase())
}
}
impl Function for FieldFunction {
fn name(&self) -> &str {
NAME
}
fn return_type(&self, _: &[DataType]) -> datafusion_common::Result<DataType> {
Ok(DataType::Int64)
}
fn signature(&self) -> &Signature {
&self.signature
}
fn invoke_with_args(
&self,
args: ScalarFunctionArgs,
) -> datafusion_common::Result<ColumnarValue> {
if args.args.len() < 2 {
return Err(DataFusionError::Execution(
"FIELD requires at least 2 arguments: FIELD(str, str1, ...)".to_string(),
));
}
let arrays = ColumnarValue::values_to_arrays(&args.args)?;
let len = arrays[0].len();
// Cast all arguments to LargeUtf8
let string_arrays: Vec<ArrayRef> = arrays
.iter()
.enumerate()
.map(|(i, arr)| {
cast(arr.as_ref(), &DataType::LargeUtf8).map_err(|e| {
DataFusionError::Execution(format!("FIELD: argument {} cast failed: {}", i, e))
})
})
.collect::<datafusion_common::Result<Vec<_>>>()?;
let search_str = string_arrays[0].as_string::<i64>();
let mut builder = Int64Builder::with_capacity(len);
for i in 0..len {
// If search string is NULL, return 0
if search_str.is_null(i) {
builder.append_value(0);
continue;
}
let needle = search_str.value(i);
let mut found_idx = 0i64;
// Search through the list (starting from index 1 in string_arrays)
for (j, str_arr) in string_arrays[1..].iter().enumerate() {
let str_array = str_arr.as_string::<i64>();
if !str_array.is_null(i) && str_array.value(i) == needle {
found_idx = (j + 1) as i64; // 1-based index
break;
}
}
builder.append_value(found_idx);
}
Ok(ColumnarValue::Array(Arc::new(builder.finish())))
}
}
#[cfg(test)]
mod tests {
use std::sync::Arc;
use datafusion_common::arrow::array::StringArray;
use datafusion_common::arrow::datatypes::Field;
use datafusion_expr::ScalarFunctionArgs;
use super::*;
fn create_args(arrays: Vec<ArrayRef>) -> ScalarFunctionArgs {
let arg_fields: Vec<_> = arrays
.iter()
.enumerate()
.map(|(i, arr)| {
Arc::new(Field::new(
format!("arg_{}", i),
arr.data_type().clone(),
true,
))
})
.collect();
ScalarFunctionArgs {
args: arrays.iter().cloned().map(ColumnarValue::Array).collect(),
arg_fields,
return_field: Arc::new(Field::new("result", DataType::Int64, true)),
number_rows: arrays[0].len(),
config_options: Arc::new(datafusion_common::config::ConfigOptions::default()),
}
}
#[test]
fn test_field_basic() {
let function = FieldFunction::default();
let search = Arc::new(StringArray::from(vec!["b", "d", "a"]));
let s1 = Arc::new(StringArray::from(vec!["a", "a", "a"]));
let s2 = Arc::new(StringArray::from(vec!["b", "b", "b"]));
let s3 = Arc::new(StringArray::from(vec!["c", "c", "c"]));
let args = create_args(vec![search, s1, s2, s3]);
let result = function.invoke_with_args(args).unwrap();
if let ColumnarValue::Array(array) = result {
let int_array = array.as_primitive::<datafusion_common::arrow::datatypes::Int64Type>();
assert_eq!(int_array.value(0), 2); // "b" is at index 2
assert_eq!(int_array.value(1), 0); // "d" not found
assert_eq!(int_array.value(2), 1); // "a" is at index 1
} else {
panic!("Expected array result");
}
}
#[test]
fn test_field_with_null_search() {
let function = FieldFunction::default();
let search = Arc::new(StringArray::from(vec![Some("a"), None]));
let s1 = Arc::new(StringArray::from(vec!["a", "a"]));
let s2 = Arc::new(StringArray::from(vec!["b", "b"]));
let args = create_args(vec![search, s1, s2]);
let result = function.invoke_with_args(args).unwrap();
if let ColumnarValue::Array(array) = result {
let int_array = array.as_primitive::<datafusion_common::arrow::datatypes::Int64Type>();
assert_eq!(int_array.value(0), 1); // "a" found at index 1
assert_eq!(int_array.value(1), 0); // NULL search returns 0
} else {
panic!("Expected array result");
}
}
#[test]
fn test_field_case_sensitive() {
let function = FieldFunction::default();
let search = Arc::new(StringArray::from(vec!["A", "a"]));
let s1 = Arc::new(StringArray::from(vec!["a", "a"]));
let s2 = Arc::new(StringArray::from(vec!["A", "A"]));
let args = create_args(vec![search, s1, s2]);
let result = function.invoke_with_args(args).unwrap();
if let ColumnarValue::Array(array) = result {
let int_array = array.as_primitive::<datafusion_common::arrow::datatypes::Int64Type>();
assert_eq!(int_array.value(0), 2); // "A" matches at index 2
assert_eq!(int_array.value(1), 1); // "a" matches at index 1
} else {
panic!("Expected array result");
}
}
}

View File

@@ -1,512 +0,0 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//! MySQL-compatible FORMAT function implementation.
//!
//! FORMAT(X, D) - Formats the number X with D decimal places using thousand separators.
use std::fmt;
use std::sync::Arc;
use datafusion_common::DataFusionError;
use datafusion_common::arrow::array::{Array, AsArray, LargeStringBuilder};
use datafusion_common::arrow::datatypes as arrow_types;
use datafusion_common::arrow::datatypes::DataType;
use datafusion_expr::{ColumnarValue, ScalarFunctionArgs, Signature, TypeSignature, Volatility};
use crate::function::Function;
use crate::function_registry::FunctionRegistry;
const NAME: &str = "format";
/// MySQL-compatible FORMAT function.
///
/// Syntax: FORMAT(X, D)
/// Formats the number X to a format like '#,###,###.##', rounded to D decimal places.
/// D can be 0 to 30.
///
/// Note: This implementation uses the en_US locale (comma as thousand separator,
/// period as decimal separator).
#[derive(Debug)]
pub struct FormatFunction {
signature: Signature,
}
impl FormatFunction {
pub fn register(registry: &FunctionRegistry) {
registry.register_scalar(FormatFunction::default());
}
}
impl Default for FormatFunction {
fn default() -> Self {
let mut signatures = Vec::new();
// Support various numeric types for X
let numeric_types = [
DataType::Float64,
DataType::Float32,
DataType::Int64,
DataType::Int32,
DataType::Int16,
DataType::Int8,
DataType::UInt64,
DataType::UInt32,
DataType::UInt16,
DataType::UInt8,
];
// D can be various integer types
let int_types = [
DataType::Int64,
DataType::Int32,
DataType::Int16,
DataType::Int8,
DataType::UInt64,
DataType::UInt32,
DataType::UInt16,
DataType::UInt8,
];
for x_type in &numeric_types {
for d_type in &int_types {
signatures.push(TypeSignature::Exact(vec![x_type.clone(), d_type.clone()]));
}
}
Self {
signature: Signature::one_of(signatures, Volatility::Immutable),
}
}
}
impl fmt::Display for FormatFunction {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
write!(f, "{}", NAME.to_ascii_uppercase())
}
}
impl Function for FormatFunction {
fn name(&self) -> &str {
NAME
}
fn return_type(&self, _: &[DataType]) -> datafusion_common::Result<DataType> {
Ok(DataType::LargeUtf8)
}
fn signature(&self) -> &Signature {
&self.signature
}
fn invoke_with_args(
&self,
args: ScalarFunctionArgs,
) -> datafusion_common::Result<ColumnarValue> {
if args.args.len() != 2 {
return Err(DataFusionError::Execution(
"FORMAT requires exactly 2 arguments: FORMAT(X, D)".to_string(),
));
}
let arrays = ColumnarValue::values_to_arrays(&args.args)?;
let len = arrays[0].len();
let x_array = &arrays[0];
let d_array = &arrays[1];
let mut builder = LargeStringBuilder::with_capacity(len, len * 20);
for i in 0..len {
if x_array.is_null(i) || d_array.is_null(i) {
builder.append_null();
continue;
}
let decimal_places = get_decimal_places(d_array, i)?.clamp(0, 30) as usize;
let formatted = match x_array.data_type() {
DataType::Float64 | DataType::Float32 => {
format_number_float(get_float_value(x_array, i)?, decimal_places)
}
DataType::Int64
| DataType::Int32
| DataType::Int16
| DataType::Int8
| DataType::UInt64
| DataType::UInt32
| DataType::UInt16
| DataType::UInt8 => format_number_integer(x_array, i, decimal_places)?,
_ => {
return Err(DataFusionError::Execution(format!(
"FORMAT: unsupported type {:?}",
x_array.data_type()
)));
}
};
builder.append_value(&formatted);
}
Ok(ColumnarValue::Array(Arc::new(builder.finish())))
}
}
/// Get float value from various numeric types.
fn get_float_value(
array: &datafusion_common::arrow::array::ArrayRef,
index: usize,
) -> datafusion_common::Result<f64> {
match array.data_type() {
DataType::Float64 => Ok(array
.as_primitive::<arrow_types::Float64Type>()
.value(index)),
DataType::Float32 => Ok(array
.as_primitive::<arrow_types::Float32Type>()
.value(index) as f64),
_ => Err(DataFusionError::Execution(format!(
"FORMAT: unsupported type {:?}",
array.data_type()
))),
}
}
/// Get decimal places from various integer types.
///
/// MySQL clamps decimal places to `0..=30`. This function returns an `i64` so the caller can clamp.
fn get_decimal_places(
array: &datafusion_common::arrow::array::ArrayRef,
index: usize,
) -> datafusion_common::Result<i64> {
match array.data_type() {
DataType::Int64 => Ok(array.as_primitive::<arrow_types::Int64Type>().value(index)),
DataType::Int32 => Ok(array.as_primitive::<arrow_types::Int32Type>().value(index) as i64),
DataType::Int16 => Ok(array.as_primitive::<arrow_types::Int16Type>().value(index) as i64),
DataType::Int8 => Ok(array.as_primitive::<arrow_types::Int8Type>().value(index) as i64),
DataType::UInt64 => {
let v = array.as_primitive::<arrow_types::UInt64Type>().value(index);
Ok(if v > i64::MAX as u64 {
i64::MAX
} else {
v as i64
})
}
DataType::UInt32 => Ok(array.as_primitive::<arrow_types::UInt32Type>().value(index) as i64),
DataType::UInt16 => Ok(array.as_primitive::<arrow_types::UInt16Type>().value(index) as i64),
DataType::UInt8 => Ok(array.as_primitive::<arrow_types::UInt8Type>().value(index) as i64),
_ => Err(DataFusionError::Execution(format!(
"FORMAT: unsupported type {:?}",
array.data_type()
))),
}
}
fn format_number_integer(
array: &datafusion_common::arrow::array::ArrayRef,
index: usize,
decimal_places: usize,
) -> datafusion_common::Result<String> {
let (is_negative, abs_digits) = match array.data_type() {
DataType::Int64 => {
let v = array.as_primitive::<arrow_types::Int64Type>().value(index) as i128;
(v.is_negative(), v.unsigned_abs().to_string())
}
DataType::Int32 => {
let v = array.as_primitive::<arrow_types::Int32Type>().value(index) as i128;
(v.is_negative(), v.unsigned_abs().to_string())
}
DataType::Int16 => {
let v = array.as_primitive::<arrow_types::Int16Type>().value(index) as i128;
(v.is_negative(), v.unsigned_abs().to_string())
}
DataType::Int8 => {
let v = array.as_primitive::<arrow_types::Int8Type>().value(index) as i128;
(v.is_negative(), v.unsigned_abs().to_string())
}
DataType::UInt64 => {
let v = array.as_primitive::<arrow_types::UInt64Type>().value(index) as u128;
(false, v.to_string())
}
DataType::UInt32 => {
let v = array.as_primitive::<arrow_types::UInt32Type>().value(index) as u128;
(false, v.to_string())
}
DataType::UInt16 => {
let v = array.as_primitive::<arrow_types::UInt16Type>().value(index) as u128;
(false, v.to_string())
}
DataType::UInt8 => {
let v = array.as_primitive::<arrow_types::UInt8Type>().value(index) as u128;
(false, v.to_string())
}
_ => {
return Err(DataFusionError::Execution(format!(
"FORMAT: unsupported type {:?}",
array.data_type()
)));
}
};
let mut result = String::new();
if is_negative {
result.push('-');
}
result.push_str(&add_thousand_separators(&abs_digits));
if decimal_places > 0 {
result.push('.');
result.push_str(&"0".repeat(decimal_places));
}
Ok(result)
}
/// Format a float with thousand separators and `decimal_places` digits after decimal point.
fn format_number_float(x: f64, decimal_places: usize) -> String {
// Handle special cases
if x.is_nan() {
return "NaN".to_string();
}
if x.is_infinite() {
return if x.is_sign_positive() {
"Infinity".to_string()
} else {
"-Infinity".to_string()
};
}
// Round to decimal_places
let multiplier = 10f64.powi(decimal_places as i32);
let rounded = (x * multiplier).round() / multiplier;
// Split into integer and fractional parts
let is_negative = rounded < 0.0;
let abs_value = rounded.abs();
// Format with the specified decimal places
let formatted = if decimal_places == 0 {
format!("{:.0}", abs_value)
} else {
format!("{:.prec$}", abs_value, prec = decimal_places)
};
// Split at decimal point
let parts: Vec<&str> = formatted.split('.').collect();
let int_part = parts[0];
let dec_part = parts.get(1).copied();
// Add thousand separators to integer part
let int_with_sep = add_thousand_separators(int_part);
// Build result
let mut result = String::new();
if is_negative {
result.push('-');
}
result.push_str(&int_with_sep);
if let Some(dec) = dec_part {
result.push('.');
result.push_str(dec);
}
result
}
/// Add thousand separators (commas) to an integer string.
fn add_thousand_separators(s: &str) -> String {
let chars: Vec<char> = s.chars().collect();
let len = chars.len();
if len <= 3 {
return s.to_string();
}
let mut result = String::with_capacity(len + len / 3);
let first_group_len = len % 3;
let first_group_len = if first_group_len == 0 {
3
} else {
first_group_len
};
for (i, ch) in chars.iter().enumerate() {
if i > 0 && i >= first_group_len && (i - first_group_len) % 3 == 0 {
result.push(',');
}
result.push(*ch);
}
result
}
#[cfg(test)]
mod tests {
use std::sync::Arc;
use datafusion_common::arrow::array::{Float64Array, Int64Array};
use datafusion_common::arrow::datatypes::Field;
use datafusion_expr::ScalarFunctionArgs;
use super::*;
fn create_args(arrays: Vec<datafusion_common::arrow::array::ArrayRef>) -> ScalarFunctionArgs {
let arg_fields: Vec<_> = arrays
.iter()
.enumerate()
.map(|(i, arr)| {
Arc::new(Field::new(
format!("arg_{}", i),
arr.data_type().clone(),
true,
))
})
.collect();
ScalarFunctionArgs {
args: arrays.iter().cloned().map(ColumnarValue::Array).collect(),
arg_fields,
return_field: Arc::new(Field::new("result", DataType::LargeUtf8, true)),
number_rows: arrays[0].len(),
config_options: Arc::new(datafusion_common::config::ConfigOptions::default()),
}
}
#[test]
fn test_format_basic() {
let function = FormatFunction::default();
let x = Arc::new(Float64Array::from(vec![1234567.891, 1234.5, 1234567.0]));
let d = Arc::new(Int64Array::from(vec![2, 0, 3]));
let args = create_args(vec![x, d]);
let result = function.invoke_with_args(args).unwrap();
if let ColumnarValue::Array(array) = result {
let str_array = array.as_string::<i64>();
assert_eq!(str_array.value(0), "1,234,567.89");
assert_eq!(str_array.value(1), "1,235"); // rounded
assert_eq!(str_array.value(2), "1,234,567.000");
} else {
panic!("Expected array result");
}
}
#[test]
fn test_format_negative() {
let function = FormatFunction::default();
let x = Arc::new(Float64Array::from(vec![-1234567.891]));
let d = Arc::new(Int64Array::from(vec![2]));
let args = create_args(vec![x, d]);
let result = function.invoke_with_args(args).unwrap();
if let ColumnarValue::Array(array) = result {
let str_array = array.as_string::<i64>();
assert_eq!(str_array.value(0), "-1,234,567.89");
} else {
panic!("Expected array result");
}
}
#[test]
fn test_format_small_numbers() {
let function = FormatFunction::default();
let x = Arc::new(Float64Array::from(vec![0.5, 12.345, 123.0]));
let d = Arc::new(Int64Array::from(vec![2, 2, 0]));
let args = create_args(vec![x, d]);
let result = function.invoke_with_args(args).unwrap();
if let ColumnarValue::Array(array) = result {
let str_array = array.as_string::<i64>();
assert_eq!(str_array.value(0), "0.50");
assert_eq!(str_array.value(1), "12.35"); // rounded
assert_eq!(str_array.value(2), "123");
} else {
panic!("Expected array result");
}
}
#[test]
fn test_format_with_nulls() {
let function = FormatFunction::default();
let x = Arc::new(Float64Array::from(vec![Some(1234.5), None]));
let d = Arc::new(Int64Array::from(vec![2, 2]));
let args = create_args(vec![x, d]);
let result = function.invoke_with_args(args).unwrap();
if let ColumnarValue::Array(array) = result {
let str_array = array.as_string::<i64>();
assert_eq!(str_array.value(0), "1,234.50");
assert!(str_array.is_null(1));
} else {
panic!("Expected array result");
}
}
#[test]
fn test_add_thousand_separators() {
assert_eq!(add_thousand_separators("1"), "1");
assert_eq!(add_thousand_separators("12"), "12");
assert_eq!(add_thousand_separators("123"), "123");
assert_eq!(add_thousand_separators("1234"), "1,234");
assert_eq!(add_thousand_separators("12345"), "12,345");
assert_eq!(add_thousand_separators("123456"), "123,456");
assert_eq!(add_thousand_separators("1234567"), "1,234,567");
assert_eq!(add_thousand_separators("12345678"), "12,345,678");
assert_eq!(add_thousand_separators("123456789"), "123,456,789");
}
#[test]
fn test_format_large_int_no_float_precision_loss() {
let function = FormatFunction::default();
// 2^53 + 1 cannot be represented exactly as f64.
let x = Arc::new(Int64Array::from(vec![9_007_199_254_740_993i64]));
let d = Arc::new(Int64Array::from(vec![0]));
let args = create_args(vec![x, d]);
let result = function.invoke_with_args(args).unwrap();
if let ColumnarValue::Array(array) = result {
let str_array = array.as_string::<i64>();
assert_eq!(str_array.value(0), "9,007,199,254,740,993");
} else {
panic!("Expected array result");
}
}
#[test]
fn test_format_decimal_places_u64_overflow_clamps() {
use datafusion_common::arrow::array::UInt64Array;
let function = FormatFunction::default();
let x = Arc::new(Int64Array::from(vec![1]));
let d = Arc::new(UInt64Array::from(vec![u64::MAX]));
let args = create_args(vec![x, d]);
let result = function.invoke_with_args(args).unwrap();
if let ColumnarValue::Array(array) = result {
let str_array = array.as_string::<i64>();
assert_eq!(str_array.value(0), format!("1.{}", "0".repeat(30)));
} else {
panic!("Expected array result");
}
}
}

View File

@@ -1,345 +0,0 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//! MySQL-compatible INSERT function implementation.
//!
//! INSERT(str, pos, len, newstr) - Inserts newstr into str at position pos,
//! replacing len characters.
use std::fmt;
use std::sync::Arc;
use datafusion_common::DataFusionError;
use datafusion_common::arrow::array::{Array, ArrayRef, AsArray, LargeStringBuilder};
use datafusion_common::arrow::compute::cast;
use datafusion_common::arrow::datatypes::DataType;
use datafusion_expr::{ColumnarValue, ScalarFunctionArgs, Signature, TypeSignature, Volatility};
use crate::function::Function;
use crate::function_registry::FunctionRegistry;
const NAME: &str = "insert";
/// MySQL-compatible INSERT function.
///
/// Syntax: INSERT(str, pos, len, newstr)
/// Returns str with the substring beginning at position pos and len characters long
/// replaced by newstr.
///
/// - pos is 1-based
/// - If pos is out of range, returns the original string
/// - If len is out of range, replaces from pos to end of string
#[derive(Debug)]
pub struct InsertFunction {
signature: Signature,
}
impl InsertFunction {
pub fn register(registry: &FunctionRegistry) {
registry.register_scalar(InsertFunction::default());
}
}
impl Default for InsertFunction {
fn default() -> Self {
let mut signatures = Vec::new();
let string_types = [DataType::Utf8, DataType::LargeUtf8, DataType::Utf8View];
let int_types = [
DataType::Int64,
DataType::Int32,
DataType::Int16,
DataType::Int8,
DataType::UInt64,
DataType::UInt32,
DataType::UInt16,
DataType::UInt8,
];
for str_type in &string_types {
for newstr_type in &string_types {
for pos_type in &int_types {
for len_type in &int_types {
signatures.push(TypeSignature::Exact(vec![
str_type.clone(),
pos_type.clone(),
len_type.clone(),
newstr_type.clone(),
]));
}
}
}
}
Self {
signature: Signature::one_of(signatures, Volatility::Immutable),
}
}
}
impl fmt::Display for InsertFunction {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
write!(f, "{}", NAME.to_ascii_uppercase())
}
}
impl Function for InsertFunction {
fn name(&self) -> &str {
NAME
}
fn return_type(&self, _: &[DataType]) -> datafusion_common::Result<DataType> {
Ok(DataType::LargeUtf8)
}
fn signature(&self) -> &Signature {
&self.signature
}
fn invoke_with_args(
&self,
args: ScalarFunctionArgs,
) -> datafusion_common::Result<ColumnarValue> {
if args.args.len() != 4 {
return Err(DataFusionError::Execution(
"INSERT requires exactly 4 arguments: INSERT(str, pos, len, newstr)".to_string(),
));
}
let arrays = ColumnarValue::values_to_arrays(&args.args)?;
let len = arrays[0].len();
// Cast string arguments to LargeUtf8
let str_array = cast_to_large_utf8(&arrays[0], "str")?;
let newstr_array = cast_to_large_utf8(&arrays[3], "newstr")?;
let pos_array = cast_to_int64(&arrays[1], "pos")?;
let replace_len_array = cast_to_int64(&arrays[2], "len")?;
let str_arr = str_array.as_string::<i64>();
let pos_arr = pos_array.as_primitive::<datafusion_common::arrow::datatypes::Int64Type>();
let len_arr =
replace_len_array.as_primitive::<datafusion_common::arrow::datatypes::Int64Type>();
let newstr_arr = newstr_array.as_string::<i64>();
let mut builder = LargeStringBuilder::with_capacity(len, len * 32);
for i in 0..len {
// Check for NULLs
if str_arr.is_null(i)
|| pos_array.is_null(i)
|| replace_len_array.is_null(i)
|| newstr_arr.is_null(i)
{
builder.append_null();
continue;
}
let original = str_arr.value(i);
let pos = pos_arr.value(i);
let replace_len = len_arr.value(i);
let new_str = newstr_arr.value(i);
let result = insert_string(original, pos, replace_len, new_str);
builder.append_value(&result);
}
Ok(ColumnarValue::Array(Arc::new(builder.finish())))
}
}
/// Cast array to LargeUtf8 for uniform string access.
fn cast_to_large_utf8(array: &ArrayRef, name: &str) -> datafusion_common::Result<ArrayRef> {
cast(array.as_ref(), &DataType::LargeUtf8)
.map_err(|e| DataFusionError::Execution(format!("INSERT: {} cast failed: {}", name, e)))
}
fn cast_to_int64(array: &ArrayRef, name: &str) -> datafusion_common::Result<ArrayRef> {
cast(array.as_ref(), &DataType::Int64)
.map_err(|e| DataFusionError::Execution(format!("INSERT: {} cast failed: {}", name, e)))
}
/// Perform the INSERT string operation.
/// pos is 1-based. If pos < 1 or pos > len(str) + 1, returns original string.
fn insert_string(original: &str, pos: i64, replace_len: i64, new_str: &str) -> String {
let char_count = original.chars().count();
// MySQL behavior: if pos < 1 or pos > string length + 1, return original
if pos < 1 || pos as usize > char_count + 1 {
return original.to_string();
}
let start_idx = (pos - 1) as usize; // Convert to 0-based
// Calculate end index for replacement
let replace_len = if replace_len < 0 {
0
} else {
replace_len as usize
};
let end_idx = (start_idx + replace_len).min(char_count);
let start_byte = char_to_byte_idx(original, start_idx);
let end_byte = char_to_byte_idx(original, end_idx);
let mut result = String::with_capacity(original.len() + new_str.len());
result.push_str(&original[..start_byte]);
result.push_str(new_str);
result.push_str(&original[end_byte..]);
result
}
fn char_to_byte_idx(s: &str, char_idx: usize) -> usize {
s.char_indices()
.nth(char_idx)
.map(|(idx, _)| idx)
.unwrap_or(s.len())
}
#[cfg(test)]
mod tests {
use std::sync::Arc;
use datafusion_common::arrow::array::{Int64Array, StringArray};
use datafusion_common::arrow::datatypes::Field;
use datafusion_expr::ScalarFunctionArgs;
use super::*;
fn create_args(arrays: Vec<ArrayRef>) -> ScalarFunctionArgs {
let arg_fields: Vec<_> = arrays
.iter()
.enumerate()
.map(|(i, arr)| {
Arc::new(Field::new(
format!("arg_{}", i),
arr.data_type().clone(),
true,
))
})
.collect();
ScalarFunctionArgs {
args: arrays.iter().cloned().map(ColumnarValue::Array).collect(),
arg_fields,
return_field: Arc::new(Field::new("result", DataType::LargeUtf8, true)),
number_rows: arrays[0].len(),
config_options: Arc::new(datafusion_common::config::ConfigOptions::default()),
}
}
#[test]
fn test_insert_basic() {
let function = InsertFunction::default();
// INSERT('Quadratic', 3, 4, 'What') => 'QuWhattic'
let str_arr = Arc::new(StringArray::from(vec!["Quadratic"]));
let pos = Arc::new(Int64Array::from(vec![3]));
let len = Arc::new(Int64Array::from(vec![4]));
let newstr = Arc::new(StringArray::from(vec!["What"]));
let args = create_args(vec![str_arr, pos, len, newstr]);
let result = function.invoke_with_args(args).unwrap();
if let ColumnarValue::Array(array) = result {
let str_array = array.as_string::<i64>();
assert_eq!(str_array.value(0), "QuWhattic");
} else {
panic!("Expected array result");
}
}
#[test]
fn test_insert_out_of_range_pos() {
let function = InsertFunction::default();
// INSERT('Quadratic', 0, 4, 'What') => 'Quadratic' (pos < 1); pos = 100 (beyond the string) also returns the original
let str_arr = Arc::new(StringArray::from(vec!["Quadratic", "Quadratic"]));
let pos = Arc::new(Int64Array::from(vec![0, 100]));
let len = Arc::new(Int64Array::from(vec![4, 4]));
let newstr = Arc::new(StringArray::from(vec!["What", "What"]));
let args = create_args(vec![str_arr, pos, len, newstr]);
let result = function.invoke_with_args(args).unwrap();
if let ColumnarValue::Array(array) = result {
let str_array = array.as_string::<i64>();
assert_eq!(str_array.value(0), "Quadratic"); // pos < 1
assert_eq!(str_array.value(1), "Quadratic"); // pos > length
} else {
panic!("Expected array result");
}
}
#[test]
fn test_insert_replace_to_end() {
let function = InsertFunction::default();
// INSERT('Quadratic', 3, 100, 'What') => 'QuWhat' (len exceeds remaining)
let str_arr = Arc::new(StringArray::from(vec!["Quadratic"]));
let pos = Arc::new(Int64Array::from(vec![3]));
let len = Arc::new(Int64Array::from(vec![100]));
let newstr = Arc::new(StringArray::from(vec!["What"]));
let args = create_args(vec![str_arr, pos, len, newstr]);
let result = function.invoke_with_args(args).unwrap();
if let ColumnarValue::Array(array) = result {
let str_array = array.as_string::<i64>();
assert_eq!(str_array.value(0), "QuWhat");
} else {
panic!("Expected array result");
}
}
#[test]
fn test_insert_unicode() {
let function = InsertFunction::default();
// INSERT('hello世界', 6, 1, 'の') => 'helloの界'
let str_arr = Arc::new(StringArray::from(vec!["hello世界"]));
let pos = Arc::new(Int64Array::from(vec![6]));
let len = Arc::new(Int64Array::from(vec![1]));
let newstr = Arc::new(StringArray::from(vec!["の"]));
let args = create_args(vec![str_arr, pos, len, newstr]);
let result = function.invoke_with_args(args).unwrap();
if let ColumnarValue::Array(array) = result {
let str_array = array.as_string::<i64>();
assert_eq!(str_array.value(0), "helloの界");
} else {
panic!("Expected array result");
}
}
#[test]
fn test_insert_with_nulls() {
let function = InsertFunction::default();
let str_arr = Arc::new(StringArray::from(vec![Some("hello"), None]));
let pos = Arc::new(Int64Array::from(vec![1, 1]));
let len = Arc::new(Int64Array::from(vec![1, 1]));
let newstr = Arc::new(StringArray::from(vec!["X", "X"]));
let args = create_args(vec![str_arr, pos, len, newstr]);
let result = function.invoke_with_args(args).unwrap();
if let ColumnarValue::Array(array) = result {
let str_array = array.as_string::<i64>();
assert_eq!(str_array.value(0), "Xello");
assert!(str_array.is_null(1));
} else {
panic!("Expected array result");
}
}
}
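For quick reference while reviewing, here is a minimal standalone sketch of the MySQL INSERT semantics the function above implements (1-based position, out-of-range positions return the original string, character-based replacement). The helper name mysql_insert and the sample values are illustrative only and are not part of this change:

fn mysql_insert(original: &str, pos: i64, replace_len: i64, new_str: &str) -> String {
    let chars: Vec<char> = original.chars().collect();
    // Out-of-range position: MySQL returns the original string unchanged.
    if pos < 1 || pos as usize > chars.len() + 1 {
        return original.to_string();
    }
    let start = (pos - 1) as usize;
    let end = (start + replace_len.max(0) as usize).min(chars.len());
    // Rebuild: prefix characters, replacement text, then the remaining suffix.
    let mut out = String::new();
    out.extend(&chars[..start]);
    out.push_str(new_str);
    out.extend(&chars[end..]);
    out
}

fn main() {
    assert_eq!(mysql_insert("Quadratic", 3, 4, "What"), "QuWhattic");
    assert_eq!(mysql_insert("Quadratic", -1, 4, "What"), "Quadratic");
    assert_eq!(mysql_insert("hello世界", 6, 1, "の"), "helloの界");
}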

View File

@@ -1,373 +0,0 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//! MySQL-compatible LOCATE function implementation.
//!
//! LOCATE(substr, str) - Returns the position of the first occurrence of substr in str (1-based).
//! LOCATE(substr, str, pos) - Returns the position of the first occurrence of substr in str,
//! starting from position pos.
//! Returns 0 if substr is not found.
use std::fmt;
use std::sync::Arc;
use datafusion_common::DataFusionError;
use datafusion_common::arrow::array::{Array, ArrayRef, AsArray, Int64Builder};
use datafusion_common::arrow::compute::cast;
use datafusion_common::arrow::datatypes::DataType;
use datafusion_expr::{ColumnarValue, ScalarFunctionArgs, Signature, TypeSignature, Volatility};
use crate::function::Function;
use crate::function_registry::FunctionRegistry;
const NAME: &str = "locate";
/// MySQL-compatible LOCATE function.
///
/// Syntax:
/// - LOCATE(substr, str) - Returns 1-based position of substr in str, or 0 if not found.
/// - LOCATE(substr, str, pos) - Same, but starts searching from position pos.
#[derive(Debug)]
pub struct LocateFunction {
signature: Signature,
}
impl LocateFunction {
pub fn register(registry: &FunctionRegistry) {
registry.register_scalar(LocateFunction::default());
}
}
impl Default for LocateFunction {
fn default() -> Self {
// Support 2 or 3 arguments with various string types
let mut signatures = Vec::new();
let string_types = [DataType::Utf8, DataType::LargeUtf8, DataType::Utf8View];
let int_types = [
DataType::Int64,
DataType::Int32,
DataType::Int16,
DataType::Int8,
DataType::UInt64,
DataType::UInt32,
DataType::UInt16,
DataType::UInt8,
];
// 2-argument form: LOCATE(substr, str)
for substr_type in &string_types {
for str_type in &string_types {
signatures.push(TypeSignature::Exact(vec![
substr_type.clone(),
str_type.clone(),
]));
}
}
// 3-argument form: LOCATE(substr, str, pos)
for substr_type in &string_types {
for str_type in &string_types {
for pos_type in &int_types {
signatures.push(TypeSignature::Exact(vec![
substr_type.clone(),
str_type.clone(),
pos_type.clone(),
]));
}
}
}
Self {
signature: Signature::one_of(signatures, Volatility::Immutable),
}
}
}
impl fmt::Display for LocateFunction {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
write!(f, "{}", NAME.to_ascii_uppercase())
}
}
impl Function for LocateFunction {
fn name(&self) -> &str {
NAME
}
fn return_type(&self, _: &[DataType]) -> datafusion_common::Result<DataType> {
Ok(DataType::Int64)
}
fn signature(&self) -> &Signature {
&self.signature
}
fn invoke_with_args(
&self,
args: ScalarFunctionArgs,
) -> datafusion_common::Result<ColumnarValue> {
let arg_count = args.args.len();
if !(2..=3).contains(&arg_count) {
return Err(DataFusionError::Execution(
"LOCATE requires 2 or 3 arguments: LOCATE(substr, str) or LOCATE(substr, str, pos)"
.to_string(),
));
}
let arrays = ColumnarValue::values_to_arrays(&args.args)?;
// Cast string arguments to LargeUtf8 for uniform access
let substr_array = cast_to_large_utf8(&arrays[0], "substr")?;
let str_array = cast_to_large_utf8(&arrays[1], "str")?;
let substr = substr_array.as_string::<i64>();
let str_arr = str_array.as_string::<i64>();
let len = substr.len();
// Handle optional pos argument
let pos_array: Option<ArrayRef> = if arg_count == 3 {
Some(cast_to_int64(&arrays[2], "pos")?)
} else {
None
};
let mut builder = Int64Builder::with_capacity(len);
for i in 0..len {
if substr.is_null(i) || str_arr.is_null(i) {
builder.append_null();
continue;
}
let needle = substr.value(i);
let haystack = str_arr.value(i);
// Get starting position (1-based in MySQL, convert to 0-based)
let start_pos = if let Some(ref pos_arr) = pos_array {
if pos_arr.is_null(i) {
builder.append_null();
continue;
}
let pos = pos_arr
.as_primitive::<datafusion_common::arrow::datatypes::Int64Type>()
.value(i);
if pos < 1 {
// MySQL returns 0 for pos < 1
builder.append_value(0);
continue;
}
(pos - 1) as usize
} else {
0
};
// Find position using character-based indexing (for Unicode support)
let result = locate_substr(haystack, needle, start_pos);
builder.append_value(result);
}
Ok(ColumnarValue::Array(Arc::new(builder.finish())))
}
}
/// Cast array to LargeUtf8 for uniform string access.
fn cast_to_large_utf8(array: &ArrayRef, name: &str) -> datafusion_common::Result<ArrayRef> {
cast(array.as_ref(), &DataType::LargeUtf8)
.map_err(|e| DataFusionError::Execution(format!("LOCATE: {} cast failed: {}", name, e)))
}
fn cast_to_int64(array: &ArrayRef, name: &str) -> datafusion_common::Result<ArrayRef> {
cast(array.as_ref(), &DataType::Int64)
.map_err(|e| DataFusionError::Execution(format!("LOCATE: {} cast failed: {}", name, e)))
}
/// Find the 1-based position of needle in haystack, starting from start_pos (0-based character index).
/// Returns 0 if not found.
fn locate_substr(haystack: &str, needle: &str, start_pos: usize) -> i64 {
// Handle empty needle - MySQL returns start_pos + 1
if needle.is_empty() {
let char_count = haystack.chars().count();
return if start_pos <= char_count {
(start_pos + 1) as i64
} else {
0
};
}
// Convert start_pos (character index) to byte index
let byte_start = haystack
.char_indices()
.nth(start_pos)
.map(|(idx, _)| idx)
.unwrap_or(haystack.len());
if byte_start >= haystack.len() {
return 0;
}
// Search in the substring
let search_str = &haystack[byte_start..];
if let Some(byte_pos) = search_str.find(needle) {
// Convert byte position back to character position
let char_pos = search_str[..byte_pos].chars().count();
// Return 1-based position relative to original string
(start_pos + char_pos + 1) as i64
} else {
0
}
}
#[cfg(test)]
mod tests {
use std::sync::Arc;
use datafusion_common::arrow::array::StringArray;
use datafusion_common::arrow::datatypes::Field;
use datafusion_expr::ScalarFunctionArgs;
use super::*;
fn create_args(arrays: Vec<ArrayRef>) -> ScalarFunctionArgs {
let arg_fields: Vec<_> = arrays
.iter()
.enumerate()
.map(|(i, arr)| {
Arc::new(Field::new(
format!("arg_{}", i),
arr.data_type().clone(),
true,
))
})
.collect();
ScalarFunctionArgs {
args: arrays.iter().cloned().map(ColumnarValue::Array).collect(),
arg_fields,
return_field: Arc::new(Field::new("result", DataType::Int64, true)),
number_rows: arrays[0].len(),
config_options: Arc::new(datafusion_common::config::ConfigOptions::default()),
}
}
#[test]
fn test_locate_basic() {
let function = LocateFunction::default();
let substr = Arc::new(StringArray::from(vec!["world", "xyz", "hello"]));
let str_arr = Arc::new(StringArray::from(vec![
"hello world",
"hello world",
"hello world",
]));
let args = create_args(vec![substr, str_arr]);
let result = function.invoke_with_args(args).unwrap();
if let ColumnarValue::Array(array) = result {
let int_array = array.as_primitive::<datafusion_common::arrow::datatypes::Int64Type>();
assert_eq!(int_array.value(0), 7); // "world" at position 7
assert_eq!(int_array.value(1), 0); // "xyz" not found
assert_eq!(int_array.value(2), 1); // "hello" at position 1
} else {
panic!("Expected array result");
}
}
#[test]
fn test_locate_with_position() {
let function = LocateFunction::default();
let substr = Arc::new(StringArray::from(vec!["o", "o", "o"]));
let str_arr = Arc::new(StringArray::from(vec![
"hello world",
"hello world",
"hello world",
]));
let pos = Arc::new(datafusion_common::arrow::array::Int64Array::from(vec![
1, 5, 8,
]));
let args = create_args(vec![substr, str_arr, pos]);
let result = function.invoke_with_args(args).unwrap();
if let ColumnarValue::Array(array) = result {
let int_array = array.as_primitive::<datafusion_common::arrow::datatypes::Int64Type>();
assert_eq!(int_array.value(0), 5); // first 'o' at position 5
assert_eq!(int_array.value(1), 5); // 'o' at position 5 (start from 5)
assert_eq!(int_array.value(2), 8); // 'o' in "world" at position 8
} else {
panic!("Expected array result");
}
}
#[test]
fn test_locate_unicode() {
let function = LocateFunction::default();
let substr = Arc::new(StringArray::from(vec!["世", "界"]));
let str_arr = Arc::new(StringArray::from(vec!["hello世界", "hello世界"]));
let args = create_args(vec![substr, str_arr]);
let result = function.invoke_with_args(args).unwrap();
if let ColumnarValue::Array(array) = result {
let int_array = array.as_primitive::<datafusion_common::arrow::datatypes::Int64Type>();
assert_eq!(int_array.value(0), 6); // "世" at position 6
assert_eq!(int_array.value(1), 7); // "界" at position 7
} else {
panic!("Expected array result");
}
}
#[test]
fn test_locate_empty_needle() {
let function = LocateFunction::default();
let substr = Arc::new(StringArray::from(vec!["", ""]));
let str_arr = Arc::new(StringArray::from(vec!["hello", "hello"]));
let pos = Arc::new(datafusion_common::arrow::array::Int64Array::from(vec![
1, 3,
]));
let args = create_args(vec![substr, str_arr, pos]);
let result = function.invoke_with_args(args).unwrap();
if let ColumnarValue::Array(array) = result {
let int_array = array.as_primitive::<datafusion_common::arrow::datatypes::Int64Type>();
assert_eq!(int_array.value(0), 1); // empty string at pos 1
assert_eq!(int_array.value(1), 3); // empty string at pos 3
} else {
panic!("Expected array result");
}
}
#[test]
fn test_locate_with_nulls() {
let function = LocateFunction::default();
let substr = Arc::new(StringArray::from(vec![Some("o"), None]));
let str_arr = Arc::new(StringArray::from(vec![Some("hello"), Some("hello")]));
let args = create_args(vec![substr, str_arr]);
let result = function.invoke_with_args(args).unwrap();
if let ColumnarValue::Array(array) = result {
let int_array = array.as_primitive::<datafusion_common::arrow::datatypes::Int64Type>();
assert_eq!(int_array.value(0), 5);
assert!(int_array.is_null(1));
} else {
panic!("Expected array result");
}
}
}
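The LOCATE semantics being removed here can likewise be summarized in a short standalone sketch (1-based result, 0 when not found, character-based positions so Unicode strings behave like the tests above). The name mysql_locate and the sample values are illustrative, not part of the diff:

fn mysql_locate(needle: &str, haystack: &str, from: usize) -> i64 {
    // `from` is the 1-based starting position, as in LOCATE(substr, str, pos).
    let chars: Vec<char> = haystack.chars().collect();
    if from < 1 || from > chars.len() + 1 {
        return 0;
    }
    if needle.is_empty() {
        return from as i64;
    }
    let tail: String = chars[from - 1..].iter().collect();
    match tail.find(needle) {
        // Convert the byte offset back to a character offset before returning.
        Some(byte_pos) => (from + tail[..byte_pos].chars().count()) as i64,
        None => 0,
    }
}

fn main() {
    assert_eq!(mysql_locate("world", "hello world", 1), 7);
    assert_eq!(mysql_locate("o", "hello world", 6), 8);
    assert_eq!(mysql_locate("界", "hello世界", 1), 7);
    assert_eq!(mysql_locate("xyz", "hello world", 1), 0);
}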

View File

@@ -1,252 +0,0 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//! MySQL-compatible SPACE function implementation.
//!
//! SPACE(N) - Returns a string consisting of N space characters.
use std::fmt;
use std::sync::Arc;
use datafusion_common::DataFusionError;
use datafusion_common::arrow::array::{Array, AsArray, LargeStringBuilder};
use datafusion_common::arrow::datatypes::DataType;
use datafusion_expr::{ColumnarValue, ScalarFunctionArgs, Signature, TypeSignature, Volatility};
use crate::function::Function;
use crate::function_registry::FunctionRegistry;
const NAME: &str = "space";
// Safety limit for maximum number of spaces
const MAX_SPACE_COUNT: i64 = 1024 * 1024; // 1MB of spaces
/// MySQL-compatible SPACE function.
///
/// Syntax: SPACE(N)
/// Returns a string consisting of N space characters.
/// Returns NULL if N is NULL.
/// Returns empty string if N < 0.
#[derive(Debug)]
pub struct SpaceFunction {
signature: Signature,
}
impl SpaceFunction {
pub fn register(registry: &FunctionRegistry) {
registry.register_scalar(SpaceFunction::default());
}
}
impl Default for SpaceFunction {
fn default() -> Self {
Self {
signature: Signature::one_of(
vec![
TypeSignature::Exact(vec![DataType::Int64]),
TypeSignature::Exact(vec![DataType::Int32]),
TypeSignature::Exact(vec![DataType::Int16]),
TypeSignature::Exact(vec![DataType::Int8]),
TypeSignature::Exact(vec![DataType::UInt64]),
TypeSignature::Exact(vec![DataType::UInt32]),
TypeSignature::Exact(vec![DataType::UInt16]),
TypeSignature::Exact(vec![DataType::UInt8]),
],
Volatility::Immutable,
),
}
}
}
impl fmt::Display for SpaceFunction {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
write!(f, "{}", NAME.to_ascii_uppercase())
}
}
impl Function for SpaceFunction {
fn name(&self) -> &str {
NAME
}
fn return_type(&self, _: &[DataType]) -> datafusion_common::Result<DataType> {
Ok(DataType::LargeUtf8)
}
fn signature(&self) -> &Signature {
&self.signature
}
fn invoke_with_args(
&self,
args: ScalarFunctionArgs,
) -> datafusion_common::Result<ColumnarValue> {
if args.args.len() != 1 {
return Err(DataFusionError::Execution(
"SPACE requires exactly 1 argument: SPACE(N)".to_string(),
));
}
let arrays = ColumnarValue::values_to_arrays(&args.args)?;
let len = arrays[0].len();
let n_array = &arrays[0];
let mut builder = LargeStringBuilder::with_capacity(len, len * 10);
for i in 0..len {
if n_array.is_null(i) {
builder.append_null();
continue;
}
let n = get_int_value(n_array, i)?;
if n < 0 {
// MySQL returns empty string for negative values
builder.append_value("");
} else if n > MAX_SPACE_COUNT {
return Err(DataFusionError::Execution(format!(
"SPACE: requested {} spaces exceeds maximum allowed ({})",
n, MAX_SPACE_COUNT
)));
} else {
let spaces = " ".repeat(n as usize);
builder.append_value(&spaces);
}
}
Ok(ColumnarValue::Array(Arc::new(builder.finish())))
}
}
/// Extract integer value from various integer types.
fn get_int_value(
array: &datafusion_common::arrow::array::ArrayRef,
index: usize,
) -> datafusion_common::Result<i64> {
use datafusion_common::arrow::datatypes as arrow_types;
match array.data_type() {
DataType::Int64 => Ok(array.as_primitive::<arrow_types::Int64Type>().value(index)),
DataType::Int32 => Ok(array.as_primitive::<arrow_types::Int32Type>().value(index) as i64),
DataType::Int16 => Ok(array.as_primitive::<arrow_types::Int16Type>().value(index) as i64),
DataType::Int8 => Ok(array.as_primitive::<arrow_types::Int8Type>().value(index) as i64),
DataType::UInt64 => {
let v = array.as_primitive::<arrow_types::UInt64Type>().value(index);
if v > i64::MAX as u64 {
Err(DataFusionError::Execution(format!(
"SPACE: value {} exceeds maximum",
v
)))
} else {
Ok(v as i64)
}
}
DataType::UInt32 => Ok(array.as_primitive::<arrow_types::UInt32Type>().value(index) as i64),
DataType::UInt16 => Ok(array.as_primitive::<arrow_types::UInt16Type>().value(index) as i64),
DataType::UInt8 => Ok(array.as_primitive::<arrow_types::UInt8Type>().value(index) as i64),
_ => Err(DataFusionError::Execution(format!(
"SPACE: unsupported type {:?}",
array.data_type()
))),
}
}
#[cfg(test)]
mod tests {
use std::sync::Arc;
use datafusion_common::arrow::array::Int64Array;
use datafusion_common::arrow::datatypes::Field;
use datafusion_expr::ScalarFunctionArgs;
use super::*;
fn create_args(arrays: Vec<datafusion_common::arrow::array::ArrayRef>) -> ScalarFunctionArgs {
let arg_fields: Vec<_> = arrays
.iter()
.enumerate()
.map(|(i, arr)| {
Arc::new(Field::new(
format!("arg_{}", i),
arr.data_type().clone(),
true,
))
})
.collect();
ScalarFunctionArgs {
args: arrays.iter().cloned().map(ColumnarValue::Array).collect(),
arg_fields,
return_field: Arc::new(Field::new("result", DataType::LargeUtf8, true)),
number_rows: arrays[0].len(),
config_options: Arc::new(datafusion_common::config::ConfigOptions::default()),
}
}
#[test]
fn test_space_basic() {
let function = SpaceFunction::default();
let n = Arc::new(Int64Array::from(vec![0, 1, 5]));
let args = create_args(vec![n]);
let result = function.invoke_with_args(args).unwrap();
if let ColumnarValue::Array(array) = result {
let str_array = array.as_string::<i64>();
assert_eq!(str_array.value(0), "");
assert_eq!(str_array.value(1), " ");
assert_eq!(str_array.value(2), " ");
} else {
panic!("Expected array result");
}
}
#[test]
fn test_space_negative() {
let function = SpaceFunction::default();
let n = Arc::new(Int64Array::from(vec![-1, -100]));
let args = create_args(vec![n]);
let result = function.invoke_with_args(args).unwrap();
if let ColumnarValue::Array(array) = result {
let str_array = array.as_string::<i64>();
assert_eq!(str_array.value(0), "");
assert_eq!(str_array.value(1), "");
} else {
panic!("Expected array result");
}
}
#[test]
fn test_space_with_nulls() {
let function = SpaceFunction::default();
let n = Arc::new(Int64Array::from(vec![Some(3), None]));
let args = create_args(vec![n]);
let result = function.invoke_with_args(args).unwrap();
if let ColumnarValue::Array(array) = result {
let str_array = array.as_string::<i64>();
assert_eq!(str_array.value(0), " ");
assert!(str_array.is_null(1));
} else {
panic!("Expected array result");
}
}
}
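The removed SPACE behavior is small enough to restate as a standalone sketch; the 1 MiB cap mirrors MAX_SPACE_COUNT above, and the mysql_space name is illustrative only:

fn mysql_space(n: i64) -> Result<String, String> {
    const MAX: i64 = 1024 * 1024; // mirrors MAX_SPACE_COUNT above
    if n < 0 {
        // Negative counts produce an empty string, matching MySQL.
        return Ok(String::new());
    }
    if n > MAX {
        return Err(format!("requested {} spaces exceeds maximum allowed ({})", n, MAX));
    }
    Ok(" ".repeat(n as usize))
}

fn main() {
    assert_eq!(mysql_space(3).unwrap(), "   ");
    assert_eq!(mysql_space(-5).unwrap(), "");
    assert!(mysql_space(2 * 1024 * 1024).is_err());
}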

View File

@@ -15,14 +15,9 @@
use std::{fmt, mem}; use std::{fmt, mem};
use common_telemetry::debug; use common_telemetry::debug;
use snafu::ensure;
use tokio::sync::{OwnedSemaphorePermit, TryAcquireError}; use tokio::sync::{OwnedSemaphorePermit, TryAcquireError};
use crate::error::{
MemoryAcquireTimeoutSnafu, MemoryLimitExceededSnafu, MemorySemaphoreClosedSnafu, Result,
};
use crate::manager::{MemoryMetrics, MemoryQuota}; use crate::manager::{MemoryMetrics, MemoryQuota};
use crate::policy::OnExhaustedPolicy;
/// Guard representing a slice of reserved memory. /// Guard representing a slice of reserved memory.
pub struct MemoryGuard<M: MemoryMetrics> { pub struct MemoryGuard<M: MemoryMetrics> {
@@ -60,52 +55,11 @@ impl<M: MemoryMetrics> MemoryGuard<M> {
} }
} }
/// Acquires additional memory, waiting if necessary until enough is available. /// Tries to allocate additional memory during task execution.
///
/// On success, merges the new memory into this guard.
///
/// # Errors
/// - Returns error if requested bytes would exceed the manager's total limit
/// - Returns error if the semaphore is unexpectedly closed
pub async fn acquire_additional(&mut self, bytes: u64) -> Result<()> {
match &mut self.state {
GuardState::Unlimited => Ok(()),
GuardState::Limited { permit, quota } => {
if bytes == 0 {
return Ok(());
}
let additional_permits = quota.bytes_to_permits(bytes);
let current_permits = permit.num_permits() as u32;
ensure!(
current_permits.saturating_add(additional_permits) <= quota.limit_permits,
MemoryLimitExceededSnafu {
requested_bytes: bytes,
limit_bytes: quota.permits_to_bytes(quota.limit_permits)
}
);
let additional_permit = quota
.semaphore
.clone()
.acquire_many_owned(additional_permits)
.await
.map_err(|_| MemorySemaphoreClosedSnafu.build())?;
permit.merge(additional_permit);
quota.update_in_use_metric();
debug!("Acquired additional {} bytes", bytes);
Ok(())
}
}
}
/// Tries to acquire additional memory without waiting.
/// ///
/// On success, merges the new memory into this guard and returns true. /// On success, merges the new memory into this guard and returns true.
/// On failure, returns false and leaves this guard unchanged. /// On failure, returns false and leaves this guard unchanged.
pub fn try_acquire_additional(&mut self, bytes: u64) -> bool { pub fn request_additional(&mut self, bytes: u64) -> bool {
match &mut self.state { match &mut self.state {
GuardState::Unlimited => true, GuardState::Unlimited => true,
GuardState::Limited { permit, quota } => { GuardState::Limited { permit, quota } => {
@@ -123,11 +77,11 @@ impl<M: MemoryMetrics> MemoryGuard<M> {
Ok(additional_permit) => { Ok(additional_permit) => {
permit.merge(additional_permit); permit.merge(additional_permit);
quota.update_in_use_metric(); quota.update_in_use_metric();
debug!("Acquired additional {} bytes", bytes); debug!("Allocated additional {} bytes", bytes);
true true
} }
Err(TryAcquireError::NoPermits) | Err(TryAcquireError::Closed) => { Err(TryAcquireError::NoPermits) | Err(TryAcquireError::Closed) => {
quota.metrics.inc_rejected("try_acquire_additional"); quota.metrics.inc_rejected("request_additional");
false false
} }
} }
@@ -135,55 +89,11 @@ impl<M: MemoryMetrics> MemoryGuard<M> {
} }
} }
/// Acquires additional memory based on the given policy. /// Releases a portion of granted memory back to the pool early,
/// /// before the guard is dropped.
/// - For `OnExhaustedPolicy::Wait`: Waits up to the timeout duration for memory to become available
/// - For `OnExhaustedPolicy::Fail`: Returns immediately if memory is not available
///
/// # Errors
/// - `MemoryLimitExceeded`: Requested bytes would exceed the total limit (both policies), or memory is currently exhausted (Fail policy only)
/// - `MemoryAcquireTimeout`: Timeout elapsed while waiting for memory (Wait policy only)
/// - `MemorySemaphoreClosed`: The internal semaphore is unexpectedly closed (rare, indicates system issue)
pub async fn acquire_additional_with_policy(
&mut self,
bytes: u64,
policy: OnExhaustedPolicy,
) -> Result<()> {
match policy {
OnExhaustedPolicy::Wait { timeout } => {
match tokio::time::timeout(timeout, self.acquire_additional(bytes)).await {
Ok(Ok(())) => Ok(()),
Ok(Err(e)) => Err(e),
Err(_elapsed) => MemoryAcquireTimeoutSnafu {
requested_bytes: bytes,
waited: timeout,
}
.fail(),
}
}
OnExhaustedPolicy::Fail => {
if self.try_acquire_additional(bytes) {
Ok(())
} else {
MemoryLimitExceededSnafu {
requested_bytes: bytes,
limit_bytes: match &self.state {
GuardState::Unlimited => 0, // unreachable: unlimited mode always succeeds
GuardState::Limited { quota, .. } => {
quota.permits_to_bytes(quota.limit_permits)
}
},
}
.fail()
}
}
}
}
/// Releases a portion of granted memory back to the pool before the guard is dropped.
/// ///
/// Returns true if the release succeeds or is a no-op; false if the request exceeds granted. /// Returns true if the release succeeds or is a no-op; false if the request exceeds granted.
pub fn release_partial(&mut self, bytes: u64) -> bool { pub fn early_release_partial(&mut self, bytes: u64) -> bool {
match &mut self.state { match &mut self.state {
GuardState::Unlimited => true, GuardState::Unlimited => true,
GuardState::Limited { permit, quota } => { GuardState::Limited { permit, quota } => {
@@ -199,7 +109,7 @@ impl<M: MemoryMetrics> MemoryGuard<M> {
quota.permits_to_bytes(released_permit.num_permits() as u32); quota.permits_to_bytes(released_permit.num_permits() as u32);
drop(released_permit); drop(released_permit);
quota.update_in_use_metric(); quota.update_in_use_metric();
debug!("Released {} bytes from memory guard", released_bytes); debug!("Early released {} bytes from memory guard", released_bytes);
true true
} }
None => false, None => false,

View File

@@ -83,7 +83,7 @@ fn test_request_additional_success() {
assert_eq!(manager.used_bytes(), base); assert_eq!(manager.used_bytes(), base);
// Request additional memory (3MB) - should succeed and merge // Request additional memory (3MB) - should succeed and merge
assert!(guard.try_acquire_additional(3 * PERMIT_GRANULARITY_BYTES)); assert!(guard.request_additional(3 * PERMIT_GRANULARITY_BYTES));
assert_eq!(guard.granted_bytes(), 8 * PERMIT_GRANULARITY_BYTES); assert_eq!(guard.granted_bytes(), 8 * PERMIT_GRANULARITY_BYTES);
assert_eq!(manager.used_bytes(), 8 * PERMIT_GRANULARITY_BYTES); assert_eq!(manager.used_bytes(), 8 * PERMIT_GRANULARITY_BYTES);
} }
@@ -98,11 +98,11 @@ fn test_request_additional_exceeds_limit() {
let mut guard = manager.try_acquire(base).unwrap(); let mut guard = manager.try_acquire(base).unwrap();
// Request additional memory (3MB) - should succeed // Request additional memory (3MB) - should succeed
assert!(guard.try_acquire_additional(3 * PERMIT_GRANULARITY_BYTES)); assert!(guard.request_additional(3 * PERMIT_GRANULARITY_BYTES));
assert_eq!(manager.used_bytes(), 8 * PERMIT_GRANULARITY_BYTES); assert_eq!(manager.used_bytes(), 8 * PERMIT_GRANULARITY_BYTES);
// Request more (3MB) - should fail (would exceed 10MB limit) // Request more (3MB) - should fail (would exceed 10MB limit)
let result = guard.try_acquire_additional(3 * PERMIT_GRANULARITY_BYTES); let result = guard.request_additional(3 * PERMIT_GRANULARITY_BYTES);
assert!(!result); assert!(!result);
// Still at 8MB // Still at 8MB
@@ -119,7 +119,7 @@ fn test_request_additional_auto_release_on_guard_drop() {
let mut guard = manager.try_acquire(5 * PERMIT_GRANULARITY_BYTES).unwrap(); let mut guard = manager.try_acquire(5 * PERMIT_GRANULARITY_BYTES).unwrap();
// Request additional - memory is merged into guard // Request additional - memory is merged into guard
assert!(guard.try_acquire_additional(3 * PERMIT_GRANULARITY_BYTES)); assert!(guard.request_additional(3 * PERMIT_GRANULARITY_BYTES));
assert_eq!(manager.used_bytes(), 8 * PERMIT_GRANULARITY_BYTES); assert_eq!(manager.used_bytes(), 8 * PERMIT_GRANULARITY_BYTES);
// When guard drops, all memory (base + additional) is released together // When guard drops, all memory (base + additional) is released together
@@ -135,7 +135,7 @@ fn test_request_additional_unlimited() {
let mut guard = manager.try_acquire(5 * PERMIT_GRANULARITY_BYTES).unwrap(); let mut guard = manager.try_acquire(5 * PERMIT_GRANULARITY_BYTES).unwrap();
// Should always succeed with unlimited manager // Should always succeed with unlimited manager
assert!(guard.try_acquire_additional(100 * PERMIT_GRANULARITY_BYTES)); assert!(guard.request_additional(100 * PERMIT_GRANULARITY_BYTES));
assert_eq!(guard.granted_bytes(), 0); assert_eq!(guard.granted_bytes(), 0);
assert_eq!(manager.used_bytes(), 0); assert_eq!(manager.used_bytes(), 0);
} }
@@ -148,7 +148,7 @@ fn test_request_additional_zero_bytes() {
let mut guard = manager.try_acquire(5 * PERMIT_GRANULARITY_BYTES).unwrap(); let mut guard = manager.try_acquire(5 * PERMIT_GRANULARITY_BYTES).unwrap();
// Request 0 bytes should succeed without affecting anything // Request 0 bytes should succeed without affecting anything
assert!(guard.try_acquire_additional(0)); assert!(guard.request_additional(0));
assert_eq!(guard.granted_bytes(), 5 * PERMIT_GRANULARITY_BYTES); assert_eq!(guard.granted_bytes(), 5 * PERMIT_GRANULARITY_BYTES);
assert_eq!(manager.used_bytes(), 5 * PERMIT_GRANULARITY_BYTES); assert_eq!(manager.used_bytes(), 5 * PERMIT_GRANULARITY_BYTES);
} }
@@ -162,7 +162,7 @@ fn test_early_release_partial_success() {
assert_eq!(manager.used_bytes(), 8 * PERMIT_GRANULARITY_BYTES); assert_eq!(manager.used_bytes(), 8 * PERMIT_GRANULARITY_BYTES);
// Release half // Release half
assert!(guard.release_partial(4 * PERMIT_GRANULARITY_BYTES)); assert!(guard.early_release_partial(4 * PERMIT_GRANULARITY_BYTES));
assert_eq!(guard.granted_bytes(), 4 * PERMIT_GRANULARITY_BYTES); assert_eq!(guard.granted_bytes(), 4 * PERMIT_GRANULARITY_BYTES);
assert_eq!(manager.used_bytes(), 4 * PERMIT_GRANULARITY_BYTES); assert_eq!(manager.used_bytes(), 4 * PERMIT_GRANULARITY_BYTES);
@@ -177,7 +177,7 @@ fn test_early_release_partial_exceeds_granted() {
let mut guard = manager.try_acquire(5 * PERMIT_GRANULARITY_BYTES).unwrap(); let mut guard = manager.try_acquire(5 * PERMIT_GRANULARITY_BYTES).unwrap();
// Try to release more than granted - should fail // Try to release more than granted - should fail
assert!(!guard.release_partial(10 * PERMIT_GRANULARITY_BYTES)); assert!(!guard.early_release_partial(10 * PERMIT_GRANULARITY_BYTES));
assert_eq!(guard.granted_bytes(), 5 * PERMIT_GRANULARITY_BYTES); assert_eq!(guard.granted_bytes(), 5 * PERMIT_GRANULARITY_BYTES);
assert_eq!(manager.used_bytes(), 5 * PERMIT_GRANULARITY_BYTES); assert_eq!(manager.used_bytes(), 5 * PERMIT_GRANULARITY_BYTES);
} }
@@ -188,7 +188,7 @@ fn test_early_release_partial_unlimited() {
let mut guard = manager.try_acquire(100 * PERMIT_GRANULARITY_BYTES).unwrap(); let mut guard = manager.try_acquire(100 * PERMIT_GRANULARITY_BYTES).unwrap();
// Unlimited guard - release should succeed (no-op) // Unlimited guard - release should succeed (no-op)
assert!(guard.release_partial(50 * PERMIT_GRANULARITY_BYTES)); assert!(guard.early_release_partial(50 * PERMIT_GRANULARITY_BYTES));
assert_eq!(guard.granted_bytes(), 0); assert_eq!(guard.granted_bytes(), 0);
} }
@@ -200,22 +200,22 @@ fn test_request_and_early_release_symmetry() {
let mut guard = manager.try_acquire(5 * PERMIT_GRANULARITY_BYTES).unwrap(); let mut guard = manager.try_acquire(5 * PERMIT_GRANULARITY_BYTES).unwrap();
// Request additional // Request additional
assert!(guard.try_acquire_additional(5 * PERMIT_GRANULARITY_BYTES)); assert!(guard.request_additional(5 * PERMIT_GRANULARITY_BYTES));
assert_eq!(guard.granted_bytes(), 10 * PERMIT_GRANULARITY_BYTES); assert_eq!(guard.granted_bytes(), 10 * PERMIT_GRANULARITY_BYTES);
assert_eq!(manager.used_bytes(), 10 * PERMIT_GRANULARITY_BYTES); assert_eq!(manager.used_bytes(), 10 * PERMIT_GRANULARITY_BYTES);
// Early release some // Early release some
assert!(guard.release_partial(3 * PERMIT_GRANULARITY_BYTES)); assert!(guard.early_release_partial(3 * PERMIT_GRANULARITY_BYTES));
assert_eq!(guard.granted_bytes(), 7 * PERMIT_GRANULARITY_BYTES); assert_eq!(guard.granted_bytes(), 7 * PERMIT_GRANULARITY_BYTES);
assert_eq!(manager.used_bytes(), 7 * PERMIT_GRANULARITY_BYTES); assert_eq!(manager.used_bytes(), 7 * PERMIT_GRANULARITY_BYTES);
// Request again // Request again
assert!(guard.try_acquire_additional(2 * PERMIT_GRANULARITY_BYTES)); assert!(guard.request_additional(2 * PERMIT_GRANULARITY_BYTES));
assert_eq!(guard.granted_bytes(), 9 * PERMIT_GRANULARITY_BYTES); assert_eq!(guard.granted_bytes(), 9 * PERMIT_GRANULARITY_BYTES);
assert_eq!(manager.used_bytes(), 9 * PERMIT_GRANULARITY_BYTES); assert_eq!(manager.used_bytes(), 9 * PERMIT_GRANULARITY_BYTES);
// Early release again // Early release again
assert!(guard.release_partial(4 * PERMIT_GRANULARITY_BYTES)); assert!(guard.early_release_partial(4 * PERMIT_GRANULARITY_BYTES));
assert_eq!(guard.granted_bytes(), 5 * PERMIT_GRANULARITY_BYTES); assert_eq!(guard.granted_bytes(), 5 * PERMIT_GRANULARITY_BYTES);
assert_eq!(manager.used_bytes(), 5 * PERMIT_GRANULARITY_BYTES); assert_eq!(manager.used_bytes(), 5 * PERMIT_GRANULARITY_BYTES);
@@ -226,186 +226,25 @@ fn test_request_and_early_release_symmetry() {
#[test] #[test]
fn test_small_allocation_rounds_up() { fn test_small_allocation_rounds_up() {
// Test that allocations smaller than PERMIT_GRANULARITY_BYTES // Test that allocations smaller than PERMIT_GRANULARITY_BYTES
// round up to 1 permit and can use try_acquire_additional() // round up to 1 permit and can use request_additional()
let limit = 10 * PERMIT_GRANULARITY_BYTES; let limit = 10 * PERMIT_GRANULARITY_BYTES;
let manager = MemoryManager::new(limit, NoOpMetrics); let manager = MemoryManager::new(limit, NoOpMetrics);
let mut guard = manager.try_acquire(512 * 1024).unwrap(); // 512KB let mut guard = manager.try_acquire(512 * 1024).unwrap(); // 512KB
assert_eq!(guard.granted_bytes(), PERMIT_GRANULARITY_BYTES); // Rounds up to 1MB assert_eq!(guard.granted_bytes(), PERMIT_GRANULARITY_BYTES); // Rounds up to 1MB
assert!(guard.try_acquire_additional(2 * PERMIT_GRANULARITY_BYTES)); // Can request more assert!(guard.request_additional(2 * PERMIT_GRANULARITY_BYTES)); // Can request more
assert_eq!(guard.granted_bytes(), 3 * PERMIT_GRANULARITY_BYTES); assert_eq!(guard.granted_bytes(), 3 * PERMIT_GRANULARITY_BYTES);
} }
#[test] #[test]
fn test_acquire_zero_bytes_lazy_allocation() { fn test_acquire_zero_bytes_lazy_allocation() {
// Test that acquire(0) returns 0 permits but can try_acquire_additional() later // Test that acquire(0) returns 0 permits but can request_additional() later
let manager = MemoryManager::new(10 * PERMIT_GRANULARITY_BYTES, NoOpMetrics); let manager = MemoryManager::new(10 * PERMIT_GRANULARITY_BYTES, NoOpMetrics);
let mut guard = manager.try_acquire(0).unwrap(); let mut guard = manager.try_acquire(0).unwrap();
assert_eq!(guard.granted_bytes(), 0); // No permits consumed assert_eq!(guard.granted_bytes(), 0); // No permits consumed
assert_eq!(manager.used_bytes(), 0); assert_eq!(manager.used_bytes(), 0);
assert!(guard.try_acquire_additional(3 * PERMIT_GRANULARITY_BYTES)); // Lazy allocation assert!(guard.request_additional(3 * PERMIT_GRANULARITY_BYTES)); // Lazy allocation
assert_eq!(guard.granted_bytes(), 3 * PERMIT_GRANULARITY_BYTES); assert_eq!(guard.granted_bytes(), 3 * PERMIT_GRANULARITY_BYTES);
} }
#[tokio::test(flavor = "current_thread")]
async fn test_acquire_additional_blocks_and_unblocks() {
let limit = 10 * PERMIT_GRANULARITY_BYTES;
let manager = MemoryManager::new(limit, NoOpMetrics);
// First guard takes 9MB, leaving only 1MB available
let mut guard1 = manager.try_acquire(9 * PERMIT_GRANULARITY_BYTES).unwrap();
assert_eq!(manager.used_bytes(), 9 * PERMIT_GRANULARITY_BYTES);
// Spawn a task that will block trying to acquire additional 5MB (needs total 10MB available)
let manager_clone = manager.clone();
let waiter = tokio::spawn(async move {
let mut guard2 = manager_clone.try_acquire(0).unwrap();
// This will block until enough memory is available
guard2
.acquire_additional(5 * PERMIT_GRANULARITY_BYTES)
.await
.unwrap();
guard2
});
sleep(Duration::from_millis(10)).await;
// Release 5MB from guard1 - this should unblock the waiter
assert!(guard1.release_partial(5 * PERMIT_GRANULARITY_BYTES));
// Waiter should complete now
let guard2 = waiter.await.unwrap();
assert_eq!(guard2.granted_bytes(), 5 * PERMIT_GRANULARITY_BYTES);
// Total: guard1 has 4MB, guard2 has 5MB = 9MB
assert_eq!(manager.used_bytes(), 9 * PERMIT_GRANULARITY_BYTES);
}
#[tokio::test(flavor = "current_thread")]
async fn test_acquire_additional_exceeds_total_limit() {
let limit = 10 * PERMIT_GRANULARITY_BYTES;
let manager = MemoryManager::new(limit, NoOpMetrics);
let mut guard = manager.try_acquire(8 * PERMIT_GRANULARITY_BYTES).unwrap();
// Try to acquire additional 5MB - would exceed total limit of 10MB
let result = guard.acquire_additional(5 * PERMIT_GRANULARITY_BYTES).await;
assert!(result.is_err());
// Guard should remain unchanged
assert_eq!(guard.granted_bytes(), 8 * PERMIT_GRANULARITY_BYTES);
assert_eq!(manager.used_bytes(), 8 * PERMIT_GRANULARITY_BYTES);
}
#[tokio::test(flavor = "current_thread")]
async fn test_acquire_additional_success() {
let limit = 10 * PERMIT_GRANULARITY_BYTES;
let manager = MemoryManager::new(limit, NoOpMetrics);
let mut guard = manager.try_acquire(3 * PERMIT_GRANULARITY_BYTES).unwrap();
assert_eq!(manager.used_bytes(), 3 * PERMIT_GRANULARITY_BYTES);
// Acquire additional 4MB - should succeed
guard
.acquire_additional(4 * PERMIT_GRANULARITY_BYTES)
.await
.unwrap();
assert_eq!(guard.granted_bytes(), 7 * PERMIT_GRANULARITY_BYTES);
assert_eq!(manager.used_bytes(), 7 * PERMIT_GRANULARITY_BYTES);
}
#[tokio::test(flavor = "current_thread")]
async fn test_acquire_additional_with_policy_wait_success() {
use crate::policy::OnExhaustedPolicy;
let limit = 10 * PERMIT_GRANULARITY_BYTES;
let manager = MemoryManager::new(limit, NoOpMetrics);
let mut guard1 = manager.try_acquire(8 * PERMIT_GRANULARITY_BYTES).unwrap();
let manager_clone = manager.clone();
let waiter = tokio::spawn(async move {
let mut guard2 = manager_clone.try_acquire(0).unwrap();
// Wait policy with 1 second timeout
guard2
.acquire_additional_with_policy(
5 * PERMIT_GRANULARITY_BYTES,
OnExhaustedPolicy::Wait {
timeout: Duration::from_secs(1),
},
)
.await
.unwrap();
guard2
});
sleep(Duration::from_millis(10)).await;
// Release memory to unblock waiter
assert!(guard1.release_partial(5 * PERMIT_GRANULARITY_BYTES));
let guard2 = waiter.await.unwrap();
assert_eq!(guard2.granted_bytes(), 5 * PERMIT_GRANULARITY_BYTES);
}
#[tokio::test(flavor = "current_thread")]
async fn test_acquire_additional_with_policy_wait_timeout() {
use crate::policy::OnExhaustedPolicy;
let limit = 10 * PERMIT_GRANULARITY_BYTES;
let manager = MemoryManager::new(limit, NoOpMetrics);
// Take all memory
let _guard1 = manager.try_acquire(10 * PERMIT_GRANULARITY_BYTES).unwrap();
let mut guard2 = manager.try_acquire(0).unwrap();
// Try to acquire with short timeout - should timeout
let result = guard2
.acquire_additional_with_policy(
5 * PERMIT_GRANULARITY_BYTES,
OnExhaustedPolicy::Wait {
timeout: Duration::from_millis(50),
},
)
.await;
assert!(result.is_err());
assert_eq!(guard2.granted_bytes(), 0);
}
#[tokio::test(flavor = "current_thread")]
async fn test_acquire_additional_with_policy_fail() {
use crate::policy::OnExhaustedPolicy;
let limit = 10 * PERMIT_GRANULARITY_BYTES;
let manager = MemoryManager::new(limit, NoOpMetrics);
let _guard1 = manager.try_acquire(8 * PERMIT_GRANULARITY_BYTES).unwrap();
let mut guard2 = manager.try_acquire(0).unwrap();
// Fail policy - should return error immediately
let result = guard2
.acquire_additional_with_policy(5 * PERMIT_GRANULARITY_BYTES, OnExhaustedPolicy::Fail)
.await;
assert!(result.is_err());
assert_eq!(guard2.granted_bytes(), 0);
}
#[tokio::test(flavor = "current_thread")]
async fn test_acquire_additional_unlimited() {
let manager = MemoryManager::new(0, NoOpMetrics); // Unlimited
let mut guard = manager.try_acquire(0).unwrap();
// Should always succeed with unlimited manager
guard
.acquire_additional(1000 * PERMIT_GRANULARITY_BYTES)
.await
.unwrap();
assert_eq!(guard.granted_bytes(), 0);
assert_eq!(manager.used_bytes(), 0);
}
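The async acquire paths (and their tests above) are removed, leaving the synchronous request_additional/early_release_partial flow. As a mental model for reviewers, here is a standalone sketch of the permit rounding those surviving tests rely on; the 1 MiB granularity and the round-up rule are inferred from the test comments, not shown directly in this diff:

const PERMIT_GRANULARITY_BYTES: u64 = 1024 * 1024; // assumed 1 MiB, as the test comments suggest

fn bytes_to_permits(bytes: u64) -> u64 {
    // 0 bytes -> 0 permits (lazy allocation); anything else rounds up to whole permits.
    bytes.div_ceil(PERMIT_GRANULARITY_BYTES)
}

fn main() {
    assert_eq!(bytes_to_permits(0), 0); // acquire(0) consumes nothing
    assert_eq!(bytes_to_permits(512 * 1024), 1); // 512 KiB rounds up to one permit
    assert_eq!(bytes_to_permits(3 * PERMIT_GRANULARITY_BYTES), 3);
}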

View File

@@ -66,7 +66,7 @@ use store_api::metric_engine_consts::{
}; };
use store_api::region_engine::{ use store_api::region_engine::{
RegionEngineRef, RegionManifestInfo, RegionRole, RegionStatistic, SetRegionRoleStateResponse, RegionEngineRef, RegionManifestInfo, RegionRole, RegionStatistic, SetRegionRoleStateResponse,
SettableRegionRoleState, SyncRegionFromRequest, SettableRegionRoleState,
}; };
use store_api::region_request::{ use store_api::region_request::{
AffectedRows, BatchRegionDdlRequest, RegionCatchupRequest, RegionCloseRequest, AffectedRows, BatchRegionDdlRequest, RegionCatchupRequest, RegionCloseRequest,
@@ -536,13 +536,10 @@ impl RegionServer {
let tracing_context = TracingContext::from_current_span(); let tracing_context = TracingContext::from_current_span();
let span = tracing_context.attach(info_span!("RegionServer::handle_sync_region_request")); let span = tracing_context.attach(info_span!("RegionServer::handle_sync_region_request"));
self.sync_region( self.sync_region(region_id, manifest_info)
region_id, .trace(span)
SyncRegionFromRequest::from_manifest(manifest_info), .await
) .map(|_| RegionResponse::new(AffectedRows::default()))
.trace(span)
.await
.map(|_| RegionResponse::new(AffectedRows::default()))
} }
/// Handles the ListMetadata request and retrieves metadata for specified regions. /// Handles the ListMetadata request and retrieves metadata for specified regions.
@@ -591,7 +588,7 @@ impl RegionServer {
pub async fn sync_region( pub async fn sync_region(
&self, &self,
region_id: RegionId, region_id: RegionId,
request: SyncRegionFromRequest, manifest_info: RegionManifestInfo,
) -> Result<()> { ) -> Result<()> {
let engine_with_status = self let engine_with_status = self
.inner .inner
@@ -600,7 +597,7 @@ impl RegionServer {
.with_context(|| RegionNotFoundSnafu { region_id })?; .with_context(|| RegionNotFoundSnafu { region_id })?;
self.inner self.inner
.handle_sync_region(engine_with_status.engine(), region_id, request) .handle_sync_region(engine_with_status.engine(), region_id, manifest_info)
.await .await
} }
@@ -1219,8 +1216,7 @@ impl RegionServerInner {
| RegionRequest::Compact(_) | RegionRequest::Compact(_)
| RegionRequest::Truncate(_) | RegionRequest::Truncate(_)
| RegionRequest::BuildIndex(_) | RegionRequest::BuildIndex(_)
| RegionRequest::EnterStaging(_) | RegionRequest::EnterStaging(_) => RegionChange::None,
| RegionRequest::ApplyStagingManifest(_) => RegionChange::None,
RegionRequest::Catchup(_) => RegionChange::Catchup, RegionRequest::Catchup(_) => RegionChange::Catchup,
}; };
@@ -1272,10 +1268,10 @@ impl RegionServerInner {
&self, &self,
engine: &RegionEngineRef, engine: &RegionEngineRef,
region_id: RegionId, region_id: RegionId,
request: SyncRegionFromRequest, manifest_info: RegionManifestInfo,
) -> Result<()> { ) -> Result<()> {
let Some(new_opened_regions) = engine let Some(new_opened_regions) = engine
.sync_region(region_id, request) .sync_region(region_id, manifest_info)
.await .await
.with_context(|_| HandleRegionRequestSnafu { region_id })? .with_context(|_| HandleRegionRequestSnafu { region_id })?
.new_opened_logical_region_ids() .new_opened_logical_region_ids()

View File

@@ -33,9 +33,9 @@ use servers::grpc::FlightCompression;
use session::context::QueryContextRef; use session::context::QueryContextRef;
use store_api::metadata::RegionMetadataRef; use store_api::metadata::RegionMetadataRef;
use store_api::region_engine::{ use store_api::region_engine::{
RegionEngine, RegionRole, RegionScannerRef, RegionStatistic, RemapManifestsRequest, CopyRegionFromRequest, CopyRegionFromResponse, RegionEngine, RegionManifestInfo, RegionRole,
RemapManifestsResponse, SetRegionRoleStateResponse, SettableRegionRoleState, RegionScannerRef, RegionStatistic, RemapManifestsRequest, RemapManifestsResponse,
SyncRegionFromRequest, SyncRegionFromResponse, SetRegionRoleStateResponse, SettableRegionRoleState, SyncManifestResponse,
}; };
use store_api::region_request::{AffectedRows, RegionRequest}; use store_api::region_request::{AffectedRows, RegionRequest};
use store_api::storage::{RegionId, ScanRequest, SequenceNumber}; use store_api::storage::{RegionId, ScanRequest, SequenceNumber};
@@ -287,8 +287,8 @@ impl RegionEngine for MockRegionEngine {
async fn sync_region( async fn sync_region(
&self, &self,
_region_id: RegionId, _region_id: RegionId,
_request: SyncRegionFromRequest, _manifest_info: RegionManifestInfo,
) -> Result<SyncRegionFromResponse, BoxedError> { ) -> Result<SyncManifestResponse, BoxedError> {
unimplemented!() unimplemented!()
} }
@@ -299,6 +299,14 @@ impl RegionEngine for MockRegionEngine {
unimplemented!() unimplemented!()
} }
async fn copy_region_from(
&self,
_region_id: RegionId,
_request: CopyRegionFromRequest,
) -> Result<CopyRegionFromResponse, BoxedError> {
unimplemented!()
}
fn as_any(&self) -> &dyn Any { fn as_any(&self) -> &dyn Any {
self self
} }

View File

@@ -19,6 +19,7 @@ use arrow::datatypes::{
Time64NanosecondType, TimeUnit, TimestampMicrosecondType, TimestampMillisecondType, Time64NanosecondType, TimeUnit, TimestampMicrosecondType, TimestampMillisecondType,
TimestampNanosecondType, TimestampSecondType, TimestampNanosecondType, TimestampSecondType,
}; };
use arrow_array::Array;
use common_time::time::Time; use common_time::time::Time;
use common_time::{Duration, Timestamp}; use common_time::{Duration, Timestamp};
@@ -126,3 +127,28 @@ pub fn duration_array_value(array: &ArrayRef, i: usize) -> Duration {
}; };
Duration::new(v, time_unit.into()) Duration::new(v, time_unit.into())
} }
/// Get the string value at index `i` for `Utf8`, `LargeUtf8`, or `Utf8View` arrays.
///
/// Returns `None` when the array type is not a string type or the value is null.
///
/// # Panics
///
/// If index `i` is out of bounds.
pub fn string_array_value_at_index(array: &ArrayRef, i: usize) -> Option<&str> {
match array.data_type() {
DataType::Utf8 => {
let array = array.as_string::<i32>();
array.is_valid(i).then(|| array.value(i))
}
DataType::LargeUtf8 => {
let array = array.as_string::<i64>();
array.is_valid(i).then(|| array.value(i))
}
DataType::Utf8View => {
let array = array.as_string_view();
array.is_valid(i).then(|| array.value(i))
}
_ => None,
}
}
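A hedged usage sketch for the new string_array_value_at_index helper: because the helper lives inside this crate, the snippet below inlines the same dispatch under a local name (value_at) so it builds standalone; the arrow_array/arrow_schema crate paths are assumptions for that standalone build only.

use std::sync::Arc;

use arrow_array::{Array, ArrayRef, LargeStringArray, StringArray, cast::AsArray};
use arrow_schema::DataType;

fn value_at(array: &ArrayRef, i: usize) -> Option<&str> {
    // Same dispatch as string_array_value_at_index above: only string types yield a value.
    match array.data_type() {
        DataType::Utf8 => {
            let a = array.as_string::<i32>();
            a.is_valid(i).then(|| a.value(i))
        }
        DataType::LargeUtf8 => {
            let a = array.as_string::<i64>();
            a.is_valid(i).then(|| a.value(i))
        }
        DataType::Utf8View => {
            let a = array.as_string_view();
            a.is_valid(i).then(|| a.value(i))
        }
        _ => None,
    }
}

fn main() {
    let utf8: ArrayRef = Arc::new(StringArray::from(vec![Some("a"), None]));
    let large: ArrayRef = Arc::new(LargeStringArray::from(vec!["b"]));
    assert_eq!(value_at(&utf8, 0), Some("a"));
    assert_eq!(value_at(&utf8, 1), None); // null value
    assert_eq!(value_at(&large, 0), Some("b"));
}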

View File

@@ -26,9 +26,10 @@ use object_store::ObjectStore;
use snafu::{OptionExt, ensure}; use snafu::{OptionExt, ensure};
use store_api::metadata::RegionMetadataRef; use store_api::metadata::RegionMetadataRef;
use store_api::region_engine::{ use store_api::region_engine::{
RegionEngine, RegionRole, RegionScannerRef, RegionStatistic, RemapManifestsRequest, CopyRegionFromRequest, CopyRegionFromResponse, RegionEngine, RegionManifestInfo, RegionRole,
RemapManifestsResponse, SetRegionRoleStateResponse, SetRegionRoleStateSuccess, RegionScannerRef, RegionStatistic, RemapManifestsRequest, RemapManifestsResponse,
SettableRegionRoleState, SinglePartitionScanner, SyncRegionFromRequest, SyncRegionFromResponse, SetRegionRoleStateResponse, SetRegionRoleStateSuccess, SettableRegionRoleState,
SinglePartitionScanner, SyncManifestResponse,
}; };
use store_api::region_request::{ use store_api::region_request::{
AffectedRows, RegionCloseRequest, RegionCreateRequest, RegionDropRequest, RegionOpenRequest, AffectedRows, RegionCloseRequest, RegionCreateRequest, RegionDropRequest, RegionOpenRequest,
@@ -144,10 +145,10 @@ impl RegionEngine for FileRegionEngine {
async fn sync_region( async fn sync_region(
&self, &self,
_region_id: RegionId, _region_id: RegionId,
_request: SyncRegionFromRequest, _manifest_info: RegionManifestInfo,
) -> Result<SyncRegionFromResponse, BoxedError> { ) -> Result<SyncManifestResponse, BoxedError> {
// File engine doesn't need to sync region manifest. // File engine doesn't need to sync region manifest.
Ok(SyncRegionFromResponse::NotSupported) Ok(SyncManifestResponse::NotSupported)
} }
async fn remap_manifests( async fn remap_manifests(
@@ -162,6 +163,19 @@ impl RegionEngine for FileRegionEngine {
)) ))
} }
async fn copy_region_from(
&self,
_region_id: RegionId,
_request: CopyRegionFromRequest,
) -> Result<CopyRegionFromResponse, BoxedError> {
Err(BoxedError::new(
UnsupportedSnafu {
operation: "copy_region_from",
}
.build(),
))
}
fn role(&self, region_id: RegionId) -> Option<RegionRole> { fn role(&self, region_id: RegionId) -> Option<RegionRole> {
self.inner.state(region_id) self.inner.state(region_id)
} }

View File

@@ -60,7 +60,6 @@ where
http_server_builder: Option<HttpServerBuilder>, http_server_builder: Option<HttpServerBuilder>,
plugins: Plugins, plugins: Plugins,
flight_handler: Option<FlightCraftRef>, flight_handler: Option<FlightCraftRef>,
pub server_memory_limiter: ServerMemoryLimiter,
} }
impl<T> Services<T> impl<T> Services<T>
@@ -68,13 +67,6 @@ where
T: Into<FrontendOptions> + Configurable + Clone, T: Into<FrontendOptions> + Configurable + Clone,
{ {
pub fn new(opts: T, instance: Arc<Instance>, plugins: Plugins) -> Self { pub fn new(opts: T, instance: Arc<Instance>, plugins: Plugins) -> Self {
let feopts = opts.clone().into();
// Create server request memory limiter for all server protocols
let server_memory_limiter = ServerMemoryLimiter::new(
feopts.max_in_flight_write_bytes.as_bytes(),
feopts.write_bytes_exhausted_policy,
);
Self { Self {
opts, opts,
instance, instance,
@@ -82,7 +74,6 @@ where
http_server_builder: None, http_server_builder: None,
plugins, plugins,
flight_handler: None, flight_handler: None,
server_memory_limiter,
} }
} }
@@ -283,6 +274,12 @@ where
let toml = opts.to_toml().context(TomlFormatSnafu)?; let toml = opts.to_toml().context(TomlFormatSnafu)?;
let opts: FrontendOptions = opts.into(); let opts: FrontendOptions = opts.into();
// Create request memory limiter for all server protocols
let request_memory_limiter = ServerMemoryLimiter::new(
opts.max_in_flight_write_bytes.as_bytes(),
opts.write_bytes_exhausted_policy,
);
let handlers = ServerHandlers::default(); let handlers = ServerHandlers::default();
let user_provider = self.plugins.get::<UserProviderRef>(); let user_provider = self.plugins.get::<UserProviderRef>();
@@ -295,7 +292,7 @@ where
&opts.meta_client, &opts.meta_client,
None, None,
true, true,
self.server_memory_limiter.clone(), request_memory_limiter.clone(),
)?; )?;
handlers.insert((Box::new(grpc_server), grpc_addr)); handlers.insert((Box::new(grpc_server), grpc_addr));
} }
@@ -308,7 +305,7 @@ where
&opts.meta_client, &opts.meta_client,
Some("INTERNAL_GRPC_SERVER".to_string()), Some("INTERNAL_GRPC_SERVER".to_string()),
false, false,
self.server_memory_limiter.clone(), request_memory_limiter.clone(),
)?; )?;
handlers.insert((Box::new(grpc_server), grpc_addr)); handlers.insert((Box::new(grpc_server), grpc_addr));
} }
@@ -318,7 +315,7 @@ where
let http_options = &opts.http; let http_options = &opts.http;
let http_addr = parse_addr(&http_options.addr)?; let http_addr = parse_addr(&http_options.addr)?;
let http_server = let http_server =
self.build_http_server(&opts, toml, self.server_memory_limiter.clone())?; self.build_http_server(&opts, toml, request_memory_limiter.clone())?;
handlers.insert((Box::new(http_server), http_addr)); handlers.insert((Box::new(http_server), http_addr));
} }

View File

@@ -17,7 +17,6 @@ use common_error::ext::{BoxedError, ErrorExt};
use common_error::status_code::StatusCode; use common_error::status_code::StatusCode;
use common_macro::stack_trace_debug; use common_macro::stack_trace_debug;
use common_meta::DatanodeId; use common_meta::DatanodeId;
use common_procedure::ProcedureId;
use common_runtime::JoinError; use common_runtime::JoinError;
use snafu::{Location, Snafu}; use snafu::{Location, Snafu};
use store_api::storage::RegionId; use store_api::storage::RegionId;
@@ -769,35 +768,6 @@ pub enum Error {
location: Location, location: Location,
}, },
#[snafu(display("Failed to create repartition subtasks"))]
RepartitionCreateSubtasks {
source: partition::error::Error,
#[snafu(implicit)]
location: Location,
},
#[snafu(display(
"Source partition expression '{}' does not match any existing region",
expr
))]
RepartitionSourceExprMismatch {
expr: String,
#[snafu(implicit)]
location: Location,
},
#[snafu(display(
"Failed to get the state receiver for repartition subprocedure {}",
procedure_id
))]
RepartitionSubprocedureStateReceiver {
procedure_id: ProcedureId,
#[snafu(source)]
source: common_procedure::Error,
#[snafu(implicit)]
location: Location,
},
#[snafu(display("Unsupported operation {}", operation))] #[snafu(display("Unsupported operation {}", operation))]
Unsupported { Unsupported {
operation: String, operation: String,
@@ -1143,8 +1113,7 @@ impl ErrorExt for Error {
| Error::LeaderPeerChanged { .. } | Error::LeaderPeerChanged { .. }
| Error::RepartitionSourceRegionMissing { .. } | Error::RepartitionSourceRegionMissing { .. }
| Error::RepartitionTargetRegionMissing { .. } | Error::RepartitionTargetRegionMissing { .. }
| Error::PartitionExprMismatch { .. } | Error::PartitionExprMismatch { .. } => StatusCode::InvalidArguments,
| Error::RepartitionSourceExprMismatch { .. } => StatusCode::InvalidArguments,
Error::LeaseKeyFromUtf8 { .. } Error::LeaseKeyFromUtf8 { .. }
| Error::LeaseValueFromUtf8 { .. } | Error::LeaseValueFromUtf8 { .. }
| Error::InvalidRegionKeyFromUtf8 { .. } | Error::InvalidRegionKeyFromUtf8 { .. }
@@ -1204,8 +1173,6 @@ impl ErrorExt for Error {
Error::BuildTlsOptions { source, .. } => source.status_code(), Error::BuildTlsOptions { source, .. } => source.status_code(),
Error::Other { source, .. } => source.status_code(), Error::Other { source, .. } => source.status_code(),
Error::RepartitionCreateSubtasks { source, .. } => source.status_code(),
Error::RepartitionSubprocedureStateReceiver { source, .. } => source.status_code(),
Error::NoEnoughAvailableNode { .. } => StatusCode::RuntimeResourcesExhausted, Error::NoEnoughAvailableNode { .. } => StatusCode::RuntimeResourcesExhausted,
#[cfg(feature = "pg_kvbackend")] #[cfg(feature = "pg_kvbackend")]

View File

@@ -12,63 +12,8 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
pub mod allocate_region;
pub mod collect;
pub mod deallocate_region;
pub mod dispatch;
pub mod group; pub mod group;
pub mod plan; pub mod plan;
pub mod repartition_end;
pub mod repartition_start;
use std::any::Any;
use std::fmt::Debug;
use common_meta::cache_invalidator::CacheInvalidatorRef;
use common_meta::key::TableMetadataManagerRef;
use common_procedure::{Context as ProcedureContext, Status};
use serde::{Deserialize, Serialize};
use store_api::storage::TableId;
use crate::error::Result;
use crate::procedure::repartition::plan::RepartitionPlanEntry;
use crate::service::mailbox::MailboxRef;
#[cfg(test)] #[cfg(test)]
pub mod test_util; pub mod test_util;
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
pub struct PersistentContext {
pub catalog_name: String,
pub schema_name: String,
pub table_name: String,
pub table_id: TableId,
pub plans: Vec<RepartitionPlanEntry>,
}
pub struct Context {
pub persistent_ctx: PersistentContext,
pub table_metadata_manager: TableMetadataManagerRef,
pub mailbox: MailboxRef,
pub server_addr: String,
pub cache_invalidator: CacheInvalidatorRef,
}
#[async_trait::async_trait]
#[typetag::serde(tag = "repartition_state")]
pub(crate) trait State: Sync + Send + Debug {
fn name(&self) -> &'static str {
let type_name = std::any::type_name::<Self>();
// short name
type_name.split("::").last().unwrap_or(type_name)
}
/// Yields the next [State] and [Status].
async fn next(
&mut self,
ctx: &mut Context,
procedure_ctx: &ProcedureContext,
) -> Result<(Box<dyn State>, Status)>;
fn as_any(&self) -> &dyn Any;
}

View File

@@ -1,67 +0,0 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use std::any::Any;
use common_procedure::{Context as ProcedureContext, Status};
use serde::{Deserialize, Serialize};
use crate::error::Result;
use crate::procedure::repartition::dispatch::Dispatch;
use crate::procedure::repartition::plan::{AllocationPlanEntry, RepartitionPlanEntry};
use crate::procedure::repartition::{Context, State};
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct AllocateRegion {
plan_entries: Vec<AllocationPlanEntry>,
}
impl AllocateRegion {
pub fn new(plan_entries: Vec<AllocationPlanEntry>) -> Self {
Self { plan_entries }
}
}
#[async_trait::async_trait]
#[typetag::serde]
impl State for AllocateRegion {
async fn next(
&mut self,
ctx: &mut Context,
_procedure_ctx: &ProcedureContext,
) -> Result<(Box<dyn State>, Status)> {
let region_to_allocate = self
.plan_entries
.iter()
.map(|p| p.regions_to_allocate)
.sum::<usize>();
if region_to_allocate == 0 {
let repartition_plan_entries = self
.plan_entries
.iter()
.map(RepartitionPlanEntry::from_allocation_plan_entry)
.collect::<Vec<_>>();
ctx.persistent_ctx.plans = repartition_plan_entries;
return Ok((Box::new(Dispatch), Status::executing(true)));
}
// TODO(weny): allocate regions.
todo!()
}
fn as_any(&self) -> &dyn Any {
self
}
}

View File

@@ -1,106 +0,0 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use std::any::Any;
use common_procedure::{Context as ProcedureContext, ProcedureId, Status, watcher};
use common_telemetry::error;
use serde::{Deserialize, Serialize};
use snafu::ResultExt;
use crate::error::{RepartitionSubprocedureStateReceiverSnafu, Result};
use crate::procedure::repartition::deallocate_region::DeallocateRegion;
use crate::procedure::repartition::group::GroupId;
use crate::procedure::repartition::{Context, State};
/// Metadata for tracking a dispatched sub-procedure.
#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq)]
pub struct ProcedureMeta {
/// The index of the plan entry in the parent procedure's plan list.
pub plan_index: usize,
/// The group id of the repartition group.
pub group_id: GroupId,
/// The procedure id of the sub-procedure.
pub procedure_id: ProcedureId,
}
/// State for collecting results from dispatched sub-procedures.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Collect {
/// Sub-procedures that are currently in-flight.
pub inflight_procedures: Vec<ProcedureMeta>,
/// Sub-procedures that have completed successfully.
pub succeeded_procedures: Vec<ProcedureMeta>,
/// Sub-procedures that have failed.
pub failed_procedures: Vec<ProcedureMeta>,
/// Sub-procedures whose state could not be determined.
pub unknown_procedures: Vec<ProcedureMeta>,
}
impl Collect {
pub fn new(inflight_procedures: Vec<ProcedureMeta>) -> Self {
Self {
inflight_procedures,
succeeded_procedures: Vec::new(),
failed_procedures: Vec::new(),
unknown_procedures: Vec::new(),
}
}
}
#[async_trait::async_trait]
#[typetag::serde]
impl State for Collect {
async fn next(
&mut self,
_ctx: &mut Context,
procedure_ctx: &ProcedureContext,
) -> Result<(Box<dyn State>, Status)> {
for procedure_meta in self.inflight_procedures.iter() {
let procedure_id = procedure_meta.procedure_id;
let group_id = procedure_meta.group_id;
let Some(mut receiver) = procedure_ctx
.provider
.procedure_state_receiver(procedure_id)
.await
.context(RepartitionSubprocedureStateReceiverSnafu { procedure_id })?
else {
error!(
"failed to get procedure state receiver, procedure_id: {}, group_id: {}",
procedure_id, group_id
);
self.unknown_procedures.push(*procedure_meta);
continue;
};
match watcher::wait(&mut receiver).await {
Ok(_) => self.succeeded_procedures.push(*procedure_meta),
Err(e) => {
error!(e; "failed to wait for repartition subprocedure, procedure_id: {}, group_id: {}", procedure_id, group_id);
self.failed_procedures.push(*procedure_meta);
}
}
}
if !self.failed_procedures.is_empty() || !self.unknown_procedures.is_empty() {
// TODO(weny): retry the failed or unknown procedures.
}
Ok((Box::new(DeallocateRegion), Status::executing(true)))
}
fn as_any(&self) -> &dyn Any {
self
}
}
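
The loop above obtains a state watcher for each in-flight sub-procedure and buckets the outcome into succeeded, failed, or unknown. A simplified, self-contained model of that bucketing is sketched below; the `Option<Result<…>>` outcomes stand in for the procedure framework's receiver plus `watcher::wait`, and the numeric ids are purely illustrative.

fn main() {
    // Each entry: (procedure_id, outcome). `None` models a missing state
    // receiver (unknown), `Some(Ok(()))` a successful wait, `Some(Err(_))`
    // a failed sub-procedure.
    let inflight: Vec<(u64, Option<Result<(), &str>>)> = vec![
        (1, Some(Ok(()))),
        (2, Some(Err("sub-procedure failed"))),
        (3, None),
    ];

    let (mut succeeded, mut failed, mut unknown) = (vec![], vec![], vec![]);
    for (procedure_id, outcome) in inflight {
        match outcome {
            Some(Ok(())) => succeeded.push(procedure_id),
            Some(Err(_)) => failed.push(procedure_id),
            None => unknown.push(procedure_id),
        }
    }

    let expected: (Vec<u64>, Vec<u64>, Vec<u64>) = (vec![1], vec![2], vec![3]);
    assert_eq!((succeeded, failed, unknown), expected);
}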

View File

@@ -1,52 +0,0 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use std::any::Any;
use common_procedure::{Context as ProcedureContext, Status};
use serde::{Deserialize, Serialize};
use crate::error::Result;
use crate::procedure::repartition::repartition_end::RepartitionEnd;
use crate::procedure::repartition::{Context, State};
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DeallocateRegion;
#[async_trait::async_trait]
#[typetag::serde]
impl State for DeallocateRegion {
async fn next(
&mut self,
ctx: &mut Context,
_procedure_ctx: &ProcedureContext,
) -> Result<(Box<dyn State>, Status)> {
let region_to_deallocate = ctx
.persistent_ctx
.plans
.iter()
.map(|p| p.pending_deallocate_region_ids.len())
.sum::<usize>();
if region_to_deallocate == 0 {
return Ok((Box::new(RepartitionEnd), Status::done()));
}
// TODO(weny): deallocate regions.
todo!()
}
fn as_any(&self) -> &dyn Any {
self
}
}

View File

@@ -1,66 +0,0 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use std::any::Any;
use common_procedure::{Context as ProcedureContext, ProcedureWithId, Status};
use serde::{Deserialize, Serialize};
use crate::error::Result;
use crate::procedure::repartition::collect::{Collect, ProcedureMeta};
use crate::procedure::repartition::group::RepartitionGroupProcedure;
use crate::procedure::repartition::{self, Context, State};
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Dispatch;
#[async_trait::async_trait]
#[typetag::serde]
impl State for Dispatch {
async fn next(
&mut self,
ctx: &mut Context,
_procedure_ctx: &ProcedureContext,
) -> Result<(Box<dyn State>, Status)> {
let table_id = ctx.persistent_ctx.table_id;
let mut procedures = Vec::with_capacity(ctx.persistent_ctx.plans.len());
let mut procedure_metas = Vec::with_capacity(ctx.persistent_ctx.plans.len());
for (plan_index, plan) in ctx.persistent_ctx.plans.iter().enumerate() {
let persistent_ctx = repartition::group::PersistentContext::new(
plan.group_id,
table_id,
plan.source_regions.clone(),
plan.target_regions.clone(),
);
let group_procedure = RepartitionGroupProcedure::new(persistent_ctx, ctx);
let procedure = ProcedureWithId::with_random_id(Box::new(group_procedure));
procedure_metas.push(ProcedureMeta {
plan_index,
group_id: plan.group_id,
procedure_id: procedure.id,
});
procedures.push(procedure);
}
Ok((
Box::new(Collect::new(procedure_metas)),
Status::suspended(procedures, true),
))
}
fn as_any(&self) -> &dyn Any {
self
}
}

View File

@@ -29,78 +29,19 @@ use common_meta::key::datanode_table::{DatanodeTableKey, DatanodeTableValue, Reg
use common_meta::key::table_route::TableRouteValue; use common_meta::key::table_route::TableRouteValue;
use common_meta::key::{DeserializedValueWithBytes, TableMetadataManagerRef}; use common_meta::key::{DeserializedValueWithBytes, TableMetadataManagerRef};
use common_meta::rpc::router::RegionRoute; use common_meta::rpc::router::RegionRoute;
use common_procedure::{ use common_procedure::{Context as ProcedureContext, Status};
Context as ProcedureContext, LockKey, Procedure, Result as ProcedureResult, Status,
UserMetadata,
};
use serde::{Deserialize, Serialize}; use serde::{Deserialize, Serialize};
use snafu::{OptionExt, ResultExt}; use snafu::{OptionExt, ResultExt};
use store_api::storage::{RegionId, TableId}; use store_api::storage::{RegionId, TableId};
use uuid::Uuid; use uuid::Uuid;
use crate::error::{self, Result}; use crate::error::{self, Result};
use crate::procedure::repartition::group::repartition_start::RepartitionStart;
use crate::procedure::repartition::plan::RegionDescriptor; use crate::procedure::repartition::plan::RegionDescriptor;
use crate::procedure::repartition::{self};
use crate::service::mailbox::MailboxRef; use crate::service::mailbox::MailboxRef;
pub type GroupId = Uuid; pub type GroupId = Uuid;
#[allow(dead_code)] pub struct RepartitionGroupProcedure {}
pub struct RepartitionGroupProcedure {
state: Box<dyn State>,
context: Context,
}
impl RepartitionGroupProcedure {
const TYPE_NAME: &'static str = "metasrv-procedure::RepartitionGroup";
pub fn new(persistent_context: PersistentContext, context: &repartition::Context) -> Self {
let state = Box::new(RepartitionStart);
Self {
state,
context: Context {
persistent_ctx: persistent_context,
cache_invalidator: context.cache_invalidator.clone(),
table_metadata_manager: context.table_metadata_manager.clone(),
mailbox: context.mailbox.clone(),
server_addr: context.server_addr.clone(),
},
}
}
}
#[async_trait::async_trait]
impl Procedure for RepartitionGroupProcedure {
fn type_name(&self) -> &str {
Self::TYPE_NAME
}
async fn execute(&mut self, _ctx: &ProcedureContext) -> ProcedureResult<Status> {
todo!()
}
async fn rollback(&mut self, _: &ProcedureContext) -> ProcedureResult<()> {
todo!()
}
fn rollback_supported(&self) -> bool {
true
}
fn dump(&self) -> ProcedureResult<String> {
todo!()
}
fn lock_key(&self) -> LockKey {
todo!()
}
fn user_metadata(&self) -> Option<UserMetadata> {
todo!()
}
}
pub struct Context { pub struct Context {
pub persistent_ctx: PersistentContext, pub persistent_ctx: PersistentContext,
@@ -114,16 +55,11 @@ pub struct Context {
pub server_addr: String, pub server_addr: String,
} }
/// The result of the group preparation phase, containing validated region routes.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] #[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
pub struct GroupPrepareResult { pub struct GroupPrepareResult {
/// The validated source region routes.
pub source_routes: Vec<RegionRoute>, pub source_routes: Vec<RegionRoute>,
/// The validated target region routes.
pub target_routes: Vec<RegionRoute>, pub target_routes: Vec<RegionRoute>,
/// The primary source region id (first source region), used for retrieving region options.
pub central_region: RegionId, pub central_region: RegionId,
/// The datanode id where the primary source region is located.
pub central_region_datanode_id: DatanodeId, pub central_region_datanode_id: DatanodeId,
} }
@@ -141,23 +77,6 @@ pub struct PersistentContext {
pub group_prepare_result: Option<GroupPrepareResult>, pub group_prepare_result: Option<GroupPrepareResult>,
} }
impl PersistentContext {
pub fn new(
group_id: GroupId,
table_id: TableId,
sources: Vec<RegionDescriptor>,
targets: Vec<RegionDescriptor>,
) -> Self {
Self {
group_id,
table_id,
sources,
targets,
group_prepare_result: None,
}
}
}
impl Context { impl Context {
/// Retrieves the table route value for the given table id. /// Retrieves the table route value for the given table id.
/// ///

View File

@@ -16,79 +16,11 @@ use partition::expr::PartitionExpr;
use serde::{Deserialize, Serialize}; use serde::{Deserialize, Serialize};
use store_api::storage::RegionId; use store_api::storage::RegionId;
use crate::procedure::repartition::group::GroupId;
/// Metadata describing a region involved in the plan. /// Metadata describing a region involved in the plan.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] #[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
pub struct RegionDescriptor { pub struct RegionDescriptor {
/// The region id of the region involved in the plan. /// The region id of the region involved in the plan.
pub region_id: RegionId, pub region_id: RegionId,
/// The partition expression of the region. /// The new partition expression of the region.
pub partition_expr: PartitionExpr, pub partition_expr: PartitionExpr,
} }
/// A plan entry for the region allocation phase, describing source regions
/// and target partition expressions before allocation.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
pub struct AllocationPlanEntry {
/// The group id for this plan entry.
pub group_id: GroupId,
/// Source region descriptors involved in the plan.
pub source_regions: Vec<RegionDescriptor>,
/// The target partition expressions for the new or changed regions.
pub target_partition_exprs: Vec<PartitionExpr>,
/// The number of regions that need to be allocated (target count - source count, if positive).
pub regions_to_allocate: usize,
/// The number of regions that need to be deallocated (source count - target count, if positive).
pub regions_to_deallocate: usize,
}
/// A plan entry for the dispatch phase after region allocation,
/// with concrete source and target region descriptors.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
pub struct RepartitionPlanEntry {
/// The group id for this plan entry.
pub group_id: GroupId,
/// The source region descriptors involved in the plan.
pub source_regions: Vec<RegionDescriptor>,
/// The target region descriptors involved in the plan.
pub target_regions: Vec<RegionDescriptor>,
/// The region ids of the allocated regions.
pub allocated_region_ids: Vec<RegionId>,
/// The region ids of the regions that are pending deallocation.
pub pending_deallocate_region_ids: Vec<RegionId>,
}
impl RepartitionPlanEntry {
/// Converts an allocation plan entry into a repartition plan entry.
///
/// The target regions are derived from the source regions and the target partition expressions.
/// The allocated region ids and pending deallocate region ids are empty.
pub fn from_allocation_plan_entry(
AllocationPlanEntry {
group_id,
source_regions,
target_partition_exprs,
regions_to_allocate,
regions_to_deallocate,
}: &AllocationPlanEntry,
) -> Self {
debug_assert!(*regions_to_allocate == 0 && *regions_to_deallocate == 0);
let target_regions = source_regions
.iter()
.zip(target_partition_exprs.iter())
.map(|(source_region, target_partition_expr)| RegionDescriptor {
region_id: source_region.region_id,
partition_expr: target_partition_expr.clone(),
})
.collect::<Vec<_>>();
Self {
group_id: *group_id,
source_regions: source_regions.clone(),
target_regions,
allocated_region_ids: vec![],
pending_deallocate_region_ids: vec![],
}
}
}
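
The counts and the conversion above reduce to simple arithmetic: allocate when there are more target expressions than source regions, deallocate in the opposite case, and when the counts already match (`from_allocation_plan_entry`), pair each source region id with its new expression. A small self-contained illustration follows; the string region ids and expressions are placeholders for `RegionDescriptor` and `PartitionExpr`.

fn main() {
    // Splitting 2 source regions into 3 target partitions: one region must be allocated.
    let source_regions = ["r1", "r2"];
    let target_exprs = ["a < 10", "10 <= a AND a < 20", "a >= 20"];
    let regions_to_allocate = target_exprs.len().saturating_sub(source_regions.len());
    let regions_to_deallocate = source_regions.len().saturating_sub(target_exprs.len());
    assert_eq!((regions_to_allocate, regions_to_deallocate), (1, 0));

    // When both counts are already equal, target regions reuse the source
    // region ids, zipped with the new partition expressions.
    let sources = ["r1", "r2"];
    let exprs = ["a < 100", "a >= 100"];
    let targets: Vec<(&str, &str)> = sources
        .iter()
        .zip(exprs.iter())
        .map(|(region, expr)| (*region, *expr))
        .collect();
    assert_eq!(targets, [("r1", "a < 100"), ("r2", "a >= 100")]);
}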

View File

@@ -1,40 +0,0 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use std::any::Any;
use common_procedure::{Context as ProcedureContext, Status};
use serde::{Deserialize, Serialize};
use crate::error::Result;
use crate::procedure::repartition::{Context, State};
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct RepartitionEnd;
#[async_trait::async_trait]
#[typetag::serde]
impl State for RepartitionEnd {
async fn next(
&mut self,
_ctx: &mut Context,
_procedure_ctx: &ProcedureContext,
) -> Result<(Box<dyn State>, Status)> {
Ok((Box::new(RepartitionEnd), Status::done()))
}
fn as_any(&self) -> &dyn Any {
self
}
}

View File

@@ -1,172 +0,0 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use std::any::Any;
use common_meta::key::table_route::PhysicalTableRouteValue;
use common_procedure::{Context as ProcedureContext, Status};
use partition::expr::PartitionExpr;
use partition::subtask::{self, RepartitionSubtask};
use serde::{Deserialize, Serialize};
use snafu::{OptionExt, ResultExt};
use uuid::Uuid;
use crate::error::{self, Result};
use crate::procedure::repartition::allocate_region::AllocateRegion;
use crate::procedure::repartition::plan::{AllocationPlanEntry, RegionDescriptor};
use crate::procedure::repartition::repartition_end::RepartitionEnd;
use crate::procedure::repartition::{Context, State};
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct RepartitionStart {
from_exprs: Vec<PartitionExpr>,
to_exprs: Vec<PartitionExpr>,
}
impl RepartitionStart {
pub fn new(from_exprs: Vec<PartitionExpr>, to_exprs: Vec<PartitionExpr>) -> Self {
Self {
from_exprs,
to_exprs,
}
}
}
#[async_trait::async_trait]
#[typetag::serde]
impl State for RepartitionStart {
async fn next(
&mut self,
ctx: &mut Context,
_: &ProcedureContext,
) -> Result<(Box<dyn State>, Status)> {
let (_, table_route) = ctx
.table_metadata_manager
.table_route_manager()
.get_physical_table_route(ctx.persistent_ctx.table_id)
.await
.context(error::TableMetadataManagerSnafu)?;
let plans = Self::build_plan(&table_route, &self.from_exprs, &self.to_exprs)?;
if plans.is_empty() {
return Ok((Box::new(RepartitionEnd), Status::done()));
}
Ok((
Box::new(AllocateRegion::new(plans)),
Status::executing(false),
))
}
fn as_any(&self) -> &dyn Any {
self
}
}
impl RepartitionStart {
#[allow(dead_code)]
fn build_plan(
physical_route: &PhysicalTableRouteValue,
from_exprs: &[PartitionExpr],
to_exprs: &[PartitionExpr],
) -> Result<Vec<AllocationPlanEntry>> {
let subtasks = subtask::create_subtasks(from_exprs, to_exprs)
.context(error::RepartitionCreateSubtasksSnafu)?;
if subtasks.is_empty() {
return Ok(vec![]);
}
let src_descriptors = Self::source_region_descriptors(from_exprs, physical_route)?;
Ok(Self::build_plan_entries(
subtasks,
&src_descriptors,
to_exprs,
))
}
#[allow(dead_code)]
fn build_plan_entries(
subtasks: Vec<RepartitionSubtask>,
source_index: &[RegionDescriptor],
target_exprs: &[PartitionExpr],
) -> Vec<AllocationPlanEntry> {
subtasks
.into_iter()
.map(|subtask| {
let group_id = Uuid::new_v4();
let source_regions = subtask
.from_expr_indices
.iter()
.map(|&idx| source_index[idx].clone())
.collect::<Vec<_>>();
let target_partition_exprs = subtask
.to_expr_indices
.iter()
.map(|&idx| target_exprs[idx].clone())
.collect::<Vec<_>>();
let regions_to_allocate = target_partition_exprs
.len()
.saturating_sub(source_regions.len());
let regions_to_deallocate = source_regions
.len()
.saturating_sub(target_partition_exprs.len());
AllocationPlanEntry {
group_id,
source_regions,
target_partition_exprs,
regions_to_allocate,
regions_to_deallocate,
}
})
.collect::<Vec<_>>()
}
fn source_region_descriptors(
from_exprs: &[PartitionExpr],
physical_route: &PhysicalTableRouteValue,
) -> Result<Vec<RegionDescriptor>> {
let existing_regions = physical_route
.region_routes
.iter()
.map(|route| (route.region.id, route.region.partition_expr()))
.collect::<Vec<_>>();
let descriptors = from_exprs
.iter()
.map(|expr| {
let expr_json = expr
.as_json_str()
.context(error::SerializePartitionExprSnafu)?;
let matched_region_id = existing_regions
.iter()
.find_map(|(region_id, existing_expr)| {
(existing_expr == &expr_json).then_some(*region_id)
})
.with_context(|| error::RepartitionSourceExprMismatchSnafu {
expr: expr_json,
})?;
Ok(RegionDescriptor {
region_id: matched_region_id,
partition_expr: expr.clone(),
})
})
.collect::<Result<Vec<_>>>()?;
Ok(descriptors)
}
}

View File

@@ -23,7 +23,6 @@ common-recordbatch.workspace = true
common-runtime.workspace = true common-runtime.workspace = true
common-telemetry.workspace = true common-telemetry.workspace = true
common-time.workspace = true common-time.workspace = true
chrono.workspace = true
datafusion.workspace = true datafusion.workspace = true
datatypes.workspace = true datatypes.workspace = true
futures-util.workspace = true futures-util.workspace = true

View File

@@ -43,10 +43,10 @@ pub(crate) use state::MetricEngineState;
use store_api::metadata::RegionMetadataRef; use store_api::metadata::RegionMetadataRef;
use store_api::metric_engine_consts::METRIC_ENGINE_NAME; use store_api::metric_engine_consts::METRIC_ENGINE_NAME;
use store_api::region_engine::{ use store_api::region_engine::{
BatchResponses, RegionEngine, RegionRole, RegionScannerRef, RegionStatistic, BatchResponses, CopyRegionFromRequest, CopyRegionFromResponse, RegionEngine,
RemapManifestsRequest, RemapManifestsResponse, SetRegionRoleStateResponse, RegionManifestInfo, RegionRole, RegionScannerRef, RegionStatistic, RemapManifestsRequest,
SetRegionRoleStateSuccess, SettableRegionRoleState, SyncRegionFromRequest, RemapManifestsResponse, SetRegionRoleStateResponse, SetRegionRoleStateSuccess,
SyncRegionFromResponse, SettableRegionRoleState, SyncManifestResponse,
}; };
use store_api::region_request::{ use store_api::region_request::{
BatchRegionDdlRequest, RegionCatchupRequest, RegionOpenRequest, RegionRequest, BatchRegionDdlRequest, RegionCatchupRequest, RegionOpenRequest, RegionRequest,
@@ -220,13 +220,6 @@ impl RegionEngine for MetricEngine {
UnsupportedRegionRequestSnafu { request }.fail() UnsupportedRegionRequestSnafu { request }.fail()
} }
} }
RegionRequest::ApplyStagingManifest(_) => {
if self.inner.is_physical_region(region_id) {
return self.inner.mito.handle_request(region_id, request).await;
} else {
UnsupportedRegionRequestSnafu { request }.fail()
}
}
RegionRequest::Put(put) => self.inner.put_region(region_id, put).await, RegionRequest::Put(put) => self.inner.put_region(region_id, put).await,
RegionRequest::Create(create) => { RegionRequest::Create(create) => {
self.inner self.inner
@@ -361,30 +354,12 @@ impl RegionEngine for MetricEngine {
async fn sync_region( async fn sync_region(
&self, &self,
region_id: RegionId, region_id: RegionId,
request: SyncRegionFromRequest, manifest_info: RegionManifestInfo,
) -> Result<SyncRegionFromResponse, BoxedError> { ) -> Result<SyncManifestResponse, BoxedError> {
match request { self.inner
SyncRegionFromRequest::FromManifest(manifest_info) => self .sync_region(region_id, manifest_info)
.inner .await
.sync_region_from_manifest(region_id, manifest_info) .map_err(BoxedError::new)
.await
.map_err(BoxedError::new),
SyncRegionFromRequest::FromRegion {
source_region_id,
parallelism,
} => {
if self.inner.is_physical_region(region_id) {
self.inner
.sync_region_from_region(region_id, source_region_id, parallelism)
.await
.map_err(BoxedError::new)
} else {
Err(BoxedError::new(
error::UnsupportedSyncRegionFromRequestSnafu { region_id }.build(),
))
}
}
}
} }
async fn remap_manifests( async fn remap_manifests(
@@ -401,6 +376,14 @@ impl RegionEngine for MetricEngine {
} }
} }
async fn copy_region_from(
&self,
_region_id: RegionId,
_request: CopyRegionFromRequest,
) -> Result<CopyRegionFromResponse, BoxedError> {
todo!()
}
async fn set_region_role_state_gracefully( async fn set_region_role_state_gracefully(
&self, &self,
region_id: RegionId, region_id: RegionId,

View File

@@ -290,11 +290,6 @@ impl MetricEngineInner {
.metadata_region .metadata_region
.logical_regions(physical_region_id) .logical_regions(physical_region_id)
.await?; .await?;
common_telemetry::debug!(
"Recover states for physical region {}, logical regions: {:?}",
physical_region_id,
logical_regions
);
let physical_columns = self let physical_columns = self
.data_region .data_region
.physical_columns(physical_region_id) .physical_columns(physical_region_id)

View File

@@ -23,7 +23,6 @@ use store_api::metric_engine_consts::{
METRIC_ENGINE_INDEX_SKIPPING_INDEX_GRANULARITY_OPTION, METRIC_ENGINE_INDEX_SKIPPING_INDEX_GRANULARITY_OPTION,
METRIC_ENGINE_INDEX_SKIPPING_INDEX_GRANULARITY_OPTION_DEFAULT, METRIC_ENGINE_INDEX_TYPE_OPTION, METRIC_ENGINE_INDEX_SKIPPING_INDEX_GRANULARITY_OPTION_DEFAULT, METRIC_ENGINE_INDEX_TYPE_OPTION,
}; };
use store_api::mito_engine_options::{COMPACTION_TYPE, COMPACTION_TYPE_TWCS, TWCS_TIME_WINDOW};
use crate::error::{Error, ParseRegionOptionsSnafu, Result}; use crate::error::{Error, ParseRegionOptionsSnafu, Result};
@@ -33,9 +32,6 @@ use crate::error::{Error, ParseRegionOptionsSnafu, Result};
/// value and appropriately increasing the size of the index, it results in an improved indexing effect. /// value and appropriately increasing the size of the index, it results in an improved indexing effect.
const SEG_ROW_COUNT_FOR_DATA_REGION: u32 = 256; const SEG_ROW_COUNT_FOR_DATA_REGION: u32 = 256;
/// The default compaction time window for metric engine data regions.
const DEFAULT_DATA_REGION_COMPACTION_TIME_WINDOW: &str = "1d";
/// Physical region options. /// Physical region options.
#[derive(Debug, Clone, Copy, PartialEq)] #[derive(Debug, Clone, Copy, PartialEq)]
pub struct PhysicalRegionOptions { pub struct PhysicalRegionOptions {
@@ -76,16 +72,6 @@ pub fn set_data_region_options(
"sparse".to_string(), "sparse".to_string(),
); );
} }
if !options.contains_key(TWCS_TIME_WINDOW) {
options.insert(
COMPACTION_TYPE.to_string(),
COMPACTION_TYPE_TWCS.to_string(),
);
options.insert(
TWCS_TIME_WINDOW.to_string(),
DEFAULT_DATA_REGION_COMPACTION_TIME_WINDOW.to_string(),
);
}
} }
impl TryFrom<&HashMap<String, String>> for PhysicalRegionOptions { impl TryFrom<&HashMap<String, String>> for PhysicalRegionOptions {
@@ -206,29 +192,4 @@ mod tests {
} }
); );
} }
#[test]
fn test_set_data_region_options_default_compaction_time_window() {
// Test that default time window is set when not specified
let mut options = HashMap::new();
set_data_region_options(&mut options, false);
assert_eq!(
options.get(COMPACTION_TYPE),
Some(&COMPACTION_TYPE_TWCS.to_string())
);
assert_eq!(options.get(TWCS_TIME_WINDOW), Some(&"1d".to_string()));
}
#[test]
fn test_set_data_region_options_respects_user_compaction_time_window() {
// Test that user-specified time window is preserved
let mut options = HashMap::new();
options.insert(TWCS_TIME_WINDOW.to_string(), "2h".to_string());
options.insert(COMPACTION_TYPE.to_string(), "twcs".to_string());
set_data_region_options(&mut options, false);
// User's time window should be preserved
assert_eq!(options.get(TWCS_TIME_WINDOW), Some(&"2h".to_string()));
}
} }

View File

@@ -12,5 +12,242 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
mod manifest; use std::time::Instant;
mod region;
use common_telemetry::info;
use snafu::{OptionExt, ResultExt, ensure};
use store_api::region_engine::{RegionEngine, RegionManifestInfo, SyncManifestResponse};
use store_api::storage::RegionId;
use crate::engine::MetricEngineInner;
use crate::error::{
MetricManifestInfoSnafu, MitoSyncOperationSnafu, PhysicalRegionNotFoundSnafu, Result,
};
use crate::utils;
impl MetricEngineInner {
pub async fn sync_region(
&self,
region_id: RegionId,
manifest_info: RegionManifestInfo,
) -> Result<SyncManifestResponse> {
ensure!(
manifest_info.is_metric(),
MetricManifestInfoSnafu { region_id }
);
let metadata_region_id = utils::to_metadata_region_id(region_id);
// checked by ensure above
let metadata_manifest_version = manifest_info
.metadata_manifest_version()
.unwrap_or_default();
let metadata_flushed_entry_id = manifest_info
.metadata_flushed_entry_id()
.unwrap_or_default();
let metadata_region_manifest =
RegionManifestInfo::mito(metadata_manifest_version, metadata_flushed_entry_id, 0);
let metadata_synced = self
.mito
.sync_region(metadata_region_id, metadata_region_manifest)
.await
.context(MitoSyncOperationSnafu)?
.is_data_synced();
let data_region_id = utils::to_data_region_id(region_id);
let data_manifest_version = manifest_info.data_manifest_version();
let data_flushed_entry_id = manifest_info.data_flushed_entry_id();
let data_region_manifest =
RegionManifestInfo::mito(data_manifest_version, data_flushed_entry_id, 0);
let data_synced = self
.mito
.sync_region(data_region_id, data_region_manifest)
.await
.context(MitoSyncOperationSnafu)?
.is_data_synced();
if !metadata_synced {
return Ok(SyncManifestResponse::Metric {
metadata_synced,
data_synced,
new_opened_logical_region_ids: vec![],
});
}
let now = Instant::now();
// Recovers the states from the metadata region
// if the metadata manifest version is updated.
let physical_region_options = *self
.state
.read()
.unwrap()
.physical_region_states()
.get(&data_region_id)
.context(PhysicalRegionNotFoundSnafu {
region_id: data_region_id,
})?
.options();
let new_opened_logical_region_ids = self
.recover_states(data_region_id, physical_region_options)
.await?;
info!(
"Sync metadata region for physical region {}, cost: {:?}, new opened logical region ids: {:?}",
data_region_id,
now.elapsed(),
new_opened_logical_region_ids
);
Ok(SyncManifestResponse::Metric {
metadata_synced,
data_synced,
new_opened_logical_region_ids,
})
}
}
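
The new `sync_region` above fans a single metric-level manifest out into two mito-level syncs, one for the metadata region and one for the data region, and recovers logical-region state only if the metadata side actually advanced. A rough, self-contained sketch of that fan-out is below; the structs are stand-ins, not `store_api::region_engine::RegionManifestInfo`, and the field layout is an assumption drawn from how the versions and entry ids are read above.

#[derive(Debug, PartialEq)]
struct MitoManifest {
    manifest_version: u64,
    flushed_entry_id: u64,
}

struct MetricManifest {
    metadata_manifest_version: u64,
    metadata_flushed_entry_id: u64,
    data_manifest_version: u64,
    data_flushed_entry_id: u64,
}

// One manifest drives the metadata region sync, the other the data region sync.
fn split(m: &MetricManifest) -> (MitoManifest, MitoManifest) {
    (
        MitoManifest {
            manifest_version: m.metadata_manifest_version,
            flushed_entry_id: m.metadata_flushed_entry_id,
        },
        MitoManifest {
            manifest_version: m.data_manifest_version,
            flushed_entry_id: m.data_flushed_entry_id,
        },
    )
}

fn main() {
    let manifest = MetricManifest {
        metadata_manifest_version: 1,
        metadata_flushed_entry_id: 0,
        data_manifest_version: 1,
        data_flushed_entry_id: 0,
    };
    let (metadata, data) = split(&manifest);
    assert_eq!(metadata, MitoManifest { manifest_version: 1, flushed_entry_id: 0 });
    assert_eq!(data, MitoManifest { manifest_version: 1, flushed_entry_id: 0 });
}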
#[cfg(test)]
mod tests {
use std::collections::HashMap;
use api::v1::SemanticType;
use common_query::prelude::greptime_timestamp;
use common_telemetry::info;
use datatypes::data_type::ConcreteDataType;
use datatypes::schema::ColumnSchema;
use store_api::metadata::ColumnMetadata;
use store_api::region_engine::{RegionEngine, RegionManifestInfo};
use store_api::region_request::{
AddColumn, AlterKind, RegionAlterRequest, RegionFlushRequest, RegionRequest,
};
use store_api::storage::RegionId;
use crate::metadata_region::MetadataRegion;
use crate::test_util::TestEnv;
#[tokio::test]
async fn test_sync_region_with_new_created_logical_regions() {
common_telemetry::init_default_ut_logging();
let mut env = TestEnv::with_prefix("sync_with_new_created_logical_regions").await;
env.init_metric_region().await;
info!("creating follower engine");
// Create a follower engine.
let (_follower_mito, follower_metric) = env.create_follower_engine().await;
let physical_region_id = env.default_physical_region_id();
// Flushes the physical region
let metric_engine = env.metric();
metric_engine
.handle_request(
env.default_physical_region_id(),
RegionRequest::Flush(RegionFlushRequest::default()),
)
.await
.unwrap();
let response = follower_metric
.sync_region(physical_region_id, RegionManifestInfo::metric(1, 0, 1, 0))
.await
.unwrap();
assert!(response.is_metric());
let new_opened_logical_region_ids = response.new_opened_logical_region_ids().unwrap();
assert_eq!(new_opened_logical_region_ids, vec![RegionId::new(3, 2)]);
// Sync again, no new logical region should be opened
let response = follower_metric
.sync_region(physical_region_id, RegionManifestInfo::metric(1, 0, 1, 0))
.await
.unwrap();
assert!(response.is_metric());
let new_opened_logical_region_ids = response.new_opened_logical_region_ids().unwrap();
assert!(new_opened_logical_region_ids.is_empty());
}
fn test_alter_logical_region_request() -> RegionAlterRequest {
RegionAlterRequest {
kind: AlterKind::AddColumns {
columns: vec![AddColumn {
column_metadata: ColumnMetadata {
column_id: 0,
semantic_type: SemanticType::Tag,
column_schema: ColumnSchema::new(
"tag1",
ConcreteDataType::string_datatype(),
false,
),
},
location: None,
}],
},
}
}
#[tokio::test]
async fn test_sync_region_alter_alter_logical_region() {
common_telemetry::init_default_ut_logging();
let mut env = TestEnv::with_prefix("sync_region_alter_alter_logical_region").await;
env.init_metric_region().await;
info!("creating follower engine");
let physical_region_id = env.default_physical_region_id();
// Flushes the physical region
let metric_engine = env.metric();
metric_engine
.handle_request(
env.default_physical_region_id(),
RegionRequest::Flush(RegionFlushRequest::default()),
)
.await
.unwrap();
// Create a follower engine.
let (follower_mito, follower_metric) = env.create_follower_engine().await;
let metric_engine = env.metric();
let engine_inner = env.metric().inner;
let region_id = env.default_logical_region_id();
let request = test_alter_logical_region_request();
engine_inner
.alter_logical_regions(
physical_region_id,
vec![(region_id, request)],
&mut HashMap::new(),
)
.await
.unwrap();
// Flushes the physical region
metric_engine
.handle_request(
env.default_physical_region_id(),
RegionRequest::Flush(RegionFlushRequest::default()),
)
.await
.unwrap();
// Sync the follower engine
let response = follower_metric
.sync_region(physical_region_id, RegionManifestInfo::metric(2, 0, 2, 0))
.await
.unwrap();
assert!(response.is_metric());
let new_opened_logical_region_ids = response.new_opened_logical_region_ids().unwrap();
assert!(new_opened_logical_region_ids.is_empty());
let logical_region_id = env.default_logical_region_id();
let metadata_region = MetadataRegion::new(follower_mito.clone());
let semantic_type = metadata_region
.column_semantic_type(physical_region_id, logical_region_id, "tag1")
.await
.unwrap()
.unwrap();
assert_eq!(semantic_type, SemanticType::Tag);
let timestamp_index = metadata_region
.column_semantic_type(physical_region_id, logical_region_id, greptime_timestamp())
.await
.unwrap()
.unwrap();
assert_eq!(timestamp_index, SemanticType::Timestamp);
}
}

View File

@@ -1,268 +0,0 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use std::time::Instant;
use common_telemetry::info;
use snafu::{OptionExt, ResultExt, ensure};
use store_api::region_engine::{RegionEngine, RegionManifestInfo, SyncRegionFromResponse};
use store_api::storage::RegionId;
use crate::engine::MetricEngineInner;
use crate::error::{
MetricManifestInfoSnafu, MitoSyncOperationSnafu, PhysicalRegionNotFoundSnafu, Result,
};
use crate::utils;
impl MetricEngineInner {
/// Syncs the region from the given manifest information (leader-follower scenario).
///
/// This operation:
/// 1. Syncs the metadata region manifest to the target version.
/// 2. Syncs the data region manifest to the target version.
/// 3. Recovers states and returns newly opened logical regions (if the metadata was synced).
pub async fn sync_region_from_manifest(
&self,
region_id: RegionId,
manifest_info: RegionManifestInfo,
) -> Result<SyncRegionFromResponse> {
ensure!(
manifest_info.is_metric(),
MetricManifestInfoSnafu { region_id }
);
let metadata_region_id = utils::to_metadata_region_id(region_id);
// checked by ensure above
let metadata_manifest_version = manifest_info
.metadata_manifest_version()
.unwrap_or_default();
let metadata_flushed_entry_id = manifest_info
.metadata_flushed_entry_id()
.unwrap_or_default();
let metadata_region_manifest =
RegionManifestInfo::mito(metadata_manifest_version, metadata_flushed_entry_id, 0);
let metadata_synced = self
.mito
.sync_region(metadata_region_id, metadata_region_manifest.into())
.await
.context(MitoSyncOperationSnafu)?
.is_data_synced();
let data_region_id = utils::to_data_region_id(region_id);
let data_manifest_version = manifest_info.data_manifest_version();
let data_flushed_entry_id = manifest_info.data_flushed_entry_id();
let data_region_manifest =
RegionManifestInfo::mito(data_manifest_version, data_flushed_entry_id, 0);
let data_synced = self
.mito
.sync_region(data_region_id, data_region_manifest.into())
.await
.context(MitoSyncOperationSnafu)?
.is_data_synced();
if !metadata_synced {
return Ok(SyncRegionFromResponse::Metric {
metadata_synced,
data_synced,
new_opened_logical_region_ids: vec![],
});
}
let now = Instant::now();
// Recovers the states from the metadata region
// if the metadata manifest version is updated.
let physical_region_options = *self
.state
.read()
.unwrap()
.physical_region_states()
.get(&data_region_id)
.context(PhysicalRegionNotFoundSnafu {
region_id: data_region_id,
})?
.options();
let new_opened_logical_region_ids = self
.recover_states(data_region_id, physical_region_options)
.await?;
info!(
"Sync metadata region for physical region {}, cost: {:?}, new opened logical region ids: {:?}",
data_region_id,
now.elapsed(),
new_opened_logical_region_ids
);
Ok(SyncRegionFromResponse::Metric {
metadata_synced,
data_synced,
new_opened_logical_region_ids,
})
}
}
#[cfg(test)]
mod tests {
use std::collections::HashMap;
use api::v1::SemanticType;
use common_query::prelude::greptime_timestamp;
use common_telemetry::info;
use datatypes::data_type::ConcreteDataType;
use datatypes::schema::ColumnSchema;
use store_api::metadata::ColumnMetadata;
use store_api::region_engine::{RegionEngine, RegionManifestInfo};
use store_api::region_request::{
AddColumn, AlterKind, RegionAlterRequest, RegionFlushRequest, RegionRequest,
};
use store_api::storage::RegionId;
use crate::metadata_region::MetadataRegion;
use crate::test_util::TestEnv;
#[tokio::test]
async fn test_sync_region_with_new_created_logical_regions() {
common_telemetry::init_default_ut_logging();
let mut env = TestEnv::with_prefix("sync_with_new_created_logical_regions").await;
env.init_metric_region().await;
info!("creating follower engine");
// Create a follower engine.
let (_follower_mito, follower_metric) = env.create_follower_engine().await;
let physical_region_id = env.default_physical_region_id();
// Flushes the physical region
let metric_engine = env.metric();
metric_engine
.handle_request(
env.default_physical_region_id(),
RegionRequest::Flush(RegionFlushRequest::default()),
)
.await
.unwrap();
let response = follower_metric
.sync_region(
physical_region_id,
RegionManifestInfo::metric(1, 0, 1, 0).into(),
)
.await
.unwrap();
assert!(response.is_metric());
let new_opened_logical_region_ids = response.new_opened_logical_region_ids().unwrap();
assert_eq!(new_opened_logical_region_ids, vec![RegionId::new(3, 2)]);
// Sync again, no new logical region should be opened
let response = follower_metric
.sync_region(
physical_region_id,
RegionManifestInfo::metric(1, 0, 1, 0).into(),
)
.await
.unwrap();
assert!(response.is_metric());
let new_opened_logical_region_ids = response.new_opened_logical_region_ids().unwrap();
assert!(new_opened_logical_region_ids.is_empty());
}
fn test_alter_logical_region_request() -> RegionAlterRequest {
RegionAlterRequest {
kind: AlterKind::AddColumns {
columns: vec![AddColumn {
column_metadata: ColumnMetadata {
column_id: 0,
semantic_type: SemanticType::Tag,
column_schema: ColumnSchema::new(
"tag1",
ConcreteDataType::string_datatype(),
false,
),
},
location: None,
}],
},
}
}
#[tokio::test]
async fn test_sync_region_alter_alter_logical_region() {
common_telemetry::init_default_ut_logging();
let mut env = TestEnv::with_prefix("sync_region_alter_alter_logical_region").await;
env.init_metric_region().await;
info!("creating follower engine");
let physical_region_id = env.default_physical_region_id();
// Flushes the physical region
let metric_engine = env.metric();
metric_engine
.handle_request(
env.default_physical_region_id(),
RegionRequest::Flush(RegionFlushRequest::default()),
)
.await
.unwrap();
// Create a follower engine.
let (follower_mito, follower_metric) = env.create_follower_engine().await;
let metric_engine = env.metric();
let engine_inner = env.metric().inner;
let region_id = env.default_logical_region_id();
let request = test_alter_logical_region_request();
engine_inner
.alter_logical_regions(
physical_region_id,
vec![(region_id, request)],
&mut HashMap::new(),
)
.await
.unwrap();
// Flushes the physical region
metric_engine
.handle_request(
env.default_physical_region_id(),
RegionRequest::Flush(RegionFlushRequest::default()),
)
.await
.unwrap();
// Sync the follower engine
let response = follower_metric
.sync_region(
physical_region_id,
RegionManifestInfo::metric(2, 0, 2, 0).into(),
)
.await
.unwrap();
assert!(response.is_metric());
let new_opened_logical_region_ids = response.new_opened_logical_region_ids().unwrap();
assert!(new_opened_logical_region_ids.is_empty());
let logical_region_id = env.default_logical_region_id();
let metadata_region = MetadataRegion::new(follower_mito.clone());
let semantic_type = metadata_region
.column_semantic_type(physical_region_id, logical_region_id, "tag1")
.await
.unwrap()
.unwrap();
assert_eq!(semantic_type, SemanticType::Tag);
let timestamp_index = metadata_region
.column_semantic_type(physical_region_id, logical_region_id, greptime_timestamp())
.await
.unwrap()
.unwrap();
assert_eq!(timestamp_index, SemanticType::Timestamp);
}
}

View File

@@ -1,386 +0,0 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use std::time::Instant;
use common_error::ext::BoxedError;
use common_telemetry::info;
use mito2::manifest::action::RegionEdit;
use snafu::{OptionExt, ResultExt, ensure};
use store_api::region_engine::{MitoCopyRegionFromRequest, SyncRegionFromResponse};
use store_api::storage::RegionId;
use crate::engine::MetricEngineInner;
use crate::error::{
MissingFilesSnafu, MitoCopyRegionFromOperationSnafu, MitoEditRegionSnafu,
PhysicalRegionNotFoundSnafu, Result,
};
use crate::utils;
impl MetricEngineInner {
/// Syncs the logical regions from the source region to the target region in the metric engine.
///
/// This operation:
/// 1. Copies SST files from source metadata region to target metadata region
/// 2. Transforms logical region metadata (updates region numbers to match target)
/// 3. Edits target manifest to remove old file entries (copied files)
/// 4. Recovers states and returns newly opened logical region IDs
///
/// **Note**: Only the metadata region is synced. The data region is not affected.
pub(crate) async fn sync_region_from_region(
&self,
region_id: RegionId,
source_region_id: RegionId,
parallelism: usize,
) -> Result<SyncRegionFromResponse> {
let source_metadata_region_id = utils::to_metadata_region_id(source_region_id);
let target_metadata_region_id = utils::to_metadata_region_id(region_id);
let target_data_region_id = utils::to_data_region_id(region_id);
let source_data_region_id = utils::to_data_region_id(source_region_id);
info!(
"Syncing region from region {} to region {}, parallelism: {}",
source_region_id, region_id, parallelism
);
let res = self
.mito
.copy_region_from(
target_metadata_region_id,
MitoCopyRegionFromRequest {
source_region_id: source_metadata_region_id,
parallelism,
},
)
.await
.map_err(BoxedError::new)
.context(MitoCopyRegionFromOperationSnafu {
source_region_id: source_metadata_region_id,
target_region_id: target_metadata_region_id,
})?;
if res.copied_file_ids.is_empty() {
info!(
"No files were copied from source region {} to target region {}, copied file ids are empty",
source_metadata_region_id, target_metadata_region_id
);
return Ok(SyncRegionFromResponse::Metric {
metadata_synced: false,
data_synced: false,
new_opened_logical_region_ids: vec![],
});
}
let target_region = self.mito.find_region(target_metadata_region_id).context(
PhysicalRegionNotFoundSnafu {
region_id: target_metadata_region_id,
},
)?;
let files_to_remove = target_region.file_metas(&res.copied_file_ids).await;
let missing_file_ids = res
.copied_file_ids
.iter()
.zip(&files_to_remove)
.filter_map(|(file_id, maybe_meta)| {
if maybe_meta.is_none() {
Some(*file_id)
} else {
None
}
})
.collect::<Vec<_>>();
// `copy_region_from` does not trigger compaction,
// so there should be no files removed and thus no missing files.
ensure!(
missing_file_ids.is_empty(),
MissingFilesSnafu {
region_id: target_metadata_region_id,
file_ids: missing_file_ids,
}
);
let files_to_remove = files_to_remove.into_iter().flatten().collect::<Vec<_>>();
// Transform the logical region metadata of the target data region.
self.metadata_region
.transform_logical_region_metadata(target_data_region_id, source_data_region_id)
.await?;
let edit = RegionEdit {
files_to_add: vec![],
files_to_remove: files_to_remove.clone(),
timestamp_ms: Some(chrono::Utc::now().timestamp_millis()),
compaction_time_window: None,
flushed_entry_id: None,
flushed_sequence: None,
committed_sequence: None,
};
self.mito
.edit_region(target_metadata_region_id, edit)
.await
.map_err(BoxedError::new)
.context(MitoEditRegionSnafu {
region_id: target_metadata_region_id,
})?;
info!(
"Successfully edit metadata region: {} after syncing from source metadata region: {}, files to remove: {:?}",
target_metadata_region_id,
source_metadata_region_id,
files_to_remove
.iter()
.map(|meta| meta.file_id)
.collect::<Vec<_>>(),
);
let now = Instant::now();
// Always recover states from the target metadata region after syncing
// from the source metadata region.
let physical_region_options = *self
.state
.read()
.unwrap()
.physical_region_states()
.get(&target_data_region_id)
.context(PhysicalRegionNotFoundSnafu {
region_id: target_data_region_id,
})?
.options();
let new_opened_logical_region_ids = self
.recover_states(target_data_region_id, physical_region_options)
.await?;
info!(
"Sync metadata region from source region {} to target region {}, recover states cost: {:?}, new opened logical region ids: {:?}",
source_metadata_region_id,
target_metadata_region_id,
now.elapsed(),
new_opened_logical_region_ids
);
Ok(SyncRegionFromResponse::Metric {
metadata_synced: true,
data_synced: false,
new_opened_logical_region_ids,
})
}
}
#[cfg(test)]
mod tests {
use common_error::ext::ErrorExt;
use common_error::status_code::StatusCode;
use common_telemetry::debug;
use store_api::metric_engine_consts::{METRIC_ENGINE_NAME, PHYSICAL_TABLE_METADATA_KEY};
use store_api::region_engine::{RegionEngine, SyncRegionFromRequest};
use store_api::region_request::{
BatchRegionDdlRequest, PathType, RegionCloseRequest, RegionFlushRequest, RegionOpenRequest,
RegionRequest,
};
use store_api::storage::RegionId;
use crate::metadata_region::MetadataRegion;
use crate::test_util::{TestEnv, create_logical_region_request};
async fn assert_logical_table_columns(
metadata_region: &MetadataRegion,
physical_region_id: RegionId,
logical_region_id: RegionId,
expected_columns: &[&str],
) {
let mut columns = metadata_region
.logical_columns(physical_region_id, logical_region_id)
.await
.unwrap()
.into_iter()
.map(|(n, _)| n)
.collect::<Vec<_>>();
columns.sort_unstable();
assert_eq!(columns, expected_columns);
}
#[tokio::test]
async fn test_sync_region_from_region() {
common_telemetry::init_default_ut_logging();
let env = TestEnv::new().await;
let metric_engine = env.metric();
let source_physical_region_id = RegionId::new(1024, 0);
let logical_region_id1 = RegionId::new(1025, 0);
let logical_region_id2 = RegionId::new(1026, 0);
env.create_physical_region(source_physical_region_id, "/test_dir1", vec![])
.await;
let region_create_request1 =
create_logical_region_request(&["job"], source_physical_region_id, "logical1");
let region_create_request2 =
create_logical_region_request(&["host"], source_physical_region_id, "logical2");
metric_engine
.handle_batch_ddl_requests(BatchRegionDdlRequest::Create(vec![
(logical_region_id1, region_create_request1),
(logical_region_id2, region_create_request2),
]))
.await
.unwrap();
debug!("Flushing source physical region");
metric_engine
.handle_request(
source_physical_region_id,
RegionRequest::Flush(RegionFlushRequest {
row_group_size: None,
}),
)
.await
.unwrap();
let logical_regions = metric_engine
.logical_regions(source_physical_region_id)
.await
.unwrap();
assert!(logical_regions.contains(&logical_region_id1));
assert!(logical_regions.contains(&logical_region_id2));
let target_physical_region_id = RegionId::new(1024, 1);
let target_logical_region_id1 = RegionId::new(1025, 1);
let target_logical_region_id2 = RegionId::new(1026, 1);
// Prepare target physical region
env.create_physical_region(target_physical_region_id, "/test_dir1", vec![])
.await;
let r = metric_engine
.sync_region(
target_physical_region_id,
SyncRegionFromRequest::FromRegion {
source_region_id: source_physical_region_id,
parallelism: 1,
},
)
.await
.unwrap();
let new_opened_logical_region_ids = r.new_opened_logical_region_ids().unwrap();
assert_eq!(new_opened_logical_region_ids.len(), 2);
assert!(new_opened_logical_region_ids.contains(&target_logical_region_id1));
assert!(new_opened_logical_region_ids.contains(&target_logical_region_id2));
debug!("Sync region from again");
assert_logical_table_columns(
&env.metadata_region(),
target_physical_region_id,
target_logical_region_id1,
&["greptime_timestamp", "greptime_value", "job"],
)
.await;
assert_logical_table_columns(
&env.metadata_region(),
target_physical_region_id,
target_logical_region_id2,
&["greptime_timestamp", "greptime_value", "host"],
)
.await;
let logical_regions = env
.metadata_region()
.logical_regions(target_physical_region_id)
.await
.unwrap();
assert_eq!(logical_regions.len(), 2);
assert!(logical_regions.contains(&target_logical_region_id1));
assert!(logical_regions.contains(&target_logical_region_id2));
// Syncing from the source region again should be ok.
let r = metric_engine
.sync_region(
target_physical_region_id,
SyncRegionFromRequest::FromRegion {
source_region_id: source_physical_region_id,
parallelism: 1,
},
)
.await
.unwrap();
let new_opened_logical_region_ids = r.new_opened_logical_region_ids().unwrap();
assert!(new_opened_logical_region_ids.is_empty());
// Try to close region and reopen it, should be ok.
metric_engine
.handle_request(
target_physical_region_id,
RegionRequest::Close(RegionCloseRequest {}),
)
.await
.unwrap();
let physical_region_option = [(PHYSICAL_TABLE_METADATA_KEY.to_string(), String::new())]
.into_iter()
.collect();
metric_engine
.handle_request(
target_physical_region_id,
RegionRequest::Open(RegionOpenRequest {
engine: METRIC_ENGINE_NAME.to_string(),
table_dir: "/test_dir1".to_string(),
path_type: PathType::Bare,
options: physical_region_option,
skip_wal_replay: false,
checkpoint: None,
}),
)
.await
.unwrap();
let logical_regions = env
.metadata_region()
.logical_regions(target_physical_region_id)
.await
.unwrap();
assert_eq!(logical_regions.len(), 2);
assert!(logical_regions.contains(&target_logical_region_id1));
assert!(logical_regions.contains(&target_logical_region_id2));
}
#[tokio::test]
async fn test_sync_region_from_region_with_no_files() {
common_telemetry::init_default_ut_logging();
let env = TestEnv::new().await;
let metric_engine = env.metric();
let source_physical_region_id = RegionId::new(1024, 0);
env.create_physical_region(source_physical_region_id, "/test_dir1", vec![])
.await;
let target_physical_region_id = RegionId::new(1024, 1);
env.create_physical_region(target_physical_region_id, "/test_dir1", vec![])
.await;
let r = metric_engine
.sync_region(
target_physical_region_id,
SyncRegionFromRequest::FromRegion {
source_region_id: source_physical_region_id,
parallelism: 1,
},
)
.await
.unwrap();
let new_opened_logical_region_ids = r.new_opened_logical_region_ids().unwrap();
assert!(new_opened_logical_region_ids.is_empty());
}
#[tokio::test]
async fn test_sync_region_from_region_source_not_exist() {
common_telemetry::init_default_ut_logging();
let env = TestEnv::new().await;
let metric_engine = env.metric();
let source_physical_region_id = RegionId::new(1024, 0);
let target_physical_region_id = RegionId::new(1024, 1);
env.create_physical_region(target_physical_region_id, "/test_dir1", vec![])
.await;
let err = metric_engine
.sync_region(
target_physical_region_id,
SyncRegionFromRequest::FromRegion {
source_region_id: source_physical_region_id,
parallelism: 1,
},
)
.await
.unwrap_err();
assert_eq!(err.status_code(), StatusCode::InvalidArguments);
}
}

View File

@@ -21,7 +21,7 @@ use common_macro::stack_trace_debug;
use datatypes::prelude::ConcreteDataType; use datatypes::prelude::ConcreteDataType;
use snafu::{Location, Snafu}; use snafu::{Location, Snafu};
use store_api::region_request::RegionRequest; use store_api::region_request::RegionRequest;
use store_api::storage::{FileId, RegionId}; use store_api::storage::RegionId;
#[derive(Snafu)] #[derive(Snafu)]
#[snafu(visibility(pub))] #[snafu(visibility(pub))]
@@ -128,27 +128,6 @@ pub enum Error {
location: Location, location: Location,
}, },
#[snafu(display(
"Mito copy region from operation fails, source region id: {}, target region id: {}",
source_region_id,
target_region_id
))]
MitoCopyRegionFromOperation {
source: BoxedError,
#[snafu(implicit)]
location: Location,
source_region_id: RegionId,
target_region_id: RegionId,
},
#[snafu(display("Mito edit region operation fails, region id: {}", region_id))]
MitoEditRegion {
region_id: RegionId,
source: BoxedError,
#[snafu(implicit)]
location: Location,
},
#[snafu(display("Failed to encode primary key"))] #[snafu(display("Failed to encode primary key"))]
EncodePrimaryKey { EncodePrimaryKey {
source: mito_codec::error::Error, source: mito_codec::error::Error,
@@ -277,21 +256,6 @@ pub enum Error {
location: Location, location: Location,
}, },
#[snafu(display("Unsupported sync region from request for region {}", region_id))]
UnsupportedSyncRegionFromRequest {
region_id: RegionId,
#[snafu(implicit)]
location: Location,
},
#[snafu(display("Missing file metas in region {}, file ids: {:?}", region_id, file_ids))]
MissingFiles {
region_id: RegionId,
#[snafu(implicit)]
location: Location,
file_ids: Vec<FileId>,
},
#[snafu(display("Unsupported alter kind: {}", kind))] #[snafu(display("Unsupported alter kind: {}", kind))]
UnsupportedAlterKind { UnsupportedAlterKind {
kind: String, kind: String,
@@ -375,12 +339,11 @@ impl ErrorExt for Error {
| ParseRegionOptions { .. } | ParseRegionOptions { .. }
| UnexpectedRequest { .. } | UnexpectedRequest { .. }
| UnsupportedAlterKind { .. } | UnsupportedAlterKind { .. }
| UnsupportedRemapManifestsRequest { .. } | UnsupportedRemapManifestsRequest { .. } => StatusCode::InvalidArguments,
| UnsupportedSyncRegionFromRequest { .. } => StatusCode::InvalidArguments,
ForbiddenPhysicalAlter { .. } ForbiddenPhysicalAlter { .. } | UnsupportedRegionRequest { .. } => {
| UnsupportedRegionRequest { .. } StatusCode::Unsupported
| MissingFiles { .. } => StatusCode::Unsupported, }
DeserializeColumnMetadata { .. } DeserializeColumnMetadata { .. }
| SerializeColumnMetadata { .. } | SerializeColumnMetadata { .. }
@@ -406,9 +369,7 @@ impl ErrorExt for Error {
| MitoSyncOperation { source, .. } | MitoSyncOperation { source, .. }
| MitoEnterStagingOperation { source, .. } | MitoEnterStagingOperation { source, .. }
| BatchOpenMitoRegion { source, .. } | BatchOpenMitoRegion { source, .. }
| BatchCatchupMitoRegion { source, .. } | BatchCatchupMitoRegion { source, .. } => source.status_code(),
| MitoCopyRegionFromOperation { source, .. }
| MitoEditRegion { source, .. } => source.status_code(),
EncodePrimaryKey { source, .. } => source.status_code(), EncodePrimaryKey { source, .. } => source.status_code(),

View File

@@ -25,7 +25,6 @@ use base64::Engine;
use base64::engine::general_purpose::STANDARD_NO_PAD; use base64::engine::general_purpose::STANDARD_NO_PAD;
use common_base::readable_size::ReadableSize; use common_base::readable_size::ReadableSize;
use common_recordbatch::{RecordBatch, SendableRecordBatchStream}; use common_recordbatch::{RecordBatch, SendableRecordBatchStream};
use common_telemetry::{debug, info, warn};
use datafusion::prelude::{col, lit}; use datafusion::prelude::{col, lit};
use futures_util::TryStreamExt; use futures_util::TryStreamExt;
use futures_util::stream::BoxStream; use futures_util::stream::BoxStream;
@@ -401,11 +400,14 @@ impl MetadataRegion {
.await .await
.context(CacheGetSnafu)?; .context(CacheGetSnafu)?;
let range = region_metadata.key_values.range(prefix.to_string()..);
let mut result = HashMap::new(); let mut result = HashMap::new();
get_all_with_prefix(&region_metadata, prefix, |k, v| { for (k, v) in range {
result.insert(k.to_string(), v.to_string()); if !k.starts_with(prefix) {
Ok(()) break;
})?; }
result.insert(k.clone(), v.clone());
}
Ok(result) Ok(result)
} }
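Both sides of this hunk walk the cached key/value metadata by prefix: start an ordered range scan at the prefix and stop at the first key that no longer matches. A minimal standalone sketch of that pattern, assuming the cache entry is essentially a BTreeMap<String, String>; the key names below are illustrative, not the real metadata key layout.
use std::collections::BTreeMap;
/// Collects all entries whose key starts with `prefix`.
/// Relies on the ordered map so iteration can stop at the first
/// key that falls outside the prefix.
fn scan_prefix(map: &BTreeMap<String, String>, prefix: &str) -> Vec<(String, String)> {
    map.range(prefix.to_string()..)
        .take_while(|(k, _)| k.starts_with(prefix))
        .map(|(k, v)| (k.clone(), v.clone()))
        .collect()
}
fn main() {
    let mut map = BTreeMap::new();
    map.insert("__column_1024_host".to_string(), "meta".to_string());
    map.insert("__column_1024_ts".to_string(), "meta".to_string());
    map.insert("__region_1024".to_string(), String::new());
    // Only the `__column_1024_` entries are returned; the scan stops at `__region_1024`.
    assert_eq!(scan_prefix(&map, "__column_1024_").len(), 2);
}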
@@ -556,109 +558,6 @@ impl MetadataRegion {
Ok(()) Ok(())
} }
/// Updates logical region metadata so that any entries previously referencing
/// `source_region_id` are modified to reference the data region of `physical_region_id`.
///
/// This method should be called after copying files from `source_region_id`
/// into the target region. It scans the metadata for the target physical
/// region, finds logical regions with the same region number as the source,
/// and reinserts region and column entries updated to use the target's
/// region number.
pub async fn transform_logical_region_metadata(
&self,
physical_region_id: RegionId,
source_region_id: RegionId,
) -> Result<()> {
let metadata_region_id = utils::to_metadata_region_id(physical_region_id);
let data_region_id = utils::to_data_region_id(physical_region_id);
let logical_regions = self
.logical_regions(data_region_id)
.await?
.into_iter()
.filter(|r| r.region_number() == source_region_id.region_number())
.collect::<Vec<_>>();
if logical_regions.is_empty() {
info!(
"No logical regions found from source region {}, physical region id: {}",
source_region_id, physical_region_id,
);
return Ok(());
}
let metadata = self.load_all(metadata_region_id).await?;
let mut output = Vec::new();
for logical_region_id in &logical_regions {
let prefix = MetadataRegion::concat_column_key_prefix(*logical_region_id);
get_all_with_prefix(&metadata, &prefix, |k, v| {
// Safety: we have checked the prefix
let (src_logical_region_id, column_name) = Self::parse_column_key(k)?.unwrap();
// Change the region number to the data region number.
let new_key = MetadataRegion::concat_column_key(
RegionId::new(
src_logical_region_id.table_id(),
data_region_id.region_number(),
),
&column_name,
);
output.push((new_key, v.to_string()));
Ok(())
})?;
let new_key = MetadataRegion::concat_region_key(RegionId::new(
logical_region_id.table_id(),
data_region_id.region_number(),
));
output.push((new_key, String::new()));
}
if output.is_empty() {
warn!(
"No logical regions metadata found from source region {}, physical region id: {}",
source_region_id, physical_region_id
);
return Ok(());
}
debug!(
"Transform logical regions metadata to physical region {}, source region: {}, transformed metadata: {}",
data_region_id,
source_region_id,
output.len(),
);
let put_request = MetadataRegion::build_put_request_from_iter(output.into_iter());
self.mito
.handle_request(
metadata_region_id,
store_api::region_request::RegionRequest::Put(put_request),
)
.await
.context(MitoWriteOperationSnafu)?;
info!(
"Transformed {} logical regions metadata to physical region {}, source region: {}",
logical_regions.len(),
data_region_id,
source_region_id
);
self.cache.invalidate(&metadata_region_id).await;
Ok(())
}
}
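The removed transform_logical_region_metadata rewrites each matching metadata key so it points at the target data region's region number while keeping the original table id. A hedged sketch of just that id rewrite, using only the RegionId helpers visible in the hunk (table_id, region_number, RegionId::new) and assuming the repository's store_api crate is available.
use store_api::storage::RegionId;
/// Re-targets a logical region id at the data region's region number while
/// keeping the original table id, mirroring the rewrite applied to each
/// `__column_*` / `__region_*` metadata entry above.
fn retarget(logical: RegionId, data_region: RegionId) -> RegionId {
    RegionId::new(logical.table_id(), data_region.region_number())
}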
fn get_all_with_prefix(
region_metadata: &RegionMetadataCacheEntry,
prefix: &str,
mut callback: impl FnMut(&str, &str) -> Result<()>,
) -> Result<()> {
let range = region_metadata.key_values.range(prefix.to_string()..);
for (k, v) in range {
if !k.starts_with(prefix) {
break;
}
callback(k, v)?;
}
Ok(())
} }
#[cfg(test)] #[cfg(test)]

View File

@@ -62,7 +62,7 @@ use crate::read::projection::ProjectionMapper;
use crate::read::scan_region::{PredicateGroup, ScanInput}; use crate::read::scan_region::{PredicateGroup, ScanInput};
use crate::read::seq_scan::SeqScan; use crate::read::seq_scan::SeqScan;
use crate::read::{BoxedBatchReader, BoxedRecordBatchStream}; use crate::read::{BoxedBatchReader, BoxedRecordBatchStream};
use crate::region::options::{MergeMode, RegionOptions}; use crate::region::options::MergeMode;
use crate::region::version::VersionControlRef; use crate::region::version::VersionControlRef;
use crate::region::{ManifestContextRef, RegionLeaderState, RegionRoleState}; use crate::region::{ManifestContextRef, RegionLeaderState, RegionRoleState};
use crate::request::{OptionOutputTx, OutputTx, WorkerRequestWithTime}; use crate::request::{OptionOutputTx, OutputTx, WorkerRequestWithTime};
@@ -311,24 +311,9 @@ impl CompactionScheduler {
request: CompactionRequest, request: CompactionRequest,
options: compact_request::Options, options: compact_request::Options,
) -> Result<()> { ) -> Result<()> {
let region_id = request.region_id();
let (dynamic_compaction_opts, ttl) = find_dynamic_options(
region_id.table_id(),
&request.current_version.options,
&request.schema_metadata_manager,
)
.await
.unwrap_or_else(|e| {
warn!(e; "Failed to find dynamic options for region: {}", region_id);
(
request.current_version.options.compaction.clone(),
request.current_version.options.ttl.unwrap_or_default(),
)
});
let picker = new_picker( let picker = new_picker(
&options, &options,
&dynamic_compaction_opts, &request.current_version.options.compaction,
request.current_version.options.append_mode, request.current_version.options.append_mode,
Some(self.engine_config.max_background_compactions), Some(self.engine_config.max_background_compactions),
); );
@@ -343,10 +328,21 @@ impl CompactionScheduler {
cache_manager, cache_manager,
manifest_ctx, manifest_ctx,
listener, listener,
schema_metadata_manager: _, schema_metadata_manager,
max_parallelism, max_parallelism,
} = request; } = request;
let ttl = find_ttl(
region_id.table_id(),
current_version.options.ttl,
&schema_metadata_manager,
)
.await
.unwrap_or_else(|e| {
warn!(e; "Failed to get ttl for region: {}", region_id);
TimeToLive::default()
});
debug!( debug!(
"Pick compaction strategy {:?} for region: {}, ttl: {:?}", "Pick compaction strategy {:?} for region: {}, ttl: {:?}",
picker, region_id, ttl picker, region_id, ttl
@@ -355,10 +351,7 @@ impl CompactionScheduler {
let compaction_region = CompactionRegion { let compaction_region = CompactionRegion {
region_id, region_id,
current_version: current_version.clone(), current_version: current_version.clone(),
region_options: RegionOptions { region_options: current_version.options.clone(),
compaction: dynamic_compaction_opts.clone(),
..current_version.options.clone()
},
engine_config: engine_config.clone(), engine_config: engine_config.clone(),
region_metadata: current_version.metadata.clone(), region_metadata: current_version.metadata.clone(),
cache_manager: cache_manager.clone(), cache_manager: cache_manager.clone(),
@@ -389,7 +382,7 @@ impl CompactionScheduler {
// If specified to run compaction remotely, we schedule the compaction job remotely. // If specified to run compaction remotely, we schedule the compaction job remotely.
// It will fall back to local compaction if there is no remote job scheduler. // It will fall back to local compaction if there is no remote job scheduler.
let waiters = if dynamic_compaction_opts.remote_compaction() { let waiters = if current_version.options.compaction.remote_compaction() {
if let Some(remote_job_scheduler) = &self.plugins.get::<RemoteJobSchedulerRef>() { if let Some(remote_job_scheduler) = &self.plugins.get::<RemoteJobSchedulerRef>() {
let remote_compaction_job = CompactionJob { let remote_compaction_job = CompactionJob {
compaction_region: compaction_region.clone(), compaction_region: compaction_region.clone(),
@@ -418,7 +411,7 @@ impl CompactionScheduler {
return Ok(()); return Ok(());
} }
Err(e) => { Err(e) => {
if !dynamic_compaction_opts.fallback_to_local() { if !current_version.options.compaction.fallback_to_local() {
error!(e; "Failed to schedule remote compaction job for region {}", region_id); error!(e; "Failed to schedule remote compaction job for region {}", region_id);
return RemoteCompactionSnafu { return RemoteCompactionSnafu {
region_id, region_id,
@@ -501,88 +494,29 @@ impl Drop for CompactionScheduler {
} }
} }
/// Finds compaction options and TTL together with a single metadata fetch to reduce RTT. /// Finds the TTL of a table by first examining table options, then database options.
async fn find_dynamic_options( async fn find_ttl(
table_id: TableId, table_id: TableId,
region_options: &crate::region::options::RegionOptions, table_ttl: Option<TimeToLive>,
schema_metadata_manager: &SchemaMetadataManagerRef, schema_metadata_manager: &SchemaMetadataManagerRef,
) -> Result<(crate::region::options::CompactionOptions, TimeToLive)> { ) -> Result<TimeToLive> {
if region_options.compaction_override && region_options.ttl.is_some() { // If table TTL is set, we use it.
debug!( if let Some(table_ttl) = table_ttl {
"Use region options directly for table {}: compaction={:?}, ttl={:?}", return Ok(table_ttl);
table_id, region_options.compaction, region_options.ttl
);
return Ok((
region_options.compaction.clone(),
region_options.ttl.unwrap(),
));
} }
let db_options = tokio::time::timeout( let ttl = tokio::time::timeout(
crate::config::FETCH_OPTION_TIMEOUT, crate::config::FETCH_OPTION_TIMEOUT,
schema_metadata_manager.get_schema_options_by_table_id(table_id), schema_metadata_manager.get_schema_options_by_table_id(table_id),
) )
.await .await
.context(TimeoutSnafu)? .context(TimeoutSnafu)?
.context(GetSchemaMetadataSnafu)?; .context(GetSchemaMetadataSnafu)?
.and_then(|options| options.ttl)
.unwrap_or_default()
.into();
let ttl = if region_options.ttl.is_some() { Ok(ttl)
debug!(
"Use region TTL directly for table {}: ttl={:?}",
table_id, region_options.ttl
);
region_options.ttl.unwrap()
} else {
db_options
.as_ref()
.and_then(|options| options.ttl)
.unwrap_or_default()
.into()
};
let compaction = if !region_options.compaction_override {
if let Some(schema_opts) = db_options {
let map: HashMap<String, String> = schema_opts
.extra_options
.iter()
.filter_map(|(k, v)| {
if k.starts_with("compaction.") {
Some((k.clone(), v.clone()))
} else {
None
}
})
.collect();
if map.is_empty() {
region_options.compaction.clone()
} else {
crate::region::options::RegionOptions::try_from(&map)
.map(|o| o.compaction)
.unwrap_or_else(|e| {
error!(e; "Failed to create RegionOptions from map");
region_options.compaction.clone()
})
}
} else {
debug!(
"DB options is None for table {}, use region compaction: compaction={:?}",
table_id, region_options.compaction
);
region_options.compaction.clone()
}
} else {
debug!(
"No schema options for table {}, use region compaction: compaction={:?}",
table_id, region_options.compaction
);
region_options.compaction.clone()
};
debug!(
"Resolved dynamic options for table {}: compaction={:?}, ttl={:?}",
table_id, compaction, ttl
);
Ok((compaction, ttl))
} }
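Both versions of this helper resolve settings with the same precedence: an explicit table-level value wins, otherwise the database (schema) level value applies, otherwise a default. A reduced sketch of that precedence over plain Options, leaving out the real SchemaMetadataManager fetch, the timeout, and the compaction-option parsing; the u64 "days" values are illustrative only.
/// Resolves an effective setting from table-level and database-level values.
/// The table-level value always wins; the database value is only a fallback.
fn resolve<T: Default>(table_level: Option<T>, db_level: Option<T>) -> T {
    table_level.or(db_level).unwrap_or_default()
}
fn main() {
    // A table TTL of 7 days overrides a database TTL of 30 days.
    assert_eq!(resolve(Some(7u64), Some(30)), 7);
    // With no table TTL, the database setting applies.
    assert_eq!(resolve(None, Some(30u64)), 30);
    // Neither set: fall back to the default (0 here).
    assert_eq!(resolve::<u64>(None, None), 0);
}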
/// Status of running and pending region compaction tasks. /// Status of running and pending region compaction tasks.
@@ -871,12 +805,8 @@ struct PendingCompaction {
#[cfg(test)] #[cfg(test)]
mod tests { mod tests {
use std::time::Duration;
use api::v1::region::StrictWindow; use api::v1::region::StrictWindow;
use common_datasource::compression::CompressionType; use common_datasource::compression::CompressionType;
use common_meta::key::schema_name::SchemaNameValue;
use common_time::DatabaseTimeToLive;
use tokio::sync::{Barrier, oneshot}; use tokio::sync::{Barrier, oneshot};
use super::*; use super::*;
@@ -888,163 +818,6 @@ mod tests {
use crate::test_util::scheduler_util::{SchedulerEnv, VecScheduler}; use crate::test_util::scheduler_util::{SchedulerEnv, VecScheduler};
use crate::test_util::version_util::{VersionControlBuilder, apply_edit}; use crate::test_util::version_util::{VersionControlBuilder, apply_edit};
#[tokio::test]
async fn test_find_compaction_options_db_level() {
let env = SchedulerEnv::new().await;
let builder = VersionControlBuilder::new();
let (schema_metadata_manager, kv_backend) = mock_schema_metadata_manager();
let region_id = builder.region_id();
let table_id = region_id.table_id();
// Register table without ttl but with db-level compaction options
let mut schema_value = SchemaNameValue {
ttl: Some(DatabaseTimeToLive::default()),
..Default::default()
};
schema_value
.extra_options
.insert("compaction.type".to_string(), "twcs".to_string());
schema_value
.extra_options
.insert("compaction.twcs.time_window".to_string(), "2h".to_string());
schema_metadata_manager
.register_region_table_info(
table_id,
"t",
"c",
"s",
Some(schema_value),
kv_backend.clone(),
)
.await;
let version_control = Arc::new(builder.build());
let region_opts = version_control.current().version.options.clone();
let (opts, _) = find_dynamic_options(table_id, &region_opts, &schema_metadata_manager)
.await
.unwrap();
match opts {
crate::region::options::CompactionOptions::Twcs(t) => {
assert_eq!(t.time_window_seconds(), Some(2 * 3600));
}
}
let manifest_ctx = env
.mock_manifest_context(version_control.current().version.metadata.clone())
.await;
let (tx, _rx) = mpsc::channel(4);
let mut scheduler = env.mock_compaction_scheduler(tx);
let (otx, _orx) = oneshot::channel();
let request = scheduler
.region_status
.entry(region_id)
.or_insert_with(|| {
crate::compaction::CompactionStatus::new(
region_id,
version_control.clone(),
env.access_layer.clone(),
)
})
.new_compaction_request(
scheduler.request_sender.clone(),
OptionOutputTx::new(Some(OutputTx::new(otx))),
scheduler.engine_config.clone(),
scheduler.cache_manager.clone(),
&manifest_ctx,
scheduler.listener.clone(),
schema_metadata_manager.clone(),
1,
);
scheduler
.schedule_compaction_request(
request,
compact_request::Options::Regular(Default::default()),
)
.await
.unwrap();
}
#[tokio::test]
async fn test_find_compaction_options_priority() {
fn schema_value_with_twcs(time_window: &str) -> SchemaNameValue {
let mut schema_value = SchemaNameValue {
ttl: Some(DatabaseTimeToLive::default()),
..Default::default()
};
schema_value
.extra_options
.insert("compaction.type".to_string(), "twcs".to_string());
schema_value.extra_options.insert(
"compaction.twcs.time_window".to_string(),
time_window.to_string(),
);
schema_value
}
let cases = [
(
"db options set and table override set",
Some(schema_value_with_twcs("2h")),
true,
Some(Duration::from_secs(5 * 3600)),
Some(5 * 3600),
),
(
"db options set and table override not set",
Some(schema_value_with_twcs("2h")),
false,
None,
Some(2 * 3600),
),
(
"db options not set and table override set",
None,
true,
Some(Duration::from_secs(4 * 3600)),
Some(4 * 3600),
),
(
"db options not set and table override not set",
None,
false,
None,
None,
),
];
for (case_name, schema_value, override_set, table_window, expected_window) in cases {
let builder = VersionControlBuilder::new();
let (schema_metadata_manager, kv_backend) = mock_schema_metadata_manager();
let table_id = builder.region_id().table_id();
schema_metadata_manager
.register_region_table_info(
table_id,
"t",
"c",
"s",
schema_value,
kv_backend.clone(),
)
.await;
let version_control = Arc::new(builder.build());
let mut region_opts = version_control.current().version.options.clone();
region_opts.compaction_override = override_set;
if let Some(window) = table_window {
let crate::region::options::CompactionOptions::Twcs(twcs) =
&mut region_opts.compaction;
twcs.time_window = Some(window);
}
let (opts, _) = find_dynamic_options(table_id, &region_opts, &schema_metadata_manager)
.await
.unwrap();
match opts {
crate::region::options::CompactionOptions::Twcs(t) => {
assert_eq!(t.time_window_seconds(), expected_window, "{case_name}");
}
}
}
}
#[tokio::test] #[tokio::test]
async fn test_schedule_empty() { async fn test_schedule_empty() {
let env = SchedulerEnv::new().await; let env = SchedulerEnv::new().await;

View File

@@ -35,7 +35,7 @@ use crate::access_layer::{
}; };
use crate::cache::{CacheManager, CacheManagerRef}; use crate::cache::{CacheManager, CacheManagerRef};
use crate::compaction::picker::{PickerOutput, new_picker}; use crate::compaction::picker::{PickerOutput, new_picker};
use crate::compaction::{CompactionOutput, CompactionSstReaderBuilder, find_dynamic_options}; use crate::compaction::{CompactionOutput, CompactionSstReaderBuilder, find_ttl};
use crate::config::MitoConfig; use crate::config::MitoConfig;
use crate::error::{ use crate::error::{
EmptyRegionDirSnafu, InvalidPartitionExprSnafu, JoinSnafu, ObjectStoreNotFoundSnafu, Result, EmptyRegionDirSnafu, InvalidPartitionExprSnafu, JoinSnafu, ObjectStoreNotFoundSnafu, Result,
@@ -203,22 +203,16 @@ pub async fn open_compaction_region(
// Use the specified ttl. // Use the specified ttl.
Either::Left(ttl) => ttl, Either::Left(ttl) => ttl,
// Get the ttl from the schema metadata manager. // Get the ttl from the schema metadata manager.
Either::Right(schema_metadata_manager) => { Either::Right(schema_metadata_manager) => find_ttl(
let (_, ttl) = find_dynamic_options( req.region_id.table_id(),
req.region_id.table_id(), current_version.options.ttl,
&req.region_options, &schema_metadata_manager,
&schema_metadata_manager, )
) .await
.await .unwrap_or_else(|e| {
.unwrap_or_else(|e| { warn!(e; "Failed to get ttl for region: {}", region_metadata.region_id);
warn!(e; "Failed to get ttl for region: {}", region_metadata.region_id); TimeToLive::default()
( }),
crate::region::options::CompactionOptions::default(),
TimeToLive::default(),
)
});
ttl
}
}; };
Ok(CompactionRegion { Ok(CompactionRegion {

View File

@@ -162,7 +162,6 @@ impl CompactionTaskImpl {
edit, edit,
result: Ok(()), result: Ok(()),
update_region_state: false, update_region_state: false,
is_staging: false,
}), }),
}) })
.await; .await;

View File

@@ -244,7 +244,6 @@ mod tests {
options: RegionOptions { options: RegionOptions {
ttl: ttl.map(|t| t.into()), ttl: ttl.map(|t| t.into()),
compaction: Default::default(), compaction: Default::default(),
compaction_override: false,
storage: None, storage: None,
append_mode: false, append_mode: false,
wal_options: Default::default(), wal_options: Default::default(),

View File

@@ -76,8 +76,6 @@ mod copy_region_from_test;
#[cfg(test)] #[cfg(test)]
mod remap_manifests_test; mod remap_manifests_test;
#[cfg(test)]
mod apply_staging_manifest_test;
mod puffin_index; mod puffin_index;
use std::any::Any; use std::any::Any;
@@ -89,7 +87,6 @@ use api::region::RegionResponse;
use async_trait::async_trait; use async_trait::async_trait;
use common_base::Plugins; use common_base::Plugins;
use common_error::ext::BoxedError; use common_error::ext::BoxedError;
use common_meta::error::UnexpectedSnafu;
use common_meta::key::SchemaMetadataManagerRef; use common_meta::key::SchemaMetadataManagerRef;
use common_recordbatch::{MemoryPermit, QueryMemoryTracker, SendableRecordBatchStream}; use common_recordbatch::{MemoryPermit, QueryMemoryTracker, SendableRecordBatchStream};
use common_stat::get_total_memory_bytes; use common_stat::get_total_memory_bytes;
@@ -108,10 +105,10 @@ use store_api::metric_engine_consts::{
MANIFEST_INFO_EXTENSION_KEY, TABLE_COLUMN_METADATA_EXTENSION_KEY, MANIFEST_INFO_EXTENSION_KEY, TABLE_COLUMN_METADATA_EXTENSION_KEY,
}; };
use store_api::region_engine::{ use store_api::region_engine::{
BatchResponses, MitoCopyRegionFromRequest, MitoCopyRegionFromResponse, RegionEngine, BatchResponses, CopyRegionFromRequest, CopyRegionFromResponse, MitoCopyRegionFromResponse,
RegionManifestInfo, RegionRole, RegionScannerRef, RegionStatistic, RemapManifestsRequest, RegionEngine, RegionManifestInfo, RegionRole, RegionScannerRef, RegionStatistic,
RemapManifestsResponse, SetRegionRoleStateResponse, SettableRegionRoleState, RemapManifestsRequest, RemapManifestsResponse, SetRegionRoleStateResponse,
SyncRegionFromRequest, SyncRegionFromResponse, SettableRegionRoleState, SyncManifestResponse,
}; };
use store_api::region_request::{ use store_api::region_request::{
AffectedRows, RegionCatchupRequest, RegionOpenRequest, RegionRequest, AffectedRows, RegionCatchupRequest, RegionOpenRequest, RegionRequest,
@@ -125,8 +122,8 @@ use crate::cache::{CacheManagerRef, CacheStrategy};
use crate::config::MitoConfig; use crate::config::MitoConfig;
use crate::engine::puffin_index::{IndexEntryContext, collect_index_entries_from_puffin}; use crate::engine::puffin_index::{IndexEntryContext, collect_index_entries_from_puffin};
use crate::error::{ use crate::error::{
InvalidRequestSnafu, JoinSnafu, MitoManifestInfoSnafu, RecvSnafu, RegionNotFoundSnafu, Result, self, InvalidRequestSnafu, JoinSnafu, MitoManifestInfoSnafu, RecvSnafu, RegionNotFoundSnafu,
SerdeJsonSnafu, SerializeColumnMetadataSnafu, SerializeManifestSnafu, Result, SerdeJsonSnafu, SerializeColumnMetadataSnafu, SerializeManifestSnafu,
}; };
#[cfg(feature = "enterprise")] #[cfg(feature = "enterprise")]
use crate::extension::BoxedExtensionRangeProviderFactory; use crate::extension::BoxedExtensionRangeProviderFactory;
@@ -398,7 +395,7 @@ impl MitoEngine {
} }
/// Edit region's metadata by [RegionEdit] directly. Use with care. /// Edit region's metadata by [RegionEdit] directly. Use with care.
/// Now we only allow adding files to or removing files from a region (the [RegionEdit] struct can only contain a non-empty "files_to_add" or "files_to_remove" field). /// Now we only allow adding files to a region (the [RegionEdit] struct can only contain a non-empty "files_to_add" field).
/// Other region editing intentions will result in an "invalid request" error. /// Other region editing intentions will result in an "invalid request" error.
/// Also note that if a region is to be edited directly, we MUST not write data to it thereafter. /// Also note that if a region is to be edited directly, we MUST not write data to it thereafter.
pub async fn edit_region(&self, region_id: RegionId, edit: RegionEdit) -> Result<()> { pub async fn edit_region(&self, region_id: RegionId, edit: RegionEdit) -> Result<()> {
@@ -433,7 +430,7 @@ impl MitoEngine {
pub async fn copy_region_from( pub async fn copy_region_from(
&self, &self,
region_id: RegionId, region_id: RegionId,
request: MitoCopyRegionFromRequest, request: CopyRegionFromRequest,
) -> Result<MitoCopyRegionFromResponse> { ) -> Result<MitoCopyRegionFromResponse> {
self.inner.copy_region_from(region_id, request).await self.inner.copy_region_from(region_id, request).await
} }
@@ -642,7 +639,8 @@ impl MitoEngine {
/// ///
/// Only adding files to or removing files from a region is considered valid now. /// Only adding files to or removing files from a region is considered valid now.
fn is_valid_region_edit(edit: &RegionEdit) -> bool { fn is_valid_region_edit(edit: &RegionEdit) -> bool {
(!edit.files_to_add.is_empty() || !edit.files_to_remove.is_empty()) !edit.files_to_add.is_empty()
&& edit.files_to_remove.is_empty()
&& matches!( && matches!(
edit, edit,
RegionEdit { RegionEdit {
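The two sides of is_valid_region_edit differ only in the file-list predicate: the left accepts edits that add and/or remove files, while the right requires a non-empty files_to_add and an empty files_to_remove. A reduced sketch of the two predicates over a simplified stand-in struct; the real RegionEdit also requires the remaining fields to be None, as the elided matches! arm checks.
/// Simplified stand-in for RegionEdit; only the two file lists matter here.
struct Edit {
    files_to_add: Vec<u32>,
    files_to_remove: Vec<u32>,
}
// Left side: adding and/or removing files is allowed.
fn valid_add_or_remove(e: &Edit) -> bool {
    !e.files_to_add.is_empty() || !e.files_to_remove.is_empty()
}
// Right side: only pure additions are allowed.
fn valid_add_only(e: &Edit) -> bool {
    !e.files_to_add.is_empty() && e.files_to_remove.is_empty()
}
fn main() {
    let add_and_remove = Edit { files_to_add: vec![1], files_to_remove: vec![2] };
    // Accepted on the left, rejected on the right.
    assert!(valid_add_or_remove(&add_and_remove));
    assert!(!valid_add_only(&add_and_remove));
}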
@@ -1075,7 +1073,7 @@ impl EngineInner {
async fn copy_region_from( async fn copy_region_from(
&self, &self,
region_id: RegionId, region_id: RegionId,
request: MitoCopyRegionFromRequest, request: CopyRegionFromRequest,
) -> Result<MitoCopyRegionFromResponse> { ) -> Result<MitoCopyRegionFromResponse> {
let (request, receiver) = let (request, receiver) =
WorkerRequest::try_from_copy_region_from_request(region_id, request)?; WorkerRequest::try_from_copy_region_from_request(region_id, request)?;
@@ -1249,21 +1247,15 @@ impl RegionEngine for MitoEngine {
async fn sync_region( async fn sync_region(
&self, &self,
region_id: RegionId, region_id: RegionId,
request: SyncRegionFromRequest, manifest_info: RegionManifestInfo,
) -> Result<SyncRegionFromResponse, BoxedError> { ) -> Result<SyncManifestResponse, BoxedError> {
let manifest_info = request
.into_region_manifest_info()
.context(UnexpectedSnafu {
err_msg: "Expected a manifest info request",
})
.map_err(BoxedError::new)?;
let (_, synced) = self let (_, synced) = self
.inner .inner
.sync_region(region_id, manifest_info) .sync_region(region_id, manifest_info)
.await .await
.map_err(BoxedError::new)?; .map_err(BoxedError::new)?;
Ok(SyncRegionFromResponse::Mito { synced }) Ok(SyncManifestResponse::Mito { synced })
} }
async fn remap_manifests( async fn remap_manifests(
@@ -1276,6 +1268,19 @@ impl RegionEngine for MitoEngine {
.map_err(BoxedError::new) .map_err(BoxedError::new)
} }
async fn copy_region_from(
&self,
_region_id: RegionId,
_request: CopyRegionFromRequest,
) -> Result<CopyRegionFromResponse, BoxedError> {
Err(BoxedError::new(
error::UnsupportedOperationSnafu {
err_msg: "copy_region_from is not supported",
}
.build(),
))
}
fn role(&self, region_id: RegionId) -> Option<RegionRole> { fn role(&self, region_id: RegionId) -> Option<RegionRole> {
self.inner.role(region_id) self.inner.role(region_id)
} }
@@ -1414,7 +1419,7 @@ mod tests {
}; };
assert!(is_valid_region_edit(&edit)); assert!(is_valid_region_edit(&edit));
// Invalid: "files_to_add" and "files_to_remove" are both empty // Invalid: "files_to_add" is empty
let edit = RegionEdit { let edit = RegionEdit {
files_to_add: vec![], files_to_add: vec![],
files_to_remove: vec![], files_to_remove: vec![],
@@ -1426,7 +1431,7 @@ mod tests {
}; };
assert!(!is_valid_region_edit(&edit)); assert!(!is_valid_region_edit(&edit));
// Valid: "files_to_remove" is not empty // Invalid: "files_to_remove" is not empty
let edit = RegionEdit { let edit = RegionEdit {
files_to_add: vec![FileMeta::default()], files_to_add: vec![FileMeta::default()],
files_to_remove: vec![FileMeta::default()], files_to_remove: vec![FileMeta::default()],
@@ -1436,7 +1441,7 @@ mod tests {
flushed_sequence: None, flushed_sequence: None,
committed_sequence: None, committed_sequence: None,
}; };
assert!(is_valid_region_edit(&edit)); assert!(!is_valid_region_edit(&edit));
// Invalid: other fields are not all "None"s // Invalid: other fields are not all "None"s
let edit = RegionEdit { let edit = RegionEdit {

View File

@@ -1,400 +0,0 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use std::assert_matches::assert_matches;
use std::fs;
use api::v1::Rows;
use datatypes::value::Value;
use partition::expr::{PartitionExpr, col};
use store_api::region_engine::{
RegionEngine, RegionRole, RemapManifestsRequest, SettableRegionRoleState,
};
use store_api::region_request::{
ApplyStagingManifestRequest, EnterStagingRequest, RegionFlushRequest, RegionRequest,
};
use store_api::storage::{FileId, RegionId};
use crate::config::MitoConfig;
use crate::error::Error;
use crate::manifest::action::RegionManifest;
use crate::sst::file::FileMeta;
use crate::test_util::{CreateRequestBuilder, TestEnv, build_rows, put_rows, rows_schema};
fn range_expr(col_name: &str, start: i64, end: i64) -> PartitionExpr {
col(col_name)
.gt_eq(Value::Int64(start))
.and(col(col_name).lt(Value::Int64(end)))
}
#[tokio::test]
async fn test_apply_staging_manifest_invalid_region_state() {
common_telemetry::init_default_ut_logging();
test_apply_staging_manifest_invalid_region_state_with_format(false).await;
test_apply_staging_manifest_invalid_region_state_with_format(true).await;
}
async fn test_apply_staging_manifest_invalid_region_state_with_format(flat_format: bool) {
let mut env = TestEnv::with_prefix("invalid-region-state").await;
let engine = env
.create_engine(MitoConfig {
default_experimental_flat_format: flat_format,
..Default::default()
})
.await;
let region_id = RegionId::new(1, 1);
let request = CreateRequestBuilder::new()
.partition_expr_json(Some(range_expr("x", 0, 50).as_json_str().unwrap()))
.build();
engine
.handle_request(region_id, RegionRequest::Create(request))
.await
.unwrap();
// Region is in leader state, apply staging manifest request should fail.
let err = engine
.handle_request(
region_id,
RegionRequest::ApplyStagingManifest(ApplyStagingManifestRequest {
partition_expr: range_expr("x", 0, 100).as_json_str().unwrap(),
files_to_add: vec![],
}),
)
.await
.unwrap_err();
assert_matches!(
err.into_inner().as_any().downcast_ref::<Error>().unwrap(),
Error::RegionState { .. }
);
// Region is in leader state, apply staging manifest request should fail.
engine
.set_region_role(region_id, RegionRole::Follower)
.unwrap();
let err = engine
.handle_request(
region_id,
RegionRequest::ApplyStagingManifest(ApplyStagingManifestRequest {
partition_expr: range_expr("x", 0, 100).as_json_str().unwrap(),
files_to_add: vec![],
}),
)
.await
.unwrap_err();
assert_matches!(
err.into_inner().as_any().downcast_ref::<Error>().unwrap(),
Error::RegionState { .. }
);
}
#[tokio::test]
async fn test_apply_staging_manifest_mismatched_partition_expr() {
common_telemetry::init_default_ut_logging();
test_apply_staging_manifest_mismatched_partition_expr_with_format(false).await;
test_apply_staging_manifest_mismatched_partition_expr_with_format(true).await;
}
async fn test_apply_staging_manifest_mismatched_partition_expr_with_format(flat_format: bool) {
let mut env = TestEnv::with_prefix("mismatched-partition-expr").await;
let engine = env
.create_engine(MitoConfig {
default_experimental_flat_format: flat_format,
..Default::default()
})
.await;
let region_id = RegionId::new(1, 1);
let request = CreateRequestBuilder::new().build();
engine
.handle_request(region_id, RegionRequest::Create(request))
.await
.unwrap();
engine
.handle_request(
region_id,
RegionRequest::EnterStaging(EnterStagingRequest {
partition_expr: range_expr("x", 0, 50).as_json_str().unwrap(),
}),
)
.await
.unwrap();
let err = engine
.handle_request(
region_id,
RegionRequest::ApplyStagingManifest(ApplyStagingManifestRequest {
partition_expr: range_expr("x", 0, 100).as_json_str().unwrap(),
files_to_add: vec![],
}),
)
.await
.unwrap_err();
assert_matches!(
err.into_inner().as_any().downcast_ref::<Error>().unwrap(),
Error::StagingPartitionExprMismatch { .. }
)
}
#[tokio::test]
async fn test_apply_staging_manifest_success() {
common_telemetry::init_default_ut_logging();
test_apply_staging_manifest_success_with_format(false).await;
test_apply_staging_manifest_success_with_format(true).await;
}
async fn test_apply_staging_manifest_success_with_format(flat_format: bool) {
let mut env = TestEnv::with_prefix("success").await;
let engine = env
.create_engine(MitoConfig {
default_experimental_flat_format: flat_format,
..Default::default()
})
.await;
let region_id = RegionId::new(1, 1);
let request = CreateRequestBuilder::new()
.partition_expr_json(Some(range_expr("tag_0", 0, 100).as_json_str().unwrap()))
.build();
let column_schemas = rows_schema(&request);
engine
.handle_request(region_id, RegionRequest::Create(request))
.await
.unwrap();
let new_region_id_1 = RegionId::new(1, 2);
let new_region_id_2 = RegionId::new(1, 3);
// Generate some data
for i in 0..3 {
let rows_data = Rows {
schema: column_schemas.clone(),
rows: build_rows(i * 10, (i + 1) * 10),
};
put_rows(&engine, region_id, rows_data).await;
engine
.handle_request(
region_id,
RegionRequest::Flush(RegionFlushRequest {
row_group_size: None,
}),
)
.await
.unwrap();
}
engine
.set_region_role_state_gracefully(region_id, SettableRegionRoleState::StagingLeader)
.await
.unwrap();
let result = engine
.remap_manifests(RemapManifestsRequest {
region_id,
input_regions: vec![region_id],
region_mapping: [(region_id, vec![new_region_id_1, new_region_id_2])]
.into_iter()
.collect(),
new_partition_exprs: [
(
new_region_id_1,
range_expr("tag_0", 0, 50).as_json_str().unwrap(),
),
(
new_region_id_2,
range_expr("tag_0", 50, 100).as_json_str().unwrap(),
),
]
.into_iter()
.collect(),
})
.await
.unwrap();
assert_eq!(result.new_manifests.len(), 2);
let new_manifest_1 =
serde_json::from_str::<RegionManifest>(&result.new_manifests[&new_region_id_1]).unwrap();
let new_manifest_2 =
serde_json::from_str::<RegionManifest>(&result.new_manifests[&new_region_id_2]).unwrap();
assert_eq!(new_manifest_1.files.len(), 3);
assert_eq!(new_manifest_2.files.len(), 3);
let request = CreateRequestBuilder::new().build();
engine
.handle_request(new_region_id_1, RegionRequest::Create(request))
.await
.unwrap();
engine
.handle_request(
new_region_id_1,
RegionRequest::EnterStaging(EnterStagingRequest {
partition_expr: range_expr("tag_0", 0, 50).as_json_str().unwrap(),
}),
)
.await
.unwrap();
let mut files_to_add = new_manifest_1.files.values().cloned().collect::<Vec<_>>();
// Before applying the staging manifest, there should be no files
let region = engine.get_region(new_region_id_1).unwrap();
let manifest = region.manifest_ctx.manifest().await;
assert_eq!(manifest.files.len(), 0);
let staging_manifest = region.manifest_ctx.staging_manifest().await.unwrap();
assert_eq!(staging_manifest.files.len(), 0);
engine
.handle_request(
new_region_id_1,
RegionRequest::ApplyStagingManifest(ApplyStagingManifestRequest {
partition_expr: range_expr("tag_0", 0, 50).as_json_str().unwrap(),
files_to_add: serde_json::to_vec(&files_to_add).unwrap(),
}),
)
.await
.unwrap();
// After applying the staging manifest, the files should match those in the new manifest
let region = engine.get_region(new_region_id_1).unwrap();
let manifest = region.manifest_ctx.manifest().await;
assert_eq!(manifest.files.len(), 3);
assert!(region.is_writable());
assert!(!region.is_staging());
// The manifest partition expr should be the same as the request.
assert_eq!(
manifest.metadata.partition_expr.as_ref().unwrap(),
&range_expr("tag_0", 0, 50).as_json_str().unwrap()
);
// The staging manifest should be cleared.
let staging_manifest = region.manifest_ctx.staging_manifest().await;
assert!(staging_manifest.is_none());
// The staging partition expr should be cleared.
assert!(region.staging_partition_expr.lock().unwrap().is_none());
// The staging manifest directory should be empty.
let data_home = env.data_home();
let region_dir = format!("{}/data/test/1_0000000001", data_home.display());
let staging_manifest_dir = format!("{}/staging/manifest", region_dir);
let staging_files = fs::read_dir(&staging_manifest_dir)
.map(|entries| entries.collect::<Result<Vec<_>, _>>().unwrap_or_default())
.unwrap_or_default();
assert_eq!(staging_files.len(), 0);
// Try to modify the file sequence.
files_to_add.push(FileMeta {
region_id,
file_id: FileId::random(),
..Default::default()
});
// This request will be ignored.
engine
.handle_request(
new_region_id_1,
RegionRequest::ApplyStagingManifest(ApplyStagingManifestRequest {
partition_expr: range_expr("tag_0", 0, 50).as_json_str().unwrap(),
files_to_add: serde_json::to_vec(&files_to_add).unwrap(),
}),
)
.await
.unwrap();
// The number of files should not change.
let region = engine.get_region(new_region_id_1).unwrap();
let manifest = region.manifest_ctx.manifest().await;
assert_eq!(manifest.files.len(), 3);
}
#[tokio::test]
async fn test_apply_staging_manifest_invalid_files_to_add() {
common_telemetry::init_default_ut_logging();
test_apply_staging_manifest_invalid_files_to_add_with_format(false).await;
test_apply_staging_manifest_invalid_files_to_add_with_format(true).await;
}
async fn test_apply_staging_manifest_invalid_files_to_add_with_format(flat_format: bool) {
let mut env = TestEnv::with_prefix("invalid-files-to-add").await;
let engine = env
.create_engine(MitoConfig {
default_experimental_flat_format: flat_format,
..Default::default()
})
.await;
let region_id = RegionId::new(1, 1);
let request = CreateRequestBuilder::new().build();
engine
.handle_request(region_id, RegionRequest::Create(request))
.await
.unwrap();
engine
.handle_request(
region_id,
RegionRequest::EnterStaging(EnterStagingRequest {
partition_expr: range_expr("tag_0", 0, 50).as_json_str().unwrap(),
}),
)
.await
.unwrap();
let err = engine
.handle_request(
region_id,
RegionRequest::ApplyStagingManifest(ApplyStagingManifestRequest {
partition_expr: range_expr("tag_0", 0, 50).as_json_str().unwrap(),
files_to_add: b"invalid".to_vec(),
}),
)
.await
.unwrap_err();
assert_matches!(
err.into_inner().as_any().downcast_ref::<Error>().unwrap(),
Error::SerdeJson { .. }
);
}
#[tokio::test]
async fn test_apply_staging_manifest_empty_files() {
common_telemetry::init_default_ut_logging();
test_apply_staging_manifest_empty_files_with_format(false).await;
test_apply_staging_manifest_empty_files_with_format(true).await;
}
async fn test_apply_staging_manifest_empty_files_with_format(flat_format: bool) {
let mut env = TestEnv::with_prefix("empty-files").await;
let engine = env
.create_engine(MitoConfig {
default_experimental_flat_format: flat_format,
..Default::default()
})
.await;
let region_id = RegionId::new(1, 1);
let request = CreateRequestBuilder::new().build();
engine
.handle_request(region_id, RegionRequest::Create(request))
.await
.unwrap();
engine
.handle_request(
region_id,
RegionRequest::EnterStaging(EnterStagingRequest {
partition_expr: range_expr("tag_0", 0, 50).as_json_str().unwrap(),
}),
)
.await
.unwrap();
engine
.handle_request(
region_id,
RegionRequest::ApplyStagingManifest(ApplyStagingManifestRequest {
partition_expr: range_expr("tag_0", 0, 50).as_json_str().unwrap(),
files_to_add: serde_json::to_vec::<Vec<FileMeta>>(&vec![]).unwrap(),
}),
)
.await
.unwrap();
let region = engine.get_region(region_id).unwrap();
let manifest = region.manifest_ctx.manifest().await;
assert_eq!(manifest.files.len(), 0);
let staging_manifest = region.manifest_ctx.staging_manifest().await;
assert!(staging_manifest.is_none());
let staging_partition_expr = region.staging_partition_expr.lock().unwrap();
assert!(staging_partition_expr.is_none());
}

View File

@@ -20,7 +20,7 @@ use api::v1::Rows;
use common_error::ext::ErrorExt; use common_error::ext::ErrorExt;
use common_error::status_code::StatusCode; use common_error::status_code::StatusCode;
use object_store::layers::mock::{Error as MockError, ErrorKind, MockLayerBuilder}; use object_store::layers::mock::{Error as MockError, ErrorKind, MockLayerBuilder};
use store_api::region_engine::{MitoCopyRegionFromRequest, RegionEngine, RegionRole}; use store_api::region_engine::{CopyRegionFromRequest, RegionEngine, RegionRole};
use store_api::region_request::{RegionFlushRequest, RegionRequest}; use store_api::region_request::{RegionFlushRequest, RegionRequest};
use store_api::storage::RegionId; use store_api::storage::RegionId;
@@ -89,7 +89,7 @@ async fn test_engine_copy_region_from_with_format(flat_format: bool, with_index:
let resp = engine let resp = engine
.copy_region_from( .copy_region_from(
target_region_id, target_region_id,
MitoCopyRegionFromRequest { CopyRegionFromRequest {
source_region_id, source_region_id,
parallelism: 1, parallelism: 1,
}, },
@@ -126,7 +126,7 @@ async fn test_engine_copy_region_from_with_format(flat_format: bool, with_index:
let resp2 = engine let resp2 = engine
.copy_region_from( .copy_region_from(
target_region_id, target_region_id,
MitoCopyRegionFromRequest { CopyRegionFromRequest {
source_region_id, source_region_id,
parallelism: 1, parallelism: 1,
}, },
@@ -207,7 +207,7 @@ async fn test_engine_copy_region_failure_with_format(flat_format: bool) {
let err = engine let err = engine
.copy_region_from( .copy_region_from(
target_region_id, target_region_id,
MitoCopyRegionFromRequest { CopyRegionFromRequest {
source_region_id, source_region_id,
parallelism: 1, parallelism: 1,
}, },
@@ -225,6 +225,7 @@ async fn test_engine_copy_region_failure_with_format(flat_format: bool) {
let source_region_dir = format!("{}/data/test/1_0000000001", env.data_home().display()); let source_region_dir = format!("{}/data/test/1_0000000001", env.data_home().display());
assert_file_num_in_dir(&source_region_dir, 1); assert_file_num_in_dir(&source_region_dir, 1);
assert_file_num_in_dir(&format!("{}/index", source_region_dir), 1); assert_file_num_in_dir(&format!("{}/index", source_region_dir), 1);
assert_eq!( assert_eq!(
source_region_files, source_region_files,
collect_filename_in_dir(&source_region_dir) collect_filename_in_dir(&source_region_dir)
@@ -297,7 +298,7 @@ async fn test_engine_copy_region_invalid_args_with_format(flat_format: bool) {
let err = engine let err = engine
.copy_region_from( .copy_region_from(
region_id, region_id,
MitoCopyRegionFromRequest { CopyRegionFromRequest {
source_region_id: RegionId::new(2, 1), source_region_id: RegionId::new(2, 1),
parallelism: 1, parallelism: 1,
}, },
@@ -308,7 +309,7 @@ async fn test_engine_copy_region_invalid_args_with_format(flat_format: bool) {
let err = engine let err = engine
.copy_region_from( .copy_region_from(
region_id, region_id,
MitoCopyRegionFromRequest { CopyRegionFromRequest {
source_region_id: RegionId::new(1, 1), source_region_id: RegionId::new(1, 1),
parallelism: 1, parallelism: 1,
}, },
@@ -346,7 +347,7 @@ async fn test_engine_copy_region_unexpected_state_with_format(flat_format: bool)
let err = engine let err = engine
.copy_region_from( .copy_region_from(
region_id, region_id,
MitoCopyRegionFromRequest { CopyRegionFromRequest {
source_region_id: RegionId::new(1, 2), source_region_id: RegionId::new(1, 2),
parallelism: 1, parallelism: 1,
}, },

View File

@@ -153,7 +153,7 @@ async fn test_sync_after_flush_region_with_format(flat_format: bool) {
// Returns error since the max manifest is 1 // Returns error since the max manifest is 1
let manifest_info = RegionManifestInfo::mito(2, 0, 0); let manifest_info = RegionManifestInfo::mito(2, 0, 0);
let err = follower_engine let err = follower_engine
.sync_region(region_id, manifest_info.into()) .sync_region(region_id, manifest_info)
.await .await
.unwrap_err(); .unwrap_err();
let err = err.as_any().downcast_ref::<Error>().unwrap(); let err = err.as_any().downcast_ref::<Error>().unwrap();
@@ -161,7 +161,7 @@ async fn test_sync_after_flush_region_with_format(flat_format: bool) {
let manifest_info = RegionManifestInfo::mito(1, 0, 0); let manifest_info = RegionManifestInfo::mito(1, 0, 0);
follower_engine follower_engine
.sync_region(region_id, manifest_info.into()) .sync_region(region_id, manifest_info)
.await .await
.unwrap(); .unwrap();
common_telemetry::info!("Scan the region on the follower engine after sync"); common_telemetry::info!("Scan the region on the follower engine after sync");
@@ -266,7 +266,7 @@ async fn test_sync_after_alter_region_with_format(flat_format: bool) {
// Sync the region from the leader engine to the follower engine // Sync the region from the leader engine to the follower engine
let manifest_info = RegionManifestInfo::mito(2, 0, 0); let manifest_info = RegionManifestInfo::mito(2, 0, 0);
follower_engine follower_engine
.sync_region(region_id, manifest_info.into()) .sync_region(region_id, manifest_info)
.await .await
.unwrap(); .unwrap();
let expected = "\ let expected = "\

View File

@@ -26,7 +26,7 @@ use either::Either;
use partition::expr::PartitionExpr; use partition::expr::PartitionExpr;
use smallvec::{SmallVec, smallvec}; use smallvec::{SmallVec, smallvec};
use snafu::ResultExt; use snafu::ResultExt;
use store_api::storage::{RegionId, SequenceNumber}; use store_api::storage::RegionId;
use strum::IntoStaticStr; use strum::IntoStaticStr;
use tokio::sync::{Semaphore, mpsc, watch}; use tokio::sync::{Semaphore, mpsc, watch};
@@ -464,26 +464,24 @@ impl RegionFlushTask {
// Sets `for_flush` flag to true. // Sets `for_flush` flag to true.
let mem_ranges = mem.ranges(None, RangesOptions::for_flush())?; let mem_ranges = mem.ranges(None, RangesOptions::for_flush())?;
let num_mem_ranges = mem_ranges.ranges.len(); let num_mem_ranges = mem_ranges.ranges.len();
let num_mem_rows = mem_ranges.stats.num_rows();
// Aggregate stats from all ranges
let num_mem_rows = mem_ranges.num_rows();
let memtable_series_count = mem_ranges.series_count();
let memtable_id = mem.id(); let memtable_id = mem.id();
// Increases series count for each mem range. We assume each mem range has different series, so // Increases series count for each mem range. We assume each mem range has different series, so
// the counter may report more series than the actual series count. // the counter may report more series than the actual series count.
series_count += memtable_series_count; series_count += mem_ranges.stats.series_count();
if mem_ranges.is_record_batch() { if mem_ranges.is_record_batch() {
let flush_start = Instant::now(); let flush_start = Instant::now();
let FlushFlatMemResult { let FlushFlatMemResult {
num_encoded, num_encoded,
max_sequence,
num_sources, num_sources,
results, results,
} = self } = self
.flush_flat_mem_ranges(version, &write_opts, mem_ranges) .flush_flat_mem_ranges(version, &write_opts, mem_ranges)
.await?; .await?;
for (source_idx, result) in results.into_iter().enumerate() { for (source_idx, result) in results.into_iter().enumerate() {
let (max_sequence, ssts_written, metrics) = result?; let (ssts_written, metrics) = result?;
if ssts_written.is_empty() { if ssts_written.is_empty() {
// No data written. // No data written.
continue; continue;
@@ -523,7 +521,7 @@ impl RegionFlushTask {
compact_cost, compact_cost,
); );
} else { } else {
let max_sequence = mem_ranges.max_sequence(); let max_sequence = mem_ranges.stats.max_sequence();
let source = memtable_source(mem_ranges, &version.options).await?; let source = memtable_source(mem_ranges, &version.options).await?;
// Flush to level 0. // Flush to level 0.
@@ -585,7 +583,8 @@ impl RegionFlushTask {
)?; )?;
let mut tasks = Vec::with_capacity(flat_sources.encoded.len() + flat_sources.sources.len()); let mut tasks = Vec::with_capacity(flat_sources.encoded.len() + flat_sources.sources.len());
let num_encoded = flat_sources.encoded.len(); let num_encoded = flat_sources.encoded.len();
for (source, max_sequence) in flat_sources.sources { let max_sequence = flat_sources.max_sequence;
for source in flat_sources.sources {
let source = Either::Right(source); let source = Either::Right(source);
let write_request = self.new_write_request(version, max_sequence, source); let write_request = self.new_write_request(version, max_sequence, source);
let access_layer = self.access_layer.clone(); let access_layer = self.access_layer.clone();
@@ -597,11 +596,11 @@ impl RegionFlushTask {
let ssts = access_layer let ssts = access_layer
.write_sst(write_request, &write_opts, &mut metrics) .write_sst(write_request, &write_opts, &mut metrics)
.await?; .await?;
Ok((max_sequence, ssts, metrics)) Ok((ssts, metrics))
}); });
tasks.push(task); tasks.push(task);
} }
for (encoded, max_sequence) in flat_sources.encoded { for encoded in flat_sources.encoded {
let access_layer = self.access_layer.clone(); let access_layer = self.access_layer.clone();
let cache_manager = self.cache_manager.clone(); let cache_manager = self.cache_manager.clone();
let region_id = version.metadata.region_id; let region_id = version.metadata.region_id;
@@ -611,7 +610,7 @@ impl RegionFlushTask {
let metrics = access_layer let metrics = access_layer
.put_sst(&encoded.data, region_id, &encoded.sst_info, &cache_manager) .put_sst(&encoded.data, region_id, &encoded.sst_info, &cache_manager)
.await?; .await?;
Ok((max_sequence, smallvec![encoded.sst_info], metrics)) Ok((smallvec![encoded.sst_info], metrics))
}); });
tasks.push(task); tasks.push(task);
} }
@@ -621,6 +620,7 @@ impl RegionFlushTask {
.context(JoinSnafu)?; .context(JoinSnafu)?;
Ok(FlushFlatMemResult { Ok(FlushFlatMemResult {
num_encoded, num_encoded,
max_sequence,
num_sources, num_sources,
results, results,
}) })
@@ -696,8 +696,9 @@ impl RegionFlushTask {
struct FlushFlatMemResult { struct FlushFlatMemResult {
num_encoded: usize, num_encoded: usize,
max_sequence: u64,
num_sources: usize, num_sources: usize,
results: Vec<Result<(SequenceNumber, SstInfoArray, Metrics)>>, results: Vec<Result<(SstInfoArray, Metrics)>>,
} }
struct DoFlushMemtablesResult { struct DoFlushMemtablesResult {
@@ -743,8 +744,9 @@ async fn memtable_source(mem_ranges: MemtableRanges, options: &RegionOptions) ->
} }
struct FlatSources { struct FlatSources {
sources: SmallVec<[(FlatSource, SequenceNumber); 4]>, max_sequence: u64,
encoded: SmallVec<[(EncodedRange, SequenceNumber); 4]>, sources: SmallVec<[FlatSource; 4]>,
encoded: SmallVec<[EncodedRange; 4]>,
} }
/// Returns the max sequence and [FlatSource] for the given memtable. /// Returns the max sequence and [FlatSource] for the given memtable.
@@ -754,17 +756,18 @@ fn memtable_flat_sources(
options: &RegionOptions, options: &RegionOptions,
field_column_start: usize, field_column_start: usize,
) -> Result<FlatSources> { ) -> Result<FlatSources> {
let MemtableRanges { ranges } = mem_ranges; let MemtableRanges { ranges, stats } = mem_ranges;
let max_sequence = stats.max_sequence();
let mut flat_sources = FlatSources { let mut flat_sources = FlatSources {
max_sequence,
sources: SmallVec::new(), sources: SmallVec::new(),
encoded: SmallVec::new(), encoded: SmallVec::new(),
}; };
if ranges.len() == 1 { if ranges.len() == 1 {
let only_range = ranges.into_values().next().unwrap(); let only_range = ranges.into_values().next().unwrap();
let max_sequence = only_range.stats().max_sequence();
if let Some(encoded) = only_range.encoded() { if let Some(encoded) = only_range.encoded() {
flat_sources.encoded.push((encoded, max_sequence)); flat_sources.encoded.push(encoded);
} else { } else {
let iter = only_range.build_record_batch_iter(None)?; let iter = only_range.build_record_batch_iter(None)?;
// Dedup according to append mode and merge mode. // Dedup according to append mode and merge mode.
@@ -775,39 +778,25 @@ fn memtable_flat_sources(
field_column_start, field_column_start,
iter, iter,
); );
flat_sources flat_sources.sources.push(FlatSource::Iter(iter));
.sources
.push((FlatSource::Iter(iter), max_sequence));
}; };
} else { } else {
// Calculate total rows from all ranges for min_flush_rows calculation let min_flush_rows = stats.num_rows / 8;
let total_rows: usize = ranges.values().map(|r| r.stats().num_rows()).sum();
let min_flush_rows = total_rows / 8;
let min_flush_rows = min_flush_rows.max(DEFAULT_ROW_GROUP_SIZE); let min_flush_rows = min_flush_rows.max(DEFAULT_ROW_GROUP_SIZE);
let mut last_iter_rows = 0; let mut last_iter_rows = 0;
let num_ranges = ranges.len(); let num_ranges = ranges.len();
let mut input_iters = Vec::with_capacity(num_ranges); let mut input_iters = Vec::with_capacity(num_ranges);
let mut current_ranges = Vec::new();
for (_range_id, range) in ranges { for (_range_id, range) in ranges {
if let Some(encoded) = range.encoded() { if let Some(encoded) = range.encoded() {
let max_sequence = range.stats().max_sequence(); flat_sources.encoded.push(encoded);
flat_sources.encoded.push((encoded, max_sequence));
continue; continue;
} }
let iter = range.build_record_batch_iter(None)?; let iter = range.build_record_batch_iter(None)?;
input_iters.push(iter); input_iters.push(iter);
last_iter_rows += range.num_rows(); last_iter_rows += range.num_rows();
current_ranges.push(range);
if last_iter_rows > min_flush_rows { if last_iter_rows > min_flush_rows {
// Calculate max_sequence from all merged ranges
let max_sequence = current_ranges
.iter()
.map(|r| r.stats().max_sequence())
.max()
.unwrap_or(0);
let maybe_dedup = merge_and_dedup( let maybe_dedup = merge_and_dedup(
&schema, &schema,
options.append_mode, options.append_mode,
@@ -816,22 +805,13 @@ fn memtable_flat_sources(
std::mem::replace(&mut input_iters, Vec::with_capacity(num_ranges)), std::mem::replace(&mut input_iters, Vec::with_capacity(num_ranges)),
)?; )?;
flat_sources flat_sources.sources.push(FlatSource::Iter(maybe_dedup));
.sources
.push((FlatSource::Iter(maybe_dedup), max_sequence));
last_iter_rows = 0; last_iter_rows = 0;
current_ranges.clear();
} }
} }
// Handle remaining iters. // Handle remaining iters.
if !input_iters.is_empty() { if !input_iters.is_empty() {
let max_sequence = current_ranges
.iter()
.map(|r| r.stats().max_sequence())
.max()
.unwrap_or(0);
let maybe_dedup = merge_and_dedup( let maybe_dedup = merge_and_dedup(
&schema, &schema,
options.append_mode, options.append_mode,
@@ -840,9 +820,7 @@ fn memtable_flat_sources(
input_iters, input_iters,
)?; )?;
flat_sources flat_sources.sources.push(FlatSource::Iter(maybe_dedup));
.sources
.push((FlatSource::Iter(maybe_dedup), max_sequence));
} }
} }
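Both versions group memtable ranges into flush sources the same way: non-encoded ranges accumulate until their combined row count exceeds max(total_rows / 8, DEFAULT_ROW_GROUP_SIZE), then the batch is merged into one source and the counter resets, with the leftovers forming a final batch. A standalone sketch of that grouping over plain row counts; the merge/dedup step and the left side's per-batch max-sequence tracking are elided, and the 102_400 row-group size is an assumed value for illustration.
/// Groups per-range row counts into batches of at least `min_batch_rows`,
/// mirroring how flush sources are built from memtable ranges.
fn batch_ranges(range_rows: &[usize], min_batch_rows: usize) -> Vec<Vec<usize>> {
    let mut batches = Vec::new();
    let mut current = Vec::new();
    let mut rows = 0;
    for &n in range_rows {
        current.push(n);
        rows += n;
        if rows > min_batch_rows {
            batches.push(std::mem::take(&mut current));
            rows = 0;
        }
    }
    // Remaining ranges form the last batch.
    if !current.is_empty() {
        batches.push(current);
    }
    batches
}
fn main() {
    let ranges = [4_000, 5_000, 1_000, 200_000, 300];
    let total: usize = ranges.iter().sum();
    let min_batch_rows = (total / 8).max(102_400);
    let batches = batch_ranges(&ranges, min_batch_rows);
    // The first four ranges exceed the threshold together; the last range flushes on its own.
    assert_eq!(batches.len(), 2);
}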
@@ -1513,7 +1491,7 @@ mod tests {
// Consume the iterator and count rows // Consume the iterator and count rows
let mut total_rows = 0usize; let mut total_rows = 0usize;
for (source, _sequence) in flat_sources.sources { for source in flat_sources.sources {
match source { match source {
crate::read::FlatSource::Iter(iter) => { crate::read::FlatSource::Iter(iter) => {
for rb in iter { for rb in iter {
@@ -1543,7 +1521,7 @@ mod tests {
assert_eq!(1, flat_sources.sources.len()); assert_eq!(1, flat_sources.sources.len());
let mut total_rows = 0usize; let mut total_rows = 0usize;
for (source, _sequence) in flat_sources.sources { for source in flat_sources.sources {
match source { match source {
crate::read::FlatSource::Iter(iter) => { crate::read::FlatSource::Iter(iter) => {
for rb in iter { for rb in iter {

View File

@@ -45,18 +45,6 @@ pub enum RegionMetaAction {
Truncate(RegionTruncate), Truncate(RegionTruncate),
} }
impl RegionMetaAction {
/// Returns true if the action is a change action.
pub fn is_change(&self) -> bool {
matches!(self, RegionMetaAction::Change(_))
}
/// Returns true if the action is an edit action.
pub fn is_edit(&self) -> bool {
matches!(self, RegionMetaAction::Edit(_))
}
}
#[derive(Serialize, Deserialize, Clone, Debug, PartialEq, Eq)] #[derive(Serialize, Deserialize, Clone, Debug, PartialEq, Eq)]
pub struct RegionChange { pub struct RegionChange {
/// The metadata after changed. /// The metadata after changed.
@@ -352,51 +340,16 @@ pub struct RemovedFiles {
/// the files are removed from the manifest. The timestamp is in milliseconds since the Unix epoch. /// the files are removed from the manifest. The timestamp is in milliseconds since the Unix epoch.
pub removed_at: i64, pub removed_at: i64,
/// The set of file ids that are removed. /// The set of file ids that are removed.
#[serde(default)]
pub files: HashSet<RemovedFile>, pub files: HashSet<RemovedFile>,
} }
/// A removed file, which can be a data file (optionally paired with an index file) or an outdated index file. /// A removed file, which can be a data file (optionally paired with an index file) or an outdated index file.
#[derive(Serialize, Hash, Clone, Debug, PartialEq, Eq)] #[derive(Serialize, Deserialize, Hash, Clone, Debug, PartialEq, Eq)]
pub enum RemovedFile { pub enum RemovedFile {
File(FileId, Option<IndexVersion>), File(FileId, Option<IndexVersion>),
Index(FileId, IndexVersion), Index(FileId, IndexVersion),
} }
/// Supports deserializing the old format (just a FileId as a string) into the
/// current format (the RemovedFile enum) for backward compatibility.
/// This is needed in case there are old manifests with removed files recorded.
impl<'de> Deserialize<'de> for RemovedFile {
fn deserialize<D>(deserializer: D) -> std::result::Result<Self, D::Error>
where
D: serde::Deserializer<'de>,
{
#[derive(Deserialize)]
#[serde(untagged)]
enum CompatRemovedFile {
Enum(RemovedFileEnum),
FileId(FileId),
}
#[derive(Deserialize)]
enum RemovedFileEnum {
File(FileId, Option<IndexVersion>),
Index(FileId, IndexVersion),
}
let compat = CompatRemovedFile::deserialize(deserializer)?;
match compat {
CompatRemovedFile::FileId(file_id) => Ok(RemovedFile::File(file_id, None)),
CompatRemovedFile::Enum(e) => match e {
RemovedFileEnum::File(file_id, version) => Ok(RemovedFile::File(file_id, version)),
RemovedFileEnum::Index(file_id, version) => {
Ok(RemovedFile::Index(file_id, version))
}
},
}
}
}
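The removed Deserialize impl uses a common serde migration trick: try the new representation through an untagged helper enum first, then fall back to the legacy bare-id form. A self-contained sketch of the same pattern with a plain string standing in for FileId; serde with the derive feature and serde_json are assumed as dependencies.
use serde::Deserialize;
/// Stand-in for RemovedFile: the new format is a tagged enum, the legacy format a bare id string.
#[derive(Debug, PartialEq)]
enum Removed {
    File(String, Option<u64>),
    Index(String, u64),
}
impl<'de> Deserialize<'de> for Removed {
    fn deserialize<D: serde::Deserializer<'de>>(d: D) -> Result<Self, D::Error> {
        #[derive(Deserialize)]
        #[serde(untagged)]
        enum Compat {
            New(NewFormat),
            Legacy(String),
        }
        #[derive(Deserialize)]
        enum NewFormat {
            File(String, Option<u64>),
            Index(String, u64),
        }
        Ok(match Compat::deserialize(d)? {
            // Old manifests only stored the id, so no index version is known.
            Compat::Legacy(id) => Removed::File(id, None),
            Compat::New(NewFormat::File(id, v)) => Removed::File(id, v),
            Compat::New(NewFormat::Index(id, v)) => Removed::Index(id, v),
        })
    }
}
fn main() {
    let legacy: Removed = serde_json::from_str("\"abc\"").unwrap();
    assert_eq!(legacy, Removed::File("abc".to_string(), None));
    let new: Removed = serde_json::from_str(r#"{"Index":["abc",3]}"#).unwrap();
    assert_eq!(new, Removed::Index("abc".to_string(), 3));
}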
impl RemovedFile { impl RemovedFile {
pub fn file_id(&self) -> FileId { pub fn file_id(&self) -> FileId {
match self { match self {
@@ -485,8 +438,7 @@ impl RegionMetaActionList {
Self { actions } Self { actions }
} }
/// Split the actions into a region change and an edit. pub fn into_region_edit(self) -> RegionEdit {
pub fn split_region_change_and_edit(self) -> (Option<RegionChange>, RegionEdit) {
let mut edit = RegionEdit { let mut edit = RegionEdit {
files_to_add: Vec::new(), files_to_add: Vec::new(),
files_to_remove: Vec::new(), files_to_remove: Vec::new(),
@@ -496,39 +448,31 @@ impl RegionMetaActionList {
flushed_sequence: None, flushed_sequence: None,
committed_sequence: None, committed_sequence: None,
}; };
let mut region_change = None;
for action in self.actions { for action in self.actions {
match action { if let RegionMetaAction::Edit(region_edit) = action {
RegionMetaAction::Change(change) => { // Merge file adds/removes
region_change = Some(change); edit.files_to_add.extend(region_edit.files_to_add);
edit.files_to_remove.extend(region_edit.files_to_remove);
// Max of flushed entry id / sequence
if let Some(eid) = region_edit.flushed_entry_id {
edit.flushed_entry_id = Some(edit.flushed_entry_id.map_or(eid, |v| v.max(eid)));
} }
RegionMetaAction::Edit(region_edit) => { if let Some(seq) = region_edit.flushed_sequence {
// Merge file adds/removes edit.flushed_sequence = Some(edit.flushed_sequence.map_or(seq, |v| v.max(seq)));
edit.files_to_add.extend(region_edit.files_to_add); }
edit.files_to_remove.extend(region_edit.files_to_remove); if let Some(seq) = region_edit.committed_sequence {
// Max of flushed entry id / sequence edit.committed_sequence =
if let Some(eid) = region_edit.flushed_entry_id { Some(edit.committed_sequence.map_or(seq, |v| v.max(seq)));
edit.flushed_entry_id = }
Some(edit.flushed_entry_id.map_or(eid, |v| v.max(eid))); // Prefer the latest non-none time window
} if region_edit.compaction_time_window.is_some() {
if let Some(seq) = region_edit.flushed_sequence { edit.compaction_time_window = region_edit.compaction_time_window;
edit.flushed_sequence =
Some(edit.flushed_sequence.map_or(seq, |v| v.max(seq)));
}
if let Some(seq) = region_edit.committed_sequence {
edit.committed_sequence =
Some(edit.committed_sequence.map_or(seq, |v| v.max(seq)));
}
// Prefer the latest non-none time window
if region_edit.compaction_time_window.is_some() {
edit.compaction_time_window = region_edit.compaction_time_window;
}
} }
_ => {}
} }
} }
(region_change, edit) edit
} }
} }
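The folding above repeatedly keeps the maximum of two optional counters (flushed_entry_id, flushed_sequence, committed_sequence). A standalone sketch of just that rule, not part of the crate:

/// Keep the max of an accumulated optional counter and the next edit's value,
/// treating `None` as "no value seen yet". Mirrors
/// `edit.flushed_entry_id = Some(edit.flushed_entry_id.map_or(eid, |v| v.max(eid)))` above.
fn merge_max(acc: Option<u64>, next: Option<u64>) -> Option<u64> {
    match next {
        Some(n) => Some(acc.map_or(n, |a| a.max(n))),
        None => acc,
    }
}

fn main() {
    assert_eq!(merge_max(None, Some(7)), Some(7));
    assert_eq!(merge_max(Some(9), Some(7)), Some(9));
    assert_eq!(merge_max(Some(9), None), Some(9));
}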
@@ -1064,115 +1008,4 @@ mod tests {
let deserialized: RegionChange = serde_json::from_str(&serialized).unwrap(); let deserialized: RegionChange = serde_json::from_str(&serialized).unwrap();
assert_eq!(deserialized.sst_format, FormatType::Flat); assert_eq!(deserialized.sst_format, FormatType::Flat);
} }
#[test]
fn test_removed_file_compatibility() {
let file_id = FileId::random();
// Case 1: Deserialize from FileId string (Legacy format)
let json_str = format!("\"{}\"", file_id);
let removed_file: RemovedFile = serde_json::from_str(&json_str).unwrap();
assert_eq!(removed_file, RemovedFile::File(file_id, None));
// Case 2: Deserialize from new format (File)
let removed_file_v2 = RemovedFile::File(file_id, Some(10));
let json_v2 = serde_json::to_string(&removed_file_v2).unwrap();
let deserialized_v2: RemovedFile = serde_json::from_str(&json_v2).unwrap();
assert_eq!(removed_file_v2, deserialized_v2);
// Case 3: Deserialize from new format (Index)
let removed_index = RemovedFile::Index(file_id, 20);
let json_index = serde_json::to_string(&removed_index).unwrap();
let deserialized_index: RemovedFile = serde_json::from_str(&json_index).unwrap();
assert_eq!(removed_index, deserialized_index);
// Case 4: Round-trip serialization/deserialization of new enum format with None as index version
let removed_file = RemovedFile::File(file_id, None);
let json = serde_json::to_string(&removed_file).unwrap();
let deserialized: RemovedFile = serde_json::from_str(&json).unwrap();
assert_eq!(removed_file, deserialized);
// Case 5: Deserialize mixed set in RemovedFilesRecord
// This simulates a HashSet<RemovedFile> that might contain old strings or new objects, e.g. if manually constructed or produced by old versions.
// If the field used to be HashSet<FileId>, the JSON is ["id1", "id2"].
// If it is HashSet<RemovedFile>, the JSON could mix forms like [{"File":...}, "id2"], which shouldn't normally happen but is good to test.
let json_set = format!("[\"{}\"]", file_id);
let removed_files_set: HashSet<RemovedFile> = serde_json::from_str(&json_set).unwrap();
assert!(removed_files_set.contains(&RemovedFile::File(file_id, None)));
}
/// It is intentionally acceptable to ignore the legacy `file_ids` field when
/// deserializing [`RemovedFiles`].
///
/// In older manifests, `file_ids` recorded the set of SSTable files that were
/// candidates for garbage collection at a given `removed_at` timestamp. The
/// newer format stores this information in the `files` field instead. When we
/// deserialize an old manifest entry into the new struct, we *drop* the
/// `file_ids` field instead of trying to recover or merge it.
///
/// Dropping `file_ids` does **not** risk deleting live data: a file is only
/// physically removed when it is both (a) no longer referenced by any region
/// metadata and (b) selected by the GC worker as safe to delete. Losing the
/// historical list of candidate `file_ids` merely means some obsolete files
/// may stay on disk longer than strictly necessary.
///
/// The GC worker periodically scans storage (e.g. by walking the data
/// directories and/or consulting the latest manifest) to discover files that
/// are no longer referenced anywhere. Any files that were only referenced via
/// the dropped `file_ids` field will be rediscovered during these scans and
/// eventually deleted. Thus the system converges to a correct, fully-collected
/// state without relying on `file_ids`, and the only potential impact of
/// ignoring it is temporary disk space overhead, not data loss.
#[test]
fn test_removed_files_backward_compatibility() {
// Define the old version struct with file_ids field
#[derive(Serialize, Deserialize, Clone, Debug, PartialEq, Eq)]
struct OldRemovedFiles {
pub removed_at: i64,
pub file_ids: HashSet<FileId>,
}
// Create an old version instance
let mut file_ids = HashSet::new();
file_ids.insert(FileId::random());
file_ids.insert(FileId::random());
let old_removed_files = OldRemovedFiles {
removed_at: 1234567890,
file_ids,
};
// Serialize the old version
let old_json = serde_json::to_string(&old_removed_files).unwrap();
// Try to deserialize into new version - file_ids should be ignored
let result: Result<RemovedFiles, _> = serde_json::from_str(&old_json);
// This should succeed and create a default RemovedFiles (empty files set)
assert!(result.is_ok(), "{:?}", result);
let removed_files = result.unwrap();
assert_eq!(removed_files.removed_at, 1234567890);
assert!(removed_files.files.is_empty());
// Test that new format still works
let file_id = FileId::random();
let new_json = format!(
r#"{{
"removed_at": 1234567890,
"files": ["{}"]
}}"#,
file_id
);
let result: Result<RemovedFiles, _> = serde_json::from_str(&new_json);
assert!(result.is_ok());
let removed_files = result.unwrap();
assert_eq!(removed_files.removed_at, 1234567890);
assert_eq!(removed_files.files.len(), 1);
assert!(
removed_files
.files
.contains(&RemovedFile::File(file_id, None))
);
}
} }


@@ -12,46 +12,43 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
pub(crate) mod checkpoint; use std::collections::HashMap;
pub(crate) mod delta;
pub(crate) mod size_tracker;
pub(crate) mod staging;
pub(crate) mod utils;
use std::iter::Iterator; use std::iter::Iterator;
use std::str::FromStr; use std::str::FromStr;
use std::sync::Arc; use std::sync::atomic::{AtomicU64, Ordering};
use std::sync::atomic::AtomicU64; use std::sync::{Arc, RwLock};
use common_datasource::compression::CompressionType; use common_datasource::compression::CompressionType;
use common_telemetry::debug; use common_telemetry::debug;
use crc32fast::Hasher; use crc32fast::Hasher;
use futures::TryStreamExt;
use futures::future::try_join_all;
use lazy_static::lazy_static; use lazy_static::lazy_static;
use object_store::util::join_dir; use object_store::util::join_dir;
use object_store::{Lister, ObjectStore, util}; use object_store::{Entry, ErrorKind, Lister, ObjectStore, util};
use regex::Regex; use regex::Regex;
use serde::{Deserialize, Serialize};
use snafu::{ResultExt, ensure}; use snafu::{ResultExt, ensure};
use store_api::ManifestVersion; use store_api::ManifestVersion;
use store_api::storage::RegionId; use store_api::storage::RegionId;
use tokio::sync::Semaphore;
use crate::cache::manifest_cache::ManifestCache; use crate::cache::manifest_cache::ManifestCache;
use crate::error::{ChecksumMismatchSnafu, OpenDalSnafu, Result}; use crate::error::{
use crate::manifest::storage::checkpoint::CheckpointStorage; ChecksumMismatchSnafu, CompressObjectSnafu, DecompressObjectSnafu, InvalidScanIndexSnafu,
use crate::manifest::storage::delta::DeltaStorage; OpenDalSnafu, Result, SerdeJsonSnafu, Utf8Snafu,
use crate::manifest::storage::size_tracker::{CheckpointTracker, DeltaTracker, SizeTracker}; };
use crate::manifest::storage::staging::StagingStorage;
use crate::manifest::storage::utils::remove_from_cache;
lazy_static! { lazy_static! {
static ref DELTA_RE: Regex = Regex::new("^\\d+\\.json").unwrap(); static ref DELTA_RE: Regex = Regex::new("^\\d+\\.json").unwrap();
static ref CHECKPOINT_RE: Regex = Regex::new("^\\d+\\.checkpoint").unwrap(); static ref CHECKPOINT_RE: Regex = Regex::new("^\\d+\\.checkpoint").unwrap();
} }
pub const LAST_CHECKPOINT_FILE: &str = "_last_checkpoint"; const LAST_CHECKPOINT_FILE: &str = "_last_checkpoint";
const DEFAULT_MANIFEST_COMPRESSION_TYPE: CompressionType = CompressionType::Gzip; const DEFAULT_MANIFEST_COMPRESSION_TYPE: CompressionType = CompressionType::Gzip;
/// Due to backward compatibility, it is possible that the user's manifest file has not been compressed. /// Due to backward compatibility, it is possible that the user's manifest file has not been compressed.
/// So when we encounter problems, we need to fall back to `FALL_BACK_COMPRESS_TYPE` for processing. /// So when we encounter problems, we need to fall back to `FALL_BACK_COMPRESS_TYPE` for processing.
pub(crate) const FALL_BACK_COMPRESS_TYPE: CompressionType = CompressionType::Uncompressed; const FALL_BACK_COMPRESS_TYPE: CompressionType = CompressionType::Uncompressed;
const FETCH_MANIFEST_PARALLELISM: usize = 16; const FETCH_MANIFEST_PARALLELISM: usize = 16;
/// Returns the directory to the manifest files. /// Returns the directory to the manifest files.
@@ -84,13 +81,13 @@ pub fn gen_path(path: &str, file: &str, compress_type: CompressionType) -> Strin
} }
} }
pub(crate) fn checkpoint_checksum(data: &[u8]) -> u32 { fn checkpoint_checksum(data: &[u8]) -> u32 {
let mut hasher = Hasher::new(); let mut hasher = Hasher::new();
hasher.update(data); hasher.update(data);
hasher.finalize() hasher.finalize()
} }
pub(crate) fn verify_checksum(data: &[u8], wanted: Option<u32>) -> Result<()> { fn verify_checksum(data: &[u8], wanted: Option<u32>) -> Result<()> {
if let Some(checksum) = wanted { if let Some(checksum) = wanted {
let calculated_checksum = checkpoint_checksum(data); let calculated_checksum = checkpoint_checksum(data);
ensure!( ensure!(
@@ -130,20 +127,26 @@ pub fn is_checkpoint_file(file_name: &str) -> bool {
CHECKPOINT_RE.is_match(file_name) CHECKPOINT_RE.is_match(file_name)
} }
/// Key to identify a manifest file.
#[derive(Debug, Clone, Copy, Eq, PartialEq, Hash)]
enum FileKey {
/// A delta file (`.json`).
Delta(ManifestVersion),
/// A checkpoint file (`.checkpoint`).
Checkpoint(ManifestVersion),
}
#[derive(Clone, Debug)] #[derive(Clone, Debug)]
pub struct ManifestObjectStore { pub struct ManifestObjectStore {
object_store: ObjectStore, object_store: ObjectStore,
compress_type: CompressionType,
path: String, path: String,
staging_path: String,
/// Stores the size of each manifest file.
manifest_size_map: Arc<RwLock<HashMap<FileKey, u64>>>,
total_manifest_size: Arc<AtomicU64>,
/// Optional manifest cache for local caching. /// Optional manifest cache for local caching.
manifest_cache: Option<ManifestCache>, manifest_cache: Option<ManifestCache>,
// Tracks the size of each file in the manifest directory.
size_tracker: SizeTracker,
// The checkpoint file storage.
checkpoint_storage: CheckpointStorage<CheckpointTracker>,
// The delta file storage.
delta_storage: DeltaStorage<DeltaTracker>,
/// The staging file storage.
staging_storage: StagingStorage,
} }
impl ManifestObjectStore { impl ManifestObjectStore {
@@ -157,37 +160,43 @@ impl ManifestObjectStore {
common_telemetry::info!("Create manifest store, cache: {}", manifest_cache.is_some()); common_telemetry::info!("Create manifest store, cache: {}", manifest_cache.is_some());
let path = util::normalize_dir(path); let path = util::normalize_dir(path);
let size_tracker = SizeTracker::new(total_manifest_size); let staging_path = {
let checkpoint_tracker = Arc::new(size_tracker.checkpoint_tracker()); // Convert "region_dir/manifest/" to "region_dir/staging/manifest/"
let delta_tracker = Arc::new(size_tracker.manifest_tracker()); let parent_dir = path.trim_end_matches("manifest/").trim_end_matches('/');
let checkpoint_storage = CheckpointStorage::new( util::normalize_dir(&format!("{}/staging/manifest", parent_dir))
path.clone(), };
object_store.clone(),
compress_type,
manifest_cache.clone(),
checkpoint_tracker,
);
let delta_storage = DeltaStorage::new(
path.clone(),
object_store.clone(),
compress_type,
manifest_cache.clone(),
delta_tracker,
);
let staging_storage =
StagingStorage::new(path.clone(), object_store.clone(), compress_type);
Self { Self {
object_store, object_store,
compress_type,
path, path,
staging_path,
manifest_size_map: Arc::new(RwLock::new(HashMap::new())),
total_manifest_size,
manifest_cache, manifest_cache,
size_tracker,
checkpoint_storage,
delta_storage,
staging_storage,
} }
} }
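The staging path above is derived purely by string manipulation on the manifest directory. A standalone sketch of the same conversion, with a local stand-in for object_store::util::normalize_dir (which here only needs to guarantee a trailing slash) and a made-up region directory:

fn normalize_dir(dir: &str) -> String {
    // Stand-in for `object_store::util::normalize_dir`: ensure a trailing slash.
    if dir.ends_with('/') { dir.to_string() } else { format!("{dir}/") }
}

fn staging_manifest_dir(manifest_dir: &str) -> String {
    // Convert "region_dir/manifest/" to "region_dir/staging/manifest/".
    let parent = manifest_dir.trim_end_matches("manifest/").trim_end_matches('/');
    normalize_dir(&format!("{}/staging/manifest", parent))
}

fn main() {
    assert_eq!(
        staging_manifest_dir("data/region-42/manifest/"),
        "data/region-42/staging/manifest/"
    );
}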
/// Returns the delta file path under the **current** compression algorithm
fn delta_file_path(&self, version: ManifestVersion, is_staging: bool) -> String {
let base_path = if is_staging {
&self.staging_path
} else {
&self.path
};
gen_path(base_path, &delta_file(version), self.compress_type)
}
/// Returns the checkpoint file path under the **current** compression algorithm
fn checkpoint_file_path(&self, version: ManifestVersion) -> String {
gen_path(&self.path, &checkpoint_file(version), self.compress_type)
}
/// Returns the last checkpoint path. Because the last checkpoint is not compressed,
/// its file name is independent of the compression algorithm used by `ManifestObjectStore`.
pub(crate) fn last_checkpoint_path(&self) -> String {
format!("{}{}", self.path, LAST_CHECKPOINT_FILE)
}
/// Returns the manifest dir /// Returns the manifest dir
pub(crate) fn manifest_dir(&self) -> &str { pub(crate) fn manifest_dir(&self) -> &str {
&self.path &self.path
@@ -195,14 +204,75 @@ impl ManifestObjectStore {
/// Returns an iterator of manifests from normal or staging directory. /// Returns an iterator of manifests from normal or staging directory.
pub(crate) async fn manifest_lister(&self, is_staging: bool) -> Result<Option<Lister>> { pub(crate) async fn manifest_lister(&self, is_staging: bool) -> Result<Option<Lister>> {
if is_staging { let path = if is_staging {
self.staging_storage.manifest_lister().await &self.staging_path
} else { } else {
self.delta_storage.manifest_lister().await &self.path
};
match self.object_store.lister_with(path).await {
Ok(streamer) => Ok(Some(streamer)),
Err(e) if e.kind() == ErrorKind::NotFound => {
debug!("Manifest directory does not exist: {}", path);
Ok(None)
}
Err(e) => Err(e).context(OpenDalSnafu)?,
} }
} }
/// Returns all `R`s in the directory that meet the `filter` conditions (that is, the `filter` closure returns `Some(R)`),
/// and discards the entries that do not (that is, the `filter` closure returns `None`).
/// Returns an empty vector when the directory is not found.
pub async fn get_paths<F, R>(&self, filter: F, is_staging: bool) -> Result<Vec<R>>
where
F: Fn(Entry) -> Option<R>,
{
let Some(streamer) = self.manifest_lister(is_staging).await? else {
return Ok(vec![]);
};
streamer
.try_filter_map(|e| async { Ok(filter(e)) })
.try_collect::<Vec<_>>()
.await
.context(OpenDalSnafu)
}
/// Sorts the manifest files.
fn sort_manifests(entries: &mut [(ManifestVersion, Entry)]) {
entries.sort_unstable_by(|(v1, _), (v2, _)| v1.cmp(v2));
}
/// Scans the manifest files in the range of [start, end) and return all manifest entries.
pub async fn scan(
&self,
start: ManifestVersion,
end: ManifestVersion,
) -> Result<Vec<(ManifestVersion, Entry)>> {
ensure!(start <= end, InvalidScanIndexSnafu { start, end });
let mut entries: Vec<(ManifestVersion, Entry)> = self
.get_paths(
|entry| {
let file_name = entry.name();
if is_delta_file(file_name) {
let version = file_version(file_name);
if start <= version && version < end {
return Some((version, entry));
}
}
None
},
false,
)
.await?;
Self::sort_manifests(&mut entries);
Ok(entries)
}
/// Fetches manifests in range [start_version, end_version). /// Fetches manifests in range [start_version, end_version).
///
/// This function is guaranteed to return manifests strictly from the `start_version` (the result must contain `start_version`). /// This function is guaranteed to return manifests strictly from the `start_version` (the result must contain `start_version`).
pub async fn fetch_manifests_strict_from( pub async fn fetch_manifests_strict_from(
&self, &self,
@@ -210,9 +280,70 @@ impl ManifestObjectStore {
end_version: ManifestVersion, end_version: ManifestVersion,
region_id: RegionId, region_id: RegionId,
) -> Result<Vec<(ManifestVersion, Vec<u8>)>> { ) -> Result<Vec<(ManifestVersion, Vec<u8>)>> {
self.delta_storage let mut manifests = self.fetch_manifests(start_version, end_version).await?;
.fetch_manifests_strict_from(start_version, end_version, region_id) let start_index = manifests.iter().position(|(v, _)| *v == start_version);
.await debug!(
"Fetches manifests in range [{},{}), start_index: {:?}, region_id: {}, manifests: {:?}",
start_version,
end_version,
start_index,
region_id,
manifests.iter().map(|(v, _)| *v).collect::<Vec<_>>()
);
if let Some(start_index) = start_index {
Ok(manifests.split_off(start_index))
} else {
Ok(vec![])
}
}
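In isolation, the trimming that makes the result strict can be sketched with plain vectors (a hypothetical helper, not part of the crate):

/// Drop everything before the entry whose version equals `start`;
/// return an empty vector if `start` is not present at all.
fn strict_from(mut manifests: Vec<(u64, Vec<u8>)>, start: u64) -> Vec<(u64, Vec<u8>)> {
    match manifests.iter().position(|(v, _)| *v == start) {
        Some(i) => manifests.split_off(i),
        None => Vec::new(),
    }
}

fn main() {
    let fetched = vec![(3u64, Vec::new()), (4, Vec::new()), (5, Vec::new())];
    let versions: Vec<u64> = strict_from(fetched, 4).into_iter().map(|(v, _)| v).collect();
    assert_eq!(versions, vec![4, 5]);
}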
/// Common implementation for fetching manifests from entries in parallel.
/// If `is_staging` is true, cache is skipped.
async fn fetch_manifests_from_entries(
&self,
entries: Vec<(ManifestVersion, Entry)>,
is_staging: bool,
) -> Result<Vec<(ManifestVersion, Vec<u8>)>> {
if entries.is_empty() {
return Ok(vec![]);
}
// TODO(weny): Make it configurable.
let semaphore = Semaphore::new(FETCH_MANIFEST_PARALLELISM);
let tasks = entries.iter().map(|(v, entry)| async {
// Safety: semaphore must exist.
let _permit = semaphore.acquire().await.unwrap();
let cache_key = entry.path();
// Try to get from cache first
if let Some(data) = self.get_from_cache(cache_key, is_staging).await {
return Ok((*v, data));
}
// Fetch from remote object store
let compress_type = file_compress_type(entry.name());
let bytes = self
.object_store
.read(entry.path())
.await
.context(OpenDalSnafu)?;
let data = compress_type
.decode(bytes)
.await
.context(DecompressObjectSnafu {
compress_type,
path: entry.path(),
})?;
// Add to cache
self.put_to_cache(cache_key.to_string(), &data, is_staging)
.await;
Ok((*v, data))
});
try_join_all(tasks).await
} }
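The fetch above bounds parallelism with a semaphore while still awaiting every task through try_join_all. A minimal standalone sketch of that pattern (assuming the tokio and futures crates; the actual fetch and decompression body is elided):

use futures::future::try_join_all;
use tokio::sync::Semaphore;

#[tokio::main]
async fn main() -> Result<(), std::convert::Infallible> {
    // At most 4 fetches in flight, mirroring FETCH_MANIFEST_PARALLELISM above.
    let semaphore = Semaphore::new(4);
    let semaphore = &semaphore; // share the semaphore by reference across all task futures
    let tasks = (0..16u64).map(|version| async move {
        // The semaphore is never closed here, so acquire() cannot fail.
        let _permit = semaphore.acquire().await.unwrap();
        // ... read + decompress manifest `version` here ...
        Ok::<_, std::convert::Infallible>(version)
    });
    let versions = try_join_all(tasks).await?;
    assert_eq!(versions.len(), 16);
    Ok(())
}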
/// Fetches all manifests concurrently, and returns the manifests in range [start_version, end_version) /// Fetches all manifests concurrently, and returns the manifests in range [start_version, end_version)
@@ -224,9 +355,8 @@ impl ManifestObjectStore {
start_version: ManifestVersion, start_version: ManifestVersion,
end_version: ManifestVersion, end_version: ManifestVersion,
) -> Result<Vec<(ManifestVersion, Vec<u8>)>> { ) -> Result<Vec<(ManifestVersion, Vec<u8>)>> {
self.delta_storage let manifests = self.scan(start_version, end_version).await?;
.fetch_manifests(start_version, end_version) self.fetch_manifests_from_entries(manifests, false).await
.await
} }
/// Deletes manifest files whose version is < end. /// Deletes manifest files whose version is < end.
@@ -240,18 +370,20 @@ impl ManifestObjectStore {
) -> Result<usize> { ) -> Result<usize> {
// Stores (entry, is_checkpoint, version) in a Vec. // Stores (entry, is_checkpoint, version) in a Vec.
let entries: Vec<_> = self let entries: Vec<_> = self
.delta_storage .get_paths(
.get_paths(|entry| { |entry| {
let file_name = entry.name(); let file_name = entry.name();
let is_checkpoint = is_checkpoint_file(file_name); let is_checkpoint = is_checkpoint_file(file_name);
if is_delta_file(file_name) || is_checkpoint_file(file_name) { if is_delta_file(file_name) || is_checkpoint_file(file_name) {
let version = file_version(file_name); let version = file_version(file_name);
if version < end { if version < end {
return Some((entry, is_checkpoint, version)); return Some((entry, is_checkpoint, version));
}
} }
} None
None },
}) false,
)
.await?; .await?;
let checkpoint_version = if keep_last_checkpoint { let checkpoint_version = if keep_last_checkpoint {
// Note that the order of entries is unspecific. // Note that the order of entries is unspecific.
@@ -296,7 +428,7 @@ impl ManifestObjectStore {
// Remove from cache first // Remove from cache first
for (entry, _, _) in &del_entries { for (entry, _, _) in &del_entries {
remove_from_cache(self.manifest_cache.as_ref(), entry.path()).await; self.remove_from_cache(entry.path()).await;
} }
self.object_store self.object_store
@@ -307,11 +439,9 @@ impl ManifestObjectStore {
// delete manifest sizes // delete manifest sizes
for (_, is_checkpoint, version) in &del_entries { for (_, is_checkpoint, version) in &del_entries {
if *is_checkpoint { if *is_checkpoint {
self.size_tracker self.unset_file_size(&FileKey::Checkpoint(*version));
.remove(&size_tracker::FileKey::Checkpoint(*version));
} else { } else {
self.size_tracker self.unset_file_size(&FileKey::Delta(*version));
.remove(&size_tracker::FileKey::Delta(*version));
} }
} }
@@ -325,11 +455,22 @@ impl ManifestObjectStore {
bytes: &[u8], bytes: &[u8],
is_staging: bool, is_staging: bool,
) -> Result<()> { ) -> Result<()> {
if is_staging { let path = self.delta_file_path(version, is_staging);
self.staging_storage.save(version, bytes).await debug!("Save log to manifest storage, version: {}", version);
} else { let data = self
self.delta_storage.save(version, bytes).await .compress_type
} .encode(bytes)
.await
.context(CompressObjectSnafu {
compress_type: self.compress_type,
path: &path,
})?;
let delta_size = data.len();
self.write_and_put_cache(&path, data, is_staging).await?;
self.set_delta_file_size(version, delta_size as u64);
Ok(())
} }
/// Save the checkpoint manifest file. /// Save the checkpoint manifest file.
@@ -338,50 +479,155 @@ impl ManifestObjectStore {
version: ManifestVersion, version: ManifestVersion,
bytes: &[u8], bytes: &[u8],
) -> Result<()> { ) -> Result<()> {
self.checkpoint_storage let path = self.checkpoint_file_path(version);
.save_checkpoint(version, bytes) let data = self
.compress_type
.encode(bytes)
.await .await
.context(CompressObjectSnafu {
compress_type: self.compress_type,
path: &path,
})?;
let checkpoint_size = data.len();
let checksum = checkpoint_checksum(bytes);
self.write_and_put_cache(&path, data, false).await?;
self.set_checkpoint_file_size(version, checkpoint_size as u64);
// Because the last checkpoint file only contains the size and version, which is tiny, we don't compress it.
let last_checkpoint_path = self.last_checkpoint_path();
let checkpoint_metadata = CheckpointMetadata {
size: bytes.len(),
version,
checksum: Some(checksum),
extend_metadata: HashMap::new(),
};
debug!(
"Save checkpoint in path: {}, metadata: {:?}",
last_checkpoint_path, checkpoint_metadata
);
let bytes = checkpoint_metadata.encode()?;
self.object_store
.write(&last_checkpoint_path, bytes)
.await
.context(OpenDalSnafu)?;
Ok(())
}
async fn load_checkpoint(
&mut self,
metadata: CheckpointMetadata,
) -> Result<Option<(ManifestVersion, Vec<u8>)>> {
let version = metadata.version;
let path = self.checkpoint_file_path(version);
// Try to get from cache first
if let Some(data) = self.get_from_cache(&path, false).await {
verify_checksum(&data, metadata.checksum)?;
return Ok(Some((version, data)));
}
// Due to backward compatibility, the user's checkpoint may not be compressed,
// so if we can't find the file under the compressed type, fall back to looking for the uncompressed checkpoint.
let checkpoint_data = match self.object_store.read(&path).await {
Ok(checkpoint) => {
let checkpoint_size = checkpoint.len();
let decompress_data =
self.compress_type
.decode(checkpoint)
.await
.with_context(|_| DecompressObjectSnafu {
compress_type: self.compress_type,
path: path.clone(),
})?;
verify_checksum(&decompress_data, metadata.checksum)?;
// set the checkpoint size
self.set_checkpoint_file_size(version, checkpoint_size as u64);
// Add to cache
self.put_to_cache(path, &decompress_data, false).await;
Ok(Some(decompress_data))
}
Err(e) => {
if e.kind() == ErrorKind::NotFound {
if self.compress_type != FALL_BACK_COMPRESS_TYPE {
let fall_back_path = gen_path(
&self.path,
&checkpoint_file(version),
FALL_BACK_COMPRESS_TYPE,
);
debug!(
"Failed to load checkpoint from path: {}, fall back to path: {}",
path, fall_back_path
);
// Try to get fallback from cache first
if let Some(data) = self.get_from_cache(&fall_back_path, false).await {
verify_checksum(&data, metadata.checksum)?;
return Ok(Some((version, data)));
}
match self.object_store.read(&fall_back_path).await {
Ok(checkpoint) => {
let checkpoint_size = checkpoint.len();
let decompress_data = FALL_BACK_COMPRESS_TYPE
.decode(checkpoint)
.await
.with_context(|_| DecompressObjectSnafu {
compress_type: FALL_BACK_COMPRESS_TYPE,
path: fall_back_path.clone(),
})?;
verify_checksum(&decompress_data, metadata.checksum)?;
self.set_checkpoint_file_size(version, checkpoint_size as u64);
// Add fallback to cache
self.put_to_cache(fall_back_path, &decompress_data, false)
.await;
Ok(Some(decompress_data))
}
Err(e) if e.kind() == ErrorKind::NotFound => Ok(None),
Err(e) => Err(e).context(OpenDalSnafu),
}
} else {
Ok(None)
}
} else {
Err(e).context(OpenDalSnafu)
}
}
}?;
Ok(checkpoint_data.map(|data| (version, data)))
} }
/// Load the latest checkpoint. /// Load the latest checkpoint.
/// Return manifest version and the raw [RegionCheckpoint](crate::manifest::action::RegionCheckpoint) content if any /// Return manifest version and the raw [RegionCheckpoint](crate::manifest::action::RegionCheckpoint) content if any
pub async fn load_last_checkpoint(&mut self) -> Result<Option<(ManifestVersion, Vec<u8>)>> { pub async fn load_last_checkpoint(&mut self) -> Result<Option<(ManifestVersion, Vec<u8>)>> {
self.checkpoint_storage.load_last_checkpoint().await let last_checkpoint_path = self.last_checkpoint_path();
// Fetch from remote object store without cache
let last_checkpoint_data = match self.object_store.read(&last_checkpoint_path).await {
Ok(data) => data.to_vec(),
Err(e) if e.kind() == ErrorKind::NotFound => {
return Ok(None);
}
Err(e) => {
return Err(e).context(OpenDalSnafu)?;
}
};
let checkpoint_metadata = CheckpointMetadata::decode(&last_checkpoint_data)?;
debug!(
"Load checkpoint in path: {}, metadata: {:?}",
last_checkpoint_path, checkpoint_metadata
);
self.load_checkpoint(checkpoint_metadata).await
} }
/// Computes the total size (in bytes) recorded in the manifest size map. #[cfg(test)]
pub(crate) fn total_manifest_size(&self) -> u64 {
self.size_tracker.total()
}
/// Resets the size of all files.
pub(crate) fn reset_manifest_size(&mut self) {
self.size_tracker.reset();
}
/// Set the size of the delta file by delta version.
pub(crate) fn set_delta_file_size(&mut self, version: ManifestVersion, size: u64) {
self.size_tracker.record_delta(version, size);
}
/// Set the size of the checkpoint file by checkpoint version.
pub(crate) fn set_checkpoint_file_size(&self, version: ManifestVersion, size: u64) {
self.size_tracker.record_checkpoint(version, size);
}
/// Fetch all staging manifest files and return them as (version, action_list) pairs.
pub async fn fetch_staging_manifests(&self) -> Result<Vec<(ManifestVersion, Vec<u8>)>> {
self.staging_storage.fetch_manifests().await
}
/// Clear all staging manifest files.
pub async fn clear_staging_manifests(&mut self) -> Result<()> {
self.staging_storage.clear().await
}
}
#[cfg(test)]
impl ManifestObjectStore {
pub async fn read_file(&self, path: &str) -> Result<Vec<u8>> { pub async fn read_file(&self, path: &str) -> Result<Vec<u8>> {
self.object_store self.object_store
.read(path) .read(path)
@@ -390,18 +636,214 @@ impl ManifestObjectStore {
.map(|v| v.to_vec()) .map(|v| v.to_vec())
} }
pub(crate) fn checkpoint_storage(&self) -> &CheckpointStorage<CheckpointTracker> { #[cfg(test)]
&self.checkpoint_storage pub async fn write_last_checkpoint(
&mut self,
version: ManifestVersion,
bytes: &[u8],
) -> Result<()> {
let path = self.checkpoint_file_path(version);
let data = self
.compress_type
.encode(bytes)
.await
.context(CompressObjectSnafu {
compress_type: self.compress_type,
path: &path,
})?;
let checkpoint_size = data.len();
self.object_store
.write(&path, data)
.await
.context(OpenDalSnafu)?;
self.set_checkpoint_file_size(version, checkpoint_size as u64);
let last_checkpoint_path = self.last_checkpoint_path();
let checkpoint_metadata = CheckpointMetadata {
size: bytes.len(),
version,
checksum: Some(1218259706),
extend_metadata: HashMap::new(),
};
debug!(
"Rewrite checkpoint in path: {}, metadata: {:?}",
last_checkpoint_path, checkpoint_metadata
);
let bytes = checkpoint_metadata.encode()?;
// Overwrite the last checkpoint with the modified content
self.object_store
.write(&last_checkpoint_path, bytes.clone())
.await
.context(OpenDalSnafu)?;
Ok(())
} }
pub(crate) fn delta_storage(&self) -> &DeltaStorage<DeltaTracker> { /// Compute the size(Byte) in manifest size map.
&self.delta_storage pub(crate) fn total_manifest_size(&self) -> u64 {
self.manifest_size_map.read().unwrap().values().sum()
} }
pub(crate) fn set_compress_type(&mut self, compress_type: CompressionType) { /// Resets the size of all files.
self.checkpoint_storage.set_compress_type(compress_type); pub(crate) fn reset_manifest_size(&mut self) {
self.delta_storage.set_compress_type(compress_type); self.manifest_size_map.write().unwrap().clear();
self.staging_storage.set_compress_type(compress_type); self.total_manifest_size.store(0, Ordering::Relaxed);
}
/// Set the size of the delta file by delta version.
pub(crate) fn set_delta_file_size(&mut self, version: ManifestVersion, size: u64) {
let mut m = self.manifest_size_map.write().unwrap();
m.insert(FileKey::Delta(version), size);
self.inc_total_manifest_size(size);
}
/// Set the size of the checkpoint file by checkpoint version.
pub(crate) fn set_checkpoint_file_size(&self, version: ManifestVersion, size: u64) {
let mut m = self.manifest_size_map.write().unwrap();
m.insert(FileKey::Checkpoint(version), size);
self.inc_total_manifest_size(size);
}
fn unset_file_size(&self, key: &FileKey) {
let mut m = self.manifest_size_map.write().unwrap();
if let Some(val) = m.remove(key) {
debug!("Unset file size: {:?}, size: {}", key, val);
self.dec_total_manifest_size(val);
}
}
fn inc_total_manifest_size(&self, val: u64) {
self.total_manifest_size.fetch_add(val, Ordering::Relaxed);
}
fn dec_total_manifest_size(&self, val: u64) {
self.total_manifest_size.fetch_sub(val, Ordering::Relaxed);
}
/// Fetch all staging manifest files and return them as (version, action_list) pairs.
pub async fn fetch_staging_manifests(&self) -> Result<Vec<(ManifestVersion, Vec<u8>)>> {
let manifest_entries = self
.get_paths(
|entry| {
let file_name = entry.name();
if is_delta_file(file_name) {
let version = file_version(file_name);
Some((version, entry))
} else {
None
}
},
true,
)
.await?;
let mut sorted_entries = manifest_entries;
Self::sort_manifests(&mut sorted_entries);
self.fetch_manifests_from_entries(sorted_entries, true)
.await
}
/// Clear all staging manifest files.
pub async fn clear_staging_manifests(&mut self) -> Result<()> {
self.object_store
.remove_all(&self.staging_path)
.await
.context(OpenDalSnafu)?;
debug!(
"Cleared all staging manifest files from {}",
self.staging_path
);
Ok(())
}
/// Gets a manifest file from cache.
/// Returns the file data if found in cache, None otherwise.
/// If `is_staging` is true, always returns None.
async fn get_from_cache(&self, key: &str, is_staging: bool) -> Option<Vec<u8>> {
if is_staging {
return None;
}
let cache = self.manifest_cache.as_ref()?;
cache.get_file(key).await
}
/// Puts a manifest file into cache.
/// If `is_staging` is true, does nothing.
async fn put_to_cache(&self, key: String, data: &[u8], is_staging: bool) {
if is_staging {
return;
}
let Some(cache) = &self.manifest_cache else {
return;
};
cache.put_file(key, data.to_vec()).await;
}
/// Writes data to object store and puts it into cache.
/// If `is_staging` is true, cache is skipped.
async fn write_and_put_cache(&self, path: &str, data: Vec<u8>, is_staging: bool) -> Result<()> {
// Clone data for cache before writing, only if cache is enabled and not staging
let cache_data = if !is_staging && self.manifest_cache.is_some() {
Some(data.clone())
} else {
None
};
// Write to object store
self.object_store
.write(path, data)
.await
.context(OpenDalSnafu)?;
// Put to cache if we cloned the data
if let Some(data) = cache_data {
self.put_to_cache(path.to_string(), &data, is_staging).await;
}
Ok(())
}
/// Removes a manifest file from cache.
async fn remove_from_cache(&self, key: &str) {
let Some(cache) = &self.manifest_cache else {
return;
};
cache.remove(key).await;
}
}
#[derive(Serialize, Deserialize, Debug)]
pub(crate) struct CheckpointMetadata {
pub size: usize,
/// The latest version this checkpoint contains.
pub version: ManifestVersion,
pub checksum: Option<u32>,
pub extend_metadata: HashMap<String, String>,
}
impl CheckpointMetadata {
fn encode(&self) -> Result<Vec<u8>> {
Ok(serde_json::to_string(self)
.context(SerdeJsonSnafu)?
.into_bytes())
}
fn decode(bs: &[u8]) -> Result<Self> {
let data = std::str::from_utf8(bs).context(Utf8Snafu)?;
serde_json::from_str(data).context(SerdeJsonSnafu)
} }
} }
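Since `_last_checkpoint` is written uncompressed, its content is simply this struct serialized as JSON. A standalone sketch of what that payload looks like, using a local mirror of the struct (field values are made up for illustration):

use std::collections::HashMap;
use serde::Serialize;

// Local mirror of `CheckpointMetadata` above, only for illustration.
#[derive(Serialize)]
struct CheckpointMetadata {
    size: usize,
    version: u64,
    checksum: Option<u32>,
    extend_metadata: HashMap<String, String>,
}

fn main() {
    let meta = CheckpointMetadata {
        size: 1024,
        version: 9,
        checksum: Some(305_419_896),
        extend_metadata: HashMap::new(),
    };
    // Prints: {"size":1024,"version":9,"checksum":305419896,"extend_metadata":{}}
    println!("{}", serde_json::to_string(&meta).unwrap());
}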
@@ -412,7 +854,6 @@ mod tests {
use object_store::services::Fs; use object_store::services::Fs;
use super::*; use super::*;
use crate::manifest::storage::checkpoint::CheckpointMetadata;
fn new_test_manifest_store() -> ManifestObjectStore { fn new_test_manifest_store() -> ManifestObjectStore {
common_telemetry::init_default_ut_logging(); common_telemetry::init_default_ut_logging();
@@ -449,14 +890,14 @@ mod tests {
#[tokio::test] #[tokio::test]
async fn test_manifest_log_store_uncompress() { async fn test_manifest_log_store_uncompress() {
let mut log_store = new_test_manifest_store(); let mut log_store = new_test_manifest_store();
log_store.set_compress_type(CompressionType::Uncompressed); log_store.compress_type = CompressionType::Uncompressed;
test_manifest_log_store_case(log_store).await; test_manifest_log_store_case(log_store).await;
} }
#[tokio::test] #[tokio::test]
async fn test_manifest_log_store_compress() { async fn test_manifest_log_store_compress() {
let mut log_store = new_test_manifest_store(); let mut log_store = new_test_manifest_store();
log_store.set_compress_type(CompressionType::Gzip); log_store.compress_type = CompressionType::Gzip;
test_manifest_log_store_case(log_store).await; test_manifest_log_store_case(log_store).await;
} }
@@ -500,7 +941,6 @@ mod tests {
// delete logs with version < 4 and keep checkpoint 3. // delete logs with version < 4 and keep checkpoint 3.
let _ = log_store.delete_until(4, true).await.unwrap(); let _ = log_store.delete_until(4, true).await.unwrap();
let _ = log_store let _ = log_store
.checkpoint_storage
.load_checkpoint(new_checkpoint_metadata_with_version(3)) .load_checkpoint(new_checkpoint_metadata_with_version(3))
.await .await
.unwrap() .unwrap()
@@ -518,7 +958,6 @@ mod tests {
let _ = log_store.delete_until(11, false).await.unwrap(); let _ = log_store.delete_until(11, false).await.unwrap();
assert!( assert!(
log_store log_store
.checkpoint_storage
.load_checkpoint(new_checkpoint_metadata_with_version(3)) .load_checkpoint(new_checkpoint_metadata_with_version(3))
.await .await
.unwrap() .unwrap()
@@ -537,7 +976,7 @@ mod tests {
let mut log_store = new_test_manifest_store(); let mut log_store = new_test_manifest_store();
// write uncompressed data to simulate previously uncompressed data // write uncompressed data to simulate previously uncompressed data
log_store.set_compress_type(CompressionType::Uncompressed); log_store.compress_type = CompressionType::Uncompressed;
for v in 0..5 { for v in 0..5 {
log_store log_store
.save(v, format!("hello, {v}").as_bytes(), false) .save(v, format!("hello, {v}").as_bytes(), false)
@@ -550,7 +989,7 @@ mod tests {
.unwrap(); .unwrap();
// change compress type // change compress type
log_store.set_compress_type(CompressionType::Gzip); log_store.compress_type = CompressionType::Gzip;
// test load_last_checkpoint work correctly for previously uncompressed data // test load_last_checkpoint work correctly for previously uncompressed data
let (v, checkpoint) = log_store.load_last_checkpoint().await.unwrap().unwrap(); let (v, checkpoint) = log_store.load_last_checkpoint().await.unwrap().unwrap();
@@ -579,7 +1018,6 @@ mod tests {
assert_eq!(format!("hello, {v}").as_bytes(), bytes); assert_eq!(format!("hello, {v}").as_bytes(), bytes);
} }
let (v, checkpoint) = log_store let (v, checkpoint) = log_store
.checkpoint_storage
.load_checkpoint(new_checkpoint_metadata_with_version(5)) .load_checkpoint(new_checkpoint_metadata_with_version(5))
.await .await
.unwrap() .unwrap()
@@ -614,7 +1052,7 @@ mod tests {
async fn test_uncompressed_manifest_files_size() { async fn test_uncompressed_manifest_files_size() {
let mut log_store = new_test_manifest_store(); let mut log_store = new_test_manifest_store();
// write 5 uncompressed manifest files, 8 bytes per file // write 5 uncompressed manifest files, 8 bytes per file
log_store.set_compress_type(CompressionType::Uncompressed); log_store.compress_type = CompressionType::Uncompressed;
for v in 0..5 { for v in 0..5 {
log_store log_store
.save(v, format!("hello, {v}").as_bytes(), false) .save(v, format!("hello, {v}").as_bytes(), false)
@@ -652,7 +1090,7 @@ mod tests {
async fn test_compressed_manifest_files_size() { async fn test_compressed_manifest_files_size() {
let mut log_store = new_test_manifest_store(); let mut log_store = new_test_manifest_store();
// Test with compressed manifest files // Test with compressed manifest files
log_store.set_compress_type(CompressionType::Gzip); log_store.compress_type = CompressionType::Gzip;
// write 5 manifest files // write 5 manifest files
for v in 0..5 { for v in 0..5 {
log_store log_store


@@ -1,316 +0,0 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use std::collections::HashMap;
use std::sync::Arc;
use common_datasource::compression::CompressionType;
use common_telemetry::debug;
use object_store::{ErrorKind, ObjectStore};
use serde::{Deserialize, Serialize};
use snafu::ResultExt;
use store_api::ManifestVersion;
use crate::cache::manifest_cache::ManifestCache;
use crate::error::{
CompressObjectSnafu, DecompressObjectSnafu, OpenDalSnafu, Result, SerdeJsonSnafu, Utf8Snafu,
};
use crate::manifest::storage::size_tracker::Tracker;
use crate::manifest::storage::utils::{get_from_cache, put_to_cache, write_and_put_cache};
use crate::manifest::storage::{
FALL_BACK_COMPRESS_TYPE, LAST_CHECKPOINT_FILE, checkpoint_checksum, checkpoint_file, gen_path,
verify_checksum,
};
#[derive(Serialize, Deserialize, Debug)]
pub(crate) struct CheckpointMetadata {
pub size: usize,
/// The latest version this checkpoint contains.
pub version: ManifestVersion,
pub checksum: Option<u32>,
pub extend_metadata: HashMap<String, String>,
}
impl CheckpointMetadata {
fn encode(&self) -> Result<Vec<u8>> {
Ok(serde_json::to_string(self)
.context(SerdeJsonSnafu)?
.into_bytes())
}
fn decode(bs: &[u8]) -> Result<Self> {
let data = std::str::from_utf8(bs).context(Utf8Snafu)?;
serde_json::from_str(data).context(SerdeJsonSnafu)
}
}
/// Handle checkpoint storage operations.
#[derive(Debug, Clone)]
pub(crate) struct CheckpointStorage<T: Tracker> {
object_store: ObjectStore,
compress_type: CompressionType,
path: String,
manifest_cache: Option<ManifestCache>,
size_tracker: Arc<T>,
}
impl<T: Tracker> CheckpointStorage<T> {
pub fn new(
path: String,
object_store: ObjectStore,
compress_type: CompressionType,
manifest_cache: Option<ManifestCache>,
size_tracker: Arc<T>,
) -> Self {
Self {
object_store,
compress_type,
path,
manifest_cache,
size_tracker,
}
}
/// Returns the last checkpoint path. Because the last checkpoint is not compressed,
/// its file name is independent of the compression algorithm used by `ManifestObjectStore`.
pub(crate) fn last_checkpoint_path(&self) -> String {
format!("{}{}", self.path, LAST_CHECKPOINT_FILE)
}
/// Returns the checkpoint file path under the **current** compression algorithm
fn checkpoint_file_path(&self, version: ManifestVersion) -> String {
gen_path(&self.path, &checkpoint_file(version), self.compress_type)
}
pub(crate) async fn load_checkpoint(
&mut self,
metadata: CheckpointMetadata,
) -> Result<Option<(ManifestVersion, Vec<u8>)>> {
let version = metadata.version;
let path = self.checkpoint_file_path(version);
// Try to get from cache first
if let Some(data) = get_from_cache(self.manifest_cache.as_ref(), &path).await {
verify_checksum(&data, metadata.checksum)?;
return Ok(Some((version, data)));
}
// Due to backward compatibility, the user's checkpoint may not be compressed,
// so if we can't find the file under the compressed type, fall back to looking for the uncompressed checkpoint.
let checkpoint_data = match self.object_store.read(&path).await {
Ok(checkpoint) => {
let checkpoint_size = checkpoint.len();
let decompress_data =
self.compress_type
.decode(checkpoint)
.await
.with_context(|_| DecompressObjectSnafu {
compress_type: self.compress_type,
path: path.clone(),
})?;
verify_checksum(&decompress_data, metadata.checksum)?;
// set the checkpoint size
self.size_tracker.record(version, checkpoint_size as u64);
// Add to cache
put_to_cache(self.manifest_cache.as_ref(), path, &decompress_data).await;
Ok(Some(decompress_data))
}
Err(e) => {
if e.kind() == ErrorKind::NotFound {
if self.compress_type != FALL_BACK_COMPRESS_TYPE {
let fall_back_path = gen_path(
&self.path,
&checkpoint_file(version),
FALL_BACK_COMPRESS_TYPE,
);
debug!(
"Failed to load checkpoint from path: {}, fall back to path: {}",
path, fall_back_path
);
// Try to get fallback from cache first
if let Some(data) =
get_from_cache(self.manifest_cache.as_ref(), &fall_back_path).await
{
verify_checksum(&data, metadata.checksum)?;
return Ok(Some((version, data)));
}
match self.object_store.read(&fall_back_path).await {
Ok(checkpoint) => {
let checkpoint_size = checkpoint.len();
let decompress_data = FALL_BACK_COMPRESS_TYPE
.decode(checkpoint)
.await
.with_context(|_| DecompressObjectSnafu {
compress_type: FALL_BACK_COMPRESS_TYPE,
path: fall_back_path.clone(),
})?;
verify_checksum(&decompress_data, metadata.checksum)?;
self.size_tracker.record(version, checkpoint_size as u64);
// Add fallback to cache
put_to_cache(
self.manifest_cache.as_ref(),
fall_back_path,
&decompress_data,
)
.await;
Ok(Some(decompress_data))
}
Err(e) if e.kind() == ErrorKind::NotFound => Ok(None),
Err(e) => return Err(e).context(OpenDalSnafu),
}
} else {
Ok(None)
}
} else {
Err(e).context(OpenDalSnafu)
}
}
}?;
Ok(checkpoint_data.map(|data| (version, data)))
}
/// Load the latest checkpoint.
/// Return manifest version and the raw [RegionCheckpoint](crate::manifest::action::RegionCheckpoint) content if any
pub async fn load_last_checkpoint(&mut self) -> Result<Option<(ManifestVersion, Vec<u8>)>> {
let last_checkpoint_path = self.last_checkpoint_path();
// Fetch from remote object store without cache
let last_checkpoint_data = match self.object_store.read(&last_checkpoint_path).await {
Ok(data) => data.to_vec(),
Err(e) if e.kind() == ErrorKind::NotFound => {
return Ok(None);
}
Err(e) => {
return Err(e).context(OpenDalSnafu)?;
}
};
let checkpoint_metadata = CheckpointMetadata::decode(&last_checkpoint_data)?;
debug!(
"Load checkpoint in path: {}, metadata: {:?}",
last_checkpoint_path, checkpoint_metadata
);
self.load_checkpoint(checkpoint_metadata).await
}
/// Save the checkpoint manifest file.
pub(crate) async fn save_checkpoint(
&self,
version: ManifestVersion,
bytes: &[u8],
) -> Result<()> {
let path = self.checkpoint_file_path(version);
let data = self
.compress_type
.encode(bytes)
.await
.context(CompressObjectSnafu {
compress_type: self.compress_type,
path: &path,
})?;
let checkpoint_size = data.len();
let checksum = checkpoint_checksum(bytes);
write_and_put_cache(
&self.object_store,
self.manifest_cache.as_ref(),
&path,
data,
)
.await?;
self.size_tracker.record(version, checkpoint_size as u64);
// Because the last checkpoint file only contains the size and version, which is tiny, we don't compress it.
let last_checkpoint_path = self.last_checkpoint_path();
let checkpoint_metadata = CheckpointMetadata {
size: bytes.len(),
version,
checksum: Some(checksum),
extend_metadata: HashMap::new(),
};
debug!(
"Save checkpoint in path: {}, metadata: {:?}",
last_checkpoint_path, checkpoint_metadata
);
let bytes = checkpoint_metadata.encode()?;
self.object_store
.write(&last_checkpoint_path, bytes)
.await
.context(OpenDalSnafu)?;
Ok(())
}
}
#[cfg(test)]
impl<T: Tracker> CheckpointStorage<T> {
pub async fn write_last_checkpoint(
&self,
version: ManifestVersion,
bytes: &[u8],
) -> Result<()> {
let path = self.checkpoint_file_path(version);
let data = self
.compress_type
.encode(bytes)
.await
.context(CompressObjectSnafu {
compress_type: self.compress_type,
path: &path,
})?;
let checkpoint_size = data.len();
self.object_store
.write(&path, data)
.await
.context(OpenDalSnafu)?;
self.size_tracker.record(version, checkpoint_size as u64);
let last_checkpoint_path = self.last_checkpoint_path();
let checkpoint_metadata = CheckpointMetadata {
size: bytes.len(),
version,
checksum: Some(1218259706),
extend_metadata: HashMap::new(),
};
debug!(
"Rewrite checkpoint in path: {}, metadata: {:?}",
last_checkpoint_path, checkpoint_metadata
);
let bytes = checkpoint_metadata.encode()?;
// Overwrite the last checkpoint with the modified content
self.object_store
.write(&last_checkpoint_path, bytes.clone())
.await
.context(OpenDalSnafu)?;
Ok(())
}
pub fn set_compress_type(&mut self, compress_type: CompressionType) {
self.compress_type = compress_type;
}
}


@@ -1,251 +0,0 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use std::sync::Arc;
use common_datasource::compression::CompressionType;
use common_telemetry::debug;
use futures::TryStreamExt;
use futures::future::try_join_all;
use object_store::{Entry, ErrorKind, Lister, ObjectStore};
use snafu::{ResultExt, ensure};
use store_api::ManifestVersion;
use store_api::storage::RegionId;
use tokio::sync::Semaphore;
use crate::cache::manifest_cache::ManifestCache;
use crate::error::{
CompressObjectSnafu, DecompressObjectSnafu, InvalidScanIndexSnafu, OpenDalSnafu, Result,
};
use crate::manifest::storage::size_tracker::Tracker;
use crate::manifest::storage::utils::{
get_from_cache, put_to_cache, sort_manifests, write_and_put_cache,
};
use crate::manifest::storage::{
FETCH_MANIFEST_PARALLELISM, delta_file, file_compress_type, file_version, gen_path,
is_delta_file,
};
#[derive(Debug, Clone)]
pub(crate) struct DeltaStorage<T: Tracker> {
object_store: ObjectStore,
compress_type: CompressionType,
path: String,
delta_tracker: Arc<T>,
manifest_cache: Option<ManifestCache>,
}
impl<T: Tracker> DeltaStorage<T> {
pub(crate) fn new(
path: String,
object_store: ObjectStore,
compress_type: CompressionType,
manifest_cache: Option<ManifestCache>,
delta_tracker: Arc<T>,
) -> Self {
Self {
object_store,
compress_type,
path,
delta_tracker,
manifest_cache,
}
}
pub(crate) fn path(&self) -> &str {
&self.path
}
pub(crate) fn object_store(&self) -> &ObjectStore {
&self.object_store
}
fn delta_file_path(&self, version: ManifestVersion) -> String {
gen_path(&self.path, &delta_file(version), self.compress_type)
}
/// Returns an iterator of manifests from path directory.
pub(crate) async fn manifest_lister(&self) -> Result<Option<Lister>> {
match self.object_store.lister_with(&self.path).await {
Ok(streamer) => Ok(Some(streamer)),
Err(e) if e.kind() == ErrorKind::NotFound => {
debug!("Manifest directory does not exist: {}", self.path);
Ok(None)
}
Err(e) => Err(e).context(OpenDalSnafu)?,
}
}
/// Returns all `R`s in the directory that meet the `filter` conditions (that is, the `filter` closure returns `Some(R)`),
/// and discards the entries that do not (that is, the `filter` closure returns `None`).
/// Returns an empty vector when the directory is not found.
pub async fn get_paths<F, R>(&self, filter: F) -> Result<Vec<R>>
where
F: Fn(Entry) -> Option<R>,
{
let Some(streamer) = self.manifest_lister().await? else {
return Ok(vec![]);
};
streamer
.try_filter_map(|e| async { Ok(filter(e)) })
.try_collect::<Vec<_>>()
.await
.context(OpenDalSnafu)
}
/// Scans the manifest files in the range of [start, end) and return all manifest entries.
pub async fn scan(
&self,
start: ManifestVersion,
end: ManifestVersion,
) -> Result<Vec<(ManifestVersion, Entry)>> {
ensure!(start <= end, InvalidScanIndexSnafu { start, end });
let mut entries: Vec<(ManifestVersion, Entry)> = self
.get_paths(|entry| {
let file_name = entry.name();
if is_delta_file(file_name) {
let version = file_version(file_name);
if start <= version && version < end {
return Some((version, entry));
}
}
None
})
.await?;
sort_manifests(&mut entries);
Ok(entries)
}
/// Fetches manifests in range [start_version, end_version).
///
/// This function is guaranteed to return manifests strictly from the `start_version` (the result must contain `start_version`).
pub async fn fetch_manifests_strict_from(
&self,
start_version: ManifestVersion,
end_version: ManifestVersion,
region_id: RegionId,
) -> Result<Vec<(ManifestVersion, Vec<u8>)>> {
let mut manifests = self.fetch_manifests(start_version, end_version).await?;
let start_index = manifests.iter().position(|(v, _)| *v == start_version);
debug!(
"Fetches manifests in range [{},{}), start_index: {:?}, region_id: {}, manifests: {:?}",
start_version,
end_version,
start_index,
region_id,
manifests.iter().map(|(v, _)| *v).collect::<Vec<_>>()
);
if let Some(start_index) = start_index {
Ok(manifests.split_off(start_index))
} else {
Ok(vec![])
}
}
/// Common implementation for fetching manifests from entries in parallel.
pub(crate) async fn fetch_manifests_from_entries(
&self,
entries: Vec<(ManifestVersion, Entry)>,
) -> Result<Vec<(ManifestVersion, Vec<u8>)>> {
if entries.is_empty() {
return Ok(vec![]);
}
// TODO(weny): Make it configurable.
let semaphore = Semaphore::new(FETCH_MANIFEST_PARALLELISM);
let tasks = entries.iter().map(|(v, entry)| async {
// Safety: semaphore must exist.
let _permit = semaphore.acquire().await.unwrap();
let cache_key = entry.path();
// Try to get from cache first
if let Some(data) = get_from_cache(self.manifest_cache.as_ref(), cache_key).await {
return Ok((*v, data));
}
// Fetch from remote object store
let compress_type = file_compress_type(entry.name());
let bytes = self
.object_store
.read(entry.path())
.await
.context(OpenDalSnafu)?;
let data = compress_type
.decode(bytes)
.await
.context(DecompressObjectSnafu {
compress_type,
path: entry.path(),
})?;
// Add to cache
put_to_cache(self.manifest_cache.as_ref(), cache_key.to_string(), &data).await;
Ok((*v, data))
});
try_join_all(tasks).await
}
/// Fetches all manifests concurrently, and returns the manifests in range [start_version, end_version).
///
/// **Notes**: This function does not guarantee that the result starts strictly at `start_version`.
/// Use [fetch_manifests_strict_from](DeltaStorage::fetch_manifests_strict_from) to get manifests strictly from the `start_version`.
pub async fn fetch_manifests(
&self,
start_version: ManifestVersion,
end_version: ManifestVersion,
) -> Result<Vec<(ManifestVersion, Vec<u8>)>> {
let manifests = self.scan(start_version, end_version).await?;
self.fetch_manifests_from_entries(manifests).await
}
/// Save the delta manifest file.
pub async fn save(&mut self, version: ManifestVersion, bytes: &[u8]) -> Result<()> {
let path = self.delta_file_path(version);
debug!("Save log to manifest storage, version: {}", version);
let data = self
.compress_type
.encode(bytes)
.await
.context(CompressObjectSnafu {
compress_type: self.compress_type,
path: &path,
})?;
let delta_size = data.len();
write_and_put_cache(
&self.object_store,
self.manifest_cache.as_ref(),
&path,
data,
)
.await?;
self.delta_tracker.record(version, delta_size as u64);
Ok(())
}
}
#[cfg(test)]
impl<T: Tracker> DeltaStorage<T> {
pub fn set_compress_type(&mut self, compress_type: CompressionType) {
self.compress_type = compress_type;
}
}


@@ -1,130 +0,0 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use std::collections::HashMap;
use std::fmt::Debug;
use std::sync::atomic::{AtomicU64, Ordering};
use std::sync::{Arc, RwLock};
use store_api::ManifestVersion;
/// Key to identify a manifest file.
#[derive(Debug, Clone, Copy, Eq, PartialEq, Hash)]
pub(crate) enum FileKey {
/// A delta file (`.json`).
Delta(ManifestVersion),
/// A checkpoint file (`.checkpoint`).
Checkpoint(ManifestVersion),
}
pub(crate) trait Tracker: Send + Sync + Debug {
fn record(&self, version: ManifestVersion, size: u64);
}
#[derive(Debug, Clone)]
pub struct CheckpointTracker {
size_tracker: SizeTracker,
}
impl Tracker for CheckpointTracker {
fn record(&self, version: ManifestVersion, size: u64) {
self.size_tracker.record(FileKey::Checkpoint(version), size);
}
}
#[derive(Debug, Clone)]
pub struct DeltaTracker {
size_tracker: SizeTracker,
}
impl Tracker for DeltaTracker {
fn record(&self, version: ManifestVersion, size: u64) {
self.size_tracker.record(FileKey::Delta(version), size);
}
}
#[derive(Debug, Clone)]
pub struct NoopTracker;
impl Tracker for NoopTracker {
fn record(&self, _version: ManifestVersion, _size: u64) {
// noop
}
}
#[derive(Debug, Clone, Default)]
pub(crate) struct SizeTracker {
file_sizes: Arc<RwLock<HashMap<FileKey, u64>>>,
total_size: Arc<AtomicU64>,
}
impl SizeTracker {
/// Returns a new [SizeTracker].
pub fn new(total_size: Arc<AtomicU64>) -> Self {
Self {
file_sizes: Arc::new(RwLock::new(HashMap::new())),
total_size,
}
}
/// Returns the manifest tracker.
pub(crate) fn manifest_tracker(&self) -> DeltaTracker {
DeltaTracker {
size_tracker: self.clone(),
}
}
/// Returns the checkpoint tracker.
pub(crate) fn checkpoint_tracker(&self) -> CheckpointTracker {
CheckpointTracker {
size_tracker: self.clone(),
}
}
/// Records a delta file size.
pub(crate) fn record_delta(&self, version: ManifestVersion, size: u64) {
self.record(FileKey::Delta(version), size);
}
/// Records a checkpoint file size.
pub(crate) fn record_checkpoint(&self, version: ManifestVersion, size: u64) {
self.record(FileKey::Checkpoint(version), size);
}
/// Removes a file from tracking.
pub(crate) fn remove(&self, key: &FileKey) {
if let Some(size) = self.file_sizes.write().unwrap().remove(key) {
self.total_size.fetch_sub(size, Ordering::Relaxed);
}
}
/// Returns the total tracked size.
pub(crate) fn total(&self) -> u64 {
self.total_size.load(Ordering::Relaxed)
}
/// Resets all tracking.
pub(crate) fn reset(&self) {
self.file_sizes.write().unwrap().clear();
self.total_size.store(0, Ordering::Relaxed);
}
fn record(&self, key: FileKey, size: u64) {
// Remove the old size if present
if let Some(old_size) = self.file_sizes.write().unwrap().insert(key, size) {
self.total_size.fetch_sub(old_size, Ordering::Relaxed);
}
self.total_size.fetch_add(size, Ordering::Relaxed);
}
}


@@ -1,109 +0,0 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use std::sync::Arc;
use common_datasource::compression::CompressionType;
use common_telemetry::debug;
use object_store::{Lister, ObjectStore, util};
use snafu::ResultExt;
use store_api::ManifestVersion;
use crate::error::{OpenDalSnafu, Result};
use crate::manifest::storage::delta::DeltaStorage;
use crate::manifest::storage::size_tracker::NoopTracker;
use crate::manifest::storage::utils::sort_manifests;
use crate::manifest::storage::{file_version, is_delta_file};
#[derive(Debug, Clone)]
pub(crate) struct StagingStorage {
delta_storage: DeltaStorage<NoopTracker>,
}
impl StagingStorage {
pub fn new(path: String, object_store: ObjectStore, compress_type: CompressionType) -> Self {
let staging_path = {
// Convert "region_dir/manifest/" to "region_dir/staging/manifest/"
let parent_dir = path.trim_end_matches("manifest/").trim_end_matches('/');
util::normalize_dir(&format!("{}/staging/manifest", parent_dir))
};
let delta_storage = DeltaStorage::new(
staging_path.clone(),
object_store.clone(),
compress_type,
// StagingStorage does not use a manifest cache; set to None.
None,
// StagingStorage does not track file sizes, since all staging files are
// deleted after exiting staging mode.
Arc::new(NoopTracker),
);
Self { delta_storage }
}
/// Returns an iterator over manifest files in the staging directory.
pub(crate) async fn manifest_lister(&self) -> Result<Option<Lister>> {
self.delta_storage.manifest_lister().await
}
/// Fetches all staging manifest files and returns them as `(version, action_list)` pairs.
pub(crate) async fn fetch_manifests(&self) -> Result<Vec<(ManifestVersion, Vec<u8>)>> {
let manifest_entries = self
.delta_storage
.get_paths(|entry| {
let file_name = entry.name();
if is_delta_file(file_name) {
let version = file_version(file_name);
Some((version, entry))
} else {
None
}
})
.await?;
let mut sorted_entries = manifest_entries;
sort_manifests(&mut sorted_entries);
self.delta_storage
.fetch_manifests_from_entries(sorted_entries)
.await
}
/// Saves a delta manifest file to the staging directory.
pub(crate) async fn save(&mut self, version: ManifestVersion, bytes: &[u8]) -> Result<()> {
self.delta_storage.save(version, bytes).await
}
/// Removes all staging manifest files.
pub(crate) async fn clear(&self) -> Result<()> {
self.delta_storage
.object_store()
.remove_all(self.delta_storage.path())
.await
.context(OpenDalSnafu)?;
debug!(
"Cleared all staging manifest files from {}",
self.delta_storage.path()
);
Ok(())
}
}
#[cfg(test)]
impl StagingStorage {
pub fn set_compress_type(&mut self, compress_type: CompressionType) {
self.delta_storage.set_compress_type(compress_type);
}
}
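To make the path rewrite in `StagingStorage::new` concrete, here is a standalone sketch of the same derivation (assuming `util::normalize_dir` only ensures a trailing slash):
// Sketch of the staging path derivation, with a plain-string stand-in for util::normalize_dir.
fn staging_manifest_path(manifest_path: &str) -> String {
    // "data/region_1/manifest/" -> "data/region_1"
    let parent_dir = manifest_path
        .trim_end_matches("manifest/")
        .trim_end_matches('/');
    // -> "data/region_1/staging/manifest/"
    format!("{}/staging/manifest/", parent_dir)
}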

View File

@@ -1,73 +0,0 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use object_store::{Entry, ObjectStore};
use snafu::ResultExt;
use store_api::ManifestVersion;
use crate::cache::manifest_cache::ManifestCache;
use crate::error::{OpenDalSnafu, Result};
/// Gets a manifest file from the cache.
/// Returns the file data if found, or `None` otherwise.
pub(crate) async fn get_from_cache(cache: Option<&ManifestCache>, key: &str) -> Option<Vec<u8>> {
let cache = cache?;
cache.get_file(key).await
}
/// Puts a manifest file into the cache.
pub(crate) async fn put_to_cache(cache: Option<&ManifestCache>, key: String, data: &[u8]) {
let Some(cache) = cache else {
return;
};
cache.put_file(key, data.to_vec()).await
}
/// Removes a manifest file from the cache.
pub(crate) async fn remove_from_cache(cache: Option<&ManifestCache>, key: &str) {
let Some(cache) = cache else {
return;
};
cache.remove(key).await
}
/// Writes data to object store and puts it into cache.
pub(crate) async fn write_and_put_cache(
object_store: &ObjectStore,
cache: Option<&ManifestCache>,
path: &str,
data: Vec<u8>,
) -> Result<()> {
// Clone the data for the cache before writing, but only if the cache is enabled.
let cache_data = if cache.is_some() {
Some(data.clone())
} else {
None
};
// Write to object store
object_store.write(path, data).await.context(OpenDalSnafu)?;
// Put to cache if we cloned the data
if let Some(data) = cache_data {
put_to_cache(cache, path.to_string(), &data).await;
}
Ok(())
}
/// Sorts the manifest entries by version.
pub(crate) fn sort_manifests(entries: &mut [(ManifestVersion, Entry)]) {
entries.sort_unstable_by_key(|(version, _)| *version);
}
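A brief usage sketch of how these helpers are expected to be combined (hypothetical path and payload; the object store and cache handles come from the surrounding storage code):
// Hypothetical usage: keep the manifest cache coherent around a write and a delete.
async fn cache_roundtrip(object_store: &ObjectStore, cache: Option<&ManifestCache>) -> Result<()> {
    let path = "region_dir/manifest/00000000000000000007.json";
    let data = b"{\"actions\":[]}".to_vec();
    // Writes to the object store and, only when a cache is configured, also caches the bytes.
    write_and_put_cache(object_store, cache, path, data).await?;
    // Reads may consult the cache first and fall back to the object store on a miss.
    let _cached: Option<Vec<u8>> = get_from_cache(cache, path).await;
    // Deleting a manifest file should also evict the cached copy.
    remove_from_cache(cache, path).await;
    Ok(())
}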

View File

@@ -25,7 +25,7 @@ use crate::manifest::action::{
RegionCheckpoint, RegionEdit, RegionMetaAction, RegionMetaActionList, RegionCheckpoint, RegionEdit, RegionMetaAction, RegionMetaActionList,
}; };
use crate::manifest::manager::RegionManifestManager; use crate::manifest::manager::RegionManifestManager;
use crate::manifest::storage::checkpoint::CheckpointMetadata; use crate::manifest::storage::CheckpointMetadata;
use crate::manifest::tests::utils::basic_region_metadata; use crate::manifest::tests::utils::basic_region_metadata;
use crate::sst::file::FileMeta; use crate::sst::file::FileMeta;
use crate::test_util::TestEnv; use crate::test_util::TestEnv;
@@ -117,8 +117,7 @@ async fn manager_without_checkpoint() {
expected.sort_unstable(); expected.sort_unstable();
let mut paths = manager let mut paths = manager
.store() .store()
.delta_storage() .get_paths(|e| Some(e.name().to_string()), false)
.get_paths(|e| Some(e.name().to_string()))
.await .await
.unwrap(); .unwrap();
paths.sort_unstable(); paths.sort_unstable();
@@ -160,8 +159,7 @@ async fn manager_with_checkpoint_distance_1() {
expected.sort_unstable(); expected.sort_unstable();
let mut paths = manager let mut paths = manager
.store() .store()
.delta_storage() .get_paths(|e| Some(e.name().to_string()), false)
.get_paths(|e| Some(e.name().to_string()))
.await .await
.unwrap(); .unwrap();
paths.sort_unstable(); paths.sort_unstable();
@@ -170,7 +168,7 @@ async fn manager_with_checkpoint_distance_1() {
// check content in `_last_checkpoint` // check content in `_last_checkpoint`
let raw_bytes = manager let raw_bytes = manager
.store() .store()
.read_file(&manager.store().checkpoint_storage().last_checkpoint_path()) .read_file(&manager.store().last_checkpoint_path())
.await .await
.unwrap(); .unwrap();
let raw_json = std::str::from_utf8(&raw_bytes).unwrap(); let raw_json = std::str::from_utf8(&raw_bytes).unwrap();
@@ -215,7 +213,7 @@ async fn test_corrupted_data_causing_checksum_error() {
// Corrupt the last checkpoint data // Corrupt the last checkpoint data
let mut corrupted_bytes = manager let mut corrupted_bytes = manager
.store() .store()
.read_file(&manager.store().checkpoint_storage().last_checkpoint_path()) .read_file(&manager.store().last_checkpoint_path())
.await .await
.unwrap(); .unwrap();
corrupted_bytes[0] ^= 1; corrupted_bytes[0] ^= 1;
@@ -223,7 +221,6 @@ async fn test_corrupted_data_causing_checksum_error() {
// Overwrite the latest checkpoint data // Overwrite the latest checkpoint data
manager manager
.store() .store()
.checkpoint_storage()
.write_last_checkpoint(9, &corrupted_bytes) .write_last_checkpoint(9, &corrupted_bytes)
.await .await
.unwrap(); .unwrap();
@@ -413,8 +410,7 @@ async fn manifest_install_manifest_to_with_checkpoint() {
expected.sort_unstable(); expected.sort_unstable();
let mut paths = manager let mut paths = manager
.store() .store()
.delta_storage() .get_paths(|e| Some(e.name().to_string()), false)
.get_paths(|e| Some(e.name().to_string()))
.await .await
.unwrap(); .unwrap();

View File

@@ -136,18 +136,18 @@ impl RangesOptions {
#[derive(Debug, Default, Clone)] #[derive(Debug, Default, Clone)]
pub struct MemtableStats { pub struct MemtableStats {
/// The estimated bytes allocated by this memtable from heap. /// The estimated bytes allocated by this memtable from heap.
pub estimated_bytes: usize, estimated_bytes: usize,
/// The inclusive time range that this memtable contains. It is None if /// The inclusive time range that this memtable contains. It is None if
/// and only if the memtable is empty. /// and only if the memtable is empty.
pub time_range: Option<(Timestamp, Timestamp)>, time_range: Option<(Timestamp, Timestamp)>,
/// Total rows in memtable /// Total rows in memtable
pub num_rows: usize, pub num_rows: usize,
/// Total number of ranges in the memtable. /// Total number of ranges in the memtable.
pub num_ranges: usize, pub num_ranges: usize,
/// The maximum sequence number in the memtable. /// The maximum sequence number in the memtable.
pub max_sequence: SequenceNumber, max_sequence: SequenceNumber,
/// Number of estimated timeseries in memtable. /// Number of estimated timeseries in memtable.
pub series_count: usize, series_count: usize,
} }
impl MemtableStats { impl MemtableStats {
@@ -204,27 +204,8 @@ pub type BoxedRecordBatchIterator = Box<dyn Iterator<Item = Result<RecordBatch>>
pub struct MemtableRanges { pub struct MemtableRanges {
/// Range IDs and ranges. /// Range IDs and ranges.
pub ranges: BTreeMap<usize, MemtableRange>, pub ranges: BTreeMap<usize, MemtableRange>,
} /// Statistics of the memtable at the query time.
pub stats: MemtableStats,
impl MemtableRanges {
/// Returns the total number of rows across all ranges.
pub fn num_rows(&self) -> usize {
self.ranges.values().map(|r| r.stats().num_rows()).sum()
}
/// Returns the total series count across all ranges.
pub fn series_count(&self) -> usize {
self.ranges.values().map(|r| r.stats().series_count()).sum()
}
/// Returns the maximum sequence number across all ranges.
pub fn max_sequence(&self) -> SequenceNumber {
self.ranges
.values()
.map(|r| r.stats().max_sequence())
.max()
.unwrap_or(0)
}
} }
impl IterBuilder for MemtableRanges { impl IterBuilder for MemtableRanges {
@@ -588,19 +569,15 @@ impl MemtableRangeContext {
pub struct MemtableRange { pub struct MemtableRange {
/// Shared context. /// Shared context.
context: MemtableRangeContextRef, context: MemtableRangeContextRef,
/// Statistics for this memtable range. /// Number of rows in current memtable range.
stats: MemtableStats, // todo(hl): use [MemtableRangeStats] instead.
num_rows: usize,
} }
impl MemtableRange { impl MemtableRange {
/// Creates a new range from context and stats. /// Creates a new range from context.
pub fn new(context: MemtableRangeContextRef, stats: MemtableStats) -> Self { pub fn new(context: MemtableRangeContextRef, num_rows: usize) -> Self {
Self { context, stats } Self { context, num_rows }
}
/// Returns the statistics for this range.
pub fn stats(&self) -> &MemtableStats {
&self.stats
} }
/// Returns the id of the memtable to read. /// Returns the id of the memtable to read.
@@ -647,7 +624,7 @@ impl MemtableRange {
} }
pub fn num_rows(&self) -> usize { pub fn num_rows(&self) -> usize {
self.stats.num_rows self.num_rows
} }
/// Returns the encoded range if available. /// Returns the encoded range if available.

View File

@@ -382,7 +382,7 @@ impl Memtable for BulkMemtable {
if !bulk_parts.unordered_part.is_empty() if !bulk_parts.unordered_part.is_empty()
&& let Some(unordered_bulk_part) = bulk_parts.unordered_part.to_bulk_part()? && let Some(unordered_bulk_part) = bulk_parts.unordered_part.to_bulk_part()?
{ {
let part_stats = unordered_bulk_part.to_memtable_stats(&self.metadata); let num_rows = unordered_bulk_part.num_rows();
let range = MemtableRange::new( let range = MemtableRange::new(
Arc::new(MemtableRangeContext::new( Arc::new(MemtableRangeContext::new(
self.id, self.id,
@@ -393,7 +393,7 @@ impl Memtable for BulkMemtable {
}), }),
predicate.clone(), predicate.clone(),
)), )),
part_stats, num_rows,
); );
ranges.insert(range_id, range); ranges.insert(range_id, range);
range_id += 1; range_id += 1;
@@ -406,7 +406,6 @@ impl Memtable for BulkMemtable {
continue; continue;
} }
let part_stats = part_wrapper.part.to_memtable_stats(&self.metadata);
let range = MemtableRange::new( let range = MemtableRange::new(
Arc::new(MemtableRangeContext::new( Arc::new(MemtableRangeContext::new(
self.id, self.id,
@@ -417,7 +416,7 @@ impl Memtable for BulkMemtable {
}), }),
predicate.clone(), predicate.clone(),
)), )),
part_stats, part_wrapper.part.num_rows(),
); );
ranges.insert(range_id, range); ranges.insert(range_id, range);
range_id += 1; range_id += 1;
@@ -430,7 +429,6 @@ impl Memtable for BulkMemtable {
continue; continue;
} }
let part_stats = encoded_part_wrapper.part.to_memtable_stats();
let range = MemtableRange::new( let range = MemtableRange::new(
Arc::new(MemtableRangeContext::new( Arc::new(MemtableRangeContext::new(
self.id, self.id,
@@ -442,14 +440,18 @@ impl Memtable for BulkMemtable {
}), }),
predicate.clone(), predicate.clone(),
)), )),
part_stats, encoded_part_wrapper.part.metadata().num_rows,
); );
ranges.insert(range_id, range); ranges.insert(range_id, range);
range_id += 1; range_id += 1;
} }
} }
Ok(MemtableRanges { ranges }) let mut stats = self.stats();
stats.num_ranges = ranges.len();
// TODO(yingwen): Supports per range stats.
Ok(MemtableRanges { ranges, stats })
} }
fn is_empty(&self) -> bool { fn is_empty(&self) -> bool {
@@ -809,14 +811,6 @@ impl PartToMerge {
} }
} }
/// Gets the maximum sequence number of this part.
fn max_sequence(&self) -> u64 {
match self {
PartToMerge::Bulk { part, .. } => part.sequence,
PartToMerge::Encoded { part, .. } => part.metadata().max_sequence,
}
}
/// Creates a record batch iterator for this part. /// Creates a record batch iterator for this part.
fn create_iterator( fn create_iterator(
self, self,
@@ -990,7 +984,7 @@ impl MemtableCompactor {
return Ok(None); return Ok(None);
} }
// Calculates timestamp bounds and max sequence for merged data // Calculates timestamp bounds for merged data
let min_timestamp = parts_to_merge let min_timestamp = parts_to_merge
.iter() .iter()
.map(|p| p.min_timestamp()) .map(|p| p.min_timestamp())
@@ -1001,11 +995,6 @@ impl MemtableCompactor {
.map(|p| p.max_timestamp()) .map(|p| p.max_timestamp())
.max() .max()
.unwrap_or(i64::MIN); .unwrap_or(i64::MIN);
let max_sequence = parts_to_merge
.iter()
.map(|p| p.max_sequence())
.max()
.unwrap_or(0);
let context = Arc::new(BulkIterContext::new( let context = Arc::new(BulkIterContext::new(
metadata.clone(), metadata.clone(),
@@ -1062,7 +1051,6 @@ impl MemtableCompactor {
arrow_schema.clone(), arrow_schema.clone(),
min_timestamp, min_timestamp,
max_timestamp, max_timestamp,
max_sequence,
&mut metrics, &mut metrics,
)?; )?;
@@ -1290,8 +1278,7 @@ mod tests {
.unwrap(); .unwrap();
assert_eq!(3, ranges.ranges.len()); assert_eq!(3, ranges.ranges.len());
let total_rows: usize = ranges.ranges.values().map(|r| r.stats().num_rows()).sum(); assert_eq!(5, ranges.stats.num_rows);
assert_eq!(5, total_rows);
for (_range_id, range) in ranges.ranges.iter() { for (_range_id, range) in ranges.ranges.iter() {
assert!(range.num_rows() > 0); assert!(range.num_rows() > 0);
@@ -1459,9 +1446,8 @@ mod tests {
.unwrap(); .unwrap();
assert_eq!(3, ranges.ranges.len()); assert_eq!(3, ranges.ranges.len());
let total_rows: usize = ranges.ranges.values().map(|r| r.stats().num_rows()).sum(); assert_eq!(5, ranges.stats.num_rows);
assert_eq!(5, total_rows); assert_eq!(3, ranges.stats.num_ranges);
assert_eq!(3, ranges.ranges.len());
for (range_id, range) in ranges.ranges.iter() { for (range_id, range) in ranges.ranges.iter() {
assert!(*range_id < 3); assert!(*range_id < 3);
@@ -1538,8 +1524,7 @@ mod tests {
// Should have ranges for both bulk parts and encoded parts // Should have ranges for both bulk parts and encoded parts
assert_eq!(3, ranges.ranges.len()); assert_eq!(3, ranges.ranges.len());
let total_rows: usize = ranges.ranges.values().map(|r| r.stats().num_rows()).sum(); assert_eq!(10, ranges.stats.num_rows);
assert_eq!(10, total_rows);
for (_range_id, range) in ranges.ranges.iter() { for (_range_id, range) in ranges.ranges.iter() {
assert!(range.num_rows() > 0); assert!(range.num_rows() > 0);
@@ -1621,8 +1606,7 @@ mod tests {
// Should have at least 1 range (the compacted part) // Should have at least 1 range (the compacted part)
assert!(!ranges.ranges.is_empty()); assert!(!ranges.ranges.is_empty());
let total_rows: usize = ranges.ranges.values().map(|r| r.stats().num_rows()).sum(); assert_eq!(10, ranges.stats.num_rows);
assert_eq!(10, total_rows);
// Read all data and verify // Read all data and verify
let mut total_rows_read = 0; let mut total_rows_read = 0;
@@ -1709,8 +1693,7 @@ mod tests {
) )
.unwrap(); .unwrap();
let total_rows: usize = ranges.ranges.values().map(|r| r.stats().num_rows()).sum(); assert_eq!(13, ranges.stats.num_rows);
assert_eq!(13, total_rows);
let mut total_rows_read = 0; let mut total_rows_read = 0;
for (_range_id, range) in ranges.ranges.iter() { for (_range_id, range) in ranges.ranges.iter() {
@@ -1767,8 +1750,7 @@ mod tests {
// Should have 1 range for the unordered_part // Should have 1 range for the unordered_part
assert_eq!(1, ranges.ranges.len()); assert_eq!(1, ranges.ranges.len());
let total_rows: usize = ranges.ranges.values().map(|r| r.stats().num_rows()).sum(); assert_eq!(3, ranges.stats.num_rows);
assert_eq!(3, total_rows);
// Verify data is sorted correctly in the range // Verify data is sorted correctly in the range
let range = ranges.ranges.get(&0).unwrap(); let range = ranges.ranges.get(&0).unwrap();

View File

@@ -66,7 +66,7 @@ use crate::error::{
use crate::memtable::bulk::context::BulkIterContextRef; use crate::memtable::bulk::context::BulkIterContextRef;
use crate::memtable::bulk::part_reader::EncodedBulkPartIter; use crate::memtable::bulk::part_reader::EncodedBulkPartIter;
use crate::memtable::time_series::{ValueBuilder, Values}; use crate::memtable::time_series::{ValueBuilder, Values};
use crate::memtable::{BoxedRecordBatchIterator, MemScanMetrics, MemtableStats}; use crate::memtable::{BoxedRecordBatchIterator, MemScanMetrics};
use crate::sst::index::IndexOutput; use crate::sst::index::IndexOutput;
use crate::sst::parquet::file_range::{PreFilterMode, row_group_contains_delete}; use crate::sst::parquet::file_range::{PreFilterMode, row_group_contains_delete};
use crate::sst::parquet::flat_format::primary_key_column_index; use crate::sst::parquet::flat_format::primary_key_column_index;
@@ -170,22 +170,6 @@ impl BulkPart {
} }
} }
/// Creates MemtableStats from this BulkPart.
pub fn to_memtable_stats(&self, region_metadata: &RegionMetadataRef) -> MemtableStats {
let ts_type = region_metadata.time_index_type();
let min_ts = ts_type.create_timestamp(self.min_timestamp);
let max_ts = ts_type.create_timestamp(self.max_timestamp);
MemtableStats {
estimated_bytes: self.estimated_size(),
time_range: Some((min_ts, max_ts)),
num_rows: self.num_rows(),
num_ranges: 1,
max_sequence: self.sequence,
series_count: self.estimated_series_count(),
}
}
/// Fills missing columns in the BulkPart batch with default values. /// Fills missing columns in the BulkPart batch with default values.
/// ///
/// This function checks if the batch schema matches the region metadata schema, /// This function checks if the batch schema matches the region metadata schema,
@@ -981,23 +965,6 @@ impl EncodedBulkPart {
&self.data &self.data
} }
/// Creates MemtableStats from this EncodedBulkPart.
pub fn to_memtable_stats(&self) -> MemtableStats {
let meta = &self.metadata;
let ts_type = meta.region_metadata.time_index_type();
let min_ts = ts_type.create_timestamp(meta.min_timestamp);
let max_ts = ts_type.create_timestamp(meta.max_timestamp);
MemtableStats {
estimated_bytes: self.size_bytes(),
time_range: Some((min_ts, max_ts)),
num_rows: meta.num_rows,
num_ranges: 1,
max_sequence: meta.max_sequence,
series_count: meta.num_series as usize,
}
}
/// Converts this `EncodedBulkPart` to `SstInfo`. /// Converts this `EncodedBulkPart` to `SstInfo`.
/// ///
/// # Arguments /// # Arguments
@@ -1094,8 +1061,6 @@ pub struct BulkPartMeta {
pub region_metadata: RegionMetadataRef, pub region_metadata: RegionMetadataRef,
/// Number of series. /// Number of series.
pub num_series: u64, pub num_series: u64,
/// Maximum sequence number in part.
pub max_sequence: u64,
} }
/// Metrics for encoding a part. /// Metrics for encoding a part.
@@ -1157,7 +1122,6 @@ impl BulkPartEncoder {
arrow_schema: SchemaRef, arrow_schema: SchemaRef,
min_timestamp: i64, min_timestamp: i64,
max_timestamp: i64, max_timestamp: i64,
max_sequence: u64,
metrics: &mut BulkPartEncodeMetrics, metrics: &mut BulkPartEncodeMetrics,
) -> Result<Option<EncodedBulkPart>> { ) -> Result<Option<EncodedBulkPart>> {
let mut buf = Vec::with_capacity(4096); let mut buf = Vec::with_capacity(4096);
@@ -1209,7 +1173,6 @@ impl BulkPartEncoder {
parquet_metadata, parquet_metadata,
region_metadata: self.metadata.clone(), region_metadata: self.metadata.clone(),
num_series, num_series,
max_sequence,
}, },
})) }))
} }
@@ -1243,7 +1206,6 @@ impl BulkPartEncoder {
parquet_metadata, parquet_metadata,
region_metadata: self.metadata.clone(), region_metadata: self.metadata.clone(),
num_series: part.estimated_series_count() as u64, num_series: part.estimated_series_count() as u64,
max_sequence: part.sequence,
}, },
})) }))
} }

View File

@@ -203,10 +203,10 @@ impl Memtable for PartitionTreeMemtable {
}); });
let context = Arc::new(MemtableRangeContext::new(self.id, builder, predicate)); let context = Arc::new(MemtableRangeContext::new(self.id, builder, predicate));
let range_stats = self.stats(); let stats = self.stats();
let range = MemtableRange::new(context, range_stats);
Ok(MemtableRanges { Ok(MemtableRanges {
ranges: [(0, range)].into(), ranges: [(0, MemtableRange::new(context, stats.num_rows))].into(),
stats,
}) })
} }

View File

@@ -243,23 +243,6 @@ impl Memtable for SimpleBulkMemtable {
let sequence = options.sequence; let sequence = options.sequence;
let start_time = Instant::now(); let start_time = Instant::now();
let projection = Arc::new(self.build_projection(projection)); let projection = Arc::new(self.build_projection(projection));
// Use the memtable's overall time range and max sequence for all ranges
let max_sequence = self.max_sequence.load(Ordering::Relaxed);
let time_range = {
let num_rows = self.num_rows.load(Ordering::Relaxed);
if num_rows > 0 {
let ts_type = self.region_metadata.time_index_type();
let max_timestamp =
ts_type.create_timestamp(self.max_timestamp.load(Ordering::Relaxed));
let min_timestamp =
ts_type.create_timestamp(self.min_timestamp.load(Ordering::Relaxed));
Some((min_timestamp, max_timestamp))
} else {
None
}
};
let values = self.series.read().unwrap().read_to_values(); let values = self.series.read().unwrap().read_to_values();
let contexts = values let contexts = values
.into_par_iter() .into_par_iter()
@@ -284,24 +267,13 @@ impl Memtable for SimpleBulkMemtable {
.map(|result| { .map(|result| {
result.map(|batch| { result.map(|batch| {
let num_rows = batch.num_rows(); let num_rows = batch.num_rows();
let estimated_bytes = batch.memory_size();
let range_stats = MemtableStats {
estimated_bytes,
time_range,
num_rows,
num_ranges: 1,
max_sequence,
series_count: 1,
};
let builder = BatchRangeBuilder { let builder = BatchRangeBuilder {
batch, batch,
merge_mode: self.merge_mode, merge_mode: self.merge_mode,
scan_cost: start_time.elapsed(), scan_cost: start_time.elapsed(),
}; };
( (
range_stats, num_rows,
Arc::new(MemtableRangeContext::new( Arc::new(MemtableRangeContext::new(
self.id, self.id,
Box::new(builder), Box::new(builder),
@@ -315,10 +287,13 @@ impl Memtable for SimpleBulkMemtable {
let ranges = contexts let ranges = contexts
.into_iter() .into_iter()
.enumerate() .enumerate()
.map(|(idx, (range_stats, context))| (idx, MemtableRange::new(context, range_stats))) .map(|(idx, (num_rows, context))| (idx, MemtableRange::new(context, num_rows)))
.collect(); .collect();
Ok(MemtableRanges { ranges }) Ok(MemtableRanges {
ranges,
stats: self.stats(),
})
} }
fn is_empty(&self) -> bool { fn is_empty(&self) -> bool {
@@ -344,7 +319,14 @@ impl Memtable for SimpleBulkMemtable {
series_count: 0, series_count: 0,
}; };
} }
let ts_type = self.region_metadata.time_index_type(); let ts_type = self
.region_metadata
.time_index_column()
.column_schema
.data_type
.clone()
.as_timestamp()
.expect("Timestamp column must have timestamp type");
let max_timestamp = ts_type.create_timestamp(self.max_timestamp.load(Ordering::Relaxed)); let max_timestamp = ts_type.create_timestamp(self.max_timestamp.load(Ordering::Relaxed));
let min_timestamp = ts_type.create_timestamp(self.min_timestamp.load(Ordering::Relaxed)); let min_timestamp = ts_type.create_timestamp(self.min_timestamp.load(Ordering::Relaxed));
MemtableStats { MemtableStats {

View File

@@ -325,10 +325,10 @@ impl Memtable for TimeSeriesMemtable {
}); });
let context = Arc::new(MemtableRangeContext::new(self.id, builder, predicate)); let context = Arc::new(MemtableRangeContext::new(self.id, builder, predicate));
let range_stats = self.stats(); let stats = self.stats();
let range = MemtableRange::new(context, range_stats);
Ok(MemtableRanges { Ok(MemtableRanges {
ranges: [(0, range)].into(), ranges: [(0, MemtableRange::new(context, stats.num_rows))].into(),
stats,
}) })
} }

View File

@@ -458,7 +458,10 @@ impl ScanRegion {
.with_pre_filter_mode(filter_mode), .with_pre_filter_mode(filter_mode),
)?; )?;
mem_range_builders.extend(ranges_in_memtable.ranges.into_values().map(|v| { mem_range_builders.extend(ranges_in_memtable.ranges.into_values().map(|v| {
let stats = v.stats().clone(); // todo: we should add stats to MemtableRange
let mut stats = ranges_in_memtable.stats.clone();
stats.num_ranges = 1;
stats.num_rows = v.num_rows();
MemRangeBuilder::new(v, stats) MemRangeBuilder::new(v, stats)
})); }));
} }

View File

@@ -45,7 +45,7 @@ pub use utils::*;
use crate::access_layer::AccessLayerRef; use crate::access_layer::AccessLayerRef;
use crate::error::{ use crate::error::{
FlushableRegionStateSnafu, InvalidPartitionExprSnafu, RegionNotFoundSnafu, RegionStateSnafu, FlushableRegionStateSnafu, InvalidPartitionExprSnafu, RegionNotFoundSnafu, RegionStateSnafu,
RegionTruncatedSnafu, Result, UnexpectedSnafu, UpdateManifestSnafu, RegionTruncatedSnafu, Result, UpdateManifestSnafu,
}; };
use crate::manifest::action::{ use crate::manifest::action::{
RegionChange, RegionManifest, RegionMetaAction, RegionMetaActionList, RegionChange, RegionManifest, RegionMetaAction, RegionMetaActionList,
@@ -102,16 +102,6 @@ pub enum RegionRoleState {
Follower, Follower,
} }
impl RegionRoleState {
/// Converts the region role state to leader state if it is a leader state.
pub fn into_leader_state(self) -> Option<RegionLeaderState> {
match self {
RegionRoleState::Leader(leader_state) => Some(leader_state),
RegionRoleState::Follower => None,
}
}
}
/// Metadata and runtime status of a region. /// Metadata and runtime status of a region.
/// ///
/// Writing and reading a region follow a single-writer-multi-reader rule: /// Writing and reading a region follow a single-writer-multi-reader rule:
@@ -332,8 +322,11 @@ impl MitoRegion {
/// Sets the editing state. /// Sets the editing state.
/// You should call this method in the worker loop. /// You should call this method in the worker loop.
pub(crate) fn set_editing(&self, expect: RegionLeaderState) -> Result<()> { pub(crate) fn set_editing(&self) -> Result<()> {
self.compare_exchange_state(expect, RegionRoleState::Leader(RegionLeaderState::Editing)) self.compare_exchange_state(
RegionLeaderState::Writable,
RegionRoleState::Leader(RegionLeaderState::Editing),
)
} }
/// Sets the staging state. /// Sets the staging state.
@@ -366,7 +359,6 @@ impl MitoRegion {
/// You should call this method in the worker loop. /// You should call this method in the worker loop.
/// Transitions from Staging to Writable state. /// Transitions from Staging to Writable state.
pub fn exit_staging(&self) -> Result<()> { pub fn exit_staging(&self) -> Result<()> {
*self.staging_partition_expr.lock().unwrap() = None;
self.compare_exchange_state( self.compare_exchange_state(
RegionLeaderState::Staging, RegionLeaderState::Staging,
RegionRoleState::Leader(RegionLeaderState::Writable), RegionRoleState::Leader(RegionLeaderState::Writable),
@@ -378,8 +370,7 @@ impl MitoRegion {
&self, &self,
state: SettableRegionRoleState, state: SettableRegionRoleState,
) -> Result<()> { ) -> Result<()> {
let mut manager: RwLockWriteGuard<'_, RegionManifestManager> = let mut manager = self.manifest_ctx.manifest_manager.write().await;
self.manifest_ctx.manifest_manager.write().await;
let current_state = self.state(); let current_state = self.state();
match state { match state {
@@ -714,20 +705,6 @@ impl MitoRegion {
return Ok(()); return Ok(());
} }
}; };
let expect_change = merged_actions.actions.iter().any(|a| a.is_change());
let expect_edit = merged_actions.actions.iter().any(|a| a.is_edit());
ensure!(
expect_change,
UnexpectedSnafu {
reason: "expect a change action in merged actions"
}
);
ensure!(
expect_edit,
UnexpectedSnafu {
reason: "expect an edit action in merged actions"
}
);
// Submit merged actions using the manifest manager's update method // Submit merged actions using the manifest manager's update method
// Pass the `false` so it saves to normal directory, not staging // Pass the `false` so it saves to normal directory, not staging
@@ -739,17 +716,12 @@ impl MitoRegion {
); );
// Apply the merged changes to in-memory version control // Apply the merged changes to in-memory version control
let (merged_change, merged_edit) = merged_actions.split_region_change_and_edit(); let merged_edit = merged_actions.into_region_edit();
// Safety: we have already ensured that there is a change action in the merged actions.
let new_metadata = merged_change.as_ref().unwrap().metadata.clone();
self.version_control.alter_schema(new_metadata);
self.version_control self.version_control
.apply_edit(Some(merged_edit), &[], self.file_purger.clone()); .apply_edit(Some(merged_edit), &[], self.file_purger.clone());
// Clear all staging manifests and transit state // Clear all staging manifests and transit state
if let Err(e) = manager.clear_staging_manifest_and_dir().await { manager.store().clear_staging_manifests().await?;
error!(e; "Failed to clear staging manifest dir for region {}", self.region_id);
}
self.exit_staging()?; self.exit_staging()?;
Ok(()) Ok(())

View File

@@ -29,7 +29,6 @@ use serde_json::Value;
use serde_with::{DisplayFromStr, NoneAsEmptyString, serde_as, with_prefix}; use serde_with::{DisplayFromStr, NoneAsEmptyString, serde_as, with_prefix};
use snafu::{ResultExt, ensure}; use snafu::{ResultExt, ensure};
use store_api::codec::PrimaryKeyEncoding; use store_api::codec::PrimaryKeyEncoding;
use store_api::mito_engine_options::COMPACTION_OVERRIDE;
use store_api::storage::ColumnId; use store_api::storage::ColumnId;
use strum::EnumString; use strum::EnumString;
@@ -63,7 +62,6 @@ pub struct RegionOptions {
pub ttl: Option<TimeToLive>, pub ttl: Option<TimeToLive>,
/// Compaction options. /// Compaction options.
pub compaction: CompactionOptions, pub compaction: CompactionOptions,
pub compaction_override: bool,
/// Custom storage. Uses default storage if it is `None`. /// Custom storage. Uses default storage if it is `None`.
pub storage: Option<String>, pub storage: Option<String>,
/// If append mode is enabled, the region keeps duplicate rows. /// If append mode is enabled, the region keeps duplicate rows.
@@ -127,8 +125,7 @@ impl TryFrom<&HashMap<String, String>> for RegionOptions {
// See https://github.com/serde-rs/serde/issues/1626 // See https://github.com/serde-rs/serde/issues/1626
let options: RegionOptionsWithoutEnum = let options: RegionOptionsWithoutEnum =
serde_json::from_str(&json).context(JsonOptionsSnafu)?; serde_json::from_str(&json).context(JsonOptionsSnafu)?;
let has_compaction_type = validate_enum_options(options_map, "compaction.type")?; let compaction = if validate_enum_options(options_map, "compaction.type")? {
let compaction = if has_compaction_type {
serde_json::from_str(&json).context(JsonOptionsSnafu)? serde_json::from_str(&json).context(JsonOptionsSnafu)?
} else { } else {
CompactionOptions::default() CompactionOptions::default()
@@ -149,16 +146,9 @@ impl TryFrom<&HashMap<String, String>> for RegionOptions {
None None
}; };
let compaction_override_flag = options_map
.get(COMPACTION_OVERRIDE)
.map(|v| matches!(v.to_lowercase().as_str(), "true" | "1"))
.unwrap_or(false);
let compaction_override = has_compaction_type || compaction_override_flag;
let opts = RegionOptions { let opts = RegionOptions {
ttl: options.ttl, ttl: options.ttl,
compaction, compaction,
compaction_override,
storage: options.storage, storage: options.storage,
append_mode: options.append_mode, append_mode: options.append_mode,
wal_options, wal_options,
@@ -527,7 +517,6 @@ mod tests {
time_window: Some(Duration::from_secs(3600 * 2)), time_window: Some(Duration::from_secs(3600 * 2)),
..Default::default() ..Default::default()
}), }),
compaction_override: true,
..Default::default() ..Default::default()
}; };
assert_eq!(expect, options); assert_eq!(expect, options);
@@ -655,7 +644,6 @@ mod tests {
remote_compaction: false, remote_compaction: false,
fallback_to_local: true, fallback_to_local: true,
}), }),
compaction_override: true,
storage: Some("S3".to_string()), storage: Some("S3".to_string()),
append_mode: false, append_mode: false,
wal_options, wal_options,
@@ -688,7 +676,6 @@ mod tests {
remote_compaction: false, remote_compaction: false,
fallback_to_local: true, fallback_to_local: true,
}), }),
compaction_override: false,
storage: Some("S3".to_string()), storage: Some("S3".to_string()),
append_mode: false, append_mode: false,
wal_options: WalOptions::Kafka(KafkaWalOptions { wal_options: WalOptions::Kafka(KafkaWalOptions {
@@ -753,7 +740,6 @@ mod tests {
remote_compaction: false, remote_compaction: false,
fallback_to_local: true, fallback_to_local: true,
}), }),
compaction_override: false,
storage: Some("S3".to_string()), storage: Some("S3".to_string()),
append_mode: false, append_mode: false,
wal_options: WalOptions::Kafka(KafkaWalOptions { wal_options: WalOptions::Kafka(KafkaWalOptions {

View File

@@ -37,10 +37,10 @@ use store_api::region_engine::{
MitoCopyRegionFromResponse, SetRegionRoleStateResponse, SettableRegionRoleState, MitoCopyRegionFromResponse, SetRegionRoleStateResponse, SettableRegionRoleState,
}; };
use store_api::region_request::{ use store_api::region_request::{
AffectedRows, ApplyStagingManifestRequest, EnterStagingRequest, RegionAlterRequest, AffectedRows, EnterStagingRequest, RegionAlterRequest, RegionBuildIndexRequest,
RegionBuildIndexRequest, RegionBulkInsertsRequest, RegionCatchupRequest, RegionCloseRequest, RegionBulkInsertsRequest, RegionCatchupRequest, RegionCloseRequest, RegionCompactRequest,
RegionCompactRequest, RegionCreateRequest, RegionFlushRequest, RegionOpenRequest, RegionCreateRequest, RegionFlushRequest, RegionOpenRequest, RegionRequest,
RegionRequest, RegionTruncateRequest, RegionTruncateRequest,
}; };
use store_api::storage::{FileId, RegionId}; use store_api::storage::{FileId, RegionId};
use tokio::sync::oneshot::{self, Receiver, Sender}; use tokio::sync::oneshot::{self, Receiver, Sender};
@@ -741,11 +741,6 @@ impl WorkerRequest {
sender: sender.into(), sender: sender.into(),
request: region_bulk_inserts_request, request: region_bulk_inserts_request,
}, },
RegionRequest::ApplyStagingManifest(v) => WorkerRequest::Ddl(SenderDdlRequest {
region_id,
sender: sender.into(),
request: DdlRequest::ApplyStagingManifest(v),
}),
}; };
Ok((worker_request, receiver)) Ok((worker_request, receiver))
@@ -824,13 +819,13 @@ impl WorkerRequest {
Ok((WorkerRequest::RemapManifests(request), receiver)) Ok((WorkerRequest::RemapManifests(request), receiver))
} }
/// Converts [CopyRegionFromRequest] from a [MitoCopyRegionFromRequest](store_api::region_engine::MitoCopyRegionFromRequest). /// Converts [CopyRegionFromRequest] from a [CopyRegionFromRequest](store_api::region_engine::CopyRegionFromRequest).
pub(crate) fn try_from_copy_region_from_request( pub(crate) fn try_from_copy_region_from_request(
region_id: RegionId, region_id: RegionId,
store_api::region_engine::MitoCopyRegionFromRequest { store_api::region_engine::CopyRegionFromRequest {
source_region_id, source_region_id,
parallelism, parallelism,
}: store_api::region_engine::MitoCopyRegionFromRequest, }: store_api::region_engine::CopyRegionFromRequest,
) -> Result<(WorkerRequest, Receiver<Result<MitoCopyRegionFromResponse>>)> { ) -> Result<(WorkerRequest, Receiver<Result<MitoCopyRegionFromResponse>>)> {
let (sender, receiver) = oneshot::channel(); let (sender, receiver) = oneshot::channel();
let request = CopyRegionFromRequest { let request = CopyRegionFromRequest {
@@ -857,7 +852,6 @@ pub(crate) enum DdlRequest {
Truncate(RegionTruncateRequest), Truncate(RegionTruncateRequest),
Catchup((RegionCatchupRequest, Option<WalEntryReceiver>)), Catchup((RegionCatchupRequest, Option<WalEntryReceiver>)),
EnterStaging(EnterStagingRequest), EnterStaging(EnterStagingRequest),
ApplyStagingManifest(ApplyStagingManifestRequest),
} }
/// Sender and Ddl request. /// Sender and Ddl request.
@@ -1086,8 +1080,6 @@ pub(crate) struct RegionEditResult {
pub(crate) result: Result<()>, pub(crate) result: Result<()>,
/// Whether region state need to be set to Writable after handling this request. /// Whether region state need to be set to Writable after handling this request.
pub(crate) update_region_state: bool, pub(crate) update_region_state: bool,
/// The region is in staging mode before handling this request.
pub(crate) is_staging: bool,
} }
#[derive(Debug)] #[derive(Debug)]

View File

@@ -15,7 +15,6 @@
//! Structs and utilities for writing regions. //! Structs and utilities for writing regions.
mod handle_alter; mod handle_alter;
mod handle_apply_staging;
mod handle_bulk_insert; mod handle_bulk_insert;
mod handle_catchup; mod handle_catchup;
mod handle_close; mod handle_close;
@@ -1006,7 +1005,7 @@ impl<S: LogStore> RegionWorkerLoop<S> {
.await; .await;
} }
WorkerRequest::EditRegion(request) => { WorkerRequest::EditRegion(request) => {
self.handle_region_edit(request); self.handle_region_edit(request).await;
} }
WorkerRequest::Stop => { WorkerRequest::Stop => {
debug_assert!(!self.running.load(Ordering::Relaxed)); debug_assert!(!self.running.load(Ordering::Relaxed));
@@ -1108,11 +1107,6 @@ impl<S: LogStore> RegionWorkerLoop<S> {
.await; .await;
continue; continue;
} }
DdlRequest::ApplyStagingManifest(req) => {
self.handle_apply_staging_manifest_request(ddl.region_id, req, ddl.sender)
.await;
continue;
}
}; };
ddl.sender.send(res); ddl.sender.send(res);

View File

@@ -1,140 +0,0 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use chrono::Utc;
use common_telemetry::{debug, info};
use snafu::ResultExt;
use store_api::logstore::LogStore;
use store_api::region_request::ApplyStagingManifestRequest;
use store_api::storage::RegionId;
use tokio::sync::oneshot;
use crate::error::{
RegionStateSnafu, SerdeJsonSnafu, StagingPartitionExprMismatchSnafu, UnexpectedSnafu,
};
use crate::manifest::action::RegionEdit;
use crate::region::{RegionLeaderState, RegionRoleState};
use crate::request::{OptionOutputTx, RegionEditRequest};
use crate::sst::file::FileMeta;
use crate::worker::RegionWorkerLoop;
impl<S: LogStore> RegionWorkerLoop<S> {
pub(crate) async fn handle_apply_staging_manifest_request(
&mut self,
region_id: RegionId,
request: ApplyStagingManifestRequest,
sender: OptionOutputTx,
) {
let region = match self.regions.writable_region(region_id) {
Ok(region) => region,
Err(e) => {
sender.send(Err(e));
return;
}
};
if !region.is_staging() {
let manifest_partition_expr = region.metadata().partition_expr.as_ref().cloned();
let is_match = manifest_partition_expr.as_ref() == Some(&request.partition_expr);
debug!(
"region {} manifest partition expr: {:?}, request partition expr: {:?}",
region_id, manifest_partition_expr, request.partition_expr
);
if is_match {
// If current partition expr is already the same as the request,
// treats the region already applied the staging manifest.
info!(
"Region {} already applied the staging manifest, partition expr: {}, ignore the apply staging manifest request",
region_id, request.partition_expr
);
sender.send(Ok(0));
return;
}
sender.send(
RegionStateSnafu {
region_id,
state: region.state(),
expect: RegionRoleState::Leader(RegionLeaderState::Staging),
}
.fail(),
);
return;
}
let staging_partition_expr = region.staging_partition_expr.lock().unwrap().clone();
// If the partition expr mismatch, return error.
if staging_partition_expr.as_ref() != Some(&request.partition_expr) {
sender.send(
StagingPartitionExprMismatchSnafu {
manifest_expr: staging_partition_expr,
request_expr: request.partition_expr,
}
.fail(),
);
return;
}
let (tx, rx) = oneshot::channel();
let files_to_add = match serde_json::from_slice::<Vec<FileMeta>>(&request.files_to_add)
.context(SerdeJsonSnafu)
{
Ok(files_to_add) => files_to_add,
Err(e) => {
sender.send(Err(e));
return;
}
};
info!("Applying staging manifest request to region {}", region_id);
self.handle_region_edit(RegionEditRequest {
region_id,
edit: RegionEdit {
files_to_add,
files_to_remove: vec![],
timestamp_ms: Some(Utc::now().timestamp_millis()),
compaction_time_window: None,
flushed_entry_id: None,
flushed_sequence: None,
committed_sequence: None,
},
tx,
});
common_runtime::spawn_global(async move {
// Await the result from the region edit and forward the outcome to the original sender.
// If the operation completes successfully, respond with Ok(0); otherwise, respond with an appropriate error.
if let Ok(result) = rx.await {
let Ok(()) = result else {
sender.send(result.map(|_| 0));
return;
};
let mut manager = region.manifest_ctx.manifest_manager.write().await;
match region.exit_staging_on_success(&mut manager).await {
Ok(()) => {
sender.send(Ok(0));
}
Err(e) => sender.send(Err(e)),
}
} else {
sender.send(
UnexpectedSnafu {
reason: "edit region receiver channel closed",
}
.fail(),
);
}
});
}
}

View File

@@ -214,7 +214,7 @@ impl<S: LogStore> RegionWorkerLoop<S> {
impl<S> RegionWorkerLoop<S> { impl<S> RegionWorkerLoop<S> {
/// Handles region edit request. /// Handles region edit request.
pub(crate) fn handle_region_edit(&mut self, request: RegionEditRequest) { pub(crate) async fn handle_region_edit(&mut self, request: RegionEditRequest) {
let region_id = request.region_id; let region_id = request.region_id;
let Some(region) = self.regions.get_region(region_id) else { let Some(region) = self.regions.get_region(region_id) else {
let _ = request.tx.send(RegionNotFoundSnafu { region_id }.fail()); let _ = request.tx.send(RegionNotFoundSnafu { region_id }.fail());
@@ -246,15 +246,8 @@ impl<S> RegionWorkerLoop<S> {
file.sequence = NonZeroU64::new(file_sequence); file.sequence = NonZeroU64::new(file_sequence);
} }
// Allow retrieving `is_staging` before spawn the edit region task.
let is_staging = region.is_staging();
let expect_state = if is_staging {
RegionLeaderState::Staging
} else {
RegionLeaderState::Writable
};
// Marks the region as editing. // Marks the region as editing.
if let Err(e) = region.set_editing(expect_state) { if let Err(e) = region.set_editing() {
let _ = sender.send(Err(e)); let _ = sender.send(Err(e));
return; return;
} }
@@ -265,8 +258,7 @@ impl<S> RegionWorkerLoop<S> {
// Now the region is in editing state. // Now the region is in editing state.
// Updates manifest in background. // Updates manifest in background.
common_runtime::spawn_global(async move { common_runtime::spawn_global(async move {
let result = let result = edit_region(&region, edit.clone(), cache_manager, listener).await;
edit_region(&region, edit.clone(), cache_manager, listener, is_staging).await;
let notify = WorkerRequest::Background { let notify = WorkerRequest::Background {
region_id, region_id,
notify: BackgroundNotify::RegionEdit(RegionEditResult { notify: BackgroundNotify::RegionEdit(RegionEditResult {
@@ -276,7 +268,6 @@ impl<S> RegionWorkerLoop<S> {
result, result,
// we always need to restore region state after region edit // we always need to restore region state after region edit
update_region_state: true, update_region_state: true,
is_staging,
}), }),
}; };
@@ -308,39 +299,29 @@ impl<S> RegionWorkerLoop<S> {
} }
}; };
let need_compaction = if edit_result.is_staging { let need_compaction =
if edit_result.update_region_state { edit_result.result.is_ok() && !edit_result.edit.files_to_add.is_empty();
// For staging regions, edits are not applied immediately,
// as they remain invisible until the region exits the staging state.
region.switch_state_to_staging(RegionLeaderState::Editing);
}
false if edit_result.result.is_ok() {
} else { // Applies the edit to the region.
let need_compaction = region.version_control.apply_edit(
edit_result.result.is_ok() && !edit_result.edit.files_to_add.is_empty(); Some(edit_result.edit),
// Only apply the edit if the result is ok and region is not in staging state. &[],
if edit_result.result.is_ok() { region.file_purger.clone(),
// Applies the edit to the region. );
region.version_control.apply_edit( }
Some(edit_result.edit),
&[],
region.file_purger.clone(),
);
}
if edit_result.update_region_state {
region.switch_state_to_writable(RegionLeaderState::Editing);
}
need_compaction if edit_result.update_region_state {
}; // Sets the region as writable.
region.switch_state_to_writable(RegionLeaderState::Editing);
}
let _ = edit_result.sender.send(edit_result.result); let _ = edit_result.sender.send(edit_result.result);
if let Some(edit_queue) = self.region_edit_queues.get_mut(&edit_result.region_id) if let Some(edit_queue) = self.region_edit_queues.get_mut(&edit_result.region_id)
&& let Some(request) = edit_queue.dequeue() && let Some(request) = edit_queue.dequeue()
{ {
self.handle_region_edit(request); self.handle_region_edit(request).await;
} }
if need_compaction { if need_compaction {
@@ -482,9 +463,9 @@ async fn edit_region(
edit: RegionEdit, edit: RegionEdit,
cache_manager: CacheManagerRef, cache_manager: CacheManagerRef,
listener: WorkerListener, listener: WorkerListener,
is_staging: bool,
) -> Result<()> { ) -> Result<()> {
let region_id = region.region_id; let region_id = region.region_id;
let is_staging = region.is_staging();
if let Some(write_cache) = cache_manager.write_cache() { if let Some(write_cache) = cache_manager.write_cache() {
for file_meta in &edit.files_to_add { for file_meta in &edit.files_to_add {
let write_cache = write_cache.clone(); let write_cache = write_cache.clone();
@@ -549,10 +530,7 @@ async fn edit_region(
} }
} }
info!( info!("Applying {edit:?} to region {}", region_id);
"Applying {edit:?} to region {}, is_staging: {}",
region_id, is_staging
);
let action_list = RegionMetaActionList::with_action(RegionMetaAction::Edit(edit)); let action_list = RegionMetaActionList::with_action(RegionMetaAction::Edit(edit));
region region

View File

@@ -68,7 +68,6 @@ use sql::statements::{
sql_column_def_to_grpc_column_def, sql_data_type_to_concrete_data_type, value_to_sql_value, sql_column_def_to_grpc_column_def, sql_data_type_to_concrete_data_type, value_to_sql_value,
}; };
use sql::util::extract_tables_from_query; use sql::util::extract_tables_from_query;
use store_api::mito_engine_options::{COMPACTION_OVERRIDE, COMPACTION_TYPE};
use table::requests::{FILE_TABLE_META_KEY, TableOptions}; use table::requests::{FILE_TABLE_META_KEY, TableOptions};
use table::table_reference::TableReference; use table::table_reference::TableReference;
#[cfg(feature = "enterprise")] #[cfg(feature = "enterprise")]
@@ -217,11 +216,6 @@ pub fn create_to_expr(
.context(UnrecognizedTableOptionSnafu)?, .context(UnrecognizedTableOptionSnafu)?,
); );
let mut table_options = table_options;
if table_options.contains_key(COMPACTION_TYPE) {
table_options.insert(COMPACTION_OVERRIDE.to_string(), "true".to_string());
}
let primary_keys = find_primary_keys(&create.columns, &create.constraints)?; let primary_keys = find_primary_keys(&create.columns, &create.constraints)?;
let expr = CreateTableExpr { let expr = CreateTableExpr {

View File

@@ -117,13 +117,10 @@ impl StatementExecutor {
.map(|v| v.into_inner()); .map(|v| v.into_inner());
let create_expr = &mut expr_helper::create_to_expr(&stmt, &ctx)?; let create_expr = &mut expr_helper::create_to_expr(&stmt, &ctx)?;
// Don't inherit schema-level TTL/compaction options into table options: // We don't put ttl into the table options
// TTL is applied during compaction, and `compaction.*` is handled separately. // Because it will be used directly while compaction.
if let Some(schema_options) = schema_options { if let Some(schema_options) = schema_options {
for (key, value) in schema_options.extra_options.iter() { for (key, value) in schema_options.extra_options.iter() {
if key.starts_with("compaction.") {
continue;
}
create_expr create_expr
.table_options .table_options
.entry(key.clone()) .entry(key.clone())

View File

@@ -303,43 +303,4 @@ mod tests {
let subtasks = create_subtasks(&from, &to).unwrap(); let subtasks = create_subtasks(&from, &to).unwrap();
assert!(subtasks.is_empty()); assert!(subtasks.is_empty());
} }
#[test]
fn test_three_components() {
// Left: A:[0,10), B:[20,30), C:[40,50)
let from = vec![
col("x")
.gt_eq(Value::Int64(0))
.and(col("x").lt(Value::Int64(10))),
col("x")
.gt_eq(Value::Int64(20))
.and(col("x").lt(Value::Int64(30))),
col("x")
.gt_eq(Value::Int64(40))
.and(col("x").lt(Value::Int64(50))),
];
// Right: A:[0,10), B:[20,30), C:[40,60)
let to = vec![
col("x")
.gt_eq(Value::Int64(0))
.and(col("x").lt(Value::Int64(10))),
col("x")
.gt_eq(Value::Int64(20))
.and(col("x").lt(Value::Int64(30))),
col("x")
.gt_eq(Value::Int64(40))
.and(col("x").lt(Value::Int64(60))),
];
let subtasks = create_subtasks(&from, &to).unwrap();
assert_eq!(subtasks.len(), 3);
assert_eq!(subtasks[0].from_expr_indices, vec![0]);
assert_eq!(subtasks[0].to_expr_indices, vec![0]);
assert_eq!(subtasks[0].transition_map, vec![vec![0]]);
assert_eq!(subtasks[1].from_expr_indices, vec![1]);
assert_eq!(subtasks[1].to_expr_indices, vec![1]);
assert_eq!(subtasks[1].transition_map, vec![vec![1]]);
assert_eq!(subtasks[2].from_expr_indices, vec![2]);
assert_eq!(subtasks[2].to_expr_indices, vec![2]);
assert_eq!(subtasks[2].transition_map, vec![vec![2]]);
}
} }

View File

@@ -41,6 +41,8 @@ use snafu::{Location, ResultExt};
use crate::error::{CatalogSnafu, Result}; use crate::error::{CatalogSnafu, Result};
use crate::query_engine::{DefaultPlanDecoder, QueryEngineState}; use crate::query_engine::{DefaultPlanDecoder, QueryEngineState};
mod function_alias;
pub struct DfContextProviderAdapter { pub struct DfContextProviderAdapter {
engine_state: Arc<QueryEngineState>, engine_state: Arc<QueryEngineState>,
session_state: SessionState, session_state: SessionState,
@@ -147,7 +149,17 @@ impl ContextProvider for DfContextProviderAdapter {
fn get_function_meta(&self, name: &str) -> Option<Arc<ScalarUDF>> { fn get_function_meta(&self, name: &str) -> Option<Arc<ScalarUDF>> {
self.engine_state.scalar_function(name).map_or_else( self.engine_state.scalar_function(name).map_or_else(
|| self.session_state.scalar_functions().get(name).cloned(), || {
self.session_state
.scalar_functions()
.get(name)
.cloned()
.or_else(|| {
function_alias::resolve_scalar(name).and_then(|name| {
self.session_state.scalar_functions().get(name).cloned()
})
})
},
|func| { |func| {
Some(Arc::new(func.provide(FunctionContext { Some(Arc::new(func.provide(FunctionContext {
query_ctx: self.query_ctx.clone(), query_ctx: self.query_ctx.clone(),
@@ -159,7 +171,17 @@ impl ContextProvider for DfContextProviderAdapter {
fn get_aggregate_meta(&self, name: &str) -> Option<Arc<AggregateUDF>> { fn get_aggregate_meta(&self, name: &str) -> Option<Arc<AggregateUDF>> {
self.engine_state.aggr_function(name).map_or_else( self.engine_state.aggr_function(name).map_or_else(
|| self.session_state.aggregate_functions().get(name).cloned(), || {
self.session_state
.aggregate_functions()
.get(name)
.cloned()
.or_else(|| {
function_alias::resolve_aggregate(name).and_then(|name| {
self.session_state.aggregate_functions().get(name).cloned()
})
})
},
|func| Some(Arc::new(func)), |func| Some(Arc::new(func)),
) )
} }
@@ -193,12 +215,14 @@ impl ContextProvider for DfContextProviderAdapter {
fn udf_names(&self) -> Vec<String> { fn udf_names(&self) -> Vec<String> {
let mut names = self.engine_state.scalar_names(); let mut names = self.engine_state.scalar_names();
names.extend(self.session_state.scalar_functions().keys().cloned()); names.extend(self.session_state.scalar_functions().keys().cloned());
names.extend(function_alias::scalar_alias_names().map(|name| name.to_string()));
names names
} }
fn udaf_names(&self) -> Vec<String> { fn udaf_names(&self) -> Vec<String> {
let mut names = self.engine_state.aggr_names(); let mut names = self.engine_state.aggr_names();
names.extend(self.session_state.aggregate_functions().keys().cloned()); names.extend(self.session_state.aggregate_functions().keys().cloned());
names.extend(function_alias::aggregate_alias_names().map(|name| name.to_string()));
names names
} }
@@ -233,9 +257,14 @@ impl ContextProvider for DfContextProviderAdapter {
.table_functions() .table_functions()
.get(name) .get(name)
.cloned() .cloned()
.ok_or_else(|| { .or_else(|| {
DataFusionError::Plan(format!("table function '{name}' not found")) function_alias::resolve_scalar(name)
})?; .and_then(|alias| self.session_state.table_functions().get(alias).cloned())
});
let tbl_func = tbl_func.ok_or_else(|| {
DataFusionError::Plan(format!("table function '{name}' not found"))
})?;
let provider = tbl_func.create_table_provider(&args)?; let provider = tbl_func.create_table_provider(&args)?;
Ok(provider_as_source(provider)) Ok(provider_as_source(provider))

View File

@@ -0,0 +1,86 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use std::collections::HashMap;
use once_cell::sync::Lazy;
const SCALAR_ALIASES: &[(&str, &str)] = &[
// SQL compat aliases.
("ucase", "upper"),
("lcase", "lower"),
("ceiling", "ceil"),
("mid", "substr"),
// MySQL's RAND([seed]) accepts an optional seed argument, while DataFusion's `random()`
// does not. We alias the name for `rand()` compatibility, and `rand(seed)` will error
// due to mismatched arity.
("rand", "random"),
];
const AGGREGATE_ALIASES: &[(&str, &str)] = &[
// MySQL compat aliases that don't override existing DataFusion aggregate names.
//
// NOTE: We intentionally do NOT alias `stddev` here, because DataFusion defines `stddev`
// as sample standard deviation while MySQL's `STDDEV` is population standard deviation.
("std", "stddev_pop"),
("variance", "var_pop"),
];
static SCALAR_FUNCTION_ALIAS: Lazy<HashMap<&'static str, &'static str>> =
Lazy::new(|| SCALAR_ALIASES.iter().copied().collect());
static AGGREGATE_FUNCTION_ALIAS: Lazy<HashMap<&'static str, &'static str>> =
Lazy::new(|| AGGREGATE_ALIASES.iter().copied().collect());
pub fn resolve_scalar(name: &str) -> Option<&'static str> {
let name = name.to_ascii_lowercase();
SCALAR_FUNCTION_ALIAS.get(name.as_str()).copied()
}
pub fn resolve_aggregate(name: &str) -> Option<&'static str> {
let name = name.to_ascii_lowercase();
AGGREGATE_FUNCTION_ALIAS.get(name.as_str()).copied()
}
pub fn scalar_alias_names() -> impl Iterator<Item = &'static str> {
SCALAR_ALIASES.iter().map(|(name, _)| *name)
}
pub fn aggregate_alias_names() -> impl Iterator<Item = &'static str> {
AGGREGATE_ALIASES.iter().map(|(name, _)| *name)
}
#[cfg(test)]
mod tests {
use super::{resolve_aggregate, resolve_scalar};
#[test]
fn resolves_scalar_aliases_case_insensitive() {
assert_eq!(resolve_scalar("ucase"), Some("upper"));
assert_eq!(resolve_scalar("UCASE"), Some("upper"));
assert_eq!(resolve_scalar("lcase"), Some("lower"));
assert_eq!(resolve_scalar("ceiling"), Some("ceil"));
assert_eq!(resolve_scalar("MID"), Some("substr"));
assert_eq!(resolve_scalar("RAND"), Some("random"));
assert_eq!(resolve_scalar("not_a_real_alias"), None);
}
#[test]
fn resolves_aggregate_aliases_case_insensitive() {
assert_eq!(resolve_aggregate("std"), Some("stddev_pop"));
assert_eq!(resolve_aggregate("variance"), Some("var_pop"));
assert_eq!(resolve_aggregate("STDDEV"), None);
assert_eq!(resolve_aggregate("not_a_real_alias"), None);
}
}
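
A short usage sketch for the new module, assuming it is in scope as `function_alias` exactly as defined above; the `canonical_*` helpers are illustrative and not part of the patch:

```rust
// Assumes the `function_alias` module shown above is in scope.
fn canonical_scalar_name(name: &str) -> &str {
    match function_alias::resolve_scalar(name) {
        // The &'static str target coerces to the caller's shorter lifetime.
        Some(target) => target,
        // Fall back to the name as written when no alias applies,
        // mirroring how the planner only rewrites names that hit the table.
        None => name,
    }
}

fn canonical_aggregate_name(name: &str) -> &str {
    match function_alias::resolve_aggregate(name) {
        Some(target) => target,
        None => name,
    }
}
```

Note that, per the comment in the alias table, the `rand` alias only covers MySQL's zero-argument form; `rand(seed)` still fails on arity.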

View File

@@ -28,9 +28,9 @@ use store_api::metadata::{
ColumnMetadata, RegionMetadata, RegionMetadataBuilder, RegionMetadataRef, ColumnMetadata, RegionMetadata, RegionMetadataBuilder, RegionMetadataRef,
}; };
use store_api::region_engine::{ use store_api::region_engine::{
RegionEngine, RegionRole, RegionScannerRef, RegionStatistic, RemapManifestsRequest, CopyRegionFromRequest, CopyRegionFromResponse, RegionEngine, RegionManifestInfo, RegionRole,
RemapManifestsResponse, SetRegionRoleStateResponse, SettableRegionRoleState, RegionScannerRef, RegionStatistic, RemapManifestsRequest, RemapManifestsResponse,
SyncRegionFromRequest, SyncRegionFromResponse, SetRegionRoleStateResponse, SettableRegionRoleState, SyncManifestResponse,
}; };
use store_api::region_request::RegionRequest; use store_api::region_request::RegionRequest;
use store_api::storage::{ConcreteDataType, RegionId, ScanRequest, SequenceNumber}; use store_api::storage::{ConcreteDataType, RegionId, ScanRequest, SequenceNumber};
@@ -113,8 +113,8 @@ impl RegionEngine for MetaRegionEngine {
async fn sync_region( async fn sync_region(
&self, &self,
_region_id: RegionId, _region_id: RegionId,
_request: SyncRegionFromRequest, _manifest_info: RegionManifestInfo,
) -> Result<SyncRegionFromResponse, BoxedError> { ) -> Result<SyncManifestResponse, BoxedError> {
unimplemented!() unimplemented!()
} }
@@ -125,6 +125,14 @@ impl RegionEngine for MetaRegionEngine {
unimplemented!() unimplemented!()
} }
async fn copy_region_from(
&self,
_region_id: RegionId,
_request: CopyRegionFromRequest,
) -> Result<CopyRegionFromResponse, BoxedError> {
unimplemented!()
}
fn role(&self, _region_id: RegionId) -> Option<RegionRole> { fn role(&self, _region_id: RegionId) -> Option<RegionRole> {
None None
} }

View File

@@ -209,7 +209,6 @@ impl QueryEngineState {
.build(); .build();
let df_context = SessionContext::new_with_state(session_state); let df_context = SessionContext::new_with_state(session_state);
register_function_aliases(&df_context);
Self { Self {
df_context, df_context,
@@ -416,41 +415,6 @@ impl QueryPlanner for DfQueryPlanner {
} }
} }
/// MySQL-compatible scalar function aliases: (target_name, alias)
const SCALAR_FUNCTION_ALIASES: &[(&str, &str)] = &[
("upper", "ucase"),
("lower", "lcase"),
("ceil", "ceiling"),
("substr", "mid"),
("random", "rand"),
];
/// MySQL-compatible aggregate function aliases: (target_name, alias)
const AGGREGATE_FUNCTION_ALIASES: &[(&str, &str)] =
&[("stddev_pop", "std"), ("var_pop", "variance")];
/// Register function aliases.
///
/// This function adds aliases like `ucase` -> `upper`, `lcase` -> `lower`, etc.
/// to make GreptimeDB more compatible with MySQL syntax.
fn register_function_aliases(ctx: &SessionContext) {
let state = ctx.state();
for (target, alias) in SCALAR_FUNCTION_ALIASES {
if let Some(func) = state.scalar_functions().get(*target) {
let aliased = func.as_ref().clone().with_aliases([*alias]);
ctx.register_udf(aliased);
}
}
for (target, alias) in AGGREGATE_FUNCTION_ALIASES {
if let Some(func) = state.aggregate_functions().get(*target) {
let aliased = func.as_ref().clone().with_aliases([*alias]);
ctx.register_udaf(aliased);
}
}
}
impl DfQueryPlanner { impl DfQueryPlanner {
fn new( fn new(
catalog_manager: CatalogManagerRef, catalog_manager: CatalogManagerRef,

View File

@@ -56,13 +56,11 @@ fn create_sql_options(table_meta: &TableMeta, schema_options: Option<SchemaOptio
if let Some(ttl) = table_opts.ttl.map(|t| t.to_string()) { if let Some(ttl) = table_opts.ttl.map(|t| t.to_string()) {
options.insert(TTL_KEY.to_string(), ttl); options.insert(TTL_KEY.to_string(), ttl);
} else if let Some(database_ttl) = schema_options } else if let Some(database_ttl) = schema_options
.as_ref()
.and_then(|o| o.ttl) .and_then(|o| o.ttl)
.map(|ttl| ttl.to_string()) .map(|ttl| ttl.to_string())
{ {
options.insert(TTL_KEY.to_string(), database_ttl); options.insert(TTL_KEY.to_string(), database_ttl);
}; };
for (k, v) in table_opts for (k, v) in table_opts
.extra_options .extra_options
.iter() .iter()

View File

@@ -1 +1 @@
v0.11.11 v0.11.9

View File

@@ -27,8 +27,6 @@ pub const TTL_KEY: &str = "ttl";
pub const SNAPSHOT_READ: &str = "snapshot_read"; pub const SNAPSHOT_READ: &str = "snapshot_read";
/// Option key for compaction type. /// Option key for compaction type.
pub const COMPACTION_TYPE: &str = "compaction.type"; pub const COMPACTION_TYPE: &str = "compaction.type";
/// Option key for forcing compaction options override.
pub const COMPACTION_OVERRIDE: &str = "compaction.override";
/// TWCS compaction strategy. /// TWCS compaction strategy.
pub const COMPACTION_TYPE_TWCS: &str = "twcs"; pub const COMPACTION_TYPE_TWCS: &str = "twcs";
/// Option key for twcs min file num to trigger a compaction. /// Option key for twcs min file num to trigger a compaction.
@@ -63,7 +61,6 @@ pub fn is_mito_engine_option_key(key: &str) -> bool {
[ [
"ttl", "ttl",
COMPACTION_TYPE, COMPACTION_TYPE,
COMPACTION_OVERRIDE,
TWCS_TRIGGER_FILE_NUM, TWCS_TRIGGER_FILE_NUM,
TWCS_MAX_OUTPUT_FILE_SIZE, TWCS_MAX_OUTPUT_FILE_SIZE,
TWCS_TIME_WINDOW, TWCS_TIME_WINDOW,
@@ -93,7 +90,6 @@ mod tests {
fn test_is_mito_engine_option_key() { fn test_is_mito_engine_option_key() {
assert!(is_mito_engine_option_key("ttl")); assert!(is_mito_engine_option_key("ttl"));
assert!(is_mito_engine_option_key("compaction.type")); assert!(is_mito_engine_option_key("compaction.type"));
assert!(is_mito_engine_option_key("compaction.override"));
assert!(is_mito_engine_option_key( assert!(is_mito_engine_option_key(
"compaction.twcs.trigger_file_num" "compaction.twcs.trigger_file_num"
)); ));

View File

@@ -637,64 +637,9 @@ impl RegionStatistic {
} }
} }
/// Request to sync the region from a manifest or a region. /// The response of syncing the manifest.
#[derive(Debug, Clone)]
pub enum SyncRegionFromRequest {
/// Syncs the region using manifest information.
/// Used in leader-follower manifest sync scenarios.
FromManifest(RegionManifestInfo),
/// Syncs the region from another region.
///
/// Used by the metric engine to sync logical regions from a source physical region
/// to a target physical region. This copies metadata region SST files and transforms
/// logical region entries to use the target's region number.
FromRegion {
/// The [`RegionId`] of the source region.
source_region_id: RegionId,
/// The parallelism of the sync operation.
parallelism: usize,
},
}
impl From<RegionManifestInfo> for SyncRegionFromRequest {
fn from(manifest_info: RegionManifestInfo) -> Self {
SyncRegionFromRequest::FromManifest(manifest_info)
}
}
impl SyncRegionFromRequest {
/// Creates a new request from a manifest info.
pub fn from_manifest(manifest_info: RegionManifestInfo) -> Self {
SyncRegionFromRequest::FromManifest(manifest_info)
}
/// Creates a new request from a region.
pub fn from_region(source_region_id: RegionId, parallelism: usize) -> Self {
SyncRegionFromRequest::FromRegion {
source_region_id,
parallelism,
}
}
/// Returns true if the request is from a manifest.
pub fn is_from_manifest(&self) -> bool {
matches!(self, SyncRegionFromRequest::FromManifest { .. })
}
/// Converts the request to a region manifest info.
///
/// Returns None if the request is not from a manifest.
pub fn into_region_manifest_info(self) -> Option<RegionManifestInfo> {
match self {
SyncRegionFromRequest::FromManifest(manifest_info) => Some(manifest_info),
SyncRegionFromRequest::FromRegion { .. } => None,
}
}
}
/// The response of syncing the region.
#[derive(Debug)] #[derive(Debug)]
pub enum SyncRegionFromResponse { pub enum SyncManifestResponse {
NotSupported, NotSupported,
Mito { Mito {
/// Indicates if the data region was synced. /// Indicates if the data region was synced.
@@ -711,30 +656,35 @@ pub enum SyncRegionFromResponse {
}, },
} }
impl SyncRegionFromResponse { impl SyncManifestResponse {
/// Returns true if data region is synced. /// Returns true if data region is synced.
pub fn is_data_synced(&self) -> bool { pub fn is_data_synced(&self) -> bool {
match self { match self {
SyncRegionFromResponse::NotSupported => false, SyncManifestResponse::NotSupported => false,
SyncRegionFromResponse::Mito { synced } => *synced, SyncManifestResponse::Mito { synced } => *synced,
SyncRegionFromResponse::Metric { data_synced, .. } => *data_synced, SyncManifestResponse::Metric { data_synced, .. } => *data_synced,
} }
} }
/// Returns true if the engine supports the sync operation.
pub fn is_supported(&self) -> bool {
!matches!(self, SyncManifestResponse::NotSupported)
}
/// Returns true if the engine is a mito2 engine. /// Returns true if the engine is a mito2 engine.
pub fn is_mito(&self) -> bool { pub fn is_mito(&self) -> bool {
matches!(self, SyncRegionFromResponse::Mito { .. }) matches!(self, SyncManifestResponse::Mito { .. })
} }
/// Returns true if the engine is a metric engine. /// Returns true if the engine is a metric engine.
pub fn is_metric(&self) -> bool { pub fn is_metric(&self) -> bool {
matches!(self, SyncRegionFromResponse::Metric { .. }) matches!(self, SyncManifestResponse::Metric { .. })
} }
/// Returns the new opened logical region ids. /// Returns the new opened logical region ids.
pub fn new_opened_logical_region_ids(self) -> Option<Vec<RegionId>> { pub fn new_opened_logical_region_ids(self) -> Option<Vec<RegionId>> {
match self { match self {
SyncRegionFromResponse::Metric { SyncManifestResponse::Metric {
new_opened_logical_region_ids, new_opened_logical_region_ids,
.. ..
} => Some(new_opened_logical_region_ids), } => Some(new_opened_logical_region_ids),
@@ -765,7 +715,7 @@ pub struct RemapManifestsResponse {
/// Request to copy files from a source region to a target region. /// Request to copy files from a source region to a target region.
#[derive(Debug, Clone)] #[derive(Debug, Clone)]
pub struct MitoCopyRegionFromRequest { pub struct CopyRegionFromRequest {
/// The [`RegionId`] of the source region. /// The [`RegionId`] of the source region.
pub source_region_id: RegionId, pub source_region_id: RegionId,
/// The parallelism of the copy operation. /// The parallelism of the copy operation.
@@ -778,6 +728,37 @@ pub struct MitoCopyRegionFromResponse {
pub copied_file_ids: Vec<FileId>, pub copied_file_ids: Vec<FileId>,
} }
#[derive(Debug, Clone)]
pub struct MetricCopyRegionFromResponse {
/// The logical regions that were newly opened after the copy operation.
pub new_opened_logical_region_ids: Vec<RegionId>,
}
/// Response to copy region from a source region to a target region.
#[derive(Debug, Clone)]
pub enum CopyRegionFromResponse {
Mito(MitoCopyRegionFromResponse),
Metric(MetricCopyRegionFromResponse),
}
impl CopyRegionFromResponse {
/// Converts the response to a mito2 response.
pub fn into_mito(self) -> Option<MitoCopyRegionFromResponse> {
match self {
CopyRegionFromResponse::Mito(response) => Some(response),
CopyRegionFromResponse::Metric(_) => None,
}
}
/// Converts the response to a metric response.
pub fn into_metric(self) -> Option<MetricCopyRegionFromResponse> {
match self {
CopyRegionFromResponse::Metric(response) => Some(response),
CopyRegionFromResponse::Mito(_) => None,
}
}
}
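
A hedged sketch of how a caller might consume the new response enum; the `summarize` helper is illustrative only and not taken from the patch:

```rust
// Illustrative only: matches on the CopyRegionFromResponse defined above.
fn summarize(response: CopyRegionFromResponse) -> String {
    match response {
        CopyRegionFromResponse::Mito(r) => {
            format!("mito copied {} files", r.copied_file_ids.len())
        }
        CopyRegionFromResponse::Metric(r) => format!(
            "metric opened {} logical regions",
            r.new_opened_logical_region_ids.len()
        ),
    }
}
```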
#[async_trait] #[async_trait]
pub trait RegionEngine: Send + Sync { pub trait RegionEngine: Send + Sync {
/// Name of this engine /// Name of this engine
@@ -899,8 +880,8 @@ pub trait RegionEngine: Send + Sync {
async fn sync_region( async fn sync_region(
&self, &self,
region_id: RegionId, region_id: RegionId,
request: SyncRegionFromRequest, manifest_info: RegionManifestInfo,
) -> Result<SyncRegionFromResponse, BoxedError>; ) -> Result<SyncManifestResponse, BoxedError>;
/// Remaps manifests from old regions to new regions. /// Remaps manifests from old regions to new regions.
async fn remap_manifests( async fn remap_manifests(
@@ -908,6 +889,13 @@ pub trait RegionEngine: Send + Sync {
request: RemapManifestsRequest, request: RemapManifestsRequest,
) -> Result<RemapManifestsResponse, BoxedError>; ) -> Result<RemapManifestsResponse, BoxedError>;
/// Copies region from a source region to a target region.
async fn copy_region_from(
&self,
region_id: RegionId,
request: CopyRegionFromRequest,
) -> Result<CopyRegionFromResponse, BoxedError>;
/// Sets region role state gracefully. /// Sets region role state gracefully.
/// ///
/// After the call returns, the engine ensures no more write operations will succeed in the region. /// After the call returns, the engine ensures no more write operations will succeed in the region.

View File

@@ -152,7 +152,6 @@ pub enum RegionRequest {
Catchup(RegionCatchupRequest), Catchup(RegionCatchupRequest),
BulkInserts(RegionBulkInsertsRequest), BulkInserts(RegionBulkInsertsRequest),
EnterStaging(EnterStagingRequest), EnterStaging(EnterStagingRequest),
ApplyStagingManifest(ApplyStagingManifestRequest),
} }
impl RegionRequest { impl RegionRequest {
@@ -183,9 +182,6 @@ impl RegionRequest {
reason: "ListMetadata request should be handled separately by RegionServer", reason: "ListMetadata request should be handled separately by RegionServer",
} }
.fail(), .fail(),
region_request::Body::ApplyStagingManifest(apply) => {
make_region_apply_staging_manifest(apply)
}
} }
} }
@@ -417,28 +413,6 @@ fn make_region_bulk_inserts(request: BulkInsertRequest) -> Result<Vec<(RegionId,
)]) )])
} }
fn make_region_apply_staging_manifest(
api::v1::region::ApplyStagingManifestRequest {
region_id,
partition_expr,
files_to_add,
}: api::v1::region::ApplyStagingManifestRequest,
) -> Result<Vec<(RegionId, RegionRequest)>> {
let region_id = region_id.into();
let files_to_add = files_to_add
.context(UnexpectedSnafu {
reason: "'files_to_add' field is missing",
})?
.data;
Ok(vec![(
region_id,
RegionRequest::ApplyStagingManifest(ApplyStagingManifestRequest {
partition_expr,
files_to_add,
}),
)])
}
/// Request to put data into a region. /// Request to put data into a region.
#[derive(Debug)] #[derive(Debug)]
pub struct RegionPutRequest { pub struct RegionPutRequest {
@@ -1454,30 +1428,6 @@ pub struct EnterStagingRequest {
pub partition_expr: String, pub partition_expr: String,
} }
/// This request is used as part of the region repartition.
///
/// After a region has entered staging mode with a new region rule (partition
/// expression) and a separate process (for example, `remap_manifests`) has
/// generated the new file assignments for the staging region, this request
/// applies that generated manifest to the region.
///
/// In practice, this means:
/// - The `partition_expr` identifies the staging region rule that the manifest
/// was generated for.
/// - `files_to_add` carries the serialized metadata (such as file manifests or
/// file lists) that should be attached to the region under the new rule.
///
/// It should typically be called **after** the staging region has been
/// initialized by [`EnterStagingRequest`] and the new file layout has been
/// computed, to finalize the repartition operation.
#[derive(Debug, Clone)]
pub struct ApplyStagingManifestRequest {
/// The partition expression of the staging region.
pub partition_expr: String,
/// The files to add to the region.
pub files_to_add: Vec<u8>,
}
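
For context on the flow the removed doc comment describes, a rough and purely illustrative sketch of the final step under the old API; the `finalize_repartition` function and its inputs are assumptions, only the request types come from the removed code:

```rust
// Purely illustrative: packages a remapped manifest for the staging region,
// assuming EnterStaging and remap_manifests have already run.
fn finalize_repartition(partition_expr: String, remapped_manifest: Vec<u8>) -> RegionRequest {
    RegionRequest::ApplyStagingManifest(ApplyStagingManifestRequest {
        partition_expr,
        files_to_add: remapped_manifest,
    })
}
```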
impl fmt::Display for RegionRequest { impl fmt::Display for RegionRequest {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
match self { match self {
@@ -1495,7 +1445,6 @@ impl fmt::Display for RegionRequest {
RegionRequest::Catchup(_) => write!(f, "Catchup"), RegionRequest::Catchup(_) => write!(f, "Catchup"),
RegionRequest::BulkInserts(_) => write!(f, "BulkInserts"), RegionRequest::BulkInserts(_) => write!(f, "BulkInserts"),
RegionRequest::EnterStaging(_) => write!(f, "EnterStaging"), RegionRequest::EnterStaging(_) => write!(f, "EnterStaging"),
RegionRequest::ApplyStagingManifest(_) => write!(f, "ApplyStagingManifest"),
} }
} }
} }

View File

@@ -56,6 +56,32 @@ async fn query_data(frontend: &Arc<Instance>) -> io::Result<()> {
))?; ))?;
execute_sql_and_expect(frontend, sql, &expected).await; execute_sql_and_expect(frontend, sql, &expected).await;
// query 1:
let sql = "SELECT json_get_string(data, '$.commit.collection') AS event, count() AS count FROM bluesky GROUP BY event ORDER BY count DESC";
let expected = r#"
+-----------------------+-------+
| event | count |
+-----------------------+-------+
| app.bsky.feed.post | 3 |
| app.bsky.feed.like | 3 |
| app.bsky.graph.follow | 3 |
| app.bsky.feed.repost | 1 |
+-----------------------+-------+"#;
execute_sql_and_expect(frontend, sql, expected).await;
// query 2:
let sql = "SELECT json_get_string(data, '$.commit.collection') AS event, count() AS count, count(DISTINCT json_get_string(data, '$.did')) AS users FROM bluesky WHERE (json_get_string(data, '$.kind') = 'commit') AND (json_get_string(data, '$.commit.operation') = 'create') GROUP BY event ORDER BY count DESC";
let expected = r#"
+-----------------------+-------+-------+
| event | count | users |
+-----------------------+-------+-------+
| app.bsky.feed.post | 3 | 3 |
| app.bsky.feed.like | 3 | 3 |
| app.bsky.graph.follow | 3 | 3 |
| app.bsky.feed.repost | 1 | 1 |
+-----------------------+-------+-------+"#;
execute_sql_and_expect(frontend, sql, expected).await;
Ok(()) Ok(())
} }

View File

@@ -110,25 +110,27 @@ Affected Rows: 0
SHOW CREATE TABLE test1; SHOW CREATE TABLE test1;
+-------+---------------------------------------+ +-------+-----------------------------------------+
| Table | Create Table | | Table | Create Table |
+-------+---------------------------------------+ +-------+-----------------------------------------+
| test1 | CREATE TABLE IF NOT EXISTS "test1" ( | | test1 | CREATE TABLE IF NOT EXISTS "test1" ( |
| | "host" STRING NULL, | | | "host" STRING NULL, |
| | "cpu" DOUBLE NULL, | | | "cpu" DOUBLE NULL, |
| | "ts" TIMESTAMP(3) NOT NULL, | | | "ts" TIMESTAMP(3) NOT NULL, |
| | TIME INDEX ("ts") | | | TIME INDEX ("ts") |
| | ) | | | ) |
| | | | | |
| | ENGINE=mito | | | ENGINE=mito |
| | WITH( | | | WITH( |
| | append_mode = 'false', | | | append_mode = 'false', |
| | 'memtable.type' = 'partition_tree', | | | 'compaction.twcs.time_window' = '1h', |
| | merge_mode = 'last_non_null', | | | 'compaction.type' = 'twcs', |
| | skip_wal = 'true', | | | 'memtable.type' = 'partition_tree', |
| | ttl = '1h' | | | merge_mode = 'last_non_null', |
| | ) | | | skip_wal = 'true', |
+-------+---------------------------------------+ | | ttl = '1h' |
| | ) |
+-------+-----------------------------------------+
CREATE TABLE test2(host STRING, cpu DOUBLE, ts TIMESTAMP TIME INDEX) WITH ( CREATE TABLE test2(host STRING, cpu DOUBLE, ts TIMESTAMP TIME INDEX) WITH (
'append_mode'='true', 'append_mode'='true',
@@ -139,25 +141,27 @@ Affected Rows: 0
SHOW CREATE TABLE test2; SHOW CREATE TABLE test2;
+-------+---------------------------------------+ +-------+-----------------------------------------+
| Table | Create Table | | Table | Create Table |
+-------+---------------------------------------+ +-------+-----------------------------------------+
| test2 | CREATE TABLE IF NOT EXISTS "test2" ( | | test2 | CREATE TABLE IF NOT EXISTS "test2" ( |
| | "host" STRING NULL, | | | "host" STRING NULL, |
| | "cpu" DOUBLE NULL, | | | "cpu" DOUBLE NULL, |
| | "ts" TIMESTAMP(3) NOT NULL, | | | "ts" TIMESTAMP(3) NOT NULL, |
| | TIME INDEX ("ts") | | | TIME INDEX ("ts") |
| | ) | | | ) |
| | | | | |
| | ENGINE=mito | | | ENGINE=mito |
| | WITH( | | | WITH( |
| | append_mode = 'true', | | | append_mode = 'true', |
| | 'memtable.type' = 'partition_tree', | | | 'compaction.twcs.time_window' = '1h', |
| | merge_mode = '', | | | 'compaction.type' = 'twcs', |
| | skip_wal = 'false', | | | 'memtable.type' = 'partition_tree', |
| | ttl = '1h' | | | merge_mode = '', |
| | ) | | | skip_wal = 'false', |
+-------+---------------------------------------+ | | ttl = '1h' |
| | ) |
+-------+-----------------------------------------+
INSERT INTO test2 VALUES('host1', 1.0, '2023-10-01 00:00:00'); INSERT INTO test2 VALUES('host1', 1.0, '2023-10-01 00:00:00');
@@ -179,166 +183,6 @@ DROP DATABASE mydb;
Affected Rows: 0 Affected Rows: 0
--- test compaction options----
CREATE DATABASE test_compaction_opt;
Affected Rows: 1
USE test_compaction_opt;
Affected Rows: 0
SHOW CREATE DATABASE test_compaction_opt;
+---------------------+---------------------------------------------------+
| Database | Create Database |
+---------------------+---------------------------------------------------+
| test_compaction_opt | CREATE DATABASE IF NOT EXISTS test_compaction_opt |
+---------------------+---------------------------------------------------+
CREATE TABLE test_table(ts TIMESTAMP TIME INDEX, val INT);
Affected Rows: 0
SHOW CREATE TABLE test_table;
+------------+-------------------------------------------+
| Table | Create Table |
+------------+-------------------------------------------+
| test_table | CREATE TABLE IF NOT EXISTS "test_table" ( |
| | "ts" TIMESTAMP(3) NOT NULL, |
| | "val" INT NULL, |
| | TIME INDEX ("ts") |
| | ) |
| | |
| | ENGINE=mito |
| | |
+------------+-------------------------------------------+
ALTER DATABASE test_compaction_opt SET 'compaction.type' = 'twcs';
Affected Rows: 0
ALTER DATABASE test_compaction_opt SET 'compaction.twcs.time_window' = '2h';
Affected Rows: 0
SHOW CREATE DATABASE test_compaction_opt;
+---------------------+---------------------------------------------------+
| Database | Create Database |
+---------------------+---------------------------------------------------+
| test_compaction_opt | CREATE DATABASE IF NOT EXISTS test_compaction_opt |
| | WITH( |
| | 'compaction.twcs.time_window' = '2h', |
| | 'compaction.type' = 'twcs' |
| | ) |
+---------------------+---------------------------------------------------+
SHOW CREATE TABLE test_table;
+------------+-------------------------------------------+
| Table | Create Table |
+------------+-------------------------------------------+
| test_table | CREATE TABLE IF NOT EXISTS "test_table" ( |
| | "ts" TIMESTAMP(3) NOT NULL, |
| | "val" INT NULL, |
| | TIME INDEX ("ts") |
| | ) |
| | |
| | ENGINE=mito |
| | |
+------------+-------------------------------------------+
CREATE TABLE test_table2(ts TIMESTAMP TIME INDEX, val INT);
Affected Rows: 0
SHOW CREATE TABLE test_table2;
+-------------+--------------------------------------------+
| Table | Create Table |
+-------------+--------------------------------------------+
| test_table2 | CREATE TABLE IF NOT EXISTS "test_table2" ( |
| | "ts" TIMESTAMP(3) NOT NULL, |
| | "val" INT NULL, |
| | TIME INDEX ("ts") |
| | ) |
| | |
| | ENGINE=mito |
| | |
+-------------+--------------------------------------------+
USE public;
Affected Rows: 0
DROP DATABASE test_compaction_opt;
Affected Rows: 0
CREATE DATABASE test_compaction_opt2;
Affected Rows: 1
USE test_compaction_opt2;
Affected Rows: 0
CREATE TABLE test_table(ts TIMESTAMP TIME INDEX, v INT) WITH ('compaction.type'='twcs','compaction.twcs.time_window'='1h');
Affected Rows: 0
SHOW CREATE TABLE test_table;
+------------+-------------------------------------------+
| Table | Create Table |
+------------+-------------------------------------------+
| test_table | CREATE TABLE IF NOT EXISTS "test_table" ( |
| | "ts" TIMESTAMP(3) NOT NULL, |
| | "v" INT NULL, |
| | TIME INDEX ("ts") |
| | ) |
| | |
| | ENGINE=mito |
| | WITH( |
| | 'compaction.override' = 'true', |
| | 'compaction.twcs.time_window' = '1h', |
| | 'compaction.type' = 'twcs' |
| | ) |
+------------+-------------------------------------------+
ALTER DATABASE test_compaction_opt2 SET 'compaction.twcs.time_window' = '3h';
Affected Rows: 0
SHOW CREATE TABLE test_table;
+------------+-------------------------------------------+
| Table | Create Table |
+------------+-------------------------------------------+
| test_table | CREATE TABLE IF NOT EXISTS "test_table" ( |
| | "ts" TIMESTAMP(3) NOT NULL, |
| | "v" INT NULL, |
| | TIME INDEX ("ts") |
| | ) |
| | |
| | ENGINE=mito |
| | WITH( |
| | 'compaction.override' = 'true', |
| | 'compaction.twcs.time_window' = '1h', |
| | 'compaction.type' = 'twcs' |
| | ) |
+------------+-------------------------------------------+
USE public;
Affected Rows: 0
DROP DATABASE test_compaction_opt2;
Affected Rows: 0
SHOW DATABASES; SHOW DATABASES;
+--------------------+ +--------------------+

View File

@@ -49,48 +49,5 @@ USE public;
DROP DATABASE mydb; DROP DATABASE mydb;
--- test compaction options----
CREATE DATABASE test_compaction_opt;
USE test_compaction_opt;
SHOW CREATE DATABASE test_compaction_opt;
CREATE TABLE test_table(ts TIMESTAMP TIME INDEX, val INT);
SHOW CREATE TABLE test_table;
ALTER DATABASE test_compaction_opt SET 'compaction.type' = 'twcs';
ALTER DATABASE test_compaction_opt SET 'compaction.twcs.time_window' = '2h';
SHOW CREATE DATABASE test_compaction_opt;
SHOW CREATE TABLE test_table;
CREATE TABLE test_table2(ts TIMESTAMP TIME INDEX, val INT);
SHOW CREATE TABLE test_table2;
USE public;
DROP DATABASE test_compaction_opt;
CREATE DATABASE test_compaction_opt2;
USE test_compaction_opt2;
CREATE TABLE test_table(ts TIMESTAMP TIME INDEX, v INT) WITH ('compaction.type'='twcs','compaction.twcs.time_window'='1h');
SHOW CREATE TABLE test_table;
ALTER DATABASE test_compaction_opt2 SET 'compaction.twcs.time_window' = '3h';
SHOW CREATE TABLE test_table;
USE public;
DROP DATABASE test_compaction_opt2;
SHOW DATABASES; SHOW DATABASES;

View File

@@ -1,347 +0,0 @@
-- MySQL-compatible string function tests
-- LOCATE function tests
SELECT LOCATE('world', 'hello world');
+-------------------------------------------+
| locate(Utf8("world"),Utf8("hello world")) |
+-------------------------------------------+
| 7 |
+-------------------------------------------+
SELECT LOCATE('xyz', 'hello world');
+-----------------------------------------+
| locate(Utf8("xyz"),Utf8("hello world")) |
+-----------------------------------------+
| 0 |
+-----------------------------------------+
SELECT LOCATE('o', 'hello world');
+---------------------------------------+
| locate(Utf8("o"),Utf8("hello world")) |
+---------------------------------------+
| 5 |
+---------------------------------------+
SELECT LOCATE('o', 'hello world', 5);
+------------------------------------------------+
| locate(Utf8("o"),Utf8("hello world"),Int64(5)) |
+------------------------------------------------+
| 5 |
+------------------------------------------------+
SELECT LOCATE('o', 'hello world', 6);
+------------------------------------------------+
| locate(Utf8("o"),Utf8("hello world"),Int64(6)) |
+------------------------------------------------+
| 8 |
+------------------------------------------------+
SELECT LOCATE('', 'hello');
+--------------------------------+
| locate(Utf8(""),Utf8("hello")) |
+--------------------------------+
| 1 |
+--------------------------------+
SELECT LOCATE('世', 'hello世界');
+--------------------------------------+
| locate(Utf8("世"),Utf8("hello世界")) |
+--------------------------------------+
| 6 |
+--------------------------------------+
SELECT LOCATE(NULL, 'hello');
+----------------------------+
| locate(NULL,Utf8("hello")) |
+----------------------------+
| |
+----------------------------+
SELECT LOCATE('o', NULL);
+------------------------+
| locate(Utf8("o"),NULL) |
+------------------------+
| |
+------------------------+
-- ELT function tests
SELECT ELT(1, 'a', 'b', 'c');
+---------------------------------------------+
| elt(Int64(1),Utf8("a"),Utf8("b"),Utf8("c")) |
+---------------------------------------------+
| a |
+---------------------------------------------+
SELECT ELT(2, 'a', 'b', 'c');
+---------------------------------------------+
| elt(Int64(2),Utf8("a"),Utf8("b"),Utf8("c")) |
+---------------------------------------------+
| b |
+---------------------------------------------+
SELECT ELT(3, 'a', 'b', 'c');
+---------------------------------------------+
| elt(Int64(3),Utf8("a"),Utf8("b"),Utf8("c")) |
+---------------------------------------------+
| c |
+---------------------------------------------+
SELECT ELT(0, 'a', 'b', 'c');
+---------------------------------------------+
| elt(Int64(0),Utf8("a"),Utf8("b"),Utf8("c")) |
+---------------------------------------------+
| |
+---------------------------------------------+
SELECT ELT(4, 'a', 'b', 'c');
+---------------------------------------------+
| elt(Int64(4),Utf8("a"),Utf8("b"),Utf8("c")) |
+---------------------------------------------+
| |
+---------------------------------------------+
SELECT ELT(NULL, 'a', 'b', 'c');
+-----------------------------------------+
| elt(NULL,Utf8("a"),Utf8("b"),Utf8("c")) |
+-----------------------------------------+
| |
+-----------------------------------------+
-- FIELD function tests
SELECT FIELD('b', 'a', 'b', 'c');
+------------------------------------------------+
| field(Utf8("b"),Utf8("a"),Utf8("b"),Utf8("c")) |
+------------------------------------------------+
| 2 |
+------------------------------------------------+
SELECT FIELD('d', 'a', 'b', 'c');
+------------------------------------------------+
| field(Utf8("d"),Utf8("a"),Utf8("b"),Utf8("c")) |
+------------------------------------------------+
| 0 |
+------------------------------------------------+
SELECT FIELD('a', 'a', 'b', 'c');
+------------------------------------------------+
| field(Utf8("a"),Utf8("a"),Utf8("b"),Utf8("c")) |
+------------------------------------------------+
| 1 |
+------------------------------------------------+
SELECT FIELD('A', 'a', 'b', 'c');
+------------------------------------------------+
| field(Utf8("A"),Utf8("a"),Utf8("b"),Utf8("c")) |
+------------------------------------------------+
| 0 |
+------------------------------------------------+
SELECT FIELD(NULL, 'a', 'b', 'c');
+-------------------------------------------+
| field(NULL,Utf8("a"),Utf8("b"),Utf8("c")) |
+-------------------------------------------+
| 0 |
+-------------------------------------------+
-- INSERT function tests
SELECT INSERT('Quadratic', 3, 4, 'What');
+----------------------------------------------------------+
| insert(Utf8("Quadratic"),Int64(3),Int64(4),Utf8("What")) |
+----------------------------------------------------------+
| QuWhattic |
+----------------------------------------------------------+
SELECT INSERT('Quadratic', 3, 100, 'What');
+------------------------------------------------------------+
| insert(Utf8("Quadratic"),Int64(3),Int64(100),Utf8("What")) |
+------------------------------------------------------------+
| QuWhat |
+------------------------------------------------------------+
SELECT INSERT('Quadratic', 0, 4, 'What');
+----------------------------------------------------------+
| insert(Utf8("Quadratic"),Int64(0),Int64(4),Utf8("What")) |
+----------------------------------------------------------+
| Quadratic |
+----------------------------------------------------------+
SELECT INSERT('hello', 1, 0, 'X');
+---------------------------------------------------+
| insert(Utf8("hello"),Int64(1),Int64(0),Utf8("X")) |
+---------------------------------------------------+
| Xhello |
+---------------------------------------------------+
SELECT INSERT('hello世界', 6, 1, 'の');
+--------------------------------------------------------+
| insert(Utf8("hello世界"),Int64(6),Int64(1),Utf8("の")) |
+--------------------------------------------------------+
| helloの界 |
+--------------------------------------------------------+
SELECT INSERT(NULL, 1, 1, 'X');
+------------------------------------------+
| insert(NULL,Int64(1),Int64(1),Utf8("X")) |
+------------------------------------------+
| |
+------------------------------------------+
-- SPACE function tests
SELECT SPACE(5);
+-----------------+
| space(Int64(5)) |
+-----------------+
| |
+-----------------+
SELECT SPACE(0);
+-----------------+
| space(Int64(0)) |
+-----------------+
| |
+-----------------+
SELECT SPACE(-1);
+------------------+
| space(Int64(-1)) |
+------------------+
| |
+------------------+
SELECT CONCAT('a', SPACE(3), 'b');
+---------------------------------------------+
| concat(Utf8("a"),space(Int64(3)),Utf8("b")) |
+---------------------------------------------+
| a b |
+---------------------------------------------+
SELECT SPACE(NULL);
+-------------+
| space(NULL) |
+-------------+
| |
+-------------+
-- FORMAT function tests
SELECT FORMAT(1234567.891, 2);
+---------------------------------------+
| format(Float64(1234567.891),Int64(2)) |
+---------------------------------------+
| 1,234,567.89 |
+---------------------------------------+
SELECT FORMAT(1234567.891, 0);
+---------------------------------------+
| format(Float64(1234567.891),Int64(0)) |
+---------------------------------------+
| 1,234,568 |
+---------------------------------------+
SELECT FORMAT(1234.5, 4);
+----------------------------------+
| format(Float64(1234.5),Int64(4)) |
+----------------------------------+
| 1,234.5000 |
+----------------------------------+
SELECT FORMAT(-1234567.891, 2);
+----------------------------------------+
| format(Float64(-1234567.891),Int64(2)) |
+----------------------------------------+
| -1,234,567.89 |
+----------------------------------------+
SELECT FORMAT(0.5, 2);
+-------------------------------+
| format(Float64(0.5),Int64(2)) |
+-------------------------------+
| 0.50 |
+-------------------------------+
SELECT FORMAT(123, 2);
+-----------------------------+
| format(Int64(123),Int64(2)) |
+-----------------------------+
| 123.00 |
+-----------------------------+
SELECT FORMAT(NULL, 2);
+-----------------------+
| format(NULL,Int64(2)) |
+-----------------------+
| |
+-----------------------+
-- Combined test with table
CREATE TABLE string_test(idx INT, val VARCHAR, ts TIMESTAMP TIME INDEX);
Affected Rows: 0
INSERT INTO string_test VALUES
(1, 'hello world', 1),
(2, 'foo bar baz', 2),
(3, 'hello世界', 3);
Affected Rows: 3
SELECT idx, val, LOCATE('o', val) as loc FROM string_test ORDER BY idx;
+-----+-------------+-----+
| idx | val | loc |
+-----+-------------+-----+
| 1 | hello world | 5 |
| 2 | foo bar baz | 2 |
| 3 | hello世界 | 5 |
+-----+-------------+-----+
SELECT idx, val, INSERT(val, 1, 5, 'hi') as inserted FROM string_test ORDER BY idx;
+-----+-------------+----------+
| idx | val | inserted |
+-----+-------------+----------+
| 1 | hello world | hi world |
| 2 | foo bar baz | hiar baz |
| 3 | hello世界 | hi世界 |
+-----+-------------+----------+
DROP TABLE string_test;
Affected Rows: 0

View File

@@ -1,97 +0,0 @@
-- MySQL-compatible string function tests
-- LOCATE function tests
SELECT LOCATE('world', 'hello world');
SELECT LOCATE('xyz', 'hello world');
SELECT LOCATE('o', 'hello world');
SELECT LOCATE('o', 'hello world', 5);
SELECT LOCATE('o', 'hello world', 6);
SELECT LOCATE('', 'hello');
SELECT LOCATE('世', 'hello世界');
SELECT LOCATE(NULL, 'hello');
SELECT LOCATE('o', NULL);
-- ELT function tests
SELECT ELT(1, 'a', 'b', 'c');
SELECT ELT(2, 'a', 'b', 'c');
SELECT ELT(3, 'a', 'b', 'c');
SELECT ELT(0, 'a', 'b', 'c');
SELECT ELT(4, 'a', 'b', 'c');
SELECT ELT(NULL, 'a', 'b', 'c');
-- FIELD function tests
SELECT FIELD('b', 'a', 'b', 'c');
SELECT FIELD('d', 'a', 'b', 'c');
SELECT FIELD('a', 'a', 'b', 'c');
SELECT FIELD('A', 'a', 'b', 'c');
SELECT FIELD(NULL, 'a', 'b', 'c');
-- INSERT function tests
SELECT INSERT('Quadratic', 3, 4, 'What');
SELECT INSERT('Quadratic', 3, 100, 'What');
SELECT INSERT('Quadratic', 0, 4, 'What');
SELECT INSERT('hello', 1, 0, 'X');
SELECT INSERT('hello世界', 6, 1, 'の');
SELECT INSERT(NULL, 1, 1, 'X');
-- SPACE function tests
SELECT SPACE(5);
SELECT SPACE(0);
SELECT SPACE(-1);
SELECT CONCAT('a', SPACE(3), 'b');
SELECT SPACE(NULL);
-- FORMAT function tests
SELECT FORMAT(1234567.891, 2);
SELECT FORMAT(1234567.891, 0);
SELECT FORMAT(1234.5, 4);
SELECT FORMAT(-1234567.891, 2);
SELECT FORMAT(0.5, 2);
SELECT FORMAT(123, 2);
SELECT FORMAT(NULL, 2);
-- Combined test with table
CREATE TABLE string_test(idx INT, val VARCHAR, ts TIMESTAMP TIME INDEX);
INSERT INTO string_test VALUES
(1, 'hello world', 1),
(2, 'foo bar baz', 2),
(3, 'hello世界', 3);
SELECT idx, val, LOCATE('o', val) as loc FROM string_test ORDER BY idx;
SELECT idx, val, INSERT(val, 1, 5, 'hi') as inserted FROM string_test ORDER BY idx;
DROP TABLE string_test;

View File

@@ -293,7 +293,7 @@ impl Env {
.write(true) .write(true)
.truncate(truncate_log) .truncate(truncate_log)
.append(!truncate_log) .append(!truncate_log)
.open(&stdout_file_name) .open(stdout_file_name)
.unwrap(); .unwrap();
let args = mode.get_args(&self.sqlness_home, self, db_ctx, id); let args = mode.get_args(&self.sqlness_home, self, db_ctx, id);
@@ -333,13 +333,9 @@ impl Env {
}); });
for check_ip_addr in &check_ip_addrs { for check_ip_addr in &check_ip_addrs {
if !util::check_port(check_ip_addr.parse().unwrap(), Duration::from_secs(30)).await { if !util::check_port(check_ip_addr.parse().unwrap(), Duration::from_secs(10)).await {
Env::stop_server(&mut process); Env::stop_server(&mut process);
panic!( panic!("{} doesn't up in 10 seconds, quit.", mode.name())
"{} doesn't up in 30 seconds, check {} for more details.",
mode.name(),
stdout_file_name
)
} }
} }