Compare commits

..

18 Commits

Author SHA1 Message Date
Weny Xu
b25f24c6fe feat(meta-srv): add repartition procedure skeleton (#7487)
Signed-off-by: WenyXu <wenymedia@gmail.com>
2025-12-26 11:23:47 +00:00
Lei, HUANG
7bc0934eb3 refactor(mito2): make MemtableStats fields public (#7488)
Change visibility of estimated_bytes, time_range, max_sequence, and
series_count fields from private to public for external access.

Signed-off-by: Lei, HUANG <mrsatangel@gmail.com>
2025-12-26 09:57:18 +00:00
Yingwen
89b9469250 feat: Implement per range stats for bulk memtable (#7486)
* feat: implement per range stats for MemtableRange

Signed-off-by: evenyag <realevenyag@gmail.com>

* refactor: extract methods to MemtableRanges

Signed-off-by: evenyag <realevenyag@gmail.com>

* fix: simple bulk memtable set other fields in stats

Signed-off-by: evenyag <realevenyag@gmail.com>

* refactor: use time_index_type()

Signed-off-by: evenyag <realevenyag@gmail.com>

* refactor: use time index type

Signed-off-by: evenyag <realevenyag@gmail.com>

---------

Signed-off-by: evenyag <realevenyag@gmail.com>
2025-12-26 07:24:11 +00:00
Weny Xu
518a4e013b refactor(mito2): reorganize manifest storage into modular components (#7483)
* refactor(mito2): reorganize manifest storage into modular components

Signed-off-by: WenyXu <wenymedia@gmail.com>

* chore: apply suggestions

Signed-off-by: WenyXu <wenymedia@gmail.com>

* chore: apply suggestions from CR

Signed-off-by: WenyXu <wenymedia@gmail.com>

* chore: sort

Signed-off-by: WenyXu <wenymedia@gmail.com>

* chore: fmt

Signed-off-by: WenyXu <wenymedia@gmail.com>

---------

Signed-off-by: WenyXu <wenymedia@gmail.com>
2025-12-26 02:24:27 +00:00
Lei, HUANG
fffad499ca chore: mount cargo git cache in docker builds (#7484)
Mount the cargo git cache directory (${HOME}/.cargo/git) in docker build
containers to improve rebuild performance by caching git dependencies.

Signed-off-by: Lei, HUANG <mrsatangel@gmail.com>
2025-12-26 01:56:11 +00:00
yihong
0c9f58316d fix: more wait time for sqlness start and better message (#7485)
Signed-off-by: yihong0618 <zouzou0208@gmail.com>
2025-12-26 01:55:20 +00:00
ZonaHe
4f290111db feat: update dashboard to v0.11.11 (#7481)
Co-authored-by: sunchanglong <sunchanglong@users.noreply.github.com>
2025-12-25 18:43:14 +00:00
Weny Xu
294f19fa1d feat(metric-engine): support sync logical regions from source region (#7438)
* chore: move file

Signed-off-by: WenyXu <wenymedia@gmail.com>

* feat(metric-engine): support sync logical regions from source region

Signed-off-by: WenyXu <wenymedia@gmail.com>

* fix: fix unit tests

Signed-off-by: WenyXu <wenymedia@gmail.com>

* chore: apply suggestions

Signed-off-by: WenyXu <wenymedia@gmail.com>

* chore: add comments

Signed-off-by: WenyXu <wenymedia@gmail.com>

* chore: add comments

Signed-off-by: WenyXu <wenymedia@gmail.com>

* chore: apply suggestions from CR

Signed-off-by: WenyXu <wenymedia@gmail.com>

---------

Signed-off-by: WenyXu <wenymedia@gmail.com>
2025-12-25 09:06:58 +00:00
ZonaHe
be530ac1de feat: update dashboard to v0.11.10 (#7479)
Co-authored-by: sunchanglong <sunchanglong@users.noreply.github.com>
2025-12-25 04:27:10 +00:00
jeremyhi
434b4d8183 feat: refine the MemoryGuard (#7466)
* feat: refine MemoryGuard

Signed-off-by: jeremyhi <fengjiachun@gmail.com>

* chore: add test

Signed-off-by: jeremyhi <fengjiachun@gmail.com>

---------

Signed-off-by: jeremyhi <fengjiachun@gmail.com>
2025-12-25 04:09:32 +00:00
Lei, HUANG
3ad0b60c4b chore(metric-engine): set default compaction time window for data region (#7474)
chore: set compaction time window for metric engine data region to 1 day by default

Signed-off-by: Lei, HUANG <mrsatangel@gmail.com>
2025-12-25 03:55:17 +00:00
Ning Sun
19ae845225 refactor: cache server memory limiter for other components (#7470) 2025-12-25 03:46:50 +00:00
dennis zhuang
3866512cf6 feat: add more MySQL-compatible string functions (#7454)
* feat: add more mysql string functions

Signed-off-by: Dennis Zhuang <killme2008@gmail.com>

* refactor: use datafusion aliasing mechanism, close #7415

Signed-off-by: Dennis Zhuang <killme2008@gmail.com>

* chore: comment

Signed-off-by: Dennis Zhuang <killme2008@gmail.com>

* fix: comment and style

Signed-off-by: Dennis Zhuang <killme2008@gmail.com>

---------

Signed-off-by: Dennis Zhuang <killme2008@gmail.com>
2025-12-25 03:28:57 +00:00
LFC
d4870ee2af fix: typo in AI-assisted contributions policy (#7472)
* Fix typo in AI-assisted contributions policy

* Update project name from DataFusion to GreptimeDB
2025-12-25 03:03:14 +00:00
discord9
aea4e9fa55 fix: RemovedFiles deser compatibility (#7475)
* fix: compat for RemovedFiles

Signed-off-by: discord9 <discord9@163.com>

* cr

Signed-off-by: discord9 <discord9@163.com>

---------

Signed-off-by: discord9 <discord9@163.com>
2025-12-25 02:50:34 +00:00
AntiTopQuark
cea578244c fix(compaction): unify behavior of database compaction options with TTL (#7402)
* fix: fix dynamic compaction option, unify behavior of database compaction options with TTL option

Signed-off-by: AntiTopQuark <AntiTopQuark1350@outlook.com>

* fix unit test

Signed-off-by: AntiTopQuark <AntiTopQuark1350@outlook.com>

* add debug log

Signed-off-by: AntiTopQuark <AntiTopQuark1350@outlook.com>

---------

Signed-off-by: AntiTopQuark <AntiTopQuark1350@outlook.com>
2025-12-25 02:34:42 +00:00
Weny Xu
e1b18614ee feat(mito2): implement ApplyStagingManifest request handling (#7456)
* feat(mito2): implement `ApplyStagingManifest` request handling

Signed-off-by: WenyXu <wenymedia@gmail.com>

* chore: apply suggestions from CR

Signed-off-by: WenyXu <wenymedia@gmail.com>

* chore: fmt

Signed-off-by: WenyXu <wenymedia@gmail.com>

* chore: apply suggestions from CR

Signed-off-by: WenyXu <wenymedia@gmail.com>

* chore: apply suggestions from CR

Signed-off-by: WenyXu <wenymedia@gmail.com>

* fix: fix logic

Signed-off-by: WenyXu <wenymedia@gmail.com>

* chore: update proto

Signed-off-by: WenyXu <wenymedia@gmail.com>

---------

Signed-off-by: WenyXu <wenymedia@gmail.com>
2025-12-24 09:05:09 +00:00
Frost Ming
4bae75ccdb docs: refer to the correct project name in AI guidelines (#7471)
doc: refer to the correct project name in AI guidelines
2025-12-24 07:58:36 +00:00
88 changed files with 7276 additions and 1761 deletions

3
.gitignore vendored
View File

@@ -67,3 +67,6 @@ greptimedb_data
# Claude code
CLAUDE.md
# AGENTS.md
AGENTS.md

View File

@@ -104,14 +104,14 @@ All commit messages SHOULD adhere to the [Conventional Commits specification](ht
## AI-Assisted contributions
We has the following policy for AI-assisted PRs:
We have the following policy for AI-assisted PRs:
- The PR author should **understand the core ideas** behind the implementation **end-to-end**, and be able to justify the design and code during review.
- **Calls out unknowns and assumptions**. It's okay to not fully understand some bits of AI generated code. You should comment on these cases and point them out to reviewers so that they can use their knowledge of the codebase to clear up any concerns. For example, you might comment "calling this function here seems to work but I'm not familiar with how it works internally, I wonder if there's a race condition if it is called concurrently".
### Why fully AI-generated PRs without understanding are not helpful
Today, AI tools cannot reliably make complex changes to DataFusion on their own, which is why we rely on pull requests and code review.
Today, AI tools cannot reliably make complex changes to GreptimeDB on their own, which is why we rely on pull requests and code review.
The purposes of code review are:

5
Cargo.lock generated
View File

@@ -2190,7 +2190,6 @@ dependencies = [
"approx 0.5.1",
"arc-swap",
"arrow",
"arrow-cast",
"arrow-schema",
"async-trait",
"bincode",
@@ -2221,7 +2220,6 @@ dependencies = [
"h3o",
"hyperloglogplus",
"jsonb",
"jsonpath-rust 0.7.5",
"memchr",
"mito-codec",
"nalgebra",
@@ -5466,7 +5464,7 @@ dependencies = [
[[package]]
name = "greptime-proto"
version = "0.1.0"
source = "git+https://github.com/GreptimeTeam/greptime-proto.git?rev=173efe5ec62722089db7c531c0b0d470a072b915#173efe5ec62722089db7c531c0b0d470a072b915"
source = "git+https://github.com/GreptimeTeam/greptime-proto.git?rev=520fa524f9d590752ea327683e82ffd65721b27c#520fa524f9d590752ea327683e82ffd65721b27c"
dependencies = [
"prost 0.13.5",
"prost-types 0.13.5",
@@ -7624,6 +7622,7 @@ dependencies = [
"async-trait",
"base64 0.22.1",
"bytes",
"chrono",
"common-base",
"common-error",
"common-macro",

View File

@@ -103,7 +103,6 @@ aquamarine = "0.6"
arrow = { version = "56.2", features = ["prettyprint"] }
arrow-array = { version = "56.2", default-features = false, features = ["chrono-tz"] }
arrow-buffer = "56.2"
arrow-cast = "56.2"
arrow-flight = "56.2"
arrow-ipc = { version = "56.2", default-features = false, features = ["lz4", "zstd"] }
arrow-schema = { version = "56.2", features = ["serde"] }
@@ -151,7 +150,7 @@ etcd-client = { version = "0.16.1", features = [
fst = "0.4.7"
futures = "0.3"
futures-util = "0.3"
greptime-proto = { git = "https://github.com/GreptimeTeam/greptime-proto.git", rev = "173efe5ec62722089db7c531c0b0d470a072b915" }
greptime-proto = { git = "https://github.com/GreptimeTeam/greptime-proto.git", rev = "520fa524f9d590752ea327683e82ffd65721b27c" }
hex = "0.4"
http = "1"
humantime = "2.1"

View File

@@ -14,6 +14,7 @@ BUILDX_BUILDER_NAME ?= gtbuilder
BASE_IMAGE ?= ubuntu
RUST_TOOLCHAIN ?= $(shell cat rust-toolchain.toml | grep channel | cut -d'"' -f2)
CARGO_REGISTRY_CACHE ?= ${HOME}/.cargo/registry
CARGO_GIT_CACHE ?= ${HOME}/.cargo/git
ARCH := $(shell uname -m | sed 's/x86_64/amd64/' | sed 's/aarch64/arm64/')
OUTPUT_DIR := $(shell if [ "$(RELEASE)" = "true" ]; then echo "release"; elif [ ! -z "$(CARGO_PROFILE)" ]; then echo "$(CARGO_PROFILE)" ; else echo "debug"; fi)
SQLNESS_OPTS ?=
@@ -86,7 +87,7 @@ build: ## Build debug version greptime.
build-by-dev-builder: ## Build greptime by dev-builder.
docker run --network=host \
${ASSEMBLED_EXTRA_BUILD_ENV} \
-v ${PWD}:/greptimedb -v ${CARGO_REGISTRY_CACHE}:/root/.cargo/registry \
-v ${PWD}:/greptimedb -v ${CARGO_REGISTRY_CACHE}:/root/.cargo/registry -v ${CARGO_GIT_CACHE}:/root/.cargo/git \
-w /greptimedb ${IMAGE_REGISTRY}/${IMAGE_NAMESPACE}/dev-builder-${BASE_IMAGE}:${DEV_BUILDER_IMAGE_TAG} \
make build \
CARGO_EXTENSION="${CARGO_EXTENSION}" \
@@ -100,7 +101,7 @@ build-by-dev-builder: ## Build greptime by dev-builder.
.PHONY: build-android-bin
build-android-bin: ## Build greptime binary for android.
docker run --network=host \
-v ${PWD}:/greptimedb -v ${CARGO_REGISTRY_CACHE}:/root/.cargo/registry \
-v ${PWD}:/greptimedb -v ${CARGO_REGISTRY_CACHE}:/root/.cargo/registry -v ${CARGO_GIT_CACHE}:/root/.cargo/git \
-w /greptimedb ${IMAGE_REGISTRY}/${IMAGE_NAMESPACE}/dev-builder-android:${DEV_BUILDER_IMAGE_TAG} \
make build \
CARGO_EXTENSION="ndk --platform 23 -t aarch64-linux-android" \
@@ -206,7 +207,7 @@ fix-udeps: ## Remove unused dependencies automatically.
@cargo udeps --workspace --all-targets --output json > udeps-report.json || true
@echo "Removing unused dependencies..."
@python3 scripts/fix-udeps.py udeps-report.json
.PHONY: fmt-check
fmt-check: ## Check code format.
cargo fmt --all -- --check
@@ -224,7 +225,7 @@ stop-etcd: ## Stop single node etcd for testing purpose.
.PHONY: run-it-in-container
run-it-in-container: start-etcd ## Run integration tests in dev-builder.
docker run --network=host \
-v ${PWD}:/greptimedb -v ${CARGO_REGISTRY_CACHE}:/root/.cargo/registry -v /tmp:/tmp \
-v ${PWD}:/greptimedb -v ${CARGO_REGISTRY_CACHE}:/root/.cargo/registry -v ${CARGO_GIT_CACHE}:/root/.cargo/git -v /tmp:/tmp \
-w /greptimedb ${IMAGE_REGISTRY}/${IMAGE_NAMESPACE}/dev-builder-${BASE_IMAGE}:${DEV_BUILDER_IMAGE_TAG} \
make test sqlness-test BUILD_JOBS=${BUILD_JOBS}

View File

@@ -17,7 +17,6 @@ ahash.workspace = true
api.workspace = true
arc-swap = "1.0"
arrow.workspace = true
arrow-cast.workspace = true
arrow-schema.workspace = true
async-trait.workspace = true
bincode = "=1.3.3"
@@ -47,7 +46,6 @@ geohash = { version = "0.13", optional = true }
h3o = { version = "0.6", optional = true }
hyperloglogplus = "0.4"
jsonb.workspace = true
jsonpath-rust = "0.7.5"
memchr = "2.7"
mito-codec.workspace = true
nalgebra.workspace = true

View File

@@ -13,24 +13,17 @@
// limitations under the License.
use std::fmt::{self, Display};
use std::str::FromStr;
use std::sync::Arc;
use arrow::array::{ArrayRef, BinaryViewArray, StringViewArray, StructArray};
use arrow::compute;
use arrow::datatypes::{Float64Type, Int64Type, UInt64Type};
use datafusion_common::DataFusionError;
use datafusion_common::arrow::array::{
Array, AsArray, BinaryViewBuilder, BooleanBuilder, Float64Builder, Int64Builder,
StringViewBuilder,
};
use datafusion_common::arrow::datatypes::DataType;
use datafusion_common::{DataFusionError, Result};
use datafusion_expr::type_coercion::aggregates::STRINGS;
use datafusion_expr::{ColumnarValue, ScalarFunctionArgs, Signature, Volatility};
use datatypes::arrow_array::string_array_value_at_index;
use datatypes::json::JsonStructureSettings;
use jsonpath_rust::JsonPath;
use serde_json::Value;
use datafusion_expr::{ColumnarValue, ScalarFunctionArgs, Signature};
use crate::function::{Function, extract_args};
use crate::helper;
@@ -165,7 +158,11 @@ impl JsonGetString {
impl Default for JsonGetString {
fn default() -> Self {
Self {
signature: Signature::any(2, Volatility::Immutable),
// TODO(LFC): Use a more clear type here instead of "Binary" for Json input, once we have a "Json" type.
signature: helper::one_of_sigs2(
vec![DataType::Binary, DataType::BinaryView],
vec![DataType::Utf8, DataType::Utf8View],
),
}
}
}
@@ -175,7 +172,7 @@ impl Function for JsonGetString {
Self::NAME
}
fn return_type(&self, _: &[DataType]) -> Result<DataType> {
fn return_type(&self, _: &[DataType]) -> datafusion_common::Result<DataType> {
Ok(DataType::Utf8View)
}
@@ -183,203 +180,33 @@ impl Function for JsonGetString {
&self.signature
}
fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
fn invoke_with_args(
&self,
args: ScalarFunctionArgs,
) -> datafusion_common::Result<ColumnarValue> {
let [arg0, arg1] = extract_args(self.name(), &args)?;
let arg0 = compute::cast(&arg0, &DataType::BinaryView)?;
let jsons = arg0.as_binary_view();
let arg1 = compute::cast(&arg1, &DataType::Utf8View)?;
let paths = arg1.as_string_view();
let result = match arg0.data_type() {
DataType::Binary | DataType::LargeBinary | DataType::BinaryView => {
let arg0 = compute::cast(&arg0, &DataType::BinaryView)?;
let jsons = arg0.as_binary_view();
jsonb_get_string(jsons, paths)?
}
DataType::Struct(_) => {
let jsons = arg0.as_struct();
json_struct_get_string(jsons, paths)?
}
_ => {
return Err(DataFusionError::Execution(format!(
"{} not supported argument type {}",
Self::NAME,
arg0.data_type(),
)));
}
};
let size = jsons.len();
let mut builder = StringViewBuilder::with_capacity(size);
Ok(ColumnarValue::Array(result))
}
}
fn jsonb_get_string(jsons: &BinaryViewArray, paths: &StringViewArray) -> Result<ArrayRef> {
let size = jsons.len();
let mut builder = StringViewBuilder::with_capacity(size);
for i in 0..size {
let json = jsons.is_valid(i).then(|| jsons.value(i));
let path = paths.is_valid(i).then(|| paths.value(i));
let result = match (json, path) {
(Some(json), Some(path)) => {
get_json_by_path(json, path).and_then(|json| jsonb::to_str(&json).ok())
}
_ => None,
};
builder.append_option(result);
}
Ok(Arc::new(builder.finish()))
}
fn json_struct_get_string(jsons: &StructArray, paths: &StringViewArray) -> Result<ArrayRef> {
let size = jsons.len();
let mut builder = StringViewBuilder::with_capacity(size);
for i in 0..size {
if jsons.is_null(i) || paths.is_null(i) {
builder.append_null();
continue;
}
let path = paths.value(i);
// naively assume the JSON path is our kind of indexing to the field, by removing its "root"
let field_path = path.replace("$.", "");
let column = jsons.column_by_name(&field_path);
if let Some(column) = column {
if let Some(v) = string_array_value_at_index(column, i) {
builder.append_value(v);
} else {
builder.append_value(arrow_cast::display::array_value_to_string(column, i)?);
}
} else {
let Some(raw) = jsons
.column_by_name(JsonStructureSettings::RAW_FIELD)
.and_then(|x| string_array_value_at_index(x, i))
else {
builder.append_null();
continue;
};
let path: JsonPath<Value> = JsonPath::try_from(path).map_err(|e| {
DataFusionError::Execution(format!("{path} is not a valid JSON path: {e}"))
})?;
// the wanted field is not retrievable from the JSON struct columns directly, we have
// to combine everything (columns and the "_raw") into a complete JSON value to find it
let value = json_struct_to_value(raw, jsons, i)?;
match path.find(&value) {
Value::Null => builder.append_null(),
Value::Array(values) => match values.as_slice() {
[] => builder.append_null(),
[x] => {
if let Some(s) = x.as_str() {
builder.append_value(s)
} else {
builder.append_value(x.to_string())
}
}
x => builder.append_value(
x.iter()
.map(|v| v.to_string())
.collect::<Vec<_>>()
.join(", "),
),
},
// Safety: guarded by the returns of `path.find` as documented
_ => unreachable!(),
}
}
}
Ok(Arc::new(builder.finish()))
}
fn json_struct_to_value(raw: &str, jsons: &StructArray, i: usize) -> Result<Value> {
let Ok(mut json) = Value::from_str(raw) else {
return Err(DataFusionError::Internal(format!(
"inner field '{}' is not a valid JSON string",
JsonStructureSettings::RAW_FIELD
)));
};
for (column_name, column) in jsons.column_names().into_iter().zip(jsons.columns()) {
if column_name == JsonStructureSettings::RAW_FIELD {
continue;
}
let (json_pointer, field) = if let Some((json_object, field)) = column_name.rsplit_once(".")
{
let json_pointer = format!("/{}", json_object.replace(".", "/"));
(json_pointer, field)
} else {
("".to_string(), column_name)
};
let Some(json_object) = json
.pointer_mut(&json_pointer)
.and_then(|x| x.as_object_mut())
else {
return Err(DataFusionError::Internal(format!(
"value at JSON pointer '{}' is not an object",
json_pointer
)));
};
macro_rules! insert {
($column: ident, $i: ident, $json_object: ident, $field: ident) => {{
if let Some(value) = $column
.is_valid($i)
.then(|| serde_json::Value::from($column.value($i)))
{
$json_object.insert($field.to_string(), value);
for i in 0..size {
let json = jsons.is_valid(i).then(|| jsons.value(i));
let path = paths.is_valid(i).then(|| paths.value(i));
let result = match (json, path) {
(Some(json), Some(path)) => {
get_json_by_path(json, path).and_then(|json| jsonb::to_str(&json).ok())
}
}};
_ => None,
};
builder.append_option(result);
}
match column.data_type() {
// boolean => Value::Bool
DataType::Boolean => {
let column = column.as_boolean();
insert!(column, i, json_object, field);
}
// int => Value::Number
DataType::Int64 => {
let column = column.as_primitive::<Int64Type>();
insert!(column, i, json_object, field);
}
DataType::UInt64 => {
let column = column.as_primitive::<UInt64Type>();
insert!(column, i, json_object, field);
}
DataType::Float64 => {
let column = column.as_primitive::<Float64Type>();
insert!(column, i, json_object, field);
}
// string => Value::String
DataType::Utf8 => {
let column = column.as_string::<i32>();
insert!(column, i, json_object, field);
}
DataType::LargeUtf8 => {
let column = column.as_string::<i64>();
insert!(column, i, json_object, field);
}
DataType::Utf8View => {
let column = column.as_string_view();
insert!(column, i, json_object, field);
}
// other => Value::Array and Value::Object
_ => {
return Err(DataFusionError::NotImplemented(format!(
"{} is not yet supported to be executed with field {} of datatype {}",
JsonGetString::NAME,
column_name,
column.data_type()
)));
}
}
Ok(ColumnarValue::Array(Arc::new(builder.finish())))
}
Ok(json)
}
impl Display for JsonGetString {
@@ -469,13 +296,11 @@ impl Display for JsonGetObject {
mod tests {
use std::sync::Arc;
use arrow::array::{Float64Array, Int64Array, StructArray};
use arrow_schema::Field;
use datafusion_common::ScalarValue;
use datafusion_common::arrow::array::{BinaryArray, BinaryViewArray, StringArray};
use datafusion_common::arrow::datatypes::{Float64Type, Int64Type};
use datatypes::types::parse_string_to_jsonb;
use serde_json::json;
use super::*;
@@ -649,123 +474,42 @@ mod tests {
r#"{"a": "d", "b": {"c": "e"}, "c": "f"}"#,
r#"{"a": "g", "b": "h", "c": {"a": "g"}}"#,
];
let paths = vec!["$.a.b", "$.a", ""];
let results = [Some("a"), Some("d"), None];
// complete JSON is:
// {
// "kind": "foo",
// "payload": {
// "code": 404,
// "success": false,
// "result": {
// "error": "not found",
// "time_cost": 1.234
// }
// }
// }
let json_struct: ArrayRef = Arc::new(StructArray::new(
vec![
Field::new("kind", DataType::Utf8, true),
Field::new("payload.code", DataType::Int64, true),
Field::new("payload.result.time_cost", DataType::Float64, true),
Field::new(JsonStructureSettings::RAW_FIELD, DataType::Utf8View, true),
]
.into(),
vec![
Arc::new(StringArray::from_iter([Some("foo")])) as ArrayRef,
Arc::new(Int64Array::from_iter([Some(404)])),
Arc::new(Float64Array::from_iter([Some(1.234)])),
Arc::new(StringViewArray::from_iter([Some(
json! ({
"payload": {
"success": false,
"result": {
"error": "not found"
}
}
})
.to_string(),
)])),
],
None,
));
let paths = vec![
"$.a.b",
"$.a",
"",
"$.kind",
"$.payload.code",
"$.payload.result.time_cost",
"$.payload",
"$.payload.success",
"$.payload.result",
"$.payload.result.error",
"$.payload.result.not-exists",
"$.payload.not-exists",
"$.not-exists",
"$",
];
let expects = [
Some("a"),
Some("d"),
None,
Some("foo"),
Some("404"),
Some("1.234"),
Some(
r#"{"code":404,"result":{"error":"not found","time_cost":1.234},"success":false}"#,
),
Some("false"),
Some(r#"{"error":"not found","time_cost":1.234}"#),
Some("not found"),
None,
None,
None,
Some(
r#"{"kind":"foo","payload":{"code":404,"result":{"error":"not found","time_cost":1.234},"success":false}}"#,
),
];
let mut jsons = json_strings
let jsonbs = json_strings
.iter()
.map(|s| {
let value = jsonb::parse_value(s.as_bytes()).unwrap();
Arc::new(BinaryArray::from_iter_values([value.to_vec()])) as ArrayRef
value.to_vec()
})
.collect::<Vec<_>>();
let json_struct_arrays =
std::iter::repeat_n(json_struct, expects.len() - jsons.len()).collect::<Vec<_>>();
jsons.extend(json_struct_arrays);
for i in 0..jsons.len() {
let json = &jsons[i];
let path = paths[i];
let expect = expects[i];
let args = ScalarFunctionArgs {
args: vec![
ColumnarValue::Array(Arc::new(BinaryArray::from_iter_values(jsonbs))),
ColumnarValue::Array(Arc::new(StringArray::from_iter_values(paths))),
],
arg_fields: vec![],
number_rows: 3,
return_field: Arc::new(Field::new("x", DataType::Utf8View, false)),
config_options: Arc::new(Default::default()),
};
let result = json_get_string
.invoke_with_args(args)
.and_then(|x| x.to_array(3))
.unwrap();
let vector = result.as_string_view();
let args = ScalarFunctionArgs {
args: vec![
ColumnarValue::Array(json.clone()),
ColumnarValue::Scalar(path.into()),
],
arg_fields: vec![],
number_rows: 1,
return_field: Arc::new(Field::new("x", DataType::Utf8View, false)),
config_options: Arc::new(Default::default()),
};
let result = json_get_string
.invoke_with_args(args)
.and_then(|x| x.to_array(1))
.unwrap();
let result = result.as_string_view();
assert_eq!(1, result.len());
let actual = result.is_valid(0).then(|| result.value(0));
assert_eq!(actual, expect);
assert_eq!(3, vector.len());
for (i, gt) in results.iter().enumerate() {
let result = vector.is_valid(i).then(|| vector.value(i));
assert_eq!(*gt, result);
}
}
#[test]
fn test_json_get_object() -> Result<()> {
fn test_json_get_object() -> datafusion_common::Result<()> {
let udf = JsonGetObject::default();
assert_eq!("json_get_object", udf.name());
assert_eq!(

View File

@@ -14,13 +14,31 @@
//! String scalar functions
mod elt;
mod field;
mod format;
mod insert;
mod locate;
mod regexp_extract;
mod space;
pub(crate) use elt::EltFunction;
pub(crate) use field::FieldFunction;
pub(crate) use format::FormatFunction;
pub(crate) use insert::InsertFunction;
pub(crate) use locate::LocateFunction;
pub(crate) use regexp_extract::RegexpExtractFunction;
pub(crate) use space::SpaceFunction;
use crate::function_registry::FunctionRegistry;
/// Register all string functions
///
/// Calls the `register` associated function of each string scalar function
/// re-exported by this module (`ELT`, `FIELD`, `FORMAT`, `INSERT`, `LOCATE`,
/// `REGEXP_EXTRACT` and `SPACE`), passing the same `registry` to every one.
pub fn register_string_functions(registry: &FunctionRegistry) {
EltFunction::register(registry);
FieldFunction::register(registry);
FormatFunction::register(registry);
InsertFunction::register(registry);
LocateFunction::register(registry);
RegexpExtractFunction::register(registry);
SpaceFunction::register(registry);
}

View File

@@ -0,0 +1,252 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//! MySQL-compatible ELT function implementation.
//!
//! ELT(N, str1, str2, str3, ...) - Returns the Nth string from the list.
//! Returns NULL if N < 1 or N > number of strings.
use std::fmt;
use std::sync::Arc;
use datafusion_common::DataFusionError;
use datafusion_common::arrow::array::{Array, ArrayRef, AsArray, LargeStringBuilder};
use datafusion_common::arrow::compute::cast;
use datafusion_common::arrow::datatypes::DataType;
use datafusion_expr::{ColumnarValue, ScalarFunctionArgs, Signature, Volatility};
use crate::function::Function;
use crate::function_registry::FunctionRegistry;
const NAME: &str = "elt";
/// MySQL-compatible ELT function.
///
/// Syntax: ELT(N, str1, str2, str3, ...)
/// Returns the Nth string argument. N is 1-based.
/// Returns NULL if N is NULL, N < 1, or N > number of string arguments.
/// Also returns NULL when the selected string argument is itself NULL for
/// that row. The declared return type is always `LargeUtf8`.
#[derive(Debug)]
pub struct EltFunction {
// Signature handed out by `Function::signature`; built in `Default::default`.
signature: Signature,
}
impl EltFunction {
pub fn register(registry: &FunctionRegistry) {
registry.register_scalar(EltFunction::default());
}
}
impl Default for EltFunction {
    /// Creates the function with an immutable, variadic "any type" signature.
    fn default() -> Self {
        // ELT is called as (Int64, String, String, ...): the argument count is
        // open-ended, so the signature accepts any number of arguments.
        let signature = Signature::variadic_any(Volatility::Immutable);
        Self { signature }
    }
}
impl fmt::Display for EltFunction {
    /// Writes the function name in ASCII upper case.
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        let upper_name = NAME.to_ascii_uppercase();
        f.write_str(&upper_name)
    }
}
impl Function for EltFunction {
    /// Name under which the function is registered.
    fn name(&self) -> &str {
        NAME
    }

    /// The result is always `LargeUtf8`, whatever the argument types are.
    fn return_type(&self, _: &[DataType]) -> datafusion_common::Result<DataType> {
        Ok(DataType::LargeUtf8)
    }

    fn signature(&self) -> &Signature {
        &self.signature
    }

    /// Evaluates `ELT(N, str1, str2, ...)` row by row.
    ///
    /// For each row the result is the `N`th string argument (1-based), or NULL
    /// when `N` is NULL, `N < 1`, `N` exceeds the number of string arguments,
    /// or the selected string is NULL.
    ///
    /// # Errors
    ///
    /// Returns `DataFusionError::Execution` when fewer than two arguments are
    /// given, when the first argument cannot be cast to `Int64`, or when any
    /// remaining argument cannot be cast to `LargeUtf8`.
    fn invoke_with_args(
        &self,
        args: ScalarFunctionArgs,
    ) -> datafusion_common::Result<ColumnarValue> {
        if args.args.len() < 2 {
            return Err(DataFusionError::Execution(
                "ELT requires at least 2 arguments: ELT(N, str1, ...)".to_string(),
            ));
        }

        let arrays = ColumnarValue::values_to_arrays(&args.args)?;
        let len = arrays[0].len();

        // An untyped NULL index column selects nothing: every row is NULL.
        if arrays[0].data_type() == &DataType::Null {
            return Ok(ColumnarValue::Array(
                datafusion_common::arrow::array::new_null_array(&DataType::LargeUtf8, len),
            ));
        }

        // First argument is the index (N) - try to cast to Int64
        let index_array = cast(arrays[0].as_ref(), &DataType::Int64).map_err(|e| {
            DataFusionError::Execution(format!("ELT: index argument cast failed: {}", e))
        })?;
        // Downcast once, outside the row loop.
        let indices = index_array.as_primitive::<datafusion_common::arrow::datatypes::Int64Type>();

        // Cast string arguments to LargeUtf8
        let string_arrays: Vec<ArrayRef> = arrays[1..]
            .iter()
            .enumerate()
            .map(|(i, arr)| {
                cast(arr.as_ref(), &DataType::LargeUtf8).map_err(|e| {
                    DataFusionError::Execution(format!(
                        "ELT: string argument {} cast failed: {}",
                        i + 1,
                        e
                    ))
                })
            })
            .collect::<datafusion_common::Result<Vec<_>>>()?;
        // Typed views of the candidate columns, again resolved once up front.
        let strings: Vec<_> = string_arrays
            .iter()
            .map(|array| array.as_string::<i64>())
            .collect();

        let mut builder = LargeStringBuilder::with_capacity(len, len * 32);
        for i in 0..len {
            if indices.is_null(i) {
                builder.append_null();
                continue;
            }

            // N is 1-based. `usize::try_from` rejects negative values and, on
            // targets where `usize` is narrower than `i64`, values that an
            // `as` cast would silently truncate into range; `checked_sub`
            // rejects 0 and `get` rejects N > number of strings.
            let selected = usize::try_from(indices.value(i))
                .ok()
                .and_then(|n| n.checked_sub(1))
                .and_then(|idx| strings.get(idx).copied())
                .filter(|column| column.is_valid(i));

            match selected {
                Some(column) => builder.append_value(column.value(i)),
                None => builder.append_null(),
            }
        }

        Ok(ColumnarValue::Array(Arc::new(builder.finish())))
    }
}
#[cfg(test)]
mod tests {
    use std::sync::Arc;

    use datafusion_common::arrow::array::{Int64Array, StringArray};
    use datafusion_common::arrow::datatypes::Field;
    use datafusion_expr::ScalarFunctionArgs;

    use super::*;

    /// Wraps `arrays` as array-valued arguments, with one nullable field per
    /// argument named `arg_<position>` and a nullable `LargeUtf8` return field.
    fn create_args(arrays: Vec<ArrayRef>) -> ScalarFunctionArgs {
        let number_rows = arrays[0].len();

        let mut arg_fields = Vec::with_capacity(arrays.len());
        for (position, array) in arrays.iter().enumerate() {
            let field = Field::new(
                format!("arg_{}", position),
                array.data_type().clone(),
                true,
            );
            arg_fields.push(Arc::new(field));
        }

        let args = arrays.into_iter().map(ColumnarValue::Array).collect();

        ScalarFunctionArgs {
            args,
            arg_fields,
            return_field: Arc::new(Field::new("result", DataType::LargeUtf8, true)),
            number_rows,
            config_options: Arc::new(datafusion_common::config::ConfigOptions::default()),
        }
    }

    #[test]
    fn test_elt_basic() {
        let function = EltFunction::default();

        let n: ArrayRef = Arc::new(Int64Array::from(vec![1, 2, 3]));
        let s1: ArrayRef = Arc::new(StringArray::from(vec!["a", "a", "a"]));
        let s2: ArrayRef = Arc::new(StringArray::from(vec!["b", "b", "b"]));
        let s3: ArrayRef = Arc::new(StringArray::from(vec!["c", "c", "c"]));

        let result = function
            .invoke_with_args(create_args(vec![n, s1, s2, s3]))
            .unwrap();
        let ColumnarValue::Array(array) = result else {
            panic!("Expected array result");
        };

        // Row k picks the (k + 1)-th string argument.
        let strings = array.as_string::<i64>();
        for (row, expected) in ["a", "b", "c"].into_iter().enumerate() {
            assert_eq!(strings.value(row), expected);
        }
    }

    #[test]
    fn test_elt_out_of_bounds() {
        let function = EltFunction::default();

        // 0, 4 and -1 are all outside the valid range 1..=3.
        let n: ArrayRef = Arc::new(Int64Array::from(vec![0, 4, -1]));
        let s1: ArrayRef = Arc::new(StringArray::from(vec!["a", "a", "a"]));
        let s2: ArrayRef = Arc::new(StringArray::from(vec!["b", "b", "b"]));
        let s3: ArrayRef = Arc::new(StringArray::from(vec!["c", "c", "c"]));

        let result = function
            .invoke_with_args(create_args(vec![n, s1, s2, s3]))
            .unwrap();
        let ColumnarValue::Array(array) = result else {
            panic!("Expected array result");
        };

        let strings = array.as_string::<i64>();
        for row in 0..3 {
            assert!(strings.is_null(row));
        }
    }

    #[test]
    fn test_elt_with_nulls() {
        let function = EltFunction::default();

        // Row 0: n=1, select s1="a" -> "a"
        // Row 1: n=NULL -> NULL
        // Row 2: n=1, select s1=NULL -> NULL
        let n: ArrayRef = Arc::new(Int64Array::from(vec![Some(1), None, Some(1)]));
        let s1: ArrayRef = Arc::new(StringArray::from(vec![Some("a"), Some("a"), None]));
        let s2: ArrayRef = Arc::new(StringArray::from(vec![Some("b"), Some("b"), Some("b")]));

        let result = function
            .invoke_with_args(create_args(vec![n, s1, s2]))
            .unwrap();
        let ColumnarValue::Array(array) = result else {
            panic!("Expected array result");
        };

        let strings = array.as_string::<i64>();
        assert_eq!(strings.value(0), "a");
        assert!(strings.is_null(1)); // N is NULL
        assert!(strings.is_null(2)); // Selected string is NULL
    }
}

View File

@@ -0,0 +1,224 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//! MySQL-compatible FIELD function implementation.
//!
//! FIELD(str, str1, str2, str3, ...) - Returns the 1-based index of str in the list.
//! Returns 0 if str is not found or is NULL.
use std::fmt;
use std::sync::Arc;
use datafusion_common::DataFusionError;
use datafusion_common::arrow::array::{Array, ArrayRef, AsArray, Int64Builder};
use datafusion_common::arrow::compute::cast;
use datafusion_common::arrow::datatypes::DataType;
use datafusion_expr::{ColumnarValue, ScalarFunctionArgs, Signature, Volatility};
use crate::function::Function;
use crate::function_registry::FunctionRegistry;
/// Lower-case function name; returned by `name()` and upper-cased for `Display`.
const NAME: &str = "field";
/// MySQL-compatible FIELD function.
///
/// Syntax: FIELD(str, str1, str2, str3, ...)
/// Returns the 1-based index of str in the argument list (str1, str2, str3, ...).
/// Returns 0 if str is not found or is NULL.
///
/// Every argument is cast to `LargeUtf8` before comparison, and the comparison is
/// exact string equality, so matching is case-sensitive. A NULL list entry never
/// matches the search string.
#[derive(Debug)]
pub struct FieldFunction {
    // Signature handed back by `signature()`.
    signature: Signature,
}
impl FieldFunction {
pub fn register(registry: &FunctionRegistry) {
registry.register_scalar(FieldFunction::default());
}
}
impl Default for FieldFunction {
    /// Builds the function with a variadic, any-type, immutable signature.
    fn default() -> Self {
        // FIELD(str, str1, str2, ...) accepts an arbitrary number of arguments.
        let signature = Signature::variadic_any(Volatility::Immutable);
        Self { signature }
    }
}
impl fmt::Display for FieldFunction {
    /// Writes the function name in ASCII upper case.
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        let upper = NAME.to_ascii_uppercase();
        f.write_str(&upper)
    }
}
impl Function for FieldFunction {
    /// Returns the registered function name.
    fn name(&self) -> &str {
        NAME
    }

    /// The result is always `Int64`, whatever the argument types are.
    fn return_type(&self, _: &[DataType]) -> datafusion_common::Result<DataType> {
        Ok(DataType::Int64)
    }

    /// Returns the signature built in `Default::default`.
    fn signature(&self) -> &Signature {
        &self.signature
    }

    /// Evaluates FIELD row by row.
    ///
    /// All arguments are cast to `LargeUtf8`. For each row the result is the 1-based
    /// position of the first list argument equal to the search string, or 0 when the
    /// search string is NULL or nothing matches.
    ///
    /// Returns an execution error when fewer than two arguments are supplied or when
    /// an argument cannot be cast to a string.
    fn invoke_with_args(
        &self,
        args: ScalarFunctionArgs,
    ) -> datafusion_common::Result<ColumnarValue> {
        if args.args.len() < 2 {
            return Err(DataFusionError::Execution(
                "FIELD requires at least 2 arguments: FIELD(str, str1, ...)".to_string(),
            ));
        }

        let arrays = ColumnarValue::values_to_arrays(&args.args)?;
        let row_count = arrays[0].len();

        // Normalise every argument to LargeUtf8 so they can be compared as strings.
        let mut casted: Vec<ArrayRef> = Vec::with_capacity(arrays.len());
        for (arg_idx, array) in arrays.iter().enumerate() {
            let as_text = cast(array.as_ref(), &DataType::LargeUtf8).map_err(|e| {
                DataFusionError::Execution(format!(
                    "FIELD: argument {} cast failed: {}",
                    arg_idx, e
                ))
            })?;
            casted.push(as_text);
        }

        // Argument 0 is the search string; the rest form the list being searched.
        let needles = casted[0].as_string::<i64>();
        let candidates: Vec<_> = casted[1..]
            .iter()
            .map(|column| column.as_string::<i64>())
            .collect();

        let mut builder = Int64Builder::with_capacity(row_count);
        for row in 0..row_count {
            let position = if needles.is_null(row) {
                // A NULL search string yields 0 rather than NULL.
                0
            } else {
                let needle = needles.value(row);
                candidates
                    .iter()
                    .position(|column| !column.is_null(row) && column.value(row) == needle)
                    .map_or(0, |idx| (idx + 1) as i64) // convert to a 1-based index
            };
            builder.append_value(position);
        }

        Ok(ColumnarValue::Array(Arc::new(builder.finish())))
    }
}
#[cfg(test)]
mod tests {
    use std::sync::Arc;

    use datafusion_common::arrow::array::StringArray;
    use datafusion_common::arrow::datatypes::Field;
    use datafusion_expr::ScalarFunctionArgs;

    use super::*;

    /// Wraps `arrays` into `ScalarFunctionArgs`, one nullable field per argument.
    fn create_args(arrays: Vec<ArrayRef>) -> ScalarFunctionArgs {
        let number_rows = arrays[0].len();
        let mut arg_fields = Vec::with_capacity(arrays.len());
        let mut args = Vec::with_capacity(arrays.len());
        for (i, arr) in arrays.into_iter().enumerate() {
            arg_fields.push(Arc::new(Field::new(
                format!("arg_{}", i),
                arr.data_type().clone(),
                true,
            )));
            args.push(ColumnarValue::Array(arr));
        }

        ScalarFunctionArgs {
            args,
            arg_fields,
            return_field: Arc::new(Field::new("result", DataType::Int64, true)),
            number_rows,
            config_options: Arc::new(datafusion_common::config::ConfigOptions::default()),
        }
    }

    /// Runs FIELD over `arrays` and returns the per-row result values.
    fn field_of(arrays: Vec<ArrayRef>) -> Vec<i64> {
        let output = FieldFunction::default()
            .invoke_with_args(create_args(arrays))
            .unwrap();
        match output {
            ColumnarValue::Array(array) => array
                .as_primitive::<datafusion_common::arrow::datatypes::Int64Type>()
                .values()
                .to_vec(),
            _ => panic!("Expected array result"),
        }
    }

    #[test]
    fn test_field_basic() {
        let search = Arc::new(StringArray::from(vec!["b", "d", "a"]));
        let s1 = Arc::new(StringArray::from(vec!["a", "a", "a"]));
        let s2 = Arc::new(StringArray::from(vec!["b", "b", "b"]));
        let s3 = Arc::new(StringArray::from(vec!["c", "c", "c"]));

        let found = field_of(vec![search, s1, s2, s3]);

        assert_eq!(found[0], 2); // "b" is at index 2
        assert_eq!(found[1], 0); // "d" not found
        assert_eq!(found[2], 1); // "a" is at index 1
    }

    #[test]
    fn test_field_with_null_search() {
        let search = Arc::new(StringArray::from(vec![Some("a"), None]));
        let s1 = Arc::new(StringArray::from(vec!["a", "a"]));
        let s2 = Arc::new(StringArray::from(vec!["b", "b"]));

        let found = field_of(vec![search, s1, s2]);

        assert_eq!(found[0], 1); // "a" found at index 1
        assert_eq!(found[1], 0); // NULL search returns 0
    }

    #[test]
    fn test_field_case_sensitive() {
        let search = Arc::new(StringArray::from(vec!["A", "a"]));
        let s1 = Arc::new(StringArray::from(vec!["a", "a"]));
        let s2 = Arc::new(StringArray::from(vec!["A", "A"]));

        let found = field_of(vec![search, s1, s2]);

        assert_eq!(found[0], 2); // "A" matches at index 2
        assert_eq!(found[1], 1); // "a" matches at index 1
    }
}

View File

@@ -0,0 +1,512 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//! MySQL-compatible FORMAT function implementation.
//!
//! FORMAT(X, D) - Formats the number X with D decimal places using thousand separators.
use std::fmt;
use std::sync::Arc;
use datafusion_common::DataFusionError;
use datafusion_common::arrow::array::{Array, AsArray, LargeStringBuilder};
use datafusion_common::arrow::datatypes as arrow_types;
use datafusion_common::arrow::datatypes::DataType;
use datafusion_expr::{ColumnarValue, ScalarFunctionArgs, Signature, TypeSignature, Volatility};
use crate::function::Function;
use crate::function_registry::FunctionRegistry;
/// Lower-case function name; returned by `name()` and upper-cased for `Display`.
const NAME: &str = "format";
/// MySQL-compatible FORMAT function.
///
/// Syntax: FORMAT(X, D)
/// Formats the number X to a format like '#,###,###.##', rounded to D decimal places.
/// D is clamped into the range 0 to 30 before formatting.
///
/// A NULL X or a NULL D yields a NULL result. The result type is `LargeUtf8`.
///
/// Note: This implementation uses the en_US locale (comma as thousand separator,
/// period as decimal separator).
#[derive(Debug)]
pub struct FormatFunction {
    // Signature handed back by `signature()`.
    signature: Signature,
}
impl FormatFunction {
pub fn register(registry: &FunctionRegistry) {
registry.register_scalar(FormatFunction::default());
}
}
impl Default for FormatFunction {
fn default() -> Self {
let mut signatures = Vec::new();
// Support various numeric types for X
let numeric_types = [
DataType::Float64,
DataType::Float32,
DataType::Int64,
DataType::Int32,
DataType::Int16,
DataType::Int8,
DataType::UInt64,
DataType::UInt32,
DataType::UInt16,
DataType::UInt8,
];
// D can be various integer types
let int_types = [
DataType::Int64,
DataType::Int32,
DataType::Int16,
DataType::Int8,
DataType::UInt64,
DataType::UInt32,
DataType::UInt16,
DataType::UInt8,
];
for x_type in &numeric_types {
for d_type in &int_types {
signatures.push(TypeSignature::Exact(vec![x_type.clone(), d_type.clone()]));
}
}
Self {
signature: Signature::one_of(signatures, Volatility::Immutable),
}
}
}
impl fmt::Display for FormatFunction {
    /// Writes the function name in ASCII upper case.
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        let upper = NAME.to_ascii_uppercase();
        f.write_str(&upper)
    }
}
impl Function for FormatFunction {
    /// Returns the registered function name.
    fn name(&self) -> &str {
        NAME
    }

    /// The result is always `LargeUtf8`.
    fn return_type(&self, _: &[DataType]) -> datafusion_common::Result<DataType> {
        Ok(DataType::LargeUtf8)
    }

    /// Returns the signature built in `Default::default`.
    fn signature(&self) -> &Signature {
        &self.signature
    }

    /// Evaluates FORMAT(X, D) row by row.
    ///
    /// A row is NULL when either X or D is NULL. Otherwise D is clamped to `0..=30`
    /// and X is rendered with thousand separators: floats through
    /// `format_number_float`, integers through `format_number_integer`.
    ///
    /// Returns an execution error when the argument count is not exactly two or when
    /// a non-NULL row has an unsupported X or D type.
    fn invoke_with_args(
        &self,
        args: ScalarFunctionArgs,
    ) -> datafusion_common::Result<ColumnarValue> {
        if args.args.len() != 2 {
            return Err(DataFusionError::Execution(
                "FORMAT requires exactly 2 arguments: FORMAT(X, D)".to_string(),
            ));
        }

        let arrays = ColumnarValue::values_to_arrays(&args.args)?;
        let (numbers, places) = (&arrays[0], &arrays[1]);
        let row_count = numbers.len();

        let mut builder = LargeStringBuilder::with_capacity(row_count, row_count * 20);
        for row in 0..row_count {
            if numbers.is_null(row) || places.is_null(row) {
                builder.append_null();
                continue;
            }

            let decimal_places = get_decimal_places(places, row)?.clamp(0, 30) as usize;

            // The type is checked per row, so an unsupported type only fails once a
            // non-NULL row is reached.
            let text = match numbers.data_type() {
                DataType::Float64 | DataType::Float32 => {
                    format_number_float(get_float_value(numbers, row)?, decimal_places)
                }
                DataType::Int64
                | DataType::Int32
                | DataType::Int16
                | DataType::Int8
                | DataType::UInt64
                | DataType::UInt32
                | DataType::UInt16
                | DataType::UInt8 => format_number_integer(numbers, row, decimal_places)?,
                other => {
                    return Err(DataFusionError::Execution(format!(
                        "FORMAT: unsupported type {:?}",
                        other
                    )));
                }
            };
            builder.append_value(&text);
        }

        Ok(ColumnarValue::Array(Arc::new(builder.finish())))
    }
}
/// Reads element `index` of a `Float64` or `Float32` array as an `f64`.
///
/// Returns an execution error for any other array type.
fn get_float_value(
    array: &datafusion_common::arrow::array::ArrayRef,
    index: usize,
) -> datafusion_common::Result<f64> {
    let value = match array.data_type() {
        DataType::Float64 => array
            .as_primitive::<arrow_types::Float64Type>()
            .value(index),
        // Widening f32 -> f64 is lossless.
        DataType::Float32 => f64::from(
            array
                .as_primitive::<arrow_types::Float32Type>()
                .value(index),
        ),
        other => {
            return Err(DataFusionError::Execution(format!(
                "FORMAT: unsupported type {:?}",
                other
            )));
        }
    };
    Ok(value)
}
/// Reads element `index` of an integer array as an `i64` decimal-place count.
///
/// The value is returned unclamped (MySQL limits decimal places to `0..=30`; the
/// caller applies that clamp). `UInt64` values above `i64::MAX` saturate to
/// `i64::MAX`. Returns an execution error for non-integer array types.
fn get_decimal_places(
    array: &datafusion_common::arrow::array::ArrayRef,
    index: usize,
) -> datafusion_common::Result<i64> {
    let places = match array.data_type() {
        DataType::Int64 => array.as_primitive::<arrow_types::Int64Type>().value(index),
        DataType::Int32 => i64::from(array.as_primitive::<arrow_types::Int32Type>().value(index)),
        DataType::Int16 => i64::from(array.as_primitive::<arrow_types::Int16Type>().value(index)),
        DataType::Int8 => i64::from(array.as_primitive::<arrow_types::Int8Type>().value(index)),
        DataType::UInt64 => {
            let raw = array.as_primitive::<arrow_types::UInt64Type>().value(index);
            // Saturate instead of wrapping into a negative number.
            raw.min(i64::MAX as u64) as i64
        }
        DataType::UInt32 => {
            i64::from(array.as_primitive::<arrow_types::UInt32Type>().value(index))
        }
        DataType::UInt16 => {
            i64::from(array.as_primitive::<arrow_types::UInt16Type>().value(index))
        }
        DataType::UInt8 => i64::from(array.as_primitive::<arrow_types::UInt8Type>().value(index)),
        other => {
            return Err(DataFusionError::Execution(format!(
                "FORMAT: unsupported type {:?}",
                other
            )));
        }
    };
    Ok(places)
}
/// Formats element `index` of an integer array with thousand separators, followed
/// by `decimal_places` zero digits after a decimal point when `decimal_places > 0`.
///
/// The digits are taken from the integer itself rather than via a float, so large
/// values keep every digit. Returns an execution error for non-integer array types.
fn format_number_integer(
    array: &datafusion_common::arrow::array::ArrayRef,
    index: usize,
    decimal_places: usize,
) -> datafusion_common::Result<String> {
    /// Splits a signed value into (is negative, magnitude digits).
    fn signed(value: i128) -> (bool, String) {
        (value.is_negative(), value.unsigned_abs().to_string())
    }

    /// Unsigned values are never negative; the digits are the value itself.
    fn unsigned(value: u128) -> (bool, String) {
        (false, value.to_string())
    }

    let (is_negative, abs_digits) = match array.data_type() {
        DataType::Int64 => signed(i128::from(
            array.as_primitive::<arrow_types::Int64Type>().value(index),
        )),
        DataType::Int32 => signed(i128::from(
            array.as_primitive::<arrow_types::Int32Type>().value(index),
        )),
        DataType::Int16 => signed(i128::from(
            array.as_primitive::<arrow_types::Int16Type>().value(index),
        )),
        DataType::Int8 => signed(i128::from(
            array.as_primitive::<arrow_types::Int8Type>().value(index),
        )),
        DataType::UInt64 => unsigned(u128::from(
            array.as_primitive::<arrow_types::UInt64Type>().value(index),
        )),
        DataType::UInt32 => unsigned(u128::from(
            array.as_primitive::<arrow_types::UInt32Type>().value(index),
        )),
        DataType::UInt16 => unsigned(u128::from(
            array.as_primitive::<arrow_types::UInt16Type>().value(index),
        )),
        DataType::UInt8 => unsigned(u128::from(
            array.as_primitive::<arrow_types::UInt8Type>().value(index),
        )),
        other => {
            return Err(DataFusionError::Execution(format!(
                "FORMAT: unsupported type {:?}",
                other
            )));
        }
    };

    let sign = if is_negative { "-" } else { "" };
    let grouped = add_thousand_separators(&abs_digits);
    let text = if decimal_places > 0 {
        // An integer has no fractional part, so the decimals are all zeros.
        format!("{}{}.{}", sign, grouped, "0".repeat(decimal_places))
    } else {
        format!("{}{}", sign, grouped)
    };
    Ok(text)
}
/// Format a float with thousand separators and `decimal_places` digits after the
/// decimal point.
///
/// - NaN is rendered as `"NaN"`, infinities as `"Infinity"` / `"-Infinity"`.
/// - The value is rounded half away from zero to `decimal_places` digits before it
///   is printed (e.g. `1234.5` with 0 places becomes `"1,235"`).
/// - A value that rounds to zero is printed without a minus sign.
fn format_number_float(x: f64, decimal_places: usize) -> String {
    // Handle special cases
    if x.is_nan() {
        return "NaN".to_string();
    }
    if x.is_infinite() {
        return if x.is_sign_positive() {
            "Infinity".to_string()
        } else {
            "-Infinity".to_string()
        };
    }

    // Round half away from zero by scaling, rounding and scaling back. For very
    // large magnitudes `x * multiplier` overflows to infinity; such values have no
    // fractional digits left to round, so fall back to `x` itself instead of
    // printing "inf".
    let multiplier = 10f64.powi(decimal_places as i32);
    let candidate = (x * multiplier).round() / multiplier;
    let rounded = if candidate.is_finite() { candidate } else { x };

    // `-0.0 < 0.0` is false, so a value that rounds to zero carries no sign.
    let is_negative = rounded < 0.0;
    let formatted = format!("{:.prec$}", rounded.abs(), prec = decimal_places);

    // With zero decimal places there is no '.' in the output.
    let (int_part, dec_part) = match formatted.split_once('.') {
        Some((int_part, dec_part)) => (int_part, Some(dec_part)),
        None => (formatted.as_str(), None),
    };

    let mut result = String::with_capacity(formatted.len() + formatted.len() / 3 + 1);
    if is_negative {
        result.push('-');
    }
    result.push_str(&add_thousand_separators(int_part));
    if let Some(dec) = dec_part {
        result.push('.');
        result.push_str(dec);
    }
    result
}

/// Add thousand separators (commas) to an integer string.
///
/// `s` is expected to hold only the digits of a non-negative integer; a comma is
/// inserted before every group of three characters counted from the right.
fn add_thousand_separators(s: &str) -> String {
    let chars: Vec<char> = s.chars().collect();
    let len = chars.len();
    if len <= 3 {
        return s.to_string();
    }

    let mut result = String::with_capacity(len + len / 3);
    for (i, ch) in chars.iter().enumerate() {
        // A separator goes before position `i` when a whole number of 3-character
        // groups remains to its right.
        if i > 0 && (len - i) % 3 == 0 {
            result.push(',');
        }
        result.push(*ch);
    }
    result
}
#[cfg(test)]
mod tests {
    use std::sync::Arc;

    use datafusion_common::arrow::array::{ArrayRef, Float64Array, Int64Array, UInt64Array};
    use datafusion_common::arrow::datatypes::Field;
    use datafusion_expr::ScalarFunctionArgs;

    use super::*;

    /// Wraps `arrays` into `ScalarFunctionArgs`, one nullable field per argument.
    fn create_args(arrays: Vec<ArrayRef>) -> ScalarFunctionArgs {
        let number_rows = arrays[0].len();
        let mut arg_fields = Vec::with_capacity(arrays.len());
        let mut args = Vec::with_capacity(arrays.len());
        for (i, arr) in arrays.into_iter().enumerate() {
            arg_fields.push(Arc::new(Field::new(
                format!("arg_{}", i),
                arr.data_type().clone(),
                true,
            )));
            args.push(ColumnarValue::Array(arr));
        }

        ScalarFunctionArgs {
            args,
            arg_fields,
            return_field: Arc::new(Field::new("result", DataType::LargeUtf8, true)),
            number_rows,
            config_options: Arc::new(datafusion_common::config::ConfigOptions::default()),
        }
    }

    /// Runs FORMAT(x, d) and returns each row as `Some(text)` or `None` for NULL.
    fn format_rows(x: ArrayRef, d: ArrayRef) -> Vec<Option<String>> {
        let output = FormatFunction::default()
            .invoke_with_args(create_args(vec![x, d]))
            .unwrap();
        match output {
            ColumnarValue::Array(array) => array
                .as_string::<i64>()
                .iter()
                .map(|row| row.map(str::to_string))
                .collect(),
            _ => panic!("Expected array result"),
        }
    }

    #[test]
    fn test_format_basic() {
        let rows = format_rows(
            Arc::new(Float64Array::from(vec![1234567.891, 1234.5, 1234567.0])),
            Arc::new(Int64Array::from(vec![2, 0, 3])),
        );
        assert_eq!(rows[0].as_deref(), Some("1,234,567.89"));
        assert_eq!(rows[1].as_deref(), Some("1,235")); // rounded
        assert_eq!(rows[2].as_deref(), Some("1,234,567.000"));
    }

    #[test]
    fn test_format_negative() {
        let rows = format_rows(
            Arc::new(Float64Array::from(vec![-1234567.891])),
            Arc::new(Int64Array::from(vec![2])),
        );
        assert_eq!(rows[0].as_deref(), Some("-1,234,567.89"));
    }

    #[test]
    fn test_format_small_numbers() {
        let rows = format_rows(
            Arc::new(Float64Array::from(vec![0.5, 12.345, 123.0])),
            Arc::new(Int64Array::from(vec![2, 2, 0])),
        );
        assert_eq!(rows[0].as_deref(), Some("0.50"));
        assert_eq!(rows[1].as_deref(), Some("12.35")); // rounded
        assert_eq!(rows[2].as_deref(), Some("123"));
    }

    #[test]
    fn test_format_with_nulls() {
        let rows = format_rows(
            Arc::new(Float64Array::from(vec![Some(1234.5), None])),
            Arc::new(Int64Array::from(vec![2, 2])),
        );
        assert_eq!(rows[0].as_deref(), Some("1,234.50"));
        assert!(rows[1].is_none());
    }

    #[test]
    fn test_add_thousand_separators() {
        let cases = [
            ("1", "1"),
            ("12", "12"),
            ("123", "123"),
            ("1234", "1,234"),
            ("12345", "12,345"),
            ("123456", "123,456"),
            ("1234567", "1,234,567"),
            ("12345678", "12,345,678"),
            ("123456789", "123,456,789"),
        ];
        for (input, expected) in cases {
            assert_eq!(add_thousand_separators(input), expected);
        }
    }

    #[test]
    fn test_format_large_int_no_float_precision_loss() {
        // 2^53 + 1 cannot be represented exactly as f64.
        let rows = format_rows(
            Arc::new(Int64Array::from(vec![9_007_199_254_740_993i64])),
            Arc::new(Int64Array::from(vec![0])),
        );
        assert_eq!(rows[0].as_deref(), Some("9,007,199,254,740,993"));
    }

    #[test]
    fn test_format_decimal_places_u64_overflow_clamps() {
        // A decimal-place count far above the limit is clamped to 30.
        let rows = format_rows(
            Arc::new(Int64Array::from(vec![1])),
            Arc::new(UInt64Array::from(vec![u64::MAX])),
        );
        let expected = format!("1.{}", "0".repeat(30));
        assert_eq!(rows[0].as_deref(), Some(expected.as_str()));
    }
}

View File

@@ -0,0 +1,345 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//! MySQL-compatible INSERT function implementation.
//!
//! INSERT(str, pos, len, newstr) - Inserts newstr into str at position pos,
//! replacing len characters.
use std::fmt;
use std::sync::Arc;
use datafusion_common::DataFusionError;
use datafusion_common::arrow::array::{Array, ArrayRef, AsArray, LargeStringBuilder};
use datafusion_common::arrow::compute::cast;
use datafusion_common::arrow::datatypes::DataType;
use datafusion_expr::{ColumnarValue, ScalarFunctionArgs, Signature, TypeSignature, Volatility};
use crate::function::Function;
use crate::function_registry::FunctionRegistry;
/// Lower-case function name; returned by `name()` and upper-cased for `Display`.
const NAME: &str = "insert";
/// MySQL-compatible INSERT function.
///
/// Syntax: INSERT(str, pos, len, newstr)
/// Returns str with the substring beginning at position pos and len characters long
/// replaced by newstr.
///
/// - pos is 1-based and counts characters, not bytes
/// - If pos is out of range, returns the original string
/// - If len extends past the end of the string, replaces from pos to end of string
/// - If any argument is NULL, the result is NULL
#[derive(Debug)]
pub struct InsertFunction {
    // Signature handed back by `signature()`.
    signature: Signature,
}
impl InsertFunction {
pub fn register(registry: &FunctionRegistry) {
registry.register_scalar(InsertFunction::default());
}
}
impl Default for InsertFunction {
    /// Builds one exact signature `(str, pos, len, newstr)` for every combination
    /// of supported string types and integer types.
    fn default() -> Self {
        let string_types = [DataType::Utf8, DataType::LargeUtf8, DataType::Utf8View];
        let int_types = [
            DataType::Int64,
            DataType::Int32,
            DataType::Int16,
            DataType::Int8,
            DataType::UInt64,
            DataType::UInt32,
            DataType::UInt16,
            DataType::UInt8,
        ];

        let combinations = string_types.len().pow(2) * int_types.len().pow(2);
        let mut signatures = Vec::with_capacity(combinations);
        // Nesting order (outer to inner): str, newstr, pos, len.
        for str_type in &string_types {
            for newstr_type in &string_types {
                signatures.extend(int_types.iter().flat_map(|pos_type| {
                    int_types.iter().map(move |len_type| {
                        TypeSignature::Exact(vec![
                            str_type.clone(),
                            pos_type.clone(),
                            len_type.clone(),
                            newstr_type.clone(),
                        ])
                    })
                }));
            }
        }

        Self {
            signature: Signature::one_of(signatures, Volatility::Immutable),
        }
    }
}
impl fmt::Display for InsertFunction {
    /// Writes the function name in ASCII upper case.
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        let upper = NAME.to_ascii_uppercase();
        f.write_str(&upper)
    }
}
impl Function for InsertFunction {
    /// Returns the registered function name.
    fn name(&self) -> &str {
        NAME
    }

    /// The result is always `LargeUtf8`.
    fn return_type(&self, _: &[DataType]) -> datafusion_common::Result<DataType> {
        Ok(DataType::LargeUtf8)
    }

    /// Returns the signature built in `Default::default`.
    fn signature(&self) -> &Signature {
        &self.signature
    }

    /// Evaluates INSERT(str, pos, len, newstr) row by row.
    ///
    /// String arguments are cast to `LargeUtf8` and integer arguments to `Int64`.
    /// A row is NULL when any of its four inputs is NULL; otherwise the result is
    /// whatever `insert_string` produces for that row.
    ///
    /// Returns an execution error when the argument count is not exactly four or
    /// when a cast fails.
    fn invoke_with_args(
        &self,
        args: ScalarFunctionArgs,
    ) -> datafusion_common::Result<ColumnarValue> {
        if args.args.len() != 4 {
            return Err(DataFusionError::Execution(
                "INSERT requires exactly 4 arguments: INSERT(str, pos, len, newstr)".to_string(),
            ));
        }

        let arrays = ColumnarValue::values_to_arrays(&args.args)?;
        let row_count = arrays[0].len();

        // Normalise the inputs so each can be read through one concrete array type.
        let source_array = cast_to_large_utf8(&arrays[0], "str")?;
        let replacement_array = cast_to_large_utf8(&arrays[3], "newstr")?;
        let position_array = cast_to_int64(&arrays[1], "pos")?;
        let length_array = cast_to_int64(&arrays[2], "len")?;

        let sources = source_array.as_string::<i64>();
        let positions =
            position_array.as_primitive::<datafusion_common::arrow::datatypes::Int64Type>();
        let lengths =
            length_array.as_primitive::<datafusion_common::arrow::datatypes::Int64Type>();
        let replacements = replacement_array.as_string::<i64>();

        let mut builder = LargeStringBuilder::with_capacity(row_count, row_count * 32);
        for row in 0..row_count {
            let any_null = sources.is_null(row)
                || positions.is_null(row)
                || lengths.is_null(row)
                || replacements.is_null(row);

            if any_null {
                builder.append_null();
            } else {
                let replaced = insert_string(
                    sources.value(row),
                    positions.value(row),
                    lengths.value(row),
                    replacements.value(row),
                );
                builder.append_value(&replaced);
            }
        }

        Ok(ColumnarValue::Array(Arc::new(builder.finish())))
    }
}
/// Casts `array` to `LargeUtf8` so every string input can be read the same way.
///
/// `name` identifies the argument in the error message when the cast fails.
fn cast_to_large_utf8(array: &ArrayRef, name: &str) -> datafusion_common::Result<ArrayRef> {
    match cast(array.as_ref(), &DataType::LargeUtf8) {
        Ok(casted) => Ok(casted),
        Err(e) => Err(DataFusionError::Execution(format!(
            "INSERT: {} cast failed: {}",
            name, e
        ))),
    }
}
/// Casts `array` to `Int64` so every integer input can be read the same way.
///
/// `name` identifies the argument in the error message when the cast fails.
fn cast_to_int64(array: &ArrayRef, name: &str) -> datafusion_common::Result<ArrayRef> {
    match cast(array.as_ref(), &DataType::Int64) {
        Ok(casted) => Ok(casted),
        Err(e) => Err(DataFusionError::Execution(format!(
            "INSERT: {} cast failed: {}",
            name, e
        ))),
    }
}
/// Perform the INSERT string operation.
///
/// Replaces `replace_len` characters of `original`, starting at the 1-based
/// character position `pos`, with `new_str`. Positions and lengths count
/// characters, not bytes.
///
/// - If `pos < 1` or `pos > len(original) + 1`, the original string is returned.
/// - If `replace_len` is negative or reaches past the end of the string, everything
///   from `pos` to the end is replaced (MySQL treats both as "out of range").
fn insert_string(original: &str, pos: i64, replace_len: i64, new_str: &str) -> String {
    let char_count = original.chars().count();

    // Out-of-range position: leave the string untouched.
    if pos < 1 || pos as usize > char_count + 1 {
        return original.to_string();
    }
    let start_idx = (pos - 1) as usize; // Convert to 0-based

    // Character index one past the replaced range.
    let end_idx = if replace_len < 0 {
        // Negative length is out of range: replace through the end of the string.
        char_count
    } else {
        // Saturate so a huge length cannot overflow before being capped.
        start_idx
            .saturating_add(replace_len as usize)
            .min(char_count)
    };

    // Slice on byte offsets that correspond to the character boundaries.
    let start_byte = char_to_byte_idx(original, start_idx);
    let end_byte = char_to_byte_idx(original, end_idx);

    let mut result = String::with_capacity(original.len() + new_str.len());
    result.push_str(&original[..start_byte]);
    result.push_str(new_str);
    result.push_str(&original[end_byte..]);
    result
}

/// Returns the byte offset of the character at `char_idx` in `s`, or `s.len()`
/// when `char_idx` is at or past the end of the string.
fn char_to_byte_idx(s: &str, char_idx: usize) -> usize {
    s.char_indices()
        .nth(char_idx)
        .map_or(s.len(), |(idx, _)| idx)
}
#[cfg(test)]
mod tests {
    use std::sync::Arc;

    use datafusion_common::arrow::array::{Int64Array, StringArray};
    use datafusion_common::arrow::datatypes::Field;
    use datafusion_expr::ScalarFunctionArgs;

    use super::*;

    /// Wraps `arrays` into `ScalarFunctionArgs`, one nullable field per argument.
    fn create_args(arrays: Vec<ArrayRef>) -> ScalarFunctionArgs {
        let arg_fields: Vec<_> = arrays
            .iter()
            .enumerate()
            .map(|(i, arr)| {
                Arc::new(Field::new(
                    format!("arg_{}", i),
                    arr.data_type().clone(),
                    true,
                ))
            })
            .collect();
        ScalarFunctionArgs {
            args: arrays.iter().cloned().map(ColumnarValue::Array).collect(),
            arg_fields,
            return_field: Arc::new(Field::new("result", DataType::LargeUtf8, true)),
            number_rows: arrays[0].len(),
            config_options: Arc::new(datafusion_common::config::ConfigOptions::default()),
        }
    }

    /// Runs INSERT(str, pos, len, newstr) and returns each row as `Some(text)` or
    /// `None` for NULL.
    fn insert_rows(
        str_arr: ArrayRef,
        pos: ArrayRef,
        len: ArrayRef,
        newstr: ArrayRef,
    ) -> Vec<Option<String>> {
        let output = InsertFunction::default()
            .invoke_with_args(create_args(vec![str_arr, pos, len, newstr]))
            .unwrap();
        match output {
            ColumnarValue::Array(array) => array
                .as_string::<i64>()
                .iter()
                .map(|row| row.map(str::to_string))
                .collect(),
            _ => panic!("Expected array result"),
        }
    }

    #[test]
    fn test_insert_basic() {
        // INSERT('Quadratic', 3, 4, 'What') => 'QuWhattic'
        let rows = insert_rows(
            Arc::new(StringArray::from(vec!["Quadratic"])),
            Arc::new(Int64Array::from(vec![3])),
            Arc::new(Int64Array::from(vec![4])),
            Arc::new(StringArray::from(vec!["What"])),
        );
        assert_eq!(rows[0].as_deref(), Some("QuWhattic"));
    }

    #[test]
    fn test_insert_out_of_range_pos() {
        // INSERT('Quadratic', 0, 4, 'What') => 'Quadratic' (pos < 1)
        // INSERT('Quadratic', 100, 4, 'What') => 'Quadratic' (pos past the end)
        let rows = insert_rows(
            Arc::new(StringArray::from(vec!["Quadratic", "Quadratic"])),
            Arc::new(Int64Array::from(vec![0, 100])),
            Arc::new(Int64Array::from(vec![4, 4])),
            Arc::new(StringArray::from(vec!["What", "What"])),
        );
        assert_eq!(rows[0].as_deref(), Some("Quadratic")); // pos < 1
        assert_eq!(rows[1].as_deref(), Some("Quadratic")); // pos > length
    }

    #[test]
    fn test_insert_replace_to_end() {
        // INSERT('Quadratic', 3, 100, 'What') => 'QuWhat' (len exceeds remaining)
        let rows = insert_rows(
            Arc::new(StringArray::from(vec!["Quadratic"])),
            Arc::new(Int64Array::from(vec![3])),
            Arc::new(Int64Array::from(vec![100])),
            Arc::new(StringArray::from(vec!["What"])),
        );
        assert_eq!(rows[0].as_deref(), Some("QuWhat"));
    }

    #[test]
    fn test_insert_unicode() {
        // INSERT('hello世界', 6, 1, 'の') => 'helloの界'
        // The replacement must be 'の'; an empty string would give 'hello界'.
        let rows = insert_rows(
            Arc::new(StringArray::from(vec!["hello世界"])),
            Arc::new(Int64Array::from(vec![6])),
            Arc::new(Int64Array::from(vec![1])),
            Arc::new(StringArray::from(vec!["の"])),
        );
        assert_eq!(rows[0].as_deref(), Some("helloの界"));
    }

    #[test]
    fn test_insert_with_nulls() {
        // A NULL source string makes the whole row NULL.
        let rows = insert_rows(
            Arc::new(StringArray::from(vec![Some("hello"), None])),
            Arc::new(Int64Array::from(vec![1, 1])),
            Arc::new(Int64Array::from(vec![1, 1])),
            Arc::new(StringArray::from(vec!["X", "X"])),
        );
        assert_eq!(rows[0].as_deref(), Some("Xello"));
        assert!(rows[1].is_none());
    }
}

View File

@@ -0,0 +1,373 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//! MySQL-compatible LOCATE function implementation.
//!
//! LOCATE(substr, str) - Returns the position of the first occurrence of substr in str (1-based).
//! LOCATE(substr, str, pos) - Returns the position of the first occurrence of substr in str,
//! starting from position pos.
//! Returns 0 if substr is not found.
use std::fmt;
use std::sync::Arc;
use datafusion_common::DataFusionError;
use datafusion_common::arrow::array::{Array, ArrayRef, AsArray, Int64Builder};
use datafusion_common::arrow::compute::cast;
use datafusion_common::arrow::datatypes::DataType;
use datafusion_expr::{ColumnarValue, ScalarFunctionArgs, Signature, TypeSignature, Volatility};
use crate::function::Function;
use crate::function_registry::FunctionRegistry;
/// Lower-case function name; returned by `name()` and upper-cased for `Display`.
const NAME: &str = "locate";
/// MySQL-compatible LOCATE function.
///
/// Syntax:
/// - LOCATE(substr, str) - Returns 1-based position of substr in str, or 0 if not found.
/// - LOCATE(substr, str, pos) - Same, but starts searching from position pos.
///
/// A NULL substr, str or pos yields a NULL result; a pos below 1 yields 0.
#[derive(Debug)]
pub struct LocateFunction {
    // Signature handed back by `signature()`.
    signature: Signature,
}
impl LocateFunction {
pub fn register(registry: &FunctionRegistry) {
registry.register_scalar(LocateFunction::default());
}
}
impl Default for LocateFunction {
fn default() -> Self {
// Support 2 or 3 arguments with various string types
let mut signatures = Vec::new();
let string_types = [DataType::Utf8, DataType::LargeUtf8, DataType::Utf8View];
let int_types = [
DataType::Int64,
DataType::Int32,
DataType::Int16,
DataType::Int8,
DataType::UInt64,
DataType::UInt32,
DataType::UInt16,
DataType::UInt8,
];
// 2-argument form: LOCATE(substr, str)
for substr_type in &string_types {
for str_type in &string_types {
signatures.push(TypeSignature::Exact(vec![
substr_type.clone(),
str_type.clone(),
]));
}
}
// 3-argument form: LOCATE(substr, str, pos)
for substr_type in &string_types {
for str_type in &string_types {
for pos_type in &int_types {
signatures.push(TypeSignature::Exact(vec![
substr_type.clone(),
str_type.clone(),
pos_type.clone(),
]));
}
}
}
Self {
signature: Signature::one_of(signatures, Volatility::Immutable),
}
}
}
impl fmt::Display for LocateFunction {
    /// Writes the function name in ASCII upper case.
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        let upper = NAME.to_ascii_uppercase();
        f.write_str(&upper)
    }
}
impl Function for LocateFunction {
    /// Returns the registered function name.
    fn name(&self) -> &str {
        NAME
    }

    /// The result is always `Int64`.
    fn return_type(&self, _: &[DataType]) -> datafusion_common::Result<DataType> {
        Ok(DataType::Int64)
    }

    /// Returns the signature built in `Default::default`.
    fn signature(&self) -> &Signature {
        &self.signature
    }

    /// Evaluates LOCATE(substr, str[, pos]) row by row.
    ///
    /// String arguments are cast to `LargeUtf8` and `pos`, when given, to `Int64`.
    /// A row is NULL when substr, str or pos is NULL, and 0 when `pos < 1`;
    /// otherwise the row value comes from `locate_substr` with a 0-based start.
    ///
    /// Returns an execution error when the argument count is not 2 or 3 or when a
    /// cast fails.
    fn invoke_with_args(
        &self,
        args: ScalarFunctionArgs,
    ) -> datafusion_common::Result<ColumnarValue> {
        let arg_count = args.args.len();
        if arg_count < 2 || arg_count > 3 {
            return Err(DataFusionError::Execution(
                "LOCATE requires 2 or 3 arguments: LOCATE(substr, str) or LOCATE(substr, str, pos)"
                    .to_string(),
            ));
        }

        let arrays = ColumnarValue::values_to_arrays(&args.args)?;

        // Cast string arguments to LargeUtf8 for uniform access
        let needle_array = cast_to_large_utf8(&arrays[0], "substr")?;
        let haystack_array = cast_to_large_utf8(&arrays[1], "str")?;
        let needles = needle_array.as_string::<i64>();
        let haystacks = haystack_array.as_string::<i64>();
        let row_count = needles.len();

        // The start position is only present in the 3-argument form.
        let pos_array: Option<ArrayRef> = match arg_count {
            3 => Some(cast_to_int64(&arrays[2], "pos")?),
            _ => None,
        };
        let positions = pos_array
            .as_ref()
            .map(|arr| arr.as_primitive::<datafusion_common::arrow::datatypes::Int64Type>());

        let mut builder = Int64Builder::with_capacity(row_count);
        for row in 0..row_count {
            if needles.is_null(row) || haystacks.is_null(row) {
                builder.append_null();
                continue;
            }

            // 0-based start index (positions are 1-based in MySQL).
            let start_pos = match positions {
                None => 0,
                Some(pos_values) if pos_values.is_null(row) => {
                    builder.append_null();
                    continue;
                }
                Some(pos_values) => {
                    let pos = pos_values.value(row);
                    if pos < 1 {
                        // MySQL returns 0 for pos < 1
                        builder.append_value(0);
                        continue;
                    }
                    (pos - 1) as usize
                }
            };

            let found = locate_substr(haystacks.value(row), needles.value(row), start_pos);
            builder.append_value(found);
        }

        Ok(ColumnarValue::Array(Arc::new(builder.finish())))
    }
}
/// Converts `array` to `LargeUtf8` so all string inputs can be read through one accessor.
///
/// `name` identifies the argument in the error message if the cast fails.
fn cast_to_large_utf8(array: &ArrayRef, name: &str) -> datafusion_common::Result<ArrayRef> {
    match cast(array.as_ref(), &DataType::LargeUtf8) {
        Ok(converted) => Ok(converted),
        Err(e) => Err(DataFusionError::Execution(format!(
            "LOCATE: {} cast failed: {}",
            name, e
        ))),
    }
}
/// Converts `array` to `Int64` so the position argument can be read through one accessor.
///
/// `name` identifies the argument in the error message if the cast fails.
fn cast_to_int64(array: &ArrayRef, name: &str) -> datafusion_common::Result<ArrayRef> {
    match cast(array.as_ref(), &DataType::Int64) {
        Ok(converted) => Ok(converted),
        Err(e) => Err(DataFusionError::Execution(format!(
            "LOCATE: {} cast failed: {}",
            name, e
        ))),
    }
}
/// Returns the 1-based character position of `needle` within `haystack`,
/// searching from `start_pos` (a 0-based character index), or 0 when there is no match.
///
/// An empty `needle` matches at `start_pos` itself as long as `start_pos` is
/// no more than one past the last character.
fn locate_substr(haystack: &str, needle: &str, start_pos: usize) -> i64 {
    if needle.is_empty() {
        let total_chars = haystack.chars().count();
        if start_pos > total_chars {
            return 0;
        }
        return (start_pos + 1) as i64;
    }

    // Translate the character offset into a byte offset. When no character
    // exists at `start_pos`, the search would begin at or past the end of the
    // string, where a non-empty needle cannot match.
    let offset = match haystack.char_indices().nth(start_pos) {
        Some((byte_idx, _)) => byte_idx,
        None => return 0,
    };

    let tail = &haystack[offset..];
    match tail.find(needle) {
        Some(hit) => {
            // `hit` is a byte offset into `tail`; count characters before it
            // and shift back to a 1-based position in the full string.
            let chars_before = tail[..hit].chars().count();
            (start_pos + chars_before + 1) as i64
        }
        None => 0,
    }
}
#[cfg(test)]
mod tests {
    use std::sync::Arc;

    use datafusion_common::arrow::array::StringArray;
    use datafusion_common::arrow::datatypes::Field;
    use datafusion_expr::ScalarFunctionArgs;

    use super::*;

    /// Wraps `arrays` into `ScalarFunctionArgs`, with one nullable field per
    /// argument and an `Int64` return field.
    fn create_args(arrays: Vec<ArrayRef>) -> ScalarFunctionArgs {
        let arg_fields: Vec<_> = arrays
            .iter()
            .enumerate()
            .map(|(i, arr)| {
                Arc::new(Field::new(
                    format!("arg_{}", i),
                    arr.data_type().clone(),
                    true,
                ))
            })
            .collect();
        ScalarFunctionArgs {
            args: arrays.iter().cloned().map(ColumnarValue::Array).collect(),
            arg_fields,
            return_field: Arc::new(Field::new("result", DataType::Int64, true)),
            number_rows: arrays[0].len(),
            config_options: Arc::new(datafusion_common::config::ConfigOptions::default()),
        }
    }

    #[test]
    fn test_locate_basic() {
        let function = LocateFunction::default();
        let substr = Arc::new(StringArray::from(vec!["world", "xyz", "hello"]));
        let str_arr = Arc::new(StringArray::from(vec![
            "hello world",
            "hello world",
            "hello world",
        ]));
        let args = create_args(vec![substr, str_arr]);
        let result = function.invoke_with_args(args).unwrap();
        if let ColumnarValue::Array(array) = result {
            let int_array = array.as_primitive::<datafusion_common::arrow::datatypes::Int64Type>();
            assert_eq!(int_array.value(0), 7); // "world" at position 7
            assert_eq!(int_array.value(1), 0); // "xyz" not found
            assert_eq!(int_array.value(2), 1); // "hello" at position 1
        } else {
            panic!("Expected array result");
        }
    }

    #[test]
    fn test_locate_with_position() {
        let function = LocateFunction::default();
        let substr = Arc::new(StringArray::from(vec!["o", "o", "o"]));
        let str_arr = Arc::new(StringArray::from(vec![
            "hello world",
            "hello world",
            "hello world",
        ]));
        let pos = Arc::new(datafusion_common::arrow::array::Int64Array::from(vec![
            1, 5, 8,
        ]));
        let args = create_args(vec![substr, str_arr, pos]);
        let result = function.invoke_with_args(args).unwrap();
        if let ColumnarValue::Array(array) = result {
            let int_array = array.as_primitive::<datafusion_common::arrow::datatypes::Int64Type>();
            assert_eq!(int_array.value(0), 5); // first 'o' at position 5
            assert_eq!(int_array.value(1), 5); // 'o' at position 5 (start from 5)
            assert_eq!(int_array.value(2), 8); // 'o' in "world" at position 8
        } else {
            panic!("Expected array result");
        }
    }

    #[test]
    fn test_locate_unicode() {
        let function = LocateFunction::default();
        // The needles must be the multi-byte characters themselves: an empty
        // needle would match at position 1 and never exercise the
        // character-versus-byte arithmetic this test is about.
        let substr = Arc::new(StringArray::from(vec!["世", "界"]));
        let str_arr = Arc::new(StringArray::from(vec!["hello世界", "hello世界"]));
        let args = create_args(vec![substr, str_arr]);
        let result = function.invoke_with_args(args).unwrap();
        if let ColumnarValue::Array(array) = result {
            let int_array = array.as_primitive::<datafusion_common::arrow::datatypes::Int64Type>();
            assert_eq!(int_array.value(0), 6); // "世" at position 6
            assert_eq!(int_array.value(1), 7); // "界" at position 7
        } else {
            panic!("Expected array result");
        }
    }

    #[test]
    fn test_locate_empty_needle() {
        let function = LocateFunction::default();
        let substr = Arc::new(StringArray::from(vec!["", ""]));
        let str_arr = Arc::new(StringArray::from(vec!["hello", "hello"]));
        let pos = Arc::new(datafusion_common::arrow::array::Int64Array::from(vec![
            1, 3,
        ]));
        let args = create_args(vec![substr, str_arr, pos]);
        let result = function.invoke_with_args(args).unwrap();
        if let ColumnarValue::Array(array) = result {
            let int_array = array.as_primitive::<datafusion_common::arrow::datatypes::Int64Type>();
            assert_eq!(int_array.value(0), 1); // empty string at pos 1
            assert_eq!(int_array.value(1), 3); // empty string at pos 3
        } else {
            panic!("Expected array result");
        }
    }

    #[test]
    fn test_locate_with_nulls() {
        let function = LocateFunction::default();
        let substr = Arc::new(StringArray::from(vec![Some("o"), None]));
        let str_arr = Arc::new(StringArray::from(vec![Some("hello"), Some("hello")]));
        let args = create_args(vec![substr, str_arr]);
        let result = function.invoke_with_args(args).unwrap();
        if let ColumnarValue::Array(array) = result {
            let int_array = array.as_primitive::<datafusion_common::arrow::datatypes::Int64Type>();
            assert_eq!(int_array.value(0), 5);
            assert!(int_array.is_null(1)); // NULL needle yields NULL
        } else {
            panic!("Expected array result");
        }
    }
}

View File

@@ -0,0 +1,252 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//! MySQL-compatible SPACE function implementation.
//!
//! SPACE(N) - Returns a string consisting of N space characters.
use std::fmt;
use std::sync::Arc;
use datafusion_common::DataFusionError;
use datafusion_common::arrow::array::{Array, AsArray, LargeStringBuilder};
use datafusion_common::arrow::datatypes::DataType;
use datafusion_expr::{ColumnarValue, ScalarFunctionArgs, Signature, TypeSignature, Volatility};
use crate::function::Function;
use crate::function_registry::FunctionRegistry;
const NAME: &str = "space";
/// Safety limit for the number of spaces a single row may request; larger
/// requests are rejected with an execution error instead of being allocated.
const MAX_SPACE_COUNT: i64 = 1024 * 1024; // 1 MiB of spaces
/// MySQL-compatible SPACE function.
///
/// Syntax: SPACE(N)
/// Returns a string consisting of N space characters.
/// Returns NULL if N is NULL.
/// Returns empty string if N < 0.
/// Fails with an execution error if N exceeds `MAX_SPACE_COUNT`.
#[derive(Debug)]
pub struct SpaceFunction {
    // Accepted argument types: one signed or unsigned integer of any width.
    signature: Signature,
}
impl SpaceFunction {
pub fn register(registry: &FunctionRegistry) {
registry.register_scalar(SpaceFunction::default());
}
}
impl Default for SpaceFunction {
fn default() -> Self {
Self {
signature: Signature::one_of(
vec![
TypeSignature::Exact(vec![DataType::Int64]),
TypeSignature::Exact(vec![DataType::Int32]),
TypeSignature::Exact(vec![DataType::Int16]),
TypeSignature::Exact(vec![DataType::Int8]),
TypeSignature::Exact(vec![DataType::UInt64]),
TypeSignature::Exact(vec![DataType::UInt32]),
TypeSignature::Exact(vec![DataType::UInt16]),
TypeSignature::Exact(vec![DataType::UInt8]),
],
Volatility::Immutable,
),
}
}
}
impl fmt::Display for SpaceFunction {
    /// Writes the function name in upper case.
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        let upper_name = NAME.to_ascii_uppercase();
        f.write_str(&upper_name)
    }
}
impl Function for SpaceFunction {
    fn name(&self) -> &str {
        NAME
    }

    /// The result is always `LargeUtf8`, whatever the integer argument type.
    fn return_type(&self, _: &[DataType]) -> datafusion_common::Result<DataType> {
        Ok(DataType::LargeUtf8)
    }

    fn signature(&self) -> &Signature {
        &self.signature
    }

    /// Evaluates SPACE row by row and returns a `LargeUtf8` array.
    ///
    /// A NULL count yields NULL, a negative count yields an empty string, and
    /// any other count yields that many spaces.
    ///
    /// # Errors
    /// Returns an execution error if the argument count is not 1, if a count
    /// cannot be read as an `i64`, or if a count exceeds `MAX_SPACE_COUNT`.
    fn invoke_with_args(
        &self,
        args: ScalarFunctionArgs,
    ) -> datafusion_common::Result<ColumnarValue> {
        if args.args.len() != 1 {
            return Err(DataFusionError::Execution(
                "SPACE requires exactly 1 argument: SPACE(N)".to_string(),
            ));
        }

        let arrays = ColumnarValue::values_to_arrays(&args.args)?;
        let counts = &arrays[0];
        let row_count = counts.len();
        let mut output = LargeStringBuilder::with_capacity(row_count, row_count * 10);

        for row in 0..row_count {
            if counts.is_null(row) {
                output.append_null();
                continue;
            }
            match get_int_value(counts, row)? {
                // MySQL returns empty string for negative values
                n if n < 0 => output.append_value(""),
                n if n > MAX_SPACE_COUNT => {
                    return Err(DataFusionError::Execution(format!(
                        "SPACE: requested {} spaces exceeds maximum allowed ({})",
                        n, MAX_SPACE_COUNT
                    )));
                }
                n => output.append_value(" ".repeat(n as usize)),
            }
        }

        Ok(ColumnarValue::Array(Arc::new(output.finish())))
    }
}
/// Reads the element at `index` from an integer array of any supported width
/// and widens it to `i64`.
///
/// # Errors
/// Returns an execution error if a `UInt64` value does not fit in `i64`, or if
/// the array is not one of the supported integer types.
fn get_int_value(
    array: &datafusion_common::arrow::array::ArrayRef,
    index: usize,
) -> datafusion_common::Result<i64> {
    use datafusion_common::arrow::datatypes as arrow_types;

    let widened = match array.data_type() {
        DataType::Int64 => array.as_primitive::<arrow_types::Int64Type>().value(index),
        DataType::Int32 => i64::from(array.as_primitive::<arrow_types::Int32Type>().value(index)),
        DataType::Int16 => i64::from(array.as_primitive::<arrow_types::Int16Type>().value(index)),
        DataType::Int8 => i64::from(array.as_primitive::<arrow_types::Int8Type>().value(index)),
        DataType::UInt64 => {
            // The only width that can overflow i64, so it needs a checked conversion.
            let raw = array.as_primitive::<arrow_types::UInt64Type>().value(index);
            return i64::try_from(raw).map_err(|_| {
                DataFusionError::Execution(format!("SPACE: value {} exceeds maximum", raw))
            });
        }
        DataType::UInt32 => i64::from(array.as_primitive::<arrow_types::UInt32Type>().value(index)),
        DataType::UInt16 => i64::from(array.as_primitive::<arrow_types::UInt16Type>().value(index)),
        DataType::UInt8 => i64::from(array.as_primitive::<arrow_types::UInt8Type>().value(index)),
        other => {
            return Err(DataFusionError::Execution(format!(
                "SPACE: unsupported type {:?}",
                other
            )));
        }
    };
    Ok(widened)
}
#[cfg(test)]
mod tests {
    use std::sync::Arc;

    use datafusion_common::arrow::array::Int64Array;
    use datafusion_common::arrow::datatypes::Field;
    use datafusion_expr::ScalarFunctionArgs;

    use super::*;

    /// Wraps `arrays` into `ScalarFunctionArgs`, with one nullable field per
    /// argument and a `LargeUtf8` return field.
    fn create_args(arrays: Vec<datafusion_common::arrow::array::ArrayRef>) -> ScalarFunctionArgs {
        let arg_fields: Vec<_> = arrays
            .iter()
            .enumerate()
            .map(|(i, arr)| {
                Arc::new(Field::new(
                    format!("arg_{}", i),
                    arr.data_type().clone(),
                    true,
                ))
            })
            .collect();
        ScalarFunctionArgs {
            args: arrays.iter().cloned().map(ColumnarValue::Array).collect(),
            arg_fields,
            return_field: Arc::new(Field::new("result", DataType::LargeUtf8, true)),
            number_rows: arrays[0].len(),
            config_options: Arc::new(datafusion_common::config::ConfigOptions::default()),
        }
    }

    #[test]
    fn test_space_basic() {
        let function = SpaceFunction::default();
        let n = Arc::new(Int64Array::from(vec![0, 1, 5]));
        let args = create_args(vec![n]);
        let result = function.invoke_with_args(args).unwrap();
        if let ColumnarValue::Array(array) = result {
            let str_array = array.as_string::<i64>();
            // Expected strings are built with `repeat` so the number of
            // spaces is explicit and survives whitespace-collapsing tooling.
            assert_eq!(str_array.value(0), "");
            assert_eq!(str_array.value(1), " ".repeat(1));
            assert_eq!(str_array.value(2), " ".repeat(5));
        } else {
            panic!("Expected array result");
        }
    }

    #[test]
    fn test_space_negative() {
        let function = SpaceFunction::default();
        let n = Arc::new(Int64Array::from(vec![-1, -100]));
        let args = create_args(vec![n]);
        let result = function.invoke_with_args(args).unwrap();
        if let ColumnarValue::Array(array) = result {
            let str_array = array.as_string::<i64>();
            // Negative counts produce an empty string rather than an error.
            assert_eq!(str_array.value(0), "");
            assert_eq!(str_array.value(1), "");
        } else {
            panic!("Expected array result");
        }
    }

    #[test]
    fn test_space_with_nulls() {
        let function = SpaceFunction::default();
        let n = Arc::new(Int64Array::from(vec![Some(3), None]));
        let args = create_args(vec![n]);
        let result = function.invoke_with_args(args).unwrap();
        if let ColumnarValue::Array(array) = result {
            let str_array = array.as_string::<i64>();
            assert_eq!(str_array.value(0), " ".repeat(3));
            assert!(str_array.is_null(1)); // NULL count yields NULL
        } else {
            panic!("Expected array result");
        }
    }
}

View File

@@ -15,9 +15,14 @@
use std::{fmt, mem};
use common_telemetry::debug;
use snafu::ensure;
use tokio::sync::{OwnedSemaphorePermit, TryAcquireError};
use crate::error::{
MemoryAcquireTimeoutSnafu, MemoryLimitExceededSnafu, MemorySemaphoreClosedSnafu, Result,
};
use crate::manager::{MemoryMetrics, MemoryQuota};
use crate::policy::OnExhaustedPolicy;
/// Guard representing a slice of reserved memory.
pub struct MemoryGuard<M: MemoryMetrics> {
@@ -55,11 +60,52 @@ impl<M: MemoryMetrics> MemoryGuard<M> {
}
}
/// Tries to allocate additional memory during task execution.
/// Waits until `bytes` of additional memory can be reserved, then merges it into this guard.
///
/// Unlimited guards and zero-byte requests return immediately without
/// touching the semaphore.
///
/// # Errors
/// - `MemoryLimitExceeded` if this guard's current permits plus the request would exceed the quota's limit
/// - `MemorySemaphoreClosed` if the semaphore is closed while waiting
pub async fn acquire_additional(&mut self, bytes: u64) -> Result<()> {
    let (permit, quota) = match &mut self.state {
        GuardState::Unlimited => return Ok(()),
        GuardState::Limited { permit, quota } => (permit, quota),
    };
    if bytes == 0 {
        return Ok(());
    }

    // Reject requests that could never be satisfied before waiting on the semaphore.
    let wanted_permits = quota.bytes_to_permits(bytes);
    let held_permits = permit.num_permits() as u32;
    let fits_within_limit = held_permits.saturating_add(wanted_permits) <= quota.limit_permits;
    ensure!(
        fits_within_limit,
        MemoryLimitExceededSnafu {
            requested_bytes: bytes,
            limit_bytes: quota.permits_to_bytes(quota.limit_permits)
        }
    );

    let semaphore = quota.semaphore.clone();
    let granted = semaphore
        .acquire_many_owned(wanted_permits)
        .await
        .map_err(|_| MemorySemaphoreClosedSnafu.build())?;
    permit.merge(granted);
    quota.update_in_use_metric();
    debug!("Acquired additional {} bytes", bytes);
    Ok(())
}
/// Tries to acquire additional memory without waiting.
///
/// On success, merges the new memory into this guard and returns true.
/// On failure, returns false and leaves this guard unchanged.
pub fn request_additional(&mut self, bytes: u64) -> bool {
pub fn try_acquire_additional(&mut self, bytes: u64) -> bool {
match &mut self.state {
GuardState::Unlimited => true,
GuardState::Limited { permit, quota } => {
@@ -77,11 +123,11 @@ impl<M: MemoryMetrics> MemoryGuard<M> {
Ok(additional_permit) => {
permit.merge(additional_permit);
quota.update_in_use_metric();
debug!("Allocated additional {} bytes", bytes);
debug!("Acquired additional {} bytes", bytes);
true
}
Err(TryAcquireError::NoPermits) | Err(TryAcquireError::Closed) => {
quota.metrics.inc_rejected("request_additional");
quota.metrics.inc_rejected("try_acquire_additional");
false
}
}
@@ -89,11 +135,55 @@ impl<M: MemoryMetrics> MemoryGuard<M> {
}
}
/// Releases a portion of granted memory back to the pool early,
/// before the guard is dropped.
/// Acquires additional memory, handling exhaustion according to `policy`.
///
/// - `OnExhaustedPolicy::Wait`: waits up to `timeout` for the memory to become available
/// - `OnExhaustedPolicy::Fail`: makes a single non-blocking attempt
///
/// # Errors
/// - `MemoryLimitExceeded`: the request would exceed the total limit (both policies), or memory is currently exhausted (Fail policy only)
/// - `MemoryAcquireTimeout`: the timeout elapsed while waiting (Wait policy only)
/// - `MemorySemaphoreClosed`: the internal semaphore was closed
pub async fn acquire_additional_with_policy(
    &mut self,
    bytes: u64,
    policy: OnExhaustedPolicy,
) -> Result<()> {
    let timeout = match policy {
        OnExhaustedPolicy::Wait { timeout } => timeout,
        OnExhaustedPolicy::Fail => {
            if self.try_acquire_additional(bytes) {
                return Ok(());
            }
            let limit_bytes = match &self.state {
                GuardState::Unlimited => 0, // unreachable: unlimited mode always succeeds
                GuardState::Limited { quota, .. } => quota.permits_to_bytes(quota.limit_permits),
            };
            return MemoryLimitExceededSnafu {
                requested_bytes: bytes,
                limit_bytes,
            }
            .fail();
        }
    };

    // Wait policy: bound the blocking acquire with the configured timeout.
    match tokio::time::timeout(timeout, self.acquire_additional(bytes)).await {
        Ok(outcome) => outcome,
        Err(_elapsed) => MemoryAcquireTimeoutSnafu {
            requested_bytes: bytes,
            waited: timeout,
        }
        .fail(),
    }
}
/// Releases a portion of granted memory back to the pool before the guard is dropped.
///
/// Returns true if the release succeeds or is a no-op; false if the request exceeds granted.
pub fn early_release_partial(&mut self, bytes: u64) -> bool {
pub fn release_partial(&mut self, bytes: u64) -> bool {
match &mut self.state {
GuardState::Unlimited => true,
GuardState::Limited { permit, quota } => {
@@ -109,7 +199,7 @@ impl<M: MemoryMetrics> MemoryGuard<M> {
quota.permits_to_bytes(released_permit.num_permits() as u32);
drop(released_permit);
quota.update_in_use_metric();
debug!("Early released {} bytes from memory guard", released_bytes);
debug!("Released {} bytes from memory guard", released_bytes);
true
}
None => false,

View File

@@ -83,7 +83,7 @@ fn test_request_additional_success() {
assert_eq!(manager.used_bytes(), base);
// Request additional memory (3MB) - should succeed and merge
assert!(guard.request_additional(3 * PERMIT_GRANULARITY_BYTES));
assert!(guard.try_acquire_additional(3 * PERMIT_GRANULARITY_BYTES));
assert_eq!(guard.granted_bytes(), 8 * PERMIT_GRANULARITY_BYTES);
assert_eq!(manager.used_bytes(), 8 * PERMIT_GRANULARITY_BYTES);
}
@@ -98,11 +98,11 @@ fn test_request_additional_exceeds_limit() {
let mut guard = manager.try_acquire(base).unwrap();
// Request additional memory (3MB) - should succeed
assert!(guard.request_additional(3 * PERMIT_GRANULARITY_BYTES));
assert!(guard.try_acquire_additional(3 * PERMIT_GRANULARITY_BYTES));
assert_eq!(manager.used_bytes(), 8 * PERMIT_GRANULARITY_BYTES);
// Request more (3MB) - should fail (would exceed 10MB limit)
let result = guard.request_additional(3 * PERMIT_GRANULARITY_BYTES);
let result = guard.try_acquire_additional(3 * PERMIT_GRANULARITY_BYTES);
assert!(!result);
// Still at 8MB
@@ -119,7 +119,7 @@ fn test_request_additional_auto_release_on_guard_drop() {
let mut guard = manager.try_acquire(5 * PERMIT_GRANULARITY_BYTES).unwrap();
// Request additional - memory is merged into guard
assert!(guard.request_additional(3 * PERMIT_GRANULARITY_BYTES));
assert!(guard.try_acquire_additional(3 * PERMIT_GRANULARITY_BYTES));
assert_eq!(manager.used_bytes(), 8 * PERMIT_GRANULARITY_BYTES);
// When guard drops, all memory (base + additional) is released together
@@ -135,7 +135,7 @@ fn test_request_additional_unlimited() {
let mut guard = manager.try_acquire(5 * PERMIT_GRANULARITY_BYTES).unwrap();
// Should always succeed with unlimited manager
assert!(guard.request_additional(100 * PERMIT_GRANULARITY_BYTES));
assert!(guard.try_acquire_additional(100 * PERMIT_GRANULARITY_BYTES));
assert_eq!(guard.granted_bytes(), 0);
assert_eq!(manager.used_bytes(), 0);
}
@@ -148,7 +148,7 @@ fn test_request_additional_zero_bytes() {
let mut guard = manager.try_acquire(5 * PERMIT_GRANULARITY_BYTES).unwrap();
// Request 0 bytes should succeed without affecting anything
assert!(guard.request_additional(0));
assert!(guard.try_acquire_additional(0));
assert_eq!(guard.granted_bytes(), 5 * PERMIT_GRANULARITY_BYTES);
assert_eq!(manager.used_bytes(), 5 * PERMIT_GRANULARITY_BYTES);
}
@@ -162,7 +162,7 @@ fn test_early_release_partial_success() {
assert_eq!(manager.used_bytes(), 8 * PERMIT_GRANULARITY_BYTES);
// Release half
assert!(guard.early_release_partial(4 * PERMIT_GRANULARITY_BYTES));
assert!(guard.release_partial(4 * PERMIT_GRANULARITY_BYTES));
assert_eq!(guard.granted_bytes(), 4 * PERMIT_GRANULARITY_BYTES);
assert_eq!(manager.used_bytes(), 4 * PERMIT_GRANULARITY_BYTES);
@@ -177,7 +177,7 @@ fn test_early_release_partial_exceeds_granted() {
let mut guard = manager.try_acquire(5 * PERMIT_GRANULARITY_BYTES).unwrap();
// Try to release more than granted - should fail
assert!(!guard.early_release_partial(10 * PERMIT_GRANULARITY_BYTES));
assert!(!guard.release_partial(10 * PERMIT_GRANULARITY_BYTES));
assert_eq!(guard.granted_bytes(), 5 * PERMIT_GRANULARITY_BYTES);
assert_eq!(manager.used_bytes(), 5 * PERMIT_GRANULARITY_BYTES);
}
@@ -188,7 +188,7 @@ fn test_early_release_partial_unlimited() {
let mut guard = manager.try_acquire(100 * PERMIT_GRANULARITY_BYTES).unwrap();
// Unlimited guard - release should succeed (no-op)
assert!(guard.early_release_partial(50 * PERMIT_GRANULARITY_BYTES));
assert!(guard.release_partial(50 * PERMIT_GRANULARITY_BYTES));
assert_eq!(guard.granted_bytes(), 0);
}
@@ -200,22 +200,22 @@ fn test_request_and_early_release_symmetry() {
let mut guard = manager.try_acquire(5 * PERMIT_GRANULARITY_BYTES).unwrap();
// Request additional
assert!(guard.request_additional(5 * PERMIT_GRANULARITY_BYTES));
assert!(guard.try_acquire_additional(5 * PERMIT_GRANULARITY_BYTES));
assert_eq!(guard.granted_bytes(), 10 * PERMIT_GRANULARITY_BYTES);
assert_eq!(manager.used_bytes(), 10 * PERMIT_GRANULARITY_BYTES);
// Early release some
assert!(guard.early_release_partial(3 * PERMIT_GRANULARITY_BYTES));
assert!(guard.release_partial(3 * PERMIT_GRANULARITY_BYTES));
assert_eq!(guard.granted_bytes(), 7 * PERMIT_GRANULARITY_BYTES);
assert_eq!(manager.used_bytes(), 7 * PERMIT_GRANULARITY_BYTES);
// Request again
assert!(guard.request_additional(2 * PERMIT_GRANULARITY_BYTES));
assert!(guard.try_acquire_additional(2 * PERMIT_GRANULARITY_BYTES));
assert_eq!(guard.granted_bytes(), 9 * PERMIT_GRANULARITY_BYTES);
assert_eq!(manager.used_bytes(), 9 * PERMIT_GRANULARITY_BYTES);
// Early release again
assert!(guard.early_release_partial(4 * PERMIT_GRANULARITY_BYTES));
assert!(guard.release_partial(4 * PERMIT_GRANULARITY_BYTES));
assert_eq!(guard.granted_bytes(), 5 * PERMIT_GRANULARITY_BYTES);
assert_eq!(manager.used_bytes(), 5 * PERMIT_GRANULARITY_BYTES);
@@ -226,25 +226,186 @@ fn test_request_and_early_release_symmetry() {
#[test]
fn test_small_allocation_rounds_up() {
// Test that allocations smaller than PERMIT_GRANULARITY_BYTES
// round up to 1 permit and can use request_additional()
// round up to 1 permit and can use try_acquire_additional()
let limit = 10 * PERMIT_GRANULARITY_BYTES;
let manager = MemoryManager::new(limit, NoOpMetrics);
let mut guard = manager.try_acquire(512 * 1024).unwrap(); // 512KB
assert_eq!(guard.granted_bytes(), PERMIT_GRANULARITY_BYTES); // Rounds up to 1MB
assert!(guard.request_additional(2 * PERMIT_GRANULARITY_BYTES)); // Can request more
assert!(guard.try_acquire_additional(2 * PERMIT_GRANULARITY_BYTES)); // Can request more
assert_eq!(guard.granted_bytes(), 3 * PERMIT_GRANULARITY_BYTES);
}
#[test]
fn test_acquire_zero_bytes_lazy_allocation() {
// Test that acquire(0) returns 0 permits but can request_additional() later
// Test that acquire(0) returns 0 permits but can try_acquire_additional() later
let manager = MemoryManager::new(10 * PERMIT_GRANULARITY_BYTES, NoOpMetrics);
let mut guard = manager.try_acquire(0).unwrap();
assert_eq!(guard.granted_bytes(), 0); // No permits consumed
assert_eq!(manager.used_bytes(), 0);
assert!(guard.request_additional(3 * PERMIT_GRANULARITY_BYTES)); // Lazy allocation
assert!(guard.try_acquire_additional(3 * PERMIT_GRANULARITY_BYTES)); // Lazy allocation
assert_eq!(guard.granted_bytes(), 3 * PERMIT_GRANULARITY_BYTES);
}
// Verifies that `acquire_additional` waits while memory is short and
// completes once another guard releases enough.
#[tokio::test(flavor = "current_thread")]
async fn test_acquire_additional_blocks_and_unblocks() {
    let limit = 10 * PERMIT_GRANULARITY_BYTES;
    let manager = MemoryManager::new(limit, NoOpMetrics);
    // First guard takes 9MB, leaving only 1MB available
    let mut guard1 = manager.try_acquire(9 * PERMIT_GRANULARITY_BYTES).unwrap();
    assert_eq!(manager.used_bytes(), 9 * PERMIT_GRANULARITY_BYTES);
    // Spawn a task that will block trying to acquire an additional 5MB
    // (only 1MB is free, so it must wait until guard1 releases memory)
    let manager_clone = manager.clone();
    let waiter = tokio::spawn(async move {
        let mut guard2 = manager_clone.try_acquire(0).unwrap();
        // This will block until enough memory is available
        guard2
            .acquire_additional(5 * PERMIT_GRANULARITY_BYTES)
            .await
            .unwrap();
        guard2
    });
    // Yield so the spawned task gets a chance to run and start waiting.
    sleep(Duration::from_millis(10)).await;
    // Release 5MB from guard1 - this should unblock the waiter
    assert!(guard1.release_partial(5 * PERMIT_GRANULARITY_BYTES));
    // Waiter should complete now
    let guard2 = waiter.await.unwrap();
    assert_eq!(guard2.granted_bytes(), 5 * PERMIT_GRANULARITY_BYTES);
    // Total: guard1 has 4MB, guard2 has 5MB = 9MB
    assert_eq!(manager.used_bytes(), 9 * PERMIT_GRANULARITY_BYTES);
}
#[tokio::test(flavor = "current_thread")]
async fn test_acquire_additional_exceeds_total_limit() {
    // A 10MB manager with 8MB already held by the guard under test.
    let manager = MemoryManager::new(10 * PERMIT_GRANULARITY_BYTES, NoOpMetrics);
    let held_bytes = 8 * PERMIT_GRANULARITY_BYTES;
    let mut guard = manager.try_acquire(held_bytes).unwrap();

    // 8MB + 5MB is over the 10MB limit, so the request must be rejected.
    let outcome = guard.acquire_additional(5 * PERMIT_GRANULARITY_BYTES).await;
    assert!(outcome.is_err());

    // The failed request leaves both the guard and the manager untouched.
    assert_eq!(guard.granted_bytes(), held_bytes);
    assert_eq!(manager.used_bytes(), held_bytes);
}
#[tokio::test(flavor = "current_thread")]
async fn test_acquire_additional_success() {
    let manager = MemoryManager::new(10 * PERMIT_GRANULARITY_BYTES, NoOpMetrics);

    // Start with 3MB granted.
    let mut guard = manager.try_acquire(3 * PERMIT_GRANULARITY_BYTES).unwrap();
    assert_eq!(manager.used_bytes(), 3 * PERMIT_GRANULARITY_BYTES);

    // 4MB more fits within the limit, so this resolves without waiting on anyone.
    let extra_bytes = 4 * PERMIT_GRANULARITY_BYTES;
    guard.acquire_additional(extra_bytes).await.unwrap();

    let expected_total = 7 * PERMIT_GRANULARITY_BYTES;
    assert_eq!(guard.granted_bytes(), expected_total);
    assert_eq!(manager.used_bytes(), expected_total);
}
// Verifies that the Wait policy succeeds when memory is released before the timeout.
#[tokio::test(flavor = "current_thread")]
async fn test_acquire_additional_with_policy_wait_success() {
    use crate::policy::OnExhaustedPolicy;
    let limit = 10 * PERMIT_GRANULARITY_BYTES;
    let manager = MemoryManager::new(limit, NoOpMetrics);
    // guard1 holds 8MB, leaving 2MB free - less than the 5MB the waiter asks for
    let mut guard1 = manager.try_acquire(8 * PERMIT_GRANULARITY_BYTES).unwrap();
    let manager_clone = manager.clone();
    let waiter = tokio::spawn(async move {
        let mut guard2 = manager_clone.try_acquire(0).unwrap();
        // Wait policy with 1 second timeout
        guard2
            .acquire_additional_with_policy(
                5 * PERMIT_GRANULARITY_BYTES,
                OnExhaustedPolicy::Wait {
                    timeout: Duration::from_secs(1),
                },
            )
            .await
            .unwrap();
        guard2
    });
    // Yield so the spawned task gets a chance to run and start waiting.
    sleep(Duration::from_millis(10)).await;
    // Release memory to unblock waiter
    assert!(guard1.release_partial(5 * PERMIT_GRANULARITY_BYTES));
    let guard2 = waiter.await.unwrap();
    assert_eq!(guard2.granted_bytes(), 5 * PERMIT_GRANULARITY_BYTES);
}
// Verifies that the Wait policy returns an error when nothing is released within the timeout.
#[tokio::test(flavor = "current_thread")]
async fn test_acquire_additional_with_policy_wait_timeout() {
    use crate::policy::OnExhaustedPolicy;
    let limit = 10 * PERMIT_GRANULARITY_BYTES;
    let manager = MemoryManager::new(limit, NoOpMetrics);
    // Take all memory; the guard stays alive for the whole test so nothing is freed
    let _guard1 = manager.try_acquire(10 * PERMIT_GRANULARITY_BYTES).unwrap();
    let mut guard2 = manager.try_acquire(0).unwrap();
    // Try to acquire with short timeout - should timeout
    let result = guard2
        .acquire_additional_with_policy(
            5 * PERMIT_GRANULARITY_BYTES,
            OnExhaustedPolicy::Wait {
                timeout: Duration::from_millis(50),
            },
        )
        .await;
    assert!(result.is_err());
    // The failed request must not leave any memory attached to the guard
    assert_eq!(guard2.granted_bytes(), 0);
}
#[tokio::test(flavor = "current_thread")]
async fn test_acquire_additional_with_policy_fail() {
    use crate::policy::OnExhaustedPolicy;

    let manager = MemoryManager::new(10 * PERMIT_GRANULARITY_BYTES, NoOpMetrics);
    // Another guard holds 8MB, so only 2MB remain free.
    let _holder = manager.try_acquire(8 * PERMIT_GRANULARITY_BYTES).unwrap();
    let mut guard = manager.try_acquire(0).unwrap();

    // With the Fail policy a 5MB request errors out instead of waiting.
    let wanted_bytes = 5 * PERMIT_GRANULARITY_BYTES;
    let outcome = guard
        .acquire_additional_with_policy(wanted_bytes, OnExhaustedPolicy::Fail)
        .await;

    assert!(outcome.is_err());
    assert_eq!(guard.granted_bytes(), 0);
}
#[tokio::test(flavor = "current_thread")]
async fn test_acquire_additional_unlimited() {
    // A limit of 0 puts the manager in unlimited mode.
    let manager = MemoryManager::new(0, NoOpMetrics);
    let mut guard = manager.try_acquire(0).unwrap();

    // Any request succeeds on an unlimited manager...
    let huge_request = 1000 * PERMIT_GRANULARITY_BYTES;
    guard.acquire_additional(huge_request).await.unwrap();

    // ...and nothing is tracked against the guard or the manager.
    assert_eq!(guard.granted_bytes(), 0);
    assert_eq!(manager.used_bytes(), 0);
}

View File

@@ -66,7 +66,7 @@ use store_api::metric_engine_consts::{
};
use store_api::region_engine::{
RegionEngineRef, RegionManifestInfo, RegionRole, RegionStatistic, SetRegionRoleStateResponse,
SettableRegionRoleState,
SettableRegionRoleState, SyncRegionFromRequest,
};
use store_api::region_request::{
AffectedRows, BatchRegionDdlRequest, RegionCatchupRequest, RegionCloseRequest,
@@ -536,10 +536,13 @@ impl RegionServer {
let tracing_context = TracingContext::from_current_span();
let span = tracing_context.attach(info_span!("RegionServer::handle_sync_region_request"));
self.sync_region(region_id, manifest_info)
.trace(span)
.await
.map(|_| RegionResponse::new(AffectedRows::default()))
self.sync_region(
region_id,
SyncRegionFromRequest::from_manifest(manifest_info),
)
.trace(span)
.await
.map(|_| RegionResponse::new(AffectedRows::default()))
}
/// Handles the ListMetadata request and retrieves metadata for specified regions.
@@ -588,7 +591,7 @@ impl RegionServer {
pub async fn sync_region(
&self,
region_id: RegionId,
manifest_info: RegionManifestInfo,
request: SyncRegionFromRequest,
) -> Result<()> {
let engine_with_status = self
.inner
@@ -597,7 +600,7 @@ impl RegionServer {
.with_context(|| RegionNotFoundSnafu { region_id })?;
self.inner
.handle_sync_region(engine_with_status.engine(), region_id, manifest_info)
.handle_sync_region(engine_with_status.engine(), region_id, request)
.await
}
@@ -1216,7 +1219,8 @@ impl RegionServerInner {
| RegionRequest::Compact(_)
| RegionRequest::Truncate(_)
| RegionRequest::BuildIndex(_)
| RegionRequest::EnterStaging(_) => RegionChange::None,
| RegionRequest::EnterStaging(_)
| RegionRequest::ApplyStagingManifest(_) => RegionChange::None,
RegionRequest::Catchup(_) => RegionChange::Catchup,
};
@@ -1268,10 +1272,10 @@ impl RegionServerInner {
&self,
engine: &RegionEngineRef,
region_id: RegionId,
manifest_info: RegionManifestInfo,
request: SyncRegionFromRequest,
) -> Result<()> {
let Some(new_opened_regions) = engine
.sync_region(region_id, manifest_info)
.sync_region(region_id, request)
.await
.with_context(|_| HandleRegionRequestSnafu { region_id })?
.new_opened_logical_region_ids()

View File

@@ -33,9 +33,9 @@ use servers::grpc::FlightCompression;
use session::context::QueryContextRef;
use store_api::metadata::RegionMetadataRef;
use store_api::region_engine::{
CopyRegionFromRequest, CopyRegionFromResponse, RegionEngine, RegionManifestInfo, RegionRole,
RegionScannerRef, RegionStatistic, RemapManifestsRequest, RemapManifestsResponse,
SetRegionRoleStateResponse, SettableRegionRoleState, SyncManifestResponse,
RegionEngine, RegionRole, RegionScannerRef, RegionStatistic, RemapManifestsRequest,
RemapManifestsResponse, SetRegionRoleStateResponse, SettableRegionRoleState,
SyncRegionFromRequest, SyncRegionFromResponse,
};
use store_api::region_request::{AffectedRows, RegionRequest};
use store_api::storage::{RegionId, ScanRequest, SequenceNumber};
@@ -287,8 +287,8 @@ impl RegionEngine for MockRegionEngine {
async fn sync_region(
&self,
_region_id: RegionId,
_manifest_info: RegionManifestInfo,
) -> Result<SyncManifestResponse, BoxedError> {
_request: SyncRegionFromRequest,
) -> Result<SyncRegionFromResponse, BoxedError> {
unimplemented!()
}
@@ -299,14 +299,6 @@ impl RegionEngine for MockRegionEngine {
unimplemented!()
}
async fn copy_region_from(
&self,
_region_id: RegionId,
_request: CopyRegionFromRequest,
) -> Result<CopyRegionFromResponse, BoxedError> {
unimplemented!()
}
fn as_any(&self) -> &dyn Any {
self
}

View File

@@ -19,7 +19,6 @@ use arrow::datatypes::{
Time64NanosecondType, TimeUnit, TimestampMicrosecondType, TimestampMillisecondType,
TimestampNanosecondType, TimestampSecondType,
};
use arrow_array::Array;
use common_time::time::Time;
use common_time::{Duration, Timestamp};
@@ -127,28 +126,3 @@ pub fn duration_array_value(array: &ArrayRef, i: usize) -> Duration {
};
Duration::new(v, time_unit.into())
}
/// Reads element `i` of a string array as a `&str`.
///
/// `Utf8`, `LargeUtf8` and `Utf8View` arrays are supported. The result is
/// `None` when the slot is null or when `array` has any other data type.
///
/// # Panics
///
/// If index `i` is out of bounds.
pub fn string_array_value_at_index(array: &ArrayRef, i: usize) -> Option<&str> {
    match array.data_type() {
        DataType::Utf8 => {
            let strings = array.as_string::<i32>();
            if strings.is_valid(i) {
                Some(strings.value(i))
            } else {
                None
            }
        }
        DataType::LargeUtf8 => {
            let strings = array.as_string::<i64>();
            if strings.is_valid(i) {
                Some(strings.value(i))
            } else {
                None
            }
        }
        DataType::Utf8View => {
            let views = array.as_string_view();
            if views.is_valid(i) {
                Some(views.value(i))
            } else {
                None
            }
        }
        // Every non-string data type is reported as "no value".
        _ => None,
    }
}

View File

@@ -26,10 +26,9 @@ use object_store::ObjectStore;
use snafu::{OptionExt, ensure};
use store_api::metadata::RegionMetadataRef;
use store_api::region_engine::{
CopyRegionFromRequest, CopyRegionFromResponse, RegionEngine, RegionManifestInfo, RegionRole,
RegionScannerRef, RegionStatistic, RemapManifestsRequest, RemapManifestsResponse,
SetRegionRoleStateResponse, SetRegionRoleStateSuccess, SettableRegionRoleState,
SinglePartitionScanner, SyncManifestResponse,
RegionEngine, RegionRole, RegionScannerRef, RegionStatistic, RemapManifestsRequest,
RemapManifestsResponse, SetRegionRoleStateResponse, SetRegionRoleStateSuccess,
SettableRegionRoleState, SinglePartitionScanner, SyncRegionFromRequest, SyncRegionFromResponse,
};
use store_api::region_request::{
AffectedRows, RegionCloseRequest, RegionCreateRequest, RegionDropRequest, RegionOpenRequest,
@@ -145,10 +144,10 @@ impl RegionEngine for FileRegionEngine {
async fn sync_region(
&self,
_region_id: RegionId,
_manifest_info: RegionManifestInfo,
) -> Result<SyncManifestResponse, BoxedError> {
_request: SyncRegionFromRequest,
) -> Result<SyncRegionFromResponse, BoxedError> {
// File engine doesn't need to sync region manifest.
Ok(SyncManifestResponse::NotSupported)
Ok(SyncRegionFromResponse::NotSupported)
}
async fn remap_manifests(
@@ -163,19 +162,6 @@ impl RegionEngine for FileRegionEngine {
))
}
async fn copy_region_from(
&self,
_region_id: RegionId,
_request: CopyRegionFromRequest,
) -> Result<CopyRegionFromResponse, BoxedError> {
Err(BoxedError::new(
UnsupportedSnafu {
operation: "copy_region_from",
}
.build(),
))
}
fn role(&self, region_id: RegionId) -> Option<RegionRole> {
self.inner.state(region_id)
}

View File

@@ -60,6 +60,7 @@ where
http_server_builder: Option<HttpServerBuilder>,
plugins: Plugins,
flight_handler: Option<FlightCraftRef>,
pub server_memory_limiter: ServerMemoryLimiter,
}
impl<T> Services<T>
@@ -67,6 +68,13 @@ where
T: Into<FrontendOptions> + Configurable + Clone,
{
pub fn new(opts: T, instance: Arc<Instance>, plugins: Plugins) -> Self {
let feopts = opts.clone().into();
// Create server request memory limiter for all server protocols
let server_memory_limiter = ServerMemoryLimiter::new(
feopts.max_in_flight_write_bytes.as_bytes(),
feopts.write_bytes_exhausted_policy,
);
Self {
opts,
instance,
@@ -74,6 +82,7 @@ where
http_server_builder: None,
plugins,
flight_handler: None,
server_memory_limiter,
}
}
@@ -274,12 +283,6 @@ where
let toml = opts.to_toml().context(TomlFormatSnafu)?;
let opts: FrontendOptions = opts.into();
// Create request memory limiter for all server protocols
let request_memory_limiter = ServerMemoryLimiter::new(
opts.max_in_flight_write_bytes.as_bytes(),
opts.write_bytes_exhausted_policy,
);
let handlers = ServerHandlers::default();
let user_provider = self.plugins.get::<UserProviderRef>();
@@ -292,7 +295,7 @@ where
&opts.meta_client,
None,
true,
request_memory_limiter.clone(),
self.server_memory_limiter.clone(),
)?;
handlers.insert((Box::new(grpc_server), grpc_addr));
}
@@ -305,7 +308,7 @@ where
&opts.meta_client,
Some("INTERNAL_GRPC_SERVER".to_string()),
false,
request_memory_limiter.clone(),
self.server_memory_limiter.clone(),
)?;
handlers.insert((Box::new(grpc_server), grpc_addr));
}
@@ -315,7 +318,7 @@ where
let http_options = &opts.http;
let http_addr = parse_addr(&http_options.addr)?;
let http_server =
self.build_http_server(&opts, toml, request_memory_limiter.clone())?;
self.build_http_server(&opts, toml, self.server_memory_limiter.clone())?;
handlers.insert((Box::new(http_server), http_addr));
}

View File

@@ -17,6 +17,7 @@ use common_error::ext::{BoxedError, ErrorExt};
use common_error::status_code::StatusCode;
use common_macro::stack_trace_debug;
use common_meta::DatanodeId;
use common_procedure::ProcedureId;
use common_runtime::JoinError;
use snafu::{Location, Snafu};
use store_api::storage::RegionId;
@@ -768,6 +769,35 @@ pub enum Error {
location: Location,
},
#[snafu(display("Failed to create repartition subtasks"))]
RepartitionCreateSubtasks {
source: partition::error::Error,
#[snafu(implicit)]
location: Location,
},
#[snafu(display(
"Source partition expression '{}' does not match any existing region",
expr
))]
RepartitionSourceExprMismatch {
expr: String,
#[snafu(implicit)]
location: Location,
},
#[snafu(display(
"Failed to get the state receiver for repartition subprocedure {}",
procedure_id
))]
RepartitionSubprocedureStateReceiver {
procedure_id: ProcedureId,
#[snafu(source)]
source: common_procedure::Error,
#[snafu(implicit)]
location: Location,
},
#[snafu(display("Unsupported operation {}", operation))]
Unsupported {
operation: String,
@@ -1113,7 +1143,8 @@ impl ErrorExt for Error {
| Error::LeaderPeerChanged { .. }
| Error::RepartitionSourceRegionMissing { .. }
| Error::RepartitionTargetRegionMissing { .. }
| Error::PartitionExprMismatch { .. } => StatusCode::InvalidArguments,
| Error::PartitionExprMismatch { .. }
| Error::RepartitionSourceExprMismatch { .. } => StatusCode::InvalidArguments,
Error::LeaseKeyFromUtf8 { .. }
| Error::LeaseValueFromUtf8 { .. }
| Error::InvalidRegionKeyFromUtf8 { .. }
@@ -1173,6 +1204,8 @@ impl ErrorExt for Error {
Error::BuildTlsOptions { source, .. } => source.status_code(),
Error::Other { source, .. } => source.status_code(),
Error::RepartitionCreateSubtasks { source, .. } => source.status_code(),
Error::RepartitionSubprocedureStateReceiver { source, .. } => source.status_code(),
Error::NoEnoughAvailableNode { .. } => StatusCode::RuntimeResourcesExhausted,
#[cfg(feature = "pg_kvbackend")]

View File

@@ -12,8 +12,63 @@
// See the License for the specific language governing permissions and
// limitations under the License.
pub mod allocate_region;
pub mod collect;
pub mod deallocate_region;
pub mod dispatch;
pub mod group;
pub mod plan;
pub mod repartition_end;
pub mod repartition_start;
use std::any::Any;
use std::fmt::Debug;
use common_meta::cache_invalidator::CacheInvalidatorRef;
use common_meta::key::TableMetadataManagerRef;
use common_procedure::{Context as ProcedureContext, Status};
use serde::{Deserialize, Serialize};
use store_api::storage::TableId;
use crate::error::Result;
use crate::procedure::repartition::plan::RepartitionPlanEntry;
use crate::service::mailbox::MailboxRef;
#[cfg(test)]
pub mod test_util;
/// The serializable part of the repartition procedure's context.
///
/// Field order and names are part of the serde representation derived below.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
pub struct PersistentContext {
/// Catalog name. NOTE(review): presumably of the table identified by
/// `table_id`; not read by the states visible here - confirm.
pub catalog_name: String,
/// Schema name (same caveat as `catalog_name`).
pub schema_name: String,
/// Table name (same caveat as `catalog_name`).
pub table_name: String,
/// Table id. `RepartitionStart` uses it to fetch the physical table route and
/// `Dispatch` passes it to every group sub-procedure.
pub table_id: TableId,
/// Plan entries. `AllocateRegion` overwrites this list, `Dispatch` creates one
/// group sub-procedure per entry and `DeallocateRegion` inspects the pending
/// deallocations recorded in it.
pub plans: Vec<RepartitionPlanEntry>,
}
/// Context passed to every [State] of the repartition procedure.
///
/// Apart from `persistent_ctx`, all fields are cloned into the context of each
/// group sub-procedure by `RepartitionGroupProcedure::new`.
pub struct Context {
/// The serializable part of the context.
pub persistent_ctx: PersistentContext,
/// Used by `RepartitionStart` to read the physical table route.
pub table_metadata_manager: TableMetadataManagerRef,
/// Mailbox handle forwarded to the group sub-procedures.
pub mailbox: MailboxRef,
/// Server address forwarded to the group sub-procedures.
pub server_addr: String,
/// Cache invalidator forwarded to the group sub-procedures.
pub cache_invalidator: CacheInvalidatorRef,
}
/// One step of the repartition procedure's state machine.
///
/// Implementations are (de)serialized through `typetag`, tagged with
/// `repartition_state`.
#[async_trait::async_trait]
#[typetag::serde(tag = "repartition_state")]
pub(crate) trait State: Sync + Send + Debug {
    /// Returns the last `::`-separated segment of the implementing type's name.
    fn name(&self) -> &'static str {
        let full_name = std::any::type_name::<Self>();
        let short_name = full_name.split("::").last();
        short_name.unwrap_or(full_name)
    }

    /// Yields the next [State] and [Status].
    async fn next(
        &mut self,
        ctx: &mut Context,
        procedure_ctx: &ProcedureContext,
    ) -> Result<(Box<dyn State>, Status)>;

    /// Exposes the state as [Any] so callers can downcast to the concrete type.
    fn as_any(&self) -> &dyn Any;
}

View File

@@ -0,0 +1,67 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use std::any::Any;
use common_procedure::{Context as ProcedureContext, Status};
use serde::{Deserialize, Serialize};
use crate::error::Result;
use crate::procedure::repartition::dispatch::Dispatch;
use crate::procedure::repartition::plan::{AllocationPlanEntry, RepartitionPlanEntry};
use crate::procedure::repartition::{Context, State};
/// State that handles region allocation before the plans are dispatched.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct AllocateRegion {
/// The allocation plan entries this state works on.
plan_entries: Vec<AllocationPlanEntry>,
}
impl AllocateRegion {
/// Creates the state from the given allocation plan entries.
pub fn new(plan_entries: Vec<AllocationPlanEntry>) -> Self {
Self { plan_entries }
}
}
#[async_trait::async_trait]
#[typetag::serde]
impl State for AllocateRegion {
    /// Moves on to [Dispatch] when no plan entry asks for new regions.
    ///
    /// In that case every allocation plan entry is converted into a
    /// [RepartitionPlanEntry] and stored in the persistent context. Allocating
    /// regions is not implemented yet.
    async fn next(
        &mut self,
        ctx: &mut Context,
        _procedure_ctx: &ProcedureContext,
    ) -> Result<(Box<dyn State>, Status)> {
        let pending_allocations: usize = self
            .plan_entries
            .iter()
            .map(|entry| entry.regions_to_allocate)
            .sum();

        if pending_allocations != 0 {
            // TODO(weny): allocate regions.
            todo!()
        }

        // NOTE(review): only `regions_to_allocate` is checked here, while the
        // conversion below debug-asserts that `regions_to_deallocate` is zero
        // as well - confirm entries with pending deallocations cannot reach it.
        ctx.persistent_ctx.plans = self
            .plan_entries
            .iter()
            .map(RepartitionPlanEntry::from_allocation_plan_entry)
            .collect::<Vec<_>>();

        Ok((Box::new(Dispatch), Status::executing(true)))
    }

    fn as_any(&self) -> &dyn Any {
        self
    }
}

View File

@@ -0,0 +1,106 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use std::any::Any;
use common_procedure::{Context as ProcedureContext, ProcedureId, Status, watcher};
use common_telemetry::error;
use serde::{Deserialize, Serialize};
use snafu::ResultExt;
use crate::error::{RepartitionSubprocedureStateReceiverSnafu, Result};
use crate::procedure::repartition::deallocate_region::DeallocateRegion;
use crate::procedure::repartition::group::GroupId;
use crate::procedure::repartition::{Context, State};
/// Metadata for tracking a dispatched sub-procedure.
///
/// The type is `Copy`; `Collect` copies entries from its in-flight list into
/// its result lists.
#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq)]
pub struct ProcedureMeta {
/// The index of the plan entry in the parent procedure's plan list.
pub plan_index: usize,
/// The group id of the repartition group.
pub group_id: GroupId,
/// The procedure id of the sub-procedure.
pub procedure_id: ProcedureId,
}
/// State for collecting results from dispatched sub-procedures.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Collect {
/// Sub-procedures that are currently in-flight.
pub inflight_procedures: Vec<ProcedureMeta>,
/// Sub-procedures that have completed successfully.
pub succeeded_procedures: Vec<ProcedureMeta>,
/// Sub-procedures that have failed.
pub failed_procedures: Vec<ProcedureMeta>,
/// Sub-procedures whose state could not be determined.
pub unknown_procedures: Vec<ProcedureMeta>,
}
impl Collect {
/// Creates the state with the given in-flight sub-procedures; the succeeded,
/// failed and unknown lists start empty.
pub fn new(inflight_procedures: Vec<ProcedureMeta>) -> Self {
Self {
inflight_procedures,
succeeded_procedures: Vec::new(),
failed_procedures: Vec::new(),
unknown_procedures: Vec::new(),
}
}
}
#[async_trait::async_trait]
#[typetag::serde]
impl State for Collect {
async fn next(
&mut self,
_ctx: &mut Context,
procedure_ctx: &ProcedureContext,
) -> Result<(Box<dyn State>, Status)> {
for procedure_meta in self.inflight_procedures.iter() {
let procedure_id = procedure_meta.procedure_id;
let group_id = procedure_meta.group_id;
let Some(mut receiver) = procedure_ctx
.provider
.procedure_state_receiver(procedure_id)
.await
.context(RepartitionSubprocedureStateReceiverSnafu { procedure_id })?
else {
error!(
"failed to get procedure state receiver, procedure_id: {}, group_id: {}",
procedure_id, group_id
);
self.unknown_procedures.push(*procedure_meta);
continue;
};
match watcher::wait(&mut receiver).await {
Ok(_) => self.succeeded_procedures.push(*procedure_meta),
Err(e) => {
error!(e; "failed to wait for repartition subprocedure, procedure_id: {}, group_id: {}", procedure_id, group_id);
self.failed_procedures.push(*procedure_meta);
}
}
}
if !self.failed_procedures.is_empty() || !self.unknown_procedures.is_empty() {
// TODO(weny): retry the failed or unknown procedures.
}
Ok((Box::new(DeallocateRegion), Status::executing(true)))
}
fn as_any(&self) -> &dyn Any {
self
}
}

View File

@@ -0,0 +1,52 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use std::any::Any;
use common_procedure::{Context as ProcedureContext, Status};
use serde::{Deserialize, Serialize};
use crate::error::Result;
use crate::procedure::repartition::repartition_end::RepartitionEnd;
use crate::procedure::repartition::{Context, State};
/// State that handles region deallocation after the sub-procedures were collected.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DeallocateRegion;

#[async_trait::async_trait]
#[typetag::serde]
impl State for DeallocateRegion {
    /// Finishes the procedure when no plan has regions pending deallocation.
    ///
    /// Deallocating regions is not implemented yet.
    async fn next(
        &mut self,
        ctx: &mut Context,
        _procedure_ctx: &ProcedureContext,
    ) -> Result<(Box<dyn State>, Status)> {
        let pending_deallocations: usize = ctx
            .persistent_ctx
            .plans
            .iter()
            .map(|plan| plan.pending_deallocate_region_ids.len())
            .sum();

        if pending_deallocations != 0 {
            // TODO(weny): deallocate regions.
            todo!()
        }

        Ok((Box::new(RepartitionEnd), Status::done()))
    }

    fn as_any(&self) -> &dyn Any {
        self
    }
}

View File

@@ -0,0 +1,66 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use std::any::Any;
use common_procedure::{Context as ProcedureContext, ProcedureWithId, Status};
use serde::{Deserialize, Serialize};
use crate::error::Result;
use crate::procedure::repartition::collect::{Collect, ProcedureMeta};
use crate::procedure::repartition::group::RepartitionGroupProcedure;
use crate::procedure::repartition::{self, Context, State};
/// State that creates one group sub-procedure per plan entry.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Dispatch;

#[async_trait::async_trait]
#[typetag::serde]
impl State for Dispatch {
    /// Builds a [RepartitionGroupProcedure] with a random id for every plan
    /// entry, suspends with those procedures and continues with [Collect],
    /// which tracks them by their [ProcedureMeta].
    async fn next(
        &mut self,
        ctx: &mut Context,
        _procedure_ctx: &ProcedureContext,
    ) -> Result<(Box<dyn State>, Status)> {
        // Nothing below mutates the context, so work through a shared reference.
        let shared_ctx: &Context = ctx;
        let table_id = shared_ctx.persistent_ctx.table_id;

        let (procedure_metas, procedures): (Vec<_>, Vec<_>) = shared_ctx
            .persistent_ctx
            .plans
            .iter()
            .enumerate()
            .map(|(plan_index, plan)| {
                let group_ctx = repartition::group::PersistentContext::new(
                    plan.group_id,
                    table_id,
                    plan.source_regions.clone(),
                    plan.target_regions.clone(),
                );
                let procedure = ProcedureWithId::with_random_id(Box::new(
                    RepartitionGroupProcedure::new(group_ctx, shared_ctx),
                ));
                let meta = ProcedureMeta {
                    plan_index,
                    group_id: plan.group_id,
                    procedure_id: procedure.id,
                };
                (meta, procedure)
            })
            .unzip();

        Ok((
            Box::new(Collect::new(procedure_metas)),
            Status::suspended(procedures, true),
        ))
    }

    fn as_any(&self) -> &dyn Any {
        self
    }
}

View File

@@ -29,19 +29,78 @@ use common_meta::key::datanode_table::{DatanodeTableKey, DatanodeTableValue, Reg
use common_meta::key::table_route::TableRouteValue;
use common_meta::key::{DeserializedValueWithBytes, TableMetadataManagerRef};
use common_meta::rpc::router::RegionRoute;
use common_procedure::{Context as ProcedureContext, Status};
use common_procedure::{
Context as ProcedureContext, LockKey, Procedure, Result as ProcedureResult, Status,
UserMetadata,
};
use serde::{Deserialize, Serialize};
use snafu::{OptionExt, ResultExt};
use store_api::storage::{RegionId, TableId};
use uuid::Uuid;
use crate::error::{self, Result};
use crate::procedure::repartition::group::repartition_start::RepartitionStart;
use crate::procedure::repartition::plan::RegionDescriptor;
use crate::procedure::repartition::{self};
use crate::service::mailbox::MailboxRef;
pub type GroupId = Uuid;
pub struct RepartitionGroupProcedure {}
/// Procedure that runs the repartition of a single group.
///
/// It holds the current [State] and the group-level [Context].
#[allow(dead_code)]
pub struct RepartitionGroupProcedure {
/// The current state of the group procedure.
state: Box<dyn State>,
/// The group-level context.
context: Context,
}
impl RepartitionGroupProcedure {
/// The name returned by `Procedure::type_name`.
const TYPE_NAME: &'static str = "metasrv-procedure::RepartitionGroup";
/// Creates a group procedure that starts in `RepartitionStart`.
///
/// `persistent_context` becomes the group's persistent context; the shared
/// handles (cache invalidator, table metadata manager, mailbox, server
/// address) are cloned from the parent repartition `context`.
pub fn new(persistent_context: PersistentContext, context: &repartition::Context) -> Self {
let state = Box::new(RepartitionStart);
Self {
state,
context: Context {
persistent_ctx: persistent_context,
cache_invalidator: context.cache_invalidator.clone(),
table_metadata_manager: context.table_metadata_manager.clone(),
mailbox: context.mailbox.clone(),
server_addr: context.server_addr.clone(),
},
}
}
}
// Skeleton `Procedure` implementation: apart from `type_name` and
// `rollback_supported`, every method is still `todo!()` and panics when called.
#[async_trait::async_trait]
impl Procedure for RepartitionGroupProcedure {
/// Returns [Self::TYPE_NAME].
fn type_name(&self) -> &str {
Self::TYPE_NAME
}
/// Not implemented yet.
async fn execute(&mut self, _ctx: &ProcedureContext) -> ProcedureResult<Status> {
todo!()
}
/// Not implemented yet.
async fn rollback(&mut self, _: &ProcedureContext) -> ProcedureResult<()> {
todo!()
}
// NOTE(review): this reports rollback support while `rollback` above is
// still `todo!()` - confirm nothing triggers a rollback yet.
fn rollback_supported(&self) -> bool {
true
}
/// Not implemented yet.
fn dump(&self) -> ProcedureResult<String> {
todo!()
}
/// Not implemented yet.
fn lock_key(&self) -> LockKey {
todo!()
}
/// Not implemented yet.
fn user_metadata(&self) -> Option<UserMetadata> {
todo!()
}
}
pub struct Context {
pub persistent_ctx: PersistentContext,
@@ -55,11 +114,16 @@ pub struct Context {
pub server_addr: String,
}
/// The result of the group preparation phase, containing validated region routes.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
pub struct GroupPrepareResult {
/// The validated source region routes.
pub source_routes: Vec<RegionRoute>,
/// The validated target region routes.
pub target_routes: Vec<RegionRoute>,
/// The primary source region id (first source region), used for retrieving region options.
pub central_region: RegionId,
/// The datanode id where the primary source region is located.
pub central_region_datanode_id: DatanodeId,
}
@@ -77,6 +141,23 @@ pub struct PersistentContext {
pub group_prepare_result: Option<GroupPrepareResult>,
}
impl PersistentContext {
/// Creates the persistent context of a repartition group.
///
/// `group_prepare_result` starts as `None`.
pub fn new(
group_id: GroupId,
table_id: TableId,
sources: Vec<RegionDescriptor>,
targets: Vec<RegionDescriptor>,
) -> Self {
Self {
group_id,
table_id,
sources,
targets,
group_prepare_result: None,
}
}
}
impl Context {
/// Retrieves the table route value for the given table id.
///

View File

@@ -16,11 +16,79 @@ use partition::expr::PartitionExpr;
use serde::{Deserialize, Serialize};
use store_api::storage::RegionId;
use crate::procedure::repartition::group::GroupId;
/// Metadata describing a region involved in the plan.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
pub struct RegionDescriptor {
/// The region id of the region involved in the plan.
pub region_id: RegionId,
/// The new partition expression of the region.
/// The partition expression of the region.
pub partition_expr: PartitionExpr,
}
/// A plan entry for the region allocation phase, describing source regions
/// and target partition expressions before allocation.
///
/// The two counters are mutually exclusive in practice: at most one of them
/// can be non-zero for a given pair of source and target counts.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
pub struct AllocationPlanEntry {
/// The group id for this plan entry.
pub group_id: GroupId,
/// Source region descriptors involved in the plan.
pub source_regions: Vec<RegionDescriptor>,
/// The target partition expressions for the new or changed regions.
pub target_partition_exprs: Vec<PartitionExpr>,
/// The number of regions that need to be allocated (target count - source count, if positive).
pub regions_to_allocate: usize,
/// The number of regions that need to be deallocated (source count - target count, if positive).
pub regions_to_deallocate: usize,
}
/// A plan entry for the dispatch phase after region allocation,
/// with concrete source and target region descriptors.
///
/// The derived serde representation is what gets stored in the repartition
/// procedure's persistent context (`plans`).
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
pub struct RepartitionPlanEntry {
/// The group id for this plan entry.
pub group_id: GroupId,
/// The source region descriptors involved in the plan.
pub source_regions: Vec<RegionDescriptor>,
/// The target region descriptors involved in the plan.
pub target_regions: Vec<RegionDescriptor>,
/// The region ids of the allocated regions.
pub allocated_region_ids: Vec<RegionId>,
/// The region ids of the regions that are pending deallocation.
pub pending_deallocate_region_ids: Vec<RegionId>,
}
impl RepartitionPlanEntry {
/// Converts an allocation plan entry into a repartition plan entry.
///
/// The target regions are derived from the source regions and the target partition expressions.
/// The allocated region ids and pending deallocate region ids are empty.
pub fn from_allocation_plan_entry(
AllocationPlanEntry {
group_id,
source_regions,
target_partition_exprs,
regions_to_allocate,
regions_to_deallocate,
}: &AllocationPlanEntry,
) -> Self {
debug_assert!(*regions_to_allocate == 0 && *regions_to_deallocate == 0);
let target_regions = source_regions
.iter()
.zip(target_partition_exprs.iter())
.map(|(source_region, target_partition_expr)| RegionDescriptor {
region_id: source_region.region_id,
partition_expr: target_partition_expr.clone(),
})
.collect::<Vec<_>>();
Self {
group_id: *group_id,
source_regions: source_regions.clone(),
target_regions,
allocated_region_ids: vec![],
pending_deallocate_region_ids: vec![],
}
}
}

View File

@@ -0,0 +1,40 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use std::any::Any;
use common_procedure::{Context as ProcedureContext, Status};
use serde::{Deserialize, Serialize};
use crate::error::Result;
use crate::procedure::repartition::{Context, State};
/// Terminal state of the repartition procedure.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct RepartitionEnd;

#[async_trait::async_trait]
#[typetag::serde]
impl State for RepartitionEnd {
    /// Stays in [RepartitionEnd] and reports the procedure as done.
    async fn next(
        &mut self,
        _ctx: &mut Context,
        _procedure_ctx: &ProcedureContext,
    ) -> Result<(Box<dyn State>, Status)> {
        let status = Status::done();
        Ok((Box::new(Self), status))
    }

    fn as_any(&self) -> &dyn Any {
        self
    }
}

View File

@@ -0,0 +1,172 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use std::any::Any;
use common_meta::key::table_route::PhysicalTableRouteValue;
use common_procedure::{Context as ProcedureContext, Status};
use partition::expr::PartitionExpr;
use partition::subtask::{self, RepartitionSubtask};
use serde::{Deserialize, Serialize};
use snafu::{OptionExt, ResultExt};
use uuid::Uuid;
use crate::error::{self, Result};
use crate::procedure::repartition::allocate_region::AllocateRegion;
use crate::procedure::repartition::plan::{AllocationPlanEntry, RegionDescriptor};
use crate::procedure::repartition::repartition_end::RepartitionEnd;
use crate::procedure::repartition::{Context, State};
/// Initial state of the repartition procedure: turns the requested change of
/// partition expressions into allocation plan entries.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct RepartitionStart {
/// The partition expressions to repartition from; each one must match the
/// expression of an existing region in the table route.
from_exprs: Vec<PartitionExpr>,
/// The partition expressions to repartition to.
to_exprs: Vec<PartitionExpr>,
}
impl RepartitionStart {
/// Creates the state from the source and target partition expressions.
pub fn new(from_exprs: Vec<PartitionExpr>, to_exprs: Vec<PartitionExpr>) -> Self {
Self {
from_exprs,
to_exprs,
}
}
}
#[async_trait::async_trait]
#[typetag::serde]
impl State for RepartitionStart {
    /// Reads the physical table route and builds the allocation plan.
    ///
    /// An empty plan ends the procedure right away; otherwise the procedure
    /// continues with [AllocateRegion].
    async fn next(
        &mut self,
        ctx: &mut Context,
        _: &ProcedureContext,
    ) -> Result<(Box<dyn State>, Status)> {
        let table_id = ctx.persistent_ctx.table_id;
        let (_, physical_route) = ctx
            .table_metadata_manager
            .table_route_manager()
            .get_physical_table_route(table_id)
            .await
            .context(error::TableMetadataManagerSnafu)?;

        let plan_entries = Self::build_plan(&physical_route, &self.from_exprs, &self.to_exprs)?;

        if !plan_entries.is_empty() {
            return Ok((
                Box::new(AllocateRegion::new(plan_entries)),
                Status::executing(false),
            ));
        }

        // Nothing to repartition.
        Ok((Box::new(RepartitionEnd), Status::done()))
    }

    fn as_any(&self) -> &dyn Any {
        self
    }
}
impl RepartitionStart {
#[allow(dead_code)]
fn build_plan(
physical_route: &PhysicalTableRouteValue,
from_exprs: &[PartitionExpr],
to_exprs: &[PartitionExpr],
) -> Result<Vec<AllocationPlanEntry>> {
let subtasks = subtask::create_subtasks(from_exprs, to_exprs)
.context(error::RepartitionCreateSubtasksSnafu)?;
if subtasks.is_empty() {
return Ok(vec![]);
}
let src_descriptors = Self::source_region_descriptors(from_exprs, physical_route)?;
Ok(Self::build_plan_entries(
subtasks,
&src_descriptors,
to_exprs,
))
}
#[allow(dead_code)]
fn build_plan_entries(
subtasks: Vec<RepartitionSubtask>,
source_index: &[RegionDescriptor],
target_exprs: &[PartitionExpr],
) -> Vec<AllocationPlanEntry> {
subtasks
.into_iter()
.map(|subtask| {
let group_id = Uuid::new_v4();
let source_regions = subtask
.from_expr_indices
.iter()
.map(|&idx| source_index[idx].clone())
.collect::<Vec<_>>();
let target_partition_exprs = subtask
.to_expr_indices
.iter()
.map(|&idx| target_exprs[idx].clone())
.collect::<Vec<_>>();
let regions_to_allocate = target_partition_exprs
.len()
.saturating_sub(source_regions.len());
let regions_to_deallocate = source_regions
.len()
.saturating_sub(target_partition_exprs.len());
AllocationPlanEntry {
group_id,
source_regions,
target_partition_exprs,
regions_to_allocate,
regions_to_deallocate,
}
})
.collect::<Vec<_>>()
}
fn source_region_descriptors(
from_exprs: &[PartitionExpr],
physical_route: &PhysicalTableRouteValue,
) -> Result<Vec<RegionDescriptor>> {
let existing_regions = physical_route
.region_routes
.iter()
.map(|route| (route.region.id, route.region.partition_expr()))
.collect::<Vec<_>>();
let descriptors = from_exprs
.iter()
.map(|expr| {
let expr_json = expr
.as_json_str()
.context(error::SerializePartitionExprSnafu)?;
let matched_region_id = existing_regions
.iter()
.find_map(|(region_id, existing_expr)| {
(existing_expr == &expr_json).then_some(*region_id)
})
.with_context(|| error::RepartitionSourceExprMismatchSnafu {
expr: expr_json,
})?;
Ok(RegionDescriptor {
region_id: matched_region_id,
partition_expr: expr.clone(),
})
})
.collect::<Result<Vec<_>>>()?;
Ok(descriptors)
}
}

View File

@@ -23,6 +23,7 @@ common-recordbatch.workspace = true
common-runtime.workspace = true
common-telemetry.workspace = true
common-time.workspace = true
chrono.workspace = true
datafusion.workspace = true
datatypes.workspace = true
futures-util.workspace = true

View File

@@ -43,10 +43,10 @@ pub(crate) use state::MetricEngineState;
use store_api::metadata::RegionMetadataRef;
use store_api::metric_engine_consts::METRIC_ENGINE_NAME;
use store_api::region_engine::{
BatchResponses, CopyRegionFromRequest, CopyRegionFromResponse, RegionEngine,
RegionManifestInfo, RegionRole, RegionScannerRef, RegionStatistic, RemapManifestsRequest,
RemapManifestsResponse, SetRegionRoleStateResponse, SetRegionRoleStateSuccess,
SettableRegionRoleState, SyncManifestResponse,
BatchResponses, RegionEngine, RegionRole, RegionScannerRef, RegionStatistic,
RemapManifestsRequest, RemapManifestsResponse, SetRegionRoleStateResponse,
SetRegionRoleStateSuccess, SettableRegionRoleState, SyncRegionFromRequest,
SyncRegionFromResponse,
};
use store_api::region_request::{
BatchRegionDdlRequest, RegionCatchupRequest, RegionOpenRequest, RegionRequest,
@@ -220,6 +220,13 @@ impl RegionEngine for MetricEngine {
UnsupportedRegionRequestSnafu { request }.fail()
}
}
RegionRequest::ApplyStagingManifest(_) => {
if self.inner.is_physical_region(region_id) {
return self.inner.mito.handle_request(region_id, request).await;
} else {
UnsupportedRegionRequestSnafu { request }.fail()
}
}
RegionRequest::Put(put) => self.inner.put_region(region_id, put).await,
RegionRequest::Create(create) => {
self.inner
@@ -354,12 +361,30 @@ impl RegionEngine for MetricEngine {
async fn sync_region(
&self,
region_id: RegionId,
manifest_info: RegionManifestInfo,
) -> Result<SyncManifestResponse, BoxedError> {
self.inner
.sync_region(region_id, manifest_info)
.await
.map_err(BoxedError::new)
request: SyncRegionFromRequest,
) -> Result<SyncRegionFromResponse, BoxedError> {
match request {
SyncRegionFromRequest::FromManifest(manifest_info) => self
.inner
.sync_region_from_manifest(region_id, manifest_info)
.await
.map_err(BoxedError::new),
SyncRegionFromRequest::FromRegion {
source_region_id,
parallelism,
} => {
if self.inner.is_physical_region(region_id) {
self.inner
.sync_region_from_region(region_id, source_region_id, parallelism)
.await
.map_err(BoxedError::new)
} else {
Err(BoxedError::new(
error::UnsupportedSyncRegionFromRequestSnafu { region_id }.build(),
))
}
}
}
}
async fn remap_manifests(
@@ -376,14 +401,6 @@ impl RegionEngine for MetricEngine {
}
}
/// Placeholder for copying a region from another region.
///
/// Both arguments are ignored (note the `_` prefixes) and the body is
/// `todo!()`, so calling this method panics instead of returning a response.
async fn copy_region_from(
&self,
_region_id: RegionId,
_request: CopyRegionFromRequest,
) -> Result<CopyRegionFromResponse, BoxedError> {
// Not implemented: `todo!()` panics when reached.
todo!()
}
async fn set_region_role_state_gracefully(
&self,
region_id: RegionId,

View File

@@ -290,6 +290,11 @@ impl MetricEngineInner {
.metadata_region
.logical_regions(physical_region_id)
.await?;
common_telemetry::debug!(
"Recover states for physical region {}, logical regions: {:?}",
physical_region_id,
logical_regions
);
let physical_columns = self
.data_region
.physical_columns(physical_region_id)

View File

@@ -23,6 +23,7 @@ use store_api::metric_engine_consts::{
METRIC_ENGINE_INDEX_SKIPPING_INDEX_GRANULARITY_OPTION,
METRIC_ENGINE_INDEX_SKIPPING_INDEX_GRANULARITY_OPTION_DEFAULT, METRIC_ENGINE_INDEX_TYPE_OPTION,
};
use store_api::mito_engine_options::{COMPACTION_TYPE, COMPACTION_TYPE_TWCS, TWCS_TIME_WINDOW};
use crate::error::{Error, ParseRegionOptionsSnafu, Result};
@@ -32,6 +33,9 @@ use crate::error::{Error, ParseRegionOptionsSnafu, Result};
/// value and appropriately increasing the size of the index, it results in an improved indexing effect.
const SEG_ROW_COUNT_FOR_DATA_REGION: u32 = 256;
/// The default compaction time window for metric engine data regions.
const DEFAULT_DATA_REGION_COMPACTION_TIME_WINDOW: &str = "1d";
/// Physical region options.
#[derive(Debug, Clone, Copy, PartialEq)]
pub struct PhysicalRegionOptions {
@@ -72,6 +76,16 @@ pub fn set_data_region_options(
"sparse".to_string(),
);
}
if !options.contains_key(TWCS_TIME_WINDOW) {
options.insert(
COMPACTION_TYPE.to_string(),
COMPACTION_TYPE_TWCS.to_string(),
);
options.insert(
TWCS_TIME_WINDOW.to_string(),
DEFAULT_DATA_REGION_COMPACTION_TIME_WINDOW.to_string(),
);
}
}
impl TryFrom<&HashMap<String, String>> for PhysicalRegionOptions {
@@ -192,4 +206,29 @@ mod tests {
}
);
}
#[test]
fn test_set_data_region_options_default_compaction_time_window() {
    // With no compaction settings supplied, the defaults should be filled in.
    let mut opts = HashMap::new();
    set_data_region_options(&mut opts, false);

    let compaction_type = opts.get(COMPACTION_TYPE).map(String::as_str);
    let time_window = opts.get(TWCS_TIME_WINDOW).map(String::as_str);
    assert_eq!(compaction_type, Some(COMPACTION_TYPE_TWCS));
    assert_eq!(time_window, Some("1d"));
}
#[test]
fn test_set_data_region_options_respects_user_compaction_time_window() {
    // Start from options in which the user already chose a TWCS time window.
    let mut opts: HashMap<String, String> = [
        (TWCS_TIME_WINDOW, "2h"),
        (COMPACTION_TYPE, "twcs"),
    ]
    .into_iter()
    .map(|(key, value)| (key.to_string(), value.to_string()))
    .collect();
    set_data_region_options(&mut opts, false);

    // The explicit window must survive; the default must not replace it.
    assert_eq!(opts.get(TWCS_TIME_WINDOW).map(String::as_str), Some("2h"));
}
}

View File

@@ -12,242 +12,5 @@
// See the License for the specific language governing permissions and
// limitations under the License.
use std::time::Instant;
use common_telemetry::info;
use snafu::{OptionExt, ResultExt, ensure};
use store_api::region_engine::{RegionEngine, RegionManifestInfo, SyncManifestResponse};
use store_api::storage::RegionId;
use crate::engine::MetricEngineInner;
use crate::error::{
MetricManifestInfoSnafu, MitoSyncOperationSnafu, PhysicalRegionNotFoundSnafu, Result,
};
use crate::utils;
impl MetricEngineInner {
/// Syncs a metric region to the manifest versions carried by `manifest_info`.
///
/// The metadata region and the data region derived from `region_id` are each
/// synced through the mito engine. When the metadata sync reports
/// `is_data_synced()`, the physical region's states are recovered again and
/// the logical regions opened by that recovery are returned in the response.
/// Otherwise the response carries an empty list of newly opened regions.
///
/// # Errors
///
/// Fails via `MetricManifestInfoSnafu` when `manifest_info` is not a metric
/// manifest, via `MitoSyncOperationSnafu` when either mito sync fails, and via
/// `PhysicalRegionNotFoundSnafu` when no state is registered for the data
/// region. Errors from `recover_states` are propagated with `?`.
pub async fn sync_region(
&self,
region_id: RegionId,
manifest_info: RegionManifestInfo,
) -> Result<SyncManifestResponse> {
ensure!(
manifest_info.is_metric(),
MetricManifestInfoSnafu { region_id }
);
let metadata_region_id = utils::to_metadata_region_id(region_id);
// checked by ensure above
let metadata_manifest_version = manifest_info
.metadata_manifest_version()
.unwrap_or_default();
let metadata_flushed_entry_id = manifest_info
.metadata_flushed_entry_id()
.unwrap_or_default();
// Re-express the metadata part as a plain mito manifest; the third
// argument is passed as 0.
let metadata_region_manifest =
RegionManifestInfo::mito(metadata_manifest_version, metadata_flushed_entry_id, 0);
let metadata_synced = self
.mito
.sync_region(metadata_region_id, metadata_region_manifest)
.await
.context(MitoSyncOperationSnafu)?
.is_data_synced();
let data_region_id = utils::to_data_region_id(region_id);
let data_manifest_version = manifest_info.data_manifest_version();
let data_flushed_entry_id = manifest_info.data_flushed_entry_id();
// Same conversion for the data part of the manifest.
let data_region_manifest =
RegionManifestInfo::mito(data_manifest_version, data_flushed_entry_id, 0);
let data_synced = self
.mito
.sync_region(data_region_id, data_region_manifest)
.await
.context(MitoSyncOperationSnafu)?
.is_data_synced();
// Without a metadata sync there is nothing to recover; report the two
// flags and no newly opened logical regions.
if !metadata_synced {
return Ok(SyncManifestResponse::Metric {
metadata_synced,
data_synced,
new_opened_logical_region_ids: vec![],
});
}
// Only the state recovery below is timed.
let now = Instant::now();
// Recovers the states from the metadata region
// if the metadata manifest version is updated.
// The options are copied out with `*`, so the guard returned by `read()`
// is a temporary of this statement and is gone before the `.await` below.
// NOTE(review): `unwrap` panics if `read()` returns an error, presumably a
// poisoned lock; confirm against the type of `state`.
let physical_region_options = *self
.state
.read()
.unwrap()
.physical_region_states()
.get(&data_region_id)
.context(PhysicalRegionNotFoundSnafu {
region_id: data_region_id,
})?
.options();
let new_opened_logical_region_ids = self
.recover_states(data_region_id, physical_region_options)
.await?;
info!(
"Sync metadata region for physical region {}, cost: {:?}, new opened logical region ids: {:?}",
data_region_id,
now.elapsed(),
new_opened_logical_region_ids
);
Ok(SyncManifestResponse::Metric {
metadata_synced,
data_synced,
new_opened_logical_region_ids,
})
}
}
// Integration-style tests for `sync_region`: each test drives a main engine
// and a follower engine obtained from `TestEnv`.
#[cfg(test)]
mod tests {
use std::collections::HashMap;
use api::v1::SemanticType;
use common_query::prelude::greptime_timestamp;
use common_telemetry::info;
use datatypes::data_type::ConcreteDataType;
use datatypes::schema::ColumnSchema;
use store_api::metadata::ColumnMetadata;
use store_api::region_engine::{RegionEngine, RegionManifestInfo};
use store_api::region_request::{
AddColumn, AlterKind, RegionAlterRequest, RegionFlushRequest, RegionRequest,
};
use store_api::storage::RegionId;
use crate::metadata_region::MetadataRegion;
use crate::test_util::TestEnv;
/// Syncs a follower twice with the same manifest info.
///
/// The first sync is expected to report `RegionId::new(3, 2)` as newly opened;
/// the second, identical sync is expected to report no newly opened regions.
#[tokio::test]
async fn test_sync_region_with_new_created_logical_regions() {
common_telemetry::init_default_ut_logging();
let mut env = TestEnv::with_prefix("sync_with_new_created_logical_regions").await;
env.init_metric_region().await;
info!("creating follower engine");
// Create a follower engine.
let (_follower_mito, follower_metric) = env.create_follower_engine().await;
let physical_region_id = env.default_physical_region_id();
// Flushes the physical region
let metric_engine = env.metric();
metric_engine
.handle_request(
env.default_physical_region_id(),
RegionRequest::Flush(RegionFlushRequest::default()),
)
.await
.unwrap();
// First sync on the follower.
let response = follower_metric
.sync_region(physical_region_id, RegionManifestInfo::metric(1, 0, 1, 0))
.await
.unwrap();
assert!(response.is_metric());
let new_opened_logical_region_ids = response.new_opened_logical_region_ids().unwrap();
assert_eq!(new_opened_logical_region_ids, vec![RegionId::new(3, 2)]);
// Sync again, no new logical region should be opened
let response = follower_metric
.sync_region(physical_region_id, RegionManifestInfo::metric(1, 0, 1, 0))
.await
.unwrap();
assert!(response.is_metric());
let new_opened_logical_region_ids = response.new_opened_logical_region_ids().unwrap();
assert!(new_opened_logical_region_ids.is_empty());
}
/// Builds an alter request that adds one non-nullable string tag column
/// named `tag1` (column id 0, no explicit location).
fn test_alter_logical_region_request() -> RegionAlterRequest {
RegionAlterRequest {
kind: AlterKind::AddColumns {
columns: vec![AddColumn {
column_metadata: ColumnMetadata {
column_id: 0,
semantic_type: SemanticType::Tag,
column_schema: ColumnSchema::new(
"tag1",
ConcreteDataType::string_datatype(),
false,
),
},
location: None,
}],
},
}
}
/// Alters a logical region on the main engine, flushes, syncs the follower,
/// then reads the follower's metadata region to check that `tag1` is a tag
/// column and that the timestamp column has the timestamp semantic type.
#[tokio::test]
async fn test_sync_region_alter_alter_logical_region() {
common_telemetry::init_default_ut_logging();
let mut env = TestEnv::with_prefix("sync_region_alter_alter_logical_region").await;
env.init_metric_region().await;
info!("creating follower engine");
let physical_region_id = env.default_physical_region_id();
// Flushes the physical region
let metric_engine = env.metric();
metric_engine
.handle_request(
env.default_physical_region_id(),
RegionRequest::Flush(RegionFlushRequest::default()),
)
.await
.unwrap();
// Create a follower engine.
let (follower_mito, follower_metric) = env.create_follower_engine().await;
let metric_engine = env.metric();
let engine_inner = env.metric().inner;
let region_id = env.default_logical_region_id();
let request = test_alter_logical_region_request();
// Apply the alter on the main engine only; the follower has not seen it yet.
engine_inner
.alter_logical_regions(
physical_region_id,
vec![(region_id, request)],
&mut HashMap::new(),
)
.await
.unwrap();
// Flushes the physical region
metric_engine
.handle_request(
env.default_physical_region_id(),
RegionRequest::Flush(RegionFlushRequest::default()),
)
.await
.unwrap();
// Sync the follower engine
let response = follower_metric
.sync_region(physical_region_id, RegionManifestInfo::metric(2, 0, 2, 0))
.await
.unwrap();
assert!(response.is_metric());
let new_opened_logical_region_ids = response.new_opened_logical_region_ids().unwrap();
assert!(new_opened_logical_region_ids.is_empty());
let logical_region_id = env.default_logical_region_id();
// Read column metadata through the follower's mito engine.
let metadata_region = MetadataRegion::new(follower_mito.clone());
let semantic_type = metadata_region
.column_semantic_type(physical_region_id, logical_region_id, "tag1")
.await
.unwrap()
.unwrap();
assert_eq!(semantic_type, SemanticType::Tag);
let timestamp_index = metadata_region
.column_semantic_type(physical_region_id, logical_region_id, greptime_timestamp())
.await
.unwrap()
.unwrap();
assert_eq!(timestamp_index, SemanticType::Timestamp);
}
}
mod manifest;
mod region;

View File

@@ -0,0 +1,268 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use std::time::Instant;
use common_telemetry::info;
use snafu::{OptionExt, ResultExt, ensure};
use store_api::region_engine::{RegionEngine, RegionManifestInfo, SyncRegionFromResponse};
use store_api::storage::RegionId;
use crate::engine::MetricEngineInner;
use crate::error::{
MetricManifestInfoSnafu, MitoSyncOperationSnafu, PhysicalRegionNotFoundSnafu, Result,
};
use crate::utils;
impl MetricEngineInner {
    /// Brings this metric region in line with the manifest versions described
    /// by `manifest_info` (leader-follower scenario).
    ///
    /// The work happens in three stages:
    /// 1. the metadata region manifest is synced to its target version;
    /// 2. the data region manifest is synced to its target version;
    /// 3. if the metadata region was synced, states are recovered and the
    ///    logical regions opened by that recovery are reported back.
    ///
    /// # Errors
    ///
    /// Fails via `MetricManifestInfoSnafu` when `manifest_info` is not a metric
    /// manifest, via `MitoSyncOperationSnafu` when either mito sync fails, and
    /// via `PhysicalRegionNotFoundSnafu` when no state is registered for the
    /// data region. Errors from `recover_states` are propagated with `?`.
    pub async fn sync_region_from_manifest(
        &self,
        region_id: RegionId,
        manifest_info: RegionManifestInfo,
    ) -> Result<SyncRegionFromResponse> {
        ensure!(
            manifest_info.is_metric(),
            MetricManifestInfoSnafu { region_id }
        );

        // Stage 1: metadata region. The `unwrap_or_default` fallbacks are not
        // expected to trigger; the `ensure!` above already rejected non-metric
        // manifests.
        let metadata_region_id = utils::to_metadata_region_id(region_id);
        let metadata_target = RegionManifestInfo::mito(
            manifest_info
                .metadata_manifest_version()
                .unwrap_or_default(),
            manifest_info
                .metadata_flushed_entry_id()
                .unwrap_or_default(),
            0,
        );
        let metadata_synced = self
            .mito
            .sync_region(metadata_region_id, metadata_target.into())
            .await
            .context(MitoSyncOperationSnafu)?
            .is_data_synced();

        // Stage 2: data region.
        let data_region_id = utils::to_data_region_id(region_id);
        let data_target = RegionManifestInfo::mito(
            manifest_info.data_manifest_version(),
            manifest_info.data_flushed_entry_id(),
            0,
        );
        let data_synced = self
            .mito
            .sync_region(data_region_id, data_target.into())
            .await
            .context(MitoSyncOperationSnafu)?
            .is_data_synced();

        // Stage 3: recover states from the metadata region, but only when the
        // metadata sync actually happened; otherwise nothing new can be opened.
        let new_opened_logical_region_ids = if metadata_synced {
            let started_at = Instant::now();
            // The options are copied out, so the read guard is only a temporary
            // of this statement and is released before the `.await` below.
            let physical_region_options = *self
                .state
                .read()
                .unwrap()
                .physical_region_states()
                .get(&data_region_id)
                .context(PhysicalRegionNotFoundSnafu {
                    region_id: data_region_id,
                })?
                .options();
            let opened = self
                .recover_states(data_region_id, physical_region_options)
                .await?;
            info!(
                "Sync metadata region for physical region {}, cost: {:?}, new opened logical region ids: {:?}",
                data_region_id,
                started_at.elapsed(),
                opened
            );
            opened
        } else {
            vec![]
        };

        Ok(SyncRegionFromResponse::Metric {
            metadata_synced,
            data_synced,
            new_opened_logical_region_ids,
        })
    }
}
// Integration-style tests for syncing from a manifest: each test drives a main
// engine and a follower engine obtained from `TestEnv`, calling the engine's
// `sync_region` with a `RegionManifestInfo` converted via `.into()`.
#[cfg(test)]
mod tests {
use std::collections::HashMap;
use api::v1::SemanticType;
use common_query::prelude::greptime_timestamp;
use common_telemetry::info;
use datatypes::data_type::ConcreteDataType;
use datatypes::schema::ColumnSchema;
use store_api::metadata::ColumnMetadata;
use store_api::region_engine::{RegionEngine, RegionManifestInfo};
use store_api::region_request::{
AddColumn, AlterKind, RegionAlterRequest, RegionFlushRequest, RegionRequest,
};
use store_api::storage::RegionId;
use crate::metadata_region::MetadataRegion;
use crate::test_util::TestEnv;
/// Syncs a follower twice with the same manifest info.
///
/// The first sync is expected to report `RegionId::new(3, 2)` as newly opened;
/// the second, identical sync is expected to report no newly opened regions.
#[tokio::test]
async fn test_sync_region_with_new_created_logical_regions() {
common_telemetry::init_default_ut_logging();
let mut env = TestEnv::with_prefix("sync_with_new_created_logical_regions").await;
env.init_metric_region().await;
info!("creating follower engine");
// Create a follower engine.
let (_follower_mito, follower_metric) = env.create_follower_engine().await;
let physical_region_id = env.default_physical_region_id();
// Flushes the physical region
let metric_engine = env.metric();
metric_engine
.handle_request(
env.default_physical_region_id(),
RegionRequest::Flush(RegionFlushRequest::default()),
)
.await
.unwrap();
// First sync on the follower.
let response = follower_metric
.sync_region(
physical_region_id,
RegionManifestInfo::metric(1, 0, 1, 0).into(),
)
.await
.unwrap();
assert!(response.is_metric());
let new_opened_logical_region_ids = response.new_opened_logical_region_ids().unwrap();
assert_eq!(new_opened_logical_region_ids, vec![RegionId::new(3, 2)]);
// Sync again, no new logical region should be opened
let response = follower_metric
.sync_region(
physical_region_id,
RegionManifestInfo::metric(1, 0, 1, 0).into(),
)
.await
.unwrap();
assert!(response.is_metric());
let new_opened_logical_region_ids = response.new_opened_logical_region_ids().unwrap();
assert!(new_opened_logical_region_ids.is_empty());
}
/// Builds an alter request that adds one non-nullable string tag column
/// named `tag1` (column id 0, no explicit location).
fn test_alter_logical_region_request() -> RegionAlterRequest {
RegionAlterRequest {
kind: AlterKind::AddColumns {
columns: vec![AddColumn {
column_metadata: ColumnMetadata {
column_id: 0,
semantic_type: SemanticType::Tag,
column_schema: ColumnSchema::new(
"tag1",
ConcreteDataType::string_datatype(),
false,
),
},
location: None,
}],
},
}
}
/// Alters a logical region on the main engine, flushes, syncs the follower,
/// then reads the follower's metadata region to check that `tag1` is a tag
/// column and that the timestamp column has the timestamp semantic type.
#[tokio::test]
async fn test_sync_region_alter_alter_logical_region() {
common_telemetry::init_default_ut_logging();
let mut env = TestEnv::with_prefix("sync_region_alter_alter_logical_region").await;
env.init_metric_region().await;
info!("creating follower engine");
let physical_region_id = env.default_physical_region_id();
// Flushes the physical region
let metric_engine = env.metric();
metric_engine
.handle_request(
env.default_physical_region_id(),
RegionRequest::Flush(RegionFlushRequest::default()),
)
.await
.unwrap();
// Create a follower engine.
let (follower_mito, follower_metric) = env.create_follower_engine().await;
let metric_engine = env.metric();
let engine_inner = env.metric().inner;
let region_id = env.default_logical_region_id();
let request = test_alter_logical_region_request();
// Apply the alter on the main engine only; the follower has not seen it yet.
engine_inner
.alter_logical_regions(
physical_region_id,
vec![(region_id, request)],
&mut HashMap::new(),
)
.await
.unwrap();
// Flushes the physical region
metric_engine
.handle_request(
env.default_physical_region_id(),
RegionRequest::Flush(RegionFlushRequest::default()),
)
.await
.unwrap();
// Sync the follower engine
let response = follower_metric
.sync_region(
physical_region_id,
RegionManifestInfo::metric(2, 0, 2, 0).into(),
)
.await
.unwrap();
assert!(response.is_metric());
let new_opened_logical_region_ids = response.new_opened_logical_region_ids().unwrap();
assert!(new_opened_logical_region_ids.is_empty());
let logical_region_id = env.default_logical_region_id();
// Read column metadata through the follower's mito engine.
let metadata_region = MetadataRegion::new(follower_mito.clone());
let semantic_type = metadata_region
.column_semantic_type(physical_region_id, logical_region_id, "tag1")
.await
.unwrap()
.unwrap();
assert_eq!(semantic_type, SemanticType::Tag);
let timestamp_index = metadata_region
.column_semantic_type(physical_region_id, logical_region_id, greptime_timestamp())
.await
.unwrap()
.unwrap();
assert_eq!(timestamp_index, SemanticType::Timestamp);
}
}

View File

@@ -0,0 +1,386 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use std::time::Instant;
use common_error::ext::BoxedError;
use common_telemetry::info;
use mito2::manifest::action::RegionEdit;
use snafu::{OptionExt, ResultExt, ensure};
use store_api::region_engine::{MitoCopyRegionFromRequest, SyncRegionFromResponse};
use store_api::storage::RegionId;
use crate::engine::MetricEngineInner;
use crate::error::{
MissingFilesSnafu, MitoCopyRegionFromOperationSnafu, MitoEditRegionSnafu,
PhysicalRegionNotFoundSnafu, Result,
};
use crate::utils;
impl MetricEngineInner {
    /// Syncs the logical regions from the source region to the target region in the metric engine.
    ///
    /// This operation:
    /// 1. Copies SST files from the source metadata region to the target metadata region.
    /// 2. Transforms logical region metadata (updates region numbers to match the target).
    /// 3. Edits the target manifest to remove the entries of the copied files.
    /// 4. Recovers states and returns the newly opened logical region IDs.
    ///
    /// **Note**: Only the metadata region is synced. The data region is not affected,
    /// so every response reports `data_synced: false`.
    ///
    /// If the copy step reports no copied files, the function returns early with
    /// `metadata_synced: false` and an empty list of newly opened regions; no
    /// metadata is transformed and no states are recovered in that case.
    ///
    /// # Errors
    ///
    /// - `MitoCopyRegionFromOperationSnafu` context when the mito copy fails.
    /// - `PhysicalRegionNotFoundSnafu` when the target metadata region cannot be
    ///   found in mito, or when no state is registered for the target data region.
    /// - `MissingFilesSnafu` when a copied file has no file meta in the target region.
    /// - `MitoEditRegionSnafu` context when editing the target manifest fails.
    /// - Errors from `transform_logical_region_metadata` and `recover_states`
    ///   are propagated with `?`.
    pub(crate) async fn sync_region_from_region(
        &self,
        region_id: RegionId,
        source_region_id: RegionId,
        parallelism: usize,
    ) -> Result<SyncRegionFromResponse> {
        let source_metadata_region_id = utils::to_metadata_region_id(source_region_id);
        let target_metadata_region_id = utils::to_metadata_region_id(region_id);
        let target_data_region_id = utils::to_data_region_id(region_id);
        let source_data_region_id = utils::to_data_region_id(source_region_id);
        info!(
            "Syncing region from region {} to region {}, parallelism: {}",
            source_region_id, region_id, parallelism
        );

        // Step 1: copy the files of the source metadata region into the target
        // metadata region.
        let res = self
            .mito
            .copy_region_from(
                target_metadata_region_id,
                MitoCopyRegionFromRequest {
                    source_region_id: source_metadata_region_id,
                    parallelism,
                },
            )
            .await
            .map_err(BoxedError::new)
            .context(MitoCopyRegionFromOperationSnafu {
                source_region_id: source_metadata_region_id,
                target_region_id: target_metadata_region_id,
            })?;

        if res.copied_file_ids.is_empty() {
            info!(
                "No files were copied from source region {} to target region {}, copied file ids are empty",
                source_metadata_region_id, target_metadata_region_id
            );
            return Ok(SyncRegionFromResponse::Metric {
                metadata_synced: false,
                data_synced: false,
                new_opened_logical_region_ids: vec![],
            });
        }

        let target_region = self.mito.find_region(target_metadata_region_id).context(
            PhysicalRegionNotFoundSnafu {
                region_id: target_metadata_region_id,
            },
        )?;
        // `file_metas` is zipped with the requested ids below, pairing each
        // copied id with its optional meta; a `None` marks a missing file.
        let files_to_remove = target_region.file_metas(&res.copied_file_ids).await;
        let missing_file_ids = res
            .copied_file_ids
            .iter()
            .zip(&files_to_remove)
            .filter(|(_, maybe_meta)| maybe_meta.is_none())
            .map(|(file_id, _)| *file_id)
            .collect::<Vec<_>>();
        // `copy_region_from` does not trigger compaction,
        // so there should be no files removed and thus no missing files.
        ensure!(
            missing_file_ids.is_empty(),
            MissingFilesSnafu {
                region_id: target_metadata_region_id,
                file_ids: missing_file_ids,
            }
        );
        let files_to_remove = files_to_remove.into_iter().flatten().collect::<Vec<_>>();
        // Keep only the ids for the log message further down, so that the metas
        // themselves can be moved into the edit instead of being cloned.
        let removed_file_ids = files_to_remove
            .iter()
            .map(|meta| meta.file_id)
            .collect::<Vec<_>>();

        // Step 2: transform the logical region metadata of the target data region.
        self.metadata_region
            .transform_logical_region_metadata(target_data_region_id, source_data_region_id)
            .await?;

        // Step 3: drop the copied files from the target manifest.
        let edit = RegionEdit {
            files_to_add: vec![],
            files_to_remove,
            timestamp_ms: Some(chrono::Utc::now().timestamp_millis()),
            compaction_time_window: None,
            flushed_entry_id: None,
            flushed_sequence: None,
            committed_sequence: None,
        };
        self.mito
            .edit_region(target_metadata_region_id, edit)
            .await
            .map_err(BoxedError::new)
            .context(MitoEditRegionSnafu {
                region_id: target_metadata_region_id,
            })?;
        info!(
            "Successfully edit metadata region: {} after syncing from source metadata region: {}, files to remove: {:?}",
            target_metadata_region_id, source_metadata_region_id, removed_file_ids,
        );

        // Step 4: only the state recovery below is timed.
        let now = Instant::now();
        // Always recover states from the target metadata region after syncing
        // from the source metadata region.
        // The options are copied out, so the read guard is only a temporary of
        // this statement and is released before the `.await` below.
        let physical_region_options = *self
            .state
            .read()
            .unwrap()
            .physical_region_states()
            .get(&target_data_region_id)
            .context(PhysicalRegionNotFoundSnafu {
                region_id: target_data_region_id,
            })?
            .options();
        let new_opened_logical_region_ids = self
            .recover_states(target_data_region_id, physical_region_options)
            .await?;
        info!(
            "Sync metadata region from source region {} to target region {}, recover states cost: {:?}, new opened logical region ids: {:?}",
            source_metadata_region_id,
            target_metadata_region_id,
            now.elapsed(),
            new_opened_logical_region_ids
        );

        Ok(SyncRegionFromResponse::Metric {
            metadata_synced: true,
            data_synced: false,
            new_opened_logical_region_ids,
        })
    }
}
// Integration-style tests for `SyncRegionFromRequest::FromRegion`, driven
// through the metric engine's `sync_region` entry point.
#[cfg(test)]
mod tests {
use common_error::ext::ErrorExt;
use common_error::status_code::StatusCode;
use common_telemetry::debug;
use store_api::metric_engine_consts::{METRIC_ENGINE_NAME, PHYSICAL_TABLE_METADATA_KEY};
use store_api::region_engine::{RegionEngine, SyncRegionFromRequest};
use store_api::region_request::{
BatchRegionDdlRequest, PathType, RegionCloseRequest, RegionFlushRequest, RegionOpenRequest,
RegionRequest,
};
use store_api::storage::RegionId;
use crate::metadata_region::MetadataRegion;
use crate::test_util::{TestEnv, create_logical_region_request};
/// Asserts that the column names recorded for `logical_region_id`, once
/// sorted, equal `expected_columns`.
///
/// The names are sorted before the comparison, so `expected_columns` must be
/// given in sorted order.
async fn assert_logical_table_columns(
metadata_region: &MetadataRegion,
physical_region_id: RegionId,
logical_region_id: RegionId,
expected_columns: &[&str],
) {
let mut columns = metadata_region
.logical_columns(physical_region_id, logical_region_id)
.await
.unwrap()
.into_iter()
.map(|(n, _)| n)
.collect::<Vec<_>>();
columns.sort_unstable();
assert_eq!(columns, expected_columns);
}
/// End-to-end check of syncing one physical region from another.
///
/// Creates a source physical region with two logical regions and flushes it,
/// creates a target physical region, syncs the target from the source and
/// checks the opened logical regions and their columns. It then syncs a second
/// time (expecting nothing new) and finally closes and reopens the target to
/// check that both logical regions are still listed.
#[tokio::test]
async fn test_sync_region_from_region() {
common_telemetry::init_default_ut_logging();
let env = TestEnv::new().await;
let metric_engine = env.metric();
let source_physical_region_id = RegionId::new(1024, 0);
let logical_region_id1 = RegionId::new(1025, 0);
let logical_region_id2 = RegionId::new(1026, 0);
env.create_physical_region(source_physical_region_id, "/test_dir1", vec![])
.await;
let region_create_request1 =
create_logical_region_request(&["job"], source_physical_region_id, "logical1");
let region_create_request2 =
create_logical_region_request(&["host"], source_physical_region_id, "logical2");
metric_engine
.handle_batch_ddl_requests(BatchRegionDdlRequest::Create(vec![
(logical_region_id1, region_create_request1),
(logical_region_id2, region_create_request2),
]))
.await
.unwrap();
debug!("Flushing source physical region");
metric_engine
.handle_request(
source_physical_region_id,
RegionRequest::Flush(RegionFlushRequest {
row_group_size: None,
}),
)
.await
.unwrap();
let logical_regions = metric_engine
.logical_regions(source_physical_region_id)
.await
.unwrap();
assert!(logical_regions.contains(&logical_region_id1));
assert!(logical_regions.contains(&logical_region_id2));
// The target ids reuse the source table ids with region number 1 instead of 0.
let target_physical_region_id = RegionId::new(1024, 1);
let target_logical_region_id1 = RegionId::new(1025, 1);
let target_logical_region_id2 = RegionId::new(1026, 1);
// Prepare target physical region
env.create_physical_region(target_physical_region_id, "/test_dir1", vec![])
.await;
// First sync: both logical regions should be reported as newly opened.
let r = metric_engine
.sync_region(
target_physical_region_id,
SyncRegionFromRequest::FromRegion {
source_region_id: source_physical_region_id,
parallelism: 1,
},
)
.await
.unwrap();
let new_opened_logical_region_ids = r.new_opened_logical_region_ids().unwrap();
assert_eq!(new_opened_logical_region_ids.len(), 2);
assert!(new_opened_logical_region_ids.contains(&target_logical_region_id1));
assert!(new_opened_logical_region_ids.contains(&target_logical_region_id2));
// NOTE(review): this message is logged before the assertions on the first
// sync's result; the second sync itself only happens further below.
debug!("Sync region from again");
assert_logical_table_columns(
&env.metadata_region(),
target_physical_region_id,
target_logical_region_id1,
&["greptime_timestamp", "greptime_value", "job"],
)
.await;
assert_logical_table_columns(
&env.metadata_region(),
target_physical_region_id,
target_logical_region_id2,
&["greptime_timestamp", "greptime_value", "host"],
)
.await;
let logical_regions = env
.metadata_region()
.logical_regions(target_physical_region_id)
.await
.unwrap();
assert_eq!(logical_regions.len(), 2);
assert!(logical_regions.contains(&target_logical_region_id1));
assert!(logical_regions.contains(&target_logical_region_id2));
// Should be ok to sync region from again.
let r = metric_engine
.sync_region(
target_physical_region_id,
SyncRegionFromRequest::FromRegion {
source_region_id: source_physical_region_id,
parallelism: 1,
},
)
.await
.unwrap();
let new_opened_logical_region_ids = r.new_opened_logical_region_ids().unwrap();
assert!(new_opened_logical_region_ids.is_empty());
// Try to close region and reopen it, should be ok.
metric_engine
.handle_request(
target_physical_region_id,
RegionRequest::Close(RegionCloseRequest {}),
)
.await
.unwrap();
// Reopen with the physical-table metadata key set to an empty value.
let physical_region_option = [(PHYSICAL_TABLE_METADATA_KEY.to_string(), String::new())]
.into_iter()
.collect();
metric_engine
.handle_request(
target_physical_region_id,
RegionRequest::Open(RegionOpenRequest {
engine: METRIC_ENGINE_NAME.to_string(),
table_dir: "/test_dir1".to_string(),
path_type: PathType::Bare,
options: physical_region_option,
skip_wal_replay: false,
checkpoint: None,
}),
)
.await
.unwrap();
// After the reopen both logical regions must still be listed.
let logical_regions = env
.metadata_region()
.logical_regions(target_physical_region_id)
.await
.unwrap();
assert_eq!(logical_regions.len(), 2);
assert!(logical_regions.contains(&target_logical_region_id1));
assert!(logical_regions.contains(&target_logical_region_id2));
}
/// Syncs between two freshly created physical regions with no logical regions
/// and no flush; the sync is expected to succeed and open nothing.
#[tokio::test]
async fn test_sync_region_from_region_with_no_files() {
common_telemetry::init_default_ut_logging();
let env = TestEnv::new().await;
let metric_engine = env.metric();
let source_physical_region_id = RegionId::new(1024, 0);
env.create_physical_region(source_physical_region_id, "/test_dir1", vec![])
.await;
let target_physical_region_id = RegionId::new(1024, 1);
env.create_physical_region(target_physical_region_id, "/test_dir1", vec![])
.await;
let r = metric_engine
.sync_region(
target_physical_region_id,
SyncRegionFromRequest::FromRegion {
source_region_id: source_physical_region_id,
parallelism: 1,
},
)
.await
.unwrap();
let new_opened_logical_region_ids = r.new_opened_logical_region_ids().unwrap();
assert!(new_opened_logical_region_ids.is_empty());
}
/// Only the target physical region is created; syncing from the source id that
/// was never created is expected to fail with `StatusCode::InvalidArguments`.
#[tokio::test]
async fn test_sync_region_from_region_source_not_exist() {
common_telemetry::init_default_ut_logging();
let env = TestEnv::new().await;
let metric_engine = env.metric();
let source_physical_region_id = RegionId::new(1024, 0);
let target_physical_region_id = RegionId::new(1024, 1);
env.create_physical_region(target_physical_region_id, "/test_dir1", vec![])
.await;
let err = metric_engine
.sync_region(
target_physical_region_id,
SyncRegionFromRequest::FromRegion {
source_region_id: source_physical_region_id,
parallelism: 1,
},
)
.await
.unwrap_err();
assert_eq!(err.status_code(), StatusCode::InvalidArguments);
}
}

View File

@@ -21,7 +21,7 @@ use common_macro::stack_trace_debug;
use datatypes::prelude::ConcreteDataType;
use snafu::{Location, Snafu};
use store_api::region_request::RegionRequest;
use store_api::storage::RegionId;
use store_api::storage::{FileId, RegionId};
#[derive(Snafu)]
#[snafu(visibility(pub))]
@@ -128,6 +128,27 @@ pub enum Error {
location: Location,
},
#[snafu(display(
"Mito copy region from operation fails, source region id: {}, target region id: {}",
source_region_id,
target_region_id
))]
MitoCopyRegionFromOperation {
source: BoxedError,
#[snafu(implicit)]
location: Location,
source_region_id: RegionId,
target_region_id: RegionId,
},
#[snafu(display("Mito edit region operation fails, region id: {}", region_id))]
MitoEditRegion {
region_id: RegionId,
source: BoxedError,
#[snafu(implicit)]
location: Location,
},
#[snafu(display("Failed to encode primary key"))]
EncodePrimaryKey {
source: mito_codec::error::Error,
@@ -256,6 +277,21 @@ pub enum Error {
location: Location,
},
#[snafu(display("Unsupported sync region from request for region {}", region_id))]
UnsupportedSyncRegionFromRequest {
region_id: RegionId,
#[snafu(implicit)]
location: Location,
},
#[snafu(display("Missing file metas in region {}, file ids: {:?}", region_id, file_ids))]
MissingFiles {
region_id: RegionId,
#[snafu(implicit)]
location: Location,
file_ids: Vec<FileId>,
},
#[snafu(display("Unsupported alter kind: {}", kind))]
UnsupportedAlterKind {
kind: String,
@@ -339,11 +375,12 @@ impl ErrorExt for Error {
| ParseRegionOptions { .. }
| UnexpectedRequest { .. }
| UnsupportedAlterKind { .. }
| UnsupportedRemapManifestsRequest { .. } => StatusCode::InvalidArguments,
| UnsupportedRemapManifestsRequest { .. }
| UnsupportedSyncRegionFromRequest { .. } => StatusCode::InvalidArguments,
ForbiddenPhysicalAlter { .. } | UnsupportedRegionRequest { .. } => {
StatusCode::Unsupported
}
ForbiddenPhysicalAlter { .. }
| UnsupportedRegionRequest { .. }
| MissingFiles { .. } => StatusCode::Unsupported,
DeserializeColumnMetadata { .. }
| SerializeColumnMetadata { .. }
@@ -369,7 +406,9 @@ impl ErrorExt for Error {
| MitoSyncOperation { source, .. }
| MitoEnterStagingOperation { source, .. }
| BatchOpenMitoRegion { source, .. }
| BatchCatchupMitoRegion { source, .. } => source.status_code(),
| BatchCatchupMitoRegion { source, .. }
| MitoCopyRegionFromOperation { source, .. }
| MitoEditRegion { source, .. } => source.status_code(),
EncodePrimaryKey { source, .. } => source.status_code(),

View File

@@ -25,6 +25,7 @@ use base64::Engine;
use base64::engine::general_purpose::STANDARD_NO_PAD;
use common_base::readable_size::ReadableSize;
use common_recordbatch::{RecordBatch, SendableRecordBatchStream};
use common_telemetry::{debug, info, warn};
use datafusion::prelude::{col, lit};
use futures_util::TryStreamExt;
use futures_util::stream::BoxStream;
@@ -400,14 +401,11 @@ impl MetadataRegion {
.await
.context(CacheGetSnafu)?;
let range = region_metadata.key_values.range(prefix.to_string()..);
let mut result = HashMap::new();
for (k, v) in range {
if !k.starts_with(prefix) {
break;
}
result.insert(k.clone(), v.clone());
}
get_all_with_prefix(&region_metadata, prefix, |k, v| {
result.insert(k.to_string(), v.to_string());
Ok(())
})?;
Ok(result)
}
@@ -558,6 +556,109 @@ impl MetadataRegion {
Ok(())
}
/// Rewrites logical region metadata that still refers to `source_region_id` so that
/// it refers to the data region of `physical_region_id` instead.
///
/// Intended to be called once the files of `source_region_id` have been copied into
/// the target region. The logical regions registered under the target physical region
/// are filtered down to those whose region number equals the source's region number.
/// For each of them, every column entry and the region entry are re-inserted with the
/// target data region's region number. The cached metadata of the target metadata
/// region is invalidated after the write succeeds.
///
/// Returns `Ok(())` without writing anything when no logical region matches.
pub async fn transform_logical_region_metadata(
    &self,
    physical_region_id: RegionId,
    source_region_id: RegionId,
) -> Result<()> {
    let metadata_region_id = utils::to_metadata_region_id(physical_region_id);
    let data_region_id = utils::to_data_region_id(physical_region_id);
    let source_region_number = source_region_id.region_number();
    let target_region_number = data_region_id.region_number();

    let matched_regions: Vec<_> = self
        .logical_regions(data_region_id)
        .await?
        .into_iter()
        .filter(|region| region.region_number() == source_region_number)
        .collect();
    if matched_regions.is_empty() {
        info!(
            "No logical regions found from source region {}, physical region id: {}",
            source_region_id, physical_region_id,
        );
        return Ok(());
    }

    let metadata = self.load_all(metadata_region_id).await?;
    let mut rewritten = Vec::new();
    for &logical_region_id in &matched_regions {
        // Column entries first, then the region entry, for every matched region.
        let column_prefix = MetadataRegion::concat_column_key_prefix(logical_region_id);
        get_all_with_prefix(&metadata, &column_prefix, |key, value| {
            // Safety: only keys carrying the column-key prefix reach this closure.
            let (column_region_id, column_name) = Self::parse_column_key(key)?.unwrap();
            // Keep the table id, swap in the region number of the target data region.
            let target_region_id =
                RegionId::new(column_region_id.table_id(), target_region_number);
            rewritten.push((
                MetadataRegion::concat_column_key(target_region_id, &column_name),
                value.to_string(),
            ));
            Ok(())
        })?;

        let target_region_id = RegionId::new(logical_region_id.table_id(), target_region_number);
        rewritten.push((
            MetadataRegion::concat_region_key(target_region_id),
            String::new(),
        ));
    }

    if rewritten.is_empty() {
        warn!(
            "No logical regions metadata found from source region {}, physical region id: {}",
            source_region_id, physical_region_id
        );
        return Ok(());
    }

    debug!(
        "Transform logical regions metadata to physical region {}, source region: {}, transformed metadata: {}",
        data_region_id,
        source_region_id,
        rewritten.len(),
    );

    let put_request = MetadataRegion::build_put_request_from_iter(rewritten.into_iter());
    let write_request = store_api::region_request::RegionRequest::Put(put_request);
    self.mito
        .handle_request(metadata_region_id, write_request)
        .await
        .context(MitoWriteOperationSnafu)?;

    info!(
        "Transformed {} logical regions metadata to physical region {}, source region: {}",
        matched_regions.len(),
        data_region_id,
        source_region_id
    );

    // Drop the cached entry so later reads observe the rewritten keys.
    self.cache.invalidate(&metadata_region_id).await;
    Ok(())
}
}
/// Calls `callback` for every `(key, value)` entry of `region_metadata.key_values`
/// whose key starts with `prefix`, in the order produced by the map's range iterator.
///
/// The scan begins at `prefix` and stops at the first key that does not carry the
/// prefix. An error returned by `callback` ends the scan immediately and is returned
/// to the caller.
fn get_all_with_prefix(
    region_metadata: &RegionMetadataCacheEntry,
    prefix: &str,
    mut callback: impl FnMut(&str, &str) -> Result<()>,
) -> Result<()> {
    region_metadata
        .key_values
        .range(prefix.to_string()..)
        .take_while(|(key, _)| key.starts_with(prefix))
        .try_for_each(|(key, value)| callback(key, value))
}
#[cfg(test)]

View File

@@ -62,7 +62,7 @@ use crate::read::projection::ProjectionMapper;
use crate::read::scan_region::{PredicateGroup, ScanInput};
use crate::read::seq_scan::SeqScan;
use crate::read::{BoxedBatchReader, BoxedRecordBatchStream};
use crate::region::options::MergeMode;
use crate::region::options::{MergeMode, RegionOptions};
use crate::region::version::VersionControlRef;
use crate::region::{ManifestContextRef, RegionLeaderState, RegionRoleState};
use crate::request::{OptionOutputTx, OutputTx, WorkerRequestWithTime};
@@ -311,9 +311,24 @@ impl CompactionScheduler {
request: CompactionRequest,
options: compact_request::Options,
) -> Result<()> {
let region_id = request.region_id();
let (dynamic_compaction_opts, ttl) = find_dynamic_options(
region_id.table_id(),
&request.current_version.options,
&request.schema_metadata_manager,
)
.await
.unwrap_or_else(|e| {
warn!(e; "Failed to find dynamic options for region: {}", region_id);
(
request.current_version.options.compaction.clone(),
request.current_version.options.ttl.unwrap_or_default(),
)
});
let picker = new_picker(
&options,
&request.current_version.options.compaction,
&dynamic_compaction_opts,
request.current_version.options.append_mode,
Some(self.engine_config.max_background_compactions),
);
@@ -328,21 +343,10 @@ impl CompactionScheduler {
cache_manager,
manifest_ctx,
listener,
schema_metadata_manager,
schema_metadata_manager: _,
max_parallelism,
} = request;
let ttl = find_ttl(
region_id.table_id(),
current_version.options.ttl,
&schema_metadata_manager,
)
.await
.unwrap_or_else(|e| {
warn!(e; "Failed to get ttl for region: {}", region_id);
TimeToLive::default()
});
debug!(
"Pick compaction strategy {:?} for region: {}, ttl: {:?}",
picker, region_id, ttl
@@ -351,7 +355,10 @@ impl CompactionScheduler {
let compaction_region = CompactionRegion {
region_id,
current_version: current_version.clone(),
region_options: current_version.options.clone(),
region_options: RegionOptions {
compaction: dynamic_compaction_opts.clone(),
..current_version.options.clone()
},
engine_config: engine_config.clone(),
region_metadata: current_version.metadata.clone(),
cache_manager: cache_manager.clone(),
@@ -382,7 +389,7 @@ impl CompactionScheduler {
// If specified to run compaction remotely, we schedule the compaction job remotely.
// It will fall back to local compaction if there is no remote job scheduler.
let waiters = if current_version.options.compaction.remote_compaction() {
let waiters = if dynamic_compaction_opts.remote_compaction() {
if let Some(remote_job_scheduler) = &self.plugins.get::<RemoteJobSchedulerRef>() {
let remote_compaction_job = CompactionJob {
compaction_region: compaction_region.clone(),
@@ -411,7 +418,7 @@ impl CompactionScheduler {
return Ok(());
}
Err(e) => {
if !current_version.options.compaction.fallback_to_local() {
if !dynamic_compaction_opts.fallback_to_local() {
error!(e; "Failed to schedule remote compaction job for region {}", region_id);
return RemoteCompactionSnafu {
region_id,
@@ -494,29 +501,88 @@ impl Drop for CompactionScheduler {
}
}
/// Finds TTL of table by first examine table options then database options.
async fn find_ttl(
/// Finds compaction options and TTL together with a single metadata fetch to reduce RTT.
async fn find_dynamic_options(
table_id: TableId,
table_ttl: Option<TimeToLive>,
region_options: &crate::region::options::RegionOptions,
schema_metadata_manager: &SchemaMetadataManagerRef,
) -> Result<TimeToLive> {
// If table TTL is set, we use it.
if let Some(table_ttl) = table_ttl {
return Ok(table_ttl);
) -> Result<(crate::region::options::CompactionOptions, TimeToLive)> {
if region_options.compaction_override && region_options.ttl.is_some() {
debug!(
"Use region options directly for table {}: compaction={:?}, ttl={:?}",
table_id, region_options.compaction, region_options.ttl
);
return Ok((
region_options.compaction.clone(),
region_options.ttl.unwrap(),
));
}
let ttl = tokio::time::timeout(
let db_options = tokio::time::timeout(
crate::config::FETCH_OPTION_TIMEOUT,
schema_metadata_manager.get_schema_options_by_table_id(table_id),
)
.await
.context(TimeoutSnafu)?
.context(GetSchemaMetadataSnafu)?
.and_then(|options| options.ttl)
.unwrap_or_default()
.into();
.context(GetSchemaMetadataSnafu)?;
Ok(ttl)
let ttl = if region_options.ttl.is_some() {
debug!(
"Use region TTL directly for table {}: ttl={:?}",
table_id, region_options.ttl
);
region_options.ttl.unwrap()
} else {
db_options
.as_ref()
.and_then(|options| options.ttl)
.unwrap_or_default()
.into()
};
let compaction = if !region_options.compaction_override {
if let Some(schema_opts) = db_options {
let map: HashMap<String, String> = schema_opts
.extra_options
.iter()
.filter_map(|(k, v)| {
if k.starts_with("compaction.") {
Some((k.clone(), v.clone()))
} else {
None
}
})
.collect();
if map.is_empty() {
region_options.compaction.clone()
} else {
crate::region::options::RegionOptions::try_from(&map)
.map(|o| o.compaction)
.unwrap_or_else(|e| {
error!(e; "Failed to create RegionOptions from map");
region_options.compaction.clone()
})
}
} else {
debug!(
"DB options is None for table {}, use region compaction: compaction={:?}",
table_id, region_options.compaction
);
region_options.compaction.clone()
}
} else {
debug!(
"No schema options for table {}, use region compaction: compaction={:?}",
table_id, region_options.compaction
);
region_options.compaction.clone()
};
debug!(
"Resolved dynamic options for table {}: compaction={:?}, ttl={:?}",
table_id, compaction, ttl
);
Ok((compaction, ttl))
}
/// Status of running and pending region compaction tasks.
@@ -805,8 +871,12 @@ struct PendingCompaction {
#[cfg(test)]
mod tests {
use std::time::Duration;
use api::v1::region::StrictWindow;
use common_datasource::compression::CompressionType;
use common_meta::key::schema_name::SchemaNameValue;
use common_time::DatabaseTimeToLive;
use tokio::sync::{Barrier, oneshot};
use super::*;
@@ -818,6 +888,163 @@ mod tests {
use crate::test_util::scheduler_util::{SchedulerEnv, VecScheduler};
use crate::test_util::version_util::{VersionControlBuilder, apply_edit};
/// Database-level `compaction.*` options are resolved by `find_dynamic_options`, and a
/// compaction request for the same region can be scheduled afterwards.
#[tokio::test]
async fn test_find_compaction_options_db_level() {
    let env = SchedulerEnv::new().await;
    let builder = VersionControlBuilder::new();
    let (schema_metadata_manager, kv_backend) = mock_schema_metadata_manager();
    let region_id = builder.region_id();
    let table_id = region_id.table_id();

    // Register the table with database-level options: a default TTL plus TWCS
    // compaction using a 2h time window.
    let mut schema_value = SchemaNameValue {
        ttl: Some(DatabaseTimeToLive::default()),
        ..Default::default()
    };
    for (key, value) in [
        ("compaction.type", "twcs"),
        ("compaction.twcs.time_window", "2h"),
    ] {
        schema_value
            .extra_options
            .insert(key.to_string(), value.to_string());
    }
    schema_metadata_manager
        .register_region_table_info(
            table_id,
            "t",
            "c",
            "s",
            Some(schema_value),
            kv_backend.clone(),
        )
        .await;

    let version_control = Arc::new(builder.build());
    let region_opts = version_control.current().version.options.clone();
    let (compaction, _ttl) =
        find_dynamic_options(table_id, &region_opts, &schema_metadata_manager)
            .await
            .unwrap();
    // The database-level window must win because the region does not override it.
    let crate::region::options::CompactionOptions::Twcs(twcs) = compaction;
    assert_eq!(twcs.time_window_seconds(), Some(2 * 3600));

    let manifest_ctx = env
        .mock_manifest_context(version_control.current().version.metadata.clone())
        .await;
    let (tx, _rx) = mpsc::channel(4);
    let mut scheduler = env.mock_compaction_scheduler(tx);
    let (otx, _orx) = oneshot::channel();

    let status = scheduler.region_status.entry(region_id).or_insert_with(|| {
        crate::compaction::CompactionStatus::new(
            region_id,
            version_control.clone(),
            env.access_layer.clone(),
        )
    });
    let request = status.new_compaction_request(
        scheduler.request_sender.clone(),
        OptionOutputTx::new(Some(OutputTx::new(otx))),
        scheduler.engine_config.clone(),
        scheduler.cache_manager.clone(),
        &manifest_ctx,
        scheduler.listener.clone(),
        schema_metadata_manager.clone(),
        1,
    );

    // Scheduling must succeed with the dynamically resolved options.
    let regular = compact_request::Options::Regular(Default::default());
    scheduler
        .schedule_compaction_request(request, regular)
        .await
        .unwrap();
}
#[tokio::test]
async fn test_find_compaction_options_priority() {
fn schema_value_with_twcs(time_window: &str) -> SchemaNameValue {
let mut schema_value = SchemaNameValue {
ttl: Some(DatabaseTimeToLive::default()),
..Default::default()
};
schema_value
.extra_options
.insert("compaction.type".to_string(), "twcs".to_string());
schema_value.extra_options.insert(
"compaction.twcs.time_window".to_string(),
time_window.to_string(),
);
schema_value
}
let cases = [
(
"db options set and table override set",
Some(schema_value_with_twcs("2h")),
true,
Some(Duration::from_secs(5 * 3600)),
Some(5 * 3600),
),
(
"db options set and table override not set",
Some(schema_value_with_twcs("2h")),
false,
None,
Some(2 * 3600),
),
(
"db options not set and table override set",
None,
true,
Some(Duration::from_secs(4 * 3600)),
Some(4 * 3600),
),
(
"db options not set and table override not set",
None,
false,
None,
None,
),
];
for (case_name, schema_value, override_set, table_window, expected_window) in cases {
let builder = VersionControlBuilder::new();
let (schema_metadata_manager, kv_backend) = mock_schema_metadata_manager();
let table_id = builder.region_id().table_id();
schema_metadata_manager
.register_region_table_info(
table_id,
"t",
"c",
"s",
schema_value,
kv_backend.clone(),
)
.await;
let version_control = Arc::new(builder.build());
let mut region_opts = version_control.current().version.options.clone();
region_opts.compaction_override = override_set;
if let Some(window) = table_window {
let crate::region::options::CompactionOptions::Twcs(twcs) =
&mut region_opts.compaction;
twcs.time_window = Some(window);
}
let (opts, _) = find_dynamic_options(table_id, &region_opts, &schema_metadata_manager)
.await
.unwrap();
match opts {
crate::region::options::CompactionOptions::Twcs(t) => {
assert_eq!(t.time_window_seconds(), expected_window, "{case_name}");
}
}
}
}
#[tokio::test]
async fn test_schedule_empty() {
let env = SchedulerEnv::new().await;

View File

@@ -35,7 +35,7 @@ use crate::access_layer::{
};
use crate::cache::{CacheManager, CacheManagerRef};
use crate::compaction::picker::{PickerOutput, new_picker};
use crate::compaction::{CompactionOutput, CompactionSstReaderBuilder, find_ttl};
use crate::compaction::{CompactionOutput, CompactionSstReaderBuilder, find_dynamic_options};
use crate::config::MitoConfig;
use crate::error::{
EmptyRegionDirSnafu, InvalidPartitionExprSnafu, JoinSnafu, ObjectStoreNotFoundSnafu, Result,
@@ -203,16 +203,22 @@ pub async fn open_compaction_region(
// Use the specified ttl.
Either::Left(ttl) => ttl,
// Get the ttl from the schema metadata manager.
Either::Right(schema_metadata_manager) => find_ttl(
req.region_id.table_id(),
current_version.options.ttl,
&schema_metadata_manager,
)
.await
.unwrap_or_else(|e| {
warn!(e; "Failed to get ttl for region: {}", region_metadata.region_id);
TimeToLive::default()
}),
Either::Right(schema_metadata_manager) => {
let (_, ttl) = find_dynamic_options(
req.region_id.table_id(),
&req.region_options,
&schema_metadata_manager,
)
.await
.unwrap_or_else(|e| {
warn!(e; "Failed to get ttl for region: {}", region_metadata.region_id);
(
crate::region::options::CompactionOptions::default(),
TimeToLive::default(),
)
});
ttl
}
};
Ok(CompactionRegion {

View File

@@ -162,6 +162,7 @@ impl CompactionTaskImpl {
edit,
result: Ok(()),
update_region_state: false,
is_staging: false,
}),
})
.await;

View File

@@ -244,6 +244,7 @@ mod tests {
options: RegionOptions {
ttl: ttl.map(|t| t.into()),
compaction: Default::default(),
compaction_override: false,
storage: None,
append_mode: false,
wal_options: Default::default(),

View File

@@ -76,6 +76,8 @@ mod copy_region_from_test;
#[cfg(test)]
mod remap_manifests_test;
#[cfg(test)]
mod apply_staging_manifest_test;
mod puffin_index;
use std::any::Any;
@@ -87,6 +89,7 @@ use api::region::RegionResponse;
use async_trait::async_trait;
use common_base::Plugins;
use common_error::ext::BoxedError;
use common_meta::error::UnexpectedSnafu;
use common_meta::key::SchemaMetadataManagerRef;
use common_recordbatch::{MemoryPermit, QueryMemoryTracker, SendableRecordBatchStream};
use common_stat::get_total_memory_bytes;
@@ -105,10 +108,10 @@ use store_api::metric_engine_consts::{
MANIFEST_INFO_EXTENSION_KEY, TABLE_COLUMN_METADATA_EXTENSION_KEY,
};
use store_api::region_engine::{
BatchResponses, CopyRegionFromRequest, CopyRegionFromResponse, MitoCopyRegionFromResponse,
RegionEngine, RegionManifestInfo, RegionRole, RegionScannerRef, RegionStatistic,
RemapManifestsRequest, RemapManifestsResponse, SetRegionRoleStateResponse,
SettableRegionRoleState, SyncManifestResponse,
BatchResponses, MitoCopyRegionFromRequest, MitoCopyRegionFromResponse, RegionEngine,
RegionManifestInfo, RegionRole, RegionScannerRef, RegionStatistic, RemapManifestsRequest,
RemapManifestsResponse, SetRegionRoleStateResponse, SettableRegionRoleState,
SyncRegionFromRequest, SyncRegionFromResponse,
};
use store_api::region_request::{
AffectedRows, RegionCatchupRequest, RegionOpenRequest, RegionRequest,
@@ -122,8 +125,8 @@ use crate::cache::{CacheManagerRef, CacheStrategy};
use crate::config::MitoConfig;
use crate::engine::puffin_index::{IndexEntryContext, collect_index_entries_from_puffin};
use crate::error::{
self, InvalidRequestSnafu, JoinSnafu, MitoManifestInfoSnafu, RecvSnafu, RegionNotFoundSnafu,
Result, SerdeJsonSnafu, SerializeColumnMetadataSnafu, SerializeManifestSnafu,
InvalidRequestSnafu, JoinSnafu, MitoManifestInfoSnafu, RecvSnafu, RegionNotFoundSnafu, Result,
SerdeJsonSnafu, SerializeColumnMetadataSnafu, SerializeManifestSnafu,
};
#[cfg(feature = "enterprise")]
use crate::extension::BoxedExtensionRangeProviderFactory;
@@ -395,7 +398,7 @@ impl MitoEngine {
}
/// Edit region's metadata by [RegionEdit] directly. Use with care.
/// Now we only allow adding files to region (the [RegionEdit] struct can only contain a non-empty "files_to_add" field).
/// Now we only allow adding files or removing files from region (the [RegionEdit] struct can only contain a non-empty "files_to_add" or "files_to_remove" field).
/// Other region editing intention will result in an "invalid request" error.
/// Also note that if a region is to be edited directly, we MUST not write data to it thereafter.
pub async fn edit_region(&self, region_id: RegionId, edit: RegionEdit) -> Result<()> {
@@ -430,7 +433,7 @@ impl MitoEngine {
pub async fn copy_region_from(
&self,
region_id: RegionId,
request: CopyRegionFromRequest,
request: MitoCopyRegionFromRequest,
) -> Result<MitoCopyRegionFromResponse> {
self.inner.copy_region_from(region_id, request).await
}
@@ -639,8 +642,7 @@ impl MitoEngine {
///
/// Only adding or removing files to region is considered valid now.
fn is_valid_region_edit(edit: &RegionEdit) -> bool {
!edit.files_to_add.is_empty()
&& edit.files_to_remove.is_empty()
(!edit.files_to_add.is_empty() || !edit.files_to_remove.is_empty())
&& matches!(
edit,
RegionEdit {
@@ -1073,7 +1075,7 @@ impl EngineInner {
async fn copy_region_from(
&self,
region_id: RegionId,
request: CopyRegionFromRequest,
request: MitoCopyRegionFromRequest,
) -> Result<MitoCopyRegionFromResponse> {
let (request, receiver) =
WorkerRequest::try_from_copy_region_from_request(region_id, request)?;
@@ -1247,15 +1249,21 @@ impl RegionEngine for MitoEngine {
async fn sync_region(
&self,
region_id: RegionId,
manifest_info: RegionManifestInfo,
) -> Result<SyncManifestResponse, BoxedError> {
request: SyncRegionFromRequest,
) -> Result<SyncRegionFromResponse, BoxedError> {
let manifest_info = request
.into_region_manifest_info()
.context(UnexpectedSnafu {
err_msg: "Expected a manifest info request",
})
.map_err(BoxedError::new)?;
let (_, synced) = self
.inner
.sync_region(region_id, manifest_info)
.await
.map_err(BoxedError::new)?;
Ok(SyncManifestResponse::Mito { synced })
Ok(SyncRegionFromResponse::Mito { synced })
}
async fn remap_manifests(
@@ -1268,19 +1276,6 @@ impl RegionEngine for MitoEngine {
.map_err(BoxedError::new)
}
async fn copy_region_from(
&self,
_region_id: RegionId,
_request: CopyRegionFromRequest,
) -> Result<CopyRegionFromResponse, BoxedError> {
Err(BoxedError::new(
error::UnsupportedOperationSnafu {
err_msg: "copy_region_from is not supported",
}
.build(),
))
}
fn role(&self, region_id: RegionId) -> Option<RegionRole> {
self.inner.role(region_id)
}
@@ -1419,7 +1414,7 @@ mod tests {
};
assert!(is_valid_region_edit(&edit));
// Invalid: "files_to_add" is empty
// Invalid: "files_to_add" and "files_to_remove" are both empty
let edit = RegionEdit {
files_to_add: vec![],
files_to_remove: vec![],
@@ -1431,7 +1426,7 @@ mod tests {
};
assert!(!is_valid_region_edit(&edit));
// Invalid: "files_to_remove" is not empty
// Valid: "files_to_remove" is not empty
let edit = RegionEdit {
files_to_add: vec![FileMeta::default()],
files_to_remove: vec![FileMeta::default()],
@@ -1441,7 +1436,7 @@ mod tests {
flushed_sequence: None,
committed_sequence: None,
};
assert!(!is_valid_region_edit(&edit));
assert!(is_valid_region_edit(&edit));
// Invalid: other fields are not all "None"s
let edit = RegionEdit {

View File

@@ -0,0 +1,400 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use std::assert_matches::assert_matches;
use std::fs;
use api::v1::Rows;
use datatypes::value::Value;
use partition::expr::{PartitionExpr, col};
use store_api::region_engine::{
RegionEngine, RegionRole, RemapManifestsRequest, SettableRegionRoleState,
};
use store_api::region_request::{
ApplyStagingManifestRequest, EnterStagingRequest, RegionFlushRequest, RegionRequest,
};
use store_api::storage::{FileId, RegionId};
use crate::config::MitoConfig;
use crate::error::Error;
use crate::manifest::action::RegionManifest;
use crate::sst::file::FileMeta;
use crate::test_util::{CreateRequestBuilder, TestEnv, build_rows, put_rows, rows_schema};
/// Builds the partition expression `col_name >= start AND col_name < end` with
/// `Int64` bounds.
fn range_expr(col_name: &str, start: i64, end: i64) -> PartitionExpr {
    let lower_bound = col(col_name).gt_eq(Value::Int64(start));
    let upper_bound = col(col_name).lt(Value::Int64(end));
    lower_bound.and(upper_bound)
}
/// Runs the invalid-region-state scenario with the flat format disabled, then enabled.
#[tokio::test]
async fn test_apply_staging_manifest_invalid_region_state() {
    common_telemetry::init_default_ut_logging();

    for flat_format in [false, true] {
        test_apply_staging_manifest_invalid_region_state_with_format(flat_format).await;
    }
}
async fn test_apply_staging_manifest_invalid_region_state_with_format(flat_format: bool) {
let mut env = TestEnv::with_prefix("invalid-region-state").await;
let engine = env
.create_engine(MitoConfig {
default_experimental_flat_format: flat_format,
..Default::default()
})
.await;
let region_id = RegionId::new(1, 1);
let request = CreateRequestBuilder::new()
.partition_expr_json(Some(range_expr("x", 0, 50).as_json_str().unwrap()))
.build();
engine
.handle_request(region_id, RegionRequest::Create(request))
.await
.unwrap();
// Region is in leader state, apply staging manifest request should fail.
let err = engine
.handle_request(
region_id,
RegionRequest::ApplyStagingManifest(ApplyStagingManifestRequest {
partition_expr: range_expr("x", 0, 100).as_json_str().unwrap(),
files_to_add: vec![],
}),
)
.await
.unwrap_err();
assert_matches!(
err.into_inner().as_any().downcast_ref::<Error>().unwrap(),
Error::RegionState { .. }
);
// Region is in leader state, apply staging manifest request should fail.
engine
.set_region_role(region_id, RegionRole::Follower)
.unwrap();
let err = engine
.handle_request(
region_id,
RegionRequest::ApplyStagingManifest(ApplyStagingManifestRequest {
partition_expr: range_expr("x", 0, 100).as_json_str().unwrap(),
files_to_add: vec![],
}),
)
.await
.unwrap_err();
assert_matches!(
err.into_inner().as_any().downcast_ref::<Error>().unwrap(),
Error::RegionState { .. }
);
}
/// Runs the mismatched-partition-expression scenario with the flat format disabled,
/// then enabled.
#[tokio::test]
async fn test_apply_staging_manifest_mismatched_partition_expr() {
    common_telemetry::init_default_ut_logging();

    for flat_format in [false, true] {
        test_apply_staging_manifest_mismatched_partition_expr_with_format(flat_format).await;
    }
}
async fn test_apply_staging_manifest_mismatched_partition_expr_with_format(flat_format: bool) {
let mut env = TestEnv::with_prefix("mismatched-partition-expr").await;
let engine = env
.create_engine(MitoConfig {
default_experimental_flat_format: flat_format,
..Default::default()
})
.await;
let region_id = RegionId::new(1, 1);
let request = CreateRequestBuilder::new().build();
engine
.handle_request(region_id, RegionRequest::Create(request))
.await
.unwrap();
engine
.handle_request(
region_id,
RegionRequest::EnterStaging(EnterStagingRequest {
partition_expr: range_expr("x", 0, 50).as_json_str().unwrap(),
}),
)
.await
.unwrap();
let err = engine
.handle_request(
region_id,
RegionRequest::ApplyStagingManifest(ApplyStagingManifestRequest {
partition_expr: range_expr("x", 0, 100).as_json_str().unwrap(),
files_to_add: vec![],
}),
)
.await
.unwrap_err();
assert_matches!(
err.into_inner().as_any().downcast_ref::<Error>().unwrap(),
Error::StagingPartitionExprMismatch { .. }
)
}
/// Runs the successful apply-staging-manifest scenario with the flat format disabled,
/// then enabled.
#[tokio::test]
async fn test_apply_staging_manifest_success() {
    common_telemetry::init_default_ut_logging();

    for flat_format in [false, true] {
        test_apply_staging_manifest_success_with_format(flat_format).await;
    }
}
/// Happy path of `ApplyStagingManifest`.
///
/// Steps:
/// 1. Create region (1, 1), write three batches and flush after each one.
/// 2. Remap its manifest onto regions (1, 2) and (1, 3).
/// 3. Create region (1, 2), enter staging and apply the remapped file list.
/// 4. Verify that the files are visible, the region left staging, and all staging
///    state (in-memory manifest, partition expression, on-disk directory) is cleared.
/// 5. Send the request again with an extra file and verify that it changes nothing.
async fn test_apply_staging_manifest_success_with_format(flat_format: bool) {
    let mut env = TestEnv::with_prefix("success").await;
    let engine = env
        .create_engine(MitoConfig {
            default_experimental_flat_format: flat_format,
            ..Default::default()
        })
        .await;

    let region_id = RegionId::new(1, 1);
    let request = CreateRequestBuilder::new()
        .partition_expr_json(Some(range_expr("tag_0", 0, 100).as_json_str().unwrap()))
        .build();
    let column_schemas = rows_schema(&request);
    engine
        .handle_request(region_id, RegionRequest::Create(request))
        .await
        .unwrap();

    let new_region_id_1 = RegionId::new(1, 2);
    let new_region_id_2 = RegionId::new(1, 3);

    // Generate some data: one flush per batch. The remapped manifests below are
    // expected to list three files each.
    for i in 0..3 {
        let rows_data = Rows {
            schema: column_schemas.clone(),
            rows: build_rows(i * 10, (i + 1) * 10),
        };
        put_rows(&engine, region_id, rows_data).await;
        engine
            .handle_request(
                region_id,
                RegionRequest::Flush(RegionFlushRequest {
                    row_group_size: None,
                }),
            )
            .await
            .unwrap();
    }

    engine
        .set_region_role_state_gracefully(region_id, SettableRegionRoleState::StagingLeader)
        .await
        .unwrap();

    // Split the source region [0, 100) into [0, 50) and [50, 100).
    let result = engine
        .remap_manifests(RemapManifestsRequest {
            region_id,
            input_regions: vec![region_id],
            region_mapping: [(region_id, vec![new_region_id_1, new_region_id_2])]
                .into_iter()
                .collect(),
            new_partition_exprs: [
                (
                    new_region_id_1,
                    range_expr("tag_0", 0, 50).as_json_str().unwrap(),
                ),
                (
                    new_region_id_2,
                    range_expr("tag_0", 50, 100).as_json_str().unwrap(),
                ),
            ]
            .into_iter()
            .collect(),
        })
        .await
        .unwrap();
    assert_eq!(result.new_manifests.len(), 2);
    let new_manifest_1 =
        serde_json::from_str::<RegionManifest>(&result.new_manifests[&new_region_id_1]).unwrap();
    let new_manifest_2 =
        serde_json::from_str::<RegionManifest>(&result.new_manifests[&new_region_id_2]).unwrap();
    assert_eq!(new_manifest_1.files.len(), 3);
    assert_eq!(new_manifest_2.files.len(), 3);

    let request = CreateRequestBuilder::new().build();
    engine
        .handle_request(new_region_id_1, RegionRequest::Create(request))
        .await
        .unwrap();
    engine
        .handle_request(
            new_region_id_1,
            RegionRequest::EnterStaging(EnterStagingRequest {
                partition_expr: range_expr("tag_0", 0, 50).as_json_str().unwrap(),
            }),
        )
        .await
        .unwrap();
    let mut files_to_add = new_manifest_1.files.values().cloned().collect::<Vec<_>>();

    // Before apply staging manifest, the files should be empty
    let region = engine.get_region(new_region_id_1).unwrap();
    let manifest = region.manifest_ctx.manifest().await;
    assert_eq!(manifest.files.len(), 0);
    let staging_manifest = region.manifest_ctx.staging_manifest().await.unwrap();
    assert_eq!(staging_manifest.files.len(), 0);

    engine
        .handle_request(
            new_region_id_1,
            RegionRequest::ApplyStagingManifest(ApplyStagingManifestRequest {
                partition_expr: range_expr("tag_0", 0, 50).as_json_str().unwrap(),
                files_to_add: serde_json::to_vec(&files_to_add).unwrap(),
            }),
        )
        .await
        .unwrap();

    // After apply staging manifest, the files should be the same as the new manifest
    let region = engine.get_region(new_region_id_1).unwrap();
    let manifest = region.manifest_ctx.manifest().await;
    assert_eq!(manifest.files.len(), 3);
    assert!(region.is_writable());
    assert!(!region.is_staging());
    // The manifest partition expr should be the same as the request.
    assert_eq!(
        manifest.metadata.partition_expr.as_ref().unwrap(),
        &range_expr("tag_0", 0, 50).as_json_str().unwrap()
    );
    // The staging manifest should be cleared.
    let staging_manifest = region.manifest_ctx.staging_manifest().await;
    assert!(staging_manifest.is_none());
    // The staging partition expr should be cleared.
    assert!(region.staging_partition_expr.lock().unwrap().is_none());

    // The staging manifest directory of the region that applied the manifest should be
    // empty. `new_region_id_1` is region (1, 2), so its directory is expected to be
    // `1_0000000002`; checking `1_0000000001` would inspect the source region instead.
    // NOTE(review): path assumes the `<table_id>_<region_number>` layout — confirm.
    // A missing directory or unreadable entries are deliberately treated as "empty".
    let data_home = env.data_home();
    let region_dir = format!("{}/data/test/1_0000000002", data_home.display());
    let staging_manifest_dir = format!("{}/staging/manifest", region_dir);
    let staging_files = fs::read_dir(&staging_manifest_dir)
        .map(|entries| entries.collect::<Result<Vec<_>, _>>().unwrap_or_default())
        .unwrap_or_default();
    assert_eq!(staging_files.len(), 0);

    // Try to apply a modified file list: append one extra file to the previous one.
    files_to_add.push(FileMeta {
        region_id,
        file_id: FileId::random(),
        ..Default::default()
    });
    // This request will be ignored.
    engine
        .handle_request(
            new_region_id_1,
            RegionRequest::ApplyStagingManifest(ApplyStagingManifestRequest {
                partition_expr: range_expr("tag_0", 0, 50).as_json_str().unwrap(),
                files_to_add: serde_json::to_vec(&files_to_add).unwrap(),
            }),
        )
        .await
        .unwrap();
    // The files number should not change.
    let region = engine.get_region(new_region_id_1).unwrap();
    let manifest = region.manifest_ctx.manifest().await;
    assert_eq!(manifest.files.len(), 3);
}
/// Runs the invalid-`files_to_add` scenario with the flat format disabled, then enabled.
#[tokio::test]
async fn test_apply_staging_manifest_invalid_files_to_add() {
    common_telemetry::init_default_ut_logging();

    for flat_format in [false, true] {
        test_apply_staging_manifest_invalid_files_to_add_with_format(flat_format).await;
    }
}
async fn test_apply_staging_manifest_invalid_files_to_add_with_format(flat_format: bool) {
let mut env = TestEnv::with_prefix("invalid-files-to-add").await;
let engine = env
.create_engine(MitoConfig {
default_experimental_flat_format: flat_format,
..Default::default()
})
.await;
let region_id = RegionId::new(1, 1);
let request = CreateRequestBuilder::new().build();
engine
.handle_request(region_id, RegionRequest::Create(request))
.await
.unwrap();
engine
.handle_request(
region_id,
RegionRequest::EnterStaging(EnterStagingRequest {
partition_expr: range_expr("tag_0", 0, 50).as_json_str().unwrap(),
}),
)
.await
.unwrap();
let err = engine
.handle_request(
region_id,
RegionRequest::ApplyStagingManifest(ApplyStagingManifestRequest {
partition_expr: range_expr("tag_0", 0, 50).as_json_str().unwrap(),
files_to_add: b"invalid".to_vec(),
}),
)
.await
.unwrap_err();
assert_matches!(
err.into_inner().as_any().downcast_ref::<Error>().unwrap(),
Error::SerdeJson { .. }
);
}
/// Runs the empty-file-list scenario with the flat format disabled, then enabled.
#[tokio::test]
async fn test_apply_staging_manifest_empty_files() {
    common_telemetry::init_default_ut_logging();

    for flat_format in [false, true] {
        test_apply_staging_manifest_empty_files_with_format(flat_format).await;
    }
}
/// Applying a staging manifest with an empty file list succeeds, adds no files and
/// clears the staging manifest and the staging partition expression.
async fn test_apply_staging_manifest_empty_files_with_format(flat_format: bool) {
    let mut env = TestEnv::with_prefix("empty-files").await;
    let config = MitoConfig {
        default_experimental_flat_format: flat_format,
        ..Default::default()
    };
    let engine = env.create_engine(config).await;

    let region_id = RegionId::new(1, 1);
    let create_request = CreateRequestBuilder::new().build();
    engine
        .handle_request(region_id, RegionRequest::Create(create_request))
        .await
        .unwrap();

    let enter_staging = RegionRequest::EnterStaging(EnterStagingRequest {
        partition_expr: range_expr("tag_0", 0, 50).as_json_str().unwrap(),
    });
    engine
        .handle_request(region_id, enter_staging)
        .await
        .unwrap();

    // An empty JSON array is a valid (if trivial) file list.
    let no_files = Vec::<FileMeta>::new();
    let apply_staging = RegionRequest::ApplyStagingManifest(ApplyStagingManifestRequest {
        partition_expr: range_expr("tag_0", 0, 50).as_json_str().unwrap(),
        files_to_add: serde_json::to_vec(&no_files).unwrap(),
    });
    engine
        .handle_request(region_id, apply_staging)
        .await
        .unwrap();

    let region = engine.get_region(region_id).unwrap();
    let manifest = region.manifest_ctx.manifest().await;
    assert_eq!(manifest.files.len(), 0);
    // Both pieces of staging state must be gone after the request.
    assert!(region.manifest_ctx.staging_manifest().await.is_none());
    assert!(region.staging_partition_expr.lock().unwrap().is_none());
}

View File

@@ -20,7 +20,7 @@ use api::v1::Rows;
use common_error::ext::ErrorExt;
use common_error::status_code::StatusCode;
use object_store::layers::mock::{Error as MockError, ErrorKind, MockLayerBuilder};
use store_api::region_engine::{CopyRegionFromRequest, RegionEngine, RegionRole};
use store_api::region_engine::{MitoCopyRegionFromRequest, RegionEngine, RegionRole};
use store_api::region_request::{RegionFlushRequest, RegionRequest};
use store_api::storage::RegionId;
@@ -89,7 +89,7 @@ async fn test_engine_copy_region_from_with_format(flat_format: bool, with_index:
let resp = engine
.copy_region_from(
target_region_id,
CopyRegionFromRequest {
MitoCopyRegionFromRequest {
source_region_id,
parallelism: 1,
},
@@ -126,7 +126,7 @@ async fn test_engine_copy_region_from_with_format(flat_format: bool, with_index:
let resp2 = engine
.copy_region_from(
target_region_id,
CopyRegionFromRequest {
MitoCopyRegionFromRequest {
source_region_id,
parallelism: 1,
},
@@ -207,7 +207,7 @@ async fn test_engine_copy_region_failure_with_format(flat_format: bool) {
let err = engine
.copy_region_from(
target_region_id,
CopyRegionFromRequest {
MitoCopyRegionFromRequest {
source_region_id,
parallelism: 1,
},
@@ -225,7 +225,6 @@ async fn test_engine_copy_region_failure_with_format(flat_format: bool) {
let source_region_dir = format!("{}/data/test/1_0000000001", env.data_home().display());
assert_file_num_in_dir(&source_region_dir, 1);
assert_file_num_in_dir(&format!("{}/index", source_region_dir), 1);
assert_eq!(
source_region_files,
collect_filename_in_dir(&source_region_dir)
@@ -298,7 +297,7 @@ async fn test_engine_copy_region_invalid_args_with_format(flat_format: bool) {
let err = engine
.copy_region_from(
region_id,
CopyRegionFromRequest {
MitoCopyRegionFromRequest {
source_region_id: RegionId::new(2, 1),
parallelism: 1,
},
@@ -309,7 +308,7 @@ async fn test_engine_copy_region_invalid_args_with_format(flat_format: bool) {
let err = engine
.copy_region_from(
region_id,
CopyRegionFromRequest {
MitoCopyRegionFromRequest {
source_region_id: RegionId::new(1, 1),
parallelism: 1,
},
@@ -347,7 +346,7 @@ async fn test_engine_copy_region_unexpected_state_with_format(flat_format: bool)
let err = engine
.copy_region_from(
region_id,
CopyRegionFromRequest {
MitoCopyRegionFromRequest {
source_region_id: RegionId::new(1, 2),
parallelism: 1,
},

View File

@@ -153,7 +153,7 @@ async fn test_sync_after_flush_region_with_format(flat_format: bool) {
// Returns error since the max manifest is 1
let manifest_info = RegionManifestInfo::mito(2, 0, 0);
let err = follower_engine
.sync_region(region_id, manifest_info)
.sync_region(region_id, manifest_info.into())
.await
.unwrap_err();
let err = err.as_any().downcast_ref::<Error>().unwrap();
@@ -161,7 +161,7 @@ async fn test_sync_after_flush_region_with_format(flat_format: bool) {
let manifest_info = RegionManifestInfo::mito(1, 0, 0);
follower_engine
.sync_region(region_id, manifest_info)
.sync_region(region_id, manifest_info.into())
.await
.unwrap();
common_telemetry::info!("Scan the region on the follower engine after sync");
@@ -266,7 +266,7 @@ async fn test_sync_after_alter_region_with_format(flat_format: bool) {
// Sync the region from the leader engine to the follower engine
let manifest_info = RegionManifestInfo::mito(2, 0, 0);
follower_engine
.sync_region(region_id, manifest_info)
.sync_region(region_id, manifest_info.into())
.await
.unwrap();
let expected = "\

View File

@@ -26,7 +26,7 @@ use either::Either;
use partition::expr::PartitionExpr;
use smallvec::{SmallVec, smallvec};
use snafu::ResultExt;
use store_api::storage::RegionId;
use store_api::storage::{RegionId, SequenceNumber};
use strum::IntoStaticStr;
use tokio::sync::{Semaphore, mpsc, watch};
@@ -464,24 +464,26 @@ impl RegionFlushTask {
// Sets `for_flush` flag to true.
let mem_ranges = mem.ranges(None, RangesOptions::for_flush())?;
let num_mem_ranges = mem_ranges.ranges.len();
let num_mem_rows = mem_ranges.stats.num_rows();
// Aggregate stats from all ranges
let num_mem_rows = mem_ranges.num_rows();
let memtable_series_count = mem_ranges.series_count();
let memtable_id = mem.id();
// Increases series count for each mem range. We consider each mem range has different series so
// the counter may have more series than the actual series count.
series_count += mem_ranges.stats.series_count();
series_count += memtable_series_count;
if mem_ranges.is_record_batch() {
let flush_start = Instant::now();
let FlushFlatMemResult {
num_encoded,
max_sequence,
num_sources,
results,
} = self
.flush_flat_mem_ranges(version, &write_opts, mem_ranges)
.await?;
for (source_idx, result) in results.into_iter().enumerate() {
let (ssts_written, metrics) = result?;
let (max_sequence, ssts_written, metrics) = result?;
if ssts_written.is_empty() {
// No data written.
continue;
@@ -521,7 +523,7 @@ impl RegionFlushTask {
compact_cost,
);
} else {
let max_sequence = mem_ranges.stats.max_sequence();
let max_sequence = mem_ranges.max_sequence();
let source = memtable_source(mem_ranges, &version.options).await?;
// Flush to level 0.
@@ -583,8 +585,7 @@ impl RegionFlushTask {
)?;
let mut tasks = Vec::with_capacity(flat_sources.encoded.len() + flat_sources.sources.len());
let num_encoded = flat_sources.encoded.len();
let max_sequence = flat_sources.max_sequence;
for source in flat_sources.sources {
for (source, max_sequence) in flat_sources.sources {
let source = Either::Right(source);
let write_request = self.new_write_request(version, max_sequence, source);
let access_layer = self.access_layer.clone();
@@ -596,11 +597,11 @@ impl RegionFlushTask {
let ssts = access_layer
.write_sst(write_request, &write_opts, &mut metrics)
.await?;
Ok((ssts, metrics))
Ok((max_sequence, ssts, metrics))
});
tasks.push(task);
}
for encoded in flat_sources.encoded {
for (encoded, max_sequence) in flat_sources.encoded {
let access_layer = self.access_layer.clone();
let cache_manager = self.cache_manager.clone();
let region_id = version.metadata.region_id;
@@ -610,7 +611,7 @@ impl RegionFlushTask {
let metrics = access_layer
.put_sst(&encoded.data, region_id, &encoded.sst_info, &cache_manager)
.await?;
Ok((smallvec![encoded.sst_info], metrics))
Ok((max_sequence, smallvec![encoded.sst_info], metrics))
});
tasks.push(task);
}
@@ -620,7 +621,6 @@ impl RegionFlushTask {
.context(JoinSnafu)?;
Ok(FlushFlatMemResult {
num_encoded,
max_sequence,
num_sources,
results,
})
@@ -696,9 +696,8 @@ impl RegionFlushTask {
struct FlushFlatMemResult {
num_encoded: usize,
max_sequence: u64,
num_sources: usize,
results: Vec<Result<(SstInfoArray, Metrics)>>,
results: Vec<Result<(SequenceNumber, SstInfoArray, Metrics)>>,
}
struct DoFlushMemtablesResult {
@@ -744,9 +743,8 @@ async fn memtable_source(mem_ranges: MemtableRanges, options: &RegionOptions) ->
}
struct FlatSources {
max_sequence: u64,
sources: SmallVec<[FlatSource; 4]>,
encoded: SmallVec<[EncodedRange; 4]>,
sources: SmallVec<[(FlatSource, SequenceNumber); 4]>,
encoded: SmallVec<[(EncodedRange, SequenceNumber); 4]>,
}
/// Returns the [FlatSource]s and encoded ranges for the given memtable, each paired with its max sequence.
@@ -756,18 +754,17 @@ fn memtable_flat_sources(
options: &RegionOptions,
field_column_start: usize,
) -> Result<FlatSources> {
let MemtableRanges { ranges, stats } = mem_ranges;
let max_sequence = stats.max_sequence();
let MemtableRanges { ranges } = mem_ranges;
let mut flat_sources = FlatSources {
max_sequence,
sources: SmallVec::new(),
encoded: SmallVec::new(),
};
if ranges.len() == 1 {
let only_range = ranges.into_values().next().unwrap();
let max_sequence = only_range.stats().max_sequence();
if let Some(encoded) = only_range.encoded() {
flat_sources.encoded.push(encoded);
flat_sources.encoded.push((encoded, max_sequence));
} else {
let iter = only_range.build_record_batch_iter(None)?;
// Dedup according to append mode and merge mode.
@@ -778,25 +775,39 @@ fn memtable_flat_sources(
field_column_start,
iter,
);
flat_sources.sources.push(FlatSource::Iter(iter));
flat_sources
.sources
.push((FlatSource::Iter(iter), max_sequence));
};
} else {
let min_flush_rows = stats.num_rows / 8;
// Calculate total rows from all ranges for min_flush_rows calculation
let total_rows: usize = ranges.values().map(|r| r.stats().num_rows()).sum();
let min_flush_rows = total_rows / 8;
let min_flush_rows = min_flush_rows.max(DEFAULT_ROW_GROUP_SIZE);
let mut last_iter_rows = 0;
let num_ranges = ranges.len();
let mut input_iters = Vec::with_capacity(num_ranges);
let mut current_ranges = Vec::new();
for (_range_id, range) in ranges {
if let Some(encoded) = range.encoded() {
flat_sources.encoded.push(encoded);
let max_sequence = range.stats().max_sequence();
flat_sources.encoded.push((encoded, max_sequence));
continue;
}
let iter = range.build_record_batch_iter(None)?;
input_iters.push(iter);
last_iter_rows += range.num_rows();
current_ranges.push(range);
if last_iter_rows > min_flush_rows {
// Calculate max_sequence from all merged ranges
let max_sequence = current_ranges
.iter()
.map(|r| r.stats().max_sequence())
.max()
.unwrap_or(0);
let maybe_dedup = merge_and_dedup(
&schema,
options.append_mode,
@@ -805,13 +816,22 @@ fn memtable_flat_sources(
std::mem::replace(&mut input_iters, Vec::with_capacity(num_ranges)),
)?;
flat_sources.sources.push(FlatSource::Iter(maybe_dedup));
flat_sources
.sources
.push((FlatSource::Iter(maybe_dedup), max_sequence));
last_iter_rows = 0;
current_ranges.clear();
}
}
// Handle remaining iters.
if !input_iters.is_empty() {
let max_sequence = current_ranges
.iter()
.map(|r| r.stats().max_sequence())
.max()
.unwrap_or(0);
let maybe_dedup = merge_and_dedup(
&schema,
options.append_mode,
@@ -820,7 +840,9 @@ fn memtable_flat_sources(
input_iters,
)?;
flat_sources.sources.push(FlatSource::Iter(maybe_dedup));
flat_sources
.sources
.push((FlatSource::Iter(maybe_dedup), max_sequence));
}
}
@@ -1491,7 +1513,7 @@ mod tests {
// Consume the iterator and count rows
let mut total_rows = 0usize;
for source in flat_sources.sources {
for (source, _sequence) in flat_sources.sources {
match source {
crate::read::FlatSource::Iter(iter) => {
for rb in iter {
@@ -1521,7 +1543,7 @@ mod tests {
assert_eq!(1, flat_sources.sources.len());
let mut total_rows = 0usize;
for source in flat_sources.sources {
for (source, _sequence) in flat_sources.sources {
match source {
crate::read::FlatSource::Iter(iter) => {
for rb in iter {

View File

@@ -45,6 +45,18 @@ pub enum RegionMetaAction {
Truncate(RegionTruncate),
}
impl RegionMetaAction {
/// Returns true if the action is a change action.
pub fn is_change(&self) -> bool {
matches!(self, RegionMetaAction::Change(_))
}
/// Returns true if the action is an edit action.
pub fn is_edit(&self) -> bool {
matches!(self, RegionMetaAction::Edit(_))
}
}
#[derive(Serialize, Deserialize, Clone, Debug, PartialEq, Eq)]
pub struct RegionChange {
/// The metadata after changed.
@@ -340,16 +352,51 @@ pub struct RemovedFiles {
/// the files are removed from manifest. The timestamp is in milliseconds since unix epoch.
pub removed_at: i64,
/// The set of file ids that are removed.
#[serde(default)]
pub files: HashSet<RemovedFile>,
}
/// A removed file, which can be a data file (optionally paired with an index file) or an outdated index file.
#[derive(Serialize, Deserialize, Hash, Clone, Debug, PartialEq, Eq)]
#[derive(Serialize, Hash, Clone, Debug, PartialEq, Eq)]
pub enum RemovedFile {
File(FileId, Option<IndexVersion>),
Index(FileId, IndexVersion),
}
/// Backward-compatible deserialization for [`RemovedFile`].
///
/// Two encodings are accepted:
/// - the current enum encoding (`File(..)` / `Index(..)`), and
/// - the old encoding, where a removed file was just a `FileId` string; it maps
///   to `RemovedFile::File(file_id, None)`.
///
/// This is kept in case old manifests with recorded removed files still exist.
impl<'de> Deserialize<'de> for RemovedFile {
    fn deserialize<D>(deserializer: D) -> std::result::Result<Self, D::Error>
    where
        D: serde::Deserializer<'de>,
    {
        // Mirror of `RemovedFile` that relies on the derived enum representation.
        #[derive(Deserialize)]
        enum RemovedFileEnum {
            File(FileId, Option<IndexVersion>),
            Index(FileId, IndexVersion),
        }

        // Untagged wrapper: the current encoding is tried first, then the bare id.
        #[derive(Deserialize)]
        #[serde(untagged)]
        enum CompatRemovedFile {
            Tagged(RemovedFileEnum),
            Legacy(FileId),
        }

        let removed = match CompatRemovedFile::deserialize(deserializer)? {
            CompatRemovedFile::Legacy(file_id) => RemovedFile::File(file_id, None),
            CompatRemovedFile::Tagged(RemovedFileEnum::File(file_id, version)) => {
                RemovedFile::File(file_id, version)
            }
            CompatRemovedFile::Tagged(RemovedFileEnum::Index(file_id, version)) => {
                RemovedFile::Index(file_id, version)
            }
        };
        Ok(removed)
    }
}
impl RemovedFile {
pub fn file_id(&self) -> FileId {
match self {
@@ -438,7 +485,8 @@ impl RegionMetaActionList {
Self { actions }
}
pub fn into_region_edit(self) -> RegionEdit {
/// Split the actions into a region change and an edit.
pub fn split_region_change_and_edit(self) -> (Option<RegionChange>, RegionEdit) {
let mut edit = RegionEdit {
files_to_add: Vec::new(),
files_to_remove: Vec::new(),
@@ -448,31 +496,39 @@ impl RegionMetaActionList {
flushed_sequence: None,
committed_sequence: None,
};
let mut region_change = None;
for action in self.actions {
if let RegionMetaAction::Edit(region_edit) = action {
// Merge file adds/removes
edit.files_to_add.extend(region_edit.files_to_add);
edit.files_to_remove.extend(region_edit.files_to_remove);
// Max of flushed entry id / sequence
if let Some(eid) = region_edit.flushed_entry_id {
edit.flushed_entry_id = Some(edit.flushed_entry_id.map_or(eid, |v| v.max(eid)));
match action {
RegionMetaAction::Change(change) => {
region_change = Some(change);
}
if let Some(seq) = region_edit.flushed_sequence {
edit.flushed_sequence = Some(edit.flushed_sequence.map_or(seq, |v| v.max(seq)));
}
if let Some(seq) = region_edit.committed_sequence {
edit.committed_sequence =
Some(edit.committed_sequence.map_or(seq, |v| v.max(seq)));
}
// Prefer the latest non-none time window
if region_edit.compaction_time_window.is_some() {
edit.compaction_time_window = region_edit.compaction_time_window;
RegionMetaAction::Edit(region_edit) => {
// Merge file adds/removes
edit.files_to_add.extend(region_edit.files_to_add);
edit.files_to_remove.extend(region_edit.files_to_remove);
// Max of flushed entry id / sequence
if let Some(eid) = region_edit.flushed_entry_id {
edit.flushed_entry_id =
Some(edit.flushed_entry_id.map_or(eid, |v| v.max(eid)));
}
if let Some(seq) = region_edit.flushed_sequence {
edit.flushed_sequence =
Some(edit.flushed_sequence.map_or(seq, |v| v.max(seq)));
}
if let Some(seq) = region_edit.committed_sequence {
edit.committed_sequence =
Some(edit.committed_sequence.map_or(seq, |v| v.max(seq)));
}
// Prefer the latest non-none time window
if region_edit.compaction_time_window.is_some() {
edit.compaction_time_window = region_edit.compaction_time_window;
}
}
_ => {}
}
}
edit
(region_change, edit)
}
}
@@ -1008,4 +1064,115 @@ mod tests {
let deserialized: RegionChange = serde_json::from_str(&serialized).unwrap();
assert_eq!(deserialized.sst_format, FormatType::Flat);
}
#[test]
fn test_removed_file_compatibility() {
    // Serializes `value` to JSON and parses it straight back.
    let round_trip = |value: &RemovedFile| -> RemovedFile {
        let encoded = serde_json::to_string(value).unwrap();
        serde_json::from_str(&encoded).unwrap()
    };

    let file_id = FileId::random();

    // Legacy format: a bare file id string becomes a data file with no index version.
    let legacy_json = format!("\"{}\"", file_id);
    let from_legacy: RemovedFile = serde_json::from_str(&legacy_json).unwrap();
    assert_eq!(from_legacy, RemovedFile::File(file_id, None));

    // Current format: every variant shape survives a serialize/deserialize round trip,
    // including `File` with and without an index version.
    for current in [
        RemovedFile::File(file_id, Some(10)),
        RemovedFile::Index(file_id, 20),
        RemovedFile::File(file_id, None),
    ] {
        assert_eq!(current, round_trip(&current));
    }

    // A set written as `HashSet<FileId>` is a JSON array of id strings; it must
    // still load as `HashSet<RemovedFile>`.
    let legacy_set_json = format!("[\"{}\"]", file_id);
    let parsed_set: HashSet<RemovedFile> = serde_json::from_str(&legacy_set_json).unwrap();
    assert!(parsed_set.contains(&RemovedFile::File(file_id, None)));
}
/// Older manifests recorded removed files in a `file_ids` field, while
/// [`RemovedFiles`] now stores them in `files`. Deserializing an old entry
/// deliberately *drops* `file_ids` instead of recovering or merging it.
///
/// Why dropping it is acceptable:
/// - A file is only physically removed when it is both (a) no longer referenced
///   by any region metadata and (b) selected by the GC worker as safe to delete,
///   so losing the historical candidate list does **not** risk deleting live data.
/// - The GC worker periodically scans storage (e.g. by walking the data
///   directories and/or consulting the latest manifest) for files that are no
///   longer referenced, so files that were only listed in the dropped `file_ids`
///   are rediscovered and eventually deleted.
///
/// The only potential impact is temporary disk space overhead: some obsolete
/// files may stay on disk longer than strictly necessary.
#[test]
fn test_removed_files_backward_compatibility() {
    // Shape of the record as older versions wrote it.
    #[derive(Serialize, Deserialize, Clone, Debug, PartialEq, Eq)]
    struct OldRemovedFiles {
        pub removed_at: i64,
        pub file_ids: HashSet<FileId>,
    }

    let legacy = OldRemovedFiles {
        removed_at: 1234567890,
        file_ids: HashSet::from([FileId::random(), FileId::random()]),
    };
    let legacy_json = serde_json::to_string(&legacy).unwrap();

    // Parsing the old record must succeed; `file_ids` is ignored, so `files` ends up empty.
    let parsed: Result<RemovedFiles, _> = serde_json::from_str(&legacy_json);
    assert!(parsed.is_ok(), "{:?}", parsed);
    let from_legacy = parsed.unwrap();
    assert_eq!(from_legacy.removed_at, 1234567890);
    assert!(from_legacy.files.is_empty());

    // A record that carries the `files` field is still read; its single element
    // here is written as a bare file id string.
    let file_id = FileId::random();
    let files_json = format!(
        r#"{{
"removed_at": 1234567890,
"files": ["{}"]
}}"#,
        file_id
    );
    let parsed: Result<RemovedFiles, _> = serde_json::from_str(&files_json);
    assert!(parsed.is_ok());
    let from_files = parsed.unwrap();
    assert_eq!(from_files.removed_at, 1234567890);
    assert_eq!(from_files.files.len(), 1);
    assert!(
        from_files
            .files
            .contains(&RemovedFile::File(file_id, None))
    );
}
}

View File

@@ -12,43 +12,46 @@
// See the License for the specific language governing permissions and
// limitations under the License.
use std::collections::HashMap;
pub(crate) mod checkpoint;
pub(crate) mod delta;
pub(crate) mod size_tracker;
pub(crate) mod staging;
pub(crate) mod utils;
use std::iter::Iterator;
use std::str::FromStr;
use std::sync::atomic::{AtomicU64, Ordering};
use std::sync::{Arc, RwLock};
use std::sync::Arc;
use std::sync::atomic::AtomicU64;
use common_datasource::compression::CompressionType;
use common_telemetry::debug;
use crc32fast::Hasher;
use futures::TryStreamExt;
use futures::future::try_join_all;
use lazy_static::lazy_static;
use object_store::util::join_dir;
use object_store::{Entry, ErrorKind, Lister, ObjectStore, util};
use object_store::{Lister, ObjectStore, util};
use regex::Regex;
use serde::{Deserialize, Serialize};
use snafu::{ResultExt, ensure};
use store_api::ManifestVersion;
use store_api::storage::RegionId;
use tokio::sync::Semaphore;
use crate::cache::manifest_cache::ManifestCache;
use crate::error::{
ChecksumMismatchSnafu, CompressObjectSnafu, DecompressObjectSnafu, InvalidScanIndexSnafu,
OpenDalSnafu, Result, SerdeJsonSnafu, Utf8Snafu,
};
use crate::error::{ChecksumMismatchSnafu, OpenDalSnafu, Result};
use crate::manifest::storage::checkpoint::CheckpointStorage;
use crate::manifest::storage::delta::DeltaStorage;
use crate::manifest::storage::size_tracker::{CheckpointTracker, DeltaTracker, SizeTracker};
use crate::manifest::storage::staging::StagingStorage;
use crate::manifest::storage::utils::remove_from_cache;
lazy_static! {
static ref DELTA_RE: Regex = Regex::new("^\\d+\\.json").unwrap();
static ref CHECKPOINT_RE: Regex = Regex::new("^\\d+\\.checkpoint").unwrap();
}
const LAST_CHECKPOINT_FILE: &str = "_last_checkpoint";
pub const LAST_CHECKPOINT_FILE: &str = "_last_checkpoint";
const DEFAULT_MANIFEST_COMPRESSION_TYPE: CompressionType = CompressionType::Gzip;
/// Due to backward compatibility, it is possible that the user's manifest file has not been compressed.
/// So when we encounter problems, we need to fall back to `FALL_BACK_COMPRESS_TYPE` for processing.
const FALL_BACK_COMPRESS_TYPE: CompressionType = CompressionType::Uncompressed;
pub(crate) const FALL_BACK_COMPRESS_TYPE: CompressionType = CompressionType::Uncompressed;
const FETCH_MANIFEST_PARALLELISM: usize = 16;
/// Returns the directory to the manifest files.
@@ -81,13 +84,13 @@ pub fn gen_path(path: &str, file: &str, compress_type: CompressionType) -> Strin
}
}
fn checkpoint_checksum(data: &[u8]) -> u32 {
pub(crate) fn checkpoint_checksum(data: &[u8]) -> u32 {
let mut hasher = Hasher::new();
hasher.update(data);
hasher.finalize()
}
fn verify_checksum(data: &[u8], wanted: Option<u32>) -> Result<()> {
pub(crate) fn verify_checksum(data: &[u8], wanted: Option<u32>) -> Result<()> {
if let Some(checksum) = wanted {
let calculated_checksum = checkpoint_checksum(data);
ensure!(
@@ -127,26 +130,20 @@ pub fn is_checkpoint_file(file_name: &str) -> bool {
CHECKPOINT_RE.is_match(file_name)
}
/// Key to identify a manifest file.
#[derive(Debug, Clone, Copy, Eq, PartialEq, Hash)]
enum FileKey {
/// A delta file (`.json`).
Delta(ManifestVersion),
/// A checkpoint file (`.checkpoint`).
Checkpoint(ManifestVersion),
}
#[derive(Clone, Debug)]
pub struct ManifestObjectStore {
object_store: ObjectStore,
compress_type: CompressionType,
path: String,
staging_path: String,
/// Stores the size of each manifest file.
manifest_size_map: Arc<RwLock<HashMap<FileKey, u64>>>,
total_manifest_size: Arc<AtomicU64>,
/// Optional manifest cache for local caching.
manifest_cache: Option<ManifestCache>,
// Tracks the size of each file in the manifest directory.
size_tracker: SizeTracker,
// The checkpoint file storage.
checkpoint_storage: CheckpointStorage<CheckpointTracker>,
// The delta file storage.
delta_storage: DeltaStorage<DeltaTracker>,
/// The staging file storage.
staging_storage: StagingStorage,
}
impl ManifestObjectStore {
@@ -160,43 +157,37 @@ impl ManifestObjectStore {
common_telemetry::info!("Create manifest store, cache: {}", manifest_cache.is_some());
let path = util::normalize_dir(path);
let staging_path = {
// Convert "region_dir/manifest/" to "region_dir/staging/manifest/"
let parent_dir = path.trim_end_matches("manifest/").trim_end_matches('/');
util::normalize_dir(&format!("{}/staging/manifest", parent_dir))
};
let size_tracker = SizeTracker::new(total_manifest_size);
let checkpoint_tracker = Arc::new(size_tracker.checkpoint_tracker());
let delta_tracker = Arc::new(size_tracker.manifest_tracker());
let checkpoint_storage = CheckpointStorage::new(
path.clone(),
object_store.clone(),
compress_type,
manifest_cache.clone(),
checkpoint_tracker,
);
let delta_storage = DeltaStorage::new(
path.clone(),
object_store.clone(),
compress_type,
manifest_cache.clone(),
delta_tracker,
);
let staging_storage =
StagingStorage::new(path.clone(), object_store.clone(), compress_type);
Self {
object_store,
compress_type,
path,
staging_path,
manifest_size_map: Arc::new(RwLock::new(HashMap::new())),
total_manifest_size,
manifest_cache,
size_tracker,
checkpoint_storage,
delta_storage,
staging_storage,
}
}
/// Returns the delta file path under the **current** compression algorithm
fn delta_file_path(&self, version: ManifestVersion, is_staging: bool) -> String {
let base_path = if is_staging {
&self.staging_path
} else {
&self.path
};
gen_path(base_path, &delta_file(version), self.compress_type)
}
/// Returns the checkpoint file path under the **current** compression algorithm
fn checkpoint_file_path(&self, version: ManifestVersion) -> String {
gen_path(&self.path, &checkpoint_file(version), self.compress_type)
}
/// Returns the last checkpoint path, because the last checkpoint is not compressed,
/// so its path name has nothing to do with the compression algorithm used by `ManifestObjectStore`
pub(crate) fn last_checkpoint_path(&self) -> String {
format!("{}{}", self.path, LAST_CHECKPOINT_FILE)
}
/// Returns the manifest dir
pub(crate) fn manifest_dir(&self) -> &str {
&self.path
@@ -204,75 +195,14 @@ impl ManifestObjectStore {
/// Returns an iterator of manifests from normal or staging directory.
pub(crate) async fn manifest_lister(&self, is_staging: bool) -> Result<Option<Lister>> {
let path = if is_staging {
&self.staging_path
if is_staging {
self.staging_storage.manifest_lister().await
} else {
&self.path
};
match self.object_store.lister_with(path).await {
Ok(streamer) => Ok(Some(streamer)),
Err(e) if e.kind() == ErrorKind::NotFound => {
debug!("Manifest directory does not exist: {}", path);
Ok(None)
}
Err(e) => Err(e).context(OpenDalSnafu)?,
self.delta_storage.manifest_lister().await
}
}
/// Return all `R`s in the directory that meet the `filter` conditions (that is, the `filter` closure returns `Some(R)`),
/// and discard `R` that does not meet the conditions (that is, the `filter` closure returns `None`)
/// Return an empty vector when directory is not found.
pub async fn get_paths<F, R>(&self, filter: F, is_staging: bool) -> Result<Vec<R>>
where
F: Fn(Entry) -> Option<R>,
{
let Some(streamer) = self.manifest_lister(is_staging).await? else {
return Ok(vec![]);
};
streamer
.try_filter_map(|e| async { Ok(filter(e)) })
.try_collect::<Vec<_>>()
.await
.context(OpenDalSnafu)
}
/// Sorts the manifest files.
fn sort_manifests(entries: &mut [(ManifestVersion, Entry)]) {
entries.sort_unstable_by(|(v1, _), (v2, _)| v1.cmp(v2));
}
/// Scans the manifest files in the range of [start, end) and return all manifest entries.
pub async fn scan(
&self,
start: ManifestVersion,
end: ManifestVersion,
) -> Result<Vec<(ManifestVersion, Entry)>> {
ensure!(start <= end, InvalidScanIndexSnafu { start, end });
let mut entries: Vec<(ManifestVersion, Entry)> = self
.get_paths(
|entry| {
let file_name = entry.name();
if is_delta_file(file_name) {
let version = file_version(file_name);
if start <= version && version < end {
return Some((version, entry));
}
}
None
},
false,
)
.await?;
Self::sort_manifests(&mut entries);
Ok(entries)
}
/// Fetches manifests in range [start_version, end_version).
///
/// This function is guaranteed to return manifests from the `start_version` strictly (must contain `start_version`).
pub async fn fetch_manifests_strict_from(
&self,
@@ -280,70 +210,9 @@ impl ManifestObjectStore {
end_version: ManifestVersion,
region_id: RegionId,
) -> Result<Vec<(ManifestVersion, Vec<u8>)>> {
let mut manifests = self.fetch_manifests(start_version, end_version).await?;
let start_index = manifests.iter().position(|(v, _)| *v == start_version);
debug!(
"Fetches manifests in range [{},{}), start_index: {:?}, region_id: {}, manifests: {:?}",
start_version,
end_version,
start_index,
region_id,
manifests.iter().map(|(v, _)| *v).collect::<Vec<_>>()
);
if let Some(start_index) = start_index {
Ok(manifests.split_off(start_index))
} else {
Ok(vec![])
}
}
/// Common implementation for fetching manifests from entries in parallel.
/// If `is_staging` is true, cache is skipped.
async fn fetch_manifests_from_entries(
&self,
entries: Vec<(ManifestVersion, Entry)>,
is_staging: bool,
) -> Result<Vec<(ManifestVersion, Vec<u8>)>> {
if entries.is_empty() {
return Ok(vec![]);
}
// TODO(weny): Make it configurable.
let semaphore = Semaphore::new(FETCH_MANIFEST_PARALLELISM);
let tasks = entries.iter().map(|(v, entry)| async {
// Safety: semaphore must exist.
let _permit = semaphore.acquire().await.unwrap();
let cache_key = entry.path();
// Try to get from cache first
if let Some(data) = self.get_from_cache(cache_key, is_staging).await {
return Ok((*v, data));
}
// Fetch from remote object store
let compress_type = file_compress_type(entry.name());
let bytes = self
.object_store
.read(entry.path())
.await
.context(OpenDalSnafu)?;
let data = compress_type
.decode(bytes)
.await
.context(DecompressObjectSnafu {
compress_type,
path: entry.path(),
})?;
// Add to cache
self.put_to_cache(cache_key.to_string(), &data, is_staging)
.await;
Ok((*v, data))
});
try_join_all(tasks).await
self.delta_storage
.fetch_manifests_strict_from(start_version, end_version, region_id)
.await
}
/// Fetch all manifests concurrently, and return the manifests in range [start_version, end_version)
@@ -355,8 +224,9 @@ impl ManifestObjectStore {
start_version: ManifestVersion,
end_version: ManifestVersion,
) -> Result<Vec<(ManifestVersion, Vec<u8>)>> {
let manifests = self.scan(start_version, end_version).await?;
self.fetch_manifests_from_entries(manifests, false).await
self.delta_storage
.fetch_manifests(start_version, end_version)
.await
}
/// Delete manifest files that version < end.
@@ -370,20 +240,18 @@ impl ManifestObjectStore {
) -> Result<usize> {
// Stores (entry, is_checkpoint, version) in a Vec.
let entries: Vec<_> = self
.get_paths(
|entry| {
let file_name = entry.name();
let is_checkpoint = is_checkpoint_file(file_name);
if is_delta_file(file_name) || is_checkpoint_file(file_name) {
let version = file_version(file_name);
if version < end {
return Some((entry, is_checkpoint, version));
}
.delta_storage
.get_paths(|entry| {
let file_name = entry.name();
let is_checkpoint = is_checkpoint_file(file_name);
if is_delta_file(file_name) || is_checkpoint_file(file_name) {
let version = file_version(file_name);
if version < end {
return Some((entry, is_checkpoint, version));
}
None
},
false,
)
}
None
})
.await?;
let checkpoint_version = if keep_last_checkpoint {
// Note that the order of entries is unspecific.
@@ -428,7 +296,7 @@ impl ManifestObjectStore {
// Remove from cache first
for (entry, _, _) in &del_entries {
self.remove_from_cache(entry.path()).await;
remove_from_cache(self.manifest_cache.as_ref(), entry.path()).await;
}
self.object_store
@@ -439,9 +307,11 @@ impl ManifestObjectStore {
// delete manifest sizes
for (_, is_checkpoint, version) in &del_entries {
if *is_checkpoint {
self.unset_file_size(&FileKey::Checkpoint(*version));
self.size_tracker
.remove(&size_tracker::FileKey::Checkpoint(*version));
} else {
self.unset_file_size(&FileKey::Delta(*version));
self.size_tracker
.remove(&size_tracker::FileKey::Delta(*version));
}
}
@@ -455,22 +325,11 @@ impl ManifestObjectStore {
bytes: &[u8],
is_staging: bool,
) -> Result<()> {
let path = self.delta_file_path(version, is_staging);
debug!("Save log to manifest storage, version: {}", version);
let data = self
.compress_type
.encode(bytes)
.await
.context(CompressObjectSnafu {
compress_type: self.compress_type,
path: &path,
})?;
let delta_size = data.len();
self.write_and_put_cache(&path, data, is_staging).await?;
self.set_delta_file_size(version, delta_size as u64);
Ok(())
if is_staging {
self.staging_storage.save(version, bytes).await
} else {
self.delta_storage.save(version, bytes).await
}
}
/// Save the checkpoint manifest file.
@@ -479,155 +338,50 @@ impl ManifestObjectStore {
version: ManifestVersion,
bytes: &[u8],
) -> Result<()> {
let path = self.checkpoint_file_path(version);
let data = self
.compress_type
.encode(bytes)
self.checkpoint_storage
.save_checkpoint(version, bytes)
.await
.context(CompressObjectSnafu {
compress_type: self.compress_type,
path: &path,
})?;
let checkpoint_size = data.len();
let checksum = checkpoint_checksum(bytes);
self.write_and_put_cache(&path, data, false).await?;
self.set_checkpoint_file_size(version, checkpoint_size as u64);
// Because last checkpoint file only contain size and version, which is tiny, so we don't compress it.
let last_checkpoint_path = self.last_checkpoint_path();
let checkpoint_metadata = CheckpointMetadata {
size: bytes.len(),
version,
checksum: Some(checksum),
extend_metadata: HashMap::new(),
};
debug!(
"Save checkpoint in path: {}, metadata: {:?}",
last_checkpoint_path, checkpoint_metadata
);
let bytes = checkpoint_metadata.encode()?;
self.object_store
.write(&last_checkpoint_path, bytes)
.await
.context(OpenDalSnafu)?;
Ok(())
}
async fn load_checkpoint(
&mut self,
metadata: CheckpointMetadata,
) -> Result<Option<(ManifestVersion, Vec<u8>)>> {
let version = metadata.version;
let path = self.checkpoint_file_path(version);
// Try to get from cache first
if let Some(data) = self.get_from_cache(&path, false).await {
verify_checksum(&data, metadata.checksum)?;
return Ok(Some((version, data)));
}
// Due to backward compatibility, it is possible that the user's checkpoint not compressed,
// so if we don't find file by compressed type. fall back to checkpoint not compressed find again.
let checkpoint_data = match self.object_store.read(&path).await {
Ok(checkpoint) => {
let checkpoint_size = checkpoint.len();
let decompress_data =
self.compress_type
.decode(checkpoint)
.await
.with_context(|_| DecompressObjectSnafu {
compress_type: self.compress_type,
path: path.clone(),
})?;
verify_checksum(&decompress_data, metadata.checksum)?;
// set the checkpoint size
self.set_checkpoint_file_size(version, checkpoint_size as u64);
// Add to cache
self.put_to_cache(path, &decompress_data, false).await;
Ok(Some(decompress_data))
}
Err(e) => {
if e.kind() == ErrorKind::NotFound {
if self.compress_type != FALL_BACK_COMPRESS_TYPE {
let fall_back_path = gen_path(
&self.path,
&checkpoint_file(version),
FALL_BACK_COMPRESS_TYPE,
);
debug!(
"Failed to load checkpoint from path: {}, fall back to path: {}",
path, fall_back_path
);
// Try to get fallback from cache first
if let Some(data) = self.get_from_cache(&fall_back_path, false).await {
verify_checksum(&data, metadata.checksum)?;
return Ok(Some((version, data)));
}
match self.object_store.read(&fall_back_path).await {
Ok(checkpoint) => {
let checkpoint_size = checkpoint.len();
let decompress_data = FALL_BACK_COMPRESS_TYPE
.decode(checkpoint)
.await
.with_context(|_| DecompressObjectSnafu {
compress_type: FALL_BACK_COMPRESS_TYPE,
path: fall_back_path.clone(),
})?;
verify_checksum(&decompress_data, metadata.checksum)?;
self.set_checkpoint_file_size(version, checkpoint_size as u64);
// Add fallback to cache
self.put_to_cache(fall_back_path, &decompress_data, false)
.await;
Ok(Some(decompress_data))
}
Err(e) if e.kind() == ErrorKind::NotFound => Ok(None),
Err(e) => Err(e).context(OpenDalSnafu),
}
} else {
Ok(None)
}
} else {
Err(e).context(OpenDalSnafu)
}
}
}?;
Ok(checkpoint_data.map(|data| (version, data)))
}
/// Load the latest checkpoint.
/// Return manifest version and the raw [RegionCheckpoint](crate::manifest::action::RegionCheckpoint) content if any
pub async fn load_last_checkpoint(&mut self) -> Result<Option<(ManifestVersion, Vec<u8>)>> {
let last_checkpoint_path = self.last_checkpoint_path();
// Fetch from remote object store without cache
let last_checkpoint_data = match self.object_store.read(&last_checkpoint_path).await {
Ok(data) => data.to_vec(),
Err(e) if e.kind() == ErrorKind::NotFound => {
return Ok(None);
}
Err(e) => {
return Err(e).context(OpenDalSnafu)?;
}
};
let checkpoint_metadata = CheckpointMetadata::decode(&last_checkpoint_data)?;
debug!(
"Load checkpoint in path: {}, metadata: {:?}",
last_checkpoint_path, checkpoint_metadata
);
self.load_checkpoint(checkpoint_metadata).await
self.checkpoint_storage.load_last_checkpoint().await
}
#[cfg(test)]
/// Compute the size(Byte) in manifest size map.
pub(crate) fn total_manifest_size(&self) -> u64 {
self.size_tracker.total()
}
/// Resets the size of all files.
pub(crate) fn reset_manifest_size(&mut self) {
self.size_tracker.reset();
}
/// Set the size of the delta file by delta version.
pub(crate) fn set_delta_file_size(&mut self, version: ManifestVersion, size: u64) {
self.size_tracker.record_delta(version, size);
}
/// Set the size of the checkpoint file by checkpoint version.
pub(crate) fn set_checkpoint_file_size(&self, version: ManifestVersion, size: u64) {
self.size_tracker.record_checkpoint(version, size);
}
/// Fetch all staging manifest files and return them as (version, action_list) pairs.
pub async fn fetch_staging_manifests(&self) -> Result<Vec<(ManifestVersion, Vec<u8>)>> {
self.staging_storage.fetch_manifests().await
}
/// Clear all staging manifest files.
pub async fn clear_staging_manifests(&mut self) -> Result<()> {
self.staging_storage.clear().await
}
}
#[cfg(test)]
impl ManifestObjectStore {
pub async fn read_file(&self, path: &str) -> Result<Vec<u8>> {
self.object_store
.read(path)
@@ -636,214 +390,18 @@ impl ManifestObjectStore {
.map(|v| v.to_vec())
}
#[cfg(test)]
pub async fn write_last_checkpoint(
&mut self,
version: ManifestVersion,
bytes: &[u8],
) -> Result<()> {
let path = self.checkpoint_file_path(version);
let data = self
.compress_type
.encode(bytes)
.await
.context(CompressObjectSnafu {
compress_type: self.compress_type,
path: &path,
})?;
let checkpoint_size = data.len();
self.object_store
.write(&path, data)
.await
.context(OpenDalSnafu)?;
self.set_checkpoint_file_size(version, checkpoint_size as u64);
let last_checkpoint_path = self.last_checkpoint_path();
let checkpoint_metadata = CheckpointMetadata {
size: bytes.len(),
version,
checksum: Some(1218259706),
extend_metadata: HashMap::new(),
};
debug!(
"Rewrite checkpoint in path: {}, metadata: {:?}",
last_checkpoint_path, checkpoint_metadata
);
let bytes = checkpoint_metadata.encode()?;
// Overwrite the last checkpoint with the modified content
self.object_store
.write(&last_checkpoint_path, bytes.clone())
.await
.context(OpenDalSnafu)?;
Ok(())
pub(crate) fn checkpoint_storage(&self) -> &CheckpointStorage<CheckpointTracker> {
&self.checkpoint_storage
}
/// Compute the size(Byte) in manifest size map.
pub(crate) fn total_manifest_size(&self) -> u64 {
self.manifest_size_map.read().unwrap().values().sum()
pub(crate) fn delta_storage(&self) -> &DeltaStorage<DeltaTracker> {
&self.delta_storage
}
/// Resets the size of all files.
pub(crate) fn reset_manifest_size(&mut self) {
self.manifest_size_map.write().unwrap().clear();
self.total_manifest_size.store(0, Ordering::Relaxed);
}
/// Set the size of the delta file by delta version.
pub(crate) fn set_delta_file_size(&mut self, version: ManifestVersion, size: u64) {
let mut m = self.manifest_size_map.write().unwrap();
m.insert(FileKey::Delta(version), size);
self.inc_total_manifest_size(size);
}
/// Set the size of the checkpoint file by checkpoint version.
pub(crate) fn set_checkpoint_file_size(&self, version: ManifestVersion, size: u64) {
let mut m = self.manifest_size_map.write().unwrap();
m.insert(FileKey::Checkpoint(version), size);
self.inc_total_manifest_size(size);
}
fn unset_file_size(&self, key: &FileKey) {
let mut m = self.manifest_size_map.write().unwrap();
if let Some(val) = m.remove(key) {
debug!("Unset file size: {:?}, size: {}", key, val);
self.dec_total_manifest_size(val);
}
}
fn inc_total_manifest_size(&self, val: u64) {
self.total_manifest_size.fetch_add(val, Ordering::Relaxed);
}
fn dec_total_manifest_size(&self, val: u64) {
self.total_manifest_size.fetch_sub(val, Ordering::Relaxed);
}
/// Fetch all staging manifest files and return them as (version, action_list) pairs.
pub async fn fetch_staging_manifests(&self) -> Result<Vec<(ManifestVersion, Vec<u8>)>> {
let manifest_entries = self
.get_paths(
|entry| {
let file_name = entry.name();
if is_delta_file(file_name) {
let version = file_version(file_name);
Some((version, entry))
} else {
None
}
},
true,
)
.await?;
let mut sorted_entries = manifest_entries;
Self::sort_manifests(&mut sorted_entries);
self.fetch_manifests_from_entries(sorted_entries, true)
.await
}
/// Clear all staging manifest files.
pub async fn clear_staging_manifests(&mut self) -> Result<()> {
self.object_store
.remove_all(&self.staging_path)
.await
.context(OpenDalSnafu)?;
debug!(
"Cleared all staging manifest files from {}",
self.staging_path
);
Ok(())
}
/// Gets a manifest file from cache.
/// Returns the file data if found in cache, None otherwise.
/// If `is_staging` is true, always returns None.
async fn get_from_cache(&self, key: &str, is_staging: bool) -> Option<Vec<u8>> {
if is_staging {
return None;
}
let cache = self.manifest_cache.as_ref()?;
cache.get_file(key).await
}
/// Puts a manifest file into cache.
/// If `is_staging` is true, does nothing.
async fn put_to_cache(&self, key: String, data: &[u8], is_staging: bool) {
if is_staging {
return;
}
let Some(cache) = &self.manifest_cache else {
return;
};
cache.put_file(key, data.to_vec()).await;
}
/// Writes data to object store and puts it into cache.
/// If `is_staging` is true, cache is skipped.
async fn write_and_put_cache(&self, path: &str, data: Vec<u8>, is_staging: bool) -> Result<()> {
// Clone data for cache before writing, only if cache is enabled and not staging
let cache_data = if !is_staging && self.manifest_cache.is_some() {
Some(data.clone())
} else {
None
};
// Write to object store
self.object_store
.write(path, data)
.await
.context(OpenDalSnafu)?;
// Put to cache if we cloned the data
if let Some(data) = cache_data {
self.put_to_cache(path.to_string(), &data, is_staging).await;
}
Ok(())
}
/// Removes a manifest file from cache.
async fn remove_from_cache(&self, key: &str) {
let Some(cache) = &self.manifest_cache else {
return;
};
cache.remove(key).await;
}
}
#[derive(Serialize, Deserialize, Debug)]
pub(crate) struct CheckpointMetadata {
pub size: usize,
/// The latest version this checkpoint contains.
pub version: ManifestVersion,
pub checksum: Option<u32>,
pub extend_metadata: HashMap<String, String>,
}
impl CheckpointMetadata {
fn encode(&self) -> Result<Vec<u8>> {
Ok(serde_json::to_string(self)
.context(SerdeJsonSnafu)?
.into_bytes())
}
fn decode(bs: &[u8]) -> Result<Self> {
let data = std::str::from_utf8(bs).context(Utf8Snafu)?;
serde_json::from_str(data).context(SerdeJsonSnafu)
pub(crate) fn set_compress_type(&mut self, compress_type: CompressionType) {
self.checkpoint_storage.set_compress_type(compress_type);
self.delta_storage.set_compress_type(compress_type);
self.staging_storage.set_compress_type(compress_type);
}
}
@@ -854,6 +412,7 @@ mod tests {
use object_store::services::Fs;
use super::*;
use crate::manifest::storage::checkpoint::CheckpointMetadata;
fn new_test_manifest_store() -> ManifestObjectStore {
common_telemetry::init_default_ut_logging();
@@ -890,14 +449,14 @@ mod tests {
#[tokio::test]
async fn test_manifest_log_store_uncompress() {
let mut log_store = new_test_manifest_store();
log_store.compress_type = CompressionType::Uncompressed;
log_store.set_compress_type(CompressionType::Uncompressed);
test_manifest_log_store_case(log_store).await;
}
#[tokio::test]
async fn test_manifest_log_store_compress() {
let mut log_store = new_test_manifest_store();
log_store.compress_type = CompressionType::Gzip;
log_store.set_compress_type(CompressionType::Gzip);
test_manifest_log_store_case(log_store).await;
}
@@ -941,6 +500,7 @@ mod tests {
//delete (,4) logs and keep checkpoint 3.
let _ = log_store.delete_until(4, true).await.unwrap();
let _ = log_store
.checkpoint_storage
.load_checkpoint(new_checkpoint_metadata_with_version(3))
.await
.unwrap()
@@ -958,6 +518,7 @@ mod tests {
let _ = log_store.delete_until(11, false).await.unwrap();
assert!(
log_store
.checkpoint_storage
.load_checkpoint(new_checkpoint_metadata_with_version(3))
.await
.unwrap()
@@ -976,7 +537,7 @@ mod tests {
let mut log_store = new_test_manifest_store();
// write uncompressed data to simulate previously uncompressed data
log_store.compress_type = CompressionType::Uncompressed;
log_store.set_compress_type(CompressionType::Uncompressed);
for v in 0..5 {
log_store
.save(v, format!("hello, {v}").as_bytes(), false)
@@ -989,7 +550,7 @@ mod tests {
.unwrap();
// change compress type
log_store.compress_type = CompressionType::Gzip;
log_store.set_compress_type(CompressionType::Gzip);
// test load_last_checkpoint work correctly for previously uncompressed data
let (v, checkpoint) = log_store.load_last_checkpoint().await.unwrap().unwrap();
@@ -1018,6 +579,7 @@ mod tests {
assert_eq!(format!("hello, {v}").as_bytes(), bytes);
}
let (v, checkpoint) = log_store
.checkpoint_storage
.load_checkpoint(new_checkpoint_metadata_with_version(5))
.await
.unwrap()
@@ -1052,7 +614,7 @@ mod tests {
async fn test_uncompressed_manifest_files_size() {
let mut log_store = new_test_manifest_store();
// write 5 uncompressed manifest files, 8B per file
log_store.compress_type = CompressionType::Uncompressed;
log_store.set_compress_type(CompressionType::Uncompressed);
for v in 0..5 {
log_store
.save(v, format!("hello, {v}").as_bytes(), false)
@@ -1090,7 +652,7 @@ mod tests {
async fn test_compressed_manifest_files_size() {
let mut log_store = new_test_manifest_store();
// Test with compressed manifest files
log_store.compress_type = CompressionType::Gzip;
log_store.set_compress_type(CompressionType::Gzip);
// write 5 manifest files
for v in 0..5 {
log_store

View File

@@ -0,0 +1,316 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use std::collections::HashMap;
use std::sync::Arc;
use common_datasource::compression::CompressionType;
use common_telemetry::debug;
use object_store::{ErrorKind, ObjectStore};
use serde::{Deserialize, Serialize};
use snafu::ResultExt;
use store_api::ManifestVersion;
use crate::cache::manifest_cache::ManifestCache;
use crate::error::{
CompressObjectSnafu, DecompressObjectSnafu, OpenDalSnafu, Result, SerdeJsonSnafu, Utf8Snafu,
};
use crate::manifest::storage::size_tracker::Tracker;
use crate::manifest::storage::utils::{get_from_cache, put_to_cache, write_and_put_cache};
use crate::manifest::storage::{
FALL_BACK_COMPRESS_TYPE, LAST_CHECKPOINT_FILE, checkpoint_checksum, checkpoint_file, gen_path,
verify_checksum,
};
/// Metadata describing a saved checkpoint.
///
/// It is serialized as JSON (see `encode` / `decode`) and written to the
/// last-checkpoint file when a checkpoint is saved.
#[derive(Serialize, Deserialize, Debug)]
pub(crate) struct CheckpointMetadata {
/// Length in bytes of the checkpoint content as handed to the save routine
/// (i.e. before compression is applied).
pub size: usize,
/// The latest version this checkpoint contains.
pub version: ManifestVersion,
/// Optional checksum; it is passed to `verify_checksum` together with the
/// decoded checkpoint content when the checkpoint is loaded.
pub checksum: Option<u32>,
/// Additional key-value metadata. The save routines in this file always
/// write an empty map.
pub extend_metadata: HashMap<String, String>,
}
impl CheckpointMetadata {
    /// Serializes the metadata to a JSON string and returns its bytes.
    fn encode(&self) -> Result<Vec<u8>> {
        let json = serde_json::to_string(self).context(SerdeJsonSnafu)?;
        Ok(json.into_bytes())
    }

    /// Parses metadata from `bs`, which must be UTF-8 encoded JSON.
    ///
    /// Fails with a UTF-8 error if `bs` is not valid UTF-8, or with a
    /// serde-json error if the text is not valid metadata JSON.
    fn decode(bs: &[u8]) -> Result<Self> {
        let text = std::str::from_utf8(bs).context(Utf8Snafu)?;
        let metadata = serde_json::from_str(text).context(SerdeJsonSnafu)?;
        Ok(metadata)
    }
}
/// Handle checkpoint storage operations.
///
/// Reads and writes checkpoint files (and the last-checkpoint pointer file)
/// under `path` in `object_store`.
#[derive(Debug, Clone)]
pub(crate) struct CheckpointStorage<T: Tracker> {
/// Object store that holds the checkpoint files.
object_store: ObjectStore,
/// Compression used when saving checkpoints and tried first when loading them.
compress_type: CompressionType,
/// Directory prefix that checkpoint file names are appended to.
path: String,
/// Optional cache consulted before reading a checkpoint from the object store
/// and filled after a successful read or write.
manifest_cache: Option<ManifestCache>,
/// Receives the stored (compressed) size of each checkpoint that is saved or
/// read from the object store.
size_tracker: Arc<T>,
}
impl<T: Tracker> CheckpointStorage<T> {
/// Creates a checkpoint storage.
///
/// * `path` - directory prefix under which checkpoint files live.
/// * `object_store` - store used for all reads and writes.
/// * `compress_type` - compression applied to checkpoint files.
/// * `manifest_cache` - optional cache for checkpoint contents.
/// * `size_tracker` - tracker notified with the stored size of each checkpoint.
pub fn new(
    path: String,
    object_store: ObjectStore,
    compress_type: CompressionType,
    manifest_cache: Option<ManifestCache>,
    size_tracker: Arc<T>,
) -> Self {
    Self {
        path,
        object_store,
        compress_type,
        manifest_cache,
        size_tracker,
    }
}
/// Returns the path of the last-checkpoint file.
///
/// The last-checkpoint file is never compressed, so its name does not depend
/// on the compression algorithm configured for this storage.
pub(crate) fn last_checkpoint_path(&self) -> String {
    let dir = &self.path;
    format!("{dir}{LAST_CHECKPOINT_FILE}")
}
/// Builds the path of the checkpoint file for `version`, using the
/// **currently configured** compression algorithm.
fn checkpoint_file_path(&self, version: ManifestVersion) -> String {
    let file_name = checkpoint_file(version);
    gen_path(&self.path, &file_name, self.compress_type)
}
/// Loads the checkpoint described by `metadata`.
///
/// Lookup order:
/// 1. the cache, keyed by the path under the current compression type;
/// 2. the object store at that path;
/// 3. if that file is not found and the current compression type differs from
///    `FALL_BACK_COMPRESS_TYPE`, the cache and then the object store at the
///    fallback path.
///
/// Every loaded payload is checked with `verify_checksum` against
/// `metadata.checksum`. Payloads read from the object store additionally have
/// their stored size recorded in the size tracker and are put into the cache.
///
/// Returns `Ok(None)` when no checkpoint file exists for the version.
pub(crate) async fn load_checkpoint(
    &mut self,
    metadata: CheckpointMetadata,
) -> Result<Option<(ManifestVersion, Vec<u8>)>> {
    let version = metadata.version;
    let expected_checksum = metadata.checksum;
    let path = self.checkpoint_file_path(version);

    // Try to get from cache first
    if let Some(cached) = get_from_cache(self.manifest_cache.as_ref(), &path).await {
        verify_checksum(&cached, expected_checksum)?;
        return Ok(Some((version, cached)));
    }

    // Read under the current compression type; on success we are done.
    let read_err = match self.object_store.read(&path).await {
        Ok(raw) => {
            let stored_size = raw.len() as u64;
            let decoded = self
                .compress_type
                .decode(raw)
                .await
                .with_context(|_| DecompressObjectSnafu {
                    compress_type: self.compress_type,
                    path: path.clone(),
                })?;
            verify_checksum(&decoded, expected_checksum)?;
            // set the checkpoint size
            self.size_tracker.record(version, stored_size);
            // Add to cache
            put_to_cache(self.manifest_cache.as_ref(), path, &decoded).await;
            return Ok(Some((version, decoded)));
        }
        Err(e) => e,
    };

    // Anything other than "not found" is a real failure.
    if read_err.kind() != ErrorKind::NotFound {
        return Err(read_err).context(OpenDalSnafu);
    }
    // No distinct fallback to try.
    if self.compress_type == FALL_BACK_COMPRESS_TYPE {
        return Ok(None);
    }

    // For backward compatibility the checkpoint may have been written with the
    // fallback compression type, so look for it under that name as well.
    let fall_back_path = gen_path(
        &self.path,
        &checkpoint_file(version),
        FALL_BACK_COMPRESS_TYPE,
    );
    debug!(
        "Failed to load checkpoint from path: {}, fall back to path: {}",
        path, fall_back_path
    );

    // Try to get fallback from cache first
    if let Some(cached) = get_from_cache(self.manifest_cache.as_ref(), &fall_back_path).await {
        verify_checksum(&cached, expected_checksum)?;
        return Ok(Some((version, cached)));
    }

    match self.object_store.read(&fall_back_path).await {
        Ok(raw) => {
            let stored_size = raw.len() as u64;
            let decoded = FALL_BACK_COMPRESS_TYPE
                .decode(raw)
                .await
                .with_context(|_| DecompressObjectSnafu {
                    compress_type: FALL_BACK_COMPRESS_TYPE,
                    path: fall_back_path.clone(),
                })?;
            verify_checksum(&decoded, expected_checksum)?;
            self.size_tracker.record(version, stored_size);
            // Add fallback to cache
            put_to_cache(self.manifest_cache.as_ref(), fall_back_path, &decoded).await;
            Ok(Some((version, decoded)))
        }
        Err(e) if e.kind() == ErrorKind::NotFound => Ok(None),
        Err(e) => Err(e).context(OpenDalSnafu),
    }
}
/// Loads the latest checkpoint.
///
/// Reads the last-checkpoint file directly from the object store (the cache is
/// not consulted for it), decodes the [CheckpointMetadata] it contains and then
/// loads the checkpoint that metadata points to.
///
/// Returns the manifest version and the raw
/// [RegionCheckpoint](crate::manifest::action::RegionCheckpoint) content, or
/// `None` if the last-checkpoint file or the checkpoint itself does not exist.
pub async fn load_last_checkpoint(&mut self) -> Result<Option<(ManifestVersion, Vec<u8>)>> {
    let last_checkpoint_path = self.last_checkpoint_path();

    // Fetch from remote object store without cache
    let pointer_bytes = match self.object_store.read(&last_checkpoint_path).await {
        Ok(buffer) => buffer.to_vec(),
        Err(e) => {
            return if e.kind() == ErrorKind::NotFound {
                Ok(None)
            } else {
                Err(e).context(OpenDalSnafu)
            };
        }
    };

    let checkpoint_metadata = CheckpointMetadata::decode(&pointer_bytes)?;
    debug!(
        "Load checkpoint in path: {}, metadata: {:?}",
        last_checkpoint_path, checkpoint_metadata
    );

    self.load_checkpoint(checkpoint_metadata).await
}
/// Saves the checkpoint manifest file for `version`.
///
/// `bytes` is compressed with the configured compression type, written to the
/// object store (and the cache, via `write_and_put_cache`), and its compressed
/// size is recorded in the size tracker. Afterwards the last-checkpoint file is
/// overwritten with metadata (size, version and checksum of the uncompressed
/// `bytes`) pointing to the new checkpoint.
pub(crate) async fn save_checkpoint(
    &self,
    version: ManifestVersion,
    bytes: &[u8],
) -> Result<()> {
    let checkpoint_path = self.checkpoint_file_path(version);
    let compressed = self
        .compress_type
        .encode(bytes)
        .await
        .context(CompressObjectSnafu {
            compress_type: self.compress_type,
            path: &checkpoint_path,
        })?;
    let stored_size = compressed.len() as u64;
    let checksum = checkpoint_checksum(bytes);

    write_and_put_cache(
        &self.object_store,
        self.manifest_cache.as_ref(),
        &checkpoint_path,
        compressed,
    )
    .await?;
    self.size_tracker.record(version, stored_size);

    // The last checkpoint file only contains a few small fields (size, version,
    // checksum), so it is written uncompressed.
    let last_checkpoint_path = self.last_checkpoint_path();
    let checkpoint_metadata = CheckpointMetadata {
        size: bytes.len(),
        version,
        checksum: Some(checksum),
        extend_metadata: HashMap::new(),
    };
    debug!(
        "Save checkpoint in path: {}, metadata: {:?}",
        last_checkpoint_path, checkpoint_metadata
    );

    let encoded_metadata = checkpoint_metadata.encode()?;
    self.object_store
        .write(&last_checkpoint_path, encoded_metadata)
        .await
        .context(OpenDalSnafu)?;
    Ok(())
}
}
#[cfg(test)]
impl<T: Tracker> CheckpointStorage<T> {
    /// Test helper: writes a checkpoint for `version` straight to the object
    /// store (bypassing the cache) and then overwrites the last-checkpoint file
    /// with metadata that carries a hard-coded checksum instead of one computed
    /// from `bytes`.
    pub async fn write_last_checkpoint(
        &self,
        version: ManifestVersion,
        bytes: &[u8],
    ) -> Result<()> {
        let checkpoint_path = self.checkpoint_file_path(version);
        let compressed = self
            .compress_type
            .encode(bytes)
            .await
            .context(CompressObjectSnafu {
                compress_type: self.compress_type,
                path: &checkpoint_path,
            })?;
        let stored_size = compressed.len() as u64;
        self.object_store
            .write(&checkpoint_path, compressed)
            .await
            .context(OpenDalSnafu)?;
        self.size_tracker.record(version, stored_size);

        let last_checkpoint_path = self.last_checkpoint_path();
        let checkpoint_metadata = CheckpointMetadata {
            size: bytes.len(),
            version,
            // NOTE(review): fixed value, not derived from `bytes`; presumably
            // chosen by the tests that call this helper - confirm there.
            checksum: Some(1218259706),
            extend_metadata: HashMap::new(),
        };
        debug!(
            "Rewrite checkpoint in path: {}, metadata: {:?}",
            last_checkpoint_path, checkpoint_metadata
        );

        // Overwrite the last checkpoint with the modified content
        let encoded_metadata = checkpoint_metadata.encode()?;
        self.object_store
            .write(&last_checkpoint_path, encoded_metadata)
            .await
            .context(OpenDalSnafu)?;
        Ok(())
    }

    /// Test helper: switches the compression type used for subsequent
    /// checkpoint reads and writes.
    pub fn set_compress_type(&mut self, compress_type: CompressionType) {
        self.compress_type = compress_type;
    }
}

View File

@@ -0,0 +1,251 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use std::sync::Arc;
use common_datasource::compression::CompressionType;
use common_telemetry::debug;
use futures::TryStreamExt;
use futures::future::try_join_all;
use object_store::{Entry, ErrorKind, Lister, ObjectStore};
use snafu::{ResultExt, ensure};
use store_api::ManifestVersion;
use store_api::storage::RegionId;
use tokio::sync::Semaphore;
use crate::cache::manifest_cache::ManifestCache;
use crate::error::{
CompressObjectSnafu, DecompressObjectSnafu, InvalidScanIndexSnafu, OpenDalSnafu, Result,
};
use crate::manifest::storage::size_tracker::Tracker;
use crate::manifest::storage::utils::{
get_from_cache, put_to_cache, sort_manifests, write_and_put_cache,
};
use crate::manifest::storage::{
FETCH_MANIFEST_PARALLELISM, delta_file, file_compress_type, file_version, gen_path,
is_delta_file,
};
/// Handles storage operations for delta manifest files: listing, fetching and
/// saving them under `path` in `object_store`.
#[derive(Debug, Clone)]
pub(crate) struct DeltaStorage<T: Tracker> {
/// Object store that holds the delta files.
object_store: ObjectStore,
/// Compression applied when saving a delta file. When fetching, the
/// compression type is derived from each file's name instead.
compress_type: CompressionType,
/// Directory that is listed for manifest files and that delta file names are
/// appended to.
path: String,
/// Receives the stored (compressed) size of each delta file that is saved.
delta_tracker: Arc<T>,
/// Optional cache consulted before reading a delta file from the object store
/// and filled after a successful read or write.
manifest_cache: Option<ManifestCache>,
}
impl<T: Tracker> DeltaStorage<T> {
/// Creates a delta storage.
///
/// * `path` - directory under which delta files live.
/// * `object_store` - store used for all reads and writes.
/// * `compress_type` - compression applied when saving delta files.
/// * `manifest_cache` - optional cache for delta file contents.
/// * `delta_tracker` - tracker notified with the stored size of each saved file.
pub(crate) fn new(
    path: String,
    object_store: ObjectStore,
    compress_type: CompressionType,
    manifest_cache: Option<ManifestCache>,
    delta_tracker: Arc<T>,
) -> Self {
    Self {
        path,
        object_store,
        compress_type,
        manifest_cache,
        delta_tracker,
    }
}
/// Returns the directory path this storage operates on.
pub(crate) fn path(&self) -> &str {
    self.path.as_str()
}
pub(crate) fn object_store(&self) -> &ObjectStore {
&self.object_store
}
/// Builds the path of the delta file for `version`, using the currently
/// configured compression type.
fn delta_file_path(&self, version: ManifestVersion) -> String {
    let file_name = delta_file(version);
    gen_path(&self.path, &file_name, self.compress_type)
}
/// Returns a lister over the entries of the manifest directory.
///
/// Returns `Ok(None)` if the object store reports the directory as not found;
/// any other listing error is returned to the caller.
pub(crate) async fn manifest_lister(&self) -> Result<Option<Lister>> {
    let listing = self.object_store.lister_with(&self.path).await;
    match listing {
        Ok(lister) => Ok(Some(lister)),
        Err(e) => {
            if e.kind() != ErrorKind::NotFound {
                return Err(e).context(OpenDalSnafu);
            }
            debug!("Manifest directory does not exist: {}", self.path);
            Ok(None)
        }
    }
}
/// Lists the manifest directory and maps every entry through `filter`.
///
/// Entries for which `filter` returns `Some(r)` contribute `r` to the result;
/// entries for which it returns `None` are discarded.
/// Returns an empty vector when the directory is not found.
pub async fn get_paths<F, R>(&self, filter: F) -> Result<Vec<R>>
where
    F: Fn(Entry) -> Option<R>,
{
    match self.manifest_lister().await? {
        None => Ok(vec![]),
        Some(lister) => lister
            .try_filter_map(|entry| async { Ok(filter(entry)) })
            .try_collect::<Vec<_>>()
            .await
            .context(OpenDalSnafu),
    }
}
/// Scans the manifest directory and returns the delta file entries whose
/// version lies in `[start, end)`, ordered by `sort_manifests`.
///
/// Fails with an invalid-scan-index error when `start > end`.
pub async fn scan(
    &self,
    start: ManifestVersion,
    end: ManifestVersion,
) -> Result<Vec<(ManifestVersion, Entry)>> {
    ensure!(start <= end, InvalidScanIndexSnafu { start, end });

    let mut entries: Vec<(ManifestVersion, Entry)> = self
        .get_paths(|entry| {
            let name = entry.name();
            // Only delta files carry a version we are interested in.
            if !is_delta_file(name) {
                return None;
            }
            let version = file_version(name);
            if version < start || version >= end {
                return None;
            }
            Some((version, entry))
        })
        .await?;

    sort_manifests(&mut entries);
    Ok(entries)
}
/// Fetches manifests in range `[start_version, end_version)`.
///
/// The result is guaranteed to begin exactly at `start_version`: everything
/// before the manifest with that version is dropped, and if no manifest with
/// `start_version` was fetched an empty vector is returned.
///
/// `region_id` is only used for logging.
pub async fn fetch_manifests_strict_from(
    &self,
    start_version: ManifestVersion,
    end_version: ManifestVersion,
    region_id: RegionId,
) -> Result<Vec<(ManifestVersion, Vec<u8>)>> {
    let mut manifests = self.fetch_manifests(start_version, end_version).await?;
    let start_index = manifests
        .iter()
        .position(|(version, _)| *version == start_version);
    debug!(
        "Fetches manifests in range [{},{}), start_index: {:?}, region_id: {}, manifests: {:?}",
        start_version,
        end_version,
        start_index,
        region_id,
        manifests.iter().map(|(v, _)| *v).collect::<Vec<_>>()
    );

    match start_index {
        Some(index) => Ok(manifests.split_off(index)),
        None => Ok(vec![]),
    }
}
/// Common implementation for fetching manifests from entries in parallel.
///
/// For each entry the cache is consulted first (keyed by the entry path). On a
/// miss the file is read from the object store, decoded with the compression
/// type derived from the file name, and put into the cache. The number of
/// fetches running at once is bounded by `FETCH_MANIFEST_PARALLELISM`.
///
/// Results are returned in the same order as `entries`; the first error aborts
/// the whole fetch.
pub(crate) async fn fetch_manifests_from_entries(
    &self,
    entries: Vec<(ManifestVersion, Entry)>,
) -> Result<Vec<(ManifestVersion, Vec<u8>)>> {
    if entries.is_empty() {
        return Ok(vec![]);
    }

    // TODO(weny): Make it configurable.
    let semaphore = Semaphore::new(FETCH_MANIFEST_PARALLELISM);

    let tasks = entries.iter().map(|(version, entry)| async {
        // Safety: semaphore must exist.
        let _permit = semaphore.acquire().await.unwrap();

        let entry_path = entry.path();
        // Try to get from cache first
        match get_from_cache(self.manifest_cache.as_ref(), entry_path).await {
            Some(cached) => Ok((*version, cached)),
            None => {
                // Fetch from remote object store
                let compress_type = file_compress_type(entry.name());
                let raw = self
                    .object_store
                    .read(entry_path)
                    .await
                    .context(OpenDalSnafu)?;
                let decoded = compress_type
                    .decode(raw)
                    .await
                    .context(DecompressObjectSnafu {
                        compress_type,
                        path: entry_path,
                    })?;

                // Add to cache
                put_to_cache(
                    self.manifest_cache.as_ref(),
                    entry_path.to_string(),
                    &decoded,
                )
                .await;
                Ok((*version, decoded))
            }
        }
    });

    try_join_all(tasks).await
}
/// Fetches, concurrently, all manifests whose version lies in
/// `[start_version, end_version)`.
///
/// **Notes**: This function does not guarantee that the result starts at
/// `start_version`. Use
/// [fetch_manifests_strict_from](DeltaStorage::fetch_manifests_strict_from) when
/// the result must begin exactly at `start_version`.
pub async fn fetch_manifests(
    &self,
    start_version: ManifestVersion,
    end_version: ManifestVersion,
) -> Result<Vec<(ManifestVersion, Vec<u8>)>> {
    let entries = self.scan(start_version, end_version).await?;
    self.fetch_manifests_from_entries(entries).await
}
/// Saves the delta manifest file for `version`.
///
/// `bytes` is compressed with the configured compression type, written to the
/// object store (and the cache, via `write_and_put_cache`), and the compressed
/// size is recorded in the delta tracker.
pub async fn save(&mut self, version: ManifestVersion, bytes: &[u8]) -> Result<()> {
    let delta_path = self.delta_file_path(version);
    debug!("Save log to manifest storage, version: {}", version);

    let compressed = self
        .compress_type
        .encode(bytes)
        .await
        .context(CompressObjectSnafu {
            compress_type: self.compress_type,
            path: &delta_path,
        })?;
    let stored_size = compressed.len() as u64;

    write_and_put_cache(
        &self.object_store,
        self.manifest_cache.as_ref(),
        &delta_path,
        compressed,
    )
    .await?;
    self.delta_tracker.record(version, stored_size);
    Ok(())
}
}
#[cfg(test)]
impl<T: Tracker> DeltaStorage<T> {
    /// Test helper: switches the compression type used when saving delta files.
    pub fn set_compress_type(&mut self, compress_type: CompressionType) {
        self.compress_type = compress_type;
    }
}

View File

@@ -0,0 +1,130 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use std::collections::HashMap;
use std::fmt::Debug;
use std::sync::atomic::{AtomicU64, Ordering};
use std::sync::{Arc, RwLock};
use store_api::ManifestVersion;
/// Key to identify a manifest file.
///
/// Used as the map key under which [SizeTracker] stores per-file sizes.
#[derive(Debug, Clone, Copy, Eq, PartialEq, Hash)]
pub(crate) enum FileKey {
/// A delta file (`.json`) with the given manifest version.
Delta(ManifestVersion),
/// A checkpoint file (`.checkpoint`) with the given manifest version.
Checkpoint(ManifestVersion),
}
/// Sink for file-size notifications emitted by the manifest storages.
///
/// Implementations decide what kind of file (delta, checkpoint, or none at all)
/// a recorded version refers to.
pub(crate) trait Tracker: Send + Sync + Debug {
/// Records that the manifest file for `version` occupies `size` bytes.
fn record(&self, version: ManifestVersion, size: u64);
}
/// [Tracker] that records sizes as checkpoint files in a [SizeTracker].
#[derive(Debug, Clone)]
pub struct CheckpointTracker {
    size_tracker: SizeTracker,
}

impl Tracker for CheckpointTracker {
    /// Records `size` under [FileKey::Checkpoint] for `version`.
    fn record(&self, version: ManifestVersion, size: u64) {
        let key = FileKey::Checkpoint(version);
        self.size_tracker.record(key, size);
    }
}
/// [Tracker] that records sizes as delta files in a [SizeTracker].
#[derive(Debug, Clone)]
pub struct DeltaTracker {
    size_tracker: SizeTracker,
}

impl Tracker for DeltaTracker {
    /// Records `size` under [FileKey::Delta] for `version`.
    fn record(&self, version: ManifestVersion, size: u64) {
        let key = FileKey::Delta(version);
        self.size_tracker.record(key, size);
    }
}
/// [Tracker] that ignores every record; for storages whose file sizes are not
/// tracked.
#[derive(Debug, Clone)]
pub struct NoopTracker;

impl Tracker for NoopTracker {
    /// Does nothing.
    fn record(&self, _: ManifestVersion, _: u64) {}
}
/// Tracks the size of individual manifest files together with a running total.
///
/// Both fields are behind `Arc`, so clones of a tracker share the same state.
#[derive(Debug, Clone, Default)]
pub(crate) struct SizeTracker {
/// Size in bytes of each tracked file, keyed by [FileKey].
file_sizes: Arc<RwLock<HashMap<FileKey, u64>>>,
/// Running total of the tracked sizes; the counter is supplied by the caller
/// of `new`, so it may be shared with code outside this tracker.
total_size: Arc<AtomicU64>,
}
impl SizeTracker {
    /// Returns a new [SizeTracker] that accumulates into `total_size`.
    pub fn new(total_size: Arc<AtomicU64>) -> Self {
        Self {
            file_sizes: Arc::new(RwLock::new(HashMap::new())),
            total_size,
        }
    }

    /// Returns a tracker that records delta file sizes into this tracker.
    pub(crate) fn manifest_tracker(&self) -> DeltaTracker {
        DeltaTracker {
            size_tracker: self.clone(),
        }
    }

    /// Returns a tracker that records checkpoint file sizes into this tracker.
    pub(crate) fn checkpoint_tracker(&self) -> CheckpointTracker {
        CheckpointTracker {
            size_tracker: self.clone(),
        }
    }

    /// Records a delta file size.
    pub(crate) fn record_delta(&self, version: ManifestVersion, size: u64) {
        self.record(FileKey::Delta(version), size);
    }

    /// Records a checkpoint file size.
    pub(crate) fn record_checkpoint(&self, version: ManifestVersion, size: u64) {
        self.record(FileKey::Checkpoint(version), size);
    }

    /// Removes a file from tracking and subtracts its size from the total.
    ///
    /// Does nothing if `key` is not tracked.
    pub(crate) fn remove(&self, key: &FileKey) {
        // Keep the write guard alive while the total is adjusted so the map and
        // the counter change together with respect to other tracker operations.
        let mut file_sizes = self.file_sizes.write().unwrap();
        if let Some(size) = file_sizes.remove(key) {
            self.total_size.fetch_sub(size, Ordering::Relaxed);
        }
    }

    /// Returns the total tracked size.
    pub(crate) fn total(&self) -> u64 {
        self.total_size.load(Ordering::Relaxed)
    }

    /// Resets all tracking: forgets every file and sets the total to zero.
    pub(crate) fn reset(&self) {
        // Zero the counter before releasing the lock; otherwise a concurrent
        // `record` could slip in between the clear and the store and have its
        // contribution wiped while its map entry survives.
        let mut file_sizes = self.file_sizes.write().unwrap();
        file_sizes.clear();
        self.total_size.store(0, Ordering::Relaxed);
    }

    /// Records `size` for `key`, replacing any size previously recorded for it.
    fn record(&self, key: FileKey, size: u64) {
        // Hold the write guard across both counter updates so a concurrent
        // `reset` or `remove` cannot interleave between the map insert and the
        // addition to the total.
        let mut file_sizes = self.file_sizes.write().unwrap();
        // Remove the old size if present
        if let Some(old_size) = file_sizes.insert(key, size) {
            self.total_size.fetch_sub(old_size, Ordering::Relaxed);
        }
        self.total_size.fetch_add(size, Ordering::Relaxed);
    }
}

View File

@@ -0,0 +1,109 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use std::sync::Arc;
use common_datasource::compression::CompressionType;
use common_telemetry::debug;
use object_store::{Lister, ObjectStore, util};
use snafu::ResultExt;
use store_api::ManifestVersion;
use crate::error::{OpenDalSnafu, Result};
use crate::manifest::storage::delta::DeltaStorage;
use crate::manifest::storage::size_tracker::NoopTracker;
use crate::manifest::storage::utils::sort_manifests;
use crate::manifest::storage::{file_version, is_delta_file};
/// Storage for the manifest files kept in a region's staging directory.
///
/// Wraps a [DeltaStorage] that uses a [NoopTracker], so file sizes are not tracked.
#[derive(Debug, Clone)]
pub(crate) struct StagingStorage {
/// Delta storage rooted at the staging manifest directory.
delta_storage: DeltaStorage<NoopTracker>,
}
impl StagingStorage {
    /// Creates a staging storage whose directory is derived from the manifest
    /// directory `path`.
    ///
    /// The trailing `manifest/` component of `path` is dropped and
    /// `staging/manifest` is appended to what remains.
    pub fn new(path: String, object_store: ObjectStore, compress_type: CompressionType) -> Self {
        let staging_path = {
            // Convert "region_dir/manifest/" to "region_dir/staging/manifest/"
            let parent_dir = path.trim_end_matches("manifest/").trim_end_matches('/');
            util::normalize_dir(&format!("{}/staging/manifest", parent_dir))
        };
        // `staging_path` and `object_store` are not used again, so they are moved
        // into the delta storage instead of being cloned.
        let delta_storage = DeltaStorage::new(
            staging_path,
            object_store,
            compress_type,
            // StagingStorage does not use a manifest cache; set to None.
            None,
            // StagingStorage does not track file sizes, since all staging files are
            // deleted after exiting staging mode.
            Arc::new(NoopTracker),
        );
        Self { delta_storage }
    }

    /// Returns an iterator of manifests from staging directory.
    pub(crate) async fn manifest_lister(&self) -> Result<Option<Lister>> {
        self.delta_storage.manifest_lister().await
    }

    /// Fetch all staging manifest files and return them as (version, action_list) pairs.
    ///
    /// Only entries whose name is a delta file are considered; they are sorted
    /// by version before being fetched.
    pub(crate) async fn fetch_manifests(&self) -> Result<Vec<(ManifestVersion, Vec<u8>)>> {
        let mut manifest_entries = self
            .delta_storage
            .get_paths(|entry| {
                let file_name = entry.name();
                if is_delta_file(file_name) {
                    let version = file_version(file_name);
                    Some((version, entry))
                } else {
                    None
                }
            })
            .await?;
        sort_manifests(&mut manifest_entries);

        self.delta_storage
            .fetch_manifests_from_entries(manifest_entries)
            .await
    }

    /// Save the delta manifest file.
    pub(crate) async fn save(&mut self, version: ManifestVersion, bytes: &[u8]) -> Result<()> {
        self.delta_storage.save(version, bytes).await
    }

    /// Clean all staging manifest files.
    ///
    /// Removes everything under the staging path from the object store and
    /// returns an OpenDAL error if the removal fails.
    pub(crate) async fn clear(&self) -> Result<()> {
        self.delta_storage
            .object_store()
            .remove_all(self.delta_storage.path())
            .await
            .context(OpenDalSnafu)?;

        debug!(
            "Cleared all staging manifest files from {}",
            self.delta_storage.path()
        );

        Ok(())
    }
}
#[cfg(test)]
impl StagingStorage {
/// Test-only helper that forwards `compress_type` to the inner delta storage.
pub fn set_compress_type(&mut self, compress_type: CompressionType) {
self.delta_storage.set_compress_type(compress_type);
}
}

View File

@@ -0,0 +1,73 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use object_store::{Entry, ObjectStore};
use snafu::ResultExt;
use store_api::ManifestVersion;
use crate::cache::manifest_cache::ManifestCache;
use crate::error::{OpenDalSnafu, Result};
/// Looks up a manifest file in the optional cache.
///
/// Returns `None` when no cache is configured; otherwise returns whatever the
/// cache yields for `key`.
pub(crate) async fn get_from_cache(cache: Option<&ManifestCache>, key: &str) -> Option<Vec<u8>> {
    match cache {
        Some(manifest_cache) => manifest_cache.get_file(key).await,
        None => None,
    }
}
/// Stores a copy of `data` in the optional cache under `key`.
///
/// Does nothing when no cache is configured.
pub(crate) async fn put_to_cache(cache: Option<&ManifestCache>, key: String, data: &[u8]) {
    if let Some(manifest_cache) = cache {
        manifest_cache.put_file(key, data.to_vec()).await;
    }
}
/// Evicts the entry for `key` from the optional cache.
///
/// Does nothing when no cache is configured.
pub(crate) async fn remove_from_cache(cache: Option<&ManifestCache>, key: &str) {
    if let Some(manifest_cache) = cache {
        manifest_cache.remove(key).await;
    }
}
/// Writes data to object store and puts it into cache.
///
/// The cache is only populated after the object store write succeeds; if the
/// write fails the error is returned and the cache is left untouched. When no
/// cache is configured the data is written without being copied.
pub(crate) async fn write_and_put_cache(
    object_store: &ObjectStore,
    cache: Option<&ManifestCache>,
    path: &str,
    data: Vec<u8>,
) -> Result<()> {
    // `write` consumes `data`, so take the single copy the cache needs up front,
    // and only if a cache is enabled.
    let cache_copy = cache.map(|manifest_cache| (manifest_cache, data.clone()));

    // Write to object store
    object_store.write(path, data).await.context(OpenDalSnafu)?;

    // Hand the owned copy directly to the cache rather than going through
    // `put_to_cache`, which would copy the bytes a second time.
    if let Some((manifest_cache, cached_bytes)) = cache_copy {
        manifest_cache.put_file(path.to_string(), cached_bytes).await;
    }

    Ok(())
}
/// Sorts the manifest entries in place by ascending version.
///
/// The sort is unstable, so the relative order of entries with equal versions
/// is unspecified.
pub(crate) fn sort_manifests(entries: &mut [(ManifestVersion, Entry)]) {
    entries.sort_unstable_by(|(left, _), (right, _)| left.cmp(right));
}

View File

@@ -25,7 +25,7 @@ use crate::manifest::action::{
RegionCheckpoint, RegionEdit, RegionMetaAction, RegionMetaActionList,
};
use crate::manifest::manager::RegionManifestManager;
use crate::manifest::storage::CheckpointMetadata;
use crate::manifest::storage::checkpoint::CheckpointMetadata;
use crate::manifest::tests::utils::basic_region_metadata;
use crate::sst::file::FileMeta;
use crate::test_util::TestEnv;
@@ -117,7 +117,8 @@ async fn manager_without_checkpoint() {
expected.sort_unstable();
let mut paths = manager
.store()
.get_paths(|e| Some(e.name().to_string()), false)
.delta_storage()
.get_paths(|e| Some(e.name().to_string()))
.await
.unwrap();
paths.sort_unstable();
@@ -159,7 +160,8 @@ async fn manager_with_checkpoint_distance_1() {
expected.sort_unstable();
let mut paths = manager
.store()
.get_paths(|e| Some(e.name().to_string()), false)
.delta_storage()
.get_paths(|e| Some(e.name().to_string()))
.await
.unwrap();
paths.sort_unstable();
@@ -168,7 +170,7 @@ async fn manager_with_checkpoint_distance_1() {
// check content in `_last_checkpoint`
let raw_bytes = manager
.store()
.read_file(&manager.store().last_checkpoint_path())
.read_file(&manager.store().checkpoint_storage().last_checkpoint_path())
.await
.unwrap();
let raw_json = std::str::from_utf8(&raw_bytes).unwrap();
@@ -213,7 +215,7 @@ async fn test_corrupted_data_causing_checksum_error() {
// Corrupt the last checkpoint data
let mut corrupted_bytes = manager
.store()
.read_file(&manager.store().last_checkpoint_path())
.read_file(&manager.store().checkpoint_storage().last_checkpoint_path())
.await
.unwrap();
corrupted_bytes[0] ^= 1;
@@ -221,6 +223,7 @@ async fn test_corrupted_data_causing_checksum_error() {
// Overwrite the latest checkpoint data
manager
.store()
.checkpoint_storage()
.write_last_checkpoint(9, &corrupted_bytes)
.await
.unwrap();
@@ -410,7 +413,8 @@ async fn manifest_install_manifest_to_with_checkpoint() {
expected.sort_unstable();
let mut paths = manager
.store()
.get_paths(|e| Some(e.name().to_string()), false)
.delta_storage()
.get_paths(|e| Some(e.name().to_string()))
.await
.unwrap();

View File

@@ -136,18 +136,18 @@ impl RangesOptions {
#[derive(Debug, Default, Clone)]
pub struct MemtableStats {
/// The estimated bytes allocated by this memtable from heap.
estimated_bytes: usize,
pub estimated_bytes: usize,
/// The inclusive time range that this memtable contains. It is None if
/// and only if the memtable is empty.
time_range: Option<(Timestamp, Timestamp)>,
pub time_range: Option<(Timestamp, Timestamp)>,
/// Total rows in memtable
pub num_rows: usize,
/// Total number of ranges in the memtable.
pub num_ranges: usize,
/// The maximum sequence number in the memtable.
max_sequence: SequenceNumber,
pub max_sequence: SequenceNumber,
/// Number of estimated timeseries in memtable.
series_count: usize,
pub series_count: usize,
}
impl MemtableStats {
@@ -204,8 +204,27 @@ pub type BoxedRecordBatchIterator = Box<dyn Iterator<Item = Result<RecordBatch>>
pub struct MemtableRanges {
/// Range IDs and ranges.
pub ranges: BTreeMap<usize, MemtableRange>,
/// Statistics of the memtable at the query time.
pub stats: MemtableStats,
}
impl MemtableRanges {
    /// Sums the row counts reported by the stats of every range.
    pub fn num_rows(&self) -> usize {
        let mut total: usize = 0;
        for range in self.ranges.values() {
            total += range.stats().num_rows();
        }
        total
    }

    /// Sums the series counts reported by the stats of every range.
    pub fn series_count(&self) -> usize {
        let mut total: usize = 0;
        for range in self.ranges.values() {
            total += range.stats().series_count();
        }
        total
    }

    /// Returns the largest max sequence among all ranges, or 0 if there are none.
    pub fn max_sequence(&self) -> SequenceNumber {
        let mut highest: SequenceNumber = 0;
        for range in self.ranges.values() {
            highest = highest.max(range.stats().max_sequence());
        }
        highest
    }
}
impl IterBuilder for MemtableRanges {
@@ -569,15 +588,19 @@ impl MemtableRangeContext {
pub struct MemtableRange {
/// Shared context.
context: MemtableRangeContextRef,
/// Number of rows in current memtable range.
// todo(hl): use [MemtableRangeStats] instead.
num_rows: usize,
/// Statistics for this memtable range.
stats: MemtableStats,
}
impl MemtableRange {
/// Creates a new range from context.
pub fn new(context: MemtableRangeContextRef, num_rows: usize) -> Self {
Self { context, num_rows }
/// Creates a new range from context and stats.
pub fn new(context: MemtableRangeContextRef, stats: MemtableStats) -> Self {
Self { context, stats }
}
/// Returns the statistics for this range.
pub fn stats(&self) -> &MemtableStats {
&self.stats
}
/// Returns the id of the memtable to read.
@@ -624,7 +647,7 @@ impl MemtableRange {
}
pub fn num_rows(&self) -> usize {
self.num_rows
self.stats.num_rows
}
/// Returns the encoded range if available.

View File

@@ -382,7 +382,7 @@ impl Memtable for BulkMemtable {
if !bulk_parts.unordered_part.is_empty()
&& let Some(unordered_bulk_part) = bulk_parts.unordered_part.to_bulk_part()?
{
let num_rows = unordered_bulk_part.num_rows();
let part_stats = unordered_bulk_part.to_memtable_stats(&self.metadata);
let range = MemtableRange::new(
Arc::new(MemtableRangeContext::new(
self.id,
@@ -393,7 +393,7 @@ impl Memtable for BulkMemtable {
}),
predicate.clone(),
)),
num_rows,
part_stats,
);
ranges.insert(range_id, range);
range_id += 1;
@@ -406,6 +406,7 @@ impl Memtable for BulkMemtable {
continue;
}
let part_stats = part_wrapper.part.to_memtable_stats(&self.metadata);
let range = MemtableRange::new(
Arc::new(MemtableRangeContext::new(
self.id,
@@ -416,7 +417,7 @@ impl Memtable for BulkMemtable {
}),
predicate.clone(),
)),
part_wrapper.part.num_rows(),
part_stats,
);
ranges.insert(range_id, range);
range_id += 1;
@@ -429,6 +430,7 @@ impl Memtable for BulkMemtable {
continue;
}
let part_stats = encoded_part_wrapper.part.to_memtable_stats();
let range = MemtableRange::new(
Arc::new(MemtableRangeContext::new(
self.id,
@@ -440,18 +442,14 @@ impl Memtable for BulkMemtable {
}),
predicate.clone(),
)),
encoded_part_wrapper.part.metadata().num_rows,
part_stats,
);
ranges.insert(range_id, range);
range_id += 1;
}
}
let mut stats = self.stats();
stats.num_ranges = ranges.len();
// TODO(yingwen): Supports per range stats.
Ok(MemtableRanges { ranges, stats })
Ok(MemtableRanges { ranges })
}
fn is_empty(&self) -> bool {
@@ -811,6 +809,14 @@ impl PartToMerge {
}
}
/// Gets the maximum sequence number of this part.
fn max_sequence(&self) -> u64 {
match self {
PartToMerge::Bulk { part, .. } => part.sequence,
PartToMerge::Encoded { part, .. } => part.metadata().max_sequence,
}
}
/// Creates a record batch iterator for this part.
fn create_iterator(
self,
@@ -984,7 +990,7 @@ impl MemtableCompactor {
return Ok(None);
}
// Calculates timestamp bounds for merged data
// Calculates timestamp bounds and max sequence for merged data
let min_timestamp = parts_to_merge
.iter()
.map(|p| p.min_timestamp())
@@ -995,6 +1001,11 @@ impl MemtableCompactor {
.map(|p| p.max_timestamp())
.max()
.unwrap_or(i64::MIN);
let max_sequence = parts_to_merge
.iter()
.map(|p| p.max_sequence())
.max()
.unwrap_or(0);
let context = Arc::new(BulkIterContext::new(
metadata.clone(),
@@ -1051,6 +1062,7 @@ impl MemtableCompactor {
arrow_schema.clone(),
min_timestamp,
max_timestamp,
max_sequence,
&mut metrics,
)?;
@@ -1278,7 +1290,8 @@ mod tests {
.unwrap();
assert_eq!(3, ranges.ranges.len());
assert_eq!(5, ranges.stats.num_rows);
let total_rows: usize = ranges.ranges.values().map(|r| r.stats().num_rows()).sum();
assert_eq!(5, total_rows);
for (_range_id, range) in ranges.ranges.iter() {
assert!(range.num_rows() > 0);
@@ -1446,8 +1459,9 @@ mod tests {
.unwrap();
assert_eq!(3, ranges.ranges.len());
assert_eq!(5, ranges.stats.num_rows);
assert_eq!(3, ranges.stats.num_ranges);
let total_rows: usize = ranges.ranges.values().map(|r| r.stats().num_rows()).sum();
assert_eq!(5, total_rows);
assert_eq!(3, ranges.ranges.len());
for (range_id, range) in ranges.ranges.iter() {
assert!(*range_id < 3);
@@ -1524,7 +1538,8 @@ mod tests {
// Should have ranges for both bulk parts and encoded parts
assert_eq!(3, ranges.ranges.len());
assert_eq!(10, ranges.stats.num_rows);
let total_rows: usize = ranges.ranges.values().map(|r| r.stats().num_rows()).sum();
assert_eq!(10, total_rows);
for (_range_id, range) in ranges.ranges.iter() {
assert!(range.num_rows() > 0);
@@ -1606,7 +1621,8 @@ mod tests {
// Should have at least 1 range (the compacted part)
assert!(!ranges.ranges.is_empty());
assert_eq!(10, ranges.stats.num_rows);
let total_rows: usize = ranges.ranges.values().map(|r| r.stats().num_rows()).sum();
assert_eq!(10, total_rows);
// Read all data and verify
let mut total_rows_read = 0;
@@ -1693,7 +1709,8 @@ mod tests {
)
.unwrap();
assert_eq!(13, ranges.stats.num_rows);
let total_rows: usize = ranges.ranges.values().map(|r| r.stats().num_rows()).sum();
assert_eq!(13, total_rows);
let mut total_rows_read = 0;
for (_range_id, range) in ranges.ranges.iter() {
@@ -1750,7 +1767,8 @@ mod tests {
// Should have 1 range for the unordered_part
assert_eq!(1, ranges.ranges.len());
assert_eq!(3, ranges.stats.num_rows);
let total_rows: usize = ranges.ranges.values().map(|r| r.stats().num_rows()).sum();
assert_eq!(3, total_rows);
// Verify data is sorted correctly in the range
let range = ranges.ranges.get(&0).unwrap();

View File

@@ -66,7 +66,7 @@ use crate::error::{
use crate::memtable::bulk::context::BulkIterContextRef;
use crate::memtable::bulk::part_reader::EncodedBulkPartIter;
use crate::memtable::time_series::{ValueBuilder, Values};
use crate::memtable::{BoxedRecordBatchIterator, MemScanMetrics};
use crate::memtable::{BoxedRecordBatchIterator, MemScanMetrics, MemtableStats};
use crate::sst::index::IndexOutput;
use crate::sst::parquet::file_range::{PreFilterMode, row_group_contains_delete};
use crate::sst::parquet::flat_format::primary_key_column_index;
@@ -170,6 +170,22 @@ impl BulkPart {
}
}
/// Builds the [MemtableStats] describing this single [BulkPart].
///
/// The time range is obtained by converting the part's `min_timestamp` and
/// `max_timestamp` with the region's time index type; `num_ranges` is always 1.
pub fn to_memtable_stats(&self, region_metadata: &RegionMetadataRef) -> MemtableStats {
    let time_index_type = region_metadata.time_index_type();
    let time_range = (
        time_index_type.create_timestamp(self.min_timestamp),
        time_index_type.create_timestamp(self.max_timestamp),
    );

    MemtableStats {
        estimated_bytes: self.estimated_size(),
        time_range: Some(time_range),
        num_rows: self.num_rows(),
        num_ranges: 1,
        max_sequence: self.sequence,
        series_count: self.estimated_series_count(),
    }
}
/// Fills missing columns in the BulkPart batch with default values.
///
/// This function checks if the batch schema matches the region metadata schema,
@@ -965,6 +981,23 @@ impl EncodedBulkPart {
&self.data
}
/// Builds the [MemtableStats] describing this single [EncodedBulkPart].
///
/// All values except the byte size come from the part's metadata; the time
/// range is converted with the time index type of the region metadata stored
/// there, and `num_ranges` is always 1.
pub fn to_memtable_stats(&self) -> MemtableStats {
    let time_index_type = self.metadata.region_metadata.time_index_type();
    let time_range = (
        time_index_type.create_timestamp(self.metadata.min_timestamp),
        time_index_type.create_timestamp(self.metadata.max_timestamp),
    );

    MemtableStats {
        estimated_bytes: self.size_bytes(),
        time_range: Some(time_range),
        num_rows: self.metadata.num_rows,
        num_ranges: 1,
        max_sequence: self.metadata.max_sequence,
        series_count: self.metadata.num_series as usize,
    }
}
/// Converts this `EncodedBulkPart` to `SstInfo`.
///
/// # Arguments
@@ -1061,6 +1094,8 @@ pub struct BulkPartMeta {
pub region_metadata: RegionMetadataRef,
/// Number of series.
pub num_series: u64,
/// Maximum sequence number in part.
pub max_sequence: u64,
}
/// Metrics for encoding a part.
@@ -1122,6 +1157,7 @@ impl BulkPartEncoder {
arrow_schema: SchemaRef,
min_timestamp: i64,
max_timestamp: i64,
max_sequence: u64,
metrics: &mut BulkPartEncodeMetrics,
) -> Result<Option<EncodedBulkPart>> {
let mut buf = Vec::with_capacity(4096);
@@ -1173,6 +1209,7 @@ impl BulkPartEncoder {
parquet_metadata,
region_metadata: self.metadata.clone(),
num_series,
max_sequence,
},
}))
}
@@ -1206,6 +1243,7 @@ impl BulkPartEncoder {
parquet_metadata,
region_metadata: self.metadata.clone(),
num_series: part.estimated_series_count() as u64,
max_sequence: part.sequence,
},
}))
}

View File

@@ -203,10 +203,10 @@ impl Memtable for PartitionTreeMemtable {
});
let context = Arc::new(MemtableRangeContext::new(self.id, builder, predicate));
let stats = self.stats();
let range_stats = self.stats();
let range = MemtableRange::new(context, range_stats);
Ok(MemtableRanges {
ranges: [(0, MemtableRange::new(context, stats.num_rows))].into(),
stats,
ranges: [(0, range)].into(),
})
}

View File

@@ -243,6 +243,23 @@ impl Memtable for SimpleBulkMemtable {
let sequence = options.sequence;
let start_time = Instant::now();
let projection = Arc::new(self.build_projection(projection));
// Use the memtable's overall time range and max sequence for all ranges
let max_sequence = self.max_sequence.load(Ordering::Relaxed);
let time_range = {
let num_rows = self.num_rows.load(Ordering::Relaxed);
if num_rows > 0 {
let ts_type = self.region_metadata.time_index_type();
let max_timestamp =
ts_type.create_timestamp(self.max_timestamp.load(Ordering::Relaxed));
let min_timestamp =
ts_type.create_timestamp(self.min_timestamp.load(Ordering::Relaxed));
Some((min_timestamp, max_timestamp))
} else {
None
}
};
let values = self.series.read().unwrap().read_to_values();
let contexts = values
.into_par_iter()
@@ -267,13 +284,24 @@ impl Memtable for SimpleBulkMemtable {
.map(|result| {
result.map(|batch| {
let num_rows = batch.num_rows();
let estimated_bytes = batch.memory_size();
let range_stats = MemtableStats {
estimated_bytes,
time_range,
num_rows,
num_ranges: 1,
max_sequence,
series_count: 1,
};
let builder = BatchRangeBuilder {
batch,
merge_mode: self.merge_mode,
scan_cost: start_time.elapsed(),
};
(
num_rows,
range_stats,
Arc::new(MemtableRangeContext::new(
self.id,
Box::new(builder),
@@ -287,13 +315,10 @@ impl Memtable for SimpleBulkMemtable {
let ranges = contexts
.into_iter()
.enumerate()
.map(|(idx, (num_rows, context))| (idx, MemtableRange::new(context, num_rows)))
.map(|(idx, (range_stats, context))| (idx, MemtableRange::new(context, range_stats)))
.collect();
Ok(MemtableRanges {
ranges,
stats: self.stats(),
})
Ok(MemtableRanges { ranges })
}
fn is_empty(&self) -> bool {
@@ -319,14 +344,7 @@ impl Memtable for SimpleBulkMemtable {
series_count: 0,
};
}
let ts_type = self
.region_metadata
.time_index_column()
.column_schema
.data_type
.clone()
.as_timestamp()
.expect("Timestamp column must have timestamp type");
let ts_type = self.region_metadata.time_index_type();
let max_timestamp = ts_type.create_timestamp(self.max_timestamp.load(Ordering::Relaxed));
let min_timestamp = ts_type.create_timestamp(self.min_timestamp.load(Ordering::Relaxed));
MemtableStats {

View File

@@ -325,10 +325,10 @@ impl Memtable for TimeSeriesMemtable {
});
let context = Arc::new(MemtableRangeContext::new(self.id, builder, predicate));
let stats = self.stats();
let range_stats = self.stats();
let range = MemtableRange::new(context, range_stats);
Ok(MemtableRanges {
ranges: [(0, MemtableRange::new(context, stats.num_rows))].into(),
stats,
ranges: [(0, range)].into(),
})
}

View File

@@ -458,10 +458,7 @@ impl ScanRegion {
.with_pre_filter_mode(filter_mode),
)?;
mem_range_builders.extend(ranges_in_memtable.ranges.into_values().map(|v| {
// todo: we should add stats to MemtableRange
let mut stats = ranges_in_memtable.stats.clone();
stats.num_ranges = 1;
stats.num_rows = v.num_rows();
let stats = v.stats().clone();
MemRangeBuilder::new(v, stats)
}));
}

View File

@@ -45,7 +45,7 @@ pub use utils::*;
use crate::access_layer::AccessLayerRef;
use crate::error::{
FlushableRegionStateSnafu, InvalidPartitionExprSnafu, RegionNotFoundSnafu, RegionStateSnafu,
RegionTruncatedSnafu, Result, UpdateManifestSnafu,
RegionTruncatedSnafu, Result, UnexpectedSnafu, UpdateManifestSnafu,
};
use crate::manifest::action::{
RegionChange, RegionManifest, RegionMetaAction, RegionMetaActionList,
@@ -102,6 +102,16 @@ pub enum RegionRoleState {
Follower,
}
impl RegionRoleState {
    /// Extracts the inner [RegionLeaderState] when this is a leader state.
    ///
    /// Returns `None` for a follower.
    pub fn into_leader_state(self) -> Option<RegionLeaderState> {
        if let RegionRoleState::Leader(leader_state) = self {
            Some(leader_state)
        } else {
            None
        }
    }
}
/// Metadata and runtime status of a region.
///
/// Writing and reading a region follow a single-writer-multi-reader rule:
@@ -322,11 +332,8 @@ impl MitoRegion {
/// Sets the editing state.
/// You should call this method in the worker loop.
pub(crate) fn set_editing(&self) -> Result<()> {
self.compare_exchange_state(
RegionLeaderState::Writable,
RegionRoleState::Leader(RegionLeaderState::Editing),
)
pub(crate) fn set_editing(&self, expect: RegionLeaderState) -> Result<()> {
self.compare_exchange_state(expect, RegionRoleState::Leader(RegionLeaderState::Editing))
}
/// Sets the staging state.
@@ -359,6 +366,7 @@ impl MitoRegion {
/// You should call this method in the worker loop.
/// Transitions from Staging to Writable state.
pub fn exit_staging(&self) -> Result<()> {
*self.staging_partition_expr.lock().unwrap() = None;
self.compare_exchange_state(
RegionLeaderState::Staging,
RegionRoleState::Leader(RegionLeaderState::Writable),
@@ -370,7 +378,8 @@ impl MitoRegion {
&self,
state: SettableRegionRoleState,
) -> Result<()> {
let mut manager = self.manifest_ctx.manifest_manager.write().await;
let mut manager: RwLockWriteGuard<'_, RegionManifestManager> =
self.manifest_ctx.manifest_manager.write().await;
let current_state = self.state();
match state {
@@ -705,6 +714,20 @@ impl MitoRegion {
return Ok(());
}
};
let expect_change = merged_actions.actions.iter().any(|a| a.is_change());
let expect_edit = merged_actions.actions.iter().any(|a| a.is_edit());
ensure!(
expect_change,
UnexpectedSnafu {
reason: "expect a change action in merged actions"
}
);
ensure!(
expect_edit,
UnexpectedSnafu {
reason: "expect an edit action in merged actions"
}
);
// Submit merged actions using the manifest manager's update method
// Pass the `false` so it saves to normal directory, not staging
@@ -716,12 +739,17 @@ impl MitoRegion {
);
// Apply the merged changes to in-memory version control
let merged_edit = merged_actions.into_region_edit();
let (merged_change, merged_edit) = merged_actions.split_region_change_and_edit();
// Safety: we have already ensured that there is a change action in the merged actions.
let new_metadata = merged_change.as_ref().unwrap().metadata.clone();
self.version_control.alter_schema(new_metadata);
self.version_control
.apply_edit(Some(merged_edit), &[], self.file_purger.clone());
// Clear all staging manifests and transit state
manager.store().clear_staging_manifests().await?;
if let Err(e) = manager.clear_staging_manifest_and_dir().await {
error!(e; "Failed to clear staging manifest dir for region {}", self.region_id);
}
self.exit_staging()?;
Ok(())

View File

@@ -29,6 +29,7 @@ use serde_json::Value;
use serde_with::{DisplayFromStr, NoneAsEmptyString, serde_as, with_prefix};
use snafu::{ResultExt, ensure};
use store_api::codec::PrimaryKeyEncoding;
use store_api::mito_engine_options::COMPACTION_OVERRIDE;
use store_api::storage::ColumnId;
use strum::EnumString;
@@ -62,6 +63,7 @@ pub struct RegionOptions {
pub ttl: Option<TimeToLive>,
/// Compaction options.
pub compaction: CompactionOptions,
pub compaction_override: bool,
/// Custom storage. Uses default storage if it is `None`.
pub storage: Option<String>,
/// If append mode is enabled, the region keeps duplicate rows.
@@ -125,7 +127,8 @@ impl TryFrom<&HashMap<String, String>> for RegionOptions {
// See https://github.com/serde-rs/serde/issues/1626
let options: RegionOptionsWithoutEnum =
serde_json::from_str(&json).context(JsonOptionsSnafu)?;
let compaction = if validate_enum_options(options_map, "compaction.type")? {
let has_compaction_type = validate_enum_options(options_map, "compaction.type")?;
let compaction = if has_compaction_type {
serde_json::from_str(&json).context(JsonOptionsSnafu)?
} else {
CompactionOptions::default()
@@ -146,9 +149,16 @@ impl TryFrom<&HashMap<String, String>> for RegionOptions {
None
};
let compaction_override_flag = options_map
.get(COMPACTION_OVERRIDE)
.map(|v| matches!(v.to_lowercase().as_str(), "true" | "1"))
.unwrap_or(false);
let compaction_override = has_compaction_type || compaction_override_flag;
let opts = RegionOptions {
ttl: options.ttl,
compaction,
compaction_override,
storage: options.storage,
append_mode: options.append_mode,
wal_options,
@@ -517,6 +527,7 @@ mod tests {
time_window: Some(Duration::from_secs(3600 * 2)),
..Default::default()
}),
compaction_override: true,
..Default::default()
};
assert_eq!(expect, options);
@@ -644,6 +655,7 @@ mod tests {
remote_compaction: false,
fallback_to_local: true,
}),
compaction_override: true,
storage: Some("S3".to_string()),
append_mode: false,
wal_options,
@@ -676,6 +688,7 @@ mod tests {
remote_compaction: false,
fallback_to_local: true,
}),
compaction_override: false,
storage: Some("S3".to_string()),
append_mode: false,
wal_options: WalOptions::Kafka(KafkaWalOptions {
@@ -740,6 +753,7 @@ mod tests {
remote_compaction: false,
fallback_to_local: true,
}),
compaction_override: false,
storage: Some("S3".to_string()),
append_mode: false,
wal_options: WalOptions::Kafka(KafkaWalOptions {

View File

@@ -37,10 +37,10 @@ use store_api::region_engine::{
MitoCopyRegionFromResponse, SetRegionRoleStateResponse, SettableRegionRoleState,
};
use store_api::region_request::{
AffectedRows, EnterStagingRequest, RegionAlterRequest, RegionBuildIndexRequest,
RegionBulkInsertsRequest, RegionCatchupRequest, RegionCloseRequest, RegionCompactRequest,
RegionCreateRequest, RegionFlushRequest, RegionOpenRequest, RegionRequest,
RegionTruncateRequest,
AffectedRows, ApplyStagingManifestRequest, EnterStagingRequest, RegionAlterRequest,
RegionBuildIndexRequest, RegionBulkInsertsRequest, RegionCatchupRequest, RegionCloseRequest,
RegionCompactRequest, RegionCreateRequest, RegionFlushRequest, RegionOpenRequest,
RegionRequest, RegionTruncateRequest,
};
use store_api::storage::{FileId, RegionId};
use tokio::sync::oneshot::{self, Receiver, Sender};
@@ -741,6 +741,11 @@ impl WorkerRequest {
sender: sender.into(),
request: region_bulk_inserts_request,
},
RegionRequest::ApplyStagingManifest(v) => WorkerRequest::Ddl(SenderDdlRequest {
region_id,
sender: sender.into(),
request: DdlRequest::ApplyStagingManifest(v),
}),
};
Ok((worker_request, receiver))
@@ -819,13 +824,13 @@ impl WorkerRequest {
Ok((WorkerRequest::RemapManifests(request), receiver))
}
/// Converts [CopyRegionFromRequest] from a [CopyRegionFromRequest](store_api::region_engine::CopyRegionFromRequest).
/// Converts [CopyRegionFromRequest] from a [MitoCopyRegionFromRequest](store_api::region_engine::MitoCopyRegionFromRequest).
pub(crate) fn try_from_copy_region_from_request(
region_id: RegionId,
store_api::region_engine::CopyRegionFromRequest {
store_api::region_engine::MitoCopyRegionFromRequest {
source_region_id,
parallelism,
}: store_api::region_engine::CopyRegionFromRequest,
}: store_api::region_engine::MitoCopyRegionFromRequest,
) -> Result<(WorkerRequest, Receiver<Result<MitoCopyRegionFromResponse>>)> {
let (sender, receiver) = oneshot::channel();
let request = CopyRegionFromRequest {
@@ -852,6 +857,7 @@ pub(crate) enum DdlRequest {
Truncate(RegionTruncateRequest),
Catchup((RegionCatchupRequest, Option<WalEntryReceiver>)),
EnterStaging(EnterStagingRequest),
ApplyStagingManifest(ApplyStagingManifestRequest),
}
/// Sender and Ddl request.
@@ -1080,6 +1086,8 @@ pub(crate) struct RegionEditResult {
pub(crate) result: Result<()>,
/// Whether region state need to be set to Writable after handling this request.
pub(crate) update_region_state: bool,
/// The region is in staging mode before handling this request.
pub(crate) is_staging: bool,
}
#[derive(Debug)]

View File

@@ -15,6 +15,7 @@
//! Structs and utilities for writing regions.
mod handle_alter;
mod handle_apply_staging;
mod handle_bulk_insert;
mod handle_catchup;
mod handle_close;
@@ -1005,7 +1006,7 @@ impl<S: LogStore> RegionWorkerLoop<S> {
.await;
}
WorkerRequest::EditRegion(request) => {
self.handle_region_edit(request).await;
self.handle_region_edit(request);
}
WorkerRequest::Stop => {
debug_assert!(!self.running.load(Ordering::Relaxed));
@@ -1107,6 +1108,11 @@ impl<S: LogStore> RegionWorkerLoop<S> {
.await;
continue;
}
DdlRequest::ApplyStagingManifest(req) => {
self.handle_apply_staging_manifest_request(ddl.region_id, req, ddl.sender)
.await;
continue;
}
};
ddl.sender.send(res);

View File

@@ -0,0 +1,140 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use chrono::Utc;
use common_telemetry::{debug, info};
use snafu::ResultExt;
use store_api::logstore::LogStore;
use store_api::region_request::ApplyStagingManifestRequest;
use store_api::storage::RegionId;
use tokio::sync::oneshot;
use crate::error::{
RegionStateSnafu, SerdeJsonSnafu, StagingPartitionExprMismatchSnafu, UnexpectedSnafu,
};
use crate::manifest::action::RegionEdit;
use crate::region::{RegionLeaderState, RegionRoleState};
use crate::request::{OptionOutputTx, RegionEditRequest};
use crate::sst::file::FileMeta;
use crate::worker::RegionWorkerLoop;
impl<S: LogStore> RegionWorkerLoop<S> {
/// Handles a request to apply the staging manifest of `region_id`.
///
/// The outcome is always reported through `sender`:
/// - an error if the region cannot be obtained as a writable region;
/// - `Ok(0)` if the region is not staging but its manifest partition expr
///   already equals the requested one;
/// - a region state error (expecting `Staging`) if the region is not staging
///   and the partition expr differs;
/// - a partition expr mismatch error if the staging partition expr differs
///   from the requested one;
/// - a JSON error if `request.files_to_add` cannot be decoded as `Vec<FileMeta>`.
///
/// Otherwise a region edit adding the decoded files is submitted and a
/// background task forwards its result, exiting staging mode on success.
pub(crate) async fn handle_apply_staging_manifest_request(
&mut self,
region_id: RegionId,
request: ApplyStagingManifestRequest,
sender: OptionOutputTx,
) {
let region = match self.regions.writable_region(region_id) {
Ok(region) => region,
Err(e) => {
sender.send(Err(e));
return;
}
};
if !region.is_staging() {
// Not staging: the request is either a repeat of an already applied
// change (same partition expr) or invalid for the current state.
let manifest_partition_expr = region.metadata().partition_expr.as_ref().cloned();
let is_match = manifest_partition_expr.as_ref() == Some(&request.partition_expr);
debug!(
"region {} manifest partition expr: {:?}, request partition expr: {:?}",
region_id, manifest_partition_expr, request.partition_expr
);
if is_match {
// If current partition expr is already the same as the request,
// treats the region already applied the staging manifest.
info!(
"Region {} already applied the staging manifest, partition expr: {}, ignore the apply staging manifest request",
region_id, request.partition_expr
);
sender.send(Ok(0));
return;
}
sender.send(
RegionStateSnafu {
region_id,
state: region.state(),
expect: RegionRoleState::Leader(RegionLeaderState::Staging),
}
.fail(),
);
return;
}
let staging_partition_expr = region.staging_partition_expr.lock().unwrap().clone();
// If the partition expr mismatch, return error.
if staging_partition_expr.as_ref() != Some(&request.partition_expr) {
sender.send(
StagingPartitionExprMismatchSnafu {
manifest_expr: staging_partition_expr,
request_expr: request.partition_expr,
}
.fail(),
);
return;
}
// `tx` travels with the region edit request; `rx` is awaited by the task
// spawned below to learn the edit's result.
let (tx, rx) = oneshot::channel();
// `files_to_add` arrives as JSON bytes describing a list of `FileMeta`.
let files_to_add = match serde_json::from_slice::<Vec<FileMeta>>(&request.files_to_add)
.context(SerdeJsonSnafu)
{
Ok(files_to_add) => files_to_add,
Err(e) => {
sender.send(Err(e));
return;
}
};
info!("Applying staging manifest request to region {}", region_id);
// The edit only adds files; it is stamped with the current UTC time in
// milliseconds and leaves every other field unset.
self.handle_region_edit(RegionEditRequest {
region_id,
edit: RegionEdit {
files_to_add,
files_to_remove: vec![],
timestamp_ms: Some(Utc::now().timestamp_millis()),
compaction_time_window: None,
flushed_entry_id: None,
flushed_sequence: None,
committed_sequence: None,
},
tx,
});
common_runtime::spawn_global(async move {
// Await the result from the region edit and forward the outcome to the original sender.
// If the operation completes successfully, respond with Ok(0); otherwise, respond with an appropriate error.
if let Ok(result) = rx.await {
let Ok(()) = result else {
sender.send(result.map(|_| 0));
return;
};
// The edit succeeded: take the manifest manager write lock and leave staging mode.
let mut manager = region.manifest_ctx.manifest_manager.write().await;
match region.exit_staging_on_success(&mut manager).await {
Ok(()) => {
sender.send(Ok(0));
}
Err(e) => sender.send(Err(e)),
}
} else {
// `rx.await` failed, i.e. `tx` was dropped without sending a result.
sender.send(
UnexpectedSnafu {
reason: "edit region receiver channel closed",
}
.fail(),
);
}
});
}
}

View File

@@ -214,7 +214,7 @@ impl<S: LogStore> RegionWorkerLoop<S> {
impl<S> RegionWorkerLoop<S> {
/// Handles region edit request.
pub(crate) async fn handle_region_edit(&mut self, request: RegionEditRequest) {
pub(crate) fn handle_region_edit(&mut self, request: RegionEditRequest) {
let region_id = request.region_id;
let Some(region) = self.regions.get_region(region_id) else {
let _ = request.tx.send(RegionNotFoundSnafu { region_id }.fail());
@@ -246,8 +246,15 @@ impl<S> RegionWorkerLoop<S> {
file.sequence = NonZeroU64::new(file_sequence);
}
// Allow retrieving `is_staging` before spawn the edit region task.
let is_staging = region.is_staging();
let expect_state = if is_staging {
RegionLeaderState::Staging
} else {
RegionLeaderState::Writable
};
// Marks the region as editing.
if let Err(e) = region.set_editing() {
if let Err(e) = region.set_editing(expect_state) {
let _ = sender.send(Err(e));
return;
}
@@ -258,7 +265,8 @@ impl<S> RegionWorkerLoop<S> {
// Now the region is in editing state.
// Updates manifest in background.
common_runtime::spawn_global(async move {
let result = edit_region(&region, edit.clone(), cache_manager, listener).await;
let result =
edit_region(&region, edit.clone(), cache_manager, listener, is_staging).await;
let notify = WorkerRequest::Background {
region_id,
notify: BackgroundNotify::RegionEdit(RegionEditResult {
@@ -268,6 +276,7 @@ impl<S> RegionWorkerLoop<S> {
result,
// we always need to restore region state after region edit
update_region_state: true,
is_staging,
}),
};
@@ -299,29 +308,39 @@ impl<S> RegionWorkerLoop<S> {
}
};
let need_compaction =
edit_result.result.is_ok() && !edit_result.edit.files_to_add.is_empty();
let need_compaction = if edit_result.is_staging {
if edit_result.update_region_state {
// For staging regions, edits are not applied immediately,
// as they remain invisible until the region exits the staging state.
region.switch_state_to_staging(RegionLeaderState::Editing);
}
if edit_result.result.is_ok() {
// Applies the edit to the region.
region.version_control.apply_edit(
Some(edit_result.edit),
&[],
region.file_purger.clone(),
);
}
false
} else {
let need_compaction =
edit_result.result.is_ok() && !edit_result.edit.files_to_add.is_empty();
// Only apply the edit if the result is ok and region is not in staging state.
if edit_result.result.is_ok() {
// Applies the edit to the region.
region.version_control.apply_edit(
Some(edit_result.edit),
&[],
region.file_purger.clone(),
);
}
if edit_result.update_region_state {
region.switch_state_to_writable(RegionLeaderState::Editing);
}
if edit_result.update_region_state {
// Sets the region as writable.
region.switch_state_to_writable(RegionLeaderState::Editing);
}
need_compaction
};
let _ = edit_result.sender.send(edit_result.result);
if let Some(edit_queue) = self.region_edit_queues.get_mut(&edit_result.region_id)
&& let Some(request) = edit_queue.dequeue()
{
self.handle_region_edit(request).await;
self.handle_region_edit(request);
}
if need_compaction {
@@ -463,9 +482,9 @@ async fn edit_region(
edit: RegionEdit,
cache_manager: CacheManagerRef,
listener: WorkerListener,
is_staging: bool,
) -> Result<()> {
let region_id = region.region_id;
let is_staging = region.is_staging();
if let Some(write_cache) = cache_manager.write_cache() {
for file_meta in &edit.files_to_add {
let write_cache = write_cache.clone();
@@ -530,7 +549,10 @@ async fn edit_region(
}
}
info!("Applying {edit:?} to region {}", region_id);
info!(
"Applying {edit:?} to region {}, is_staging: {}",
region_id, is_staging
);
let action_list = RegionMetaActionList::with_action(RegionMetaAction::Edit(edit));
region

View File

@@ -68,6 +68,7 @@ use sql::statements::{
sql_column_def_to_grpc_column_def, sql_data_type_to_concrete_data_type, value_to_sql_value,
};
use sql::util::extract_tables_from_query;
use store_api::mito_engine_options::{COMPACTION_OVERRIDE, COMPACTION_TYPE};
use table::requests::{FILE_TABLE_META_KEY, TableOptions};
use table::table_reference::TableReference;
#[cfg(feature = "enterprise")]
@@ -216,6 +217,11 @@ pub fn create_to_expr(
.context(UnrecognizedTableOptionSnafu)?,
);
let mut table_options = table_options;
if table_options.contains_key(COMPACTION_TYPE) {
table_options.insert(COMPACTION_OVERRIDE.to_string(), "true".to_string());
}
let primary_keys = find_primary_keys(&create.columns, &create.constraints)?;
let expr = CreateTableExpr {

View File

@@ -117,10 +117,13 @@ impl StatementExecutor {
.map(|v| v.into_inner());
let create_expr = &mut expr_helper::create_to_expr(&stmt, &ctx)?;
// We don't put ttl into the table options
// Because it will be used directly while compaction.
// Don't inherit schema-level TTL/compaction options into table options:
// TTL is applied during compaction, and `compaction.*` is handled separately.
if let Some(schema_options) = schema_options {
for (key, value) in schema_options.extra_options.iter() {
if key.starts_with("compaction.") {
continue;
}
create_expr
.table_options
.entry(key.clone())

View File

@@ -303,4 +303,43 @@ mod tests {
let subtasks = create_subtasks(&from, &to).unwrap();
assert!(subtasks.is_empty());
}
#[test]
fn test_three_components() {
    // Builds the half-open range predicate `lo <= x < hi`.
    let range = |lo: i64, hi: i64| {
        col("x")
            .gt_eq(Value::Int64(lo))
            .and(col("x").lt(Value::Int64(hi)))
    };

    // Left: A:[0,10), B:[20,30), C:[40,50)
    let from = vec![range(0, 10), range(20, 30), range(40, 50)];
    // Right: A:[0,10), B:[20,30), C:[40,60)
    let to = vec![range(0, 10), range(20, 30), range(40, 60)];

    let subtasks = create_subtasks(&from, &to).unwrap();
    assert_eq!(subtasks.len(), 3);

    // The i-th subtask is expected to reference only the i-th expression on both sides.
    for (subtask, idx) in subtasks.iter().zip([0, 1, 2]) {
        assert_eq!(subtask.from_expr_indices, vec![idx]);
        assert_eq!(subtask.to_expr_indices, vec![idx]);
        assert_eq!(subtask.transition_map, vec![vec![idx]]);
    }
}
}

View File

@@ -41,8 +41,6 @@ use snafu::{Location, ResultExt};
use crate::error::{CatalogSnafu, Result};
use crate::query_engine::{DefaultPlanDecoder, QueryEngineState};
mod function_alias;
pub struct DfContextProviderAdapter {
engine_state: Arc<QueryEngineState>,
session_state: SessionState,
@@ -149,17 +147,7 @@ impl ContextProvider for DfContextProviderAdapter {
fn get_function_meta(&self, name: &str) -> Option<Arc<ScalarUDF>> {
self.engine_state.scalar_function(name).map_or_else(
|| {
self.session_state
.scalar_functions()
.get(name)
.cloned()
.or_else(|| {
function_alias::resolve_scalar(name).and_then(|name| {
self.session_state.scalar_functions().get(name).cloned()
})
})
},
|| self.session_state.scalar_functions().get(name).cloned(),
|func| {
Some(Arc::new(func.provide(FunctionContext {
query_ctx: self.query_ctx.clone(),
@@ -171,17 +159,7 @@ impl ContextProvider for DfContextProviderAdapter {
fn get_aggregate_meta(&self, name: &str) -> Option<Arc<AggregateUDF>> {
self.engine_state.aggr_function(name).map_or_else(
|| {
self.session_state
.aggregate_functions()
.get(name)
.cloned()
.or_else(|| {
function_alias::resolve_aggregate(name).and_then(|name| {
self.session_state.aggregate_functions().get(name).cloned()
})
})
},
|| self.session_state.aggregate_functions().get(name).cloned(),
|func| Some(Arc::new(func)),
)
}
@@ -215,14 +193,12 @@ impl ContextProvider for DfContextProviderAdapter {
fn udf_names(&self) -> Vec<String> {
let mut names = self.engine_state.scalar_names();
names.extend(self.session_state.scalar_functions().keys().cloned());
names.extend(function_alias::scalar_alias_names().map(|name| name.to_string()));
names
}
fn udaf_names(&self) -> Vec<String> {
let mut names = self.engine_state.aggr_names();
names.extend(self.session_state.aggregate_functions().keys().cloned());
names.extend(function_alias::aggregate_alias_names().map(|name| name.to_string()));
names
}
@@ -257,14 +233,9 @@ impl ContextProvider for DfContextProviderAdapter {
.table_functions()
.get(name)
.cloned()
.or_else(|| {
function_alias::resolve_scalar(name)
.and_then(|alias| self.session_state.table_functions().get(alias).cloned())
});
let tbl_func = tbl_func.ok_or_else(|| {
DataFusionError::Plan(format!("table function '{name}' not found"))
})?;
.ok_or_else(|| {
DataFusionError::Plan(format!("table function '{name}' not found"))
})?;
let provider = tbl_func.create_table_provider(&args)?;
Ok(provider_as_source(provider))

View File

@@ -1,86 +0,0 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use std::collections::HashMap;
use once_cell::sync::Lazy;
/// Scalar function aliases as `(alias, target)` pairs: the first element is the
/// alternative name, the second is the function name it resolves to.
///
/// Alias names must be written in lowercase ASCII, because `resolve_scalar`
/// lowercases the queried name before looking it up.
const SCALAR_ALIASES: &[(&str, &str)] = &[
// SQL compat aliases.
("ucase", "upper"),
("lcase", "lower"),
("ceiling", "ceil"),
("mid", "substr"),
// MySQL's RAND([seed]) accepts an optional seed argument, while DataFusion's `random()`
// does not. We alias the name for `rand()` compatibility, and `rand(seed)` will error
// due to mismatched arity.
("rand", "random"),
];
/// Aggregate function aliases as `(alias, target)` pairs, with the same layout
/// and lowercase requirement as `SCALAR_ALIASES`.
const AGGREGATE_ALIASES: &[(&str, &str)] = &[
// MySQL compat aliases that don't override existing DataFusion aggregate names.
//
// NOTE: We intentionally do NOT alias `stddev` here, because DataFusion defines `stddev`
// as sample standard deviation while MySQL's `STDDEV` is population standard deviation.
("std", "stddev_pop"),
("variance", "var_pop"),
];
/// Alias-to-target lookup map for scalar functions, lazily collected from `SCALAR_ALIASES`.
static SCALAR_FUNCTION_ALIAS: Lazy<HashMap<&'static str, &'static str>> =
Lazy::new(|| SCALAR_ALIASES.iter().copied().collect());
/// Alias-to-target lookup map for aggregate functions, lazily collected from `AGGREGATE_ALIASES`.
static AGGREGATE_FUNCTION_ALIAS: Lazy<HashMap<&'static str, &'static str>> =
Lazy::new(|| AGGREGATE_ALIASES.iter().copied().collect());
/// Resolves a scalar function alias to the name of the function it stands for.
///
/// The lookup ignores ASCII case. Returns `None` when `name` is not a known
/// scalar alias.
pub fn resolve_scalar(name: &str) -> Option<&'static str> {
    let key = name.to_ascii_lowercase();
    let target = SCALAR_FUNCTION_ALIAS.get(key.as_str())?;
    Some(*target)
}
/// Resolves an aggregate function alias to the name of the function it stands for.
///
/// The lookup ignores ASCII case. Returns `None` when `name` is not a known
/// aggregate alias.
pub fn resolve_aggregate(name: &str) -> Option<&'static str> {
    let key = name.to_ascii_lowercase();
    let target = AGGREGATE_FUNCTION_ALIAS.get(key.as_str())?;
    Some(*target)
}
/// Iterates over every scalar alias name, in the order of the `SCALAR_ALIASES` table.
pub fn scalar_alias_names() -> impl Iterator<Item = &'static str> {
    SCALAR_ALIASES.iter().map(|&(alias, _)| alias)
}
/// Iterates over every aggregate alias name, in the order of the `AGGREGATE_ALIASES` table.
pub fn aggregate_alias_names() -> impl Iterator<Item = &'static str> {
    AGGREGATE_ALIASES.iter().map(|&(alias, _)| alias)
}
#[cfg(test)]
mod tests {
    use super::{resolve_aggregate, resolve_scalar};

    /// Scalar aliases resolve regardless of ASCII case; unknown names yield `None`.
    #[test]
    fn resolves_scalar_aliases_case_insensitive() {
        let cases = [
            ("ucase", Some("upper")),
            ("UCASE", Some("upper")),
            ("lcase", Some("lower")),
            ("ceiling", Some("ceil")),
            ("MID", Some("substr")),
            ("RAND", Some("random")),
            ("not_a_real_alias", None),
        ];
        for (input, expected) in cases {
            assert_eq!(resolve_scalar(input), expected);
        }
    }

    /// Aggregate aliases resolve as expected; `STDDEV` is deliberately not an alias.
    #[test]
    fn resolves_aggregate_aliases_case_insensitive() {
        let cases = [
            ("std", Some("stddev_pop")),
            ("variance", Some("var_pop")),
            ("STDDEV", None),
            ("not_a_real_alias", None),
        ];
        for (input, expected) in cases {
            assert_eq!(resolve_aggregate(input), expected);
        }
    }
}

View File

@@ -28,9 +28,9 @@ use store_api::metadata::{
ColumnMetadata, RegionMetadata, RegionMetadataBuilder, RegionMetadataRef,
};
use store_api::region_engine::{
CopyRegionFromRequest, CopyRegionFromResponse, RegionEngine, RegionManifestInfo, RegionRole,
RegionScannerRef, RegionStatistic, RemapManifestsRequest, RemapManifestsResponse,
SetRegionRoleStateResponse, SettableRegionRoleState, SyncManifestResponse,
RegionEngine, RegionRole, RegionScannerRef, RegionStatistic, RemapManifestsRequest,
RemapManifestsResponse, SetRegionRoleStateResponse, SettableRegionRoleState,
SyncRegionFromRequest, SyncRegionFromResponse,
};
use store_api::region_request::RegionRequest;
use store_api::storage::{ConcreteDataType, RegionId, ScanRequest, SequenceNumber};
@@ -113,8 +113,8 @@ impl RegionEngine for MetaRegionEngine {
async fn sync_region(
&self,
_region_id: RegionId,
_manifest_info: RegionManifestInfo,
) -> Result<SyncManifestResponse, BoxedError> {
_request: SyncRegionFromRequest,
) -> Result<SyncRegionFromResponse, BoxedError> {
unimplemented!()
}
@@ -125,14 +125,6 @@ impl RegionEngine for MetaRegionEngine {
unimplemented!()
}
async fn copy_region_from(
&self,
_region_id: RegionId,
_request: CopyRegionFromRequest,
) -> Result<CopyRegionFromResponse, BoxedError> {
unimplemented!()
}
fn role(&self, _region_id: RegionId) -> Option<RegionRole> {
None
}

View File

@@ -209,6 +209,7 @@ impl QueryEngineState {
.build();
let df_context = SessionContext::new_with_state(session_state);
register_function_aliases(&df_context);
Self {
df_context,
@@ -415,6 +416,41 @@ impl QueryPlanner for DfQueryPlanner {
}
}
/// MySQL-compatible scalar function aliases.
///
/// Each entry is `(target_name, alias)`: `alias` is registered as an additional
/// name for the already-registered scalar function `target_name`. Entries whose
/// target is not registered are skipped by `register_function_aliases`.
const SCALAR_FUNCTION_ALIASES: &[(&str, &str)] = &[
("upper", "ucase"),
("lower", "lcase"),
("ceil", "ceiling"),
("substr", "mid"),
("random", "rand"),
];
/// MySQL-compatible aggregate function aliases.
///
/// Each entry is `(target_name, alias)`, with the same meaning as in
/// `SCALAR_FUNCTION_ALIASES` but for aggregate functions.
const AGGREGATE_FUNCTION_ALIASES: &[(&str, &str)] =
&[("stddev_pop", "std"), ("var_pop", "variance")];
/// Registers MySQL-compatible function aliases on the given session context.
///
/// For every `(target, alias)` pair in `SCALAR_FUNCTION_ALIASES` and
/// `AGGREGATE_FUNCTION_ALIASES` (e.g. `ucase` -> `upper`, `lcase` -> `lower`),
/// the target function is looked up in a snapshot of the context state, cloned
/// with the alias attached, and registered again. Targets that are not present
/// in the snapshot are skipped.
fn register_function_aliases(ctx: &SessionContext) {
    let state = ctx.state();

    let scalar_functions = state.scalar_functions();
    for &(target, alias) in SCALAR_FUNCTION_ALIASES {
        let Some(udf) = scalar_functions.get(target) else {
            continue;
        };
        ctx.register_udf(udf.as_ref().clone().with_aliases([alias]));
    }

    let aggregate_functions = state.aggregate_functions();
    for &(target, alias) in AGGREGATE_FUNCTION_ALIASES {
        let Some(udaf) = aggregate_functions.get(target) else {
            continue;
        };
        ctx.register_udaf(udaf.as_ref().clone().with_aliases([alias]));
    }
}
impl DfQueryPlanner {
fn new(
catalog_manager: CatalogManagerRef,

View File

@@ -56,11 +56,13 @@ fn create_sql_options(table_meta: &TableMeta, schema_options: Option<SchemaOptio
if let Some(ttl) = table_opts.ttl.map(|t| t.to_string()) {
options.insert(TTL_KEY.to_string(), ttl);
} else if let Some(database_ttl) = schema_options
.as_ref()
.and_then(|o| o.ttl)
.map(|ttl| ttl.to_string())
{
options.insert(TTL_KEY.to_string(), database_ttl);
};
for (k, v) in table_opts
.extra_options
.iter()

View File

@@ -1 +1 @@
v0.11.9
v0.11.11

View File

@@ -27,6 +27,8 @@ pub const TTL_KEY: &str = "ttl";
pub const SNAPSHOT_READ: &str = "snapshot_read";
/// Option key for compaction type.
pub const COMPACTION_TYPE: &str = "compaction.type";
/// Option key for forcing compaction options override.
pub const COMPACTION_OVERRIDE: &str = "compaction.override";
/// TWCS compaction strategy.
pub const COMPACTION_TYPE_TWCS: &str = "twcs";
/// Option key for twcs min file num to trigger a compaction.
@@ -61,6 +63,7 @@ pub fn is_mito_engine_option_key(key: &str) -> bool {
[
"ttl",
COMPACTION_TYPE,
COMPACTION_OVERRIDE,
TWCS_TRIGGER_FILE_NUM,
TWCS_MAX_OUTPUT_FILE_SIZE,
TWCS_TIME_WINDOW,
@@ -90,6 +93,7 @@ mod tests {
fn test_is_mito_engine_option_key() {
assert!(is_mito_engine_option_key("ttl"));
assert!(is_mito_engine_option_key("compaction.type"));
assert!(is_mito_engine_option_key("compaction.override"));
assert!(is_mito_engine_option_key(
"compaction.twcs.trigger_file_num"
));

View File

@@ -637,9 +637,64 @@ impl RegionStatistic {
}
}
/// The response of syncing the manifest.
/// Request to sync a region, either from manifest information or from another region.
///
/// Values can be built with `SyncRegionFromRequest::from_manifest`,
/// `SyncRegionFromRequest::from_region`, or by converting a [`RegionManifestInfo`]
/// with `From`/`Into` (which yields the `FromManifest` variant).
#[derive(Debug, Clone)]
pub enum SyncRegionFromRequest {
/// Syncs the region using manifest information.
/// Used in leader-follower manifest sync scenarios.
FromManifest(RegionManifestInfo),
/// Syncs the region from another region.
///
/// Used by the metric engine to sync logical regions from a source physical region
/// to a target physical region. This copies metadata region SST files and transforms
/// logical region entries to use the target's region number.
FromRegion {
/// The [`RegionId`] of the source region.
source_region_id: RegionId,
/// The parallelism of the sync operation.
parallelism: usize,
},
}
impl From<RegionManifestInfo> for SyncRegionFromRequest {
fn from(manifest_info: RegionManifestInfo) -> Self {
SyncRegionFromRequest::FromManifest(manifest_info)
}
}
impl SyncRegionFromRequest {
/// Creates a new request from a manifest info.
pub fn from_manifest(manifest_info: RegionManifestInfo) -> Self {
SyncRegionFromRequest::FromManifest(manifest_info)
}
/// Creates a new request from a region.
pub fn from_region(source_region_id: RegionId, parallelism: usize) -> Self {
SyncRegionFromRequest::FromRegion {
source_region_id,
parallelism,
}
}
/// Returns true if the request is from a manifest.
pub fn is_from_manifest(&self) -> bool {
matches!(self, SyncRegionFromRequest::FromManifest { .. })
}
/// Converts the request to a region manifest info.
///
/// Returns None if the request is not from a manifest.
pub fn into_region_manifest_info(self) -> Option<RegionManifestInfo> {
match self {
SyncRegionFromRequest::FromManifest(manifest_info) => Some(manifest_info),
SyncRegionFromRequest::FromRegion { .. } => None,
}
}
}
/// The response of syncing the region.
#[derive(Debug)]
pub enum SyncManifestResponse {
pub enum SyncRegionFromResponse {
NotSupported,
Mito {
/// Indicates if the data region was synced.
@@ -656,35 +711,30 @@ pub enum SyncManifestResponse {
},
}
impl SyncManifestResponse {
impl SyncRegionFromResponse {
/// Returns true if data region is synced.
pub fn is_data_synced(&self) -> bool {
match self {
SyncManifestResponse::NotSupported => false,
SyncManifestResponse::Mito { synced } => *synced,
SyncManifestResponse::Metric { data_synced, .. } => *data_synced,
SyncRegionFromResponse::NotSupported => false,
SyncRegionFromResponse::Mito { synced } => *synced,
SyncRegionFromResponse::Metric { data_synced, .. } => *data_synced,
}
}
/// Returns true if the engine is supported the sync operation.
pub fn is_supported(&self) -> bool {
matches!(self, SyncManifestResponse::NotSupported)
}
/// Returns true if the engine is a mito2 engine.
pub fn is_mito(&self) -> bool {
matches!(self, SyncManifestResponse::Mito { .. })
matches!(self, SyncRegionFromResponse::Mito { .. })
}
/// Returns true if the engine is a metric engine.
pub fn is_metric(&self) -> bool {
matches!(self, SyncManifestResponse::Metric { .. })
matches!(self, SyncRegionFromResponse::Metric { .. })
}
/// Returns the new opened logical region ids.
pub fn new_opened_logical_region_ids(self) -> Option<Vec<RegionId>> {
match self {
SyncManifestResponse::Metric {
SyncRegionFromResponse::Metric {
new_opened_logical_region_ids,
..
} => Some(new_opened_logical_region_ids),
@@ -715,7 +765,7 @@ pub struct RemapManifestsResponse {
/// Request to copy files from a source region to a target region.
#[derive(Debug, Clone)]
pub struct CopyRegionFromRequest {
pub struct MitoCopyRegionFromRequest {
/// The [`RegionId`] of the source region.
pub source_region_id: RegionId,
/// The parallelism of the copy operation.
@@ -728,37 +778,6 @@ pub struct MitoCopyRegionFromResponse {
pub copied_file_ids: Vec<FileId>,
}
#[derive(Debug, Clone)]
pub struct MetricCopyRegionFromResponse {
/// The logical regions that were newly opened after the copy operation.
pub new_opened_logical_region_ids: Vec<RegionId>,
}
/// Response to copy region from a source region to a target region.
#[derive(Debug, Clone)]
pub enum CopyRegionFromResponse {
Mito(MitoCopyRegionFromResponse),
Metric(MetricCopyRegionFromResponse),
}
impl CopyRegionFromResponse {
/// Converts the response to a mito2 response.
pub fn into_mito(self) -> Option<MitoCopyRegionFromResponse> {
match self {
CopyRegionFromResponse::Mito(response) => Some(response),
CopyRegionFromResponse::Metric(_) => None,
}
}
/// Converts the response to a metric response.
pub fn into_metric(self) -> Option<MetricCopyRegionFromResponse> {
match self {
CopyRegionFromResponse::Metric(response) => Some(response),
CopyRegionFromResponse::Mito(_) => None,
}
}
}
#[async_trait]
pub trait RegionEngine: Send + Sync {
/// Name of this engine
@@ -880,8 +899,8 @@ pub trait RegionEngine: Send + Sync {
async fn sync_region(
&self,
region_id: RegionId,
manifest_info: RegionManifestInfo,
) -> Result<SyncManifestResponse, BoxedError>;
request: SyncRegionFromRequest,
) -> Result<SyncRegionFromResponse, BoxedError>;
/// Remaps manifests from old regions to new regions.
async fn remap_manifests(
@@ -889,13 +908,6 @@ pub trait RegionEngine: Send + Sync {
request: RemapManifestsRequest,
) -> Result<RemapManifestsResponse, BoxedError>;
/// Copies region from a source region to a target region.
async fn copy_region_from(
&self,
region_id: RegionId,
request: CopyRegionFromRequest,
) -> Result<CopyRegionFromResponse, BoxedError>;
/// Sets region role state gracefully.
///
/// After the call returns, the engine ensures no more write operations will succeed in the region.

View File

@@ -152,6 +152,7 @@ pub enum RegionRequest {
Catchup(RegionCatchupRequest),
BulkInserts(RegionBulkInsertsRequest),
EnterStaging(EnterStagingRequest),
ApplyStagingManifest(ApplyStagingManifestRequest),
}
impl RegionRequest {
@@ -182,6 +183,9 @@ impl RegionRequest {
reason: "ListMetadata request should be handled separately by RegionServer",
}
.fail(),
region_request::Body::ApplyStagingManifest(apply) => {
make_region_apply_staging_manifest(apply)
}
}
}
@@ -413,6 +417,28 @@ fn make_region_bulk_inserts(request: BulkInsertRequest) -> Result<Vec<(RegionId,
)])
}
/// Converts a protobuf `ApplyStagingManifestRequest` into a single
/// `(RegionId, RegionRequest::ApplyStagingManifest)` pair.
///
/// Returns an `Unexpected` error when the `files_to_add` field is absent.
fn make_region_apply_staging_manifest(
    request: api::v1::region::ApplyStagingManifestRequest,
) -> Result<Vec<(RegionId, RegionRequest)>> {
    let api::v1::region::ApplyStagingManifestRequest {
        region_id,
        partition_expr,
        files_to_add,
    } = request;

    let region_id: RegionId = region_id.into();
    // `files_to_add` is optional on the wire; only its raw `data` payload is forwarded.
    let payload = files_to_add.context(UnexpectedSnafu {
        reason: "'files_to_add' field is missing",
    })?;

    let apply = ApplyStagingManifestRequest {
        partition_expr,
        files_to_add: payload.data,
    };
    Ok(vec![(region_id, RegionRequest::ApplyStagingManifest(apply))])
}
/// Request to put data into a region.
#[derive(Debug)]
pub struct RegionPutRequest {
@@ -1428,6 +1454,30 @@ pub struct EnterStagingRequest {
pub partition_expr: String,
}
/// Request to apply a generated staging manifest to a region, used as part of
/// region repartition.
///
/// After a region has entered staging mode with a new region rule (partition
/// expression) and a separate process (for example, `remap_manifests`) has
/// generated the new file assignments for the staging region, this request
/// applies that generated manifest to the region.
///
/// In practice, this means:
/// - The `partition_expr` identifies the staging region rule that the manifest
///   was generated for.
/// - `files_to_add` carries the serialized metadata of the files that should be
///   attached to the region under the new rule.
///
/// It should typically be called **after** the staging region has been
/// initialized by [`EnterStagingRequest`] and the new file layout has been
/// computed, to finalize the repartition operation.
#[derive(Debug, Clone)]
pub struct ApplyStagingManifestRequest {
/// The partition expression of the staging region.
///
/// The mito2 worker handler compares this with the region's staging partition
/// expression and rejects the request when they differ.
pub partition_expr: String,
/// The files to add to the region, as raw serialized bytes.
///
/// The mito2 worker handler decodes these bytes as a JSON array of file
/// metadata (`Vec<FileMeta>`) via `serde_json::from_slice`.
pub files_to_add: Vec<u8>,
}
impl fmt::Display for RegionRequest {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
match self {
@@ -1445,6 +1495,7 @@ impl fmt::Display for RegionRequest {
RegionRequest::Catchup(_) => write!(f, "Catchup"),
RegionRequest::BulkInserts(_) => write!(f, "BulkInserts"),
RegionRequest::EnterStaging(_) => write!(f, "EnterStaging"),
RegionRequest::ApplyStagingManifest(_) => write!(f, "ApplyStagingManifest"),
}
}
}

View File

@@ -56,32 +56,6 @@ async fn query_data(frontend: &Arc<Instance>) -> io::Result<()> {
))?;
execute_sql_and_expect(frontend, sql, &expected).await;
// query 1:
let sql = "SELECT json_get_string(data, '$.commit.collection') AS event, count() AS count FROM bluesky GROUP BY event ORDER BY count DESC";
let expected = r#"
+-----------------------+-------+
| event | count |
+-----------------------+-------+
| app.bsky.feed.post | 3 |
| app.bsky.feed.like | 3 |
| app.bsky.graph.follow | 3 |
| app.bsky.feed.repost | 1 |
+-----------------------+-------+"#;
execute_sql_and_expect(frontend, sql, expected).await;
// query 2:
let sql = "SELECT json_get_string(data, '$.commit.collection') AS event, count() AS count, count(DISTINCT json_get_string(data, '$.did')) AS users FROM bluesky WHERE (json_get_string(data, '$.kind') = 'commit') AND (json_get_string(data, '$.commit.operation') = 'create') GROUP BY event ORDER BY count DESC";
let expected = r#"
+-----------------------+-------+-------+
| event | count | users |
+-----------------------+-------+-------+
| app.bsky.feed.post | 3 | 3 |
| app.bsky.feed.like | 3 | 3 |
| app.bsky.graph.follow | 3 | 3 |
| app.bsky.feed.repost | 1 | 1 |
+-----------------------+-------+-------+"#;
execute_sql_and_expect(frontend, sql, expected).await;
Ok(())
}

View File

@@ -110,27 +110,25 @@ Affected Rows: 0
SHOW CREATE TABLE test1;
+-------+-----------------------------------------+
| Table | Create Table |
+-------+-----------------------------------------+
| test1 | CREATE TABLE IF NOT EXISTS "test1" ( |
| | "host" STRING NULL, |
| | "cpu" DOUBLE NULL, |
| | "ts" TIMESTAMP(3) NOT NULL, |
| | TIME INDEX ("ts") |
| | ) |
| | |
| | ENGINE=mito |
| | WITH( |
| | append_mode = 'false', |
| | 'compaction.twcs.time_window' = '1h', |
| | 'compaction.type' = 'twcs', |
| | 'memtable.type' = 'partition_tree', |
| | merge_mode = 'last_non_null', |
| | skip_wal = 'true', |
| | ttl = '1h' |
| | ) |
+-------+-----------------------------------------+
+-------+---------------------------------------+
| Table | Create Table |
+-------+---------------------------------------+
| test1 | CREATE TABLE IF NOT EXISTS "test1" ( |
| | "host" STRING NULL, |
| | "cpu" DOUBLE NULL, |
| | "ts" TIMESTAMP(3) NOT NULL, |
| | TIME INDEX ("ts") |
| | ) |
| | |
| | ENGINE=mito |
| | WITH( |
| | append_mode = 'false', |
| | 'memtable.type' = 'partition_tree', |
| | merge_mode = 'last_non_null', |
| | skip_wal = 'true', |
| | ttl = '1h' |
| | ) |
+-------+---------------------------------------+
CREATE TABLE test2(host STRING, cpu DOUBLE, ts TIMESTAMP TIME INDEX) WITH (
'append_mode'='true',
@@ -141,27 +139,25 @@ Affected Rows: 0
SHOW CREATE TABLE test2;
+-------+-----------------------------------------+
| Table | Create Table |
+-------+-----------------------------------------+
| test2 | CREATE TABLE IF NOT EXISTS "test2" ( |
| | "host" STRING NULL, |
| | "cpu" DOUBLE NULL, |
| | "ts" TIMESTAMP(3) NOT NULL, |
| | TIME INDEX ("ts") |
| | ) |
| | |
| | ENGINE=mito |
| | WITH( |
| | append_mode = 'true', |
| | 'compaction.twcs.time_window' = '1h', |
| | 'compaction.type' = 'twcs', |
| | 'memtable.type' = 'partition_tree', |
| | merge_mode = '', |
| | skip_wal = 'false', |
| | ttl = '1h' |
| | ) |
+-------+-----------------------------------------+
+-------+---------------------------------------+
| Table | Create Table |
+-------+---------------------------------------+
| test2 | CREATE TABLE IF NOT EXISTS "test2" ( |
| | "host" STRING NULL, |
| | "cpu" DOUBLE NULL, |
| | "ts" TIMESTAMP(3) NOT NULL, |
| | TIME INDEX ("ts") |
| | ) |
| | |
| | ENGINE=mito |
| | WITH( |
| | append_mode = 'true', |
| | 'memtable.type' = 'partition_tree', |
| | merge_mode = '', |
| | skip_wal = 'false', |
| | ttl = '1h' |
| | ) |
+-------+---------------------------------------+
INSERT INTO test2 VALUES('host1', 1.0, '2023-10-01 00:00:00');
@@ -183,6 +179,166 @@ DROP DATABASE mydb;
Affected Rows: 0
--- test compaction options----
CREATE DATABASE test_compaction_opt;
Affected Rows: 1
USE test_compaction_opt;
Affected Rows: 0
SHOW CREATE DATABASE test_compaction_opt;
+---------------------+---------------------------------------------------+
| Database | Create Database |
+---------------------+---------------------------------------------------+
| test_compaction_opt | CREATE DATABASE IF NOT EXISTS test_compaction_opt |
+---------------------+---------------------------------------------------+
CREATE TABLE test_table(ts TIMESTAMP TIME INDEX, val INT);
Affected Rows: 0
SHOW CREATE TABLE test_table;
+------------+-------------------------------------------+
| Table | Create Table |
+------------+-------------------------------------------+
| test_table | CREATE TABLE IF NOT EXISTS "test_table" ( |
| | "ts" TIMESTAMP(3) NOT NULL, |
| | "val" INT NULL, |
| | TIME INDEX ("ts") |
| | ) |
| | |
| | ENGINE=mito |
| | |
+------------+-------------------------------------------+
ALTER DATABASE test_compaction_opt SET 'compaction.type' = 'twcs';
Affected Rows: 0
ALTER DATABASE test_compaction_opt SET 'compaction.twcs.time_window' = '2h';
Affected Rows: 0
SHOW CREATE DATABASE test_compaction_opt;
+---------------------+---------------------------------------------------+
| Database | Create Database |
+---------------------+---------------------------------------------------+
| test_compaction_opt | CREATE DATABASE IF NOT EXISTS test_compaction_opt |
| | WITH( |
| | 'compaction.twcs.time_window' = '2h', |
| | 'compaction.type' = 'twcs' |
| | ) |
+---------------------+---------------------------------------------------+
SHOW CREATE TABLE test_table;
+------------+-------------------------------------------+
| Table | Create Table |
+------------+-------------------------------------------+
| test_table | CREATE TABLE IF NOT EXISTS "test_table" ( |
| | "ts" TIMESTAMP(3) NOT NULL, |
| | "val" INT NULL, |
| | TIME INDEX ("ts") |
| | ) |
| | |
| | ENGINE=mito |
| | |
+------------+-------------------------------------------+
CREATE TABLE test_table2(ts TIMESTAMP TIME INDEX, val INT);
Affected Rows: 0
SHOW CREATE TABLE test_table2;
+-------------+--------------------------------------------+
| Table | Create Table |
+-------------+--------------------------------------------+
| test_table2 | CREATE TABLE IF NOT EXISTS "test_table2" ( |
| | "ts" TIMESTAMP(3) NOT NULL, |
| | "val" INT NULL, |
| | TIME INDEX ("ts") |
| | ) |
| | |
| | ENGINE=mito |
| | |
+-------------+--------------------------------------------+
USE public;
Affected Rows: 0
DROP DATABASE test_compaction_opt;
Affected Rows: 0
CREATE DATABASE test_compaction_opt2;
Affected Rows: 1
USE test_compaction_opt2;
Affected Rows: 0
CREATE TABLE test_table(ts TIMESTAMP TIME INDEX, v INT) WITH ('compaction.type'='twcs','compaction.twcs.time_window'='1h');
Affected Rows: 0
SHOW CREATE TABLE test_table;
+------------+-------------------------------------------+
| Table | Create Table |
+------------+-------------------------------------------+
| test_table | CREATE TABLE IF NOT EXISTS "test_table" ( |
| | "ts" TIMESTAMP(3) NOT NULL, |
| | "v" INT NULL, |
| | TIME INDEX ("ts") |
| | ) |
| | |
| | ENGINE=mito |
| | WITH( |
| | 'compaction.override' = 'true', |
| | 'compaction.twcs.time_window' = '1h', |
| | 'compaction.type' = 'twcs' |
| | ) |
+------------+-------------------------------------------+
ALTER DATABASE test_compaction_opt2 SET 'compaction.twcs.time_window' = '3h';
Affected Rows: 0
SHOW CREATE TABLE test_table;
+------------+-------------------------------------------+
| Table | Create Table |
+------------+-------------------------------------------+
| test_table | CREATE TABLE IF NOT EXISTS "test_table" ( |
| | "ts" TIMESTAMP(3) NOT NULL, |
| | "v" INT NULL, |
| | TIME INDEX ("ts") |
| | ) |
| | |
| | ENGINE=mito |
| | WITH( |
| | 'compaction.override' = 'true', |
| | 'compaction.twcs.time_window' = '1h', |
| | 'compaction.type' = 'twcs' |
| | ) |
+------------+-------------------------------------------+
USE public;
Affected Rows: 0
DROP DATABASE test_compaction_opt2;
Affected Rows: 0
SHOW DATABASES;
+--------------------+

View File

@@ -49,5 +49,48 @@ USE public;
DROP DATABASE mydb;
--- test compaction options----
CREATE DATABASE test_compaction_opt;
USE test_compaction_opt;
SHOW CREATE DATABASE test_compaction_opt;
CREATE TABLE test_table(ts TIMESTAMP TIME INDEX, val INT);
SHOW CREATE TABLE test_table;
ALTER DATABASE test_compaction_opt SET 'compaction.type' = 'twcs';
ALTER DATABASE test_compaction_opt SET 'compaction.twcs.time_window' = '2h';
SHOW CREATE DATABASE test_compaction_opt;
SHOW CREATE TABLE test_table;
CREATE TABLE test_table2(ts TIMESTAMP TIME INDEX, val INT);
SHOW CREATE TABLE test_table2;
USE public;
DROP DATABASE test_compaction_opt;
CREATE DATABASE test_compaction_opt2;
USE test_compaction_opt2;
CREATE TABLE test_table(ts TIMESTAMP TIME INDEX, v INT) WITH ('compaction.type'='twcs','compaction.twcs.time_window'='1h');
SHOW CREATE TABLE test_table;
ALTER DATABASE test_compaction_opt2 SET 'compaction.twcs.time_window' = '3h';
SHOW CREATE TABLE test_table;
USE public;
DROP DATABASE test_compaction_opt2;
SHOW DATABASES;

View File

@@ -0,0 +1,347 @@
-- MySQL-compatible string function tests
-- LOCATE function tests
SELECT LOCATE('world', 'hello world');
+-------------------------------------------+
| locate(Utf8("world"),Utf8("hello world")) |
+-------------------------------------------+
| 7 |
+-------------------------------------------+
SELECT LOCATE('xyz', 'hello world');
+-----------------------------------------+
| locate(Utf8("xyz"),Utf8("hello world")) |
+-----------------------------------------+
| 0 |
+-----------------------------------------+
SELECT LOCATE('o', 'hello world');
+---------------------------------------+
| locate(Utf8("o"),Utf8("hello world")) |
+---------------------------------------+
| 5 |
+---------------------------------------+
SELECT LOCATE('o', 'hello world', 5);
+------------------------------------------------+
| locate(Utf8("o"),Utf8("hello world"),Int64(5)) |
+------------------------------------------------+
| 5 |
+------------------------------------------------+
SELECT LOCATE('o', 'hello world', 6);
+------------------------------------------------+
| locate(Utf8("o"),Utf8("hello world"),Int64(6)) |
+------------------------------------------------+
| 8 |
+------------------------------------------------+
SELECT LOCATE('', 'hello');
+--------------------------------+
| locate(Utf8(""),Utf8("hello")) |
+--------------------------------+
| 1 |
+--------------------------------+
SELECT LOCATE('世', 'hello世界');
+--------------------------------------+
| locate(Utf8("世"),Utf8("hello世界")) |
+--------------------------------------+
| 6 |
+--------------------------------------+
SELECT LOCATE(NULL, 'hello');
+----------------------------+
| locate(NULL,Utf8("hello")) |
+----------------------------+
| |
+----------------------------+
SELECT LOCATE('o', NULL);
+------------------------+
| locate(Utf8("o"),NULL) |
+------------------------+
| |
+------------------------+
-- ELT function tests
SELECT ELT(1, 'a', 'b', 'c');
+---------------------------------------------+
| elt(Int64(1),Utf8("a"),Utf8("b"),Utf8("c")) |
+---------------------------------------------+
| a |
+---------------------------------------------+
SELECT ELT(2, 'a', 'b', 'c');
+---------------------------------------------+
| elt(Int64(2),Utf8("a"),Utf8("b"),Utf8("c")) |
+---------------------------------------------+
| b |
+---------------------------------------------+
SELECT ELT(3, 'a', 'b', 'c');
+---------------------------------------------+
| elt(Int64(3),Utf8("a"),Utf8("b"),Utf8("c")) |
+---------------------------------------------+
| c |
+---------------------------------------------+
SELECT ELT(0, 'a', 'b', 'c');
+---------------------------------------------+
| elt(Int64(0),Utf8("a"),Utf8("b"),Utf8("c")) |
+---------------------------------------------+
| |
+---------------------------------------------+
SELECT ELT(4, 'a', 'b', 'c');
+---------------------------------------------+
| elt(Int64(4),Utf8("a"),Utf8("b"),Utf8("c")) |
+---------------------------------------------+
| |
+---------------------------------------------+
SELECT ELT(NULL, 'a', 'b', 'c');
+-----------------------------------------+
| elt(NULL,Utf8("a"),Utf8("b"),Utf8("c")) |
+-----------------------------------------+
| |
+-----------------------------------------+
-- FIELD function tests
SELECT FIELD('b', 'a', 'b', 'c');
+------------------------------------------------+
| field(Utf8("b"),Utf8("a"),Utf8("b"),Utf8("c")) |
+------------------------------------------------+
| 2 |
+------------------------------------------------+
SELECT FIELD('d', 'a', 'b', 'c');
+------------------------------------------------+
| field(Utf8("d"),Utf8("a"),Utf8("b"),Utf8("c")) |
+------------------------------------------------+
| 0 |
+------------------------------------------------+
SELECT FIELD('a', 'a', 'b', 'c');
+------------------------------------------------+
| field(Utf8("a"),Utf8("a"),Utf8("b"),Utf8("c")) |
+------------------------------------------------+
| 1 |
+------------------------------------------------+
SELECT FIELD('A', 'a', 'b', 'c');
+------------------------------------------------+
| field(Utf8("A"),Utf8("a"),Utf8("b"),Utf8("c")) |
+------------------------------------------------+
| 0 |
+------------------------------------------------+
SELECT FIELD(NULL, 'a', 'b', 'c');
+-------------------------------------------+
| field(NULL,Utf8("a"),Utf8("b"),Utf8("c")) |
+-------------------------------------------+
| 0 |
+-------------------------------------------+
-- INSERT function tests
SELECT INSERT('Quadratic', 3, 4, 'What');
+----------------------------------------------------------+
| insert(Utf8("Quadratic"),Int64(3),Int64(4),Utf8("What")) |
+----------------------------------------------------------+
| QuWhattic |
+----------------------------------------------------------+
SELECT INSERT('Quadratic', 3, 100, 'What');
+------------------------------------------------------------+
| insert(Utf8("Quadratic"),Int64(3),Int64(100),Utf8("What")) |
+------------------------------------------------------------+
| QuWhat |
+------------------------------------------------------------+
SELECT INSERT('Quadratic', 0, 4, 'What');
+----------------------------------------------------------+
| insert(Utf8("Quadratic"),Int64(0),Int64(4),Utf8("What")) |
+----------------------------------------------------------+
| Quadratic |
+----------------------------------------------------------+
SELECT INSERT('hello', 1, 0, 'X');
+---------------------------------------------------+
| insert(Utf8("hello"),Int64(1),Int64(0),Utf8("X")) |
+---------------------------------------------------+
| Xhello |
+---------------------------------------------------+
SELECT INSERT('hello世界', 6, 1, 'の');
+--------------------------------------------------------+
| insert(Utf8("hello世界"),Int64(6),Int64(1),Utf8("の")) |
+--------------------------------------------------------+
| helloの界 |
+--------------------------------------------------------+
SELECT INSERT(NULL, 1, 1, 'X');
+------------------------------------------+
| insert(NULL,Int64(1),Int64(1),Utf8("X")) |
+------------------------------------------+
| |
+------------------------------------------+
-- SPACE function tests
SELECT SPACE(5);
+-----------------+
| space(Int64(5)) |
+-----------------+
| |
+-----------------+
SELECT SPACE(0);
+-----------------+
| space(Int64(0)) |
+-----------------+
| |
+-----------------+
SELECT SPACE(-1);
+------------------+
| space(Int64(-1)) |
+------------------+
| |
+------------------+
SELECT CONCAT('a', SPACE(3), 'b');
+---------------------------------------------+
| concat(Utf8("a"),space(Int64(3)),Utf8("b")) |
+---------------------------------------------+
| a   b |
+---------------------------------------------+
SELECT SPACE(NULL);
+-------------+
| space(NULL) |
+-------------+
| |
+-------------+
-- FORMAT function tests
SELECT FORMAT(1234567.891, 2);
+---------------------------------------+
| format(Float64(1234567.891),Int64(2)) |
+---------------------------------------+
| 1,234,567.89 |
+---------------------------------------+
SELECT FORMAT(1234567.891, 0);
+---------------------------------------+
| format(Float64(1234567.891),Int64(0)) |
+---------------------------------------+
| 1,234,568 |
+---------------------------------------+
SELECT FORMAT(1234.5, 4);
+----------------------------------+
| format(Float64(1234.5),Int64(4)) |
+----------------------------------+
| 1,234.5000 |
+----------------------------------+
SELECT FORMAT(-1234567.891, 2);
+----------------------------------------+
| format(Float64(-1234567.891),Int64(2)) |
+----------------------------------------+
| -1,234,567.89 |
+----------------------------------------+
SELECT FORMAT(0.5, 2);
+-------------------------------+
| format(Float64(0.5),Int64(2)) |
+-------------------------------+
| 0.50 |
+-------------------------------+
SELECT FORMAT(123, 2);
+-----------------------------+
| format(Int64(123),Int64(2)) |
+-----------------------------+
| 123.00 |
+-----------------------------+
SELECT FORMAT(NULL, 2);
+-----------------------+
| format(NULL,Int64(2)) |
+-----------------------+
| |
+-----------------------+
-- Combined test with table
CREATE TABLE string_test(idx INT, val VARCHAR, ts TIMESTAMP TIME INDEX);
Affected Rows: 0
INSERT INTO string_test VALUES
(1, 'hello world', 1),
(2, 'foo bar baz', 2),
(3, 'hello世界', 3);
Affected Rows: 3
SELECT idx, val, LOCATE('o', val) as loc FROM string_test ORDER BY idx;
+-----+-------------+-----+
| idx | val | loc |
+-----+-------------+-----+
| 1 | hello world | 5 |
| 2 | foo bar baz | 2 |
| 3 | hello世界 | 5 |
+-----+-------------+-----+
SELECT idx, val, INSERT(val, 1, 5, 'hi') as inserted FROM string_test ORDER BY idx;
+-----+-------------+----------+
| idx | val | inserted |
+-----+-------------+----------+
| 1 | hello world | hi world |
| 2 | foo bar baz | hiar baz |
| 3 | hello世界 | hi世界 |
+-----+-------------+----------+
DROP TABLE string_test;
Affected Rows: 0

View File

@@ -0,0 +1,97 @@
-- MySQL-compatible string function tests
-- LOCATE function tests
SELECT LOCATE('world', 'hello world');
SELECT LOCATE('xyz', 'hello world');
SELECT LOCATE('o', 'hello world');
SELECT LOCATE('o', 'hello world', 5);
SELECT LOCATE('o', 'hello world', 6);
SELECT LOCATE('', 'hello');
SELECT LOCATE('世', 'hello世界');
SELECT LOCATE(NULL, 'hello');
SELECT LOCATE('o', NULL);
-- ELT function tests
SELECT ELT(1, 'a', 'b', 'c');
SELECT ELT(2, 'a', 'b', 'c');
SELECT ELT(3, 'a', 'b', 'c');
SELECT ELT(0, 'a', 'b', 'c');
SELECT ELT(4, 'a', 'b', 'c');
SELECT ELT(NULL, 'a', 'b', 'c');
-- FIELD function tests
SELECT FIELD('b', 'a', 'b', 'c');
SELECT FIELD('d', 'a', 'b', 'c');
SELECT FIELD('a', 'a', 'b', 'c');
SELECT FIELD('A', 'a', 'b', 'c');
SELECT FIELD(NULL, 'a', 'b', 'c');
-- INSERT function tests
SELECT INSERT('Quadratic', 3, 4, 'What');
SELECT INSERT('Quadratic', 3, 100, 'What');
SELECT INSERT('Quadratic', 0, 4, 'What');
SELECT INSERT('hello', 1, 0, 'X');
SELECT INSERT('hello世界', 6, 1, 'の');
SELECT INSERT(NULL, 1, 1, 'X');
-- SPACE function tests
SELECT SPACE(5);
SELECT SPACE(0);
SELECT SPACE(-1);
SELECT CONCAT('a', SPACE(3), 'b');
SELECT SPACE(NULL);
-- FORMAT function tests
SELECT FORMAT(1234567.891, 2);
SELECT FORMAT(1234567.891, 0);
SELECT FORMAT(1234.5, 4);
SELECT FORMAT(-1234567.891, 2);
SELECT FORMAT(0.5, 2);
SELECT FORMAT(123, 2);
SELECT FORMAT(NULL, 2);
-- Combined test with table
CREATE TABLE string_test(idx INT, val VARCHAR, ts TIMESTAMP TIME INDEX);
INSERT INTO string_test VALUES
(1, 'hello world', 1),
(2, 'foo bar baz', 2),
(3, 'hello世界', 3);
SELECT idx, val, LOCATE('o', val) as loc FROM string_test ORDER BY idx;
SELECT idx, val, INSERT(val, 1, 5, 'hi') as inserted FROM string_test ORDER BY idx;
DROP TABLE string_test;

View File

@@ -293,7 +293,7 @@ impl Env {
.write(true)
.truncate(truncate_log)
.append(!truncate_log)
.open(stdout_file_name)
.open(&stdout_file_name)
.unwrap();
let args = mode.get_args(&self.sqlness_home, self, db_ctx, id);
@@ -333,9 +333,13 @@ impl Env {
});
for check_ip_addr in &check_ip_addrs {
if !util::check_port(check_ip_addr.parse().unwrap(), Duration::from_secs(10)).await {
if !util::check_port(check_ip_addr.parse().unwrap(), Duration::from_secs(30)).await {
Env::stop_server(&mut process);
panic!("{} doesn't up in 10 seconds, quit.", mode.name())
panic!(
"{} doesn't up in 30 seconds, check {} for more details.",
mode.name(),
stdout_file_name
)
}
}