feat(procedure): auto split large value to multiple values (#3605)

* feat: implement MultipleValuesStream

* refactor: move KeySet to common-procedure

* refactor: move MultipleValuesStream to common-procedure

* refactor: refactor String to KeySet

* fix: fix dropping `collecting` unexpectedly

* fix: fix typo

* refactor: add the fast path of put

* refactor: remove `single_value_collector`

* refactor: use `extend` instead of `push`

* test: add more tests for `KvStateStore`

* test(etcd_store): add more tests for `KvStateStore`

* chore: apply suggestions from CR

* chore: apply suggestions from CR

* refactor: refactor with async_stream

* Update src/common/procedure/src/store/util.rs

Co-authored-by: Yingwen <realevenyag@gmail.com>

---------

Co-authored-by: Yingwen <realevenyag@gmail.com>
This commit is contained in:
Weny Xu
2024-04-01 20:04:29 +08:00
committed by GitHub
parent 5e24448b96
commit 6c316d268f
5 changed files with 519 additions and 41 deletions

View File

@@ -17,8 +17,10 @@ use std::sync::Arc;
use async_trait::async_trait;
use common_error::ext::BoxedError;
use common_procedure::error::{DeleteStatesSnafu, ListStateSnafu, PutStateSnafu};
use common_procedure::store::state_store::{KeyValueStream, StateStore};
use common_procedure::store::state_store::{KeySet, KeyValueStream, StateStore};
use common_procedure::store::util::multiple_value_stream;
use common_procedure::Result as ProcedureResult;
use futures::future::try_join_all;
use futures::StreamExt;
use snafu::ResultExt;
@@ -42,16 +44,20 @@ fn strip_prefix(key: &str) -> String {
pub struct KvStateStore {
kv_backend: KvBackendRef,
// limit is set to 0, it is treated as no limit.
max_size_per_range: usize,
// The max num of keys to be returned in a range scan request
// `None` stands no limit.
max_num_per_range: Option<usize>,
// The max bytes of value.
// `None` stands no limit.
max_size_per_value: Option<usize>,
}
impl KvStateStore {
// `max_size_per_range` is set to 0, it is treated as no limit.
pub fn new(kv_backend: KvBackendRef) -> Self {
Self {
kv_backend,
max_size_per_range: 0,
max_num_per_range: None,
max_size_per_value: None,
}
}
}
@@ -64,20 +70,80 @@ fn decode_kv(kv: KeyValue) -> Result<(String, Vec<u8>)> {
Ok((key, value))
}
enum SplitValue<'a> {
Single(&'a [u8]),
Multiple(Vec<&'a [u8]>),
}
fn split_value(value: &[u8], max_size_per_value: Option<usize>) -> SplitValue<'_> {
if let Some(max_size_per_value) = max_size_per_value {
if value.len() <= max_size_per_value {
SplitValue::Single(value)
} else {
SplitValue::Multiple(value.chunks(max_size_per_value).collect::<Vec<_>>())
}
} else {
SplitValue::Single(value)
}
}
#[async_trait]
impl StateStore for KvStateStore {
async fn put(&self, key: &str, value: Vec<u8>) -> ProcedureResult<()> {
let _ = self
.kv_backend
.put(PutRequest {
key: with_prefix(key).into_bytes(),
value,
..Default::default()
})
.await
.map_err(BoxedError::new)
.context(PutStateSnafu { key })?;
Ok(())
let split = split_value(&value, self.max_size_per_value);
let key = with_prefix(key);
match split {
SplitValue::Single(_) => {
self.kv_backend
.put(
PutRequest::new()
.with_key(key.to_string().into_bytes())
.with_value(value),
)
.await
.map_err(BoxedError::new)
.context(PutStateSnafu { key })?;
Ok(())
}
SplitValue::Multiple(values) => {
// Note:
// The length of values can be up to usize::MAX.
// The KeySet::with_segment_suffix method uses a 10-digit number to store the segment number,
// which is large enough for the usize type.
// The first segment key: "0b00001111"
// The 2nd segment key: "0b00001111/0000000001"
// The 3rd segment key: "0b00001111/0000000002"
let operations = values
.into_iter()
.enumerate()
.map(|(idx, value)| {
let key = if idx > 0 {
KeySet::with_segment_suffix(&key, idx)
} else {
key.to_string()
};
let kv_backend = self.kv_backend.clone();
async move {
kv_backend
.put(
PutRequest::new()
.with_key(key.into_bytes())
.with_value(value),
)
.await
}
})
.collect::<Vec<_>>();
try_join_all(operations)
.await
.map_err(BoxedError::new)
.context(PutStateSnafu { key })?;
Ok(())
}
}
}
async fn walk_top_down(&self, path: &str) -> ProcedureResult<KeyValueStream> {
@@ -90,7 +156,7 @@ impl StateStore for KvStateStore {
let stream = PaginationStream::new(
self.kv_backend.clone(),
req,
self.max_size_per_range,
self.max_num_per_range.unwrap_or_default(),
Arc::new(decode_kv),
);
@@ -100,6 +166,8 @@ impl StateStore for KvStateStore {
.with_context(|_| ListStateSnafu { path })
});
let stream = multiple_value_stream(Box::pin(stream));
Ok(Box::pin(stream))
}
@@ -128,19 +196,26 @@ impl StateStore for KvStateStore {
#[cfg(test)]
mod tests {
use std::env;
use std::sync::Arc;
use common_procedure::store::state_store::KeyValue;
use common_telemetry::info;
use futures::TryStreamExt;
use rand::{Rng, RngCore};
use uuid::Uuid;
use super::*;
use crate::kv_backend::chroot::ChrootKvBackend;
use crate::kv_backend::etcd::EtcdStore;
use crate::kv_backend::memory::MemoryKvBackend;
#[tokio::test]
async fn test_meta_state_store() {
let store = &KvStateStore {
kv_backend: Arc::new(MemoryKvBackend::new()),
max_size_per_range: 1, // for testing "more" in range
max_num_per_range: Some(1), // for testing "more" in range
max_size_per_value: None,
};
let walk_top_down = async move |path: &str| -> Vec<KeyValue> {
@@ -165,9 +240,9 @@ mod tests {
let data = walk_top_down("/").await;
assert_eq!(
vec![
("a/1".to_string(), b"v1".to_vec()),
("a/2".to_string(), b"v2".to_vec()),
("b/1".to_string(), b"v3".to_vec())
("a/1".into(), b"v1".to_vec()),
("a/2".into(), b"v2".to_vec()),
("b/1".into(), b"v3".to_vec())
],
data
);
@@ -175,8 +250,8 @@ mod tests {
let data = walk_top_down("a/").await;
assert_eq!(
vec![
("a/1".to_string(), b"v1".to_vec()),
("a/2".to_string(), b"v2".to_vec()),
("a/1".into(), b"v1".to_vec()),
("a/2".into(), b"v2".to_vec()),
],
data
);
@@ -187,6 +262,122 @@ mod tests {
.unwrap();
let data = walk_top_down("a/").await;
assert_eq!(vec![("a/1".to_string(), b"v1".to_vec()),], data);
assert_eq!(vec![("a/1".into(), b"v1".to_vec()),], data);
}
struct TestCase {
prefix: String,
key: String,
value: Vec<u8>,
}
async fn test_meta_state_store_split_value_with_size_limit(
kv_backend: KvBackendRef,
size_limit: u32,
num_per_range: u32,
max_bytes: u32,
) {
let num_cases = rand::thread_rng().gen_range(1..=26);
let mut cases = Vec::with_capacity(num_cases);
for i in 0..num_cases {
let size = rand::thread_rng().gen_range(size_limit..=max_bytes);
let mut large_value = vec![0u8; size as usize];
rand::thread_rng().fill_bytes(&mut large_value);
// Starts from `a`.
let prefix = format!("{}/", std::char::from_u32(97 + i as u32).unwrap());
cases.push(TestCase {
key: format!("{}{}.commit", prefix, Uuid::new_v4()),
prefix,
value: large_value,
})
}
let store = &KvStateStore {
kv_backend: kv_backend.clone(),
max_num_per_range: Some(num_per_range as usize), // for testing "more" in range
max_size_per_value: Some(size_limit as usize),
};
let walk_top_down = async move |path: &str| -> Vec<KeyValue> {
let mut data = store
.walk_top_down(path)
.await
.unwrap()
.try_collect::<Vec<_>>()
.await
.unwrap();
data.sort_unstable_by(|a, b| a.0.cmp(&b.0));
data
};
// Puts the values
for TestCase { key, value, .. } in &cases {
store.put(key, value.clone()).await.unwrap();
}
// Validates the values
for TestCase { prefix, key, value } in &cases {
let data = walk_top_down(prefix).await;
assert_eq!(data.len(), 1);
let (keyset, got) = data.into_iter().next().unwrap();
let num_expected_keys = value.len().div_ceil(size_limit as usize);
assert_eq!(&got, value);
assert_eq!(keyset.key(), key);
assert_eq!(keyset.keys().len(), num_expected_keys);
}
// Deletes the values
for TestCase { prefix, .. } in &cases {
let data = walk_top_down(prefix).await;
let (keyset, _) = data.into_iter().next().unwrap();
// Deletes values
store.batch_delete(keyset.keys().as_slice()).await.unwrap();
let data = walk_top_down(prefix).await;
assert_eq!(data.len(), 0);
}
}
#[tokio::test]
async fn test_meta_state_store_split_value() {
let size_limit = rand::thread_rng().gen_range(128..=512);
let page_size = rand::thread_rng().gen_range(1..10);
let kv_backend = Arc::new(MemoryKvBackend::new());
test_meta_state_store_split_value_with_size_limit(kv_backend, size_limit, page_size, 8192)
.await;
}
#[tokio::test]
async fn test_etcd_store_split_value() {
common_telemetry::init_default_ut_logging();
let prefix = "test_etcd_store_split_value/";
let endpoints = env::var("GT_ETCD_ENDPOINTS").unwrap_or_default();
let kv_backend: KvBackendRef = if endpoints.is_empty() {
Arc::new(MemoryKvBackend::new())
} else {
let endpoints = endpoints
.split(',')
.map(|s| s.to_string())
.collect::<Vec<String>>();
let backend = EtcdStore::with_endpoints(endpoints, 128)
.await
.expect("malformed endpoints");
// Each retry requires a new isolation namespace.
let chroot = format!("{}{}", prefix, Uuid::new_v4());
info!("chroot length: {}", chroot.len());
Arc::new(ChrootKvBackend::new(chroot.into(), backend))
};
let key_preserve_size = 1024;
// The etcd default size limit of any requests is 1.5MiB.
// However, some KvBackends, the `ChrootKvBackend`, will add the prefix to `key`;
// we don't know the exact size of the key.
let size_limit = 1536 * 1024 - key_preserve_size;
let page_size = rand::thread_rng().gen_range(1..10);
test_meta_state_store_split_value_with_size_limit(
kv_backend,
size_limit,
page_size,
size_limit * 10,
)
.await;
}
}

View File

@@ -134,6 +134,17 @@ pub enum Error {
source: Arc<Error>,
location: Location,
},
#[snafu(display("Failed to parse segment key: {key}"))]
ParseSegmentKey {
location: Location,
key: String,
#[snafu(source)]
error: std::num::ParseIntError,
},
#[snafu(display("Unexpected: {err_msg}"))]
Unexpected { location: Location, err_msg: String },
}
pub type Result<T> = std::result::Result<T, Error>;
@@ -156,7 +167,10 @@ impl ErrorExt for Error {
Error::LoaderConflict { .. } | Error::DuplicateProcedure { .. } => {
StatusCode::InvalidArguments
}
Error::ProcedurePanic { .. } | Error::CorruptedData { .. } => StatusCode::Unexpected,
Error::ProcedurePanic { .. }
| Error::CorruptedData { .. }
| Error::ParseSegmentKey { .. }
| Error::Unexpected { .. } => StatusCode::Unexpected,
Error::ProcedureExec { source, .. } => source.status_code(),
Error::StartRemoveOutdatedMetaTask { source, .. }
| Error::StopRemoveOutdatedMetaTask { source, .. } => source.status_code(),

View File

@@ -25,6 +25,7 @@ pub(crate) use crate::store::state_store::StateStoreRef;
use crate::ProcedureId;
pub mod state_store;
pub mod util;
/// Key prefix of procedure store.
const PROC_PATH: &str = "procedure/";
@@ -143,16 +144,17 @@ impl ProcedureStore {
// 8 should be enough for most procedures.
let mut step_keys = Vec::with_capacity(8);
let mut finish_keys = Vec::new();
while let Some((key, _)) = key_values.try_next().await? {
let Some(curr_key) = ParsedKey::parse_str(&self.proc_path, &key) else {
while let Some((key_set, _)) = key_values.try_next().await? {
let key = key_set.key();
let Some(curr_key) = ParsedKey::parse_str(&self.proc_path, key) else {
logging::warn!("Unknown key while deleting procedures, key: {}", key);
continue;
};
if curr_key.key_type == KeyType::Step {
step_keys.push(key);
step_keys.extend(key_set.keys());
} else {
// .commit or .rollback
finish_keys.push(key);
finish_keys.extend(key_set.keys());
}
}
@@ -184,8 +186,9 @@ impl ProcedureStore {
// Scan all procedures.
let mut key_values = self.store.walk_top_down(&self.proc_path).await?;
while let Some((key, value)) = key_values.try_next().await? {
let Some(curr_key) = ParsedKey::parse_str(&self.proc_path, &key) else {
while let Some((key_set, value)) = key_values.try_next().await? {
let key = key_set.key();
let Some(curr_key) = ParsedKey::parse_str(&self.proc_path, key) else {
logging::warn!("Unknown key while loading procedures, key: {}", key);
continue;
};

View File

@@ -25,8 +25,64 @@ use snafu::ResultExt;
use crate::error::{DeleteStateSnafu, ListStateSnafu, PutStateSnafu, Result};
/// The set of keys.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct KeySet {
key: String,
segments: usize,
}
impl PartialOrd for KeySet {
fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
Some(self.cmp(other))
}
}
impl Ord for KeySet {
fn cmp(&self, other: &Self) -> std::cmp::Ordering {
self.key.cmp(&other.key)
}
}
impl From<&str> for KeySet {
fn from(value: &str) -> Self {
KeySet {
key: value.to_string(),
segments: 0,
}
}
}
impl KeySet {
pub fn new(key: String, segments: usize) -> Self {
Self { key, segments }
}
pub fn with_segment_suffix(key: &str, version: usize) -> String {
format!("{key}/{version:010}")
}
pub fn with_prefix(key: &str) -> String {
format!("{key}/")
}
pub fn keys(&self) -> Vec<String> {
let mut keys = Vec::with_capacity(self.segments + 1);
keys.push(self.key.to_string());
for i in 1..=self.segments {
keys.push(Self::with_segment_suffix(&self.key, i))
}
keys
}
pub fn key(&self) -> &str {
&self.key
}
}
/// Key value from state store.
pub type KeyValue = (String, Vec<u8>);
pub type KeyValue = (KeySet, Vec<u8>);
/// Stream that yields [KeyValue].
pub type KeyValueStream = Pin<Box<dyn Stream<Item = Result<KeyValue>> + Send>>;
@@ -123,7 +179,7 @@ impl StateStore for ObjectStateStore {
))
})
.context(ListStateSnafu { path: key })?;
yield (key.to_string(), value);
yield (key.into(), value);
}
}
});
@@ -193,9 +249,9 @@ mod tests {
data.sort_unstable_by(|a, b| a.0.cmp(&b.0));
assert_eq!(
vec![
("a/1".to_string(), b"v1".to_vec()),
("a/2".to_string(), b"v2".to_vec()),
("b/1".to_string(), b"v3".to_vec())
("a/1".into(), b"v1".to_vec()),
("a/2".into(), b"v2".to_vec()),
("b/1".into(), b"v3".to_vec())
],
data
);
@@ -210,8 +266,8 @@ mod tests {
data.sort_unstable_by(|a, b| a.0.cmp(&b.0));
assert_eq!(
vec![
("a/1".to_string(), b"v1".to_vec()),
("a/2".to_string(), b"v2".to_vec()),
("a/1".into(), b"v1".to_vec()),
("a/2".into(), b"v2".to_vec()),
],
data
);
@@ -228,7 +284,7 @@ mod tests {
.await
.unwrap();
data.sort_unstable_by(|a, b| a.0.cmp(&b.0));
assert_eq!(vec![("a/1".to_string(), b"v1".to_vec()),], data);
assert_eq!(vec![("a/1".into(), b"v1".to_vec()),], data);
}
#[tokio::test]
@@ -257,8 +313,8 @@ mod tests {
data.sort_unstable_by(|a, b| a.0.cmp(&b.0));
assert_eq!(
vec![
("a/1".to_string(), b"v1".to_vec()),
("a/2".to_string(), b"v2".to_vec()),
("a/1".into(), b"v1".to_vec()),
("a/2".into(), b"v2".to_vec()),
],
data
);

View File

@@ -0,0 +1,214 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use std::pin::Pin;
use async_stream::try_stream;
use futures::{Stream, TryStreamExt};
use snafu::{ensure, ResultExt};
use super::state_store::KeySet;
use crate::error;
use crate::error::Result;
pub struct CollectingState {
pairs: Vec<(String, Vec<u8>)>,
}
fn parse_segments(segments: Vec<(String, Vec<u8>)>, prefix: &str) -> Result<Vec<(usize, Vec<u8>)>> {
segments
.into_iter()
.map(|(key, value)| {
let suffix = key.trim_start_matches(prefix);
let index = suffix
.parse::<usize>()
.context(error::ParseSegmentKeySnafu { key })?;
Ok((index, value))
})
.collect::<Result<Vec<_>>>()
}
/// Collects multiple values into a single key-value pair.
/// Returns an error if:
/// - Part values are lost.
/// - Failed to parse the key of segment.
fn multiple_values_collector(
CollectingState { mut pairs }: CollectingState,
) -> Result<(KeySet, Vec<u8>)> {
if pairs.len() == 1 {
// Safety: must exist.
let (key, value) = pairs.into_iter().next().unwrap();
Ok((KeySet::new(key, 0), value))
} else {
let segments = pairs.split_off(1);
// Safety: must exist.
let (key, value) = pairs.into_iter().next().unwrap();
let prefix = KeySet::with_prefix(&key);
let mut parsed_segments = parse_segments(segments, &prefix)?;
parsed_segments.sort_unstable_by(|a, b| a.0.cmp(&b.0));
// Safety: `parsed_segments` must larger than 0.
let segment_num = parsed_segments.last().unwrap().0;
ensure!(
// The segment index start from 1.
parsed_segments.len() == segment_num,
error::UnexpectedSnafu {
err_msg: format!(
"Corrupted segment keys, parsed segment indexes: {:?}",
parsed_segments
.into_iter()
.map(|(key, _)| key)
.collect::<Vec<_>>()
)
}
);
let segment_values = parsed_segments.into_iter().map(|(_, value)| value);
let mut values = Vec::with_capacity(segment_values.len() + 1);
values.push(value);
values.extend(segment_values);
Ok((KeySet::new(key, segment_num), values.concat()))
}
}
impl CollectingState {
fn new(key: String, value: Vec<u8>) -> CollectingState {
Self {
pairs: vec![(key, value)],
}
}
fn push(&mut self, key: String, value: Vec<u8>) {
self.pairs.push((key, value));
}
fn key(&self) -> &str {
self.pairs[0].0.as_str()
}
}
pub type Upstream = dyn Stream<Item = Result<(String, Vec<u8>)>> + Send;
pub fn multiple_value_stream(
mut upstream: Pin<Box<Upstream>>,
) -> impl Stream<Item = Result<(KeySet, Vec<u8>)>> {
try_stream! {
let mut collecting: Option<CollectingState> = None;
while let Some((key, value)) = upstream.try_next().await? {
match collecting.take() {
Some(mut current) => {
if key.starts_with(current.key()) {
// Pushes the key value pair into `collecting`.
current.push(key, value);
collecting = Some(current);
} else {
// Starts to collect next key value pair.
collecting = Some(CollectingState::new(key, value));
yield multiple_values_collector(current)?;
}
}
None => collecting = Some(CollectingState::new(key, value)),
}
}
if let Some(current) = collecting.take() {
yield multiple_values_collector(current)?
}
}
}
#[cfg(test)]
mod tests {
use std::assert_matches::assert_matches;
use futures::stream::{self};
use futures::TryStreamExt;
use super::*;
use crate::error::{self};
#[test]
fn test_key_set_keys() {
let key = KeySet::new("baz".to_string(), 3);
let keys = key.keys();
assert_eq!(keys.len(), 4);
assert_eq!(&keys[0], "baz");
assert_eq!(&keys[1], &KeySet::with_segment_suffix("baz", 1));
}
#[tokio::test]
async fn test_multiple_values_collector() {
let upstream = stream::iter(vec![
Ok(("foo".to_string(), vec![0, 1, 2, 3])),
Ok(("foo/0002".to_string(), vec![6, 7])),
Ok(("foo/0003".to_string(), vec![8])),
Ok(("foo/0001".to_string(), vec![4, 5])),
Ok(("bar".to_string(), vec![0, 1, 2, 3])),
Ok(("baz".to_string(), vec![0, 1, 2, 3])),
Ok(("baz/0003".to_string(), vec![8])),
Ok(("baz/0001".to_string(), vec![4, 5])),
Ok(("baz/0002".to_string(), vec![6, 7])),
]);
let mut stream = Box::pin(multiple_value_stream(Box::pin(upstream)));
let (key, value) = stream.try_next().await.unwrap().unwrap();
let keys = key.keys();
assert_eq!(keys[0], "foo");
assert_eq!(keys.len(), 4);
assert_eq!(value, vec![0, 1, 2, 3, 4, 5, 6, 7, 8]);
let (key, value) = stream.try_next().await.unwrap().unwrap();
let keys = key.keys();
assert_eq!(keys[0], "bar");
assert_eq!(keys.len(), 1);
assert_eq!(value, vec![0, 1, 2, 3]);
let (key, value) = stream.try_next().await.unwrap().unwrap();
let keys = key.keys();
assert_eq!(keys[0], "baz");
assert_eq!(keys.len(), 4);
assert_eq!(value, vec![0, 1, 2, 3, 4, 5, 6, 7, 8]);
assert!(stream.try_next().await.unwrap().is_none());
// Call again
assert!(stream.try_next().await.unwrap().is_none());
}
#[tokio::test]
async fn test_empty_upstream() {
let upstream = stream::iter(vec![]);
let mut stream = Box::pin(multiple_value_stream(Box::pin(upstream)));
assert!(stream.try_next().await.unwrap().is_none());
// Call again
assert!(stream.try_next().await.unwrap().is_none());
}
#[tokio::test]
async fn test_multiple_values_collector_err() {
let upstream = stream::iter(vec![
Err(error::UnexpectedSnafu { err_msg: "mock" }.build()),
Ok(("foo".to_string(), vec![0, 1, 2, 3])),
Ok(("foo/0001".to_string(), vec![4, 5])),
]);
let mut stream = Box::pin(multiple_value_stream(Box::pin(upstream)));
let err = stream.try_next().await.unwrap_err();
assert_matches!(err, error::Error::Unexpected { .. });
let upstream = stream::iter(vec![
Ok(("foo".to_string(), vec![0, 1, 2, 3])),
Ok(("foo/0001".to_string(), vec![4, 5])),
Err(error::UnexpectedSnafu { err_msg: "mock" }.build()),
]);
let mut stream = Box::pin(multiple_value_stream(Box::pin(upstream)));
let err = stream.try_next().await.unwrap_err();
assert_matches!(err, error::Error::Unexpected { .. });
}
}