Compare commits

..

7 Commits

Author SHA1 Message Date
luofucong
5befc62b80 resolve PR comments
Signed-off-by: luofucong <luofc@foxmail.com>
2025-12-31 17:42:06 +08:00
luofucong
e3e7fd13a2 fix ci
Signed-off-by: luofucong <luofc@foxmail.com>
2025-12-31 17:42:06 +08:00
luofucong
2845c8c2ba feat: ingest jsonbench data through pipeline
Signed-off-by: luofucong <luofc@foxmail.com>
2025-12-30 20:56:21 +08:00
LFC
dc9fc582a0 feat: impl json_get_int for new json type (#7495)
Update src/common/function/src/scalars/json/json_get.rs



impl `json_get_int` for new json type

Signed-off-by: luofucong <luofc@foxmail.com>
2025-12-30 09:42:16 +00:00
Weny Xu
b1d81913f5 feat: update ApplyStagingManifestRequest to fetch manifest from central region (#7493)
* feat: update ApplyStagingManifestRequest to fetch manifest from central region

Signed-off-by: WenyXu <wenymedia@gmail.com>

* chore: refine comments

Signed-off-by: WenyXu <wenymedia@gmail.com>

* refactor(mito2): rename `StagingDataStorage` to `StagingBlobStorage`

Signed-off-by: WenyXu <wenymedia@gmail.com>

* chore: apply suggestions

Signed-off-by: WenyXu <wenymedia@gmail.com>

* chore: update proto

Signed-off-by: WenyXu <wenymedia@gmail.com>

---------

Signed-off-by: WenyXu <wenymedia@gmail.com>
2025-12-30 07:29:56 +00:00
Yingwen
554f3943b6 ci: update breaking change title level (#7497)
Signed-off-by: evenyag <realevenyag@gmail.com>
2025-12-30 06:17:51 +00:00
dennis zhuang
e4b5ef275f feat: impl vector index building (#7468)
* feat: impl vector index building

Signed-off-by: Dennis Zhuang <killme2008@gmail.com>

* feat: supports flat format

Signed-off-by: Dennis Zhuang <killme2008@gmail.com>

* ci: add vector_index feature to test

Signed-off-by: Dennis Zhuang <killme2008@gmail.com>

* chore: apply suggestions

Signed-off-by: Dennis Zhuang <killme2008@gmail.com>

* chore: apply suggestions from copilot

Signed-off-by: Dennis Zhuang <killme2008@gmail.com>

---------

Signed-off-by: Dennis Zhuang <killme2008@gmail.com>
2025-12-30 03:38:51 +00:00
63 changed files with 3250 additions and 536 deletions

View File

@@ -755,7 +755,7 @@ jobs:
run: ../../.github/scripts/pull-test-deps-images.sh && docker compose up -d --wait
- name: Run nextest cases
run: cargo nextest run --workspace -F dashboard -F pg_kvbackend -F mysql_kvbackend
run: cargo nextest run --workspace -F dashboard -F pg_kvbackend -F mysql_kvbackend -F vector_index
env:
CARGO_BUILD_RUSTFLAGS: "-C link-arg=-fuse-ld=mold"
RUST_BACKTRACE: 1
@@ -813,7 +813,7 @@ jobs:
run: ../../.github/scripts/pull-test-deps-images.sh && docker compose up -d --wait
- name: Run nextest cases
run: cargo llvm-cov nextest --workspace --lcov --output-path lcov.info -F dashboard -F pg_kvbackend -F mysql_kvbackend
run: cargo llvm-cov nextest --workspace --lcov --output-path lcov.info -F dashboard -F pg_kvbackend -F mysql_kvbackend -F vector_index
env:
CARGO_BUILD_RUSTFLAGS: "-C link-arg=-fuse-ld=mold"
RUST_BACKTRACE: 1

6
Cargo.lock generated
View File

@@ -5466,7 +5466,7 @@ dependencies = [
[[package]]
name = "greptime-proto"
version = "0.1.0"
source = "git+https://github.com/GreptimeTeam/greptime-proto.git?rev=520fa524f9d590752ea327683e82ffd65721b27c#520fa524f9d590752ea327683e82ffd65721b27c"
source = "git+https://github.com/GreptimeTeam/greptime-proto.git?rev=a2e5099d72a1cfa8ba41fa4296101eb5f874074a#a2e5099d72a1cfa8ba41fa4296101eb5f874074a"
dependencies = [
"prost 0.13.5",
"prost-types 0.13.5",
@@ -7779,7 +7779,6 @@ dependencies = [
"either",
"futures",
"greptime-proto",
"humantime",
"humantime-serde",
"index",
"itertools 0.14.0",
@@ -7798,6 +7797,7 @@ dependencies = [
"rand 0.9.1",
"rayon",
"regex",
"roaring",
"rskafka",
"rstest",
"rstest_reuse",
@@ -7816,6 +7816,7 @@ dependencies = [
"tokio-util",
"toml 0.8.23",
"tracing",
"usearch",
"uuid",
]
@@ -9473,6 +9474,7 @@ dependencies = [
"ahash 0.8.12",
"api",
"arrow",
"arrow-schema",
"async-trait",
"catalog",
"chrono",

View File

@@ -151,7 +151,7 @@ etcd-client = { version = "0.16.1", features = [
fst = "0.4.7"
futures = "0.3"
futures-util = "0.3"
greptime-proto = { git = "https://github.com/GreptimeTeam/greptime-proto.git", rev = "520fa524f9d590752ea327683e82ffd65721b27c" }
greptime-proto = { git = "https://github.com/GreptimeTeam/greptime-proto.git", rev = "a2e5099d72a1cfa8ba41fa4296101eb5f874074a" }
hex = "0.4"
http = "1"
humantime = "2.1"

View File

@@ -17,7 +17,7 @@ Release date: {{ timestamp | date(format="%B %d, %Y") }}
{%- set breakings = commits | filter(attribute="breaking", value=true) -%}
{%- if breakings | length > 0 %}
## Breaking changes
### Breaking changes
{% for commit in breakings %}
* {{ commit.github.pr_title }}\
{% if commit.github.username %} by \

View File

@@ -895,7 +895,7 @@ pub fn is_column_type_value_eq(
.unwrap_or(false)
}
fn encode_json_value(value: JsonValue) -> v1::JsonValue {
pub fn encode_json_value(value: JsonValue) -> v1::JsonValue {
fn helper(json: JsonVariant) -> v1::JsonValue {
let value = match json {
JsonVariant::Null => None,

View File

@@ -17,8 +17,8 @@ use std::collections::HashMap;
use arrow_schema::extension::{EXTENSION_TYPE_METADATA_KEY, EXTENSION_TYPE_NAME_KEY};
use datatypes::schema::{
COMMENT_KEY, ColumnDefaultConstraint, ColumnSchema, FULLTEXT_KEY, FulltextAnalyzer,
FulltextBackend, FulltextOptions, INVERTED_INDEX_KEY, SKIPPING_INDEX_KEY, SkippingIndexOptions,
SkippingIndexType,
FulltextBackend, FulltextOptions, INVERTED_INDEX_KEY, Metadata, SKIPPING_INDEX_KEY,
SkippingIndexOptions, SkippingIndexType,
};
use greptime_proto::v1::{
Analyzer, FulltextBackend as PbFulltextBackend, SkippingIndexType as PbSkippingIndexType,
@@ -36,6 +36,14 @@ const INVERTED_INDEX_GRPC_KEY: &str = "inverted_index";
/// Key used to store skip index options in gRPC column options.
const SKIPPING_INDEX_GRPC_KEY: &str = "skipping_index";
/// Pairs of `(gRPC column option key, column schema metadata key)`.
///
/// `collect_column_options` looks up the first element of each pair in a `ColumnOptions` and,
/// when present, stores the value under the second element. The two extension-type keys are
/// paired with themselves.
const COLUMN_OPTION_MAPPINGS: [(&str, &str); 5] = [
(FULLTEXT_GRPC_KEY, FULLTEXT_KEY),
(INVERTED_INDEX_GRPC_KEY, INVERTED_INDEX_KEY),
(SKIPPING_INDEX_GRPC_KEY, SKIPPING_INDEX_KEY),
(EXTENSION_TYPE_NAME_KEY, EXTENSION_TYPE_NAME_KEY),
(EXTENSION_TYPE_METADATA_KEY, EXTENSION_TYPE_METADATA_KEY),
];
/// Tries to construct a `ColumnSchema` from the given `ColumnDef`.
pub fn try_as_column_schema(column_def: &ColumnDef) -> Result<ColumnSchema> {
let data_type = ColumnDataTypeWrapper::try_new(
@@ -131,6 +139,21 @@ pub fn try_as_column_def(column_schema: &ColumnSchema, is_primary_key: bool) ->
})
}
/// Converts optional gRPC [ColumnOptions] into a [Metadata] map, e.g. for a [ColumnSchema].
///
/// Only the option keys listed in `COLUMN_OPTION_MAPPINGS` are carried over; each value is
/// stored under the metadata key paired with its option key. Returns an empty [Metadata] when
/// `column_options` is `None`.
pub fn collect_column_options(column_options: Option<&ColumnOptions>) -> Metadata {
    let options = match column_options {
        Some(ColumnOptions { options }) => options,
        None => return Metadata::default(),
    };

    let mut metadata = Metadata::with_capacity(options.len());
    metadata.extend(
        COLUMN_OPTION_MAPPINGS
            .iter()
            .filter_map(|&(option_key, metadata_key)| {
                options
                    .get(option_key)
                    .map(|value| (metadata_key.to_string(), value.clone()))
            }),
    );
    metadata
}
/// Constructs a `ColumnOptions` from the given `ColumnSchema`.
pub fn options_from_column_schema(column_schema: &ColumnSchema) -> Option<ColumnOptions> {
let mut options = ColumnOptions::default();

View File

@@ -18,6 +18,7 @@ default = [
]
enterprise = ["common-meta/enterprise", "frontend/enterprise", "meta-srv/enterprise"]
tokio-console = ["common-telemetry/tokio-console"]
vector_index = ["mito2/vector_index"]
[lints]
workspace = true

View File

@@ -233,6 +233,8 @@ impl ObjbenchCommand {
inverted_index_config: MitoConfig::default().inverted_index,
fulltext_index_config,
bloom_filter_index_config: MitoConfig::default().bloom_filter_index,
#[cfg(feature = "vector_index")]
vector_index_config: Default::default(),
};
// Write SST

View File

@@ -27,7 +27,7 @@ use datafusion_common::arrow::datatypes::DataType;
use datafusion_common::{DataFusionError, Result};
use datafusion_expr::type_coercion::aggregates::STRINGS;
use datafusion_expr::{ColumnarValue, ScalarFunctionArgs, Signature, Volatility};
use datatypes::arrow_array::string_array_value_at_index;
use datatypes::arrow_array::{int_array_value_at_index, string_array_value_at_index};
use datatypes::json::JsonStructureSettings;
use jsonpath_rust::JsonPath;
use serde_json::Value;
@@ -131,13 +131,6 @@ macro_rules! json_get {
};
}
json_get!(
JsonGetInt,
Int64,
i64,
"Get the value from the JSONB by the given path and return it as an integer."
);
json_get!(
JsonGetFloat,
Float64,
@@ -152,17 +145,65 @@ json_get!(
"Get the value from the JSONB by the given path and return it as a boolean."
);
/// Get the value from the JSONB by the given path and return it as a string.
#[derive(Clone, Debug)]
pub struct JsonGetString {
/// A value located by a JSON path, handed to a `JsonGetResultBuilder` for conversion into the
/// function's output type.
enum JsonResultValue<'a> {
/// Bytes returned by `get_json_by_path` for a binary (JSONB) input row.
Jsonb(Vec<u8>),
/// A child column of a JSON struct array, together with the row index to read from it.
JsonStructByColumn(&'a ArrayRef, usize),
/// A `serde_json` value produced while resolving the path against a JSON struct row.
JsonStructByValue(&'a Value),
}
/// Sink for the values found by a JSON get function.
///
/// Each implementation decides how a `JsonResultValue` is converted into its own output
/// array type.
trait JsonGetResultBuilder {
/// Appends one row derived from `value`.
fn append_value(&mut self, value: JsonResultValue<'_>) -> Result<()>;
/// Appends a null row.
fn append_null(&mut self);
/// Produces the output array from the rows appended so far.
fn build(&mut self) -> ArrayRef;
}
/// Common implementation for JSON get scalar functions.
///
/// `JsonGet` encapsulates the logic for extracting values from JSON inputs
/// based on a path expression. Different JSON get functions reuse this
/// implementation by supplying their own `JsonGetResultBuilder` to control
/// how the resulting values are materialized into an Arrow array.
struct JsonGet {
/// Signature that the wrapping functions return from `Function::signature`.
signature: Signature,
}
impl JsonGetString {
pub const NAME: &'static str = "json_get_string";
impl JsonGet {
    /// Evaluates a two-argument JSON get call and returns the result as an array.
    ///
    /// The first argument holds the JSON values and must be a binary array (`Binary`,
    /// `LargeBinary` or `BinaryView`) or a `Struct` array; the second holds the paths and is
    /// cast to `Utf8View`. `builder_factory` receives the length of the JSON argument and
    /// returns the builder that collects one output row per input row.
    ///
    /// # Errors
    ///
    /// Returns an execution error when the first argument has any other data type, and
    /// propagates errors from argument extraction, casting and the per-row lookup.
    fn invoke<F, B>(&self, args: ScalarFunctionArgs, builder_factory: F) -> Result<ColumnarValue>
    where
        F: Fn(usize) -> B,
        B: JsonGetResultBuilder,
    {
        let [json_arg, path_arg] = extract_args("JSON_GET", &args)?;
        let path_arg = compute::cast(&path_arg, &DataType::Utf8View)?;
        let paths = path_arg.as_string_view();

        let mut builder = builder_factory(json_arg.len());

        let json_type = json_arg.data_type();
        if matches!(
            json_type,
            DataType::Binary | DataType::LargeBinary | DataType::BinaryView
        ) {
            // Normalize every binary flavour to a view array before the row-wise lookup.
            let binary = compute::cast(&json_arg, &DataType::BinaryView)?;
            jsonb_get(binary.as_binary_view(), paths, &mut builder)?;
        } else if matches!(json_type, DataType::Struct(_)) {
            json_struct_get(json_arg.as_struct(), paths, &mut builder)?;
        } else {
            return Err(DataFusionError::Execution(format!(
                "JSON_GET not supported argument type {}",
                json_type,
            )));
        }

        Ok(ColumnarValue::Array(builder.build()))
    }
}
impl Default for JsonGetString {
impl Default for JsonGet {
fn default() -> Self {
Self {
signature: Signature::any(2, Volatility::Immutable),
@@ -170,6 +211,13 @@ impl Default for JsonGetString {
}
}
/// The `json_get_string` scalar function: gets the value at the given path and returns it as
/// a string. Wraps the shared `JsonGet` implementation.
#[derive(Default)]
pub struct JsonGetString(JsonGet);
impl JsonGetString {
/// Name under which the function is exposed.
pub const NAME: &'static str = "json_get_string";
}
impl Function for JsonGetString {
fn name(&self) -> &str {
Self::NAME
@@ -180,61 +228,142 @@ impl Function for JsonGetString {
}
fn signature(&self) -> &Signature {
&self.signature
&self.0.signature
}
fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
let [arg0, arg1] = extract_args(self.name(), &args)?;
struct StringResultBuilder(StringViewBuilder);
let arg1 = compute::cast(&arg1, &DataType::Utf8View)?;
let paths = arg1.as_string_view();
impl JsonGetResultBuilder for StringResultBuilder {
fn append_value(&mut self, value: JsonResultValue<'_>) -> Result<()> {
match value {
JsonResultValue::Jsonb(value) => {
self.0.append_option(jsonb::to_str(&value).ok())
}
JsonResultValue::JsonStructByColumn(column, i) => {
if let Some(v) = string_array_value_at_index(column, i) {
self.0.append_value(v);
} else {
self.0
.append_value(arrow_cast::display::array_value_to_string(
column, i,
)?);
}
}
JsonResultValue::JsonStructByValue(value) => {
if let Some(s) = value.as_str() {
self.0.append_value(s)
} else {
self.0.append_value(value.to_string())
}
}
}
Ok(())
}
let result = match arg0.data_type() {
DataType::Binary | DataType::LargeBinary | DataType::BinaryView => {
let arg0 = compute::cast(&arg0, &DataType::BinaryView)?;
let jsons = arg0.as_binary_view();
jsonb_get_string(jsons, paths)?
fn append_null(&mut self) {
self.0.append_null();
}
DataType::Struct(_) => {
let jsons = arg0.as_struct();
json_struct_get_string(jsons, paths)?
}
_ => {
return Err(DataFusionError::Execution(format!(
"{} not supported argument type {}",
Self::NAME,
arg0.data_type(),
)));
}
};
Ok(ColumnarValue::Array(result))
fn build(&mut self) -> ArrayRef {
Arc::new(self.0.finish())
}
}
self.0.invoke(args, |len: usize| {
StringResultBuilder(StringViewBuilder::with_capacity(len))
})
}
}
fn jsonb_get_string(jsons: &BinaryViewArray, paths: &StringViewArray) -> Result<ArrayRef> {
let size = jsons.len();
let mut builder = StringViewBuilder::with_capacity(size);
/// The `json_get_int` scalar function: gets the value at the given path and returns it as
/// an integer. Wraps the shared `JsonGet` implementation.
#[derive(Default)]
pub struct JsonGetInt(JsonGet);
impl JsonGetInt {
/// Name under which the function is exposed.
pub const NAME: &'static str = "json_get_int";
}
impl Function for JsonGetInt {
    fn name(&self) -> &str {
        Self::NAME
    }

    /// The result is always `Int64`, whatever the argument types are.
    fn return_type(&self, _: &[DataType]) -> Result<DataType> {
        Ok(DataType::Int64)
    }

    fn signature(&self) -> &Signature {
        &self.0.signature
    }

    /// Delegates to the shared `JsonGet::invoke`, collecting results into an `Int64` array.
    fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
        /// Builds the `Int64` output; a value that cannot be read as an `i64` becomes null.
        struct IntResultBuilder(Int64Builder);

        impl JsonGetResultBuilder for IntResultBuilder {
            fn append_value(&mut self, value: JsonResultValue<'_>) -> Result<()> {
                let parsed = match value {
                    JsonResultValue::Jsonb(bytes) => jsonb::to_i64(&bytes).ok(),
                    JsonResultValue::JsonStructByColumn(column, row) => {
                        int_array_value_at_index(column, row)
                    }
                    JsonResultValue::JsonStructByValue(json) => json.as_i64(),
                };
                self.0.append_option(parsed);
                Ok(())
            }

            fn append_null(&mut self) {
                self.0.append_null();
            }

            fn build(&mut self) -> ArrayRef {
                Arc::new(self.0.finish())
            }
        }

        self.0.invoke(args, |capacity| {
            IntResultBuilder(Int64Builder::with_capacity(capacity))
        })
    }
}
impl Display for JsonGetInt {
    /// Writes the function name in upper case.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let upper_name = Self::NAME.to_ascii_uppercase();
        f.write_str(&upper_name)
    }
}
fn jsonb_get(
jsons: &BinaryViewArray,
paths: &StringViewArray,
builder: &mut impl JsonGetResultBuilder,
) -> Result<()> {
let size = jsons.len();
for i in 0..size {
let json = jsons.is_valid(i).then(|| jsons.value(i));
let path = paths.is_valid(i).then(|| paths.value(i));
let result = match (json, path) {
(Some(json), Some(path)) => {
get_json_by_path(json, path).and_then(|json| jsonb::to_str(&json).ok())
}
(Some(json), Some(path)) => get_json_by_path(json, path),
_ => None,
};
builder.append_option(result);
if let Some(v) = result {
builder.append_value(JsonResultValue::Jsonb(v))?;
} else {
builder.append_null();
}
}
Ok(Arc::new(builder.finish()))
Ok(())
}
fn json_struct_get_string(jsons: &StructArray, paths: &StringViewArray) -> Result<ArrayRef> {
fn json_struct_get(
jsons: &StructArray,
paths: &StringViewArray,
builder: &mut impl JsonGetResultBuilder,
) -> Result<()> {
let size = jsons.len();
let mut builder = StringViewBuilder::with_capacity(size);
for i in 0..size {
if jsons.is_null(i) || paths.is_null(i) {
builder.append_null();
@@ -247,11 +376,7 @@ fn json_struct_get_string(jsons: &StructArray, paths: &StringViewArray) -> Resul
let column = jsons.column_by_name(&field_path);
if let Some(column) = column {
if let Some(v) = string_array_value_at_index(column, i) {
builder.append_value(v);
} else {
builder.append_value(arrow_cast::display::array_value_to_string(column, i)?);
}
builder.append_value(JsonResultValue::JsonStructByColumn(column, i))?;
} else {
let Some(raw) = jsons
.column_by_name(JsonStructureSettings::RAW_FIELD)
@@ -272,27 +397,15 @@ fn json_struct_get_string(jsons: &StructArray, paths: &StringViewArray) -> Resul
Value::Null => builder.append_null(),
Value::Array(values) => match values.as_slice() {
[] => builder.append_null(),
[x] => {
if let Some(s) = x.as_str() {
builder.append_value(s)
} else {
builder.append_value(x.to_string())
}
}
x => builder.append_value(
x.iter()
.map(|v| v.to_string())
.collect::<Vec<_>>()
.join(", "),
),
[x] => builder.append_value(JsonResultValue::JsonStructByValue(x))?,
_ => builder.append_value(JsonResultValue::JsonStructByValue(&value))?,
},
// Safety: guarded by the returns of `path.find` as documented
_ => unreachable!(),
value => builder.append_value(JsonResultValue::JsonStructByValue(&value))?,
}
}
}
Ok(Arc::new(builder.finish()))
Ok(())
}
fn json_struct_to_value(raw: &str, jsons: &StructArray, i: usize) -> Result<Value> {
@@ -479,6 +592,50 @@ mod tests {
use super::*;
/// Builds a one-row struct array that represents the following JSON object:
///
/// ```JSON
/// {
///     "kind": "foo",
///     "payload": {
///         "code": 404,
///         "success": false,
///         "result": {
///             "error": "not found",
///             "time_cost": 1.234
///         }
///     }
/// }
/// ```
///
/// `kind`, `payload.code` and `payload.result.time_cost` are stored as typed columns; the
/// remaining members (`payload.success`, `payload.result.error`) are only present in the
/// serialized raw field.
fn test_json_struct() -> ArrayRef {
    let raw_json = json! ({
        "payload": {
            "success": false,
            "result": {
                "error": "not found"
            }
        }
    })
    .to_string();

    let fields = vec![
        Field::new("kind", DataType::Utf8, true),
        Field::new("payload.code", DataType::Int64, true),
        Field::new("payload.result.time_cost", DataType::Float64, true),
        Field::new(JsonStructureSettings::RAW_FIELD, DataType::Utf8View, true),
    ];
    let columns = vec![
        Arc::new(StringArray::from_iter([Some("foo")])) as ArrayRef,
        Arc::new(Int64Array::from_iter([Some(404)])),
        Arc::new(Float64Array::from_iter([Some(1.234)])),
        Arc::new(StringViewArray::from_iter([Some(raw_json)])),
    ];

    Arc::new(StructArray::new(fields.into(), columns, None))
}
#[test]
fn test_json_get_int() {
let json_get_int = JsonGetInt::default();
@@ -496,37 +653,55 @@ mod tests {
r#"{"a": 4, "b": {"c": 6}, "c": 6}"#,
r#"{"a": 7, "b": 8, "c": {"a": 7}}"#,
];
let paths = vec!["$.a.b", "$.a", "$.c"];
let results = [Some(2), Some(4), None];
let json_struct = test_json_struct();
let jsonbs = json_strings
let path_expects = vec![
("$.a.b", Some(2)),
("$.a", Some(4)),
("$.c", None),
("$.kind", None),
("$.payload.code", Some(404)),
("$.payload.success", None),
("$.payload.result.time_cost", None),
("$.payload.not-exists", None),
("$.not-exists", None),
("$", None),
];
let mut jsons = json_strings
.iter()
.map(|s| {
let value = jsonb::parse_value(s.as_bytes()).unwrap();
value.to_vec()
Arc::new(BinaryArray::from_iter_values([value.to_vec()])) as ArrayRef
})
.collect::<Vec<_>>();
let json_struct_arrays =
std::iter::repeat_n(json_struct, path_expects.len() - jsons.len()).collect::<Vec<_>>();
jsons.extend(json_struct_arrays);
let args = ScalarFunctionArgs {
args: vec![
ColumnarValue::Array(Arc::new(BinaryArray::from_iter_values(jsonbs))),
ColumnarValue::Array(Arc::new(StringArray::from_iter_values(paths))),
],
arg_fields: vec![],
number_rows: 3,
return_field: Arc::new(Field::new("x", DataType::Int64, false)),
config_options: Arc::new(Default::default()),
};
let result = json_get_int
.invoke_with_args(args)
.and_then(|x| x.to_array(3))
.unwrap();
let vector = result.as_primitive::<Int64Type>();
for i in 0..jsons.len() {
let json = &jsons[i];
let (path, expect) = path_expects[i];
assert_eq!(3, vector.len());
for (i, gt) in results.iter().enumerate() {
let result = vector.is_valid(i).then(|| vector.value(i));
assert_eq!(*gt, result);
let args = ScalarFunctionArgs {
args: vec![
ColumnarValue::Array(json.clone()),
ColumnarValue::Scalar(path.into()),
],
arg_fields: vec![],
number_rows: 1,
return_field: Arc::new(Field::new("x", DataType::Int64, false)),
config_options: Arc::new(Default::default()),
};
let result = json_get_int
.invoke_with_args(args)
.and_then(|x| x.to_array(1))
.unwrap();
let result = result.as_primitive::<Int64Type>();
assert_eq!(1, result.len());
let actual = result.is_valid(0).then(|| result.value(0));
assert_eq!(actual, expect);
}
}
@@ -649,45 +824,7 @@ mod tests {
r#"{"a": "d", "b": {"c": "e"}, "c": "f"}"#,
r#"{"a": "g", "b": "h", "c": {"a": "g"}}"#,
];
// complete JSON is:
// {
// "kind": "foo",
// "payload": {
// "code": 404,
// "success": false,
// "result": {
// "error": "not found",
// "time_cost": 1.234
// }
// }
// }
let json_struct: ArrayRef = Arc::new(StructArray::new(
vec![
Field::new("kind", DataType::Utf8, true),
Field::new("payload.code", DataType::Int64, true),
Field::new("payload.result.time_cost", DataType::Float64, true),
Field::new(JsonStructureSettings::RAW_FIELD, DataType::Utf8View, true),
]
.into(),
vec![
Arc::new(StringArray::from_iter([Some("foo")])) as ArrayRef,
Arc::new(Int64Array::from_iter([Some(404)])),
Arc::new(Float64Array::from_iter([Some(1.234)])),
Arc::new(StringViewArray::from_iter([Some(
json! ({
"payload": {
"success": false,
"result": {
"error": "not found"
}
}
})
.to_string(),
)])),
],
None,
));
let json_struct = test_json_struct();
let paths = vec![
"$.a.b",

View File

@@ -15,9 +15,10 @@
use arrow::array::{ArrayRef, AsArray};
use arrow::datatypes::{
DataType, DurationMicrosecondType, DurationMillisecondType, DurationNanosecondType,
DurationSecondType, Time32MillisecondType, Time32SecondType, Time64MicrosecondType,
Time64NanosecondType, TimeUnit, TimestampMicrosecondType, TimestampMillisecondType,
TimestampNanosecondType, TimestampSecondType,
DurationSecondType, Int8Type, Int16Type, Int32Type, Int64Type, Time32MillisecondType,
Time32SecondType, Time64MicrosecondType, Time64NanosecondType, TimeUnit,
TimestampMicrosecondType, TimestampMillisecondType, TimestampNanosecondType,
TimestampSecondType, UInt8Type, UInt16Type, UInt32Type, UInt64Type,
};
use arrow_array::Array;
use common_time::time::Time;
@@ -152,3 +153,62 @@ pub fn string_array_value_at_index(array: &ArrayRef, i: usize) -> Option<&str> {
_ => None,
}
}
/// Reads the value at index `i` of any integer array as an `i64`.
///
/// Returns `None` when:
///
/// - the array type is not an integer type;
/// - the value is larger than `i64::MAX` (only possible for `UInt64` arrays);
/// - the value is null.
///
/// # Panics
///
/// If index `i` is out of bounds.
pub fn int_array_value_at_index(array: &ArrayRef, i: usize) -> Option<i64> {
    // Reads slot `i` of a primitive array whose native type always fits into an `i64`.
    macro_rules! widened {
        ($arrow_type:ty) => {{
            let values = array.as_primitive::<$arrow_type>();
            values.is_valid(i).then(|| i64::from(values.value(i)))
        }};
    }

    match array.data_type() {
        DataType::Int8 => widened!(Int8Type),
        DataType::Int16 => widened!(Int16Type),
        DataType::Int32 => widened!(Int32Type),
        DataType::Int64 => {
            let values = array.as_primitive::<Int64Type>();
            values.is_valid(i).then(|| values.value(i))
        }
        DataType::UInt8 => widened!(UInt8Type),
        DataType::UInt16 => widened!(UInt16Type),
        DataType::UInt32 => widened!(UInt32Type),
        DataType::UInt64 => {
            let values = array.as_primitive::<UInt64Type>();
            values
                .is_valid(i)
                .then(|| values.value(i))
                // A `u64` above `i64::MAX` has no `i64` representation.
                .and_then(|raw| i64::try_from(raw).ok())
        }
        _ => None,
    }
}

View File

@@ -816,7 +816,7 @@ mod tests {
let result = encode_by_struct(&json_struct, json);
assert_eq!(
result.unwrap_err().to_string(),
"Cannot cast value bar to Number(I64)"
r#"Cannot cast value bar to "<Number>""#
);
let json = json!({

View File

@@ -13,7 +13,7 @@
// limitations under the License.
use std::collections::BTreeMap;
use std::fmt::{Display, Formatter};
use std::fmt::{Debug, Display, Formatter};
use std::str::FromStr;
use std::sync::Arc;
@@ -133,28 +133,24 @@ impl From<&ConcreteDataType> for JsonNativeType {
impl Display for JsonNativeType {
fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
match self {
JsonNativeType::Null => write!(f, "Null"),
JsonNativeType::Bool => write!(f, "Bool"),
JsonNativeType::Number(t) => {
write!(f, "Number({t:?})")
}
JsonNativeType::String => write!(f, "String"),
JsonNativeType::Array(item_type) => {
write!(f, "Array[{}]", item_type)
}
JsonNativeType::Object(object) => {
write!(
f,
"Object{{{}}}",
fn to_serde_value(t: &JsonNativeType) -> serde_json::Value {
match t {
JsonNativeType::Null => serde_json::Value::String("<Null>".to_string()),
JsonNativeType::Bool => serde_json::Value::String("<Bool>".to_string()),
JsonNativeType::Number(_) => serde_json::Value::String("<Number>".to_string()),
JsonNativeType::String => serde_json::Value::String("<String>".to_string()),
JsonNativeType::Array(item_type) => {
serde_json::Value::Array(vec![to_serde_value(item_type)])
}
JsonNativeType::Object(object) => serde_json::Value::Object(
object
.iter()
.map(|(k, v)| format!(r#""{k}": {v}"#))
.collect::<Vec<_>>()
.join(", ")
)
.map(|(k, v)| (k.clone(), to_serde_value(v)))
.collect(),
),
}
}
write!(f, "{}", to_serde_value(self))
}
}
@@ -183,7 +179,11 @@ impl JsonType {
}
}
pub(crate) fn native_type(&self) -> &JsonNativeType {
/// Returns `true` when this JSON type's format is `JsonFormat::Native`.
pub fn is_native_type(&self) -> bool {
matches!(self.format, JsonFormat::Native(_))
}
pub fn native_type(&self) -> &JsonNativeType {
match &self.format {
JsonFormat::Jsonb => &JsonNativeType::String,
JsonFormat::Native(x) => x.as_ref(),
@@ -650,15 +650,16 @@ mod tests {
"list": [1, 2, 3],
"object": {"a": 1}
}"#;
let expected = r#"Json<Object{"hello": String, "list": Array[Number(I64)], "object": Object{"a": Number(I64)}}>"#;
let expected =
r#"Json<{"hello":"<String>","list":["<Number>"],"object":{"a":"<Number>"}}>"#;
test(json, json_type, Ok(expected))?;
// cannot merge with other non-object json values:
let jsons = [r#""s""#, "1", "[1]"];
let expects = [
r#"Failed to merge JSON datatype: datatypes have conflict, this: Object{"hello": String, "list": Array[Number(I64)], "object": Object{"a": Number(I64)}}, that: String"#,
r#"Failed to merge JSON datatype: datatypes have conflict, this: Object{"hello": String, "list": Array[Number(I64)], "object": Object{"a": Number(I64)}}, that: Number(I64)"#,
r#"Failed to merge JSON datatype: datatypes have conflict, this: Object{"hello": String, "list": Array[Number(I64)], "object": Object{"a": Number(I64)}}, that: Array[Number(I64)]"#,
r#"Failed to merge JSON datatype: datatypes have conflict, this: {"hello":"<String>","list":["<Number>"],"object":{"a":"<Number>"}}, that: "<String>""#,
r#"Failed to merge JSON datatype: datatypes have conflict, this: {"hello":"<String>","list":["<Number>"],"object":{"a":"<Number>"}}, that: "<Number>""#,
r#"Failed to merge JSON datatype: datatypes have conflict, this: {"hello":"<String>","list":["<Number>"],"object":{"a":"<Number>"}}, that: ["<Number>"]"#,
];
for (json, expect) in jsons.into_iter().zip(expects.into_iter()) {
test(json, json_type, Err(expect))?;
@@ -670,7 +671,7 @@ mod tests {
"float": 0.123,
"no": 42
}"#;
let expected = r#"Failed to merge JSON datatype: datatypes have conflict, this: String, that: Number(I64)"#;
let expected = r#"Failed to merge JSON datatype: datatypes have conflict, this: "<String>", that: "<Number>""#;
test(json, json_type, Err(expected))?;
// can merge with another json object:
@@ -679,7 +680,7 @@ mod tests {
"float": 0.123,
"int": 42
}"#;
let expected = r#"Json<Object{"float": Number(F64), "hello": String, "int": Number(I64), "list": Array[Number(I64)], "object": Object{"a": Number(I64)}}>"#;
let expected = r#"Json<{"float":"<Number>","hello":"<String>","int":"<Number>","list":["<Number>"],"object":{"a":"<Number>"}}>"#;
test(json, json_type, Ok(expected))?;
// can merge with some complex nested json object:
@@ -689,7 +690,7 @@ mod tests {
"float": 0.456,
"int": 0
}"#;
let expected = r#"Json<Object{"float": Number(F64), "hello": String, "int": Number(I64), "list": Array[Number(I64)], "object": Object{"a": Number(I64), "foo": String, "l": Array[String], "o": Object{"key": String}}}>"#;
let expected = r#"Json<{"float":"<Number>","hello":"<String>","int":"<Number>","list":["<Number>"],"object":{"a":"<Number>","foo":"<String>","l":["<String>"],"o":{"key":"<String>"}}}>"#;
test(json, json_type, Ok(expected))?;
Ok(())

View File

@@ -321,10 +321,10 @@ mod tests {
Ok(()),
Ok(()),
Err(
"Failed to merge JSON datatype: datatypes have conflict, this: Number(I64), that: String",
r#"Failed to merge JSON datatype: datatypes have conflict, this: "<Number>", that: "<String>""#,
),
Err(
"Failed to merge JSON datatype: datatypes have conflict, this: Number(I64), that: Array[Bool]",
r#"Failed to merge JSON datatype: datatypes have conflict, this: "<Number>", that: ["<Bool>"]"#,
),
];
let mut builder = JsonVectorBuilder::new(JsonNativeType::Null, 1);
@@ -396,12 +396,12 @@ mod tests {
// test children builders:
assert_eq!(builder.builders.len(), 6);
let expect_types = [
r#"Json<Object{"list": Array[Number(I64)], "s": String}>"#,
r#"Json<Object{"float": Number(F64), "s": String}>"#,
r#"Json<Object{"float": Number(F64), "int": Number(I64)}>"#,
r#"Json<Object{"int": Number(I64), "object": Object{"hello": String, "timestamp": Number(I64)}}>"#,
r#"Json<Object{"nested": Object{"a": Object{"b": Object{"b": Object{"a": String}}}}, "object": Object{"timestamp": Number(I64)}}>"#,
r#"Json<Object{"nested": Object{"a": Object{"b": Object{"a": Object{"b": String}}}}, "object": Object{"timestamp": Number(I64)}}>"#,
r#"Json<{"list":["<Number>"],"s":"<String>"}>"#,
r#"Json<{"float":"<Number>","s":"<String>"}>"#,
r#"Json<{"float":"<Number>","int":"<Number>"}>"#,
r#"Json<{"int":"<Number>","object":{"hello":"<String>","timestamp":"<Number>"}}>"#,
r#"Json<{"nested":{"a":{"b":{"b":{"a":"<String>"}}}},"object":{"timestamp":"<Number>"}}>"#,
r#"Json<{"nested":{"a":{"b":{"a":{"b":"<String>"}}}},"object":{"timestamp":"<Number>"}}>"#,
];
let expect_vectors = [
r#"
@@ -456,7 +456,7 @@ mod tests {
}
// test final merged json type:
let expected = r#"Json<Object{"float": Number(F64), "int": Number(I64), "list": Array[Number(I64)], "nested": Object{"a": Object{"b": Object{"a": Object{"b": String}, "b": Object{"a": String}}}}, "object": Object{"hello": String, "timestamp": Number(I64)}, "s": String}>"#;
let expected = r#"Json<{"float":"<Number>","int":"<Number>","list":["<Number>"],"nested":{"a":{"b":{"a":{"b":"<String>"},"b":{"a":"<String>"}}}},"object":{"hello":"<String>","timestamp":"<Number>"},"s":"<String>"}>"#;
assert_eq!(builder.data_type().to_string(), expected);
// test final produced vector:

View File

@@ -8,6 +8,7 @@ license.workspace = true
default = []
test = ["common-test-util", "rstest", "rstest_reuse", "rskafka"]
enterprise = []
vector_index = ["dep:usearch", "dep:roaring", "index/vector_index"]
[lints]
workspace = true
@@ -28,9 +29,10 @@ common-datasource.workspace = true
common-decimal.workspace = true
common-error.workspace = true
common-grpc.workspace = true
common-function.workspace = true
common-macro.workspace = true
common-meta.workspace = true
common-memory-manager.workspace = true
common-meta.workspace = true
common-query.workspace = true
common-recordbatch.workspace = true
common-runtime.workspace = true
@@ -49,7 +51,6 @@ dotenv.workspace = true
either.workspace = true
futures.workspace = true
humantime-serde.workspace = true
humantime.workspace = true
index.workspace = true
itertools.workspace = true
greptime-proto.workspace = true
@@ -67,6 +68,7 @@ partition.workspace = true
puffin.workspace = true
rand.workspace = true
rayon = "1.10"
roaring = { version = "0.10", optional = true }
regex.workspace = true
rskafka = { workspace = true, optional = true }
rstest = { workspace = true, optional = true }
@@ -84,6 +86,7 @@ tokio.workspace = true
tokio-stream.workspace = true
tokio-util.workspace = true
tracing.workspace = true
usearch = { version = "2.21", default-features = false, features = ["fp16lib"], optional = true }
uuid.workspace = true
[dev-dependencies]

View File

@@ -313,6 +313,8 @@ impl AccessLayer {
inverted_index_config: request.inverted_index_config,
fulltext_index_config: request.fulltext_index_config,
bloom_filter_index_config: request.bloom_filter_index_config,
#[cfg(feature = "vector_index")]
vector_index_config: request.vector_index_config,
};
// We disable write cache on file system but we still use atomic write.
// TODO(yingwen): If we support other non-fs stores without the write cache, then
@@ -467,6 +469,8 @@ pub struct SstWriteRequest {
pub inverted_index_config: InvertedIndexConfig,
pub fulltext_index_config: FulltextIndexConfig,
pub bloom_filter_index_config: BloomFilterConfig,
#[cfg(feature = "vector_index")]
pub vector_index_config: crate::config::VectorIndexConfig,
}
/// Cleaner to remove temp files on the atomic write dir.

View File

@@ -227,6 +227,8 @@ impl WriteCache {
inverted_index_config: write_request.inverted_index_config,
fulltext_index_config: write_request.fulltext_index_config,
bloom_filter_index_config: write_request.bloom_filter_index_config,
#[cfg(feature = "vector_index")]
vector_index_config: write_request.vector_index_config,
};
let cleaner = TempFileCleaner::new(region_id, store.clone());
@@ -520,6 +522,8 @@ mod tests {
inverted_index_config: Default::default(),
fulltext_index_config: Default::default(),
bloom_filter_index_config: Default::default(),
#[cfg(feature = "vector_index")]
vector_index_config: Default::default(),
};
let upload_request = SstUploadRequest {
@@ -620,6 +624,8 @@ mod tests {
inverted_index_config: Default::default(),
fulltext_index_config: Default::default(),
bloom_filter_index_config: Default::default(),
#[cfg(feature = "vector_index")]
vector_index_config: Default::default(),
};
let write_opts = WriteOptions {
row_group_size: 512,
@@ -701,6 +707,8 @@ mod tests {
inverted_index_config: Default::default(),
fulltext_index_config: Default::default(),
bloom_filter_index_config: Default::default(),
#[cfg(feature = "vector_index")]
vector_index_config: Default::default(),
};
let write_opts = WriteOptions {
row_group_size: 512,

View File

@@ -332,6 +332,8 @@ impl DefaultCompactor {
let inverted_index_config = compaction_region.engine_config.inverted_index.clone();
let fulltext_index_config = compaction_region.engine_config.fulltext_index.clone();
let bloom_filter_index_config = compaction_region.engine_config.bloom_filter_index.clone();
#[cfg(feature = "vector_index")]
let vector_index_config = compaction_region.engine_config.vector_index.clone();
let input_file_names = output
.inputs
@@ -378,6 +380,8 @@ impl DefaultCompactor {
inverted_index_config,
fulltext_index_config,
bloom_filter_index_config,
#[cfg(feature = "vector_index")]
vector_index_config,
},
&write_opts,
&mut metrics,

View File

@@ -158,6 +158,9 @@ pub struct MitoConfig {
pub fulltext_index: FulltextIndexConfig,
/// Bloom filter index configs.
pub bloom_filter_index: BloomFilterConfig,
/// Vector index configs (HNSW).
#[cfg(feature = "vector_index")]
pub vector_index: VectorIndexConfig,
/// Memtable config
pub memtable: MemtableConfig,
@@ -214,6 +217,8 @@ impl Default for MitoConfig {
inverted_index: InvertedIndexConfig::default(),
fulltext_index: FulltextIndexConfig::default(),
bloom_filter_index: BloomFilterConfig::default(),
#[cfg(feature = "vector_index")]
vector_index: VectorIndexConfig::default(),
memtable: MemtableConfig::default(),
min_compaction_interval: Duration::from_secs(0),
default_experimental_flat_format: false,
@@ -643,6 +648,51 @@ impl BloomFilterConfig {
}
}
/// Configuration options for the vector index (HNSW).
///
/// Only compiled when the `vector_index` feature is enabled. Because of
/// `#[serde(default)]`, any field missing during deserialization is filled in
/// from the `Default` implementation of this struct.
#[cfg(feature = "vector_index")]
#[serde_as]
#[derive(Debug, Serialize, Deserialize, Clone, PartialEq, Eq)]
#[serde(default)]
pub struct VectorIndexConfig {
/// Whether to create the index on flush: automatically or never.
pub create_on_flush: Mode,
/// Whether to create the index on compaction: automatically or never.
pub create_on_compaction: Mode,
/// Whether to apply the index on query: automatically or never.
pub apply_on_query: Mode,
/// Memory threshold for creating the index.
///
/// The raw setting; `VectorIndexConfig::mem_threshold_on_create()` resolves it
/// into an optional byte count.
pub mem_threshold_on_create: MemoryThreshold,
}
#[cfg(feature = "vector_index")]
impl Default for VectorIndexConfig {
    /// Builds the default configuration: every mode is `Mode::Auto` and the
    /// creation memory threshold is `MemoryThreshold::Auto`.
    fn default() -> Self {
        VectorIndexConfig {
            // Index creation and application are all left in automatic mode.
            create_on_flush: Mode::Auto,
            create_on_compaction: Mode::Auto,
            apply_on_query: Mode::Auto,
            // The actual byte limit is resolved later by `mem_threshold_on_create()`.
            mem_threshold_on_create: MemoryThreshold::Auto,
        }
    }
}
#[cfg(feature = "vector_index")]
impl VectorIndexConfig {
    /// Resolves the configured memory threshold for index creation into bytes.
    ///
    /// - `MemoryThreshold::Unlimited` yields `None` (no limit).
    /// - `MemoryThreshold::Size(size)` yields exactly `size` in bytes.
    /// - `MemoryThreshold::Auto` yields the total system memory divided by
    ///   `INDEX_CREATE_MEM_THRESHOLD_FACTOR`, or `ReadableSize::mb(64)` when the
    ///   total memory is not available.
    pub fn mem_threshold_on_create(&self) -> Option<usize> {
        match self.mem_threshold_on_create {
            MemoryThreshold::Unlimited => None,
            MemoryThreshold::Size(size) => Some(size.as_bytes() as usize),
            MemoryThreshold::Auto => {
                let bytes = match get_total_memory_readable() {
                    Some(sys_memory) => {
                        (sys_memory / INDEX_CREATE_MEM_THRESHOLD_FACTOR).as_bytes()
                    }
                    // Fall back to a fixed budget when system memory is unknown.
                    None => ReadableSize::mb(64).as_bytes(),
                };
                Some(bytes as usize)
            }
        }
    }
}
/// Divide cpu num by a non-zero `divisor` and returns at least 1.
fn divide_num_cpus(divisor: usize) -> usize {
debug_assert!(divisor > 0);

View File

@@ -126,7 +126,7 @@ use crate::config::MitoConfig;
use crate::engine::puffin_index::{IndexEntryContext, collect_index_entries_from_puffin};
use crate::error::{
InvalidRequestSnafu, JoinSnafu, MitoManifestInfoSnafu, RecvSnafu, RegionNotFoundSnafu, Result,
SerdeJsonSnafu, SerializeColumnMetadataSnafu, SerializeManifestSnafu,
SerdeJsonSnafu, SerializeColumnMetadataSnafu,
};
#[cfg(feature = "enterprise")]
use crate::extension::BoxedExtensionRangeProviderFactory;
@@ -1057,19 +1057,8 @@ impl EngineInner {
let region_id = request.region_id;
let (request, receiver) = WorkerRequest::try_from_remap_manifests_request(request)?;
self.workers.submit_to_worker(region_id, request).await?;
let manifests = receiver.await.context(RecvSnafu)??;
let new_manifests = manifests
.into_iter()
.map(|(region_id, manifest)| {
Ok((
region_id,
serde_json::to_string(&manifest)
.context(SerializeManifestSnafu { region_id })?,
))
})
.collect::<Result<HashMap<_, _>>>()?;
Ok(RemapManifestsResponse { new_manifests })
let manifest_paths = receiver.await.context(RecvSnafu)??;
Ok(RemapManifestsResponse { manifest_paths })
}
async fn copy_region_from(

View File

@@ -69,7 +69,8 @@ async fn test_apply_staging_manifest_invalid_region_state_with_format(flat_forma
region_id,
RegionRequest::ApplyStagingManifest(ApplyStagingManifestRequest {
partition_expr: range_expr("x", 0, 100).as_json_str().unwrap(),
files_to_add: vec![],
central_region_id: RegionId::new(1, 0),
manifest_path: "manifest.json".to_string(),
}),
)
.await
@@ -88,7 +89,8 @@ async fn test_apply_staging_manifest_invalid_region_state_with_format(flat_forma
region_id,
RegionRequest::ApplyStagingManifest(ApplyStagingManifestRequest {
partition_expr: range_expr("x", 0, 100).as_json_str().unwrap(),
files_to_add: vec![],
central_region_id: RegionId::new(1, 0),
manifest_path: "manifest.json".to_string(),
}),
)
.await
@@ -136,7 +138,8 @@ async fn test_apply_staging_manifest_mismatched_partition_expr_with_format(flat_
region_id,
RegionRequest::ApplyStagingManifest(ApplyStagingManifestRequest {
partition_expr: range_expr("x", 0, 100).as_json_str().unwrap(),
files_to_add: vec![],
central_region_id: RegionId::new(1, 0),
manifest_path: "dummy".to_string(),
}),
)
.await
@@ -144,7 +147,36 @@ async fn test_apply_staging_manifest_mismatched_partition_expr_with_format(flat_
assert_matches!(
err.into_inner().as_any().downcast_ref::<Error>().unwrap(),
Error::StagingPartitionExprMismatch { .. }
)
);
// If staging manifest's partition expr is different from the request.
let result = engine
.remap_manifests(RemapManifestsRequest {
region_id,
input_regions: vec![region_id],
region_mapping: [(region_id, vec![region_id])].into_iter().collect(),
new_partition_exprs: [(region_id, range_expr("x", 0, 49).as_json_str().unwrap())]
.into_iter()
.collect(),
})
.await
.unwrap();
let err = engine
.handle_request(
region_id,
RegionRequest::ApplyStagingManifest(ApplyStagingManifestRequest {
partition_expr: range_expr("x", 0, 50).as_json_str().unwrap(),
central_region_id: region_id,
manifest_path: result.manifest_paths[&region_id].clone(),
}),
)
.await
.unwrap_err();
assert_matches!(
err.into_inner().as_any().downcast_ref::<Error>().unwrap(),
Error::StagingPartitionExprMismatch { .. }
);
}
#[tokio::test]
@@ -216,13 +248,26 @@ async fn test_apply_staging_manifest_success_with_format(flat_format: bool) {
})
.await
.unwrap();
assert_eq!(result.new_manifests.len(), 2);
let new_manifest_1 =
serde_json::from_str::<RegionManifest>(&result.new_manifests[&new_region_id_1]).unwrap();
let new_manifest_2 =
serde_json::from_str::<RegionManifest>(&result.new_manifests[&new_region_id_2]).unwrap();
let region = engine.get_region(region_id).unwrap();
let manager = region.manifest_ctx.manifest_manager.write().await;
let manifest_storage = manager.store();
let blob_store = manifest_storage.staging_storage().blob_storage();
assert_eq!(result.manifest_paths.len(), 2);
common_telemetry::debug!("manifest paths: {:?}", result.manifest_paths);
let new_manifest_1 = blob_store
.get(&result.manifest_paths[&new_region_id_1])
.await
.unwrap();
let new_manifest_2 = blob_store
.get(&result.manifest_paths[&new_region_id_2])
.await
.unwrap();
let new_manifest_1 = serde_json::from_slice::<RegionManifest>(&new_manifest_1).unwrap();
let new_manifest_2 = serde_json::from_slice::<RegionManifest>(&new_manifest_2).unwrap();
assert_eq!(new_manifest_1.files.len(), 3);
assert_eq!(new_manifest_2.files.len(), 3);
drop(manager);
let request = CreateRequestBuilder::new().build();
engine
@@ -238,7 +283,6 @@ async fn test_apply_staging_manifest_success_with_format(flat_format: bool) {
)
.await
.unwrap();
let mut files_to_add = new_manifest_1.files.values().cloned().collect::<Vec<_>>();
// Before apply staging manifest, the files should be empty
let region = engine.get_region(new_region_id_1).unwrap();
let manifest = region.manifest_ctx.manifest().await;
@@ -251,7 +295,8 @@ async fn test_apply_staging_manifest_success_with_format(flat_format: bool) {
new_region_id_1,
RegionRequest::ApplyStagingManifest(ApplyStagingManifestRequest {
partition_expr: range_expr("tag_0", 0, 50).as_json_str().unwrap(),
files_to_add: serde_json::to_vec(&files_to_add).unwrap(),
central_region_id: region_id,
manifest_path: result.manifest_paths[&new_region_id_1].clone(),
}),
)
.await
@@ -277,23 +322,52 @@ async fn test_apply_staging_manifest_success_with_format(flat_format: bool) {
let region_dir = format!("{}/data/test/1_0000000001", data_home.display());
let staging_manifest_dir = format!("{}/staging/manifest", region_dir);
let staging_files = fs::read_dir(&staging_manifest_dir)
.map(|entries| entries.collect::<Result<Vec<_>, _>>().unwrap_or_default())
.map(|entries| {
entries
.filter(|e| e.as_ref().unwrap().metadata().unwrap().is_file())
.collect::<Result<Vec<_>, _>>()
.unwrap_or_default()
})
.unwrap_or_default();
assert_eq!(staging_files.len(), 0);
assert_eq!(staging_files.len(), 0, "staging_files: {:?}", staging_files);
let region = engine.get_region(region_id).unwrap();
let manager = region.manifest_ctx.manifest_manager.write().await;
let manifest_storage = manager.store();
let blob_store = manifest_storage.staging_storage().blob_storage();
let new_manifest_1 = blob_store
.get(&result.manifest_paths[&new_region_id_1])
.await
.unwrap();
let mut new_manifest_1 = serde_json::from_slice::<RegionManifest>(&new_manifest_1).unwrap();
// Try to modify the file sequence.
files_to_add.push(FileMeta {
region_id,
file_id: FileId::random(),
..Default::default()
});
let file_id = FileId::random();
new_manifest_1.files.insert(
file_id,
FileMeta {
region_id,
file_id,
..Default::default()
},
);
blob_store
.put(
&result.manifest_paths[&new_region_id_1],
serde_json::to_vec(&new_manifest_1).unwrap(),
)
.await
.unwrap();
drop(manager);
// This request will be ignored.
engine
.handle_request(
new_region_id_1,
RegionRequest::ApplyStagingManifest(ApplyStagingManifestRequest {
partition_expr: range_expr("tag_0", 0, 50).as_json_str().unwrap(),
files_to_add: serde_json::to_vec(&files_to_add).unwrap(),
central_region_id: region_id,
manifest_path: result.manifest_paths[&new_region_id_1].clone(),
}),
)
.await
@@ -334,12 +408,40 @@ async fn test_apply_staging_manifest_invalid_files_to_add_with_format(flat_forma
)
.await
.unwrap();
// Apply staging manifest with not exists manifest path.
let err = engine
.handle_request(
region_id,
RegionRequest::ApplyStagingManifest(ApplyStagingManifestRequest {
partition_expr: range_expr("tag_0", 0, 50).as_json_str().unwrap(),
files_to_add: b"invalid".to_vec(),
central_region_id: RegionId::new(1, 0),
manifest_path: "dummy".to_string(),
}),
)
.await
.unwrap_err();
assert_matches!(
err.into_inner().as_any().downcast_ref::<Error>().unwrap(),
Error::OpenDal { .. }
);
// Apply staging manifest with invalid bytes.
let region = engine.get_region(region_id).unwrap();
let manager = region.manifest_ctx.manifest_manager.write().await;
let manifest_storage = manager.store();
let blob_store = manifest_storage.staging_storage().blob_storage();
blob_store
.put("invalid_bytes", b"invalid_bytes".to_vec())
.await
.unwrap();
drop(manager);
let err = engine
.handle_request(
region_id,
RegionRequest::ApplyStagingManifest(ApplyStagingManifestRequest {
partition_expr: range_expr("tag_0", 0, 50).as_json_str().unwrap(),
central_region_id: region_id,
manifest_path: "invalid_bytes".to_string(),
}),
)
.await
@@ -349,52 +451,3 @@ async fn test_apply_staging_manifest_invalid_files_to_add_with_format(flat_forma
Error::SerdeJson { .. }
);
}
/// Runs the empty-files apply-staging-manifest scenario twice: once with
/// `flat_format` set to `false` and once with it set to `true`.
#[tokio::test]
async fn test_apply_staging_manifest_empty_files() {
common_telemetry::init_default_ut_logging();
test_apply_staging_manifest_empty_files_with_format(false).await;
test_apply_staging_manifest_empty_files_with_format(true).await;
}
/// Applies a staging manifest whose `files_to_add` is an empty, JSON-encoded
/// `Vec<FileMeta>` and checks that the region ends up with no files and no
/// remaining staging state.
///
/// `flat_format` is forwarded to `MitoConfig::default_experimental_flat_format`.
async fn test_apply_staging_manifest_empty_files_with_format(flat_format: bool) {
let mut env = TestEnv::with_prefix("empty-files").await;
let engine = env
.create_engine(MitoConfig {
default_experimental_flat_format: flat_format,
..Default::default()
})
.await;
let region_id = RegionId::new(1, 1);
let request = CreateRequestBuilder::new().build();
// Create the region that will be put into staging.
engine
.handle_request(region_id, RegionRequest::Create(request))
.await
.unwrap();
// Enter staging with the same partition expression that is applied below.
engine
.handle_request(
region_id,
RegionRequest::EnterStaging(EnterStagingRequest {
partition_expr: range_expr("tag_0", 0, 50).as_json_str().unwrap(),
}),
)
.await
.unwrap();
// Apply a staging manifest that adds no files; the request must succeed.
engine
.handle_request(
region_id,
RegionRequest::ApplyStagingManifest(ApplyStagingManifestRequest {
partition_expr: range_expr("tag_0", 0, 50).as_json_str().unwrap(),
files_to_add: serde_json::to_vec::<Vec<FileMeta>>(&vec![]).unwrap(),
}),
)
.await
.unwrap();
// Afterwards: no files in the manifest, no staging manifest, and no staging
// partition expression left on the region.
let region = engine.get_region(region_id).unwrap();
let manifest = region.manifest_ctx.manifest().await;
assert_eq!(manifest.files.len(), 0);
let staging_manifest = region.manifest_ctx.staging_manifest().await;
assert!(staging_manifest.is_none());
let staging_partition_expr = region.staging_partition_expr.lock().unwrap();
assert!(staging_partition_expr.is_none());
}

View File

@@ -229,11 +229,23 @@ async fn test_remap_manifests_success_with_format(flat_format: bool) {
})
.await
.unwrap();
assert_eq!(result.new_manifests.len(), 2);
let new_manifest_1 =
serde_json::from_str::<RegionManifest>(&result.new_manifests[&new_region_id_1]).unwrap();
let new_manifest_2 =
serde_json::from_str::<RegionManifest>(&result.new_manifests[&new_region_id_2]).unwrap();
let region = engine.get_region(region_id).unwrap();
let manager = region.manifest_ctx.manifest_manager.write().await;
let manifest_storage = manager.store();
let blob_store = manifest_storage.staging_storage().blob_storage();
assert_eq!(result.manifest_paths.len(), 2);
common_telemetry::debug!("manifest paths: {:?}", result.manifest_paths);
let new_manifest_1 = blob_store
.get(&result.manifest_paths[&new_region_id_1])
.await
.unwrap();
let new_manifest_2 = blob_store
.get(&result.manifest_paths[&new_region_id_2])
.await
.unwrap();
let new_manifest_1 = serde_json::from_slice::<RegionManifest>(&new_manifest_1).unwrap();
let new_manifest_2 = serde_json::from_slice::<RegionManifest>(&new_manifest_2).unwrap();
assert_eq!(new_manifest_1.files.len(), 3);
assert_eq!(new_manifest_2.files.len(), 3);
}

View File

@@ -1039,6 +1039,22 @@ pub enum Error {
location: Location,
},
#[cfg(feature = "vector_index")]
#[snafu(display("Failed to build vector index: {}", reason))]
VectorIndexBuild {
reason: String,
#[snafu(implicit)]
location: Location,
},
#[cfg(feature = "vector_index")]
#[snafu(display("Failed to finish vector index: {}", reason))]
VectorIndexFinish {
reason: String,
#[snafu(implicit)]
location: Location,
},
#[snafu(display("Manual compaction is override by following operations."))]
ManualCompactionOverride {},
@@ -1345,6 +1361,9 @@ impl ErrorExt for Error {
source.status_code()
}
#[cfg(feature = "vector_index")]
VectorIndexBuild { .. } | VectorIndexFinish { .. } => StatusCode::Internal,
ManualCompactionOverride {} => StatusCode::Cancelled,
CompactionMemoryExhausted { source, .. } => source.status_code(),

View File

@@ -669,6 +669,8 @@ impl RegionFlushTask {
inverted_index_config: self.engine_config.inverted_index.clone(),
fulltext_index_config: self.engine_config.fulltext_index.clone(),
bloom_filter_index_config: self.engine_config.bloom_filter_index.clone(),
#[cfg(feature = "vector_index")]
vector_index_config: self.engine_config.vector_index.clone(),
}
}

View File

@@ -378,6 +378,11 @@ impl ManifestObjectStore {
pub async fn clear_staging_manifests(&mut self) -> Result<()> {
self.staging_storage.clear().await
}
/// Returns a shared reference to the staging storage held by this store.
pub(crate) fn staging_storage(&self) -> &StagingStorage {
&self.staging_storage
}
}
#[cfg(test)]

View File

@@ -26,20 +26,104 @@ use crate::manifest::storage::size_tracker::NoopTracker;
use crate::manifest::storage::utils::sort_manifests;
use crate::manifest::storage::{file_version, is_delta_file};
/// A simple blob storage for arbitrary binary data in the staging directory.
///
/// This is primarily used during repartition operations to store generated
/// manifests that will be consumed by other regions via [`ApplyStagingManifestRequest`](store_api::region_request::ApplyStagingManifestRequest).
/// The blobs are stored in `{region_dir}/staging/blob/` directory.
#[derive(Debug, Clone)]
pub(crate) struct StagingBlobStorage {
// Object store that blobs are written to and read from.
object_store: ObjectStore,
// Directory prefix (normalized in `new`) that is prepended to every blob path
// passed to `put` and `get`.
path: String,
}
/// Derives the staging blob directory from a region's manifest directory.
///
/// Any trailing `manifest/` and `/` are trimmed from `manifest_path`, then
/// `/staging/blob` is appended and the result is passed through
/// `util::normalize_dir`.
///
/// # Example
/// - Input: `"data/table/region_0001/manifest/"`
/// - Output: `"data/table/region_0001/staging/blob/"`
pub fn staging_blob_path(manifest_path: &str) -> String {
    let without_manifest = manifest_path.trim_end_matches("manifest/");
    let region_dir = without_manifest.trim_end_matches('/');
    let blob_dir = [region_dir, "/staging/blob"].concat();
    util::normalize_dir(&blob_dir)
}
impl StagingBlobStorage {
    /// Creates a blob storage that keeps its blobs under `path` (normalized via
    /// `util::normalize_dir`) inside `object_store`.
    pub fn new(path: String, object_store: ObjectStore) -> Self {
        let normalized = util::normalize_dir(&path);
        common_telemetry::debug!(
            "Staging blob storage path: {}, root: {}",
            normalized,
            object_store.info().root()
        );
        Self {
            object_store,
            path: normalized,
        }
    }

    /// Joins the storage directory prefix with a caller-supplied blob path.
    fn full_path(&self, path: &str) -> String {
        [self.path.as_str(), path].concat()
    }

    /// Writes `bytes` to the blob at `path`, relative to the storage directory.
    ///
    /// # Errors
    /// Returns an `OpenDal` error if the underlying write fails.
    pub async fn put(&self, path: &str, bytes: Vec<u8>) -> Result<()> {
        let blob_path = self.full_path(path);
        common_telemetry::debug!(
            "Putting blob to staging blob storage, path: {}, root: {}, bytes: {}",
            blob_path,
            self.object_store.info().root(),
            bytes.len()
        );
        self.object_store
            .write(&blob_path, bytes)
            .await
            .context(OpenDalSnafu)?;
        Ok(())
    }

    /// Reads the whole blob at `path`, relative to the storage directory.
    ///
    /// # Errors
    /// Returns an `OpenDal` error if the underlying read fails.
    pub async fn get(&self, path: &str) -> Result<Vec<u8>> {
        let blob_path = self.full_path(path);
        common_telemetry::debug!(
            "Reading blob from staging blob storage, path: {}, root: {}",
            blob_path,
            self.object_store.info().root()
        );
        let buffer = self
            .object_store
            .read(&blob_path)
            .await
            .context(OpenDalSnafu)?;
        Ok(buffer.to_vec())
    }
}
/// Storage for staging manifest files and blobs used during repartition operations.
///
/// Fields:
/// - `delta_storage`: Manages incremental manifest delta files specific to the staging region.
/// - `blob_storage`: Manages arbitrary blobs, such as generated manifests for regions.
///
/// Directory structure:
/// - `{region_dir}/staging/manifest/` — for incremental manifest delta files for the staging region.
/// - `{region_dir}/staging/blob/` — for arbitrary blobs (e.g., generated region manifests).
#[derive(Debug, Clone)]
pub(crate) struct StagingStorage {
// Delta files of the staging manifest; uses the no-op size tracker.
delta_storage: DeltaStorage<NoopTracker>,
// Arbitrary blobs stored alongside the staging manifest.
blob_storage: StagingBlobStorage,
}
/// Derives the staging manifest directory from a region's manifest directory.
///
/// Any trailing `manifest/` and `/` are trimmed from `manifest_path`, then
/// `/staging/manifest` is appended and the result is passed through
/// `util::normalize_dir`.
///
/// # Example
/// - Input: `"data/table/region_0001/manifest/"`
/// - Output: `"data/table/region_0001/staging/manifest/"`
pub fn staging_manifest_path(manifest_path: &str) -> String {
    let without_manifest = manifest_path.trim_end_matches("manifest/");
    let region_dir = without_manifest.trim_end_matches('/');
    let staging_dir = [region_dir, "/staging/manifest"].concat();
    util::normalize_dir(&staging_dir)
}
impl StagingStorage {
pub fn new(path: String, object_store: ObjectStore, compress_type: CompressionType) -> Self {
let staging_path = {
// Convert "region_dir/manifest/" to "region_dir/staging/manifest/"
let parent_dir = path.trim_end_matches("manifest/").trim_end_matches('/');
util::normalize_dir(&format!("{}/staging/manifest", parent_dir))
};
let staging_blob_path = staging_blob_path(&path);
let blob_storage = StagingBlobStorage::new(staging_blob_path, object_store.clone());
let staging_manifest_path = staging_manifest_path(&path);
let delta_storage = DeltaStorage::new(
staging_path.clone(),
staging_manifest_path.clone(),
object_store.clone(),
compress_type,
// StagingStorage does not use a manifest cache; set to None.
@@ -48,7 +132,16 @@ impl StagingStorage {
// deleted after exiting staging mode.
Arc::new(NoopTracker),
);
Self { delta_storage }
Self {
delta_storage,
blob_storage,
}
}
/// Returns a shared reference to the blob storage used for staging blobs.
pub(crate) fn blob_storage(&self) -> &StagingBlobStorage {
&self.blob_storage
}
/// Returns an iterator of manifests from staging directory.
@@ -107,3 +200,22 @@ impl StagingStorage {
self.delta_storage.set_compress_type(compress_type);
}
}
#[cfg(test)]
mod tests {
    use super::{staging_blob_path, staging_manifest_path};

    /// The manifest directory maps to the sibling `staging/manifest/` directory.
    #[test]
    fn test_staging_path() {
        assert_eq!(
            staging_manifest_path("/data/table/region_0001/manifest/"),
            "/data/table/region_0001/staging/manifest/"
        );
    }

    /// The manifest directory maps to the sibling `staging/blob/` directory.
    #[test]
    fn test_staging_blob_path() {
        assert_eq!(
            staging_blob_path("/data/table/region_0001/manifest/"),
            "/data/table/region_0001/staging/blob/"
        );
    }
}

View File

@@ -50,7 +50,7 @@ use crate::error::{
FlushRegionSnafu, InvalidPartitionExprSnafu, InvalidRequestSnafu, MissingPartitionExprSnafu,
Result, UnexpectedSnafu,
};
use crate::manifest::action::{RegionEdit, RegionManifest, TruncateKind};
use crate::manifest::action::{RegionEdit, TruncateKind};
use crate::memtable::MemtableId;
use crate::memtable::bulk::part::BulkPart;
use crate::metrics::COMPACTION_ELAPSED_TOTAL;
@@ -796,10 +796,7 @@ impl WorkerRequest {
region_mapping,
new_partition_exprs,
}: store_api::region_engine::RemapManifestsRequest,
) -> Result<(
WorkerRequest,
Receiver<Result<HashMap<RegionId, RegionManifest>>>,
)> {
) -> Result<(WorkerRequest, Receiver<Result<HashMap<RegionId, String>>>)> {
let (sender, receiver) = oneshot::channel();
let new_partition_exprs = new_partition_exprs
.into_iter()
@@ -1116,8 +1113,10 @@ pub(crate) struct RemapManifestsRequest {
pub(crate) region_mapping: HashMap<RegionId, Vec<RegionId>>,
/// New partition expressions for the new regions.
pub(crate) new_partition_exprs: HashMap<RegionId, PartitionExpr>,
/// Result sender.
pub(crate) sender: Sender<Result<HashMap<RegionId, RegionManifest>>>,
/// Sender for the result of the remap operation.
///
/// The result is a map from region IDs to their corresponding staging manifest paths.
pub(crate) sender: Sender<Result<HashMap<RegionId, String>>>,
}
#[derive(Debug)]

View File

@@ -287,6 +287,9 @@ pub enum IndexType {
FulltextIndex,
/// Bloom Filter index
BloomFilterIndex,
/// Vector index (HNSW).
#[cfg(feature = "vector_index")]
VectorIndex,
}
/// Metadata of indexes created for a specific column in an SST file.

View File

@@ -20,6 +20,8 @@ pub(crate) mod inverted_index;
pub mod puffin_manager;
mod statistics;
pub(crate) mod store;
#[cfg(feature = "vector_index")]
pub(crate) mod vector_index;
use std::cmp::Ordering;
use std::collections::{BinaryHeap, HashMap, HashSet};
@@ -41,10 +43,14 @@ use store_api::metadata::RegionMetadataRef;
use store_api::storage::{ColumnId, FileId, RegionId};
use strum::IntoStaticStr;
use tokio::sync::mpsc::Sender;
#[cfg(feature = "vector_index")]
use vector_index::creator::VectorIndexer;
use crate::access_layer::{AccessLayerRef, FilePathProvider, OperationType, RegionFilePathFactory};
use crate::cache::file_cache::{FileCacheRef, FileType, IndexKey};
use crate::cache::write_cache::{UploadTracker, WriteCacheRef};
#[cfg(feature = "vector_index")]
use crate::config::VectorIndexConfig;
use crate::config::{BloomFilterConfig, FulltextIndexConfig, InvertedIndexConfig};
use crate::error::{
BuildIndexAsyncSnafu, DecodeSnafu, Error, InvalidRecordBatchSnafu, RegionClosedSnafu,
@@ -76,6 +82,8 @@ use crate::worker::WorkerListener;
pub(crate) const TYPE_INVERTED_INDEX: &str = "inverted_index";
pub(crate) const TYPE_FULLTEXT_INDEX: &str = "fulltext_index";
pub(crate) const TYPE_BLOOM_FILTER_INDEX: &str = "bloom_filter_index";
#[cfg(feature = "vector_index")]
pub(crate) const TYPE_VECTOR_INDEX: &str = "vector_index";
/// Triggers background download of an index file to the local cache.
pub(crate) fn trigger_index_background_download(
@@ -114,6 +122,9 @@ pub struct IndexOutput {
pub fulltext_index: FulltextIndexOutput,
/// Bloom filter output.
pub bloom_filter: BloomFilterOutput,
/// Vector index output.
#[cfg(feature = "vector_index")]
pub vector_index: VectorIndexOutput,
}
impl IndexOutput {
@@ -128,6 +139,10 @@ impl IndexOutput {
if self.bloom_filter.is_available() {
indexes.push(IndexType::BloomFilterIndex);
}
#[cfg(feature = "vector_index")]
if self.vector_index.is_available() {
indexes.push(IndexType::VectorIndex);
}
indexes
}
@@ -151,6 +166,12 @@ impl IndexOutput {
.push(IndexType::BloomFilterIndex);
}
}
#[cfg(feature = "vector_index")]
if self.vector_index.is_available() {
for &col in &self.vector_index.columns {
map.entry(col).or_default().push(IndexType::VectorIndex);
}
}
map.into_iter()
.map(|(column_id, created_indexes)| ColumnIndexMetadata {
@@ -184,6 +205,9 @@ pub type InvertedIndexOutput = IndexBaseOutput;
pub type FulltextIndexOutput = IndexBaseOutput;
/// Output of the bloom filter creation.
pub type BloomFilterOutput = IndexBaseOutput;
/// Output of the vector index creation.
#[cfg(feature = "vector_index")]
pub type VectorIndexOutput = IndexBaseOutput;
/// The index creator that hides the error handling details.
#[derive(Default)]
@@ -199,6 +223,10 @@ pub struct Indexer {
last_mem_fulltext_index: usize,
bloom_filter_indexer: Option<BloomFilterIndexer>,
last_mem_bloom_filter: usize,
#[cfg(feature = "vector_index")]
vector_indexer: Option<VectorIndexer>,
#[cfg(feature = "vector_index")]
last_mem_vector_index: usize,
intermediate_manager: Option<IntermediateManager>,
}
@@ -259,6 +287,18 @@ impl Indexer {
.with_label_values(&[TYPE_BLOOM_FILTER_INDEX])
.add(bloom_filter_mem as i64 - self.last_mem_bloom_filter as i64);
self.last_mem_bloom_filter = bloom_filter_mem;
#[cfg(feature = "vector_index")]
{
let vector_mem = self
.vector_indexer
.as_ref()
.map_or(0, |creator| creator.memory_usage());
INDEX_CREATE_MEMORY_USAGE
.with_label_values(&[TYPE_VECTOR_INDEX])
.add(vector_mem as i64 - self.last_mem_vector_index as i64);
self.last_mem_vector_index = vector_mem;
}
}
}
@@ -279,6 +319,8 @@ pub(crate) struct IndexerBuilderImpl {
pub(crate) inverted_index_config: InvertedIndexConfig,
pub(crate) fulltext_index_config: FulltextIndexConfig,
pub(crate) bloom_filter_index_config: BloomFilterConfig,
#[cfg(feature = "vector_index")]
pub(crate) vector_index_config: VectorIndexConfig,
}
#[async_trait::async_trait]
@@ -296,11 +338,23 @@ impl IndexerBuilder for IndexerBuilderImpl {
indexer.inverted_indexer = self.build_inverted_indexer(file_id);
indexer.fulltext_indexer = self.build_fulltext_indexer(file_id).await;
indexer.bloom_filter_indexer = self.build_bloom_filter_indexer(file_id);
indexer.intermediate_manager = Some(self.intermediate_manager.clone());
if indexer.inverted_indexer.is_none()
&& indexer.fulltext_indexer.is_none()
&& indexer.bloom_filter_indexer.is_none()
#[cfg(feature = "vector_index")]
{
indexer.vector_indexer = self.build_vector_indexer(file_id);
}
indexer.intermediate_manager = Some(self.intermediate_manager.clone());
#[cfg(feature = "vector_index")]
let has_any_indexer = indexer.inverted_indexer.is_some()
|| indexer.fulltext_indexer.is_some()
|| indexer.bloom_filter_indexer.is_some()
|| indexer.vector_indexer.is_some();
#[cfg(not(feature = "vector_index"))]
let has_any_indexer = indexer.inverted_indexer.is_some()
|| indexer.fulltext_indexer.is_some()
|| indexer.bloom_filter_indexer.is_some();
if !has_any_indexer {
indexer.abort().await;
return Indexer::default();
}
@@ -476,6 +530,69 @@ impl IndexerBuilderImpl {
None
}
#[cfg(feature = "vector_index")]
/// Builds the vector indexer for `file_id`, or returns `None` when no vector
/// index should be (or can be) created.
///
/// `None` is returned when:
/// - the config disables creation for the current build type (flush or
///   compaction; other build types always attempt creation),
/// - the region metadata reports no vector-indexed columns,
/// - `VectorIndexer::new` succeeds but produces no indexer, or
/// - `VectorIndexer::new` fails (logged as a warning outside of tests).
///
/// # Panics
/// Panics on a creation failure when compiled for tests or with the `test`
/// feature, so that such failures are not silently ignored there.
fn build_vector_indexer(&self, file_id: FileId) -> Option<VectorIndexer> {
    let config = &self.vector_index_config;
    let enabled = match self.build_type {
        IndexBuildType::Flush => config.create_on_flush.auto(),
        IndexBuildType::Compact => config.create_on_compaction.auto(),
        _ => true,
    };
    if !enabled {
        debug!(
            "Skip creating vector index due to config, region_id: {}, file_id: {}",
            self.metadata.region_id, file_id,
        );
        return None;
    }

    // Vector-indexed columns as reported by the region metadata.
    let indexed_columns = self.metadata.vector_indexed_column_ids();
    if indexed_columns.is_empty() {
        debug!(
            "No vector columns to index, skip creating vector index, region_id: {}, file_id: {}",
            self.metadata.region_id, file_id,
        );
        return None;
    }

    let mem_limit = config.mem_threshold_on_create();
    let err = match VectorIndexer::new(
        file_id,
        &self.metadata,
        self.intermediate_manager.clone(),
        mem_limit,
        &indexed_columns,
    ) {
        Ok(Some(indexer)) => return Some(indexer),
        Ok(None) => {
            debug!(
                "Skip creating vector index due to no columns require indexing, region_id: {}, file_id: {}",
                self.metadata.region_id, file_id,
            );
            return None;
        }
        Err(err) => err,
    };

    // Creation failed: fail loudly in tests, degrade to "no index" otherwise.
    if cfg!(any(test, feature = "test")) {
        panic!(
            "Failed to create vector index, region_id: {}, file_id: {}, err: {:?}",
            self.metadata.region_id, file_id, err
        );
    } else {
        warn!(
            err; "Failed to create vector index, region_id: {}, file_id: {}",
            self.metadata.region_id, file_id,
        );
    }

    None
}
}
/// Type of an index build task.
@@ -1115,6 +1232,8 @@ mod tests {
with_inverted: bool,
with_fulltext: bool,
with_skipping_bloom: bool,
#[cfg(feature = "vector_index")]
with_vector: bool,
}
fn mock_region_metadata(
@@ -1122,6 +1241,8 @@ mod tests {
with_inverted,
with_fulltext,
with_skipping_bloom,
#[cfg(feature = "vector_index")]
with_vector,
}: MetaConfig,
) -> RegionMetadataRef {
let mut builder = RegionMetadataBuilder::new(RegionId::new(1, 2));
@@ -1187,6 +1308,24 @@ mod tests {
builder.push_column_metadata(column);
}
#[cfg(feature = "vector_index")]
if with_vector {
use index::vector::VectorIndexOptions;
let options = VectorIndexOptions::default();
let column_schema =
ColumnSchema::new("vec", ConcreteDataType::vector_datatype(4), true)
.with_vector_index_options(&options)
.unwrap();
let column = ColumnMetadata {
column_schema,
semantic_type: SemanticType::Field,
column_id: 6,
};
builder.push_column_metadata(column);
}
Arc::new(builder.build().unwrap())
}
@@ -1237,6 +1376,8 @@ mod tests {
inverted_index_config: Default::default(),
fulltext_index_config: Default::default(),
bloom_filter_index_config: Default::default(),
#[cfg(feature = "vector_index")]
vector_index_config: Default::default(),
};
let mut metrics = Metrics::new(WriteType::Flush);
env.access_layer
@@ -1287,6 +1428,8 @@ mod tests {
inverted_index_config: InvertedIndexConfig::default(),
fulltext_index_config: FulltextIndexConfig::default(),
bloom_filter_index_config: BloomFilterConfig::default(),
#[cfg(feature = "vector_index")]
vector_index_config: Default::default(),
})
}
@@ -1300,6 +1443,8 @@ mod tests {
with_inverted: true,
with_fulltext: true,
with_skipping_bloom: true,
#[cfg(feature = "vector_index")]
with_vector: false,
});
let indexer = IndexerBuilderImpl {
build_type: IndexBuildType::Flush,
@@ -1312,6 +1457,8 @@ mod tests {
inverted_index_config: InvertedIndexConfig::default(),
fulltext_index_config: FulltextIndexConfig::default(),
bloom_filter_index_config: BloomFilterConfig::default(),
#[cfg(feature = "vector_index")]
vector_index_config: Default::default(),
}
.build(FileId::random(), 0)
.await;
@@ -1331,6 +1478,8 @@ mod tests {
with_inverted: true,
with_fulltext: true,
with_skipping_bloom: true,
#[cfg(feature = "vector_index")]
with_vector: false,
});
let indexer = IndexerBuilderImpl {
build_type: IndexBuildType::Flush,
@@ -1346,6 +1495,8 @@ mod tests {
},
fulltext_index_config: FulltextIndexConfig::default(),
bloom_filter_index_config: BloomFilterConfig::default(),
#[cfg(feature = "vector_index")]
vector_index_config: Default::default(),
}
.build(FileId::random(), 0)
.await;
@@ -1368,6 +1519,8 @@ mod tests {
..Default::default()
},
bloom_filter_index_config: BloomFilterConfig::default(),
#[cfg(feature = "vector_index")]
vector_index_config: Default::default(),
}
.build(FileId::random(), 0)
.await;
@@ -1390,6 +1543,8 @@ mod tests {
create_on_compaction: Mode::Disable,
..Default::default()
},
#[cfg(feature = "vector_index")]
vector_index_config: Default::default(),
}
.build(FileId::random(), 0)
.await;
@@ -1409,6 +1564,8 @@ mod tests {
with_inverted: false,
with_fulltext: true,
with_skipping_bloom: true,
#[cfg(feature = "vector_index")]
with_vector: false,
});
let indexer = IndexerBuilderImpl {
build_type: IndexBuildType::Flush,
@@ -1421,6 +1578,8 @@ mod tests {
inverted_index_config: InvertedIndexConfig::default(),
fulltext_index_config: FulltextIndexConfig::default(),
bloom_filter_index_config: BloomFilterConfig::default(),
#[cfg(feature = "vector_index")]
vector_index_config: Default::default(),
}
.build(FileId::random(), 0)
.await;
@@ -1433,6 +1592,8 @@ mod tests {
with_inverted: true,
with_fulltext: false,
with_skipping_bloom: true,
#[cfg(feature = "vector_index")]
with_vector: false,
});
let indexer = IndexerBuilderImpl {
build_type: IndexBuildType::Flush,
@@ -1445,6 +1606,8 @@ mod tests {
inverted_index_config: InvertedIndexConfig::default(),
fulltext_index_config: FulltextIndexConfig::default(),
bloom_filter_index_config: BloomFilterConfig::default(),
#[cfg(feature = "vector_index")]
vector_index_config: Default::default(),
}
.build(FileId::random(), 0)
.await;
@@ -1457,6 +1620,8 @@ mod tests {
with_inverted: true,
with_fulltext: true,
with_skipping_bloom: false,
#[cfg(feature = "vector_index")]
with_vector: false,
});
let indexer = IndexerBuilderImpl {
build_type: IndexBuildType::Flush,
@@ -1469,6 +1634,8 @@ mod tests {
inverted_index_config: InvertedIndexConfig::default(),
fulltext_index_config: FulltextIndexConfig::default(),
bloom_filter_index_config: BloomFilterConfig::default(),
#[cfg(feature = "vector_index")]
vector_index_config: Default::default(),
}
.build(FileId::random(), 0)
.await;
@@ -1488,6 +1655,8 @@ mod tests {
with_inverted: true,
with_fulltext: true,
with_skipping_bloom: true,
#[cfg(feature = "vector_index")]
with_vector: false,
});
let indexer = IndexerBuilderImpl {
build_type: IndexBuildType::Flush,
@@ -1500,6 +1669,8 @@ mod tests {
inverted_index_config: InvertedIndexConfig::default(),
fulltext_index_config: FulltextIndexConfig::default(),
bloom_filter_index_config: BloomFilterConfig::default(),
#[cfg(feature = "vector_index")]
vector_index_config: Default::default(),
}
.build(FileId::random(), 0)
.await;
@@ -1507,6 +1678,82 @@ mod tests {
assert!(indexer.inverted_indexer.is_none());
}
/// Feeds a flat-format `RecordBatch` holding two 4-dimensional vectors through an indexer
/// built for the `vec` column and checks that a vector index covering column id 6 is
/// reported in the output.
#[cfg(feature = "vector_index")]
#[tokio::test]
async fn test_update_flat_builds_vector_index() {
    use datatypes::arrow::array::BinaryBuilder;
    use datatypes::arrow::datatypes::{DataType, Field, Schema};

    /// Path provider used only by this test: index files go under `index/`,
    /// SST files under `sst/`.
    struct TestPathProvider;

    impl FilePathProvider for TestPathProvider {
        fn build_index_file_path(&self, file_id: RegionFileId) -> String {
            format!("index/{}.puffin", file_id)
        }

        fn build_index_file_path_with_version(&self, index_id: RegionIndexId) -> String {
            format!("index/{}.puffin", index_id)
        }

        fn build_sst_file_path(&self, file_id: RegionFileId) -> String {
            format!("sst/{}.parquet", file_id)
        }
    }

    /// Encodes the floats as consecutive little-endian byte groups.
    fn f32s_to_bytes(values: &[f32]) -> Vec<u8> {
        values.iter().flat_map(|v| v.to_le_bytes()).collect()
    }

    let (dir, factory) =
        PuffinManagerFactory::new_for_test_async("test_update_flat_builds_vector_index_").await;
    let intm_manager = mock_intm_mgr(dir.path().to_string_lossy()).await;

    // Only the vector column is enabled so that no other indexer takes part.
    let metadata = mock_region_metadata(MetaConfig {
        with_vector: true,
        with_skipping_bloom: false,
        with_fulltext: false,
        with_inverted: false,
    });

    let puffin_manager = factory.build(mock_object_store(), TestPathProvider);
    let indexer_builder = IndexerBuilderImpl {
        build_type: IndexBuildType::Flush,
        metadata,
        row_group_size: 1024,
        puffin_manager,
        write_cache_enabled: false,
        intermediate_manager: intm_manager,
        index_options: IndexOptions::default(),
        inverted_index_config: InvertedIndexConfig::default(),
        fulltext_index_config: FulltextIndexConfig::default(),
        bloom_filter_index_config: BloomFilterConfig::default(),
        vector_index_config: Default::default(),
    };
    let mut indexer = indexer_builder.build(FileId::random(), 0).await;
    assert!(indexer.vector_indexer.is_some());

    // Two unit vectors, stored the way the flat format carries vectors: as binary values.
    let rows = [
        f32s_to_bytes(&[1.0, 0.0, 0.0, 0.0]),
        f32s_to_bytes(&[0.0, 1.0, 0.0, 0.0]),
    ];
    let payload_len = rows.iter().map(Vec::len).sum::<usize>();
    let mut vectors = BinaryBuilder::with_capacity(rows.len(), payload_len);
    for row in &rows {
        vectors.append_value(row);
    }

    let fields = vec![Field::new("vec", DataType::Binary, true)];
    let batch =
        RecordBatch::try_new(Arc::new(Schema::new(fields)), vec![Arc::new(vectors.finish())])
            .unwrap();

    indexer.update_flat(&batch).await;
    let output = indexer.finish().await;

    assert!(output.vector_index.is_available());
    assert!(output.vector_index.columns.contains(&6));
}
#[tokio::test]
async fn test_index_build_task_sst_not_exist() {
let env = SchedulerEnv::new().await;
@@ -1839,6 +2086,8 @@ mod tests {
inverted_index_config: InvertedIndexConfig::default(),
fulltext_index_config: FulltextIndexConfig::default(),
bloom_filter_index_config: BloomFilterConfig::default(),
#[cfg(feature = "vector_index")]
vector_index_config: Default::default(),
});
let sst_info = mock_sst_file(metadata.clone(), &env, IndexBuildMode::Async).await;

View File

@@ -23,6 +23,8 @@ impl Indexer {
self.do_abort_inverted_index().await;
self.do_abort_fulltext_index().await;
self.do_abort_bloom_filter().await;
#[cfg(feature = "vector_index")]
self.do_abort_vector_index().await;
self.do_prune_intm_sst_dir().await;
if self.write_cache_enabled {
self.do_abort_clean_fs_temp_dir().await;
@@ -106,4 +108,26 @@ impl Indexer {
.to_string();
TempFileCleaner::clean_atomic_dir_files(fs_accessor.store().store(), &[&fs_handle]).await;
}
/// Takes the vector indexer out of `self` (if any) and aborts it.
///
/// A failed abort panics in test builds and is logged as a warning otherwise.
#[cfg(feature = "vector_index")]
async fn do_abort_vector_index(&mut self) {
    if let Some(mut vector_indexer) = self.vector_indexer.take() {
        if let Err(err) = vector_indexer.abort().await {
            if cfg!(any(test, feature = "test")) {
                panic!(
                    "Failed to abort vector index, region_id: {}, file_id: {}, err: {:?}",
                    self.region_id, self.file_id, err
                );
            } else {
                warn!(
                    err; "Failed to abort vector index, region_id: {}, file_id: {}",
                    self.region_id, self.file_id,
                );
            }
        }
    }
}
}

View File

@@ -17,6 +17,8 @@ use puffin::puffin_manager::{PuffinManager, PuffinWriter};
use store_api::storage::ColumnId;
use crate::sst::file::{RegionFileId, RegionIndexId};
#[cfg(feature = "vector_index")]
use crate::sst::index::VectorIndexOutput;
use crate::sst::index::puffin_manager::SstPuffinWriter;
use crate::sst::index::statistics::{ByteCount, RowCount};
use crate::sst::index::{
@@ -54,6 +56,15 @@ impl Indexer {
return IndexOutput::default();
}
#[cfg(feature = "vector_index")]
{
let success = self.do_finish_vector_index(&mut writer, &mut output).await;
if !success {
self.do_abort().await;
return IndexOutput::default();
}
}
self.do_prune_intm_sst_dir().await;
output.file_size = self.do_finish_puffin_writer(writer).await;
output.version = self.index_version;
@@ -276,6 +287,63 @@ impl Indexer {
output.columns = column_ids;
}
/// Finishes the vector indexer (if any) and records its statistics in `index_output`.
///
/// Returns `true` when there is no vector indexer or when finishing succeeded, and
/// `false` when finishing failed (after logging; test builds panic instead).
#[cfg(feature = "vector_index")]
async fn do_finish_vector_index(
    &mut self,
    puffin_writer: &mut SstPuffinWriter,
    index_output: &mut IndexOutput,
) -> bool {
    let Some(mut vector_indexer) = self.vector_indexer.take() else {
        return true;
    };

    // Capture the column ids before calling `finish`.
    let indexed_columns: Vec<ColumnId> = vector_indexer.column_ids().collect();

    match vector_indexer.finish(puffin_writer).await {
        Ok((row_count, byte_count)) => {
            self.fill_vector_index_output(
                &mut index_output.vector_index,
                row_count,
                byte_count,
                indexed_columns,
            );
            true
        }
        Err(err) => {
            if cfg!(any(test, feature = "test")) {
                panic!(
                    "Failed to finish vector index, region_id: {}, file_id: {}, err: {:?}",
                    self.region_id, self.file_id, err
                );
            } else {
                warn!(
                    err; "Failed to finish vector index, region_id: {}, file_id: {}",
                    self.region_id, self.file_id,
                );
            }
            false
        }
    }
}
/// Copies the size, row count and indexed column ids of a finished vector index into
/// `output`, and emits a debug log describing what was created.
#[cfg(feature = "vector_index")]
fn fill_vector_index_output(
    &mut self,
    output: &mut VectorIndexOutput,
    row_count: RowCount,
    byte_count: ByteCount,
    column_ids: Vec<ColumnId>,
) {
    output.index_size = byte_count;
    output.row_count = row_count;
    output.columns = column_ids;

    debug!(
        "Vector index created, region_id: {}, file_id: {}, written_bytes: {}, written_rows: {}, columns: {:?}",
        self.region_id, self.file_id, output.index_size, output.row_count, output.columns
    );
}
pub(crate) async fn do_prune_intm_sst_dir(&mut self) {
if let Some(manager) = self.intermediate_manager.take()
&& let Err(e) = manager.prune_sst_dir(&self.region_id, &self.file_id).await

View File

@@ -33,6 +33,10 @@ impl Indexer {
if !self.do_update_bloom_filter(batch).await {
self.do_abort().await;
}
#[cfg(feature = "vector_index")]
if !self.do_update_vector_index(batch).await {
self.do_abort().await;
}
}
/// Returns false if the update failed.
@@ -110,6 +114,32 @@ impl Indexer {
false
}
/// Feeds `batch` to the vector indexer, if one is present.
///
/// Returns false if the update failed. A failure panics in test builds and is
/// logged as a warning otherwise.
#[cfg(feature = "vector_index")]
async fn do_update_vector_index(&mut self, batch: &mut Batch) -> bool {
    let outcome = match self.vector_indexer.as_mut() {
        // Nothing to update counts as success.
        None => return true,
        Some(vector_indexer) => vector_indexer.update(batch).await,
    };

    match outcome {
        Ok(_) => true,
        Err(err) => {
            if cfg!(any(test, feature = "test")) {
                panic!(
                    "Failed to update vector index, region_id: {}, file_id: {}, err: {:?}",
                    self.region_id, self.file_id, err
                );
            } else {
                warn!(
                    err; "Failed to update vector index, region_id: {}, file_id: {}",
                    self.region_id, self.file_id,
                );
            }
            false
        }
    }
}
pub(crate) async fn do_update_flat(&mut self, batch: &RecordBatch) {
if batch.num_rows() == 0 {
return;
@@ -124,6 +154,10 @@ impl Indexer {
if !self.do_update_flat_bloom_filter(batch).await {
self.do_abort().await;
}
#[cfg(feature = "vector_index")]
if !self.do_update_flat_vector_index(batch).await {
self.do_abort().await;
}
}
/// Returns false if the update failed.
@@ -200,4 +234,30 @@ impl Indexer {
false
}
/// Feeds a flat-format `batch` to the vector indexer, if one is present.
///
/// Returns false if the update failed. A failure panics in test builds and is
/// logged as a warning otherwise.
#[cfg(feature = "vector_index")]
async fn do_update_flat_vector_index(&mut self, batch: &RecordBatch) -> bool {
    let outcome = match self.vector_indexer.as_mut() {
        // Nothing to update counts as success.
        None => return true,
        Some(vector_indexer) => vector_indexer.update_flat(batch).await,
    };

    match outcome {
        Ok(_) => true,
        Err(err) => {
            if cfg!(any(test, feature = "test")) {
                panic!(
                    "Failed to update vector index with flat format, region_id: {}, file_id: {}, err: {:?}",
                    self.region_id, self.file_id, err
                );
            } else {
                warn!(
                    err; "Failed to update vector index with flat format, region_id: {}, file_id: {}",
                    self.region_id, self.file_id,
                );
            }
            false
        }
    }
}
}

View File

@@ -0,0 +1,920 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//! Vector index creator using pluggable vector index engines.
use std::collections::HashMap;
use std::sync::Arc;
use std::sync::atomic::AtomicUsize;
use common_telemetry::warn;
use datatypes::arrow::array::{Array, BinaryArray};
use datatypes::arrow::record_batch::RecordBatch;
use datatypes::data_type::ConcreteDataType;
use datatypes::prelude::ValueRef;
use index::vector::{VectorDistanceMetric, VectorIndexOptions, distance_metric_to_usearch};
use puffin::puffin_manager::{PuffinWriter, PutOptions};
use roaring::RoaringBitmap;
use snafu::{ResultExt, ensure};
use store_api::metadata::RegionMetadataRef;
use store_api::storage::{ColumnId, FileId, VectorIndexEngine, VectorIndexEngineType};
use tokio_util::compat::TokioAsyncReadCompatExt;
use usearch::MetricKind;
use crate::error::{
BiErrorsSnafu, OperateAbortedIndexSnafu, PuffinAddBlobSnafu, Result, VectorIndexBuildSnafu,
VectorIndexFinishSnafu,
};
use crate::metrics::{INDEX_CREATE_BYTES_TOTAL, INDEX_CREATE_ROWS_TOTAL};
use crate::read::Batch;
use crate::sst::index::TYPE_VECTOR_INDEX;
use crate::sst::index::intermediate::{
IntermediateLocation, IntermediateManager, TempFileProvider,
};
use crate::sst::index::puffin_manager::SstPuffinWriter;
use crate::sst::index::statistics::{ByteCount, RowCount, Statistics};
use crate::sst::index::vector_index::util::bytes_to_f32_slice;
use crate::sst::index::vector_index::{INDEX_BLOB_TYPE, engine};
/// The buffer size, in bytes, of the in-memory duplex pipe (`tokio::io::duplex`) used to
/// send the serialized index blob to the puffin writer.
const PIPE_BUFFER_SIZE_FOR_SENDING_BLOB: usize = 8192;
/// Configuration for a single column's vector index.
///
/// Built from the column's vector dimension and its `VectorIndexOptions`; the values are
/// passed to the engine factory and later written into the blob header.
#[derive(Debug, Clone)]
pub struct VectorIndexConfig {
/// The vector index engine type.
pub engine: VectorIndexEngineType,
/// The dimension of vectors in this column.
/// Vectors whose length differs from this value are rejected during updates.
pub dim: usize,
/// The distance metric to use (e.g., L2, Cosine, IP) - usearch format.
/// Derived from `distance_metric` via `distance_metric_to_usearch`.
pub metric: MetricKind,
/// The original distance metric (for serialization).
/// This is the value written to the blob header.
pub distance_metric: VectorDistanceMetric,
/// HNSW connectivity parameter (M in the paper).
/// Higher values give better recall but use more memory.
/// Stored in the blob header as a 2-byte value.
pub connectivity: usize,
/// Expansion factor during index construction (ef_construction).
/// Stored in the blob header as a 2-byte value.
pub expansion_add: usize,
/// Expansion factor during search (ef_search).
/// Stored in the blob header as a 2-byte value.
pub expansion_search: usize,
}
impl VectorIndexConfig {
    /// Builds the per-column config from the vector dimension and the column's
    /// `VectorIndexOptions`.
    ///
    /// The configured distance metric is kept as-is in `distance_metric` and additionally
    /// translated to its usearch counterpart in `metric`.
    pub fn new(dim: usize, options: &VectorIndexOptions) -> Self {
        let distance_metric = options.metric;
        let metric = distance_metric_to_usearch(distance_metric);
        let connectivity = options.connectivity as usize;
        let expansion_add = options.expansion_add as usize;
        let expansion_search = options.expansion_search as usize;

        Self {
            engine: options.engine,
            dim,
            metric,
            distance_metric,
            connectivity,
            expansion_add,
            expansion_search,
        }
    }
}
/// Creator for a single column's vector index.
///
/// Wraps a vector index engine and keeps the bookkeeping needed to map engine keys
/// back to row offsets when some rows hold NULL vectors.
struct VectorIndexCreator {
/// The vector index engine (e.g., USearch HNSW).
engine: Box<dyn VectorIndexEngine>,
/// Configuration for this index.
config: VectorIndexConfig,
/// Bitmap tracking which row offsets have NULL vectors.
/// HNSW keys are sequential (0, 1, 2...) but row offsets may have gaps due to NULLs.
/// Serialized into the blob right after the header.
null_bitmap: RoaringBitmap,
/// Current row offset (including NULLs).
/// Advanced by every added vector and every recorded NULL; written to the blob header
/// as the total row count.
current_row_offset: u64,
/// Next HNSW key to assign (only for non-NULL vectors).
/// Written to the blob header as the indexed row count.
next_hnsw_key: u64,
/// Memory usage estimation.
/// Refreshed from the engine after each added vector.
memory_usage: usize,
}
impl VectorIndexCreator {
    /// Builds a creator backed by the engine selected in `config`.
    ///
    /// Fails if the engine cannot be created.
    fn new(config: VectorIndexConfig) -> Result<Self> {
        let vector_engine = engine::create_engine(config.engine, &config)?;
        let creator = Self {
            engine: vector_engine,
            config,
            null_bitmap: RoaringBitmap::new(),
            current_row_offset: 0,
            next_hnsw_key: 0,
            memory_usage: 0,
        };
        Ok(creator)
    }

    /// Asks the engine to reserve room for `capacity` vectors.
    #[allow(dead_code)]
    fn reserve(&mut self, capacity: usize) -> Result<()> {
        self.engine.reserve(capacity).map_err(|e| {
            let reason = format!("Failed to reserve capacity: {}", e);
            VectorIndexBuildSnafu { reason }.build()
        })
    }

    /// Inserts `vector` into the engine under the next sequential HNSW key.
    ///
    /// On success the key and row counters advance, the cached memory usage is refreshed,
    /// and the key that was assigned is returned. On failure no counter is changed.
    fn add_vector(&mut self, vector: &[f32]) -> Result<u64> {
        let assigned_key = self.next_hnsw_key;
        if let Err(e) = self.engine.add(assigned_key, vector) {
            return VectorIndexBuildSnafu {
                reason: e.to_string(),
            }
            .fail();
        }

        self.next_hnsw_key += 1;
        self.current_row_offset += 1;
        self.memory_usage = self.engine.memory_usage();
        Ok(assigned_key)
    }

    /// Marks the current row offset as NULL and moves on to the next row.
    fn add_null(&mut self) {
        let null_row = self.current_row_offset as u32;
        self.null_bitmap.insert(null_row);
        self.current_row_offset += 1;
    }

    /// Marks the next `n` row offsets as NULL and advances the row offset past them.
    fn add_nulls(&mut self, n: usize) {
        let first = self.current_row_offset as u32;
        self.null_bitmap.insert_range(first..first + n as u32);
        self.current_row_offset += n as u64;
    }

    /// Number of bytes the engine needs to serialize the index.
    fn serialized_length(&self) -> usize {
        self.engine.serialized_length()
    }

    /// Serializes the engine's index into `buffer`.
    fn save_to_buffer(&self, buffer: &mut [u8]) -> Result<()> {
        self.engine.save_to_buffer(buffer).map_err(|e| {
            let reason = format!("Failed to serialize index: {}", e);
            VectorIndexFinishSnafu { reason }.build()
        })
    }

    /// Estimated memory held by this creator: the cached engine usage plus the
    /// serialized size of the NULL bitmap.
    fn memory_usage(&self) -> usize {
        let bitmap_bytes = self.null_bitmap.serialized_size();
        self.memory_usage + bitmap_bytes
    }

    /// Number of vectors stored in the engine (NULL rows are not counted).
    fn size(&self) -> usize {
        self.engine.size()
    }

    /// The engine type this creator was configured with.
    fn engine_type(&self) -> VectorIndexEngineType {
        self.config.engine
    }

    /// The configured distance metric in its original (non-usearch) form.
    fn metric(&self) -> VectorDistanceMetric {
        self.config.distance_metric
    }
}
/// The indexer for vector indexes across multiple columns.
///
/// Holds one creator per indexed vector column and drives them through the
/// update / finish / abort life cycle.
pub struct VectorIndexer {
    /// Per-column vector index creators, keyed by column id.
    creators: HashMap<ColumnId, VectorIndexCreator>,
    /// Provider for intermediate files.
    temp_file_provider: Arc<TempFileProvider>,
    /// Whether the indexing process has been aborted.
    aborted: bool,
    /// Statistics for this indexer.
    stats: Statistics,
    /// Global memory usage tracker.
    // NOTE(review): only stored at construction time; nothing visible here reads it yet,
    // which is why the dead-code lint is silenced for this field.
    #[allow(dead_code)]
    global_memory_usage: Arc<AtomicUsize>,
    /// Region metadata, used to resolve column names for flat batches and to report
    /// the region id in log messages.
    metadata: RegionMetadataRef,
    /// Memory usage threshold. When set, an update fails once a creator's memory usage
    /// exceeds it.
    memory_usage_threshold: Option<usize>,
}
impl VectorIndexer {
/// Creates a vector indexer for the SST identified by `sst_file_id`.
///
/// A creator is set up for every column of `metadata` that both has an entry in
/// `vector_index_options` and is of a vector data type.
///
/// Returns `None` if there are no vector columns that need indexing, and an error if
/// any creator cannot be constructed.
pub fn new(
    sst_file_id: FileId,
    metadata: &RegionMetadataRef,
    intermediate_manager: IntermediateManager,
    memory_usage_threshold: Option<usize>,
    vector_index_options: &HashMap<ColumnId, VectorIndexOptions>,
) -> Result<Option<Self>> {
    let location = IntermediateLocation::new(&metadata.region_id, &sst_file_id);
    let temp_file_provider = Arc::new(TempFileProvider::new(location, intermediate_manager));
    let global_memory_usage = Arc::new(AtomicUsize::new(0));

    let mut creators = HashMap::new();
    for column in &metadata.column_metadatas {
        // A column qualifies only when it has index options AND is a vector column.
        let (Some(options), ConcreteDataType::Vector(vector_type)) = (
            vector_index_options.get(&column.column_id),
            &column.column_schema.data_type,
        ) else {
            continue;
        };

        let config = VectorIndexConfig::new(vector_type.dim as usize, options);
        creators.insert(column.column_id, VectorIndexCreator::new(config)?);
    }

    if creators.is_empty() {
        return Ok(None);
    }

    Ok(Some(Self {
        creators,
        temp_file_provider,
        aborted: false,
        stats: Statistics::new(TYPE_VECTOR_INDEX),
        global_memory_usage,
        metadata: metadata.clone(),
        memory_usage_threshold,
    }))
}
/// Updates the index with a batch of rows.
///
/// Fails immediately if the indexer was aborted, and is a no-op when no creators are
/// left. If the update itself fails, garbage is cleaned up before the update error is
/// returned; a cleanup failure panics in test builds and is only logged otherwise.
pub async fn update(&mut self, batch: &mut Batch) -> Result<()> {
    ensure!(!self.aborted, OperateAbortedIndexSnafu);

    if self.creators.is_empty() {
        return Ok(());
    }

    let Err(update_err) = self.do_update(batch).await else {
        return Ok(());
    };

    // The update failed: clean up garbage, then surface the original error.
    match self.do_cleanup().await {
        Ok(_) => {}
        Err(err) if cfg!(any(test, feature = "test")) => {
            panic!("Failed to clean up vector index creator, err: {err:?}");
        }
        Err(err) => {
            warn!(err; "Failed to clean up vector index creator");
        }
    }
    Err(update_err)
}
/// Updates the index with a flat format `RecordBatch`.
///
/// Fails immediately if the indexer was aborted, and is a no-op when no creators are
/// left or the batch has no rows. If the update itself fails, garbage is cleaned up
/// before the update error is returned; a cleanup failure panics in test builds and is
/// only logged otherwise.
pub async fn update_flat(&mut self, batch: &RecordBatch) -> Result<()> {
    ensure!(!self.aborted, OperateAbortedIndexSnafu);

    let nothing_to_do = self.creators.is_empty() || batch.num_rows() == 0;
    if nothing_to_do {
        return Ok(());
    }

    let Err(update_err) = self.do_update_flat(batch).await else {
        return Ok(());
    };

    // The update failed: clean up garbage, then surface the original error.
    match self.do_cleanup().await {
        Ok(_) => {}
        Err(err) if cfg!(any(test, feature = "test")) => {
            panic!("Failed to clean up vector index creator, err: {err:?}");
        }
        Err(err) => {
            warn!(err; "Failed to clean up vector index creator");
        }
    }
    Err(update_err)
}
/// Internal update implementation.
///
/// For every indexed column, each row of `batch` is either added to the column's
/// engine (binary values, interpreted as packed `f32`s) or recorded as NULL.
///
/// # Errors
///
/// Returns a build error when a vector's dimension differs from the configured one,
/// when the engine rejects a vector, or when a creator's memory usage exceeds the
/// configured threshold.
async fn do_update(&mut self, batch: &mut Batch) -> Result<()> {
    let mut guard = self.stats.record_update();

    let n = batch.num_rows();
    guard.inc_row_count(n);

    for (col_id, creator) in &mut self.creators {
        let Some(values) = batch.field_col_value(*col_id) else {
            // The batch carries no values for this column. Record the rows as NULLs
            // (as the flat path does) so the creator's row offset and NULL bitmap stay
            // aligned with the SST rows even if later batches do contain the column.
            creator.add_nulls(n);
            continue;
        };

        // Process each row in the batch
        for i in 0..n {
            match values.data.get_ref(i) {
                ValueRef::Binary(bytes) => {
                    // Extract the vector bytes and convert to f32 slice
                    let floats = bytes_to_f32_slice(bytes);
                    if floats.len() != creator.config.dim {
                        return VectorIndexBuildSnafu {
                            reason: format!(
                                "Vector dimension mismatch: expected {}, got {}",
                                creator.config.dim,
                                floats.len()
                            ),
                        }
                        .fail();
                    }
                    creator.add_vector(&floats)?;
                }
                // NULLs and any non-binary value occupy a row but are not indexed.
                _ => creator.add_null(),
            }
        }

        // Check memory limit - abort index creation if exceeded
        if let Some(threshold) = self.memory_usage_threshold {
            let current_usage = creator.memory_usage();
            if current_usage > threshold {
                warn!(
                    "Vector index memory usage {} exceeds threshold {}, aborting index creation, region_id: {}",
                    current_usage, threshold, self.metadata.region_id
                );
                return VectorIndexBuildSnafu {
                    reason: format!(
                        "Memory usage {} exceeds threshold {}",
                        current_usage, threshold
                    ),
                }
                .fail();
            }
        }
    }

    Ok(())
}
/// Internal flat update implementation.
///
/// For every indexed column the matching array is looked up in `batch` by column name.
/// A column absent from the batch contributes one NULL per row; otherwise each valid
/// binary value is decoded into `f32`s and added to the column's engine.
///
/// Returns a build error when the column is unknown to the region metadata, the array
/// is not binary, a vector has the wrong dimension, the engine rejects a vector, or the
/// memory threshold is exceeded.
async fn do_update_flat(&mut self, batch: &RecordBatch) -> Result<()> {
    let row_total = batch.num_rows();
    let mut guard = self.stats.record_update();
    guard.inc_row_count(row_total);

    for (col_id, creator) in &mut self.creators {
        // This should never happen: creator exists but column not in metadata
        let Some(column_meta) = self.metadata.column_by_id(*col_id) else {
            return VectorIndexBuildSnafu {
                reason: format!(
                    "Column {} not found in region metadata, this is a bug",
                    col_id
                ),
            }
            .fail();
        };
        let column_name = &column_meta.column_schema.name;

        // Column not in batch is normal for flat format - treat as NULLs
        let Some(column_array) = batch.column_by_name(column_name) else {
            creator.add_nulls(row_total);
            continue;
        };

        // Vector type must be stored as binary array
        let Some(binary_array) = column_array.as_any().downcast_ref::<BinaryArray>() else {
            return VectorIndexBuildSnafu {
                reason: format!(
                    "Column {} is not a binary array, got {:?}",
                    column_name,
                    column_array.data_type()
                ),
            }
            .fail();
        };

        for row in 0..row_total {
            if binary_array.is_null(row) {
                creator.add_null();
                continue;
            }

            let floats = bytes_to_f32_slice(binary_array.value(row));
            let expected_dim = creator.config.dim;
            if floats.len() != expected_dim {
                return VectorIndexBuildSnafu {
                    reason: format!(
                        "Vector dimension mismatch: expected {}, got {}",
                        expected_dim,
                        floats.len()
                    ),
                }
                .fail();
            }
            creator.add_vector(&floats)?;
        }

        // Enforce the memory limit, if one is configured.
        let Some(threshold) = self.memory_usage_threshold else {
            continue;
        };
        let current_usage = creator.memory_usage();
        if current_usage <= threshold {
            continue;
        }

        warn!(
            "Vector index memory usage {} exceeds threshold {}, aborting index creation, region_id: {}",
            current_usage, threshold, self.metadata.region_id
        );
        return VectorIndexBuildSnafu {
            reason: format!(
                "Memory usage {} exceeds threshold {}",
                current_usage, threshold
            ),
        }
        .fail();
    }

    Ok(())
}
/// Finishes index creation and writes the index blobs to puffin.
///
/// Returns the number of rows and bytes written. Fails immediately if the indexer was
/// aborted; returns `(0, 0)` without any IO when no rows were recorded. Garbage is
/// cleaned up whether or not finishing succeeds, and the creation metrics are only
/// reported on success.
pub async fn finish(
    &mut self,
    puffin_writer: &mut SstPuffinWriter,
) -> Result<(RowCount, ByteCount)> {
    ensure!(!self.aborted, OperateAbortedIndexSnafu);

    // No IO is performed, no garbage to clean up
    if self.stats.row_count() == 0 {
        return Ok((0, 0));
    }

    let finish_res = self.do_finish(puffin_writer).await;

    // Clean up garbage no matter finish successfully or not
    match self.do_cleanup().await {
        Ok(_) => {}
        Err(err) if cfg!(any(test, feature = "test")) => {
            panic!("Failed to clean up vector index creator, err: {err:?}");
        }
        Err(err) => {
            warn!(err; "Failed to clean up vector index creator");
        }
    }

    // A failed finish is reported only after the cleanup above has run.
    finish_res?;

    INDEX_CREATE_ROWS_TOTAL
        .with_label_values(&[TYPE_VECTOR_INDEX])
        .inc_by(self.stats.row_count() as u64);
    INDEX_CREATE_BYTES_TOTAL
        .with_label_values(&[TYPE_VECTOR_INDEX])
        .inc_by(self.stats.byte_count());

    Ok((self.stats.row_count(), self.stats.byte_count()))
}
/// Internal finish implementation.
///
/// Writes one blob per creator that holds at least one vector and adds the written
/// bytes to the statistics. Creators without vectors are skipped.
async fn do_finish(&mut self, puffin_writer: &mut SstPuffinWriter) -> Result<()> {
    let mut guard = self.stats.record_finish();

    for (column_id, creator) in &mut self.creators {
        if creator.size() > 0 {
            let blob_bytes =
                Self::do_finish_single_creator(*column_id, creator, puffin_writer).await?;
            guard.inc_byte_count(blob_bytes);
        }
    }

    Ok(())
}
/// Finishes a single column's vector index.
///
/// Serializes the header, the NULL bitmap and the engine's index into one blob and
/// streams it to `puffin_writer` under the name `"{INDEX_BLOB_TYPE}-{col_id}"`.
/// Returns the number of bytes reported by the puffin writer.
///
/// # Errors
///
/// Fails when a header field does not fit its on-disk width, when the NULL bitmap or
/// the index cannot be serialized, or when writing the blob fails.
///
/// The blob format v1 (header = 33 bytes):
/// ```text
/// +------------------+
/// | Version          | 1 byte (u8, = 1)
/// +------------------+
/// | Engine type      | 1 byte (u8, engine identifier)
/// +------------------+
/// | Dimension        | 4 bytes (u32, little-endian)
/// +------------------+
/// | Metric           | 1 byte (u8, distance metric)
/// +------------------+
/// | Connectivity     | 2 bytes (u16, little-endian, HNSW M parameter)
/// +------------------+
/// | Expansion add    | 2 bytes (u16, little-endian, ef_construction)
/// +------------------+
/// | Expansion search | 2 bytes (u16, little-endian, ef_search)
/// +------------------+
/// | Total rows       | 8 bytes (u64, little-endian, total rows in SST)
/// +------------------+
/// | Indexed rows     | 8 bytes (u64, little-endian, non-NULL rows indexed)
/// +------------------+
/// | NULL bitmap len  | 4 bytes (u32, little-endian)
/// +------------------+
/// | NULL bitmap      | variable length (serialized RoaringBitmap)
/// +------------------+
/// | Vector index     | variable length (engine-specific serialized format)
/// +------------------+
/// ```
async fn do_finish_single_creator(
    col_id: ColumnId,
    creator: &mut VectorIndexCreator,
    puffin_writer: &mut SstPuffinWriter,
) -> Result<ByteCount> {
    /// Size of the vector index blob header in bytes.
    /// Header format: version(1) + engine(1) + dim(4) + metric(1) +
    /// connectivity(2) + expansion_add(2) + expansion_search(2) +
    /// total_rows(8) + indexed_rows(8) + bitmap_len(4) = 33 bytes
    const VECTOR_INDEX_BLOB_HEADER_SIZE: usize = 33;

    /// Narrows a config value to its on-disk header width, failing instead of
    /// silently truncating when the value does not fit.
    fn narrow_header_field<T: TryFrom<usize>>(name: &str, value: usize) -> Result<T> {
        T::try_from(value).map_err(|_| {
            VectorIndexFinishSnafu {
                reason: format!(
                    "{} {} does not fit in the vector index blob header",
                    name, value
                ),
            }
            .build()
        })
    }

    // Validate every fixed-width header field up front, before any large allocation.
    let dim: u32 = narrow_header_field("dimension", creator.config.dim)?;
    let connectivity: u16 = narrow_header_field("connectivity", creator.config.connectivity)?;
    let expansion_add: u16 =
        narrow_header_field("expansion_add", creator.config.expansion_add)?;
    let expansion_search: u16 =
        narrow_header_field("expansion_search", creator.config.expansion_search)?;

    // Serialize the NULL bitmap
    let mut null_bitmap_bytes = Vec::new();
    creator
        .null_bitmap
        .serialize_into(&mut null_bitmap_bytes)
        .map_err(|e| {
            VectorIndexFinishSnafu {
                reason: format!("Failed to serialize NULL bitmap: {}", e),
            }
            .build()
        })?;
    let bitmap_len: u32 = null_bitmap_bytes.len().try_into().map_err(|_| {
        VectorIndexBuildSnafu {
            reason: format!(
                "NULL bitmap size {} exceeds maximum allowed size {}",
                null_bitmap_bytes.len(),
                u32::MAX
            ),
        }
        .build()
    })?;

    let index_size = creator.serialized_length();
    let prefix_size = VECTOR_INDEX_BLOB_HEADER_SIZE + null_bitmap_bytes.len();
    let mut blob_data = Vec::with_capacity(prefix_size + index_size);

    // Write version (1 byte)
    blob_data.push(1u8);
    // Write engine type (1 byte)
    blob_data.push(creator.engine_type().as_u8());
    // Write dimension (4 bytes, little-endian)
    blob_data.extend_from_slice(&dim.to_le_bytes());
    // Write metric (1 byte)
    blob_data.push(creator.metric().as_u8());
    // Write connectivity/M (2 bytes, little-endian)
    blob_data.extend_from_slice(&connectivity.to_le_bytes());
    // Write expansion_add/ef_construction (2 bytes, little-endian)
    blob_data.extend_from_slice(&expansion_add.to_le_bytes());
    // Write expansion_search/ef_search (2 bytes, little-endian)
    blob_data.extend_from_slice(&expansion_search.to_le_bytes());
    // Write total_rows (8 bytes, little-endian)
    blob_data.extend_from_slice(&creator.current_row_offset.to_le_bytes());
    // Write indexed_rows (8 bytes, little-endian)
    blob_data.extend_from_slice(&creator.next_hnsw_key.to_le_bytes());
    // Write NULL bitmap length (4 bytes, little-endian)
    blob_data.extend_from_slice(&bitmap_len.to_le_bytes());
    // Write NULL bitmap
    blob_data.extend_from_slice(&null_bitmap_bytes);
    debug_assert_eq!(blob_data.len(), prefix_size);

    // Write vector index: serialize straight into the tail of the blob so the index
    // is not held in memory twice.
    let index_offset = blob_data.len();
    blob_data.resize(index_offset + index_size, 0);
    creator.save_to_buffer(&mut blob_data[index_offset..])?;

    // Create blob name following the same pattern as bloom filter
    let blob_name = format!("{}-{}", INDEX_BLOB_TYPE, col_id);

    // Write to puffin using a pipe
    let (tx, rx) = tokio::io::duplex(PIPE_BUFFER_SIZE_FOR_SENDING_BLOB);

    // Writer task writes the blob data to the pipe
    let write_index = async move {
        use tokio::io::AsyncWriteExt;
        let mut writer = tx;
        writer.write_all(&blob_data).await?;
        writer.shutdown().await?;
        Ok::<(), std::io::Error>(())
    };

    // Both halves of the pipe must be driven concurrently.
    let (index_write_result, puffin_add_blob) = futures::join!(
        write_index,
        puffin_writer.put_blob(
            &blob_name,
            rx.compat(),
            PutOptions::default(),
            Default::default()
        )
    );

    let add_blob_result = puffin_add_blob.context(PuffinAddBlobSnafu);
    let write_result = index_write_result.map_err(|e| {
        VectorIndexFinishSnafu {
            reason: format!("Failed to write blob data: {}", e),
        }
        .build()
    });

    match (add_blob_result, write_result) {
        (Ok(written_bytes), Ok(_)) => Ok(written_bytes),
        (Err(e1), Err(e2)) => BiErrorsSnafu {
            first: Box::new(e1),
            second: Box::new(e2),
        }
        .fail(),
        (Ok(_), Err(e)) => Err(e),
        (Err(e), Ok(_)) => Err(e),
    }
}
/// Aborts index creation and cleans up garbage.
///
/// The first call marks the indexer as aborted and runs the cleanup; any further call
/// returns `Ok(())` without doing anything.
pub async fn abort(&mut self) -> Result<()> {
    if !self.aborted {
        self.aborted = true;
        self.do_cleanup().await
    } else {
        Ok(())
    }
}
/// Drops all per-column creators and asks the temp file provider to clean up,
/// recording the cleanup in the statistics.
async fn do_cleanup(&mut self) -> Result<()> {
    let _cleanup_guard = self.stats.record_cleanup();

    self.creators.clear();
    let provider = &self.temp_file_provider;
    provider.cleanup().await
}
/// Returns the memory usage of the indexer.
pub fn memory_usage(&self) -> usize {
self.creators.values().map(|c| c.memory_usage()).sum()
}
/// Returns the ids of the columns that currently have a creator.
pub fn column_ids(&self) -> impl Iterator<Item = ColumnId> + '_ {
    self.creators.iter().map(|(column_id, _)| *column_id)
}
}
#[cfg(test)]
mod tests {
use super::*;
/// A creator counts vectors and NULL rows separately and remembers which rows are NULL.
#[test]
fn test_vector_index_creator() {
    let config = VectorIndexConfig::new(4, &VectorIndexOptions::default());
    let mut creator = VectorIndexCreator::new(config).unwrap();
    creator.reserve(10).unwrap();

    // Row layout: vector, NULL, vector.
    creator.add_vector(&[1.0f32, 0.0, 0.0, 0.0]).unwrap();
    creator.add_null();
    creator.add_vector(&[0.0f32, 1.0, 0.0, 0.0]).unwrap();

    // Two vectors were indexed; the NULL row is not counted.
    assert_eq!(creator.size(), 2);
    // Three rows were seen in total.
    assert_eq!(creator.current_row_offset, 3);
    // The middle row is the NULL one.
    assert!(creator.null_bitmap.contains(1));
}
/// A creator holding vectors serializes to a non-empty, non-zero buffer.
#[test]
fn test_vector_index_creator_serialization() {
    let config = VectorIndexConfig::new(4, &VectorIndexOptions::default());
    let mut creator = VectorIndexCreator::new(config).unwrap();
    creator.reserve(10).unwrap();

    // Three unit vectors along different axes.
    let unit_vectors = [
        [1.0f32, 0.0, 0.0, 0.0],
        [0.0f32, 1.0, 0.0, 0.0],
        [0.0f32, 0.0, 1.0, 0.0],
    ];
    for unit in &unit_vectors {
        creator.add_vector(unit).unwrap();
    }

    // The engine must report a non-zero serialized length...
    let mut buffer = vec![0u8; creator.serialized_length()];
    assert!(!buffer.is_empty());

    // ...and writing into the buffer must produce at least one non-zero byte.
    creator.save_to_buffer(&mut buffer).unwrap();
    assert!(buffer.iter().any(|&byte| byte != 0));
}
/// The NULL bitmap tracks exactly the NULL rows and survives a serialization round trip.
#[test]
fn test_vector_index_creator_null_bitmap_serialization() {
    let config = VectorIndexConfig::new(4, &VectorIndexOptions::default());
    let mut creator = VectorIndexCreator::new(config).unwrap();
    creator.reserve(10).unwrap();

    // Row layout: vector, NULL, vector, NULL, NULL, vector.
    creator.add_vector(&[1.0, 0.0, 0.0, 0.0]).unwrap();
    creator.add_null();
    creator.add_vector(&[0.0, 1.0, 0.0, 0.0]).unwrap();
    creator.add_nulls(2);
    creator.add_vector(&[0.0, 0.0, 1.0, 0.0]).unwrap();

    assert_eq!(creator.size(), 3);
    assert_eq!(creator.current_row_offset, 6);

    // Rows 1, 3 and 4 are NULL; every other row holds a vector.
    let null_rows = [1u32, 3, 4];
    for row in 0..6u32 {
        assert_eq!(creator.null_bitmap.contains(row), null_rows.contains(&row));
    }

    // Round-trip the bitmap through its serialized form.
    let mut encoded = Vec::new();
    creator.null_bitmap.serialize_into(&mut encoded).unwrap();
    let restored = RoaringBitmap::deserialize_from(&encoded[..]).unwrap();

    assert_eq!(restored.len(), 3);
    for row in null_rows {
        assert!(restored.contains(row));
    }
}
#[test]
fn test_vector_index_config() {
    use index::vector::VectorDistanceMetric;

    // Non-default HNSW parameters so that a dropped or swapped field is detected.
    let config = VectorIndexConfig::new(
        128,
        &VectorIndexOptions {
            engine: VectorIndexEngineType::default(),
            metric: VectorDistanceMetric::Cosine,
            connectivity: 32,
            expansion_add: 256,
            expansion_search: 128,
        },
    );

    // The default engine type is expected to be USearch, and the cosine distance
    // metric is expected to map to USearch's `MetricKind::Cos`.
    assert_eq!(config.engine, VectorIndexEngineType::Usearch);
    assert_eq!(config.metric, MetricKind::Cos);

    // Dimension and HNSW parameters are carried over unchanged.
    assert_eq!(config.dim, 128);
    assert_eq!(config.connectivity, 32);
    assert_eq!(config.expansion_add, 256);
    assert_eq!(config.expansion_search, 128);
}
#[test]
fn test_vector_index_header_format() {
    use index::vector::VectorDistanceMetric;

    // Create config with specific HNSW parameters
    let options = VectorIndexOptions {
        engine: VectorIndexEngineType::Usearch,
        metric: VectorDistanceMetric::L2sq,
        connectivity: 24,
        expansion_add: 200,
        expansion_search: 100,
    };
    let config = VectorIndexConfig::new(4, &options);
    let mut creator = VectorIndexCreator::new(config).unwrap();
    creator.reserve(10).unwrap();

    // Add pattern: vector, null, vector, null, vector
    creator.add_vector(&[1.0, 0.0, 0.0, 0.0]).unwrap();
    creator.add_null();
    creator.add_vector(&[0.0, 1.0, 0.0, 0.0]).unwrap();
    creator.add_null();
    creator.add_vector(&[0.0, 0.0, 1.0, 0.0]).unwrap();

    // Verify counts
    assert_eq!(creator.current_row_offset, 5); // total_rows
    assert_eq!(creator.next_hnsw_key, 3); // indexed_rows

    // Build blob data manually (simulating write_to_puffin header writing)
    let mut null_bitmap_bytes = Vec::new();
    creator
        .null_bitmap
        .serialize_into(&mut null_bitmap_bytes)
        .unwrap();
    let index_size = creator.serialized_length();
    let mut index_bytes = vec![0u8; index_size];
    creator.save_to_buffer(&mut index_bytes).unwrap();

    // Header layout written below (33 bytes, all integers little-endian):
    //   [0]       version           (u8)
    //   [1]       engine type       (u8)
    //   [2..6]    dimension         (u32)
    //   [6]       metric            (u8)
    //   [7..9]    connectivity      (u16)
    //   [9..11]   expansion_add     (u16)
    //   [11..13]  expansion_search  (u16)
    //   [13..21]  total_rows        (u64)
    //   [21..29]  indexed_rows      (u64)
    //   [29..33]  null bitmap len   (u32)
    let header_size = 33;
    let total_size = header_size + null_bitmap_bytes.len() + index_bytes.len();
    let mut blob_data = Vec::with_capacity(total_size);

    // Write header fields
    blob_data.push(1u8); // version
    blob_data.push(creator.engine_type().as_u8()); // engine type
    blob_data.extend_from_slice(&(creator.config.dim as u32).to_le_bytes()); // dimension
    blob_data.push(creator.metric().as_u8()); // metric
    blob_data.extend_from_slice(&(creator.config.connectivity as u16).to_le_bytes());
    blob_data.extend_from_slice(&(creator.config.expansion_add as u16).to_le_bytes());
    blob_data.extend_from_slice(&(creator.config.expansion_search as u16).to_le_bytes());
    blob_data.extend_from_slice(&creator.current_row_offset.to_le_bytes()); // total_rows
    blob_data.extend_from_slice(&creator.next_hnsw_key.to_le_bytes()); // indexed_rows
    let bitmap_len: u32 = null_bitmap_bytes.len().try_into().unwrap();
    blob_data.extend_from_slice(&bitmap_len.to_le_bytes());
    blob_data.extend_from_slice(&null_bitmap_bytes);
    blob_data.extend_from_slice(&index_bytes);

    // Verify the total blob size (header + NULL bitmap + index payload). This only
    // holds if the header fields written above add up to `header_size` bytes.
    assert_eq!(blob_data.len(), total_size);

    // Parse header and verify values. Fixed-width fields are read by converting the
    // corresponding byte range into an array, which keeps offsets and widths visible.
    assert_eq!(blob_data[0], 1); // version
    assert_eq!(blob_data[1], VectorIndexEngineType::Usearch.as_u8()); // engine
    let dim = u32::from_le_bytes(blob_data[2..6].try_into().unwrap());
    assert_eq!(dim, 4);
    let metric = blob_data[6];
    assert_eq!(
        metric,
        datatypes::schema::VectorDistanceMetric::L2sq.as_u8()
    );
    let connectivity = u16::from_le_bytes(blob_data[7..9].try_into().unwrap());
    assert_eq!(connectivity, 24);
    let expansion_add = u16::from_le_bytes(blob_data[9..11].try_into().unwrap());
    assert_eq!(expansion_add, 200);
    let expansion_search = u16::from_le_bytes(blob_data[11..13].try_into().unwrap());
    assert_eq!(expansion_search, 100);
    let total_rows = u64::from_le_bytes(blob_data[13..21].try_into().unwrap());
    assert_eq!(total_rows, 5);
    let indexed_rows = u64::from_le_bytes(blob_data[21..29].try_into().unwrap());
    assert_eq!(indexed_rows, 3);
    let null_bitmap_len = u32::from_le_bytes(blob_data[29..33].try_into().unwrap());
    assert_eq!(null_bitmap_len as usize, null_bitmap_bytes.len());

    // Verify null bitmap can be deserialized
    let null_bitmap_data = &blob_data[header_size..header_size + null_bitmap_len as usize];
    let restored_bitmap = RoaringBitmap::deserialize_from(null_bitmap_data).unwrap();
    assert_eq!(restored_bitmap.len(), 2); // 2 nulls
    assert!(restored_bitmap.contains(1));
    assert!(restored_bitmap.contains(3));
}
}

View File

@@ -0,0 +1,45 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//! Pluggable vector index engine implementations.
mod usearch_impl;
use store_api::storage::{VectorIndexEngine, VectorIndexEngineType};
pub use usearch_impl::UsearchEngine;
use crate::error::Result;
use crate::sst::index::vector_index::creator::VectorIndexConfig;
/// Builds an empty vector index engine of the requested kind.
///
/// Dispatches on `engine_type`; `config` is forwarded to the concrete engine's
/// constructor and any construction error is propagated to the caller.
pub fn create_engine(
    engine_type: VectorIndexEngineType,
    config: &VectorIndexConfig,
) -> Result<Box<dyn VectorIndexEngine>> {
    let engine: Box<dyn VectorIndexEngine> = match engine_type {
        VectorIndexEngineType::Usearch => Box::new(UsearchEngine::create(config)?),
    };
    Ok(engine)
}
/// Restores a vector index engine of the requested kind from its serialized bytes.
///
/// Dispatches on `engine_type`; `config` and `data` are forwarded to the concrete
/// engine's loader and any load error is propagated to the caller.
// NOTE(review): `allow(unused)` suggests no caller exists yet — confirm and drop the
// attribute once the load path is wired up.
#[allow(unused)]
pub fn load_engine(
    engine_type: VectorIndexEngineType,
    config: &VectorIndexConfig,
    data: &[u8],
) -> Result<Box<dyn VectorIndexEngine>> {
    let engine: Box<dyn VectorIndexEngine> = match engine_type {
        VectorIndexEngineType::Usearch => Box::new(UsearchEngine::load(config, data)?),
    };
    Ok(engine)
}

View File

@@ -0,0 +1,231 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//! USearch HNSW implementation of VectorIndexEngine.
use common_error::ext::BoxedError;
use store_api::storage::{VectorIndexEngine, VectorSearchMatches};
use usearch::{Index, IndexOptions, ScalarKind};
use crate::error::{Result, VectorIndexBuildSnafu};
use crate::sst::index::vector_index::creator::VectorIndexConfig;
/// USearch-based vector index engine using HNSW algorithm.
///
/// Thin wrapper that owns a single `usearch::Index`.
pub struct UsearchEngine {
/// The underlying USearch index that the engine's operations are forwarded to.
index: Index,
}
impl UsearchEngine {
    /// Builds an empty USearch index configured from `config`.
    ///
    /// Dimension, metric and the HNSW parameters (`connectivity`, `expansion_add`,
    /// `expansion_search`) are taken from `config`; scalars are stored as `f32` and
    /// `multi` is disabled.
    ///
    /// # Errors
    ///
    /// Returns an error built from `VectorIndexBuildSnafu` if USearch fails to
    /// create the index.
    pub fn create(config: &VectorIndexConfig) -> Result<Self> {
        let index = match Index::new(&IndexOptions {
            dimensions: config.dim,
            metric: config.metric,
            quantization: ScalarKind::F32,
            connectivity: config.connectivity,
            expansion_add: config.expansion_add,
            expansion_search: config.expansion_search,
            multi: false,
        }) {
            Ok(index) => index,
            Err(e) => {
                return VectorIndexBuildSnafu {
                    reason: format!("Failed to create USearch index: {}", e),
                }
                .fail();
            }
        };
        Ok(Self { index })
    }

    /// Rebuilds a USearch engine from a buffer previously produced by serialization.
    ///
    /// Only the dimension and metric are taken from `config`; the HNSW parameters are
    /// passed as zero and left to the serialized data.
    ///
    /// # Errors
    ///
    /// Returns an error built from `VectorIndexBuildSnafu` if the index cannot be
    /// created or if `data` cannot be loaded into it.
    #[allow(unused)]
    pub fn load(config: &VectorIndexConfig, data: &[u8]) -> Result<Self> {
        let options = IndexOptions {
            dimensions: config.dim,
            metric: config.metric,
            quantization: ScalarKind::F32,
            // These will be loaded from serialized data
            connectivity: 0,
            expansion_add: 0,
            expansion_search: 0,
            multi: false,
        };
        let index = match Index::new(&options) {
            Ok(index) => index,
            Err(e) => {
                return VectorIndexBuildSnafu {
                    reason: format!("Failed to create USearch index for loading: {}", e),
                }
                .fail();
            }
        };
        if let Err(e) = index.load_from_buffer(data) {
            return VectorIndexBuildSnafu {
                reason: format!("Failed to load USearch index from buffer: {}", e),
            }
            .fail();
        }
        Ok(Self { index })
    }
}
impl VectorIndexEngine for UsearchEngine {
fn add(&mut self, key: u64, vector: &[f32]) -> Result<(), BoxedError> {
// Reserve capacity if needed
if self.index.size() >= self.index.capacity() {
let new_capacity = std::cmp::max(1, self.index.capacity() * 2);
self.index.reserve(new_capacity).map_err(|e| {
BoxedError::new(
VectorIndexBuildSnafu {
reason: format!("Failed to reserve capacity: {}", e),
}
.build(),
)
})?;
}
self.index.add(key, vector).map_err(|e| {
BoxedError::new(
VectorIndexBuildSnafu {
reason: format!("Failed to add vector: {}", e),
}
.build(),
)
})
}
fn search(&self, query: &[f32], k: usize) -> Result<VectorSearchMatches, BoxedError> {
let matches = self.index.search(query, k).map_err(|e| {
BoxedError::new(
VectorIndexBuildSnafu {
reason: format!("Failed to search: {}", e),
}
.build(),
)
})?;
Ok(VectorSearchMatches {
keys: matches.keys,
distances: matches.distances,
})
}
fn serialized_length(&self) -> usize {
self.index.serialized_length()
}
fn save_to_buffer(&self, buffer: &mut [u8]) -> Result<(), BoxedError> {
self.index.save_to_buffer(buffer).map_err(|e| {
BoxedError::new(
VectorIndexBuildSnafu {
reason: format!("Failed to save to buffer: {}", e),
}
.build(),
)
})
}
fn reserve(&mut self, capacity: usize) -> Result<(), BoxedError> {
self.index.reserve(capacity).map_err(|e| {
BoxedError::new(
VectorIndexBuildSnafu {
reason: format!("Failed to reserve: {}", e),
}
.build(),
)
})
}
fn size(&self) -> usize {
self.index.size()
}
fn capacity(&self) -> usize {
self.index.capacity()
}
fn memory_usage(&self) -> usize {
self.index.memory_usage()
}
}
#[cfg(test)]
mod tests {
    use index::vector::VectorDistanceMetric;
    use store_api::storage::VectorIndexEngineType;
    use usearch::MetricKind;

    use super::*;

    /// 4-dimensional squared-L2 configuration shared by the tests in this module.
    fn test_config() -> VectorIndexConfig {
        VectorIndexConfig {
            engine: VectorIndexEngineType::Usearch,
            dim: 4,
            metric: MetricKind::L2sq,
            distance_metric: VectorDistanceMetric::L2sq,
            connectivity: 16,
            expansion_add: 128,
            expansion_search: 64,
        }
    }

    #[test]
    fn test_usearch_engine_create() {
        // A freshly created engine holds no vectors.
        let engine = UsearchEngine::create(&test_config()).unwrap();
        assert_eq!(engine.size(), 0);
    }

    #[test]
    fn test_usearch_engine_add_and_search() {
        let mut engine = UsearchEngine::create(&test_config()).unwrap();

        // Insert three unit vectors under keys 0, 1 and 2.
        let unit_vectors = [
            [1.0f32, 0.0, 0.0, 0.0],
            [0.0f32, 1.0, 0.0, 0.0],
            [0.0f32, 0.0, 1.0, 0.0],
        ];
        for (key, unit) in (0u64..).zip(&unit_vectors) {
            engine.add(key, unit).unwrap();
        }
        assert_eq!(engine.size(), 3);

        // Query with the first vector and ask for the two nearest neighbours.
        let found = engine.search(&[1.0, 0.0, 0.0, 0.0], 2).unwrap();
        assert_eq!(found.keys.len(), 2);
        // The exact match (key 0) must be ranked first.
        assert_eq!(found.keys[0], 0);
    }

    #[test]
    fn test_usearch_engine_serialization() {
        let config = test_config();
        let mut original = UsearchEngine::create(&config).unwrap();
        original.add(0, &[1.0, 0.0, 0.0, 0.0]).unwrap();
        original.add(1, &[0.0, 1.0, 0.0, 0.0]).unwrap();

        // Save into a buffer of exactly the reported size.
        let mut buffer = vec![0u8; original.serialized_length()];
        original.save_to_buffer(&mut buffer).unwrap();

        // Load it back and check that the contents survived the round trip.
        let restored = UsearchEngine::load(&config, &buffer).unwrap();
        assert_eq!(restored.size(), 2);

        // The restored index must still answer queries.
        let found = restored.search(&[1.0, 0.0, 0.0, 0.0], 1).unwrap();
        assert_eq!(found.keys[0], 0);
    }
}

View File

@@ -0,0 +1,22 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//! Vector index module for HNSW-based approximate nearest neighbor search.
pub(crate) mod creator;
pub(crate) mod engine;
pub(crate) mod util;
/// The blob type identifier for vector index in puffin files.
///
/// NOTE(review): the `-v1` suffix presumably versions the blob format — confirm
/// before changing the layout of the stored blob.
pub(crate) const INDEX_BLOB_TYPE: &str = "greptime-vector-index-v1";

View File

@@ -0,0 +1,108 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//! Utility functions for vector index operations.
use std::borrow::Cow;
/// Reinterprets little-endian encoded bytes as a slice of `f32` values.
///
/// When the input is aligned for `f32` and the target is little-endian, the bytes
/// are viewed in place and a `Cow::Borrowed` is returned (zero-copy). Otherwise
/// every 4-byte group is decoded into a freshly allocated vector and a
/// `Cow::Owned` is returned. An empty input yields an empty borrowed slice.
///
/// # Panics
///
/// Panics if the byte slice length is not a multiple of 4.
pub fn bytes_to_f32_slice(bytes: &[u8]) -> Cow<'_, [f32]> {
    assert!(
        bytes.len().is_multiple_of(4),
        "Vector bytes length {} is not a multiple of 4",
        bytes.len()
    );
    if bytes.is_empty() {
        return Cow::Borrowed(&[]);
    }

    let float_count = bytes.len() / 4;
    let float_ptr = bytes.as_ptr().cast::<f32>();

    // Zero-copy is only valid when the data is aligned for f32 AND the target is
    // little-endian (the stored encoding is little-endian).
    if cfg!(target_endian = "little") && float_ptr.is_aligned() {
        // SAFETY: `float_ptr` is non-null and was just checked to be aligned for f32;
        // the length check above guarantees that `float_count` f32 values lie entirely
        // within `bytes`; every bit pattern is a valid f32; and the returned slice
        // borrows `bytes`, so it cannot outlive the underlying data. On little-endian
        // targets the in-memory f32 layout matches the stored byte order.
        let floats = unsafe { std::slice::from_raw_parts(float_ptr, float_count) };
        return Cow::Borrowed(floats);
    }

    // Copying path: unaligned input or big-endian target.
    let mut decoded = Vec::with_capacity(float_count);
    for chunk in bytes.chunks_exact(4) {
        decoded.push(f32::from_le_bytes([chunk[0], chunk[1], chunk[2], chunk[3]]));
    }
    Cow::Owned(decoded)
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Little-endian byte image of `floats`.
    fn le_bytes(floats: &[f32]) -> Vec<u8> {
        floats.iter().flat_map(|f| f.to_le_bytes()).collect()
    }

    #[test]
    fn test_bytes_to_f32_slice() {
        let floats = [1.0f32, 2.0, 3.0, 4.0];
        let bytes = le_bytes(&floats);
        let result = bytes_to_f32_slice(&bytes);
        assert_eq!(result.len(), 4);
        assert_eq!(&result[..], &floats[..]);
    }

    #[test]
    fn test_bytes_to_f32_slice_unaligned() {
        let floats = [1.0f32, 2.0, 3.0, 4.0];
        let payload = le_bytes(&floats);
        let align = std::mem::align_of::<f32>();

        // Allocate one spare byte and place the payload at whichever of the offsets
        // 0 or 1 is NOT aligned for f32. Two consecutive addresses cannot both be
        // aligned, so the slice under test is always misaligned and the assertion on
        // the copying path can never be skipped.
        let mut buffer = vec![0u8; payload.len() + 1];
        let offset = usize::from((buffer.as_ptr() as usize).is_multiple_of(align));
        buffer[offset..offset + payload.len()].copy_from_slice(&payload);
        let unaligned_bytes = &buffer[offset..offset + payload.len()];
        assert!(!(unaligned_bytes.as_ptr() as usize).is_multiple_of(align));

        let result = bytes_to_f32_slice(unaligned_bytes);
        assert_eq!(result.len(), 4);
        assert_eq!(&result[..], &floats[..]);
        // Misaligned input must be copied, never reinterpreted in place.
        assert!(matches!(result, Cow::Owned(_)));
    }

    #[test]
    fn test_bytes_to_f32_slice_empty() {
        let bytes: &[u8] = &[];
        let result = bytes_to_f32_slice(bytes);
        assert!(result.is_empty());
    }
}

View File

@@ -742,6 +742,8 @@ mod tests {
inverted_index_config: Default::default(),
fulltext_index_config: Default::default(),
bloom_filter_index_config: Default::default(),
#[cfg(feature = "vector_index")]
vector_index_config: Default::default(),
};
let mut metrics = Metrics::new(WriteType::Flush);
@@ -1152,6 +1154,8 @@ mod tests {
inverted_index_config: Default::default(),
fulltext_index_config: Default::default(),
bloom_filter_index_config: Default::default(),
#[cfg(feature = "vector_index")]
vector_index_config: Default::default(),
}
}
@@ -1271,6 +1275,8 @@ mod tests {
inverted_index_config: Default::default(),
fulltext_index_config: Default::default(),
bloom_filter_index_config: Default::default(),
#[cfg(feature = "vector_index")]
vector_index_config: Default::default(),
};
let mut metrics = Metrics::new(WriteType::Flush);

View File

@@ -21,12 +21,14 @@ use store_api::storage::RegionId;
use tokio::sync::oneshot;
use crate::error::{
RegionStateSnafu, SerdeJsonSnafu, StagingPartitionExprMismatchSnafu, UnexpectedSnafu,
RegionStateSnafu, Result, SerdeJsonSnafu, StagingPartitionExprMismatchSnafu, UnexpectedSnafu,
};
use crate::manifest::action::RegionEdit;
use crate::region::{RegionLeaderState, RegionRoleState};
use crate::request::{OptionOutputTx, RegionEditRequest};
use crate::sst::file::FileMeta;
use crate::manifest::action::{RegionEdit, RegionManifest};
use crate::manifest::storage::manifest_dir;
use crate::manifest::storage::staging::{StagingBlobStorage, staging_blob_path};
use crate::region::{MitoRegionRef, RegionLeaderState, RegionRoleState};
use crate::request::{OptionOutputTx, RegionEditRequest, WorkerRequest, WorkerRequestWithTime};
use crate::sst::location::region_dir_from_table_dir;
use crate::worker::RegionWorkerLoop;
impl<S: LogStore> RegionWorkerLoop<S> {
@@ -86,21 +88,32 @@ impl<S: LogStore> RegionWorkerLoop<S> {
return;
}
let (tx, rx) = oneshot::channel();
let files_to_add = match serde_json::from_slice::<Vec<FileMeta>>(&request.files_to_add)
.context(SerdeJsonSnafu)
{
Ok(files_to_add) => files_to_add,
Err(e) => {
sender.send(Err(e));
let worker_sender = self.sender.clone();
common_runtime::spawn_global(async move {
let staging_manifest = match Self::fetch_staging_manifest(
&region,
request.central_region_id,
&request.manifest_path,
)
.await
{
Ok(staging_manifest) => staging_manifest,
Err(e) => {
sender.send(Err(e));
return;
}
};
if staging_manifest.metadata.partition_expr.as_ref() != Some(&request.partition_expr) {
sender.send(Err(StagingPartitionExprMismatchSnafu {
manifest_expr: staging_manifest.metadata.partition_expr.clone(),
request_expr: request.partition_expr,
}
.build()));
return;
}
};
info!("Applying staging manifest request to region {}", region_id);
self.handle_region_edit(RegionEditRequest {
region_id,
edit: RegionEdit {
let files_to_add = staging_manifest.files.values().cloned().collect::<Vec<_>>();
let edit = RegionEdit {
files_to_add,
files_to_remove: vec![],
timestamp_ms: Some(Utc::now().timestamp_millis()),
@@ -108,11 +121,23 @@ impl<S: LogStore> RegionWorkerLoop<S> {
flushed_entry_id: None,
flushed_sequence: None,
committed_sequence: None,
},
tx,
});
};
let (tx, rx) = oneshot::channel();
info!(
"Applying staging manifest request to region {}",
region.region_id,
);
let _ = worker_sender
.send(WorkerRequestWithTime::new(WorkerRequest::EditRegion(
RegionEditRequest {
region_id: region.region_id,
edit,
tx,
},
)))
.await;
common_runtime::spawn_global(async move {
// Await the result from the region edit and forward the outcome to the original sender.
// If the operation completes successfully, respond with Ok(0); otherwise, respond with an appropriate error.
if let Ok(result) = rx.await {
@@ -137,4 +162,25 @@ impl<S: LogStore> RegionWorkerLoop<S> {
}
});
}
/// Loads and decodes a staging manifest from the central region's staging blob storage.
///
/// The staging directory is derived from `central_region_id` rather than from
/// `region`'s own id, because the staging manifest was created by the central
/// region during the `remap_manifests` operation. `region` only supplies the table
/// directory, path type and object store.
///
/// Returns an error if the blob cannot be read or is not a valid JSON-encoded
/// `RegionManifest`.
async fn fetch_staging_manifest(
    region: &MitoRegionRef,
    central_region_id: RegionId,
    manifest_path: &str,
) -> Result<RegionManifest> {
    let central_region_dir =
        region_dir_from_table_dir(region.table_dir(), central_region_id, region.path_type());
    let blob_storage = StagingBlobStorage::new(
        staging_blob_path(&manifest_dir(&central_region_dir)),
        region.access_layer().object_store().clone(),
    );
    let manifest_bytes = blob_storage.get(manifest_path).await?;
    serde_json::from_slice::<RegionManifest>(&manifest_bytes).context(SerdeJsonSnafu)
}
}

View File

@@ -64,6 +64,8 @@ impl<S> RegionWorkerLoop<S> {
inverted_index_config: self.config.inverted_index.clone(),
fulltext_index_config: self.config.fulltext_index.clone(),
bloom_filter_index_config: self.config.bloom_filter_index.clone(),
#[cfg(feature = "vector_index")]
vector_index_config: self.config.vector_index.clone(),
index_options: version.options.index_options.clone(),
row_group_size: WriteOptions::default().row_group_size,
intermediate_manager,

View File

@@ -16,14 +16,13 @@ use std::collections::HashMap;
use std::time::Instant;
use common_error::ext::BoxedError;
use common_telemetry::info;
use common_telemetry::{debug, info};
use futures::future::try_join_all;
use partition::expr::PartitionExpr;
use snafu::{OptionExt, ResultExt};
use store_api::storage::RegionId;
use crate::error::{FetchManifestsSnafu, InvalidRequestSnafu, MissingManifestSnafu, Result};
use crate::manifest::action::RegionManifest;
use crate::error::{self, FetchManifestsSnafu, InvalidRequestSnafu, MissingManifestSnafu, Result};
use crate::region::{MitoRegionRef, RegionMetadataLoader};
use crate::remap_manifest::RemapManifest;
use crate::request::RemapManifestsRequest;
@@ -75,13 +74,17 @@ impl<S> RegionWorkerLoop<S> {
});
}
// Fetches manifests for input regions, remaps them according to the provided
// mapping and partition expressions.
//
// Returns a map from each new region to its relative staging manifest path.
async fn fetch_and_remap_manifests(
region: MitoRegionRef,
region_metadata_loader: RegionMetadataLoader,
input_regions: Vec<RegionId>,
new_partition_exprs: HashMap<RegionId, PartitionExpr>,
region_mapping: HashMap<RegionId, Vec<RegionId>>,
) -> Result<HashMap<RegionId, RegionManifest>> {
) -> Result<HashMap<RegionId, String>> {
let mut tasks = Vec::with_capacity(input_regions.len());
let region_options = region.version().options.clone();
let table_dir = region.table_dir();
@@ -97,7 +100,6 @@ impl<S> RegionWorkerLoop<S> {
.await
});
}
let results = try_join_all(tasks)
.await
.map_err(BoxedError::new)
@@ -112,12 +114,38 @@ impl<S> RegionWorkerLoop<S> {
.collect::<Result<HashMap<_, _>>>()?;
let mut mapper = RemapManifest::new(manifests, new_partition_exprs, region_mapping);
let remap_result = mapper.remap_manifests()?;
// Write new manifests to staging blob storage.
let manifest_manager = region.manifest_ctx.manifest_manager.write().await;
let manifest_storage = manifest_manager.store();
let staging_blob_storage = manifest_storage.staging_storage().blob_storage().clone();
let mut tasks = Vec::with_capacity(remap_result.new_manifests.len());
for (remap_region_id, manifest) in &remap_result.new_manifests {
let bytes = serde_json::to_vec(&manifest).context(error::SerializeManifestSnafu {
region_id: *remap_region_id,
})?;
let key = remap_manifest_key(remap_region_id);
tasks.push(async {
debug!(
"Putting manifest to staging blob storage, region_id: {}, key: {}",
*remap_region_id, key
);
staging_blob_storage.put(&key, bytes).await?;
Ok((*remap_region_id, key))
});
}
let r = try_join_all(tasks).await?;
info!(
"Remap manifests cost: {:?}, region: {}",
now.elapsed(),
region.region_id
);
Ok(remap_result.new_manifests)
Ok(r.into_iter().collect::<HashMap<_, _>>())
}
}
/// Builds the staging blob key for a remapped region's manifest:
/// `remap_manifest_<region id as u64>`.
fn remap_manifest_key(region_id: &RegionId) -> String {
    let raw_id = region_id.as_u64();
    format!("remap_manifest_{raw_id}")
}

View File

@@ -13,6 +13,7 @@ workspace = true
ahash.workspace = true
api.workspace = true
arrow.workspace = true
arrow-schema.workspace = true
async-trait.workspace = true
catalog.workspace = true
chrono.workspace = true

View File

@@ -800,6 +800,20 @@ pub enum Error {
#[snafu(implicit)]
location: Location,
},
#[snafu(transparent)]
GreptimeProto {
source: api::error::Error,
#[snafu(implicit)]
location: Location,
},
#[snafu(transparent)]
Datatypes {
source: datatypes::error::Error,
#[snafu(implicit)]
location: Location,
},
}
pub type Result<T> = std::result::Result<T, Error>;
@@ -920,6 +934,9 @@ impl ErrorExt for Error {
FloatIsNan { .. }
| InvalidEpochForResolution { .. }
| UnsupportedTypeInPipeline { .. } => StatusCode::InvalidArguments,
GreptimeProto { source, .. } => source.status_code(),
Datatypes { source, .. } => source.status_code(),
}
}

View File

@@ -19,13 +19,17 @@ use std::collections::{BTreeMap, HashSet};
use std::sync::Arc;
use ahash::{HashMap, HashMapExt};
use api::helper::proto_value_type;
use api::v1::column_data_type_extension::TypeExt;
use api::helper::{ColumnDataTypeWrapper, encode_json_value};
use api::v1::column_def::{collect_column_options, options_from_column_schema};
use api::v1::value::ValueData;
use api::v1::{ColumnDataType, ColumnDataTypeExtension, JsonTypeExtension, SemanticType};
use api::v1::{ColumnDataType, SemanticType};
use arrow_schema::extension::ExtensionType;
use coerce::{coerce_columns, coerce_value};
use common_query::prelude::{greptime_timestamp, greptime_value};
use common_telemetry::warn;
use datatypes::data_type::ConcreteDataType;
use datatypes::extension::json::JsonExtensionType;
use datatypes::value::Value;
use greptime_proto::v1::{ColumnSchema, Row, Rows, Value as GreptimeValue};
use itertools::Itertools;
use jsonb::Number;
@@ -33,12 +37,15 @@ use once_cell::sync::OnceCell;
use serde_json as serde_json_crate;
use session::context::Channel;
use snafu::OptionExt;
use table::Table;
use vrl::prelude::{Bytes, VrlValueConvert};
use vrl::value::value::StdError;
use vrl::value::{KeyString, Value as VrlValue};
use crate::error::{
ArrayElementMustBeObjectSnafu, IdentifyPipelineColumnTypeMismatchSnafu, InvalidTimestampSnafu,
Result, TimeIndexMustBeNonNullSnafu, TransformColumnNameMustBeUniqueSnafu,
ArrayElementMustBeObjectSnafu, CoerceIncompatibleTypesSnafu,
IdentifyPipelineColumnTypeMismatchSnafu, InvalidTimestampSnafu, Result,
TimeIndexMustBeNonNullSnafu, TransformColumnNameMustBeUniqueSnafu,
TransformMultipleTimestampIndexSnafu, TransformTimestampIndexCountSnafu, ValueMustBeMapSnafu,
};
use crate::etl::PipelineDocVersion;
@@ -272,15 +279,75 @@ impl GreptimeTransformer {
}
}
/// A column's schema together with its semantic role in the table.
///
/// `Debug` is derived so values can be printed in logs and test failures.
#[derive(Clone, Debug)]
pub struct ColumnMetadata {
    /// Name, data type, nullability and metadata of the column.
    column_schema: datatypes::schema::ColumnSchema,
    /// Whether the column is a tag, a field or the timestamp (time index).
    semantic_type: SemanticType,
}
impl From<ColumnSchema> for ColumnMetadata {
fn from(value: ColumnSchema) -> Self {
let datatype = value.datatype();
let semantic_type = value.semantic_type();
let ColumnSchema {
column_name,
datatype: _,
semantic_type: _,
datatype_extension,
options,
} = value;
let column_schema = datatypes::schema::ColumnSchema::new(
column_name,
ColumnDataTypeWrapper::new(datatype, datatype_extension).into(),
semantic_type != SemanticType::Timestamp,
);
let metadata = collect_column_options(options.as_ref());
let column_schema = column_schema.with_metadata(metadata);
Self {
column_schema,
semantic_type,
}
}
}
impl TryFrom<ColumnMetadata> for ColumnSchema {
    type Error = api::error::Error;

    /// Converts the internal column metadata back into a protobuf column schema.
    ///
    /// Fails if the column's data type cannot be wrapped into a
    /// `ColumnDataTypeWrapper`.
    fn try_from(value: ColumnMetadata) -> std::result::Result<Self, Self::Error> {
        let ColumnMetadata {
            column_schema,
            semantic_type,
        } = value;

        // Options are derived from the schema while it is still intact; its fields
        // are moved out below.
        let options = options_from_column_schema(&column_schema);
        let wrapper = ColumnDataTypeWrapper::try_from(column_schema.data_type)?;
        let (datatype, datatype_extension) = wrapper.into_parts();

        Ok(ColumnSchema {
            column_name: column_schema.name,
            datatype: datatype as _,
            semantic_type: semantic_type as _,
            datatype_extension,
            options,
        })
    }
}
/// This is used to record the current state schema information and a sequential cache of field names.
/// As you traverse the user input JSON, this will change.
/// It will record a superset of all user input schemas.
#[derive(Debug, Default)]
#[derive(Default)]
pub struct SchemaInfo {
/// schema info
pub schema: Vec<ColumnSchema>,
pub schema: Vec<ColumnMetadata>,
/// index of the column name
pub index: HashMap<String, usize>,
/// The pipeline's corresponding table (if already created). Useful to retrieve column schemas.
table: Option<Arc<Table>>,
}
impl SchemaInfo {
@@ -288,6 +355,7 @@ impl SchemaInfo {
Self {
schema: Vec::with_capacity(capacity),
index: HashMap::with_capacity(capacity),
table: None,
}
}
@@ -297,46 +365,88 @@ impl SchemaInfo {
index.insert(schema.column_name.clone(), i);
}
Self {
schema: schema_list,
schema: schema_list.into_iter().map(Into::into).collect(),
index,
table: None,
}
}
/// Sets the table associated with this schema info, or clears it when `table` is `None`.
///
/// The stored table is what `find_column_schema_in_table` consults when looking up
/// a column by name.
pub fn set_table(&mut self, table: Option<Arc<Table>>) {
self.table = table;
}
/// Looks up `column_name` in the associated table, if any.
///
/// Returns `None` when no table is set or the table has no such column. Otherwise
/// returns the table's column schema with a semantic type derived from the table:
/// `Timestamp` for the time index, `Tag` for a primary key column, `Field` for
/// everything else.
fn find_column_schema_in_table(&self, column_name: &str) -> Option<ColumnMetadata> {
    let table = self.table.as_ref()?;
    let position = table.schema_ref().column_index_by_name(column_name)?;
    let column_schema = table.schema_ref().column_schemas()[position].clone();

    let semantic_type = if column_schema.is_time_index() {
        SemanticType::Timestamp
    } else if table
        .table_info()
        .meta
        .primary_key_indices
        .contains(&position)
    {
        SemanticType::Tag
    } else {
        SemanticType::Field
    };

    Some(ColumnMetadata {
        column_schema,
        semantic_type,
    })
}
pub fn column_schemas(&self) -> api::error::Result<Vec<ColumnSchema>> {
self.schema
.iter()
.map(|x| x.clone().try_into())
.collect::<api::error::Result<Vec<_>>>()
}
}
/// Reconciles one incoming column with the schema collected so far.
///
/// When `index` is `Some`, the column is already present in
/// `schema_info.schema` and the type recorded there is compared with the
/// incoming type; a mismatch fails with `IdentifyPipelineColumnTypeMismatch`.
/// When `index` is `None`, a new column is appended to `schema_info.schema`
/// and its position is registered in `schema_info.index`.
///
/// NOTE(review): the lines below look like two revisions of this function
/// interleaved (two parameter sets, two `let key` bindings, two ways of
/// building the mismatch error) — confirm against the real file before
/// relying on any single statement.
fn resolve_schema(
index: Option<usize>,
value_data: ValueData,
column_schema: ColumnSchema,
row: &mut Vec<GreptimeValue>,
pipeline_context: &PipelineContext,
column: &str,
value_type: &ConcreteDataType,
schema_info: &mut SchemaInfo,
) -> Result<()> {
if let Some(index) = index {
let api_value = GreptimeValue {
value_data: Some(value_data),
};
// Safety unwrap is fine here because api_value is always valid
let value_column_data_type = proto_value_type(&api_value).unwrap();
// Safety unwrap is fine here because index is always valid
let schema_column_data_type = schema_info.schema.get(index).unwrap().datatype();
if value_column_data_type != schema_column_data_type {
IdentifyPipelineColumnTypeMismatchSnafu {
column: column_schema.column_name,
expected: schema_column_data_type.as_str_name(),
actual: value_column_data_type.as_str_name(),
let column_type = &mut schema_info.schema[index].column_schema.data_type;
match (column_type, value_type) {
// Two JSON types are accepted when the column's type includes the value's type.
(ConcreteDataType::Json(column_type), ConcreteDataType::Json(value_type))
if column_type.is_include(value_type) =>
{
Ok(())
}
.fail()
} else {
row[index] = api_value;
Ok(())
// Any other pairing must be exactly equal, otherwise it is reported as a mismatch.
(column_type, value_type) if column_type == value_type => Ok(()),
(column_type, value_type) => IdentifyPipelineColumnTypeMismatchSnafu {
column,
expected: column_type.to_string(),
actual: value_type.to_string(),
}
.fail(),
}
} else {
let key = column_schema.column_name.clone();
// Prefer the column definition found via `find_column_schema_in_table`;
// otherwise build one from the incoming value type.
let column_schema = schema_info
.find_column_schema_in_table(column)
.unwrap_or_else(|| {
let semantic_type = decide_semantic(pipeline_context, column);
let column_schema = datatypes::schema::ColumnSchema::new(
column,
value_type.clone(),
semantic_type != SemanticType::Timestamp,
);
ColumnMetadata {
column_schema,
semantic_type,
}
});
let key = column.to_string();
schema_info.schema.push(column_schema);
// The new column sits at the end, so its index is the last position.
schema_info.index.insert(key, schema_info.schema.len() - 1);
let api_value = GreptimeValue {
value_data: Some(value_data),
};
row.push(api_value);
Ok(())
}
}
@@ -481,11 +591,11 @@ pub(crate) fn values_to_row(
Ok(Row { values: row })
}
/// Picks the semantic type for a column: `Tag` when the channel is
/// `Channel::Prometheus` and the column is not the `greptime_value()` column,
/// `Field` in every other case.
///
/// NOTE(review): both an `i32` and a `SemanticType` form of the signature and
/// return values appear below — this looks like two revisions interleaved;
/// confirm which one is current.
fn decide_semantic(p_ctx: &PipelineContext, column_name: &str) -> i32 {
fn decide_semantic(p_ctx: &PipelineContext, column_name: &str) -> SemanticType {
if p_ctx.channel == Channel::Prometheus && column_name != greptime_value() {
SemanticType::Tag as i32
SemanticType::Tag
} else {
SemanticType::Field as i32
SemanticType::Field
}
}
@@ -497,55 +607,56 @@ fn resolve_value(
p_ctx: &PipelineContext,
) -> Result<()> {
let index = schema_info.index.get(&column_name).copied();
let mut resolve_simple_type =
|value_data: ValueData, column_name: String, data_type: ColumnDataType| {
let semantic_type = decide_semantic(p_ctx, &column_name);
resolve_schema(
index,
value_data,
ColumnSchema {
column_name,
datatype: data_type as i32,
semantic_type,
datatype_extension: None,
options: None,
},
row,
schema_info,
)
};
match value {
VrlValue::Null => {}
let value_data = match value {
VrlValue::Null => None,
VrlValue::Integer(v) => {
// safe unwrap after type matched
resolve_simple_type(ValueData::I64Value(v), column_name, ColumnDataType::Int64)?;
resolve_schema(
index,
p_ctx,
&column_name,
&ConcreteDataType::int64_datatype(),
schema_info,
)?;
Some(ValueData::I64Value(v))
}
VrlValue::Float(v) => {
// safe unwrap after type matched
resolve_simple_type(
ValueData::F64Value(v.into()),
column_name,
ColumnDataType::Float64,
resolve_schema(
index,
p_ctx,
&column_name,
&ConcreteDataType::float64_datatype(),
schema_info,
)?;
Some(ValueData::F64Value(v.into()))
}
VrlValue::Boolean(v) => {
resolve_simple_type(
ValueData::BoolValue(v),
column_name,
ColumnDataType::Boolean,
resolve_schema(
index,
p_ctx,
&column_name,
&ConcreteDataType::boolean_datatype(),
schema_info,
)?;
Some(ValueData::BoolValue(v))
}
VrlValue::Bytes(v) => {
resolve_simple_type(
ValueData::StringValue(String::from_utf8_lossy_owned(v.to_vec())),
column_name,
ColumnDataType::String,
resolve_schema(
index,
p_ctx,
&column_name,
&ConcreteDataType::string_datatype(),
schema_info,
)?;
Some(ValueData::StringValue(String::from_utf8_lossy_owned(
v.to_vec(),
)))
}
VrlValue::Regex(v) => {
@@ -553,42 +664,83 @@ fn resolve_value(
"Persisting regex value in the table, this should not happen, column_name: {}",
column_name
);
resolve_simple_type(
ValueData::StringValue(v.to_string()),
column_name,
ColumnDataType::String,
resolve_schema(
index,
p_ctx,
&column_name,
&ConcreteDataType::string_datatype(),
schema_info,
)?;
Some(ValueData::StringValue(v.to_string()))
}
VrlValue::Timestamp(ts) => {
let ns = ts.timestamp_nanos_opt().context(InvalidTimestampSnafu {
input: ts.to_rfc3339(),
})?;
resolve_simple_type(
ValueData::TimestampNanosecondValue(ns),
column_name,
ColumnDataType::TimestampNanosecond,
resolve_schema(
index,
p_ctx,
&column_name,
&ConcreteDataType::timestamp_nanosecond_datatype(),
schema_info,
)?;
Some(ValueData::TimestampNanosecondValue(ns))
}
VrlValue::Array(_) | VrlValue::Object(_) => {
let data = vrl_value_to_jsonb_value(&value);
resolve_schema(
index,
ValueData::BinaryValue(data.to_vec()),
ColumnSchema {
column_name,
datatype: ColumnDataType::Binary as i32,
semantic_type: SemanticType::Field as i32,
datatype_extension: Some(ColumnDataTypeExtension {
type_ext: Some(TypeExt::JsonType(JsonTypeExtension::JsonBinary.into())),
}),
options: None,
},
row,
schema_info,
)?;
let is_json_native_type = schema_info
.find_column_schema_in_table(&column_name)
.is_some_and(|x| {
if let ConcreteDataType::Json(column_type) = &x.column_schema.data_type {
column_type.is_native_type()
} else {
false
}
});
let value = if is_json_native_type {
let json_extension_type: Option<JsonExtensionType> =
if let Some(x) = schema_info.find_column_schema_in_table(&column_name) {
x.column_schema.extension_type()?
} else {
None
};
let settings = json_extension_type
.and_then(|x| x.metadata().json_structure_settings.clone())
.unwrap_or_default();
let value: serde_json::Value = value.try_into().map_err(|e: StdError| {
CoerceIncompatibleTypesSnafu { msg: e.to_string() }.build()
})?;
let value = settings.encode(value)?;
resolve_schema(index, p_ctx, &column_name, &value.data_type(), schema_info)?;
let Value::Json(value) = value else {
unreachable!()
};
ValueData::JsonValue(encode_json_value(*value))
} else {
resolve_schema(
index,
p_ctx,
&column_name,
&ConcreteDataType::binary_datatype(),
schema_info,
)?;
let value = vrl_value_to_jsonb_value(&value);
ValueData::BinaryValue(value.to_vec())
};
Some(value)
}
};
let value = GreptimeValue { value_data };
if let Some(index) = index {
row[index] = value;
} else {
row.push(value);
}
Ok(())
}
@@ -626,20 +778,24 @@ fn identity_pipeline_inner(
let custom_ts = pipeline_ctx.pipeline_definition.get_custom_ts();
// set time index column schema first
schema_info.schema.push(ColumnSchema {
column_name: custom_ts
let column_schema = datatypes::schema::ColumnSchema::new(
custom_ts
.map(|ts| ts.get_column_name().to_string())
.unwrap_or_else(|| greptime_timestamp().to_string()),
datatype: custom_ts.map(|c| c.get_datatype()).unwrap_or_else(|| {
if pipeline_ctx.channel == Channel::Prometheus {
ColumnDataType::TimestampMillisecond
} else {
ColumnDataType::TimestampNanosecond
}
}) as i32,
semantic_type: SemanticType::Timestamp as i32,
datatype_extension: None,
options: None,
custom_ts
.map(|c| ConcreteDataType::from(ColumnDataTypeWrapper::new(c.get_datatype(), None)))
.unwrap_or_else(|| {
if pipeline_ctx.channel == Channel::Prometheus {
ConcreteDataType::timestamp_millisecond_datatype()
} else {
ConcreteDataType::timestamp_nanosecond_datatype()
}
}),
false,
);
schema_info.schema.push(ColumnMetadata {
column_schema,
semantic_type: SemanticType::Timestamp,
});
let mut opt_map = HashMap::new();
@@ -697,28 +853,29 @@ pub fn identity_pipeline(
input.push(result);
}
identity_pipeline_inner(input, pipeline_ctx).map(|(mut schema, opt_map)| {
identity_pipeline_inner(input, pipeline_ctx).and_then(|(mut schema, opt_map)| {
if let Some(table) = table {
let table_info = table.table_info();
for tag_name in table_info.meta.row_key_column_names() {
if let Some(index) = schema.index.get(tag_name) {
schema.schema[*index].semantic_type = SemanticType::Tag as i32;
schema.schema[*index].semantic_type = SemanticType::Tag;
}
}
}
opt_map
let column_schemas = schema.column_schemas()?;
Ok(opt_map
.into_iter()
.map(|(opt, rows)| {
(
opt,
Rows {
schema: schema.schema.clone(),
schema: column_schemas.clone(),
rows,
},
)
})
.collect::<HashMap<ContextOpt, Rows>>()
.collect::<HashMap<ContextOpt, Rows>>())
})
}
@@ -850,7 +1007,7 @@ mod tests {
assert!(rows.is_err());
assert_eq!(
rows.err().unwrap().to_string(),
"Column datatype mismatch. For column: score, expected datatype: FLOAT64, actual datatype: STRING".to_string(),
"Column datatype mismatch. For column: score, expected datatype: Float64, actual datatype: Binary".to_string(),
);
}
{
@@ -879,7 +1036,7 @@ mod tests {
assert!(rows.is_err());
assert_eq!(
rows.err().unwrap().to_string(),
"Column datatype mismatch. For column: score, expected datatype: FLOAT64, actual datatype: INT64".to_string(),
"Column datatype mismatch. For column: score, expected datatype: Float64, actual datatype: Int64".to_string(),
);
}
{
@@ -942,7 +1099,7 @@ mod tests {
.map(|(mut schema, mut rows)| {
for name in tag_column_names {
if let Some(index) = schema.index.get(&name) {
schema.schema[*index].semantic_type = SemanticType::Tag as i32;
schema.schema[*index].semantic_type = SemanticType::Tag;
}
}
@@ -950,7 +1107,7 @@ mod tests {
let rows = rows.remove(&ContextOpt::default()).unwrap();
Rows {
schema: schema.schema,
schema: schema.column_schemas().unwrap(),
rows,
}
});

View File

@@ -61,7 +61,7 @@ pub fn parse_and_exec(input_str: &str, pipeline_yaml: &str) -> Rows {
}
Rows {
schema: schema_info.schema.clone(),
schema: schema_info.column_schemas().unwrap(),
rows,
}
}

View File

@@ -52,7 +52,7 @@ transform:
// check schema
assert_eq!(output.schema[0].column_name, "commit");
let type_id: i32 = ColumnDataType::Binary.into();
let type_id: i32 = ColumnDataType::Json.into();
assert_eq!(output.schema[0].datatype, type_id);
// check value
@@ -91,7 +91,7 @@ transform:
// check schema
assert_eq!(output.schema[0].column_name, "commit_json");
let type_id: i32 = ColumnDataType::Binary.into();
let type_id: i32 = ColumnDataType::Json.into();
assert_eq!(output.schema[0].datatype, type_id);
// check value
@@ -160,7 +160,7 @@ transform:
// check schema
assert_eq!(output.schema[0].column_name, "commit");
let type_id: i32 = ColumnDataType::Binary.into();
let type_id: i32 = ColumnDataType::Json.into();
assert_eq!(output.schema[0].datatype, type_id);
// check value

View File

@@ -664,6 +664,13 @@ pub enum Error {
#[snafu(implicit)]
location: Location,
},
#[snafu(transparent)]
GreptimeProto {
source: api::error::Error,
#[snafu(implicit)]
location: Location,
},
}
pub type Result<T, E = Error> = std::result::Result<T, E>;
@@ -794,6 +801,8 @@ impl ErrorExt for Error {
Suspended { .. } => StatusCode::Suspended,
MemoryLimitExceeded { .. } => StatusCode::RateLimited,
GreptimeProto { source, .. } => source.status_code(),
}
}

View File

@@ -23,6 +23,7 @@ pub mod memory_limit;
pub mod prom_query_gateway;
pub mod region_server;
use std::any::Any;
use std::net::SocketAddr;
use std::time::Duration;
@@ -399,4 +400,8 @@ impl Server for GrpcServer {
fn bind_addr(&self) -> Option<SocketAddr> {
self.bind_addr
}
/// Returns this server as a `&dyn Any` reference.
fn as_any(&self) -> &dyn Any {
self
}
}

View File

@@ -1285,6 +1285,10 @@ impl Server for HttpServer {
fn bind_addr(&self) -> Option<SocketAddr> {
self.bind_addr
}
/// Returns this server as a `&dyn Any` reference.
fn as_any(&self) -> &dyn std::any::Any {
self
}
}
#[cfg(test)]

View File

@@ -31,7 +31,7 @@ use axum_extra::TypedHeader;
use common_catalog::consts::default_engine;
use common_error::ext::{BoxedError, ErrorExt};
use common_query::{Output, OutputData};
use common_telemetry::{debug, error, warn};
use common_telemetry::{error, warn};
use headers::ContentType;
use lazy_static::lazy_static;
use mime_guess::mime;
@@ -738,11 +738,6 @@ pub async fn log_ingester(
let value = extract_pipeline_value_by_content_type(content_type, payload, ignore_errors)?;
debug!(
"receiving logs: {:?}",
serde_json::to_string(&value).unwrap()
);
query_ctx.set_channel(Channel::Log);
let query_ctx = Arc::new(query_ctx);

View File

@@ -152,7 +152,7 @@ pub async fn loki_ingest(
rows.push(row);
}
let schemas = schema_info.schema;
let schemas = schema_info.column_schemas()?;
// fill Null for missing values
for row in rows.iter_mut() {
row.resize(schemas.len(), GreptimeValue::default());
@@ -746,13 +746,16 @@ fn process_labels(
} else {
// not exist
// add schema and append to values
schemas.push(ColumnSchema {
column_name: k.clone(),
datatype: ColumnDataType::String.into(),
semantic_type: SemanticType::Tag.into(),
datatype_extension: None,
options: None,
});
schemas.push(
ColumnSchema {
column_name: k.clone(),
datatype: ColumnDataType::String.into(),
semantic_type: SemanticType::Tag.into(),
datatype_extension: None,
options: None,
}
.into(),
);
column_indexer.insert(k, schemas.len() - 1);
row.push(GreptimeValue {

View File

@@ -12,6 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
use std::any::Any;
use std::future::Future;
use std::net::SocketAddr;
use std::sync::Arc;
@@ -265,4 +266,8 @@ impl Server for MysqlServer {
fn bind_addr(&self) -> Option<SocketAddr> {
self.bind_addr
}
/// Returns this server as a `&dyn Any` reference.
fn as_any(&self) -> &dyn Any {
self
}
}

View File

@@ -381,6 +381,7 @@ fn extract_field_from_attr_and_combine_schema(
if let Some(index) = select_schema.index.get(key) {
let column_schema = &select_schema.schema[*index];
let column_schema: ColumnSchema = column_schema.clone().try_into()?;
// datatype of the same column name should be the same
ensure!(
column_schema.datatype == schema.datatype,
@@ -393,7 +394,7 @@ fn extract_field_from_attr_and_combine_schema(
);
extracted_values[*index] = value;
} else {
select_schema.schema.push(schema);
select_schema.schema.push(schema.into());
select_schema
.index
.insert(key.clone(), select_schema.schema.len() - 1);
@@ -480,7 +481,7 @@ fn parse_export_logs_service_request_to_rows(
let mut parse_ctx = ParseContext::new(select_info);
let mut rows = parse_resource(&mut parse_ctx, request.resource_logs)?;
schemas.extend(parse_ctx.select_schema.schema);
schemas.extend(parse_ctx.select_schema.column_schemas()?);
rows.iter_mut().for_each(|row| {
row.values.resize(schemas.len(), GreptimeValue::default());

View File

@@ -135,12 +135,18 @@ async fn run_custom_pipeline(
let mut schema_info = SchemaInfo::default();
schema_info
.schema
.push(time_index_column_schema(ts_name, timeunit));
.push(time_index_column_schema(ts_name, timeunit).into());
schema_info
}
};
let table = handler
.get_table(&table_name, query_ctx)
.await
.context(CatalogSnafu)?;
schema_info.set_table(table);
for pipeline_map in pipeline_maps {
let result = pipeline
.exec_mut(pipeline_map, pipeline_ctx, &mut schema_info)
@@ -194,7 +200,7 @@ async fn run_custom_pipeline(
RowInsertRequest {
rows: Some(Rows {
rows,
schema: schema_info.schema.clone(),
schema: schema_info.column_schemas()?,
}),
table_name: table_name.clone(),
},

View File

@@ -12,6 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
use std::any::Any;
use std::future::Future;
use std::net::SocketAddr;
use std::sync::Arc;
@@ -144,4 +145,8 @@ impl Server for PostgresServer {
fn bind_addr(&self) -> Option<SocketAddr> {
self.bind_addr
}
/// Returns this server as a `&dyn Any` reference.
fn as_any(&self) -> &dyn Any {
self
}
}

View File

@@ -12,6 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
use std::any::Any;
use std::collections::HashMap;
use std::fmt::{Debug, Formatter};
use std::net::SocketAddr;
@@ -147,6 +148,8 @@ pub trait Server: Send + Sync {
fn bind_addr(&self) -> Option<SocketAddr> {
None
}
/// Returns the implementor as a `&dyn Any` reference.
///
/// NOTE(review): presumably provided so callers holding a `dyn Server` can
/// downcast to the concrete server type — confirm against the call sites.
fn as_any(&self) -> &dyn Any;
}
struct AcceptTask {

View File

@@ -29,7 +29,7 @@ use common_error::status_code::StatusCode;
use common_macro::stack_trace_debug;
use datatypes::arrow;
use datatypes::arrow::datatypes::FieldRef;
use datatypes::schema::{ColumnSchema, FulltextOptions, Schema, SchemaRef};
use datatypes::schema::{ColumnSchema, FulltextOptions, Schema, SchemaRef, VectorIndexOptions};
use datatypes::types::TimestampType;
use itertools::Itertools;
use serde::de::Error;
@@ -384,6 +384,22 @@ impl RegionMetadata {
inverted_index
}
/// Collects the vector index options of every column that has them.
///
/// The returned map is keyed by column ID. A column is left out when it has
/// no vector index options or when reading its options returns an error.
pub fn vector_indexed_column_ids(&self) -> HashMap<ColumnId, VectorIndexOptions> {
    let mut indexed = HashMap::new();
    for column in self.column_metadatas.iter() {
        // Lookup failures are deliberately treated the same as "no index".
        if let Ok(Some(options)) = column.column_schema.vector_index_options() {
            indexed.insert(column.column_id, options);
        }
    }
    indexed
}
/// Checks whether the metadata is valid.
fn validate(&self) -> Result<()> {
// Id to name.

View File

@@ -759,8 +759,11 @@ pub struct RemapManifestsRequest {
/// Response to remap manifests from old regions to new regions.
#[derive(Debug, Clone)]
pub struct RemapManifestsResponse {
/// The new manifests for the new regions.
pub new_manifests: HashMap<RegionId, String>,
/// Maps region id to its staging manifest path.
///
/// These paths are relative paths within the central region's staging blob storage,
/// and should be passed to [`ApplyStagingManifestRequest`](RegionRequest::ApplyStagingManifest) to finalize the repartition.
pub manifest_paths: HashMap<RegionId, String>,
}
/// Request to copy files from a source region to a target region.

View File

@@ -421,20 +421,17 @@ fn make_region_apply_staging_manifest(
api::v1::region::ApplyStagingManifestRequest {
region_id,
partition_expr,
files_to_add,
central_region_id,
manifest_path,
}: api::v1::region::ApplyStagingManifestRequest,
) -> Result<Vec<(RegionId, RegionRequest)>> {
let region_id = region_id.into();
let files_to_add = files_to_add
.context(UnexpectedSnafu {
reason: "'files_to_add' field is missing",
})?
.data;
Ok(vec![(
region_id,
RegionRequest::ApplyStagingManifest(ApplyStagingManifestRequest {
partition_expr,
files_to_add,
central_region_id: central_region_id.into(),
manifest_path,
}),
)])
}
@@ -1464,8 +1461,10 @@ pub struct EnterStagingRequest {
/// In practice, this means:
/// - The `partition_expr` identifies the staging region rule that the manifest
/// was generated for.
/// - `files_to_add` carries the serialized metadata (such as file manifests or
/// file lists) that should be attached to the region under the new rule.
/// - `central_region_id` specifies which region holds the staging blob storage
/// where the manifest was written during the `remap_manifests` operation.
/// - `manifest_path` is the relative path within the central region's staging
/// blob storage to fetch the generated manifest.
///
/// It should typically be called **after** the staging region has been
/// initialized by [`EnterStagingRequest`] and the new file layout has been
@@ -1474,8 +1473,11 @@ pub struct EnterStagingRequest {
pub struct ApplyStagingManifestRequest {
/// The partition expression of the staging region.
pub partition_expr: String,
/// The files to add to the region.
pub files_to_add: Vec<u8>,
/// The region that stores the staging manifests in its staging blob storage.
pub central_region_id: RegionId,
/// The relative path to the staging manifest within the central region's
/// staging blob storage.
pub manifest_path: String,
}
impl fmt::Display for RegionRequest {

View File

@@ -1,14 +1,14 @@
+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-------------------------+
| data | ts |
+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-------------------------+
| {_raw: {"commit":{"cid":"bafyreidblutgvj75o4q4akzyyejedjj6l3it6hgqwee6jpwv2wqph5fsgm","record":{"$type":"app.bsky.feed.post","createdAt":"2024-11-21T16:09:27.095Z","langs":["en"],"reply":{"parent":{"cid":"bafyreibfglofvqou2yiqvwzk4rcgkhhxrbunyemshdjledgwymimqkg24e","uri":"at://did:plc:6tr6tuzlx2db3rduzr2d6r24/app.bsky.feed.post/3lbhqo2rtys2z"},"root":{"cid":"bafyreibfglofvqou2yiqvwzk4rcgkhhxrbunyemshdjledgwymimqkg24e","uri":"at://did:plc:6tr6tuzlx2db3rduzr2d6r24/app.bsky.feed.post/3lbhqo2rtys2z"}},"text":"aaaaah. LIght shines in a corner of WTF...."},"rev":"3lbhtytnn2k2f","rkey":"3lbhtyteurk2y"}}, commit.collection: app.bsky.feed.post, commit.operation: create, did: did:plc:yj3sjq3blzpynh27cumnp5ks, kind: commit, time_us: 1732206349000167} | 1970-01-01T00:00:00.001 |
| {_raw: {"commit":{"cid":"bafyreia5l5vrkh5oj4cjyhcqby2dprhyvcyofo2q5562tijlae2pzih23m","record":{"$type":"app.bsky.feed.like","createdAt":"2024-11-21T16:25:46.221Z","subject":{"cid":"bafyreidjvrcmckkm765mct5fph36x7kupkfo35rjklbf2k76xkzwyiauge","uri":"at://did:plc:azrv4rcbws6kmcga4fsbphg2/app.bsky.feed.post/3lbgjdpbiec2l"}},"rev":"3lbhuvzds6d2a","rkey":"3lbhuvzdked2a"}}, commit.collection: app.bsky.feed.like, commit.operation: create, did: did:plc:3i4xf2v4wcnyktgv6satke64, kind: commit, time_us: 1732206349000644} | 1970-01-01T00:00:00.002 |
| {_raw: {"commit":{"cid":"bafyreiew2p6cgirfaj45qoenm4fgumib7xoloclrap3jgkz5es7g7kby3i","record":{"$type":"app.bsky.graph.follow","createdAt":"2024-11-21T16:27:40.923Z","subject":"did:plc:r7cdh4sgzqbfdc6wcdxxti7c"},"rev":"3lbhuvze3gi2u","rkey":"3lbhuvzdtmi2u"}}, commit.collection: app.bsky.graph.follow, commit.operation: create, did: did:plc:gccfnqqizz4urhchsaie6jft, kind: commit, time_us: 1732206349001108} | 1970-01-01T00:00:00.003 |
| {_raw: {"commit":{"cid":"bafyreidjix4dauj2afjlbzmhj3a7gwftcevvmmy6edww6vrjdbst26rkby","record":{"$type":"app.bsky.feed.like","createdAt":"2024-11-21T16:15:58.232Z","subject":{"cid":"bafyreiavpshyqzrlo5m7fqodjhs6jevweqnif4phasiwimv4a7mnsqi2fe","uri":"at://did:plc:fusulxqc52zbrc75fi6xrcof/app.bsky.feed.post/3lbhskq5zn22f"}},"rev":"3lbhueija5p22","rkey":"3lbhueiizcx22"}}, commit.collection: app.bsky.feed.like, commit.operation: create, did: did:plc:msxqf3twq7abtdw7dbfskphk, kind: commit, time_us: 1732206349001372} | 1970-01-01T00:00:00.004 |
| {_raw: {"commit":{"cid":"bafyreigzdsdne3z2xxcakgisieyj7y47hj6eg7lj6v4q25ah5q2qotu5ku","record":{"$type":"app.bsky.feed.post","createdAt":"2024-11-21T16:09:27.254Z","langs":["en"],"reply":{"parent":{"cid":"bafyreih35fe2jj3gchmgk4amold4l6sfxd2sby5wrg3jrws5fkdypxrbg4","uri":"at://did:plc:6wx2gg5yqgvmlu35r6y3bk6d/app.bsky.feed.post/3lbhtj2eb4s2o"},"root":{"cid":"bafyreifipyt3vctd4ptuoicvio7rbr5xvjv4afwuggnd2prnmn55mu6luu","uri":"at://did:plc:474ldquxwzrlcvjhhbbk2wte/app.bsky.feed.post/3lbhdzrynik27"}},"text":"okay i take mine back because I hadnt heard this one yet^^"},"rev":"3lbhtytohxc2o","rkey":"3lbhtytjqzk2q"}}, commit.collection: app.bsky.feed.post, commit.operation: create, did: did:plc:l5o3qjrmfztir54cpwlv2eme, kind: commit, time_us: 1732206349001905} | 1970-01-01T00:00:00.005 |
| {_raw: {"commit":{"cid":"bafyreiaa2vsdr4ckwjg4jq47zfd7mewidywfz3qh3dmglcd6ozi4xwdega","record":{"$type":"app.bsky.graph.follow","createdAt":"2024-11-21T16:15:21.495Z","subject":"did:plc:amsdn2tbjxo3xrwqneqhh4cm"},"rev":"3lbhudfo3yi2w","rkey":"3lbhudfnw4y2w"}}, commit.collection: app.bsky.graph.follow, commit.operation: create, did: did:plc:jkaaf5j2yb2pvpx3ualm3vbh, kind: commit, time_us: 1732206349002758} | 1970-01-01T00:00:00.006 |
| {_raw: {"commit":{"cid":"bafyreihaatlpar3abtx6ck3kde2ksic6zzflk4ppduhf6dxurytqrv33ni","record":{"$type":"app.bsky.graph.follow","createdAt":"2024-11-21T16:18:39.913Z","subject":"did:plc:gf3vum7insztt5rxrpxdz2id"},"rev":"3lbhujcp4ix2n","rkey":"3lbhujcoxmp2n"}}, commit.collection: app.bsky.graph.follow, commit.operation: create, did: did:plc:tdwz2h4id5dxezvohftsmffu, kind: commit, time_us: 1732206349003106} | 1970-01-01T00:00:00.007 |
| {_raw: {"commit":{"cid":"bafyreid5ycocp5zq2g7fcx2xxzxrbafuh7b5qhtwuwiomzo6vqila2cbpu","record":{"$type":"app.bsky.feed.repost","createdAt":"2024-11-21T16:23:36.714Z","subject":{"cid":"bafyreieaacfiobnuqvjhhsndyi5s3fd6krbzdduxsyrzfv43kczpcmkl6y","uri":"at://did:plc:o5q6dynpme4ndolc3heztasm/app.bsky.feed.post/3lbfli3qsoc2o"}},"rev":"3lbhus5vior2t","rkey":"3lbhus5vbtz2t"}}, commit.collection: app.bsky.feed.repost, commit.operation: create, did: did:plc:cdsd346mwow7aj3tgfkwsct3, kind: commit, time_us: 1732206349003461} | 1970-01-01T00:00:00.008 |
| {_raw: {"commit":{"cid":"bafyreibugobcike72y4zxvdyz2oopyt6ywwqfielcwojkb27p7s6rlomgm","record":{"$type":"app.bsky.feed.post","createdAt":"2024-11-21T16:25:44.376Z","langs":["en"],"reply":{"parent":{"cid":"bafyreiaev27cfcxxvn2pdhrwwquzwgclujnulzbcfnn4p4fwgb6migjhw4","uri":"at://did:plc:zec6cslvgc3hhdatrhk6pq5p/app.bsky.feed.post/3lbhujvds4c2b"},"root":{"cid":"bafyreif7qjxhvecwnhlynijj6pf47jwvtkahsz3zh2kaipwu2bw2dxwaqq","uri":"at://did:plc:s4bwqchfzm6gjqfeb6mexgbu/app.bsky.feed.post/3lbhug53kkk2m"}},"text":"\n⌜ Blinking. She hadn't realized she spoke out loud. ⌟\n\n It was nothing like that — . I was only thinking . . . \n\n⌜ Trailing off, her mind occupied. ⌟\n"},"rev":"3lbhuvzeccx2w","rkey":"3lbhuvxf4qs2m"}}, commit.collection: app.bsky.feed.post, commit.operation: create, did: did:plc:s4bwqchfzm6gjqfeb6mexgbu, kind: commit, time_us: 1732206349003907} | 1970-01-01T00:00:00.009 |
| {_raw: {"commit":{"cid":"bafyreidjk2svg2fdjiiwohmfmvp3hdxhpb33ycnixzbkyib5m6cocindxq","record":{"$type":"app.bsky.feed.like","createdAt":"2024-11-21T16:25:46.167Z","subject":{"cid":"bafyreiaumopip75nzx2xjbugtwemdppsyx54bd2odf6q45f3o7xkocgari","uri":"at://did:plc:ig2jv6gqup4t7gdq2pmanknw/app.bsky.feed.post/3lbhuvtlaec2c"}},"rev":"3lbhuvzedg52j","rkey":"3lbhuvzdyof2j"}}, commit.collection: app.bsky.feed.like, commit.operation: create, did: did:plc:hbc74dlsxhq53kp5oxges6d7, kind: commit, time_us: 1732206349004769} | 1970-01-01T00:00:00.010 |
+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-------------------------+
+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+----------------------------+
| data | time_us |
+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+----------------------------+
| {_raw: {"commit":{"cid":"bafyreidblutgvj75o4q4akzyyejedjj6l3it6hgqwee6jpwv2wqph5fsgm","record":{"$type":"app.bsky.feed.post","createdAt":"2024-11-21T16:09:27.095Z","langs":["en"],"reply":{"parent":{"cid":"bafyreibfglofvqou2yiqvwzk4rcgkhhxrbunyemshdjledgwymimqkg24e","uri":"at://did:plc:6tr6tuzlx2db3rduzr2d6r24/app.bsky.feed.post/3lbhqo2rtys2z"},"root":{"cid":"bafyreibfglofvqou2yiqvwzk4rcgkhhxrbunyemshdjledgwymimqkg24e","uri":"at://did:plc:6tr6tuzlx2db3rduzr2d6r24/app.bsky.feed.post/3lbhqo2rtys2z"}},"text":"aaaaah. LIght shines in a corner of WTF...."},"rev":"3lbhtytnn2k2f","rkey":"3lbhtyteurk2y"}}, commit.collection: app.bsky.feed.post, commit.operation: create, did: did:plc:yj3sjq3blzpynh27cumnp5ks, kind: commit, time_us: 1732206349000167} | 2024-11-21T16:25:49.000167 |
| {_raw: {"commit":{"cid":"bafyreia5l5vrkh5oj4cjyhcqby2dprhyvcyofo2q5562tijlae2pzih23m","record":{"$type":"app.bsky.feed.like","createdAt":"2024-11-21T16:25:46.221Z","subject":{"cid":"bafyreidjvrcmckkm765mct5fph36x7kupkfo35rjklbf2k76xkzwyiauge","uri":"at://did:plc:azrv4rcbws6kmcga4fsbphg2/app.bsky.feed.post/3lbgjdpbiec2l"}},"rev":"3lbhuvzds6d2a","rkey":"3lbhuvzdked2a"}}, commit.collection: app.bsky.feed.like, commit.operation: create, did: did:plc:3i4xf2v4wcnyktgv6satke64, kind: commit, time_us: 1732206349000644} | 2024-11-21T16:25:49.000644 |
| {_raw: {"commit":{"cid":"bafyreiew2p6cgirfaj45qoenm4fgumib7xoloclrap3jgkz5es7g7kby3i","record":{"$type":"app.bsky.graph.follow","createdAt":"2024-11-21T16:27:40.923Z","subject":"did:plc:r7cdh4sgzqbfdc6wcdxxti7c"},"rev":"3lbhuvze3gi2u","rkey":"3lbhuvzdtmi2u"}}, commit.collection: app.bsky.graph.follow, commit.operation: create, did: did:plc:gccfnqqizz4urhchsaie6jft, kind: commit, time_us: 1732206349001108} | 2024-11-21T16:25:49.001108 |
| {_raw: {"commit":{"cid":"bafyreidjix4dauj2afjlbzmhj3a7gwftcevvmmy6edww6vrjdbst26rkby","record":{"$type":"app.bsky.feed.like","createdAt":"2024-11-21T16:15:58.232Z","subject":{"cid":"bafyreiavpshyqzrlo5m7fqodjhs6jevweqnif4phasiwimv4a7mnsqi2fe","uri":"at://did:plc:fusulxqc52zbrc75fi6xrcof/app.bsky.feed.post/3lbhskq5zn22f"}},"rev":"3lbhueija5p22","rkey":"3lbhueiizcx22"}}, commit.collection: app.bsky.feed.like, commit.operation: create, did: did:plc:msxqf3twq7abtdw7dbfskphk, kind: commit, time_us: 1732206349001372} | 2024-11-21T16:25:49.001372 |
| {_raw: {"commit":{"cid":"bafyreigzdsdne3z2xxcakgisieyj7y47hj6eg7lj6v4q25ah5q2qotu5ku","record":{"$type":"app.bsky.feed.post","createdAt":"2024-11-21T16:09:27.254Z","langs":["en"],"reply":{"parent":{"cid":"bafyreih35fe2jj3gchmgk4amold4l6sfxd2sby5wrg3jrws5fkdypxrbg4","uri":"at://did:plc:6wx2gg5yqgvmlu35r6y3bk6d/app.bsky.feed.post/3lbhtj2eb4s2o"},"root":{"cid":"bafyreifipyt3vctd4ptuoicvio7rbr5xvjv4afwuggnd2prnmn55mu6luu","uri":"at://did:plc:474ldquxwzrlcvjhhbbk2wte/app.bsky.feed.post/3lbhdzrynik27"}},"text":"okay i take mine back because I hadnt heard this one yet^^"},"rev":"3lbhtytohxc2o","rkey":"3lbhtytjqzk2q"}}, commit.collection: app.bsky.feed.post, commit.operation: create, did: did:plc:l5o3qjrmfztir54cpwlv2eme, kind: commit, time_us: 1732206349001905} | 2024-11-21T16:25:49.001905 |
| {_raw: {"commit":{"cid":"bafyreiaa2vsdr4ckwjg4jq47zfd7mewidywfz3qh3dmglcd6ozi4xwdega","record":{"$type":"app.bsky.graph.follow","createdAt":"2024-11-21T16:15:21.495Z","subject":"did:plc:amsdn2tbjxo3xrwqneqhh4cm"},"rev":"3lbhudfo3yi2w","rkey":"3lbhudfnw4y2w"}}, commit.collection: app.bsky.graph.follow, commit.operation: create, did: did:plc:jkaaf5j2yb2pvpx3ualm3vbh, kind: commit, time_us: 1732206349002758} | 2024-11-21T16:25:49.002758 |
| {_raw: {"commit":{"cid":"bafyreihaatlpar3abtx6ck3kde2ksic6zzflk4ppduhf6dxurytqrv33ni","record":{"$type":"app.bsky.graph.follow","createdAt":"2024-11-21T16:18:39.913Z","subject":"did:plc:gf3vum7insztt5rxrpxdz2id"},"rev":"3lbhujcp4ix2n","rkey":"3lbhujcoxmp2n"}}, commit.collection: app.bsky.graph.follow, commit.operation: create, did: did:plc:tdwz2h4id5dxezvohftsmffu, kind: commit, time_us: 1732206349003106} | 2024-11-21T16:25:49.003106 |
| {_raw: {"commit":{"cid":"bafyreid5ycocp5zq2g7fcx2xxzxrbafuh7b5qhtwuwiomzo6vqila2cbpu","record":{"$type":"app.bsky.feed.repost","createdAt":"2024-11-21T16:23:36.714Z","subject":{"cid":"bafyreieaacfiobnuqvjhhsndyi5s3fd6krbzdduxsyrzfv43kczpcmkl6y","uri":"at://did:plc:o5q6dynpme4ndolc3heztasm/app.bsky.feed.post/3lbfli3qsoc2o"}},"rev":"3lbhus5vior2t","rkey":"3lbhus5vbtz2t"}}, commit.collection: app.bsky.feed.repost, commit.operation: create, did: did:plc:cdsd346mwow7aj3tgfkwsct3, kind: commit, time_us: 1732206349003461} | 2024-11-21T16:25:49.003461 |
| {_raw: {"commit":{"cid":"bafyreibugobcike72y4zxvdyz2oopyt6ywwqfielcwojkb27p7s6rlomgm","record":{"$type":"app.bsky.feed.post","createdAt":"2024-11-21T16:25:44.376Z","langs":["en"],"reply":{"parent":{"cid":"bafyreiaev27cfcxxvn2pdhrwwquzwgclujnulzbcfnn4p4fwgb6migjhw4","uri":"at://did:plc:zec6cslvgc3hhdatrhk6pq5p/app.bsky.feed.post/3lbhujvds4c2b"},"root":{"cid":"bafyreif7qjxhvecwnhlynijj6pf47jwvtkahsz3zh2kaipwu2bw2dxwaqq","uri":"at://did:plc:s4bwqchfzm6gjqfeb6mexgbu/app.bsky.feed.post/3lbhug53kkk2m"}},"text":"\n⌜ Blinking. She hadn't realized she spoke out loud. ⌟\n\n It was nothing like that — . I was only thinking . . . \n\n⌜ Trailing off, her mind occupied. ⌟\n"},"rev":"3lbhuvzeccx2w","rkey":"3lbhuvxf4qs2m"}}, commit.collection: app.bsky.feed.post, commit.operation: create, did: did:plc:s4bwqchfzm6gjqfeb6mexgbu, kind: commit, time_us: 1732206349003907} | 2024-11-21T16:25:49.003907 |
| {_raw: {"commit":{"cid":"bafyreidjk2svg2fdjiiwohmfmvp3hdxhpb33ycnixzbkyib5m6cocindxq","record":{"$type":"app.bsky.feed.like","createdAt":"2024-11-21T16:25:46.167Z","subject":{"cid":"bafyreiaumopip75nzx2xjbugtwemdppsyx54bd2odf6q45f3o7xkocgari","uri":"at://did:plc:ig2jv6gqup4t7gdq2pmanknw/app.bsky.feed.post/3lbhuvtlaec2c"}},"rev":"3lbhuvzedg52j","rkey":"3lbhuvzdyof2j"}}, commit.collection: app.bsky.feed.like, commit.operation: create, did: did:plc:hbc74dlsxhq53kp5oxges6d7, kind: commit, time_us: 1732206349004769} | 2024-11-21T16:25:49.004769 |
+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+----------------------------+

View File

@@ -48,9 +48,9 @@ use flow::{FlownodeBuilder, FrontendClient, GrpcQueryHandlerWithBoxedError};
use frontend::frontend::Frontend;
use frontend::instance::builder::FrontendBuilder;
use frontend::instance::{Instance, StandaloneDatanodeManager};
use frontend::server::Services;
use meta_srv::metasrv::{FLOW_ID_SEQ, TABLE_ID_SEQ};
use servers::grpc::GrpcOptions;
use servers::server::ServerHandlers;
use snafu::ResultExt;
use standalone::options::StandaloneOptions;
@@ -249,7 +249,7 @@ impl GreptimeDbStandaloneBuilder {
procedure_executor.clone(),
Arc::new(ProcessManager::new(server_addr, None)),
)
.with_plugin(plugins)
.with_plugin(plugins.clone())
.try_build()
.await
.unwrap();
@@ -282,14 +282,15 @@ impl GreptimeDbStandaloneBuilder {
test_util::prepare_another_catalog_and_schema(&instance).await;
let mut frontend = Frontend {
let servers = Services::new(opts.clone(), instance.clone(), plugins)
.build()
.unwrap();
let frontend = Frontend {
instance,
servers: ServerHandlers::default(),
servers,
heartbeat_task: None,
};
frontend.start().await.unwrap();
GreptimeDbStandalone {
frontend: Arc::new(frontend),
opts,

View File

@@ -1546,6 +1546,12 @@ create_on_compaction = "auto"
apply_on_query = "auto"
mem_threshold_on_create = "auto"
[region_engine.mito.vector_index]
create_on_flush = "auto"
create_on_compaction = "auto"
apply_on_query = "auto"
mem_threshold_on_create = "auto"
[region_engine.mito.memtable]
type = "time_series"

View File

@@ -18,14 +18,114 @@ use std::{fs, io};
use common_test_util::find_workspace_path;
use frontend::instance::Instance;
use http::StatusCode;
use servers::http::test_helpers::TestClient;
use servers::http::{HTTP_SERVER, HttpServer};
use servers::server::ServerHandlers;
use tests_integration::standalone::GreptimeDbStandaloneBuilder;
use tests_integration::test_util::execute_sql_and_expect;
#[tokio::test]
async fn test_load_jsonbench_data() {
#[tokio::test(flavor = "multi_thread", worker_threads = 4)]
async fn test_load_jsonbench_data_by_pipeline() -> io::Result<()> {
common_telemetry::init_default_ut_logging();
let instance = GreptimeDbStandaloneBuilder::new("test_load_jsonbench_data")
let instance = GreptimeDbStandaloneBuilder::new("test_load_jsonbench_data_by_pipeline")
.build()
.await;
let frontend = instance.fe_instance();
let ServerHandlers::Init(handlers) = instance.frontend.server_handlers() else {
unreachable!()
};
let router = {
let handlers = handlers.lock().unwrap();
let server = handlers
.get(HTTP_SERVER)
.and_then(|x| x.0.as_any().downcast_ref::<HttpServer>())
.unwrap();
server.build(server.make_app()).unwrap()
};
let client = TestClient::new(router).await;
create_table(frontend).await;
desc_table(frontend).await;
create_pipeline(&client).await;
insert_data_by_pipeline(&client).await?;
query_data(frontend).await
}
/// Ingests the JSONBench sample file through the `jsonbench` pipeline over HTTP.
///
/// Reads `tests-integration/resources/jsonbench-head-10.ndjson` from the workspace and
/// posts its raw bytes as `text/plain` to `/v1/ingest`, targeting table `bluesky` with
/// pipeline `jsonbench`.
///
/// # Errors
///
/// Returns the underlying [`io::Error`] if the fixture file cannot be read.
///
/// # Panics
///
/// Panics if the response status is not `200 OK`, or if the response body does not start
/// with the expected output prefix. Both failure messages include the response body.
async fn insert_data_by_pipeline(client: &TestClient) -> io::Result<()> {
    let file = fs::read(find_workspace_path(
        "tests-integration/resources/jsonbench-head-10.ndjson",
    ))?;

    let response = client
        .post("/v1/ingest?table=bluesky&pipeline_name=jsonbench")
        .header("Content-Type", "text/plain")
        .body(file)
        .send()
        .await;

    // Capture the status before reading the body so the body can be shown when the
    // status check fails.
    let status = response.status();
    let body = response.text().await;
    assert_eq!(
        status,
        StatusCode::OK,
        "pipeline ingest returned an unexpected status, response body: {body}"
    );

    // Only the prefix of the body is compared. The prefix also pins the affected-row
    // count to 10, presumably one row per record in the `head-10` fixture.
    let pattern = r#"{"output":[{"affectedrows":10}]"#;
    assert!(
        body.starts_with(pattern),
        "unexpected pipeline ingest response, expected prefix {pattern}, got: {body}"
    );
    Ok(())
}
/// Registers a pipeline named `jsonbench` by POSTing a YAML definition to
/// `/v1/pipelines/jsonbench` with content type `application/x-yaml`.
///
/// The definition lists four processors (`json_parse`, `simple_extract`, `epoch`,
/// `select`) and one transform that marks `time_us` as the `timestamp` index with type
/// `epoch, us`.
///
/// # Panics
///
/// Panics if the response status is not `200 OK`, or if the response body does not start
/// with `{"pipelines":[{"name":"jsonbench"`.
async fn create_pipeline(client: &TestClient) {
// NOTE(review): the YAML below shows no indentation in this view, yet YAML nesting is
// whitespace-sensitive. Presumably the leading whitespace was lost when this text was
// extracted — confirm against the real source file before relying on this literal.
let pipeline = r#"
version: 2
processors:
- json_parse:
fields:
- message, data
ignore_missing: true
- simple_extract:
fields:
- data, time_us
key: "time_us"
ignore_missing: false
- epoch:
fields:
- time_us
resolution: microsecond
- select:
fields:
- time_us
- data
transform:
- fields:
- time_us
type: epoch, us
index: timestamp
"#;
// Upload the definition; the pipeline name is taken from the last URL path segment.
let response = client
.post("/v1/pipelines/jsonbench")
.header("Content-Type", "application/x-yaml")
.body(pipeline)
.send()
.await;
assert_eq!(response.status(), StatusCode::OK);
// Only the prefix of the body is compared; presumably the remainder carries
// per-run values such as a version — TODO confirm.
let response = response.text().await;
let pattern = r#"{"pipelines":[{"name":"jsonbench""#;
assert!(response.starts_with(pattern));
}
#[tokio::test(flavor = "multi_thread", worker_threads = 4)]
async fn test_load_jsonbench_data_by_sql() -> io::Result<()> {
common_telemetry::init_default_ut_logging();
let instance = GreptimeDbStandaloneBuilder::new("test_load_jsonbench_data_by_sql")
.build()
.await;
let frontend = instance.fe_instance();
@@ -34,9 +134,9 @@ async fn test_load_jsonbench_data() {
desc_table(frontend).await;
insert_data(frontend).await.unwrap();
insert_data_by_sql(frontend).await?;
query_data(frontend).await.unwrap();
query_data(frontend).await
}
async fn query_data(frontend: &Arc<Instance>) -> io::Result<()> {
@@ -46,22 +146,21 @@ async fn query_data(frontend: &Arc<Instance>) -> io::Result<()> {
| count(*) |
+----------+
| 10 |
+----------+
"#;
+----------+"#;
execute_sql_and_expect(frontend, sql, expected).await;
let sql = "SELECT * FROM bluesky ORDER BY ts";
let sql = "SELECT * FROM bluesky ORDER BY time_us";
let expected = fs::read_to_string(find_workspace_path(
"tests-integration/resources/jsonbench-select-all.txt",
))?;
execute_sql_and_expect(frontend, sql, &expected).await;
// query 1:
let sql = "\
SELECT \
json_get_string(data, '$.commit.collection') AS event, count() AS count \
FROM bluesky \
GROUP BY event \
let sql = "
SELECT
json_get_string(data, '$.commit.collection') AS event, count() AS count
FROM bluesky
GROUP BY event
ORDER BY count DESC, event ASC";
let expected = r#"
+-----------------------+-------+
@@ -75,16 +174,16 @@ ORDER BY count DESC, event ASC";
execute_sql_and_expect(frontend, sql, expected).await;
// query 2:
let sql = "\
SELECT \
json_get_string(data, '$.commit.collection') AS event, \
count() AS count, \
count(DISTINCT json_get_string(data, '$.did')) AS users \
FROM bluesky \
WHERE \
(json_get_string(data, '$.kind') = 'commit') AND \
(json_get_string(data, '$.commit.operation') = 'create') \
GROUP BY event \
let sql = "
SELECT
json_get_string(data, '$.commit.collection') AS event,
count() AS count,
count(DISTINCT json_get_string(data, '$.did')) AS users
FROM bluesky
WHERE
(json_get_string(data, '$.kind') = 'commit') AND
(json_get_string(data, '$.commit.operation') = 'create')
GROUP BY event
ORDER BY count DESC, event ASC";
let expected = r#"
+-----------------------+-------+-------+
@@ -97,22 +196,100 @@ ORDER BY count DESC, event ASC";
+-----------------------+-------+-------+"#;
execute_sql_and_expect(frontend, sql, expected).await;
// query 3:
let sql = "
SELECT
json_get_string(data, '$.commit.collection') AS event,
date_part('hour', to_timestamp_micros(json_get_int(data, '$.time_us'))) as hour_of_day,
count() AS count
FROM bluesky
WHERE
(json_get_string(data, '$.kind') = 'commit') AND
(json_get_string(data, '$.commit.operation') = 'create') AND
json_get_string(data, '$.commit.collection') IN
('app.bsky.feed.post', 'app.bsky.feed.repost', 'app.bsky.feed.like')
GROUP BY event, hour_of_day
ORDER BY hour_of_day, event";
let expected = r#"
+----------------------+-------------+-------+
| event | hour_of_day | count |
+----------------------+-------------+-------+
| app.bsky.feed.like | 16 | 3 |
| app.bsky.feed.post | 16 | 3 |
| app.bsky.feed.repost | 16 | 1 |
+----------------------+-------------+-------+"#;
execute_sql_and_expect(frontend, sql, expected).await;
// query 4:
let sql = "
SELECT
json_get_string(data, '$.did') as user_id,
min(to_timestamp_micros(json_get_int(data, '$.time_us'))) AS first_post_ts
FROM bluesky
WHERE
(json_get_string(data, '$.kind') = 'commit') AND
(json_get_string(data, '$.commit.operation') = 'create') AND
(json_get_string(data, '$.commit.collection') = 'app.bsky.feed.post')
GROUP BY user_id
ORDER BY first_post_ts ASC, user_id DESC
LIMIT 3";
let expected = r#"
+----------------------------------+----------------------------+
| user_id | first_post_ts |
+----------------------------------+----------------------------+
| did:plc:yj3sjq3blzpynh27cumnp5ks | 2024-11-21T16:25:49.000167 |
| did:plc:l5o3qjrmfztir54cpwlv2eme | 2024-11-21T16:25:49.001905 |
| did:plc:s4bwqchfzm6gjqfeb6mexgbu | 2024-11-21T16:25:49.003907 |
+----------------------------------+----------------------------+"#;
execute_sql_and_expect(frontend, sql, expected).await;
// query 5:
let sql = "
SELECT
json_get_string(data, '$.did') as user_id,
date_part(
'epoch',
max(to_timestamp_micros(json_get_int(data, '$.time_us'))) -
min(to_timestamp_micros(json_get_int(data, '$.time_us')))
) AS activity_span
FROM bluesky
WHERE
(json_get_string(data, '$.kind') = 'commit') AND
(json_get_string(data, '$.commit.operation') = 'create') AND
(json_get_string(data, '$.commit.collection') = 'app.bsky.feed.post')
GROUP BY user_id
ORDER BY activity_span DESC, user_id DESC
LIMIT 3";
let expected = r#"
+----------------------------------+---------------+
| user_id | activity_span |
+----------------------------------+---------------+
| did:plc:yj3sjq3blzpynh27cumnp5ks | 0.0 |
| did:plc:s4bwqchfzm6gjqfeb6mexgbu | 0.0 |
| did:plc:l5o3qjrmfztir54cpwlv2eme | 0.0 |
+----------------------------------+---------------+"#;
execute_sql_and_expect(frontend, sql, expected).await;
Ok(())
}
async fn insert_data(frontend: &Arc<Instance>) -> io::Result<()> {
async fn insert_data_by_sql(frontend: &Arc<Instance>) -> io::Result<()> {
let file = fs::File::open(find_workspace_path(
"tests-integration/resources/jsonbench-head-10.ndjson",
))?;
let reader = io::BufReader::new(file);
for (i, line) in reader.lines().enumerate() {
for line in reader.lines() {
let line = line?;
if line.is_empty() {
continue;
}
let json: serde_json::Value = serde_json::from_str(&line)?;
let time_us = json.pointer("/time_us").and_then(|x| x.as_u64()).unwrap();
let sql = format!(
"INSERT INTO bluesky (ts, data) VALUES ({}, '{}')",
i + 1,
"INSERT INTO bluesky (time_us, data) VALUES ({}, '{}')",
time_us,
line.replace("'", "''"), // standard method to escape the single quote
);
execute_sql_and_expect(frontend, &sql, "Affected Rows: 1").await;
@@ -123,12 +300,12 @@ async fn insert_data(frontend: &Arc<Instance>) -> io::Result<()> {
async fn desc_table(frontend: &Arc<Instance>) {
let sql = "DESC TABLE bluesky";
let expected = r#"
+--------+----------------------------------------------------------------------------------------------------------------------------------------------+-----+------+---------+---------------+
| Column | Type | Key | Null | Default | Semantic Type |
+--------+----------------------------------------------------------------------------------------------------------------------------------------------+-----+------+---------+---------------+
| data | Json<Object{"_raw": String, "commit.collection": String, "commit.operation": String, "did": String, "kind": String, "time_us": Number(I64)}> | | YES | | FIELD |
| ts | TimestampMillisecond | PRI | NO | | TIMESTAMP |
+--------+----------------------------------------------------------------------------------------------------------------------------------------------+-----+------+---------+---------------+"#;
+---------+------------------------------------------------------------------------------------------------------------------------------------------------+-----+------+---------+---------------+
| Column | Type | Key | Null | Default | Semantic Type |
+---------+------------------------------------------------------------------------------------------------------------------------------------------------+-----+------+---------+---------------+
| data | Json<{"_raw":"<String>","commit.collection":"<String>","commit.operation":"<String>","did":"<String>","kind":"<String>","time_us":"<Number>"}> | | YES | | FIELD |
| time_us | TimestampMicrosecond | PRI | NO | | TIMESTAMP |
+---------+------------------------------------------------------------------------------------------------------------------------------------------------+-----+------+---------+---------------+"#;
execute_sql_and_expect(frontend, sql, expected).await;
}
@@ -145,7 +322,7 @@ CREATE TABLE bluesky (
time_us Bigint
>,
),
ts Timestamp TIME INDEX,
time_us TimestampMicrosecond TIME INDEX,
)
"#;
execute_sql_and_expect(frontend, sql, "Affected Rows: 0").await;

View File

@@ -12,7 +12,7 @@ DESC TABLE t;
| Column | Type | Key | Null | Default | Semantic Type |
+--------+----------------------+-----+------+---------+---------------+
| ts | TimestampMillisecond | PRI | NO | | TIMESTAMP |
| j | Json<Null> | | YES | | FIELD |
| j | Json<"<Null>"> | | YES | | FIELD |
+--------+----------------------+-----+------+---------+---------------+
INSERT INTO t VALUES
@@ -24,12 +24,12 @@ Affected Rows: 3
DESC TABLE t;
+--------+-----------------------------------------------------------------------------------------------------------------------------------------+-----+------+---------+---------------+
| Column | Type | Key | Null | Default | Semantic Type |
+--------+-----------------------------------------------------------------------------------------------------------------------------------------+-----+------+---------+---------------+
| ts | TimestampMillisecond | PRI | NO | | TIMESTAMP |
| j | Json<Object{"int": Number(I64), "list": Array[Number(F64)], "nested": Object{"a": Object{"x": String}, "b": Object{"y": Number(I64)}}}> | | YES | | FIELD |
+--------+-----------------------------------------------------------------------------------------------------------------------------------------+-----+------+---------+---------------+
+--------+---------------------------------------------------------------------------------------------------+-----+------+---------+---------------+
| Column | Type | Key | Null | Default | Semantic Type |
+--------+---------------------------------------------------------------------------------------------------+-----+------+---------+---------------+
| ts | TimestampMillisecond | PRI | NO | | TIMESTAMP |
| j | Json<{"int":"<Number>","list":["<Number>"],"nested":{"a":{"x":"<String>"},"b":{"y":"<Number>"}}}> | | YES | | FIELD |
+--------+---------------------------------------------------------------------------------------------------+-----+------+---------+---------------+
INSERT INTO t VALUES
(1762128004000, '{"int": 4, "bool": true, "nested": {"a": {"y": 1}}}'),
@@ -39,12 +39,12 @@ Affected Rows: 2
DESC TABLE t;
+--------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----+------+---------+---------------+
| Column | Type | Key | Null | Default | Semantic Type |
+--------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----+------+---------+---------------+
| ts | TimestampMillisecond | PRI | NO | | TIMESTAMP |
| j | Json<Object{"bool": Bool, "int": Number(I64), "list": Array[Number(F64)], "nested": Object{"a": Object{"x": String, "y": Number(I64)}, "b": Object{"x": String, "y": Number(I64)}}}> | | YES | | FIELD |
+--------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----+------+---------+---------------+
+--------+-------------------------------------------------------------------------------------------------------------------------------------------------+-----+------+---------+---------------+
| Column | Type | Key | Null | Default | Semantic Type |
+--------+-------------------------------------------------------------------------------------------------------------------------------------------------+-----+------+---------+---------------+
| ts | TimestampMillisecond | PRI | NO | | TIMESTAMP |
| j | Json<{"bool":"<Bool>","int":"<Number>","list":["<Number>"],"nested":{"a":{"x":"<String>","y":"<Number>"},"b":{"x":"<String>","y":"<Number>"}}}> | | YES | | FIELD |
+--------+-------------------------------------------------------------------------------------------------------------------------------------------------+-----+------+---------+---------------+
INSERT INTO t VALUES (1762128006000, '{"int": 6, "list": [-6.0], "bool": true, "nested": {"a": {"x": "ax", "y": 66}, "b": {"y": -66, "x": "bx"}}}');
@@ -52,12 +52,12 @@ Affected Rows: 1
DESC TABLE t;
+--------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----+------+---------+---------------+
| Column | Type | Key | Null | Default | Semantic Type |
+--------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----+------+---------+---------------+
| ts | TimestampMillisecond | PRI | NO | | TIMESTAMP |
| j | Json<Object{"bool": Bool, "int": Number(I64), "list": Array[Number(F64)], "nested": Object{"a": Object{"x": String, "y": Number(I64)}, "b": Object{"x": String, "y": Number(I64)}}}> | | YES | | FIELD |
+--------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----+------+---------+---------------+
+--------+-------------------------------------------------------------------------------------------------------------------------------------------------+-----+------+---------+---------------+
| Column | Type | Key | Null | Default | Semantic Type |
+--------+-------------------------------------------------------------------------------------------------------------------------------------------------+-----+------+---------+---------------+
| ts | TimestampMillisecond | PRI | NO | | TIMESTAMP |
| j | Json<{"bool":"<Bool>","int":"<Number>","list":["<Number>"],"nested":{"a":{"x":"<String>","y":"<Number>"},"b":{"x":"<String>","y":"<Number>"}}}> | | YES | | FIELD |
+--------+-------------------------------------------------------------------------------------------------------------------------------------------------+-----+------+---------+---------------+
INSERT INTO t VALUES (1762128011000, '{}');