feat(mito): Implement SST format for mito2 (#2178)

* chore: update comment

* feat: stream writer takes arrow's types

* feat: Define Batch struct

* feat: arrow_schema_to_store

* refactor: rename

* feat: write parquet in new format with tsids

* feat: reader support projection

* feat: Impl read compat

* refactor: rename SchemaCompat to CompatRecordBatch

* feat: changing sst format

* feat: make it compile

* feat: remove tsid and some structs

* feat: from_sst_record_batch wip

* chore: push array

* chore: wip

* feat: decode batches from RecordBatch

* feat: reader converts record batches

* feat: remove compat mod

* chore: remove some codes

* feat: sort fields by column id

* test: test to_sst_arrow_schema

* feat: do not sort fields

* test: more test helpers

* feat: simplify projection

* fix: projection indices is incorrect

* refactor: define write/read format

* test: test write format

* test: test projection

* test: test convert record batch

* feat: remove unused errors

* refactor: wrap get_field_batch_columns

* chore: clippy

* chore: fix clippy

* feat: build arrow schema from region meta in ReadFormat

* feat: initialize the parquet reader at `build()`

* chore: fix typo
This commit is contained in:
Yingwen
2023-08-17 14:25:50 +08:00
committed by GitHub
parent 832e5dcfd7
commit 4ba12155fe
8 changed files with 917 additions and 109 deletions

View File

@@ -23,6 +23,7 @@ use std::sync::Arc;
use api::v1::SemanticType;
use common_error::ext::ErrorExt;
use common_error::status_code::StatusCode;
use datatypes::arrow::datatypes::FieldRef;
use datatypes::prelude::DataType;
use datatypes::schema::{ColumnSchema, Schema, SchemaRef};
use serde::de::Error;
@@ -124,6 +125,11 @@ impl<'de> Deserialize<'de> for RegionMetadata {
}
impl RegionMetadata {
/// Decodes a region metadata from its JSON representation.
///
/// # Errors
///
/// Returns an error built with `SerdeJsonSnafu` when `s` cannot be
/// deserialized by `serde_json`.
pub fn from_json(s: &str) -> Result<Self> {
    serde_json::from_str::<Self>(s).context(SerdeJsonSnafu)
}
/// Encode the metadata to a JSON string.
pub fn to_json(&self) -> Result<String> {
serde_json::to_string(&self).context(SerdeJsonSnafu)
@@ -136,6 +142,11 @@ impl RegionMetadata {
.map(|index| &self.column_metadatas[*index])
}
/// Looks up the index recorded for `column_id` in `id_to_index`.
///
/// Returns `None` when the id has no entry in the mapping.
pub fn column_index_by_id(&self, column_id: ColumnId) -> Option<usize> {
    let index = self.id_to_index.get(&column_id)?;
    Some(*index)
}
/// Returns the time index column
///
/// # Panics
@@ -145,6 +156,26 @@ impl RegionMetadata {
&self.column_metadatas[index]
}
/// Returns a clone of the arrow field that describes the time index column.
///
/// The position stored in `id_to_index` for the time index id is used to
/// pick the field out of the arrow schema.
///
/// # Panics
///
/// Indexing is unchecked here: this presumably panics if the time index id
/// is missing from `id_to_index` or if the stored position is out of range
/// for the arrow schema fields.
pub fn time_index_field(&self) -> FieldRef {
    let position = self.id_to_index[&self.time_index];
    let arrow_schema = self.schema.arrow_schema();
    arrow_schema.fields[position].clone()
}
/// Looks up the metadata of the column called `name`.
///
/// The name is resolved to a position through the schema, and that position
/// is then used to index `column_metadatas`. Returns `None` when the schema
/// has no column with this name.
pub fn column_by_name(&self, name: &str) -> Option<&ColumnMetadata> {
    let index = self.schema.column_index_by_name(name)?;
    Some(&self.column_metadatas[index])
}
/// Iterates over the columns whose semantic type is `SemanticType::Field`.
///
/// Columns are yielded in the order they appear in `column_metadatas`.
pub fn field_columns(&self) -> impl Iterator<Item = &ColumnMetadata> {
    self.column_metadatas
        .iter()
        .filter(|column| matches!(column.semantic_type, SemanticType::Field))
}
/// Checks whether the metadata is valid.
fn validate(&self) -> Result<()> {
// Id to name.
@@ -264,6 +295,7 @@ impl RegionMetadata {
/// Checks whether it is a valid column.
fn validate_column_metadata(column_metadata: &ColumnMetadata) -> Result<()> {
// TODO(yingwen): Ensure the column name is not one of the internal column names.
if column_metadata.semantic_type == SemanticType::Timestamp {
ensure!(
column_metadata

View File

@@ -81,6 +81,9 @@ pub const SEQUENCE_COLUMN_NAME: &str = "__sequence";
/// Column name reserved for the internal `op_type` column.
pub const OP_TYPE_COLUMN_NAME: &str = "__op_type";
/// Column name reserved for the internal `primary_key` column.
pub const PRIMARY_KEY_COLUMN_NAME: &str = "__primary_key";
// -----------------------------------------------------------------------------
// ---------- Default options --------------------------------------------------