Compare commits


6 Commits

Author SHA1 Message Date
JohnsonLee
a8477e4142 fix: table resolving logic related to pg_catalog (#4580)
* fix: table resolving logic related to pg_catalog

refer to
https://github.com/GreptimeTeam/greptimedb/issues/3560#issuecomment-2287794348
and #4543

* refactor: remove CatalogProtocol type

* fix: sqlness

* fix: forbid create database pg_catalog with mysql client

* refactor: use QueryContext as arguments rather than Channel

* refactor: pass None as default behaviour in information_schema

* test: fix test
2024-09-09 00:47:59 +00:00
Yiran
b950e705f5 chore: update the document link in README.md (#4690) 2024-09-07 15:27:32 +00:00
Ruihang Xia
d2d62e0c6f fix: unconditional statistics (#4694)
* fix: unconditional statistics

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

* add more sqlness case

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>

---------

Signed-off-by: Ruihang Xia <waynestxia@gmail.com>
2024-09-07 04:28:11 +00:00
localhost
5d9f8a3be7 feat: add test pipeline api (#4667)
* chore: add test pipeline api

* chore: add test for test pipeline api

* chore: fix taplo check

* chore: change pipeline dryrun api path

* chore: add more info for pipeline dryrun api
2024-09-06 08:36:49 +00:00
jeremyhi
e88465840d feat: add extension field to HeartbeatRequest (#4688)
* feat: add extension field to HeartbeatRequest

* chore: extension to extensions

* chore: upgrade proto
2024-09-06 08:29:20 +00:00
localhost
67d95d2088 refactor!: add processor builder and transform builder (#4571)
* chore: add processor builder and transform builder

* chore: in process

* chore: intermediate state from hashmap to vector in pipeline

* chore: remove useless code and rename some struct

* chore: fix typos

* chore: format code

* chore: add error handling and optimize code readability

* chore: fix typos

* chore: remove useless code

* chore: add some doc

* chore: fix by pr commit

* chore: remove useless code and change struct name

* chore: modify the location of the find_key_index function.
2024-09-06 07:51:08 +00:00
97 changed files with 3462 additions and 2486 deletions

Cargo.lock generated
View File

@@ -3156,6 +3156,7 @@ dependencies = [
"arrow",
"arrow-array",
"arrow-schema",
"base64 0.21.7",
"common-base",
"common-decimal",
"common-error",
@@ -3164,6 +3165,7 @@ dependencies = [
"common-time",
"datafusion-common",
"enum_dispatch",
"greptime-proto",
"num",
"num-traits",
"ordered-float 3.9.2",
@@ -4300,7 +4302,7 @@ checksum = "d2fabcfbdc87f4758337ca535fb41a6d701b65693ce38287d856d1674551ec9b"
[[package]]
name = "greptime-proto"
version = "0.1.0"
source = "git+https://github.com/GreptimeTeam/greptime-proto.git?rev=c437b55725b7f5224fe9d46db21072b4a682ee4b#c437b55725b7f5224fe9d46db21072b4a682ee4b"
source = "git+https://github.com/GreptimeTeam/greptime-proto.git?rev=157cfdb52709e489cf1f3ce8e3042ed4ee8a524a#157cfdb52709e489cf1f3ce8e3042ed4ee8a524a"
dependencies = [
"prost 0.12.6",
"serde",

View File

@@ -120,7 +120,7 @@ etcd-client = { version = "0.13" }
fst = "0.4.7"
futures = "0.3"
futures-util = "0.3"
greptime-proto = { git = "https://github.com/GreptimeTeam/greptime-proto.git", rev = "c437b55725b7f5224fe9d46db21072b4a682ee4b" }
greptime-proto = { git = "https://github.com/GreptimeTeam/greptime-proto.git", rev = "157cfdb52709e489cf1f3ce8e3042ed4ee8a524a" }
humantime = "2.1"
humantime-serde = "1.1"
itertools = "0.10"

View File

@@ -74,7 +74,7 @@ Our core developers have been building time-series data platforms for years. Bas
* **Compatible with InfluxDB, Prometheus and more protocols**
Widely adopted database protocols and APIs, including MySQL, PostgreSQL, and Prometheus Remote Storage, etc. [Read more](https://docs.greptime.com/user-guide/clients/overview).
Widely adopted database protocols and APIs, including MySQL, PostgreSQL, and Prometheus Remote Storage, etc. [Read more](https://docs.greptime.com/user-guide/protocols/overview).
## Try GreptimeDB

View File

@@ -21,14 +21,14 @@ use greptime_proto::v1::region::RegionResponse as RegionResponseV1;
#[derive(Debug)]
pub struct RegionResponse {
pub affected_rows: AffectedRows,
pub extension: HashMap<String, Vec<u8>>,
pub extensions: HashMap<String, Vec<u8>>,
}
impl RegionResponse {
pub fn from_region_response(region_response: RegionResponseV1) -> Self {
Self {
affected_rows: region_response.affected_rows as _,
extension: region_response.extension,
extensions: region_response.extensions,
}
}
@@ -36,7 +36,7 @@ impl RegionResponse {
pub fn new(affected_rows: AffectedRows) -> Self {
Self {
affected_rows,
extension: Default::default(),
extensions: Default::default(),
}
}
}

View File

@@ -36,6 +36,7 @@ use futures_util::{StreamExt, TryStreamExt};
use meta_client::client::MetaClient;
use moka::sync::Cache;
use partition::manager::{PartitionRuleManager, PartitionRuleManagerRef};
use session::context::{Channel, QueryContext};
use snafu::prelude::*;
use table::dist_table::DistTable;
use table::table::numbers::{NumbersTable, NUMBERS_TABLE_NAME};
@@ -152,7 +153,11 @@ impl CatalogManager for KvBackendCatalogManager {
Ok(keys)
}
async fn schema_names(&self, catalog: &str) -> Result<Vec<String>> {
async fn schema_names(
&self,
catalog: &str,
query_ctx: Option<&QueryContext>,
) -> Result<Vec<String>> {
let stream = self
.table_metadata_manager
.schema_manager()
@@ -163,12 +168,17 @@ impl CatalogManager for KvBackendCatalogManager {
.map_err(BoxedError::new)
.context(ListSchemasSnafu { catalog })?;
keys.extend(self.system_catalog.schema_names());
keys.extend(self.system_catalog.schema_names(query_ctx));
Ok(keys.into_iter().collect())
}
async fn table_names(&self, catalog: &str, schema: &str) -> Result<Vec<String>> {
async fn table_names(
&self,
catalog: &str,
schema: &str,
query_ctx: Option<&QueryContext>,
) -> Result<Vec<String>> {
let stream = self
.table_metadata_manager
.table_name_manager()
@@ -181,7 +191,7 @@ impl CatalogManager for KvBackendCatalogManager {
.into_iter()
.map(|(k, _)| k)
.collect::<Vec<_>>();
tables.extend_from_slice(&self.system_catalog.table_names(schema));
tables.extend_from_slice(&self.system_catalog.table_names(schema, query_ctx));
Ok(tables.into_iter().collect())
}
@@ -194,8 +204,13 @@ impl CatalogManager for KvBackendCatalogManager {
.context(TableMetadataManagerSnafu)
}
async fn schema_exists(&self, catalog: &str, schema: &str) -> Result<bool> {
if self.system_catalog.schema_exists(schema) {
async fn schema_exists(
&self,
catalog: &str,
schema: &str,
query_ctx: Option<&QueryContext>,
) -> Result<bool> {
if self.system_catalog.schema_exists(schema, query_ctx) {
return Ok(true);
}
@@ -206,8 +221,14 @@ impl CatalogManager for KvBackendCatalogManager {
.context(TableMetadataManagerSnafu)
}
async fn table_exists(&self, catalog: &str, schema: &str, table: &str) -> Result<bool> {
if self.system_catalog.table_exists(schema, table) {
async fn table_exists(
&self,
catalog: &str,
schema: &str,
table: &str,
query_ctx: Option<&QueryContext>,
) -> Result<bool> {
if self.system_catalog.table_exists(schema, table, query_ctx) {
return Ok(true);
}
@@ -225,10 +246,12 @@ impl CatalogManager for KvBackendCatalogManager {
catalog_name: &str,
schema_name: &str,
table_name: &str,
query_ctx: Option<&QueryContext>,
) -> Result<Option<TableRef>> {
if let Some(table) = self
.system_catalog
.table(catalog_name, schema_name, table_name)
let channel = query_ctx.map_or(Channel::Unknown, |ctx| ctx.channel());
if let Some(table) =
self.system_catalog
.table(catalog_name, schema_name, table_name, query_ctx)
{
return Ok(Some(table));
}
@@ -236,23 +259,45 @@ impl CatalogManager for KvBackendCatalogManager {
let table_cache: TableCacheRef = self.cache_registry.get().context(CacheNotFoundSnafu {
name: "table_cache",
})?;
table_cache
if let Some(table) = table_cache
.get_by_ref(&TableName {
catalog_name: catalog_name.to_string(),
schema_name: schema_name.to_string(),
table_name: table_name.to_string(),
})
.await
.context(GetTableCacheSnafu)
.context(GetTableCacheSnafu)?
{
return Ok(Some(table));
}
if channel == Channel::Postgres {
// falldown to pg_catalog
if let Some(table) =
self.system_catalog
.table(catalog_name, PG_CATALOG_NAME, table_name, query_ctx)
{
return Ok(Some(table));
}
}
return Ok(None);
}
fn tables<'a>(&'a self, catalog: &'a str, schema: &'a str) -> BoxStream<'a, Result<TableRef>> {
fn tables<'a>(
&'a self,
catalog: &'a str,
schema: &'a str,
query_ctx: Option<&'a QueryContext>,
) -> BoxStream<'a, Result<TableRef>> {
let sys_tables = try_stream!({
// System tables
let sys_table_names = self.system_catalog.table_names(schema);
let sys_table_names = self.system_catalog.table_names(schema, query_ctx);
for table_name in sys_table_names {
if let Some(table) = self.system_catalog.table(catalog, schema, &table_name) {
if let Some(table) =
self.system_catalog
.table(catalog, schema, &table_name, query_ctx)
{
yield table;
}
}
@@ -320,18 +365,27 @@ struct SystemCatalog {
}
impl SystemCatalog {
// TODO(j0hn50n133): remove the duplicated hard-coded table names logic
fn schema_names(&self) -> Vec<String> {
vec![
INFORMATION_SCHEMA_NAME.to_string(),
PG_CATALOG_NAME.to_string(),
]
fn schema_names(&self, query_ctx: Option<&QueryContext>) -> Vec<String> {
let channel = query_ctx.map_or(Channel::Unknown, |ctx| ctx.channel());
match channel {
// pg_catalog only visible under postgres protocol
Channel::Postgres => vec![
INFORMATION_SCHEMA_NAME.to_string(),
PG_CATALOG_NAME.to_string(),
],
_ => {
vec![INFORMATION_SCHEMA_NAME.to_string()]
}
}
}
fn table_names(&self, schema: &str) -> Vec<String> {
fn table_names(&self, schema: &str, query_ctx: Option<&QueryContext>) -> Vec<String> {
let channel = query_ctx.map_or(Channel::Unknown, |ctx| ctx.channel());
match schema {
INFORMATION_SCHEMA_NAME => self.information_schema_provider.table_names(),
PG_CATALOG_NAME => self.pg_catalog_provider.table_names(),
PG_CATALOG_NAME if channel == Channel::Postgres => {
self.pg_catalog_provider.table_names()
}
DEFAULT_SCHEMA_NAME => {
vec![NUMBERS_TABLE_NAME.to_string()]
}
@@ -339,23 +393,35 @@ impl SystemCatalog {
}
}
fn schema_exists(&self, schema: &str) -> bool {
schema == INFORMATION_SCHEMA_NAME || schema == PG_CATALOG_NAME
fn schema_exists(&self, schema: &str, query_ctx: Option<&QueryContext>) -> bool {
let channel = query_ctx.map_or(Channel::Unknown, |ctx| ctx.channel());
match channel {
Channel::Postgres => schema == PG_CATALOG_NAME || schema == INFORMATION_SCHEMA_NAME,
_ => schema == INFORMATION_SCHEMA_NAME,
}
}
fn table_exists(&self, schema: &str, table: &str) -> bool {
fn table_exists(&self, schema: &str, table: &str, query_ctx: Option<&QueryContext>) -> bool {
let channel = query_ctx.map_or(Channel::Unknown, |ctx| ctx.channel());
if schema == INFORMATION_SCHEMA_NAME {
self.information_schema_provider.table(table).is_some()
} else if schema == DEFAULT_SCHEMA_NAME {
table == NUMBERS_TABLE_NAME
} else if schema == PG_CATALOG_NAME {
} else if schema == PG_CATALOG_NAME && channel == Channel::Postgres {
self.pg_catalog_provider.table(table).is_some()
} else {
false
}
}
fn table(&self, catalog: &str, schema: &str, table_name: &str) -> Option<TableRef> {
fn table(
&self,
catalog: &str,
schema: &str,
table_name: &str,
query_ctx: Option<&QueryContext>,
) -> Option<TableRef> {
let channel = query_ctx.map_or(Channel::Unknown, |ctx| ctx.channel());
if schema == INFORMATION_SCHEMA_NAME {
let information_schema_provider =
self.catalog_cache.get_with_by_ref(catalog, move || {
@@ -366,7 +432,7 @@ impl SystemCatalog {
))
});
information_schema_provider.table(table_name)
} else if schema == PG_CATALOG_NAME {
} else if schema == PG_CATALOG_NAME && channel == Channel::Postgres {
if catalog == DEFAULT_CATALOG_NAME {
self.pg_catalog_provider.table(table_name)
} else {
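The rule these hunks implement, as a reduced, self-contained sketch (illustrative names only, not the crate's actual SystemCatalog/KvBackendCatalogManager API): pg_catalog is advertised only to Postgres clients, and a table lookup from a Postgres client falls back to pg_catalog when the user schema misses.

#[derive(PartialEq, Clone, Copy)]
enum Channel {
    Mysql,
    Postgres,
    Unknown,
}

const INFORMATION_SCHEMA_NAME: &str = "information_schema";
const PG_CATALOG_NAME: &str = "pg_catalog";

// System schemas visible to a client depend on the protocol it connected with:
// pg_catalog is only advertised to Postgres clients.
fn system_schema_names(channel: Channel) -> Vec<&'static str> {
    match channel {
        Channel::Postgres => vec![INFORMATION_SCHEMA_NAME, PG_CATALOG_NAME],
        _ => vec![INFORMATION_SCHEMA_NAME],
    }
}

// Table lookup falls back to pg_catalog for Postgres clients when the table is
// not found in the user schema, mirroring the `channel == Channel::Postgres`
// branch added above.
fn resolve_table(found_in_user_schema: bool, table: &str, channel: Channel) -> Option<String> {
    if found_in_user_schema {
        return Some(table.to_string());
    }
    if channel == Channel::Postgres {
        return Some(format!("{PG_CATALOG_NAME}.{table}"));
    }
    None
}

fn main() {
    assert_eq!(system_schema_names(Channel::Mysql), vec![INFORMATION_SCHEMA_NAME]);
    assert_eq!(system_schema_names(Channel::Unknown), vec![INFORMATION_SCHEMA_NAME]);
    assert!(resolve_table(false, "pg_class", Channel::Postgres).is_some());
    assert!(resolve_table(false, "pg_class", Channel::Mysql).is_none());
}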

View File

@@ -20,8 +20,10 @@ use std::fmt::{Debug, Formatter};
use std::sync::Arc;
use api::v1::CreateTableExpr;
use common_catalog::consts::{INFORMATION_SCHEMA_NAME, PG_CATALOG_NAME};
use futures::future::BoxFuture;
use futures_util::stream::BoxStream;
use session::context::QueryContext;
use table::metadata::TableId;
use table::TableRef;
@@ -44,15 +46,35 @@ pub trait CatalogManager: Send + Sync {
async fn catalog_names(&self) -> Result<Vec<String>>;
async fn schema_names(&self, catalog: &str) -> Result<Vec<String>>;
async fn schema_names(
&self,
catalog: &str,
query_ctx: Option<&QueryContext>,
) -> Result<Vec<String>>;
async fn table_names(&self, catalog: &str, schema: &str) -> Result<Vec<String>>;
async fn table_names(
&self,
catalog: &str,
schema: &str,
query_ctx: Option<&QueryContext>,
) -> Result<Vec<String>>;
async fn catalog_exists(&self, catalog: &str) -> Result<bool>;
async fn schema_exists(&self, catalog: &str, schema: &str) -> Result<bool>;
async fn schema_exists(
&self,
catalog: &str,
schema: &str,
query_ctx: Option<&QueryContext>,
) -> Result<bool>;
async fn table_exists(&self, catalog: &str, schema: &str, table: &str) -> Result<bool>;
async fn table_exists(
&self,
catalog: &str,
schema: &str,
table: &str,
query_ctx: Option<&QueryContext>,
) -> Result<bool>;
/// Returns the table by catalog, schema and table name.
async fn table(
@@ -60,10 +82,25 @@ pub trait CatalogManager: Send + Sync {
catalog: &str,
schema: &str,
table_name: &str,
query_ctx: Option<&QueryContext>,
) -> Result<Option<TableRef>>;
/// Returns all tables with a stream by catalog and schema.
fn tables<'a>(&'a self, catalog: &'a str, schema: &'a str) -> BoxStream<'a, Result<TableRef>>;
fn tables<'a>(
&'a self,
catalog: &'a str,
schema: &'a str,
query_ctx: Option<&'a QueryContext>,
) -> BoxStream<'a, Result<TableRef>>;
/// Check if `schema` is a reserved schema name
fn is_reserved_schema_name(&self, schema: &str) -> bool {
// We have to check whether a schema name is reserved before create schema.
// We need this rather than use schema_exists directly because `pg_catalog` is
// only visible via postgres protocol. So if we don't check, a mysql client may
// create a schema named `pg_catalog` which is somehow malformed.
schema == INFORMATION_SCHEMA_NAME || schema == PG_CATALOG_NAME
}
}
pub type CatalogManagerRef = Arc<dyn CatalogManager>;
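The comment on is_reserved_schema_name is the crux of the "forbid create database pg_catalog with mysql client" change: schema_exists hides pg_catalog from non-Postgres channels, so CREATE DATABASE must consult the reserved list explicitly. A minimal sketch of that guard, with assumed names rather than the real statement executor:

const INFORMATION_SCHEMA_NAME: &str = "information_schema";
const PG_CATALOG_NAME: &str = "pg_catalog";

fn is_reserved_schema_name(schema: &str) -> bool {
    schema == INFORMATION_SCHEMA_NAME || schema == PG_CATALOG_NAME
}

// schema_exists() reports false for pg_catalog on a MySQL connection (the
// schema is hidden from that channel), so CREATE DATABASE has to check the
// reserved list before creating anything.
fn create_database(name: &str, already_exists: bool) -> Result<(), String> {
    if is_reserved_schema_name(name) {
        return Err(format!("schema {name} is reserved"));
    }
    if already_exists {
        return Err(format!("schema {name} already exists"));
    }
    // proceed with the create-database procedure
    Ok(())
}

fn main() {
    assert!(create_database("pg_catalog", false).is_err());
    assert!(create_database("my_db", false).is_ok());
}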

View File

@@ -26,6 +26,7 @@ use common_catalog::consts::{
use common_meta::key::flow::FlowMetadataManager;
use common_meta::kv_backend::memory::MemoryKvBackend;
use futures_util::stream::BoxStream;
use session::context::QueryContext;
use snafu::OptionExt;
use table::TableRef;
@@ -53,7 +54,11 @@ impl CatalogManager for MemoryCatalogManager {
Ok(self.catalogs.read().unwrap().keys().cloned().collect())
}
async fn schema_names(&self, catalog: &str) -> Result<Vec<String>> {
async fn schema_names(
&self,
catalog: &str,
_query_ctx: Option<&QueryContext>,
) -> Result<Vec<String>> {
Ok(self
.catalogs
.read()
@@ -67,7 +72,12 @@ impl CatalogManager for MemoryCatalogManager {
.collect())
}
async fn table_names(&self, catalog: &str, schema: &str) -> Result<Vec<String>> {
async fn table_names(
&self,
catalog: &str,
schema: &str,
_query_ctx: Option<&QueryContext>,
) -> Result<Vec<String>> {
Ok(self
.catalogs
.read()
@@ -87,11 +97,22 @@ impl CatalogManager for MemoryCatalogManager {
self.catalog_exist_sync(catalog)
}
async fn schema_exists(&self, catalog: &str, schema: &str) -> Result<bool> {
async fn schema_exists(
&self,
catalog: &str,
schema: &str,
_query_ctx: Option<&QueryContext>,
) -> Result<bool> {
self.schema_exist_sync(catalog, schema)
}
async fn table_exists(&self, catalog: &str, schema: &str, table: &str) -> Result<bool> {
async fn table_exists(
&self,
catalog: &str,
schema: &str,
table: &str,
_query_ctx: Option<&QueryContext>,
) -> Result<bool> {
let catalogs = self.catalogs.read().unwrap();
Ok(catalogs
.get(catalog)
@@ -108,6 +129,7 @@ impl CatalogManager for MemoryCatalogManager {
catalog: &str,
schema: &str,
table_name: &str,
_query_ctx: Option<&QueryContext>,
) -> Result<Option<TableRef>> {
let result = try {
self.catalogs
@@ -121,7 +143,12 @@ impl CatalogManager for MemoryCatalogManager {
Ok(result)
}
fn tables<'a>(&'a self, catalog: &'a str, schema: &'a str) -> BoxStream<'a, Result<TableRef>> {
fn tables<'a>(
&'a self,
catalog: &'a str,
schema: &'a str,
_query_ctx: Option<&QueryContext>,
) -> BoxStream<'a, Result<TableRef>> {
let catalogs = self.catalogs.read().unwrap();
let Some(schemas) = catalogs.get(catalog) else {
@@ -371,11 +398,12 @@ mod tests {
DEFAULT_CATALOG_NAME,
DEFAULT_SCHEMA_NAME,
NUMBERS_TABLE_NAME,
None,
)
.await
.unwrap()
.unwrap();
let stream = catalog_list.tables(DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME);
let stream = catalog_list.tables(DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME, None);
let tables = stream.try_collect::<Vec<_>>().await.unwrap();
assert_eq!(tables.len(), 1);
assert_eq!(
@@ -384,7 +412,12 @@ mod tests {
);
assert!(catalog_list
.table(DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME, "not_exists")
.table(
DEFAULT_CATALOG_NAME,
DEFAULT_SCHEMA_NAME,
"not_exists",
None
)
.await
.unwrap()
.is_none());
@@ -411,7 +444,7 @@ mod tests {
};
catalog.register_table_sync(register_table_req).unwrap();
assert!(catalog
.table(DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME, table_name)
.table(DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME, table_name, None)
.await
.unwrap()
.is_some());
@@ -423,7 +456,7 @@ mod tests {
};
catalog.deregister_table_sync(deregister_table_req).unwrap();
assert!(catalog
.table(DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME, table_name)
.table(DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME, table_name, None)
.await
.unwrap()
.is_none());

View File

@@ -257,8 +257,8 @@ impl InformationSchemaColumnsBuilder {
.context(UpgradeWeakCatalogManagerRefSnafu)?;
let predicates = Predicates::from_scan_request(&request);
for schema_name in catalog_manager.schema_names(&catalog_name).await? {
let mut stream = catalog_manager.tables(&catalog_name, &schema_name);
for schema_name in catalog_manager.schema_names(&catalog_name, None).await? {
let mut stream = catalog_manager.tables(&catalog_name, &schema_name, None);
while let Some(table) = stream.try_next().await? {
let keys = &table.table_info().meta.primary_key_indices;

View File

@@ -212,8 +212,8 @@ impl InformationSchemaKeyColumnUsageBuilder {
.context(UpgradeWeakCatalogManagerRefSnafu)?;
let predicates = Predicates::from_scan_request(&request);
for schema_name in catalog_manager.schema_names(&catalog_name).await? {
let mut stream = catalog_manager.tables(&catalog_name, &schema_name);
for schema_name in catalog_manager.schema_names(&catalog_name, None).await? {
let mut stream = catalog_manager.tables(&catalog_name, &schema_name, None);
while let Some(table) = stream.try_next().await? {
let mut primary_constraints = vec![];

View File

@@ -240,9 +240,9 @@ impl InformationSchemaPartitionsBuilder {
let predicates = Predicates::from_scan_request(&request);
for schema_name in catalog_manager.schema_names(&catalog_name).await? {
for schema_name in catalog_manager.schema_names(&catalog_name, None).await? {
let table_info_stream = catalog_manager
.tables(&catalog_name, &schema_name)
.tables(&catalog_name, &schema_name, None)
.try_filter_map(|t| async move {
let table_info = t.table_info();
if table_info.table_type == TableType::Temporary {

View File

@@ -176,9 +176,9 @@ impl InformationSchemaRegionPeersBuilder {
let predicates = Predicates::from_scan_request(&request);
for schema_name in catalog_manager.schema_names(&catalog_name).await? {
for schema_name in catalog_manager.schema_names(&catalog_name, None).await? {
let table_id_stream = catalog_manager
.tables(&catalog_name, &schema_name)
.tables(&catalog_name, &schema_name, None)
.try_filter_map(|t| async move {
let table_info = t.table_info();
if table_info.table_type == TableType::Temporary {

View File

@@ -171,7 +171,7 @@ impl InformationSchemaSchemataBuilder {
let table_metadata_manager = utils::table_meta_manager(&self.catalog_manager)?;
let predicates = Predicates::from_scan_request(&request);
for schema_name in catalog_manager.schema_names(&catalog_name).await? {
for schema_name in catalog_manager.schema_names(&catalog_name, None).await? {
let opts = if let Some(table_metadata_manager) = &table_metadata_manager {
table_metadata_manager
.schema_manager()

View File

@@ -176,8 +176,8 @@ impl InformationSchemaTableConstraintsBuilder {
.context(UpgradeWeakCatalogManagerRefSnafu)?;
let predicates = Predicates::from_scan_request(&request);
for schema_name in catalog_manager.schema_names(&catalog_name).await? {
let mut stream = catalog_manager.tables(&catalog_name, &schema_name);
for schema_name in catalog_manager.schema_names(&catalog_name, None).await? {
let mut stream = catalog_manager.tables(&catalog_name, &schema_name, None);
while let Some(table) = stream.try_next().await? {
let keys = &table.table_info().meta.primary_key_indices;

View File

@@ -234,8 +234,8 @@ impl InformationSchemaTablesBuilder {
.context(UpgradeWeakCatalogManagerRefSnafu)?;
let predicates = Predicates::from_scan_request(&request);
for schema_name in catalog_manager.schema_names(&catalog_name).await? {
let mut stream = catalog_manager.tables(&catalog_name, &schema_name);
for schema_name in catalog_manager.schema_names(&catalog_name, None).await? {
let mut stream = catalog_manager.tables(&catalog_name, &schema_name, None);
while let Some(table) = stream.try_next().await? {
let table_info = table.table_info();

View File

@@ -192,8 +192,8 @@ impl InformationSchemaViewsBuilder {
.context(CastManagerSnafu)?
.view_info_cache()?;
for schema_name in catalog_manager.schema_names(&catalog_name).await? {
let mut stream = catalog_manager.tables(&catalog_name, &schema_name);
for schema_name in catalog_manager.schema_names(&catalog_name, None).await? {
let mut stream = catalog_manager.tables(&catalog_name, &schema_name, None);
while let Some(table) = stream.try_next().await? {
let table_info = table.table_info();

View File

@@ -18,15 +18,16 @@ mod pg_namespace;
mod table_names;
use std::collections::HashMap;
use std::sync::{Arc, Weak};
use std::sync::{Arc, LazyLock, Weak};
use common_catalog::consts::{self, PG_CATALOG_NAME};
use common_catalog::consts::{self, DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME, PG_CATALOG_NAME};
use datatypes::schema::ColumnSchema;
use lazy_static::lazy_static;
use paste::paste;
use pg_catalog_memory_table::get_schema_columns;
use pg_class::PGClass;
use pg_namespace::PGNamespace;
use session::context::{Channel, QueryContext};
use table::TableRef;
pub use table_names::*;
@@ -142,3 +143,12 @@ impl SystemSchemaProviderInner for PGCatalogProvider {
&self.catalog_name
}
}
/// Provide query context to call the [`CatalogManager`]'s method.
static PG_QUERY_CTX: LazyLock<QueryContext> = LazyLock::new(|| {
QueryContext::with_channel(DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME, Channel::Postgres)
});
fn query_ctx() -> Option<&'static QueryContext> {
Some(&PG_QUERY_CTX)
}
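PG_QUERY_CTX pins the channel to Postgres so the pg_catalog tables can call back into the CatalogManager and still see their own schema. The same std::sync::LazyLock pattern in isolation, with placeholder fields instead of the real QueryContext:

use std::sync::LazyLock;

#[derive(Debug)]
struct QueryContext {
    catalog: &'static str,
    schema: &'static str,
    channel: &'static str,
}

// Built once on first use, then shared for the lifetime of the process.
static PG_QUERY_CTX: LazyLock<QueryContext> = LazyLock::new(|| QueryContext {
    catalog: "greptime",
    schema: "public",
    channel: "postgres",
});

fn query_ctx() -> Option<&'static QueryContext> {
    Some(&PG_QUERY_CTX)
}

fn main() {
    println!("{:?}", query_ctx().unwrap());
}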

View File

@@ -32,7 +32,7 @@ use store_api::storage::ScanRequest;
use table::metadata::TableType;
use super::pg_namespace::oid_map::PGNamespaceOidMapRef;
use super::{OID_COLUMN_NAME, PG_CLASS};
use super::{query_ctx, OID_COLUMN_NAME, PG_CLASS};
use crate::error::{
CreateRecordBatchSnafu, InternalSnafu, Result, UpgradeWeakCatalogManagerRefSnafu,
};
@@ -202,8 +202,11 @@ impl PGClassBuilder {
.upgrade()
.context(UpgradeWeakCatalogManagerRefSnafu)?;
let predicates = Predicates::from_scan_request(&request);
for schema_name in catalog_manager.schema_names(&catalog_name).await? {
let mut stream = catalog_manager.tables(&catalog_name, &schema_name);
for schema_name in catalog_manager
.schema_names(&catalog_name, query_ctx())
.await?
{
let mut stream = catalog_manager.tables(&catalog_name, &schema_name, query_ctx());
while let Some(table) = stream.try_next().await? {
let table_info = table.table_info();
self.add_class(

View File

@@ -31,7 +31,7 @@ use datatypes::vectors::{StringVectorBuilder, UInt32VectorBuilder, VectorRef};
use snafu::{OptionExt, ResultExt};
use store_api::storage::ScanRequest;
use super::{PGNamespaceOidMapRef, OID_COLUMN_NAME, PG_NAMESPACE};
use super::{query_ctx, PGNamespaceOidMapRef, OID_COLUMN_NAME, PG_NAMESPACE};
use crate::error::{
CreateRecordBatchSnafu, InternalSnafu, Result, UpgradeWeakCatalogManagerRefSnafu,
};
@@ -180,7 +180,10 @@ impl PGNamespaceBuilder {
.upgrade()
.context(UpgradeWeakCatalogManagerRefSnafu)?;
let predicates = Predicates::from_scan_request(&request);
for schema_name in catalog_manager.schema_names(&catalog_name).await? {
for schema_name in catalog_manager
.schema_names(&catalog_name, query_ctx())
.await?
{
self.add_namespace(&predicates, &schema_name);
}
self.finish()

View File

@@ -23,7 +23,7 @@ use datafusion::datasource::view::ViewTable;
use datafusion::datasource::{provider_as_source, TableProvider};
use datafusion::logical_expr::TableSource;
use itertools::Itertools;
use session::context::QueryContext;
use session::context::QueryContextRef;
use snafu::{ensure, OptionExt, ResultExt};
use table::metadata::TableType;
use table::table::adapter::DfTableProviderAdapter;
@@ -45,6 +45,7 @@ pub struct DfTableSourceProvider {
disallow_cross_catalog_query: bool,
default_catalog: String,
default_schema: String,
query_ctx: QueryContextRef,
plan_decoder: SubstraitPlanDecoderRef,
enable_ident_normalization: bool,
}
@@ -53,7 +54,7 @@ impl DfTableSourceProvider {
pub fn new(
catalog_manager: CatalogManagerRef,
disallow_cross_catalog_query: bool,
query_ctx: &QueryContext,
query_ctx: QueryContextRef,
plan_decoder: SubstraitPlanDecoderRef,
enable_ident_normalization: bool,
) -> Self {
@@ -63,6 +64,7 @@ impl DfTableSourceProvider {
resolved_tables: HashMap::new(),
default_catalog: query_ctx.current_catalog().to_owned(),
default_schema: query_ctx.current_schema(),
query_ctx,
plan_decoder,
enable_ident_normalization,
}
@@ -71,8 +73,7 @@ impl DfTableSourceProvider {
pub fn resolve_table_ref(&self, table_ref: TableReference) -> Result<ResolvedTableReference> {
if self.disallow_cross_catalog_query {
match &table_ref {
TableReference::Bare { .. } => (),
TableReference::Partial { .. } => {}
TableReference::Bare { .. } | TableReference::Partial { .. } => {}
TableReference::Full {
catalog, schema, ..
} => {
@@ -107,7 +108,7 @@ impl DfTableSourceProvider {
let table = self
.catalog_manager
.table(catalog_name, schema_name, table_name)
.table(catalog_name, schema_name, table_name, Some(&self.query_ctx))
.await?
.with_context(|| TableNotExistSnafu {
table: format_full_table_name(catalog_name, schema_name, table_name),
@@ -210,12 +211,12 @@ mod tests {
#[test]
fn test_validate_table_ref() {
let query_ctx = &QueryContext::with("greptime", "public");
let query_ctx = Arc::new(QueryContext::with("greptime", "public"));
let table_provider = DfTableSourceProvider::new(
MemoryCatalogManager::with_default_setup(),
true,
query_ctx,
query_ctx.clone(),
DummyDecoder::arc(),
true,
);
@@ -308,7 +309,7 @@ mod tests {
#[tokio::test]
async fn test_resolve_view() {
let query_ctx = &QueryContext::with("greptime", "public");
let query_ctx = Arc::new(QueryContext::with("greptime", "public"));
let backend = Arc::new(MemoryKvBackend::default());
let layered_cache_builder = LayeredCacheRegistryBuilder::default()
.add_cache_registry(CacheRegistryBuilder::default().build());
@@ -344,8 +345,13 @@ mod tests {
.await
.unwrap();
let mut table_provider =
DfTableSourceProvider::new(catalog_manager, true, query_ctx, MockDecoder::arc(), true);
let mut table_provider = DfTableSourceProvider::new(
catalog_manager,
true,
query_ctx.clone(),
MockDecoder::arc(),
true,
);
// View not found
let table_ref = TableReference::bare("not_exists_view");
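DfTableSourceProvider now stores the context and hands it to every table() lookup, which is why the constructor takes an owned QueryContextRef (an Arc) instead of a borrowed &QueryContext. A reduced sketch of that ownership change, with illustrative types:

use std::sync::Arc;

struct QueryContext {
    catalog: String,
    schema: String,
}

type QueryContextRef = Arc<QueryContext>;

struct TableSourceProvider {
    default_schema: String,
    // Owned handle: the provider keeps the context and passes it to every later
    // table lookup, so a borrowed &QueryContext would tie the provider to the
    // borrow's lifetime.
    query_ctx: QueryContextRef,
}

impl TableSourceProvider {
    fn new(query_ctx: QueryContextRef) -> Self {
        Self {
            default_schema: query_ctx.schema.clone(),
            query_ctx,
        }
    }

    // Mirrors `.table(catalog, schema, table, Some(&self.query_ctx))` above.
    fn resolve(&self, table: &str) -> String {
        format!("{}.{}.{}", self.query_ctx.catalog, self.default_schema, table)
    }
}

fn main() {
    let ctx: QueryContextRef = Arc::new(QueryContext {
        catalog: "greptime".into(),
        schema: "public".into(),
    });
    let provider = TableSourceProvider::new(ctx.clone());
    assert_eq!(provider.resolve("numbers"), "greptime.public.numbers");
}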

View File

@@ -112,7 +112,7 @@ impl SchemaProvider for DummySchemaProvider {
async fn table(&self, name: &str) -> datafusion::error::Result<Option<Arc<dyn TableProvider>>> {
let table = self
.catalog_manager
.table(&self.catalog_name, &self.schema_name, name)
.table(&self.catalog_name, &self.schema_name, name, None)
.await?
.with_context(|| TableNotExistSnafu {
table: format_full_table_name(&self.catalog_name, &self.schema_name, name),

View File

@@ -131,7 +131,7 @@ impl AlterLogicalTablesProcedure {
let phy_raw_schemas = future::join_all(alter_region_tasks)
.await
.into_iter()
.map(|res| res.map(|mut res| res.extension.remove(ALTER_PHYSICAL_EXTENSION_KEY)))
.map(|res| res.map(|mut res| res.extensions.remove(ALTER_PHYSICAL_EXTENSION_KEY)))
.collect::<Result<Vec<_>>>()?;
if phy_raw_schemas.is_empty() {

View File

@@ -157,7 +157,7 @@ impl CreateLogicalTablesProcedure {
let phy_raw_schemas = join_all(create_region_tasks)
.await
.into_iter()
.map(|res| res.map(|mut res| res.extension.remove(ALTER_PHYSICAL_EXTENSION_KEY)))
.map(|res| res.map(|mut res| res.extensions.remove(ALTER_PHYSICAL_EXTENSION_KEY)))
.collect::<Result<Vec<_>>>()?;
if phy_raw_schemas.is_empty() {

View File

@@ -324,10 +324,12 @@ impl HeartbeatTask {
region_id: stat.region_id.as_u64(),
engine: stat.engine,
role: RegionRole::from(stat.role).into(),
// TODO(jeremy): w/rcus
// TODO(weny): w/rcus
rcus: 0,
wcus: 0,
approximate_bytes: region_server.region_disk_usage(stat.region_id).unwrap_or(0),
// TODO(weny): add extensions
extensions: Default::default(),
})
.collect()
}

View File

@@ -366,10 +366,10 @@ impl RegionServerHandler for RegionServer {
// merge results by sum up affected rows and merge extensions.
let mut affected_rows = 0;
let mut extension = HashMap::new();
let mut extensions = HashMap::new();
for result in results {
affected_rows += result.affected_rows;
extension.extend(result.extension);
extensions.extend(result.extensions);
}
Ok(RegionResponseV1 {
@@ -380,7 +380,7 @@ impl RegionServerHandler for RegionServer {
}),
}),
affected_rows: affected_rows as _,
extension,
extensions,
})
}
}
@@ -708,7 +708,7 @@ impl RegionServerInner {
.await?;
Ok(RegionResponse {
affected_rows: result.affected_rows,
extension: result.extension,
extensions: result.extensions,
})
}
Err(err) => {

View File

@@ -15,6 +15,7 @@ workspace = true
arrow.workspace = true
arrow-array.workspace = true
arrow-schema.workspace = true
base64.workspace = true
common-base.workspace = true
common-decimal.workspace = true
common-error.workspace = true
@@ -23,6 +24,7 @@ common-telemetry.workspace = true
common-time.workspace = true
datafusion-common.workspace = true
enum_dispatch = "0.3"
greptime-proto.workspace = true
num = "0.4"
num-traits = "0.2"
ordered-float = { version = "3.0", features = ["serde"] }

View File

@@ -18,6 +18,8 @@ use std::sync::Arc;
use arrow::datatypes::{DataType as ArrowDataType, Field};
use arrow_array::{Array, ListArray};
use base64::engine::general_purpose::URL_SAFE;
use base64::Engine as _;
use common_base::bytes::{Bytes, StringBytes};
use common_decimal::Decimal128;
use common_telemetry::error;
@@ -28,8 +30,10 @@ use common_time::time::Time;
use common_time::timestamp::{TimeUnit, Timestamp};
use common_time::{Duration, Interval, Timezone};
use datafusion_common::ScalarValue;
use greptime_proto::v1::value::ValueData;
pub use ordered_float::OrderedFloat;
use serde::{Deserialize, Serialize, Serializer};
use serde_json::{Number, Value as JsonValue};
use snafu::{ensure, ResultExt};
use crate::error::{self, ConvertArrowArrayToScalarsSnafu, Error, Result, TryFromValueSnafu};
@@ -1364,15 +1368,179 @@ impl<'a> ValueRef<'a> {
}
}
pub fn column_data_to_json(data: ValueData) -> JsonValue {
match data {
ValueData::BinaryValue(b) => JsonValue::String(URL_SAFE.encode(b)),
ValueData::BoolValue(b) => JsonValue::Bool(b),
ValueData::U8Value(i) => JsonValue::Number(i.into()),
ValueData::U16Value(i) => JsonValue::Number(i.into()),
ValueData::U32Value(i) => JsonValue::Number(i.into()),
ValueData::U64Value(i) => JsonValue::Number(i.into()),
ValueData::I8Value(i) => JsonValue::Number(i.into()),
ValueData::I16Value(i) => JsonValue::Number(i.into()),
ValueData::I32Value(i) => JsonValue::Number(i.into()),
ValueData::I64Value(i) => JsonValue::Number(i.into()),
ValueData::F32Value(f) => Number::from_f64(f as f64)
.map(JsonValue::Number)
.unwrap_or(JsonValue::Null),
ValueData::F64Value(f) => Number::from_f64(f)
.map(JsonValue::Number)
.unwrap_or(JsonValue::Null),
ValueData::StringValue(s) => JsonValue::String(s),
ValueData::DateValue(d) => JsonValue::String(Date::from(d).to_string()),
ValueData::DatetimeValue(d) => JsonValue::String(DateTime::from(d).to_string()),
ValueData::TimeSecondValue(d) => JsonValue::String(Time::new_second(d).to_iso8601_string()),
ValueData::TimeMillisecondValue(d) => {
JsonValue::String(Time::new_millisecond(d).to_iso8601_string())
}
ValueData::TimeMicrosecondValue(d) => {
JsonValue::String(Time::new_microsecond(d).to_iso8601_string())
}
ValueData::TimeNanosecondValue(d) => {
JsonValue::String(Time::new_nanosecond(d).to_iso8601_string())
}
ValueData::TimestampMicrosecondValue(d) => {
JsonValue::String(Timestamp::new_microsecond(d).to_iso8601_string())
}
ValueData::TimestampMillisecondValue(d) => {
JsonValue::String(Timestamp::new_millisecond(d).to_iso8601_string())
}
ValueData::TimestampNanosecondValue(d) => {
JsonValue::String(Timestamp::new_nanosecond(d).to_iso8601_string())
}
ValueData::TimestampSecondValue(d) => {
JsonValue::String(Timestamp::new_second(d).to_iso8601_string())
}
ValueData::IntervalYearMonthValue(d) => JsonValue::String(format!("interval year [{}]", d)),
ValueData::IntervalMonthDayNanoValue(d) => JsonValue::String(format!(
"interval month [{}][{}][{}]",
d.months, d.days, d.nanoseconds
)),
ValueData::IntervalDayTimeValue(d) => JsonValue::String(format!("interval day [{}]", d)),
ValueData::Decimal128Value(d) => {
JsonValue::String(format!("decimal128 [{}][{}]", d.hi, d.lo))
}
}
}
#[cfg(test)]
mod tests {
use arrow::datatypes::DataType as ArrowDataType;
use common_time::timezone::set_default_timezone;
use greptime_proto::v1::{Decimal128 as ProtoDecimal128, IntervalMonthDayNano};
use num_traits::Float;
use super::*;
use crate::vectors::ListVectorBuilder;
#[test]
fn test_column_data_to_json() {
assert_eq!(
column_data_to_json(ValueData::BinaryValue(b"hello".to_vec())),
JsonValue::String("aGVsbG8=".to_string())
);
assert_eq!(
column_data_to_json(ValueData::BoolValue(true)),
JsonValue::Bool(true)
);
assert_eq!(
column_data_to_json(ValueData::U8Value(1)),
JsonValue::Number(1.into())
);
assert_eq!(
column_data_to_json(ValueData::U16Value(2)),
JsonValue::Number(2.into())
);
assert_eq!(
column_data_to_json(ValueData::U32Value(3)),
JsonValue::Number(3.into())
);
assert_eq!(
column_data_to_json(ValueData::U64Value(4)),
JsonValue::Number(4.into())
);
assert_eq!(
column_data_to_json(ValueData::I8Value(5)),
JsonValue::Number(5.into())
);
assert_eq!(
column_data_to_json(ValueData::I16Value(6)),
JsonValue::Number(6.into())
);
assert_eq!(
column_data_to_json(ValueData::I32Value(7)),
JsonValue::Number(7.into())
);
assert_eq!(
column_data_to_json(ValueData::I64Value(8)),
JsonValue::Number(8.into())
);
assert_eq!(
column_data_to_json(ValueData::F32Value(9.0)),
JsonValue::Number(Number::from_f64(9.0_f64).unwrap())
);
assert_eq!(
column_data_to_json(ValueData::F64Value(10.0)),
JsonValue::Number(Number::from_f64(10.0_f64).unwrap())
);
assert_eq!(
column_data_to_json(ValueData::StringValue("hello".to_string())),
JsonValue::String("hello".to_string())
);
assert_eq!(
column_data_to_json(ValueData::DateValue(123)),
JsonValue::String("1970-05-04".to_string())
);
assert_eq!(
column_data_to_json(ValueData::DatetimeValue(456)),
JsonValue::String("1970-01-01 00:00:00.456+0000".to_string())
);
assert_eq!(
column_data_to_json(ValueData::TimeSecondValue(789)),
JsonValue::String("00:13:09+0000".to_string())
);
assert_eq!(
column_data_to_json(ValueData::TimeMillisecondValue(789)),
JsonValue::String("00:00:00.789+0000".to_string())
);
assert_eq!(
column_data_to_json(ValueData::TimeMicrosecondValue(789)),
JsonValue::String("00:00:00.000789+0000".to_string())
);
assert_eq!(
column_data_to_json(ValueData::TimestampMillisecondValue(1234567890)),
JsonValue::String("1970-01-15 06:56:07.890+0000".to_string())
);
assert_eq!(
column_data_to_json(ValueData::TimestampNanosecondValue(1234567890123456789)),
JsonValue::String("2009-02-13 23:31:30.123456789+0000".to_string())
);
assert_eq!(
column_data_to_json(ValueData::TimestampSecondValue(1234567890)),
JsonValue::String("2009-02-13 23:31:30+0000".to_string())
);
assert_eq!(
column_data_to_json(ValueData::IntervalYearMonthValue(12)),
JsonValue::String("interval year [12]".to_string())
);
assert_eq!(
column_data_to_json(ValueData::IntervalMonthDayNanoValue(IntervalMonthDayNano {
months: 1,
days: 2,
nanoseconds: 3,
})),
JsonValue::String("interval month [1][2][3]".to_string())
);
assert_eq!(
column_data_to_json(ValueData::IntervalDayTimeValue(4)),
JsonValue::String("interval day [4]".to_string())
);
assert_eq!(
column_data_to_json(ValueData::Decimal128Value(ProtoDecimal128 { hi: 5, lo: 6 })),
JsonValue::String("decimal128 [5][6]".to_string())
);
}
#[test]
fn test_try_from_scalar_value() {
assert_eq!(

View File

@@ -356,9 +356,10 @@ impl SqlQueryHandler for Instance {
async fn is_valid_schema(&self, catalog: &str, schema: &str) -> Result<bool> {
self.catalog_manager
.schema_exists(catalog, schema)
.schema_exists(catalog, schema, None)
.await
.context(error::CatalogSnafu)
.map(|b| b && !self.catalog_manager.is_reserved_schema_name(schema))
}
}

View File

@@ -102,7 +102,7 @@ impl Instance {
) -> Result<Output> {
let table = self
.catalog_manager
.table(catalog_name, schema_name, table_name)
.table(catalog_name, schema_name, table_name, Some(ctx))
.await
.context(CatalogSnafu)?
.with_context(|| TableNotFoundSnafu {

View File

@@ -152,7 +152,12 @@ mod python {
if let Some(table) = self
.catalog_manager
.table(&expr.catalog_name, &expr.schema_name, &expr.table_name)
.table(
&expr.catalog_name,
&expr.schema_name,
&expr.table_name,
None,
)
.await
.context(CatalogSnafu)?
{
@@ -185,6 +190,7 @@ mod python {
&table_name.catalog_name,
&table_name.schema_name,
&table_name.table_name,
None,
)
.await
.context(CatalogSnafu)?

View File

@@ -93,6 +93,7 @@ mod tests {
approximate_bytes: 0,
engine: default_engine().to_string(),
role: RegionRole::Follower,
extensions: Default::default(),
}
}
acc.stat = Some(Stat {

View File

@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
use std::collections::HashSet;
use std::collections::{HashMap, HashSet};
use api::v1::meta::HeartbeatRequest;
use common_meta::ClusterId;
@@ -57,6 +57,8 @@ pub struct RegionStat {
pub engine: String,
/// The region role.
pub role: RegionRole,
/// The extension info of this region
pub extensions: HashMap<String, Vec<u8>>,
}
impl Stat {
@@ -142,6 +144,7 @@ impl TryFrom<api::v1::meta::RegionStat> for RegionStat {
approximate_bytes: value.approximate_bytes,
engine: value.engine.to_string(),
role: RegionRole::from(value.role()),
extensions: value.extensions,
})
}
}

View File

@@ -135,6 +135,7 @@ mod test {
wcus: 0,
approximate_bytes: 0,
engine: String::new(),
extensions: Default::default(),
}
}

View File

@@ -100,7 +100,7 @@ pub mod mock {
}),
}),
affected_rows: 0,
extension: Default::default(),
extensions: Default::default(),
})
}
}

View File

@@ -199,6 +199,7 @@ mod tests {
approximate_bytes: 1,
engine: "mito2".to_string(),
role: RegionRole::Leader,
extensions: Default::default(),
}],
..Default::default()
}
@@ -215,6 +216,7 @@ mod tests {
approximate_bytes: 1,
engine: "mito2".to_string(),
role: RegionRole::Leader,
extensions: Default::default(),
}],
..Default::default()
}
@@ -231,6 +233,7 @@ mod tests {
approximate_bytes: 1,
engine: "mito2".to_string(),
role: RegionRole::Leader,
extensions: Default::default(),
}],
..Default::default()
}

View File

@@ -162,7 +162,7 @@ impl RegionEngine for MetricEngine {
result.map_err(BoxedError::new).map(|rows| RegionResponse {
affected_rows: rows,
extension: extension_return_value,
extensions: extension_return_value,
})
}

View File

@@ -709,6 +709,10 @@ impl ScanInput {
rows_in_files + rows_in_memtables
}
pub(crate) fn predicate(&self) -> Option<Predicate> {
self.predicate.clone()
}
/// Retrieves [`PartitionRange`] from memtable and files
pub(crate) fn partition_ranges(&self) -> Vec<PartitionRange> {
let mut id = 0;

View File

@@ -515,6 +515,11 @@ impl RegionScanner for SeqScan {
self.properties.partitions = ranges;
Ok(())
}
fn has_predicate(&self) -> bool {
let predicate = self.stream_ctx.input.predicate();
predicate.map(|p| !p.exprs().is_empty()).unwrap_or(false)
}
}
impl DisplayAs for SeqScan {

View File

@@ -228,6 +228,11 @@ impl RegionScanner for UnorderedScan {
Ok(stream)
}
fn has_predicate(&self) -> bool {
let predicate = self.stream_ctx.input.predicate();
predicate.map(|p| !p.exprs().is_empty()).unwrap_or(false)
}
}
impl DisplayAs for UnorderedScan {
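Both has_predicate implementations above collapse Option<Predicate> plus its expression list into a bool: no predicate, or a predicate with zero expressions, counts as false. A tiny stand-alone version of that check (Predicate here is a stand-in for the store-api type):

struct Predicate {
    exprs: Vec<String>, // stand-in for the pushed-down filter expressions
}

impl Predicate {
    fn exprs(&self) -> &[String] {
        &self.exprs
    }
}

// Same shape as the two implementations above: "no predicate" and "a predicate
// with zero expressions" both count as false.
fn has_predicate(predicate: Option<&Predicate>) -> bool {
    predicate.map(|p| !p.exprs().is_empty()).unwrap_or(false)
}

fn main() {
    assert!(!has_predicate(None));
    assert!(!has_predicate(Some(&Predicate { exprs: vec![] })));
    assert!(has_predicate(Some(&Predicate {
        exprs: vec!["host = 'a'".to_string()],
    })));
}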

View File

@@ -232,7 +232,7 @@ impl Deleter {
async fn get_table(&self, catalog: &str, schema: &str, table: &str) -> Result<TableRef> {
self.catalog_manager
.table(catalog, schema, table)
.table(catalog, schema, table, None)
.await
.context(CatalogSnafu)?
.with_context(|| TableNotFoundSnafu {

View File

@@ -119,7 +119,7 @@ impl FlowServiceOperator {
if let Some(prev) = &mut final_result {
prev.affected_rows = res.affected_rows;
prev.affected_flows.extend(res.affected_flows);
prev.extension.extend(res.extension);
prev.extensions.extend(res.extensions);
} else {
final_result = Some(res);
}

View File

@@ -608,7 +608,7 @@ impl Inserter {
table: &str,
) -> Result<Option<TableRef>> {
self.catalog_manager
.table(catalog, schema, table)
.table(catalog, schema, table, None)
.await
.context(CatalogSnafu)
}

View File

@@ -64,7 +64,7 @@ impl<'a> RowToRegion<'a> {
let catalog_name = self.ctx.current_catalog();
let schema_name = self.ctx.current_schema();
self.catalog_manager
.table(catalog_name, &schema_name, table_name)
.table(catalog_name, &schema_name, table_name, None)
.await
.context(CatalogSnafu)?
.with_context(|| TableNotFoundSnafu {

View File

@@ -139,7 +139,7 @@ impl<'a> StatementToRegion<'a> {
async fn get_table(&self, catalog: &str, schema: &str, table: &str) -> Result<TableRef> {
self.catalog_manager
.table(catalog, schema, table)
.table(catalog, schema, table, None)
.await
.context(CatalogSnafu)?
.with_context(|| TableNotFoundSnafu {

View File

@@ -219,7 +219,7 @@ impl Requester {
) -> Result<Vec<PartitionInfo>> {
let table = self
.catalog_manager
.table(catalog, schema, table_name)
.table(catalog, schema, table_name, None)
.await
.context(CatalogSnafu)?;

View File

@@ -286,7 +286,7 @@ impl StatementExecutor {
let table_ref = self
.catalog_manager
.table(&catalog, &schema, &table)
.table(&catalog, &schema, &table, Some(&query_ctx))
.await
.context(CatalogSnafu)?
.context(TableNotFoundSnafu { table_name: &table })?;
@@ -313,7 +313,7 @@ impl StatementExecutor {
let catalog = query_ctx.current_catalog();
ensure!(
self.catalog_manager
.schema_exists(catalog, db.as_ref())
.schema_exists(catalog, db.as_ref(), Some(&query_ctx))
.await
.context(CatalogSnafu)?,
SchemaNotFoundSnafu { schema_info: &db }
@@ -382,7 +382,7 @@ impl StatementExecutor {
table,
} = table_ref;
self.catalog_manager
.table(catalog, schema, table)
.table(catalog, schema, table, None)
.await
.context(CatalogSnafu)?
.with_context(|| TableNotFoundSnafu {

View File

@@ -57,7 +57,7 @@ impl StatementExecutor {
);
let table_names = self
.catalog_manager
.table_names(&req.catalog_name, &req.schema_name)
.table_names(&req.catalog_name, &req.schema_name, Some(&ctx))
.await
.context(CatalogSnafu)?;

View File

@@ -106,7 +106,7 @@ impl StatementExecutor {
.context(error::ExternalSnafu)?;
let table_ref = self
.catalog_manager
.table(&catalog, &schema, &table)
.table(&catalog, &schema, &table, Some(&ctx))
.await
.context(CatalogSnafu)?
.context(TableNotFoundSnafu { table_name: &table })?;
@@ -207,6 +207,7 @@ impl StatementExecutor {
&create_table.catalog_name,
&create_table.schema_name,
&create_table.table_name,
Some(&query_ctx),
)
.await
.context(CatalogSnafu)?
@@ -487,7 +488,12 @@ impl StatementExecutor {
// if view or table exists.
if let Some(table) = self
.catalog_manager
.table(&expr.catalog_name, &expr.schema_name, &expr.view_name)
.table(
&expr.catalog_name,
&expr.schema_name,
&expr.view_name,
Some(&ctx),
)
.await
.context(CatalogSnafu)?
{
@@ -656,7 +662,7 @@ impl StatementExecutor {
) -> Result<Output> {
let view_info = if let Some(view) = self
.catalog_manager
.table(&catalog, &schema, &view)
.table(&catalog, &schema, &view, None)
.await
.context(CatalogSnafu)?
{
@@ -766,6 +772,7 @@ impl StatementExecutor {
&table_name.catalog_name,
&table_name.schema_name,
&table_name.table_name,
Some(&query_context),
)
.await
.context(CatalogSnafu)?
@@ -816,7 +823,7 @@ impl StatementExecutor {
if self
.catalog_manager
.schema_exists(&catalog, &schema)
.schema_exists(&catalog, &schema, None)
.await
.context(CatalogSnafu)?
{
@@ -858,6 +865,7 @@ impl StatementExecutor {
&table_name.catalog_name,
&table_name.schema_name,
&table_name.table_name,
Some(&query_context),
)
.await
.context(CatalogSnafu)?
@@ -944,7 +952,12 @@ impl StatementExecutor {
let table = self
.catalog_manager
.table(&catalog_name, &schema_name, &table_name)
.table(
&catalog_name,
&schema_name,
&table_name,
Some(&query_context),
)
.await
.context(CatalogSnafu)?
.with_context(|| TableNotFoundSnafu {
@@ -1167,9 +1180,10 @@ impl StatementExecutor {
if !self
.catalog_manager
.schema_exists(catalog, database)
.schema_exists(catalog, database, None)
.await
.context(CatalogSnafu)?
&& !self.catalog_manager.is_reserved_schema_name(database)
{
self.create_database_procedure(
catalog.to_string(),

View File

@@ -39,7 +39,7 @@ impl StatementExecutor {
let table = self
.catalog_manager
.table(&catalog, &schema, &table)
.table(&catalog, &schema, &table, Some(&query_ctx))
.await
.context(CatalogSnafu)?
.with_context(|| TableNotFoundSnafu {

View File

@@ -143,7 +143,7 @@ impl StatementExecutor {
let table_ref = self
.catalog_manager
.table(&catalog, &schema, &view)
.table(&catalog, &schema, &view, Some(&query_ctx))
.await
.context(CatalogSnafu)?
.context(ViewNotFoundSnafu { view_name: &view })?;

View File

@@ -13,27 +13,13 @@
// limitations under the License.
use criterion::{black_box, criterion_group, criterion_main, Criterion};
use pipeline::{parse, Array, Content, GreptimeTransformer, Pipeline, Value as PipelineValue};
use pipeline::{parse, Content, GreptimeTransformer, Pipeline};
use serde_json::{Deserializer, Value};
fn processor_map(
pipeline: &Pipeline<GreptimeTransformer>,
input_values: Vec<Value>,
) -> impl IntoIterator<Item = greptime_proto::v1::Rows> {
let pipeline_data = input_values
.into_iter()
.map(|v| PipelineValue::try_from(v).unwrap())
.collect::<Vec<_>>();
pipeline.exec(PipelineValue::Array(Array {
values: pipeline_data,
}))
}
fn processor_mut(
pipeline: &Pipeline<GreptimeTransformer>,
input_values: Vec<Value>,
) -> impl IntoIterator<Item = Vec<greptime_proto::v1::Row>> {
) -> Result<Vec<greptime_proto::v1::Row>, String> {
let mut payload = pipeline.init_intermediate_state();
let mut result = Vec::with_capacity(input_values.len());
@@ -249,11 +235,10 @@ fn criterion_benchmark(c: &mut Criterion) {
let pipeline = prepare_pipeline();
let mut group = c.benchmark_group("pipeline");
group.sample_size(50);
group.bench_function("processor map", |b| {
b.iter(|| processor_map(black_box(&pipeline), black_box(input_value.clone())))
});
group.bench_function("processor mut", |b| {
b.iter(|| processor_mut(black_box(&pipeline), black_box(input_value.clone())))
b.iter(|| {
processor_mut(black_box(&pipeline), black_box(input_value.clone())).unwrap();
})
});
group.finish();
}

View File

@@ -19,92 +19,24 @@ pub mod processor;
pub mod transform;
pub mod value;
use ahash::{HashMap, HashSet};
use common_telemetry::{debug, warn};
use ahash::HashSet;
use common_telemetry::debug;
use itertools::{merge, Itertools};
use processor::Processor;
use transform::{Transformer, Transforms};
use value::{Map, Value};
use processor::{Processor, ProcessorBuilder, Processors};
use transform::{TransformBuilders, Transformer, Transforms};
use value::Value;
use yaml_rust::YamlLoader;
const DESCRIPTION: &str = "description";
const PROCESSORS: &str = "processors";
const TRANSFORM: &str = "transform";
const TRANSFORMS: &str = "transforms";
pub enum Content {
Json(String),
Yaml(String),
}
/// set the index for the processor keys
/// the index is the position of the key in the final intermediate keys
fn set_processor_keys_index(
processors: &mut processor::Processors,
final_intermediate_keys: &Vec<String>,
) -> Result<(), String> {
let final_intermediate_key_index = final_intermediate_keys
.iter()
.enumerate()
.map(|(i, k)| (k.as_str(), i))
.collect::<HashMap<_, _>>();
for processor in processors.iter_mut() {
for field in processor.fields_mut().iter_mut() {
let index = final_intermediate_key_index.get(field.input_field.name.as_str()).ok_or(format!(
"input field {} is not found in intermediate keys: {final_intermediate_keys:?} when set processor keys index",
field.input_field.name
))?;
field.set_input_index(*index);
for (k, v) in field.output_fields_index_mapping.iter_mut() {
let index = final_intermediate_key_index.get(k.as_str());
match index {
Some(index) => {
*v = *index;
}
None => {
warn!(
"output field {k} is not found in intermediate keys: {final_intermediate_keys:?} when set processor keys index"
);
}
}
}
}
}
Ok(())
}
fn set_transform_keys_index(
transforms: &mut Transforms,
final_intermediate_keys: &[String],
output_keys: &[String],
) -> Result<(), String> {
let final_intermediate_key_index = final_intermediate_keys
.iter()
.enumerate()
.map(|(i, k)| (k.as_str(), i))
.collect::<HashMap<_, _>>();
let output_key_index = output_keys
.iter()
.enumerate()
.map(|(i, k)| (k.as_str(), i))
.collect::<HashMap<_, _>>();
for transform in transforms.iter_mut() {
for field in transform.fields.iter_mut() {
let index = final_intermediate_key_index.get(field.input_field.name.as_str()).ok_or(format!(
"input field {} is not found in intermediate keys: {final_intermediate_keys:?} when set transform keys index",
field.input_field.name
))?;
field.set_input_index(*index);
for (k, v) in field.output_fields_index_mapping.iter_mut() {
let index = output_key_index.get(k.as_str()).ok_or(format!(
"output field {k} is not found in output keys: {final_intermediate_keys:?} when set transform keys index"
))?;
*v = *index;
}
}
}
Ok(())
}
pub fn parse<T>(input: &Content) -> Result<Pipeline<T>, String>
where
T: Transformer,
@@ -117,24 +49,22 @@ where
let description = doc[DESCRIPTION].as_str().map(|s| s.to_string());
let mut processors = if let Some(v) = doc[PROCESSORS].as_vec() {
let processor_builder_list = if let Some(v) = doc[PROCESSORS].as_vec() {
v.try_into()?
} else {
processor::Processors::default()
processor::ProcessorBuilderList::default()
};
let transforms = if let Some(v) = doc[TRANSFORM].as_vec() {
v.try_into()?
} else {
Transforms::default()
};
let transform_builders =
if let Some(v) = doc[TRANSFORMS].as_vec().or(doc[TRANSFORM].as_vec()) {
v.try_into()?
} else {
TransformBuilders::default()
};
let mut transformer = T::new(transforms)?;
let transforms = transformer.transforms_mut();
let processors_output_keys = processors.output_keys();
let processors_required_keys = processors.required_keys();
let processors_required_original_keys = processors.required_original_keys();
let processors_required_keys = &processor_builder_list.input_keys;
let processors_output_keys = &processor_builder_list.output_keys;
let processors_required_original_keys = &processor_builder_list.original_input_keys;
debug!(
"processors_required_original_keys: {:?}",
@@ -143,7 +73,7 @@ where
debug!("processors_required_keys: {:?}", processors_required_keys);
debug!("processors_output_keys: {:?}", processors_output_keys);
let transforms_required_keys = transforms.required_keys();
let transforms_required_keys = &transform_builders.required_keys;
let mut tr_keys = Vec::with_capacity(50);
for key in transforms_required_keys.iter() {
if !processors_output_keys.contains(key)
@@ -183,9 +113,33 @@ where
final_intermediate_keys.extend(intermediate_keys_exclude_original);
let output_keys = transforms.output_keys().clone();
set_processor_keys_index(&mut processors, &final_intermediate_keys)?;
set_transform_keys_index(transforms, &final_intermediate_keys, &output_keys)?;
let output_keys = transform_builders.output_keys.clone();
let processors_kind_list = processor_builder_list
.processor_builders
.into_iter()
.map(|builder| builder.build(&final_intermediate_keys))
.collect::<Result<Vec<_>, _>>()?;
let processors = Processors {
processors: processors_kind_list,
required_keys: processors_required_keys.clone(),
output_keys: processors_output_keys.clone(),
required_original_keys: processors_required_original_keys.clone(),
};
let transfor_list = transform_builders
.builders
.into_iter()
.map(|builder| builder.build(&final_intermediate_keys, &output_keys))
.collect::<Result<Vec<_>, String>>()?;
let transformers = Transforms {
transforms: transfor_list,
required_keys: transforms_required_keys.clone(),
output_keys: output_keys.clone(),
};
let transformer = T::new(transformers)?;
Ok(Pipeline {
description,
@@ -238,38 +192,6 @@ impl<T> Pipeline<T>
where
T: Transformer,
{
fn exec_map(&self, map: &mut Map) -> Result<(), String> {
let v = map;
for processor in self.processors.iter() {
processor.exec_map(v)?;
}
Ok(())
}
pub fn exec(&self, mut val: Value) -> Result<T::Output, String> {
let result = match val {
Value::Map(ref mut map) => {
self.exec_map(map)?;
val
}
Value::Array(arr) => arr
.values
.into_iter()
.map(|mut v| match v {
Value::Map(ref mut map) => {
self.exec_map(map)?;
Ok(v)
}
_ => Err(format!("expected a map, but got {}", v)),
})
.collect::<Result<Vec<Value>, String>>()
.map(|values| Value::Array(value::Array { values }))?,
_ => return Err(format!("expected a map or array, but got {}", val)),
};
self.transformer.transform(result)
}
pub fn exec_mut(&self, val: &mut Vec<Value>) -> Result<T::VecOutput, String> {
for processor in self.processors.iter() {
processor.exec_mut(val)?;
@@ -347,9 +269,24 @@ where
}
}
pub(crate) fn find_key_index(
intermediate_keys: &[String],
key: &str,
kind: &str,
) -> Result<usize, String> {
intermediate_keys
.iter()
.position(|k| k == key)
.ok_or(format!(
"{} processor.{} not found in intermediate keys",
kind, key
))
}
#[cfg(test)]
mod tests {
use api::v1::Rows;
use greptime_proto::v1::value::ValueData;
use greptime_proto::v1::{self, ColumnDataType, SemanticType};
@@ -359,96 +296,43 @@ mod tests {
#[test]
fn test_pipeline_prepare() {
{
let input_value_str = r#"
{
"my_field": "1,2",
"foo": "bar"
}
"#;
let input_value: serde_json::Value = serde_json::from_str(input_value_str).unwrap();
let pipeline_yaml = r#"
---
description: Pipeline for Apache Tomcat
let input_value_str = r#"
{
"my_field": "1,2",
"foo": "bar"
}
"#;
let input_value: serde_json::Value = serde_json::from_str(input_value_str).unwrap();
let pipeline_yaml = r#"description: 'Pipeline for Apache Tomcat'
processors:
- csv:
field: my_field, my_field,field1, field2
field: my_field
target_fields: field1, field2
transform:
- field: field1
type: uint32
- field: field2
type: uint32
"#;
let pipeline: Pipeline<GreptimeTransformer> =
parse(&Content::Yaml(pipeline_yaml.into())).unwrap();
let mut payload = pipeline.init_intermediate_state();
pipeline.prepare(input_value, &mut payload).unwrap();
assert_eq!(
&["greptime_timestamp", "my_field"].to_vec(),
pipeline.required_keys()
);
assert_eq!(
payload,
vec![
Value::Null,
Value::String("1,2".to_string()),
Value::Null,
Value::Null
]
);
let result = pipeline.exec_mut(&mut payload).unwrap();
let pipeline: Pipeline<GreptimeTransformer> =
parse(&Content::Yaml(pipeline_yaml.into())).unwrap();
let mut payload = pipeline.init_intermediate_state();
pipeline.prepare(input_value, &mut payload).unwrap();
assert_eq!(&["my_field"].to_vec(), pipeline.required_keys());
assert_eq!(
payload,
vec![Value::String("1,2".to_string()), Value::Null, Value::Null]
);
let result = pipeline.exec_mut(&mut payload).unwrap();
assert_eq!(result.values[0].value_data, Some(ValueData::U32Value(1)));
assert_eq!(result.values[1].value_data, Some(ValueData::U32Value(2)));
match &result.values[2].value_data {
Some(ValueData::TimestampNanosecondValue(v)) => {
assert_ne!(*v, 0);
}
_ => panic!("expect null value"),
assert_eq!(result.values[0].value_data, Some(ValueData::U32Value(1)));
assert_eq!(result.values[1].value_data, Some(ValueData::U32Value(2)));
match &result.values[2].value_data {
Some(ValueData::TimestampNanosecondValue(v)) => {
assert_ne!(*v, 0);
}
}
{
let input_value_str = r#"
{
"reqTimeSec": "1573840000.000"
}
"#;
let pipeline_yaml = r#"
---
description: Pipeline for Demo Log
processors:
- gsub:
field: reqTimeSec
pattern: "\\."
replacement: ""
- epoch:
field: reqTimeSec
resolution: millisecond
ignore_missing: true
transform:
- field: reqTimeSec
type: epoch, millisecond
index: timestamp
"#;
let input_value: serde_json::Value = serde_json::from_str(input_value_str).unwrap();
let pipeline: Pipeline<GreptimeTransformer> =
parse(&Content::Yaml(pipeline_yaml.into())).unwrap();
let mut payload = pipeline.init_intermediate_state();
pipeline.prepare(input_value, &mut payload).unwrap();
assert_eq!(&["reqTimeSec"].to_vec(), pipeline.required_keys());
assert_eq!(payload, vec![Value::String("1573840000.000".to_string())]);
let result = pipeline.exec_mut(&mut payload).unwrap();
assert_eq!(
result.values[0].value_data,
Some(ValueData::TimestampMillisecondValue(1573840000000))
);
_ => panic!("expect null value"),
}
}
@@ -541,21 +425,19 @@ transform:
#[test]
fn test_csv_pipeline() {
let input_value_str = r#"
{
"my_field": "1,2",
"foo": "bar"
}
"#;
{
"my_field": "1,2",
"foo": "bar"
}
"#;
let input_value: serde_json::Value = serde_json::from_str(input_value_str).unwrap();
let pipeline_yaml = r#"
---
description: Pipeline for Apache Tomcat
processors:
- csv:
field: my_field,my_field, field1, field2
field: my_field
target_fields: field1, field2
transform:
- field: field1
type: uint32
@@ -565,8 +447,22 @@ transform:
let pipeline: Pipeline<GreptimeTransformer> =
parse(&Content::Yaml(pipeline_yaml.into())).unwrap();
let output = pipeline.exec(input_value.try_into().unwrap());
assert!(output.is_ok());
let mut payload = pipeline.init_intermediate_state();
pipeline.prepare(input_value, &mut payload).unwrap();
assert_eq!(&["my_field"].to_vec(), pipeline.required_keys());
assert_eq!(
payload,
vec![Value::String("1,2".to_string()), Value::Null, Value::Null]
);
let result = pipeline.exec_mut(&mut payload).unwrap();
assert_eq!(result.values[0].value_data, Some(ValueData::U32Value(1)));
assert_eq!(result.values[1].value_data, Some(ValueData::U32Value(2)));
match &result.values[2].value_data {
Some(ValueData::TimestampNanosecondValue(v)) => {
assert_ne!(*v, 0);
}
_ => panic!("expect null value"),
}
}
#[test]
@@ -596,7 +492,14 @@ transform:
let pipeline: Pipeline<GreptimeTransformer> =
parse(&Content::Yaml(pipeline_yaml.into())).unwrap();
let output = pipeline.exec(input_value.try_into().unwrap()).unwrap();
let schema = pipeline.schemas().clone();
let mut result = pipeline.init_intermediate_state();
pipeline.prepare(input_value, &mut result).unwrap();
let row = pipeline.exec_mut(&mut result).unwrap();
let output = Rows {
schema,
rows: vec![row],
};
let schemas = output.schema;
assert_eq!(schemas.len(), 1);


@@ -12,69 +12,12 @@
// See the License for the specific language governing permissions and
// limitations under the License.
use std::collections::BTreeMap;
use std::ops::Deref;
use std::str::FromStr;
use ahash::{HashSet, HashSetExt};
use itertools::Itertools;
#[derive(Debug, Default, Clone)]
pub struct Fields(Vec<Field>);
impl Fields {
pub(crate) fn new(fields: Vec<Field>) -> Result<Self, String> {
let ff = Fields(fields);
ff.check()
}
pub(crate) fn one(field: Field) -> Self {
Fields(vec![field])
}
pub(crate) fn get_target_fields(&self) -> Vec<&str> {
self.0.iter().map(|f| f.get_target_field()).collect()
}
fn check(self) -> Result<Self, String> {
if self.0.is_empty() {
return Err("fields must not be empty".to_string());
}
let mut set = HashSet::new();
for f in self.0.iter() {
if set.contains(&f.input_field.name) {
return Err(format!(
"field name must be unique, but got duplicated: {}",
f.input_field.name
));
}
set.insert(&f.input_field.name);
}
Ok(self)
}
}
impl std::fmt::Display for Fields {
fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
let s = self.0.iter().map(|f| f.to_string()).join(";");
write!(f, "{s}")
}
}
impl std::ops::Deref for Fields {
type Target = Vec<Field>;
fn deref(&self) -> &Self::Target {
&self.0
}
}
impl std::ops::DerefMut for Fields {
fn deref_mut(&mut self) -> &mut Self::Target {
&mut self.0
}
}
use crate::etl::find_key_index;
/// Information about the input field including the name and index in intermediate keys.
#[derive(Debug, Default, Clone)]
pub struct InputFieldInfo {
pub(crate) name: String,
@@ -82,132 +25,202 @@ pub struct InputFieldInfo {
}
impl InputFieldInfo {
/// Create a new input field info with the given field name and index.
pub(crate) fn new(field: impl Into<String>, index: usize) -> Self {
InputFieldInfo {
name: field.into(),
index,
}
}
}
pub(crate) fn name(field: impl Into<String>) -> Self {
InputFieldInfo {
name: field.into(),
index: 0,
/// Information about a field that has one input and one output.
#[derive(Debug, Default, Clone)]
pub struct OneInputOneOutputField {
input: InputFieldInfo,
output: Option<(String, usize)>,
}
impl OneInputOneOutputField {
/// Create a new field with the given input and output.
pub(crate) fn new(input: InputFieldInfo, output: (String, usize)) -> Self {
OneInputOneOutputField {
input,
output: Some(output),
}
}
/// Build a new field with the given processor kind, intermediate keys, input field, and target field.
pub(crate) fn build(
processor_kind: &str,
intermediate_keys: &[String],
input_field: &str,
target_field: &str,
) -> Result<Self, String> {
let input_index = find_key_index(intermediate_keys, input_field, processor_kind)?;
let input_field_info = InputFieldInfo::new(input_field, input_index);
let output_index = find_key_index(intermediate_keys, target_field, processor_kind)?;
Ok(OneInputOneOutputField::new(
input_field_info,
(target_field.to_string(), output_index),
))
}
/// Get the input field information.
pub(crate) fn input(&self) -> &InputFieldInfo {
&self.input
}
/// Get the index of the input field.
pub(crate) fn input_index(&self) -> usize {
self.input.index
}
/// Get the name of the input field.
pub(crate) fn input_name(&self) -> &str {
&self.input.name
}
/// Get the index of the output field.
pub(crate) fn output_index(&self) -> usize {
*self.output().1
}
/// Get the name of the output field.
pub(crate) fn output_name(&self) -> &str {
self.output().0
}
/// Get the output field information.
pub(crate) fn output(&self) -> (&String, &usize) {
if let Some((name, index)) = &self.output {
(name, index)
} else {
(&self.input.name, &self.input.index)
}
}
}
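// A minimal sketch of the one-input/one-output resolution, assuming hypothetical
// key names: both the input and the target field must already exist in the
// intermediate keys, and build records their indexes for direct vector access.
#[test]
fn one_input_one_output_sketch() {
    let keys = vec!["reqTimeSec".to_string(), "req_time_ts".to_string()];
    let field =
        OneInputOneOutputField::build("epoch", &keys, "reqTimeSec", "req_time_ts").unwrap();
    assert_eq!(field.input_index(), 0);
    assert_eq!(field.output_index(), 1);
    assert_eq!(field.output_name(), "req_time_ts");
}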
/// Used to represent the input and output fields of a processor or transform.
/// Information about a field that has one input and multiple outputs.
#[derive(Debug, Default, Clone)]
pub struct OneInputMultiOutputField {
input: InputFieldInfo,
    /// Typically, processors that output multiple keys distinguish them by joining a prefix with each generated key.
prefix: Option<String>,
}
impl OneInputMultiOutputField {
/// Create a new field with the given input and prefix.
pub(crate) fn new(input: InputFieldInfo, prefix: Option<String>) -> Self {
OneInputMultiOutputField { input, prefix }
}
/// Get the input field information.
pub(crate) fn input(&self) -> &InputFieldInfo {
&self.input
}
/// Get the index of the input field.
pub(crate) fn input_index(&self) -> usize {
self.input.index
}
/// Get the name of the input field.
pub(crate) fn input_name(&self) -> &str {
&self.input.name
}
/// Get the prefix for the output fields.
pub(crate) fn target_prefix(&self) -> &str {
self.prefix.as_deref().unwrap_or(&self.input.name)
}
}
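// A small sketch of the prefix fallback, with hypothetical names: when no target
// is configured, multi-output processors prefix generated keys with the input
// field name.
#[test]
fn target_prefix_sketch() {
    let plain = OneInputMultiOutputField::new(InputFieldInfo::new("cmcd", 0), None);
    assert_eq!(plain.target_prefix(), "cmcd");
    let renamed =
        OneInputMultiOutputField::new(InputFieldInfo::new("cmcd", 0), Some("prefix".to_string()));
    assert_eq!(renamed.target_prefix(), "prefix");
}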
/// Raw processor-defined inputs and outputs
#[derive(Debug, Default, Clone)]
pub struct Field {
/// The input field name and index.
pub input_field: InputFieldInfo,
/// The output field name and index mapping.
pub output_fields_index_mapping: BTreeMap<String, usize>,
// rename
pub target_field: Option<String>,
// 1-to-many mapping
// processors:
// - csv
pub target_fields: Option<Vec<String>>,
pub(crate) input_field: String,
pub(crate) target_field: Option<String>,
}
impl Field {
pub(crate) fn new(field: impl Into<String>) -> Self {
Field {
input_field: InputFieldInfo::name(field.into()),
output_fields_index_mapping: BTreeMap::new(),
target_field: None,
target_fields: None,
}
}
/// target column_name in processor or transform
/// if target_field is None, return input field name
pub(crate) fn get_target_field(&self) -> &str {
self.target_field
.as_deref()
.unwrap_or(&self.input_field.name)
}
/// input column_name in processor or transform
pub(crate) fn get_field_name(&self) -> &str {
&self.input_field.name
}
/// set input column index in processor or transform
pub(crate) fn set_input_index(&mut self, index: usize) {
self.input_field.index = index;
}
pub(crate) fn set_output_index(&mut self, key: &str, index: usize) {
if let Some(v) = self.output_fields_index_mapping.get_mut(key) {
*v = index;
}
}
pub(crate) fn insert_output_index(&mut self, key: String, index: usize) {
self.output_fields_index_mapping.insert(key, index);
}
}
impl std::str::FromStr for Field {
impl FromStr for Field {
type Err = String;
fn from_str(s: &str) -> Result<Self, Self::Err> {
let mut parts = s.split(',');
let field = parts.next().ok_or("field is missing")?.trim().to_string();
let input_field = parts
.next()
.ok_or("input field is missing")?
.trim()
.to_string();
let target_field = parts.next().map(|x| x.trim().to_string());
if field.is_empty() {
return Err("field is empty".to_string());
if input_field.is_empty() {
return Err("input field is empty".to_string());
}
let renamed_field = match parts.next() {
Some(s) if !s.trim().is_empty() => Some(s.trim().to_string()),
_ => None,
};
// TODO(qtang): ???? what's this?
// weird design? field: <field>,<target_field>,<target_fields>,<target_fields>....
// and only use in csv processor
let fields: Vec<_> = parts
.map(|s| s.trim())
.filter(|s| !s.is_empty())
.map(|s| s.to_string())
.collect();
let target_fields = if fields.is_empty() {
None
} else {
Some(fields)
};
Ok(Field {
input_field: InputFieldInfo::name(field),
output_fields_index_mapping: BTreeMap::new(),
target_field: renamed_field,
target_fields,
input_field,
target_field,
})
}
}
impl std::fmt::Display for Field {
fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
match (&self.target_field, &self.target_fields) {
(Some(target_field), None) => write!(f, "{}, {target_field}", self.input_field.name),
(None, Some(target_fields)) => {
write!(
f,
"{}, {}",
self.input_field.name,
target_fields.iter().join(",")
)
}
_ => write!(f, "{}", self.input_field.name),
impl Field {
/// Create a new field with the given input and target fields.
pub(crate) fn new(input_field: impl Into<String>, target_field: Option<String>) -> Self {
Field {
input_field: input_field.into(),
target_field,
}
}
/// Get the input field.
pub(crate) fn input_field(&self) -> &str {
&self.input_field
}
/// Get the target field.
pub(crate) fn target_field(&self) -> Option<&str> {
self.target_field.as_deref()
}
/// Get the target field or the input field if the target field is not set.
pub(crate) fn target_or_input_field(&self) -> &str {
self.target_field.as_deref().unwrap_or(&self.input_field)
}
}
/// A collection of fields.
#[derive(Debug, Default, Clone)]
pub struct Fields(Vec<Field>);
impl Fields {
pub(crate) fn new(fields: Vec<Field>) -> Self {
Fields(fields)
}
pub(crate) fn one(field: Field) -> Self {
Fields(vec![field])
}
}
impl Deref for Fields {
type Target = Vec<Field>;
fn deref(&self) -> &Self::Target {
&self.0
}
}
impl IntoIterator for Fields {
type Item = Field;
type IntoIter = std::vec::IntoIter<Field>;
fn into_iter(self) -> Self::IntoIter {
self.0.into_iter()
}
}
#[cfg(test)]
@@ -227,35 +240,14 @@ mod tests {
let cases = [
// ("field", "field", None, None),
(
"field, target_field",
"field",
Some("target_field".into()),
None,
),
(
"field, target_field1, target_field2, target_field3",
"field",
Some("target_field1".into()),
Some(vec!["target_field2".into(), "target_field3".into()]),
),
(
"field,, target_field1, target_field2, target_field3",
"field",
None,
Some(vec![
"target_field1".into(),
"target_field2".into(),
"target_field3".into(),
]),
),
("field, target_field", "field", Some("target_field")),
("field", "field", None),
];
for (s, field, target_field, target_fields) in cases.into_iter() {
for (s, field, target_field) in cases.into_iter() {
let f: Field = s.parse().unwrap();
assert_eq!(f.get_field_name(), field, "{s}");
assert_eq!(f.target_field, target_field, "{s}");
assert_eq!(f.target_fields, target_fields, "{s}");
assert_eq!(f.input_field(), field, "{s}");
assert_eq!(f.target_field(), target_field, "{s}");
}
}
}


@@ -25,22 +25,22 @@ pub mod timestamp;
pub mod urlencoding;
use ahash::{HashSet, HashSetExt};
use cmcd::CmcdProcessor;
use csv::CsvProcessor;
use date::DateProcessor;
use dissect::DissectProcessor;
use cmcd::{CmcdProcessor, CmcdProcessorBuilder};
use csv::{CsvProcessor, CsvProcessorBuilder};
use date::{DateProcessor, DateProcessorBuilder};
use dissect::{DissectProcessor, DissectProcessorBuilder};
use enum_dispatch::enum_dispatch;
use epoch::EpochProcessor;
use gsub::GsubProcessor;
use epoch::{EpochProcessor, EpochProcessorBuilder};
use gsub::{GsubProcessor, GsubProcessorBuilder};
use itertools::Itertools;
use join::JoinProcessor;
use letter::LetterProcessor;
use regex::RegexProcessor;
use timestamp::TimestampProcessor;
use urlencoding::UrlEncodingProcessor;
use join::{JoinProcessor, JoinProcessorBuilder};
use letter::{LetterProcessor, LetterProcessorBuilder};
use regex::{RegexProcessor, RegexProcessorBuilder};
use timestamp::{TimestampProcessor, TimestampProcessorBuilder};
use urlencoding::{UrlEncodingProcessor, UrlEncodingProcessorBuilder};
use crate::etl::field::{Field, Fields};
use crate::etl::value::{Map, Value};
use super::field::{Field, Fields};
use crate::etl::value::Value;
const FIELD_NAME: &str = "field";
const FIELDS_NAME: &str = "fields";
@@ -49,6 +49,7 @@ const METHOD_NAME: &str = "method";
const PATTERN_NAME: &str = "pattern";
const PATTERNS_NAME: &str = "patterns";
const SEPARATOR_NAME: &str = "separator";
const TARGET_FIELDS_NAME: &str = "target_fields";
// const IF_NAME: &str = "if";
// const IGNORE_FAILURE_NAME: &str = "ignore_failure";
@@ -62,55 +63,14 @@ const SEPARATOR_NAME: &str = "separator";
/// The output of a processor is a map of key-value pairs that will be merged into the document when you use exec_map method.
#[enum_dispatch(ProcessorKind)]
pub trait Processor: std::fmt::Debug + Send + Sync + 'static {
/// Get the processor's fields
/// fields is just the same processor for multiple keys. It is not the case that a processor has multiple inputs
fn fields(&self) -> &Fields;
/// Get the processor's fields mutably
fn fields_mut(&mut self) -> &mut Fields;
/// Get the processor's kind
fn kind(&self) -> &str;
/// Whether to ignore missing
fn ignore_missing(&self) -> bool;
/// processor all output keys
/// if a processor has multiple output keys, it should return all of them
fn output_keys(&self) -> HashSet<String>;
/// Execute the processor on a document
/// and return a map of key-value pairs
fn exec_field(&self, val: &Value, field: &Field) -> Result<Map, String>;
    /// Execute the processor on a vector which has been preprocessed by the pipeline
fn exec_mut(&self, val: &mut Vec<Value>) -> Result<(), String>;
/// Execute the processor on a map
/// and merge the output into the original map
fn exec_map(&self, map: &mut Map) -> Result<(), String> {
for ff @ Field {
input_field: field_info,
..
} in self.fields().iter()
{
match map.get(&field_info.name) {
Some(v) => {
map.extend(self.exec_field(v, ff)?);
}
None if self.ignore_missing() => {}
None => {
return Err(format!(
"{} processor: field '{}' is required but missing in {map}",
self.kind(),
field_info.name,
))
}
}
}
Ok(())
}
}
#[derive(Debug)]
@@ -129,6 +89,42 @@ pub enum ProcessorKind {
Date(DateProcessor),
}
/// ProcessorBuilder trait defines the interface for all processor builders
/// A processor builder is used to create a processor
#[enum_dispatch(ProcessorBuilders)]
pub trait ProcessorBuilder: std::fmt::Debug + Send + Sync + 'static {
/// Get the processor's output keys
fn output_keys(&self) -> HashSet<&str>;
/// Get the processor's input keys
fn input_keys(&self) -> HashSet<&str>;
/// Build the processor
fn build(self, intermediate_keys: &[String]) -> Result<ProcessorKind, String>;
}
#[derive(Debug)]
#[enum_dispatch]
pub enum ProcessorBuilders {
Cmcd(CmcdProcessorBuilder),
Csv(CsvProcessorBuilder),
Dissect(DissectProcessorBuilder),
Gsub(GsubProcessorBuilder),
Join(JoinProcessorBuilder),
Letter(LetterProcessorBuilder),
Regex(RegexProcessorBuilder),
Timestamp(TimestampProcessorBuilder),
UrlEncoding(UrlEncodingProcessorBuilder),
Epoch(EpochProcessorBuilder),
Date(DateProcessorBuilder),
}
#[derive(Debug, Default)]
pub struct ProcessorBuilderList {
pub(crate) processor_builders: Vec<ProcessorBuilders>,
pub(crate) input_keys: Vec<String>,
pub(crate) output_keys: Vec<String>,
pub(crate) original_input_keys: Vec<String>,
}
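// A hedged end-to-end sketch of the two-phase construction; the helper name
// build_processors is hypothetical, and the real pipeline derives its
// intermediate keys from processors and transforms together rather than from
// the builder list alone.
fn build_processors(pipeline_yaml: &str) -> Result<Vec<ProcessorKind>, String> {
    // Phase 1: parse YAML into builders and collect the key sets they declare.
    let docs = yaml_rust::YamlLoader::load_from_str(pipeline_yaml).map_err(|e| e.to_string())?;
    let processor_docs = docs[0]["processors"]
        .as_vec()
        .ok_or("processors must be a list".to_string())?;
    let builder_list = ProcessorBuilderList::try_from(processor_docs)?;

    // Phase 2: freeze the key order and resolve every builder against it.
    let mut intermediate_keys = builder_list.input_keys.clone();
    intermediate_keys.extend(builder_list.output_keys.clone());
    builder_list
        .processor_builders
        .into_iter()
        .map(|builder| builder.build(&intermediate_keys))
        .collect()
}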
#[derive(Debug, Default)]
pub struct Processors {
    /// An ordered list of processors
@@ -174,52 +170,63 @@ impl Processors {
}
}
impl TryFrom<&Vec<yaml_rust::Yaml>> for Processors {
impl TryFrom<&Vec<yaml_rust::Yaml>> for ProcessorBuilderList {
type Error = String;
fn try_from(vec: &Vec<yaml_rust::Yaml>) -> Result<Self, Self::Error> {
let mut processors = vec![];
let mut processors_builders = vec![];
let mut all_output_keys = HashSet::with_capacity(50);
let mut all_required_keys = HashSet::with_capacity(50);
let mut all_required_original_keys = HashSet::with_capacity(50);
for doc in vec {
let processor = parse_processor(doc)?;
// get all required keys
let processor_required_keys: Vec<String> = processor
.fields()
.iter()
.map(|f| f.input_field.name.clone())
.collect();
for key in &processor_required_keys {
if !all_output_keys.contains(key) {
all_required_original_keys.insert(key.clone());
}
}
all_required_keys.extend(processor_required_keys);
let processor_output_keys = processor.output_keys().into_iter();
all_output_keys.extend(processor_output_keys);
processors.push(processor);
processors_builders.push(processor);
}
let all_required_keys = all_required_keys.into_iter().sorted().collect();
let all_output_keys = all_output_keys.into_iter().sorted().collect();
let all_required_original_keys = all_required_original_keys.into_iter().sorted().collect();
for processor in processors_builders.iter() {
{
// get all required keys
let processor_required_keys = processor.input_keys();
Ok(Processors {
processors,
required_keys: all_required_keys,
for key in &processor_required_keys {
if !all_output_keys.contains(key) {
all_required_original_keys.insert(*key);
}
}
all_required_keys.extend(processor_required_keys);
let processor_output_keys = processor.output_keys().into_iter();
all_output_keys.extend(processor_output_keys);
}
}
let all_required_keys = all_required_keys
.into_iter()
.map(|x| x.to_string())
.sorted()
.collect();
let all_output_keys = all_output_keys
.into_iter()
.map(|x| x.to_string())
.sorted()
.collect();
let all_required_original_keys = all_required_original_keys
.into_iter()
.map(|x| x.to_string())
.sorted()
.collect();
Ok(ProcessorBuilderList {
processor_builders: processors_builders,
input_keys: all_required_keys,
output_keys: all_output_keys,
required_original_keys: all_required_original_keys,
original_input_keys: all_required_original_keys,
})
}
}
fn parse_processor(doc: &yaml_rust::Yaml) -> Result<ProcessorKind, String> {
fn parse_processor(doc: &yaml_rust::Yaml) -> Result<ProcessorBuilders, String> {
let map = doc.as_hash().ok_or("processor must be a map".to_string())?;
let key = map
@@ -238,20 +245,24 @@ fn parse_processor(doc: &yaml_rust::Yaml) -> Result<ProcessorKind, String> {
.ok_or("processor key must be a string".to_string())?;
let processor = match str_key {
cmcd::PROCESSOR_CMCD => ProcessorKind::Cmcd(CmcdProcessor::try_from(value)?),
csv::PROCESSOR_CSV => ProcessorKind::Csv(CsvProcessor::try_from(value)?),
dissect::PROCESSOR_DISSECT => ProcessorKind::Dissect(DissectProcessor::try_from(value)?),
epoch::PROCESSOR_EPOCH => ProcessorKind::Epoch(EpochProcessor::try_from(value)?),
date::PROCESSOR_DATE => ProcessorKind::Date(DateProcessor::try_from(value)?),
gsub::PROCESSOR_GSUB => ProcessorKind::Gsub(GsubProcessor::try_from(value)?),
join::PROCESSOR_JOIN => ProcessorKind::Join(JoinProcessor::try_from(value)?),
letter::PROCESSOR_LETTER => ProcessorKind::Letter(LetterProcessor::try_from(value)?),
regex::PROCESSOR_REGEX => ProcessorKind::Regex(RegexProcessor::try_from(value)?),
cmcd::PROCESSOR_CMCD => ProcessorBuilders::Cmcd(CmcdProcessorBuilder::try_from(value)?),
csv::PROCESSOR_CSV => ProcessorBuilders::Csv(CsvProcessorBuilder::try_from(value)?),
dissect::PROCESSOR_DISSECT => {
ProcessorBuilders::Dissect(DissectProcessorBuilder::try_from(value)?)
}
epoch::PROCESSOR_EPOCH => ProcessorBuilders::Epoch(EpochProcessorBuilder::try_from(value)?),
date::PROCESSOR_DATE => ProcessorBuilders::Date(DateProcessorBuilder::try_from(value)?),
gsub::PROCESSOR_GSUB => ProcessorBuilders::Gsub(GsubProcessorBuilder::try_from(value)?),
join::PROCESSOR_JOIN => ProcessorBuilders::Join(JoinProcessorBuilder::try_from(value)?),
letter::PROCESSOR_LETTER => {
ProcessorBuilders::Letter(LetterProcessorBuilder::try_from(value)?)
}
regex::PROCESSOR_REGEX => ProcessorBuilders::Regex(RegexProcessorBuilder::try_from(value)?),
timestamp::PROCESSOR_TIMESTAMP => {
ProcessorKind::Timestamp(TimestampProcessor::try_from(value)?)
ProcessorBuilders::Timestamp(TimestampProcessorBuilder::try_from(value)?)
}
urlencoding::PROCESSOR_URL_ENCODING => {
ProcessorKind::UrlEncoding(UrlEncodingProcessor::try_from(value)?)
ProcessorBuilders::UrlEncoding(UrlEncodingProcessorBuilder::try_from(value)?)
}
_ => return Err(format!("unsupported {} processor", str_key)),
};
@@ -301,19 +312,10 @@ where
})
}
pub(crate) fn yaml_fields(v: &yaml_rust::Yaml, field: &str) -> Result<Fields, String> {
let v = yaml_parse_strings(v, field)?;
Fields::new(v)
pub(crate) fn yaml_new_fields(v: &yaml_rust::Yaml, field: &str) -> Result<Fields, String> {
yaml_parse_strings(v, field).map(Fields::new)
}
pub(crate) fn yaml_field(v: &yaml_rust::Yaml, field: &str) -> Result<Field, String> {
pub(crate) fn yaml_new_field(v: &yaml_rust::Yaml, field: &str) -> Result<Field, String> {
yaml_parse_string(v, field)
}
pub(crate) fn update_one_one_output_keys(fields: &mut Fields) {
for field in fields.iter_mut() {
field
.output_fields_index_mapping
.insert(field.get_target_field().to_string(), 0_usize);
}
}
 

@@ -12,14 +12,18 @@
// See the License for the specific language governing permissions and
// limitations under the License.
use std::collections::BTreeMap;
use ahash::HashSet;
use urlencoding::decode;
use crate::etl::field::{Field, Fields};
use crate::etl::field::{Field, Fields, InputFieldInfo, OneInputMultiOutputField};
use crate::etl::find_key_index;
use crate::etl::processor::{
yaml_bool, yaml_field, yaml_fields, FIELDS_NAME, FIELD_NAME, IGNORE_MISSING_NAME,
yaml_bool, yaml_new_field, yaml_new_fields, Processor, ProcessorBuilder, ProcessorKind,
FIELDS_NAME, FIELD_NAME, IGNORE_MISSING_NAME,
};
use crate::etl::value::{Map, Value};
use crate::etl::value::Value;
pub(crate) const PROCESSOR_CMCD: &str = "cmcd";
@@ -63,6 +67,178 @@ const CMCD_KEYS: [&str; 18] = [
CMCD_KEY_V,
];
/// CmcdProcessorBuilder is a builder for CmcdProcessor,
/// parsed from the raw YAML configuration.
#[derive(Debug, Default)]
pub struct CmcdProcessorBuilder {
fields: Fields,
output_keys: HashSet<String>,
ignore_missing: bool,
}
impl CmcdProcessorBuilder {
    /// build_cmcd_outputs builds the CMCD output info,
    /// generating the index and resolver function for each output key.
pub(super) fn build_cmcd_outputs(
field: &Field,
intermediate_keys: &[String],
) -> Result<(BTreeMap<String, usize>, Vec<CmcdOutputInfo>), String> {
let mut output_index = BTreeMap::new();
let mut cmcd_field_outputs = Vec::with_capacity(CMCD_KEYS.len());
for cmcd in CMCD_KEYS {
let final_key = generate_key(field.target_or_input_field(), cmcd);
let index = find_key_index(intermediate_keys, &final_key, "cmcd")?;
output_index.insert(final_key.clone(), index);
match cmcd {
CMCD_KEY_BS | CMCD_KEY_SU => {
let output_info = CmcdOutputInfo::new(final_key, cmcd, index, bs_su);
cmcd_field_outputs.push(output_info);
}
CMCD_KEY_BR | CMCD_KEY_BL | CMCD_KEY_D | CMCD_KEY_DL | CMCD_KEY_MTP
| CMCD_KEY_RTP | CMCD_KEY_TB => {
let output_info = CmcdOutputInfo::new(final_key, cmcd, index, br_tb);
cmcd_field_outputs.push(output_info);
}
CMCD_KEY_CID | CMCD_KEY_NRR | CMCD_KEY_OT | CMCD_KEY_SF | CMCD_KEY_SID
| CMCD_KEY_ST | CMCD_KEY_V => {
let output_info = CmcdOutputInfo::new(final_key, cmcd, index, cid_v);
cmcd_field_outputs.push(output_info);
}
CMCD_KEY_NOR => {
let output_info = CmcdOutputInfo::new(final_key, cmcd, index, nor);
cmcd_field_outputs.push(output_info);
}
CMCD_KEY_PR => {
let output_info = CmcdOutputInfo::new(final_key, cmcd, index, pr);
cmcd_field_outputs.push(output_info);
}
_ => {}
}
}
Ok((output_index, cmcd_field_outputs))
}
/// build CmcdProcessor from CmcdProcessorBuilder
pub fn build(self, intermediate_keys: &[String]) -> Result<CmcdProcessor, String> {
let mut real_fields = vec![];
let mut cmcd_outputs = Vec::with_capacity(CMCD_KEYS.len());
for field in self.fields.into_iter() {
let input_index = find_key_index(intermediate_keys, field.input_field(), "cmcd")?;
let input_field_info = InputFieldInfo::new(field.input_field(), input_index);
let (_, cmcd_field_outputs) = Self::build_cmcd_outputs(&field, intermediate_keys)?;
cmcd_outputs.push(cmcd_field_outputs);
let real_field = OneInputMultiOutputField::new(input_field_info, field.target_field);
real_fields.push(real_field);
}
Ok(CmcdProcessor {
fields: real_fields,
cmcd_outputs,
ignore_missing: self.ignore_missing,
})
}
}
impl ProcessorBuilder for CmcdProcessorBuilder {
fn output_keys(&self) -> HashSet<&str> {
self.output_keys.iter().map(|s| s.as_str()).collect()
}
fn input_keys(&self) -> HashSet<&str> {
self.fields.iter().map(|f| f.input_field()).collect()
}
fn build(self, intermediate_keys: &[String]) -> Result<ProcessorKind, String> {
self.build(intermediate_keys).map(ProcessorKind::Cmcd)
}
}
fn generate_key(prefix: &str, key: &str) -> String {
format!("{}_{}", prefix, key)
}
/// CmcdOutputInfo is a struct to store output info
#[derive(Debug)]
pub(super) struct CmcdOutputInfo {
/// {input_field}_{cmcd_key}
final_key: String,
/// cmcd key
key: &'static str,
/// index in intermediate_keys
index: usize,
/// function to resolve value
f: fn(&str, &str, Option<&str>) -> Result<Value, String>,
}
impl CmcdOutputInfo {
fn new(
final_key: String,
key: &'static str,
index: usize,
f: fn(&str, &str, Option<&str>) -> Result<Value, String>,
) -> Self {
Self {
final_key,
key,
index,
f,
}
}
}
impl Default for CmcdOutputInfo {
fn default() -> Self {
Self {
final_key: String::default(),
key: "",
index: 0,
f: |_, _, _| Ok(Value::Null),
}
}
}
/// function to resolve CMCD_KEY_BS | CMCD_KEY_SU
fn bs_su(_: &str, _: &str, _: Option<&str>) -> Result<Value, String> {
Ok(Value::Boolean(true))
}
/// function to resolve CMCD_KEY_BR | CMCD_KEY_BL | CMCD_KEY_D | CMCD_KEY_DL | CMCD_KEY_MTP | CMCD_KEY_RTP | CMCD_KEY_TB
fn br_tb(s: &str, k: &str, v: Option<&str>) -> Result<Value, String> {
let v = v.ok_or(format!("{k} missing value in {s}"))?;
let val: i64 = v
.parse()
.map_err(|_| format!("failed to parse {v} as i64"))?;
Ok(Value::Int64(val))
}
/// function to resolve CMCD_KEY_CID | CMCD_KEY_NRR | CMCD_KEY_OT | CMCD_KEY_SF | CMCD_KEY_SID | CMCD_KEY_V
fn cid_v(s: &str, k: &str, v: Option<&str>) -> Result<Value, String> {
let v = v.ok_or(format!("{k} missing value in {s}"))?;
Ok(Value::String(v.to_string()))
}
/// function to resolve CMCD_KEY_NOR
fn nor(s: &str, k: &str, v: Option<&str>) -> Result<Value, String> {
let v = v.ok_or(format!("{k} missing value in {s}"))?;
let val = match decode(v) {
Ok(val) => val.to_string(),
Err(_) => v.to_string(),
};
Ok(Value::String(val))
}
/// function to resolve CMCD_KEY_PR
fn pr(s: &str, k: &str, v: Option<&str>) -> Result<Value, String> {
let v = v.ok_or(format!("{k} missing value in {s}"))?;
let val: f64 = v
.parse()
.map_err(|_| format!("failed to parse {v} as f64"))?;
Ok(Value::Float64(val))
}
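// A brief sketch of how these resolvers type CMCD values, with hypothetical
// inputs; it assumes Value implements PartialEq, as the tests in this file
// already compare Value instances.
#[test]
fn cmcd_resolver_sketch() {
    assert_eq!(br_tb("br=3200", "br", Some("3200")), Ok(Value::Int64(3200)));
    assert_eq!(pr("pr=1.08", "pr", Some("1.08")), Ok(Value::Float64(1.08)));
    assert_eq!(bs_su("bs", "bs", None), Ok(Value::Boolean(true)));
    // Numeric keys require a value.
    assert!(br_tb("br", "br", None).is_err());
}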
/// Common Media Client Data Specification:
/// https://cdn.cta.tech/cta/media/media/resources/standards/pdfs/cta-5004-final.pdf
///
@@ -100,98 +276,43 @@ const CMCD_KEYS: [&str; 18] = [
/// 12. Transport Layer Security SHOULD be used to protect all transmission of CMCD data.
#[derive(Debug, Default)]
pub struct CmcdProcessor {
fields: Fields,
fields: Vec<OneInputMultiOutputField>,
cmcd_outputs: Vec<Vec<CmcdOutputInfo>>,
ignore_missing: bool,
}
impl CmcdProcessor {
fn with_fields(&mut self, mut fields: Fields) {
Self::update_output_keys(&mut fields);
self.fields = fields;
}
fn with_ignore_missing(&mut self, ignore_missing: bool) {
self.ignore_missing = ignore_missing;
}
fn generate_key(prefix: &str, key: &str) -> String {
format!("{}_{}", prefix, key)
}
fn parse(prefix: &str, s: &str) -> Result<Map, String> {
let mut map = Map::default();
fn parse(&self, field_index: usize, s: &str) -> Result<Vec<(usize, Value)>, String> {
let parts = s.split(',');
let mut result = Vec::new();
for part in parts {
let mut kv = part.split('=');
let k = kv.next().ok_or(format!("{part} missing key in {s}"))?;
let v = kv.next();
let key = Self::generate_key(prefix, k);
match k {
CMCD_KEY_BS | CMCD_KEY_SU => {
map.insert(key, Value::Boolean(true));
for cmcd_key in self.cmcd_outputs[field_index].iter() {
if cmcd_key.key == k {
let val = (cmcd_key.f)(s, k, v)?;
result.push((cmcd_key.index, val));
}
CMCD_KEY_BR | CMCD_KEY_BL | CMCD_KEY_D | CMCD_KEY_DL | CMCD_KEY_MTP
| CMCD_KEY_RTP | CMCD_KEY_TB => {
let v = v.ok_or(format!("{k} missing value in {s}"))?;
let val: i64 = v
.parse()
.map_err(|_| format!("failed to parse {v} as i64"))?;
map.insert(key, Value::Int64(val));
}
CMCD_KEY_CID | CMCD_KEY_NRR | CMCD_KEY_OT | CMCD_KEY_SF | CMCD_KEY_SID
| CMCD_KEY_ST | CMCD_KEY_V => {
let v = v.ok_or(format!("{k} missing value in {s}"))?;
map.insert(key, Value::String(v.to_string()));
}
CMCD_KEY_NOR => {
let v = v.ok_or(format!("{k} missing value in {s}"))?;
let val = match decode(v) {
Ok(val) => val.to_string(),
Err(_) => v.to_string(),
};
map.insert(key, Value::String(val));
}
CMCD_KEY_PR => {
let v = v.ok_or(format!("{k} missing value in {s}"))?;
let val: f64 = v
.parse()
.map_err(|_| format!("failed to parse {v} as f64"))?;
map.insert(key, Value::Float64(val));
}
_ => match v {
Some(v) => map.insert(key, Value::String(v.to_string())),
None => map.insert(k, Value::Boolean(true)),
},
}
}
Ok(map)
}
fn process_field(&self, val: &str, field: &Field) -> Result<Map, String> {
let prefix = field.get_target_field();
Self::parse(prefix, val)
}
fn update_output_keys(fields: &mut Fields) {
for field in fields.iter_mut() {
for key in CMCD_KEYS.iter() {
field
.output_fields_index_mapping
.insert(Self::generate_key(field.get_target_field(), key), 0);
}
}
Ok(result)
}
}
impl TryFrom<&yaml_rust::yaml::Hash> for CmcdProcessor {
impl TryFrom<&yaml_rust::yaml::Hash> for CmcdProcessorBuilder {
type Error = String;
fn try_from(value: &yaml_rust::yaml::Hash) -> Result<Self, Self::Error> {
let mut processor = CmcdProcessor::default();
let mut fields = Fields::default();
let mut ignore_missing = false;
for (k, v) in value.iter() {
let key = k
@@ -199,25 +320,40 @@ impl TryFrom<&yaml_rust::yaml::Hash> for CmcdProcessor {
.ok_or(format!("key must be a string, but got {k:?}"))?;
match key {
FIELD_NAME => {
processor.with_fields(Fields::one(yaml_field(v, FIELD_NAME)?));
fields = Fields::one(yaml_new_field(v, FIELD_NAME)?);
}
FIELDS_NAME => {
processor.with_fields(yaml_fields(v, FIELDS_NAME)?);
fields = yaml_new_fields(v, FIELDS_NAME)?;
}
IGNORE_MISSING_NAME => {
processor.with_ignore_missing(yaml_bool(v, IGNORE_MISSING_NAME)?);
ignore_missing = yaml_bool(v, IGNORE_MISSING_NAME)?;
}
_ => {}
}
}
Ok(processor)
let output_keys = fields
.iter()
.flat_map(|f| {
CMCD_KEYS
.iter()
.map(|cmcd_key| generate_key(f.target_or_input_field(), cmcd_key))
})
.collect();
let builder = CmcdProcessorBuilder {
fields,
output_keys,
ignore_missing,
};
Ok(builder)
}
}
impl crate::etl::processor::Processor for CmcdProcessor {
impl Processor for CmcdProcessor {
fn kind(&self) -> &str {
PROCESSOR_CMCD
}
@@ -226,51 +362,14 @@ impl crate::etl::processor::Processor for CmcdProcessor {
self.ignore_missing
}
fn fields(&self) -> &Fields {
&self.fields
}
fn fields_mut(&mut self) -> &mut Fields {
&mut self.fields
}
fn output_keys(&self) -> HashSet<String> {
self.fields
.iter()
.map(|field| {
field
.target_field
.clone()
.unwrap_or_else(|| field.get_field_name().to_string())
})
.flat_map(|keys| {
CMCD_KEYS
.iter()
.map(move |key| format!("{}_{}", keys, *key))
})
.collect()
}
fn exec_field(&self, val: &Value, field: &Field) -> Result<Map, String> {
match val {
Value::String(val) => self.process_field(val, field),
_ => Err(format!(
"{} processor: expect string value, but got {val:?}",
self.kind()
)),
}
}
fn exec_mut(&self, val: &mut Vec<Value>) -> Result<(), String> {
for field in self.fields.iter() {
match val.get(field.input_field.index) {
for (field_index, field) in self.fields.iter().enumerate() {
let field_value_index = field.input_index();
match val.get(field_value_index) {
Some(Value::String(v)) => {
// TODO(qtang): Let this method use the intermediate state collection directly.
let map = self.process_field(v, field)?;
for (k, v) in map.values.into_iter() {
if let Some(index) = field.output_fields_index_mapping.get(&k) {
val[*index] = v;
}
let result_list = self.parse(field_index, v)?;
for (output_index, v) in result_list {
val[output_index] = v;
}
}
Some(Value::Null) | None => {
@@ -278,7 +377,7 @@ impl crate::etl::processor::Processor for CmcdProcessor {
return Err(format!(
"{} processor: missing field: {}",
self.kind(),
field.get_field_name()
field.input_name()
));
}
}
@@ -299,7 +398,8 @@ mod tests {
use ahash::HashMap;
use urlencoding::decode;
use super::CmcdProcessor;
use super::{CmcdProcessorBuilder, CMCD_KEYS};
use crate::etl::field::{Field, Fields};
use crate::etl::value::{Map, Value};
#[test]
@@ -329,6 +429,7 @@ mod tests {
],
),
(
            // we do not resolve the `b` key
"b%2Crtp%3D15000%2Csid%3D%226e2fb550-c457-11e9-bb97-0800200c9a66%22",
vec![
(
@@ -336,7 +437,6 @@ mod tests {
Value::String("\"6e2fb550-c457-11e9-bb97-0800200c9a66\"".into()),
),
("prefix_rtp", Value::Int64(15000)),
("b", Value::Boolean(true)),
],
),
(
@@ -347,16 +447,17 @@ mod tests {
],
),
(
            // we do not resolve custom keys
"d%3D4004%2Ccom.example-myNumericKey%3D500%2Ccom.examplemyStringKey%3D%22myStringValue%22",
vec![
(
"prefix_com.example-myNumericKey",
Value::String("500".into()),
),
(
"prefix_com.examplemyStringKey",
Value::String("\"myStringValue\"".into()),
),
// (
// "prefix_com.example-myNumericKey",
// Value::String("500".into()),
// ),
// (
// "prefix_com.examplemyStringKey",
// Value::String("\"myStringValue\"".into()),
// ),
("prefix_d", Value::Int64(4004)),
],
),
@@ -431,6 +532,24 @@ mod tests {
),
];
let field = Field::new("prefix", None);
let output_keys = CMCD_KEYS
.iter()
.map(|k| format!("prefix_{}", k))
.collect::<Vec<String>>();
let mut intermediate_keys = vec!["prefix".to_string()];
intermediate_keys.append(&mut (output_keys.clone()));
let builder = CmcdProcessorBuilder {
fields: Fields::new(vec![field]),
output_keys: output_keys.iter().map(|s| s.to_string()).collect(),
ignore_missing: false,
};
let processor = builder.build(&intermediate_keys).unwrap();
for (s, vec) in ss.into_iter() {
let decoded = decode(s).unwrap().to_string();
@@ -440,7 +559,12 @@ mod tests {
.collect::<HashMap<String, Value>>();
let expected = Map { values };
let actual = CmcdProcessor::parse("prefix", &decoded).unwrap();
let actual = processor.parse(0, &decoded).unwrap();
let actual = actual
.into_iter()
.map(|(index, value)| (intermediate_keys[index].clone(), value))
.collect::<HashMap<String, Value>>();
let actual = Map { values: actual };
assert_eq!(actual, expected);
}
}


@@ -14,17 +14,18 @@
// Reference: https://www.elastic.co/guide/en/elasticsearch/reference/current/csv-processor.html
use ahash::{HashMap, HashSet};
use ahash::HashSet;
use csv::{ReaderBuilder, Trim};
use itertools::EitherOrBoth::{Both, Left, Right};
use itertools::Itertools;
use crate::etl::field::{Field, Fields};
use crate::etl::field::{Fields, InputFieldInfo, OneInputMultiOutputField};
use crate::etl::find_key_index;
use crate::etl::processor::{
yaml_bool, yaml_field, yaml_fields, yaml_string, Processor, FIELDS_NAME, FIELD_NAME,
IGNORE_MISSING_NAME,
yaml_bool, yaml_new_field, yaml_new_fields, yaml_string, Processor, ProcessorBuilder,
ProcessorKind, FIELDS_NAME, FIELD_NAME, IGNORE_MISSING_NAME,
};
use crate::etl::value::{Map, Value};
use crate::etl::value::Value;
pub(crate) const PROCESSOR_CSV: &str = "csv";
@@ -32,18 +33,78 @@ const SEPARATOR_NAME: &str = "separator";
const QUOTE_NAME: &str = "quote";
const TRIM_NAME: &str = "trim";
const EMPTY_VALUE_NAME: &str = "empty_value";
const TARGET_FIELDS: &str = "target_fields";
#[derive(Debug, Default)]
pub struct CsvProcessorBuilder {
reader: ReaderBuilder,
fields: Fields,
ignore_missing: bool,
// Value used to fill empty fields, empty fields will be skipped if this is not provided.
empty_value: Option<String>,
target_fields: Vec<String>,
// description
// if
// ignore_failure
// on_failure
// tag
}
impl CsvProcessorBuilder {
fn build(self, intermediate_keys: &[String]) -> Result<CsvProcessor, String> {
let mut real_fields = vec![];
for field in self.fields {
let input_index = find_key_index(intermediate_keys, field.input_field(), "csv")?;
let input_field_info = InputFieldInfo::new(field.input_field(), input_index);
let real_field = OneInputMultiOutputField::new(input_field_info, None);
real_fields.push(real_field);
}
let output_index_info = self
.target_fields
.iter()
.map(|f| find_key_index(intermediate_keys, f, "csv"))
.collect::<Result<Vec<_>, String>>()?;
Ok(CsvProcessor {
reader: self.reader,
fields: real_fields,
ignore_missing: self.ignore_missing,
empty_value: self.empty_value,
output_index_info,
})
}
}
impl ProcessorBuilder for CsvProcessorBuilder {
fn output_keys(&self) -> HashSet<&str> {
self.target_fields.iter().map(|s| s.as_str()).collect()
}
fn input_keys(&self) -> HashSet<&str> {
self.fields.iter().map(|f| f.input_field()).collect()
}
fn build(self, intermediate_keys: &[String]) -> Result<ProcessorKind, String> {
self.build(intermediate_keys).map(ProcessorKind::Csv)
}
}
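// A hedged sketch of the new configuration shape, mirroring the pipeline tests
// above: target fields now come from a dedicated target_fields key instead of
// being packed into field. The YAML and key names below are illustrative only.
#[test]
fn csv_builder_sketch() {
    let docs = yaml_rust::YamlLoader::load_from_str(
        r#"
csv:
  field: my_field
  target_fields: field1, field2
"#,
    )
    .unwrap();
    let hash = docs[0]["csv"].as_hash().unwrap();
    let builder = CsvProcessorBuilder::try_from(hash).unwrap();
    // Resolving against the intermediate keys yields a ready-to-run CsvProcessor.
    let keys = vec![
        "my_field".to_string(),
        "field1".to_string(),
        "field2".to_string(),
    ];
    assert!(builder.build(&keys).is_ok());
}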
/// only support string value
#[derive(Debug)]
pub struct CsvProcessor {
reader: ReaderBuilder,
fields: Fields,
fields: Vec<OneInputMultiOutputField>,
ignore_missing: bool,
// Value used to fill empty fields, empty fields will be skipped if this is not provided.
empty_value: Option<String>,
output_index_info: Vec<usize>,
// description
// if
// ignore_failure
@@ -52,81 +113,19 @@ pub struct CsvProcessor {
}
impl CsvProcessor {
fn new() -> Self {
let mut reader = ReaderBuilder::new();
reader.has_headers(false);
Self {
reader,
fields: Fields::default(),
ignore_missing: false,
empty_value: None,
}
}
fn with_fields(&mut self, fields: Fields) {
self.fields = fields;
}
fn try_separator(&mut self, separator: String) -> Result<(), String> {
if separator.len() != 1 {
Err(format!(
"'{}' must be a single character, but got '{}'",
SEPARATOR_NAME, separator
))
} else {
self.reader.delimiter(separator.as_bytes()[0]);
Ok(())
}
}
fn try_quote(&mut self, quote: String) -> Result<(), String> {
if quote.len() != 1 {
Err(format!(
"'{}' must be a single character, but got '{}'",
QUOTE_NAME, quote
))
} else {
self.reader.quote(quote.as_bytes()[0]);
Ok(())
}
}
fn with_trim(&mut self, trim: bool) {
if trim {
self.reader.trim(Trim::All);
} else {
self.reader.trim(Trim::None);
}
}
fn with_ignore_missing(&mut self, ignore_missing: bool) {
self.ignore_missing = ignore_missing;
}
fn with_empty_value(&mut self, empty_value: String) {
self.empty_value = Some(empty_value);
}
// process the csv format string to a map with target_fields as keys
fn process_field(&self, val: &str, field: &Field) -> Result<Map, String> {
fn process(&self, val: &str) -> Result<Vec<(usize, Value)>, String> {
let mut reader = self.reader.from_reader(val.as_bytes());
if let Some(result) = reader.records().next() {
let record: csv::StringRecord = result.map_err(|e| e.to_string())?;
let values: HashMap<String, Value> = field
.target_fields
.as_ref()
.ok_or(format!(
"target fields must be set after '{}'",
field.get_field_name()
))?
let values: Vec<(usize, Value)> = self
.output_index_info
.iter()
.map(|f| f.to_string())
.zip_longest(record.iter())
.filter_map(|zipped| match zipped {
Both(target_field, val) => Some((target_field, Value::String(val.into()))),
Both(target_field, val) => Some((*target_field, Value::String(val.into()))),
// if target fields are more than extracted fields, fill the rest with empty value
Left(target_field) => {
let value = self
@@ -134,69 +133,101 @@ impl CsvProcessor {
.as_ref()
.map(|s| Value::String(s.clone()))
.unwrap_or(Value::Null);
Some((target_field, value))
Some((*target_field, value))
}
// if extracted fields are more than target fields, ignore the rest
Right(_) => None,
})
.collect();
Ok(Map { values })
Ok(values)
} else {
Err("expected at least one record from csv format, but got none".into())
}
}
fn update_output_keys(&mut self) {
self.fields.iter_mut().for_each(|f| {
if let Some(tfs) = f.target_fields.as_ref() {
tfs.iter().for_each(|tf| {
if !tf.is_empty() {
f.output_fields_index_mapping.insert(tf.to_string(), 0);
}
});
}
})
}
}
impl TryFrom<&yaml_rust::yaml::Hash> for CsvProcessor {
impl TryFrom<&yaml_rust::yaml::Hash> for CsvProcessorBuilder {
type Error = String;
fn try_from(hash: &yaml_rust::yaml::Hash) -> Result<Self, Self::Error> {
let mut processor = CsvProcessor::new();
let mut reader = ReaderBuilder::new();
reader.has_headers(false);
let mut fields = Fields::default();
let mut ignore_missing = false;
let mut empty_value = None;
let mut target_fields = vec![];
for (k, v) in hash {
let key = k
.as_str()
.ok_or(format!("key must be a string, but got {k:?}"))?;
match key {
FIELD_NAME => {
processor.with_fields(Fields::one(yaml_field(v, FIELD_NAME)?));
fields = Fields::one(yaml_new_field(v, FIELD_NAME)?);
}
FIELDS_NAME => {
processor.with_fields(yaml_fields(v, FIELDS_NAME)?);
fields = yaml_new_fields(v, FIELDS_NAME)?;
}
TARGET_FIELDS => {
target_fields = yaml_string(v, TARGET_FIELDS)?
.split(',')
.map(|s| s.trim().to_string())
.filter(|s| !s.is_empty())
.collect();
}
SEPARATOR_NAME => {
processor.try_separator(yaml_string(v, SEPARATOR_NAME)?)?;
let separator = yaml_string(v, SEPARATOR_NAME)?;
if separator.len() != 1 {
return Err(format!(
"'{}' must be a single character, but got '{}'",
SEPARATOR_NAME, separator
));
} else {
reader.delimiter(separator.as_bytes()[0]);
}
}
QUOTE_NAME => {
processor.try_quote(yaml_string(v, QUOTE_NAME)?)?;
let quote = yaml_string(v, QUOTE_NAME)?;
if quote.len() != 1 {
return Err(format!(
"'{}' must be a single character, but got '{}'",
QUOTE_NAME, quote
));
} else {
reader.quote(quote.as_bytes()[0]);
}
}
TRIM_NAME => {
processor.with_trim(yaml_bool(v, TRIM_NAME)?);
let trim = yaml_bool(v, TRIM_NAME)?;
if trim {
reader.trim(Trim::All);
} else {
reader.trim(Trim::None);
}
}
IGNORE_MISSING_NAME => {
processor.with_ignore_missing(yaml_bool(v, IGNORE_MISSING_NAME)?);
ignore_missing = yaml_bool(v, IGNORE_MISSING_NAME)?;
}
EMPTY_VALUE_NAME => {
processor.with_empty_value(yaml_string(v, EMPTY_VALUE_NAME)?);
empty_value = Some(yaml_string(v, EMPTY_VALUE_NAME)?);
}
_ => {}
}
}
processor.update_output_keys();
Ok(processor)
let builder = {
CsvProcessorBuilder {
reader,
fields,
ignore_missing,
empty_value,
target_fields,
}
};
Ok(builder)
}
}
@@ -209,41 +240,14 @@ impl Processor for CsvProcessor {
self.ignore_missing
}
fn fields(&self) -> &Fields {
&self.fields
}
fn fields_mut(&mut self) -> &mut Fields {
&mut self.fields
}
fn output_keys(&self) -> HashSet<String> {
self.fields
.iter()
.flat_map(|f| f.target_fields.clone().unwrap_or_default())
.collect()
}
fn exec_field(&self, val: &Value, field: &Field) -> Result<Map, String> {
match val {
Value::String(val) => self.process_field(val, field),
_ => Err(format!(
"{} processor: expect string value, but got {val:?}",
self.kind()
)),
}
}
fn exec_mut(&self, val: &mut Vec<Value>) -> Result<(), String> {
for field in self.fields.iter() {
match val.get(field.input_field.index) {
let index = field.input_index();
match val.get(index) {
Some(Value::String(v)) => {
// TODO(qtang): Let this method use the intermediate state collection directly.
let map = self.process_field(v, field)?;
for (k, v) in map.values.into_iter() {
if let Some(index) = field.output_fields_index_mapping.get(&k) {
val[*index] = v;
}
                    let result_list = self.process(v)?;
                    for (k, v) in result_list {
val[k] = v;
}
}
Some(Value::Null) | None => {
@@ -251,7 +255,7 @@ impl Processor for CsvProcessor {
return Err(format!(
"{} processor: missing field: {}",
self.kind(),
field.get_field_name()
field.input_name()
));
}
}
@@ -267,116 +271,140 @@ impl Processor for CsvProcessor {
}
}
// TODO(yuanbohan): more test cases
#[cfg(test)]
mod tests {
use ahash::HashMap;
use super::{CsvProcessor, Value};
use crate::etl::field::Fields;
use crate::etl::processor::Processor;
use crate::etl::value::Map;
use super::Value;
use crate::etl::processor::csv::CsvProcessorBuilder;
#[test]
fn test_equal_length() {
let mut processor = CsvProcessor::new();
let field = "data,, a, b".parse().unwrap();
processor.with_fields(Fields::one(field));
let mut reader = csv::ReaderBuilder::new();
reader.has_headers(false);
let builder = CsvProcessorBuilder {
reader,
target_fields: vec!["a".into(), "b".into()],
..Default::default()
};
let values: HashMap<String, Value> = [("data".into(), Value::String("1,2".into()))]
let intermediate_keys = vec!["data".into(), "a".into(), "b".into()];
let processor = builder.build(&intermediate_keys).unwrap();
let result = processor
.process("1,2")
.unwrap()
.into_iter()
.collect();
let mut m = Map { values };
processor.exec_map(&mut m).unwrap();
.map(|(k, v)| (intermediate_keys[k].clone(), v))
.collect::<HashMap<_, _>>();
let values = [
("data".into(), Value::String("1,2".into())),
("a".into(), Value::String("1".into())),
("b".into(), Value::String("2".into())),
]
.into_iter()
.collect();
let expected = Map { values };
.collect::<HashMap<_, _>>();
assert_eq!(expected, m);
assert_eq!(result, values);
}
// test target_fields length larger than the record length
#[test]
fn test_target_fields_has_more_length() {
let values = [("data".into(), Value::String("1,2".into()))]
.into_iter()
.collect();
let mut input = Map { values };
// with no empty value
{
let mut processor = CsvProcessor::new();
let field = "data,, a,b,c".parse().unwrap();
processor.with_fields(Fields::one(field));
let mut reader = csv::ReaderBuilder::new();
reader.has_headers(false);
let builder = CsvProcessorBuilder {
reader,
target_fields: vec!["a".into(), "b".into(), "c".into()],
..Default::default()
};
processor.exec_map(&mut input).unwrap();
let intermediate_keys = vec!["data".into(), "a".into(), "b".into(), "c".into()];
let processor = builder.build(&intermediate_keys).unwrap();
let result = processor
.process("1,2")
.unwrap()
.into_iter()
.map(|(k, v)| (intermediate_keys[k].clone(), v))
.collect::<HashMap<_, _>>();
let values = [
("data".into(), Value::String("1,2".into())),
("a".into(), Value::String("1".into())),
("b".into(), Value::String("2".into())),
("c".into(), Value::Null),
]
.into_iter()
.collect();
let expected = Map { values };
.collect::<HashMap<_, _>>();
assert_eq!(expected, input);
assert_eq!(result, values);
}
// with empty value
{
let mut processor = CsvProcessor::new();
let field = "data,, a,b,c".parse().unwrap();
processor.with_fields(Fields::one(field));
processor.with_empty_value("default".into());
let mut reader = csv::ReaderBuilder::new();
reader.has_headers(false);
let builder = CsvProcessorBuilder {
reader,
target_fields: vec!["a".into(), "b".into(), "c".into()],
empty_value: Some("default".into()),
..Default::default()
};
processor.exec_map(&mut input).unwrap();
let intermediate_keys = vec!["data".into(), "a".into(), "b".into(), "c".into()];
let processor = builder.build(&intermediate_keys).unwrap();
let result = processor
.process("1,2")
.unwrap()
.into_iter()
.map(|(k, v)| (intermediate_keys[k].clone(), v))
.collect::<HashMap<_, _>>();
let values = [
("data".into(), Value::String("1,2".into())),
("a".into(), Value::String("1".into())),
("b".into(), Value::String("2".into())),
("c".into(), Value::String("default".into())),
]
.into_iter()
.collect();
let expected = Map { values };
assert_eq!(expected, input);
assert_eq!(result, values);
}
}
// test record has larger length
#[test]
fn test_target_fields_has_less_length() {
let values = [("data".into(), Value::String("1,2,3".into()))]
let mut reader = csv::ReaderBuilder::new();
reader.has_headers(false);
let builder = CsvProcessorBuilder {
reader,
target_fields: vec!["a".into(), "b".into()],
empty_value: Some("default".into()),
..Default::default()
};
let intermediate_keys = vec!["data".into(), "a".into(), "b".into()];
let processor = builder.build(&intermediate_keys).unwrap();
let result = processor
.process("1,2")
.unwrap()
.into_iter()
.collect();
let mut input = Map { values };
let mut processor = CsvProcessor::new();
let field = "data,,a,b".parse().unwrap();
processor.with_fields(Fields::one(field));
processor.exec_map(&mut input).unwrap();
.map(|(k, v)| (intermediate_keys[k].clone(), v))
.collect::<HashMap<_, _>>();
let values = [
("data".into(), Value::String("1,2,3".into())),
("a".into(), Value::String("1".into())),
("b".into(), Value::String("2".into())),
]
.into_iter()
.collect();
let expected = Map { values };
assert_eq!(expected, input);
assert_eq!(result, values);
}
}


@@ -19,12 +19,12 @@ use chrono::{DateTime, NaiveDateTime};
use chrono_tz::Tz;
use lazy_static::lazy_static;
use crate::etl::field::{Field, Fields};
use crate::etl::field::{Fields, OneInputOneOutputField};
use crate::etl::processor::{
update_one_one_output_keys, yaml_bool, yaml_field, yaml_fields, yaml_string, yaml_strings,
Processor, FIELDS_NAME, FIELD_NAME, IGNORE_MISSING_NAME,
yaml_bool, yaml_new_field, yaml_new_fields, yaml_string, yaml_strings, Processor,
ProcessorBuilder, ProcessorKind, FIELDS_NAME, FIELD_NAME, IGNORE_MISSING_NAME,
};
use crate::etl::value::{Map, Timestamp, Value};
use crate::etl::value::{Timestamp, Value};
pub(crate) const PROCESSOR_DATE: &str = "date";
@@ -57,9 +57,15 @@ lazy_static! {
.collect();
}
#[derive(Debug, Default)]
#[derive(Debug)]
struct Formats(Vec<Arc<String>>);
impl Default for Formats {
fn default() -> Self {
Formats(DEFAULT_FORMATS.clone())
}
}
impl Formats {
fn new(mut formats: Vec<Arc<String>>) -> Self {
formats.sort();
@@ -76,16 +82,119 @@ impl std::ops::Deref for Formats {
}
}
#[derive(Debug, Default)]
pub struct DateProcessorBuilder {
fields: Fields,
formats: Formats,
timezone: Option<Arc<String>>,
locale: Option<Arc<String>>,
ignore_missing: bool,
}
impl ProcessorBuilder for DateProcessorBuilder {
fn output_keys(&self) -> HashSet<&str> {
self.fields
.iter()
.map(|f| f.target_or_input_field())
.collect()
}
fn input_keys(&self) -> HashSet<&str> {
self.fields.iter().map(|f| f.input_field()).collect()
}
fn build(self, intermediate_keys: &[String]) -> Result<ProcessorKind, String> {
self.build(intermediate_keys).map(ProcessorKind::Date)
}
}
impl DateProcessorBuilder {
pub fn build(self, intermediate_keys: &[String]) -> Result<DateProcessor, String> {
let mut real_fields = vec![];
for field in self.fields.into_iter() {
let input = OneInputOneOutputField::build(
"date",
intermediate_keys,
field.input_field(),
field.target_or_input_field(),
)?;
real_fields.push(input);
}
Ok(DateProcessor {
fields: real_fields,
formats: self.formats,
timezone: self.timezone,
locale: self.locale,
ignore_missing: self.ignore_missing,
})
}
}
impl TryFrom<&yaml_rust::yaml::Hash> for DateProcessorBuilder {
type Error = String;
fn try_from(hash: &yaml_rust::yaml::Hash) -> Result<Self, Self::Error> {
let mut fields = Fields::default();
let mut formats = Formats::default();
let mut timezone = None;
let mut locale = None;
let mut ignore_missing = false;
for (k, v) in hash {
let key = k
.as_str()
.ok_or(format!("key must be a string, but got {k:?}"))?;
match key {
FIELD_NAME => {
fields = Fields::one(yaml_new_field(v, FIELD_NAME)?);
}
FIELDS_NAME => {
fields = yaml_new_fields(v, FIELDS_NAME)?;
}
FORMATS_NAME => {
let format_strs = yaml_strings(v, FORMATS_NAME)?;
if format_strs.is_empty() {
formats = Formats::new(DEFAULT_FORMATS.clone());
} else {
formats = Formats::new(format_strs.into_iter().map(Arc::new).collect());
}
}
TIMEZONE_NAME => {
timezone = Some(Arc::new(yaml_string(v, TIMEZONE_NAME)?));
}
LOCALE_NAME => {
locale = Some(Arc::new(yaml_string(v, LOCALE_NAME)?));
}
IGNORE_MISSING_NAME => {
ignore_missing = yaml_bool(v, IGNORE_MISSING_NAME)?;
}
_ => {}
}
}
let builder = DateProcessorBuilder {
fields,
formats,
timezone,
locale,
ignore_missing,
};
Ok(builder)
}
}
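// A small sketch of the resulting build step, using a hypothetical field name:
// once the field appears in the intermediate keys, the builder resolves it into
// a concrete DateProcessor. (Assumes crate::etl::field::Field is imported
// alongside Fields.)
#[test]
fn date_builder_sketch() {
    let builder = DateProcessorBuilder {
        fields: Fields::one(Field::new("access_time", None)),
        ..Default::default()
    };
    let keys = vec!["access_time".to_string()];
    assert!(builder.build(&keys).is_ok());
}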
/// deprecated it should be removed in the future
/// Reserved for compatibility only
#[derive(Debug, Default)]
pub struct DateProcessor {
fields: Fields,
fields: Vec<OneInputOneOutputField>,
formats: Formats,
timezone: Option<Arc<String>>,
locale: Option<Arc<String>>, // to support locale
output_format: Option<Arc<String>>,
ignore_missing: bool,
// description
@@ -96,43 +205,6 @@ pub struct DateProcessor {
}
impl DateProcessor {
fn with_fields(&mut self, mut fields: Fields) {
update_one_one_output_keys(&mut fields);
self.fields = fields
}
fn with_formats(&mut self, v: Option<Vec<Arc<String>>>) {
let v = match v {
Some(v) if !v.is_empty() => v,
_ => DEFAULT_FORMATS.clone(),
};
let formats = Formats::new(v);
self.formats = formats;
}
fn with_timezone(&mut self, timezone: String) {
if !timezone.is_empty() {
self.timezone = Some(Arc::new(timezone));
}
}
fn with_locale(&mut self, locale: String) {
if !locale.is_empty() {
self.locale = Some(Arc::new(locale));
}
}
fn with_output_format(&mut self, output_format: String) {
if !output_format.is_empty() {
self.output_format = Some(Arc::new(output_format));
}
}
fn with_ignore_missing(&mut self, ignore_missing: bool) {
self.ignore_missing = ignore_missing;
}
fn parse(&self, val: &str) -> Result<Timestamp, String> {
let mut tz = Tz::UTC;
if let Some(timezone) = &self.timezone {
@@ -147,61 +219,6 @@ impl DateProcessor {
Err(format!("{} processor: failed to parse {val}", self.kind(),))
}
fn process_field(&self, val: &str, field: &Field) -> Result<Map, String> {
let key = field.get_target_field();
Ok(Map::one(key, Value::Timestamp(self.parse(val)?)))
}
}
impl TryFrom<&yaml_rust::yaml::Hash> for DateProcessor {
type Error = String;
fn try_from(hash: &yaml_rust::yaml::Hash) -> Result<Self, Self::Error> {
let mut processor = DateProcessor::default();
let mut formats_opt = None;
for (k, v) in hash {
let key = k
.as_str()
.ok_or(format!("key must be a string, but got {k:?}"))?;
match key {
FIELD_NAME => {
processor.with_fields(Fields::one(yaml_field(v, FIELD_NAME)?));
}
FIELDS_NAME => {
processor.with_fields(yaml_fields(v, FIELDS_NAME)?);
}
FORMATS_NAME => {
let formats = yaml_strings(v, FORMATS_NAME)?;
formats_opt = Some(formats.into_iter().map(Arc::new).collect());
}
TIMEZONE_NAME => {
processor.with_timezone(yaml_string(v, TIMEZONE_NAME)?);
}
LOCALE_NAME => {
processor.with_locale(yaml_string(v, LOCALE_NAME)?);
}
OUTPUT_FORMAT_NAME => {
processor.with_output_format(yaml_string(v, OUTPUT_FORMAT_NAME)?);
}
IGNORE_MISSING_NAME => {
processor.with_ignore_missing(yaml_bool(v, IGNORE_MISSING_NAME)?);
}
_ => {}
}
}
processor.with_formats(formats_opt);
Ok(processor)
}
}
impl Processor for DateProcessor {
@@ -213,53 +230,21 @@ impl Processor for DateProcessor {
self.ignore_missing
}
fn fields(&self) -> &Fields {
&self.fields
}
fn fields_mut(&mut self) -> &mut Fields {
&mut self.fields
}
fn output_keys(&self) -> HashSet<String> {
self.fields
.iter()
.map(|f| f.get_target_field().to_string())
.collect()
}
fn exec_field(&self, val: &Value, field: &Field) -> Result<Map, String> {
match val {
Value::String(s) => self.process_field(s, field),
_ => Err(format!(
"{} processor: expect string value, but got {val:?}",
self.kind()
)),
}
}
fn exec_mut(&self, val: &mut Vec<Value>) -> Result<(), String> {
for field in self.fields().iter() {
let index = field.input_field.index;
for field in self.fields.iter() {
let index = field.input_index();
match val.get(index) {
Some(Value::String(s)) => {
// TODO(qtang): Let this method use the intermediate state collection directly.
let mut map = self.process_field(s, field)?;
field
.output_fields_index_mapping
.iter()
.for_each(|(k, output_index)| {
if let Some(v) = map.remove(k) {
val[*output_index] = v;
}
});
let timestamp = self.parse(s)?;
let output_index = field.output_index();
val[output_index] = Value::Timestamp(timestamp);
}
Some(Value::Null) | None => {
if !self.ignore_missing {
return Err(format!(
"{} processor: missing field: {}",
self.kind(),
field.get_field_name()
field.input_name()
));
}
}
@@ -318,8 +303,7 @@ mod tests {
#[test]
fn test_parse() {
let mut processor = DateProcessor::default();
processor.with_formats(None);
let processor = DateProcessor::default();
let values: Vec<&str> = vec![
"2014-5-17T12:34:56",
@@ -340,7 +324,6 @@ mod tests {
#[test]
fn test_parse_with_formats() {
let mut processor = DateProcessor::default();
let formats = vec![
"%Y-%m-%dT%H:%M:%S%:z",
"%Y-%m-%dT%H:%M:%S%.3f%:z",
@@ -349,8 +332,11 @@ mod tests {
]
.into_iter()
.map(|s| Arc::new(s.to_string()))
.collect();
processor.with_formats(Some(formats));
.collect::<Vec<_>>();
let processor = DateProcessor {
formats: super::Formats(formats),
..Default::default()
};
let values: Vec<&str> = vec![
"2014-5-17T12:34:56",
@@ -371,9 +357,10 @@ mod tests {
#[test]
fn test_parse_with_timezone() {
let mut processor = DateProcessor::default();
processor.with_formats(None);
processor.with_timezone("Asia/Tokyo".to_string());
let processor = DateProcessor {
timezone: Some(Arc::new("Asia/Tokyo".to_string())),
..Default::default()
};
let values: Vec<&str> = vec![
"2014-5-17T12:34:56",

File diff suppressed because it is too large

View File

@@ -14,17 +14,17 @@
use ahash::HashSet;
use crate::etl::field::{Field, Fields};
use crate::etl::field::{Fields, OneInputOneOutputField};
use crate::etl::processor::{
update_one_one_output_keys, yaml_bool, yaml_field, yaml_fields, yaml_string, Processor,
FIELDS_NAME, FIELD_NAME, IGNORE_MISSING_NAME,
yaml_bool, yaml_new_field, yaml_new_fields, yaml_string, Processor, ProcessorBuilder,
ProcessorKind, FIELDS_NAME, FIELD_NAME, IGNORE_MISSING_NAME,
};
use crate::etl::value::time::{
MICROSECOND_RESOLUTION, MICRO_RESOLUTION, MILLISECOND_RESOLUTION, MILLI_RESOLUTION,
MS_RESOLUTION, NANOSECOND_RESOLUTION, NANO_RESOLUTION, NS_RESOLUTION, SECOND_RESOLUTION,
SEC_RESOLUTION, S_RESOLUTION, US_RESOLUTION,
};
use crate::etl::value::{Map, Timestamp, Value};
use crate::etl::value::{Timestamp, Value};
pub(crate) const PROCESSOR_EPOCH: &str = "epoch";
const RESOLUTION_NAME: &str = "resolution";
@@ -52,12 +52,56 @@ impl TryFrom<&str> for Resolution {
}
}
#[derive(Debug, Default)]
pub struct EpochProcessorBuilder {
fields: Fields,
resolution: Resolution,
ignore_missing: bool,
}
impl ProcessorBuilder for EpochProcessorBuilder {
fn output_keys(&self) -> HashSet<&str> {
self.fields
.iter()
.map(|f| f.target_or_input_field())
.collect()
}
fn input_keys(&self) -> HashSet<&str> {
self.fields.iter().map(|f| f.input_field()).collect()
}
fn build(self, intermediate_keys: &[String]) -> Result<ProcessorKind, String> {
self.build(intermediate_keys).map(ProcessorKind::Epoch)
}
}
impl EpochProcessorBuilder {
pub fn build(self, intermediate_keys: &[String]) -> Result<EpochProcessor, String> {
let mut real_fields = vec![];
for field in self.fields.into_iter() {
let input = OneInputOneOutputField::build(
"epoch",
intermediate_keys,
field.input_field(),
field.target_or_input_field(),
)?;
real_fields.push(input);
}
Ok(EpochProcessor {
fields: real_fields,
resolution: self.resolution,
ignore_missing: self.ignore_missing,
})
}
}
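The builder only wires fields and options; the conversion itself still lives in the `parse` method kept below, which maps the configured resolution onto the matching `Timestamp` variant. A small illustrative snippet (it would have to sit in this module's tests, since `parse` is private; values are made up):

    let processor = EpochProcessor {
        resolution: Resolution::Nano,
        ..Default::default()
    };
    let ts = processor.parse(&Value::String("1573840000".into())).unwrap();
    // with nanosecond resolution the raw number is wrapped as Timestamp::Nanosecond(1573840000)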
/// Supports string, integer, float, time, and epoch values.
/// Deprecated: reserved for compatibility only and should be removed in the future.
#[derive(Debug, Default)]
pub struct EpochProcessor {
fields: Fields,
fields: Vec<OneInputOneOutputField>,
resolution: Resolution,
ignore_missing: bool,
// description
@@ -68,19 +112,6 @@ pub struct EpochProcessor {
}
impl EpochProcessor {
fn with_fields(&mut self, mut fields: Fields) {
update_one_one_output_keys(&mut fields);
self.fields = fields
}
fn with_resolution(&mut self, resolution: Resolution) {
self.resolution = resolution;
}
fn with_ignore_missing(&mut self, ignore_missing: bool) {
self.ignore_missing = ignore_missing;
}
fn parse(&self, val: &Value) -> Result<Timestamp, String> {
let t: i64 = match val {
Value::String(s) => s
@@ -117,19 +148,15 @@ impl EpochProcessor {
Resolution::Nano => Ok(Timestamp::Nanosecond(t)),
}
}
fn process_field(&self, val: &Value, field: &Field) -> Result<Map, String> {
let key = field.get_target_field();
Ok(Map::one(key, Value::Timestamp(self.parse(val)?)))
}
}
impl TryFrom<&yaml_rust::yaml::Hash> for EpochProcessor {
impl TryFrom<&yaml_rust::yaml::Hash> for EpochProcessorBuilder {
type Error = String;
fn try_from(hash: &yaml_rust::yaml::Hash) -> Result<Self, Self::Error> {
let mut processor = EpochProcessor::default();
let mut fields = Fields::default();
let mut resolution = Resolution::default();
let mut ignore_missing = false;
for (k, v) in hash {
let key = k
@@ -138,24 +165,29 @@ impl TryFrom<&yaml_rust::yaml::Hash> for EpochProcessor {
match key {
FIELD_NAME => {
processor.with_fields(Fields::one(yaml_field(v, FIELD_NAME)?));
fields = Fields::one(yaml_new_field(v, FIELD_NAME)?);
}
FIELDS_NAME => {
processor.with_fields(yaml_fields(v, FIELDS_NAME)?);
fields = yaml_new_fields(v, FIELDS_NAME)?;
}
RESOLUTION_NAME => {
let s = yaml_string(v, RESOLUTION_NAME)?.as_str().try_into()?;
processor.with_resolution(s);
resolution = s;
}
IGNORE_MISSING_NAME => {
processor.with_ignore_missing(yaml_bool(v, IGNORE_MISSING_NAME)?);
ignore_missing = yaml_bool(v, IGNORE_MISSING_NAME)?;
}
_ => {}
}
}
let builder = EpochProcessorBuilder {
fields,
resolution,
ignore_missing,
};
Ok(processor)
Ok(builder)
}
}
@@ -168,49 +200,23 @@ impl Processor for EpochProcessor {
self.ignore_missing
}
fn fields(&self) -> &Fields {
&self.fields
}
fn fields_mut(&mut self) -> &mut Fields {
&mut self.fields
}
fn output_keys(&self) -> HashSet<String> {
self.fields
.iter()
.map(|f| f.get_target_field().to_string())
.collect()
}
fn exec_field(&self, val: &Value, field: &Field) -> Result<Map, String> {
self.process_field(val, field)
}
fn exec_mut(&self, val: &mut Vec<Value>) -> Result<(), String> {
for field in self.fields.iter() {
let index = field.input_field.index;
let index = field.input_index();
match val.get(index) {
Some(Value::Null) | None => {
if !self.ignore_missing {
return Err(format!(
"{} processor: missing field: {}",
self.kind(),
field.get_field_name()
field.input_name()
));
}
}
Some(v) => {
// TODO(qtang): Let this method use the intermediate state collection directly.
let mut map = self.process_field(v, field)?;
field
.output_fields_index_mapping
.iter()
.for_each(|(k, output_index)| {
if let Some(v) = map.remove(k) {
val[*output_index] = v;
}
});
let timestamp = self.parse(v)?;
let output_index = field.output_index();
val[output_index] = Value::Timestamp(timestamp);
}
}
}
@@ -225,8 +231,10 @@ mod tests {
#[test]
fn test_parse_epoch() {
let mut processor = EpochProcessor::default();
processor.with_resolution(super::Resolution::Second);
let processor = EpochProcessor {
resolution: super::Resolution::Second,
..Default::default()
};
let values = [
Value::String("1573840000".into()),

View File

@@ -15,45 +15,43 @@
use ahash::HashSet;
use regex::Regex;
use crate::etl::field::{Field, Fields};
use crate::etl::field::{Fields, OneInputOneOutputField};
use crate::etl::processor::{
update_one_one_output_keys, yaml_bool, yaml_field, yaml_fields, yaml_string, Processor,
FIELDS_NAME, FIELD_NAME, IGNORE_MISSING_NAME, PATTERN_NAME,
yaml_bool, yaml_new_field, yaml_new_fields, yaml_string, Processor, ProcessorBuilder,
ProcessorKind, FIELDS_NAME, FIELD_NAME, IGNORE_MISSING_NAME, PATTERN_NAME,
};
use crate::etl::value::{Array, Map, Value};
use crate::etl::value::Value;
pub(crate) const PROCESSOR_GSUB: &str = "gsub";
const REPLACEMENT_NAME: &str = "replacement";
/// A processor that replaces all matches of a pattern in a string with a replacement; only string values and arrays of string values are supported
#[derive(Debug, Default)]
pub struct GsubProcessor {
pub struct GsubProcessorBuilder {
fields: Fields,
pattern: Option<Regex>,
replacement: Option<String>,
ignore_missing: bool,
}
impl GsubProcessor {
fn with_fields(&mut self, mut fields: Fields) {
update_one_one_output_keys(&mut fields);
self.fields = fields;
impl ProcessorBuilder for GsubProcessorBuilder {
fn output_keys(&self) -> HashSet<&str> {
self.fields
.iter()
.map(|f| f.target_or_input_field())
.collect()
}
fn with_ignore_missing(&mut self, ignore_missing: bool) {
self.ignore_missing = ignore_missing;
fn input_keys(&self) -> HashSet<&str> {
self.fields.iter().map(|f| f.input_field()).collect()
}
fn try_pattern(&mut self, pattern: &str) -> Result<(), String> {
self.pattern = Some(Regex::new(pattern).map_err(|e| e.to_string())?);
Ok(())
}
fn with_replacement(&mut self, replacement: impl Into<String>) {
self.replacement = Some(replacement.into());
fn build(self, intermediate_keys: &[String]) -> Result<ProcessorKind, String> {
self.build(intermediate_keys).map(ProcessorKind::Gsub)
}
}
impl GsubProcessorBuilder {
fn check(self) -> Result<Self, String> {
if self.pattern.is_none() {
return Err("pattern is required".to_string());
@@ -66,7 +64,49 @@ impl GsubProcessor {
Ok(self)
}
fn process_string_field(&self, val: &str, field: &Field) -> Result<Map, String> {
fn build(self, intermediate_keys: &[String]) -> Result<GsubProcessor, String> {
let mut real_fields = vec![];
for field in self.fields.into_iter() {
let input = OneInputOneOutputField::build(
"gsub",
intermediate_keys,
field.input_field(),
field.target_or_input_field(),
)?;
real_fields.push(input);
}
Ok(GsubProcessor {
fields: real_fields,
pattern: self.pattern,
replacement: self.replacement,
ignore_missing: self.ignore_missing,
})
}
}
/// A processor that replaces all matches of a pattern in a string with a replacement; only string values and arrays of string values are supported
#[derive(Debug, Default)]
pub struct GsubProcessor {
fields: Vec<OneInputOneOutputField>,
pattern: Option<Regex>,
replacement: Option<String>,
ignore_missing: bool,
}
impl GsubProcessor {
fn check(self) -> Result<Self, String> {
if self.pattern.is_none() {
return Err("pattern is required".to_string());
}
if self.replacement.is_none() {
return Err("replacement is required".to_string());
}
Ok(self)
}
fn process_string(&self, val: &str) -> Result<Value, String> {
let replacement = self.replacement.as_ref().unwrap();
let new_val = self
.pattern
@@ -76,42 +116,28 @@ impl GsubProcessor {
.to_string();
let val = Value::String(new_val);
let key = field.get_target_field();
Ok(Map::one(key, val))
Ok(val)
}
fn process_array_field(&self, arr: &Array, field: &Field) -> Result<Map, String> {
let key = field.get_target_field();
let re = self.pattern.as_ref().unwrap();
let replacement = self.replacement.as_ref().unwrap();
let mut result = Array::default();
for val in arr.iter() {
match val {
Value::String(haystack) => {
let new_val = re.replace_all(haystack, replacement).to_string();
result.push(Value::String(new_val));
}
_ => {
return Err(format!(
"{} processor: expect string or array string, but got {val:?}",
self.kind()
))
}
}
fn process(&self, val: &Value) -> Result<Value, String> {
match val {
Value::String(val) => self.process_string(val),
_ => Err(format!(
"{} processor: expect string or array string, but got {val:?}",
self.kind()
)),
}
Ok(Map::one(key, Value::Array(result)))
}
}
impl TryFrom<&yaml_rust::yaml::Hash> for GsubProcessor {
impl TryFrom<&yaml_rust::yaml::Hash> for GsubProcessorBuilder {
type Error = String;
fn try_from(value: &yaml_rust::yaml::Hash) -> Result<Self, Self::Error> {
let mut processor = GsubProcessor::default();
let mut fields = Fields::default();
let mut ignore_missing = false;
let mut pattern = None;
let mut replacement = None;
for (k, v) in value.iter() {
let key = k
@@ -119,27 +145,36 @@ impl TryFrom<&yaml_rust::yaml::Hash> for GsubProcessor {
.ok_or(format!("key must be a string, but got {k:?}"))?;
match key {
FIELD_NAME => {
processor.with_fields(Fields::one(yaml_field(v, FIELD_NAME)?));
fields = Fields::one(yaml_new_field(v, FIELD_NAME)?);
}
FIELDS_NAME => {
processor.with_fields(yaml_fields(v, FIELDS_NAME)?);
fields = yaml_new_fields(v, FIELDS_NAME)?;
}
PATTERN_NAME => {
processor.try_pattern(&yaml_string(v, PATTERN_NAME)?)?;
let pattern_str = yaml_string(v, PATTERN_NAME)?;
pattern = Some(Regex::new(&pattern_str).map_err(|e| e.to_string())?);
}
REPLACEMENT_NAME => {
processor.with_replacement(yaml_string(v, REPLACEMENT_NAME)?);
let replacement_str = yaml_string(v, REPLACEMENT_NAME)?;
replacement = Some(replacement_str);
}
IGNORE_MISSING_NAME => {
processor.with_ignore_missing(yaml_bool(v, IGNORE_MISSING_NAME)?);
ignore_missing = yaml_bool(v, IGNORE_MISSING_NAME)?;
}
_ => {}
}
}
processor.check()
let builder = GsubProcessorBuilder {
fields,
pattern,
replacement,
ignore_missing,
};
builder.check()
}
}
@@ -152,56 +187,23 @@ impl crate::etl::processor::Processor for GsubProcessor {
self.ignore_missing
}
fn fields(&self) -> &Fields {
&self.fields
}
fn fields_mut(&mut self) -> &mut Fields {
&mut self.fields
}
fn output_keys(&self) -> HashSet<String> {
self.fields
.iter()
.map(|f| f.get_target_field().to_string())
.collect()
}
fn exec_field(&self, val: &Value, field: &Field) -> Result<Map, String> {
match val {
Value::String(val) => self.process_string_field(val, field),
Value::Array(arr) => self.process_array_field(arr, field),
_ => Err(format!(
"{} processor: expect string or array string, but got {val:?}",
self.kind()
)),
}
}
fn exec_mut(&self, val: &mut Vec<Value>) -> Result<(), String> {
for field in self.fields.iter() {
let index = field.input_field.index;
let index = field.input_index();
match val.get(index) {
Some(Value::Null) | None => {
if !self.ignore_missing {
return Err(format!(
"{} processor: missing field: {}",
self.kind(),
field.get_field_name()
field.input_name()
));
}
}
Some(v) => {
// TODO(qtang): Let this method use the intermediate state collection directly.
let mut map = self.exec_field(v, field)?;
field
.output_fields_index_mapping
.iter()
.for_each(|(k, output_index)| {
if let Some(v) = map.remove(k) {
val[*output_index] = v;
}
});
let result = self.process(v)?;
let output_index = field.output_index();
val[output_index] = result;
}
}
}
@@ -211,55 +213,20 @@ impl crate::etl::processor::Processor for GsubProcessor {
#[cfg(test)]
mod tests {
use crate::etl::field::Field;
use crate::etl::processor::gsub::GsubProcessor;
use crate::etl::processor::Processor;
use crate::etl::value::{Map, Value};
use crate::etl::value::Value;
#[test]
fn test_string_value() {
let mut processor = GsubProcessor::default();
processor.try_pattern(r"\d+").unwrap();
processor.with_replacement("xxx");
let processor = GsubProcessor {
pattern: Some(regex::Regex::new(r"\d+").unwrap()),
replacement: Some("xxx".to_string()),
..Default::default()
};
let field = Field::new("message");
let val = Value::String("123".to_string());
let result = processor.exec_field(&val, &field).unwrap();
let result = processor.process(&val).unwrap();
assert_eq!(
result,
Map::one("message", Value::String("xxx".to_string()))
);
}
#[test]
fn test_array_string_value() {
let mut processor = GsubProcessor::default();
processor.try_pattern(r"\d+").unwrap();
processor.with_replacement("xxx");
let field = Field::new("message");
let val = Value::Array(
vec![
Value::String("123".to_string()),
Value::String("456".to_string()),
]
.into(),
);
let result = processor.exec_field(&val, &field).unwrap();
assert_eq!(
result,
Map::one(
"message",
Value::Array(
vec![
Value::String("xxx".to_string()),
Value::String("xxx".to_string())
]
.into()
)
)
);
assert_eq!(result, Value::String("xxx".to_string()));
}
}

View File

@@ -14,40 +14,78 @@
use ahash::HashSet;
use crate::etl::field::{Field, Fields};
use crate::etl::field::{Fields, OneInputOneOutputField};
use crate::etl::processor::{
update_one_one_output_keys, yaml_bool, yaml_field, yaml_fields, yaml_string, Processor,
FIELDS_NAME, FIELD_NAME, IGNORE_MISSING_NAME, SEPARATOR_NAME,
yaml_bool, yaml_new_field, yaml_new_fields, yaml_string, Processor, ProcessorBuilder,
ProcessorKind, FIELDS_NAME, FIELD_NAME, IGNORE_MISSING_NAME, SEPARATOR_NAME,
};
use crate::etl::value::{Array, Map, Value};
use crate::etl::value::{Array, Value};
pub(crate) const PROCESSOR_JOIN: &str = "join";
/// A processor to join each element of an array into a single string using a separator string between each element
#[derive(Debug, Default)]
pub struct JoinProcessor {
pub struct JoinProcessorBuilder {
fields: Fields,
separator: Option<String>,
ignore_missing: bool,
}
impl ProcessorBuilder for JoinProcessorBuilder {
fn output_keys(&self) -> HashSet<&str> {
self.fields
.iter()
.map(|f| f.target_or_input_field())
.collect()
}
fn input_keys(&self) -> HashSet<&str> {
self.fields.iter().map(|f| f.input_field()).collect()
}
fn build(self, intermediate_keys: &[String]) -> Result<ProcessorKind, String> {
self.build(intermediate_keys).map(ProcessorKind::Join)
}
}
impl JoinProcessorBuilder {
fn check(self) -> Result<Self, String> {
if self.separator.is_none() {
return Err("separator is required".to_string());
}
Ok(self)
}
pub fn build(self, intermediate_keys: &[String]) -> Result<JoinProcessor, String> {
let mut real_fields = vec![];
for field in self.fields.into_iter() {
let input = OneInputOneOutputField::build(
"join",
intermediate_keys,
field.input_field(),
field.target_or_input_field(),
)?;
real_fields.push(input);
}
Ok(JoinProcessor {
fields: real_fields,
separator: self.separator,
ignore_missing: self.ignore_missing,
})
}
}
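A hedged end-to-end sketch of how the builder/processor split is meant to be driven (the YAML literal, the key names tags/tags_joined, and the value layout are illustrative, not taken from the PR): the builder is parsed from pipeline YAML, bound to the pipeline-wide intermediate keys, and the resulting processor then mutates the positional value vector in place.

    let pipeline_str = r#"fields:
      - tags, tags_joined
    separator: '-'
    ignore_missing: false"#;
    let yaml = yaml_rust::YamlLoader::load_from_str(pipeline_str).unwrap().pop().unwrap();
    let builder = JoinProcessorBuilder::try_from(yaml.as_hash().unwrap()).unwrap();

    let intermediate_keys = vec!["tags".to_string(), "tags_joined".to_string()];
    let processor = builder.build(&intermediate_keys).unwrap();

    // one slot per intermediate key; exec_mut reads the input index and writes the output index
    // (assumes the Processor trait is in scope for `exec_mut`)
    let mut values = vec![
        Value::Array(vec![Value::String("a".to_string()), Value::String("b".to_string())].into()),
        Value::Null,
    ];
    processor.exec_mut(&mut values).unwrap();
    // values[1] now holds Value::String("a-b")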
/// A processor to join each element of an array into a single string using a separator string between each element
#[derive(Debug, Default)]
pub struct JoinProcessor {
fields: Vec<OneInputOneOutputField>,
separator: Option<String>,
ignore_missing: bool,
}
impl JoinProcessor {
fn with_fields(&mut self, mut fields: Fields) {
update_one_one_output_keys(&mut fields);
self.fields = fields;
}
fn with_separator(&mut self, separator: impl Into<String>) {
self.separator = Some(separator.into());
}
fn with_ignore_missing(&mut self, ignore_missing: bool) {
self.ignore_missing = ignore_missing;
}
fn process_field(&self, arr: &Array, field: &Field) -> Result<Map, String> {
let key = field.get_target_field();
fn process(&self, arr: &Array) -> Result<Value, String> {
let sep = self.separator.as_ref().unwrap();
let val = arr
.iter()
@@ -55,7 +93,7 @@ impl JoinProcessor {
.collect::<Vec<String>>()
.join(sep);
Ok(Map::one(key, Value::String(val)))
Ok(Value::String(val))
}
fn check(self) -> Result<Self, String> {
@@ -67,11 +105,13 @@ impl JoinProcessor {
}
}
impl TryFrom<&yaml_rust::yaml::Hash> for JoinProcessor {
impl TryFrom<&yaml_rust::yaml::Hash> for JoinProcessorBuilder {
type Error = String;
fn try_from(value: &yaml_rust::yaml::Hash) -> Result<Self, Self::Error> {
let mut processor = JoinProcessor::default();
let mut fields = Fields::default();
let mut separator = None;
let mut ignore_missing = false;
for (k, v) in value.iter() {
let key = k
@@ -79,30 +119,31 @@ impl TryFrom<&yaml_rust::yaml::Hash> for JoinProcessor {
.ok_or(format!("key must be a string, but got {k:?}"))?;
match key {
FIELD_NAME => {
processor.with_fields(Fields::one(yaml_field(v, FIELD_NAME)?));
fields = Fields::one(yaml_new_field(v, FIELD_NAME)?);
}
FIELDS_NAME => {
processor.with_fields(yaml_fields(v, FIELDS_NAME)?);
fields = yaml_new_fields(v, FIELDS_NAME)?;
}
SEPARATOR_NAME => {
processor.with_separator(yaml_string(v, SEPARATOR_NAME)?);
separator = Some(yaml_string(v, SEPARATOR_NAME)?);
}
IGNORE_MISSING_NAME => {
processor.with_ignore_missing(yaml_bool(v, IGNORE_MISSING_NAME)?);
ignore_missing = yaml_bool(v, IGNORE_MISSING_NAME)?;
}
_ => {}
}
}
processor.check()
let builder = JoinProcessorBuilder {
fields,
separator,
ignore_missing,
};
builder.check()
}
}
impl Processor for JoinProcessor {
fn fields(&self) -> &Fields {
&self.fields
}
fn kind(&self) -> &str {
PROCESSOR_JOIN
}
@@ -111,49 +152,21 @@ impl Processor for JoinProcessor {
self.ignore_missing
}
fn fields_mut(&mut self) -> &mut Fields {
&mut self.fields
}
fn output_keys(&self) -> HashSet<String> {
self.fields
.iter()
.map(|f| f.get_target_field().to_string())
.collect()
}
fn exec_field(&self, val: &Value, field: &Field) -> Result<Map, String> {
match val {
Value::Array(arr) => self.process_field(arr, field),
_ => Err(format!(
"{} processor: expect array value, but got {val:?}",
self.kind()
)),
}
}
fn exec_mut(&self, val: &mut Vec<Value>) -> Result<(), String> {
for field in self.fields.iter() {
let index = field.input_field.index;
let index = field.input_index();
match val.get(index) {
Some(Value::Array(arr)) => {
// TODO(qtang): Let this method use the intermediate state collection directly.
let mut map = self.process_field(arr, field)?;
field
.output_fields_index_mapping
.iter()
.for_each(|(k, output_index)| {
if let Some(v) = map.remove(k) {
val[*output_index] = v;
}
});
let result = self.process(arr)?;
let output_index = field.output_index();
val[output_index] = result;
}
Some(Value::Null) | None => {
if !self.ignore_missing {
return Err(format!(
"{} processor: missing field: {}",
self.kind(),
field.get_field_name()
field.input_name()
));
}
}
@@ -173,25 +186,22 @@ impl Processor for JoinProcessor {
#[cfg(test)]
mod tests {
use crate::etl::field::Field;
use crate::etl::processor::join::JoinProcessor;
use crate::etl::processor::Processor;
use crate::etl::value::{Map, Value};
use crate::etl::value::Value;
#[test]
fn test_join_processor() {
let mut processor = JoinProcessor::default();
processor.with_separator("-");
let processor = JoinProcessor {
separator: Some("-".to_string()),
..Default::default()
};
let field = Field::new("test");
let arr = Value::Array(
vec![
Value::String("a".to_string()),
Value::String("b".to_string()),
]
.into(),
);
let result = processor.exec_field(&arr, &field).unwrap();
assert_eq!(result, Map::one("test", Value::String("a-b".to_string())));
let arr = vec![
Value::String("a".to_string()),
Value::String("b".to_string()),
]
.into();
let result = processor.process(&arr).unwrap();
assert_eq!(result, Value::String("a-b".to_string()));
}
}

View File

@@ -14,12 +14,12 @@
use ahash::HashSet;
use crate::etl::field::{Field, Fields};
use crate::etl::field::{Fields, OneInputOneOutputField};
use crate::etl::processor::{
update_one_one_output_keys, yaml_bool, yaml_field, yaml_fields, yaml_string, Processor,
FIELDS_NAME, FIELD_NAME, IGNORE_MISSING_NAME, METHOD_NAME,
yaml_bool, yaml_new_field, yaml_new_fields, yaml_string, Processor, ProcessorBuilder,
ProcessorKind, FIELDS_NAME, FIELD_NAME, IGNORE_MISSING_NAME, METHOD_NAME,
};
use crate::etl::value::{Map, Value};
use crate::etl::value::Value;
pub(crate) const PROCESSOR_LETTER: &str = "letter";
@@ -54,29 +54,61 @@ impl std::str::FromStr for Method {
}
}
/// only supports string values
#[derive(Debug, Default)]
pub struct LetterProcessor {
pub struct LetterProcessorBuilder {
fields: Fields,
method: Method,
ignore_missing: bool,
}
impl ProcessorBuilder for LetterProcessorBuilder {
fn output_keys(&self) -> HashSet<&str> {
self.fields
.iter()
.map(|f| f.target_or_input_field())
.collect()
}
fn input_keys(&self) -> HashSet<&str> {
self.fields.iter().map(|f| f.input_field()).collect()
}
fn build(self, intermediate_keys: &[String]) -> Result<ProcessorKind, String> {
self.build(intermediate_keys).map(ProcessorKind::Letter)
}
}
impl LetterProcessorBuilder {
pub fn build(self, intermediate_keys: &[String]) -> Result<LetterProcessor, String> {
let mut real_fields = vec![];
for field in self.fields.into_iter() {
let input = OneInputOneOutputField::build(
"letter",
intermediate_keys,
field.input_field(),
field.target_or_input_field(),
)?;
real_fields.push(input);
}
Ok(LetterProcessor {
fields: real_fields,
method: self.method,
ignore_missing: self.ignore_missing,
})
}
}
/// only supports string values
#[derive(Debug, Default)]
pub struct LetterProcessor {
fields: Vec<OneInputOneOutputField>,
method: Method,
ignore_missing: bool,
}
impl LetterProcessor {
fn with_fields(&mut self, mut fields: Fields) {
update_one_one_output_keys(&mut fields);
self.fields = fields;
}
fn with_method(&mut self, method: Method) {
self.method = method;
}
fn with_ignore_missing(&mut self, ignore_missing: bool) {
self.ignore_missing = ignore_missing;
}
fn process_field(&self, val: &str, field: &Field) -> Result<Map, String> {
fn process_field(&self, val: &str) -> Result<Value, String> {
let processed = match self.method {
Method::Upper => val.to_uppercase(),
Method::Lower => val.to_lowercase(),
@@ -84,17 +116,17 @@ impl LetterProcessor {
};
let val = Value::String(processed);
let key = field.get_target_field();
Ok(Map::one(key, val))
Ok(val)
}
}
impl TryFrom<&yaml_rust::yaml::Hash> for LetterProcessor {
impl TryFrom<&yaml_rust::yaml::Hash> for LetterProcessorBuilder {
type Error = String;
fn try_from(value: &yaml_rust::yaml::Hash) -> Result<Self, Self::Error> {
let mut processor = LetterProcessor::default();
let mut fields = Fields::default();
let mut method = Method::Lower;
let mut ignore_missing = false;
for (k, v) in value.iter() {
let key = k
@@ -102,23 +134,26 @@ impl TryFrom<&yaml_rust::yaml::Hash> for LetterProcessor {
.ok_or(format!("key must be a string, but got {k:?}"))?;
match key {
FIELD_NAME => {
processor.with_fields(Fields::one(yaml_field(v, FIELD_NAME)?));
fields = Fields::one(yaml_new_field(v, FIELD_NAME)?);
}
FIELDS_NAME => {
processor.with_fields(yaml_fields(v, FIELDS_NAME)?);
fields = yaml_new_fields(v, FIELDS_NAME)?;
}
METHOD_NAME => {
let method = yaml_string(v, METHOD_NAME)?;
processor.with_method(method.parse()?);
method = yaml_string(v, METHOD_NAME)?.parse()?;
}
IGNORE_MISSING_NAME => {
processor.with_ignore_missing(yaml_bool(v, IGNORE_MISSING_NAME)?);
ignore_missing = yaml_bool(v, IGNORE_MISSING_NAME)?;
}
_ => {}
}
}
Ok(processor)
Ok(LetterProcessorBuilder {
fields,
method,
ignore_missing,
})
}
}
@@ -131,53 +166,21 @@ impl Processor for LetterProcessor {
self.ignore_missing
}
fn fields(&self) -> &Fields {
&self.fields
}
fn fields_mut(&mut self) -> &mut Fields {
&mut self.fields
}
fn output_keys(&self) -> HashSet<String> {
self.fields
.iter()
.map(|f| f.get_target_field().to_string())
.collect()
}
fn exec_field(&self, val: &Value, field: &Field) -> Result<Map, String> {
match val {
Value::String(val) => self.process_field(val, field),
_ => Err(format!(
"{} processor: expect string value, but got {val:?}",
self.kind()
)),
}
}
fn exec_mut(&self, val: &mut Vec<Value>) -> Result<(), String> {
for field in self.fields.iter() {
let index = field.input_field.index;
let index = field.input_index();
match val.get(index) {
Some(Value::String(s)) => {
// TODO(qtang): Let this method use the intermediate state collection directly.
let mut processed = self.process_field(s, field)?;
field
.output_fields_index_mapping
.iter()
.for_each(|(k, output_index)| {
if let Some(v) = processed.remove(k) {
val[*output_index] = v;
}
});
let result = self.process_field(s)?;
let (_, output_index) = field.output();
val[*output_index] = result;
}
Some(Value::Null) | None => {
if !self.ignore_missing {
return Err(format!(
"{} processor: missing field: {}",
self.kind(),
field.get_field_name()
&field.input().name
));
}
}
@@ -204,33 +207,36 @@ fn capitalize(s: &str) -> String {
#[cfg(test)]
mod tests {
use crate::etl::field::Fields;
use crate::etl::processor::letter::{LetterProcessor, Method};
use crate::etl::value::{Map, Value};
use crate::etl::value::Value;
#[test]
fn test_process() {
let field = "letter";
let ff: crate::etl::processor::Field = field.parse().unwrap();
let mut processor = LetterProcessor::default();
processor.with_fields(Fields::one(ff.clone()));
{
processor.with_method(Method::Upper);
let processed = processor.process_field("pipeline", &ff).unwrap();
assert_eq!(Map::one(field, Value::String("PIPELINE".into())), processed)
let processor = LetterProcessor {
method: Method::Upper,
..Default::default()
};
let processed = processor.process_field("pipeline").unwrap();
assert_eq!(Value::String("PIPELINE".into()), processed)
}
{
processor.with_method(Method::Lower);
let processed = processor.process_field("Pipeline", &ff).unwrap();
assert_eq!(Map::one(field, Value::String("pipeline".into())), processed)
let processor = LetterProcessor {
method: Method::Lower,
..Default::default()
};
let processed = processor.process_field("Pipeline").unwrap();
assert_eq!(Value::String("pipeline".into()), processed)
}
{
processor.with_method(Method::Capital);
let processed = processor.process_field("pipeline", &ff).unwrap();
assert_eq!(Map::one(field, Value::String("Pipeline".into())), processed)
let processor = LetterProcessor {
method: Method::Capital,
..Default::default()
};
let processed = processor.process_field("pipeline").unwrap();
assert_eq!(Value::String("Pipeline".into()), processed)
}
}
}

View File

@@ -18,16 +18,17 @@ const PATTERNS_NAME: &str = "patterns";
pub(crate) const PROCESSOR_REGEX: &str = "regex";
use ahash::HashSet;
use ahash::{HashSet, HashSetExt};
use lazy_static::lazy_static;
use regex::Regex;
use crate::etl::field::Fields;
use crate::etl::field::{Fields, InputFieldInfo, OneInputMultiOutputField};
use crate::etl::find_key_index;
use crate::etl::processor::{
yaml_bool, yaml_field, yaml_fields, yaml_string, yaml_strings, Field, Processor, FIELDS_NAME,
FIELD_NAME, IGNORE_MISSING_NAME, PATTERN_NAME,
yaml_bool, yaml_new_field, yaml_new_fields, yaml_string, yaml_strings, Processor,
ProcessorBuilder, ProcessorKind, FIELDS_NAME, FIELD_NAME, IGNORE_MISSING_NAME, PATTERN_NAME,
};
use crate::etl::value::{Map, Value};
use crate::etl::value::Value;
lazy_static! {
static ref GROUPS_NAME_REGEX: Regex = Regex::new(r"\(\?P?<([[:word:]]+)>.+?\)").unwrap();
@@ -40,6 +41,10 @@ fn get_regex_group_names(s: &str) -> Vec<String> {
.collect()
}
fn generate_key(prefix: &str, group: &str) -> String {
format!("{prefix}_{group}")
}
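This helper is what ties a field's target prefix to the regex capture-group names; for the `breadcrumbs` field used in the tests below, a pattern group named `parent` therefore lands in the intermediate key `breadcrumbs_parent`. Illustrative check only:

    // generate_key simply joins prefix and group with an underscore
    assert_eq!(generate_key("breadcrumbs", "parent"), "breadcrumbs_parent");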
#[derive(Debug)]
struct GroupRegex {
origin: String,
@@ -72,34 +77,29 @@ impl std::str::FromStr for GroupRegex {
}
}
/// only supports string values
/// if no value is found for a pattern, the target_field is ignored
#[derive(Debug, Default)]
pub struct RegexProcessor {
pub struct RegexProcessorBuilder {
fields: Fields,
patterns: Vec<GroupRegex>,
ignore_missing: bool,
output_keys: HashSet<String>,
}
impl RegexProcessor {
fn with_fields(&mut self, fields: Fields) {
self.fields = fields;
impl ProcessorBuilder for RegexProcessorBuilder {
fn output_keys(&self) -> HashSet<&str> {
self.output_keys.iter().map(|k| k.as_str()).collect()
}
fn try_with_patterns(&mut self, patterns: Vec<String>) -> Result<(), String> {
let mut rs = vec![];
for pattern in patterns {
let gr = pattern.parse()?;
rs.push(gr);
}
self.patterns = rs;
Ok(())
fn input_keys(&self) -> HashSet<&str> {
self.fields.iter().map(|f| f.input_field()).collect()
}
fn with_ignore_missing(&mut self, ignore_missing: bool) {
self.ignore_missing = ignore_missing;
fn build(self, intermediate_keys: &[String]) -> Result<ProcessorKind, String> {
self.build(intermediate_keys).map(ProcessorKind::Regex)
}
}
impl RegexProcessorBuilder {
fn check(self) -> Result<Self, String> {
if self.fields.is_empty() {
return Err(format!(
@@ -118,47 +118,78 @@ impl RegexProcessor {
Ok(self)
}
fn generate_key(prefix: &str, group: &str) -> String {
format!("{prefix}_{group}")
fn build_group_output_info(
group_regex: &GroupRegex,
om_field: &OneInputMultiOutputField,
intermediate_keys: &[String],
) -> Result<Vec<OutPutInfo>, String> {
group_regex
.groups
.iter()
.map(|g| {
let key = generate_key(om_field.target_prefix(), g);
let index = find_key_index(intermediate_keys, &key, "regex");
index.map(|index| OutPutInfo {
final_key: key,
group_name: g.to_string(),
index,
})
})
.collect::<Result<Vec<_>, String>>()
}
fn process_field(&self, val: &str, field: &Field, gr: &GroupRegex) -> Result<Map, String> {
let mut map = Map::default();
if let Some(captures) = gr.regex.captures(val) {
for group in &gr.groups {
if let Some(capture) = captures.name(group) {
let value = capture.as_str().to_string();
let prefix = field.get_target_field();
let key = Self::generate_key(prefix, group);
map.insert(key, Value::String(value));
}
}
}
Ok(map)
fn build_group_output_infos(
patterns: &[GroupRegex],
om_field: &OneInputMultiOutputField,
intermediate_keys: &[String],
) -> Result<Vec<Vec<OutPutInfo>>, String> {
patterns
.iter()
.map(|group_regex| {
Self::build_group_output_info(group_regex, om_field, intermediate_keys)
})
.collect::<Result<Vec<_>, String>>()
}
fn update_output_keys(&mut self) {
for field in self.fields.iter_mut() {
for gr in &self.patterns {
for group in &gr.groups {
field
.output_fields_index_mapping
.insert(Self::generate_key(field.get_target_field(), group), 0_usize);
}
}
fn build_output_info(
real_fields: &[OneInputMultiOutputField],
patterns: &[GroupRegex],
intermediate_keys: &[String],
) -> Result<RegexProcessorOutputInfo, String> {
let inner = real_fields
.iter()
.map(|om_field| Self::build_group_output_infos(patterns, om_field, intermediate_keys))
.collect::<Result<Vec<_>, String>>();
inner.map(|inner| RegexProcessorOutputInfo { inner })
}
fn build(self, intermediate_keys: &[String]) -> Result<RegexProcessor, String> {
let mut real_fields = vec![];
for field in self.fields.into_iter() {
let input_index = find_key_index(intermediate_keys, field.input_field(), "regex")?;
let input_field_info = InputFieldInfo::new(field.input_field(), input_index);
let input = OneInputMultiOutputField::new(input_field_info, field.target_field);
real_fields.push(input);
}
let output_info = Self::build_output_info(&real_fields, &self.patterns, intermediate_keys)?;
Ok(RegexProcessor {
// fields: Fields::one(Field::new("test".to_string())),
fields: real_fields,
patterns: self.patterns,
output_info,
ignore_missing: self.ignore_missing,
})
}
}
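`find_key_index` (imported at the top of this file) is what anchors both the builders and the output-info tables to positions in the pipeline's intermediate value vector. Its body is not part of this diff; a minimal sketch of the behaviour the calls above rely on could look like this (an assumption, not the crate's actual code):

    fn find_key_index(intermediate_keys: &[String], key: &str, kind: &str) -> Result<usize, String> {
        intermediate_keys
            .iter()
            .position(|k| k.as_str() == key)
            .ok_or_else(|| format!("{kind} processor: intermediate key '{key}' not found"))
    }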
impl TryFrom<&yaml_rust::yaml::Hash> for RegexProcessor {
impl TryFrom<&yaml_rust::yaml::Hash> for RegexProcessorBuilder {
type Error = String;
fn try_from(value: &yaml_rust::yaml::Hash) -> Result<Self, Self::Error> {
let mut processor = RegexProcessor::default();
let mut fields = Fields::default();
let mut patterns: Vec<GroupRegex> = vec![];
let mut ignore_missing = false;
for (k, v) in value.iter() {
let key = k
@@ -166,28 +197,113 @@ impl TryFrom<&yaml_rust::yaml::Hash> for RegexProcessor {
.ok_or(format!("key must be a string, but got {k:?}"))?;
match key {
FIELD_NAME => {
processor.with_fields(Fields::one(yaml_field(v, FIELD_NAME)?));
fields = Fields::one(yaml_new_field(v, FIELD_NAME)?);
}
FIELDS_NAME => {
processor.with_fields(yaml_fields(v, FIELDS_NAME)?);
fields = yaml_new_fields(v, FIELDS_NAME)?;
}
PATTERN_NAME => {
processor.try_with_patterns(vec![yaml_string(v, PATTERN_NAME)?])?;
let pattern = yaml_string(v, PATTERN_NAME)?;
let gr = pattern.parse()?;
patterns.push(gr);
}
PATTERNS_NAME => {
processor.try_with_patterns(yaml_strings(v, PATTERNS_NAME)?)?;
for pattern in yaml_strings(v, PATTERNS_NAME)? {
let gr = pattern.parse()?;
patterns.push(gr);
}
}
IGNORE_MISSING_NAME => {
processor.with_ignore_missing(yaml_bool(v, IGNORE_MISSING_NAME)?);
ignore_missing = yaml_bool(v, IGNORE_MISSING_NAME)?;
}
_ => {}
}
}
processor.check().map(|mut p| {
p.update_output_keys();
p
})
let pattern_output_keys = patterns
.iter()
.flat_map(|pattern| pattern.groups.iter())
.collect::<Vec<_>>();
let mut output_keys = HashSet::new();
for field in fields.iter() {
for x in pattern_output_keys.iter() {
output_keys.insert(generate_key(field.target_or_input_field(), x));
}
}
let processor_builder = RegexProcessorBuilder {
fields,
patterns,
ignore_missing,
output_keys,
};
processor_builder.check()
}
}
#[derive(Debug, Default)]
struct OutPutInfo {
final_key: String,
group_name: String,
index: usize,
}
#[derive(Debug, Default)]
struct RegexProcessorOutputInfo {
pub inner: Vec<Vec<Vec<OutPutInfo>>>,
}
impl RegexProcessorOutputInfo {
fn get_output_index(
&self,
field_index: usize,
pattern_index: usize,
group_index: usize,
) -> usize {
self.inner[field_index][pattern_index][group_index].index
}
}
/// only supports string values
/// if no value is found for a pattern, the target_field is ignored
#[derive(Debug, Default)]
pub struct RegexProcessor {
fields: Vec<OneInputMultiOutputField>,
output_info: RegexProcessorOutputInfo,
patterns: Vec<GroupRegex>,
ignore_missing: bool,
}
impl RegexProcessor {
fn try_with_patterns(&mut self, patterns: Vec<String>) -> Result<(), String> {
let mut rs = vec![];
for pattern in patterns {
let gr = pattern.parse()?;
rs.push(gr);
}
self.patterns = rs;
Ok(())
}
fn process(
&self,
val: &str,
gr: &GroupRegex,
index: (usize, usize),
) -> Result<Vec<(usize, Value)>, String> {
let mut result = Vec::new();
if let Some(captures) = gr.regex.captures(val) {
for (group_index, group) in gr.groups.iter().enumerate() {
if let Some(capture) = captures.name(group) {
let value = capture.as_str().to_string();
let index = self
.output_info
.get_output_index(index.0, index.1, group_index);
result.push((index, Value::String(value)));
}
}
}
Ok(result)
}
}
@@ -200,71 +316,40 @@ impl Processor for RegexProcessor {
self.ignore_missing
}
fn fields(&self) -> &Fields {
&self.fields
}
fn fields_mut(&mut self) -> &mut Fields {
&mut self.fields
}
fn output_keys(&self) -> HashSet<String> {
self.fields
.iter()
.flat_map(|f| {
self.patterns.iter().flat_map(move |p| {
p.groups
.iter()
.map(move |g| Self::generate_key(&f.input_field.name, g))
})
})
.collect()
}
fn exec_field(&self, val: &Value, field: &Field) -> Result<Map, String> {
match val {
Value::String(val) => {
let mut map = Map::default();
for gr in &self.patterns {
let m = self.process_field(val, field, gr)?;
map.extend(m);
}
Ok(map)
}
_ => Err(format!(
"{} processor: expect string value, but got {val:?}",
self.kind()
)),
}
}
fn exec_mut(&self, val: &mut Vec<Value>) -> Result<(), String> {
for field in self.fields.iter() {
let index = field.input_field.index;
for (field_index, field) in self.fields.iter().enumerate() {
let index = field.input_index();
let mut result_list = None;
match val.get(index) {
Some(Value::String(s)) => {
let mut map = Map::default();
for gr in &self.patterns {
// TODO(qtang): Let this method use the intermediate state collection directly.
let m = self.process_field(s, field, gr)?;
map.extend(m);
}
field
.output_fields_index_mapping
.iter()
.for_each(|(k, output_index)| {
if let Some(v) = map.remove(k) {
val[*output_index] = v;
// Writing into `val` directly inside this loop hits a borrow checker error:
// for (gr_index, gr) in self.patterns.iter().enumerate() {
//     let result_list = self.process(s.as_str(), gr, (field_index, gr_index))?;
//     for (output_index, result) in result_list {
//         // cannot borrow `*val` as mutable because it is also borrowed as immutable
//         val[output_index] = result;
//     }
// }
for (gr_index, gr) in self.patterns.iter().enumerate() {
let result = self.process(s.as_str(), gr, (field_index, gr_index))?;
if !result.is_empty() {
match result_list.as_mut() {
None => {
result_list = Some(result);
}
Some(result_list) => {
result_list.extend(result);
}
}
});
}
}
}
Some(Value::Null) | None => {
if !self.ignore_missing {
return Err(format!(
"{} processor: missing field: {}",
self.kind(),
field.get_field_name()
field.input_name()
));
}
}
@@ -275,6 +360,15 @@ impl Processor for RegexProcessor {
));
}
}
// apply the buffered writes here, after the immutable borrow of `val` has ended
match result_list {
None => {}
Some(result_list) => {
for (output_index, result) in result_list {
val[output_index] = result;
}
}
}
}
Ok(())
@@ -282,37 +376,42 @@ impl Processor for RegexProcessor {
}
#[cfg(test)]
mod tests {
use ahash::{HashMap, HashMapExt};
use itertools::Itertools;
use super::RegexProcessor;
use crate::etl::field::Fields;
use crate::etl::processor::Processor;
use crate::etl::processor::regex::RegexProcessorBuilder;
use crate::etl::value::{Map, Value};
#[test]
fn test_simple_parse() {
let mut processor = RegexProcessor::default();
let pipeline_str = r#"fields: ["a"]
patterns: ['(?<ar>\d)']
ignore_missing: false"#;
let processor_yaml = yaml_rust::YamlLoader::load_from_str(pipeline_str)
.unwrap()
.pop()
.unwrap();
let processor_yaml_hash = processor_yaml.as_hash().unwrap();
let builder = RegexProcessorBuilder::try_from(processor_yaml_hash).unwrap();
let intermediate_keys = ["a".to_string(), "a_ar".to_string()];
let processor = builder.build(&intermediate_keys).unwrap();
// single field (with prefix), multiple patterns
let f = ["a"].iter().map(|f| f.parse().unwrap()).collect();
processor.with_fields(Fields::new(f).unwrap());
let ar = "(?<ar>\\d)";
let result = processor
.process("123", &processor.patterns[0], (0, 0))
.unwrap()
.into_iter()
.map(|(k, v)| (intermediate_keys[k].clone(), v))
.collect();
let patterns = [ar].iter().map(|p| p.to_string()).collect();
processor.try_with_patterns(patterns).unwrap();
let mut map = Map::default();
map.insert("a", Value::String("123".to_string()));
processor.exec_map(&mut map).unwrap();
let map = Map { values: result };
let v = Map {
values: vec![
("a_ar".to_string(), Value::String("1".to_string())),
("a".to_string(), Value::String("123".to_string())),
]
.into_iter()
.collect(),
values: vec![("a_ar".to_string(), Value::String("1".to_string()))]
.into_iter()
.collect(),
};
assert_eq!(v, map);
@@ -320,17 +419,14 @@ mod tests {
#[test]
fn test_process() {
let mut processor = RegexProcessor::default();
let cc = "[c=c,n=US_CA_SANJOSE,o=55155]";
let cg = "[a=12.34.567.89,b=12345678,c=g,n=US_CA_SANJOSE,o=20940]";
let co = "[a=987.654.321.09,c=o]";
let cp = "[c=p,n=US_CA_SANJOSE,o=55155]";
let cw = "[c=w,n=US_CA_SANJOSE,o=55155]";
let breadcrumbs = Value::String([cc, cg, co, cp, cw].iter().join(","));
let breadcrumbs_str = [cc, cg, co, cp, cw].iter().join(",");
let values = [
("breadcrumbs", breadcrumbs.clone()),
("breadcrumbs_parent", Value::String(cc.to_string())),
("breadcrumbs_edge", Value::String(cg.to_string())),
("breadcrumbs_origin", Value::String(co.to_string())),
@@ -340,61 +436,141 @@ mod tests {
.into_iter()
.map(|(k, v)| (k.to_string(), v))
.collect();
let mut temporary_map = Map { values };
let temporary_map = Map { values };
{
// single field (with prefix), multiple patterns
let ff = ["breadcrumbs, breadcrumbs"]
.iter()
.map(|f| f.parse().unwrap())
.collect();
processor.with_fields(Fields::new(ff).unwrap());
let ccr = "(?<parent>\\[[^\\[]*c=c[^\\]]*\\])";
let cgr = "(?<edge>\\[[^\\[]*c=g[^\\]]*\\])";
let cor = "(?<origin>\\[[^\\[]*c=o[^\\]]*\\])";
let cpr = "(?<peer>\\[[^\\[]*c=p[^\\]]*\\])";
let cwr = "(?<wrapper>\\[[^\\[]*c=w[^\\]]*\\])";
let patterns = [ccr, cgr, cor, cpr, cwr]
.iter()
.map(|p| p.to_string())
.collect();
processor.try_with_patterns(patterns).unwrap();
let pipeline_str = r#"fields: ["breadcrumbs"]
patterns:
- '(?<parent>\[[^\[]*c=c[^\]]*\])'
- '(?<edge>\[[^\[]*c=g[^\]]*\])'
- '(?<origin>\[[^\[]*c=o[^\]]*\])'
- '(?<peer>\[[^\[]*c=p[^\]]*\])'
- '(?<wrapper>\[[^\[]*c=w[^\]]*\])'
ignore_missing: false"#;
let mut map = Map::default();
map.insert("breadcrumbs", breadcrumbs.clone());
processor.exec_map(&mut map).unwrap();
assert_eq!(map, temporary_map);
let processor_yaml = yaml_rust::YamlLoader::load_from_str(pipeline_str)
.unwrap()
.pop()
.unwrap();
let processor_yaml_hash = processor_yaml.as_hash().unwrap();
let builder = RegexProcessorBuilder::try_from(processor_yaml_hash).unwrap();
let intermediate_keys = [
"breadcrumbs",
"breadcrumbs_parent",
"breadcrumbs_edge",
"breadcrumbs_origin",
"breadcrumbs_peer",
"breadcrumbs_wrapper",
]
.iter()
.map(|k| k.to_string())
.collect_vec();
let processor = builder.build(&intermediate_keys).unwrap();
let mut result = HashMap::new();
for (index, pattern) in processor.patterns.iter().enumerate() {
let r = processor
.process(&breadcrumbs_str, pattern, (0, index))
.unwrap()
.into_iter()
.map(|(k, v)| (intermediate_keys[k].clone(), v))
.collect::<HashMap<_, _>>();
result.extend(r);
}
let map = Map { values: result };
assert_eq!(temporary_map, map);
}
{
// multiple fields (with prefix), multiple patterns
let ff = [
"breadcrumbs_parent, parent",
"breadcrumbs_edge, edge",
"breadcrumbs_origin, origin",
"breadcrumbs_peer, peer",
"breadcrumbs_wrapper, wrapper",
]
.iter()
.map(|f| f.parse().unwrap())
.collect();
processor.with_fields(Fields::new(ff).unwrap());
let patterns = [
"a=(?<ip>[^,\\]]+)",
"b=(?<request_id>[^,\\]]+)",
"k=(?<request_end_time>[^,\\]]+)",
"l=(?<turn_around_time>[^,\\]]+)",
"m=(?<dns_lookup_time>[^,\\]]+)",
"n=(?<geo>[^,\\]]+)",
"o=(?<asn>[^,\\]]+)",
let pipeline_str = r#"fields:
- breadcrumbs_parent, parent
- breadcrumbs_edge, edge
- breadcrumbs_origin, origin
- breadcrumbs_peer, peer
- breadcrumbs_wrapper, wrapper
patterns:
- 'a=(?<ip>[^,\]]+)'
- 'b=(?<request_id>[^,\]]+)'
- 'k=(?<request_end_time>[^,\]]+)'
- 'l=(?<turn_around_time>[^,\]]+)'
- 'm=(?<dns_lookup_time>[^,\]]+)'
- 'n=(?<geo>[^,\]]+)'
- 'o=(?<asn>[^,\]]+)'
ignore_missing: false"#;
let processor_yaml = yaml_rust::YamlLoader::load_from_str(pipeline_str)
.unwrap()
.pop()
.unwrap();
let processor_yaml_hash = processor_yaml.as_hash().unwrap();
let builder = RegexProcessorBuilder::try_from(processor_yaml_hash).unwrap();
let intermediate_keys = [
"breadcrumbs_parent",
"breadcrumbs_edge",
"breadcrumbs_origin",
"breadcrumbs_peer",
"breadcrumbs_wrapper",
"edge_ip",
"edge_request_id",
"edge_request_end_time",
"edge_turn_around_time",
"edge_dns_lookup_time",
"edge_geo",
"edge_asn",
"origin_ip",
"origin_request_id",
"origin_request_end_time",
"origin_turn_around_time",
"origin_dns_lookup_time",
"origin_geo",
"origin_asn",
"peer_ip",
"peer_request_id",
"peer_request_end_time",
"peer_turn_around_time",
"peer_dns_lookup_time",
"peer_geo",
"peer_asn",
"parent_ip",
"parent_request_id",
"parent_request_end_time",
"parent_turn_around_time",
"parent_dns_lookup_time",
"parent_geo",
"parent_asn",
"wrapper_ip",
"wrapper_request_id",
"wrapper_request_end_time",
"wrapper_turn_around_time",
"wrapper_dns_lookup_time",
"wrapper_geo",
"wrapper_asn",
]
.iter()
.map(|p| p.to_string())
.collect();
processor.try_with_patterns(patterns).unwrap();
.map(|k| k.to_string())
.collect_vec();
let processor = builder.build(&intermediate_keys).unwrap();
let mut result = HashMap::new();
for (field_index, field) in processor.fields.iter().enumerate() {
for (pattern_index, pattern) in processor.patterns.iter().enumerate() {
let s = temporary_map
.get(field.input_name())
.unwrap()
.to_str_value();
let r = processor
.process(&s, pattern, (field_index, pattern_index))
.unwrap()
.into_iter()
.map(|(k, v)| (intermediate_keys[k].clone(), v))
.collect::<HashMap<_, _>>();
result.extend(r);
}
}
let new_values = vec![
("edge_ip", Value::String("12.34.567.89".to_string())),
@@ -413,11 +589,7 @@ mod tests {
.map(|(k, v)| (k.to_string(), v))
.collect();
let mut expected_map = temporary_map.clone();
processor.exec_map(&mut temporary_map).unwrap();
expected_map.extend(Map { values: new_values });
assert_eq!(expected_map, temporary_map);
assert_eq!(result, new_values);
}
}
}

View File

@@ -19,18 +19,17 @@ use chrono::{DateTime, NaiveDateTime};
use chrono_tz::Tz;
use lazy_static::lazy_static;
use super::yaml_strings;
use crate::etl::field::{Field, Fields};
use crate::etl::field::{Fields, OneInputOneOutputField};
use crate::etl::processor::{
update_one_one_output_keys, yaml_bool, yaml_field, yaml_fields, yaml_string, Processor,
FIELDS_NAME, FIELD_NAME, IGNORE_MISSING_NAME,
yaml_bool, yaml_new_field, yaml_new_fields, yaml_string, yaml_strings, Processor,
ProcessorBuilder, ProcessorKind, FIELDS_NAME, FIELD_NAME, IGNORE_MISSING_NAME,
};
use crate::etl::value::time::{
MICROSECOND_RESOLUTION, MICRO_RESOLUTION, MILLISECOND_RESOLUTION, MILLI_RESOLUTION,
MS_RESOLUTION, NANOSECOND_RESOLUTION, NANO_RESOLUTION, NS_RESOLUTION, SECOND_RESOLUTION,
SEC_RESOLUTION, S_RESOLUTION, US_RESOLUTION,
};
use crate::etl::value::{Map, Timestamp, Value};
use crate::etl::value::{Timestamp, Value};
pub(crate) const PROCESSOR_TIMESTAMP: &str = "timestamp";
const RESOLUTION_NAME: &str = "resolution";
@@ -108,10 +107,56 @@ impl std::ops::Deref for Formats {
}
}
#[derive(Debug)]
pub struct TimestampProcessorBuilder {
fields: Fields,
formats: Formats,
resolution: Resolution,
ignore_missing: bool,
}
impl ProcessorBuilder for TimestampProcessorBuilder {
fn output_keys(&self) -> HashSet<&str> {
self.fields
.iter()
.map(|f| f.target_or_input_field())
.collect()
}
fn input_keys(&self) -> HashSet<&str> {
self.fields.iter().map(|f| f.input_field()).collect()
}
fn build(self, intermediate_keys: &[String]) -> Result<ProcessorKind, String> {
self.build(intermediate_keys).map(ProcessorKind::Timestamp)
}
}
impl TimestampProcessorBuilder {
pub fn build(self, intermediate_keys: &[String]) -> Result<TimestampProcessor, String> {
let mut real_fields = vec![];
for field in self.fields.into_iter() {
let input = OneInputOneOutputField::build(
"timestamp",
intermediate_keys,
field.input_field(),
field.target_or_input_field(),
)?;
real_fields.push(input);
}
Ok(TimestampProcessor {
fields: real_fields,
formats: self.formats,
resolution: self.resolution,
ignore_missing: self.ignore_missing,
})
}
}
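The doc comment on `try_parse` below describes a two-stage strategy: formats that carry an offset parse directly, otherwise the naive datetime is interpreted in the configured timezone. A minimal sketch of that idea, using the chrono APIs already imported by this file (illustrative, not the crate's exact implementation; the helper name is made up):

    use chrono::{DateTime, NaiveDateTime, TimeZone};
    use chrono_tz::Tz;

    fn parse_seconds(val: &str, fmt: &str, tz: Tz) -> Option<i64> {
        // a format with an explicit offset parses on its own
        if let Ok(dt) = DateTime::parse_from_str(val, fmt) {
            return Some(dt.timestamp());
        }
        // otherwise interpret the naive datetime in the configured timezone
        let naive = NaiveDateTime::parse_from_str(val, fmt).ok()?;
        tz.from_local_datetime(&naive).single().map(|dt| dt.timestamp())
    }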
/// supports string, integer, float, time, and epoch values
#[derive(Debug, Default)]
pub struct TimestampProcessor {
fields: Fields,
fields: Vec<OneInputOneOutputField>,
formats: Formats,
resolution: Resolution,
ignore_missing: bool,
@@ -123,29 +168,6 @@ pub struct TimestampProcessor {
}
impl TimestampProcessor {
fn with_fields(&mut self, mut fields: Fields) {
update_one_one_output_keys(&mut fields);
self.fields = fields
}
fn with_resolution(&mut self, resolution: Resolution) {
self.resolution = resolution;
}
fn with_formats(&mut self, v: Option<Vec<(Arc<String>, Tz)>>) {
let v = match v {
Some(v) if !v.is_empty() => v,
_ => DEFAULT_FORMATS.clone(),
};
let formats = Formats::new(v);
self.formats = formats;
}
fn with_ignore_missing(&mut self, ignore_missing: bool) {
self.ignore_missing = ignore_missing;
}
/// try to parse val with timezone first, if failed, parse without timezone
fn try_parse(val: &str, fmt: &str, tz: Tz) -> Result<i64, String> {
if let Ok(dt) = DateTime::parse_from_str(val, fmt) {
@@ -212,12 +234,6 @@ impl TimestampProcessor {
Resolution::Nano => Ok(Timestamp::Nanosecond(t)),
}
}
fn process_field(&self, val: &Value, field: &Field) -> Result<Map, String> {
let key = field.get_target_field();
Ok(Map::one(key, Value::Timestamp(self.parse(val)?)))
}
}
fn parse_formats(yaml: &yaml_rust::yaml::Yaml) -> Result<Vec<(Arc<String>, Tz)>, String> {
@@ -250,11 +266,14 @@ fn parse_formats(yaml: &yaml_rust::yaml::Yaml) -> Result<Vec<(Arc<String>, Tz)>,
};
}
impl TryFrom<&yaml_rust::yaml::Hash> for TimestampProcessor {
impl TryFrom<&yaml_rust::yaml::Hash> for TimestampProcessorBuilder {
type Error = String;
fn try_from(hash: &yaml_rust::yaml::Hash) -> Result<Self, Self::Error> {
let mut processor = TimestampProcessor::default();
let mut fields = Fields::default();
let mut formats = Formats::default();
let mut resolution = Resolution::default();
let mut ignore_missing = false;
for (k, v) in hash {
let key = k
@@ -263,28 +282,33 @@ impl TryFrom<&yaml_rust::yaml::Hash> for TimestampProcessor {
match key {
FIELD_NAME => {
processor.with_fields(Fields::one(yaml_field(v, FIELD_NAME)?));
fields = Fields::one(yaml_new_field(v, FIELD_NAME)?);
}
FIELDS_NAME => {
processor.with_fields(yaml_fields(v, FIELDS_NAME)?);
fields = yaml_new_fields(v, FIELDS_NAME)?;
}
FORMATS_NAME => {
let formats = parse_formats(v)?;
processor.with_formats(Some(formats));
let formats_vec = parse_formats(v)?;
formats = Formats::new(formats_vec);
}
RESOLUTION_NAME => {
let s = yaml_string(v, RESOLUTION_NAME)?.as_str().try_into()?;
processor.with_resolution(s);
resolution = yaml_string(v, RESOLUTION_NAME)?.as_str().try_into()?;
}
IGNORE_MISSING_NAME => {
processor.with_ignore_missing(yaml_bool(v, IGNORE_MISSING_NAME)?);
ignore_missing = yaml_bool(v, IGNORE_MISSING_NAME)?;
}
_ => {}
}
}
Ok(processor)
let processor_builder = TimestampProcessorBuilder {
fields,
formats,
resolution,
ignore_missing,
};
Ok(processor_builder)
}
}
@@ -297,49 +321,23 @@ impl Processor for TimestampProcessor {
self.ignore_missing
}
fn fields(&self) -> &Fields {
&self.fields
}
fn fields_mut(&mut self) -> &mut Fields {
&mut self.fields
}
fn output_keys(&self) -> HashSet<String> {
self.fields
.iter()
.map(|f| f.get_target_field().to_string())
.collect()
}
fn exec_field(&self, val: &Value, field: &Field) -> Result<Map, String> {
self.process_field(val, field)
}
fn exec_mut(&self, val: &mut Vec<Value>) -> Result<(), String> {
for field in self.fields.iter() {
let index = field.input_field.index;
let index = field.input().index;
match val.get(index) {
Some(Value::Null) | None => {
if !self.ignore_missing {
return Err(format!(
"{} processor: missing field: {}",
self.kind(),
field.get_field_name()
&field.input().name
));
}
}
Some(v) => {
// TODO(qtang): Let this method use the intermediate state collection directly.
let mut map = self.process_field(v, field)?;
field
.output_fields_index_mapping
.iter()
.for_each(|(k, output_index)| {
if let Some(v) = map.remove(k) {
val[*output_index] = v;
}
});
let result = self.parse(v)?;
let (_, index) = field.output();
val[*index] = Value::Timestamp(result);
}
}
}
@@ -351,9 +349,18 @@ impl Processor for TimestampProcessor {
mod tests {
use yaml_rust::YamlLoader;
use super::TimestampProcessor;
use super::{TimestampProcessor, TimestampProcessorBuilder};
use crate::etl::value::{Timestamp, Value};
fn builder_to_native_processor(builder: TimestampProcessorBuilder) -> TimestampProcessor {
TimestampProcessor {
fields: vec![],
formats: builder.formats,
resolution: builder.resolution,
ignore_missing: builder.ignore_missing,
}
}
#[test]
fn test_parse_epoch() {
let processor_yaml_str = r#"fields:
@@ -367,7 +374,9 @@ formats:
"#;
let yaml = &YamlLoader::load_from_str(processor_yaml_str).unwrap()[0];
let timestamp_yaml = yaml.as_hash().unwrap();
let processor = TimestampProcessor::try_from(timestamp_yaml).unwrap();
let processor = builder_to_native_processor(
TimestampProcessorBuilder::try_from(timestamp_yaml).unwrap(),
);
let values = [
(
@@ -419,7 +428,9 @@ formats:
"#;
let yaml = &YamlLoader::load_from_str(processor_yaml_str).unwrap()[0];
let timestamp_yaml = yaml.as_hash().unwrap();
let processor = TimestampProcessor::try_from(timestamp_yaml).unwrap();
let processor = builder_to_native_processor(
TimestampProcessorBuilder::try_from(timestamp_yaml).unwrap(),
);
let values: Vec<&str> = vec![
"2014-5-17T12:34:56",

View File

@@ -15,12 +15,12 @@
use ahash::HashSet;
use urlencoding::{decode, encode};
use crate::etl::field::{Field, Fields};
use crate::etl::field::{Fields, OneInputOneOutputField};
use crate::etl::processor::{
yaml_bool, yaml_field, yaml_fields, yaml_string, FIELDS_NAME, FIELD_NAME, IGNORE_MISSING_NAME,
METHOD_NAME,
yaml_bool, yaml_new_field, yaml_new_fields, yaml_string, ProcessorBuilder, ProcessorKind,
FIELDS_NAME, FIELD_NAME, IGNORE_MISSING_NAME, METHOD_NAME,
};
use crate::etl::value::{Map, Value};
use crate::etl::value::Value;
pub(crate) const PROCESSOR_URL_ENCODING: &str = "urlencoding";
@@ -52,54 +52,76 @@ impl std::str::FromStr for Method {
}
}
/// only supports string values
#[derive(Debug, Default)]
pub struct UrlEncodingProcessor {
pub struct UrlEncodingProcessorBuilder {
fields: Fields,
method: Method,
ignore_missing: bool,
}
impl ProcessorBuilder for UrlEncodingProcessorBuilder {
fn output_keys(&self) -> HashSet<&str> {
self.fields
.iter()
.map(|f| f.target_or_input_field())
.collect()
}
fn input_keys(&self) -> HashSet<&str> {
self.fields.iter().map(|f| f.input_field()).collect()
}
fn build(self, intermediate_keys: &[String]) -> Result<ProcessorKind, String> {
self.build(intermediate_keys)
.map(ProcessorKind::UrlEncoding)
}
}
impl UrlEncodingProcessorBuilder {
fn build(self, intermediate_keys: &[String]) -> Result<UrlEncodingProcessor, String> {
let mut real_fields = vec![];
for field in self.fields.into_iter() {
let input = OneInputOneOutputField::build(
"urlencoding",
intermediate_keys,
field.input_field(),
field.target_or_input_field(),
)?;
real_fields.push(input);
}
Ok(UrlEncodingProcessor {
fields: real_fields,
method: self.method,
ignore_missing: self.ignore_missing,
})
}
}
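`process_field` below delegates directly to the `urlencoding` crate imported at the top of the file; a quick round-trip using the same strings as the test at the bottom of this file (illustrative only):

    let decoded = "//BC/[a=6.7.8.9,c=g,k=0,l=1]";
    let encoded = urlencoding::encode(decoded).to_string();
    assert_eq!(encoded, "%2F%2FBC%2F%5Ba%3D6.7.8.9%2Cc%3Dg%2Ck%3D0%2Cl%3D1%5D");
    assert_eq!(urlencoding::decode(&encoded).unwrap().into_owned(), decoded);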
/// only supports string values
#[derive(Debug, Default)]
pub struct UrlEncodingProcessor {
fields: Vec<OneInputOneOutputField>,
method: Method,
ignore_missing: bool,
}
impl UrlEncodingProcessor {
fn with_fields(&mut self, mut fields: Fields) {
Self::update_output_keys(&mut fields);
self.fields = fields;
}
fn with_ignore_missing(&mut self, ignore_missing: bool) {
self.ignore_missing = ignore_missing;
}
fn with_method(&mut self, method: Method) {
self.method = method;
}
fn process_field(&self, val: &str, field: &Field) -> Result<Map, String> {
fn process_field(&self, val: &str) -> Result<Value, String> {
let processed = match self.method {
Method::Encode => encode(val).to_string(),
Method::Decode => decode(val).map_err(|e| e.to_string())?.into_owned(),
};
let val = Value::String(processed);
let key = field.get_target_field();
Ok(Map::one(key, val))
}
fn update_output_keys(fields: &mut Fields) {
for field in fields.iter_mut() {
field
.output_fields_index_mapping
.insert(field.get_target_field().to_string(), 0_usize);
}
Ok(Value::String(processed))
}
}
impl TryFrom<&yaml_rust::yaml::Hash> for UrlEncodingProcessor {
impl TryFrom<&yaml_rust::yaml::Hash> for UrlEncodingProcessorBuilder {
type Error = String;
fn try_from(value: &yaml_rust::yaml::Hash) -> Result<Self, Self::Error> {
let mut processor = UrlEncodingProcessor::default();
let mut fields = Fields::default();
let mut method = Method::Decode;
let mut ignore_missing = false;
for (k, v) in value.iter() {
let key = k
@@ -107,24 +129,29 @@ impl TryFrom<&yaml_rust::yaml::Hash> for UrlEncodingProcessor {
.ok_or(format!("key must be a string, but got {k:?}"))?;
match key {
FIELD_NAME => {
processor.with_fields(Fields::one(yaml_field(v, FIELD_NAME)?));
fields = Fields::one(yaml_new_field(v, FIELD_NAME)?);
}
FIELDS_NAME => {
processor.with_fields(yaml_fields(v, FIELDS_NAME)?);
fields = yaml_new_fields(v, FIELDS_NAME)?;
}
IGNORE_MISSING_NAME => {
processor.with_ignore_missing(yaml_bool(v, IGNORE_MISSING_NAME)?);
ignore_missing = yaml_bool(v, IGNORE_MISSING_NAME)?;
}
METHOD_NAME => {
let method = yaml_string(v, METHOD_NAME)?;
processor.with_method(method.parse()?);
let method_str = yaml_string(v, METHOD_NAME)?;
method = method_str.parse()?;
}
_ => {}
}
}
let processor = UrlEncodingProcessorBuilder {
fields,
method,
ignore_missing,
};
Ok(processor)
}
@@ -139,52 +166,21 @@ impl crate::etl::processor::Processor for UrlEncodingProcessor {
self.ignore_missing
}
fn fields(&self) -> &Fields {
&self.fields
}
fn fields_mut(&mut self) -> &mut Fields {
&mut self.fields
}
fn output_keys(&self) -> HashSet<String> {
self.fields
.iter()
.map(|f| f.get_target_field().to_string())
.collect()
}
fn exec_field(&self, val: &Value, field: &Field) -> Result<Map, String> {
match val {
Value::String(val) => self.process_field(val, field),
_ => Err(format!(
"{} processor: expect string value, but got {val:?}",
self.kind()
)),
}
}
fn exec_mut(&self, val: &mut Vec<Value>) -> Result<(), String> {
for field in self.fields.iter() {
let index = field.input_field.index;
let index = field.input_index();
match val.get(index) {
Some(Value::String(s)) => {
let mut map = self.process_field(s, field)?;
field
.output_fields_index_mapping
.iter()
.for_each(|(k, output_index)| {
if let Some(v) = map.remove(k) {
val[*output_index] = v;
}
});
let result = self.process_field(s)?;
let output_index = field.output_index();
val[output_index] = result;
}
Some(Value::Null) | None => {
if !self.ignore_missing {
return Err(format!(
"{} processor: missing field: {}",
self.kind(),
field.get_field_name()
field.output_name()
));
}
}
@@ -202,29 +198,28 @@ impl crate::etl::processor::Processor for UrlEncodingProcessor {
#[cfg(test)]
mod tests {
use crate::etl::field::{Field, Fields};
use crate::etl::processor::urlencoding::UrlEncodingProcessor;
use crate::etl::value::{Map, Value};
use crate::etl::value::Value;
#[test]
fn test_decode_url() {
let field = "url";
let ff: Field = field.parse().unwrap();
let decoded = "//BC/[a=6.7.8.9,c=g,k=0,l=1]";
let encoded = "%2F%2FBC%2F%5Ba%3D6.7.8.9%2Cc%3Dg%2Ck%3D0%2Cl%3D1%5D";
let mut processor = UrlEncodingProcessor::default();
processor.with_fields(Fields::one(ff.clone()));
{
let result = processor.process_field(encoded, &ff).unwrap();
assert_eq!(Map::one(field, Value::String(decoded.into())), result)
let processor = UrlEncodingProcessor::default();
let result = processor.process_field(encoded).unwrap();
assert_eq!(Value::String(decoded.into()), result)
}
{
processor.with_method(super::Method::Encode);
let result = processor.process_field(decoded, &ff).unwrap();
assert_eq!(Map::one(field, Value::String(encoded.into())), result)
let processor = UrlEncodingProcessor {
fields: vec![],
method: super::Method::Encode,
ignore_missing: false,
};
let result = processor.process_field(decoded).unwrap();
assert_eq!(Value::String(encoded.into()), result)
}
}
}
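
Net effect of the builder split, end to end: field names are resolved to vector positions when the pipeline is built, and exec_mut then reads and writes a flat Vec of values by index instead of going through a per-row Map. A minimal sketch of that shape, with plain std types standing in for the crate's Value and OneInputOneOutputField, and using the same urlencoding crate calls the processor relies on (names here are illustrative):

fn urldecode_at(
    values: &mut [Option<String>],
    input_index: usize,
    output_index: usize,
    ignore_missing: bool,
) -> Result<(), String> {
    match values.get(input_index).cloned().flatten() {
        Some(s) => {
            // Same decode call as the processor, but addressed by index instead of key.
            let decoded = urlencoding::decode(&s).map_err(|e| e.to_string())?.into_owned();
            values[output_index] = Some(decoded);
            Ok(())
        }
        None if ignore_missing => Ok(()),
        None => Err("urlencoding processor: missing field".to_string()),
    }
}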


@@ -17,8 +17,8 @@ pub mod transformer;
use itertools::Itertools;
use crate::etl::field::Fields;
use crate::etl::processor::{update_one_one_output_keys, yaml_field, yaml_fields, yaml_string};
use crate::etl::find_key_index;
use crate::etl::processor::yaml_string;
use crate::etl::transform::index::Index;
use crate::etl::value::Value;
@@ -31,6 +31,9 @@ const TRANSFORM_ON_FAILURE: &str = "on_failure";
pub use transformer::greptime::GreptimeTransformer;
use super::field::{Fields, InputFieldInfo, OneInputOneOutputField};
use super::processor::{yaml_new_field, yaml_new_fields};
pub trait Transformer: std::fmt::Display + Sized + Send + Sync + 'static {
type Output;
type VecOutput;
@@ -39,12 +42,11 @@ pub trait Transformer: std::fmt::Display + Sized + Send + Sync + 'static {
fn schemas(&self) -> &Vec<greptime_proto::v1::ColumnSchema>;
fn transforms(&self) -> &Transforms;
fn transforms_mut(&mut self) -> &mut Transforms;
fn transform(&self, val: Value) -> Result<Self::Output, String>;
fn transform_mut(&self, val: &mut Vec<Value>) -> Result<Self::VecOutput, String>;
}
/// On Failure behavior when transform fails
#[derive(Debug, Clone, Default)]
#[derive(Debug, Clone, Default, Copy)]
pub enum OnFailure {
// Return None if transform fails
#[default]
@@ -74,12 +76,18 @@ impl std::fmt::Display for OnFailure {
}
}
}
#[derive(Debug, Default, Clone)]
pub struct TransformBuilders {
pub(crate) builders: Vec<TransformBuilder>,
pub(crate) output_keys: Vec<String>,
pub(crate) required_keys: Vec<String>,
}
#[derive(Debug, Default, Clone)]
pub struct Transforms {
transforms: Vec<Transform>,
output_keys: Vec<String>,
required_keys: Vec<String>,
pub(crate) transforms: Vec<Transform>,
pub(crate) output_keys: Vec<String>,
pub(crate) required_keys: Vec<String>,
}
impl Transforms {
@@ -130,7 +138,7 @@ impl std::ops::DerefMut for Transforms {
}
}
impl TryFrom<&Vec<yaml_rust::Yaml>> for Transforms {
impl TryFrom<&Vec<yaml_rust::Yaml>> for TransformBuilders {
type Error = String;
fn try_from(docs: &Vec<yaml_rust::Yaml>) -> Result<Self, Self::Error> {
@@ -138,41 +146,78 @@ impl TryFrom<&Vec<yaml_rust::Yaml>> for Transforms {
let mut all_output_keys: Vec<String> = Vec::with_capacity(100);
let mut all_required_keys = Vec::with_capacity(100);
for doc in docs {
let transform: Transform = doc
let transform_builder: TransformBuilder = doc
.as_hash()
.ok_or("transform element must be a map".to_string())?
.try_into()?;
let mut transform_output_keys = transform
let mut transform_output_keys = transform_builder
.fields
.iter()
.map(|f| f.get_target_field().to_string())
.map(|f| f.target_or_input_field().to_string())
.collect();
all_output_keys.append(&mut transform_output_keys);
let mut transform_required_keys = transform
let mut transform_required_keys = transform_builder
.fields
.iter()
.map(|f| f.input_field.name.clone())
.map(|f| f.input_field().to_string())
.collect();
all_required_keys.append(&mut transform_required_keys);
transforms.push(transform);
transforms.push(transform_builder);
}
all_required_keys.sort();
Ok(Transforms {
transforms,
Ok(TransformBuilders {
builders: transforms,
output_keys: all_output_keys,
required_keys: all_required_keys,
})
}
}
#[derive(Debug, Clone)]
pub struct TransformBuilder {
fields: Fields,
type_: Value,
default: Option<Value>,
index: Option<Index>,
on_failure: Option<OnFailure>,
}
impl TransformBuilder {
pub fn build(
self,
intermediate_keys: &[String],
output_keys: &[String],
) -> Result<Transform, String> {
let mut real_fields = vec![];
for field in self.fields {
let input_index = find_key_index(intermediate_keys, field.input_field(), "transform")?;
let input_field_info = InputFieldInfo::new(field.input_field(), input_index);
let output_index =
find_key_index(output_keys, field.target_or_input_field(), "transform")?;
let input = OneInputOneOutputField::new(
input_field_info,
(field.target_or_input_field().to_string(), output_index),
);
real_fields.push(input);
}
Ok(Transform {
real_fields,
type_: self.type_,
default: self.default,
index: self.index,
on_failure: self.on_failure,
})
}
}
/// only field is required
#[derive(Debug, Clone)]
pub struct Transform {
pub fields: Fields,
pub real_fields: Vec<OneInputOneOutputField>,
pub type_: Value,
@@ -192,7 +237,7 @@ impl std::fmt::Display for Transform {
};
let type_ = format!("type: {}", self.type_);
let fields = format!("field(s): {}", self.fields);
let fields = format!("field(s): {:?}", self.real_fields);
let default = if let Some(default) = &self.default {
format!(", default: {}", default)
} else {
@@ -212,7 +257,7 @@ impl std::fmt::Display for Transform {
impl Default for Transform {
fn default() -> Self {
Transform {
fields: Fields::default(),
real_fields: Vec::new(),
type_: Value::Null,
default: None,
index: None,
@@ -222,40 +267,6 @@ impl Default for Transform {
}
impl Transform {
fn with_fields(&mut self, mut fields: Fields) {
update_one_one_output_keys(&mut fields);
self.fields = fields;
}
fn with_type(&mut self, type_: Value) {
self.type_ = type_;
}
fn try_default(&mut self, default: Value) -> Result<(), String> {
match (&self.type_, &default) {
(Value::Null, _) => Err(format!(
"transform {} type MUST BE set before default {}",
self.fields, &default,
)),
(_, Value::Null) => Ok(()), // if default is not set, then it will be regarded as default null
(_, _) => {
let target = self
.type_
.parse_str_value(default.to_str_value().as_str())?;
self.default = Some(target);
Ok(())
}
}
}
fn with_index(&mut self, index: Index) {
self.index = Some(index);
}
fn with_on_failure(&mut self, on_failure: OnFailure) {
self.on_failure = Some(on_failure);
}
pub(crate) fn get_default(&self) -> Option<&Value> {
self.default.as_ref()
}
@@ -265,52 +276,74 @@ impl Transform {
}
}
impl TryFrom<&yaml_rust::yaml::Hash> for Transform {
impl TryFrom<&yaml_rust::yaml::Hash> for TransformBuilder {
type Error = String;
fn try_from(hash: &yaml_rust::yaml::Hash) -> Result<Self, Self::Error> {
let mut transform = Transform::default();
let mut default_opt = None;
let mut fields = Fields::default();
let mut type_ = Value::Null;
let mut default = None;
let mut index = None;
let mut on_failure = None;
for (k, v) in hash {
let key = k.as_str().ok_or("key must be a string")?;
match key {
TRANSFORM_FIELD => {
transform.with_fields(Fields::one(yaml_field(v, TRANSFORM_FIELD)?));
fields = Fields::one(yaml_new_field(v, TRANSFORM_FIELD)?);
}
TRANSFORM_FIELDS => {
transform.with_fields(yaml_fields(v, TRANSFORM_FIELDS)?);
fields = yaml_new_fields(v, TRANSFORM_FIELDS)?;
}
TRANSFORM_TYPE => {
let t = yaml_string(v, TRANSFORM_TYPE)?;
transform.with_type(Value::parse_str_type(&t)?);
type_ = Value::parse_str_type(&t)?;
}
TRANSFORM_INDEX => {
let index = yaml_string(v, TRANSFORM_INDEX)?;
transform.with_index(index.try_into()?);
let index_str = yaml_string(v, TRANSFORM_INDEX)?;
index = Some(index_str.try_into()?);
}
TRANSFORM_DEFAULT => {
default_opt = Some(Value::try_from(v)?);
default = Some(Value::try_from(v)?);
}
TRANSFORM_ON_FAILURE => {
let on_failure = yaml_string(v, TRANSFORM_ON_FAILURE)?;
transform.with_on_failure(on_failure.parse()?);
let on_failure_str = yaml_string(v, TRANSFORM_ON_FAILURE)?;
on_failure = Some(on_failure_str.parse()?);
}
_ => {}
}
}
let mut final_default = None;
if let Some(default) = default_opt {
transform.try_default(default)?;
if let Some(default_value) = default {
match (&type_, &default_value) {
(Value::Null, _) => {
return Err(format!(
"transform {:?} type MUST BE set before default {}",
fields, &default_value,
));
}
(_, Value::Null) => {} // if default is not set, then it will be regarded as default null
(_, _) => {
let target = type_.parse_str_value(default_value.to_str_value().as_str())?;
final_default = Some(target);
}
}
}
let builder = TransformBuilder {
fields,
type_,
default: final_default,
index,
on_failure,
};
Ok(transform)
Ok(builder)
}
}
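
Both the processor and transform builders lean on the same name-to-index resolution over the pipeline's intermediate keys. A rough stand-in for that lookup (the real find_key_index lives in crate::etl and may differ in details):

fn find_key_index(intermediate_keys: &[String], key: &str, kind: &str) -> Result<usize, String> {
    intermediate_keys
        .iter()
        .position(|k| k.as_str() == key)
        .ok_or_else(|| format!("{kind}: key '{key}' not found in intermediate keys"))
}

// With intermediate keys ["id1", "id2", "time"], a transform on "time" resolves to index 2,
// so exec_mut can read val[2] directly.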


@@ -20,10 +20,10 @@ use coerce::{coerce_columns, coerce_value};
use greptime_proto::v1::{ColumnSchema, Row, Rows, Value as GreptimeValue};
use itertools::Itertools;
use crate::etl::field::{Field, Fields};
use crate::etl::field::{InputFieldInfo, OneInputOneOutputField};
use crate::etl::transform::index::Index;
use crate::etl::transform::{Transform, Transformer, Transforms};
use crate::etl::value::{Array, Map, Timestamp, Value};
use crate::etl::value::{Timestamp, Value};
const DEFAULT_GREPTIME_TIMESTAMP_COLUMN: &str = "greptime_timestamp";
@@ -36,23 +36,41 @@ pub struct GreptimeTransformer {
}
impl GreptimeTransformer {
fn default_greptime_timestamp_column() -> Transform {
/// Add a default timestamp column to the transforms
fn add_greptime_timestamp_column(transforms: &mut Transforms) {
let ns = chrono::Utc::now().timestamp_nanos_opt().unwrap_or(0);
let type_ = Value::Timestamp(Timestamp::Nanosecond(ns));
let default = Some(type_.clone());
let mut field = Field::new(DEFAULT_GREPTIME_TIMESTAMP_COLUMN);
field.insert_output_index(DEFAULT_GREPTIME_TIMESTAMP_COLUMN.to_string(), 0);
let fields = Fields::new(vec![field]).unwrap();
Transform {
fields,
let transform = Transform {
real_fields: vec![OneInputOneOutputField::new(
InputFieldInfo {
name: DEFAULT_GREPTIME_TIMESTAMP_COLUMN.to_string(),
index: usize::MAX,
},
(
DEFAULT_GREPTIME_TIMESTAMP_COLUMN.to_string(),
transforms
.transforms
.iter()
.map(|x| x.real_fields.len())
.sum(),
),
)],
type_,
default,
index: Some(Index::Time),
on_failure: Some(crate::etl::transform::OnFailure::Default),
}
};
let required_keys = transforms.required_keys_mut();
required_keys.push(DEFAULT_GREPTIME_TIMESTAMP_COLUMN.to_string());
let output_keys = transforms.output_keys_mut();
output_keys.push(DEFAULT_GREPTIME_TIMESTAMP_COLUMN.to_string());
transforms.push(transform);
}
/// Generate the schema for the GreptimeTransformer
fn schemas(transforms: &Transforms) -> Result<Vec<ColumnSchema>, String> {
let mut schema = vec![];
for transform in transforms.iter() {
@@ -60,53 +78,6 @@ impl GreptimeTransformer {
}
Ok(schema)
}
fn transform_map(&self, map: &Map) -> Result<Row, String> {
let mut values = vec![GreptimeValue { value_data: None }; self.schema.len()];
for transform in self.transforms.iter() {
for field in transform.fields.iter() {
let value_data = match map.get(field.get_field_name()) {
Some(val) => coerce_value(val, transform)?,
None => {
let default = transform.get_default();
match default {
Some(default) => coerce_value(default, transform)?,
None => None,
}
}
};
if let Some(i) = field
.output_fields_index_mapping
.iter()
.next()
.map(|kv| kv.1)
{
values[*i] = GreptimeValue { value_data }
} else {
return Err(format!(
"field: {} output_fields is empty.",
field.get_field_name()
));
}
}
}
Ok(Row { values })
}
fn transform_array(&self, arr: &Array) -> Result<Vec<Row>, String> {
let mut rows = Vec::with_capacity(arr.len());
for v in arr.iter() {
match v {
Value::Map(map) => {
let row = self.transform_map(map)?;
rows.push(row);
}
_ => return Err(format!("Expected map, found: {v:?}")),
}
}
Ok(rows)
}
}
impl std::fmt::Display for GreptimeTransformer {
@@ -129,9 +100,9 @@ impl Transformer for GreptimeTransformer {
for transform in transforms.iter() {
let target_fields_set = transform
.fields
.real_fields
.iter()
.map(|f| f.get_target_field())
.map(|f| f.output_name())
.collect::<HashSet<_>>();
let intersections: Vec<_> = column_names_set.intersection(&target_fields_set).collect();
@@ -146,12 +117,15 @@ impl Transformer for GreptimeTransformer {
if let Some(idx) = transform.index {
if idx == Index::Time {
match transform.fields.len() {
1 => timestamp_columns.push(transform.fields.first().unwrap().get_field_name()),
_ => return Err(format!(
"Illegal to set multiple timestamp Index columns, please set only one: {}",
transform.fields.get_target_fields().join(", ")
)),
match transform.real_fields.len() {
1 => timestamp_columns
.push(transform.real_fields.first().unwrap().input_name()),
_ => {
return Err(format!(
"Illegal to set multiple timestamp Index columns, please set only one: {}",
transform.real_fields.iter().map(|x|x.input_name()).join(", ")
))
}
}
}
}
@@ -159,13 +133,7 @@ impl Transformer for GreptimeTransformer {
match timestamp_columns.len() {
0 => {
transforms.push(GreptimeTransformer::default_greptime_timestamp_column());
let required_keys = transforms.required_keys_mut();
required_keys.push(DEFAULT_GREPTIME_TIMESTAMP_COLUMN.to_string());
let output_keys = transforms.output_keys_mut();
output_keys.push(DEFAULT_GREPTIME_TIMESTAMP_COLUMN.to_string());
GreptimeTransformer::add_greptime_timestamp_column(&mut transforms);
let schema = GreptimeTransformer::schemas(&transforms)?;
Ok(GreptimeTransformer { transforms, schema })
@@ -184,54 +152,26 @@ impl Transformer for GreptimeTransformer {
}
}
fn transform(&self, value: Value) -> Result<Self::Output, String> {
match value {
Value::Map(map) => {
let rows = vec![self.transform_map(&map)?];
Ok(Rows {
schema: self.schema.clone(),
rows,
})
}
Value::Array(arr) => {
let rows = self.transform_array(&arr)?;
Ok(Rows {
schema: self.schema.clone(),
rows,
})
}
_ => Err(format!("Expected map or array, found: {}", value)),
}
}
fn transform_mut(&self, val: &mut Vec<Value>) -> Result<Self::VecOutput, String> {
let mut values = vec![GreptimeValue { value_data: None }; self.schema.len()];
for transform in self.transforms.iter() {
for field in transform.fields.iter() {
let index = field.input_field.index;
for field in transform.real_fields.iter() {
let index = field.input_index();
let output_index = field.output_index();
match val.get(index) {
Some(v) => {
let value_data = coerce_value(v, transform)
.map_err(|e| format!("{} processor: {}", field.get_field_name(), e))?;
.map_err(|e| format!("{} processor: {}", field.input_name(), e))?;
// every transform field has only one output field
if let Some(i) = field
.output_fields_index_mapping
.iter()
.next()
.map(|kv| kv.1)
{
values[*i] = GreptimeValue { value_data }
} else {
return Err(format!(
"field: {} output_fields is empty.",
field.get_field_name()
));
}
values[output_index] = GreptimeValue { value_data };
}
_ => {
return Err(format!(
"Get field not in the array field: {field:?}, {val:?}"
))
None => {
let default = transform.get_default();
let value_data = match default {
Some(default) => coerce_value(default, transform)?,
None => None,
};
values[output_index] = GreptimeValue { value_data };
}
}
}
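
transform_mut thus reduces to: for each transform field, read the intermediate slot at input_index, coerce it (falling back to the transform's default when the slot is empty), and write the result at output_index in a row as wide as the schema. A condensed sketch with toy types, i64 standing in for the coerced GreptimeValue:

fn build_row(
    state: &[Option<i64>],                  // intermediate values, indexed by key position
    fields: &[(usize, usize, Option<i64>)], // (input_index, output_index, default) per field
    schema_len: usize,
) -> Vec<Option<i64>> {
    let mut row = vec![None; schema_len];
    for &(input_index, output_index, default) in fields {
        row[output_index] = match state.get(input_index).copied().flatten() {
            Some(v) => Some(v), // coerce_value(v, transform) in the real code
            None => default,    // transform default, else None
        };
    }
    row
}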


@@ -66,8 +66,8 @@ impl TryFrom<Value> for ValueData {
pub(crate) fn coerce_columns(transform: &Transform) -> Result<Vec<ColumnSchema>, String> {
let mut columns = Vec::new();
for field in transform.fields.iter() {
let column_name = field.get_target_field().to_string();
for field in transform.real_fields.iter() {
let column_name = field.output_name().to_string();
let datatype = coerce_type(transform)? as i32;
@@ -134,7 +134,7 @@ fn coerce_type(transform: &Transform) -> Result<ColumnDataType, String> {
Value::Null => Err(format!(
"Null type not supported when to coerce '{}' type",
transform.fields
transform.type_.to_str_type()
)),
}
}
@@ -144,15 +144,18 @@ pub(crate) fn coerce_value(
transform: &Transform,
) -> Result<Option<ValueData>, String> {
match val {
Value::Null => match transform.on_failure {
Some(OnFailure::Ignore) => Ok(None),
Some(OnFailure::Default) => transform
.get_default()
.map(|default| coerce_value(default, transform))
.unwrap_or_else(|| {
coerce_value(transform.get_type_matched_default_val(), transform)
}),
None => Ok(None),
Value::Null => match &transform.default {
Some(default) => coerce_value(default, transform),
None => match transform.on_failure {
Some(OnFailure::Ignore) => Ok(None),
Some(OnFailure::Default) => transform
.get_default()
.map(|default| coerce_value(default, transform))
.unwrap_or_else(|| {
coerce_value(transform.get_type_matched_default_val(), transform)
}),
None => Ok(None),
},
},
Value::Int8(n) => coerce_i64_value(*n as i64, transform),
@@ -404,12 +407,11 @@ fn coerce_string_value(s: &String, transform: &Transform) -> Result<Option<Value
#[cfg(test)]
mod tests {
use super::*;
use crate::etl::field::Fields;
#[test]
fn test_coerce_string_without_on_failure() {
let transform = Transform {
fields: Fields::default(),
real_fields: vec![],
type_: Value::Int32(0),
default: None,
index: None,
@@ -434,7 +436,7 @@ mod tests {
#[test]
fn test_coerce_string_with_on_failure_ignore() {
let transform = Transform {
fields: Fields::default(),
real_fields: vec![],
type_: Value::Int32(0),
default: None,
index: None,
@@ -449,7 +451,7 @@ mod tests {
#[test]
fn test_coerce_string_with_on_failure_default() {
let mut transform = Transform {
fields: Fields::default(),
real_fields: vec![],
type_: Value::Int32(0),
default: None,
index: None,

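The reworked Value::Null arm in coerce_value above gives the transform's explicit default first claim; the on_failure policy only applies when no default was configured. Reduced to the decision itself, with Option<i64> standing in for the coerced ValueData (types here are illustrative):

enum OnFailure {
    Ignore,
    Default,
}

fn coerce_null(
    default: Option<i64>,          // the transform's configured `default:` value
    on_failure: Option<OnFailure>,
    type_matched_default: i64,     // e.g. 0 for an int32 column
) -> Option<i64> {
    match default {
        Some(d) => Some(d),
        None => match on_failure {
            Some(OnFailure::Default) => Some(type_matched_default),
            Some(OnFailure::Ignore) | None => None,
        },
    }
}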

@@ -110,7 +110,12 @@ impl PipelineOperator {
// exist in catalog, just open
if let Some(table) = self
.catalog_manager
.table(&expr.catalog_name, &expr.schema_name, &expr.table_name)
.table(
&expr.catalog_name,
&expr.schema_name,
&expr.table_name,
Some(&ctx),
)
.await
.context(CatalogSnafu)?
{
@@ -130,7 +135,7 @@ impl PipelineOperator {
// get from catalog
let table = self
.catalog_manager
.table(catalog, schema, table_name)
.table(catalog, schema, table_name, Some(&ctx))
.await
.context(CatalogSnafu)?
.context(PipelineTableNotFoundSnafu)?;


@@ -13,20 +13,45 @@
// limitations under the License.
use greptime_proto::v1::{ColumnDataType, ColumnSchema, Rows, SemanticType};
use pipeline::{parse, Content, GreptimeTransformer, Pipeline, Value};
use pipeline::{parse, Content, GreptimeTransformer, Pipeline};
/// test util function to parse and execute pipeline
pub fn parse_and_exec(input_str: &str, pipeline_yaml: &str) -> Rows {
let input_value: Value = serde_json::from_str::<serde_json::Value>(input_str)
.expect("failed to parse into json")
.try_into()
.expect("failed to convert into value");
let input_value = serde_json::from_str::<serde_json::Value>(input_str).unwrap();
let yaml_content = Content::Yaml(pipeline_yaml.into());
let pipeline: Pipeline<GreptimeTransformer> =
parse(&yaml_content).expect("failed to parse pipeline");
let mut result = pipeline.init_intermediate_state();
pipeline.exec(input_value).expect("failed to exec pipeline")
let schema = pipeline.schemas().clone();
let mut rows = Vec::new();
match input_value {
serde_json::Value::Array(array) => {
for value in array {
pipeline.prepare(value, &mut result).unwrap();
let row = pipeline
.exec_mut(&mut result)
.expect("failed to exec pipeline");
rows.push(row);
pipeline.reset_intermediate_state(&mut result);
}
}
serde_json::Value::Object(_) => {
pipeline.prepare(input_value, &mut result).unwrap();
let row = pipeline
.exec_mut(&mut result)
.expect("failed to exec pipeline");
rows.push(row);
}
_ => {
panic!("invalid input value");
}
}
Rows { schema, rows }
}
/// test util function to create column schema


@@ -157,7 +157,7 @@ transform:
fn test_modifier() {
let empty_str = r#"
{
"str": "key1 key2 key3 key4 key5 key6 key7 key8"
"str": "key1 key2 key3 key4 key5 key6"
}"#;
let pipeline_yaml = r#"
@@ -165,7 +165,7 @@ processors:
- dissect:
field: str
patterns:
- "%{key1} %{key2} %{+key3} %{+key3/2} %{key5->} %{?key6} %{*key_7} %{&key_7}"
- "%{key1} %{key2} %{+key3} %{+key3/2} %{key5->} %{?key6}"
transform:
- fields:
@@ -173,7 +173,6 @@ transform:
- key2
- key3
- key5
- key7
type: string
"#;
@@ -184,7 +183,6 @@ transform:
make_string_column_schema("key2".to_string()),
make_string_column_schema("key3".to_string()),
make_string_column_schema("key5".to_string()),
make_string_column_schema("key7".to_string()),
common::make_column_schema(
"greptime_timestamp".to_string(),
ColumnDataType::TimestampNanosecond,
@@ -209,10 +207,6 @@ transform:
output.rows[0].values[3].value_data,
Some(StringValue("key5".to_string()))
);
assert_eq!(
output.rows[0].values[4].value_data,
Some(StringValue("key8".to_string()))
);
}
#[test]


@@ -12,18 +12,18 @@
// See the License for the specific language governing permissions and
// limitations under the License.
use api::v1::Rows;
use common_telemetry::tracing::info;
use greptime_proto::v1::value::ValueData::{
BoolValue, F64Value, StringValue, TimestampNanosecondValue, TimestampSecondValue, U32Value,
U64Value, U8Value,
};
use greptime_proto::v1::Value as GreptimeValue;
use pipeline::{parse, Content, GreptimeTransformer, Pipeline, Value};
use pipeline::{parse, Content, GreptimeTransformer, Pipeline};
#[test]
fn test_complex_data() {
let input_value_str = r#"
[
{
"version": 1,
"streamId": "12345",
@@ -73,12 +73,9 @@ fn test_complex_data() {
"ewExecutionInfo": "c:4380:7:161:162:161:n:::12473:200|C:4380:3:0:4:0:n:::6967:200|R:4380:20:99:99:1:n:::35982:200",
"customField": "any-custom-value"
}
]
"#;
let input_value: Value = serde_json::from_str::<serde_json::Value>(input_value_str)
.expect("failed to parse input value")
.try_into()
.expect("failed to convert input value");
let input_value = serde_json::from_str::<serde_json::Value>(input_value_str)
.expect("failed to parse input value");
let pipeline_yaml = r#"
---
@@ -422,7 +419,19 @@ transform:
let yaml_content = Content::Yaml(pipeline_yaml.into());
let pipeline: Pipeline<GreptimeTransformer> =
parse(&yaml_content).expect("failed to parse pipeline");
let output = pipeline.exec(input_value).expect("failed to exec pipeline");
let mut stats = pipeline.init_intermediate_state();
pipeline
.prepare(input_value, &mut stats)
.expect("failed to prepare pipeline");
let row = pipeline
.exec_mut(&mut stats)
.expect("failed to exec pipeline");
let output = Rows {
schema: pipeline.schemas().clone(),
rows: vec![row],
};
assert_eq!(output.rows.len(), 1);
let values = output.rows.first().unwrap().values.clone();
@@ -464,10 +473,7 @@ fn test_simple_data() {
"line": "2024-05-25 20:16:37.217 hello world"
}
"#;
let input_value: Value = serde_json::from_str::<serde_json::Value>(input_value_str)
.unwrap()
.try_into()
.unwrap();
let input_value = serde_json::from_str::<serde_json::Value>(input_value_str).unwrap();
let pipeline_yaml = r#"
processors:
@@ -493,11 +499,13 @@ transform:
let yaml_content = Content::Yaml(pipeline_yaml.into());
let pipeline: Pipeline<GreptimeTransformer> = parse(&yaml_content).unwrap();
let output = pipeline.exec(input_value).unwrap();
let r = output
.rows
let mut status = pipeline.init_intermediate_state();
pipeline.prepare(input_value, &mut status).unwrap();
let row = pipeline.exec_mut(&mut status).unwrap();
let r = row
.values
.into_iter()
.flat_map(|v| v.values)
.map(|v| v.value_data.unwrap())
.collect::<Vec<_>>();


@@ -116,7 +116,7 @@ impl DatafusionQueryEngine {
let default_catalog = &query_ctx.current_catalog().to_owned();
let default_schema = &query_ctx.current_schema();
let table_name = dml.table_name.resolve(default_catalog, default_schema);
let table = self.find_table(&table_name).await?;
let table = self.find_table(&table_name, &query_ctx).await?;
let output = self
.exec_query_plan(LogicalPlan::DfPlan((*dml.input).clone()), query_ctx.clone())
@@ -241,14 +241,18 @@ impl DatafusionQueryEngine {
.context(TableMutationSnafu)
}
async fn find_table(&self, table_name: &ResolvedTableReference) -> Result<TableRef> {
async fn find_table(
&self,
table_name: &ResolvedTableReference,
query_context: &QueryContextRef,
) -> Result<TableRef> {
let catalog_name = table_name.catalog.as_ref();
let schema_name = table_name.schema.as_ref();
let table_name = table_name.table.as_ref();
self.state
.catalog_manager()
.table(catalog_name, schema_name, table_name)
.table(catalog_name, schema_name, table_name, Some(query_context))
.await
.context(CatalogSnafu)?
.with_context(|| TableNotFoundSnafu { table: table_name })
@@ -529,7 +533,7 @@ mod tests {
use datatypes::prelude::ConcreteDataType;
use datatypes::schema::ColumnSchema;
use datatypes::vectors::{Helper, UInt32Vector, UInt64Vector, VectorRef};
use session::context::QueryContext;
use session::context::{QueryContext, QueryContextBuilder};
use table::table::numbers::{NumbersTable, NUMBERS_TABLE_NAME};
use super::*;
@@ -618,12 +622,16 @@ mod tests {
.as_any()
.downcast_ref::<DatafusionQueryEngine>()
.unwrap();
let query_ctx = Arc::new(QueryContextBuilder::default().build());
let table = engine
.find_table(&ResolvedTableReference {
catalog: "greptime".into(),
schema: "public".into(),
table: "numbers".into(),
})
.find_table(
&ResolvedTableReference {
catalog: "greptime".into(),
schema: "public".into(),
table: "numbers".into(),
},
&query_ctx,
)
.await
.unwrap();


@@ -61,7 +61,7 @@ impl DfContextProviderAdapter {
let mut table_provider = DfTableSourceProvider::new(
engine_state.catalog_manager().clone(),
engine_state.disallow_cross_catalog_query(),
query_ctx.as_ref(),
query_ctx.clone(),
Arc::new(DefaultPlanDecoder::new(session_state.clone(), &query_ctx)?),
session_state
.config_options()


@@ -128,6 +128,7 @@ impl DistExtensionPlanner {
&table_name.catalog_name,
&table_name.schema_name,
&table_name.table_name,
None,
)
.await
.context(CatalogSnafu)?


@@ -68,7 +68,7 @@ impl DfLogicalPlanner {
let table_provider = DfTableSourceProvider::new(
self.engine_state.catalog_manager().clone(),
self.engine_state.disallow_cross_catalog_query(),
query_ctx.as_ref(),
query_ctx.clone(),
Arc::new(DefaultPlanDecoder::new(
self.session_state.clone(),
&query_ctx,
@@ -144,14 +144,15 @@ impl DfLogicalPlanner {
#[tracing::instrument(skip_all)]
async fn plan_pql(&self, stmt: EvalStmt, query_ctx: QueryContextRef) -> Result<LogicalPlan> {
let plan_decoder = Arc::new(DefaultPlanDecoder::new(
self.session_state.clone(),
&query_ctx,
)?);
let table_provider = DfTableSourceProvider::new(
self.engine_state.catalog_manager().clone(),
self.engine_state.disallow_cross_catalog_query(),
query_ctx.as_ref(),
Arc::new(DefaultPlanDecoder::new(
self.session_state.clone(),
&query_ctx,
)?),
query_ctx,
plan_decoder,
self.session_state
.config_options()
.sql_parser


@@ -2379,7 +2379,7 @@ mod test {
DfTableSourceProvider::new(
catalog_list,
false,
QueryContext::arc().as_ref(),
QueryContext::arc(),
DummyDecoder::arc(),
false,
)
@@ -3219,7 +3219,7 @@ mod test {
DfTableSourceProvider::new(
catalog_list.clone(),
false,
QueryContext::arc().as_ref(),
QueryContext::arc(),
DummyDecoder::arc(),
true,
),
@@ -3249,7 +3249,7 @@ mod test {
DfTableSourceProvider::new(
catalog_list.clone(),
false,
QueryContext::arc().as_ref(),
QueryContext::arc(),
DummyDecoder::arc(),
true,
),


@@ -232,6 +232,7 @@ async fn query_from_information_schema_table(
query_ctx.current_catalog(),
INFORMATION_SCHEMA_NAME,
table_name,
Some(&query_ctx),
)
.await
.context(error::CatalogSnafu)?


@@ -753,6 +753,7 @@ impl HttpServer {
"/pipelines/:pipeline_name",
routing::delete(event::delete_pipeline),
)
.route("/pipelines/dryrun", routing::post(event::pipeline_dryrun))
.layer(
ServiceBuilder::new()
.layer(HandleErrorLayer::new(handle_error))


@@ -23,15 +23,16 @@ use axum::headers::ContentType;
use axum::http::header::CONTENT_TYPE;
use axum::http::{Request, StatusCode};
use axum::response::{IntoResponse, Response};
use axum::{async_trait, BoxError, Extension, TypedHeader};
use axum::{async_trait, BoxError, Extension, Json, TypedHeader};
use common_query::{Output, OutputData};
use common_telemetry::{error, warn};
use datatypes::value::column_data_to_json;
use pipeline::error::PipelineTransformSnafu;
use pipeline::util::to_pipeline_version;
use pipeline::PipelineVersion;
use schemars::JsonSchema;
use serde::{Deserialize, Serialize};
use serde_json::{Deserializer, Value};
use serde_json::{Deserializer, Map, Value};
use session::context::{Channel, QueryContext, QueryContextRef};
use snafu::{ensure, OptionExt, ResultExt};
@@ -230,6 +231,117 @@ fn transform_ndjson_array_factory(
})
}
#[axum_macros::debug_handler]
pub async fn pipeline_dryrun(
State(log_state): State<LogState>,
Query(query_params): Query<LogIngesterQueryParams>,
Extension(mut query_ctx): Extension<QueryContext>,
TypedHeader(content_type): TypedHeader<ContentType>,
payload: String,
) -> Result<Response> {
let handler = log_state.log_handler;
let pipeline_name = query_params.pipeline_name.context(InvalidParameterSnafu {
reason: "pipeline_name is required",
})?;
let version = to_pipeline_version(query_params.version).context(PipelineSnafu)?;
let ignore_errors = query_params.ignore_errors.unwrap_or(false);
let value = extract_pipeline_value_by_content_type(content_type, payload, ignore_errors)?;
if value.len() > 10 {
return Err(InvalidParameterSnafu {
reason: "too many rows for dryrun",
}
.build());
}
query_ctx.set_channel(Channel::Http);
let query_ctx = Arc::new(query_ctx);
let pipeline = handler
.get_pipeline(&pipeline_name, version, query_ctx.clone())
.await?;
let mut intermediate_state = pipeline.init_intermediate_state();
let mut results = Vec::with_capacity(value.len());
for v in value {
pipeline
.prepare(v, &mut intermediate_state)
.map_err(|reason| PipelineTransformSnafu { reason }.build())
.context(PipelineSnafu)?;
let r = pipeline
.exec_mut(&mut intermediate_state)
.map_err(|reason| PipelineTransformSnafu { reason }.build())
.context(PipelineSnafu)?;
results.push(r);
pipeline.reset_intermediate_state(&mut intermediate_state);
}
let colume_type_key = "colume_type";
let data_type_key = "data_type";
let name_key = "name";
let schema = pipeline
.schemas()
.iter()
.map(|cs| {
let mut map = Map::new();
map.insert(name_key.to_string(), Value::String(cs.column_name.clone()));
map.insert(
data_type_key.to_string(),
Value::String(cs.datatype().as_str_name().to_string()),
);
map.insert(
colume_type_key.to_string(),
Value::String(cs.semantic_type().as_str_name().to_string()),
);
map.insert(
"fulltext".to_string(),
Value::Bool(
cs.options
.clone()
.is_some_and(|x| x.options.contains_key("fulltext")),
),
);
Value::Object(map)
})
.collect::<Vec<_>>();
let rows = results
.into_iter()
.map(|row| {
let row = row
.values
.into_iter()
.enumerate()
.map(|(idx, v)| {
v.value_data
.map(|d| {
let mut map = Map::new();
map.insert("value".to_string(), column_data_to_json(d));
map.insert("key".to_string(), schema[idx][name_key].clone());
map.insert(
"semantic_type".to_string(),
schema[idx][colume_type_key].clone(),
);
map.insert("data_type".to_string(), schema[idx][data_type_key].clone());
Value::Object(map)
})
.unwrap_or(Value::Null)
})
.collect();
Value::Array(row)
})
.collect::<Vec<_>>();
let mut result = Map::new();
result.insert("schema".to_string(), Value::Array(schema));
result.insert("rows".to_string(), Value::Array(rows));
let result = Value::Object(result);
Ok(Json(result).into_response())
}
#[axum_macros::debug_handler]
pub async fn log_ingester(
State(log_state): State<LogState>,

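For reference, the JSON the pipeline_dryrun handler above assembles has this shape; the keys (including the colume_type spelling) mirror the handler's code, and the sample values are borrowed from the integration test further down:

use serde_json::{json, Value};

fn example_dryrun_response() -> Value {
    json!({
        "schema": [
            { "name": "id1", "data_type": "INT32", "colume_type": "FIELD", "fulltext": false }
        ],
        "rows": [[
            { "key": "id1", "value": 2436, "data_type": "INT32", "semantic_type": "FIELD" }
        ]]
    })
}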

@@ -405,11 +405,11 @@ async fn get_all_column_names(
schema: &str,
manager: &CatalogManagerRef,
) -> std::result::Result<HashSet<String>, catalog::error::Error> {
let table_names = manager.table_names(catalog, schema).await?;
let table_names = manager.table_names(catalog, schema, None).await?;
let mut labels = HashSet::new();
for table_name in table_names {
let Some(table) = manager.table(catalog, schema, &table_name).await? else {
let Some(table) = manager.table(catalog, schema, &table_name, None).await? else {
continue;
};
for column in table.primary_key_columns() {
@@ -436,6 +436,7 @@ async fn retrieve_series_from_query_result(
query_ctx.current_catalog(),
&query_ctx.current_schema(),
table_name,
Some(query_ctx),
)
.await
.context(CatalogSnafu)?
@@ -691,7 +692,7 @@ pub async fn label_values_query(
if label_name == METRIC_NAME_LABEL {
let mut table_names = match handler
.catalog_manager()
.table_names(&catalog, &schema)
.table_names(&catalog, &schema, Some(&query_ctx))
.await
{
Ok(table_names) => table_names,
@@ -777,7 +778,11 @@ async fn retrieve_field_names(
if matches.is_empty() {
// query all tables if no matcher is provided
while let Some(table) = manager.tables(catalog, &schema).next().await {
while let Some(table) = manager
.tables(catalog, &schema, Some(query_ctx))
.next()
.await
{
let table = table.context(CatalogSnafu)?;
for column in table.field_columns() {
field_columns.insert(column.name);
@@ -788,7 +793,7 @@ async fn retrieve_field_names(
for table_name in matches {
let table = manager
.table(catalog, &schema, &table_name)
.table(catalog, &schema, &table_name, Some(query_ctx))
.await
.context(CatalogSnafu)?
.with_context(|| TableNotFoundSnafu {


@@ -261,6 +261,7 @@ impl QueryContext {
impl QueryContextBuilder {
pub fn build(self) -> QueryContext {
let channel = self.channel.unwrap_or_default();
QueryContext {
current_catalog: self
.current_catalog
@@ -270,8 +271,10 @@ impl QueryContextBuilder {
.sql_dialect
.unwrap_or_else(|| Arc::new(GreptimeDbDialect {})),
extensions: self.extensions.unwrap_or_default(),
configuration_parameter: self.configuration_parameter.unwrap_or_default(),
channel: self.channel.unwrap_or_default(),
configuration_parameter: self
.configuration_parameter
.unwrap_or_else(|| Arc::new(ConfigurationVariables::default())),
channel,
}
}


@@ -233,6 +233,9 @@ pub trait RegionScanner: Debug + DisplayAs + Send {
/// # Panics
/// Panics if the `partition` is out of bound.
fn scan_partition(&self, partition: usize) -> Result<SendableRecordBatchStream, BoxedError>;
/// Check if there is any predicate that may be executed in this scanner.
fn has_predicate(&self) -> bool;
}
pub type RegionScannerRef = Box<dyn RegionScanner>;
@@ -367,6 +370,10 @@ impl RegionScanner for SinglePartitionScanner {
))
})
}
fn has_predicate(&self) -> bool {
false
}
}
impl DisplayAs for SinglePartitionScanner {


@@ -180,7 +180,7 @@ impl ExecutionPlan for RegionScanExec {
}
fn statistics(&self) -> DfResult<Statistics> {
let statistics = if self.append_mode {
let statistics = if self.append_mode && !self.scanner.lock().unwrap().has_predicate() {
let column_statistics = self
.arrow_schema
.fields

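The guard added to RegionScanExec::statistics above means exact row-count statistics are only advertised when the scan is append-only and nothing can filter rows away; with a pushed-down predicate the precomputed total could overcount. The condition boils down to (an illustrative helper, not the executor's API):

fn can_report_exact_row_count(append_mode: bool, has_predicate: bool) -> bool {
    // A predicate may drop rows at scan time, so the region's total row count is no longer exact.
    append_mode && !has_predicate
}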

@@ -181,7 +181,8 @@ mod test {
.table(
"greptime",
"database_created_through_grpc",
"table_created_through_grpc"
"table_created_through_grpc",
None,
)
.await
.unwrap()
@@ -510,7 +511,7 @@ CREATE TABLE {table_name} (
let table = instance
.frontend()
.catalog_manager()
.table("greptime", "public", table_name)
.table("greptime", "public", table_name, None)
.await
.unwrap()
.unwrap();


@@ -278,7 +278,7 @@ mod tests {
assert!(instance
.frontend()
.catalog_manager()
.table("greptime", "public", "demo")
.table("greptime", "public", "demo", None)
.await
.unwrap()
.is_none())


@@ -462,7 +462,6 @@ async fn test_execute_show_databases_tables(instance: Arc<dyn MockInstance>) {
+--------------------+
| greptime_private |
| information_schema |
| pg_catalog |
| public |
+--------------------+\
";
@@ -1900,7 +1899,6 @@ async fn test_show_databases(instance: Arc<dyn MockInstance>) {
+--------------------+
| greptime_private |
| information_schema |
| pg_catalog |
| public |
+--------------------+";
check_output_stream(output, expected).await;
@@ -1914,7 +1912,6 @@ async fn test_show_databases(instance: Arc<dyn MockInstance>) {
| Database |
+--------------------+
| information_schema |
| pg_catalog |
+--------------------+";
check_output_stream(output, expected).await;
}


@@ -78,6 +78,7 @@ macro_rules! http_tests {
test_vm_proto_remote_write,
test_pipeline_api,
test_test_pipeline_api,
test_plain_text_ingestion,
);
)*
@@ -1146,6 +1147,171 @@ transform:
guard.remove_all().await;
}
pub async fn test_test_pipeline_api(store_type: StorageType) {
common_telemetry::init_default_ut_logging();
let (app, mut guard) = setup_test_http_app_with_frontend(store_type, "test_pipeline_api").await;
// handshake
let client = TestClient::new(app);
let body = r#"
processors:
- date:
field: time
formats:
- "%Y-%m-%d %H:%M:%S%.3f"
ignore_missing: true
transform:
- fields:
- id1
- id2
type: int32
- fields:
- type
- log
- logger
type: string
- field: time
type: time
index: timestamp
"#;
// 1. create pipeline
let res = client
.post("/v1/events/pipelines/test")
.header("Content-Type", "application/x-yaml")
.body(body)
.send()
.await;
assert_eq!(res.status(), StatusCode::OK);
let content = res.text().await;
let content = serde_json::from_str(&content);
assert!(content.is_ok());
// {"execution_time_ms":13,"pipelines":[{"name":"test","version":"2024-07-04 08:31:00.987136"}]}
let content: Value = content.unwrap();
let execution_time = content.get("execution_time_ms");
assert!(execution_time.unwrap().is_number());
let pipelines = content.get("pipelines");
let pipelines = pipelines.unwrap().as_array().unwrap();
assert_eq!(pipelines.len(), 1);
let pipeline = pipelines.first().unwrap();
assert_eq!(pipeline.get("name").unwrap(), "test");
// 2. dryrun the data against the pipeline
let data_body = r#"
[
{
"id1": "2436",
"id2": "2528",
"logger": "INTERACT.MANAGER",
"type": "I",
"time": "2024-05-25 20:16:37.217",
"log": "ClusterAdapter:enter sendTextDataToCluster\\n"
}
]
"#;
let res = client
.post("/v1/events/pipelines/dryrun?pipeline_name=test")
.header("Content-Type", "application/json")
.body(data_body)
.send()
.await;
assert_eq!(res.status(), StatusCode::OK);
let body: serde_json::Value = res.json().await;
let schema = &body["schema"];
let rows = &body["rows"];
assert_eq!(
schema,
&json!([
{
"colume_type": "FIELD",
"data_type": "INT32",
"fulltext": false,
"name": "id1"
},
{
"colume_type": "FIELD",
"data_type": "INT32",
"fulltext": false,
"name": "id2"
},
{
"colume_type": "FIELD",
"data_type": "STRING",
"fulltext": false,
"name": "type"
},
{
"colume_type": "FIELD",
"data_type": "STRING",
"fulltext": false,
"name": "log"
},
{
"colume_type": "FIELD",
"data_type": "STRING",
"fulltext": false,
"name": "logger"
},
{
"colume_type": "TIMESTAMP",
"data_type": "TIMESTAMP_NANOSECOND",
"fulltext": false,
"name": "time"
}
])
);
assert_eq!(
rows,
&json!([
[
{
"data_type": "INT32",
"key": "id1",
"semantic_type": "FIELD",
"value": 2436
},
{
"data_type": "INT32",
"key": "id2",
"semantic_type": "FIELD",
"value": 2528
},
{
"data_type": "STRING",
"key": "type",
"semantic_type": "FIELD",
"value": "I"
},
{
"data_type": "STRING",
"key": "log",
"semantic_type": "FIELD",
"value": "ClusterAdapter:enter sendTextDataToCluster\\n"
},
{
"data_type": "STRING",
"key": "logger",
"semantic_type": "FIELD",
"value": "INTERACT.MANAGER"
},
{
"data_type": "TIMESTAMP_NANOSECOND",
"key": "time",
"semantic_type": "TIMESTAMP",
"value": "2024-05-25 20:16:37.217+0000"
}
]
])
);
guard.remove_all().await;
}
pub async fn test_plain_text_ingestion(store_type: StorageType) {
common_telemetry::init_default_ut_logging();
let (app, mut guard) = setup_test_http_app_with_frontend(store_type, "test_pipeline_api").await;


@@ -1013,7 +1013,7 @@ async fn prepare_testing_metric_table(cluster: &GreptimeDbCluster) -> TableId {
let table = cluster
.frontend
.catalog_manager()
.table(DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME, "phy")
.table(DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME, "phy", None)
.await
.unwrap()
.unwrap();
@@ -1039,7 +1039,12 @@ async fn prepare_testing_table(cluster: &GreptimeDbCluster) -> TableId {
let table = cluster
.frontend
.catalog_manager()
.table(DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME, TEST_TABLE_NAME)
.table(
DEFAULT_CATALOG_NAME,
DEFAULT_SCHEMA_NAME,
TEST_TABLE_NAME,
None,
)
.await
.unwrap()
.unwrap();


@@ -54,3 +54,50 @@ drop table test;
Affected Rows: 0
-- Append table
create table count_where_bug (
tag String,
ts TimestampMillisecond time index,
num Int64,
primary key (tag),
) engine=mito with('append_mode'='true');
Affected Rows: 0
insert into count_where_bug (tag, ts, num)
values ('a', '2024-09-06T06:00:01Z', 1),
('a', '2024-09-06T06:00:02Z', 2),
('a', '2024-09-06T06:00:03Z', 3),
('b', '2024-09-06T06:00:04Z', 4),
('b', '2024-09-06T06:00:05Z', 5);
Affected Rows: 5
select count(1) from count_where_bug where tag = 'b';
+-----------------+
| COUNT(Int64(1)) |
+-----------------+
| 2 |
+-----------------+
select count(1) from count_where_bug where ts > '2024-09-06T06:00:04Z';
+-----------------+
| COUNT(Int64(1)) |
+-----------------+
| 1 |
+-----------------+
select count(1) from count_where_bug where num != 3;
+-----------------+
| COUNT(Int64(1)) |
+-----------------+
| 4 |
+-----------------+
drop table count_where_bug;
Affected Rows: 0


@@ -17,3 +17,27 @@ select count(*) from (select * from test cross join "HelloWorld");
drop table "HelloWorld";
drop table test;
-- Append table
create table count_where_bug (
tag String,
ts TimestampMillisecond time index,
num Int64,
primary key (tag),
) engine=mito with('append_mode'='true');
insert into count_where_bug (tag, ts, num)
values ('a', '2024-09-06T06:00:01Z', 1),
('a', '2024-09-06T06:00:02Z', 2),
('a', '2024-09-06T06:00:03Z', 3),
('b', '2024-09-06T06:00:04Z', 4),
('b', '2024-09-06T06:00:05Z', 5);
select count(1) from count_where_bug where tag = 'b';
select count(1) from count_where_bug where ts > '2024-09-06T06:00:04Z';
select count(1) from count_where_bug where num != 3;
drop table count_where_bug;


@@ -18,7 +18,6 @@ show databases;
| greptime_private |
| illegal-database |
| information_schema |
| pg_catalog |
| public |
+--------------------+


@@ -10,7 +10,6 @@ SHOW DATABASES;
| greptime_private |
| information_schema |
| mydb |
| pg_catalog |
| public |
+--------------------+
@@ -22,7 +21,6 @@ SHOW FULL DATABASES;
| greptime_private | |
| information_schema | |
| mydb | ttl='1h' |
| pg_catalog | |
| public | |
+--------------------+----------+
@@ -78,7 +76,6 @@ SHOW DATABASES;
+--------------------+
| greptime_private |
| information_schema |
| pg_catalog |
| public |
+--------------------+


@@ -24,16 +24,13 @@ Affected Rows: 0
select table_catalog, table_schema, table_name from information_schema.tables where table_schema != 'information_schema';
+---------------+--------------+--------------+
| table_catalog | table_schema | table_name |
+---------------+--------------+--------------+
| greptime | abc | t |
| greptime | abcde | t |
| greptime | pg_catalog | pg_class |
| greptime | pg_catalog | pg_type |
| greptime | pg_catalog | pg_namespace |
| greptime | public | numbers |
+---------------+--------------+--------------+
+---------------+--------------+------------+
| table_catalog | table_schema | table_name |
+---------------+--------------+------------+
| greptime | abc | t |
| greptime | abcde | t |
| greptime | public | numbers |
+---------------+--------------+------------+
use public;


@@ -5,7 +5,6 @@ SHOW DATABASES;
+--------------------+
| greptime_private |
| information_schema |
| pg_catalog |
| public |
+--------------------+
@@ -16,7 +15,6 @@ SHOW FULL DATABASES;
+--------------------+---------+
| greptime_private | |
| information_schema | |
| pg_catalog | |
| public | |
+--------------------+---------+


@@ -45,9 +45,6 @@ order by table_schema, table_name;
|greptime|information_schema|tables|LOCALTEMPORARY|3|0|0|0|0|0||11|Fixed|0|0|0|DATETIME|||utf8_bin|0|||Y|
|greptime|information_schema|triggers|LOCALTEMPORARY|24|0|0|0|0|0||11|Fixed|0|0|0|DATETIME|||utf8_bin|0|||Y|
|greptime|information_schema|views|LOCALTEMPORARY|32|0|0|0|0|0||11|Fixed|0|0|0|DATETIME|||utf8_bin|0|||Y|
|greptime|pg_catalog|pg_class|LOCALTEMPORARY|256|0|0|0|0|0||11|Fixed|0|0|0|DATETIME|||utf8_bin|0|||Y|
|greptime|pg_catalog|pg_namespace|LOCALTEMPORARY|258|0|0|0|0|0||11|Fixed|0|0|0|DATETIME|||utf8_bin|0|||Y|
|greptime|pg_catalog|pg_type|LOCALTEMPORARY|257|0|0|0|0|0||11|Fixed|0|0|0|DATETIME|||utf8_bin|0|||Y|
|greptime|public|numbers|LOCALTEMPORARY|2|0|0|0|0|0|test_engine|11|Fixed|0|0|0|DATETIME|||utf8_bin|0|||Y|
+++++++++++++++++++++++++
@@ -413,16 +410,6 @@ select * from information_schema.columns order by table_schema, table_name, colu
| greptime | information_schema | views | table_name | 3 | 2147483647 | 2147483647 | | | | utf8 | utf8_bin | | | select,insert | | String | string | FIELD | | No | string | | |
| greptime | information_schema | views | table_schema | 2 | 2147483647 | 2147483647 | | | | utf8 | utf8_bin | | | select,insert | | String | string | FIELD | | No | string | | |
| greptime | information_schema | views | view_definition | 4 | 2147483647 | 2147483647 | | | | utf8 | utf8_bin | | | select,insert | | String | string | FIELD | | No | string | | |
| greptime | pg_catalog | pg_class | oid | 1 | | | 10 | 0 | | | | | | select,insert | | UInt32 | int unsigned | FIELD | | No | int unsigned | | |
| greptime | pg_catalog | pg_class | relkind | 4 | 2147483647 | 2147483647 | | | | utf8 | utf8_bin | | | select,insert | | String | string | FIELD | | No | string | | |
| greptime | pg_catalog | pg_class | relname | 2 | 2147483647 | 2147483647 | | | | utf8 | utf8_bin | | | select,insert | | String | string | FIELD | | No | string | | |
| greptime | pg_catalog | pg_class | relnamespace | 3 | | | 10 | 0 | | | | | | select,insert | | UInt32 | int unsigned | FIELD | | No | int unsigned | | |
| greptime | pg_catalog | pg_class | relowner | 5 | | | 10 | 0 | | | | | | select,insert | | UInt32 | int unsigned | FIELD | | No | int unsigned | | |
| greptime | pg_catalog | pg_namespace | nspname | 2 | 2147483647 | 2147483647 | | | | utf8 | utf8_bin | | | select,insert | | String | string | FIELD | | No | string | | |
| greptime | pg_catalog | pg_namespace | oid | 1 | | | 10 | 0 | | | | | | select,insert | | UInt32 | int unsigned | FIELD | | No | int unsigned | | |
| greptime | pg_catalog | pg_type | oid | 1 | | | 10 | 0 | | | | | | select,insert | | UInt32 | int unsigned | FIELD | | No | int unsigned | | |
| greptime | pg_catalog | pg_type | typlen | 3 | | | 5 | 0 | | | | | | select,insert | | Int16 | smallint | FIELD | | No | smallint | | |
| greptime | pg_catalog | pg_type | typname | 2 | 2147483647 | 2147483647 | | | | utf8 | utf8_bin | | | select,insert | | String | string | FIELD | | No | string | | |
| greptime | public | numbers | number | 1 | | | 10 | 0 | | | | PRI | | select,insert | | UInt32 | int unsigned | TAG | | No | int unsigned | | |
+---------------+--------------------+---------------------------------------+-----------------------------------+------------------+--------------------------+------------------------+-------------------+---------------+--------------------+--------------------+----------------+------------+-------+---------------+-----------------------+----------------------+-----------------+---------------+----------------+-------------+-----------------+----------------+--------+
@@ -596,7 +583,6 @@ select * from schemata where catalog_name = 'greptime' and schema_name != 'publi
+--------------+--------------------+----------------------------+------------------------+----------+---------+
| greptime | greptime_private | utf8 | utf8_bin | | |
| greptime | information_schema | utf8 | utf8_bin | | |
| greptime | pg_catalog | utf8 | utf8_bin | | |
+--------------+--------------------+----------------------------+------------------------+----------+---------+
-- test engines


@@ -5,30 +5,7 @@ Error: 1004(InvalidArguments), Schema pg_catalog already exists
select * from pg_catalog.pg_type order by oid;
+-----+-----------+--------+
| oid | typname | typlen |
+-----+-----------+--------+
| 1 | String | -1 |
| 2 | Binary | -1 |
| 3 | Int8 | 1 |
| 4 | Int16 | 2 |
| 5 | Int32 | 4 |
| 6 | Int64 | 8 |
| 7 | UInt8 | 1 |
| 8 | UInt16 | 2 |
| 9 | UInt32 | 4 |
| 10 | UInt64 | 8 |
| 11 | Float32 | 4 |
| 12 | Float64 | 8 |
| 13 | Decimal | 16 |
| 14 | Date | 4 |
| 15 | DateTime | 8 |
| 16 | Timestamp | 8 |
| 17 | Time | 8 |
| 18 | Duration | 8 |
| 19 | Interval | 16 |
| 20 | List | -1 |
+-----+-----------+--------+
Error: 4001(TableNotFound), Failed to plan SQL: Table not found: greptime.pg_catalog.pg_type
-- \d
SELECT n.nspname as "Schema",
@@ -44,11 +21,7 @@ WHERE c.relkind IN ('r','p','v','m','S','f','')
AND pg_catalog.pg_table_is_visible(c.oid)
ORDER BY 1,2;
+--------+---------+-------+-------+
| Schema | Name | Type | Owner |
+--------+---------+-------+-------+
| public | numbers | table | |
+--------+---------+-------+-------+
Error: 4001(TableNotFound), Failed to plan SQL: Table not found: greptime.pg_catalog.pg_class
-- \dt
SELECT n.nspname as "Schema",
@@ -64,11 +37,7 @@ WHERE c.relkind IN ('r','p','')
AND pg_catalog.pg_table_is_visible(c.oid)
ORDER BY 1,2;
+--------+---------+-------+-------+
| Schema | Name | Type | Owner |
+--------+---------+-------+-------+
| public | numbers | table | |
+--------+---------+-------+-------+
Error: 4001(TableNotFound), Failed to plan SQL: Table not found: greptime.pg_catalog.pg_class
-- make sure oid of namespace keep stable
SELECT * FROM pg_namespace ORDER BY oid;
@@ -100,11 +69,7 @@ where relnamespace = (
where nspname = 'my_db'
);
+---------+
| relname |
+---------+
| foo |
+---------+
Error: 4001(TableNotFound), Failed to plan SQL: Table not found: greptime.pg_catalog.pg_class
-- \dt
SELECT n.nspname as "Schema",
@@ -120,12 +85,7 @@ WHERE c.relkind IN ('r','p','')
AND pg_catalog.pg_table_is_visible(c.oid)
ORDER BY 1,2;
+--------+---------+-------+-------+
| Schema | Name | Type | Owner |
+--------+---------+-------+-------+
| my_db | foo | table | |
| public | numbers | table | |
+--------+---------+-------+-------+
Error: 4001(TableNotFound), Failed to plan SQL: Table not found: greptime.pg_catalog.pg_class
-- show tables in `my_db`, `public`
select relname
@@ -137,12 +97,7 @@ where relnamespace in (
)
order by relname;
+---------+
| relname |
+---------+
| foo |
| numbers |
+---------+
Error: 4001(TableNotFound), Failed to plan SQL: Table not found: greptime.pg_catalog.pg_class
select relname
from pg_catalog.pg_class
@@ -152,11 +107,7 @@ where relnamespace in (
where nspname like 'my%'
);
+---------+
| relname |
+---------+
| foo |
+---------+
Error: 4001(TableNotFound), Failed to plan SQL: Table not found: greptime.pg_catalog.pg_class
select relnamespace, relname, relkind
from pg_catalog.pg_class
@@ -169,11 +120,7 @@ where relnamespace in (
)
order by relnamespace, relname;
+--------------+---------+---------+
| relnamespace | relname | relkind |
+--------------+---------+---------+
| 434869349 | foo | r |
+--------------+---------+---------+
Error: 4001(TableNotFound), Failed to plan SQL: Table not found: greptime.pg_catalog.pg_class
use public;
@@ -190,24 +137,11 @@ Affected Rows: 0
-- pg_class
desc table pg_class;
+--------------+--------+-----+------+---------+---------------+
| Column | Type | Key | Null | Default | Semantic Type |
+--------------+--------+-----+------+---------+---------------+
| oid | UInt32 | | NO | | FIELD |
| relname | String | | NO | | FIELD |
| relnamespace | UInt32 | | NO | | FIELD |
| relkind | String | | NO | | FIELD |
| relowner | UInt32 | | NO | | FIELD |
+--------------+--------+-----+------+---------+---------------+
Error: 4001(TableNotFound), Table not found: pg_class
desc table pg_namespace;
+---------+--------+-----+------+---------+---------------+
| Column | Type | Key | Null | Default | Semantic Type |
+---------+--------+-----+------+---------+---------------+
| oid | UInt32 | | NO | | FIELD |
| nspname | String | | NO | | FIELD |
+---------+--------+-----+------+---------+---------------+
Error: 4001(TableNotFound), Table not found: pg_namespace
drop table my_db.foo;


@@ -77,11 +77,7 @@ WHERE c.relkind IN ('v','')
AND pg_catalog.pg_table_is_visible(c.oid)
ORDER BY 1,2;
+--------+-----------+------+-------+
| Schema | Name | Type | Owner |
+--------+-----------+------+-------+
| public | test_view | view | |
+--------+-----------+------+-------+
Error: 4001(TableNotFound), Failed to plan SQL: Table not found: greptime.pg_catalog.pg_class
-- SQLNESS REPLACE (\s\d+\s) ID
-- SQLNESS REPLACE (\s[\-0-9T:\.]{15,}) DATETIME
@@ -110,9 +106,6 @@ SELECT * FROM INFORMATION_SCHEMA.TABLES ORDER BY TABLE_NAME, TABLE_TYPE;
|greptime|information_schema|optimizer_trace|LOCALTEMPORARY|ID|ID|ID|ID|ID|ID||ID|Fixed|ID|ID|ID|DATETIME|||utf8_bin|ID|||Y|
|greptime|information_schema|parameters|LOCALTEMPORARY|ID|ID|ID|ID|ID|ID||ID|Fixed|ID|ID|ID|DATETIME|||utf8_bin|ID|||Y|
|greptime|information_schema|partitions|LOCALTEMPORARY|ID|ID|ID|ID|ID|ID||ID|Fixed|ID|ID|ID|DATETIME|||utf8_bin|ID|||Y|
|greptime|pg_catalog|pg_class|LOCALTEMPORARY|ID|ID|ID|ID|ID|ID||ID|Fixed|ID|ID|ID|DATETIME|||utf8_bin|ID|||Y|
|greptime|pg_catalog|pg_namespace|LOCALTEMPORARY|ID|ID|ID|ID|ID|ID||ID|Fixed|ID|ID|ID|DATETIME|||utf8_bin|ID|||Y|
|greptime|pg_catalog|pg_type|LOCALTEMPORARY|ID|ID|ID|ID|ID|ID||ID|Fixed|ID|ID|ID|DATETIME|||utf8_bin|ID|||Y|
|greptime|information_schema|profiling|LOCALTEMPORARY|ID|ID|ID|ID|ID|ID||ID|Fixed|ID|ID|ID|DATETIME|||utf8_bin|ID|||Y|
|greptime|information_schema|referential_constraints|LOCALTEMPORARY|ID|ID|ID|ID|ID|ID||ID|Fixed|ID|ID|ID|DATETIME|||utf8_bin|ID|||Y|
|greptime|information_schema|region_peers|LOCALTEMPORARY|ID|ID|ID|ID|ID|ID||ID|Fixed|ID|ID|ID|DATETIME|||utf8_bin|ID|||Y|
@@ -205,6 +198,5 @@ WHERE c.relkind IN ('v','')
AND pg_catalog.pg_table_is_visible(c.oid)
ORDER BY 1,2;
++
++
Error: 4001(TableNotFound), Failed to plan SQL: Table not found: greptime.pg_catalog.pg_class