Mirror of https://github.com/GreptimeTeam/greptimedb.git (synced 2025-12-27 08:29:59 +00:00)

Compare commits: v0.9.3 ... v0.10.0-ni

6 Commits

| Author | SHA1 | Date |
|---|---|---|
| | a8477e4142 | |
| | b950e705f5 | |
| | d2d62e0c6f | |
| | 5d9f8a3be7 | |
| | e88465840d | |
| | 67d95d2088 | |

Cargo.lock (generated) — 4 changes
@@ -3156,6 +3156,7 @@ dependencies = [
"arrow",
"arrow-array",
"arrow-schema",
"base64 0.21.7",
"common-base",
"common-decimal",
"common-error",
@@ -3164,6 +3165,7 @@ dependencies = [
"common-time",
"datafusion-common",
"enum_dispatch",
"greptime-proto",
"num",
"num-traits",
"ordered-float 3.9.2",
@@ -4300,7 +4302,7 @@ checksum = "d2fabcfbdc87f4758337ca535fb41a6d701b65693ce38287d856d1674551ec9b"
[[package]]
name = "greptime-proto"
version = "0.1.0"
source = "git+https://github.com/GreptimeTeam/greptime-proto.git?rev=c437b55725b7f5224fe9d46db21072b4a682ee4b#c437b55725b7f5224fe9d46db21072b4a682ee4b"
source = "git+https://github.com/GreptimeTeam/greptime-proto.git?rev=157cfdb52709e489cf1f3ce8e3042ed4ee8a524a#157cfdb52709e489cf1f3ce8e3042ed4ee8a524a"
dependencies = [
"prost 0.12.6",
"serde",
@@ -120,7 +120,7 @@ etcd-client = { version = "0.13" }
fst = "0.4.7"
futures = "0.3"
futures-util = "0.3"
greptime-proto = { git = "https://github.com/GreptimeTeam/greptime-proto.git", rev = "c437b55725b7f5224fe9d46db21072b4a682ee4b" }
greptime-proto = { git = "https://github.com/GreptimeTeam/greptime-proto.git", rev = "157cfdb52709e489cf1f3ce8e3042ed4ee8a524a" }
humantime = "2.1"
humantime-serde = "1.1"
itertools = "0.10"
@@ -74,7 +74,7 @@ Our core developers have been building time-series data platforms for years. Bas

* **Compatible with InfluxDB, Prometheus and more protocols**

Widely adopted database protocols and APIs, including MySQL, PostgreSQL, and Prometheus Remote Storage, etc. [Read more](https://docs.greptime.com/user-guide/clients/overview).
Widely adopted database protocols and APIs, including MySQL, PostgreSQL, and Prometheus Remote Storage, etc. [Read more](https://docs.greptime.com/user-guide/protocols/overview).

## Try GreptimeDB
@@ -21,14 +21,14 @@ use greptime_proto::v1::region::RegionResponse as RegionResponseV1;
#[derive(Debug)]
pub struct RegionResponse {
pub affected_rows: AffectedRows,
pub extension: HashMap<String, Vec<u8>>,
pub extensions: HashMap<String, Vec<u8>>,
}

impl RegionResponse {
pub fn from_region_response(region_response: RegionResponseV1) -> Self {
Self {
affected_rows: region_response.affected_rows as _,
extension: region_response.extension,
extensions: region_response.extensions,
}
}

@@ -36,7 +36,7 @@ impl RegionResponse {
pub fn new(affected_rows: AffectedRows) -> Self {
Self {
affected_rows,
extension: Default::default(),
extensions: Default::default(),
}
}
}
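The hunk above renames `RegionResponse::extension` to `extensions`; later hunks update every constructor and every site that combines responses. A minimal, self-contained sketch of the merge pattern those call sites follow (the `merge` helper is illustrative, not part of the diff):

```rust
use std::collections::HashMap;

type AffectedRows = usize;

#[derive(Debug, Default)]
pub struct RegionResponse {
    pub affected_rows: AffectedRows,
    pub extensions: HashMap<String, Vec<u8>>,
}

// Illustrative helper: fold several responses into one by summing affected
// rows and merging extension key/value pairs (later entries overwrite).
fn merge(responses: Vec<RegionResponse>) -> RegionResponse {
    let mut merged = RegionResponse::default();
    for r in responses {
        merged.affected_rows += r.affected_rows;
        merged.extensions.extend(r.extensions);
    }
    merged
}
```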
@@ -36,6 +36,7 @@ use futures_util::{StreamExt, TryStreamExt};
|
||||
use meta_client::client::MetaClient;
|
||||
use moka::sync::Cache;
|
||||
use partition::manager::{PartitionRuleManager, PartitionRuleManagerRef};
|
||||
use session::context::{Channel, QueryContext};
|
||||
use snafu::prelude::*;
|
||||
use table::dist_table::DistTable;
|
||||
use table::table::numbers::{NumbersTable, NUMBERS_TABLE_NAME};
|
||||
@@ -152,7 +153,11 @@ impl CatalogManager for KvBackendCatalogManager {
|
||||
Ok(keys)
|
||||
}
|
||||
|
||||
async fn schema_names(&self, catalog: &str) -> Result<Vec<String>> {
|
||||
async fn schema_names(
|
||||
&self,
|
||||
catalog: &str,
|
||||
query_ctx: Option<&QueryContext>,
|
||||
) -> Result<Vec<String>> {
|
||||
let stream = self
|
||||
.table_metadata_manager
|
||||
.schema_manager()
|
||||
@@ -163,12 +168,17 @@ impl CatalogManager for KvBackendCatalogManager {
|
||||
.map_err(BoxedError::new)
|
||||
.context(ListSchemasSnafu { catalog })?;
|
||||
|
||||
keys.extend(self.system_catalog.schema_names());
|
||||
keys.extend(self.system_catalog.schema_names(query_ctx));
|
||||
|
||||
Ok(keys.into_iter().collect())
|
||||
}
|
||||
|
||||
async fn table_names(&self, catalog: &str, schema: &str) -> Result<Vec<String>> {
|
||||
async fn table_names(
|
||||
&self,
|
||||
catalog: &str,
|
||||
schema: &str,
|
||||
query_ctx: Option<&QueryContext>,
|
||||
) -> Result<Vec<String>> {
|
||||
let stream = self
|
||||
.table_metadata_manager
|
||||
.table_name_manager()
|
||||
@@ -181,7 +191,7 @@ impl CatalogManager for KvBackendCatalogManager {
|
||||
.into_iter()
|
||||
.map(|(k, _)| k)
|
||||
.collect::<Vec<_>>();
|
||||
tables.extend_from_slice(&self.system_catalog.table_names(schema));
|
||||
tables.extend_from_slice(&self.system_catalog.table_names(schema, query_ctx));
|
||||
|
||||
Ok(tables.into_iter().collect())
|
||||
}
|
||||
@@ -194,8 +204,13 @@ impl CatalogManager for KvBackendCatalogManager {
|
||||
.context(TableMetadataManagerSnafu)
|
||||
}
|
||||
|
||||
async fn schema_exists(&self, catalog: &str, schema: &str) -> Result<bool> {
|
||||
if self.system_catalog.schema_exists(schema) {
|
||||
async fn schema_exists(
|
||||
&self,
|
||||
catalog: &str,
|
||||
schema: &str,
|
||||
query_ctx: Option<&QueryContext>,
|
||||
) -> Result<bool> {
|
||||
if self.system_catalog.schema_exists(schema, query_ctx) {
|
||||
return Ok(true);
|
||||
}
|
||||
|
||||
@@ -206,8 +221,14 @@ impl CatalogManager for KvBackendCatalogManager {
|
||||
.context(TableMetadataManagerSnafu)
|
||||
}
|
||||
|
||||
async fn table_exists(&self, catalog: &str, schema: &str, table: &str) -> Result<bool> {
|
||||
if self.system_catalog.table_exists(schema, table) {
|
||||
async fn table_exists(
|
||||
&self,
|
||||
catalog: &str,
|
||||
schema: &str,
|
||||
table: &str,
|
||||
query_ctx: Option<&QueryContext>,
|
||||
) -> Result<bool> {
|
||||
if self.system_catalog.table_exists(schema, table, query_ctx) {
|
||||
return Ok(true);
|
||||
}
|
||||
|
||||
@@ -225,10 +246,12 @@ impl CatalogManager for KvBackendCatalogManager {
|
||||
catalog_name: &str,
|
||||
schema_name: &str,
|
||||
table_name: &str,
|
||||
query_ctx: Option<&QueryContext>,
|
||||
) -> Result<Option<TableRef>> {
|
||||
if let Some(table) = self
|
||||
.system_catalog
|
||||
.table(catalog_name, schema_name, table_name)
|
||||
let channel = query_ctx.map_or(Channel::Unknown, |ctx| ctx.channel());
|
||||
if let Some(table) =
|
||||
self.system_catalog
|
||||
.table(catalog_name, schema_name, table_name, query_ctx)
|
||||
{
|
||||
return Ok(Some(table));
|
||||
}
|
||||
@@ -236,23 +259,45 @@ impl CatalogManager for KvBackendCatalogManager {
|
||||
let table_cache: TableCacheRef = self.cache_registry.get().context(CacheNotFoundSnafu {
|
||||
name: "table_cache",
|
||||
})?;
|
||||
|
||||
table_cache
|
||||
if let Some(table) = table_cache
|
||||
.get_by_ref(&TableName {
|
||||
catalog_name: catalog_name.to_string(),
|
||||
schema_name: schema_name.to_string(),
|
||||
table_name: table_name.to_string(),
|
||||
})
|
||||
.await
|
||||
.context(GetTableCacheSnafu)
|
||||
.context(GetTableCacheSnafu)?
|
||||
{
|
||||
return Ok(Some(table));
|
||||
}
|
||||
|
||||
if channel == Channel::Postgres {
|
||||
// falldown to pg_catalog
|
||||
if let Some(table) =
|
||||
self.system_catalog
|
||||
.table(catalog_name, PG_CATALOG_NAME, table_name, query_ctx)
|
||||
{
|
||||
return Ok(Some(table));
|
||||
}
|
||||
}
|
||||
|
||||
return Ok(None);
|
||||
}
|
||||
|
||||
fn tables<'a>(&'a self, catalog: &'a str, schema: &'a str) -> BoxStream<'a, Result<TableRef>> {
|
||||
fn tables<'a>(
|
||||
&'a self,
|
||||
catalog: &'a str,
|
||||
schema: &'a str,
|
||||
query_ctx: Option<&'a QueryContext>,
|
||||
) -> BoxStream<'a, Result<TableRef>> {
|
||||
let sys_tables = try_stream!({
|
||||
// System tables
|
||||
let sys_table_names = self.system_catalog.table_names(schema);
|
||||
let sys_table_names = self.system_catalog.table_names(schema, query_ctx);
|
||||
for table_name in sys_table_names {
|
||||
if let Some(table) = self.system_catalog.table(catalog, schema, &table_name) {
|
||||
if let Some(table) =
|
||||
self.system_catalog
|
||||
.table(catalog, schema, &table_name, query_ctx)
|
||||
{
|
||||
yield table;
|
||||
}
|
||||
}
|
||||
@@ -320,18 +365,27 @@ struct SystemCatalog {
}

impl SystemCatalog {
// TODO(j0hn50n133): remove the duplicated hard-coded table names logic
fn schema_names(&self) -> Vec<String> {
vec![
INFORMATION_SCHEMA_NAME.to_string(),
PG_CATALOG_NAME.to_string(),
]
fn schema_names(&self, query_ctx: Option<&QueryContext>) -> Vec<String> {
let channel = query_ctx.map_or(Channel::Unknown, |ctx| ctx.channel());
match channel {
// pg_catalog only visible under postgres protocol
Channel::Postgres => vec![
INFORMATION_SCHEMA_NAME.to_string(),
PG_CATALOG_NAME.to_string(),
],
_ => {
vec![INFORMATION_SCHEMA_NAME.to_string()]
}
}
}

fn table_names(&self, schema: &str) -> Vec<String> {
fn table_names(&self, schema: &str, query_ctx: Option<&QueryContext>) -> Vec<String> {
let channel = query_ctx.map_or(Channel::Unknown, |ctx| ctx.channel());
match schema {
INFORMATION_SCHEMA_NAME => self.information_schema_provider.table_names(),
PG_CATALOG_NAME => self.pg_catalog_provider.table_names(),
PG_CATALOG_NAME if channel == Channel::Postgres => {
self.pg_catalog_provider.table_names()
}
DEFAULT_SCHEMA_NAME => {
vec![NUMBERS_TABLE_NAME.to_string()]
}
@@ -339,23 +393,35 @@ impl SystemCatalog {
}
}

fn schema_exists(&self, schema: &str) -> bool {
schema == INFORMATION_SCHEMA_NAME || schema == PG_CATALOG_NAME
fn schema_exists(&self, schema: &str, query_ctx: Option<&QueryContext>) -> bool {
let channel = query_ctx.map_or(Channel::Unknown, |ctx| ctx.channel());
match channel {
Channel::Postgres => schema == PG_CATALOG_NAME || schema == INFORMATION_SCHEMA_NAME,
_ => schema == INFORMATION_SCHEMA_NAME,
}
}

fn table_exists(&self, schema: &str, table: &str) -> bool {
fn table_exists(&self, schema: &str, table: &str, query_ctx: Option<&QueryContext>) -> bool {
let channel = query_ctx.map_or(Channel::Unknown, |ctx| ctx.channel());
if schema == INFORMATION_SCHEMA_NAME {
self.information_schema_provider.table(table).is_some()
} else if schema == DEFAULT_SCHEMA_NAME {
table == NUMBERS_TABLE_NAME
} else if schema == PG_CATALOG_NAME {
} else if schema == PG_CATALOG_NAME && channel == Channel::Postgres {
self.pg_catalog_provider.table(table).is_some()
} else {
false
}
}

fn table(&self, catalog: &str, schema: &str, table_name: &str) -> Option<TableRef> {
fn table(
&self,
catalog: &str,
schema: &str,
table_name: &str,
query_ctx: Option<&QueryContext>,
) -> Option<TableRef> {
let channel = query_ctx.map_or(Channel::Unknown, |ctx| ctx.channel());
if schema == INFORMATION_SCHEMA_NAME {
let information_schema_provider =
self.catalog_cache.get_with_by_ref(catalog, move || {
@@ -366,7 +432,7 @@ impl SystemCatalog {
))
});
information_schema_provider.table(table_name)
} else if schema == PG_CATALOG_NAME {
} else if schema == PG_CATALOG_NAME && channel == Channel::Postgres {
if catalog == DEFAULT_CATALOG_NAME {
self.pg_catalog_provider.table(table_name)
} else {
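The `SystemCatalog` methods above all reduce to the same rule: `information_schema` is always visible, while `pg_catalog` is only reported to clients on the Postgres channel, and an absent `QueryContext` is treated as `Channel::Unknown`. A stripped-down sketch of that rule, with local stand-ins for the real `Channel` enum and schema-name constants:

```rust
// Stand-ins for session::context::Channel and the schema-name constants.
enum Channel {
    Postgres,
    Mysql,
    Unknown,
}

const INFORMATION_SCHEMA_NAME: &str = "information_schema";
const PG_CATALOG_NAME: &str = "pg_catalog";

// Visibility rule: pg_catalog is listed only for Postgres clients.
fn visible_system_schemas(channel: Option<Channel>) -> Vec<&'static str> {
    match channel.unwrap_or(Channel::Unknown) {
        Channel::Postgres => vec![INFORMATION_SCHEMA_NAME, PG_CATALOG_NAME],
        _ => vec![INFORMATION_SCHEMA_NAME],
    }
}

fn main() {
    assert_eq!(
        visible_system_schemas(Some(Channel::Postgres)),
        vec![INFORMATION_SCHEMA_NAME, PG_CATALOG_NAME]
    );
    assert_eq!(visible_system_schemas(Some(Channel::Mysql)), vec![INFORMATION_SCHEMA_NAME]);
    assert_eq!(visible_system_schemas(None), vec![INFORMATION_SCHEMA_NAME]);
}
```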
@@ -20,8 +20,10 @@ use std::fmt::{Debug, Formatter};
use std::sync::Arc;

use api::v1::CreateTableExpr;
use common_catalog::consts::{INFORMATION_SCHEMA_NAME, PG_CATALOG_NAME};
use futures::future::BoxFuture;
use futures_util::stream::BoxStream;
use session::context::QueryContext;
use table::metadata::TableId;
use table::TableRef;

@@ -44,15 +46,35 @@ pub trait CatalogManager: Send + Sync {

async fn catalog_names(&self) -> Result<Vec<String>>;

async fn schema_names(&self, catalog: &str) -> Result<Vec<String>>;
async fn schema_names(
&self,
catalog: &str,
query_ctx: Option<&QueryContext>,
) -> Result<Vec<String>>;

async fn table_names(&self, catalog: &str, schema: &str) -> Result<Vec<String>>;
async fn table_names(
&self,
catalog: &str,
schema: &str,
query_ctx: Option<&QueryContext>,
) -> Result<Vec<String>>;

async fn catalog_exists(&self, catalog: &str) -> Result<bool>;

async fn schema_exists(&self, catalog: &str, schema: &str) -> Result<bool>;
async fn schema_exists(
&self,
catalog: &str,
schema: &str,
query_ctx: Option<&QueryContext>,
) -> Result<bool>;

async fn table_exists(&self, catalog: &str, schema: &str, table: &str) -> Result<bool>;
async fn table_exists(
&self,
catalog: &str,
schema: &str,
table: &str,
query_ctx: Option<&QueryContext>,
) -> Result<bool>;

/// Returns the table by catalog, schema and table name.
async fn table(
@@ -60,10 +82,25 @@ pub trait CatalogManager: Send + Sync {
catalog: &str,
schema: &str,
table_name: &str,
query_ctx: Option<&QueryContext>,
) -> Result<Option<TableRef>>;

/// Returns all tables with a stream by catalog and schema.
fn tables<'a>(&'a self, catalog: &'a str, schema: &'a str) -> BoxStream<'a, Result<TableRef>>;
fn tables<'a>(
&'a self,
catalog: &'a str,
schema: &'a str,
query_ctx: Option<&'a QueryContext>,
) -> BoxStream<'a, Result<TableRef>>;

/// Check if `schema` is a reserved schema name
fn is_reserved_schema_name(&self, schema: &str) -> bool {
// We have to check whether a schema name is reserved before create schema.
// We need this rather than use schema_exists directly because `pg_catalog` is
// only visible via postgres protocol. So if we don't check, a mysql client may
// create a schema named `pg_catalog` which is somehow malformed.
schema == INFORMATION_SCHEMA_NAME || schema == PG_CATALOG_NAME
}
}

pub type CatalogManagerRef = Arc<dyn CatalogManager>;
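With the trait above, every lookup now threads an optional `QueryContext`: sites that run inside a user session pass `Some(ctx)` so protocol-gated schemas resolve correctly, while background paths such as the `information_schema` builders pass `None`, as the following hunks show. A compact, non-async sketch of the two call shapes (the trait here is a stand-in for the real async `CatalogManager`):

```rust
// Stand-ins for the real types; the actual trait is async and returns Result.
struct QueryContext;

trait SchemaLookup {
    fn schema_names(&self, catalog: &str, query_ctx: Option<&QueryContext>) -> Vec<String>;
}

// Inside a user session: pass the context so channel-gated schemas
// (e.g. pg_catalog over Postgres) are included or hidden appropriately.
fn list_for_session(lookup: &dyn SchemaLookup, ctx: &QueryContext) -> Vec<String> {
    lookup.schema_names("greptime", Some(ctx))
}

// Background/system path with no session available: pass None,
// which implementations treat as Channel::Unknown.
fn list_for_background_job(lookup: &dyn SchemaLookup) -> Vec<String> {
    lookup.schema_names("greptime", None)
}
```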
@@ -26,6 +26,7 @@ use common_catalog::consts::{
|
||||
use common_meta::key::flow::FlowMetadataManager;
|
||||
use common_meta::kv_backend::memory::MemoryKvBackend;
|
||||
use futures_util::stream::BoxStream;
|
||||
use session::context::QueryContext;
|
||||
use snafu::OptionExt;
|
||||
use table::TableRef;
|
||||
|
||||
@@ -53,7 +54,11 @@ impl CatalogManager for MemoryCatalogManager {
|
||||
Ok(self.catalogs.read().unwrap().keys().cloned().collect())
|
||||
}
|
||||
|
||||
async fn schema_names(&self, catalog: &str) -> Result<Vec<String>> {
|
||||
async fn schema_names(
|
||||
&self,
|
||||
catalog: &str,
|
||||
_query_ctx: Option<&QueryContext>,
|
||||
) -> Result<Vec<String>> {
|
||||
Ok(self
|
||||
.catalogs
|
||||
.read()
|
||||
@@ -67,7 +72,12 @@ impl CatalogManager for MemoryCatalogManager {
|
||||
.collect())
|
||||
}
|
||||
|
||||
async fn table_names(&self, catalog: &str, schema: &str) -> Result<Vec<String>> {
|
||||
async fn table_names(
|
||||
&self,
|
||||
catalog: &str,
|
||||
schema: &str,
|
||||
_query_ctx: Option<&QueryContext>,
|
||||
) -> Result<Vec<String>> {
|
||||
Ok(self
|
||||
.catalogs
|
||||
.read()
|
||||
@@ -87,11 +97,22 @@ impl CatalogManager for MemoryCatalogManager {
|
||||
self.catalog_exist_sync(catalog)
|
||||
}
|
||||
|
||||
async fn schema_exists(&self, catalog: &str, schema: &str) -> Result<bool> {
|
||||
async fn schema_exists(
|
||||
&self,
|
||||
catalog: &str,
|
||||
schema: &str,
|
||||
_query_ctx: Option<&QueryContext>,
|
||||
) -> Result<bool> {
|
||||
self.schema_exist_sync(catalog, schema)
|
||||
}
|
||||
|
||||
async fn table_exists(&self, catalog: &str, schema: &str, table: &str) -> Result<bool> {
|
||||
async fn table_exists(
|
||||
&self,
|
||||
catalog: &str,
|
||||
schema: &str,
|
||||
table: &str,
|
||||
_query_ctx: Option<&QueryContext>,
|
||||
) -> Result<bool> {
|
||||
let catalogs = self.catalogs.read().unwrap();
|
||||
Ok(catalogs
|
||||
.get(catalog)
|
||||
@@ -108,6 +129,7 @@ impl CatalogManager for MemoryCatalogManager {
|
||||
catalog: &str,
|
||||
schema: &str,
|
||||
table_name: &str,
|
||||
_query_ctx: Option<&QueryContext>,
|
||||
) -> Result<Option<TableRef>> {
|
||||
let result = try {
|
||||
self.catalogs
|
||||
@@ -121,7 +143,12 @@ impl CatalogManager for MemoryCatalogManager {
|
||||
Ok(result)
|
||||
}
|
||||
|
||||
fn tables<'a>(&'a self, catalog: &'a str, schema: &'a str) -> BoxStream<'a, Result<TableRef>> {
|
||||
fn tables<'a>(
|
||||
&'a self,
|
||||
catalog: &'a str,
|
||||
schema: &'a str,
|
||||
_query_ctx: Option<&QueryContext>,
|
||||
) -> BoxStream<'a, Result<TableRef>> {
|
||||
let catalogs = self.catalogs.read().unwrap();
|
||||
|
||||
let Some(schemas) = catalogs.get(catalog) else {
|
||||
@@ -371,11 +398,12 @@ mod tests {
|
||||
DEFAULT_CATALOG_NAME,
|
||||
DEFAULT_SCHEMA_NAME,
|
||||
NUMBERS_TABLE_NAME,
|
||||
None,
|
||||
)
|
||||
.await
|
||||
.unwrap()
|
||||
.unwrap();
|
||||
let stream = catalog_list.tables(DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME);
|
||||
let stream = catalog_list.tables(DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME, None);
|
||||
let tables = stream.try_collect::<Vec<_>>().await.unwrap();
|
||||
assert_eq!(tables.len(), 1);
|
||||
assert_eq!(
|
||||
@@ -384,7 +412,12 @@ mod tests {
|
||||
);
|
||||
|
||||
assert!(catalog_list
|
||||
.table(DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME, "not_exists")
|
||||
.table(
|
||||
DEFAULT_CATALOG_NAME,
|
||||
DEFAULT_SCHEMA_NAME,
|
||||
"not_exists",
|
||||
None
|
||||
)
|
||||
.await
|
||||
.unwrap()
|
||||
.is_none());
|
||||
@@ -411,7 +444,7 @@ mod tests {
|
||||
};
|
||||
catalog.register_table_sync(register_table_req).unwrap();
|
||||
assert!(catalog
|
||||
.table(DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME, table_name)
|
||||
.table(DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME, table_name, None)
|
||||
.await
|
||||
.unwrap()
|
||||
.is_some());
|
||||
@@ -423,7 +456,7 @@ mod tests {
|
||||
};
|
||||
catalog.deregister_table_sync(deregister_table_req).unwrap();
|
||||
assert!(catalog
|
||||
.table(DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME, table_name)
|
||||
.table(DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME, table_name, None)
|
||||
.await
|
||||
.unwrap()
|
||||
.is_none());
|
||||
|
||||
@@ -257,8 +257,8 @@ impl InformationSchemaColumnsBuilder {
|
||||
.context(UpgradeWeakCatalogManagerRefSnafu)?;
|
||||
let predicates = Predicates::from_scan_request(&request);
|
||||
|
||||
for schema_name in catalog_manager.schema_names(&catalog_name).await? {
|
||||
let mut stream = catalog_manager.tables(&catalog_name, &schema_name);
|
||||
for schema_name in catalog_manager.schema_names(&catalog_name, None).await? {
|
||||
let mut stream = catalog_manager.tables(&catalog_name, &schema_name, None);
|
||||
|
||||
while let Some(table) = stream.try_next().await? {
|
||||
let keys = &table.table_info().meta.primary_key_indices;
|
||||
|
||||
@@ -212,8 +212,8 @@ impl InformationSchemaKeyColumnUsageBuilder {
|
||||
.context(UpgradeWeakCatalogManagerRefSnafu)?;
|
||||
let predicates = Predicates::from_scan_request(&request);
|
||||
|
||||
for schema_name in catalog_manager.schema_names(&catalog_name).await? {
|
||||
let mut stream = catalog_manager.tables(&catalog_name, &schema_name);
|
||||
for schema_name in catalog_manager.schema_names(&catalog_name, None).await? {
|
||||
let mut stream = catalog_manager.tables(&catalog_name, &schema_name, None);
|
||||
|
||||
while let Some(table) = stream.try_next().await? {
|
||||
let mut primary_constraints = vec![];
|
||||
|
||||
@@ -240,9 +240,9 @@ impl InformationSchemaPartitionsBuilder {
|
||||
|
||||
let predicates = Predicates::from_scan_request(&request);
|
||||
|
||||
for schema_name in catalog_manager.schema_names(&catalog_name).await? {
|
||||
for schema_name in catalog_manager.schema_names(&catalog_name, None).await? {
|
||||
let table_info_stream = catalog_manager
|
||||
.tables(&catalog_name, &schema_name)
|
||||
.tables(&catalog_name, &schema_name, None)
|
||||
.try_filter_map(|t| async move {
|
||||
let table_info = t.table_info();
|
||||
if table_info.table_type == TableType::Temporary {
|
||||
|
||||
@@ -176,9 +176,9 @@ impl InformationSchemaRegionPeersBuilder {
|
||||
|
||||
let predicates = Predicates::from_scan_request(&request);
|
||||
|
||||
for schema_name in catalog_manager.schema_names(&catalog_name).await? {
|
||||
for schema_name in catalog_manager.schema_names(&catalog_name, None).await? {
|
||||
let table_id_stream = catalog_manager
|
||||
.tables(&catalog_name, &schema_name)
|
||||
.tables(&catalog_name, &schema_name, None)
|
||||
.try_filter_map(|t| async move {
|
||||
let table_info = t.table_info();
|
||||
if table_info.table_type == TableType::Temporary {
|
||||
|
||||
@@ -171,7 +171,7 @@ impl InformationSchemaSchemataBuilder {
|
||||
let table_metadata_manager = utils::table_meta_manager(&self.catalog_manager)?;
|
||||
let predicates = Predicates::from_scan_request(&request);
|
||||
|
||||
for schema_name in catalog_manager.schema_names(&catalog_name).await? {
|
||||
for schema_name in catalog_manager.schema_names(&catalog_name, None).await? {
|
||||
let opts = if let Some(table_metadata_manager) = &table_metadata_manager {
|
||||
table_metadata_manager
|
||||
.schema_manager()
|
||||
|
||||
@@ -176,8 +176,8 @@ impl InformationSchemaTableConstraintsBuilder {
|
||||
.context(UpgradeWeakCatalogManagerRefSnafu)?;
|
||||
let predicates = Predicates::from_scan_request(&request);
|
||||
|
||||
for schema_name in catalog_manager.schema_names(&catalog_name).await? {
|
||||
let mut stream = catalog_manager.tables(&catalog_name, &schema_name);
|
||||
for schema_name in catalog_manager.schema_names(&catalog_name, None).await? {
|
||||
let mut stream = catalog_manager.tables(&catalog_name, &schema_name, None);
|
||||
|
||||
while let Some(table) = stream.try_next().await? {
|
||||
let keys = &table.table_info().meta.primary_key_indices;
|
||||
|
||||
@@ -234,8 +234,8 @@ impl InformationSchemaTablesBuilder {
|
||||
.context(UpgradeWeakCatalogManagerRefSnafu)?;
|
||||
let predicates = Predicates::from_scan_request(&request);
|
||||
|
||||
for schema_name in catalog_manager.schema_names(&catalog_name).await? {
|
||||
let mut stream = catalog_manager.tables(&catalog_name, &schema_name);
|
||||
for schema_name in catalog_manager.schema_names(&catalog_name, None).await? {
|
||||
let mut stream = catalog_manager.tables(&catalog_name, &schema_name, None);
|
||||
|
||||
while let Some(table) = stream.try_next().await? {
|
||||
let table_info = table.table_info();
|
||||
|
||||
@@ -192,8 +192,8 @@ impl InformationSchemaViewsBuilder {
|
||||
.context(CastManagerSnafu)?
|
||||
.view_info_cache()?;
|
||||
|
||||
for schema_name in catalog_manager.schema_names(&catalog_name).await? {
|
||||
let mut stream = catalog_manager.tables(&catalog_name, &schema_name);
|
||||
for schema_name in catalog_manager.schema_names(&catalog_name, None).await? {
|
||||
let mut stream = catalog_manager.tables(&catalog_name, &schema_name, None);
|
||||
|
||||
while let Some(table) = stream.try_next().await? {
|
||||
let table_info = table.table_info();
|
||||
|
||||
@@ -18,15 +18,16 @@ mod pg_namespace;
mod table_names;

use std::collections::HashMap;
use std::sync::{Arc, Weak};
use std::sync::{Arc, LazyLock, Weak};

use common_catalog::consts::{self, PG_CATALOG_NAME};
use common_catalog::consts::{self, DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME, PG_CATALOG_NAME};
use datatypes::schema::ColumnSchema;
use lazy_static::lazy_static;
use paste::paste;
use pg_catalog_memory_table::get_schema_columns;
use pg_class::PGClass;
use pg_namespace::PGNamespace;
use session::context::{Channel, QueryContext};
use table::TableRef;
pub use table_names::*;

@@ -142,3 +143,12 @@ impl SystemSchemaProviderInner for PGCatalogProvider {
&self.catalog_name
}
}

/// Provide query context to call the [`CatalogManager`]'s method.
static PG_QUERY_CTX: LazyLock<QueryContext> = LazyLock::new(|| {
QueryContext::with_channel(DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME, Channel::Postgres)
});

fn query_ctx() -> Option<&'static QueryContext> {
Some(&PG_QUERY_CTX)
}
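`PG_QUERY_CTX` above is the pattern the `pg_catalog` tables rely on: a process-wide context pinned to the Postgres channel, built on first use via `std::sync::LazyLock` (stable since Rust 1.80) and handed to the catalog manager through `query_ctx()`. A generic sketch of the same pattern with a placeholder type instead of GreptimeDB's `QueryContext`:

```rust
use std::sync::LazyLock;

// Placeholder for a context type that is awkward to construct inline everywhere.
struct Ctx {
    channel: &'static str,
}

// Initialized exactly once, on first access, then shared for the process lifetime.
static PG_CTX: LazyLock<Ctx> = LazyLock::new(|| Ctx { channel: "postgres" });

fn ctx() -> Option<&'static Ctx> {
    // Deref-coerces &LazyLock<Ctx> to &Ctx, forcing initialization if needed.
    Some(&PG_CTX)
}

fn main() {
    assert_eq!(ctx().unwrap().channel, "postgres");
}
```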
@@ -32,7 +32,7 @@ use store_api::storage::ScanRequest;
|
||||
use table::metadata::TableType;
|
||||
|
||||
use super::pg_namespace::oid_map::PGNamespaceOidMapRef;
|
||||
use super::{OID_COLUMN_NAME, PG_CLASS};
|
||||
use super::{query_ctx, OID_COLUMN_NAME, PG_CLASS};
|
||||
use crate::error::{
|
||||
CreateRecordBatchSnafu, InternalSnafu, Result, UpgradeWeakCatalogManagerRefSnafu,
|
||||
};
|
||||
@@ -202,8 +202,11 @@ impl PGClassBuilder {
|
||||
.upgrade()
|
||||
.context(UpgradeWeakCatalogManagerRefSnafu)?;
|
||||
let predicates = Predicates::from_scan_request(&request);
|
||||
for schema_name in catalog_manager.schema_names(&catalog_name).await? {
|
||||
let mut stream = catalog_manager.tables(&catalog_name, &schema_name);
|
||||
for schema_name in catalog_manager
|
||||
.schema_names(&catalog_name, query_ctx())
|
||||
.await?
|
||||
{
|
||||
let mut stream = catalog_manager.tables(&catalog_name, &schema_name, query_ctx());
|
||||
while let Some(table) = stream.try_next().await? {
|
||||
let table_info = table.table_info();
|
||||
self.add_class(
|
||||
|
||||
@@ -31,7 +31,7 @@ use datatypes::vectors::{StringVectorBuilder, UInt32VectorBuilder, VectorRef};
|
||||
use snafu::{OptionExt, ResultExt};
|
||||
use store_api::storage::ScanRequest;
|
||||
|
||||
use super::{PGNamespaceOidMapRef, OID_COLUMN_NAME, PG_NAMESPACE};
|
||||
use super::{query_ctx, PGNamespaceOidMapRef, OID_COLUMN_NAME, PG_NAMESPACE};
|
||||
use crate::error::{
|
||||
CreateRecordBatchSnafu, InternalSnafu, Result, UpgradeWeakCatalogManagerRefSnafu,
|
||||
};
|
||||
@@ -180,7 +180,10 @@ impl PGNamespaceBuilder {
|
||||
.upgrade()
|
||||
.context(UpgradeWeakCatalogManagerRefSnafu)?;
|
||||
let predicates = Predicates::from_scan_request(&request);
|
||||
for schema_name in catalog_manager.schema_names(&catalog_name).await? {
|
||||
for schema_name in catalog_manager
|
||||
.schema_names(&catalog_name, query_ctx())
|
||||
.await?
|
||||
{
|
||||
self.add_namespace(&predicates, &schema_name);
|
||||
}
|
||||
self.finish()
|
||||
|
||||
@@ -23,7 +23,7 @@ use datafusion::datasource::view::ViewTable;
|
||||
use datafusion::datasource::{provider_as_source, TableProvider};
|
||||
use datafusion::logical_expr::TableSource;
|
||||
use itertools::Itertools;
|
||||
use session::context::QueryContext;
|
||||
use session::context::QueryContextRef;
|
||||
use snafu::{ensure, OptionExt, ResultExt};
|
||||
use table::metadata::TableType;
|
||||
use table::table::adapter::DfTableProviderAdapter;
|
||||
@@ -45,6 +45,7 @@ pub struct DfTableSourceProvider {
|
||||
disallow_cross_catalog_query: bool,
|
||||
default_catalog: String,
|
||||
default_schema: String,
|
||||
query_ctx: QueryContextRef,
|
||||
plan_decoder: SubstraitPlanDecoderRef,
|
||||
enable_ident_normalization: bool,
|
||||
}
|
||||
@@ -53,7 +54,7 @@ impl DfTableSourceProvider {
|
||||
pub fn new(
|
||||
catalog_manager: CatalogManagerRef,
|
||||
disallow_cross_catalog_query: bool,
|
||||
query_ctx: &QueryContext,
|
||||
query_ctx: QueryContextRef,
|
||||
plan_decoder: SubstraitPlanDecoderRef,
|
||||
enable_ident_normalization: bool,
|
||||
) -> Self {
|
||||
@@ -63,6 +64,7 @@ impl DfTableSourceProvider {
|
||||
resolved_tables: HashMap::new(),
|
||||
default_catalog: query_ctx.current_catalog().to_owned(),
|
||||
default_schema: query_ctx.current_schema(),
|
||||
query_ctx,
|
||||
plan_decoder,
|
||||
enable_ident_normalization,
|
||||
}
|
||||
@@ -71,8 +73,7 @@ impl DfTableSourceProvider {
|
||||
pub fn resolve_table_ref(&self, table_ref: TableReference) -> Result<ResolvedTableReference> {
|
||||
if self.disallow_cross_catalog_query {
|
||||
match &table_ref {
|
||||
TableReference::Bare { .. } => (),
|
||||
TableReference::Partial { .. } => {}
|
||||
TableReference::Bare { .. } | TableReference::Partial { .. } => {}
|
||||
TableReference::Full {
|
||||
catalog, schema, ..
|
||||
} => {
|
||||
@@ -107,7 +108,7 @@ impl DfTableSourceProvider {
|
||||
|
||||
let table = self
|
||||
.catalog_manager
|
||||
.table(catalog_name, schema_name, table_name)
|
||||
.table(catalog_name, schema_name, table_name, Some(&self.query_ctx))
|
||||
.await?
|
||||
.with_context(|| TableNotExistSnafu {
|
||||
table: format_full_table_name(catalog_name, schema_name, table_name),
|
||||
@@ -210,12 +211,12 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
fn test_validate_table_ref() {
|
||||
let query_ctx = &QueryContext::with("greptime", "public");
|
||||
let query_ctx = Arc::new(QueryContext::with("greptime", "public"));
|
||||
|
||||
let table_provider = DfTableSourceProvider::new(
|
||||
MemoryCatalogManager::with_default_setup(),
|
||||
true,
|
||||
query_ctx,
|
||||
query_ctx.clone(),
|
||||
DummyDecoder::arc(),
|
||||
true,
|
||||
);
|
||||
@@ -308,7 +309,7 @@ mod tests {
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_resolve_view() {
|
||||
let query_ctx = &QueryContext::with("greptime", "public");
|
||||
let query_ctx = Arc::new(QueryContext::with("greptime", "public"));
|
||||
let backend = Arc::new(MemoryKvBackend::default());
|
||||
let layered_cache_builder = LayeredCacheRegistryBuilder::default()
|
||||
.add_cache_registry(CacheRegistryBuilder::default().build());
|
||||
@@ -344,8 +345,13 @@ mod tests {
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
let mut table_provider =
|
||||
DfTableSourceProvider::new(catalog_manager, true, query_ctx, MockDecoder::arc(), true);
|
||||
let mut table_provider = DfTableSourceProvider::new(
|
||||
catalog_manager,
|
||||
true,
|
||||
query_ctx.clone(),
|
||||
MockDecoder::arc(),
|
||||
true,
|
||||
);
|
||||
|
||||
// View not found
|
||||
let table_ref = TableReference::bare("not_exists_view");
|
||||
|
||||
@@ -112,7 +112,7 @@ impl SchemaProvider for DummySchemaProvider {
|
||||
async fn table(&self, name: &str) -> datafusion::error::Result<Option<Arc<dyn TableProvider>>> {
|
||||
let table = self
|
||||
.catalog_manager
|
||||
.table(&self.catalog_name, &self.schema_name, name)
|
||||
.table(&self.catalog_name, &self.schema_name, name, None)
|
||||
.await?
|
||||
.with_context(|| TableNotExistSnafu {
|
||||
table: format_full_table_name(&self.catalog_name, &self.schema_name, name),
|
||||
|
||||
@@ -131,7 +131,7 @@ impl AlterLogicalTablesProcedure {
|
||||
let phy_raw_schemas = future::join_all(alter_region_tasks)
|
||||
.await
|
||||
.into_iter()
|
||||
.map(|res| res.map(|mut res| res.extension.remove(ALTER_PHYSICAL_EXTENSION_KEY)))
|
||||
.map(|res| res.map(|mut res| res.extensions.remove(ALTER_PHYSICAL_EXTENSION_KEY)))
|
||||
.collect::<Result<Vec<_>>>()?;
|
||||
|
||||
if phy_raw_schemas.is_empty() {
|
||||
|
||||
@@ -157,7 +157,7 @@ impl CreateLogicalTablesProcedure {
|
||||
let phy_raw_schemas = join_all(create_region_tasks)
|
||||
.await
|
||||
.into_iter()
|
||||
.map(|res| res.map(|mut res| res.extension.remove(ALTER_PHYSICAL_EXTENSION_KEY)))
|
||||
.map(|res| res.map(|mut res| res.extensions.remove(ALTER_PHYSICAL_EXTENSION_KEY)))
|
||||
.collect::<Result<Vec<_>>>()?;
|
||||
|
||||
if phy_raw_schemas.is_empty() {
|
||||
|
||||
@@ -324,10 +324,12 @@ impl HeartbeatTask {
|
||||
region_id: stat.region_id.as_u64(),
|
||||
engine: stat.engine,
|
||||
role: RegionRole::from(stat.role).into(),
|
||||
// TODO(jeremy): w/rcus
|
||||
// TODO(weny): w/rcus
|
||||
rcus: 0,
|
||||
wcus: 0,
|
||||
approximate_bytes: region_server.region_disk_usage(stat.region_id).unwrap_or(0),
|
||||
// TODO(weny): add extensions
|
||||
extensions: Default::default(),
|
||||
})
|
||||
.collect()
|
||||
}
|
||||
|
||||
@@ -366,10 +366,10 @@ impl RegionServerHandler for RegionServer {
|
||||
|
||||
// merge results by sum up affected rows and merge extensions.
|
||||
let mut affected_rows = 0;
|
||||
let mut extension = HashMap::new();
|
||||
let mut extensions = HashMap::new();
|
||||
for result in results {
|
||||
affected_rows += result.affected_rows;
|
||||
extension.extend(result.extension);
|
||||
extensions.extend(result.extensions);
|
||||
}
|
||||
|
||||
Ok(RegionResponseV1 {
|
||||
@@ -380,7 +380,7 @@ impl RegionServerHandler for RegionServer {
|
||||
}),
|
||||
}),
|
||||
affected_rows: affected_rows as _,
|
||||
extension,
|
||||
extensions,
|
||||
})
|
||||
}
|
||||
}
|
||||
@@ -708,7 +708,7 @@ impl RegionServerInner {
|
||||
.await?;
|
||||
Ok(RegionResponse {
|
||||
affected_rows: result.affected_rows,
|
||||
extension: result.extension,
|
||||
extensions: result.extensions,
|
||||
})
|
||||
}
|
||||
Err(err) => {
|
||||
|
||||
@@ -15,6 +15,7 @@ workspace = true
arrow.workspace = true
arrow-array.workspace = true
arrow-schema.workspace = true
base64.workspace = true
common-base.workspace = true
common-decimal.workspace = true
common-error.workspace = true
@@ -23,6 +24,7 @@ common-telemetry.workspace = true
common-time.workspace = true
datafusion-common.workspace = true
enum_dispatch = "0.3"
greptime-proto.workspace = true
num = "0.4"
num-traits = "0.2"
ordered-float = { version = "3.0", features = ["serde"] }
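The `base64` and `greptime-proto` dependencies added above feed the new `column_data_to_json` function in the next hunk, which renders binary column values as URL-safe base64 strings. A small sketch of just that encoding step with the `base64` 0.21 engine API (the helper name is illustrative):

```rust
use base64::engine::general_purpose::URL_SAFE;
use base64::Engine as _;

// URL-safe alphabet with padding; matches the diff's test expectation that
// b"hello" encodes to "aGVsbG8=".
fn binary_to_json_string(bytes: &[u8]) -> String {
    URL_SAFE.encode(bytes)
}

fn main() {
    assert_eq!(binary_to_json_string(b"hello"), "aGVsbG8=");
}
```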
@@ -18,6 +18,8 @@ use std::sync::Arc;
|
||||
|
||||
use arrow::datatypes::{DataType as ArrowDataType, Field};
|
||||
use arrow_array::{Array, ListArray};
|
||||
use base64::engine::general_purpose::URL_SAFE;
|
||||
use base64::Engine as _;
|
||||
use common_base::bytes::{Bytes, StringBytes};
|
||||
use common_decimal::Decimal128;
|
||||
use common_telemetry::error;
|
||||
@@ -28,8 +30,10 @@ use common_time::time::Time;
|
||||
use common_time::timestamp::{TimeUnit, Timestamp};
|
||||
use common_time::{Duration, Interval, Timezone};
|
||||
use datafusion_common::ScalarValue;
|
||||
use greptime_proto::v1::value::ValueData;
|
||||
pub use ordered_float::OrderedFloat;
|
||||
use serde::{Deserialize, Serialize, Serializer};
|
||||
use serde_json::{Number, Value as JsonValue};
|
||||
use snafu::{ensure, ResultExt};
|
||||
|
||||
use crate::error::{self, ConvertArrowArrayToScalarsSnafu, Error, Result, TryFromValueSnafu};
|
||||
@@ -1364,15 +1368,179 @@ impl<'a> ValueRef<'a> {
|
||||
}
|
||||
}
|
||||
|
||||
pub fn column_data_to_json(data: ValueData) -> JsonValue {
|
||||
match data {
|
||||
ValueData::BinaryValue(b) => JsonValue::String(URL_SAFE.encode(b)),
|
||||
ValueData::BoolValue(b) => JsonValue::Bool(b),
|
||||
ValueData::U8Value(i) => JsonValue::Number(i.into()),
|
||||
ValueData::U16Value(i) => JsonValue::Number(i.into()),
|
||||
ValueData::U32Value(i) => JsonValue::Number(i.into()),
|
||||
ValueData::U64Value(i) => JsonValue::Number(i.into()),
|
||||
ValueData::I8Value(i) => JsonValue::Number(i.into()),
|
||||
ValueData::I16Value(i) => JsonValue::Number(i.into()),
|
||||
ValueData::I32Value(i) => JsonValue::Number(i.into()),
|
||||
ValueData::I64Value(i) => JsonValue::Number(i.into()),
|
||||
ValueData::F32Value(f) => Number::from_f64(f as f64)
|
||||
.map(JsonValue::Number)
|
||||
.unwrap_or(JsonValue::Null),
|
||||
ValueData::F64Value(f) => Number::from_f64(f)
|
||||
.map(JsonValue::Number)
|
||||
.unwrap_or(JsonValue::Null),
|
||||
ValueData::StringValue(s) => JsonValue::String(s),
|
||||
ValueData::DateValue(d) => JsonValue::String(Date::from(d).to_string()),
|
||||
ValueData::DatetimeValue(d) => JsonValue::String(DateTime::from(d).to_string()),
|
||||
ValueData::TimeSecondValue(d) => JsonValue::String(Time::new_second(d).to_iso8601_string()),
|
||||
ValueData::TimeMillisecondValue(d) => {
|
||||
JsonValue::String(Time::new_millisecond(d).to_iso8601_string())
|
||||
}
|
||||
ValueData::TimeMicrosecondValue(d) => {
|
||||
JsonValue::String(Time::new_microsecond(d).to_iso8601_string())
|
||||
}
|
||||
ValueData::TimeNanosecondValue(d) => {
|
||||
JsonValue::String(Time::new_nanosecond(d).to_iso8601_string())
|
||||
}
|
||||
ValueData::TimestampMicrosecondValue(d) => {
|
||||
JsonValue::String(Timestamp::new_microsecond(d).to_iso8601_string())
|
||||
}
|
||||
ValueData::TimestampMillisecondValue(d) => {
|
||||
JsonValue::String(Timestamp::new_millisecond(d).to_iso8601_string())
|
||||
}
|
||||
ValueData::TimestampNanosecondValue(d) => {
|
||||
JsonValue::String(Timestamp::new_nanosecond(d).to_iso8601_string())
|
||||
}
|
||||
ValueData::TimestampSecondValue(d) => {
|
||||
JsonValue::String(Timestamp::new_second(d).to_iso8601_string())
|
||||
}
|
||||
ValueData::IntervalYearMonthValue(d) => JsonValue::String(format!("interval year [{}]", d)),
|
||||
ValueData::IntervalMonthDayNanoValue(d) => JsonValue::String(format!(
|
||||
"interval month [{}][{}][{}]",
|
||||
d.months, d.days, d.nanoseconds
|
||||
)),
|
||||
ValueData::IntervalDayTimeValue(d) => JsonValue::String(format!("interval day [{}]", d)),
|
||||
ValueData::Decimal128Value(d) => {
|
||||
JsonValue::String(format!("decimal128 [{}][{}]", d.hi, d.lo))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use arrow::datatypes::DataType as ArrowDataType;
|
||||
use common_time::timezone::set_default_timezone;
|
||||
use greptime_proto::v1::{Decimal128 as ProtoDecimal128, IntervalMonthDayNano};
|
||||
use num_traits::Float;
|
||||
|
||||
use super::*;
|
||||
use crate::vectors::ListVectorBuilder;
|
||||
|
||||
#[test]
|
||||
fn test_column_data_to_json() {
|
||||
assert_eq!(
|
||||
column_data_to_json(ValueData::BinaryValue(b"hello".to_vec())),
|
||||
JsonValue::String("aGVsbG8=".to_string())
|
||||
);
|
||||
assert_eq!(
|
||||
column_data_to_json(ValueData::BoolValue(true)),
|
||||
JsonValue::Bool(true)
|
||||
);
|
||||
assert_eq!(
|
||||
column_data_to_json(ValueData::U8Value(1)),
|
||||
JsonValue::Number(1.into())
|
||||
);
|
||||
assert_eq!(
|
||||
column_data_to_json(ValueData::U16Value(2)),
|
||||
JsonValue::Number(2.into())
|
||||
);
|
||||
assert_eq!(
|
||||
column_data_to_json(ValueData::U32Value(3)),
|
||||
JsonValue::Number(3.into())
|
||||
);
|
||||
assert_eq!(
|
||||
column_data_to_json(ValueData::U64Value(4)),
|
||||
JsonValue::Number(4.into())
|
||||
);
|
||||
assert_eq!(
|
||||
column_data_to_json(ValueData::I8Value(5)),
|
||||
JsonValue::Number(5.into())
|
||||
);
|
||||
assert_eq!(
|
||||
column_data_to_json(ValueData::I16Value(6)),
|
||||
JsonValue::Number(6.into())
|
||||
);
|
||||
assert_eq!(
|
||||
column_data_to_json(ValueData::I32Value(7)),
|
||||
JsonValue::Number(7.into())
|
||||
);
|
||||
assert_eq!(
|
||||
column_data_to_json(ValueData::I64Value(8)),
|
||||
JsonValue::Number(8.into())
|
||||
);
|
||||
assert_eq!(
|
||||
column_data_to_json(ValueData::F32Value(9.0)),
|
||||
JsonValue::Number(Number::from_f64(9.0_f64).unwrap())
|
||||
);
|
||||
assert_eq!(
|
||||
column_data_to_json(ValueData::F64Value(10.0)),
|
||||
JsonValue::Number(Number::from_f64(10.0_f64).unwrap())
|
||||
);
|
||||
assert_eq!(
|
||||
column_data_to_json(ValueData::StringValue("hello".to_string())),
|
||||
JsonValue::String("hello".to_string())
|
||||
);
|
||||
assert_eq!(
|
||||
column_data_to_json(ValueData::DateValue(123)),
|
||||
JsonValue::String("1970-05-04".to_string())
|
||||
);
|
||||
assert_eq!(
|
||||
column_data_to_json(ValueData::DatetimeValue(456)),
|
||||
JsonValue::String("1970-01-01 00:00:00.456+0000".to_string())
|
||||
);
|
||||
assert_eq!(
|
||||
column_data_to_json(ValueData::TimeSecondValue(789)),
|
||||
JsonValue::String("00:13:09+0000".to_string())
|
||||
);
|
||||
assert_eq!(
|
||||
column_data_to_json(ValueData::TimeMillisecondValue(789)),
|
||||
JsonValue::String("00:00:00.789+0000".to_string())
|
||||
);
|
||||
assert_eq!(
|
||||
column_data_to_json(ValueData::TimeMicrosecondValue(789)),
|
||||
JsonValue::String("00:00:00.000789+0000".to_string())
|
||||
);
|
||||
assert_eq!(
|
||||
column_data_to_json(ValueData::TimestampMillisecondValue(1234567890)),
|
||||
JsonValue::String("1970-01-15 06:56:07.890+0000".to_string())
|
||||
);
|
||||
assert_eq!(
|
||||
column_data_to_json(ValueData::TimestampNanosecondValue(1234567890123456789)),
|
||||
JsonValue::String("2009-02-13 23:31:30.123456789+0000".to_string())
|
||||
);
|
||||
assert_eq!(
|
||||
column_data_to_json(ValueData::TimestampSecondValue(1234567890)),
|
||||
JsonValue::String("2009-02-13 23:31:30+0000".to_string())
|
||||
);
|
||||
assert_eq!(
|
||||
column_data_to_json(ValueData::IntervalYearMonthValue(12)),
|
||||
JsonValue::String("interval year [12]".to_string())
|
||||
);
|
||||
assert_eq!(
|
||||
column_data_to_json(ValueData::IntervalMonthDayNanoValue(IntervalMonthDayNano {
|
||||
months: 1,
|
||||
days: 2,
|
||||
nanoseconds: 3,
|
||||
})),
|
||||
JsonValue::String("interval month [1][2][3]".to_string())
|
||||
);
|
||||
assert_eq!(
|
||||
column_data_to_json(ValueData::IntervalDayTimeValue(4)),
|
||||
JsonValue::String("interval day [4]".to_string())
|
||||
);
|
||||
assert_eq!(
|
||||
column_data_to_json(ValueData::Decimal128Value(ProtoDecimal128 { hi: 5, lo: 6 })),
|
||||
JsonValue::String("decimal128 [5][6]".to_string())
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_try_from_scalar_value() {
|
||||
assert_eq!(
|
||||
|
||||
@@ -356,9 +356,10 @@ impl SqlQueryHandler for Instance {
|
||||
|
||||
async fn is_valid_schema(&self, catalog: &str, schema: &str) -> Result<bool> {
|
||||
self.catalog_manager
|
||||
.schema_exists(catalog, schema)
|
||||
.schema_exists(catalog, schema, None)
|
||||
.await
|
||||
.context(error::CatalogSnafu)
|
||||
.map(|b| b && !self.catalog_manager.is_reserved_schema_name(schema))
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -102,7 +102,7 @@ impl Instance {
|
||||
) -> Result<Output> {
|
||||
let table = self
|
||||
.catalog_manager
|
||||
.table(catalog_name, schema_name, table_name)
|
||||
.table(catalog_name, schema_name, table_name, Some(ctx))
|
||||
.await
|
||||
.context(CatalogSnafu)?
|
||||
.with_context(|| TableNotFoundSnafu {
|
||||
|
||||
@@ -152,7 +152,12 @@ mod python {
|
||||
|
||||
if let Some(table) = self
|
||||
.catalog_manager
|
||||
.table(&expr.catalog_name, &expr.schema_name, &expr.table_name)
|
||||
.table(
|
||||
&expr.catalog_name,
|
||||
&expr.schema_name,
|
||||
&expr.table_name,
|
||||
None,
|
||||
)
|
||||
.await
|
||||
.context(CatalogSnafu)?
|
||||
{
|
||||
@@ -185,6 +190,7 @@ mod python {
|
||||
&table_name.catalog_name,
|
||||
&table_name.schema_name,
|
||||
&table_name.table_name,
|
||||
None,
|
||||
)
|
||||
.await
|
||||
.context(CatalogSnafu)?
|
||||
|
||||
@@ -93,6 +93,7 @@ mod tests {
|
||||
approximate_bytes: 0,
|
||||
engine: default_engine().to_string(),
|
||||
role: RegionRole::Follower,
|
||||
extensions: Default::default(),
|
||||
}
|
||||
}
|
||||
acc.stat = Some(Stat {
|
||||
|
||||
@@ -12,7 +12,7 @@
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
use std::collections::HashSet;
|
||||
use std::collections::{HashMap, HashSet};
|
||||
|
||||
use api::v1::meta::HeartbeatRequest;
|
||||
use common_meta::ClusterId;
|
||||
@@ -57,6 +57,8 @@ pub struct RegionStat {
|
||||
pub engine: String,
|
||||
/// The region role.
|
||||
pub role: RegionRole,
|
||||
/// The extension info of this region
|
||||
pub extensions: HashMap<String, Vec<u8>>,
|
||||
}
|
||||
|
||||
impl Stat {
|
||||
@@ -142,6 +144,7 @@ impl TryFrom<api::v1::meta::RegionStat> for RegionStat {
|
||||
approximate_bytes: value.approximate_bytes,
|
||||
engine: value.engine.to_string(),
|
||||
role: RegionRole::from(value.role()),
|
||||
extensions: value.extensions,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
@@ -135,6 +135,7 @@ mod test {
|
||||
wcus: 0,
|
||||
approximate_bytes: 0,
|
||||
engine: String::new(),
|
||||
extensions: Default::default(),
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -100,7 +100,7 @@ pub mod mock {
|
||||
}),
|
||||
}),
|
||||
affected_rows: 0,
|
||||
extension: Default::default(),
|
||||
extensions: Default::default(),
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
@@ -199,6 +199,7 @@ mod tests {
|
||||
approximate_bytes: 1,
|
||||
engine: "mito2".to_string(),
|
||||
role: RegionRole::Leader,
|
||||
extensions: Default::default(),
|
||||
}],
|
||||
..Default::default()
|
||||
}
|
||||
@@ -215,6 +216,7 @@ mod tests {
|
||||
approximate_bytes: 1,
|
||||
engine: "mito2".to_string(),
|
||||
role: RegionRole::Leader,
|
||||
extensions: Default::default(),
|
||||
}],
|
||||
..Default::default()
|
||||
}
|
||||
@@ -231,6 +233,7 @@ mod tests {
|
||||
approximate_bytes: 1,
|
||||
engine: "mito2".to_string(),
|
||||
role: RegionRole::Leader,
|
||||
extensions: Default::default(),
|
||||
}],
|
||||
..Default::default()
|
||||
}
|
||||
|
||||
@@ -162,7 +162,7 @@ impl RegionEngine for MetricEngine {
|
||||
|
||||
result.map_err(BoxedError::new).map(|rows| RegionResponse {
|
||||
affected_rows: rows,
|
||||
extension: extension_return_value,
|
||||
extensions: extension_return_value,
|
||||
})
|
||||
}
|
||||
|
||||
|
||||
@@ -709,6 +709,10 @@ impl ScanInput {
|
||||
rows_in_files + rows_in_memtables
|
||||
}
|
||||
|
||||
pub(crate) fn predicate(&self) -> Option<Predicate> {
|
||||
self.predicate.clone()
|
||||
}
|
||||
|
||||
/// Retrieves [`PartitionRange`] from memtable and files
|
||||
pub(crate) fn partition_ranges(&self) -> Vec<PartitionRange> {
|
||||
let mut id = 0;
|
||||
|
||||
@@ -515,6 +515,11 @@ impl RegionScanner for SeqScan {
|
||||
self.properties.partitions = ranges;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn has_predicate(&self) -> bool {
|
||||
let predicate = self.stream_ctx.input.predicate();
|
||||
predicate.map(|p| !p.exprs().is_empty()).unwrap_or(false)
|
||||
}
|
||||
}
|
||||
|
||||
impl DisplayAs for SeqScan {
|
||||
|
||||
@@ -228,6 +228,11 @@ impl RegionScanner for UnorderedScan {
|
||||
|
||||
Ok(stream)
|
||||
}
|
||||
|
||||
fn has_predicate(&self) -> bool {
|
||||
let predicate = self.stream_ctx.input.predicate();
|
||||
predicate.map(|p| !p.exprs().is_empty()).unwrap_or(false)
|
||||
}
|
||||
}
|
||||
|
||||
impl DisplayAs for UnorderedScan {
|
||||
|
||||
@@ -232,7 +232,7 @@ impl Deleter {
|
||||
|
||||
async fn get_table(&self, catalog: &str, schema: &str, table: &str) -> Result<TableRef> {
|
||||
self.catalog_manager
|
||||
.table(catalog, schema, table)
|
||||
.table(catalog, schema, table, None)
|
||||
.await
|
||||
.context(CatalogSnafu)?
|
||||
.with_context(|| TableNotFoundSnafu {
|
||||
|
||||
@@ -119,7 +119,7 @@ impl FlowServiceOperator {
|
||||
if let Some(prev) = &mut final_result {
|
||||
prev.affected_rows = res.affected_rows;
|
||||
prev.affected_flows.extend(res.affected_flows);
|
||||
prev.extension.extend(res.extension);
|
||||
prev.extensions.extend(res.extensions);
|
||||
} else {
|
||||
final_result = Some(res);
|
||||
}
|
||||
|
||||
@@ -608,7 +608,7 @@ impl Inserter {
|
||||
table: &str,
|
||||
) -> Result<Option<TableRef>> {
|
||||
self.catalog_manager
|
||||
.table(catalog, schema, table)
|
||||
.table(catalog, schema, table, None)
|
||||
.await
|
||||
.context(CatalogSnafu)
|
||||
}
|
||||
|
||||
@@ -64,7 +64,7 @@ impl<'a> RowToRegion<'a> {
|
||||
let catalog_name = self.ctx.current_catalog();
|
||||
let schema_name = self.ctx.current_schema();
|
||||
self.catalog_manager
|
||||
.table(catalog_name, &schema_name, table_name)
|
||||
.table(catalog_name, &schema_name, table_name, None)
|
||||
.await
|
||||
.context(CatalogSnafu)?
|
||||
.with_context(|| TableNotFoundSnafu {
|
||||
|
||||
@@ -139,7 +139,7 @@ impl<'a> StatementToRegion<'a> {
|
||||
|
||||
async fn get_table(&self, catalog: &str, schema: &str, table: &str) -> Result<TableRef> {
|
||||
self.catalog_manager
|
||||
.table(catalog, schema, table)
|
||||
.table(catalog, schema, table, None)
|
||||
.await
|
||||
.context(CatalogSnafu)?
|
||||
.with_context(|| TableNotFoundSnafu {
|
||||
|
||||
@@ -219,7 +219,7 @@ impl Requester {
|
||||
) -> Result<Vec<PartitionInfo>> {
|
||||
let table = self
|
||||
.catalog_manager
|
||||
.table(catalog, schema, table_name)
|
||||
.table(catalog, schema, table_name, None)
|
||||
.await
|
||||
.context(CatalogSnafu)?;
|
||||
|
||||
|
||||
@@ -286,7 +286,7 @@ impl StatementExecutor {
|
||||
|
||||
let table_ref = self
|
||||
.catalog_manager
|
||||
.table(&catalog, &schema, &table)
|
||||
.table(&catalog, &schema, &table, Some(&query_ctx))
|
||||
.await
|
||||
.context(CatalogSnafu)?
|
||||
.context(TableNotFoundSnafu { table_name: &table })?;
|
||||
@@ -313,7 +313,7 @@ impl StatementExecutor {
|
||||
let catalog = query_ctx.current_catalog();
|
||||
ensure!(
|
||||
self.catalog_manager
|
||||
.schema_exists(catalog, db.as_ref())
|
||||
.schema_exists(catalog, db.as_ref(), Some(&query_ctx))
|
||||
.await
|
||||
.context(CatalogSnafu)?,
|
||||
SchemaNotFoundSnafu { schema_info: &db }
|
||||
@@ -382,7 +382,7 @@ impl StatementExecutor {
|
||||
table,
|
||||
} = table_ref;
|
||||
self.catalog_manager
|
||||
.table(catalog, schema, table)
|
||||
.table(catalog, schema, table, None)
|
||||
.await
|
||||
.context(CatalogSnafu)?
|
||||
.with_context(|| TableNotFoundSnafu {
|
||||
|
||||
@@ -57,7 +57,7 @@ impl StatementExecutor {
|
||||
);
|
||||
let table_names = self
|
||||
.catalog_manager
|
||||
.table_names(&req.catalog_name, &req.schema_name)
|
||||
.table_names(&req.catalog_name, &req.schema_name, Some(&ctx))
|
||||
.await
|
||||
.context(CatalogSnafu)?;
|
||||
|
||||
|
||||
@@ -106,7 +106,7 @@ impl StatementExecutor {
|
||||
.context(error::ExternalSnafu)?;
|
||||
let table_ref = self
|
||||
.catalog_manager
|
||||
.table(&catalog, &schema, &table)
|
||||
.table(&catalog, &schema, &table, Some(&ctx))
|
||||
.await
|
||||
.context(CatalogSnafu)?
|
||||
.context(TableNotFoundSnafu { table_name: &table })?;
|
||||
@@ -207,6 +207,7 @@ impl StatementExecutor {
|
||||
&create_table.catalog_name,
|
||||
&create_table.schema_name,
|
||||
&create_table.table_name,
|
||||
Some(&query_ctx),
|
||||
)
|
||||
.await
|
||||
.context(CatalogSnafu)?
|
||||
@@ -487,7 +488,12 @@ impl StatementExecutor {
|
||||
// if view or table exists.
|
||||
if let Some(table) = self
|
||||
.catalog_manager
|
||||
.table(&expr.catalog_name, &expr.schema_name, &expr.view_name)
|
||||
.table(
|
||||
&expr.catalog_name,
|
||||
&expr.schema_name,
|
||||
&expr.view_name,
|
||||
Some(&ctx),
|
||||
)
|
||||
.await
|
||||
.context(CatalogSnafu)?
|
||||
{
|
||||
@@ -656,7 +662,7 @@ impl StatementExecutor {
|
||||
) -> Result<Output> {
|
||||
let view_info = if let Some(view) = self
|
||||
.catalog_manager
|
||||
.table(&catalog, &schema, &view)
|
||||
.table(&catalog, &schema, &view, None)
|
||||
.await
|
||||
.context(CatalogSnafu)?
|
||||
{
|
||||
@@ -766,6 +772,7 @@ impl StatementExecutor {
|
||||
&table_name.catalog_name,
|
||||
&table_name.schema_name,
|
||||
&table_name.table_name,
|
||||
Some(&query_context),
|
||||
)
|
||||
.await
|
||||
.context(CatalogSnafu)?
|
||||
@@ -816,7 +823,7 @@ impl StatementExecutor {
|
||||
|
||||
if self
|
||||
.catalog_manager
|
||||
.schema_exists(&catalog, &schema)
|
||||
.schema_exists(&catalog, &schema, None)
|
||||
.await
|
||||
.context(CatalogSnafu)?
|
||||
{
|
||||
@@ -858,6 +865,7 @@ impl StatementExecutor {
|
||||
&table_name.catalog_name,
|
||||
&table_name.schema_name,
|
||||
&table_name.table_name,
|
||||
Some(&query_context),
|
||||
)
|
||||
.await
|
||||
.context(CatalogSnafu)?
|
||||
@@ -944,7 +952,12 @@ impl StatementExecutor {
|
||||
|
||||
let table = self
|
||||
.catalog_manager
|
||||
.table(&catalog_name, &schema_name, &table_name)
|
||||
.table(
|
||||
&catalog_name,
|
||||
&schema_name,
|
||||
&table_name,
|
||||
Some(&query_context),
|
||||
)
|
||||
.await
|
||||
.context(CatalogSnafu)?
|
||||
.with_context(|| TableNotFoundSnafu {
|
||||
@@ -1167,9 +1180,10 @@ impl StatementExecutor {
|
||||
|
||||
if !self
|
||||
.catalog_manager
|
||||
.schema_exists(catalog, database)
|
||||
.schema_exists(catalog, database, None)
|
||||
.await
|
||||
.context(CatalogSnafu)?
|
||||
&& !self.catalog_manager.is_reserved_schema_name(database)
|
||||
{
|
||||
self.create_database_procedure(
|
||||
catalog.to_string(),
|
||||
|
||||
@@ -39,7 +39,7 @@ impl StatementExecutor {
|
||||
|
||||
let table = self
|
||||
.catalog_manager
|
||||
.table(&catalog, &schema, &table)
|
||||
.table(&catalog, &schema, &table, Some(&query_ctx))
|
||||
.await
|
||||
.context(CatalogSnafu)?
|
||||
.with_context(|| TableNotFoundSnafu {
|
||||
|
||||
@@ -143,7 +143,7 @@ impl StatementExecutor {
|
||||
|
||||
let table_ref = self
|
||||
.catalog_manager
|
||||
.table(&catalog, &schema, &view)
|
||||
.table(&catalog, &schema, &view, Some(&query_ctx))
|
||||
.await
|
||||
.context(CatalogSnafu)?
|
||||
.context(ViewNotFoundSnafu { view_name: &view })?;
|
||||
|
||||
@@ -13,27 +13,13 @@
|
||||
// limitations under the License.
|
||||
|
||||
use criterion::{black_box, criterion_group, criterion_main, Criterion};
|
||||
use pipeline::{parse, Array, Content, GreptimeTransformer, Pipeline, Value as PipelineValue};
|
||||
use pipeline::{parse, Content, GreptimeTransformer, Pipeline};
|
||||
use serde_json::{Deserializer, Value};
|
||||
|
||||
fn processor_map(
|
||||
pipeline: &Pipeline<GreptimeTransformer>,
|
||||
input_values: Vec<Value>,
|
||||
) -> impl IntoIterator<Item = greptime_proto::v1::Rows> {
|
||||
let pipeline_data = input_values
|
||||
.into_iter()
|
||||
.map(|v| PipelineValue::try_from(v).unwrap())
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
pipeline.exec(PipelineValue::Array(Array {
|
||||
values: pipeline_data,
|
||||
}))
|
||||
}
|
||||
|
||||
fn processor_mut(
|
||||
pipeline: &Pipeline<GreptimeTransformer>,
|
||||
input_values: Vec<Value>,
|
||||
) -> impl IntoIterator<Item = Vec<greptime_proto::v1::Row>> {
|
||||
) -> Result<Vec<greptime_proto::v1::Row>, String> {
|
||||
let mut payload = pipeline.init_intermediate_state();
|
||||
let mut result = Vec::with_capacity(input_values.len());
|
||||
|
||||
@@ -249,11 +235,10 @@ fn criterion_benchmark(c: &mut Criterion) {
|
||||
let pipeline = prepare_pipeline();
|
||||
let mut group = c.benchmark_group("pipeline");
|
||||
group.sample_size(50);
|
||||
group.bench_function("processor map", |b| {
|
||||
b.iter(|| processor_map(black_box(&pipeline), black_box(input_value.clone())))
|
||||
});
|
||||
group.bench_function("processor mut", |b| {
|
||||
b.iter(|| processor_mut(black_box(&pipeline), black_box(input_value.clone())))
|
||||
b.iter(|| {
|
||||
processor_mut(black_box(&pipeline), black_box(input_value.clone())).unwrap();
|
||||
})
|
||||
});
|
||||
group.finish();
|
||||
}
|
||||
|
||||
@@ -19,92 +19,24 @@ pub mod processor;
|
||||
pub mod transform;
|
||||
pub mod value;
|
||||
|
||||
use ahash::{HashMap, HashSet};
|
||||
use common_telemetry::{debug, warn};
|
||||
use ahash::HashSet;
|
||||
use common_telemetry::debug;
|
||||
use itertools::{merge, Itertools};
|
||||
use processor::Processor;
|
||||
use transform::{Transformer, Transforms};
|
||||
use value::{Map, Value};
|
||||
use processor::{Processor, ProcessorBuilder, Processors};
|
||||
use transform::{TransformBuilders, Transformer, Transforms};
|
||||
use value::Value;
|
||||
use yaml_rust::YamlLoader;
|
||||
|
||||
const DESCRIPTION: &str = "description";
|
||||
const PROCESSORS: &str = "processors";
|
||||
const TRANSFORM: &str = "transform";
|
||||
const TRANSFORMS: &str = "transforms";
|
||||
|
||||
pub enum Content {
|
||||
Json(String),
|
||||
Yaml(String),
|
||||
}
|
||||
|
||||
/// set the index for the processor keys
|
||||
/// the index is the position of the key in the final intermediate keys
|
||||
fn set_processor_keys_index(
|
||||
processors: &mut processor::Processors,
|
||||
final_intermediate_keys: &Vec<String>,
|
||||
) -> Result<(), String> {
|
||||
let final_intermediate_key_index = final_intermediate_keys
|
||||
.iter()
|
||||
.enumerate()
|
||||
.map(|(i, k)| (k.as_str(), i))
|
||||
.collect::<HashMap<_, _>>();
|
||||
for processor in processors.iter_mut() {
|
||||
for field in processor.fields_mut().iter_mut() {
|
||||
let index = final_intermediate_key_index.get(field.input_field.name.as_str()).ok_or(format!(
|
||||
"input field {} is not found in intermediate keys: {final_intermediate_keys:?} when set processor keys index",
|
||||
field.input_field.name
|
||||
))?;
|
||||
field.set_input_index(*index);
|
||||
for (k, v) in field.output_fields_index_mapping.iter_mut() {
|
||||
let index = final_intermediate_key_index.get(k.as_str());
|
||||
match index {
|
||||
Some(index) => {
|
||||
*v = *index;
|
||||
}
|
||||
None => {
|
||||
warn!(
|
||||
"output field {k} is not found in intermediate keys: {final_intermediate_keys:?} when set processor keys index"
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn set_transform_keys_index(
|
||||
transforms: &mut Transforms,
|
||||
final_intermediate_keys: &[String],
|
||||
output_keys: &[String],
|
||||
) -> Result<(), String> {
|
||||
let final_intermediate_key_index = final_intermediate_keys
|
||||
.iter()
|
||||
.enumerate()
|
||||
.map(|(i, k)| (k.as_str(), i))
|
||||
.collect::<HashMap<_, _>>();
|
||||
let output_key_index = output_keys
|
||||
.iter()
|
||||
.enumerate()
|
||||
.map(|(i, k)| (k.as_str(), i))
|
||||
.collect::<HashMap<_, _>>();
|
||||
for transform in transforms.iter_mut() {
|
||||
for field in transform.fields.iter_mut() {
|
||||
let index = final_intermediate_key_index.get(field.input_field.name.as_str()).ok_or(format!(
|
||||
"input field {} is not found in intermediate keys: {final_intermediate_keys:?} when set transform keys index",
|
||||
field.input_field.name
|
||||
))?;
|
||||
field.set_input_index(*index);
|
||||
for (k, v) in field.output_fields_index_mapping.iter_mut() {
|
||||
let index = output_key_index.get(k.as_str()).ok_or(format!(
|
||||
"output field {k} is not found in output keys: {final_intermediate_keys:?} when set transform keys index"
|
||||
))?;
|
||||
*v = *index;
|
||||
}
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn parse<T>(input: &Content) -> Result<Pipeline<T>, String>
|
||||
where
|
||||
T: Transformer,
|
||||
@@ -117,24 +49,22 @@ where
|
||||
|
||||
let description = doc[DESCRIPTION].as_str().map(|s| s.to_string());
|
||||
|
||||
let mut processors = if let Some(v) = doc[PROCESSORS].as_vec() {
|
||||
let processor_builder_list = if let Some(v) = doc[PROCESSORS].as_vec() {
|
||||
v.try_into()?
|
||||
} else {
|
||||
processor::Processors::default()
|
||||
processor::ProcessorBuilderList::default()
|
||||
};
|
||||
|
||||
let transforms = if let Some(v) = doc[TRANSFORM].as_vec() {
|
||||
v.try_into()?
|
||||
} else {
|
||||
Transforms::default()
|
||||
};
|
||||
let transform_builders =
|
||||
if let Some(v) = doc[TRANSFORMS].as_vec().or(doc[TRANSFORM].as_vec()) {
|
||||
v.try_into()?
|
||||
} else {
|
||||
TransformBuilders::default()
|
||||
};
|
||||
|
||||
let mut transformer = T::new(transforms)?;
|
||||
let transforms = transformer.transforms_mut();
|
||||
|
||||
let processors_output_keys = processors.output_keys();
|
||||
let processors_required_keys = processors.required_keys();
|
||||
let processors_required_original_keys = processors.required_original_keys();
|
||||
let processors_required_keys = &processor_builder_list.input_keys;
|
||||
let processors_output_keys = &processor_builder_list.output_keys;
|
||||
let processors_required_original_keys = &processor_builder_list.original_input_keys;
|
||||
|
||||
debug!(
|
||||
"processors_required_original_keys: {:?}",
|
||||
@@ -143,7 +73,7 @@ where
|
||||
debug!("processors_required_keys: {:?}", processors_required_keys);
|
||||
debug!("processors_output_keys: {:?}", processors_output_keys);
|
||||
|
||||
let transforms_required_keys = transforms.required_keys();
|
||||
let transforms_required_keys = &transform_builders.required_keys;
|
||||
let mut tr_keys = Vec::with_capacity(50);
|
||||
for key in transforms_required_keys.iter() {
|
||||
if !processors_output_keys.contains(key)
|
||||
@@ -183,9 +113,33 @@ where
|
||||
|
||||
final_intermediate_keys.extend(intermediate_keys_exclude_original);
|
||||
|
||||
let output_keys = transforms.output_keys().clone();
|
||||
set_processor_keys_index(&mut processors, &final_intermediate_keys)?;
|
||||
set_transform_keys_index(transforms, &final_intermediate_keys, &output_keys)?;
|
||||
let output_keys = transform_builders.output_keys.clone();
|
||||
|
||||
let processors_kind_list = processor_builder_list
|
||||
.processor_builders
|
||||
.into_iter()
|
||||
.map(|builder| builder.build(&final_intermediate_keys))
|
||||
.collect::<Result<Vec<_>, _>>()?;
|
||||
let processors = Processors {
|
||||
processors: processors_kind_list,
|
||||
required_keys: processors_required_keys.clone(),
|
||||
output_keys: processors_output_keys.clone(),
|
||||
required_original_keys: processors_required_original_keys.clone(),
|
||||
};
|
||||
|
||||
let transfor_list = transform_builders
|
||||
.builders
|
||||
.into_iter()
|
||||
.map(|builder| builder.build(&final_intermediate_keys, &output_keys))
|
||||
.collect::<Result<Vec<_>, String>>()?;
|
||||
|
||||
let transformers = Transforms {
|
||||
transforms: transfor_list,
|
||||
required_keys: transforms_required_keys.clone(),
|
||||
output_keys: output_keys.clone(),
|
||||
};
|
||||
|
||||
let transformer = T::new(transformers)?;
|
||||
|
||||
Ok(Pipeline {
|
||||
description,
|
||||
@@ -238,38 +192,6 @@ impl<T> Pipeline<T>
|
||||
where
|
||||
T: Transformer,
|
||||
{
|
||||
fn exec_map(&self, map: &mut Map) -> Result<(), String> {
|
||||
let v = map;
|
||||
for processor in self.processors.iter() {
|
||||
processor.exec_map(v)?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn exec(&self, mut val: Value) -> Result<T::Output, String> {
|
||||
let result = match val {
|
||||
Value::Map(ref mut map) => {
|
||||
self.exec_map(map)?;
|
||||
val
|
||||
}
|
||||
Value::Array(arr) => arr
|
||||
.values
|
||||
.into_iter()
|
||||
.map(|mut v| match v {
|
||||
Value::Map(ref mut map) => {
|
||||
self.exec_map(map)?;
|
||||
Ok(v)
|
||||
}
|
||||
_ => Err(format!("expected a map, but got {}", v)),
|
||||
})
|
||||
.collect::<Result<Vec<Value>, String>>()
|
||||
.map(|values| Value::Array(value::Array { values }))?,
|
||||
_ => return Err(format!("expected a map or array, but got {}", val)),
|
||||
};
|
||||
|
||||
self.transformer.transform(result)
|
||||
}
|
||||
|
||||
pub fn exec_mut(&self, val: &mut Vec<Value>) -> Result<T::VecOutput, String> {
|
||||
for processor in self.processors.iter() {
|
||||
processor.exec_mut(val)?;
|
||||
@@ -347,9 +269,24 @@ where
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn find_key_index(
|
||||
intermediate_keys: &[String],
|
||||
key: &str,
|
||||
kind: &str,
|
||||
) -> Result<usize, String> {
|
||||
intermediate_keys
|
||||
.iter()
|
||||
.position(|k| k == key)
|
||||
.ok_or(format!(
|
||||
"{} processor.{} not found in intermediate keys",
|
||||
kind, key
|
||||
))
|
||||
}
|
||||
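As a rough standalone sketch of how this intermediate-key lookup behaves (it mirrors the error format above; names and the driver code are illustrative, not the crate's public API):

```rust
// Standalone sketch of the intermediate-key lookup above; illustrative only.
fn find_key_index(intermediate_keys: &[String], key: &str, kind: &str) -> Result<usize, String> {
    intermediate_keys
        .iter()
        .position(|k| k == key)
        .ok_or(format!(
            "{} processor.{} not found in intermediate keys",
            kind, key
        ))
}

fn main() {
    let keys = vec!["greptime_timestamp".to_string(), "my_field".to_string()];
    assert_eq!(find_key_index(&keys, "my_field", "csv"), Ok(1));
    assert!(find_key_index(&keys, "no_such_key", "csv").is_err());
}
```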
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
|
||||
use api::v1::Rows;
|
||||
use greptime_proto::v1::value::ValueData;
|
||||
use greptime_proto::v1::{self, ColumnDataType, SemanticType};
|
||||
|
||||
@@ -359,96 +296,43 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
fn test_pipeline_prepare() {
|
||||
{
|
||||
let input_value_str = r#"
|
||||
{
|
||||
"my_field": "1,2",
|
||||
"foo": "bar"
|
||||
}
|
||||
"#;
|
||||
let input_value: serde_json::Value = serde_json::from_str(input_value_str).unwrap();
|
||||
|
||||
let pipeline_yaml = r#"
|
||||
---
|
||||
description: Pipeline for Apache Tomcat
|
||||
let input_value_str = r#"
|
||||
{
|
||||
"my_field": "1,2",
|
||||
"foo": "bar"
|
||||
}
|
||||
"#;
|
||||
let input_value: serde_json::Value = serde_json::from_str(input_value_str).unwrap();
|
||||
|
||||
let pipeline_yaml = r#"description: 'Pipeline for Apache Tomcat'
|
||||
processors:
|
||||
- csv:
|
||||
field: my_field, my_field,field1, field2
|
||||
|
||||
field: my_field
|
||||
target_fields: field1, field2
|
||||
transform:
|
||||
- field: field1
|
||||
type: uint32
|
||||
- field: field2
|
||||
type: uint32
|
||||
"#;
|
||||
let pipeline: Pipeline<GreptimeTransformer> =
|
||||
parse(&Content::Yaml(pipeline_yaml.into())).unwrap();
|
||||
let mut payload = pipeline.init_intermediate_state();
|
||||
pipeline.prepare(input_value, &mut payload).unwrap();
|
||||
assert_eq!(
|
||||
&["greptime_timestamp", "my_field"].to_vec(),
|
||||
pipeline.required_keys()
|
||||
);
|
||||
assert_eq!(
|
||||
payload,
|
||||
vec![
|
||||
Value::Null,
|
||||
Value::String("1,2".to_string()),
|
||||
Value::Null,
|
||||
Value::Null
|
||||
]
|
||||
);
|
||||
let result = pipeline.exec_mut(&mut payload).unwrap();
|
||||
let pipeline: Pipeline<GreptimeTransformer> =
|
||||
parse(&Content::Yaml(pipeline_yaml.into())).unwrap();
|
||||
let mut payload = pipeline.init_intermediate_state();
|
||||
pipeline.prepare(input_value, &mut payload).unwrap();
|
||||
assert_eq!(&["my_field"].to_vec(), pipeline.required_keys());
|
||||
assert_eq!(
|
||||
payload,
|
||||
vec![Value::String("1,2".to_string()), Value::Null, Value::Null]
|
||||
);
|
||||
let result = pipeline.exec_mut(&mut payload).unwrap();
|
||||
|
||||
assert_eq!(result.values[0].value_data, Some(ValueData::U32Value(1)));
|
||||
assert_eq!(result.values[1].value_data, Some(ValueData::U32Value(2)));
|
||||
match &result.values[2].value_data {
|
||||
Some(ValueData::TimestampNanosecondValue(v)) => {
|
||||
assert_ne!(*v, 0);
|
||||
}
|
||||
_ => panic!("expect null value"),
|
||||
assert_eq!(result.values[0].value_data, Some(ValueData::U32Value(1)));
|
||||
assert_eq!(result.values[1].value_data, Some(ValueData::U32Value(2)));
|
||||
match &result.values[2].value_data {
|
||||
Some(ValueData::TimestampNanosecondValue(v)) => {
|
||||
assert_ne!(*v, 0);
|
||||
}
|
||||
}
|
||||
{
|
||||
let input_value_str = r#"
|
||||
{
|
||||
"reqTimeSec": "1573840000.000"
|
||||
}
|
||||
"#;
|
||||
|
||||
let pipeline_yaml = r#"
|
||||
---
|
||||
description: Pipeline for Demo Log
|
||||
|
||||
processors:
|
||||
- gsub:
|
||||
field: reqTimeSec
|
||||
pattern: "\\."
|
||||
replacement: ""
|
||||
- epoch:
|
||||
field: reqTimeSec
|
||||
resolution: millisecond
|
||||
ignore_missing: true
|
||||
|
||||
transform:
|
||||
- field: reqTimeSec
|
||||
type: epoch, millisecond
|
||||
index: timestamp
|
||||
"#;
|
||||
let input_value: serde_json::Value = serde_json::from_str(input_value_str).unwrap();
|
||||
let pipeline: Pipeline<GreptimeTransformer> =
|
||||
parse(&Content::Yaml(pipeline_yaml.into())).unwrap();
|
||||
let mut payload = pipeline.init_intermediate_state();
|
||||
pipeline.prepare(input_value, &mut payload).unwrap();
|
||||
assert_eq!(&["reqTimeSec"].to_vec(), pipeline.required_keys());
|
||||
assert_eq!(payload, vec![Value::String("1573840000.000".to_string())]);
|
||||
let result = pipeline.exec_mut(&mut payload).unwrap();
|
||||
|
||||
assert_eq!(
|
||||
result.values[0].value_data,
|
||||
Some(ValueData::TimestampMillisecondValue(1573840000000))
|
||||
);
|
||||
_ => panic!("expect null value"),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -541,21 +425,19 @@ transform:
|
||||
#[test]
|
||||
fn test_csv_pipeline() {
|
||||
let input_value_str = r#"
|
||||
{
|
||||
"my_field": "1,2",
|
||||
"foo": "bar"
|
||||
}
|
||||
"#;
|
||||
{
|
||||
"my_field": "1,2",
|
||||
"foo": "bar"
|
||||
}
|
||||
"#;
|
||||
let input_value: serde_json::Value = serde_json::from_str(input_value_str).unwrap();
|
||||
|
||||
let pipeline_yaml = r#"
|
||||
---
|
||||
description: Pipeline for Apache Tomcat
|
||||
|
||||
processors:
|
||||
- csv:
|
||||
field: my_field,my_field, field1, field2
|
||||
|
||||
field: my_field
|
||||
target_fields: field1, field2
|
||||
transform:
|
||||
- field: field1
|
||||
type: uint32
|
||||
@@ -565,8 +447,22 @@ transform:
|
||||
|
||||
let pipeline: Pipeline<GreptimeTransformer> =
|
||||
parse(&Content::Yaml(pipeline_yaml.into())).unwrap();
|
||||
let output = pipeline.exec(input_value.try_into().unwrap());
|
||||
assert!(output.is_ok());
|
||||
let mut payload = pipeline.init_intermediate_state();
|
||||
pipeline.prepare(input_value, &mut payload).unwrap();
|
||||
assert_eq!(&["my_field"].to_vec(), pipeline.required_keys());
|
||||
assert_eq!(
|
||||
payload,
|
||||
vec![Value::String("1,2".to_string()), Value::Null, Value::Null]
|
||||
);
|
||||
let result = pipeline.exec_mut(&mut payload).unwrap();
|
||||
assert_eq!(result.values[0].value_data, Some(ValueData::U32Value(1)));
|
||||
assert_eq!(result.values[1].value_data, Some(ValueData::U32Value(2)));
|
||||
match &result.values[2].value_data {
|
||||
Some(ValueData::TimestampNanosecondValue(v)) => {
|
||||
assert_ne!(*v, 0);
|
||||
}
|
||||
_ => panic!("expect null value"),
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
@@ -596,7 +492,14 @@ transform:
|
||||
|
||||
let pipeline: Pipeline<GreptimeTransformer> =
|
||||
parse(&Content::Yaml(pipeline_yaml.into())).unwrap();
|
||||
let output = pipeline.exec(input_value.try_into().unwrap()).unwrap();
|
||||
let schema = pipeline.schemas().clone();
|
||||
let mut result = pipeline.init_intermediate_state();
|
||||
pipeline.prepare(input_value, &mut result).unwrap();
|
||||
let row = pipeline.exec_mut(&mut result).unwrap();
|
||||
let output = Rows {
|
||||
schema,
|
||||
rows: vec![row],
|
||||
};
|
||||
let schemas = output.schema;
|
||||
|
||||
assert_eq!(schemas.len(), 1);
|
||||
|
||||
@@ -12,69 +12,12 @@
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
use std::collections::BTreeMap;
|
||||
use std::ops::Deref;
|
||||
use std::str::FromStr;
|
||||
|
||||
use ahash::{HashSet, HashSetExt};
|
||||
use itertools::Itertools;
|
||||
|
||||
#[derive(Debug, Default, Clone)]
|
||||
pub struct Fields(Vec<Field>);
|
||||
|
||||
impl Fields {
|
||||
pub(crate) fn new(fields: Vec<Field>) -> Result<Self, String> {
|
||||
let ff = Fields(fields);
|
||||
ff.check()
|
||||
}
|
||||
|
||||
pub(crate) fn one(field: Field) -> Self {
|
||||
Fields(vec![field])
|
||||
}
|
||||
|
||||
pub(crate) fn get_target_fields(&self) -> Vec<&str> {
|
||||
self.0.iter().map(|f| f.get_target_field()).collect()
|
||||
}
|
||||
|
||||
fn check(self) -> Result<Self, String> {
|
||||
if self.0.is_empty() {
|
||||
return Err("fields must not be empty".to_string());
|
||||
}
|
||||
|
||||
let mut set = HashSet::new();
|
||||
for f in self.0.iter() {
|
||||
if set.contains(&f.input_field.name) {
|
||||
return Err(format!(
|
||||
"field name must be unique, but got duplicated: {}",
|
||||
f.input_field.name
|
||||
));
|
||||
}
|
||||
set.insert(&f.input_field.name);
|
||||
}
|
||||
|
||||
Ok(self)
|
||||
}
|
||||
}
|
||||
|
||||
impl std::fmt::Display for Fields {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
|
||||
let s = self.0.iter().map(|f| f.to_string()).join(";");
|
||||
write!(f, "{s}")
|
||||
}
|
||||
}
|
||||
|
||||
impl std::ops::Deref for Fields {
|
||||
type Target = Vec<Field>;
|
||||
|
||||
fn deref(&self) -> &Self::Target {
|
||||
&self.0
|
||||
}
|
||||
}
|
||||
|
||||
impl std::ops::DerefMut for Fields {
|
||||
fn deref_mut(&mut self) -> &mut Self::Target {
|
||||
&mut self.0
|
||||
}
|
||||
}
|
||||
use crate::etl::find_key_index;
|
||||
|
||||
/// Information about the input field including the name and index in intermediate keys.
|
||||
#[derive(Debug, Default, Clone)]
|
||||
pub struct InputFieldInfo {
|
||||
pub(crate) name: String,
|
||||
@@ -82,132 +25,202 @@ pub struct InputFieldInfo {
|
||||
}
|
||||
|
||||
impl InputFieldInfo {
|
||||
/// Create a new input field info with the given field name and index.
|
||||
pub(crate) fn new(field: impl Into<String>, index: usize) -> Self {
|
||||
InputFieldInfo {
|
||||
name: field.into(),
|
||||
index,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn name(field: impl Into<String>) -> Self {
|
||||
InputFieldInfo {
|
||||
name: field.into(),
|
||||
index: 0,
|
||||
/// Information about a field that has one input and one output.
|
||||
#[derive(Debug, Default, Clone)]
|
||||
pub struct OneInputOneOutputField {
|
||||
input: InputFieldInfo,
|
||||
output: Option<(String, usize)>,
|
||||
}
|
||||
|
||||
impl OneInputOneOutputField {
|
||||
/// Create a new field with the given input and output.
|
||||
pub(crate) fn new(input: InputFieldInfo, output: (String, usize)) -> Self {
|
||||
OneInputOneOutputField {
|
||||
input,
|
||||
output: Some(output),
|
||||
}
|
||||
}
|
||||
|
||||
/// Build a new field with the given processor kind, intermediate keys, input field, and target field.
|
||||
pub(crate) fn build(
|
||||
processor_kind: &str,
|
||||
intermediate_keys: &[String],
|
||||
input_field: &str,
|
||||
target_field: &str,
|
||||
) -> Result<Self, String> {
|
||||
let input_index = find_key_index(intermediate_keys, input_field, processor_kind)?;
|
||||
|
||||
let input_field_info = InputFieldInfo::new(input_field, input_index);
|
||||
let output_index = find_key_index(intermediate_keys, target_field, processor_kind)?;
|
||||
Ok(OneInputOneOutputField::new(
|
||||
input_field_info,
|
||||
(target_field.to_string(), output_index),
|
||||
))
|
||||
}
|
||||
|
||||
/// Get the input field information.
|
||||
pub(crate) fn input(&self) -> &InputFieldInfo {
|
||||
&self.input
|
||||
}
|
||||
|
||||
/// Get the index of the input field.
|
||||
pub(crate) fn input_index(&self) -> usize {
|
||||
self.input.index
|
||||
}
|
||||
|
||||
/// Get the name of the input field.
|
||||
pub(crate) fn input_name(&self) -> &str {
|
||||
&self.input.name
|
||||
}
|
||||
|
||||
/// Get the index of the output field.
|
||||
pub(crate) fn output_index(&self) -> usize {
|
||||
*self.output().1
|
||||
}
|
||||
|
||||
/// Get the name of the output field.
|
||||
pub(crate) fn output_name(&self) -> &str {
|
||||
self.output().0
|
||||
}
|
||||
|
||||
/// Get the output field information.
|
||||
pub(crate) fn output(&self) -> (&String, &usize) {
|
||||
if let Some((name, index)) = &self.output {
|
||||
(name, index)
|
||||
} else {
|
||||
(&self.input.name, &self.input.index)
|
||||
}
|
||||
}
|
||||
}
|
||||
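A hedged sketch of the idea behind `OneInputOneOutputField::build`: both the input field and the target field are resolved to positions in a shared intermediate-key table before execution. The field names and the `resolve` helper below are made up for illustration, not the crate's API.

```rust
// Illustrative only: resolve input and target field names to indices.
fn resolve(keys: &[String], input: &str, target: &str) -> Result<(usize, usize), String> {
    let pos = |k: &str| {
        keys.iter()
            .position(|x| x == k)
            .ok_or(format!("{k} not found in intermediate keys"))
    };
    Ok((pos(input)?, pos(target)?))
}

fn main() {
    let keys: Vec<String> = ["reqTimeSec", "ts"].iter().map(|s| s.to_string()).collect();
    assert_eq!(resolve(&keys, "reqTimeSec", "ts"), Ok((0, 1)));
}
```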
|
||||
/// Used to represent the input and output fields of a processor or transform.
|
||||
/// Information about a field that has one input and multiple outputs.
|
||||
#[derive(Debug, Default, Clone)]
|
||||
pub struct OneInputMultiOutputField {
|
||||
input: InputFieldInfo,
|
||||
/// Typically, processors that output multiple keys distinguish them by joining a prefix onto each key.

|
||||
prefix: Option<String>,
|
||||
}
|
||||
|
||||
impl OneInputMultiOutputField {
|
||||
/// Create a new field with the given input and prefix.
|
||||
pub(crate) fn new(input: InputFieldInfo, prefix: Option<String>) -> Self {
|
||||
OneInputMultiOutputField { input, prefix }
|
||||
}
|
||||
|
||||
/// Get the input field information.
|
||||
pub(crate) fn input(&self) -> &InputFieldInfo {
|
||||
&self.input
|
||||
}
|
||||
|
||||
/// Get the index of the input field.
|
||||
pub(crate) fn input_index(&self) -> usize {
|
||||
self.input.index
|
||||
}
|
||||
|
||||
/// Get the name of the input field.
|
||||
pub(crate) fn input_name(&self) -> &str {
|
||||
&self.input.name
|
||||
}
|
||||
|
||||
/// Get the prefix for the output fields.
|
||||
pub(crate) fn target_prefix(&self) -> &str {
|
||||
self.prefix.as_deref().unwrap_or(&self.input.name)
|
||||
}
|
||||
}
|
||||
|
||||
/// Raw processor-defined inputs and outputs
|
||||
#[derive(Debug, Default, Clone)]
|
||||
pub struct Field {
|
||||
/// The input field name and index.
|
||||
pub input_field: InputFieldInfo,
|
||||
|
||||
/// The output field name and index mapping.
|
||||
pub output_fields_index_mapping: BTreeMap<String, usize>,
|
||||
|
||||
// rename
|
||||
pub target_field: Option<String>,
|
||||
|
||||
// 1-to-many mapping
|
||||
// processors:
|
||||
// - csv
|
||||
pub target_fields: Option<Vec<String>>,
|
||||
pub(crate) input_field: String,
|
||||
pub(crate) target_field: Option<String>,
|
||||
}
|
||||
|
||||
impl Field {
|
||||
pub(crate) fn new(field: impl Into<String>) -> Self {
|
||||
Field {
|
||||
input_field: InputFieldInfo::name(field.into()),
|
||||
output_fields_index_mapping: BTreeMap::new(),
|
||||
target_field: None,
|
||||
target_fields: None,
|
||||
}
|
||||
}
|
||||
|
||||
/// target column_name in processor or transform
|
||||
/// if target_field is None, return input field name
|
||||
pub(crate) fn get_target_field(&self) -> &str {
|
||||
self.target_field
|
||||
.as_deref()
|
||||
.unwrap_or(&self.input_field.name)
|
||||
}
|
||||
|
||||
/// input column_name in processor or transform
|
||||
pub(crate) fn get_field_name(&self) -> &str {
|
||||
&self.input_field.name
|
||||
}
|
||||
|
||||
/// set input column index in processor or transform
|
||||
pub(crate) fn set_input_index(&mut self, index: usize) {
|
||||
self.input_field.index = index;
|
||||
}
|
||||
|
||||
pub(crate) fn set_output_index(&mut self, key: &str, index: usize) {
|
||||
if let Some(v) = self.output_fields_index_mapping.get_mut(key) {
|
||||
*v = index;
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn insert_output_index(&mut self, key: String, index: usize) {
|
||||
self.output_fields_index_mapping.insert(key, index);
|
||||
}
|
||||
}
|
||||
|
||||
impl std::str::FromStr for Field {
|
||||
impl FromStr for Field {
|
||||
type Err = String;
|
||||
|
||||
fn from_str(s: &str) -> Result<Self, Self::Err> {
|
||||
let mut parts = s.split(',');
|
||||
let field = parts.next().ok_or("field is missing")?.trim().to_string();
|
||||
let input_field = parts
|
||||
.next()
|
||||
.ok_or("input field is missing")?
|
||||
.trim()
|
||||
.to_string();
|
||||
let target_field = parts.next().map(|x| x.trim().to_string());
|
||||
|
||||
if field.is_empty() {
|
||||
return Err("field is empty".to_string());
|
||||
if input_field.is_empty() {
|
||||
return Err("input field is empty".to_string());
|
||||
}
|
||||
|
||||
let renamed_field = match parts.next() {
|
||||
Some(s) if !s.trim().is_empty() => Some(s.trim().to_string()),
|
||||
_ => None,
|
||||
};
|
||||
|
||||
// TODO(qtang): ???? what's this?
|
||||
// weird design? field: <field>,<target_field>,<target_fields>,<target_fields>....
|
||||
// and only use in csv processor
|
||||
let fields: Vec<_> = parts
|
||||
.map(|s| s.trim())
|
||||
.filter(|s| !s.is_empty())
|
||||
.map(|s| s.to_string())
|
||||
.collect();
|
||||
let target_fields = if fields.is_empty() {
|
||||
None
|
||||
} else {
|
||||
Some(fields)
|
||||
};
|
||||
|
||||
Ok(Field {
|
||||
input_field: InputFieldInfo::name(field),
|
||||
output_fields_index_mapping: BTreeMap::new(),
|
||||
target_field: renamed_field,
|
||||
target_fields,
|
||||
input_field,
|
||||
target_field,
|
||||
})
|
||||
}
|
||||
}
|
||||
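A hedged, standalone re-parse of the `field` shorthand this parser accepts, i.e. `"<input_field>"` or `"<input_field>, <target_field>"`. This is a simplified sketch, not the crate's own `Field` type:

```rust
// Illustrative only: parse "input" or "input, target" into (input, Option<target>).
fn parse_field(s: &str) -> Result<(String, Option<String>), String> {
    let mut parts = s.split(',');
    let input_field = parts
        .next()
        .ok_or("input field is missing")?
        .trim()
        .to_string();
    let target_field = parts.next().map(|x| x.trim().to_string());

    if input_field.is_empty() {
        return Err("input field is empty".to_string());
    }
    Ok((input_field, target_field))
}

fn main() {
    assert_eq!(parse_field("my_field").unwrap(), ("my_field".into(), None));
    assert_eq!(
        parse_field("my_field, field1").unwrap(),
        ("my_field".into(), Some("field1".into()))
    );
}
```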
|
||||
impl std::fmt::Display for Field {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
|
||||
match (&self.target_field, &self.target_fields) {
|
||||
(Some(target_field), None) => write!(f, "{}, {target_field}", self.input_field.name),
|
||||
(None, Some(target_fields)) => {
|
||||
write!(
|
||||
f,
|
||||
"{}, {}",
|
||||
self.input_field.name,
|
||||
target_fields.iter().join(",")
|
||||
)
|
||||
}
|
||||
_ => write!(f, "{}", self.input_field.name),
|
||||
impl Field {
|
||||
/// Create a new field with the given input and target fields.
|
||||
pub(crate) fn new(input_field: impl Into<String>, target_field: Option<String>) -> Self {
|
||||
Field {
|
||||
input_field: input_field.into(),
|
||||
target_field,
|
||||
}
|
||||
}
|
||||
|
||||
/// Get the input field.
|
||||
pub(crate) fn input_field(&self) -> &str {
|
||||
&self.input_field
|
||||
}
|
||||
|
||||
/// Get the target field.
|
||||
pub(crate) fn target_field(&self) -> Option<&str> {
|
||||
self.target_field.as_deref()
|
||||
}
|
||||
|
||||
/// Get the target field or the input field if the target field is not set.
|
||||
pub(crate) fn target_or_input_field(&self) -> &str {
|
||||
self.target_field.as_deref().unwrap_or(&self.input_field)
|
||||
}
|
||||
}
|
||||
|
||||
/// A collection of fields.
|
||||
#[derive(Debug, Default, Clone)]
|
||||
pub struct Fields(Vec<Field>);
|
||||
|
||||
impl Fields {
|
||||
pub(crate) fn new(fields: Vec<Field>) -> Self {
|
||||
Fields(fields)
|
||||
}
|
||||
|
||||
pub(crate) fn one(field: Field) -> Self {
|
||||
Fields(vec![field])
|
||||
}
|
||||
}
|
||||
|
||||
impl Deref for Fields {
|
||||
type Target = Vec<Field>;
|
||||
|
||||
fn deref(&self) -> &Self::Target {
|
||||
&self.0
|
||||
}
|
||||
}
|
||||
|
||||
impl IntoIterator for Fields {
|
||||
type Item = Field;
|
||||
type IntoIter = std::vec::IntoIter<Field>;
|
||||
|
||||
fn into_iter(self) -> Self::IntoIter {
|
||||
self.0.into_iter()
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
@@ -227,35 +240,14 @@ mod tests {
|
||||
|
||||
let cases = [
|
||||
// ("field", "field", None, None),
|
||||
(
|
||||
"field, target_field",
|
||||
"field",
|
||||
Some("target_field".into()),
|
||||
None,
|
||||
),
|
||||
(
|
||||
"field, target_field1, target_field2, target_field3",
|
||||
"field",
|
||||
Some("target_field1".into()),
|
||||
Some(vec!["target_field2".into(), "target_field3".into()]),
|
||||
),
|
||||
(
|
||||
"field,, target_field1, target_field2, target_field3",
|
||||
"field",
|
||||
None,
|
||||
Some(vec![
|
||||
"target_field1".into(),
|
||||
"target_field2".into(),
|
||||
"target_field3".into(),
|
||||
]),
|
||||
),
|
||||
("field, target_field", "field", Some("target_field")),
|
||||
("field", "field", None),
|
||||
];
|
||||
|
||||
for (s, field, target_field, target_fields) in cases.into_iter() {
|
||||
for (s, field, target_field) in cases.into_iter() {
|
||||
let f: Field = s.parse().unwrap();
|
||||
assert_eq!(f.get_field_name(), field, "{s}");
|
||||
assert_eq!(f.target_field, target_field, "{s}");
|
||||
assert_eq!(f.target_fields, target_fields, "{s}");
|
||||
assert_eq!(f.input_field(), field, "{s}");
|
||||
assert_eq!(f.target_field(), target_field, "{s}");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -25,22 +25,22 @@ pub mod timestamp;
|
||||
pub mod urlencoding;
|
||||
|
||||
use ahash::{HashSet, HashSetExt};
|
||||
use cmcd::CmcdProcessor;
|
||||
use csv::CsvProcessor;
|
||||
use date::DateProcessor;
|
||||
use dissect::DissectProcessor;
|
||||
use cmcd::{CmcdProcessor, CmcdProcessorBuilder};
|
||||
use csv::{CsvProcessor, CsvProcessorBuilder};
|
||||
use date::{DateProcessor, DateProcessorBuilder};
|
||||
use dissect::{DissectProcessor, DissectProcessorBuilder};
|
||||
use enum_dispatch::enum_dispatch;
|
||||
use epoch::EpochProcessor;
|
||||
use gsub::GsubProcessor;
|
||||
use epoch::{EpochProcessor, EpochProcessorBuilder};
|
||||
use gsub::{GsubProcessor, GsubProcessorBuilder};
|
||||
use itertools::Itertools;
|
||||
use join::JoinProcessor;
|
||||
use letter::LetterProcessor;
|
||||
use regex::RegexProcessor;
|
||||
use timestamp::TimestampProcessor;
|
||||
use urlencoding::UrlEncodingProcessor;
|
||||
use join::{JoinProcessor, JoinProcessorBuilder};
|
||||
use letter::{LetterProcessor, LetterProcessorBuilder};
|
||||
use regex::{RegexProcessor, RegexProcessorBuilder};
|
||||
use timestamp::{TimestampProcessor, TimestampProcessorBuilder};
|
||||
use urlencoding::{UrlEncodingProcessor, UrlEncodingProcessorBuilder};
|
||||
|
||||
use crate::etl::field::{Field, Fields};
|
||||
use crate::etl::value::{Map, Value};
|
||||
use super::field::{Field, Fields};
|
||||
use crate::etl::value::Value;
|
||||
|
||||
const FIELD_NAME: &str = "field";
|
||||
const FIELDS_NAME: &str = "fields";
|
||||
@@ -49,6 +49,7 @@ const METHOD_NAME: &str = "method";
|
||||
const PATTERN_NAME: &str = "pattern";
|
||||
const PATTERNS_NAME: &str = "patterns";
|
||||
const SEPARATOR_NAME: &str = "separator";
|
||||
const TARGET_FIELDS_NAME: &str = "target_fields";
|
||||
|
||||
// const IF_NAME: &str = "if";
|
||||
// const IGNORE_FAILURE_NAME: &str = "ignore_failure";
|
||||
@@ -62,55 +63,14 @@ const SEPARATOR_NAME: &str = "separator";
|
||||
/// The output of a processor is a map of key-value pairs that will be merged into the document when you use the `exec_map` method.
|
||||
#[enum_dispatch(ProcessorKind)]
|
||||
pub trait Processor: std::fmt::Debug + Send + Sync + 'static {
|
||||
/// Get the processor's fields
|
||||
/// `fields` means the same processor is applied to multiple keys; it does not mean that a single processor takes multiple inputs
|
||||
fn fields(&self) -> &Fields;
|
||||
|
||||
/// Get the processor's fields mutably
|
||||
fn fields_mut(&mut self) -> &mut Fields;
|
||||
|
||||
/// Get the processor's kind
|
||||
fn kind(&self) -> &str;
|
||||
|
||||
/// Whether to ignore missing
|
||||
fn ignore_missing(&self) -> bool;
|
||||
|
||||
/// All of the processor's output keys
|
||||
/// if a processor has multiple output keys, it should return all of them
|
||||
fn output_keys(&self) -> HashSet<String>;
|
||||
|
||||
/// Execute the processor on a document
|
||||
/// and return a map of key-value pairs
|
||||
fn exec_field(&self, val: &Value, field: &Field) -> Result<Map, String>;
|
||||
|
||||
/// Execute the processor on a vector which has been preprocessed by the pipeline
|
||||
fn exec_mut(&self, val: &mut Vec<Value>) -> Result<(), String>;
|
||||
|
||||
/// Execute the processor on a map
|
||||
/// and merge the output into the original map
|
||||
fn exec_map(&self, map: &mut Map) -> Result<(), String> {
|
||||
for ff @ Field {
|
||||
input_field: field_info,
|
||||
..
|
||||
} in self.fields().iter()
|
||||
{
|
||||
match map.get(&field_info.name) {
|
||||
Some(v) => {
|
||||
map.extend(self.exec_field(v, ff)?);
|
||||
}
|
||||
None if self.ignore_missing() => {}
|
||||
None => {
|
||||
return Err(format!(
|
||||
"{} processor: field '{}' is required but missing in {map}",
|
||||
self.kind(),
|
||||
field_info.name,
|
||||
))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
@@ -129,6 +89,42 @@ pub enum ProcessorKind {
|
||||
Date(DateProcessor),
|
||||
}
|
||||
|
||||
/// ProcessorBuilder trait defines the interface for all processor builders
|
||||
/// A processor builder is used to create a processor
|
||||
#[enum_dispatch(ProcessorBuilders)]
|
||||
pub trait ProcessorBuilder: std::fmt::Debug + Send + Sync + 'static {
|
||||
/// Get the processor's output keys
|
||||
fn output_keys(&self) -> HashSet<&str>;
|
||||
/// Get the processor's input keys
|
||||
fn input_keys(&self) -> HashSet<&str>;
|
||||
/// Build the processor
|
||||
fn build(self, intermediate_keys: &[String]) -> Result<ProcessorKind, String>;
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
#[enum_dispatch]
|
||||
pub enum ProcessorBuilders {
|
||||
Cmcd(CmcdProcessorBuilder),
|
||||
Csv(CsvProcessorBuilder),
|
||||
Dissect(DissectProcessorBuilder),
|
||||
Gsub(GsubProcessorBuilder),
|
||||
Join(JoinProcessorBuilder),
|
||||
Letter(LetterProcessorBuilder),
|
||||
Regex(RegexProcessorBuilder),
|
||||
Timestamp(TimestampProcessorBuilder),
|
||||
UrlEncoding(UrlEncodingProcessorBuilder),
|
||||
Epoch(EpochProcessorBuilder),
|
||||
Date(DateProcessorBuilder),
|
||||
}
|
||||
|
||||
#[derive(Debug, Default)]
|
||||
pub struct ProcessorBuilderList {
|
||||
pub(crate) processor_builders: Vec<ProcessorBuilders>,
|
||||
pub(crate) input_keys: Vec<String>,
|
||||
pub(crate) output_keys: Vec<String>,
|
||||
pub(crate) original_input_keys: Vec<String>,
|
||||
}
|
||||
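A hedged sketch of the two-phase pattern these types introduce: a builder only declares its input/output keys, and the concrete processor is built later, once the global intermediate-key table is known, so field names can be resolved to indices. `UpperBuilder`/`UpperProcessor` and the use of `std::collections::HashSet` here are made up for illustration; they are not the crate's types.

```rust
use std::collections::HashSet;

// Illustrative builder/processor pair; not the crate's API.
trait ProcessorBuilder {
    fn output_keys(&self) -> HashSet<&str>;
    fn input_keys(&self) -> HashSet<&str>;
    fn build(self, intermediate_keys: &[String]) -> Result<UpperProcessor, String>;
}

struct UpperBuilder {
    field: String,
}

#[derive(Debug)]
struct UpperProcessor {
    input_index: usize,
}

impl ProcessorBuilder for UpperBuilder {
    fn output_keys(&self) -> HashSet<&str> {
        [self.field.as_str()].into_iter().collect()
    }
    fn input_keys(&self) -> HashSet<&str> {
        [self.field.as_str()].into_iter().collect()
    }
    fn build(self, intermediate_keys: &[String]) -> Result<UpperProcessor, String> {
        // Resolve the declared field name to its index in the shared key table.
        let input_index = intermediate_keys
            .iter()
            .position(|k| *k == self.field)
            .ok_or_else(|| format!("{} not found in intermediate keys", self.field))?;
        Ok(UpperProcessor { input_index })
    }
}

fn main() {
    let intermediate_keys = vec!["message".to_string()];
    let processor = UpperBuilder { field: "message".into() }
        .build(&intermediate_keys)
        .unwrap();
    assert_eq!(processor.input_index, 0);
}
```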
|
||||
#[derive(Debug, Default)]
|
||||
pub struct Processors {
|
||||
/// An ordered list of processors
|
||||
@@ -174,52 +170,63 @@ impl Processors {
|
||||
}
|
||||
}
|
||||
|
||||
impl TryFrom<&Vec<yaml_rust::Yaml>> for Processors {
|
||||
impl TryFrom<&Vec<yaml_rust::Yaml>> for ProcessorBuilderList {
|
||||
type Error = String;
|
||||
|
||||
fn try_from(vec: &Vec<yaml_rust::Yaml>) -> Result<Self, Self::Error> {
|
||||
let mut processors = vec![];
|
||||
let mut processors_builders = vec![];
|
||||
let mut all_output_keys = HashSet::with_capacity(50);
|
||||
let mut all_required_keys = HashSet::with_capacity(50);
|
||||
let mut all_required_original_keys = HashSet::with_capacity(50);
|
||||
for doc in vec {
|
||||
let processor = parse_processor(doc)?;
|
||||
|
||||
// get all required keys
|
||||
let processor_required_keys: Vec<String> = processor
|
||||
.fields()
|
||||
.iter()
|
||||
.map(|f| f.input_field.name.clone())
|
||||
.collect();
|
||||
|
||||
for key in &processor_required_keys {
|
||||
if !all_output_keys.contains(key) {
|
||||
all_required_original_keys.insert(key.clone());
|
||||
}
|
||||
}
|
||||
|
||||
all_required_keys.extend(processor_required_keys);
|
||||
|
||||
let processor_output_keys = processor.output_keys().into_iter();
|
||||
all_output_keys.extend(processor_output_keys);
|
||||
|
||||
processors.push(processor);
|
||||
processors_builders.push(processor);
|
||||
}
|
||||
|
||||
let all_required_keys = all_required_keys.into_iter().sorted().collect();
|
||||
let all_output_keys = all_output_keys.into_iter().sorted().collect();
|
||||
let all_required_original_keys = all_required_original_keys.into_iter().sorted().collect();
|
||||
for processor in processors_builders.iter() {
|
||||
{
|
||||
// get all required keys
|
||||
let processor_required_keys = processor.input_keys();
|
||||
|
||||
Ok(Processors {
|
||||
processors,
|
||||
required_keys: all_required_keys,
|
||||
for key in &processor_required_keys {
|
||||
if !all_output_keys.contains(key) {
|
||||
all_required_original_keys.insert(*key);
|
||||
}
|
||||
}
|
||||
|
||||
all_required_keys.extend(processor_required_keys);
|
||||
|
||||
let processor_output_keys = processor.output_keys().into_iter();
|
||||
all_output_keys.extend(processor_output_keys);
|
||||
}
|
||||
}
|
||||
|
||||
let all_required_keys = all_required_keys
|
||||
.into_iter()
|
||||
.map(|x| x.to_string())
|
||||
.sorted()
|
||||
.collect();
|
||||
let all_output_keys = all_output_keys
|
||||
.into_iter()
|
||||
.map(|x| x.to_string())
|
||||
.sorted()
|
||||
.collect();
|
||||
let all_required_original_keys = all_required_original_keys
|
||||
.into_iter()
|
||||
.map(|x| x.to_string())
|
||||
.sorted()
|
||||
.collect();
|
||||
|
||||
Ok(ProcessorBuilderList {
|
||||
processor_builders: processors_builders,
|
||||
input_keys: all_required_keys,
|
||||
output_keys: all_output_keys,
|
||||
required_original_keys: all_required_original_keys,
|
||||
original_input_keys: all_required_original_keys,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
fn parse_processor(doc: &yaml_rust::Yaml) -> Result<ProcessorKind, String> {
|
||||
fn parse_processor(doc: &yaml_rust::Yaml) -> Result<ProcessorBuilders, String> {
|
||||
let map = doc.as_hash().ok_or("processor must be a map".to_string())?;
|
||||
|
||||
let key = map
|
||||
@@ -238,20 +245,24 @@ fn parse_processor(doc: &yaml_rust::Yaml) -> Result<ProcessorKind, String> {
|
||||
.ok_or("processor key must be a string".to_string())?;
|
||||
|
||||
let processor = match str_key {
|
||||
cmcd::PROCESSOR_CMCD => ProcessorKind::Cmcd(CmcdProcessor::try_from(value)?),
|
||||
csv::PROCESSOR_CSV => ProcessorKind::Csv(CsvProcessor::try_from(value)?),
|
||||
dissect::PROCESSOR_DISSECT => ProcessorKind::Dissect(DissectProcessor::try_from(value)?),
|
||||
epoch::PROCESSOR_EPOCH => ProcessorKind::Epoch(EpochProcessor::try_from(value)?),
|
||||
date::PROCESSOR_DATE => ProcessorKind::Date(DateProcessor::try_from(value)?),
|
||||
gsub::PROCESSOR_GSUB => ProcessorKind::Gsub(GsubProcessor::try_from(value)?),
|
||||
join::PROCESSOR_JOIN => ProcessorKind::Join(JoinProcessor::try_from(value)?),
|
||||
letter::PROCESSOR_LETTER => ProcessorKind::Letter(LetterProcessor::try_from(value)?),
|
||||
regex::PROCESSOR_REGEX => ProcessorKind::Regex(RegexProcessor::try_from(value)?),
|
||||
cmcd::PROCESSOR_CMCD => ProcessorBuilders::Cmcd(CmcdProcessorBuilder::try_from(value)?),
|
||||
csv::PROCESSOR_CSV => ProcessorBuilders::Csv(CsvProcessorBuilder::try_from(value)?),
|
||||
dissect::PROCESSOR_DISSECT => {
|
||||
ProcessorBuilders::Dissect(DissectProcessorBuilder::try_from(value)?)
|
||||
}
|
||||
epoch::PROCESSOR_EPOCH => ProcessorBuilders::Epoch(EpochProcessorBuilder::try_from(value)?),
|
||||
date::PROCESSOR_DATE => ProcessorBuilders::Date(DateProcessorBuilder::try_from(value)?),
|
||||
gsub::PROCESSOR_GSUB => ProcessorBuilders::Gsub(GsubProcessorBuilder::try_from(value)?),
|
||||
join::PROCESSOR_JOIN => ProcessorBuilders::Join(JoinProcessorBuilder::try_from(value)?),
|
||||
letter::PROCESSOR_LETTER => {
|
||||
ProcessorBuilders::Letter(LetterProcessorBuilder::try_from(value)?)
|
||||
}
|
||||
regex::PROCESSOR_REGEX => ProcessorBuilders::Regex(RegexProcessorBuilder::try_from(value)?),
|
||||
timestamp::PROCESSOR_TIMESTAMP => {
|
||||
ProcessorKind::Timestamp(TimestampProcessor::try_from(value)?)
|
||||
ProcessorBuilders::Timestamp(TimestampProcessorBuilder::try_from(value)?)
|
||||
}
|
||||
urlencoding::PROCESSOR_URL_ENCODING => {
|
||||
ProcessorKind::UrlEncoding(UrlEncodingProcessor::try_from(value)?)
|
||||
ProcessorBuilders::UrlEncoding(UrlEncodingProcessorBuilder::try_from(value)?)
|
||||
}
|
||||
_ => return Err(format!("unsupported {} processor", str_key)),
|
||||
};
|
||||
@@ -301,19 +312,10 @@ where
|
||||
})
|
||||
}
|
||||
|
||||
pub(crate) fn yaml_fields(v: &yaml_rust::Yaml, field: &str) -> Result<Fields, String> {
|
||||
let v = yaml_parse_strings(v, field)?;
|
||||
Fields::new(v)
|
||||
pub(crate) fn yaml_new_fields(v: &yaml_rust::Yaml, field: &str) -> Result<Fields, String> {
|
||||
yaml_parse_strings(v, field).map(Fields::new)
|
||||
}
|
||||
|
||||
pub(crate) fn yaml_field(v: &yaml_rust::Yaml, field: &str) -> Result<Field, String> {
|
||||
pub(crate) fn yaml_new_field(v: &yaml_rust::Yaml, field: &str) -> Result<Field, String> {
|
||||
yaml_parse_string(v, field)
|
||||
}
|
||||
|
||||
pub(crate) fn update_one_one_output_keys(fields: &mut Fields) {
|
||||
for field in fields.iter_mut() {
|
||||
field
|
||||
.output_fields_index_mapping
|
||||
.insert(field.get_target_field().to_string(), 0_usize);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -12,14 +12,18 @@
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
use std::collections::BTreeMap;
|
||||
|
||||
use ahash::HashSet;
|
||||
use urlencoding::decode;
|
||||
|
||||
use crate::etl::field::{Field, Fields};
|
||||
use crate::etl::field::{Field, Fields, InputFieldInfo, OneInputMultiOutputField};
|
||||
use crate::etl::find_key_index;
|
||||
use crate::etl::processor::{
|
||||
yaml_bool, yaml_field, yaml_fields, FIELDS_NAME, FIELD_NAME, IGNORE_MISSING_NAME,
|
||||
yaml_bool, yaml_new_field, yaml_new_fields, Processor, ProcessorBuilder, ProcessorKind,
|
||||
FIELDS_NAME, FIELD_NAME, IGNORE_MISSING_NAME,
|
||||
};
|
||||
use crate::etl::value::{Map, Value};
|
||||
use crate::etl::value::Value;
|
||||
|
||||
pub(crate) const PROCESSOR_CMCD: &str = "cmcd";
|
||||
|
||||
@@ -63,6 +67,178 @@ const CMCD_KEYS: [&str; 18] = [
|
||||
CMCD_KEY_V,
|
||||
];
|
||||
|
||||
/// CmcdProcessorBuilder is a builder for CmcdProcessor
|
||||
/// parsed from the raw YAML
|
||||
#[derive(Debug, Default)]
|
||||
pub struct CmcdProcessorBuilder {
|
||||
fields: Fields,
|
||||
output_keys: HashSet<String>,
|
||||
ignore_missing: bool,
|
||||
}
|
||||
|
||||
impl CmcdProcessorBuilder {
|
||||
/// `build_cmcd_outputs` builds the CMCD output info,
|
||||
/// generating the index and resolver function for each output key
|
||||
pub(super) fn build_cmcd_outputs(
|
||||
field: &Field,
|
||||
intermediate_keys: &[String],
|
||||
) -> Result<(BTreeMap<String, usize>, Vec<CmcdOutputInfo>), String> {
|
||||
let mut output_index = BTreeMap::new();
|
||||
let mut cmcd_field_outputs = Vec::with_capacity(CMCD_KEYS.len());
|
||||
for cmcd in CMCD_KEYS {
|
||||
let final_key = generate_key(field.target_or_input_field(), cmcd);
|
||||
let index = find_key_index(intermediate_keys, &final_key, "cmcd")?;
|
||||
output_index.insert(final_key.clone(), index);
|
||||
match cmcd {
|
||||
CMCD_KEY_BS | CMCD_KEY_SU => {
|
||||
let output_info = CmcdOutputInfo::new(final_key, cmcd, index, bs_su);
|
||||
cmcd_field_outputs.push(output_info);
|
||||
}
|
||||
CMCD_KEY_BR | CMCD_KEY_BL | CMCD_KEY_D | CMCD_KEY_DL | CMCD_KEY_MTP
|
||||
| CMCD_KEY_RTP | CMCD_KEY_TB => {
|
||||
let output_info = CmcdOutputInfo::new(final_key, cmcd, index, br_tb);
|
||||
cmcd_field_outputs.push(output_info);
|
||||
}
|
||||
CMCD_KEY_CID | CMCD_KEY_NRR | CMCD_KEY_OT | CMCD_KEY_SF | CMCD_KEY_SID
|
||||
| CMCD_KEY_ST | CMCD_KEY_V => {
|
||||
let output_info = CmcdOutputInfo::new(final_key, cmcd, index, cid_v);
|
||||
cmcd_field_outputs.push(output_info);
|
||||
}
|
||||
CMCD_KEY_NOR => {
|
||||
let output_info = CmcdOutputInfo::new(final_key, cmcd, index, nor);
|
||||
cmcd_field_outputs.push(output_info);
|
||||
}
|
||||
CMCD_KEY_PR => {
|
||||
let output_info = CmcdOutputInfo::new(final_key, cmcd, index, pr);
|
||||
cmcd_field_outputs.push(output_info);
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
}
|
||||
Ok((output_index, cmcd_field_outputs))
|
||||
}
|
||||
|
||||
/// build CmcdProcessor from CmcdProcessorBuilder
|
||||
pub fn build(self, intermediate_keys: &[String]) -> Result<CmcdProcessor, String> {
|
||||
let mut real_fields = vec![];
|
||||
let mut cmcd_outputs = Vec::with_capacity(CMCD_KEYS.len());
|
||||
for field in self.fields.into_iter() {
|
||||
let input_index = find_key_index(intermediate_keys, field.input_field(), "cmcd")?;
|
||||
|
||||
let input_field_info = InputFieldInfo::new(field.input_field(), input_index);
|
||||
|
||||
let (_, cmcd_field_outputs) = Self::build_cmcd_outputs(&field, intermediate_keys)?;
|
||||
|
||||
cmcd_outputs.push(cmcd_field_outputs);
|
||||
|
||||
let real_field = OneInputMultiOutputField::new(input_field_info, field.target_field);
|
||||
real_fields.push(real_field);
|
||||
}
|
||||
Ok(CmcdProcessor {
|
||||
fields: real_fields,
|
||||
cmcd_outputs,
|
||||
ignore_missing: self.ignore_missing,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
impl ProcessorBuilder for CmcdProcessorBuilder {
|
||||
fn output_keys(&self) -> HashSet<&str> {
|
||||
self.output_keys.iter().map(|s| s.as_str()).collect()
|
||||
}
|
||||
|
||||
fn input_keys(&self) -> HashSet<&str> {
|
||||
self.fields.iter().map(|f| f.input_field()).collect()
|
||||
}
|
||||
|
||||
fn build(self, intermediate_keys: &[String]) -> Result<ProcessorKind, String> {
|
||||
self.build(intermediate_keys).map(ProcessorKind::Cmcd)
|
||||
}
|
||||
}
|
||||
|
||||
fn generate_key(prefix: &str, key: &str) -> String {
|
||||
format!("{}_{}", prefix, key)
|
||||
}
|
||||
|
||||
/// CmcdOutputInfo is a struct to store output info
|
||||
#[derive(Debug)]
|
||||
pub(super) struct CmcdOutputInfo {
|
||||
/// {input_field}_{cmcd_key}
|
||||
final_key: String,
|
||||
/// cmcd key
|
||||
key: &'static str,
|
||||
/// index in intermediate_keys
|
||||
index: usize,
|
||||
/// function to resolve value
|
||||
f: fn(&str, &str, Option<&str>) -> Result<Value, String>,
|
||||
}
|
||||
|
||||
impl CmcdOutputInfo {
|
||||
fn new(
|
||||
final_key: String,
|
||||
key: &'static str,
|
||||
index: usize,
|
||||
f: fn(&str, &str, Option<&str>) -> Result<Value, String>,
|
||||
) -> Self {
|
||||
Self {
|
||||
final_key,
|
||||
key,
|
||||
index,
|
||||
f,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for CmcdOutputInfo {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
final_key: String::default(),
|
||||
key: "",
|
||||
index: 0,
|
||||
f: |_, _, _| Ok(Value::Null),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// function to resolve CMCD_KEY_BS | CMCD_KEY_SU
|
||||
fn bs_su(_: &str, _: &str, _: Option<&str>) -> Result<Value, String> {
|
||||
Ok(Value::Boolean(true))
|
||||
}
|
||||
|
||||
/// function to resolve CMCD_KEY_BR | CMCD_KEY_BL | CMCD_KEY_D | CMCD_KEY_DL | CMCD_KEY_MTP | CMCD_KEY_RTP | CMCD_KEY_TB
|
||||
fn br_tb(s: &str, k: &str, v: Option<&str>) -> Result<Value, String> {
|
||||
let v = v.ok_or(format!("{k} missing value in {s}"))?;
|
||||
let val: i64 = v
|
||||
.parse()
|
||||
.map_err(|_| format!("failed to parse {v} as i64"))?;
|
||||
Ok(Value::Int64(val))
|
||||
}
|
||||
|
||||
/// function to resolve CMCD_KEY_CID | CMCD_KEY_NRR | CMCD_KEY_OT | CMCD_KEY_SF | CMCD_KEY_SID | CMCD_KEY_V
|
||||
fn cid_v(s: &str, k: &str, v: Option<&str>) -> Result<Value, String> {
|
||||
let v = v.ok_or(format!("{k} missing value in {s}"))?;
|
||||
Ok(Value::String(v.to_string()))
|
||||
}
|
||||
|
||||
/// function to resolve CMCD_KEY_NOR
|
||||
fn nor(s: &str, k: &str, v: Option<&str>) -> Result<Value, String> {
|
||||
let v = v.ok_or(format!("{k} missing value in {s}"))?;
|
||||
let val = match decode(v) {
|
||||
Ok(val) => val.to_string(),
|
||||
Err(_) => v.to_string(),
|
||||
};
|
||||
Ok(Value::String(val))
|
||||
}
|
||||
|
||||
/// function to resolve CMCD_KEY_PR
|
||||
fn pr(s: &str, k: &str, v: Option<&str>) -> Result<Value, String> {
|
||||
let v = v.ok_or(format!("{k} missing value in {s}"))?;
|
||||
let val: f64 = v
|
||||
.parse()
|
||||
.map_err(|_| format!("failed to parse {v} as f64"))?;
|
||||
Ok(Value::Float64(val))
|
||||
}
|
||||
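A hedged, standalone illustration of the per-key value rules the resolver functions above implement: flag keys become booleans, bitrate-style keys parse as integers, and string keys keep their raw value. The `CmcdValue` enum and `resolve` helper are simplified stand-ins, not the processor's real dispatch table.

```rust
// Illustrative only: typed resolution of a few CMCD keys.
#[derive(Debug, PartialEq)]
enum CmcdValue {
    Bool(bool),
    Int(i64),
    Str(String),
}

fn resolve(key: &str, value: Option<&str>) -> Result<CmcdValue, String> {
    match key {
        // Flag keys (e.g. bs, su) carry no value and resolve to `true`.
        "bs" | "su" => Ok(CmcdValue::Bool(true)),
        // Numeric keys (e.g. br, d, rtp, tb) must parse as i64.
        "br" | "bl" | "d" | "dl" | "mtp" | "rtp" | "tb" => {
            let v = value.ok_or(format!("{key} missing value"))?;
            v.parse::<i64>()
                .map(CmcdValue::Int)
                .map_err(|_| format!("failed to parse {v} as i64"))
        }
        // Everything else is kept as a string.
        _ => {
            let v = value.ok_or(format!("{key} missing value"))?;
            Ok(CmcdValue::Str(v.to_string()))
        }
    }
}

fn main() {
    assert_eq!(resolve("bs", None), Ok(CmcdValue::Bool(true)));
    assert_eq!(resolve("br", Some("3200")), Ok(CmcdValue::Int(3200)));
    assert_eq!(
        resolve("sid", Some("\"6e2fb550\"")),
        Ok(CmcdValue::Str("\"6e2fb550\"".to_string()))
    );
}
```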
|
||||
/// Common Media Client Data Specification:
|
||||
/// https://cdn.cta.tech/cta/media/media/resources/standards/pdfs/cta-5004-final.pdf
|
||||
///
|
||||
@@ -100,98 +276,43 @@ const CMCD_KEYS: [&str; 18] = [
|
||||
/// 12. Transport Layer Security SHOULD be used to protect all transmission of CMCD data.
|
||||
#[derive(Debug, Default)]
|
||||
pub struct CmcdProcessor {
|
||||
fields: Fields,
|
||||
fields: Vec<OneInputMultiOutputField>,
|
||||
cmcd_outputs: Vec<Vec<CmcdOutputInfo>>,
|
||||
|
||||
ignore_missing: bool,
|
||||
}
|
||||
|
||||
impl CmcdProcessor {
|
||||
fn with_fields(&mut self, mut fields: Fields) {
|
||||
Self::update_output_keys(&mut fields);
|
||||
self.fields = fields;
|
||||
}
|
||||
|
||||
fn with_ignore_missing(&mut self, ignore_missing: bool) {
|
||||
self.ignore_missing = ignore_missing;
|
||||
}
|
||||
|
||||
fn generate_key(prefix: &str, key: &str) -> String {
|
||||
format!("{}_{}", prefix, key)
|
||||
}
|
||||
|
||||
fn parse(prefix: &str, s: &str) -> Result<Map, String> {
|
||||
let mut map = Map::default();
|
||||
fn parse(&self, field_index: usize, s: &str) -> Result<Vec<(usize, Value)>, String> {
|
||||
let parts = s.split(',');
|
||||
let mut result = Vec::new();
|
||||
for part in parts {
|
||||
let mut kv = part.split('=');
|
||||
let k = kv.next().ok_or(format!("{part} missing key in {s}"))?;
|
||||
let v = kv.next();
|
||||
|
||||
let key = Self::generate_key(prefix, k);
|
||||
match k {
|
||||
CMCD_KEY_BS | CMCD_KEY_SU => {
|
||||
map.insert(key, Value::Boolean(true));
|
||||
for cmcd_key in self.cmcd_outputs[field_index].iter() {
|
||||
if cmcd_key.key == k {
|
||||
let val = (cmcd_key.f)(s, k, v)?;
|
||||
result.push((cmcd_key.index, val));
|
||||
}
|
||||
CMCD_KEY_BR | CMCD_KEY_BL | CMCD_KEY_D | CMCD_KEY_DL | CMCD_KEY_MTP
|
||||
| CMCD_KEY_RTP | CMCD_KEY_TB => {
|
||||
let v = v.ok_or(format!("{k} missing value in {s}"))?;
|
||||
let val: i64 = v
|
||||
.parse()
|
||||
.map_err(|_| format!("failed to parse {v} as i64"))?;
|
||||
map.insert(key, Value::Int64(val));
|
||||
}
|
||||
CMCD_KEY_CID | CMCD_KEY_NRR | CMCD_KEY_OT | CMCD_KEY_SF | CMCD_KEY_SID
|
||||
| CMCD_KEY_ST | CMCD_KEY_V => {
|
||||
let v = v.ok_or(format!("{k} missing value in {s}"))?;
|
||||
map.insert(key, Value::String(v.to_string()));
|
||||
}
|
||||
CMCD_KEY_NOR => {
|
||||
let v = v.ok_or(format!("{k} missing value in {s}"))?;
|
||||
let val = match decode(v) {
|
||||
Ok(val) => val.to_string(),
|
||||
Err(_) => v.to_string(),
|
||||
};
|
||||
map.insert(key, Value::String(val));
|
||||
}
|
||||
CMCD_KEY_PR => {
|
||||
let v = v.ok_or(format!("{k} missing value in {s}"))?;
|
||||
let val: f64 = v
|
||||
.parse()
|
||||
.map_err(|_| format!("failed to parse {v} as f64"))?;
|
||||
map.insert(key, Value::Float64(val));
|
||||
}
|
||||
_ => match v {
|
||||
Some(v) => map.insert(key, Value::String(v.to_string())),
|
||||
None => map.insert(k, Value::Boolean(true)),
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
Ok(map)
|
||||
}
|
||||
|
||||
fn process_field(&self, val: &str, field: &Field) -> Result<Map, String> {
|
||||
let prefix = field.get_target_field();
|
||||
|
||||
Self::parse(prefix, val)
|
||||
}
|
||||
|
||||
fn update_output_keys(fields: &mut Fields) {
|
||||
for field in fields.iter_mut() {
|
||||
for key in CMCD_KEYS.iter() {
|
||||
field
|
||||
.output_fields_index_mapping
|
||||
.insert(Self::generate_key(field.get_target_field(), key), 0);
|
||||
}
|
||||
}
|
||||
Ok(result)
|
||||
}
|
||||
}
|
||||
|
||||
impl TryFrom<&yaml_rust::yaml::Hash> for CmcdProcessor {
|
||||
impl TryFrom<&yaml_rust::yaml::Hash> for CmcdProcessorBuilder {
|
||||
type Error = String;
|
||||
|
||||
fn try_from(value: &yaml_rust::yaml::Hash) -> Result<Self, Self::Error> {
|
||||
let mut processor = CmcdProcessor::default();
|
||||
let mut fields = Fields::default();
|
||||
let mut ignore_missing = false;
|
||||
|
||||
for (k, v) in value.iter() {
|
||||
let key = k
|
||||
@@ -199,25 +320,40 @@ impl TryFrom<&yaml_rust::yaml::Hash> for CmcdProcessor {
|
||||
.ok_or(format!("key must be a string, but got {k:?}"))?;
|
||||
match key {
|
||||
FIELD_NAME => {
|
||||
processor.with_fields(Fields::one(yaml_field(v, FIELD_NAME)?));
|
||||
fields = Fields::one(yaml_new_field(v, FIELD_NAME)?);
|
||||
}
|
||||
FIELDS_NAME => {
|
||||
processor.with_fields(yaml_fields(v, FIELDS_NAME)?);
|
||||
fields = yaml_new_fields(v, FIELDS_NAME)?;
|
||||
}
|
||||
|
||||
IGNORE_MISSING_NAME => {
|
||||
processor.with_ignore_missing(yaml_bool(v, IGNORE_MISSING_NAME)?);
|
||||
ignore_missing = yaml_bool(v, IGNORE_MISSING_NAME)?;
|
||||
}
|
||||
|
||||
_ => {}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(processor)
|
||||
let output_keys = fields
|
||||
.iter()
|
||||
.flat_map(|f| {
|
||||
CMCD_KEYS
|
||||
.iter()
|
||||
.map(|cmcd_key| generate_key(f.target_or_input_field(), cmcd_key))
|
||||
})
|
||||
.collect();
|
||||
|
||||
let builder = CmcdProcessorBuilder {
|
||||
fields,
|
||||
output_keys,
|
||||
ignore_missing,
|
||||
};
|
||||
|
||||
Ok(builder)
|
||||
}
|
||||
}
|
||||
|
||||
impl crate::etl::processor::Processor for CmcdProcessor {
|
||||
impl Processor for CmcdProcessor {
|
||||
fn kind(&self) -> &str {
|
||||
PROCESSOR_CMCD
|
||||
}
|
||||
@@ -226,51 +362,14 @@ impl crate::etl::processor::Processor for CmcdProcessor {
|
||||
self.ignore_missing
|
||||
}
|
||||
|
||||
fn fields(&self) -> &Fields {
|
||||
&self.fields
|
||||
}
|
||||
|
||||
fn fields_mut(&mut self) -> &mut Fields {
|
||||
&mut self.fields
|
||||
}
|
||||
|
||||
fn output_keys(&self) -> HashSet<String> {
|
||||
self.fields
|
||||
.iter()
|
||||
.map(|field| {
|
||||
field
|
||||
.target_field
|
||||
.clone()
|
||||
.unwrap_or_else(|| field.get_field_name().to_string())
|
||||
})
|
||||
.flat_map(|keys| {
|
||||
CMCD_KEYS
|
||||
.iter()
|
||||
.map(move |key| format!("{}_{}", keys, *key))
|
||||
})
|
||||
.collect()
|
||||
}
|
||||
|
||||
fn exec_field(&self, val: &Value, field: &Field) -> Result<Map, String> {
|
||||
match val {
|
||||
Value::String(val) => self.process_field(val, field),
|
||||
_ => Err(format!(
|
||||
"{} processor: expect string value, but got {val:?}",
|
||||
self.kind()
|
||||
)),
|
||||
}
|
||||
}
|
||||
|
||||
fn exec_mut(&self, val: &mut Vec<Value>) -> Result<(), String> {
|
||||
for field in self.fields.iter() {
|
||||
match val.get(field.input_field.index) {
|
||||
for (field_index, field) in self.fields.iter().enumerate() {
|
||||
let field_value_index = field.input_index();
|
||||
match val.get(field_value_index) {
|
||||
Some(Value::String(v)) => {
|
||||
// TODO(qtang): Let this method use the intermediate state collection directly.
|
||||
let map = self.process_field(v, field)?;
|
||||
for (k, v) in map.values.into_iter() {
|
||||
if let Some(index) = field.output_fields_index_mapping.get(&k) {
|
||||
val[*index] = v;
|
||||
}
|
||||
let result_list = self.parse(field_index, v)?;
|
||||
for (output_index, v) in result_list {
|
||||
val[output_index] = v;
|
||||
}
|
||||
}
|
||||
Some(Value::Null) | None => {
|
||||
@@ -278,7 +377,7 @@ impl crate::etl::processor::Processor for CmcdProcessor {
|
||||
return Err(format!(
|
||||
"{} processor: missing field: {}",
|
||||
self.kind(),
|
||||
field.get_field_name()
|
||||
field.input_name()
|
||||
));
|
||||
}
|
||||
}
|
||||
@@ -299,7 +398,8 @@ mod tests {
|
||||
use ahash::HashMap;
|
||||
use urlencoding::decode;
|
||||
|
||||
use super::CmcdProcessor;
|
||||
use super::{CmcdProcessorBuilder, CMCD_KEYS};
|
||||
use crate::etl::field::{Field, Fields};
|
||||
use crate::etl::value::{Map, Value};
|
||||
|
||||
#[test]
|
||||
@@ -329,6 +429,7 @@ mod tests {
|
||||
],
|
||||
),
|
||||
(
|
||||
// we do not resolve the `b` key
|
||||
"b%2Crtp%3D15000%2Csid%3D%226e2fb550-c457-11e9-bb97-0800200c9a66%22",
|
||||
vec![
|
||||
(
|
||||
@@ -336,7 +437,6 @@ mod tests {
|
||||
Value::String("\"6e2fb550-c457-11e9-bb97-0800200c9a66\"".into()),
|
||||
),
|
||||
("prefix_rtp", Value::Int64(15000)),
|
||||
("b", Value::Boolean(true)),
|
||||
],
|
||||
),
|
||||
(
|
||||
@@ -347,16 +447,17 @@ mod tests {
|
||||
],
|
||||
),
|
||||
(
|
||||
// we do not resolve custom keys
|
||||
"d%3D4004%2Ccom.example-myNumericKey%3D500%2Ccom.examplemyStringKey%3D%22myStringValue%22",
|
||||
vec![
|
||||
(
|
||||
"prefix_com.example-myNumericKey",
|
||||
Value::String("500".into()),
|
||||
),
|
||||
(
|
||||
"prefix_com.examplemyStringKey",
|
||||
Value::String("\"myStringValue\"".into()),
|
||||
),
|
||||
// (
|
||||
// "prefix_com.example-myNumericKey",
|
||||
// Value::String("500".into()),
|
||||
// ),
|
||||
// (
|
||||
// "prefix_com.examplemyStringKey",
|
||||
// Value::String("\"myStringValue\"".into()),
|
||||
// ),
|
||||
("prefix_d", Value::Int64(4004)),
|
||||
],
|
||||
),
|
||||
@@ -431,6 +532,24 @@ mod tests {
|
||||
),
|
||||
];
|
||||
|
||||
let field = Field::new("prefix", None);
|
||||
|
||||
let output_keys = CMCD_KEYS
|
||||
.iter()
|
||||
.map(|k| format!("prefix_{}", k))
|
||||
.collect::<Vec<String>>();
|
||||
|
||||
let mut intermediate_keys = vec!["prefix".to_string()];
|
||||
intermediate_keys.append(&mut (output_keys.clone()));
|
||||
|
||||
let builder = CmcdProcessorBuilder {
|
||||
fields: Fields::new(vec![field]),
|
||||
output_keys: output_keys.iter().map(|s| s.to_string()).collect(),
|
||||
ignore_missing: false,
|
||||
};
|
||||
|
||||
let processor = builder.build(&intermediate_keys).unwrap();
|
||||
|
||||
for (s, vec) in ss.into_iter() {
|
||||
let decoded = decode(s).unwrap().to_string();
|
||||
|
||||
@@ -440,7 +559,12 @@ mod tests {
|
||||
.collect::<HashMap<String, Value>>();
|
||||
let expected = Map { values };
|
||||
|
||||
let actual = CmcdProcessor::parse("prefix", &decoded).unwrap();
|
||||
let actual = processor.parse(0, &decoded).unwrap();
|
||||
let actual = actual
|
||||
.into_iter()
|
||||
.map(|(index, value)| (intermediate_keys[index].clone(), value))
|
||||
.collect::<HashMap<String, Value>>();
|
||||
let actual = Map { values: actual };
|
||||
assert_eq!(actual, expected);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -14,17 +14,18 @@
|
||||
|
||||
// Reference: https://www.elastic.co/guide/en/elasticsearch/reference/current/csv-processor.html
|
||||
|
||||
use ahash::{HashMap, HashSet};
|
||||
use ahash::HashSet;
|
||||
use csv::{ReaderBuilder, Trim};
|
||||
use itertools::EitherOrBoth::{Both, Left, Right};
|
||||
use itertools::Itertools;
|
||||
|
||||
use crate::etl::field::{Field, Fields};
|
||||
use crate::etl::field::{Fields, InputFieldInfo, OneInputMultiOutputField};
|
||||
use crate::etl::find_key_index;
|
||||
use crate::etl::processor::{
|
||||
yaml_bool, yaml_field, yaml_fields, yaml_string, Processor, FIELDS_NAME, FIELD_NAME,
|
||||
IGNORE_MISSING_NAME,
|
||||
yaml_bool, yaml_new_field, yaml_new_fields, yaml_string, Processor, ProcessorBuilder,
|
||||
ProcessorKind, FIELDS_NAME, FIELD_NAME, IGNORE_MISSING_NAME,
|
||||
};
|
||||
use crate::etl::value::{Map, Value};
|
||||
use crate::etl::value::Value;
|
||||
|
||||
pub(crate) const PROCESSOR_CSV: &str = "csv";
|
||||
|
||||
@@ -32,18 +33,78 @@ const SEPARATOR_NAME: &str = "separator";
|
||||
const QUOTE_NAME: &str = "quote";
|
||||
const TRIM_NAME: &str = "trim";
|
||||
const EMPTY_VALUE_NAME: &str = "empty_value";
|
||||
const TARGET_FIELDS: &str = "target_fields";
|
||||
|
||||
#[derive(Debug, Default)]
|
||||
pub struct CsvProcessorBuilder {
|
||||
reader: ReaderBuilder,
|
||||
|
||||
fields: Fields,
|
||||
ignore_missing: bool,
|
||||
|
||||
// Value used to fill empty fields; empty fields will be skipped if this is not provided.
|
||||
empty_value: Option<String>,
|
||||
target_fields: Vec<String>,
|
||||
// description
|
||||
// if
|
||||
// ignore_failure
|
||||
// on_failure
|
||||
// tag
|
||||
}
|
||||
|
||||
impl CsvProcessorBuilder {
|
||||
fn build(self, intermediate_keys: &[String]) -> Result<CsvProcessor, String> {
|
||||
let mut real_fields = vec![];
|
||||
|
||||
for field in self.fields {
|
||||
let input_index = find_key_index(intermediate_keys, field.input_field(), "csv")?;
|
||||
|
||||
let input_field_info = InputFieldInfo::new(field.input_field(), input_index);
|
||||
let real_field = OneInputMultiOutputField::new(input_field_info, None);
|
||||
real_fields.push(real_field);
|
||||
}
|
||||
|
||||
let output_index_info = self
|
||||
.target_fields
|
||||
.iter()
|
||||
.map(|f| find_key_index(intermediate_keys, f, "csv"))
|
||||
.collect::<Result<Vec<_>, String>>()?;
|
||||
Ok(CsvProcessor {
|
||||
reader: self.reader,
|
||||
fields: real_fields,
|
||||
ignore_missing: self.ignore_missing,
|
||||
empty_value: self.empty_value,
|
||||
output_index_info,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
impl ProcessorBuilder for CsvProcessorBuilder {
|
||||
fn output_keys(&self) -> HashSet<&str> {
|
||||
self.target_fields.iter().map(|s| s.as_str()).collect()
|
||||
}
|
||||
|
||||
fn input_keys(&self) -> HashSet<&str> {
|
||||
self.fields.iter().map(|f| f.input_field()).collect()
|
||||
}
|
||||
|
||||
fn build(self, intermediate_keys: &[String]) -> Result<ProcessorKind, String> {
|
||||
self.build(intermediate_keys).map(ProcessorKind::Csv)
|
||||
}
|
||||
}
|
||||
|
||||
/// only support string value
|
||||
#[derive(Debug)]
|
||||
pub struct CsvProcessor {
|
||||
reader: ReaderBuilder,
|
||||
|
||||
fields: Fields,
|
||||
fields: Vec<OneInputMultiOutputField>,
|
||||
|
||||
ignore_missing: bool,
|
||||
|
||||
// Value used to fill empty fields, empty fields will be skipped if this is not provided.
|
||||
empty_value: Option<String>,
|
||||
output_index_info: Vec<usize>,
|
||||
// description
|
||||
// if
|
||||
// ignore_failure
|
||||
@@ -52,81 +113,19 @@ pub struct CsvProcessor {
|
||||
}
|
||||
|
||||
impl CsvProcessor {
|
||||
fn new() -> Self {
|
||||
let mut reader = ReaderBuilder::new();
|
||||
reader.has_headers(false);
|
||||
|
||||
Self {
|
||||
reader,
|
||||
fields: Fields::default(),
|
||||
ignore_missing: false,
|
||||
empty_value: None,
|
||||
}
|
||||
}
|
||||
|
||||
fn with_fields(&mut self, fields: Fields) {
|
||||
self.fields = fields;
|
||||
}
|
||||
|
||||
fn try_separator(&mut self, separator: String) -> Result<(), String> {
|
||||
if separator.len() != 1 {
|
||||
Err(format!(
|
||||
"'{}' must be a single character, but got '{}'",
|
||||
SEPARATOR_NAME, separator
|
||||
))
|
||||
} else {
|
||||
self.reader.delimiter(separator.as_bytes()[0]);
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
fn try_quote(&mut self, quote: String) -> Result<(), String> {
|
||||
if quote.len() != 1 {
|
||||
Err(format!(
|
||||
"'{}' must be a single character, but got '{}'",
|
||||
QUOTE_NAME, quote
|
||||
))
|
||||
} else {
|
||||
self.reader.quote(quote.as_bytes()[0]);
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
fn with_trim(&mut self, trim: bool) {
|
||||
if trim {
|
||||
self.reader.trim(Trim::All);
|
||||
} else {
|
||||
self.reader.trim(Trim::None);
|
||||
}
|
||||
}
|
||||
|
||||
fn with_ignore_missing(&mut self, ignore_missing: bool) {
|
||||
self.ignore_missing = ignore_missing;
|
||||
}
|
||||
|
||||
fn with_empty_value(&mut self, empty_value: String) {
|
||||
self.empty_value = Some(empty_value);
|
||||
}
|
||||
|
||||
// process the csv format string to a map with target_fields as keys
|
||||
fn process_field(&self, val: &str, field: &Field) -> Result<Map, String> {
|
||||
fn process(&self, val: &str) -> Result<Vec<(usize, Value)>, String> {
|
||||
let mut reader = self.reader.from_reader(val.as_bytes());
|
||||
|
||||
if let Some(result) = reader.records().next() {
|
||||
let record: csv::StringRecord = result.map_err(|e| e.to_string())?;
|
||||
|
||||
let values: HashMap<String, Value> = field
|
||||
.target_fields
|
||||
.as_ref()
|
||||
.ok_or(format!(
|
||||
"target fields must be set after '{}'",
|
||||
field.get_field_name()
|
||||
))?
|
||||
let values: Vec<(usize, Value)> = self
|
||||
.output_index_info
|
||||
.iter()
|
||||
.map(|f| f.to_string())
|
||||
.zip_longest(record.iter())
|
||||
.filter_map(|zipped| match zipped {
|
||||
Both(target_field, val) => Some((target_field, Value::String(val.into()))),
|
||||
Both(target_field, val) => Some((*target_field, Value::String(val.into()))),
|
||||
// if target fields are more than extracted fields, fill the rest with empty value
|
||||
Left(target_field) => {
|
||||
let value = self
|
||||
@@ -134,69 +133,101 @@ impl CsvProcessor {
|
||||
.as_ref()
|
||||
.map(|s| Value::String(s.clone()))
|
||||
.unwrap_or(Value::Null);
|
||||
Some((target_field, value))
|
||||
Some((*target_field, value))
|
||||
}
|
||||
// if extracted fields are more than target fields, ignore the rest
|
||||
Right(_) => None,
|
||||
})
|
||||
.collect();
|
||||
|
||||
Ok(Map { values })
|
||||
Ok(values)
|
||||
} else {
|
||||
Err("expected at least one record from csv format, but got none".into())
|
||||
}
|
||||
}
|
||||
|
||||
fn update_output_keys(&mut self) {
|
||||
self.fields.iter_mut().for_each(|f| {
|
||||
if let Some(tfs) = f.target_fields.as_ref() {
|
||||
tfs.iter().for_each(|tf| {
|
||||
if !tf.is_empty() {
|
||||
f.output_fields_index_mapping.insert(tf.to_string(), 0);
|
||||
}
|
||||
});
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
impl TryFrom<&yaml_rust::yaml::Hash> for CsvProcessor {
|
||||
impl TryFrom<&yaml_rust::yaml::Hash> for CsvProcessorBuilder {
|
||||
type Error = String;
|
||||
|
||||
fn try_from(hash: &yaml_rust::yaml::Hash) -> Result<Self, Self::Error> {
|
||||
let mut processor = CsvProcessor::new();
|
||||
let mut reader = ReaderBuilder::new();
|
||||
reader.has_headers(false);
|
||||
|
||||
let mut fields = Fields::default();
|
||||
let mut ignore_missing = false;
|
||||
let mut empty_value = None;
|
||||
let mut target_fields = vec![];
|
||||
|
||||
for (k, v) in hash {
|
||||
let key = k
|
||||
.as_str()
|
||||
.ok_or(format!("key must be a string, but got {k:?}"))?;
|
||||
match key {
|
||||
FIELD_NAME => {
|
||||
processor.with_fields(Fields::one(yaml_field(v, FIELD_NAME)?));
|
||||
fields = Fields::one(yaml_new_field(v, FIELD_NAME)?);
|
||||
}
|
||||
FIELDS_NAME => {
|
||||
processor.with_fields(yaml_fields(v, FIELDS_NAME)?);
|
||||
fields = yaml_new_fields(v, FIELDS_NAME)?;
|
||||
}
|
||||
TARGET_FIELDS => {
|
||||
target_fields = yaml_string(v, TARGET_FIELDS)?
|
||||
.split(',')
|
||||
.map(|s| s.trim().to_string())
|
||||
.filter(|s| !s.is_empty())
|
||||
.collect();
|
||||
}
|
||||
SEPARATOR_NAME => {
|
||||
processor.try_separator(yaml_string(v, SEPARATOR_NAME)?)?;
|
||||
let separator = yaml_string(v, SEPARATOR_NAME)?;
|
||||
if separator.len() != 1 {
|
||||
return Err(format!(
|
||||
"'{}' must be a single character, but got '{}'",
|
||||
SEPARATOR_NAME, separator
|
||||
));
|
||||
} else {
|
||||
reader.delimiter(separator.as_bytes()[0]);
|
||||
}
|
||||
}
|
||||
QUOTE_NAME => {
|
||||
processor.try_quote(yaml_string(v, QUOTE_NAME)?)?;
|
||||
let quote = yaml_string(v, QUOTE_NAME)?;
|
||||
if quote.len() != 1 {
|
||||
return Err(format!(
|
||||
"'{}' must be a single character, but got '{}'",
|
||||
QUOTE_NAME, quote
|
||||
));
|
||||
} else {
|
||||
reader.quote(quote.as_bytes()[0]);
|
||||
}
|
||||
}
|
||||
TRIM_NAME => {
|
||||
processor.with_trim(yaml_bool(v, TRIM_NAME)?);
|
||||
let trim = yaml_bool(v, TRIM_NAME)?;
|
||||
if trim {
|
||||
reader.trim(Trim::All);
|
||||
} else {
|
||||
reader.trim(Trim::None);
|
||||
}
|
||||
}
|
||||
IGNORE_MISSING_NAME => {
|
||||
processor.with_ignore_missing(yaml_bool(v, IGNORE_MISSING_NAME)?);
|
||||
ignore_missing = yaml_bool(v, IGNORE_MISSING_NAME)?;
|
||||
}
|
||||
EMPTY_VALUE_NAME => {
|
||||
processor.with_empty_value(yaml_string(v, EMPTY_VALUE_NAME)?);
|
||||
empty_value = Some(yaml_string(v, EMPTY_VALUE_NAME)?);
|
||||
}
|
||||
|
||||
_ => {}
|
||||
}
|
||||
}
|
||||
processor.update_output_keys();
|
||||
Ok(processor)
|
||||
let builder = {
|
||||
CsvProcessorBuilder {
|
||||
reader,
|
||||
fields,
|
||||
ignore_missing,
|
||||
empty_value,
|
||||
target_fields,
|
||||
}
|
||||
};
|
||||
|
||||
Ok(builder)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -209,41 +240,14 @@ impl Processor for CsvProcessor {
|
||||
self.ignore_missing
|
||||
}
|
||||
|
||||
fn fields(&self) -> &Fields {
|
||||
&self.fields
|
||||
}
|
||||
|
||||
fn fields_mut(&mut self) -> &mut Fields {
|
||||
&mut self.fields
|
||||
}
|
||||
|
||||
fn output_keys(&self) -> HashSet<String> {
|
||||
self.fields
|
||||
.iter()
|
||||
.flat_map(|f| f.target_fields.clone().unwrap_or_default())
|
||||
.collect()
|
||||
}
|
||||
|
||||
fn exec_field(&self, val: &Value, field: &Field) -> Result<Map, String> {
|
||||
match val {
|
||||
Value::String(val) => self.process_field(val, field),
|
||||
_ => Err(format!(
|
||||
"{} processor: expect string value, but got {val:?}",
|
||||
self.kind()
|
||||
)),
|
||||
}
|
||||
}
|
||||
|
||||
fn exec_mut(&self, val: &mut Vec<Value>) -> Result<(), String> {
|
||||
for field in self.fields.iter() {
|
||||
match val.get(field.input_field.index) {
|
||||
let index = field.input_index();
|
||||
match val.get(index) {
|
||||
Some(Value::String(v)) => {
|
||||
// TODO(qtang): Let this method use the intermediate state collection directly.
|
||||
let map = self.process_field(v, field)?;
|
||||
for (k, v) in map.values.into_iter() {
|
||||
if let Some(index) = field.output_fields_index_mapping.get(&k) {
|
||||
val[*index] = v;
|
||||
}
|
||||
let resule_list = self.process(v)?;
|
||||
for (k, v) in resule_list {
|
||||
val[k] = v;
|
||||
}
|
||||
}
|
||||
Some(Value::Null) | None => {
|
||||
@@ -251,7 +255,7 @@ impl Processor for CsvProcessor {
|
||||
return Err(format!(
|
||||
"{} processor: missing field: {}",
|
||||
self.kind(),
|
||||
field.get_field_name()
|
||||
field.input_name()
|
||||
));
|
||||
}
|
||||
}
|
||||
@@ -267,116 +271,140 @@ impl Processor for CsvProcessor {
|
||||
}
|
||||
}
|
||||
|
||||
// TODO(yuanbohan): more test cases
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
|
||||
use ahash::HashMap;
|
||||
|
||||
use super::{CsvProcessor, Value};
|
||||
use crate::etl::field::Fields;
|
||||
use crate::etl::processor::Processor;
|
||||
use crate::etl::value::Map;
|
||||
use super::Value;
|
||||
use crate::etl::processor::csv::CsvProcessorBuilder;
|
||||
|
||||
#[test]
|
||||
fn test_equal_length() {
|
||||
let mut processor = CsvProcessor::new();
|
||||
let field = "data,, a, b".parse().unwrap();
|
||||
processor.with_fields(Fields::one(field));
|
||||
let mut reader = csv::ReaderBuilder::new();
|
||||
reader.has_headers(false);
|
||||
let builder = CsvProcessorBuilder {
|
||||
reader,
|
||||
target_fields: vec!["a".into(), "b".into()],
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
let values: HashMap<String, Value> = [("data".into(), Value::String("1,2".into()))]
|
||||
let intermediate_keys = vec!["data".into(), "a".into(), "b".into()];
|
||||
|
||||
let processor = builder.build(&intermediate_keys).unwrap();
|
||||
let result = processor
|
||||
.process("1,2")
|
||||
.unwrap()
|
||||
.into_iter()
|
||||
.collect();
|
||||
let mut m = Map { values };
|
||||
|
||||
processor.exec_map(&mut m).unwrap();
|
||||
.map(|(k, v)| (intermediate_keys[k].clone(), v))
|
||||
.collect::<HashMap<_, _>>();
|
||||
|
||||
let values = [
|
||||
("data".into(), Value::String("1,2".into())),
|
||||
("a".into(), Value::String("1".into())),
|
||||
("b".into(), Value::String("2".into())),
|
||||
]
|
||||
.into_iter()
|
||||
.collect();
|
||||
let expected = Map { values };
|
||||
.collect::<HashMap<_, _>>();
|
||||
|
||||
assert_eq!(expected, m);
|
||||
assert_eq!(result, values);
|
||||
}
|
||||
|
||||
// test target_fields length larger than the record length
|
||||
#[test]
|
||||
fn test_target_fields_has_more_length() {
|
||||
let values = [("data".into(), Value::String("1,2".into()))]
|
||||
.into_iter()
|
||||
.collect();
|
||||
let mut input = Map { values };
|
||||
|
||||
// with no empty value
|
||||
{
|
||||
let mut processor = CsvProcessor::new();
|
||||
let field = "data,, a,b,c".parse().unwrap();
|
||||
processor.with_fields(Fields::one(field));
|
||||
let mut reader = csv::ReaderBuilder::new();
|
||||
reader.has_headers(false);
|
||||
let builder = CsvProcessorBuilder {
|
||||
reader,
|
||||
target_fields: vec!["a".into(), "b".into(), "c".into()],
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
processor.exec_map(&mut input).unwrap();
|
||||
let intermediate_keys = vec!["data".into(), "a".into(), "b".into(), "c".into()];
|
||||
|
||||
let processor = builder.build(&intermediate_keys).unwrap();
|
||||
let result = processor
|
||||
.process("1,2")
|
||||
.unwrap()
|
||||
.into_iter()
|
||||
.map(|(k, v)| (intermediate_keys[k].clone(), v))
|
||||
.collect::<HashMap<_, _>>();
|
||||
|
||||
let values = [
|
||||
("data".into(), Value::String("1,2".into())),
|
||||
("a".into(), Value::String("1".into())),
|
||||
("b".into(), Value::String("2".into())),
|
||||
("c".into(), Value::Null),
|
||||
]
|
||||
.into_iter()
|
||||
.collect();
|
||||
let expected = Map { values };
|
||||
.collect::<HashMap<_, _>>();
|
||||
|
||||
assert_eq!(expected, input);
|
||||
assert_eq!(result, values);
|
||||
}
|
||||
|
||||
// with empty value
|
||||
{
|
||||
let mut processor = CsvProcessor::new();
|
||||
let field = "data,, a,b,c".parse().unwrap();
|
||||
processor.with_fields(Fields::one(field));
|
||||
processor.with_empty_value("default".into());
|
||||
let mut reader = csv::ReaderBuilder::new();
|
||||
reader.has_headers(false);
|
||||
let builder = CsvProcessorBuilder {
|
||||
reader,
|
||||
target_fields: vec!["a".into(), "b".into(), "c".into()],
|
||||
empty_value: Some("default".into()),
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
processor.exec_map(&mut input).unwrap();
|
||||
let intermediate_keys = vec!["data".into(), "a".into(), "b".into(), "c".into()];
|
||||
|
||||
let processor = builder.build(&intermediate_keys).unwrap();
|
||||
let result = processor
|
||||
.process("1,2")
|
||||
.unwrap()
|
||||
.into_iter()
|
||||
.map(|(k, v)| (intermediate_keys[k].clone(), v))
|
||||
.collect::<HashMap<_, _>>();
|
||||
|
||||
let values = [
|
||||
("data".into(), Value::String("1,2".into())),
|
||||
("a".into(), Value::String("1".into())),
|
||||
("b".into(), Value::String("2".into())),
|
||||
("c".into(), Value::String("default".into())),
|
||||
]
|
||||
.into_iter()
|
||||
.collect();
|
||||
let expected = Map { values };
|
||||
|
||||
assert_eq!(expected, input);
|
||||
assert_eq!(result, values);
|
||||
}
|
||||
}
|
||||
|
||||
// test record has larger length
|
||||
#[test]
|
||||
fn test_target_fields_has_less_length() {
|
||||
let values = [("data".into(), Value::String("1,2,3".into()))]
|
||||
let mut reader = csv::ReaderBuilder::new();
|
||||
reader.has_headers(false);
|
||||
let builder = CsvProcessorBuilder {
|
||||
reader,
|
||||
target_fields: vec!["a".into(), "b".into()],
|
||||
empty_value: Some("default".into()),
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
let intermediate_keys = vec!["data".into(), "a".into(), "b".into()];
|
||||
|
||||
let processor = builder.build(&intermediate_keys).unwrap();
|
||||
let result = processor
|
||||
.process("1,2")
|
||||
.unwrap()
|
||||
.into_iter()
|
||||
.collect();
|
||||
let mut input = Map { values };
|
||||
|
||||
let mut processor = CsvProcessor::new();
|
||||
let field = "data,,a,b".parse().unwrap();
|
||||
processor.with_fields(Fields::one(field));
|
||||
|
||||
processor.exec_map(&mut input).unwrap();
|
||||
.map(|(k, v)| (intermediate_keys[k].clone(), v))
|
||||
.collect::<HashMap<_, _>>();
|
||||
|
||||
let values = [
|
||||
("data".into(), Value::String("1,2,3".into())),
|
||||
("a".into(), Value::String("1".into())),
|
||||
("b".into(), Value::String("2".into())),
|
||||
]
|
||||
.into_iter()
|
||||
.collect();
|
||||
let expected = Map { values };
|
||||
|
||||
assert_eq!(expected, input);
|
||||
assert_eq!(result, values);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -19,12 +19,12 @@ use chrono::{DateTime, NaiveDateTime};
use chrono_tz::Tz;
use lazy_static::lazy_static;

use crate::etl::field::{Field, Fields};
use crate::etl::field::{Fields, OneInputOneOutputField};
use crate::etl::processor::{
    update_one_one_output_keys, yaml_bool, yaml_field, yaml_fields, yaml_string, yaml_strings,
    Processor, FIELDS_NAME, FIELD_NAME, IGNORE_MISSING_NAME,
    yaml_bool, yaml_new_field, yaml_new_fields, yaml_string, yaml_strings, Processor,
    ProcessorBuilder, ProcessorKind, FIELDS_NAME, FIELD_NAME, IGNORE_MISSING_NAME,
};
use crate::etl::value::{Map, Timestamp, Value};
use crate::etl::value::{Timestamp, Value};

pub(crate) const PROCESSOR_DATE: &str = "date";

@@ -57,9 +57,15 @@ lazy_static! {
|
||||
.collect();
|
||||
}
|
||||
|
||||
#[derive(Debug, Default)]
|
||||
#[derive(Debug)]
|
||||
struct Formats(Vec<Arc<String>>);
|
||||
|
||||
impl Default for Formats {
|
||||
fn default() -> Self {
|
||||
Formats(DEFAULT_FORMATS.clone())
|
||||
}
|
||||
}
|
||||
|
||||
impl Formats {
|
||||
fn new(mut formats: Vec<Arc<String>>) -> Self {
|
||||
formats.sort();
|
||||
@@ -76,16 +82,119 @@ impl std::ops::Deref for Formats {
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Default)]
|
||||
pub struct DateProcessorBuilder {
|
||||
fields: Fields,
|
||||
formats: Formats,
|
||||
timezone: Option<Arc<String>>,
|
||||
locale: Option<Arc<String>>,
|
||||
ignore_missing: bool,
|
||||
}
|
||||
|
||||
impl ProcessorBuilder for DateProcessorBuilder {
|
||||
fn output_keys(&self) -> HashSet<&str> {
|
||||
self.fields
|
||||
.iter()
|
||||
.map(|f| f.target_or_input_field())
|
||||
.collect()
|
||||
}
|
||||
|
||||
fn input_keys(&self) -> HashSet<&str> {
|
||||
self.fields.iter().map(|f| f.input_field()).collect()
|
||||
}
|
||||
|
||||
fn build(self, intermediate_keys: &[String]) -> Result<ProcessorKind, String> {
|
||||
self.build(intermediate_keys).map(ProcessorKind::Date)
|
||||
}
|
||||
}
|
||||
|
||||
impl DateProcessorBuilder {
|
||||
pub fn build(self, intermediate_keys: &[String]) -> Result<DateProcessor, String> {
|
||||
let mut real_fields = vec![];
|
||||
for field in self.fields.into_iter() {
|
||||
let input = OneInputOneOutputField::build(
|
||||
"date",
|
||||
intermediate_keys,
|
||||
field.input_field(),
|
||||
field.target_or_input_field(),
|
||||
)?;
|
||||
real_fields.push(input);
|
||||
}
|
||||
Ok(DateProcessor {
|
||||
fields: real_fields,
|
||||
formats: self.formats,
|
||||
timezone: self.timezone,
|
||||
locale: self.locale,
|
||||
ignore_missing: self.ignore_missing,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
impl TryFrom<&yaml_rust::yaml::Hash> for DateProcessorBuilder {
|
||||
type Error = String;
|
||||
|
||||
fn try_from(hash: &yaml_rust::yaml::Hash) -> Result<Self, Self::Error> {
|
||||
let mut fields = Fields::default();
|
||||
let mut formats = Formats::default();
|
||||
let mut timezone = None;
|
||||
let mut locale = None;
|
||||
let mut ignore_missing = false;
|
||||
|
||||
for (k, v) in hash {
|
||||
let key = k
|
||||
.as_str()
|
||||
.ok_or(format!("key must be a string, but got {k:?}"))?;
|
||||
|
||||
match key {
|
||||
FIELD_NAME => {
|
||||
fields = Fields::one(yaml_new_field(v, FIELD_NAME)?);
|
||||
}
|
||||
FIELDS_NAME => {
|
||||
fields = yaml_new_fields(v, FIELDS_NAME)?;
|
||||
}
|
||||
|
||||
FORMATS_NAME => {
|
||||
let format_strs = yaml_strings(v, FORMATS_NAME)?;
|
||||
if format_strs.is_empty() {
|
||||
formats = Formats::new(DEFAULT_FORMATS.clone());
|
||||
} else {
|
||||
formats = Formats::new(format_strs.into_iter().map(Arc::new).collect());
|
||||
}
|
||||
}
|
||||
TIMEZONE_NAME => {
|
||||
timezone = Some(Arc::new(yaml_string(v, TIMEZONE_NAME)?));
|
||||
}
|
||||
LOCALE_NAME => {
|
||||
locale = Some(Arc::new(yaml_string(v, LOCALE_NAME)?));
|
||||
}
|
||||
IGNORE_MISSING_NAME => {
|
||||
ignore_missing = yaml_bool(v, IGNORE_MISSING_NAME)?;
|
||||
}
|
||||
|
||||
_ => {}
|
||||
}
|
||||
}
|
||||
|
||||
let builder = DateProcessorBuilder {
|
||||
fields,
|
||||
formats,
|
||||
timezone,
|
||||
locale,
|
||||
ignore_missing,
|
||||
};
|
||||
|
||||
Ok(builder)
|
||||
}
|
||||
}
|
||||
|
||||
/// deprecated it should be removed in the future
|
||||
/// Reserved for compatibility only
|
||||
#[derive(Debug, Default)]
|
||||
pub struct DateProcessor {
|
||||
fields: Fields,
|
||||
|
||||
fields: Vec<OneInputOneOutputField>,
|
||||
formats: Formats,
|
||||
timezone: Option<Arc<String>>,
|
||||
locale: Option<Arc<String>>, // to support locale
|
||||
output_format: Option<Arc<String>>,
|
||||
|
||||
ignore_missing: bool,
|
||||
// description
|
||||
@@ -96,43 +205,6 @@ pub struct DateProcessor {
|
||||
}
|
||||
|
||||
impl DateProcessor {
|
||||
fn with_fields(&mut self, mut fields: Fields) {
|
||||
update_one_one_output_keys(&mut fields);
|
||||
self.fields = fields
|
||||
}
|
||||
|
||||
fn with_formats(&mut self, v: Option<Vec<Arc<String>>>) {
|
||||
let v = match v {
|
||||
Some(v) if !v.is_empty() => v,
|
||||
_ => DEFAULT_FORMATS.clone(),
|
||||
};
|
||||
|
||||
let formats = Formats::new(v);
|
||||
self.formats = formats;
|
||||
}
|
||||
|
||||
fn with_timezone(&mut self, timezone: String) {
|
||||
if !timezone.is_empty() {
|
||||
self.timezone = Some(Arc::new(timezone));
|
||||
}
|
||||
}
|
||||
|
||||
fn with_locale(&mut self, locale: String) {
|
||||
if !locale.is_empty() {
|
||||
self.locale = Some(Arc::new(locale));
|
||||
}
|
||||
}
|
||||
|
||||
fn with_output_format(&mut self, output_format: String) {
|
||||
if !output_format.is_empty() {
|
||||
self.output_format = Some(Arc::new(output_format));
|
||||
}
|
||||
}
|
||||
|
||||
fn with_ignore_missing(&mut self, ignore_missing: bool) {
|
||||
self.ignore_missing = ignore_missing;
|
||||
}
|
||||
|
||||
fn parse(&self, val: &str) -> Result<Timestamp, String> {
|
||||
let mut tz = Tz::UTC;
|
||||
if let Some(timezone) = &self.timezone {
|
||||
@@ -147,61 +219,6 @@ impl DateProcessor {
|
||||
|
||||
Err(format!("{} processor: failed to parse {val}", self.kind(),))
|
||||
}
|
||||
|
||||
fn process_field(&self, val: &str, field: &Field) -> Result<Map, String> {
|
||||
let key = field.get_target_field();
|
||||
|
||||
Ok(Map::one(key, Value::Timestamp(self.parse(val)?)))
|
||||
}
|
||||
}
|
||||
|
||||
impl TryFrom<&yaml_rust::yaml::Hash> for DateProcessor {
|
||||
type Error = String;
|
||||
|
||||
fn try_from(hash: &yaml_rust::yaml::Hash) -> Result<Self, Self::Error> {
|
||||
let mut processor = DateProcessor::default();
|
||||
|
||||
let mut formats_opt = None;
|
||||
|
||||
for (k, v) in hash {
|
||||
let key = k
|
||||
.as_str()
|
||||
.ok_or(format!("key must be a string, but got {k:?}"))?;
|
||||
|
||||
match key {
|
||||
FIELD_NAME => {
|
||||
processor.with_fields(Fields::one(yaml_field(v, FIELD_NAME)?));
|
||||
}
|
||||
FIELDS_NAME => {
|
||||
processor.with_fields(yaml_fields(v, FIELDS_NAME)?);
|
||||
}
|
||||
|
||||
FORMATS_NAME => {
|
||||
let formats = yaml_strings(v, FORMATS_NAME)?;
|
||||
formats_opt = Some(formats.into_iter().map(Arc::new).collect());
|
||||
}
|
||||
TIMEZONE_NAME => {
|
||||
processor.with_timezone(yaml_string(v, TIMEZONE_NAME)?);
|
||||
}
|
||||
LOCALE_NAME => {
|
||||
processor.with_locale(yaml_string(v, LOCALE_NAME)?);
|
||||
}
|
||||
OUTPUT_FORMAT_NAME => {
|
||||
processor.with_output_format(yaml_string(v, OUTPUT_FORMAT_NAME)?);
|
||||
}
|
||||
|
||||
IGNORE_MISSING_NAME => {
|
||||
processor.with_ignore_missing(yaml_bool(v, IGNORE_MISSING_NAME)?);
|
||||
}
|
||||
|
||||
_ => {}
|
||||
}
|
||||
}
|
||||
|
||||
processor.with_formats(formats_opt);
|
||||
|
||||
Ok(processor)
|
||||
}
|
||||
}
|
||||
|
||||
impl Processor for DateProcessor {
|
||||
@@ -213,53 +230,21 @@ impl Processor for DateProcessor {
|
||||
self.ignore_missing
|
||||
}
|
||||
|
||||
fn fields(&self) -> &Fields {
|
||||
&self.fields
|
||||
}
|
||||
|
||||
fn fields_mut(&mut self) -> &mut Fields {
|
||||
&mut self.fields
|
||||
}
|
||||
|
||||
fn output_keys(&self) -> HashSet<String> {
|
||||
self.fields
|
||||
.iter()
|
||||
.map(|f| f.get_target_field().to_string())
|
||||
.collect()
|
||||
}
|
||||
|
||||
fn exec_field(&self, val: &Value, field: &Field) -> Result<Map, String> {
|
||||
match val {
|
||||
Value::String(s) => self.process_field(s, field),
|
||||
_ => Err(format!(
|
||||
"{} processor: expect string value, but got {val:?}",
|
||||
self.kind()
|
||||
)),
|
||||
}
|
||||
}
|
||||
|
||||
fn exec_mut(&self, val: &mut Vec<Value>) -> Result<(), String> {
|
||||
for field in self.fields().iter() {
|
||||
let index = field.input_field.index;
|
||||
for field in self.fields.iter() {
|
||||
let index = field.input_index();
|
||||
match val.get(index) {
|
||||
Some(Value::String(s)) => {
|
||||
// TODO(qtang): Let this method use the intermediate state collection directly.
|
||||
let mut map = self.process_field(s, field)?;
|
||||
field
|
||||
.output_fields_index_mapping
|
||||
.iter()
|
||||
.for_each(|(k, output_index)| {
|
||||
if let Some(v) = map.remove(k) {
|
||||
val[*output_index] = v;
|
||||
}
|
||||
});
|
||||
let timestamp = self.parse(s)?;
|
||||
let output_index = field.output_index();
|
||||
val[output_index] = Value::Timestamp(timestamp);
|
||||
}
|
||||
Some(Value::Null) | None => {
|
||||
if !self.ignore_missing {
|
||||
return Err(format!(
|
||||
"{} processor: missing field: {}",
|
||||
self.kind(),
|
||||
field.get_field_name()
|
||||
field.input_name()
|
||||
));
|
||||
}
|
||||
}
|
||||
@@ -318,8 +303,7 @@ mod tests {

    #[test]
    fn test_parse() {
        let mut processor = DateProcessor::default();
        processor.with_formats(None);
        let processor = DateProcessor::default();

        let values: Vec<&str> = vec![
            "2014-5-17T12:34:56",
@@ -340,7 +324,6 @@ mod tests {

    #[test]
    fn test_parse_with_formats() {
        let mut processor = DateProcessor::default();
        let formats = vec![
            "%Y-%m-%dT%H:%M:%S%:z",
            "%Y-%m-%dT%H:%M:%S%.3f%:z",
@@ -349,8 +332,11 @@ mod tests {
        ]
        .into_iter()
        .map(|s| Arc::new(s.to_string()))
        .collect();
        processor.with_formats(Some(formats));
        .collect::<Vec<_>>();
        let processor = DateProcessor {
            formats: super::Formats(formats),
            ..Default::default()
        };

        let values: Vec<&str> = vec![
            "2014-5-17T12:34:56",
@@ -371,9 +357,10 @@ mod tests {

    #[test]
    fn test_parse_with_timezone() {
        let mut processor = DateProcessor::default();
        processor.with_formats(None);
        processor.with_timezone("Asia/Tokyo".to_string());
        let processor = DateProcessor {
            timezone: Some(Arc::new("Asia/Tokyo".to_string())),
            ..Default::default()
        };

        let values: Vec<&str> = vec![
            "2014-5-17T12:34:56",

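The date tests above now build `DateProcessor` literals directly and rely on a sorted list of candidate formats. As a rough, self-contained sketch of the try-each-format idea, using `chrono` directly rather than the pipeline types (the function name is illustrative):

```rust
// Minimal sketch, not the crate's implementation: try each candidate format in
// order and return the first successful parse.
use chrono::NaiveDateTime;

fn parse_with_formats(val: &str, formats: &[&str]) -> Option<NaiveDateTime> {
    formats
        .iter()
        .find_map(|fmt| NaiveDateTime::parse_from_str(val, fmt).ok())
}

fn main() {
    let formats = ["%Y-%m-%dT%H:%M:%S%.3f", "%Y-%m-%dT%H:%M:%S"];
    assert!(parse_with_formats("2014-05-17T12:34:56", &formats).is_some());
}
```

Because the first matching format wins, more specific formats should be listed before more permissive ones.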
File diff suppressed because it is too large
@@ -14,17 +14,17 @@

use ahash::HashSet;

use crate::etl::field::{Field, Fields};
use crate::etl::field::{Fields, OneInputOneOutputField};
use crate::etl::processor::{
    update_one_one_output_keys, yaml_bool, yaml_field, yaml_fields, yaml_string, Processor,
    FIELDS_NAME, FIELD_NAME, IGNORE_MISSING_NAME,
    yaml_bool, yaml_new_field, yaml_new_fields, yaml_string, Processor, ProcessorBuilder,
    ProcessorKind, FIELDS_NAME, FIELD_NAME, IGNORE_MISSING_NAME,
};
use crate::etl::value::time::{
    MICROSECOND_RESOLUTION, MICRO_RESOLUTION, MILLISECOND_RESOLUTION, MILLI_RESOLUTION,
    MS_RESOLUTION, NANOSECOND_RESOLUTION, NANO_RESOLUTION, NS_RESOLUTION, SECOND_RESOLUTION,
    SEC_RESOLUTION, S_RESOLUTION, US_RESOLUTION,
};
use crate::etl::value::{Map, Timestamp, Value};
use crate::etl::value::{Timestamp, Value};

pub(crate) const PROCESSOR_EPOCH: &str = "epoch";
const RESOLUTION_NAME: &str = "resolution";
@@ -52,12 +52,56 @@ impl TryFrom<&str> for Resolution {
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Default)]
|
||||
pub struct EpochProcessorBuilder {
|
||||
fields: Fields,
|
||||
resolution: Resolution,
|
||||
ignore_missing: bool,
|
||||
}
|
||||
|
||||
impl ProcessorBuilder for EpochProcessorBuilder {
|
||||
fn output_keys(&self) -> HashSet<&str> {
|
||||
self.fields
|
||||
.iter()
|
||||
.map(|f| f.target_or_input_field())
|
||||
.collect()
|
||||
}
|
||||
|
||||
fn input_keys(&self) -> HashSet<&str> {
|
||||
self.fields.iter().map(|f| f.input_field()).collect()
|
||||
}
|
||||
|
||||
fn build(self, intermediate_keys: &[String]) -> Result<ProcessorKind, String> {
|
||||
self.build(intermediate_keys).map(ProcessorKind::Epoch)
|
||||
}
|
||||
}
|
||||
|
||||
impl EpochProcessorBuilder {
|
||||
pub fn build(self, intermediate_keys: &[String]) -> Result<EpochProcessor, String> {
|
||||
let mut real_fields = vec![];
|
||||
for field in self.fields.into_iter() {
|
||||
let input = OneInputOneOutputField::build(
|
||||
"epoch",
|
||||
intermediate_keys,
|
||||
field.input_field(),
|
||||
field.target_or_input_field(),
|
||||
)?;
|
||||
real_fields.push(input);
|
||||
}
|
||||
Ok(EpochProcessor {
|
||||
fields: real_fields,
|
||||
resolution: self.resolution,
|
||||
ignore_missing: self.ignore_missing,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
/// support string, integer, float, time, epoch
|
||||
/// deprecated it should be removed in the future
|
||||
/// Reserved for compatibility only
|
||||
#[derive(Debug, Default)]
|
||||
pub struct EpochProcessor {
|
||||
fields: Fields,
|
||||
fields: Vec<OneInputOneOutputField>,
|
||||
resolution: Resolution,
|
||||
ignore_missing: bool,
|
||||
// description
|
||||
@@ -68,19 +112,6 @@ pub struct EpochProcessor {
|
||||
}
|
||||
|
||||
impl EpochProcessor {
|
||||
fn with_fields(&mut self, mut fields: Fields) {
|
||||
update_one_one_output_keys(&mut fields);
|
||||
self.fields = fields
|
||||
}
|
||||
|
||||
fn with_resolution(&mut self, resolution: Resolution) {
|
||||
self.resolution = resolution;
|
||||
}
|
||||
|
||||
fn with_ignore_missing(&mut self, ignore_missing: bool) {
|
||||
self.ignore_missing = ignore_missing;
|
||||
}
|
||||
|
||||
fn parse(&self, val: &Value) -> Result<Timestamp, String> {
|
||||
let t: i64 = match val {
|
||||
Value::String(s) => s
|
||||
@@ -117,19 +148,15 @@ impl EpochProcessor {
|
||||
Resolution::Nano => Ok(Timestamp::Nanosecond(t)),
|
||||
}
|
||||
}
|
||||
|
||||
fn process_field(&self, val: &Value, field: &Field) -> Result<Map, String> {
|
||||
let key = field.get_target_field();
|
||||
|
||||
Ok(Map::one(key, Value::Timestamp(self.parse(val)?)))
|
||||
}
|
||||
}
|
||||
|
||||
impl TryFrom<&yaml_rust::yaml::Hash> for EpochProcessor {
|
||||
impl TryFrom<&yaml_rust::yaml::Hash> for EpochProcessorBuilder {
|
||||
type Error = String;
|
||||
|
||||
fn try_from(hash: &yaml_rust::yaml::Hash) -> Result<Self, Self::Error> {
|
||||
let mut processor = EpochProcessor::default();
|
||||
let mut fields = Fields::default();
|
||||
let mut resolution = Resolution::default();
|
||||
let mut ignore_missing = false;
|
||||
|
||||
for (k, v) in hash {
|
||||
let key = k
|
||||
@@ -138,24 +165,29 @@ impl TryFrom<&yaml_rust::yaml::Hash> for EpochProcessor {
|
||||
|
||||
match key {
|
||||
FIELD_NAME => {
|
||||
processor.with_fields(Fields::one(yaml_field(v, FIELD_NAME)?));
|
||||
fields = Fields::one(yaml_new_field(v, FIELD_NAME)?);
|
||||
}
|
||||
FIELDS_NAME => {
|
||||
processor.with_fields(yaml_fields(v, FIELDS_NAME)?);
|
||||
fields = yaml_new_fields(v, FIELDS_NAME)?;
|
||||
}
|
||||
RESOLUTION_NAME => {
|
||||
let s = yaml_string(v, RESOLUTION_NAME)?.as_str().try_into()?;
|
||||
processor.with_resolution(s);
|
||||
resolution = s;
|
||||
}
|
||||
IGNORE_MISSING_NAME => {
|
||||
processor.with_ignore_missing(yaml_bool(v, IGNORE_MISSING_NAME)?);
|
||||
ignore_missing = yaml_bool(v, IGNORE_MISSING_NAME)?;
|
||||
}
|
||||
|
||||
_ => {}
|
||||
}
|
||||
}
|
||||
let builder = EpochProcessorBuilder {
|
||||
fields,
|
||||
resolution,
|
||||
ignore_missing,
|
||||
};
|
||||
|
||||
Ok(processor)
|
||||
Ok(builder)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -168,49 +200,23 @@ impl Processor for EpochProcessor {
|
||||
self.ignore_missing
|
||||
}
|
||||
|
||||
fn fields(&self) -> &Fields {
|
||||
&self.fields
|
||||
}
|
||||
|
||||
fn fields_mut(&mut self) -> &mut Fields {
|
||||
&mut self.fields
|
||||
}
|
||||
|
||||
fn output_keys(&self) -> HashSet<String> {
|
||||
self.fields
|
||||
.iter()
|
||||
.map(|f| f.get_target_field().to_string())
|
||||
.collect()
|
||||
}
|
||||
|
||||
fn exec_field(&self, val: &Value, field: &Field) -> Result<Map, String> {
|
||||
self.process_field(val, field)
|
||||
}
|
||||
|
||||
fn exec_mut(&self, val: &mut Vec<Value>) -> Result<(), String> {
|
||||
for field in self.fields.iter() {
|
||||
let index = field.input_field.index;
|
||||
let index = field.input_index();
|
||||
match val.get(index) {
|
||||
Some(Value::Null) | None => {
|
||||
if !self.ignore_missing {
|
||||
return Err(format!(
|
||||
"{} processor: missing field: {}",
|
||||
self.kind(),
|
||||
field.get_field_name()
|
||||
field.input_name()
|
||||
));
|
||||
}
|
||||
}
|
||||
Some(v) => {
|
||||
// TODO(qtang): Let this method use the intermediate state collection directly.
|
||||
let mut map = self.process_field(v, field)?;
|
||||
field
|
||||
.output_fields_index_mapping
|
||||
.iter()
|
||||
.for_each(|(k, output_index)| {
|
||||
if let Some(v) = map.remove(k) {
|
||||
val[*output_index] = v;
|
||||
}
|
||||
});
|
||||
let timestamp = self.parse(v)?;
|
||||
let output_index = field.output_index();
|
||||
val[output_index] = Value::Timestamp(timestamp);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -225,8 +231,10 @@ mod tests {

    #[test]
    fn test_parse_epoch() {
        let mut processor = EpochProcessor::default();
        processor.with_resolution(super::Resolution::Second);
        let processor = EpochProcessor {
            resolution: super::Resolution::Second,
            ..Default::default()
        };

        let values = [
            Value::String("1573840000".into()),

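The epoch test pins the resolution to seconds before parsing. A hedged sketch of the resolution dispatch, with stand-in enum names rather than the crate's `Timestamp` and `Resolution` types:

```rust
// Illustrative stand-ins only: an epoch integer is wrapped according to the
// configured resolution, mirroring what the epoch processor's parse step does.
#[derive(Debug, PartialEq)]
enum Ts {
    Second(i64),
    Millisecond(i64),
    Microsecond(i64),
    Nanosecond(i64),
}

enum Resolution {
    Second,
    Milli,
    Micro,
    Nano,
}

fn to_timestamp(t: i64, resolution: &Resolution) -> Ts {
    match resolution {
        Resolution::Second => Ts::Second(t),
        Resolution::Milli => Ts::Millisecond(t),
        Resolution::Micro => Ts::Microsecond(t),
        Resolution::Nano => Ts::Nanosecond(t),
    }
}

fn main() {
    assert_eq!(
        to_timestamp(1_573_840_000, &Resolution::Second),
        Ts::Second(1_573_840_000)
    );
}
```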
@@ -15,45 +15,43 @@
use ahash::HashSet;
use regex::Regex;

use crate::etl::field::{Field, Fields};
use crate::etl::field::{Fields, OneInputOneOutputField};
use crate::etl::processor::{
    update_one_one_output_keys, yaml_bool, yaml_field, yaml_fields, yaml_string, Processor,
    FIELDS_NAME, FIELD_NAME, IGNORE_MISSING_NAME, PATTERN_NAME,
    yaml_bool, yaml_new_field, yaml_new_fields, yaml_string, Processor, ProcessorBuilder,
    ProcessorKind, FIELDS_NAME, FIELD_NAME, IGNORE_MISSING_NAME, PATTERN_NAME,
};
use crate::etl::value::{Array, Map, Value};
use crate::etl::value::Value;

pub(crate) const PROCESSOR_GSUB: &str = "gsub";

const REPLACEMENT_NAME: &str = "replacement";

/// A processor to replace all matches of a pattern in string by a replacement, only support string value, and array string value
|
||||
#[derive(Debug, Default)]
|
||||
pub struct GsubProcessor {
|
||||
pub struct GsubProcessorBuilder {
|
||||
fields: Fields,
|
||||
pattern: Option<Regex>,
|
||||
replacement: Option<String>,
|
||||
ignore_missing: bool,
|
||||
}
|
||||
|
||||
impl GsubProcessor {
|
||||
fn with_fields(&mut self, mut fields: Fields) {
|
||||
update_one_one_output_keys(&mut fields);
|
||||
self.fields = fields;
|
||||
impl ProcessorBuilder for GsubProcessorBuilder {
|
||||
fn output_keys(&self) -> HashSet<&str> {
|
||||
self.fields
|
||||
.iter()
|
||||
.map(|f| f.target_or_input_field())
|
||||
.collect()
|
||||
}
|
||||
|
||||
fn with_ignore_missing(&mut self, ignore_missing: bool) {
|
||||
self.ignore_missing = ignore_missing;
|
||||
fn input_keys(&self) -> HashSet<&str> {
|
||||
self.fields.iter().map(|f| f.input_field()).collect()
|
||||
}
|
||||
|
||||
fn try_pattern(&mut self, pattern: &str) -> Result<(), String> {
|
||||
self.pattern = Some(Regex::new(pattern).map_err(|e| e.to_string())?);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn with_replacement(&mut self, replacement: impl Into<String>) {
|
||||
self.replacement = Some(replacement.into());
|
||||
fn build(self, intermediate_keys: &[String]) -> Result<ProcessorKind, String> {
|
||||
self.build(intermediate_keys).map(ProcessorKind::Gsub)
|
||||
}
|
||||
}
|
||||
|
||||
impl GsubProcessorBuilder {
|
||||
fn check(self) -> Result<Self, String> {
|
||||
if self.pattern.is_none() {
|
||||
return Err("pattern is required".to_string());
|
||||
@@ -66,7 +64,49 @@ impl GsubProcessor {
|
||||
Ok(self)
|
||||
}
|
||||
|
||||
fn process_string_field(&self, val: &str, field: &Field) -> Result<Map, String> {
|
||||
fn build(self, intermediate_keys: &[String]) -> Result<GsubProcessor, String> {
|
||||
let mut real_fields = vec![];
|
||||
for field in self.fields.into_iter() {
|
||||
let input = OneInputOneOutputField::build(
|
||||
"gsub",
|
||||
intermediate_keys,
|
||||
field.input_field(),
|
||||
field.target_or_input_field(),
|
||||
)?;
|
||||
real_fields.push(input);
|
||||
}
|
||||
Ok(GsubProcessor {
|
||||
fields: real_fields,
|
||||
pattern: self.pattern,
|
||||
replacement: self.replacement,
|
||||
ignore_missing: self.ignore_missing,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
/// A processor to replace all matches of a pattern in string by a replacement, only support string value, and array string value
|
||||
#[derive(Debug, Default)]
|
||||
pub struct GsubProcessor {
|
||||
fields: Vec<OneInputOneOutputField>,
|
||||
pattern: Option<Regex>,
|
||||
replacement: Option<String>,
|
||||
ignore_missing: bool,
|
||||
}
|
||||
|
||||
impl GsubProcessor {
|
||||
fn check(self) -> Result<Self, String> {
|
||||
if self.pattern.is_none() {
|
||||
return Err("pattern is required".to_string());
|
||||
}
|
||||
|
||||
if self.replacement.is_none() {
|
||||
return Err("replacement is required".to_string());
|
||||
}
|
||||
|
||||
Ok(self)
|
||||
}
|
||||
|
||||
fn process_string(&self, val: &str) -> Result<Value, String> {
|
||||
let replacement = self.replacement.as_ref().unwrap();
|
||||
let new_val = self
|
||||
.pattern
|
||||
@@ -76,42 +116,28 @@ impl GsubProcessor {
|
||||
.to_string();
|
||||
let val = Value::String(new_val);
|
||||
|
||||
let key = field.get_target_field();
|
||||
|
||||
Ok(Map::one(key, val))
|
||||
Ok(val)
|
||||
}
|
||||
|
||||
fn process_array_field(&self, arr: &Array, field: &Field) -> Result<Map, String> {
|
||||
let key = field.get_target_field();
|
||||
|
||||
let re = self.pattern.as_ref().unwrap();
|
||||
let replacement = self.replacement.as_ref().unwrap();
|
||||
|
||||
let mut result = Array::default();
|
||||
for val in arr.iter() {
|
||||
match val {
|
||||
Value::String(haystack) => {
|
||||
let new_val = re.replace_all(haystack, replacement).to_string();
|
||||
result.push(Value::String(new_val));
|
||||
}
|
||||
_ => {
|
||||
return Err(format!(
|
||||
"{} processor: expect string or array string, but got {val:?}",
|
||||
self.kind()
|
||||
))
|
||||
}
|
||||
}
|
||||
fn process(&self, val: &Value) -> Result<Value, String> {
|
||||
match val {
|
||||
Value::String(val) => self.process_string(val),
|
||||
_ => Err(format!(
|
||||
"{} processor: expect string or array string, but got {val:?}",
|
||||
self.kind()
|
||||
)),
|
||||
}
|
||||
|
||||
Ok(Map::one(key, Value::Array(result)))
|
||||
}
|
||||
}
|
||||
|
||||
impl TryFrom<&yaml_rust::yaml::Hash> for GsubProcessor {
|
||||
impl TryFrom<&yaml_rust::yaml::Hash> for GsubProcessorBuilder {
|
||||
type Error = String;
|
||||
|
||||
fn try_from(value: &yaml_rust::yaml::Hash) -> Result<Self, Self::Error> {
|
||||
let mut processor = GsubProcessor::default();
|
||||
let mut fields = Fields::default();
|
||||
let mut ignore_missing = false;
|
||||
let mut pattern = None;
|
||||
let mut replacement = None;
|
||||
|
||||
for (k, v) in value.iter() {
|
||||
let key = k
|
||||
@@ -119,27 +145,36 @@ impl TryFrom<&yaml_rust::yaml::Hash> for GsubProcessor {
|
||||
.ok_or(format!("key must be a string, but got {k:?}"))?;
|
||||
match key {
|
||||
FIELD_NAME => {
|
||||
processor.with_fields(Fields::one(yaml_field(v, FIELD_NAME)?));
|
||||
fields = Fields::one(yaml_new_field(v, FIELD_NAME)?);
|
||||
}
|
||||
FIELDS_NAME => {
|
||||
processor.with_fields(yaml_fields(v, FIELDS_NAME)?);
|
||||
fields = yaml_new_fields(v, FIELDS_NAME)?;
|
||||
}
|
||||
PATTERN_NAME => {
|
||||
processor.try_pattern(&yaml_string(v, PATTERN_NAME)?)?;
|
||||
let pattern_str = yaml_string(v, PATTERN_NAME)?;
|
||||
pattern = Some(Regex::new(&pattern_str).map_err(|e| e.to_string())?);
|
||||
}
|
||||
REPLACEMENT_NAME => {
|
||||
processor.with_replacement(yaml_string(v, REPLACEMENT_NAME)?);
|
||||
let replacement_str = yaml_string(v, REPLACEMENT_NAME)?;
|
||||
replacement = Some(replacement_str);
|
||||
}
|
||||
|
||||
IGNORE_MISSING_NAME => {
|
||||
processor.with_ignore_missing(yaml_bool(v, IGNORE_MISSING_NAME)?);
|
||||
ignore_missing = yaml_bool(v, IGNORE_MISSING_NAME)?;
|
||||
}
|
||||
|
||||
_ => {}
|
||||
}
|
||||
}
|
||||
|
||||
processor.check()
|
||||
let builder = GsubProcessorBuilder {
|
||||
fields,
|
||||
pattern,
|
||||
replacement,
|
||||
ignore_missing,
|
||||
};
|
||||
|
||||
builder.check()
|
||||
}
|
||||
}
|
||||
|
||||
@@ -152,56 +187,23 @@ impl crate::etl::processor::Processor for GsubProcessor {
|
||||
self.ignore_missing
|
||||
}
|
||||
|
||||
fn fields(&self) -> &Fields {
|
||||
&self.fields
|
||||
}
|
||||
|
||||
fn fields_mut(&mut self) -> &mut Fields {
|
||||
&mut self.fields
|
||||
}
|
||||
|
||||
fn output_keys(&self) -> HashSet<String> {
|
||||
self.fields
|
||||
.iter()
|
||||
.map(|f| f.get_target_field().to_string())
|
||||
.collect()
|
||||
}
|
||||
|
||||
fn exec_field(&self, val: &Value, field: &Field) -> Result<Map, String> {
|
||||
match val {
|
||||
Value::String(val) => self.process_string_field(val, field),
|
||||
Value::Array(arr) => self.process_array_field(arr, field),
|
||||
_ => Err(format!(
|
||||
"{} processor: expect string or array string, but got {val:?}",
|
||||
self.kind()
|
||||
)),
|
||||
}
|
||||
}
|
||||
|
||||
fn exec_mut(&self, val: &mut Vec<Value>) -> Result<(), String> {
|
||||
for field in self.fields.iter() {
|
||||
let index = field.input_field.index;
|
||||
let index = field.input_index();
|
||||
match val.get(index) {
|
||||
Some(Value::Null) | None => {
|
||||
if !self.ignore_missing {
|
||||
return Err(format!(
|
||||
"{} processor: missing field: {}",
|
||||
self.kind(),
|
||||
field.get_field_name()
|
||||
field.input_name()
|
||||
));
|
||||
}
|
||||
}
|
||||
Some(v) => {
|
||||
// TODO(qtang): Let this method use the intermediate state collection directly.
|
||||
let mut map = self.exec_field(v, field)?;
|
||||
field
|
||||
.output_fields_index_mapping
|
||||
.iter()
|
||||
.for_each(|(k, output_index)| {
|
||||
if let Some(v) = map.remove(k) {
|
||||
val[*output_index] = v;
|
||||
}
|
||||
});
|
||||
let result = self.process(v)?;
|
||||
let output_index = field.output_index();
|
||||
val[output_index] = result;
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -211,55 +213,20 @@ impl crate::etl::processor::Processor for GsubProcessor {

#[cfg(test)]
mod tests {
    use crate::etl::field::Field;
    use crate::etl::processor::gsub::GsubProcessor;
    use crate::etl::processor::Processor;
    use crate::etl::value::{Map, Value};
    use crate::etl::value::Value;

    #[test]
    fn test_string_value() {
        let mut processor = GsubProcessor::default();
        processor.try_pattern(r"\d+").unwrap();
        processor.with_replacement("xxx");
        let processor = GsubProcessor {
            pattern: Some(regex::Regex::new(r"\d+").unwrap()),
            replacement: Some("xxx".to_string()),
            ..Default::default()
        };

        let field = Field::new("message");
        let val = Value::String("123".to_string());
        let result = processor.exec_field(&val, &field).unwrap();
        let result = processor.process(&val).unwrap();

        assert_eq!(
            result,
            Map::one("message", Value::String("xxx".to_string()))
        );
    }

    #[test]
    fn test_array_string_value() {
        let mut processor = GsubProcessor::default();
        processor.try_pattern(r"\d+").unwrap();
        processor.with_replacement("xxx");

        let field = Field::new("message");
        let val = Value::Array(
            vec![
                Value::String("123".to_string()),
                Value::String("456".to_string()),
            ]
            .into(),
        );
        let result = processor.exec_field(&val, &field).unwrap();

        assert_eq!(
            result,
            Map::one(
                "message",
                Value::Array(
                    vec![
                        Value::String("xxx".to_string()),
                        Value::String("xxx".to_string())
                    ]
                    .into()
                )
            )
        );
        assert_eq!(result, Value::String("xxx".to_string()));
    }
}

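The gsub tests above now call `process` on a struct literal; the core of that method is a plain `regex` replace-all. A small independent sketch, using the `regex` crate directly (the helper name is invented for the example):

```rust
// Illustrative only: the regex substitution at the heart of the gsub processor,
// shown without the pipeline's Field/Value types.
use regex::Regex;

fn gsub(haystack: &str, pattern: &str, replacement: &str) -> Result<String, String> {
    let re = Regex::new(pattern).map_err(|e| e.to_string())?;
    Ok(re.replace_all(haystack, replacement).to_string())
}

fn main() {
    assert_eq!(gsub("123abc456", r"\d+", "xxx").unwrap(), "xxxabcxxx");
}
```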
@@ -14,40 +14,78 @@

use ahash::HashSet;

use crate::etl::field::{Field, Fields};
use crate::etl::field::{Fields, OneInputOneOutputField};
use crate::etl::processor::{
    update_one_one_output_keys, yaml_bool, yaml_field, yaml_fields, yaml_string, Processor,
    FIELDS_NAME, FIELD_NAME, IGNORE_MISSING_NAME, SEPARATOR_NAME,
    yaml_bool, yaml_new_field, yaml_new_fields, yaml_string, Processor, ProcessorBuilder,
    ProcessorKind, FIELDS_NAME, FIELD_NAME, IGNORE_MISSING_NAME, SEPARATOR_NAME,
};
use crate::etl::value::{Array, Map, Value};
use crate::etl::value::{Array, Value};

pub(crate) const PROCESSOR_JOIN: &str = "join";

/// A processor to join each element of an array into a single string using a separator string between each element
|
||||
#[derive(Debug, Default)]
|
||||
pub struct JoinProcessor {
|
||||
pub struct JoinProcessorBuilder {
|
||||
fields: Fields,
|
||||
separator: Option<String>,
|
||||
ignore_missing: bool,
|
||||
}
|
||||
|
||||
impl ProcessorBuilder for JoinProcessorBuilder {
|
||||
fn output_keys(&self) -> HashSet<&str> {
|
||||
self.fields
|
||||
.iter()
|
||||
.map(|f| f.target_or_input_field())
|
||||
.collect()
|
||||
}
|
||||
|
||||
fn input_keys(&self) -> HashSet<&str> {
|
||||
self.fields.iter().map(|f| f.input_field()).collect()
|
||||
}
|
||||
|
||||
fn build(self, intermediate_keys: &[String]) -> Result<ProcessorKind, String> {
|
||||
self.build(intermediate_keys).map(ProcessorKind::Join)
|
||||
}
|
||||
}
|
||||
|
||||
impl JoinProcessorBuilder {
|
||||
fn check(self) -> Result<Self, String> {
|
||||
if self.separator.is_none() {
|
||||
return Err("separator is required".to_string());
|
||||
}
|
||||
|
||||
Ok(self)
|
||||
}
|
||||
|
||||
pub fn build(self, intermediate_keys: &[String]) -> Result<JoinProcessor, String> {
|
||||
let mut real_fields = vec![];
|
||||
for field in self.fields.into_iter() {
|
||||
let input = OneInputOneOutputField::build(
|
||||
"join",
|
||||
intermediate_keys,
|
||||
field.input_field(),
|
||||
field.target_or_input_field(),
|
||||
)?;
|
||||
real_fields.push(input);
|
||||
}
|
||||
|
||||
Ok(JoinProcessor {
|
||||
fields: real_fields,
|
||||
separator: self.separator,
|
||||
ignore_missing: self.ignore_missing,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
/// A processor to join each element of an array into a single string using a separator string between each element
|
||||
#[derive(Debug, Default)]
|
||||
pub struct JoinProcessor {
|
||||
fields: Vec<OneInputOneOutputField>,
|
||||
separator: Option<String>,
|
||||
ignore_missing: bool,
|
||||
}
|
||||
|
||||
impl JoinProcessor {
|
||||
fn with_fields(&mut self, mut fields: Fields) {
|
||||
update_one_one_output_keys(&mut fields);
|
||||
self.fields = fields;
|
||||
}
|
||||
|
||||
fn with_separator(&mut self, separator: impl Into<String>) {
|
||||
self.separator = Some(separator.into());
|
||||
}
|
||||
|
||||
fn with_ignore_missing(&mut self, ignore_missing: bool) {
|
||||
self.ignore_missing = ignore_missing;
|
||||
}
|
||||
|
||||
fn process_field(&self, arr: &Array, field: &Field) -> Result<Map, String> {
|
||||
let key = field.get_target_field();
|
||||
|
||||
fn process(&self, arr: &Array) -> Result<Value, String> {
|
||||
let sep = self.separator.as_ref().unwrap();
|
||||
let val = arr
|
||||
.iter()
|
||||
@@ -55,7 +93,7 @@ impl JoinProcessor {
|
||||
.collect::<Vec<String>>()
|
||||
.join(sep);
|
||||
|
||||
Ok(Map::one(key, Value::String(val)))
|
||||
Ok(Value::String(val))
|
||||
}
|
||||
|
||||
fn check(self) -> Result<Self, String> {
|
||||
@@ -67,11 +105,13 @@ impl JoinProcessor {
|
||||
}
|
||||
}
|
||||
|
||||
impl TryFrom<&yaml_rust::yaml::Hash> for JoinProcessor {
|
||||
impl TryFrom<&yaml_rust::yaml::Hash> for JoinProcessorBuilder {
|
||||
type Error = String;
|
||||
|
||||
fn try_from(value: &yaml_rust::yaml::Hash) -> Result<Self, Self::Error> {
|
||||
let mut processor = JoinProcessor::default();
|
||||
let mut fields = Fields::default();
|
||||
let mut separator = None;
|
||||
let mut ignore_missing = false;
|
||||
|
||||
for (k, v) in value.iter() {
|
||||
let key = k
|
||||
@@ -79,30 +119,31 @@ impl TryFrom<&yaml_rust::yaml::Hash> for JoinProcessor {
|
||||
.ok_or(format!("key must be a string, but got {k:?}"))?;
|
||||
match key {
|
||||
FIELD_NAME => {
|
||||
processor.with_fields(Fields::one(yaml_field(v, FIELD_NAME)?));
|
||||
fields = Fields::one(yaml_new_field(v, FIELD_NAME)?);
|
||||
}
|
||||
FIELDS_NAME => {
|
||||
processor.with_fields(yaml_fields(v, FIELDS_NAME)?);
|
||||
fields = yaml_new_fields(v, FIELDS_NAME)?;
|
||||
}
|
||||
SEPARATOR_NAME => {
|
||||
processor.with_separator(yaml_string(v, SEPARATOR_NAME)?);
|
||||
separator = Some(yaml_string(v, SEPARATOR_NAME)?);
|
||||
}
|
||||
IGNORE_MISSING_NAME => {
|
||||
processor.with_ignore_missing(yaml_bool(v, IGNORE_MISSING_NAME)?);
|
||||
ignore_missing = yaml_bool(v, IGNORE_MISSING_NAME)?;
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
}
|
||||
|
||||
processor.check()
|
||||
let builder = JoinProcessorBuilder {
|
||||
fields,
|
||||
separator,
|
||||
ignore_missing,
|
||||
};
|
||||
builder.check()
|
||||
}
|
||||
}
|
||||
|
||||
impl Processor for JoinProcessor {
|
||||
fn fields(&self) -> &Fields {
|
||||
&self.fields
|
||||
}
|
||||
|
||||
fn kind(&self) -> &str {
|
||||
PROCESSOR_JOIN
|
||||
}
|
||||
@@ -111,49 +152,21 @@ impl Processor for JoinProcessor {
|
||||
self.ignore_missing
|
||||
}
|
||||
|
||||
fn fields_mut(&mut self) -> &mut Fields {
|
||||
&mut self.fields
|
||||
}
|
||||
|
||||
fn output_keys(&self) -> HashSet<String> {
|
||||
self.fields
|
||||
.iter()
|
||||
.map(|f| f.get_target_field().to_string())
|
||||
.collect()
|
||||
}
|
||||
|
||||
fn exec_field(&self, val: &Value, field: &Field) -> Result<Map, String> {
|
||||
match val {
|
||||
Value::Array(arr) => self.process_field(arr, field),
|
||||
_ => Err(format!(
|
||||
"{} processor: expect array value, but got {val:?}",
|
||||
self.kind()
|
||||
)),
|
||||
}
|
||||
}
|
||||
|
||||
fn exec_mut(&self, val: &mut Vec<Value>) -> Result<(), String> {
|
||||
for field in self.fields.iter() {
|
||||
let index = field.input_field.index;
|
||||
let index = field.input_index();
|
||||
match val.get(index) {
|
||||
Some(Value::Array(arr)) => {
|
||||
// TODO(qtang): Let this method use the intermediate state collection directly.
|
||||
let mut map = self.process_field(arr, field)?;
|
||||
field
|
||||
.output_fields_index_mapping
|
||||
.iter()
|
||||
.for_each(|(k, output_index)| {
|
||||
if let Some(v) = map.remove(k) {
|
||||
val[*output_index] = v;
|
||||
}
|
||||
});
|
||||
let result = self.process(arr)?;
|
||||
let output_index = field.output_index();
|
||||
val[output_index] = result;
|
||||
}
|
||||
Some(Value::Null) | None => {
|
||||
if !self.ignore_missing {
|
||||
return Err(format!(
|
||||
"{} processor: missing field: {}",
|
||||
self.kind(),
|
||||
field.get_field_name()
|
||||
field.input_name()
|
||||
));
|
||||
}
|
||||
}
|
||||
@@ -173,25 +186,22 @@ impl Processor for JoinProcessor {
#[cfg(test)]
mod tests {

    use crate::etl::field::Field;
    use crate::etl::processor::join::JoinProcessor;
    use crate::etl::processor::Processor;
    use crate::etl::value::{Map, Value};
    use crate::etl::value::Value;

    #[test]
    fn test_join_processor() {
        let mut processor = JoinProcessor::default();
        processor.with_separator("-");
        let processor = JoinProcessor {
            separator: Some("-".to_string()),
            ..Default::default()
        };

        let field = Field::new("test");
        let arr = Value::Array(
            vec![
                Value::String("a".to_string()),
                Value::String("b".to_string()),
            ]
            .into(),
        );
        let result = processor.exec_field(&arr, &field).unwrap();
        assert_eq!(result, Map::one("test", Value::String("a-b".to_string())));
        let arr = vec![
            Value::String("a".to_string()),
            Value::String("b".to_string()),
        ]
        .into();
        let result = processor.process(&arr).unwrap();
        assert_eq!(result, Value::String("a-b".to_string()));
    }
}

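The join test exercises `process`, which stringifies each array element and joins the pieces with the configured separator. A standalone sketch of just that step (the generic helper name is illustrative, not the crate's API):

```rust
// Standalone sketch: every element is stringified, then concatenated with the
// separator, as JoinProcessor::process does over an Array value.
fn join_values<T: ToString>(values: &[T], separator: &str) -> String {
    values
        .iter()
        .map(|v| v.to_string())
        .collect::<Vec<String>>()
        .join(separator)
}

fn main() {
    let parts = vec!["a".to_string(), "b".to_string()];
    assert_eq!(join_values(&parts, "-"), "a-b");
}
```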
@@ -14,12 +14,12 @@

use ahash::HashSet;

use crate::etl::field::{Field, Fields};
use crate::etl::field::{Fields, OneInputOneOutputField};
use crate::etl::processor::{
    update_one_one_output_keys, yaml_bool, yaml_field, yaml_fields, yaml_string, Processor,
    FIELDS_NAME, FIELD_NAME, IGNORE_MISSING_NAME, METHOD_NAME,
    yaml_bool, yaml_new_field, yaml_new_fields, yaml_string, Processor, ProcessorBuilder,
    ProcessorKind, FIELDS_NAME, FIELD_NAME, IGNORE_MISSING_NAME, METHOD_NAME,
};
use crate::etl::value::{Map, Value};
use crate::etl::value::Value;

pub(crate) const PROCESSOR_LETTER: &str = "letter";

@@ -54,29 +54,61 @@ impl std::str::FromStr for Method {
|
||||
}
|
||||
}
|
||||
|
||||
/// only support string value
|
||||
#[derive(Debug, Default)]
|
||||
pub struct LetterProcessor {
|
||||
pub struct LetterProcessorBuilder {
|
||||
fields: Fields,
|
||||
method: Method,
|
||||
ignore_missing: bool,
|
||||
}
|
||||
|
||||
impl ProcessorBuilder for LetterProcessorBuilder {
|
||||
fn output_keys(&self) -> HashSet<&str> {
|
||||
self.fields
|
||||
.iter()
|
||||
.map(|f| f.target_or_input_field())
|
||||
.collect()
|
||||
}
|
||||
|
||||
fn input_keys(&self) -> HashSet<&str> {
|
||||
self.fields.iter().map(|f| f.input_field()).collect()
|
||||
}
|
||||
|
||||
fn build(self, intermediate_keys: &[String]) -> Result<ProcessorKind, String> {
|
||||
self.build(intermediate_keys).map(ProcessorKind::Letter)
|
||||
}
|
||||
}
|
||||
|
||||
impl LetterProcessorBuilder {
|
||||
pub fn build(self, intermediate_keys: &[String]) -> Result<LetterProcessor, String> {
|
||||
let mut real_fields = vec![];
|
||||
for field in self.fields.into_iter() {
|
||||
let input = OneInputOneOutputField::build(
|
||||
"letter",
|
||||
intermediate_keys,
|
||||
field.input_field(),
|
||||
field.target_or_input_field(),
|
||||
)?;
|
||||
real_fields.push(input);
|
||||
}
|
||||
|
||||
Ok(LetterProcessor {
|
||||
fields: real_fields,
|
||||
method: self.method,
|
||||
ignore_missing: self.ignore_missing,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
/// only support string value
|
||||
#[derive(Debug, Default)]
|
||||
pub struct LetterProcessor {
|
||||
fields: Vec<OneInputOneOutputField>,
|
||||
method: Method,
|
||||
ignore_missing: bool,
|
||||
}
|
||||
|
||||
impl LetterProcessor {
|
||||
fn with_fields(&mut self, mut fields: Fields) {
|
||||
update_one_one_output_keys(&mut fields);
|
||||
self.fields = fields;
|
||||
}
|
||||
|
||||
fn with_method(&mut self, method: Method) {
|
||||
self.method = method;
|
||||
}
|
||||
|
||||
fn with_ignore_missing(&mut self, ignore_missing: bool) {
|
||||
self.ignore_missing = ignore_missing;
|
||||
}
|
||||
|
||||
fn process_field(&self, val: &str, field: &Field) -> Result<Map, String> {
|
||||
fn process_field(&self, val: &str) -> Result<Value, String> {
|
||||
let processed = match self.method {
|
||||
Method::Upper => val.to_uppercase(),
|
||||
Method::Lower => val.to_lowercase(),
|
||||
@@ -84,17 +116,17 @@ impl LetterProcessor {
|
||||
};
|
||||
let val = Value::String(processed);
|
||||
|
||||
let key = field.get_target_field();
|
||||
|
||||
Ok(Map::one(key, val))
|
||||
Ok(val)
|
||||
}
|
||||
}
|
||||
|
||||
impl TryFrom<&yaml_rust::yaml::Hash> for LetterProcessor {
|
||||
impl TryFrom<&yaml_rust::yaml::Hash> for LetterProcessorBuilder {
|
||||
type Error = String;
|
||||
|
||||
fn try_from(value: &yaml_rust::yaml::Hash) -> Result<Self, Self::Error> {
|
||||
let mut processor = LetterProcessor::default();
|
||||
let mut fields = Fields::default();
|
||||
let mut method = Method::Lower;
|
||||
let mut ignore_missing = false;
|
||||
|
||||
for (k, v) in value.iter() {
|
||||
let key = k
|
||||
@@ -102,23 +134,26 @@ impl TryFrom<&yaml_rust::yaml::Hash> for LetterProcessor {
|
||||
.ok_or(format!("key must be a string, but got {k:?}"))?;
|
||||
match key {
|
||||
FIELD_NAME => {
|
||||
processor.with_fields(Fields::one(yaml_field(v, FIELD_NAME)?));
|
||||
fields = Fields::one(yaml_new_field(v, FIELD_NAME)?);
|
||||
}
|
||||
FIELDS_NAME => {
|
||||
processor.with_fields(yaml_fields(v, FIELDS_NAME)?);
|
||||
fields = yaml_new_fields(v, FIELDS_NAME)?;
|
||||
}
|
||||
METHOD_NAME => {
|
||||
let method = yaml_string(v, METHOD_NAME)?;
|
||||
processor.with_method(method.parse()?);
|
||||
method = yaml_string(v, METHOD_NAME)?.parse()?;
|
||||
}
|
||||
IGNORE_MISSING_NAME => {
|
||||
processor.with_ignore_missing(yaml_bool(v, IGNORE_MISSING_NAME)?);
|
||||
ignore_missing = yaml_bool(v, IGNORE_MISSING_NAME)?;
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(processor)
|
||||
Ok(LetterProcessorBuilder {
|
||||
fields,
|
||||
method,
|
||||
ignore_missing,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
@@ -131,53 +166,21 @@ impl Processor for LetterProcessor {
|
||||
self.ignore_missing
|
||||
}
|
||||
|
||||
fn fields(&self) -> &Fields {
|
||||
&self.fields
|
||||
}
|
||||
|
||||
fn fields_mut(&mut self) -> &mut Fields {
|
||||
&mut self.fields
|
||||
}
|
||||
|
||||
fn output_keys(&self) -> HashSet<String> {
|
||||
self.fields
|
||||
.iter()
|
||||
.map(|f| f.get_target_field().to_string())
|
||||
.collect()
|
||||
}
|
||||
|
||||
fn exec_field(&self, val: &Value, field: &Field) -> Result<Map, String> {
|
||||
match val {
|
||||
Value::String(val) => self.process_field(val, field),
|
||||
_ => Err(format!(
|
||||
"{} processor: expect string value, but got {val:?}",
|
||||
self.kind()
|
||||
)),
|
||||
}
|
||||
}
|
||||
|
||||
fn exec_mut(&self, val: &mut Vec<Value>) -> Result<(), String> {
|
||||
for field in self.fields.iter() {
|
||||
let index = field.input_field.index;
|
||||
let index = field.input_index();
|
||||
match val.get(index) {
|
||||
Some(Value::String(s)) => {
|
||||
// TODO(qtang): Let this method use the intermediate state collection directly.
|
||||
let mut processed = self.process_field(s, field)?;
|
||||
field
|
||||
.output_fields_index_mapping
|
||||
.iter()
|
||||
.for_each(|(k, output_index)| {
|
||||
if let Some(v) = processed.remove(k) {
|
||||
val[*output_index] = v;
|
||||
}
|
||||
});
|
||||
let result = self.process_field(s)?;
|
||||
let (_, output_index) = field.output();
|
||||
val[*output_index] = result;
|
||||
}
|
||||
Some(Value::Null) | None => {
|
||||
if !self.ignore_missing {
|
||||
return Err(format!(
|
||||
"{} processor: missing field: {}",
|
||||
self.kind(),
|
||||
field.get_field_name()
|
||||
&field.input().name
|
||||
));
|
||||
}
|
||||
}
|
||||
@@ -204,33 +207,36 @@ fn capitalize(s: &str) -> String {

#[cfg(test)]
mod tests {
use crate::etl::field::Fields;
use crate::etl::processor::letter::{LetterProcessor, Method};
use crate::etl::value::{Map, Value};
use crate::etl::value::Value;

#[test]
fn test_process() {
let field = "letter";
let ff: crate::etl::processor::Field = field.parse().unwrap();
let mut processor = LetterProcessor::default();
processor.with_fields(Fields::one(ff.clone()));

{
processor.with_method(Method::Upper);
let processed = processor.process_field("pipeline", &ff).unwrap();
assert_eq!(Map::one(field, Value::String("PIPELINE".into())), processed)
let processor = LetterProcessor {
method: Method::Upper,
..Default::default()
};
let processed = processor.process_field("pipeline").unwrap();
assert_eq!(Value::String("PIPELINE".into()), processed)
}

{
processor.with_method(Method::Lower);
let processed = processor.process_field("Pipeline", &ff).unwrap();
assert_eq!(Map::one(field, Value::String("pipeline".into())), processed)
let processor = LetterProcessor {
method: Method::Lower,
..Default::default()
};
let processed = processor.process_field("Pipeline").unwrap();
assert_eq!(Value::String("pipeline".into()), processed)
}

{
processor.with_method(Method::Capital);
let processed = processor.process_field("pipeline", &ff).unwrap();
assert_eq!(Map::one(field, Value::String("Pipeline".into())), processed)
let processor = LetterProcessor {
method: Method::Capital,
..Default::default()
};
let processed = processor.process_field("pipeline").unwrap();
assert_eq!(Value::String("Pipeline".into()), processed)
}
}
}

@@ -18,16 +18,17 @@ const PATTERNS_NAME: &str = "patterns";
|
||||
|
||||
pub(crate) const PROCESSOR_REGEX: &str = "regex";
|
||||
|
||||
use ahash::HashSet;
|
||||
use ahash::{HashSet, HashSetExt};
|
||||
use lazy_static::lazy_static;
|
||||
use regex::Regex;
|
||||
|
||||
use crate::etl::field::Fields;
|
||||
use crate::etl::field::{Fields, InputFieldInfo, OneInputMultiOutputField};
|
||||
use crate::etl::find_key_index;
|
||||
use crate::etl::processor::{
|
||||
yaml_bool, yaml_field, yaml_fields, yaml_string, yaml_strings, Field, Processor, FIELDS_NAME,
|
||||
FIELD_NAME, IGNORE_MISSING_NAME, PATTERN_NAME,
|
||||
yaml_bool, yaml_new_field, yaml_new_fields, yaml_string, yaml_strings, Processor,
|
||||
ProcessorBuilder, ProcessorKind, FIELDS_NAME, FIELD_NAME, IGNORE_MISSING_NAME, PATTERN_NAME,
|
||||
};
|
||||
use crate::etl::value::{Map, Value};
|
||||
use crate::etl::value::Value;
|
||||
|
||||
lazy_static! {
|
||||
static ref GROUPS_NAME_REGEX: Regex = Regex::new(r"\(\?P?<([[:word:]]+)>.+?\)").unwrap();
|
||||
@@ -40,6 +41,10 @@ fn get_regex_group_names(s: &str) -> Vec<String> {
.collect()
}

fn generate_key(prefix: &str, group: &str) -> String {
format!("{prefix}_{group}")
}

#[derive(Debug)]
|
||||
struct GroupRegex {
|
||||
origin: String,
|
||||
@@ -72,34 +77,29 @@ impl std::str::FromStr for GroupRegex {
|
||||
}
|
||||
}
|
||||
|
||||
/// only support string value
|
||||
/// if no value found from a pattern, the target_field will be ignored
|
||||
#[derive(Debug, Default)]
|
||||
pub struct RegexProcessor {
|
||||
pub struct RegexProcessorBuilder {
|
||||
fields: Fields,
|
||||
patterns: Vec<GroupRegex>,
|
||||
ignore_missing: bool,
|
||||
output_keys: HashSet<String>,
|
||||
}
|
||||
|
||||
impl RegexProcessor {
|
||||
fn with_fields(&mut self, fields: Fields) {
|
||||
self.fields = fields;
|
||||
impl ProcessorBuilder for RegexProcessorBuilder {
|
||||
fn output_keys(&self) -> HashSet<&str> {
|
||||
self.output_keys.iter().map(|k| k.as_str()).collect()
|
||||
}
|
||||
|
||||
fn try_with_patterns(&mut self, patterns: Vec<String>) -> Result<(), String> {
|
||||
let mut rs = vec![];
|
||||
for pattern in patterns {
|
||||
let gr = pattern.parse()?;
|
||||
rs.push(gr);
|
||||
}
|
||||
self.patterns = rs;
|
||||
Ok(())
|
||||
fn input_keys(&self) -> HashSet<&str> {
|
||||
self.fields.iter().map(|f| f.input_field()).collect()
|
||||
}
|
||||
|
||||
fn with_ignore_missing(&mut self, ignore_missing: bool) {
|
||||
self.ignore_missing = ignore_missing;
|
||||
fn build(self, intermediate_keys: &[String]) -> Result<ProcessorKind, String> {
|
||||
self.build(intermediate_keys).map(ProcessorKind::Regex)
|
||||
}
|
||||
}
|
||||
|
||||
impl RegexProcessorBuilder {
|
||||
fn check(self) -> Result<Self, String> {
|
||||
if self.fields.is_empty() {
|
||||
return Err(format!(
|
||||
@@ -118,47 +118,78 @@ impl RegexProcessor {
|
||||
Ok(self)
|
||||
}
|
||||
|
||||
fn generate_key(prefix: &str, group: &str) -> String {
|
||||
format!("{prefix}_{group}")
|
||||
fn build_group_output_info(
|
||||
group_regex: &GroupRegex,
|
||||
om_field: &OneInputMultiOutputField,
|
||||
intermediate_keys: &[String],
|
||||
) -> Result<Vec<OutPutInfo>, String> {
|
||||
group_regex
|
||||
.groups
|
||||
.iter()
|
||||
.map(|g| {
|
||||
let key = generate_key(om_field.target_prefix(), g);
|
||||
let index = find_key_index(intermediate_keys, &key, "regex");
|
||||
index.map(|index| OutPutInfo {
|
||||
final_key: key,
|
||||
group_name: g.to_string(),
|
||||
index,
|
||||
})
|
||||
})
|
||||
.collect::<Result<Vec<_>, String>>()
|
||||
}
|
||||
|
||||
fn process_field(&self, val: &str, field: &Field, gr: &GroupRegex) -> Result<Map, String> {
|
||||
let mut map = Map::default();
|
||||
|
||||
if let Some(captures) = gr.regex.captures(val) {
|
||||
for group in &gr.groups {
|
||||
if let Some(capture) = captures.name(group) {
|
||||
let value = capture.as_str().to_string();
|
||||
let prefix = field.get_target_field();
|
||||
|
||||
let key = Self::generate_key(prefix, group);
|
||||
|
||||
map.insert(key, Value::String(value));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(map)
|
||||
fn build_group_output_infos(
|
||||
patterns: &[GroupRegex],
|
||||
om_field: &OneInputMultiOutputField,
|
||||
intermediate_keys: &[String],
|
||||
) -> Result<Vec<Vec<OutPutInfo>>, String> {
|
||||
patterns
|
||||
.iter()
|
||||
.map(|group_regex| {
|
||||
Self::build_group_output_info(group_regex, om_field, intermediate_keys)
|
||||
})
|
||||
.collect::<Result<Vec<_>, String>>()
|
||||
}
|
||||
|
||||
fn update_output_keys(&mut self) {
|
||||
for field in self.fields.iter_mut() {
|
||||
for gr in &self.patterns {
|
||||
for group in &gr.groups {
|
||||
field
|
||||
.output_fields_index_mapping
|
||||
.insert(Self::generate_key(field.get_target_field(), group), 0_usize);
|
||||
}
|
||||
}
|
||||
fn build_output_info(
|
||||
real_fields: &[OneInputMultiOutputField],
|
||||
patterns: &[GroupRegex],
|
||||
intermediate_keys: &[String],
|
||||
) -> Result<RegexProcessorOutputInfo, String> {
|
||||
let inner = real_fields
|
||||
.iter()
|
||||
.map(|om_field| Self::build_group_output_infos(patterns, om_field, intermediate_keys))
|
||||
.collect::<Result<Vec<_>, String>>();
|
||||
inner.map(|inner| RegexProcessorOutputInfo { inner })
|
||||
}
|
||||
|
||||
fn build(self, intermediate_keys: &[String]) -> Result<RegexProcessor, String> {
|
||||
let mut real_fields = vec![];
|
||||
for field in self.fields.into_iter() {
|
||||
let input_index = find_key_index(intermediate_keys, field.input_field(), "regex")?;
|
||||
let input_field_info = InputFieldInfo::new(field.input_field(), input_index);
|
||||
|
||||
let input = OneInputMultiOutputField::new(input_field_info, field.target_field);
|
||||
real_fields.push(input);
|
||||
}
|
||||
let output_info = Self::build_output_info(&real_fields, &self.patterns, intermediate_keys)?;
|
||||
Ok(RegexProcessor {
|
||||
// fields: Fields::one(Field::new("test".to_string())),
|
||||
fields: real_fields,
|
||||
patterns: self.patterns,
|
||||
output_info,
|
||||
ignore_missing: self.ignore_missing,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
impl TryFrom<&yaml_rust::yaml::Hash> for RegexProcessor {
|
||||
impl TryFrom<&yaml_rust::yaml::Hash> for RegexProcessorBuilder {
|
||||
type Error = String;
|
||||
|
||||
fn try_from(value: &yaml_rust::yaml::Hash) -> Result<Self, Self::Error> {
|
||||
let mut processor = RegexProcessor::default();
|
||||
let mut fields = Fields::default();
|
||||
let mut patterns: Vec<GroupRegex> = vec![];
|
||||
let mut ignore_missing = false;
|
||||
|
||||
for (k, v) in value.iter() {
|
||||
let key = k
|
||||
@@ -166,28 +197,113 @@ impl TryFrom<&yaml_rust::yaml::Hash> for RegexProcessor {
|
||||
.ok_or(format!("key must be a string, but got {k:?}"))?;
|
||||
match key {
|
||||
FIELD_NAME => {
|
||||
processor.with_fields(Fields::one(yaml_field(v, FIELD_NAME)?));
|
||||
fields = Fields::one(yaml_new_field(v, FIELD_NAME)?);
|
||||
}
|
||||
FIELDS_NAME => {
|
||||
processor.with_fields(yaml_fields(v, FIELDS_NAME)?);
|
||||
fields = yaml_new_fields(v, FIELDS_NAME)?;
|
||||
}
|
||||
PATTERN_NAME => {
|
||||
processor.try_with_patterns(vec![yaml_string(v, PATTERN_NAME)?])?;
|
||||
let pattern = yaml_string(v, PATTERN_NAME)?;
|
||||
let gr = pattern.parse()?;
|
||||
patterns.push(gr);
|
||||
}
|
||||
PATTERNS_NAME => {
|
||||
processor.try_with_patterns(yaml_strings(v, PATTERNS_NAME)?)?;
|
||||
for pattern in yaml_strings(v, PATTERNS_NAME)? {
|
||||
let gr = pattern.parse()?;
|
||||
patterns.push(gr);
|
||||
}
|
||||
}
|
||||
IGNORE_MISSING_NAME => {
|
||||
processor.with_ignore_missing(yaml_bool(v, IGNORE_MISSING_NAME)?);
|
||||
ignore_missing = yaml_bool(v, IGNORE_MISSING_NAME)?;
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
}
|
||||
|
||||
processor.check().map(|mut p| {
|
||||
p.update_output_keys();
|
||||
p
|
||||
})
|
||||
let pattern_output_keys = patterns
|
||||
.iter()
|
||||
.flat_map(|pattern| pattern.groups.iter())
|
||||
.collect::<Vec<_>>();
|
||||
let mut output_keys = HashSet::new();
|
||||
for field in fields.iter() {
|
||||
for x in pattern_output_keys.iter() {
|
||||
output_keys.insert(generate_key(field.target_or_input_field(), x));
|
||||
}
|
||||
}
|
||||
|
||||
let processor_builder = RegexProcessorBuilder {
|
||||
fields,
|
||||
patterns,
|
||||
ignore_missing,
|
||||
output_keys,
|
||||
};
|
||||
|
||||
processor_builder.check()
|
||||
}
|
||||
}

#[derive(Debug, Default)]
struct OutPutInfo {
final_key: String,
group_name: String,
index: usize,
}

#[derive(Debug, Default)]
struct RegexProcessorOutputInfo {
pub inner: Vec<Vec<Vec<OutPutInfo>>>,
}

impl RegexProcessorOutputInfo {
fn get_output_index(
&self,
field_index: usize,
pattern_index: usize,
group_index: usize,
) -> usize {
self.inner[field_index][pattern_index][group_index].index
}
}

/// only support string value
|
||||
/// if no value found from a pattern, the target_field will be ignored
|
||||
#[derive(Debug, Default)]
|
||||
pub struct RegexProcessor {
|
||||
fields: Vec<OneInputMultiOutputField>,
|
||||
output_info: RegexProcessorOutputInfo,
|
||||
patterns: Vec<GroupRegex>,
|
||||
ignore_missing: bool,
|
||||
}
|
||||
|
||||
impl RegexProcessor {
|
||||
fn try_with_patterns(&mut self, patterns: Vec<String>) -> Result<(), String> {
|
||||
let mut rs = vec![];
|
||||
for pattern in patterns {
|
||||
let gr = pattern.parse()?;
|
||||
rs.push(gr);
|
||||
}
|
||||
self.patterns = rs;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn process(
|
||||
&self,
|
||||
val: &str,
|
||||
gr: &GroupRegex,
|
||||
index: (usize, usize),
|
||||
) -> Result<Vec<(usize, Value)>, String> {
|
||||
let mut result = Vec::new();
|
||||
if let Some(captures) = gr.regex.captures(val) {
|
||||
for (group_index, group) in gr.groups.iter().enumerate() {
|
||||
if let Some(capture) = captures.name(group) {
|
||||
let value = capture.as_str().to_string();
|
||||
let index = self
|
||||
.output_info
|
||||
.get_output_index(index.0, index.1, group_index);
|
||||
result.push((index, Value::String(value)));
|
||||
}
|
||||
}
|
||||
}
|
||||
Ok(result)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -200,71 +316,40 @@ impl Processor for RegexProcessor {
|
||||
self.ignore_missing
|
||||
}
|
||||
|
||||
fn fields(&self) -> &Fields {
|
||||
&self.fields
|
||||
}
|
||||
|
||||
fn fields_mut(&mut self) -> &mut Fields {
|
||||
&mut self.fields
|
||||
}
|
||||
|
||||
fn output_keys(&self) -> HashSet<String> {
|
||||
self.fields
|
||||
.iter()
|
||||
.flat_map(|f| {
|
||||
self.patterns.iter().flat_map(move |p| {
|
||||
p.groups
|
||||
.iter()
|
||||
.map(move |g| Self::generate_key(&f.input_field.name, g))
|
||||
})
|
||||
})
|
||||
.collect()
|
||||
}
|
||||
|
||||
fn exec_field(&self, val: &Value, field: &Field) -> Result<Map, String> {
|
||||
match val {
|
||||
Value::String(val) => {
|
||||
let mut map = Map::default();
|
||||
for gr in &self.patterns {
|
||||
let m = self.process_field(val, field, gr)?;
|
||||
map.extend(m);
|
||||
}
|
||||
Ok(map)
|
||||
}
|
||||
_ => Err(format!(
|
||||
"{} processor: expect string value, but got {val:?}",
|
||||
self.kind()
|
||||
)),
|
||||
}
|
||||
}
|
||||
|
||||
fn exec_mut(&self, val: &mut Vec<Value>) -> Result<(), String> {
|
||||
for field in self.fields.iter() {
|
||||
let index = field.input_field.index;
|
||||
for (field_index, field) in self.fields.iter().enumerate() {
|
||||
let index = field.input_index();
|
||||
let mut result_list = None;
|
||||
match val.get(index) {
|
||||
Some(Value::String(s)) => {
|
||||
let mut map = Map::default();
|
||||
for gr in &self.patterns {
|
||||
// TODO(qtang): Let this method use the intermediate state collection directly.
|
||||
let m = self.process_field(s, field, gr)?;
|
||||
map.extend(m);
|
||||
}
|
||||
|
||||
field
|
||||
.output_fields_index_mapping
|
||||
.iter()
|
||||
.for_each(|(k, output_index)| {
|
||||
if let Some(v) = map.remove(k) {
|
||||
val[*output_index] = v;
|
||||
// we get rust borrow checker error here
|
||||
// for (gr_index, gr) in self.patterns.iter().enumerate() {
|
||||
// let result_list = self.process(s.as_str(), gr, (field_index, gr_index))?;
|
||||
// for (output_index, result) in result_list {
|
||||
//cannot borrow `*val` as mutable because it is also borrowed as immutable mutable borrow occurs here
|
||||
// val[output_index] = result;
|
||||
// }
|
||||
// }
|
||||
for (gr_index, gr) in self.patterns.iter().enumerate() {
|
||||
let result = self.process(s.as_str(), gr, (field_index, gr_index))?;
|
||||
if !result.is_empty() {
|
||||
match result_list.as_mut() {
|
||||
None => {
|
||||
result_list = Some(result);
|
||||
}
|
||||
Some(result_list) => {
|
||||
result_list.extend(result);
|
||||
}
|
||||
}
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
Some(Value::Null) | None => {
|
||||
if !self.ignore_missing {
|
||||
return Err(format!(
|
||||
"{} processor: missing field: {}",
|
||||
self.kind(),
|
||||
field.get_field_name()
|
||||
field.input_name()
|
||||
));
|
||||
}
|
||||
}
|
||||
@@ -275,6 +360,15 @@ impl Processor for RegexProcessor {
|
||||
));
|
||||
}
|
||||
}
|
||||
// safety here
|
||||
match result_list {
|
||||
None => {}
|
||||
Some(result_list) => {
|
||||
for (output_index, result) in result_list {
|
||||
val[output_index] = result;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
@@ -282,37 +376,42 @@ impl Processor for RegexProcessor {
|
||||
}
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use ahash::{HashMap, HashMapExt};
|
||||
use itertools::Itertools;
|
||||
|
||||
use super::RegexProcessor;
|
||||
use crate::etl::field::Fields;
|
||||
use crate::etl::processor::Processor;
|
||||
use crate::etl::processor::regex::RegexProcessorBuilder;
|
||||
use crate::etl::value::{Map, Value};
|
||||
|
||||
#[test]
|
||||
fn test_simple_parse() {
|
||||
let mut processor = RegexProcessor::default();
|
||||
let pipeline_str = r#"fields: ["a"]
|
||||
patterns: ['(?<ar>\d)']
|
||||
ignore_missing: false"#;
|
||||
|
||||
let processor_yaml = yaml_rust::YamlLoader::load_from_str(pipeline_str)
|
||||
.unwrap()
|
||||
.pop()
|
||||
.unwrap();
|
||||
let processor_yaml_hash = processor_yaml.as_hash().unwrap();
|
||||
let builder = RegexProcessorBuilder::try_from(processor_yaml_hash).unwrap();
|
||||
let intermediate_keys = ["a".to_string(), "a_ar".to_string()];
|
||||
let processor = builder.build(&intermediate_keys).unwrap();
|
||||
|
||||
// single field (with prefix), multiple patterns
|
||||
let f = ["a"].iter().map(|f| f.parse().unwrap()).collect();
|
||||
processor.with_fields(Fields::new(f).unwrap());
|
||||
|
||||
let ar = "(?<ar>\\d)";
|
||||
let result = processor
|
||||
.process("123", &processor.patterns[0], (0, 0))
|
||||
.unwrap()
|
||||
.into_iter()
|
||||
.map(|(k, v)| (intermediate_keys[k].clone(), v))
|
||||
.collect();
|
||||
|
||||
let patterns = [ar].iter().map(|p| p.to_string()).collect();
|
||||
processor.try_with_patterns(patterns).unwrap();
|
||||
|
||||
let mut map = Map::default();
|
||||
map.insert("a", Value::String("123".to_string()));
|
||||
processor.exec_map(&mut map).unwrap();
|
||||
let map = Map { values: result };
|
||||
|
||||
let v = Map {
|
||||
values: vec![
|
||||
("a_ar".to_string(), Value::String("1".to_string())),
|
||||
("a".to_string(), Value::String("123".to_string())),
|
||||
]
|
||||
.into_iter()
|
||||
.collect(),
|
||||
values: vec![("a_ar".to_string(), Value::String("1".to_string()))]
|
||||
.into_iter()
|
||||
.collect(),
|
||||
};
|
||||
|
||||
assert_eq!(v, map);
|
||||
@@ -320,17 +419,14 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
fn test_process() {
|
||||
let mut processor = RegexProcessor::default();
|
||||
|
||||
let cc = "[c=c,n=US_CA_SANJOSE,o=55155]";
|
||||
let cg = "[a=12.34.567.89,b=12345678,c=g,n=US_CA_SANJOSE,o=20940]";
|
||||
let co = "[a=987.654.321.09,c=o]";
|
||||
let cp = "[c=p,n=US_CA_SANJOSE,o=55155]";
|
||||
let cw = "[c=w,n=US_CA_SANJOSE,o=55155]";
|
||||
let breadcrumbs = Value::String([cc, cg, co, cp, cw].iter().join(","));
|
||||
let breadcrumbs_str = [cc, cg, co, cp, cw].iter().join(",");
|
||||
|
||||
let values = [
|
||||
("breadcrumbs", breadcrumbs.clone()),
|
||||
("breadcrumbs_parent", Value::String(cc.to_string())),
|
||||
("breadcrumbs_edge", Value::String(cg.to_string())),
|
||||
("breadcrumbs_origin", Value::String(co.to_string())),
|
||||
@@ -340,61 +436,141 @@ mod tests {
|
||||
.into_iter()
|
||||
.map(|(k, v)| (k.to_string(), v))
|
||||
.collect();
|
||||
let mut temporary_map = Map { values };
|
||||
let temporary_map = Map { values };
|
||||
|
||||
{
|
||||
// single field (with prefix), multiple patterns
|
||||
let ff = ["breadcrumbs, breadcrumbs"]
|
||||
.iter()
|
||||
.map(|f| f.parse().unwrap())
|
||||
.collect();
|
||||
processor.with_fields(Fields::new(ff).unwrap());
|
||||
|
||||
let ccr = "(?<parent>\\[[^\\[]*c=c[^\\]]*\\])";
|
||||
let cgr = "(?<edge>\\[[^\\[]*c=g[^\\]]*\\])";
|
||||
let cor = "(?<origin>\\[[^\\[]*c=o[^\\]]*\\])";
|
||||
let cpr = "(?<peer>\\[[^\\[]*c=p[^\\]]*\\])";
|
||||
let cwr = "(?<wrapper>\\[[^\\[]*c=w[^\\]]*\\])";
|
||||
let patterns = [ccr, cgr, cor, cpr, cwr]
|
||||
.iter()
|
||||
.map(|p| p.to_string())
|
||||
.collect();
|
||||
processor.try_with_patterns(patterns).unwrap();
|
||||
let pipeline_str = r#"fields: ["breadcrumbs"]
|
||||
patterns:
|
||||
- '(?<parent>\[[^\[]*c=c[^\]]*\])'
|
||||
- '(?<edge>\[[^\[]*c=g[^\]]*\])'
|
||||
- '(?<origin>\[[^\[]*c=o[^\]]*\])'
|
||||
- '(?<peer>\[[^\[]*c=p[^\]]*\])'
|
||||
- '(?<wrapper>\[[^\[]*c=w[^\]]*\])'
|
||||
ignore_missing: false"#;
|
||||
|
||||
let mut map = Map::default();
|
||||
map.insert("breadcrumbs", breadcrumbs.clone());
|
||||
processor.exec_map(&mut map).unwrap();
|
||||
|
||||
assert_eq!(map, temporary_map);
|
||||
let processor_yaml = yaml_rust::YamlLoader::load_from_str(pipeline_str)
|
||||
.unwrap()
|
||||
.pop()
|
||||
.unwrap();
|
||||
let processor_yaml_hash = processor_yaml.as_hash().unwrap();
|
||||
let builder = RegexProcessorBuilder::try_from(processor_yaml_hash).unwrap();
|
||||
let intermediate_keys = [
|
||||
"breadcrumbs",
|
||||
"breadcrumbs_parent",
|
||||
"breadcrumbs_edge",
|
||||
"breadcrumbs_origin",
|
||||
"breadcrumbs_peer",
|
||||
"breadcrumbs_wrapper",
|
||||
]
|
||||
.iter()
|
||||
.map(|k| k.to_string())
|
||||
.collect_vec();
|
||||
let processor = builder.build(&intermediate_keys).unwrap();
|
||||
let mut result = HashMap::new();
|
||||
for (index, pattern) in processor.patterns.iter().enumerate() {
|
||||
let r = processor
|
||||
.process(&breadcrumbs_str, pattern, (0, index))
|
||||
.unwrap()
|
||||
.into_iter()
|
||||
.map(|(k, v)| (intermediate_keys[k].clone(), v))
|
||||
.collect::<HashMap<_, _>>();
|
||||
result.extend(r);
|
||||
}
|
||||
let map = Map { values: result };
|
||||
assert_eq!(temporary_map, map);
|
||||
}
|
||||
|
||||
{
|
||||
// multiple fields (with prefix), multiple patterns
|
||||
let ff = [
|
||||
"breadcrumbs_parent, parent",
|
||||
"breadcrumbs_edge, edge",
|
||||
"breadcrumbs_origin, origin",
|
||||
"breadcrumbs_peer, peer",
|
||||
"breadcrumbs_wrapper, wrapper",
|
||||
]
|
||||
.iter()
|
||||
.map(|f| f.parse().unwrap())
|
||||
.collect();
|
||||
processor.with_fields(Fields::new(ff).unwrap());
|
||||
|
||||
let patterns = [
|
||||
"a=(?<ip>[^,\\]]+)",
|
||||
"b=(?<request_id>[^,\\]]+)",
|
||||
"k=(?<request_end_time>[^,\\]]+)",
|
||||
"l=(?<turn_around_time>[^,\\]]+)",
|
||||
"m=(?<dns_lookup_time>[^,\\]]+)",
|
||||
"n=(?<geo>[^,\\]]+)",
|
||||
"o=(?<asn>[^,\\]]+)",
|
||||
let pipeline_str = r#"fields:
|
||||
- breadcrumbs_parent, parent
|
||||
- breadcrumbs_edge, edge
|
||||
- breadcrumbs_origin, origin
|
||||
- breadcrumbs_peer, peer
|
||||
- breadcrumbs_wrapper, wrapper
|
||||
patterns:
|
||||
- 'a=(?<ip>[^,\]]+)'
|
||||
- 'b=(?<request_id>[^,\]]+)'
|
||||
- 'k=(?<request_end_time>[^,\]]+)'
|
||||
- 'l=(?<turn_around_time>[^,\]]+)'
|
||||
- 'm=(?<dns_lookup_time>[^,\]]+)'
|
||||
- 'n=(?<geo>[^,\]]+)'
|
||||
- 'o=(?<asn>[^,\]]+)'
|
||||
ignore_missing: false"#;
|
||||
|
||||
let processor_yaml = yaml_rust::YamlLoader::load_from_str(pipeline_str)
|
||||
.unwrap()
|
||||
.pop()
|
||||
.unwrap();
|
||||
let processor_yaml_hash = processor_yaml.as_hash().unwrap();
|
||||
let builder = RegexProcessorBuilder::try_from(processor_yaml_hash).unwrap();
|
||||
|
||||
let intermediate_keys = [
|
||||
"breadcrumbs_parent",
|
||||
"breadcrumbs_edge",
|
||||
"breadcrumbs_origin",
|
||||
"breadcrumbs_peer",
|
||||
"breadcrumbs_wrapper",
|
||||
"edge_ip",
|
||||
"edge_request_id",
|
||||
"edge_request_end_time",
|
||||
"edge_turn_around_time",
|
||||
"edge_dns_lookup_time",
|
||||
"edge_geo",
|
||||
"edge_asn",
|
||||
"origin_ip",
|
||||
"origin_request_id",
|
||||
"origin_request_end_time",
|
||||
"origin_turn_around_time",
|
||||
"origin_dns_lookup_time",
|
||||
"origin_geo",
|
||||
"origin_asn",
|
||||
"peer_ip",
|
||||
"peer_request_id",
|
||||
"peer_request_end_time",
|
||||
"peer_turn_around_time",
|
||||
"peer_dns_lookup_time",
|
||||
"peer_geo",
|
||||
"peer_asn",
|
||||
"parent_ip",
|
||||
"parent_request_id",
|
||||
"parent_request_end_time",
|
||||
"parent_turn_around_time",
|
||||
"parent_dns_lookup_time",
|
||||
"parent_geo",
|
||||
"parent_asn",
|
||||
"wrapper_ip",
|
||||
"wrapper_request_id",
|
||||
"wrapper_request_end_time",
|
||||
"wrapper_turn_around_time",
|
||||
"wrapper_dns_lookup_time",
|
||||
"wrapper_geo",
|
||||
"wrapper_asn",
|
||||
]
|
||||
.iter()
|
||||
.map(|p| p.to_string())
|
||||
.collect();
|
||||
processor.try_with_patterns(patterns).unwrap();
|
||||
.map(|k| k.to_string())
|
||||
.collect_vec();
|
||||
let processor = builder.build(&intermediate_keys).unwrap();
|
||||
|
||||
let mut result = HashMap::new();
|
||||
for (field_index, field) in processor.fields.iter().enumerate() {
|
||||
for (pattern_index, pattern) in processor.patterns.iter().enumerate() {
|
||||
let s = temporary_map
|
||||
.get(field.input_name())
|
||||
.unwrap()
|
||||
.to_str_value();
|
||||
let r = processor
|
||||
.process(&s, pattern, (field_index, pattern_index))
|
||||
.unwrap()
|
||||
.into_iter()
|
||||
.map(|(k, v)| (intermediate_keys[k].clone(), v))
|
||||
.collect::<HashMap<_, _>>();
|
||||
result.extend(r);
|
||||
}
|
||||
}
|
||||
|
||||
let new_values = vec![
|
||||
("edge_ip", Value::String("12.34.567.89".to_string())),
|
||||
@@ -413,11 +589,7 @@ mod tests {
|
||||
.map(|(k, v)| (k.to_string(), v))
|
||||
.collect();
|
||||
|
||||
let mut expected_map = temporary_map.clone();
|
||||
processor.exec_map(&mut temporary_map).unwrap();
|
||||
expected_map.extend(Map { values: new_values });
|
||||
|
||||
assert_eq!(expected_map, temporary_map);
|
||||
assert_eq!(result, new_values);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -19,18 +19,17 @@ use chrono::{DateTime, NaiveDateTime};
|
||||
use chrono_tz::Tz;
|
||||
use lazy_static::lazy_static;
|
||||
|
||||
use super::yaml_strings;
|
||||
use crate::etl::field::{Field, Fields};
|
||||
use crate::etl::field::{Fields, OneInputOneOutputField};
|
||||
use crate::etl::processor::{
|
||||
update_one_one_output_keys, yaml_bool, yaml_field, yaml_fields, yaml_string, Processor,
|
||||
FIELDS_NAME, FIELD_NAME, IGNORE_MISSING_NAME,
|
||||
yaml_bool, yaml_new_field, yaml_new_fields, yaml_string, yaml_strings, Processor,
|
||||
ProcessorBuilder, ProcessorKind, FIELDS_NAME, FIELD_NAME, IGNORE_MISSING_NAME,
|
||||
};
|
||||
use crate::etl::value::time::{
|
||||
MICROSECOND_RESOLUTION, MICRO_RESOLUTION, MILLISECOND_RESOLUTION, MILLI_RESOLUTION,
|
||||
MS_RESOLUTION, NANOSECOND_RESOLUTION, NANO_RESOLUTION, NS_RESOLUTION, SECOND_RESOLUTION,
|
||||
SEC_RESOLUTION, S_RESOLUTION, US_RESOLUTION,
|
||||
};
|
||||
use crate::etl::value::{Map, Timestamp, Value};
|
||||
use crate::etl::value::{Timestamp, Value};
|
||||
|
||||
pub(crate) const PROCESSOR_TIMESTAMP: &str = "timestamp";
|
||||
const RESOLUTION_NAME: &str = "resolution";
|
||||
@@ -108,10 +107,56 @@ impl std::ops::Deref for Formats {
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct TimestampProcessorBuilder {
|
||||
fields: Fields,
|
||||
formats: Formats,
|
||||
resolution: Resolution,
|
||||
ignore_missing: bool,
|
||||
}
|
||||
|
||||
impl ProcessorBuilder for TimestampProcessorBuilder {
|
||||
fn output_keys(&self) -> HashSet<&str> {
|
||||
self.fields
|
||||
.iter()
|
||||
.map(|f| f.target_or_input_field())
|
||||
.collect()
|
||||
}
|
||||
|
||||
fn input_keys(&self) -> HashSet<&str> {
|
||||
self.fields.iter().map(|f| f.input_field()).collect()
|
||||
}
|
||||
|
||||
fn build(self, intermediate_keys: &[String]) -> Result<ProcessorKind, String> {
|
||||
self.build(intermediate_keys).map(ProcessorKind::Timestamp)
|
||||
}
|
||||
}
|
||||
|
||||
impl TimestampProcessorBuilder {
|
||||
pub fn build(self, intermediate_keys: &[String]) -> Result<TimestampProcessor, String> {
|
||||
let mut real_fields = vec![];
|
||||
for field in self.fields.into_iter() {
|
||||
let input = OneInputOneOutputField::build(
|
||||
"timestamp",
|
||||
intermediate_keys,
|
||||
field.input_field(),
|
||||
field.target_or_input_field(),
|
||||
)?;
|
||||
real_fields.push(input);
|
||||
}
|
||||
Ok(TimestampProcessor {
|
||||
fields: real_fields,
|
||||
formats: self.formats,
|
||||
resolution: self.resolution,
|
||||
ignore_missing: self.ignore_missing,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
/// support string, integer, float, time, epoch
|
||||
#[derive(Debug, Default)]
|
||||
pub struct TimestampProcessor {
|
||||
fields: Fields,
|
||||
fields: Vec<OneInputOneOutputField>,
|
||||
formats: Formats,
|
||||
resolution: Resolution,
|
||||
ignore_missing: bool,
|
||||
@@ -123,29 +168,6 @@ pub struct TimestampProcessor {
|
||||
}
|
||||
|
||||
impl TimestampProcessor {
|
||||
fn with_fields(&mut self, mut fields: Fields) {
|
||||
update_one_one_output_keys(&mut fields);
|
||||
self.fields = fields
|
||||
}
|
||||
|
||||
fn with_resolution(&mut self, resolution: Resolution) {
|
||||
self.resolution = resolution;
|
||||
}
|
||||
|
||||
fn with_formats(&mut self, v: Option<Vec<(Arc<String>, Tz)>>) {
|
||||
let v = match v {
|
||||
Some(v) if !v.is_empty() => v,
|
||||
_ => DEFAULT_FORMATS.clone(),
|
||||
};
|
||||
|
||||
let formats = Formats::new(v);
|
||||
self.formats = formats;
|
||||
}
|
||||
|
||||
fn with_ignore_missing(&mut self, ignore_missing: bool) {
|
||||
self.ignore_missing = ignore_missing;
|
||||
}
|
||||
|
||||
/// try to parse val with timezone first, if failed, parse without timezone
|
||||
fn try_parse(val: &str, fmt: &str, tz: Tz) -> Result<i64, String> {
|
||||
if let Ok(dt) = DateTime::parse_from_str(val, fmt) {
|
||||
@@ -212,12 +234,6 @@ impl TimestampProcessor {
|
||||
Resolution::Nano => Ok(Timestamp::Nanosecond(t)),
|
||||
}
|
||||
}
|
||||
|
||||
fn process_field(&self, val: &Value, field: &Field) -> Result<Map, String> {
|
||||
let key = field.get_target_field();
|
||||
|
||||
Ok(Map::one(key, Value::Timestamp(self.parse(val)?)))
|
||||
}
|
||||
}
|
||||
|
||||
fn parse_formats(yaml: &yaml_rust::yaml::Yaml) -> Result<Vec<(Arc<String>, Tz)>, String> {
|
||||
@@ -250,11 +266,14 @@ fn parse_formats(yaml: &yaml_rust::yaml::Yaml) -> Result<Vec<(Arc<String>, Tz)>,
|
||||
};
|
||||
}
|
||||
|
||||
impl TryFrom<&yaml_rust::yaml::Hash> for TimestampProcessor {
|
||||
impl TryFrom<&yaml_rust::yaml::Hash> for TimestampProcessorBuilder {
|
||||
type Error = String;
|
||||
|
||||
fn try_from(hash: &yaml_rust::yaml::Hash) -> Result<Self, Self::Error> {
|
||||
let mut processor = TimestampProcessor::default();
|
||||
let mut fields = Fields::default();
|
||||
let mut formats = Formats::default();
|
||||
let mut resolution = Resolution::default();
|
||||
let mut ignore_missing = false;
|
||||
|
||||
for (k, v) in hash {
|
||||
let key = k
|
||||
@@ -263,28 +282,33 @@ impl TryFrom<&yaml_rust::yaml::Hash> for TimestampProcessor {
|
||||
|
||||
match key {
|
||||
FIELD_NAME => {
|
||||
processor.with_fields(Fields::one(yaml_field(v, FIELD_NAME)?));
|
||||
fields = Fields::one(yaml_new_field(v, FIELD_NAME)?);
|
||||
}
|
||||
FIELDS_NAME => {
|
||||
processor.with_fields(yaml_fields(v, FIELDS_NAME)?);
|
||||
fields = yaml_new_fields(v, FIELDS_NAME)?;
|
||||
}
|
||||
FORMATS_NAME => {
|
||||
let formats = parse_formats(v)?;
|
||||
processor.with_formats(Some(formats));
|
||||
let formats_vec = parse_formats(v)?;
|
||||
formats = Formats::new(formats_vec);
|
||||
}
|
||||
RESOLUTION_NAME => {
|
||||
let s = yaml_string(v, RESOLUTION_NAME)?.as_str().try_into()?;
|
||||
processor.with_resolution(s);
|
||||
resolution = yaml_string(v, RESOLUTION_NAME)?.as_str().try_into()?;
|
||||
}
|
||||
IGNORE_MISSING_NAME => {
|
||||
processor.with_ignore_missing(yaml_bool(v, IGNORE_MISSING_NAME)?);
|
||||
ignore_missing = yaml_bool(v, IGNORE_MISSING_NAME)?;
|
||||
}
|
||||
|
||||
_ => {}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(processor)
|
||||
let processor_builder = TimestampProcessorBuilder {
|
||||
fields,
|
||||
formats,
|
||||
resolution,
|
||||
ignore_missing,
|
||||
};
|
||||
|
||||
Ok(processor_builder)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -297,49 +321,23 @@ impl Processor for TimestampProcessor {
|
||||
self.ignore_missing
|
||||
}
|
||||
|
||||
fn fields(&self) -> &Fields {
|
||||
&self.fields
|
||||
}
|
||||
|
||||
fn fields_mut(&mut self) -> &mut Fields {
|
||||
&mut self.fields
|
||||
}
|
||||
|
||||
fn output_keys(&self) -> HashSet<String> {
|
||||
self.fields
|
||||
.iter()
|
||||
.map(|f| f.get_target_field().to_string())
|
||||
.collect()
|
||||
}
|
||||
|
||||
fn exec_field(&self, val: &Value, field: &Field) -> Result<Map, String> {
|
||||
self.process_field(val, field)
|
||||
}
|
||||
|
||||
fn exec_mut(&self, val: &mut Vec<Value>) -> Result<(), String> {
|
||||
for field in self.fields.iter() {
|
||||
let index = field.input_field.index;
|
||||
let index = field.input().index;
|
||||
match val.get(index) {
|
||||
Some(Value::Null) | None => {
|
||||
if !self.ignore_missing {
|
||||
return Err(format!(
|
||||
"{} processor: missing field: {}",
|
||||
self.kind(),
|
||||
field.get_field_name()
|
||||
&field.input().name
|
||||
));
|
||||
}
|
||||
}
|
||||
Some(v) => {
|
||||
// TODO(qtang): Let this method use the intermediate state collection directly.
|
||||
let mut map = self.process_field(v, field)?;
|
||||
field
|
||||
.output_fields_index_mapping
|
||||
.iter()
|
||||
.for_each(|(k, output_index)| {
|
||||
if let Some(v) = map.remove(k) {
|
||||
val[*output_index] = v;
|
||||
}
|
||||
});
|
||||
let result = self.parse(v)?;
|
||||
let (_, index) = field.output();
|
||||
val[*index] = Value::Timestamp(result);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -351,9 +349,18 @@ impl Processor for TimestampProcessor {
mod tests {
use yaml_rust::YamlLoader;

use super::TimestampProcessor;
use super::{TimestampProcessor, TimestampProcessorBuilder};
use crate::etl::value::{Timestamp, Value};

fn builder_to_native_processor(builder: TimestampProcessorBuilder) -> TimestampProcessor {
TimestampProcessor {
fields: vec![],
formats: builder.formats,
resolution: builder.resolution,
ignore_missing: builder.ignore_missing,
}
}

#[test]
fn test_parse_epoch() {
|
||||
let processor_yaml_str = r#"fields:
|
||||
@@ -367,7 +374,9 @@ formats:
|
||||
"#;
|
||||
let yaml = &YamlLoader::load_from_str(processor_yaml_str).unwrap()[0];
|
||||
let timestamp_yaml = yaml.as_hash().unwrap();
|
||||
let processor = TimestampProcessor::try_from(timestamp_yaml).unwrap();
|
||||
let processor = builder_to_native_processor(
|
||||
TimestampProcessorBuilder::try_from(timestamp_yaml).unwrap(),
|
||||
);
|
||||
|
||||
let values = [
|
||||
(
|
||||
@@ -419,7 +428,9 @@ formats:
|
||||
"#;
|
||||
let yaml = &YamlLoader::load_from_str(processor_yaml_str).unwrap()[0];
|
||||
let timestamp_yaml = yaml.as_hash().unwrap();
|
||||
let processor = TimestampProcessor::try_from(timestamp_yaml).unwrap();
|
||||
let processor = builder_to_native_processor(
|
||||
TimestampProcessorBuilder::try_from(timestamp_yaml).unwrap(),
|
||||
);
|
||||
|
||||
let values: Vec<&str> = vec![
|
||||
"2014-5-17T12:34:56",
|
||||
|
||||
@@ -15,12 +15,12 @@
|
||||
use ahash::HashSet;
|
||||
use urlencoding::{decode, encode};
|
||||
|
||||
use crate::etl::field::{Field, Fields};
|
||||
use crate::etl::field::{Fields, OneInputOneOutputField};
|
||||
use crate::etl::processor::{
|
||||
yaml_bool, yaml_field, yaml_fields, yaml_string, FIELDS_NAME, FIELD_NAME, IGNORE_MISSING_NAME,
|
||||
METHOD_NAME,
|
||||
yaml_bool, yaml_new_field, yaml_new_fields, yaml_string, ProcessorBuilder, ProcessorKind,
|
||||
FIELDS_NAME, FIELD_NAME, IGNORE_MISSING_NAME, METHOD_NAME,
|
||||
};
|
||||
use crate::etl::value::{Map, Value};
|
||||
use crate::etl::value::Value;
|
||||
|
||||
pub(crate) const PROCESSOR_URL_ENCODING: &str = "urlencoding";
|
||||
|
||||
@@ -52,54 +52,76 @@ impl std::str::FromStr for Method {
|
||||
}
|
||||
}
|
||||
|
||||
/// only support string value
|
||||
#[derive(Debug, Default)]
|
||||
pub struct UrlEncodingProcessor {
|
||||
pub struct UrlEncodingProcessorBuilder {
|
||||
fields: Fields,
|
||||
method: Method,
|
||||
ignore_missing: bool,
|
||||
}
|
||||
|
||||
impl ProcessorBuilder for UrlEncodingProcessorBuilder {
|
||||
fn output_keys(&self) -> HashSet<&str> {
|
||||
self.fields
|
||||
.iter()
|
||||
.map(|f| f.target_or_input_field())
|
||||
.collect()
|
||||
}
|
||||
|
||||
fn input_keys(&self) -> HashSet<&str> {
|
||||
self.fields.iter().map(|f| f.input_field()).collect()
|
||||
}
|
||||
|
||||
fn build(self, intermediate_keys: &[String]) -> Result<ProcessorKind, String> {
|
||||
self.build(intermediate_keys)
|
||||
.map(ProcessorKind::UrlEncoding)
|
||||
}
|
||||
}
|
||||
|
||||
impl UrlEncodingProcessorBuilder {
|
||||
fn build(self, intermediate_keys: &[String]) -> Result<UrlEncodingProcessor, String> {
|
||||
let mut real_fields = vec![];
|
||||
for field in self.fields.into_iter() {
|
||||
let input = OneInputOneOutputField::build(
|
||||
"urlencoding",
|
||||
intermediate_keys,
|
||||
field.input_field(),
|
||||
field.target_or_input_field(),
|
||||
)?;
|
||||
real_fields.push(input);
|
||||
}
|
||||
Ok(UrlEncodingProcessor {
|
||||
fields: real_fields,
|
||||
method: self.method,
|
||||
ignore_missing: self.ignore_missing,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
/// only support string value
|
||||
#[derive(Debug, Default)]
|
||||
pub struct UrlEncodingProcessor {
|
||||
fields: Vec<OneInputOneOutputField>,
|
||||
method: Method,
|
||||
ignore_missing: bool,
|
||||
}
|
||||
|
||||
impl UrlEncodingProcessor {
|
||||
fn with_fields(&mut self, mut fields: Fields) {
|
||||
Self::update_output_keys(&mut fields);
|
||||
self.fields = fields;
|
||||
}
|
||||
|
||||
fn with_ignore_missing(&mut self, ignore_missing: bool) {
|
||||
self.ignore_missing = ignore_missing;
|
||||
}
|
||||
|
||||
fn with_method(&mut self, method: Method) {
|
||||
self.method = method;
|
||||
}
|
||||
|
||||
fn process_field(&self, val: &str, field: &Field) -> Result<Map, String> {
|
||||
fn process_field(&self, val: &str) -> Result<Value, String> {
|
||||
let processed = match self.method {
|
||||
Method::Encode => encode(val).to_string(),
|
||||
Method::Decode => decode(val).map_err(|e| e.to_string())?.into_owned(),
|
||||
};
|
||||
let val = Value::String(processed);
|
||||
|
||||
let key = field.get_target_field();
|
||||
|
||||
Ok(Map::one(key, val))
|
||||
}
|
||||
|
||||
fn update_output_keys(fields: &mut Fields) {
|
||||
for field in fields.iter_mut() {
|
||||
field
|
||||
.output_fields_index_mapping
|
||||
.insert(field.get_target_field().to_string(), 0_usize);
|
||||
}
|
||||
Ok(Value::String(processed))
|
||||
}
|
||||
}
|
||||
|
||||
impl TryFrom<&yaml_rust::yaml::Hash> for UrlEncodingProcessor {
|
||||
impl TryFrom<&yaml_rust::yaml::Hash> for UrlEncodingProcessorBuilder {
|
||||
type Error = String;
|
||||
|
||||
fn try_from(value: &yaml_rust::yaml::Hash) -> Result<Self, Self::Error> {
|
||||
let mut processor = UrlEncodingProcessor::default();
|
||||
let mut fields = Fields::default();
|
||||
let mut method = Method::Decode;
|
||||
let mut ignore_missing = false;
|
||||
|
||||
for (k, v) in value.iter() {
|
||||
let key = k
|
||||
@@ -107,24 +129,29 @@ impl TryFrom<&yaml_rust::yaml::Hash> for UrlEncodingProcessor {
|
||||
.ok_or(format!("key must be a string, but got {k:?}"))?;
|
||||
match key {
|
||||
FIELD_NAME => {
|
||||
processor.with_fields(Fields::one(yaml_field(v, FIELD_NAME)?));
|
||||
fields = Fields::one(yaml_new_field(v, FIELD_NAME)?);
|
||||
}
|
||||
FIELDS_NAME => {
|
||||
processor.with_fields(yaml_fields(v, FIELDS_NAME)?);
|
||||
fields = yaml_new_fields(v, FIELDS_NAME)?;
|
||||
}
|
||||
|
||||
IGNORE_MISSING_NAME => {
|
||||
processor.with_ignore_missing(yaml_bool(v, IGNORE_MISSING_NAME)?);
|
||||
ignore_missing = yaml_bool(v, IGNORE_MISSING_NAME)?;
|
||||
}
|
||||
|
||||
METHOD_NAME => {
|
||||
let method = yaml_string(v, METHOD_NAME)?;
|
||||
processor.with_method(method.parse()?);
|
||||
let method_str = yaml_string(v, METHOD_NAME)?;
|
||||
method = method_str.parse()?;
|
||||
}
|
||||
|
||||
_ => {}
|
||||
}
|
||||
}
|
||||
let processor = UrlEncodingProcessorBuilder {
|
||||
fields,
|
||||
method,
|
||||
ignore_missing,
|
||||
};
|
||||
|
||||
Ok(processor)
|
||||
}
|
||||
@@ -139,52 +166,21 @@ impl crate::etl::processor::Processor for UrlEncodingProcessor {
|
||||
self.ignore_missing
|
||||
}
|
||||
|
||||
fn fields(&self) -> &Fields {
|
||||
&self.fields
|
||||
}
|
||||
|
||||
fn fields_mut(&mut self) -> &mut Fields {
|
||||
&mut self.fields
|
||||
}
|
||||
|
||||
fn output_keys(&self) -> HashSet<String> {
|
||||
self.fields
|
||||
.iter()
|
||||
.map(|f| f.get_target_field().to_string())
|
||||
.collect()
|
||||
}
|
||||
|
||||
fn exec_field(&self, val: &Value, field: &Field) -> Result<Map, String> {
|
||||
match val {
|
||||
Value::String(val) => self.process_field(val, field),
|
||||
_ => Err(format!(
|
||||
"{} processor: expect string value, but got {val:?}",
|
||||
self.kind()
|
||||
)),
|
||||
}
|
||||
}
|
||||
|
||||
fn exec_mut(&self, val: &mut Vec<Value>) -> Result<(), String> {
|
||||
for field in self.fields.iter() {
|
||||
let index = field.input_field.index;
|
||||
let index = field.input_index();
|
||||
match val.get(index) {
|
||||
Some(Value::String(s)) => {
|
||||
let mut map = self.process_field(s, field)?;
|
||||
field
|
||||
.output_fields_index_mapping
|
||||
.iter()
|
||||
.for_each(|(k, output_index)| {
|
||||
if let Some(v) = map.remove(k) {
|
||||
val[*output_index] = v;
|
||||
}
|
||||
});
|
||||
let result = self.process_field(s)?;
|
||||
let output_index = field.output_index();
|
||||
val[output_index] = result;
|
||||
}
|
||||
Some(Value::Null) | None => {
|
||||
if !self.ignore_missing {
|
||||
return Err(format!(
|
||||
"{} processor: missing field: {}",
|
||||
self.kind(),
|
||||
field.get_field_name()
|
||||
field.output_name()
|
||||
));
|
||||
}
|
||||
}
|
||||
@@ -202,29 +198,28 @@ impl crate::etl::processor::Processor for UrlEncodingProcessor {

#[cfg(test)]
mod tests {
use crate::etl::field::{Field, Fields};

use crate::etl::processor::urlencoding::UrlEncodingProcessor;
use crate::etl::value::{Map, Value};
use crate::etl::value::Value;

#[test]
fn test_decode_url() {
let field = "url";
let ff: Field = field.parse().unwrap();

let decoded = "//BC/[a=6.7.8.9,c=g,k=0,l=1]";
let encoded = "%2F%2FBC%2F%5Ba%3D6.7.8.9%2Cc%3Dg%2Ck%3D0%2Cl%3D1%5D";

let mut processor = UrlEncodingProcessor::default();
processor.with_fields(Fields::one(ff.clone()));

{
let result = processor.process_field(encoded, &ff).unwrap();
assert_eq!(Map::one(field, Value::String(decoded.into())), result)
let processor = UrlEncodingProcessor::default();
let result = processor.process_field(encoded).unwrap();
assert_eq!(Value::String(decoded.into()), result)
}
{
processor.with_method(super::Method::Encode);
let result = processor.process_field(decoded, &ff).unwrap();
assert_eq!(Map::one(field, Value::String(encoded.into())), result)
let processor = UrlEncodingProcessor {
fields: vec![],
method: super::Method::Encode,
ignore_missing: false,
};
let result = processor.process_field(decoded).unwrap();
assert_eq!(Value::String(encoded.into()), result)
}
}
}

@@ -17,8 +17,8 @@ pub mod transformer;

use itertools::Itertools;

use crate::etl::field::Fields;
use crate::etl::processor::{update_one_one_output_keys, yaml_field, yaml_fields, yaml_string};
use crate::etl::find_key_index;
use crate::etl::processor::yaml_string;
use crate::etl::transform::index::Index;
use crate::etl::value::Value;

@@ -31,6 +31,9 @@ const TRANSFORM_ON_FAILURE: &str = "on_failure";

pub use transformer::greptime::GreptimeTransformer;

use super::field::{Fields, InputFieldInfo, OneInputOneOutputField};
use super::processor::{yaml_new_field, yaml_new_fields};

pub trait Transformer: std::fmt::Display + Sized + Send + Sync + 'static {
|
||||
type Output;
|
||||
type VecOutput;
|
||||
@@ -39,12 +42,11 @@ pub trait Transformer: std::fmt::Display + Sized + Send + Sync + 'static {
|
||||
fn schemas(&self) -> &Vec<greptime_proto::v1::ColumnSchema>;
|
||||
fn transforms(&self) -> &Transforms;
|
||||
fn transforms_mut(&mut self) -> &mut Transforms;
|
||||
fn transform(&self, val: Value) -> Result<Self::Output, String>;
|
||||
fn transform_mut(&self, val: &mut Vec<Value>) -> Result<Self::VecOutput, String>;
|
||||
}
|
||||
|
||||
/// On Failure behavior when transform fails
|
||||
#[derive(Debug, Clone, Default)]
|
||||
#[derive(Debug, Clone, Default, Copy)]
|
||||
pub enum OnFailure {
|
||||
// Return None if transform fails
|
||||
#[default]
|
||||
@@ -74,12 +76,18 @@ impl std::fmt::Display for OnFailure {
|
||||
}
|
||||
}
|
||||
}
|
||||
#[derive(Debug, Default, Clone)]
pub struct TransformBuilders {
pub(crate) builders: Vec<TransformBuilder>,
pub(crate) output_keys: Vec<String>,
pub(crate) required_keys: Vec<String>,
}

#[derive(Debug, Default, Clone)]
pub struct Transforms {
transforms: Vec<Transform>,
output_keys: Vec<String>,
required_keys: Vec<String>,
pub(crate) transforms: Vec<Transform>,
pub(crate) output_keys: Vec<String>,
pub(crate) required_keys: Vec<String>,
}

impl Transforms {
|
||||
@@ -130,7 +138,7 @@ impl std::ops::DerefMut for Transforms {
|
||||
}
|
||||
}
|
||||
|
||||
impl TryFrom<&Vec<yaml_rust::Yaml>> for Transforms {
|
||||
impl TryFrom<&Vec<yaml_rust::Yaml>> for TransformBuilders {
|
||||
type Error = String;
|
||||
|
||||
fn try_from(docs: &Vec<yaml_rust::Yaml>) -> Result<Self, Self::Error> {
|
||||
@@ -138,41 +146,78 @@ impl TryFrom<&Vec<yaml_rust::Yaml>> for Transforms {
|
||||
let mut all_output_keys: Vec<String> = Vec::with_capacity(100);
|
||||
let mut all_required_keys = Vec::with_capacity(100);
|
||||
for doc in docs {
|
||||
let transform: Transform = doc
|
||||
let transform_builder: TransformBuilder = doc
|
||||
.as_hash()
|
||||
.ok_or("transform element must be a map".to_string())?
|
||||
.try_into()?;
|
||||
let mut transform_output_keys = transform
|
||||
let mut transform_output_keys = transform_builder
|
||||
.fields
|
||||
.iter()
|
||||
.map(|f| f.get_target_field().to_string())
|
||||
.map(|f| f.target_or_input_field().to_string())
|
||||
.collect();
|
||||
all_output_keys.append(&mut transform_output_keys);
|
||||
|
||||
let mut transform_required_keys = transform
|
||||
let mut transform_required_keys = transform_builder
|
||||
.fields
|
||||
.iter()
|
||||
.map(|f| f.input_field.name.clone())
|
||||
.map(|f| f.input_field().to_string())
|
||||
.collect();
|
||||
all_required_keys.append(&mut transform_required_keys);
|
||||
|
||||
transforms.push(transform);
|
||||
transforms.push(transform_builder);
|
||||
}
|
||||
|
||||
all_required_keys.sort();
|
||||
|
||||
Ok(Transforms {
|
||||
transforms,
|
||||
Ok(TransformBuilders {
|
||||
builders: transforms,
|
||||
output_keys: all_output_keys,
|
||||
required_keys: all_required_keys,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct TransformBuilder {
|
||||
fields: Fields,
|
||||
type_: Value,
|
||||
default: Option<Value>,
|
||||
index: Option<Index>,
|
||||
on_failure: Option<OnFailure>,
|
||||
}
|
||||
|
||||
impl TransformBuilder {
|
||||
pub fn build(
|
||||
self,
|
||||
intermediate_keys: &[String],
|
||||
output_keys: &[String],
|
||||
) -> Result<Transform, String> {
|
||||
let mut real_fields = vec![];
|
||||
for field in self.fields {
|
||||
let input_index = find_key_index(intermediate_keys, field.input_field(), "transform")?;
|
||||
let input_field_info = InputFieldInfo::new(field.input_field(), input_index);
|
||||
let output_index =
|
||||
find_key_index(output_keys, field.target_or_input_field(), "transform")?;
|
||||
let input = OneInputOneOutputField::new(
|
||||
input_field_info,
|
||||
(field.target_or_input_field().to_string(), output_index),
|
||||
);
|
||||
real_fields.push(input);
|
||||
}
|
||||
Ok(Transform {
|
||||
real_fields,
|
||||
type_: self.type_,
|
||||
default: self.default,
|
||||
index: self.index,
|
||||
on_failure: self.on_failure,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
/// only field is required
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct Transform {
|
||||
pub fields: Fields,
|
||||
pub real_fields: Vec<OneInputOneOutputField>,
|
||||
|
||||
pub type_: Value,
|
||||
|
||||
@@ -192,7 +237,7 @@ impl std::fmt::Display for Transform {
|
||||
};
|
||||
|
||||
let type_ = format!("type: {}", self.type_);
|
||||
let fields = format!("field(s): {}", self.fields);
|
||||
let fields = format!("field(s): {:?}", self.real_fields);
|
||||
let default = if let Some(default) = &self.default {
|
||||
format!(", default: {}", default)
|
||||
} else {
|
||||
@@ -212,7 +257,7 @@ impl std::fmt::Display for Transform {
|
||||
impl Default for Transform {
|
||||
fn default() -> Self {
|
||||
Transform {
|
||||
fields: Fields::default(),
|
||||
real_fields: Vec::new(),
|
||||
type_: Value::Null,
|
||||
default: None,
|
||||
index: None,
|
||||
@@ -222,40 +267,6 @@ impl Default for Transform {
|
||||
}
|
||||
|
||||
impl Transform {
|
||||
fn with_fields(&mut self, mut fields: Fields) {
|
||||
update_one_one_output_keys(&mut fields);
|
||||
self.fields = fields;
|
||||
}
|
||||
|
||||
fn with_type(&mut self, type_: Value) {
|
||||
self.type_ = type_;
|
||||
}
|
||||
|
||||
fn try_default(&mut self, default: Value) -> Result<(), String> {
|
||||
match (&self.type_, &default) {
|
||||
(Value::Null, _) => Err(format!(
|
||||
"transform {} type MUST BE set before default {}",
|
||||
self.fields, &default,
|
||||
)),
|
||||
(_, Value::Null) => Ok(()), // if default is not set, then it will be regarded as default null
|
||||
(_, _) => {
|
||||
let target = self
|
||||
.type_
|
||||
.parse_str_value(default.to_str_value().as_str())?;
|
||||
self.default = Some(target);
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn with_index(&mut self, index: Index) {
|
||||
self.index = Some(index);
|
||||
}
|
||||
|
||||
fn with_on_failure(&mut self, on_failure: OnFailure) {
|
||||
self.on_failure = Some(on_failure);
|
||||
}
|
||||
|
||||
pub(crate) fn get_default(&self) -> Option<&Value> {
|
||||
self.default.as_ref()
|
||||
}
|
||||
@@ -265,52 +276,74 @@ impl Transform {
|
||||
}
|
||||
}
|
||||
|
||||
impl TryFrom<&yaml_rust::yaml::Hash> for Transform {
|
||||
impl TryFrom<&yaml_rust::yaml::Hash> for TransformBuilder {
|
||||
type Error = String;
|
||||
|
||||
fn try_from(hash: &yaml_rust::yaml::Hash) -> Result<Self, Self::Error> {
|
||||
let mut transform = Transform::default();
|
||||
|
||||
let mut default_opt = None;
|
||||
let mut fields = Fields::default();
|
||||
let mut type_ = Value::Null;
|
||||
let mut default = None;
|
||||
let mut index = None;
|
||||
let mut on_failure = None;
|
||||
|
||||
for (k, v) in hash {
|
||||
let key = k.as_str().ok_or("key must be a string")?;
|
||||
match key {
|
||||
TRANSFORM_FIELD => {
|
||||
transform.with_fields(Fields::one(yaml_field(v, TRANSFORM_FIELD)?));
|
||||
fields = Fields::one(yaml_new_field(v, TRANSFORM_FIELD)?);
|
||||
}
|
||||
|
||||
TRANSFORM_FIELDS => {
|
||||
transform.with_fields(yaml_fields(v, TRANSFORM_FIELDS)?);
|
||||
fields = yaml_new_fields(v, TRANSFORM_FIELDS)?;
|
||||
}
|
||||
|
||||
TRANSFORM_TYPE => {
|
||||
let t = yaml_string(v, TRANSFORM_TYPE)?;
|
||||
transform.with_type(Value::parse_str_type(&t)?);
|
||||
type_ = Value::parse_str_type(&t)?;
|
||||
}
|
||||
|
||||
TRANSFORM_INDEX => {
|
||||
let index = yaml_string(v, TRANSFORM_INDEX)?;
|
||||
transform.with_index(index.try_into()?);
|
||||
let index_str = yaml_string(v, TRANSFORM_INDEX)?;
|
||||
index = Some(index_str.try_into()?);
|
||||
}
|
||||
|
||||
TRANSFORM_DEFAULT => {
|
||||
default_opt = Some(Value::try_from(v)?);
|
||||
default = Some(Value::try_from(v)?);
|
||||
}
|
||||
|
||||
TRANSFORM_ON_FAILURE => {
|
||||
let on_failure = yaml_string(v, TRANSFORM_ON_FAILURE)?;
|
||||
transform.with_on_failure(on_failure.parse()?);
|
||||
let on_failure_str = yaml_string(v, TRANSFORM_ON_FAILURE)?;
|
||||
on_failure = Some(on_failure_str.parse()?);
|
||||
}
|
||||
|
||||
_ => {}
|
||||
}
|
||||
}
|
||||
let mut final_default = None;
|
||||
|
||||
if let Some(default) = default_opt {
|
||||
transform.try_default(default)?;
|
||||
if let Some(default_value) = default {
|
||||
match (&type_, &default_value) {
|
||||
(Value::Null, _) => {
|
||||
return Err(format!(
|
||||
"transform {:?} type MUST BE set before default {}",
|
||||
fields, &default_value,
|
||||
));
|
||||
}
|
||||
(_, Value::Null) => {} // if default is not set, then it will be regarded as default null
|
||||
(_, _) => {
|
||||
let target = type_.parse_str_value(default_value.to_str_value().as_str())?;
|
||||
final_default = Some(target);
|
||||
}
|
||||
}
|
||||
}
|
||||
let builder = TransformBuilder {
|
||||
fields,
|
||||
type_,
|
||||
default: final_default,
|
||||
index,
|
||||
on_failure,
|
||||
};
|
||||
|
||||
Ok(transform)
|
||||
Ok(builder)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -20,10 +20,10 @@ use coerce::{coerce_columns, coerce_value};
use greptime_proto::v1::{ColumnSchema, Row, Rows, Value as GreptimeValue};
use itertools::Itertools;

use crate::etl::field::{Field, Fields};
use crate::etl::field::{InputFieldInfo, OneInputOneOutputField};
use crate::etl::transform::index::Index;
use crate::etl::transform::{Transform, Transformer, Transforms};
use crate::etl::value::{Array, Map, Timestamp, Value};
use crate::etl::value::{Timestamp, Value};

const DEFAULT_GREPTIME_TIMESTAMP_COLUMN: &str = "greptime_timestamp";

@@ -36,23 +36,41 @@ pub struct GreptimeTransformer {
}

impl GreptimeTransformer {
fn default_greptime_timestamp_column() -> Transform {
/// Add a default timestamp column to the transforms
fn add_greptime_timestamp_column(transforms: &mut Transforms) {
let ns = chrono::Utc::now().timestamp_nanos_opt().unwrap_or(0);
let type_ = Value::Timestamp(Timestamp::Nanosecond(ns));
let default = Some(type_.clone());
let mut field = Field::new(DEFAULT_GREPTIME_TIMESTAMP_COLUMN);
field.insert_output_index(DEFAULT_GREPTIME_TIMESTAMP_COLUMN.to_string(), 0);
let fields = Fields::new(vec![field]).unwrap();

Transform {
fields,
let transform = Transform {
real_fields: vec![OneInputOneOutputField::new(
InputFieldInfo {
name: DEFAULT_GREPTIME_TIMESTAMP_COLUMN.to_string(),
index: usize::MAX,
},
(
DEFAULT_GREPTIME_TIMESTAMP_COLUMN.to_string(),
transforms
.transforms
.iter()
.map(|x| x.real_fields.len())
.sum(),
),
)],
type_,
default,
index: Some(Index::Time),
on_failure: Some(crate::etl::transform::OnFailure::Default),
}
};
let required_keys = transforms.required_keys_mut();
required_keys.push(DEFAULT_GREPTIME_TIMESTAMP_COLUMN.to_string());

let output_keys = transforms.output_keys_mut();
output_keys.push(DEFAULT_GREPTIME_TIMESTAMP_COLUMN.to_string());
transforms.push(transform);
}

/// Generate the schema for the GreptimeTransformer
fn schemas(transforms: &Transforms) -> Result<Vec<ColumnSchema>, String> {
let mut schema = vec![];
for transform in transforms.iter() {
@@ -60,53 +78,6 @@ impl GreptimeTransformer {
}
Ok(schema)
}

fn transform_map(&self, map: &Map) -> Result<Row, String> {
|
||||
let mut values = vec![GreptimeValue { value_data: None }; self.schema.len()];
|
||||
for transform in self.transforms.iter() {
|
||||
for field in transform.fields.iter() {
|
||||
let value_data = match map.get(field.get_field_name()) {
|
||||
Some(val) => coerce_value(val, transform)?,
|
||||
None => {
|
||||
let default = transform.get_default();
|
||||
match default {
|
||||
Some(default) => coerce_value(default, transform)?,
|
||||
None => None,
|
||||
}
|
||||
}
|
||||
};
|
||||
if let Some(i) = field
|
||||
.output_fields_index_mapping
|
||||
.iter()
|
||||
.next()
|
||||
.map(|kv| kv.1)
|
||||
{
|
||||
values[*i] = GreptimeValue { value_data }
|
||||
} else {
|
||||
return Err(format!(
|
||||
"field: {} output_fields is empty.",
|
||||
field.get_field_name()
|
||||
));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(Row { values })
|
||||
}
|
||||
|
||||
fn transform_array(&self, arr: &Array) -> Result<Vec<Row>, String> {
|
||||
let mut rows = Vec::with_capacity(arr.len());
|
||||
for v in arr.iter() {
|
||||
match v {
|
||||
Value::Map(map) => {
|
||||
let row = self.transform_map(map)?;
|
||||
rows.push(row);
|
||||
}
|
||||
_ => return Err(format!("Expected map, found: {v:?}")),
|
||||
}
|
||||
}
|
||||
Ok(rows)
|
||||
}
|
||||
}
|
||||
|
||||
impl std::fmt::Display for GreptimeTransformer {
|
||||
@@ -129,9 +100,9 @@ impl Transformer for GreptimeTransformer {
|
||||
|
||||
for transform in transforms.iter() {
|
||||
let target_fields_set = transform
|
||||
.fields
|
||||
.real_fields
|
||||
.iter()
|
||||
.map(|f| f.get_target_field())
|
||||
.map(|f| f.output_name())
|
||||
.collect::<HashSet<_>>();
|
||||
|
||||
let intersections: Vec<_> = column_names_set.intersection(&target_fields_set).collect();
|
||||
@@ -146,12 +117,15 @@ impl Transformer for GreptimeTransformer {
|
||||
|
||||
if let Some(idx) = transform.index {
|
||||
if idx == Index::Time {
|
||||
match transform.fields.len() {
|
||||
1 => timestamp_columns.push(transform.fields.first().unwrap().get_field_name()),
|
||||
_ => return Err(format!(
|
||||
"Illegal to set multiple timestamp Index columns, please set only one: {}",
|
||||
transform.fields.get_target_fields().join(", ")
|
||||
)),
|
||||
match transform.real_fields.len() {
|
||||
1 => timestamp_columns
|
||||
.push(transform.real_fields.first().unwrap().input_name()),
|
||||
_ => {
|
||||
return Err(format!(
|
||||
"Illegal to set multiple timestamp Index columns, please set only one: {}",
|
||||
transform.real_fields.iter().map(|x|x.input_name()).join(", ")
|
||||
))
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -159,13 +133,7 @@ impl Transformer for GreptimeTransformer {
|
||||
|
||||
match timestamp_columns.len() {
|
||||
0 => {
|
||||
transforms.push(GreptimeTransformer::default_greptime_timestamp_column());
|
||||
|
||||
let required_keys = transforms.required_keys_mut();
|
||||
required_keys.push(DEFAULT_GREPTIME_TIMESTAMP_COLUMN.to_string());
|
||||
|
||||
let output_keys = transforms.output_keys_mut();
|
||||
output_keys.push(DEFAULT_GREPTIME_TIMESTAMP_COLUMN.to_string());
|
||||
GreptimeTransformer::add_greptime_timestamp_column(&mut transforms);
|
||||
|
||||
let schema = GreptimeTransformer::schemas(&transforms)?;
|
||||
Ok(GreptimeTransformer { transforms, schema })
|
||||
@@ -184,54 +152,26 @@ impl Transformer for GreptimeTransformer {
|
||||
}
|
||||
}
|
||||
|
||||
fn transform(&self, value: Value) -> Result<Self::Output, String> {
|
||||
match value {
|
||||
Value::Map(map) => {
|
||||
let rows = vec![self.transform_map(&map)?];
|
||||
Ok(Rows {
|
||||
schema: self.schema.clone(),
|
||||
rows,
|
||||
})
|
||||
}
|
||||
Value::Array(arr) => {
|
||||
let rows = self.transform_array(&arr)?;
|
||||
Ok(Rows {
|
||||
schema: self.schema.clone(),
|
||||
rows,
|
||||
})
|
||||
}
|
||||
_ => Err(format!("Expected map or array, found: {}", value)),
|
||||
}
|
||||
}
|
||||
|
||||
fn transform_mut(&self, val: &mut Vec<Value>) -> Result<Self::VecOutput, String> {
|
||||
let mut values = vec![GreptimeValue { value_data: None }; self.schema.len()];
|
||||
for transform in self.transforms.iter() {
|
||||
for field in transform.fields.iter() {
|
||||
let index = field.input_field.index;
|
||||
for field in transform.real_fields.iter() {
|
||||
let index = field.input_index();
|
||||
let output_index = field.output_index();
|
||||
match val.get(index) {
|
||||
Some(v) => {
|
||||
let value_data = coerce_value(v, transform)
|
||||
.map_err(|e| format!("{} processor: {}", field.get_field_name(), e))?;
|
||||
.map_err(|e| format!("{} processor: {}", field.input_name(), e))?;
|
||||
// every transform fields has only one output field
|
||||
if let Some(i) = field
|
||||
.output_fields_index_mapping
|
||||
.iter()
|
||||
.next()
|
||||
.map(|kv| kv.1)
|
||||
{
|
||||
values[*i] = GreptimeValue { value_data }
|
||||
} else {
|
||||
return Err(format!(
|
||||
"field: {} output_fields is empty.",
|
||||
field.get_field_name()
|
||||
));
|
||||
}
|
||||
values[output_index] = GreptimeValue { value_data };
|
||||
}
|
||||
_ => {
|
||||
return Err(format!(
|
||||
"Get field not in the array field: {field:?}, {val:?}"
|
||||
))
|
||||
None => {
|
||||
let default = transform.get_default();
|
||||
let value_data = match default {
|
||||
Some(default) => coerce_value(default, transform)?,
|
||||
None => None,
|
||||
};
|
||||
values[output_index] = GreptimeValue { value_data };
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -66,8 +66,8 @@ impl TryFrom<Value> for ValueData {
|
||||
pub(crate) fn coerce_columns(transform: &Transform) -> Result<Vec<ColumnSchema>, String> {
|
||||
let mut columns = Vec::new();
|
||||
|
||||
for field in transform.fields.iter() {
|
||||
let column_name = field.get_target_field().to_string();
|
||||
for field in transform.real_fields.iter() {
|
||||
let column_name = field.output_name().to_string();
|
||||
|
||||
let datatype = coerce_type(transform)? as i32;
|
||||
|
||||
@@ -134,7 +134,7 @@ fn coerce_type(transform: &Transform) -> Result<ColumnDataType, String> {
|
||||
|
||||
Value::Null => Err(format!(
|
||||
"Null type not supported when to coerce '{}' type",
|
||||
transform.fields
|
||||
transform.type_.to_str_type()
|
||||
)),
|
||||
}
|
||||
}
|
||||
@@ -144,15 +144,18 @@ pub(crate) fn coerce_value(
transform: &Transform,
) -> Result<Option<ValueData>, String> {
match val {
Value::Null => match transform.on_failure {
Some(OnFailure::Ignore) => Ok(None),
Some(OnFailure::Default) => transform
.get_default()
.map(|default| coerce_value(default, transform))
.unwrap_or_else(|| {
coerce_value(transform.get_type_matched_default_val(), transform)
}),
None => Ok(None),
Value::Null => match &transform.default {
Some(default) => coerce_value(default, transform),
None => match transform.on_failure {
Some(OnFailure::Ignore) => Ok(None),
Some(OnFailure::Default) => transform
.get_default()
.map(|default| coerce_value(default, transform))
.unwrap_or_else(|| {
coerce_value(transform.get_type_matched_default_val(), transform)
}),
None => Ok(None),
},
},

Value::Int8(n) => coerce_i64_value(*n as i64, transform),
@@ -404,12 +407,11 @@ fn coerce_string_value(s: &String, transform: &Transform) -> Result<Option<Value
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use crate::etl::field::Fields;
|
||||
|
||||
#[test]
|
||||
fn test_coerce_string_without_on_failure() {
|
||||
let transform = Transform {
|
||||
fields: Fields::default(),
|
||||
real_fields: vec![],
|
||||
type_: Value::Int32(0),
|
||||
default: None,
|
||||
index: None,
|
||||
@@ -434,7 +436,7 @@ mod tests {
|
||||
#[test]
|
||||
fn test_coerce_string_with_on_failure_ignore() {
|
||||
let transform = Transform {
|
||||
fields: Fields::default(),
|
||||
real_fields: vec![],
|
||||
type_: Value::Int32(0),
|
||||
default: None,
|
||||
index: None,
|
||||
@@ -449,7 +451,7 @@ mod tests {
|
||||
#[test]
|
||||
fn test_coerce_string_with_on_failure_default() {
|
||||
let mut transform = Transform {
|
||||
fields: Fields::default(),
|
||||
real_fields: vec![],
|
||||
type_: Value::Int32(0),
|
||||
default: None,
|
||||
index: None,
|
||||
|
||||
@@ -110,7 +110,12 @@ impl PipelineOperator {
|
||||
// exist in catalog, just open
|
||||
if let Some(table) = self
|
||||
.catalog_manager
|
||||
.table(&expr.catalog_name, &expr.schema_name, &expr.table_name)
|
||||
.table(
|
||||
&expr.catalog_name,
|
||||
&expr.schema_name,
|
||||
&expr.table_name,
|
||||
Some(&ctx),
|
||||
)
|
||||
.await
|
||||
.context(CatalogSnafu)?
|
||||
{
|
||||
@@ -130,7 +135,7 @@ impl PipelineOperator {
|
||||
// get from catalog
|
||||
let table = self
|
||||
.catalog_manager
|
||||
.table(catalog, schema, table_name)
|
||||
.table(catalog, schema, table_name, Some(&ctx))
|
||||
.await
|
||||
.context(CatalogSnafu)?
|
||||
.context(PipelineTableNotFoundSnafu)?;
|
||||
|
||||
@@ -13,20 +13,45 @@
// limitations under the License.

use greptime_proto::v1::{ColumnDataType, ColumnSchema, Rows, SemanticType};
use pipeline::{parse, Content, GreptimeTransformer, Pipeline, Value};
use pipeline::{parse, Content, GreptimeTransformer, Pipeline};

/// test util function to parse and execute pipeline
pub fn parse_and_exec(input_str: &str, pipeline_yaml: &str) -> Rows {
let input_value: Value = serde_json::from_str::<serde_json::Value>(input_str)
.expect("failed to parse into json")
.try_into()
.expect("failed to convert into value");
let input_value = serde_json::from_str::<serde_json::Value>(input_str).unwrap();

let yaml_content = Content::Yaml(pipeline_yaml.into());
let pipeline: Pipeline<GreptimeTransformer> =
parse(&yaml_content).expect("failed to parse pipeline");
let mut result = pipeline.init_intermediate_state();

pipeline.exec(input_value).expect("failed to exec pipeline")
let schema = pipeline.schemas().clone();

let mut rows = Vec::new();

match input_value {
serde_json::Value::Array(array) => {
for value in array {
pipeline.prepare(value, &mut result).unwrap();
let row = pipeline
.exec_mut(&mut result)
.expect("failed to exec pipeline");
rows.push(row);
pipeline.reset_intermediate_state(&mut result);
}
}
serde_json::Value::Object(_) => {
pipeline.prepare(input_value, &mut result).unwrap();
let row = pipeline
.exec_mut(&mut result)
.expect("failed to exec pipeline");
rows.push(row);
}
_ => {
panic!("invalid input value");
}
}

Rows { schema, rows }
}

/// test util function to parse and execute pipeline

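A usage sketch for the parse_and_exec helper above, with a hypothetical one-row input and a pipeline assembled from the processor/transform snippets used elsewhere in this changeset; the field names are illustrative only:

#[test]
fn parse_and_exec_sketch() {
    // a single JSON object; a JSON array of objects is accepted as well
    let input_str = r#"{"message": "hello", "ts": "2024-05-25 20:16:37.217"}"#;
    let pipeline_yaml = r#"
processors:
  - date:
      field: ts
      formats:
        - "%Y-%m-%d %H:%M:%S%.3f"

transform:
  - fields:
      - message
    type: string
  - field: ts
    type: time
    index: timestamp
"#;
    // one output Row per input object; the schema comes from pipeline.schemas()
    let rows = parse_and_exec(input_str, pipeline_yaml);
    assert_eq!(rows.rows.len(), 1);
}
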
@@ -157,7 +157,7 @@ transform:
|
||||
fn test_modifier() {
|
||||
let empty_str = r#"
|
||||
{
|
||||
"str": "key1 key2 key3 key4 key5 key6 key7 key8"
|
||||
"str": "key1 key2 key3 key4 key5 key6"
|
||||
}"#;
|
||||
|
||||
let pipeline_yaml = r#"
|
||||
@@ -165,7 +165,7 @@ processors:
|
||||
- dissect:
|
||||
field: str
|
||||
patterns:
|
||||
- "%{key1} %{key2} %{+key3} %{+key3/2} %{key5->} %{?key6} %{*key_7} %{&key_7}"
|
||||
- "%{key1} %{key2} %{+key3} %{+key3/2} %{key5->} %{?key6}"
|
||||
|
||||
transform:
|
||||
- fields:
|
||||
@@ -173,7 +173,6 @@ transform:
|
||||
- key2
|
||||
- key3
|
||||
- key5
|
||||
- key7
|
||||
type: string
|
||||
"#;
|
||||
|
||||
@@ -184,7 +183,6 @@ transform:
|
||||
make_string_column_schema("key2".to_string()),
|
||||
make_string_column_schema("key3".to_string()),
|
||||
make_string_column_schema("key5".to_string()),
|
||||
make_string_column_schema("key7".to_string()),
|
||||
common::make_column_schema(
|
||||
"greptime_timestamp".to_string(),
|
||||
ColumnDataType::TimestampNanosecond,
|
||||
@@ -209,10 +207,6 @@ transform:
|
||||
output.rows[0].values[3].value_data,
|
||||
Some(StringValue("key5".to_string()))
|
||||
);
|
||||
assert_eq!(
|
||||
output.rows[0].values[4].value_data,
|
||||
Some(StringValue("key8".to_string()))
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
|
||||
@@ -12,18 +12,18 @@
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
use api::v1::Rows;
|
||||
use common_telemetry::tracing::info;
|
||||
use greptime_proto::v1::value::ValueData::{
|
||||
BoolValue, F64Value, StringValue, TimestampNanosecondValue, TimestampSecondValue, U32Value,
|
||||
U64Value, U8Value,
|
||||
};
|
||||
use greptime_proto::v1::Value as GreptimeValue;
|
||||
use pipeline::{parse, Content, GreptimeTransformer, Pipeline, Value};
|
||||
use pipeline::{parse, Content, GreptimeTransformer, Pipeline};
|
||||
|
||||
#[test]
|
||||
fn test_complex_data() {
|
||||
let input_value_str = r#"
|
||||
[
|
||||
{
|
||||
"version": 1,
|
||||
"streamId": "12345",
|
||||
@@ -73,12 +73,9 @@ fn test_complex_data() {
|
||||
"ewExecutionInfo": "c:4380:7:161:162:161:n:::12473:200|C:4380:3:0:4:0:n:::6967:200|R:4380:20:99:99:1:n:::35982:200",
|
||||
"customField": "any-custom-value"
|
||||
}
|
||||
]
|
||||
"#;
|
||||
let input_value: Value = serde_json::from_str::<serde_json::Value>(input_value_str)
|
||||
.expect("failed to parse input value")
|
||||
.try_into()
|
||||
.expect("failed to convert input value");
|
||||
let input_value = serde_json::from_str::<serde_json::Value>(input_value_str)
|
||||
.expect("failed to parse input value");
|
||||
|
||||
let pipeline_yaml = r#"
|
||||
---
|
||||
@@ -422,7 +419,19 @@ transform:
|
||||
let yaml_content = Content::Yaml(pipeline_yaml.into());
|
||||
let pipeline: Pipeline<GreptimeTransformer> =
|
||||
parse(&yaml_content).expect("failed to parse pipeline");
|
||||
let output = pipeline.exec(input_value).expect("failed to exec pipeline");
|
||||
let mut stats = pipeline.init_intermediate_state();
|
||||
pipeline
|
||||
.prepare(input_value, &mut stats)
|
||||
.expect("failed to prepare pipeline");
|
||||
|
||||
let row = pipeline
|
||||
.exec_mut(&mut stats)
|
||||
.expect("failed to exec pipeline");
|
||||
|
||||
let output = Rows {
|
||||
schema: pipeline.schemas().clone(),
|
||||
rows: vec![row],
|
||||
};
|
||||
|
||||
assert_eq!(output.rows.len(), 1);
|
||||
let values = output.rows.first().unwrap().values.clone();
|
||||
@@ -464,10 +473,7 @@ fn test_simple_data() {
|
||||
"line": "2024-05-25 20:16:37.217 hello world"
|
||||
}
|
||||
"#;
|
||||
let input_value: Value = serde_json::from_str::<serde_json::Value>(input_value_str)
|
||||
.unwrap()
|
||||
.try_into()
|
||||
.unwrap();
|
||||
let input_value = serde_json::from_str::<serde_json::Value>(input_value_str).unwrap();
|
||||
|
||||
let pipeline_yaml = r#"
|
||||
processors:
|
||||
@@ -493,11 +499,13 @@ transform:
|
||||
|
||||
let yaml_content = Content::Yaml(pipeline_yaml.into());
|
||||
let pipeline: Pipeline<GreptimeTransformer> = parse(&yaml_content).unwrap();
|
||||
let output = pipeline.exec(input_value).unwrap();
|
||||
let r = output
|
||||
.rows
|
||||
|
||||
let mut status = pipeline.init_intermediate_state();
|
||||
pipeline.prepare(input_value, &mut status).unwrap();
|
||||
let row = pipeline.exec_mut(&mut status).unwrap();
|
||||
let r = row
|
||||
.values
|
||||
.into_iter()
|
||||
.flat_map(|v| v.values)
|
||||
.map(|v| v.value_data.unwrap())
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
|
||||
@@ -116,7 +116,7 @@ impl DatafusionQueryEngine {
|
||||
let default_catalog = &query_ctx.current_catalog().to_owned();
|
||||
let default_schema = &query_ctx.current_schema();
|
||||
let table_name = dml.table_name.resolve(default_catalog, default_schema);
|
||||
let table = self.find_table(&table_name).await?;
|
||||
let table = self.find_table(&table_name, &query_ctx).await?;
|
||||
|
||||
let output = self
|
||||
.exec_query_plan(LogicalPlan::DfPlan((*dml.input).clone()), query_ctx.clone())
|
||||
@@ -241,14 +241,18 @@ impl DatafusionQueryEngine {
|
||||
.context(TableMutationSnafu)
|
||||
}
|
||||
|
||||
async fn find_table(&self, table_name: &ResolvedTableReference) -> Result<TableRef> {
|
||||
async fn find_table(
|
||||
&self,
|
||||
table_name: &ResolvedTableReference,
|
||||
query_context: &QueryContextRef,
|
||||
) -> Result<TableRef> {
|
||||
let catalog_name = table_name.catalog.as_ref();
|
||||
let schema_name = table_name.schema.as_ref();
|
||||
let table_name = table_name.table.as_ref();
|
||||
|
||||
self.state
|
||||
.catalog_manager()
|
||||
.table(catalog_name, schema_name, table_name)
|
||||
.table(catalog_name, schema_name, table_name, Some(query_context))
|
||||
.await
|
||||
.context(CatalogSnafu)?
|
||||
.with_context(|| TableNotFoundSnafu { table: table_name })
|
||||
@@ -529,7 +533,7 @@ mod tests {
|
||||
use datatypes::prelude::ConcreteDataType;
|
||||
use datatypes::schema::ColumnSchema;
|
||||
use datatypes::vectors::{Helper, UInt32Vector, UInt64Vector, VectorRef};
|
||||
use session::context::QueryContext;
|
||||
use session::context::{QueryContext, QueryContextBuilder};
|
||||
use table::table::numbers::{NumbersTable, NUMBERS_TABLE_NAME};
|
||||
|
||||
use super::*;
|
||||
@@ -618,12 +622,16 @@ mod tests {
|
||||
.as_any()
|
||||
.downcast_ref::<DatafusionQueryEngine>()
|
||||
.unwrap();
|
||||
let query_ctx = Arc::new(QueryContextBuilder::default().build());
|
||||
let table = engine
|
||||
.find_table(&ResolvedTableReference {
|
||||
catalog: "greptime".into(),
|
||||
schema: "public".into(),
|
||||
table: "numbers".into(),
|
||||
})
|
||||
.find_table(
|
||||
&ResolvedTableReference {
|
||||
catalog: "greptime".into(),
|
||||
schema: "public".into(),
|
||||
table: "numbers".into(),
|
||||
},
|
||||
&query_ctx,
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
|
||||
@@ -61,7 +61,7 @@ impl DfContextProviderAdapter {
|
||||
let mut table_provider = DfTableSourceProvider::new(
|
||||
engine_state.catalog_manager().clone(),
|
||||
engine_state.disallow_cross_catalog_query(),
|
||||
query_ctx.as_ref(),
|
||||
query_ctx.clone(),
|
||||
Arc::new(DefaultPlanDecoder::new(session_state.clone(), &query_ctx)?),
|
||||
session_state
|
||||
.config_options()
|
||||
|
||||
@@ -128,6 +128,7 @@ impl DistExtensionPlanner {
|
||||
&table_name.catalog_name,
|
||||
&table_name.schema_name,
|
||||
&table_name.table_name,
|
||||
None,
|
||||
)
|
||||
.await
|
||||
.context(CatalogSnafu)?
|
||||
|
||||
@@ -68,7 +68,7 @@ impl DfLogicalPlanner {
|
||||
let table_provider = DfTableSourceProvider::new(
|
||||
self.engine_state.catalog_manager().clone(),
|
||||
self.engine_state.disallow_cross_catalog_query(),
|
||||
query_ctx.as_ref(),
|
||||
query_ctx.clone(),
|
||||
Arc::new(DefaultPlanDecoder::new(
|
||||
self.session_state.clone(),
|
||||
&query_ctx,
|
||||
@@ -144,14 +144,15 @@ impl DfLogicalPlanner {
|
||||
|
||||
#[tracing::instrument(skip_all)]
|
||||
async fn plan_pql(&self, stmt: EvalStmt, query_ctx: QueryContextRef) -> Result<LogicalPlan> {
|
||||
let plan_decoder = Arc::new(DefaultPlanDecoder::new(
|
||||
self.session_state.clone(),
|
||||
&query_ctx,
|
||||
)?);
|
||||
let table_provider = DfTableSourceProvider::new(
|
||||
self.engine_state.catalog_manager().clone(),
|
||||
self.engine_state.disallow_cross_catalog_query(),
|
||||
query_ctx.as_ref(),
|
||||
Arc::new(DefaultPlanDecoder::new(
|
||||
self.session_state.clone(),
|
||||
&query_ctx,
|
||||
)?),
|
||||
query_ctx,
|
||||
plan_decoder,
|
||||
self.session_state
|
||||
.config_options()
|
||||
.sql_parser
|
||||
|
||||
@@ -2379,7 +2379,7 @@ mod test {
|
||||
DfTableSourceProvider::new(
|
||||
catalog_list,
|
||||
false,
|
||||
QueryContext::arc().as_ref(),
|
||||
QueryContext::arc(),
|
||||
DummyDecoder::arc(),
|
||||
false,
|
||||
)
|
||||
@@ -3219,7 +3219,7 @@ mod test {
|
||||
DfTableSourceProvider::new(
|
||||
catalog_list.clone(),
|
||||
false,
|
||||
QueryContext::arc().as_ref(),
|
||||
QueryContext::arc(),
|
||||
DummyDecoder::arc(),
|
||||
true,
|
||||
),
|
||||
@@ -3249,7 +3249,7 @@ mod test {
|
||||
DfTableSourceProvider::new(
|
||||
catalog_list.clone(),
|
||||
false,
|
||||
QueryContext::arc().as_ref(),
|
||||
QueryContext::arc(),
|
||||
DummyDecoder::arc(),
|
||||
true,
|
||||
),
|
||||
|
||||
@@ -232,6 +232,7 @@ async fn query_from_information_schema_table(
|
||||
query_ctx.current_catalog(),
|
||||
INFORMATION_SCHEMA_NAME,
|
||||
table_name,
|
||||
Some(&query_ctx),
|
||||
)
|
||||
.await
|
||||
.context(error::CatalogSnafu)?
|
||||
|
||||
@@ -753,6 +753,7 @@ impl HttpServer {
|
||||
"/pipelines/:pipeline_name",
|
||||
routing::delete(event::delete_pipeline),
|
||||
)
|
||||
.route("/pipelines/dryrun", routing::post(event::pipeline_dryrun))
|
||||
.layer(
|
||||
ServiceBuilder::new()
|
||||
.layer(HandleErrorLayer::new(handle_error))
|
||||
|
||||
@@ -23,15 +23,16 @@ use axum::headers::ContentType;
|
||||
use axum::http::header::CONTENT_TYPE;
|
||||
use axum::http::{Request, StatusCode};
|
||||
use axum::response::{IntoResponse, Response};
|
||||
use axum::{async_trait, BoxError, Extension, TypedHeader};
|
||||
use axum::{async_trait, BoxError, Extension, Json, TypedHeader};
|
||||
use common_query::{Output, OutputData};
|
||||
use common_telemetry::{error, warn};
|
||||
use datatypes::value::column_data_to_json;
|
||||
use pipeline::error::PipelineTransformSnafu;
|
||||
use pipeline::util::to_pipeline_version;
|
||||
use pipeline::PipelineVersion;
|
||||
use schemars::JsonSchema;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use serde_json::{Deserializer, Value};
|
||||
use serde_json::{Deserializer, Map, Value};
|
||||
use session::context::{Channel, QueryContext, QueryContextRef};
|
||||
use snafu::{ensure, OptionExt, ResultExt};
|
||||
|
||||
@@ -230,6 +231,117 @@ fn transform_ndjson_array_factory(
})
}

#[axum_macros::debug_handler]
pub async fn pipeline_dryrun(
State(log_state): State<LogState>,
Query(query_params): Query<LogIngesterQueryParams>,
Extension(mut query_ctx): Extension<QueryContext>,
TypedHeader(content_type): TypedHeader<ContentType>,
payload: String,
) -> Result<Response> {
let handler = log_state.log_handler;
let pipeline_name = query_params.pipeline_name.context(InvalidParameterSnafu {
reason: "pipeline_name is required",
})?;

let version = to_pipeline_version(query_params.version).context(PipelineSnafu)?;

let ignore_errors = query_params.ignore_errors.unwrap_or(false);

let value = extract_pipeline_value_by_content_type(content_type, payload, ignore_errors)?;

if value.len() > 10 {
return Err(InvalidParameterSnafu {
reason: "too many rows for dryrun",
}
.build());
}

query_ctx.set_channel(Channel::Http);
let query_ctx = Arc::new(query_ctx);

let pipeline = handler
.get_pipeline(&pipeline_name, version, query_ctx.clone())
.await?;

let mut intermediate_state = pipeline.init_intermediate_state();

let mut results = Vec::with_capacity(value.len());
for v in value {
pipeline
.prepare(v, &mut intermediate_state)
.map_err(|reason| PipelineTransformSnafu { reason }.build())
.context(PipelineSnafu)?;
let r = pipeline
.exec_mut(&mut intermediate_state)
.map_err(|reason| PipelineTransformSnafu { reason }.build())
.context(PipelineSnafu)?;
results.push(r);
pipeline.reset_intermediate_state(&mut intermediate_state);
}

let colume_type_key = "colume_type";
let data_type_key = "data_type";
let name_key = "name";

let schema = pipeline
.schemas()
.iter()
.map(|cs| {
let mut map = Map::new();
map.insert(name_key.to_string(), Value::String(cs.column_name.clone()));
map.insert(
data_type_key.to_string(),
Value::String(cs.datatype().as_str_name().to_string()),
);
map.insert(
colume_type_key.to_string(),
Value::String(cs.semantic_type().as_str_name().to_string()),
);
map.insert(
"fulltext".to_string(),
Value::Bool(
cs.options
.clone()
.is_some_and(|x| x.options.contains_key("fulltext")),
),
);
Value::Object(map)
})
.collect::<Vec<_>>();
let rows = results
.into_iter()
.map(|row| {
let row = row
.values
.into_iter()
.enumerate()
.map(|(idx, v)| {
v.value_data
.map(|d| {
let mut map = Map::new();
map.insert("value".to_string(), column_data_to_json(d));
map.insert("key".to_string(), schema[idx][name_key].clone());
map.insert(
"semantic_type".to_string(),
schema[idx][colume_type_key].clone(),
);
map.insert("data_type".to_string(), schema[idx][data_type_key].clone());
Value::Object(map)
})
.unwrap_or(Value::Null)
})
.collect();
Value::Array(row)
})
.collect::<Vec<_>>();
let mut result = Map::new();
result.insert("schema".to_string(), Value::Array(schema));
result.insert("rows".to_string(), Value::Array(rows));
let result = Value::Object(result);
Ok(Json(result).into_response())
}

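A minimal sketch of exercising the new dryrun handler, mirroring the integration test later in this diff; the TestClient setup and a pipeline already created under the name "test" are assumed:

let body = r#"[{"id1": "2436", "time": "2024-05-25 20:16:37.217"}]"#;
// POST sample rows against an existing pipeline; nothing is written to storage
let res = client
    .post("/v1/events/pipelines/dryrun?pipeline_name=test")
    .header("Content-Type", "application/json")
    .body(body)
    .send()
    .await;
// the response body is the JSON object built above, with "schema" and "rows" arrays
let output: serde_json::Value = res.json().await;
assert!(output.get("schema").is_some() && output.get("rows").is_some());
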
#[axum_macros::debug_handler]
pub async fn log_ingester(
State(log_state): State<LogState>,

@@ -405,11 +405,11 @@ async fn get_all_column_names(
|
||||
schema: &str,
|
||||
manager: &CatalogManagerRef,
|
||||
) -> std::result::Result<HashSet<String>, catalog::error::Error> {
|
||||
let table_names = manager.table_names(catalog, schema).await?;
|
||||
let table_names = manager.table_names(catalog, schema, None).await?;
|
||||
|
||||
let mut labels = HashSet::new();
|
||||
for table_name in table_names {
|
||||
let Some(table) = manager.table(catalog, schema, &table_name).await? else {
|
||||
let Some(table) = manager.table(catalog, schema, &table_name, None).await? else {
|
||||
continue;
|
||||
};
|
||||
for column in table.primary_key_columns() {
|
||||
@@ -436,6 +436,7 @@ async fn retrieve_series_from_query_result(
|
||||
query_ctx.current_catalog(),
|
||||
&query_ctx.current_schema(),
|
||||
table_name,
|
||||
Some(query_ctx),
|
||||
)
|
||||
.await
|
||||
.context(CatalogSnafu)?
|
||||
@@ -691,7 +692,7 @@ pub async fn label_values_query(
|
||||
if label_name == METRIC_NAME_LABEL {
|
||||
let mut table_names = match handler
|
||||
.catalog_manager()
|
||||
.table_names(&catalog, &schema)
|
||||
.table_names(&catalog, &schema, Some(&query_ctx))
|
||||
.await
|
||||
{
|
||||
Ok(table_names) => table_names,
|
||||
@@ -777,7 +778,11 @@ async fn retrieve_field_names(
|
||||
|
||||
if matches.is_empty() {
|
||||
// query all tables if no matcher is provided
|
||||
while let Some(table) = manager.tables(catalog, &schema).next().await {
|
||||
while let Some(table) = manager
|
||||
.tables(catalog, &schema, Some(query_ctx))
|
||||
.next()
|
||||
.await
|
||||
{
|
||||
let table = table.context(CatalogSnafu)?;
|
||||
for column in table.field_columns() {
|
||||
field_columns.insert(column.name);
|
||||
@@ -788,7 +793,7 @@ async fn retrieve_field_names(
|
||||
|
||||
for table_name in matches {
|
||||
let table = manager
|
||||
.table(catalog, &schema, &table_name)
|
||||
.table(catalog, &schema, &table_name, Some(query_ctx))
|
||||
.await
|
||||
.context(CatalogSnafu)?
|
||||
.with_context(|| TableNotFoundSnafu {
|
||||
|
||||
@@ -261,6 +261,7 @@ impl QueryContext {
|
||||
|
||||
impl QueryContextBuilder {
|
||||
pub fn build(self) -> QueryContext {
|
||||
let channel = self.channel.unwrap_or_default();
|
||||
QueryContext {
|
||||
current_catalog: self
|
||||
.current_catalog
|
||||
@@ -270,8 +271,10 @@ impl QueryContextBuilder {
|
||||
.sql_dialect
|
||||
.unwrap_or_else(|| Arc::new(GreptimeDbDialect {})),
|
||||
extensions: self.extensions.unwrap_or_default(),
|
||||
configuration_parameter: self.configuration_parameter.unwrap_or_default(),
|
||||
channel: self.channel.unwrap_or_default(),
|
||||
configuration_parameter: self
|
||||
.configuration_parameter
|
||||
.unwrap_or_else(|| Arc::new(ConfigurationVariables::default())),
|
||||
channel,
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -233,6 +233,9 @@ pub trait RegionScanner: Debug + DisplayAs + Send {
|
||||
/// # Panics
|
||||
/// Panics if the `partition` is out of bound.
|
||||
fn scan_partition(&self, partition: usize) -> Result<SendableRecordBatchStream, BoxedError>;
|
||||
|
||||
/// Check if there is any predicate that may be executed in this scanner.
|
||||
fn has_predicate(&self) -> bool;
|
||||
}
|
||||
|
||||
pub type RegionScannerRef = Box<dyn RegionScanner>;
|
||||
@@ -367,6 +370,10 @@ impl RegionScanner for SinglePartitionScanner {
|
||||
))
|
||||
})
|
||||
}
|
||||
|
||||
fn has_predicate(&self) -> bool {
|
||||
false
|
||||
}
|
||||
}
|
||||
|
||||
impl DisplayAs for SinglePartitionScanner {
|
||||
|
||||
@@ -180,7 +180,7 @@ impl ExecutionPlan for RegionScanExec {
|
||||
}
|
||||
|
||||
fn statistics(&self) -> DfResult<Statistics> {
|
||||
let statistics = if self.append_mode {
|
||||
let statistics = if self.append_mode && !self.scanner.lock().unwrap().has_predicate() {
|
||||
let column_statistics = self
|
||||
.arrow_schema
|
||||
.fields
|
||||
|
||||
@@ -181,7 +181,8 @@ mod test {
|
||||
.table(
|
||||
"greptime",
|
||||
"database_created_through_grpc",
|
||||
"table_created_through_grpc"
|
||||
"table_created_through_grpc",
|
||||
None,
|
||||
)
|
||||
.await
|
||||
.unwrap()
|
||||
@@ -510,7 +511,7 @@ CREATE TABLE {table_name} (
|
||||
let table = instance
|
||||
.frontend()
|
||||
.catalog_manager()
|
||||
.table("greptime", "public", table_name)
|
||||
.table("greptime", "public", table_name, None)
|
||||
.await
|
||||
.unwrap()
|
||||
.unwrap();
|
||||
|
||||
@@ -278,7 +278,7 @@ mod tests {
|
||||
assert!(instance
|
||||
.frontend()
|
||||
.catalog_manager()
|
||||
.table("greptime", "public", "demo")
|
||||
.table("greptime", "public", "demo", None)
|
||||
.await
|
||||
.unwrap()
|
||||
.is_none())
|
||||
|
||||
@@ -462,7 +462,6 @@ async fn test_execute_show_databases_tables(instance: Arc<dyn MockInstance>) {
|
||||
+--------------------+
|
||||
| greptime_private |
|
||||
| information_schema |
|
||||
| pg_catalog |
|
||||
| public |
|
||||
+--------------------+\
|
||||
";
|
||||
@@ -1900,7 +1899,6 @@ async fn test_show_databases(instance: Arc<dyn MockInstance>) {
|
||||
+--------------------+
|
||||
| greptime_private |
|
||||
| information_schema |
|
||||
| pg_catalog |
|
||||
| public |
|
||||
+--------------------+";
|
||||
check_output_stream(output, expected).await;
|
||||
@@ -1914,7 +1912,6 @@ async fn test_show_databases(instance: Arc<dyn MockInstance>) {
|
||||
| Database |
|
||||
+--------------------+
|
||||
| information_schema |
|
||||
| pg_catalog |
|
||||
+--------------------+";
|
||||
check_output_stream(output, expected).await;
|
||||
}
|
||||
|
||||
@@ -78,6 +78,7 @@ macro_rules! http_tests {
|
||||
test_vm_proto_remote_write,
|
||||
|
||||
test_pipeline_api,
|
||||
test_test_pipeline_api,
|
||||
test_plain_text_ingestion,
|
||||
);
|
||||
)*
|
||||
@@ -1146,6 +1147,171 @@ transform:
|
||||
guard.remove_all().await;
|
||||
}
|
||||
|
||||
pub async fn test_test_pipeline_api(store_type: StorageType) {
|
||||
common_telemetry::init_default_ut_logging();
|
||||
let (app, mut guard) = setup_test_http_app_with_frontend(store_type, "test_pipeline_api").await;
|
||||
|
||||
// handshake
|
||||
let client = TestClient::new(app);
|
||||
|
||||
let body = r#"
|
||||
processors:
|
||||
- date:
|
||||
field: time
|
||||
formats:
|
||||
- "%Y-%m-%d %H:%M:%S%.3f"
|
||||
ignore_missing: true
|
||||
|
||||
transform:
|
||||
- fields:
|
||||
- id1
|
||||
- id2
|
||||
type: int32
|
||||
- fields:
|
||||
- type
|
||||
- log
|
||||
- logger
|
||||
type: string
|
||||
- field: time
|
||||
type: time
|
||||
index: timestamp
|
||||
"#;
|
||||
|
||||
// 1. create pipeline
|
||||
let res = client
|
||||
.post("/v1/events/pipelines/test")
|
||||
.header("Content-Type", "application/x-yaml")
|
||||
.body(body)
|
||||
.send()
|
||||
.await;
|
||||
|
||||
assert_eq!(res.status(), StatusCode::OK);
|
||||
|
||||
let content = res.text().await;
|
||||
|
||||
let content = serde_json::from_str(&content);
|
||||
assert!(content.is_ok());
|
||||
// {"execution_time_ms":13,"pipelines":[{"name":"test","version":"2024-07-04 08:31:00.987136"}]}
|
||||
let content: Value = content.unwrap();
|
||||
|
||||
let execution_time = content.get("execution_time_ms");
|
||||
assert!(execution_time.unwrap().is_number());
|
||||
let pipelines = content.get("pipelines");
|
||||
let pipelines = pipelines.unwrap().as_array().unwrap();
|
||||
assert_eq!(pipelines.len(), 1);
|
||||
let pipeline = pipelines.first().unwrap();
|
||||
assert_eq!(pipeline.get("name").unwrap(), "test");
|
||||
|
||||
// 2. write data
|
||||
let data_body = r#"
|
||||
[
|
||||
{
|
||||
"id1": "2436",
|
||||
"id2": "2528",
|
||||
"logger": "INTERACT.MANAGER",
|
||||
"type": "I",
|
||||
"time": "2024-05-25 20:16:37.217",
|
||||
"log": "ClusterAdapter:enter sendTextDataToCluster\\n"
|
||||
}
|
||||
]
|
||||
"#;
|
||||
let res = client
|
||||
.post("/v1/events/pipelines/dryrun?pipeline_name=test")
|
||||
.header("Content-Type", "application/json")
|
||||
.body(data_body)
|
||||
.send()
|
||||
.await;
|
||||
assert_eq!(res.status(), StatusCode::OK);
|
||||
let body: serde_json::Value = res.json().await;
|
||||
let schema = &body["schema"];
|
||||
let rows = &body["rows"];
|
||||
assert_eq!(
|
||||
schema,
|
||||
&json!([
|
||||
{
|
||||
"colume_type": "FIELD",
|
||||
"data_type": "INT32",
|
||||
"fulltext": false,
|
||||
"name": "id1"
|
||||
},
|
||||
{
|
||||
"colume_type": "FIELD",
|
||||
"data_type": "INT32",
|
||||
"fulltext": false,
|
||||
"name": "id2"
|
||||
},
|
||||
{
|
||||
"colume_type": "FIELD",
|
||||
"data_type": "STRING",
|
||||
"fulltext": false,
|
||||
"name": "type"
|
||||
},
|
||||
{
|
||||
"colume_type": "FIELD",
|
||||
"data_type": "STRING",
|
||||
"fulltext": false,
|
||||
"name": "log"
|
||||
},
|
||||
{
|
||||
"colume_type": "FIELD",
|
||||
"data_type": "STRING",
|
||||
"fulltext": false,
|
||||
"name": "logger"
|
||||
},
|
||||
{
|
||||
"colume_type": "TIMESTAMP",
|
||||
"data_type": "TIMESTAMP_NANOSECOND",
|
||||
"fulltext": false,
|
||||
"name": "time"
|
||||
}
|
||||
])
|
||||
);
|
||||
assert_eq!(
|
||||
rows,
|
||||
&json!([
|
||||
[
|
||||
{
|
||||
"data_type": "INT32",
|
||||
"key": "id1",
|
||||
"semantic_type": "FIELD",
|
||||
"value": 2436
|
||||
},
|
||||
{
|
||||
"data_type": "INT32",
|
||||
"key": "id2",
|
||||
"semantic_type": "FIELD",
|
||||
"value": 2528
|
||||
},
|
||||
{
|
||||
"data_type": "STRING",
|
||||
"key": "type",
|
||||
"semantic_type": "FIELD",
|
||||
"value": "I"
|
||||
},
|
||||
{
|
||||
"data_type": "STRING",
|
||||
"key": "log",
|
||||
"semantic_type": "FIELD",
|
||||
"value": "ClusterAdapter:enter sendTextDataToCluster\\n"
|
||||
},
|
||||
{
|
||||
"data_type": "STRING",
|
||||
"key": "logger",
|
||||
"semantic_type": "FIELD",
|
||||
"value": "INTERACT.MANAGER"
|
||||
},
|
||||
{
|
||||
"data_type": "TIMESTAMP_NANOSECOND",
|
||||
"key": "time",
|
||||
"semantic_type": "TIMESTAMP",
|
||||
"value": "2024-05-25 20:16:37.217+0000"
|
||||
}
|
||||
]
|
||||
])
|
||||
);
|
||||
guard.remove_all().await;
|
||||
}
|
||||
|
||||
pub async fn test_plain_text_ingestion(store_type: StorageType) {
|
||||
common_telemetry::init_default_ut_logging();
|
||||
let (app, mut guard) = setup_test_http_app_with_frontend(store_type, "test_pipeline_api").await;
|
||||
|
||||
@@ -1013,7 +1013,7 @@ async fn prepare_testing_metric_table(cluster: &GreptimeDbCluster) -> TableId {
|
||||
let table = cluster
|
||||
.frontend
|
||||
.catalog_manager()
|
||||
.table(DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME, "phy")
|
||||
.table(DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME, "phy", None)
|
||||
.await
|
||||
.unwrap()
|
||||
.unwrap();
|
||||
@@ -1039,7 +1039,12 @@ async fn prepare_testing_table(cluster: &GreptimeDbCluster) -> TableId {
|
||||
let table = cluster
|
||||
.frontend
|
||||
.catalog_manager()
|
||||
.table(DEFAULT_CATALOG_NAME, DEFAULT_SCHEMA_NAME, TEST_TABLE_NAME)
|
||||
.table(
|
||||
DEFAULT_CATALOG_NAME,
|
||||
DEFAULT_SCHEMA_NAME,
|
||||
TEST_TABLE_NAME,
|
||||
None,
|
||||
)
|
||||
.await
|
||||
.unwrap()
|
||||
.unwrap();
|
||||
|
||||
@@ -54,3 +54,50 @@ drop table test;
|
||||
|
||||
Affected Rows: 0
|
||||
|
||||
-- Append table
|
||||
create table count_where_bug (
|
||||
tag String,
|
||||
ts TimestampMillisecond time index,
|
||||
num Int64,
|
||||
primary key (tag),
|
||||
) engine=mito with('append_mode'='true');
|
||||
|
||||
Affected Rows: 0
|
||||
|
||||
insert into count_where_bug (tag, ts, num)
|
||||
values ('a', '2024-09-06T06:00:01Z', 1),
|
||||
('a', '2024-09-06T06:00:02Z', 2),
|
||||
('a', '2024-09-06T06:00:03Z', 3),
|
||||
('b', '2024-09-06T06:00:04Z', 4),
|
||||
('b', '2024-09-06T06:00:05Z', 5);
|
||||
|
||||
Affected Rows: 5
|
||||
|
||||
select count(1) from count_where_bug where tag = 'b';
|
||||
|
||||
+-----------------+
|
||||
| COUNT(Int64(1)) |
|
||||
+-----------------+
|
||||
| 2 |
|
||||
+-----------------+
|
||||
|
||||
select count(1) from count_where_bug where ts > '2024-09-06T06:00:04Z';
|
||||
|
||||
+-----------------+
|
||||
| COUNT(Int64(1)) |
|
||||
+-----------------+
|
||||
| 1 |
|
||||
+-----------------+
|
||||
|
||||
select count(1) from count_where_bug where num != 3;
|
||||
|
||||
+-----------------+
|
||||
| COUNT(Int64(1)) |
|
||||
+-----------------+
|
||||
| 4 |
|
||||
+-----------------+
|
||||
|
||||
drop table count_where_bug;
|
||||
|
||||
Affected Rows: 0
|
||||
|
||||
|
||||
@@ -17,3 +17,27 @@ select count(*) from (select * from test cross join "HelloWorld");
|
||||
drop table "HelloWorld";
|
||||
|
||||
drop table test;
|
||||
|
||||
-- Append table
|
||||
|
||||
create table count_where_bug (
|
||||
tag String,
|
||||
ts TimestampMillisecond time index,
|
||||
num Int64,
|
||||
primary key (tag),
|
||||
) engine=mito with('append_mode'='true');
|
||||
|
||||
insert into count_where_bug (tag, ts, num)
|
||||
values ('a', '2024-09-06T06:00:01Z', 1),
|
||||
('a', '2024-09-06T06:00:02Z', 2),
|
||||
('a', '2024-09-06T06:00:03Z', 3),
|
||||
('b', '2024-09-06T06:00:04Z', 4),
|
||||
('b', '2024-09-06T06:00:05Z', 5);
|
||||
|
||||
select count(1) from count_where_bug where tag = 'b';
|
||||
|
||||
select count(1) from count_where_bug where ts > '2024-09-06T06:00:04Z';
|
||||
|
||||
select count(1) from count_where_bug where num != 3;
|
||||
|
||||
drop table count_where_bug;
|
||||
|
||||
@@ -18,7 +18,6 @@ show databases;
|
||||
| greptime_private |
|
||||
| illegal-database |
|
||||
| information_schema |
|
||||
| pg_catalog |
|
||||
| public |
|
||||
+--------------------+
|
||||
|
||||
|
||||
@@ -10,7 +10,6 @@ SHOW DATABASES;
|
||||
| greptime_private |
|
||||
| information_schema |
|
||||
| mydb |
|
||||
| pg_catalog |
|
||||
| public |
|
||||
+--------------------+
|
||||
|
||||
@@ -22,7 +21,6 @@ SHOW FULL DATABASES;
|
||||
| greptime_private | |
|
||||
| information_schema | |
|
||||
| mydb | ttl='1h' |
|
||||
| pg_catalog | |
|
||||
| public | |
|
||||
+--------------------+----------+
|
||||
|
||||
@@ -78,7 +76,6 @@ SHOW DATABASES;
|
||||
+--------------------+
|
||||
| greptime_private |
|
||||
| information_schema |
|
||||
| pg_catalog |
|
||||
| public |
|
||||
+--------------------+
|
||||
|
||||
|
||||
@@ -24,16 +24,13 @@ Affected Rows: 0
|
||||
|
||||
select table_catalog, table_schema, table_name from information_schema.tables where table_schema != 'information_schema';
|
||||
|
||||
+---------------+--------------+--------------+
|
||||
| table_catalog | table_schema | table_name |
|
||||
+---------------+--------------+--------------+
|
||||
| greptime | abc | t |
|
||||
| greptime | abcde | t |
|
||||
| greptime | pg_catalog | pg_class |
|
||||
| greptime | pg_catalog | pg_type |
|
||||
| greptime | pg_catalog | pg_namespace |
|
||||
| greptime | public | numbers |
|
||||
+---------------+--------------+--------------+
|
||||
+---------------+--------------+------------+
|
||||
| table_catalog | table_schema | table_name |
|
||||
+---------------+--------------+------------+
|
||||
| greptime | abc | t |
|
||||
| greptime | abcde | t |
|
||||
| greptime | public | numbers |
|
||||
+---------------+--------------+------------+
|
||||
|
||||
use public;
|
||||
|
||||
|
||||
@@ -5,7 +5,6 @@ SHOW DATABASES;
|
||||
+--------------------+
|
||||
| greptime_private |
|
||||
| information_schema |
|
||||
| pg_catalog |
|
||||
| public |
|
||||
+--------------------+
|
||||
|
||||
@@ -16,7 +15,6 @@ SHOW FULL DATABASES;
|
||||
+--------------------+---------+
|
||||
| greptime_private | |
|
||||
| information_schema | |
|
||||
| pg_catalog | |
|
||||
| public | |
|
||||
+--------------------+---------+
|
||||
|
||||
|
||||
@@ -45,9 +45,6 @@ order by table_schema, table_name;
|
||||
|greptime|information_schema|tables|LOCALTEMPORARY|3|0|0|0|0|0||11|Fixed|0|0|0|DATETIME|||utf8_bin|0|||Y|
|
||||
|greptime|information_schema|triggers|LOCALTEMPORARY|24|0|0|0|0|0||11|Fixed|0|0|0|DATETIME|||utf8_bin|0|||Y|
|
||||
|greptime|information_schema|views|LOCALTEMPORARY|32|0|0|0|0|0||11|Fixed|0|0|0|DATETIME|||utf8_bin|0|||Y|
|
||||
|greptime|pg_catalog|pg_class|LOCALTEMPORARY|256|0|0|0|0|0||11|Fixed|0|0|0|DATETIME|||utf8_bin|0|||Y|
|
||||
|greptime|pg_catalog|pg_namespace|LOCALTEMPORARY|258|0|0|0|0|0||11|Fixed|0|0|0|DATETIME|||utf8_bin|0|||Y|
|
||||
|greptime|pg_catalog|pg_type|LOCALTEMPORARY|257|0|0|0|0|0||11|Fixed|0|0|0|DATETIME|||utf8_bin|0|||Y|
|
||||
|greptime|public|numbers|LOCALTEMPORARY|2|0|0|0|0|0|test_engine|11|Fixed|0|0|0|DATETIME|||utf8_bin|0|||Y|
|
||||
+++++++++++++++++++++++++
|
||||
|
||||
@@ -413,16 +410,6 @@ select * from information_schema.columns order by table_schema, table_name, colu
|
||||
| greptime | information_schema | views | table_name | 3 | 2147483647 | 2147483647 | | | | utf8 | utf8_bin | | | select,insert | | String | string | FIELD | | No | string | | |
|
||||
| greptime | information_schema | views | table_schema | 2 | 2147483647 | 2147483647 | | | | utf8 | utf8_bin | | | select,insert | | String | string | FIELD | | No | string | | |
|
||||
| greptime | information_schema | views | view_definition | 4 | 2147483647 | 2147483647 | | | | utf8 | utf8_bin | | | select,insert | | String | string | FIELD | | No | string | | |
|
||||
| greptime | pg_catalog | pg_class | oid | 1 | | | 10 | 0 | | | | | | select,insert | | UInt32 | int unsigned | FIELD | | No | int unsigned | | |
|
||||
| greptime | pg_catalog | pg_class | relkind | 4 | 2147483647 | 2147483647 | | | | utf8 | utf8_bin | | | select,insert | | String | string | FIELD | | No | string | | |
|
||||
| greptime | pg_catalog | pg_class | relname | 2 | 2147483647 | 2147483647 | | | | utf8 | utf8_bin | | | select,insert | | String | string | FIELD | | No | string | | |
|
||||
| greptime | pg_catalog | pg_class | relnamespace | 3 | | | 10 | 0 | | | | | | select,insert | | UInt32 | int unsigned | FIELD | | No | int unsigned | | |
|
||||
| greptime | pg_catalog | pg_class | relowner | 5 | | | 10 | 0 | | | | | | select,insert | | UInt32 | int unsigned | FIELD | | No | int unsigned | | |
|
||||
| greptime | pg_catalog | pg_namespace | nspname | 2 | 2147483647 | 2147483647 | | | | utf8 | utf8_bin | | | select,insert | | String | string | FIELD | | No | string | | |
|
||||
| greptime | pg_catalog | pg_namespace | oid | 1 | | | 10 | 0 | | | | | | select,insert | | UInt32 | int unsigned | FIELD | | No | int unsigned | | |
|
||||
| greptime | pg_catalog | pg_type | oid | 1 | | | 10 | 0 | | | | | | select,insert | | UInt32 | int unsigned | FIELD | | No | int unsigned | | |
|
||||
| greptime | pg_catalog | pg_type | typlen | 3 | | | 5 | 0 | | | | | | select,insert | | Int16 | smallint | FIELD | | No | smallint | | |
|
||||
| greptime | pg_catalog | pg_type | typname | 2 | 2147483647 | 2147483647 | | | | utf8 | utf8_bin | | | select,insert | | String | string | FIELD | | No | string | | |
|
||||
| greptime | public | numbers | number | 1 | | | 10 | 0 | | | | PRI | | select,insert | | UInt32 | int unsigned | TAG | | No | int unsigned | | |
|
||||
+---------------+--------------------+---------------------------------------+-----------------------------------+------------------+--------------------------+------------------------+-------------------+---------------+--------------------+--------------------+----------------+------------+-------+---------------+-----------------------+----------------------+-----------------+---------------+----------------+-------------+-----------------+----------------+--------+
|
||||
|
||||
@@ -596,7 +583,6 @@ select * from schemata where catalog_name = 'greptime' and schema_name != 'publi
|
||||
+--------------+--------------------+----------------------------+------------------------+----------+---------+
|
||||
| greptime | greptime_private | utf8 | utf8_bin | | |
|
||||
| greptime | information_schema | utf8 | utf8_bin | | |
|
||||
| greptime | pg_catalog | utf8 | utf8_bin | | |
|
||||
+--------------+--------------------+----------------------------+------------------------+----------+---------+
|
||||
|
||||
-- test engines
|
||||
|
||||
@@ -5,30 +5,7 @@ Error: 1004(InvalidArguments), Schema pg_catalog already exists
|
||||
|
||||
select * from pg_catalog.pg_type order by oid;
|
||||
|
||||
+-----+-----------+--------+
|
||||
| oid | typname | typlen |
|
||||
+-----+-----------+--------+
|
||||
| 1 | String | -1 |
|
||||
| 2 | Binary | -1 |
|
||||
| 3 | Int8 | 1 |
|
||||
| 4 | Int16 | 2 |
|
||||
| 5 | Int32 | 4 |
|
||||
| 6 | Int64 | 8 |
|
||||
| 7 | UInt8 | 1 |
|
||||
| 8 | UInt16 | 2 |
|
||||
| 9 | UInt32 | 4 |
|
||||
| 10 | UInt64 | 8 |
|
||||
| 11 | Float32 | 4 |
|
||||
| 12 | Float64 | 8 |
|
||||
| 13 | Decimal | 16 |
|
||||
| 14 | Date | 4 |
|
||||
| 15 | DateTime | 8 |
|
||||
| 16 | Timestamp | 8 |
|
||||
| 17 | Time | 8 |
|
||||
| 18 | Duration | 8 |
|
||||
| 19 | Interval | 16 |
|
||||
| 20 | List | -1 |
|
||||
+-----+-----------+--------+
|
||||
Error: 4001(TableNotFound), Failed to plan SQL: Table not found: greptime.pg_catalog.pg_type
|
||||
|
||||
-- \d
|
||||
SELECT n.nspname as "Schema",
|
||||
@@ -44,11 +21,7 @@ WHERE c.relkind IN ('r','p','v','m','S','f','')
|
||||
AND pg_catalog.pg_table_is_visible(c.oid)
|
||||
ORDER BY 1,2;
|
||||
|
||||
+--------+---------+-------+-------+
|
||||
| Schema | Name | Type | Owner |
|
||||
+--------+---------+-------+-------+
|
||||
| public | numbers | table | |
|
||||
+--------+---------+-------+-------+
|
||||
Error: 4001(TableNotFound), Failed to plan SQL: Table not found: greptime.pg_catalog.pg_class
|
||||
|
||||
-- \dt
|
||||
SELECT n.nspname as "Schema",
|
||||
@@ -64,11 +37,7 @@ WHERE c.relkind IN ('r','p','')
|
||||
AND pg_catalog.pg_table_is_visible(c.oid)
|
||||
ORDER BY 1,2;
|
||||
|
||||
+--------+---------+-------+-------+
|
||||
| Schema | Name | Type | Owner |
|
||||
+--------+---------+-------+-------+
|
||||
| public | numbers | table | |
|
||||
+--------+---------+-------+-------+
|
||||
Error: 4001(TableNotFound), Failed to plan SQL: Table not found: greptime.pg_catalog.pg_class
|
||||
|
||||
-- make sure oid of namespace keep stable
|
||||
SELECT * FROM pg_namespace ORDER BY oid;
|
||||
@@ -100,11 +69,7 @@ where relnamespace = (
|
||||
where nspname = 'my_db'
|
||||
);
|
||||
|
||||
+---------+
|
||||
| relname |
|
||||
+---------+
|
||||
| foo |
|
||||
+---------+
|
||||
Error: 4001(TableNotFound), Failed to plan SQL: Table not found: greptime.pg_catalog.pg_class
|
||||
|
||||
-- \dt
|
||||
SELECT n.nspname as "Schema",
|
||||
@@ -120,12 +85,7 @@ WHERE c.relkind IN ('r','p','')
|
||||
AND pg_catalog.pg_table_is_visible(c.oid)
|
||||
ORDER BY 1,2;
|
||||
|
||||
+--------+---------+-------+-------+
|
||||
| Schema | Name | Type | Owner |
|
||||
+--------+---------+-------+-------+
|
||||
| my_db | foo | table | |
|
||||
| public | numbers | table | |
|
||||
+--------+---------+-------+-------+
|
||||
Error: 4001(TableNotFound), Failed to plan SQL: Table not found: greptime.pg_catalog.pg_class
|
||||
|
||||
-- show tables in `my_db`, `public`
|
||||
select relname
|
||||
@@ -137,12 +97,7 @@ where relnamespace in (
|
||||
)
|
||||
order by relname;
|
||||
|
||||
+---------+
|
||||
| relname |
|
||||
+---------+
|
||||
| foo |
|
||||
| numbers |
|
||||
+---------+
|
||||
Error: 4001(TableNotFound), Failed to plan SQL: Table not found: greptime.pg_catalog.pg_class
|
||||
|
||||
select relname
|
||||
from pg_catalog.pg_class
|
||||
@@ -152,11 +107,7 @@ where relnamespace in (
|
||||
where nspname like 'my%'
|
||||
);
|
||||
|
||||
+---------+
|
||||
| relname |
|
||||
+---------+
|
||||
| foo |
|
||||
+---------+
|
||||
Error: 4001(TableNotFound), Failed to plan SQL: Table not found: greptime.pg_catalog.pg_class
|
||||
|
||||
select relnamespace, relname, relkind
|
||||
from pg_catalog.pg_class
|
||||
@@ -169,11 +120,7 @@ where relnamespace in (
|
||||
)
|
||||
order by relnamespace, relname;
|
||||
|
||||
+--------------+---------+---------+
|
||||
| relnamespace | relname | relkind |
|
||||
+--------------+---------+---------+
|
||||
| 434869349 | foo | r |
|
||||
+--------------+---------+---------+
|
||||
Error: 4001(TableNotFound), Failed to plan SQL: Table not found: greptime.pg_catalog.pg_class
|
||||
|
||||
use public;
|
||||
|
||||
@@ -190,24 +137,11 @@ Affected Rows: 0
|
||||
-- pg_class
|
||||
desc table pg_class;
|
||||
|
||||
+--------------+--------+-----+------+---------+---------------+
|
||||
| Column | Type | Key | Null | Default | Semantic Type |
|
||||
+--------------+--------+-----+------+---------+---------------+
|
||||
| oid | UInt32 | | NO | | FIELD |
|
||||
| relname | String | | NO | | FIELD |
|
||||
| relnamespace | UInt32 | | NO | | FIELD |
|
||||
| relkind | String | | NO | | FIELD |
|
||||
| relowner | UInt32 | | NO | | FIELD |
|
||||
+--------------+--------+-----+------+---------+---------------+
|
||||
Error: 4001(TableNotFound), Table not found: pg_class
|
||||
|
||||
desc table pg_namespace;
|
||||
|
||||
+---------+--------+-----+------+---------+---------------+
|
||||
| Column | Type | Key | Null | Default | Semantic Type |
|
||||
+---------+--------+-----+------+---------+---------------+
|
||||
| oid | UInt32 | | NO | | FIELD |
|
||||
| nspname | String | | NO | | FIELD |
|
||||
+---------+--------+-----+------+---------+---------------+
|
||||
Error: 4001(TableNotFound), Table not found: pg_namespace
|
||||
|
||||
drop table my_db.foo;
|
||||
|
||||
|
||||
@@ -77,11 +77,7 @@ WHERE c.relkind IN ('v','')
|
||||
AND pg_catalog.pg_table_is_visible(c.oid)
|
||||
ORDER BY 1,2;
|
||||
|
||||
+--------+-----------+------+-------+
|
||||
| Schema | Name | Type | Owner |
|
||||
+--------+-----------+------+-------+
|
||||
| public | test_view | view | |
|
||||
+--------+-----------+------+-------+
|
||||
Error: 4001(TableNotFound), Failed to plan SQL: Table not found: greptime.pg_catalog.pg_class
|
||||
|
||||
-- SQLNESS REPLACE (\s\d+\s) ID
|
||||
-- SQLNESS REPLACE (\s[\-0-9T:\.]{15,}) DATETIME
|
||||
@@ -110,9 +106,6 @@ SELECT * FROM INFORMATION_SCHEMA.TABLES ORDER BY TABLE_NAME, TABLE_TYPE;
|
||||
|greptime|information_schema|optimizer_trace|LOCALTEMPORARY|ID|ID|ID|ID|ID|ID||ID|Fixed|ID|ID|ID|DATETIME|||utf8_bin|ID|||Y|
|
||||
|greptime|information_schema|parameters|LOCALTEMPORARY|ID|ID|ID|ID|ID|ID||ID|Fixed|ID|ID|ID|DATETIME|||utf8_bin|ID|||Y|
|
||||
|greptime|information_schema|partitions|LOCALTEMPORARY|ID|ID|ID|ID|ID|ID||ID|Fixed|ID|ID|ID|DATETIME|||utf8_bin|ID|||Y|
|
||||
|greptime|pg_catalog|pg_class|LOCALTEMPORARY|ID|ID|ID|ID|ID|ID||ID|Fixed|ID|ID|ID|DATETIME|||utf8_bin|ID|||Y|
|
||||
|greptime|pg_catalog|pg_namespace|LOCALTEMPORARY|ID|ID|ID|ID|ID|ID||ID|Fixed|ID|ID|ID|DATETIME|||utf8_bin|ID|||Y|
|
||||
|greptime|pg_catalog|pg_type|LOCALTEMPORARY|ID|ID|ID|ID|ID|ID||ID|Fixed|ID|ID|ID|DATETIME|||utf8_bin|ID|||Y|
|
||||
|greptime|information_schema|profiling|LOCALTEMPORARY|ID|ID|ID|ID|ID|ID||ID|Fixed|ID|ID|ID|DATETIME|||utf8_bin|ID|||Y|
|
||||
|greptime|information_schema|referential_constraints|LOCALTEMPORARY|ID|ID|ID|ID|ID|ID||ID|Fixed|ID|ID|ID|DATETIME|||utf8_bin|ID|||Y|
|
||||
|greptime|information_schema|region_peers|LOCALTEMPORARY|ID|ID|ID|ID|ID|ID||ID|Fixed|ID|ID|ID|DATETIME|||utf8_bin|ID|||Y|
|
||||
@@ -205,6 +198,5 @@ WHERE c.relkind IN ('v','')
|
||||
AND pg_catalog.pg_table_is_visible(c.oid)
|
||||
ORDER BY 1,2;
|
||||
|
||||
++
|
||||
++
|
||||
Error: 4001(TableNotFound), Failed to plan SQL: Table not found: greptime.pg_catalog.pg_class