diff --git a/Cargo.lock b/Cargo.lock index 111f7852a8..1511654c7e 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2996,9 +2996,9 @@ dependencies = [ [[package]] name = "crc" -version = "3.2.1" +version = "3.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "69e6e4d7b33a94f0991c26729976b10ebde1d34c3ee82408fb536164fa10d636" +checksum = "9710d3b3739c2e349eb44fe848ad0b7c8cb1e42bd87ee49371df2f7acaf3e675" dependencies = [ "crc-catalog", ] @@ -3830,7 +3830,7 @@ dependencies = [ "jsonb", "num", "num-traits", - "ordered-float 3.9.2", + "ordered-float 4.3.0", "paste", "serde", "serde_json", @@ -4151,12 +4151,16 @@ dependencies = [ [[package]] name = "domain" -version = "0.10.4" +version = "0.11.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4c84070523f8ba0f9127ff156920f27eb27b302b425efe60bf5f41ec244d1c60" +checksum = "a11dd7f04a6a6d2aea0153c6e31f5ea7af8b2efdf52cdaeea7a9a592c7fefef9" dependencies = [ + "bumpalo", "bytes", + "domain-macros", "futures-util", + "hashbrown 0.14.5", + "log", "moka", "octseq", "rand 0.8.5", @@ -4167,6 +4171,17 @@ dependencies = [ "tracing", ] +[[package]] +name = "domain-macros" +version = "0.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0e197fdfd2cdb5fdeb7f8ddcf3aed5d5d04ecde2890d448b14ffb716f7376b70" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.100", +] + [[package]] name = "dotenv" version = "0.15.0" @@ -8566,17 +8581,6 @@ dependencies = [ "num-traits", ] -[[package]] -name = "ordered-float" -version = "3.9.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f1e1c390732d15f1d48471625cd92d154e66db2c56645e29a9cd26f4699f72dc" -dependencies = [ - "num-traits", - "rand 0.8.5", - "serde", -] - [[package]] name = "ordered-float" version = "4.3.0" @@ -8584,6 +8588,8 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "44d501f1a72f71d3c063a6bbc8f7271fa73aa09fe5d6283b6571e2ed176a2537" dependencies = [ "num-traits", + "rand 0.8.5", + "serde", ] [[package]] @@ -9120,6 +9126,7 @@ dependencies = [ "moka", "once_cell", "operator", + "ordered-float 4.3.0", "paste", "prometheus", "query", @@ -11368,6 +11375,7 @@ dependencies = [ "tracing", "urlencoding", "uuid", + "vrl", "zstd 0.13.2", ] @@ -13030,9 +13038,9 @@ checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" [[package]] name = "tokio" -version = "1.44.2" +version = "1.45.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e6b88822cbe49de4185e3a4cbf8321dd487cf5fe0c5c65695fef6346371e9c48" +checksum = "75ef51a33ef1da925cea3e4eb122833cb377c61439ca401b770f54902b806779" dependencies = [ "backtrace", "bytes", @@ -13988,9 +13996,9 @@ dependencies = [ [[package]] name = "vrl" -version = "0.24.0" +version = "0.25.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f9ceadaa40aef567a26079ff014ca7a567ba85344f1b81090b5ec7d7bb16a219" +checksum = "4f49394b948406ea1564aa00152e011d87a38ad35d277ebddda257a9ee39c419" dependencies = [ "aes", "aes-siv", diff --git a/Cargo.toml b/Cargo.toml index f6c0cfe0cf..a61412f621 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -167,6 +167,7 @@ opentelemetry-proto = { version = "0.27", features = [ "with-serde", "logs", ] } +ordered-float = { version = "4.3", features = ["serde"] } parking_lot = "0.12" parquet = { version = "54.2", default-features = false, features = ["arrow", "async", "object_store"] } paste = "1.0" @@ -228,6 +229,7 @@ tracing-appender = "0.2" tracing-subscriber = { version = "0.3", features = ["env-filter", "json", "fmt"] } typetag = "0.2" uuid = { version = "1.7", features = ["serde", "v4", "fast-rng"] } +vrl = "0.25" zstd = "0.13" # DO_NOT_REMOVE_THIS: END_OF_EXTERNAL_DEPENDENCIES diff --git a/src/datatypes/Cargo.toml b/src/datatypes/Cargo.toml index cccaf67300..e64402afd5 100644 --- a/src/datatypes/Cargo.toml +++ b/src/datatypes/Cargo.toml @@ -28,7 +28,7 @@ greptime-proto.workspace = true jsonb.workspace = true num = "0.4" num-traits = "0.2" -ordered-float = { version = "3.0", features = ["serde"] } +ordered-float.workspace = true paste.workspace = true serde.workspace = true serde_json.workspace = true diff --git a/src/pipeline/Cargo.toml b/src/pipeline/Cargo.toml index aa72850887..4b9f13efd4 100644 --- a/src/pipeline/Cargo.toml +++ b/src/pipeline/Cargo.toml @@ -47,6 +47,7 @@ lazy_static.workspace = true moka = { workspace = true, features = ["sync"] } once_cell.workspace = true operator.workspace = true +ordered-float.workspace = true paste.workspace = true prometheus.workspace = true query.workspace = true @@ -59,7 +60,7 @@ sql.workspace = true table.workspace = true tokio.workspace = true urlencoding = "2.1" -vrl = "0.24" +vrl.workspace = true yaml-rust = "0.4" [dev-dependencies] diff --git a/src/pipeline/benches/processor.rs b/src/pipeline/benches/processor.rs index 3f11cc39d1..a2ef070786 100644 --- a/src/pipeline/benches/processor.rs +++ b/src/pipeline/benches/processor.rs @@ -16,23 +16,21 @@ use std::sync::Arc; use criterion::{black_box, criterion_group, criterion_main, Criterion}; use pipeline::error::Result; -use pipeline::{ - json_to_map, parse, setup_pipeline, Content, Pipeline, PipelineContext, SchemaInfo, -}; -use serde_json::{Deserializer, Value}; +use pipeline::{parse, setup_pipeline, Content, Pipeline, PipelineContext, SchemaInfo}; +use serde_json::Deserializer; +use vrl::value::Value as VrlValue; fn processor_mut( pipeline: Arc, pipeline_ctx: &PipelineContext<'_>, schema_info: &mut SchemaInfo, - input_values: Vec, + input_values: Vec, ) -> Result> { let mut result = Vec::with_capacity(input_values.len()); for v in input_values { - let payload = json_to_map(v).unwrap(); let r = pipeline - .exec_mut(payload, pipeline_ctx, schema_info)? + .exec_mut(v, pipeline_ctx, schema_info)? .into_transformed() .expect("expect transformed result "); result.push(r.0); @@ -237,7 +235,7 @@ transform: fn criterion_benchmark(c: &mut Criterion) { let input_value_str = include_str!("./data.log"); let input_value = Deserializer::from_str(input_value_str) - .into_iter::() + .into_iter::() .collect::, _>>() .unwrap(); let pipeline = prepare_pipeline(); diff --git a/src/pipeline/src/dispatcher.rs b/src/pipeline/src/dispatcher.rs index 2213b85d58..b4d9411486 100644 --- a/src/pipeline/src/dispatcher.rs +++ b/src/pipeline/src/dispatcher.rs @@ -14,6 +14,7 @@ use common_telemetry::debug; use snafu::OptionExt; +use vrl::value::Value as VrlValue; use yaml_rust::Yaml; use crate::error::{ @@ -21,7 +22,7 @@ use crate::error::{ ValueRequiredForDispatcherRuleSnafu, }; use crate::etl::ctx_req::TABLE_SUFFIX_KEY; -use crate::Value; +use crate::etl::value::yaml_to_vrl_value; const FIELD: &str = "field"; const PIPELINE: &str = "pipeline"; @@ -62,7 +63,7 @@ pub(crate) struct Dispatcher { /// name #[derive(Debug, PartialEq)] pub(crate) struct Rule { - pub value: Value, + pub value: VrlValue, pub table_suffix: String, pub pipeline: Option, } @@ -90,7 +91,8 @@ impl TryFrom<&Yaml> for Dispatcher { if rule[VALUE].is_badvalue() { ValueRequiredForDispatcherRuleSnafu.fail()?; } - let value = Value::try_from(&rule[VALUE])?; + + let value = yaml_to_vrl_value(&rule[VALUE])?; Ok(Rule { value, @@ -109,8 +111,9 @@ impl TryFrom<&Yaml> for Dispatcher { impl Dispatcher { /// execute dispatcher and returns matched rule if any - pub(crate) fn exec(&self, data: &Value) -> Option<&Rule> { - if let Some(value) = data.get(&self.field) { + pub(crate) fn exec(&self, data: &VrlValue) -> Option<&Rule> { + let data = data.as_object()?; + if let Some(value) = data.get(self.field.as_str()) { for rule in &self.rules { if rule.value == *value { return Some(rule); diff --git a/src/pipeline/src/error.rs b/src/pipeline/src/error.rs index 1ded1064f9..a42adb2d9f 100644 --- a/src/pipeline/src/error.rs +++ b/src/pipeline/src/error.rs @@ -62,7 +62,7 @@ pub enum Error { #[snafu(display("Processor {processor}: expect string value, but got {v:?}"))] ProcessorExpectString { processor: String, - v: crate::Value, + v: vrl::value::Value, #[snafu(implicit)] location: Location, }, @@ -229,12 +229,6 @@ pub enum Error { location: Location, }, - #[snafu(display("Failed to get timestamp"))] - DateFailedToGetTimestamp { - #[snafu(implicit)] - location: Location, - }, - #[snafu(display("Invalid Pattern: '{s}'. {detail}"))] DissectInvalidPattern { s: String, @@ -372,13 +366,6 @@ pub enum Error { #[snafu(implicit)] location: Location, }, - #[snafu(display("Url decoding error"))] - UrlEncodingDecode { - #[snafu(source)] - error: std::string::FromUtf8Error, - #[snafu(implicit)] - location: Location, - }, #[snafu(display("Invalid transform on_failure value: {value}"))] TransformOnFailureInvalidValue { value: String, @@ -433,17 +420,6 @@ pub enum Error { #[snafu(implicit)] location: Location, }, - #[snafu(display("Null type not supported"))] - CoerceUnsupportedNullType { - #[snafu(implicit)] - location: Location, - }, - #[snafu(display("Null type not supported when to coerce '{ty}' type"))] - CoerceUnsupportedNullTypeTo { - ty: String, - #[snafu(implicit)] - location: Location, - }, #[snafu(display("Type: {ty} value not supported for Epoch"))] CoerceUnsupportedEpochType { ty: String, @@ -556,12 +532,6 @@ pub enum Error { #[snafu(implicit)] location: Location, }, - #[snafu(display("Input value must be an object"))] - InputValueMustBeObject { - #[snafu(implicit)] - location: Location, - }, - #[snafu(display("Column options error"))] ColumnOptions { #[snafu(source)] @@ -575,12 +545,6 @@ pub enum Error { #[snafu(implicit)] location: Location, }, - #[snafu(display("Unsupported number type: {value:?}"))] - UnsupportedNumberType { - value: serde_json::Number, - #[snafu(implicit)] - location: Location, - }, #[snafu(display("Failed to parse json"))] JsonParse { #[snafu(source)] @@ -694,14 +658,6 @@ pub enum Error { #[snafu(implicit)] location: Location, }, - - #[snafu(display("Float is not a number: {}", input_float))] - FloatNaN { - input_float: f64, - #[snafu(implicit)] - location: Location, - }, - #[snafu(display("Invalid timestamp value: {}", input))] InvalidTimestamp { input: String, @@ -709,14 +665,13 @@ pub enum Error { location: Location, }, - #[snafu(display("Failed to convert bytes to utf8"))] - BytesToUtf8 { - #[snafu(source)] - error: std::string::FromUtf8Error, + #[snafu(display("Invalid epoch value '{}' for resolution '{}'", value, resolution))] + InvalidEpochForResolution { + value: i64, + resolution: String, #[snafu(implicit)] location: Location, }, - #[snafu(display("Please don't use regex in Vrl script"))] VrlRegexValue { #[snafu(implicit)] @@ -808,6 +763,21 @@ pub enum Error { #[snafu(implicit)] location: Location, }, + + #[snafu(display("Float is NaN"))] + FloatIsNan { + #[snafu(source)] + error: ordered_float::FloatIsNan, + #[snafu(implicit)] + location: Location, + }, + + #[snafu(display("Unsupported type in pipeline: {}", ty))] + UnsupportedTypeInPipeline { + ty: String, + #[snafu(implicit)] + location: Location, + }, } pub type Result = std::result::Result; @@ -858,7 +828,6 @@ impl ErrorExt for Error { | DateParseTimezone { .. } | DateParse { .. } | DateFailedToGetLocalTimezone { .. } - | DateFailedToGetTimestamp { .. } | DissectInvalidPattern { .. } | DissectEmptyPattern { .. } | DissectSplitExceedsInput { .. } @@ -881,7 +850,6 @@ impl ErrorExt for Error { | RegexNoValidPattern { .. } | UrlEncodingInvalidMethod { .. } | DigestPatternInvalid { .. } - | UrlEncodingDecode { .. } | TransformOnFailureInvalidValue { .. } | TransformElementMustBeMap { .. } | TransformFieldMustBeSet { .. } @@ -891,8 +859,6 @@ impl ErrorExt for Error { | TransformTimestampIndexCount { .. } | AutoTransformOneTimestamp { .. } | InvalidVersionNumber { .. } - | CoerceUnsupportedNullType { .. } - | CoerceUnsupportedNullTypeTo { .. } | CoerceUnsupportedEpochType { .. } | CoerceStringToType { .. } | CoerceJsonTypeTo { .. } @@ -908,10 +874,8 @@ impl ErrorExt for Error { | ValueYamlKeyMustBeString { .. } | YamlLoad { .. } | YamlParse { .. } - | InputValueMustBeObject { .. } | ColumnOptions { .. } | UnsupportedIndexType { .. } - | UnsupportedNumberType { .. } | IdentifyPipelineColumnTypeMismatch { .. } | JsonParse { .. } | JsonPathParse { .. } @@ -924,12 +888,14 @@ impl ErrorExt for Error { | InvalidTableSuffixTemplate { .. } | CompileVrl { .. } | ExecuteVrl { .. } - | FloatNaN { .. } - | BytesToUtf8 { .. } | InvalidTimestamp { .. } | VrlRegexValue { .. } | VrlReturnValue { .. } | PipelineMissing { .. } => StatusCode::InvalidArguments, + + FloatIsNan { .. } + | InvalidEpochForResolution { .. } + | UnsupportedTypeInPipeline { .. } => StatusCode::InvalidArguments, } } diff --git a/src/pipeline/src/etl.rs b/src/pipeline/src/etl.rs index 40210e5662..4896c3aaea 100644 --- a/src/pipeline/src/etl.rs +++ b/src/pipeline/src/etl.rs @@ -19,21 +19,19 @@ pub mod processor; pub mod transform; pub mod value; -use std::collections::BTreeMap; - use api::v1::Row; use common_time::timestamp::TimeUnit; use itertools::Itertools; use processor::{Processor, Processors}; use snafu::{ensure, OptionExt, ResultExt}; use transform::Transforms; -use value::Value; +use vrl::core::Value as VrlValue; use yaml_rust::{Yaml, YamlLoader}; use crate::dispatcher::{Dispatcher, Rule}; use crate::error::{ - AutoTransformOneTimestampSnafu, Error, InputValueMustBeObjectSnafu, IntermediateKeyIndexSnafu, - InvalidVersionNumberSnafu, Result, YamlLoadSnafu, YamlParseSnafu, + AutoTransformOneTimestampSnafu, Error, IntermediateKeyIndexSnafu, InvalidVersionNumberSnafu, + Result, YamlLoadSnafu, YamlParseSnafu, }; use crate::etl::processor::ProcessorKind; use crate::etl::transform::transformer::greptime::values_to_row; @@ -228,7 +226,7 @@ impl DispatchedTo { #[derive(Debug)] pub enum PipelineExecOutput { Transformed(TransformedOutput), - DispatchedTo(DispatchedTo, Value), + DispatchedTo(DispatchedTo, VrlValue), } #[derive(Debug)] @@ -261,40 +259,6 @@ impl PipelineExecOutput { } } -pub fn json_to_map(val: serde_json::Value) -> Result { - match val { - serde_json::Value::Object(map) => { - let mut intermediate_state = BTreeMap::new(); - for (k, v) in map { - intermediate_state.insert(k, Value::try_from(v)?); - } - Ok(Value::Map(intermediate_state.into())) - } - _ => InputValueMustBeObjectSnafu.fail(), - } -} - -pub fn json_array_to_map(val: Vec) -> Result> { - val.into_iter().map(json_to_map).collect() -} - -pub fn simd_json_to_map(val: simd_json::OwnedValue) -> Result { - match val { - simd_json::OwnedValue::Object(map) => { - let mut intermediate_state = BTreeMap::new(); - for (k, v) in map.into_iter() { - intermediate_state.insert(k, Value::try_from(v)?); - } - Ok(Value::Map(intermediate_state.into())) - } - _ => InputValueMustBeObjectSnafu.fail(), - } -} - -pub fn simd_json_array_to_map(val: Vec) -> Result> { - val.into_iter().map(simd_json_to_map).collect() -} - impl Pipeline { fn is_v1(&self) -> bool { self.doc_version == PipelineDocVersion::V1 @@ -302,7 +266,7 @@ impl Pipeline { pub fn exec_mut( &self, - mut val: Value, + mut val: VrlValue, pipeline_ctx: &PipelineContext<'_>, schema_info: &mut SchemaInfo, ) -> Result { @@ -409,11 +373,14 @@ macro_rules! setup_pipeline { } #[cfg(test)] mod tests { + use std::collections::BTreeMap; use std::sync::Arc; use api::v1::Rows; use greptime_proto::v1::value::ValueData; use greptime_proto::v1::{self, ColumnDataType, SemanticType}; + use vrl::prelude::Bytes; + use vrl::value::KeyString; use super::*; @@ -454,7 +421,7 @@ transform: session::context::Channel::Unknown, ); - let payload = json_to_map(input_value).unwrap(); + let payload = input_value.into(); let result = pipeline .exec_mut(payload, &pipeline_ctx, &mut schema_info) .unwrap() @@ -515,9 +482,10 @@ transform: &pipeline_param, session::context::Channel::Unknown, ); - let mut payload = BTreeMap::new(); - payload.insert("message".to_string(), Value::String(message)); - let payload = Value::Map(payload.into()); + let payload = VrlValue::Object(BTreeMap::from([( + KeyString::from("message"), + VrlValue::Bytes(Bytes::from(message)), + )])); let result = pipeline .exec_mut(payload, &pipeline_ctx, &mut schema_info) @@ -613,7 +581,7 @@ transform: session::context::Channel::Unknown, ); - let payload = json_to_map(input_value).unwrap(); + let payload = input_value.into(); let result = pipeline .exec_mut(payload, &pipeline_ctx, &mut schema_info) .unwrap() @@ -666,7 +634,7 @@ transform: session::context::Channel::Unknown, ); let schema = pipeline.schemas().unwrap().clone(); - let result = json_to_map(input_value).unwrap(); + let result = input_value.into(); let row = pipeline .exec_mut(result, &pipeline_ctx, &mut schema_info) @@ -732,7 +700,7 @@ transform: assert_eq!( dispatcher.rules[0], crate::dispatcher::Rule { - value: Value::String("http".to_string()), + value: VrlValue::Bytes(Bytes::from("http")), table_suffix: "http_events".to_string(), pipeline: None } @@ -741,7 +709,7 @@ transform: assert_eq!( dispatcher.rules[1], crate::dispatcher::Rule { - value: Value::String("database".to_string()), + value: VrlValue::Bytes(Bytes::from("database")), table_suffix: "db_events".to_string(), pipeline: Some("database_pipeline".to_string()), } diff --git a/src/pipeline/src/etl/ctx_req.rs b/src/pipeline/src/etl/ctx_req.rs index 2ce60a16d0..f8fc7c11f2 100644 --- a/src/pipeline/src/etl/ctx_req.rs +++ b/src/pipeline/src/etl/ctx_req.rs @@ -19,10 +19,10 @@ use ahash::{HashMap, HashMapExt}; use api::v1::{RowInsertRequest, RowInsertRequests, Rows}; use session::context::{QueryContext, QueryContextRef}; use snafu::OptionExt; +use vrl::value::Value as VrlValue; use crate::error::{Result, ValueMustBeMapSnafu}; use crate::tablesuffix::TableSuffixTemplate; -use crate::Value; const GREPTIME_AUTO_CREATE_TABLE: &str = "greptime_auto_create_table"; const GREPTIME_TTL: &str = "greptime_ttl"; @@ -86,32 +86,34 @@ impl ContextOpt { } impl ContextOpt { - pub fn from_pipeline_map_to_opt(pipeline_map: &mut Value) -> Result { - let pipeline_map = pipeline_map.as_map_mut().context(ValueMustBeMapSnafu)?; + pub fn from_pipeline_map_to_opt(value: &mut VrlValue) -> Result { + let map = value.as_object_mut().context(ValueMustBeMapSnafu)?; + let mut opt = Self::default(); for k in PIPELINE_HINT_KEYS { - if let Some(v) = pipeline_map.remove(k) { + if let Some(v) = map.remove(k) { + let v = v.to_string_lossy().to_string(); match k { GREPTIME_AUTO_CREATE_TABLE => { - opt.auto_create_table = Some(v.to_str_value()); + opt.auto_create_table = Some(v); } GREPTIME_TTL => { - opt.ttl = Some(v.to_str_value()); + opt.ttl = Some(v); } GREPTIME_APPEND_MODE => { - opt.append_mode = Some(v.to_str_value()); + opt.append_mode = Some(v); } GREPTIME_MERGE_MODE => { - opt.merge_mode = Some(v.to_str_value()); + opt.merge_mode = Some(v); } GREPTIME_PHYSICAL_TABLE => { - opt.physical_table = Some(v.to_str_value()); + opt.physical_table = Some(v); } GREPTIME_SKIP_WAL => { - opt.skip_wal = Some(v.to_str_value()); + opt.skip_wal = Some(v); } GREPTIME_TABLE_SUFFIX => { - opt.table_suffix = Some(v.to_str_value()); + opt.table_suffix = Some(v); } _ => {} } @@ -123,7 +125,7 @@ impl ContextOpt { pub(crate) fn resolve_table_suffix( &mut self, table_suffix: Option<&TableSuffixTemplate>, - pipeline_map: &Value, + pipeline_map: &VrlValue, ) -> Option { self.table_suffix .take() diff --git a/src/pipeline/src/etl/processor.rs b/src/pipeline/src/etl/processor.rs index 9d3e1d5c0f..b2f4285257 100644 --- a/src/pipeline/src/etl/processor.rs +++ b/src/pipeline/src/etl/processor.rs @@ -28,7 +28,7 @@ pub mod regex; pub mod select; pub mod simple_extract; pub mod urlencoding; -pub mod vrl; +pub mod vrl_processor; use std::str::FromStr; @@ -47,6 +47,7 @@ use letter::LetterProcessor; use regex::RegexProcessor; use snafu::{OptionExt, ResultExt}; use urlencoding::UrlEncodingProcessor; +use vrl::value::Value as VrlValue; use crate::error::{ Error, FailedParseFieldFromStringSnafu, FieldMustBeTypeSnafu, InvalidFieldRenameSnafu, @@ -57,8 +58,7 @@ use crate::etl::field::{Field, Fields}; use crate::etl::processor::json_parse::JsonParseProcessor; use crate::etl::processor::select::SelectProcessor; use crate::etl::processor::simple_extract::SimpleExtractProcessor; -use crate::etl::processor::vrl::VrlProcessor; -use crate::Value; +use crate::etl::processor::vrl_processor::VrlProcessor; const FIELD_NAME: &str = "field"; const FIELDS_NAME: &str = "fields"; @@ -123,7 +123,7 @@ pub trait Processor: std::fmt::Debug + Send + Sync + 'static { fn ignore_missing(&self) -> bool; /// Execute the processor on a vector which be preprocessed by the pipeline - fn exec_mut(&self, val: Value) -> Result; + fn exec_mut(&self, val: VrlValue) -> Result; } #[derive(Debug)] @@ -224,7 +224,7 @@ fn parse_processor(doc: &yaml_rust::Yaml) -> Result { json_parse::PROCESSOR_JSON_PARSE => { ProcessorKind::JsonParse(JsonParseProcessor::try_from(value)?) } - vrl::PROCESSOR_VRL => ProcessorKind::Vrl(VrlProcessor::try_from(value)?), + vrl_processor::PROCESSOR_VRL => ProcessorKind::Vrl(VrlProcessor::try_from(value)?), select::PROCESSOR_SELECT => ProcessorKind::Select(SelectProcessor::try_from(value)?), _ => return UnsupportedProcessorSnafu { processor: str_key }.fail(), }; diff --git a/src/pipeline/src/etl/processor/cmcd.rs b/src/pipeline/src/etl/processor/cmcd.rs index 3d736192c7..cca6fbfe0d 100644 --- a/src/pipeline/src/etl/processor/cmcd.rs +++ b/src/pipeline/src/etl/processor/cmcd.rs @@ -18,20 +18,22 @@ use std::collections::BTreeMap; +use ordered_float::NotNan; use snafu::{OptionExt, ResultExt}; use urlencoding::decode; +use vrl::prelude::Bytes; +use vrl::value::{KeyString, Value as VrlValue}; use crate::error::{ CmcdMissingKeySnafu, CmcdMissingValueSnafu, Error, FailedToParseFloatKeySnafu, - FailedToParseIntKeySnafu, KeyMustBeStringSnafu, ProcessorExpectStringSnafu, - ProcessorMissingFieldSnafu, Result, + FailedToParseIntKeySnafu, FloatIsNanSnafu, KeyMustBeStringSnafu, ProcessorExpectStringSnafu, + ProcessorMissingFieldSnafu, Result, ValueMustBeMapSnafu, }; use crate::etl::field::Fields; use crate::etl::processor::{ yaml_bool, yaml_new_field, yaml_new_fields, Processor, FIELDS_NAME, FIELD_NAME, IGNORE_MISSING_NAME, }; -use crate::etl::value::Value; pub(crate) const PROCESSOR_CMCD: &str = "cmcd"; @@ -76,42 +78,43 @@ const CMCD_KEYS: [&str; 18] = [ ]; /// function to resolve CMCD_KEY_BS | CMCD_KEY_SU -fn bs_su(_: &str, _: &str, _: Option<&str>) -> Result { - Ok(Value::Boolean(true)) +fn bs_su(_: &str, _: &str, _: Option<&str>) -> Result { + Ok(VrlValue::Boolean(true)) } /// function to resolve CMCD_KEY_BR | CMCD_KEY_BL | CMCD_KEY_D | CMCD_KEY_DL | CMCD_KEY_MTP | CMCD_KEY_RTP | CMCD_KEY_TB -fn br_tb(s: &str, k: &str, v: Option<&str>) -> Result { +fn br_tb(s: &str, k: &str, v: Option<&str>) -> Result { let v = v.context(CmcdMissingValueSnafu { k, s })?; let val: i64 = v .parse() .context(FailedToParseIntKeySnafu { key: k, value: v })?; - Ok(Value::Int64(val)) + Ok(VrlValue::Integer(val)) } /// function to resolve CMCD_KEY_CID | CMCD_KEY_NRR | CMCD_KEY_OT | CMCD_KEY_SF | CMCD_KEY_SID | CMCD_KEY_V -fn cid_v(s: &str, k: &str, v: Option<&str>) -> Result { +fn cid_v(s: &str, k: &str, v: Option<&str>) -> Result { let v = v.context(CmcdMissingValueSnafu { k, s })?; - Ok(Value::String(v.to_string())) + Ok(VrlValue::Bytes(Bytes::from(v.to_string()))) } /// function to resolve CMCD_KEY_NOR -fn nor(s: &str, k: &str, v: Option<&str>) -> Result { +fn nor(s: &str, k: &str, v: Option<&str>) -> Result { let v = v.context(CmcdMissingValueSnafu { k, s })?; let val = match decode(v) { Ok(val) => val.to_string(), Err(_) => v.to_string(), }; - Ok(Value::String(val)) + Ok(VrlValue::Bytes(Bytes::from(val))) } /// function to resolve CMCD_KEY_PR -fn pr(s: &str, k: &str, v: Option<&str>) -> Result { +fn pr(s: &str, k: &str, v: Option<&str>) -> Result { let v = v.context(CmcdMissingValueSnafu { k, s })?; let val: f64 = v .parse() .context(FailedToParseFloatKeySnafu { key: k, value: v })?; - Ok(Value::Float64(val)) + let val = NotNan::new(val).context(FloatIsNanSnafu)?; + Ok(VrlValue::Float(val)) } /// Common Media Client Data Specification: @@ -156,11 +159,11 @@ pub struct CmcdProcessor { } impl CmcdProcessor { - fn generate_key(prefix: &str, key: &str) -> String { - format!("{}_{}", prefix, key) + fn generate_key(prefix: &str, key: &str) -> KeyString { + KeyString::from(format!("{}_{}", prefix, key)) } - fn parse(&self, name: &str, value: &str) -> Result> { + fn parse(&self, name: &str, value: &str) -> Result> { let mut working_set = BTreeMap::new(); let parts = value.split(','); @@ -250,16 +253,18 @@ impl Processor for CmcdProcessor { self.ignore_missing } - fn exec_mut(&self, mut val: Value) -> Result { + fn exec_mut(&self, mut val: VrlValue) -> Result { for field in self.fields.iter() { let name = field.input_field(); - + let val = val.as_object_mut().context(ValueMustBeMapSnafu)?; match val.get(name) { - Some(Value::String(s)) => { - let results = self.parse(field.target_or_input_field(), s)?; - val.extend(results.into())?; + Some(VrlValue::Bytes(s)) => { + let s = String::from_utf8_lossy(s); + let results = self.parse(field.target_or_input_field(), &s)?; + + val.extend(results); } - Some(Value::Null) | None => { + Some(VrlValue::Null) | None => { if !self.ignore_missing { return ProcessorMissingFieldSnafu { processor: self.kind().to_string(), @@ -288,7 +293,6 @@ mod tests { use super::*; use crate::etl::field::{Field, Fields}; - use crate::etl::value::Value; #[test] fn test_cmcd() { @@ -297,23 +301,23 @@ mod tests { "sid%3D%226e2fb550-c457-11e9-bb97-0800200c9a66%22", vec![( "prefix_sid", - Value::String("\"6e2fb550-c457-11e9-bb97-0800200c9a66\"".into()), + VrlValue::Bytes(Bytes::from("\"6e2fb550-c457-11e9-bb97-0800200c9a66\"")), )], ), ( "br%3D3200%2Cbs%2Cd%3D4004%2Cmtp%3D25400%2Cot%3Dv%2Crtp%3D15000%2Csid%3D%226e2fb550-c457-11e9-bb97-0800200c9a66%22%2Ctb%3D6000", vec![ - ("prefix_bs", Value::Boolean(true)), - ("prefix_ot", Value::String("v".into())), - ("prefix_rtp", Value::Int64(15000)), - ("prefix_br", Value::Int64(3200)), - ("prefix_tb", Value::Int64(6000)), - ("prefix_d", Value::Int64(4004)), + ("prefix_bs", VrlValue::Boolean(true)), + ("prefix_ot", VrlValue::Bytes(Bytes::from("v"))), + ("prefix_rtp", VrlValue::Integer(15000)), + ("prefix_br", VrlValue::Integer(3200)), + ("prefix_tb", VrlValue::Integer(6000)), + ("prefix_d", VrlValue::Integer(4004)), ( "prefix_sid", - Value::String("\"6e2fb550-c457-11e9-bb97-0800200c9a66\"".into()), + VrlValue::Bytes(Bytes::from("\"6e2fb550-c457-11e9-bb97-0800200c9a66\"")), ), - ("prefix_mtp", Value::Int64(25400)), + ("prefix_mtp", VrlValue::Integer(25400)), ], ), ( @@ -322,16 +326,16 @@ mod tests { vec![ ( "prefix_sid", - Value::String("\"6e2fb550-c457-11e9-bb97-0800200c9a66\"".into()), + VrlValue::Bytes(Bytes::from("\"6e2fb550-c457-11e9-bb97-0800200c9a66\"")), ), - ("prefix_rtp", Value::Int64(15000)), + ("prefix_rtp", VrlValue::Integer(15000)), ], ), ( "bs%2Csu", vec![ - ("prefix_su", Value::Boolean(true)), - ("prefix_bs", Value::Boolean(true)), + ("prefix_su", VrlValue::Boolean(true)), + ("prefix_bs", VrlValue::Boolean(true)), ], ), ( @@ -346,7 +350,7 @@ mod tests { // "prefix_com.examplemyStringKey", // Value::String("\"myStringValue\"".into()), // ), - ("prefix_d", Value::Int64(4004)), + ("prefix_d", VrlValue::Integer(4004)), ], ), ( @@ -354,11 +358,11 @@ mod tests { vec![ ( "prefix_sid", - Value::String("\"6e2fb550-c457-11e9-bb97-0800200c9a66\"".into()), + VrlValue::Bytes(Bytes::from("\"6e2fb550-c457-11e9-bb97-0800200c9a66\"")), ), ( "prefix_nor", - Value::String("\"../300kbps/segment35.m4v\"".into()), + VrlValue::Bytes(Bytes::from("\"../300kbps/segment35.m4v\"")), ), ], @@ -366,56 +370,56 @@ mod tests { ( "nrr%3D%2212323-48763%22%2Csid%3D%226e2fb550-c457-11e9-bb97-0800200c9a66%22", vec![ - ("prefix_nrr", Value::String("\"12323-48763\"".into())), + ("prefix_nrr", VrlValue::Bytes(Bytes::from("\"12323-48763\""))), ( "prefix_sid", - Value::String("\"6e2fb550-c457-11e9-bb97-0800200c9a66\"".into()), + VrlValue::Bytes(Bytes::from("\"6e2fb550-c457-11e9-bb97-0800200c9a66\"")), ), ], ), ( "nor%3D%22..%252F300kbps%252Ftrack.m4v%22%2Cnrr%3D%2212323-48763%22%2Csid%3D%226e2fb550-c457-11e9-bb97-0800200c9a66%22", vec![ - ("prefix_nrr", Value::String("\"12323-48763\"".into())), + ("prefix_nrr", VrlValue::Bytes(Bytes::from("\"12323-48763\""))), ( "prefix_sid", - Value::String("\"6e2fb550-c457-11e9-bb97-0800200c9a66\"".into()), + VrlValue::Bytes(Bytes::from("\"6e2fb550-c457-11e9-bb97-0800200c9a66\"")), ), ( "prefix_nor", - Value::String("\"../300kbps/track.m4v\"".into()), + VrlValue::Bytes(Bytes::from("\"../300kbps/track.m4v\"")), ), ], ), ( "bl%3D21300%2Cbr%3D3200%2Cbs%2Ccid%3D%22faec5fc2-ac30-11eabb37-0242ac130002%22%2Cd%3D4004%2Cdl%3D18500%2Cmtp%3D48100%2Cnor%3D%22..%252F300kbps%252Ftrack.m4v%22%2Cnrr%3D%2212323-48763%22%2Cot%3Dv%2Cpr%3D1.08%2Crtp%3D12000%2Csf%3Dd%2Csid%3D%226e2fb550-c457-11e9-bb97-0800200c9a66%22%2Cst%3Dv%2Csu%2Ctb%3D6000", vec![ - ("prefix_bl", Value::Int64(21300)), - ("prefix_bs", Value::Boolean(true)), - ("prefix_st", Value::String("v".into())), - ("prefix_ot", Value::String("v".into())), + ("prefix_bl", VrlValue::Integer(21300)), + ("prefix_bs", VrlValue::Boolean(true)), + ("prefix_st", VrlValue::Bytes(Bytes::from("v"))), + ("prefix_ot", VrlValue::Bytes(Bytes::from("v"))), ( "prefix_sid", - Value::String("\"6e2fb550-c457-11e9-bb97-0800200c9a66\"".into()), + VrlValue::Bytes(Bytes::from("\"6e2fb550-c457-11e9-bb97-0800200c9a66\"")), ), - ("prefix_tb", Value::Int64(6000)), - ("prefix_d", Value::Int64(4004)), + ("prefix_tb", VrlValue::Integer(6000)), + ("prefix_d", VrlValue::Integer(4004)), ( "prefix_cid", - Value::String("\"faec5fc2-ac30-11eabb37-0242ac130002\"".into()), + VrlValue::Bytes(Bytes::from("\"faec5fc2-ac30-11eabb37-0242ac130002\"")), ), - ("prefix_mtp", Value::Int64(48100)), - ("prefix_rtp", Value::Int64(12000)), + ("prefix_mtp", VrlValue::Integer(48100)), + ("prefix_rtp", VrlValue::Integer(12000)), ( "prefix_nor", - Value::String("\"../300kbps/track.m4v\"".into()), + VrlValue::Bytes(Bytes::from("\"../300kbps/track.m4v\"")), ), - ("prefix_sf", Value::String("d".into())), - ("prefix_br", Value::Int64(3200)), - ("prefix_nrr", Value::String("\"12323-48763\"".into())), - ("prefix_pr", Value::Float64(1.08)), - ("prefix_su", Value::Boolean(true)), - ("prefix_dl", Value::Int64(18500)), + ("prefix_sf", VrlValue::Bytes(Bytes::from("d"))), + ("prefix_br", VrlValue::Integer(3200)), + ("prefix_nrr", VrlValue::Bytes(Bytes::from("\"12323-48763\""))), + ("prefix_pr", VrlValue::Float(NotNan::new(1.08).unwrap())), + ("prefix_su", VrlValue::Boolean(true)), + ("prefix_dl", VrlValue::Integer(18500)), ], ), ]; @@ -432,8 +436,8 @@ mod tests { let expected = vec .into_iter() - .map(|(k, v)| (k.to_string(), v)) - .collect::>(); + .map(|(k, v)| (KeyString::from(k.to_string()), v)) + .collect::>(); let actual = processor.parse("prefix", &decoded).unwrap(); assert_eq!(actual, expected); diff --git a/src/pipeline/src/etl/processor/csv.rs b/src/pipeline/src/etl/processor/csv.rs index b15ee42dc9..77d27bbbdb 100644 --- a/src/pipeline/src/etl/processor/csv.rs +++ b/src/pipeline/src/etl/processor/csv.rs @@ -20,17 +20,19 @@ use csv::{ReaderBuilder, Trim}; use itertools::EitherOrBoth::{Both, Left, Right}; use itertools::Itertools; use snafu::{OptionExt, ResultExt}; +use vrl::prelude::Bytes; +use vrl::value::{KeyString, Value as VrlValue}; use crate::error::{ CsvNoRecordSnafu, CsvQuoteNameSnafu, CsvReadSnafu, CsvSeparatorNameSnafu, Error, KeyMustBeStringSnafu, ProcessorExpectStringSnafu, ProcessorMissingFieldSnafu, Result, + ValueMustBeMapSnafu, }; use crate::etl::field::Fields; use crate::etl::processor::{ yaml_bool, yaml_new_field, yaml_new_fields, yaml_string, Processor, FIELDS_NAME, FIELD_NAME, IGNORE_MISSING_NAME, }; -use crate::etl::value::Value; pub(crate) const PROCESSOR_CSV: &str = "csv"; @@ -60,8 +62,8 @@ pub struct CsvProcessor { impl CsvProcessor { // process the csv format string to a map with target_fields as keys - fn process(&self, val: &str) -> Result> { - let mut reader = self.reader.from_reader(val.as_bytes()); + fn process(&self, val: &[u8]) -> Result> { + let mut reader = self.reader.from_reader(val); if let Some(result) = reader.records().next() { let record: csv::StringRecord = result.context(CsvReadSnafu)?; @@ -71,17 +73,18 @@ impl CsvProcessor { .iter() .zip_longest(record.iter()) .filter_map(|zipped| match zipped { - Both(target_field, val) => { - Some((target_field.clone(), Value::String(val.into()))) - } + Both(target_field, val) => Some(( + KeyString::from(target_field.clone()), + VrlValue::Bytes(Bytes::from(val.to_string())), + )), // if target fields are more than extracted fields, fill the rest with empty value Left(target_field) => { let value = self .empty_value .as_ref() - .map(|s| Value::String(s.clone())) - .unwrap_or(Value::Null); - Some((target_field.clone(), value)) + .map(|s| VrlValue::Bytes(Bytes::from(s.clone()))) + .unwrap_or(VrlValue::Null); + Some((KeyString::from(target_field.clone()), value)) } // if extracted fields are more than target fields, ignore the rest Right(_) => None, @@ -190,16 +193,18 @@ impl Processor for CsvProcessor { self.ignore_missing } - fn exec_mut(&self, mut val: Value) -> Result { + fn exec_mut(&self, mut val: VrlValue) -> Result { for field in self.fields.iter() { let name = field.input_field(); + let val = val.as_object_mut().context(ValueMustBeMapSnafu)?; + match val.get(name) { - Some(Value::String(v)) => { + Some(VrlValue::Bytes(v)) => { let results = self.process(v)?; - val.extend(results.into())?; + val.extend(results); } - Some(Value::Null) | None => { + Some(VrlValue::Null) | None => { if !self.ignore_missing { return ProcessorMissingFieldSnafu { processor: self.kind().to_string(), @@ -238,11 +243,11 @@ mod tests { ..Default::default() }; - let result = processor.process("1,2").unwrap(); + let result = processor.process(b"1,2").unwrap(); - let values: BTreeMap = [ - ("a".into(), Value::String("1".into())), - ("b".into(), Value::String("2".into())), + let values: BTreeMap = [ + (KeyString::from("a"), VrlValue::Bytes(Bytes::from("1"))), + (KeyString::from("b"), VrlValue::Bytes(Bytes::from("2"))), ] .into_iter() .collect(); @@ -264,12 +269,12 @@ mod tests { ..Default::default() }; - let result = processor.process("1,2").unwrap(); + let result = processor.process(b"1,2").unwrap(); - let values: BTreeMap = [ - ("a".into(), Value::String("1".into())), - ("b".into(), Value::String("2".into())), - ("c".into(), Value::Null), + let values: BTreeMap = [ + (KeyString::from("a"), VrlValue::Bytes(Bytes::from("1"))), + (KeyString::from("b"), VrlValue::Bytes(Bytes::from("2"))), + (KeyString::from("c"), VrlValue::Null), ] .into_iter() .collect(); @@ -289,12 +294,15 @@ mod tests { ..Default::default() }; - let result = processor.process("1,2").unwrap(); + let result = processor.process(b"1,2").unwrap(); - let values: BTreeMap = [ - ("a".into(), Value::String("1".into())), - ("b".into(), Value::String("2".into())), - ("c".into(), Value::String("default".into())), + let values: BTreeMap = [ + (KeyString::from("a"), VrlValue::Bytes(Bytes::from("1"))), + (KeyString::from("b"), VrlValue::Bytes(Bytes::from("2"))), + ( + KeyString::from("c"), + VrlValue::Bytes(Bytes::from("default")), + ), ] .into_iter() .collect(); @@ -315,11 +323,11 @@ mod tests { ..Default::default() }; - let result = processor.process("1,2").unwrap(); + let result = processor.process(b"1,2").unwrap(); - let values: BTreeMap = [ - ("a".into(), Value::String("1".into())), - ("b".into(), Value::String("2".into())), + let values: BTreeMap = [ + (KeyString::from("a"), VrlValue::Bytes(Bytes::from("1"))), + (KeyString::from("b"), VrlValue::Bytes(Bytes::from("2"))), ] .into_iter() .collect(); diff --git a/src/pipeline/src/etl/processor/date.rs b/src/pipeline/src/etl/processor/date.rs index e60107a064..dd74d97943 100644 --- a/src/pipeline/src/etl/processor/date.rs +++ b/src/pipeline/src/etl/processor/date.rs @@ -14,22 +14,22 @@ use std::sync::Arc; -use chrono::{DateTime, NaiveDateTime}; +use chrono::{DateTime, NaiveDateTime, Utc}; use chrono_tz::Tz; use lazy_static::lazy_static; use snafu::{OptionExt, ResultExt}; +use vrl::value::{KeyString, Value as VrlValue}; use crate::error::{ - DateFailedToGetLocalTimezoneSnafu, DateFailedToGetTimestampSnafu, DateParseSnafu, - DateParseTimezoneSnafu, Error, KeyMustBeStringSnafu, ProcessorExpectStringSnafu, - ProcessorFailedToParseStringSnafu, ProcessorMissingFieldSnafu, Result, + DateFailedToGetLocalTimezoneSnafu, DateParseSnafu, DateParseTimezoneSnafu, Error, + KeyMustBeStringSnafu, ProcessorExpectStringSnafu, ProcessorFailedToParseStringSnafu, + ProcessorMissingFieldSnafu, Result, ValueMustBeMapSnafu, }; use crate::etl::field::Fields; use crate::etl::processor::{ yaml_bool, yaml_new_field, yaml_new_fields, yaml_string, yaml_strings, Processor, FIELDS_NAME, FIELD_NAME, IGNORE_MISSING_NAME, }; -use crate::etl::value::{Timestamp, Value}; pub(crate) const PROCESSOR_DATE: &str = "date"; @@ -162,7 +162,7 @@ pub struct DateProcessor { } impl DateProcessor { - fn parse(&self, val: &str) -> Result { + fn parse(&self, val: &str) -> Result> { let mut tz = Tz::UTC; if let Some(timezone) = &self.timezone { tz = timezone.parse::().context(DateParseTimezoneSnafu { @@ -171,8 +171,8 @@ impl DateProcessor { } for fmt in self.formats.iter() { - if let Ok(ns) = try_parse(val, fmt, tz) { - return Ok(Timestamp::Nanosecond(ns)); + if let Ok(utc_ts) = try_parse(val, fmt, tz) { + return Ok(utc_ts); } } @@ -193,16 +193,19 @@ impl Processor for DateProcessor { self.ignore_missing } - fn exec_mut(&self, mut val: Value) -> Result { + fn exec_mut(&self, mut val: VrlValue) -> Result { for field in self.fields.iter() { let index = field.input_field(); + + let val = val.as_object_mut().context(ValueMustBeMapSnafu)?; + match val.get(index) { - Some(Value::String(s)) => { - let timestamp = self.parse(s)?; + Some(VrlValue::Bytes(s)) => { + let timestamp = self.parse(String::from_utf8_lossy(s).as_ref())?; let output_key = field.target_or_input_field(); - val.insert(output_key.to_string(), Value::Timestamp(timestamp))?; + val.insert(KeyString::from(output_key), VrlValue::Timestamp(timestamp)); } - Some(Value::Null) | None => { + Some(VrlValue::Null) | None => { if !self.ignore_missing { return ProcessorMissingFieldSnafu { processor: self.kind().to_string(), @@ -224,21 +227,19 @@ impl Processor for DateProcessor { } } -/// try to parse val with timezone first, if failed, parse without timezone -fn try_parse(val: &str, fmt: &str, tz: Tz) -> Result { +// parse the datetime with timezone info +// if failed, try to parse using naive date time and add tz info +// finally convert the datetime to utc +fn try_parse(val: &str, fmt: &str, tz: Tz) -> Result> { if let Ok(dt) = DateTime::parse_from_str(val, fmt) { - Ok(dt - .timestamp_nanos_opt() - .context(DateFailedToGetTimestampSnafu)?) + Ok(dt.to_utc()) } else { let dt = NaiveDateTime::parse_from_str(val, fmt) .context(DateParseSnafu { value: val })? .and_local_timezone(tz) .single() .context(DateFailedToGetLocalTimezoneSnafu)?; - Ok(dt - .timestamp_nanos_opt() - .context(DateFailedToGetTimestampSnafu)?) + Ok(dt.to_utc()) } } diff --git a/src/pipeline/src/etl/processor/decolorize.rs b/src/pipeline/src/etl/processor/decolorize.rs index 251332519e..5cbdcd5c3c 100644 --- a/src/pipeline/src/etl/processor/decolorize.rs +++ b/src/pipeline/src/etl/processor/decolorize.rs @@ -21,15 +21,17 @@ use once_cell::sync::Lazy; use regex::Regex; use snafu::OptionExt; +use vrl::prelude::Bytes; +use vrl::value::{KeyString, Value as VrlValue}; use crate::error::{ Error, KeyMustBeStringSnafu, ProcessorExpectStringSnafu, ProcessorMissingFieldSnafu, Result, + ValueMustBeMapSnafu, }; use crate::etl::field::Fields; use crate::etl::processor::{ yaml_bool, yaml_new_field, yaml_new_fields, FIELDS_NAME, FIELD_NAME, IGNORE_MISSING_NAME, }; -use crate::etl::value::Value; pub(crate) const PROCESSOR_DECOLORIZE: &str = "decolorize"; @@ -43,13 +45,15 @@ pub struct DecolorizeProcessor { } impl DecolorizeProcessor { - fn process_string(&self, val: &str) -> Result { - Ok(Value::String(RE.replace_all(val, "").into_owned())) + fn process_string(&self, val: &str) -> Result { + Ok(VrlValue::Bytes(Bytes::from( + RE.replace_all(val, "").to_string(), + ))) } - fn process(&self, val: &Value) -> Result { + fn process(&self, val: &VrlValue) -> Result { match val { - Value::String(val) => self.process_string(val), + VrlValue::Bytes(val) => self.process_string(String::from_utf8_lossy(val).as_ref()), _ => ProcessorExpectStringSnafu { processor: PROCESSOR_DECOLORIZE, v: val.clone(), @@ -101,11 +105,12 @@ impl crate::etl::processor::Processor for DecolorizeProcessor { self.ignore_missing } - fn exec_mut(&self, mut val: Value) -> Result { + fn exec_mut(&self, mut val: VrlValue) -> Result { for field in self.fields.iter() { let index = field.input_field(); + let val = val.as_object_mut().context(ValueMustBeMapSnafu)?; match val.get(index) { - Some(Value::Null) | None => { + Some(VrlValue::Null) | None => { if !self.ignore_missing { return ProcessorMissingFieldSnafu { processor: self.kind(), @@ -117,7 +122,7 @@ impl crate::etl::processor::Processor for DecolorizeProcessor { Some(v) => { let result = self.process(v)?; let output_index = field.target_or_input_field(); - val.insert(output_index.to_string(), result)?; + val.insert(KeyString::from(output_index), result); } } } @@ -136,16 +141,19 @@ mod tests { ignore_missing: false, }; - let val = Value::String("\x1b[32mGreen\x1b[0m".to_string()); + let val = VrlValue::Bytes(Bytes::from("\x1b[32mGreen\x1b[0m".to_string())); let result = processor.process(&val).unwrap(); - assert_eq!(result, Value::String("Green".to_string())); + assert_eq!(result, VrlValue::Bytes(Bytes::from("Green".to_string()))); - let val = Value::String("Plain text".to_string()); + let val = VrlValue::Bytes(Bytes::from("Plain text".to_string())); let result = processor.process(&val).unwrap(); - assert_eq!(result, Value::String("Plain text".to_string())); + assert_eq!( + result, + VrlValue::Bytes(Bytes::from("Plain text".to_string())) + ); - let val = Value::String("\x1b[46mfoo\x1b[0m bar".to_string()); + let val = VrlValue::Bytes(Bytes::from("\x1b[46mfoo\x1b[0m bar".to_string())); let result = processor.process(&val).unwrap(); - assert_eq!(result, Value::String("foo bar".to_string())); + assert_eq!(result, VrlValue::Bytes(Bytes::from("foo bar".to_string()))); } } diff --git a/src/pipeline/src/etl/processor/digest.rs b/src/pipeline/src/etl/processor/digest.rs index 9a2efef772..5639b69183 100644 --- a/src/pipeline/src/etl/processor/digest.rs +++ b/src/pipeline/src/etl/processor/digest.rs @@ -23,16 +23,17 @@ use std::borrow::Cow; use regex::Regex; use snafu::OptionExt; +use vrl::prelude::Bytes; +use vrl::value::{KeyString, Value as VrlValue}; use crate::error::{ DigestPatternInvalidSnafu, Error, KeyMustBeStringSnafu, ProcessorExpectStringSnafu, - ProcessorMissingFieldSnafu, Result, + ProcessorMissingFieldSnafu, Result, ValueMustBeMapSnafu, }; use crate::etl::field::Fields; use crate::etl::processor::{ yaml_bool, yaml_new_field, yaml_new_fields, FIELDS_NAME, FIELD_NAME, IGNORE_MISSING_NAME, }; -use crate::etl::value::Value; pub(crate) const PROCESSOR_DIGEST: &str = "digest"; @@ -100,7 +101,7 @@ impl DigestProcessor { re.replace_all(val, "").to_string() } - fn process_string(&self, val: &str) -> Result { + fn process_string(&self, val: &str) -> Result { let mut input = Cow::from(val); for pattern in &self.patterns { if let Cow::Owned(new_string) = pattern.replace_all(&input, "") { @@ -108,12 +109,12 @@ impl DigestProcessor { } } - Ok(Value::String(input.into_owned())) + Ok(VrlValue::Bytes(Bytes::from(input.to_string()))) } - fn process(&self, val: &Value) -> Result { + fn process(&self, val: &VrlValue) -> Result { match val { - Value::String(val) => self.process_string(val), + VrlValue::Bytes(val) => self.process_string(String::from_utf8_lossy(val).as_ref()), _ => ProcessorExpectStringSnafu { processor: PROCESSOR_DIGEST, v: val.clone(), @@ -200,11 +201,12 @@ impl crate::etl::processor::Processor for DigestProcessor { self.ignore_missing } - fn exec_mut(&self, mut val: Value) -> Result { + fn exec_mut(&self, mut val: VrlValue) -> Result { for field in self.fields.iter() { let index = field.input_field(); + let val = val.as_object_mut().context(ValueMustBeMapSnafu)?; match val.get(index) { - Some(Value::Null) | None => { + Some(VrlValue::Null) | None => { if !self.ignore_missing { return ProcessorMissingFieldSnafu { processor: self.kind(), @@ -216,7 +218,7 @@ impl crate::etl::processor::Processor for DigestProcessor { Some(v) => { let result = self.process(v)?; let output_index = field.target_or_input_field(); - val.insert(output_index.to_string(), result)?; + val.insert(KeyString::from(output_index), result); } } } @@ -237,24 +239,31 @@ mod tests { patterns: vec![PresetPattern::Ip.regex()], }; - let input = Value::String("192.168.1.1".to_string()); + let input = VrlValue::Bytes(Bytes::from("192.168.1.1".to_string())); let result = processor.process(&input).unwrap(); - assert_eq!(result, Value::String("".to_string())); - let input = Value::String("192.168.1.1:8080".to_string()); + assert_eq!(result, VrlValue::Bytes(Bytes::from("".to_string()))); + let input = VrlValue::Bytes(Bytes::from("192.168.1.1:8080".to_string())); let result = processor.process(&input).unwrap(); - assert_eq!(result, Value::String("".to_string())); + assert_eq!(result, VrlValue::Bytes(Bytes::from("".to_string()))); - let input = Value::String("[2001:0db8:85a3:0000:0000:8a2e:0370:7334]".to_string()); + let input = VrlValue::Bytes(Bytes::from( + "[2001:0db8:85a3:0000:0000:8a2e:0370:7334]".to_string(), + )); let result = processor.process(&input).unwrap(); - assert_eq!(result, Value::String("".to_string())); + assert_eq!(result, VrlValue::Bytes(Bytes::from("".to_string()))); - let input = Value::String("[2001:0db8:85a3:0000:0000:8a2e:0370:7334]:8080".to_string()); + let input = VrlValue::Bytes(Bytes::from( + "[2001:0db8:85a3:0000:0000:8a2e:0370:7334]:8080".to_string(), + )); let result = processor.process(&input).unwrap(); - assert_eq!(result, Value::String("".to_string())); + assert_eq!(result, VrlValue::Bytes(Bytes::from("".to_string()))); - let input = Value::String("not an ip".to_string()); + let input = VrlValue::Bytes(Bytes::from("not an ip".to_string())); let result = processor.process(&input).unwrap(); - assert_eq!(result, Value::String("not an ip".to_string())); + assert_eq!( + result, + VrlValue::Bytes(Bytes::from("not an ip".to_string())) + ); } #[test] @@ -265,29 +274,40 @@ mod tests { patterns: vec![PresetPattern::Uuid.regex()], }; // UUID v4 - let input = Value::String("123e4567-e89b-12d3-a456-426614174000".to_string()); + let input = VrlValue::Bytes(Bytes::from( + "123e4567-e89b-12d3-a456-426614174000".to_string(), + )); let result = processor.process(&input).unwrap(); - assert_eq!(result, Value::String("".to_string())); + assert_eq!(result, VrlValue::Bytes(Bytes::from("".to_string()))); // UUID v1 - let input = Value::String("6ba7b810-9dad-11d1-80b4-00c04fd430c8".to_string()); + let input = VrlValue::Bytes(Bytes::from( + "6ba7b810-9dad-11d1-80b4-00c04fd430c8".to_string(), + )); let result = processor.process(&input).unwrap(); - assert_eq!(result, Value::String("".to_string())); + assert_eq!(result, VrlValue::Bytes(Bytes::from("".to_string()))); // UUID v5 - let input = Value::String("886313e1-3b8a-5372-9b90-0c9aee199e5d".to_string()); + let input = VrlValue::Bytes(Bytes::from( + "886313e1-3b8a-5372-9b90-0c9aee199e5d".to_string(), + )); let result = processor.process(&input).unwrap(); - assert_eq!(result, Value::String("".to_string())); + assert_eq!(result, VrlValue::Bytes(Bytes::from("".to_string()))); // UUID with uppercase letters - let input = Value::String("A987FBC9-4BED-3078-CF07-9141BA07C9F3".to_string()); + let input = VrlValue::Bytes(Bytes::from( + "A987FBC9-4BED-3078-CF07-9141BA07C9F3".to_string(), + )); let result = processor.process(&input).unwrap(); - assert_eq!(result, Value::String("".to_string())); + assert_eq!(result, VrlValue::Bytes(Bytes::from("".to_string()))); // Negative case - let input = Value::String("not a uuid".to_string()); + let input = VrlValue::Bytes(Bytes::from("not a uuid".to_string())); let result = processor.process(&input).unwrap(); - assert_eq!(result, Value::String("not a uuid".to_string())); + assert_eq!( + result, + VrlValue::Bytes(Bytes::from("not a uuid".to_string())) + ); } #[test] @@ -299,45 +319,48 @@ mod tests { }; // Basic brackets - let input = Value::String("[content]".to_string()); + let input = VrlValue::Bytes(Bytes::from("[content]".to_string())); let result = processor.process(&input).unwrap(); - assert_eq!(result, Value::String("".to_string())); + assert_eq!(result, VrlValue::Bytes(Bytes::from("".to_string()))); - let input = Value::String("(content)".to_string()); + let input = VrlValue::Bytes(Bytes::from("(content)".to_string())); let result = processor.process(&input).unwrap(); - assert_eq!(result, Value::String("".to_string())); + assert_eq!(result, VrlValue::Bytes(Bytes::from("".to_string()))); // Chinese brackets - let input = Value::String("「content」".to_string()); + let input = VrlValue::Bytes(Bytes::from("「content」".to_string())); let result = processor.process(&input).unwrap(); - assert_eq!(result, Value::String("".to_string())); + assert_eq!(result, VrlValue::Bytes(Bytes::from("".to_string()))); - let input = Value::String("『content』".to_string()); + let input = VrlValue::Bytes(Bytes::from("『content』".to_string())); let result = processor.process(&input).unwrap(); - assert_eq!(result, Value::String("".to_string())); + assert_eq!(result, VrlValue::Bytes(Bytes::from("".to_string()))); - let input = Value::String("【content】".to_string()); + let input = VrlValue::Bytes(Bytes::from("【content】".to_string())); let result = processor.process(&input).unwrap(); - assert_eq!(result, Value::String("".to_string())); + assert_eq!(result, VrlValue::Bytes(Bytes::from("".to_string()))); // Unmatched/unclosed brackets should not match - let input = Value::String("[content".to_string()); + let input = VrlValue::Bytes(Bytes::from("[content".to_string())); let result = processor.process(&input).unwrap(); - assert_eq!(result, Value::String("[content".to_string())); + assert_eq!(result, VrlValue::Bytes(Bytes::from("[content".to_string()))); - let input = Value::String("content]".to_string()); + let input = VrlValue::Bytes(Bytes::from("content]".to_string())); let result = processor.process(&input).unwrap(); - assert_eq!(result, Value::String("content]".to_string())); + assert_eq!(result, VrlValue::Bytes(Bytes::from("content]".to_string()))); // Bad case - let input = Value::String("[content}".to_string()); + let input = VrlValue::Bytes(Bytes::from("[content}".to_string())); let result = processor.process(&input).unwrap(); - assert_eq!(result, Value::String("".to_string())); + assert_eq!(result, VrlValue::Bytes(Bytes::from("".to_string()))); // Negative case - let input = Value::String("no brackets".to_string()); + let input = VrlValue::Bytes(Bytes::from("no brackets".to_string())); let result = processor.process(&input).unwrap(); - assert_eq!(result, Value::String("no brackets".to_string())); + assert_eq!( + result, + VrlValue::Bytes(Bytes::from("no brackets".to_string())) + ); } #[test] @@ -348,16 +371,19 @@ mod tests { patterns: vec![PresetPattern::Quoted.regex()], }; - let input = Value::String("\"quoted content\"".to_string()); + let input = VrlValue::Bytes(Bytes::from("\"quoted content\"".to_string())); let result = processor.process(&input).unwrap(); - assert_eq!(result, Value::String("".to_string())); + assert_eq!(result, VrlValue::Bytes(Bytes::from("".to_string()))); - let input = Value::String("no quotes".to_string()); + let input = VrlValue::Bytes(Bytes::from("no quotes".to_string())); let result = processor.process(&input).unwrap(); - assert_eq!(result, Value::String("no quotes".to_string())); - let input = Value::String("".to_string()); + assert_eq!( + result, + VrlValue::Bytes(Bytes::from("no quotes".to_string())) + ); + let input = VrlValue::Bytes(Bytes::from("".to_string())); let result = processor.process(&input).unwrap(); - assert_eq!(result, Value::String("".to_string())); + assert_eq!(result, VrlValue::Bytes(Bytes::from("".to_string()))); } #[test] @@ -368,15 +394,18 @@ mod tests { patterns: vec![Regex::new(r"\d+").unwrap()], }; - let input = Value::String("12345".to_string()); + let input = VrlValue::Bytes(Bytes::from("12345".to_string())); let result = processor.process(&input).unwrap(); - assert_eq!(result, Value::String("".to_string())); + assert_eq!(result, VrlValue::Bytes(Bytes::from("".to_string()))); - let input = Value::String("no digits".to_string()); + let input = VrlValue::Bytes(Bytes::from("no digits".to_string())); let result = processor.process(&input).unwrap(); - assert_eq!(result, Value::String("no digits".to_string())); - let input = Value::String("".to_string()); + assert_eq!( + result, + VrlValue::Bytes(Bytes::from("no digits".to_string())) + ); + let input = VrlValue::Bytes(Bytes::from("".to_string())); let result = processor.process(&input).unwrap(); - assert_eq!(result, Value::String("".to_string())); + assert_eq!(result, VrlValue::Bytes(Bytes::from("".to_string()))); } } diff --git a/src/pipeline/src/etl/processor/dissect.rs b/src/pipeline/src/etl/processor/dissect.rs index 4a32a59b28..8ae2bb7aba 100644 --- a/src/pipeline/src/etl/processor/dissect.rs +++ b/src/pipeline/src/etl/processor/dissect.rs @@ -17,6 +17,8 @@ use std::ops::Deref; use ahash::{HashMap, HashMapExt, HashSet, HashSetExt}; use itertools::Itertools; use snafu::OptionExt; +use vrl::prelude::Bytes; +use vrl::value::{KeyString, Value as VrlValue}; use crate::error::{ DissectAppendOrderAlreadySetSnafu, DissectConsecutiveNamesSnafu, DissectEmptyPatternSnafu, @@ -24,13 +26,13 @@ use crate::error::{ DissectNoMatchingPatternSnafu, DissectOrderOnlyAppendModifierSnafu, DissectOrderOnlyAppendSnafu, DissectSplitExceedsInputSnafu, DissectSplitNotMatchInputSnafu, Error, KeyMustBeStringSnafu, ProcessorExpectStringSnafu, ProcessorMissingFieldSnafu, Result, + ValueMustBeMapSnafu, }; use crate::etl::field::Fields; use crate::etl::processor::{ yaml_bool, yaml_new_field, yaml_new_fields, yaml_parse_string, yaml_parse_strings, yaml_string, Processor, FIELDS_NAME, FIELD_NAME, IGNORE_MISSING_NAME, PATTERNS_NAME, PATTERN_NAME, }; -use crate::etl::value::Value; pub(crate) const PROCESSOR_DISSECT: &str = "dissect"; @@ -421,7 +423,7 @@ impl DissectProcessor { name: &'a Name, value: String, appends: &mut HashMap<&'a String, Vec<(String, u32)>>, - map: &mut Vec<(&'a String, Value)>, + map: &mut Vec<(&'a String, VrlValue)>, ) { match name.start_modifier { Some(StartModifier::NamedSkip) => { @@ -438,12 +440,16 @@ impl DissectProcessor { // because transform can know the key name } None => { - map.push((&name.name, Value::String(value))); + map.push((&name.name, VrlValue::Bytes(Bytes::from(value)))); } } } - fn process_pattern(&self, chs: &[char], pattern: &Pattern) -> Result> { + fn process_pattern( + &self, + chs: &[char], + pattern: &Pattern, + ) -> Result> { let mut map = Vec::new(); let mut pos = 0; @@ -523,14 +529,17 @@ impl DissectProcessor { for (name, mut values) in appends { values.sort_by(|a, b| a.1.cmp(&b.1)); let value = values.into_iter().map(|(a, _)| a).join(sep); - map.push((name, Value::String(value))); + map.push((name, VrlValue::Bytes(Bytes::from(value)))); } } - Ok(map.into_iter().map(|(k, v)| (k.to_string(), v)).collect()) + Ok(map + .into_iter() + .map(|(k, v)| (KeyString::from(k.clone()), v)) + .collect()) } - fn process(&self, val: &str) -> Result> { + fn process(&self, val: &str) -> Result> { let chs = val.chars().collect::>(); for pattern in &self.patterns { @@ -600,17 +609,18 @@ impl Processor for DissectProcessor { self.ignore_missing } - fn exec_mut(&self, mut val: Value) -> Result { + fn exec_mut(&self, mut val: VrlValue) -> Result { for field in self.fields.iter() { let index = field.input_field(); + let val = val.as_object_mut().context(ValueMustBeMapSnafu)?; match val.get(index) { - Some(Value::String(val_str)) => { - let r = self.process(val_str)?; + Some(VrlValue::Bytes(val_str)) => { + let r = self.process(String::from_utf8_lossy(val_str).as_ref())?; for (k, v) in r { - val.insert(k, v)?; + val.insert(k, v); } } - Some(Value::Null) | None => { + Some(VrlValue::Null) | None => { if !self.ignore_missing { return ProcessorMissingFieldSnafu { processor: self.kind(), @@ -639,17 +649,18 @@ fn is_valid_char(ch: char) -> bool { #[cfg(test)] mod tests { use ahash::HashMap; + use vrl::prelude::Bytes; + use vrl::value::{KeyString, Value as VrlValue}; use super::{DissectProcessor, EndModifier, Name, Part, StartModifier}; use crate::etl::processor::dissect::Pattern; - use crate::etl::value::Value; - fn assert(pattern_str: &str, input: &str, expected: HashMap) { + fn assert(pattern_str: &str, input: &str, expected: HashMap) { let chs = input.chars().collect::>(); let patterns: Vec = vec![pattern_str.parse().unwrap()]; let processor = DissectProcessor::default(); - let result: HashMap = processor + let result: HashMap = processor .process_pattern(&chs, &patterns[0]) .unwrap() .into_iter() @@ -991,8 +1002,13 @@ mod tests { ("httpversion", "1.0"), ] .into_iter() - .map(|(k, v)| (k.to_string(), Value::String(v.to_string()))) - .collect::>(); + .map(|(k, v)| { + ( + KeyString::from(k.to_string()), + VrlValue::Bytes(Bytes::from(v.to_string())), + ) + }) + .collect::>(); { // pattern start with Name @@ -1032,9 +1048,12 @@ mod tests { ] .into_iter() .map(|(pattern, input, expected)| { - let map = expected - .into_iter() - .map(|(k, v)| (k.to_string(), Value::String(v.to_string()))); + let map = expected.into_iter().map(|(k, v)| { + ( + KeyString::from(k.to_string()), + VrlValue::Bytes(Bytes::from(v.to_string())), + ) + }); (pattern, input, map) }); @@ -1042,7 +1061,7 @@ mod tests { assert( pattern_str, input, - expected.collect::>(), + expected.collect::>(), ); } } @@ -1063,9 +1082,12 @@ mod tests { ] .into_iter() .map(|(pattern, input, expected)| { - let map = expected - .into_iter() - .map(|(k, v)| (k.to_string(), Value::String(v.to_string()))); + let map = expected.into_iter().map(|(k, v)| { + ( + KeyString::from(k.to_string()), + VrlValue::Bytes(Bytes::from(v.to_string())), + ) + }); (pattern, input, map) }); @@ -1073,7 +1095,7 @@ mod tests { assert( pattern_str, input, - expected.collect::>(), + expected.collect::>(), ); } } @@ -1090,9 +1112,12 @@ mod tests { )] .into_iter() .map(|(pattern, input, expected)| { - let map = expected - .into_iter() - .map(|(k, v)| (k.to_string(), Value::String(v.to_string()))); + let map = expected.into_iter().map(|(k, v)| { + ( + KeyString::from(k.to_string()), + VrlValue::Bytes(Bytes::from(v.to_string())), + ) + }); (pattern, input, map) }); @@ -1100,7 +1125,7 @@ mod tests { assert( pattern_str, input, - expected.collect::>(), + expected.collect::>(), ); } } diff --git a/src/pipeline/src/etl/processor/epoch.rs b/src/pipeline/src/etl/processor/epoch.rs index 5d70483920..ca0aec9413 100644 --- a/src/pipeline/src/etl/processor/epoch.rs +++ b/src/pipeline/src/etl/processor/epoch.rs @@ -12,24 +12,26 @@ // See the License for the specific language governing permissions and // limitations under the License. +use chrono::{DateTime, Utc}; use common_time::timestamp::TimeUnit; use snafu::{OptionExt, ResultExt}; +use vrl::value::{KeyString, Value as VrlValue}; use crate::error::{ - EpochInvalidResolutionSnafu, Error, FailedToParseIntSnafu, KeyMustBeStringSnafu, - ProcessorMissingFieldSnafu, ProcessorUnsupportedValueSnafu, Result, + EpochInvalidResolutionSnafu, Error, FailedToParseIntSnafu, InvalidEpochForResolutionSnafu, + KeyMustBeStringSnafu, ProcessorMissingFieldSnafu, ProcessorUnsupportedValueSnafu, Result, + ValueMustBeMapSnafu, }; use crate::etl::field::Fields; use crate::etl::processor::{ yaml_bool, yaml_new_field, yaml_new_fields, yaml_string, Processor, FIELDS_NAME, FIELD_NAME, IGNORE_MISSING_NAME, }; -use crate::etl::value::time::{ +use crate::etl::value::{ MICROSECOND_RESOLUTION, MICRO_RESOLUTION, MILLISECOND_RESOLUTION, MILLI_RESOLUTION, MS_RESOLUTION, NANOSECOND_RESOLUTION, NANO_RESOLUTION, NS_RESOLUTION, SECOND_RESOLUTION, SEC_RESOLUTION, S_RESOLUTION, US_RESOLUTION, }; -use crate::etl::value::{Timestamp, Value}; pub(crate) const PROCESSOR_EPOCH: &str = "epoch"; const RESOLUTION_NAME: &str = "resolution"; @@ -43,6 +45,18 @@ pub(crate) enum Resolution { Nano, } +impl std::fmt::Display for Resolution { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + let text = match self { + Resolution::Second => SECOND_RESOLUTION, + Resolution::Milli => MILLISECOND_RESOLUTION, + Resolution::Micro => MICROSECOND_RESOLUTION, + Resolution::Nano => NANOSECOND_RESOLUTION, + }; + write!(f, "{}", text) + } +} + impl TryFrom<&str> for Resolution { type Error = Error; @@ -84,43 +98,36 @@ pub struct EpochProcessor { } impl EpochProcessor { - fn parse(&self, val: &Value) -> Result { - let t: i64 = match val { - Value::String(s) => s - .parse::() - .context(FailedToParseIntSnafu { value: s })?, - Value::Int16(i) => *i as i64, - Value::Int32(i) => *i as i64, - Value::Int64(i) => *i, - Value::Uint8(i) => *i as i64, - Value::Uint16(i) => *i as i64, - Value::Uint32(i) => *i as i64, - Value::Uint64(i) => *i as i64, - Value::Float32(f) => *f as i64, - Value::Float64(f) => *f as i64, - - Value::Timestamp(t) => match self.resolution { - Resolution::Second => t.timestamp(), - Resolution::Milli => t.timestamp_millis(), - Resolution::Micro => t.timestamp_micros(), - Resolution::Nano => t.timestamp_nanos(), - }, - - _ => { - return ProcessorUnsupportedValueSnafu { - processor: PROCESSOR_EPOCH, - val: val.to_string(), + fn parse(&self, val: &VrlValue) -> Result> { + let t: i64 = + match val { + VrlValue::Bytes(bytes) => String::from_utf8_lossy(bytes).parse::().context( + FailedToParseIntSnafu { + value: val.to_string_lossy(), + }, + )?, + VrlValue::Integer(ts) => *ts, + VrlValue::Float(not_nan) => not_nan.into_inner() as i64, + VrlValue::Timestamp(date_time) => return Ok(*date_time), + _ => { + return ProcessorUnsupportedValueSnafu { + processor: PROCESSOR_EPOCH, + val: val.to_string(), + } + .fail(); } - .fail(); - } - }; + }; match self.resolution { - Resolution::Second => Ok(Timestamp::Second(t)), - Resolution::Milli => Ok(Timestamp::Millisecond(t)), - Resolution::Micro => Ok(Timestamp::Microsecond(t)), - Resolution::Nano => Ok(Timestamp::Nanosecond(t)), + Resolution::Second => DateTime::from_timestamp(t, 0), + Resolution::Milli => DateTime::from_timestamp_millis(t), + Resolution::Micro => DateTime::from_timestamp_micros(t), + Resolution::Nano => Some(DateTime::from_timestamp_nanos(t)), } + .context(InvalidEpochForResolutionSnafu { + value: t, + resolution: self.resolution.to_string(), + }) } } @@ -174,11 +181,12 @@ impl Processor for EpochProcessor { self.ignore_missing } - fn exec_mut(&self, mut val: Value) -> Result { + fn exec_mut(&self, mut val: VrlValue) -> Result { for field in self.fields.iter() { let index = field.input_field(); + let val = val.as_object_mut().context(ValueMustBeMapSnafu)?; match val.get(index) { - Some(Value::Null) | None => { + Some(VrlValue::Null) | None => { if !self.ignore_missing { return ProcessorMissingFieldSnafu { processor: self.kind(), @@ -190,7 +198,10 @@ impl Processor for EpochProcessor { Some(v) => { let timestamp = self.parse(v)?; let output_index = field.target_or_input_field(); - val.insert(output_index.to_string(), Value::Timestamp(timestamp))?; + val.insert( + KeyString::from(output_index.to_string()), + VrlValue::Timestamp(timestamp), + ); } } } @@ -200,8 +211,12 @@ impl Processor for EpochProcessor { #[cfg(test)] mod tests { + use chrono::DateTime; + use ordered_float::NotNan; + use vrl::prelude::Bytes; + use vrl::value::Value as VrlValue; + use super::EpochProcessor; - use crate::etl::value::Value; #[test] fn test_parse_epoch() { @@ -211,15 +226,15 @@ mod tests { }; let values = [ - Value::String("1573840000".into()), - Value::Int32(1573840000), - Value::Uint64(1573840000), - Value::Float32(1573840000.0), + VrlValue::Bytes(Bytes::from("1573840000")), + VrlValue::Integer(1573840000), + VrlValue::Integer(1573840000), + VrlValue::Float(NotNan::new(1573840000.0).unwrap()), ]; for value in values { let parsed = processor.parse(&value).unwrap(); - assert_eq!(parsed, super::Timestamp::Second(1573840000)); + assert_eq!(parsed, DateTime::from_timestamp(1573840000, 0).unwrap()); } } } diff --git a/src/pipeline/src/etl/processor/gsub.rs b/src/pipeline/src/etl/processor/gsub.rs index 06047b6dfb..c4c44e0ace 100644 --- a/src/pipeline/src/etl/processor/gsub.rs +++ b/src/pipeline/src/etl/processor/gsub.rs @@ -14,17 +14,19 @@ use regex::Regex; use snafu::{OptionExt, ResultExt}; +use vrl::prelude::Bytes; +use vrl::value::{KeyString, Value as VrlValue}; use crate::error::{ Error, GsubPatternRequiredSnafu, GsubReplacementRequiredSnafu, KeyMustBeStringSnafu, ProcessorExpectStringSnafu, ProcessorMissingFieldSnafu, RegexSnafu, Result, + ValueMustBeMapSnafu, }; use crate::etl::field::Fields; use crate::etl::processor::{ yaml_bool, yaml_new_field, yaml_new_fields, yaml_string, FIELDS_NAME, FIELD_NAME, IGNORE_MISSING_NAME, PATTERN_NAME, }; -use crate::etl::value::Value; pub(crate) const PROCESSOR_GSUB: &str = "gsub"; @@ -40,16 +42,16 @@ pub struct GsubProcessor { } impl GsubProcessor { - fn process_string(&self, val: &str) -> Result { + fn process_string(&self, val: &str) -> Result { let new_val = self.pattern.replace_all(val, &self.replacement).to_string(); - let val = Value::String(new_val); + let val = VrlValue::Bytes(Bytes::from(new_val)); Ok(val) } - fn process(&self, val: &Value) -> Result { + fn process(&self, val: &VrlValue) -> Result { match val { - Value::String(val) => self.process_string(val), + VrlValue::Bytes(val) => self.process_string(String::from_utf8_lossy(val).as_ref()), _ => ProcessorExpectStringSnafu { processor: PROCESSOR_GSUB, v: val.clone(), @@ -117,11 +119,12 @@ impl crate::etl::processor::Processor for GsubProcessor { self.ignore_missing } - fn exec_mut(&self, mut val: Value) -> Result { + fn exec_mut(&self, mut val: VrlValue) -> Result { for field in self.fields.iter() { let index = field.input_field(); + let val = val.as_object_mut().context(ValueMustBeMapSnafu)?; match val.get(index) { - Some(Value::Null) | None => { + Some(VrlValue::Null) | None => { if !self.ignore_missing { return ProcessorMissingFieldSnafu { processor: self.kind(), @@ -133,7 +136,7 @@ impl crate::etl::processor::Processor for GsubProcessor { Some(v) => { let result = self.process(v)?; let output_index = field.target_or_input_field(); - val.insert(output_index.to_string(), result)?; + val.insert(KeyString::from(output_index.to_string()), result); } } } @@ -145,7 +148,6 @@ impl crate::etl::processor::Processor for GsubProcessor { mod tests { use super::*; use crate::etl::processor::gsub::GsubProcessor; - use crate::etl::value::Value; #[test] fn test_string_value() { @@ -156,9 +158,9 @@ mod tests { ignore_missing: false, }; - let val = Value::String("123".to_string()); + let val = VrlValue::Bytes(Bytes::from("123")); let result = processor.process(&val).unwrap(); - assert_eq!(result, Value::String("xxx".to_string())); + assert_eq!(result, VrlValue::Bytes(Bytes::from("xxx"))); } } diff --git a/src/pipeline/src/etl/processor/join.rs b/src/pipeline/src/etl/processor/join.rs index 816d38187e..3712dd70c7 100644 --- a/src/pipeline/src/etl/processor/join.rs +++ b/src/pipeline/src/etl/processor/join.rs @@ -13,17 +13,18 @@ // limitations under the License. use snafu::OptionExt; +use vrl::prelude::Bytes; +use vrl::value::{KeyString, Value as VrlValue}; use crate::error::{ Error, JoinSeparatorRequiredSnafu, KeyMustBeStringSnafu, ProcessorExpectStringSnafu, - ProcessorMissingFieldSnafu, Result, + ProcessorMissingFieldSnafu, Result, ValueMustBeMapSnafu, }; use crate::etl::field::Fields; use crate::etl::processor::{ yaml_bool, yaml_new_field, yaml_new_fields, yaml_string, Processor, FIELDS_NAME, FIELD_NAME, IGNORE_MISSING_NAME, SEPARATOR_NAME, }; -use crate::etl::value::{Array, Value}; pub(crate) const PROCESSOR_JOIN: &str = "join"; @@ -36,14 +37,14 @@ pub struct JoinProcessor { } impl JoinProcessor { - fn process(&self, arr: &Array) -> Result { + fn process(&self, arr: &[VrlValue]) -> Result { let val = arr .iter() - .map(|v| v.to_str_value()) - .collect::>() + .map(|v| v.to_string_lossy()) + .collect::>() .join(&self.separator); - Ok(Value::String(val)) + Ok(VrlValue::Bytes(Bytes::from(val))) } } @@ -94,16 +95,17 @@ impl Processor for JoinProcessor { self.ignore_missing } - fn exec_mut(&self, mut val: Value) -> Result { + fn exec_mut(&self, mut val: VrlValue) -> Result { for field in self.fields.iter() { let index = field.input_field(); + let val = val.as_object_mut().context(ValueMustBeMapSnafu)?; match val.get(index) { - Some(Value::Array(arr)) => { + Some(VrlValue::Array(arr)) => { let result = self.process(arr)?; let output_index = field.target_or_input_field(); - val.insert(output_index.to_string(), result)?; + val.insert(KeyString::from(output_index.to_string()), result); } - Some(Value::Null) | None => { + Some(VrlValue::Null) | None => { if !self.ignore_missing { return ProcessorMissingFieldSnafu { processor: self.kind(), @@ -129,8 +131,10 @@ impl Processor for JoinProcessor { #[cfg(test)] mod tests { + use vrl::prelude::Bytes; + use vrl::value::Value as VrlValue; + use crate::etl::processor::join::JoinProcessor; - use crate::etl::value::Value; #[test] fn test_join_processor() { @@ -140,11 +144,10 @@ mod tests { }; let arr = vec![ - Value::String("a".to_string()), - Value::String("b".to_string()), - ] - .into(); + VrlValue::Bytes(Bytes::from("a")), + VrlValue::Bytes(Bytes::from("b")), + ]; let result = processor.process(&arr).unwrap(); - assert_eq!(result, Value::String("a-b".to_string())); + assert_eq!(result, VrlValue::Bytes(Bytes::from("a-b"))); } } diff --git a/src/pipeline/src/etl/processor/json_parse.rs b/src/pipeline/src/etl/processor/json_parse.rs index 84ea18ebdc..b6ef533fdc 100644 --- a/src/pipeline/src/etl/processor/json_parse.rs +++ b/src/pipeline/src/etl/processor/json_parse.rs @@ -13,16 +13,17 @@ // limitations under the License. use snafu::{OptionExt as _, ResultExt}; +use vrl::value::{KeyString, Value as VrlValue}; use crate::error::{ Error, FieldMustBeTypeSnafu, JsonParseSnafu, KeyMustBeStringSnafu, ProcessorMissingFieldSnafu, - ProcessorUnsupportedValueSnafu, Result, + ProcessorUnsupportedValueSnafu, Result, ValueMustBeMapSnafu, }; use crate::etl::field::Fields; use crate::etl::processor::{ yaml_bool, yaml_new_field, yaml_new_fields, FIELDS_NAME, FIELD_NAME, IGNORE_MISSING_NAME, }; -use crate::{json_to_map, Processor, Value}; +use crate::Processor; pub(crate) const PROCESSOR_JSON_PARSE: &str = "json_parse"; @@ -67,21 +68,21 @@ impl TryFrom<&yaml_rust::yaml::Hash> for JsonParseProcessor { } impl JsonParseProcessor { - fn process_field(&self, val: &Value) -> Result { + fn process_field(&self, val: &VrlValue) -> Result { let Some(json_str) = val.as_str() else { return FieldMustBeTypeSnafu { - field: val.to_str_type(), + field: val.to_string(), ty: "string", } .fail(); }; - let parsed: serde_json::Value = serde_json::from_str(json_str).context(JsonParseSnafu)?; + let parsed: VrlValue = serde_json::from_str(&json_str).context(JsonParseSnafu)?; match parsed { - serde_json::Value::Object(_) => Ok(json_to_map(parsed)?), - serde_json::Value::Array(arr) => Ok(Value::Array(arr.try_into()?)), + VrlValue::Object(_) => Ok(parsed), + VrlValue::Array(_) => Ok(parsed), _ => ProcessorUnsupportedValueSnafu { processor: self.kind(), - val: val.to_str_type(), + val: val.to_string(), } .fail(), } @@ -97,14 +98,15 @@ impl Processor for JsonParseProcessor { self.ignore_missing } - fn exec_mut(&self, mut val: Value) -> Result { + fn exec_mut(&self, mut val: VrlValue) -> Result { for field in self.fields.iter() { let index = field.input_field(); + let val = val.as_object_mut().context(ValueMustBeMapSnafu)?; match val.get(index) { Some(v) => { let processed = self.process_field(v)?; let output_index = field.target_or_input_field(); - val.insert(output_index.to_string(), processed)?; + val.insert(KeyString::from(output_index.to_string()), processed); } None => { if !self.ignore_missing { @@ -123,24 +125,27 @@ impl Processor for JsonParseProcessor { #[cfg(test)] mod test { + use std::collections::BTreeMap; + + use vrl::prelude::Bytes; + use vrl::value::{KeyString, Value as VrlValue}; + + use crate::etl::processor::json_parse::JsonParseProcessor; #[test] fn test_json_parse() { - use super::*; - use crate::Value; - let processor = JsonParseProcessor { ..Default::default() }; let result = processor - .process_field(&Value::String(r#"{"hello": "world"}"#.to_string())) + .process_field(&VrlValue::Bytes(Bytes::from(r#"{"hello": "world"}"#))) .unwrap(); - let expected = Value::Map(crate::Map::one( - "hello".to_string(), - Value::String("world".to_string()), - )); + let expected = VrlValue::Object(BTreeMap::from([( + KeyString::from("hello"), + VrlValue::Bytes(Bytes::from("world")), + )])); assert_eq!(result, expected); } diff --git a/src/pipeline/src/etl/processor/json_path.rs b/src/pipeline/src/etl/processor/json_path.rs index df515f966d..ff9b1c12bf 100644 --- a/src/pipeline/src/etl/processor/json_path.rs +++ b/src/pipeline/src/etl/processor/json_path.rs @@ -14,17 +14,17 @@ use jsonpath_rust::JsonPath; use snafu::{OptionExt, ResultExt}; +use vrl::value::{KeyString, Value as VrlValue}; use crate::error::{ - Error, JsonPathParseResultIndexSnafu, JsonPathParseSnafu, KeyMustBeStringSnafu, - ProcessorMissingFieldSnafu, Result, + Error, JsonParseSnafu, JsonPathParseResultIndexSnafu, JsonPathParseSnafu, KeyMustBeStringSnafu, + ProcessorMissingFieldSnafu, Result, ValueMustBeMapSnafu, }; use crate::etl::field::Fields; use crate::etl::processor::{ yaml_bool, yaml_new_field, yaml_new_fields, yaml_string, Processor, FIELDS_NAME, FIELD_NAME, IGNORE_MISSING_NAME, JSON_PATH_NAME, JSON_PATH_RESULT_INDEX_NAME, }; -use crate::Value; pub(crate) const PROCESSOR_JSON_PATH: &str = "json_path"; @@ -84,7 +84,7 @@ impl TryFrom<&yaml_rust::yaml::Hash> for JsonPathProcessor { #[derive(Debug)] pub struct JsonPathProcessor { fields: Fields, - json_path: JsonPath, + json_path: JsonPath, ignore_missing: bool, result_index: Option, } @@ -101,17 +101,22 @@ impl Default for JsonPathProcessor { } impl JsonPathProcessor { - fn process_field(&self, val: &Value) -> Result { - let processed = self.json_path.find(val); - match processed { - Value::Array(arr) => { + fn process_field(&self, val: &VrlValue) -> Result { + let v = serde_json::to_value(val).context(JsonParseSnafu)?; + let p = self.json_path.find(&v); + match p { + serde_json::Value::Array(arr) => { if let Some(index) = self.result_index { - Ok(arr.get(index).cloned().unwrap_or(Value::Null)) + Ok(arr + .get(index) + .cloned() + .map(|v| v.into()) + .unwrap_or(VrlValue::Null)) } else { - Ok(Value::Array(arr)) + Ok(VrlValue::Array(arr.into_iter().map(|v| v.into()).collect())) } } - v => Ok(v), + v => Ok(v.into()), } } } @@ -125,14 +130,15 @@ impl Processor for JsonPathProcessor { self.ignore_missing } - fn exec_mut(&self, mut val: Value) -> Result { + fn exec_mut(&self, mut val: VrlValue) -> Result { for field in self.fields.iter() { let index = field.input_field(); + let val = val.as_object_mut().context(ValueMustBeMapSnafu)?; match val.get(index) { Some(v) => { let processed = self.process_field(v)?; let output_index = field.target_or_input_field(); - val.insert(output_index.to_string(), processed)?; + val.insert(KeyString::from(output_index), processed); } None => { if !self.ignore_missing { @@ -151,12 +157,13 @@ impl Processor for JsonPathProcessor { #[cfg(test)] mod test { - use crate::Map; + use std::collections::BTreeMap; + + use vrl::prelude::Bytes; #[test] fn test_json_path() { use super::*; - use crate::Value; let json_path = JsonPath::try_from("$.hello").unwrap(); let processor = JsonPathProcessor { @@ -166,11 +173,11 @@ mod test { }; let result = processor - .process_field(&Value::Map(Map::one( - "hello", - Value::String("world".to_string()), - ))) + .process_field(&VrlValue::Object(BTreeMap::from([( + KeyString::from("hello"), + VrlValue::Bytes(Bytes::from("world")), + )]))) .unwrap(); - assert_eq!(result, Value::String("world".to_string())); + assert_eq!(result, VrlValue::Bytes(Bytes::from("world"))); } } diff --git a/src/pipeline/src/etl/processor/letter.rs b/src/pipeline/src/etl/processor/letter.rs index 2e8c894bce..4882cabd64 100644 --- a/src/pipeline/src/etl/processor/letter.rs +++ b/src/pipeline/src/etl/processor/letter.rs @@ -13,17 +13,18 @@ // limitations under the License. use snafu::OptionExt; +use vrl::prelude::Bytes; +use vrl::value::{KeyString, Value as VrlValue}; use crate::error::{ Error, KeyMustBeStringSnafu, LetterInvalidMethodSnafu, ProcessorExpectStringSnafu, - ProcessorMissingFieldSnafu, Result, + ProcessorMissingFieldSnafu, Result, ValueMustBeMapSnafu, }; use crate::etl::field::Fields; use crate::etl::processor::{ yaml_bool, yaml_new_field, yaml_new_fields, yaml_string, Processor, FIELDS_NAME, FIELD_NAME, IGNORE_MISSING_NAME, METHOD_NAME, }; -use crate::etl::value::Value; pub(crate) const PROCESSOR_LETTER: &str = "letter"; @@ -67,15 +68,14 @@ pub struct LetterProcessor { } impl LetterProcessor { - fn process_field(&self, val: &str) -> Result { - let processed = match self.method { - Method::Upper => val.to_uppercase(), - Method::Lower => val.to_lowercase(), - Method::Capital => capitalize(val), - }; - let val = Value::String(processed); - - Ok(val) + fn process_field(&self, val: &Bytes) -> VrlValue { + match self.method { + Method::Upper => VrlValue::Bytes(Bytes::from(val.to_ascii_uppercase())), + Method::Lower => VrlValue::Bytes(Bytes::from(val.to_ascii_lowercase())), + Method::Capital => VrlValue::Bytes(Bytes::from(capitalize( + String::from_utf8_lossy(val).as_ref(), + ))), + } } } @@ -125,16 +125,17 @@ impl Processor for LetterProcessor { self.ignore_missing } - fn exec_mut(&self, mut val: Value) -> Result { + fn exec_mut(&self, mut val: VrlValue) -> Result { for field in self.fields.iter() { let index = field.input_field(); + let val = val.as_object_mut().context(ValueMustBeMapSnafu)?; match val.get(index) { - Some(Value::String(s)) => { - let result = self.process_field(s)?; + Some(VrlValue::Bytes(s)) => { + let result = self.process_field(s); let output_key = field.target_or_input_field(); - val.insert(output_key.to_string(), result)?; + val.insert(KeyString::from(output_key), result); } - Some(Value::Null) | None => { + Some(VrlValue::Null) | None => { if !self.ignore_missing { return ProcessorMissingFieldSnafu { processor: self.kind(), @@ -167,8 +168,10 @@ fn capitalize(s: &str) -> String { #[cfg(test)] mod tests { + use vrl::prelude::Bytes; + use vrl::value::Value as VrlValue; + use crate::etl::processor::letter::{LetterProcessor, Method}; - use crate::etl::value::Value; #[test] fn test_process() { @@ -177,8 +180,8 @@ mod tests { method: Method::Upper, ..Default::default() }; - let processed = processor.process_field("pipeline").unwrap(); - assert_eq!(Value::String("PIPELINE".into()), processed) + let processed = processor.process_field(&Bytes::from("pipeline")); + assert_eq!(VrlValue::Bytes(Bytes::from("PIPELINE")), processed) } { @@ -186,8 +189,8 @@ mod tests { method: Method::Lower, ..Default::default() }; - let processed = processor.process_field("Pipeline").unwrap(); - assert_eq!(Value::String("pipeline".into()), processed) + let processed = processor.process_field(&Bytes::from("Pipeline")); + assert_eq!(VrlValue::Bytes(Bytes::from("pipeline")), processed) } { @@ -195,8 +198,8 @@ mod tests { method: Method::Capital, ..Default::default() }; - let processed = processor.process_field("pipeline").unwrap(); - assert_eq!(Value::String("Pipeline".into()), processed) + let processed = processor.process_field(&Bytes::from("pipeline")); + assert_eq!(VrlValue::Bytes(Bytes::from("Pipeline")), processed) } } } diff --git a/src/pipeline/src/etl/processor/regex.rs b/src/pipeline/src/etl/processor/regex.rs index b9ff830666..10040b2669 100644 --- a/src/pipeline/src/etl/processor/regex.rs +++ b/src/pipeline/src/etl/processor/regex.rs @@ -23,18 +23,19 @@ use std::collections::BTreeMap; use lazy_static::lazy_static; use regex::Regex; use snafu::{OptionExt, ResultExt}; +use vrl::prelude::Bytes; +use vrl::value::{KeyString, Value as VrlValue}; use crate::error::{ Error, KeyMustBeStringSnafu, ProcessorExpectStringSnafu, ProcessorMissingFieldSnafu, RegexNamedGroupNotFoundSnafu, RegexNoValidFieldSnafu, RegexNoValidPatternSnafu, RegexSnafu, - Result, + Result, ValueMustBeMapSnafu, }; use crate::etl::field::Fields; use crate::etl::processor::{ yaml_bool, yaml_new_field, yaml_new_fields, yaml_string, yaml_strings, Processor, FIELDS_NAME, FIELD_NAME, IGNORE_MISSING_NAME, PATTERN_NAME, }; -use crate::etl::value::Value; lazy_static! { static ref GROUPS_NAME_REGEX: Regex = Regex::new(r"\(\?P?<([[:word:]]+)>.+?\)").unwrap(); @@ -168,14 +169,17 @@ impl RegexProcessor { Ok(()) } - fn process(&self, prefix: &str, val: &str) -> Result> { + fn process(&self, prefix: &str, val: &str) -> Result> { let mut result = BTreeMap::new(); for gr in self.patterns.iter() { if let Some(captures) = gr.regex.captures(val) { for group in gr.groups.iter() { if let Some(capture) = captures.name(group) { let value = capture.as_str().to_string(); - result.insert(generate_key(prefix, group), Value::String(value)); + result.insert( + KeyString::from(generate_key(prefix, group)), + VrlValue::Bytes(Bytes::from(value)), + ); } } } @@ -193,16 +197,17 @@ impl Processor for RegexProcessor { self.ignore_missing } - fn exec_mut(&self, mut val: Value) -> Result { + fn exec_mut(&self, mut val: VrlValue) -> Result { for field in self.fields.iter() { let index = field.input_field(); let prefix = field.target_or_input_field(); + let val = val.as_object_mut().context(ValueMustBeMapSnafu)?; match val.get(index) { - Some(Value::String(s)) => { - let result = self.process(prefix, s)?; - val.extend(result.into())?; + Some(VrlValue::Bytes(s)) => { + let result = self.process(prefix, String::from_utf8_lossy(s).as_ref())?; + val.extend(result); } - Some(Value::Null) | None => { + Some(VrlValue::Null) | None => { if !self.ignore_missing { return ProcessorMissingFieldSnafu { processor: self.kind(), @@ -226,12 +231,11 @@ impl Processor for RegexProcessor { } #[cfg(test)] mod tests { - use ahash::{HashMap, HashMapExt}; use itertools::Itertools; + use vrl::value::Value as VrlValue; use super::*; use crate::etl::processor::regex::RegexProcessor; - use crate::etl::value::{Map, Value}; #[test] fn test_simple_parse() { @@ -250,15 +254,11 @@ ignore_missing: false"#; let result = processor.process("a", "123").unwrap(); - let map = Map { values: result }; + let v = vec![(KeyString::from("a_ar"), VrlValue::Bytes(Bytes::from("1")))] + .into_iter() + .collect::>(); - let v = Map { - values: vec![("a_ar".to_string(), Value::String("1".to_string()))] - .into_iter() - .collect(), - }; - - assert_eq!(v, map); + assert_eq!(v, result); } #[test] @@ -270,15 +270,30 @@ ignore_missing: false"#; let cw = "[c=w,n=US_CA_SANJOSE,o=55155]"; let breadcrumbs_str = [cc, cg, co, cp, cw].iter().join(","); - let temporary_map: BTreeMap = [ - ("breadcrumbs_parent", Value::String(cc.to_string())), - ("breadcrumbs_edge", Value::String(cg.to_string())), - ("breadcrumbs_origin", Value::String(co.to_string())), - ("breadcrumbs_peer", Value::String(cp.to_string())), - ("breadcrumbs_wrapper", Value::String(cw.to_string())), + let temporary_map: BTreeMap = [ + ( + "breadcrumbs_parent", + VrlValue::Bytes(Bytes::from(cc.to_string())), + ), + ( + "breadcrumbs_edge", + VrlValue::Bytes(Bytes::from(cg.to_string())), + ), + ( + "breadcrumbs_origin", + VrlValue::Bytes(Bytes::from(co.to_string())), + ), + ( + "breadcrumbs_peer", + VrlValue::Bytes(Bytes::from(cp.to_string())), + ), + ( + "breadcrumbs_wrapper", + VrlValue::Bytes(Bytes::from(cw.to_string())), + ), ] .into_iter() - .map(|(k, v)| (k.to_string(), v)) + .map(|(k, v)| (KeyString::from(k), v)) .collect(); { @@ -331,35 +346,66 @@ ignore_missing: false"#; let processor_yaml_hash = processor_yaml.as_hash().unwrap(); let processor = RegexProcessor::try_from(processor_yaml_hash).unwrap(); - let mut result = HashMap::new(); + let mut result = BTreeMap::new(); for field in processor.fields.iter() { - let s = temporary_map - .get(field.input_field()) - .unwrap() - .to_str_value(); + let s = temporary_map.get(field.input_field()).unwrap(); + let s = s.to_string_lossy(); let prefix = field.target_or_input_field(); - let r = processor.process(prefix, &s).unwrap(); + let r = processor.process(prefix, s.as_ref()).unwrap(); result.extend(r); } let new_values = vec![ - ("edge_ip", Value::String("12.34.567.89".to_string())), - ("edge_request_id", Value::String("12345678".to_string())), - ("edge_geo", Value::String("US_CA_SANJOSE".to_string())), - ("edge_asn", Value::String("20940".to_string())), - ("origin_ip", Value::String("987.654.321.09".to_string())), - ("peer_asn", Value::String("55155".to_string())), - ("peer_geo", Value::String("US_CA_SANJOSE".to_string())), - ("parent_asn", Value::String("55155".to_string())), - ("parent_geo", Value::String("US_CA_SANJOSE".to_string())), - ("wrapper_asn", Value::String("55155".to_string())), - ("wrapper_geo", Value::String("US_CA_SANJOSE".to_string())), + ( + "edge_ip", + VrlValue::Bytes(Bytes::from("12.34.567.89".to_string())), + ), + ( + "edge_request_id", + VrlValue::Bytes(Bytes::from("12345678".to_string())), + ), + ( + "edge_geo", + VrlValue::Bytes(Bytes::from("US_CA_SANJOSE".to_string())), + ), + ( + "edge_asn", + VrlValue::Bytes(Bytes::from("20940".to_string())), + ), + ( + "origin_ip", + VrlValue::Bytes(Bytes::from("987.654.321.09".to_string())), + ), + ( + "peer_asn", + VrlValue::Bytes(Bytes::from("55155".to_string())), + ), + ( + "peer_geo", + VrlValue::Bytes(Bytes::from("US_CA_SANJOSE".to_string())), + ), + ( + "parent_asn", + VrlValue::Bytes(Bytes::from("55155".to_string())), + ), + ( + "parent_geo", + VrlValue::Bytes(Bytes::from("US_CA_SANJOSE".to_string())), + ), + ( + "wrapper_asn", + VrlValue::Bytes(Bytes::from("55155".to_string())), + ), + ( + "wrapper_geo", + VrlValue::Bytes(Bytes::from("US_CA_SANJOSE".to_string())), + ), ] .into_iter() - .map(|(k, v)| (k.to_string(), v)) - .collect(); + .map(|(k, v)| (KeyString::from(k), v)) + .collect::>(); assert_eq!(result, new_values); } diff --git a/src/pipeline/src/etl/processor/select.rs b/src/pipeline/src/etl/processor/select.rs index dbb3a11353..a460d3cb02 100644 --- a/src/pipeline/src/etl/processor/select.rs +++ b/src/pipeline/src/etl/processor/select.rs @@ -14,6 +14,7 @@ use ahash::{HashSet, HashSetExt}; use snafu::OptionExt; +use vrl::value::{KeyString, Value as VrlValue}; use crate::error::{ Error, KeyMustBeStringSnafu, ProcessorUnsupportedValueSnafu, Result, ValueMustBeMapSnafu, @@ -22,7 +23,7 @@ use crate::etl::field::Fields; use crate::etl::processor::{ yaml_new_field, yaml_new_fields, yaml_string, FIELDS_NAME, FIELD_NAME, TYPE_NAME, }; -use crate::{Processor, Value}; +use crate::Processor; pub(crate) const PROCESSOR_SELECT: &str = "select"; const INCLUDE_KEY: &str = "include"; @@ -98,8 +99,8 @@ impl Processor for SelectProcessor { true } - fn exec_mut(&self, mut val: Value) -> Result { - let v_map = val.as_map_mut().context(ValueMustBeMapSnafu)?; + fn exec_mut(&self, mut val: VrlValue) -> Result { + let v_map = val.as_object_mut().context(ValueMustBeMapSnafu)?; match self.select_type { SelectType::Include => { @@ -109,7 +110,7 @@ impl Processor for SelectProcessor { let field_name = field.input_field(); if let Some(target_name) = field.target_field() { if let Some(v) = v_map.remove(field_name) { - v_map.insert(target_name.to_string(), v); + v_map.insert(KeyString::from(target_name), v); } include_key_set.insert(target_name); } else { @@ -133,9 +134,12 @@ impl Processor for SelectProcessor { mod test { use std::collections::BTreeMap; + use vrl::prelude::Bytes; + use vrl::value::{KeyString, Value as VrlValue}; + use crate::etl::field::{Field, Fields}; use crate::etl::processor::select::{SelectProcessor, SelectType}; - use crate::{Map, Processor, Value}; + use crate::Processor; #[test] fn test_select() { @@ -145,15 +149,24 @@ mod test { }; let mut p = BTreeMap::new(); - p.insert("hello".to_string(), Value::String("world".to_string())); - p.insert("hello2".to_string(), Value::String("world2".to_string())); + p.insert( + KeyString::from("hello"), + VrlValue::Bytes(Bytes::from("world".to_string())), + ); + p.insert( + KeyString::from("hello2"), + VrlValue::Bytes(Bytes::from("world2".to_string())), + ); - let result = processor.exec_mut(Value::Map(Map { values: p })); + let result = processor.exec_mut(VrlValue::Object(p)); assert!(result.is_ok()); let mut result = result.unwrap(); - let p = result.as_map_mut().unwrap(); + let p = result.as_object_mut().unwrap(); assert_eq!(p.len(), 1); - assert_eq!(p.get("hello"), Some(&Value::String("world".to_string()))); + assert_eq!( + p.get(&KeyString::from("hello")), + Some(&VrlValue::Bytes(Bytes::from("world".to_string()))) + ); } #[test] @@ -164,15 +177,24 @@ mod test { }; let mut p = BTreeMap::new(); - p.insert("hello".to_string(), Value::String("world".to_string())); - p.insert("hello2".to_string(), Value::String("world2".to_string())); + p.insert( + KeyString::from("hello"), + VrlValue::Bytes(Bytes::from("world".to_string())), + ); + p.insert( + KeyString::from("hello2"), + VrlValue::Bytes(Bytes::from("world2".to_string())), + ); - let result = processor.exec_mut(Value::Map(Map { values: p })); + let result = processor.exec_mut(VrlValue::Object(p)); assert!(result.is_ok()); let mut result = result.unwrap(); - let p = result.as_map_mut().unwrap(); + let p = result.as_object_mut().unwrap(); assert_eq!(p.len(), 1); - assert_eq!(p.get("hello3"), Some(&Value::String("world".to_string()))); + assert_eq!( + p.get(&KeyString::from("hello3")), + Some(&VrlValue::Bytes(Bytes::from("world".to_string()))) + ); } #[test] @@ -183,15 +205,24 @@ mod test { }; let mut p = BTreeMap::new(); - p.insert("hello".to_string(), Value::String("world".to_string())); - p.insert("hello2".to_string(), Value::String("world2".to_string())); + p.insert( + KeyString::from("hello"), + VrlValue::Bytes(Bytes::from("world".to_string())), + ); + p.insert( + KeyString::from("hello2"), + VrlValue::Bytes(Bytes::from("world2".to_string())), + ); - let result = processor.exec_mut(Value::Map(Map { values: p })); + let result = processor.exec_mut(VrlValue::Object(p)); assert!(result.is_ok()); let mut result = result.unwrap(); - let p = result.as_map_mut().unwrap(); + let p = result.as_object_mut().unwrap(); assert_eq!(p.len(), 1); - assert_eq!(p.get("hello"), None); - assert_eq!(p.get("hello2"), Some(&Value::String("world2".to_string()))); + assert_eq!(p.get(&KeyString::from("hello")), None); + assert_eq!( + p.get(&KeyString::from("hello2")), + Some(&VrlValue::Bytes(Bytes::from("world2".to_string()))) + ); } } diff --git a/src/pipeline/src/etl/processor/simple_extract.rs b/src/pipeline/src/etl/processor/simple_extract.rs index 0fcf2c4979..258238e223 100644 --- a/src/pipeline/src/etl/processor/simple_extract.rs +++ b/src/pipeline/src/etl/processor/simple_extract.rs @@ -13,14 +13,17 @@ // limitations under the License. use snafu::OptionExt as _; +use vrl::value::{KeyString, Value as VrlValue}; -use crate::error::{Error, KeyMustBeStringSnafu, ProcessorMissingFieldSnafu, Result}; +use crate::error::{ + Error, KeyMustBeStringSnafu, ProcessorMissingFieldSnafu, Result, ValueMustBeMapSnafu, +}; use crate::etl::field::Fields; use crate::etl::processor::{ yaml_bool, yaml_new_field, yaml_new_fields, yaml_string, FIELDS_NAME, FIELD_NAME, IGNORE_MISSING_NAME, KEY_NAME, }; -use crate::{Processor, Value}; +use crate::Processor; pub(crate) const PROCESSOR_SIMPLE_EXTRACT: &str = "simple_extract"; @@ -74,14 +77,14 @@ impl TryFrom<&yaml_rust::yaml::Hash> for SimpleExtractProcessor { } impl SimpleExtractProcessor { - fn process_field(&self, val: &Value) -> Result { + fn process_field(&self, val: &VrlValue) -> Result { let mut current = val; for key in self.key.iter() { - let Value::Map(map) = current else { - return Ok(Value::Null); + let VrlValue::Object(map) = current else { + return Ok(VrlValue::Null); }; - let Some(v) = map.get(key) else { - return Ok(Value::Null); + let Some(v) = map.get(key.as_str()) else { + return Ok(VrlValue::Null); }; current = v; } @@ -98,14 +101,15 @@ impl Processor for SimpleExtractProcessor { self.ignore_missing } - fn exec_mut(&self, mut val: Value) -> Result { + fn exec_mut(&self, mut val: VrlValue) -> Result { for field in self.fields.iter() { let index = field.input_field(); + let val = val.as_object_mut().context(ValueMustBeMapSnafu)?; match val.get(index) { Some(v) => { let processed = self.process_field(v)?; let output_index = field.target_or_input_field(); - val.insert(output_index.to_string(), processed)?; + val.insert(KeyString::from(output_index), processed); } None => { if !self.ignore_missing { @@ -124,11 +128,13 @@ impl Processor for SimpleExtractProcessor { #[cfg(test)] mod test { + use std::collections::BTreeMap; + + use vrl::prelude::Bytes; #[test] fn test_simple_extract() { use super::*; - use crate::{Map, Value}; let processor = SimpleExtractProcessor { key: vec!["hello".to_string()], @@ -136,12 +142,12 @@ mod test { }; let result = processor - .process_field(&Value::Map(Map::one( - "hello", - Value::String("world".to_string()), - ))) + .process_field(&VrlValue::Object(BTreeMap::from([( + KeyString::from("hello"), + VrlValue::Bytes(Bytes::from("world".to_string())), + )]))) .unwrap(); - assert_eq!(result, Value::String("world".to_string())); + assert_eq!(result, VrlValue::Bytes(Bytes::from("world".to_string()))); } } diff --git a/src/pipeline/src/etl/processor/urlencoding.rs b/src/pipeline/src/etl/processor/urlencoding.rs index e56076b1dd..7675eec0dc 100644 --- a/src/pipeline/src/etl/processor/urlencoding.rs +++ b/src/pipeline/src/etl/processor/urlencoding.rs @@ -12,19 +12,20 @@ // See the License for the specific language governing permissions and // limitations under the License. -use snafu::{OptionExt, ResultExt}; -use urlencoding::{decode, encode}; +use snafu::OptionExt; +use urlencoding::{decode_binary, encode_binary}; +use vrl::prelude::Bytes; +use vrl::value::{KeyString, Value as VrlValue}; use crate::error::{ Error, KeyMustBeStringSnafu, ProcessorExpectStringSnafu, ProcessorMissingFieldSnafu, Result, - UrlEncodingDecodeSnafu, UrlEncodingInvalidMethodSnafu, + UrlEncodingInvalidMethodSnafu, ValueMustBeMapSnafu, }; use crate::etl::field::Fields; use crate::etl::processor::{ yaml_bool, yaml_new_field, yaml_new_fields, yaml_string, FIELDS_NAME, FIELD_NAME, IGNORE_MISSING_NAME, METHOD_NAME, }; -use crate::etl::value::Value; pub(crate) const PROCESSOR_URL_ENCODING: &str = "urlencoding"; @@ -65,12 +66,12 @@ pub struct UrlEncodingProcessor { } impl UrlEncodingProcessor { - fn process_field(&self, val: &str) -> Result { + fn process_field(&self, val: &Bytes) -> Result { let processed = match self.method { - Method::Encode => encode(val).to_string(), - Method::Decode => decode(val).context(UrlEncodingDecodeSnafu)?.into_owned(), + Method::Encode => Bytes::from_iter(encode_binary(val).bytes()), + Method::Decode => Bytes::from(decode_binary(val).to_vec()), }; - Ok(Value::String(processed)) + Ok(VrlValue::Bytes(processed)) } } @@ -125,16 +126,17 @@ impl crate::etl::processor::Processor for UrlEncodingProcessor { self.ignore_missing } - fn exec_mut(&self, mut val: Value) -> Result { + fn exec_mut(&self, mut val: VrlValue) -> Result { for field in self.fields.iter() { let index = field.input_field(); + let val = val.as_object_mut().context(ValueMustBeMapSnafu)?; match val.get(index) { - Some(Value::String(s)) => { + Some(VrlValue::Bytes(s)) => { let result = self.process_field(s)?; let output_index = field.target_or_input_field(); - val.insert(output_index.to_string(), result)?; + val.insert(KeyString::from(output_index), result); } - Some(Value::Null) | None => { + Some(VrlValue::Null) | None => { if !self.ignore_missing { return ProcessorMissingFieldSnafu { processor: self.kind(), @@ -159,9 +161,11 @@ impl crate::etl::processor::Processor for UrlEncodingProcessor { #[cfg(test)] mod tests { + use vrl::prelude::Bytes; + use vrl::value::Value as VrlValue; + use crate::etl::field::Fields; use crate::etl::processor::urlencoding::UrlEncodingProcessor; - use crate::etl::value::Value; #[test] fn test_decode_url() { @@ -170,8 +174,8 @@ mod tests { { let processor = UrlEncodingProcessor::default(); - let result = processor.process_field(encoded).unwrap(); - assert_eq!(Value::String(decoded.into()), result) + let result = processor.process_field(&Bytes::from(encoded)).unwrap(); + assert_eq!(VrlValue::Bytes(Bytes::from(decoded)), result) } { let processor = UrlEncodingProcessor { @@ -179,8 +183,8 @@ mod tests { method: super::Method::Encode, ignore_missing: false, }; - let result = processor.process_field(decoded).unwrap(); - assert_eq!(Value::String(encoded.into()), result) + let result = processor.process_field(&Bytes::from(decoded)).unwrap(); + assert_eq!(VrlValue::Bytes(Bytes::from(encoded)), result) } } } diff --git a/src/pipeline/src/etl/processor/vrl.rs b/src/pipeline/src/etl/processor/vrl_processor.rs similarity index 53% rename from src/pipeline/src/etl/processor/vrl.rs rename to src/pipeline/src/etl/processor/vrl_processor.rs index b2a90b5955..5540d42cf1 100644 --- a/src/pipeline/src/etl/processor/vrl.rs +++ b/src/pipeline/src/etl/processor/vrl_processor.rs @@ -15,19 +15,18 @@ use std::collections::BTreeMap; use chrono_tz::Tz; -use snafu::{OptionExt, ResultExt}; +use snafu::OptionExt; use vrl::compiler::runtime::Runtime; use vrl::compiler::{compile, Program, TargetValue}; use vrl::diagnostic::Formatter; -use vrl::prelude::{Bytes, NotNan, TimeZone}; -use vrl::value::{KeyString, Kind, Secrets, Value as VrlValue}; +use vrl::prelude::TimeZone; +use vrl::value::{Kind, Secrets, Value as VrlValue}; use crate::error::{ - BytesToUtf8Snafu, CompileVrlSnafu, Error, ExecuteVrlSnafu, FloatNaNSnafu, - InvalidTimestampSnafu, KeyMustBeStringSnafu, Result, VrlRegexValueSnafu, VrlReturnValueSnafu, + CompileVrlSnafu, Error, ExecuteVrlSnafu, KeyMustBeStringSnafu, Result, VrlRegexValueSnafu, + VrlReturnValueSnafu, }; use crate::etl::processor::yaml_string; -use crate::Value as PipelineValue; pub(crate) const PROCESSOR_VRL: &str = "vrl"; const SOURCE: &str = "source"; @@ -62,11 +61,9 @@ impl VrlProcessor { Ok(Self { source, program }) } - pub fn resolve(&self, m: PipelineValue) -> Result { - let pipeline_vrl = pipeline_value_to_vrl_value(m)?; - + pub fn resolve(&self, value: VrlValue) -> Result { let mut target = TargetValue { - value: pipeline_vrl, + value, metadata: VrlValue::Object(BTreeMap::new()), secrets: Secrets::default(), }; @@ -82,7 +79,7 @@ impl VrlProcessor { .build() })?; - vrl_value_to_pipeline_value(re) + Ok(re) } } @@ -113,91 +110,17 @@ impl crate::etl::processor::Processor for VrlProcessor { true } - fn exec_mut(&self, val: PipelineValue) -> Result { + fn exec_mut(&self, val: VrlValue) -> Result { let val = self.resolve(val)?; - if let PipelineValue::Map(m) = val { - Ok(PipelineValue::Map(m.values.into())) + if let VrlValue::Object(_) = val { + Ok(val) } else { VrlRegexValueSnafu.fail() } } } -fn pipeline_value_to_vrl_value(v: PipelineValue) -> Result { - match v { - PipelineValue::Null => Ok(VrlValue::Null), - PipelineValue::Int8(x) => Ok(VrlValue::Integer(x as i64)), - PipelineValue::Int16(x) => Ok(VrlValue::Integer(x as i64)), - PipelineValue::Int32(x) => Ok(VrlValue::Integer(x as i64)), - PipelineValue::Int64(x) => Ok(VrlValue::Integer(x)), - PipelineValue::Uint8(x) => Ok(VrlValue::Integer(x as i64)), - PipelineValue::Uint16(x) => Ok(VrlValue::Integer(x as i64)), - PipelineValue::Uint32(x) => Ok(VrlValue::Integer(x as i64)), - PipelineValue::Uint64(x) => Ok(VrlValue::Integer(x as i64)), - PipelineValue::Float32(x) => NotNan::new(x as f64) - .map_err(|_| FloatNaNSnafu { input_float: x }.build()) - .map(VrlValue::Float), - PipelineValue::Float64(x) => NotNan::new(x) - .map_err(|_| FloatNaNSnafu { input_float: x }.build()) - .map(VrlValue::Float), - PipelineValue::Boolean(x) => Ok(VrlValue::Boolean(x)), - PipelineValue::String(x) => Ok(VrlValue::Bytes(Bytes::copy_from_slice(x.as_bytes()))), - PipelineValue::Timestamp(x) => x - .to_datetime() - .context(InvalidTimestampSnafu { - input: x.to_string(), - }) - .map(VrlValue::Timestamp), - PipelineValue::Array(array) => Ok(VrlValue::Array( - array - .into_iter() - .map(pipeline_value_to_vrl_value) - .collect::>>()?, - )), - PipelineValue::Map(m) => { - let values = m - .values - .into_iter() - .map(|(k, v)| pipeline_value_to_vrl_value(v).map(|v| (KeyString::from(k), v))) - .collect::>>()?; - Ok(VrlValue::Object(values)) - } - } -} - -fn vrl_value_to_pipeline_value(v: VrlValue) -> Result { - match v { - VrlValue::Bytes(bytes) => String::from_utf8(bytes.to_vec()) - .context(BytesToUtf8Snafu) - .map(PipelineValue::String), - VrlValue::Regex(_) => VrlRegexValueSnafu.fail(), - VrlValue::Integer(x) => Ok(PipelineValue::Int64(x)), - VrlValue::Float(not_nan) => Ok(PipelineValue::Float64(not_nan.into_inner())), - VrlValue::Boolean(b) => Ok(PipelineValue::Boolean(b)), - VrlValue::Timestamp(date_time) => crate::etl::value::Timestamp::from_datetime(date_time) - .context(InvalidTimestampSnafu { - input: date_time.to_string(), - }) - .map(PipelineValue::Timestamp), - VrlValue::Object(bm) => { - let b = bm - .into_iter() - .map(|(k, v)| vrl_value_to_pipeline_value(v).map(|v| (k.to_string(), v))) - .collect::>>()?; - Ok(PipelineValue::Map(b.into())) - } - VrlValue::Array(values) => { - let a = values - .into_iter() - .map(vrl_value_to_pipeline_value) - .collect::>>()?; - Ok(PipelineValue::Array(a.into())) - } - VrlValue::Null => Ok(PipelineValue::Null), - } -} - fn check_regex_output(output_kind: &Kind) -> Result<()> { if output_kind.is_regex() { return VrlRegexValueSnafu.fail(); @@ -223,9 +146,10 @@ fn check_regex_output(output_kind: &Kind) -> Result<()> { #[cfg(test)] mod tests { + use vrl::prelude::Bytes; + use vrl::value::KeyString; + use super::*; - use crate::etl::value::Timestamp; - use crate::Map; #[test] fn test_vrl() { @@ -243,31 +167,27 @@ del(.user_info) let mut n = BTreeMap::new(); n.insert( - "name".to_string(), - PipelineValue::String("certain_name".to_string()), + KeyString::from("name"), + VrlValue::Bytes(Bytes::from("certain_name")), ); let mut m = BTreeMap::new(); - m.insert( - "user_info".to_string(), - PipelineValue::Map(Map { values: n }), - ); + m.insert(KeyString::from("user_info"), VrlValue::Object(n)); - let re = v.resolve(PipelineValue::Map(Map { values: m })); + let re = v.resolve(VrlValue::Object(m)); assert!(re.is_ok()); let re = re.unwrap(); - assert!(matches!(re, PipelineValue::Map(_))); + assert!(matches!(re, VrlValue::Object(_))); + let re = re.as_object().unwrap(); assert!(re.get("name").is_some()); let name = re.get("name").unwrap(); - assert!(matches!(name.get("a").unwrap(), PipelineValue::String(x) if x == "certain_name")); - assert!(matches!(name.get("b").unwrap(), PipelineValue::String(x) if x == "certain_name")); + let name = name.as_object().unwrap(); + assert!(matches!(name.get("a").unwrap(), VrlValue::Bytes(x) if x == "certain_name")); + assert!(matches!(name.get("b").unwrap(), VrlValue::Bytes(x) if x == "certain_name")); assert!(re.get("timestamp").is_some()); let timestamp = re.get("timestamp").unwrap(); - assert!(matches!( - timestamp, - PipelineValue::Timestamp(Timestamp::Nanosecond(_)) - )); + assert!(matches!(timestamp, VrlValue::Timestamp(_))); } #[test] diff --git a/src/pipeline/src/etl/transform.rs b/src/pipeline/src/etl/transform.rs index b65fb853b7..85f011c95d 100644 --- a/src/pipeline/src/etl/transform.rs +++ b/src/pipeline/src/etl/transform.rs @@ -15,16 +15,20 @@ pub mod index; pub mod transformer; +use api::v1::value::ValueData; +use api::v1::ColumnDataType; +use chrono::Utc; use snafu::{ensure, OptionExt}; use crate::error::{ Error, KeyMustBeStringSnafu, Result, TransformElementMustBeMapSnafu, TransformFieldMustBeSetSnafu, TransformOnFailureInvalidValueSnafu, TransformTypeMustBeSetSnafu, + UnsupportedTypeInPipelineSnafu, }; use crate::etl::field::Fields; use crate::etl::processor::{yaml_bool, yaml_new_field, yaml_new_fields, yaml_string}; use crate::etl::transform::index::Index; -use crate::etl::value::{Timestamp, Value}; +use crate::etl::value::{parse_str_type, parse_str_value}; const TRANSFORM_FIELD: &str = "field"; const TRANSFORM_FIELDS: &str = "fields"; @@ -124,39 +128,61 @@ impl TryFrom<&Vec> for Transforms { #[derive(Debug, Clone)] pub struct Transform { pub fields: Fields, - pub type_: Value, - pub default: Option, + pub type_: ColumnDataType, + pub default: Option, pub index: Option, pub tag: bool, pub on_failure: Option, } -impl Default for Transform { - fn default() -> Self { - Transform { - fields: Fields::default(), - type_: Value::Null, - default: None, - index: None, - tag: false, - on_failure: None, - } - } -} +// valid types +// ColumnDataType::Int8 +// ColumnDataType::Int16 +// ColumnDataType::Int32 +// ColumnDataType::Int64 +// ColumnDataType::Uint8 +// ColumnDataType::Uint16 +// ColumnDataType::Uint32 +// ColumnDataType::Uint64 +// ColumnDataType::Float32 +// ColumnDataType::Float64 +// ColumnDataType::Boolean +// ColumnDataType::String +// ColumnDataType::TimestampNanosecond +// ColumnDataType::TimestampMicrosecond +// ColumnDataType::TimestampMillisecond +// ColumnDataType::TimestampSecond +// ColumnDataType::Binary impl Transform { - pub(crate) fn get_default(&self) -> Option<&Value> { + pub(crate) fn get_default(&self) -> Option<&ValueData> { self.default.as_ref() } - pub(crate) fn get_type_matched_default_val(&self) -> &Value { - &self.type_ + pub(crate) fn get_type_matched_default_val(&self) -> Result { + get_default_for_type(&self.type_) } - pub(crate) fn get_default_value_when_data_is_none(&self) -> Option { - if matches!(self.type_, Value::Timestamp(_)) && self.index.is_some_and(|i| i == Index::Time) - { - return Some(Value::Timestamp(Timestamp::default())); + pub(crate) fn get_default_value_when_data_is_none(&self) -> Option { + if is_timestamp_type(&self.type_) && self.index.is_some_and(|i| i == Index::Time) { + let now = Utc::now(); + match self.type_ { + ColumnDataType::TimestampSecond => { + return Some(ValueData::TimestampSecondValue(now.timestamp())); + } + ColumnDataType::TimestampMillisecond => { + return Some(ValueData::TimestampMillisecondValue(now.timestamp_millis())); + } + ColumnDataType::TimestampMicrosecond => { + return Some(ValueData::TimestampMicrosecondValue(now.timestamp_micros())); + } + ColumnDataType::TimestampNanosecond => { + return Some(ValueData::TimestampNanosecondValue( + now.timestamp_nanos_opt()?, + )); + } + _ => {} + } } None } @@ -166,17 +192,57 @@ impl Transform { } } +fn is_timestamp_type(ty: &ColumnDataType) -> bool { + matches!( + ty, + ColumnDataType::TimestampSecond + | ColumnDataType::TimestampMillisecond + | ColumnDataType::TimestampMicrosecond + | ColumnDataType::TimestampNanosecond + ) +} + +fn get_default_for_type(ty: &ColumnDataType) -> Result { + let v = match ty { + ColumnDataType::Boolean => ValueData::BoolValue(false), + ColumnDataType::Int8 => ValueData::I8Value(0), + ColumnDataType::Int16 => ValueData::I16Value(0), + ColumnDataType::Int32 => ValueData::I32Value(0), + ColumnDataType::Int64 => ValueData::I64Value(0), + ColumnDataType::Uint8 => ValueData::U8Value(0), + ColumnDataType::Uint16 => ValueData::U16Value(0), + ColumnDataType::Uint32 => ValueData::U32Value(0), + ColumnDataType::Uint64 => ValueData::U64Value(0), + ColumnDataType::Float32 => ValueData::F32Value(0.0), + ColumnDataType::Float64 => ValueData::F64Value(0.0), + ColumnDataType::Binary => ValueData::BinaryValue(jsonb::Value::Null.to_vec()), + ColumnDataType::String => ValueData::StringValue(String::new()), + + ColumnDataType::TimestampSecond => ValueData::TimestampSecondValue(0), + ColumnDataType::TimestampMillisecond => ValueData::TimestampMillisecondValue(0), + ColumnDataType::TimestampMicrosecond => ValueData::TimestampMicrosecondValue(0), + ColumnDataType::TimestampNanosecond => ValueData::TimestampNanosecondValue(0), + + _ => UnsupportedTypeInPipelineSnafu { + ty: ty.as_str_name(), + } + .fail()?, + }; + Ok(v) +} + impl TryFrom<&yaml_rust::yaml::Hash> for Transform { type Error = Error; fn try_from(hash: &yaml_rust::yaml::Hash) -> Result { let mut fields = Fields::default(); - let mut type_ = Value::Null; let mut default = None; let mut index = None; let mut tag = false; let mut on_failure = None; + let mut type_ = None; + for (k, v) in hash { let key = k .as_str() @@ -192,7 +258,7 @@ impl TryFrom<&yaml_rust::yaml::Hash> for Transform { TRANSFORM_TYPE => { let t = yaml_string(v, TRANSFORM_TYPE)?; - type_ = Value::parse_str_type(&t)?; + type_ = Some(parse_str_type(&t)?); } TRANSFORM_INDEX => { @@ -205,7 +271,17 @@ impl TryFrom<&yaml_rust::yaml::Hash> for Transform { } TRANSFORM_DEFAULT => { - default = Some(Value::try_from(v)?); + default = match v { + yaml_rust::Yaml::Real(r) => Some(r.clone()), + yaml_rust::Yaml::Integer(i) => Some(i.to_string()), + yaml_rust::Yaml::String(s) => Some(s.clone()), + yaml_rust::Yaml::Boolean(b) => Some(b.to_string()), + yaml_rust::Yaml::Array(_) + | yaml_rust::Yaml::Hash(_) + | yaml_rust::Yaml::Alias(_) + | yaml_rust::Yaml::Null + | yaml_rust::Yaml::BadValue => None, + }; } TRANSFORM_ON_FAILURE => { @@ -219,23 +295,14 @@ impl TryFrom<&yaml_rust::yaml::Hash> for Transform { // ensure fields and type ensure!(!fields.is_empty(), TransformFieldMustBeSetSnafu); - ensure!( - type_ != Value::Null, - TransformTypeMustBeSetSnafu { - fields: format!("{:?}", fields) - } - ); + let type_ = type_.context(TransformTypeMustBeSetSnafu { + fields: format!("{:?}", fields), + })?; let final_default = if let Some(default_value) = default { - match default_value { - // if default is not set, then it will be regarded as default null - Value::Null => None, - _ => { - let target = type_.parse_str_value(default_value.to_str_value().as_str())?; - on_failure = Some(OnFailure::Default); - Some(target) - } - } + let target = parse_str_value(&type_, &default_value)?; + on_failure = Some(OnFailure::Default); + Some(target) } else { None }; diff --git a/src/pipeline/src/etl/transform/transformer/greptime.rs b/src/pipeline/src/etl/transform/transformer/greptime.rs index f3a3f175c8..77328479dd 100644 --- a/src/pipeline/src/etl/transform/transformer/greptime.rs +++ b/src/pipeline/src/etl/transform/transformer/greptime.rs @@ -14,6 +14,7 @@ pub mod coerce; +use std::borrow::Cow; use std::collections::{BTreeMap, HashSet}; use std::sync::Arc; @@ -24,26 +25,27 @@ use api::v1::value::ValueData; use api::v1::{ColumnDataType, ColumnDataTypeExtension, JsonTypeExtension, SemanticType}; use coerce::{coerce_columns, coerce_value}; use common_query::prelude::{GREPTIME_TIMESTAMP, GREPTIME_VALUE}; +use common_telemetry::warn; use greptime_proto::v1::{ColumnSchema, Row, Rows, Value as GreptimeValue}; use itertools::Itertools; +use jsonb::Number; use once_cell::sync::OnceCell; -use serde_json::Number; use session::context::Channel; use snafu::OptionExt; +use vrl::prelude::VrlValueConvert; +use vrl::value::{KeyString, Value as VrlValue}; use crate::error::{ - IdentifyPipelineColumnTypeMismatchSnafu, ReachedMaxNestedLevelsSnafu, Result, - TimeIndexMustBeNonNullSnafu, TransformColumnNameMustBeUniqueSnafu, - TransformMultipleTimestampIndexSnafu, TransformTimestampIndexCountSnafu, - UnsupportedNumberTypeSnafu, ValueMustBeMapSnafu, + IdentifyPipelineColumnTypeMismatchSnafu, InvalidTimestampSnafu, ReachedMaxNestedLevelsSnafu, + Result, TimeIndexMustBeNonNullSnafu, TransformColumnNameMustBeUniqueSnafu, + TransformMultipleTimestampIndexSnafu, TransformTimestampIndexCountSnafu, ValueMustBeMapSnafu, }; use crate::etl::ctx_req::ContextOpt; use crate::etl::field::{Field, Fields}; use crate::etl::transform::index::Index; use crate::etl::transform::{Transform, Transforms}; -use crate::etl::value::{Timestamp, Value}; use crate::etl::PipelineDocVersion; -use crate::{unwrap_or_continue_if_err, Map, PipelineContext}; +use crate::{unwrap_or_continue_if_err, PipelineContext}; const DEFAULT_GREPTIME_TIMESTAMP_COLUMN: &str = "greptime_timestamp"; const DEFAULT_MAX_NESTED_LEVELS_FOR_JSON_FLATTENING: usize = 10; @@ -133,7 +135,7 @@ impl GreptimePipelineParams { impl GreptimeTransformer { /// Add a default timestamp column to the transforms fn add_greptime_timestamp_column(transforms: &mut Transforms) { - let type_ = Value::Timestamp(Timestamp::Nanosecond(0)); + let type_ = ColumnDataType::TimestampNanosecond; let default = None; let transform = Transform { @@ -220,7 +222,7 @@ impl GreptimeTransformer { pub fn transform_mut( &self, - pipeline_map: &mut Value, + pipeline_map: &mut VrlValue, is_v1: bool, ) -> Result> { let mut values = vec![GreptimeValue { value_data: None }; self.schema.len()]; @@ -229,6 +231,7 @@ impl GreptimeTransformer { for field in transform.fields.iter() { let column_name = field.input_field(); + let pipeline_map = pipeline_map.as_object_mut().context(ValueMustBeMapSnafu)?; // let keep us `get` here to be compatible with v1 match pipeline_map.get(column_name) { Some(v) => { @@ -240,11 +243,8 @@ impl GreptimeTransformer { let value_data = match transform.on_failure { Some(crate::etl::transform::OnFailure::Default) => { match transform.get_default() { - Some(default) => coerce_value(default, transform)?, - None => match transform.get_default_value_when_data_is_none() { - Some(default) => coerce_value(&default, transform)?, - None => None, - }, + Some(default) => Some(default.clone()), + None => transform.get_default_value_when_data_is_none(), } } Some(crate::etl::transform::OnFailure::Ignore) => None, @@ -349,63 +349,22 @@ fn resolve_schema( } } -fn resolve_number_schema( - n: Number, - column_name: String, - index: Option, - row: &mut Vec, - schema_info: &mut SchemaInfo, -) -> Result<()> { - let (value, datatype, semantic_type) = if n.is_i64() { - ( - ValueData::I64Value(n.as_i64().unwrap()), - ColumnDataType::Int64 as i32, - SemanticType::Field as i32, - ) - } else if n.is_u64() { - ( - ValueData::U64Value(n.as_u64().unwrap()), - ColumnDataType::Uint64 as i32, - SemanticType::Field as i32, - ) - } else if n.is_f64() { - ( - ValueData::F64Value(n.as_f64().unwrap()), - ColumnDataType::Float64 as i32, - SemanticType::Field as i32, - ) - } else { - return UnsupportedNumberTypeSnafu { value: n }.fail(); - }; - resolve_schema( - index, - value, - ColumnSchema { - column_name, - datatype, - semantic_type, - datatype_extension: None, - options: None, - }, - row, - schema_info, - ) -} - -fn calc_ts(p_ctx: &PipelineContext, values: &Value) -> Result> { +fn calc_ts(p_ctx: &PipelineContext, values: &VrlValue) -> Result> { match p_ctx.channel { - Channel::Prometheus => Ok(Some(ValueData::TimestampMillisecondValue( - values - .get(GREPTIME_TIMESTAMP) - .and_then(|v| v.as_i64()) - .unwrap_or_default(), - ))), + Channel::Prometheus => { + let ts = values + .as_object() + .and_then(|m| m.get(GREPTIME_TIMESTAMP)) + .and_then(|ts| ts.try_into_i64().ok()) + .unwrap_or_default(); + Ok(Some(ValueData::TimestampMillisecondValue(ts))) + } _ => { let custom_ts = p_ctx.pipeline_definition.get_custom_ts(); match custom_ts { Some(ts) => { - let ts_field = values.get(ts.get_column_name()); - Some(ts.get_timestamp(ts_field)).transpose() + let ts_field = values.as_object().and_then(|m| m.get(ts.get_column_name())); + Some(ts.get_timestamp_value(ts_field)).transpose() } None => Ok(Some(ValueData::TimestampNanosecondValue( chrono::Utc::now().timestamp_nanos_opt().unwrap_or_default(), @@ -417,7 +376,7 @@ fn calc_ts(p_ctx: &PipelineContext, values: &Value) -> Result> pub(crate) fn values_to_row( schema_info: &mut SchemaInfo, - values: Value, + values: VrlValue, pipeline_ctx: &PipelineContext<'_>, row: Option>, need_calc_ts: bool, @@ -439,14 +398,20 @@ pub(crate) fn values_to_row( .as_ref() .map_or(DEFAULT_GREPTIME_TIMESTAMP_COLUMN, |ts| ts.get_column_name()); - let values = values.into_map().context(ValueMustBeMapSnafu)?; + let values = values.into_object().context(ValueMustBeMapSnafu)?; for (column_name, value) in values { - if column_name == ts_column_name { + if column_name.as_str() == ts_column_name { continue; } - resolve_value(value, column_name, &mut row, schema_info, pipeline_ctx)?; + resolve_value( + value, + column_name.into(), + &mut row, + schema_info, + pipeline_ctx, + )?; } Ok(Row { values: row }) } @@ -460,7 +425,7 @@ fn decide_semantic(p_ctx: &PipelineContext, column_name: &str) -> i32 { } fn resolve_value( - value: Value, + value: VrlValue, column_name: String, row: &mut Vec, schema_info: &mut SchemaInfo, @@ -486,27 +451,23 @@ fn resolve_value( }; match value { - Value::Null => {} + VrlValue::Null => {} - Value::Int8(_) | Value::Int16(_) | Value::Int32(_) | Value::Int64(_) => { + VrlValue::Integer(v) => { // safe unwrap after type matched - let v = value.as_i64().unwrap(); resolve_simple_type(ValueData::I64Value(v), column_name, ColumnDataType::Int64)?; } - Value::Uint8(_) | Value::Uint16(_) | Value::Uint32(_) | Value::Uint64(_) => { + VrlValue::Float(v) => { // safe unwrap after type matched - let v = value.as_u64().unwrap(); - resolve_simple_type(ValueData::U64Value(v), column_name, ColumnDataType::Uint64)?; + resolve_simple_type( + ValueData::F64Value(v.into()), + column_name, + ColumnDataType::Float64, + )?; } - Value::Float32(_) | Value::Float64(_) => { - // safe unwrap after type matched - let v = value.as_f64().unwrap(); - resolve_simple_type(ValueData::F64Value(v), column_name, ColumnDataType::Float64)?; - } - - Value::Boolean(v) => { + VrlValue::Boolean(v) => { resolve_simple_type( ValueData::BoolValue(v), column_name, @@ -514,15 +475,30 @@ fn resolve_value( )?; } - Value::String(v) => { + VrlValue::Bytes(v) => { resolve_simple_type( - ValueData::StringValue(v), + ValueData::StringValue(String::from_utf8_lossy_owned(v.to_vec())), column_name, ColumnDataType::String, )?; } - Value::Timestamp(Timestamp::Nanosecond(ns)) => { + VrlValue::Regex(v) => { + warn!( + "Persisting regex value in the table, this should not happen, column_name: {}", + column_name + ); + resolve_simple_type( + ValueData::StringValue(v.to_string()), + column_name, + ColumnDataType::String, + )?; + } + + VrlValue::Timestamp(ts) => { + let ns = ts.timestamp_nanos_opt().context(InvalidTimestampSnafu { + input: ts.to_rfc3339(), + })?; resolve_simple_type( ValueData::TimestampNanosecondValue(ns), column_name, @@ -530,32 +506,8 @@ fn resolve_value( )?; } - Value::Timestamp(Timestamp::Microsecond(us)) => { - resolve_simple_type( - ValueData::TimestampMicrosecondValue(us), - column_name, - ColumnDataType::TimestampMicrosecond, - )?; - } - - Value::Timestamp(Timestamp::Millisecond(ms)) => { - resolve_simple_type( - ValueData::TimestampMillisecondValue(ms), - column_name, - ColumnDataType::TimestampMillisecond, - )?; - } - - Value::Timestamp(Timestamp::Second(s)) => { - resolve_simple_type( - ValueData::TimestampSecondValue(s), - column_name, - ColumnDataType::TimestampSecond, - )?; - } - - Value::Array(_) | Value::Map(_) => { - let data: jsonb::Value = value.into(); + VrlValue::Array(_) | VrlValue::Object(_) => { + let data = vrl_value_to_jsonb_value(&value); resolve_schema( index, ValueData::BinaryValue(data.to_vec()), @@ -576,8 +528,32 @@ fn resolve_value( Ok(()) } +fn vrl_value_to_jsonb_value<'a>(value: &'a VrlValue) -> jsonb::Value<'a> { + match value { + VrlValue::Bytes(bytes) => jsonb::Value::String(String::from_utf8_lossy(bytes)), + VrlValue::Regex(value_regex) => jsonb::Value::String(Cow::Borrowed(value_regex.as_str())), + VrlValue::Integer(i) => jsonb::Value::Number(Number::Int64(*i)), + VrlValue::Float(not_nan) => jsonb::Value::Number(Number::Float64(not_nan.into_inner())), + VrlValue::Boolean(b) => jsonb::Value::Bool(*b), + VrlValue::Timestamp(date_time) => jsonb::Value::String(Cow::Owned(date_time.to_rfc3339())), + VrlValue::Object(btree_map) => jsonb::Value::Object( + btree_map + .iter() + .map(|(key, value)| (key.to_string(), vrl_value_to_jsonb_value(value))) + .collect(), + ), + VrlValue::Array(values) => jsonb::Value::Array( + values + .iter() + .map(|value| vrl_value_to_jsonb_value(value)) + .collect(), + ), + VrlValue::Null => jsonb::Value::Null, + } +} + fn identity_pipeline_inner( - pipeline_maps: Vec, + pipeline_maps: Vec, pipeline_ctx: &PipelineContext<'_>, ) -> Result<(SchemaInfo, HashMap>)> { let skip_error = pipeline_ctx.pipeline_param.skip_error(); @@ -587,7 +563,7 @@ fn identity_pipeline_inner( // set time index column schema first schema_info.schema.push(ColumnSchema { column_name: custom_ts - .map(|ts| ts.get_column_name().clone()) + .map(|ts| ts.get_column_name().to_string()) .unwrap_or_else(|| DEFAULT_GREPTIME_TIMESTAMP_COLUMN.to_string()), datatype: custom_ts.map(|c| c.get_datatype()).unwrap_or_else(|| { if pipeline_ctx.channel == Channel::Prometheus { @@ -642,7 +618,7 @@ fn identity_pipeline_inner( /// 4. The pipeline will return an error if the same column datatype is mismatched /// 5. The pipeline will analyze the schema of each json record and merge them to get the final schema. pub fn identity_pipeline( - array: Vec, + array: Vec, table: Option>, pipeline_ctx: &PipelineContext<'_>, ) -> Result> { @@ -690,22 +666,22 @@ pub fn identity_pipeline( /// /// The `max_nested_levels` parameter is used to limit the nested levels of the JSON object. /// The error will be returned if the nested levels is greater than the `max_nested_levels`. -pub fn flatten_object(object: Value, max_nested_levels: usize) -> Result { +pub fn flatten_object(object: VrlValue, max_nested_levels: usize) -> Result { let mut flattened = BTreeMap::new(); - let object = object.into_map().context(ValueMustBeMapSnafu)?; + let object = object.into_object().context(ValueMustBeMapSnafu)?; if !object.is_empty() { // it will use recursion to flatten the object. do_flatten_object(&mut flattened, None, object, 1, max_nested_levels)?; } - Ok(Value::Map(Map { values: flattened })) + Ok(VrlValue::Object(flattened)) } fn do_flatten_object( - dest: &mut BTreeMap, + dest: &mut BTreeMap, base: Option<&str>, - object: BTreeMap, + object: BTreeMap, current_level: usize, max_nested_levels: usize, ) -> Result<()> { @@ -715,14 +691,17 @@ fn do_flatten_object( } for (key, value) in object { - let new_key = base.map_or_else(|| key.clone(), |base_key| format!("{base_key}.{key}")); + let new_key = base.map_or_else( + || key.clone(), + |base_key| format!("{base_key}.{key}").into(), + ); match value { - Value::Map(object) => { + VrlValue::Object(object) => { do_flatten_object( dest, Some(&new_key), - object.values, + object, current_level + 1, max_nested_levels, )?; @@ -742,7 +721,6 @@ mod tests { use api::v1::SemanticType; use super::*; - use crate::etl::{json_array_to_map, json_to_map}; use crate::{identity_pipeline, PipelineDefinition}; #[test] @@ -754,7 +732,7 @@ mod tests { Channel::Unknown, ); { - let array = vec![ + let array = [ serde_json::json!({ "woshinull": null, "name": "Alice", @@ -774,7 +752,7 @@ mod tests { "gaga": "gaga" }), ]; - let array = json_array_to_map(array).unwrap(); + let array = array.iter().map(|v| v.into()).collect(); let rows = identity_pipeline(array, None, &pipeline_ctx); assert!(rows.is_err()); assert_eq!( @@ -783,7 +761,7 @@ mod tests { ); } { - let array = vec![ + let array = [ serde_json::json!({ "woshinull": null, "name": "Alice", @@ -803,7 +781,8 @@ mod tests { "gaga": "gaga" }), ]; - let rows = identity_pipeline(json_array_to_map(array).unwrap(), None, &pipeline_ctx); + let array = array.iter().map(|v| v.into()).collect(); + let rows = identity_pipeline(array, None, &pipeline_ctx); assert!(rows.is_err()); assert_eq!( rows.err().unwrap().to_string(), @@ -811,7 +790,7 @@ mod tests { ); } { - let array = vec![ + let array = [ serde_json::json!({ "woshinull": null, "name": "Alice", @@ -831,7 +810,8 @@ mod tests { "gaga": "gaga" }), ]; - let rows = identity_pipeline(json_array_to_map(array).unwrap(), None, &pipeline_ctx); + let array = array.iter().map(|v| v.into()).collect(); + let rows = identity_pipeline(array, None, &pipeline_ctx); assert!(rows.is_ok()); let mut rows = rows.unwrap(); assert!(rows.len() == 1); @@ -842,7 +822,7 @@ mod tests { assert_eq!(8, rows.rows[1].values.len()); } { - let array = vec![ + let array = [ serde_json::json!({ "woshinull": null, "name": "Alice", @@ -864,22 +844,23 @@ mod tests { ]; let tag_column_names = ["name".to_string(), "address".to_string()]; - let rows = identity_pipeline_inner(json_array_to_map(array).unwrap(), &pipeline_ctx) - .map(|(mut schema, mut rows)| { - for name in tag_column_names { - if let Some(index) = schema.index.get(&name) { - schema.schema[*index].semantic_type = SemanticType::Tag as i32; + let rows = + identity_pipeline_inner(array.iter().map(|v| v.into()).collect(), &pipeline_ctx) + .map(|(mut schema, mut rows)| { + for name in tag_column_names { + if let Some(index) = schema.index.get(&name) { + schema.schema[*index].semantic_type = SemanticType::Tag as i32; + } } - } - assert!(rows.len() == 1); - let rows = rows.remove(&ContextOpt::default()).unwrap(); + assert!(rows.len() == 1); + let rows = rows.remove(&ContextOpt::default()).unwrap(); - Rows { - schema: schema.schema, - rows, - } - }); + Rows { + schema: schema.schema, + rows, + } + }); assert!(rows.is_ok()); let rows = rows.unwrap(); @@ -976,8 +957,8 @@ mod tests { ]; for (input, max_depth, expected) in test_cases { - let input = json_to_map(input).unwrap(); - let expected = expected.map(|e| json_to_map(e).unwrap()); + let input = input.into(); + let expected = expected.map(|e| e.into()); let flattened_object = flatten_object(input, max_depth).ok(); assert_eq!(flattened_object, expected); diff --git a/src/pipeline/src/etl/transform/transformer/greptime/coerce.rs b/src/pipeline/src/etl/transform/transformer/greptime/coerce.rs index 41172a2876..52fc35041b 100644 --- a/src/pipeline/src/etl/transform/transformer/greptime/coerce.rs +++ b/src/pipeline/src/etl/transform/transformer/greptime/coerce.rs @@ -18,58 +18,17 @@ use api::v1::{ColumnDataTypeExtension, ColumnOptions, JsonTypeExtension}; use datatypes::schema::{FulltextOptions, SkippingIndexOptions}; use greptime_proto::v1::value::ValueData; use greptime_proto::v1::{ColumnDataType, ColumnSchema, SemanticType}; -use snafu::ResultExt; +use snafu::{OptionExt, ResultExt}; +use vrl::value::Value as VrlValue; use crate::error::{ CoerceIncompatibleTypesSnafu, CoerceJsonTypeToSnafu, CoerceStringToTypeSnafu, - CoerceTypeToJsonSnafu, CoerceUnsupportedEpochTypeSnafu, CoerceUnsupportedNullTypeSnafu, - CoerceUnsupportedNullTypeToSnafu, ColumnOptionsSnafu, Error, Result, + CoerceTypeToJsonSnafu, CoerceUnsupportedEpochTypeSnafu, ColumnOptionsSnafu, + InvalidTimestampSnafu, Result, UnsupportedTypeInPipelineSnafu, VrlRegexValueSnafu, }; use crate::etl::transform::index::Index; +use crate::etl::transform::transformer::greptime::vrl_value_to_jsonb_value; use crate::etl::transform::{OnFailure, Transform}; -use crate::etl::value::{Timestamp, Value}; - -impl TryFrom for ValueData { - type Error = Error; - - fn try_from(value: Value) -> Result { - match value { - Value::Null => CoerceUnsupportedNullTypeSnafu.fail(), - - Value::Int8(v) => Ok(ValueData::I32Value(v as i32)), - Value::Int16(v) => Ok(ValueData::I32Value(v as i32)), - Value::Int32(v) => Ok(ValueData::I32Value(v)), - Value::Int64(v) => Ok(ValueData::I64Value(v)), - - Value::Uint8(v) => Ok(ValueData::U32Value(v as u32)), - Value::Uint16(v) => Ok(ValueData::U32Value(v as u32)), - Value::Uint32(v) => Ok(ValueData::U32Value(v)), - Value::Uint64(v) => Ok(ValueData::U64Value(v)), - - Value::Float32(v) => Ok(ValueData::F32Value(v)), - Value::Float64(v) => Ok(ValueData::F64Value(v)), - - Value::Boolean(v) => Ok(ValueData::BoolValue(v)), - Value::String(v) => Ok(ValueData::StringValue(v)), - - Value::Timestamp(Timestamp::Nanosecond(ns)) => { - Ok(ValueData::TimestampNanosecondValue(ns)) - } - Value::Timestamp(Timestamp::Microsecond(us)) => { - Ok(ValueData::TimestampMicrosecondValue(us)) - } - Value::Timestamp(Timestamp::Millisecond(ms)) => { - Ok(ValueData::TimestampMillisecondValue(ms)) - } - Value::Timestamp(Timestamp::Second(s)) => Ok(ValueData::TimestampSecondValue(s)), - - Value::Array(_) | Value::Map(_) => { - let data: jsonb::Value = value.into(); - Ok(ValueData::BinaryValue(data.to_vec())) - } - } - } -} pub(crate) fn coerce_columns(transform: &Transform) -> Result> { let mut columns = Vec::new(); @@ -77,15 +36,21 @@ pub(crate) fn coerce_columns(transform: &Transform) -> Result> for field in transform.fields.iter() { let column_name = field.target_or_input_field().to_string(); - let (datatype, datatype_extension) = coerce_type(transform)?; + let ext = if matches!(transform.type_, ColumnDataType::Binary) { + Some(ColumnDataTypeExtension { + type_ext: Some(TypeExt::JsonType(JsonTypeExtension::JsonBinary.into())), + }) + } else { + None + }; let semantic_type = coerce_semantic_type(transform) as i32; let column = ColumnSchema { column_name, - datatype: datatype as i32, + datatype: transform.type_ as i32, semantic_type, - datatype_extension, + datatype_extension: ext, options: coerce_options(transform)?, }; columns.push(column); @@ -123,113 +88,60 @@ fn coerce_options(transform: &Transform) -> Result> { } } -fn coerce_type(transform: &Transform) -> Result<(ColumnDataType, Option)> { - match transform.type_ { - Value::Int8(_) => Ok((ColumnDataType::Int8, None)), - Value::Int16(_) => Ok((ColumnDataType::Int16, None)), - Value::Int32(_) => Ok((ColumnDataType::Int32, None)), - Value::Int64(_) => Ok((ColumnDataType::Int64, None)), - - Value::Uint8(_) => Ok((ColumnDataType::Uint8, None)), - Value::Uint16(_) => Ok((ColumnDataType::Uint16, None)), - Value::Uint32(_) => Ok((ColumnDataType::Uint32, None)), - Value::Uint64(_) => Ok((ColumnDataType::Uint64, None)), - - Value::Float32(_) => Ok((ColumnDataType::Float32, None)), - Value::Float64(_) => Ok((ColumnDataType::Float64, None)), - - Value::Boolean(_) => Ok((ColumnDataType::Boolean, None)), - Value::String(_) => Ok((ColumnDataType::String, None)), - - Value::Timestamp(Timestamp::Nanosecond(_)) => { - Ok((ColumnDataType::TimestampNanosecond, None)) - } - Value::Timestamp(Timestamp::Microsecond(_)) => { - Ok((ColumnDataType::TimestampMicrosecond, None)) - } - Value::Timestamp(Timestamp::Millisecond(_)) => { - Ok((ColumnDataType::TimestampMillisecond, None)) - } - Value::Timestamp(Timestamp::Second(_)) => Ok((ColumnDataType::TimestampSecond, None)), - - Value::Array(_) | Value::Map(_) => Ok(( - ColumnDataType::Binary, - Some(ColumnDataTypeExtension { - type_ext: Some(TypeExt::JsonType(JsonTypeExtension::JsonBinary.into())), - }), - )), - - Value::Null => CoerceUnsupportedNullTypeToSnafu { - ty: transform.type_.to_str_type(), - } - .fail(), - } -} - -pub(crate) fn coerce_value(val: &Value, transform: &Transform) -> Result> { +pub(crate) fn coerce_value(val: &VrlValue, transform: &Transform) -> Result> { match val { - Value::Null => Ok(None), - - Value::Int8(n) => coerce_i64_value(*n as i64, transform), - Value::Int16(n) => coerce_i64_value(*n as i64, transform), - Value::Int32(n) => coerce_i64_value(*n as i64, transform), - Value::Int64(n) => coerce_i64_value(*n, transform), - - Value::Uint8(n) => coerce_u64_value(*n as u64, transform), - Value::Uint16(n) => coerce_u64_value(*n as u64, transform), - Value::Uint32(n) => coerce_u64_value(*n as u64, transform), - Value::Uint64(n) => coerce_u64_value(*n, transform), - - Value::Float32(n) => coerce_f64_value(*n as f64, transform), - Value::Float64(n) => coerce_f64_value(*n, transform), - - Value::Boolean(b) => coerce_bool_value(*b, transform), - Value::String(s) => coerce_string_value(s, transform), - - Value::Timestamp(input_timestamp) => match &transform.type_ { - Value::Timestamp(target_timestamp) => match target_timestamp { - Timestamp::Nanosecond(_) => Ok(Some(ValueData::TimestampNanosecondValue( - input_timestamp.timestamp_nanos(), - ))), - Timestamp::Microsecond(_) => Ok(Some(ValueData::TimestampMicrosecondValue( - input_timestamp.timestamp_micros(), - ))), - Timestamp::Millisecond(_) => Ok(Some(ValueData::TimestampMillisecondValue( - input_timestamp.timestamp_millis(), - ))), - Timestamp::Second(_) => Ok(Some(ValueData::TimestampSecondValue( - input_timestamp.timestamp(), - ))), - }, + VrlValue::Null => Ok(None), + VrlValue::Integer(n) => coerce_i64_value(*n, transform), + VrlValue::Float(n) => coerce_f64_value(n.into_inner(), transform), + VrlValue::Boolean(b) => coerce_bool_value(*b, transform), + VrlValue::Bytes(b) => coerce_string_value(String::from_utf8_lossy(b).as_ref(), transform), + VrlValue::Timestamp(ts) => match transform.type_ { + ColumnDataType::TimestampNanosecond => Ok(Some(ValueData::TimestampNanosecondValue( + ts.timestamp_nanos_opt().context(InvalidTimestampSnafu { + input: ts.to_rfc3339(), + })?, + ))), + ColumnDataType::TimestampMicrosecond => Ok(Some(ValueData::TimestampMicrosecondValue( + ts.timestamp_micros(), + ))), + ColumnDataType::TimestampMillisecond => Ok(Some(ValueData::TimestampMillisecondValue( + ts.timestamp_millis(), + ))), + ColumnDataType::TimestampSecond => { + Ok(Some(ValueData::TimestampSecondValue(ts.timestamp()))) + } _ => CoerceIncompatibleTypesSnafu { msg: "Timestamp can only be coerced to another type", } .fail(), }, - - Value::Array(_) | Value::Map(_) => coerce_json_value(val, transform), + VrlValue::Array(_) | VrlValue::Object(_) => coerce_json_value(val, transform), + VrlValue::Regex(_) => VrlRegexValueSnafu.fail(), } } fn coerce_bool_value(b: bool, transform: &Transform) -> Result> { let val = match transform.type_ { - Value::Int8(_) => ValueData::I8Value(b as i32), - Value::Int16(_) => ValueData::I16Value(b as i32), - Value::Int32(_) => ValueData::I32Value(b as i32), - Value::Int64(_) => ValueData::I64Value(b as i64), + ColumnDataType::Int8 => ValueData::I8Value(b as i32), + ColumnDataType::Int16 => ValueData::I16Value(b as i32), + ColumnDataType::Int32 => ValueData::I32Value(b as i32), + ColumnDataType::Int64 => ValueData::I64Value(b as i64), - Value::Uint8(_) => ValueData::U8Value(b as u32), - Value::Uint16(_) => ValueData::U16Value(b as u32), - Value::Uint32(_) => ValueData::U32Value(b as u32), - Value::Uint64(_) => ValueData::U64Value(b as u64), + ColumnDataType::Uint8 => ValueData::U8Value(b as u32), + ColumnDataType::Uint16 => ValueData::U16Value(b as u32), + ColumnDataType::Uint32 => ValueData::U32Value(b as u32), + ColumnDataType::Uint64 => ValueData::U64Value(b as u64), - Value::Float32(_) => ValueData::F32Value(if b { 1.0 } else { 0.0 }), - Value::Float64(_) => ValueData::F64Value(if b { 1.0 } else { 0.0 }), + ColumnDataType::Float32 => ValueData::F32Value(if b { 1.0 } else { 0.0 }), + ColumnDataType::Float64 => ValueData::F64Value(if b { 1.0 } else { 0.0 }), - Value::Boolean(_) => ValueData::BoolValue(b), - Value::String(_) => ValueData::StringValue(b.to_string()), + ColumnDataType::Boolean => ValueData::BoolValue(b), + ColumnDataType::String => ValueData::StringValue(b.to_string()), - Value::Timestamp(_) => match transform.on_failure { + ColumnDataType::TimestampNanosecond + | ColumnDataType::TimestampMicrosecond + | ColumnDataType::TimestampMillisecond + | ColumnDataType::TimestampSecond => match transform.on_failure { Some(OnFailure::Ignore) => return Ok(None), Some(OnFailure::Default) => { return CoerceUnsupportedEpochTypeSnafu { ty: "Default" }.fail(); @@ -239,14 +151,19 @@ fn coerce_bool_value(b: bool, transform: &Transform) -> Result } }, - Value::Array(_) | Value::Map(_) => { + ColumnDataType::Binary => { return CoerceJsonTypeToSnafu { - ty: transform.type_.to_str_type(), + ty: transform.type_.as_str_name(), } .fail() } - Value::Null => return Ok(None), + _ => { + return UnsupportedTypeInPipelineSnafu { + ty: transform.type_.as_str_name(), + } + .fail() + } }; Ok(Some(val)) @@ -254,37 +171,35 @@ fn coerce_bool_value(b: bool, transform: &Transform) -> Result fn coerce_i64_value(n: i64, transform: &Transform) -> Result> { let val = match &transform.type_ { - Value::Int8(_) => ValueData::I8Value(n as i32), - Value::Int16(_) => ValueData::I16Value(n as i32), - Value::Int32(_) => ValueData::I32Value(n as i32), - Value::Int64(_) => ValueData::I64Value(n), + ColumnDataType::Int8 => ValueData::I8Value(n as i32), + ColumnDataType::Int16 => ValueData::I16Value(n as i32), + ColumnDataType::Int32 => ValueData::I32Value(n as i32), + ColumnDataType::Int64 => ValueData::I64Value(n), - Value::Uint8(_) => ValueData::U8Value(n as u32), - Value::Uint16(_) => ValueData::U16Value(n as u32), - Value::Uint32(_) => ValueData::U32Value(n as u32), - Value::Uint64(_) => ValueData::U64Value(n as u64), + ColumnDataType::Uint8 => ValueData::U8Value(n as u32), + ColumnDataType::Uint16 => ValueData::U16Value(n as u32), + ColumnDataType::Uint32 => ValueData::U32Value(n as u32), + ColumnDataType::Uint64 => ValueData::U64Value(n as u64), - Value::Float32(_) => ValueData::F32Value(n as f32), - Value::Float64(_) => ValueData::F64Value(n as f64), + ColumnDataType::Float32 => ValueData::F32Value(n as f32), + ColumnDataType::Float64 => ValueData::F64Value(n as f64), - Value::Boolean(_) => ValueData::BoolValue(n != 0), - Value::String(_) => ValueData::StringValue(n.to_string()), + ColumnDataType::Boolean => ValueData::BoolValue(n != 0), + ColumnDataType::String => ValueData::StringValue(n.to_string()), - Value::Timestamp(unit) => match unit { - Timestamp::Nanosecond(_) => ValueData::TimestampNanosecondValue(n), - Timestamp::Microsecond(_) => ValueData::TimestampMicrosecondValue(n), - Timestamp::Millisecond(_) => ValueData::TimestampMillisecondValue(n), - Timestamp::Second(_) => ValueData::TimestampSecondValue(n), - }, + ColumnDataType::TimestampNanosecond => ValueData::TimestampNanosecondValue(n), + ColumnDataType::TimestampMicrosecond => ValueData::TimestampMicrosecondValue(n), + ColumnDataType::TimestampMillisecond => ValueData::TimestampMillisecondValue(n), + ColumnDataType::TimestampSecond => ValueData::TimestampSecondValue(n), - Value::Array(_) | Value::Map(_) => { + ColumnDataType::Binary => { return CoerceJsonTypeToSnafu { - ty: transform.type_.to_str_type(), + ty: transform.type_.as_str_name(), } .fail() } - Value::Null => return Ok(None), + _ => return Ok(None), }; Ok(Some(val)) @@ -292,37 +207,35 @@ fn coerce_i64_value(n: i64, transform: &Transform) -> Result> fn coerce_u64_value(n: u64, transform: &Transform) -> Result> { let val = match &transform.type_ { - Value::Int8(_) => ValueData::I8Value(n as i32), - Value::Int16(_) => ValueData::I16Value(n as i32), - Value::Int32(_) => ValueData::I32Value(n as i32), - Value::Int64(_) => ValueData::I64Value(n as i64), + ColumnDataType::Int8 => ValueData::I8Value(n as i32), + ColumnDataType::Int16 => ValueData::I16Value(n as i32), + ColumnDataType::Int32 => ValueData::I32Value(n as i32), + ColumnDataType::Int64 => ValueData::I64Value(n as i64), - Value::Uint8(_) => ValueData::U8Value(n as u32), - Value::Uint16(_) => ValueData::U16Value(n as u32), - Value::Uint32(_) => ValueData::U32Value(n as u32), - Value::Uint64(_) => ValueData::U64Value(n), + ColumnDataType::Uint8 => ValueData::U8Value(n as u32), + ColumnDataType::Uint16 => ValueData::U16Value(n as u32), + ColumnDataType::Uint32 => ValueData::U32Value(n as u32), + ColumnDataType::Uint64 => ValueData::U64Value(n), - Value::Float32(_) => ValueData::F32Value(n as f32), - Value::Float64(_) => ValueData::F64Value(n as f64), + ColumnDataType::Float32 => ValueData::F32Value(n as f32), + ColumnDataType::Float64 => ValueData::F64Value(n as f64), - Value::Boolean(_) => ValueData::BoolValue(n != 0), - Value::String(_) => ValueData::StringValue(n.to_string()), + ColumnDataType::Boolean => ValueData::BoolValue(n != 0), + ColumnDataType::String => ValueData::StringValue(n.to_string()), - Value::Timestamp(unit) => match unit { - Timestamp::Nanosecond(_) => ValueData::TimestampNanosecondValue(n as i64), - Timestamp::Microsecond(_) => ValueData::TimestampMicrosecondValue(n as i64), - Timestamp::Millisecond(_) => ValueData::TimestampMillisecondValue(n as i64), - Timestamp::Second(_) => ValueData::TimestampSecondValue(n as i64), - }, + ColumnDataType::TimestampNanosecond => ValueData::TimestampNanosecondValue(n as i64), + ColumnDataType::TimestampMicrosecond => ValueData::TimestampMicrosecondValue(n as i64), + ColumnDataType::TimestampMillisecond => ValueData::TimestampMillisecondValue(n as i64), + ColumnDataType::TimestampSecond => ValueData::TimestampSecondValue(n as i64), - Value::Array(_) | Value::Map(_) => { + ColumnDataType::Binary => { return CoerceJsonTypeToSnafu { - ty: transform.type_.to_str_type(), + ty: transform.type_.as_str_name(), } .fail() } - Value::Null => return Ok(None), + _ => return Ok(None), }; Ok(Some(val)) @@ -330,23 +243,26 @@ fn coerce_u64_value(n: u64, transform: &Transform) -> Result> fn coerce_f64_value(n: f64, transform: &Transform) -> Result> { let val = match transform.type_ { - Value::Int8(_) => ValueData::I8Value(n as i32), - Value::Int16(_) => ValueData::I16Value(n as i32), - Value::Int32(_) => ValueData::I32Value(n as i32), - Value::Int64(_) => ValueData::I64Value(n as i64), + ColumnDataType::Int8 => ValueData::I8Value(n as i32), + ColumnDataType::Int16 => ValueData::I16Value(n as i32), + ColumnDataType::Int32 => ValueData::I32Value(n as i32), + ColumnDataType::Int64 => ValueData::I64Value(n as i64), - Value::Uint8(_) => ValueData::U8Value(n as u32), - Value::Uint16(_) => ValueData::U16Value(n as u32), - Value::Uint32(_) => ValueData::U32Value(n as u32), - Value::Uint64(_) => ValueData::U64Value(n as u64), + ColumnDataType::Uint8 => ValueData::U8Value(n as u32), + ColumnDataType::Uint16 => ValueData::U16Value(n as u32), + ColumnDataType::Uint32 => ValueData::U32Value(n as u32), + ColumnDataType::Uint64 => ValueData::U64Value(n as u64), - Value::Float32(_) => ValueData::F32Value(n as f32), - Value::Float64(_) => ValueData::F64Value(n), + ColumnDataType::Float32 => ValueData::F32Value(n as f32), + ColumnDataType::Float64 => ValueData::F64Value(n), - Value::Boolean(_) => ValueData::BoolValue(n != 0.0), - Value::String(_) => ValueData::StringValue(n.to_string()), + ColumnDataType::Boolean => ValueData::BoolValue(n != 0.0), + ColumnDataType::String => ValueData::StringValue(n.to_string()), - Value::Timestamp(_) => match transform.on_failure { + ColumnDataType::TimestampNanosecond + | ColumnDataType::TimestampMicrosecond + | ColumnDataType::TimestampMillisecond + | ColumnDataType::TimestampSecond => match transform.on_failure { Some(OnFailure::Ignore) => return Ok(None), Some(OnFailure::Default) => { return CoerceUnsupportedEpochTypeSnafu { ty: "Default" }.fail(); @@ -356,14 +272,14 @@ fn coerce_f64_value(n: f64, transform: &Transform) -> Result> } }, - Value::Array(_) | Value::Map(_) => { + ColumnDataType::Binary => { return CoerceJsonTypeToSnafu { - ty: transform.type_.to_str_type(), + ty: transform.type_.as_str_name(), } .fail() } - Value::Null => return Ok(None), + _ => return Ok(None), }; Ok(Some(val)) @@ -376,12 +292,12 @@ macro_rules! coerce_string_value { Err(_) => match $transform.on_failure { Some(OnFailure::Ignore) => Ok(None), Some(OnFailure::Default) => match $transform.get_default() { - Some(default) => coerce_value(default, $transform), - None => coerce_value($transform.get_type_matched_default_val(), $transform), + Some(default) => Ok(Some(default.clone())), + None => $transform.get_type_matched_default_val().map(Some), }, None => CoerceStringToTypeSnafu { s: $s, - ty: $transform.type_.to_str_type(), + ty: $transform.type_.as_str_name(), } .fail(), }, @@ -389,92 +305,85 @@ macro_rules! coerce_string_value { }; } -fn coerce_string_value(s: &String, transform: &Transform) -> Result> { +fn coerce_string_value(s: &str, transform: &Transform) -> Result> { match transform.type_ { - Value::Int8(_) => { + ColumnDataType::Int8 => { coerce_string_value!(s, transform, i32, I8Value) } - Value::Int16(_) => { + ColumnDataType::Int16 => { coerce_string_value!(s, transform, i32, I16Value) } - Value::Int32(_) => { + ColumnDataType::Int32 => { coerce_string_value!(s, transform, i32, I32Value) } - Value::Int64(_) => { + ColumnDataType::Int64 => { coerce_string_value!(s, transform, i64, I64Value) } - Value::Uint8(_) => { + ColumnDataType::Uint8 => { coerce_string_value!(s, transform, u32, U8Value) } - Value::Uint16(_) => { + ColumnDataType::Uint16 => { coerce_string_value!(s, transform, u32, U16Value) } - Value::Uint32(_) => { + ColumnDataType::Uint32 => { coerce_string_value!(s, transform, u32, U32Value) } - Value::Uint64(_) => { + ColumnDataType::Uint64 => { coerce_string_value!(s, transform, u64, U64Value) } - Value::Float32(_) => { + ColumnDataType::Float32 => { coerce_string_value!(s, transform, f32, F32Value) } - Value::Float64(_) => { + ColumnDataType::Float64 => { coerce_string_value!(s, transform, f64, F64Value) } - Value::Boolean(_) => { + ColumnDataType::Boolean => { coerce_string_value!(s, transform, bool, BoolValue) } - Value::String(_) => Ok(Some(ValueData::StringValue(s.to_string()))), + ColumnDataType::String => Ok(Some(ValueData::StringValue(s.to_string()))), - Value::Timestamp(_) => match transform.on_failure { + ColumnDataType::TimestampNanosecond + | ColumnDataType::TimestampMicrosecond + | ColumnDataType::TimestampMillisecond + | ColumnDataType::TimestampSecond => match transform.on_failure { Some(OnFailure::Ignore) => Ok(None), Some(OnFailure::Default) => CoerceUnsupportedEpochTypeSnafu { ty: "Default" }.fail(), None => CoerceUnsupportedEpochTypeSnafu { ty: "String" }.fail(), }, - Value::Array(_) | Value::Map(_) => CoerceStringToTypeSnafu { + ColumnDataType::Binary => CoerceStringToTypeSnafu { s, - ty: transform.type_.to_str_type(), + ty: transform.type_.as_str_name(), } .fail(), - Value::Null => Ok(None), + _ => Ok(None), } } -fn coerce_json_value(v: &Value, transform: &Transform) -> Result> { +fn coerce_json_value(v: &VrlValue, transform: &Transform) -> Result> { match &transform.type_ { - Value::Array(_) | Value::Map(_) => (), + ColumnDataType::Binary => (), t => { return CoerceTypeToJsonSnafu { - ty: t.to_str_type(), + ty: t.as_str_name(), } .fail(); } } - match v { - Value::Map(_) => { - let data: jsonb::Value = v.into(); - Ok(Some(ValueData::BinaryValue(data.to_vec()))) - } - Value::Array(_) => { - let data: jsonb::Value = v.into(); - Ok(Some(ValueData::BinaryValue(data.to_vec()))) - } - _ => CoerceTypeToJsonSnafu { - ty: v.to_str_type(), - } - .fail(), - } + let data: jsonb::Value = vrl_value_to_jsonb_value(v); + Ok(Some(ValueData::BinaryValue(data.to_vec()))) } #[cfg(test)] mod tests { + use vrl::prelude::Bytes; + use super::*; use crate::etl::field::Fields; @@ -482,7 +391,7 @@ mod tests { fn test_coerce_string_without_on_failure() { let transform = Transform { fields: Fields::default(), - type_: Value::Int32(0), + type_: ColumnDataType::Int32, default: None, index: None, on_failure: None, @@ -491,14 +400,14 @@ mod tests { // valid string { - let val = Value::String("123".to_string()); + let val = VrlValue::Integer(123); let result = coerce_value(&val, &transform).unwrap(); assert_eq!(result, Some(ValueData::I32Value(123))); } // invalid string { - let val = Value::String("hello".to_string()); + let val = VrlValue::Bytes(Bytes::from("hello")); let result = coerce_value(&val, &transform); assert!(result.is_err()); } @@ -508,14 +417,14 @@ mod tests { fn test_coerce_string_with_on_failure_ignore() { let transform = Transform { fields: Fields::default(), - type_: Value::Int32(0), + type_: ColumnDataType::Int32, default: None, index: None, on_failure: Some(OnFailure::Ignore), tag: false, }; - let val = Value::String("hello".to_string()); + let val = VrlValue::Bytes(Bytes::from("hello")); let result = coerce_value(&val, &transform).unwrap(); assert_eq!(result, None); } @@ -524,7 +433,7 @@ mod tests { fn test_coerce_string_with_on_failure_default() { let mut transform = Transform { fields: Fields::default(), - type_: Value::Int32(0), + type_: ColumnDataType::Int32, default: None, index: None, on_failure: Some(OnFailure::Default), @@ -533,15 +442,15 @@ mod tests { // with no explicit default value { - let val = Value::String("hello".to_string()); + let val = VrlValue::Bytes(Bytes::from("hello")); let result = coerce_value(&val, &transform).unwrap(); assert_eq!(result, Some(ValueData::I32Value(0))); } // with explicit default value { - transform.default = Some(Value::Int32(42)); - let val = Value::String("hello".to_string()); + transform.default = Some(ValueData::I32Value(42)); + let val = VrlValue::Bytes(Bytes::from("hello")); let result = coerce_value(&val, &transform).unwrap(); assert_eq!(result, Some(ValueData::I32Value(42))); } diff --git a/src/pipeline/src/etl/value.rs b/src/pipeline/src/etl/value.rs index f5904ed31e..ff0d7bd00c 100644 --- a/src/pipeline/src/etl/value.rs +++ b/src/pipeline/src/etl/value.rs @@ -12,982 +12,188 @@ // See the License for the specific language governing permissions and // limitations under the License. -pub mod array; -pub mod map; -pub mod time; - use std::collections::BTreeMap; -use std::result::Result as StdResult; -pub use array::Array; -use jsonb::{Number as JsonbNumber, Object as JsonbObject, Value as JsonbValue}; -use jsonpath_rust::parser::{parse_json_path, JsonPathIndex}; -use jsonpath_rust::path::{JsonLike, Path}; -use jsonpath_rust::{jsp_idx, jsp_obj, JsonPath, JsonPathParserError, JsonPathStr}; -pub use map::Map; -use regex::Regex; +use api::v1::value::ValueData; +use api::v1::ColumnDataType; +use ordered_float::NotNan; use snafu::{OptionExt, ResultExt}; -pub use time::Timestamp; +use vrl::prelude::Bytes; +use vrl::value::{KeyString, Value as VrlValue}; use crate::error::{ - Error, Result, UnsupportedNumberTypeSnafu, ValueDefaultValueUnsupportedSnafu, - ValueInvalidResolutionSnafu, ValueMustBeMapSnafu, ValueParseBooleanSnafu, ValueParseFloatSnafu, - ValueParseIntSnafu, ValueParseTypeSnafu, ValueUnsupportedYamlTypeSnafu, - ValueYamlKeyMustBeStringSnafu, + FloatIsNanSnafu, Result, ValueDefaultValueUnsupportedSnafu, ValueInvalidResolutionSnafu, + ValueParseBooleanSnafu, ValueParseFloatSnafu, ValueParseIntSnafu, ValueParseTypeSnafu, + ValueUnsupportedYamlTypeSnafu, ValueYamlKeyMustBeStringSnafu, }; -/// Value can be used as type -/// acts as value: the enclosed value is the actual value -/// acts as type: the enclosed value is the default value -#[derive(Debug, Clone, PartialEq, Default)] -pub enum Value { - // as value: null - // as type: no type specified - #[default] - Null, +pub(crate) const NANOSECOND_RESOLUTION: &str = "nanosecond"; +pub(crate) const NANO_RESOLUTION: &str = "nano"; +pub(crate) const NS_RESOLUTION: &str = "ns"; +pub(crate) const MICROSECOND_RESOLUTION: &str = "microsecond"; +pub(crate) const MICRO_RESOLUTION: &str = "micro"; +pub(crate) const US_RESOLUTION: &str = "us"; +pub(crate) const MILLISECOND_RESOLUTION: &str = "millisecond"; +pub(crate) const MILLI_RESOLUTION: &str = "milli"; +pub(crate) const MS_RESOLUTION: &str = "ms"; +pub(crate) const SECOND_RESOLUTION: &str = "second"; +pub(crate) const SEC_RESOLUTION: &str = "sec"; +pub(crate) const S_RESOLUTION: &str = "s"; - Int8(i8), - Int16(i16), - Int32(i32), - Int64(i64), +pub(crate) const VALID_RESOLUTIONS: [&str; 12] = [ + NANOSECOND_RESOLUTION, + NANO_RESOLUTION, + NS_RESOLUTION, + MICROSECOND_RESOLUTION, + MICRO_RESOLUTION, + US_RESOLUTION, + MILLISECOND_RESOLUTION, + MILLI_RESOLUTION, + MS_RESOLUTION, + SECOND_RESOLUTION, + SEC_RESOLUTION, + S_RESOLUTION, +]; - Uint8(u8), - Uint16(u16), - Uint32(u32), - Uint64(u64), +pub fn parse_str_type(t: &str) -> Result { + let mut parts = t.splitn(2, ','); + let head = parts.next().unwrap_or_default(); + let tail = parts.next().map(|s| s.trim().to_string()); + match head.to_lowercase().as_str() { + "int8" => Ok(ColumnDataType::Int8), + "int16" => Ok(ColumnDataType::Int16), + "int32" => Ok(ColumnDataType::Int32), + "int64" => Ok(ColumnDataType::Int64), - Float32(f32), - Float64(f64), + "uint8" => Ok(ColumnDataType::Uint8), + "uint16" => Ok(ColumnDataType::Uint16), + "uint32" => Ok(ColumnDataType::Uint32), + "uint64" => Ok(ColumnDataType::Uint64), - Boolean(bool), - String(String), + "float32" => Ok(ColumnDataType::Float32), + "float64" => Ok(ColumnDataType::Float64), - Timestamp(Timestamp), + "boolean" => Ok(ColumnDataType::Boolean), + "string" => Ok(ColumnDataType::String), - /// We only consider object and array to be json types. - Array(Array), - Map(Map), + "timestamp" | "epoch" | "time" => match tail { + Some(resolution) if !resolution.is_empty() => match resolution.as_str() { + NANOSECOND_RESOLUTION | NANO_RESOLUTION | NS_RESOLUTION => { + Ok(ColumnDataType::TimestampNanosecond) + } + MICROSECOND_RESOLUTION | MICRO_RESOLUTION | US_RESOLUTION => { + Ok(ColumnDataType::TimestampMicrosecond) + } + MILLISECOND_RESOLUTION | MILLI_RESOLUTION | MS_RESOLUTION => { + Ok(ColumnDataType::TimestampMillisecond) + } + SECOND_RESOLUTION | SEC_RESOLUTION | S_RESOLUTION => { + Ok(ColumnDataType::TimestampSecond) + } + _ => ValueInvalidResolutionSnafu { + resolution, + valid_resolution: VALID_RESOLUTIONS.join(","), + } + .fail(), + }, + _ => Ok(ColumnDataType::TimestampNanosecond), + }, + + // We only consider object and array to be json types. and use Map to represent json + // TODO(qtang): Needs to be defined with better semantics + "json" => Ok(ColumnDataType::Binary), + + _ => ValueParseTypeSnafu { t }.fail(), + } } -impl Value { - pub fn get(&self, key: &str) -> Option<&Self> { - match self { - Value::Map(map) => map.get(key), - _ => None, +pub fn parse_str_value(type_: &ColumnDataType, v: &str) -> Result { + match type_ { + ColumnDataType::Int8 => v + .parse::() + .map(|v| ValueData::I8Value(v as i32)) + .context(ValueParseIntSnafu { ty: "int8", v }), + ColumnDataType::Int16 => v + .parse::() + .map(|v| ValueData::I16Value(v as i32)) + .context(ValueParseIntSnafu { ty: "int16", v }), + ColumnDataType::Int32 => v + .parse::() + .map(ValueData::I32Value) + .context(ValueParseIntSnafu { ty: "int32", v }), + ColumnDataType::Int64 => v + .parse::() + .map(ValueData::I64Value) + .context(ValueParseIntSnafu { ty: "int64", v }), + + ColumnDataType::Uint8 => v + .parse::() + .map(|v| ValueData::U8Value(v as u32)) + .context(ValueParseIntSnafu { ty: "uint8", v }), + ColumnDataType::Uint16 => v + .parse::() + .map(|v| ValueData::U16Value(v as u32)) + .context(ValueParseIntSnafu { ty: "uint16", v }), + ColumnDataType::Uint32 => v + .parse::() + .map(ValueData::U32Value) + .context(ValueParseIntSnafu { ty: "uint32", v }), + ColumnDataType::Uint64 => v + .parse::() + .map(ValueData::U64Value) + .context(ValueParseIntSnafu { ty: "uint64", v }), + + ColumnDataType::Float32 => v + .parse::() + .map(ValueData::F32Value) + .context(ValueParseFloatSnafu { ty: "float32", v }), + ColumnDataType::Float64 => v + .parse::() + .map(ValueData::F64Value) + .context(ValueParseFloatSnafu { ty: "float64", v }), + + ColumnDataType::Boolean => v + .parse::() + .map(ValueData::BoolValue) + .context(ValueParseBooleanSnafu { ty: "boolean", v }), + ColumnDataType::String => Ok(ValueData::StringValue(v.to_string())), + + _ => ValueDefaultValueUnsupportedSnafu { + value: format!("{:?}", type_), } + .fail(), } +} - pub fn get_mut(&mut self, key: &str) -> Option<&mut Self> { - match self { - Value::Map(map) => map.get_mut(key), - _ => None, - } - } - - pub fn remove(&mut self, key: &str) -> Option { - match self { - Value::Map(map) => map.remove(key), - _ => None, - } - } - - pub fn extend(&mut self, other: Map) -> Result<()> { - match self { - Value::Map(map) => { - map.extend(other); - Ok(()) - } - _ => ValueMustBeMapSnafu.fail(), - } - } - - pub fn insert(&mut self, key: String, value: Value) -> Result<()> { - match self { - Value::Map(map) => { - map.insert(key, value); - Ok(()) - } - _ => ValueMustBeMapSnafu.fail(), - } - } - - pub fn is_null(&self) -> bool { - matches!(self, Value::Null) - } - - pub fn parse_str_type(t: &str) -> Result { - let mut parts = t.splitn(2, ','); - let head = parts.next().unwrap_or_default(); - let tail = parts.next().map(|s| s.trim().to_string()); - match head.to_lowercase().as_str() { - "int8" => Ok(Value::Int8(0)), - "int16" => Ok(Value::Int16(0)), - "int32" => Ok(Value::Int32(0)), - "int64" => Ok(Value::Int64(0)), - - "uint8" => Ok(Value::Uint8(0)), - "uint16" => Ok(Value::Uint16(0)), - "uint32" => Ok(Value::Uint32(0)), - "uint64" => Ok(Value::Uint64(0)), - - "float32" => Ok(Value::Float32(0.0)), - "float64" => Ok(Value::Float64(0.0)), - - "boolean" => Ok(Value::Boolean(false)), - "string" => Ok(Value::String("".to_string())), - - "timestamp" | "epoch" | "time" => match tail { - Some(resolution) if !resolution.is_empty() => match resolution.as_str() { - time::NANOSECOND_RESOLUTION | time::NANO_RESOLUTION | time::NS_RESOLUTION => { - Ok(Value::Timestamp(Timestamp::Nanosecond(0))) - } - time::MICROSECOND_RESOLUTION | time::MICRO_RESOLUTION | time::US_RESOLUTION => { - Ok(Value::Timestamp(Timestamp::Microsecond(0))) - } - time::MILLISECOND_RESOLUTION | time::MILLI_RESOLUTION | time::MS_RESOLUTION => { - Ok(Value::Timestamp(Timestamp::Millisecond(0))) - } - time::SECOND_RESOLUTION | time::SEC_RESOLUTION | time::S_RESOLUTION => { - Ok(Value::Timestamp(Timestamp::Second(0))) - } - _ => ValueInvalidResolutionSnafu { - resolution, - valid_resolution: time::VALID_RESOLUTIONS.join(","), - } - .fail(), - }, - _ => Ok(Value::Timestamp(Timestamp::Nanosecond(0))), - }, - - // We only consider object and array to be json types. and use Map to represent json - // TODO(qtang): Needs to be defined with better semantics - "json" => Ok(Value::Map(Map::default())), - - _ => ValueParseTypeSnafu { t }.fail(), - } - } - - /// only support string, bool, number, null - pub fn parse_str_value(&self, v: &str) -> Result { - match self { - Value::Int8(_) => v - .parse::() - .map(Value::Int8) - .context(ValueParseIntSnafu { ty: "int8", v }), - Value::Int16(_) => v - .parse::() - .map(Value::Int16) - .context(ValueParseIntSnafu { ty: "int16", v }), - Value::Int32(_) => v - .parse::() - .map(Value::Int32) - .context(ValueParseIntSnafu { ty: "int32", v }), - Value::Int64(_) => v - .parse::() - .map(Value::Int64) - .context(ValueParseIntSnafu { ty: "int64", v }), - - Value::Uint8(_) => v - .parse::() - .map(Value::Uint8) - .context(ValueParseIntSnafu { ty: "uint8", v }), - Value::Uint16(_) => v - .parse::() - .map(Value::Uint16) - .context(ValueParseIntSnafu { ty: "uint16", v }), - Value::Uint32(_) => v - .parse::() - .map(Value::Uint32) - .context(ValueParseIntSnafu { ty: "uint32", v }), - Value::Uint64(_) => v - .parse::() - .map(Value::Uint64) - .context(ValueParseIntSnafu { ty: "uint64", v }), - - Value::Float32(_) => v - .parse::() - .map(Value::Float32) - .context(ValueParseFloatSnafu { ty: "float32", v }), - Value::Float64(_) => v +pub fn yaml_to_vrl_value(v: &yaml_rust::Yaml) -> Result { + match v { + yaml_rust::Yaml::Null => Ok(VrlValue::Null), + yaml_rust::Yaml::Boolean(v) => Ok(VrlValue::Boolean(*v)), + yaml_rust::Yaml::Integer(v) => Ok(VrlValue::Integer(*v)), + yaml_rust::Yaml::Real(v) => { + let f = v .parse::() - .map(Value::Float64) - .context(ValueParseFloatSnafu { ty: "float64", v }), - - Value::Boolean(_) => v - .parse::() - .map(Value::Boolean) - .context(ValueParseBooleanSnafu { ty: "boolean", v }), - Value::String(_) => Ok(Value::String(v.to_string())), - - Value::Null => Ok(Value::Null), - - _ => ValueDefaultValueUnsupportedSnafu { - value: format!("{:?}", self), + .context(ValueParseFloatSnafu { ty: "float64", v })?; + NotNan::new(f).map(VrlValue::Float).context(FloatIsNanSnafu) + } + yaml_rust::Yaml::String(v) => Ok(VrlValue::Bytes(Bytes::from(v.to_string()))), + yaml_rust::Yaml::Array(arr) => { + let mut values = vec![]; + for v in arr { + values.push(yaml_to_vrl_value(v)?); } - .fail(), + Ok(VrlValue::Array(values)) } - } - - /// only support string, bool, number, null - pub fn to_str_value(&self) -> String { - match self { - Value::Int8(v) => format!("{}", v), - Value::Int16(v) => format!("{}", v), - Value::Int32(v) => format!("{}", v), - Value::Int64(v) => format!("{}", v), - - Value::Uint8(v) => format!("{}", v), - Value::Uint16(v) => format!("{}", v), - Value::Uint32(v) => format!("{}", v), - Value::Uint64(v) => format!("{}", v), - - Value::Float32(v) => format!("{}", v), - Value::Float64(v) => format!("{}", v), - - Value::Boolean(v) => format!("{}", v), - Value::String(v) => v.to_string(), - - v => v.to_string(), + yaml_rust::Yaml::Hash(v) => { + let mut values = BTreeMap::new(); + for (k, v) in v { + let key = k + .as_str() + .with_context(|| ValueYamlKeyMustBeStringSnafu { value: v.clone() })?; + values.insert(KeyString::from(key), yaml_to_vrl_value(v)?); + } + Ok(VrlValue::Object(values)) } - } - - pub fn to_str_type(&self) -> &str { - match self { - Value::Int8(_) => "int8", - Value::Int16(_) => "int16", - Value::Int32(_) => "int32", - Value::Int64(_) => "int64", - - Value::Uint8(_) => "uint8", - Value::Uint16(_) => "uint16", - Value::Uint32(_) => "uint32", - Value::Uint64(_) => "uint64", - - Value::Float32(_) => "float32", - Value::Float64(_) => "float64", - - Value::Boolean(_) => "boolean", - Value::String(_) => "string", - - Value::Timestamp(_) => "epoch", - - Value::Array(_) | Value::Map(_) => "json", - - Value::Null => "null", - } - } - - pub fn as_str(&self) -> Option<&str> { - match self { - Value::String(v) => Some(v), - _ => None, - } - } - - pub fn as_i64(&self) -> Option { - match self { - Value::Uint32(v) => Some(*v as i64), - Value::Uint16(v) => Some(*v as i64), - Value::Uint8(v) => Some(*v as i64), - Value::Int64(v) => Some(*v), - Value::Int32(v) => Some(*v as i64), - Value::Int16(v) => Some(*v as i64), - Value::Int8(v) => Some(*v as i64), - _ => None, - } - } - - pub fn as_u64(&self) -> Option { - match self { - Value::Uint64(v) => Some(*v), - Value::Uint32(v) => Some(*v as u64), - Value::Uint16(v) => Some(*v as u64), - Value::Uint8(v) => Some(*v as u64), - _ => None, - } - } - - pub fn as_f64(&self) -> Option { - match self { - Value::Float32(v) => Some(*v as f64), - Value::Float64(v) => Some(*v), - Value::Uint64(v) => Some(*v as f64), - Value::Uint32(v) => Some(*v as f64), - Value::Uint16(v) => Some(*v as f64), - Value::Uint8(v) => Some(*v as f64), - Value::Int64(v) => Some(*v as f64), - Value::Int32(v) => Some(*v as f64), - Value::Int16(v) => Some(*v as f64), - Value::Int8(v) => Some(*v as f64), - _ => None, - } - } - - pub fn as_map_mut(&mut self) -> Option<&mut BTreeMap> { - match self { - Value::Map(map) => Some(map), - _ => None, - } - } - - pub fn as_map(&self) -> Option<&BTreeMap> { - match self { - Value::Map(map) => Some(map), - _ => None, - } - } - - pub fn into_map(self) -> Option> { - match self { - Value::Map(map) => Some(map.values), - _ => None, - } - } - - // ref https://github.com/serde-rs/json/blob/master/src/value/mod.rs#L779 - pub fn pointer(&self, pointer: &str) -> Option<&Value> { - if pointer.is_empty() { - return Some(self); - } - if !pointer.starts_with('/') { - return None; - } - pointer - .split('/') - .skip(1) - .map(|x| x.replace("~1", "/").replace("~0", "~")) - .try_fold(self, |target, token| match target { - Value::Map(map) => map.get(&token), - Value::Array(list) => parse_index(&token).and_then(|x| list.get(x)), - _ => None, - }) - } - - // ref https://github.com/serde-rs/json/blob/master/src/value/mod.rs#L834 - pub fn pointer_mut(&mut self, pointer: &str) -> Option<&mut Value> { - if pointer.is_empty() { - return Some(self); - } - if !pointer.starts_with('/') { - return None; - } - pointer - .split('/') - .skip(1) - .map(|x| x.replace("~1", "/").replace("~0", "~")) - .try_fold(self, |target, token| match target { - Value::Map(map) => map.get_mut(&token), - Value::Array(list) => parse_index(&token).and_then(move |x| list.get_mut(x)), - _ => None, - }) - } -} - -// ref https://github.com/serde-rs/json/blob/master/src/value/mod.rs#L259 -fn parse_index(s: &str) -> Option { - if s.starts_with('+') || (s.starts_with('0') && s.len() != 1) { - return None; - } - s.parse().ok() -} - -impl std::fmt::Display for Value { - fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { - let str = match self { - Value::Null => "null".to_string(), - - Value::Int8(v) => format!("int8({})", v), - Value::Int16(v) => format!("int16({})", v), - Value::Int32(v) => format!("int32({})", v), - Value::Int64(v) => format!("int64({})", v), - - Value::Uint8(v) => format!("uint8({})", v), - Value::Uint16(v) => format!("uint16({})", v), - Value::Uint32(v) => format!("uint32({})", v), - Value::Uint64(v) => format!("uint64({})", v), - - Value::Float32(v) => format!("float32({})", v), - Value::Float64(v) => format!("float64({})", v), - - Value::Boolean(v) => format!("boolean({})", v), - Value::String(v) => format!("string({})", v), - - Value::Timestamp(v) => format!("epoch({})", v), - - Value::Array(v) => format!("{}", v), - Value::Map(v) => format!("{}", v), - }; - - write!(f, "{}", str) - } -} - -impl TryFrom for Value { - type Error = Error; - - fn try_from(v: simd_json::value::OwnedValue) -> Result { - match v { - simd_json::value::OwnedValue::Static(v) => match v { - simd_json::value::StaticNode::Null => Ok(Value::Null), - simd_json::value::StaticNode::Bool(v) => Ok(Value::Boolean(v)), - simd_json::value::StaticNode::I64(v) => Ok(Value::Int64(v)), - simd_json::value::StaticNode::U64(v) => Ok(Value::Uint64(v)), - simd_json::value::StaticNode::F64(v) => Ok(Value::Float64(v)), - }, - simd_json::OwnedValue::String(s) => Ok(Value::String(s)), - simd_json::OwnedValue::Array(values) => { - let mut re = Vec::with_capacity(values.len()); - for v in values.into_iter() { - re.push(Value::try_from(v)?); - } - Ok(Value::Array(Array { values: re })) - } - simd_json::OwnedValue::Object(map) => { - let mut values = BTreeMap::new(); - for (k, v) in map.into_iter() { - values.insert(k, Value::try_from(v)?); - } - Ok(Value::Map(Map { values })) - } - } - } -} - -impl TryFrom for Value { - type Error = Error; - - fn try_from(v: serde_json::Value) -> Result { - match v { - serde_json::Value::Null => Ok(Value::Null), - serde_json::Value::Bool(v) => Ok(Value::Boolean(v)), - serde_json::Value::Number(v) => { - if let Some(v) = v.as_i64() { - Ok(Value::Int64(v)) - } else if let Some(v) = v.as_u64() { - Ok(Value::Uint64(v)) - } else if let Some(v) = v.as_f64() { - Ok(Value::Float64(v)) - } else { - UnsupportedNumberTypeSnafu { value: v }.fail() - } - } - serde_json::Value::String(v) => Ok(Value::String(v)), - serde_json::Value::Array(v) => { - let mut values = Vec::with_capacity(v.len()); - for v in v { - values.push(Value::try_from(v)?); - } - Ok(Value::Array(Array { values })) - } - serde_json::Value::Object(v) => { - let mut values = BTreeMap::new(); - for (k, v) in v { - values.insert(k, Value::try_from(v)?); - } - Ok(Value::Map(Map { values })) - } - } - } -} - -impl TryFrom<&yaml_rust::Yaml> for Value { - type Error = Error; - - fn try_from(v: &yaml_rust::Yaml) -> Result { - match v { - yaml_rust::Yaml::Null => Ok(Value::Null), - yaml_rust::Yaml::Boolean(v) => Ok(Value::Boolean(*v)), - yaml_rust::Yaml::Integer(v) => Ok(Value::Int64(*v)), - yaml_rust::Yaml::Real(v) => match v.parse::() { - Ok(v) => Ok(Value::Float64(v)), - Err(e) => Err(e).context(ValueParseFloatSnafu { ty: "float64", v }), - }, - yaml_rust::Yaml::String(v) => Ok(Value::String(v.to_string())), - yaml_rust::Yaml::Array(arr) => { - let mut values = vec![]; - for v in arr { - values.push(Value::try_from(v)?); - } - Ok(Value::Array(Array { values })) - } - yaml_rust::Yaml::Hash(v) => { - let mut values = BTreeMap::new(); - for (k, v) in v { - let key = k - .as_str() - .with_context(|| ValueYamlKeyMustBeStringSnafu { value: v.clone() })?; - values.insert(key.to_string(), Value::try_from(v)?); - } - Ok(Value::Map(Map { values })) - } - _ => ValueUnsupportedYamlTypeSnafu { value: v.clone() }.fail(), - } - } -} - -impl From<&Value> for JsonbValue<'_> { - fn from(value: &Value) -> Self { - match value { - Value::Null => JsonbValue::Null, - Value::Boolean(v) => JsonbValue::Bool(*v), - - Value::Int8(v) => JsonbValue::Number(JsonbNumber::Int64(*v as i64)), - Value::Int16(v) => JsonbValue::Number(JsonbNumber::Int64(*v as i64)), - Value::Int32(v) => JsonbValue::Number(JsonbNumber::Int64(*v as i64)), - Value::Int64(v) => JsonbValue::Number(JsonbNumber::Int64(*v)), - - Value::Uint8(v) => JsonbValue::Number(JsonbNumber::UInt64(*v as u64)), - Value::Uint16(v) => JsonbValue::Number(JsonbNumber::UInt64(*v as u64)), - Value::Uint32(v) => JsonbValue::Number(JsonbNumber::UInt64(*v as u64)), - Value::Uint64(v) => JsonbValue::Number(JsonbNumber::UInt64(*v)), - Value::Float32(v) => JsonbValue::Number(JsonbNumber::Float64(*v as f64)), - Value::Float64(v) => JsonbValue::Number(JsonbNumber::Float64(*v)), - Value::String(v) => JsonbValue::String(v.clone().into()), - Value::Timestamp(v) => JsonbValue::String(v.to_string().into()), - Value::Array(arr) => { - let mut vals: Vec = Vec::with_capacity(arr.len()); - for val in arr.iter() { - vals.push(val.into()); - } - JsonbValue::Array(vals) - } - Value::Map(obj) => { - let mut map = JsonbObject::new(); - for (k, v) in obj.iter() { - let val: JsonbValue = v.into(); - map.insert(k.to_string(), val); - } - JsonbValue::Object(map) - } - } - } -} - -impl From for JsonbValue<'_> { - fn from(value: Value) -> Self { - match value { - Value::Null => JsonbValue::Null, - Value::Boolean(v) => JsonbValue::Bool(v), - - Value::Int8(v) => JsonbValue::Number(JsonbNumber::Int64(v as i64)), - Value::Int16(v) => JsonbValue::Number(JsonbNumber::Int64(v as i64)), - Value::Int32(v) => JsonbValue::Number(JsonbNumber::Int64(v as i64)), - Value::Int64(v) => JsonbValue::Number(JsonbNumber::Int64(v)), - - Value::Uint8(v) => JsonbValue::Number(JsonbNumber::UInt64(v as u64)), - Value::Uint16(v) => JsonbValue::Number(JsonbNumber::UInt64(v as u64)), - Value::Uint32(v) => JsonbValue::Number(JsonbNumber::UInt64(v as u64)), - Value::Uint64(v) => JsonbValue::Number(JsonbNumber::UInt64(v)), - Value::Float32(v) => JsonbValue::Number(JsonbNumber::Float64(v as f64)), - Value::Float64(v) => JsonbValue::Number(JsonbNumber::Float64(v)), - Value::String(v) => JsonbValue::String(v.into()), - Value::Timestamp(v) => JsonbValue::String(v.to_string().into()), - Value::Array(arr) => { - let mut vals: Vec = Vec::with_capacity(arr.len()); - for val in arr.into_iter() { - vals.push(val.into()); - } - JsonbValue::Array(vals) - } - Value::Map(obj) => { - let mut map = JsonbObject::new(); - for (k, v) in obj.values.into_iter() { - let val: JsonbValue = v.into(); - map.insert(k, val); - } - JsonbValue::Object(map) - } - } - } -} - -impl From for Value { - fn from(value: String) -> Self { - Value::String(value) - } -} - -impl From<&str> for Value { - fn from(value: &str) -> Self { - Value::String(value.to_string()) - } -} - -impl From for Value { - fn from(value: i64) -> Self { - Value::Int64(value) - } -} - -impl From for Value { - fn from(value: f64) -> Self { - Value::Float64(value) - } -} - -impl From> for Value { - fn from(value: Vec) -> Self { - Value::Array(Array { - values: value.into_iter().map(Value::String).collect(), - }) - } -} - -impl From> for Value { - fn from(value: Vec) -> Self { - Value::Array(Array { values: value }) - } -} - -impl From for Value { - fn from(value: bool) -> Self { - Value::Boolean(value) - } -} - -impl JsonLike for Value { - fn get(&self, key: &str) -> Option<&Self> { - self.get(key) - } - - fn itre(&self, pref: String) -> Vec> { - let res = match self { - Value::Array(elems) => { - let mut res = vec![]; - for (idx, el) in elems.iter().enumerate() { - res.push(jsonpath_rust::JsonPathValue::Slice( - el, - jsonpath_rust::jsp_idx(&pref, idx), - )); - } - res - } - Value::Map(elems) => { - let mut res = vec![]; - for (key, el) in elems.iter() { - res.push(jsonpath_rust::JsonPathValue::Slice( - el, - jsonpath_rust::jsp_obj(&pref, key), - )); - } - res - } - _ => vec![], - }; - if res.is_empty() { - vec![jsonpath_rust::JsonPathValue::NoValue] - } else { - res - } - } - - fn array_len(&self) -> jsonpath_rust::JsonPathValue<'static, Self> { - match self { - Value::Array(elems) => { - jsonpath_rust::JsonPathValue::NewValue(Value::Int64(elems.len() as i64)) - } - _ => jsonpath_rust::JsonPathValue::NoValue, - } - } - - fn init_with_usize(cnt: usize) -> Self { - Value::Int64(cnt as i64) - } - - fn deep_flatten(&self, pref: String) -> Vec<(&Self, String)> { - let mut acc = vec![]; - match self { - Value::Map(elems) => { - for (f, v) in elems.iter() { - let pref = jsp_obj(&pref, f); - acc.push((v, pref.clone())); - acc.append(&mut v.deep_flatten(pref)); - } - } - Value::Array(elems) => { - for (i, v) in elems.iter().enumerate() { - let pref = jsp_idx(&pref, i); - acc.push((v, pref.clone())); - acc.append(&mut v.deep_flatten(pref)); - } - } - _ => (), - } - acc - } - - fn deep_path_by_key<'a>( - &'a self, - key: jsonpath_rust::path::ObjectField<'a, Self>, - pref: String, - ) -> Vec<(&'a Self, String)> { - let mut result: Vec<(&'a Value, String)> = jsonpath_rust::JsonPathValue::vec_as_pair( - key.find(jsonpath_rust::JsonPathValue::new_slice(self, pref.clone())), - ); - match self { - Value::Map(elems) => { - let mut next_levels: Vec<(&'a Value, String)> = elems - .iter() - .flat_map(|(k, v)| v.deep_path_by_key(key.clone(), jsp_obj(&pref, k))) - .collect(); - result.append(&mut next_levels); - result - } - Value::Array(elems) => { - let mut next_levels: Vec<(&'a Value, String)> = elems - .iter() - .enumerate() - .flat_map(|(i, v)| v.deep_path_by_key(key.clone(), jsp_idx(&pref, i))) - .collect(); - result.append(&mut next_levels); - result - } - _ => result, - } - } - - fn as_u64(&self) -> Option { - match self { - Value::Uint64(v) => Some(*v), - Value::Uint32(v) => Some(*v as u64), - Value::Uint16(v) => Some(*v as u64), - Value::Uint8(v) => Some(*v as u64), - Value::Int64(v) if *v >= 0 => Some(*v as u64), - Value::Int32(v) if *v >= 0 => Some(*v as u64), - Value::Int16(v) if *v >= 0 => Some(*v as u64), - Value::Int8(v) if *v >= 0 => Some(*v as u64), - Value::Float64(v) if *v >= 0.0 => Some(*v as u64), - Value::Float32(v) if *v >= 0.0 => Some(*v as u64), - _ => None, - } - } - - fn is_array(&self) -> bool { - matches!(self, Value::Array(_)) - } - - fn as_array(&self) -> Option<&Vec> { - match self { - Value::Array(arr) => Some(&arr.values), - _ => None, - } - } - - fn size(left: Vec<&Self>, right: Vec<&Self>) -> bool { - if let Some(v) = right.first() { - let sz = match v { - Value::Int64(n) => *n as usize, - Value::Int32(n) => *n as usize, - Value::Int16(n) => *n as usize, - Value::Int8(n) => *n as usize, - - Value::Uint64(n) => *n as usize, - Value::Uint32(n) => *n as usize, - Value::Uint16(n) => *n as usize, - Value::Uint8(n) => *n as usize, - Value::Float32(n) => *n as usize, - Value::Float64(n) => *n as usize, - _ => return false, - }; - for el in left.iter() { - match el { - Value::String(v) if v.len() == sz => true, - Value::Array(elems) if elems.len() == sz => true, - Value::Map(fields) if fields.len() == sz => true, - _ => return false, - }; - } - return true; - } - false - } - - fn sub_set_of(left: Vec<&Self>, right: Vec<&Self>) -> bool { - if left.is_empty() { - return true; - } - if right.is_empty() { - return false; - } - - if let Some(elems) = left.first().and_then(|e| e.as_array()) { - if let Some(Value::Array(right_elems)) = right.first() { - if right_elems.is_empty() { - return false; - } - - for el in elems { - let mut res = false; - - for r in right_elems.iter() { - if el.eq(r) { - res = true - } - } - if !res { - return false; - } - } - return true; - } - } - false - } - - fn any_of(left: Vec<&Self>, right: Vec<&Self>) -> bool { - if left.is_empty() { - return true; - } - if right.is_empty() { - return false; - } - - if let Some(Value::Array(elems)) = right.first() { - if elems.is_empty() { - return false; - } - - for el in left.iter() { - if let Some(left_elems) = el.as_array() { - for l in left_elems.iter() { - for r in elems.iter() { - if l.eq(r) { - return true; - } - } - } - } else { - for r in elems.iter() { - if el.eq(&r) { - return true; - } - } - } - } - } - - false - } - - fn regex(left: Vec<&Self>, right: Vec<&Self>) -> bool { - if left.is_empty() || right.is_empty() { - return false; - } - - match right.first() { - Some(Value::String(str)) => { - if let Ok(regex) = Regex::new(str) { - for el in left.iter() { - if let Some(v) = el.as_str() { - if regex.is_match(v) { - return true; - } - } - } - } - false - } - _ => false, - } - } - - fn inside(left: Vec<&Self>, right: Vec<&Self>) -> bool { - if left.is_empty() { - return false; - } - - match right.first() { - Some(Value::Array(elems)) => { - for el in left.iter() { - if elems.contains(el) { - return true; - } - } - false - } - Some(Value::Map(elems)) => { - for el in left.iter() { - for r in elems.values() { - if el.eq(&r) { - return true; - } - } - } - false - } - _ => false, - } - } - - fn less(left: Vec<&Self>, right: Vec<&Self>) -> bool { - if left.len() == 1 && right.len() == 1 { - match (left.first(), right.first()) { - (Some(l), Some(r)) => l - .as_f64() - .and_then(|v1| r.as_f64().map(|v2| v1 < v2)) - .unwrap_or(false), - _ => false, - } - } else { - false - } - } - - fn eq(left: Vec<&Self>, right: Vec<&Self>) -> bool { - if left.len() != right.len() { - false - } else { - left.iter().zip(right).all(|(a, b)| a.eq(&b)) - } - } - - fn array(data: Vec) -> Self { - Value::Array(Array { values: data }) - } - - fn null() -> Self { - Value::Null - } - - // ref https://github.com/besok/jsonpath-rust/blob/main/src/path/mod.rs#L423 - fn reference( - &self, - path: T, - ) -> std::result::Result, JsonPathParserError> - where - T: Into, - { - Ok(self.pointer(&path_to_json_path(path.into())?)) - } - - // https://github.com/besok/jsonpath-rust/blob/main/src/path/mod.rs#L430 - fn reference_mut( - &mut self, - path: T, - ) -> std::result::Result, JsonPathParserError> - where - T: Into, - { - Ok(self.pointer_mut(&path_to_json_path(path.into())?)) - } -} - -// ref https://github.com/besok/jsonpath-rust/blob/main/src/path/mod.rs#L438 -fn path_to_json_path(path: JsonPathStr) -> StdResult { - convert_part(&parse_json_path(path.as_str())?) -} - -// https://github.com/besok/jsonpath-rust/blob/main/src/path/mod.rs#L442 -fn convert_part(path: &JsonPath) -> StdResult { - match path { - JsonPath::Chain(elems) => elems - .iter() - .map(convert_part) - .collect::>(), - - JsonPath::Index(JsonPathIndex::Single(v)) => Ok(format!("/{}", v)), - JsonPath::Field(e) => Ok(format!("/{}", e)), - JsonPath::Root => Ok("".to_string()), - e => Err(JsonPathParserError::InvalidJsonPath(e.to_string())), + _ => ValueUnsupportedYamlTypeSnafu { value: v.clone() }.fail(), } } diff --git a/src/pipeline/src/etl/value/array.rs b/src/pipeline/src/etl/value/array.rs deleted file mode 100644 index 0658d502df..0000000000 --- a/src/pipeline/src/etl/value/array.rs +++ /dev/null @@ -1,81 +0,0 @@ -// Copyright 2023 Greptime Team -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -use crate::error::{Error, Result}; -use crate::etl::value::Value; - -#[derive(Debug, Clone, PartialEq, Default)] -pub struct Array { - pub values: Vec, -} - -impl Array { - pub fn new() -> Self { - Array { values: vec![] } - } -} - -impl std::fmt::Display for Array { - fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { - let values = self - .values - .iter() - .map(|v| v.to_string()) - .collect::>() - .join(", "); - write!(f, "[{}]", values) - } -} - -impl std::ops::Deref for Array { - type Target = Vec; - - fn deref(&self) -> &Self::Target { - &self.values - } -} - -impl std::ops::DerefMut for Array { - fn deref_mut(&mut self) -> &mut Self::Target { - &mut self.values - } -} - -impl IntoIterator for Array { - type Item = Value; - - type IntoIter = std::vec::IntoIter; - - fn into_iter(self) -> Self::IntoIter { - self.values.into_iter() - } -} - -impl From> for Array { - fn from(values: Vec) -> Self { - Array { values } - } -} - -impl TryFrom> for Array { - type Error = Error; - - fn try_from(value: Vec) -> Result { - let values = value - .into_iter() - .map(|v| v.try_into()) - .collect::>>()?; - Ok(Array { values }) - } -} diff --git a/src/pipeline/src/etl/value/map.rs b/src/pipeline/src/etl/value/map.rs deleted file mode 100644 index 0c92a036c3..0000000000 --- a/src/pipeline/src/etl/value/map.rs +++ /dev/null @@ -1,70 +0,0 @@ -// Copyright 2023 Greptime Team -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -use std::collections::BTreeMap; - -use crate::etl::value::Value; - -#[derive(Debug, Clone, PartialEq, Default)] -pub struct Map { - pub values: BTreeMap, -} - -impl Map { - pub fn one(key: impl Into, value: Value) -> Map { - let mut map = Map::default(); - map.insert(key, value); - map - } - - pub fn insert(&mut self, key: impl Into, value: Value) { - self.values.insert(key.into(), value); - } - - pub fn extend(&mut self, Map { values }: Map) { - self.values.extend(values); - } -} - -impl From> for Map { - fn from(values: BTreeMap) -> Self { - Self { values } - } -} - -impl std::ops::Deref for Map { - type Target = BTreeMap; - - fn deref(&self) -> &Self::Target { - &self.values - } -} - -impl std::ops::DerefMut for Map { - fn deref_mut(&mut self) -> &mut Self::Target { - &mut self.values - } -} - -impl std::fmt::Display for Map { - fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { - let values = self - .values - .iter() - .map(|(k, v)| format!("{}: {}", k, v)) - .collect::>() - .join(", "); - write!(f, "{{{}}}", values) - } -} diff --git a/src/pipeline/src/etl/value/time.rs b/src/pipeline/src/etl/value/time.rs deleted file mode 100644 index ad022ed6f9..0000000000 --- a/src/pipeline/src/etl/value/time.rs +++ /dev/null @@ -1,140 +0,0 @@ -// Copyright 2023 Greptime Team -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -use chrono::{DateTime, Utc}; -use common_time::timestamp::TimeUnit; - -#[derive(Debug, Clone, PartialEq)] -pub enum Timestamp { - Nanosecond(i64), - Microsecond(i64), - Millisecond(i64), - Second(i64), -} - -pub(crate) const NANOSECOND_RESOLUTION: &str = "nanosecond"; -pub(crate) const NANO_RESOLUTION: &str = "nano"; -pub(crate) const NS_RESOLUTION: &str = "ns"; -pub(crate) const MICROSECOND_RESOLUTION: &str = "microsecond"; -pub(crate) const MICRO_RESOLUTION: &str = "micro"; -pub(crate) const US_RESOLUTION: &str = "us"; -pub(crate) const MILLISECOND_RESOLUTION: &str = "millisecond"; -pub(crate) const MILLI_RESOLUTION: &str = "milli"; -pub(crate) const MS_RESOLUTION: &str = "ms"; -pub(crate) const SECOND_RESOLUTION: &str = "second"; -pub(crate) const SEC_RESOLUTION: &str = "sec"; -pub(crate) const S_RESOLUTION: &str = "s"; - -pub(crate) const VALID_RESOLUTIONS: [&str; 12] = [ - NANOSECOND_RESOLUTION, - NANO_RESOLUTION, - NS_RESOLUTION, - MICROSECOND_RESOLUTION, - MICRO_RESOLUTION, - US_RESOLUTION, - MILLISECOND_RESOLUTION, - MILLI_RESOLUTION, - MS_RESOLUTION, - SECOND_RESOLUTION, - SEC_RESOLUTION, - S_RESOLUTION, -]; - -impl Timestamp { - pub(crate) fn timestamp_nanos(&self) -> i64 { - match self { - Timestamp::Nanosecond(v) => *v, - Timestamp::Microsecond(v) => *v * 1_000, - Timestamp::Millisecond(v) => *v * 1_000_000, - Timestamp::Second(v) => *v * 1_000_000_000, - } - } - - pub(crate) fn timestamp_micros(&self) -> i64 { - match self { - Timestamp::Nanosecond(v) => *v / 1_000, - Timestamp::Microsecond(v) => *v, - Timestamp::Millisecond(v) => *v * 1_000, - Timestamp::Second(v) => *v * 1_000_000, - } - } - - pub(crate) fn timestamp_millis(&self) -> i64 { - match self { - Timestamp::Nanosecond(v) => *v / 1_000_000, - Timestamp::Microsecond(v) => *v / 1_000, - Timestamp::Millisecond(v) => *v, - Timestamp::Second(v) => *v * 1_000, - } - } - - pub(crate) fn timestamp(&self) -> i64 { - match self { - Timestamp::Nanosecond(v) => *v / 1_000_000_000, - Timestamp::Microsecond(v) => *v / 1_000_000, - Timestamp::Millisecond(v) => *v / 1_000, - Timestamp::Second(v) => *v, - } - } - - pub(crate) fn to_unit(&self, unit: &TimeUnit) -> i64 { - match unit { - TimeUnit::Second => self.timestamp(), - TimeUnit::Millisecond => self.timestamp_millis(), - TimeUnit::Microsecond => self.timestamp_micros(), - TimeUnit::Nanosecond => self.timestamp_nanos(), - } - } - - pub fn get_unit(&self) -> TimeUnit { - match self { - Timestamp::Nanosecond(_) => TimeUnit::Nanosecond, - Timestamp::Microsecond(_) => TimeUnit::Microsecond, - Timestamp::Millisecond(_) => TimeUnit::Millisecond, - Timestamp::Second(_) => TimeUnit::Second, - } - } - - pub fn to_datetime(&self) -> Option> { - match self { - Timestamp::Nanosecond(v) => Some(DateTime::from_timestamp_nanos(*v)), - Timestamp::Microsecond(v) => DateTime::from_timestamp_micros(*v), - Timestamp::Millisecond(v) => DateTime::from_timestamp_millis(*v), - Timestamp::Second(v) => DateTime::from_timestamp(*v, 0), - } - } - - pub fn from_datetime(dt: DateTime) -> Option { - dt.timestamp_nanos_opt().map(Timestamp::Nanosecond) - } -} - -impl Default for Timestamp { - fn default() -> Self { - Timestamp::Nanosecond(chrono::Utc::now().timestamp_nanos_opt().unwrap_or_default()) - } -} - -impl std::fmt::Display for Timestamp { - fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { - let (value, resolution) = match self { - Timestamp::Nanosecond(v) => (v, NANOSECOND_RESOLUTION), - Timestamp::Microsecond(v) => (v, MICROSECOND_RESOLUTION), - Timestamp::Millisecond(v) => (v, MILLISECOND_RESOLUTION), - Timestamp::Second(v) => (v, SECOND_RESOLUTION), - }; - - write!(f, "{}, resolution: {}", value, resolution) - } -} diff --git a/src/pipeline/src/lib.rs b/src/pipeline/src/lib.rs index 709a93dfab..8792dded4a 100644 --- a/src/pipeline/src/lib.rs +++ b/src/pipeline/src/lib.rs @@ -12,6 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. +#![feature(string_from_utf8_lossy_owned)] + mod dispatcher; pub mod error; mod etl; @@ -24,10 +26,8 @@ pub use etl::processor::Processor; pub use etl::transform::transformer::greptime::{GreptimePipelineParams, SchemaInfo}; pub use etl::transform::transformer::identity_pipeline; pub use etl::transform::GreptimeTransformer; -pub use etl::value::{Array, Map, Timestamp, Value}; pub use etl::{ - json_array_to_map, json_to_map, parse, simd_json_array_to_map, simd_json_to_map, Content, - DispatchedTo, Pipeline, PipelineExecOutput, TransformedOutput, TransformerMode, + parse, Content, DispatchedTo, Pipeline, PipelineExecOutput, TransformedOutput, TransformerMode, }; pub use manager::{ pipeline_operator, table, util, IdentityTimeIndex, PipelineContext, PipelineDefinition, diff --git a/src/pipeline/src/manager.rs b/src/pipeline/src/manager.rs index 70ed8860ac..bca1c2a28e 100644 --- a/src/pipeline/src/manager.rs +++ b/src/pipeline/src/manager.rs @@ -16,18 +16,22 @@ use std::sync::Arc; use api::v1::value::ValueData; use api::v1::ColumnDataType; +use chrono::{DateTime, Utc}; use common_time::timestamp::TimeUnit; use common_time::Timestamp; use datatypes::timestamp::TimestampNanosecond; use itertools::Itertools; use session::context::Channel; -use snafu::ensure; +use snafu::{ensure, OptionExt}; use util::to_pipeline_version; +use vrl::value::Value as VrlValue; -use crate::error::{CastTypeSnafu, InvalidCustomTimeIndexSnafu, PipelineMissingSnafu, Result}; -use crate::etl::value::time::{MS_RESOLUTION, NS_RESOLUTION, S_RESOLUTION, US_RESOLUTION}; +use crate::error::{ + CastTypeSnafu, InvalidCustomTimeIndexSnafu, InvalidTimestampSnafu, PipelineMissingSnafu, Result, +}; +use crate::etl::value::{MS_RESOLUTION, NS_RESOLUTION, S_RESOLUTION, US_RESOLUTION}; use crate::table::PipelineTable; -use crate::{GreptimePipelineParams, Pipeline, Value}; +use crate::{GreptimePipelineParams, Pipeline}; mod pipeline_cache; pub mod pipeline_operator; @@ -232,7 +236,7 @@ impl IdentityTimeIndex { } } - pub fn get_column_name(&self) -> &String { + pub fn get_column_name(&self) -> &str { match self { IdentityTimeIndex::Epoch(field, _, _) => field, IdentityTimeIndex::DateStr(field, _, _) => field, @@ -258,25 +262,25 @@ impl IdentityTimeIndex { } } - pub fn get_timestamp(&self, value: Option<&Value>) -> Result { + pub fn get_timestamp_value(&self, value: Option<&VrlValue>) -> Result { match self { IdentityTimeIndex::Epoch(_, unit, ignore_errors) => { let v = match value { - Some(Value::Int32(v)) => *v as i64, - Some(Value::Int64(v)) => *v, - Some(Value::Uint32(v)) => *v as i64, - Some(Value::Uint64(v)) => *v as i64, - Some(Value::String(s)) => match s.parse::() { + Some(VrlValue::Integer(v)) => *v, + Some(VrlValue::Bytes(s)) => match String::from_utf8_lossy(s).parse::() { Ok(v) => v, Err(_) => { return if_ignore_errors( *ignore_errors, *unit, - format!("failed to convert {} to number", s), + format!( + "failed to convert {} to number", + String::from_utf8_lossy(s) + ), ) } }, - Some(Value::Timestamp(timestamp)) => timestamp.to_unit(unit), + Some(VrlValue::Timestamp(timestamp)) => datetime_utc_to_unit(timestamp, unit)?, Some(v) => { return if_ignore_errors( *ignore_errors, @@ -292,7 +296,7 @@ impl IdentityTimeIndex { } IdentityTimeIndex::DateStr(_, format, ignore_errors) => { let v = match value { - Some(Value::String(s)) => s, + Some(VrlValue::Bytes(s)) => String::from_utf8_lossy(s), Some(v) => { return if_ignore_errors( *ignore_errors, @@ -309,7 +313,7 @@ impl IdentityTimeIndex { } }; - let timestamp = match chrono::DateTime::parse_from_str(v, format) { + let timestamp = match chrono::DateTime::parse_from_str(&v, format) { Ok(ts) => ts, Err(_) => { return if_ignore_errors( @@ -321,13 +325,31 @@ impl IdentityTimeIndex { }; Ok(ValueData::TimestampNanosecondValue( - timestamp.timestamp_nanos_opt().unwrap_or_default(), + timestamp + .timestamp_nanos_opt() + .context(InvalidTimestampSnafu { + input: timestamp.to_rfc3339(), + })?, )) } } } } +fn datetime_utc_to_unit(timestamp: &DateTime, unit: &TimeUnit) -> Result { + let ts = match unit { + TimeUnit::Nanosecond => timestamp + .timestamp_nanos_opt() + .context(InvalidTimestampSnafu { + input: timestamp.to_rfc3339(), + })?, + TimeUnit::Microsecond => timestamp.timestamp_micros(), + TimeUnit::Millisecond => timestamp.timestamp_millis(), + TimeUnit::Second => timestamp.timestamp(), + }; + Ok(ts) +} + fn if_ignore_errors(ignore_errors: bool, unit: TimeUnit, msg: String) -> Result { if ignore_errors { Ok(time_unit_to_value_data( diff --git a/src/pipeline/src/tablesuffix.rs b/src/pipeline/src/tablesuffix.rs index 3c51dad980..8733bc3841 100644 --- a/src/pipeline/src/tablesuffix.rs +++ b/src/pipeline/src/tablesuffix.rs @@ -15,12 +15,12 @@ use dyn_fmt::AsStrFormatExt; use regex::Regex; use snafu::{ensure, OptionExt}; +use vrl::value::Value as VrlValue; use yaml_rust::Yaml; use crate::error::{ Error, InvalidTableSuffixTemplateSnafu, RequiredTableSuffixTemplateSnafu, Result, }; -use crate::Value; const REPLACE_KEY: &str = "{}"; @@ -47,22 +47,16 @@ pub(crate) struct TableSuffixTemplate { } impl TableSuffixTemplate { - pub fn apply(&self, val: &Value) -> Option { + pub fn apply(&self, val: &VrlValue) -> Option { + let val = val.as_object()?; let values = self .keys .iter() .filter_map(|key| { - let v = val.get(key)?; + let v = val.get(key.as_str())?; match v { - Value::Int8(v) => Some(v.to_string()), - Value::Int16(v) => Some(v.to_string()), - Value::Int32(v) => Some(v.to_string()), - Value::Int64(v) => Some(v.to_string()), - Value::Uint8(v) => Some(v.to_string()), - Value::Uint16(v) => Some(v.to_string()), - Value::Uint32(v) => Some(v.to_string()), - Value::Uint64(v) => Some(v.to_string()), - Value::String(v) => Some(v.clone()), + VrlValue::Integer(v) => Some(v.to_string()), + VrlValue::Bytes(v) => Some(String::from_utf8_lossy_owned(v.to_vec())), _ => None, } }) diff --git a/src/pipeline/tests/common.rs b/src/pipeline/tests/common.rs index 5285053861..ac53308d45 100644 --- a/src/pipeline/tests/common.rs +++ b/src/pipeline/tests/common.rs @@ -13,11 +13,12 @@ // limitations under the License. use greptime_proto::v1::{ColumnDataType, ColumnSchema, Rows, SemanticType}; -use pipeline::{json_to_map, parse, setup_pipeline, Content, Pipeline, PipelineContext}; +use pipeline::{parse, setup_pipeline, Content, Pipeline, PipelineContext}; +use vrl::value::Value as VrlValue; /// test util function to parse and execute pipeline pub fn parse_and_exec(input_str: &str, pipeline_yaml: &str) -> Rows { - let input_value = serde_json::from_str::(input_str).unwrap(); + let input_value = serde_json::from_str::(input_str).unwrap(); let yaml_content = Content::Yaml(pipeline_yaml); let pipeline: Pipeline = parse(&yaml_content).expect("failed to parse pipeline"); @@ -32,21 +33,19 @@ pub fn parse_and_exec(input_str: &str, pipeline_yaml: &str) -> Rows { let mut rows = Vec::new(); match input_value { - serde_json::Value::Array(array) => { + VrlValue::Array(array) => { for value in array { - let intermediate_status = json_to_map(value).unwrap(); let row = pipeline - .exec_mut(intermediate_status, &pipeline_ctx, &mut schema_info) + .exec_mut(value, &pipeline_ctx, &mut schema_info) .expect("failed to exec pipeline") .into_transformed() .expect("expect transformed result "); rows.push(row.0); } } - serde_json::Value::Object(_) => { - let intermediate_status = json_to_map(input_value).unwrap(); + VrlValue::Object(_) => { let row = pipeline - .exec_mut(intermediate_status, &pipeline_ctx, &mut schema_info) + .exec_mut(input_value, &pipeline_ctx, &mut schema_info) .expect("failed to exec pipeline") .into_transformed() .expect("expect transformed result "); diff --git a/src/pipeline/tests/dissect.rs b/src/pipeline/tests/dissect.rs index 7eb3df749f..1af3ee3c23 100644 --- a/src/pipeline/tests/dissect.rs +++ b/src/pipeline/tests/dissect.rs @@ -16,7 +16,7 @@ mod common; use greptime_proto::v1::value::ValueData::StringValue; use greptime_proto::v1::{ColumnDataType, SemanticType}; -use pipeline::{json_to_map, setup_pipeline, PipelineContext}; +use pipeline::{setup_pipeline, PipelineContext}; fn make_string_column_schema(name: String) -> greptime_proto::v1::ColumnSchema { common::make_column_schema(name, ColumnDataType::String, SemanticType::Field) @@ -282,7 +282,7 @@ transform: session::context::Channel::Unknown, ); - let result = json_to_map(input_value).unwrap(); + let result = input_value.into(); let row = pipeline.exec_mut(result, &pipeline_ctx, &mut schema_info); diff --git a/src/pipeline/tests/pipeline.rs b/src/pipeline/tests/pipeline.rs index 16146cbc2a..e10230c79d 100644 --- a/src/pipeline/tests/pipeline.rs +++ b/src/pipeline/tests/pipeline.rs @@ -20,7 +20,7 @@ use greptime_proto::v1::value::ValueData::{ U32Value, U64Value, U8Value, }; use greptime_proto::v1::Value as GreptimeValue; -use pipeline::{json_to_map, parse, setup_pipeline, Content, Pipeline, PipelineContext}; +use pipeline::{parse, setup_pipeline, Content, Pipeline, PipelineContext}; #[test] fn test_complex_data() { @@ -425,7 +425,7 @@ transform: &pipeline_param, session::context::Channel::Unknown, ); - let stats = json_to_map(input_value).unwrap(); + let stats = input_value.into(); let row = pipeline .exec_mut(stats, &pipeline_ctx, &mut schema_info) @@ -500,7 +500,7 @@ transform: session::context::Channel::Unknown, ); - let status = json_to_map(input_value).unwrap(); + let status = input_value.into(); let row = pipeline .exec_mut(status, &pipeline_ctx, &mut schema_info) .unwrap() @@ -615,7 +615,7 @@ transform: session::context::Channel::Unknown, ); - let status = json_to_map(input_value).unwrap(); + let status = input_value.into(); let row = pipeline .exec_mut(status, &pipeline_ctx, &mut schema_info) .unwrap() @@ -687,7 +687,7 @@ transform: session::context::Channel::Unknown, ); - let status = json_to_map(input_value).unwrap(); + let status = input_value.into(); let row = pipeline .exec_mut(status, &pipeline_ctx, &mut schema_info) .unwrap() @@ -733,7 +733,7 @@ transform: session::context::Channel::Unknown, ); - let status = json_to_map(input_value).unwrap(); + let status = input_value.into(); let row = pipeline .exec_mut(status, &pipeline_ctx, &mut schema_info) .unwrap() @@ -798,7 +798,7 @@ transform: session::context::Channel::Unknown, ); - let status = json_to_map(input_value).unwrap(); + let status = input_value.into(); let row = pipeline .exec_mut(status, &pipeline_ctx, &mut schema_info) .unwrap() @@ -845,7 +845,7 @@ transform: session::context::Channel::Unknown, ); - let status = json_to_map(input_value).unwrap(); + let status = input_value.into(); let row = pipeline .exec_mut(status, &pipeline_ctx, &mut schema_info) .unwrap() @@ -913,7 +913,7 @@ transform: session::context::Channel::Unknown, ); - let status = json_to_map(input_value1).unwrap(); + let status = input_value1.into(); let dispatched_to = pipeline .exec_mut(status, &pipeline_ctx, &mut schema_info) .unwrap() @@ -922,7 +922,7 @@ transform: assert_eq!(dispatched_to.table_suffix, "http"); assert_eq!(dispatched_to.pipeline.unwrap(), "access_log_pipeline"); - let status = json_to_map(input_value2).unwrap(); + let status = input_value2.into(); let row = pipeline .exec_mut(status, &pipeline_ctx, &mut schema_info) .unwrap() @@ -983,7 +983,7 @@ table_suffix: _${logger} session::context::Channel::Unknown, ); - let status = json_to_map(input_value).unwrap(); + let status = input_value.into(); let exec_re = pipeline .exec_mut(status, &pipeline_ctx, &mut schema_info) .unwrap(); diff --git a/src/servers/Cargo.toml b/src/servers/Cargo.toml index 38a1ffac2d..ad9fd0988c 100644 --- a/src/servers/Cargo.toml +++ b/src/servers/Cargo.toml @@ -128,6 +128,7 @@ tower-http = { version = "0.6", features = ["full"] } tracing.workspace = true urlencoding = "2.1" uuid.workspace = true +vrl.workspace = true zstd.workspace = true [target.'cfg(not(windows))'.dependencies] diff --git a/src/servers/src/elasticsearch.rs b/src/servers/src/elasticsearch.rs index de8244ecec..bfcd13845d 100644 --- a/src/servers/src/elasticsearch.rs +++ b/src/servers/src/elasticsearch.rs @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +use std::collections::BTreeMap; use std::sync::Arc; use std::time::Instant; @@ -30,9 +31,10 @@ use pipeline::{ use serde_json::{json, Deserializer, Value}; use session::context::{Channel, QueryContext}; use snafu::{ensure, ResultExt}; +use vrl::value::Value as VrlValue; use crate::error::{ - status_code_to_http_status, InvalidElasticsearchInputSnafu, ParseJsonSnafu, PipelineSnafu, + status_code_to_http_status, InvalidElasticsearchInputSnafu, ParseJsonSnafu, Result as ServersResult, }; use crate::http::event::{ @@ -287,8 +289,8 @@ fn parse_bulk_request( msg_field: &Option, ) -> ServersResult> { // Read the ndjson payload and convert it to `Vec`. Return error if the input is not a valid JSON. - let values: Vec = Deserializer::from_str(input) - .into_iter::() + let values: Vec = Deserializer::from_str(input) + .into_iter::() .collect::>() .context(ParseJsonSnafu)?; @@ -307,12 +309,13 @@ fn parse_bulk_request( // For Elasticsearch post `_bulk` API, each chunk contains two objects: // 1. The first object is the command, it should be `create` or `index`. // 2. The second object is the document data. - while let Some(mut cmd) = values.next() { + while let Some(cmd) = values.next() { // NOTE: Although the native Elasticsearch API supports upsert in `index` command, we don't support change any data in `index` command and it's same as `create` command. - let index = if let Some(cmd) = cmd.get_mut("create") { - get_index_from_cmd(cmd.take())? - } else if let Some(cmd) = cmd.get_mut("index") { - get_index_from_cmd(cmd.take())? + let mut cmd = cmd.into_object(); + let index = if let Some(cmd) = cmd.as_mut().and_then(|c| c.remove("create")) { + get_index_from_cmd(cmd)? + } else if let Some(cmd) = cmd.as_mut().and_then(|c| c.remove("index")) { + get_index_from_cmd(cmd)? } else { return InvalidElasticsearchInputSnafu { reason: format!( @@ -339,7 +342,6 @@ fn parse_bulk_request( } ); - let log_value = pipeline::json_to_map(log_value).context(PipelineSnafu)?; requests.push(PipelineIngestRequest { table: index.unwrap_or_else(|| index_from_url.as_ref().unwrap().clone()), values: vec![log_value], @@ -357,39 +359,50 @@ fn parse_bulk_request( } // Get the index from the command. We will take index as the table name in GreptimeDB. -fn get_index_from_cmd(mut v: Value) -> ServersResult> { - if let Some(index) = v.get_mut("_index") { - if let Value::String(index) = index.take() { - Ok(Some(index)) - } else { - // If the `_index` exists, it should be a string. - InvalidElasticsearchInputSnafu { - reason: "index is not a string in bulk request".to_string(), - } - .fail() - } +fn get_index_from_cmd(v: VrlValue) -> ServersResult> { + let Some(index) = v.into_object().and_then(|mut m| m.remove("_index")) else { + return Ok(None); + }; + + if let VrlValue::Bytes(index) = index { + Ok(Some(String::from_utf8_lossy(&index).to_string())) } else { - Ok(None) + // If the `_index` exists, it should be a string. + InvalidElasticsearchInputSnafu { + reason: "index is not a string in bulk request", + } + .fail() } } // If the msg_field is provided, fetch the value of the field from the document data. // For example, if the `msg_field` is `message`, and the document data is `{"message":"hello"}`, the log value will be Value::String("hello"). -fn get_log_value_from_msg_field(mut v: Value, msg_field: &str) -> Value { - if let Some(message) = v.get_mut(msg_field) { - let message = message.take(); +fn get_log_value_from_msg_field(v: VrlValue, msg_field: &str) -> VrlValue { + let VrlValue::Object(mut m) = v else { + return v; + }; + + if let Some(message) = m.remove(msg_field) { match message { - Value::String(s) => match serde_json::from_str::(&s) { - Ok(s) => s, - // If the message is not a valid JSON, return a map with the original message key and value. - Err(_) => json!({msg_field: s}), - }, + VrlValue::Bytes(bytes) => { + match serde_json::from_slice::(&bytes) { + Ok(v) => v, + // If the message is not a valid JSON, return a map with the original message key and value. + Err(_) => { + let map = BTreeMap::from([( + msg_field.to_string().into(), + VrlValue::Bytes(bytes), + )]); + VrlValue::Object(map) + } + } + } // If the message is not a string, just use the original message as the log value. _ => message, } } else { // If the msg_field is not found, just use the original message as the log value. - v + VrlValue::Object(m) } } @@ -414,12 +427,14 @@ mod tests { PipelineIngestRequest { table: "test".to_string(), values: vec![ - pipeline::json_to_map(json!({"foo1": "foo1_value", "bar1": "bar1_value"})).unwrap(), + json!({"foo1": "foo1_value", "bar1": "bar1_value"}).into(), ], }, PipelineIngestRequest { table: "test".to_string(), - values: vec![pipeline::json_to_map(json!({"foo2": "foo2_value", "bar2": "bar2_value"})).unwrap()], + values: vec![ + json!({"foo2": "foo2_value", "bar2": "bar2_value"}).into(), + ], }, ]), ), @@ -436,11 +451,15 @@ mod tests { Ok(vec![ PipelineIngestRequest { table: "test".to_string(), - values: vec![pipeline::json_to_map(json!({"foo1": "foo1_value", "bar1": "bar1_value"})).unwrap()], + values: vec![ + json!({"foo1": "foo1_value", "bar1": "bar1_value"}).into(), + ], }, PipelineIngestRequest { table: "logs".to_string(), - values: vec![pipeline::json_to_map(json!({"foo2": "foo2_value", "bar2": "bar2_value"})).unwrap()], + values: vec![ + json!({"foo2": "foo2_value", "bar2": "bar2_value"}).into(), + ], }, ]), ), @@ -457,11 +476,15 @@ mod tests { Ok(vec![ PipelineIngestRequest { table: "test".to_string(), - values: vec![pipeline::json_to_map(json!({"foo1": "foo1_value", "bar1": "bar1_value"})).unwrap()], + values: vec![ + json!({"foo1": "foo1_value", "bar1": "bar1_value"}).into(), + ], }, PipelineIngestRequest { table: "logs".to_string(), - values: vec![pipeline::json_to_map(json!({"foo2": "foo2_value", "bar2": "bar2_value"})).unwrap()], + values: vec![ + json!({"foo2": "foo2_value", "bar2": "bar2_value"}).into(), + ], }, ]), ), @@ -477,7 +500,9 @@ mod tests { Ok(vec![ PipelineIngestRequest { table: "test".to_string(), - values: vec![pipeline::json_to_map(json!({"foo1": "foo1_value", "bar1": "bar1_value"})).unwrap()], + values: vec![ + json!({"foo1": "foo1_value", "bar1": "bar1_value"}).into(), + ], }, ]), ), @@ -494,11 +519,15 @@ mod tests { Ok(vec![ PipelineIngestRequest { table: "test".to_string(), - values: vec![pipeline::json_to_map(json!({"foo1": "foo1_value", "bar1": "bar1_value"})).unwrap()], + values: vec![ + json!({"foo1": "foo1_value", "bar1": "bar1_value"}).into(), + ], }, PipelineIngestRequest { table: "test".to_string(), - values: vec![pipeline::json_to_map(json!({"foo2": "foo2_value", "bar2": "bar2_value"})).unwrap()], + values: vec![ + json!({"foo2": "foo2_value", "bar2": "bar2_value"}).into(), + ], }, ]), ), @@ -516,13 +545,13 @@ mod tests { PipelineIngestRequest { table: "logs-generic-default".to_string(), values: vec![ - pipeline::json_to_map(json!({"message": "172.16.0.1 - - [25/May/2024:20:19:37 +0000] \"GET /contact HTTP/1.1\" 404 162 \"-\" \"Mozilla/5.0 (iPhone; CPU iPhone OS 14_0 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0 Mobile/15E148 Safari/604.1\""})).unwrap(), + json!({"message": "172.16.0.1 - - [25/May/2024:20:19:37 +0000] \"GET /contact HTTP/1.1\" 404 162 \"-\" \"Mozilla/5.0 (iPhone; CPU iPhone OS 14_0 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0 Mobile/15E148 Safari/604.1\""}).into(), ], }, PipelineIngestRequest { table: "logs-generic-default".to_string(), values: vec![ - pipeline::json_to_map(json!({"message": "10.0.0.1 - - [25/May/2024:20:18:37 +0000] \"GET /images/logo.png HTTP/1.1\" 304 0 \"-\" \"Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:89.0) Gecko/20100101 Firefox/89.0\""})).unwrap(), + json!({"message": "10.0.0.1 - - [25/May/2024:20:18:37 +0000] \"GET /images/logo.png HTTP/1.1\" 304 0 \"-\" \"Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:89.0) Gecko/20100101 Firefox/89.0\""}).into(), ], }, ]), diff --git a/src/servers/src/http/event.rs b/src/servers/src/http/event.rs index 0400e92140..f291cc2eef 100644 --- a/src/servers/src/http/event.rs +++ b/src/servers/src/http/event.rs @@ -35,14 +35,14 @@ use headers::ContentType; use lazy_static::lazy_static; use mime_guess::mime; use pipeline::util::to_pipeline_version; -use pipeline::{ - ContextReq, GreptimePipelineParams, PipelineContext, PipelineDefinition, Value as PipelineValue, -}; +use pipeline::{ContextReq, GreptimePipelineParams, PipelineContext, PipelineDefinition}; use serde::{Deserialize, Serialize}; use serde_json::{json, Deserializer, Map, Value as JsonValue}; use session::context::{Channel, QueryContext, QueryContextRef}; +use simd_json::Buffers; use snafu::{ensure, OptionExt, ResultExt}; use strum::{EnumIter, IntoEnumIterator}; +use vrl::value::{KeyString, Value as VrlValue}; use crate::error::{ status_code_to_http_status, Error, InvalidParameterSnafu, ParseJsonSnafu, PipelineSnafu, Result, @@ -117,7 +117,7 @@ pub(crate) struct PipelineIngestRequest { /// The table where the log data will be written to. pub table: String, /// The log data to be ingested. - pub values: Vec, + pub values: Vec, } pub struct PipelineContent(String); @@ -295,18 +295,18 @@ pub async fn delete_pipeline( /// Transform NDJSON array into a single array /// always return an array fn transform_ndjson_array_factory( - values: impl IntoIterator>, + values: impl IntoIterator>, ignore_error: bool, -) -> Result> { +) -> Result> { values .into_iter() .try_fold(Vec::with_capacity(100), |mut acc_array, item| match item { Ok(item_value) => { match item_value { - JsonValue::Array(item_array) => { + VrlValue::Array(item_array) => { acc_array.extend(item_array); } - JsonValue::Object(_) => { + VrlValue::Object(_) => { acc_array.push(item_value); } _ => { @@ -331,7 +331,7 @@ fn transform_ndjson_array_factory( /// Dryrun pipeline with given data async fn dryrun_pipeline_inner( - value: Vec, + value: Vec, pipeline: Arc, pipeline_handler: PipelineHandlerRef, query_ctx: &QueryContextRef, @@ -494,7 +494,7 @@ fn add_step_info_for_pipeline_dryrun_error(step_msg: &str, e: Error) -> Response /// Parse the data with given content type /// If the content type is invalid, return error /// content type is one of application/json, text/plain, application/x-ndjson -fn parse_dryrun_data(data_type: String, data: String) -> Result> { +fn parse_dryrun_data(data_type: String, data: String) -> Result> { if let Ok(content_type) = ContentType::from_str(&data_type) { extract_pipeline_value_by_content_type(content_type, Bytes::from(data), false) } else { @@ -741,17 +741,15 @@ impl<'a> TryFrom<&'a ContentType> for EventPayloadResolver<'a> { } impl EventPayloadResolver<'_> { - fn parse_payload(&self, payload: Bytes, ignore_errors: bool) -> Result> { + fn parse_payload(&self, payload: Bytes, ignore_errors: bool) -> Result> { match self.inner { - EventPayloadResolverInner::Json => { - pipeline::json_array_to_map(transform_ndjson_array_factory( - Deserializer::from_slice(&payload).into_iter(), - ignore_errors, - )?) - .context(PipelineSnafu) - } + EventPayloadResolverInner::Json => transform_ndjson_array_factory( + Deserializer::from_slice(&payload).into_iter(), + ignore_errors, + ), EventPayloadResolverInner::Ndjson => { let mut result = Vec::with_capacity(1000); + let mut buffer = Buffers::new(1000); for (index, line) in payload.lines().enumerate() { let mut line = match line { Ok(line) if !line.is_empty() => line, @@ -768,8 +766,10 @@ impl EventPayloadResolver<'_> { // simd_json, according to description, only de-escapes string at character level, // like any other json parser. So it should be safe here. - if let Ok(v) = simd_json::to_owned_value(unsafe { line.as_bytes_mut() }) { - let v = pipeline::simd_json_to_map(v).context(PipelineSnafu)?; + if let Ok(v) = simd_json::serde::from_slice_with_buffers( + unsafe { line.as_bytes_mut() }, + &mut buffer, + ) { result.push(v); } else if !ignore_errors { warn!("invalid JSON at index: {}, content: {:?}", index, line); @@ -787,8 +787,11 @@ impl EventPayloadResolver<'_> { .filter_map(|line| line.ok().filter(|line| !line.is_empty())) .map(|line| { let mut map = BTreeMap::new(); - map.insert("message".to_string(), PipelineValue::String(line)); - PipelineValue::Map(map.into()) + map.insert( + KeyString::from("message"), + VrlValue::Bytes(Bytes::from(line)), + ); + VrlValue::Object(map) }) .collect::>(); Ok(result) @@ -801,7 +804,7 @@ fn extract_pipeline_value_by_content_type( content_type: ContentType, payload: Bytes, ignore_errors: bool, -) -> Result> { +) -> Result> { EventPayloadResolver::try_from(&content_type).and_then(|resolver| { resolver .parse_payload(payload, ignore_errors) @@ -899,36 +902,37 @@ pub struct LogState { #[cfg(test)] mod tests { + use super::*; #[test] fn test_transform_ndjson() { let s = "{\"a\": 1}\n{\"b\": 2}"; - let a = JsonValue::Array( - transform_ndjson_array_factory(Deserializer::from_str(s).into_iter(), false).unwrap(), + let a = serde_json::to_string( + &transform_ndjson_array_factory(Deserializer::from_str(s).into_iter(), false).unwrap(), ) - .to_string(); + .unwrap(); assert_eq!(a, "[{\"a\":1},{\"b\":2}]"); let s = "{\"a\": 1}"; - let a = JsonValue::Array( - transform_ndjson_array_factory(Deserializer::from_str(s).into_iter(), false).unwrap(), + let a = serde_json::to_string( + &transform_ndjson_array_factory(Deserializer::from_str(s).into_iter(), false).unwrap(), ) - .to_string(); + .unwrap(); assert_eq!(a, "[{\"a\":1}]"); let s = "[{\"a\": 1}]"; - let a = JsonValue::Array( - transform_ndjson_array_factory(Deserializer::from_str(s).into_iter(), false).unwrap(), + let a = serde_json::to_string( + &transform_ndjson_array_factory(Deserializer::from_str(s).into_iter(), false).unwrap(), ) - .to_string(); + .unwrap(); assert_eq!(a, "[{\"a\":1}]"); let s = "[{\"a\": 1}, {\"b\": 2}]"; - let a = JsonValue::Array( - transform_ndjson_array_factory(Deserializer::from_str(s).into_iter(), false).unwrap(), + let a = serde_json::to_string( + &transform_ndjson_array_factory(Deserializer::from_str(s).into_iter(), false).unwrap(), ) - .to_string(); + .unwrap(); assert_eq!(a, "[{\"a\":1},{\"b\":2}]"); } @@ -945,21 +949,18 @@ mod tests { let fail_rest = extract_pipeline_value_by_content_type(ContentType::json(), payload.clone(), true); assert!(fail_rest.is_ok()); - assert_eq!( - fail_rest.unwrap(), - pipeline::json_array_to_map(vec![json!({"a": 1})]).unwrap() - ); + assert_eq!(fail_rest.unwrap(), vec![json!({"a": 1}).into()]); let fail_only_wrong = extract_pipeline_value_by_content_type(NDJSON_CONTENT_TYPE.clone(), payload, true); assert!(fail_only_wrong.is_ok()); let mut map1 = BTreeMap::new(); - map1.insert("a".to_string(), PipelineValue::Uint64(1)); - let map1 = PipelineValue::Map(map1.into()); + map1.insert(KeyString::from("a"), VrlValue::Integer(1)); + let map1 = VrlValue::Object(map1); let mut map2 = BTreeMap::new(); - map2.insert("c".to_string(), PipelineValue::Uint64(1)); - let map2 = PipelineValue::Map(map2.into()); + map2.insert(KeyString::from("c"), VrlValue::Integer(1)); + let map2 = VrlValue::Object(map2); assert_eq!(fail_only_wrong.unwrap(), vec![map1, map2]); } } diff --git a/src/servers/src/http/loki.rs b/src/servers/src/http/loki.rs index 090b70fe92..a16702f77a 100644 --- a/src/servers/src/http/loki.rs +++ b/src/servers/src/http/loki.rs @@ -25,6 +25,7 @@ use axum::extract::State; use axum::Extension; use axum_extra::TypedHeader; use bytes::Bytes; +use chrono::DateTime; use common_query::prelude::GREPTIME_TIMESTAMP; use common_query::{Output, OutputData}; use common_telemetry::{error, warn}; @@ -39,6 +40,7 @@ use prost::Message; use quoted_string::test_utils::TestSpec; use session::context::{Channel, QueryContext}; use snafu::{ensure, OptionExt, ResultExt}; +use vrl::value::{KeyString, Value as VrlValue}; use crate::error::{ DecodeOtlpRequestSnafu, InvalidLokiLabelsSnafu, InvalidLokiPayloadSnafu, ParseJsonSnafu, @@ -197,7 +199,7 @@ pub async fn loki_ingest( } /// This is the holder of the loki lines parsed from json or protobuf. -/// The generic here is either [serde_json::Value] or [Vec]. +/// The generic here is either [VrlValue] or [Vec]. /// Depending on the target destination, this can be converted to [LokiRawItem] or [LokiPipeline]. pub struct LokiMiddleItem { pub ts: i64, @@ -218,7 +220,7 @@ pub struct LokiRawItem { /// This is the line item prepared for the pipeline engine. pub struct LokiPipeline { - pub map: pipeline::Value, + pub map: VrlValue, } /// This is the flow of the Loki ingestion. @@ -255,7 +257,7 @@ pub struct LokiPipeline { /// +------------------+ +---------------------+ fn extract_item(content_type: ContentType, bytes: Bytes) -> Result>> where - LokiMiddleItem: Into, + LokiMiddleItem: Into, LokiMiddleItem>: Into, { match content_type { @@ -270,15 +272,14 @@ where } struct LokiJsonParser { - pub streams: VecDeque, + pub streams: VecDeque, } impl LokiJsonParser { pub fn from_bytes(bytes: Bytes) -> Result { - let payload: serde_json::Value = - serde_json::from_slice(bytes.as_ref()).context(ParseJsonSnafu)?; + let payload: VrlValue = serde_json::from_slice(bytes.as_ref()).context(ParseJsonSnafu)?; - let serde_json::Value::Object(mut map) = payload else { + let VrlValue::Object(mut map) = payload else { return InvalidLokiPayloadSnafu { msg: "payload is not an object", } @@ -289,7 +290,7 @@ impl LokiJsonParser { msg: "missing streams", })?; - let serde_json::Value::Array(streams) = streams else { + let VrlValue::Array(streams) = streams else { return InvalidLokiPayloadSnafu { msg: "streams is not an array", } @@ -308,7 +309,7 @@ impl Iterator for LokiJsonParser { fn next(&mut self) -> Option { while let Some(stream) = self.streams.pop_front() { // get lines from the map - let serde_json::Value::Object(mut map) = stream else { + let VrlValue::Object(mut map) = stream else { warn!("stream is not an object, {:?}", stream); continue; }; @@ -316,7 +317,7 @@ impl Iterator for LokiJsonParser { warn!("missing lines on stream, {:?}", map); continue; }; - let serde_json::Value::Array(lines) = lines else { + let VrlValue::Array(lines) = lines else { warn!("lines is not an array, {:?}", lines); continue; }; @@ -325,13 +326,15 @@ impl Iterator for LokiJsonParser { let labels = map .remove(LABEL_KEY) .and_then(|m| match m { - serde_json::Value::Object(labels) => Some(labels), + VrlValue::Object(labels) => Some(labels), _ => None, }) .map(|m| { m.into_iter() .filter_map(|(k, v)| match v { - serde_json::Value::String(v) => Some((k, v)), + VrlValue::Bytes(v) => { + Some((k.into(), String::from_utf8_lossy(&v).to_string())) + } _ => None, }) .collect::>() @@ -347,16 +350,16 @@ impl Iterator for LokiJsonParser { } struct JsonStreamItem { - pub lines: VecDeque, + pub lines: VecDeque, pub labels: Option>, } impl Iterator for JsonStreamItem { - type Item = LokiMiddleItem; + type Item = LokiMiddleItem; fn next(&mut self) -> Option { while let Some(line) = self.lines.pop_front() { - let serde_json::Value::Array(line) = line else { + let VrlValue::Array(line) = line else { warn!("line is not an array, {:?}", line); continue; }; @@ -364,11 +367,11 @@ impl Iterator for JsonStreamItem { warn!("line is too short, {:?}", line); continue; } - let mut line: VecDeque = line.into(); + let mut line: VecDeque = line.into(); // get ts let ts = line.pop_front().and_then(|ts| match ts { - serde_json::Value::String(ts) => ts.parse::().ok(), + VrlValue::Bytes(ts) => String::from_utf8_lossy(&ts).parse::().ok(), _ => { warn!("missing or invalid timestamp, {:?}", ts); None @@ -379,7 +382,7 @@ impl Iterator for JsonStreamItem { }; let line_text = line.pop_front().and_then(|l| match l { - serde_json::Value::String(l) => Some(l), + VrlValue::Bytes(l) => Some(String::from_utf8_lossy(&l).to_string()), _ => { warn!("missing or invalid line, {:?}", l); None @@ -402,8 +405,8 @@ impl Iterator for JsonStreamItem { } } -impl From> for LokiRawItem { - fn from(val: LokiMiddleItem) -> Self { +impl From> for LokiRawItem { + fn from(val: LokiMiddleItem) -> Self { let LokiMiddleItem { ts, line, @@ -413,13 +416,16 @@ impl From> for LokiRawItem { let structured_metadata = structured_metadata .and_then(|m| match m { - serde_json::Value::Object(m) => Some(m), + VrlValue::Object(m) => Some(m), _ => None, }) .map(|m| { m.into_iter() .filter_map(|(k, v)| match v { - serde_json::Value::String(v) => Some((k, Value::String(v.into()))), + VrlValue::Bytes(bytes) => Some(( + k.into(), + Value::String(String::from_utf8_lossy(&bytes).to_string().into()), + )), _ => None, }) .collect::>() @@ -436,8 +442,8 @@ impl From> for LokiRawItem { } } -impl From> for LokiPipeline { - fn from(value: LokiMiddleItem) -> Self { +impl From> for LokiPipeline { + fn from(value: LokiMiddleItem) -> Self { let LokiMiddleItem { ts, line, @@ -447,37 +453,33 @@ impl From> for LokiPipeline { let mut map = BTreeMap::new(); map.insert( - GREPTIME_TIMESTAMP.to_string(), - pipeline::Value::Timestamp(pipeline::Timestamp::Nanosecond(ts)), + KeyString::from(GREPTIME_TIMESTAMP), + VrlValue::Timestamp(DateTime::from_timestamp_nanos(ts)), ); map.insert( - LOKI_LINE_COLUMN_NAME.to_string(), - pipeline::Value::String(line), + KeyString::from(LOKI_LINE_COLUMN_NAME), + VrlValue::Bytes(line.into()), ); - if let Some(serde_json::Value::Object(m)) = structured_metadata { + if let Some(VrlValue::Object(m)) = structured_metadata { for (k, v) in m { - match pipeline::Value::try_from(v) { - Ok(v) => { - map.insert(format!("{}{}", LOKI_PIPELINE_METADATA_PREFIX, k), v); - } - Err(e) => { - warn!("not a valid value, {:?}", e); - } - } + map.insert( + KeyString::from(format!("{}{}", LOKI_PIPELINE_METADATA_PREFIX, k)), + v, + ); } } if let Some(v) = labels { v.into_iter().for_each(|(k, v)| { map.insert( - format!("{}{}", LOKI_PIPELINE_LABEL_PREFIX, k), - pipeline::Value::String(v), + KeyString::from(format!("{}{}", LOKI_PIPELINE_LABEL_PREFIX, k)), + VrlValue::Bytes(v.into()), ); }); } LokiPipeline { - map: pipeline::Value::Map(pipeline::Map::from(map)), + map: VrlValue::Object(map), } } } @@ -584,12 +586,12 @@ impl From>> for LokiPipeline { let mut map = BTreeMap::new(); map.insert( - GREPTIME_TIMESTAMP.to_string(), - pipeline::Value::Timestamp(pipeline::Timestamp::Nanosecond(ts)), + KeyString::from(GREPTIME_TIMESTAMP), + VrlValue::Timestamp(DateTime::from_timestamp_nanos(ts)), ); map.insert( - LOKI_LINE_COLUMN_NAME.to_string(), - pipeline::Value::String(line), + KeyString::from(LOKI_LINE_COLUMN_NAME), + VrlValue::Bytes(line.into()), ); structured_metadata @@ -597,22 +599,22 @@ impl From>> for LokiPipeline { .into_iter() .for_each(|d| { map.insert( - format!("{}{}", LOKI_PIPELINE_METADATA_PREFIX, d.name), - pipeline::Value::String(d.value), + KeyString::from(format!("{}{}", LOKI_PIPELINE_METADATA_PREFIX, d.name)), + VrlValue::Bytes(d.value.into()), ); }); if let Some(v) = labels { v.into_iter().for_each(|(k, v)| { map.insert( - format!("{}{}", LOKI_PIPELINE_LABEL_PREFIX, k), - pipeline::Value::String(v), + KeyString::from(format!("{}{}", LOKI_PIPELINE_LABEL_PREFIX, k)), + VrlValue::Bytes(v.into()), ); }); } LokiPipeline { - map: pipeline::Value::Map(pipeline::Map::from(map)), + map: VrlValue::Object(map), } } } diff --git a/src/servers/src/interceptor.rs b/src/servers/src/interceptor.rs index 750924142a..c366fd2efd 100644 --- a/src/servers/src/interceptor.rs +++ b/src/servers/src/interceptor.rs @@ -23,10 +23,10 @@ use common_error::ext::ErrorExt; use common_query::Output; use datafusion_expr::LogicalPlan; use log_query::LogQuery; -use pipeline::Value; use query::parser::PromQuery; use session::context::QueryContextRef; use sql::statements::statement::Statement; +use vrl::value::Value; /// SqlQueryInterceptor can track life cycle of a sql query and customize or /// abort its execution at given point. diff --git a/src/servers/src/otlp/logs.rs b/src/servers/src/otlp/logs.rs index 59d2230934..20c1dd7bae 100644 --- a/src/servers/src/otlp/logs.rs +++ b/src/servers/src/otlp/logs.rs @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -use std::collections::HashMap as StdHashMap; +use std::collections::{BTreeMap, HashMap as StdHashMap}; use api::v1::column_data_type_extension::TypeExt; use api::v1::value::ValueData; @@ -20,6 +20,7 @@ use api::v1::{ ColumnDataType, ColumnDataTypeExtension, ColumnOptions, ColumnSchema, JsonTypeExtension, Row, RowInsertRequest, Rows, SemanticType, Value as GreptimeValue, }; +use bytes::Bytes; use jsonb::{Number as JsonbNumber, Value as JsonbValue}; use opentelemetry_proto::tonic::collector::logs::v1::ExportLogsServiceRequest; use opentelemetry_proto::tonic::common::v1::{any_value, AnyValue, InstrumentationScope, KeyValue}; @@ -27,13 +28,13 @@ use opentelemetry_proto::tonic::logs::v1::{LogRecord, ResourceLogs, ScopeLogs}; use pipeline::{ ContextReq, GreptimePipelineParams, PipelineContext, PipelineWay, SchemaInfo, SelectInfo, }; -use serde_json::{Map, Value}; use session::context::QueryContextRef; -use snafu::{ensure, ResultExt}; +use snafu::ensure; +use vrl::prelude::NotNan; +use vrl::value::{KeyString, Value as VrlValue}; use crate::error::{ - IncompatibleSchemaSnafu, NotSupportedSnafu, PipelineSnafu, Result, - UnsupportedJsonDataTypeForTagSnafu, + IncompatibleSchemaSnafu, NotSupportedSnafu, Result, UnsupportedJsonDataTypeForTagSnafu, }; use crate::http::event::PipelineIngestRequest; use crate::otlp::trace::attributes::OtlpAnyValue; @@ -69,8 +70,7 @@ pub async fn to_grpc_insert_requests( Ok(ContextReq::default_opt_with_reqs(vec![insert_request])) } PipelineWay::Pipeline(pipeline_def) => { - let data = parse_export_logs_service_request(request); - let array = pipeline::json_array_to_map(data).context(PipelineSnafu)?; + let array = parse_export_logs_service_request(request); let pipeline_ctx = PipelineContext::new(&pipeline_def, &pipeline_params, query_ctx.channel()); @@ -93,16 +93,16 @@ pub async fn to_grpc_insert_requests( } } -fn scope_to_pipeline_value(scope: Option) -> (Value, Value, Value) { +fn scope_to_pipeline_value(scope: Option) -> (VrlValue, VrlValue, VrlValue) { scope .map(|x| { ( - Value::Object(key_value_to_map(x.attributes)), - Value::String(x.version), - Value::String(x.name), + VrlValue::Object(key_value_to_map(x.attributes)), + VrlValue::Bytes(x.version.into()), + VrlValue::Bytes(x.name.into()), ) }) - .unwrap_or((Value::Null, Value::Null, Value::Null)) + .unwrap_or((VrlValue::Null, VrlValue::Null, VrlValue::Null)) } fn scope_to_jsonb( @@ -121,53 +121,59 @@ fn scope_to_jsonb( fn log_to_pipeline_value( log: LogRecord, - resource_schema_url: Value, - resource_attr: Value, - scope_schema_url: Value, - scope_name: Value, - scope_version: Value, - scope_attrs: Value, -) -> Value { - let log_attrs = Value::Object(key_value_to_map(log.attributes)); - let mut map = Map::new(); - map.insert("Timestamp".to_string(), Value::from(log.time_unix_nano)); + resource_schema_url: VrlValue, + resource_attr: VrlValue, + scope_schema_url: VrlValue, + scope_name: VrlValue, + scope_version: VrlValue, + scope_attrs: VrlValue, +) -> VrlValue { + let log_attrs = VrlValue::Object(key_value_to_map(log.attributes)); + let mut map = BTreeMap::new(); map.insert( - "ObservedTimestamp".to_string(), - Value::from(log.observed_time_unix_nano), + "Timestamp".into(), + VrlValue::Integer(log.time_unix_nano as i64), + ); + map.insert( + "ObservedTimestamp".into(), + VrlValue::Integer(log.observed_time_unix_nano as i64), ); // need to be convert to string map.insert( - "TraceId".to_string(), - Value::String(bytes_to_hex_string(&log.trace_id)), + "TraceId".into(), + VrlValue::Bytes(bytes_to_hex_string(&log.trace_id).into()), ); map.insert( - "SpanId".to_string(), - Value::String(bytes_to_hex_string(&log.span_id)), + "SpanId".into(), + VrlValue::Bytes(bytes_to_hex_string(&log.span_id).into()), ); - map.insert("TraceFlags".to_string(), Value::from(log.flags)); - map.insert("SeverityText".to_string(), Value::String(log.severity_text)); + map.insert("TraceFlags".into(), VrlValue::Integer(log.flags as i64)); map.insert( - "SeverityNumber".to_string(), - Value::from(log.severity_number), + "SeverityText".into(), + VrlValue::Bytes(log.severity_text.into()), + ); + map.insert( + "SeverityNumber".into(), + VrlValue::Integer(log.severity_number as i64), ); // need to be convert to string map.insert( - "Body".to_string(), + "Body".into(), log.body .as_ref() - .map(|x| Value::String(log_body_to_string(x))) - .unwrap_or(Value::Null), + .map(|x| VrlValue::Bytes(log_body_to_string(x).into())) + .unwrap_or(VrlValue::Null), ); - map.insert("ResourceSchemaUrl".to_string(), resource_schema_url); + map.insert("ResourceSchemaUrl".into(), resource_schema_url); - map.insert("ResourceAttributes".to_string(), resource_attr); - map.insert("ScopeSchemaUrl".to_string(), scope_schema_url); - map.insert("ScopeName".to_string(), scope_name); - map.insert("ScopeVersion".to_string(), scope_version); - map.insert("ScopeAttributes".to_string(), scope_attrs); - map.insert("LogAttributes".to_string(), log_attrs); - Value::Object(map) + map.insert("ResourceAttributes".into(), resource_attr); + map.insert("ScopeSchemaUrl".into(), scope_schema_url); + map.insert("ScopeName".into(), scope_name); + map.insert("ScopeVersion".into(), scope_version); + map.insert("ScopeAttributes".into(), scope_attrs); + map.insert("LogAttributes".into(), log_attrs); + VrlValue::Object(map) } fn build_otlp_logs_identity_schema() -> Vec { @@ -622,18 +628,18 @@ fn merge_values( /// transform otlp logs request to pipeline value /// https://opentelemetry.io/docs/concepts/signals/logs/ -fn parse_export_logs_service_request(request: ExportLogsServiceRequest) -> Vec { +fn parse_export_logs_service_request(request: ExportLogsServiceRequest) -> Vec { let mut result = Vec::new(); for r in request.resource_logs { let resource_attr = r .resource - .map(|x| Value::Object(key_value_to_map(x.attributes))) - .unwrap_or(Value::Null); - let resource_schema_url = Value::String(r.schema_url); + .map(|x| VrlValue::Object(key_value_to_map(x.attributes))) + .unwrap_or(VrlValue::Null); + let resource_schema_url = VrlValue::Bytes(r.schema_url.into()); for scope_logs in r.scope_logs { let (scope_attrs, scope_version, scope_name) = scope_to_pipeline_value(scope_logs.scope); - let scope_schema_url = Value::String(scope_logs.schema_url); + let scope_schema_url = VrlValue::Bytes(scope_logs.schema_url.into()); for log in scope_logs.log_records { let value = log_to_pipeline_value( log, @@ -652,43 +658,39 @@ fn parse_export_logs_service_request(request: ExportLogsServiceRequest) -> Vec Value { +fn any_value_to_vrl_value(value: any_value::Value) -> VrlValue { match value { - any_value::Value::StringValue(s) => Value::String(s), - any_value::Value::IntValue(i) => Value::from(i), - any_value::Value::DoubleValue(d) => Value::from(d), - any_value::Value::BoolValue(b) => Value::Bool(b), - any_value::Value::ArrayValue(a) => { - let values = a + any_value::Value::StringValue(s) => VrlValue::Bytes(s.into()), + any_value::Value::IntValue(i) => VrlValue::Integer(i), + any_value::Value::DoubleValue(d) => VrlValue::Float(NotNan::new(d).unwrap()), + any_value::Value::BoolValue(b) => VrlValue::Boolean(b), + any_value::Value::ArrayValue(array_value) => { + let values = array_value .values .into_iter() - .map(|v| match v.value { - Some(value) => any_value_to_pipeline_value(value), - None => Value::Null, - }) + .filter_map(|v| v.value.map(any_value_to_vrl_value)) .collect(); - Value::Array(values) + VrlValue::Array(values) } - any_value::Value::KvlistValue(kv) => { - let value = key_value_to_map(kv.values); - Value::Object(value) + any_value::Value::KvlistValue(key_value_list) => { + VrlValue::Object(key_value_to_map(key_value_list.values)) } - any_value::Value::BytesValue(b) => Value::String(bytes_to_hex_string(&b)), + any_value::Value::BytesValue(items) => VrlValue::Bytes(Bytes::from(items)), } } // convert otlp keyValue vec to map -fn key_value_to_map(key_values: Vec) -> Map { - let mut map = Map::new(); +fn key_value_to_map(key_values: Vec) -> BTreeMap { + let mut map = BTreeMap::new(); for kv in key_values { let value = match kv.value { Some(value) => match value.value { - Some(value) => any_value_to_pipeline_value(value), - None => Value::Null, + Some(value) => any_value_to_vrl_value(value), + None => VrlValue::Null, }, - None => Value::Null, + None => VrlValue::Null, }; - map.insert(kv.key.clone(), value); + map.insert(kv.key.into(), value); } map } diff --git a/src/servers/src/pipeline.rs b/src/servers/src/pipeline.rs index 2ddab66728..ea504ddf63 100644 --- a/src/servers/src/pipeline.rs +++ b/src/servers/src/pipeline.rs @@ -20,12 +20,13 @@ use api::greptime_proto; use api::v1::{ColumnDataType, ColumnSchema, RowInsertRequest, Rows, SemanticType}; use common_time::timestamp::TimeUnit; use pipeline::{ - unwrap_or_continue_if_err, ContextReq, DispatchedTo, Pipeline, PipelineContext, - PipelineDefinition, PipelineExecOutput, SchemaInfo, TransformedOutput, TransformerMode, Value, - GREPTIME_INTERNAL_IDENTITY_PIPELINE_NAME, + identity_pipeline, unwrap_or_continue_if_err, ContextReq, DispatchedTo, Pipeline, + PipelineContext, PipelineDefinition, PipelineExecOutput, SchemaInfo, TransformedOutput, + TransformerMode, GREPTIME_INTERNAL_IDENTITY_PIPELINE_NAME, }; use session::context::{Channel, QueryContextRef}; use snafu::ResultExt; +use vrl::value::Value as VrlValue; use crate::error::{CatalogSnafu, PipelineSnafu, Result}; use crate::http::event::PipelineIngestRequest; @@ -93,7 +94,7 @@ async fn run_identity_pipeline( .await .context(CatalogSnafu)? }; - pipeline::identity_pipeline(data_array, table, pipeline_ctx) + identity_pipeline(data_array, table, pipeline_ctx) .map(|opt_map| ContextReq::from_opt_map(opt_map, table_name)) .context(PipelineSnafu) } @@ -117,7 +118,7 @@ async fn run_custom_pipeline( } = pipeline_req; let arr_len = pipeline_maps.len(); let mut transformed_map = HashMap::new(); - let mut dispatched: BTreeMap> = BTreeMap::new(); + let mut dispatched: BTreeMap> = BTreeMap::new(); let mut schema_info = match pipeline.transformer() { TransformerMode::GreptimeTransformer(greptime_transformer) => { diff --git a/src/servers/src/proto.rs b/src/servers/src/proto.rs index 1e9c2dfed8..94351cfa50 100644 --- a/src/servers/src/proto.rs +++ b/src/servers/src/proto.rs @@ -20,12 +20,15 @@ use std::slice; use api::prom_store::remote::Sample; use bytes::{Buf, Bytes}; use common_query::prelude::{GREPTIME_TIMESTAMP, GREPTIME_VALUE}; -use pipeline::{ContextReq, GreptimePipelineParams, PipelineContext, PipelineDefinition, Value}; +use common_telemetry::warn; +use pipeline::{ContextReq, GreptimePipelineParams, PipelineContext, PipelineDefinition}; use prost::encoding::message::merge; use prost::encoding::{decode_key, decode_varint, WireType}; use prost::DecodeError; use session::context::QueryContextRef; use snafu::OptionExt; +use vrl::prelude::NotNan; +use vrl::value::{KeyString, Value as VrlValue}; use crate::error::InternalSnafu; use crate::http::event::PipelineIngestRequest; @@ -342,7 +345,7 @@ impl PromWriteRequest { /// let's keep it that way for now. pub struct PromSeriesProcessor { pub(crate) use_pipeline: bool, - pub(crate) table_values: BTreeMap>, + pub(crate) table_values: BTreeMap>, // optional fields for pipeline pub(crate) pipeline_handler: Option, @@ -379,29 +382,33 @@ impl PromSeriesProcessor { series: &mut PromTimeSeries, prom_validation_mode: PromValidationMode, ) -> Result<(), DecodeError> { - let mut vec_pipeline_map: Vec = Vec::new(); + let mut vec_pipeline_map = Vec::new(); let mut pipeline_map = BTreeMap::new(); for l in series.labels.iter() { let name = prom_validation_mode.decode_string(&l.name)?; let value = prom_validation_mode.decode_string(&l.value)?; - pipeline_map.insert(name, Value::String(value)); + pipeline_map.insert(KeyString::from(name), VrlValue::Bytes(value.into())); } let one_sample = series.samples.len() == 1; for s in series.samples.iter() { - // skip NaN value - if s.value.is_nan() { + let Ok(value) = NotNan::new(s.value) else { + warn!("Invalid float value: {}", s.value); continue; - } + }; + let timestamp = s.timestamp; - pipeline_map.insert(GREPTIME_TIMESTAMP.to_string(), Value::Int64(timestamp)); - pipeline_map.insert(GREPTIME_VALUE.to_string(), Value::Float64(s.value)); + pipeline_map.insert( + KeyString::from(GREPTIME_TIMESTAMP), + VrlValue::Integer(timestamp), + ); + pipeline_map.insert(KeyString::from(GREPTIME_VALUE), VrlValue::Float(value)); if one_sample { - vec_pipeline_map.push(Value::Map(pipeline_map.into())); + vec_pipeline_map.push(VrlValue::Object(pipeline_map)); break; } else { - vec_pipeline_map.push(Value::Map(pipeline_map.clone().into())); + vec_pipeline_map.push(VrlValue::Object(pipeline_map.clone())); } }