From 89a3da8a3a1e639dc9534fb5ad5e9622103d9199 Mon Sep 17 00:00:00 2001 From: liyang Date: Fri, 6 Sep 2024 00:00:53 +0800 Subject: [PATCH 1/8] chore(dockerfile): remove mysql and postgresql clients in greptimedb image (#4685) --- docker/ci/ubuntu/Dockerfile | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/docker/ci/ubuntu/Dockerfile b/docker/ci/ubuntu/Dockerfile index cc3bed6f25..580b73e56f 100644 --- a/docker/ci/ubuntu/Dockerfile +++ b/docker/ci/ubuntu/Dockerfile @@ -11,9 +11,7 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y \ python3.10 \ python3.10-dev \ python3-pip \ - curl \ - mysql-client \ - postgresql-client + curl COPY $DOCKER_BUILD_ROOT/docker/python/requirements.txt /etc/greptime/requirements.txt From 114772ba8756f44cd9980a984dbe77285f57f30c Mon Sep 17 00:00:00 2001 From: "Lei, HUANG" <6406592+v0y4g3r@users.noreply.github.com> Date: Fri, 6 Sep 2024 10:31:41 +0800 Subject: [PATCH 2/8] chore: bump version v0.9.3 (#4684) --- Cargo.lock | 136 ++++++++++++++++++++++++++--------------------------- Cargo.toml | 2 +- 2 files changed, 69 insertions(+), 69 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 4d1aff23c6..a433b41841 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -214,7 +214,7 @@ checksum = "d301b3b94cb4b2f23d7917810addbbaff90738e0ca2be692bd027e70d7e0330c" [[package]] name = "api" -version = "0.9.2" +version = "0.9.3" dependencies = [ "common-base", "common-decimal", @@ -762,7 +762,7 @@ dependencies = [ [[package]] name = "auth" -version = "0.9.2" +version = "0.9.3" dependencies = [ "api", "async-trait", @@ -1286,7 +1286,7 @@ dependencies = [ [[package]] name = "cache" -version = "0.9.2" +version = "0.9.3" dependencies = [ "catalog", "common-error", @@ -1294,7 +1294,7 @@ dependencies = [ "common-meta", "moka", "snafu 0.8.4", - "substrait 0.9.2", + "substrait 0.9.3", ] [[package]] @@ -1321,7 +1321,7 @@ checksum = "37b2a672a2cb129a2e41c10b1224bb368f9f37a2b16b612598138befd7b37eb5" [[package]] name = "catalog" -version = "0.9.2" +version = "0.9.3" dependencies = [ "api", "arrow", @@ -1647,7 +1647,7 @@ checksum = "4b82cf0babdbd58558212896d1a4272303a57bdb245c2bf1147185fb45640e70" [[package]] name = "client" -version = "0.9.2" +version = "0.9.3" dependencies = [ "api", "arc-swap", @@ -1677,7 +1677,7 @@ dependencies = [ "serde_json", "snafu 0.8.4", "substrait 0.37.3", - "substrait 0.9.2", + "substrait 0.9.3", "tokio", "tokio-stream", "tonic 0.11.0", @@ -1707,7 +1707,7 @@ dependencies = [ [[package]] name = "cmd" -version = "0.9.2" +version = "0.9.3" dependencies = [ "async-trait", "auth", @@ -1763,7 +1763,7 @@ dependencies = [ "session", "snafu 0.8.4", "store-api", - "substrait 0.9.2", + "substrait 0.9.3", "table", "temp-env", "tempfile", @@ -1809,7 +1809,7 @@ checksum = "55b672471b4e9f9e95499ea597ff64941a309b2cdbffcc46f2cc5e2d971fd335" [[package]] name = "common-base" -version = "0.9.2" +version = "0.9.3" dependencies = [ "anymap", "bitvec", @@ -1825,7 +1825,7 @@ dependencies = [ [[package]] name = "common-catalog" -version = "0.9.2" +version = "0.9.3" dependencies = [ "chrono", "common-error", @@ -1836,7 +1836,7 @@ dependencies = [ [[package]] name = "common-config" -version = "0.9.2" +version = "0.9.3" dependencies = [ "common-base", "common-error", @@ -1859,7 +1859,7 @@ dependencies = [ [[package]] name = "common-datasource" -version = "0.9.2" +version = "0.9.3" dependencies = [ "arrow", "arrow-schema", @@ -1896,7 +1896,7 @@ dependencies = [ [[package]] name = "common-decimal" -version = "0.9.2" +version = "0.9.3" dependencies = [ 
"bigdecimal", "common-error", @@ -1909,7 +1909,7 @@ dependencies = [ [[package]] name = "common-error" -version = "0.9.2" +version = "0.9.3" dependencies = [ "snafu 0.8.4", "strum 0.25.0", @@ -1918,7 +1918,7 @@ dependencies = [ [[package]] name = "common-frontend" -version = "0.9.2" +version = "0.9.3" dependencies = [ "api", "async-trait", @@ -1933,7 +1933,7 @@ dependencies = [ [[package]] name = "common-function" -version = "0.9.2" +version = "0.9.3" dependencies = [ "api", "arc-swap", @@ -1970,7 +1970,7 @@ dependencies = [ [[package]] name = "common-greptimedb-telemetry" -version = "0.9.2" +version = "0.9.3" dependencies = [ "async-trait", "common-runtime", @@ -1987,7 +1987,7 @@ dependencies = [ [[package]] name = "common-grpc" -version = "0.9.2" +version = "0.9.3" dependencies = [ "api", "arrow-flight", @@ -2013,7 +2013,7 @@ dependencies = [ [[package]] name = "common-grpc-expr" -version = "0.9.2" +version = "0.9.3" dependencies = [ "api", "common-base", @@ -2031,7 +2031,7 @@ dependencies = [ [[package]] name = "common-macro" -version = "0.9.2" +version = "0.9.3" dependencies = [ "arc-swap", "common-query", @@ -2045,7 +2045,7 @@ dependencies = [ [[package]] name = "common-mem-prof" -version = "0.9.2" +version = "0.9.3" dependencies = [ "common-error", "common-macro", @@ -2058,7 +2058,7 @@ dependencies = [ [[package]] name = "common-meta" -version = "0.9.2" +version = "0.9.3" dependencies = [ "anymap2", "api", @@ -2114,11 +2114,11 @@ dependencies = [ [[package]] name = "common-plugins" -version = "0.9.2" +version = "0.9.3" [[package]] name = "common-procedure" -version = "0.9.2" +version = "0.9.3" dependencies = [ "async-stream", "async-trait", @@ -2144,7 +2144,7 @@ dependencies = [ [[package]] name = "common-procedure-test" -version = "0.9.2" +version = "0.9.3" dependencies = [ "async-trait", "common-procedure", @@ -2152,7 +2152,7 @@ dependencies = [ [[package]] name = "common-query" -version = "0.9.2" +version = "0.9.3" dependencies = [ "api", "async-trait", @@ -2178,7 +2178,7 @@ dependencies = [ [[package]] name = "common-recordbatch" -version = "0.9.2" +version = "0.9.3" dependencies = [ "arc-swap", "common-error", @@ -2197,7 +2197,7 @@ dependencies = [ [[package]] name = "common-runtime" -version = "0.9.2" +version = "0.9.3" dependencies = [ "async-trait", "common-error", @@ -2219,7 +2219,7 @@ dependencies = [ [[package]] name = "common-telemetry" -version = "0.9.2" +version = "0.9.3" dependencies = [ "atty", "backtrace", @@ -2246,7 +2246,7 @@ dependencies = [ [[package]] name = "common-test-util" -version = "0.9.2" +version = "0.9.3" dependencies = [ "client", "common-query", @@ -2258,7 +2258,7 @@ dependencies = [ [[package]] name = "common-time" -version = "0.9.2" +version = "0.9.3" dependencies = [ "arrow", "chrono", @@ -2274,7 +2274,7 @@ dependencies = [ [[package]] name = "common-version" -version = "0.9.2" +version = "0.9.3" dependencies = [ "build-data", "const_format", @@ -2285,7 +2285,7 @@ dependencies = [ [[package]] name = "common-wal" -version = "0.9.2" +version = "0.9.3" dependencies = [ "common-base", "common-error", @@ -3093,7 +3093,7 @@ dependencies = [ [[package]] name = "datanode" -version = "0.9.2" +version = "0.9.3" dependencies = [ "api", "arrow-flight", @@ -3142,7 +3142,7 @@ dependencies = [ "session", "snafu 0.8.4", "store-api", - "substrait 0.9.2", + "substrait 0.9.3", "table", "tokio", "toml 0.8.14", @@ -3151,7 +3151,7 @@ dependencies = [ [[package]] name = "datatypes" -version = "0.9.2" +version = "0.9.3" dependencies = [ "arrow", "arrow-array", @@ -3721,7 
+3721,7 @@ dependencies = [ [[package]] name = "file-engine" -version = "0.9.2" +version = "0.9.3" dependencies = [ "api", "async-trait", @@ -3823,7 +3823,7 @@ checksum = "28a80e3145d8ad11ba0995949bbcf48b9df2be62772b3d351ef017dff6ecb853" [[package]] name = "flow" -version = "0.9.2" +version = "0.9.3" dependencies = [ "api", "arrow", @@ -3880,7 +3880,7 @@ dependencies = [ "snafu 0.8.4", "store-api", "strum 0.25.0", - "substrait 0.9.2", + "substrait 0.9.3", "table", "tokio", "tonic 0.11.0", @@ -3927,7 +3927,7 @@ checksum = "6c2141d6d6c8512188a7891b4b01590a45f6dac67afb4f255c4124dbb86d4eaa" [[package]] name = "frontend" -version = "0.9.2" +version = "0.9.3" dependencies = [ "api", "arc-swap", @@ -5078,7 +5078,7 @@ dependencies = [ [[package]] name = "index" -version = "0.9.2" +version = "0.9.3" dependencies = [ "async-trait", "asynchronous-codec", @@ -5858,7 +5858,7 @@ checksum = "90ed8c1e510134f979dbc4f070f87d4313098b704861a105fe34231c70a3901c" [[package]] name = "log-store" -version = "0.9.2" +version = "0.9.3" dependencies = [ "async-stream", "async-trait", @@ -6170,7 +6170,7 @@ dependencies = [ [[package]] name = "meta-client" -version = "0.9.2" +version = "0.9.3" dependencies = [ "api", "async-trait", @@ -6196,7 +6196,7 @@ dependencies = [ [[package]] name = "meta-srv" -version = "0.9.2" +version = "0.9.3" dependencies = [ "api", "async-trait", @@ -6274,7 +6274,7 @@ dependencies = [ [[package]] name = "metric-engine" -version = "0.9.2" +version = "0.9.3" dependencies = [ "api", "aquamarine", @@ -6365,7 +6365,7 @@ dependencies = [ [[package]] name = "mito2" -version = "0.9.2" +version = "0.9.3" dependencies = [ "api", "aquamarine", @@ -7012,7 +7012,7 @@ dependencies = [ [[package]] name = "object-store" -version = "0.9.2" +version = "0.9.3" dependencies = [ "anyhow", "bytes", @@ -7259,7 +7259,7 @@ dependencies = [ [[package]] name = "operator" -version = "0.9.2" +version = "0.9.3" dependencies = [ "api", "async-trait", @@ -7304,7 +7304,7 @@ dependencies = [ "sql", "sqlparser 0.45.0 (git+https://github.com/GreptimeTeam/sqlparser-rs.git?rev=54a267ac89c09b11c0c88934690530807185d3e7)", "store-api", - "substrait 0.9.2", + "substrait 0.9.3", "table", "tokio", "tokio-util", @@ -7554,7 +7554,7 @@ dependencies = [ [[package]] name = "partition" -version = "0.9.2" +version = "0.9.3" dependencies = [ "api", "async-trait", @@ -7843,7 +7843,7 @@ checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184" [[package]] name = "pipeline" -version = "0.9.2" +version = "0.9.3" dependencies = [ "ahash 0.8.11", "api", @@ -8004,7 +8004,7 @@ dependencies = [ [[package]] name = "plugins" -version = "0.9.2" +version = "0.9.3" dependencies = [ "auth", "common-base", @@ -8273,7 +8273,7 @@ dependencies = [ [[package]] name = "promql" -version = "0.9.2" +version = "0.9.3" dependencies = [ "ahash 0.8.11", "async-trait", @@ -8508,7 +8508,7 @@ dependencies = [ [[package]] name = "puffin" -version = "0.9.2" +version = "0.9.3" dependencies = [ "async-compression 0.4.11", "async-trait", @@ -8630,7 +8630,7 @@ dependencies = [ [[package]] name = "query" -version = "0.9.2" +version = "0.9.3" dependencies = [ "ahash 0.8.11", "api", @@ -8693,7 +8693,7 @@ dependencies = [ "stats-cli", "store-api", "streaming-stats", - "substrait 0.9.2", + "substrait 0.9.3", "table", "tokio", "tokio-stream", @@ -10055,7 +10055,7 @@ checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49" [[package]] name = "script" -version = "0.9.2" +version = "0.9.3" dependencies = [ "api", "arc-swap", @@ -10349,7 +10349,7 
@@ dependencies = [ [[package]] name = "servers" -version = "0.9.2" +version = "0.9.3" dependencies = [ "aide", "api", @@ -10455,7 +10455,7 @@ dependencies = [ [[package]] name = "session" -version = "0.9.2" +version = "0.9.3" dependencies = [ "api", "arc-swap", @@ -10756,7 +10756,7 @@ dependencies = [ [[package]] name = "sql" -version = "0.9.2" +version = "0.9.3" dependencies = [ "api", "chrono", @@ -10816,7 +10816,7 @@ dependencies = [ [[package]] name = "sqlness-runner" -version = "0.9.2" +version = "0.9.3" dependencies = [ "async-trait", "clap 4.5.7", @@ -11033,7 +11033,7 @@ dependencies = [ [[package]] name = "store-api" -version = "0.9.2" +version = "0.9.3" dependencies = [ "api", "aquamarine", @@ -11202,7 +11202,7 @@ dependencies = [ [[package]] name = "substrait" -version = "0.9.2" +version = "0.9.3" dependencies = [ "async-trait", "bytes", @@ -11403,7 +11403,7 @@ dependencies = [ [[package]] name = "table" -version = "0.9.2" +version = "0.9.3" dependencies = [ "api", "async-trait", @@ -11668,7 +11668,7 @@ checksum = "3369f5ac52d5eb6ab48c6b4ffdc8efbcad6b89c765749064ba298f2c68a16a76" [[package]] name = "tests-fuzz" -version = "0.9.2" +version = "0.9.3" dependencies = [ "arbitrary", "async-trait", @@ -11710,7 +11710,7 @@ dependencies = [ [[package]] name = "tests-integration" -version = "0.9.2" +version = "0.9.3" dependencies = [ "api", "arrow-flight", @@ -11770,7 +11770,7 @@ dependencies = [ "sql", "sqlx", "store-api", - "substrait 0.9.2", + "substrait 0.9.3", "table", "tempfile", "time", diff --git a/Cargo.toml b/Cargo.toml index b596558713..e4a04c1f47 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -64,7 +64,7 @@ members = [ resolver = "2" [workspace.package] -version = "0.9.2" +version = "0.9.3" edition = "2021" license = "Apache-2.0" From 506dc20765f892b3d7ad77af841f6bbf7c1a3892 Mon Sep 17 00:00:00 2001 From: Yingwen Date: Fri, 6 Sep 2024 12:13:23 +0800 Subject: [PATCH 3/8] fix: last non null iter not init (#4687) --- src/mito2/src/read/dedup.rs | 61 ++++++++++++++++++++++++++++++++++++- 1 file changed, 60 insertions(+), 1 deletion(-) diff --git a/src/mito2/src/read/dedup.rs b/src/mito2/src/read/dedup.rs index 52ff05fd12..ddc96049e7 100644 --- a/src/mito2/src/read/dedup.rs +++ b/src/mito2/src/read/dedup.rs @@ -258,13 +258,18 @@ impl LastFieldsBuilder { fn maybe_init(&mut self, batch: &Batch) { debug_assert!(!batch.is_empty()); - if self.initialized || batch.fields().is_empty() { + if self.initialized { // Already initialized or no fields to merge. return; } self.initialized = true; + if batch.fields().is_empty() { + // No fields to merge. + return; + } + let last_idx = batch.num_rows() - 1; let fields = batch.fields(); // Safety: The last_idx is valid. @@ -1165,4 +1170,58 @@ mod tests { ]; assert_eq!(&expect, &actual[..]); } + + /// Returns a new [Batch] without fields. 
+ fn new_batch_no_fields( + primary_key: &[u8], + timestamps: &[i64], + sequences: &[u64], + op_types: &[OpType], + ) -> Batch { + let mut builder = BatchBuilder::new(primary_key.to_vec()); + builder + .timestamps_array(Arc::new(TimestampMillisecondArray::from_iter_values( + timestamps.iter().copied(), + ))) + .unwrap() + .sequences_array(Arc::new(UInt64Array::from_iter_values( + sequences.iter().copied(), + ))) + .unwrap() + .op_types_array(Arc::new(UInt8Array::from_iter_values( + op_types.iter().map(|v| *v as u8), + ))) + .unwrap(); + builder.build().unwrap() + } + + #[test] + fn test_last_non_null_iter_no_batch() { + let input = [ + new_batch_no_fields( + b"k1", + &[1, 1, 2], + &[13, 12, 13], + &[OpType::Put, OpType::Put, OpType::Put], + ), + new_batch_no_fields(b"k1", &[2, 3], &[12, 13], &[OpType::Put, OpType::Delete]), + new_batch_no_fields( + b"k2", + &[1, 1, 2], + &[13, 12, 13], + &[OpType::Put, OpType::Put, OpType::Put], + ), + ]; + let iter = input.into_iter().map(Ok); + let iter = LastNonNullIter::new(iter); + let actual: Vec<_> = iter.map(|batch| batch.unwrap()).collect(); + let expect = [ + new_batch_no_fields(b"k1", &[1], &[13], &[OpType::Put]), + new_batch_no_fields(b"k1", &[2], &[13], &[OpType::Put]), + new_batch_no_fields(b"k1", &[3], &[13], &[OpType::Delete]), + new_batch_no_fields(b"k2", &[1], &[13], &[OpType::Put]), + new_batch_no_fields(b"k2", &[2], &[13], &[OpType::Put]), + ]; + assert_eq!(&expect, &actual[..]); + } } From 67d95d2088e76077e27dc5f522f8dd12fa0bc8e0 Mon Sep 17 00:00:00 2001 From: localhost Date: Fri, 6 Sep 2024 15:51:08 +0800 Subject: [PATCH 4/8] refactor!: add processor builder and transform buidler (#4571) * chore: add processor builder and transform buidler * chore: in process * chore: intermediate state from hashmap to vector in pipeline * chore: remove useless code and rename some struct * chore: fix typos * chore: format code * chore: add error handling and optimize code readability * chore: fix typos * chore: remove useless code * chore: add some doc * chore: fix by pr commit * chore: remove useless code and change struct name * chore: modify the location of the find_key_index function. --- src/pipeline/benches/processor.rs | 25 +- src/pipeline/src/etl.rs | 331 +++----- src/pipeline/src/etl/field.rs | 360 +++++---- src/pipeline/src/etl/processor.rs | 216 +++--- src/pipeline/src/etl/processor/cmcd.rs | 384 ++++++---- src/pipeline/src/etl/processor/csv.rs | 388 +++++----- src/pipeline/src/etl/processor/date.rs | 279 ++++--- src/pipeline/src/etl/processor/dissect.rs | 704 ++++++++++-------- src/pipeline/src/etl/processor/epoch.rs | 136 ++-- src/pipeline/src/etl/processor/gsub.rs | 235 +++--- src/pipeline/src/etl/processor/join.rs | 174 +++-- src/pipeline/src/etl/processor/letter.rs | 174 ++--- src/pipeline/src/etl/processor/regex.rs | 544 +++++++++----- src/pipeline/src/etl/processor/timestamp.rs | 171 +++-- src/pipeline/src/etl/processor/urlencoding.rs | 173 +++-- src/pipeline/src/etl/transform.rs | 169 +++-- .../src/etl/transform/transformer/greptime.rs | 162 ++-- .../transform/transformer/greptime/coerce.rs | 34 +- src/pipeline/tests/common.rs | 37 +- src/pipeline/tests/dissect.rs | 10 +- src/pipeline/tests/pipeline.rs | 40 +- 21 files changed, 2530 insertions(+), 2216 deletions(-) diff --git a/src/pipeline/benches/processor.rs b/src/pipeline/benches/processor.rs index 8ed0021a6e..281d8ce0ef 100644 --- a/src/pipeline/benches/processor.rs +++ b/src/pipeline/benches/processor.rs @@ -13,27 +13,13 @@ // limitations under the License. 
use criterion::{black_box, criterion_group, criterion_main, Criterion}; -use pipeline::{parse, Array, Content, GreptimeTransformer, Pipeline, Value as PipelineValue}; +use pipeline::{parse, Content, GreptimeTransformer, Pipeline}; use serde_json::{Deserializer, Value}; -fn processor_map( - pipeline: &Pipeline, - input_values: Vec, -) -> impl IntoIterator { - let pipeline_data = input_values - .into_iter() - .map(|v| PipelineValue::try_from(v).unwrap()) - .collect::>(); - - pipeline.exec(PipelineValue::Array(Array { - values: pipeline_data, - })) -} - fn processor_mut( pipeline: &Pipeline, input_values: Vec, -) -> impl IntoIterator> { +) -> Result, String> { let mut payload = pipeline.init_intermediate_state(); let mut result = Vec::with_capacity(input_values.len()); @@ -249,11 +235,10 @@ fn criterion_benchmark(c: &mut Criterion) { let pipeline = prepare_pipeline(); let mut group = c.benchmark_group("pipeline"); group.sample_size(50); - group.bench_function("processor map", |b| { - b.iter(|| processor_map(black_box(&pipeline), black_box(input_value.clone()))) - }); group.bench_function("processor mut", |b| { - b.iter(|| processor_mut(black_box(&pipeline), black_box(input_value.clone()))) + b.iter(|| { + processor_mut(black_box(&pipeline), black_box(input_value.clone())).unwrap(); + }) }); group.finish(); } diff --git a/src/pipeline/src/etl.rs b/src/pipeline/src/etl.rs index b2c8802dd5..de4c544a01 100644 --- a/src/pipeline/src/etl.rs +++ b/src/pipeline/src/etl.rs @@ -19,92 +19,24 @@ pub mod processor; pub mod transform; pub mod value; -use ahash::{HashMap, HashSet}; -use common_telemetry::{debug, warn}; +use ahash::HashSet; +use common_telemetry::debug; use itertools::{merge, Itertools}; -use processor::Processor; -use transform::{Transformer, Transforms}; -use value::{Map, Value}; +use processor::{Processor, ProcessorBuilder, Processors}; +use transform::{TransformBuilders, Transformer, Transforms}; +use value::Value; use yaml_rust::YamlLoader; const DESCRIPTION: &str = "description"; const PROCESSORS: &str = "processors"; const TRANSFORM: &str = "transform"; +const TRANSFORMS: &str = "transforms"; pub enum Content { Json(String), Yaml(String), } -/// set the index for the processor keys -/// the index is the position of the key in the final intermediate keys -fn set_processor_keys_index( - processors: &mut processor::Processors, - final_intermediate_keys: &Vec, -) -> Result<(), String> { - let final_intermediate_key_index = final_intermediate_keys - .iter() - .enumerate() - .map(|(i, k)| (k.as_str(), i)) - .collect::>(); - for processor in processors.iter_mut() { - for field in processor.fields_mut().iter_mut() { - let index = final_intermediate_key_index.get(field.input_field.name.as_str()).ok_or(format!( - "input field {} is not found in intermediate keys: {final_intermediate_keys:?} when set processor keys index", - field.input_field.name - ))?; - field.set_input_index(*index); - for (k, v) in field.output_fields_index_mapping.iter_mut() { - let index = final_intermediate_key_index.get(k.as_str()); - match index { - Some(index) => { - *v = *index; - } - None => { - warn!( - "output field {k} is not found in intermediate keys: {final_intermediate_keys:?} when set processor keys index" - ); - } - } - } - } - } - Ok(()) -} - -fn set_transform_keys_index( - transforms: &mut Transforms, - final_intermediate_keys: &[String], - output_keys: &[String], -) -> Result<(), String> { - let final_intermediate_key_index = final_intermediate_keys - .iter() - .enumerate() - .map(|(i, k)| (k.as_str(), 
i)) - .collect::>(); - let output_key_index = output_keys - .iter() - .enumerate() - .map(|(i, k)| (k.as_str(), i)) - .collect::>(); - for transform in transforms.iter_mut() { - for field in transform.fields.iter_mut() { - let index = final_intermediate_key_index.get(field.input_field.name.as_str()).ok_or(format!( - "input field {} is not found in intermediate keys: {final_intermediate_keys:?} when set transform keys index", - field.input_field.name - ))?; - field.set_input_index(*index); - for (k, v) in field.output_fields_index_mapping.iter_mut() { - let index = output_key_index.get(k.as_str()).ok_or(format!( - "output field {k} is not found in output keys: {final_intermediate_keys:?} when set transform keys index" - ))?; - *v = *index; - } - } - } - Ok(()) -} - pub fn parse(input: &Content) -> Result, String> where T: Transformer, @@ -117,24 +49,22 @@ where let description = doc[DESCRIPTION].as_str().map(|s| s.to_string()); - let mut processors = if let Some(v) = doc[PROCESSORS].as_vec() { + let processor_builder_list = if let Some(v) = doc[PROCESSORS].as_vec() { v.try_into()? } else { - processor::Processors::default() + processor::ProcessorBuilderList::default() }; - let transforms = if let Some(v) = doc[TRANSFORM].as_vec() { - v.try_into()? - } else { - Transforms::default() - }; + let transform_builders = + if let Some(v) = doc[TRANSFORMS].as_vec().or(doc[TRANSFORM].as_vec()) { + v.try_into()? + } else { + TransformBuilders::default() + }; - let mut transformer = T::new(transforms)?; - let transforms = transformer.transforms_mut(); - - let processors_output_keys = processors.output_keys(); - let processors_required_keys = processors.required_keys(); - let processors_required_original_keys = processors.required_original_keys(); + let processors_required_keys = &processor_builder_list.input_keys; + let processors_output_keys = &processor_builder_list.output_keys; + let processors_required_original_keys = &processor_builder_list.original_input_keys; debug!( "processors_required_original_keys: {:?}", @@ -143,7 +73,7 @@ where debug!("processors_required_keys: {:?}", processors_required_keys); debug!("processors_output_keys: {:?}", processors_output_keys); - let transforms_required_keys = transforms.required_keys(); + let transforms_required_keys = &transform_builders.required_keys; let mut tr_keys = Vec::with_capacity(50); for key in transforms_required_keys.iter() { if !processors_output_keys.contains(key) @@ -183,9 +113,33 @@ where final_intermediate_keys.extend(intermediate_keys_exclude_original); - let output_keys = transforms.output_keys().clone(); - set_processor_keys_index(&mut processors, &final_intermediate_keys)?; - set_transform_keys_index(transforms, &final_intermediate_keys, &output_keys)?; + let output_keys = transform_builders.output_keys.clone(); + + let processors_kind_list = processor_builder_list + .processor_builders + .into_iter() + .map(|builder| builder.build(&final_intermediate_keys)) + .collect::, _>>()?; + let processors = Processors { + processors: processors_kind_list, + required_keys: processors_required_keys.clone(), + output_keys: processors_output_keys.clone(), + required_original_keys: processors_required_original_keys.clone(), + }; + + let transfor_list = transform_builders + .builders + .into_iter() + .map(|builder| builder.build(&final_intermediate_keys, &output_keys)) + .collect::, String>>()?; + + let transformers = Transforms { + transforms: transfor_list, + required_keys: transforms_required_keys.clone(), + output_keys: output_keys.clone(), + }; + 
+ let transformer = T::new(transformers)?; Ok(Pipeline { description, @@ -238,38 +192,6 @@ impl Pipeline where T: Transformer, { - fn exec_map(&self, map: &mut Map) -> Result<(), String> { - let v = map; - for processor in self.processors.iter() { - processor.exec_map(v)?; - } - Ok(()) - } - - pub fn exec(&self, mut val: Value) -> Result { - let result = match val { - Value::Map(ref mut map) => { - self.exec_map(map)?; - val - } - Value::Array(arr) => arr - .values - .into_iter() - .map(|mut v| match v { - Value::Map(ref mut map) => { - self.exec_map(map)?; - Ok(v) - } - _ => Err(format!("expected a map, but got {}", v)), - }) - .collect::, String>>() - .map(|values| Value::Array(value::Array { values }))?, - _ => return Err(format!("expected a map or array, but got {}", val)), - }; - - self.transformer.transform(result) - } - pub fn exec_mut(&self, val: &mut Vec) -> Result { for processor in self.processors.iter() { processor.exec_mut(val)?; @@ -347,9 +269,24 @@ where } } +pub(crate) fn find_key_index( + intermediate_keys: &[String], + key: &str, + kind: &str, +) -> Result { + intermediate_keys + .iter() + .position(|k| k == key) + .ok_or(format!( + "{} processor.{} not found in intermediate keys", + kind, key + )) +} + #[cfg(test)] mod tests { + use api::v1::Rows; use greptime_proto::v1::value::ValueData; use greptime_proto::v1::{self, ColumnDataType, SemanticType}; @@ -359,96 +296,43 @@ mod tests { #[test] fn test_pipeline_prepare() { - { - let input_value_str = r#" - { - "my_field": "1,2", - "foo": "bar" - } - "#; - let input_value: serde_json::Value = serde_json::from_str(input_value_str).unwrap(); - - let pipeline_yaml = r#" ---- -description: Pipeline for Apache Tomcat + let input_value_str = r#" + { + "my_field": "1,2", + "foo": "bar" + } + "#; + let input_value: serde_json::Value = serde_json::from_str(input_value_str).unwrap(); + let pipeline_yaml = r#"description: 'Pipeline for Apache Tomcat' processors: - csv: - field: my_field, my_field,field1, field2 - + field: my_field + target_fields: field1, field2 transform: - field: field1 type: uint32 - field: field2 type: uint32 "#; - let pipeline: Pipeline = - parse(&Content::Yaml(pipeline_yaml.into())).unwrap(); - let mut payload = pipeline.init_intermediate_state(); - pipeline.prepare(input_value, &mut payload).unwrap(); - assert_eq!( - &["greptime_timestamp", "my_field"].to_vec(), - pipeline.required_keys() - ); - assert_eq!( - payload, - vec![ - Value::Null, - Value::String("1,2".to_string()), - Value::Null, - Value::Null - ] - ); - let result = pipeline.exec_mut(&mut payload).unwrap(); + let pipeline: Pipeline = + parse(&Content::Yaml(pipeline_yaml.into())).unwrap(); + let mut payload = pipeline.init_intermediate_state(); + pipeline.prepare(input_value, &mut payload).unwrap(); + assert_eq!(&["my_field"].to_vec(), pipeline.required_keys()); + assert_eq!( + payload, + vec![Value::String("1,2".to_string()), Value::Null, Value::Null] + ); + let result = pipeline.exec_mut(&mut payload).unwrap(); - assert_eq!(result.values[0].value_data, Some(ValueData::U32Value(1))); - assert_eq!(result.values[1].value_data, Some(ValueData::U32Value(2))); - match &result.values[2].value_data { - Some(ValueData::TimestampNanosecondValue(v)) => { - assert_ne!(*v, 0); - } - _ => panic!("expect null value"), + assert_eq!(result.values[0].value_data, Some(ValueData::U32Value(1))); + assert_eq!(result.values[1].value_data, Some(ValueData::U32Value(2))); + match &result.values[2].value_data { + Some(ValueData::TimestampNanosecondValue(v)) => { + 
assert_ne!(*v, 0); } - } - { - let input_value_str = r#" - { - "reqTimeSec": "1573840000.000" - } - "#; - - let pipeline_yaml = r#" ---- -description: Pipeline for Demo Log - -processors: - - gsub: - field: reqTimeSec - pattern: "\\." - replacement: "" - - epoch: - field: reqTimeSec - resolution: millisecond - ignore_missing: true - -transform: - - field: reqTimeSec - type: epoch, millisecond - index: timestamp -"#; - let input_value: serde_json::Value = serde_json::from_str(input_value_str).unwrap(); - let pipeline: Pipeline = - parse(&Content::Yaml(pipeline_yaml.into())).unwrap(); - let mut payload = pipeline.init_intermediate_state(); - pipeline.prepare(input_value, &mut payload).unwrap(); - assert_eq!(&["reqTimeSec"].to_vec(), pipeline.required_keys()); - assert_eq!(payload, vec![Value::String("1573840000.000".to_string())]); - let result = pipeline.exec_mut(&mut payload).unwrap(); - - assert_eq!( - result.values[0].value_data, - Some(ValueData::TimestampMillisecondValue(1573840000000)) - ); + _ => panic!("expect null value"), } } @@ -541,21 +425,19 @@ transform: #[test] fn test_csv_pipeline() { let input_value_str = r#" - { - "my_field": "1,2", - "foo": "bar" - } - "#; + { + "my_field": "1,2", + "foo": "bar" + } + "#; let input_value: serde_json::Value = serde_json::from_str(input_value_str).unwrap(); let pipeline_yaml = r#" ---- description: Pipeline for Apache Tomcat - processors: - csv: - field: my_field,my_field, field1, field2 - + field: my_field + target_fields: field1, field2 transform: - field: field1 type: uint32 @@ -565,8 +447,22 @@ transform: let pipeline: Pipeline = parse(&Content::Yaml(pipeline_yaml.into())).unwrap(); - let output = pipeline.exec(input_value.try_into().unwrap()); - assert!(output.is_ok()); + let mut payload = pipeline.init_intermediate_state(); + pipeline.prepare(input_value, &mut payload).unwrap(); + assert_eq!(&["my_field"].to_vec(), pipeline.required_keys()); + assert_eq!( + payload, + vec![Value::String("1,2".to_string()), Value::Null, Value::Null] + ); + let result = pipeline.exec_mut(&mut payload).unwrap(); + assert_eq!(result.values[0].value_data, Some(ValueData::U32Value(1))); + assert_eq!(result.values[1].value_data, Some(ValueData::U32Value(2))); + match &result.values[2].value_data { + Some(ValueData::TimestampNanosecondValue(v)) => { + assert_ne!(*v, 0); + } + _ => panic!("expect null value"), + } } #[test] @@ -596,7 +492,14 @@ transform: let pipeline: Pipeline = parse(&Content::Yaml(pipeline_yaml.into())).unwrap(); - let output = pipeline.exec(input_value.try_into().unwrap()).unwrap(); + let schema = pipeline.schemas().clone(); + let mut result = pipeline.init_intermediate_state(); + pipeline.prepare(input_value, &mut result).unwrap(); + let row = pipeline.exec_mut(&mut result).unwrap(); + let output = Rows { + schema, + rows: vec![row], + }; let schemas = output.schema; assert_eq!(schemas.len(), 1); diff --git a/src/pipeline/src/etl/field.rs b/src/pipeline/src/etl/field.rs index 80d19c0056..ff2f1ee7b5 100644 --- a/src/pipeline/src/etl/field.rs +++ b/src/pipeline/src/etl/field.rs @@ -12,69 +12,12 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-use std::collections::BTreeMap; +use std::ops::Deref; +use std::str::FromStr; -use ahash::{HashSet, HashSetExt}; -use itertools::Itertools; - -#[derive(Debug, Default, Clone)] -pub struct Fields(Vec); - -impl Fields { - pub(crate) fn new(fields: Vec) -> Result { - let ff = Fields(fields); - ff.check() - } - - pub(crate) fn one(field: Field) -> Self { - Fields(vec![field]) - } - - pub(crate) fn get_target_fields(&self) -> Vec<&str> { - self.0.iter().map(|f| f.get_target_field()).collect() - } - - fn check(self) -> Result { - if self.0.is_empty() { - return Err("fields must not be empty".to_string()); - } - - let mut set = HashSet::new(); - for f in self.0.iter() { - if set.contains(&f.input_field.name) { - return Err(format!( - "field name must be unique, but got duplicated: {}", - f.input_field.name - )); - } - set.insert(&f.input_field.name); - } - - Ok(self) - } -} - -impl std::fmt::Display for Fields { - fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { - let s = self.0.iter().map(|f| f.to_string()).join(";"); - write!(f, "{s}") - } -} - -impl std::ops::Deref for Fields { - type Target = Vec; - - fn deref(&self) -> &Self::Target { - &self.0 - } -} - -impl std::ops::DerefMut for Fields { - fn deref_mut(&mut self) -> &mut Self::Target { - &mut self.0 - } -} +use crate::etl::find_key_index; +/// Information about the input field including the name and index in intermediate keys. #[derive(Debug, Default, Clone)] pub struct InputFieldInfo { pub(crate) name: String, @@ -82,132 +25,202 @@ pub struct InputFieldInfo { } impl InputFieldInfo { + /// Create a new input field info with the given field name and index. pub(crate) fn new(field: impl Into, index: usize) -> Self { InputFieldInfo { name: field.into(), index, } } +} - pub(crate) fn name(field: impl Into) -> Self { - InputFieldInfo { - name: field.into(), - index: 0, +/// Information about a field that has one input and one output. +#[derive(Debug, Default, Clone)] +pub struct OneInputOneOutputField { + input: InputFieldInfo, + output: Option<(String, usize)>, +} + +impl OneInputOneOutputField { + /// Create a new field with the given input and output. + pub(crate) fn new(input: InputFieldInfo, output: (String, usize)) -> Self { + OneInputOneOutputField { + input, + output: Some(output), + } + } + + /// Build a new field with the given processor kind, intermediate keys, input field, and target field. + pub(crate) fn build( + processor_kind: &str, + intermediate_keys: &[String], + input_field: &str, + target_field: &str, + ) -> Result { + let input_index = find_key_index(intermediate_keys, input_field, processor_kind)?; + + let input_field_info = InputFieldInfo::new(input_field, input_index); + let output_index = find_key_index(intermediate_keys, target_field, processor_kind)?; + Ok(OneInputOneOutputField::new( + input_field_info, + (target_field.to_string(), output_index), + )) + } + + /// Get the input field information. + pub(crate) fn input(&self) -> &InputFieldInfo { + &self.input + } + + /// Get the index of the input field. + pub(crate) fn input_index(&self) -> usize { + self.input.index + } + + /// Get the name of the input field. + pub(crate) fn input_name(&self) -> &str { + &self.input.name + } + + /// Get the index of the output field. + pub(crate) fn output_index(&self) -> usize { + *self.output().1 + } + + /// Get the name of the output field. + pub(crate) fn output_name(&self) -> &str { + self.output().0 + } + + /// Get the output field information. 
+ pub(crate) fn output(&self) -> (&String, &usize) { + if let Some((name, index)) = &self.output { + (name, index) + } else { + (&self.input.name, &self.input.index) } } } -/// Used to represent the input and output fields of a processor or transform. +/// Information about a field that has one input and multiple outputs. +#[derive(Debug, Default, Clone)] +pub struct OneInputMultiOutputField { + input: InputFieldInfo, + /// Typically, processors that output multiple keys need to be distinguished by splicing the keys together. + prefix: Option, +} + +impl OneInputMultiOutputField { + /// Create a new field with the given input and prefix. + pub(crate) fn new(input: InputFieldInfo, prefix: Option) -> Self { + OneInputMultiOutputField { input, prefix } + } + + /// Get the input field information. + pub(crate) fn input(&self) -> &InputFieldInfo { + &self.input + } + + /// Get the index of the input field. + pub(crate) fn input_index(&self) -> usize { + self.input.index + } + + /// Get the name of the input field. + pub(crate) fn input_name(&self) -> &str { + &self.input.name + } + + /// Get the prefix for the output fields. + pub(crate) fn target_prefix(&self) -> &str { + self.prefix.as_deref().unwrap_or(&self.input.name) + } +} + +/// Raw processor-defined inputs and outputs #[derive(Debug, Default, Clone)] pub struct Field { - /// The input field name and index. - pub input_field: InputFieldInfo, - - /// The output field name and index mapping. - pub output_fields_index_mapping: BTreeMap, - - // rename - pub target_field: Option, - - // 1-to-many mapping - // processors: - // - csv - pub target_fields: Option>, + pub(crate) input_field: String, + pub(crate) target_field: Option, } -impl Field { - pub(crate) fn new(field: impl Into) -> Self { - Field { - input_field: InputFieldInfo::name(field.into()), - output_fields_index_mapping: BTreeMap::new(), - target_field: None, - target_fields: None, - } - } - - /// target column_name in processor or transform - /// if target_field is None, return input field name - pub(crate) fn get_target_field(&self) -> &str { - self.target_field - .as_deref() - .unwrap_or(&self.input_field.name) - } - - /// input column_name in processor or transform - pub(crate) fn get_field_name(&self) -> &str { - &self.input_field.name - } - - /// set input column index in processor or transform - pub(crate) fn set_input_index(&mut self, index: usize) { - self.input_field.index = index; - } - - pub(crate) fn set_output_index(&mut self, key: &str, index: usize) { - if let Some(v) = self.output_fields_index_mapping.get_mut(key) { - *v = index; - } - } - - pub(crate) fn insert_output_index(&mut self, key: String, index: usize) { - self.output_fields_index_mapping.insert(key, index); - } -} - -impl std::str::FromStr for Field { +impl FromStr for Field { type Err = String; fn from_str(s: &str) -> Result { let mut parts = s.split(','); - let field = parts.next().ok_or("field is missing")?.trim().to_string(); + let input_field = parts + .next() + .ok_or("input field is missing")? + .trim() + .to_string(); + let target_field = parts.next().map(|x| x.trim().to_string()); - if field.is_empty() { - return Err("field is empty".to_string()); + if input_field.is_empty() { + return Err("input field is empty".to_string()); } - let renamed_field = match parts.next() { - Some(s) if !s.trim().is_empty() => Some(s.trim().to_string()), - _ => None, - }; - - // TODO(qtang): ???? what's this? - // weird design? field: ,,,.... 
- // and only use in csv processor - let fields: Vec<_> = parts - .map(|s| s.trim()) - .filter(|s| !s.is_empty()) - .map(|s| s.to_string()) - .collect(); - let target_fields = if fields.is_empty() { - None - } else { - Some(fields) - }; - Ok(Field { - input_field: InputFieldInfo::name(field), - output_fields_index_mapping: BTreeMap::new(), - target_field: renamed_field, - target_fields, + input_field, + target_field, }) } } -impl std::fmt::Display for Field { - fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { - match (&self.target_field, &self.target_fields) { - (Some(target_field), None) => write!(f, "{}, {target_field}", self.input_field.name), - (None, Some(target_fields)) => { - write!( - f, - "{}, {}", - self.input_field.name, - target_fields.iter().join(",") - ) - } - _ => write!(f, "{}", self.input_field.name), +impl Field { + /// Create a new field with the given input and target fields. + pub(crate) fn new(input_field: impl Into, target_field: Option) -> Self { + Field { + input_field: input_field.into(), + target_field, } } + + /// Get the input field. + pub(crate) fn input_field(&self) -> &str { + &self.input_field + } + + /// Get the target field. + pub(crate) fn target_field(&self) -> Option<&str> { + self.target_field.as_deref() + } + + /// Get the target field or the input field if the target field is not set. + pub(crate) fn target_or_input_field(&self) -> &str { + self.target_field.as_deref().unwrap_or(&self.input_field) + } +} + +/// A collection of fields. +#[derive(Debug, Default, Clone)] +pub struct Fields(Vec); + +impl Fields { + pub(crate) fn new(fields: Vec) -> Self { + Fields(fields) + } + + pub(crate) fn one(field: Field) -> Self { + Fields(vec![field]) + } +} + +impl Deref for Fields { + type Target = Vec; + + fn deref(&self) -> &Self::Target { + &self.0 + } +} + +impl IntoIterator for Fields { + type Item = Field; + type IntoIter = std::vec::IntoIter; + + fn into_iter(self) -> Self::IntoIter { + self.0.into_iter() + } } #[cfg(test)] @@ -227,35 +240,14 @@ mod tests { let cases = [ // ("field", "field", None, None), - ( - "field, target_field", - "field", - Some("target_field".into()), - None, - ), - ( - "field, target_field1, target_field2, target_field3", - "field", - Some("target_field1".into()), - Some(vec!["target_field2".into(), "target_field3".into()]), - ), - ( - "field,, target_field1, target_field2, target_field3", - "field", - None, - Some(vec![ - "target_field1".into(), - "target_field2".into(), - "target_field3".into(), - ]), - ), + ("field, target_field", "field", Some("target_field")), + ("field", "field", None), ]; - for (s, field, target_field, target_fields) in cases.into_iter() { + for (s, field, target_field) in cases.into_iter() { let f: Field = s.parse().unwrap(); - assert_eq!(f.get_field_name(), field, "{s}"); - assert_eq!(f.target_field, target_field, "{s}"); - assert_eq!(f.target_fields, target_fields, "{s}"); + assert_eq!(f.input_field(), field, "{s}"); + assert_eq!(f.target_field(), target_field, "{s}"); } } } diff --git a/src/pipeline/src/etl/processor.rs b/src/pipeline/src/etl/processor.rs index 185b155c32..257cce4dfc 100644 --- a/src/pipeline/src/etl/processor.rs +++ b/src/pipeline/src/etl/processor.rs @@ -25,22 +25,22 @@ pub mod timestamp; pub mod urlencoding; use ahash::{HashSet, HashSetExt}; -use cmcd::CmcdProcessor; -use csv::CsvProcessor; -use date::DateProcessor; -use dissect::DissectProcessor; +use cmcd::{CmcdProcessor, CmcdProcessorBuilder}; +use csv::{CsvProcessor, CsvProcessorBuilder}; +use 
date::{DateProcessor, DateProcessorBuilder}; +use dissect::{DissectProcessor, DissectProcessorBuilder}; use enum_dispatch::enum_dispatch; -use epoch::EpochProcessor; -use gsub::GsubProcessor; +use epoch::{EpochProcessor, EpochProcessorBuilder}; +use gsub::{GsubProcessor, GsubProcessorBuilder}; use itertools::Itertools; -use join::JoinProcessor; -use letter::LetterProcessor; -use regex::RegexProcessor; -use timestamp::TimestampProcessor; -use urlencoding::UrlEncodingProcessor; +use join::{JoinProcessor, JoinProcessorBuilder}; +use letter::{LetterProcessor, LetterProcessorBuilder}; +use regex::{RegexProcessor, RegexProcessorBuilder}; +use timestamp::{TimestampProcessor, TimestampProcessorBuilder}; +use urlencoding::{UrlEncodingProcessor, UrlEncodingProcessorBuilder}; -use crate::etl::field::{Field, Fields}; -use crate::etl::value::{Map, Value}; +use super::field::{Field, Fields}; +use crate::etl::value::Value; const FIELD_NAME: &str = "field"; const FIELDS_NAME: &str = "fields"; @@ -49,6 +49,7 @@ const METHOD_NAME: &str = "method"; const PATTERN_NAME: &str = "pattern"; const PATTERNS_NAME: &str = "patterns"; const SEPARATOR_NAME: &str = "separator"; +const TARGET_FIELDS_NAME: &str = "target_fields"; // const IF_NAME: &str = "if"; // const IGNORE_FAILURE_NAME: &str = "ignore_failure"; @@ -62,55 +63,14 @@ const SEPARATOR_NAME: &str = "separator"; /// The output of a processor is a map of key-value pairs that will be merged into the document when you use exec_map method. #[enum_dispatch(ProcessorKind)] pub trait Processor: std::fmt::Debug + Send + Sync + 'static { - /// Get the processor's fields - /// fields is just the same processor for multiple keys. It is not the case that a processor has multiple inputs - fn fields(&self) -> &Fields; - - /// Get the processor's fields mutably - fn fields_mut(&mut self) -> &mut Fields; - /// Get the processor's kind fn kind(&self) -> &str; /// Whether to ignore missing fn ignore_missing(&self) -> bool; - /// processor all output keys - /// if a processor has multiple output keys, it should return all of them - fn output_keys(&self) -> HashSet; - - /// Execute the processor on a document - /// and return a map of key-value pairs - fn exec_field(&self, val: &Value, field: &Field) -> Result; - /// Execute the processor on a vector which be preprocessed by the pipeline fn exec_mut(&self, val: &mut Vec) -> Result<(), String>; - - /// Execute the processor on a map - /// and merge the output into the original map - fn exec_map(&self, map: &mut Map) -> Result<(), String> { - for ff @ Field { - input_field: field_info, - .. 
- } in self.fields().iter() - { - match map.get(&field_info.name) { - Some(v) => { - map.extend(self.exec_field(v, ff)?); - } - None if self.ignore_missing() => {} - None => { - return Err(format!( - "{} processor: field '{}' is required but missing in {map}", - self.kind(), - field_info.name, - )) - } - } - } - - Ok(()) - } } #[derive(Debug)] @@ -129,6 +89,42 @@ pub enum ProcessorKind { Date(DateProcessor), } +/// ProcessorBuilder trait defines the interface for all processor builders +/// A processor builder is used to create a processor +#[enum_dispatch(ProcessorBuilders)] +pub trait ProcessorBuilder: std::fmt::Debug + Send + Sync + 'static { + /// Get the processor's output keys + fn output_keys(&self) -> HashSet<&str>; + /// Get the processor's input keys + fn input_keys(&self) -> HashSet<&str>; + /// Build the processor + fn build(self, intermediate_keys: &[String]) -> Result; +} + +#[derive(Debug)] +#[enum_dispatch] +pub enum ProcessorBuilders { + Cmcd(CmcdProcessorBuilder), + Csv(CsvProcessorBuilder), + Dissect(DissectProcessorBuilder), + Gsub(GsubProcessorBuilder), + Join(JoinProcessorBuilder), + Letter(LetterProcessorBuilder), + Regex(RegexProcessorBuilder), + Timestamp(TimestampProcessorBuilder), + UrlEncoding(UrlEncodingProcessorBuilder), + Epoch(EpochProcessorBuilder), + Date(DateProcessorBuilder), +} + +#[derive(Debug, Default)] +pub struct ProcessorBuilderList { + pub(crate) processor_builders: Vec, + pub(crate) input_keys: Vec, + pub(crate) output_keys: Vec, + pub(crate) original_input_keys: Vec, +} + #[derive(Debug, Default)] pub struct Processors { /// A ordered list of processors @@ -174,52 +170,63 @@ impl Processors { } } -impl TryFrom<&Vec> for Processors { +impl TryFrom<&Vec> for ProcessorBuilderList { type Error = String; fn try_from(vec: &Vec) -> Result { - let mut processors = vec![]; + let mut processors_builders = vec![]; let mut all_output_keys = HashSet::with_capacity(50); let mut all_required_keys = HashSet::with_capacity(50); let mut all_required_original_keys = HashSet::with_capacity(50); for doc in vec { let processor = parse_processor(doc)?; - - // get all required keys - let processor_required_keys: Vec = processor - .fields() - .iter() - .map(|f| f.input_field.name.clone()) - .collect(); - - for key in &processor_required_keys { - if !all_output_keys.contains(key) { - all_required_original_keys.insert(key.clone()); - } - } - - all_required_keys.extend(processor_required_keys); - - let processor_output_keys = processor.output_keys().into_iter(); - all_output_keys.extend(processor_output_keys); - - processors.push(processor); + processors_builders.push(processor); } - let all_required_keys = all_required_keys.into_iter().sorted().collect(); - let all_output_keys = all_output_keys.into_iter().sorted().collect(); - let all_required_original_keys = all_required_original_keys.into_iter().sorted().collect(); + for processor in processors_builders.iter() { + { + // get all required keys + let processor_required_keys = processor.input_keys(); - Ok(Processors { - processors, - required_keys: all_required_keys, + for key in &processor_required_keys { + if !all_output_keys.contains(key) { + all_required_original_keys.insert(*key); + } + } + + all_required_keys.extend(processor_required_keys); + + let processor_output_keys = processor.output_keys().into_iter(); + all_output_keys.extend(processor_output_keys); + } + } + + let all_required_keys = all_required_keys + .into_iter() + .map(|x| x.to_string()) + .sorted() + .collect(); + let all_output_keys = all_output_keys 
+ .into_iter() + .map(|x| x.to_string()) + .sorted() + .collect(); + let all_required_original_keys = all_required_original_keys + .into_iter() + .map(|x| x.to_string()) + .sorted() + .collect(); + + Ok(ProcessorBuilderList { + processor_builders: processors_builders, + input_keys: all_required_keys, output_keys: all_output_keys, - required_original_keys: all_required_original_keys, + original_input_keys: all_required_original_keys, }) } } -fn parse_processor(doc: &yaml_rust::Yaml) -> Result { +fn parse_processor(doc: &yaml_rust::Yaml) -> Result { let map = doc.as_hash().ok_or("processor must be a map".to_string())?; let key = map @@ -238,20 +245,24 @@ fn parse_processor(doc: &yaml_rust::Yaml) -> Result { .ok_or("processor key must be a string".to_string())?; let processor = match str_key { - cmcd::PROCESSOR_CMCD => ProcessorKind::Cmcd(CmcdProcessor::try_from(value)?), - csv::PROCESSOR_CSV => ProcessorKind::Csv(CsvProcessor::try_from(value)?), - dissect::PROCESSOR_DISSECT => ProcessorKind::Dissect(DissectProcessor::try_from(value)?), - epoch::PROCESSOR_EPOCH => ProcessorKind::Epoch(EpochProcessor::try_from(value)?), - date::PROCESSOR_DATE => ProcessorKind::Date(DateProcessor::try_from(value)?), - gsub::PROCESSOR_GSUB => ProcessorKind::Gsub(GsubProcessor::try_from(value)?), - join::PROCESSOR_JOIN => ProcessorKind::Join(JoinProcessor::try_from(value)?), - letter::PROCESSOR_LETTER => ProcessorKind::Letter(LetterProcessor::try_from(value)?), - regex::PROCESSOR_REGEX => ProcessorKind::Regex(RegexProcessor::try_from(value)?), + cmcd::PROCESSOR_CMCD => ProcessorBuilders::Cmcd(CmcdProcessorBuilder::try_from(value)?), + csv::PROCESSOR_CSV => ProcessorBuilders::Csv(CsvProcessorBuilder::try_from(value)?), + dissect::PROCESSOR_DISSECT => { + ProcessorBuilders::Dissect(DissectProcessorBuilder::try_from(value)?) + } + epoch::PROCESSOR_EPOCH => ProcessorBuilders::Epoch(EpochProcessorBuilder::try_from(value)?), + date::PROCESSOR_DATE => ProcessorBuilders::Date(DateProcessorBuilder::try_from(value)?), + gsub::PROCESSOR_GSUB => ProcessorBuilders::Gsub(GsubProcessorBuilder::try_from(value)?), + join::PROCESSOR_JOIN => ProcessorBuilders::Join(JoinProcessorBuilder::try_from(value)?), + letter::PROCESSOR_LETTER => { + ProcessorBuilders::Letter(LetterProcessorBuilder::try_from(value)?) + } + regex::PROCESSOR_REGEX => ProcessorBuilders::Regex(RegexProcessorBuilder::try_from(value)?), timestamp::PROCESSOR_TIMESTAMP => { - ProcessorKind::Timestamp(TimestampProcessor::try_from(value)?) + ProcessorBuilders::Timestamp(TimestampProcessorBuilder::try_from(value)?) } urlencoding::PROCESSOR_URL_ENCODING => { - ProcessorKind::UrlEncoding(UrlEncodingProcessor::try_from(value)?) + ProcessorBuilders::UrlEncoding(UrlEncodingProcessorBuilder::try_from(value)?) 
} _ => return Err(format!("unsupported {} processor", str_key)), }; @@ -301,19 +312,10 @@ where }) } -pub(crate) fn yaml_fields(v: &yaml_rust::Yaml, field: &str) -> Result { - let v = yaml_parse_strings(v, field)?; - Fields::new(v) +pub(crate) fn yaml_new_fields(v: &yaml_rust::Yaml, field: &str) -> Result { + yaml_parse_strings(v, field).map(Fields::new) } -pub(crate) fn yaml_field(v: &yaml_rust::Yaml, field: &str) -> Result { +pub(crate) fn yaml_new_field(v: &yaml_rust::Yaml, field: &str) -> Result { yaml_parse_string(v, field) } - -pub(crate) fn update_one_one_output_keys(fields: &mut Fields) { - for field in fields.iter_mut() { - field - .output_fields_index_mapping - .insert(field.get_target_field().to_string(), 0_usize); - } -} diff --git a/src/pipeline/src/etl/processor/cmcd.rs b/src/pipeline/src/etl/processor/cmcd.rs index f4e6aa9d36..1556829d65 100644 --- a/src/pipeline/src/etl/processor/cmcd.rs +++ b/src/pipeline/src/etl/processor/cmcd.rs @@ -12,14 +12,18 @@ // See the License for the specific language governing permissions and // limitations under the License. +use std::collections::BTreeMap; + use ahash::HashSet; use urlencoding::decode; -use crate::etl::field::{Field, Fields}; +use crate::etl::field::{Field, Fields, InputFieldInfo, OneInputMultiOutputField}; +use crate::etl::find_key_index; use crate::etl::processor::{ - yaml_bool, yaml_field, yaml_fields, FIELDS_NAME, FIELD_NAME, IGNORE_MISSING_NAME, + yaml_bool, yaml_new_field, yaml_new_fields, Processor, ProcessorBuilder, ProcessorKind, + FIELDS_NAME, FIELD_NAME, IGNORE_MISSING_NAME, }; -use crate::etl::value::{Map, Value}; +use crate::etl::value::Value; pub(crate) const PROCESSOR_CMCD: &str = "cmcd"; @@ -63,6 +67,178 @@ const CMCD_KEYS: [&str; 18] = [ CMCD_KEY_V, ]; +/// CmcdProcessorBuilder is a builder for CmcdProcessor +/// parse from raw yaml +#[derive(Debug, Default)] +pub struct CmcdProcessorBuilder { + fields: Fields, + output_keys: HashSet, + ignore_missing: bool, +} + +impl CmcdProcessorBuilder { + /// build_cmcd_outputs build cmcd output info + /// generate index and function for each output + pub(super) fn build_cmcd_outputs( + field: &Field, + intermediate_keys: &[String], + ) -> Result<(BTreeMap, Vec), String> { + let mut output_index = BTreeMap::new(); + let mut cmcd_field_outputs = Vec::with_capacity(CMCD_KEYS.len()); + for cmcd in CMCD_KEYS { + let final_key = generate_key(field.target_or_input_field(), cmcd); + let index = find_key_index(intermediate_keys, &final_key, "cmcd")?; + output_index.insert(final_key.clone(), index); + match cmcd { + CMCD_KEY_BS | CMCD_KEY_SU => { + let output_info = CmcdOutputInfo::new(final_key, cmcd, index, bs_su); + cmcd_field_outputs.push(output_info); + } + CMCD_KEY_BR | CMCD_KEY_BL | CMCD_KEY_D | CMCD_KEY_DL | CMCD_KEY_MTP + | CMCD_KEY_RTP | CMCD_KEY_TB => { + let output_info = CmcdOutputInfo::new(final_key, cmcd, index, br_tb); + cmcd_field_outputs.push(output_info); + } + CMCD_KEY_CID | CMCD_KEY_NRR | CMCD_KEY_OT | CMCD_KEY_SF | CMCD_KEY_SID + | CMCD_KEY_ST | CMCD_KEY_V => { + let output_info = CmcdOutputInfo::new(final_key, cmcd, index, cid_v); + cmcd_field_outputs.push(output_info); + } + CMCD_KEY_NOR => { + let output_info = CmcdOutputInfo::new(final_key, cmcd, index, nor); + cmcd_field_outputs.push(output_info); + } + CMCD_KEY_PR => { + let output_info = CmcdOutputInfo::new(final_key, cmcd, index, pr); + cmcd_field_outputs.push(output_info); + } + _ => {} + } + } + Ok((output_index, cmcd_field_outputs)) + } + + /// build CmcdProcessor from CmcdProcessorBuilder + pub fn 
build(self, intermediate_keys: &[String]) -> Result { + let mut real_fields = vec![]; + let mut cmcd_outputs = Vec::with_capacity(CMCD_KEYS.len()); + for field in self.fields.into_iter() { + let input_index = find_key_index(intermediate_keys, field.input_field(), "cmcd")?; + + let input_field_info = InputFieldInfo::new(field.input_field(), input_index); + + let (_, cmcd_field_outputs) = Self::build_cmcd_outputs(&field, intermediate_keys)?; + + cmcd_outputs.push(cmcd_field_outputs); + + let real_field = OneInputMultiOutputField::new(input_field_info, field.target_field); + real_fields.push(real_field); + } + Ok(CmcdProcessor { + fields: real_fields, + cmcd_outputs, + ignore_missing: self.ignore_missing, + }) + } +} + +impl ProcessorBuilder for CmcdProcessorBuilder { + fn output_keys(&self) -> HashSet<&str> { + self.output_keys.iter().map(|s| s.as_str()).collect() + } + + fn input_keys(&self) -> HashSet<&str> { + self.fields.iter().map(|f| f.input_field()).collect() + } + + fn build(self, intermediate_keys: &[String]) -> Result { + self.build(intermediate_keys).map(ProcessorKind::Cmcd) + } +} + +fn generate_key(prefix: &str, key: &str) -> String { + format!("{}_{}", prefix, key) +} + +/// CmcdOutputInfo is a struct to store output info +#[derive(Debug)] +pub(super) struct CmcdOutputInfo { + /// {input_field}_{cmcd_key} + final_key: String, + /// cmcd key + key: &'static str, + /// index in intermediate_keys + index: usize, + /// function to resolve value + f: fn(&str, &str, Option<&str>) -> Result, +} + +impl CmcdOutputInfo { + fn new( + final_key: String, + key: &'static str, + index: usize, + f: fn(&str, &str, Option<&str>) -> Result, + ) -> Self { + Self { + final_key, + key, + index, + f, + } + } +} + +impl Default for CmcdOutputInfo { + fn default() -> Self { + Self { + final_key: String::default(), + key: "", + index: 0, + f: |_, _, _| Ok(Value::Null), + } + } +} + +/// function to resolve CMCD_KEY_BS | CMCD_KEY_SU +fn bs_su(_: &str, _: &str, _: Option<&str>) -> Result { + Ok(Value::Boolean(true)) +} + +/// function to resolve CMCD_KEY_BR | CMCD_KEY_BL | CMCD_KEY_D | CMCD_KEY_DL | CMCD_KEY_MTP | CMCD_KEY_RTP | CMCD_KEY_TB +fn br_tb(s: &str, k: &str, v: Option<&str>) -> Result { + let v = v.ok_or(format!("{k} missing value in {s}"))?; + let val: i64 = v + .parse() + .map_err(|_| format!("failed to parse {v} as i64"))?; + Ok(Value::Int64(val)) +} + +/// function to resolve CMCD_KEY_CID | CMCD_KEY_NRR | CMCD_KEY_OT | CMCD_KEY_SF | CMCD_KEY_SID | CMCD_KEY_V +fn cid_v(s: &str, k: &str, v: Option<&str>) -> Result { + let v = v.ok_or(format!("{k} missing value in {s}"))?; + Ok(Value::String(v.to_string())) +} + +/// function to resolve CMCD_KEY_NOR +fn nor(s: &str, k: &str, v: Option<&str>) -> Result { + let v = v.ok_or(format!("{k} missing value in {s}"))?; + let val = match decode(v) { + Ok(val) => val.to_string(), + Err(_) => v.to_string(), + }; + Ok(Value::String(val)) +} + +/// function to resolve CMCD_KEY_PR +fn pr(s: &str, k: &str, v: Option<&str>) -> Result { + let v = v.ok_or(format!("{k} missing value in {s}"))?; + let val: f64 = v + .parse() + .map_err(|_| format!("failed to parse {v} as f64"))?; + Ok(Value::Float64(val)) +} + /// Common Media Client Data Specification: /// https://cdn.cta.tech/cta/media/media/resources/standards/pdfs/cta-5004-final.pdf /// @@ -100,98 +276,43 @@ const CMCD_KEYS: [&str; 18] = [ /// 12. Transport Layer Security SHOULD be used to protect all transmission of CMCD data. 
#[derive(Debug, Default)] pub struct CmcdProcessor { - fields: Fields, + fields: Vec, + cmcd_outputs: Vec>, ignore_missing: bool, } impl CmcdProcessor { - fn with_fields(&mut self, mut fields: Fields) { - Self::update_output_keys(&mut fields); - self.fields = fields; - } - - fn with_ignore_missing(&mut self, ignore_missing: bool) { - self.ignore_missing = ignore_missing; - } - fn generate_key(prefix: &str, key: &str) -> String { format!("{}_{}", prefix, key) } - fn parse(prefix: &str, s: &str) -> Result { - let mut map = Map::default(); + fn parse(&self, field_index: usize, s: &str) -> Result, String> { let parts = s.split(','); + let mut result = Vec::new(); for part in parts { let mut kv = part.split('='); let k = kv.next().ok_or(format!("{part} missing key in {s}"))?; let v = kv.next(); - let key = Self::generate_key(prefix, k); - match k { - CMCD_KEY_BS | CMCD_KEY_SU => { - map.insert(key, Value::Boolean(true)); + for cmcd_key in self.cmcd_outputs[field_index].iter() { + if cmcd_key.key == k { + let val = (cmcd_key.f)(s, k, v)?; + result.push((cmcd_key.index, val)); } - CMCD_KEY_BR | CMCD_KEY_BL | CMCD_KEY_D | CMCD_KEY_DL | CMCD_KEY_MTP - | CMCD_KEY_RTP | CMCD_KEY_TB => { - let v = v.ok_or(format!("{k} missing value in {s}"))?; - let val: i64 = v - .parse() - .map_err(|_| format!("failed to parse {v} as i64"))?; - map.insert(key, Value::Int64(val)); - } - CMCD_KEY_CID | CMCD_KEY_NRR | CMCD_KEY_OT | CMCD_KEY_SF | CMCD_KEY_SID - | CMCD_KEY_ST | CMCD_KEY_V => { - let v = v.ok_or(format!("{k} missing value in {s}"))?; - map.insert(key, Value::String(v.to_string())); - } - CMCD_KEY_NOR => { - let v = v.ok_or(format!("{k} missing value in {s}"))?; - let val = match decode(v) { - Ok(val) => val.to_string(), - Err(_) => v.to_string(), - }; - map.insert(key, Value::String(val)); - } - CMCD_KEY_PR => { - let v = v.ok_or(format!("{k} missing value in {s}"))?; - let val: f64 = v - .parse() - .map_err(|_| format!("failed to parse {v} as f64"))?; - map.insert(key, Value::Float64(val)); - } - _ => match v { - Some(v) => map.insert(key, Value::String(v.to_string())), - None => map.insert(k, Value::Boolean(true)), - }, } } - Ok(map) - } - - fn process_field(&self, val: &str, field: &Field) -> Result { - let prefix = field.get_target_field(); - - Self::parse(prefix, val) - } - - fn update_output_keys(fields: &mut Fields) { - for field in fields.iter_mut() { - for key in CMCD_KEYS.iter() { - field - .output_fields_index_mapping - .insert(Self::generate_key(field.get_target_field(), key), 0); - } - } + Ok(result) } } -impl TryFrom<&yaml_rust::yaml::Hash> for CmcdProcessor { +impl TryFrom<&yaml_rust::yaml::Hash> for CmcdProcessorBuilder { type Error = String; fn try_from(value: &yaml_rust::yaml::Hash) -> Result { - let mut processor = CmcdProcessor::default(); + let mut fields = Fields::default(); + let mut ignore_missing = false; for (k, v) in value.iter() { let key = k @@ -199,25 +320,40 @@ impl TryFrom<&yaml_rust::yaml::Hash> for CmcdProcessor { .ok_or(format!("key must be a string, but got {k:?}"))?; match key { FIELD_NAME => { - processor.with_fields(Fields::one(yaml_field(v, FIELD_NAME)?)); + fields = Fields::one(yaml_new_field(v, FIELD_NAME)?); } FIELDS_NAME => { - processor.with_fields(yaml_fields(v, FIELDS_NAME)?); + fields = yaml_new_fields(v, FIELDS_NAME)?; } IGNORE_MISSING_NAME => { - processor.with_ignore_missing(yaml_bool(v, IGNORE_MISSING_NAME)?); + ignore_missing = yaml_bool(v, IGNORE_MISSING_NAME)?; } _ => {} } } - Ok(processor) + let output_keys = fields + .iter() + .flat_map(|f| { + 
CMCD_KEYS + .iter() + .map(|cmcd_key| generate_key(f.target_or_input_field(), cmcd_key)) + }) + .collect(); + + let builder = CmcdProcessorBuilder { + fields, + output_keys, + ignore_missing, + }; + + Ok(builder) } } -impl crate::etl::processor::Processor for CmcdProcessor { +impl Processor for CmcdProcessor { fn kind(&self) -> &str { PROCESSOR_CMCD } @@ -226,51 +362,14 @@ impl crate::etl::processor::Processor for CmcdProcessor { self.ignore_missing } - fn fields(&self) -> &Fields { - &self.fields - } - - fn fields_mut(&mut self) -> &mut Fields { - &mut self.fields - } - - fn output_keys(&self) -> HashSet { - self.fields - .iter() - .map(|field| { - field - .target_field - .clone() - .unwrap_or_else(|| field.get_field_name().to_string()) - }) - .flat_map(|keys| { - CMCD_KEYS - .iter() - .map(move |key| format!("{}_{}", keys, *key)) - }) - .collect() - } - - fn exec_field(&self, val: &Value, field: &Field) -> Result { - match val { - Value::String(val) => self.process_field(val, field), - _ => Err(format!( - "{} processor: expect string value, but got {val:?}", - self.kind() - )), - } - } - fn exec_mut(&self, val: &mut Vec) -> Result<(), String> { - for field in self.fields.iter() { - match val.get(field.input_field.index) { + for (field_index, field) in self.fields.iter().enumerate() { + let field_value_index = field.input_index(); + match val.get(field_value_index) { Some(Value::String(v)) => { - // TODO(qtang): Let this method use the intermediate state collection directly. - let map = self.process_field(v, field)?; - for (k, v) in map.values.into_iter() { - if let Some(index) = field.output_fields_index_mapping.get(&k) { - val[*index] = v; - } + let result_list = self.parse(field_index, v)?; + for (output_index, v) in result_list { + val[output_index] = v; } } Some(Value::Null) | None => { @@ -278,7 +377,7 @@ impl crate::etl::processor::Processor for CmcdProcessor { return Err(format!( "{} processor: missing field: {}", self.kind(), - field.get_field_name() + field.input_name() )); } } @@ -299,7 +398,8 @@ mod tests { use ahash::HashMap; use urlencoding::decode; - use super::CmcdProcessor; + use super::{CmcdProcessorBuilder, CMCD_KEYS}; + use crate::etl::field::{Field, Fields}; use crate::etl::value::{Map, Value}; #[test] @@ -329,6 +429,7 @@ mod tests { ], ), ( + // we not resolve `b` key "b%2Crtp%3D15000%2Csid%3D%226e2fb550-c457-11e9-bb97-0800200c9a66%22", vec![ ( @@ -336,7 +437,6 @@ mod tests { Value::String("\"6e2fb550-c457-11e9-bb97-0800200c9a66\"".into()), ), ("prefix_rtp", Value::Int64(15000)), - ("b", Value::Boolean(true)), ], ), ( @@ -347,16 +447,17 @@ mod tests { ], ), ( + // we not resolve custom key "d%3D4004%2Ccom.example-myNumericKey%3D500%2Ccom.examplemyStringKey%3D%22myStringValue%22", vec![ - ( - "prefix_com.example-myNumericKey", - Value::String("500".into()), - ), - ( - "prefix_com.examplemyStringKey", - Value::String("\"myStringValue\"".into()), - ), + // ( + // "prefix_com.example-myNumericKey", + // Value::String("500".into()), + // ), + // ( + // "prefix_com.examplemyStringKey", + // Value::String("\"myStringValue\"".into()), + // ), ("prefix_d", Value::Int64(4004)), ], ), @@ -431,6 +532,24 @@ mod tests { ), ]; + let field = Field::new("prefix", None); + + let output_keys = CMCD_KEYS + .iter() + .map(|k| format!("prefix_{}", k)) + .collect::>(); + + let mut intermediate_keys = vec!["prefix".to_string()]; + intermediate_keys.append(&mut (output_keys.clone())); + + let builder = CmcdProcessorBuilder { + fields: Fields::new(vec![field]), + output_keys: 
output_keys.iter().map(|s| s.to_string()).collect(), + ignore_missing: false, + }; + + let processor = builder.build(&intermediate_keys).unwrap(); + for (s, vec) in ss.into_iter() { let decoded = decode(s).unwrap().to_string(); @@ -440,7 +559,12 @@ mod tests { .collect::>(); let expected = Map { values }; - let actual = CmcdProcessor::parse("prefix", &decoded).unwrap(); + let actual = processor.parse(0, &decoded).unwrap(); + let actual = actual + .into_iter() + .map(|(index, value)| (intermediate_keys[index].clone(), value)) + .collect::>(); + let actual = Map { values: actual }; assert_eq!(actual, expected); } } diff --git a/src/pipeline/src/etl/processor/csv.rs b/src/pipeline/src/etl/processor/csv.rs index 2f0750865a..fb1fca2bfb 100644 --- a/src/pipeline/src/etl/processor/csv.rs +++ b/src/pipeline/src/etl/processor/csv.rs @@ -14,17 +14,18 @@ // Reference: https://www.elastic.co/guide/en/elasticsearch/reference/current/csv-processor.html -use ahash::{HashMap, HashSet}; +use ahash::HashSet; use csv::{ReaderBuilder, Trim}; use itertools::EitherOrBoth::{Both, Left, Right}; use itertools::Itertools; -use crate::etl::field::{Field, Fields}; +use crate::etl::field::{Fields, InputFieldInfo, OneInputMultiOutputField}; +use crate::etl::find_key_index; use crate::etl::processor::{ - yaml_bool, yaml_field, yaml_fields, yaml_string, Processor, FIELDS_NAME, FIELD_NAME, - IGNORE_MISSING_NAME, + yaml_bool, yaml_new_field, yaml_new_fields, yaml_string, Processor, ProcessorBuilder, + ProcessorKind, FIELDS_NAME, FIELD_NAME, IGNORE_MISSING_NAME, }; -use crate::etl::value::{Map, Value}; +use crate::etl::value::Value; pub(crate) const PROCESSOR_CSV: &str = "csv"; @@ -32,18 +33,78 @@ const SEPARATOR_NAME: &str = "separator"; const QUOTE_NAME: &str = "quote"; const TRIM_NAME: &str = "trim"; const EMPTY_VALUE_NAME: &str = "empty_value"; +const TARGET_FIELDS: &str = "target_fields"; + +#[derive(Debug, Default)] +pub struct CsvProcessorBuilder { + reader: ReaderBuilder, + + fields: Fields, + ignore_missing: bool, + + // Value used to fill empty fields, empty fields will be skipped if this is not provided. 
+ empty_value: Option, + target_fields: Vec, + // description + // if + // ignore_failure + // on_failure + // tag +} + +impl CsvProcessorBuilder { + fn build(self, intermediate_keys: &[String]) -> Result { + let mut real_fields = vec![]; + + for field in self.fields { + let input_index = find_key_index(intermediate_keys, field.input_field(), "csv")?; + + let input_field_info = InputFieldInfo::new(field.input_field(), input_index); + let real_field = OneInputMultiOutputField::new(input_field_info, None); + real_fields.push(real_field); + } + + let output_index_info = self + .target_fields + .iter() + .map(|f| find_key_index(intermediate_keys, f, "csv")) + .collect::, String>>()?; + Ok(CsvProcessor { + reader: self.reader, + fields: real_fields, + ignore_missing: self.ignore_missing, + empty_value: self.empty_value, + output_index_info, + }) + } +} + +impl ProcessorBuilder for CsvProcessorBuilder { + fn output_keys(&self) -> HashSet<&str> { + self.target_fields.iter().map(|s| s.as_str()).collect() + } + + fn input_keys(&self) -> HashSet<&str> { + self.fields.iter().map(|f| f.input_field()).collect() + } + + fn build(self, intermediate_keys: &[String]) -> Result { + self.build(intermediate_keys).map(ProcessorKind::Csv) + } +} /// only support string value #[derive(Debug)] pub struct CsvProcessor { reader: ReaderBuilder, - fields: Fields, + fields: Vec, ignore_missing: bool, // Value used to fill empty fields, empty fields will be skipped if this is not provided. empty_value: Option, + output_index_info: Vec, // description // if // ignore_failure @@ -52,81 +113,19 @@ pub struct CsvProcessor { } impl CsvProcessor { - fn new() -> Self { - let mut reader = ReaderBuilder::new(); - reader.has_headers(false); - - Self { - reader, - fields: Fields::default(), - ignore_missing: false, - empty_value: None, - } - } - - fn with_fields(&mut self, fields: Fields) { - self.fields = fields; - } - - fn try_separator(&mut self, separator: String) -> Result<(), String> { - if separator.len() != 1 { - Err(format!( - "'{}' must be a single character, but got '{}'", - SEPARATOR_NAME, separator - )) - } else { - self.reader.delimiter(separator.as_bytes()[0]); - Ok(()) - } - } - - fn try_quote(&mut self, quote: String) -> Result<(), String> { - if quote.len() != 1 { - Err(format!( - "'{}' must be a single character, but got '{}'", - QUOTE_NAME, quote - )) - } else { - self.reader.quote(quote.as_bytes()[0]); - Ok(()) - } - } - - fn with_trim(&mut self, trim: bool) { - if trim { - self.reader.trim(Trim::All); - } else { - self.reader.trim(Trim::None); - } - } - - fn with_ignore_missing(&mut self, ignore_missing: bool) { - self.ignore_missing = ignore_missing; - } - - fn with_empty_value(&mut self, empty_value: String) { - self.empty_value = Some(empty_value); - } - // process the csv format string to a map with target_fields as keys - fn process_field(&self, val: &str, field: &Field) -> Result { + fn process(&self, val: &str) -> Result, String> { let mut reader = self.reader.from_reader(val.as_bytes()); if let Some(result) = reader.records().next() { let record: csv::StringRecord = result.map_err(|e| e.to_string())?; - let values: HashMap = field - .target_fields - .as_ref() - .ok_or(format!( - "target fields must be set after '{}'", - field.get_field_name() - ))? 
+ let values: Vec<(usize, Value)> = self + .output_index_info .iter() - .map(|f| f.to_string()) .zip_longest(record.iter()) .filter_map(|zipped| match zipped { - Both(target_field, val) => Some((target_field, Value::String(val.into()))), + Both(target_field, val) => Some((*target_field, Value::String(val.into()))), // if target fields are more than extracted fields, fill the rest with empty value Left(target_field) => { let value = self @@ -134,69 +133,101 @@ impl CsvProcessor { .as_ref() .map(|s| Value::String(s.clone())) .unwrap_or(Value::Null); - Some((target_field, value)) + Some((*target_field, value)) } // if extracted fields are more than target fields, ignore the rest Right(_) => None, }) .collect(); - Ok(Map { values }) + Ok(values) } else { Err("expected at least one record from csv format, but got none".into()) } } - - fn update_output_keys(&mut self) { - self.fields.iter_mut().for_each(|f| { - if let Some(tfs) = f.target_fields.as_ref() { - tfs.iter().for_each(|tf| { - if !tf.is_empty() { - f.output_fields_index_mapping.insert(tf.to_string(), 0); - } - }); - } - }) - } } -impl TryFrom<&yaml_rust::yaml::Hash> for CsvProcessor { +impl TryFrom<&yaml_rust::yaml::Hash> for CsvProcessorBuilder { type Error = String; fn try_from(hash: &yaml_rust::yaml::Hash) -> Result { - let mut processor = CsvProcessor::new(); + let mut reader = ReaderBuilder::new(); + reader.has_headers(false); + + let mut fields = Fields::default(); + let mut ignore_missing = false; + let mut empty_value = None; + let mut target_fields = vec![]; + for (k, v) in hash { let key = k .as_str() .ok_or(format!("key must be a string, but got {k:?}"))?; match key { FIELD_NAME => { - processor.with_fields(Fields::one(yaml_field(v, FIELD_NAME)?)); + fields = Fields::one(yaml_new_field(v, FIELD_NAME)?); } FIELDS_NAME => { - processor.with_fields(yaml_fields(v, FIELDS_NAME)?); + fields = yaml_new_fields(v, FIELDS_NAME)?; + } + TARGET_FIELDS => { + target_fields = yaml_string(v, TARGET_FIELDS)? 
+ .split(',') + .map(|s| s.trim().to_string()) + .filter(|s| !s.is_empty()) + .collect(); } SEPARATOR_NAME => { - processor.try_separator(yaml_string(v, SEPARATOR_NAME)?)?; + let separator = yaml_string(v, SEPARATOR_NAME)?; + if separator.len() != 1 { + return Err(format!( + "'{}' must be a single character, but got '{}'", + SEPARATOR_NAME, separator + )); + } else { + reader.delimiter(separator.as_bytes()[0]); + } } QUOTE_NAME => { - processor.try_quote(yaml_string(v, QUOTE_NAME)?)?; + let quote = yaml_string(v, QUOTE_NAME)?; + if quote.len() != 1 { + return Err(format!( + "'{}' must be a single character, but got '{}'", + QUOTE_NAME, quote + )); + } else { + reader.quote(quote.as_bytes()[0]); + } } TRIM_NAME => { - processor.with_trim(yaml_bool(v, TRIM_NAME)?); + let trim = yaml_bool(v, TRIM_NAME)?; + if trim { + reader.trim(Trim::All); + } else { + reader.trim(Trim::None); + } } IGNORE_MISSING_NAME => { - processor.with_ignore_missing(yaml_bool(v, IGNORE_MISSING_NAME)?); + ignore_missing = yaml_bool(v, IGNORE_MISSING_NAME)?; } EMPTY_VALUE_NAME => { - processor.with_empty_value(yaml_string(v, EMPTY_VALUE_NAME)?); + empty_value = Some(yaml_string(v, EMPTY_VALUE_NAME)?); } _ => {} } } - processor.update_output_keys(); - Ok(processor) + let builder = { + CsvProcessorBuilder { + reader, + fields, + ignore_missing, + empty_value, + target_fields, + } + }; + + Ok(builder) } } @@ -209,41 +240,14 @@ impl Processor for CsvProcessor { self.ignore_missing } - fn fields(&self) -> &Fields { - &self.fields - } - - fn fields_mut(&mut self) -> &mut Fields { - &mut self.fields - } - - fn output_keys(&self) -> HashSet { - self.fields - .iter() - .flat_map(|f| f.target_fields.clone().unwrap_or_default()) - .collect() - } - - fn exec_field(&self, val: &Value, field: &Field) -> Result { - match val { - Value::String(val) => self.process_field(val, field), - _ => Err(format!( - "{} processor: expect string value, but got {val:?}", - self.kind() - )), - } - } - fn exec_mut(&self, val: &mut Vec) -> Result<(), String> { for field in self.fields.iter() { - match val.get(field.input_field.index) { + let index = field.input_index(); + match val.get(index) { Some(Value::String(v)) => { - // TODO(qtang): Let this method use the intermediate state collection directly. 
-                    let map = self.process_field(v, field)?;
-                    for (k, v) in map.values.into_iter() {
-                        if let Some(index) = field.output_fields_index_mapping.get(&k) {
-                            val[*index] = v;
-                        }
+                    let result_list = self.process(v)?;
+                    for (k, v) in result_list {
+                        val[k] = v;
                     }
                 }
                 Some(Value::Null) | None => {
@@ -251,7 +255,7 @@
                         return Err(format!(
                             "{} processor: missing field: {}",
                             self.kind(),
-                            field.get_field_name()
+                            field.input_name()
                         ));
                     }
                 }
@@ -267,116 +271,140 @@ impl Processor for CsvProcessor {
     }
 }

-// TODO(yuanbohan): more test cases
 #[cfg(test)]
 mod tests {
+    use ahash::HashMap;

-    use super::{CsvProcessor, Value};
-    use crate::etl::field::Fields;
-    use crate::etl::processor::Processor;
-    use crate::etl::value::Map;
+    use super::Value;
+    use crate::etl::processor::csv::CsvProcessorBuilder;

     #[test]
     fn test_equal_length() {
-        let mut processor = CsvProcessor::new();
-        let field = "data,, a, b".parse().unwrap();
-        processor.with_fields(Fields::one(field));
+        let mut reader = csv::ReaderBuilder::new();
+        reader.has_headers(false);
+        let builder = CsvProcessorBuilder {
+            reader,
+            target_fields: vec!["a".into(), "b".into()],
+            ..Default::default()
+        };

-        let values: HashMap<String, Value> = [("data".into(), Value::String("1,2".into()))]
+        let intermediate_keys = vec!["data".into(), "a".into(), "b".into()];
+
+        let processor = builder.build(&intermediate_keys).unwrap();
+        let result = processor
+            .process("1,2")
+            .unwrap()
             .into_iter()
-            .collect();
-        let mut m = Map { values };
-
-        processor.exec_map(&mut m).unwrap();
+            .map(|(k, v)| (intermediate_keys[k].clone(), v))
+            .collect::<HashMap<String, Value>>();

         let values = [
-            ("data".into(), Value::String("1,2".into())),
             ("a".into(), Value::String("1".into())),
             ("b".into(), Value::String("2".into())),
         ]
         .into_iter()
-        .collect();
-        let expected = Map { values };
+        .collect::<HashMap<String, Value>>();

-        assert_eq!(expected, m);
+        assert_eq!(result, values);
     }

     // test target_fields length larger than the record length
     #[test]
     fn test_target_fields_has_more_length() {
-        let values = [("data".into(), Value::String("1,2".into()))]
-            .into_iter()
-            .collect();
-        let mut input = Map { values };
-
         // with no empty value
         {
-            let mut processor = CsvProcessor::new();
-            let field = "data,, a,b,c".parse().unwrap();
-            processor.with_fields(Fields::one(field));
+            let mut reader = csv::ReaderBuilder::new();
+            reader.has_headers(false);
+            let builder = CsvProcessorBuilder {
+                reader,
+                target_fields: vec!["a".into(), "b".into(), "c".into()],
+                ..Default::default()
+            };

-            processor.exec_map(&mut input).unwrap();
+            let intermediate_keys = vec!["data".into(), "a".into(), "b".into(), "c".into()];
+
+            let processor = builder.build(&intermediate_keys).unwrap();
+            let result = processor
+                .process("1,2")
+                .unwrap()
+                .into_iter()
+                .map(|(k, v)| (intermediate_keys[k].clone(), v))
+                .collect::<HashMap<String, Value>>();

             let values = [
-                ("data".into(), Value::String("1,2".into())),
                 ("a".into(), Value::String("1".into())),
                 ("b".into(), Value::String("2".into())),
                 ("c".into(), Value::Null),
             ]
             .into_iter()
-            .collect();
-            let expected = Map { values };
+            .collect::<HashMap<String, Value>>();

-            assert_eq!(expected, input);
+            assert_eq!(result, values);
         }

         // with empty value
         {
-            let mut processor = CsvProcessor::new();
-            let field = "data,, a,b,c".parse().unwrap();
-            processor.with_fields(Fields::one(field));
-            processor.with_empty_value("default".into());
+            let mut reader = csv::ReaderBuilder::new();
+            reader.has_headers(false);
+            let builder = CsvProcessorBuilder {
+                reader,
+                target_fields: vec!["a".into(), "b".into(), "c".into()],
+                empty_value:
Some("default".into()), + ..Default::default() + }; - processor.exec_map(&mut input).unwrap(); + let intermediate_keys = vec!["data".into(), "a".into(), "b".into(), "c".into()]; + + let processor = builder.build(&intermediate_keys).unwrap(); + let result = processor + .process("1,2") + .unwrap() + .into_iter() + .map(|(k, v)| (intermediate_keys[k].clone(), v)) + .collect::>(); let values = [ - ("data".into(), Value::String("1,2".into())), ("a".into(), Value::String("1".into())), ("b".into(), Value::String("2".into())), ("c".into(), Value::String("default".into())), ] .into_iter() .collect(); - let expected = Map { values }; - assert_eq!(expected, input); + assert_eq!(result, values); } } // test record has larger length #[test] fn test_target_fields_has_less_length() { - let values = [("data".into(), Value::String("1,2,3".into()))] + let mut reader = csv::ReaderBuilder::new(); + reader.has_headers(false); + let builder = CsvProcessorBuilder { + reader, + target_fields: vec!["a".into(), "b".into()], + empty_value: Some("default".into()), + ..Default::default() + }; + + let intermediate_keys = vec!["data".into(), "a".into(), "b".into()]; + + let processor = builder.build(&intermediate_keys).unwrap(); + let result = processor + .process("1,2") + .unwrap() .into_iter() - .collect(); - let mut input = Map { values }; - - let mut processor = CsvProcessor::new(); - let field = "data,,a,b".parse().unwrap(); - processor.with_fields(Fields::one(field)); - - processor.exec_map(&mut input).unwrap(); + .map(|(k, v)| (intermediate_keys[k].clone(), v)) + .collect::>(); let values = [ - ("data".into(), Value::String("1,2,3".into())), ("a".into(), Value::String("1".into())), ("b".into(), Value::String("2".into())), ] .into_iter() .collect(); - let expected = Map { values }; - assert_eq!(expected, input); + assert_eq!(result, values); } } diff --git a/src/pipeline/src/etl/processor/date.rs b/src/pipeline/src/etl/processor/date.rs index 3230c497f4..b9bfcf3b6c 100644 --- a/src/pipeline/src/etl/processor/date.rs +++ b/src/pipeline/src/etl/processor/date.rs @@ -19,12 +19,12 @@ use chrono::{DateTime, NaiveDateTime}; use chrono_tz::Tz; use lazy_static::lazy_static; -use crate::etl::field::{Field, Fields}; +use crate::etl::field::{Fields, OneInputOneOutputField}; use crate::etl::processor::{ - update_one_one_output_keys, yaml_bool, yaml_field, yaml_fields, yaml_string, yaml_strings, - Processor, FIELDS_NAME, FIELD_NAME, IGNORE_MISSING_NAME, + yaml_bool, yaml_new_field, yaml_new_fields, yaml_string, yaml_strings, Processor, + ProcessorBuilder, ProcessorKind, FIELDS_NAME, FIELD_NAME, IGNORE_MISSING_NAME, }; -use crate::etl::value::{Map, Timestamp, Value}; +use crate::etl::value::{Timestamp, Value}; pub(crate) const PROCESSOR_DATE: &str = "date"; @@ -57,9 +57,15 @@ lazy_static! 
{
         .collect();
 }

-#[derive(Debug, Default)]
+#[derive(Debug)]
 struct Formats(Vec<Arc<String>>);

+impl Default for Formats {
+    fn default() -> Self {
+        Formats(DEFAULT_FORMATS.clone())
+    }
+}
+
 impl Formats {
     fn new(mut formats: Vec<Arc<String>>) -> Self {
         formats.sort();
@@ -76,16 +82,119 @@ impl std::ops::Deref for Formats {
     }
 }

+#[derive(Debug, Default)]
+pub struct DateProcessorBuilder {
+    fields: Fields,
+    formats: Formats,
+    timezone: Option<Arc<String>>,
+    locale: Option<Arc<String>>,
+    ignore_missing: bool,
+}
+
+impl ProcessorBuilder for DateProcessorBuilder {
+    fn output_keys(&self) -> HashSet<&str> {
+        self.fields
+            .iter()
+            .map(|f| f.target_or_input_field())
+            .collect()
+    }
+
+    fn input_keys(&self) -> HashSet<&str> {
+        self.fields.iter().map(|f| f.input_field()).collect()
+    }
+
+    fn build(self, intermediate_keys: &[String]) -> Result<ProcessorKind, String> {
+        self.build(intermediate_keys).map(ProcessorKind::Date)
+    }
+}
+
+impl DateProcessorBuilder {
+    pub fn build(self, intermediate_keys: &[String]) -> Result<DateProcessor, String> {
+        let mut real_fields = vec![];
+        for field in self.fields.into_iter() {
+            let input = OneInputOneOutputField::build(
+                "date",
+                intermediate_keys,
+                field.input_field(),
+                field.target_or_input_field(),
+            )?;
+            real_fields.push(input);
+        }
+        Ok(DateProcessor {
+            fields: real_fields,
+            formats: self.formats,
+            timezone: self.timezone,
+            locale: self.locale,
+            ignore_missing: self.ignore_missing,
+        })
+    }
+}
+
+impl TryFrom<&yaml_rust::yaml::Hash> for DateProcessorBuilder {
+    type Error = String;
+
+    fn try_from(hash: &yaml_rust::yaml::Hash) -> Result<Self, Self::Error> {
+        let mut fields = Fields::default();
+        let mut formats = Formats::default();
+        let mut timezone = None;
+        let mut locale = None;
+        let mut ignore_missing = false;
+
+        for (k, v) in hash {
+            let key = k
+                .as_str()
+                .ok_or(format!("key must be a string, but got {k:?}"))?;
+
+            match key {
+                FIELD_NAME => {
+                    fields = Fields::one(yaml_new_field(v, FIELD_NAME)?);
+                }
+                FIELDS_NAME => {
+                    fields = yaml_new_fields(v, FIELDS_NAME)?;
+                }
+
+                FORMATS_NAME => {
+                    let format_strs = yaml_strings(v, FORMATS_NAME)?;
+                    if format_strs.is_empty() {
+                        formats = Formats::new(DEFAULT_FORMATS.clone());
+                    } else {
+                        formats = Formats::new(format_strs.into_iter().map(Arc::new).collect());
+                    }
+                }
+                TIMEZONE_NAME => {
+                    timezone = Some(Arc::new(yaml_string(v, TIMEZONE_NAME)?));
+                }
+                LOCALE_NAME => {
+                    locale = Some(Arc::new(yaml_string(v, LOCALE_NAME)?));
+                }
+                IGNORE_MISSING_NAME => {
+                    ignore_missing = yaml_bool(v, IGNORE_MISSING_NAME)?;
+                }
+
+                _ => {}
+            }
+        }
+
+        let builder = DateProcessorBuilder {
+            fields,
+            formats,
+            timezone,
+            locale,
+            ignore_missing,
+        };

+        Ok(builder)
+    }
+}
+
 /// Deprecated; it should be removed in the future.
 /// Reserved for compatibility only.
 #[derive(Debug, Default)]
 pub struct DateProcessor {
-    fields: Fields,
-
+    fields: Vec<OneInputOneOutputField>,
     formats: Formats,
     timezone: Option<Arc<String>>,
     locale: Option<Arc<String>>, // to support locale
-    output_format: Option<Arc<String>>,

     ignore_missing: bool,
     // description
@@ -96,43 +205,6 @@ pub struct DateProcessor {
 }

 impl DateProcessor {
-    fn with_fields(&mut self, mut fields: Fields) {
-        update_one_one_output_keys(&mut fields);
-        self.fields = fields
-    }
-
-    fn with_formats(&mut self, v: Option<Vec<Arc<String>>>) {
-        let v = match v {
-            Some(v) if !v.is_empty() => v,
-            _ => DEFAULT_FORMATS.clone(),
-        };
-
-        let formats = Formats::new(v);
-        self.formats = formats;
-    }
-
-    fn with_timezone(&mut self, timezone: String) {
-        if !timezone.is_empty() {
-            self.timezone = Some(Arc::new(timezone));
-        }
-    }
-
-    fn with_locale(&mut self, locale: String) {
-        if !locale.is_empty() {
-            self.locale =
Some(Arc::new(locale)); - } - } - - fn with_output_format(&mut self, output_format: String) { - if !output_format.is_empty() { - self.output_format = Some(Arc::new(output_format)); - } - } - - fn with_ignore_missing(&mut self, ignore_missing: bool) { - self.ignore_missing = ignore_missing; - } - fn parse(&self, val: &str) -> Result { let mut tz = Tz::UTC; if let Some(timezone) = &self.timezone { @@ -147,61 +219,6 @@ impl DateProcessor { Err(format!("{} processor: failed to parse {val}", self.kind(),)) } - - fn process_field(&self, val: &str, field: &Field) -> Result { - let key = field.get_target_field(); - - Ok(Map::one(key, Value::Timestamp(self.parse(val)?))) - } -} - -impl TryFrom<&yaml_rust::yaml::Hash> for DateProcessor { - type Error = String; - - fn try_from(hash: &yaml_rust::yaml::Hash) -> Result { - let mut processor = DateProcessor::default(); - - let mut formats_opt = None; - - for (k, v) in hash { - let key = k - .as_str() - .ok_or(format!("key must be a string, but got {k:?}"))?; - - match key { - FIELD_NAME => { - processor.with_fields(Fields::one(yaml_field(v, FIELD_NAME)?)); - } - FIELDS_NAME => { - processor.with_fields(yaml_fields(v, FIELDS_NAME)?); - } - - FORMATS_NAME => { - let formats = yaml_strings(v, FORMATS_NAME)?; - formats_opt = Some(formats.into_iter().map(Arc::new).collect()); - } - TIMEZONE_NAME => { - processor.with_timezone(yaml_string(v, TIMEZONE_NAME)?); - } - LOCALE_NAME => { - processor.with_locale(yaml_string(v, LOCALE_NAME)?); - } - OUTPUT_FORMAT_NAME => { - processor.with_output_format(yaml_string(v, OUTPUT_FORMAT_NAME)?); - } - - IGNORE_MISSING_NAME => { - processor.with_ignore_missing(yaml_bool(v, IGNORE_MISSING_NAME)?); - } - - _ => {} - } - } - - processor.with_formats(formats_opt); - - Ok(processor) - } } impl Processor for DateProcessor { @@ -213,53 +230,21 @@ impl Processor for DateProcessor { self.ignore_missing } - fn fields(&self) -> &Fields { - &self.fields - } - - fn fields_mut(&mut self) -> &mut Fields { - &mut self.fields - } - - fn output_keys(&self) -> HashSet { - self.fields - .iter() - .map(|f| f.get_target_field().to_string()) - .collect() - } - - fn exec_field(&self, val: &Value, field: &Field) -> Result { - match val { - Value::String(s) => self.process_field(s, field), - _ => Err(format!( - "{} processor: expect string value, but got {val:?}", - self.kind() - )), - } - } - fn exec_mut(&self, val: &mut Vec) -> Result<(), String> { - for field in self.fields().iter() { - let index = field.input_field.index; + for field in self.fields.iter() { + let index = field.input_index(); match val.get(index) { Some(Value::String(s)) => { - // TODO(qtang): Let this method use the intermediate state collection directly. 
- let mut map = self.process_field(s, field)?; - field - .output_fields_index_mapping - .iter() - .for_each(|(k, output_index)| { - if let Some(v) = map.remove(k) { - val[*output_index] = v; - } - }); + let timestamp = self.parse(s)?; + let output_index = field.output_index(); + val[output_index] = Value::Timestamp(timestamp); } Some(Value::Null) | None => { if !self.ignore_missing { return Err(format!( "{} processor: missing field: {}", self.kind(), - field.get_field_name() + field.input_name() )); } } @@ -318,8 +303,7 @@ mod tests { #[test] fn test_parse() { - let mut processor = DateProcessor::default(); - processor.with_formats(None); + let processor = DateProcessor::default(); let values: Vec<&str> = vec![ "2014-5-17T12:34:56", @@ -340,7 +324,6 @@ mod tests { #[test] fn test_parse_with_formats() { - let mut processor = DateProcessor::default(); let formats = vec![ "%Y-%m-%dT%H:%M:%S%:z", "%Y-%m-%dT%H:%M:%S%.3f%:z", @@ -349,8 +332,11 @@ mod tests { ] .into_iter() .map(|s| Arc::new(s.to_string())) - .collect(); - processor.with_formats(Some(formats)); + .collect::>(); + let processor = DateProcessor { + formats: super::Formats(formats), + ..Default::default() + }; let values: Vec<&str> = vec![ "2014-5-17T12:34:56", @@ -371,9 +357,10 @@ mod tests { #[test] fn test_parse_with_timezone() { - let mut processor = DateProcessor::default(); - processor.with_formats(None); - processor.with_timezone("Asia/Tokyo".to_string()); + let processor = DateProcessor { + timezone: Some(Arc::new("Asia/Tokyo".to_string())), + ..Default::default() + }; let values: Vec<&str> = vec![ "2014-5-17T12:34:56", diff --git a/src/pipeline/src/etl/processor/dissect.rs b/src/pipeline/src/etl/processor/dissect.rs index ae544f5c43..9a4b8a966e 100644 --- a/src/pipeline/src/etl/processor/dissect.rs +++ b/src/pipeline/src/etl/processor/dissect.rs @@ -12,16 +12,20 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+use std::ops::Deref; + use ahash::{HashMap, HashMapExt, HashSet, HashSetExt}; use common_telemetry::warn; use itertools::Itertools; -use crate::etl::field::{Field, Fields}; +use crate::etl::field::{Fields, InputFieldInfo, OneInputMultiOutputField}; +use crate::etl::find_key_index; use crate::etl::processor::{ - yaml_bool, yaml_field, yaml_fields, yaml_parse_string, yaml_parse_strings, yaml_string, - Processor, FIELDS_NAME, FIELD_NAME, IGNORE_MISSING_NAME, PATTERNS_NAME, PATTERN_NAME, + yaml_bool, yaml_new_field, yaml_new_fields, yaml_parse_string, yaml_parse_strings, yaml_string, + Processor, ProcessorBuilder, ProcessorKind, FIELDS_NAME, FIELD_NAME, IGNORE_MISSING_NAME, + PATTERNS_NAME, PATTERN_NAME, }; -use crate::etl::value::{Map, Value}; +use crate::etl::value::Value; pub(crate) const PROCESSOR_DISSECT: &str = "dissect"; @@ -59,13 +63,13 @@ impl std::fmt::Display for EndModifier { } #[derive(Debug, PartialEq, Default)] -struct Name { +struct NameInfo { name: String, start_modifier: Option, end_modifier: Option, } -impl Name { +impl NameInfo { fn is_name_empty(&self) -> bool { self.name.is_empty() } @@ -125,18 +129,87 @@ impl Name { } } +impl std::fmt::Display for NameInfo { + fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { + write!(f, "{}", self.name) + } +} + +impl From<&str> for NameInfo { + fn from(value: &str) -> Self { + NameInfo { + name: value.to_string(), + start_modifier: None, + end_modifier: None, + } + } +} + +#[derive(Debug, PartialEq, Default)] +struct Name { + name: String, + index: usize, + start_modifier: Option, + end_modifier: Option, +} + impl std::fmt::Display for Name { fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { write!(f, "{}", self.name) } } -impl From<&str> for Name { - fn from(value: &str) -> Self { +impl From for Name { + fn from(value: NameInfo) -> Self { Name { - name: value.to_string(), - start_modifier: None, - end_modifier: None, + name: value.name, + index: 0, + start_modifier: value.start_modifier, + end_modifier: value.end_modifier, + } + } +} + +impl Name { + fn is_name_empty(&self) -> bool { + self.name.is_empty() + } + + fn is_empty(&self) -> bool { + self.name.is_empty() && self.start_modifier.is_none() && self.end_modifier.is_none() + } + + fn is_end_modifier_set(&self) -> bool { + self.end_modifier.is_some() + } +} + +#[derive(Debug, PartialEq)] +enum PartInfo { + Split(String), + Name(NameInfo), +} + +impl PartInfo { + fn is_empty(&self) -> bool { + match self { + PartInfo::Split(v) => v.is_empty(), + PartInfo::Name(v) => v.is_empty(), + } + } + + fn empty_split() -> Self { + PartInfo::Split(String::new()) + } + + fn empty_name() -> Self { + PartInfo::Name(NameInfo::default()) + } + + fn push(&mut self, ch: char) { + match self { + PartInfo::Split(v) => v.push(ch), + PartInfo::Name(v) => v.name.push(ch), } } } @@ -162,11 +235,13 @@ impl Part { fn empty_name() -> Self { Part::Name(Name::default()) } +} - fn push(&mut self, ch: char) { - match self { - Part::Split(v) => v.push(ch), - Part::Name(v) => v.name.push(ch), +impl From for Part { + fn from(value: PartInfo) -> Self { + match value { + PartInfo::Split(v) => Part::Split(v), + PartInfo::Name(v) => Part::Name(v.into()), } } } @@ -177,7 +252,7 @@ struct Pattern { parts: Vec, } -impl std::ops::Deref for Pattern { +impl Deref for Pattern { type Target = Vec; fn deref(&self) -> &Self::Target { @@ -185,18 +260,42 @@ impl std::ops::Deref for Pattern { } } -impl std::ops::DerefMut for Pattern { +impl From for Pattern { + fn from(value: PatternInfo) -> Self { + 
let parts = value.parts.into_iter().map(|x| x.into()).collect(); + Pattern { + origin: value.origin, + parts, + } + } +} + +#[derive(Debug, Default)] +struct PatternInfo { + origin: String, + parts: Vec, +} + +impl std::ops::Deref for PatternInfo { + type Target = Vec; + + fn deref(&self) -> &Self::Target { + &self.parts + } +} + +impl std::ops::DerefMut for PatternInfo { fn deref_mut(&mut self) -> &mut Self::Target { &mut self.parts } } -impl std::str::FromStr for Pattern { +impl std::str::FromStr for PatternInfo { type Err = String; fn from_str(s: &str) -> Result { let mut parts = vec![]; - let mut cursor = Part::empty_split(); + let mut cursor = PartInfo::empty_split(); let origin = s.to_string(); let chars: Vec = origin.chars().collect(); @@ -206,27 +305,27 @@ impl std::str::FromStr for Pattern { let ch = chars[pos]; match (ch, &mut cursor) { // if cursor is Split part, and found %{, then ready to start a Name part - ('%', Part::Split(_)) if matches!(chars.get(pos + 1), Some('{')) => { + ('%', PartInfo::Split(_)) if matches!(chars.get(pos + 1), Some('{')) => { if !cursor.is_empty() { parts.push(cursor); } - cursor = Part::empty_name(); + cursor = PartInfo::empty_name(); pos += 1; // skip '{' } // if cursor is Split part, and not found % or {, then continue the Split part - (_, Part::Split(_)) => { + (_, PartInfo::Split(_)) => { cursor.push(ch); } // if cursor is Name part, and found }, then end the Name part, start the next Split part - ('}', Part::Name(_)) => { + ('}', PartInfo::Name(_)) => { parts.push(cursor); - cursor = Part::empty_split(); + cursor = PartInfo::empty_split(); } - ('+', Part::Name(name)) if !name.is_start_modifier_set() => { + ('+', PartInfo::Name(name)) if !name.is_start_modifier_set() => { name.try_start_modifier(StartModifier::Append(None))?; } - ('/', Part::Name(name)) if name.is_append_modifier_set() => { + ('/', PartInfo::Name(name)) if name.is_append_modifier_set() => { let mut order = 0; let mut j = pos + 1; while j < chars.len() { @@ -248,16 +347,16 @@ impl std::str::FromStr for Pattern { name.try_append_order(order)?; pos = j - 1; // this will change the position to the last digit of the order } - ('?', Part::Name(name)) if !name.is_start_modifier_set() => { + ('?', PartInfo::Name(name)) if !name.is_start_modifier_set() => { name.try_start_modifier(StartModifier::NamedSkip)?; } - ('*', Part::Name(name)) if !name.is_start_modifier_set() => { + ('*', PartInfo::Name(name)) if !name.is_start_modifier_set() => { name.try_start_modifier(StartModifier::MapKey)?; } - ('&', Part::Name(name)) if !name.is_start_modifier_set() => { + ('&', PartInfo::Name(name)) if !name.is_start_modifier_set() => { name.try_start_modifier(StartModifier::MapVal)?; } - ('-', Part::Name(name)) if !name.is_end_modifier_set() => { + ('-', PartInfo::Name(name)) if !name.is_end_modifier_set() => { if let Some('>') = chars.get(pos + 1) { } else { return Err(format!( @@ -273,7 +372,7 @@ impl std::str::FromStr for Pattern { name.try_end_modifier()?; pos += 1; // only skip '>', the next loop will skip '}' } - (_, Part::Name(name)) if !is_valid_char(ch) => { + (_, PartInfo::Name(name)) if !is_valid_char(ch) => { let tail: String = if name.is_name_empty() { format!("Invalid '{ch}'") } else { @@ -281,7 +380,7 @@ impl std::str::FromStr for Pattern { }; return Err(format!("Invalid Pattern: '{s}'. 
{tail}")); } - (_, Part::Name(_)) => { + (_, PartInfo::Name(_)) => { cursor.push(ch); } } @@ -290,8 +389,8 @@ impl std::str::FromStr for Pattern { } match cursor { - Part::Split(ref split) if !split.is_empty() => parts.push(cursor), - Part::Name(name) if !name.is_empty() => { + PartInfo::Split(ref split) if !split.is_empty() => parts.push(cursor), + PartInfo::Name(name) if !name.is_empty() => { return Err(format!("Invalid Pattern: '{s}'. '{name}' is not closed")) } _ => {} @@ -303,7 +402,7 @@ impl std::str::FromStr for Pattern { } } -impl Pattern { +impl PatternInfo { fn check(&self) -> Result<(), String> { if self.len() == 0 { return Err("Empty pattern is not allowed".to_string()); @@ -316,19 +415,19 @@ impl Pattern { let this_part = &self[i]; let next_part = self.get(i + 1); match (this_part, next_part) { - (Part::Split(split), _) if split.is_empty() => { + (PartInfo::Split(split), _) if split.is_empty() => { return Err(format!( "Invalid Pattern: '{}'. Empty split is not allowed", self.origin )); } - (Part::Name(name1), Some(Part::Name(name2))) => { + (PartInfo::Name(name1), Some(PartInfo::Name(name2))) => { return Err(format!( "Invalid Pattern: '{}'. consecutive names are not allowed: '{}' '{}'", self.origin, name1, name2 )); } - (Part::Name(name), _) if name.is_name_empty() => { + (PartInfo::Name(name), _) if name.is_name_empty() => { if let Some(ref m) = name.start_modifier { return Err(format!( "Invalid Pattern: '{}'. only '{}' modifier is invalid", @@ -336,7 +435,7 @@ impl Pattern { )); } } - (Part::Name(name), _) => match name.start_modifier { + (PartInfo::Name(name), _) => match name.start_modifier { Some(StartModifier::MapKey) => { if map_keys.contains(&name.name) { return Err(format!( @@ -379,15 +478,131 @@ impl Pattern { } } -impl std::fmt::Display for Pattern { +impl std::fmt::Display for PatternInfo { fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { write!(f, "{}", self.origin) } } #[derive(Debug, Default)] -pub struct DissectProcessor { +pub struct DissectProcessorBuilder { fields: Fields, + patterns: Vec, + ignore_missing: bool, + append_separator: Option, + output_keys: HashSet, +} + +impl DissectProcessorBuilder { + fn build_output_keys(patterns: &[PatternInfo]) -> HashSet { + patterns + .iter() + .flat_map(|pattern| pattern.iter()) + .filter_map(|p| match p { + PartInfo::Name(name) => { + if !name.is_empty() + && (name.start_modifier.is_none() + || name + .start_modifier + .as_ref() + .is_some_and(|x| matches!(x, StartModifier::Append(_)))) + { + Some(name.to_string()) + } else { + None + } + } + _ => None, + }) + .collect() + } + + fn part_info_to_part( + part_info: PartInfo, + intermediate_keys: &[String], + ) -> Result { + match part_info { + PartInfo::Split(s) => Ok(Part::Split(s)), + PartInfo::Name(n) => match n.start_modifier { + None | Some(StartModifier::Append(_)) => { + let index = find_key_index(intermediate_keys, &n.name, "dissect")?; + Ok(Part::Name(Name { + name: n.name, + index, + start_modifier: n.start_modifier, + end_modifier: n.end_modifier, + })) + } + _ => Ok(Part::Name(Name { + name: n.name, + index: usize::MAX, + start_modifier: n.start_modifier, + end_modifier: n.end_modifier, + })), + }, + } + } + + fn pattern_info_to_pattern( + pattern_info: PatternInfo, + intermediate_keys: &[String], + ) -> Result { + let original = pattern_info.origin; + let pattern = pattern_info + .parts + .into_iter() + .map(|part_info| Self::part_info_to_part(part_info, intermediate_keys)) + .collect::, String>>()?; + Ok(Pattern { + origin: original, + 
parts: pattern, + }) + } + + fn build_patterns_from_pattern_infos( + patterns: Vec, + intermediate_keys: &[String], + ) -> Result, String> { + patterns + .into_iter() + .map(|pattern_info| Self::pattern_info_to_pattern(pattern_info, intermediate_keys)) + .collect() + } +} + +impl ProcessorBuilder for DissectProcessorBuilder { + fn output_keys(&self) -> HashSet<&str> { + self.output_keys.iter().map(|s| s.as_str()).collect() + } + + fn input_keys(&self) -> HashSet<&str> { + self.fields.iter().map(|f| f.input_field()).collect() + } + + fn build(self, intermediate_keys: &[String]) -> Result { + let mut real_fields = vec![]; + for field in self.fields.into_iter() { + let input_index = find_key_index(intermediate_keys, field.input_field(), "dissect")?; + + let input_field_info = InputFieldInfo::new(field.input_field(), input_index); + + let real_field = OneInputMultiOutputField::new(input_field_info, field.target_field); + real_fields.push(real_field); + } + let patterns = Self::build_patterns_from_pattern_infos(self.patterns, intermediate_keys)?; + let processor = DissectProcessor { + fields: real_fields, + patterns, + ignore_missing: self.ignore_missing, + append_separator: self.append_separator, + }; + Ok(ProcessorKind::Dissect(processor)) + } +} + +#[derive(Debug, Default)] +pub struct DissectProcessor { + fields: Vec, patterns: Vec, ignore_missing: bool, @@ -396,59 +611,51 @@ pub struct DissectProcessor { } impl DissectProcessor { - fn with_fields(&mut self, fields: Fields) { - self.fields = fields; - } - - fn with_ignore_missing(&mut self, ignore_missing: bool) { - self.ignore_missing = ignore_missing; - } - - fn with_patterns(&mut self, patterns: Vec) { - self.patterns = patterns; - } - - fn with_append_separator(&mut self, append_separator: String) { - self.append_separator = Some(append_separator); - } - - fn process_pattern(&self, chs: &[char], pattern: &Pattern) -> Result { - let mut map = Map::default(); + fn process_pattern( + &self, + chs: &[char], + pattern: &Pattern, + ) -> Result, String> { + let mut map = Vec::new(); let mut pos = 0; - let mut appends: HashMap> = HashMap::new(); - let mut maps: HashMap = HashMap::new(); + let mut appends: HashMap> = HashMap::new(); + // let mut maps: HashMap = HashMap::new(); let mut process_name_value = |name: &Name, value: String| { - let name_str = name.to_string(); + let name_index = name.index; match name.start_modifier { Some(StartModifier::NamedSkip) => { // do nothing, ignore this match } Some(StartModifier::Append(order)) => { appends - .entry(name_str) + .entry(name_index) .or_default() .push((value, order.unwrap_or_default())); } - Some(StartModifier::MapKey) => match maps.get(&name_str) { - Some(map_val) => { - map.insert(value, Value::String(map_val.to_string())); - } - None => { - maps.insert(name_str, value); - } - }, - Some(StartModifier::MapVal) => match maps.get(&name_str) { - Some(map_key) => { - map.insert(map_key, Value::String(value)); - } - None => { - maps.insert(name_str, value); - } - }, + // Some(StartModifier::MapKey) => match maps.get(&name_index) { + // Some(map_val) => { + // map.insert(value, Value::String(map_val.to_string())); + // } + // None => { + // maps.insert(name_index, value); + // } + // }, + // Some(StartModifier::MapVal) => match maps.get(&name_index) { + // Some(map_key) => { + // map.insert(map_key, Value::String(value)); + // } + // None => { + // maps.insert(name_index, value); + // } + // }, + Some(_) => { + // do nothing, ignore MapKey and MapVal + // because transform can know the key name + } 
None => { - map.insert(name.to_string(), Value::String(value)); + map.push((name_index, Value::String(value))); } } }; @@ -524,60 +731,37 @@ impl DissectProcessor { for (name, mut values) in appends { values.sort_by(|a, b| a.1.cmp(&b.1)); let value = values.into_iter().map(|(a, _)| a).join(sep); - map.insert(name, Value::String(value)); + map.push((name, Value::String(value))); } } Ok(map) } - fn process(&self, val: &str) -> Result { + fn process(&self, val: &str) -> Result, String> { let chs = val.chars().collect::>(); for pattern in &self.patterns { - if let Ok(map) = self.process_pattern(&chs, pattern) { - return Ok(map); + match self.process_pattern(&chs, pattern) { + Ok(map) => return Ok(map), + Err(e) => { + warn!("dissect processor: {}", e); + } } } Err("No matching pattern found".to_string()) } - - /// Update the output keys for each field. - fn update_output_keys(&mut self) { - // every pattern had been checked, so we can get all the output keys - let output_keys = self - .patterns - .iter() - .flat_map(|pattern| pattern.iter()) - .filter_map(|p| match p { - Part::Name(name) => { - if !name.is_empty() - && !name.start_modifier.as_ref().is_some_and(|x| { - *x == StartModifier::NamedSkip || *x == StartModifier::MapVal - }) - { - Some(name) - } else { - None - } - } - _ => None, - }) - .collect::>(); - for field in self.fields.iter_mut() { - for k in &output_keys { - field.output_fields_index_mapping.insert(k.to_string(), 0); - } - } - } } -impl TryFrom<&yaml_rust::yaml::Hash> for DissectProcessor { +impl TryFrom<&yaml_rust::yaml::Hash> for DissectProcessorBuilder { type Error = String; fn try_from(value: &yaml_rust::yaml::Hash) -> Result { - let mut processor = Self::default(); + let mut fields = Fields::default(); + let mut patterns = vec![]; + let mut ignore_missing = false; + let mut append_separator = None; for (k, v) in value.iter() { let key = k @@ -585,27 +769,38 @@ impl TryFrom<&yaml_rust::yaml::Hash> for DissectProcessor { .ok_or(format!("key must be a string, but got '{k:?}'"))?; match key { - FIELD_NAME => processor.with_fields(Fields::one(yaml_field(v, FIELD_NAME)?)), - FIELDS_NAME => processor.with_fields(yaml_fields(v, FIELDS_NAME)?), + FIELD_NAME => { + fields = Fields::one(yaml_new_field(v, FIELD_NAME)?); + } + FIELDS_NAME => { + fields = yaml_new_fields(v, FIELDS_NAME)?; + } PATTERN_NAME => { - let pattern: Pattern = yaml_parse_string(v, PATTERN_NAME)?; - processor.with_patterns(vec![pattern]); + let pattern: PatternInfo = yaml_parse_string(v, PATTERN_NAME)?; + patterns = vec![pattern]; } PATTERNS_NAME => { - let patterns = yaml_parse_strings(v, PATTERNS_NAME)?; - processor.with_patterns(patterns); + patterns = yaml_parse_strings(v, PATTERNS_NAME)?; } IGNORE_MISSING_NAME => { - processor.with_ignore_missing(yaml_bool(v, IGNORE_MISSING_NAME)?) + ignore_missing = yaml_bool(v, IGNORE_MISSING_NAME)?; } APPEND_SEPARATOR_NAME => { - processor.with_append_separator(yaml_string(v, APPEND_SEPARATOR_NAME)?) 
+ append_separator = Some(yaml_string(v, APPEND_SEPARATOR_NAME)?); } _ => {} } } - processor.update_output_keys(); - Ok(processor) + let output_keys = Self::build_output_keys(&patterns); + let builder = DissectProcessorBuilder { + fields, + patterns, + ignore_missing, + append_separator, + output_keys, + }; + + Ok(builder) } } @@ -618,59 +813,15 @@ impl Processor for DissectProcessor { self.ignore_missing } - fn fields(&self) -> &Fields { - &self.fields - } - - fn fields_mut(&mut self) -> &mut Fields { - &mut self.fields - } - - fn output_keys(&self) -> HashSet { - let mut result = HashSet::with_capacity(30); - for pattern in &self.patterns { - for part in pattern.iter() { - if let Part::Name(name) = part { - if !name.is_empty() { - result.insert(name.to_string()); - } - } - } - } - result - } - - fn exec_field(&self, val: &Value, _field: &Field) -> Result { - match val { - Value::String(val) => match self.process(val) { - Ok(map) => Ok(map), - Err(e) => { - warn!("dissect processor: {}", e); - Ok(Map::default()) - } - }, - _ => Err(format!( - "{} processor: expect string value, but got {val:?}", - self.kind() - )), - } - } - fn exec_mut(&self, val: &mut Vec) -> Result<(), String> { for field in self.fields.iter() { - let index = field.input_field.index; + let index = field.input_index(); match val.get(index) { - // TODO(qtang): Let this method use the intermediate state collection directly. Some(Value::String(val_str)) => match self.process(val_str) { - Ok(mut map) => { - field - .output_fields_index_mapping - .iter() - .for_each(|(k, output_index)| { - if let Some(v) = map.remove(k) { - val[*output_index] = v - } - }); + Ok(r) => { + for (k, v) in r { + val[k] = v; + } } Err(e) => { warn!("dissect processor: {}", e); @@ -681,7 +832,7 @@ impl Processor for DissectProcessor { return Err(format!( "{} processor: missing field: {}", self.kind(), - field.get_field_name() + field.input_name() )); } } @@ -705,17 +856,29 @@ fn is_valid_char(ch: char) -> bool { mod tests { use ahash::HashMap; - use super::{DissectProcessor, EndModifier, Name, Part, Pattern, StartModifier}; - use crate::etl::value::{Map, Value}; + use super::{DissectProcessor, EndModifier, NameInfo, PartInfo, PatternInfo, StartModifier}; + use crate::etl::processor::dissect::DissectProcessorBuilder; + use crate::etl::value::Value; fn assert(pattern_str: &str, input: &str, expected: HashMap) { let chs = input.chars().collect::>(); - let pattern = pattern_str.parse().unwrap(); + let pattern_infos: Vec = vec![pattern_str.parse().unwrap()]; + let output_keys: Vec = DissectProcessorBuilder::build_output_keys(&pattern_infos) + .into_iter() + .collect(); + let pattern = + DissectProcessorBuilder::build_patterns_from_pattern_infos(pattern_infos, &output_keys) + .unwrap(); let processor = DissectProcessor::default(); - let map = processor.process_pattern(&chs, &pattern).unwrap(); + let result: HashMap = processor + .process_pattern(&chs, &pattern[0]) + .unwrap() + .into_iter() + .map(|(k, v)| (output_keys[k].to_string(), v)) + .collect(); - assert_eq!(map, Map::from(expected), "pattern: {}", pattern_str); + assert_eq!(result, expected, "pattern: {}", pattern_str); } #[test] @@ -723,28 +886,28 @@ mod tests { let cases = [( "%{clientip} %{ident} %{auth} [%{timestamp}] \"%{verb} %{request} HTTP/%{httpversion}\" %{status} %{size}", vec![ - Part::Name("clientip".into()), - Part::Split(" ".into()), - Part::Name("ident".into()), - Part::Split(" ".into()), - Part::Name("auth".into()), - Part::Split(" [".into()), - Part::Name("timestamp".into()), - 
Part::Split("] \"".into()), - Part::Name("verb".into()), - Part::Split(" ".into()), - Part::Name("request".into()), - Part::Split(" HTTP/".into()), - Part::Name("httpversion".into()), - Part::Split("\" ".into()), - Part::Name("status".into()), - Part::Split(" ".into()), - Part::Name("size".into()), + PartInfo::Name("clientip".into()), + PartInfo::Split(" ".into()), + PartInfo::Name("ident".into()), + PartInfo::Split(" ".into()), + PartInfo::Name("auth".into()), + PartInfo::Split(" [".into()), + PartInfo::Name("timestamp".into()), + PartInfo::Split("] \"".into()), + PartInfo::Name("verb".into()), + PartInfo::Split(" ".into()), + PartInfo::Name("request".into()), + PartInfo::Split(" HTTP/".into()), + PartInfo::Name("httpversion".into()), + PartInfo::Split("\" ".into()), + PartInfo::Name("status".into()), + PartInfo::Split(" ".into()), + PartInfo::Name("size".into()), ], )]; for (pattern, expected) in cases.into_iter() { - let p: Pattern = pattern.parse().unwrap(); + let p: PatternInfo = pattern.parse().unwrap(); assert_eq!(p.parts, expected); } } @@ -755,13 +918,13 @@ mod tests { ( "%{} %{}", vec![ - Part::Name(Name { + PartInfo::Name(NameInfo { name: "".into(), start_modifier: None, end_modifier: None, }), - Part::Split(" ".into()), - Part::Name(Name { + PartInfo::Split(" ".into()), + PartInfo::Name(NameInfo { name: "".into(), start_modifier: None, end_modifier: None, @@ -771,61 +934,61 @@ mod tests { ( "%{ts->} %{level}", vec![ - Part::Name(Name { + PartInfo::Name(NameInfo { name: "ts".into(), start_modifier: None, end_modifier: Some(EndModifier), }), - Part::Split(" ".into()), - Part::Name("level".into()), + PartInfo::Split(" ".into()), + PartInfo::Name("level".into()), ], ), ( "[%{ts}]%{->}[%{level}]", vec![ - Part::Split("[".into()), - Part::Name(Name { + PartInfo::Split("[".into()), + PartInfo::Name(NameInfo { name: "ts".into(), start_modifier: None, end_modifier: None, }), - Part::Split("]".into()), - Part::Name(Name { + PartInfo::Split("]".into()), + PartInfo::Name(NameInfo { name: "".into(), start_modifier: None, end_modifier: Some(EndModifier), }), - Part::Split("[".into()), - Part::Name(Name { + PartInfo::Split("[".into()), + PartInfo::Name(NameInfo { name: "level".into(), start_modifier: None, end_modifier: None, }), - Part::Split("]".into()), + PartInfo::Split("]".into()), ], ), ( "%{+name} %{+name} %{+name} %{+name}", vec![ - Part::Name(Name { + PartInfo::Name(NameInfo { name: "name".into(), start_modifier: Some(StartModifier::Append(None)), end_modifier: None, }), - Part::Split(" ".into()), - Part::Name(Name { + PartInfo::Split(" ".into()), + PartInfo::Name(NameInfo { name: "name".into(), start_modifier: Some(StartModifier::Append(None)), end_modifier: None, }), - Part::Split(" ".into()), - Part::Name(Name { + PartInfo::Split(" ".into()), + PartInfo::Name(NameInfo { name: "name".into(), start_modifier: Some(StartModifier::Append(None)), end_modifier: None, }), - Part::Split(" ".into()), - Part::Name(Name { + PartInfo::Split(" ".into()), + PartInfo::Name(NameInfo { name: "name".into(), start_modifier: Some(StartModifier::Append(None)), end_modifier: None, @@ -835,25 +998,25 @@ mod tests { ( "%{+name/2} %{+name/4} %{+name/3} %{+name/1}", vec![ - Part::Name(Name { + PartInfo::Name(NameInfo { name: "name".into(), start_modifier: Some(StartModifier::Append(Some(2))), end_modifier: None, }), - Part::Split(" ".into()), - Part::Name(Name { + PartInfo::Split(" ".into()), + PartInfo::Name(NameInfo { name: "name".into(), start_modifier: Some(StartModifier::Append(Some(4))), end_modifier: 
None, }), - Part::Split(" ".into()), - Part::Name(Name { + PartInfo::Split(" ".into()), + PartInfo::Name(NameInfo { name: "name".into(), start_modifier: Some(StartModifier::Append(Some(3))), end_modifier: None, }), - Part::Split(" ".into()), - Part::Name(Name { + PartInfo::Split(" ".into()), + PartInfo::Name(NameInfo { name: "name".into(), start_modifier: Some(StartModifier::Append(Some(1))), end_modifier: None, @@ -863,67 +1026,67 @@ mod tests { ( "%{clientip} %{?ident} %{?auth} [%{timestamp}]", vec![ - Part::Name(Name { + PartInfo::Name(NameInfo { name: "clientip".into(), start_modifier: None, end_modifier: None, }), - Part::Split(" ".into()), - Part::Name(Name { + PartInfo::Split(" ".into()), + PartInfo::Name(NameInfo { name: "ident".into(), start_modifier: Some(StartModifier::NamedSkip), end_modifier: None, }), - Part::Split(" ".into()), - Part::Name(Name { + PartInfo::Split(" ".into()), + PartInfo::Name(NameInfo { name: "auth".into(), start_modifier: Some(StartModifier::NamedSkip), end_modifier: None, }), - Part::Split(" [".into()), - Part::Name(Name { + PartInfo::Split(" [".into()), + PartInfo::Name(NameInfo { name: "timestamp".into(), start_modifier: None, end_modifier: None, }), - Part::Split("]".into()), + PartInfo::Split("]".into()), ], ), ( "[%{ts}] [%{level}] %{*p1}:%{&p1} %{*p2}:%{&p2}", vec![ - Part::Split("[".into()), - Part::Name(Name { + PartInfo::Split("[".into()), + PartInfo::Name(NameInfo { name: "ts".into(), start_modifier: None, end_modifier: None, }), - Part::Split("] [".into()), - Part::Name(Name { + PartInfo::Split("] [".into()), + PartInfo::Name(NameInfo { name: "level".into(), start_modifier: None, end_modifier: None, }), - Part::Split("] ".into()), - Part::Name(Name { + PartInfo::Split("] ".into()), + PartInfo::Name(NameInfo { name: "p1".into(), start_modifier: Some(StartModifier::MapKey), end_modifier: None, }), - Part::Split(":".into()), - Part::Name(Name { + PartInfo::Split(":".into()), + PartInfo::Name(NameInfo { name: "p1".into(), start_modifier: Some(StartModifier::MapVal), end_modifier: None, }), - Part::Split(" ".into()), - Part::Name(Name { + PartInfo::Split(" ".into()), + PartInfo::Name(NameInfo { name: "p2".into(), start_modifier: Some(StartModifier::MapKey), end_modifier: None, }), - Part::Split(":".into()), - Part::Name(Name { + PartInfo::Split(":".into()), + PartInfo::Name(NameInfo { name: "p2".into(), start_modifier: Some(StartModifier::MapVal), end_modifier: None, @@ -933,13 +1096,13 @@ mod tests { ( "%{&p1}:%{*p1}", vec![ - Part::Name(Name { + PartInfo::Name(NameInfo { name: "p1".into(), start_modifier: Some(StartModifier::MapVal), end_modifier: None, }), - Part::Split(":".into()), - Part::Name(Name { + PartInfo::Split(":".into()), + PartInfo::Name(NameInfo { name: "p1".into(), start_modifier: Some(StartModifier::MapKey), end_modifier: None, @@ -949,7 +1112,7 @@ mod tests { ]; for (pattern, expected) in cases.into_iter() { - let p: Pattern = pattern.parse().unwrap(); + let p: PatternInfo = pattern.parse().unwrap(); assert_eq!(p.parts, expected); } } @@ -1029,7 +1192,7 @@ mod tests { ]; for (pattern, expected) in cases.into_iter() { - let err = pattern.parse::().unwrap_err(); + let err = pattern.parse::().unwrap_err(); assert_eq!(err, expected); } } @@ -1164,45 +1327,4 @@ mod tests { ); } } - - #[test] - fn test_dissect_reference_keys() { - let cases = [ - ( - "[%{ts}] [%{level}] %{*p1}:%{&p1} %{*p2}:%{&p2}", - "[2018-08-10T17:15:42,466] [ERR] ip:1.2.3.4 error:REFUSED", - [ - ("ts", "2018-08-10T17:15:42,466"), - ("level", "ERR"), - ("ip", 
"1.2.3.4"), - ("error", "REFUSED"), - ], - ), - ( - "[%{ts}] [%{level}] %{&p1}:%{*p1} %{*p2}:%{&p2}", - "[2018-08-10T17:15:42,466] [ERR] ip:1.2.3.4 error:REFUSED", - [ - ("ts", "2018-08-10T17:15:42,466"), - ("level", "ERR"), - ("1.2.3.4", "ip"), - ("error", "REFUSED"), - ], - ), - ] - .into_iter() - .map(|(pattern, input, expected)| { - let map = expected - .into_iter() - .map(|(k, v)| (k.to_string(), Value::String(v.to_string()))); - (pattern, input, map) - }); - - for (pattern_str, input, expected) in cases { - assert( - pattern_str, - input, - expected.collect::>(), - ); - } - } } diff --git a/src/pipeline/src/etl/processor/epoch.rs b/src/pipeline/src/etl/processor/epoch.rs index 7af075bdb0..32c7d61786 100644 --- a/src/pipeline/src/etl/processor/epoch.rs +++ b/src/pipeline/src/etl/processor/epoch.rs @@ -14,17 +14,17 @@ use ahash::HashSet; -use crate::etl::field::{Field, Fields}; +use crate::etl::field::{Fields, OneInputOneOutputField}; use crate::etl::processor::{ - update_one_one_output_keys, yaml_bool, yaml_field, yaml_fields, yaml_string, Processor, - FIELDS_NAME, FIELD_NAME, IGNORE_MISSING_NAME, + yaml_bool, yaml_new_field, yaml_new_fields, yaml_string, Processor, ProcessorBuilder, + ProcessorKind, FIELDS_NAME, FIELD_NAME, IGNORE_MISSING_NAME, }; use crate::etl::value::time::{ MICROSECOND_RESOLUTION, MICRO_RESOLUTION, MILLISECOND_RESOLUTION, MILLI_RESOLUTION, MS_RESOLUTION, NANOSECOND_RESOLUTION, NANO_RESOLUTION, NS_RESOLUTION, SECOND_RESOLUTION, SEC_RESOLUTION, S_RESOLUTION, US_RESOLUTION, }; -use crate::etl::value::{Map, Timestamp, Value}; +use crate::etl::value::{Timestamp, Value}; pub(crate) const PROCESSOR_EPOCH: &str = "epoch"; const RESOLUTION_NAME: &str = "resolution"; @@ -52,12 +52,56 @@ impl TryFrom<&str> for Resolution { } } +#[derive(Debug, Default)] +pub struct EpochProcessorBuilder { + fields: Fields, + resolution: Resolution, + ignore_missing: bool, +} + +impl ProcessorBuilder for EpochProcessorBuilder { + fn output_keys(&self) -> HashSet<&str> { + self.fields + .iter() + .map(|f| f.target_or_input_field()) + .collect() + } + + fn input_keys(&self) -> HashSet<&str> { + self.fields.iter().map(|f| f.input_field()).collect() + } + + fn build(self, intermediate_keys: &[String]) -> Result { + self.build(intermediate_keys).map(ProcessorKind::Epoch) + } +} + +impl EpochProcessorBuilder { + pub fn build(self, intermediate_keys: &[String]) -> Result { + let mut real_fields = vec![]; + for field in self.fields.into_iter() { + let input = OneInputOneOutputField::build( + "epoch", + intermediate_keys, + field.input_field(), + field.target_or_input_field(), + )?; + real_fields.push(input); + } + Ok(EpochProcessor { + fields: real_fields, + resolution: self.resolution, + ignore_missing: self.ignore_missing, + }) + } +} + /// support string, integer, float, time, epoch /// deprecated it should be removed in the future /// Reserved for compatibility only #[derive(Debug, Default)] pub struct EpochProcessor { - fields: Fields, + fields: Vec, resolution: Resolution, ignore_missing: bool, // description @@ -68,19 +112,6 @@ pub struct EpochProcessor { } impl EpochProcessor { - fn with_fields(&mut self, mut fields: Fields) { - update_one_one_output_keys(&mut fields); - self.fields = fields - } - - fn with_resolution(&mut self, resolution: Resolution) { - self.resolution = resolution; - } - - fn with_ignore_missing(&mut self, ignore_missing: bool) { - self.ignore_missing = ignore_missing; - } - fn parse(&self, val: &Value) -> Result { let t: i64 = match val { Value::String(s) => s @@ 
-117,19 +148,15 @@ impl EpochProcessor { Resolution::Nano => Ok(Timestamp::Nanosecond(t)), } } - - fn process_field(&self, val: &Value, field: &Field) -> Result { - let key = field.get_target_field(); - - Ok(Map::one(key, Value::Timestamp(self.parse(val)?))) - } } -impl TryFrom<&yaml_rust::yaml::Hash> for EpochProcessor { +impl TryFrom<&yaml_rust::yaml::Hash> for EpochProcessorBuilder { type Error = String; fn try_from(hash: &yaml_rust::yaml::Hash) -> Result { - let mut processor = EpochProcessor::default(); + let mut fields = Fields::default(); + let mut resolution = Resolution::default(); + let mut ignore_missing = false; for (k, v) in hash { let key = k @@ -138,24 +165,29 @@ impl TryFrom<&yaml_rust::yaml::Hash> for EpochProcessor { match key { FIELD_NAME => { - processor.with_fields(Fields::one(yaml_field(v, FIELD_NAME)?)); + fields = Fields::one(yaml_new_field(v, FIELD_NAME)?); } FIELDS_NAME => { - processor.with_fields(yaml_fields(v, FIELDS_NAME)?); + fields = yaml_new_fields(v, FIELDS_NAME)?; } RESOLUTION_NAME => { let s = yaml_string(v, RESOLUTION_NAME)?.as_str().try_into()?; - processor.with_resolution(s); + resolution = s; } IGNORE_MISSING_NAME => { - processor.with_ignore_missing(yaml_bool(v, IGNORE_MISSING_NAME)?); + ignore_missing = yaml_bool(v, IGNORE_MISSING_NAME)?; } _ => {} } } + let builder = EpochProcessorBuilder { + fields, + resolution, + ignore_missing, + }; - Ok(processor) + Ok(builder) } } @@ -168,49 +200,23 @@ impl Processor for EpochProcessor { self.ignore_missing } - fn fields(&self) -> &Fields { - &self.fields - } - - fn fields_mut(&mut self) -> &mut Fields { - &mut self.fields - } - - fn output_keys(&self) -> HashSet { - self.fields - .iter() - .map(|f| f.get_target_field().to_string()) - .collect() - } - - fn exec_field(&self, val: &Value, field: &Field) -> Result { - self.process_field(val, field) - } - fn exec_mut(&self, val: &mut Vec) -> Result<(), String> { for field in self.fields.iter() { - let index = field.input_field.index; + let index = field.input_index(); match val.get(index) { Some(Value::Null) | None => { if !self.ignore_missing { return Err(format!( "{} processor: missing field: {}", self.kind(), - field.get_field_name() + field.input_name() )); } } Some(v) => { - // TODO(qtang): Let this method use the intermediate state collection directly. 
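// Sketch (an assumption about crate::etl::field internals, for orientation
// only): OneInputOneOutputField::build plausibly resolves both key names to
// positions in the shared intermediate-value vector once, at
// pipeline-construction time, along the lines of:
//
//   let input_index  = find_key_index(intermediate_keys, input_field, "epoch")?;
//   let output_index = find_key_index(intermediate_keys, target_field, "epoch")?;
//   // keep (name, index) for both sides so exec_mut can assign by index
//
// That precomputation is what lets the rewritten arm below write
// val[output_index] directly instead of building a temporary Map per row,
// as the removed code here did.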
- let mut map = self.process_field(v, field)?; - field - .output_fields_index_mapping - .iter() - .for_each(|(k, output_index)| { - if let Some(v) = map.remove(k) { - val[*output_index] = v; - } - }); + let timestamp = self.parse(v)?; + let output_index = field.output_index(); + val[output_index] = Value::Timestamp(timestamp); } } } @@ -225,8 +231,10 @@ mod tests { #[test] fn test_parse_epoch() { - let mut processor = EpochProcessor::default(); - processor.with_resolution(super::Resolution::Second); + let processor = EpochProcessor { + resolution: super::Resolution::Second, + ..Default::default() + }; let values = [ Value::String("1573840000".into()), diff --git a/src/pipeline/src/etl/processor/gsub.rs b/src/pipeline/src/etl/processor/gsub.rs index 9129dc1a0f..1b8e581e6a 100644 --- a/src/pipeline/src/etl/processor/gsub.rs +++ b/src/pipeline/src/etl/processor/gsub.rs @@ -15,45 +15,43 @@ use ahash::HashSet; use regex::Regex; -use crate::etl::field::{Field, Fields}; +use crate::etl::field::{Fields, OneInputOneOutputField}; use crate::etl::processor::{ - update_one_one_output_keys, yaml_bool, yaml_field, yaml_fields, yaml_string, Processor, - FIELDS_NAME, FIELD_NAME, IGNORE_MISSING_NAME, PATTERN_NAME, + yaml_bool, yaml_new_field, yaml_new_fields, yaml_string, Processor, ProcessorBuilder, + ProcessorKind, FIELDS_NAME, FIELD_NAME, IGNORE_MISSING_NAME, PATTERN_NAME, }; -use crate::etl::value::{Array, Map, Value}; +use crate::etl::value::Value; pub(crate) const PROCESSOR_GSUB: &str = "gsub"; const REPLACEMENT_NAME: &str = "replacement"; -/// A processor to replace all matches of a pattern in string by a replacement, only support string value, and array string value #[derive(Debug, Default)] -pub struct GsubProcessor { +pub struct GsubProcessorBuilder { fields: Fields, pattern: Option, replacement: Option, ignore_missing: bool, } -impl GsubProcessor { - fn with_fields(&mut self, mut fields: Fields) { - update_one_one_output_keys(&mut fields); - self.fields = fields; +impl ProcessorBuilder for GsubProcessorBuilder { + fn output_keys(&self) -> HashSet<&str> { + self.fields + .iter() + .map(|f| f.target_or_input_field()) + .collect() } - fn with_ignore_missing(&mut self, ignore_missing: bool) { - self.ignore_missing = ignore_missing; + fn input_keys(&self) -> HashSet<&str> { + self.fields.iter().map(|f| f.input_field()).collect() } - fn try_pattern(&mut self, pattern: &str) -> Result<(), String> { - self.pattern = Some(Regex::new(pattern).map_err(|e| e.to_string())?); - Ok(()) - } - - fn with_replacement(&mut self, replacement: impl Into) { - self.replacement = Some(replacement.into()); + fn build(self, intermediate_keys: &[String]) -> Result { + self.build(intermediate_keys).map(ProcessorKind::Gsub) } +} +impl GsubProcessorBuilder { fn check(self) -> Result { if self.pattern.is_none() { return Err("pattern is required".to_string()); @@ -66,7 +64,49 @@ impl GsubProcessor { Ok(self) } - fn process_string_field(&self, val: &str, field: &Field) -> Result { + fn build(self, intermediate_keys: &[String]) -> Result { + let mut real_fields = vec![]; + for field in self.fields.into_iter() { + let input = OneInputOneOutputField::build( + "gsub", + intermediate_keys, + field.input_field(), + field.target_or_input_field(), + )?; + real_fields.push(input); + } + Ok(GsubProcessor { + fields: real_fields, + pattern: self.pattern, + replacement: self.replacement, + ignore_missing: self.ignore_missing, + }) + } +} + +/// A processor to replace all matches of a pattern in string by a replacement, only support string 
value, and array string value +#[derive(Debug, Default)] +pub struct GsubProcessor { + fields: Vec, + pattern: Option, + replacement: Option, + ignore_missing: bool, +} + +impl GsubProcessor { + fn check(self) -> Result { + if self.pattern.is_none() { + return Err("pattern is required".to_string()); + } + + if self.replacement.is_none() { + return Err("replacement is required".to_string()); + } + + Ok(self) + } + + fn process_string(&self, val: &str) -> Result { let replacement = self.replacement.as_ref().unwrap(); let new_val = self .pattern @@ -76,42 +116,28 @@ impl GsubProcessor { .to_string(); let val = Value::String(new_val); - let key = field.get_target_field(); - - Ok(Map::one(key, val)) + Ok(val) } - fn process_array_field(&self, arr: &Array, field: &Field) -> Result { - let key = field.get_target_field(); - - let re = self.pattern.as_ref().unwrap(); - let replacement = self.replacement.as_ref().unwrap(); - - let mut result = Array::default(); - for val in arr.iter() { - match val { - Value::String(haystack) => { - let new_val = re.replace_all(haystack, replacement).to_string(); - result.push(Value::String(new_val)); - } - _ => { - return Err(format!( - "{} processor: expect string or array string, but got {val:?}", - self.kind() - )) - } - } + fn process(&self, val: &Value) -> Result { + match val { + Value::String(val) => self.process_string(val), + _ => Err(format!( + "{} processor: expect string or array string, but got {val:?}", + self.kind() + )), } - - Ok(Map::one(key, Value::Array(result))) } } -impl TryFrom<&yaml_rust::yaml::Hash> for GsubProcessor { +impl TryFrom<&yaml_rust::yaml::Hash> for GsubProcessorBuilder { type Error = String; fn try_from(value: &yaml_rust::yaml::Hash) -> Result { - let mut processor = GsubProcessor::default(); + let mut fields = Fields::default(); + let mut ignore_missing = false; + let mut pattern = None; + let mut replacement = None; for (k, v) in value.iter() { let key = k @@ -119,27 +145,36 @@ impl TryFrom<&yaml_rust::yaml::Hash> for GsubProcessor { .ok_or(format!("key must be a string, but got {k:?}"))?; match key { FIELD_NAME => { - processor.with_fields(Fields::one(yaml_field(v, FIELD_NAME)?)); + fields = Fields::one(yaml_new_field(v, FIELD_NAME)?); } FIELDS_NAME => { - processor.with_fields(yaml_fields(v, FIELDS_NAME)?); + fields = yaml_new_fields(v, FIELDS_NAME)?; } PATTERN_NAME => { - processor.try_pattern(&yaml_string(v, PATTERN_NAME)?)?; + let pattern_str = yaml_string(v, PATTERN_NAME)?; + pattern = Some(Regex::new(&pattern_str).map_err(|e| e.to_string())?); } REPLACEMENT_NAME => { - processor.with_replacement(yaml_string(v, REPLACEMENT_NAME)?); + let replacement_str = yaml_string(v, REPLACEMENT_NAME)?; + replacement = Some(replacement_str); } IGNORE_MISSING_NAME => { - processor.with_ignore_missing(yaml_bool(v, IGNORE_MISSING_NAME)?); + ignore_missing = yaml_bool(v, IGNORE_MISSING_NAME)?; } _ => {} } } - processor.check() + let builder = GsubProcessorBuilder { + fields, + pattern, + replacement, + ignore_missing, + }; + + builder.check() } } @@ -152,56 +187,23 @@ impl crate::etl::processor::Processor for GsubProcessor { self.ignore_missing } - fn fields(&self) -> &Fields { - &self.fields - } - - fn fields_mut(&mut self) -> &mut Fields { - &mut self.fields - } - - fn output_keys(&self) -> HashSet { - self.fields - .iter() - .map(|f| f.get_target_field().to_string()) - .collect() - } - - fn exec_field(&self, val: &Value, field: &Field) -> Result { - match val { - Value::String(val) => self.process_string_field(val, field), - 
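// Illustrative usage of the new single-value path (hypothetical input,
// mirroring test_string_value further down; not part of the patch):
//
//   let p = GsubProcessor {
//       pattern: Some(regex::Regex::new(r"\d+").unwrap()),
//       replacement: Some("xxx".to_string()),
//       ..Default::default()
//   };
//   assert_eq!(p.process(&Value::String("a1b22".into())).unwrap(),
//              Value::String("axxxbxxx".into()));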
Value::Array(arr) => self.process_array_field(arr, field), - _ => Err(format!( - "{} processor: expect string or array string, but got {val:?}", - self.kind() - )), - } - } - fn exec_mut(&self, val: &mut Vec) -> Result<(), String> { for field in self.fields.iter() { - let index = field.input_field.index; + let index = field.input_index(); match val.get(index) { Some(Value::Null) | None => { if !self.ignore_missing { return Err(format!( "{} processor: missing field: {}", self.kind(), - field.get_field_name() + field.input_name() )); } } Some(v) => { - // TODO(qtang): Let this method use the intermediate state collection directly. - let mut map = self.exec_field(v, field)?; - field - .output_fields_index_mapping - .iter() - .for_each(|(k, output_index)| { - if let Some(v) = map.remove(k) { - val[*output_index] = v; - } - }); + let result = self.process(v)?; + let output_index = field.output_index(); + val[output_index] = result; } } } @@ -211,55 +213,20 @@ impl crate::etl::processor::Processor for GsubProcessor { #[cfg(test)] mod tests { - use crate::etl::field::Field; use crate::etl::processor::gsub::GsubProcessor; - use crate::etl::processor::Processor; - use crate::etl::value::{Map, Value}; + use crate::etl::value::Value; #[test] fn test_string_value() { - let mut processor = GsubProcessor::default(); - processor.try_pattern(r"\d+").unwrap(); - processor.with_replacement("xxx"); + let processor = GsubProcessor { + pattern: Some(regex::Regex::new(r"\d+").unwrap()), + replacement: Some("xxx".to_string()), + ..Default::default() + }; - let field = Field::new("message"); let val = Value::String("123".to_string()); - let result = processor.exec_field(&val, &field).unwrap(); + let result = processor.process(&val).unwrap(); - assert_eq!( - result, - Map::one("message", Value::String("xxx".to_string())) - ); - } - - #[test] - fn test_array_string_value() { - let mut processor = GsubProcessor::default(); - processor.try_pattern(r"\d+").unwrap(); - processor.with_replacement("xxx"); - - let field = Field::new("message"); - let val = Value::Array( - vec![ - Value::String("123".to_string()), - Value::String("456".to_string()), - ] - .into(), - ); - let result = processor.exec_field(&val, &field).unwrap(); - - assert_eq!( - result, - Map::one( - "message", - Value::Array( - vec![ - Value::String("xxx".to_string()), - Value::String("xxx".to_string()) - ] - .into() - ) - ) - ); + assert_eq!(result, Value::String("xxx".to_string())); } } diff --git a/src/pipeline/src/etl/processor/join.rs b/src/pipeline/src/etl/processor/join.rs index b1ab620b66..d4b309d5c2 100644 --- a/src/pipeline/src/etl/processor/join.rs +++ b/src/pipeline/src/etl/processor/join.rs @@ -14,40 +14,78 @@ use ahash::HashSet; -use crate::etl::field::{Field, Fields}; +use crate::etl::field::{Fields, OneInputOneOutputField}; use crate::etl::processor::{ - update_one_one_output_keys, yaml_bool, yaml_field, yaml_fields, yaml_string, Processor, - FIELDS_NAME, FIELD_NAME, IGNORE_MISSING_NAME, SEPARATOR_NAME, + yaml_bool, yaml_new_field, yaml_new_fields, yaml_string, Processor, ProcessorBuilder, + ProcessorKind, FIELDS_NAME, FIELD_NAME, IGNORE_MISSING_NAME, SEPARATOR_NAME, }; -use crate::etl::value::{Array, Map, Value}; +use crate::etl::value::{Array, Value}; pub(crate) const PROCESSOR_JOIN: &str = "join"; -/// A processor to join each element of an array into a single string using a separator string between each element #[derive(Debug, Default)] -pub struct JoinProcessor { +pub struct JoinProcessorBuilder { fields: Fields, separator: Option, 
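// Illustrative note (not part of the patch): JoinProcessor::process below
// concatenates the string forms of an array's elements with the configured
// separator, e.g. ["a", "b"] with separator "-" becomes "a-b", exactly as
// exercised by test_join_processor at the end of this file.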
ignore_missing: bool, } +impl ProcessorBuilder for JoinProcessorBuilder { + fn output_keys(&self) -> HashSet<&str> { + self.fields + .iter() + .map(|f| f.target_or_input_field()) + .collect() + } + + fn input_keys(&self) -> HashSet<&str> { + self.fields.iter().map(|f| f.input_field()).collect() + } + + fn build(self, intermediate_keys: &[String]) -> Result { + self.build(intermediate_keys).map(ProcessorKind::Join) + } +} + +impl JoinProcessorBuilder { + fn check(self) -> Result { + if self.separator.is_none() { + return Err("separator is required".to_string()); + } + + Ok(self) + } + + pub fn build(self, intermediate_keys: &[String]) -> Result { + let mut real_fields = vec![]; + for field in self.fields.into_iter() { + let input = OneInputOneOutputField::build( + "join", + intermediate_keys, + field.input_field(), + field.target_or_input_field(), + )?; + real_fields.push(input); + } + + Ok(JoinProcessor { + fields: real_fields, + separator: self.separator, + ignore_missing: self.ignore_missing, + }) + } +} + +/// A processor to join each element of an array into a single string using a separator string between each element +#[derive(Debug, Default)] +pub struct JoinProcessor { + fields: Vec, + separator: Option, + ignore_missing: bool, +} + impl JoinProcessor { - fn with_fields(&mut self, mut fields: Fields) { - update_one_one_output_keys(&mut fields); - self.fields = fields; - } - - fn with_separator(&mut self, separator: impl Into) { - self.separator = Some(separator.into()); - } - - fn with_ignore_missing(&mut self, ignore_missing: bool) { - self.ignore_missing = ignore_missing; - } - - fn process_field(&self, arr: &Array, field: &Field) -> Result { - let key = field.get_target_field(); - + fn process(&self, arr: &Array) -> Result { let sep = self.separator.as_ref().unwrap(); let val = arr .iter() @@ -55,7 +93,7 @@ impl JoinProcessor { .collect::>() .join(sep); - Ok(Map::one(key, Value::String(val))) + Ok(Value::String(val)) } fn check(self) -> Result { @@ -67,11 +105,13 @@ impl JoinProcessor { } } -impl TryFrom<&yaml_rust::yaml::Hash> for JoinProcessor { +impl TryFrom<&yaml_rust::yaml::Hash> for JoinProcessorBuilder { type Error = String; fn try_from(value: &yaml_rust::yaml::Hash) -> Result { - let mut processor = JoinProcessor::default(); + let mut fields = Fields::default(); + let mut separator = None; + let mut ignore_missing = false; for (k, v) in value.iter() { let key = k @@ -79,30 +119,31 @@ impl TryFrom<&yaml_rust::yaml::Hash> for JoinProcessor { .ok_or(format!("key must be a string, but got {k:?}"))?; match key { FIELD_NAME => { - processor.with_fields(Fields::one(yaml_field(v, FIELD_NAME)?)); + fields = Fields::one(yaml_new_field(v, FIELD_NAME)?); } FIELDS_NAME => { - processor.with_fields(yaml_fields(v, FIELDS_NAME)?); + fields = yaml_new_fields(v, FIELDS_NAME)?; } SEPARATOR_NAME => { - processor.with_separator(yaml_string(v, SEPARATOR_NAME)?); + separator = Some(yaml_string(v, SEPARATOR_NAME)?); } IGNORE_MISSING_NAME => { - processor.with_ignore_missing(yaml_bool(v, IGNORE_MISSING_NAME)?); + ignore_missing = yaml_bool(v, IGNORE_MISSING_NAME)?; } _ => {} } } - processor.check() + let builder = JoinProcessorBuilder { + fields, + separator, + ignore_missing, + }; + builder.check() } } impl Processor for JoinProcessor { - fn fields(&self) -> &Fields { - &self.fields - } - fn kind(&self) -> &str { PROCESSOR_JOIN } @@ -111,49 +152,21 @@ impl Processor for JoinProcessor { self.ignore_missing } - fn fields_mut(&mut self) -> &mut Fields { - &mut self.fields - } - - fn 
output_keys(&self) -> HashSet { - self.fields - .iter() - .map(|f| f.get_target_field().to_string()) - .collect() - } - - fn exec_field(&self, val: &Value, field: &Field) -> Result { - match val { - Value::Array(arr) => self.process_field(arr, field), - _ => Err(format!( - "{} processor: expect array value, but got {val:?}", - self.kind() - )), - } - } - fn exec_mut(&self, val: &mut Vec) -> Result<(), String> { for field in self.fields.iter() { - let index = field.input_field.index; + let index = field.input_index(); match val.get(index) { Some(Value::Array(arr)) => { - // TODO(qtang): Let this method use the intermediate state collection directly. - let mut map = self.process_field(arr, field)?; - field - .output_fields_index_mapping - .iter() - .for_each(|(k, output_index)| { - if let Some(v) = map.remove(k) { - val[*output_index] = v; - } - }); + let result = self.process(arr)?; + let output_index = field.output_index(); + val[output_index] = result; } Some(Value::Null) | None => { if !self.ignore_missing { return Err(format!( "{} processor: missing field: {}", self.kind(), - field.get_field_name() + field.input_name() )); } } @@ -173,25 +186,22 @@ impl Processor for JoinProcessor { #[cfg(test)] mod tests { - use crate::etl::field::Field; use crate::etl::processor::join::JoinProcessor; - use crate::etl::processor::Processor; - use crate::etl::value::{Map, Value}; + use crate::etl::value::Value; #[test] fn test_join_processor() { - let mut processor = JoinProcessor::default(); - processor.with_separator("-"); + let processor = JoinProcessor { + separator: Some("-".to_string()), + ..Default::default() + }; - let field = Field::new("test"); - let arr = Value::Array( - vec![ - Value::String("a".to_string()), - Value::String("b".to_string()), - ] - .into(), - ); - let result = processor.exec_field(&arr, &field).unwrap(); - assert_eq!(result, Map::one("test", Value::String("a-b".to_string()))); + let arr = vec![ + Value::String("a".to_string()), + Value::String("b".to_string()), + ] + .into(); + let result = processor.process(&arr).unwrap(); + assert_eq!(result, Value::String("a-b".to_string())); } } diff --git a/src/pipeline/src/etl/processor/letter.rs b/src/pipeline/src/etl/processor/letter.rs index 6502d6f221..f388b5a2a9 100644 --- a/src/pipeline/src/etl/processor/letter.rs +++ b/src/pipeline/src/etl/processor/letter.rs @@ -14,12 +14,12 @@ use ahash::HashSet; -use crate::etl::field::{Field, Fields}; +use crate::etl::field::{Fields, OneInputOneOutputField}; use crate::etl::processor::{ - update_one_one_output_keys, yaml_bool, yaml_field, yaml_fields, yaml_string, Processor, - FIELDS_NAME, FIELD_NAME, IGNORE_MISSING_NAME, METHOD_NAME, + yaml_bool, yaml_new_field, yaml_new_fields, yaml_string, Processor, ProcessorBuilder, + ProcessorKind, FIELDS_NAME, FIELD_NAME, IGNORE_MISSING_NAME, METHOD_NAME, }; -use crate::etl::value::{Map, Value}; +use crate::etl::value::Value; pub(crate) const PROCESSOR_LETTER: &str = "letter"; @@ -54,29 +54,61 @@ impl std::str::FromStr for Method { } } -/// only support string value #[derive(Debug, Default)] -pub struct LetterProcessor { +pub struct LetterProcessorBuilder { fields: Fields, method: Method, ignore_missing: bool, } +impl ProcessorBuilder for LetterProcessorBuilder { + fn output_keys(&self) -> HashSet<&str> { + self.fields + .iter() + .map(|f| f.target_or_input_field()) + .collect() + } + + fn input_keys(&self) -> HashSet<&str> { + self.fields.iter().map(|f| f.input_field()).collect() + } + + fn build(self, intermediate_keys: &[String]) -> Result { + 
self.build(intermediate_keys).map(ProcessorKind::Letter) + } +} + +impl LetterProcessorBuilder { + pub fn build(self, intermediate_keys: &[String]) -> Result { + let mut real_fields = vec![]; + for field in self.fields.into_iter() { + let input = OneInputOneOutputField::build( + "letter", + intermediate_keys, + field.input_field(), + field.target_or_input_field(), + )?; + real_fields.push(input); + } + + Ok(LetterProcessor { + fields: real_fields, + method: self.method, + ignore_missing: self.ignore_missing, + }) + } +} + +/// only support string value +#[derive(Debug, Default)] +pub struct LetterProcessor { + fields: Vec, + method: Method, + ignore_missing: bool, +} + impl LetterProcessor { - fn with_fields(&mut self, mut fields: Fields) { - update_one_one_output_keys(&mut fields); - self.fields = fields; - } - - fn with_method(&mut self, method: Method) { - self.method = method; - } - - fn with_ignore_missing(&mut self, ignore_missing: bool) { - self.ignore_missing = ignore_missing; - } - - fn process_field(&self, val: &str, field: &Field) -> Result { + fn process_field(&self, val: &str) -> Result { let processed = match self.method { Method::Upper => val.to_uppercase(), Method::Lower => val.to_lowercase(), @@ -84,17 +116,17 @@ impl LetterProcessor { }; let val = Value::String(processed); - let key = field.get_target_field(); - - Ok(Map::one(key, val)) + Ok(val) } } -impl TryFrom<&yaml_rust::yaml::Hash> for LetterProcessor { +impl TryFrom<&yaml_rust::yaml::Hash> for LetterProcessorBuilder { type Error = String; fn try_from(value: &yaml_rust::yaml::Hash) -> Result { - let mut processor = LetterProcessor::default(); + let mut fields = Fields::default(); + let mut method = Method::Lower; + let mut ignore_missing = false; for (k, v) in value.iter() { let key = k @@ -102,23 +134,26 @@ impl TryFrom<&yaml_rust::yaml::Hash> for LetterProcessor { .ok_or(format!("key must be a string, but got {k:?}"))?; match key { FIELD_NAME => { - processor.with_fields(Fields::one(yaml_field(v, FIELD_NAME)?)); + fields = Fields::one(yaml_new_field(v, FIELD_NAME)?); } FIELDS_NAME => { - processor.with_fields(yaml_fields(v, FIELDS_NAME)?); + fields = yaml_new_fields(v, FIELDS_NAME)?; } METHOD_NAME => { - let method = yaml_string(v, METHOD_NAME)?; - processor.with_method(method.parse()?); + method = yaml_string(v, METHOD_NAME)?.parse()?; } IGNORE_MISSING_NAME => { - processor.with_ignore_missing(yaml_bool(v, IGNORE_MISSING_NAME)?); + ignore_missing = yaml_bool(v, IGNORE_MISSING_NAME)?; } _ => {} } } - Ok(processor) + Ok(LetterProcessorBuilder { + fields, + method, + ignore_missing, + }) } } @@ -131,53 +166,21 @@ impl Processor for LetterProcessor { self.ignore_missing } - fn fields(&self) -> &Fields { - &self.fields - } - - fn fields_mut(&mut self) -> &mut Fields { - &mut self.fields - } - - fn output_keys(&self) -> HashSet { - self.fields - .iter() - .map(|f| f.get_target_field().to_string()) - .collect() - } - - fn exec_field(&self, val: &Value, field: &Field) -> Result { - match val { - Value::String(val) => self.process_field(val, field), - _ => Err(format!( - "{} processor: expect string value, but got {val:?}", - self.kind() - )), - } - } - fn exec_mut(&self, val: &mut Vec) -> Result<(), String> { for field in self.fields.iter() { - let index = field.input_field.index; + let index = field.input_index(); match val.get(index) { Some(Value::String(s)) => { - // TODO(qtang): Let this method use the intermediate state collection directly. 
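// Illustrative behaviour of process_field (hypothetical inputs, mirroring
// the rewritten tests at the bottom of this file; not part of the patch):
//
//   Method::Upper:   "pipeline" -> "PIPELINE"
//   Method::Lower:   "Pipeline" -> "pipeline"
//   Method::Capital: "pipeline" -> "Pipeline"  (via the capitalize helper)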
- let mut processed = self.process_field(s, field)?; - field - .output_fields_index_mapping - .iter() - .for_each(|(k, output_index)| { - if let Some(v) = processed.remove(k) { - val[*output_index] = v; - } - }); + let result = self.process_field(s)?; + let (_, output_index) = field.output(); + val[*output_index] = result; } Some(Value::Null) | None => { if !self.ignore_missing { return Err(format!( "{} processor: missing field: {}", self.kind(), - field.get_field_name() + &field.input().name )); } } @@ -204,33 +207,36 @@ fn capitalize(s: &str) -> String { #[cfg(test)] mod tests { - use crate::etl::field::Fields; use crate::etl::processor::letter::{LetterProcessor, Method}; - use crate::etl::value::{Map, Value}; + use crate::etl::value::Value; #[test] fn test_process() { - let field = "letter"; - let ff: crate::etl::processor::Field = field.parse().unwrap(); - let mut processor = LetterProcessor::default(); - processor.with_fields(Fields::one(ff.clone())); - { - processor.with_method(Method::Upper); - let processed = processor.process_field("pipeline", &ff).unwrap(); - assert_eq!(Map::one(field, Value::String("PIPELINE".into())), processed) + let processor = LetterProcessor { + method: Method::Upper, + ..Default::default() + }; + let processed = processor.process_field("pipeline").unwrap(); + assert_eq!(Value::String("PIPELINE".into()), processed) } { - processor.with_method(Method::Lower); - let processed = processor.process_field("Pipeline", &ff).unwrap(); - assert_eq!(Map::one(field, Value::String("pipeline".into())), processed) + let processor = LetterProcessor { + method: Method::Lower, + ..Default::default() + }; + let processed = processor.process_field("Pipeline").unwrap(); + assert_eq!(Value::String("pipeline".into()), processed) } { - processor.with_method(Method::Capital); - let processed = processor.process_field("pipeline", &ff).unwrap(); - assert_eq!(Map::one(field, Value::String("Pipeline".into())), processed) + let processor = LetterProcessor { + method: Method::Capital, + ..Default::default() + }; + let processed = processor.process_field("pipeline").unwrap(); + assert_eq!(Value::String("Pipeline".into()), processed) } } } diff --git a/src/pipeline/src/etl/processor/regex.rs b/src/pipeline/src/etl/processor/regex.rs index e5af339585..a1de2ea76d 100644 --- a/src/pipeline/src/etl/processor/regex.rs +++ b/src/pipeline/src/etl/processor/regex.rs @@ -18,16 +18,17 @@ const PATTERNS_NAME: &str = "patterns"; pub(crate) const PROCESSOR_REGEX: &str = "regex"; -use ahash::HashSet; +use ahash::{HashSet, HashSetExt}; use lazy_static::lazy_static; use regex::Regex; -use crate::etl::field::Fields; +use crate::etl::field::{Fields, InputFieldInfo, OneInputMultiOutputField}; +use crate::etl::find_key_index; use crate::etl::processor::{ - yaml_bool, yaml_field, yaml_fields, yaml_string, yaml_strings, Field, Processor, FIELDS_NAME, - FIELD_NAME, IGNORE_MISSING_NAME, PATTERN_NAME, + yaml_bool, yaml_new_field, yaml_new_fields, yaml_string, yaml_strings, Processor, + ProcessorBuilder, ProcessorKind, FIELDS_NAME, FIELD_NAME, IGNORE_MISSING_NAME, PATTERN_NAME, }; -use crate::etl::value::{Map, Value}; +use crate::etl::value::Value; lazy_static! 
{ static ref GROUPS_NAME_REGEX: Regex = Regex::new(r"\(\?P?<([[:word:]]+)>.+?\)").unwrap(); @@ -40,6 +41,10 @@ fn get_regex_group_names(s: &str) -> Vec { .collect() } +fn generate_key(prefix: &str, group: &str) -> String { + format!("{prefix}_{group}") +} + #[derive(Debug)] struct GroupRegex { origin: String, @@ -72,34 +77,29 @@ impl std::str::FromStr for GroupRegex { } } -/// only support string value -/// if no value found from a pattern, the target_field will be ignored #[derive(Debug, Default)] -pub struct RegexProcessor { +pub struct RegexProcessorBuilder { fields: Fields, patterns: Vec, ignore_missing: bool, + output_keys: HashSet, } -impl RegexProcessor { - fn with_fields(&mut self, fields: Fields) { - self.fields = fields; +impl ProcessorBuilder for RegexProcessorBuilder { + fn output_keys(&self) -> HashSet<&str> { + self.output_keys.iter().map(|k| k.as_str()).collect() } - fn try_with_patterns(&mut self, patterns: Vec) -> Result<(), String> { - let mut rs = vec![]; - for pattern in patterns { - let gr = pattern.parse()?; - rs.push(gr); - } - self.patterns = rs; - Ok(()) + fn input_keys(&self) -> HashSet<&str> { + self.fields.iter().map(|f| f.input_field()).collect() } - fn with_ignore_missing(&mut self, ignore_missing: bool) { - self.ignore_missing = ignore_missing; + fn build(self, intermediate_keys: &[String]) -> Result { + self.build(intermediate_keys).map(ProcessorKind::Regex) } +} +impl RegexProcessorBuilder { fn check(self) -> Result { if self.fields.is_empty() { return Err(format!( @@ -118,47 +118,78 @@ impl RegexProcessor { Ok(self) } - fn generate_key(prefix: &str, group: &str) -> String { - format!("{prefix}_{group}") + fn build_group_output_info( + group_regex: &GroupRegex, + om_field: &OneInputMultiOutputField, + intermediate_keys: &[String], + ) -> Result, String> { + group_regex + .groups + .iter() + .map(|g| { + let key = generate_key(om_field.target_prefix(), g); + let index = find_key_index(intermediate_keys, &key, "regex"); + index.map(|index| OutPutInfo { + final_key: key, + group_name: g.to_string(), + index, + }) + }) + .collect::, String>>() } - fn process_field(&self, val: &str, field: &Field, gr: &GroupRegex) -> Result { - let mut map = Map::default(); - - if let Some(captures) = gr.regex.captures(val) { - for group in &gr.groups { - if let Some(capture) = captures.name(group) { - let value = capture.as_str().to_string(); - let prefix = field.get_target_field(); - - let key = Self::generate_key(prefix, group); - - map.insert(key, Value::String(value)); - } - } - } - - Ok(map) + fn build_group_output_infos( + patterns: &[GroupRegex], + om_field: &OneInputMultiOutputField, + intermediate_keys: &[String], + ) -> Result>, String> { + patterns + .iter() + .map(|group_regex| { + Self::build_group_output_info(group_regex, om_field, intermediate_keys) + }) + .collect::, String>>() } - fn update_output_keys(&mut self) { - for field in self.fields.iter_mut() { - for gr in &self.patterns { - for group in &gr.groups { - field - .output_fields_index_mapping - .insert(Self::generate_key(field.get_target_field(), group), 0_usize); - } - } + fn build_output_info( + real_fields: &[OneInputMultiOutputField], + patterns: &[GroupRegex], + intermediate_keys: &[String], + ) -> Result { + let inner = real_fields + .iter() + .map(|om_field| Self::build_group_output_infos(patterns, om_field, intermediate_keys)) + .collect::, String>>(); + inner.map(|inner| RegexProcessorOutputInfo { inner }) + } + + fn build(self, intermediate_keys: &[String]) -> Result { + let mut real_fields = 
vec![]; + for field in self.fields.into_iter() { + let input_index = find_key_index(intermediate_keys, field.input_field(), "regex")?; + let input_field_info = InputFieldInfo::new(field.input_field(), input_index); + + let input = OneInputMultiOutputField::new(input_field_info, field.target_field); + real_fields.push(input); } + let output_info = Self::build_output_info(&real_fields, &self.patterns, intermediate_keys)?; + Ok(RegexProcessor { + // fields: Fields::one(Field::new("test".to_string())), + fields: real_fields, + patterns: self.patterns, + output_info, + ignore_missing: self.ignore_missing, + }) } } -impl TryFrom<&yaml_rust::yaml::Hash> for RegexProcessor { +impl TryFrom<&yaml_rust::yaml::Hash> for RegexProcessorBuilder { type Error = String; fn try_from(value: &yaml_rust::yaml::Hash) -> Result { - let mut processor = RegexProcessor::default(); + let mut fields = Fields::default(); + let mut patterns: Vec = vec![]; + let mut ignore_missing = false; for (k, v) in value.iter() { let key = k @@ -166,28 +197,113 @@ impl TryFrom<&yaml_rust::yaml::Hash> for RegexProcessor { .ok_or(format!("key must be a string, but got {k:?}"))?; match key { FIELD_NAME => { - processor.with_fields(Fields::one(yaml_field(v, FIELD_NAME)?)); + fields = Fields::one(yaml_new_field(v, FIELD_NAME)?); } FIELDS_NAME => { - processor.with_fields(yaml_fields(v, FIELDS_NAME)?); + fields = yaml_new_fields(v, FIELDS_NAME)?; } PATTERN_NAME => { - processor.try_with_patterns(vec![yaml_string(v, PATTERN_NAME)?])?; + let pattern = yaml_string(v, PATTERN_NAME)?; + let gr = pattern.parse()?; + patterns.push(gr); } PATTERNS_NAME => { - processor.try_with_patterns(yaml_strings(v, PATTERNS_NAME)?)?; + for pattern in yaml_strings(v, PATTERNS_NAME)? { + let gr = pattern.parse()?; + patterns.push(gr); + } } IGNORE_MISSING_NAME => { - processor.with_ignore_missing(yaml_bool(v, IGNORE_MISSING_NAME)?); + ignore_missing = yaml_bool(v, IGNORE_MISSING_NAME)?; } _ => {} } } - processor.check().map(|mut p| { - p.update_output_keys(); - p - }) + let pattern_output_keys = patterns + .iter() + .flat_map(|pattern| pattern.groups.iter()) + .collect::>(); + let mut output_keys = HashSet::new(); + for field in fields.iter() { + for x in pattern_output_keys.iter() { + output_keys.insert(generate_key(field.target_or_input_field(), x)); + } + } + + let processor_builder = RegexProcessorBuilder { + fields, + patterns, + ignore_missing, + output_keys, + }; + + processor_builder.check() + } +} + +#[derive(Debug, Default)] +struct OutPutInfo { + final_key: String, + group_name: String, + index: usize, +} + +#[derive(Debug, Default)] +struct RegexProcessorOutputInfo { + pub inner: Vec>>, +} + +impl RegexProcessorOutputInfo { + fn get_output_index( + &self, + field_index: usize, + pattern_index: usize, + group_index: usize, + ) -> usize { + self.inner[field_index][pattern_index][group_index].index + } +} +/// only support string value +/// if no value found from a pattern, the target_field will be ignored +#[derive(Debug, Default)] +pub struct RegexProcessor { + fields: Vec, + output_info: RegexProcessorOutputInfo, + patterns: Vec, + ignore_missing: bool, +} + +impl RegexProcessor { + fn try_with_patterns(&mut self, patterns: Vec) -> Result<(), String> { + let mut rs = vec![]; + for pattern in patterns { + let gr = pattern.parse()?; + rs.push(gr); + } + self.patterns = rs; + Ok(()) + } + + fn process( + &self, + val: &str, + gr: &GroupRegex, + index: (usize, usize), + ) -> Result, String> { + let mut result = Vec::new(); + if let Some(captures) = 
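// Illustrative note (not part of the patch): output positions are
// precomputed per (field, pattern, group), so the lookup just below is a
// plain triple index, self.output_info.inner[field][pattern][group].index.
// The intermediate keys follow generate_key, e.g. a field whose target
// prefix is "breadcrumbs" and a named group "parent" write to the key
// "breadcrumbs_parent".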
gr.regex.captures(val) { + for (group_index, group) in gr.groups.iter().enumerate() { + if let Some(capture) = captures.name(group) { + let value = capture.as_str().to_string(); + let index = self + .output_info + .get_output_index(index.0, index.1, group_index); + result.push((index, Value::String(value))); + } + } + } + Ok(result) } } @@ -200,71 +316,40 @@ impl Processor for RegexProcessor { self.ignore_missing } - fn fields(&self) -> &Fields { - &self.fields - } - - fn fields_mut(&mut self) -> &mut Fields { - &mut self.fields - } - - fn output_keys(&self) -> HashSet { - self.fields - .iter() - .flat_map(|f| { - self.patterns.iter().flat_map(move |p| { - p.groups - .iter() - .map(move |g| Self::generate_key(&f.input_field.name, g)) - }) - }) - .collect() - } - - fn exec_field(&self, val: &Value, field: &Field) -> Result { - match val { - Value::String(val) => { - let mut map = Map::default(); - for gr in &self.patterns { - let m = self.process_field(val, field, gr)?; - map.extend(m); - } - Ok(map) - } - _ => Err(format!( - "{} processor: expect string value, but got {val:?}", - self.kind() - )), - } - } - fn exec_mut(&self, val: &mut Vec) -> Result<(), String> { - for field in self.fields.iter() { - let index = field.input_field.index; + for (field_index, field) in self.fields.iter().enumerate() { + let index = field.input_index(); + let mut result_list = None; match val.get(index) { Some(Value::String(s)) => { - let mut map = Map::default(); - for gr in &self.patterns { - // TODO(qtang): Let this method use the intermediate state collection directly. - let m = self.process_field(s, field, gr)?; - map.extend(m); - } - - field - .output_fields_index_mapping - .iter() - .for_each(|(k, output_index)| { - if let Some(v) = map.remove(k) { - val[*output_index] = v; + // we get rust borrow checker error here + // for (gr_index, gr) in self.patterns.iter().enumerate() { + // let result_list = self.process(s.as_str(), gr, (field_index, gr_index))?; + // for (output_index, result) in result_list { + //cannot borrow `*val` as mutable because it is also borrowed as immutable mutable borrow occurs here + // val[output_index] = result; + // } + // } + for (gr_index, gr) in self.patterns.iter().enumerate() { + let result = self.process(s.as_str(), gr, (field_index, gr_index))?; + if !result.is_empty() { + match result_list.as_mut() { + None => { + result_list = Some(result); + } + Some(result_list) => { + result_list.extend(result); + } } - }); + } + } } Some(Value::Null) | None => { if !self.ignore_missing { return Err(format!( "{} processor: missing field: {}", self.kind(), - field.get_field_name() + field.input_name() )); } } @@ -275,6 +360,15 @@ impl Processor for RegexProcessor { )); } } + // safety here + match result_list { + None => {} + Some(result_list) => { + for (output_index, result) in result_list { + val[output_index] = result; + } + } + } } Ok(()) @@ -282,37 +376,42 @@ impl Processor for RegexProcessor { } #[cfg(test)] mod tests { + use ahash::{HashMap, HashMapExt}; use itertools::Itertools; - use super::RegexProcessor; - use crate::etl::field::Fields; - use crate::etl::processor::Processor; + use crate::etl::processor::regex::RegexProcessorBuilder; use crate::etl::value::{Map, Value}; #[test] fn test_simple_parse() { - let mut processor = RegexProcessor::default(); + let pipeline_str = r#"fields: ["a"] +patterns: ['(?\d)'] +ignore_missing: false"#; + + let processor_yaml = yaml_rust::YamlLoader::load_from_str(pipeline_str) + .unwrap() + .pop() + .unwrap(); + let processor_yaml_hash = 
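// Aside (illustrative, not part of the patch): the buffered result_list in
// exec_mut above exists because `s` borrows `val` immutably through
// val.get(index), so assigning val[output_index] inside the same loop would
// require a second, mutable borrow, as the commented-out code notes. The
// generic shape of the workaround:
//
//   let pending: Vec<(usize, Value)> = /* compute while borrowing &val */;
//   // the immutable borrow ends here
//   for (idx, v) in pending { val[idx] = v; }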
processor_yaml.as_hash().unwrap(); + let builder = RegexProcessorBuilder::try_from(processor_yaml_hash).unwrap(); + let intermediate_keys = ["a".to_string(), "a_ar".to_string()]; + let processor = builder.build(&intermediate_keys).unwrap(); // single field (with prefix), multiple patterns - let f = ["a"].iter().map(|f| f.parse().unwrap()).collect(); - processor.with_fields(Fields::new(f).unwrap()); - let ar = "(?\\d)"; + let result = processor + .process("123", &processor.patterns[0], (0, 0)) + .unwrap() + .into_iter() + .map(|(k, v)| (intermediate_keys[k].clone(), v)) + .collect(); - let patterns = [ar].iter().map(|p| p.to_string()).collect(); - processor.try_with_patterns(patterns).unwrap(); - - let mut map = Map::default(); - map.insert("a", Value::String("123".to_string())); - processor.exec_map(&mut map).unwrap(); + let map = Map { values: result }; let v = Map { - values: vec![ - ("a_ar".to_string(), Value::String("1".to_string())), - ("a".to_string(), Value::String("123".to_string())), - ] - .into_iter() - .collect(), + values: vec![("a_ar".to_string(), Value::String("1".to_string()))] + .into_iter() + .collect(), }; assert_eq!(v, map); @@ -320,17 +419,14 @@ mod tests { #[test] fn test_process() { - let mut processor = RegexProcessor::default(); - let cc = "[c=c,n=US_CA_SANJOSE,o=55155]"; let cg = "[a=12.34.567.89,b=12345678,c=g,n=US_CA_SANJOSE,o=20940]"; let co = "[a=987.654.321.09,c=o]"; let cp = "[c=p,n=US_CA_SANJOSE,o=55155]"; let cw = "[c=w,n=US_CA_SANJOSE,o=55155]"; - let breadcrumbs = Value::String([cc, cg, co, cp, cw].iter().join(",")); + let breadcrumbs_str = [cc, cg, co, cp, cw].iter().join(","); let values = [ - ("breadcrumbs", breadcrumbs.clone()), ("breadcrumbs_parent", Value::String(cc.to_string())), ("breadcrumbs_edge", Value::String(cg.to_string())), ("breadcrumbs_origin", Value::String(co.to_string())), @@ -340,61 +436,141 @@ mod tests { .into_iter() .map(|(k, v)| (k.to_string(), v)) .collect(); - let mut temporary_map = Map { values }; + let temporary_map = Map { values }; { // single field (with prefix), multiple patterns - let ff = ["breadcrumbs, breadcrumbs"] - .iter() - .map(|f| f.parse().unwrap()) - .collect(); - processor.with_fields(Fields::new(ff).unwrap()); - let ccr = "(?\\[[^\\[]*c=c[^\\]]*\\])"; - let cgr = "(?\\[[^\\[]*c=g[^\\]]*\\])"; - let cor = "(?\\[[^\\[]*c=o[^\\]]*\\])"; - let cpr = "(?\\[[^\\[]*c=p[^\\]]*\\])"; - let cwr = "(?\\[[^\\[]*c=w[^\\]]*\\])"; - let patterns = [ccr, cgr, cor, cpr, cwr] - .iter() - .map(|p| p.to_string()) - .collect(); - processor.try_with_patterns(patterns).unwrap(); + let pipeline_str = r#"fields: ["breadcrumbs"] +patterns: + - '(?\[[^\[]*c=c[^\]]*\])' + - '(?\[[^\[]*c=g[^\]]*\])' + - '(?\[[^\[]*c=o[^\]]*\])' + - '(?\[[^\[]*c=p[^\]]*\])' + - '(?\[[^\[]*c=w[^\]]*\])' +ignore_missing: false"#; - let mut map = Map::default(); - map.insert("breadcrumbs", breadcrumbs.clone()); - processor.exec_map(&mut map).unwrap(); - - assert_eq!(map, temporary_map); + let processor_yaml = yaml_rust::YamlLoader::load_from_str(pipeline_str) + .unwrap() + .pop() + .unwrap(); + let processor_yaml_hash = processor_yaml.as_hash().unwrap(); + let builder = RegexProcessorBuilder::try_from(processor_yaml_hash).unwrap(); + let intermediate_keys = [ + "breadcrumbs", + "breadcrumbs_parent", + "breadcrumbs_edge", + "breadcrumbs_origin", + "breadcrumbs_peer", + "breadcrumbs_wrapper", + ] + .iter() + .map(|k| k.to_string()) + .collect_vec(); + let processor = builder.build(&intermediate_keys).unwrap(); + let mut result = HashMap::new(); + for (index, 
pattern) in processor.patterns.iter().enumerate() { + let r = processor + .process(&breadcrumbs_str, pattern, (0, index)) + .unwrap() + .into_iter() + .map(|(k, v)| (intermediate_keys[k].clone(), v)) + .collect::>(); + result.extend(r); + } + let map = Map { values: result }; + assert_eq!(temporary_map, map); } { // multiple fields (with prefix), multiple patterns - let ff = [ - "breadcrumbs_parent, parent", - "breadcrumbs_edge, edge", - "breadcrumbs_origin, origin", - "breadcrumbs_peer, peer", - "breadcrumbs_wrapper, wrapper", - ] - .iter() - .map(|f| f.parse().unwrap()) - .collect(); - processor.with_fields(Fields::new(ff).unwrap()); - let patterns = [ - "a=(?[^,\\]]+)", - "b=(?[^,\\]]+)", - "k=(?[^,\\]]+)", - "l=(?[^,\\]]+)", - "m=(?[^,\\]]+)", - "n=(?[^,\\]]+)", - "o=(?[^,\\]]+)", + let pipeline_str = r#"fields: + - breadcrumbs_parent, parent + - breadcrumbs_edge, edge + - breadcrumbs_origin, origin + - breadcrumbs_peer, peer + - breadcrumbs_wrapper, wrapper +patterns: + - 'a=(?[^,\]]+)' + - 'b=(?[^,\]]+)' + - 'k=(?[^,\]]+)' + - 'l=(?[^,\]]+)' + - 'm=(?[^,\]]+)' + - 'n=(?[^,\]]+)' + - 'o=(?[^,\]]+)' +ignore_missing: false"#; + + let processor_yaml = yaml_rust::YamlLoader::load_from_str(pipeline_str) + .unwrap() + .pop() + .unwrap(); + let processor_yaml_hash = processor_yaml.as_hash().unwrap(); + let builder = RegexProcessorBuilder::try_from(processor_yaml_hash).unwrap(); + + let intermediate_keys = [ + "breadcrumbs_parent", + "breadcrumbs_edge", + "breadcrumbs_origin", + "breadcrumbs_peer", + "breadcrumbs_wrapper", + "edge_ip", + "edge_request_id", + "edge_request_end_time", + "edge_turn_around_time", + "edge_dns_lookup_time", + "edge_geo", + "edge_asn", + "origin_ip", + "origin_request_id", + "origin_request_end_time", + "origin_turn_around_time", + "origin_dns_lookup_time", + "origin_geo", + "origin_asn", + "peer_ip", + "peer_request_id", + "peer_request_end_time", + "peer_turn_around_time", + "peer_dns_lookup_time", + "peer_geo", + "peer_asn", + "parent_ip", + "parent_request_id", + "parent_request_end_time", + "parent_turn_around_time", + "parent_dns_lookup_time", + "parent_geo", + "parent_asn", + "wrapper_ip", + "wrapper_request_id", + "wrapper_request_end_time", + "wrapper_turn_around_time", + "wrapper_dns_lookup_time", + "wrapper_geo", + "wrapper_asn", ] .iter() - .map(|p| p.to_string()) - .collect(); - processor.try_with_patterns(patterns).unwrap(); + .map(|k| k.to_string()) + .collect_vec(); + let processor = builder.build(&intermediate_keys).unwrap(); + + let mut result = HashMap::new(); + for (field_index, field) in processor.fields.iter().enumerate() { + for (pattern_index, pattern) in processor.patterns.iter().enumerate() { + let s = temporary_map + .get(field.input_name()) + .unwrap() + .to_str_value(); + let r = processor + .process(&s, pattern, (field_index, pattern_index)) + .unwrap() + .into_iter() + .map(|(k, v)| (intermediate_keys[k].clone(), v)) + .collect::>(); + result.extend(r); + } + } let new_values = vec![ ("edge_ip", Value::String("12.34.567.89".to_string())), @@ -413,11 +589,7 @@ mod tests { .map(|(k, v)| (k.to_string(), v)) .collect(); - let mut expected_map = temporary_map.clone(); - processor.exec_map(&mut temporary_map).unwrap(); - expected_map.extend(Map { values: new_values }); - - assert_eq!(expected_map, temporary_map); + assert_eq!(result, new_values); } } } diff --git a/src/pipeline/src/etl/processor/timestamp.rs b/src/pipeline/src/etl/processor/timestamp.rs index 1be9177a8a..7ab9571101 100644 --- a/src/pipeline/src/etl/processor/timestamp.rs +++ 
b/src/pipeline/src/etl/processor/timestamp.rs @@ -19,18 +19,17 @@ use chrono::{DateTime, NaiveDateTime}; use chrono_tz::Tz; use lazy_static::lazy_static; -use super::yaml_strings; -use crate::etl::field::{Field, Fields}; +use crate::etl::field::{Fields, OneInputOneOutputField}; use crate::etl::processor::{ - update_one_one_output_keys, yaml_bool, yaml_field, yaml_fields, yaml_string, Processor, - FIELDS_NAME, FIELD_NAME, IGNORE_MISSING_NAME, + yaml_bool, yaml_new_field, yaml_new_fields, yaml_string, yaml_strings, Processor, + ProcessorBuilder, ProcessorKind, FIELDS_NAME, FIELD_NAME, IGNORE_MISSING_NAME, }; use crate::etl::value::time::{ MICROSECOND_RESOLUTION, MICRO_RESOLUTION, MILLISECOND_RESOLUTION, MILLI_RESOLUTION, MS_RESOLUTION, NANOSECOND_RESOLUTION, NANO_RESOLUTION, NS_RESOLUTION, SECOND_RESOLUTION, SEC_RESOLUTION, S_RESOLUTION, US_RESOLUTION, }; -use crate::etl::value::{Map, Timestamp, Value}; +use crate::etl::value::{Timestamp, Value}; pub(crate) const PROCESSOR_TIMESTAMP: &str = "timestamp"; const RESOLUTION_NAME: &str = "resolution"; @@ -108,10 +107,56 @@ impl std::ops::Deref for Formats { } } +#[derive(Debug)] +pub struct TimestampProcessorBuilder { + fields: Fields, + formats: Formats, + resolution: Resolution, + ignore_missing: bool, +} + +impl ProcessorBuilder for TimestampProcessorBuilder { + fn output_keys(&self) -> HashSet<&str> { + self.fields + .iter() + .map(|f| f.target_or_input_field()) + .collect() + } + + fn input_keys(&self) -> HashSet<&str> { + self.fields.iter().map(|f| f.input_field()).collect() + } + + fn build(self, intermediate_keys: &[String]) -> Result { + self.build(intermediate_keys).map(ProcessorKind::Timestamp) + } +} + +impl TimestampProcessorBuilder { + pub fn build(self, intermediate_keys: &[String]) -> Result { + let mut real_fields = vec![]; + for field in self.fields.into_iter() { + let input = OneInputOneOutputField::build( + "timestamp", + intermediate_keys, + field.input_field(), + field.target_or_input_field(), + )?; + real_fields.push(input); + } + Ok(TimestampProcessor { + fields: real_fields, + formats: self.formats, + resolution: self.resolution, + ignore_missing: self.ignore_missing, + }) + } +} + /// support string, integer, float, time, epoch #[derive(Debug, Default)] pub struct TimestampProcessor { - fields: Fields, + fields: Vec, formats: Formats, resolution: Resolution, ignore_missing: bool, @@ -123,29 +168,6 @@ pub struct TimestampProcessor { } impl TimestampProcessor { - fn with_fields(&mut self, mut fields: Fields) { - update_one_one_output_keys(&mut fields); - self.fields = fields - } - - fn with_resolution(&mut self, resolution: Resolution) { - self.resolution = resolution; - } - - fn with_formats(&mut self, v: Option, Tz)>>) { - let v = match v { - Some(v) if !v.is_empty() => v, - _ => DEFAULT_FORMATS.clone(), - }; - - let formats = Formats::new(v); - self.formats = formats; - } - - fn with_ignore_missing(&mut self, ignore_missing: bool) { - self.ignore_missing = ignore_missing; - } - /// try to parse val with timezone first, if failed, parse without timezone fn try_parse(val: &str, fmt: &str, tz: Tz) -> Result { if let Ok(dt) = DateTime::parse_from_str(val, fmt) { @@ -212,12 +234,6 @@ impl TimestampProcessor { Resolution::Nano => Ok(Timestamp::Nanosecond(t)), } } - - fn process_field(&self, val: &Value, field: &Field) -> Result { - let key = field.get_target_field(); - - Ok(Map::one(key, Value::Timestamp(self.parse(val)?))) - } } fn parse_formats(yaml: &yaml_rust::yaml::Yaml) -> Result, Tz)>, String> { @@ -250,11 +266,14 @@ 
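// Illustrative note (not part of the patch): try_parse above prefers
// DateTime::parse_from_str, which only succeeds when the format carries an
// offset (e.g. "%Y-%m-%d %H:%M:%S%z"); otherwise it falls back to
// NaiveDateTime::parse_from_str and resolves the naive timestamp in the
// format's associated Tz. A hedged chrono sketch of that shape:
//
//   if let Ok(dt) = DateTime::parse_from_str(val, fmt) {
//       Ok(dt.timestamp_nanos())               // offset parsed from the input
//   } else {
//       NaiveDateTime::parse_from_str(val, fmt)
//           .map_err(|e| e.to_string())
//           .and_then(|naive| tz.from_local_datetime(&naive)
//               .single()
//               .map(|dt| dt.timestamp_nanos())
//               .ok_or_else(|| "ambiguous local time".to_string()))
//   }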
fn parse_formats(yaml: &yaml_rust::yaml::Yaml) -> Result, Tz)>, }; } -impl TryFrom<&yaml_rust::yaml::Hash> for TimestampProcessor { +impl TryFrom<&yaml_rust::yaml::Hash> for TimestampProcessorBuilder { type Error = String; fn try_from(hash: &yaml_rust::yaml::Hash) -> Result { - let mut processor = TimestampProcessor::default(); + let mut fields = Fields::default(); + let mut formats = Formats::default(); + let mut resolution = Resolution::default(); + let mut ignore_missing = false; for (k, v) in hash { let key = k @@ -263,28 +282,33 @@ impl TryFrom<&yaml_rust::yaml::Hash> for TimestampProcessor { match key { FIELD_NAME => { - processor.with_fields(Fields::one(yaml_field(v, FIELD_NAME)?)); + fields = Fields::one(yaml_new_field(v, FIELD_NAME)?); } FIELDS_NAME => { - processor.with_fields(yaml_fields(v, FIELDS_NAME)?); + fields = yaml_new_fields(v, FIELDS_NAME)?; } FORMATS_NAME => { - let formats = parse_formats(v)?; - processor.with_formats(Some(formats)); + let formats_vec = parse_formats(v)?; + formats = Formats::new(formats_vec); } RESOLUTION_NAME => { - let s = yaml_string(v, RESOLUTION_NAME)?.as_str().try_into()?; - processor.with_resolution(s); + resolution = yaml_string(v, RESOLUTION_NAME)?.as_str().try_into()?; } IGNORE_MISSING_NAME => { - processor.with_ignore_missing(yaml_bool(v, IGNORE_MISSING_NAME)?); + ignore_missing = yaml_bool(v, IGNORE_MISSING_NAME)?; } - _ => {} } } - Ok(processor) + let processor_builder = TimestampProcessorBuilder { + fields, + formats, + resolution, + ignore_missing, + }; + + Ok(processor_builder) } } @@ -297,49 +321,23 @@ impl Processor for TimestampProcessor { self.ignore_missing } - fn fields(&self) -> &Fields { - &self.fields - } - - fn fields_mut(&mut self) -> &mut Fields { - &mut self.fields - } - - fn output_keys(&self) -> HashSet { - self.fields - .iter() - .map(|f| f.get_target_field().to_string()) - .collect() - } - - fn exec_field(&self, val: &Value, field: &Field) -> Result { - self.process_field(val, field) - } - fn exec_mut(&self, val: &mut Vec) -> Result<(), String> { for field in self.fields.iter() { - let index = field.input_field.index; + let index = field.input().index; match val.get(index) { Some(Value::Null) | None => { if !self.ignore_missing { return Err(format!( "{} processor: missing field: {}", self.kind(), - field.get_field_name() + &field.input().name )); } } Some(v) => { - // TODO(qtang): Let this method use the intermediate state collection directly. 
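// Illustrative note (not part of the patch): parse yields an i64 and the
// configured resolution selects the Timestamp variant, e.g. with
// Resolution::Second the string "1573840000" becomes
// Timestamp::Second(1573840000), matching the epoch processor tests earlier
// in this series.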
- let mut map = self.process_field(v, field)?; - field - .output_fields_index_mapping - .iter() - .for_each(|(k, output_index)| { - if let Some(v) = map.remove(k) { - val[*output_index] = v; - } - }); + let result = self.parse(v)?; + let (_, index) = field.output(); + val[*index] = Value::Timestamp(result); } } } @@ -351,9 +349,18 @@ impl Processor for TimestampProcessor { mod tests { use yaml_rust::YamlLoader; - use super::TimestampProcessor; + use super::{TimestampProcessor, TimestampProcessorBuilder}; use crate::etl::value::{Timestamp, Value}; + fn builder_to_native_processor(builder: TimestampProcessorBuilder) -> TimestampProcessor { + TimestampProcessor { + fields: vec![], + formats: builder.formats, + resolution: builder.resolution, + ignore_missing: builder.ignore_missing, + } + } + #[test] fn test_parse_epoch() { let processor_yaml_str = r#"fields: @@ -367,7 +374,9 @@ formats: "#; let yaml = &YamlLoader::load_from_str(processor_yaml_str).unwrap()[0]; let timestamp_yaml = yaml.as_hash().unwrap(); - let processor = TimestampProcessor::try_from(timestamp_yaml).unwrap(); + let processor = builder_to_native_processor( + TimestampProcessorBuilder::try_from(timestamp_yaml).unwrap(), + ); let values = [ ( @@ -419,7 +428,9 @@ formats: "#; let yaml = &YamlLoader::load_from_str(processor_yaml_str).unwrap()[0]; let timestamp_yaml = yaml.as_hash().unwrap(); - let processor = TimestampProcessor::try_from(timestamp_yaml).unwrap(); + let processor = builder_to_native_processor( + TimestampProcessorBuilder::try_from(timestamp_yaml).unwrap(), + ); let values: Vec<&str> = vec![ "2014-5-17T12:34:56", diff --git a/src/pipeline/src/etl/processor/urlencoding.rs b/src/pipeline/src/etl/processor/urlencoding.rs index 67a9ff9ecc..7db9d092f2 100644 --- a/src/pipeline/src/etl/processor/urlencoding.rs +++ b/src/pipeline/src/etl/processor/urlencoding.rs @@ -15,12 +15,12 @@ use ahash::HashSet; use urlencoding::{decode, encode}; -use crate::etl::field::{Field, Fields}; +use crate::etl::field::{Fields, OneInputOneOutputField}; use crate::etl::processor::{ - yaml_bool, yaml_field, yaml_fields, yaml_string, FIELDS_NAME, FIELD_NAME, IGNORE_MISSING_NAME, - METHOD_NAME, + yaml_bool, yaml_new_field, yaml_new_fields, yaml_string, ProcessorBuilder, ProcessorKind, + FIELDS_NAME, FIELD_NAME, IGNORE_MISSING_NAME, METHOD_NAME, }; -use crate::etl::value::{Map, Value}; +use crate::etl::value::Value; pub(crate) const PROCESSOR_URL_ENCODING: &str = "urlencoding"; @@ -52,54 +52,76 @@ impl std::str::FromStr for Method { } } -/// only support string value #[derive(Debug, Default)] -pub struct UrlEncodingProcessor { +pub struct UrlEncodingProcessorBuilder { fields: Fields, method: Method, ignore_missing: bool, } +impl ProcessorBuilder for UrlEncodingProcessorBuilder { + fn output_keys(&self) -> HashSet<&str> { + self.fields + .iter() + .map(|f| f.target_or_input_field()) + .collect() + } + + fn input_keys(&self) -> HashSet<&str> { + self.fields.iter().map(|f| f.input_field()).collect() + } + + fn build(self, intermediate_keys: &[String]) -> Result { + self.build(intermediate_keys) + .map(ProcessorKind::UrlEncoding) + } +} + +impl UrlEncodingProcessorBuilder { + fn build(self, intermediate_keys: &[String]) -> Result { + let mut real_fields = vec![]; + for field in self.fields.into_iter() { + let input = OneInputOneOutputField::build( + "urlencoding", + intermediate_keys, + field.input_field(), + field.target_or_input_field(), + )?; + real_fields.push(input); + } + Ok(UrlEncodingProcessor { + fields: real_fields, + method: self.method, 
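// Illustrative round-trip (hypothetical values, matching test_decode_url at
// the end of this file; not part of the patch):
//
//   encode("//BC/[a=6.7.8.9,c=g,k=0,l=1]")
//     == "%2F%2FBC%2F%5Ba%3D6.7.8.9%2Cc%3Dg%2Ck%3D0%2Cl%3D1%5D"
//   decode("%2F%2FBC%2F%5Ba%3D6.7.8.9%2Cc%3Dg%2Ck%3D0%2Cl%3D1%5D").unwrap()
//     == "//BC/[a=6.7.8.9,c=g,k=0,l=1]"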
+ ignore_missing: self.ignore_missing, + }) + } +} + +/// only support string value +#[derive(Debug, Default)] +pub struct UrlEncodingProcessor { + fields: Vec, + method: Method, + ignore_missing: bool, +} + impl UrlEncodingProcessor { - fn with_fields(&mut self, mut fields: Fields) { - Self::update_output_keys(&mut fields); - self.fields = fields; - } - - fn with_ignore_missing(&mut self, ignore_missing: bool) { - self.ignore_missing = ignore_missing; - } - - fn with_method(&mut self, method: Method) { - self.method = method; - } - - fn process_field(&self, val: &str, field: &Field) -> Result { + fn process_field(&self, val: &str) -> Result { let processed = match self.method { Method::Encode => encode(val).to_string(), Method::Decode => decode(val).map_err(|e| e.to_string())?.into_owned(), }; - let val = Value::String(processed); - - let key = field.get_target_field(); - - Ok(Map::one(key, val)) - } - - fn update_output_keys(fields: &mut Fields) { - for field in fields.iter_mut() { - field - .output_fields_index_mapping - .insert(field.get_target_field().to_string(), 0_usize); - } + Ok(Value::String(processed)) } } -impl TryFrom<&yaml_rust::yaml::Hash> for UrlEncodingProcessor { +impl TryFrom<&yaml_rust::yaml::Hash> for UrlEncodingProcessorBuilder { type Error = String; fn try_from(value: &yaml_rust::yaml::Hash) -> Result { - let mut processor = UrlEncodingProcessor::default(); + let mut fields = Fields::default(); + let mut method = Method::Decode; + let mut ignore_missing = false; for (k, v) in value.iter() { let key = k @@ -107,24 +129,29 @@ impl TryFrom<&yaml_rust::yaml::Hash> for UrlEncodingProcessor { .ok_or(format!("key must be a string, but got {k:?}"))?; match key { FIELD_NAME => { - processor.with_fields(Fields::one(yaml_field(v, FIELD_NAME)?)); + fields = Fields::one(yaml_new_field(v, FIELD_NAME)?); } FIELDS_NAME => { - processor.with_fields(yaml_fields(v, FIELDS_NAME)?); + fields = yaml_new_fields(v, FIELDS_NAME)?; } IGNORE_MISSING_NAME => { - processor.with_ignore_missing(yaml_bool(v, IGNORE_MISSING_NAME)?); + ignore_missing = yaml_bool(v, IGNORE_MISSING_NAME)?; } METHOD_NAME => { - let method = yaml_string(v, METHOD_NAME)?; - processor.with_method(method.parse()?); + let method_str = yaml_string(v, METHOD_NAME)?; + method = method_str.parse()?; } _ => {} } } + let processor = UrlEncodingProcessorBuilder { + fields, + method, + ignore_missing, + }; Ok(processor) } @@ -139,52 +166,21 @@ impl crate::etl::processor::Processor for UrlEncodingProcessor { self.ignore_missing } - fn fields(&self) -> &Fields { - &self.fields - } - - fn fields_mut(&mut self) -> &mut Fields { - &mut self.fields - } - - fn output_keys(&self) -> HashSet { - self.fields - .iter() - .map(|f| f.get_target_field().to_string()) - .collect() - } - - fn exec_field(&self, val: &Value, field: &Field) -> Result { - match val { - Value::String(val) => self.process_field(val, field), - _ => Err(format!( - "{} processor: expect string value, but got {val:?}", - self.kind() - )), - } - } - fn exec_mut(&self, val: &mut Vec) -> Result<(), String> { for field in self.fields.iter() { - let index = field.input_field.index; + let index = field.input_index(); match val.get(index) { Some(Value::String(s)) => { - let mut map = self.process_field(s, field)?; - field - .output_fields_index_mapping - .iter() - .for_each(|(k, output_index)| { - if let Some(v) = map.remove(k) { - val[*output_index] = v; - } - }); + let result = self.process_field(s)?; + let output_index = field.output_index(); + val[output_index] = result; } 
Some(Value::Null) | None => { if !self.ignore_missing { return Err(format!( "{} processor: missing field: {}", self.kind(), - field.get_field_name() + field.output_name() )); } } @@ -202,29 +198,28 @@ impl crate::etl::processor::Processor for UrlEncodingProcessor { #[cfg(test)] mod tests { - use crate::etl::field::{Field, Fields}; + use crate::etl::processor::urlencoding::UrlEncodingProcessor; - use crate::etl::value::{Map, Value}; + use crate::etl::value::Value; #[test] fn test_decode_url() { - let field = "url"; - let ff: Field = field.parse().unwrap(); - let decoded = "//BC/[a=6.7.8.9,c=g,k=0,l=1]"; let encoded = "%2F%2FBC%2F%5Ba%3D6.7.8.9%2Cc%3Dg%2Ck%3D0%2Cl%3D1%5D"; - let mut processor = UrlEncodingProcessor::default(); - processor.with_fields(Fields::one(ff.clone())); - { - let result = processor.process_field(encoded, &ff).unwrap(); - assert_eq!(Map::one(field, Value::String(decoded.into())), result) + let processor = UrlEncodingProcessor::default(); + let result = processor.process_field(encoded).unwrap(); + assert_eq!(Value::String(decoded.into()), result) } { - processor.with_method(super::Method::Encode); - let result = processor.process_field(decoded, &ff).unwrap(); - assert_eq!(Map::one(field, Value::String(encoded.into())), result) + let processor = UrlEncodingProcessor { + fields: vec![], + method: super::Method::Encode, + ignore_missing: false, + }; + let result = processor.process_field(decoded).unwrap(); + assert_eq!(Value::String(encoded.into()), result) } } } diff --git a/src/pipeline/src/etl/transform.rs b/src/pipeline/src/etl/transform.rs index f6becad872..15d1bf3378 100644 --- a/src/pipeline/src/etl/transform.rs +++ b/src/pipeline/src/etl/transform.rs @@ -17,8 +17,8 @@ pub mod transformer; use itertools::Itertools; -use crate::etl::field::Fields; -use crate::etl::processor::{update_one_one_output_keys, yaml_field, yaml_fields, yaml_string}; +use crate::etl::find_key_index; +use crate::etl::processor::yaml_string; use crate::etl::transform::index::Index; use crate::etl::value::Value; @@ -31,6 +31,9 @@ const TRANSFORM_ON_FAILURE: &str = "on_failure"; pub use transformer::greptime::GreptimeTransformer; +use super::field::{Fields, InputFieldInfo, OneInputOneOutputField}; +use super::processor::{yaml_new_field, yaml_new_fields}; + pub trait Transformer: std::fmt::Display + Sized + Send + Sync + 'static { type Output; type VecOutput; @@ -39,12 +42,11 @@ pub trait Transformer: std::fmt::Display + Sized + Send + Sync + 'static { fn schemas(&self) -> &Vec; fn transforms(&self) -> &Transforms; fn transforms_mut(&mut self) -> &mut Transforms; - fn transform(&self, val: Value) -> Result; fn transform_mut(&self, val: &mut Vec) -> Result; } /// On Failure behavior when transform fails -#[derive(Debug, Clone, Default)] +#[derive(Debug, Clone, Default, Copy)] pub enum OnFailure { // Return None if transform fails #[default] @@ -74,12 +76,18 @@ impl std::fmt::Display for OnFailure { } } } +#[derive(Debug, Default, Clone)] +pub struct TransformBuilders { + pub(crate) builders: Vec, + pub(crate) output_keys: Vec, + pub(crate) required_keys: Vec, +} #[derive(Debug, Default, Clone)] pub struct Transforms { - transforms: Vec, - output_keys: Vec, - required_keys: Vec, + pub(crate) transforms: Vec, + pub(crate) output_keys: Vec, + pub(crate) required_keys: Vec, } impl Transforms { @@ -130,7 +138,7 @@ impl std::ops::DerefMut for Transforms { } } -impl TryFrom<&Vec> for Transforms { +impl TryFrom<&Vec> for TransformBuilders { type Error = String; fn try_from(docs: &Vec) -> Result { @@ -138,41 
+146,78 @@ impl TryFrom<&Vec> for Transforms { let mut all_output_keys: Vec = Vec::with_capacity(100); let mut all_required_keys = Vec::with_capacity(100); for doc in docs { - let transform: Transform = doc + let transform_builder: TransformBuilder = doc .as_hash() .ok_or("transform element must be a map".to_string())? .try_into()?; - let mut transform_output_keys = transform + let mut transform_output_keys = transform_builder .fields .iter() - .map(|f| f.get_target_field().to_string()) + .map(|f| f.target_or_input_field().to_string()) .collect(); all_output_keys.append(&mut transform_output_keys); - let mut transform_required_keys = transform + let mut transform_required_keys = transform_builder .fields .iter() - .map(|f| f.input_field.name.clone()) + .map(|f| f.input_field().to_string()) .collect(); all_required_keys.append(&mut transform_required_keys); - transforms.push(transform); + transforms.push(transform_builder); } all_required_keys.sort(); - Ok(Transforms { - transforms, + Ok(TransformBuilders { + builders: transforms, output_keys: all_output_keys, required_keys: all_required_keys, }) } } +#[derive(Debug, Clone)] +pub struct TransformBuilder { + fields: Fields, + type_: Value, + default: Option, + index: Option, + on_failure: Option, +} + +impl TransformBuilder { + pub fn build( + self, + intermediate_keys: &[String], + output_keys: &[String], + ) -> Result { + let mut real_fields = vec![]; + for field in self.fields { + let input_index = find_key_index(intermediate_keys, field.input_field(), "transform")?; + let input_field_info = InputFieldInfo::new(field.input_field(), input_index); + let output_index = + find_key_index(output_keys, field.target_or_input_field(), "transform")?; + let input = OneInputOneOutputField::new( + input_field_info, + (field.target_or_input_field().to_string(), output_index), + ); + real_fields.push(input); + } + Ok(Transform { + real_fields, + type_: self.type_, + default: self.default, + index: self.index, + on_failure: self.on_failure, + }) + } +} + /// only field is required #[derive(Debug, Clone)] pub struct Transform { - pub fields: Fields, + pub real_fields: Vec, pub type_: Value, @@ -192,7 +237,7 @@ impl std::fmt::Display for Transform { }; let type_ = format!("type: {}", self.type_); - let fields = format!("field(s): {}", self.fields); + let fields = format!("field(s): {:?}", self.real_fields); let default = if let Some(default) = &self.default { format!(", default: {}", default) } else { @@ -212,7 +257,7 @@ impl std::fmt::Display for Transform { impl Default for Transform { fn default() -> Self { Transform { - fields: Fields::default(), + real_fields: Vec::new(), type_: Value::Null, default: None, index: None, @@ -222,40 +267,6 @@ impl Default for Transform { } impl Transform { - fn with_fields(&mut self, mut fields: Fields) { - update_one_one_output_keys(&mut fields); - self.fields = fields; - } - - fn with_type(&mut self, type_: Value) { - self.type_ = type_; - } - - fn try_default(&mut self, default: Value) -> Result<(), String> { - match (&self.type_, &default) { - (Value::Null, _) => Err(format!( - "transform {} type MUST BE set before default {}", - self.fields, &default, - )), - (_, Value::Null) => Ok(()), // if default is not set, then it will be regarded as default null - (_, _) => { - let target = self - .type_ - .parse_str_value(default.to_str_value().as_str())?; - self.default = Some(target); - Ok(()) - } - } - } - - fn with_index(&mut self, index: Index) { - self.index = Some(index); - } - - fn with_on_failure(&mut self, 
on_failure: OnFailure) { - self.on_failure = Some(on_failure); - } - pub(crate) fn get_default(&self) -> Option<&Value> { self.default.as_ref() } @@ -265,52 +276,74 @@ impl Transform { } } -impl TryFrom<&yaml_rust::yaml::Hash> for Transform { +impl TryFrom<&yaml_rust::yaml::Hash> for TransformBuilder { type Error = String; fn try_from(hash: &yaml_rust::yaml::Hash) -> Result { - let mut transform = Transform::default(); - - let mut default_opt = None; + let mut fields = Fields::default(); + let mut type_ = Value::Null; + let mut default = None; + let mut index = None; + let mut on_failure = None; for (k, v) in hash { let key = k.as_str().ok_or("key must be a string")?; match key { TRANSFORM_FIELD => { - transform.with_fields(Fields::one(yaml_field(v, TRANSFORM_FIELD)?)); + fields = Fields::one(yaml_new_field(v, TRANSFORM_FIELD)?); } TRANSFORM_FIELDS => { - transform.with_fields(yaml_fields(v, TRANSFORM_FIELDS)?); + fields = yaml_new_fields(v, TRANSFORM_FIELDS)?; } TRANSFORM_TYPE => { let t = yaml_string(v, TRANSFORM_TYPE)?; - transform.with_type(Value::parse_str_type(&t)?); + type_ = Value::parse_str_type(&t)?; } TRANSFORM_INDEX => { - let index = yaml_string(v, TRANSFORM_INDEX)?; - transform.with_index(index.try_into()?); + let index_str = yaml_string(v, TRANSFORM_INDEX)?; + index = Some(index_str.try_into()?); } TRANSFORM_DEFAULT => { - default_opt = Some(Value::try_from(v)?); + default = Some(Value::try_from(v)?); } TRANSFORM_ON_FAILURE => { - let on_failure = yaml_string(v, TRANSFORM_ON_FAILURE)?; - transform.with_on_failure(on_failure.parse()?); + let on_failure_str = yaml_string(v, TRANSFORM_ON_FAILURE)?; + on_failure = Some(on_failure_str.parse()?); } _ => {} } } + let mut final_default = None; - if let Some(default) = default_opt { - transform.try_default(default)?; + if let Some(default_value) = default { + match (&type_, &default_value) { + (Value::Null, _) => { + return Err(format!( + "transform {:?} type MUST BE set before default {}", + fields, &default_value, + )); + } + (_, Value::Null) => {} // if default is not set, then it will be regarded as default null + (_, _) => { + let target = type_.parse_str_value(default_value.to_str_value().as_str())?; + final_default = Some(target); + } + } } + let builder = TransformBuilder { + fields, + type_, + default: final_default, + index, + on_failure, + }; - Ok(transform) + Ok(builder) } } diff --git a/src/pipeline/src/etl/transform/transformer/greptime.rs b/src/pipeline/src/etl/transform/transformer/greptime.rs index d9eaec2920..9753b01004 100644 --- a/src/pipeline/src/etl/transform/transformer/greptime.rs +++ b/src/pipeline/src/etl/transform/transformer/greptime.rs @@ -20,10 +20,10 @@ use coerce::{coerce_columns, coerce_value}; use greptime_proto::v1::{ColumnSchema, Row, Rows, Value as GreptimeValue}; use itertools::Itertools; -use crate::etl::field::{Field, Fields}; +use crate::etl::field::{InputFieldInfo, OneInputOneOutputField}; use crate::etl::transform::index::Index; use crate::etl::transform::{Transform, Transformer, Transforms}; -use crate::etl::value::{Array, Map, Timestamp, Value}; +use crate::etl::value::{Timestamp, Value}; const DEFAULT_GREPTIME_TIMESTAMP_COLUMN: &str = "greptime_timestamp"; @@ -36,23 +36,41 @@ pub struct GreptimeTransformer { } impl GreptimeTransformer { - fn default_greptime_timestamp_column() -> Transform { + /// Add a default timestamp column to the transforms + fn add_greptime_timestamp_column(transforms: &mut Transforms) { let ns = chrono::Utc::now().timestamp_nanos_opt().unwrap_or(0); let type_ = 
Value::Timestamp(Timestamp::Nanosecond(ns)); let default = Some(type_.clone()); - let mut field = Field::new(DEFAULT_GREPTIME_TIMESTAMP_COLUMN); - field.insert_output_index(DEFAULT_GREPTIME_TIMESTAMP_COLUMN.to_string(), 0); - let fields = Fields::new(vec![field]).unwrap(); - Transform { - fields, + let transform = Transform { + real_fields: vec![OneInputOneOutputField::new( + InputFieldInfo { + name: DEFAULT_GREPTIME_TIMESTAMP_COLUMN.to_string(), + index: usize::MAX, + }, + ( + DEFAULT_GREPTIME_TIMESTAMP_COLUMN.to_string(), + transforms + .transforms + .iter() + .map(|x| x.real_fields.len()) + .sum(), + ), + )], type_, default, index: Some(Index::Time), on_failure: Some(crate::etl::transform::OnFailure::Default), - } + }; + let required_keys = transforms.required_keys_mut(); + required_keys.push(DEFAULT_GREPTIME_TIMESTAMP_COLUMN.to_string()); + + let output_keys = transforms.output_keys_mut(); + output_keys.push(DEFAULT_GREPTIME_TIMESTAMP_COLUMN.to_string()); + transforms.push(transform); } + /// Generate the schema for the GreptimeTransformer fn schemas(transforms: &Transforms) -> Result, String> { let mut schema = vec![]; for transform in transforms.iter() { @@ -60,53 +78,6 @@ impl GreptimeTransformer { } Ok(schema) } - - fn transform_map(&self, map: &Map) -> Result { - let mut values = vec![GreptimeValue { value_data: None }; self.schema.len()]; - for transform in self.transforms.iter() { - for field in transform.fields.iter() { - let value_data = match map.get(field.get_field_name()) { - Some(val) => coerce_value(val, transform)?, - None => { - let default = transform.get_default(); - match default { - Some(default) => coerce_value(default, transform)?, - None => None, - } - } - }; - if let Some(i) = field - .output_fields_index_mapping - .iter() - .next() - .map(|kv| kv.1) - { - values[*i] = GreptimeValue { value_data } - } else { - return Err(format!( - "field: {} output_fields is empty.", - field.get_field_name() - )); - } - } - } - - Ok(Row { values }) - } - - fn transform_array(&self, arr: &Array) -> Result, String> { - let mut rows = Vec::with_capacity(arr.len()); - for v in arr.iter() { - match v { - Value::Map(map) => { - let row = self.transform_map(map)?; - rows.push(row); - } - _ => return Err(format!("Expected map, found: {v:?}")), - } - } - Ok(rows) - } } impl std::fmt::Display for GreptimeTransformer { @@ -129,9 +100,9 @@ impl Transformer for GreptimeTransformer { for transform in transforms.iter() { let target_fields_set = transform - .fields + .real_fields .iter() - .map(|f| f.get_target_field()) + .map(|f| f.output_name()) .collect::>(); let intersections: Vec<_> = column_names_set.intersection(&target_fields_set).collect(); @@ -146,12 +117,15 @@ impl Transformer for GreptimeTransformer { if let Some(idx) = transform.index { if idx == Index::Time { - match transform.fields.len() { - 1 => timestamp_columns.push(transform.fields.first().unwrap().get_field_name()), - _ => return Err(format!( - "Illegal to set multiple timestamp Index columns, please set only one: {}", - transform.fields.get_target_fields().join(", ") - )), + match transform.real_fields.len() { + 1 => timestamp_columns + .push(transform.real_fields.first().unwrap().input_name()), + _ => { + return Err(format!( + "Illegal to set multiple timestamp Index columns, please set only one: {}", + transform.real_fields.iter().map(|x|x.input_name()).join(", ") + )) + } } } } @@ -159,13 +133,7 @@ impl Transformer for GreptimeTransformer { match timestamp_columns.len() { 0 => { - 
transforms.push(GreptimeTransformer::default_greptime_timestamp_column()); - - let required_keys = transforms.required_keys_mut(); - required_keys.push(DEFAULT_GREPTIME_TIMESTAMP_COLUMN.to_string()); - - let output_keys = transforms.output_keys_mut(); - output_keys.push(DEFAULT_GREPTIME_TIMESTAMP_COLUMN.to_string()); + GreptimeTransformer::add_greptime_timestamp_column(&mut transforms); let schema = GreptimeTransformer::schemas(&transforms)?; Ok(GreptimeTransformer { transforms, schema }) @@ -184,54 +152,26 @@ impl Transformer for GreptimeTransformer { } } - fn transform(&self, value: Value) -> Result { - match value { - Value::Map(map) => { - let rows = vec![self.transform_map(&map)?]; - Ok(Rows { - schema: self.schema.clone(), - rows, - }) - } - Value::Array(arr) => { - let rows = self.transform_array(&arr)?; - Ok(Rows { - schema: self.schema.clone(), - rows, - }) - } - _ => Err(format!("Expected map or array, found: {}", value)), - } - } - fn transform_mut(&self, val: &mut Vec) -> Result { let mut values = vec![GreptimeValue { value_data: None }; self.schema.len()]; for transform in self.transforms.iter() { - for field in transform.fields.iter() { - let index = field.input_field.index; + for field in transform.real_fields.iter() { + let index = field.input_index(); + let output_index = field.output_index(); match val.get(index) { Some(v) => { let value_data = coerce_value(v, transform) - .map_err(|e| format!("{} processor: {}", field.get_field_name(), e))?; + .map_err(|e| format!("{} processor: {}", field.input_name(), e))?; // every transform fields has only one output field - if let Some(i) = field - .output_fields_index_mapping - .iter() - .next() - .map(|kv| kv.1) - { - values[*i] = GreptimeValue { value_data } - } else { - return Err(format!( - "field: {} output_fields is empty.", - field.get_field_name() - )); - } + values[output_index] = GreptimeValue { value_data }; } - _ => { - return Err(format!( - "Get field not in the array field: {field:?}, {val:?}" - )) + None => { + let default = transform.get_default(); + let value_data = match default { + Some(default) => coerce_value(default, transform)?, + None => None, + }; + values[output_index] = GreptimeValue { value_data }; } } } diff --git a/src/pipeline/src/etl/transform/transformer/greptime/coerce.rs b/src/pipeline/src/etl/transform/transformer/greptime/coerce.rs index 4e83d0b203..8c7efef22f 100644 --- a/src/pipeline/src/etl/transform/transformer/greptime/coerce.rs +++ b/src/pipeline/src/etl/transform/transformer/greptime/coerce.rs @@ -66,8 +66,8 @@ impl TryFrom for ValueData { pub(crate) fn coerce_columns(transform: &Transform) -> Result, String> { let mut columns = Vec::new(); - for field in transform.fields.iter() { - let column_name = field.get_target_field().to_string(); + for field in transform.real_fields.iter() { + let column_name = field.output_name().to_string(); let datatype = coerce_type(transform)? 
as i32; @@ -134,7 +134,7 @@ fn coerce_type(transform: &Transform) -> Result { Value::Null => Err(format!( "Null type not supported when to coerce '{}' type", - transform.fields + transform.type_.to_str_type() )), } } @@ -144,15 +144,18 @@ pub(crate) fn coerce_value( transform: &Transform, ) -> Result, String> { match val { - Value::Null => match transform.on_failure { - Some(OnFailure::Ignore) => Ok(None), - Some(OnFailure::Default) => transform - .get_default() - .map(|default| coerce_value(default, transform)) - .unwrap_or_else(|| { - coerce_value(transform.get_type_matched_default_val(), transform) - }), - None => Ok(None), + Value::Null => match &transform.default { + Some(default) => coerce_value(default, transform), + None => match transform.on_failure { + Some(OnFailure::Ignore) => Ok(None), + Some(OnFailure::Default) => transform + .get_default() + .map(|default| coerce_value(default, transform)) + .unwrap_or_else(|| { + coerce_value(transform.get_type_matched_default_val(), transform) + }), + None => Ok(None), + }, }, Value::Int8(n) => coerce_i64_value(*n as i64, transform), @@ -404,12 +407,11 @@ fn coerce_string_value(s: &String, transform: &Transform) -> Result Rows { - let input_value: Value = serde_json::from_str::(input_str) - .expect("failed to parse into json") - .try_into() - .expect("failed to convert into value"); + let input_value = serde_json::from_str::(input_str).unwrap(); let yaml_content = Content::Yaml(pipeline_yaml.into()); let pipeline: Pipeline = parse(&yaml_content).expect("failed to parse pipeline"); + let mut result = pipeline.init_intermediate_state(); - pipeline.exec(input_value).expect("failed to exec pipeline") + let schema = pipeline.schemas().clone(); + + let mut rows = Vec::new(); + + match input_value { + serde_json::Value::Array(array) => { + for value in array { + pipeline.prepare(value, &mut result).unwrap(); + let row = pipeline + .exec_mut(&mut result) + .expect("failed to exec pipeline"); + rows.push(row); + pipeline.reset_intermediate_state(&mut result); + } + } + serde_json::Value::Object(_) => { + pipeline.prepare(input_value, &mut result).unwrap(); + let row = pipeline + .exec_mut(&mut result) + .expect("failed to exec pipeline"); + rows.push(row); + } + _ => { + panic!("invalid input value"); + } + } + + Rows { schema, rows } } /// test util function to create column schema diff --git a/src/pipeline/tests/dissect.rs b/src/pipeline/tests/dissect.rs index 10f9e27996..82ce63399c 100644 --- a/src/pipeline/tests/dissect.rs +++ b/src/pipeline/tests/dissect.rs @@ -157,7 +157,7 @@ transform: fn test_modifier() { let empty_str = r#" { - "str": "key1 key2 key3 key4 key5 key6 key7 key8" + "str": "key1 key2 key3 key4 key5 key6" }"#; let pipeline_yaml = r#" @@ -165,7 +165,7 @@ processors: - dissect: field: str patterns: - - "%{key1} %{key2} %{+key3} %{+key3/2} %{key5->} %{?key6} %{*key_7} %{&key_7}" + - "%{key1} %{key2} %{+key3} %{+key3/2} %{key5->} %{?key6}" transform: - fields: @@ -173,7 +173,6 @@ transform: - key2 - key3 - key5 - - key7 type: string "#; @@ -184,7 +183,6 @@ transform: make_string_column_schema("key2".to_string()), make_string_column_schema("key3".to_string()), make_string_column_schema("key5".to_string()), - make_string_column_schema("key7".to_string()), common::make_column_schema( "greptime_timestamp".to_string(), ColumnDataType::TimestampNanosecond, @@ -209,10 +207,6 @@ transform: output.rows[0].values[3].value_data, Some(StringValue("key5".to_string())) ); - assert_eq!( - output.rows[0].values[4].value_data, - 
Some(StringValue("key8".to_string())) - ); } #[test] diff --git a/src/pipeline/tests/pipeline.rs b/src/pipeline/tests/pipeline.rs index af3b5a8c20..d5712eaedd 100644 --- a/src/pipeline/tests/pipeline.rs +++ b/src/pipeline/tests/pipeline.rs @@ -12,18 +12,18 @@ // See the License for the specific language governing permissions and // limitations under the License. +use api::v1::Rows; use common_telemetry::tracing::info; use greptime_proto::v1::value::ValueData::{ BoolValue, F64Value, StringValue, TimestampNanosecondValue, TimestampSecondValue, U32Value, U64Value, U8Value, }; use greptime_proto::v1::Value as GreptimeValue; -use pipeline::{parse, Content, GreptimeTransformer, Pipeline, Value}; +use pipeline::{parse, Content, GreptimeTransformer, Pipeline}; #[test] fn test_complex_data() { let input_value_str = r#" - [ { "version": 1, "streamId": "12345", @@ -73,12 +73,9 @@ fn test_complex_data() { "ewExecutionInfo": "c:4380:7:161:162:161:n:::12473:200|C:4380:3:0:4:0:n:::6967:200|R:4380:20:99:99:1:n:::35982:200", "customField": "any-custom-value" } - ] "#; - let input_value: Value = serde_json::from_str::(input_value_str) - .expect("failed to parse input value") - .try_into() - .expect("failed to convert input value"); + let input_value = serde_json::from_str::(input_value_str) + .expect("failed to parse input value"); let pipeline_yaml = r#" --- @@ -422,7 +419,19 @@ transform: let yaml_content = Content::Yaml(pipeline_yaml.into()); let pipeline: Pipeline = parse(&yaml_content).expect("failed to parse pipeline"); - let output = pipeline.exec(input_value).expect("failed to exec pipeline"); + let mut stats = pipeline.init_intermediate_state(); + pipeline + .prepare(input_value, &mut stats) + .expect("failed to prepare pipeline"); + + let row = pipeline + .exec_mut(&mut stats) + .expect("failed to exec pipeline"); + + let output = Rows { + schema: pipeline.schemas().clone(), + rows: vec![row], + }; assert_eq!(output.rows.len(), 1); let values = output.rows.first().unwrap().values.clone(); @@ -464,10 +473,7 @@ fn test_simple_data() { "line": "2024-05-25 20:16:37.217 hello world" } "#; - let input_value: Value = serde_json::from_str::(input_value_str) - .unwrap() - .try_into() - .unwrap(); + let input_value = serde_json::from_str::(input_value_str).unwrap(); let pipeline_yaml = r#" processors: @@ -493,11 +499,13 @@ transform: let yaml_content = Content::Yaml(pipeline_yaml.into()); let pipeline: Pipeline = parse(&yaml_content).unwrap(); - let output = pipeline.exec(input_value).unwrap(); - let r = output - .rows + + let mut status = pipeline.init_intermediate_state(); + pipeline.prepare(input_value, &mut status).unwrap(); + let row = pipeline.exec_mut(&mut status).unwrap(); + let r = row + .values .into_iter() - .flat_map(|v| v.values) .map(|v| v.value_data.unwrap()) .collect::>(); From e88465840d2e11bf65eb4f3907aaf95ee04c0878 Mon Sep 17 00:00:00 2001 From: jeremyhi Date: Fri, 6 Sep 2024 16:29:20 +0800 Subject: [PATCH 5/8] feat: add extension field to HeartbeatRequest (#4688) * feat: add extension field to HeartbeatRequest * chore: extension to extensions * chore: upgrade proto --- Cargo.lock | 2 +- Cargo.toml | 2 +- src/api/src/region.rs | 6 +++--- src/common/meta/src/ddl/alter_logical_tables.rs | 2 +- src/common/meta/src/ddl/create_logical_tables.rs | 2 +- src/datanode/src/heartbeat.rs | 4 +++- src/datanode/src/region_server.rs | 8 ++++---- src/meta-srv/src/handler/failure_handler.rs | 1 + src/meta-srv/src/handler/node_stat.rs | 5 ++++- src/meta-srv/src/handler/region_lease_handler.rs | 1 + 
src/meta-srv/src/procedure/utils.rs | 2 +- src/meta-srv/src/selector/weight_compute.rs | 3 +++ src/metric-engine/src/engine.rs | 2 +- src/operator/src/flow.rs | 2 +- 14 files changed, 26 insertions(+), 16 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index a433b41841..6391920f47 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4300,7 +4300,7 @@ checksum = "d2fabcfbdc87f4758337ca535fb41a6d701b65693ce38287d856d1674551ec9b" [[package]] name = "greptime-proto" version = "0.1.0" -source = "git+https://github.com/GreptimeTeam/greptime-proto.git?rev=c437b55725b7f5224fe9d46db21072b4a682ee4b#c437b55725b7f5224fe9d46db21072b4a682ee4b" +source = "git+https://github.com/GreptimeTeam/greptime-proto.git?rev=157cfdb52709e489cf1f3ce8e3042ed4ee8a524a#157cfdb52709e489cf1f3ce8e3042ed4ee8a524a" dependencies = [ "prost 0.12.6", "serde", diff --git a/Cargo.toml b/Cargo.toml index e4a04c1f47..93ea8db134 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -120,7 +120,7 @@ etcd-client = { version = "0.13" } fst = "0.4.7" futures = "0.3" futures-util = "0.3" -greptime-proto = { git = "https://github.com/GreptimeTeam/greptime-proto.git", rev = "c437b55725b7f5224fe9d46db21072b4a682ee4b" } +greptime-proto = { git = "https://github.com/GreptimeTeam/greptime-proto.git", rev = "157cfdb52709e489cf1f3ce8e3042ed4ee8a524a" } humantime = "2.1" humantime-serde = "1.1" itertools = "0.10" diff --git a/src/api/src/region.rs b/src/api/src/region.rs index 0493378213..d752382534 100644 --- a/src/api/src/region.rs +++ b/src/api/src/region.rs @@ -21,14 +21,14 @@ use greptime_proto::v1::region::RegionResponse as RegionResponseV1; #[derive(Debug)] pub struct RegionResponse { pub affected_rows: AffectedRows, - pub extension: HashMap>, + pub extensions: HashMap>, } impl RegionResponse { pub fn from_region_response(region_response: RegionResponseV1) -> Self { Self { affected_rows: region_response.affected_rows as _, - extension: region_response.extension, + extensions: region_response.extensions, } } @@ -36,7 +36,7 @@ impl RegionResponse { pub fn new(affected_rows: AffectedRows) -> Self { Self { affected_rows, - extension: Default::default(), + extensions: Default::default(), } } } diff --git a/src/common/meta/src/ddl/alter_logical_tables.rs b/src/common/meta/src/ddl/alter_logical_tables.rs index 48d34b4307..3af359ef6e 100644 --- a/src/common/meta/src/ddl/alter_logical_tables.rs +++ b/src/common/meta/src/ddl/alter_logical_tables.rs @@ -131,7 +131,7 @@ impl AlterLogicalTablesProcedure { let phy_raw_schemas = future::join_all(alter_region_tasks) .await .into_iter() - .map(|res| res.map(|mut res| res.extension.remove(ALTER_PHYSICAL_EXTENSION_KEY))) + .map(|res| res.map(|mut res| res.extensions.remove(ALTER_PHYSICAL_EXTENSION_KEY))) .collect::>>()?; if phy_raw_schemas.is_empty() { diff --git a/src/common/meta/src/ddl/create_logical_tables.rs b/src/common/meta/src/ddl/create_logical_tables.rs index 5095b7c32e..4b867147be 100644 --- a/src/common/meta/src/ddl/create_logical_tables.rs +++ b/src/common/meta/src/ddl/create_logical_tables.rs @@ -157,7 +157,7 @@ impl CreateLogicalTablesProcedure { let phy_raw_schemas = join_all(create_region_tasks) .await .into_iter() - .map(|res| res.map(|mut res| res.extension.remove(ALTER_PHYSICAL_EXTENSION_KEY))) + .map(|res| res.map(|mut res| res.extensions.remove(ALTER_PHYSICAL_EXTENSION_KEY))) .collect::>>()?; if phy_raw_schemas.is_empty() { diff --git a/src/datanode/src/heartbeat.rs b/src/datanode/src/heartbeat.rs index 04e9d9ac5b..68b4637fce 100644 --- a/src/datanode/src/heartbeat.rs +++ 
b/src/datanode/src/heartbeat.rs @@ -324,10 +324,12 @@ impl HeartbeatTask { region_id: stat.region_id.as_u64(), engine: stat.engine, role: RegionRole::from(stat.role).into(), - // TODO(jeremy): w/rcus + // TODO(weny): w/rcus rcus: 0, wcus: 0, approximate_bytes: region_server.region_disk_usage(stat.region_id).unwrap_or(0), + // TODO(weny): add extensions + extensions: Default::default(), }) .collect() } diff --git a/src/datanode/src/region_server.rs b/src/datanode/src/region_server.rs index f6cc479d6a..56068a38c3 100644 --- a/src/datanode/src/region_server.rs +++ b/src/datanode/src/region_server.rs @@ -366,10 +366,10 @@ impl RegionServerHandler for RegionServer { // merge results by sum up affected rows and merge extensions. let mut affected_rows = 0; - let mut extension = HashMap::new(); + let mut extensions = HashMap::new(); for result in results { affected_rows += result.affected_rows; - extension.extend(result.extension); + extensions.extend(result.extensions); } Ok(RegionResponseV1 { @@ -380,7 +380,7 @@ impl RegionServerHandler for RegionServer { }), }), affected_rows: affected_rows as _, - extension, + extensions, }) } } @@ -708,7 +708,7 @@ impl RegionServerInner { .await?; Ok(RegionResponse { affected_rows: result.affected_rows, - extension: result.extension, + extensions: result.extensions, }) } Err(err) => { diff --git a/src/meta-srv/src/handler/failure_handler.rs b/src/meta-srv/src/handler/failure_handler.rs index f8acdd75c2..ebeeaf6b7f 100644 --- a/src/meta-srv/src/handler/failure_handler.rs +++ b/src/meta-srv/src/handler/failure_handler.rs @@ -93,6 +93,7 @@ mod tests { approximate_bytes: 0, engine: default_engine().to_string(), role: RegionRole::Follower, + extensions: Default::default(), } } acc.stat = Some(Stat { diff --git a/src/meta-srv/src/handler/node_stat.rs b/src/meta-srv/src/handler/node_stat.rs index b7fe55a0f4..5f1ec1cc2b 100644 --- a/src/meta-srv/src/handler/node_stat.rs +++ b/src/meta-srv/src/handler/node_stat.rs @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -use std::collections::HashSet; +use std::collections::{HashMap, HashSet}; use api::v1::meta::HeartbeatRequest; use common_meta::ClusterId; @@ -57,6 +57,8 @@ pub struct RegionStat { pub engine: String, /// The region role. 
pub role: RegionRole, + /// The extension info of this region + pub extensions: HashMap>, } impl Stat { @@ -142,6 +144,7 @@ impl TryFrom for RegionStat { approximate_bytes: value.approximate_bytes, engine: value.engine.to_string(), role: RegionRole::from(value.role()), + extensions: value.extensions, }) } } diff --git a/src/meta-srv/src/handler/region_lease_handler.rs b/src/meta-srv/src/handler/region_lease_handler.rs index 2481e86c8f..28ddb436e0 100644 --- a/src/meta-srv/src/handler/region_lease_handler.rs +++ b/src/meta-srv/src/handler/region_lease_handler.rs @@ -135,6 +135,7 @@ mod test { wcus: 0, approximate_bytes: 0, engine: String::new(), + extensions: Default::default(), } } diff --git a/src/meta-srv/src/procedure/utils.rs b/src/meta-srv/src/procedure/utils.rs index 09f0400ba1..c4e1688de0 100644 --- a/src/meta-srv/src/procedure/utils.rs +++ b/src/meta-srv/src/procedure/utils.rs @@ -100,7 +100,7 @@ pub mod mock { }), }), affected_rows: 0, - extension: Default::default(), + extensions: Default::default(), }) } } diff --git a/src/meta-srv/src/selector/weight_compute.rs b/src/meta-srv/src/selector/weight_compute.rs index a87a1b3b7f..c8c555d204 100644 --- a/src/meta-srv/src/selector/weight_compute.rs +++ b/src/meta-srv/src/selector/weight_compute.rs @@ -199,6 +199,7 @@ mod tests { approximate_bytes: 1, engine: "mito2".to_string(), role: RegionRole::Leader, + extensions: Default::default(), }], ..Default::default() } @@ -215,6 +216,7 @@ mod tests { approximate_bytes: 1, engine: "mito2".to_string(), role: RegionRole::Leader, + extensions: Default::default(), }], ..Default::default() } @@ -231,6 +233,7 @@ mod tests { approximate_bytes: 1, engine: "mito2".to_string(), role: RegionRole::Leader, + extensions: Default::default(), }], ..Default::default() } diff --git a/src/metric-engine/src/engine.rs b/src/metric-engine/src/engine.rs index f4e386a053..08414a97e4 100644 --- a/src/metric-engine/src/engine.rs +++ b/src/metric-engine/src/engine.rs @@ -162,7 +162,7 @@ impl RegionEngine for MetricEngine { result.map_err(BoxedError::new).map(|rows| RegionResponse { affected_rows: rows, - extension: extension_return_value, + extensions: extension_return_value, }) } diff --git a/src/operator/src/flow.rs b/src/operator/src/flow.rs index d6344e278d..1c82fcf00a 100644 --- a/src/operator/src/flow.rs +++ b/src/operator/src/flow.rs @@ -119,7 +119,7 @@ impl FlowServiceOperator { if let Some(prev) = &mut final_result { prev.affected_rows = res.affected_rows; prev.affected_flows.extend(res.affected_flows); - prev.extension.extend(res.extension); + prev.extensions.extend(res.extensions); } else { final_result = Some(res); } From 5d9f8a3be74702abfeab7c7afb6eeee1e8f20a09 Mon Sep 17 00:00:00 2001 From: localhost Date: Fri, 6 Sep 2024 16:36:49 +0800 Subject: [PATCH 6/8] feat: add test pipeline api (#4667) * chore: add test pipeline api * chore: add test for test pipeline api * chore: fix taplo check * chore: change pipeline dryrun api path * chore: add more info for pipeline dryrun api --- Cargo.lock | 2 + src/datatypes/Cargo.toml | 2 + src/datatypes/src/value.rs | 168 ++++++++++++++++++++++++++++++++ src/servers/src/http.rs | 1 + src/servers/src/http/event.rs | 116 +++++++++++++++++++++- tests-integration/tests/http.rs | 166 +++++++++++++++++++++++++++++++ 6 files changed, 453 insertions(+), 2 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 6391920f47..d483ec7088 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3156,6 +3156,7 @@ dependencies = [ "arrow", "arrow-array", "arrow-schema", + "base64 0.21.7", 
"common-base", "common-decimal", "common-error", @@ -3164,6 +3165,7 @@ dependencies = [ "common-time", "datafusion-common", "enum_dispatch", + "greptime-proto", "num", "num-traits", "ordered-float 3.9.2", diff --git a/src/datatypes/Cargo.toml b/src/datatypes/Cargo.toml index 281057ce80..b10ea682dd 100644 --- a/src/datatypes/Cargo.toml +++ b/src/datatypes/Cargo.toml @@ -15,6 +15,7 @@ workspace = true arrow.workspace = true arrow-array.workspace = true arrow-schema.workspace = true +base64.workspace = true common-base.workspace = true common-decimal.workspace = true common-error.workspace = true @@ -23,6 +24,7 @@ common-telemetry.workspace = true common-time.workspace = true datafusion-common.workspace = true enum_dispatch = "0.3" +greptime-proto.workspace = true num = "0.4" num-traits = "0.2" ordered-float = { version = "3.0", features = ["serde"] } diff --git a/src/datatypes/src/value.rs b/src/datatypes/src/value.rs index 15aa028f4f..6c49154e40 100644 --- a/src/datatypes/src/value.rs +++ b/src/datatypes/src/value.rs @@ -18,6 +18,8 @@ use std::sync::Arc; use arrow::datatypes::{DataType as ArrowDataType, Field}; use arrow_array::{Array, ListArray}; +use base64::engine::general_purpose::URL_SAFE; +use base64::Engine as _; use common_base::bytes::{Bytes, StringBytes}; use common_decimal::Decimal128; use common_telemetry::error; @@ -28,8 +30,10 @@ use common_time::time::Time; use common_time::timestamp::{TimeUnit, Timestamp}; use common_time::{Duration, Interval, Timezone}; use datafusion_common::ScalarValue; +use greptime_proto::v1::value::ValueData; pub use ordered_float::OrderedFloat; use serde::{Deserialize, Serialize, Serializer}; +use serde_json::{Number, Value as JsonValue}; use snafu::{ensure, ResultExt}; use crate::error::{self, ConvertArrowArrayToScalarsSnafu, Error, Result, TryFromValueSnafu}; @@ -1364,15 +1368,179 @@ impl<'a> ValueRef<'a> { } } +pub fn column_data_to_json(data: ValueData) -> JsonValue { + match data { + ValueData::BinaryValue(b) => JsonValue::String(URL_SAFE.encode(b)), + ValueData::BoolValue(b) => JsonValue::Bool(b), + ValueData::U8Value(i) => JsonValue::Number(i.into()), + ValueData::U16Value(i) => JsonValue::Number(i.into()), + ValueData::U32Value(i) => JsonValue::Number(i.into()), + ValueData::U64Value(i) => JsonValue::Number(i.into()), + ValueData::I8Value(i) => JsonValue::Number(i.into()), + ValueData::I16Value(i) => JsonValue::Number(i.into()), + ValueData::I32Value(i) => JsonValue::Number(i.into()), + ValueData::I64Value(i) => JsonValue::Number(i.into()), + ValueData::F32Value(f) => Number::from_f64(f as f64) + .map(JsonValue::Number) + .unwrap_or(JsonValue::Null), + ValueData::F64Value(f) => Number::from_f64(f) + .map(JsonValue::Number) + .unwrap_or(JsonValue::Null), + ValueData::StringValue(s) => JsonValue::String(s), + ValueData::DateValue(d) => JsonValue::String(Date::from(d).to_string()), + ValueData::DatetimeValue(d) => JsonValue::String(DateTime::from(d).to_string()), + ValueData::TimeSecondValue(d) => JsonValue::String(Time::new_second(d).to_iso8601_string()), + ValueData::TimeMillisecondValue(d) => { + JsonValue::String(Time::new_millisecond(d).to_iso8601_string()) + } + ValueData::TimeMicrosecondValue(d) => { + JsonValue::String(Time::new_microsecond(d).to_iso8601_string()) + } + ValueData::TimeNanosecondValue(d) => { + JsonValue::String(Time::new_nanosecond(d).to_iso8601_string()) + } + ValueData::TimestampMicrosecondValue(d) => { + JsonValue::String(Timestamp::new_microsecond(d).to_iso8601_string()) + } + ValueData::TimestampMillisecondValue(d) 
=> { + JsonValue::String(Timestamp::new_millisecond(d).to_iso8601_string()) + } + ValueData::TimestampNanosecondValue(d) => { + JsonValue::String(Timestamp::new_nanosecond(d).to_iso8601_string()) + } + ValueData::TimestampSecondValue(d) => { + JsonValue::String(Timestamp::new_second(d).to_iso8601_string()) + } + ValueData::IntervalYearMonthValue(d) => JsonValue::String(format!("interval year [{}]", d)), + ValueData::IntervalMonthDayNanoValue(d) => JsonValue::String(format!( + "interval month [{}][{}][{}]", + d.months, d.days, d.nanoseconds + )), + ValueData::IntervalDayTimeValue(d) => JsonValue::String(format!("interval day [{}]", d)), + ValueData::Decimal128Value(d) => { + JsonValue::String(format!("decimal128 [{}][{}]", d.hi, d.lo)) + } + } +} + #[cfg(test)] mod tests { use arrow::datatypes::DataType as ArrowDataType; use common_time::timezone::set_default_timezone; + use greptime_proto::v1::{Decimal128 as ProtoDecimal128, IntervalMonthDayNano}; use num_traits::Float; use super::*; use crate::vectors::ListVectorBuilder; + #[test] + fn test_column_data_to_json() { + assert_eq!( + column_data_to_json(ValueData::BinaryValue(b"hello".to_vec())), + JsonValue::String("aGVsbG8=".to_string()) + ); + assert_eq!( + column_data_to_json(ValueData::BoolValue(true)), + JsonValue::Bool(true) + ); + assert_eq!( + column_data_to_json(ValueData::U8Value(1)), + JsonValue::Number(1.into()) + ); + assert_eq!( + column_data_to_json(ValueData::U16Value(2)), + JsonValue::Number(2.into()) + ); + assert_eq!( + column_data_to_json(ValueData::U32Value(3)), + JsonValue::Number(3.into()) + ); + assert_eq!( + column_data_to_json(ValueData::U64Value(4)), + JsonValue::Number(4.into()) + ); + assert_eq!( + column_data_to_json(ValueData::I8Value(5)), + JsonValue::Number(5.into()) + ); + assert_eq!( + column_data_to_json(ValueData::I16Value(6)), + JsonValue::Number(6.into()) + ); + assert_eq!( + column_data_to_json(ValueData::I32Value(7)), + JsonValue::Number(7.into()) + ); + assert_eq!( + column_data_to_json(ValueData::I64Value(8)), + JsonValue::Number(8.into()) + ); + assert_eq!( + column_data_to_json(ValueData::F32Value(9.0)), + JsonValue::Number(Number::from_f64(9.0_f64).unwrap()) + ); + assert_eq!( + column_data_to_json(ValueData::F64Value(10.0)), + JsonValue::Number(Number::from_f64(10.0_f64).unwrap()) + ); + assert_eq!( + column_data_to_json(ValueData::StringValue("hello".to_string())), + JsonValue::String("hello".to_string()) + ); + assert_eq!( + column_data_to_json(ValueData::DateValue(123)), + JsonValue::String("1970-05-04".to_string()) + ); + assert_eq!( + column_data_to_json(ValueData::DatetimeValue(456)), + JsonValue::String("1970-01-01 00:00:00.456+0000".to_string()) + ); + assert_eq!( + column_data_to_json(ValueData::TimeSecondValue(789)), + JsonValue::String("00:13:09+0000".to_string()) + ); + assert_eq!( + column_data_to_json(ValueData::TimeMillisecondValue(789)), + JsonValue::String("00:00:00.789+0000".to_string()) + ); + assert_eq!( + column_data_to_json(ValueData::TimeMicrosecondValue(789)), + JsonValue::String("00:00:00.000789+0000".to_string()) + ); + assert_eq!( + column_data_to_json(ValueData::TimestampMillisecondValue(1234567890)), + JsonValue::String("1970-01-15 06:56:07.890+0000".to_string()) + ); + assert_eq!( + column_data_to_json(ValueData::TimestampNanosecondValue(1234567890123456789)), + JsonValue::String("2009-02-13 23:31:30.123456789+0000".to_string()) + ); + assert_eq!( + column_data_to_json(ValueData::TimestampSecondValue(1234567890)), + JsonValue::String("2009-02-13 
23:31:30+0000".to_string()) + ); + assert_eq!( + column_data_to_json(ValueData::IntervalYearMonthValue(12)), + JsonValue::String("interval year [12]".to_string()) + ); + assert_eq!( + column_data_to_json(ValueData::IntervalMonthDayNanoValue(IntervalMonthDayNano { + months: 1, + days: 2, + nanoseconds: 3, + })), + JsonValue::String("interval month [1][2][3]".to_string()) + ); + assert_eq!( + column_data_to_json(ValueData::IntervalDayTimeValue(4)), + JsonValue::String("interval day [4]".to_string()) + ); + assert_eq!( + column_data_to_json(ValueData::Decimal128Value(ProtoDecimal128 { hi: 5, lo: 6 })), + JsonValue::String("decimal128 [5][6]".to_string()) + ); + } + #[test] fn test_try_from_scalar_value() { assert_eq!( diff --git a/src/servers/src/http.rs b/src/servers/src/http.rs index ad4ff52225..5ac52157ea 100644 --- a/src/servers/src/http.rs +++ b/src/servers/src/http.rs @@ -753,6 +753,7 @@ impl HttpServer { "/pipelines/:pipeline_name", routing::delete(event::delete_pipeline), ) + .route("/pipelines/dryrun", routing::post(event::pipeline_dryrun)) .layer( ServiceBuilder::new() .layer(HandleErrorLayer::new(handle_error)) diff --git a/src/servers/src/http/event.rs b/src/servers/src/http/event.rs index fb436142fc..dbd7f1232a 100644 --- a/src/servers/src/http/event.rs +++ b/src/servers/src/http/event.rs @@ -23,15 +23,16 @@ use axum::headers::ContentType; use axum::http::header::CONTENT_TYPE; use axum::http::{Request, StatusCode}; use axum::response::{IntoResponse, Response}; -use axum::{async_trait, BoxError, Extension, TypedHeader}; +use axum::{async_trait, BoxError, Extension, Json, TypedHeader}; use common_query::{Output, OutputData}; use common_telemetry::{error, warn}; +use datatypes::value::column_data_to_json; use pipeline::error::PipelineTransformSnafu; use pipeline::util::to_pipeline_version; use pipeline::PipelineVersion; use schemars::JsonSchema; use serde::{Deserialize, Serialize}; -use serde_json::{Deserializer, Value}; +use serde_json::{Deserializer, Map, Value}; use session::context::{Channel, QueryContext, QueryContextRef}; use snafu::{ensure, OptionExt, ResultExt}; @@ -230,6 +231,117 @@ fn transform_ndjson_array_factory( }) } +#[axum_macros::debug_handler] +pub async fn pipeline_dryrun( + State(log_state): State, + Query(query_params): Query, + Extension(mut query_ctx): Extension, + TypedHeader(content_type): TypedHeader, + payload: String, +) -> Result { + let handler = log_state.log_handler; + let pipeline_name = query_params.pipeline_name.context(InvalidParameterSnafu { + reason: "pipeline_name is required", + })?; + + let version = to_pipeline_version(query_params.version).context(PipelineSnafu)?; + + let ignore_errors = query_params.ignore_errors.unwrap_or(false); + + let value = extract_pipeline_value_by_content_type(content_type, payload, ignore_errors)?; + + if value.len() > 10 { + return Err(InvalidParameterSnafu { + reason: "too many rows for dryrun", + } + .build()); + } + + query_ctx.set_channel(Channel::Http); + let query_ctx = Arc::new(query_ctx); + + let pipeline = handler + .get_pipeline(&pipeline_name, version, query_ctx.clone()) + .await?; + + let mut intermediate_state = pipeline.init_intermediate_state(); + + let mut results = Vec::with_capacity(value.len()); + for v in value { + pipeline + .prepare(v, &mut intermediate_state) + .map_err(|reason| PipelineTransformSnafu { reason }.build()) + .context(PipelineSnafu)?; + let r = pipeline + .exec_mut(&mut intermediate_state) + .map_err(|reason| PipelineTransformSnafu { reason }.build()) + 
.context(PipelineSnafu)?; + results.push(r); + pipeline.reset_intermediate_state(&mut intermediate_state); + } + + let colume_type_key = "colume_type"; + let data_type_key = "data_type"; + let name_key = "name"; + + let schema = pipeline + .schemas() + .iter() + .map(|cs| { + let mut map = Map::new(); + map.insert(name_key.to_string(), Value::String(cs.column_name.clone())); + map.insert( + data_type_key.to_string(), + Value::String(cs.datatype().as_str_name().to_string()), + ); + map.insert( + colume_type_key.to_string(), + Value::String(cs.semantic_type().as_str_name().to_string()), + ); + map.insert( + "fulltext".to_string(), + Value::Bool( + cs.options + .clone() + .is_some_and(|x| x.options.contains_key("fulltext")), + ), + ); + Value::Object(map) + }) + .collect::>(); + let rows = results + .into_iter() + .map(|row| { + let row = row + .values + .into_iter() + .enumerate() + .map(|(idx, v)| { + v.value_data + .map(|d| { + let mut map = Map::new(); + map.insert("value".to_string(), column_data_to_json(d)); + map.insert("key".to_string(), schema[idx][name_key].clone()); + map.insert( + "semantic_type".to_string(), + schema[idx][colume_type_key].clone(), + ); + map.insert("data_type".to_string(), schema[idx][data_type_key].clone()); + Value::Object(map) + }) + .unwrap_or(Value::Null) + }) + .collect(); + Value::Array(row) + }) + .collect::>(); + let mut result = Map::new(); + result.insert("schema".to_string(), Value::Array(schema)); + result.insert("rows".to_string(), Value::Array(rows)); + let result = Value::Object(result); + Ok(Json(result).into_response()) +} + #[axum_macros::debug_handler] pub async fn log_ingester( State(log_state): State, diff --git a/tests-integration/tests/http.rs b/tests-integration/tests/http.rs index 497ea4969c..56307e0427 100644 --- a/tests-integration/tests/http.rs +++ b/tests-integration/tests/http.rs @@ -78,6 +78,7 @@ macro_rules! http_tests { test_vm_proto_remote_write, test_pipeline_api, + test_test_pipeline_api, test_plain_text_ingestion, ); )* @@ -1146,6 +1147,171 @@ transform: guard.remove_all().await; } +pub async fn test_test_pipeline_api(store_type: StorageType) { + common_telemetry::init_default_ut_logging(); + let (app, mut guard) = setup_test_http_app_with_frontend(store_type, "test_pipeline_api").await; + + // handshake + let client = TestClient::new(app); + + let body = r#" +processors: + - date: + field: time + formats: + - "%Y-%m-%d %H:%M:%S%.3f" + ignore_missing: true + +transform: + - fields: + - id1 + - id2 + type: int32 + - fields: + - type + - log + - logger + type: string + - field: time + type: time + index: timestamp +"#; + + // 1. create pipeline + let res = client + .post("/v1/events/pipelines/test") + .header("Content-Type", "application/x-yaml") + .body(body) + .send() + .await; + + assert_eq!(res.status(), StatusCode::OK); + + let content = res.text().await; + + let content = serde_json::from_str(&content); + assert!(content.is_ok()); + // {"execution_time_ms":13,"pipelines":[{"name":"test","version":"2024-07-04 08:31:00.987136"}]} + let content: Value = content.unwrap(); + + let execution_time = content.get("execution_time_ms"); + assert!(execution_time.unwrap().is_number()); + let pipelines = content.get("pipelines"); + let pipelines = pipelines.unwrap().as_array().unwrap(); + assert_eq!(pipelines.len(), 1); + let pipeline = pipelines.first().unwrap(); + assert_eq!(pipeline.get("name").unwrap(), "test"); + + // 2. 
write data + let data_body = r#" + [ + { + "id1": "2436", + "id2": "2528", + "logger": "INTERACT.MANAGER", + "type": "I", + "time": "2024-05-25 20:16:37.217", + "log": "ClusterAdapter:enter sendTextDataToCluster\\n" + } + ] + "#; + let res = client + .post("/v1/events/pipelines/dryrun?pipeline_name=test") + .header("Content-Type", "application/json") + .body(data_body) + .send() + .await; + assert_eq!(res.status(), StatusCode::OK); + let body: serde_json::Value = res.json().await; + let schema = &body["schema"]; + let rows = &body["rows"]; + assert_eq!( + schema, + &json!([ + { + "colume_type": "FIELD", + "data_type": "INT32", + "fulltext": false, + "name": "id1" + }, + { + "colume_type": "FIELD", + "data_type": "INT32", + "fulltext": false, + "name": "id2" + }, + { + "colume_type": "FIELD", + "data_type": "STRING", + "fulltext": false, + "name": "type" + }, + { + "colume_type": "FIELD", + "data_type": "STRING", + "fulltext": false, + "name": "log" + }, + { + "colume_type": "FIELD", + "data_type": "STRING", + "fulltext": false, + "name": "logger" + }, + { + "colume_type": "TIMESTAMP", + "data_type": "TIMESTAMP_NANOSECOND", + "fulltext": false, + "name": "time" + } + ]) + ); + assert_eq!( + rows, + &json!([ + [ + { + "data_type": "INT32", + "key": "id1", + "semantic_type": "FIELD", + "value": 2436 + }, + { + "data_type": "INT32", + "key": "id2", + "semantic_type": "FIELD", + "value": 2528 + }, + { + "data_type": "STRING", + "key": "type", + "semantic_type": "FIELD", + "value": "I" + }, + { + "data_type": "STRING", + "key": "log", + "semantic_type": "FIELD", + "value": "ClusterAdapter:enter sendTextDataToCluster\\n" + }, + { + "data_type": "STRING", + "key": "logger", + "semantic_type": "FIELD", + "value": "INTERACT.MANAGER" + }, + { + "data_type": "TIMESTAMP_NANOSECOND", + "key": "time", + "semantic_type": "TIMESTAMP", + "value": "2024-05-25 20:16:37.217+0000" + } + ] + ]) + ); + guard.remove_all().await; +} + pub async fn test_plain_text_ingestion(store_type: StorageType) { common_telemetry::init_default_ut_logging(); let (app, mut guard) = setup_test_http_app_with_frontend(store_type, "test_pipeline_api").await; From d2d62e0c6f42e75f500a85847494faa97032b6e3 Mon Sep 17 00:00:00 2001 From: Ruihang Xia Date: Sat, 7 Sep 2024 12:28:11 +0800 Subject: [PATCH 7/8] fix: unconditional statistics (#4694) * fix: unconditional statistics Signed-off-by: Ruihang Xia * add more sqlness case Signed-off-by: Ruihang Xia --------- Signed-off-by: Ruihang Xia --- src/mito2/src/read/scan_region.rs | 4 ++ src/mito2/src/read/seq_scan.rs | 5 ++ src/mito2/src/read/unordered_scan.rs | 5 ++ src/store-api/src/region_engine.rs | 7 +++ src/table/src/table/scan.rs | 2 +- .../standalone/common/aggregate/count.result | 47 +++++++++++++++++++ .../standalone/common/aggregate/count.sql | 24 ++++++++++ 7 files changed, 93 insertions(+), 1 deletion(-) diff --git a/src/mito2/src/read/scan_region.rs b/src/mito2/src/read/scan_region.rs index dcf5b4395c..ec45c9b934 100644 --- a/src/mito2/src/read/scan_region.rs +++ b/src/mito2/src/read/scan_region.rs @@ -709,6 +709,10 @@ impl ScanInput { rows_in_files + rows_in_memtables } + pub(crate) fn predicate(&self) -> Option { + self.predicate.clone() + } + /// Retrieves [`PartitionRange`] from memtable and files pub(crate) fn partition_ranges(&self) -> Vec { let mut id = 0; diff --git a/src/mito2/src/read/seq_scan.rs b/src/mito2/src/read/seq_scan.rs index ec5fcf53d3..ca232df834 100644 --- a/src/mito2/src/read/seq_scan.rs +++ b/src/mito2/src/read/seq_scan.rs @@ -515,6 +515,11 @@ impl 
RegionScanner for SeqScan { self.properties.partitions = ranges; Ok(()) } + + fn has_predicate(&self) -> bool { + let predicate = self.stream_ctx.input.predicate(); + predicate.map(|p| !p.exprs().is_empty()).unwrap_or(false) + } } impl DisplayAs for SeqScan { diff --git a/src/mito2/src/read/unordered_scan.rs b/src/mito2/src/read/unordered_scan.rs index ec43654e09..5dfcc519d6 100644 --- a/src/mito2/src/read/unordered_scan.rs +++ b/src/mito2/src/read/unordered_scan.rs @@ -228,6 +228,11 @@ impl RegionScanner for UnorderedScan { Ok(stream) } + + fn has_predicate(&self) -> bool { + let predicate = self.stream_ctx.input.predicate(); + predicate.map(|p| !p.exprs().is_empty()).unwrap_or(false) + } } impl DisplayAs for UnorderedScan { diff --git a/src/store-api/src/region_engine.rs b/src/store-api/src/region_engine.rs index cf37fe82f9..84555a595b 100644 --- a/src/store-api/src/region_engine.rs +++ b/src/store-api/src/region_engine.rs @@ -233,6 +233,9 @@ pub trait RegionScanner: Debug + DisplayAs + Send { /// # Panics /// Panics if the `partition` is out of bound. fn scan_partition(&self, partition: usize) -> Result; + + /// Check if there is any predicate that may be executed in this scanner. + fn has_predicate(&self) -> bool; } pub type RegionScannerRef = Box; @@ -367,6 +370,10 @@ impl RegionScanner for SinglePartitionScanner { )) }) } + + fn has_predicate(&self) -> bool { + false + } } impl DisplayAs for SinglePartitionScanner { diff --git a/src/table/src/table/scan.rs b/src/table/src/table/scan.rs index 19283058c6..e67c6dc032 100644 --- a/src/table/src/table/scan.rs +++ b/src/table/src/table/scan.rs @@ -180,7 +180,7 @@ impl ExecutionPlan for RegionScanExec { } fn statistics(&self) -> DfResult { - let statistics = if self.append_mode { + let statistics = if self.append_mode && !self.scanner.lock().unwrap().has_predicate() { let column_statistics = self .arrow_schema .fields diff --git a/tests/cases/standalone/common/aggregate/count.result b/tests/cases/standalone/common/aggregate/count.result index 4523118d18..f93189d985 100644 --- a/tests/cases/standalone/common/aggregate/count.result +++ b/tests/cases/standalone/common/aggregate/count.result @@ -54,3 +54,50 @@ drop table test; Affected Rows: 0 +-- Append table +create table count_where_bug ( + tag String, + ts TimestampMillisecond time index, + num Int64, + primary key (tag), +) engine=mito with('append_mode'='true'); + +Affected Rows: 0 + +insert into count_where_bug (tag, ts, num) +values ('a', '2024-09-06T06:00:01Z', 1), + ('a', '2024-09-06T06:00:02Z', 2), + ('a', '2024-09-06T06:00:03Z', 3), + ('b', '2024-09-06T06:00:04Z', 4), + ('b', '2024-09-06T06:00:05Z', 5); + +Affected Rows: 5 + +select count(1) from count_where_bug where tag = 'b'; + ++-----------------+ +| COUNT(Int64(1)) | ++-----------------+ +| 2 | ++-----------------+ + +select count(1) from count_where_bug where ts > '2024-09-06T06:00:04Z'; + ++-----------------+ +| COUNT(Int64(1)) | ++-----------------+ +| 1 | ++-----------------+ + +select count(1) from count_where_bug where num != 3; + ++-----------------+ +| COUNT(Int64(1)) | ++-----------------+ +| 4 | ++-----------------+ + +drop table count_where_bug; + +Affected Rows: 0 + diff --git a/tests/cases/standalone/common/aggregate/count.sql b/tests/cases/standalone/common/aggregate/count.sql index 80100c96ae..22745b723c 100644 --- a/tests/cases/standalone/common/aggregate/count.sql +++ b/tests/cases/standalone/common/aggregate/count.sql @@ -17,3 +17,27 @@ select count(*) from (select * from test cross join "HelloWorld"); drop 
table "HelloWorld"; drop table test; + +-- Append table + +create table count_where_bug ( + tag String, + ts TimestampMillisecond time index, + num Int64, + primary key (tag), +) engine=mito with('append_mode'='true'); + +insert into count_where_bug (tag, ts, num) +values ('a', '2024-09-06T06:00:01Z', 1), + ('a', '2024-09-06T06:00:02Z', 2), + ('a', '2024-09-06T06:00:03Z', 3), + ('b', '2024-09-06T06:00:04Z', 4), + ('b', '2024-09-06T06:00:05Z', 5); + +select count(1) from count_where_bug where tag = 'b'; + +select count(1) from count_where_bug where ts > '2024-09-06T06:00:04Z'; + +select count(1) from count_where_bug where num != 3; + +drop table count_where_bug; From b950e705f5e6d681084c0313cc8e458ff94943db Mon Sep 17 00:00:00 2001 From: Yiran Date: Sat, 7 Sep 2024 23:27:32 +0800 Subject: [PATCH 8/8] chore: update the document link in README.md (#4690) --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 1eb0db19ff..cb0519f321 100644 --- a/README.md +++ b/README.md @@ -74,7 +74,7 @@ Our core developers have been building time-series data platforms for years. Bas * **Compatible with InfluxDB, Prometheus and more protocols** - Widely adopted database protocols and APIs, including MySQL, PostgreSQL, and Prometheus Remote Storage, etc. [Read more](https://docs.greptime.com/user-guide/clients/overview). + Widely adopted database protocols and APIs, including MySQL, PostgreSQL, and Prometheus Remote Storage, etc. [Read more](https://docs.greptime.com/user-guide/protocols/overview). ## Try GreptimeDB