Merge branch 'main' into transform-count-min-max
Signed-off-by: Ruihang Xia <waynestxia@gmail.com>
Changed file: Cargo.lock (generated, 140 lines changed)
@@ -214,7 +214,7 @@ checksum = "d301b3b94cb4b2f23d7917810addbbaff90738e0ca2be692bd027e70d7e0330c"

 [[package]]
 name = "api"
-version = "0.9.2"
+version = "0.9.3"
 dependencies = [
  "common-base",
  "common-decimal",

@@ -762,7 +762,7 @@ dependencies = [

 [[package]]
 name = "auth"
-version = "0.9.2"
+version = "0.9.3"
 dependencies = [
  "api",
  "async-trait",

@@ -1286,7 +1286,7 @@ dependencies = [

 [[package]]
 name = "cache"
-version = "0.9.2"
+version = "0.9.3"
 dependencies = [
  "catalog",
  "common-error",

@@ -1294,7 +1294,7 @@ dependencies = [
  "common-meta",
  "moka",
  "snafu 0.8.4",
- "substrait 0.9.2",
+ "substrait 0.9.3",
 ]

 [[package]]

@@ -1321,7 +1321,7 @@ checksum = "37b2a672a2cb129a2e41c10b1224bb368f9f37a2b16b612598138befd7b37eb5"

 [[package]]
 name = "catalog"
-version = "0.9.2"
+version = "0.9.3"
 dependencies = [
  "api",
  "arrow",

@@ -1647,7 +1647,7 @@ checksum = "4b82cf0babdbd58558212896d1a4272303a57bdb245c2bf1147185fb45640e70"

 [[package]]
 name = "client"
-version = "0.9.2"
+version = "0.9.3"
 dependencies = [
  "api",
  "arc-swap",

@@ -1677,7 +1677,7 @@ dependencies = [
  "serde_json",
  "snafu 0.8.4",
  "substrait 0.37.3",
- "substrait 0.9.2",
+ "substrait 0.9.3",
  "tokio",
  "tokio-stream",
  "tonic 0.11.0",

@@ -1707,7 +1707,7 @@ dependencies = [

 [[package]]
 name = "cmd"
-version = "0.9.2"
+version = "0.9.3"
 dependencies = [
  "async-trait",
  "auth",

@@ -1763,7 +1763,7 @@ dependencies = [
  "session",
  "snafu 0.8.4",
  "store-api",
- "substrait 0.9.2",
+ "substrait 0.9.3",
  "table",
  "temp-env",
  "tempfile",

@@ -1809,7 +1809,7 @@ checksum = "55b672471b4e9f9e95499ea597ff64941a309b2cdbffcc46f2cc5e2d971fd335"

 [[package]]
 name = "common-base"
-version = "0.9.2"
+version = "0.9.3"
 dependencies = [
  "anymap",
  "bitvec",

@@ -1825,7 +1825,7 @@ dependencies = [

 [[package]]
 name = "common-catalog"
-version = "0.9.2"
+version = "0.9.3"
 dependencies = [
  "chrono",
  "common-error",

@@ -1836,7 +1836,7 @@ dependencies = [

 [[package]]
 name = "common-config"
-version = "0.9.2"
+version = "0.9.3"
 dependencies = [
  "common-base",
  "common-error",

@@ -1859,7 +1859,7 @@ dependencies = [

 [[package]]
 name = "common-datasource"
-version = "0.9.2"
+version = "0.9.3"
 dependencies = [
  "arrow",
  "arrow-schema",

@@ -1896,7 +1896,7 @@ dependencies = [

 [[package]]
 name = "common-decimal"
-version = "0.9.2"
+version = "0.9.3"
 dependencies = [
  "bigdecimal",
  "common-error",

@@ -1909,7 +1909,7 @@ dependencies = [

 [[package]]
 name = "common-error"
-version = "0.9.2"
+version = "0.9.3"
 dependencies = [
  "snafu 0.8.4",
  "strum 0.25.0",

@@ -1918,7 +1918,7 @@ dependencies = [

 [[package]]
 name = "common-frontend"
-version = "0.9.2"
+version = "0.9.3"
 dependencies = [
  "api",
  "async-trait",

@@ -1933,7 +1933,7 @@ dependencies = [

 [[package]]
 name = "common-function"
-version = "0.9.2"
+version = "0.9.3"
 dependencies = [
  "api",
  "arc-swap",

@@ -1970,7 +1970,7 @@ dependencies = [

 [[package]]
 name = "common-greptimedb-telemetry"
-version = "0.9.2"
+version = "0.9.3"
 dependencies = [
  "async-trait",
  "common-runtime",

@@ -1987,7 +1987,7 @@ dependencies = [

 [[package]]
 name = "common-grpc"
-version = "0.9.2"
+version = "0.9.3"
 dependencies = [
  "api",
  "arrow-flight",

@@ -2013,7 +2013,7 @@ dependencies = [

 [[package]]
 name = "common-grpc-expr"
-version = "0.9.2"
+version = "0.9.3"
 dependencies = [
  "api",
  "common-base",

@@ -2031,7 +2031,7 @@ dependencies = [

 [[package]]
 name = "common-macro"
-version = "0.9.2"
+version = "0.9.3"
 dependencies = [
  "arc-swap",
  "common-query",

@@ -2045,7 +2045,7 @@ dependencies = [

 [[package]]
 name = "common-mem-prof"
-version = "0.9.2"
+version = "0.9.3"
 dependencies = [
  "common-error",
  "common-macro",

@@ -2058,7 +2058,7 @@ dependencies = [

 [[package]]
 name = "common-meta"
-version = "0.9.2"
+version = "0.9.3"
 dependencies = [
  "anymap2",
  "api",

@@ -2114,11 +2114,11 @@ dependencies = [

 [[package]]
 name = "common-plugins"
-version = "0.9.2"
+version = "0.9.3"

 [[package]]
 name = "common-procedure"
-version = "0.9.2"
+version = "0.9.3"
 dependencies = [
  "async-stream",
  "async-trait",

@@ -2144,7 +2144,7 @@ dependencies = [

 [[package]]
 name = "common-procedure-test"
-version = "0.9.2"
+version = "0.9.3"
 dependencies = [
  "async-trait",
  "common-procedure",

@@ -2152,7 +2152,7 @@ dependencies = [

 [[package]]
 name = "common-query"
-version = "0.9.2"
+version = "0.9.3"
 dependencies = [
  "api",
  "async-trait",

@@ -2178,7 +2178,7 @@ dependencies = [

 [[package]]
 name = "common-recordbatch"
-version = "0.9.2"
+version = "0.9.3"
 dependencies = [
  "arc-swap",
  "common-error",

@@ -2197,7 +2197,7 @@ dependencies = [

 [[package]]
 name = "common-runtime"
-version = "0.9.2"
+version = "0.9.3"
 dependencies = [
  "async-trait",
  "common-error",

@@ -2219,7 +2219,7 @@ dependencies = [

 [[package]]
 name = "common-telemetry"
-version = "0.9.2"
+version = "0.9.3"
 dependencies = [
  "atty",
  "backtrace",

@@ -2246,7 +2246,7 @@ dependencies = [

 [[package]]
 name = "common-test-util"
-version = "0.9.2"
+version = "0.9.3"
 dependencies = [
  "client",
  "common-query",

@@ -2258,7 +2258,7 @@ dependencies = [

 [[package]]
 name = "common-time"
-version = "0.9.2"
+version = "0.9.3"
 dependencies = [
  "arrow",
  "chrono",

@@ -2274,7 +2274,7 @@ dependencies = [

 [[package]]
 name = "common-version"
-version = "0.9.2"
+version = "0.9.3"
 dependencies = [
  "build-data",
  "const_format",

@@ -2285,7 +2285,7 @@ dependencies = [

 [[package]]
 name = "common-wal"
-version = "0.9.2"
+version = "0.9.3"
 dependencies = [
  "common-base",
  "common-error",

@@ -3093,7 +3093,7 @@ dependencies = [

 [[package]]
 name = "datanode"
-version = "0.9.2"
+version = "0.9.3"
 dependencies = [
  "api",
  "arrow-flight",

@@ -3142,7 +3142,7 @@ dependencies = [
  "session",
  "snafu 0.8.4",
  "store-api",
- "substrait 0.9.2",
+ "substrait 0.9.3",
  "table",
  "tokio",
  "toml 0.8.14",

@@ -3151,11 +3151,12 @@ dependencies = [

 [[package]]
 name = "datatypes"
-version = "0.9.2"
+version = "0.9.3"
 dependencies = [
  "arrow",
  "arrow-array",
  "arrow-schema",
+ "base64 0.21.7",
  "common-base",
  "common-decimal",
  "common-error",

@@ -3164,6 +3165,7 @@ dependencies = [
  "common-time",
  "datafusion-common",
  "enum_dispatch",
+ "greptime-proto",
  "num",
  "num-traits",
  "ordered-float 3.9.2",

@@ -3721,7 +3723,7 @@ dependencies = [

 [[package]]
 name = "file-engine"
-version = "0.9.2"
+version = "0.9.3"
 dependencies = [
  "api",
  "async-trait",

@@ -3823,7 +3825,7 @@ checksum = "28a80e3145d8ad11ba0995949bbcf48b9df2be62772b3d351ef017dff6ecb853"

 [[package]]
 name = "flow"
-version = "0.9.2"
+version = "0.9.3"
 dependencies = [
  "api",
  "arrow",

@@ -3880,7 +3882,7 @@ dependencies = [
  "snafu 0.8.4",
  "store-api",
  "strum 0.25.0",
- "substrait 0.9.2",
+ "substrait 0.9.3",
  "table",
  "tokio",
  "tonic 0.11.0",

@@ -3927,7 +3929,7 @@ checksum = "6c2141d6d6c8512188a7891b4b01590a45f6dac67afb4f255c4124dbb86d4eaa"

 [[package]]
 name = "frontend"
-version = "0.9.2"
+version = "0.9.3"
 dependencies = [
  "api",
  "arc-swap",

@@ -4300,7 +4302,7 @@ checksum = "d2fabcfbdc87f4758337ca535fb41a6d701b65693ce38287d856d1674551ec9b"
 [[package]]
 name = "greptime-proto"
 version = "0.1.0"
-source = "git+https://github.com/GreptimeTeam/greptime-proto.git?rev=c437b55725b7f5224fe9d46db21072b4a682ee4b#c437b55725b7f5224fe9d46db21072b4a682ee4b"
+source = "git+https://github.com/GreptimeTeam/greptime-proto.git?rev=157cfdb52709e489cf1f3ce8e3042ed4ee8a524a#157cfdb52709e489cf1f3ce8e3042ed4ee8a524a"
 dependencies = [
  "prost 0.12.6",
  "serde",

@@ -5078,7 +5080,7 @@ dependencies = [

 [[package]]
 name = "index"
-version = "0.9.2"
+version = "0.9.3"
 dependencies = [
  "async-trait",
  "asynchronous-codec",

@@ -5858,7 +5860,7 @@ checksum = "90ed8c1e510134f979dbc4f070f87d4313098b704861a105fe34231c70a3901c"

 [[package]]
 name = "log-store"
-version = "0.9.2"
+version = "0.9.3"
 dependencies = [
  "async-stream",
  "async-trait",

@@ -6170,7 +6172,7 @@ dependencies = [

 [[package]]
 name = "meta-client"
-version = "0.9.2"
+version = "0.9.3"
 dependencies = [
  "api",
  "async-trait",

@@ -6196,7 +6198,7 @@ dependencies = [

 [[package]]
 name = "meta-srv"
-version = "0.9.2"
+version = "0.9.3"
 dependencies = [
  "api",
  "async-trait",

@@ -6274,7 +6276,7 @@ dependencies = [

 [[package]]
 name = "metric-engine"
-version = "0.9.2"
+version = "0.9.3"
 dependencies = [
  "api",
  "aquamarine",

@@ -6365,7 +6367,7 @@ dependencies = [

 [[package]]
 name = "mito2"
-version = "0.9.2"
+version = "0.9.3"
 dependencies = [
  "api",
  "aquamarine",

@@ -7012,7 +7014,7 @@ dependencies = [

 [[package]]
 name = "object-store"
-version = "0.9.2"
+version = "0.9.3"
 dependencies = [
  "anyhow",
  "bytes",

@@ -7259,7 +7261,7 @@ dependencies = [

 [[package]]
 name = "operator"
-version = "0.9.2"
+version = "0.9.3"
 dependencies = [
  "api",
  "async-trait",

@@ -7304,7 +7306,7 @@ dependencies = [
  "sql",
  "sqlparser 0.45.0 (git+https://github.com/GreptimeTeam/sqlparser-rs.git?rev=54a267ac89c09b11c0c88934690530807185d3e7)",
  "store-api",
- "substrait 0.9.2",
+ "substrait 0.9.3",
  "table",
  "tokio",
  "tokio-util",

@@ -7554,7 +7556,7 @@ dependencies = [

 [[package]]
 name = "partition"
-version = "0.9.2"
+version = "0.9.3"
 dependencies = [
  "api",
  "async-trait",

@@ -7843,7 +7845,7 @@ checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184"

 [[package]]
 name = "pipeline"
-version = "0.9.2"
+version = "0.9.3"
 dependencies = [
  "ahash 0.8.11",
  "api",

@@ -8004,7 +8006,7 @@ dependencies = [

 [[package]]
 name = "plugins"
-version = "0.9.2"
+version = "0.9.3"
 dependencies = [
  "auth",
  "common-base",

@@ -8273,7 +8275,7 @@ dependencies = [

 [[package]]
 name = "promql"
-version = "0.9.2"
+version = "0.9.3"
 dependencies = [
  "ahash 0.8.11",
  "async-trait",

@@ -8508,7 +8510,7 @@ dependencies = [

 [[package]]
 name = "puffin"
-version = "0.9.2"
+version = "0.9.3"
 dependencies = [
  "async-compression 0.4.11",
  "async-trait",

@@ -8630,7 +8632,7 @@ dependencies = [

 [[package]]
 name = "query"
-version = "0.9.2"
+version = "0.9.3"
 dependencies = [
  "ahash 0.8.11",
  "api",

@@ -8693,7 +8695,7 @@ dependencies = [
  "stats-cli",
  "store-api",
  "streaming-stats",
- "substrait 0.9.2",
+ "substrait 0.9.3",
  "table",
  "tokio",
  "tokio-stream",

@@ -10055,7 +10057,7 @@ checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49"

 [[package]]
 name = "script"
-version = "0.9.2"
+version = "0.9.3"
 dependencies = [
  "api",
  "arc-swap",

@@ -10349,7 +10351,7 @@ dependencies = [

 [[package]]
 name = "servers"
-version = "0.9.2"
+version = "0.9.3"
 dependencies = [
  "aide",
  "api",

@@ -10455,7 +10457,7 @@ dependencies = [

 [[package]]
 name = "session"
-version = "0.9.2"
+version = "0.9.3"
 dependencies = [
  "api",
  "arc-swap",

@@ -10756,7 +10758,7 @@ dependencies = [

 [[package]]
 name = "sql"
-version = "0.9.2"
+version = "0.9.3"
 dependencies = [
  "api",
  "chrono",

@@ -10816,7 +10818,7 @@ dependencies = [

 [[package]]
 name = "sqlness-runner"
-version = "0.9.2"
+version = "0.9.3"
 dependencies = [
  "async-trait",
  "clap 4.5.7",

@@ -11033,7 +11035,7 @@ dependencies = [

 [[package]]
 name = "store-api"
-version = "0.9.2"
+version = "0.9.3"
 dependencies = [
  "api",
  "aquamarine",

@@ -11202,7 +11204,7 @@ dependencies = [

 [[package]]
 name = "substrait"
-version = "0.9.2"
+version = "0.9.3"
 dependencies = [
  "async-trait",
  "bytes",

@@ -11403,7 +11405,7 @@ dependencies = [

 [[package]]
 name = "table"
-version = "0.9.2"
+version = "0.9.3"
 dependencies = [
  "api",
  "async-trait",

@@ -11668,7 +11670,7 @@ checksum = "3369f5ac52d5eb6ab48c6b4ffdc8efbcad6b89c765749064ba298f2c68a16a76"

 [[package]]
 name = "tests-fuzz"
-version = "0.9.2"
+version = "0.9.3"
 dependencies = [
  "arbitrary",
  "async-trait",

@@ -11710,7 +11712,7 @@ dependencies = [

 [[package]]
 name = "tests-integration"
-version = "0.9.2"
+version = "0.9.3"
 dependencies = [
  "api",
  "arrow-flight",

@@ -11770,7 +11772,7 @@ dependencies = [
  "sql",
  "sqlx",
  "store-api",
- "substrait 0.9.2",
+ "substrait 0.9.3",
  "table",
  "tempfile",
  "time",
@@ -64,7 +64,7 @@ members = [
 resolver = "2"

 [workspace.package]
-version = "0.9.2"
+version = "0.9.3"
 edition = "2021"
 license = "Apache-2.0"

@@ -120,7 +120,7 @@ etcd-client = { version = "0.13" }
 fst = "0.4.7"
 futures = "0.3"
 futures-util = "0.3"
-greptime-proto = { git = "https://github.com/GreptimeTeam/greptime-proto.git", rev = "c437b55725b7f5224fe9d46db21072b4a682ee4b" }
+greptime-proto = { git = "https://github.com/GreptimeTeam/greptime-proto.git", rev = "157cfdb52709e489cf1f3ce8e3042ed4ee8a524a" }
 humantime = "2.1"
 humantime-serde = "1.1"
 itertools = "0.10"
@@ -74,7 +74,7 @@ Our core developers have been building time-series data platforms for years. Bas

 * **Compatible with InfluxDB, Prometheus and more protocols**

-  Widely adopted database protocols and APIs, including MySQL, PostgreSQL, and Prometheus Remote Storage, etc. [Read more](https://docs.greptime.com/user-guide/clients/overview).
+  Widely adopted database protocols and APIs, including MySQL, PostgreSQL, and Prometheus Remote Storage, etc. [Read more](https://docs.greptime.com/user-guide/protocols/overview).

 ## Try GreptimeDB
@@ -11,9 +11,7 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y \
     python3.10 \
     python3.10-dev \
     python3-pip \
-    curl \
-    mysql-client \
-    postgresql-client
+    curl

 COPY $DOCKER_BUILD_ROOT/docker/python/requirements.txt /etc/greptime/requirements.txt
@@ -21,14 +21,14 @@ use greptime_proto::v1::region::RegionResponse as RegionResponseV1;
 #[derive(Debug)]
 pub struct RegionResponse {
     pub affected_rows: AffectedRows,
-    pub extension: HashMap<String, Vec<u8>>,
+    pub extensions: HashMap<String, Vec<u8>>,
 }

 impl RegionResponse {
     pub fn from_region_response(region_response: RegionResponseV1) -> Self {
         Self {
             affected_rows: region_response.affected_rows as _,
-            extension: region_response.extension,
+            extensions: region_response.extensions,
         }
     }

@@ -36,7 +36,7 @@ impl RegionResponse {
     pub fn new(affected_rows: AffectedRows) -> Self {
         Self {
             affected_rows,
-            extension: Default::default(),
+            extensions: Default::default(),
         }
     }
 }
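The `extension` to `extensions` rename recurs through every hunk below, together with a merge loop in the region server. A minimal self-contained sketch of the new shape and the merge behavior (simplified types; `AffectedRows` and the key name are illustrative stand-ins, not the real definitions):

    use std::collections::HashMap;

    type AffectedRows = usize; // stand-in for the real alias

    #[derive(Debug, Default)]
    struct RegionResponse {
        affected_rows: AffectedRows,
        // Plural: one response can carry several extension entries.
        extensions: HashMap<String, Vec<u8>>,
    }

    // Mirrors the merge loop in the server hunks below: sum affected rows
    // and union the extension maps (later entries win on key collision).
    fn merge(responses: Vec<RegionResponse>) -> RegionResponse {
        let mut merged = RegionResponse::default();
        for r in responses {
            merged.affected_rows += r.affected_rows;
            merged.extensions.extend(r.extensions);
        }
        merged
    }

    fn main() {
        let a = RegionResponse {
            affected_rows: 1,
            extensions: HashMap::from([("k".to_string(), vec![1u8])]),
        };
        let b = RegionResponse {
            affected_rows: 2,
            extensions: HashMap::from([("k".to_string(), vec![2u8])]),
        };
        let m = merge(vec![a, b]);
        assert_eq!(m.affected_rows, 3);
        assert_eq!(m.extensions["k"], vec![2u8]); // later value wins
    }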
@@ -131,7 +131,7 @@ impl AlterLogicalTablesProcedure {
         let phy_raw_schemas = future::join_all(alter_region_tasks)
             .await
             .into_iter()
-            .map(|res| res.map(|mut res| res.extension.remove(ALTER_PHYSICAL_EXTENSION_KEY)))
+            .map(|res| res.map(|mut res| res.extensions.remove(ALTER_PHYSICAL_EXTENSION_KEY)))
             .collect::<Result<Vec<_>>>()?;

         if phy_raw_schemas.is_empty() {

@@ -157,7 +157,7 @@ impl CreateLogicalTablesProcedure {
         let phy_raw_schemas = join_all(create_region_tasks)
             .await
             .into_iter()
-            .map(|res| res.map(|mut res| res.extension.remove(ALTER_PHYSICAL_EXTENSION_KEY)))
+            .map(|res| res.map(|mut res| res.extensions.remove(ALTER_PHYSICAL_EXTENSION_KEY)))
             .collect::<Result<Vec<_>>>()?;

         if phy_raw_schemas.is_empty() {
@@ -324,10 +324,12 @@ impl HeartbeatTask {
                 region_id: stat.region_id.as_u64(),
                 engine: stat.engine,
                 role: RegionRole::from(stat.role).into(),
-                // TODO(jeremy): w/rcus
+                // TODO(weny): w/rcus
                 rcus: 0,
                 wcus: 0,
                 approximate_bytes: region_server.region_disk_usage(stat.region_id).unwrap_or(0),
+                // TODO(weny): add extensions
+                extensions: Default::default(),
             })
             .collect()
     }
@@ -366,10 +366,10 @@ impl RegionServerHandler for RegionServer {

         // merge results by sum up affected rows and merge extensions.
         let mut affected_rows = 0;
-        let mut extension = HashMap::new();
+        let mut extensions = HashMap::new();
         for result in results {
             affected_rows += result.affected_rows;
-            extension.extend(result.extension);
+            extensions.extend(result.extensions);
         }

         Ok(RegionResponseV1 {

@@ -380,7 +380,7 @@ impl RegionServerHandler for RegionServer {
                 }),
             }),
             affected_rows: affected_rows as _,
-            extension,
+            extensions,
         })
     }
 }

@@ -708,7 +708,7 @@ impl RegionServerInner {
                     .await?;
                 Ok(RegionResponse {
                     affected_rows: result.affected_rows,
-                    extension: result.extension,
+                    extensions: result.extensions,
                 })
             }
             Err(err) => {
@@ -15,6 +15,7 @@ workspace = true
 arrow.workspace = true
 arrow-array.workspace = true
 arrow-schema.workspace = true
+base64.workspace = true
 common-base.workspace = true
 common-decimal.workspace = true
 common-error.workspace = true

@@ -23,6 +24,7 @@ common-telemetry.workspace = true
 common-time.workspace = true
 datafusion-common.workspace = true
 enum_dispatch = "0.3"
+greptime-proto.workspace = true
 num = "0.4"
 num-traits = "0.2"
 ordered-float = { version = "3.0", features = ["serde"] }
@@ -18,6 +18,8 @@ use std::sync::Arc;

 use arrow::datatypes::{DataType as ArrowDataType, Field};
 use arrow_array::{Array, ListArray};
+use base64::engine::general_purpose::URL_SAFE;
+use base64::Engine as _;
 use common_base::bytes::{Bytes, StringBytes};
 use common_decimal::Decimal128;
 use common_telemetry::error;

@@ -28,8 +30,10 @@ use common_time::time::Time;
 use common_time::timestamp::{TimeUnit, Timestamp};
 use common_time::{Duration, Interval, Timezone};
 use datafusion_common::ScalarValue;
+use greptime_proto::v1::value::ValueData;
 pub use ordered_float::OrderedFloat;
 use serde::{Deserialize, Serialize, Serializer};
+use serde_json::{Number, Value as JsonValue};
 use snafu::{ensure, ResultExt};

 use crate::error::{self, ConvertArrowArrayToScalarsSnafu, Error, Result, TryFromValueSnafu};

@@ -1364,15 +1368,179 @@ impl<'a> ValueRef<'a> {
     }
 }

+pub fn column_data_to_json(data: ValueData) -> JsonValue {
+    match data {
+        ValueData::BinaryValue(b) => JsonValue::String(URL_SAFE.encode(b)),
+        ValueData::BoolValue(b) => JsonValue::Bool(b),
+        ValueData::U8Value(i) => JsonValue::Number(i.into()),
+        ValueData::U16Value(i) => JsonValue::Number(i.into()),
+        ValueData::U32Value(i) => JsonValue::Number(i.into()),
+        ValueData::U64Value(i) => JsonValue::Number(i.into()),
+        ValueData::I8Value(i) => JsonValue::Number(i.into()),
+        ValueData::I16Value(i) => JsonValue::Number(i.into()),
+        ValueData::I32Value(i) => JsonValue::Number(i.into()),
+        ValueData::I64Value(i) => JsonValue::Number(i.into()),
+        ValueData::F32Value(f) => Number::from_f64(f as f64)
+            .map(JsonValue::Number)
+            .unwrap_or(JsonValue::Null),
+        ValueData::F64Value(f) => Number::from_f64(f)
+            .map(JsonValue::Number)
+            .unwrap_or(JsonValue::Null),
+        ValueData::StringValue(s) => JsonValue::String(s),
+        ValueData::DateValue(d) => JsonValue::String(Date::from(d).to_string()),
+        ValueData::DatetimeValue(d) => JsonValue::String(DateTime::from(d).to_string()),
+        ValueData::TimeSecondValue(d) => JsonValue::String(Time::new_second(d).to_iso8601_string()),
+        ValueData::TimeMillisecondValue(d) => {
+            JsonValue::String(Time::new_millisecond(d).to_iso8601_string())
+        }
+        ValueData::TimeMicrosecondValue(d) => {
+            JsonValue::String(Time::new_microsecond(d).to_iso8601_string())
+        }
+        ValueData::TimeNanosecondValue(d) => {
+            JsonValue::String(Time::new_nanosecond(d).to_iso8601_string())
+        }
+        ValueData::TimestampMicrosecondValue(d) => {
+            JsonValue::String(Timestamp::new_microsecond(d).to_iso8601_string())
+        }
+        ValueData::TimestampMillisecondValue(d) => {
+            JsonValue::String(Timestamp::new_millisecond(d).to_iso8601_string())
+        }
+        ValueData::TimestampNanosecondValue(d) => {
+            JsonValue::String(Timestamp::new_nanosecond(d).to_iso8601_string())
+        }
+        ValueData::TimestampSecondValue(d) => {
+            JsonValue::String(Timestamp::new_second(d).to_iso8601_string())
+        }
+        ValueData::IntervalYearMonthValue(d) => JsonValue::String(format!("interval year [{}]", d)),
+        ValueData::IntervalMonthDayNanoValue(d) => JsonValue::String(format!(
+            "interval month [{}][{}][{}]",
+            d.months, d.days, d.nanoseconds
+        )),
+        ValueData::IntervalDayTimeValue(d) => JsonValue::String(format!("interval day [{}]", d)),
+        ValueData::Decimal128Value(d) => {
+            JsonValue::String(format!("decimal128 [{}][{}]", d.hi, d.lo))
+        }
+    }
+}
+
 #[cfg(test)]
 mod tests {
     use arrow::datatypes::DataType as ArrowDataType;
     use common_time::timezone::set_default_timezone;
+    use greptime_proto::v1::{Decimal128 as ProtoDecimal128, IntervalMonthDayNano};
     use num_traits::Float;

     use super::*;
     use crate::vectors::ListVectorBuilder;

+    #[test]
+    fn test_column_data_to_json() {
+        assert_eq!(
+            column_data_to_json(ValueData::BinaryValue(b"hello".to_vec())),
+            JsonValue::String("aGVsbG8=".to_string())
+        );
+        assert_eq!(
+            column_data_to_json(ValueData::BoolValue(true)),
+            JsonValue::Bool(true)
+        );
+        assert_eq!(
+            column_data_to_json(ValueData::U8Value(1)),
+            JsonValue::Number(1.into())
+        );
+        assert_eq!(
+            column_data_to_json(ValueData::U16Value(2)),
+            JsonValue::Number(2.into())
+        );
+        assert_eq!(
+            column_data_to_json(ValueData::U32Value(3)),
+            JsonValue::Number(3.into())
+        );
+        assert_eq!(
+            column_data_to_json(ValueData::U64Value(4)),
+            JsonValue::Number(4.into())
+        );
+        assert_eq!(
+            column_data_to_json(ValueData::I8Value(5)),
+            JsonValue::Number(5.into())
+        );
+        assert_eq!(
+            column_data_to_json(ValueData::I16Value(6)),
+            JsonValue::Number(6.into())
+        );
+        assert_eq!(
+            column_data_to_json(ValueData::I32Value(7)),
+            JsonValue::Number(7.into())
+        );
+        assert_eq!(
+            column_data_to_json(ValueData::I64Value(8)),
+            JsonValue::Number(8.into())
+        );
+        assert_eq!(
+            column_data_to_json(ValueData::F32Value(9.0)),
+            JsonValue::Number(Number::from_f64(9.0_f64).unwrap())
+        );
+        assert_eq!(
+            column_data_to_json(ValueData::F64Value(10.0)),
+            JsonValue::Number(Number::from_f64(10.0_f64).unwrap())
+        );
+        assert_eq!(
+            column_data_to_json(ValueData::StringValue("hello".to_string())),
+            JsonValue::String("hello".to_string())
+        );
+        assert_eq!(
+            column_data_to_json(ValueData::DateValue(123)),
+            JsonValue::String("1970-05-04".to_string())
+        );
+        assert_eq!(
+            column_data_to_json(ValueData::DatetimeValue(456)),
+            JsonValue::String("1970-01-01 00:00:00.456+0000".to_string())
+        );
+        assert_eq!(
+            column_data_to_json(ValueData::TimeSecondValue(789)),
+            JsonValue::String("00:13:09+0000".to_string())
+        );
+        assert_eq!(
+            column_data_to_json(ValueData::TimeMillisecondValue(789)),
+            JsonValue::String("00:00:00.789+0000".to_string())
+        );
+        assert_eq!(
+            column_data_to_json(ValueData::TimeMicrosecondValue(789)),
+            JsonValue::String("00:00:00.000789+0000".to_string())
+        );
+        assert_eq!(
+            column_data_to_json(ValueData::TimestampMillisecondValue(1234567890)),
+            JsonValue::String("1970-01-15 06:56:07.890+0000".to_string())
+        );
+        assert_eq!(
+            column_data_to_json(ValueData::TimestampNanosecondValue(1234567890123456789)),
+            JsonValue::String("2009-02-13 23:31:30.123456789+0000".to_string())
+        );
+        assert_eq!(
+            column_data_to_json(ValueData::TimestampSecondValue(1234567890)),
+            JsonValue::String("2009-02-13 23:31:30+0000".to_string())
+        );
+        assert_eq!(
+            column_data_to_json(ValueData::IntervalYearMonthValue(12)),
+            JsonValue::String("interval year [12]".to_string())
+        );
+        assert_eq!(
+            column_data_to_json(ValueData::IntervalMonthDayNanoValue(IntervalMonthDayNano {
+                months: 1,
+                days: 2,
+                nanoseconds: 3,
+            })),
+            JsonValue::String("interval month [1][2][3]".to_string())
+        );
+        assert_eq!(
+            column_data_to_json(ValueData::IntervalDayTimeValue(4)),
+            JsonValue::String("interval day [4]".to_string())
+        );
+        assert_eq!(
+            column_data_to_json(ValueData::Decimal128Value(ProtoDecimal128 { hi: 5, lo: 6 })),
+            JsonValue::String("decimal128 [5][6]".to_string())
+        );
+    }
+
     #[test]
     fn test_try_from_scalar_value() {
         assert_eq!(
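What `column_data_to_json` buys its callers: pairing the converted scalars with column names yields a plain JSON object per row. A self-contained sketch of that pattern using serde_json alone (the tiny `ColumnData` enum is a hypothetical stand-in for the real `ValueData`):

    use serde_json::{json, Map, Value as JsonValue};

    // Hypothetical stand-in for greptime_proto's ValueData, just to show the shape.
    enum ColumnData {
        Bool(bool),
        I64(i64),
        Str(String),
    }

    fn column_data_to_json(data: ColumnData) -> JsonValue {
        match data {
            ColumnData::Bool(b) => JsonValue::Bool(b),
            ColumnData::I64(i) => JsonValue::Number(i.into()),
            ColumnData::Str(s) => JsonValue::String(s),
        }
    }

    fn main() {
        let names = ["ok", "count", "host"];
        let row = [
            ColumnData::Bool(true),
            ColumnData::I64(42),
            ColumnData::Str("web-1".to_string()),
        ];
        // Zip names with converted values into a JSON object for the row.
        let obj: Map<String, JsonValue> = names
            .iter()
            .map(|n| n.to_string())
            .zip(row.into_iter().map(column_data_to_json))
            .collect();
        assert_eq!(
            JsonValue::Object(obj),
            json!({"ok": true, "count": 42, "host": "web-1"})
        );
    }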
@@ -93,6 +93,7 @@ mod tests {
             approximate_bytes: 0,
             engine: default_engine().to_string(),
             role: RegionRole::Follower,
+            extensions: Default::default(),
         }
     }
     acc.stat = Some(Stat {
@@ -12,7 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

-use std::collections::HashSet;
+use std::collections::{HashMap, HashSet};

 use api::v1::meta::HeartbeatRequest;
 use common_meta::ClusterId;

@@ -57,6 +57,8 @@ pub struct RegionStat {
     pub engine: String,
     /// The region role.
     pub role: RegionRole,
+    /// The extension info of this region
+    pub extensions: HashMap<String, Vec<u8>>,
 }

 impl Stat {

@@ -142,6 +144,7 @@ impl TryFrom<api::v1::meta::RegionStat> for RegionStat {
             approximate_bytes: value.approximate_bytes,
             engine: value.engine.to_string(),
             role: RegionRole::from(value.role()),
+            extensions: value.extensions,
         })
     }
 }

@@ -135,6 +135,7 @@ mod test {
             wcus: 0,
             approximate_bytes: 0,
             engine: String::new(),
+            extensions: Default::default(),
         }
     }
@@ -100,7 +100,7 @@ pub mod mock {
                 }),
             }),
             affected_rows: 0,
-            extension: Default::default(),
+            extensions: Default::default(),
         })
     }
 }
@@ -199,6 +199,7 @@ mod tests {
                 approximate_bytes: 1,
                 engine: "mito2".to_string(),
                 role: RegionRole::Leader,
+                extensions: Default::default(),
             }],
             ..Default::default()
         }

@@ -215,6 +216,7 @@ mod tests {
                 approximate_bytes: 1,
                 engine: "mito2".to_string(),
                 role: RegionRole::Leader,
+                extensions: Default::default(),
             }],
             ..Default::default()
         }

@@ -231,6 +233,7 @@ mod tests {
                 approximate_bytes: 1,
                 engine: "mito2".to_string(),
                 role: RegionRole::Leader,
+                extensions: Default::default(),
             }],
             ..Default::default()
         }
@@ -162,7 +162,7 @@ impl RegionEngine for MetricEngine {

         result.map_err(BoxedError::new).map(|rows| RegionResponse {
             affected_rows: rows,
-            extension: extension_return_value,
+            extensions: extension_return_value,
         })
     }
@@ -258,13 +258,18 @@ impl LastFieldsBuilder {
     fn maybe_init(&mut self, batch: &Batch) {
         debug_assert!(!batch.is_empty());

-        if self.initialized || batch.fields().is_empty() {
+        if self.initialized {
             // Already initialized or no fields to merge.
             return;
         }

+        self.initialized = true;
+
+        if batch.fields().is_empty() {
+            // No fields to merge.
+            return;
+        }
+
         let last_idx = batch.num_rows() - 1;
         let fields = batch.fields();
         // Safety: The last_idx is valid.

@@ -1165,4 +1170,58 @@ mod tests {
         ];
         assert_eq!(&expect, &actual[..]);
     }
+
+    /// Returns a new [Batch] without fields.
+    fn new_batch_no_fields(
+        primary_key: &[u8],
+        timestamps: &[i64],
+        sequences: &[u64],
+        op_types: &[OpType],
+    ) -> Batch {
+        let mut builder = BatchBuilder::new(primary_key.to_vec());
+        builder
+            .timestamps_array(Arc::new(TimestampMillisecondArray::from_iter_values(
+                timestamps.iter().copied(),
+            )))
+            .unwrap()
+            .sequences_array(Arc::new(UInt64Array::from_iter_values(
+                sequences.iter().copied(),
+            )))
+            .unwrap()
+            .op_types_array(Arc::new(UInt8Array::from_iter_values(
+                op_types.iter().map(|v| *v as u8),
+            )))
+            .unwrap();
+        builder.build().unwrap()
+    }
+
+    #[test]
+    fn test_last_non_null_iter_no_batch() {
+        let input = [
+            new_batch_no_fields(
+                b"k1",
+                &[1, 1, 2],
+                &[13, 12, 13],
+                &[OpType::Put, OpType::Put, OpType::Put],
+            ),
+            new_batch_no_fields(b"k1", &[2, 3], &[12, 13], &[OpType::Put, OpType::Delete]),
+            new_batch_no_fields(
+                b"k2",
+                &[1, 1, 2],
+                &[13, 12, 13],
+                &[OpType::Put, OpType::Put, OpType::Put],
+            ),
+        ];
+        let iter = input.into_iter().map(Ok);
+        let iter = LastNonNullIter::new(iter);
+        let actual: Vec<_> = iter.map(|batch| batch.unwrap()).collect();
+        let expect = [
+            new_batch_no_fields(b"k1", &[1], &[13], &[OpType::Put]),
+            new_batch_no_fields(b"k1", &[2], &[13], &[OpType::Put]),
+            new_batch_no_fields(b"k1", &[3], &[13], &[OpType::Delete]),
+            new_batch_no_fields(b"k2", &[1], &[13], &[OpType::Put]),
+            new_batch_no_fields(b"k2", &[2], &[13], &[OpType::Put]),
+        ];
+        assert_eq!(&expect, &actual[..]);
+    }
 }
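The `maybe_init` change above reorders the initialization flag: with the old combined condition, a batch carrying no field columns returned before `initialized` was ever set, so the init check re-ran on every batch; now the flag is set on the first call regardless, and the field work alone is skipped. A stripped-down model of just the flag ordering (hypothetical `Builder`, not the real mito2 type):

    struct Builder {
        initialized: bool,
        init_count: usize,
    }

    impl Builder {
        // Old shape: bails out before marking initialization done.
        fn maybe_init_old(&mut self, fields: &[i32]) {
            if self.initialized || fields.is_empty() {
                return;
            }
            self.initialized = true;
            self.init_count += 1;
        }

        // New shape: mark first, then skip the field work if there is none.
        fn maybe_init_new(&mut self, fields: &[i32]) {
            if self.initialized {
                return;
            }
            self.initialized = true;
            self.init_count += 1;
            if fields.is_empty() {
                return;
            }
            // ... per-field bookkeeping would go here ...
        }
    }

    fn main() {
        let mut old = Builder { initialized: false, init_count: 0 };
        old.maybe_init_old(&[]);
        old.maybe_init_old(&[]);
        assert_eq!(old.init_count, 0); // field-less batches never initialize

        let mut new = Builder { initialized: false, init_count: 0 };
        new.maybe_init_new(&[]);
        new.maybe_init_new(&[]);
        assert_eq!(new.init_count, 1); // initializes exactly once
    }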
@@ -709,6 +709,10 @@ impl ScanInput {
         rows_in_files + rows_in_memtables
     }

+    pub(crate) fn predicate(&self) -> Option<Predicate> {
+        self.predicate.clone()
+    }
+
     /// Retrieves [`PartitionRange`] from memtable and files
     pub(crate) fn partition_ranges(&self) -> Vec<PartitionRange> {
         let mut id = 0;
@@ -515,6 +515,11 @@ impl RegionScanner for SeqScan {
         self.properties.partitions = ranges;
         Ok(())
     }
+
+    fn has_predicate(&self) -> bool {
+        let predicate = self.stream_ctx.input.predicate();
+        predicate.map(|p| !p.exprs().is_empty()).unwrap_or(false)
+    }
 }

 impl DisplayAs for SeqScan {
@@ -228,6 +228,11 @@ impl RegionScanner for UnorderedScan {

         Ok(stream)
     }
+
+    fn has_predicate(&self) -> bool {
+        let predicate = self.stream_ctx.input.predicate();
+        predicate.map(|p| !p.exprs().is_empty()).unwrap_or(false)
+    }
 }

 impl DisplayAs for UnorderedScan {
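Both scanners add the identical `has_predicate` body. Reduced to its Option plumbing as a standalone sketch (this `Predicate` is a simplified stand-in, not the real mito2 type), `None` and an empty expression list both count as "no pushed-down filter":

    struct Predicate {
        exprs: Vec<String>,
    }

    impl Predicate {
        fn exprs(&self) -> &[String] {
            &self.exprs
        }
    }

    fn has_predicate(predicate: Option<Predicate>) -> bool {
        // Same expression as in the two hunks above.
        predicate.map(|p| !p.exprs().is_empty()).unwrap_or(false)
    }

    fn main() {
        assert!(!has_predicate(None));
        assert!(!has_predicate(Some(Predicate { exprs: vec![] })));
        assert!(has_predicate(Some(Predicate { exprs: vec!["a > 1".into()] })));
    }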
@@ -119,7 +119,7 @@ impl FlowServiceOperator {
             if let Some(prev) = &mut final_result {
                 prev.affected_rows = res.affected_rows;
                 prev.affected_flows.extend(res.affected_flows);
-                prev.extension.extend(res.extension);
+                prev.extensions.extend(res.extensions);
             } else {
                 final_result = Some(res);
             }
@@ -13,27 +13,13 @@
 // limitations under the License.

 use criterion::{black_box, criterion_group, criterion_main, Criterion};
-use pipeline::{parse, Array, Content, GreptimeTransformer, Pipeline, Value as PipelineValue};
+use pipeline::{parse, Content, GreptimeTransformer, Pipeline};
 use serde_json::{Deserializer, Value};

-fn processor_map(
-    pipeline: &Pipeline<GreptimeTransformer>,
-    input_values: Vec<Value>,
-) -> impl IntoIterator<Item = greptime_proto::v1::Rows> {
-    let pipeline_data = input_values
-        .into_iter()
-        .map(|v| PipelineValue::try_from(v).unwrap())
-        .collect::<Vec<_>>();
-
-    pipeline.exec(PipelineValue::Array(Array {
-        values: pipeline_data,
-    }))
-}
-
 fn processor_mut(
     pipeline: &Pipeline<GreptimeTransformer>,
     input_values: Vec<Value>,
-) -> impl IntoIterator<Item = Vec<greptime_proto::v1::Row>> {
+) -> Result<Vec<greptime_proto::v1::Row>, String> {
     let mut payload = pipeline.init_intermediate_state();
     let mut result = Vec::with_capacity(input_values.len());

@@ -249,11 +235,10 @@ fn criterion_benchmark(c: &mut Criterion) {
     let pipeline = prepare_pipeline();
     let mut group = c.benchmark_group("pipeline");
     group.sample_size(50);
-    group.bench_function("processor map", |b| {
-        b.iter(|| processor_map(black_box(&pipeline), black_box(input_value.clone())))
-    });
     group.bench_function("processor mut", |b| {
-        b.iter(|| processor_mut(black_box(&pipeline), black_box(input_value.clone())))
+        b.iter(|| {
+            processor_mut(black_box(&pipeline), black_box(input_value.clone())).unwrap();
+        })
     });
     group.finish();
 }
@@ -19,92 +19,24 @@ pub mod processor;
 pub mod transform;
 pub mod value;

-use ahash::{HashMap, HashSet};
-use common_telemetry::{debug, warn};
+use ahash::HashSet;
+use common_telemetry::debug;
 use itertools::{merge, Itertools};
-use processor::Processor;
-use transform::{Transformer, Transforms};
-use value::{Map, Value};
+use processor::{Processor, ProcessorBuilder, Processors};
+use transform::{TransformBuilders, Transformer, Transforms};
+use value::Value;
 use yaml_rust::YamlLoader;

 const DESCRIPTION: &str = "description";
 const PROCESSORS: &str = "processors";
 const TRANSFORM: &str = "transform";
+const TRANSFORMS: &str = "transforms";

 pub enum Content {
     Json(String),
     Yaml(String),
 }

-/// set the index for the processor keys
-/// the index is the position of the key in the final intermediate keys
-fn set_processor_keys_index(
-    processors: &mut processor::Processors,
-    final_intermediate_keys: &Vec<String>,
-) -> Result<(), String> {
-    let final_intermediate_key_index = final_intermediate_keys
-        .iter()
-        .enumerate()
-        .map(|(i, k)| (k.as_str(), i))
-        .collect::<HashMap<_, _>>();
-    for processor in processors.iter_mut() {
-        for field in processor.fields_mut().iter_mut() {
-            let index = final_intermediate_key_index.get(field.input_field.name.as_str()).ok_or(format!(
-                "input field {} is not found in intermediate keys: {final_intermediate_keys:?} when set processor keys index",
-                field.input_field.name
-            ))?;
-            field.set_input_index(*index);
-            for (k, v) in field.output_fields_index_mapping.iter_mut() {
-                let index = final_intermediate_key_index.get(k.as_str());
-                match index {
-                    Some(index) => {
-                        *v = *index;
-                    }
-                    None => {
-                        warn!(
-                            "output field {k} is not found in intermediate keys: {final_intermediate_keys:?} when set processor keys index"
-                        );
-                    }
-                }
-            }
-        }
-    }
-    Ok(())
-}
-
-fn set_transform_keys_index(
-    transforms: &mut Transforms,
-    final_intermediate_keys: &[String],
-    output_keys: &[String],
-) -> Result<(), String> {
-    let final_intermediate_key_index = final_intermediate_keys
-        .iter()
-        .enumerate()
-        .map(|(i, k)| (k.as_str(), i))
-        .collect::<HashMap<_, _>>();
-    let output_key_index = output_keys
-        .iter()
-        .enumerate()
-        .map(|(i, k)| (k.as_str(), i))
-        .collect::<HashMap<_, _>>();
-    for transform in transforms.iter_mut() {
-        for field in transform.fields.iter_mut() {
-            let index = final_intermediate_key_index.get(field.input_field.name.as_str()).ok_or(format!(
-                "input field {} is not found in intermediate keys: {final_intermediate_keys:?} when set transform keys index",
-                field.input_field.name
-            ))?;
-            field.set_input_index(*index);
-            for (k, v) in field.output_fields_index_mapping.iter_mut() {
-                let index = output_key_index.get(k.as_str()).ok_or(format!(
-                    "output field {k} is not found in output keys: {final_intermediate_keys:?} when set transform keys index"
-                ))?;
-                *v = *index;
-            }
-        }
-    }
-    Ok(())
-}
-
 pub fn parse<T>(input: &Content) -> Result<Pipeline<T>, String>
 where
     T: Transformer,

@@ -117,24 +49,22 @@ where

     let description = doc[DESCRIPTION].as_str().map(|s| s.to_string());

-    let mut processors = if let Some(v) = doc[PROCESSORS].as_vec() {
+    let processor_builder_list = if let Some(v) = doc[PROCESSORS].as_vec() {
         v.try_into()?
     } else {
-        processor::Processors::default()
+        processor::ProcessorBuilderList::default()
     };

-    let transforms = if let Some(v) = doc[TRANSFORM].as_vec() {
-        v.try_into()?
-    } else {
-        Transforms::default()
-    };
+    let transform_builders =
+        if let Some(v) = doc[TRANSFORMS].as_vec().or(doc[TRANSFORM].as_vec()) {
+            v.try_into()?
+        } else {
+            TransformBuilders::default()
+        };

-    let mut transformer = T::new(transforms)?;
-    let transforms = transformer.transforms_mut();
-
-    let processors_output_keys = processors.output_keys();
-    let processors_required_keys = processors.required_keys();
-    let processors_required_original_keys = processors.required_original_keys();
+    let processors_required_keys = &processor_builder_list.input_keys;
+    let processors_output_keys = &processor_builder_list.output_keys;
+    let processors_required_original_keys = &processor_builder_list.original_input_keys;

     debug!(
         "processors_required_original_keys: {:?}",

@@ -143,7 +73,7 @@ where
     debug!("processors_required_keys: {:?}", processors_required_keys);
     debug!("processors_output_keys: {:?}", processors_output_keys);

-    let transforms_required_keys = transforms.required_keys();
+    let transforms_required_keys = &transform_builders.required_keys;
     let mut tr_keys = Vec::with_capacity(50);
     for key in transforms_required_keys.iter() {
         if !processors_output_keys.contains(key)

@@ -183,9 +113,33 @@ where

     final_intermediate_keys.extend(intermediate_keys_exclude_original);

-    let output_keys = transforms.output_keys().clone();
-    set_processor_keys_index(&mut processors, &final_intermediate_keys)?;
-    set_transform_keys_index(transforms, &final_intermediate_keys, &output_keys)?;
+    let output_keys = transform_builders.output_keys.clone();
+
+    let processors_kind_list = processor_builder_list
+        .processor_builders
+        .into_iter()
+        .map(|builder| builder.build(&final_intermediate_keys))
+        .collect::<Result<Vec<_>, _>>()?;
+    let processors = Processors {
+        processors: processors_kind_list,
+        required_keys: processors_required_keys.clone(),
+        output_keys: processors_output_keys.clone(),
+        required_original_keys: processors_required_original_keys.clone(),
+    };
+
+    let transfor_list = transform_builders
+        .builders
+        .into_iter()
+        .map(|builder| builder.build(&final_intermediate_keys, &output_keys))
+        .collect::<Result<Vec<_>, String>>()?;
+
+    let transformers = Transforms {
+        transforms: transfor_list,
+        required_keys: transforms_required_keys.clone(),
+        output_keys: output_keys.clone(),
+    };
+
+    let transformer = T::new(transformers)?;

     Ok(Pipeline {
         description,

@@ -238,38 +192,6 @@ impl<T> Pipeline<T>
 where
     T: Transformer,
 {
-    fn exec_map(&self, map: &mut Map) -> Result<(), String> {
-        let v = map;
-        for processor in self.processors.iter() {
-            processor.exec_map(v)?;
-        }
-        Ok(())
-    }
-
-    pub fn exec(&self, mut val: Value) -> Result<T::Output, String> {
-        let result = match val {
-            Value::Map(ref mut map) => {
-                self.exec_map(map)?;
-                val
-            }
-            Value::Array(arr) => arr
-                .values
-                .into_iter()
-                .map(|mut v| match v {
-                    Value::Map(ref mut map) => {
-                        self.exec_map(map)?;
-                        Ok(v)
-                    }
-                    _ => Err(format!("expected a map, but got {}", v)),
-                })
-                .collect::<Result<Vec<Value>, String>>()
-                .map(|values| Value::Array(value::Array { values }))?,
-            _ => return Err(format!("expected a map or array, but got {}", val)),
-        };
-
-        self.transformer.transform(result)
-    }
-
     pub fn exec_mut(&self, val: &mut Vec<Value>) -> Result<T::VecOutput, String> {
         for processor in self.processors.iter() {
             processor.exec_mut(val)?;

@@ -347,9 +269,24 @@ where
     }
 }

+pub(crate) fn find_key_index(
+    intermediate_keys: &[String],
+    key: &str,
+    kind: &str,
+) -> Result<usize, String> {
+    intermediate_keys
+        .iter()
+        .position(|k| k == key)
+        .ok_or(format!(
+            "{} processor.{} not found in intermediate keys",
+            kind, key
+        ))
+}
+
 #[cfg(test)]
 mod tests {

     use api::v1::Rows;
     use greptime_proto::v1::value::ValueData;
     use greptime_proto::v1::{self, ColumnDataType, SemanticType};
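`find_key_index` is the error-reporting core of the new builder wiring, and it is small enough to exercise standalone. A copy of the same body (with `pub(crate)` dropped) plus a quick check of both paths:

    fn find_key_index(intermediate_keys: &[String], key: &str, kind: &str) -> Result<usize, String> {
        intermediate_keys
            .iter()
            .position(|k| k == key)
            .ok_or(format!(
                "{} processor.{} not found in intermediate keys",
                kind, key
            ))
    }

    fn main() {
        let keys = vec!["greptime_timestamp".to_string(), "my_field".to_string()];
        // Hit: returns the position of the key.
        assert_eq!(find_key_index(&keys, "my_field", "csv"), Ok(1));
        // Miss: returns the formatted error naming the processor kind.
        assert_eq!(
            find_key_index(&keys, "missing", "csv"),
            Err("csv processor.missing not found in intermediate keys".to_string())
        );
    }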
@@ -359,96 +296,43 @@ mod tests {

     #[test]
     fn test_pipeline_prepare() {
-        {
-            let input_value_str = r#"
-            {
-                "my_field": "1,2",
-                "foo": "bar"
-            }
-            "#;
-            let input_value: serde_json::Value = serde_json::from_str(input_value_str).unwrap();
-
-            let pipeline_yaml = r#"
----
-description: Pipeline for Apache Tomcat
+        let input_value_str = r#"
+        {
+            "my_field": "1,2",
+            "foo": "bar"
+        }
+        "#;
+        let input_value: serde_json::Value = serde_json::from_str(input_value_str).unwrap();

+        let pipeline_yaml = r#"description: 'Pipeline for Apache Tomcat'
 processors:
   - csv:
-      field: my_field, my_field,field1, field2
-
+      field: my_field
+      target_fields: field1, field2
 transform:
   - field: field1
     type: uint32
   - field: field2
     type: uint32
 "#;
-            let pipeline: Pipeline<GreptimeTransformer> =
-                parse(&Content::Yaml(pipeline_yaml.into())).unwrap();
-            let mut payload = pipeline.init_intermediate_state();
-            pipeline.prepare(input_value, &mut payload).unwrap();
-            assert_eq!(
-                &["greptime_timestamp", "my_field"].to_vec(),
-                pipeline.required_keys()
-            );
-            assert_eq!(
-                payload,
-                vec![
-                    Value::Null,
-                    Value::String("1,2".to_string()),
-                    Value::Null,
-                    Value::Null
-                ]
-            );
-            let result = pipeline.exec_mut(&mut payload).unwrap();
+        let pipeline: Pipeline<GreptimeTransformer> =
+            parse(&Content::Yaml(pipeline_yaml.into())).unwrap();
+        let mut payload = pipeline.init_intermediate_state();
+        pipeline.prepare(input_value, &mut payload).unwrap();
+        assert_eq!(&["my_field"].to_vec(), pipeline.required_keys());
+        assert_eq!(
+            payload,
+            vec![Value::String("1,2".to_string()), Value::Null, Value::Null]
+        );
+        let result = pipeline.exec_mut(&mut payload).unwrap();

-            assert_eq!(result.values[0].value_data, Some(ValueData::U32Value(1)));
-            assert_eq!(result.values[1].value_data, Some(ValueData::U32Value(2)));
-            match &result.values[2].value_data {
-                Some(ValueData::TimestampNanosecondValue(v)) => {
-                    assert_ne!(*v, 0);
-                }
-                _ => panic!("expect null value"),
-            }
-        }
-        {
-            let input_value_str = r#"
-            {
-                "reqTimeSec": "1573840000.000"
-            }
-            "#;
-
-            let pipeline_yaml = r#"
----
-description: Pipeline for Demo Log
-
-processors:
-  - gsub:
-      field: reqTimeSec
-      pattern: "\\."
-      replacement: ""
-  - epoch:
-      field: reqTimeSec
-      resolution: millisecond
-      ignore_missing: true
-
-transform:
-  - field: reqTimeSec
-    type: epoch, millisecond
-    index: timestamp
-"#;
-            let input_value: serde_json::Value = serde_json::from_str(input_value_str).unwrap();
-            let pipeline: Pipeline<GreptimeTransformer> =
-                parse(&Content::Yaml(pipeline_yaml.into())).unwrap();
-            let mut payload = pipeline.init_intermediate_state();
-            pipeline.prepare(input_value, &mut payload).unwrap();
-            assert_eq!(&["reqTimeSec"].to_vec(), pipeline.required_keys());
-            assert_eq!(payload, vec![Value::String("1573840000.000".to_string())]);
-            let result = pipeline.exec_mut(&mut payload).unwrap();
-
-            assert_eq!(
-                result.values[0].value_data,
-                Some(ValueData::TimestampMillisecondValue(1573840000000))
-            );
+        assert_eq!(result.values[0].value_data, Some(ValueData::U32Value(1)));
+        assert_eq!(result.values[1].value_data, Some(ValueData::U32Value(2)));
+        match &result.values[2].value_data {
+            Some(ValueData::TimestampNanosecondValue(v)) => {
+                assert_ne!(*v, 0);
+            }
+            _ => panic!("expect null value"),
         }
     }

@@ -541,21 +425,19 @@ transform:
     #[test]
     fn test_csv_pipeline() {
         let input_value_str = r#"
-            {
-                "my_field": "1,2",
-                "foo": "bar"
-            }
-            "#;
+        {
+            "my_field": "1,2",
+            "foo": "bar"
+        }
+        "#;
         let input_value: serde_json::Value = serde_json::from_str(input_value_str).unwrap();

         let pipeline_yaml = r#"
 ---
 description: Pipeline for Apache Tomcat

 processors:
   - csv:
-      field: my_field,my_field, field1, field2
-
+      field: my_field
+      target_fields: field1, field2
 transform:
   - field: field1
     type: uint32

@@ -565,8 +447,22 @@ transform:

         let pipeline: Pipeline<GreptimeTransformer> =
             parse(&Content::Yaml(pipeline_yaml.into())).unwrap();
-        let output = pipeline.exec(input_value.try_into().unwrap());
-        assert!(output.is_ok());
+        let mut payload = pipeline.init_intermediate_state();
+        pipeline.prepare(input_value, &mut payload).unwrap();
+        assert_eq!(&["my_field"].to_vec(), pipeline.required_keys());
+        assert_eq!(
+            payload,
+            vec![Value::String("1,2".to_string()), Value::Null, Value::Null]
+        );
+        let result = pipeline.exec_mut(&mut payload).unwrap();
+        assert_eq!(result.values[0].value_data, Some(ValueData::U32Value(1)));
+        assert_eq!(result.values[1].value_data, Some(ValueData::U32Value(2)));
+        match &result.values[2].value_data {
+            Some(ValueData::TimestampNanosecondValue(v)) => {
+                assert_ne!(*v, 0);
+            }
+            _ => panic!("expect null value"),
+        }
     }

     #[test]

@@ -596,7 +492,14 @@ transform:

         let pipeline: Pipeline<GreptimeTransformer> =
             parse(&Content::Yaml(pipeline_yaml.into())).unwrap();
-        let output = pipeline.exec(input_value.try_into().unwrap()).unwrap();
+        let schema = pipeline.schemas().clone();
+        let mut result = pipeline.init_intermediate_state();
+        pipeline.prepare(input_value, &mut result).unwrap();
+        let row = pipeline.exec_mut(&mut result).unwrap();
+        let output = Rows {
+            schema,
+            rows: vec![row],
+        };
         let schemas = output.schema;

         assert_eq!(schemas.len(), 1);
@@ -12,69 +12,12 @@
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
use std::collections::BTreeMap;
|
||||
use std::ops::Deref;
|
||||
use std::str::FromStr;
|
||||
|
||||
use ahash::{HashSet, HashSetExt};
|
||||
use itertools::Itertools;
|
||||
|
||||
#[derive(Debug, Default, Clone)]
|
||||
pub struct Fields(Vec<Field>);
|
||||
|
||||
impl Fields {
|
||||
pub(crate) fn new(fields: Vec<Field>) -> Result<Self, String> {
|
||||
let ff = Fields(fields);
|
||||
ff.check()
|
||||
}
|
||||
|
||||
pub(crate) fn one(field: Field) -> Self {
|
||||
Fields(vec![field])
|
||||
}
|
||||
|
||||
pub(crate) fn get_target_fields(&self) -> Vec<&str> {
|
||||
self.0.iter().map(|f| f.get_target_field()).collect()
|
||||
}
|
||||
|
||||
fn check(self) -> Result<Self, String> {
|
||||
if self.0.is_empty() {
|
            return Err("fields must not be empty".to_string());
        }

        let mut set = HashSet::new();
        for f in self.0.iter() {
            if set.contains(&f.input_field.name) {
                return Err(format!(
                    "field name must be unique, but got duplicated: {}",
                    f.input_field.name
                ));
            }
            set.insert(&f.input_field.name);
        }

        Ok(self)
    }
}

impl std::fmt::Display for Fields {
    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
        let s = self.0.iter().map(|f| f.to_string()).join(";");
        write!(f, "{s}")
    }
}

impl std::ops::Deref for Fields {
    type Target = Vec<Field>;

    fn deref(&self) -> &Self::Target {
        &self.0
    }
}

impl std::ops::DerefMut for Fields {
    fn deref_mut(&mut self) -> &mut Self::Target {
        &mut self.0
    }
}
use crate::etl::find_key_index;

/// Information about the input field including the name and index in intermediate keys.
#[derive(Debug, Default, Clone)]
pub struct InputFieldInfo {
    pub(crate) name: String,
@@ -82,132 +25,202 @@ pub struct InputFieldInfo {
}

impl InputFieldInfo {
    /// Create a new input field info with the given field name and index.
    pub(crate) fn new(field: impl Into<String>, index: usize) -> Self {
        InputFieldInfo {
            name: field.into(),
            index,
        }
    }
}

    pub(crate) fn name(field: impl Into<String>) -> Self {
        InputFieldInfo {
            name: field.into(),
            index: 0,
/// Information about a field that has one input and one output.
#[derive(Debug, Default, Clone)]
pub struct OneInputOneOutputField {
    input: InputFieldInfo,
    output: Option<(String, usize)>,
}

impl OneInputOneOutputField {
    /// Create a new field with the given input and output.
    pub(crate) fn new(input: InputFieldInfo, output: (String, usize)) -> Self {
        OneInputOneOutputField {
            input,
            output: Some(output),
        }
    }

    /// Build a new field with the given processor kind, intermediate keys, input field, and target field.
    pub(crate) fn build(
        processor_kind: &str,
        intermediate_keys: &[String],
        input_field: &str,
        target_field: &str,
    ) -> Result<Self, String> {
        let input_index = find_key_index(intermediate_keys, input_field, processor_kind)?;

        let input_field_info = InputFieldInfo::new(input_field, input_index);
        let output_index = find_key_index(intermediate_keys, target_field, processor_kind)?;
        Ok(OneInputOneOutputField::new(
            input_field_info,
            (target_field.to_string(), output_index),
        ))
    }

    /// Get the input field information.
    pub(crate) fn input(&self) -> &InputFieldInfo {
        &self.input
    }

    /// Get the index of the input field.
    pub(crate) fn input_index(&self) -> usize {
        self.input.index
    }

    /// Get the name of the input field.
    pub(crate) fn input_name(&self) -> &str {
        &self.input.name
    }

    /// Get the index of the output field.
    pub(crate) fn output_index(&self) -> usize {
        *self.output().1
    }

    /// Get the name of the output field.
    pub(crate) fn output_name(&self) -> &str {
        self.output().0
    }

    /// Get the output field information.
    pub(crate) fn output(&self) -> (&String, &usize) {
        if let Some((name, index)) = &self.output {
            (name, index)
        } else {
            (&self.input.name, &self.input.index)
        }
    }
}
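
OneInputOneOutputField::build resolves both field names up front via find_key_index, so per-row execution can use plain vector indexing instead of map lookups. find_key_index itself is outside this hunk; judging from its call sites it behaves like the stand-in below (a sketch inferred from usage, not the actual implementation):

fn find_key_index(intermediate_keys: &[String], key: &str, kind: &str) -> Result<usize, String> {
    // Locate `key` in the pipeline's intermediate keys, attributing a
    // failure to the processor kind for a readable error message.
    intermediate_keys
        .iter()
        .position(|k| k == key)
        .ok_or_else(|| format!("{kind} processor: key '{key}' not found in intermediate keys"))
}

fn main() {
    let keys = vec!["ts".to_string(), "message".to_string()];
    assert_eq!(find_key_index(&keys, "message", "date"), Ok(1));
    assert!(find_key_index(&keys, "missing", "date").is_err());
}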

/// Used to represent the input and output fields of a processor or transform.
/// Information about a field that has one input and multiple outputs.
#[derive(Debug, Default, Clone)]
pub struct OneInputMultiOutputField {
    input: InputFieldInfo,
    /// Typically, processors that output multiple keys need to be distinguished by splicing the keys together.
    prefix: Option<String>,
}

impl OneInputMultiOutputField {
    /// Create a new field with the given input and prefix.
    pub(crate) fn new(input: InputFieldInfo, prefix: Option<String>) -> Self {
        OneInputMultiOutputField { input, prefix }
    }

    /// Get the input field information.
    pub(crate) fn input(&self) -> &InputFieldInfo {
        &self.input
    }

    /// Get the index of the input field.
    pub(crate) fn input_index(&self) -> usize {
        self.input.index
    }

    /// Get the name of the input field.
    pub(crate) fn input_name(&self) -> &str {
        &self.input.name
    }

    /// Get the prefix for the output fields.
    pub(crate) fn target_prefix(&self) -> &str {
        self.prefix.as_deref().unwrap_or(&self.input.name)
    }
}

/// Raw processor-defined inputs and outputs
#[derive(Debug, Default, Clone)]
pub struct Field {
    /// The input field name and index.
    pub input_field: InputFieldInfo,

    /// The output field name and index mapping.
    pub output_fields_index_mapping: BTreeMap<String, usize>,

    // rename
    pub target_field: Option<String>,

    // 1-to-many mapping
    // processors:
    // - csv
    pub target_fields: Option<Vec<String>>,
    pub(crate) input_field: String,
    pub(crate) target_field: Option<String>,
}

impl Field {
    pub(crate) fn new(field: impl Into<String>) -> Self {
        Field {
            input_field: InputFieldInfo::name(field.into()),
            output_fields_index_mapping: BTreeMap::new(),
            target_field: None,
            target_fields: None,
        }
    }

    /// The target column_name in the processor or transform;
    /// if target_field is None, the input field name is returned.
    pub(crate) fn get_target_field(&self) -> &str {
        self.target_field
            .as_deref()
            .unwrap_or(&self.input_field.name)
    }

    /// The input column_name in the processor or transform.
    pub(crate) fn get_field_name(&self) -> &str {
        &self.input_field.name
    }

    /// Set the input column index in the processor or transform.
    pub(crate) fn set_input_index(&mut self, index: usize) {
        self.input_field.index = index;
    }

    pub(crate) fn set_output_index(&mut self, key: &str, index: usize) {
        if let Some(v) = self.output_fields_index_mapping.get_mut(key) {
            *v = index;
        }
    }

    pub(crate) fn insert_output_index(&mut self, key: String, index: usize) {
        self.output_fields_index_mapping.insert(key, index);
    }
}

impl std::str::FromStr for Field {
impl FromStr for Field {
    type Err = String;

    fn from_str(s: &str) -> Result<Self, Self::Err> {
        let mut parts = s.split(',');
        let field = parts.next().ok_or("field is missing")?.trim().to_string();
        let input_field = parts
            .next()
            .ok_or("input field is missing")?
            .trim()
            .to_string();
        let target_field = parts.next().map(|x| x.trim().to_string());

        if field.is_empty() {
            return Err("field is empty".to_string());
        if input_field.is_empty() {
            return Err("input field is empty".to_string());
        }

        let renamed_field = match parts.next() {
            Some(s) if !s.trim().is_empty() => Some(s.trim().to_string()),
            _ => None,
        };

        // TODO(qtang): ???? what's this?
        // weird design? field: <field>,<target_field>,<target_fields>,<target_fields>....
        // and only used in the csv processor
        let fields: Vec<_> = parts
            .map(|s| s.trim())
            .filter(|s| !s.is_empty())
            .map(|s| s.to_string())
            .collect();
        let target_fields = if fields.is_empty() {
            None
        } else {
            Some(fields)
        };

        Ok(Field {
            input_field: InputFieldInfo::name(field),
            output_fields_index_mapping: BTreeMap::new(),
            target_field: renamed_field,
            target_fields,
            input_field,
            target_field,
        })
    }
}
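
The spec string accepted here is plain comma-separated text: the first segment names the input field and an optional second segment renames the output, matching the unit tests further down. A standalone parser for the same grammar (parse_field_spec is a hypothetical name; the deprecated trailing target_fields segments are ignored here):

fn parse_field_spec(s: &str) -> Result<(String, Option<String>), String> {
    let mut parts = s.split(',');
    // `split` always yields at least one segment, so this never fails.
    let input = parts.next().ok_or("input field is missing")?.trim().to_string();
    if input.is_empty() {
        return Err("input field is empty".to_string());
    }
    // An empty or absent second segment means "no rename".
    let target = parts
        .next()
        .map(|t| t.trim().to_string())
        .filter(|t| !t.is_empty());
    Ok((input, target))
}

fn main() -> Result<(), String> {
    assert_eq!(
        parse_field_spec("field, target_field")?,
        ("field".into(), Some("target_field".into()))
    );
    assert_eq!(parse_field_spec("field")?, ("field".into(), None));
    Ok(())
}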

impl std::fmt::Display for Field {
    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
        match (&self.target_field, &self.target_fields) {
            (Some(target_field), None) => write!(f, "{}, {target_field}", self.input_field.name),
            (None, Some(target_fields)) => {
                write!(
                    f,
                    "{}, {}",
                    self.input_field.name,
                    target_fields.iter().join(",")
                )
            }
            _ => write!(f, "{}", self.input_field.name),
impl Field {
    /// Create a new field with the given input and target fields.
    pub(crate) fn new(input_field: impl Into<String>, target_field: Option<String>) -> Self {
        Field {
            input_field: input_field.into(),
            target_field,
        }
    }

    /// Get the input field.
    pub(crate) fn input_field(&self) -> &str {
        &self.input_field
    }

    /// Get the target field.
    pub(crate) fn target_field(&self) -> Option<&str> {
        self.target_field.as_deref()
    }

    /// Get the target field, or the input field if the target field is not set.
    pub(crate) fn target_or_input_field(&self) -> &str {
        self.target_field.as_deref().unwrap_or(&self.input_field)
    }
}

/// A collection of fields.
#[derive(Debug, Default, Clone)]
pub struct Fields(Vec<Field>);

impl Fields {
    pub(crate) fn new(fields: Vec<Field>) -> Self {
        Fields(fields)
    }

    pub(crate) fn one(field: Field) -> Self {
        Fields(vec![field])
    }
}

impl Deref for Fields {
    type Target = Vec<Field>;

    fn deref(&self) -> &Self::Target {
        &self.0
    }
}

impl IntoIterator for Fields {
    type Item = Field;
    type IntoIter = std::vec::IntoIter<Field>;

    fn into_iter(self) -> Self::IntoIter {
        self.0.into_iter()
    }
}

#[cfg(test)]
@@ -227,35 +240,14 @@ mod tests {

        let cases = [
            // ("field", "field", None, None),
            (
                "field, target_field",
                "field",
                Some("target_field".into()),
                None,
            ),
            (
                "field, target_field1, target_field2, target_field3",
                "field",
                Some("target_field1".into()),
                Some(vec!["target_field2".into(), "target_field3".into()]),
            ),
            (
                "field,, target_field1, target_field2, target_field3",
                "field",
                None,
                Some(vec![
                    "target_field1".into(),
                    "target_field2".into(),
                    "target_field3".into(),
                ]),
            ),
            ("field, target_field", "field", Some("target_field")),
            ("field", "field", None),
        ];

        for (s, field, target_field, target_fields) in cases.into_iter() {
        for (s, field, target_field) in cases.into_iter() {
            let f: Field = s.parse().unwrap();
            assert_eq!(f.get_field_name(), field, "{s}");
            assert_eq!(f.target_field, target_field, "{s}");
            assert_eq!(f.target_fields, target_fields, "{s}");
            assert_eq!(f.input_field(), field, "{s}");
            assert_eq!(f.target_field(), target_field, "{s}");
        }
    }
}

@@ -25,22 +25,22 @@ pub mod timestamp;
pub mod urlencoding;

use ahash::{HashSet, HashSetExt};
use cmcd::CmcdProcessor;
use csv::CsvProcessor;
use date::DateProcessor;
use dissect::DissectProcessor;
use cmcd::{CmcdProcessor, CmcdProcessorBuilder};
use csv::{CsvProcessor, CsvProcessorBuilder};
use date::{DateProcessor, DateProcessorBuilder};
use dissect::{DissectProcessor, DissectProcessorBuilder};
use enum_dispatch::enum_dispatch;
use epoch::EpochProcessor;
use gsub::GsubProcessor;
use epoch::{EpochProcessor, EpochProcessorBuilder};
use gsub::{GsubProcessor, GsubProcessorBuilder};
use itertools::Itertools;
use join::JoinProcessor;
use letter::LetterProcessor;
use regex::RegexProcessor;
use timestamp::TimestampProcessor;
use urlencoding::UrlEncodingProcessor;
use join::{JoinProcessor, JoinProcessorBuilder};
use letter::{LetterProcessor, LetterProcessorBuilder};
use regex::{RegexProcessor, RegexProcessorBuilder};
use timestamp::{TimestampProcessor, TimestampProcessorBuilder};
use urlencoding::{UrlEncodingProcessor, UrlEncodingProcessorBuilder};

use crate::etl::field::{Field, Fields};
use crate::etl::value::{Map, Value};
use super::field::{Field, Fields};
use crate::etl::value::Value;

const FIELD_NAME: &str = "field";
const FIELDS_NAME: &str = "fields";
@@ -49,6 +49,7 @@ const METHOD_NAME: &str = "method";
const PATTERN_NAME: &str = "pattern";
const PATTERNS_NAME: &str = "patterns";
const SEPARATOR_NAME: &str = "separator";
const TARGET_FIELDS_NAME: &str = "target_fields";

// const IF_NAME: &str = "if";
// const IGNORE_FAILURE_NAME: &str = "ignore_failure";
@@ -62,55 +63,14 @@ const SEPARATOR_NAME: &str = "separator";
/// The output of a processor is a map of key-value pairs that will be merged into the document when you use the exec_map method.
#[enum_dispatch(ProcessorKind)]
pub trait Processor: std::fmt::Debug + Send + Sync + 'static {
    /// Get the processor's fields.
    /// Fields are the same processor applied to multiple keys; it is not the case that one processor has multiple inputs.
    fn fields(&self) -> &Fields;

    /// Get the processor's fields mutably.
    fn fields_mut(&mut self) -> &mut Fields;

    /// Get the processor's kind.
    fn kind(&self) -> &str;

    /// Whether to ignore missing fields.
    fn ignore_missing(&self) -> bool;

    /// All of the processor's output keys;
    /// if a processor has multiple output keys, it should return all of them.
    fn output_keys(&self) -> HashSet<String>;

    /// Execute the processor on a document
    /// and return a map of key-value pairs.
    fn exec_field(&self, val: &Value, field: &Field) -> Result<Map, String>;

    /// Execute the processor on a vector that has been preprocessed by the pipeline.
    fn exec_mut(&self, val: &mut Vec<Value>) -> Result<(), String>;

    /// Execute the processor on a map
    /// and merge the output into the original map.
    fn exec_map(&self, map: &mut Map) -> Result<(), String> {
        for ff @ Field {
            input_field: field_info,
            ..
        } in self.fields().iter()
        {
            match map.get(&field_info.name) {
                Some(v) => {
                    map.extend(self.exec_field(v, ff)?);
                }
                None if self.ignore_missing() => {}
                None => {
                    return Err(format!(
                        "{} processor: field '{}' is required but missing in {map}",
                        self.kind(),
                        field_info.name,
                    ))
                }
            }
        }

        Ok(())
    }
}

#[derive(Debug)]
@@ -129,6 +89,42 @@ pub enum ProcessorKind {
    Date(DateProcessor),
}

/// The ProcessorBuilder trait defines the interface for all processor builders.
/// A processor builder is used to create a processor.
#[enum_dispatch(ProcessorBuilders)]
pub trait ProcessorBuilder: std::fmt::Debug + Send + Sync + 'static {
    /// Get the processor's output keys.
    fn output_keys(&self) -> HashSet<&str>;
    /// Get the processor's input keys.
    fn input_keys(&self) -> HashSet<&str>;
    /// Build the processor.
    fn build(self, intermediate_keys: &[String]) -> Result<ProcessorKind, String>;
}

#[derive(Debug)]
#[enum_dispatch]
pub enum ProcessorBuilders {
    Cmcd(CmcdProcessorBuilder),
    Csv(CsvProcessorBuilder),
    Dissect(DissectProcessorBuilder),
    Gsub(GsubProcessorBuilder),
    Join(JoinProcessorBuilder),
    Letter(LetterProcessorBuilder),
    Regex(RegexProcessorBuilder),
    Timestamp(TimestampProcessorBuilder),
    UrlEncoding(UrlEncodingProcessorBuilder),
    Epoch(EpochProcessorBuilder),
    Date(DateProcessorBuilder),
}
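
The ProcessorBuilders/ProcessorKind split is a two-phase pattern: YAML is parsed into builders first, the pipeline collects every input and output key to lay out its intermediate value vector, and only then is each builder resolved into an index-based processor. A minimal, self-contained sketch of that pattern with toy types (UpperProcessor and friends are invented for illustration, not part of the GreptimeDB API):

use std::collections::HashSet;

#[derive(Debug)]
struct UpperProcessorBuilder {
    input_field: String,
    target_field: String,
}

#[derive(Debug)]
struct UpperProcessor {
    input_index: usize,
    output_index: usize,
}

impl UpperProcessorBuilder {
    // Phase 1: expose the keys so the pipeline can lay out the
    // intermediate value vector before anything is built.
    fn input_keys(&self) -> HashSet<&str> {
        std::iter::once(self.input_field.as_str()).collect()
    }

    fn output_keys(&self) -> HashSet<&str> {
        std::iter::once(self.target_field.as_str()).collect()
    }

    // Phase 2: resolve every name to a position in `intermediate_keys`.
    fn build(self, intermediate_keys: &[String]) -> Result<UpperProcessor, String> {
        let find = |key: &str| {
            intermediate_keys
                .iter()
                .position(|k| k == key)
                .ok_or_else(|| format!("upper: key '{key}' not found in intermediate keys"))
        };
        Ok(UpperProcessor {
            input_index: find(self.input_field.as_str())?,
            output_index: find(self.target_field.as_str())?,
        })
    }
}

fn main() -> Result<(), String> {
    let builder = UpperProcessorBuilder {
        input_field: "message".into(),
        target_field: "message_upper".into(),
    };

    // The pipeline gathers keys from all builders; here there is only one.
    let mut intermediate_keys: Vec<String> = builder
        .input_keys()
        .into_iter()
        .chain(builder.output_keys())
        .map(|s| s.to_string())
        .collect();
    intermediate_keys.sort();

    let processor = builder.build(&intermediate_keys)?;
    println!("{processor:?}");
    Ok(())
}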

#[derive(Debug, Default)]
pub struct ProcessorBuilderList {
    pub(crate) processor_builders: Vec<ProcessorBuilders>,
    pub(crate) input_keys: Vec<String>,
    pub(crate) output_keys: Vec<String>,
    pub(crate) original_input_keys: Vec<String>,
}

#[derive(Debug, Default)]
pub struct Processors {
    /// An ordered list of processors
@@ -174,52 +170,63 @@ impl Processors {
    }
}

impl TryFrom<&Vec<yaml_rust::Yaml>> for Processors {
impl TryFrom<&Vec<yaml_rust::Yaml>> for ProcessorBuilderList {
    type Error = String;

    fn try_from(vec: &Vec<yaml_rust::Yaml>) -> Result<Self, Self::Error> {
        let mut processors = vec![];
        let mut processors_builders = vec![];
        let mut all_output_keys = HashSet::with_capacity(50);
        let mut all_required_keys = HashSet::with_capacity(50);
        let mut all_required_original_keys = HashSet::with_capacity(50);
        for doc in vec {
            let processor = parse_processor(doc)?;

            // get all required keys
            let processor_required_keys: Vec<String> = processor
                .fields()
                .iter()
                .map(|f| f.input_field.name.clone())
                .collect();

            for key in &processor_required_keys {
                if !all_output_keys.contains(key) {
                    all_required_original_keys.insert(key.clone());
                }
            }

            all_required_keys.extend(processor_required_keys);

            let processor_output_keys = processor.output_keys().into_iter();
            all_output_keys.extend(processor_output_keys);

            processors.push(processor);
            processors_builders.push(processor);
        }

        let all_required_keys = all_required_keys.into_iter().sorted().collect();
        let all_output_keys = all_output_keys.into_iter().sorted().collect();
        let all_required_original_keys = all_required_original_keys.into_iter().sorted().collect();
        for processor in processors_builders.iter() {
            {
                // get all required keys
                let processor_required_keys = processor.input_keys();

        Ok(Processors {
            processors,
            required_keys: all_required_keys,
                for key in &processor_required_keys {
                    if !all_output_keys.contains(key) {
                        all_required_original_keys.insert(*key);
                    }
                }

                all_required_keys.extend(processor_required_keys);

                let processor_output_keys = processor.output_keys().into_iter();
                all_output_keys.extend(processor_output_keys);
            }
        }

        let all_required_keys = all_required_keys
            .into_iter()
            .map(|x| x.to_string())
            .sorted()
            .collect();
        let all_output_keys = all_output_keys
            .into_iter()
            .map(|x| x.to_string())
            .sorted()
            .collect();
        let all_required_original_keys = all_required_original_keys
            .into_iter()
            .map(|x| x.to_string())
            .sorted()
            .collect();

        Ok(ProcessorBuilderList {
            processor_builders: processors_builders,
            input_keys: all_required_keys,
            output_keys: all_output_keys,
            required_original_keys: all_required_original_keys,
            original_input_keys: all_required_original_keys,
        })
    }
}
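
The bookkeeping above classifies keys by provenance: a key a processor needs counts as an "original" input only if no earlier processor in the pipeline has already produced it. The same logic in isolation, with hypothetical key names:

use std::collections::HashSet;

fn main() {
    // (input_keys, output_keys) per processor, in pipeline order.
    let processors = [
        (vec!["message"], vec!["ts_str"]),
        (vec!["ts_str"], vec!["ts"]),
    ];

    let mut outputs_so_far = HashSet::new();
    let mut original_inputs = HashSet::new();
    for (inputs, outputs) in &processors {
        for key in inputs {
            // Needed but not produced upstream => must come from the raw document.
            if !outputs_so_far.contains(key) {
                original_inputs.insert(*key);
            }
        }
        outputs_so_far.extend(outputs.iter().copied());
    }

    // "message" must come from the document; "ts_str" is produced in-pipeline.
    assert!(original_inputs.contains("message"));
    assert!(!original_inputs.contains("ts_str"));
}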

fn parse_processor(doc: &yaml_rust::Yaml) -> Result<ProcessorKind, String> {
fn parse_processor(doc: &yaml_rust::Yaml) -> Result<ProcessorBuilders, String> {
    let map = doc.as_hash().ok_or("processor must be a map".to_string())?;

    let key = map
@@ -238,20 +245,24 @@ fn parse_processor(doc: &yaml_rust::Yaml) -> Result<ProcessorKind, String> {
        .ok_or("processor key must be a string".to_string())?;

    let processor = match str_key {
        cmcd::PROCESSOR_CMCD => ProcessorKind::Cmcd(CmcdProcessor::try_from(value)?),
        csv::PROCESSOR_CSV => ProcessorKind::Csv(CsvProcessor::try_from(value)?),
        dissect::PROCESSOR_DISSECT => ProcessorKind::Dissect(DissectProcessor::try_from(value)?),
        epoch::PROCESSOR_EPOCH => ProcessorKind::Epoch(EpochProcessor::try_from(value)?),
        date::PROCESSOR_DATE => ProcessorKind::Date(DateProcessor::try_from(value)?),
        gsub::PROCESSOR_GSUB => ProcessorKind::Gsub(GsubProcessor::try_from(value)?),
        join::PROCESSOR_JOIN => ProcessorKind::Join(JoinProcessor::try_from(value)?),
        letter::PROCESSOR_LETTER => ProcessorKind::Letter(LetterProcessor::try_from(value)?),
        regex::PROCESSOR_REGEX => ProcessorKind::Regex(RegexProcessor::try_from(value)?),
        cmcd::PROCESSOR_CMCD => ProcessorBuilders::Cmcd(CmcdProcessorBuilder::try_from(value)?),
        csv::PROCESSOR_CSV => ProcessorBuilders::Csv(CsvProcessorBuilder::try_from(value)?),
        dissect::PROCESSOR_DISSECT => {
            ProcessorBuilders::Dissect(DissectProcessorBuilder::try_from(value)?)
        }
        epoch::PROCESSOR_EPOCH => ProcessorBuilders::Epoch(EpochProcessorBuilder::try_from(value)?),
        date::PROCESSOR_DATE => ProcessorBuilders::Date(DateProcessorBuilder::try_from(value)?),
        gsub::PROCESSOR_GSUB => ProcessorBuilders::Gsub(GsubProcessorBuilder::try_from(value)?),
        join::PROCESSOR_JOIN => ProcessorBuilders::Join(JoinProcessorBuilder::try_from(value)?),
        letter::PROCESSOR_LETTER => {
            ProcessorBuilders::Letter(LetterProcessorBuilder::try_from(value)?)
        }
        regex::PROCESSOR_REGEX => ProcessorBuilders::Regex(RegexProcessorBuilder::try_from(value)?),
        timestamp::PROCESSOR_TIMESTAMP => {
            ProcessorKind::Timestamp(TimestampProcessor::try_from(value)?)
            ProcessorBuilders::Timestamp(TimestampProcessorBuilder::try_from(value)?)
        }
        urlencoding::PROCESSOR_URL_ENCODING => {
            ProcessorKind::UrlEncoding(UrlEncodingProcessor::try_from(value)?)
            ProcessorBuilders::UrlEncoding(UrlEncodingProcessorBuilder::try_from(value)?)
        }
        _ => return Err(format!("unsupported {} processor", str_key)),
    };
@@ -301,19 +312,10 @@ where
    })
}

pub(crate) fn yaml_fields(v: &yaml_rust::Yaml, field: &str) -> Result<Fields, String> {
    let v = yaml_parse_strings(v, field)?;
    Fields::new(v)
pub(crate) fn yaml_new_fields(v: &yaml_rust::Yaml, field: &str) -> Result<Fields, String> {
    yaml_parse_strings(v, field).map(Fields::new)
}

pub(crate) fn yaml_field(v: &yaml_rust::Yaml, field: &str) -> Result<Field, String> {
pub(crate) fn yaml_new_field(v: &yaml_rust::Yaml, field: &str) -> Result<Field, String> {
    yaml_parse_string(v, field)
}

pub(crate) fn update_one_one_output_keys(fields: &mut Fields) {
    for field in fields.iter_mut() {
        field
            .output_fields_index_mapping
            .insert(field.get_target_field().to_string(), 0_usize);
    }
}

@@ -12,14 +12,18 @@
// See the License for the specific language governing permissions and
// limitations under the License.

use std::collections::BTreeMap;

use ahash::HashSet;
use urlencoding::decode;

use crate::etl::field::{Field, Fields};
use crate::etl::field::{Field, Fields, InputFieldInfo, OneInputMultiOutputField};
use crate::etl::find_key_index;
use crate::etl::processor::{
    yaml_bool, yaml_field, yaml_fields, FIELDS_NAME, FIELD_NAME, IGNORE_MISSING_NAME,
    yaml_bool, yaml_new_field, yaml_new_fields, Processor, ProcessorBuilder, ProcessorKind,
    FIELDS_NAME, FIELD_NAME, IGNORE_MISSING_NAME,
};
use crate::etl::value::{Map, Value};
use crate::etl::value::Value;

pub(crate) const PROCESSOR_CMCD: &str = "cmcd";

@@ -63,6 +67,178 @@ const CMCD_KEYS: [&str; 18] = [
    CMCD_KEY_V,
];

/// CmcdProcessorBuilder is a builder for CmcdProcessor,
/// parsed from raw YAML.
#[derive(Debug, Default)]
pub struct CmcdProcessorBuilder {
    fields: Fields,
    output_keys: HashSet<String>,
    ignore_missing: bool,
}

impl CmcdProcessorBuilder {
    /// build_cmcd_outputs builds the CMCD output info,
    /// generating the index and resolver function for each output.
    pub(super) fn build_cmcd_outputs(
        field: &Field,
        intermediate_keys: &[String],
    ) -> Result<(BTreeMap<String, usize>, Vec<CmcdOutputInfo>), String> {
        let mut output_index = BTreeMap::new();
        let mut cmcd_field_outputs = Vec::with_capacity(CMCD_KEYS.len());
        for cmcd in CMCD_KEYS {
            let final_key = generate_key(field.target_or_input_field(), cmcd);
            let index = find_key_index(intermediate_keys, &final_key, "cmcd")?;
            output_index.insert(final_key.clone(), index);
            match cmcd {
                CMCD_KEY_BS | CMCD_KEY_SU => {
                    let output_info = CmcdOutputInfo::new(final_key, cmcd, index, bs_su);
                    cmcd_field_outputs.push(output_info);
                }
                CMCD_KEY_BR | CMCD_KEY_BL | CMCD_KEY_D | CMCD_KEY_DL | CMCD_KEY_MTP
                | CMCD_KEY_RTP | CMCD_KEY_TB => {
                    let output_info = CmcdOutputInfo::new(final_key, cmcd, index, br_tb);
                    cmcd_field_outputs.push(output_info);
                }
                CMCD_KEY_CID | CMCD_KEY_NRR | CMCD_KEY_OT | CMCD_KEY_SF | CMCD_KEY_SID
                | CMCD_KEY_ST | CMCD_KEY_V => {
                    let output_info = CmcdOutputInfo::new(final_key, cmcd, index, cid_v);
                    cmcd_field_outputs.push(output_info);
                }
                CMCD_KEY_NOR => {
                    let output_info = CmcdOutputInfo::new(final_key, cmcd, index, nor);
                    cmcd_field_outputs.push(output_info);
                }
                CMCD_KEY_PR => {
                    let output_info = CmcdOutputInfo::new(final_key, cmcd, index, pr);
                    cmcd_field_outputs.push(output_info);
                }
                _ => {}
            }
        }
        Ok((output_index, cmcd_field_outputs))
    }

    /// Build a CmcdProcessor from the CmcdProcessorBuilder.
    pub fn build(self, intermediate_keys: &[String]) -> Result<CmcdProcessor, String> {
        let mut real_fields = vec![];
        let mut cmcd_outputs = Vec::with_capacity(CMCD_KEYS.len());
        for field in self.fields.into_iter() {
            let input_index = find_key_index(intermediate_keys, field.input_field(), "cmcd")?;

            let input_field_info = InputFieldInfo::new(field.input_field(), input_index);

            let (_, cmcd_field_outputs) = Self::build_cmcd_outputs(&field, intermediate_keys)?;

            cmcd_outputs.push(cmcd_field_outputs);

            let real_field = OneInputMultiOutputField::new(input_field_info, field.target_field);
            real_fields.push(real_field);
        }
        Ok(CmcdProcessor {
            fields: real_fields,
            cmcd_outputs,
            ignore_missing: self.ignore_missing,
        })
    }
}

impl ProcessorBuilder for CmcdProcessorBuilder {
    fn output_keys(&self) -> HashSet<&str> {
        self.output_keys.iter().map(|s| s.as_str()).collect()
    }

    fn input_keys(&self) -> HashSet<&str> {
        self.fields.iter().map(|f| f.input_field()).collect()
    }

    fn build(self, intermediate_keys: &[String]) -> Result<ProcessorKind, String> {
        self.build(intermediate_keys).map(ProcessorKind::Cmcd)
    }
}

fn generate_key(prefix: &str, key: &str) -> String {
    format!("{}_{}", prefix, key)
}

/// CmcdOutputInfo stores the output info for one CMCD key.
#[derive(Debug)]
pub(super) struct CmcdOutputInfo {
    /// {input_field}_{cmcd_key}
    final_key: String,
    /// cmcd key
    key: &'static str,
    /// index in intermediate_keys
    index: usize,
    /// function to resolve the value
    f: fn(&str, &str, Option<&str>) -> Result<Value, String>,
}

impl CmcdOutputInfo {
    fn new(
        final_key: String,
        key: &'static str,
        index: usize,
        f: fn(&str, &str, Option<&str>) -> Result<Value, String>,
    ) -> Self {
        Self {
            final_key,
            key,
            index,
            f,
        }
    }
}

impl Default for CmcdOutputInfo {
    fn default() -> Self {
        Self {
            final_key: String::default(),
            key: "",
            index: 0,
            f: |_, _, _| Ok(Value::Null),
        }
    }
}

/// Function to resolve CMCD_KEY_BS | CMCD_KEY_SU.
fn bs_su(_: &str, _: &str, _: Option<&str>) -> Result<Value, String> {
    Ok(Value::Boolean(true))
}

/// Function to resolve CMCD_KEY_BR | CMCD_KEY_BL | CMCD_KEY_D | CMCD_KEY_DL | CMCD_KEY_MTP | CMCD_KEY_RTP | CMCD_KEY_TB.
fn br_tb(s: &str, k: &str, v: Option<&str>) -> Result<Value, String> {
    let v = v.ok_or(format!("{k} missing value in {s}"))?;
    let val: i64 = v
        .parse()
        .map_err(|_| format!("failed to parse {v} as i64"))?;
    Ok(Value::Int64(val))
}

/// Function to resolve CMCD_KEY_CID | CMCD_KEY_NRR | CMCD_KEY_OT | CMCD_KEY_SF | CMCD_KEY_SID | CMCD_KEY_ST | CMCD_KEY_V.
fn cid_v(s: &str, k: &str, v: Option<&str>) -> Result<Value, String> {
    let v = v.ok_or(format!("{k} missing value in {s}"))?;
    Ok(Value::String(v.to_string()))
}

/// Function to resolve CMCD_KEY_NOR.
fn nor(s: &str, k: &str, v: Option<&str>) -> Result<Value, String> {
    let v = v.ok_or(format!("{k} missing value in {s}"))?;
    let val = match decode(v) {
        Ok(val) => val.to_string(),
        Err(_) => v.to_string(),
    };
    Ok(Value::String(val))
}

/// Function to resolve CMCD_KEY_PR.
fn pr(s: &str, k: &str, v: Option<&str>) -> Result<Value, String> {
    let v = v.ok_or(format!("{k} missing value in {s}"))?;
    let val: f64 = v
        .parse()
        .map_err(|_| format!("failed to parse {v} as f64"))?;
    Ok(Value::Float64(val))
}
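
All of these resolvers share the signature fn(&str, &str, Option<&str>) -> Result<Value, String>, so CmcdOutputInfo can hold a plain function pointer and parsing one key=value pair reduces to a table lookup plus a call. A stripped-down demonstration of that dispatch style, with a simplified stand-in Value type (illustrative only):

#[derive(Debug)]
enum Value {
    Boolean(bool),
    Int64(i64),
}

type Resolver = fn(&str, &str, Option<&str>) -> Result<Value, String>;

// Analogous to `bs_su`: the key's presence alone means "true".
fn flag(_: &str, _: &str, _: Option<&str>) -> Result<Value, String> {
    Ok(Value::Boolean(true))
}

// Analogous to `br_tb`: the value must parse as an integer.
fn int(s: &str, k: &str, v: Option<&str>) -> Result<Value, String> {
    let v = v.ok_or(format!("{k} missing value in {s}"))?;
    v.parse()
        .map(Value::Int64)
        .map_err(|_| format!("failed to parse {v} as i64"))
}

fn main() -> Result<(), String> {
    // (key, resolver) pairs, playing the role of `CmcdOutputInfo`.
    let table: &[(&str, Resolver)] = &[("su", flag), ("bl", int)];

    let s = "bl=21300,su";
    for part in s.split(',') {
        let mut kv = part.split('=');
        let k = kv.next().unwrap(); // `split` always yields at least one item
        let v = kv.next();
        if let Some((_, resolve)) = table.iter().find(|(key, _)| *key == k) {
            println!("{k} -> {:?}", resolve(s, k, v)?);
        }
    }
    Ok(())
}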

/// Common Media Client Data Specification:
/// https://cdn.cta.tech/cta/media/media/resources/standards/pdfs/cta-5004-final.pdf
///
@@ -100,98 +276,43 @@ const CMCD_KEYS: [&str; 18] = [
/// 12. Transport Layer Security SHOULD be used to protect all transmission of CMCD data.
#[derive(Debug, Default)]
pub struct CmcdProcessor {
    fields: Fields,
    fields: Vec<OneInputMultiOutputField>,
    cmcd_outputs: Vec<Vec<CmcdOutputInfo>>,

    ignore_missing: bool,
}

impl CmcdProcessor {
    fn with_fields(&mut self, mut fields: Fields) {
        Self::update_output_keys(&mut fields);
        self.fields = fields;
    }

    fn with_ignore_missing(&mut self, ignore_missing: bool) {
        self.ignore_missing = ignore_missing;
    }

    fn generate_key(prefix: &str, key: &str) -> String {
        format!("{}_{}", prefix, key)
    }

    fn parse(prefix: &str, s: &str) -> Result<Map, String> {
        let mut map = Map::default();
    fn parse(&self, field_index: usize, s: &str) -> Result<Vec<(usize, Value)>, String> {
        let parts = s.split(',');
        let mut result = Vec::new();
        for part in parts {
            let mut kv = part.split('=');
            let k = kv.next().ok_or(format!("{part} missing key in {s}"))?;
            let v = kv.next();

            let key = Self::generate_key(prefix, k);
            match k {
                CMCD_KEY_BS | CMCD_KEY_SU => {
                    map.insert(key, Value::Boolean(true));
            for cmcd_key in self.cmcd_outputs[field_index].iter() {
                if cmcd_key.key == k {
                    let val = (cmcd_key.f)(s, k, v)?;
                    result.push((cmcd_key.index, val));
                }
                CMCD_KEY_BR | CMCD_KEY_BL | CMCD_KEY_D | CMCD_KEY_DL | CMCD_KEY_MTP
                | CMCD_KEY_RTP | CMCD_KEY_TB => {
                    let v = v.ok_or(format!("{k} missing value in {s}"))?;
                    let val: i64 = v
                        .parse()
                        .map_err(|_| format!("failed to parse {v} as i64"))?;
                    map.insert(key, Value::Int64(val));
                }
                CMCD_KEY_CID | CMCD_KEY_NRR | CMCD_KEY_OT | CMCD_KEY_SF | CMCD_KEY_SID
                | CMCD_KEY_ST | CMCD_KEY_V => {
                    let v = v.ok_or(format!("{k} missing value in {s}"))?;
                    map.insert(key, Value::String(v.to_string()));
                }
                CMCD_KEY_NOR => {
                    let v = v.ok_or(format!("{k} missing value in {s}"))?;
                    let val = match decode(v) {
                        Ok(val) => val.to_string(),
                        Err(_) => v.to_string(),
                    };
                    map.insert(key, Value::String(val));
                }
                CMCD_KEY_PR => {
                    let v = v.ok_or(format!("{k} missing value in {s}"))?;
                    let val: f64 = v
                        .parse()
                        .map_err(|_| format!("failed to parse {v} as f64"))?;
                    map.insert(key, Value::Float64(val));
                }
                _ => match v {
                    Some(v) => map.insert(key, Value::String(v.to_string())),
                    None => map.insert(k, Value::Boolean(true)),
                },
            }
        }

        Ok(map)
    }

    fn process_field(&self, val: &str, field: &Field) -> Result<Map, String> {
        let prefix = field.get_target_field();

        Self::parse(prefix, val)
    }

    fn update_output_keys(fields: &mut Fields) {
        for field in fields.iter_mut() {
            for key in CMCD_KEYS.iter() {
                field
                    .output_fields_index_mapping
                    .insert(Self::generate_key(field.get_target_field(), key), 0);
            }
        }
        Ok(result)
    }
}

impl TryFrom<&yaml_rust::yaml::Hash> for CmcdProcessor {
impl TryFrom<&yaml_rust::yaml::Hash> for CmcdProcessorBuilder {
    type Error = String;

    fn try_from(value: &yaml_rust::yaml::Hash) -> Result<Self, Self::Error> {
        let mut processor = CmcdProcessor::default();
        let mut fields = Fields::default();
        let mut ignore_missing = false;

        for (k, v) in value.iter() {
            let key = k
@@ -199,25 +320,40 @@ impl TryFrom<&yaml_rust::yaml::Hash> for CmcdProcessor {
                .ok_or(format!("key must be a string, but got {k:?}"))?;
            match key {
                FIELD_NAME => {
                    processor.with_fields(Fields::one(yaml_field(v, FIELD_NAME)?));
                    fields = Fields::one(yaml_new_field(v, FIELD_NAME)?);
                }
                FIELDS_NAME => {
                    processor.with_fields(yaml_fields(v, FIELDS_NAME)?);
                    fields = yaml_new_fields(v, FIELDS_NAME)?;
                }

                IGNORE_MISSING_NAME => {
                    processor.with_ignore_missing(yaml_bool(v, IGNORE_MISSING_NAME)?);
                    ignore_missing = yaml_bool(v, IGNORE_MISSING_NAME)?;
                }

                _ => {}
            }
        }

        Ok(processor)
        let output_keys = fields
            .iter()
            .flat_map(|f| {
                CMCD_KEYS
                    .iter()
                    .map(|cmcd_key| generate_key(f.target_or_input_field(), cmcd_key))
            })
            .collect();

        let builder = CmcdProcessorBuilder {
            fields,
            output_keys,
            ignore_missing,
        };

        Ok(builder)
    }
}

impl crate::etl::processor::Processor for CmcdProcessor {
impl Processor for CmcdProcessor {
    fn kind(&self) -> &str {
        PROCESSOR_CMCD
    }
@@ -226,51 +362,14 @@ impl crate::etl::processor::Processor for CmcdProcessor {
        self.ignore_missing
    }

    fn fields(&self) -> &Fields {
        &self.fields
    }

    fn fields_mut(&mut self) -> &mut Fields {
        &mut self.fields
    }

    fn output_keys(&self) -> HashSet<String> {
        self.fields
            .iter()
            .map(|field| {
                field
                    .target_field
                    .clone()
                    .unwrap_or_else(|| field.get_field_name().to_string())
            })
            .flat_map(|keys| {
                CMCD_KEYS
                    .iter()
                    .map(move |key| format!("{}_{}", keys, *key))
            })
            .collect()
    }

    fn exec_field(&self, val: &Value, field: &Field) -> Result<Map, String> {
        match val {
            Value::String(val) => self.process_field(val, field),
            _ => Err(format!(
                "{} processor: expect string value, but got {val:?}",
                self.kind()
            )),
        }
    }

    fn exec_mut(&self, val: &mut Vec<Value>) -> Result<(), String> {
        for field in self.fields.iter() {
            match val.get(field.input_field.index) {
        for (field_index, field) in self.fields.iter().enumerate() {
            let field_value_index = field.input_index();
            match val.get(field_value_index) {
                Some(Value::String(v)) => {
                    // TODO(qtang): Let this method use the intermediate state collection directly.
                    let map = self.process_field(v, field)?;
                    for (k, v) in map.values.into_iter() {
                        if let Some(index) = field.output_fields_index_mapping.get(&k) {
                            val[*index] = v;
                        }
                    let result_list = self.parse(field_index, v)?;
                    for (output_index, v) in result_list {
                        val[output_index] = v;
                    }
                }
                Some(Value::Null) | None => {
@@ -278,7 +377,7 @@ impl crate::etl::processor::Processor for CmcdProcessor {
                    return Err(format!(
                        "{} processor: missing field: {}",
                        self.kind(),
                        field.get_field_name()
                        field.input_name()
                    ));
                }
            }
@@ -299,7 +398,8 @@ mod tests {
    use ahash::HashMap;
    use urlencoding::decode;

    use super::CmcdProcessor;
    use super::{CmcdProcessorBuilder, CMCD_KEYS};
    use crate::etl::field::{Field, Fields};
    use crate::etl::value::{Map, Value};

    #[test]
@@ -329,6 +429,7 @@ mod tests {
            ],
        ),
        (
            // we do not resolve the `b` key
            "b%2Crtp%3D15000%2Csid%3D%226e2fb550-c457-11e9-bb97-0800200c9a66%22",
            vec![
                (
@@ -336,7 +437,6 @@ mod tests {
                    Value::String("\"6e2fb550-c457-11e9-bb97-0800200c9a66\"".into()),
                ),
                ("prefix_rtp", Value::Int64(15000)),
                ("b", Value::Boolean(true)),
            ],
        ),
        (
@@ -347,16 +447,17 @@ mod tests {
            ],
        ),
        (
            // we do not resolve custom keys
            "d%3D4004%2Ccom.example-myNumericKey%3D500%2Ccom.examplemyStringKey%3D%22myStringValue%22",
            vec![
                (
                    "prefix_com.example-myNumericKey",
                    Value::String("500".into()),
                ),
                (
                    "prefix_com.examplemyStringKey",
                    Value::String("\"myStringValue\"".into()),
                ),
                // (
                //     "prefix_com.example-myNumericKey",
                //     Value::String("500".into()),
                // ),
                // (
                //     "prefix_com.examplemyStringKey",
                //     Value::String("\"myStringValue\"".into()),
                // ),
                ("prefix_d", Value::Int64(4004)),
            ],
        ),
@@ -431,6 +532,24 @@ mod tests {
            ),
        ];

        let field = Field::new("prefix", None);

        let output_keys = CMCD_KEYS
            .iter()
            .map(|k| format!("prefix_{}", k))
            .collect::<Vec<String>>();

        let mut intermediate_keys = vec!["prefix".to_string()];
        intermediate_keys.append(&mut (output_keys.clone()));

        let builder = CmcdProcessorBuilder {
            fields: Fields::new(vec![field]),
            output_keys: output_keys.iter().map(|s| s.to_string()).collect(),
            ignore_missing: false,
        };

        let processor = builder.build(&intermediate_keys).unwrap();

        for (s, vec) in ss.into_iter() {
            let decoded = decode(s).unwrap().to_string();

@@ -440,7 +559,12 @@ mod tests {
                .collect::<HashMap<String, Value>>();
            let expected = Map { values };

            let actual = CmcdProcessor::parse("prefix", &decoded).unwrap();
            let actual = processor.parse(0, &decoded).unwrap();
            let actual = actual
                .into_iter()
                .map(|(index, value)| (intermediate_keys[index].clone(), value))
                .collect::<HashMap<String, Value>>();
            let actual = Map { values: actual };
            assert_eq!(actual, expected);
        }
    }

@@ -14,17 +14,18 @@

// Reference: https://www.elastic.co/guide/en/elasticsearch/reference/current/csv-processor.html

use ahash::{HashMap, HashSet};
use ahash::HashSet;
use csv::{ReaderBuilder, Trim};
use itertools::EitherOrBoth::{Both, Left, Right};
use itertools::Itertools;

use crate::etl::field::{Field, Fields};
use crate::etl::field::{Fields, InputFieldInfo, OneInputMultiOutputField};
use crate::etl::find_key_index;
use crate::etl::processor::{
    yaml_bool, yaml_field, yaml_fields, yaml_string, Processor, FIELDS_NAME, FIELD_NAME,
    IGNORE_MISSING_NAME,
    yaml_bool, yaml_new_field, yaml_new_fields, yaml_string, Processor, ProcessorBuilder,
    ProcessorKind, FIELDS_NAME, FIELD_NAME, IGNORE_MISSING_NAME,
};
use crate::etl::value::{Map, Value};
use crate::etl::value::Value;

pub(crate) const PROCESSOR_CSV: &str = "csv";

@@ -32,18 +33,78 @@ const SEPARATOR_NAME: &str = "separator";
const QUOTE_NAME: &str = "quote";
const TRIM_NAME: &str = "trim";
const EMPTY_VALUE_NAME: &str = "empty_value";
const TARGET_FIELDS: &str = "target_fields";

#[derive(Debug, Default)]
pub struct CsvProcessorBuilder {
    reader: ReaderBuilder,

    fields: Fields,
    ignore_missing: bool,

    // Value used to fill empty fields; empty fields will be skipped if it is not provided.
    empty_value: Option<String>,
    target_fields: Vec<String>,
    // description
    // if
    // ignore_failure
    // on_failure
    // tag
}

impl CsvProcessorBuilder {
    fn build(self, intermediate_keys: &[String]) -> Result<CsvProcessor, String> {
        let mut real_fields = vec![];

        for field in self.fields {
            let input_index = find_key_index(intermediate_keys, field.input_field(), "csv")?;

            let input_field_info = InputFieldInfo::new(field.input_field(), input_index);
            let real_field = OneInputMultiOutputField::new(input_field_info, None);
            real_fields.push(real_field);
        }

        let output_index_info = self
            .target_fields
            .iter()
            .map(|f| find_key_index(intermediate_keys, f, "csv"))
            .collect::<Result<Vec<_>, String>>()?;
        Ok(CsvProcessor {
            reader: self.reader,
            fields: real_fields,
            ignore_missing: self.ignore_missing,
            empty_value: self.empty_value,
            output_index_info,
        })
    }
}

impl ProcessorBuilder for CsvProcessorBuilder {
    fn output_keys(&self) -> HashSet<&str> {
        self.target_fields.iter().map(|s| s.as_str()).collect()
    }

    fn input_keys(&self) -> HashSet<&str> {
        self.fields.iter().map(|f| f.input_field()).collect()
    }

    fn build(self, intermediate_keys: &[String]) -> Result<ProcessorKind, String> {
        self.build(intermediate_keys).map(ProcessorKind::Csv)
    }
}

/// Only supports string values.
#[derive(Debug)]
pub struct CsvProcessor {
    reader: ReaderBuilder,

    fields: Fields,
    fields: Vec<OneInputMultiOutputField>,

    ignore_missing: bool,

    // Value used to fill empty fields; empty fields will be skipped if it is not provided.
    empty_value: Option<String>,
    output_index_info: Vec<usize>,
    // description
    // if
    // ignore_failure
@@ -52,81 +113,19 @@ pub struct CsvProcessor {
}

impl CsvProcessor {
    fn new() -> Self {
        let mut reader = ReaderBuilder::new();
        reader.has_headers(false);

        Self {
            reader,
            fields: Fields::default(),
            ignore_missing: false,
            empty_value: None,
        }
    }

    fn with_fields(&mut self, fields: Fields) {
        self.fields = fields;
    }

    fn try_separator(&mut self, separator: String) -> Result<(), String> {
        if separator.len() != 1 {
            Err(format!(
                "'{}' must be a single character, but got '{}'",
                SEPARATOR_NAME, separator
            ))
        } else {
            self.reader.delimiter(separator.as_bytes()[0]);
            Ok(())
        }
    }

    fn try_quote(&mut self, quote: String) -> Result<(), String> {
        if quote.len() != 1 {
            Err(format!(
                "'{}' must be a single character, but got '{}'",
                QUOTE_NAME, quote
            ))
        } else {
            self.reader.quote(quote.as_bytes()[0]);
            Ok(())
        }
    }

    fn with_trim(&mut self, trim: bool) {
        if trim {
            self.reader.trim(Trim::All);
        } else {
            self.reader.trim(Trim::None);
        }
    }

    fn with_ignore_missing(&mut self, ignore_missing: bool) {
        self.ignore_missing = ignore_missing;
    }

    fn with_empty_value(&mut self, empty_value: String) {
        self.empty_value = Some(empty_value);
    }

    // process the csv format string to a map with target_fields as keys
    fn process_field(&self, val: &str, field: &Field) -> Result<Map, String> {
    fn process(&self, val: &str) -> Result<Vec<(usize, Value)>, String> {
        let mut reader = self.reader.from_reader(val.as_bytes());

        if let Some(result) = reader.records().next() {
            let record: csv::StringRecord = result.map_err(|e| e.to_string())?;

            let values: HashMap<String, Value> = field
                .target_fields
                .as_ref()
                .ok_or(format!(
                    "target fields must be set after '{}'",
                    field.get_field_name()
                ))?
            let values: Vec<(usize, Value)> = self
                .output_index_info
                .iter()
                .map(|f| f.to_string())
                .zip_longest(record.iter())
                .filter_map(|zipped| match zipped {
                    Both(target_field, val) => Some((target_field, Value::String(val.into()))),
                    Both(target_field, val) => Some((*target_field, Value::String(val.into()))),
                    // if target fields are more than extracted fields, fill the rest with the empty value
                    Left(target_field) => {
                        let value = self
@@ -134,69 +133,101 @@ impl CsvProcessor {
                            .as_ref()
                            .map(|s| Value::String(s.clone()))
                            .unwrap_or(Value::Null);
                        Some((target_field, value))
                        Some((*target_field, value))
                    }
                    // if extracted fields are more than target fields, ignore the rest
                    Right(_) => None,
                })
                .collect();

            Ok(Map { values })
            Ok(values)
        } else {
            Err("expected at least one record from csv format, but got none".into())
        }
    }
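
The zip_longest pairing above is what gives the CSV processor its padding semantics: when there are more target fields than extracted columns, the remainder is filled from empty_value (or Null), and surplus columns are dropped. A standalone illustration with itertools, the same crate the processor uses:

use itertools::EitherOrBoth::{Both, Left, Right};
use itertools::Itertools;

fn main() {
    let target_fields = ["a", "b", "c"];
    let record = ["1", "2"]; // one column short
    let empty_value = Some("default");

    let pairs: Vec<(&str, Option<&str>)> = target_fields
        .iter()
        .zip_longest(record.iter())
        .filter_map(|zipped| match zipped {
            Both(field, val) => Some((*field, Some(*val))),
            // more target fields than columns: pad with the empty value
            Left(field) => Some((*field, empty_value)),
            // more columns than target fields: ignore the surplus
            Right(_) => None,
        })
        .collect();

    assert_eq!(
        pairs,
        vec![("a", Some("1")), ("b", Some("2")), ("c", Some("default"))]
    );
}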

    fn update_output_keys(&mut self) {
        self.fields.iter_mut().for_each(|f| {
            if let Some(tfs) = f.target_fields.as_ref() {
                tfs.iter().for_each(|tf| {
                    if !tf.is_empty() {
                        f.output_fields_index_mapping.insert(tf.to_string(), 0);
                    }
                });
            }
        })
    }
}

impl TryFrom<&yaml_rust::yaml::Hash> for CsvProcessor {
impl TryFrom<&yaml_rust::yaml::Hash> for CsvProcessorBuilder {
    type Error = String;

    fn try_from(hash: &yaml_rust::yaml::Hash) -> Result<Self, Self::Error> {
        let mut processor = CsvProcessor::new();
        let mut reader = ReaderBuilder::new();
        reader.has_headers(false);

        let mut fields = Fields::default();
        let mut ignore_missing = false;
        let mut empty_value = None;
        let mut target_fields = vec![];

        for (k, v) in hash {
            let key = k
                .as_str()
                .ok_or(format!("key must be a string, but got {k:?}"))?;
            match key {
                FIELD_NAME => {
                    processor.with_fields(Fields::one(yaml_field(v, FIELD_NAME)?));
                    fields = Fields::one(yaml_new_field(v, FIELD_NAME)?);
                }
                FIELDS_NAME => {
                    processor.with_fields(yaml_fields(v, FIELDS_NAME)?);
                    fields = yaml_new_fields(v, FIELDS_NAME)?;
                }
                TARGET_FIELDS => {
                    target_fields = yaml_string(v, TARGET_FIELDS)?
                        .split(',')
                        .map(|s| s.trim().to_string())
                        .filter(|s| !s.is_empty())
                        .collect();
                }
                SEPARATOR_NAME => {
                    processor.try_separator(yaml_string(v, SEPARATOR_NAME)?)?;
                    let separator = yaml_string(v, SEPARATOR_NAME)?;
                    if separator.len() != 1 {
                        return Err(format!(
                            "'{}' must be a single character, but got '{}'",
                            SEPARATOR_NAME, separator
                        ));
                    } else {
                        reader.delimiter(separator.as_bytes()[0]);
                    }
                }
                QUOTE_NAME => {
                    processor.try_quote(yaml_string(v, QUOTE_NAME)?)?;
                    let quote = yaml_string(v, QUOTE_NAME)?;
                    if quote.len() != 1 {
                        return Err(format!(
                            "'{}' must be a single character, but got '{}'",
                            QUOTE_NAME, quote
                        ));
                    } else {
                        reader.quote(quote.as_bytes()[0]);
                    }
                }
                TRIM_NAME => {
                    processor.with_trim(yaml_bool(v, TRIM_NAME)?);
                    let trim = yaml_bool(v, TRIM_NAME)?;
                    if trim {
                        reader.trim(Trim::All);
                    } else {
                        reader.trim(Trim::None);
                    }
                }
                IGNORE_MISSING_NAME => {
                    processor.with_ignore_missing(yaml_bool(v, IGNORE_MISSING_NAME)?);
                    ignore_missing = yaml_bool(v, IGNORE_MISSING_NAME)?;
                }
                EMPTY_VALUE_NAME => {
                    processor.with_empty_value(yaml_string(v, EMPTY_VALUE_NAME)?);
                    empty_value = Some(yaml_string(v, EMPTY_VALUE_NAME)?);
                }

                _ => {}
            }
        }
        processor.update_output_keys();
        Ok(processor)
        let builder = {
            CsvProcessorBuilder {
                reader,
                fields,
                ignore_missing,
                empty_value,
                target_fields,
            }
        };

        Ok(builder)
    }
}

@@ -209,41 +240,14 @@ impl Processor for CsvProcessor {
        self.ignore_missing
    }

    fn fields(&self) -> &Fields {
        &self.fields
    }

    fn fields_mut(&mut self) -> &mut Fields {
        &mut self.fields
    }

    fn output_keys(&self) -> HashSet<String> {
        self.fields
            .iter()
            .flat_map(|f| f.target_fields.clone().unwrap_or_default())
            .collect()
    }

    fn exec_field(&self, val: &Value, field: &Field) -> Result<Map, String> {
        match val {
            Value::String(val) => self.process_field(val, field),
            _ => Err(format!(
                "{} processor: expect string value, but got {val:?}",
                self.kind()
            )),
        }
    }

    fn exec_mut(&self, val: &mut Vec<Value>) -> Result<(), String> {
        for field in self.fields.iter() {
            match val.get(field.input_field.index) {
            let index = field.input_index();
            match val.get(index) {
                Some(Value::String(v)) => {
                    // TODO(qtang): Let this method use the intermediate state collection directly.
                    let map = self.process_field(v, field)?;
                    for (k, v) in map.values.into_iter() {
                        if let Some(index) = field.output_fields_index_mapping.get(&k) {
                            val[*index] = v;
                        }
                    let result_list = self.process(v)?;
                    for (k, v) in result_list {
                        val[k] = v;
                    }
                }
                Some(Value::Null) | None => {
@@ -251,7 +255,7 @@ impl Processor for CsvProcessor {
                    return Err(format!(
                        "{} processor: missing field: {}",
                        self.kind(),
                        field.get_field_name()
                        field.input_name()
                    ));
                }
            }
@@ -267,116 +271,140 @@ impl Processor for CsvProcessor {
|
||||
}
|
||||
}
|
||||
|
||||
// TODO(yuanbohan): more test cases
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
|
||||
use ahash::HashMap;
|
||||
|
||||
use super::{CsvProcessor, Value};
|
||||
use crate::etl::field::Fields;
|
||||
use crate::etl::processor::Processor;
|
||||
use crate::etl::value::Map;
|
||||
use super::Value;
|
||||
use crate::etl::processor::csv::CsvProcessorBuilder;
|
||||
|
||||
#[test]
|
||||
fn test_equal_length() {
|
||||
let mut processor = CsvProcessor::new();
|
||||
let field = "data,, a, b".parse().unwrap();
|
||||
processor.with_fields(Fields::one(field));
|
||||
let mut reader = csv::ReaderBuilder::new();
|
||||
reader.has_headers(false);
|
||||
let builder = CsvProcessorBuilder {
|
||||
reader,
|
||||
target_fields: vec!["a".into(), "b".into()],
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
let values: HashMap<String, Value> = [("data".into(), Value::String("1,2".into()))]
|
||||
let intermediate_keys = vec!["data".into(), "a".into(), "b".into()];
|
||||
|
||||
let processor = builder.build(&intermediate_keys).unwrap();
|
||||
let result = processor
|
||||
.process("1,2")
|
||||
.unwrap()
|
||||
.into_iter()
|
||||
.collect();
|
||||
let mut m = Map { values };
|
||||
|
||||
processor.exec_map(&mut m).unwrap();
|
||||
.map(|(k, v)| (intermediate_keys[k].clone(), v))
|
||||
.collect::<HashMap<_, _>>();
|
||||
|
||||
let values = [
|
||||
("data".into(), Value::String("1,2".into())),
|
||||
("a".into(), Value::String("1".into())),
|
||||
("b".into(), Value::String("2".into())),
|
||||
]
|
||||
.into_iter()
|
||||
.collect();
|
||||
let expected = Map { values };
|
||||
.collect::<HashMap<_, _>>();
|
||||
|
||||
assert_eq!(expected, m);
|
||||
assert_eq!(result, values);
|
||||
}
|
||||
|
||||
// test target_fields length larger than the record length
|
||||
#[test]
|
||||
fn test_target_fields_has_more_length() {
|
||||
let values = [("data".into(), Value::String("1,2".into()))]
|
||||
.into_iter()
|
||||
.collect();
|
||||
let mut input = Map { values };
|
||||
|
||||
// with no empty value
|
||||
{
|
||||
let mut processor = CsvProcessor::new();
|
||||
let field = "data,, a,b,c".parse().unwrap();
|
||||
processor.with_fields(Fields::one(field));
|
||||
let mut reader = csv::ReaderBuilder::new();
|
||||
reader.has_headers(false);
|
||||
let builder = CsvProcessorBuilder {
|
||||
reader,
|
||||
target_fields: vec!["a".into(), "b".into(), "c".into()],
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
processor.exec_map(&mut input).unwrap();
|
||||
let intermediate_keys = vec!["data".into(), "a".into(), "b".into(), "c".into()];
|
||||
|
||||
let processor = builder.build(&intermediate_keys).unwrap();
|
||||
let result = processor
|
||||
.process("1,2")
|
||||
.unwrap()
|
||||
.into_iter()
|
||||
.map(|(k, v)| (intermediate_keys[k].clone(), v))
|
||||
.collect::<HashMap<_, _>>();
|
||||
|
||||
let values = [
|
||||
("data".into(), Value::String("1,2".into())),
|
||||
("a".into(), Value::String("1".into())),
|
||||
("b".into(), Value::String("2".into())),
|
||||
("c".into(), Value::Null),
|
||||
]
|
||||
.into_iter()
|
||||
.collect();
|
||||
let expected = Map { values };
|
||||
.collect::<HashMap<_, _>>();
|
||||
|
||||
assert_eq!(expected, input);
|
||||
assert_eq!(result, values);
|
||||
}
|
||||
|
||||
// with empty value
|
||||
{
|
||||
let mut processor = CsvProcessor::new();
|
||||
let field = "data,, a,b,c".parse().unwrap();
|
||||
processor.with_fields(Fields::one(field));
|
||||
processor.with_empty_value("default".into());
|
||||
let mut reader = csv::ReaderBuilder::new();
|
||||
reader.has_headers(false);
|
||||
let builder = CsvProcessorBuilder {
|
||||
reader,
|
||||
target_fields: vec!["a".into(), "b".into(), "c".into()],
|
||||
empty_value: Some("default".into()),
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
processor.exec_map(&mut input).unwrap();
|
||||
let intermediate_keys = vec!["data".into(), "a".into(), "b".into(), "c".into()];
|
||||
|
||||
let processor = builder.build(&intermediate_keys).unwrap();
|
||||
let result = processor
|
||||
.process("1,2")
|
||||
.unwrap()
|
||||
.into_iter()
|
||||
.map(|(k, v)| (intermediate_keys[k].clone(), v))
|
||||
.collect::<HashMap<_, _>>();
|
||||
|
||||
let values = [
|
||||
("data".into(), Value::String("1,2".into())),
|
||||
("a".into(), Value::String("1".into())),
|
||||
("b".into(), Value::String("2".into())),
|
||||
("c".into(), Value::String("default".into())),
|
||||
]
|
||||
.into_iter()
|
||||
.collect();
|
||||
let expected = Map { values };
|
||||
|
||||
assert_eq!(expected, input);
|
||||
assert_eq!(result, values);
|
||||
}
|
||||
}
|
||||
|
||||
// test record has larger length
|
||||
#[test]
|
||||
fn test_target_fields_has_less_length() {
|
||||
let values = [("data".into(), Value::String("1,2,3".into()))]
|
||||
let mut reader = csv::ReaderBuilder::new();
|
||||
reader.has_headers(false);
|
||||
let builder = CsvProcessorBuilder {
|
||||
reader,
|
||||
target_fields: vec!["a".into(), "b".into()],
|
||||
empty_value: Some("default".into()),
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
let intermediate_keys = vec!["data".into(), "a".into(), "b".into()];
|
||||
|
||||
let processor = builder.build(&intermediate_keys).unwrap();
|
||||
let result = processor
|
||||
.process("1,2")
|
||||
.unwrap()
|
||||
.into_iter()
|
||||
.collect();
|
||||
let mut input = Map { values };
|
||||
|
||||
let mut processor = CsvProcessor::new();
|
||||
let field = "data,,a,b".parse().unwrap();
|
||||
processor.with_fields(Fields::one(field));
|
||||
|
||||
processor.exec_map(&mut input).unwrap();
|
||||
.map(|(k, v)| (intermediate_keys[k].clone(), v))
|
||||
.collect::<HashMap<_, _>>();
|
||||
|
||||
let values = [
|
||||
("data".into(), Value::String("1,2,3".into())),
|
||||
("a".into(), Value::String("1".into())),
|
||||
("b".into(), Value::String("2".into())),
|
||||
]
|
||||
.into_iter()
|
||||
.collect();
|
||||
let expected = Map { values };
|
||||
|
||||
assert_eq!(expected, input);
|
||||
assert_eq!(result, values);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -19,12 +19,12 @@ use chrono::{DateTime, NaiveDateTime};
use chrono_tz::Tz;
use lazy_static::lazy_static;

use crate::etl::field::{Field, Fields};
use crate::etl::field::{Fields, OneInputOneOutputField};
use crate::etl::processor::{
update_one_one_output_keys, yaml_bool, yaml_field, yaml_fields, yaml_string, yaml_strings,
Processor, FIELDS_NAME, FIELD_NAME, IGNORE_MISSING_NAME,
yaml_bool, yaml_new_field, yaml_new_fields, yaml_string, yaml_strings, Processor,
ProcessorBuilder, ProcessorKind, FIELDS_NAME, FIELD_NAME, IGNORE_MISSING_NAME,
};
use crate::etl::value::{Map, Timestamp, Value};
use crate::etl::value::{Timestamp, Value};

pub(crate) const PROCESSOR_DATE: &str = "date";

@@ -57,9 +57,15 @@ lazy_static! {
.collect();
}

#[derive(Debug, Default)]
#[derive(Debug)]
struct Formats(Vec<Arc<String>>);

impl Default for Formats {
fn default() -> Self {
Formats(DEFAULT_FORMATS.clone())
}
}

impl Formats {
fn new(mut formats: Vec<Arc<String>>) -> Self {
formats.sort();
@@ -76,16 +82,119 @@ impl std::ops::Deref for Formats {
}
}

#[derive(Debug, Default)]
pub struct DateProcessorBuilder {
fields: Fields,
formats: Formats,
timezone: Option<Arc<String>>,
locale: Option<Arc<String>>,
ignore_missing: bool,
}

impl ProcessorBuilder for DateProcessorBuilder {
fn output_keys(&self) -> HashSet<&str> {
self.fields
.iter()
.map(|f| f.target_or_input_field())
.collect()
}

fn input_keys(&self) -> HashSet<&str> {
self.fields.iter().map(|f| f.input_field()).collect()
}

fn build(self, intermediate_keys: &[String]) -> Result<ProcessorKind, String> {
self.build(intermediate_keys).map(ProcessorKind::Date)
}
}

impl DateProcessorBuilder {
pub fn build(self, intermediate_keys: &[String]) -> Result<DateProcessor, String> {
let mut real_fields = vec![];
for field in self.fields.into_iter() {
let input = OneInputOneOutputField::build(
"date",
intermediate_keys,
field.input_field(),
field.target_or_input_field(),
)?;
real_fields.push(input);
}
Ok(DateProcessor {
fields: real_fields,
formats: self.formats,
timezone: self.timezone,
locale: self.locale,
ignore_missing: self.ignore_missing,
})
}
}

impl TryFrom<&yaml_rust::yaml::Hash> for DateProcessorBuilder {
type Error = String;

fn try_from(hash: &yaml_rust::yaml::Hash) -> Result<Self, Self::Error> {
let mut fields = Fields::default();
let mut formats = Formats::default();
let mut timezone = None;
let mut locale = None;
let mut ignore_missing = false;

for (k, v) in hash {
let key = k
.as_str()
.ok_or(format!("key must be a string, but got {k:?}"))?;

match key {
FIELD_NAME => {
fields = Fields::one(yaml_new_field(v, FIELD_NAME)?);
}
FIELDS_NAME => {
fields = yaml_new_fields(v, FIELDS_NAME)?;
}

FORMATS_NAME => {
let format_strs = yaml_strings(v, FORMATS_NAME)?;
if format_strs.is_empty() {
formats = Formats::new(DEFAULT_FORMATS.clone());
} else {
formats = Formats::new(format_strs.into_iter().map(Arc::new).collect());
}
}
TIMEZONE_NAME => {
timezone = Some(Arc::new(yaml_string(v, TIMEZONE_NAME)?));
}
LOCALE_NAME => {
locale = Some(Arc::new(yaml_string(v, LOCALE_NAME)?));
}
IGNORE_MISSING_NAME => {
ignore_missing = yaml_bool(v, IGNORE_MISSING_NAME)?;
}

_ => {}
}
}

let builder = DateProcessorBuilder {
fields,
formats,
timezone,
locale,
ignore_missing,
};

Ok(builder)
}
}

/// deprecated it should be removed in the future
/// Reserved for compatibility only
#[derive(Debug, Default)]
pub struct DateProcessor {
fields: Fields,

fields: Vec<OneInputOneOutputField>,
formats: Formats,
timezone: Option<Arc<String>>,
locale: Option<Arc<String>>, // to support locale
output_format: Option<Arc<String>>,

ignore_missing: bool,
// description
@@ -96,43 +205,6 @@ pub struct DateProcessor {
}

impl DateProcessor {
fn with_fields(&mut self, mut fields: Fields) {
update_one_one_output_keys(&mut fields);
self.fields = fields
}

fn with_formats(&mut self, v: Option<Vec<Arc<String>>>) {
let v = match v {
Some(v) if !v.is_empty() => v,
_ => DEFAULT_FORMATS.clone(),
};

let formats = Formats::new(v);
self.formats = formats;
}

fn with_timezone(&mut self, timezone: String) {
if !timezone.is_empty() {
self.timezone = Some(Arc::new(timezone));
}
}

fn with_locale(&mut self, locale: String) {
if !locale.is_empty() {
self.locale = Some(Arc::new(locale));
}
}

fn with_output_format(&mut self, output_format: String) {
if !output_format.is_empty() {
self.output_format = Some(Arc::new(output_format));
}
}

fn with_ignore_missing(&mut self, ignore_missing: bool) {
self.ignore_missing = ignore_missing;
}

fn parse(&self, val: &str) -> Result<Timestamp, String> {
let mut tz = Tz::UTC;
if let Some(timezone) = &self.timezone {
@@ -147,61 +219,6 @@ impl DateProcessor {

Err(format!("{} processor: failed to parse {val}", self.kind(),))
}

fn process_field(&self, val: &str, field: &Field) -> Result<Map, String> {
let key = field.get_target_field();

Ok(Map::one(key, Value::Timestamp(self.parse(val)?)))
}
}

impl TryFrom<&yaml_rust::yaml::Hash> for DateProcessor {
type Error = String;

fn try_from(hash: &yaml_rust::yaml::Hash) -> Result<Self, Self::Error> {
let mut processor = DateProcessor::default();

let mut formats_opt = None;

for (k, v) in hash {
let key = k
.as_str()
.ok_or(format!("key must be a string, but got {k:?}"))?;

match key {
FIELD_NAME => {
processor.with_fields(Fields::one(yaml_field(v, FIELD_NAME)?));
}
FIELDS_NAME => {
processor.with_fields(yaml_fields(v, FIELDS_NAME)?);
}

FORMATS_NAME => {
let formats = yaml_strings(v, FORMATS_NAME)?;
formats_opt = Some(formats.into_iter().map(Arc::new).collect());
}
TIMEZONE_NAME => {
processor.with_timezone(yaml_string(v, TIMEZONE_NAME)?);
}
LOCALE_NAME => {
processor.with_locale(yaml_string(v, LOCALE_NAME)?);
}
OUTPUT_FORMAT_NAME => {
processor.with_output_format(yaml_string(v, OUTPUT_FORMAT_NAME)?);
}

IGNORE_MISSING_NAME => {
processor.with_ignore_missing(yaml_bool(v, IGNORE_MISSING_NAME)?);
}

_ => {}
}
}

processor.with_formats(formats_opt);

Ok(processor)
}
}

impl Processor for DateProcessor {
@@ -213,53 +230,21 @@ impl Processor for DateProcessor {
self.ignore_missing
}

fn fields(&self) -> &Fields {
&self.fields
}

fn fields_mut(&mut self) -> &mut Fields {
&mut self.fields
}

fn output_keys(&self) -> HashSet<String> {
self.fields
.iter()
.map(|f| f.get_target_field().to_string())
.collect()
}

fn exec_field(&self, val: &Value, field: &Field) -> Result<Map, String> {
match val {
Value::String(s) => self.process_field(s, field),
_ => Err(format!(
"{} processor: expect string value, but got {val:?}",
self.kind()
)),
}
}

fn exec_mut(&self, val: &mut Vec<Value>) -> Result<(), String> {
for field in self.fields().iter() {
let index = field.input_field.index;
for field in self.fields.iter() {
let index = field.input_index();
match val.get(index) {
Some(Value::String(s)) => {
// TODO(qtang): Let this method use the intermediate state collection directly.
let mut map = self.process_field(s, field)?;
field
.output_fields_index_mapping
.iter()
.for_each(|(k, output_index)| {
if let Some(v) = map.remove(k) {
val[*output_index] = v;
}
});
let timestamp = self.parse(s)?;
let output_index = field.output_index();
val[output_index] = Value::Timestamp(timestamp);
}
Some(Value::Null) | None => {
if !self.ignore_missing {
return Err(format!(
"{} processor: missing field: {}",
self.kind(),
field.get_field_name()
field.input_name()
));
}
}
@@ -318,8 +303,7 @@ mod tests {

#[test]
fn test_parse() {
let mut processor = DateProcessor::default();
processor.with_formats(None);
let processor = DateProcessor::default();

let values: Vec<&str> = vec![
"2014-5-17T12:34:56",
@@ -340,7 +324,6 @@ mod tests {

#[test]
fn test_parse_with_formats() {
let mut processor = DateProcessor::default();
let formats = vec![
"%Y-%m-%dT%H:%M:%S%:z",
"%Y-%m-%dT%H:%M:%S%.3f%:z",
@@ -349,8 +332,11 @@ mod tests {
]
.into_iter()
.map(|s| Arc::new(s.to_string()))
.collect();
processor.with_formats(Some(formats));
.collect::<Vec<_>>();
let processor = DateProcessor {
formats: super::Formats(formats),
..Default::default()
};

let values: Vec<&str> = vec![
"2014-5-17T12:34:56",
@@ -371,9 +357,10 @@ mod tests {

#[test]
fn test_parse_with_timezone() {
let mut processor = DateProcessor::default();
processor.with_formats(None);
processor.with_timezone("Asia/Tokyo".to_string());
let processor = DateProcessor {
timezone: Some(Arc::new("Asia/Tokyo".to_string())),
..Default::default()
};

let values: Vec<&str> = vec![
"2014-5-17T12:34:56",
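The date processor's parse walks the configured formats in order and interprets the naive datetime in the configured timezone, defaulting to UTC. A rough sketch of that loop, assuming recent chrono/chrono-tz APIs (and_local_datetime handling via and_local_timezone, and timestamp_nanos_opt); the crate's real version returns its own Timestamp type and carries more context:

// Sketch of the format-probing parse loop (simplified; not the crate's code).
use chrono::NaiveDateTime;
use chrono_tz::Tz;

fn parse(val: &str, formats: &[&str], timezone: Option<&str>) -> Result<i64, String> {
    // Default to UTC when no `timezone` is configured, as the processor does.
    let tz: Tz = match timezone {
        Some(name) => name.parse::<Tz>().map_err(|e| e.to_string())?,
        None => Tz::UTC,
    };
    for fmt in formats {
        if let Ok(naive) = NaiveDateTime::parse_from_str(val, fmt) {
            // Reject ambiguous or non-existent local times instead of guessing.
            if let Some(dt) = naive.and_local_timezone(tz).single() {
                return dt
                    .timestamp_nanos_opt()
                    .ok_or_else(|| "timestamp out of range".to_string());
            }
        }
    }
    Err(format!("date processor: failed to parse {val}"))
}

With that shape, parse("2014-5-17T12:34:56", &["%Y-%m-%dT%H:%M:%S"], Some("Asia/Tokyo")) corresponds to the test_parse_with_timezone case above.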
File diff suppressed because it is too large
@@ -14,17 +14,17 @@

use ahash::HashSet;

use crate::etl::field::{Field, Fields};
use crate::etl::field::{Fields, OneInputOneOutputField};
use crate::etl::processor::{
update_one_one_output_keys, yaml_bool, yaml_field, yaml_fields, yaml_string, Processor,
FIELDS_NAME, FIELD_NAME, IGNORE_MISSING_NAME,
yaml_bool, yaml_new_field, yaml_new_fields, yaml_string, Processor, ProcessorBuilder,
ProcessorKind, FIELDS_NAME, FIELD_NAME, IGNORE_MISSING_NAME,
};
use crate::etl::value::time::{
MICROSECOND_RESOLUTION, MICRO_RESOLUTION, MILLISECOND_RESOLUTION, MILLI_RESOLUTION,
MS_RESOLUTION, NANOSECOND_RESOLUTION, NANO_RESOLUTION, NS_RESOLUTION, SECOND_RESOLUTION,
SEC_RESOLUTION, S_RESOLUTION, US_RESOLUTION,
};
use crate::etl::value::{Map, Timestamp, Value};
use crate::etl::value::{Timestamp, Value};

pub(crate) const PROCESSOR_EPOCH: &str = "epoch";
const RESOLUTION_NAME: &str = "resolution";
@@ -52,12 +52,56 @@ impl TryFrom<&str> for Resolution {
}
}

#[derive(Debug, Default)]
pub struct EpochProcessorBuilder {
fields: Fields,
resolution: Resolution,
ignore_missing: bool,
}

impl ProcessorBuilder for EpochProcessorBuilder {
fn output_keys(&self) -> HashSet<&str> {
self.fields
.iter()
.map(|f| f.target_or_input_field())
.collect()
}

fn input_keys(&self) -> HashSet<&str> {
self.fields.iter().map(|f| f.input_field()).collect()
}

fn build(self, intermediate_keys: &[String]) -> Result<ProcessorKind, String> {
self.build(intermediate_keys).map(ProcessorKind::Epoch)
}
}

impl EpochProcessorBuilder {
pub fn build(self, intermediate_keys: &[String]) -> Result<EpochProcessor, String> {
let mut real_fields = vec![];
for field in self.fields.into_iter() {
let input = OneInputOneOutputField::build(
"epoch",
intermediate_keys,
field.input_field(),
field.target_or_input_field(),
)?;
real_fields.push(input);
}
Ok(EpochProcessor {
fields: real_fields,
resolution: self.resolution,
ignore_missing: self.ignore_missing,
})
}
}

/// support string, integer, float, time, epoch
/// deprecated it should be removed in the future
/// Reserved for compatibility only
#[derive(Debug, Default)]
pub struct EpochProcessor {
fields: Fields,
fields: Vec<OneInputOneOutputField>,
resolution: Resolution,
ignore_missing: bool,
// description
@@ -68,19 +112,6 @@ pub struct EpochProcessor {
}

impl EpochProcessor {
fn with_fields(&mut self, mut fields: Fields) {
update_one_one_output_keys(&mut fields);
self.fields = fields
}

fn with_resolution(&mut self, resolution: Resolution) {
self.resolution = resolution;
}

fn with_ignore_missing(&mut self, ignore_missing: bool) {
self.ignore_missing = ignore_missing;
}

fn parse(&self, val: &Value) -> Result<Timestamp, String> {
let t: i64 = match val {
Value::String(s) => s
@@ -117,19 +148,15 @@ impl EpochProcessor {
Resolution::Nano => Ok(Timestamp::Nanosecond(t)),
}
}

fn process_field(&self, val: &Value, field: &Field) -> Result<Map, String> {
let key = field.get_target_field();

Ok(Map::one(key, Value::Timestamp(self.parse(val)?)))
}
}

impl TryFrom<&yaml_rust::yaml::Hash> for EpochProcessor {
impl TryFrom<&yaml_rust::yaml::Hash> for EpochProcessorBuilder {
type Error = String;

fn try_from(hash: &yaml_rust::yaml::Hash) -> Result<Self, Self::Error> {
let mut processor = EpochProcessor::default();
let mut fields = Fields::default();
let mut resolution = Resolution::default();
let mut ignore_missing = false;

for (k, v) in hash {
let key = k
@@ -138,24 +165,29 @@ impl TryFrom<&yaml_rust::yaml::Hash> for EpochProcessor {

match key {
FIELD_NAME => {
processor.with_fields(Fields::one(yaml_field(v, FIELD_NAME)?));
fields = Fields::one(yaml_new_field(v, FIELD_NAME)?);
}
FIELDS_NAME => {
processor.with_fields(yaml_fields(v, FIELDS_NAME)?);
fields = yaml_new_fields(v, FIELDS_NAME)?;
}
RESOLUTION_NAME => {
let s = yaml_string(v, RESOLUTION_NAME)?.as_str().try_into()?;
processor.with_resolution(s);
resolution = s;
}
IGNORE_MISSING_NAME => {
processor.with_ignore_missing(yaml_bool(v, IGNORE_MISSING_NAME)?);
ignore_missing = yaml_bool(v, IGNORE_MISSING_NAME)?;
}

_ => {}
}
}
let builder = EpochProcessorBuilder {
fields,
resolution,
ignore_missing,
};

Ok(processor)
Ok(builder)
}
}

@@ -168,49 +200,23 @@ impl Processor for EpochProcessor {
self.ignore_missing
}

fn fields(&self) -> &Fields {
&self.fields
}

fn fields_mut(&mut self) -> &mut Fields {
&mut self.fields
}

fn output_keys(&self) -> HashSet<String> {
self.fields
.iter()
.map(|f| f.get_target_field().to_string())
.collect()
}

fn exec_field(&self, val: &Value, field: &Field) -> Result<Map, String> {
self.process_field(val, field)
}

fn exec_mut(&self, val: &mut Vec<Value>) -> Result<(), String> {
for field in self.fields.iter() {
let index = field.input_field.index;
let index = field.input_index();
match val.get(index) {
Some(Value::Null) | None => {
if !self.ignore_missing {
return Err(format!(
"{} processor: missing field: {}",
self.kind(),
field.get_field_name()
field.input_name()
));
}
}
Some(v) => {
// TODO(qtang): Let this method use the intermediate state collection directly.
let mut map = self.process_field(v, field)?;
field
.output_fields_index_mapping
.iter()
.for_each(|(k, output_index)| {
if let Some(v) = map.remove(k) {
val[*output_index] = v;
}
});
let timestamp = self.parse(v)?;
let output_index = field.output_index();
val[output_index] = Value::Timestamp(timestamp);
}
}
}
@@ -225,8 +231,10 @@ mod tests {

#[test]
fn test_parse_epoch() {
let mut processor = EpochProcessor::default();
processor.with_resolution(super::Resolution::Second);
let processor = EpochProcessor {
resolution: super::Resolution::Second,
..Default::default()
};

let values = [
Value::String("1573840000".into()),
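The epoch processor is essentially a mapping from a raw integer plus a resolution tag onto a timestamp variant. A stand-alone sketch with stand-in enums (the crate's Resolution also parses aliases such as "s", "ms", "us", "ns", per the *_RESOLUTION constants imported above):

// Sketch of the resolution-to-timestamp mapping (stand-in types, not the
// crate's own definitions).
#[derive(Debug, Clone, Copy)]
enum Resolution { Second, Milli, Micro, Nano }

#[derive(Debug, PartialEq)]
enum Timestamp { Second(i64), Millisecond(i64), Microsecond(i64), Nanosecond(i64) }

fn to_timestamp(t: i64, resolution: Resolution) -> Timestamp {
    match resolution {
        Resolution::Second => Timestamp::Second(t),
        Resolution::Milli => Timestamp::Millisecond(t),
        Resolution::Micro => Timestamp::Microsecond(t),
        Resolution::Nano => Timestamp::Nanosecond(t),
    }
}

fn main() {
    // Matches the second-resolution expectation in test_parse_epoch.
    assert_eq!(to_timestamp(1573840000, Resolution::Second), Timestamp::Second(1573840000));
}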
@@ -15,45 +15,43 @@
use ahash::HashSet;
use regex::Regex;

use crate::etl::field::{Field, Fields};
use crate::etl::field::{Fields, OneInputOneOutputField};
use crate::etl::processor::{
update_one_one_output_keys, yaml_bool, yaml_field, yaml_fields, yaml_string, Processor,
FIELDS_NAME, FIELD_NAME, IGNORE_MISSING_NAME, PATTERN_NAME,
yaml_bool, yaml_new_field, yaml_new_fields, yaml_string, Processor, ProcessorBuilder,
ProcessorKind, FIELDS_NAME, FIELD_NAME, IGNORE_MISSING_NAME, PATTERN_NAME,
};
use crate::etl::value::{Array, Map, Value};
use crate::etl::value::Value;

pub(crate) const PROCESSOR_GSUB: &str = "gsub";

const REPLACEMENT_NAME: &str = "replacement";

/// A processor to replace all matches of a pattern in string by a replacement, only support string value, and array string value
#[derive(Debug, Default)]
pub struct GsubProcessor {
pub struct GsubProcessorBuilder {
fields: Fields,
pattern: Option<Regex>,
replacement: Option<String>,
ignore_missing: bool,
}

impl GsubProcessor {
fn with_fields(&mut self, mut fields: Fields) {
update_one_one_output_keys(&mut fields);
self.fields = fields;
impl ProcessorBuilder for GsubProcessorBuilder {
fn output_keys(&self) -> HashSet<&str> {
self.fields
.iter()
.map(|f| f.target_or_input_field())
.collect()
}

fn with_ignore_missing(&mut self, ignore_missing: bool) {
self.ignore_missing = ignore_missing;
fn input_keys(&self) -> HashSet<&str> {
self.fields.iter().map(|f| f.input_field()).collect()
}

fn try_pattern(&mut self, pattern: &str) -> Result<(), String> {
self.pattern = Some(Regex::new(pattern).map_err(|e| e.to_string())?);
Ok(())
}

fn with_replacement(&mut self, replacement: impl Into<String>) {
self.replacement = Some(replacement.into());
fn build(self, intermediate_keys: &[String]) -> Result<ProcessorKind, String> {
self.build(intermediate_keys).map(ProcessorKind::Gsub)
}
}

impl GsubProcessorBuilder {
fn check(self) -> Result<Self, String> {
if self.pattern.is_none() {
return Err("pattern is required".to_string());
@@ -66,7 +64,49 @@ impl GsubProcessor {
Ok(self)
}

fn process_string_field(&self, val: &str, field: &Field) -> Result<Map, String> {
fn build(self, intermediate_keys: &[String]) -> Result<GsubProcessor, String> {
let mut real_fields = vec![];
for field in self.fields.into_iter() {
let input = OneInputOneOutputField::build(
"gsub",
intermediate_keys,
field.input_field(),
field.target_or_input_field(),
)?;
real_fields.push(input);
}
Ok(GsubProcessor {
fields: real_fields,
pattern: self.pattern,
replacement: self.replacement,
ignore_missing: self.ignore_missing,
})
}
}

/// A processor to replace all matches of a pattern in string by a replacement, only support string value, and array string value
#[derive(Debug, Default)]
pub struct GsubProcessor {
fields: Vec<OneInputOneOutputField>,
pattern: Option<Regex>,
replacement: Option<String>,
ignore_missing: bool,
}

impl GsubProcessor {
fn check(self) -> Result<Self, String> {
if self.pattern.is_none() {
return Err("pattern is required".to_string());
}

if self.replacement.is_none() {
return Err("replacement is required".to_string());
}

Ok(self)
}

fn process_string(&self, val: &str) -> Result<Value, String> {
let replacement = self.replacement.as_ref().unwrap();
let new_val = self
.pattern
@@ -76,42 +116,28 @@ impl GsubProcessor {
.to_string();
let val = Value::String(new_val);

let key = field.get_target_field();

Ok(Map::one(key, val))
Ok(val)
}

fn process_array_field(&self, arr: &Array, field: &Field) -> Result<Map, String> {
let key = field.get_target_field();

let re = self.pattern.as_ref().unwrap();
let replacement = self.replacement.as_ref().unwrap();

let mut result = Array::default();
for val in arr.iter() {
match val {
Value::String(haystack) => {
let new_val = re.replace_all(haystack, replacement).to_string();
result.push(Value::String(new_val));
}
_ => {
return Err(format!(
"{} processor: expect string or array string, but got {val:?}",
self.kind()
))
}
}
fn process(&self, val: &Value) -> Result<Value, String> {
match val {
Value::String(val) => self.process_string(val),
_ => Err(format!(
"{} processor: expect string or array string, but got {val:?}",
self.kind()
)),
}

Ok(Map::one(key, Value::Array(result)))
}
}

impl TryFrom<&yaml_rust::yaml::Hash> for GsubProcessor {
impl TryFrom<&yaml_rust::yaml::Hash> for GsubProcessorBuilder {
type Error = String;

fn try_from(value: &yaml_rust::yaml::Hash) -> Result<Self, Self::Error> {
let mut processor = GsubProcessor::default();
let mut fields = Fields::default();
let mut ignore_missing = false;
let mut pattern = None;
let mut replacement = None;

for (k, v) in value.iter() {
let key = k
@@ -119,27 +145,36 @@ impl TryFrom<&yaml_rust::yaml::Hash> for GsubProcessor {
.ok_or(format!("key must be a string, but got {k:?}"))?;
match key {
FIELD_NAME => {
processor.with_fields(Fields::one(yaml_field(v, FIELD_NAME)?));
fields = Fields::one(yaml_new_field(v, FIELD_NAME)?);
}
FIELDS_NAME => {
processor.with_fields(yaml_fields(v, FIELDS_NAME)?);
fields = yaml_new_fields(v, FIELDS_NAME)?;
}
PATTERN_NAME => {
processor.try_pattern(&yaml_string(v, PATTERN_NAME)?)?;
let pattern_str = yaml_string(v, PATTERN_NAME)?;
pattern = Some(Regex::new(&pattern_str).map_err(|e| e.to_string())?);
}
REPLACEMENT_NAME => {
processor.with_replacement(yaml_string(v, REPLACEMENT_NAME)?);
let replacement_str = yaml_string(v, REPLACEMENT_NAME)?;
replacement = Some(replacement_str);
}

IGNORE_MISSING_NAME => {
processor.with_ignore_missing(yaml_bool(v, IGNORE_MISSING_NAME)?);
ignore_missing = yaml_bool(v, IGNORE_MISSING_NAME)?;
}

_ => {}
}
}

processor.check()
let builder = GsubProcessorBuilder {
fields,
pattern,
replacement,
ignore_missing,
};

builder.check()
}
}

@@ -152,56 +187,23 @@ impl crate::etl::processor::Processor for GsubProcessor {
self.ignore_missing
}

fn fields(&self) -> &Fields {
&self.fields
}

fn fields_mut(&mut self) -> &mut Fields {
&mut self.fields
}

fn output_keys(&self) -> HashSet<String> {
self.fields
.iter()
.map(|f| f.get_target_field().to_string())
.collect()
}

fn exec_field(&self, val: &Value, field: &Field) -> Result<Map, String> {
match val {
Value::String(val) => self.process_string_field(val, field),
Value::Array(arr) => self.process_array_field(arr, field),
_ => Err(format!(
"{} processor: expect string or array string, but got {val:?}",
self.kind()
)),
}
}

fn exec_mut(&self, val: &mut Vec<Value>) -> Result<(), String> {
for field in self.fields.iter() {
let index = field.input_field.index;
let index = field.input_index();
match val.get(index) {
Some(Value::Null) | None => {
if !self.ignore_missing {
return Err(format!(
"{} processor: missing field: {}",
self.kind(),
field.get_field_name()
field.input_name()
));
}
}
Some(v) => {
// TODO(qtang): Let this method use the intermediate state collection directly.
let mut map = self.exec_field(v, field)?;
field
.output_fields_index_mapping
.iter()
.for_each(|(k, output_index)| {
if let Some(v) = map.remove(k) {
val[*output_index] = v;
}
});
let result = self.process(v)?;
let output_index = field.output_index();
val[output_index] = result;
}
}
}
@@ -211,55 +213,20 @@ impl crate::etl::processor::Processor for GsubProcessor {

#[cfg(test)]
mod tests {
use crate::etl::field::Field;
use crate::etl::processor::gsub::GsubProcessor;
use crate::etl::processor::Processor;
use crate::etl::value::{Map, Value};
use crate::etl::value::Value;

#[test]
fn test_string_value() {
let mut processor = GsubProcessor::default();
processor.try_pattern(r"\d+").unwrap();
processor.with_replacement("xxx");
let processor = GsubProcessor {
pattern: Some(regex::Regex::new(r"\d+").unwrap()),
replacement: Some("xxx".to_string()),
..Default::default()
};

let field = Field::new("message");
let val = Value::String("123".to_string());
let result = processor.exec_field(&val, &field).unwrap();
let result = processor.process(&val).unwrap();

assert_eq!(
result,
Map::one("message", Value::String("xxx".to_string()))
);
}

#[test]
fn test_array_string_value() {
let mut processor = GsubProcessor::default();
processor.try_pattern(r"\d+").unwrap();
processor.with_replacement("xxx");

let field = Field::new("message");
let val = Value::Array(
vec![
Value::String("123".to_string()),
Value::String("456".to_string()),
]
.into(),
);
let result = processor.exec_field(&val, &field).unwrap();

assert_eq!(
result,
Map::one(
"message",
Value::Array(
vec![
Value::String("xxx".to_string()),
Value::String("xxx".to_string())
]
.into()
)
)
);
assert_eq!(result, Value::String("xxx".to_string()));
}
}
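At its core the reworked gsub step is a single Regex::replace_all over the input string, whose result lands in the field's output slot; the new process only accepts string values and returns an error for anything else. A minimal sketch using the regex crate directly:

// Sketch of the core gsub step (the processor wraps this in its own
// Value type and error messages).
use regex::Regex;

fn gsub(val: &str, pattern: &Regex, replacement: &str) -> String {
    // Replace every non-overlapping match, as Regex::replace_all does.
    pattern.replace_all(val, replacement).to_string()
}

fn main() {
    let re = Regex::new(r"\d+").unwrap();
    // "123" -> "xxx", matching the expectation in test_string_value.
    assert_eq!(gsub("123", &re, "xxx"), "xxx");
}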
@@ -14,40 +14,78 @@

use ahash::HashSet;

use crate::etl::field::{Field, Fields};
use crate::etl::field::{Fields, OneInputOneOutputField};
use crate::etl::processor::{
update_one_one_output_keys, yaml_bool, yaml_field, yaml_fields, yaml_string, Processor,
FIELDS_NAME, FIELD_NAME, IGNORE_MISSING_NAME, SEPARATOR_NAME,
yaml_bool, yaml_new_field, yaml_new_fields, yaml_string, Processor, ProcessorBuilder,
ProcessorKind, FIELDS_NAME, FIELD_NAME, IGNORE_MISSING_NAME, SEPARATOR_NAME,
};
use crate::etl::value::{Array, Map, Value};
use crate::etl::value::{Array, Value};

pub(crate) const PROCESSOR_JOIN: &str = "join";

/// A processor to join each element of an array into a single string using a separator string between each element
#[derive(Debug, Default)]
pub struct JoinProcessor {
pub struct JoinProcessorBuilder {
fields: Fields,
separator: Option<String>,
ignore_missing: bool,
}

impl ProcessorBuilder for JoinProcessorBuilder {
fn output_keys(&self) -> HashSet<&str> {
self.fields
.iter()
.map(|f| f.target_or_input_field())
.collect()
}

fn input_keys(&self) -> HashSet<&str> {
self.fields.iter().map(|f| f.input_field()).collect()
}

fn build(self, intermediate_keys: &[String]) -> Result<ProcessorKind, String> {
self.build(intermediate_keys).map(ProcessorKind::Join)
}
}

impl JoinProcessorBuilder {
fn check(self) -> Result<Self, String> {
if self.separator.is_none() {
return Err("separator is required".to_string());
}

Ok(self)
}

pub fn build(self, intermediate_keys: &[String]) -> Result<JoinProcessor, String> {
let mut real_fields = vec![];
for field in self.fields.into_iter() {
let input = OneInputOneOutputField::build(
"join",
intermediate_keys,
field.input_field(),
field.target_or_input_field(),
)?;
real_fields.push(input);
}

Ok(JoinProcessor {
fields: real_fields,
separator: self.separator,
ignore_missing: self.ignore_missing,
})
}
}

/// A processor to join each element of an array into a single string using a separator string between each element
#[derive(Debug, Default)]
pub struct JoinProcessor {
fields: Vec<OneInputOneOutputField>,
separator: Option<String>,
ignore_missing: bool,
}

impl JoinProcessor {
fn with_fields(&mut self, mut fields: Fields) {
update_one_one_output_keys(&mut fields);
self.fields = fields;
}

fn with_separator(&mut self, separator: impl Into<String>) {
self.separator = Some(separator.into());
}

fn with_ignore_missing(&mut self, ignore_missing: bool) {
self.ignore_missing = ignore_missing;
}

fn process_field(&self, arr: &Array, field: &Field) -> Result<Map, String> {
let key = field.get_target_field();

fn process(&self, arr: &Array) -> Result<Value, String> {
let sep = self.separator.as_ref().unwrap();
let val = arr
.iter()
@@ -55,7 +93,7 @@ impl JoinProcessor {
.collect::<Vec<String>>()
.join(sep);

Ok(Map::one(key, Value::String(val)))
Ok(Value::String(val))
}

fn check(self) -> Result<Self, String> {
@@ -67,11 +105,13 @@ impl JoinProcessor {
}
}

impl TryFrom<&yaml_rust::yaml::Hash> for JoinProcessor {
impl TryFrom<&yaml_rust::yaml::Hash> for JoinProcessorBuilder {
type Error = String;

fn try_from(value: &yaml_rust::yaml::Hash) -> Result<Self, Self::Error> {
let mut processor = JoinProcessor::default();
let mut fields = Fields::default();
let mut separator = None;
let mut ignore_missing = false;

for (k, v) in value.iter() {
let key = k
@@ -79,30 +119,31 @@ impl TryFrom<&yaml_rust::yaml::Hash> for JoinProcessor {
.ok_or(format!("key must be a string, but got {k:?}"))?;
match key {
FIELD_NAME => {
processor.with_fields(Fields::one(yaml_field(v, FIELD_NAME)?));
fields = Fields::one(yaml_new_field(v, FIELD_NAME)?);
}
FIELDS_NAME => {
processor.with_fields(yaml_fields(v, FIELDS_NAME)?);
fields = yaml_new_fields(v, FIELDS_NAME)?;
}
SEPARATOR_NAME => {
processor.with_separator(yaml_string(v, SEPARATOR_NAME)?);
separator = Some(yaml_string(v, SEPARATOR_NAME)?);
}
IGNORE_MISSING_NAME => {
processor.with_ignore_missing(yaml_bool(v, IGNORE_MISSING_NAME)?);
ignore_missing = yaml_bool(v, IGNORE_MISSING_NAME)?;
}
_ => {}
}
}

processor.check()
let builder = JoinProcessorBuilder {
fields,
separator,
ignore_missing,
};
builder.check()
}
}

impl Processor for JoinProcessor {
fn fields(&self) -> &Fields {
&self.fields
}

fn kind(&self) -> &str {
PROCESSOR_JOIN
}
@@ -111,49 +152,21 @@ impl Processor for JoinProcessor {
self.ignore_missing
}

fn fields_mut(&mut self) -> &mut Fields {
&mut self.fields
}

fn output_keys(&self) -> HashSet<String> {
self.fields
.iter()
.map(|f| f.get_target_field().to_string())
.collect()
}

fn exec_field(&self, val: &Value, field: &Field) -> Result<Map, String> {
match val {
Value::Array(arr) => self.process_field(arr, field),
_ => Err(format!(
"{} processor: expect array value, but got {val:?}",
self.kind()
)),
}
}

fn exec_mut(&self, val: &mut Vec<Value>) -> Result<(), String> {
for field in self.fields.iter() {
let index = field.input_field.index;
let index = field.input_index();
match val.get(index) {
Some(Value::Array(arr)) => {
// TODO(qtang): Let this method use the intermediate state collection directly.
let mut map = self.process_field(arr, field)?;
field
.output_fields_index_mapping
.iter()
.for_each(|(k, output_index)| {
if let Some(v) = map.remove(k) {
val[*output_index] = v;
}
});
let result = self.process(arr)?;
let output_index = field.output_index();
val[output_index] = result;
}
Some(Value::Null) | None => {
if !self.ignore_missing {
return Err(format!(
"{} processor: missing field: {}",
self.kind(),
field.get_field_name()
field.input_name()
));
}
}
@@ -173,25 +186,22 @@ impl Processor for JoinProcessor {
#[cfg(test)]
mod tests {

use crate::etl::field::Field;
use crate::etl::processor::join::JoinProcessor;
use crate::etl::processor::Processor;
use crate::etl::value::{Map, Value};
use crate::etl::value::Value;

#[test]
fn test_join_processor() {
let mut processor = JoinProcessor::default();
processor.with_separator("-");
let processor = JoinProcessor {
separator: Some("-".to_string()),
..Default::default()
};

let field = Field::new("test");
let arr = Value::Array(
vec![
Value::String("a".to_string()),
Value::String("b".to_string()),
]
.into(),
);
let result = processor.exec_field(&arr, &field).unwrap();
assert_eq!(result, Map::one("test", Value::String("a-b".to_string())));
let arr = vec![
Value::String("a".to_string()),
Value::String("b".to_string()),
]
.into();
let result = processor.process(&arr).unwrap();
assert_eq!(result, Value::String("a-b".to_string()));
}
}
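The join step itself stays simple: stringify each array element and interleave the separator, exactly as JoinProcessor::process does above. A sketch with a stand-in Value type:

// Sketch of the join step (Value here is a one-variant stand-in for the
// crate's value type).
#[derive(Debug)]
enum Value {
    String(String),
}

impl Value {
    fn to_str_value(&self) -> String {
        match self {
            Value::String(s) => s.clone(),
        }
    }
}

fn join(arr: &[Value], sep: &str) -> String {
    arr.iter().map(|v| v.to_str_value()).collect::<Vec<String>>().join(sep)
}

fn main() {
    let arr = vec![Value::String("a".into()), Value::String("b".into())];
    // Mirrors test_join_processor: ["a", "b"] with "-" gives "a-b".
    assert_eq!(join(&arr, "-"), "a-b");
}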
@@ -14,12 +14,12 @@

use ahash::HashSet;

use crate::etl::field::{Field, Fields};
use crate::etl::field::{Fields, OneInputOneOutputField};
use crate::etl::processor::{
update_one_one_output_keys, yaml_bool, yaml_field, yaml_fields, yaml_string, Processor,
FIELDS_NAME, FIELD_NAME, IGNORE_MISSING_NAME, METHOD_NAME,
yaml_bool, yaml_new_field, yaml_new_fields, yaml_string, Processor, ProcessorBuilder,
ProcessorKind, FIELDS_NAME, FIELD_NAME, IGNORE_MISSING_NAME, METHOD_NAME,
};
use crate::etl::value::{Map, Value};
use crate::etl::value::Value;

pub(crate) const PROCESSOR_LETTER: &str = "letter";

@@ -54,29 +54,61 @@ impl std::str::FromStr for Method {
}
}

/// only support string value
#[derive(Debug, Default)]
pub struct LetterProcessor {
pub struct LetterProcessorBuilder {
fields: Fields,
method: Method,
ignore_missing: bool,
}

impl ProcessorBuilder for LetterProcessorBuilder {
fn output_keys(&self) -> HashSet<&str> {
self.fields
.iter()
.map(|f| f.target_or_input_field())
.collect()
}

fn input_keys(&self) -> HashSet<&str> {
self.fields.iter().map(|f| f.input_field()).collect()
}

fn build(self, intermediate_keys: &[String]) -> Result<ProcessorKind, String> {
self.build(intermediate_keys).map(ProcessorKind::Letter)
}
}

impl LetterProcessorBuilder {
pub fn build(self, intermediate_keys: &[String]) -> Result<LetterProcessor, String> {
let mut real_fields = vec![];
for field in self.fields.into_iter() {
let input = OneInputOneOutputField::build(
"letter",
intermediate_keys,
field.input_field(),
field.target_or_input_field(),
)?;
real_fields.push(input);
}

Ok(LetterProcessor {
fields: real_fields,
method: self.method,
ignore_missing: self.ignore_missing,
})
}
}

/// only support string value
#[derive(Debug, Default)]
pub struct LetterProcessor {
fields: Vec<OneInputOneOutputField>,
method: Method,
ignore_missing: bool,
}

impl LetterProcessor {
fn with_fields(&mut self, mut fields: Fields) {
update_one_one_output_keys(&mut fields);
self.fields = fields;
}

fn with_method(&mut self, method: Method) {
self.method = method;
}

fn with_ignore_missing(&mut self, ignore_missing: bool) {
self.ignore_missing = ignore_missing;
}

fn process_field(&self, val: &str, field: &Field) -> Result<Map, String> {
fn process_field(&self, val: &str) -> Result<Value, String> {
let processed = match self.method {
Method::Upper => val.to_uppercase(),
Method::Lower => val.to_lowercase(),
@@ -84,17 +116,17 @@ impl LetterProcessor {
};
let val = Value::String(processed);

let key = field.get_target_field();

Ok(Map::one(key, val))
Ok(val)
}
}

impl TryFrom<&yaml_rust::yaml::Hash> for LetterProcessor {
impl TryFrom<&yaml_rust::yaml::Hash> for LetterProcessorBuilder {
type Error = String;

fn try_from(value: &yaml_rust::yaml::Hash) -> Result<Self, Self::Error> {
let mut processor = LetterProcessor::default();
let mut fields = Fields::default();
let mut method = Method::Lower;
let mut ignore_missing = false;

for (k, v) in value.iter() {
let key = k
@@ -102,23 +134,26 @@ impl TryFrom<&yaml_rust::yaml::Hash> for LetterProcessor {
.ok_or(format!("key must be a string, but got {k:?}"))?;
match key {
FIELD_NAME => {
processor.with_fields(Fields::one(yaml_field(v, FIELD_NAME)?));
fields = Fields::one(yaml_new_field(v, FIELD_NAME)?);
}
FIELDS_NAME => {
processor.with_fields(yaml_fields(v, FIELDS_NAME)?);
fields = yaml_new_fields(v, FIELDS_NAME)?;
}
METHOD_NAME => {
let method = yaml_string(v, METHOD_NAME)?;
processor.with_method(method.parse()?);
method = yaml_string(v, METHOD_NAME)?.parse()?;
}
IGNORE_MISSING_NAME => {
processor.with_ignore_missing(yaml_bool(v, IGNORE_MISSING_NAME)?);
ignore_missing = yaml_bool(v, IGNORE_MISSING_NAME)?;
}
_ => {}
}
}

Ok(processor)
Ok(LetterProcessorBuilder {
fields,
method,
ignore_missing,
})
}
}

@@ -131,53 +166,21 @@ impl Processor for LetterProcessor {
self.ignore_missing
}

fn fields(&self) -> &Fields {
&self.fields
}

fn fields_mut(&mut self) -> &mut Fields {
&mut self.fields
}

fn output_keys(&self) -> HashSet<String> {
self.fields
.iter()
.map(|f| f.get_target_field().to_string())
.collect()
}

fn exec_field(&self, val: &Value, field: &Field) -> Result<Map, String> {
match val {
Value::String(val) => self.process_field(val, field),
_ => Err(format!(
"{} processor: expect string value, but got {val:?}",
self.kind()
)),
}
}

fn exec_mut(&self, val: &mut Vec<Value>) -> Result<(), String> {
for field in self.fields.iter() {
let index = field.input_field.index;
let index = field.input_index();
match val.get(index) {
Some(Value::String(s)) => {
// TODO(qtang): Let this method use the intermediate state collection directly.
let mut processed = self.process_field(s, field)?;
field
.output_fields_index_mapping
.iter()
.for_each(|(k, output_index)| {
if let Some(v) = processed.remove(k) {
val[*output_index] = v;
}
});
let result = self.process_field(s)?;
let (_, output_index) = field.output();
val[*output_index] = result;
}
Some(Value::Null) | None => {
if !self.ignore_missing {
return Err(format!(
"{} processor: missing field: {}",
self.kind(),
field.get_field_name()
&field.input().name
));
}
}
@@ -204,33 +207,36 @@ fn capitalize(s: &str) -> String {

#[cfg(test)]
mod tests {
use crate::etl::field::Fields;
use crate::etl::processor::letter::{LetterProcessor, Method};
use crate::etl::value::{Map, Value};
use crate::etl::value::Value;

#[test]
fn test_process() {
let field = "letter";
let ff: crate::etl::processor::Field = field.parse().unwrap();
let mut processor = LetterProcessor::default();
processor.with_fields(Fields::one(ff.clone()));

{
processor.with_method(Method::Upper);
let processed = processor.process_field("pipeline", &ff).unwrap();
assert_eq!(Map::one(field, Value::String("PIPELINE".into())), processed)
let processor = LetterProcessor {
method: Method::Upper,
..Default::default()
};
let processed = processor.process_field("pipeline").unwrap();
assert_eq!(Value::String("PIPELINE".into()), processed)
}

{
processor.with_method(Method::Lower);
let processed = processor.process_field("Pipeline", &ff).unwrap();
assert_eq!(Map::one(field, Value::String("pipeline".into())), processed)
let processor = LetterProcessor {
method: Method::Lower,
..Default::default()
};
let processed = processor.process_field("Pipeline").unwrap();
assert_eq!(Value::String("pipeline".into()), processed)
}

{
processor.with_method(Method::Capital);
let processed = processor.process_field("pipeline", &ff).unwrap();
assert_eq!(Map::one(field, Value::String("Pipeline".into())), processed)
let processor = LetterProcessor {
method: Method::Capital,
..Default::default()
};
let processed = processor.process_field("pipeline").unwrap();
assert_eq!(Value::String("Pipeline".into()), processed)
}
}
}
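The hunk header above references capitalize, whose body the diff does not show. A plausible sketch of the three letter methods, with capitalize written in the usual first-character-uppercase idiom (the crate's exact implementation may differ):

// Sketch of the letter methods; `capitalize` is an assumed implementation.
fn capitalize(s: &str) -> String {
    let mut chars = s.chars();
    match chars.next() {
        None => String::new(),
        // Uppercase the first character, keep the rest untouched.
        Some(first) => first.to_uppercase().collect::<String>() + chars.as_str(),
    }
}

fn apply(method: &str, val: &str) -> String {
    match method {
        "upper" => val.to_uppercase(),
        "lower" => val.to_lowercase(),
        _ => capitalize(val),
    }
}

fn main() {
    // Mirrors the expectations in test_process above.
    assert_eq!(apply("upper", "pipeline"), "PIPELINE");
    assert_eq!(apply("lower", "Pipeline"), "pipeline");
    assert_eq!(apply("capital", "pipeline"), "Pipeline");
}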
@@ -18,16 +18,17 @@ const PATTERNS_NAME: &str = "patterns";
|
||||
|
||||
pub(crate) const PROCESSOR_REGEX: &str = "regex";
|
||||
|
||||
use ahash::HashSet;
|
||||
use ahash::{HashSet, HashSetExt};
|
||||
use lazy_static::lazy_static;
|
||||
use regex::Regex;
|
||||
|
||||
use crate::etl::field::Fields;
|
||||
use crate::etl::field::{Fields, InputFieldInfo, OneInputMultiOutputField};
|
||||
use crate::etl::find_key_index;
|
||||
use crate::etl::processor::{
|
||||
yaml_bool, yaml_field, yaml_fields, yaml_string, yaml_strings, Field, Processor, FIELDS_NAME,
|
||||
FIELD_NAME, IGNORE_MISSING_NAME, PATTERN_NAME,
|
||||
yaml_bool, yaml_new_field, yaml_new_fields, yaml_string, yaml_strings, Processor,
|
||||
ProcessorBuilder, ProcessorKind, FIELDS_NAME, FIELD_NAME, IGNORE_MISSING_NAME, PATTERN_NAME,
|
||||
};
|
||||
use crate::etl::value::{Map, Value};
|
||||
use crate::etl::value::Value;
|
||||
|
||||
lazy_static! {
|
||||
static ref GROUPS_NAME_REGEX: Regex = Regex::new(r"\(\?P?<([[:word:]]+)>.+?\)").unwrap();
|
||||
@@ -40,6 +41,10 @@ fn get_regex_group_names(s: &str) -> Vec<String> {
|
||||
.collect()
|
||||
}
|
||||
|
||||
fn generate_key(prefix: &str, group: &str) -> String {
|
||||
format!("{prefix}_{group}")
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
struct GroupRegex {
|
||||
origin: String,
|
||||
@@ -72,34 +77,29 @@ impl std::str::FromStr for GroupRegex {
|
||||
}
|
||||
}
|
||||
|
||||
/// only support string value
|
||||
/// if no value found from a pattern, the target_field will be ignored
|
||||
#[derive(Debug, Default)]
|
||||
pub struct RegexProcessor {
|
||||
pub struct RegexProcessorBuilder {
|
||||
fields: Fields,
|
||||
patterns: Vec<GroupRegex>,
|
||||
ignore_missing: bool,
|
||||
output_keys: HashSet<String>,
|
||||
}
|
||||
|
||||
impl RegexProcessor {
|
||||
fn with_fields(&mut self, fields: Fields) {
|
||||
self.fields = fields;
|
||||
impl ProcessorBuilder for RegexProcessorBuilder {
|
||||
fn output_keys(&self) -> HashSet<&str> {
|
||||
self.output_keys.iter().map(|k| k.as_str()).collect()
|
||||
}
|
||||
|
||||
fn try_with_patterns(&mut self, patterns: Vec<String>) -> Result<(), String> {
|
||||
let mut rs = vec![];
|
||||
for pattern in patterns {
|
||||
let gr = pattern.parse()?;
|
||||
rs.push(gr);
|
||||
}
|
||||
self.patterns = rs;
|
||||
Ok(())
|
||||
fn input_keys(&self) -> HashSet<&str> {
|
||||
self.fields.iter().map(|f| f.input_field()).collect()
|
||||
}
|
||||
|
||||
fn with_ignore_missing(&mut self, ignore_missing: bool) {
|
||||
self.ignore_missing = ignore_missing;
|
||||
fn build(self, intermediate_keys: &[String]) -> Result<ProcessorKind, String> {
|
||||
self.build(intermediate_keys).map(ProcessorKind::Regex)
|
||||
}
|
||||
}
|
||||
|
||||
impl RegexProcessorBuilder {
|
||||
fn check(self) -> Result<Self, String> {
|
||||
if self.fields.is_empty() {
|
||||
return Err(format!(
|
||||
@@ -118,47 +118,78 @@ impl RegexProcessor {
|
||||
Ok(self)
|
||||
}
|
||||
|
||||
fn generate_key(prefix: &str, group: &str) -> String {
|
||||
format!("{prefix}_{group}")
|
||||
fn build_group_output_info(
|
||||
group_regex: &GroupRegex,
|
||||
om_field: &OneInputMultiOutputField,
|
||||
intermediate_keys: &[String],
|
||||
) -> Result<Vec<OutPutInfo>, String> {
|
||||
group_regex
|
||||
.groups
|
||||
.iter()
|
||||
.map(|g| {
|
||||
let key = generate_key(om_field.target_prefix(), g);
|
||||
let index = find_key_index(intermediate_keys, &key, "regex");
|
||||
index.map(|index| OutPutInfo {
|
||||
final_key: key,
|
||||
group_name: g.to_string(),
|
||||
index,
|
||||
})
|
||||
})
|
||||
.collect::<Result<Vec<_>, String>>()
|
||||
}
|
||||
|
||||
fn process_field(&self, val: &str, field: &Field, gr: &GroupRegex) -> Result<Map, String> {
|
||||
let mut map = Map::default();
|
||||
|
||||
if let Some(captures) = gr.regex.captures(val) {
|
||||
for group in &gr.groups {
|
||||
if let Some(capture) = captures.name(group) {
|
||||
let value = capture.as_str().to_string();
|
||||
let prefix = field.get_target_field();
|
||||
|
||||
let key = Self::generate_key(prefix, group);
|
||||
|
||||
map.insert(key, Value::String(value));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(map)
|
||||
fn build_group_output_infos(
|
||||
patterns: &[GroupRegex],
|
||||
om_field: &OneInputMultiOutputField,
|
||||
intermediate_keys: &[String],
|
||||
) -> Result<Vec<Vec<OutPutInfo>>, String> {
|
||||
patterns
|
||||
.iter()
|
||||
.map(|group_regex| {
|
||||
Self::build_group_output_info(group_regex, om_field, intermediate_keys)
|
||||
})
|
||||
.collect::<Result<Vec<_>, String>>()
|
||||
}
|
||||
|
||||
fn update_output_keys(&mut self) {
|
||||
for field in self.fields.iter_mut() {
|
||||
for gr in &self.patterns {
|
||||
for group in &gr.groups {
|
||||
field
|
||||
.output_fields_index_mapping
|
||||
.insert(Self::generate_key(field.get_target_field(), group), 0_usize);
|
||||
}
|
||||
}
|
||||
fn build_output_info(
|
||||
real_fields: &[OneInputMultiOutputField],
|
||||
patterns: &[GroupRegex],
|
||||
intermediate_keys: &[String],
|
||||
) -> Result<RegexProcessorOutputInfo, String> {
|
||||
let inner = real_fields
|
||||
.iter()
|
||||
.map(|om_field| Self::build_group_output_infos(patterns, om_field, intermediate_keys))
|
||||
.collect::<Result<Vec<_>, String>>();
|
||||
inner.map(|inner| RegexProcessorOutputInfo { inner })
|
||||
}
|
||||
|
||||
fn build(self, intermediate_keys: &[String]) -> Result<RegexProcessor, String> {
|
||||
let mut real_fields = vec![];
|
||||
for field in self.fields.into_iter() {
|
||||
let input_index = find_key_index(intermediate_keys, field.input_field(), "regex")?;
|
||||
let input_field_info = InputFieldInfo::new(field.input_field(), input_index);
|
||||
|
||||
let input = OneInputMultiOutputField::new(input_field_info, field.target_field);
|
||||
real_fields.push(input);
|
||||
}
|
||||
let output_info = Self::build_output_info(&real_fields, &self.patterns, intermediate_keys)?;
|
||||
Ok(RegexProcessor {
|
||||
// fields: Fields::one(Field::new("test".to_string())),
|
||||
fields: real_fields,
|
||||
patterns: self.patterns,
|
||||
output_info,
|
||||
ignore_missing: self.ignore_missing,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
impl TryFrom<&yaml_rust::yaml::Hash> for RegexProcessor {
|
||||
impl TryFrom<&yaml_rust::yaml::Hash> for RegexProcessorBuilder {
|
||||
type Error = String;
|
||||
|
||||
fn try_from(value: &yaml_rust::yaml::Hash) -> Result<Self, Self::Error> {
|
||||
let mut processor = RegexProcessor::default();
|
||||
let mut fields = Fields::default();
|
||||
let mut patterns: Vec<GroupRegex> = vec![];
|
||||
let mut ignore_missing = false;
|
||||
|
||||
for (k, v) in value.iter() {
|
||||
let key = k
|
||||
@@ -166,28 +197,113 @@ impl TryFrom<&yaml_rust::yaml::Hash> for RegexProcessor {
|
||||
.ok_or(format!("key must be a string, but got {k:?}"))?;
|
||||
match key {
|
||||
FIELD_NAME => {
|
||||
processor.with_fields(Fields::one(yaml_field(v, FIELD_NAME)?));
|
||||
fields = Fields::one(yaml_new_field(v, FIELD_NAME)?);
|
||||
}
|
||||
FIELDS_NAME => {
|
||||
processor.with_fields(yaml_fields(v, FIELDS_NAME)?);
|
||||
fields = yaml_new_fields(v, FIELDS_NAME)?;
|
||||
}
|
||||
PATTERN_NAME => {
|
||||
processor.try_with_patterns(vec![yaml_string(v, PATTERN_NAME)?])?;
|
||||
let pattern = yaml_string(v, PATTERN_NAME)?;
|
||||
let gr = pattern.parse()?;
|
||||
patterns.push(gr);
|
||||
}
|
||||
PATTERNS_NAME => {
|
||||
processor.try_with_patterns(yaml_strings(v, PATTERNS_NAME)?)?;
|
||||
for pattern in yaml_strings(v, PATTERNS_NAME)? {
|
||||
let gr = pattern.parse()?;
|
||||
patterns.push(gr);
|
||||
}
|
||||
}
|
||||
IGNORE_MISSING_NAME => {
|
||||
processor.with_ignore_missing(yaml_bool(v, IGNORE_MISSING_NAME)?);
|
||||
ignore_missing = yaml_bool(v, IGNORE_MISSING_NAME)?;
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
}
|
||||
|
||||
processor.check().map(|mut p| {
|
||||
p.update_output_keys();
|
||||
p
|
||||
})
|
||||
let pattern_output_keys = patterns
|
||||
.iter()
|
||||
.flat_map(|pattern| pattern.groups.iter())
|
||||
.collect::<Vec<_>>();
|
||||
let mut output_keys = HashSet::new();
|
||||
for field in fields.iter() {
|
||||
for x in pattern_output_keys.iter() {
|
||||
output_keys.insert(generate_key(field.target_or_input_field(), x));
|
||||
}
|
||||
}
|
||||
|
||||
let processor_builder = RegexProcessorBuilder {
|
||||
fields,
|
||||
patterns,
|
||||
ignore_missing,
|
||||
output_keys,
|
||||
};
|
||||
|
||||
processor_builder.check()
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Default)]
|
||||
struct OutPutInfo {
|
||||
final_key: String,
|
||||
group_name: String,
|
||||
index: usize,
|
||||
}
|
||||
|
||||
#[derive(Debug, Default)]
|
||||
struct RegexProcessorOutputInfo {
|
||||
pub inner: Vec<Vec<Vec<OutPutInfo>>>,
|
||||
}
|
||||
|
||||
impl RegexProcessorOutputInfo {
|
||||
fn get_output_index(
|
||||
&self,
|
||||
field_index: usize,
|
||||
pattern_index: usize,
|
||||
group_index: usize,
|
||||
) -> usize {
|
||||
self.inner[field_index][pattern_index][group_index].index
|
||||
}
|
||||
}
|
||||
/// only support string value
|
||||
/// if no value found from a pattern, the target_field will be ignored
|
||||
#[derive(Debug, Default)]
|
||||
pub struct RegexProcessor {
|
||||
fields: Vec<OneInputMultiOutputField>,
|
||||
output_info: RegexProcessorOutputInfo,
|
||||
patterns: Vec<GroupRegex>,
|
||||
ignore_missing: bool,
|
||||
}
|
||||
|
||||
impl RegexProcessor {
|
||||
fn try_with_patterns(&mut self, patterns: Vec<String>) -> Result<(), String> {
|
||||
let mut rs = vec![];
|
||||
for pattern in patterns {
|
||||
let gr = pattern.parse()?;
|
||||
rs.push(gr);
|
||||
}
|
||||
self.patterns = rs;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn process(
|
||||
&self,
|
||||
val: &str,
|
||||
gr: &GroupRegex,
|
||||
index: (usize, usize),
|
||||
) -> Result<Vec<(usize, Value)>, String> {
|
||||
let mut result = Vec::new();
|
||||
if let Some(captures) = gr.regex.captures(val) {
|
||||
for (group_index, group) in gr.groups.iter().enumerate() {
|
||||
if let Some(capture) = captures.name(group) {
|
||||
let value = capture.as_str().to_string();
|
||||
let index = self
|
||||
.output_info
|
||||
.get_output_index(index.0, index.1, group_index);
|
||||
result.push((index, Value::String(value)));
|
||||
}
|
||||
}
|
||||
}
|
||||
Ok(result)
|
||||
}
|
||||
}

@@ -200,71 +316,40 @@ impl Processor for RegexProcessor {
        self.ignore_missing
    }

    fn fields(&self) -> &Fields {
        &self.fields
    }

    fn fields_mut(&mut self) -> &mut Fields {
        &mut self.fields
    }

    fn output_keys(&self) -> HashSet<String> {
        self.fields
            .iter()
            .flat_map(|f| {
                self.patterns.iter().flat_map(move |p| {
                    p.groups
                        .iter()
                        .map(move |g| Self::generate_key(&f.input_field.name, g))
                })
            })
            .collect()
    }

    fn exec_field(&self, val: &Value, field: &Field) -> Result<Map, String> {
        match val {
            Value::String(val) => {
                let mut map = Map::default();
                for gr in &self.patterns {
                    let m = self.process_field(val, field, gr)?;
                    map.extend(m);
                }
                Ok(map)
            }
            _ => Err(format!(
                "{} processor: expect string value, but got {val:?}",
                self.kind()
            )),
        }
    }

    fn exec_mut(&self, val: &mut Vec<Value>) -> Result<(), String> {
        for field in self.fields.iter() {
            let index = field.input_field.index;
        for (field_index, field) in self.fields.iter().enumerate() {
            let index = field.input_index();
            let mut result_list = None;
            match val.get(index) {
                Some(Value::String(s)) => {
                    let mut map = Map::default();
                    for gr in &self.patterns {
                        // TODO(qtang): Let this method use the intermediate state collection directly.
                        let m = self.process_field(s, field, gr)?;
                        map.extend(m);
                    }

                    field
                        .output_fields_index_mapping
                        .iter()
                        .for_each(|(k, output_index)| {
                            if let Some(v) = map.remove(k) {
                                val[*output_index] = v;
                    // The single-pass version below trips the borrow checker:
                    // `s` borrows `*val` immutably, so `val[output_index] = result;`
                    // inside the loop would need a mutable borrow of `*val` while the
                    // immutable one is still live. We collect the results first and
                    // write them back once the borrow has ended.
                    // for (gr_index, gr) in self.patterns.iter().enumerate() {
                    //     let result_list = self.process(s.as_str(), gr, (field_index, gr_index))?;
                    //     for (output_index, result) in result_list {
                    //         val[output_index] = result;
                    //     }
                    // }
                    for (gr_index, gr) in self.patterns.iter().enumerate() {
                        let result = self.process(s.as_str(), gr, (field_index, gr_index))?;
                        if !result.is_empty() {
                            match result_list.as_mut() {
                                None => {
                                    result_list = Some(result);
                                }
                                Some(result_list) => {
                                    result_list.extend(result);
                                }
                            }
                        });
                        }
                    }
                }
                Some(Value::Null) | None => {
                    if !self.ignore_missing {
                        return Err(format!(
                            "{} processor: missing field: {}",
                            self.kind(),
                            field.get_field_name()
                            field.input_name()
                        ));
                    }
                }
@@ -275,6 +360,15 @@ impl Processor for RegexProcessor {
                        ));
                    }
                }
                // All borrows of `val` are done; write back the collected results.
                match result_list {
                    None => {}
                    Some(result_list) => {
                        for (output_index, result) in result_list {
                            val[output_index] = result;
                        }
                    }
                }
            }

        Ok(())
@@ -282,37 +376,42 @@ impl Processor for RegexProcessor {
}
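
Note: the pattern used above — collect first, write back after the source borrow ends — is the standard way around this borrow-checker conflict. A standalone sketch (plain `String`s instead of pipeline `Value`s) that compiles where the commented-out single-pass version would not:

fn apply(values: &mut Vec<String>, input_index: usize) {
    let mut pending: Vec<(usize, String)> = Vec::new();
    if let Some(s) = values.get(input_index) {
        // `s` immutably borrows `values`, so we only record the writes here.
        pending.push((0, s.to_uppercase()));
    }
    // The immutable borrow has ended; now the writes are safe.
    for (i, v) in pending {
        values[i] = v;
    }
}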

#[cfg(test)]
mod tests {
    use ahash::{HashMap, HashMapExt};
    use itertools::Itertools;

    use super::RegexProcessor;
    use crate::etl::field::Fields;
    use crate::etl::processor::Processor;
    use crate::etl::processor::regex::RegexProcessorBuilder;
    use crate::etl::value::{Map, Value};

    #[test]
    fn test_simple_parse() {
        let mut processor = RegexProcessor::default();
        let pipeline_str = r#"fields: ["a"]
patterns: ['(?<ar>\d)']
ignore_missing: false"#;

        let processor_yaml = yaml_rust::YamlLoader::load_from_str(pipeline_str)
            .unwrap()
            .pop()
            .unwrap();
        let processor_yaml_hash = processor_yaml.as_hash().unwrap();
        let builder = RegexProcessorBuilder::try_from(processor_yaml_hash).unwrap();
        let intermediate_keys = ["a".to_string(), "a_ar".to_string()];
        let processor = builder.build(&intermediate_keys).unwrap();

        // single field (with prefix), multiple patterns
        let f = ["a"].iter().map(|f| f.parse().unwrap()).collect();
        processor.with_fields(Fields::new(f).unwrap());

        let ar = "(?<ar>\\d)";
        let result = processor
            .process("123", &processor.patterns[0], (0, 0))
            .unwrap()
            .into_iter()
            .map(|(k, v)| (intermediate_keys[k].clone(), v))
            .collect();

        let patterns = [ar].iter().map(|p| p.to_string()).collect();
        processor.try_with_patterns(patterns).unwrap();

        let mut map = Map::default();
        map.insert("a", Value::String("123".to_string()));
        processor.exec_map(&mut map).unwrap();
        let map = Map { values: result };

        let v = Map {
            values: vec![
                ("a_ar".to_string(), Value::String("1".to_string())),
                ("a".to_string(), Value::String("123".to_string())),
            ]
            .into_iter()
            .collect(),
            values: vec![("a_ar".to_string(), Value::String("1".to_string()))]
                .into_iter()
                .collect(),
        };

        assert_eq!(v, map);
@@ -320,17 +419,14 @@ mod tests {

    #[test]
    fn test_process() {
        let mut processor = RegexProcessor::default();

        let cc = "[c=c,n=US_CA_SANJOSE,o=55155]";
        let cg = "[a=12.34.567.89,b=12345678,c=g,n=US_CA_SANJOSE,o=20940]";
        let co = "[a=987.654.321.09,c=o]";
        let cp = "[c=p,n=US_CA_SANJOSE,o=55155]";
        let cw = "[c=w,n=US_CA_SANJOSE,o=55155]";
        let breadcrumbs = Value::String([cc, cg, co, cp, cw].iter().join(","));
        let breadcrumbs_str = [cc, cg, co, cp, cw].iter().join(",");

        let values = [
            ("breadcrumbs", breadcrumbs.clone()),
            ("breadcrumbs_parent", Value::String(cc.to_string())),
            ("breadcrumbs_edge", Value::String(cg.to_string())),
            ("breadcrumbs_origin", Value::String(co.to_string())),
@@ -340,61 +436,141 @@ mod tests {
        .into_iter()
        .map(|(k, v)| (k.to_string(), v))
        .collect();
        let mut temporary_map = Map { values };
        let temporary_map = Map { values };

        {
            // single field (with prefix), multiple patterns
            let ff = ["breadcrumbs, breadcrumbs"]
                .iter()
                .map(|f| f.parse().unwrap())
                .collect();
            processor.with_fields(Fields::new(ff).unwrap());

            let ccr = "(?<parent>\\[[^\\[]*c=c[^\\]]*\\])";
            let cgr = "(?<edge>\\[[^\\[]*c=g[^\\]]*\\])";
            let cor = "(?<origin>\\[[^\\[]*c=o[^\\]]*\\])";
            let cpr = "(?<peer>\\[[^\\[]*c=p[^\\]]*\\])";
            let cwr = "(?<wrapper>\\[[^\\[]*c=w[^\\]]*\\])";
            let patterns = [ccr, cgr, cor, cpr, cwr]
                .iter()
                .map(|p| p.to_string())
                .collect();
            processor.try_with_patterns(patterns).unwrap();
            let pipeline_str = r#"fields: ["breadcrumbs"]
patterns:
  - '(?<parent>\[[^\[]*c=c[^\]]*\])'
  - '(?<edge>\[[^\[]*c=g[^\]]*\])'
  - '(?<origin>\[[^\[]*c=o[^\]]*\])'
  - '(?<peer>\[[^\[]*c=p[^\]]*\])'
  - '(?<wrapper>\[[^\[]*c=w[^\]]*\])'
ignore_missing: false"#;

            let mut map = Map::default();
            map.insert("breadcrumbs", breadcrumbs.clone());
            processor.exec_map(&mut map).unwrap();

            assert_eq!(map, temporary_map);
            let processor_yaml = yaml_rust::YamlLoader::load_from_str(pipeline_str)
                .unwrap()
                .pop()
                .unwrap();
            let processor_yaml_hash = processor_yaml.as_hash().unwrap();
            let builder = RegexProcessorBuilder::try_from(processor_yaml_hash).unwrap();
            let intermediate_keys = [
                "breadcrumbs",
                "breadcrumbs_parent",
                "breadcrumbs_edge",
                "breadcrumbs_origin",
                "breadcrumbs_peer",
                "breadcrumbs_wrapper",
            ]
            .iter()
            .map(|k| k.to_string())
            .collect_vec();
            let processor = builder.build(&intermediate_keys).unwrap();
            let mut result = HashMap::new();
            for (index, pattern) in processor.patterns.iter().enumerate() {
                let r = processor
                    .process(&breadcrumbs_str, pattern, (0, index))
                    .unwrap()
                    .into_iter()
                    .map(|(k, v)| (intermediate_keys[k].clone(), v))
                    .collect::<HashMap<_, _>>();
                result.extend(r);
            }
            let map = Map { values: result };
            assert_eq!(temporary_map, map);
        }

        {
            // multiple fields (with prefix), multiple patterns
            let ff = [
                "breadcrumbs_parent, parent",
                "breadcrumbs_edge, edge",
                "breadcrumbs_origin, origin",
                "breadcrumbs_peer, peer",
                "breadcrumbs_wrapper, wrapper",
            ]
            .iter()
            .map(|f| f.parse().unwrap())
            .collect();
            processor.with_fields(Fields::new(ff).unwrap());

            let patterns = [
                "a=(?<ip>[^,\\]]+)",
                "b=(?<request_id>[^,\\]]+)",
                "k=(?<request_end_time>[^,\\]]+)",
                "l=(?<turn_around_time>[^,\\]]+)",
                "m=(?<dns_lookup_time>[^,\\]]+)",
                "n=(?<geo>[^,\\]]+)",
                "o=(?<asn>[^,\\]]+)",
            let pipeline_str = r#"fields:
  - breadcrumbs_parent, parent
  - breadcrumbs_edge, edge
  - breadcrumbs_origin, origin
  - breadcrumbs_peer, peer
  - breadcrumbs_wrapper, wrapper
patterns:
  - 'a=(?<ip>[^,\]]+)'
  - 'b=(?<request_id>[^,\]]+)'
  - 'k=(?<request_end_time>[^,\]]+)'
  - 'l=(?<turn_around_time>[^,\]]+)'
  - 'm=(?<dns_lookup_time>[^,\]]+)'
  - 'n=(?<geo>[^,\]]+)'
  - 'o=(?<asn>[^,\]]+)'
ignore_missing: false"#;

            let processor_yaml = yaml_rust::YamlLoader::load_from_str(pipeline_str)
                .unwrap()
                .pop()
                .unwrap();
            let processor_yaml_hash = processor_yaml.as_hash().unwrap();
            let builder = RegexProcessorBuilder::try_from(processor_yaml_hash).unwrap();

            let intermediate_keys = [
                "breadcrumbs_parent",
                "breadcrumbs_edge",
                "breadcrumbs_origin",
                "breadcrumbs_peer",
                "breadcrumbs_wrapper",
                "edge_ip",
                "edge_request_id",
                "edge_request_end_time",
                "edge_turn_around_time",
                "edge_dns_lookup_time",
                "edge_geo",
                "edge_asn",
                "origin_ip",
                "origin_request_id",
                "origin_request_end_time",
                "origin_turn_around_time",
                "origin_dns_lookup_time",
                "origin_geo",
                "origin_asn",
                "peer_ip",
                "peer_request_id",
                "peer_request_end_time",
                "peer_turn_around_time",
                "peer_dns_lookup_time",
                "peer_geo",
                "peer_asn",
                "parent_ip",
                "parent_request_id",
                "parent_request_end_time",
                "parent_turn_around_time",
                "parent_dns_lookup_time",
                "parent_geo",
                "parent_asn",
                "wrapper_ip",
                "wrapper_request_id",
                "wrapper_request_end_time",
                "wrapper_turn_around_time",
                "wrapper_dns_lookup_time",
                "wrapper_geo",
                "wrapper_asn",
            ]
            .iter()
            .map(|p| p.to_string())
            .collect();
            processor.try_with_patterns(patterns).unwrap();
            .map(|k| k.to_string())
            .collect_vec();
            let processor = builder.build(&intermediate_keys).unwrap();

            let mut result = HashMap::new();
            for (field_index, field) in processor.fields.iter().enumerate() {
                for (pattern_index, pattern) in processor.patterns.iter().enumerate() {
                    let s = temporary_map
                        .get(field.input_name())
                        .unwrap()
                        .to_str_value();
                    let r = processor
                        .process(&s, pattern, (field_index, pattern_index))
                        .unwrap()
                        .into_iter()
                        .map(|(k, v)| (intermediate_keys[k].clone(), v))
                        .collect::<HashMap<_, _>>();
                    result.extend(r);
                }
            }

            let new_values = vec![
                ("edge_ip", Value::String("12.34.567.89".to_string())),
@@ -413,11 +589,7 @@ mod tests {
            .map(|(k, v)| (k.to_string(), v))
            .collect();

            let mut expected_map = temporary_map.clone();
            processor.exec_map(&mut temporary_map).unwrap();
            expected_map.extend(Map { values: new_values });

            assert_eq!(expected_map, temporary_map);
            assert_eq!(result, new_values);
        }
    }
}
@@ -19,18 +19,17 @@ use chrono::{DateTime, NaiveDateTime};
use chrono_tz::Tz;
use lazy_static::lazy_static;

use super::yaml_strings;
use crate::etl::field::{Field, Fields};
use crate::etl::field::{Fields, OneInputOneOutputField};
use crate::etl::processor::{
    update_one_one_output_keys, yaml_bool, yaml_field, yaml_fields, yaml_string, Processor,
    FIELDS_NAME, FIELD_NAME, IGNORE_MISSING_NAME,
    yaml_bool, yaml_new_field, yaml_new_fields, yaml_string, yaml_strings, Processor,
    ProcessorBuilder, ProcessorKind, FIELDS_NAME, FIELD_NAME, IGNORE_MISSING_NAME,
};
use crate::etl::value::time::{
    MICROSECOND_RESOLUTION, MICRO_RESOLUTION, MILLISECOND_RESOLUTION, MILLI_RESOLUTION,
    MS_RESOLUTION, NANOSECOND_RESOLUTION, NANO_RESOLUTION, NS_RESOLUTION, SECOND_RESOLUTION,
    SEC_RESOLUTION, S_RESOLUTION, US_RESOLUTION,
};
use crate::etl::value::{Map, Timestamp, Value};
use crate::etl::value::{Timestamp, Value};

pub(crate) const PROCESSOR_TIMESTAMP: &str = "timestamp";
const RESOLUTION_NAME: &str = "resolution";
@@ -108,10 +107,56 @@ impl std::ops::Deref for Formats {
    }
}

#[derive(Debug)]
pub struct TimestampProcessorBuilder {
    fields: Fields,
    formats: Formats,
    resolution: Resolution,
    ignore_missing: bool,
}

impl ProcessorBuilder for TimestampProcessorBuilder {
    fn output_keys(&self) -> HashSet<&str> {
        self.fields
            .iter()
            .map(|f| f.target_or_input_field())
            .collect()
    }

    fn input_keys(&self) -> HashSet<&str> {
        self.fields.iter().map(|f| f.input_field()).collect()
    }

    fn build(self, intermediate_keys: &[String]) -> Result<ProcessorKind, String> {
        self.build(intermediate_keys).map(ProcessorKind::Timestamp)
    }
}

impl TimestampProcessorBuilder {
    pub fn build(self, intermediate_keys: &[String]) -> Result<TimestampProcessor, String> {
        let mut real_fields = vec![];
        for field in self.fields.into_iter() {
            let input = OneInputOneOutputField::build(
                "timestamp",
                intermediate_keys,
                field.input_field(),
                field.target_or_input_field(),
            )?;
            real_fields.push(input);
        }
        Ok(TimestampProcessor {
            fields: real_fields,
            formats: self.formats,
            resolution: self.resolution,
            ignore_missing: self.ignore_missing,
        })
    }
}
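
Note: the builder pattern here defers name-to-index resolution until the full list of intermediate keys is known. A minimal sketch of such a resolution helper, assuming a simple linear search (the real `find_key_index` and `OneInputOneOutputField::build` are defined elsewhere in this crate):

// Hypothetical helper: resolve a field name to its position in the
// pipeline's intermediate-state key list, with a processor name for context
// in the error message.
fn find_key_index(intermediate_keys: &[String], key: &str, context: &str) -> Result<usize, String> {
    intermediate_keys
        .iter()
        .position(|k| k == key)
        .ok_or_else(|| format!("{} processor: key '{}' not found in intermediate keys", context, key))
}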

/// Supports string, integer, float, time, and epoch values.
#[derive(Debug, Default)]
pub struct TimestampProcessor {
    fields: Fields,
    fields: Vec<OneInputOneOutputField>,
    formats: Formats,
    resolution: Resolution,
    ignore_missing: bool,
@@ -123,29 +168,6 @@ pub struct TimestampProcessor {
}

impl TimestampProcessor {
    fn with_fields(&mut self, mut fields: Fields) {
        update_one_one_output_keys(&mut fields);
        self.fields = fields
    }

    fn with_resolution(&mut self, resolution: Resolution) {
        self.resolution = resolution;
    }

    fn with_formats(&mut self, v: Option<Vec<(Arc<String>, Tz)>>) {
        let v = match v {
            Some(v) if !v.is_empty() => v,
            _ => DEFAULT_FORMATS.clone(),
        };

        let formats = Formats::new(v);
        self.formats = formats;
    }

    fn with_ignore_missing(&mut self, ignore_missing: bool) {
        self.ignore_missing = ignore_missing;
    }

    /// Try to parse `val` with a timezone first; if that fails, parse it without one.
    fn try_parse(val: &str, fmt: &str, tz: Tz) -> Result<i64, String> {
        if let Ok(dt) = DateTime::parse_from_str(val, fmt) {
@@ -212,12 +234,6 @@ impl TimestampProcessor {
            Resolution::Nano => Ok(Timestamp::Nanosecond(t)),
        }
    }

    fn process_field(&self, val: &Value, field: &Field) -> Result<Map, String> {
        let key = field.get_target_field();

        Ok(Map::one(key, Value::Timestamp(self.parse(val)?)))
    }
}
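
Note: `try_parse` first treats the input as carrying its own offset, then falls back to interpreting it as a naive time in the configured timezone. A minimal sketch of that two-step fallback with chrono (epoch seconds for brevity; `try_parse_seconds` is a hypothetical name, and the real resolution handling lives in `parse`):

use chrono::{DateTime, NaiveDateTime, TimeZone};
use chrono_tz::Tz;

fn try_parse_seconds(val: &str, fmt: &str, tz: Tz) -> Result<i64, String> {
    // 1) The string carries an explicit offset, e.g. "%Y-%m-%d %H:%M:%S %z".
    if let Ok(dt) = DateTime::parse_from_str(val, fmt) {
        return Ok(dt.timestamp());
    }
    // 2) No offset in the string: parse as naive and attach the configured tz.
    let naive = NaiveDateTime::parse_from_str(val, fmt).map_err(|e| e.to_string())?;
    tz.from_local_datetime(&naive)
        .single()
        .map(|dt| dt.timestamp())
        .ok_or_else(|| format!("ambiguous or invalid local time: {val}"))
}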

fn parse_formats(yaml: &yaml_rust::yaml::Yaml) -> Result<Vec<(Arc<String>, Tz)>, String> {
@@ -250,11 +266,14 @@ fn parse_formats(yaml: &yaml_rust::yaml::Yaml) -> Result<Vec<(Arc<String>, Tz)>,
    };
}

impl TryFrom<&yaml_rust::yaml::Hash> for TimestampProcessor {
impl TryFrom<&yaml_rust::yaml::Hash> for TimestampProcessorBuilder {
    type Error = String;

    fn try_from(hash: &yaml_rust::yaml::Hash) -> Result<Self, Self::Error> {
        let mut processor = TimestampProcessor::default();
        let mut fields = Fields::default();
        let mut formats = Formats::default();
        let mut resolution = Resolution::default();
        let mut ignore_missing = false;

        for (k, v) in hash {
            let key = k
@@ -263,28 +282,33 @@ impl TryFrom<&yaml_rust::yaml::Hash> for TimestampProcessor {

            match key {
                FIELD_NAME => {
                    processor.with_fields(Fields::one(yaml_field(v, FIELD_NAME)?));
                    fields = Fields::one(yaml_new_field(v, FIELD_NAME)?);
                }
                FIELDS_NAME => {
                    processor.with_fields(yaml_fields(v, FIELDS_NAME)?);
                    fields = yaml_new_fields(v, FIELDS_NAME)?;
                }
                FORMATS_NAME => {
                    let formats = parse_formats(v)?;
                    processor.with_formats(Some(formats));
                    let formats_vec = parse_formats(v)?;
                    formats = Formats::new(formats_vec);
                }
                RESOLUTION_NAME => {
                    let s = yaml_string(v, RESOLUTION_NAME)?.as_str().try_into()?;
                    processor.with_resolution(s);
                    resolution = yaml_string(v, RESOLUTION_NAME)?.as_str().try_into()?;
                }
                IGNORE_MISSING_NAME => {
                    processor.with_ignore_missing(yaml_bool(v, IGNORE_MISSING_NAME)?);
                    ignore_missing = yaml_bool(v, IGNORE_MISSING_NAME)?;
                }

                _ => {}
            }
        }

        Ok(processor)
        let processor_builder = TimestampProcessorBuilder {
            fields,
            formats,
            resolution,
            ignore_missing,
        };

        Ok(processor_builder)
    }
}

@@ -297,49 +321,23 @@ impl Processor for TimestampProcessor {
        self.ignore_missing
    }

    fn fields(&self) -> &Fields {
        &self.fields
    }

    fn fields_mut(&mut self) -> &mut Fields {
        &mut self.fields
    }

    fn output_keys(&self) -> HashSet<String> {
        self.fields
            .iter()
            .map(|f| f.get_target_field().to_string())
            .collect()
    }

    fn exec_field(&self, val: &Value, field: &Field) -> Result<Map, String> {
        self.process_field(val, field)
    }

    fn exec_mut(&self, val: &mut Vec<Value>) -> Result<(), String> {
        for field in self.fields.iter() {
            let index = field.input_field.index;
            let index = field.input().index;
            match val.get(index) {
                Some(Value::Null) | None => {
                    if !self.ignore_missing {
                        return Err(format!(
                            "{} processor: missing field: {}",
                            self.kind(),
                            field.get_field_name()
                            &field.input().name
                        ));
                    }
                }
                Some(v) => {
                    // TODO(qtang): Let this method use the intermediate state collection directly.
                    let mut map = self.process_field(v, field)?;
                    field
                        .output_fields_index_mapping
                        .iter()
                        .for_each(|(k, output_index)| {
                            if let Some(v) = map.remove(k) {
                                val[*output_index] = v;
                            }
                        });
                    let result = self.parse(v)?;
                    let (_, index) = field.output();
                    val[*index] = Value::Timestamp(result);
                }
            }
        }
@@ -351,9 +349,18 @@ impl Processor for TimestampProcessor {
mod tests {
    use yaml_rust::YamlLoader;

    use super::TimestampProcessor;
    use super::{TimestampProcessor, TimestampProcessorBuilder};
    use crate::etl::value::{Timestamp, Value};

    fn builder_to_native_processor(builder: TimestampProcessorBuilder) -> TimestampProcessor {
        TimestampProcessor {
            fields: vec![],
            formats: builder.formats,
            resolution: builder.resolution,
            ignore_missing: builder.ignore_missing,
        }
    }

    #[test]
    fn test_parse_epoch() {
        let processor_yaml_str = r#"fields:
@@ -367,7 +374,9 @@ formats:
"#;
        let yaml = &YamlLoader::load_from_str(processor_yaml_str).unwrap()[0];
        let timestamp_yaml = yaml.as_hash().unwrap();
        let processor = TimestampProcessor::try_from(timestamp_yaml).unwrap();
        let processor = builder_to_native_processor(
            TimestampProcessorBuilder::try_from(timestamp_yaml).unwrap(),
        );

        let values = [
            (
@@ -419,7 +428,9 @@ formats:
"#;
        let yaml = &YamlLoader::load_from_str(processor_yaml_str).unwrap()[0];
        let timestamp_yaml = yaml.as_hash().unwrap();
        let processor = TimestampProcessor::try_from(timestamp_yaml).unwrap();
        let processor = builder_to_native_processor(
            TimestampProcessorBuilder::try_from(timestamp_yaml).unwrap(),
        );

        let values: Vec<&str> = vec![
            "2014-5-17T12:34:56",
@@ -15,12 +15,12 @@
use ahash::HashSet;
use urlencoding::{decode, encode};

use crate::etl::field::{Field, Fields};
use crate::etl::field::{Fields, OneInputOneOutputField};
use crate::etl::processor::{
    yaml_bool, yaml_field, yaml_fields, yaml_string, FIELDS_NAME, FIELD_NAME, IGNORE_MISSING_NAME,
    METHOD_NAME,
    yaml_bool, yaml_new_field, yaml_new_fields, yaml_string, ProcessorBuilder, ProcessorKind,
    FIELDS_NAME, FIELD_NAME, IGNORE_MISSING_NAME, METHOD_NAME,
};
use crate::etl::value::{Map, Value};
use crate::etl::value::Value;

pub(crate) const PROCESSOR_URL_ENCODING: &str = "urlencoding";

@@ -52,54 +52,76 @@ impl std::str::FromStr for Method {
    }
}

/// Only string values are supported.
#[derive(Debug, Default)]
pub struct UrlEncodingProcessor {
pub struct UrlEncodingProcessorBuilder {
    fields: Fields,
    method: Method,
    ignore_missing: bool,
}

impl ProcessorBuilder for UrlEncodingProcessorBuilder {
    fn output_keys(&self) -> HashSet<&str> {
        self.fields
            .iter()
            .map(|f| f.target_or_input_field())
            .collect()
    }

    fn input_keys(&self) -> HashSet<&str> {
        self.fields.iter().map(|f| f.input_field()).collect()
    }

    fn build(self, intermediate_keys: &[String]) -> Result<ProcessorKind, String> {
        self.build(intermediate_keys)
            .map(ProcessorKind::UrlEncoding)
    }
}

impl UrlEncodingProcessorBuilder {
    fn build(self, intermediate_keys: &[String]) -> Result<UrlEncodingProcessor, String> {
        let mut real_fields = vec![];
        for field in self.fields.into_iter() {
            let input = OneInputOneOutputField::build(
                "urlencoding",
                intermediate_keys,
                field.input_field(),
                field.target_or_input_field(),
            )?;
            real_fields.push(input);
        }
        Ok(UrlEncodingProcessor {
            fields: real_fields,
            method: self.method,
            ignore_missing: self.ignore_missing,
        })
    }
}

/// Only string values are supported.
#[derive(Debug, Default)]
pub struct UrlEncodingProcessor {
    fields: Vec<OneInputOneOutputField>,
    method: Method,
    ignore_missing: bool,
}

impl UrlEncodingProcessor {
    fn with_fields(&mut self, mut fields: Fields) {
        Self::update_output_keys(&mut fields);
        self.fields = fields;
    }

    fn with_ignore_missing(&mut self, ignore_missing: bool) {
        self.ignore_missing = ignore_missing;
    }

    fn with_method(&mut self, method: Method) {
        self.method = method;
    }

    fn process_field(&self, val: &str, field: &Field) -> Result<Map, String> {
    fn process_field(&self, val: &str) -> Result<Value, String> {
        let processed = match self.method {
            Method::Encode => encode(val).to_string(),
            Method::Decode => decode(val).map_err(|e| e.to_string())?.into_owned(),
        };
        let val = Value::String(processed);

        let key = field.get_target_field();

        Ok(Map::one(key, val))
    }

    fn update_output_keys(fields: &mut Fields) {
        for field in fields.iter_mut() {
            field
                .output_fields_index_mapping
                .insert(field.get_target_field().to_string(), 0_usize);
        }
        Ok(Value::String(processed))
    }
}

impl TryFrom<&yaml_rust::yaml::Hash> for UrlEncodingProcessor {
impl TryFrom<&yaml_rust::yaml::Hash> for UrlEncodingProcessorBuilder {
    type Error = String;

    fn try_from(value: &yaml_rust::yaml::Hash) -> Result<Self, Self::Error> {
        let mut processor = UrlEncodingProcessor::default();
        let mut fields = Fields::default();
        let mut method = Method::Decode;
        let mut ignore_missing = false;

        for (k, v) in value.iter() {
            let key = k
@@ -107,24 +129,29 @@ impl TryFrom<&yaml_rust::yaml::Hash> for UrlEncodingProcessor {
                .ok_or(format!("key must be a string, but got {k:?}"))?;
            match key {
                FIELD_NAME => {
                    processor.with_fields(Fields::one(yaml_field(v, FIELD_NAME)?));
                    fields = Fields::one(yaml_new_field(v, FIELD_NAME)?);
                }
                FIELDS_NAME => {
                    processor.with_fields(yaml_fields(v, FIELDS_NAME)?);
                    fields = yaml_new_fields(v, FIELDS_NAME)?;
                }

                IGNORE_MISSING_NAME => {
                    processor.with_ignore_missing(yaml_bool(v, IGNORE_MISSING_NAME)?);
                    ignore_missing = yaml_bool(v, IGNORE_MISSING_NAME)?;
                }

                METHOD_NAME => {
                    let method = yaml_string(v, METHOD_NAME)?;
                    processor.with_method(method.parse()?);
                    let method_str = yaml_string(v, METHOD_NAME)?;
                    method = method_str.parse()?;
                }

                _ => {}
            }
        }
        let processor = UrlEncodingProcessorBuilder {
            fields,
            method,
            ignore_missing,
        };

        Ok(processor)
    }
@@ -139,52 +166,21 @@ impl crate::etl::processor::Processor for UrlEncodingProcessor {
        self.ignore_missing
    }

    fn fields(&self) -> &Fields {
        &self.fields
    }

    fn fields_mut(&mut self) -> &mut Fields {
        &mut self.fields
    }

    fn output_keys(&self) -> HashSet<String> {
        self.fields
            .iter()
            .map(|f| f.get_target_field().to_string())
            .collect()
    }

    fn exec_field(&self, val: &Value, field: &Field) -> Result<Map, String> {
        match val {
            Value::String(val) => self.process_field(val, field),
            _ => Err(format!(
                "{} processor: expect string value, but got {val:?}",
                self.kind()
            )),
        }
    }

    fn exec_mut(&self, val: &mut Vec<Value>) -> Result<(), String> {
        for field in self.fields.iter() {
            let index = field.input_field.index;
            let index = field.input_index();
            match val.get(index) {
                Some(Value::String(s)) => {
                    let mut map = self.process_field(s, field)?;
                    field
                        .output_fields_index_mapping
                        .iter()
                        .for_each(|(k, output_index)| {
                            if let Some(v) = map.remove(k) {
                                val[*output_index] = v;
                            }
                        });
                    let result = self.process_field(s)?;
                    let output_index = field.output_index();
                    val[output_index] = result;
                }
                Some(Value::Null) | None => {
                    if !self.ignore_missing {
                        return Err(format!(
                            "{} processor: missing field: {}",
                            self.kind(),
                            field.get_field_name()
                            field.output_name()
                        ));
                    }
                }
@@ -202,29 +198,28 @@ impl crate::etl::processor::Processor for UrlEncodingProcessor {

#[cfg(test)]
mod tests {
    use crate::etl::field::{Field, Fields};

    use crate::etl::processor::urlencoding::UrlEncodingProcessor;
    use crate::etl::value::{Map, Value};
    use crate::etl::value::Value;

    #[test]
    fn test_decode_url() {
        let field = "url";
        let ff: Field = field.parse().unwrap();

        let decoded = "//BC/[a=6.7.8.9,c=g,k=0,l=1]";
        let encoded = "%2F%2FBC%2F%5Ba%3D6.7.8.9%2Cc%3Dg%2Ck%3D0%2Cl%3D1%5D";

        let mut processor = UrlEncodingProcessor::default();
        processor.with_fields(Fields::one(ff.clone()));

        {
            let result = processor.process_field(encoded, &ff).unwrap();
            assert_eq!(Map::one(field, Value::String(decoded.into())), result)
            let processor = UrlEncodingProcessor::default();
            let result = processor.process_field(encoded).unwrap();
            assert_eq!(Value::String(decoded.into()), result)
        }
        {
            processor.with_method(super::Method::Encode);
            let result = processor.process_field(decoded, &ff).unwrap();
            assert_eq!(Map::one(field, Value::String(encoded.into())), result)
            let processor = UrlEncodingProcessor {
                fields: vec![],
                method: super::Method::Encode,
                ignore_missing: false,
            };
            let result = processor.process_field(decoded).unwrap();
            assert_eq!(Value::String(encoded.into()), result)
        }
    }
}
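
Note: the `urlencoding` crate's `encode` and `decode` both return `Cow<str>`, which is why the processor above converts with `to_string()`/`into_owned()`. A quick round-trip sketch of those two calls:

fn roundtrip() -> Result<(), String> {
    let raw = "//BC/[a=6.7.8.9,c=g]";
    let encoded = urlencoding::encode(raw).to_string();
    let decoded = urlencoding::decode(&encoded)
        .map_err(|e| e.to_string())?
        .into_owned();
    assert_eq!(raw, decoded);
    Ok(())
}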

@@ -17,8 +17,8 @@ pub mod transformer;

use itertools::Itertools;

use crate::etl::field::Fields;
use crate::etl::processor::{update_one_one_output_keys, yaml_field, yaml_fields, yaml_string};
use crate::etl::find_key_index;
use crate::etl::processor::yaml_string;
use crate::etl::transform::index::Index;
use crate::etl::value::Value;

@@ -31,6 +31,9 @@ const TRANSFORM_ON_FAILURE: &str = "on_failure";

pub use transformer::greptime::GreptimeTransformer;

use super::field::{Fields, InputFieldInfo, OneInputOneOutputField};
use super::processor::{yaml_new_field, yaml_new_fields};

pub trait Transformer: std::fmt::Display + Sized + Send + Sync + 'static {
    type Output;
    type VecOutput;
@@ -39,12 +42,11 @@ pub trait Transformer: std::fmt::Display + Sized + Send + Sync + 'static {
    fn schemas(&self) -> &Vec<greptime_proto::v1::ColumnSchema>;
    fn transforms(&self) -> &Transforms;
    fn transforms_mut(&mut self) -> &mut Transforms;
    fn transform(&self, val: Value) -> Result<Self::Output, String>;
    fn transform_mut(&self, val: &mut Vec<Value>) -> Result<Self::VecOutput, String>;
}

/// On-failure behavior when a transform fails.
#[derive(Debug, Clone, Default)]
#[derive(Debug, Clone, Default, Copy)]
pub enum OnFailure {
    // Return None if the transform fails.
    #[default]
@@ -74,12 +76,18 @@ impl std::fmt::Display for OnFailure {
        }
    }
}
#[derive(Debug, Default, Clone)]
pub struct TransformBuilders {
    pub(crate) builders: Vec<TransformBuilder>,
    pub(crate) output_keys: Vec<String>,
    pub(crate) required_keys: Vec<String>,
}

#[derive(Debug, Default, Clone)]
pub struct Transforms {
    transforms: Vec<Transform>,
    output_keys: Vec<String>,
    required_keys: Vec<String>,
    pub(crate) transforms: Vec<Transform>,
    pub(crate) output_keys: Vec<String>,
    pub(crate) required_keys: Vec<String>,
}

impl Transforms {
@@ -130,7 +138,7 @@ impl std::ops::DerefMut for Transforms {
    }
}

impl TryFrom<&Vec<yaml_rust::Yaml>> for Transforms {
impl TryFrom<&Vec<yaml_rust::Yaml>> for TransformBuilders {
    type Error = String;

    fn try_from(docs: &Vec<yaml_rust::Yaml>) -> Result<Self, Self::Error> {
@@ -138,41 +146,78 @@ impl TryFrom<&Vec<yaml_rust::Yaml>> for Transforms {
        let mut all_output_keys: Vec<String> = Vec::with_capacity(100);
        let mut all_required_keys = Vec::with_capacity(100);
        for doc in docs {
            let transform: Transform = doc
            let transform_builder: TransformBuilder = doc
                .as_hash()
                .ok_or("transform element must be a map".to_string())?
                .try_into()?;
            let mut transform_output_keys = transform
            let mut transform_output_keys = transform_builder
                .fields
                .iter()
                .map(|f| f.get_target_field().to_string())
                .map(|f| f.target_or_input_field().to_string())
                .collect();
            all_output_keys.append(&mut transform_output_keys);

            let mut transform_required_keys = transform
            let mut transform_required_keys = transform_builder
                .fields
                .iter()
                .map(|f| f.input_field.name.clone())
                .map(|f| f.input_field().to_string())
                .collect();
            all_required_keys.append(&mut transform_required_keys);

            transforms.push(transform);
            transforms.push(transform_builder);
        }

        all_required_keys.sort();

        Ok(Transforms {
            transforms,
        Ok(TransformBuilders {
            builders: transforms,
            output_keys: all_output_keys,
            required_keys: all_required_keys,
        })
    }
}

#[derive(Debug, Clone)]
pub struct TransformBuilder {
    fields: Fields,
    type_: Value,
    default: Option<Value>,
    index: Option<Index>,
    on_failure: Option<OnFailure>,
}

impl TransformBuilder {
    pub fn build(
        self,
        intermediate_keys: &[String],
        output_keys: &[String],
    ) -> Result<Transform, String> {
        let mut real_fields = vec![];
        for field in self.fields {
            let input_index = find_key_index(intermediate_keys, field.input_field(), "transform")?;
            let input_field_info = InputFieldInfo::new(field.input_field(), input_index);
            let output_index =
                find_key_index(output_keys, field.target_or_input_field(), "transform")?;
            let input = OneInputOneOutputField::new(
                input_field_info,
                (field.target_or_input_field().to_string(), output_index),
            );
            real_fields.push(input);
        }
        Ok(Transform {
            real_fields,
            type_: self.type_,
            default: self.default,
            index: self.index,
            on_failure: self.on_failure,
        })
    }
}
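
Note the two key spaces here: the input index is resolved against the full intermediate-state key list, while the output index is resolved against the transformer's output columns only. A worked example with hypothetical key lists:

fn example() {
    // intermediate_keys: every key a processor may read or write.
    let intermediate_keys = ["line", "ts", "msg"];
    // output_keys: only the columns the transformer emits.
    let output_keys = ["ts", "msg"];

    // A transform reading "msg" gets input_index = 2 (its position in
    // intermediate_keys) but output_index = 1 (its position in output_keys).
    assert_eq!(intermediate_keys.iter().position(|k| *k == "msg"), Some(2));
    assert_eq!(output_keys.iter().position(|k| *k == "msg"), Some(1));
}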

/// Only `field` is required.
#[derive(Debug, Clone)]
pub struct Transform {
    pub fields: Fields,
    pub real_fields: Vec<OneInputOneOutputField>,

    pub type_: Value,

@@ -192,7 +237,7 @@ impl std::fmt::Display for Transform {
        };

        let type_ = format!("type: {}", self.type_);
        let fields = format!("field(s): {}", self.fields);
        let fields = format!("field(s): {:?}", self.real_fields);
        let default = if let Some(default) = &self.default {
            format!(", default: {}", default)
        } else {
@@ -212,7 +257,7 @@ impl std::fmt::Display for Transform {
impl Default for Transform {
    fn default() -> Self {
        Transform {
            fields: Fields::default(),
            real_fields: Vec::new(),
            type_: Value::Null,
            default: None,
            index: None,
@@ -222,40 +267,6 @@ impl Default for Transform {
}

impl Transform {
    fn with_fields(&mut self, mut fields: Fields) {
        update_one_one_output_keys(&mut fields);
        self.fields = fields;
    }

    fn with_type(&mut self, type_: Value) {
        self.type_ = type_;
    }

    fn try_default(&mut self, default: Value) -> Result<(), String> {
        match (&self.type_, &default) {
            (Value::Null, _) => Err(format!(
                "transform {} type MUST BE set before default {}",
                self.fields, &default,
            )),
            (_, Value::Null) => Ok(()), // if the default is not set, it is treated as null
            (_, _) => {
                let target = self
                    .type_
                    .parse_str_value(default.to_str_value().as_str())?;
                self.default = Some(target);
                Ok(())
            }
        }
    }

    fn with_index(&mut self, index: Index) {
        self.index = Some(index);
    }

    fn with_on_failure(&mut self, on_failure: OnFailure) {
        self.on_failure = Some(on_failure);
    }

    pub(crate) fn get_default(&self) -> Option<&Value> {
        self.default.as_ref()
    }
@@ -265,52 +276,74 @@ impl Transform {
    }
}

impl TryFrom<&yaml_rust::yaml::Hash> for Transform {
impl TryFrom<&yaml_rust::yaml::Hash> for TransformBuilder {
    type Error = String;

    fn try_from(hash: &yaml_rust::yaml::Hash) -> Result<Self, Self::Error> {
        let mut transform = Transform::default();

        let mut default_opt = None;
        let mut fields = Fields::default();
        let mut type_ = Value::Null;
        let mut default = None;
        let mut index = None;
        let mut on_failure = None;

        for (k, v) in hash {
            let key = k.as_str().ok_or("key must be a string")?;
            match key {
                TRANSFORM_FIELD => {
                    transform.with_fields(Fields::one(yaml_field(v, TRANSFORM_FIELD)?));
                    fields = Fields::one(yaml_new_field(v, TRANSFORM_FIELD)?);
                }

                TRANSFORM_FIELDS => {
                    transform.with_fields(yaml_fields(v, TRANSFORM_FIELDS)?);
                    fields = yaml_new_fields(v, TRANSFORM_FIELDS)?;
                }

                TRANSFORM_TYPE => {
                    let t = yaml_string(v, TRANSFORM_TYPE)?;
                    transform.with_type(Value::parse_str_type(&t)?);
                    type_ = Value::parse_str_type(&t)?;
                }

                TRANSFORM_INDEX => {
                    let index = yaml_string(v, TRANSFORM_INDEX)?;
                    transform.with_index(index.try_into()?);
                    let index_str = yaml_string(v, TRANSFORM_INDEX)?;
                    index = Some(index_str.try_into()?);
                }

                TRANSFORM_DEFAULT => {
                    default_opt = Some(Value::try_from(v)?);
                    default = Some(Value::try_from(v)?);
                }

                TRANSFORM_ON_FAILURE => {
                    let on_failure = yaml_string(v, TRANSFORM_ON_FAILURE)?;
                    transform.with_on_failure(on_failure.parse()?);
                    let on_failure_str = yaml_string(v, TRANSFORM_ON_FAILURE)?;
                    on_failure = Some(on_failure_str.parse()?);
                }

                _ => {}
            }
        }
        let mut final_default = None;

        if let Some(default) = default_opt {
            transform.try_default(default)?;
        if let Some(default_value) = default {
            match (&type_, &default_value) {
                (Value::Null, _) => {
                    return Err(format!(
                        "transform {:?} type MUST BE set before default {}",
                        fields, &default_value,
                    ));
                }
                (_, Value::Null) => {} // if the default is not set, it is treated as null
                (_, _) => {
                    let target = type_.parse_str_value(default_value.to_str_value().as_str())?;
                    final_default = Some(target);
                }
            }
        }
        let builder = TransformBuilder {
            fields,
            type_,
            default: final_default,
            index,
            on_failure,
        };

        Ok(transform)
        Ok(builder)
    }
}
@@ -20,10 +20,10 @@ use coerce::{coerce_columns, coerce_value};
use greptime_proto::v1::{ColumnSchema, Row, Rows, Value as GreptimeValue};
use itertools::Itertools;

use crate::etl::field::{Field, Fields};
use crate::etl::field::{InputFieldInfo, OneInputOneOutputField};
use crate::etl::transform::index::Index;
use crate::etl::transform::{Transform, Transformer, Transforms};
use crate::etl::value::{Array, Map, Timestamp, Value};
use crate::etl::value::{Timestamp, Value};

const DEFAULT_GREPTIME_TIMESTAMP_COLUMN: &str = "greptime_timestamp";

@@ -36,23 +36,41 @@ pub struct GreptimeTransformer {
}

impl GreptimeTransformer {
    fn default_greptime_timestamp_column() -> Transform {
    /// Add a default timestamp column to the transforms
    fn add_greptime_timestamp_column(transforms: &mut Transforms) {
        let ns = chrono::Utc::now().timestamp_nanos_opt().unwrap_or(0);
        let type_ = Value::Timestamp(Timestamp::Nanosecond(ns));
        let default = Some(type_.clone());
        let mut field = Field::new(DEFAULT_GREPTIME_TIMESTAMP_COLUMN);
        field.insert_output_index(DEFAULT_GREPTIME_TIMESTAMP_COLUMN.to_string(), 0);
        let fields = Fields::new(vec![field]).unwrap();

        Transform {
            fields,
        let transform = Transform {
            real_fields: vec![OneInputOneOutputField::new(
                InputFieldInfo {
                    name: DEFAULT_GREPTIME_TIMESTAMP_COLUMN.to_string(),
                    index: usize::MAX,
                },
                (
                    DEFAULT_GREPTIME_TIMESTAMP_COLUMN.to_string(),
                    transforms
                        .transforms
                        .iter()
                        .map(|x| x.real_fields.len())
                        .sum(),
                ),
            )],
            type_,
            default,
            index: Some(Index::Time),
            on_failure: Some(crate::etl::transform::OnFailure::Default),
        }
        };
        let required_keys = transforms.required_keys_mut();
        required_keys.push(DEFAULT_GREPTIME_TIMESTAMP_COLUMN.to_string());

        let output_keys = transforms.output_keys_mut();
        output_keys.push(DEFAULT_GREPTIME_TIMESTAMP_COLUMN.to_string());
        transforms.push(transform);
    }
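
Note: the output index of the appended column is the total number of fields already emitted by the existing transforms, so it lands one past the last occupied slot; with two transforms emitting 2 and 3 fields, the default timestamp column gets index 5. The same arithmetic as a tiny sketch:

fn next_output_index(field_counts_per_transform: &[usize]) -> usize {
    // Output slots 0..sum are taken; the appended column takes the next one.
    field_counts_per_transform.iter().sum()
}
// next_output_index(&[2, 3]) == 5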

    /// Generate the schema for the GreptimeTransformer
    fn schemas(transforms: &Transforms) -> Result<Vec<ColumnSchema>, String> {
        let mut schema = vec![];
        for transform in transforms.iter() {
@@ -60,53 +78,6 @@ impl GreptimeTransformer {
        }
        Ok(schema)
    }

    fn transform_map(&self, map: &Map) -> Result<Row, String> {
        let mut values = vec![GreptimeValue { value_data: None }; self.schema.len()];
        for transform in self.transforms.iter() {
            for field in transform.fields.iter() {
                let value_data = match map.get(field.get_field_name()) {
                    Some(val) => coerce_value(val, transform)?,
                    None => {
                        let default = transform.get_default();
                        match default {
                            Some(default) => coerce_value(default, transform)?,
                            None => None,
                        }
                    }
                };
                if let Some(i) = field
                    .output_fields_index_mapping
                    .iter()
                    .next()
                    .map(|kv| kv.1)
                {
                    values[*i] = GreptimeValue { value_data }
                } else {
                    return Err(format!(
                        "field: {} output_fields is empty.",
                        field.get_field_name()
                    ));
                }
            }
        }

        Ok(Row { values })
    }

    fn transform_array(&self, arr: &Array) -> Result<Vec<Row>, String> {
        let mut rows = Vec::with_capacity(arr.len());
        for v in arr.iter() {
            match v {
                Value::Map(map) => {
                    let row = self.transform_map(map)?;
                    rows.push(row);
                }
                _ => return Err(format!("Expected map, found: {v:?}")),
            }
        }
        Ok(rows)
    }
}

impl std::fmt::Display for GreptimeTransformer {
@@ -129,9 +100,9 @@ impl Transformer for GreptimeTransformer {

        for transform in transforms.iter() {
            let target_fields_set = transform
                .fields
                .real_fields
                .iter()
                .map(|f| f.get_target_field())
                .map(|f| f.output_name())
                .collect::<HashSet<_>>();

            let intersections: Vec<_> = column_names_set.intersection(&target_fields_set).collect();
@@ -146,12 +117,15 @@ impl Transformer for GreptimeTransformer {

            if let Some(idx) = transform.index {
                if idx == Index::Time {
                    match transform.fields.len() {
                        1 => timestamp_columns.push(transform.fields.first().unwrap().get_field_name()),
                        _ => return Err(format!(
                            "Illegal to set multiple timestamp Index columns, please set only one: {}",
                            transform.fields.get_target_fields().join(", ")
                        )),
                    match transform.real_fields.len() {
                        1 => timestamp_columns
                            .push(transform.real_fields.first().unwrap().input_name()),
                        _ => {
                            return Err(format!(
                                "Illegal to set multiple timestamp Index columns, please set only one: {}",
                                transform.real_fields.iter().map(|x| x.input_name()).join(", ")
                            ))
                        }
                    }
                }
            }
@@ -159,13 +133,7 @@ impl Transformer for GreptimeTransformer {

        match timestamp_columns.len() {
            0 => {
                transforms.push(GreptimeTransformer::default_greptime_timestamp_column());

                let required_keys = transforms.required_keys_mut();
                required_keys.push(DEFAULT_GREPTIME_TIMESTAMP_COLUMN.to_string());

                let output_keys = transforms.output_keys_mut();
                output_keys.push(DEFAULT_GREPTIME_TIMESTAMP_COLUMN.to_string());
                GreptimeTransformer::add_greptime_timestamp_column(&mut transforms);

                let schema = GreptimeTransformer::schemas(&transforms)?;
                Ok(GreptimeTransformer { transforms, schema })
@@ -184,54 +152,26 @@ impl Transformer for GreptimeTransformer {
        }
    }

    fn transform(&self, value: Value) -> Result<Self::Output, String> {
        match value {
            Value::Map(map) => {
                let rows = vec![self.transform_map(&map)?];
                Ok(Rows {
                    schema: self.schema.clone(),
                    rows,
                })
            }
            Value::Array(arr) => {
                let rows = self.transform_array(&arr)?;
                Ok(Rows {
                    schema: self.schema.clone(),
                    rows,
                })
            }
            _ => Err(format!("Expected map or array, found: {}", value)),
        }
    }

    fn transform_mut(&self, val: &mut Vec<Value>) -> Result<Self::VecOutput, String> {
        let mut values = vec![GreptimeValue { value_data: None }; self.schema.len()];
        for transform in self.transforms.iter() {
            for field in transform.fields.iter() {
                let index = field.input_field.index;
            for field in transform.real_fields.iter() {
                let index = field.input_index();
                let output_index = field.output_index();
                match val.get(index) {
                    Some(v) => {
                        let value_data = coerce_value(v, transform)
                            .map_err(|e| format!("{} processor: {}", field.get_field_name(), e))?;
                            .map_err(|e| format!("{} processor: {}", field.input_name(), e))?;
                        // every transform field has exactly one output field
                        if let Some(i) = field
                            .output_fields_index_mapping
                            .iter()
                            .next()
                            .map(|kv| kv.1)
                        {
                            values[*i] = GreptimeValue { value_data }
                        } else {
                            return Err(format!(
                                "field: {} output_fields is empty.",
                                field.get_field_name()
                            ));
                        }
                        values[output_index] = GreptimeValue { value_data };
                    }
                    _ => {
                        return Err(format!(
                            "Get field not in the array field: {field:?}, {val:?}"
                        ))
                    None => {
                        let default = transform.get_default();
                        let value_data = match default {
                            Some(default) => coerce_value(default, transform)?,
                            None => None,
                        };
                        values[output_index] = GreptimeValue { value_data };
                    }
                }
            }

@@ -66,8 +66,8 @@ impl TryFrom<Value> for ValueData {
pub(crate) fn coerce_columns(transform: &Transform) -> Result<Vec<ColumnSchema>, String> {
    let mut columns = Vec::new();

    for field in transform.fields.iter() {
        let column_name = field.get_target_field().to_string();
    for field in transform.real_fields.iter() {
        let column_name = field.output_name().to_string();

        let datatype = coerce_type(transform)? as i32;

@@ -134,7 +134,7 @@ fn coerce_type(transform: &Transform) -> Result<ColumnDataType, String> {

        Value::Null => Err(format!(
            "Null type not supported when to coerce '{}' type",
            transform.fields
            transform.type_.to_str_type()
        )),
    }
}
@@ -144,15 +144,18 @@ pub(crate) fn coerce_value(
    transform: &Transform,
) -> Result<Option<ValueData>, String> {
    match val {
        Value::Null => match transform.on_failure {
            Some(OnFailure::Ignore) => Ok(None),
            Some(OnFailure::Default) => transform
                .get_default()
                .map(|default| coerce_value(default, transform))
                .unwrap_or_else(|| {
                    coerce_value(transform.get_type_matched_default_val(), transform)
                }),
            None => Ok(None),
        Value::Null => match &transform.default {
            Some(default) => coerce_value(default, transform),
            None => match transform.on_failure {
                Some(OnFailure::Ignore) => Ok(None),
                Some(OnFailure::Default) => transform
                    .get_default()
                    .map(|default| coerce_value(default, transform))
                    .unwrap_or_else(|| {
                        coerce_value(transform.get_type_matched_default_val(), transform)
                    }),
                None => Ok(None),
            },
        },

        Value::Int8(n) => coerce_i64_value(*n as i64, transform),
@@ -404,12 +407,11 @@ fn coerce_string_value(s: &String, transform: &Transform) -> Result<Option<Value
#[cfg(test)]
mod tests {
    use super::*;
    use crate::etl::field::Fields;

    #[test]
    fn test_coerce_string_without_on_failure() {
        let transform = Transform {
            fields: Fields::default(),
            real_fields: vec![],
            type_: Value::Int32(0),
            default: None,
            index: None,
@@ -434,7 +436,7 @@ mod tests {
    #[test]
    fn test_coerce_string_with_on_failure_ignore() {
        let transform = Transform {
            fields: Fields::default(),
            real_fields: vec![],
            type_: Value::Int32(0),
            default: None,
            index: None,
@@ -449,7 +451,7 @@ mod tests {
    #[test]
    fn test_coerce_string_with_on_failure_default() {
        let mut transform = Transform {
            fields: Fields::default(),
            real_fields: vec![],
            type_: Value::Int32(0),
            default: None,
            index: None,

@@ -13,20 +13,45 @@
// limitations under the License.

use greptime_proto::v1::{ColumnDataType, ColumnSchema, Rows, SemanticType};
use pipeline::{parse, Content, GreptimeTransformer, Pipeline, Value};
use pipeline::{parse, Content, GreptimeTransformer, Pipeline};

/// Test util function to parse and execute a pipeline.
pub fn parse_and_exec(input_str: &str, pipeline_yaml: &str) -> Rows {
    let input_value: Value = serde_json::from_str::<serde_json::Value>(input_str)
        .expect("failed to parse into json")
        .try_into()
        .expect("failed to convert into value");
    let input_value = serde_json::from_str::<serde_json::Value>(input_str).unwrap();

    let yaml_content = Content::Yaml(pipeline_yaml.into());
    let pipeline: Pipeline<GreptimeTransformer> =
        parse(&yaml_content).expect("failed to parse pipeline");
    let mut result = pipeline.init_intermediate_state();

    pipeline.exec(input_value).expect("failed to exec pipeline")
    let schema = pipeline.schemas().clone();

    let mut rows = Vec::new();

    match input_value {
        serde_json::Value::Array(array) => {
            for value in array {
                pipeline.prepare(value, &mut result).unwrap();
                let row = pipeline
                    .exec_mut(&mut result)
                    .expect("failed to exec pipeline");
                rows.push(row);
                pipeline.reset_intermediate_state(&mut result);
            }
        }
        serde_json::Value::Object(_) => {
            pipeline.prepare(input_value, &mut result).unwrap();
            let row = pipeline
                .exec_mut(&mut result)
                .expect("failed to exec pipeline");
            rows.push(row);
        }
        _ => {
            panic!("invalid input value");
        }
    }

    Rows { schema, rows }
}
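
Note: a hypothetical call site, to show the shape of the arguments (the field name and pipeline YAML below are illustrative only, not taken from this diff):

#[test]
fn smoke() {
    let input = r#"{ "line": "hello" }"#;
    let pipeline_yaml = r#"
processors: []
transform:
  - fields:
      - line
    type: string
"#;
    let rows = parse_and_exec(input, pipeline_yaml);
    // One input object yields one row; the schema also carries the
    // auto-appended greptime_timestamp column.
    assert_eq!(rows.rows.len(), 1);
}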

/// Test util function to create a column schema.

@@ -157,7 +157,7 @@ transform:
fn test_modifier() {
    let empty_str = r#"
    {
        "str": "key1 key2 key3 key4 key5 key6 key7 key8"
        "str": "key1 key2 key3 key4 key5 key6"
    }"#;

    let pipeline_yaml = r#"
@@ -165,7 +165,7 @@ processors:
  - dissect:
      field: str
      patterns:
        - "%{key1} %{key2} %{+key3} %{+key3/2} %{key5->} %{?key6} %{*key_7} %{&key_7}"
        - "%{key1} %{key2} %{+key3} %{+key3/2} %{key5->} %{?key6}"

transform:
  - fields:
@@ -173,7 +173,6 @@ transform:
      - key2
      - key3
      - key5
      - key7
    type: string
"#;

@@ -184,7 +183,6 @@ transform:
        make_string_column_schema("key2".to_string()),
        make_string_column_schema("key3".to_string()),
        make_string_column_schema("key5".to_string()),
        make_string_column_schema("key7".to_string()),
        common::make_column_schema(
            "greptime_timestamp".to_string(),
            ColumnDataType::TimestampNanosecond,
@@ -209,10 +207,6 @@ transform:
        output.rows[0].values[3].value_data,
        Some(StringValue("key5".to_string()))
    );
    assert_eq!(
        output.rows[0].values[4].value_data,
        Some(StringValue("key8".to_string()))
    );
}

#[test]

@@ -12,18 +12,18 @@
// See the License for the specific language governing permissions and
// limitations under the License.

use api::v1::Rows;
use common_telemetry::tracing::info;
use greptime_proto::v1::value::ValueData::{
    BoolValue, F64Value, StringValue, TimestampNanosecondValue, TimestampSecondValue, U32Value,
    U64Value, U8Value,
};
use greptime_proto::v1::Value as GreptimeValue;
use pipeline::{parse, Content, GreptimeTransformer, Pipeline, Value};
use pipeline::{parse, Content, GreptimeTransformer, Pipeline};

#[test]
fn test_complex_data() {
    let input_value_str = r#"
    [
        {
            "version": 1,
            "streamId": "12345",
@@ -73,12 +73,9 @@ fn test_complex_data() {
            "ewExecutionInfo": "c:4380:7:161:162:161:n:::12473:200|C:4380:3:0:4:0:n:::6967:200|R:4380:20:99:99:1:n:::35982:200",
            "customField": "any-custom-value"
        }
    ]
    "#;
    let input_value: Value = serde_json::from_str::<serde_json::Value>(input_value_str)
        .expect("failed to parse input value")
        .try_into()
        .expect("failed to convert input value");
    let input_value = serde_json::from_str::<serde_json::Value>(input_value_str)
        .expect("failed to parse input value");

    let pipeline_yaml = r#"
---
@@ -422,7 +419,19 @@ transform:
    let yaml_content = Content::Yaml(pipeline_yaml.into());
    let pipeline: Pipeline<GreptimeTransformer> =
        parse(&yaml_content).expect("failed to parse pipeline");
    let output = pipeline.exec(input_value).expect("failed to exec pipeline");
    let mut stats = pipeline.init_intermediate_state();
    pipeline
        .prepare(input_value, &mut stats)
        .expect("failed to prepare pipeline");

    let row = pipeline
        .exec_mut(&mut stats)
        .expect("failed to exec pipeline");

    let output = Rows {
        schema: pipeline.schemas().clone(),
        rows: vec![row],
    };

    assert_eq!(output.rows.len(), 1);
    let values = output.rows.first().unwrap().values.clone();
@@ -464,10 +473,7 @@ fn test_simple_data() {
        "line": "2024-05-25 20:16:37.217 hello world"
    }
    "#;
    let input_value: Value = serde_json::from_str::<serde_json::Value>(input_value_str)
        .unwrap()
        .try_into()
        .unwrap();
    let input_value = serde_json::from_str::<serde_json::Value>(input_value_str).unwrap();

    let pipeline_yaml = r#"
processors:
@@ -493,11 +499,13 @@ transform:

    let yaml_content = Content::Yaml(pipeline_yaml.into());
    let pipeline: Pipeline<GreptimeTransformer> = parse(&yaml_content).unwrap();
    let output = pipeline.exec(input_value).unwrap();
    let r = output
        .rows

    let mut status = pipeline.init_intermediate_state();
    pipeline.prepare(input_value, &mut status).unwrap();
    let row = pipeline.exec_mut(&mut status).unwrap();
    let r = row
        .values
        .into_iter()
        .flat_map(|v| v.values)
        .map(|v| v.value_data.unwrap())
        .collect::<Vec<_>>();

@@ -753,6 +753,7 @@ impl HttpServer {
                "/pipelines/:pipeline_name",
                routing::delete(event::delete_pipeline),
            )
            .route("/pipelines/dryrun", routing::post(event::pipeline_dryrun))
            .layer(
                ServiceBuilder::new()
                    .layer(HandleErrorLayer::new(handle_error))

@@ -23,15 +23,16 @@ use axum::headers::ContentType;
use axum::http::header::CONTENT_TYPE;
use axum::http::{Request, StatusCode};
use axum::response::{IntoResponse, Response};
use axum::{async_trait, BoxError, Extension, TypedHeader};
use axum::{async_trait, BoxError, Extension, Json, TypedHeader};
use common_query::{Output, OutputData};
use common_telemetry::{error, warn};
use datatypes::value::column_data_to_json;
use pipeline::error::PipelineTransformSnafu;
use pipeline::util::to_pipeline_version;
use pipeline::PipelineVersion;
use schemars::JsonSchema;
use serde::{Deserialize, Serialize};
use serde_json::{Deserializer, Value};
use serde_json::{Deserializer, Map, Value};
use session::context::{Channel, QueryContext, QueryContextRef};
use snafu::{ensure, OptionExt, ResultExt};

@@ -230,6 +231,117 @@ fn transform_ndjson_array_factory(
    })
}

#[axum_macros::debug_handler]
pub async fn pipeline_dryrun(
    State(log_state): State<LogState>,
    Query(query_params): Query<LogIngesterQueryParams>,
    Extension(mut query_ctx): Extension<QueryContext>,
    TypedHeader(content_type): TypedHeader<ContentType>,
    payload: String,
) -> Result<Response> {
    let handler = log_state.log_handler;
    let pipeline_name = query_params.pipeline_name.context(InvalidParameterSnafu {
        reason: "pipeline_name is required",
    })?;

    let version = to_pipeline_version(query_params.version).context(PipelineSnafu)?;

    let ignore_errors = query_params.ignore_errors.unwrap_or(false);

    let value = extract_pipeline_value_by_content_type(content_type, payload, ignore_errors)?;

    if value.len() > 10 {
        return Err(InvalidParameterSnafu {
            reason: "too many rows for dryrun",
        }
        .build());
    }

    query_ctx.set_channel(Channel::Http);
    let query_ctx = Arc::new(query_ctx);

    let pipeline = handler
        .get_pipeline(&pipeline_name, version, query_ctx.clone())
        .await?;

    let mut intermediate_state = pipeline.init_intermediate_state();

    let mut results = Vec::with_capacity(value.len());
    for v in value {
        pipeline
            .prepare(v, &mut intermediate_state)
            .map_err(|reason| PipelineTransformSnafu { reason }.build())
            .context(PipelineSnafu)?;
        let r = pipeline
            .exec_mut(&mut intermediate_state)
            .map_err(|reason| PipelineTransformSnafu { reason }.build())
            .context(PipelineSnafu)?;
        results.push(r);
        pipeline.reset_intermediate_state(&mut intermediate_state);
    }

    let colume_type_key = "colume_type";
    let data_type_key = "data_type";
    let name_key = "name";

    let schema = pipeline
        .schemas()
        .iter()
        .map(|cs| {
            let mut map = Map::new();
            map.insert(name_key.to_string(), Value::String(cs.column_name.clone()));
            map.insert(
                data_type_key.to_string(),
                Value::String(cs.datatype().as_str_name().to_string()),
            );
            map.insert(
                colume_type_key.to_string(),
                Value::String(cs.semantic_type().as_str_name().to_string()),
            );
            map.insert(
                "fulltext".to_string(),
                Value::Bool(
                    cs.options
                        .clone()
                        .is_some_and(|x| x.options.contains_key("fulltext")),
                ),
            );
            Value::Object(map)
        })
        .collect::<Vec<_>>();
    let rows = results
        .into_iter()
        .map(|row| {
            let row = row
                .values
                .into_iter()
                .enumerate()
                .map(|(idx, v)| {
                    v.value_data
                        .map(|d| {
                            let mut map = Map::new();
                            map.insert("value".to_string(), column_data_to_json(d));
                            map.insert("key".to_string(), schema[idx][name_key].clone());
                            map.insert(
                                "semantic_type".to_string(),
                                schema[idx][colume_type_key].clone(),
                            );
                            map.insert("data_type".to_string(), schema[idx][data_type_key].clone());
                            Value::Object(map)
                        })
                        .unwrap_or(Value::Null)
                })
                .collect();
            Value::Array(row)
        })
        .collect::<Vec<_>>();
    let mut result = Map::new();
    result.insert("schema".to_string(), Value::Array(schema));
    result.insert("rows".to_string(), Value::Array(rows));
    let result = Value::Object(result);
    Ok(Json(result).into_response())
}
|
||||
|
||||
#[axum_macros::debug_handler]
|
||||
pub async fn log_ingester(
|
||||
State(log_state): State<LogState>,
|
||||
|
||||
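The handler above caps dryrun input at 10 rows and returns a JSON object with "schema" and "rows" arrays. A hypothetical client-side call against the new route; the host and port are assumptions (a local GreptimeDB default HTTP address), while the path, query parameter, and payload shape come from the integration test further down:

    use serde_json::Value;

    #[tokio::main]
    async fn main() -> Result<(), Box<dyn std::error::Error>> {
        // Route and pipeline_name parameter as exercised by the test below;
        // the endpoint address is an assumption for illustration.
        let resp = reqwest::Client::new()
            .post("http://127.0.0.1:4000/v1/events/pipelines/dryrun?pipeline_name=test")
            .header("Content-Type", "application/json")
            .body(r#"[{"id1":"2436","id2":"2528","type":"I","logger":"INTERACT.MANAGER","time":"2024-05-25 20:16:37.217","log":"hello"}]"#)
            .send()
            .await?;
        let body: Value = resp.json().await?;
        // "schema" describes each output column; "rows" holds the transformed values.
        println!("{}", serde_json::to_string_pretty(&body)?);
        Ok(())
    }
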
@@ -233,6 +233,9 @@ pub trait RegionScanner: Debug + DisplayAs + Send {
    /// # Panics
    /// Panics if the `partition` is out of bounds.
    fn scan_partition(&self, partition: usize) -> Result<SendableRecordBatchStream, BoxedError>;

    /// Checks if there is any predicate that may be executed in this scanner.
    fn has_predicate(&self) -> bool;
}

pub type RegionScannerRef = Box<dyn RegionScanner>;

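A scanner that accepts pushed-down filters would report them through the new method. A sketch with a hypothetical scanner type; only has_predicate mirrors the trait above:

    // Hypothetical scanner holding pushed-down filter expressions.
    struct FilteredScanner {
        // Stand-in for the engine's real predicate type.
        predicates: Vec<String>,
    }

    impl FilteredScanner {
        // Mirrors RegionScanner::has_predicate: true when any filter will run
        // inside the scan, which makes exact row-count statistics unsafe.
        fn has_predicate(&self) -> bool {
            !self.predicates.is_empty()
        }
    }
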
@@ -367,6 +370,10 @@ impl RegionScanner for SinglePartitionScanner {
            ))
        })
    }

    fn has_predicate(&self) -> bool {
        false
    }
}

impl DisplayAs for SinglePartitionScanner {

@@ -180,7 +180,7 @@ impl ExecutionPlan for RegionScanExec {
    }

    fn statistics(&self) -> DfResult<Statistics> {
        let statistics = if self.append_mode {
        let statistics = if self.append_mode && !self.scanner.lock().unwrap().has_predicate() {
            let column_statistics = self
                .arrow_schema
                .fields

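The one-line change above is the core fix: exact row-count statistics are only safe for an append-mode scan when no predicate can filter rows inside the scanner; otherwise the planner may answer a filtered count(1) from table statistics and over-count. A self-contained sketch of the guard's logic, with a local stand-in trait for RegionScanner:

    use std::sync::Mutex;

    // Local stand-in so the sketch compiles on its own.
    trait Scanner {
        fn has_predicate(&self) -> bool;
    }

    // Exact statistics are valid only when every stored row reaches the output:
    // append mode rules out merge/dedup, and the predicate check rules out
    // filtering inside the scan itself.
    fn can_report_exact_stats(append_mode: bool, scanner: &Mutex<Box<dyn Scanner>>) -> bool {
        append_mode && !scanner.lock().unwrap().has_predicate()
    }
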
@@ -78,6 +78,7 @@ macro_rules! http_tests {
            test_vm_proto_remote_write,

            test_pipeline_api,
            test_test_pipeline_api,
            test_plain_text_ingestion,
        );
    )*

@@ -1146,6 +1147,171 @@ transform:
    guard.remove_all().await;
}

pub async fn test_test_pipeline_api(store_type: StorageType) {
    common_telemetry::init_default_ut_logging();
    let (app, mut guard) = setup_test_http_app_with_frontend(store_type, "test_pipeline_api").await;

    // handshake
    let client = TestClient::new(app);

    let body = r#"
processors:
  - date:
      field: time
      formats:
        - "%Y-%m-%d %H:%M:%S%.3f"
      ignore_missing: true

transform:
  - fields:
      - id1
      - id2
    type: int32
  - fields:
      - type
      - log
      - logger
    type: string
  - field: time
    type: time
    index: timestamp
"#;

    // 1. create pipeline
    let res = client
        .post("/v1/events/pipelines/test")
        .header("Content-Type", "application/x-yaml")
        .body(body)
        .send()
        .await;

    assert_eq!(res.status(), StatusCode::OK);

    let content = res.text().await;

    let content = serde_json::from_str(&content);
    assert!(content.is_ok());
    // {"execution_time_ms":13,"pipelines":[{"name":"test","version":"2024-07-04 08:31:00.987136"}]}
    let content: Value = content.unwrap();

    let execution_time = content.get("execution_time_ms");
    assert!(execution_time.unwrap().is_number());
    let pipelines = content.get("pipelines");
    let pipelines = pipelines.unwrap().as_array().unwrap();
    assert_eq!(pipelines.len(), 1);
    let pipeline = pipelines.first().unwrap();
    assert_eq!(pipeline.get("name").unwrap(), "test");

    // 2. write data
    let data_body = r#"
[
  {
    "id1": "2436",
    "id2": "2528",
    "logger": "INTERACT.MANAGER",
    "type": "I",
    "time": "2024-05-25 20:16:37.217",
    "log": "ClusterAdapter:enter sendTextDataToCluster\\n"
  }
]
"#;
    let res = client
        .post("/v1/events/pipelines/dryrun?pipeline_name=test")
        .header("Content-Type", "application/json")
        .body(data_body)
        .send()
        .await;
    assert_eq!(res.status(), StatusCode::OK);
    let body: serde_json::Value = res.json().await;
    let schema = &body["schema"];
    let rows = &body["rows"];
    assert_eq!(
        schema,
        &json!([
            {
                "colume_type": "FIELD",
                "data_type": "INT32",
                "fulltext": false,
                "name": "id1"
            },
            {
                "colume_type": "FIELD",
                "data_type": "INT32",
                "fulltext": false,
                "name": "id2"
            },
            {
                "colume_type": "FIELD",
                "data_type": "STRING",
                "fulltext": false,
                "name": "type"
            },
            {
                "colume_type": "FIELD",
                "data_type": "STRING",
                "fulltext": false,
                "name": "log"
            },
            {
                "colume_type": "FIELD",
                "data_type": "STRING",
                "fulltext": false,
                "name": "logger"
            },
            {
                "colume_type": "TIMESTAMP",
                "data_type": "TIMESTAMP_NANOSECOND",
                "fulltext": false,
                "name": "time"
            }
        ])
    );
    assert_eq!(
        rows,
        &json!([
            [
                {
                    "data_type": "INT32",
                    "key": "id1",
                    "semantic_type": "FIELD",
                    "value": 2436
                },
                {
                    "data_type": "INT32",
                    "key": "id2",
                    "semantic_type": "FIELD",
                    "value": 2528
                },
                {
                    "data_type": "STRING",
                    "key": "type",
                    "semantic_type": "FIELD",
                    "value": "I"
                },
                {
                    "data_type": "STRING",
                    "key": "log",
                    "semantic_type": "FIELD",
                    "value": "ClusterAdapter:enter sendTextDataToCluster\\n"
                },
                {
                    "data_type": "STRING",
                    "key": "logger",
                    "semantic_type": "FIELD",
                    "value": "INTERACT.MANAGER"
                },
                {
                    "data_type": "TIMESTAMP_NANOSECOND",
                    "key": "time",
                    "semantic_type": "TIMESTAMP",
                    "value": "2024-05-25 20:16:37.217+0000"
                }
            ]
        ])
    );
    guard.remove_all().await;
}

pub async fn test_plain_text_ingestion(store_type: StorageType) {
    common_telemetry::init_default_ut_logging();
    let (app, mut guard) = setup_test_http_app_with_frontend(store_type, "test_pipeline_api").await;

@@ -54,3 +54,50 @@ drop table test;

Affected Rows: 0

-- Append table
create table count_where_bug (
    tag String,
    ts TimestampMillisecond time index,
    num Int64,
    primary key (tag),
) engine=mito with('append_mode'='true');

Affected Rows: 0

insert into count_where_bug (tag, ts, num)
values ('a', '2024-09-06T06:00:01Z', 1),
       ('a', '2024-09-06T06:00:02Z', 2),
       ('a', '2024-09-06T06:00:03Z', 3),
       ('b', '2024-09-06T06:00:04Z', 4),
       ('b', '2024-09-06T06:00:05Z', 5);

Affected Rows: 5

select count(1) from count_where_bug where tag = 'b';

+-----------------+
| COUNT(Int64(1)) |
+-----------------+
| 2               |
+-----------------+

select count(1) from count_where_bug where ts > '2024-09-06T06:00:04Z';

+-----------------+
| COUNT(Int64(1)) |
+-----------------+
| 1               |
+-----------------+

select count(1) from count_where_bug where num != 3;

+-----------------+
| COUNT(Int64(1)) |
+-----------------+
| 4               |
+-----------------+

drop table count_where_bug;

Affected Rows: 0

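These three queries pin down the regression that the has_predicate guard fixes: without it, a filtered count(1) on an append-mode table could be answered from the table's exact statistics and return the unfiltered row count (5) instead of the filtered counts shown above.
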
@@ -17,3 +17,27 @@ select count(*) from (select * from test cross join "HelloWorld");
drop table "HelloWorld";

drop table test;

-- Append table

create table count_where_bug (
    tag String,
    ts TimestampMillisecond time index,
    num Int64,
    primary key (tag),
) engine=mito with('append_mode'='true');

insert into count_where_bug (tag, ts, num)
values ('a', '2024-09-06T06:00:01Z', 1),
       ('a', '2024-09-06T06:00:02Z', 2),
       ('a', '2024-09-06T06:00:03Z', 3),
       ('b', '2024-09-06T06:00:04Z', 4),
       ('b', '2024-09-06T06:00:05Z', 5);

select count(1) from count_where_bug where tag = 'b';

select count(1) from count_where_bug where ts > '2024-09-06T06:00:04Z';

select count(1) from count_where_bug where num != 3;

drop table count_where_bug;