mirror of
https://github.com/GreptimeTeam/greptimedb.git
synced 2026-05-20 23:10:37 +00:00
refactor!: add processor builder and transform builder (#4571)
* chore: add processor builder and transform builder * chore: in process * chore: intermediate state from hashmap to vector in pipeline * chore: remove useless code and rename some struct * chore: fix typos * chore: format code * chore: add error handling and optimize code readability * chore: fix typos * chore: remove useless code * chore: add some doc * chore: fix by pr commit * chore: remove useless code and change struct name * chore: modify the location of the find_key_index function.
This commit is contained in:
@@ -13,27 +13,13 @@
|
||||
// limitations under the License.
|
||||
|
||||
use criterion::{black_box, criterion_group, criterion_main, Criterion};
|
||||
use pipeline::{parse, Array, Content, GreptimeTransformer, Pipeline, Value as PipelineValue};
|
||||
use pipeline::{parse, Content, GreptimeTransformer, Pipeline};
|
||||
use serde_json::{Deserializer, Value};
|
||||
|
||||
fn processor_map(
|
||||
pipeline: &Pipeline<GreptimeTransformer>,
|
||||
input_values: Vec<Value>,
|
||||
) -> impl IntoIterator<Item = greptime_proto::v1::Rows> {
|
||||
let pipeline_data = input_values
|
||||
.into_iter()
|
||||
.map(|v| PipelineValue::try_from(v).unwrap())
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
pipeline.exec(PipelineValue::Array(Array {
|
||||
values: pipeline_data,
|
||||
}))
|
||||
}
|
||||
|
||||
fn processor_mut(
|
||||
pipeline: &Pipeline<GreptimeTransformer>,
|
||||
input_values: Vec<Value>,
|
||||
) -> impl IntoIterator<Item = Vec<greptime_proto::v1::Row>> {
|
||||
) -> Result<Vec<greptime_proto::v1::Row>, String> {
|
||||
let mut payload = pipeline.init_intermediate_state();
|
||||
let mut result = Vec::with_capacity(input_values.len());
|
||||
|
||||
@@ -249,11 +235,10 @@ fn criterion_benchmark(c: &mut Criterion) {
|
||||
let pipeline = prepare_pipeline();
|
||||
let mut group = c.benchmark_group("pipeline");
|
||||
group.sample_size(50);
|
||||
group.bench_function("processor map", |b| {
|
||||
b.iter(|| processor_map(black_box(&pipeline), black_box(input_value.clone())))
|
||||
});
|
||||
group.bench_function("processor mut", |b| {
|
||||
b.iter(|| processor_mut(black_box(&pipeline), black_box(input_value.clone())))
|
||||
b.iter(|| {
|
||||
processor_mut(black_box(&pipeline), black_box(input_value.clone())).unwrap();
|
||||
})
|
||||
});
|
||||
group.finish();
|
||||
}
|
||||
|
||||
@@ -19,92 +19,24 @@ pub mod processor;
|
||||
pub mod transform;
|
||||
pub mod value;
|
||||
|
||||
use ahash::{HashMap, HashSet};
|
||||
use common_telemetry::{debug, warn};
|
||||
use ahash::HashSet;
|
||||
use common_telemetry::debug;
|
||||
use itertools::{merge, Itertools};
|
||||
use processor::Processor;
|
||||
use transform::{Transformer, Transforms};
|
||||
use value::{Map, Value};
|
||||
use processor::{Processor, ProcessorBuilder, Processors};
|
||||
use transform::{TransformBuilders, Transformer, Transforms};
|
||||
use value::Value;
|
||||
use yaml_rust::YamlLoader;
|
||||
|
||||
const DESCRIPTION: &str = "description";
|
||||
const PROCESSORS: &str = "processors";
|
||||
const TRANSFORM: &str = "transform";
|
||||
const TRANSFORMS: &str = "transforms";
|
||||
|
||||
/// Pipeline definition text handed to `parse`, tagged with its source format.
pub enum Content {
    /// Definition text tagged as JSON.
    // NOTE(review): how this variant is consumed is not visible in this view — confirm in `parse`.
    Json(String),
    /// Definition text tagged as YAML; the tests build pipelines through
    /// `parse(&Content::Yaml(..))`.
    Yaml(String),
}
|
||||
|
||||
/// set the index for the processor keys
|
||||
/// the index is the position of the key in the final intermediate keys
|
||||
fn set_processor_keys_index(
|
||||
processors: &mut processor::Processors,
|
||||
final_intermediate_keys: &Vec<String>,
|
||||
) -> Result<(), String> {
|
||||
let final_intermediate_key_index = final_intermediate_keys
|
||||
.iter()
|
||||
.enumerate()
|
||||
.map(|(i, k)| (k.as_str(), i))
|
||||
.collect::<HashMap<_, _>>();
|
||||
for processor in processors.iter_mut() {
|
||||
for field in processor.fields_mut().iter_mut() {
|
||||
let index = final_intermediate_key_index.get(field.input_field.name.as_str()).ok_or(format!(
|
||||
"input field {} is not found in intermediate keys: {final_intermediate_keys:?} when set processor keys index",
|
||||
field.input_field.name
|
||||
))?;
|
||||
field.set_input_index(*index);
|
||||
for (k, v) in field.output_fields_index_mapping.iter_mut() {
|
||||
let index = final_intermediate_key_index.get(k.as_str());
|
||||
match index {
|
||||
Some(index) => {
|
||||
*v = *index;
|
||||
}
|
||||
None => {
|
||||
warn!(
|
||||
"output field {k} is not found in intermediate keys: {final_intermediate_keys:?} when set processor keys index"
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn set_transform_keys_index(
|
||||
transforms: &mut Transforms,
|
||||
final_intermediate_keys: &[String],
|
||||
output_keys: &[String],
|
||||
) -> Result<(), String> {
|
||||
let final_intermediate_key_index = final_intermediate_keys
|
||||
.iter()
|
||||
.enumerate()
|
||||
.map(|(i, k)| (k.as_str(), i))
|
||||
.collect::<HashMap<_, _>>();
|
||||
let output_key_index = output_keys
|
||||
.iter()
|
||||
.enumerate()
|
||||
.map(|(i, k)| (k.as_str(), i))
|
||||
.collect::<HashMap<_, _>>();
|
||||
for transform in transforms.iter_mut() {
|
||||
for field in transform.fields.iter_mut() {
|
||||
let index = final_intermediate_key_index.get(field.input_field.name.as_str()).ok_or(format!(
|
||||
"input field {} is not found in intermediate keys: {final_intermediate_keys:?} when set transform keys index",
|
||||
field.input_field.name
|
||||
))?;
|
||||
field.set_input_index(*index);
|
||||
for (k, v) in field.output_fields_index_mapping.iter_mut() {
|
||||
let index = output_key_index.get(k.as_str()).ok_or(format!(
|
||||
"output field {k} is not found in output keys: {final_intermediate_keys:?} when set transform keys index"
|
||||
))?;
|
||||
*v = *index;
|
||||
}
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn parse<T>(input: &Content) -> Result<Pipeline<T>, String>
|
||||
where
|
||||
T: Transformer,
|
||||
@@ -117,24 +49,22 @@ where
|
||||
|
||||
let description = doc[DESCRIPTION].as_str().map(|s| s.to_string());
|
||||
|
||||
let mut processors = if let Some(v) = doc[PROCESSORS].as_vec() {
|
||||
let processor_builder_list = if let Some(v) = doc[PROCESSORS].as_vec() {
|
||||
v.try_into()?
|
||||
} else {
|
||||
processor::Processors::default()
|
||||
processor::ProcessorBuilderList::default()
|
||||
};
|
||||
|
||||
let transforms = if let Some(v) = doc[TRANSFORM].as_vec() {
|
||||
v.try_into()?
|
||||
} else {
|
||||
Transforms::default()
|
||||
};
|
||||
let transform_builders =
|
||||
if let Some(v) = doc[TRANSFORMS].as_vec().or(doc[TRANSFORM].as_vec()) {
|
||||
v.try_into()?
|
||||
} else {
|
||||
TransformBuilders::default()
|
||||
};
|
||||
|
||||
let mut transformer = T::new(transforms)?;
|
||||
let transforms = transformer.transforms_mut();
|
||||
|
||||
let processors_output_keys = processors.output_keys();
|
||||
let processors_required_keys = processors.required_keys();
|
||||
let processors_required_original_keys = processors.required_original_keys();
|
||||
let processors_required_keys = &processor_builder_list.input_keys;
|
||||
let processors_output_keys = &processor_builder_list.output_keys;
|
||||
let processors_required_original_keys = &processor_builder_list.original_input_keys;
|
||||
|
||||
debug!(
|
||||
"processors_required_original_keys: {:?}",
|
||||
@@ -143,7 +73,7 @@ where
|
||||
debug!("processors_required_keys: {:?}", processors_required_keys);
|
||||
debug!("processors_output_keys: {:?}", processors_output_keys);
|
||||
|
||||
let transforms_required_keys = transforms.required_keys();
|
||||
let transforms_required_keys = &transform_builders.required_keys;
|
||||
let mut tr_keys = Vec::with_capacity(50);
|
||||
for key in transforms_required_keys.iter() {
|
||||
if !processors_output_keys.contains(key)
|
||||
@@ -183,9 +113,33 @@ where
|
||||
|
||||
final_intermediate_keys.extend(intermediate_keys_exclude_original);
|
||||
|
||||
let output_keys = transforms.output_keys().clone();
|
||||
set_processor_keys_index(&mut processors, &final_intermediate_keys)?;
|
||||
set_transform_keys_index(transforms, &final_intermediate_keys, &output_keys)?;
|
||||
let output_keys = transform_builders.output_keys.clone();
|
||||
|
||||
let processors_kind_list = processor_builder_list
|
||||
.processor_builders
|
||||
.into_iter()
|
||||
.map(|builder| builder.build(&final_intermediate_keys))
|
||||
.collect::<Result<Vec<_>, _>>()?;
|
||||
let processors = Processors {
|
||||
processors: processors_kind_list,
|
||||
required_keys: processors_required_keys.clone(),
|
||||
output_keys: processors_output_keys.clone(),
|
||||
required_original_keys: processors_required_original_keys.clone(),
|
||||
};
|
||||
|
||||
let transfor_list = transform_builders
|
||||
.builders
|
||||
.into_iter()
|
||||
.map(|builder| builder.build(&final_intermediate_keys, &output_keys))
|
||||
.collect::<Result<Vec<_>, String>>()?;
|
||||
|
||||
let transformers = Transforms {
|
||||
transforms: transfor_list,
|
||||
required_keys: transforms_required_keys.clone(),
|
||||
output_keys: output_keys.clone(),
|
||||
};
|
||||
|
||||
let transformer = T::new(transformers)?;
|
||||
|
||||
Ok(Pipeline {
|
||||
description,
|
||||
@@ -238,38 +192,6 @@ impl<T> Pipeline<T>
|
||||
where
|
||||
T: Transformer,
|
||||
{
|
||||
/// Applies each processor's `exec_map` to `map` in iteration order and
/// returns the first error encountered, if any.
fn exec_map(&self, map: &mut Map) -> Result<(), String> {
    self.processors
        .iter()
        .try_for_each(|processor| processor.exec_map(map))
}
|
||||
|
||||
pub fn exec(&self, mut val: Value) -> Result<T::Output, String> {
|
||||
let result = match val {
|
||||
Value::Map(ref mut map) => {
|
||||
self.exec_map(map)?;
|
||||
val
|
||||
}
|
||||
Value::Array(arr) => arr
|
||||
.values
|
||||
.into_iter()
|
||||
.map(|mut v| match v {
|
||||
Value::Map(ref mut map) => {
|
||||
self.exec_map(map)?;
|
||||
Ok(v)
|
||||
}
|
||||
_ => Err(format!("expected a map, but got {}", v)),
|
||||
})
|
||||
.collect::<Result<Vec<Value>, String>>()
|
||||
.map(|values| Value::Array(value::Array { values }))?,
|
||||
_ => return Err(format!("expected a map or array, but got {}", val)),
|
||||
};
|
||||
|
||||
self.transformer.transform(result)
|
||||
}
|
||||
|
||||
pub fn exec_mut(&self, val: &mut Vec<Value>) -> Result<T::VecOutput, String> {
|
||||
for processor in self.processors.iter() {
|
||||
processor.exec_mut(val)?;
|
||||
@@ -347,9 +269,24 @@ where
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns the position of `key` within `intermediate_keys`.
///
/// `kind` is only used to label the error message.
///
/// # Errors
///
/// Returns `"{kind} processor.{key} not found in intermediate keys"` when
/// `key` does not occur in `intermediate_keys`.
pub(crate) fn find_key_index(
    intermediate_keys: &[String],
    key: &str,
    kind: &str,
) -> Result<usize, String> {
    intermediate_keys
        .iter()
        .position(|k| k == key)
        // Build the message lazily so a successful lookup does not allocate.
        .ok_or_else(|| format!("{} processor.{} not found in intermediate keys", kind, key))
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
|
||||
use api::v1::Rows;
|
||||
use greptime_proto::v1::value::ValueData;
|
||||
use greptime_proto::v1::{self, ColumnDataType, SemanticType};
|
||||
|
||||
@@ -359,96 +296,43 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
fn test_pipeline_prepare() {
|
||||
{
|
||||
let input_value_str = r#"
|
||||
{
|
||||
"my_field": "1,2",
|
||||
"foo": "bar"
|
||||
}
|
||||
"#;
|
||||
let input_value: serde_json::Value = serde_json::from_str(input_value_str).unwrap();
|
||||
|
||||
let pipeline_yaml = r#"
|
||||
---
|
||||
description: Pipeline for Apache Tomcat
|
||||
let input_value_str = r#"
|
||||
{
|
||||
"my_field": "1,2",
|
||||
"foo": "bar"
|
||||
}
|
||||
"#;
|
||||
let input_value: serde_json::Value = serde_json::from_str(input_value_str).unwrap();
|
||||
|
||||
let pipeline_yaml = r#"description: 'Pipeline for Apache Tomcat'
|
||||
processors:
|
||||
- csv:
|
||||
field: my_field, my_field,field1, field2
|
||||
|
||||
field: my_field
|
||||
target_fields: field1, field2
|
||||
transform:
|
||||
- field: field1
|
||||
type: uint32
|
||||
- field: field2
|
||||
type: uint32
|
||||
"#;
|
||||
let pipeline: Pipeline<GreptimeTransformer> =
|
||||
parse(&Content::Yaml(pipeline_yaml.into())).unwrap();
|
||||
let mut payload = pipeline.init_intermediate_state();
|
||||
pipeline.prepare(input_value, &mut payload).unwrap();
|
||||
assert_eq!(
|
||||
&["greptime_timestamp", "my_field"].to_vec(),
|
||||
pipeline.required_keys()
|
||||
);
|
||||
assert_eq!(
|
||||
payload,
|
||||
vec![
|
||||
Value::Null,
|
||||
Value::String("1,2".to_string()),
|
||||
Value::Null,
|
||||
Value::Null
|
||||
]
|
||||
);
|
||||
let result = pipeline.exec_mut(&mut payload).unwrap();
|
||||
let pipeline: Pipeline<GreptimeTransformer> =
|
||||
parse(&Content::Yaml(pipeline_yaml.into())).unwrap();
|
||||
let mut payload = pipeline.init_intermediate_state();
|
||||
pipeline.prepare(input_value, &mut payload).unwrap();
|
||||
assert_eq!(&["my_field"].to_vec(), pipeline.required_keys());
|
||||
assert_eq!(
|
||||
payload,
|
||||
vec![Value::String("1,2".to_string()), Value::Null, Value::Null]
|
||||
);
|
||||
let result = pipeline.exec_mut(&mut payload).unwrap();
|
||||
|
||||
assert_eq!(result.values[0].value_data, Some(ValueData::U32Value(1)));
|
||||
assert_eq!(result.values[1].value_data, Some(ValueData::U32Value(2)));
|
||||
match &result.values[2].value_data {
|
||||
Some(ValueData::TimestampNanosecondValue(v)) => {
|
||||
assert_ne!(*v, 0);
|
||||
}
|
||||
_ => panic!("expect null value"),
|
||||
assert_eq!(result.values[0].value_data, Some(ValueData::U32Value(1)));
|
||||
assert_eq!(result.values[1].value_data, Some(ValueData::U32Value(2)));
|
||||
match &result.values[2].value_data {
|
||||
Some(ValueData::TimestampNanosecondValue(v)) => {
|
||||
assert_ne!(*v, 0);
|
||||
}
|
||||
}
|
||||
{
|
||||
let input_value_str = r#"
|
||||
{
|
||||
"reqTimeSec": "1573840000.000"
|
||||
}
|
||||
"#;
|
||||
|
||||
let pipeline_yaml = r#"
|
||||
---
|
||||
description: Pipeline for Demo Log
|
||||
|
||||
processors:
|
||||
- gsub:
|
||||
field: reqTimeSec
|
||||
pattern: "\\."
|
||||
replacement: ""
|
||||
- epoch:
|
||||
field: reqTimeSec
|
||||
resolution: millisecond
|
||||
ignore_missing: true
|
||||
|
||||
transform:
|
||||
- field: reqTimeSec
|
||||
type: epoch, millisecond
|
||||
index: timestamp
|
||||
"#;
|
||||
let input_value: serde_json::Value = serde_json::from_str(input_value_str).unwrap();
|
||||
let pipeline: Pipeline<GreptimeTransformer> =
|
||||
parse(&Content::Yaml(pipeline_yaml.into())).unwrap();
|
||||
let mut payload = pipeline.init_intermediate_state();
|
||||
pipeline.prepare(input_value, &mut payload).unwrap();
|
||||
assert_eq!(&["reqTimeSec"].to_vec(), pipeline.required_keys());
|
||||
assert_eq!(payload, vec![Value::String("1573840000.000".to_string())]);
|
||||
let result = pipeline.exec_mut(&mut payload).unwrap();
|
||||
|
||||
assert_eq!(
|
||||
result.values[0].value_data,
|
||||
Some(ValueData::TimestampMillisecondValue(1573840000000))
|
||||
);
|
||||
_ => panic!("expect null value"),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -541,21 +425,19 @@ transform:
|
||||
#[test]
|
||||
fn test_csv_pipeline() {
|
||||
let input_value_str = r#"
|
||||
{
|
||||
"my_field": "1,2",
|
||||
"foo": "bar"
|
||||
}
|
||||
"#;
|
||||
{
|
||||
"my_field": "1,2",
|
||||
"foo": "bar"
|
||||
}
|
||||
"#;
|
||||
let input_value: serde_json::Value = serde_json::from_str(input_value_str).unwrap();
|
||||
|
||||
let pipeline_yaml = r#"
|
||||
---
|
||||
description: Pipeline for Apache Tomcat
|
||||
|
||||
processors:
|
||||
- csv:
|
||||
field: my_field,my_field, field1, field2
|
||||
|
||||
field: my_field
|
||||
target_fields: field1, field2
|
||||
transform:
|
||||
- field: field1
|
||||
type: uint32
|
||||
@@ -565,8 +447,22 @@ transform:
|
||||
|
||||
let pipeline: Pipeline<GreptimeTransformer> =
|
||||
parse(&Content::Yaml(pipeline_yaml.into())).unwrap();
|
||||
let output = pipeline.exec(input_value.try_into().unwrap());
|
||||
assert!(output.is_ok());
|
||||
let mut payload = pipeline.init_intermediate_state();
|
||||
pipeline.prepare(input_value, &mut payload).unwrap();
|
||||
assert_eq!(&["my_field"].to_vec(), pipeline.required_keys());
|
||||
assert_eq!(
|
||||
payload,
|
||||
vec![Value::String("1,2".to_string()), Value::Null, Value::Null]
|
||||
);
|
||||
let result = pipeline.exec_mut(&mut payload).unwrap();
|
||||
assert_eq!(result.values[0].value_data, Some(ValueData::U32Value(1)));
|
||||
assert_eq!(result.values[1].value_data, Some(ValueData::U32Value(2)));
|
||||
match &result.values[2].value_data {
|
||||
Some(ValueData::TimestampNanosecondValue(v)) => {
|
||||
assert_ne!(*v, 0);
|
||||
}
|
||||
_ => panic!("expect null value"),
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
@@ -596,7 +492,14 @@ transform:
|
||||
|
||||
let pipeline: Pipeline<GreptimeTransformer> =
|
||||
parse(&Content::Yaml(pipeline_yaml.into())).unwrap();
|
||||
let output = pipeline.exec(input_value.try_into().unwrap()).unwrap();
|
||||
let schema = pipeline.schemas().clone();
|
||||
let mut result = pipeline.init_intermediate_state();
|
||||
pipeline.prepare(input_value, &mut result).unwrap();
|
||||
let row = pipeline.exec_mut(&mut result).unwrap();
|
||||
let output = Rows {
|
||||
schema,
|
||||
rows: vec![row],
|
||||
};
|
||||
let schemas = output.schema;
|
||||
|
||||
assert_eq!(schemas.len(), 1);
|
||||
|
||||
@@ -12,69 +12,12 @@
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
use std::collections::BTreeMap;
|
||||
use std::ops::Deref;
|
||||
use std::str::FromStr;
|
||||
|
||||
use ahash::{HashSet, HashSetExt};
|
||||
use itertools::Itertools;
|
||||
|
||||
/// Newtype around a list of [`Field`]s.
#[derive(Debug, Default, Clone)]
pub struct Fields(Vec<Field>);
|
||||
|
||||
impl Fields {
|
||||
pub(crate) fn new(fields: Vec<Field>) -> Result<Self, String> {
|
||||
let ff = Fields(fields);
|
||||
ff.check()
|
||||
}
|
||||
|
||||
pub(crate) fn one(field: Field) -> Self {
|
||||
Fields(vec![field])
|
||||
}
|
||||
|
||||
pub(crate) fn get_target_fields(&self) -> Vec<&str> {
|
||||
self.0.iter().map(|f| f.get_target_field()).collect()
|
||||
}
|
||||
|
||||
fn check(self) -> Result<Self, String> {
|
||||
if self.0.is_empty() {
|
||||
return Err("fields must not be empty".to_string());
|
||||
}
|
||||
|
||||
let mut set = HashSet::new();
|
||||
for f in self.0.iter() {
|
||||
if set.contains(&f.input_field.name) {
|
||||
return Err(format!(
|
||||
"field name must be unique, but got duplicated: {}",
|
||||
f.input_field.name
|
||||
));
|
||||
}
|
||||
set.insert(&f.input_field.name);
|
||||
}
|
||||
|
||||
Ok(self)
|
||||
}
|
||||
}
|
||||
|
||||
impl std::fmt::Display for Fields {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
|
||||
let s = self.0.iter().map(|f| f.to_string()).join(";");
|
||||
write!(f, "{s}")
|
||||
}
|
||||
}
|
||||
|
||||
/// Gives read-only access to the wrapped `Vec<Field>`.
impl std::ops::Deref for Fields {
    type Target = Vec<Field>;

    /// Borrows the inner vector.
    fn deref(&self) -> &Self::Target {
        &self.0
    }
}
|
||||
|
||||
/// Gives mutable access to the wrapped `Vec<Field>`.
impl std::ops::DerefMut for Fields {
    /// Mutably borrows the inner vector.
    fn deref_mut(&mut self) -> &mut Self::Target {
        &mut self.0
    }
}
|
||||
use crate::etl::find_key_index;
|
||||
|
||||
/// Information about the input field including the name and index in intermediate keys.
|
||||
#[derive(Debug, Default, Clone)]
|
||||
pub struct InputFieldInfo {
|
||||
pub(crate) name: String,
|
||||
@@ -82,132 +25,202 @@ pub struct InputFieldInfo {
|
||||
}
|
||||
|
||||
impl InputFieldInfo {
|
||||
/// Create a new input field info with the given field name and index.
|
||||
pub(crate) fn new(field: impl Into<String>, index: usize) -> Self {
|
||||
InputFieldInfo {
|
||||
name: field.into(),
|
||||
index,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn name(field: impl Into<String>) -> Self {
|
||||
InputFieldInfo {
|
||||
name: field.into(),
|
||||
index: 0,
|
||||
/// Information about a field that has one input and one output.
#[derive(Debug, Default, Clone)]
pub struct OneInputOneOutputField {
    /// Name and index of the input field.
    input: InputFieldInfo,
    /// Name and index of the output field. When `None`, the accessors fall
    /// back to the input field's name and index.
    output: Option<(String, usize)>,
}
|
||||
|
||||
impl OneInputOneOutputField {
|
||||
/// Create a new field with the given input and output.
|
||||
pub(crate) fn new(input: InputFieldInfo, output: (String, usize)) -> Self {
|
||||
OneInputOneOutputField {
|
||||
input,
|
||||
output: Some(output),
|
||||
}
|
||||
}
|
||||
|
||||
/// Build a new field with the given processor kind, intermediate keys, input field, and target field.
|
||||
pub(crate) fn build(
|
||||
processor_kind: &str,
|
||||
intermediate_keys: &[String],
|
||||
input_field: &str,
|
||||
target_field: &str,
|
||||
) -> Result<Self, String> {
|
||||
let input_index = find_key_index(intermediate_keys, input_field, processor_kind)?;
|
||||
|
||||
let input_field_info = InputFieldInfo::new(input_field, input_index);
|
||||
let output_index = find_key_index(intermediate_keys, target_field, processor_kind)?;
|
||||
Ok(OneInputOneOutputField::new(
|
||||
input_field_info,
|
||||
(target_field.to_string(), output_index),
|
||||
))
|
||||
}
|
||||
|
||||
/// Get the input field information.
|
||||
pub(crate) fn input(&self) -> &InputFieldInfo {
|
||||
&self.input
|
||||
}
|
||||
|
||||
/// Get the index of the input field.
|
||||
pub(crate) fn input_index(&self) -> usize {
|
||||
self.input.index
|
||||
}
|
||||
|
||||
/// Get the name of the input field.
|
||||
pub(crate) fn input_name(&self) -> &str {
|
||||
&self.input.name
|
||||
}
|
||||
|
||||
/// Get the index of the output field.
|
||||
pub(crate) fn output_index(&self) -> usize {
|
||||
*self.output().1
|
||||
}
|
||||
|
||||
/// Get the name of the output field.
|
||||
pub(crate) fn output_name(&self) -> &str {
|
||||
self.output().0
|
||||
}
|
||||
|
||||
/// Get the output field information.
|
||||
pub(crate) fn output(&self) -> (&String, &usize) {
|
||||
if let Some((name, index)) = &self.output {
|
||||
(name, index)
|
||||
} else {
|
||||
(&self.input.name, &self.input.index)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Used to represent the input and output fields of a processor or transform.
|
||||
/// Information about a field that has one input and multiple outputs.
#[derive(Debug, Default, Clone)]
pub struct OneInputMultiOutputField {
    /// Name and index of the input field.
    input: InputFieldInfo,
    /// Typically, processors that output multiple keys need to be distinguished by splicing the keys together.
    /// When `None`, `target_prefix` falls back to the input field's name.
    prefix: Option<String>,
}
|
||||
|
||||
impl OneInputMultiOutputField {
|
||||
/// Create a new field with the given input and prefix.
|
||||
pub(crate) fn new(input: InputFieldInfo, prefix: Option<String>) -> Self {
|
||||
OneInputMultiOutputField { input, prefix }
|
||||
}
|
||||
|
||||
/// Get the input field information.
|
||||
pub(crate) fn input(&self) -> &InputFieldInfo {
|
||||
&self.input
|
||||
}
|
||||
|
||||
/// Get the index of the input field.
|
||||
pub(crate) fn input_index(&self) -> usize {
|
||||
self.input.index
|
||||
}
|
||||
|
||||
/// Get the name of the input field.
|
||||
pub(crate) fn input_name(&self) -> &str {
|
||||
&self.input.name
|
||||
}
|
||||
|
||||
/// Get the prefix for the output fields.
|
||||
pub(crate) fn target_prefix(&self) -> &str {
|
||||
self.prefix.as_deref().unwrap_or(&self.input.name)
|
||||
}
|
||||
}
|
||||
|
||||
/// Raw processor-defined inputs and outputs
|
||||
#[derive(Debug, Default, Clone)]
|
||||
pub struct Field {
|
||||
/// The input field name and index.
|
||||
pub input_field: InputFieldInfo,
|
||||
|
||||
/// The output field name and index mapping.
|
||||
pub output_fields_index_mapping: BTreeMap<String, usize>,
|
||||
|
||||
// rename
|
||||
pub target_field: Option<String>,
|
||||
|
||||
// 1-to-many mapping
|
||||
// processors:
|
||||
// - csv
|
||||
pub target_fields: Option<Vec<String>>,
|
||||
pub(crate) input_field: String,
|
||||
pub(crate) target_field: Option<String>,
|
||||
}
|
||||
|
||||
impl Field {
|
||||
pub(crate) fn new(field: impl Into<String>) -> Self {
|
||||
Field {
|
||||
input_field: InputFieldInfo::name(field.into()),
|
||||
output_fields_index_mapping: BTreeMap::new(),
|
||||
target_field: None,
|
||||
target_fields: None,
|
||||
}
|
||||
}
|
||||
|
||||
/// target column_name in processor or transform
|
||||
/// if target_field is None, return input field name
|
||||
pub(crate) fn get_target_field(&self) -> &str {
|
||||
self.target_field
|
||||
.as_deref()
|
||||
.unwrap_or(&self.input_field.name)
|
||||
}
|
||||
|
||||
/// input column_name in processor or transform
|
||||
pub(crate) fn get_field_name(&self) -> &str {
|
||||
&self.input_field.name
|
||||
}
|
||||
|
||||
/// set input column index in processor or transform
|
||||
pub(crate) fn set_input_index(&mut self, index: usize) {
|
||||
self.input_field.index = index;
|
||||
}
|
||||
|
||||
pub(crate) fn set_output_index(&mut self, key: &str, index: usize) {
|
||||
if let Some(v) = self.output_fields_index_mapping.get_mut(key) {
|
||||
*v = index;
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn insert_output_index(&mut self, key: String, index: usize) {
|
||||
self.output_fields_index_mapping.insert(key, index);
|
||||
}
|
||||
}
|
||||
|
||||
impl std::str::FromStr for Field {
|
||||
impl FromStr for Field {
|
||||
type Err = String;
|
||||
|
||||
fn from_str(s: &str) -> Result<Self, Self::Err> {
|
||||
let mut parts = s.split(',');
|
||||
let field = parts.next().ok_or("field is missing")?.trim().to_string();
|
||||
let input_field = parts
|
||||
.next()
|
||||
.ok_or("input field is missing")?
|
||||
.trim()
|
||||
.to_string();
|
||||
let target_field = parts.next().map(|x| x.trim().to_string());
|
||||
|
||||
if field.is_empty() {
|
||||
return Err("field is empty".to_string());
|
||||
if input_field.is_empty() {
|
||||
return Err("input field is empty".to_string());
|
||||
}
|
||||
|
||||
let renamed_field = match parts.next() {
|
||||
Some(s) if !s.trim().is_empty() => Some(s.trim().to_string()),
|
||||
_ => None,
|
||||
};
|
||||
|
||||
// TODO(qtang): ???? what's this?
|
||||
// weird design? field: <field>,<target_field>,<target_fields>,<target_fields>....
|
||||
// and only use in csv processor
|
||||
let fields: Vec<_> = parts
|
||||
.map(|s| s.trim())
|
||||
.filter(|s| !s.is_empty())
|
||||
.map(|s| s.to_string())
|
||||
.collect();
|
||||
let target_fields = if fields.is_empty() {
|
||||
None
|
||||
} else {
|
||||
Some(fields)
|
||||
};
|
||||
|
||||
Ok(Field {
|
||||
input_field: InputFieldInfo::name(field),
|
||||
output_fields_index_mapping: BTreeMap::new(),
|
||||
target_field: renamed_field,
|
||||
target_fields,
|
||||
input_field,
|
||||
target_field,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
impl std::fmt::Display for Field {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
|
||||
match (&self.target_field, &self.target_fields) {
|
||||
(Some(target_field), None) => write!(f, "{}, {target_field}", self.input_field.name),
|
||||
(None, Some(target_fields)) => {
|
||||
write!(
|
||||
f,
|
||||
"{}, {}",
|
||||
self.input_field.name,
|
||||
target_fields.iter().join(",")
|
||||
)
|
||||
}
|
||||
_ => write!(f, "{}", self.input_field.name),
|
||||
impl Field {
|
||||
/// Create a new field with the given input and target fields.
|
||||
pub(crate) fn new(input_field: impl Into<String>, target_field: Option<String>) -> Self {
|
||||
Field {
|
||||
input_field: input_field.into(),
|
||||
target_field,
|
||||
}
|
||||
}
|
||||
|
||||
/// Get the input field.
|
||||
pub(crate) fn input_field(&self) -> &str {
|
||||
&self.input_field
|
||||
}
|
||||
|
||||
/// Get the target field.
|
||||
pub(crate) fn target_field(&self) -> Option<&str> {
|
||||
self.target_field.as_deref()
|
||||
}
|
||||
|
||||
/// Get the target field or the input field if the target field is not set.
|
||||
pub(crate) fn target_or_input_field(&self) -> &str {
|
||||
self.target_field.as_deref().unwrap_or(&self.input_field)
|
||||
}
|
||||
}
|
||||
|
||||
/// A collection of fields, stored as a newtype around `Vec<Field>`.
#[derive(Debug, Default, Clone)]
pub struct Fields(Vec<Field>);
|
||||
|
||||
impl Fields {
|
||||
pub(crate) fn new(fields: Vec<Field>) -> Self {
|
||||
Fields(fields)
|
||||
}
|
||||
|
||||
pub(crate) fn one(field: Field) -> Self {
|
||||
Fields(vec![field])
|
||||
}
|
||||
}
|
||||
|
||||
/// Gives read-only access to the wrapped `Vec<Field>`.
impl Deref for Fields {
    type Target = Vec<Field>;

    /// Borrows the inner vector.
    fn deref(&self) -> &Self::Target {
        &self.0
    }
}
|
||||
|
||||
/// Consuming iteration over the wrapped fields, in vector order.
impl IntoIterator for Fields {
    type Item = Field;
    type IntoIter = std::vec::IntoIter<Field>;

    /// Consumes the collection and yields each owned `Field`.
    fn into_iter(self) -> Self::IntoIter {
        self.0.into_iter()
    }
}
|
||||
|
||||
#[cfg(test)]
|
||||
@@ -227,35 +240,14 @@ mod tests {
|
||||
|
||||
let cases = [
|
||||
// ("field", "field", None, None),
|
||||
(
|
||||
"field, target_field",
|
||||
"field",
|
||||
Some("target_field".into()),
|
||||
None,
|
||||
),
|
||||
(
|
||||
"field, target_field1, target_field2, target_field3",
|
||||
"field",
|
||||
Some("target_field1".into()),
|
||||
Some(vec!["target_field2".into(), "target_field3".into()]),
|
||||
),
|
||||
(
|
||||
"field,, target_field1, target_field2, target_field3",
|
||||
"field",
|
||||
None,
|
||||
Some(vec![
|
||||
"target_field1".into(),
|
||||
"target_field2".into(),
|
||||
"target_field3".into(),
|
||||
]),
|
||||
),
|
||||
("field, target_field", "field", Some("target_field")),
|
||||
("field", "field", None),
|
||||
];
|
||||
|
||||
for (s, field, target_field, target_fields) in cases.into_iter() {
|
||||
for (s, field, target_field) in cases.into_iter() {
|
||||
let f: Field = s.parse().unwrap();
|
||||
assert_eq!(f.get_field_name(), field, "{s}");
|
||||
assert_eq!(f.target_field, target_field, "{s}");
|
||||
assert_eq!(f.target_fields, target_fields, "{s}");
|
||||
assert_eq!(f.input_field(), field, "{s}");
|
||||
assert_eq!(f.target_field(), target_field, "{s}");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -25,22 +25,22 @@ pub mod timestamp;
|
||||
pub mod urlencoding;
|
||||
|
||||
use ahash::{HashSet, HashSetExt};
|
||||
use cmcd::CmcdProcessor;
|
||||
use csv::CsvProcessor;
|
||||
use date::DateProcessor;
|
||||
use dissect::DissectProcessor;
|
||||
use cmcd::{CmcdProcessor, CmcdProcessorBuilder};
|
||||
use csv::{CsvProcessor, CsvProcessorBuilder};
|
||||
use date::{DateProcessor, DateProcessorBuilder};
|
||||
use dissect::{DissectProcessor, DissectProcessorBuilder};
|
||||
use enum_dispatch::enum_dispatch;
|
||||
use epoch::EpochProcessor;
|
||||
use gsub::GsubProcessor;
|
||||
use epoch::{EpochProcessor, EpochProcessorBuilder};
|
||||
use gsub::{GsubProcessor, GsubProcessorBuilder};
|
||||
use itertools::Itertools;
|
||||
use join::JoinProcessor;
|
||||
use letter::LetterProcessor;
|
||||
use regex::RegexProcessor;
|
||||
use timestamp::TimestampProcessor;
|
||||
use urlencoding::UrlEncodingProcessor;
|
||||
use join::{JoinProcessor, JoinProcessorBuilder};
|
||||
use letter::{LetterProcessor, LetterProcessorBuilder};
|
||||
use regex::{RegexProcessor, RegexProcessorBuilder};
|
||||
use timestamp::{TimestampProcessor, TimestampProcessorBuilder};
|
||||
use urlencoding::{UrlEncodingProcessor, UrlEncodingProcessorBuilder};
|
||||
|
||||
use crate::etl::field::{Field, Fields};
|
||||
use crate::etl::value::{Map, Value};
|
||||
use super::field::{Field, Fields};
|
||||
use crate::etl::value::Value;
|
||||
|
||||
const FIELD_NAME: &str = "field";
|
||||
const FIELDS_NAME: &str = "fields";
|
||||
@@ -49,6 +49,7 @@ const METHOD_NAME: &str = "method";
|
||||
const PATTERN_NAME: &str = "pattern";
|
||||
const PATTERNS_NAME: &str = "patterns";
|
||||
const SEPARATOR_NAME: &str = "separator";
|
||||
const TARGET_FIELDS_NAME: &str = "target_fields";
|
||||
|
||||
// const IF_NAME: &str = "if";
|
||||
// const IGNORE_FAILURE_NAME: &str = "ignore_failure";
|
||||
@@ -62,55 +63,14 @@ const SEPARATOR_NAME: &str = "separator";
|
||||
/// The output of a processor is a map of key-value pairs that will be merged into the document when you use exec_map method.
|
||||
#[enum_dispatch(ProcessorKind)]
|
||||
pub trait Processor: std::fmt::Debug + Send + Sync + 'static {
|
||||
/// Get the processor's fields
|
||||
/// fields is just the same processor for multiple keys. It is not the case that a processor has multiple inputs
|
||||
fn fields(&self) -> &Fields;
|
||||
|
||||
/// Get the processor's fields mutably
|
||||
fn fields_mut(&mut self) -> &mut Fields;
|
||||
|
||||
/// Get the processor's kind
|
||||
fn kind(&self) -> &str;
|
||||
|
||||
/// Whether to ignore missing
|
||||
fn ignore_missing(&self) -> bool;
|
||||
|
||||
/// processor all output keys
|
||||
/// if a processor has multiple output keys, it should return all of them
|
||||
fn output_keys(&self) -> HashSet<String>;
|
||||
|
||||
/// Execute the processor on a document
|
||||
/// and return a map of key-value pairs
|
||||
fn exec_field(&self, val: &Value, field: &Field) -> Result<Map, String>;
|
||||
|
||||
/// Execute the processor on a vector which has been preprocessed by the pipeline
|
||||
fn exec_mut(&self, val: &mut Vec<Value>) -> Result<(), String>;
|
||||
|
||||
/// Execute the processor on a map
|
||||
/// and merge the output into the original map
|
||||
fn exec_map(&self, map: &mut Map) -> Result<(), String> {
|
||||
for ff @ Field {
|
||||
input_field: field_info,
|
||||
..
|
||||
} in self.fields().iter()
|
||||
{
|
||||
match map.get(&field_info.name) {
|
||||
Some(v) => {
|
||||
map.extend(self.exec_field(v, ff)?);
|
||||
}
|
||||
None if self.ignore_missing() => {}
|
||||
None => {
|
||||
return Err(format!(
|
||||
"{} processor: field '{}' is required but missing in {map}",
|
||||
self.kind(),
|
||||
field_info.name,
|
||||
))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
@@ -129,6 +89,42 @@ pub enum ProcessorKind {
|
||||
Date(DateProcessor),
|
||||
}
|
||||
|
||||
/// Interface shared by every processor builder.
|
||||
/// A builder holds a processor's parsed configuration and is consumed by `build` to create the processor.
|
||||
#[enum_dispatch(ProcessorBuilders)]
|
||||
pub trait ProcessorBuilder: std::fmt::Debug + Send + Sync + 'static {
|
||||
/// Names of the keys the processor will write.
|
||||
fn output_keys(&self) -> HashSet<&str>;
|
||||
/// Names of the keys the processor will read.
|
||||
fn input_keys(&self) -> HashSet<&str>;
|
||||
/// Consumes the builder and creates the processor.
/// The cmcd and csv builders in this file look their input/output key names up in
/// `intermediate_keys`; an `Err` carries a human-readable message.
|
||||
fn build(self, intermediate_keys: &[String]) -> Result<ProcessorKind, String>;
|
||||
}
|
||||
|
||||
/// One variant per supported processor kind, each wrapping the builder for that kind.
/// `parse_processor` selects the variant from the processor's YAML key.
#[derive(Debug)]
|
||||
#[enum_dispatch]
|
||||
pub enum ProcessorBuilders {
|
||||
Cmcd(CmcdProcessorBuilder),
|
||||
Csv(CsvProcessorBuilder),
|
||||
Dissect(DissectProcessorBuilder),
|
||||
Gsub(GsubProcessorBuilder),
|
||||
Join(JoinProcessorBuilder),
|
||||
Letter(LetterProcessorBuilder),
|
||||
Regex(RegexProcessorBuilder),
|
||||
Timestamp(TimestampProcessorBuilder),
|
||||
UrlEncoding(UrlEncodingProcessorBuilder),
|
||||
Epoch(EpochProcessorBuilder),
|
||||
Date(DateProcessorBuilder),
|
||||
}
|
||||
|
||||
/// Processor builders parsed from a YAML list, together with the key sets collected from them.
/// Built through `TryFrom<&Vec<yaml_rust::Yaml>>`.
#[derive(Debug, Default)]
|
||||
pub struct ProcessorBuilderList {
|
||||
/// Builders in the order their processors appear in the YAML list.
pub(crate) processor_builders: Vec<ProcessorBuilders>,
|
||||
/// Sorted names of every key read by any builder.
pub(crate) input_keys: Vec<String>,
|
||||
/// Sorted names of every key written by any builder.
pub(crate) output_keys: Vec<String>,
|
||||
/// Sorted input keys that were not an output of any preceding builder when they were read.
pub(crate) original_input_keys: Vec<String>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Default)]
|
||||
pub struct Processors {
|
||||
/// An ordered list of processors
|
||||
@@ -174,52 +170,63 @@ impl Processors {
|
||||
}
|
||||
}
|
||||
|
||||
impl TryFrom<&Vec<yaml_rust::Yaml>> for Processors {
|
||||
impl TryFrom<&Vec<yaml_rust::Yaml>> for ProcessorBuilderList {
|
||||
type Error = String;
|
||||
|
||||
fn try_from(vec: &Vec<yaml_rust::Yaml>) -> Result<Self, Self::Error> {
|
||||
let mut processors = vec![];
|
||||
let mut processors_builders = vec![];
|
||||
let mut all_output_keys = HashSet::with_capacity(50);
|
||||
let mut all_required_keys = HashSet::with_capacity(50);
|
||||
let mut all_required_original_keys = HashSet::with_capacity(50);
|
||||
for doc in vec {
|
||||
let processor = parse_processor(doc)?;
|
||||
|
||||
// get all required keys
|
||||
let processor_required_keys: Vec<String> = processor
|
||||
.fields()
|
||||
.iter()
|
||||
.map(|f| f.input_field.name.clone())
|
||||
.collect();
|
||||
|
||||
for key in &processor_required_keys {
|
||||
if !all_output_keys.contains(key) {
|
||||
all_required_original_keys.insert(key.clone());
|
||||
}
|
||||
}
|
||||
|
||||
all_required_keys.extend(processor_required_keys);
|
||||
|
||||
let processor_output_keys = processor.output_keys().into_iter();
|
||||
all_output_keys.extend(processor_output_keys);
|
||||
|
||||
processors.push(processor);
|
||||
processors_builders.push(processor);
|
||||
}
|
||||
|
||||
let all_required_keys = all_required_keys.into_iter().sorted().collect();
|
||||
let all_output_keys = all_output_keys.into_iter().sorted().collect();
|
||||
let all_required_original_keys = all_required_original_keys.into_iter().sorted().collect();
|
||||
for processor in processors_builders.iter() {
|
||||
{
|
||||
// get all required keys
|
||||
let processor_required_keys = processor.input_keys();
|
||||
|
||||
Ok(Processors {
|
||||
processors,
|
||||
required_keys: all_required_keys,
|
||||
for key in &processor_required_keys {
|
||||
if !all_output_keys.contains(key) {
|
||||
all_required_original_keys.insert(*key);
|
||||
}
|
||||
}
|
||||
|
||||
all_required_keys.extend(processor_required_keys);
|
||||
|
||||
let processor_output_keys = processor.output_keys().into_iter();
|
||||
all_output_keys.extend(processor_output_keys);
|
||||
}
|
||||
}
|
||||
|
||||
let all_required_keys = all_required_keys
|
||||
.into_iter()
|
||||
.map(|x| x.to_string())
|
||||
.sorted()
|
||||
.collect();
|
||||
let all_output_keys = all_output_keys
|
||||
.into_iter()
|
||||
.map(|x| x.to_string())
|
||||
.sorted()
|
||||
.collect();
|
||||
let all_required_original_keys = all_required_original_keys
|
||||
.into_iter()
|
||||
.map(|x| x.to_string())
|
||||
.sorted()
|
||||
.collect();
|
||||
|
||||
Ok(ProcessorBuilderList {
|
||||
processor_builders: processors_builders,
|
||||
input_keys: all_required_keys,
|
||||
output_keys: all_output_keys,
|
||||
required_original_keys: all_required_original_keys,
|
||||
original_input_keys: all_required_original_keys,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
fn parse_processor(doc: &yaml_rust::Yaml) -> Result<ProcessorKind, String> {
|
||||
fn parse_processor(doc: &yaml_rust::Yaml) -> Result<ProcessorBuilders, String> {
|
||||
let map = doc.as_hash().ok_or("processor must be a map".to_string())?;
|
||||
|
||||
let key = map
|
||||
@@ -238,20 +245,24 @@ fn parse_processor(doc: &yaml_rust::Yaml) -> Result<ProcessorKind, String> {
|
||||
.ok_or("processor key must be a string".to_string())?;
|
||||
|
||||
let processor = match str_key {
|
||||
cmcd::PROCESSOR_CMCD => ProcessorKind::Cmcd(CmcdProcessor::try_from(value)?),
|
||||
csv::PROCESSOR_CSV => ProcessorKind::Csv(CsvProcessor::try_from(value)?),
|
||||
dissect::PROCESSOR_DISSECT => ProcessorKind::Dissect(DissectProcessor::try_from(value)?),
|
||||
epoch::PROCESSOR_EPOCH => ProcessorKind::Epoch(EpochProcessor::try_from(value)?),
|
||||
date::PROCESSOR_DATE => ProcessorKind::Date(DateProcessor::try_from(value)?),
|
||||
gsub::PROCESSOR_GSUB => ProcessorKind::Gsub(GsubProcessor::try_from(value)?),
|
||||
join::PROCESSOR_JOIN => ProcessorKind::Join(JoinProcessor::try_from(value)?),
|
||||
letter::PROCESSOR_LETTER => ProcessorKind::Letter(LetterProcessor::try_from(value)?),
|
||||
regex::PROCESSOR_REGEX => ProcessorKind::Regex(RegexProcessor::try_from(value)?),
|
||||
cmcd::PROCESSOR_CMCD => ProcessorBuilders::Cmcd(CmcdProcessorBuilder::try_from(value)?),
|
||||
csv::PROCESSOR_CSV => ProcessorBuilders::Csv(CsvProcessorBuilder::try_from(value)?),
|
||||
dissect::PROCESSOR_DISSECT => {
|
||||
ProcessorBuilders::Dissect(DissectProcessorBuilder::try_from(value)?)
|
||||
}
|
||||
epoch::PROCESSOR_EPOCH => ProcessorBuilders::Epoch(EpochProcessorBuilder::try_from(value)?),
|
||||
date::PROCESSOR_DATE => ProcessorBuilders::Date(DateProcessorBuilder::try_from(value)?),
|
||||
gsub::PROCESSOR_GSUB => ProcessorBuilders::Gsub(GsubProcessorBuilder::try_from(value)?),
|
||||
join::PROCESSOR_JOIN => ProcessorBuilders::Join(JoinProcessorBuilder::try_from(value)?),
|
||||
letter::PROCESSOR_LETTER => {
|
||||
ProcessorBuilders::Letter(LetterProcessorBuilder::try_from(value)?)
|
||||
}
|
||||
regex::PROCESSOR_REGEX => ProcessorBuilders::Regex(RegexProcessorBuilder::try_from(value)?),
|
||||
timestamp::PROCESSOR_TIMESTAMP => {
|
||||
ProcessorKind::Timestamp(TimestampProcessor::try_from(value)?)
|
||||
ProcessorBuilders::Timestamp(TimestampProcessorBuilder::try_from(value)?)
|
||||
}
|
||||
urlencoding::PROCESSOR_URL_ENCODING => {
|
||||
ProcessorKind::UrlEncoding(UrlEncodingProcessor::try_from(value)?)
|
||||
ProcessorBuilders::UrlEncoding(UrlEncodingProcessorBuilder::try_from(value)?)
|
||||
}
|
||||
_ => return Err(format!("unsupported {} processor", str_key)),
|
||||
};
|
||||
@@ -301,19 +312,10 @@ where
|
||||
})
|
||||
}
|
||||
|
||||
pub(crate) fn yaml_fields(v: &yaml_rust::Yaml, field: &str) -> Result<Fields, String> {
|
||||
let v = yaml_parse_strings(v, field)?;
|
||||
Fields::new(v)
|
||||
pub(crate) fn yaml_new_fields(v: &yaml_rust::Yaml, field: &str) -> Result<Fields, String> {
|
||||
yaml_parse_strings(v, field).map(Fields::new)
|
||||
}
|
||||
|
||||
pub(crate) fn yaml_field(v: &yaml_rust::Yaml, field: &str) -> Result<Field, String> {
|
||||
pub(crate) fn yaml_new_field(v: &yaml_rust::Yaml, field: &str) -> Result<Field, String> {
|
||||
yaml_parse_string(v, field)
|
||||
}
|
||||
|
||||
pub(crate) fn update_one_one_output_keys(fields: &mut Fields) {
|
||||
for field in fields.iter_mut() {
|
||||
field
|
||||
.output_fields_index_mapping
|
||||
.insert(field.get_target_field().to_string(), 0_usize);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -12,14 +12,18 @@
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
use std::collections::BTreeMap;
|
||||
|
||||
use ahash::HashSet;
|
||||
use urlencoding::decode;
|
||||
|
||||
use crate::etl::field::{Field, Fields};
|
||||
use crate::etl::field::{Field, Fields, InputFieldInfo, OneInputMultiOutputField};
|
||||
use crate::etl::find_key_index;
|
||||
use crate::etl::processor::{
|
||||
yaml_bool, yaml_field, yaml_fields, FIELDS_NAME, FIELD_NAME, IGNORE_MISSING_NAME,
|
||||
yaml_bool, yaml_new_field, yaml_new_fields, Processor, ProcessorBuilder, ProcessorKind,
|
||||
FIELDS_NAME, FIELD_NAME, IGNORE_MISSING_NAME,
|
||||
};
|
||||
use crate::etl::value::{Map, Value};
|
||||
use crate::etl::value::Value;
|
||||
|
||||
pub(crate) const PROCESSOR_CMCD: &str = "cmcd";
|
||||
|
||||
@@ -63,6 +67,178 @@ const CMCD_KEYS: [&str; 18] = [
|
||||
CMCD_KEY_V,
|
||||
];
|
||||
|
||||
/// Builder for `CmcdProcessor`.
|
||||
/// It is parsed from the processor's raw YAML configuration (see the `TryFrom<&yaml_rust::yaml::Hash>` impl).
|
||||
#[derive(Debug, Default)]
|
||||
pub struct CmcdProcessorBuilder {
|
||||
/// Fields the processor is applied to.
fields: Fields,
|
||||
/// Every `{target_or_input_field}_{cmcd_key}` name, for each field and each key in `CMCD_KEYS`.
output_keys: HashSet<String>,
|
||||
/// Value of the `ignore_missing` option; `false` when the option is absent.
ignore_missing: bool,
|
||||
}
|
||||
|
||||
impl CmcdProcessorBuilder {
|
||||
/// build_cmcd_outputs build cmcd output info
|
||||
/// generate index and function for each output
|
||||
pub(super) fn build_cmcd_outputs(
|
||||
field: &Field,
|
||||
intermediate_keys: &[String],
|
||||
) -> Result<(BTreeMap<String, usize>, Vec<CmcdOutputInfo>), String> {
|
||||
let mut output_index = BTreeMap::new();
|
||||
let mut cmcd_field_outputs = Vec::with_capacity(CMCD_KEYS.len());
|
||||
for cmcd in CMCD_KEYS {
|
||||
let final_key = generate_key(field.target_or_input_field(), cmcd);
|
||||
let index = find_key_index(intermediate_keys, &final_key, "cmcd")?;
|
||||
output_index.insert(final_key.clone(), index);
|
||||
match cmcd {
|
||||
CMCD_KEY_BS | CMCD_KEY_SU => {
|
||||
let output_info = CmcdOutputInfo::new(final_key, cmcd, index, bs_su);
|
||||
cmcd_field_outputs.push(output_info);
|
||||
}
|
||||
CMCD_KEY_BR | CMCD_KEY_BL | CMCD_KEY_D | CMCD_KEY_DL | CMCD_KEY_MTP
|
||||
| CMCD_KEY_RTP | CMCD_KEY_TB => {
|
||||
let output_info = CmcdOutputInfo::new(final_key, cmcd, index, br_tb);
|
||||
cmcd_field_outputs.push(output_info);
|
||||
}
|
||||
CMCD_KEY_CID | CMCD_KEY_NRR | CMCD_KEY_OT | CMCD_KEY_SF | CMCD_KEY_SID
|
||||
| CMCD_KEY_ST | CMCD_KEY_V => {
|
||||
let output_info = CmcdOutputInfo::new(final_key, cmcd, index, cid_v);
|
||||
cmcd_field_outputs.push(output_info);
|
||||
}
|
||||
CMCD_KEY_NOR => {
|
||||
let output_info = CmcdOutputInfo::new(final_key, cmcd, index, nor);
|
||||
cmcd_field_outputs.push(output_info);
|
||||
}
|
||||
CMCD_KEY_PR => {
|
||||
let output_info = CmcdOutputInfo::new(final_key, cmcd, index, pr);
|
||||
cmcd_field_outputs.push(output_info);
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
}
|
||||
Ok((output_index, cmcd_field_outputs))
|
||||
}
|
||||
|
||||
/// build CmcdProcessor from CmcdProcessorBuilder
|
||||
pub fn build(self, intermediate_keys: &[String]) -> Result<CmcdProcessor, String> {
|
||||
let mut real_fields = vec![];
|
||||
let mut cmcd_outputs = Vec::with_capacity(CMCD_KEYS.len());
|
||||
for field in self.fields.into_iter() {
|
||||
let input_index = find_key_index(intermediate_keys, field.input_field(), "cmcd")?;
|
||||
|
||||
let input_field_info = InputFieldInfo::new(field.input_field(), input_index);
|
||||
|
||||
let (_, cmcd_field_outputs) = Self::build_cmcd_outputs(&field, intermediate_keys)?;
|
||||
|
||||
cmcd_outputs.push(cmcd_field_outputs);
|
||||
|
||||
let real_field = OneInputMultiOutputField::new(input_field_info, field.target_field);
|
||||
real_fields.push(real_field);
|
||||
}
|
||||
Ok(CmcdProcessor {
|
||||
fields: real_fields,
|
||||
cmcd_outputs,
|
||||
ignore_missing: self.ignore_missing,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
impl ProcessorBuilder for CmcdProcessorBuilder {
|
||||
fn output_keys(&self) -> HashSet<&str> {
|
||||
self.output_keys.iter().map(|s| s.as_str()).collect()
|
||||
}
|
||||
|
||||
fn input_keys(&self) -> HashSet<&str> {
|
||||
self.fields.iter().map(|f| f.input_field()).collect()
|
||||
}
|
||||
|
||||
fn build(self, intermediate_keys: &[String]) -> Result<ProcessorKind, String> {
|
||||
self.build(intermediate_keys).map(ProcessorKind::Cmcd)
|
||||
}
|
||||
}
|
||||
|
||||
/// Joins a field prefix and a cmcd key into the output key name `{prefix}_{key}`.
fn generate_key(prefix: &str, key: &str) -> String {
    format!("{prefix}_{key}")
}
|
||||
|
||||
/// Describes one output of the cmcd processor: where its value is written and how it is resolved.
|
||||
#[derive(Debug)]
|
||||
pub(super) struct CmcdOutputInfo {
|
||||
/// Output key name, `{target_or_input_field}_{cmcd_key}`
|
||||
final_key: String,
|
||||
/// The cmcd key this output is produced from
|
||||
key: &'static str,
|
||||
/// Index of `final_key` in `intermediate_keys`
|
||||
index: usize,
|
||||
/// Resolver called as `f(whole cmcd string, key, optional raw value)`
|
||||
f: fn(&str, &str, Option<&str>) -> Result<Value, String>,
|
||||
}
|
||||
|
||||
impl CmcdOutputInfo {
|
||||
/// Creates a descriptor from its four parts; the arguments are stored unchanged.
fn new(
|
||||
final_key: String,
|
||||
key: &'static str,
|
||||
index: usize,
|
||||
f: fn(&str, &str, Option<&str>) -> Result<Value, String>,
|
||||
) -> Self {
|
||||
Self {
|
||||
final_key,
|
||||
key,
|
||||
index,
|
||||
f,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Empty descriptor: empty key strings, index 0 and a resolver that always yields `Value::Null`.
impl Default for CmcdOutputInfo {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
final_key: String::default(),
|
||||
key: "",
|
||||
index: 0,
|
||||
// The non-capturing closure coerces to the `fn` pointer type of the field.
f: |_, _, _| Ok(Value::Null),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Resolver for CMCD_KEY_BS | CMCD_KEY_SU: ignores its arguments and always returns `Value::Boolean(true)`.
|
||||
fn bs_su(_: &str, _: &str, _: Option<&str>) -> Result<Value, String> {
|
||||
Ok(Value::Boolean(true))
|
||||
}
|
||||
|
||||
/// function to resolve CMCD_KEY_BR | CMCD_KEY_BL | CMCD_KEY_D | CMCD_KEY_DL | CMCD_KEY_MTP | CMCD_KEY_RTP | CMCD_KEY_TB
|
||||
fn br_tb(s: &str, k: &str, v: Option<&str>) -> Result<Value, String> {
|
||||
let v = v.ok_or(format!("{k} missing value in {s}"))?;
|
||||
let val: i64 = v
|
||||
.parse()
|
||||
.map_err(|_| format!("failed to parse {v} as i64"))?;
|
||||
Ok(Value::Int64(val))
|
||||
}
|
||||
|
||||
/// function to resolve CMCD_KEY_CID | CMCD_KEY_NRR | CMCD_KEY_OT | CMCD_KEY_SF | CMCD_KEY_SID | CMCD_KEY_V
|
||||
fn cid_v(s: &str, k: &str, v: Option<&str>) -> Result<Value, String> {
|
||||
let v = v.ok_or(format!("{k} missing value in {s}"))?;
|
||||
Ok(Value::String(v.to_string()))
|
||||
}
|
||||
|
||||
/// function to resolve CMCD_KEY_NOR
|
||||
fn nor(s: &str, k: &str, v: Option<&str>) -> Result<Value, String> {
|
||||
let v = v.ok_or(format!("{k} missing value in {s}"))?;
|
||||
let val = match decode(v) {
|
||||
Ok(val) => val.to_string(),
|
||||
Err(_) => v.to_string(),
|
||||
};
|
||||
Ok(Value::String(val))
|
||||
}
|
||||
|
||||
/// function to resolve CMCD_KEY_PR
|
||||
fn pr(s: &str, k: &str, v: Option<&str>) -> Result<Value, String> {
|
||||
let v = v.ok_or(format!("{k} missing value in {s}"))?;
|
||||
let val: f64 = v
|
||||
.parse()
|
||||
.map_err(|_| format!("failed to parse {v} as f64"))?;
|
||||
Ok(Value::Float64(val))
|
||||
}
|
||||
|
||||
/// Common Media Client Data Specification:
|
||||
/// https://cdn.cta.tech/cta/media/media/resources/standards/pdfs/cta-5004-final.pdf
|
||||
///
|
||||
@@ -100,98 +276,43 @@ const CMCD_KEYS: [&str; 18] = [
|
||||
/// 12. Transport Layer Security SHOULD be used to protect all transmission of CMCD data.
|
||||
#[derive(Debug, Default)]
|
||||
pub struct CmcdProcessor {
|
||||
fields: Fields,
|
||||
fields: Vec<OneInputMultiOutputField>,
|
||||
cmcd_outputs: Vec<Vec<CmcdOutputInfo>>,
|
||||
|
||||
ignore_missing: bool,
|
||||
}
|
||||
|
||||
impl CmcdProcessor {
|
||||
fn with_fields(&mut self, mut fields: Fields) {
|
||||
Self::update_output_keys(&mut fields);
|
||||
self.fields = fields;
|
||||
}
|
||||
|
||||
fn with_ignore_missing(&mut self, ignore_missing: bool) {
|
||||
self.ignore_missing = ignore_missing;
|
||||
}
|
||||
|
||||
fn generate_key(prefix: &str, key: &str) -> String {
|
||||
format!("{}_{}", prefix, key)
|
||||
}
|
||||
|
||||
fn parse(prefix: &str, s: &str) -> Result<Map, String> {
|
||||
let mut map = Map::default();
|
||||
fn parse(&self, field_index: usize, s: &str) -> Result<Vec<(usize, Value)>, String> {
|
||||
let parts = s.split(',');
|
||||
let mut result = Vec::new();
|
||||
for part in parts {
|
||||
let mut kv = part.split('=');
|
||||
let k = kv.next().ok_or(format!("{part} missing key in {s}"))?;
|
||||
let v = kv.next();
|
||||
|
||||
let key = Self::generate_key(prefix, k);
|
||||
match k {
|
||||
CMCD_KEY_BS | CMCD_KEY_SU => {
|
||||
map.insert(key, Value::Boolean(true));
|
||||
for cmcd_key in self.cmcd_outputs[field_index].iter() {
|
||||
if cmcd_key.key == k {
|
||||
let val = (cmcd_key.f)(s, k, v)?;
|
||||
result.push((cmcd_key.index, val));
|
||||
}
|
||||
CMCD_KEY_BR | CMCD_KEY_BL | CMCD_KEY_D | CMCD_KEY_DL | CMCD_KEY_MTP
|
||||
| CMCD_KEY_RTP | CMCD_KEY_TB => {
|
||||
let v = v.ok_or(format!("{k} missing value in {s}"))?;
|
||||
let val: i64 = v
|
||||
.parse()
|
||||
.map_err(|_| format!("failed to parse {v} as i64"))?;
|
||||
map.insert(key, Value::Int64(val));
|
||||
}
|
||||
CMCD_KEY_CID | CMCD_KEY_NRR | CMCD_KEY_OT | CMCD_KEY_SF | CMCD_KEY_SID
|
||||
| CMCD_KEY_ST | CMCD_KEY_V => {
|
||||
let v = v.ok_or(format!("{k} missing value in {s}"))?;
|
||||
map.insert(key, Value::String(v.to_string()));
|
||||
}
|
||||
CMCD_KEY_NOR => {
|
||||
let v = v.ok_or(format!("{k} missing value in {s}"))?;
|
||||
let val = match decode(v) {
|
||||
Ok(val) => val.to_string(),
|
||||
Err(_) => v.to_string(),
|
||||
};
|
||||
map.insert(key, Value::String(val));
|
||||
}
|
||||
CMCD_KEY_PR => {
|
||||
let v = v.ok_or(format!("{k} missing value in {s}"))?;
|
||||
let val: f64 = v
|
||||
.parse()
|
||||
.map_err(|_| format!("failed to parse {v} as f64"))?;
|
||||
map.insert(key, Value::Float64(val));
|
||||
}
|
||||
_ => match v {
|
||||
Some(v) => map.insert(key, Value::String(v.to_string())),
|
||||
None => map.insert(k, Value::Boolean(true)),
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
Ok(map)
|
||||
}
|
||||
|
||||
fn process_field(&self, val: &str, field: &Field) -> Result<Map, String> {
|
||||
let prefix = field.get_target_field();
|
||||
|
||||
Self::parse(prefix, val)
|
||||
}
|
||||
|
||||
fn update_output_keys(fields: &mut Fields) {
|
||||
for field in fields.iter_mut() {
|
||||
for key in CMCD_KEYS.iter() {
|
||||
field
|
||||
.output_fields_index_mapping
|
||||
.insert(Self::generate_key(field.get_target_field(), key), 0);
|
||||
}
|
||||
}
|
||||
Ok(result)
|
||||
}
|
||||
}
|
||||
|
||||
impl TryFrom<&yaml_rust::yaml::Hash> for CmcdProcessor {
|
||||
impl TryFrom<&yaml_rust::yaml::Hash> for CmcdProcessorBuilder {
|
||||
type Error = String;
|
||||
|
||||
fn try_from(value: &yaml_rust::yaml::Hash) -> Result<Self, Self::Error> {
|
||||
let mut processor = CmcdProcessor::default();
|
||||
let mut fields = Fields::default();
|
||||
let mut ignore_missing = false;
|
||||
|
||||
for (k, v) in value.iter() {
|
||||
let key = k
|
||||
@@ -199,25 +320,40 @@ impl TryFrom<&yaml_rust::yaml::Hash> for CmcdProcessor {
|
||||
.ok_or(format!("key must be a string, but got {k:?}"))?;
|
||||
match key {
|
||||
FIELD_NAME => {
|
||||
processor.with_fields(Fields::one(yaml_field(v, FIELD_NAME)?));
|
||||
fields = Fields::one(yaml_new_field(v, FIELD_NAME)?);
|
||||
}
|
||||
FIELDS_NAME => {
|
||||
processor.with_fields(yaml_fields(v, FIELDS_NAME)?);
|
||||
fields = yaml_new_fields(v, FIELDS_NAME)?;
|
||||
}
|
||||
|
||||
IGNORE_MISSING_NAME => {
|
||||
processor.with_ignore_missing(yaml_bool(v, IGNORE_MISSING_NAME)?);
|
||||
ignore_missing = yaml_bool(v, IGNORE_MISSING_NAME)?;
|
||||
}
|
||||
|
||||
_ => {}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(processor)
|
||||
let output_keys = fields
|
||||
.iter()
|
||||
.flat_map(|f| {
|
||||
CMCD_KEYS
|
||||
.iter()
|
||||
.map(|cmcd_key| generate_key(f.target_or_input_field(), cmcd_key))
|
||||
})
|
||||
.collect();
|
||||
|
||||
let builder = CmcdProcessorBuilder {
|
||||
fields,
|
||||
output_keys,
|
||||
ignore_missing,
|
||||
};
|
||||
|
||||
Ok(builder)
|
||||
}
|
||||
}
|
||||
|
||||
impl crate::etl::processor::Processor for CmcdProcessor {
|
||||
impl Processor for CmcdProcessor {
|
||||
fn kind(&self) -> &str {
|
||||
PROCESSOR_CMCD
|
||||
}
|
||||
@@ -226,51 +362,14 @@ impl crate::etl::processor::Processor for CmcdProcessor {
|
||||
self.ignore_missing
|
||||
}
|
||||
|
||||
fn fields(&self) -> &Fields {
|
||||
&self.fields
|
||||
}
|
||||
|
||||
fn fields_mut(&mut self) -> &mut Fields {
|
||||
&mut self.fields
|
||||
}
|
||||
|
||||
fn output_keys(&self) -> HashSet<String> {
|
||||
self.fields
|
||||
.iter()
|
||||
.map(|field| {
|
||||
field
|
||||
.target_field
|
||||
.clone()
|
||||
.unwrap_or_else(|| field.get_field_name().to_string())
|
||||
})
|
||||
.flat_map(|keys| {
|
||||
CMCD_KEYS
|
||||
.iter()
|
||||
.map(move |key| format!("{}_{}", keys, *key))
|
||||
})
|
||||
.collect()
|
||||
}
|
||||
|
||||
fn exec_field(&self, val: &Value, field: &Field) -> Result<Map, String> {
|
||||
match val {
|
||||
Value::String(val) => self.process_field(val, field),
|
||||
_ => Err(format!(
|
||||
"{} processor: expect string value, but got {val:?}",
|
||||
self.kind()
|
||||
)),
|
||||
}
|
||||
}
|
||||
|
||||
fn exec_mut(&self, val: &mut Vec<Value>) -> Result<(), String> {
|
||||
for field in self.fields.iter() {
|
||||
match val.get(field.input_field.index) {
|
||||
for (field_index, field) in self.fields.iter().enumerate() {
|
||||
let field_value_index = field.input_index();
|
||||
match val.get(field_value_index) {
|
||||
Some(Value::String(v)) => {
|
||||
// TODO(qtang): Let this method use the intermediate state collection directly.
|
||||
let map = self.process_field(v, field)?;
|
||||
for (k, v) in map.values.into_iter() {
|
||||
if let Some(index) = field.output_fields_index_mapping.get(&k) {
|
||||
val[*index] = v;
|
||||
}
|
||||
let result_list = self.parse(field_index, v)?;
|
||||
for (output_index, v) in result_list {
|
||||
val[output_index] = v;
|
||||
}
|
||||
}
|
||||
Some(Value::Null) | None => {
|
||||
@@ -278,7 +377,7 @@ impl crate::etl::processor::Processor for CmcdProcessor {
|
||||
return Err(format!(
|
||||
"{} processor: missing field: {}",
|
||||
self.kind(),
|
||||
field.get_field_name()
|
||||
field.input_name()
|
||||
));
|
||||
}
|
||||
}
|
||||
@@ -299,7 +398,8 @@ mod tests {
|
||||
use ahash::HashMap;
|
||||
use urlencoding::decode;
|
||||
|
||||
use super::CmcdProcessor;
|
||||
use super::{CmcdProcessorBuilder, CMCD_KEYS};
|
||||
use crate::etl::field::{Field, Fields};
|
||||
use crate::etl::value::{Map, Value};
|
||||
|
||||
#[test]
|
||||
@@ -329,6 +429,7 @@ mod tests {
|
||||
],
|
||||
),
|
||||
(
|
||||
// the `b` key is not resolved
|
||||
"b%2Crtp%3D15000%2Csid%3D%226e2fb550-c457-11e9-bb97-0800200c9a66%22",
|
||||
vec![
|
||||
(
|
||||
@@ -336,7 +437,6 @@ mod tests {
|
||||
Value::String("\"6e2fb550-c457-11e9-bb97-0800200c9a66\"".into()),
|
||||
),
|
||||
("prefix_rtp", Value::Int64(15000)),
|
||||
("b", Value::Boolean(true)),
|
||||
],
|
||||
),
|
||||
(
|
||||
@@ -347,16 +447,17 @@ mod tests {
|
||||
],
|
||||
),
|
||||
(
|
||||
// custom keys are not resolved
|
||||
"d%3D4004%2Ccom.example-myNumericKey%3D500%2Ccom.examplemyStringKey%3D%22myStringValue%22",
|
||||
vec![
|
||||
(
|
||||
"prefix_com.example-myNumericKey",
|
||||
Value::String("500".into()),
|
||||
),
|
||||
(
|
||||
"prefix_com.examplemyStringKey",
|
||||
Value::String("\"myStringValue\"".into()),
|
||||
),
|
||||
// (
|
||||
// "prefix_com.example-myNumericKey",
|
||||
// Value::String("500".into()),
|
||||
// ),
|
||||
// (
|
||||
// "prefix_com.examplemyStringKey",
|
||||
// Value::String("\"myStringValue\"".into()),
|
||||
// ),
|
||||
("prefix_d", Value::Int64(4004)),
|
||||
],
|
||||
),
|
||||
@@ -431,6 +532,24 @@ mod tests {
|
||||
),
|
||||
];
|
||||
|
||||
let field = Field::new("prefix", None);
|
||||
|
||||
let output_keys = CMCD_KEYS
|
||||
.iter()
|
||||
.map(|k| format!("prefix_{}", k))
|
||||
.collect::<Vec<String>>();
|
||||
|
||||
let mut intermediate_keys = vec!["prefix".to_string()];
|
||||
intermediate_keys.append(&mut (output_keys.clone()));
|
||||
|
||||
let builder = CmcdProcessorBuilder {
|
||||
fields: Fields::new(vec![field]),
|
||||
output_keys: output_keys.iter().map(|s| s.to_string()).collect(),
|
||||
ignore_missing: false,
|
||||
};
|
||||
|
||||
let processor = builder.build(&intermediate_keys).unwrap();
|
||||
|
||||
for (s, vec) in ss.into_iter() {
|
||||
let decoded = decode(s).unwrap().to_string();
|
||||
|
||||
@@ -440,7 +559,12 @@ mod tests {
|
||||
.collect::<HashMap<String, Value>>();
|
||||
let expected = Map { values };
|
||||
|
||||
let actual = CmcdProcessor::parse("prefix", &decoded).unwrap();
|
||||
let actual = processor.parse(0, &decoded).unwrap();
|
||||
let actual = actual
|
||||
.into_iter()
|
||||
.map(|(index, value)| (intermediate_keys[index].clone(), value))
|
||||
.collect::<HashMap<String, Value>>();
|
||||
let actual = Map { values: actual };
|
||||
assert_eq!(actual, expected);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -14,17 +14,18 @@
|
||||
|
||||
// Reference: https://www.elastic.co/guide/en/elasticsearch/reference/current/csv-processor.html
|
||||
|
||||
use ahash::{HashMap, HashSet};
|
||||
use ahash::HashSet;
|
||||
use csv::{ReaderBuilder, Trim};
|
||||
use itertools::EitherOrBoth::{Both, Left, Right};
|
||||
use itertools::Itertools;
|
||||
|
||||
use crate::etl::field::{Field, Fields};
|
||||
use crate::etl::field::{Fields, InputFieldInfo, OneInputMultiOutputField};
|
||||
use crate::etl::find_key_index;
|
||||
use crate::etl::processor::{
|
||||
yaml_bool, yaml_field, yaml_fields, yaml_string, Processor, FIELDS_NAME, FIELD_NAME,
|
||||
IGNORE_MISSING_NAME,
|
||||
yaml_bool, yaml_new_field, yaml_new_fields, yaml_string, Processor, ProcessorBuilder,
|
||||
ProcessorKind, FIELDS_NAME, FIELD_NAME, IGNORE_MISSING_NAME,
|
||||
};
|
||||
use crate::etl::value::{Map, Value};
|
||||
use crate::etl::value::Value;
|
||||
|
||||
pub(crate) const PROCESSOR_CSV: &str = "csv";
|
||||
|
||||
@@ -32,18 +33,78 @@ const SEPARATOR_NAME: &str = "separator";
|
||||
const QUOTE_NAME: &str = "quote";
|
||||
const TRIM_NAME: &str = "trim";
|
||||
const EMPTY_VALUE_NAME: &str = "empty_value";
|
||||
const TARGET_FIELDS: &str = "target_fields";
|
||||
|
||||
/// Builder for `CsvProcessor`; holds the parsed configuration until `build` resolves key indexes.
#[derive(Debug, Default)]
|
||||
pub struct CsvProcessorBuilder {
|
||||
// csv reader configuration, handed to the processor unchanged by `build`
reader: ReaderBuilder,
|
||||

|
||||
// input fields; `build` looks each one's name up in `intermediate_keys`
fields: Fields,
|
||||
ignore_missing: bool,
|
||||

|
||||
// Value used to fill empty fields, empty fields will be skipped if this is not provided.
// NOTE(review): `CsvProcessor::process` appears to fall back to `Value::Null` rather than skipping — confirm.
|
||||
empty_value: Option<String>,
|
||||
// names of the output keys; `build` turns them into indexes in the same order
target_fields: Vec<String>,
|
||||
// description
|
||||
// if
|
||||
// ignore_failure
|
||||
// on_failure
|
||||
// tag
|
||||
}
|
||||
|
||||
impl CsvProcessorBuilder {
|
||||
fn build(self, intermediate_keys: &[String]) -> Result<CsvProcessor, String> {
|
||||
let mut real_fields = vec![];
|
||||
|
||||
for field in self.fields {
|
||||
let input_index = find_key_index(intermediate_keys, field.input_field(), "csv")?;
|
||||
|
||||
let input_field_info = InputFieldInfo::new(field.input_field(), input_index);
|
||||
let real_field = OneInputMultiOutputField::new(input_field_info, None);
|
||||
real_fields.push(real_field);
|
||||
}
|
||||
|
||||
let output_index_info = self
|
||||
.target_fields
|
||||
.iter()
|
||||
.map(|f| find_key_index(intermediate_keys, f, "csv"))
|
||||
.collect::<Result<Vec<_>, String>>()?;
|
||||
Ok(CsvProcessor {
|
||||
reader: self.reader,
|
||||
fields: real_fields,
|
||||
ignore_missing: self.ignore_missing,
|
||||
empty_value: self.empty_value,
|
||||
output_index_info,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
impl ProcessorBuilder for CsvProcessorBuilder {
|
||||
fn output_keys(&self) -> HashSet<&str> {
|
||||
self.target_fields.iter().map(|s| s.as_str()).collect()
|
||||
}
|
||||
|
||||
fn input_keys(&self) -> HashSet<&str> {
|
||||
self.fields.iter().map(|f| f.input_field()).collect()
|
||||
}
|
||||
|
||||
fn build(self, intermediate_keys: &[String]) -> Result<ProcessorKind, String> {
|
||||
self.build(intermediate_keys).map(ProcessorKind::Csv)
|
||||
}
|
||||
}
|
||||
|
||||
/// only support string value
|
||||
#[derive(Debug)]
|
||||
pub struct CsvProcessor {
|
||||
reader: ReaderBuilder,
|
||||
|
||||
fields: Fields,
|
||||
fields: Vec<OneInputMultiOutputField>,
|
||||
|
||||
ignore_missing: bool,
|
||||
|
||||
// Value used to fill empty fields, empty fields will be skipped if this is not provided.
|
||||
empty_value: Option<String>,
|
||||
output_index_info: Vec<usize>,
|
||||
// description
|
||||
// if
|
||||
// ignore_failure
|
||||
@@ -52,81 +113,19 @@ pub struct CsvProcessor {
|
||||
}
|
||||
|
||||
impl CsvProcessor {
|
||||
fn new() -> Self {
|
||||
let mut reader = ReaderBuilder::new();
|
||||
reader.has_headers(false);
|
||||
|
||||
Self {
|
||||
reader,
|
||||
fields: Fields::default(),
|
||||
ignore_missing: false,
|
||||
empty_value: None,
|
||||
}
|
||||
}
|
||||
|
||||
fn with_fields(&mut self, fields: Fields) {
|
||||
self.fields = fields;
|
||||
}
|
||||
|
||||
fn try_separator(&mut self, separator: String) -> Result<(), String> {
|
||||
if separator.len() != 1 {
|
||||
Err(format!(
|
||||
"'{}' must be a single character, but got '{}'",
|
||||
SEPARATOR_NAME, separator
|
||||
))
|
||||
} else {
|
||||
self.reader.delimiter(separator.as_bytes()[0]);
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
fn try_quote(&mut self, quote: String) -> Result<(), String> {
|
||||
if quote.len() != 1 {
|
||||
Err(format!(
|
||||
"'{}' must be a single character, but got '{}'",
|
||||
QUOTE_NAME, quote
|
||||
))
|
||||
} else {
|
||||
self.reader.quote(quote.as_bytes()[0]);
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
fn with_trim(&mut self, trim: bool) {
|
||||
if trim {
|
||||
self.reader.trim(Trim::All);
|
||||
} else {
|
||||
self.reader.trim(Trim::None);
|
||||
}
|
||||
}
|
||||
|
||||
fn with_ignore_missing(&mut self, ignore_missing: bool) {
|
||||
self.ignore_missing = ignore_missing;
|
||||
}
|
||||
|
||||
fn with_empty_value(&mut self, empty_value: String) {
|
||||
self.empty_value = Some(empty_value);
|
||||
}
|
||||
|
||||
// process the csv format string to a map with target_fields as keys
|
||||
fn process_field(&self, val: &str, field: &Field) -> Result<Map, String> {
|
||||
fn process(&self, val: &str) -> Result<Vec<(usize, Value)>, String> {
|
||||
let mut reader = self.reader.from_reader(val.as_bytes());
|
||||
|
||||
if let Some(result) = reader.records().next() {
|
||||
let record: csv::StringRecord = result.map_err(|e| e.to_string())?;
|
||||
|
||||
let values: HashMap<String, Value> = field
|
||||
.target_fields
|
||||
.as_ref()
|
||||
.ok_or(format!(
|
||||
"target fields must be set after '{}'",
|
||||
field.get_field_name()
|
||||
))?
|
||||
let values: Vec<(usize, Value)> = self
|
||||
.output_index_info
|
||||
.iter()
|
||||
.map(|f| f.to_string())
|
||||
.zip_longest(record.iter())
|
||||
.filter_map(|zipped| match zipped {
|
||||
Both(target_field, val) => Some((target_field, Value::String(val.into()))),
|
||||
Both(target_field, val) => Some((*target_field, Value::String(val.into()))),
|
||||
// if target fields are more than extracted fields, fill the rest with empty value
|
||||
Left(target_field) => {
|
||||
let value = self
|
||||
@@ -134,69 +133,101 @@ impl CsvProcessor {
|
||||
.as_ref()
|
||||
.map(|s| Value::String(s.clone()))
|
||||
.unwrap_or(Value::Null);
|
||||
Some((target_field, value))
|
||||
Some((*target_field, value))
|
||||
}
|
||||
// if extracted fields are more than target fields, ignore the rest
|
||||
Right(_) => None,
|
||||
})
|
||||
.collect();
|
||||
|
||||
Ok(Map { values })
|
||||
Ok(values)
|
||||
} else {
|
||||
Err("expected at least one record from csv format, but got none".into())
|
||||
}
|
||||
}
|
||||
|
||||
fn update_output_keys(&mut self) {
|
||||
self.fields.iter_mut().for_each(|f| {
|
||||
if let Some(tfs) = f.target_fields.as_ref() {
|
||||
tfs.iter().for_each(|tf| {
|
||||
if !tf.is_empty() {
|
||||
f.output_fields_index_mapping.insert(tf.to_string(), 0);
|
||||
}
|
||||
});
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
impl TryFrom<&yaml_rust::yaml::Hash> for CsvProcessor {
|
||||
impl TryFrom<&yaml_rust::yaml::Hash> for CsvProcessorBuilder {
|
||||
type Error = String;
|
||||
|
||||
fn try_from(hash: &yaml_rust::yaml::Hash) -> Result<Self, Self::Error> {
|
||||
let mut processor = CsvProcessor::new();
|
||||
let mut reader = ReaderBuilder::new();
|
||||
reader.has_headers(false);
|
||||
|
||||
let mut fields = Fields::default();
|
||||
let mut ignore_missing = false;
|
||||
let mut empty_value = None;
|
||||
let mut target_fields = vec![];
|
||||
|
||||
for (k, v) in hash {
|
||||
let key = k
|
||||
.as_str()
|
||||
.ok_or(format!("key must be a string, but got {k:?}"))?;
|
||||
match key {
|
||||
FIELD_NAME => {
|
||||
processor.with_fields(Fields::one(yaml_field(v, FIELD_NAME)?));
|
||||
fields = Fields::one(yaml_new_field(v, FIELD_NAME)?);
|
||||
}
|
||||
FIELDS_NAME => {
|
||||
processor.with_fields(yaml_fields(v, FIELDS_NAME)?);
|
||||
fields = yaml_new_fields(v, FIELDS_NAME)?;
|
||||
}
|
||||
TARGET_FIELDS => {
|
||||
target_fields = yaml_string(v, TARGET_FIELDS)?
|
||||
.split(',')
|
||||
.map(|s| s.trim().to_string())
|
||||
.filter(|s| !s.is_empty())
|
||||
.collect();
|
||||
}
|
||||
SEPARATOR_NAME => {
|
||||
processor.try_separator(yaml_string(v, SEPARATOR_NAME)?)?;
|
||||
let separator = yaml_string(v, SEPARATOR_NAME)?;
|
||||
if separator.len() != 1 {
|
||||
return Err(format!(
|
||||
"'{}' must be a single character, but got '{}'",
|
||||
SEPARATOR_NAME, separator
|
||||
));
|
||||
} else {
|
||||
reader.delimiter(separator.as_bytes()[0]);
|
||||
}
|
||||
}
|
||||
QUOTE_NAME => {
|
||||
processor.try_quote(yaml_string(v, QUOTE_NAME)?)?;
|
||||
let quote = yaml_string(v, QUOTE_NAME)?;
|
||||
if quote.len() != 1 {
|
||||
return Err(format!(
|
||||
"'{}' must be a single character, but got '{}'",
|
||||
QUOTE_NAME, quote
|
||||
));
|
||||
} else {
|
||||
reader.quote(quote.as_bytes()[0]);
|
||||
}
|
||||
}
|
||||
TRIM_NAME => {
|
||||
processor.with_trim(yaml_bool(v, TRIM_NAME)?);
|
||||
let trim = yaml_bool(v, TRIM_NAME)?;
|
||||
if trim {
|
||||
reader.trim(Trim::All);
|
||||
} else {
|
||||
reader.trim(Trim::None);
|
||||
}
|
||||
}
|
||||
IGNORE_MISSING_NAME => {
|
||||
processor.with_ignore_missing(yaml_bool(v, IGNORE_MISSING_NAME)?);
|
||||
ignore_missing = yaml_bool(v, IGNORE_MISSING_NAME)?;
|
||||
}
|
||||
EMPTY_VALUE_NAME => {
|
||||
processor.with_empty_value(yaml_string(v, EMPTY_VALUE_NAME)?);
|
||||
empty_value = Some(yaml_string(v, EMPTY_VALUE_NAME)?);
|
||||
}
|
||||
|
||||
_ => {}
|
||||
}
|
||||
}
|
||||
processor.update_output_keys();
|
||||
Ok(processor)
|
||||
let builder = {
|
||||
CsvProcessorBuilder {
|
||||
reader,
|
||||
fields,
|
||||
ignore_missing,
|
||||
empty_value,
|
||||
target_fields,
|
||||
}
|
||||
};
|
||||
|
||||
Ok(builder)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -209,41 +240,14 @@ impl Processor for CsvProcessor {
|
||||
self.ignore_missing
|
||||
}
|
||||
|
||||
fn fields(&self) -> &Fields {
|
||||
&self.fields
|
||||
}
|
||||
|
||||
fn fields_mut(&mut self) -> &mut Fields {
|
||||
&mut self.fields
|
||||
}
|
||||
|
||||
fn output_keys(&self) -> HashSet<String> {
|
||||
self.fields
|
||||
.iter()
|
||||
.flat_map(|f| f.target_fields.clone().unwrap_or_default())
|
||||
.collect()
|
||||
}
|
||||
|
||||
fn exec_field(&self, val: &Value, field: &Field) -> Result<Map, String> {
|
||||
match val {
|
||||
Value::String(val) => self.process_field(val, field),
|
||||
_ => Err(format!(
|
||||
"{} processor: expect string value, but got {val:?}",
|
||||
self.kind()
|
||||
)),
|
||||
}
|
||||
}
|
||||
|
||||
fn exec_mut(&self, val: &mut Vec<Value>) -> Result<(), String> {
|
||||
for field in self.fields.iter() {
|
||||
match val.get(field.input_field.index) {
|
||||
let index = field.input_index();
|
||||
match val.get(index) {
|
||||
Some(Value::String(v)) => {
|
||||
// TODO(qtang): Let this method use the intermediate state collection directly.
|
||||
let map = self.process_field(v, field)?;
|
||||
for (k, v) in map.values.into_iter() {
|
||||
if let Some(index) = field.output_fields_index_mapping.get(&k) {
|
||||
val[*index] = v;
|
||||
}
|
||||
let resule_list = self.process(v)?;
|
||||
for (k, v) in resule_list {
|
||||
val[k] = v;
|
||||
}
|
||||
}
|
||||
Some(Value::Null) | None => {
|
||||
@@ -251,7 +255,7 @@ impl Processor for CsvProcessor {
|
||||
return Err(format!(
|
||||
"{} processor: missing field: {}",
|
||||
self.kind(),
|
||||
field.get_field_name()
|
||||
field.input_name()
|
||||
));
|
||||
}
|
||||
}
|
||||
@@ -267,116 +271,140 @@ impl Processor for CsvProcessor {
|
||||
}
|
||||
}
|
||||
|
||||
// TODO(yuanbohan): more test cases
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
|
||||
use ahash::HashMap;
|
||||
|
||||
use super::{CsvProcessor, Value};
|
||||
use crate::etl::field::Fields;
|
||||
use crate::etl::processor::Processor;
|
||||
use crate::etl::value::Map;
|
||||
use super::Value;
|
||||
use crate::etl::processor::csv::CsvProcessorBuilder;
|
||||
|
||||
#[test]
|
||||
fn test_equal_length() {
|
||||
let mut processor = CsvProcessor::new();
|
||||
let field = "data,, a, b".parse().unwrap();
|
||||
processor.with_fields(Fields::one(field));
|
||||
let mut reader = csv::ReaderBuilder::new();
|
||||
reader.has_headers(false);
|
||||
let builder = CsvProcessorBuilder {
|
||||
reader,
|
||||
target_fields: vec!["a".into(), "b".into()],
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
let values: HashMap<String, Value> = [("data".into(), Value::String("1,2".into()))]
|
||||
let intermediate_keys = vec!["data".into(), "a".into(), "b".into()];
|
||||
|
||||
let processor = builder.build(&intermediate_keys).unwrap();
|
||||
let result = processor
|
||||
.process("1,2")
|
||||
.unwrap()
|
||||
.into_iter()
|
||||
.collect();
|
||||
let mut m = Map { values };
|
||||
|
||||
processor.exec_map(&mut m).unwrap();
|
||||
.map(|(k, v)| (intermediate_keys[k].clone(), v))
|
||||
.collect::<HashMap<_, _>>();
|
||||
|
||||
let values = [
|
||||
("data".into(), Value::String("1,2".into())),
|
||||
("a".into(), Value::String("1".into())),
|
||||
("b".into(), Value::String("2".into())),
|
||||
]
|
||||
.into_iter()
|
||||
.collect();
|
||||
let expected = Map { values };
|
||||
.collect::<HashMap<_, _>>();
|
||||
|
||||
assert_eq!(expected, m);
|
||||
assert_eq!(result, values);
|
||||
}
|
||||
|
||||
// test target_fields length larger than the record length
|
||||
#[test]
|
||||
fn test_target_fields_has_more_length() {
|
||||
let values = [("data".into(), Value::String("1,2".into()))]
|
||||
.into_iter()
|
||||
.collect();
|
||||
let mut input = Map { values };
|
||||
|
||||
// with no empty value
|
||||
{
|
||||
let mut processor = CsvProcessor::new();
|
||||
let field = "data,, a,b,c".parse().unwrap();
|
||||
processor.with_fields(Fields::one(field));
|
||||
let mut reader = csv::ReaderBuilder::new();
|
||||
reader.has_headers(false);
|
||||
let builder = CsvProcessorBuilder {
|
||||
reader,
|
||||
target_fields: vec!["a".into(), "b".into(), "c".into()],
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
processor.exec_map(&mut input).unwrap();
|
||||
let intermediate_keys = vec!["data".into(), "a".into(), "b".into(), "c".into()];
|
||||
|
||||
let processor = builder.build(&intermediate_keys).unwrap();
|
||||
let result = processor
|
||||
.process("1,2")
|
||||
.unwrap()
|
||||
.into_iter()
|
||||
.map(|(k, v)| (intermediate_keys[k].clone(), v))
|
||||
.collect::<HashMap<_, _>>();
|
||||
|
||||
let values = [
|
||||
("data".into(), Value::String("1,2".into())),
|
||||
("a".into(), Value::String("1".into())),
|
||||
("b".into(), Value::String("2".into())),
|
||||
("c".into(), Value::Null),
|
||||
]
|
||||
.into_iter()
|
||||
.collect();
|
||||
let expected = Map { values };
|
||||
.collect::<HashMap<_, _>>();
|
||||
|
||||
assert_eq!(expected, input);
|
||||
assert_eq!(result, values);
|
||||
}
|
||||
|
||||
// with empty value
|
||||
{
|
||||
let mut processor = CsvProcessor::new();
|
||||
let field = "data,, a,b,c".parse().unwrap();
|
||||
processor.with_fields(Fields::one(field));
|
||||
processor.with_empty_value("default".into());
|
||||
let mut reader = csv::ReaderBuilder::new();
|
||||
reader.has_headers(false);
|
||||
let builder = CsvProcessorBuilder {
|
||||
reader,
|
||||
target_fields: vec!["a".into(), "b".into(), "c".into()],
|
||||
empty_value: Some("default".into()),
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
processor.exec_map(&mut input).unwrap();
|
||||
let intermediate_keys = vec!["data".into(), "a".into(), "b".into(), "c".into()];
|
||||
|
||||
let processor = builder.build(&intermediate_keys).unwrap();
|
||||
let result = processor
|
||||
.process("1,2")
|
||||
.unwrap()
|
||||
.into_iter()
|
||||
.map(|(k, v)| (intermediate_keys[k].clone(), v))
|
||||
.collect::<HashMap<_, _>>();
|
||||
|
||||
let values = [
|
||||
("data".into(), Value::String("1,2".into())),
|
||||
("a".into(), Value::String("1".into())),
|
||||
("b".into(), Value::String("2".into())),
|
||||
("c".into(), Value::String("default".into())),
|
||||
]
|
||||
.into_iter()
|
||||
.collect();
|
||||
let expected = Map { values };
|
||||
|
||||
assert_eq!(expected, input);
|
||||
assert_eq!(result, values);
|
||||
}
|
||||
}
|
||||
|
||||
// test record has larger length
|
||||
#[test]
|
||||
fn test_target_fields_has_less_length() {
|
||||
let values = [("data".into(), Value::String("1,2,3".into()))]
|
||||
let mut reader = csv::ReaderBuilder::new();
|
||||
reader.has_headers(false);
|
||||
let builder = CsvProcessorBuilder {
|
||||
reader,
|
||||
target_fields: vec!["a".into(), "b".into()],
|
||||
empty_value: Some("default".into()),
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
let intermediate_keys = vec!["data".into(), "a".into(), "b".into()];
|
||||
|
||||
let processor = builder.build(&intermediate_keys).unwrap();
|
||||
let result = processor
|
||||
.process("1,2")
|
||||
.unwrap()
|
||||
.into_iter()
|
||||
.collect();
|
||||
let mut input = Map { values };
|
||||
|
||||
let mut processor = CsvProcessor::new();
|
||||
let field = "data,,a,b".parse().unwrap();
|
||||
processor.with_fields(Fields::one(field));
|
||||
|
||||
processor.exec_map(&mut input).unwrap();
|
||||
.map(|(k, v)| (intermediate_keys[k].clone(), v))
|
||||
.collect::<HashMap<_, _>>();
|
||||
|
||||
let values = [
|
||||
("data".into(), Value::String("1,2,3".into())),
|
||||
("a".into(), Value::String("1".into())),
|
||||
("b".into(), Value::String("2".into())),
|
||||
]
|
||||
.into_iter()
|
||||
.collect();
|
||||
let expected = Map { values };
|
||||
|
||||
assert_eq!(expected, input);
|
||||
assert_eq!(result, values);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -19,12 +19,12 @@ use chrono::{DateTime, NaiveDateTime};
|
||||
use chrono_tz::Tz;
|
||||
use lazy_static::lazy_static;
|
||||
|
||||
use crate::etl::field::{Field, Fields};
|
||||
use crate::etl::field::{Fields, OneInputOneOutputField};
|
||||
use crate::etl::processor::{
|
||||
update_one_one_output_keys, yaml_bool, yaml_field, yaml_fields, yaml_string, yaml_strings,
|
||||
Processor, FIELDS_NAME, FIELD_NAME, IGNORE_MISSING_NAME,
|
||||
yaml_bool, yaml_new_field, yaml_new_fields, yaml_string, yaml_strings, Processor,
|
||||
ProcessorBuilder, ProcessorKind, FIELDS_NAME, FIELD_NAME, IGNORE_MISSING_NAME,
|
||||
};
|
||||
use crate::etl::value::{Map, Timestamp, Value};
|
||||
use crate::etl::value::{Timestamp, Value};
|
||||
|
||||
pub(crate) const PROCESSOR_DATE: &str = "date";
|
||||
|
||||
@@ -57,9 +57,15 @@ lazy_static! {
|
||||
.collect();
|
||||
}
|
||||
|
||||
#[derive(Debug, Default)]
|
||||
#[derive(Debug)]
|
||||
struct Formats(Vec<Arc<String>>);
|
||||
|
||||
impl Default for Formats {
|
||||
fn default() -> Self {
|
||||
Formats(DEFAULT_FORMATS.clone())
|
||||
}
|
||||
}
|
||||
|
||||
impl Formats {
|
||||
fn new(mut formats: Vec<Arc<String>>) -> Self {
|
||||
formats.sort();
|
||||
@@ -76,16 +82,119 @@ impl std::ops::Deref for Formats {
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Default)]
|
||||
pub struct DateProcessorBuilder {
|
||||
fields: Fields,
|
||||
formats: Formats,
|
||||
timezone: Option<Arc<String>>,
|
||||
locale: Option<Arc<String>>,
|
||||
ignore_missing: bool,
|
||||
}
|
||||
|
||||
impl ProcessorBuilder for DateProcessorBuilder {
|
||||
fn output_keys(&self) -> HashSet<&str> {
|
||||
self.fields
|
||||
.iter()
|
||||
.map(|f| f.target_or_input_field())
|
||||
.collect()
|
||||
}
|
||||
|
||||
fn input_keys(&self) -> HashSet<&str> {
|
||||
self.fields.iter().map(|f| f.input_field()).collect()
|
||||
}
|
||||
|
||||
fn build(self, intermediate_keys: &[String]) -> Result<ProcessorKind, String> {
|
||||
self.build(intermediate_keys).map(ProcessorKind::Date)
|
||||
}
|
||||
}
|
||||
|
||||
impl DateProcessorBuilder {
|
||||
pub fn build(self, intermediate_keys: &[String]) -> Result<DateProcessor, String> {
|
||||
let mut real_fields = vec![];
|
||||
for field in self.fields.into_iter() {
|
||||
let input = OneInputOneOutputField::build(
|
||||
"date",
|
||||
intermediate_keys,
|
||||
field.input_field(),
|
||||
field.target_or_input_field(),
|
||||
)?;
|
||||
real_fields.push(input);
|
||||
}
|
||||
Ok(DateProcessor {
|
||||
fields: real_fields,
|
||||
formats: self.formats,
|
||||
timezone: self.timezone,
|
||||
locale: self.locale,
|
||||
ignore_missing: self.ignore_missing,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
impl TryFrom<&yaml_rust::yaml::Hash> for DateProcessorBuilder {
|
||||
type Error = String;
|
||||
|
||||
fn try_from(hash: &yaml_rust::yaml::Hash) -> Result<Self, Self::Error> {
|
||||
let mut fields = Fields::default();
|
||||
let mut formats = Formats::default();
|
||||
let mut timezone = None;
|
||||
let mut locale = None;
|
||||
let mut ignore_missing = false;
|
||||
|
||||
for (k, v) in hash {
|
||||
let key = k
|
||||
.as_str()
|
||||
.ok_or(format!("key must be a string, but got {k:?}"))?;
|
||||
|
||||
match key {
|
||||
FIELD_NAME => {
|
||||
fields = Fields::one(yaml_new_field(v, FIELD_NAME)?);
|
||||
}
|
||||
FIELDS_NAME => {
|
||||
fields = yaml_new_fields(v, FIELDS_NAME)?;
|
||||
}
|
||||
|
||||
FORMATS_NAME => {
|
||||
let format_strs = yaml_strings(v, FORMATS_NAME)?;
|
||||
if format_strs.is_empty() {
|
||||
formats = Formats::new(DEFAULT_FORMATS.clone());
|
||||
} else {
|
||||
formats = Formats::new(format_strs.into_iter().map(Arc::new).collect());
|
||||
}
|
||||
}
|
||||
TIMEZONE_NAME => {
|
||||
timezone = Some(Arc::new(yaml_string(v, TIMEZONE_NAME)?));
|
||||
}
|
||||
LOCALE_NAME => {
|
||||
locale = Some(Arc::new(yaml_string(v, LOCALE_NAME)?));
|
||||
}
|
||||
IGNORE_MISSING_NAME => {
|
||||
ignore_missing = yaml_bool(v, IGNORE_MISSING_NAME)?;
|
||||
}
|
||||
|
||||
_ => {}
|
||||
}
|
||||
}
|
||||
|
||||
let builder = DateProcessorBuilder {
|
||||
fields,
|
||||
formats,
|
||||
timezone,
|
||||
locale,
|
||||
ignore_missing,
|
||||
};
|
||||
|
||||
Ok(builder)
|
||||
}
|
||||
}
|
||||
|
||||
/// deprecated it should be removed in the future
|
||||
/// Reserved for compatibility only
|
||||
#[derive(Debug, Default)]
|
||||
pub struct DateProcessor {
|
||||
fields: Fields,
|
||||
|
||||
fields: Vec<OneInputOneOutputField>,
|
||||
formats: Formats,
|
||||
timezone: Option<Arc<String>>,
|
||||
locale: Option<Arc<String>>, // to support locale
|
||||
output_format: Option<Arc<String>>,
|
||||
|
||||
ignore_missing: bool,
|
||||
// description
|
||||
@@ -96,43 +205,6 @@ pub struct DateProcessor {
|
||||
}
|
||||
|
||||
impl DateProcessor {
|
||||
fn with_fields(&mut self, mut fields: Fields) {
|
||||
update_one_one_output_keys(&mut fields);
|
||||
self.fields = fields
|
||||
}
|
||||
|
||||
fn with_formats(&mut self, v: Option<Vec<Arc<String>>>) {
|
||||
let v = match v {
|
||||
Some(v) if !v.is_empty() => v,
|
||||
_ => DEFAULT_FORMATS.clone(),
|
||||
};
|
||||
|
||||
let formats = Formats::new(v);
|
||||
self.formats = formats;
|
||||
}
|
||||
|
||||
fn with_timezone(&mut self, timezone: String) {
|
||||
if !timezone.is_empty() {
|
||||
self.timezone = Some(Arc::new(timezone));
|
||||
}
|
||||
}
|
||||
|
||||
fn with_locale(&mut self, locale: String) {
|
||||
if !locale.is_empty() {
|
||||
self.locale = Some(Arc::new(locale));
|
||||
}
|
||||
}
|
||||
|
||||
fn with_output_format(&mut self, output_format: String) {
|
||||
if !output_format.is_empty() {
|
||||
self.output_format = Some(Arc::new(output_format));
|
||||
}
|
||||
}
|
||||
|
||||
fn with_ignore_missing(&mut self, ignore_missing: bool) {
|
||||
self.ignore_missing = ignore_missing;
|
||||
}
|
||||
|
||||
fn parse(&self, val: &str) -> Result<Timestamp, String> {
|
||||
let mut tz = Tz::UTC;
|
||||
if let Some(timezone) = &self.timezone {
|
||||
@@ -147,61 +219,6 @@ impl DateProcessor {
|
||||
|
||||
Err(format!("{} processor: failed to parse {val}", self.kind(),))
|
||||
}
|
||||
|
||||
fn process_field(&self, val: &str, field: &Field) -> Result<Map, String> {
|
||||
let key = field.get_target_field();
|
||||
|
||||
Ok(Map::one(key, Value::Timestamp(self.parse(val)?)))
|
||||
}
|
||||
}
|
||||
|
||||
impl TryFrom<&yaml_rust::yaml::Hash> for DateProcessor {
|
||||
type Error = String;
|
||||
|
||||
fn try_from(hash: &yaml_rust::yaml::Hash) -> Result<Self, Self::Error> {
|
||||
let mut processor = DateProcessor::default();
|
||||
|
||||
let mut formats_opt = None;
|
||||
|
||||
for (k, v) in hash {
|
||||
let key = k
|
||||
.as_str()
|
||||
.ok_or(format!("key must be a string, but got {k:?}"))?;
|
||||
|
||||
match key {
|
||||
FIELD_NAME => {
|
||||
processor.with_fields(Fields::one(yaml_field(v, FIELD_NAME)?));
|
||||
}
|
||||
FIELDS_NAME => {
|
||||
processor.with_fields(yaml_fields(v, FIELDS_NAME)?);
|
||||
}
|
||||
|
||||
FORMATS_NAME => {
|
||||
let formats = yaml_strings(v, FORMATS_NAME)?;
|
||||
formats_opt = Some(formats.into_iter().map(Arc::new).collect());
|
||||
}
|
||||
TIMEZONE_NAME => {
|
||||
processor.with_timezone(yaml_string(v, TIMEZONE_NAME)?);
|
||||
}
|
||||
LOCALE_NAME => {
|
||||
processor.with_locale(yaml_string(v, LOCALE_NAME)?);
|
||||
}
|
||||
OUTPUT_FORMAT_NAME => {
|
||||
processor.with_output_format(yaml_string(v, OUTPUT_FORMAT_NAME)?);
|
||||
}
|
||||
|
||||
IGNORE_MISSING_NAME => {
|
||||
processor.with_ignore_missing(yaml_bool(v, IGNORE_MISSING_NAME)?);
|
||||
}
|
||||
|
||||
_ => {}
|
||||
}
|
||||
}
|
||||
|
||||
processor.with_formats(formats_opt);
|
||||
|
||||
Ok(processor)
|
||||
}
|
||||
}
|
||||
|
||||
impl Processor for DateProcessor {
|
||||
@@ -213,53 +230,21 @@ impl Processor for DateProcessor {
|
||||
self.ignore_missing
|
||||
}
|
||||
|
||||
fn fields(&self) -> &Fields {
|
||||
&self.fields
|
||||
}
|
||||
|
||||
fn fields_mut(&mut self) -> &mut Fields {
|
||||
&mut self.fields
|
||||
}
|
||||
|
||||
fn output_keys(&self) -> HashSet<String> {
|
||||
self.fields
|
||||
.iter()
|
||||
.map(|f| f.get_target_field().to_string())
|
||||
.collect()
|
||||
}
|
||||
|
||||
fn exec_field(&self, val: &Value, field: &Field) -> Result<Map, String> {
|
||||
match val {
|
||||
Value::String(s) => self.process_field(s, field),
|
||||
_ => Err(format!(
|
||||
"{} processor: expect string value, but got {val:?}",
|
||||
self.kind()
|
||||
)),
|
||||
}
|
||||
}
|
||||
|
||||
fn exec_mut(&self, val: &mut Vec<Value>) -> Result<(), String> {
|
||||
for field in self.fields().iter() {
|
||||
let index = field.input_field.index;
|
||||
for field in self.fields.iter() {
|
||||
let index = field.input_index();
|
||||
match val.get(index) {
|
||||
Some(Value::String(s)) => {
|
||||
// TODO(qtang): Let this method use the intermediate state collection directly.
|
||||
let mut map = self.process_field(s, field)?;
|
||||
field
|
||||
.output_fields_index_mapping
|
||||
.iter()
|
||||
.for_each(|(k, output_index)| {
|
||||
if let Some(v) = map.remove(k) {
|
||||
val[*output_index] = v;
|
||||
}
|
||||
});
|
||||
let timestamp = self.parse(s)?;
|
||||
let output_index = field.output_index();
|
||||
val[output_index] = Value::Timestamp(timestamp);
|
||||
}
|
||||
Some(Value::Null) | None => {
|
||||
if !self.ignore_missing {
|
||||
return Err(format!(
|
||||
"{} processor: missing field: {}",
|
||||
self.kind(),
|
||||
field.get_field_name()
|
||||
field.input_name()
|
||||
));
|
||||
}
|
||||
}
|
||||
@@ -318,8 +303,7 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
fn test_parse() {
|
||||
let mut processor = DateProcessor::default();
|
||||
processor.with_formats(None);
|
||||
let processor = DateProcessor::default();
|
||||
|
||||
let values: Vec<&str> = vec![
|
||||
"2014-5-17T12:34:56",
|
||||
@@ -340,7 +324,6 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
fn test_parse_with_formats() {
|
||||
let mut processor = DateProcessor::default();
|
||||
let formats = vec![
|
||||
"%Y-%m-%dT%H:%M:%S%:z",
|
||||
"%Y-%m-%dT%H:%M:%S%.3f%:z",
|
||||
@@ -349,8 +332,11 @@ mod tests {
|
||||
]
|
||||
.into_iter()
|
||||
.map(|s| Arc::new(s.to_string()))
|
||||
.collect();
|
||||
processor.with_formats(Some(formats));
|
||||
.collect::<Vec<_>>();
|
||||
let processor = DateProcessor {
|
||||
formats: super::Formats(formats),
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
let values: Vec<&str> = vec![
|
||||
"2014-5-17T12:34:56",
|
||||
@@ -371,9 +357,10 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
fn test_parse_with_timezone() {
|
||||
let mut processor = DateProcessor::default();
|
||||
processor.with_formats(None);
|
||||
processor.with_timezone("Asia/Tokyo".to_string());
|
||||
let processor = DateProcessor {
|
||||
timezone: Some(Arc::new("Asia/Tokyo".to_string())),
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
let values: Vec<&str> = vec![
|
||||
"2014-5-17T12:34:56",
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -14,17 +14,17 @@
|
||||
|
||||
use ahash::HashSet;
|
||||
|
||||
use crate::etl::field::{Field, Fields};
|
||||
use crate::etl::field::{Fields, OneInputOneOutputField};
|
||||
use crate::etl::processor::{
|
||||
update_one_one_output_keys, yaml_bool, yaml_field, yaml_fields, yaml_string, Processor,
|
||||
FIELDS_NAME, FIELD_NAME, IGNORE_MISSING_NAME,
|
||||
yaml_bool, yaml_new_field, yaml_new_fields, yaml_string, Processor, ProcessorBuilder,
|
||||
ProcessorKind, FIELDS_NAME, FIELD_NAME, IGNORE_MISSING_NAME,
|
||||
};
|
||||
use crate::etl::value::time::{
|
||||
MICROSECOND_RESOLUTION, MICRO_RESOLUTION, MILLISECOND_RESOLUTION, MILLI_RESOLUTION,
|
||||
MS_RESOLUTION, NANOSECOND_RESOLUTION, NANO_RESOLUTION, NS_RESOLUTION, SECOND_RESOLUTION,
|
||||
SEC_RESOLUTION, S_RESOLUTION, US_RESOLUTION,
|
||||
};
|
||||
use crate::etl::value::{Map, Timestamp, Value};
|
||||
use crate::etl::value::{Timestamp, Value};
|
||||
|
||||
pub(crate) const PROCESSOR_EPOCH: &str = "epoch";
|
||||
const RESOLUTION_NAME: &str = "resolution";
|
||||
@@ -52,12 +52,56 @@ impl TryFrom<&str> for Resolution {
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Default)]
|
||||
pub struct EpochProcessorBuilder {
|
||||
fields: Fields,
|
||||
resolution: Resolution,
|
||||
ignore_missing: bool,
|
||||
}
|
||||
|
||||
impl ProcessorBuilder for EpochProcessorBuilder {
|
||||
fn output_keys(&self) -> HashSet<&str> {
|
||||
self.fields
|
||||
.iter()
|
||||
.map(|f| f.target_or_input_field())
|
||||
.collect()
|
||||
}
|
||||
|
||||
fn input_keys(&self) -> HashSet<&str> {
|
||||
self.fields.iter().map(|f| f.input_field()).collect()
|
||||
}
|
||||
|
||||
fn build(self, intermediate_keys: &[String]) -> Result<ProcessorKind, String> {
|
||||
self.build(intermediate_keys).map(ProcessorKind::Epoch)
|
||||
}
|
||||
}
|
||||
|
||||
impl EpochProcessorBuilder {
|
||||
pub fn build(self, intermediate_keys: &[String]) -> Result<EpochProcessor, String> {
|
||||
let mut real_fields = vec![];
|
||||
for field in self.fields.into_iter() {
|
||||
let input = OneInputOneOutputField::build(
|
||||
"epoch",
|
||||
intermediate_keys,
|
||||
field.input_field(),
|
||||
field.target_or_input_field(),
|
||||
)?;
|
||||
real_fields.push(input);
|
||||
}
|
||||
Ok(EpochProcessor {
|
||||
fields: real_fields,
|
||||
resolution: self.resolution,
|
||||
ignore_missing: self.ignore_missing,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
/// support string, integer, float, time, epoch
|
||||
/// deprecated it should be removed in the future
|
||||
/// Reserved for compatibility only
|
||||
#[derive(Debug, Default)]
|
||||
pub struct EpochProcessor {
|
||||
fields: Fields,
|
||||
fields: Vec<OneInputOneOutputField>,
|
||||
resolution: Resolution,
|
||||
ignore_missing: bool,
|
||||
// description
|
||||
@@ -68,19 +112,6 @@ pub struct EpochProcessor {
|
||||
}
|
||||
|
||||
impl EpochProcessor {
|
||||
fn with_fields(&mut self, mut fields: Fields) {
|
||||
update_one_one_output_keys(&mut fields);
|
||||
self.fields = fields
|
||||
}
|
||||
|
||||
fn with_resolution(&mut self, resolution: Resolution) {
|
||||
self.resolution = resolution;
|
||||
}
|
||||
|
||||
fn with_ignore_missing(&mut self, ignore_missing: bool) {
|
||||
self.ignore_missing = ignore_missing;
|
||||
}
|
||||
|
||||
fn parse(&self, val: &Value) -> Result<Timestamp, String> {
|
||||
let t: i64 = match val {
|
||||
Value::String(s) => s
|
||||
@@ -117,19 +148,15 @@ impl EpochProcessor {
|
||||
Resolution::Nano => Ok(Timestamp::Nanosecond(t)),
|
||||
}
|
||||
}
|
||||
|
||||
fn process_field(&self, val: &Value, field: &Field) -> Result<Map, String> {
|
||||
let key = field.get_target_field();
|
||||
|
||||
Ok(Map::one(key, Value::Timestamp(self.parse(val)?)))
|
||||
}
|
||||
}
|
||||
|
||||
impl TryFrom<&yaml_rust::yaml::Hash> for EpochProcessor {
|
||||
impl TryFrom<&yaml_rust::yaml::Hash> for EpochProcessorBuilder {
|
||||
type Error = String;
|
||||
|
||||
fn try_from(hash: &yaml_rust::yaml::Hash) -> Result<Self, Self::Error> {
|
||||
let mut processor = EpochProcessor::default();
|
||||
let mut fields = Fields::default();
|
||||
let mut resolution = Resolution::default();
|
||||
let mut ignore_missing = false;
|
||||
|
||||
for (k, v) in hash {
|
||||
let key = k
|
||||
@@ -138,24 +165,29 @@ impl TryFrom<&yaml_rust::yaml::Hash> for EpochProcessor {
|
||||
|
||||
match key {
|
||||
FIELD_NAME => {
|
||||
processor.with_fields(Fields::one(yaml_field(v, FIELD_NAME)?));
|
||||
fields = Fields::one(yaml_new_field(v, FIELD_NAME)?);
|
||||
}
|
||||
FIELDS_NAME => {
|
||||
processor.with_fields(yaml_fields(v, FIELDS_NAME)?);
|
||||
fields = yaml_new_fields(v, FIELDS_NAME)?;
|
||||
}
|
||||
RESOLUTION_NAME => {
|
||||
let s = yaml_string(v, RESOLUTION_NAME)?.as_str().try_into()?;
|
||||
processor.with_resolution(s);
|
||||
resolution = s;
|
||||
}
|
||||
IGNORE_MISSING_NAME => {
|
||||
processor.with_ignore_missing(yaml_bool(v, IGNORE_MISSING_NAME)?);
|
||||
ignore_missing = yaml_bool(v, IGNORE_MISSING_NAME)?;
|
||||
}
|
||||
|
||||
_ => {}
|
||||
}
|
||||
}
|
||||
let builder = EpochProcessorBuilder {
|
||||
fields,
|
||||
resolution,
|
||||
ignore_missing,
|
||||
};
|
||||
|
||||
Ok(processor)
|
||||
Ok(builder)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -168,49 +200,23 @@ impl Processor for EpochProcessor {
|
||||
self.ignore_missing
|
||||
}
|
||||
|
||||
fn fields(&self) -> &Fields {
|
||||
&self.fields
|
||||
}
|
||||
|
||||
fn fields_mut(&mut self) -> &mut Fields {
|
||||
&mut self.fields
|
||||
}
|
||||
|
||||
fn output_keys(&self) -> HashSet<String> {
|
||||
self.fields
|
||||
.iter()
|
||||
.map(|f| f.get_target_field().to_string())
|
||||
.collect()
|
||||
}
|
||||
|
||||
fn exec_field(&self, val: &Value, field: &Field) -> Result<Map, String> {
|
||||
self.process_field(val, field)
|
||||
}
|
||||
|
||||
fn exec_mut(&self, val: &mut Vec<Value>) -> Result<(), String> {
|
||||
for field in self.fields.iter() {
|
||||
let index = field.input_field.index;
|
||||
let index = field.input_index();
|
||||
match val.get(index) {
|
||||
Some(Value::Null) | None => {
|
||||
if !self.ignore_missing {
|
||||
return Err(format!(
|
||||
"{} processor: missing field: {}",
|
||||
self.kind(),
|
||||
field.get_field_name()
|
||||
field.input_name()
|
||||
));
|
||||
}
|
||||
}
|
||||
Some(v) => {
|
||||
// TODO(qtang): Let this method use the intermediate state collection directly.
|
||||
let mut map = self.process_field(v, field)?;
|
||||
field
|
||||
.output_fields_index_mapping
|
||||
.iter()
|
||||
.for_each(|(k, output_index)| {
|
||||
if let Some(v) = map.remove(k) {
|
||||
val[*output_index] = v;
|
||||
}
|
||||
});
|
||||
let timestamp = self.parse(v)?;
|
||||
let output_index = field.output_index();
|
||||
val[output_index] = Value::Timestamp(timestamp);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -225,8 +231,10 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
fn test_parse_epoch() {
|
||||
let mut processor = EpochProcessor::default();
|
||||
processor.with_resolution(super::Resolution::Second);
|
||||
let processor = EpochProcessor {
|
||||
resolution: super::Resolution::Second,
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
let values = [
|
||||
Value::String("1573840000".into()),
|
||||
|
||||
@@ -15,45 +15,43 @@
|
||||
use ahash::HashSet;
|
||||
use regex::Regex;
|
||||
|
||||
use crate::etl::field::{Field, Fields};
|
||||
use crate::etl::field::{Fields, OneInputOneOutputField};
|
||||
use crate::etl::processor::{
|
||||
update_one_one_output_keys, yaml_bool, yaml_field, yaml_fields, yaml_string, Processor,
|
||||
FIELDS_NAME, FIELD_NAME, IGNORE_MISSING_NAME, PATTERN_NAME,
|
||||
yaml_bool, yaml_new_field, yaml_new_fields, yaml_string, Processor, ProcessorBuilder,
|
||||
ProcessorKind, FIELDS_NAME, FIELD_NAME, IGNORE_MISSING_NAME, PATTERN_NAME,
|
||||
};
|
||||
use crate::etl::value::{Array, Map, Value};
|
||||
use crate::etl::value::Value;
|
||||
|
||||
pub(crate) const PROCESSOR_GSUB: &str = "gsub";
|
||||
|
||||
const REPLACEMENT_NAME: &str = "replacement";
|
||||
|
||||
/// A processor to replace all matches of a pattern in string by a replacement, only support string value, and array string value
|
||||
#[derive(Debug, Default)]
|
||||
pub struct GsubProcessor {
|
||||
pub struct GsubProcessorBuilder {
|
||||
fields: Fields,
|
||||
pattern: Option<Regex>,
|
||||
replacement: Option<String>,
|
||||
ignore_missing: bool,
|
||||
}
|
||||
|
||||
impl GsubProcessor {
|
||||
fn with_fields(&mut self, mut fields: Fields) {
|
||||
update_one_one_output_keys(&mut fields);
|
||||
self.fields = fields;
|
||||
impl ProcessorBuilder for GsubProcessorBuilder {
|
||||
fn output_keys(&self) -> HashSet<&str> {
|
||||
self.fields
|
||||
.iter()
|
||||
.map(|f| f.target_or_input_field())
|
||||
.collect()
|
||||
}
|
||||
|
||||
fn with_ignore_missing(&mut self, ignore_missing: bool) {
|
||||
self.ignore_missing = ignore_missing;
|
||||
fn input_keys(&self) -> HashSet<&str> {
|
||||
self.fields.iter().map(|f| f.input_field()).collect()
|
||||
}
|
||||
|
||||
fn try_pattern(&mut self, pattern: &str) -> Result<(), String> {
|
||||
self.pattern = Some(Regex::new(pattern).map_err(|e| e.to_string())?);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn with_replacement(&mut self, replacement: impl Into<String>) {
|
||||
self.replacement = Some(replacement.into());
|
||||
fn build(self, intermediate_keys: &[String]) -> Result<ProcessorKind, String> {
|
||||
self.build(intermediate_keys).map(ProcessorKind::Gsub)
|
||||
}
|
||||
}
|
||||
|
||||
impl GsubProcessorBuilder {
|
||||
fn check(self) -> Result<Self, String> {
|
||||
if self.pattern.is_none() {
|
||||
return Err("pattern is required".to_string());
|
||||
@@ -66,7 +64,49 @@ impl GsubProcessor {
|
||||
Ok(self)
|
||||
}
|
||||
|
||||
fn process_string_field(&self, val: &str, field: &Field) -> Result<Map, String> {
|
||||
fn build(self, intermediate_keys: &[String]) -> Result<GsubProcessor, String> {
|
||||
let mut real_fields = vec![];
|
||||
for field in self.fields.into_iter() {
|
||||
let input = OneInputOneOutputField::build(
|
||||
"gsub",
|
||||
intermediate_keys,
|
||||
field.input_field(),
|
||||
field.target_or_input_field(),
|
||||
)?;
|
||||
real_fields.push(input);
|
||||
}
|
||||
Ok(GsubProcessor {
|
||||
fields: real_fields,
|
||||
pattern: self.pattern,
|
||||
replacement: self.replacement,
|
||||
ignore_missing: self.ignore_missing,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
/// A processor to replace all matches of a pattern in string by a replacement, only support string value, and array string value
|
||||
#[derive(Debug, Default)]
|
||||
pub struct GsubProcessor {
|
||||
fields: Vec<OneInputOneOutputField>,
|
||||
pattern: Option<Regex>,
|
||||
replacement: Option<String>,
|
||||
ignore_missing: bool,
|
||||
}
|
||||
|
||||
impl GsubProcessor {
|
||||
fn check(self) -> Result<Self, String> {
|
||||
if self.pattern.is_none() {
|
||||
return Err("pattern is required".to_string());
|
||||
}
|
||||
|
||||
if self.replacement.is_none() {
|
||||
return Err("replacement is required".to_string());
|
||||
}
|
||||
|
||||
Ok(self)
|
||||
}
|
||||
|
||||
fn process_string(&self, val: &str) -> Result<Value, String> {
|
||||
let replacement = self.replacement.as_ref().unwrap();
|
||||
let new_val = self
|
||||
.pattern
|
||||
@@ -76,42 +116,28 @@ impl GsubProcessor {
|
||||
.to_string();
|
||||
let val = Value::String(new_val);
|
||||
|
||||
let key = field.get_target_field();
|
||||
|
||||
Ok(Map::one(key, val))
|
||||
Ok(val)
|
||||
}
|
||||
|
||||
fn process_array_field(&self, arr: &Array, field: &Field) -> Result<Map, String> {
|
||||
let key = field.get_target_field();
|
||||
|
||||
let re = self.pattern.as_ref().unwrap();
|
||||
let replacement = self.replacement.as_ref().unwrap();
|
||||
|
||||
let mut result = Array::default();
|
||||
for val in arr.iter() {
|
||||
match val {
|
||||
Value::String(haystack) => {
|
||||
let new_val = re.replace_all(haystack, replacement).to_string();
|
||||
result.push(Value::String(new_val));
|
||||
}
|
||||
_ => {
|
||||
return Err(format!(
|
||||
"{} processor: expect string or array string, but got {val:?}",
|
||||
self.kind()
|
||||
))
|
||||
}
|
||||
}
|
||||
fn process(&self, val: &Value) -> Result<Value, String> {
|
||||
match val {
|
||||
Value::String(val) => self.process_string(val),
|
||||
_ => Err(format!(
|
||||
"{} processor: expect string or array string, but got {val:?}",
|
||||
self.kind()
|
||||
)),
|
||||
}
|
||||
|
||||
Ok(Map::one(key, Value::Array(result)))
|
||||
}
|
||||
}
|
||||
|
||||
impl TryFrom<&yaml_rust::yaml::Hash> for GsubProcessor {
|
||||
impl TryFrom<&yaml_rust::yaml::Hash> for GsubProcessorBuilder {
|
||||
type Error = String;
|
||||
|
||||
fn try_from(value: &yaml_rust::yaml::Hash) -> Result<Self, Self::Error> {
|
||||
let mut processor = GsubProcessor::default();
|
||||
let mut fields = Fields::default();
|
||||
let mut ignore_missing = false;
|
||||
let mut pattern = None;
|
||||
let mut replacement = None;
|
||||
|
||||
for (k, v) in value.iter() {
|
||||
let key = k
|
||||
@@ -119,27 +145,36 @@ impl TryFrom<&yaml_rust::yaml::Hash> for GsubProcessor {
|
||||
.ok_or(format!("key must be a string, but got {k:?}"))?;
|
||||
match key {
|
||||
FIELD_NAME => {
|
||||
processor.with_fields(Fields::one(yaml_field(v, FIELD_NAME)?));
|
||||
fields = Fields::one(yaml_new_field(v, FIELD_NAME)?);
|
||||
}
|
||||
FIELDS_NAME => {
|
||||
processor.with_fields(yaml_fields(v, FIELDS_NAME)?);
|
||||
fields = yaml_new_fields(v, FIELDS_NAME)?;
|
||||
}
|
||||
PATTERN_NAME => {
|
||||
processor.try_pattern(&yaml_string(v, PATTERN_NAME)?)?;
|
||||
let pattern_str = yaml_string(v, PATTERN_NAME)?;
|
||||
pattern = Some(Regex::new(&pattern_str).map_err(|e| e.to_string())?);
|
||||
}
|
||||
REPLACEMENT_NAME => {
|
||||
processor.with_replacement(yaml_string(v, REPLACEMENT_NAME)?);
|
||||
let replacement_str = yaml_string(v, REPLACEMENT_NAME)?;
|
||||
replacement = Some(replacement_str);
|
||||
}
|
||||
|
||||
IGNORE_MISSING_NAME => {
|
||||
processor.with_ignore_missing(yaml_bool(v, IGNORE_MISSING_NAME)?);
|
||||
ignore_missing = yaml_bool(v, IGNORE_MISSING_NAME)?;
|
||||
}
|
||||
|
||||
_ => {}
|
||||
}
|
||||
}
|
||||
|
||||
processor.check()
|
||||
let builder = GsubProcessorBuilder {
|
||||
fields,
|
||||
pattern,
|
||||
replacement,
|
||||
ignore_missing,
|
||||
};
|
||||
|
||||
builder.check()
|
||||
}
|
||||
}
|
||||
|
||||
@@ -152,56 +187,23 @@ impl crate::etl::processor::Processor for GsubProcessor {
|
||||
self.ignore_missing
|
||||
}
|
||||
|
||||
fn fields(&self) -> &Fields {
|
||||
&self.fields
|
||||
}
|
||||
|
||||
fn fields_mut(&mut self) -> &mut Fields {
|
||||
&mut self.fields
|
||||
}
|
||||
|
||||
fn output_keys(&self) -> HashSet<String> {
|
||||
self.fields
|
||||
.iter()
|
||||
.map(|f| f.get_target_field().to_string())
|
||||
.collect()
|
||||
}
|
||||
|
||||
fn exec_field(&self, val: &Value, field: &Field) -> Result<Map, String> {
|
||||
match val {
|
||||
Value::String(val) => self.process_string_field(val, field),
|
||||
Value::Array(arr) => self.process_array_field(arr, field),
|
||||
_ => Err(format!(
|
||||
"{} processor: expect string or array string, but got {val:?}",
|
||||
self.kind()
|
||||
)),
|
||||
}
|
||||
}
|
||||
|
||||
fn exec_mut(&self, val: &mut Vec<Value>) -> Result<(), String> {
|
||||
for field in self.fields.iter() {
|
||||
let index = field.input_field.index;
|
||||
let index = field.input_index();
|
||||
match val.get(index) {
|
||||
Some(Value::Null) | None => {
|
||||
if !self.ignore_missing {
|
||||
return Err(format!(
|
||||
"{} processor: missing field: {}",
|
||||
self.kind(),
|
||||
field.get_field_name()
|
||||
field.input_name()
|
||||
));
|
||||
}
|
||||
}
|
||||
Some(v) => {
|
||||
// TODO(qtang): Let this method use the intermediate state collection directly.
|
||||
let mut map = self.exec_field(v, field)?;
|
||||
field
|
||||
.output_fields_index_mapping
|
||||
.iter()
|
||||
.for_each(|(k, output_index)| {
|
||||
if let Some(v) = map.remove(k) {
|
||||
val[*output_index] = v;
|
||||
}
|
||||
});
|
||||
let result = self.process(v)?;
|
||||
let output_index = field.output_index();
|
||||
val[output_index] = result;
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -211,55 +213,20 @@ impl crate::etl::processor::Processor for GsubProcessor {
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use crate::etl::field::Field;
|
||||
use crate::etl::processor::gsub::GsubProcessor;
|
||||
use crate::etl::processor::Processor;
|
||||
use crate::etl::value::{Map, Value};
|
||||
use crate::etl::value::Value;
|
||||
|
||||
#[test]
|
||||
fn test_string_value() {
|
||||
let mut processor = GsubProcessor::default();
|
||||
processor.try_pattern(r"\d+").unwrap();
|
||||
processor.with_replacement("xxx");
|
||||
let processor = GsubProcessor {
|
||||
pattern: Some(regex::Regex::new(r"\d+").unwrap()),
|
||||
replacement: Some("xxx".to_string()),
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
let field = Field::new("message");
|
||||
let val = Value::String("123".to_string());
|
||||
let result = processor.exec_field(&val, &field).unwrap();
|
||||
let result = processor.process(&val).unwrap();
|
||||
|
||||
assert_eq!(
|
||||
result,
|
||||
Map::one("message", Value::String("xxx".to_string()))
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_array_string_value() {
|
||||
let mut processor = GsubProcessor::default();
|
||||
processor.try_pattern(r"\d+").unwrap();
|
||||
processor.with_replacement("xxx");
|
||||
|
||||
let field = Field::new("message");
|
||||
let val = Value::Array(
|
||||
vec![
|
||||
Value::String("123".to_string()),
|
||||
Value::String("456".to_string()),
|
||||
]
|
||||
.into(),
|
||||
);
|
||||
let result = processor.exec_field(&val, &field).unwrap();
|
||||
|
||||
assert_eq!(
|
||||
result,
|
||||
Map::one(
|
||||
"message",
|
||||
Value::Array(
|
||||
vec![
|
||||
Value::String("xxx".to_string()),
|
||||
Value::String("xxx".to_string())
|
||||
]
|
||||
.into()
|
||||
)
|
||||
)
|
||||
);
|
||||
assert_eq!(result, Value::String("xxx".to_string()));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -14,40 +14,78 @@
|
||||
|
||||
use ahash::HashSet;
|
||||
|
||||
use crate::etl::field::{Field, Fields};
|
||||
use crate::etl::field::{Fields, OneInputOneOutputField};
|
||||
use crate::etl::processor::{
|
||||
update_one_one_output_keys, yaml_bool, yaml_field, yaml_fields, yaml_string, Processor,
|
||||
FIELDS_NAME, FIELD_NAME, IGNORE_MISSING_NAME, SEPARATOR_NAME,
|
||||
yaml_bool, yaml_new_field, yaml_new_fields, yaml_string, Processor, ProcessorBuilder,
|
||||
ProcessorKind, FIELDS_NAME, FIELD_NAME, IGNORE_MISSING_NAME, SEPARATOR_NAME,
|
||||
};
|
||||
use crate::etl::value::{Array, Map, Value};
|
||||
use crate::etl::value::{Array, Value};
|
||||
|
||||
pub(crate) const PROCESSOR_JOIN: &str = "join";
|
||||
|
||||
/// A processor to join each element of an array into a single string using a separator string between each element
|
||||
#[derive(Debug, Default)]
|
||||
pub struct JoinProcessor {
|
||||
pub struct JoinProcessorBuilder {
|
||||
fields: Fields,
|
||||
separator: Option<String>,
|
||||
ignore_missing: bool,
|
||||
}
|
||||
|
||||
impl ProcessorBuilder for JoinProcessorBuilder {
|
||||
fn output_keys(&self) -> HashSet<&str> {
|
||||
self.fields
|
||||
.iter()
|
||||
.map(|f| f.target_or_input_field())
|
||||
.collect()
|
||||
}
|
||||
|
||||
fn input_keys(&self) -> HashSet<&str> {
|
||||
self.fields.iter().map(|f| f.input_field()).collect()
|
||||
}
|
||||
|
||||
fn build(self, intermediate_keys: &[String]) -> Result<ProcessorKind, String> {
|
||||
self.build(intermediate_keys).map(ProcessorKind::Join)
|
||||
}
|
||||
}
|
||||
|
||||
impl JoinProcessorBuilder {
|
||||
fn check(self) -> Result<Self, String> {
|
||||
if self.separator.is_none() {
|
||||
return Err("separator is required".to_string());
|
||||
}
|
||||
|
||||
Ok(self)
|
||||
}
|
||||
|
||||
pub fn build(self, intermediate_keys: &[String]) -> Result<JoinProcessor, String> {
|
||||
let mut real_fields = vec![];
|
||||
for field in self.fields.into_iter() {
|
||||
let input = OneInputOneOutputField::build(
|
||||
"join",
|
||||
intermediate_keys,
|
||||
field.input_field(),
|
||||
field.target_or_input_field(),
|
||||
)?;
|
||||
real_fields.push(input);
|
||||
}
|
||||
|
||||
Ok(JoinProcessor {
|
||||
fields: real_fields,
|
||||
separator: self.separator,
|
||||
ignore_missing: self.ignore_missing,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
/// A processor to join each element of an array into a single string using a separator string between each element
|
||||
#[derive(Debug, Default)]
|
||||
pub struct JoinProcessor {
|
||||
fields: Vec<OneInputOneOutputField>,
|
||||
separator: Option<String>,
|
||||
ignore_missing: bool,
|
||||
}
|
||||
|
||||
impl JoinProcessor {
|
||||
fn with_fields(&mut self, mut fields: Fields) {
|
||||
update_one_one_output_keys(&mut fields);
|
||||
self.fields = fields;
|
||||
}
|
||||
|
||||
fn with_separator(&mut self, separator: impl Into<String>) {
|
||||
self.separator = Some(separator.into());
|
||||
}
|
||||
|
||||
fn with_ignore_missing(&mut self, ignore_missing: bool) {
|
||||
self.ignore_missing = ignore_missing;
|
||||
}
|
||||
|
||||
fn process_field(&self, arr: &Array, field: &Field) -> Result<Map, String> {
|
||||
let key = field.get_target_field();
|
||||
|
||||
fn process(&self, arr: &Array) -> Result<Value, String> {
|
||||
let sep = self.separator.as_ref().unwrap();
|
||||
let val = arr
|
||||
.iter()
|
||||
@@ -55,7 +93,7 @@ impl JoinProcessor {
|
||||
.collect::<Vec<String>>()
|
||||
.join(sep);
|
||||
|
||||
Ok(Map::one(key, Value::String(val)))
|
||||
Ok(Value::String(val))
|
||||
}
|
||||
|
||||
fn check(self) -> Result<Self, String> {
|
||||
@@ -67,11 +105,13 @@ impl JoinProcessor {
|
||||
}
|
||||
}
|
||||
|
||||
impl TryFrom<&yaml_rust::yaml::Hash> for JoinProcessor {
|
||||
impl TryFrom<&yaml_rust::yaml::Hash> for JoinProcessorBuilder {
|
||||
type Error = String;
|
||||
|
||||
fn try_from(value: &yaml_rust::yaml::Hash) -> Result<Self, Self::Error> {
|
||||
let mut processor = JoinProcessor::default();
|
||||
let mut fields = Fields::default();
|
||||
let mut separator = None;
|
||||
let mut ignore_missing = false;
|
||||
|
||||
for (k, v) in value.iter() {
|
||||
let key = k
|
||||
@@ -79,30 +119,31 @@ impl TryFrom<&yaml_rust::yaml::Hash> for JoinProcessor {
|
||||
.ok_or(format!("key must be a string, but got {k:?}"))?;
|
||||
match key {
|
||||
FIELD_NAME => {
|
||||
processor.with_fields(Fields::one(yaml_field(v, FIELD_NAME)?));
|
||||
fields = Fields::one(yaml_new_field(v, FIELD_NAME)?);
|
||||
}
|
||||
FIELDS_NAME => {
|
||||
processor.with_fields(yaml_fields(v, FIELDS_NAME)?);
|
||||
fields = yaml_new_fields(v, FIELDS_NAME)?;
|
||||
}
|
||||
SEPARATOR_NAME => {
|
||||
processor.with_separator(yaml_string(v, SEPARATOR_NAME)?);
|
||||
separator = Some(yaml_string(v, SEPARATOR_NAME)?);
|
||||
}
|
||||
IGNORE_MISSING_NAME => {
|
||||
processor.with_ignore_missing(yaml_bool(v, IGNORE_MISSING_NAME)?);
|
||||
ignore_missing = yaml_bool(v, IGNORE_MISSING_NAME)?;
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
}
|
||||
|
||||
processor.check()
|
||||
let builder = JoinProcessorBuilder {
|
||||
fields,
|
||||
separator,
|
||||
ignore_missing,
|
||||
};
|
||||
builder.check()
|
||||
}
|
||||
}
|
||||
|
||||
impl Processor for JoinProcessor {
|
||||
fn fields(&self) -> &Fields {
|
||||
&self.fields
|
||||
}
|
||||
|
||||
fn kind(&self) -> &str {
|
||||
PROCESSOR_JOIN
|
||||
}
|
||||
@@ -111,49 +152,21 @@ impl Processor for JoinProcessor {
|
||||
self.ignore_missing
|
||||
}
|
||||
|
||||
fn fields_mut(&mut self) -> &mut Fields {
|
||||
&mut self.fields
|
||||
}
|
||||
|
||||
fn output_keys(&self) -> HashSet<String> {
|
||||
self.fields
|
||||
.iter()
|
||||
.map(|f| f.get_target_field().to_string())
|
||||
.collect()
|
||||
}
|
||||
|
||||
fn exec_field(&self, val: &Value, field: &Field) -> Result<Map, String> {
|
||||
match val {
|
||||
Value::Array(arr) => self.process_field(arr, field),
|
||||
_ => Err(format!(
|
||||
"{} processor: expect array value, but got {val:?}",
|
||||
self.kind()
|
||||
)),
|
||||
}
|
||||
}
|
||||
|
||||
fn exec_mut(&self, val: &mut Vec<Value>) -> Result<(), String> {
|
||||
for field in self.fields.iter() {
|
||||
let index = field.input_field.index;
|
||||
let index = field.input_index();
|
||||
match val.get(index) {
|
||||
Some(Value::Array(arr)) => {
|
||||
// TODO(qtang): Let this method use the intermediate state collection directly.
|
||||
let mut map = self.process_field(arr, field)?;
|
||||
field
|
||||
.output_fields_index_mapping
|
||||
.iter()
|
||||
.for_each(|(k, output_index)| {
|
||||
if let Some(v) = map.remove(k) {
|
||||
val[*output_index] = v;
|
||||
}
|
||||
});
|
||||
let result = self.process(arr)?;
|
||||
let output_index = field.output_index();
|
||||
val[output_index] = result;
|
||||
}
|
||||
Some(Value::Null) | None => {
|
||||
if !self.ignore_missing {
|
||||
return Err(format!(
|
||||
"{} processor: missing field: {}",
|
||||
self.kind(),
|
||||
field.get_field_name()
|
||||
field.input_name()
|
||||
));
|
||||
}
|
||||
}
|
||||
@@ -173,25 +186,22 @@ impl Processor for JoinProcessor {
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
|
||||
use crate::etl::field::Field;
|
||||
use crate::etl::processor::join::JoinProcessor;
|
||||
use crate::etl::processor::Processor;
|
||||
use crate::etl::value::{Map, Value};
|
||||
use crate::etl::value::Value;
|
||||
|
||||
#[test]
|
||||
fn test_join_processor() {
|
||||
let mut processor = JoinProcessor::default();
|
||||
processor.with_separator("-");
|
||||
let processor = JoinProcessor {
|
||||
separator: Some("-".to_string()),
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
let field = Field::new("test");
|
||||
let arr = Value::Array(
|
||||
vec![
|
||||
Value::String("a".to_string()),
|
||||
Value::String("b".to_string()),
|
||||
]
|
||||
.into(),
|
||||
);
|
||||
let result = processor.exec_field(&arr, &field).unwrap();
|
||||
assert_eq!(result, Map::one("test", Value::String("a-b".to_string())));
|
||||
let arr = vec![
|
||||
Value::String("a".to_string()),
|
||||
Value::String("b".to_string()),
|
||||
]
|
||||
.into();
|
||||
let result = processor.process(&arr).unwrap();
|
||||
assert_eq!(result, Value::String("a-b".to_string()));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -14,12 +14,12 @@
|
||||
|
||||
use ahash::HashSet;
|
||||
|
||||
use crate::etl::field::{Field, Fields};
|
||||
use crate::etl::field::{Fields, OneInputOneOutputField};
|
||||
use crate::etl::processor::{
|
||||
update_one_one_output_keys, yaml_bool, yaml_field, yaml_fields, yaml_string, Processor,
|
||||
FIELDS_NAME, FIELD_NAME, IGNORE_MISSING_NAME, METHOD_NAME,
|
||||
yaml_bool, yaml_new_field, yaml_new_fields, yaml_string, Processor, ProcessorBuilder,
|
||||
ProcessorKind, FIELDS_NAME, FIELD_NAME, IGNORE_MISSING_NAME, METHOD_NAME,
|
||||
};
|
||||
use crate::etl::value::{Map, Value};
|
||||
use crate::etl::value::Value;
|
||||
|
||||
pub(crate) const PROCESSOR_LETTER: &str = "letter";
|
||||
|
||||
@@ -54,29 +54,61 @@ impl std::str::FromStr for Method {
|
||||
}
|
||||
}
|
||||
|
||||
/// only support string value
|
||||
#[derive(Debug, Default)]
|
||||
pub struct LetterProcessor {
|
||||
pub struct LetterProcessorBuilder {
|
||||
fields: Fields,
|
||||
method: Method,
|
||||
ignore_missing: bool,
|
||||
}
|
||||
|
||||
impl ProcessorBuilder for LetterProcessorBuilder {
|
||||
fn output_keys(&self) -> HashSet<&str> {
|
||||
self.fields
|
||||
.iter()
|
||||
.map(|f| f.target_or_input_field())
|
||||
.collect()
|
||||
}
|
||||
|
||||
fn input_keys(&self) -> HashSet<&str> {
|
||||
self.fields.iter().map(|f| f.input_field()).collect()
|
||||
}
|
||||
|
||||
fn build(self, intermediate_keys: &[String]) -> Result<ProcessorKind, String> {
|
||||
self.build(intermediate_keys).map(ProcessorKind::Letter)
|
||||
}
|
||||
}
|
||||
|
||||
impl LetterProcessorBuilder {
|
||||
pub fn build(self, intermediate_keys: &[String]) -> Result<LetterProcessor, String> {
|
||||
let mut real_fields = vec![];
|
||||
for field in self.fields.into_iter() {
|
||||
let input = OneInputOneOutputField::build(
|
||||
"letter",
|
||||
intermediate_keys,
|
||||
field.input_field(),
|
||||
field.target_or_input_field(),
|
||||
)?;
|
||||
real_fields.push(input);
|
||||
}
|
||||
|
||||
Ok(LetterProcessor {
|
||||
fields: real_fields,
|
||||
method: self.method,
|
||||
ignore_missing: self.ignore_missing,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
/// only support string value
|
||||
#[derive(Debug, Default)]
|
||||
pub struct LetterProcessor {
|
||||
fields: Vec<OneInputOneOutputField>,
|
||||
method: Method,
|
||||
ignore_missing: bool,
|
||||
}
|
||||
|
||||
impl LetterProcessor {
|
||||
fn with_fields(&mut self, mut fields: Fields) {
|
||||
update_one_one_output_keys(&mut fields);
|
||||
self.fields = fields;
|
||||
}
|
||||
|
||||
fn with_method(&mut self, method: Method) {
|
||||
self.method = method;
|
||||
}
|
||||
|
||||
fn with_ignore_missing(&mut self, ignore_missing: bool) {
|
||||
self.ignore_missing = ignore_missing;
|
||||
}
|
||||
|
||||
fn process_field(&self, val: &str, field: &Field) -> Result<Map, String> {
|
||||
fn process_field(&self, val: &str) -> Result<Value, String> {
|
||||
let processed = match self.method {
|
||||
Method::Upper => val.to_uppercase(),
|
||||
Method::Lower => val.to_lowercase(),
|
||||
@@ -84,17 +116,17 @@ impl LetterProcessor {
|
||||
};
|
||||
let val = Value::String(processed);
|
||||
|
||||
let key = field.get_target_field();
|
||||
|
||||
Ok(Map::one(key, val))
|
||||
Ok(val)
|
||||
}
|
||||
}
|
||||
|
||||
impl TryFrom<&yaml_rust::yaml::Hash> for LetterProcessor {
|
||||
impl TryFrom<&yaml_rust::yaml::Hash> for LetterProcessorBuilder {
|
||||
type Error = String;
|
||||
|
||||
fn try_from(value: &yaml_rust::yaml::Hash) -> Result<Self, Self::Error> {
|
||||
let mut processor = LetterProcessor::default();
|
||||
let mut fields = Fields::default();
|
||||
let mut method = Method::Lower;
|
||||
let mut ignore_missing = false;
|
||||
|
||||
for (k, v) in value.iter() {
|
||||
let key = k
|
||||
@@ -102,23 +134,26 @@ impl TryFrom<&yaml_rust::yaml::Hash> for LetterProcessor {
|
||||
.ok_or(format!("key must be a string, but got {k:?}"))?;
|
||||
match key {
|
||||
FIELD_NAME => {
|
||||
processor.with_fields(Fields::one(yaml_field(v, FIELD_NAME)?));
|
||||
fields = Fields::one(yaml_new_field(v, FIELD_NAME)?);
|
||||
}
|
||||
FIELDS_NAME => {
|
||||
processor.with_fields(yaml_fields(v, FIELDS_NAME)?);
|
||||
fields = yaml_new_fields(v, FIELDS_NAME)?;
|
||||
}
|
||||
METHOD_NAME => {
|
||||
let method = yaml_string(v, METHOD_NAME)?;
|
||||
processor.with_method(method.parse()?);
|
||||
method = yaml_string(v, METHOD_NAME)?.parse()?;
|
||||
}
|
||||
IGNORE_MISSING_NAME => {
|
||||
processor.with_ignore_missing(yaml_bool(v, IGNORE_MISSING_NAME)?);
|
||||
ignore_missing = yaml_bool(v, IGNORE_MISSING_NAME)?;
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(processor)
|
||||
Ok(LetterProcessorBuilder {
|
||||
fields,
|
||||
method,
|
||||
ignore_missing,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
@@ -131,53 +166,21 @@ impl Processor for LetterProcessor {
|
||||
self.ignore_missing
|
||||
}
|
||||
|
||||
fn fields(&self) -> &Fields {
|
||||
&self.fields
|
||||
}
|
||||
|
||||
fn fields_mut(&mut self) -> &mut Fields {
|
||||
&mut self.fields
|
||||
}
|
||||
|
||||
fn output_keys(&self) -> HashSet<String> {
|
||||
self.fields
|
||||
.iter()
|
||||
.map(|f| f.get_target_field().to_string())
|
||||
.collect()
|
||||
}
|
||||
|
||||
fn exec_field(&self, val: &Value, field: &Field) -> Result<Map, String> {
|
||||
match val {
|
||||
Value::String(val) => self.process_field(val, field),
|
||||
_ => Err(format!(
|
||||
"{} processor: expect string value, but got {val:?}",
|
||||
self.kind()
|
||||
)),
|
||||
}
|
||||
}
|
||||
|
||||
fn exec_mut(&self, val: &mut Vec<Value>) -> Result<(), String> {
|
||||
for field in self.fields.iter() {
|
||||
let index = field.input_field.index;
|
||||
let index = field.input_index();
|
||||
match val.get(index) {
|
||||
Some(Value::String(s)) => {
|
||||
// TODO(qtang): Let this method use the intermediate state collection directly.
|
||||
let mut processed = self.process_field(s, field)?;
|
||||
field
|
||||
.output_fields_index_mapping
|
||||
.iter()
|
||||
.for_each(|(k, output_index)| {
|
||||
if let Some(v) = processed.remove(k) {
|
||||
val[*output_index] = v;
|
||||
}
|
||||
});
|
||||
let result = self.process_field(s)?;
|
||||
let (_, output_index) = field.output();
|
||||
val[*output_index] = result;
|
||||
}
|
||||
Some(Value::Null) | None => {
|
||||
if !self.ignore_missing {
|
||||
return Err(format!(
|
||||
"{} processor: missing field: {}",
|
||||
self.kind(),
|
||||
field.get_field_name()
|
||||
&field.input().name
|
||||
));
|
||||
}
|
||||
}
|
||||
@@ -204,33 +207,36 @@ fn capitalize(s: &str) -> String {
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use crate::etl::field::Fields;
|
||||
use crate::etl::processor::letter::{LetterProcessor, Method};
|
||||
use crate::etl::value::{Map, Value};
|
||||
use crate::etl::value::Value;
|
||||
|
||||
#[test]
|
||||
fn test_process() {
|
||||
let field = "letter";
|
||||
let ff: crate::etl::processor::Field = field.parse().unwrap();
|
||||
let mut processor = LetterProcessor::default();
|
||||
processor.with_fields(Fields::one(ff.clone()));
|
||||
|
||||
{
|
||||
processor.with_method(Method::Upper);
|
||||
let processed = processor.process_field("pipeline", &ff).unwrap();
|
||||
assert_eq!(Map::one(field, Value::String("PIPELINE".into())), processed)
|
||||
let processor = LetterProcessor {
|
||||
method: Method::Upper,
|
||||
..Default::default()
|
||||
};
|
||||
let processed = processor.process_field("pipeline").unwrap();
|
||||
assert_eq!(Value::String("PIPELINE".into()), processed)
|
||||
}
|
||||
|
||||
{
|
||||
processor.with_method(Method::Lower);
|
||||
let processed = processor.process_field("Pipeline", &ff).unwrap();
|
||||
assert_eq!(Map::one(field, Value::String("pipeline".into())), processed)
|
||||
let processor = LetterProcessor {
|
||||
method: Method::Lower,
|
||||
..Default::default()
|
||||
};
|
||||
let processed = processor.process_field("Pipeline").unwrap();
|
||||
assert_eq!(Value::String("pipeline".into()), processed)
|
||||
}
|
||||
|
||||
{
|
||||
processor.with_method(Method::Capital);
|
||||
let processed = processor.process_field("pipeline", &ff).unwrap();
|
||||
assert_eq!(Map::one(field, Value::String("Pipeline".into())), processed)
|
||||
let processor = LetterProcessor {
|
||||
method: Method::Capital,
|
||||
..Default::default()
|
||||
};
|
||||
let processed = processor.process_field("pipeline").unwrap();
|
||||
assert_eq!(Value::String("Pipeline".into()), processed)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -18,16 +18,17 @@ const PATTERNS_NAME: &str = "patterns";
|
||||
|
||||
pub(crate) const PROCESSOR_REGEX: &str = "regex";
|
||||
|
||||
use ahash::HashSet;
|
||||
use ahash::{HashSet, HashSetExt};
|
||||
use lazy_static::lazy_static;
|
||||
use regex::Regex;
|
||||
|
||||
use crate::etl::field::Fields;
|
||||
use crate::etl::field::{Fields, InputFieldInfo, OneInputMultiOutputField};
|
||||
use crate::etl::find_key_index;
|
||||
use crate::etl::processor::{
|
||||
yaml_bool, yaml_field, yaml_fields, yaml_string, yaml_strings, Field, Processor, FIELDS_NAME,
|
||||
FIELD_NAME, IGNORE_MISSING_NAME, PATTERN_NAME,
|
||||
yaml_bool, yaml_new_field, yaml_new_fields, yaml_string, yaml_strings, Processor,
|
||||
ProcessorBuilder, ProcessorKind, FIELDS_NAME, FIELD_NAME, IGNORE_MISSING_NAME, PATTERN_NAME,
|
||||
};
|
||||
use crate::etl::value::{Map, Value};
|
||||
use crate::etl::value::Value;
|
||||
|
||||
lazy_static! {
|
||||
static ref GROUPS_NAME_REGEX: Regex = Regex::new(r"\(\?P?<([[:word:]]+)>.+?\)").unwrap();
|
||||
@@ -40,6 +41,10 @@ fn get_regex_group_names(s: &str) -> Vec<String> {
|
||||
.collect()
|
||||
}
|
||||
|
||||
fn generate_key(prefix: &str, group: &str) -> String {
|
||||
format!("{prefix}_{group}")
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
struct GroupRegex {
|
||||
origin: String,
|
||||
@@ -72,34 +77,29 @@ impl std::str::FromStr for GroupRegex {
|
||||
}
|
||||
}
|
||||
|
||||
/// only support string value
|
||||
/// if no value found from a pattern, the target_field will be ignored
|
||||
#[derive(Debug, Default)]
|
||||
pub struct RegexProcessor {
|
||||
pub struct RegexProcessorBuilder {
|
||||
fields: Fields,
|
||||
patterns: Vec<GroupRegex>,
|
||||
ignore_missing: bool,
|
||||
output_keys: HashSet<String>,
|
||||
}
|
||||
|
||||
impl RegexProcessor {
|
||||
fn with_fields(&mut self, fields: Fields) {
|
||||
self.fields = fields;
|
||||
impl ProcessorBuilder for RegexProcessorBuilder {
|
||||
fn output_keys(&self) -> HashSet<&str> {
|
||||
self.output_keys.iter().map(|k| k.as_str()).collect()
|
||||
}
|
||||
|
||||
fn try_with_patterns(&mut self, patterns: Vec<String>) -> Result<(), String> {
|
||||
let mut rs = vec![];
|
||||
for pattern in patterns {
|
||||
let gr = pattern.parse()?;
|
||||
rs.push(gr);
|
||||
}
|
||||
self.patterns = rs;
|
||||
Ok(())
|
||||
fn input_keys(&self) -> HashSet<&str> {
|
||||
self.fields.iter().map(|f| f.input_field()).collect()
|
||||
}
|
||||
|
||||
fn with_ignore_missing(&mut self, ignore_missing: bool) {
|
||||
self.ignore_missing = ignore_missing;
|
||||
fn build(self, intermediate_keys: &[String]) -> Result<ProcessorKind, String> {
|
||||
self.build(intermediate_keys).map(ProcessorKind::Regex)
|
||||
}
|
||||
}
|
||||
|
||||
impl RegexProcessorBuilder {
|
||||
fn check(self) -> Result<Self, String> {
|
||||
if self.fields.is_empty() {
|
||||
return Err(format!(
|
||||
@@ -118,47 +118,78 @@ impl RegexProcessor {
|
||||
Ok(self)
|
||||
}
|
||||
|
||||
fn generate_key(prefix: &str, group: &str) -> String {
|
||||
format!("{prefix}_{group}")
|
||||
fn build_group_output_info(
|
||||
group_regex: &GroupRegex,
|
||||
om_field: &OneInputMultiOutputField,
|
||||
intermediate_keys: &[String],
|
||||
) -> Result<Vec<OutPutInfo>, String> {
|
||||
group_regex
|
||||
.groups
|
||||
.iter()
|
||||
.map(|g| {
|
||||
let key = generate_key(om_field.target_prefix(), g);
|
||||
let index = find_key_index(intermediate_keys, &key, "regex");
|
||||
index.map(|index| OutPutInfo {
|
||||
final_key: key,
|
||||
group_name: g.to_string(),
|
||||
index,
|
||||
})
|
||||
})
|
||||
.collect::<Result<Vec<_>, String>>()
|
||||
}
|
||||
|
||||
fn process_field(&self, val: &str, field: &Field, gr: &GroupRegex) -> Result<Map, String> {
|
||||
let mut map = Map::default();
|
||||
|
||||
if let Some(captures) = gr.regex.captures(val) {
|
||||
for group in &gr.groups {
|
||||
if let Some(capture) = captures.name(group) {
|
||||
let value = capture.as_str().to_string();
|
||||
let prefix = field.get_target_field();
|
||||
|
||||
let key = Self::generate_key(prefix, group);
|
||||
|
||||
map.insert(key, Value::String(value));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(map)
|
||||
fn build_group_output_infos(
|
||||
patterns: &[GroupRegex],
|
||||
om_field: &OneInputMultiOutputField,
|
||||
intermediate_keys: &[String],
|
||||
) -> Result<Vec<Vec<OutPutInfo>>, String> {
|
||||
patterns
|
||||
.iter()
|
||||
.map(|group_regex| {
|
||||
Self::build_group_output_info(group_regex, om_field, intermediate_keys)
|
||||
})
|
||||
.collect::<Result<Vec<_>, String>>()
|
||||
}
|
||||
|
||||
fn update_output_keys(&mut self) {
|
||||
for field in self.fields.iter_mut() {
|
||||
for gr in &self.patterns {
|
||||
for group in &gr.groups {
|
||||
field
|
||||
.output_fields_index_mapping
|
||||
.insert(Self::generate_key(field.get_target_field(), group), 0_usize);
|
||||
}
|
||||
}
|
||||
/// Builds the complete output lookup table for the processor.
///
/// For every field in `real_fields` this calls `build_group_output_infos`,
/// so the resulting `inner` is nested as field -> pattern -> group.
/// The first error produced for any field is returned unchanged.
fn build_output_info(
|
||||
real_fields: &[OneInputMultiOutputField],
|
||||
patterns: &[GroupRegex],
|
||||
intermediate_keys: &[String],
|
||||
) -> Result<RegexProcessorOutputInfo, String> {
|
||||
let inner = real_fields
|
||||
.iter()
|
||||
.map(|om_field| Self::build_group_output_infos(patterns, om_field, intermediate_keys))
|
||||
.collect::<Result<Vec<_>, String>>();
|
||||
// Wrap the nested vectors only when every field resolved successfully.
inner.map(|inner| RegexProcessorOutputInfo { inner })
|
||||
}
|
||||
|
||||
/// Consumes the builder and produces a `RegexProcessor` bound to `intermediate_keys`.
///
/// Each configured field has its input key looked up with `find_key_index`;
/// a failed lookup aborts the build with that error. The output lookup table
/// is then computed from the resolved fields and the builder's patterns.
fn build(self, intermediate_keys: &[String]) -> Result<RegexProcessor, String> {
|
||||
let mut real_fields = vec![];
|
||||
for field in self.fields.into_iter() {
|
||||
// Resolve the input key first; `?` propagates the lookup error.
let input_index = find_key_index(intermediate_keys, field.input_field(), "regex")?;
|
||||
let input_field_info = InputFieldInfo::new(field.input_field(), input_index);
|
||||
|
||||
let input = OneInputMultiOutputField::new(input_field_info, field.target_field);
|
||||
real_fields.push(input);
|
||||
}
|
||||
// Output slots depend on the resolved fields, so this must run after the loop.
let output_info = Self::build_output_info(&real_fields, &self.patterns, intermediate_keys)?;
|
||||
Ok(RegexProcessor {
|
||||
// fields: Fields::one(Field::new("test".to_string())),
|
||||
fields: real_fields,
|
||||
patterns: self.patterns,
|
||||
output_info,
|
||||
ignore_missing: self.ignore_missing,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
impl TryFrom<&yaml_rust::yaml::Hash> for RegexProcessor {
|
||||
impl TryFrom<&yaml_rust::yaml::Hash> for RegexProcessorBuilder {
|
||||
type Error = String;
|
||||
|
||||
fn try_from(value: &yaml_rust::yaml::Hash) -> Result<Self, Self::Error> {
|
||||
let mut processor = RegexProcessor::default();
|
||||
let mut fields = Fields::default();
|
||||
let mut patterns: Vec<GroupRegex> = vec![];
|
||||
let mut ignore_missing = false;
|
||||
|
||||
for (k, v) in value.iter() {
|
||||
let key = k
|
||||
@@ -166,28 +197,113 @@ impl TryFrom<&yaml_rust::yaml::Hash> for RegexProcessor {
|
||||
.ok_or(format!("key must be a string, but got {k:?}"))?;
|
||||
match key {
|
||||
FIELD_NAME => {
|
||||
processor.with_fields(Fields::one(yaml_field(v, FIELD_NAME)?));
|
||||
fields = Fields::one(yaml_new_field(v, FIELD_NAME)?);
|
||||
}
|
||||
FIELDS_NAME => {
|
||||
processor.with_fields(yaml_fields(v, FIELDS_NAME)?);
|
||||
fields = yaml_new_fields(v, FIELDS_NAME)?;
|
||||
}
|
||||
PATTERN_NAME => {
|
||||
processor.try_with_patterns(vec![yaml_string(v, PATTERN_NAME)?])?;
|
||||
let pattern = yaml_string(v, PATTERN_NAME)?;
|
||||
let gr = pattern.parse()?;
|
||||
patterns.push(gr);
|
||||
}
|
||||
PATTERNS_NAME => {
|
||||
processor.try_with_patterns(yaml_strings(v, PATTERNS_NAME)?)?;
|
||||
for pattern in yaml_strings(v, PATTERNS_NAME)? {
|
||||
let gr = pattern.parse()?;
|
||||
patterns.push(gr);
|
||||
}
|
||||
}
|
||||
IGNORE_MISSING_NAME => {
|
||||
processor.with_ignore_missing(yaml_bool(v, IGNORE_MISSING_NAME)?);
|
||||
ignore_missing = yaml_bool(v, IGNORE_MISSING_NAME)?;
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
}
|
||||
|
||||
processor.check().map(|mut p| {
|
||||
p.update_output_keys();
|
||||
p
|
||||
})
|
||||
let pattern_output_keys = patterns
|
||||
.iter()
|
||||
.flat_map(|pattern| pattern.groups.iter())
|
||||
.collect::<Vec<_>>();
|
||||
let mut output_keys = HashSet::new();
|
||||
for field in fields.iter() {
|
||||
for x in pattern_output_keys.iter() {
|
||||
output_keys.insert(generate_key(field.target_or_input_field(), x));
|
||||
}
|
||||
}
|
||||
|
||||
let processor_builder = RegexProcessorBuilder {
|
||||
fields,
|
||||
patterns,
|
||||
ignore_missing,
|
||||
output_keys,
|
||||
};
|
||||
|
||||
processor_builder.check()
|
||||
}
|
||||
}
|
||||
|
||||
/// Resolved output slot for one capture group of one (field, pattern) pair.
#[derive(Debug, Default)]
|
||||
struct OutPutInfo {
|
||||
// Key produced by `generate_key` from the field's target prefix and the group name.
final_key: String,
|
||||
// Name of the capture group this entry belongs to.
group_name: String,
|
||||
// Value returned by `find_key_index` for `final_key`
// (presumably its position in the intermediate keys; confirm against `find_key_index`).
index: usize,
|
||||
}
|
||||
|
||||
/// Lookup table of output slots for a `RegexProcessor`.
#[derive(Debug, Default)]
|
||||
struct RegexProcessorOutputInfo {
|
||||
// Nested as [field index][pattern index][group index].
pub inner: Vec<Vec<Vec<OutPutInfo>>>,
|
||||
}
|
||||
|
||||
impl RegexProcessorOutputInfo {
|
||||
/// Returns the stored output index for the given field, pattern and group.
///
/// # Panics
///
/// Panics if any of the three indices is out of range, because plain `[]`
/// indexing is used at every level.
fn get_output_index(
|
||||
&self,
|
||||
field_index: usize,
|
||||
pattern_index: usize,
|
||||
group_index: usize,
|
||||
) -> usize {
|
||||
self.inner[field_index][pattern_index][group_index].index
|
||||
}
|
||||
}
|
||||
/// Regex processor: extracts named capture groups from string input values.
/// Only string values are supported.
|
||||
/// If a pattern captures nothing for a group, that group's target field is not written.
|
||||
#[derive(Debug, Default)]
|
||||
pub struct RegexProcessor {
|
||||
// Input fields to run the patterns against.
fields: Vec<OneInputMultiOutputField>,
|
||||
// Output slot for every (field, pattern, group) combination.
output_info: RegexProcessorOutputInfo,
|
||||
// Patterns to apply; each carries a regex and its group names.
patterns: Vec<GroupRegex>,
|
||||
// When false, a null or absent input field is reported as a "missing field" error.
ignore_missing: bool,
|
||||
}
|
||||
|
||||
impl RegexProcessor {
|
||||
/// Parses every string in `patterns` and replaces `self.patterns` with the result.
///
/// The first parse failure is returned as `Err`; in that case `self.patterns`
/// is left unchanged, because the assignment only happens after the loop.
fn try_with_patterns(&mut self, patterns: Vec<String>) -> Result<(), String> {
|
||||
let mut rs = vec![];
|
||||
for pattern in patterns {
|
||||
let gr = pattern.parse()?;
|
||||
rs.push(gr);
|
||||
}
|
||||
self.patterns = rs;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Runs one pattern against `val` and returns `(output index, value)` pairs.
///
/// `index` is `(field index, pattern index)`; together with each group's
/// position in `gr.groups` it selects the slot via `get_output_index`.
/// Groups that did not capture are skipped, and an input that does not
/// match the pattern at all yields an empty vector. This function has no
/// `Err` path of its own.
fn process(
|
||||
&self,
|
||||
val: &str,
|
||||
gr: &GroupRegex,
|
||||
index: (usize, usize),
|
||||
) -> Result<Vec<(usize, Value)>, String> {
|
||||
let mut result = Vec::new();
|
||||
if let Some(captures) = gr.regex.captures(val) {
|
||||
for (group_index, group) in gr.groups.iter().enumerate() {
|
||||
if let Some(capture) = captures.name(group) {
|
||||
let value = capture.as_str().to_string();
|
||||
// Shadows the `index` parameter with the resolved output index;
// the right-hand side still reads the original tuple.
let index = self
|
||||
.output_info
|
||||
.get_output_index(index.0, index.1, group_index);
|
||||
result.push((index, Value::String(value)));
|
||||
}
|
||||
}
|
||||
}
|
||||
Ok(result)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -200,71 +316,40 @@ impl Processor for RegexProcessor {
|
||||
self.ignore_missing
|
||||
}
|
||||
|
||||
fn fields(&self) -> &Fields {
|
||||
&self.fields
|
||||
}
|
||||
|
||||
fn fields_mut(&mut self) -> &mut Fields {
|
||||
&mut self.fields
|
||||
}
|
||||
|
||||
fn output_keys(&self) -> HashSet<String> {
|
||||
self.fields
|
||||
.iter()
|
||||
.flat_map(|f| {
|
||||
self.patterns.iter().flat_map(move |p| {
|
||||
p.groups
|
||||
.iter()
|
||||
.map(move |g| Self::generate_key(&f.input_field.name, g))
|
||||
})
|
||||
})
|
||||
.collect()
|
||||
}
|
||||
|
||||
fn exec_field(&self, val: &Value, field: &Field) -> Result<Map, String> {
|
||||
match val {
|
||||
Value::String(val) => {
|
||||
let mut map = Map::default();
|
||||
for gr in &self.patterns {
|
||||
let m = self.process_field(val, field, gr)?;
|
||||
map.extend(m);
|
||||
}
|
||||
Ok(map)
|
||||
}
|
||||
_ => Err(format!(
|
||||
"{} processor: expect string value, but got {val:?}",
|
||||
self.kind()
|
||||
)),
|
||||
}
|
||||
}
|
||||
|
||||
fn exec_mut(&self, val: &mut Vec<Value>) -> Result<(), String> {
|
||||
for field in self.fields.iter() {
|
||||
let index = field.input_field.index;
|
||||
for (field_index, field) in self.fields.iter().enumerate() {
|
||||
let index = field.input_index();
|
||||
let mut result_list = None;
|
||||
match val.get(index) {
|
||||
Some(Value::String(s)) => {
|
||||
let mut map = Map::default();
|
||||
for gr in &self.patterns {
|
||||
// TODO(qtang): Let this method use the intermediate state collection directly.
|
||||
let m = self.process_field(s, field, gr)?;
|
||||
map.extend(m);
|
||||
}
|
||||
|
||||
field
|
||||
.output_fields_index_mapping
|
||||
.iter()
|
||||
.for_each(|(k, output_index)| {
|
||||
if let Some(v) = map.remove(k) {
|
||||
val[*output_index] = v;
|
||||
// we get rust borrow checker error here
|
||||
// for (gr_index, gr) in self.patterns.iter().enumerate() {
|
||||
// let result_list = self.process(s.as_str(), gr, (field_index, gr_index))?;
|
||||
// for (output_index, result) in result_list {
|
||||
//cannot borrow `*val` as mutable because it is also borrowed as immutable mutable borrow occurs here
|
||||
// val[output_index] = result;
|
||||
// }
|
||||
// }
|
||||
for (gr_index, gr) in self.patterns.iter().enumerate() {
|
||||
let result = self.process(s.as_str(), gr, (field_index, gr_index))?;
|
||||
if !result.is_empty() {
|
||||
match result_list.as_mut() {
|
||||
None => {
|
||||
result_list = Some(result);
|
||||
}
|
||||
Some(result_list) => {
|
||||
result_list.extend(result);
|
||||
}
|
||||
}
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
Some(Value::Null) | None => {
|
||||
if !self.ignore_missing {
|
||||
return Err(format!(
|
||||
"{} processor: missing field: {}",
|
||||
self.kind(),
|
||||
field.get_field_name()
|
||||
field.input_name()
|
||||
));
|
||||
}
|
||||
}
|
||||
@@ -275,6 +360,15 @@ impl Processor for RegexProcessor {
|
||||
));
|
||||
}
|
||||
}
|
||||
// safety here
|
||||
match result_list {
|
||||
None => {}
|
||||
Some(result_list) => {
|
||||
for (output_index, result) in result_list {
|
||||
val[output_index] = result;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
@@ -282,37 +376,42 @@ impl Processor for RegexProcessor {
|
||||
}
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use ahash::{HashMap, HashMapExt};
|
||||
use itertools::Itertools;
|
||||
|
||||
use super::RegexProcessor;
|
||||
use crate::etl::field::Fields;
|
||||
use crate::etl::processor::Processor;
|
||||
use crate::etl::processor::regex::RegexProcessorBuilder;
|
||||
use crate::etl::value::{Map, Value};
|
||||
|
||||
#[test]
|
||||
fn test_simple_parse() {
|
||||
let mut processor = RegexProcessor::default();
|
||||
let pipeline_str = r#"fields: ["a"]
|
||||
patterns: ['(?<ar>\d)']
|
||||
ignore_missing: false"#;
|
||||
|
||||
let processor_yaml = yaml_rust::YamlLoader::load_from_str(pipeline_str)
|
||||
.unwrap()
|
||||
.pop()
|
||||
.unwrap();
|
||||
let processor_yaml_hash = processor_yaml.as_hash().unwrap();
|
||||
let builder = RegexProcessorBuilder::try_from(processor_yaml_hash).unwrap();
|
||||
let intermediate_keys = ["a".to_string(), "a_ar".to_string()];
|
||||
let processor = builder.build(&intermediate_keys).unwrap();
|
||||
|
||||
// single field (with prefix), multiple patterns
|
||||
let f = ["a"].iter().map(|f| f.parse().unwrap()).collect();
|
||||
processor.with_fields(Fields::new(f).unwrap());
|
||||
|
||||
let ar = "(?<ar>\\d)";
|
||||
let result = processor
|
||||
.process("123", &processor.patterns[0], (0, 0))
|
||||
.unwrap()
|
||||
.into_iter()
|
||||
.map(|(k, v)| (intermediate_keys[k].clone(), v))
|
||||
.collect();
|
||||
|
||||
let patterns = [ar].iter().map(|p| p.to_string()).collect();
|
||||
processor.try_with_patterns(patterns).unwrap();
|
||||
|
||||
let mut map = Map::default();
|
||||
map.insert("a", Value::String("123".to_string()));
|
||||
processor.exec_map(&mut map).unwrap();
|
||||
let map = Map { values: result };
|
||||
|
||||
let v = Map {
|
||||
values: vec![
|
||||
("a_ar".to_string(), Value::String("1".to_string())),
|
||||
("a".to_string(), Value::String("123".to_string())),
|
||||
]
|
||||
.into_iter()
|
||||
.collect(),
|
||||
values: vec![("a_ar".to_string(), Value::String("1".to_string()))]
|
||||
.into_iter()
|
||||
.collect(),
|
||||
};
|
||||
|
||||
assert_eq!(v, map);
|
||||
@@ -320,17 +419,14 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
fn test_process() {
|
||||
let mut processor = RegexProcessor::default();
|
||||
|
||||
let cc = "[c=c,n=US_CA_SANJOSE,o=55155]";
|
||||
let cg = "[a=12.34.567.89,b=12345678,c=g,n=US_CA_SANJOSE,o=20940]";
|
||||
let co = "[a=987.654.321.09,c=o]";
|
||||
let cp = "[c=p,n=US_CA_SANJOSE,o=55155]";
|
||||
let cw = "[c=w,n=US_CA_SANJOSE,o=55155]";
|
||||
let breadcrumbs = Value::String([cc, cg, co, cp, cw].iter().join(","));
|
||||
let breadcrumbs_str = [cc, cg, co, cp, cw].iter().join(",");
|
||||
|
||||
let values = [
|
||||
("breadcrumbs", breadcrumbs.clone()),
|
||||
("breadcrumbs_parent", Value::String(cc.to_string())),
|
||||
("breadcrumbs_edge", Value::String(cg.to_string())),
|
||||
("breadcrumbs_origin", Value::String(co.to_string())),
|
||||
@@ -340,61 +436,141 @@ mod tests {
|
||||
.into_iter()
|
||||
.map(|(k, v)| (k.to_string(), v))
|
||||
.collect();
|
||||
let mut temporary_map = Map { values };
|
||||
let temporary_map = Map { values };
|
||||
|
||||
{
|
||||
// single field (with prefix), multiple patterns
|
||||
let ff = ["breadcrumbs, breadcrumbs"]
|
||||
.iter()
|
||||
.map(|f| f.parse().unwrap())
|
||||
.collect();
|
||||
processor.with_fields(Fields::new(ff).unwrap());
|
||||
|
||||
let ccr = "(?<parent>\\[[^\\[]*c=c[^\\]]*\\])";
|
||||
let cgr = "(?<edge>\\[[^\\[]*c=g[^\\]]*\\])";
|
||||
let cor = "(?<origin>\\[[^\\[]*c=o[^\\]]*\\])";
|
||||
let cpr = "(?<peer>\\[[^\\[]*c=p[^\\]]*\\])";
|
||||
let cwr = "(?<wrapper>\\[[^\\[]*c=w[^\\]]*\\])";
|
||||
let patterns = [ccr, cgr, cor, cpr, cwr]
|
||||
.iter()
|
||||
.map(|p| p.to_string())
|
||||
.collect();
|
||||
processor.try_with_patterns(patterns).unwrap();
|
||||
let pipeline_str = r#"fields: ["breadcrumbs"]
|
||||
patterns:
|
||||
- '(?<parent>\[[^\[]*c=c[^\]]*\])'
|
||||
- '(?<edge>\[[^\[]*c=g[^\]]*\])'
|
||||
- '(?<origin>\[[^\[]*c=o[^\]]*\])'
|
||||
- '(?<peer>\[[^\[]*c=p[^\]]*\])'
|
||||
- '(?<wrapper>\[[^\[]*c=w[^\]]*\])'
|
||||
ignore_missing: false"#;
|
||||
|
||||
let mut map = Map::default();
|
||||
map.insert("breadcrumbs", breadcrumbs.clone());
|
||||
processor.exec_map(&mut map).unwrap();
|
||||
|
||||
assert_eq!(map, temporary_map);
|
||||
let processor_yaml = yaml_rust::YamlLoader::load_from_str(pipeline_str)
|
||||
.unwrap()
|
||||
.pop()
|
||||
.unwrap();
|
||||
let processor_yaml_hash = processor_yaml.as_hash().unwrap();
|
||||
let builder = RegexProcessorBuilder::try_from(processor_yaml_hash).unwrap();
|
||||
let intermediate_keys = [
|
||||
"breadcrumbs",
|
||||
"breadcrumbs_parent",
|
||||
"breadcrumbs_edge",
|
||||
"breadcrumbs_origin",
|
||||
"breadcrumbs_peer",
|
||||
"breadcrumbs_wrapper",
|
||||
]
|
||||
.iter()
|
||||
.map(|k| k.to_string())
|
||||
.collect_vec();
|
||||
let processor = builder.build(&intermediate_keys).unwrap();
|
||||
let mut result = HashMap::new();
|
||||
for (index, pattern) in processor.patterns.iter().enumerate() {
|
||||
let r = processor
|
||||
.process(&breadcrumbs_str, pattern, (0, index))
|
||||
.unwrap()
|
||||
.into_iter()
|
||||
.map(|(k, v)| (intermediate_keys[k].clone(), v))
|
||||
.collect::<HashMap<_, _>>();
|
||||
result.extend(r);
|
||||
}
|
||||
let map = Map { values: result };
|
||||
assert_eq!(temporary_map, map);
|
||||
}
|
||||
|
||||
{
|
||||
// multiple fields (with prefix), multiple patterns
|
||||
let ff = [
|
||||
"breadcrumbs_parent, parent",
|
||||
"breadcrumbs_edge, edge",
|
||||
"breadcrumbs_origin, origin",
|
||||
"breadcrumbs_peer, peer",
|
||||
"breadcrumbs_wrapper, wrapper",
|
||||
]
|
||||
.iter()
|
||||
.map(|f| f.parse().unwrap())
|
||||
.collect();
|
||||
processor.with_fields(Fields::new(ff).unwrap());
|
||||
|
||||
let patterns = [
|
||||
"a=(?<ip>[^,\\]]+)",
|
||||
"b=(?<request_id>[^,\\]]+)",
|
||||
"k=(?<request_end_time>[^,\\]]+)",
|
||||
"l=(?<turn_around_time>[^,\\]]+)",
|
||||
"m=(?<dns_lookup_time>[^,\\]]+)",
|
||||
"n=(?<geo>[^,\\]]+)",
|
||||
"o=(?<asn>[^,\\]]+)",
|
||||
let pipeline_str = r#"fields:
|
||||
- breadcrumbs_parent, parent
|
||||
- breadcrumbs_edge, edge
|
||||
- breadcrumbs_origin, origin
|
||||
- breadcrumbs_peer, peer
|
||||
- breadcrumbs_wrapper, wrapper
|
||||
patterns:
|
||||
- 'a=(?<ip>[^,\]]+)'
|
||||
- 'b=(?<request_id>[^,\]]+)'
|
||||
- 'k=(?<request_end_time>[^,\]]+)'
|
||||
- 'l=(?<turn_around_time>[^,\]]+)'
|
||||
- 'm=(?<dns_lookup_time>[^,\]]+)'
|
||||
- 'n=(?<geo>[^,\]]+)'
|
||||
- 'o=(?<asn>[^,\]]+)'
|
||||
ignore_missing: false"#;
|
||||
|
||||
let processor_yaml = yaml_rust::YamlLoader::load_from_str(pipeline_str)
|
||||
.unwrap()
|
||||
.pop()
|
||||
.unwrap();
|
||||
let processor_yaml_hash = processor_yaml.as_hash().unwrap();
|
||||
let builder = RegexProcessorBuilder::try_from(processor_yaml_hash).unwrap();
|
||||
|
||||
let intermediate_keys = [
|
||||
"breadcrumbs_parent",
|
||||
"breadcrumbs_edge",
|
||||
"breadcrumbs_origin",
|
||||
"breadcrumbs_peer",
|
||||
"breadcrumbs_wrapper",
|
||||
"edge_ip",
|
||||
"edge_request_id",
|
||||
"edge_request_end_time",
|
||||
"edge_turn_around_time",
|
||||
"edge_dns_lookup_time",
|
||||
"edge_geo",
|
||||
"edge_asn",
|
||||
"origin_ip",
|
||||
"origin_request_id",
|
||||
"origin_request_end_time",
|
||||
"origin_turn_around_time",
|
||||
"origin_dns_lookup_time",
|
||||
"origin_geo",
|
||||
"origin_asn",
|
||||
"peer_ip",
|
||||
"peer_request_id",
|
||||
"peer_request_end_time",
|
||||
"peer_turn_around_time",
|
||||
"peer_dns_lookup_time",
|
||||
"peer_geo",
|
||||
"peer_asn",
|
||||
"parent_ip",
|
||||
"parent_request_id",
|
||||
"parent_request_end_time",
|
||||
"parent_turn_around_time",
|
||||
"parent_dns_lookup_time",
|
||||
"parent_geo",
|
||||
"parent_asn",
|
||||
"wrapper_ip",
|
||||
"wrapper_request_id",
|
||||
"wrapper_request_end_time",
|
||||
"wrapper_turn_around_time",
|
||||
"wrapper_dns_lookup_time",
|
||||
"wrapper_geo",
|
||||
"wrapper_asn",
|
||||
]
|
||||
.iter()
|
||||
.map(|p| p.to_string())
|
||||
.collect();
|
||||
processor.try_with_patterns(patterns).unwrap();
|
||||
.map(|k| k.to_string())
|
||||
.collect_vec();
|
||||
let processor = builder.build(&intermediate_keys).unwrap();
|
||||
|
||||
let mut result = HashMap::new();
|
||||
for (field_index, field) in processor.fields.iter().enumerate() {
|
||||
for (pattern_index, pattern) in processor.patterns.iter().enumerate() {
|
||||
let s = temporary_map
|
||||
.get(field.input_name())
|
||||
.unwrap()
|
||||
.to_str_value();
|
||||
let r = processor
|
||||
.process(&s, pattern, (field_index, pattern_index))
|
||||
.unwrap()
|
||||
.into_iter()
|
||||
.map(|(k, v)| (intermediate_keys[k].clone(), v))
|
||||
.collect::<HashMap<_, _>>();
|
||||
result.extend(r);
|
||||
}
|
||||
}
|
||||
|
||||
let new_values = vec![
|
||||
("edge_ip", Value::String("12.34.567.89".to_string())),
|
||||
@@ -413,11 +589,7 @@ mod tests {
|
||||
.map(|(k, v)| (k.to_string(), v))
|
||||
.collect();
|
||||
|
||||
let mut expected_map = temporary_map.clone();
|
||||
processor.exec_map(&mut temporary_map).unwrap();
|
||||
expected_map.extend(Map { values: new_values });
|
||||
|
||||
assert_eq!(expected_map, temporary_map);
|
||||
assert_eq!(result, new_values);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -19,18 +19,17 @@ use chrono::{DateTime, NaiveDateTime};
|
||||
use chrono_tz::Tz;
|
||||
use lazy_static::lazy_static;
|
||||
|
||||
use super::yaml_strings;
|
||||
use crate::etl::field::{Field, Fields};
|
||||
use crate::etl::field::{Fields, OneInputOneOutputField};
|
||||
use crate::etl::processor::{
|
||||
update_one_one_output_keys, yaml_bool, yaml_field, yaml_fields, yaml_string, Processor,
|
||||
FIELDS_NAME, FIELD_NAME, IGNORE_MISSING_NAME,
|
||||
yaml_bool, yaml_new_field, yaml_new_fields, yaml_string, yaml_strings, Processor,
|
||||
ProcessorBuilder, ProcessorKind, FIELDS_NAME, FIELD_NAME, IGNORE_MISSING_NAME,
|
||||
};
|
||||
use crate::etl::value::time::{
|
||||
MICROSECOND_RESOLUTION, MICRO_RESOLUTION, MILLISECOND_RESOLUTION, MILLI_RESOLUTION,
|
||||
MS_RESOLUTION, NANOSECOND_RESOLUTION, NANO_RESOLUTION, NS_RESOLUTION, SECOND_RESOLUTION,
|
||||
SEC_RESOLUTION, S_RESOLUTION, US_RESOLUTION,
|
||||
};
|
||||
use crate::etl::value::{Map, Timestamp, Value};
|
||||
use crate::etl::value::{Timestamp, Value};
|
||||
|
||||
pub(crate) const PROCESSOR_TIMESTAMP: &str = "timestamp";
|
||||
const RESOLUTION_NAME: &str = "resolution";
|
||||
@@ -108,10 +107,56 @@ impl std::ops::Deref for Formats {
|
||||
}
|
||||
}
|
||||
|
||||
/// Configuration for a `TimestampProcessor` before its fields are bound to
/// the intermediate keys; `build` turns it into the processor.
#[derive(Debug)]
|
||||
pub struct TimestampProcessorBuilder {
|
||||
// Fields to process, as declared in the configuration.
fields: Fields,
|
||||
// Formats handed to the processor unchanged.
formats: Formats,
|
||||
// Resolution handed to the processor unchanged.
resolution: Resolution,
|
||||
// Handed to the processor unchanged.
ignore_missing: bool,
|
||||
}
|
||||
|
||||
impl ProcessorBuilder for TimestampProcessorBuilder {
|
||||
/// Keys this processor writes: each field's `target_or_input_field()`.
fn output_keys(&self) -> HashSet<&str> {
|
||||
self.fields
|
||||
.iter()
|
||||
.map(|f| f.target_or_input_field())
|
||||
.collect()
|
||||
}
|
||||
|
||||
/// Keys this processor reads: each field's `input_field()`.
fn input_keys(&self) -> HashSet<&str> {
|
||||
self.fields.iter().map(|f| f.input_field()).collect()
|
||||
}
|
||||
|
||||
/// Builds the processor and wraps it in `ProcessorKind::Timestamp`.
fn build(self, intermediate_keys: &[String]) -> Result<ProcessorKind, String> {
|
||||
// Resolves to the inherent `TimestampProcessorBuilder::build`: inherent
// methods take precedence over trait methods, so this does not recurse.
self.build(intermediate_keys).map(ProcessorKind::Timestamp)
|
||||
}
|
||||
}
|
||||
|
||||
impl TimestampProcessorBuilder {
|
||||
/// Consumes the builder and produces a `TimestampProcessor`.
///
/// Every configured field is converted with `OneInputOneOutputField::build`,
/// passing its input key and its target-or-input key together with
/// `intermediate_keys`. The first conversion error aborts the build.
/// Formats, resolution and `ignore_missing` are moved over unchanged.
pub fn build(self, intermediate_keys: &[String]) -> Result<TimestampProcessor, String> {
|
||||
let mut real_fields = vec![];
|
||||
for field in self.fields.into_iter() {
|
||||
let input = OneInputOneOutputField::build(
|
||||
// Processor name passed to the field builder
// (presumably used in its error messages; confirm in `OneInputOneOutputField::build`).
"timestamp",
|
||||
intermediate_keys,
|
||||
field.input_field(),
|
||||
field.target_or_input_field(),
|
||||
)?;
|
||||
real_fields.push(input);
|
||||
}
|
||||
Ok(TimestampProcessor {
|
||||
fields: real_fields,
|
||||
formats: self.formats,
|
||||
resolution: self.resolution,
|
||||
ignore_missing: self.ignore_missing,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
/// support string, integer, float, time, epoch
|
||||
#[derive(Debug, Default)]
|
||||
pub struct TimestampProcessor {
|
||||
fields: Fields,
|
||||
fields: Vec<OneInputOneOutputField>,
|
||||
formats: Formats,
|
||||
resolution: Resolution,
|
||||
ignore_missing: bool,
|
||||
@@ -123,29 +168,6 @@ pub struct TimestampProcessor {
|
||||
}
|
||||
|
||||
impl TimestampProcessor {
|
||||
fn with_fields(&mut self, mut fields: Fields) {
|
||||
update_one_one_output_keys(&mut fields);
|
||||
self.fields = fields
|
||||
}
|
||||
|
||||
fn with_resolution(&mut self, resolution: Resolution) {
|
||||
self.resolution = resolution;
|
||||
}
|
||||
|
||||
fn with_formats(&mut self, v: Option<Vec<(Arc<String>, Tz)>>) {
|
||||
let v = match v {
|
||||
Some(v) if !v.is_empty() => v,
|
||||
_ => DEFAULT_FORMATS.clone(),
|
||||
};
|
||||
|
||||
let formats = Formats::new(v);
|
||||
self.formats = formats;
|
||||
}
|
||||
|
||||
fn with_ignore_missing(&mut self, ignore_missing: bool) {
|
||||
self.ignore_missing = ignore_missing;
|
||||
}
|
||||
|
||||
/// try to parse val with timezone first, if failed, parse without timezone
|
||||
fn try_parse(val: &str, fmt: &str, tz: Tz) -> Result<i64, String> {
|
||||
if let Ok(dt) = DateTime::parse_from_str(val, fmt) {
|
||||
@@ -212,12 +234,6 @@ impl TimestampProcessor {
|
||||
Resolution::Nano => Ok(Timestamp::Nanosecond(t)),
|
||||
}
|
||||
}
|
||||
|
||||
fn process_field(&self, val: &Value, field: &Field) -> Result<Map, String> {
|
||||
let key = field.get_target_field();
|
||||
|
||||
Ok(Map::one(key, Value::Timestamp(self.parse(val)?)))
|
||||
}
|
||||
}
|
||||
|
||||
fn parse_formats(yaml: &yaml_rust::yaml::Yaml) -> Result<Vec<(Arc<String>, Tz)>, String> {
|
||||
@@ -250,11 +266,14 @@ fn parse_formats(yaml: &yaml_rust::yaml::Yaml) -> Result<Vec<(Arc<String>, Tz)>,
|
||||
};
|
||||
}
|
||||
|
||||
impl TryFrom<&yaml_rust::yaml::Hash> for TimestampProcessor {
|
||||
impl TryFrom<&yaml_rust::yaml::Hash> for TimestampProcessorBuilder {
|
||||
type Error = String;
|
||||
|
||||
fn try_from(hash: &yaml_rust::yaml::Hash) -> Result<Self, Self::Error> {
|
||||
let mut processor = TimestampProcessor::default();
|
||||
let mut fields = Fields::default();
|
||||
let mut formats = Formats::default();
|
||||
let mut resolution = Resolution::default();
|
||||
let mut ignore_missing = false;
|
||||
|
||||
for (k, v) in hash {
|
||||
let key = k
|
||||
@@ -263,28 +282,33 @@ impl TryFrom<&yaml_rust::yaml::Hash> for TimestampProcessor {
|
||||
|
||||
match key {
|
||||
FIELD_NAME => {
|
||||
processor.with_fields(Fields::one(yaml_field(v, FIELD_NAME)?));
|
||||
fields = Fields::one(yaml_new_field(v, FIELD_NAME)?);
|
||||
}
|
||||
FIELDS_NAME => {
|
||||
processor.with_fields(yaml_fields(v, FIELDS_NAME)?);
|
||||
fields = yaml_new_fields(v, FIELDS_NAME)?;
|
||||
}
|
||||
FORMATS_NAME => {
|
||||
let formats = parse_formats(v)?;
|
||||
processor.with_formats(Some(formats));
|
||||
let formats_vec = parse_formats(v)?;
|
||||
formats = Formats::new(formats_vec);
|
||||
}
|
||||
RESOLUTION_NAME => {
|
||||
let s = yaml_string(v, RESOLUTION_NAME)?.as_str().try_into()?;
|
||||
processor.with_resolution(s);
|
||||
resolution = yaml_string(v, RESOLUTION_NAME)?.as_str().try_into()?;
|
||||
}
|
||||
IGNORE_MISSING_NAME => {
|
||||
processor.with_ignore_missing(yaml_bool(v, IGNORE_MISSING_NAME)?);
|
||||
ignore_missing = yaml_bool(v, IGNORE_MISSING_NAME)?;
|
||||
}
|
||||
|
||||
_ => {}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(processor)
|
||||
let processor_builder = TimestampProcessorBuilder {
|
||||
fields,
|
||||
formats,
|
||||
resolution,
|
||||
ignore_missing,
|
||||
};
|
||||
|
||||
Ok(processor_builder)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -297,49 +321,23 @@ impl Processor for TimestampProcessor {
|
||||
self.ignore_missing
|
||||
}
|
||||
|
||||
fn fields(&self) -> &Fields {
|
||||
&self.fields
|
||||
}
|
||||
|
||||
fn fields_mut(&mut self) -> &mut Fields {
|
||||
&mut self.fields
|
||||
}
|
||||
|
||||
fn output_keys(&self) -> HashSet<String> {
|
||||
self.fields
|
||||
.iter()
|
||||
.map(|f| f.get_target_field().to_string())
|
||||
.collect()
|
||||
}
|
||||
|
||||
fn exec_field(&self, val: &Value, field: &Field) -> Result<Map, String> {
|
||||
self.process_field(val, field)
|
||||
}
|
||||
|
||||
fn exec_mut(&self, val: &mut Vec<Value>) -> Result<(), String> {
|
||||
for field in self.fields.iter() {
|
||||
let index = field.input_field.index;
|
||||
let index = field.input().index;
|
||||
match val.get(index) {
|
||||
Some(Value::Null) | None => {
|
||||
if !self.ignore_missing {
|
||||
return Err(format!(
|
||||
"{} processor: missing field: {}",
|
||||
self.kind(),
|
||||
field.get_field_name()
|
||||
&field.input().name
|
||||
));
|
||||
}
|
||||
}
|
||||
Some(v) => {
|
||||
// TODO(qtang): Let this method use the intermediate state collection directly.
|
||||
let mut map = self.process_field(v, field)?;
|
||||
field
|
||||
.output_fields_index_mapping
|
||||
.iter()
|
||||
.for_each(|(k, output_index)| {
|
||||
if let Some(v) = map.remove(k) {
|
||||
val[*output_index] = v;
|
||||
}
|
||||
});
|
||||
let result = self.parse(v)?;
|
||||
let (_, index) = field.output();
|
||||
val[*index] = Value::Timestamp(result);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -351,9 +349,18 @@ impl Processor for TimestampProcessor {
|
||||
mod tests {
|
||||
use yaml_rust::YamlLoader;
|
||||
|
||||
use super::TimestampProcessor;
|
||||
use super::{TimestampProcessor, TimestampProcessorBuilder};
|
||||
use crate::etl::value::{Timestamp, Value};
|
||||
|
||||
/// Test helper: turns a builder into a `TimestampProcessor` without going
/// through `build`, so no intermediate keys are needed.
///
/// The processor gets an empty field list and copies the builder's formats,
/// resolution and `ignore_missing`.
fn builder_to_native_processor(builder: TimestampProcessorBuilder) -> TimestampProcessor {
|
||||
TimestampProcessor {
|
||||
fields: vec![],
|
||||
formats: builder.formats,
|
||||
resolution: builder.resolution,
|
||||
ignore_missing: builder.ignore_missing,
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_parse_epoch() {
|
||||
let processor_yaml_str = r#"fields:
|
||||
@@ -367,7 +374,9 @@ formats:
|
||||
"#;
|
||||
let yaml = &YamlLoader::load_from_str(processor_yaml_str).unwrap()[0];
|
||||
let timestamp_yaml = yaml.as_hash().unwrap();
|
||||
let processor = TimestampProcessor::try_from(timestamp_yaml).unwrap();
|
||||
let processor = builder_to_native_processor(
|
||||
TimestampProcessorBuilder::try_from(timestamp_yaml).unwrap(),
|
||||
);
|
||||
|
||||
let values = [
|
||||
(
|
||||
@@ -419,7 +428,9 @@ formats:
|
||||
"#;
|
||||
let yaml = &YamlLoader::load_from_str(processor_yaml_str).unwrap()[0];
|
||||
let timestamp_yaml = yaml.as_hash().unwrap();
|
||||
let processor = TimestampProcessor::try_from(timestamp_yaml).unwrap();
|
||||
let processor = builder_to_native_processor(
|
||||
TimestampProcessorBuilder::try_from(timestamp_yaml).unwrap(),
|
||||
);
|
||||
|
||||
let values: Vec<&str> = vec![
|
||||
"2014-5-17T12:34:56",
|
||||
|
||||
@@ -15,12 +15,12 @@
|
||||
use ahash::HashSet;
|
||||
use urlencoding::{decode, encode};
|
||||
|
||||
use crate::etl::field::{Field, Fields};
|
||||
use crate::etl::field::{Fields, OneInputOneOutputField};
|
||||
use crate::etl::processor::{
|
||||
yaml_bool, yaml_field, yaml_fields, yaml_string, FIELDS_NAME, FIELD_NAME, IGNORE_MISSING_NAME,
|
||||
METHOD_NAME,
|
||||
yaml_bool, yaml_new_field, yaml_new_fields, yaml_string, ProcessorBuilder, ProcessorKind,
|
||||
FIELDS_NAME, FIELD_NAME, IGNORE_MISSING_NAME, METHOD_NAME,
|
||||
};
|
||||
use crate::etl::value::{Map, Value};
|
||||
use crate::etl::value::Value;
|
||||
|
||||
pub(crate) const PROCESSOR_URL_ENCODING: &str = "urlencoding";
|
||||
|
||||
@@ -52,54 +52,76 @@ impl std::str::FromStr for Method {
|
||||
}
|
||||
}
|
||||
|
||||
/// only support string value
|
||||
#[derive(Debug, Default)]
|
||||
pub struct UrlEncodingProcessor {
|
||||
pub struct UrlEncodingProcessorBuilder {
|
||||
fields: Fields,
|
||||
method: Method,
|
||||
ignore_missing: bool,
|
||||
}
|
||||
|
||||
impl ProcessorBuilder for UrlEncodingProcessorBuilder {
|
||||
/// Keys this processor writes: each field's `target_or_input_field()`.
fn output_keys(&self) -> HashSet<&str> {
|
||||
self.fields
|
||||
.iter()
|
||||
.map(|f| f.target_or_input_field())
|
||||
.collect()
|
||||
}
|
||||
|
||||
/// Keys this processor reads: each field's `input_field()`.
fn input_keys(&self) -> HashSet<&str> {
|
||||
self.fields.iter().map(|f| f.input_field()).collect()
|
||||
}
|
||||
|
||||
/// Builds the processor and wraps it in `ProcessorKind::UrlEncoding`.
fn build(self, intermediate_keys: &[String]) -> Result<ProcessorKind, String> {
|
||||
// Resolves to the inherent `UrlEncodingProcessorBuilder::build`: inherent
// methods take precedence over trait methods, so this does not recurse.
self.build(intermediate_keys)
|
||||
.map(ProcessorKind::UrlEncoding)
|
||||
}
|
||||
}
|
||||
|
||||
impl UrlEncodingProcessorBuilder {
|
||||
/// Consumes the builder and produces a `UrlEncodingProcessor`.
///
/// Every configured field is converted with `OneInputOneOutputField::build`,
/// passing its input key and its target-or-input key together with
/// `intermediate_keys`. The first conversion error aborts the build.
/// `method` and `ignore_missing` are moved over unchanged.
fn build(self, intermediate_keys: &[String]) -> Result<UrlEncodingProcessor, String> {
|
||||
let mut real_fields = vec![];
|
||||
for field in self.fields.into_iter() {
|
||||
let input = OneInputOneOutputField::build(
|
||||
// Processor name passed to the field builder
// (presumably used in its error messages; confirm in `OneInputOneOutputField::build`).
"urlencoding",
|
||||
intermediate_keys,
|
||||
field.input_field(),
|
||||
field.target_or_input_field(),
|
||||
)?;
|
||||
real_fields.push(input);
|
||||
}
|
||||
Ok(UrlEncodingProcessor {
|
||||
fields: real_fields,
|
||||
method: self.method,
|
||||
ignore_missing: self.ignore_missing,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
/// URL-encoding processor: encodes or decodes string values.
/// Only string values are supported.
|
||||
#[derive(Debug, Default)]
|
||||
pub struct UrlEncodingProcessor {
|
||||
// Fields to process.
fields: Vec<OneInputOneOutputField>,
|
||||
// Selects between `Method::Encode` and `Method::Decode`.
method: Method,
|
||||
// When false, a null or absent input field is reported as a "missing field" error.
ignore_missing: bool,
|
||||
}
|
||||
|
||||
impl UrlEncodingProcessor {
|
||||
fn with_fields(&mut self, mut fields: Fields) {
|
||||
Self::update_output_keys(&mut fields);
|
||||
self.fields = fields;
|
||||
}
|
||||
|
||||
fn with_ignore_missing(&mut self, ignore_missing: bool) {
|
||||
self.ignore_missing = ignore_missing;
|
||||
}
|
||||
|
||||
fn with_method(&mut self, method: Method) {
|
||||
self.method = method;
|
||||
}
|
||||
|
||||
fn process_field(&self, val: &str, field: &Field) -> Result<Map, String> {
|
||||
fn process_field(&self, val: &str) -> Result<Value, String> {
|
||||
let processed = match self.method {
|
||||
Method::Encode => encode(val).to_string(),
|
||||
Method::Decode => decode(val).map_err(|e| e.to_string())?.into_owned(),
|
||||
};
|
||||
let val = Value::String(processed);
|
||||
|
||||
let key = field.get_target_field();
|
||||
|
||||
Ok(Map::one(key, val))
|
||||
}
|
||||
|
||||
fn update_output_keys(fields: &mut Fields) {
|
||||
for field in fields.iter_mut() {
|
||||
field
|
||||
.output_fields_index_mapping
|
||||
.insert(field.get_target_field().to_string(), 0_usize);
|
||||
}
|
||||
Ok(Value::String(processed))
|
||||
}
|
||||
}
|
||||
|
||||
impl TryFrom<&yaml_rust::yaml::Hash> for UrlEncodingProcessor {
|
||||
impl TryFrom<&yaml_rust::yaml::Hash> for UrlEncodingProcessorBuilder {
|
||||
type Error = String;
|
||||
|
||||
fn try_from(value: &yaml_rust::yaml::Hash) -> Result<Self, Self::Error> {
|
||||
let mut processor = UrlEncodingProcessor::default();
|
||||
let mut fields = Fields::default();
|
||||
let mut method = Method::Decode;
|
||||
let mut ignore_missing = false;
|
||||
|
||||
for (k, v) in value.iter() {
|
||||
let key = k
|
||||
@@ -107,24 +129,29 @@ impl TryFrom<&yaml_rust::yaml::Hash> for UrlEncodingProcessor {
|
||||
.ok_or(format!("key must be a string, but got {k:?}"))?;
|
||||
match key {
|
||||
FIELD_NAME => {
|
||||
processor.with_fields(Fields::one(yaml_field(v, FIELD_NAME)?));
|
||||
fields = Fields::one(yaml_new_field(v, FIELD_NAME)?);
|
||||
}
|
||||
FIELDS_NAME => {
|
||||
processor.with_fields(yaml_fields(v, FIELDS_NAME)?);
|
||||
fields = yaml_new_fields(v, FIELDS_NAME)?;
|
||||
}
|
||||
|
||||
IGNORE_MISSING_NAME => {
|
||||
processor.with_ignore_missing(yaml_bool(v, IGNORE_MISSING_NAME)?);
|
||||
ignore_missing = yaml_bool(v, IGNORE_MISSING_NAME)?;
|
||||
}
|
||||
|
||||
METHOD_NAME => {
|
||||
let method = yaml_string(v, METHOD_NAME)?;
|
||||
processor.with_method(method.parse()?);
|
||||
let method_str = yaml_string(v, METHOD_NAME)?;
|
||||
method = method_str.parse()?;
|
||||
}
|
||||
|
||||
_ => {}
|
||||
}
|
||||
}
|
||||
let processor = UrlEncodingProcessorBuilder {
|
||||
fields,
|
||||
method,
|
||||
ignore_missing,
|
||||
};
|
||||
|
||||
Ok(processor)
|
||||
}
|
||||
@@ -139,52 +166,21 @@ impl crate::etl::processor::Processor for UrlEncodingProcessor {
|
||||
self.ignore_missing
|
||||
}
|
||||
|
||||
/// Returns a shared reference to the `fields` stored on this processor.
fn fields(&self) -> &Fields {
|
||||
&self.fields
|
||||
}
|
||||
|
||||
/// Returns a mutable reference to the `fields` stored on this processor.
fn fields_mut(&mut self) -> &mut Fields {
|
||||
&mut self.fields
|
||||
}
|
||||
|
||||
/// Builds the set of keys this processor writes to.
///
/// Each configured field contributes the string form of its
/// `get_target_field()`; duplicates collapse because the result is a set.
fn output_keys(&self) -> HashSet<String> {
|
||||
self.fields
|
||||
.iter()
|
||||
.map(|f| f.get_target_field().to_string())
|
||||
.collect()
|
||||
}
|
||||
|
||||
/// Processes one value for `field`.
///
/// A `Value::String` is handed to `process_field` together with `field`.
/// Every other variant is rejected.
///
/// # Errors
///
/// Returns an error naming the processor kind and the offending value when
/// `val` is not a string, or whatever error `process_field` returns.
fn exec_field(&self, val: &Value, field: &Field) -> Result<Map, String> {
|
||||
match val {
|
||||
Value::String(val) => self.process_field(val, field),
|
||||
_ => Err(format!(
|
||||
"{} processor: expect string value, but got {val:?}",
|
||||
self.kind()
|
||||
)),
|
||||
}
|
||||
}
|
||||
|
||||
fn exec_mut(&self, val: &mut Vec<Value>) -> Result<(), String> {
|
||||
for field in self.fields.iter() {
|
||||
let index = field.input_field.index;
|
||||
let index = field.input_index();
|
||||
match val.get(index) {
|
||||
Some(Value::String(s)) => {
|
||||
let mut map = self.process_field(s, field)?;
|
||||
field
|
||||
.output_fields_index_mapping
|
||||
.iter()
|
||||
.for_each(|(k, output_index)| {
|
||||
if let Some(v) = map.remove(k) {
|
||||
val[*output_index] = v;
|
||||
}
|
||||
});
|
||||
let result = self.process_field(s)?;
|
||||
let output_index = field.output_index();
|
||||
val[output_index] = result;
|
||||
}
|
||||
Some(Value::Null) | None => {
|
||||
if !self.ignore_missing {
|
||||
return Err(format!(
|
||||
"{} processor: missing field: {}",
|
||||
self.kind(),
|
||||
field.get_field_name()
|
||||
field.output_name()
|
||||
));
|
||||
}
|
||||
}
|
||||
@@ -202,29 +198,28 @@ impl crate::etl::processor::Processor for UrlEncodingProcessor {
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use crate::etl::field::{Field, Fields};
|
||||
|
||||
use crate::etl::processor::urlencoding::UrlEncodingProcessor;
|
||||
use crate::etl::value::{Map, Value};
|
||||
use crate::etl::value::Value;
|
||||
|
||||
#[test]
|
||||
fn test_decode_url() {
|
||||
let field = "url";
|
||||
let ff: Field = field.parse().unwrap();
|
||||
|
||||
let decoded = "//BC/[a=6.7.8.9,c=g,k=0,l=1]";
|
||||
let encoded = "%2F%2FBC%2F%5Ba%3D6.7.8.9%2Cc%3Dg%2Ck%3D0%2Cl%3D1%5D";
|
||||
|
||||
let mut processor = UrlEncodingProcessor::default();
|
||||
processor.with_fields(Fields::one(ff.clone()));
|
||||
|
||||
{
|
||||
let result = processor.process_field(encoded, &ff).unwrap();
|
||||
assert_eq!(Map::one(field, Value::String(decoded.into())), result)
|
||||
let processor = UrlEncodingProcessor::default();
|
||||
let result = processor.process_field(encoded).unwrap();
|
||||
assert_eq!(Value::String(decoded.into()), result)
|
||||
}
|
||||
{
|
||||
processor.with_method(super::Method::Encode);
|
||||
let result = processor.process_field(decoded, &ff).unwrap();
|
||||
assert_eq!(Map::one(field, Value::String(encoded.into())), result)
|
||||
let processor = UrlEncodingProcessor {
|
||||
fields: vec![],
|
||||
method: super::Method::Encode,
|
||||
ignore_missing: false,
|
||||
};
|
||||
let result = processor.process_field(decoded).unwrap();
|
||||
assert_eq!(Value::String(encoded.into()), result)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -17,8 +17,8 @@ pub mod transformer;
|
||||
|
||||
use itertools::Itertools;
|
||||
|
||||
use crate::etl::field::Fields;
|
||||
use crate::etl::processor::{update_one_one_output_keys, yaml_field, yaml_fields, yaml_string};
|
||||
use crate::etl::find_key_index;
|
||||
use crate::etl::processor::yaml_string;
|
||||
use crate::etl::transform::index::Index;
|
||||
use crate::etl::value::Value;
|
||||
|
||||
@@ -31,6 +31,9 @@ const TRANSFORM_ON_FAILURE: &str = "on_failure";
|
||||
|
||||
pub use transformer::greptime::GreptimeTransformer;
|
||||
|
||||
use super::field::{Fields, InputFieldInfo, OneInputOneOutputField};
|
||||
use super::processor::{yaml_new_field, yaml_new_fields};
|
||||
|
||||
pub trait Transformer: std::fmt::Display + Sized + Send + Sync + 'static {
|
||||
type Output;
|
||||
type VecOutput;
|
||||
@@ -39,12 +42,11 @@ pub trait Transformer: std::fmt::Display + Sized + Send + Sync + 'static {
|
||||
fn schemas(&self) -> &Vec<greptime_proto::v1::ColumnSchema>;
|
||||
fn transforms(&self) -> &Transforms;
|
||||
fn transforms_mut(&mut self) -> &mut Transforms;
|
||||
fn transform(&self, val: Value) -> Result<Self::Output, String>;
|
||||
fn transform_mut(&self, val: &mut Vec<Value>) -> Result<Self::VecOutput, String>;
|
||||
}
|
||||
|
||||
/// On Failure behavior when transform fails
|
||||
#[derive(Debug, Clone, Default)]
|
||||
#[derive(Debug, Clone, Default, Copy)]
|
||||
pub enum OnFailure {
|
||||
// Return None if transform fails
|
||||
#[default]
|
||||
@@ -74,12 +76,18 @@ impl std::fmt::Display for OnFailure {
|
||||
}
|
||||
}
|
||||
}
|
||||
/// The not-yet-built transforms of a pipeline plus the key lists gathered for them.
///
/// The `TryFrom<&Vec<yaml_rust::Yaml>>` impl in this file fills all three fields
/// while walking the YAML `transform` entries.
#[derive(Debug, Default, Clone)]
|
||||
pub struct TransformBuilders {
|
||||
/// One builder per YAML transform entry, in document order.
pub(crate) builders: Vec<TransformBuilder>,
|
||||
/// `target_or_input_field()` of every field of every builder, appended in order.
pub(crate) output_keys: Vec<String>,
|
||||
/// `input_field()` of every field of every builder; sorted by the YAML conversion.
pub(crate) required_keys: Vec<String>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Default, Clone)]
|
||||
pub struct Transforms {
|
||||
transforms: Vec<Transform>,
|
||||
output_keys: Vec<String>,
|
||||
required_keys: Vec<String>,
|
||||
pub(crate) transforms: Vec<Transform>,
|
||||
pub(crate) output_keys: Vec<String>,
|
||||
pub(crate) required_keys: Vec<String>,
|
||||
}
|
||||
|
||||
impl Transforms {
|
||||
@@ -130,7 +138,7 @@ impl std::ops::DerefMut for Transforms {
|
||||
}
|
||||
}
|
||||
|
||||
impl TryFrom<&Vec<yaml_rust::Yaml>> for Transforms {
|
||||
impl TryFrom<&Vec<yaml_rust::Yaml>> for TransformBuilders {
|
||||
type Error = String;
|
||||
|
||||
fn try_from(docs: &Vec<yaml_rust::Yaml>) -> Result<Self, Self::Error> {
|
||||
@@ -138,41 +146,78 @@ impl TryFrom<&Vec<yaml_rust::Yaml>> for Transforms {
|
||||
let mut all_output_keys: Vec<String> = Vec::with_capacity(100);
|
||||
let mut all_required_keys = Vec::with_capacity(100);
|
||||
for doc in docs {
|
||||
let transform: Transform = doc
|
||||
let transform_builder: TransformBuilder = doc
|
||||
.as_hash()
|
||||
.ok_or("transform element must be a map".to_string())?
|
||||
.try_into()?;
|
||||
let mut transform_output_keys = transform
|
||||
let mut transform_output_keys = transform_builder
|
||||
.fields
|
||||
.iter()
|
||||
.map(|f| f.get_target_field().to_string())
|
||||
.map(|f| f.target_or_input_field().to_string())
|
||||
.collect();
|
||||
all_output_keys.append(&mut transform_output_keys);
|
||||
|
||||
let mut transform_required_keys = transform
|
||||
let mut transform_required_keys = transform_builder
|
||||
.fields
|
||||
.iter()
|
||||
.map(|f| f.input_field.name.clone())
|
||||
.map(|f| f.input_field().to_string())
|
||||
.collect();
|
||||
all_required_keys.append(&mut transform_required_keys);
|
||||
|
||||
transforms.push(transform);
|
||||
transforms.push(transform_builder);
|
||||
}
|
||||
|
||||
all_required_keys.sort();
|
||||
|
||||
Ok(Transforms {
|
||||
transforms,
|
||||
Ok(TransformBuilders {
|
||||
builders: transforms,
|
||||
output_keys: all_output_keys,
|
||||
required_keys: all_required_keys,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
/// A single transform entry whose fields are still identified by name only.
///
/// `build` turns it into a `Transform` by resolving each field name to its
/// position in the key lists supplied by the caller.
#[derive(Debug, Clone)]
|
||||
pub struct TransformBuilder {
|
||||
/// Fields this transform applies to (names only, no indices yet).
fields: Fields,
|
||||
/// Target type; passed through unchanged to `Transform::type_`.
type_: Value,
|
||||
/// Optional default value; passed through unchanged to `Transform::default`.
default: Option<Value>,
|
||||
/// Optional index setting; passed through unchanged to `Transform::index`.
index: Option<Index>,
|
||||
/// Optional failure policy; passed through unchanged to `Transform::on_failure`.
on_failure: Option<OnFailure>,
|
||||
}
|
||||
|
||||
impl TransformBuilder {
|
||||
/// Consumes the builder and produces a [`Transform`] whose fields carry resolved indices.
///
/// For every configured field, `find_key_index` is called twice:
/// once with `intermediate_keys` and the field's input name, and once with
/// `output_keys` and the field's target-or-input name. The two results become
/// the input and output index of one `OneInputOneOutputField`.
/// `type_`, `default`, `index` and `on_failure` are moved into the result as-is.
///
/// # Errors
///
/// Returns the `String` error of the first `find_key_index` call that fails.
/// NOTE(review): `find_key_index` lives in `crate::etl` and is not visible here;
/// presumably it fails when the key is missing from the slice — confirm.
pub fn build(
|
||||
self,
|
||||
intermediate_keys: &[String],
|
||||
output_keys: &[String],
|
||||
) -> Result<Transform, String> {
|
||||
let mut real_fields = vec![];
|
||||
for field in self.fields {
|
||||
// index obtained by looking the input name up in `intermediate_keys`
let input_index = find_key_index(intermediate_keys, field.input_field(), "transform")?;
|
||||
let input_field_info = InputFieldInfo::new(field.input_field(), input_index);
|
||||
// index obtained by looking the output name up in `output_keys`
let output_index =
|
||||
find_key_index(output_keys, field.target_or_input_field(), "transform")?;
|
||||
let input = OneInputOneOutputField::new(
|
||||
input_field_info,
|
||||
(field.target_or_input_field().to_string(), output_index),
|
||||
);
|
||||
real_fields.push(input);
|
||||
}
|
||||
Ok(Transform {
|
||||
real_fields,
|
||||
type_: self.type_,
|
||||
default: self.default,
|
||||
index: self.index,
|
||||
on_failure: self.on_failure,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
/// only field is required
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct Transform {
|
||||
pub fields: Fields,
|
||||
pub real_fields: Vec<OneInputOneOutputField>,
|
||||
|
||||
pub type_: Value,
|
||||
|
||||
@@ -192,7 +237,7 @@ impl std::fmt::Display for Transform {
|
||||
};
|
||||
|
||||
let type_ = format!("type: {}", self.type_);
|
||||
let fields = format!("field(s): {}", self.fields);
|
||||
let fields = format!("field(s): {:?}", self.real_fields);
|
||||
let default = if let Some(default) = &self.default {
|
||||
format!(", default: {}", default)
|
||||
} else {
|
||||
@@ -212,7 +257,7 @@ impl std::fmt::Display for Transform {
|
||||
impl Default for Transform {
|
||||
fn default() -> Self {
|
||||
Transform {
|
||||
fields: Fields::default(),
|
||||
real_fields: Vec::new(),
|
||||
type_: Value::Null,
|
||||
default: None,
|
||||
index: None,
|
||||
@@ -222,40 +267,6 @@ impl Default for Transform {
|
||||
}
|
||||
|
||||
impl Transform {
|
||||
/// Stores `fields` on the transform after running them through
/// `update_one_one_output_keys`.
fn with_fields(&mut self, mut fields: Fields) {
|
||||
update_one_one_output_keys(&mut fields);
|
||||
self.fields = fields;
|
||||
}
|
||||
|
||||
/// Replaces the transform's target type with `type_`.
fn with_type(&mut self, type_: Value) {
|
||||
self.type_ = type_;
|
||||
}
|
||||
|
||||
/// Validates `default` against the transform's type and stores it.
///
/// * If `type_` is still `Value::Null`, the call fails: the type must be set first.
/// * If `default` is `Value::Null`, nothing is stored and the call succeeds.
/// * Otherwise `default` is rendered with `to_str_value()` and re-parsed through
///   `type_.parse_str_value(..)`; the parsed value becomes `self.default`.
///
/// # Errors
///
/// Returns an error when the type is unset or when `parse_str_value` fails.
fn try_default(&mut self, default: Value) -> Result<(), String> {
|
||||
match (&self.type_, &default) {
|
||||
(Value::Null, _) => Err(format!(
|
||||
"transform {} type MUST BE set before default {}",
|
||||
self.fields, &default,
|
||||
)),
|
||||
(_, Value::Null) => Ok(()), // if default is not set, then it will be regarded as default null
|
||||
(_, _) => {
|
||||
let target = self
|
||||
.type_
|
||||
.parse_str_value(default.to_str_value().as_str())?;
|
||||
self.default = Some(target);
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Sets the transform's index setting to `Some(index)`.
fn with_index(&mut self, index: Index) {
|
||||
self.index = Some(index);
|
||||
}
|
||||
|
||||
/// Sets the transform's failure policy to `Some(on_failure)`.
fn with_on_failure(&mut self, on_failure: OnFailure) {
|
||||
self.on_failure = Some(on_failure);
|
||||
}
|
||||
|
||||
/// Borrows the configured default value, or `None` when no default is set.
pub(crate) fn get_default(&self) -> Option<&Value> {
|
||||
self.default.as_ref()
|
||||
}
|
||||
@@ -265,52 +276,74 @@ impl Transform {
|
||||
}
|
||||
}
|
||||
|
||||
impl TryFrom<&yaml_rust::yaml::Hash> for Transform {
|
||||
impl TryFrom<&yaml_rust::yaml::Hash> for TransformBuilder {
|
||||
type Error = String;
|
||||
|
||||
fn try_from(hash: &yaml_rust::yaml::Hash) -> Result<Self, Self::Error> {
|
||||
let mut transform = Transform::default();
|
||||
|
||||
let mut default_opt = None;
|
||||
let mut fields = Fields::default();
|
||||
let mut type_ = Value::Null;
|
||||
let mut default = None;
|
||||
let mut index = None;
|
||||
let mut on_failure = None;
|
||||
|
||||
for (k, v) in hash {
|
||||
let key = k.as_str().ok_or("key must be a string")?;
|
||||
match key {
|
||||
TRANSFORM_FIELD => {
|
||||
transform.with_fields(Fields::one(yaml_field(v, TRANSFORM_FIELD)?));
|
||||
fields = Fields::one(yaml_new_field(v, TRANSFORM_FIELD)?);
|
||||
}
|
||||
|
||||
TRANSFORM_FIELDS => {
|
||||
transform.with_fields(yaml_fields(v, TRANSFORM_FIELDS)?);
|
||||
fields = yaml_new_fields(v, TRANSFORM_FIELDS)?;
|
||||
}
|
||||
|
||||
TRANSFORM_TYPE => {
|
||||
let t = yaml_string(v, TRANSFORM_TYPE)?;
|
||||
transform.with_type(Value::parse_str_type(&t)?);
|
||||
type_ = Value::parse_str_type(&t)?;
|
||||
}
|
||||
|
||||
TRANSFORM_INDEX => {
|
||||
let index = yaml_string(v, TRANSFORM_INDEX)?;
|
||||
transform.with_index(index.try_into()?);
|
||||
let index_str = yaml_string(v, TRANSFORM_INDEX)?;
|
||||
index = Some(index_str.try_into()?);
|
||||
}
|
||||
|
||||
TRANSFORM_DEFAULT => {
|
||||
default_opt = Some(Value::try_from(v)?);
|
||||
default = Some(Value::try_from(v)?);
|
||||
}
|
||||
|
||||
TRANSFORM_ON_FAILURE => {
|
||||
let on_failure = yaml_string(v, TRANSFORM_ON_FAILURE)?;
|
||||
transform.with_on_failure(on_failure.parse()?);
|
||||
let on_failure_str = yaml_string(v, TRANSFORM_ON_FAILURE)?;
|
||||
on_failure = Some(on_failure_str.parse()?);
|
||||
}
|
||||
|
||||
_ => {}
|
||||
}
|
||||
}
|
||||
let mut final_default = None;
|
||||
|
||||
if let Some(default) = default_opt {
|
||||
transform.try_default(default)?;
|
||||
if let Some(default_value) = default {
|
||||
match (&type_, &default_value) {
|
||||
(Value::Null, _) => {
|
||||
return Err(format!(
|
||||
"transform {:?} type MUST BE set before default {}",
|
||||
fields, &default_value,
|
||||
));
|
||||
}
|
||||
(_, Value::Null) => {} // if default is not set, then it will be regarded as default null
|
||||
(_, _) => {
|
||||
let target = type_.parse_str_value(default_value.to_str_value().as_str())?;
|
||||
final_default = Some(target);
|
||||
}
|
||||
}
|
||||
}
|
||||
let builder = TransformBuilder {
|
||||
fields,
|
||||
type_,
|
||||
default: final_default,
|
||||
index,
|
||||
on_failure,
|
||||
};
|
||||
|
||||
Ok(transform)
|
||||
Ok(builder)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -20,10 +20,10 @@ use coerce::{coerce_columns, coerce_value};
|
||||
use greptime_proto::v1::{ColumnSchema, Row, Rows, Value as GreptimeValue};
|
||||
use itertools::Itertools;
|
||||
|
||||
use crate::etl::field::{Field, Fields};
|
||||
use crate::etl::field::{InputFieldInfo, OneInputOneOutputField};
|
||||
use crate::etl::transform::index::Index;
|
||||
use crate::etl::transform::{Transform, Transformer, Transforms};
|
||||
use crate::etl::value::{Array, Map, Timestamp, Value};
|
||||
use crate::etl::value::{Timestamp, Value};
|
||||
|
||||
const DEFAULT_GREPTIME_TIMESTAMP_COLUMN: &str = "greptime_timestamp";
|
||||
|
||||
@@ -36,23 +36,41 @@ pub struct GreptimeTransformer {
|
||||
}
|
||||
|
||||
impl GreptimeTransformer {
|
||||
fn default_greptime_timestamp_column() -> Transform {
|
||||
/// Add a default timestamp column to the transforms
|
||||
fn add_greptime_timestamp_column(transforms: &mut Transforms) {
|
||||
let ns = chrono::Utc::now().timestamp_nanos_opt().unwrap_or(0);
|
||||
let type_ = Value::Timestamp(Timestamp::Nanosecond(ns));
|
||||
let default = Some(type_.clone());
|
||||
let mut field = Field::new(DEFAULT_GREPTIME_TIMESTAMP_COLUMN);
|
||||
field.insert_output_index(DEFAULT_GREPTIME_TIMESTAMP_COLUMN.to_string(), 0);
|
||||
let fields = Fields::new(vec![field]).unwrap();
|
||||
|
||||
Transform {
|
||||
fields,
|
||||
let transform = Transform {
|
||||
real_fields: vec![OneInputOneOutputField::new(
|
||||
InputFieldInfo {
|
||||
name: DEFAULT_GREPTIME_TIMESTAMP_COLUMN.to_string(),
|
||||
index: usize::MAX,
|
||||
},
|
||||
(
|
||||
DEFAULT_GREPTIME_TIMESTAMP_COLUMN.to_string(),
|
||||
transforms
|
||||
.transforms
|
||||
.iter()
|
||||
.map(|x| x.real_fields.len())
|
||||
.sum(),
|
||||
),
|
||||
)],
|
||||
type_,
|
||||
default,
|
||||
index: Some(Index::Time),
|
||||
on_failure: Some(crate::etl::transform::OnFailure::Default),
|
||||
}
|
||||
};
|
||||
let required_keys = transforms.required_keys_mut();
|
||||
required_keys.push(DEFAULT_GREPTIME_TIMESTAMP_COLUMN.to_string());
|
||||
|
||||
let output_keys = transforms.output_keys_mut();
|
||||
output_keys.push(DEFAULT_GREPTIME_TIMESTAMP_COLUMN.to_string());
|
||||
transforms.push(transform);
|
||||
}
|
||||
|
||||
/// Generate the schema for the GreptimeTransformer
|
||||
fn schemas(transforms: &Transforms) -> Result<Vec<ColumnSchema>, String> {
|
||||
let mut schema = vec![];
|
||||
for transform in transforms.iter() {
|
||||
@@ -60,53 +78,6 @@ impl GreptimeTransformer {
|
||||
}
|
||||
Ok(schema)
|
||||
}
|
||||
|
||||
/// Converts one input map into a `Row` laid out according to `self.schema`.
///
/// For every field of every transform the value is looked up in `map` by
/// `field.get_field_name()` and coerced with `coerce_value`. When the key is
/// absent, the transform's default (if any) is coerced instead; with no default
/// the slot stays empty. The result is written at the index found in the
/// field's `output_fields_index_mapping`.
///
/// # Errors
///
/// Returns an error when `coerce_value` fails or when a field has an empty
/// `output_fields_index_mapping`.
fn transform_map(&self, map: &Map) -> Result<Row, String> {
|
||||
// one slot per schema column, all initially without data
let mut values = vec![GreptimeValue { value_data: None }; self.schema.len()];
|
||||
for transform in self.transforms.iter() {
|
||||
for field in transform.fields.iter() {
|
||||
let value_data = match map.get(field.get_field_name()) {
|
||||
Some(val) => coerce_value(val, transform)?,
|
||||
None => {
|
||||
let default = transform.get_default();
|
||||
match default {
|
||||
Some(default) => coerce_value(default, transform)?,
|
||||
None => None,
|
||||
}
|
||||
}
|
||||
};
|
||||
// only the first entry of the output index mapping is consulted
if let Some(i) = field
|
||||
.output_fields_index_mapping
|
||||
.iter()
|
||||
.next()
|
||||
.map(|kv| kv.1)
|
||||
{
|
||||
values[*i] = GreptimeValue { value_data }
|
||||
} else {
|
||||
return Err(format!(
|
||||
"field: {} output_fields is empty.",
|
||||
field.get_field_name()
|
||||
));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(Row { values })
|
||||
}
|
||||
|
||||
/// Converts every element of `arr` into a `Row`, preserving order.
///
/// Each element must be a `Value::Map`; it is converted with `transform_map`.
///
/// # Errors
///
/// Fails on the first element that is not a map, or on the first
/// `transform_map` error.
fn transform_array(&self, arr: &Array) -> Result<Vec<Row>, String> {
|
||||
let mut rows = Vec::with_capacity(arr.len());
|
||||
for v in arr.iter() {
|
||||
match v {
|
||||
Value::Map(map) => {
|
||||
let row = self.transform_map(map)?;
|
||||
rows.push(row);
|
||||
}
|
||||
_ => return Err(format!("Expected map, found: {v:?}")),
|
||||
}
|
||||
}
|
||||
Ok(rows)
|
||||
}
|
||||
}
|
||||
|
||||
impl std::fmt::Display for GreptimeTransformer {
|
||||
@@ -129,9 +100,9 @@ impl Transformer for GreptimeTransformer {
|
||||
|
||||
for transform in transforms.iter() {
|
||||
let target_fields_set = transform
|
||||
.fields
|
||||
.real_fields
|
||||
.iter()
|
||||
.map(|f| f.get_target_field())
|
||||
.map(|f| f.output_name())
|
||||
.collect::<HashSet<_>>();
|
||||
|
||||
let intersections: Vec<_> = column_names_set.intersection(&target_fields_set).collect();
|
||||
@@ -146,12 +117,15 @@ impl Transformer for GreptimeTransformer {
|
||||
|
||||
if let Some(idx) = transform.index {
|
||||
if idx == Index::Time {
|
||||
match transform.fields.len() {
|
||||
1 => timestamp_columns.push(transform.fields.first().unwrap().get_field_name()),
|
||||
_ => return Err(format!(
|
||||
"Illegal to set multiple timestamp Index columns, please set only one: {}",
|
||||
transform.fields.get_target_fields().join(", ")
|
||||
)),
|
||||
match transform.real_fields.len() {
|
||||
1 => timestamp_columns
|
||||
.push(transform.real_fields.first().unwrap().input_name()),
|
||||
_ => {
|
||||
return Err(format!(
|
||||
"Illegal to set multiple timestamp Index columns, please set only one: {}",
|
||||
transform.real_fields.iter().map(|x|x.input_name()).join(", ")
|
||||
))
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -159,13 +133,7 @@ impl Transformer for GreptimeTransformer {
|
||||
|
||||
match timestamp_columns.len() {
|
||||
0 => {
|
||||
transforms.push(GreptimeTransformer::default_greptime_timestamp_column());
|
||||
|
||||
let required_keys = transforms.required_keys_mut();
|
||||
required_keys.push(DEFAULT_GREPTIME_TIMESTAMP_COLUMN.to_string());
|
||||
|
||||
let output_keys = transforms.output_keys_mut();
|
||||
output_keys.push(DEFAULT_GREPTIME_TIMESTAMP_COLUMN.to_string());
|
||||
GreptimeTransformer::add_greptime_timestamp_column(&mut transforms);
|
||||
|
||||
let schema = GreptimeTransformer::schemas(&transforms)?;
|
||||
Ok(GreptimeTransformer { transforms, schema })
|
||||
@@ -184,54 +152,26 @@ impl Transformer for GreptimeTransformer {
|
||||
}
|
||||
}
|
||||
|
||||
/// Transforms a whole input value into `Rows` carrying a clone of `self.schema`.
///
/// * `Value::Map` yields exactly one row via `transform_map`.
/// * `Value::Array` yields one row per element via `transform_array`.
/// * Any other variant is rejected.
///
/// # Errors
///
/// Returns an error for non-map/non-array input, or propagates the error of
/// the underlying row conversion.
fn transform(&self, value: Value) -> Result<Self::Output, String> {
|
||||
match value {
|
||||
Value::Map(map) => {
|
||||
let rows = vec![self.transform_map(&map)?];
|
||||
Ok(Rows {
|
||||
schema: self.schema.clone(),
|
||||
rows,
|
||||
})
|
||||
}
|
||||
Value::Array(arr) => {
|
||||
let rows = self.transform_array(&arr)?;
|
||||
Ok(Rows {
|
||||
schema: self.schema.clone(),
|
||||
rows,
|
||||
})
|
||||
}
|
||||
_ => Err(format!("Expected map or array, found: {}", value)),
|
||||
}
|
||||
}
|
||||
|
||||
fn transform_mut(&self, val: &mut Vec<Value>) -> Result<Self::VecOutput, String> {
|
||||
let mut values = vec![GreptimeValue { value_data: None }; self.schema.len()];
|
||||
for transform in self.transforms.iter() {
|
||||
for field in transform.fields.iter() {
|
||||
let index = field.input_field.index;
|
||||
for field in transform.real_fields.iter() {
|
||||
let index = field.input_index();
|
||||
let output_index = field.output_index();
|
||||
match val.get(index) {
|
||||
Some(v) => {
|
||||
let value_data = coerce_value(v, transform)
|
||||
.map_err(|e| format!("{} processor: {}", field.get_field_name(), e))?;
|
||||
.map_err(|e| format!("{} processor: {}", field.input_name(), e))?;
|
||||
// every transform fields has only one output field
|
||||
if let Some(i) = field
|
||||
.output_fields_index_mapping
|
||||
.iter()
|
||||
.next()
|
||||
.map(|kv| kv.1)
|
||||
{
|
||||
values[*i] = GreptimeValue { value_data }
|
||||
} else {
|
||||
return Err(format!(
|
||||
"field: {} output_fields is empty.",
|
||||
field.get_field_name()
|
||||
));
|
||||
}
|
||||
values[output_index] = GreptimeValue { value_data };
|
||||
}
|
||||
_ => {
|
||||
return Err(format!(
|
||||
"Get field not in the array field: {field:?}, {val:?}"
|
||||
))
|
||||
None => {
|
||||
let default = transform.get_default();
|
||||
let value_data = match default {
|
||||
Some(default) => coerce_value(default, transform)?,
|
||||
None => None,
|
||||
};
|
||||
values[output_index] = GreptimeValue { value_data };
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -66,8 +66,8 @@ impl TryFrom<Value> for ValueData {
|
||||
pub(crate) fn coerce_columns(transform: &Transform) -> Result<Vec<ColumnSchema>, String> {
|
||||
let mut columns = Vec::new();
|
||||
|
||||
for field in transform.fields.iter() {
|
||||
let column_name = field.get_target_field().to_string();
|
||||
for field in transform.real_fields.iter() {
|
||||
let column_name = field.output_name().to_string();
|
||||
|
||||
let datatype = coerce_type(transform)? as i32;
|
||||
|
||||
@@ -134,7 +134,7 @@ fn coerce_type(transform: &Transform) -> Result<ColumnDataType, String> {
|
||||
|
||||
Value::Null => Err(format!(
|
||||
"Null type not supported when to coerce '{}' type",
|
||||
transform.fields
|
||||
transform.type_.to_str_type()
|
||||
)),
|
||||
}
|
||||
}
|
||||
@@ -144,15 +144,18 @@ pub(crate) fn coerce_value(
|
||||
transform: &Transform,
|
||||
) -> Result<Option<ValueData>, String> {
|
||||
match val {
|
||||
Value::Null => match transform.on_failure {
|
||||
Some(OnFailure::Ignore) => Ok(None),
|
||||
Some(OnFailure::Default) => transform
|
||||
.get_default()
|
||||
.map(|default| coerce_value(default, transform))
|
||||
.unwrap_or_else(|| {
|
||||
coerce_value(transform.get_type_matched_default_val(), transform)
|
||||
}),
|
||||
None => Ok(None),
|
||||
Value::Null => match &transform.default {
|
||||
Some(default) => coerce_value(default, transform),
|
||||
None => match transform.on_failure {
|
||||
Some(OnFailure::Ignore) => Ok(None),
|
||||
Some(OnFailure::Default) => transform
|
||||
.get_default()
|
||||
.map(|default| coerce_value(default, transform))
|
||||
.unwrap_or_else(|| {
|
||||
coerce_value(transform.get_type_matched_default_val(), transform)
|
||||
}),
|
||||
None => Ok(None),
|
||||
},
|
||||
},
|
||||
|
||||
Value::Int8(n) => coerce_i64_value(*n as i64, transform),
|
||||
@@ -404,12 +407,11 @@ fn coerce_string_value(s: &String, transform: &Transform) -> Result<Option<Value
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use crate::etl::field::Fields;
|
||||
|
||||
#[test]
|
||||
fn test_coerce_string_without_on_failure() {
|
||||
let transform = Transform {
|
||||
fields: Fields::default(),
|
||||
real_fields: vec![],
|
||||
type_: Value::Int32(0),
|
||||
default: None,
|
||||
index: None,
|
||||
@@ -434,7 +436,7 @@ mod tests {
|
||||
#[test]
|
||||
fn test_coerce_string_with_on_failure_ignore() {
|
||||
let transform = Transform {
|
||||
fields: Fields::default(),
|
||||
real_fields: vec![],
|
||||
type_: Value::Int32(0),
|
||||
default: None,
|
||||
index: None,
|
||||
@@ -449,7 +451,7 @@ mod tests {
|
||||
#[test]
|
||||
fn test_coerce_string_with_on_failure_default() {
|
||||
let mut transform = Transform {
|
||||
fields: Fields::default(),
|
||||
real_fields: vec![],
|
||||
type_: Value::Int32(0),
|
||||
default: None,
|
||||
index: None,
|
||||
|
||||
@@ -13,20 +13,45 @@
|
||||
// limitations under the License.
|
||||
|
||||
use greptime_proto::v1::{ColumnDataType, ColumnSchema, Rows, SemanticType};
|
||||
use pipeline::{parse, Content, GreptimeTransformer, Pipeline, Value};
|
||||
use pipeline::{parse, Content, GreptimeTransformer, Pipeline};
|
||||
|
||||
/// test util function to parse and execute pipeline
|
||||
pub fn parse_and_exec(input_str: &str, pipeline_yaml: &str) -> Rows {
|
||||
let input_value: Value = serde_json::from_str::<serde_json::Value>(input_str)
|
||||
.expect("failed to parse into json")
|
||||
.try_into()
|
||||
.expect("failed to convert into value");
|
||||
let input_value = serde_json::from_str::<serde_json::Value>(input_str).unwrap();
|
||||
|
||||
let yaml_content = Content::Yaml(pipeline_yaml.into());
|
||||
let pipeline: Pipeline<GreptimeTransformer> =
|
||||
parse(&yaml_content).expect("failed to parse pipeline");
|
||||
let mut result = pipeline.init_intermediate_state();
|
||||
|
||||
pipeline.exec(input_value).expect("failed to exec pipeline")
|
||||
let schema = pipeline.schemas().clone();
|
||||
|
||||
let mut rows = Vec::new();
|
||||
|
||||
match input_value {
|
||||
serde_json::Value::Array(array) => {
|
||||
for value in array {
|
||||
pipeline.prepare(value, &mut result).unwrap();
|
||||
let row = pipeline
|
||||
.exec_mut(&mut result)
|
||||
.expect("failed to exec pipeline");
|
||||
rows.push(row);
|
||||
pipeline.reset_intermediate_state(&mut result);
|
||||
}
|
||||
}
|
||||
serde_json::Value::Object(_) => {
|
||||
pipeline.prepare(input_value, &mut result).unwrap();
|
||||
let row = pipeline
|
||||
.exec_mut(&mut result)
|
||||
.expect("failed to exec pipeline");
|
||||
rows.push(row);
|
||||
}
|
||||
_ => {
|
||||
panic!("invalid input value");
|
||||
}
|
||||
}
|
||||
|
||||
Rows { schema, rows }
|
||||
}
|
||||
|
||||
/// test util function to create column schema
|
||||
|
||||
@@ -157,7 +157,7 @@ transform:
|
||||
fn test_modifier() {
|
||||
let empty_str = r#"
|
||||
{
|
||||
"str": "key1 key2 key3 key4 key5 key6 key7 key8"
|
||||
"str": "key1 key2 key3 key4 key5 key6"
|
||||
}"#;
|
||||
|
||||
let pipeline_yaml = r#"
|
||||
@@ -165,7 +165,7 @@ processors:
|
||||
- dissect:
|
||||
field: str
|
||||
patterns:
|
||||
- "%{key1} %{key2} %{+key3} %{+key3/2} %{key5->} %{?key6} %{*key_7} %{&key_7}"
|
||||
- "%{key1} %{key2} %{+key3} %{+key3/2} %{key5->} %{?key6}"
|
||||
|
||||
transform:
|
||||
- fields:
|
||||
@@ -173,7 +173,6 @@ transform:
|
||||
- key2
|
||||
- key3
|
||||
- key5
|
||||
- key7
|
||||
type: string
|
||||
"#;
|
||||
|
||||
@@ -184,7 +183,6 @@ transform:
|
||||
make_string_column_schema("key2".to_string()),
|
||||
make_string_column_schema("key3".to_string()),
|
||||
make_string_column_schema("key5".to_string()),
|
||||
make_string_column_schema("key7".to_string()),
|
||||
common::make_column_schema(
|
||||
"greptime_timestamp".to_string(),
|
||||
ColumnDataType::TimestampNanosecond,
|
||||
@@ -209,10 +207,6 @@ transform:
|
||||
output.rows[0].values[3].value_data,
|
||||
Some(StringValue("key5".to_string()))
|
||||
);
|
||||
assert_eq!(
|
||||
output.rows[0].values[4].value_data,
|
||||
Some(StringValue("key8".to_string()))
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
|
||||
@@ -12,18 +12,18 @@
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
use api::v1::Rows;
|
||||
use common_telemetry::tracing::info;
|
||||
use greptime_proto::v1::value::ValueData::{
|
||||
BoolValue, F64Value, StringValue, TimestampNanosecondValue, TimestampSecondValue, U32Value,
|
||||
U64Value, U8Value,
|
||||
};
|
||||
use greptime_proto::v1::Value as GreptimeValue;
|
||||
use pipeline::{parse, Content, GreptimeTransformer, Pipeline, Value};
|
||||
use pipeline::{parse, Content, GreptimeTransformer, Pipeline};
|
||||
|
||||
#[test]
|
||||
fn test_complex_data() {
|
||||
let input_value_str = r#"
|
||||
[
|
||||
{
|
||||
"version": 1,
|
||||
"streamId": "12345",
|
||||
@@ -73,12 +73,9 @@ fn test_complex_data() {
|
||||
"ewExecutionInfo": "c:4380:7:161:162:161:n:::12473:200|C:4380:3:0:4:0:n:::6967:200|R:4380:20:99:99:1:n:::35982:200",
|
||||
"customField": "any-custom-value"
|
||||
}
|
||||
]
|
||||
"#;
|
||||
let input_value: Value = serde_json::from_str::<serde_json::Value>(input_value_str)
|
||||
.expect("failed to parse input value")
|
||||
.try_into()
|
||||
.expect("failed to convert input value");
|
||||
let input_value = serde_json::from_str::<serde_json::Value>(input_value_str)
|
||||
.expect("failed to parse input value");
|
||||
|
||||
let pipeline_yaml = r#"
|
||||
---
|
||||
@@ -422,7 +419,19 @@ transform:
|
||||
let yaml_content = Content::Yaml(pipeline_yaml.into());
|
||||
let pipeline: Pipeline<GreptimeTransformer> =
|
||||
parse(&yaml_content).expect("failed to parse pipeline");
|
||||
let output = pipeline.exec(input_value).expect("failed to exec pipeline");
|
||||
let mut stats = pipeline.init_intermediate_state();
|
||||
pipeline
|
||||
.prepare(input_value, &mut stats)
|
||||
.expect("failed to prepare pipeline");
|
||||
|
||||
let row = pipeline
|
||||
.exec_mut(&mut stats)
|
||||
.expect("failed to exec pipeline");
|
||||
|
||||
let output = Rows {
|
||||
schema: pipeline.schemas().clone(),
|
||||
rows: vec![row],
|
||||
};
|
||||
|
||||
assert_eq!(output.rows.len(), 1);
|
||||
let values = output.rows.first().unwrap().values.clone();
|
||||
@@ -464,10 +473,7 @@ fn test_simple_data() {
|
||||
"line": "2024-05-25 20:16:37.217 hello world"
|
||||
}
|
||||
"#;
|
||||
let input_value: Value = serde_json::from_str::<serde_json::Value>(input_value_str)
|
||||
.unwrap()
|
||||
.try_into()
|
||||
.unwrap();
|
||||
let input_value = serde_json::from_str::<serde_json::Value>(input_value_str).unwrap();
|
||||
|
||||
let pipeline_yaml = r#"
|
||||
processors:
|
||||
@@ -493,11 +499,13 @@ transform:
|
||||
|
||||
let yaml_content = Content::Yaml(pipeline_yaml.into());
|
||||
let pipeline: Pipeline<GreptimeTransformer> = parse(&yaml_content).unwrap();
|
||||
let output = pipeline.exec(input_value).unwrap();
|
||||
let r = output
|
||||
.rows
|
||||
|
||||
let mut status = pipeline.init_intermediate_state();
|
||||
pipeline.prepare(input_value, &mut status).unwrap();
|
||||
let row = pipeline.exec_mut(&mut status).unwrap();
|
||||
let r = row
|
||||
.values
|
||||
.into_iter()
|
||||
.flat_map(|v| v.values)
|
||||
.map(|v| v.value_data.unwrap())
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
|
||||
Reference in New Issue
Block a user