feat: log ingestion support (#4014)

* chore: add log http ingester scaffold

* chore: add some example code

* chore: add log inserter

* chore: add log handler file

* chore: add pipeline lib

* chore: import log handler

* chore: add pipeline http handler

* chore: add pipeline private table

* chore: add pipeline API

* chore: improve error handling

* chore: merge main

* chore: add multi content type support for log handler

* refactor: remove servers dep on pipeline

* refactor: move define_into_tonic_status to common-error

* refactor: bring in pipeline 3eb890c551b8d7f60c4491fcfec18966e2b210a4

* chore: fix typo

* refactor: bring in pipeline a95c9767d7056ab01dd8ca5fa1214456c6ffc72c

* chore: fix typo and license header

* refactor: move http event handler to a separate file

* chore: add test for pipeline

* chore: fmt

* refactor: bring in pipeline 7d2402701877901871dd1294a65ac937605a6a93

* refactor: move `pipeline_operator` to `pipeline` crate

* chore: minor update

* refactor: bring in pipeline 1711f4d46687bada72426d88cda417899e0ae3a4

* chore: add log

* chore: add log

* chore: remove open hook

* chore: minor update

* chore: fix fmt

* chore: minor update

* chore: rename desc for pipeline table

* refactor: remove updated_at in pipelines

* chore: add more content type support for log inserter api

* chore: introduce pipeline crate

* chore: update upload pipeline api

* chore: fix by pr commit

* chore: add some doc for pub fn/struct

* chore: some minor fix

* chore: add pipeline version support

* chore: impl log pipeline version

* chore: fix format issue

* fix: make the LogicalPlan of a query pipeline sorted in desc order

* chore: remove some debug log

* chore: replacing hashmap cache with moka

* chore: fix by pr commit

* chore: fix toml format issue

* chore: update Cargo.lock

* chore: fix by pr commit

* chore: fix some issue by pr commit

* chore: add more doc for pipeline version

---------

Co-authored-by: shuiyisong <xixing.sys@gmail.com>
This commit is contained in:
localhost
2024-06-15 01:03:30 +08:00
committed by GitHub
parent bf3ad44584
commit 01e3a24cf7
22 changed files with 1613 additions and 52 deletions

View File

@@ -22,7 +22,7 @@ arrow-ipc.workspace = true
arrow-schema.workspace = true
async-trait = "0.1"
auth.workspace = true
axum.workspace = true
axum = { workspace = true, features = ["multipart"] }
axum-macros = "0.3.8"
base64.workspace = true
bytes.workspace = true
@@ -69,6 +69,7 @@ opentelemetry-proto.workspace = true
parking_lot = "0.12"
pgwire = "0.20"
pin-project = "1.0"
pipeline.workspace = true
postgres-types = { version = "0.2", features = ["with-chrono-0_4"] }
pprof = { version = "0.13", features = [
"flamegraph",

View File

@@ -27,6 +27,7 @@ use common_error::status_code::StatusCode;
use common_macro::stack_trace_debug;
use common_telemetry::{debug, error};
use datatypes::prelude::ConcreteDataType;
use headers::ContentType;
use query::parser::PromQuery;
use serde_json::json;
use snafu::{Location, Snafu};
@@ -148,6 +149,19 @@ pub enum Error {
source: BoxedError,
},
#[snafu(display("Pipeline management api error"))]
Pipeline {
source: pipeline::error::Error,
#[snafu(implicit)]
location: Location,
},
#[snafu(display("Unsupported delete pipeline."))]
UnsupportedDeletePipeline {
#[snafu(implicit)]
location: Location,
},
#[snafu(display("Failed to execute script by name: {}", name))]
ExecuteScript {
name: String,
@@ -533,6 +547,27 @@ pub enum Error {
location: Location,
},
#[snafu(display("Failed to parse payload as json"))]
ParseJson {
#[snafu(source)]
error: serde_json::error::Error,
#[snafu(implicit)]
location: Location,
},
#[snafu(display("Failed to convert to structured log"))]
ToStructuredLog {
#[snafu(implicit)]
location: Location,
},
#[snafu(display("Unsupported content type: {:?}", content_type))]
UnsupportedContentType {
content_type: ContentType,
#[snafu(implicit)]
location: Location,
},
#[snafu(display("Failed to decode url"))]
UrlDecode {
#[snafu(source)]
@@ -600,6 +635,7 @@ impl ErrorExt for Error {
| FileWatch { .. } => StatusCode::Internal,
UnsupportedDataType { .. } => StatusCode::Unsupported,
UnsupportedDeletePipeline { .. } => StatusCode::Unsupported,
#[cfg(not(windows))]
UpdateJemallocMetrics { .. } => StatusCode::Internal,
@@ -614,6 +650,8 @@ impl ErrorExt for Error {
| ExecuteGrpcRequest { source, .. }
| CheckDatabaseValidity { source, .. } => source.status_code(),
Pipeline { source, .. } => source.status_code(),
NotSupported { .. }
| InvalidParameter { .. }
| InvalidQuery { .. }
@@ -637,6 +675,9 @@ impl ErrorExt for Error {
| MissingQueryContext { .. }
| MysqlValueConversion { .. }
| UnexpectedPhysicalTable { .. }
| ParseJson { .. }
| ToStructuredLog { .. }
| UnsupportedContentType { .. }
| TimestampOverflow { .. } => StatusCode::InvalidArguments,
RowWriter { source, .. }

View File

@@ -67,12 +67,13 @@ use crate::metrics_handler::MetricsHandler;
use crate::prometheus_handler::PrometheusHandlerRef;
use crate::query_handler::sql::ServerSqlQueryHandlerRef;
use crate::query_handler::{
InfluxdbLineProtocolHandlerRef, OpenTelemetryProtocolHandlerRef, OpentsdbProtocolHandlerRef,
PromStoreProtocolHandlerRef, ScriptHandlerRef,
InfluxdbLineProtocolHandlerRef, LogHandlerRef, OpenTelemetryProtocolHandlerRef,
OpentsdbProtocolHandlerRef, PromStoreProtocolHandlerRef, ScriptHandlerRef,
};
use crate::server::Server;
pub mod authorize;
pub mod event;
pub mod handler;
pub mod header;
pub mod influxdb;
@@ -587,6 +588,16 @@ impl HttpServerBuilder {
}
}
/// Mounts the log ingestion routes under `/{HTTP_API_VERSION}/events`.
///
/// The routes themselves are built by `HttpServer::route_log` and are bound
/// to the given `handler`.
pub fn with_log_ingest_handler(self, handler: LogHandlerRef) -> Self {
    let events_prefix = format!("/{HTTP_API_VERSION}/events");
    let router = self
        .router
        .nest(&events_prefix, HttpServer::route_log(handler));
    Self { router, ..self }
}
pub fn with_plugins(self, plugins: Plugins) -> Self {
Self { plugins, ..self }
}
@@ -699,6 +710,21 @@ impl HttpServer {
.with_state(metrics_handler)
}
/// Builds the router for log ingestion and pipeline management.
///
/// * `POST /logs` is served by `event::log_ingester`.
/// * `POST /pipelines/:pipeline_name` is served by `event::add_pipeline`.
///
/// Both routes share a middleware stack made of the error handling layer and
/// the request decompression layer, and both receive `log_handler` as state.
fn route_log<S>(log_handler: LogHandlerRef) -> Router<S> {
    let middleware = ServiceBuilder::new()
        .layer(HandleErrorLayer::new(handle_error))
        .layer(RequestDecompressionLayer::new());

    Router::new()
        .route("/logs", routing::post(event::log_ingester))
        .route(
            "/pipelines/:pipeline_name",
            routing::post(event::add_pipeline),
        )
        .layer(middleware)
        .with_state(log_handler)
}
fn route_sql<S>(api_state: ApiState) -> ApiRouter<S> {
ApiRouter::new()
.api_route(

View File

@@ -0,0 +1,257 @@
// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use std::result::Result as StdResult;
use api::v1::{RowInsertRequest, RowInsertRequests, Rows};
use axum::body::HttpBody;
use axum::extract::{FromRequest, Multipart, Path, Query, State};
use axum::headers::ContentType;
use axum::http::header::CONTENT_TYPE;
use axum::http::{Request, StatusCode};
use axum::response::{IntoResponse, Response};
use axum::{async_trait, BoxError, Extension, TypedHeader};
use common_telemetry::{error, warn};
use common_time::Timestamp;
use datatypes::timestamp::TimestampNanosecond;
use mime_guess::mime;
use pipeline::error::{CastTypeSnafu, PipelineTransformSnafu};
use pipeline::table::PipelineVersion;
use pipeline::Value as PipelineValue;
use schemars::JsonSchema;
use serde::{Deserialize, Serialize};
use serde_json::{Deserializer, Value};
use session::context::QueryContextRef;
use snafu::{OptionExt, ResultExt};
use crate::error::{
InvalidParameterSnafu, ParseJsonSnafu, PipelineSnafu, Result, UnsupportedContentTypeSnafu,
};
use crate::http::greptime_result_v1::GreptimedbV1Response;
use crate::http::HttpResponse;
use crate::query_handler::LogHandlerRef;
// Query-string parameters accepted by the log ingestion endpoint.
// NOTE: plain `//` comments are used on purpose; this type derives `JsonSchema`,
// and `///` doc comments could end up in the generated schema.
#[derive(Debug, Default, Serialize, Deserialize, JsonSchema)]
pub struct LogIngesterQueryParams {
// Destination table name; `log_ingester` rejects the request when it is missing.
pub table: Option<String>,
// Not read by the handlers visible in this file.
// NOTE(review): presumably selects the target database elsewhere - confirm.
pub db: Option<String>,
// Name of the pipeline to run; `log_ingester` rejects the request when it is missing.
pub pipeline_name: Option<String>,
// When `true`, invalid payload items are skipped instead of failing the request.
// `log_ingester` treats a missing value as `false`.
pub ignore_errors: Option<bool>,
// Optional pipeline version; `log_ingester` parses it with `Timestamp::from_str_utc`.
pub version: Option<String>,
}
/// Raw pipeline definition text extracted from an HTTP request.
///
/// Two request shapes are accepted:
///
/// * a body whose media type ends in `yaml` (for example `application/x-yaml`),
///   in which case the whole body is the pipeline text;
/// * a `multipart/form-data` upload, in which case the text of the first field
///   is the pipeline text.
///
/// Any other (or missing) `Content-Type` is rejected with
/// `415 Unsupported Media Type`.
pub struct PipelineContent(String);

#[async_trait]
impl<S, B> FromRequest<S, B> for PipelineContent
where
    B: HttpBody + Send + 'static,
    B::Data: Send,
    bytes::Bytes: std::convert::From<<B as HttpBody>::Data>,
    B::Error: Into<BoxError>,
    S: Send + Sync,
{
    type Rejection = Response;

    async fn from_request(req: Request<B>, state: &S) -> StdResult<Self, Self::Rejection> {
        // Classify on the media type alone: drop any `; param=value` suffix and
        // compare case-insensitively. Matching against the full header value
        // would reject `application/yaml; charset=utf-8` and would mistake a
        // multipart header whose boundary happens to end in "yaml" for a YAML
        // body.
        let media_type = req
            .headers()
            .get(CONTENT_TYPE)
            .and_then(|value| value.to_str().ok())
            .and_then(|value| value.split(';').next())
            .map(|essence| essence.trim().to_ascii_lowercase());

        match media_type.as_deref() {
            Some(kind) if kind.ends_with("yaml") => {
                let payload = String::from_request(req, state)
                    .await
                    .map_err(IntoResponse::into_response)?;
                Ok(Self(payload))
            }
            Some(kind) if kind.starts_with("multipart/form-data") => {
                let mut form: Multipart = Multipart::from_request(req, state)
                    .await
                    .map_err(IntoResponse::into_response)?;
                // Only the first field is read; an upload without any field
                // carries no pipeline and is rejected.
                let first_field = form
                    .next_field()
                    .await
                    .map_err(IntoResponse::into_response)?
                    .ok_or_else(|| StatusCode::UNSUPPORTED_MEDIA_TYPE.into_response())?;
                let payload = first_field
                    .text()
                    .await
                    .map_err(IntoResponse::into_response)?;
                Ok(Self(payload))
            }
            _ => Err(StatusCode::UNSUPPORTED_MEDIA_TYPE.into_response()),
        }
    }
}
#[axum_macros::debug_handler]
pub async fn add_pipeline(
State(handler): State<LogHandlerRef>,
Path(pipeline_name): Path<String>,
Extension(query_ctx): Extension<QueryContextRef>,
PipelineContent(payload): PipelineContent,
) -> Result<String> {
if pipeline_name.is_empty() {
return Err(InvalidParameterSnafu {
reason: "pipeline_name is required in path",
}
.build());
}
if payload.is_empty() {
return Err(InvalidParameterSnafu {
reason: "pipeline is required in body",
}
.build());
}
let content_type = "yaml";
let result = handler
.insert_pipeline(&pipeline_name, content_type, &payload, query_ctx)
.await;
result.map(|_| "ok".to_string()).map_err(|e| {
error!(e; "failed to insert pipeline");
e
})
}
/// Flattens a stream of parsed JSON documents into one JSON array.
///
/// * An array document contributes each of its elements.
/// * An object document is appended as a single element.
/// * Any other document (string, number, bool, null) is invalid.
///
/// With `ignore_error == false` the first invalid document or parse failure
/// aborts the whole conversion with an error. With `ignore_error == true`
/// such entries are skipped and the remaining ones are still collected.
fn transform_ndjson_array_factory(
    values: impl IntoIterator<Item = StdResult<Value, serde_json::Error>>,
    ignore_error: bool,
) -> Result<Value> {
    let mut merged: Vec<Value> = Vec::with_capacity(100);

    for item in values {
        let document = match item {
            Ok(document) => document,
            failed @ Err(_) => {
                if ignore_error {
                    warn!("invalid item in array: {:?}", failed);
                    continue;
                }
                return failed.context(ParseJsonSnafu);
            }
        };

        match document {
            Value::Array(elements) => merged.extend(elements),
            Value::Object(_) => merged.push(document),
            // Scalars are silently dropped when errors are ignored.
            _ if ignore_error => {}
            _ => {
                warn!("invalid item in array: {:?}", document);
                return InvalidParameterSnafu {
                    reason: format!("invalid item:{} in array", document),
                }
                .fail();
            }
        }
    }

    Ok(Value::Array(merged))
}
#[axum_macros::debug_handler]
pub async fn log_ingester(
State(handler): State<LogHandlerRef>,
Query(query_params): Query<LogIngesterQueryParams>,
Extension(query_ctx): Extension<QueryContextRef>,
TypedHeader(content_type): TypedHeader<ContentType>,
payload: String,
) -> Result<HttpResponse> {
let pipeline_name = query_params.pipeline_name.context(InvalidParameterSnafu {
reason: "pipeline_name is required",
})?;
let table_name = query_params.table.context(InvalidParameterSnafu {
reason: "table is required",
})?;
let version = match query_params.version {
Some(version) => {
let ts = Timestamp::from_str_utc(&version).map_err(|e| {
InvalidParameterSnafu {
reason: format!("invalid pipeline version: {} with error: {}", &version, e),
}
.build()
})?;
Some(TimestampNanosecond(ts))
}
None => None,
};
let ignore_errors = query_params.ignore_errors.unwrap_or(false);
let m: mime::Mime = content_type.clone().into();
let value = match m.subtype() {
mime::JSON => transform_ndjson_array_factory(
Deserializer::from_str(&payload).into_iter(),
ignore_errors,
)?,
// add more content type support
_ => UnsupportedContentTypeSnafu { content_type }.fail()?,
};
ingest_logs_inner(
handler,
pipeline_name,
version,
table_name,
value,
query_ctx,
)
.await
}
/// Runs `payload` through the requested pipeline and inserts the resulting
/// rows into `table_name`.
///
/// Steps:
/// 1. convert the JSON payload into the pipeline's value representation;
/// 2. look up the pipeline by `pipeline_name` and `version`;
/// 3. execute the pipeline to obtain the rows to insert;
/// 4. submit a single row-insert request through the log handler.
///
/// The outcome of the insert (success or failure) is rendered into the
/// returned response together with the elapsed time in milliseconds.
///
/// # Errors
///
/// Fails when the payload cannot be converted, the pipeline cannot be
/// fetched, or the pipeline execution fails.
async fn ingest_logs_inner(
    state: LogHandlerRef,
    pipeline_name: String,
    version: PipelineVersion,
    table_name: String,
    payload: Value,
    query_ctx: QueryContextRef,
) -> Result<HttpResponse> {
    let start = std::time::Instant::now();

    let pipeline_data = PipelineValue::try_from(payload)
        .map_err(|reason| CastTypeSnafu { msg: reason }.build())
        .context(PipelineSnafu)?;

    let pipeline = state
        .get_pipeline(&pipeline_name, version, query_ctx.clone())
        .await?;

    let transformed_data: Rows = pipeline
        .exec(pipeline_data)
        .map_err(|reason| PipelineTransformSnafu { reason }.build())
        .context(PipelineSnafu)?;

    // `table_name` is owned and not needed afterwards, so it is moved into the
    // request instead of being cloned.
    let insert_requests = RowInsertRequests {
        inserts: vec![RowInsertRequest {
            rows: Some(transformed_data),
            table_name,
        }],
    };

    // The insert result is deliberately not propagated with `?`: it is handed
    // to the response builder as-is.
    let output = state.insert_logs(insert_requests, query_ctx).await;

    let response = GreptimedbV1Response::from_output(vec![output])
        .await
        .with_execution_time(start.elapsed().as_millis() as u64);
    Ok(response)
}

View File

@@ -35,6 +35,8 @@ use common_query::Output;
use headers::HeaderValue;
use opentelemetry_proto::tonic::collector::metrics::v1::ExportMetricsServiceRequest;
use opentelemetry_proto::tonic::collector::trace::v1::ExportTraceServiceRequest;
use pipeline::table::PipelineVersion;
use pipeline::{GreptimeTransformer, Pipeline};
use serde_json::Value;
use session::context::QueryContextRef;
@@ -48,6 +50,7 @@ pub type InfluxdbLineProtocolHandlerRef = Arc<dyn InfluxdbLineProtocolHandler +
pub type PromStoreProtocolHandlerRef = Arc<dyn PromStoreProtocolHandler + Send + Sync>;
pub type OpenTelemetryProtocolHandlerRef = Arc<dyn OpenTelemetryProtocolHandler + Send + Sync>;
pub type ScriptHandlerRef = Arc<dyn ScriptHandler + Send + Sync>;
pub type LogHandlerRef = Arc<dyn LogHandler + Send + Sync>;
#[async_trait]
pub trait ScriptHandler {
@@ -118,3 +121,29 @@ pub trait OpenTelemetryProtocolHandler {
ctx: QueryContextRef,
) -> Result<Output>;
}
/// LogHandler is responsible for handling log related requests.
/// It should be able to insert logs and manage pipelines.
/// The pipeline is a series of transformations that can be applied to logs.
/// The pipeline is stored in the database and can be retrieved by name.
#[async_trait]
pub trait LogHandler {
/// Inserts the given row insert requests (the HTTP log handler passes the
/// rows produced by a pipeline) and returns the resulting `Output`.
async fn insert_logs(&self, log: RowInsertRequests, ctx: QueryContextRef) -> Result<Output>;
/// Looks up the pipeline called `name` at the requested `version` and
/// returns it as a shared handle.
/// NOTE(review): a `None` version presumably selects the latest pipeline -
/// confirm against the implementation.
async fn get_pipeline(
&self,
name: &str,
version: PipelineVersion,
query_ctx: QueryContextRef,
) -> Result<Arc<Pipeline<GreptimeTransformer>>>;
/// Stores the pipeline definition `pipeline` under `name`.
/// `content_type` describes the format of the definition; the HTTP handler
/// in this change always passes `"yaml"`.
async fn insert_pipeline(
&self,
name: &str,
content_type: &str,
pipeline: &str,
query_ctx: QueryContextRef,
) -> Result<()>;
/// Deletes the pipeline called `name`.
/// NOTE(review): implementations may not support deletion yet (there is an
/// `UnsupportedDeletePipeline` error variant) - confirm.
async fn delete_pipeline(&self, name: &str, query_ctx: QueryContextRef) -> Result<()>;
}