Skip to main content

frontend/instance/
otlp.rs

1// Copyright 2023 Greptime Team
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7//     http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15use std::sync::Arc;
16
17use api::helper::ColumnDataTypeWrapper;
18use api::v1::alter_table_expr::Kind;
19use api::v1::{
20    AlterTableExpr, ColumnDataType, ModifyColumnType, ModifyColumnTypes, RowInsertRequests,
21};
22use async_trait::async_trait;
23use auth::{PermissionChecker, PermissionCheckerRef, PermissionReq};
24use client::Output;
25use common_error::ext::{BoxedError, ErrorExt};
26use common_error::status_code::StatusCode;
27use common_query::prelude::GREPTIME_PHYSICAL_TABLE;
28use common_telemetry::{tracing, warn};
29use itertools::Itertools;
30use opentelemetry_proto::tonic::collector::logs::v1::ExportLogsServiceRequest;
31use opentelemetry_proto::tonic::collector::trace::v1::ExportTraceServiceRequest;
32use otel_arrow_rust::proto::opentelemetry::collector::metrics::v1::ExportMetricsServiceRequest;
33use pipeline::{GreptimePipelineParams, PipelineWay};
34use servers::error::{self, AuthSnafu, Result as ServerResult};
35use servers::http::prom_store::PHYSICAL_TABLE_PARAM;
36use servers::interceptor::{OpenTelemetryProtocolInterceptor, OpenTelemetryProtocolInterceptorRef};
37use servers::otlp;
38use servers::otlp::coerce::{coerce_value_data, trace_value_datatype};
39use servers::otlp::trace::TraceAuxData;
40use servers::otlp::trace::span::{TraceSpan, TraceSpanGroup};
41use servers::query_handler::{
42    OpenTelemetryProtocolHandler, PipelineHandlerRef, TraceIngestOutcome,
43};
44use session::context::QueryContextRef;
45use snafu::{IntoError, ResultExt};
46use table::requests::{
47    OTLP_METRIC_COMPAT_KEY, OTLP_METRIC_COMPAT_PROM, SEMANTIC_PER_TABLE_INDEX_KEY,
48    SEMANTIC_PIPELINE, SEMANTIC_SIGNAL_TYPE, SEMANTIC_SOURCE, SEMANTIC_TRACE_CONVENTIONS,
49    SEMANTIC_VALUE_MIXED, SEMANTIC_VALUE_UNKNOWN, SIGNAL_TYPE_LOG, SIGNAL_TYPE_METRIC,
50    SIGNAL_TYPE_TRACE, SOURCE_OPENTELEMETRY, TABLE_DATA_MODEL_TRACE_V1,
51};
52
53use crate::instance::Instance;
54use crate::instance::otlp::trace_semconv::trace_semconv_fixed_type;
55use crate::instance::otlp::trace_types::{
56    PendingTraceColumnRewrite, choose_trace_reconcile_decision, enrich_trace_reconcile_error,
57    is_trace_reconcile_candidate_type, push_observed_trace_type, validate_trace_column_rewrites,
58};
59use crate::metrics::{
60    OTLP_LOGS_ROWS, OTLP_METRICS_ROWS, OTLP_TRACES_FAILURE_COUNT, OTLP_TRACES_ROWS,
61};
62
63pub mod trace_semconv;
64pub mod trace_types;
65
/// Maximum number of spans converted and written together as one chunk
/// during trace ingestion.
const TRACE_INGEST_CHUNK_SIZE: usize = 64;
/// Maximum number of failure detail messages kept for the ingest summary;
/// further messages are counted in metrics but not retained.
const TRACE_FAILURE_MESSAGE_LIMIT: usize = 4;
68
/// How trace ingestion reacts after writing a chunk of spans has failed.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum ChunkFailureReaction {
    /// Write the spans of the chunk again, one request per span.
    RetryPerSpan,
    /// Count every span of the chunk as rejected and keep going.
    DiscardChunk,
    /// Hand the error back to the caller.
    Propagate,
}

impl ChunkFailureReaction {
    /// Returns the label value recorded on the trace failure counter for this
    /// reaction.
    fn as_metric_label(self) -> &'static str {
        use ChunkFailureReaction::{DiscardChunk, Propagate, RetryPerSpan};

        match self {
            Propagate => "propagate_failure",
            DiscardChunk => "discard_chunk",
            RetryPerSpan => "retry_per_span",
        }
    }
}
85
/// Read-only inputs shared by every chunk written during one trace ingest
/// call.
struct TraceChunkIngestContext<'a> {
    /// Pipeline handler passed to the span-to-insert-request conversion.
    pipeline_handler: PipelineHandlerRef,
    /// Pipeline selected for this request.
    pipeline: &'a PipelineWay,
    /// Pipeline parameters passed to the span-to-insert-request conversion.
    pipeline_params: &'a GreptimePipelineParams,
    /// Name of the trace table the spans are written to.
    table_name: &'a str,
    /// `true` when the pipeline is `PipelineWay::OtlpTraceDirectV1`; selects
    /// the column-type reconciliation and trace insert path.
    is_trace_v1_model: bool,
}
93
/// Mutable bookkeeping accumulated while the chunks of one trace ingest call
/// are written.
struct TraceIngestState {
    /// Auxiliary-table data observed from spans whose main-table write
    /// succeeded.
    aux_data: TraceAuxData,
    /// Running accepted/rejected span counts and write cost.
    outcome: TraceIngestOutcome,
    /// Failure details collected so far (capped by
    /// `TRACE_FAILURE_MESSAGE_LIMIT` in `push_trace_failure_message`).
    failure_messages: Vec<String>,
}
99
100#[async_trait]
101impl OpenTelemetryProtocolHandler for Instance {
102    #[tracing::instrument(skip_all)]
103    async fn metrics(
104        &self,
105        request: ExportMetricsServiceRequest,
106        ctx: QueryContextRef,
107    ) -> ServerResult<Output> {
108        self.plugins
109            .get::<PermissionCheckerRef>()
110            .as_ref()
111            .check_permission(ctx.current_user(), PermissionReq::Otlp)
112            .context(AuthSnafu)?;
113
114        let interceptor_ref = self
115            .plugins
116            .get::<OpenTelemetryProtocolInterceptorRef<servers::error::Error>>();
117        interceptor_ref.pre_execute(ctx.clone())?;
118
119        let input_names = request
120            .resource_metrics
121            .iter()
122            .flat_map(|r| r.scope_metrics.iter())
123            .flat_map(|s| s.metrics.iter().map(|m| &m.name))
124            .collect::<Vec<_>>();
125
126        // See [`OtlpMetricCtx`] for details
127        let is_legacy = self.check_otlp_legacy(&input_names, ctx.clone()).await?;
128
129        let mut metric_ctx = ctx
130            .protocol_ctx()
131            .get_otlp_metric_ctx()
132            .cloned()
133            .unwrap_or_default();
134        metric_ctx.is_legacy = is_legacy;
135
136        let (requests, rows, semantic_index) =
137            otlp::metrics::to_grpc_insert_requests(request, &mut metric_ctx)?;
138        OTLP_METRICS_ROWS.inc_by(rows as u64);
139
140        let ctx = {
141            let mut c = (*ctx).clone();
142            c.set_extension(SEMANTIC_SIGNAL_TYPE, SIGNAL_TYPE_METRIC);
143            c.set_extension(SEMANTIC_SOURCE, SOURCE_OPENTELEMETRY);
144            // Per-table metric specifics + resource/scope lineage ride this
145            // internal channel; the auto-create path folds them per table name.
146            if let Some(index) = semantic_index.encode() {
147                c.set_extension(SEMANTIC_PER_TABLE_INDEX_KEY, index);
148            }
149            if !is_legacy {
150                c.set_extension(OTLP_METRIC_COMPAT_KEY, OTLP_METRIC_COMPAT_PROM.to_string());
151            }
152            Arc::new(c)
153        };
154
155        // If the user uses the legacy path, it is by default without metric engine.
156        if metric_ctx.is_legacy || !metric_ctx.with_metric_engine {
157            self.handle_row_inserts(requests, ctx, false, false)
158                .await
159                .map_err(BoxedError::new)
160                .context(error::ExecuteGrpcQuerySnafu)
161        } else {
162            let physical_table = ctx
163                .extension(PHYSICAL_TABLE_PARAM)
164                .unwrap_or(GREPTIME_PHYSICAL_TABLE)
165                .to_string();
166            self.handle_metric_row_inserts(requests, ctx, physical_table.clone())
167                .await
168                .map_err(BoxedError::new)
169                .context(error::ExecuteGrpcQuerySnafu)
170        }
171    }
172
173    #[tracing::instrument(skip_all)]
174    async fn traces(
175        &self,
176        pipeline_handler: PipelineHandlerRef,
177        request: ExportTraceServiceRequest,
178        pipeline: PipelineWay,
179        pipeline_params: GreptimePipelineParams,
180        table_name: String,
181        ctx: QueryContextRef,
182    ) -> ServerResult<TraceIngestOutcome> {
183        self.plugins
184            .get::<PermissionCheckerRef>()
185            .as_ref()
186            .check_permission(ctx.current_user(), PermissionReq::Otlp)
187            .context(AuthSnafu)?;
188
189        let interceptor_ref = self
190            .plugins
191            .get::<OpenTelemetryProtocolInterceptorRef<servers::error::Error>>();
192        interceptor_ref.pre_execute(ctx.clone())?;
193
194        // `schema_url` is consumed by `parse`, so derive conventions first.
195        let conventions = trace_conventions(&request);
196        let spans = otlp::trace::span::parse(request);
197        self.ingest_trace_spans(
198            pipeline_handler,
199            &pipeline,
200            &pipeline_params,
201            table_name,
202            spans,
203            &conventions,
204            ctx,
205        )
206        .await
207    }
208
209    #[tracing::instrument(skip_all)]
210    async fn logs(
211        &self,
212        pipeline_handler: PipelineHandlerRef,
213        request: ExportLogsServiceRequest,
214        pipeline: PipelineWay,
215        pipeline_params: GreptimePipelineParams,
216        table_name: String,
217        ctx: QueryContextRef,
218    ) -> ServerResult<Vec<Output>> {
219        self.plugins
220            .get::<PermissionCheckerRef>()
221            .as_ref()
222            .check_permission(ctx.current_user(), PermissionReq::Otlp)
223            .context(AuthSnafu)?;
224
225        let interceptor_ref = self
226            .plugins
227            .get::<OpenTelemetryProtocolInterceptorRef<servers::error::Error>>();
228        interceptor_ref.pre_execute(ctx.clone())?;
229
230        // `as_req_iter` clones this ctx into each `temp_ctx`, so identity set here
231        // reaches the context that drives table auto-create.
232        let ctx = {
233            let mut c = (*ctx).clone();
234            c.set_extension(SEMANTIC_SIGNAL_TYPE, SIGNAL_TYPE_LOG);
235            c.set_extension(SEMANTIC_SOURCE, SOURCE_OPENTELEMETRY);
236            Arc::new(c)
237        };
238
239        let opt_req = otlp::logs::to_grpc_insert_requests(
240            request,
241            pipeline,
242            pipeline_params,
243            table_name,
244            &ctx,
245            pipeline_handler,
246        )
247        .await?;
248
249        let mut outputs = vec![];
250
251        for (temp_ctx, requests) in opt_req.as_req_iter(ctx) {
252            let cnt = requests
253                .inserts
254                .iter()
255                .filter_map(|r| r.rows.as_ref().map(|r| r.rows.len()))
256                .sum::<usize>();
257
258            let o = self
259                .handle_log_inserts(requests, temp_ctx)
260                .await
261                .inspect(|_| OTLP_LOGS_ROWS.inc_by(cnt as u64))
262                .map_err(BoxedError::new)
263                .context(error::ExecuteGrpcQuerySnafu)?;
264            outputs.push(o);
265        }
266
267        Ok(outputs)
268    }
269}
270
271impl Instance {
272    /// Ingest OTLP trace spans with chunk-level writes and span-level fallback on
273    /// deterministic chunk failures.
274    #[allow(clippy::too_many_arguments)]
275    async fn ingest_trace_spans(
276        &self,
277        pipeline_handler: PipelineHandlerRef,
278        pipeline: &PipelineWay,
279        pipeline_params: &GreptimePipelineParams,
280        table_name: String,
281        groups: Vec<TraceSpanGroup>,
282        conventions: &str,
283        ctx: QueryContextRef,
284    ) -> ServerResult<TraceIngestOutcome> {
285        let is_trace_v1_model = matches!(pipeline, PipelineWay::OtlpTraceDirectV1);
286
287        // Only the main span table gets the identity; the derived `_services` /
288        // `_operations` lookup tables keep the unstamped `ctx`.
289        let main_ctx = {
290            let mut c = (*ctx).clone();
291            c.set_extension(SEMANTIC_SIGNAL_TYPE, SIGNAL_TYPE_TRACE);
292            c.set_extension(SEMANTIC_SOURCE, SOURCE_OPENTELEMETRY);
293            if is_trace_v1_model {
294                c.set_extension(SEMANTIC_PIPELINE, TABLE_DATA_MODEL_TRACE_V1);
295                c.set_extension(SEMANTIC_TRACE_CONVENTIONS, conventions);
296            }
297            Arc::new(c)
298        };
299
300        let ingest_ctx = TraceChunkIngestContext {
301            pipeline_handler,
302            pipeline,
303            pipeline_params,
304            table_name: &table_name,
305            is_trace_v1_model,
306        };
307        let mut ingest_state = TraceIngestState {
308            aux_data: TraceAuxData::default(),
309            outcome: TraceIngestOutcome::default(),
310            failure_messages: Vec::new(),
311        };
312
313        for group in groups {
314            let chunks = group
315                .spans
316                .into_iter()
317                .chunks(TRACE_INGEST_CHUNK_SIZE)
318                .into_iter()
319                .map(|chunk| chunk.collect::<Vec<_>>())
320                .collect::<Vec<_>>();
321            for chunk in chunks {
322                self.ingest_trace_chunk(&ingest_ctx, chunk, main_ctx.clone(), &mut ingest_state)
323                    .await?;
324            }
325        }
326
327        OTLP_TRACES_ROWS.inc_by(ingest_state.outcome.accepted_spans as u64);
328
329        if !ingest_state.aux_data.is_empty() {
330            // Auxiliary trace tables are derived from spans whose main-table
331            // writes are already confirmed, so they never create new accepted
332            // spans and they do not affect rejected span counts.
333            let (aux_requests, _) = otlp::trace::to_grpc_insert_requests_for_aux_tables(
334                std::mem::take(&mut ingest_state.aux_data),
335                ingest_ctx.pipeline,
336                ingest_ctx.table_name,
337            )?;
338
339            if !aux_requests.inserts.is_empty() {
340                match self
341                    .insert_trace_requests(aux_requests, ingest_ctx.is_trace_v1_model, ctx)
342                    .await
343                {
344                    Ok(output) => {
345                        Self::add_trace_write_cost(&mut ingest_state.outcome, output.meta.cost);
346                    }
347                    Err(err) => {
348                        Self::push_trace_failure_message(
349                            &mut ingest_state.failure_messages,
350                            "aux_table_update_failed",
351                            format!(
352                                "Auxiliary trace tables were not fully updated ({})",
353                                err.status_code().as_ref()
354                            ),
355                        );
356                    }
357                }
358            }
359        }
360
361        ingest_state.outcome.error_message = Self::finish_trace_failure_message(
362            ingest_state.outcome.accepted_spans,
363            ingest_state.outcome.rejected_spans,
364            ingest_state.failure_messages,
365        );
366
367        Ok(ingest_state.outcome)
368    }
369
370    /// Ingest one owned trace chunk so successful spans can be moved into the
371    /// accepted set without extra cloning.
372    async fn ingest_trace_chunk(
373        &self,
374        ingest_ctx: &TraceChunkIngestContext<'_>,
375        chunk: Vec<TraceSpan>,
376        ctx: QueryContextRef,
377        ingest_state: &mut TraceIngestState,
378    ) -> ServerResult<()> {
379        // Try the fast path first so healthy batches keep their original
380        // throughput and write amplification stays low.
381        let (requests, chunk_rows) = otlp::trace::to_grpc_insert_requests_from_spans(
382            &chunk,
383            ingest_ctx.pipeline,
384            ingest_ctx.pipeline_params,
385            ingest_ctx.table_name,
386            &ctx,
387            ingest_ctx.pipeline_handler.clone(),
388        )?;
389
390        match self
391            .insert_trace_requests(requests, ingest_ctx.is_trace_v1_model, ctx.clone())
392            .await
393        {
394            Ok(output) => {
395                Self::add_trace_write_cost(&mut ingest_state.outcome, output.meta.cost);
396                ingest_state.outcome.accepted_spans += chunk_rows;
397                for span in &chunk {
398                    ingest_state.aux_data.observe_span(span);
399                }
400            }
401            Err(err) => match Self::classify_trace_chunk_failure(err.status_code()) {
402                ChunkFailureReaction::RetryPerSpan => {
403                    Self::push_trace_failure_message(
404                        &mut ingest_state.failure_messages,
405                        ChunkFailureReaction::RetryPerSpan.as_metric_label(),
406                        format!("Chunk fallback triggered by {}", err.status_code().as_ref()),
407                    );
408                    // Only deterministic failures are retried span by span.
409                    // This includes schemaless table or column creation paths for
410                    // trace ingestion. Ambiguous failures are handled below
411                    // without retrying because the chunk may already have been
412                    // ingested.
413                    self.ingest_trace_chunk_span_by_span(
414                        ingest_ctx,
415                        chunk,
416                        ctx.clone(),
417                        ingest_state,
418                    )
419                    .await?;
420                }
421                ChunkFailureReaction::DiscardChunk => {
422                    ingest_state.outcome.rejected_spans += chunk.len();
423                    Self::push_trace_failure_message(
424                        &mut ingest_state.failure_messages,
425                        ChunkFailureReaction::DiscardChunk.as_metric_label(),
426                        format!(
427                            "Discarded {} spans after ambiguous chunk failure ({})",
428                            chunk.len(),
429                            err.status_code().as_ref()
430                        ),
431                    );
432                    // TODO(shuiyisong): Add an idempotent retry-safe recovery path for
433                    // ambiguous chunk failures such as timeout-like errors.
434                }
435                // Retryable or ambiguous failures must fail the request instead of
436                // becoming partial success. This path is not retry-safe because the
437                // chunk may already have been committed before the error surfaced.
438                ChunkFailureReaction::Propagate => {
439                    Self::push_trace_failure_message(
440                        &mut ingest_state.failure_messages,
441                        ChunkFailureReaction::Propagate.as_metric_label(),
442                        format!(
443                            "Propagating retryable chunk failure ({})",
444                            err.status_code().as_ref()
445                        ),
446                    );
447                    return Err(err);
448                }
449            },
450        }
451
452        Ok(())
453    }
454
455    /// Retry spans one by one only after a deterministic chunk failure.
456    async fn ingest_trace_chunk_span_by_span(
457        &self,
458        ingest_ctx: &TraceChunkIngestContext<'_>,
459        chunk: Vec<TraceSpan>,
460        ctx: QueryContextRef,
461        ingest_state: &mut TraceIngestState,
462    ) -> ServerResult<()> {
463        for span in chunk {
464            let (requests, rows) = otlp::trace::to_grpc_insert_requests_from_spans(
465                std::slice::from_ref(&span),
466                ingest_ctx.pipeline,
467                ingest_ctx.pipeline_params,
468                ingest_ctx.table_name,
469                &ctx,
470                ingest_ctx.pipeline_handler.clone(),
471            )?;
472
473            match self
474                .insert_trace_requests(requests, ingest_ctx.is_trace_v1_model, ctx.clone())
475                .await
476            {
477                Ok(output) => {
478                    Self::add_trace_write_cost(&mut ingest_state.outcome, output.meta.cost);
479                    ingest_state.outcome.accepted_spans += rows;
480                    ingest_state.aux_data.observe_span(&span);
481                }
482                Err(err) => {
483                    if Self::should_propagate_trace_span_failure(err.status_code()) {
484                        Self::push_trace_failure_message(
485                            &mut ingest_state.failure_messages,
486                            ChunkFailureReaction::Propagate.as_metric_label(),
487                            format!(
488                                "Propagating retryable span failure for {}:{} ({})",
489                                span.trace_id,
490                                span.span_id,
491                                err.status_code().as_ref()
492                            ),
493                        );
494                        return Err(err);
495                    }
496
497                    ingest_state.outcome.rejected_spans += 1;
498                    Self::push_trace_failure_message(
499                        &mut ingest_state.failure_messages,
500                        "span_rejected",
501                        format!(
502                            "Rejected span {}:{} ({})",
503                            span.trace_id,
504                            span.span_id,
505                            err.status_code().as_ref()
506                        ),
507                    );
508                }
509            }
510        }
511
512        Ok(())
513    }
514
515    /// Reconcile and insert one trace request batch.
516    async fn insert_trace_requests(
517        &self,
518        mut requests: RowInsertRequests,
519        is_trace_v1_model: bool,
520        ctx: QueryContextRef,
521    ) -> ServerResult<Output> {
522        if is_trace_v1_model {
523            self.reconcile_trace_column_types(&mut requests, &ctx)
524                .await?;
525            self.handle_trace_inserts(requests, ctx)
526                .await
527                .map_err(BoxedError::new)
528                .context(error::ExecuteGrpcQuerySnafu)
529        } else {
530            self.handle_log_inserts(requests, ctx)
531                .await
532                .map_err(BoxedError::new)
533                .context(error::ExecuteGrpcQuerySnafu)
534        }
535    }
536
537    fn classify_trace_chunk_failure(status: StatusCode) -> ChunkFailureReaction {
538        match status {
539            StatusCode::InvalidArguments
540            | StatusCode::InvalidSyntax
541            | StatusCode::Unsupported
542            | StatusCode::TableNotFound
543            | StatusCode::TableColumnNotFound => ChunkFailureReaction::RetryPerSpan,
544            StatusCode::DatabaseNotFound => ChunkFailureReaction::DiscardChunk,
545            StatusCode::Cancelled | StatusCode::DeadlineExceeded => ChunkFailureReaction::Propagate,
546            _ if status.is_retryable() => ChunkFailureReaction::Propagate,
547            _ => ChunkFailureReaction::DiscardChunk,
548        }
549    }
550
551    fn should_propagate_trace_span_failure(status: StatusCode) -> bool {
552        matches!(
553            Self::classify_trace_chunk_failure(status),
554            ChunkFailureReaction::Propagate
555        )
556    }
557
    /// Adds the cost reported by one successful write to the running
    /// `write_cost` total of `outcome`.
    fn add_trace_write_cost(outcome: &mut TraceIngestOutcome, cost: usize) {
        outcome.write_cost += cost;
    }
561
562    fn push_trace_failure_message(messages: &mut Vec<String>, label: &str, message: String) {
563        OTLP_TRACES_FAILURE_COUNT.with_label_values(&[label]).inc();
564
565        if messages.len() < TRACE_FAILURE_MESSAGE_LIMIT {
566            messages.push(message);
567        } else if messages.len() == TRACE_FAILURE_MESSAGE_LIMIT {
568            tracing::debug!(
569                label,
570                limit = TRACE_FAILURE_MESSAGE_LIMIT,
571                "Trace ingest failure message limit reached; suppressing additional failure details"
572            );
573        }
574    }
575
576    fn finish_trace_failure_message(
577        accepted_spans: usize,
578        rejected_spans: usize,
579        messages: Vec<String>,
580    ) -> Option<String> {
581        if rejected_spans == 0 && messages.is_empty() {
582            return None;
583        }
584
585        let mut summary = format!(
586            "Accepted {} spans, rejected {} spans",
587            accepted_spans, rejected_spans
588        );
589
590        if !messages.is_empty() {
591            summary.push_str(": ");
592            summary.push_str(&messages.join("; "));
593        }
594
595        Some(summary)
596    }
597
598    /// Widen existing trace table columns to Float64 before request rewrite.
599    async fn alter_trace_table_columns_to_float64(
600        &self,
601        ctx: &QueryContextRef,
602        table_name: &str,
603        column_names: &[String],
604    ) -> ServerResult<()> {
605        let catalog_name = ctx.current_catalog().to_string();
606        let schema_name = ctx.current_schema();
607        let alter_expr = AlterTableExpr {
608            catalog_name: catalog_name.clone(),
609            schema_name: schema_name.clone(),
610            table_name: table_name.to_string(),
611            kind: Some(Kind::ModifyColumnTypes(ModifyColumnTypes {
612                modify_column_types: column_names
613                    .iter()
614                    .map(|column_name| ModifyColumnType {
615                        column_name: column_name.clone(),
616                        target_type: ColumnDataType::Float64 as i32,
617                        target_type_extension: None,
618                    })
619                    .collect(),
620            })),
621        };
622
623        if let Err(err) = self
624            .statement_executor
625            .alter_table_inner(alter_expr, ctx.clone())
626            .await
627        {
628            let table = self
629                .catalog_manager
630                .table(&catalog_name, &schema_name, table_name, None)
631                .await
632                .map_err(servers::error::Error::from)?;
633            let alter_already_applied = table
634                .map(|table| {
635                    let table_schema = table.schema();
636                    column_names.iter().all(|column_name| {
637                        table_schema
638                            .column_schema_by_name(column_name)
639                            .and_then(|table_col| {
640                                ColumnDataTypeWrapper::try_from(table_col.data_type.clone())
641                                    .ok()
642                                    .map(|wrapper| wrapper.datatype())
643                            })
644                            == Some(ColumnDataType::Float64)
645                    })
646                })
647                .unwrap_or(false);
648
649            if alter_already_applied {
650                return Ok(());
651            }
652
653            warn!(
654                table_name,
655                columns = ?column_names,
656                error = %err,
657                "failed to widen trace columns before insert"
658            );
659
660            return Err(wrap_trace_alter_failure(err));
661        }
662
663        Ok(())
664    }
665
666    /// Coerce request column types and values to match the existing table schema
667    /// for compatible type pairs. Existing table schema wins when present;
668    /// otherwise the full request batch decides a stable target type.
669    async fn reconcile_trace_column_types(
670        &self,
671        requests: &mut RowInsertRequests,
672        ctx: &QueryContextRef,
673    ) -> ServerResult<()> {
674        let catalog = ctx.current_catalog();
675        let schema = ctx.current_schema();
676
677        for req in &mut requests.inserts {
678            let table = self
679                .catalog_manager
680                .table(catalog, &schema, &req.table_name, None)
681                .await?;
682
683            let Some(rows) = req.rows.as_mut() else {
684                continue;
685            };
686
687            let table_schema = table.map(|table| table.schema());
688            let mut pending_rewrites = Vec::new();
689            let mut pending_alter_columns = Vec::new();
690
691            for (col_idx, col_schema) in rows.schema.iter().enumerate() {
692                let Some(current_type) = ColumnDataType::try_from(col_schema.datatype).ok() else {
693                    continue;
694                };
695
696                let mut observed_types = Vec::new();
697                push_observed_trace_type(&mut observed_types, current_type);
698
699                // Scan the full request first so the final type decision is not affected
700                // by row order inside the batch.
701                for row in &rows.rows {
702                    let Some(value) = row
703                        .values
704                        .get(col_idx)
705                        .and_then(|value| value.value_data.as_ref())
706                    else {
707                        continue;
708                    };
709
710                    let Some(value_type) = trace_value_datatype(value) else {
711                        continue;
712                    };
713                    push_observed_trace_type(&mut observed_types, value_type);
714                }
715
716                let existing_type = table_schema
717                    .as_ref()
718                    .and_then(|schema| schema.column_schema_by_name(&col_schema.column_name))
719                    .and_then(|table_col| {
720                        ColumnDataTypeWrapper::try_from(table_col.data_type.clone())
721                            .ok()
722                            .map(|wrapper| wrapper.datatype())
723                    });
724                let fixed_type = trace_semconv_fixed_type(&col_schema.column_name);
725
726                if !observed_types
727                    .iter()
728                    .copied()
729                    .any(is_trace_reconcile_candidate_type)
730                    && existing_type
731                        .map(|datatype| !is_trace_reconcile_candidate_type(datatype))
732                        .unwrap_or(true)
733                    && fixed_type.is_none()
734                {
735                    continue;
736                }
737
738                // Decide the final type once per column, then rewrite all affected cells
739                // together in one row pass below.
740                let Some(decision) = choose_trace_reconcile_decision(
741                    &col_schema.column_name,
742                    &observed_types,
743                    existing_type,
744                )
745                .map_err(|_| {
746                    enrich_trace_reconcile_error(
747                        &req.table_name,
748                        &col_schema.column_name,
749                        &observed_types,
750                        existing_type,
751                        fixed_type,
752                    )
753                })?
754                else {
755                    continue;
756                };
757                let target_type = decision.target_type();
758
759                if !decision.requires_alter()
760                    && observed_types
761                        .iter()
762                        .all(|observed| *observed == target_type)
763                    && col_schema.datatype == target_type as i32
764                {
765                    continue;
766                }
767
768                if decision.requires_alter()
769                    && !pending_alter_columns.contains(&col_schema.column_name)
770                {
771                    pending_alter_columns.push(col_schema.column_name.clone());
772                }
773
774                pending_rewrites.push(PendingTraceColumnRewrite {
775                    col_idx,
776                    target_type,
777                    column_name: col_schema.column_name.clone(),
778                });
779            }
780
781            if pending_rewrites.is_empty() {
782                continue;
783            }
784
785            validate_trace_column_rewrites(&rows.rows, &pending_rewrites, &req.table_name)?;
786
787            if !pending_alter_columns.is_empty() {
788                self.alter_trace_table_columns_to_float64(
789                    ctx,
790                    &req.table_name,
791                    &pending_alter_columns,
792                )
793                .await?;
794            }
795
796            // Update schema metadata before mutating row values so both stay in sync.
797            for pending_rewrite in &pending_rewrites {
798                rows.schema[pending_rewrite.col_idx].datatype = pending_rewrite.target_type as i32;
799            }
800
801            // Apply all pending column rewrites in one row pass.
802            for row in &mut rows.rows {
803                for pending_rewrite in &pending_rewrites {
804                    let Some(value) = row.values.get_mut(pending_rewrite.col_idx) else {
805                        continue;
806                    };
807                    let Some(request_type) =
808                        value.value_data.as_ref().and_then(trace_value_datatype)
809                    else {
810                        continue;
811                    };
812                    if request_type == pending_rewrite.target_type {
813                        continue;
814                    }
815
816                    value.value_data = coerce_value_data(
817                        &value.value_data,
818                        pending_rewrite.target_type,
819                        request_type,
820                    )
821                    .map_err(|_| {
822                        error::InvalidParameterSnafu {
823                            reason: format!(
824                                "failed to coerce trace column '{}' in table '{}' from {:?} to {:?}",
825                                pending_rewrite.column_name,
826                                req.table_name,
827                                request_type,
828                                pending_rewrite.target_type
829                            ),
830                        }
831                        .build()
832                    })?;
833                }
834            }
835        }
836
837        Ok(())
838    }
839}
840
841/// Preserve the original alter failure status so chunk retry behavior stays correct.
842fn wrap_trace_alter_failure<E>(err: E) -> servers::error::Error
843where
844    E: ErrorExt + Send + Sync + 'static,
845{
846    error::ExecuteGrpcQuerySnafu.into_error(BoxedError::new(err))
847}
848
849/// Derives `trace.conventions` from the request's resource/scope `schema_url`s.
850/// A single distinct non-empty value is concrete; multiple distinct values are
851/// `mixed`; none is `unknown`. `schema_url` is row-level in OTLP, so the
852/// table-level value is best-effort per the RFC conflict rule.
853fn trace_conventions(request: &ExportTraceServiceRequest) -> String {
854    let mut seen: Option<&str> = None;
855    let mut mixed = false;
856
857    for resource_spans in &request.resource_spans {
858        let urls = std::iter::once(resource_spans.schema_url.as_str()).chain(
859            resource_spans
860                .scope_spans
861                .iter()
862                .map(|s| s.schema_url.as_str()),
863        );
864        for url in urls {
865            if url.is_empty() {
866                continue;
867            }
868            match seen {
869                None => seen = Some(url),
870                Some(prev) if prev == url => {}
871                Some(_) => {
872                    mixed = true;
873                    break;
874                }
875            }
876        }
877        if mixed {
878            break;
879        }
880    }
881
882    if mixed {
883        SEMANTIC_VALUE_MIXED.to_string()
884    } else {
885        seen.map(str::to_string)
886            .unwrap_or_else(|| SEMANTIC_VALUE_UNKNOWN.to_string())
887    }
888}
889
#[cfg(test)]
mod tests {
    use common_error::ext::ErrorExt;
    use common_error::status_code::StatusCode;
    use opentelemetry_proto::tonic::trace::v1::{ResourceSpans, ScopeSpans};
    use servers::query_handler::TraceIngestOutcome;

    use super::{
        ChunkFailureReaction, ExportTraceServiceRequest, Instance, trace_conventions,
        wrap_trace_alter_failure,
    };
    use crate::metrics::OTLP_TRACES_FAILURE_COUNT;

    /// Builds a `ResourceSpans` with the given resource-level schema URL and one
    /// otherwise-default `ScopeSpans` per entry in `scope_urls`.
    fn resource_spans(resource_url: &str, scope_urls: &[&str]) -> ResourceSpans {
        let scope_spans = scope_urls
            .iter()
            .map(|scope_url| ScopeSpans {
                schema_url: (*scope_url).to_owned(),
                ..Default::default()
            })
            .collect();

        ResourceSpans {
            schema_url: resource_url.to_owned(),
            scope_spans,
            ..Default::default()
        }
    }

    /// Each listed status code maps to the expected chunk-level reaction.
    #[test]
    fn test_classify_trace_chunk_failure() {
        let expectations = [
            (StatusCode::InvalidArguments, ChunkFailureReaction::RetryPerSpan),
            (StatusCode::InvalidSyntax, ChunkFailureReaction::RetryPerSpan),
            (StatusCode::Unsupported, ChunkFailureReaction::RetryPerSpan),
            (StatusCode::TableColumnNotFound, ChunkFailureReaction::RetryPerSpan),
            (StatusCode::TableNotFound, ChunkFailureReaction::RetryPerSpan),
            (StatusCode::DatabaseNotFound, ChunkFailureReaction::DiscardChunk),
            (StatusCode::DeadlineExceeded, ChunkFailureReaction::Propagate),
            (StatusCode::Cancelled, ChunkFailureReaction::Propagate),
            (StatusCode::StorageUnavailable, ChunkFailureReaction::Propagate),
            (StatusCode::Internal, ChunkFailureReaction::Propagate),
            (StatusCode::RegionNotReady, ChunkFailureReaction::Propagate),
            (StatusCode::TableUnavailable, ChunkFailureReaction::Propagate),
            (StatusCode::RegionBusy, ChunkFailureReaction::Propagate),
            (StatusCode::RuntimeResourcesExhausted, ChunkFailureReaction::Propagate),
        ];

        for (status, expected) in expectations {
            assert_eq!(
                Instance::classify_trace_chunk_failure(status),
                expected,
                "unexpected reaction for {status:?}"
            );
        }
    }

    /// A status code without a dedicated rule falls back to discarding the chunk.
    #[test]
    fn test_classify_trace_chunk_failure_defaults_to_discard() {
        let reaction = Instance::classify_trace_chunk_failure(StatusCode::Unknown);
        assert_eq!(reaction, ChunkFailureReaction::DiscardChunk);
    }

    /// Per-span failures propagate for the two listed codes but not for invalid arguments.
    #[test]
    fn test_classify_trace_span_failure() {
        let expectations = [
            (StatusCode::DeadlineExceeded, true),
            (StatusCode::StorageUnavailable, true),
            (StatusCode::InvalidArguments, false),
        ];

        for (status, should_propagate) in expectations {
            assert_eq!(
                Instance::should_propagate_trace_span_failure(status),
                should_propagate,
                "unexpected propagation decision for {status:?}"
            );
        }
    }

    /// Write costs accumulate across calls.
    #[test]
    fn test_add_trace_write_cost() {
        let mut outcome = TraceIngestOutcome::default();
        for cost in [3, 5] {
            Instance::add_trace_write_cost(&mut outcome, cost);
        }
        assert_eq!(outcome.write_cost, 8);
    }

    /// A summary is produced when spans were rejected, and none when all were accepted.
    #[test]
    fn test_finish_trace_failure_message() {
        let details = vec!["Rejected span trace:span (InvalidArguments)".to_string()];
        let message = Instance::finish_trace_failure_message(3, 2, details).unwrap();
        for fragment in ["Accepted 3 spans, rejected 2 spans", "Rejected span trace:span"] {
            assert!(message.contains(fragment));
        }

        assert!(Instance::finish_trace_failure_message(2, 0, Vec::new()).is_none());
    }

    /// Rejections without detail messages still yield the bare summary line.
    #[test]
    fn test_finish_trace_failure_message_without_detail_messages() {
        let message = Instance::finish_trace_failure_message(0, 2, Vec::new());
        assert_eq!(
            message.as_deref(),
            Some("Accepted 0 spans, rejected 2 spans")
        );
    }

    /// Recording a failure message bumps the failure counter for its label by one.
    #[test]
    fn test_push_trace_failure_message_increments_labeled_counter() {
        let label = "retry_per_span_counter_test";
        // Re-read the labeled counter on every call, as the original test did.
        let failure_count = || OTLP_TRACES_FAILURE_COUNT.with_label_values(&[label]).get();

        let before = failure_count();
        let mut messages = Vec::new();
        Instance::push_trace_failure_message(
            &mut messages,
            label,
            "Chunk fallback triggered by InvalidArguments".to_string(),
        );

        assert_eq!(messages.len(), 1);
        assert_eq!(failure_count(), before + 1);
    }

    /// Only the first four of five pushed messages are retained.
    #[test]
    fn test_push_trace_failure_message_caps_recorded_messages() {
        let label = "retry_per_span_limit_test";
        let mut messages = Vec::new();

        for idx in 0..5 {
            Instance::push_trace_failure_message(&mut messages, label, format!("failure-{idx}"));
        }

        let expected: Vec<String> = (0..4).map(|idx| format!("failure-{idx}")).collect();
        assert_eq!(messages.len(), 4);
        assert_eq!(messages, expected);
    }

    /// The wrapped alter failure reports the status code of the error it wraps.
    #[test]
    fn test_wrap_trace_alter_failure_preserves_status_code() {
        let source = servers::error::TableNotFoundSnafu {
            catalog: "greptime".to_string(),
            schema: "public".to_string(),
            table: "trace_type_missing".to_string(),
        }
        .build();

        let wrapped = wrap_trace_alter_failure(source);
        assert_eq!(wrapped.status_code(), StatusCode::TableNotFound);
    }

    /// Covers the unknown, single-URL (scope-level and resource-level) and mixed outcomes.
    #[test]
    fn test_trace_conventions() {
        let unknown = ExportTraceServiceRequest::default();
        assert_eq!(trace_conventions(&unknown), "unknown");

        let url = "https://opentelemetry.io/schemas/1.27.0";
        let cases = [
            // Same URL repeated on two scopes.
            (vec![resource_spans("", &[url, url])], url),
            // URL only at the resource level.
            (vec![resource_spans(url, &[""])], url),
            // Two different scope URLs.
            (
                vec![resource_spans(
                    "",
                    &[url, "https://opentelemetry.io/schemas/1.30.0"],
                )],
                "mixed",
            ),
        ];

        for (spans, expected) in cases {
            let request = ExportTraceServiceRequest {
                resource_spans: spans,
            };
            assert_eq!(trace_conventions(&request), expected);
        }
    }
}