Skip to main content

frontend/instance/
otlp.rs

1// Copyright 2023 Greptime Team
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7//     http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15use std::sync::Arc;
16
17use api::helper::ColumnDataTypeWrapper;
18use api::v1::alter_table_expr::Kind;
19use api::v1::{
20    AlterTableExpr, ColumnDataType, ModifyColumnType, ModifyColumnTypes, RowInsertRequests,
21};
22use async_trait::async_trait;
23use auth::{PermissionChecker, PermissionCheckerRef, PermissionReq};
24use client::Output;
25use common_error::ext::{BoxedError, ErrorExt};
26use common_error::status_code::StatusCode;
27use common_query::prelude::GREPTIME_PHYSICAL_TABLE;
28use common_telemetry::{tracing, warn};
29use itertools::Itertools;
30use opentelemetry_proto::tonic::collector::logs::v1::ExportLogsServiceRequest;
31use opentelemetry_proto::tonic::collector::trace::v1::ExportTraceServiceRequest;
32use otel_arrow_rust::proto::opentelemetry::collector::metrics::v1::ExportMetricsServiceRequest;
33use pipeline::{GreptimePipelineParams, PipelineWay};
34use servers::error::{self, AuthSnafu, Result as ServerResult};
35use servers::http::prom_store::PHYSICAL_TABLE_PARAM;
36use servers::interceptor::{OpenTelemetryProtocolInterceptor, OpenTelemetryProtocolInterceptorRef};
37use servers::otlp;
38use servers::otlp::coerce::{coerce_value_data, trace_value_datatype};
39use servers::otlp::trace::TraceAuxData;
40use servers::otlp::trace::span::{TraceSpan, TraceSpanGroup};
41use servers::query_handler::{
42    OpenTelemetryProtocolHandler, PipelineHandlerRef, TraceIngestOutcome,
43};
44use session::context::QueryContextRef;
45use snafu::{IntoError, ResultExt};
46use table::requests::{
47    OTLP_METRIC_COMPAT_KEY, OTLP_METRIC_COMPAT_PROM, SEMANTIC_PER_TABLE_INDEX_KEY,
48    SEMANTIC_PIPELINE, SEMANTIC_SIGNAL_TYPE, SEMANTIC_SOURCE, SEMANTIC_TRACE_CONVENTIONS,
49    SEMANTIC_VALUE_MIXED, SEMANTIC_VALUE_UNKNOWN, SIGNAL_TYPE_LOG, SIGNAL_TYPE_METRIC,
50    SIGNAL_TYPE_TRACE, SOURCE_OPENTELEMETRY, TABLE_DATA_MODEL_TRACE_V1,
51};
52
53use crate::instance::Instance;
54use crate::instance::otlp::trace_semconv::trace_semconv_fixed_type;
55use crate::instance::otlp::trace_types::{
56    PendingTraceColumnRewrite, choose_trace_reconcile_decision, enrich_trace_reconcile_error,
57    is_trace_reconcile_candidate_type, push_observed_trace_type, validate_trace_column_rewrites,
58};
59use crate::metrics::{
60    OTLP_LOGS_ROWS, OTLP_METRICS_ROWS, OTLP_TRACES_FAILURE_COUNT, OTLP_TRACES_ROWS,
61};
62
63pub mod trace_semconv;
64pub mod trace_types;
65
/// Number of spans grouped into one chunk before the chunk is converted and
/// inserted as a unit.
const TRACE_INGEST_CHUNK_SIZE: usize = 64;
/// Maximum number of failure detail messages retained per trace ingest call;
/// further details are dropped.
const TRACE_FAILURE_MESSAGE_LIMIT: usize = 4;
68
/// How trace ingestion reacts after a chunk-level insert has failed.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum ChunkFailureReaction {
    /// Re-submit the spans of the failed chunk one at a time.
    RetryPerSpan,
    /// Count every span of the chunk as rejected and keep going.
    DiscardChunk,
    /// Stop processing and hand the error back to the caller.
    Propagate,
}

impl ChunkFailureReaction {
    /// Returns the label value under which this reaction is counted in the
    /// trace failure metric.
    fn as_metric_label(self) -> &'static str {
        match self {
            ChunkFailureReaction::Propagate => "propagate_failure",
            ChunkFailureReaction::DiscardChunk => "discard_chunk",
            ChunkFailureReaction::RetryPerSpan => "retry_per_span",
        }
    }
}
85
/// Read-only, per-request inputs shared by every trace chunk write.
struct TraceChunkIngestContext<'a> {
    /// Handler forwarded to the span-to-insert-request conversion.
    pipeline_handler: PipelineHandlerRef,
    /// Pipeline selection used when converting spans into insert requests.
    pipeline: &'a PipelineWay,
    /// Pipeline parameters forwarded to the conversion.
    pipeline_params: &'a GreptimePipelineParams,
    /// Name of the trace table the spans are written to.
    table_name: &'a str,
    /// `true` when `pipeline` is `PipelineWay::OtlpTraceDirectV1`.
    is_trace_v1_model: bool,
}
93
/// Mutable bookkeeping accumulated while one trace request is ingested.
struct TraceIngestState {
    /// Data observed from accepted spans; later converted into the auxiliary
    /// table insert requests.
    aux_data: TraceAuxData,
    /// Running result: accepted/rejected span counts, write cost and the final
    /// error message.
    outcome: TraceIngestOutcome,
    /// Failure details collected through `push_trace_failure_message`.
    failure_messages: Vec<String>,
}
99
100#[async_trait]
101impl OpenTelemetryProtocolHandler for Instance {
102    #[tracing::instrument(skip_all)]
103    async fn metrics(
104        &self,
105        request: ExportMetricsServiceRequest,
106        ctx: QueryContextRef,
107    ) -> ServerResult<Output> {
108        self.plugins
109            .get::<PermissionCheckerRef>()
110            .as_ref()
111            .check_permission(ctx.current_user(), PermissionReq::Otlp)
112            .context(AuthSnafu)?;
113
114        let interceptor_ref = self
115            .plugins
116            .get::<OpenTelemetryProtocolInterceptorRef<servers::error::Error>>();
117        interceptor_ref.pre_execute(ctx.clone())?;
118
119        let input_names = request
120            .resource_metrics
121            .iter()
122            .flat_map(|r| r.scope_metrics.iter())
123            .flat_map(|s| s.metrics.iter().map(|m| &m.name))
124            .collect::<Vec<_>>();
125
126        // See [`OtlpMetricCtx`] for details
127        let is_legacy = self.check_otlp_legacy(&input_names, ctx.clone()).await?;
128
129        let mut metric_ctx = ctx
130            .protocol_ctx()
131            .get_otlp_metric_ctx()
132            .cloned()
133            .unwrap_or_default();
134        metric_ctx.is_legacy = is_legacy;
135
136        let (requests, rows, semantic_index) =
137            otlp::metrics::to_grpc_insert_requests(request, &mut metric_ctx)?;
138        OTLP_METRICS_ROWS.inc_by(rows as u64);
139
140        let ctx = {
141            let mut c = (*ctx).clone();
142            c.set_extension(SEMANTIC_SIGNAL_TYPE, SIGNAL_TYPE_METRIC);
143            c.set_extension(SEMANTIC_SOURCE, SOURCE_OPENTELEMETRY);
144            // Per-table metric specifics + resource/scope lineage ride this
145            // internal channel; the auto-create path folds them per table name.
146            if let Some(index) = semantic_index.encode() {
147                c.set_extension(SEMANTIC_PER_TABLE_INDEX_KEY, index);
148            }
149            if !is_legacy {
150                c.set_extension(OTLP_METRIC_COMPAT_KEY, OTLP_METRIC_COMPAT_PROM.to_string());
151            }
152            Arc::new(c)
153        };
154
155        // If the user uses the legacy path, it is by default without metric engine.
156        if metric_ctx.is_legacy || !metric_ctx.with_metric_engine {
157            self.handle_row_inserts(requests, ctx, false, false)
158                .await
159                .map_err(BoxedError::new)
160                .context(error::ExecuteGrpcQuerySnafu)
161        } else {
162            let physical_table = ctx
163                .extension(PHYSICAL_TABLE_PARAM)
164                .unwrap_or(GREPTIME_PHYSICAL_TABLE)
165                .to_string();
166            self.handle_metric_row_inserts(requests, ctx, physical_table.clone())
167                .await
168                .map_err(BoxedError::new)
169                .context(error::ExecuteGrpcQuerySnafu)
170        }
171    }
172
173    #[tracing::instrument(skip_all)]
174    async fn traces(
175        &self,
176        pipeline_handler: PipelineHandlerRef,
177        request: ExportTraceServiceRequest,
178        pipeline: PipelineWay,
179        pipeline_params: GreptimePipelineParams,
180        table_name: String,
181        ctx: QueryContextRef,
182    ) -> ServerResult<TraceIngestOutcome> {
183        self.plugins
184            .get::<PermissionCheckerRef>()
185            .as_ref()
186            .check_permission(ctx.current_user(), PermissionReq::Otlp)
187            .context(AuthSnafu)?;
188
189        let interceptor_ref = self
190            .plugins
191            .get::<OpenTelemetryProtocolInterceptorRef<servers::error::Error>>();
192        interceptor_ref.pre_execute(ctx.clone())?;
193
194        // `schema_url` is consumed by `parse`, so derive conventions first.
195        let conventions = trace_conventions(&request);
196        let spans = otlp::trace::span::parse(request);
197        self.ingest_trace_spans(
198            pipeline_handler,
199            &pipeline,
200            &pipeline_params,
201            table_name,
202            spans,
203            &conventions,
204            ctx,
205        )
206        .await
207    }
208
209    #[tracing::instrument(skip_all)]
210    async fn logs(
211        &self,
212        pipeline_handler: PipelineHandlerRef,
213        request: ExportLogsServiceRequest,
214        pipeline: PipelineWay,
215        pipeline_params: GreptimePipelineParams,
216        table_name: String,
217        ctx: QueryContextRef,
218    ) -> ServerResult<Vec<Output>> {
219        self.plugins
220            .get::<PermissionCheckerRef>()
221            .as_ref()
222            .check_permission(ctx.current_user(), PermissionReq::Otlp)
223            .context(AuthSnafu)?;
224
225        let interceptor_ref = self
226            .plugins
227            .get::<OpenTelemetryProtocolInterceptorRef<servers::error::Error>>();
228        interceptor_ref.pre_execute(ctx.clone())?;
229
230        // `as_req_iter` clones this ctx into each `temp_ctx`, so identity set here
231        // reaches the context that drives table auto-create.
232        let ctx = {
233            let mut c = (*ctx).clone();
234            c.set_extension(SEMANTIC_SIGNAL_TYPE, SIGNAL_TYPE_LOG);
235            c.set_extension(SEMANTIC_SOURCE, SOURCE_OPENTELEMETRY);
236            Arc::new(c)
237        };
238
239        let opt_req = otlp::logs::to_grpc_insert_requests(
240            request,
241            pipeline,
242            pipeline_params,
243            table_name,
244            &ctx,
245            pipeline_handler,
246        )
247        .await?;
248
249        let mut outputs = vec![];
250
251        for (temp_ctx, requests) in opt_req.as_req_iter(ctx) {
252            let cnt = requests
253                .inserts
254                .iter()
255                .filter_map(|r| r.rows.as_ref().map(|r| r.rows.len()))
256                .sum::<usize>();
257
258            let o = self
259                .handle_log_inserts(requests, temp_ctx)
260                .await
261                .inspect(|_| OTLP_LOGS_ROWS.inc_by(cnt as u64))
262                .map_err(BoxedError::new)
263                .context(error::ExecuteGrpcQuerySnafu)?;
264            outputs.push(o);
265        }
266
267        Ok(outputs)
268    }
269}
270
271impl Instance {
272    /// Ingest OTLP trace spans with chunk-level writes and span-level fallback on
273    /// deterministic chunk failures.
274    #[allow(clippy::too_many_arguments)]
275    async fn ingest_trace_spans(
276        &self,
277        pipeline_handler: PipelineHandlerRef,
278        pipeline: &PipelineWay,
279        pipeline_params: &GreptimePipelineParams,
280        table_name: String,
281        groups: Vec<TraceSpanGroup>,
282        conventions: &str,
283        ctx: QueryContextRef,
284    ) -> ServerResult<TraceIngestOutcome> {
285        let is_trace_v1_model = matches!(pipeline, PipelineWay::OtlpTraceDirectV1);
286
287        // Only the main span table gets the identity; the derived `_services` /
288        // `_operations` lookup tables keep the unstamped `ctx`.
289        let main_ctx = {
290            let mut c = (*ctx).clone();
291            c.set_extension(SEMANTIC_SIGNAL_TYPE, SIGNAL_TYPE_TRACE);
292            c.set_extension(SEMANTIC_SOURCE, SOURCE_OPENTELEMETRY);
293            if is_trace_v1_model {
294                c.set_extension(SEMANTIC_PIPELINE, TABLE_DATA_MODEL_TRACE_V1);
295                c.set_extension(SEMANTIC_TRACE_CONVENTIONS, conventions);
296            }
297            Arc::new(c)
298        };
299
300        let ingest_ctx = TraceChunkIngestContext {
301            pipeline_handler,
302            pipeline,
303            pipeline_params,
304            table_name: &table_name,
305            is_trace_v1_model,
306        };
307        let mut ingest_state = TraceIngestState {
308            aux_data: TraceAuxData::default(),
309            outcome: TraceIngestOutcome::default(),
310            failure_messages: Vec::new(),
311        };
312
313        for group in groups {
314            let chunks = group
315                .spans
316                .into_iter()
317                .chunks(TRACE_INGEST_CHUNK_SIZE)
318                .into_iter()
319                .map(|chunk| chunk.collect::<Vec<_>>())
320                .collect::<Vec<_>>();
321            for chunk in chunks {
322                self.ingest_trace_chunk(&ingest_ctx, chunk, main_ctx.clone(), &mut ingest_state)
323                    .await?;
324            }
325        }
326
327        OTLP_TRACES_ROWS.inc_by(ingest_state.outcome.accepted_spans as u64);
328
329        if !ingest_state.aux_data.is_empty() {
330            // Auxiliary trace tables are derived from spans whose main-table
331            // writes are already confirmed, so they never create new accepted
332            // spans and they do not affect rejected span counts.
333            let (aux_requests, _) = otlp::trace::to_grpc_insert_requests_for_aux_tables(
334                std::mem::take(&mut ingest_state.aux_data),
335                ingest_ctx.pipeline,
336                ingest_ctx.table_name,
337            )?;
338
339            if !aux_requests.inserts.is_empty() {
340                match self
341                    .insert_trace_requests(aux_requests, ingest_ctx.is_trace_v1_model, ctx)
342                    .await
343                {
344                    Ok(output) => {
345                        Self::add_trace_write_cost(&mut ingest_state.outcome, output.meta.cost);
346                    }
347                    Err(err) => {
348                        Self::push_trace_failure_message(
349                            &mut ingest_state.failure_messages,
350                            "aux_table_update_failed",
351                            format!(
352                                "Auxiliary trace tables were not fully updated ({})",
353                                err.status_code().as_ref()
354                            ),
355                        );
356                    }
357                }
358            }
359        }
360
361        ingest_state.outcome.error_message = Self::finish_trace_failure_message(
362            ingest_state.outcome.accepted_spans,
363            ingest_state.outcome.rejected_spans,
364            ingest_state.failure_messages,
365        );
366
367        Ok(ingest_state.outcome)
368    }
369
370    /// Ingest one owned trace chunk so successful spans can be moved into the
371    /// accepted set without extra cloning.
372    async fn ingest_trace_chunk(
373        &self,
374        ingest_ctx: &TraceChunkIngestContext<'_>,
375        chunk: Vec<TraceSpan>,
376        ctx: QueryContextRef,
377        ingest_state: &mut TraceIngestState,
378    ) -> ServerResult<()> {
379        // Try the fast path first so healthy batches keep their original
380        // throughput and write amplification stays low.
381        let (requests, chunk_rows) = otlp::trace::to_grpc_insert_requests_from_spans(
382            &chunk,
383            ingest_ctx.pipeline,
384            ingest_ctx.pipeline_params,
385            ingest_ctx.table_name,
386            &ctx,
387            ingest_ctx.pipeline_handler.clone(),
388        )?;
389
390        match self
391            .insert_trace_requests(requests, ingest_ctx.is_trace_v1_model, ctx.clone())
392            .await
393        {
394            Ok(output) => {
395                Self::add_trace_write_cost(&mut ingest_state.outcome, output.meta.cost);
396                ingest_state.outcome.accepted_spans += chunk_rows;
397                for span in &chunk {
398                    ingest_state.aux_data.observe_span(span);
399                }
400            }
401            Err(err) => match Self::classify_trace_chunk_failure(err.status_code()) {
402                ChunkFailureReaction::RetryPerSpan => {
403                    Self::push_trace_failure_message(
404                        &mut ingest_state.failure_messages,
405                        ChunkFailureReaction::RetryPerSpan.as_metric_label(),
406                        format!("Chunk fallback triggered by {}", err.status_code().as_ref()),
407                    );
408                    // Only deterministic failures are retried span by span.
409                    // This includes schemaless table or column creation paths for
410                    // trace ingestion. Ambiguous failures are handled below
411                    // without retrying because the chunk may already have been
412                    // ingested.
413                    self.ingest_trace_chunk_span_by_span(
414                        ingest_ctx,
415                        chunk,
416                        ctx.clone(),
417                        ingest_state,
418                    )
419                    .await?;
420                }
421                ChunkFailureReaction::DiscardChunk => {
422                    ingest_state.outcome.rejected_spans += chunk.len();
423                    Self::push_trace_failure_message(
424                        &mut ingest_state.failure_messages,
425                        ChunkFailureReaction::DiscardChunk.as_metric_label(),
426                        format!(
427                            "Discarded {} spans after ambiguous chunk failure ({})",
428                            chunk.len(),
429                            err.status_code().as_ref()
430                        ),
431                    );
432                    // TODO(shuiyisong): Add an idempotent retry-safe recovery path for
433                    // ambiguous chunk failures such as timeout-like errors.
434                }
435                // Retryable or ambiguous failures must fail the request instead of
436                // becoming partial success. This path is not retry-safe because the
437                // chunk may already have been committed before the error surfaced.
438                ChunkFailureReaction::Propagate => {
439                    Self::push_trace_failure_message(
440                        &mut ingest_state.failure_messages,
441                        ChunkFailureReaction::Propagate.as_metric_label(),
442                        format!(
443                            "Propagating retryable chunk failure ({})",
444                            err.status_code().as_ref()
445                        ),
446                    );
447                    return Err(err);
448                }
449            },
450        }
451
452        Ok(())
453    }
454
455    /// Retry spans one by one only after a deterministic chunk failure.
456    async fn ingest_trace_chunk_span_by_span(
457        &self,
458        ingest_ctx: &TraceChunkIngestContext<'_>,
459        chunk: Vec<TraceSpan>,
460        ctx: QueryContextRef,
461        ingest_state: &mut TraceIngestState,
462    ) -> ServerResult<()> {
463        for span in chunk {
464            let (requests, rows) = otlp::trace::to_grpc_insert_requests_from_spans(
465                std::slice::from_ref(&span),
466                ingest_ctx.pipeline,
467                ingest_ctx.pipeline_params,
468                ingest_ctx.table_name,
469                &ctx,
470                ingest_ctx.pipeline_handler.clone(),
471            )?;
472
473            match self
474                .insert_trace_requests(requests, ingest_ctx.is_trace_v1_model, ctx.clone())
475                .await
476            {
477                Ok(output) => {
478                    Self::add_trace_write_cost(&mut ingest_state.outcome, output.meta.cost);
479                    ingest_state.outcome.accepted_spans += rows;
480                    ingest_state.aux_data.observe_span(&span);
481                }
482                Err(err) => {
483                    if Self::should_propagate_trace_span_failure(err.status_code()) {
484                        Self::push_trace_failure_message(
485                            &mut ingest_state.failure_messages,
486                            ChunkFailureReaction::Propagate.as_metric_label(),
487                            format!(
488                                "Propagating retryable span failure for {}:{} ({})",
489                                span.trace_id,
490                                span.span_id,
491                                err.status_code().as_ref()
492                            ),
493                        );
494                        return Err(err);
495                    }
496
497                    ingest_state.outcome.rejected_spans += 1;
498                    Self::push_trace_failure_message(
499                        &mut ingest_state.failure_messages,
500                        "span_rejected",
501                        format!(
502                            "Rejected span {}:{} ({})",
503                            span.trace_id,
504                            span.span_id,
505                            err.status_code().as_ref()
506                        ),
507                    );
508                }
509            }
510        }
511
512        Ok(())
513    }
514
515    /// Reconcile and insert one trace request batch.
516    async fn insert_trace_requests(
517        &self,
518        mut requests: RowInsertRequests,
519        is_trace_v1_model: bool,
520        ctx: QueryContextRef,
521    ) -> ServerResult<Output> {
522        if is_trace_v1_model {
523            self.reconcile_trace_column_types(&mut requests, &ctx)
524                .await?;
525            self.handle_trace_inserts(requests, ctx)
526                .await
527                .map_err(BoxedError::new)
528                .context(error::ExecuteGrpcQuerySnafu)
529        } else {
530            self.handle_log_inserts(requests, ctx)
531                .await
532                .map_err(BoxedError::new)
533                .context(error::ExecuteGrpcQuerySnafu)
534        }
535    }
536
537    fn classify_trace_chunk_failure(status: StatusCode) -> ChunkFailureReaction {
538        match status {
539            StatusCode::InvalidArguments
540            | StatusCode::InvalidSyntax
541            | StatusCode::Unsupported
542            | StatusCode::TableNotFound
543            | StatusCode::TableColumnNotFound => ChunkFailureReaction::RetryPerSpan,
544            StatusCode::DatabaseNotFound => ChunkFailureReaction::DiscardChunk,
545            StatusCode::Cancelled | StatusCode::DeadlineExceeded => ChunkFailureReaction::Propagate,
546            StatusCode::StorageUnavailable
547            | StatusCode::RuntimeResourcesExhausted
548            | StatusCode::Internal
549            | StatusCode::RegionNotReady
550            | StatusCode::TableUnavailable
551            | StatusCode::RegionBusy => ChunkFailureReaction::Propagate,
552            _ => ChunkFailureReaction::DiscardChunk,
553        }
554    }
555
556    fn should_propagate_trace_span_failure(status: StatusCode) -> bool {
557        matches!(
558            Self::classify_trace_chunk_failure(status),
559            ChunkFailureReaction::Propagate
560        )
561    }
562
    /// Adds the cost reported by one successful insert to the running
    /// `write_cost` of the outcome.
    fn add_trace_write_cost(outcome: &mut TraceIngestOutcome, cost: usize) {
        outcome.write_cost += cost;
    }
566
567    fn push_trace_failure_message(messages: &mut Vec<String>, label: &str, message: String) {
568        OTLP_TRACES_FAILURE_COUNT.with_label_values(&[label]).inc();
569
570        if messages.len() < TRACE_FAILURE_MESSAGE_LIMIT {
571            messages.push(message);
572        } else if messages.len() == TRACE_FAILURE_MESSAGE_LIMIT {
573            tracing::debug!(
574                label,
575                limit = TRACE_FAILURE_MESSAGE_LIMIT,
576                "Trace ingest failure message limit reached; suppressing additional failure details"
577            );
578        }
579    }
580
581    fn finish_trace_failure_message(
582        accepted_spans: usize,
583        rejected_spans: usize,
584        messages: Vec<String>,
585    ) -> Option<String> {
586        if rejected_spans == 0 && messages.is_empty() {
587            return None;
588        }
589
590        let mut summary = format!(
591            "Accepted {} spans, rejected {} spans",
592            accepted_spans, rejected_spans
593        );
594
595        if !messages.is_empty() {
596            summary.push_str(": ");
597            summary.push_str(&messages.join("; "));
598        }
599
600        Some(summary)
601    }
602
603    /// Widen existing trace table columns to Float64 before request rewrite.
604    async fn alter_trace_table_columns_to_float64(
605        &self,
606        ctx: &QueryContextRef,
607        table_name: &str,
608        column_names: &[String],
609    ) -> ServerResult<()> {
610        let catalog_name = ctx.current_catalog().to_string();
611        let schema_name = ctx.current_schema();
612        let alter_expr = AlterTableExpr {
613            catalog_name: catalog_name.clone(),
614            schema_name: schema_name.clone(),
615            table_name: table_name.to_string(),
616            kind: Some(Kind::ModifyColumnTypes(ModifyColumnTypes {
617                modify_column_types: column_names
618                    .iter()
619                    .map(|column_name| ModifyColumnType {
620                        column_name: column_name.clone(),
621                        target_type: ColumnDataType::Float64 as i32,
622                        target_type_extension: None,
623                    })
624                    .collect(),
625            })),
626        };
627
628        if let Err(err) = self
629            .statement_executor
630            .alter_table_inner(alter_expr, ctx.clone())
631            .await
632        {
633            let table = self
634                .catalog_manager
635                .table(&catalog_name, &schema_name, table_name, None)
636                .await
637                .map_err(servers::error::Error::from)?;
638            let alter_already_applied = table
639                .map(|table| {
640                    let table_schema = table.schema();
641                    column_names.iter().all(|column_name| {
642                        table_schema
643                            .column_schema_by_name(column_name)
644                            .and_then(|table_col| {
645                                ColumnDataTypeWrapper::try_from(table_col.data_type.clone())
646                                    .ok()
647                                    .map(|wrapper| wrapper.datatype())
648                            })
649                            == Some(ColumnDataType::Float64)
650                    })
651                })
652                .unwrap_or(false);
653
654            if alter_already_applied {
655                return Ok(());
656            }
657
658            warn!(
659                table_name,
660                columns = ?column_names,
661                error = %err,
662                "failed to widen trace columns before insert"
663            );
664
665            return Err(wrap_trace_alter_failure(err));
666        }
667
668        Ok(())
669    }
670
671    /// Coerce request column types and values to match the existing table schema
672    /// for compatible type pairs. Existing table schema wins when present;
673    /// otherwise the full request batch decides a stable target type.
674    async fn reconcile_trace_column_types(
675        &self,
676        requests: &mut RowInsertRequests,
677        ctx: &QueryContextRef,
678    ) -> ServerResult<()> {
679        let catalog = ctx.current_catalog();
680        let schema = ctx.current_schema();
681
682        for req in &mut requests.inserts {
683            let table = self
684                .catalog_manager
685                .table(catalog, &schema, &req.table_name, None)
686                .await?;
687
688            let Some(rows) = req.rows.as_mut() else {
689                continue;
690            };
691
692            let table_schema = table.map(|table| table.schema());
693            let mut pending_rewrites = Vec::new();
694            let mut pending_alter_columns = Vec::new();
695
696            for (col_idx, col_schema) in rows.schema.iter().enumerate() {
697                let Some(current_type) = ColumnDataType::try_from(col_schema.datatype).ok() else {
698                    continue;
699                };
700
701                let mut observed_types = Vec::new();
702                push_observed_trace_type(&mut observed_types, current_type);
703
704                // Scan the full request first so the final type decision is not affected
705                // by row order inside the batch.
706                for row in &rows.rows {
707                    let Some(value) = row
708                        .values
709                        .get(col_idx)
710                        .and_then(|value| value.value_data.as_ref())
711                    else {
712                        continue;
713                    };
714
715                    let Some(value_type) = trace_value_datatype(value) else {
716                        continue;
717                    };
718                    push_observed_trace_type(&mut observed_types, value_type);
719                }
720
721                let existing_type = table_schema
722                    .as_ref()
723                    .and_then(|schema| schema.column_schema_by_name(&col_schema.column_name))
724                    .and_then(|table_col| {
725                        ColumnDataTypeWrapper::try_from(table_col.data_type.clone())
726                            .ok()
727                            .map(|wrapper| wrapper.datatype())
728                    });
729                let fixed_type = trace_semconv_fixed_type(&col_schema.column_name);
730
731                if !observed_types
732                    .iter()
733                    .copied()
734                    .any(is_trace_reconcile_candidate_type)
735                    && existing_type
736                        .map(|datatype| !is_trace_reconcile_candidate_type(datatype))
737                        .unwrap_or(true)
738                    && fixed_type.is_none()
739                {
740                    continue;
741                }
742
743                // Decide the final type once per column, then rewrite all affected cells
744                // together in one row pass below.
745                let Some(decision) = choose_trace_reconcile_decision(
746                    &col_schema.column_name,
747                    &observed_types,
748                    existing_type,
749                )
750                .map_err(|_| {
751                    enrich_trace_reconcile_error(
752                        &req.table_name,
753                        &col_schema.column_name,
754                        &observed_types,
755                        existing_type,
756                        fixed_type,
757                    )
758                })?
759                else {
760                    continue;
761                };
762                let target_type = decision.target_type();
763
764                if !decision.requires_alter()
765                    && observed_types
766                        .iter()
767                        .all(|observed| *observed == target_type)
768                    && col_schema.datatype == target_type as i32
769                {
770                    continue;
771                }
772
773                if decision.requires_alter()
774                    && !pending_alter_columns.contains(&col_schema.column_name)
775                {
776                    pending_alter_columns.push(col_schema.column_name.clone());
777                }
778
779                pending_rewrites.push(PendingTraceColumnRewrite {
780                    col_idx,
781                    target_type,
782                    column_name: col_schema.column_name.clone(),
783                });
784            }
785
786            if pending_rewrites.is_empty() {
787                continue;
788            }
789
790            validate_trace_column_rewrites(&rows.rows, &pending_rewrites, &req.table_name)?;
791
792            if !pending_alter_columns.is_empty() {
793                self.alter_trace_table_columns_to_float64(
794                    ctx,
795                    &req.table_name,
796                    &pending_alter_columns,
797                )
798                .await?;
799            }
800
801            // Update schema metadata before mutating row values so both stay in sync.
802            for pending_rewrite in &pending_rewrites {
803                rows.schema[pending_rewrite.col_idx].datatype = pending_rewrite.target_type as i32;
804            }
805
806            // Apply all pending column rewrites in one row pass.
807            for row in &mut rows.rows {
808                for pending_rewrite in &pending_rewrites {
809                    let Some(value) = row.values.get_mut(pending_rewrite.col_idx) else {
810                        continue;
811                    };
812                    let Some(request_type) =
813                        value.value_data.as_ref().and_then(trace_value_datatype)
814                    else {
815                        continue;
816                    };
817                    if request_type == pending_rewrite.target_type {
818                        continue;
819                    }
820
821                    value.value_data = coerce_value_data(
822                        &value.value_data,
823                        pending_rewrite.target_type,
824                        request_type,
825                    )
826                    .map_err(|_| {
827                        error::InvalidParameterSnafu {
828                            reason: format!(
829                                "failed to coerce trace column '{}' in table '{}' from {:?} to {:?}",
830                                pending_rewrite.column_name,
831                                req.table_name,
832                                request_type,
833                                pending_rewrite.target_type
834                            ),
835                        }
836                        .build()
837                    })?;
838                }
839            }
840        }
841
842        Ok(())
843    }
844}
845
846/// Preserve the original alter failure status so chunk retry behavior stays correct.
847fn wrap_trace_alter_failure<E>(err: E) -> servers::error::Error
848where
849    E: ErrorExt + Send + Sync + 'static,
850{
851    error::ExecuteGrpcQuerySnafu.into_error(BoxedError::new(err))
852}
853
854/// Derives `trace.conventions` from the request's resource/scope `schema_url`s.
855/// A single distinct non-empty value is concrete; multiple distinct values are
856/// `mixed`; none is `unknown`. `schema_url` is row-level in OTLP, so the
857/// table-level value is best-effort per the RFC conflict rule.
858fn trace_conventions(request: &ExportTraceServiceRequest) -> String {
859    let mut seen: Option<&str> = None;
860    let mut mixed = false;
861
862    for resource_spans in &request.resource_spans {
863        let urls = std::iter::once(resource_spans.schema_url.as_str()).chain(
864            resource_spans
865                .scope_spans
866                .iter()
867                .map(|s| s.schema_url.as_str()),
868        );
869        for url in urls {
870            if url.is_empty() {
871                continue;
872            }
873            match seen {
874                None => seen = Some(url),
875                Some(prev) if prev == url => {}
876                Some(_) => {
877                    mixed = true;
878                    break;
879                }
880            }
881        }
882        if mixed {
883            break;
884        }
885    }
886
887    if mixed {
888        SEMANTIC_VALUE_MIXED.to_string()
889    } else {
890        seen.map(str::to_string)
891            .unwrap_or_else(|| SEMANTIC_VALUE_UNKNOWN.to_string())
892    }
893}
894
#[cfg(test)]
mod tests {
    use common_error::ext::ErrorExt;
    use common_error::status_code::StatusCode;
    use opentelemetry_proto::tonic::trace::v1::{ResourceSpans, ScopeSpans};
    use servers::query_handler::TraceIngestOutcome;

    use super::{
        ChunkFailureReaction, ExportTraceServiceRequest, Instance, trace_conventions,
        wrap_trace_alter_failure,
    };
    use crate::metrics::OTLP_TRACES_FAILURE_COUNT;

    /// Builds a `ResourceSpans` carrying `resource_url` with one otherwise-default
    /// `ScopeSpans` per entry of `scope_urls`.
    fn resource_spans(resource_url: &str, scope_urls: &[&str]) -> ResourceSpans {
        let scope_spans: Vec<ScopeSpans> = scope_urls
            .iter()
            .map(|url| ScopeSpans {
                schema_url: (*url).to_string(),
                ..Default::default()
            })
            .collect();

        ResourceSpans {
            schema_url: resource_url.to_string(),
            scope_spans,
            ..Default::default()
        }
    }

    /// Builds a trace export request holding a single `ResourceSpans` entry.
    fn single_resource_request(
        resource_url: &str,
        scope_urls: &[&str],
    ) -> ExportTraceServiceRequest {
        ExportTraceServiceRequest {
            resource_spans: vec![resource_spans(resource_url, scope_urls)],
        }
    }

    /// Pins the reaction chosen for each explicitly covered chunk failure status.
    #[test]
    fn test_classify_trace_chunk_failure() {
        for code in [
            StatusCode::InvalidArguments,
            StatusCode::InvalidSyntax,
            StatusCode::Unsupported,
            StatusCode::TableColumnNotFound,
            StatusCode::TableNotFound,
        ] {
            assert_eq!(
                Instance::classify_trace_chunk_failure(code),
                ChunkFailureReaction::RetryPerSpan,
                "unexpected reaction for {code:?}"
            );
        }

        assert_eq!(
            Instance::classify_trace_chunk_failure(StatusCode::DatabaseNotFound),
            ChunkFailureReaction::DiscardChunk
        );

        for code in [
            StatusCode::DeadlineExceeded,
            StatusCode::Cancelled,
            StatusCode::StorageUnavailable,
            StatusCode::Internal,
            StatusCode::RegionNotReady,
            StatusCode::TableUnavailable,
            StatusCode::RegionBusy,
            StatusCode::RuntimeResourcesExhausted,
        ] {
            assert_eq!(
                Instance::classify_trace_chunk_failure(code),
                ChunkFailureReaction::Propagate,
                "unexpected reaction for {code:?}"
            );
        }
    }

    /// A status without an explicit rule falls back to discarding the chunk.
    #[test]
    fn test_classify_trace_chunk_failure_defaults_to_discard() {
        let reaction = Instance::classify_trace_chunk_failure(StatusCode::Unknown);
        assert_eq!(reaction, ChunkFailureReaction::DiscardChunk);
    }

    /// Per-span failures propagate for the two covered transient statuses and
    /// not for invalid arguments.
    #[test]
    fn test_classify_trace_span_failure() {
        for code in [StatusCode::DeadlineExceeded, StatusCode::StorageUnavailable] {
            assert!(
                Instance::should_propagate_trace_span_failure(code),
                "expected {code:?} to propagate"
            );
        }

        assert!(!Instance::should_propagate_trace_span_failure(
            StatusCode::InvalidArguments
        ));
    }

    /// Write costs accumulate across calls instead of overwriting each other.
    #[test]
    fn test_add_trace_write_cost() {
        let mut outcome = TraceIngestOutcome::default();
        for cost in [3, 5] {
            Instance::add_trace_write_cost(&mut outcome, cost);
        }
        assert_eq!(outcome.write_cost, 8);
    }

    /// With rejections the summary carries both the counts and the detail
    /// messages; without rejections there is no message at all.
    #[test]
    fn test_finish_trace_failure_message() {
        let details = vec!["Rejected span trace:span (InvalidArguments)".to_string()];
        let message = Instance::finish_trace_failure_message(3, 2, details).unwrap();
        for expected_fragment in [
            "Accepted 3 spans, rejected 2 spans",
            "Rejected span trace:span",
        ] {
            assert!(message.contains(expected_fragment));
        }

        assert!(Instance::finish_trace_failure_message(2, 0, vec![]).is_none());
    }

    /// Rejections without detail messages still produce the bare count summary.
    #[test]
    fn test_finish_trace_failure_message_without_detail_messages() {
        let message = Instance::finish_trace_failure_message(0, 2, Vec::new());
        assert_eq!(
            message.as_deref(),
            Some("Accepted 0 spans, rejected 2 spans")
        );
    }

    /// Recording a failure message bumps the counter of the given label by one.
    #[test]
    fn test_push_trace_failure_message_increments_labeled_counter() {
        let label = "retry_per_span_counter_test";
        let counter = OTLP_TRACES_FAILURE_COUNT.with_label_values(&[label]);
        let before = counter.get();
        let mut messages = Vec::new();

        Instance::push_trace_failure_message(
            &mut messages,
            label,
            "Chunk fallback triggered by InvalidArguments".to_string(),
        );

        assert_eq!(messages.len(), 1);
        assert_eq!(counter.get(), before + 1);
    }

    /// Five pushes leave only the first four messages recorded.
    #[test]
    fn test_push_trace_failure_message_caps_recorded_messages() {
        let label = "retry_per_span_limit_test";
        let mut messages = Vec::new();

        for idx in 0..=4 {
            Instance::push_trace_failure_message(&mut messages, label, format!("failure-{idx}"));
        }

        let expected: Vec<String> = (0..4).map(|idx| format!("failure-{idx}")).collect();
        assert_eq!(messages.len(), 4);
        assert_eq!(messages, expected);
    }

    /// The wrapped error reports the status code of the error it wraps.
    #[test]
    fn test_wrap_trace_alter_failure_preserves_status_code() {
        let source = servers::error::TableNotFoundSnafu {
            catalog: "greptime".to_string(),
            schema: "public".to_string(),
            table: "trace_type_missing".to_string(),
        }
        .build();

        let err = wrap_trace_alter_failure(source);

        assert_eq!(err.status_code(), StatusCode::TableNotFound);
    }

    /// Covers the `unknown`, single-URL (scope-level and resource-level) and
    /// `mixed` outcomes of `trace_conventions`.
    #[test]
    fn test_trace_conventions() {
        let url = "https://opentelemetry.io/schemas/1.27.0";
        let other_url = "https://opentelemetry.io/schemas/1.30.0";

        assert_eq!(
            trace_conventions(&ExportTraceServiceRequest::default()),
            "unknown"
        );

        // Same URL on every scope, nothing on the resource.
        assert_eq!(
            trace_conventions(&single_resource_request("", &[url, url])),
            url
        );

        // URL only on the resource, scope left empty.
        assert_eq!(
            trace_conventions(&single_resource_request(url, &[""])),
            url
        );

        // Two scopes that disagree.
        assert_eq!(
            trace_conventions(&single_resource_request("", &[url, other_url])),
            "mixed"
        );
    }
}