Skip to main content

frontend/instance/
otlp.rs

1// Copyright 2023 Greptime Team
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7//     http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15use std::sync::Arc;
16
17use api::helper::ColumnDataTypeWrapper;
18use api::v1::alter_table_expr::Kind;
19use api::v1::{
20    AlterTableExpr, ColumnDataType, ModifyColumnType, ModifyColumnTypes, RowInsertRequests,
21};
22use async_trait::async_trait;
23use auth::{PermissionChecker, PermissionCheckerRef, PermissionReq};
24use client::Output;
25use common_error::ext::{BoxedError, ErrorExt};
26use common_error::status_code::StatusCode;
27use common_query::prelude::GREPTIME_PHYSICAL_TABLE;
28use common_telemetry::{tracing, warn};
29use itertools::Itertools;
30use opentelemetry_proto::tonic::collector::logs::v1::ExportLogsServiceRequest;
31use opentelemetry_proto::tonic::collector::trace::v1::ExportTraceServiceRequest;
32use otel_arrow_rust::proto::opentelemetry::collector::metrics::v1::ExportMetricsServiceRequest;
33use pipeline::{GreptimePipelineParams, PipelineWay};
34use servers::error::{self, AuthSnafu, Result as ServerResult};
35use servers::http::prom_store::PHYSICAL_TABLE_PARAM;
36use servers::interceptor::{OpenTelemetryProtocolInterceptor, OpenTelemetryProtocolInterceptorRef};
37use servers::otlp;
38use servers::otlp::trace::TraceAuxData;
39use servers::otlp::trace::coerce::{coerce_value_data, trace_value_datatype};
40use servers::otlp::trace::span::{TraceSpan, TraceSpanGroup};
41use servers::query_handler::{
42    OpenTelemetryProtocolHandler, PipelineHandlerRef, TraceIngestOutcome,
43};
44use session::context::QueryContextRef;
45use snafu::{IntoError, ResultExt};
46use table::requests::{
47    OTLP_METRIC_COMPAT_KEY, OTLP_METRIC_COMPAT_PROM, SEMANTIC_PIPELINE, SEMANTIC_SIGNAL_TYPE,
48    SEMANTIC_SOURCE, SEMANTIC_TRACE_CONVENTIONS, SEMANTIC_TRACE_HAS_EVENTS,
49    SEMANTIC_TRACE_HAS_LINKS, SEMANTIC_VALUE_UNKNOWN, SIGNAL_TYPE_LOG, SIGNAL_TYPE_METRIC,
50    SIGNAL_TYPE_TRACE, SOURCE_OPENTELEMETRY, TABLE_DATA_MODEL_TRACE_V1,
51};
52
53use crate::instance::Instance;
54use crate::instance::otlp::trace_semconv::trace_semconv_fixed_type;
55use crate::instance::otlp::trace_types::{
56    PendingTraceColumnRewrite, choose_trace_reconcile_decision, enrich_trace_reconcile_error,
57    is_trace_reconcile_candidate_type, push_observed_trace_type, validate_trace_column_rewrites,
58};
59use crate::metrics::{
60    OTLP_LOGS_ROWS, OTLP_METRICS_ROWS, OTLP_TRACES_FAILURE_COUNT, OTLP_TRACES_ROWS,
61};
62
63pub mod trace_semconv;
64pub mod trace_types;
65
/// Number of spans grouped into one chunk when a span group is written to the
/// trace table.
const TRACE_INGEST_CHUNK_SIZE: usize = 64;
/// Maximum number of failure detail messages retained for the final outcome
/// summary; further messages are dropped.
const TRACE_FAILURE_MESSAGE_LIMIT: usize = 4;
68
/// How trace ingestion reacts when writing a chunk of spans fails.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum ChunkFailureReaction {
    /// Write the spans of the chunk again, one span per insert.
    RetryPerSpan,
    /// Count every span of the chunk as rejected and keep going.
    DiscardChunk,
    /// Stop ingesting and hand the error back to the caller.
    Propagate,
}

impl ChunkFailureReaction {
    /// Returns the static label used for this reaction when a failure is
    /// recorded in the failure counter.
    fn as_metric_label(self) -> &'static str {
        use ChunkFailureReaction::{DiscardChunk, Propagate, RetryPerSpan};

        match self {
            Propagate => "propagate_failure",
            DiscardChunk => "discard_chunk",
            RetryPerSpan => "retry_per_span",
        }
    }
}
85
/// Request-wide settings shared by every chunk write of one trace ingestion
/// call.
struct TraceChunkIngestContext<'a> {
    /// Pipeline handler forwarded to the span-to-insert-request conversion.
    pipeline_handler: PipelineHandlerRef,
    /// Pipeline selected for this request.
    pipeline: &'a PipelineWay,
    /// Pipeline parameters forwarded to the span-to-insert-request conversion.
    pipeline_params: &'a GreptimePipelineParams,
    /// Target table name passed to the request conversion.
    table_name: &'a str,
    /// `true` when `pipeline` is `PipelineWay::OtlpTraceDirectV1`; selects the
    /// type-reconcile + trace insert path instead of the log insert path.
    is_trace_v1_model: bool,
}
93
/// Mutable bookkeeping accumulated while the chunks of one request are written.
struct TraceIngestState {
    /// Data observed from spans whose main-table write succeeded; later turned
    /// into the auxiliary-table insert requests.
    aux_data: TraceAuxData,
    /// Running result of the request: accepted/rejected span counts, write
    /// cost, and (set at the very end) the summary error message.
    outcome: TraceIngestOutcome,
    /// Failure detail messages collected so far, capped at
    /// `TRACE_FAILURE_MESSAGE_LIMIT` entries.
    failure_messages: Vec<String>,
}
99
100#[async_trait]
101impl OpenTelemetryProtocolHandler for Instance {
102    #[tracing::instrument(skip_all)]
103    async fn metrics(
104        &self,
105        request: ExportMetricsServiceRequest,
106        ctx: QueryContextRef,
107    ) -> ServerResult<Output> {
108        self.plugins
109            .get::<PermissionCheckerRef>()
110            .as_ref()
111            .check_permission(ctx.current_user(), PermissionReq::Otlp)
112            .context(AuthSnafu)?;
113
114        let interceptor_ref = self
115            .plugins
116            .get::<OpenTelemetryProtocolInterceptorRef<servers::error::Error>>();
117        interceptor_ref.pre_execute(ctx.clone())?;
118
119        let input_names = request
120            .resource_metrics
121            .iter()
122            .flat_map(|r| r.scope_metrics.iter())
123            .flat_map(|s| s.metrics.iter().map(|m| &m.name))
124            .collect::<Vec<_>>();
125
126        // See [`OtlpMetricCtx`] for details
127        let is_legacy = self.check_otlp_legacy(&input_names, ctx.clone()).await?;
128
129        let mut metric_ctx = ctx
130            .protocol_ctx()
131            .get_otlp_metric_ctx()
132            .cloned()
133            .unwrap_or_default();
134        metric_ctx.is_legacy = is_legacy;
135
136        let (requests, rows) = otlp::metrics::to_grpc_insert_requests(request, &mut metric_ctx)?;
137        OTLP_METRICS_ROWS.inc_by(rows as u64);
138
139        let ctx = {
140            let mut c = (*ctx).clone();
141            c.set_extension(SEMANTIC_SIGNAL_TYPE, SIGNAL_TYPE_METRIC);
142            c.set_extension(SEMANTIC_SOURCE, SOURCE_OPENTELEMETRY);
143            if !is_legacy {
144                c.set_extension(OTLP_METRIC_COMPAT_KEY, OTLP_METRIC_COMPAT_PROM.to_string());
145            }
146            Arc::new(c)
147        };
148
149        // If the user uses the legacy path, it is by default without metric engine.
150        if metric_ctx.is_legacy || !metric_ctx.with_metric_engine {
151            self.handle_row_inserts(requests, ctx, false, false)
152                .await
153                .map_err(BoxedError::new)
154                .context(error::ExecuteGrpcQuerySnafu)
155        } else {
156            let physical_table = ctx
157                .extension(PHYSICAL_TABLE_PARAM)
158                .unwrap_or(GREPTIME_PHYSICAL_TABLE)
159                .to_string();
160            self.handle_metric_row_inserts(requests, ctx, physical_table.clone())
161                .await
162                .map_err(BoxedError::new)
163                .context(error::ExecuteGrpcQuerySnafu)
164        }
165    }
166
167    #[tracing::instrument(skip_all)]
168    async fn traces(
169        &self,
170        pipeline_handler: PipelineHandlerRef,
171        request: ExportTraceServiceRequest,
172        pipeline: PipelineWay,
173        pipeline_params: GreptimePipelineParams,
174        table_name: String,
175        ctx: QueryContextRef,
176    ) -> ServerResult<TraceIngestOutcome> {
177        self.plugins
178            .get::<PermissionCheckerRef>()
179            .as_ref()
180            .check_permission(ctx.current_user(), PermissionReq::Otlp)
181            .context(AuthSnafu)?;
182
183        let interceptor_ref = self
184            .plugins
185            .get::<OpenTelemetryProtocolInterceptorRef<servers::error::Error>>();
186        interceptor_ref.pre_execute(ctx.clone())?;
187
188        let spans = otlp::trace::span::parse(request);
189        self.ingest_trace_spans(
190            pipeline_handler,
191            &pipeline,
192            &pipeline_params,
193            table_name,
194            spans,
195            ctx,
196        )
197        .await
198    }
199
200    #[tracing::instrument(skip_all)]
201    async fn logs(
202        &self,
203        pipeline_handler: PipelineHandlerRef,
204        request: ExportLogsServiceRequest,
205        pipeline: PipelineWay,
206        pipeline_params: GreptimePipelineParams,
207        table_name: String,
208        ctx: QueryContextRef,
209    ) -> ServerResult<Vec<Output>> {
210        self.plugins
211            .get::<PermissionCheckerRef>()
212            .as_ref()
213            .check_permission(ctx.current_user(), PermissionReq::Otlp)
214            .context(AuthSnafu)?;
215
216        let interceptor_ref = self
217            .plugins
218            .get::<OpenTelemetryProtocolInterceptorRef<servers::error::Error>>();
219        interceptor_ref.pre_execute(ctx.clone())?;
220
221        // `as_req_iter` clones this ctx into each `temp_ctx`, so identity set here
222        // reaches the context that drives table auto-create.
223        let ctx = {
224            let mut c = (*ctx).clone();
225            c.set_extension(SEMANTIC_SIGNAL_TYPE, SIGNAL_TYPE_LOG);
226            c.set_extension(SEMANTIC_SOURCE, SOURCE_OPENTELEMETRY);
227            Arc::new(c)
228        };
229
230        let opt_req = otlp::logs::to_grpc_insert_requests(
231            request,
232            pipeline,
233            pipeline_params,
234            table_name,
235            &ctx,
236            pipeline_handler,
237        )
238        .await?;
239
240        let mut outputs = vec![];
241
242        for (temp_ctx, requests) in opt_req.as_req_iter(ctx) {
243            let cnt = requests
244                .inserts
245                .iter()
246                .filter_map(|r| r.rows.as_ref().map(|r| r.rows.len()))
247                .sum::<usize>();
248
249            let o = self
250                .handle_log_inserts(requests, temp_ctx)
251                .await
252                .inspect(|_| OTLP_LOGS_ROWS.inc_by(cnt as u64))
253                .map_err(BoxedError::new)
254                .context(error::ExecuteGrpcQuerySnafu)?;
255            outputs.push(o);
256        }
257
258        Ok(outputs)
259    }
260}
261
262impl Instance {
263    /// Ingest OTLP trace spans with chunk-level writes and span-level fallback on
264    /// deterministic chunk failures.
265    async fn ingest_trace_spans(
266        &self,
267        pipeline_handler: PipelineHandlerRef,
268        pipeline: &PipelineWay,
269        pipeline_params: &GreptimePipelineParams,
270        table_name: String,
271        groups: Vec<TraceSpanGroup>,
272        ctx: QueryContextRef,
273    ) -> ServerResult<TraceIngestOutcome> {
274        let is_trace_v1_model = matches!(pipeline, PipelineWay::OtlpTraceDirectV1);
275
276        // Only the main span table gets the identity; the derived `_services` /
277        // `_operations` lookup tables keep the unstamped `ctx`.
278        let main_ctx = {
279            let mut c = (*ctx).clone();
280            c.set_extension(SEMANTIC_SIGNAL_TYPE, SIGNAL_TYPE_TRACE);
281            c.set_extension(SEMANTIC_SOURCE, SOURCE_OPENTELEMETRY);
282            if is_trace_v1_model {
283                c.set_extension(SEMANTIC_PIPELINE, TABLE_DATA_MODEL_TRACE_V1);
284                c.set_extension(SEMANTIC_TRACE_HAS_EVENTS, "true");
285                c.set_extension(SEMANTIC_TRACE_HAS_LINKS, "true");
286                // schema_url is row-level, so conventions is unknown at table level.
287                c.set_extension(SEMANTIC_TRACE_CONVENTIONS, SEMANTIC_VALUE_UNKNOWN);
288            }
289            Arc::new(c)
290        };
291
292        let ingest_ctx = TraceChunkIngestContext {
293            pipeline_handler,
294            pipeline,
295            pipeline_params,
296            table_name: &table_name,
297            is_trace_v1_model,
298        };
299        let mut ingest_state = TraceIngestState {
300            aux_data: TraceAuxData::default(),
301            outcome: TraceIngestOutcome::default(),
302            failure_messages: Vec::new(),
303        };
304
305        for group in groups {
306            let chunks = group
307                .spans
308                .into_iter()
309                .chunks(TRACE_INGEST_CHUNK_SIZE)
310                .into_iter()
311                .map(|chunk| chunk.collect::<Vec<_>>())
312                .collect::<Vec<_>>();
313            for chunk in chunks {
314                self.ingest_trace_chunk(&ingest_ctx, chunk, main_ctx.clone(), &mut ingest_state)
315                    .await?;
316            }
317        }
318
319        OTLP_TRACES_ROWS.inc_by(ingest_state.outcome.accepted_spans as u64);
320
321        if !ingest_state.aux_data.is_empty() {
322            // Auxiliary trace tables are derived from spans whose main-table
323            // writes are already confirmed, so they never create new accepted
324            // spans and they do not affect rejected span counts.
325            let (aux_requests, _) = otlp::trace::to_grpc_insert_requests_for_aux_tables(
326                std::mem::take(&mut ingest_state.aux_data),
327                ingest_ctx.pipeline,
328                ingest_ctx.table_name,
329            )?;
330
331            if !aux_requests.inserts.is_empty() {
332                match self
333                    .insert_trace_requests(aux_requests, ingest_ctx.is_trace_v1_model, ctx)
334                    .await
335                {
336                    Ok(output) => {
337                        Self::add_trace_write_cost(&mut ingest_state.outcome, output.meta.cost);
338                    }
339                    Err(err) => {
340                        Self::push_trace_failure_message(
341                            &mut ingest_state.failure_messages,
342                            "aux_table_update_failed",
343                            format!(
344                                "Auxiliary trace tables were not fully updated ({})",
345                                err.status_code().as_ref()
346                            ),
347                        );
348                    }
349                }
350            }
351        }
352
353        ingest_state.outcome.error_message = Self::finish_trace_failure_message(
354            ingest_state.outcome.accepted_spans,
355            ingest_state.outcome.rejected_spans,
356            ingest_state.failure_messages,
357        );
358
359        Ok(ingest_state.outcome)
360    }
361
362    /// Ingest one owned trace chunk so successful spans can be moved into the
363    /// accepted set without extra cloning.
364    async fn ingest_trace_chunk(
365        &self,
366        ingest_ctx: &TraceChunkIngestContext<'_>,
367        chunk: Vec<TraceSpan>,
368        ctx: QueryContextRef,
369        ingest_state: &mut TraceIngestState,
370    ) -> ServerResult<()> {
371        // Try the fast path first so healthy batches keep their original
372        // throughput and write amplification stays low.
373        let (requests, chunk_rows) = otlp::trace::to_grpc_insert_requests_from_spans(
374            &chunk,
375            ingest_ctx.pipeline,
376            ingest_ctx.pipeline_params,
377            ingest_ctx.table_name,
378            &ctx,
379            ingest_ctx.pipeline_handler.clone(),
380        )?;
381
382        match self
383            .insert_trace_requests(requests, ingest_ctx.is_trace_v1_model, ctx.clone())
384            .await
385        {
386            Ok(output) => {
387                Self::add_trace_write_cost(&mut ingest_state.outcome, output.meta.cost);
388                ingest_state.outcome.accepted_spans += chunk_rows;
389                for span in &chunk {
390                    ingest_state.aux_data.observe_span(span);
391                }
392            }
393            Err(err) => match Self::classify_trace_chunk_failure(err.status_code()) {
394                ChunkFailureReaction::RetryPerSpan => {
395                    Self::push_trace_failure_message(
396                        &mut ingest_state.failure_messages,
397                        ChunkFailureReaction::RetryPerSpan.as_metric_label(),
398                        format!("Chunk fallback triggered by {}", err.status_code().as_ref()),
399                    );
400                    // Only deterministic failures are retried span by span.
401                    // This includes schemaless table or column creation paths for
402                    // trace ingestion. Ambiguous failures are handled below
403                    // without retrying because the chunk may already have been
404                    // ingested.
405                    self.ingest_trace_chunk_span_by_span(
406                        ingest_ctx,
407                        chunk,
408                        ctx.clone(),
409                        ingest_state,
410                    )
411                    .await?;
412                }
413                ChunkFailureReaction::DiscardChunk => {
414                    ingest_state.outcome.rejected_spans += chunk.len();
415                    Self::push_trace_failure_message(
416                        &mut ingest_state.failure_messages,
417                        ChunkFailureReaction::DiscardChunk.as_metric_label(),
418                        format!(
419                            "Discarded {} spans after ambiguous chunk failure ({})",
420                            chunk.len(),
421                            err.status_code().as_ref()
422                        ),
423                    );
424                    // TODO(shuiyisong): Add an idempotent retry-safe recovery path for
425                    // ambiguous chunk failures such as timeout-like errors.
426                }
427                // Retryable or ambiguous failures must fail the request instead of
428                // becoming partial success. This path is not retry-safe because the
429                // chunk may already have been committed before the error surfaced.
430                ChunkFailureReaction::Propagate => {
431                    Self::push_trace_failure_message(
432                        &mut ingest_state.failure_messages,
433                        ChunkFailureReaction::Propagate.as_metric_label(),
434                        format!(
435                            "Propagating retryable chunk failure ({})",
436                            err.status_code().as_ref()
437                        ),
438                    );
439                    return Err(err);
440                }
441            },
442        }
443
444        Ok(())
445    }
446
447    /// Retry spans one by one only after a deterministic chunk failure.
448    async fn ingest_trace_chunk_span_by_span(
449        &self,
450        ingest_ctx: &TraceChunkIngestContext<'_>,
451        chunk: Vec<TraceSpan>,
452        ctx: QueryContextRef,
453        ingest_state: &mut TraceIngestState,
454    ) -> ServerResult<()> {
455        for span in chunk {
456            let (requests, rows) = otlp::trace::to_grpc_insert_requests_from_spans(
457                std::slice::from_ref(&span),
458                ingest_ctx.pipeline,
459                ingest_ctx.pipeline_params,
460                ingest_ctx.table_name,
461                &ctx,
462                ingest_ctx.pipeline_handler.clone(),
463            )?;
464
465            match self
466                .insert_trace_requests(requests, ingest_ctx.is_trace_v1_model, ctx.clone())
467                .await
468            {
469                Ok(output) => {
470                    Self::add_trace_write_cost(&mut ingest_state.outcome, output.meta.cost);
471                    ingest_state.outcome.accepted_spans += rows;
472                    ingest_state.aux_data.observe_span(&span);
473                }
474                Err(err) => {
475                    if Self::should_propagate_trace_span_failure(err.status_code()) {
476                        Self::push_trace_failure_message(
477                            &mut ingest_state.failure_messages,
478                            ChunkFailureReaction::Propagate.as_metric_label(),
479                            format!(
480                                "Propagating retryable span failure for {}:{} ({})",
481                                span.trace_id,
482                                span.span_id,
483                                err.status_code().as_ref()
484                            ),
485                        );
486                        return Err(err);
487                    }
488
489                    ingest_state.outcome.rejected_spans += 1;
490                    Self::push_trace_failure_message(
491                        &mut ingest_state.failure_messages,
492                        "span_rejected",
493                        format!(
494                            "Rejected span {}:{} ({})",
495                            span.trace_id,
496                            span.span_id,
497                            err.status_code().as_ref()
498                        ),
499                    );
500                }
501            }
502        }
503
504        Ok(())
505    }
506
507    /// Reconcile and insert one trace request batch.
508    async fn insert_trace_requests(
509        &self,
510        mut requests: RowInsertRequests,
511        is_trace_v1_model: bool,
512        ctx: QueryContextRef,
513    ) -> ServerResult<Output> {
514        if is_trace_v1_model {
515            self.reconcile_trace_column_types(&mut requests, &ctx)
516                .await?;
517            self.handle_trace_inserts(requests, ctx)
518                .await
519                .map_err(BoxedError::new)
520                .context(error::ExecuteGrpcQuerySnafu)
521        } else {
522            self.handle_log_inserts(requests, ctx)
523                .await
524                .map_err(BoxedError::new)
525                .context(error::ExecuteGrpcQuerySnafu)
526        }
527    }
528
529    fn classify_trace_chunk_failure(status: StatusCode) -> ChunkFailureReaction {
530        match status {
531            StatusCode::InvalidArguments
532            | StatusCode::InvalidSyntax
533            | StatusCode::Unsupported
534            | StatusCode::TableNotFound
535            | StatusCode::TableColumnNotFound => ChunkFailureReaction::RetryPerSpan,
536            StatusCode::DatabaseNotFound => ChunkFailureReaction::DiscardChunk,
537            StatusCode::Cancelled | StatusCode::DeadlineExceeded => ChunkFailureReaction::Propagate,
538            _ if status.is_retryable() => ChunkFailureReaction::Propagate,
539            _ => ChunkFailureReaction::DiscardChunk,
540        }
541    }
542
543    fn should_propagate_trace_span_failure(status: StatusCode) -> bool {
544        matches!(
545            Self::classify_trace_chunk_failure(status),
546            ChunkFailureReaction::Propagate
547        )
548    }
549
550    fn add_trace_write_cost(outcome: &mut TraceIngestOutcome, cost: usize) {
551        outcome.write_cost += cost;
552    }
553
554    fn push_trace_failure_message(messages: &mut Vec<String>, label: &str, message: String) {
555        OTLP_TRACES_FAILURE_COUNT.with_label_values(&[label]).inc();
556
557        if messages.len() < TRACE_FAILURE_MESSAGE_LIMIT {
558            messages.push(message);
559        } else if messages.len() == TRACE_FAILURE_MESSAGE_LIMIT {
560            tracing::debug!(
561                label,
562                limit = TRACE_FAILURE_MESSAGE_LIMIT,
563                "Trace ingest failure message limit reached; suppressing additional failure details"
564            );
565        }
566    }
567
568    fn finish_trace_failure_message(
569        accepted_spans: usize,
570        rejected_spans: usize,
571        messages: Vec<String>,
572    ) -> Option<String> {
573        if rejected_spans == 0 && messages.is_empty() {
574            return None;
575        }
576
577        let mut summary = format!(
578            "Accepted {} spans, rejected {} spans",
579            accepted_spans, rejected_spans
580        );
581
582        if !messages.is_empty() {
583            summary.push_str(": ");
584            summary.push_str(&messages.join("; "));
585        }
586
587        Some(summary)
588    }
589
590    /// Widen existing trace table columns to Float64 before request rewrite.
591    async fn alter_trace_table_columns_to_float64(
592        &self,
593        ctx: &QueryContextRef,
594        table_name: &str,
595        column_names: &[String],
596    ) -> ServerResult<()> {
597        let catalog_name = ctx.current_catalog().to_string();
598        let schema_name = ctx.current_schema();
599        let alter_expr = AlterTableExpr {
600            catalog_name: catalog_name.clone(),
601            schema_name: schema_name.clone(),
602            table_name: table_name.to_string(),
603            kind: Some(Kind::ModifyColumnTypes(ModifyColumnTypes {
604                modify_column_types: column_names
605                    .iter()
606                    .map(|column_name| ModifyColumnType {
607                        column_name: column_name.clone(),
608                        target_type: ColumnDataType::Float64 as i32,
609                        target_type_extension: None,
610                    })
611                    .collect(),
612            })),
613        };
614
615        if let Err(err) = self
616            .statement_executor
617            .alter_table_inner(alter_expr, ctx.clone())
618            .await
619        {
620            let table = self
621                .catalog_manager
622                .table(&catalog_name, &schema_name, table_name, None)
623                .await
624                .map_err(servers::error::Error::from)?;
625            let alter_already_applied = table
626                .map(|table| {
627                    let table_schema = table.schema();
628                    column_names.iter().all(|column_name| {
629                        table_schema
630                            .column_schema_by_name(column_name)
631                            .and_then(|table_col| {
632                                ColumnDataTypeWrapper::try_from(table_col.data_type.clone())
633                                    .ok()
634                                    .map(|wrapper| wrapper.datatype())
635                            })
636                            == Some(ColumnDataType::Float64)
637                    })
638                })
639                .unwrap_or(false);
640
641            if alter_already_applied {
642                return Ok(());
643            }
644
645            warn!(
646                table_name,
647                columns = ?column_names,
648                error = %err,
649                "failed to widen trace columns before insert"
650            );
651
652            return Err(wrap_trace_alter_failure(err));
653        }
654
655        Ok(())
656    }
657
658    /// Coerce request column types and values to match the existing table schema
659    /// for compatible type pairs. Existing table schema wins when present;
660    /// otherwise the full request batch decides a stable target type.
661    async fn reconcile_trace_column_types(
662        &self,
663        requests: &mut RowInsertRequests,
664        ctx: &QueryContextRef,
665    ) -> ServerResult<()> {
666        let catalog = ctx.current_catalog();
667        let schema = ctx.current_schema();
668
669        for req in &mut requests.inserts {
670            let table = self
671                .catalog_manager
672                .table(catalog, &schema, &req.table_name, None)
673                .await?;
674
675            let Some(rows) = req.rows.as_mut() else {
676                continue;
677            };
678
679            let table_schema = table.map(|table| table.schema());
680            let mut pending_rewrites = Vec::new();
681            let mut pending_alter_columns = Vec::new();
682
683            for (col_idx, col_schema) in rows.schema.iter().enumerate() {
684                let Some(current_type) = ColumnDataType::try_from(col_schema.datatype).ok() else {
685                    continue;
686                };
687
688                let mut observed_types = Vec::new();
689                push_observed_trace_type(&mut observed_types, current_type);
690
691                // Scan the full request first so the final type decision is not affected
692                // by row order inside the batch.
693                for row in &rows.rows {
694                    let Some(value) = row
695                        .values
696                        .get(col_idx)
697                        .and_then(|value| value.value_data.as_ref())
698                    else {
699                        continue;
700                    };
701
702                    let Some(value_type) = trace_value_datatype(value) else {
703                        continue;
704                    };
705                    push_observed_trace_type(&mut observed_types, value_type);
706                }
707
708                let existing_type = table_schema
709                    .as_ref()
710                    .and_then(|schema| schema.column_schema_by_name(&col_schema.column_name))
711                    .and_then(|table_col| {
712                        ColumnDataTypeWrapper::try_from(table_col.data_type.clone())
713                            .ok()
714                            .map(|wrapper| wrapper.datatype())
715                    });
716                let fixed_type = trace_semconv_fixed_type(&col_schema.column_name);
717
718                if !observed_types
719                    .iter()
720                    .copied()
721                    .any(is_trace_reconcile_candidate_type)
722                    && existing_type
723                        .map(|datatype| !is_trace_reconcile_candidate_type(datatype))
724                        .unwrap_or(true)
725                    && fixed_type.is_none()
726                {
727                    continue;
728                }
729
730                // Decide the final type once per column, then rewrite all affected cells
731                // together in one row pass below.
732                let Some(decision) = choose_trace_reconcile_decision(
733                    &col_schema.column_name,
734                    &observed_types,
735                    existing_type,
736                )
737                .map_err(|_| {
738                    enrich_trace_reconcile_error(
739                        &req.table_name,
740                        &col_schema.column_name,
741                        &observed_types,
742                        existing_type,
743                        fixed_type,
744                    )
745                })?
746                else {
747                    continue;
748                };
749                let target_type = decision.target_type();
750
751                if !decision.requires_alter()
752                    && observed_types
753                        .iter()
754                        .all(|observed| *observed == target_type)
755                    && col_schema.datatype == target_type as i32
756                {
757                    continue;
758                }
759
760                if decision.requires_alter()
761                    && !pending_alter_columns.contains(&col_schema.column_name)
762                {
763                    pending_alter_columns.push(col_schema.column_name.clone());
764                }
765
766                pending_rewrites.push(PendingTraceColumnRewrite {
767                    col_idx,
768                    target_type,
769                    column_name: col_schema.column_name.clone(),
770                });
771            }
772
773            if pending_rewrites.is_empty() {
774                continue;
775            }
776
777            validate_trace_column_rewrites(&rows.rows, &pending_rewrites, &req.table_name)?;
778
779            if !pending_alter_columns.is_empty() {
780                self.alter_trace_table_columns_to_float64(
781                    ctx,
782                    &req.table_name,
783                    &pending_alter_columns,
784                )
785                .await?;
786            }
787
788            // Update schema metadata before mutating row values so both stay in sync.
789            for pending_rewrite in &pending_rewrites {
790                rows.schema[pending_rewrite.col_idx].datatype = pending_rewrite.target_type as i32;
791            }
792
793            // Apply all pending column rewrites in one row pass.
794            for row in &mut rows.rows {
795                for pending_rewrite in &pending_rewrites {
796                    let Some(value) = row.values.get_mut(pending_rewrite.col_idx) else {
797                        continue;
798                    };
799                    let Some(request_type) =
800                        value.value_data.as_ref().and_then(trace_value_datatype)
801                    else {
802                        continue;
803                    };
804                    if request_type == pending_rewrite.target_type {
805                        continue;
806                    }
807
808                    value.value_data = coerce_value_data(
809                        &value.value_data,
810                        pending_rewrite.target_type,
811                        request_type,
812                    )
813                    .map_err(|_| {
814                        error::InvalidParameterSnafu {
815                            reason: format!(
816                                "failed to coerce trace column '{}' in table '{}' from {:?} to {:?}",
817                                pending_rewrite.column_name,
818                                req.table_name,
819                                request_type,
820                                pending_rewrite.target_type
821                            ),
822                        }
823                        .build()
824                    })?;
825                }
826            }
827        }
828
829        Ok(())
830    }
831}
832
833/// Preserve the original alter failure status so chunk retry behavior stays correct.
834fn wrap_trace_alter_failure<E>(err: E) -> servers::error::Error
835where
836    E: ErrorExt + Send + Sync + 'static,
837{
838    error::ExecuteGrpcQuerySnafu.into_error(BoxedError::new(err))
839}
840
#[cfg(test)]
mod tests {
    use common_error::ext::ErrorExt;
    use common_error::status_code::StatusCode;
    use servers::query_handler::TraceIngestOutcome;

    use super::{ChunkFailureReaction, Instance, wrap_trace_alter_failure};
    use crate::metrics::OTLP_TRACES_FAILURE_COUNT;

    /// Checks the reaction chosen for each explicitly handled chunk failure status.
    #[test]
    fn test_classify_trace_chunk_failure() {
        let cases = [
            (
                StatusCode::InvalidArguments,
                ChunkFailureReaction::RetryPerSpan,
            ),
            (
                StatusCode::InvalidSyntax,
                ChunkFailureReaction::RetryPerSpan,
            ),
            (StatusCode::Unsupported, ChunkFailureReaction::RetryPerSpan),
            (
                StatusCode::TableColumnNotFound,
                ChunkFailureReaction::RetryPerSpan,
            ),
            (
                StatusCode::TableNotFound,
                ChunkFailureReaction::RetryPerSpan,
            ),
            (
                StatusCode::DatabaseNotFound,
                ChunkFailureReaction::DiscardChunk,
            ),
            (
                StatusCode::DeadlineExceeded,
                ChunkFailureReaction::Propagate,
            ),
            (StatusCode::Cancelled, ChunkFailureReaction::Propagate),
            (
                StatusCode::StorageUnavailable,
                ChunkFailureReaction::Propagate,
            ),
            (StatusCode::Internal, ChunkFailureReaction::Propagate),
            (StatusCode::RegionNotReady, ChunkFailureReaction::Propagate),
            (
                StatusCode::TableUnavailable,
                ChunkFailureReaction::Propagate,
            ),
            (StatusCode::RegionBusy, ChunkFailureReaction::Propagate),
            (
                StatusCode::RuntimeResourcesExhausted,
                ChunkFailureReaction::Propagate,
            ),
        ];

        for (status, expected) in cases {
            assert_eq!(
                Instance::classify_trace_chunk_failure(status),
                expected,
                "status: {status:?}"
            );
        }
    }

    /// Checks which per-span failure statuses are propagated and which are not.
    #[test]
    fn test_classify_trace_span_failure() {
        for status in [StatusCode::DeadlineExceeded, StatusCode::StorageUnavailable] {
            assert!(Instance::should_propagate_trace_span_failure(status));
        }

        assert!(!Instance::should_propagate_trace_span_failure(
            StatusCode::InvalidArguments
        ));
    }

    /// Write costs added to the same outcome accumulate.
    #[test]
    fn test_add_trace_write_cost() {
        let mut outcome = TraceIngestOutcome::default();
        for cost in [3, 5] {
            Instance::add_trace_write_cost(&mut outcome, cost);
        }
        assert_eq!(outcome.write_cost, 8);
    }

    /// A partial rejection yields a summary with details; no rejection yields no message.
    #[test]
    fn test_finish_trace_failure_message() {
        let details = vec![String::from("Rejected span trace:span (InvalidArguments)")];
        let message = Instance::finish_trace_failure_message(3, 2, details).unwrap();
        for expected_fragment in [
            "Accepted 3 spans, rejected 2 spans",
            "Rejected span trace:span",
        ] {
            assert!(message.contains(expected_fragment));
        }

        assert!(Instance::finish_trace_failure_message(2, 0, Vec::new()).is_none());
    }

    /// With rejected spans but no detail messages, only the summary line is returned.
    #[test]
    fn test_finish_trace_failure_message_without_detail_messages() {
        let message = Instance::finish_trace_failure_message(0, 2, Vec::new());
        assert_eq!(
            message.as_deref(),
            Some("Accepted 0 spans, rejected 2 spans")
        );
    }

    /// Recording a failure message bumps the failure counter for the given label by one.
    #[test]
    fn test_push_trace_failure_message_increments_labeled_counter() {
        let label = "retry_per_span_counter_test";
        // Look the labeled counter up on every read, exactly like the code under test would.
        let current_count = || OTLP_TRACES_FAILURE_COUNT.with_label_values(&[label]).get();
        let before = current_count();
        let mut recorded = vec![];

        Instance::push_trace_failure_message(
            &mut recorded,
            label,
            String::from("Chunk fallback triggered by InvalidArguments"),
        );

        assert_eq!(recorded.len(), 1);
        assert_eq!(current_count(), before + 1);
    }

    /// Pushing five messages keeps only the first four.
    #[test]
    fn test_push_trace_failure_message_caps_recorded_messages() {
        let label = "retry_per_span_limit_test";
        let mut recorded = vec![];

        for idx in 0..5 {
            Instance::push_trace_failure_message(&mut recorded, label, format!("failure-{idx}"));
        }

        let expected: Vec<String> = (0..4).map(|idx| format!("failure-{idx}")).collect();
        assert_eq!(recorded.len(), 4);
        assert_eq!(recorded, expected);
    }

    /// A status without an explicit classification results in the chunk being discarded.
    #[test]
    fn test_classify_trace_chunk_failure_defaults_to_discard() {
        let reaction = Instance::classify_trace_chunk_failure(StatusCode::Unknown);
        assert_eq!(reaction, ChunkFailureReaction::DiscardChunk);
    }

    /// The wrapped alter failure reports the status code of its source error.
    #[test]
    fn test_wrap_trace_alter_failure_preserves_status_code() {
        let source = servers::error::TableNotFoundSnafu {
            catalog: String::from("greptime"),
            schema: String::from("public"),
            table: String::from("trace_type_missing"),
        }
        .build();

        let wrapped = wrap_trace_alter_failure(source);

        assert_eq!(wrapped.status_code(), StatusCode::TableNotFound);
    }
}