Skip to main content

frontend/instance/
otlp.rs

1// Copyright 2023 Greptime Team
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7//     http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15use std::sync::Arc;
16
17use api::helper::ColumnDataTypeWrapper;
18use api::v1::alter_table_expr::Kind;
19use api::v1::{
20    AlterTableExpr, ColumnDataType, ModifyColumnType, ModifyColumnTypes, RowInsertRequests,
21};
22use async_trait::async_trait;
23use auth::{PermissionChecker, PermissionCheckerRef, PermissionReq};
24use client::Output;
25use common_error::ext::{BoxedError, ErrorExt};
26use common_error::status_code::StatusCode;
27use common_query::prelude::GREPTIME_PHYSICAL_TABLE;
28use common_telemetry::{tracing, warn};
29use itertools::Itertools;
30use opentelemetry_proto::tonic::collector::logs::v1::ExportLogsServiceRequest;
31use opentelemetry_proto::tonic::collector::trace::v1::ExportTraceServiceRequest;
32use otel_arrow_rust::proto::opentelemetry::collector::metrics::v1::ExportMetricsServiceRequest;
33use pipeline::{GreptimePipelineParams, PipelineWay};
34use servers::error::{self, AuthSnafu, Result as ServerResult};
35use servers::http::prom_store::PHYSICAL_TABLE_PARAM;
36use servers::interceptor::{OpenTelemetryProtocolInterceptor, OpenTelemetryProtocolInterceptorRef};
37use servers::otlp;
38use servers::otlp::trace::TraceAuxData;
39use servers::otlp::trace::coerce::{coerce_value_data, trace_value_datatype};
40use servers::otlp::trace::span::{TraceSpan, TraceSpanGroup};
41use servers::query_handler::{
42    OpenTelemetryProtocolHandler, PipelineHandlerRef, TraceIngestOutcome,
43};
44use session::context::QueryContextRef;
45use snafu::{IntoError, ResultExt};
46use table::requests::{OTLP_METRIC_COMPAT_KEY, OTLP_METRIC_COMPAT_PROM};
47
48use crate::instance::Instance;
49use crate::instance::otlp::trace_types::{
50    PendingTraceColumnRewrite, choose_trace_reconcile_decision, enrich_trace_reconcile_error,
51    is_trace_reconcile_candidate_type, push_observed_trace_type, validate_trace_column_rewrites,
52};
53use crate::metrics::{
54    OTLP_LOGS_ROWS, OTLP_METRICS_ROWS, OTLP_TRACES_FAILURE_COUNT, OTLP_TRACES_ROWS,
55};
56
57pub mod trace_types;
58
/// Maximum number of spans grouped into one chunk-level insert attempt.
const TRACE_INGEST_CHUNK_SIZE: usize = 64;
/// Maximum number of failure detail messages kept for one ingest request.
const TRACE_FAILURE_MESSAGE_LIMIT: usize = 4;

/// How the trace ingest path reacts to a failed insert attempt.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum ChunkFailureReaction {
    /// Re-send the spans of the failed chunk one at a time.
    RetryPerSpan,
    /// Count every span of the chunk as rejected and keep going.
    DiscardChunk,
    /// Stop ingesting and hand the error back to the caller.
    Propagate,
}

impl ChunkFailureReaction {
    /// Returns the label value used when this reaction is recorded on the
    /// trace failure counter.
    fn as_metric_label(self) -> &'static str {
        match self {
            ChunkFailureReaction::Propagate => "propagate_failure",
            ChunkFailureReaction::DiscardChunk => "discard_chunk",
            ChunkFailureReaction::RetryPerSpan => "retry_per_span",
        }
    }
}
78
/// Read-only settings shared by every chunk write of one trace ingest call.
79struct TraceChunkIngestContext<'a> {
    /// Handler that is cloned into each span-to-insert-request conversion.
80    pipeline_handler: PipelineHandlerRef,
    /// Pipeline selection forwarded to the span and auxiliary-table conversions.
81    pipeline: &'a PipelineWay,
    /// Pipeline parameters forwarded to the span conversion.
82    pipeline_params: &'a GreptimePipelineParams,
    /// Target table name forwarded to the span and auxiliary-table conversions.
83    table_name: &'a str,
    /// True when `pipeline` is `PipelineWay::OtlpTraceDirectV1`; selects the
    /// reconcile-then-trace-insert path instead of the log insert path.
84    is_trace_v1_model: bool,
85}
86
/// Mutable bookkeeping accumulated while one trace request is ingested.
87struct TraceIngestState {
    /// Collects spans observed after a successful main-table write; later
    /// converted into auxiliary-table insert requests.
88    aux_data: TraceAuxData,
    /// Running result: accepted/rejected span counts, write cost and the final
    /// error message.
89    outcome: TraceIngestOutcome,
    /// Failure details gathered along the way (capped by
    /// `TRACE_FAILURE_MESSAGE_LIMIT` when pushed via
    /// `push_trace_failure_message`).
90    failure_messages: Vec<String>,
91}
92
93#[async_trait]
94impl OpenTelemetryProtocolHandler for Instance {
95    #[tracing::instrument(skip_all)]
96    async fn metrics(
97        &self,
98        request: ExportMetricsServiceRequest,
99        ctx: QueryContextRef,
100    ) -> ServerResult<Output> {
101        self.plugins
102            .get::<PermissionCheckerRef>()
103            .as_ref()
104            .check_permission(ctx.current_user(), PermissionReq::Otlp)
105            .context(AuthSnafu)?;
106
107        let interceptor_ref = self
108            .plugins
109            .get::<OpenTelemetryProtocolInterceptorRef<servers::error::Error>>();
110        interceptor_ref.pre_execute(ctx.clone())?;
111
112        let input_names = request
113            .resource_metrics
114            .iter()
115            .flat_map(|r| r.scope_metrics.iter())
116            .flat_map(|s| s.metrics.iter().map(|m| &m.name))
117            .collect::<Vec<_>>();
118
119        // See [`OtlpMetricCtx`] for details
120        let is_legacy = self.check_otlp_legacy(&input_names, ctx.clone()).await?;
121
122        let mut metric_ctx = ctx
123            .protocol_ctx()
124            .get_otlp_metric_ctx()
125            .cloned()
126            .unwrap_or_default();
127        metric_ctx.is_legacy = is_legacy;
128
129        let (requests, rows) = otlp::metrics::to_grpc_insert_requests(request, &mut metric_ctx)?;
130        OTLP_METRICS_ROWS.inc_by(rows as u64);
131
132        let ctx = if !is_legacy {
133            let mut c = (*ctx).clone();
134            c.set_extension(OTLP_METRIC_COMPAT_KEY, OTLP_METRIC_COMPAT_PROM.to_string());
135            Arc::new(c)
136        } else {
137            ctx
138        };
139
140        // If the user uses the legacy path, it is by default without metric engine.
141        if metric_ctx.is_legacy || !metric_ctx.with_metric_engine {
142            self.handle_row_inserts(requests, ctx, false, false)
143                .await
144                .map_err(BoxedError::new)
145                .context(error::ExecuteGrpcQuerySnafu)
146        } else {
147            let physical_table = ctx
148                .extension(PHYSICAL_TABLE_PARAM)
149                .unwrap_or(GREPTIME_PHYSICAL_TABLE)
150                .to_string();
151            self.handle_metric_row_inserts(requests, ctx, physical_table.clone())
152                .await
153                .map_err(BoxedError::new)
154                .context(error::ExecuteGrpcQuerySnafu)
155        }
156    }
157
158    #[tracing::instrument(skip_all)]
159    async fn traces(
160        &self,
161        pipeline_handler: PipelineHandlerRef,
162        request: ExportTraceServiceRequest,
163        pipeline: PipelineWay,
164        pipeline_params: GreptimePipelineParams,
165        table_name: String,
166        ctx: QueryContextRef,
167    ) -> ServerResult<TraceIngestOutcome> {
168        self.plugins
169            .get::<PermissionCheckerRef>()
170            .as_ref()
171            .check_permission(ctx.current_user(), PermissionReq::Otlp)
172            .context(AuthSnafu)?;
173
174        let interceptor_ref = self
175            .plugins
176            .get::<OpenTelemetryProtocolInterceptorRef<servers::error::Error>>();
177        interceptor_ref.pre_execute(ctx.clone())?;
178
179        let spans = otlp::trace::span::parse(request);
180        self.ingest_trace_spans(
181            pipeline_handler,
182            &pipeline,
183            &pipeline_params,
184            table_name,
185            spans,
186            ctx,
187        )
188        .await
189    }
190
191    #[tracing::instrument(skip_all)]
192    async fn logs(
193        &self,
194        pipeline_handler: PipelineHandlerRef,
195        request: ExportLogsServiceRequest,
196        pipeline: PipelineWay,
197        pipeline_params: GreptimePipelineParams,
198        table_name: String,
199        ctx: QueryContextRef,
200    ) -> ServerResult<Vec<Output>> {
201        self.plugins
202            .get::<PermissionCheckerRef>()
203            .as_ref()
204            .check_permission(ctx.current_user(), PermissionReq::Otlp)
205            .context(AuthSnafu)?;
206
207        let interceptor_ref = self
208            .plugins
209            .get::<OpenTelemetryProtocolInterceptorRef<servers::error::Error>>();
210        interceptor_ref.pre_execute(ctx.clone())?;
211
212        let opt_req = otlp::logs::to_grpc_insert_requests(
213            request,
214            pipeline,
215            pipeline_params,
216            table_name,
217            &ctx,
218            pipeline_handler,
219        )
220        .await?;
221
222        let mut outputs = vec![];
223
224        for (temp_ctx, requests) in opt_req.as_req_iter(ctx) {
225            let cnt = requests
226                .inserts
227                .iter()
228                .filter_map(|r| r.rows.as_ref().map(|r| r.rows.len()))
229                .sum::<usize>();
230
231            let o = self
232                .handle_log_inserts(requests, temp_ctx)
233                .await
234                .inspect(|_| OTLP_LOGS_ROWS.inc_by(cnt as u64))
235                .map_err(BoxedError::new)
236                .context(error::ExecuteGrpcQuerySnafu)?;
237            outputs.push(o);
238        }
239
240        Ok(outputs)
241    }
242}
243
244impl Instance {
245    /// Ingest OTLP trace spans with chunk-level writes and span-level fallback on
246    /// deterministic chunk failures.
247    async fn ingest_trace_spans(
248        &self,
249        pipeline_handler: PipelineHandlerRef,
250        pipeline: &PipelineWay,
251        pipeline_params: &GreptimePipelineParams,
252        table_name: String,
253        groups: Vec<TraceSpanGroup>,
254        ctx: QueryContextRef,
255    ) -> ServerResult<TraceIngestOutcome> {
256        let is_trace_v1_model = matches!(pipeline, PipelineWay::OtlpTraceDirectV1);
257        let ingest_ctx = TraceChunkIngestContext {
258            pipeline_handler,
259            pipeline,
260            pipeline_params,
261            table_name: &table_name,
262            is_trace_v1_model,
263        };
264        let mut ingest_state = TraceIngestState {
265            aux_data: TraceAuxData::default(),
266            outcome: TraceIngestOutcome::default(),
267            failure_messages: Vec::new(),
268        };
269
270        for group in groups {
271            let chunks = group
272                .spans
273                .into_iter()
274                .chunks(TRACE_INGEST_CHUNK_SIZE)
275                .into_iter()
276                .map(|chunk| chunk.collect::<Vec<_>>())
277                .collect::<Vec<_>>();
278            for chunk in chunks {
279                self.ingest_trace_chunk(&ingest_ctx, chunk, ctx.clone(), &mut ingest_state)
280                    .await?;
281            }
282        }
283
284        OTLP_TRACES_ROWS.inc_by(ingest_state.outcome.accepted_spans as u64);
285
286        if !ingest_state.aux_data.is_empty() {
287            // Auxiliary trace tables are derived from spans whose main-table
288            // writes are already confirmed, so they never create new accepted
289            // spans and they do not affect rejected span counts.
290            let (aux_requests, _) = otlp::trace::to_grpc_insert_requests_for_aux_tables(
291                std::mem::take(&mut ingest_state.aux_data),
292                ingest_ctx.pipeline,
293                ingest_ctx.table_name,
294            )?;
295
296            if !aux_requests.inserts.is_empty() {
297                match self
298                    .insert_trace_requests(aux_requests, ingest_ctx.is_trace_v1_model, ctx)
299                    .await
300                {
301                    Ok(output) => {
302                        Self::add_trace_write_cost(&mut ingest_state.outcome, output.meta.cost);
303                    }
304                    Err(err) => {
305                        Self::push_trace_failure_message(
306                            &mut ingest_state.failure_messages,
307                            "aux_table_update_failed",
308                            format!(
309                                "Auxiliary trace tables were not fully updated ({})",
310                                err.status_code().as_ref()
311                            ),
312                        );
313                    }
314                }
315            }
316        }
317
318        ingest_state.outcome.error_message = Self::finish_trace_failure_message(
319            ingest_state.outcome.accepted_spans,
320            ingest_state.outcome.rejected_spans,
321            ingest_state.failure_messages,
322        );
323
324        Ok(ingest_state.outcome)
325    }
326
327    /// Ingest one owned trace chunk so successful spans can be moved into the
328    /// accepted set without extra cloning.
329    async fn ingest_trace_chunk(
330        &self,
331        ingest_ctx: &TraceChunkIngestContext<'_>,
332        chunk: Vec<TraceSpan>,
333        ctx: QueryContextRef,
334        ingest_state: &mut TraceIngestState,
335    ) -> ServerResult<()> {
336        // Try the fast path first so healthy batches keep their original
337        // throughput and write amplification stays low.
338        let (requests, chunk_rows) = otlp::trace::to_grpc_insert_requests_from_spans(
339            &chunk,
340            ingest_ctx.pipeline,
341            ingest_ctx.pipeline_params,
342            ingest_ctx.table_name,
343            &ctx,
344            ingest_ctx.pipeline_handler.clone(),
345        )?;
346
347        match self
348            .insert_trace_requests(requests, ingest_ctx.is_trace_v1_model, ctx.clone())
349            .await
350        {
351            Ok(output) => {
352                Self::add_trace_write_cost(&mut ingest_state.outcome, output.meta.cost);
353                ingest_state.outcome.accepted_spans += chunk_rows;
354                for span in &chunk {
355                    ingest_state.aux_data.observe_span(span);
356                }
357            }
358            Err(err) => match Self::classify_trace_chunk_failure(err.status_code()) {
359                ChunkFailureReaction::RetryPerSpan => {
360                    Self::push_trace_failure_message(
361                        &mut ingest_state.failure_messages,
362                        ChunkFailureReaction::RetryPerSpan.as_metric_label(),
363                        format!("Chunk fallback triggered by {}", err.status_code().as_ref()),
364                    );
365                    // Only deterministic failures are retried span by span.
366                    // This includes schemaless table or column creation paths for
367                    // trace ingestion. Ambiguous failures are handled below
368                    // without retrying because the chunk may already have been
369                    // ingested.
370                    self.ingest_trace_chunk_span_by_span(
371                        ingest_ctx,
372                        chunk,
373                        ctx.clone(),
374                        ingest_state,
375                    )
376                    .await?;
377                }
378                ChunkFailureReaction::DiscardChunk => {
379                    ingest_state.outcome.rejected_spans += chunk.len();
380                    Self::push_trace_failure_message(
381                        &mut ingest_state.failure_messages,
382                        ChunkFailureReaction::DiscardChunk.as_metric_label(),
383                        format!(
384                            "Discarded {} spans after ambiguous chunk failure ({})",
385                            chunk.len(),
386                            err.status_code().as_ref()
387                        ),
388                    );
389                    // TODO(shuiyisong): Add an idempotent retry-safe recovery path for
390                    // ambiguous chunk failures such as timeout-like errors.
391                }
392                // Retryable or ambiguous failures must fail the request instead of
393                // becoming partial success. This path is not retry-safe because the
394                // chunk may already have been committed before the error surfaced.
395                ChunkFailureReaction::Propagate => {
396                    Self::push_trace_failure_message(
397                        &mut ingest_state.failure_messages,
398                        ChunkFailureReaction::Propagate.as_metric_label(),
399                        format!(
400                            "Propagating retryable chunk failure ({})",
401                            err.status_code().as_ref()
402                        ),
403                    );
404                    return Err(err);
405                }
406            },
407        }
408
409        Ok(())
410    }
411
412    /// Retry spans one by one only after a deterministic chunk failure.
413    async fn ingest_trace_chunk_span_by_span(
414        &self,
415        ingest_ctx: &TraceChunkIngestContext<'_>,
416        chunk: Vec<TraceSpan>,
417        ctx: QueryContextRef,
418        ingest_state: &mut TraceIngestState,
419    ) -> ServerResult<()> {
420        for span in chunk {
421            let (requests, rows) = otlp::trace::to_grpc_insert_requests_from_spans(
422                std::slice::from_ref(&span),
423                ingest_ctx.pipeline,
424                ingest_ctx.pipeline_params,
425                ingest_ctx.table_name,
426                &ctx,
427                ingest_ctx.pipeline_handler.clone(),
428            )?;
429
430            match self
431                .insert_trace_requests(requests, ingest_ctx.is_trace_v1_model, ctx.clone())
432                .await
433            {
434                Ok(output) => {
435                    Self::add_trace_write_cost(&mut ingest_state.outcome, output.meta.cost);
436                    ingest_state.outcome.accepted_spans += rows;
437                    ingest_state.aux_data.observe_span(&span);
438                }
439                Err(err) => {
440                    if Self::should_propagate_trace_span_failure(err.status_code()) {
441                        Self::push_trace_failure_message(
442                            &mut ingest_state.failure_messages,
443                            ChunkFailureReaction::Propagate.as_metric_label(),
444                            format!(
445                                "Propagating retryable span failure for {}:{} ({})",
446                                span.trace_id,
447                                span.span_id,
448                                err.status_code().as_ref()
449                            ),
450                        );
451                        return Err(err);
452                    }
453
454                    ingest_state.outcome.rejected_spans += 1;
455                    Self::push_trace_failure_message(
456                        &mut ingest_state.failure_messages,
457                        "span_rejected",
458                        format!(
459                            "Rejected span {}:{} ({})",
460                            span.trace_id,
461                            span.span_id,
462                            err.status_code().as_ref()
463                        ),
464                    );
465                }
466            }
467        }
468
469        Ok(())
470    }
471
472    /// Reconcile and insert one trace request batch.
473    async fn insert_trace_requests(
474        &self,
475        mut requests: RowInsertRequests,
476        is_trace_v1_model: bool,
477        ctx: QueryContextRef,
478    ) -> ServerResult<Output> {
479        if is_trace_v1_model {
480            self.reconcile_trace_column_types(&mut requests, &ctx)
481                .await?;
482            self.handle_trace_inserts(requests, ctx)
483                .await
484                .map_err(BoxedError::new)
485                .context(error::ExecuteGrpcQuerySnafu)
486        } else {
487            self.handle_log_inserts(requests, ctx)
488                .await
489                .map_err(BoxedError::new)
490                .context(error::ExecuteGrpcQuerySnafu)
491        }
492    }
493
494    fn classify_trace_chunk_failure(status: StatusCode) -> ChunkFailureReaction {
495        match status {
496            StatusCode::InvalidArguments
497            | StatusCode::InvalidSyntax
498            | StatusCode::Unsupported
499            | StatusCode::TableNotFound
500            | StatusCode::TableColumnNotFound => ChunkFailureReaction::RetryPerSpan,
501            StatusCode::DatabaseNotFound => ChunkFailureReaction::DiscardChunk,
502            StatusCode::Cancelled | StatusCode::DeadlineExceeded => ChunkFailureReaction::Propagate,
503            _ if status.is_retryable() => ChunkFailureReaction::Propagate,
504            _ => ChunkFailureReaction::DiscardChunk,
505        }
506    }
507
508    fn should_propagate_trace_span_failure(status: StatusCode) -> bool {
509        matches!(
510            Self::classify_trace_chunk_failure(status),
511            ChunkFailureReaction::Propagate
512        )
513    }
514
    /// Add the cost reported by one write to the running `write_cost` total of
    /// `outcome`.
515    fn add_trace_write_cost(outcome: &mut TraceIngestOutcome, cost: usize) {
516        outcome.write_cost += cost;
517    }
518
519    fn push_trace_failure_message(messages: &mut Vec<String>, label: &str, message: String) {
520        OTLP_TRACES_FAILURE_COUNT.with_label_values(&[label]).inc();
521
522        if messages.len() < TRACE_FAILURE_MESSAGE_LIMIT {
523            messages.push(message);
524        } else if messages.len() == TRACE_FAILURE_MESSAGE_LIMIT {
525            tracing::debug!(
526                label,
527                limit = TRACE_FAILURE_MESSAGE_LIMIT,
528                "Trace ingest failure message limit reached; suppressing additional failure details"
529            );
530        }
531    }
532
533    fn finish_trace_failure_message(
534        accepted_spans: usize,
535        rejected_spans: usize,
536        messages: Vec<String>,
537    ) -> Option<String> {
538        if rejected_spans == 0 && messages.is_empty() {
539            return None;
540        }
541
542        let mut summary = format!(
543            "Accepted {} spans, rejected {} spans",
544            accepted_spans, rejected_spans
545        );
546
547        if !messages.is_empty() {
548            summary.push_str(": ");
549            summary.push_str(&messages.join("; "));
550        }
551
552        Some(summary)
553    }
554
555    /// Widen existing trace table columns to Float64 before request rewrite.
556    async fn alter_trace_table_columns_to_float64(
557        &self,
558        ctx: &QueryContextRef,
559        table_name: &str,
560        column_names: &[String],
561    ) -> ServerResult<()> {
562        let catalog_name = ctx.current_catalog().to_string();
563        let schema_name = ctx.current_schema();
564        let alter_expr = AlterTableExpr {
565            catalog_name: catalog_name.clone(),
566            schema_name: schema_name.clone(),
567            table_name: table_name.to_string(),
568            kind: Some(Kind::ModifyColumnTypes(ModifyColumnTypes {
569                modify_column_types: column_names
570                    .iter()
571                    .map(|column_name| ModifyColumnType {
572                        column_name: column_name.clone(),
573                        target_type: ColumnDataType::Float64 as i32,
574                        target_type_extension: None,
575                    })
576                    .collect(),
577            })),
578        };
579
580        if let Err(err) = self
581            .statement_executor
582            .alter_table_inner(alter_expr, ctx.clone())
583            .await
584        {
585            let table = self
586                .catalog_manager
587                .table(&catalog_name, &schema_name, table_name, None)
588                .await
589                .map_err(servers::error::Error::from)?;
590            let alter_already_applied = table
591                .map(|table| {
592                    let table_schema = table.schema();
593                    column_names.iter().all(|column_name| {
594                        table_schema
595                            .column_schema_by_name(column_name)
596                            .and_then(|table_col| {
597                                ColumnDataTypeWrapper::try_from(table_col.data_type.clone())
598                                    .ok()
599                                    .map(|wrapper| wrapper.datatype())
600                            })
601                            == Some(ColumnDataType::Float64)
602                    })
603                })
604                .unwrap_or(false);
605
606            if alter_already_applied {
607                return Ok(());
608            }
609
610            warn!(
611                table_name,
612                columns = ?column_names,
613                error = %err,
614                "failed to widen trace columns before insert"
615            );
616
617            return Err(wrap_trace_alter_failure(err));
618        }
619
620        Ok(())
621    }
622
623    /// Coerce request column types and values to match the existing table schema
624    /// for compatible type pairs. Existing table schema wins when present;
625    /// otherwise the full request batch decides a stable target type.
    ///
    /// For every insert request in `requests` this:
    /// 1. looks up the target table in the current catalog/schema;
    /// 2. collects, per request column, the declared type plus the types of
    ///    all non-null values in that column;
    /// 3. asks `choose_trace_reconcile_decision` for a target type and records
    ///    a pending rewrite (and, if required, a pending table alter);
    /// 4. validates the pending rewrites, alters the table when needed, then
    ///    rewrites the request schema and the affected cell values in place.
    ///
    /// # Errors
    ///
    /// Returns the catalog lookup error, the enriched reconcile-decision
    /// error, the rewrite validation error, the table alter error, or an
    /// `InvalidParameter` error when a cell value cannot be coerced.
626    async fn reconcile_trace_column_types(
627        &self,
628        requests: &mut RowInsertRequests,
629        ctx: &QueryContextRef,
630    ) -> ServerResult<()> {
631        let catalog = ctx.current_catalog();
632        let schema = ctx.current_schema();
633
634        for req in &mut requests.inserts {
            // NOTE(review): `table` is an `Option`; `None` presumably means
            // the table does not exist yet — confirm against the catalog API.
635            let table = self
636                .catalog_manager
637                .table(catalog, &schema, &req.table_name, None)
638                .await?;
639
            // Requests without a row payload have nothing to reconcile.
640            let Some(rows) = req.rows.as_mut() else {
641                continue;
642            };
643
644            let table_schema = table.map(|table| table.schema());
645            let mut pending_rewrites = Vec::new();
646            let mut pending_alter_columns = Vec::new();
647
648            for (col_idx, col_schema) in rows.schema.iter().enumerate() {
                // Columns whose declared datatype id is not a known
                // `ColumnDataType` are left untouched.
649                let Some(current_type) = ColumnDataType::try_from(col_schema.datatype).ok() else {
650                    continue;
651                };
652
                // Start from the declared column type, then add every value
                // type seen in this column.
653                let mut observed_types = Vec::new();
654                push_observed_trace_type(&mut observed_types, current_type);
655
656                // Scan the full request first so the final type decision is not affected
657                // by row order inside the batch.
658                for row in &rows.rows {
                    // Missing cells and cells without value data are skipped.
659                    let Some(value) = row
660                        .values
661                        .get(col_idx)
662                        .and_then(|value| value.value_data.as_ref())
663                    else {
664                        continue;
665                    };
666
667                    let Some(value_type) = trace_value_datatype(value) else {
668                        continue;
669                    };
670                    push_observed_trace_type(&mut observed_types, value_type);
671                }
672
                // Type of the same-named column in the existing table, if the
                // table and the column exist and the type converts.
673                let existing_type = table_schema
674                    .as_ref()
675                    .and_then(|schema| schema.column_schema_by_name(&col_schema.column_name))
676                    .and_then(|table_col| {
677                        ColumnDataTypeWrapper::try_from(table_col.data_type.clone())
678                            .ok()
679                            .map(|wrapper| wrapper.datatype())
680                    });
681
                // Skip the column when neither an observed type nor the
                // existing table type is a reconcile candidate.
682                if !observed_types
683                    .iter()
684                    .copied()
685                    .any(is_trace_reconcile_candidate_type)
686                    && existing_type
687                        .map(|datatype| !is_trace_reconcile_candidate_type(datatype))
688                        .unwrap_or(true)
689                {
690                    continue;
691                }
692
693                // Decide the final type once per column, then rewrite all affected cells
694                // together in one row pass below.
695                let Some(decision) =
696                    choose_trace_reconcile_decision(&observed_types, existing_type).map_err(
697                        |_| {
698                            enrich_trace_reconcile_error(
699                                &req.table_name,
700                                &col_schema.column_name,
701                                &observed_types,
702                                existing_type,
703                            )
704                        },
705                    )?
706                else {
707                    continue;
708                };
709                let target_type = decision.target_type();
710
                // Nothing to rewrite when no alter is needed and both the
                // declared type and every observed type already equal the
                // target type.
711                if !decision.requires_alter()
712                    && observed_types
713                        .iter()
714                        .all(|observed| *observed == target_type)
715                    && col_schema.datatype == target_type as i32
716                {
717                    continue;
718                }
719
                // Queue each column that needs a table alter only once.
720                if decision.requires_alter()
721                    && !pending_alter_columns.contains(&col_schema.column_name)
722                {
723                    pending_alter_columns.push(col_schema.column_name.clone());
724                }
725
726                pending_rewrites.push(PendingTraceColumnRewrite {
727                    col_idx,
728                    target_type,
729                    column_name: col_schema.column_name.clone(),
730                });
731            }
732
733            if pending_rewrites.is_empty() {
734                continue;
735            }
736
            // Validation runs before the alter, so a batch rejected here does
            // not trigger a table alteration.
737            validate_trace_column_rewrites(&rows.rows, &pending_rewrites, &req.table_name)?;
738
739            if !pending_alter_columns.is_empty() {
740                self.alter_trace_table_columns_to_float64(
741                    ctx,
742                    &req.table_name,
743                    &pending_alter_columns,
744                )
745                .await?;
746            }
747
748            // Update schema metadata before mutating row values so both stay in sync.
749            for pending_rewrite in &pending_rewrites {
750                rows.schema[pending_rewrite.col_idx].datatype = pending_rewrite.target_type as i32;
751            }
752
753            // Apply all pending column rewrites in one row pass.
754            for row in &mut rows.rows {
755                for pending_rewrite in &pending_rewrites {
                    // Rows that are too short for this column are skipped.
756                    let Some(value) = row.values.get_mut(pending_rewrite.col_idx) else {
757                        continue;
758                    };
                    // Cells without value data (or with an unmapped value
                    // type) are left as they are.
759                    let Some(request_type) =
760                        value.value_data.as_ref().and_then(trace_value_datatype)
761                    else {
762                        continue;
763                    };
                    // Values already in the target type need no coercion.
764                    if request_type == pending_rewrite.target_type {
765                        continue;
766                    }
767
768                    value.value_data = coerce_value_data(
769                        &value.value_data,
770                        pending_rewrite.target_type,
771                        request_type,
772                    )
773                    .map_err(|_| {
774                        error::InvalidParameterSnafu {
775                            reason: format!(
776                                "failed to coerce trace column '{}' in table '{}' from {:?} to {:?}",
777                                pending_rewrite.column_name,
778                                req.table_name,
779                                request_type,
780                                pending_rewrite.target_type
781                            ),
782                        }
783                        .build()
784                    })?;
785                }
786            }
787        }
788
789        Ok(())
790    }
791}
792
793/// Preserve the original alter failure status so chunk retry behavior stays correct.
794fn wrap_trace_alter_failure<E>(err: E) -> servers::error::Error
795where
796    E: ErrorExt + Send + Sync + 'static,
797{
798    error::ExecuteGrpcQuerySnafu.into_error(BoxedError::new(err))
799}
800
#[cfg(test)]
mod tests {
    use common_error::ext::ErrorExt;
    use common_error::status_code::StatusCode;
    use servers::query_handler::TraceIngestOutcome;

    use super::{ChunkFailureReaction, Instance, wrap_trace_alter_failure};
    use crate::metrics::OTLP_TRACES_FAILURE_COUNT;

    /// Checks the reaction chosen for each status code a chunk write can fail with.
    #[test]
    fn test_classify_trace_chunk_failure() {
        // Status codes expected to trigger a per-span retry of the chunk.
        let retry_per_span = [
            StatusCode::InvalidArguments,
            StatusCode::InvalidSyntax,
            StatusCode::Unsupported,
            StatusCode::TableColumnNotFound,
            StatusCode::TableNotFound,
        ];
        for code in retry_per_span {
            // Render the code up front so a failing case names itself.
            let label = format!("{code:?}");
            assert_eq!(
                Instance::classify_trace_chunk_failure(code),
                ChunkFailureReaction::RetryPerSpan,
                "unexpected reaction for {label}"
            );
        }

        // The only status code in this test expected to discard the whole chunk.
        let discard = [StatusCode::DatabaseNotFound];
        for code in discard {
            let label = format!("{code:?}");
            assert_eq!(
                Instance::classify_trace_chunk_failure(code),
                ChunkFailureReaction::DiscardChunk,
                "unexpected reaction for {label}"
            );
        }

        // Status codes expected to be propagated to the caller unchanged.
        let propagate = [
            StatusCode::DeadlineExceeded,
            StatusCode::Cancelled,
            StatusCode::StorageUnavailable,
            StatusCode::Internal,
            StatusCode::RegionNotReady,
            StatusCode::TableUnavailable,
            StatusCode::RegionBusy,
            StatusCode::RuntimeResourcesExhausted,
        ];
        for code in propagate {
            let label = format!("{code:?}");
            assert_eq!(
                Instance::classify_trace_chunk_failure(code),
                ChunkFailureReaction::Propagate,
                "unexpected reaction for {label}"
            );
        }
    }

    /// Checks which per-span failures are propagated and which are not.
    #[test]
    fn test_classify_trace_span_failure() {
        for code in [StatusCode::DeadlineExceeded, StatusCode::StorageUnavailable] {
            assert!(Instance::should_propagate_trace_span_failure(code));
        }

        assert!(!Instance::should_propagate_trace_span_failure(
            StatusCode::InvalidArguments
        ));
    }

    /// Write costs added to an outcome accumulate.
    #[test]
    fn test_add_trace_write_cost() {
        let mut outcome = TraceIngestOutcome::default();
        for cost in [3, 5] {
            Instance::add_trace_write_cost(&mut outcome, cost);
        }

        assert_eq!(outcome.write_cost, 8);
    }

    /// A summary is produced when spans were rejected, and none when nothing was rejected.
    #[test]
    fn test_finish_trace_failure_message() {
        let details = vec!["Rejected span trace:span (InvalidArguments)".to_string()];
        let summary = Instance::finish_trace_failure_message(3, 2, details).unwrap();
        for fragment in [
            "Accepted 3 spans, rejected 2 spans",
            "Rejected span trace:span",
        ] {
            assert!(summary.contains(fragment));
        }

        let no_rejections = Instance::finish_trace_failure_message(2, 0, vec![]);
        assert!(no_rejections.is_none());
    }

    /// With rejections but no detail messages, only the accepted/rejected counts are reported.
    #[test]
    fn test_finish_trace_failure_message_without_detail_messages() {
        let summary = Instance::finish_trace_failure_message(0, 2, vec![]);

        assert_eq!(
            summary.as_deref(),
            Some("Accepted 0 spans, rejected 2 spans")
        );
    }

    /// Recording a failure message bumps the failure counter for the given label by one.
    #[test]
    fn test_push_trace_failure_message_increments_labeled_counter() {
        let label = "retry_per_span_counter_test";
        // Re-resolve the labeled counter on every read, exactly as a fresh lookup would.
        let failures = || OTLP_TRACES_FAILURE_COUNT.with_label_values(&[label]).get();
        let before = failures();
        let mut recorded = Vec::new();

        Instance::push_trace_failure_message(
            &mut recorded,
            label,
            "Chunk fallback triggered by InvalidArguments".to_string(),
        );

        assert_eq!(recorded.len(), 1);
        assert_eq!(failures(), before + 1);
    }

    /// Pushing five messages keeps only the first four.
    #[test]
    fn test_push_trace_failure_message_caps_recorded_messages() {
        let label = "retry_per_span_limit_test";
        let mut recorded = Vec::new();

        (0..5).for_each(|idx| {
            Instance::push_trace_failure_message(&mut recorded, label, format!("failure-{idx}"));
        });

        assert_eq!(recorded.len(), 4);
        assert_eq!(
            recorded,
            ["failure-0", "failure-1", "failure-2", "failure-3"]
        );
    }

    /// `StatusCode::Unknown` is classified as discarding the chunk.
    #[test]
    fn test_classify_trace_chunk_failure_defaults_to_discard() {
        let reaction = Instance::classify_trace_chunk_failure(StatusCode::Unknown);

        assert_eq!(reaction, ChunkFailureReaction::DiscardChunk);
    }

    /// Wrapping an alter failure must not change the status code reported by the error.
    #[test]
    fn test_wrap_trace_alter_failure_preserves_status_code() {
        let source = servers::error::TableNotFoundSnafu {
            catalog: "greptime".to_string(),
            schema: "public".to_string(),
            table: "trace_type_missing".to_string(),
        }
        .build();

        let wrapped = wrap_trace_alter_failure(source);

        assert_eq!(wrapped.status_code(), StatusCode::TableNotFound);
    }
}