Skip to main content

metric_engine/
row_modifier.rs

1// Copyright 2023 Greptime Team
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7//     http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15use std::collections::{BTreeMap, HashMap};
16use std::hash::Hasher;
17
18use api::v1::value::ValueData;
19use api::v1::{ColumnDataType, ColumnSchema, Row, Rows, SemanticType, Value};
20use datatypes::value::ValueRef;
21use fxhash::FxHasher;
22use mito_codec::row_converter::SparsePrimaryKeyCodec;
23use smallvec::SmallVec;
24use snafu::ResultExt;
25use store_api::codec::PrimaryKeyEncoding;
26use store_api::metadata::ColumnMetadata;
27use store_api::metric_engine_consts::{
28    DATA_SCHEMA_TABLE_ID_COLUMN_NAME, DATA_SCHEMA_TSID_COLUMN_NAME,
29};
30use store_api::storage::consts::{PRIMARY_KEY_COLUMN_NAME, ReservedColumnId};
31use store_api::storage::{ColumnId, TableId};
32
33use crate::error::{EncodePrimaryKeySnafu, Result, TableIdCountMismatchSnafu};
34
35/// A row modifier modifies [`Rows`].
36///
37/// - For [`PrimaryKeyEncoding::Sparse`] encoding,
38///   it replaces the primary key columns with the encoded primary key column(`__primary_key`).
39///
40/// - For [`PrimaryKeyEncoding::Dense`] encoding,
41///   it adds two columns(`__table_id`, `__tsid`) to the row.
42pub struct RowModifier {
43    codec: SparsePrimaryKeyCodec,
44}
45
46/// Table id input for row modification.
47#[derive(Clone, Copy)]
48pub(crate) enum TableIdInput<'a> {
49    Single(TableId),
50    Batch(&'a [TableId]),
51}
52
53impl<'a> TableIdInput<'a> {
54    fn table_id_for_row(&self, row_idx: usize) -> TableId {
55        match self {
56            TableIdInput::Single(table_id) => *table_id,
57            TableIdInput::Batch(table_ids) => table_ids[row_idx],
58        }
59    }
60}
61
62impl Default for RowModifier {
63    fn default() -> Self {
64        Self {
65            codec: SparsePrimaryKeyCodec::schemaless(),
66        }
67    }
68}
69
70impl RowModifier {
71    /// Modify rows with the given primary key encoding and table ids.
72    pub(crate) fn modify_rows(
73        &self,
74        iter: RowsIter,
75        table_ids: TableIdInput<'_>,
76        encoding: PrimaryKeyEncoding,
77    ) -> Result<Rows> {
78        let row_count = iter.rows.rows.len();
79        Self::validate_table_id_count(table_ids, row_count)?;
80        match encoding {
81            PrimaryKeyEncoding::Sparse => self.modify_rows_sparse(iter, table_ids),
82            PrimaryKeyEncoding::Dense => self.modify_rows_dense(iter, table_ids),
83        }
84    }
85
86    /// Modifies rows with sparse primary key encoding.
87    /// It replaces the primary key columns with the encoded primary key column(`__primary_key`).
88    fn modify_rows_sparse(&self, mut iter: RowsIter, table_ids: TableIdInput<'_>) -> Result<Rows> {
89        let num_column = iter.rows.schema.len();
90        let num_primary_key_column = iter.index.num_primary_key_column;
91        // num_output_column = remaining columns(fields columns + timestamp column) + 1 (encoded primary key column)
92        let num_output_column = num_column - num_primary_key_column + 1;
93
94        let mut buffer = vec![];
95
96        for (row_index, mut row_iter) in iter.iter_mut().enumerate() {
97            let table_id = table_ids.table_id_for_row(row_index);
98            let (table_id_value, tsid) = Self::fill_internal_columns(table_id, &row_iter);
99            let mut values = Vec::with_capacity(num_output_column);
100            buffer.clear();
101            let internal_columns = [
102                (
103                    ReservedColumnId::table_id(),
104                    api::helper::pb_value_to_value_ref(&table_id_value, None),
105                ),
106                (
107                    ReservedColumnId::tsid(),
108                    api::helper::pb_value_to_value_ref(&tsid, None),
109                ),
110            ];
111            self.codec
112                .encode_to_vec(internal_columns.into_iter(), &mut buffer)
113                .context(EncodePrimaryKeySnafu)?;
114            self.codec
115                .encode_to_vec(row_iter.primary_keys(), &mut buffer)
116                .context(EncodePrimaryKeySnafu)?;
117
118            values.push(ValueData::BinaryValue(buffer.clone()).into());
119            values.extend(row_iter.remaining());
120            // Replace the row with the encoded row
121            *row_iter.row = Row { values };
122        }
123
124        // Update the schema
125        let mut schema = Vec::with_capacity(num_output_column);
126        schema.push(ColumnSchema {
127            column_name: PRIMARY_KEY_COLUMN_NAME.to_string(),
128            datatype: ColumnDataType::Binary as i32,
129            semantic_type: SemanticType::Tag as _,
130            datatype_extension: None,
131            options: None,
132        });
133        schema.extend(iter.remaining_columns());
134        iter.rows.schema = schema;
135
136        Ok(iter.rows)
137    }
138
139    /// Modifies rows with dense primary key encoding.
140    /// It adds two columns(`__table_id`, `__tsid`) to the row.
141    fn modify_rows_dense(&self, mut iter: RowsIter, table_ids: TableIdInput<'_>) -> Result<Rows> {
142        // add table_name column
143        iter.rows.schema.push(ColumnSchema {
144            column_name: DATA_SCHEMA_TABLE_ID_COLUMN_NAME.to_string(),
145            datatype: ColumnDataType::Uint32 as i32,
146            semantic_type: SemanticType::Tag as _,
147            datatype_extension: None,
148            options: None,
149        });
150        // add tsid column
151        iter.rows.schema.push(ColumnSchema {
152            column_name: DATA_SCHEMA_TSID_COLUMN_NAME.to_string(),
153            datatype: ColumnDataType::Uint64 as i32,
154            semantic_type: SemanticType::Tag as _,
155            datatype_extension: None,
156            options: None,
157        });
158        for (row_index, row_iter) in iter.iter_mut().enumerate() {
159            let table_id = table_ids.table_id_for_row(row_index);
160            let (table_id_value, tsid) = Self::fill_internal_columns(table_id, &row_iter);
161            row_iter.row.values.push(table_id_value);
162            row_iter.row.values.push(tsid);
163        }
164
165        Ok(iter.rows)
166    }
167
168    fn validate_table_id_count(table_ids: TableIdInput<'_>, row_count: usize) -> Result<()> {
169        if let TableIdInput::Batch(table_ids) = table_ids
170            && table_ids.len() != row_count
171        {
172            return TableIdCountMismatchSnafu {
173                expected: row_count,
174                actual: table_ids.len(),
175            }
176            .fail();
177        }
178        Ok(())
179    }
180
181    /// Fills internal columns of a row with table name and a hash of tag values.
182    pub fn fill_internal_columns(table_id: TableId, iter: &RowIter<'_>) -> (Value, Value) {
183        let ts_id = if !iter.has_null_labels() {
184            // No null labels in row, we can safely reuse the precomputed label name hash.
185            let mut ts_id_gen = TsidGenerator::new(iter.index.label_name_hash);
186            for (_, value) in iter.primary_keys_with_name() {
187                // The type is checked before. So only null is ignored.
188                if let Some(ValueData::StringValue(string)) = &value.value_data {
189                    ts_id_gen.write_str(string);
190                } else {
191                    unreachable!(
192                        "Should not contain null or non-string value: {:?}, table id: {}",
193                        value, table_id
194                    );
195                }
196            }
197            ts_id_gen.finish()
198        } else {
199            // Slow path: row contains null, recompute label hash
200            let mut hasher = TsidGenerator::default();
201            // 1. Find out label names with non-null values and get the hash.
202            for (name, value) in iter.primary_keys_with_name() {
203                // The type is checked before. So only null is ignored.
204                if let Some(ValueData::StringValue(_)) = &value.value_data {
205                    hasher.write_str(name);
206                }
207            }
208            let label_name_hash = hasher.finish();
209
210            // 2. Use label name hash as seed and continue with label values.
211            let mut final_hasher = TsidGenerator::new(label_name_hash);
212            for (_, value) in iter.primary_keys_with_name() {
213                if let Some(ValueData::StringValue(value)) = &value.value_data {
214                    final_hasher.write_str(value);
215                }
216            }
217            final_hasher.finish()
218        };
219
220        (
221            ValueData::U32Value(table_id).into(),
222            ValueData::U64Value(ts_id).into(),
223        )
224    }
225}
226
227/// Tsid generator.
228#[derive(Default)]
229pub struct TsidGenerator {
230    hasher: FxHasher,
231}
232
233impl TsidGenerator {
234    pub fn new(label_name_hash: u64) -> Self {
235        let mut hasher = FxHasher::default();
236        hasher.write_u64(label_name_hash);
237        Self { hasher }
238    }
239
240    /// Writes a label pair to the generator.
241    pub fn write_str(&mut self, value: &str) {
242        self.hasher.write(value.as_bytes());
243        self.hasher.write_u8(0xff);
244    }
245
246    /// Generates a new TSID.
247    pub fn finish(&mut self) -> u64 {
248        self.hasher.finish()
249    }
250}
251
252/// Index of a value.
253#[derive(Debug, Clone, Copy)]
254struct ValueIndex {
255    column_id: ColumnId,
256    index: usize,
257}
258
259/// Index of a row.
260struct IterIndex {
261    indices: Vec<ValueIndex>,
262    num_primary_key_column: usize,
263    /// Precomputed hash for label names.
264    label_name_hash: u64,
265}
266
267impl IterIndex {
268    fn new(
269        row_schema: &[ColumnSchema],
270        physical_columns: &HashMap<String, ColumnMetadata>,
271    ) -> Self {
272        let mut reserved_indices = SmallVec::<[ValueIndex; 2]>::new();
273        // Uses BTreeMap to keep the primary key column name order (lexicographical)
274        let mut primary_key_indices = BTreeMap::new();
275        let mut field_indices = SmallVec::<[ValueIndex; 1]>::new();
276        let mut ts_index = None;
277        for (idx, col) in row_schema.iter().enumerate() {
278            match col.semantic_type() {
279                SemanticType::Tag => match col.column_name.as_str() {
280                    DATA_SCHEMA_TABLE_ID_COLUMN_NAME => {
281                        reserved_indices.push(ValueIndex {
282                            column_id: ReservedColumnId::table_id(),
283                            index: idx,
284                        });
285                    }
286                    DATA_SCHEMA_TSID_COLUMN_NAME => {
287                        reserved_indices.push(ValueIndex {
288                            column_id: ReservedColumnId::tsid(),
289                            index: idx,
290                        });
291                    }
292                    _ => {
293                        // Inserts primary key column name follower the column name order (lexicographical)
294                        primary_key_indices.insert(
295                            col.column_name.as_str(),
296                            ValueIndex {
297                                column_id: physical_columns
298                                    .get(&col.column_name)
299                                    .unwrap()
300                                    .column_id,
301                                index: idx,
302                            },
303                        );
304                    }
305                },
306                SemanticType::Field => {
307                    field_indices.push(ValueIndex {
308                        column_id: physical_columns.get(&col.column_name).unwrap().column_id,
309                        index: idx,
310                    });
311                }
312                SemanticType::Timestamp => {
313                    ts_index = Some(ValueIndex {
314                        column_id: physical_columns.get(&col.column_name).unwrap().column_id,
315                        index: idx,
316                    });
317                }
318            }
319        }
320        let num_primary_key_column = primary_key_indices.len() + reserved_indices.len();
321        let mut indices = Vec::with_capacity(num_primary_key_column + 2);
322        indices.extend(reserved_indices);
323        let mut label_name_hasher = TsidGenerator::default();
324        for (pk_name, pk_index) in primary_key_indices {
325            // primary_key_indices already sorted.
326            label_name_hasher.write_str(pk_name);
327            indices.push(pk_index);
328        }
329        let label_name_hash = label_name_hasher.finish();
330
331        indices.extend(ts_index);
332        indices.extend(field_indices);
333        IterIndex {
334            indices,
335            num_primary_key_column,
336            label_name_hash,
337        }
338    }
339}
340
341/// Iterator of rows.
342pub struct RowsIter {
343    rows: Rows,
344    index: IterIndex,
345}
346
347impl RowsIter {
348    pub fn new(rows: Rows, physical_columns: &HashMap<String, ColumnMetadata>) -> Self {
349        let index: IterIndex = IterIndex::new(&rows.schema, physical_columns);
350        Self { rows, index }
351    }
352
353    /// Returns the iterator of rows.
354    pub fn iter_mut(&mut self) -> impl Iterator<Item = RowIter<'_>> {
355        self.rows.rows.iter_mut().map(|row| RowIter {
356            row,
357            index: &self.index,
358            schema: &self.rows.schema,
359        })
360    }
361
362    /// Returns the remaining columns.
363    fn remaining_columns(&mut self) -> impl Iterator<Item = ColumnSchema> + '_ {
364        self.index.indices[self.index.num_primary_key_column..]
365            .iter()
366            .map(|idx| std::mem::take(&mut self.rows.schema[idx.index]))
367    }
368}
369
370/// Iterator of a row.
371pub struct RowIter<'a> {
372    row: &'a mut Row,
373    index: &'a IterIndex,
374    schema: &'a Vec<ColumnSchema>,
375}
376
377impl RowIter<'_> {
378    /// Returns the primary keys with their names.
379    fn primary_keys_with_name(&self) -> impl Iterator<Item = (&String, &Value)> {
380        self.index.indices[..self.index.num_primary_key_column]
381            .iter()
382            .map(|idx| {
383                (
384                    &self.schema[idx.index].column_name,
385                    &self.row.values[idx.index],
386                )
387            })
388    }
389
390    /// Returns true if any label in current row is null.
391    fn has_null_labels(&self) -> bool {
392        self.index.indices[..self.index.num_primary_key_column]
393            .iter()
394            .any(|idx| self.row.values[idx.index].value_data.is_none())
395    }
396
397    /// Returns the primary keys.
398    pub fn primary_keys(&self) -> impl Iterator<Item = (ColumnId, ValueRef<'_>)> {
399        self.index.indices[..self.index.num_primary_key_column]
400            .iter()
401            .map(|idx| {
402                (
403                    idx.column_id,
404                    api::helper::pb_value_to_value_ref(
405                        &self.row.values[idx.index],
406                        self.schema[idx.index].datatype_extension.as_ref(),
407                    ),
408                )
409            })
410    }
411
412    /// Returns the remaining columns.
413    fn remaining(&mut self) -> impl Iterator<Item = Value> + '_ {
414        self.index.indices[self.index.num_primary_key_column..]
415            .iter()
416            .map(|idx| std::mem::take(&mut self.row.values[idx.index]))
417    }
418
419    /// Returns value at given offset.
420    /// # Panics
421    /// Panics if offset out-of-bound
422    pub fn value_at(&self, idx: usize) -> &Value {
423        &self.row.values[idx]
424    }
425}
426
427#[cfg(test)]
428mod tests {
429    use std::collections::HashMap;
430
431    use api::v1::{Row, Rows};
432    use store_api::codec::PrimaryKeyEncoding;
433
434    use super::*;
435    use crate::error::Error;
436
437    fn test_schema() -> Vec<ColumnSchema> {
438        vec![
439            ColumnSchema {
440                column_name: "namespace".to_string(),
441                datatype: ColumnDataType::String as i32,
442                semantic_type: SemanticType::Tag as _,
443                datatype_extension: None,
444                options: None,
445            },
446            ColumnSchema {
447                column_name: "host".to_string(),
448                datatype: ColumnDataType::String as i32,
449                semantic_type: SemanticType::Tag as _,
450                datatype_extension: None,
451                options: None,
452            },
453        ]
454    }
455
456    fn test_row(v1: &str, v2: &str) -> Row {
457        Row {
458            values: vec![
459                ValueData::StringValue(v1.to_string()).into(),
460                ValueData::StringValue(v2.to_string()).into(),
461            ],
462        }
463    }
464
465    fn make_info(name: &str, column_id: ColumnId) -> ColumnMetadata {
466        ColumnMetadata {
467            column_schema: datatypes::schema::ColumnSchema::new(
468                name.to_string(),
469                datatypes::prelude::ConcreteDataType::string_datatype(),
470                false,
471            ),
472            semantic_type: SemanticType::Tag,
473            column_id,
474        }
475    }
476
477    fn test_name_to_column_id() -> HashMap<String, ColumnMetadata> {
478        HashMap::from([
479            ("namespace".to_string(), make_info("namespace", 1)),
480            ("host".to_string(), make_info("host", 2)),
481        ])
482    }
483
#[test]
fn test_encode_sparse() {
    let physical_columns = test_name_to_column_id();
    let rows = Rows {
        schema: test_schema(),
        rows: vec![test_row("greptimedb", "127.0.0.1")],
    };

    let modified = RowModifier::default()
        .modify_rows(
            RowsIter::new(rows, &physical_columns),
            TableIdInput::Single(1025),
            PrimaryKeyEncoding::Sparse,
        )
        .unwrap();

    // Both tag columns collapse into the single encoded primary key column.
    let encoded_row = &modified.rows[0];
    assert_eq!(encoded_row.values.len(), 1);
    let expected_key = vec![
        128, 0, 0, 4, 1, 0, 0, 4, 1, 128, 0, 0, 3, 1, 37, 196, 242, 181, 117, 224, 7, 137, 0,
        0, 0, 2, 1, 1, 49, 50, 55, 46, 48, 46, 48, 46, 9, 49, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
        1, 1, 1, 103, 114, 101, 112, 116, 105, 109, 101, 9, 100, 98, 0, 0, 0, 0, 0, 0, 2,
    ];
    assert_eq!(
        encoded_row.values[0],
        ValueData::BinaryValue(expected_key).into()
    );
    assert_eq!(modified.schema, expected_sparse_schema());
}
515
516    fn expected_sparse_schema() -> Vec<ColumnSchema> {
517        vec![ColumnSchema {
518            column_name: PRIMARY_KEY_COLUMN_NAME.to_string(),
519            datatype: ColumnDataType::Binary as i32,
520            semantic_type: SemanticType::Tag as _,
521            datatype_extension: None,
522            options: None,
523        }]
524    }
525
526    fn expected_dense_schema() -> Vec<ColumnSchema> {
527        vec![
528            ColumnSchema {
529                column_name: "namespace".to_string(),
530                datatype: ColumnDataType::String as i32,
531                semantic_type: SemanticType::Tag as _,
532                datatype_extension: None,
533                options: None,
534            },
535            ColumnSchema {
536                column_name: "host".to_string(),
537                datatype: ColumnDataType::String as i32,
538                semantic_type: SemanticType::Tag as _,
539                datatype_extension: None,
540                options: None,
541            },
542            ColumnSchema {
543                column_name: DATA_SCHEMA_TABLE_ID_COLUMN_NAME.to_string(),
544                datatype: ColumnDataType::Uint32 as i32,
545                semantic_type: SemanticType::Tag as _,
546                datatype_extension: None,
547                options: None,
548            },
549            ColumnSchema {
550                column_name: DATA_SCHEMA_TSID_COLUMN_NAME.to_string(),
551                datatype: ColumnDataType::Uint64 as i32,
552                semantic_type: SemanticType::Tag as _,
553                datatype_extension: None,
554                options: None,
555            },
556        ]
557    }
558
#[test]
fn test_encode_dense() {
    let physical_columns = test_name_to_column_id();
    let rows = Rows {
        schema: test_schema(),
        rows: vec![test_row("greptimedb", "127.0.0.1")],
    };

    let modified = RowModifier::default()
        .modify_rows(
            RowsIter::new(rows, &physical_columns),
            TableIdInput::Single(1025),
            PrimaryKeyEncoding::Dense,
        )
        .unwrap();

    // Original tag values stay in place; table id and tsid are appended after them.
    let expected_values: [Value; 4] = [
        ValueData::StringValue("greptimedb".to_string()).into(),
        ValueData::StringValue("127.0.0.1".to_string()).into(),
        ValueData::U32Value(1025).into(),
        ValueData::U64Value(2721566936019240841).into(),
    ];
    for (position, expected) in expected_values.into_iter().enumerate() {
        assert_eq!(modified.rows[0].values[position], expected);
    }
    assert_eq!(modified.schema, expected_dense_schema());
}
593
#[test]
fn test_table_id_count_mismatch() {
    let physical_columns = test_name_to_column_id();
    // Two rows but only one table id in the batch.
    let rows = Rows {
        schema: test_schema(),
        rows: vec![test_row("a", "b"), test_row("c", "d")],
    };
    let table_ids = [1025];

    let outcome = RowModifier::default().modify_rows(
        RowsIter::new(rows, &physical_columns),
        TableIdInput::Batch(&table_ids),
        PrimaryKeyEncoding::Dense,
    );

    let err = outcome.unwrap_err();
    assert!(matches!(
        err,
        Error::TableIdCountMismatch {
            expected: 2,
            actual: 1,
            ..
        }
    ));
}
621
#[test]
fn test_fill_internal_columns() {
    let physical_columns = test_name_to_column_id();
    let table_id = 1025;

    // Schema order: namespace, host.
    let rows = Rows {
        schema: test_schema(),
        rows: vec![test_row("greptimedb", "127.0.0.1")],
    };
    let mut rows_iter = RowsIter::new(rows, &physical_columns);
    let first_row = rows_iter.iter_mut().next().unwrap();
    let (encoded_table_id, tsid) = RowModifier::fill_internal_columns(table_id, &first_row);
    assert_eq!(encoded_table_id, ValueData::U32Value(1025).into());
    assert_eq!(tsid, ValueData::U64Value(2721566936019240841).into());

    // Change the column order: host, namespace. The tsid must not change.
    let reordered_schema: Vec<ColumnSchema> = ["host", "namespace"]
        .into_iter()
        .map(|name| ColumnSchema {
            column_name: name.to_string(),
            datatype: ColumnDataType::String as i32,
            semantic_type: SemanticType::Tag as _,
            datatype_extension: None,
            options: None,
        })
        .collect();
    let rows = Rows {
        schema: reordered_schema,
        rows: vec![test_row("127.0.0.1", "greptimedb")],
    };
    let mut rows_iter = RowsIter::new(rows, &physical_columns);
    let first_row = rows_iter.iter_mut().next().unwrap();
    let (encoded_table_id, tsid) = RowModifier::fill_internal_columns(table_id, &first_row);
    assert_eq!(encoded_table_id, ValueData::U32Value(1025).into());
    assert_eq!(tsid, ValueData::U64Value(2721566936019240841).into());
}
666
667    /// Helper function to create a schema with multiple label columns
668    fn create_multi_label_schema(labels: &[&str]) -> Vec<ColumnSchema> {
669        labels
670            .iter()
671            .map(|name| ColumnSchema {
672                column_name: name.to_string(),
673                datatype: ColumnDataType::String as i32,
674                semantic_type: SemanticType::Tag as _,
675                datatype_extension: None,
676                options: None,
677            })
678            .collect()
679    }
680
681    /// Helper function to create a name_to_column_id map
682    fn create_name_to_column_id(labels: &[&str]) -> HashMap<String, ColumnMetadata> {
683        labels
684            .iter()
685            .enumerate()
686            .map(|(idx, name)| (name.to_string(), make_info(name, idx as ColumnId + 1)))
687            .collect()
688    }
689
690    /// Helper function to create a row with string values
691    fn create_row_with_values(values: &[&str]) -> Row {
692        Row {
693            values: values
694                .iter()
695                .map(|v| ValueData::StringValue(v.to_string()).into())
696                .collect(),
697        }
698    }
699
700    /// Helper function to create a row with some null values
701    fn create_row_with_nulls(values: &[Option<&str>]) -> Row {
702        Row {
703            values: values
704                .iter()
705                .map(|v| {
706                    v.map(|s| ValueData::StringValue(s.to_string()).into())
707                        .unwrap_or(Value { value_data: None })
708                })
709                .collect(),
710        }
711    }
712
713    /// Helper function to extract TSID from a row
714    fn extract_tsid(
715        schema: Vec<ColumnSchema>,
716        row: Row,
717        name_to_column_id: &HashMap<String, ColumnMetadata>,
718        table_id: TableId,
719    ) -> u64 {
720        let rows = Rows {
721            schema,
722            rows: vec![row],
723        };
724        let mut rows_iter = RowsIter::new(rows, name_to_column_id);
725        let row_iter = rows_iter.iter_mut().next().unwrap();
726        let (_, tsid_value) = RowModifier::fill_internal_columns(table_id, &row_iter);
727        match tsid_value.value_data {
728            Some(ValueData::U64Value(tsid)) => tsid,
729            _ => panic!("Expected U64Value for TSID"),
730        }
731    }
732
#[test]
fn test_tsid_same_for_different_label_orders() {
    // The same label name-value pairs, presented in different column orders, must
    // hash to the same TSID.
    let table_id = 1025;

    // (label names in schema order, values in the same order)
    let orderings: [(&[&str], &[&str]); 3] = [
        (&["a", "b", "c"], &["A", "B", "C"]),
        (&["b", "a", "c"], &["B", "A", "C"]),
        (&["c", "b", "a"], &["C", "B", "A"]),
    ];
    let tsids: Vec<u64> = orderings
        .iter()
        .map(|(labels, values)| {
            let name_to_column_id = create_name_to_column_id(&["a", "b", "c"]);
            extract_tsid(
                create_multi_label_schema(labels),
                create_row_with_values(values),
                &name_to_column_id,
                table_id,
            )
        })
        .collect();

    // Label names are sorted lexicographically before hashing, so the schema order
    // must not matter.
    assert_eq!(
        tsids[0], tsids[1],
        "TSID should be same for different column orders"
    );
    assert_eq!(
        tsids[1], tsids[2],
        "TSID should be same for different column orders"
    );
}
768
#[test]
fn test_tsid_same_with_null_labels() {
    // Rows that differ only by a null label value must produce the same TSID.
    let table_id = 1025;

    // a=A, b=B (no nulls, fast path)
    let tsid_without_null = extract_tsid(
        create_multi_label_schema(&["a", "b"]),
        create_row_with_values(&["A", "B"]),
        &create_name_to_column_id(&["a", "b"]),
        table_id,
    );

    // a=A, b=B, c=null (has null, slow path)
    let tsid_with_null = extract_tsid(
        create_multi_label_schema(&["a", "b", "c"]),
        create_row_with_nulls(&[Some("A"), Some("B"), None]),
        &create_name_to_column_id(&["a", "b", "c"]),
        table_id,
    );

    // Null labels are ignored, so both rows describe the same series.
    assert_eq!(
        tsid_without_null, tsid_with_null,
        "TSID should be same when only difference is null label values"
    );
}
792
#[test]
fn test_tsid_same_with_multiple_null_labels() {
    // Same check as the single-null case, with two null labels.
    let table_id = 1025;

    // a=A, b=B (no nulls)
    let tsid_without_null = extract_tsid(
        create_multi_label_schema(&["a", "b"]),
        create_row_with_values(&["A", "B"]),
        &create_name_to_column_id(&["a", "b"]),
        table_id,
    );

    // a=A, b=B, c=null, d=null (multiple nulls)
    let tsid_with_nulls = extract_tsid(
        create_multi_label_schema(&["a", "b", "c", "d"]),
        create_row_with_nulls(&[Some("A"), Some("B"), None, None]),
        &create_name_to_column_id(&["a", "b", "c", "d"]),
        table_id,
    );

    assert_eq!(
        tsid_without_null, tsid_with_nulls,
        "TSID should be same when only difference is multiple null label values"
    );
}
815
#[test]
fn test_tsid_different_with_different_non_null_values() {
    // Rows whose non-null label values differ must produce different TSIDs.
    let table_id = 1025;
    let labels = ["a", "b"];

    // a=A, b=B
    let tsid_b = extract_tsid(
        create_multi_label_schema(&labels),
        create_row_with_values(&["A", "B"]),
        &create_name_to_column_id(&labels),
        table_id,
    );

    // a=A, b=C (different value for b)
    let tsid_c = extract_tsid(
        create_multi_label_schema(&labels),
        create_row_with_values(&["A", "C"]),
        &create_name_to_column_id(&labels),
        table_id,
    );

    assert_ne!(
        tsid_b, tsid_c,
        "TSID should be different when label values differ"
    );
}
838
839    #[test]
840    fn test_tsid_fast_path_vs_slow_path_consistency() {
841        // Test that fast path (no nulls) and slow path (with nulls) produce
842        // the same TSID for the same non-null label values
843        let table_id = 1025;
844
845        // Fast path: a=A, b=B (no nulls)
846        let schema_fast = create_multi_label_schema(&["a", "b"]);
847        let name_to_column_id_fast = create_name_to_column_id(&["a", "b"]);
848        let row_fast = create_row_with_values(&["A", "B"]);
849        let tsid_fast = extract_tsid(schema_fast, row_fast, &name_to_column_id_fast, table_id);
850
851        // Slow path: a=A, b=B, c=null (has null, triggers slow path)
852        let schema_slow = create_multi_label_schema(&["a", "b", "c"]);
853        let name_to_column_id_slow = create_name_to_column_id(&["a", "b", "c"]);
854        let row_slow = create_row_with_nulls(&[Some("A"), Some("B"), None]);
855        let tsid_slow = extract_tsid(schema_slow, row_slow, &name_to_column_id_slow, table_id);
856
857        assert_eq!(
858            tsid_fast, tsid_slow,
859            "Fast path and slow path should produce same TSID for same non-null values"
860        );
861    }
862
863    #[test]
864    fn test_tsid_with_null_in_middle() {
865        // Test with null in the middle of labels
866        let table_id = 1025;
867
868        // Row 1: a=A, b=B, c=C
869        let schema1 = create_multi_label_schema(&["a", "b", "c"]);
870        let name_to_column_id1 = create_name_to_column_id(&["a", "b", "c"]);
871        let row1 = create_row_with_values(&["A", "B", "C"]);
872        let tsid1 = extract_tsid(schema1, row1, &name_to_column_id1, table_id);
873
874        // Row 2: a=A, b=null, c=C (null in middle)
875        let schema2 = create_multi_label_schema(&["a", "b", "c"]);
876        let name_to_column_id2 = create_name_to_column_id(&["a", "b", "c"]);
877        let row2 = create_row_with_nulls(&[Some("A"), None, Some("C")]);
878        let tsid2 = extract_tsid(schema2, row2, &name_to_column_id2, table_id);
879
880        // Should be different because b is null in row2 but B in row1
881        // Actually wait, let me reconsider - if b is null, it should be ignored
882        // So row2 should be equivalent to a=A, c=C
883        // But row1 is a=A, b=B, c=C, so they should be different
884        assert_ne!(
885            tsid1, tsid2,
886            "TSID should be different when a non-null value becomes null"
887        );
888
889        // Row 3: a=A, c=C (no b at all, equivalent to row2)
890        let schema3 = create_multi_label_schema(&["a", "c"]);
891        let name_to_column_id3 = create_name_to_column_id(&["a", "c"]);
892        let row3 = create_row_with_values(&["A", "C"]);
893        let tsid3 = extract_tsid(schema3, row3, &name_to_column_id3, table_id);
894
895        // Row2 (a=A, b=null, c=C) should be same as row3 (a=A, c=C)
896        assert_eq!(
897            tsid2, tsid3,
898            "TSID should be same when null label is ignored"
899        );
900    }
901
902    #[test]
903    fn test_tsid_all_null_labels() {
904        // Test with all labels being null
905        let table_id = 1025;
906
907        // Row with all nulls
908        let schema = create_multi_label_schema(&["a", "b", "c"]);
909        let name_to_column_id = create_name_to_column_id(&["a", "b", "c"]);
910        let row = create_row_with_nulls(&[None, None, None]);
911        let tsid = extract_tsid(schema.clone(), row, &name_to_column_id, table_id);
912
913        // Should still produce a TSID (based on label names only when all values are null)
914        // This tests that the slow path handles the case where all values are null
915        // The TSID will be based on the label name hash only
916        // Test that it's consistent - same schema with all nulls should produce same TSID
917        let row2 = create_row_with_nulls(&[None, None, None]);
918        let tsid2 = extract_tsid(schema, row2, &name_to_column_id, table_id);
919        assert_eq!(
920            tsid, tsid2,
921            "TSID should be consistent when all label values are null"
922        );
923    }
924}