Skip to main content

metric_engine/
batch_modifier.rs

1// Copyright 2023 Greptime Team
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7//     http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15use std::hash::Hasher;
16use std::sync::Arc;
17
18use datatypes::arrow::array::{Array, BinaryBuilder, StringArray, UInt64Array};
19use datatypes::arrow::datatypes::{DataType, Field, Schema as ArrowSchema};
20use datatypes::arrow::record_batch::RecordBatch;
21use fxhash::FxHasher;
22use mito_codec::row_converter::SparsePrimaryKeyCodec;
23use snafu::ResultExt;
24use store_api::storage::ColumnId;
25use store_api::storage::consts::PRIMARY_KEY_COLUMN_NAME;
26
27use crate::error::{EncodePrimaryKeySnafu, Result, UnexpectedRequestSnafu};
28
/// Info about a tag column for TSID computation and sparse primary key encoding.
///
/// Instances are expected to arrive pre-sorted (see the `sorted_tag_columns`
/// parameters below); the sort order determines the hashing and encoding order.
#[allow(dead_code)]
pub struct TagColumnInfo {
    /// Column name (used for label-name hash).
    pub name: String,
    /// Column index in the RecordBatch.
    pub index: usize,
    /// Column ID in the physical region (paired with each tag value in the
    /// sparse primary key encoding).
    pub column_id: ColumnId,
}
39
40/// Computes the TSID for each row in a [RecordBatch].
41///
42/// The TSID is a stable hash of the set of labels (tags) present in each row.
43/// It accounts for both the names and values of all non-null tag columns.
44///
45/// # Logic
46/// - If a row has no nulls across all `sorted_tag_columns`, it uses a precomputed hash of all label names.
47/// - If a row has nulls, it dynamically computes a hash of the names of labels that are present (non-null).
48/// - In both cases, it then hashes the values of those present labels in the order specified by `sorted_tag_columns`.
49pub fn compute_tsid_array(
50    batch: &RecordBatch,
51    sorted_tag_columns: &[TagColumnInfo],
52    tag_arrays: &[&StringArray],
53) -> UInt64Array {
54    let num_rows = batch.num_rows();
55
56    let label_name_hash = {
57        let mut hasher = FxHasher::default();
58        for tag_col in sorted_tag_columns {
59            hasher.write(tag_col.name.as_bytes());
60            hasher.write_u8(0xff);
61        }
62        hasher.finish()
63    };
64
65    let mut tsid_values = Vec::with_capacity(num_rows);
66    for row in 0..num_rows {
67        let has_null = tag_arrays.iter().any(|arr| arr.is_null(row));
68
69        let tsid = if !has_null {
70            let mut hasher = FxHasher::default();
71            hasher.write_u64(label_name_hash);
72            for arr in tag_arrays {
73                hasher.write(arr.value(row).as_bytes());
74                hasher.write_u8(0xff);
75            }
76            hasher.finish()
77        } else {
78            let mut name_hasher = FxHasher::default();
79            for (tc, arr) in sorted_tag_columns.iter().zip(tag_arrays.iter()) {
80                if !arr.is_null(row) {
81                    name_hasher.write(tc.name.as_bytes());
82                    name_hasher.write_u8(0xff);
83                }
84            }
85            let row_label_hash = name_hasher.finish();
86
87            let mut val_hasher = FxHasher::default();
88            val_hasher.write_u64(row_label_hash);
89            for arr in tag_arrays {
90                if !arr.is_null(row) {
91                    val_hasher.write(arr.value(row).as_bytes());
92                    val_hasher.write_u8(0xff);
93                }
94            }
95            val_hasher.finish()
96        };
97
98        tsid_values.push(tsid);
99    }
100
101    UInt64Array::from(tsid_values)
102}
103
104fn build_tag_arrays<'a>(
105    batch: &'a RecordBatch,
106    sorted_tag_columns: &[TagColumnInfo],
107) -> Vec<&'a StringArray> {
108    sorted_tag_columns
109        .iter()
110        .map(|tc| {
111            batch
112                .column(tc.index)
113                .as_any()
114                .downcast_ref::<StringArray>()
115                .expect("tag column must be utf8")
116        })
117        .collect()
118}
119
120/// Modifies a [RecordBatch] to include a sparse primary key column.
121///
122/// This function transforms the input `batch` into a new `RecordBatch` where the first column
123/// is the generated primary key (named [PRIMARY_KEY_COLUMN_NAME]), followed by columns
124/// indicated by `extra_column_indices`.
125///
126/// The primary key uses a "sparse" encoding, which compactly represents the row's identity
127/// by only including non-null tag values. The encoding, handled by [SparsePrimaryKeyCodec],
128/// consists of:
129/// 1. The `table_id`.
130/// 2. A `tsid` (Time Series ID), which is a hash of the present tags.
131/// 3. The actual non-null tag values paired with their `column_id`.
132///
133/// # Parameters
134/// - `batch`: The source [RecordBatch].
135/// - `table_id`: The ID of the table.
136/// - `sorted_tag_columns`: Metadata for tag columns, used for both TSID computation and PK encoding.
137/// - `extra_column_indices`: Indices of columns from the original batch to keep in the output
138///   (typically the timestamp and value fields).
139pub fn modify_batch_sparse(
140    batch: RecordBatch,
141    table_id: u32,
142    sorted_tag_columns: &[TagColumnInfo],
143    extra_column_indices: &[usize],
144) -> Result<RecordBatch> {
145    let num_rows = batch.num_rows();
146    let codec = SparsePrimaryKeyCodec::schemaless();
147    let tag_arrays: Vec<&StringArray> = build_tag_arrays(&batch, sorted_tag_columns);
148    let tsid_array = compute_tsid_array(&batch, sorted_tag_columns, &tag_arrays);
149
150    let mut pk_builder = BinaryBuilder::with_capacity(num_rows, 0);
151    let mut buffer = Vec::new();
152    for row in 0..num_rows {
153        buffer.clear();
154        codec
155            .encode_internal(table_id, tsid_array.value(row), &mut buffer)
156            .context(EncodePrimaryKeySnafu)?;
157
158        let tags = sorted_tag_columns
159            .iter()
160            .zip(tag_arrays.iter())
161            .filter(|(_, arr)| !arr.is_null(row))
162            .map(|(tc, arr)| (tc.column_id, arr.value(row).as_bytes()));
163        codec
164            .encode_raw_tag_value(tags, &mut buffer)
165            .context(EncodePrimaryKeySnafu)?;
166
167        pk_builder.append_value(&buffer);
168    }
169
170    let pk_array = pk_builder.finish();
171
172    let mut fields = vec![Arc::new(Field::new(
173        PRIMARY_KEY_COLUMN_NAME,
174        DataType::Binary,
175        false,
176    ))];
177    let mut columns: Vec<Arc<dyn Array>> = vec![Arc::new(pk_array)];
178
179    for &idx in extra_column_indices {
180        fields.push(batch.schema().fields()[idx].clone());
181        columns.push(batch.column(idx).clone());
182    }
183
184    let new_schema = Arc::new(ArrowSchema::new(fields));
185    RecordBatch::try_new(new_schema, columns).map_err(|e| {
186        UnexpectedRequestSnafu {
187            reason: format!("Failed to build modified sparse RecordBatch: {e}"),
188        }
189        .build()
190    })
191}
192
#[cfg(test)]
mod tests {
    use std::collections::HashMap;
    use std::sync::Arc;

    use api::v1::value::ValueData;
    use api::v1::{ColumnDataType, ColumnSchema, Row, Rows, SemanticType, Value};
    use datatypes::arrow::array::{BinaryArray, Int64Array, StringArray};
    use datatypes::arrow::datatypes::{DataType, Field, Schema as ArrowSchema};
    use datatypes::arrow::record_batch::RecordBatch;
    use store_api::codec::PrimaryKeyEncoding;
    use store_api::metadata::ColumnMetadata;
    use store_api::storage::consts::PRIMARY_KEY_COLUMN_NAME;

    use super::*;
    use crate::row_modifier::{RowModifier, RowsIter, TableIdInput};

    /// Single-row batch: timestamp + value + two tag columns (namespace, host).
    fn build_sparse_test_batch() -> RecordBatch {
        let schema = Arc::new(ArrowSchema::new(vec![
            Field::new("greptime_timestamp", DataType::Int64, false),
            Field::new("greptime_value", DataType::Float64, true),
            Field::new("namespace", DataType::Utf8, true),
            Field::new("host", DataType::Utf8, true),
        ]));
        RecordBatch::try_new(
            schema,
            vec![
                Arc::new(Int64Array::from(vec![1000])),
                Arc::new(datatypes::arrow::array::Float64Array::from(vec![42.0])),
                Arc::new(StringArray::from(vec!["greptimedb"])),
                Arc::new(StringArray::from(vec!["127.0.0.1"])),
            ],
        )
        .unwrap()
    }

    /// Tag metadata matching [build_sparse_test_batch], sorted by name
    /// ("host" before "namespace"), not by batch column index.
    fn sparse_tag_columns() -> Vec<TagColumnInfo> {
        vec![
            TagColumnInfo {
                name: "host".to_string(),
                index: 3,
                column_id: 3,
            },
            TagColumnInfo {
                name: "namespace".to_string(),
                index: 2,
                column_id: 2,
            },
        ]
    }

    /// Pins the TSID for a fixed two-tag row to a golden value, guarding the
    /// stability of the hash (changing the hash silently re-keys all series).
    #[test]
    fn test_compute_tsid_basic() {
        let schema = Arc::new(ArrowSchema::new(vec![
            Field::new("namespace", DataType::Utf8, true),
            Field::new("host", DataType::Utf8, true),
        ]));
        let batch = RecordBatch::try_new(
            schema,
            vec![
                Arc::new(StringArray::from(vec!["greptimedb"])),
                Arc::new(StringArray::from(vec!["127.0.0.1"])),
            ],
        )
        .unwrap();

        let tag_columns: Vec<TagColumnInfo> = vec![
            TagColumnInfo {
                name: "host".to_string(),
                index: 1,
                column_id: 2,
            },
            TagColumnInfo {
                name: "namespace".to_string(),
                index: 0,
                column_id: 1,
            },
        ];
        let tag_arrays = build_tag_arrays(&batch, &tag_columns);
        let tsid_array = compute_tsid_array(&batch, &tag_columns, &tag_arrays);

        // Golden value; must not change across refactors.
        assert_eq!(tsid_array.value(0), 2721566936019240841);
    }

    /// A row whose extra tag column is null must hash identically to a row
    /// from a schema that never had that column (null == absent label).
    #[test]
    fn test_compute_tsid_with_nulls() {
        let schema = Arc::new(ArrowSchema::new(vec![
            Field::new("a", DataType::Utf8, true),
            Field::new("b", DataType::Utf8, true),
        ]));
        let batch_no_null = RecordBatch::try_new(
            schema.clone(),
            vec![
                Arc::new(StringArray::from(vec!["A"])),
                Arc::new(StringArray::from(vec!["B"])),
            ],
        )
        .unwrap();
        let tag_cols_2: Vec<TagColumnInfo> = vec![
            TagColumnInfo {
                name: "a".to_string(),
                index: 0,
                column_id: 1,
            },
            TagColumnInfo {
                name: "b".to_string(),
                index: 1,
                column_id: 2,
            },
        ];
        let tag_arrays_2 = build_tag_arrays(&batch_no_null, &tag_cols_2);
        let tsid_no_null = compute_tsid_array(&batch_no_null, &tag_cols_2, &tag_arrays_2);

        // Same labels a/b plus an all-null third column c.
        let schema3 = Arc::new(ArrowSchema::new(vec![
            Field::new("a", DataType::Utf8, true),
            Field::new("b", DataType::Utf8, true),
            Field::new("c", DataType::Utf8, true),
        ]));
        let batch_with_null = RecordBatch::try_new(
            schema3,
            vec![
                Arc::new(StringArray::from(vec!["A"])),
                Arc::new(StringArray::from(vec!["B"])),
                Arc::new(StringArray::from(vec![None as Option<&str>])),
            ],
        )
        .unwrap();
        let tag_cols_3: Vec<TagColumnInfo> = vec![
            TagColumnInfo {
                name: "a".to_string(),
                index: 0,
                column_id: 1,
            },
            TagColumnInfo {
                name: "b".to_string(),
                index: 1,
                column_id: 2,
            },
            TagColumnInfo {
                name: "c".to_string(),
                index: 2,
                column_id: 3,
            },
        ];
        let tag_arrays_3 = build_tag_arrays(&batch_with_null, &tag_cols_3);
        let tsid_with_null = compute_tsid_array(&batch_with_null, &tag_cols_3, &tag_arrays_3);

        assert_eq!(tsid_no_null.value(0), tsid_with_null.value(0));
    }

    /// Checks the output shape: __primary_key first, then the passthrough
    /// columns in the order given by `extra_column_indices`.
    #[test]
    fn test_modify_batch_sparse() {
        let batch = build_sparse_test_batch();
        let tag_columns = sparse_tag_columns();
        let non_tag_indices = vec![0, 1];
        let table_id: u32 = 1025;

        let modified =
            modify_batch_sparse(batch, table_id, &tag_columns, &non_tag_indices).unwrap();

        assert_eq!(modified.num_columns(), 3);
        assert_eq!(modified.schema().field(0).name(), PRIMARY_KEY_COLUMN_NAME);
        assert_eq!(modified.schema().field(1).name(), "greptime_timestamp");
        assert_eq!(modified.schema().field(2).name(), "greptime_value");
    }

    /// Cross-checks the batch-level encoder against the row-level
    /// RowModifier sparse encoding: both paths must produce byte-identical
    /// primary keys for the same logical row.
    #[test]
    fn test_modify_batch_sparse_matches_row_modifier() {
        let batch = build_sparse_test_batch();
        let tag_columns = sparse_tag_columns();
        let non_tag_indices = vec![0, 1];
        let table_id: u32 = 1025;
        let modified =
            modify_batch_sparse(batch, table_id, &tag_columns, &non_tag_indices).unwrap();

        // Column metadata lookup used by RowsIter; column_ids mirror
        // sparse_tag_columns (namespace=2, host=3).
        let make_info = |name: &str, column_id: ColumnId| ColumnMetadata {
            column_schema: datatypes::schema::ColumnSchema::new(
                name.to_string(),
                datatypes::prelude::ConcreteDataType::string_datatype(),
                false,
            ),
            semantic_type: SemanticType::Tag,
            column_id,
        };
        let name_to_column_id: HashMap<String, ColumnMetadata> = [
            (
                "greptime_timestamp".to_string(),
                make_info("greptime_timestamp", 0),
            ),
            ("greptime_value".to_string(), make_info("greptime_value", 1)),
            ("namespace".to_string(), make_info("namespace", 2)),
            ("host".to_string(), make_info("host", 3)),
        ]
        .into_iter()
        .collect();

        // The same logical row as build_sparse_test_batch, in gRPC Rows form.
        let rows = Rows {
            schema: vec![
                ColumnSchema {
                    column_name: "greptime_timestamp".to_string(),
                    datatype: ColumnDataType::TimestampMillisecond as i32,
                    semantic_type: SemanticType::Timestamp as i32,
                    ..Default::default()
                },
                ColumnSchema {
                    column_name: "greptime_value".to_string(),
                    datatype: ColumnDataType::Float64 as i32,
                    semantic_type: SemanticType::Field as i32,
                    ..Default::default()
                },
                ColumnSchema {
                    column_name: "namespace".to_string(),
                    datatype: ColumnDataType::String as i32,
                    semantic_type: SemanticType::Tag as i32,
                    ..Default::default()
                },
                ColumnSchema {
                    column_name: "host".to_string(),
                    datatype: ColumnDataType::String as i32,
                    semantic_type: SemanticType::Tag as i32,
                    ..Default::default()
                },
            ],
            rows: vec![Row {
                values: vec![
                    Value {
                        value_data: Some(ValueData::TimestampMillisecondValue(1000)),
                    },
                    Value {
                        value_data: Some(ValueData::F64Value(42.0)),
                    },
                    Value {
                        value_data: Some(ValueData::StringValue("greptimedb".to_string())),
                    },
                    Value {
                        value_data: Some(ValueData::StringValue("127.0.0.1".to_string())),
                    },
                ],
            }],
        };

        // Reference path: RowModifier with sparse encoding emits the expected
        // binary primary key as the first value of the modified row.
        let row_iter = RowsIter::new(rows, &name_to_column_id);
        let rows = RowModifier::default()
            .modify_rows(
                row_iter,
                TableIdInput::Single(table_id),
                PrimaryKeyEncoding::Sparse,
            )
            .unwrap();
        let ValueData::BinaryValue(expected_pk) =
            rows.rows[0].values[0].value_data.clone().unwrap()
        else {
            panic!("expected binary primary key");
        };

        let actual_array = modified
            .column(0)
            .as_any()
            .downcast_ref::<BinaryArray>()
            .unwrap();
        assert_eq!(actual_array.value(0), expected_pk.as_slice());
    }
}
455}