Skip to main content

mito2/
wal.rs

1// Copyright 2023 Greptime Team
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7//     http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15//! Write ahead log of the engine.
16
17pub mod encoder;
18pub(crate) mod entry_distributor;
19pub(crate) mod entry_reader;
20pub(crate) mod raw_entry_reader;
21
22use std::collections::HashMap;
23use std::mem;
24use std::sync::Arc;
25
26use api::v1::WalEntry;
27use common_error::ext::BoxedError;
28use common_telemetry::debug;
29use encoder::WalEntryEncoder;
30use entry_reader::NoopEntryReader;
31use futures::future::BoxFuture;
32use futures::stream::BoxStream;
33use snafu::ResultExt;
34use store_api::logstore::entry::Entry;
35use store_api::logstore::provider::Provider;
36use store_api::logstore::{AppendBatchResponse, LogStore, WalIndex};
37use store_api::storage::RegionId;
38
39use crate::error::{BuildEntrySnafu, DeleteWalSnafu, Result, WriteWalSnafu};
40use crate::wal::entry_reader::{LogStoreEntryReader, WalEntryReader};
41use crate::wal::raw_entry_reader::{LogStoreRawEntryReader, RegionRawEntryReader};
42
/// Id of an entry in the WAL.
///
/// This is an alias of the entry id type defined by the log store API.
pub type EntryId = store_api::logstore::entry::Id;
/// A boxed stream of WAL read results.
///
/// Each item is either a tuple of the entry id and the corresponding
/// [WalEntry], or the error that occurred while producing it.
pub type WalEntryStream<'a> = BoxStream<'a, Result<(EntryId, WalEntry)>>;
47
/// Write ahead log.
///
/// All regions in the engine share the same WAL instance. The log store is
/// held behind an [Arc], so cloning a [Wal] only clones that handle.
#[derive(Debug)]
pub struct Wal<S> {
    /// The underlying log store.
    store: Arc<S>,
}
56
57impl<S> Wal<S> {
58    /// Creates a new [Wal] from the log store.
59    pub fn new(store: Arc<S>) -> Self {
60        Self { store }
61    }
62
63    pub fn store(&self) -> &Arc<S> {
64        &self.store
65    }
66}
67
68impl<S> Clone for Wal<S> {
69    fn clone(&self) -> Self {
70        Self {
71            store: Arc::clone(&self.store),
72        }
73    }
74}
75
76impl<S: LogStore> Wal<S> {
77    /// Returns a writer to write to the WAL.
78    pub fn writer(&self) -> WalWriter<S> {
79        WalWriter {
80            store: self.store.clone(),
81            entries: Vec::new(),
82            providers: HashMap::new(),
83            encoder: WalEntryEncoder::new(),
84        }
85    }
86
87    /// Returns a [OnRegionOpened] function.
88    pub(crate) fn on_region_opened(
89        &self,
90    ) -> impl FnOnce(RegionId, EntryId, &Provider) -> BoxFuture<Result<()>> {
91        let store = self.store.clone();
92        move |region_id, last_entry_id, provider| -> BoxFuture<'_, Result<()>> {
93            if let Provider::Noop = provider {
94                debug!("Skip obsolete for region: {}", region_id);
95                return Box::pin(async move { Ok(()) });
96            }
97            Box::pin(async move {
98                store
99                    .obsolete(provider, region_id, last_entry_id)
100                    .await
101                    .map_err(BoxedError::new)
102                    .context(DeleteWalSnafu { region_id })
103            })
104        }
105    }
106
107    /// Returns a [WalEntryReader]
108    pub(crate) fn wal_entry_reader(
109        &self,
110        provider: &Provider,
111        region_id: RegionId,
112        location_id: Option<u64>,
113    ) -> Box<dyn WalEntryReader> {
114        match provider {
115            Provider::RaftEngine(_) => Box::new(LogStoreEntryReader::new(
116                LogStoreRawEntryReader::new(self.store.clone()),
117            )),
118            Provider::Kafka(_) => {
119                let reader = if let Some(location_id) = location_id {
120                    LogStoreRawEntryReader::new(self.store.clone())
121                        .with_wal_index(WalIndex::new(region_id, location_id))
122                } else {
123                    LogStoreRawEntryReader::new(self.store.clone())
124                };
125
126                Box::new(LogStoreEntryReader::new(RegionRawEntryReader::new(
127                    reader, region_id,
128                )))
129            }
130            Provider::Noop => Box::new(NoopEntryReader),
131        }
132    }
133
134    /// Scan entries of specific region starting from `start_id` (inclusive).
135    /// Currently only used in tests.
136    pub fn scan<'a>(
137        &'a self,
138        region_id: RegionId,
139        start_id: EntryId,
140        provider: &'a Provider,
141    ) -> Result<WalEntryStream<'a>> {
142        let mut reader = self.wal_entry_reader(provider, region_id, None);
143        reader.read(provider, start_id)
144    }
145
146    /// Mark entries whose ids `<= last_id` as deleted.
147    pub async fn obsolete(
148        &self,
149        region_id: RegionId,
150        last_id: EntryId,
151        provider: &Provider,
152    ) -> Result<()> {
153        if let Provider::Noop = provider {
154            return Ok(());
155        }
156        self.store
157            .obsolete(provider, region_id, last_id)
158            .await
159            .map_err(BoxedError::new)
160            .context(DeleteWalSnafu { region_id })
161    }
162}
163
/// WAL batch writer.
///
/// Entries are buffered with `add_entry` and handed to the log store in a
/// single `append_batch` call by `write_to_wal`.
pub struct WalWriter<S: LogStore> {
    /// Log store of the WAL.
    store: Arc<S>,
    /// Entries buffered so far and not yet written.
    entries: Vec<Entry>,
    /// Providers of regions being written into, keyed by region id. The
    /// provider stored here is the one used to build a region's entries.
    providers: HashMap<RegionId, Provider>,
    /// Cached-size single-pass encoder, reused across entries in this batch.
    encoder: WalEntryEncoder,
}
175
176impl<S: LogStore> WalWriter<S> {
177    /// Add a wal entry for specific region to the writer's buffer.
178    pub fn add_entry(
179        &mut self,
180        region_id: RegionId,
181        entry_id: EntryId,
182        wal_entry: &WalEntry,
183        provider: &Provider,
184    ) -> Result<()> {
185        // Gets or inserts with a newly built provider.
186        let provider = self
187            .providers
188            .entry(region_id)
189            .or_insert_with(|| provider.clone());
190
191        let data = self.encoder.encode_to_vec(wal_entry);
192        let entry = self
193            .store
194            .entry(data, entry_id, region_id, provider)
195            .map_err(BoxedError::new)
196            .context(BuildEntrySnafu { region_id })?;
197
198        self.entries.push(entry);
199
200        Ok(())
201    }
202
203    /// Write all buffered entries to the WAL.
204    pub async fn write_to_wal(&mut self) -> Result<AppendBatchResponse> {
205        // TODO(yingwen): metrics.
206
207        let entries = mem::take(&mut self.entries);
208        self.store
209            .append_batch(entries)
210            .await
211            .map_err(BoxedError::new)
212            .context(WriteWalSnafu)
213    }
214}
215
216#[cfg(test)]
217mod tests {
218    use api::v1::helper::{tag_column_schema, time_index_column_schema};
219    use api::v1::{
220        ArrowIpc, BulkWalEntry, ColumnDataType, Mutation, OpType, Row, Rows, Value, bulk_wal_entry,
221        value,
222    };
223    use common_recordbatch::DfRecordBatch;
224    use common_test_util::flight::encode_to_flight_data;
225    use common_test_util::temp_dir::{TempDir, create_temp_dir};
226    use datatypes::arrow;
227    use datatypes::arrow::array::{ArrayRef, TimestampMillisecondArray};
228    use datatypes::arrow::datatypes::Field;
229    use datatypes::arrow_array::StringArray;
230    use futures::TryStreamExt;
231    use log_store::raft_engine::log_store::RaftEngineLogStore;
232    use log_store::test_util::log_store_util;
233    use store_api::storage::SequenceNumber;
234
235    use super::*;
236
    /// Test environment holding a raft-engine log store and its directory.
    struct WalEnv {
        /// Temporary directory handed to the log store. It is never read; it is
        /// kept as a field so that it lives as long as the env
        /// (presumably the directory is removed when this is dropped — TODO confirm).
        _wal_dir: TempDir,
        /// The log store shared by the WALs created from this env; always
        /// `Some` after `WalEnv::new`.
        log_store: Option<Arc<RaftEngineLogStore>>,
    }
241
242    impl WalEnv {
243        async fn new() -> WalEnv {
244            let wal_dir = create_temp_dir("");
245            let log_store =
246                log_store_util::create_tmp_local_file_log_store(wal_dir.path().to_str().unwrap())
247                    .await;
248            WalEnv {
249                _wal_dir: wal_dir,
250                log_store: Some(Arc::new(log_store)),
251            }
252        }
253
254        fn new_wal(&self) -> Wal<RaftEngineLogStore> {
255            let log_store = self.log_store.clone().unwrap();
256            Wal::new(log_store)
257        }
258    }
259
260    /// Create a new mutation from rows.
261    ///
262    /// The row format is (string, i64).
263    fn new_mutation(op_type: OpType, sequence: SequenceNumber, rows: &[(&str, i64)]) -> Mutation {
264        let rows = rows
265            .iter()
266            .map(|(str_col, int_col)| {
267                let values = vec![
268                    Value {
269                        value_data: Some(value::ValueData::StringValue(str_col.to_string())),
270                    },
271                    Value {
272                        value_data: Some(value::ValueData::TimestampMillisecondValue(*int_col)),
273                    },
274                ];
275                Row { values }
276            })
277            .collect();
278        let schema = vec![
279            tag_column_schema("tag", ColumnDataType::String),
280            time_index_column_schema("ts", ColumnDataType::TimestampMillisecond),
281        ];
282
283        Mutation {
284            op_type: op_type as i32,
285            sequence,
286            rows: Some(Rows { schema, rows }),
287            write_hint: None,
288        }
289    }
290
291    #[tokio::test]
292    async fn test_write_wal() {
293        let env = WalEnv::new().await;
294        let wal = env.new_wal();
295
296        let entry = WalEntry {
297            mutations: vec![
298                new_mutation(OpType::Put, 1, &[("k1", 1), ("k2", 2)]),
299                new_mutation(OpType::Put, 2, &[("k3", 3), ("k4", 4)]),
300            ],
301            bulk_entries: vec![],
302        };
303        let mut writer = wal.writer();
304        // Region 1 entry 1.
305        let region_id = RegionId::new(1, 1);
306        writer
307            .add_entry(
308                region_id,
309                1,
310                &entry,
311                &Provider::raft_engine_provider(region_id.as_u64()),
312            )
313            .unwrap();
314        // Region 2 entry 1.
315        let region_id = RegionId::new(1, 2);
316        writer
317            .add_entry(
318                region_id,
319                1,
320                &entry,
321                &Provider::raft_engine_provider(region_id.as_u64()),
322            )
323            .unwrap();
324        // Region 1 entry 2.
325        let region_id = RegionId::new(1, 2);
326        writer
327            .add_entry(
328                region_id,
329                2,
330                &entry,
331                &Provider::raft_engine_provider(region_id.as_u64()),
332            )
333            .unwrap();
334
335        // Test writing multiple region to wal.
336        writer.write_to_wal().await.unwrap();
337    }
338
339    fn build_record_batch(rows: &[(&str, i64)]) -> DfRecordBatch {
340        let schema = Arc::new(arrow::datatypes::Schema::new(vec![
341            Field::new("tag", arrow::datatypes::DataType::Utf8, false),
342            Field::new(
343                "ts",
344                arrow::datatypes::DataType::Timestamp(
345                    arrow::datatypes::TimeUnit::Millisecond,
346                    None,
347                ),
348                false,
349            ),
350        ]));
351
352        let tag = Arc::new(StringArray::from_iter_values(
353            rows.iter().map(|r| r.0.to_string()),
354        )) as ArrayRef;
355        let ts = Arc::new(TimestampMillisecondArray::from_iter_values(
356            rows.iter().map(|r| r.1),
357        )) as ArrayRef;
358        DfRecordBatch::try_new(schema, vec![tag, ts]).unwrap()
359    }
360
361    fn build_bulk_wal_entry(sequence_number: SequenceNumber, rows: &[(&str, i64)]) -> BulkWalEntry {
362        let rb = build_record_batch(rows);
363        let (schema, rb) = encode_to_flight_data(rb);
364        let max_ts = rows.iter().map(|r| r.1).max().unwrap();
365        let min_ts = rows.iter().map(|r| r.1).min().unwrap();
366        BulkWalEntry {
367            sequence: sequence_number,
368            max_ts,
369            min_ts,
370            timestamp_index: 1,
371            body: Some(bulk_wal_entry::Body::ArrowIpc(ArrowIpc {
372                schema: schema.data_header,
373                data_header: rb.data_header,
374                payload: rb.data_body,
375            })),
376        }
377    }
378
379    fn sample_entries() -> Vec<WalEntry> {
380        vec![
381            WalEntry {
382                mutations: vec![
383                    new_mutation(OpType::Put, 1, &[("k1", 1), ("k2", 2)]),
384                    new_mutation(OpType::Put, 2, &[("k3", 3), ("k4", 4)]),
385                ],
386                bulk_entries: vec![],
387            },
388            WalEntry {
389                mutations: vec![new_mutation(OpType::Put, 3, &[("k1", 1), ("k2", 2)])],
390                bulk_entries: vec![],
391            },
392            WalEntry {
393                mutations: vec![
394                    new_mutation(OpType::Put, 4, &[("k1", 1), ("k2", 2)]),
395                    new_mutation(OpType::Put, 5, &[("k3", 3), ("k4", 4)]),
396                ],
397                bulk_entries: vec![],
398            },
399            WalEntry {
400                mutations: vec![new_mutation(OpType::Put, 6, &[("k1", 1), ("k2", 2)])],
401                bulk_entries: vec![build_bulk_wal_entry(7, &[("k1", 8), ("k2", 9)])],
402            },
403        ]
404    }
405
406    fn check_entries(
407        expect: &[WalEntry],
408        expect_start_id: EntryId,
409        actual: &[(EntryId, WalEntry)],
410    ) {
411        for (idx, (expect_entry, (actual_id, actual_entry))) in
412            expect.iter().zip(actual.iter()).enumerate()
413        {
414            let expect_id_entry = (expect_start_id + idx as u64, expect_entry);
415            assert_eq!(expect_id_entry, (*actual_id, actual_entry));
416        }
417        assert_eq!(expect.len(), actual.len());
418    }
419
420    #[tokio::test]
421    async fn test_scan_wal() {
422        let env = WalEnv::new().await;
423        let wal = env.new_wal();
424
425        let entries = sample_entries();
426        let (id1, id2) = (RegionId::new(1, 1), RegionId::new(1, 2));
427        let ns1 = Provider::raft_engine_provider(id1.as_u64());
428        let ns2 = Provider::raft_engine_provider(id2.as_u64());
429        let mut writer = wal.writer();
430        writer.add_entry(id1, 1, &entries[0], &ns1).unwrap();
431        // Insert one entry into region2. Scan should not return this entry.
432        writer.add_entry(id2, 1, &entries[0], &ns2).unwrap();
433        writer.add_entry(id1, 2, &entries[1], &ns1).unwrap();
434        writer.add_entry(id1, 3, &entries[2], &ns1).unwrap();
435        writer.add_entry(id1, 4, &entries[3], &ns1).unwrap();
436
437        writer.write_to_wal().await.unwrap();
438
439        // Scan all contents region1
440        let stream = wal.scan(id1, 1, &ns1).unwrap();
441        let actual: Vec<_> = stream.try_collect().await.unwrap();
442        check_entries(&entries, 1, &actual);
443
444        // Scan parts of contents
445        let stream = wal.scan(id1, 2, &ns1).unwrap();
446        let actual: Vec<_> = stream.try_collect().await.unwrap();
447        check_entries(&entries[1..], 2, &actual);
448
449        // Scan out of range
450        let stream = wal.scan(id1, 5, &ns1).unwrap();
451        let actual: Vec<_> = stream.try_collect().await.unwrap();
452        assert!(actual.is_empty());
453    }
454
455    #[tokio::test]
456    async fn test_obsolete_wal() {
457        let env = WalEnv::new().await;
458        let wal = env.new_wal();
459
460        let entries = sample_entries();
461        let mut writer = wal.writer();
462        let region_id = RegionId::new(1, 1);
463        let ns = Provider::raft_engine_provider(region_id.as_u64());
464        writer.add_entry(region_id, 1, &entries[0], &ns).unwrap();
465        writer.add_entry(region_id, 2, &entries[1], &ns).unwrap();
466        writer.add_entry(region_id, 3, &entries[2], &ns).unwrap();
467
468        writer.write_to_wal().await.unwrap();
469
470        // Delete 1, 2.
471        wal.obsolete(region_id, 2, &ns).await.unwrap();
472
473        // Put 4.
474        let mut writer = wal.writer();
475        writer.add_entry(region_id, 4, &entries[3], &ns).unwrap();
476        writer.write_to_wal().await.unwrap();
477
478        // Scan all
479        let stream = wal.scan(region_id, 1, &ns).unwrap();
480        let actual: Vec<_> = stream.try_collect().await.unwrap();
481        check_entries(&entries[2..], 3, &actual);
482    }
483}