feat(wal): support bulk wal entries (#6178)

* feat/bulk-wal:
 ### Refactor: Simplify Data Handling in LogStore Implementations

 - **`kafka/log_store.rs`, `raft_engine/log_store.rs`, `wal.rs`, `raw_entry_reader.rs`, `logstore.rs`:**
   - Refactored `entry` and `build_entry` functions to accept `Vec<u8>` directly instead of `&mut Vec<u8>`.
   - Removed usage of `std::mem::take` for data handling, simplifying the code and improving readability.
   - Updated test cases to align with the new function signatures.

* feat/bulk-wal:
 ### Add Support for Bulk WAL Entries and Flight Data Encoding

 - **Add `raw_data` field to `BulkPart` and related structs**: Updated `BulkPart` and related structures in `src/mito2/src/memtable/bulk/part.rs`, `src/mito2/src/memtable/simple_bulk_memtable.rs`, `src/mito2/src/memtable/time_partition.rs`, `src/mito2/src/region_write_ctx.rs`,
 `src/mito2/src/worker/handle_bulk_insert.rs`, and `src/store-api/src/region_request.rs` to include a new `raw_data` field for handling Arrow IPC data.
 - **Implement Flight Data Encoding**: Added a new module `flight` in `src/common/test-util/src/flight.rs` to encode record batches to Flight data format.
 - **Update `greptime-proto` dependency**: Changed the revision of the `greptime-proto` dependency in `Cargo.lock` and `Cargo.toml`.
 - **Enhance WAL Writer and Tests**: Modified `src/mito2/src/wal.rs` and related test files to support bulk WAL entries and added tests for encoding and handling bulk data.

* feat/bulk-wal:
 - **Update `greptime-proto` Dependency**: Updated the `greptime-proto` dependency to a new revision in `Cargo.lock` and `Cargo.toml`.
 - **Add `common-grpc` Dependency**: Added `common-grpc` as a dependency in `Cargo.lock` and `src/mito2/Cargo.toml`.
 - **Refactor `BulkPart` Structure**: Removed `num_rows` field and added `num_rows()` method in `src/mito2/src/memtable/bulk/part.rs`. Updated related usages in `src/mito2/src/memtable/simple_bulk_memtable.rs`, `src/mito2/src/memtable/time_partition.rs`, `src/mito2/src/memtable/time_series.rs`,
 `src/mito2/src/region_write_ctx.rs`, and `src/mito2/src/worker/handle_bulk_insert.rs`.
 - **Implement `TryFrom` and `From` for `BulkWalEntry`**: Added implementations for converting between `BulkPart` and `BulkWalEntry` in `src/mito2/src/memtable/bulk/part.rs`.
 - **Handle Bulk Entries in Region Opener**: Added logic to process bulk entries in `src/mito2/src/region/opener.rs`.
 - **Fix `BulkInsertRequest` Handling**: Corrected `region_id` handling in `src/operator/src/bulk_insert.rs` and `src/store-api/src/region_request.rs`.
 - **Add Error Variant for `ConvertBulkWalEntry`**: Added a new error variant in `src/mito2/src/error.rs` for handling bulk WAL entry conversion errors.

* fix: ci

* feat/bulk-wal:
 Add bulk write operation in `opener.rs`

 - Enhanced the region write context by adding a call to `write_bulk()` after `write_memtable()` in `opener.rs`.
 - This change aims to improve the efficiency of writing operations by enabling bulk writes.

* feat/bulk-wal:
 Enhance error handling and metrics in `bulk_insert.rs`

 - Updated `Inserter` to improve error handling by capturing the result of `datanode.handle(request)` and incrementing the `DIST_INGEST_ROW_COUNT` metric with the number of affected rows.

* feat/bulk-wal:
 ### Remove Encode Error Handling for WAL Entries

 - **`error.rs`**: Removed the `EncodeWal` error variant and its associated handling.
 - **`wal.rs`**: Eliminated the `entry_encode_buf` buffer and its usage for encoding WAL entries. Replaced with direct encoding to a vector using `encode_to_vec()`.
This commit is contained in:
Lei, HUANG
2025-05-29 17:10:30 +08:00
committed by GitHub
parent 9afc61f778
commit 4e615e8906
24 changed files with 242 additions and 79 deletions

View File

@@ -89,7 +89,7 @@ pub trait LogStore: Send + Sync + 'static + std::fmt::Debug {
/// Makes an entry instance of the associated Entry type
fn entry(
&self,
data: &mut Vec<u8>,
data: Vec<u8>,
entry_id: EntryId,
region_id: RegionId,
provider: &Provider,

View File

@@ -22,13 +22,13 @@ use api::v1::column_def::{
};
use api::v1::region::bulk_insert_request::Body;
use api::v1::region::{
alter_request, compact_request, region_request, AlterRequest, AlterRequests, ArrowIpc,
BulkInsertRequest, CloseRequest, CompactRequest, CreateRequest, CreateRequests, DeleteRequests,
DropRequest, DropRequests, FlushRequest, InsertRequests, OpenRequest, TruncateRequest,
alter_request, compact_request, region_request, AlterRequest, AlterRequests, BulkInsertRequest,
CloseRequest, CompactRequest, CreateRequest, CreateRequests, DeleteRequests, DropRequest,
DropRequests, FlushRequest, InsertRequests, OpenRequest, TruncateRequest,
};
use api::v1::{
self, set_index, Analyzer, FulltextBackend as PbFulltextBackend, Option as PbOption, Rows,
SemanticType, SkippingIndexType as PbSkippingIndexType, WriteHint,
self, set_index, Analyzer, ArrowIpc, FulltextBackend as PbFulltextBackend, Option as PbOption,
Rows, SemanticType, SkippingIndexType as PbSkippingIndexType, WriteHint,
};
pub use common_base::AffectedRows;
use common_grpc::flight::FlightDecoder;
@@ -325,28 +325,27 @@ fn make_region_truncate(truncate: TruncateRequest) -> Result<Vec<(RegionId, Regi
/// Convert [BulkInsertRequest] to [RegionRequest] and group by [RegionId].
fn make_region_bulk_inserts(request: BulkInsertRequest) -> Result<Vec<(RegionId, RegionRequest)>> {
let region_id = request.region_id.into();
let Some(Body::ArrowIpc(request)) = request.body else {
return Ok(vec![]);
};
let ArrowIpc {
region_id,
schema,
payload,
data_header,
} = request;
let decoder_timer = metrics::CONVERT_REGION_BULK_REQUEST
.with_label_values(&["decode"])
.start_timer();
let mut decoder = FlightDecoder::try_from_schema_bytes(&schema).context(FlightCodecSnafu)?;
let mut decoder =
FlightDecoder::try_from_schema_bytes(&request.schema).context(FlightCodecSnafu)?;
let payload = decoder
.try_decode_record_batch(&data_header, &payload)
.try_decode_record_batch(&request.data_header, &request.payload)
.context(FlightCodecSnafu)?;
decoder_timer.observe_duration();
let region_id: RegionId = region_id.into();
Ok(vec![(
region_id,
RegionRequest::BulkInserts(RegionBulkInsertsRequest { region_id, payload }),
RegionRequest::BulkInserts(RegionBulkInsertsRequest {
region_id,
payload,
raw_data: request,
}),
)])
}
@@ -1137,6 +1136,7 @@ pub struct RegionSequencesRequest {
pub struct RegionBulkInsertsRequest {
pub region_id: RegionId,
pub payload: DfRecordBatch,
pub raw_data: ArrowIpc,
}
impl RegionBulkInsertsRequest {