refactor: unify flush and compaction to always use FlatSource (#7799)

* feat: support write flat as primary key format

Signed-off-by: evenyag <realevenyag@gmail.com>

* feat: migrate flush to always use FlatSource

Add FormatType propagation in SstWriteRequest and use it to choose
Flat vs PrimaryKey write paths (write_all_flat vs
write_all_flat_as_primary_key) in AccessLayer and WriteCache. Make
compactor and flush derive the sst_write_format from region options or
engine config. Simplify flush logic and remove the old memtable_source
helper. Update tests to set default sst_write_format.

Signed-off-by: evenyag <realevenyag@gmail.com>

* refactor: compaction use flat source

Signed-off-by: evenyag <realevenyag@gmail.com>

* refactor: read parquet sequentially as flat batches

Signed-off-by: evenyag <realevenyag@gmail.com>

* refactor: remove new_batch_with_binary in favor of new_record_batch_with_binary

Replace PrimaryKeyWriteFormat with FlatWriteFormat in test_read_large_binary
test and use new_record_batch_with_binary directly, removing the now-unused
new_batch_with_binary function and its BinaryArray import.

Signed-off-by: evenyag <realevenyag@gmail.com>

* test: add tests for PrimaryKeyWriteFormat::convert_flat_batch

Signed-off-by: evenyag <realevenyag@gmail.com>

* refactor: remove Either from SstWriteRequest

Signed-off-by: evenyag <realevenyag@gmail.com>

* fix: handle index build mode

Signed-off-by: evenyag <realevenyag@gmail.com>

* fix: consider sparse encoding and last non null in flush

Signed-off-by: evenyag <realevenyag@gmail.com>

* test: add unit tests for field_column_start edge cases

Signed-off-by: evenyag <realevenyag@gmail.com>

---------

Signed-off-by: evenyag <realevenyag@gmail.com>
This commit is contained in:
Yingwen
2026-03-13 17:44:13 +08:00
committed by GitHub
parent 74ff5c37ea
commit e215851c8a
17 changed files with 668 additions and 904 deletions

View File

@@ -20,13 +20,14 @@ use clap::Parser;
use colored::Colorize;
use datanode::config::RegionEngineConfig;
use datanode::store;
use either::Either;
use futures::stream;
use mito2::access_layer::{
AccessLayer, AccessLayerRef, Metrics, OperationType, SstWriteRequest, WriteType,
};
use mito2::cache::{CacheManager, CacheManagerRef};
use mito2::config::{FulltextIndexConfig, MitoConfig, Mode};
use mito2::read::Source;
use mito2::read::FlatSource;
use mito2::sst::FormatType;
use mito2::sst::file::{FileHandle, FileMeta};
use mito2::sst::file_purger::{FilePurger, FilePurgerRef};
use mito2::sst::index::intermediate::IntermediateManager;
@@ -210,6 +211,7 @@ impl ObjbenchCommand {
object_store.clone(),
)
.expected_metadata(Some(region_meta.clone()))
.flat_format(true)
.build()
.await
.map_err(|e| {
@@ -231,6 +233,10 @@ impl ObjbenchCommand {
let reader_build_elapsed = reader_build_start.elapsed();
let total_rows = reader.parquet_metadata().file_metadata().num_rows();
println!("{} Reader built in {:?}", "".green(), reader_build_elapsed);
let reader_stream = Box::pin(stream::try_unfold(reader, |mut reader| async move {
let batch = reader.next_record_batch().await?;
Ok(batch.map(|batch| (batch, reader)))
}));
// Build write request
let fulltext_index_config = FulltextIndexConfig {
@@ -241,10 +247,11 @@ impl ObjbenchCommand {
let write_req = SstWriteRequest {
op_type: OperationType::Flush,
metadata: region_meta,
source: Either::Left(Source::Reader(Box::new(reader))),
source: FlatSource::Stream(reader_stream),
cache_manager,
storage: None,
max_sequence: None,
sst_write_format: FormatType::PrimaryKey,
index_options: Default::default(),
index_config: mito_engine_config.index.clone(),
inverted_index_config: MitoConfig::default().inverted_index,