Files
greptimedb/src/index/src/inverted_index/format/writer/blob.rs
Zhenchi cbf21e53a9 feat(puffin): apply range reader (#4928)
* wip

Signed-off-by: Zhenchi <zhongzc_arch@outlook.com>

* feat(puffin): apply range reader

Signed-off-by: Zhenchi <zhongzc_arch@outlook.com>

* refactor: read_vec reduce iteration

Signed-off-by: Zhenchi <zhongzc_arch@outlook.com>

* refactor: simplify rangereader for vec<u8>

Signed-off-by: Zhenchi <zhongzc_arch@outlook.com>

* test: add unit test

Signed-off-by: Zhenchi <zhongzc_arch@outlook.com>

* fix: toml format

Signed-off-by: Zhenchi <zhongzc_arch@outlook.com>

---------

Signed-off-by: Zhenchi <zhongzc_arch@outlook.com>
2024-11-12 02:36:38 +00:00

236 lines
8.0 KiB
Rust

// Copyright 2023 Greptime Team
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
use std::num::NonZeroUsize;
use async_trait::async_trait;
use common_base::BitVec;
use futures::{AsyncWrite, AsyncWriteExt};
use greptime_proto::v1::index::InvertedIndexMetas;
use prost::Message;
use snafu::ResultExt;
use crate::inverted_index::error::{CloseSnafu, FlushSnafu, Result, WriteSnafu};
use crate::inverted_index::format::writer::single::SingleIndexWriter;
use crate::inverted_index::format::writer::{InvertedIndexWriter, ValueStream};
/// `InvertedIndexBlobWriter`, implemented [`InvertedIndexWriter`], manages
/// writing of an inverted index to a blob storage.
pub struct InvertedIndexBlobWriter<W> {
/// The underlying blob storage
blob_writer: W,
/// Tracks the total number of bytes written to the storage so far
written_size: u64,
/// Metadata about each index that has been written
metas: InvertedIndexMetas,
}
#[async_trait]
impl<W: AsyncWrite + Send + Unpin> InvertedIndexWriter for InvertedIndexBlobWriter<W> {
async fn add_index(
&mut self,
name: String,
null_bitmap: BitVec,
values: ValueStream,
) -> Result<()> {
let single_writer = SingleIndexWriter::new(
name.clone(),
self.written_size,
null_bitmap,
values,
&mut self.blob_writer,
);
let metadata = single_writer.write().await?;
self.written_size += metadata.inverted_index_size;
self.metas.metas.insert(name, metadata);
Ok(())
}
async fn finish(
&mut self,
total_row_count: u64,
segment_row_count: NonZeroUsize,
) -> Result<()> {
self.metas.segment_row_count = segment_row_count.get() as _;
self.metas.total_row_count = total_row_count;
let metas_bytes = self.metas.encode_to_vec();
self.blob_writer
.write_all(&metas_bytes)
.await
.context(WriteSnafu)?;
let footer_size = metas_bytes.len() as u32;
self.blob_writer
.write_all(&footer_size.to_le_bytes())
.await
.context(WriteSnafu)?;
self.blob_writer.flush().await.context(FlushSnafu)?;
self.blob_writer.close().await.context(CloseSnafu)?;
Ok(())
}
}
impl<W: AsyncWrite + Send + Unpin> InvertedIndexBlobWriter<W> {
pub fn new(blob_writer: W) -> InvertedIndexBlobWriter<W> {
InvertedIndexBlobWriter {
blob_writer,
written_size: 0,
metas: InvertedIndexMetas::default(),
}
}
}
#[cfg(test)]
mod tests {
use futures::stream;
use super::*;
use crate::inverted_index::format::reader::{InvertedIndexBlobReader, InvertedIndexReader};
use crate::inverted_index::Bytes;
fn unpack(fst_value: u64) -> [u32; 2] {
bytemuck::cast::<u64, [u32; 2]>(fst_value)
}
#[tokio::test]
async fn test_inverted_index_blob_writer_write_empty() {
let mut blob = Vec::new();
let mut writer = InvertedIndexBlobWriter::new(&mut blob);
writer
.finish(8, NonZeroUsize::new(1).unwrap())
.await
.unwrap();
let mut reader = InvertedIndexBlobReader::new(blob);
let metadata = reader.metadata().await.unwrap();
assert_eq!(metadata.total_row_count, 8);
assert_eq!(metadata.segment_row_count, 1);
assert_eq!(metadata.metas.len(), 0);
}
#[tokio::test]
async fn test_inverted_index_blob_writer_write_basic() {
let mut blob = Vec::new();
let mut writer = InvertedIndexBlobWriter::new(&mut blob);
writer
.add_index(
"tag0".to_string(),
BitVec::from_slice(&[0b0000_0001, 0b0000_0000]),
Box::new(stream::iter(vec![
Ok((Bytes::from("a"), BitVec::from_slice(&[0b0000_0001]))),
Ok((Bytes::from("b"), BitVec::from_slice(&[0b0010_0000]))),
Ok((Bytes::from("c"), BitVec::from_slice(&[0b0000_0001]))),
])),
)
.await
.unwrap();
writer
.add_index(
"tag1".to_string(),
BitVec::from_slice(&[0b0000_0001, 0b0000_0000]),
Box::new(stream::iter(vec![
Ok((Bytes::from("x"), BitVec::from_slice(&[0b0000_0001]))),
Ok((Bytes::from("y"), BitVec::from_slice(&[0b0010_0000]))),
Ok((Bytes::from("z"), BitVec::from_slice(&[0b0000_0001]))),
])),
)
.await
.unwrap();
writer
.finish(8, NonZeroUsize::new(1).unwrap())
.await
.unwrap();
let mut reader = InvertedIndexBlobReader::new(blob);
let metadata = reader.metadata().await.unwrap();
assert_eq!(metadata.total_row_count, 8);
assert_eq!(metadata.segment_row_count, 1);
assert_eq!(metadata.metas.len(), 2);
// tag0
let tag0 = metadata.metas.get("tag0").unwrap();
let stats0 = tag0.stats.as_ref().unwrap();
assert_eq!(stats0.distinct_count, 3);
assert_eq!(stats0.null_count, 1);
assert_eq!(stats0.min_value, Bytes::from("a"));
assert_eq!(stats0.max_value, Bytes::from("c"));
let fst0 = reader
.fst(
tag0.base_offset + tag0.relative_fst_offset as u64,
tag0.fst_size,
)
.await
.unwrap();
assert_eq!(fst0.len(), 3);
let [offset, size] = unpack(fst0.get(b"a").unwrap());
let bitmap = reader
.bitmap(tag0.base_offset + offset as u64, size)
.await
.unwrap();
assert_eq!(bitmap, BitVec::from_slice(&[0b0000_0001]));
let [offset, size] = unpack(fst0.get(b"b").unwrap());
let bitmap = reader
.bitmap(tag0.base_offset + offset as u64, size)
.await
.unwrap();
assert_eq!(bitmap, BitVec::from_slice(&[0b0010_0000]));
let [offset, size] = unpack(fst0.get(b"c").unwrap());
let bitmap = reader
.bitmap(tag0.base_offset + offset as u64, size)
.await
.unwrap();
assert_eq!(bitmap, BitVec::from_slice(&[0b0000_0001]));
// tag1
let tag1 = metadata.metas.get("tag1").unwrap();
let stats1 = tag1.stats.as_ref().unwrap();
assert_eq!(stats1.distinct_count, 3);
assert_eq!(stats1.null_count, 1);
assert_eq!(stats1.min_value, Bytes::from("x"));
assert_eq!(stats1.max_value, Bytes::from("z"));
let fst1 = reader
.fst(
tag1.base_offset + tag1.relative_fst_offset as u64,
tag1.fst_size,
)
.await
.unwrap();
assert_eq!(fst1.len(), 3);
let [offset, size] = unpack(fst1.get(b"x").unwrap());
let bitmap = reader
.bitmap(tag1.base_offset + offset as u64, size)
.await
.unwrap();
assert_eq!(bitmap, BitVec::from_slice(&[0b0000_0001]));
let [offset, size] = unpack(fst1.get(b"y").unwrap());
let bitmap = reader
.bitmap(tag1.base_offset + offset as u64, size)
.await
.unwrap();
assert_eq!(bitmap, BitVec::from_slice(&[0b0010_0000]));
let [offset, size] = unpack(fst1.get(b"z").unwrap());
let bitmap = reader
.bitmap(tag1.base_offset + offset as u64, size)
.await
.unwrap();
assert_eq!(bitmap, BitVec::from_slice(&[0b0000_0001]));
}
}