refactor: replace some usage of MutableBitmap by BitVec (#610)

This commit is contained in:
Yingwen
2022-11-21 17:36:53 +08:00
committed by GitHub
parent 62fcb54258
commit 0791c65149
8 changed files with 47 additions and 73 deletions

View File

@@ -12,8 +12,8 @@
// See the License for the specific language governing permissions and
// limitations under the License.
use bitvec::prelude as bv;
pub use bitvec::prelude;
// `Lsb0` provides the best codegen for bit manipulation,
// see https://github.com/bitvecto-rs/bitvec/blob/main/doc/order/Lsb0.md
pub type BitVec = bv::BitVec<u8, bv::Lsb0>;
pub type BitVec = prelude::BitVec<u8>;

View File

@@ -12,8 +12,8 @@
// See the License for the specific language governing permissions and
// limitations under the License.
pub mod bitset;
pub mod bit_vec;
pub mod buffer;
pub mod bytes;
pub use bitset::BitVec;
pub use bit_vec::BitVec;

View File

@@ -18,8 +18,6 @@ use std::sync::Arc;
use arrow::array::{Array, ArrayRef, BooleanArray, MutableArray, MutableBooleanArray};
use arrow::bitmap::utils::{BitmapIter, ZipValidity};
use arrow::bitmap::MutableBitmap;
use arrow::datatypes::DataType as ArrowDataType;
use snafu::{OptionExt, ResultExt};
use crate::data_type::ConcreteDataType;
@@ -75,14 +73,6 @@ impl<Ptr: Borrow<Option<bool>>> FromIterator<Ptr> for BooleanVector {
}
}
impl From<MutableBitmap> for BooleanVector {
fn from(bitmap: MutableBitmap) -> BooleanVector {
BooleanVector {
array: BooleanArray::new(ArrowDataType::Boolean, bitmap.into(), None),
}
}
}
impl Vector for BooleanVector {
fn data_type(&self) -> ConcreteDataType {
ConcreteDataType::boolean_datatype()
@@ -351,16 +341,4 @@ mod tests {
let expect: VectorRef = Arc::new(BooleanVector::from_slice(&[true, false, true]));
assert_eq!(expect, vector);
}
#[test]
fn test_from_mutable_bitmap() {
let mut bitmap = MutableBitmap::new();
let values = [false, true, true, false, true];
for v in values {
bitmap.push(v);
}
let vector = BooleanVector::from(bitmap);
let expect = BooleanVector::from_slice(&values);
assert_eq!(expect, vector);
}
}

View File

@@ -16,7 +16,7 @@ mod filter;
mod find_unique;
mod replicate;
use arrow::bitmap::MutableBitmap;
use common_base::BitVec;
use crate::error::Result;
use crate::types::PrimitiveElement;
@@ -50,7 +50,7 @@ pub trait VectorOp {
/// Panics if
/// - `selected.len() < self.len()`.
/// - `prev_vector` and `self` have different data types.
fn find_unique(&self, selected: &mut MutableBitmap, prev_vector: Option<&dyn Vector>);
fn find_unique(&self, selected: &mut BitVec, prev_vector: Option<&dyn Vector>);
/// Filters the vector, returns elements matching the `filter` (i.e. where the values are true).
///
@@ -65,7 +65,7 @@ macro_rules! impl_scalar_vector_op {
replicate::$replicate(self, offsets)
}
fn find_unique(&self, selected: &mut MutableBitmap, prev_vector: Option<&dyn Vector>) {
fn find_unique(&self, selected: &mut BitVec, prev_vector: Option<&dyn Vector>) {
let prev_vector = prev_vector.map(|pv| pv.as_any().downcast_ref::<$VectorType>().unwrap());
find_unique::find_unique_scalar(self, selected, prev_vector);
}
@@ -92,7 +92,7 @@ impl VectorOp for ConstantVector {
replicate::replicate_constant(self, offsets)
}
fn find_unique(&self, selected: &mut MutableBitmap, prev_vector: Option<&dyn Vector>) {
fn find_unique(&self, selected: &mut BitVec, prev_vector: Option<&dyn Vector>) {
let prev_vector = prev_vector.and_then(|pv| pv.as_any().downcast_ref::<ConstantVector>());
find_unique::find_unique_constant(self, selected, prev_vector);
}
@@ -107,7 +107,7 @@ impl VectorOp for NullVector {
replicate::replicate_null(self, offsets)
}
fn find_unique(&self, selected: &mut MutableBitmap, prev_vector: Option<&dyn Vector>) {
fn find_unique(&self, selected: &mut BitVec, prev_vector: Option<&dyn Vector>) {
let prev_vector = prev_vector.and_then(|pv| pv.as_any().downcast_ref::<NullVector>());
find_unique::find_unique_null(self, selected, prev_vector);
}
@@ -125,7 +125,7 @@ where
replicate::replicate_primitive(self, offsets)
}
fn find_unique(&self, selected: &mut MutableBitmap, prev_vector: Option<&dyn Vector>) {
fn find_unique(&self, selected: &mut BitVec, prev_vector: Option<&dyn Vector>) {
let prev_vector =
prev_vector.and_then(|pv| pv.as_any().downcast_ref::<PrimitiveVector<T>>());
find_unique::find_unique_scalar(self, selected, prev_vector);

View File

@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
use arrow::bitmap::MutableBitmap;
use common_base::BitVec;
use crate::scalars::ScalarVector;
use crate::vectors::{ConstantVector, NullVector, Vector};
@@ -22,7 +22,7 @@ use crate::vectors::{ConstantVector, NullVector, Vector};
// in any other case.
pub(crate) fn find_unique_scalar<'a, T: ScalarVector>(
vector: &'a T,
selected: &'a mut MutableBitmap,
selected: &'a mut BitVec,
prev_vector: Option<&'a T>,
) where
T::RefItem<'a>: PartialEq,
@@ -63,7 +63,7 @@ pub(crate) fn find_unique_scalar<'a, T: ScalarVector>(
pub(crate) fn find_unique_null(
vector: &NullVector,
selected: &mut MutableBitmap,
selected: &mut BitVec,
prev_vector: Option<&NullVector>,
) {
if vector.is_empty() {
@@ -78,7 +78,7 @@ pub(crate) fn find_unique_null(
pub(crate) fn find_unique_constant(
vector: &ConstantVector,
selected: &mut MutableBitmap,
selected: &mut BitVec,
prev_vector: Option<&ConstantVector>,
) {
if vector.is_empty() {
@@ -107,7 +107,7 @@ mod tests {
use super::*;
use crate::vectors::{Int32Vector, StringVector, VectorOp};
fn check_bitmap(expect: &[bool], selected: &MutableBitmap) {
fn check_bitmap(expect: &[bool], selected: &BitVec) {
let actual = selected.iter().collect::<Vec<_>>();
assert_eq!(expect, actual);
}
@@ -124,7 +124,7 @@ mod tests {
let input = Int32Vector::from_iter(input);
let prev = prev.map(Int32Vector::from_slice);
let mut selected = MutableBitmap::from_len_zeroed(input.len());
let mut selected = BitVec::repeat(false, input.len());
input.find_unique(&mut selected, prev.as_ref().map(|v| v as _));
check_bitmap(expect, &selected);
@@ -164,7 +164,7 @@ mod tests {
let prev = Int32Vector::from_slice(&[1]);
let v1 = Int32Vector::from_slice(&[2, 3, 4]);
let mut selected = MutableBitmap::from_len_zeroed(v1.len());
let mut selected = BitVec::repeat(false, v1.len());
v1.find_unique(&mut selected, Some(&prev));
// Though element in v2 are the same as prev, but we should still keep them.
@@ -174,15 +174,8 @@ mod tests {
check_bitmap(&[true, true, true], &selected);
}
fn new_bitmap(bits: &[bool]) -> MutableBitmap {
let mut bitmap = MutableBitmap::from_len_zeroed(bits.len());
for (i, bit) in bits.iter().enumerate() {
if *bit {
bitmap.set(i, true);
}
}
bitmap
fn new_bitmap(bits: &[bool]) -> BitVec {
BitVec::from_iter(bits)
}
#[test]
@@ -222,7 +215,7 @@ mod tests {
fn check_find_unique_null(len: usize) {
let input = NullVector::new(len);
let mut selected = MutableBitmap::from_len_zeroed(input.len());
let mut selected = BitVec::repeat(false, input.len());
input.find_unique(&mut selected, None);
let mut expect = vec![false; len];
@@ -231,7 +224,7 @@ mod tests {
}
check_bitmap(&expect, &selected);
let mut selected = MutableBitmap::from_len_zeroed(input.len());
let mut selected = BitVec::repeat(false, input.len());
let prev = Some(NullVector::new(1));
input.find_unique(&mut selected, prev.as_ref().map(|v| v as _));
let expect = vec![false; len];
@@ -273,7 +266,7 @@ mod tests {
fn check_find_unique_constant(len: usize) {
let input = ConstantVector::new(Arc::new(Int32Vector::from_slice(&[8])), len);
let mut selected = MutableBitmap::from_len_zeroed(len);
let mut selected = BitVec::repeat(false, len);
input.find_unique(&mut selected, None);
let mut expect = vec![false; len];
@@ -282,7 +275,7 @@ mod tests {
}
check_bitmap(&expect, &selected);
let mut selected = MutableBitmap::from_len_zeroed(len);
let mut selected = BitVec::repeat(false, len);
let prev = Some(ConstantVector::new(
Arc::new(Int32Vector::from_slice(&[8])),
1,
@@ -340,7 +333,7 @@ mod tests {
#[test]
fn test_find_unique_string() {
let input = StringVector::from_slice(&["a", "a", "b", "c"]);
let mut selected = MutableBitmap::from_len_zeroed(4);
let mut selected = BitVec::repeat(false, 4);
input.find_unique(&mut selected, None);
let expect = vec![true, false, true, true];
check_bitmap(&expect, &selected);
@@ -352,7 +345,7 @@ mod tests {
use $crate::vectors::$VectorType;
let v = $VectorType::from_iterator([8, 8, 9, 10].into_iter().map($ValueType::$method));
let mut selected = MutableBitmap::from_len_zeroed(4);
let mut selected = BitVec::repeat(false, 4);
v.find_unique(&mut selected, None);
let expect = vec![true, false, true, true];
check_bitmap(&expect, &selected);

View File

@@ -20,7 +20,7 @@ mod merge;
use std::cmp::Ordering;
use async_trait::async_trait;
use datatypes::arrow::bitmap::MutableBitmap;
use common_base::BitVec;
use datatypes::data_type::DataType;
use datatypes::prelude::ConcreteDataType;
use datatypes::vectors::{BooleanVector, MutableVector, VectorRef};
@@ -127,7 +127,7 @@ pub trait BatchOp {
/// - `batch` and `prev` have different number of columns (unless `prev` is
/// empty).
/// - `selected.len()` is less than the number of rows.
fn find_unique(&self, batch: &Batch, selected: &mut MutableBitmap, prev: Option<&Batch>);
fn find_unique(&self, batch: &Batch, selected: &mut BitVec, prev: Option<&Batch>);
/// Filters the `batch`, returns elements matching the `filter` (i.e. where the values
/// are true).

View File

@@ -13,7 +13,8 @@
// limitations under the License.
use async_trait::async_trait;
use datatypes::arrow::bitmap::MutableBitmap;
use common_base::BitVec;
use datatypes::prelude::ScalarVector;
use datatypes::vectors::BooleanVector;
use crate::error::Result;
@@ -28,6 +29,8 @@ pub struct DedupReader<R> {
reader: R,
/// Previous batch from the reader.
prev_batch: Option<Batch>,
/// Reused bitmap buffer.
selected: BitVec,
}
impl<R> DedupReader<R> {
@@ -36,6 +39,7 @@ impl<R> DedupReader<R> {
schema,
reader,
prev_batch: None,
selected: BitVec::default(),
}
}
@@ -48,12 +52,11 @@ impl<R> DedupReader<R> {
return Ok(batch);
}
// The `arrow` filter needs `BooleanArray` as input so there is no convenient
// and efficient way to reuse the bitmap. Though we could use `MutableBooleanArray`,
// but we couldn't zero all bits in the mutable array easily.
let mut selected = MutableBitmap::from_len_zeroed(batch.num_rows());
// Reinitialize the bit map to zeros.
self.selected.clear();
self.selected.resize(batch.num_rows(), false);
self.schema
.find_unique(&batch, &mut selected, self.prev_batch.as_ref());
.find_unique(&batch, &mut self.selected, self.prev_batch.as_ref());
// Store current batch to `prev_batch` so we could compare the next batch
// with this batch. We store batch before filtering it mainly for correctness, as
@@ -66,7 +69,7 @@ impl<R> DedupReader<R> {
// TODO(yingwen): To support `DELETE`, we could find all rows whose op_types are equal
// to `OpType::Delete`, mark their `selected` to false, then filter the batch.
let filter = BooleanVector::from(selected);
let filter = BooleanVector::from_iterator(self.selected.iter().by_vals());
// Filter duplicate rows.
self.schema.filter(&batch, &filter)
}

View File

@@ -16,8 +16,8 @@ use std::cmp::Ordering;
use std::collections::{BTreeSet, HashMap};
use std::sync::Arc;
use common_base::BitVec;
use common_error::prelude::*;
use datatypes::arrow::bitmap::MutableBitmap;
use datatypes::schema::{SchemaBuilder, SchemaRef};
use datatypes::vectors::BooleanVector;
use store_api::storage::{Chunk, ColumnId};
@@ -303,7 +303,7 @@ impl BatchOp for ProjectedSchema {
})
}
fn find_unique(&self, batch: &Batch, selected: &mut MutableBitmap, prev: Option<&Batch>) {
fn find_unique(&self, batch: &Batch, selected: &mut BitVec, prev: Option<&Batch>) {
if let Some(prev) = prev {
assert_eq!(batch.num_columns(), prev.num_columns());
}
@@ -503,18 +503,18 @@ mod tests {
let schema = read_util::new_projected_schema();
let batch = read_util::new_kv_batch(&[(1000, Some(1)), (2000, Some(2)), (2000, Some(2))]);
let mut selected = MutableBitmap::from_len_zeroed(3);
let mut selected = BitVec::repeat(false, 3);
schema.find_unique(&batch, &mut selected, None);
assert!(selected.get(0));
assert!(selected.get(1));
assert!(!selected.get(2));
assert!(selected[0]);
assert!(selected[1]);
assert!(!selected[2]);
let mut selected = MutableBitmap::from_len_zeroed(3);
let mut selected = BitVec::repeat(false, 3);
let prev = read_util::new_kv_batch(&[(1000, Some(1))]);
schema.find_unique(&batch, &mut selected, Some(&prev));
assert!(!selected.get(0));
assert!(selected.get(1));
assert!(!selected.get(2));
assert!(!selected[0]);
assert!(selected[1]);
assert!(!selected[2]);
}
#[test]