fix: Fix filtering out rows incorrectly during dedup phase (#484)

* fix: dedup should not mark element as unneeded

It should only mark an element as selected, because different rows may
have the same value in some column.

* refactor: Rename dedup to find_unique

The original `dedup` method only sets a bit in the bitmap to true when it
finds that the element is unique, so `find_unique` is a more appropriate
name for it.

* test: Renew bitmap in test_batch_find_unique

* chore: Update comments
This commit is contained in:
Yingwen
2022-11-14 21:40:17 +08:00
committed by GitHub
parent fdae67b43e
commit 281eae9f44
6 changed files with 386 additions and 253 deletions

View File

@@ -1,5 +1,5 @@
mod dedup;
mod filter;
mod find_unique;
mod replicate;
use arrow::bitmap::MutableBitmap;
@@ -19,23 +19,22 @@ pub trait VectorOp {
/// Panics if `offsets.len() != self.len()`.
fn replicate(&self, offsets: &[usize]) -> VectorRef;
/// Dedup elements in `self` and mark `i-th` bit of `selected` to `true` if the `i-th` element
/// of `self` is retained.
/// Mark `i-th` bit of `selected` to `true` if the `i-th` element of `self` is unique, which
/// means the element immediately before it does not have the same value.
///
/// The caller should ensure
/// 1. the `selected` bitmap is initialized by setting `[0, vector.len())`
/// bits to false.
/// 1. the length of `selected` bitmap is equal to `vector.len()`.
/// 2. `vector` and `prev_vector` are sorted.
///
/// If there are multiple duplicate elements, this function retains the **first** element.
/// If the first element of `self` is equal to the last element of `prev_vector`, then that
/// first element is also considered as duplicated and won't be retained.
/// The first element is considered as unique if the first element of `self` is different
/// from its previous element, that is the last element of `prev_vector`.
///
/// # Panics
/// Panics if
/// - `selected.len() < self.len()`.
/// - `prev_vector` and `self` have different data types.
fn dedup(&self, selected: &mut MutableBitmap, prev_vector: Option<&dyn Vector>);
fn find_unique(&self, selected: &mut MutableBitmap, prev_vector: Option<&dyn Vector>);
/// Filters the vector, returns elements matching the `filter` (i.e. where the values are true).
///
@@ -50,9 +49,9 @@ macro_rules! impl_scalar_vector_op {
replicate::$replicate(self, offsets)
}
fn dedup(&self, selected: &mut MutableBitmap, prev_vector: Option<&dyn Vector>) {
fn find_unique(&self, selected: &mut MutableBitmap, prev_vector: Option<&dyn Vector>) {
let prev_vector = prev_vector.map(|pv| pv.as_any().downcast_ref::<$VectorType>().unwrap());
dedup::dedup_scalar(self, selected, prev_vector);
find_unique::find_unique_scalar(self, selected, prev_vector);
}
fn filter(&self, filter: &BooleanVector) -> Result<VectorRef> {
@@ -77,9 +76,9 @@ impl VectorOp for ConstantVector {
replicate::replicate_constant(self, offsets)
}
fn dedup(&self, selected: &mut MutableBitmap, prev_vector: Option<&dyn Vector>) {
fn find_unique(&self, selected: &mut MutableBitmap, prev_vector: Option<&dyn Vector>) {
let prev_vector = prev_vector.and_then(|pv| pv.as_any().downcast_ref::<ConstantVector>());
dedup::dedup_constant(self, selected, prev_vector);
find_unique::find_unique_constant(self, selected, prev_vector);
}
fn filter(&self, filter: &BooleanVector) -> Result<VectorRef> {
@@ -92,9 +91,9 @@ impl VectorOp for NullVector {
replicate::replicate_null(self, offsets)
}
fn dedup(&self, selected: &mut MutableBitmap, prev_vector: Option<&dyn Vector>) {
fn find_unique(&self, selected: &mut MutableBitmap, prev_vector: Option<&dyn Vector>) {
let prev_vector = prev_vector.and_then(|pv| pv.as_any().downcast_ref::<NullVector>());
dedup::dedup_null(self, selected, prev_vector);
find_unique::find_unique_null(self, selected, prev_vector);
}
fn filter(&self, filter: &BooleanVector) -> Result<VectorRef> {
@@ -110,10 +109,10 @@ where
replicate::replicate_primitive(self, offsets)
}
fn dedup(&self, selected: &mut MutableBitmap, prev_vector: Option<&dyn Vector>) {
fn find_unique(&self, selected: &mut MutableBitmap, prev_vector: Option<&dyn Vector>) {
let prev_vector =
prev_vector.and_then(|pv| pv.as_any().downcast_ref::<PrimitiveVector<T>>());
dedup::dedup_scalar(self, selected, prev_vector);
find_unique::find_unique_scalar(self, selected, prev_vector);
}
fn filter(&self, filter: &BooleanVector) -> Result<VectorRef> {

View File

@@ -1,223 +0,0 @@
use arrow::bitmap::MutableBitmap;
use crate::scalars::ScalarVector;
use crate::vectors::{ConstantVector, NullVector, Vector};
/// Marks every element of `vector` that differs from the element before it.
///
/// For each index `i`, the `i`-th bit of `selected` is set to `true` when the `i`-th element
/// is not equal to its predecessor. The predecessor of the first element is the last element
/// of `prev_vector`; when `prev_vector` is `None` or empty the first element is always marked.
///
/// Bits are only ever set, never cleared. The same bitmap is passed to this function once per
/// column, so a value that repeats in this column must not erase a mark that was made because
/// another column differed.
///
/// # Panics
/// Panics if `selected.len() < vector.len()`.
pub(crate) fn dedup_scalar<'a, T: ScalarVector>(
    vector: &'a T,
    selected: &'a mut MutableBitmap,
    prev_vector: Option<&'a T>,
) where
    T::RefItem<'a>: PartialEq,
{
    assert!(selected.len() >= vector.len());

    if vector.is_empty() {
        return;
    }

    // Compare each element with its successor and mark the successor when it differs.
    let neighbours = vector.iter_data().zip(vector.iter_data().skip(1));
    for (i, (current, next)) in neighbours.enumerate() {
        if current != next {
            selected.set(i + 1, true);
        }
    }

    // The first element is a duplicate only if a previous, non-empty vector ends with the
    // same value.
    let first_is_duplicate = match prev_vector {
        Some(pv) if !pv.is_empty() => pv.get_data(pv.len() - 1) == vector.get_data(0),
        _ => false,
    };

    // Mark the first element when it is new; otherwise leave its bit exactly as it was
    // instead of forcing it to `false`.
    if !first_is_duplicate {
        selected.set(0, true);
    }
}
/// Marks the first element of a [NullVector] when no element precedes it.
///
/// All elements of a null vector are equal, so only index 0 can ever be marked, and only
/// when `prev_vector` is `None` or empty. No bit of `selected` is ever cleared.
///
/// # Panics
/// Panics if `selected.len() < vector.len()`.
pub(crate) fn dedup_null(
    vector: &NullVector,
    selected: &mut MutableBitmap,
    prev_vector: Option<&NullVector>,
) {
    // Enforce the length contract up front, as `dedup_scalar` does; without this a bitmap
    // that is too short goes unnoticed whenever no bit happens to be written.
    assert!(selected.len() >= vector.len());

    if vector.is_empty() {
        return;
    }

    let no_prev_element = prev_vector.map_or(true, |pv| pv.is_empty());
    if no_prev_element {
        // Retain the first element if there is no previous element (we know that it must
        // be null).
        selected.set(0, true);
    }
}
/// Marks the first element of a [ConstantVector] when it differs from what precedes it.
///
/// All elements of a constant vector are equal, so only index 0 can ever be marked. It is
/// marked unless `prev_vector` is non-empty and holds the same constant. No bit of
/// `selected` is ever cleared.
///
/// # Panics
/// Panics if `selected.len() < vector.len()`.
pub(crate) fn dedup_constant(
    vector: &ConstantVector,
    selected: &mut MutableBitmap,
    prev_vector: Option<&ConstantVector>,
) {
    // Enforce the length contract up front, as `dedup_scalar` does; without this a bitmap
    // that is too short goes unnoticed whenever no bit happens to be written.
    assert!(selected.len() >= vector.len());

    if vector.is_empty() {
        return;
    }

    let equal_to_prev = match prev_vector {
        Some(prev) => !prev.is_empty() && vector.get_constant_ref() == prev.get_constant_ref(),
        None => false,
    };

    if !equal_to_prev {
        selected.set(0, true);
    }
}
// Unit tests for the `dedup` vector operation on scalar, null, constant, string and
// date-like vectors.
#[cfg(test)]
mod tests {
use std::sync::Arc;
use super::*;
use crate::vectors::{Int32Vector, StringVector, VectorOp};
// Asserts that `selected` has the same length as `expect` and the same value at every bit.
fn check_bitmap(expect: &[bool], selected: &MutableBitmap) {
assert_eq!(expect.len(), selected.len());
for (exp, v) in expect.iter().zip(selected.iter()) {
assert_eq!(*exp, v);
}
}
// Convenience wrapper for inputs without nulls: wraps every value in `Some` and delegates
// to `check_dedup_scalar_opt`.
fn check_dedup_scalar(expect: &[bool], input: &[i32], prev: Option<&[i32]>) {
check_dedup_scalar_opt(expect, input.iter().map(|v| Some(*v)), prev);
}
// Builds an `Int32Vector` from `input` (and optionally one from `prev`), runs `dedup` on a
// zeroed bitmap of the input's length, and compares the bitmap with `expect`.
fn check_dedup_scalar_opt(
expect: &[bool],
input: impl Iterator<Item = Option<i32>>,
prev: Option<&[i32]>,
) {
let input = Int32Vector::from_iter(input);
let prev = prev.map(Int32Vector::from_slice);
let mut selected = MutableBitmap::from_len_zeroed(input.len());
input.dedup(&mut selected, prev.as_ref().map(|v| v as _));
check_bitmap(expect, &selected);
}
#[test]
fn test_dedup_scalar() {
// Without a previous vector.
check_dedup_scalar(&[], &[], None);
check_dedup_scalar(&[true], &[1], None);
check_dedup_scalar(&[true, false], &[1, 1], None);
check_dedup_scalar(&[true, true], &[1, 2], None);
check_dedup_scalar(&[true, true, true, true], &[1, 2, 3, 4], None);
check_dedup_scalar(&[true, false, true, false], &[1, 1, 3, 3], None);
check_dedup_scalar(&[true, false, false, false, true], &[2, 2, 2, 2, 3], None);
// With a previous vector: the first element is compared against the last element of
// `prev` (an empty `prev` behaves like `None`).
check_dedup_scalar(&[true], &[5], Some(&[]));
check_dedup_scalar(&[true], &[5], Some(&[3]));
check_dedup_scalar(&[false], &[5], Some(&[5]));
check_dedup_scalar(&[false], &[5], Some(&[4, 5]));
check_dedup_scalar(&[false, true], &[5, 6], Some(&[4, 5]));
check_dedup_scalar(&[false, true, false], &[5, 6, 6], Some(&[4, 5]));
check_dedup_scalar(
&[false, true, false, true, true],
&[5, 6, 6, 7, 8],
Some(&[4, 5]),
);
// Null values: consecutive `None`s count as duplicates of each other.
check_dedup_scalar_opt(
&[true, true, false, true, false],
[Some(1), Some(2), Some(2), None, None].into_iter(),
None,
);
}
// Checks `dedup` on a `NullVector` of `len` elements, first without a previous vector and
// then with a previous null vector of one element.
fn check_dedup_null(len: usize) {
let input = NullVector::new(len);
let mut selected = MutableBitmap::from_len_zeroed(input.len());
input.dedup(&mut selected, None);
// Without a previous vector only the first element (if any) is expected to be set.
let mut expect = vec![false; len];
if !expect.is_empty() {
expect[0] = true;
}
check_bitmap(&expect, &selected);
let mut selected = MutableBitmap::from_len_zeroed(input.len());
let prev = Some(NullVector::new(1));
input.dedup(&mut selected, prev.as_ref().map(|v| v as _));
// With a non-empty previous null vector no bit is expected to be set.
let expect = vec![false; len];
check_bitmap(&expect, &selected);
}
#[test]
fn test_dedup_null() {
for len in 0..5 {
check_dedup_null(len);
}
}
// Checks `dedup` on a `ConstantVector` holding the value 8 repeated `len` times, first
// without a previous vector and then with a previous vector holding the same constant.
fn check_dedup_constant(len: usize) {
let input = ConstantVector::new(Arc::new(Int32Vector::from_slice(&[8])), len);
let mut selected = MutableBitmap::from_len_zeroed(len);
input.dedup(&mut selected, None);
// Without a previous vector only the first element (if any) is expected to be set.
let mut expect = vec![false; len];
if !expect.is_empty() {
expect[0] = true;
}
check_bitmap(&expect, &selected);
let mut selected = MutableBitmap::from_len_zeroed(len);
let prev = Some(ConstantVector::new(
Arc::new(Int32Vector::from_slice(&[8])),
1,
));
input.dedup(&mut selected, prev.as_ref().map(|v| v as _));
// The previous vector holds the same constant, so no bit is expected to be set.
let expect = vec![false; len];
check_bitmap(&expect, &selected);
}
#[test]
fn test_dedup_constant() {
for len in 0..5 {
check_dedup_constant(len);
}
}
#[test]
fn test_dedup_string() {
let input = StringVector::from_slice(&["a", "a", "b", "c"]);
let mut selected = MutableBitmap::from_len_zeroed(4);
input.dedup(&mut selected, None);
// The second "a" repeats the first one and is the only unset bit.
let expect = vec![true, false, true, true];
check_bitmap(&expect, &selected);
}
// Expands to a block that builds a `$VectorType` from the raw values [8, 8, 9, 10]
// (converted with `$ValueType::$method`), runs `dedup` without a previous vector and
// expects only the repeated second element to stay unset.
macro_rules! impl_dedup_date_like_test {
($VectorType: ident, $ValueType: ident, $method: ident) => {{
use common_time::$ValueType;
use $crate::vectors::$VectorType;
let v = $VectorType::from_iterator([8, 8, 9, 10].into_iter().map($ValueType::$method));
let mut selected = MutableBitmap::from_len_zeroed(4);
v.dedup(&mut selected, None);
let expect = vec![true, false, true, true];
check_bitmap(&expect, &selected);
}};
}
#[test]
fn test_dedup_date_like() {
impl_dedup_date_like_test!(DateVector, Date, new);
impl_dedup_date_like_test!(DateTimeVector, DateTime, new);
impl_dedup_date_like_test!(TimestampVector, Timestamp, from_millis);
}
}

View File

@@ -0,0 +1,354 @@
use arrow::bitmap::MutableBitmap;
use crate::scalars::ScalarVector;
use crate::vectors::{ConstantVector, NullVector, Vector};
// To implement `find_unique()` correctly, keep in mind that it always marks an element as
// selected when it is different from the previous one, and leaves `selected` unchanged
// in any other case.
/// Sets the bit of every element in `vector` whose value differs from its predecessor.
///
/// The predecessor of the first element is the last element of `prev_vector`; if
/// `prev_vector` is `None` or empty, the first element is always marked. Bits of elements
/// that equal their predecessor are left untouched, so marks already present in `selected`
/// survive.
///
/// # Panics
/// Panics if `selected.len() < vector.len()`.
pub(crate) fn find_unique_scalar<'a, T: ScalarVector>(
    vector: &'a T,
    selected: &'a mut MutableBitmap,
    prev_vector: Option<&'a T>,
) where
    T::RefItem<'a>: PartialEq,
{
    assert!(selected.len() >= vector.len());

    if vector.is_empty() {
        return;
    }

    // Pair every element with its successor; a mismatch means the successor starts a new
    // run of values and is therefore selected.
    let neighbours = vector.iter_data().zip(vector.iter_data().skip(1));
    for (idx, (lhs, rhs)) in neighbours.enumerate() {
        if lhs != rhs {
            selected.set(idx + 1, true);
        }
    }

    // Select the first element if it differs from the element preceding it; otherwise keep
    // its bit as it is.
    let first_differs_from_prev = match prev_vector {
        Some(pv) if !pv.is_empty() => pv.get_data(pv.len() - 1) != vector.get_data(0),
        _ => true,
    };
    if first_differs_from_prev {
        selected.set(0, true);
    }
}
/// Marks the first element of a [NullVector] as selected when no element precedes it.
///
/// All elements of a null vector are equal, so only index 0 can ever be marked, and only
/// when `prev_vector` is `None` or empty. In every other case `selected` is left unchanged.
///
/// # Panics
/// Panics if `selected.len() < vector.len()`.
pub(crate) fn find_unique_null(
    vector: &NullVector,
    selected: &mut MutableBitmap,
    prev_vector: Option<&NullVector>,
) {
    // Enforce the length contract up front, as `find_unique_scalar` does; without this a
    // bitmap that is too short goes unnoticed whenever no bit happens to be written.
    assert!(selected.len() >= vector.len());

    if vector.is_empty() {
        return;
    }

    let is_first_not_duplicate = prev_vector.map_or(true, |pv| pv.is_empty());
    if is_first_not_duplicate {
        selected.set(0, true);
    }
}
/// Marks the first element of a [ConstantVector] as selected when it differs from the
/// element preceding it.
///
/// All elements of a constant vector are equal, so only index 0 can ever be marked. It is
/// marked when `prev_vector` is `None`, empty, or holds a different constant. In every other
/// case `selected` is left unchanged.
///
/// # Panics
/// Panics if `selected.len() < vector.len()`.
pub(crate) fn find_unique_constant(
    vector: &ConstantVector,
    selected: &mut MutableBitmap,
    prev_vector: Option<&ConstantVector>,
) {
    // Enforce the length contract up front, as `find_unique_scalar` does; without this a
    // bitmap that is too short goes unnoticed whenever no bit happens to be written.
    assert!(selected.len() >= vector.len());

    if vector.is_empty() {
        return;
    }

    let is_first_not_duplicate = match prev_vector {
        Some(pv) if !pv.is_empty() => vector.get_constant_ref() != pv.get_constant_ref(),
        _ => true,
    };
    if is_first_not_duplicate {
        selected.set(0, true);
    }
}
// Unit tests for the `find_unique` vector operation on scalar, null, constant, string and
// date-like vectors, including cases where the bitmap already has bits set.
#[cfg(test)]
mod tests {
use std::sync::Arc;
use super::*;
use crate::vectors::{Int32Vector, StringVector, VectorOp};
// Asserts that the bits of `selected` equal `expect`.
fn check_bitmap(expect: &[bool], selected: &MutableBitmap) {
let actual = selected.iter().collect::<Vec<_>>();
assert_eq!(expect, actual);
}
// Convenience wrapper for inputs without nulls: wraps every value in `Some` and delegates
// to `check_find_unique_scalar_opt`.
fn check_find_unique_scalar(expect: &[bool], input: &[i32], prev: Option<&[i32]>) {
check_find_unique_scalar_opt(expect, input.iter().map(|v| Some(*v)), prev);
}
// Builds an `Int32Vector` from `input` (and optionally one from `prev`), runs `find_unique`
// on a zeroed bitmap of the input's length, and compares the bitmap with `expect`.
fn check_find_unique_scalar_opt(
expect: &[bool],
input: impl Iterator<Item = Option<i32>>,
prev: Option<&[i32]>,
) {
let input = Int32Vector::from_iter(input);
let prev = prev.map(Int32Vector::from_slice);
let mut selected = MutableBitmap::from_len_zeroed(input.len());
input.find_unique(&mut selected, prev.as_ref().map(|v| v as _));
check_bitmap(expect, &selected);
}
#[test]
fn test_find_unique_scalar() {
// Without a previous vector.
check_find_unique_scalar(&[], &[], None);
check_find_unique_scalar(&[true], &[1], None);
check_find_unique_scalar(&[true, false], &[1, 1], None);
check_find_unique_scalar(&[true, true], &[1, 2], None);
check_find_unique_scalar(&[true, true, true, true], &[1, 2, 3, 4], None);
check_find_unique_scalar(&[true, false, true, false], &[1, 1, 3, 3], None);
check_find_unique_scalar(&[true, false, false, false, true], &[2, 2, 2, 2, 3], None);
// With a previous vector: the first element is compared against the last element of
// `prev` (an empty `prev` behaves like `None`).
check_find_unique_scalar(&[true], &[5], Some(&[]));
check_find_unique_scalar(&[true], &[5], Some(&[3]));
check_find_unique_scalar(&[false], &[5], Some(&[5]));
check_find_unique_scalar(&[false], &[5], Some(&[4, 5]));
check_find_unique_scalar(&[false, true], &[5, 6], Some(&[4, 5]));
check_find_unique_scalar(&[false, true, false], &[5, 6, 6], Some(&[4, 5]));
check_find_unique_scalar(
&[false, true, false, true, true],
&[5, 6, 6, 7, 8],
Some(&[4, 5]),
);
// Null values: consecutive `None`s count as duplicates of each other.
check_find_unique_scalar_opt(
&[true, true, false, true, false],
[Some(1), Some(2), Some(2), None, None].into_iter(),
None,
);
}
// Runs `find_unique` twice on the same bitmap to check that a later call does not clear
// bits set by an earlier one.
#[test]
fn test_find_unique_scalar_multi_times_with_prev() {
let prev = Int32Vector::from_slice(&[1]);
let v1 = Int32Vector::from_slice(&[2, 3, 4]);
let mut selected = MutableBitmap::from_len_zeroed(v1.len());
v1.find_unique(&mut selected, Some(&prev));
// Although every element of v2 equals the last element of prev, the bits already set by
// the first call must be kept.
let v2 = Int32Vector::from_slice(&[1, 1, 1]);
v2.find_unique(&mut selected, Some(&prev));
check_bitmap(&[true, true, true], &selected);
}
// Builds a `MutableBitmap` whose bits equal `bits`.
fn new_bitmap(bits: &[bool]) -> MutableBitmap {
let mut bitmap = MutableBitmap::from_len_zeroed(bits.len());
for (i, bit) in bits.iter().enumerate() {
if *bit {
bitmap.set(i, true);
}
}
bitmap
}
#[test]
fn test_find_unique_scalar_with_prev() {
let prev = Int32Vector::from_slice(&[1]);
let mut selected = new_bitmap(&[true, false, true, false]);
let v = Int32Vector::from_slice(&[2, 3, 4, 5]);
v.find_unique(&mut selected, Some(&prev));
// All elements are different.
check_bitmap(&[true, true, true, true], &selected);
let mut selected = new_bitmap(&[true, false, true, false]);
let v = Int32Vector::from_slice(&[1, 2, 3, 4]);
v.find_unique(&mut selected, Some(&prev));
// The first element duplicates the last element of prev, but its flag was already set
// and is kept unchanged.
check_bitmap(&[true, true, true, true], &selected);
// Same case as above, but now `prev` is None.
let mut selected = new_bitmap(&[true, false, true, false]);
let v = Int32Vector::from_slice(&[1, 2, 3, 4]);
v.find_unique(&mut selected, None);
check_bitmap(&[true, true, true, true], &selected);
// Same case as above, but now `prev` is empty.
let mut selected = new_bitmap(&[true, false, true, false]);
let v = Int32Vector::from_slice(&[1, 2, 3, 4]);
v.find_unique(&mut selected, Some(&Int32Vector::from_slice(&[])));
check_bitmap(&[true, true, true, true], &selected);
let mut selected = new_bitmap(&[false, false, false, false]);
let v = Int32Vector::from_slice(&[2, 2, 4, 5]);
v.find_unique(&mut selected, Some(&prev));
// Only v[1] is a duplicate.
check_bitmap(&[true, false, true, true], &selected);
}
// Checks `find_unique` on a `NullVector` of `len` elements with a zeroed bitmap, first
// without a previous vector and then with a previous null vector of one element.
fn check_find_unique_null(len: usize) {
let input = NullVector::new(len);
let mut selected = MutableBitmap::from_len_zeroed(input.len());
input.find_unique(&mut selected, None);
// Without a previous vector only the first element (if any) is expected to be set.
let mut expect = vec![false; len];
if !expect.is_empty() {
expect[0] = true;
}
check_bitmap(&expect, &selected);
let mut selected = MutableBitmap::from_len_zeroed(input.len());
let prev = Some(NullVector::new(1));
input.find_unique(&mut selected, prev.as_ref().map(|v| v as _));
// With a non-empty previous null vector no bit is expected to be set.
let expect = vec![false; len];
check_bitmap(&expect, &selected);
}
#[test]
fn test_find_unique_null() {
for len in 0..5 {
check_find_unique_null(len);
}
}
#[test]
fn test_find_unique_null_with_prev() {
let prev = NullVector::new(1);
// Keep flags unchanged.
let mut selected = new_bitmap(&[true, false, true, false]);
let v = NullVector::new(4);
v.find_unique(&mut selected, Some(&prev));
check_bitmap(&[true, false, true, false], &selected);
// Keep flags unchanged.
let mut selected = new_bitmap(&[false, false, true, false]);
v.find_unique(&mut selected, Some(&prev));
check_bitmap(&[false, false, true, false], &selected);
// Prev is None, select first element.
let mut selected = new_bitmap(&[false, false, true, false]);
v.find_unique(&mut selected, None);
check_bitmap(&[true, false, true, false], &selected);
// Prev is empty, select first element.
let mut selected = new_bitmap(&[false, false, true, false]);
v.find_unique(&mut selected, Some(&NullVector::new(0)));
check_bitmap(&[true, false, true, false], &selected);
}
// Checks `find_unique` on a `ConstantVector` holding the value 8 repeated `len` times with
// a zeroed bitmap, first without a previous vector and then with a previous vector holding
// the same constant.
fn check_find_unique_constant(len: usize) {
let input = ConstantVector::new(Arc::new(Int32Vector::from_slice(&[8])), len);
let mut selected = MutableBitmap::from_len_zeroed(len);
input.find_unique(&mut selected, None);
// Without a previous vector only the first element (if any) is expected to be set.
let mut expect = vec![false; len];
if !expect.is_empty() {
expect[0] = true;
}
check_bitmap(&expect, &selected);
let mut selected = MutableBitmap::from_len_zeroed(len);
let prev = Some(ConstantVector::new(
Arc::new(Int32Vector::from_slice(&[8])),
1,
));
input.find_unique(&mut selected, prev.as_ref().map(|v| v as _));
// The previous vector holds the same constant, so no bit is expected to be set.
let expect = vec![false; len];
check_bitmap(&expect, &selected);
}
#[test]
fn test_find_unique_constant() {
for len in 0..5 {
check_find_unique_constant(len);
}
}
#[test]
fn test_find_unique_constant_with_prev() {
let prev = ConstantVector::new(Arc::new(Int32Vector::from_slice(&[1])), 1);
// Keep flags unchanged.
let mut selected = new_bitmap(&[true, false, true, false]);
let v = ConstantVector::new(Arc::new(Int32Vector::from_slice(&[1])), 4);
v.find_unique(&mut selected, Some(&prev));
check_bitmap(&[true, false, true, false], &selected);
// Keep flags unchanged.
let mut selected = new_bitmap(&[false, false, true, false]);
v.find_unique(&mut selected, Some(&prev));
check_bitmap(&[false, false, true, false], &selected);
// Prev is None, select first element.
let mut selected = new_bitmap(&[false, false, true, false]);
v.find_unique(&mut selected, None);
check_bitmap(&[true, false, true, false], &selected);
// Prev is empty, select first element.
let mut selected = new_bitmap(&[false, false, true, false]);
v.find_unique(
&mut selected,
Some(&ConstantVector::new(
Arc::new(Int32Vector::from_slice(&[1])),
0,
)),
);
check_bitmap(&[true, false, true, false], &selected);
// Different constant vector.
let mut selected = new_bitmap(&[false, false, true, false]);
let v = ConstantVector::new(Arc::new(Int32Vector::from_slice(&[2])), 4);
v.find_unique(&mut selected, Some(&prev));
check_bitmap(&[true, false, true, false], &selected);
}
#[test]
fn test_find_unique_string() {
let input = StringVector::from_slice(&["a", "a", "b", "c"]);
let mut selected = MutableBitmap::from_len_zeroed(4);
input.find_unique(&mut selected, None);
// The second "a" repeats the first one and is the only unset bit.
let expect = vec![true, false, true, true];
check_bitmap(&expect, &selected);
}
// Expands to a block that builds a `$VectorType` from the raw values [8, 8, 9, 10]
// (converted with `$ValueType::$method`), runs `find_unique` without a previous vector and
// expects only the repeated second element to stay unset.
macro_rules! impl_find_unique_date_like_test {
($VectorType: ident, $ValueType: ident, $method: ident) => {{
use common_time::$ValueType;
use $crate::vectors::$VectorType;
let v = $VectorType::from_iterator([8, 8, 9, 10].into_iter().map($ValueType::$method));
let mut selected = MutableBitmap::from_len_zeroed(4);
v.find_unique(&mut selected, None);
let expect = vec![true, false, true, true];
check_bitmap(&expect, &selected);
}};
}
#[test]
fn test_find_unique_date_like() {
impl_find_unique_date_like_test!(DateVector, Date, new);
impl_find_unique_date_like_test!(DateTimeVector, DateTime, new);
impl_find_unique_date_like_test!(TimestampVector, Timestamp, from_millis);
}
}

View File

@@ -98,20 +98,22 @@ pub trait BatchOp {
/// - `left` or `right` has insufficient column num.
fn compare_row(&self, left: &Batch, i: usize, right: &Batch, j: usize) -> Ordering;
/// Dedup rows in `batch` by row key.
/// Find unique rows in `batch` by row key.
///
/// If `prev` is `Some` and not empty, the last row of `prev` would be used to dedup
/// current `batch`. Set `i-th` bit of `selected` to `true` if we need to keep `i-th`
/// row. So the caller could use `selected` to build a [BooleanVector] to filter the
/// batch.
/// current `batch`. Set `i-th` bit of `selected` to `true` if `i-th` row is unique,
/// which means the row key of `i-th` row is different from `(i-1)-th`'s.
///
/// The caller must ensure `selected` is initialized by filling `batch.num_rows()` bits
/// The caller could use `selected` to build a [BooleanVector] to filter the
/// batch, and must ensure `selected` is initialized by filling `batch.num_rows()` bits
/// to zero.
///
/// # Panics
/// Panics if `batch` and `prev` have different number of columns (unless `prev` is
/// Panics if
/// - `batch` and `prev` have different number of columns (unless `prev` is
/// empty).
fn dedup(&self, batch: &Batch, selected: &mut MutableBitmap, prev: Option<&Batch>);
/// - `selected.len()` is less than the number of rows.
fn find_unique(&self, batch: &Batch, selected: &mut MutableBitmap, prev: Option<&Batch>);
/// Filters the `batch`, returns elements matching the `filter` (i.e. where the values
/// are true).

View File

@@ -39,7 +39,7 @@ impl<R> DedupReader<R> {
// but we couldn't zero all bits in the mutable array easily.
let mut selected = MutableBitmap::from_len_zeroed(batch.num_rows());
self.schema
.dedup(&batch, &mut selected, self.prev_batch.as_ref());
.find_unique(&batch, &mut selected, self.prev_batch.as_ref());
// Store current batch to `prev_batch` so we could compare the next batch
// with this batch. We store batch before filtering it mainly for correctness, as

View File

@@ -289,7 +289,7 @@ impl BatchOp for ProjectedSchema {
})
}
fn dedup(&self, batch: &Batch, selected: &mut MutableBitmap, prev: Option<&Batch>) {
fn find_unique(&self, batch: &Batch, selected: &mut MutableBitmap, prev: Option<&Batch>) {
if let Some(prev) = prev {
assert_eq!(batch.num_columns(), prev.num_columns());
}
@@ -299,7 +299,7 @@ impl BatchOp for ProjectedSchema {
batch.column(idx),
prev.map(|prev| prev.column(idx).as_ref()),
);
current.dedup(selected, prev_col);
current.find_unique(selected, prev_col);
}
}
@@ -485,18 +485,19 @@ mod tests {
}
#[test]
fn test_dedup_batch() {
fn test_batch_find_unique() {
let schema = read_util::new_projected_schema();
let batch = read_util::new_kv_batch(&[(1000, Some(1)), (2000, Some(2)), (2000, Some(2))]);
let mut selected = MutableBitmap::from_len_zeroed(3);
schema.dedup(&batch, &mut selected, None);
let mut selected = MutableBitmap::from_len_zeroed(3);
schema.find_unique(&batch, &mut selected, None);
assert!(selected.get(0));
assert!(selected.get(1));
assert!(!selected.get(2));
let mut selected = MutableBitmap::from_len_zeroed(3);
let prev = read_util::new_kv_batch(&[(1000, Some(1))]);
schema.dedup(&batch, &mut selected, Some(&prev));
schema.find_unique(&batch, &mut selected, Some(&prev));
assert!(!selected.get(0));
assert!(selected.get(1));
assert!(!selected.get(2));