mirror of
https://github.com/GreptimeTeam/greptimedb.git
synced 2026-01-03 20:02:54 +00:00
fix: Fix filtering out rows incorrectly during dedup phase (#484)
* fix: dedup should not mark elements as unneeded. It should only mark elements as selected, because a column may hold the same value in different rows. * refactor: Rename dedup to find_unique. The original `dedup` method only sets a bit to true when it finds that the element is unique, so `find_unique` is a more appropriate name. * test: Renew bitmap in test_batch_find_unique * chore: Update comments
This commit is contained in:
@@ -1,5 +1,5 @@
|
||||
mod dedup;
|
||||
mod filter;
|
||||
mod find_unique;
|
||||
mod replicate;
|
||||
|
||||
use arrow::bitmap::MutableBitmap;
|
||||
@@ -19,23 +19,22 @@ pub trait VectorOp {
|
||||
/// Panics if `offsets.len() != self.len()`.
|
||||
fn replicate(&self, offsets: &[usize]) -> VectorRef;
|
||||
|
||||
/// Dedup elements in `self` and mark `i-th` bit of `selected` to `true` if the `i-th` element
|
||||
/// of `self` is retained.
|
||||
/// Mark `i-th` bit of `selected` to `true` if the `i-th` element of `self` is unique, which
|
||||
/// means the element immediately before it does not have the same value as it.
|
||||
///
|
||||
/// The caller should ensure
|
||||
/// 1. the `selected` bitmap is initialized by setting `[0, vector.len())`
|
||||
/// bits to false.
|
||||
/// 1. the length of `selected` bitmap is equal to `vector.len()`.
|
||||
/// 2. `vector` and `prev_vector` are sorted.
|
||||
///
|
||||
/// If there are multiple duplicate elements, this function retains the **first** element.
|
||||
/// If the first element of `self` is equal to the last element of `prev_vector`, then that
|
||||
/// first element is also considered as duplicated and won't be retained.
|
||||
/// The first element is considered as unique if the first element of `self` is different
|
||||
/// from its previous element, that is the last element of `prev_vector`.
|
||||
///
|
||||
/// # Panics
|
||||
/// Panics if
|
||||
/// - `selected.len() < self.len()`.
|
||||
/// - `prev_vector` and `self` have different data types.
|
||||
fn dedup(&self, selected: &mut MutableBitmap, prev_vector: Option<&dyn Vector>);
|
||||
fn find_unique(&self, selected: &mut MutableBitmap, prev_vector: Option<&dyn Vector>);
|
||||
|
||||
/// Filters the vector, returns elements matching the `filter` (i.e. where the values are true).
|
||||
///
|
||||
@@ -50,9 +49,9 @@ macro_rules! impl_scalar_vector_op {
|
||||
replicate::$replicate(self, offsets)
|
||||
}
|
||||
|
||||
fn dedup(&self, selected: &mut MutableBitmap, prev_vector: Option<&dyn Vector>) {
|
||||
fn find_unique(&self, selected: &mut MutableBitmap, prev_vector: Option<&dyn Vector>) {
|
||||
let prev_vector = prev_vector.map(|pv| pv.as_any().downcast_ref::<$VectorType>().unwrap());
|
||||
dedup::dedup_scalar(self, selected, prev_vector);
|
||||
find_unique::find_unique_scalar(self, selected, prev_vector);
|
||||
}
|
||||
|
||||
fn filter(&self, filter: &BooleanVector) -> Result<VectorRef> {
|
||||
@@ -77,9 +76,9 @@ impl VectorOp for ConstantVector {
|
||||
replicate::replicate_constant(self, offsets)
|
||||
}
|
||||
|
||||
fn dedup(&self, selected: &mut MutableBitmap, prev_vector: Option<&dyn Vector>) {
|
||||
fn find_unique(&self, selected: &mut MutableBitmap, prev_vector: Option<&dyn Vector>) {
|
||||
let prev_vector = prev_vector.and_then(|pv| pv.as_any().downcast_ref::<ConstantVector>());
|
||||
dedup::dedup_constant(self, selected, prev_vector);
|
||||
find_unique::find_unique_constant(self, selected, prev_vector);
|
||||
}
|
||||
|
||||
fn filter(&self, filter: &BooleanVector) -> Result<VectorRef> {
|
||||
@@ -92,9 +91,9 @@ impl VectorOp for NullVector {
|
||||
replicate::replicate_null(self, offsets)
|
||||
}
|
||||
|
||||
fn dedup(&self, selected: &mut MutableBitmap, prev_vector: Option<&dyn Vector>) {
|
||||
fn find_unique(&self, selected: &mut MutableBitmap, prev_vector: Option<&dyn Vector>) {
|
||||
let prev_vector = prev_vector.and_then(|pv| pv.as_any().downcast_ref::<NullVector>());
|
||||
dedup::dedup_null(self, selected, prev_vector);
|
||||
find_unique::find_unique_null(self, selected, prev_vector);
|
||||
}
|
||||
|
||||
fn filter(&self, filter: &BooleanVector) -> Result<VectorRef> {
|
||||
@@ -110,10 +109,10 @@ where
|
||||
replicate::replicate_primitive(self, offsets)
|
||||
}
|
||||
|
||||
fn dedup(&self, selected: &mut MutableBitmap, prev_vector: Option<&dyn Vector>) {
|
||||
fn find_unique(&self, selected: &mut MutableBitmap, prev_vector: Option<&dyn Vector>) {
|
||||
let prev_vector =
|
||||
prev_vector.and_then(|pv| pv.as_any().downcast_ref::<PrimitiveVector<T>>());
|
||||
dedup::dedup_scalar(self, selected, prev_vector);
|
||||
find_unique::find_unique_scalar(self, selected, prev_vector);
|
||||
}
|
||||
|
||||
fn filter(&self, filter: &BooleanVector) -> Result<VectorRef> {
|
||||
|
||||
@@ -1,223 +0,0 @@
|
||||
use arrow::bitmap::MutableBitmap;
|
||||
|
||||
use crate::scalars::ScalarVector;
|
||||
use crate::vectors::{ConstantVector, NullVector, Vector};
|
||||
|
||||
pub(crate) fn dedup_scalar<'a, T: ScalarVector>(
|
||||
vector: &'a T,
|
||||
selected: &'a mut MutableBitmap,
|
||||
prev_vector: Option<&'a T>,
|
||||
) where
|
||||
T::RefItem<'a>: PartialEq,
|
||||
{
|
||||
assert!(selected.len() >= vector.len());
|
||||
|
||||
if vector.is_empty() {
|
||||
return;
|
||||
}
|
||||
|
||||
for ((i, current), next) in vector
|
||||
.iter_data()
|
||||
.enumerate()
|
||||
.zip(vector.iter_data().skip(1))
|
||||
{
|
||||
if current != next {
|
||||
// If next element is a different element, we mark it as selected.
|
||||
selected.set(i + 1, true);
|
||||
}
|
||||
}
|
||||
|
||||
// Always retain the first element.
|
||||
selected.set(0, true);
|
||||
|
||||
// Then check whether still keep the first element based last element in previous vector.
|
||||
if let Some(pv) = &prev_vector {
|
||||
if !pv.is_empty() {
|
||||
let last = pv.get_data(pv.len() - 1);
|
||||
if last == vector.get_data(0) {
|
||||
selected.set(0, false);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn dedup_null(
|
||||
vector: &NullVector,
|
||||
selected: &mut MutableBitmap,
|
||||
prev_vector: Option<&NullVector>,
|
||||
) {
|
||||
if vector.is_empty() {
|
||||
return;
|
||||
}
|
||||
|
||||
let no_prev_element = prev_vector.map(|v| v.is_empty()).unwrap_or(true);
|
||||
if no_prev_element {
|
||||
// Retain first element if no previous element (we known that it must
|
||||
// be null).
|
||||
selected.set(0, true);
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn dedup_constant(
|
||||
vector: &ConstantVector,
|
||||
selected: &mut MutableBitmap,
|
||||
prev_vector: Option<&ConstantVector>,
|
||||
) {
|
||||
if vector.is_empty() {
|
||||
return;
|
||||
}
|
||||
|
||||
let equal_to_prev = if let Some(prev) = prev_vector {
|
||||
!prev.is_empty() && vector.get_constant_ref() == prev.get_constant_ref()
|
||||
} else {
|
||||
false
|
||||
};
|
||||
|
||||
if !equal_to_prev {
|
||||
selected.set(0, true);
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use std::sync::Arc;

    use super::*;
    use crate::vectors::{Int32Vector, StringVector, VectorOp};

    /// Asserts that `selected` has the same length and the same bits as `expect`.
    fn check_bitmap(expect: &[bool], selected: &MutableBitmap) {
        assert_eq!(expect.len(), selected.len());
        for (pos, want) in expect.iter().enumerate() {
            assert_eq!(*want, selected.get(pos));
        }
    }

    /// Runs `dedup` on non-null `input` values and checks the resulting bitmap.
    fn check_dedup_scalar(expect: &[bool], input: &[i32], prev: Option<&[i32]>) {
        let values = input.iter().copied().map(Some);
        check_dedup_scalar_opt(expect, values, prev);
    }

    /// Runs `dedup` on possibly-null `input` values, starting from a zeroed
    /// bitmap, and checks the resulting bitmap.
    fn check_dedup_scalar_opt(
        expect: &[bool],
        input: impl Iterator<Item = Option<i32>>,
        prev: Option<&[i32]>,
    ) {
        let vector = Int32Vector::from_iter(input);
        let prev_vector = prev.map(Int32Vector::from_slice);

        let mut selected = MutableBitmap::from_len_zeroed(vector.len());
        vector.dedup(&mut selected, prev_vector.as_ref().map(|pv| pv as _));

        check_bitmap(expect, &selected);
    }

    #[test]
    fn test_dedup_scalar() {
        // Without a previous vector.
        check_dedup_scalar(&[], &[], None);
        check_dedup_scalar(&[true], &[1], None);
        check_dedup_scalar(&[true, false], &[1, 1], None);
        check_dedup_scalar(&[true, true], &[1, 2], None);
        check_dedup_scalar(&[true, true, true, true], &[1, 2, 3, 4], None);
        check_dedup_scalar(&[true, false, true, false], &[1, 1, 3, 3], None);
        check_dedup_scalar(&[true, false, false, false, true], &[2, 2, 2, 2, 3], None);

        // With a previous vector.
        check_dedup_scalar(&[true], &[5], Some(&[]));
        check_dedup_scalar(&[true], &[5], Some(&[3]));
        check_dedup_scalar(&[false], &[5], Some(&[5]));
        check_dedup_scalar(&[false], &[5], Some(&[4, 5]));
        check_dedup_scalar(&[false, true], &[5, 6], Some(&[4, 5]));
        check_dedup_scalar(&[false, true, false], &[5, 6, 6], Some(&[4, 5]));
        check_dedup_scalar(
            &[false, true, false, true, true],
            &[5, 6, 6, 7, 8],
            Some(&[4, 5]),
        );

        // Input containing nulls.
        check_dedup_scalar_opt(
            &[true, true, false, true, false],
            [Some(1), Some(2), Some(2), None, None].into_iter(),
            None,
        );
    }

    /// Checks `dedup` on a null vector of `len` elements, first without and
    /// then with a one-element previous null vector.
    fn check_dedup_null(len: usize) {
        let input = NullVector::new(len);

        // No previous vector: only the first element is selected.
        let mut selected = MutableBitmap::from_len_zeroed(input.len());
        input.dedup(&mut selected, None);
        let first_only: Vec<bool> = (0..len).map(|pos| pos == 0).collect();
        check_bitmap(&first_only, &selected);

        // Previous vector holds a null: nothing is selected.
        let mut selected = MutableBitmap::from_len_zeroed(input.len());
        let prev = Some(NullVector::new(1));
        input.dedup(&mut selected, prev.as_ref().map(|pv| pv as _));
        let none_selected = vec![false; len];
        check_bitmap(&none_selected, &selected);
    }

    #[test]
    fn test_dedup_null() {
        (0..5).for_each(check_dedup_null);
    }

    /// Checks `dedup` on a constant vector of `len` elements, first without
    /// and then with a one-element previous vector holding the same constant.
    fn check_dedup_constant(len: usize) {
        let input = ConstantVector::new(Arc::new(Int32Vector::from_slice(&[8])), len);

        // No previous vector: only the first element is selected.
        let mut selected = MutableBitmap::from_len_zeroed(len);
        input.dedup(&mut selected, None);
        let first_only: Vec<bool> = (0..len).map(|pos| pos == 0).collect();
        check_bitmap(&first_only, &selected);

        // Previous vector holds the same constant: nothing is selected.
        let mut selected = MutableBitmap::from_len_zeroed(len);
        let prev = Some(ConstantVector::new(
            Arc::new(Int32Vector::from_slice(&[8])),
            1,
        ));
        input.dedup(&mut selected, prev.as_ref().map(|pv| pv as _));
        let none_selected = vec![false; len];
        check_bitmap(&none_selected, &selected);
    }

    #[test]
    fn test_dedup_constant() {
        (0..5).for_each(check_dedup_constant);
    }

    #[test]
    fn test_dedup_string() {
        let input = StringVector::from_slice(&["a", "a", "b", "c"]);
        let mut selected = MutableBitmap::from_len_zeroed(4);

        input.dedup(&mut selected, None);

        check_bitmap(&[true, false, true, true], &selected);
    }

    // Builds a date-like vector from `[8, 8, 9, 10]` and checks that `dedup`
    // selects every element except the repeated `8`.
    macro_rules! impl_dedup_date_like_test {
        ($VectorType: ident, $ValueType: ident, $method: ident) => {{
            use common_time::$ValueType;
            use $crate::vectors::$VectorType;

            let vector =
                $VectorType::from_iterator([8, 8, 9, 10].into_iter().map($ValueType::$method));
            let mut selected = MutableBitmap::from_len_zeroed(4);
            vector.dedup(&mut selected, None);

            check_bitmap(&[true, false, true, true], &selected);
        }};
    }

    #[test]
    fn test_dedup_date_like() {
        impl_dedup_date_like_test!(DateVector, Date, new);
        impl_dedup_date_like_test!(DateTimeVector, DateTime, new);
        impl_dedup_date_like_test!(TimestampVector, Timestamp, from_millis);
    }
}
|
||||
354
src/datatypes/src/vectors/operations/find_unique.rs
Normal file
354
src/datatypes/src/vectors/operations/find_unique.rs
Normal file
@@ -0,0 +1,354 @@
|
||||
use arrow::bitmap::MutableBitmap;
|
||||
|
||||
use crate::scalars::ScalarVector;
|
||||
use crate::vectors::{ConstantVector, NullVector, Vector};
|
||||
|
||||
// To implement `find_unique()` correctly, we need to keep in mind that always marks an element as
|
||||
// selected when it is different from the previous one, and leaves the `selected` unchanged
|
||||
// in any other case.
|
||||
pub(crate) fn find_unique_scalar<'a, T: ScalarVector>(
|
||||
vector: &'a T,
|
||||
selected: &'a mut MutableBitmap,
|
||||
prev_vector: Option<&'a T>,
|
||||
) where
|
||||
T::RefItem<'a>: PartialEq,
|
||||
{
|
||||
assert!(selected.len() >= vector.len());
|
||||
|
||||
if vector.is_empty() {
|
||||
return;
|
||||
}
|
||||
|
||||
for ((i, current), next) in vector
|
||||
.iter_data()
|
||||
.enumerate()
|
||||
.zip(vector.iter_data().skip(1))
|
||||
{
|
||||
if current != next {
|
||||
// If next element is a different element, we mark it as selected.
|
||||
selected.set(i + 1, true);
|
||||
}
|
||||
}
|
||||
|
||||
// Marks first element as selcted if it is different from previous element, otherwise
|
||||
// keep selected bitmap unchanged.
|
||||
let is_first_not_duplicate = prev_vector
|
||||
.map(|pv| {
|
||||
if pv.is_empty() {
|
||||
true
|
||||
} else {
|
||||
let last = pv.get_data(pv.len() - 1);
|
||||
last != vector.get_data(0)
|
||||
}
|
||||
})
|
||||
.unwrap_or(true);
|
||||
if is_first_not_duplicate {
|
||||
selected.set(0, true);
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn find_unique_null(
|
||||
vector: &NullVector,
|
||||
selected: &mut MutableBitmap,
|
||||
prev_vector: Option<&NullVector>,
|
||||
) {
|
||||
if vector.is_empty() {
|
||||
return;
|
||||
}
|
||||
|
||||
let is_first_not_duplicate = prev_vector.map(|pv| pv.is_empty()).unwrap_or(true);
|
||||
if is_first_not_duplicate {
|
||||
selected.set(0, true);
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn find_unique_constant(
|
||||
vector: &ConstantVector,
|
||||
selected: &mut MutableBitmap,
|
||||
prev_vector: Option<&ConstantVector>,
|
||||
) {
|
||||
if vector.is_empty() {
|
||||
return;
|
||||
}
|
||||
|
||||
let is_first_not_duplicate = prev_vector
|
||||
.map(|pv| {
|
||||
if pv.is_empty() {
|
||||
true
|
||||
} else {
|
||||
vector.get_constant_ref() != pv.get_constant_ref()
|
||||
}
|
||||
})
|
||||
.unwrap_or(true);
|
||||
|
||||
if is_first_not_duplicate {
|
||||
selected.set(0, true);
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use std::sync::Arc;

    use super::*;
    use crate::vectors::{Int32Vector, StringVector, VectorOp};

    /// Asserts that the bits of `selected` are exactly `expect`.
    fn check_bitmap(expect: &[bool], selected: &MutableBitmap) {
        let actual: Vec<bool> = selected.iter().collect();
        assert_eq!(expect, actual);
    }

    /// Runs `find_unique` on non-null `input` values and checks the bitmap.
    fn check_find_unique_scalar(expect: &[bool], input: &[i32], prev: Option<&[i32]>) {
        let values = input.iter().copied().map(Some);
        check_find_unique_scalar_opt(expect, values, prev);
    }

    /// Runs `find_unique` on possibly-null `input` values, starting from a
    /// zeroed bitmap, and checks the resulting bitmap.
    fn check_find_unique_scalar_opt(
        expect: &[bool],
        input: impl Iterator<Item = Option<i32>>,
        prev: Option<&[i32]>,
    ) {
        let vector = Int32Vector::from_iter(input);
        let prev_vector = prev.map(Int32Vector::from_slice);

        let mut selected = MutableBitmap::from_len_zeroed(vector.len());
        vector.find_unique(&mut selected, prev_vector.as_ref().map(|pv| pv as _));

        check_bitmap(expect, &selected);
    }

    #[test]
    fn test_find_unique_scalar() {
        // Without a previous vector.
        check_find_unique_scalar(&[], &[], None);
        check_find_unique_scalar(&[true], &[1], None);
        check_find_unique_scalar(&[true, false], &[1, 1], None);
        check_find_unique_scalar(&[true, true], &[1, 2], None);
        check_find_unique_scalar(&[true, true, true, true], &[1, 2, 3, 4], None);
        check_find_unique_scalar(&[true, false, true, false], &[1, 1, 3, 3], None);
        check_find_unique_scalar(&[true, false, false, false, true], &[2, 2, 2, 2, 3], None);

        // With a previous vector.
        check_find_unique_scalar(&[true], &[5], Some(&[]));
        check_find_unique_scalar(&[true], &[5], Some(&[3]));
        check_find_unique_scalar(&[false], &[5], Some(&[5]));
        check_find_unique_scalar(&[false], &[5], Some(&[4, 5]));
        check_find_unique_scalar(&[false, true], &[5, 6], Some(&[4, 5]));
        check_find_unique_scalar(&[false, true, false], &[5, 6, 6], Some(&[4, 5]));
        check_find_unique_scalar(
            &[false, true, false, true, true],
            &[5, 6, 6, 7, 8],
            Some(&[4, 5]),
        );

        // Input containing nulls.
        check_find_unique_scalar_opt(
            &[true, true, false, true, false],
            [Some(1), Some(2), Some(2), None, None].into_iter(),
            None,
        );
    }

    #[test]
    fn test_find_unique_scalar_multi_times_with_prev() {
        let prev = Int32Vector::from_slice(&[1]);

        let first = Int32Vector::from_slice(&[2, 3, 4]);
        let mut selected = MutableBitmap::from_len_zeroed(first.len());
        first.find_unique(&mut selected, Some(&prev));

        // The elements of the second vector all equal `prev`, but the bits set
        // by the first call must be kept.
        let second = Int32Vector::from_slice(&[1, 1, 1]);
        second.find_unique(&mut selected, Some(&prev));

        check_bitmap(&[true, true, true], &selected);
    }

    /// Builds a bitmap whose bits are exactly `bits`.
    fn new_bitmap(bits: &[bool]) -> MutableBitmap {
        let mut bitmap = MutableBitmap::from_len_zeroed(bits.len());
        bits.iter()
            .enumerate()
            .filter(|(_, bit)| **bit)
            .for_each(|(pos, _)| bitmap.set(pos, true));

        bitmap
    }

    #[test]
    fn test_find_unique_scalar_with_prev() {
        let prev = Int32Vector::from_slice(&[1]);

        // Every element differs from its predecessor.
        let mut selected = new_bitmap(&[true, false, true, false]);
        let vector = Int32Vector::from_slice(&[2, 3, 4, 5]);
        vector.find_unique(&mut selected, Some(&prev));
        check_bitmap(&[true, true, true, true], &selected);

        // The first element duplicates `prev`, yet its bit stays as it was.
        let mut selected = new_bitmap(&[true, false, true, false]);
        let vector = Int32Vector::from_slice(&[1, 2, 3, 4]);
        vector.find_unique(&mut selected, Some(&prev));
        check_bitmap(&[true, true, true, true], &selected);

        // Same input as above, but `prev` is None.
        let mut selected = new_bitmap(&[true, false, true, false]);
        let vector = Int32Vector::from_slice(&[1, 2, 3, 4]);
        vector.find_unique(&mut selected, None);
        check_bitmap(&[true, true, true, true], &selected);

        // Same input as above, but `prev` is empty.
        let mut selected = new_bitmap(&[true, false, true, false]);
        let vector = Int32Vector::from_slice(&[1, 2, 3, 4]);
        vector.find_unique(&mut selected, Some(&Int32Vector::from_slice(&[])));
        check_bitmap(&[true, true, true, true], &selected);

        // Only v[1] is a duplicate.
        let mut selected = new_bitmap(&[false, false, false, false]);
        let vector = Int32Vector::from_slice(&[2, 2, 4, 5]);
        vector.find_unique(&mut selected, Some(&prev));
        check_bitmap(&[true, false, true, true], &selected);
    }

    /// Checks `find_unique` on a null vector of `len` elements, first without
    /// and then with a one-element previous null vector.
    fn check_find_unique_null(len: usize) {
        let input = NullVector::new(len);

        // No previous vector: only the first element is selected.
        let mut selected = MutableBitmap::from_len_zeroed(input.len());
        input.find_unique(&mut selected, None);
        let first_only: Vec<bool> = (0..len).map(|pos| pos == 0).collect();
        check_bitmap(&first_only, &selected);

        // Previous vector holds a null: nothing is selected.
        let mut selected = MutableBitmap::from_len_zeroed(input.len());
        let prev = Some(NullVector::new(1));
        input.find_unique(&mut selected, prev.as_ref().map(|pv| pv as _));
        let none_selected = vec![false; len];
        check_bitmap(&none_selected, &selected);
    }

    #[test]
    fn test_find_unique_null() {
        (0..5).for_each(check_find_unique_null);
    }

    #[test]
    fn test_find_unique_null_with_prev() {
        let prev = NullVector::new(1);
        let vector = NullVector::new(4);

        // Flags stay unchanged.
        let mut selected = new_bitmap(&[true, false, true, false]);
        vector.find_unique(&mut selected, Some(&prev));
        check_bitmap(&[true, false, true, false], &selected);

        // Flags stay unchanged.
        let mut selected = new_bitmap(&[false, false, true, false]);
        vector.find_unique(&mut selected, Some(&prev));
        check_bitmap(&[false, false, true, false], &selected);

        // Prev is None: the first element gets selected.
        let mut selected = new_bitmap(&[false, false, true, false]);
        vector.find_unique(&mut selected, None);
        check_bitmap(&[true, false, true, false], &selected);

        // Prev is empty: the first element gets selected.
        let mut selected = new_bitmap(&[false, false, true, false]);
        vector.find_unique(&mut selected, Some(&NullVector::new(0)));
        check_bitmap(&[true, false, true, false], &selected);
    }

    /// Checks `find_unique` on a constant vector of `len` elements, first
    /// without and then with a one-element previous vector holding the same
    /// constant.
    fn check_find_unique_constant(len: usize) {
        let input = ConstantVector::new(Arc::new(Int32Vector::from_slice(&[8])), len);

        // No previous vector: only the first element is selected.
        let mut selected = MutableBitmap::from_len_zeroed(len);
        input.find_unique(&mut selected, None);
        let first_only: Vec<bool> = (0..len).map(|pos| pos == 0).collect();
        check_bitmap(&first_only, &selected);

        // Previous vector holds the same constant: nothing is selected.
        let mut selected = MutableBitmap::from_len_zeroed(len);
        let prev = Some(ConstantVector::new(
            Arc::new(Int32Vector::from_slice(&[8])),
            1,
        ));
        input.find_unique(&mut selected, prev.as_ref().map(|pv| pv as _));
        let none_selected = vec![false; len];
        check_bitmap(&none_selected, &selected);
    }

    #[test]
    fn test_find_unique_constant() {
        (0..5).for_each(check_find_unique_constant);
    }

    #[test]
    fn test_find_unique_constant_with_prev() {
        let prev = ConstantVector::new(Arc::new(Int32Vector::from_slice(&[1])), 1);
        let vector = ConstantVector::new(Arc::new(Int32Vector::from_slice(&[1])), 4);

        // Flags stay unchanged.
        let mut selected = new_bitmap(&[true, false, true, false]);
        vector.find_unique(&mut selected, Some(&prev));
        check_bitmap(&[true, false, true, false], &selected);

        // Flags stay unchanged.
        let mut selected = new_bitmap(&[false, false, true, false]);
        vector.find_unique(&mut selected, Some(&prev));
        check_bitmap(&[false, false, true, false], &selected);

        // Prev is None: the first element gets selected.
        let mut selected = new_bitmap(&[false, false, true, false]);
        vector.find_unique(&mut selected, None);
        check_bitmap(&[true, false, true, false], &selected);

        // Prev is empty: the first element gets selected.
        let mut selected = new_bitmap(&[false, false, true, false]);
        let empty_prev = ConstantVector::new(Arc::new(Int32Vector::from_slice(&[1])), 0);
        vector.find_unique(&mut selected, Some(&empty_prev));
        check_bitmap(&[true, false, true, false], &selected);

        // A constant vector holding a different value.
        let mut selected = new_bitmap(&[false, false, true, false]);
        let vector = ConstantVector::new(Arc::new(Int32Vector::from_slice(&[2])), 4);
        vector.find_unique(&mut selected, Some(&prev));
        check_bitmap(&[true, false, true, false], &selected);
    }

    #[test]
    fn test_find_unique_string() {
        let input = StringVector::from_slice(&["a", "a", "b", "c"]);
        let mut selected = MutableBitmap::from_len_zeroed(4);

        input.find_unique(&mut selected, None);

        check_bitmap(&[true, false, true, true], &selected);
    }

    // Builds a date-like vector from `[8, 8, 9, 10]` and checks that
    // `find_unique` selects every element except the repeated `8`.
    macro_rules! impl_find_unique_date_like_test {
        ($VectorType: ident, $ValueType: ident, $method: ident) => {{
            use common_time::$ValueType;
            use $crate::vectors::$VectorType;

            let vector =
                $VectorType::from_iterator([8, 8, 9, 10].into_iter().map($ValueType::$method));
            let mut selected = MutableBitmap::from_len_zeroed(4);
            vector.find_unique(&mut selected, None);

            check_bitmap(&[true, false, true, true], &selected);
        }};
    }

    #[test]
    fn test_find_unique_date_like() {
        impl_find_unique_date_like_test!(DateVector, Date, new);
        impl_find_unique_date_like_test!(DateTimeVector, DateTime, new);
        impl_find_unique_date_like_test!(TimestampVector, Timestamp, from_millis);
    }
}
|
||||
@@ -98,20 +98,22 @@ pub trait BatchOp {
|
||||
/// - `left` or `right` has insufficient column num.
|
||||
fn compare_row(&self, left: &Batch, i: usize, right: &Batch, j: usize) -> Ordering;
|
||||
|
||||
/// Dedup rows in `batch` by row key.
|
||||
/// Find unique rows in `batch` by row key.
|
||||
///
|
||||
/// If `prev` is `Some` and not empty, the last row of `prev` would be used to dedup
|
||||
/// current `batch`. Set `i-th` bit of `selected` to `true` if we need to keep `i-th`
|
||||
/// row. So the caller could use `selected` to build a [BooleanVector] to filter the
|
||||
/// batch.
|
||||
/// current `batch`. Set `i-th` bit of `selected` to `true` if `i-th` row is unique,
|
||||
/// which means the row key of `i-th` row is different from that of the `(i-1)-th` row.
|
||||
///
|
||||
/// The caller must ensure `selected` is initialized by filling `batch.num_rows()` bits
|
||||
/// The caller could use `selected` to build a [BooleanVector] to filter the
|
||||
/// batch, and must ensure `selected` is initialized by filling `batch.num_rows()` bits
|
||||
/// to zero.
|
||||
///
|
||||
/// # Panics
|
||||
/// Panics if `batch` and `prev` have different number of columns (unless `prev` is
|
||||
/// Panics if
|
||||
/// - `batch` and `prev` have different number of columns (unless `prev` is
|
||||
/// empty).
|
||||
fn dedup(&self, batch: &Batch, selected: &mut MutableBitmap, prev: Option<&Batch>);
|
||||
/// - `selected.len()` is less than the number of rows.
|
||||
fn find_unique(&self, batch: &Batch, selected: &mut MutableBitmap, prev: Option<&Batch>);
|
||||
|
||||
/// Filters the `batch`, returns elements matching the `filter` (i.e. where the values
|
||||
/// are true).
|
||||
|
||||
@@ -39,7 +39,7 @@ impl<R> DedupReader<R> {
|
||||
// but we couldn't zero all bits in the mutable array easily.
|
||||
let mut selected = MutableBitmap::from_len_zeroed(batch.num_rows());
|
||||
self.schema
|
||||
.dedup(&batch, &mut selected, self.prev_batch.as_ref());
|
||||
.find_unique(&batch, &mut selected, self.prev_batch.as_ref());
|
||||
|
||||
// Store current batch to `prev_batch` so we could compare the next batch
|
||||
// with this batch. We store batch before filtering it mainly for correctness, as
|
||||
|
||||
@@ -289,7 +289,7 @@ impl BatchOp for ProjectedSchema {
|
||||
})
|
||||
}
|
||||
|
||||
fn dedup(&self, batch: &Batch, selected: &mut MutableBitmap, prev: Option<&Batch>) {
|
||||
fn find_unique(&self, batch: &Batch, selected: &mut MutableBitmap, prev: Option<&Batch>) {
|
||||
if let Some(prev) = prev {
|
||||
assert_eq!(batch.num_columns(), prev.num_columns());
|
||||
}
|
||||
@@ -299,7 +299,7 @@ impl BatchOp for ProjectedSchema {
|
||||
batch.column(idx),
|
||||
prev.map(|prev| prev.column(idx).as_ref()),
|
||||
);
|
||||
current.dedup(selected, prev_col);
|
||||
current.find_unique(selected, prev_col);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -485,18 +485,19 @@ mod tests {
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_dedup_batch() {
|
||||
fn test_batch_find_unique() {
|
||||
let schema = read_util::new_projected_schema();
|
||||
let batch = read_util::new_kv_batch(&[(1000, Some(1)), (2000, Some(2)), (2000, Some(2))]);
|
||||
let mut selected = MutableBitmap::from_len_zeroed(3);
|
||||
|
||||
schema.dedup(&batch, &mut selected, None);
|
||||
let mut selected = MutableBitmap::from_len_zeroed(3);
|
||||
schema.find_unique(&batch, &mut selected, None);
|
||||
assert!(selected.get(0));
|
||||
assert!(selected.get(1));
|
||||
assert!(!selected.get(2));
|
||||
|
||||
let mut selected = MutableBitmap::from_len_zeroed(3);
|
||||
let prev = read_util::new_kv_batch(&[(1000, Some(1))]);
|
||||
schema.dedup(&batch, &mut selected, Some(&prev));
|
||||
schema.find_unique(&batch, &mut selected, Some(&prev));
|
||||
assert!(!selected.get(0));
|
||||
assert!(selected.get(1));
|
||||
assert!(!selected.get(2));
|
||||
|
||||
Reference in New Issue
Block a user