mirror of
https://github.com/quickwit-oss/tantivy.git
synced 2025-12-28 04:52:55 +00:00
Compare commits
1 Commits
clippy-and
...
use_column
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
375d1f9dac |
@@ -22,6 +22,9 @@ pub struct Column<T> {
|
||||
}
|
||||
|
||||
impl<T: PartialOrd> Column<T> {
|
||||
pub fn get_cardinality(&self) -> Cardinality {
|
||||
self.idx.get_cardinality()
|
||||
}
|
||||
pub fn num_rows(&self) -> RowId {
|
||||
match &self.idx {
|
||||
ColumnIndex::Full => self.values.num_vals() as u32,
|
||||
|
||||
@@ -1,9 +1,10 @@
|
||||
use std::collections::HashMap;
|
||||
use std::io;
|
||||
|
||||
use super::writer::ColumnarSerializer;
|
||||
use crate::columnar::ColumnarReader;
|
||||
use crate::dynamic_column::DynamicColumn;
|
||||
use crate::ColumnType;
|
||||
use crate::{Cardinality, ColumnType};
|
||||
|
||||
pub enum MergeDocOrder {
|
||||
/// Columnar tables are simply stacked one above the other.
|
||||
@@ -19,20 +20,30 @@ pub enum MergeDocOrder {
|
||||
}
|
||||
|
||||
pub fn merge_columnar(
|
||||
_columnar_readers: &[ColumnarReader],
|
||||
columnar_readers: &[ColumnarReader],
|
||||
mapping: MergeDocOrder,
|
||||
_output: &mut impl io::Write,
|
||||
output: &mut impl io::Write,
|
||||
) -> io::Result<()> {
|
||||
match mapping {
|
||||
MergeDocOrder::Stack => {
|
||||
// implement me :)
|
||||
todo!();
|
||||
}
|
||||
MergeDocOrder::Complex(_) => {
|
||||
// for later
|
||||
todo!();
|
||||
let mut serializer = ColumnarSerializer::new(output);
|
||||
|
||||
// TODO handle dictionary merge for Str/Bytes column
|
||||
let field_name_to_group = group_columns_for_merge(columnar_readers)?;
|
||||
for (column_name, category_to_columns) in field_name_to_group {
|
||||
for (_category, columns_to_merge) in category_to_columns {
|
||||
let column_type = columns_to_merge[0].column_type();
|
||||
let mut column_serialzier =
|
||||
serializer.serialize_column(column_name.as_bytes(), column_type);
|
||||
merge_columns(
|
||||
column_type,
|
||||
&columns_to_merge,
|
||||
&mapping,
|
||||
&mut column_serialzier,
|
||||
)?;
|
||||
}
|
||||
}
|
||||
serializer.finalize()?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Column types are grouped into different categories.
|
||||
@@ -44,7 +55,7 @@ pub fn merge_columnar(
|
||||
/// See also [README.md].
|
||||
#[derive(Copy, Clone, Eq, PartialEq, Hash, Debug)]
|
||||
#[repr(u8)]
|
||||
enum ColumnTypeCategory {
|
||||
pub enum ColumnTypeCategory {
|
||||
Bool,
|
||||
Str,
|
||||
Numerical,
|
||||
@@ -68,8 +79,41 @@ impl From<ColumnType> for ColumnTypeCategory {
|
||||
}
|
||||
}
|
||||
|
||||
fn collect_columns(
|
||||
columnar_readers: &[&ColumnarReader],
|
||||
pub fn detect_cardinality(columns: &[DynamicColumn]) -> Cardinality {
|
||||
if columns
|
||||
.iter()
|
||||
.any(|column| column.get_cardinality().is_multivalue())
|
||||
{
|
||||
return Cardinality::Multivalued;
|
||||
}
|
||||
if columns
|
||||
.iter()
|
||||
.any(|column| column.get_cardinality().is_optional())
|
||||
{
|
||||
return Cardinality::Optional;
|
||||
}
|
||||
Cardinality::Full
|
||||
}
|
||||
|
||||
pub fn compute_num_docs(columns: &[DynamicColumn], mapping: &MergeDocOrder) -> usize {
|
||||
// TODO handle deletes
|
||||
|
||||
0
|
||||
}
|
||||
|
||||
pub fn merge_columns(
|
||||
column_type: ColumnType,
|
||||
columns: &[DynamicColumn],
|
||||
mapping: &MergeDocOrder,
|
||||
column_serializer: &mut impl io::Write,
|
||||
) -> io::Result<()> {
|
||||
let cardinality = detect_cardinality(columns);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn group_columns_for_merge(
|
||||
columnar_readers: &[ColumnarReader],
|
||||
) -> io::Result<HashMap<String, HashMap<ColumnTypeCategory, Vec<DynamicColumn>>>> {
|
||||
// Each column name may have multiple types of column associated.
|
||||
// For merging we are interested in the same column type category since they can be merged.
|
||||
@@ -117,26 +161,20 @@ fn cast_to_common_numerical_column(columns: &[DynamicColumn]) -> Vec<DynamicColu
|
||||
.all(|column| column.column_type().numerical_type().is_some()));
|
||||
let coerce_to_i64: Vec<_> = columns
|
||||
.iter()
|
||||
.map(|column| column.clone().coerce_to_i64())
|
||||
.filter_map(|column| column.clone().coerce_to_i64())
|
||||
.collect();
|
||||
|
||||
if coerce_to_i64.iter().all(|column| column.is_some()) {
|
||||
return coerce_to_i64
|
||||
.into_iter()
|
||||
.map(|column| column.unwrap())
|
||||
.collect();
|
||||
if coerce_to_i64.len() == columns.len() {
|
||||
return coerce_to_i64;
|
||||
}
|
||||
|
||||
let coerce_to_u64: Vec<_> = columns
|
||||
.iter()
|
||||
.map(|column| column.clone().coerce_to_u64())
|
||||
.filter_map(|column| column.clone().coerce_to_u64())
|
||||
.collect();
|
||||
|
||||
if coerce_to_u64.iter().all(|column| column.is_some()) {
|
||||
return coerce_to_u64
|
||||
.into_iter()
|
||||
.map(|column| column.unwrap())
|
||||
.collect();
|
||||
if coerce_to_u64.len() == columns.len() {
|
||||
return coerce_to_u64;
|
||||
}
|
||||
|
||||
columns
|
||||
@@ -183,7 +221,9 @@ mod tests {
|
||||
ColumnarReader::open(buffer).unwrap()
|
||||
};
|
||||
|
||||
let column_map = collect_columns(&[&columnar1, &columnar2, &columnar3]).unwrap();
|
||||
let column_map =
|
||||
group_columns_for_merge(&[columnar1.clone(), columnar2.clone(), columnar3.clone()])
|
||||
.unwrap();
|
||||
assert_eq!(column_map.len(), 1);
|
||||
let cat_to_columns = column_map.get("numbers").unwrap();
|
||||
assert_eq!(cat_to_columns.len(), 1);
|
||||
@@ -191,14 +231,14 @@ mod tests {
|
||||
let numerical = cat_to_columns.get(&ColumnTypeCategory::Numerical).unwrap();
|
||||
assert!(numerical.iter().all(|column| column.is_f64()));
|
||||
|
||||
let column_map = collect_columns(&[&columnar1, &columnar1]).unwrap();
|
||||
let column_map = group_columns_for_merge(&[columnar1.clone(), columnar1.clone()]).unwrap();
|
||||
assert_eq!(column_map.len(), 1);
|
||||
let cat_to_columns = column_map.get("numbers").unwrap();
|
||||
assert_eq!(cat_to_columns.len(), 1);
|
||||
let numerical = cat_to_columns.get(&ColumnTypeCategory::Numerical).unwrap();
|
||||
assert!(numerical.iter().all(|column| column.is_i64()));
|
||||
|
||||
let column_map = collect_columns(&[&columnar2, &columnar2]).unwrap();
|
||||
let column_map = group_columns_for_merge(&[columnar2.clone(), columnar2.clone()]).unwrap();
|
||||
assert_eq!(column_map.len(), 1);
|
||||
let cat_to_columns = column_map.get("numbers").unwrap();
|
||||
assert_eq!(cat_to_columns.len(), 1);
|
||||
|
||||
1
columnar/src/columnar/merge_index.rs
Normal file
1
columnar/src/columnar/merge_index.rs
Normal file
@@ -0,0 +1 @@
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
mod column_type;
|
||||
mod format_version;
|
||||
mod merge;
|
||||
mod merge_index;
|
||||
mod reader;
|
||||
mod writer;
|
||||
|
||||
|
||||
@@ -13,6 +13,7 @@ fn io_invalid_data(msg: String) -> io::Error {
|
||||
|
||||
/// The ColumnarReader makes it possible to access a set of columns
|
||||
/// associated to field names.
|
||||
#[derive(Clone)]
|
||||
pub struct ColumnarReader {
|
||||
column_dictionary: Dictionary<RangeSSTable>,
|
||||
column_data: FileSlice,
|
||||
|
||||
@@ -8,7 +8,7 @@ use std::net::Ipv6Addr;
|
||||
|
||||
use column_operation::ColumnOperation;
|
||||
use common::CountingWriter;
|
||||
use serializer::ColumnarSerializer;
|
||||
pub(crate) use serializer::ColumnarSerializer;
|
||||
use stacker::{Addr, ArenaHashMap, MemoryArena};
|
||||
|
||||
use crate::column_index::SerializableColumnIndex;
|
||||
|
||||
@@ -8,7 +8,7 @@ use common::{HasLen, OwnedBytes};
|
||||
use crate::column::{BytesColumn, Column, StrColumn};
|
||||
use crate::column_values::{monotonic_map_column, StrictlyMonotonicFn};
|
||||
use crate::columnar::ColumnType;
|
||||
use crate::{DateTime, NumericalType};
|
||||
use crate::{Cardinality, DateTime, NumericalType};
|
||||
|
||||
#[derive(Clone)]
|
||||
pub enum DynamicColumn {
|
||||
@@ -23,6 +23,18 @@ pub enum DynamicColumn {
|
||||
}
|
||||
|
||||
impl DynamicColumn {
|
||||
pub fn get_cardinality(&self) -> Cardinality {
|
||||
match self {
|
||||
DynamicColumn::Bool(c) => c.get_cardinality(),
|
||||
DynamicColumn::I64(c) => c.get_cardinality(),
|
||||
DynamicColumn::U64(c) => c.get_cardinality(),
|
||||
DynamicColumn::F64(c) => c.get_cardinality(),
|
||||
DynamicColumn::IpAddr(c) => c.get_cardinality(),
|
||||
DynamicColumn::DateTime(c) => c.get_cardinality(),
|
||||
DynamicColumn::Bytes(c) => c.ords().get_cardinality(),
|
||||
DynamicColumn::Str(c) => c.ords().get_cardinality(),
|
||||
}
|
||||
}
|
||||
pub fn column_type(&self) -> ColumnType {
|
||||
match self {
|
||||
DynamicColumn::Bool(_) => ColumnType::Bool,
|
||||
|
||||
@@ -62,6 +62,12 @@ pub enum Cardinality {
|
||||
}
|
||||
|
||||
impl Cardinality {
|
||||
pub fn is_optional(&self) -> bool {
|
||||
matches!(self, Cardinality::Optional)
|
||||
}
|
||||
pub fn is_multivalue(&self) -> bool {
|
||||
matches!(self, Cardinality::Multivalued)
|
||||
}
|
||||
pub(crate) fn to_code(self) -> u8 {
|
||||
self as u8
|
||||
}
|
||||
|
||||
@@ -30,6 +30,7 @@ use crate::{BlockAddr, DeltaReader, Reader, SSTable, SSTableIndex, TermOrdinal};
|
||||
/// block boundary.
|
||||
///
|
||||
/// (See also README.md)
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct Dictionary<TSSTable: SSTable> {
|
||||
pub sstable_slice: FileSlice,
|
||||
pub sstable_index: SSTableIndex,
|
||||
|
||||
@@ -117,6 +117,7 @@ impl SSTable for MonotonicU64SSTable {
|
||||
/// `range_sstable[k1].end == range_sstable[k2].start`.
|
||||
///
|
||||
/// The first range is not required to start at `0`.
|
||||
#[derive(Clone, Copy, Debug)]
|
||||
pub struct RangeSSTable;
|
||||
|
||||
impl SSTable for RangeSSTable {
|
||||
|
||||
@@ -5,7 +5,7 @@ use serde::{Deserialize, Serialize};
|
||||
|
||||
use crate::{common_prefix_len, SSTableDataCorruption, TermOrdinal};
|
||||
|
||||
#[derive(Default, Debug, Serialize, Deserialize)]
|
||||
#[derive(Default, Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct SSTableIndex {
|
||||
blocks: Vec<BlockMeta>,
|
||||
}
|
||||
@@ -75,7 +75,7 @@ pub struct BlockAddr {
|
||||
pub first_ordinal: u64,
|
||||
}
|
||||
|
||||
#[derive(Debug, Serialize, Deserialize)]
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub(crate) struct BlockMeta {
|
||||
/// Any byte string that is lexicographically greater or equal to
|
||||
/// the last key in the block,
|
||||
|
||||
Reference in New Issue
Block a user