Integration of columnar

This commit is contained in:
Paul Masurel
2023-01-20 13:52:13 +09:00
parent 6ab8990bbd
commit 0e66423de8
81 changed files with 2760 additions and 3843 deletions

View File

@@ -59,6 +59,7 @@ sstable = { version="0.1", path="./sstable", package ="tantivy-sstable", optiona
stacker = { version="0.1", path="./stacker", package ="tantivy-stacker" }
tantivy-query-grammar = { version= "0.19.0", path="./query-grammar" }
tantivy-bitpacker = { version= "0.3", path="./bitpacker" }
columnar = { version= "0.1", path="./columnar", package="tantivy-columnar" }
common = { version= "0.5", path = "./common/", package = "tantivy-common" }
fastfield_codecs = { version= "0.3", path="./fastfield_codecs", default-features = false }
tokenizer-api = { version="0.1", path="./tokenizer-api", package="tantivy-tokenizer-api" }
@@ -107,7 +108,7 @@ unstable = [] # useful for benches.
quickwit = ["sstable"]
[workspace]
members = ["query-grammar", "bitpacker", "common", "fastfield_codecs", "ownedbytes", "stacker", "sstable", "tokenizer-api"]
members = ["query-grammar", "bitpacker", "common", "fastfield_codecs", "ownedbytes", "stacker", "sstable", "tokenizer-api", "columnar"]
# Following the "fail" crate best practises, we isolate
# tests that define specific behavior in fail check points

18
TODO.txt Normal file
View File

@@ -0,0 +1,18 @@
Make schema_builder API fluent.
fix doc serialization and prevent compression problems
u64 , etc. shoudl return Resutl<Option> now that we support optional missing a column is really not an error
remove fastfield codecs
ditch the first_or_default trick. if it is still useful, improve its implementation.
rename FastFieldReaders::open to load
remove fast field reader
find a way to unify the two DateTime.
readd type check in the filter wrapper
add unit test on columnar list columns.
make sure sort works

View File

@@ -9,6 +9,9 @@
- indexing
- aggregations
- merge
* replug facets
* replug range queries
+ mutlivaued range queries restrat frm the beginning all of the time.
# Perf and Size
* re-add ZSTD compression for dictionaries
@@ -37,6 +40,12 @@ use the rank & select naming in unit tests branch.
multi-linear -> blockwise
linear codec -> simply a multiplication for the index column
rename columnar to something more explicit, like column_dictionary or columnar_table
remove old column from the fast field API.
remove the Column traits alias.
rename fastfield -> column
document changes
rationalization FastFieldValue, HasColumnType
# Other
fix enhance column-cli
@@ -44,4 +53,3 @@ fix enhance column-cli
# Santa claus
autodetect datetime ipaddr, plug customizable tokenizer.

View File

@@ -1,6 +1,7 @@
mod dictionary_encoded;
mod serialize;
use std::fmt::Debug;
use std::ops::Deref;
use std::sync::Arc;
@@ -17,11 +18,11 @@ use crate::{Cardinality, RowId};
#[derive(Clone)]
pub struct Column<T> {
pub idx: ColumnIndex<'static>,
pub idx: ColumnIndex,
pub values: Arc<dyn ColumnValues<T>>,
}
impl<T: PartialOrd> Column<T> {
impl<T: PartialOrd + Copy + Debug + Send + Sync + 'static> Column<T> {
pub fn num_rows(&self) -> RowId {
match &self.idx {
ColumnIndex::Full => self.values.num_vals() as u32,
@@ -29,7 +30,7 @@ impl<T: PartialOrd> Column<T> {
ColumnIndex::Multivalued(col_index) => {
// The multivalued index contains all value start row_id,
// and one extra value at the end with the overall number of rows.
col_index.num_vals() - 1
col_index.num_rows()
}
}
}
@@ -37,12 +38,11 @@ impl<T: PartialOrd> Column<T> {
pub fn min_value(&self) -> T {
self.values.min_value()
}
pub fn max_value(&self) -> T {
self.values.max_value()
}
}
impl<T: PartialOrd + Copy + Send + Sync + 'static> Column<T> {
pub fn first(&self, row_id: RowId) -> Option<T> {
self.values(row_id).next()
}
@@ -61,7 +61,7 @@ impl<T: PartialOrd + Copy + Send + Sync + 'static> Column<T> {
}
impl<T> Deref for Column<T> {
type Target = ColumnIndex<'static>;
type Target = ColumnIndex;
fn deref(&self) -> &Self::Target {
&self.idx
@@ -86,7 +86,9 @@ struct FirstValueWithDefault<T: Copy> {
default_value: T,
}
impl<T: PartialOrd + Send + Sync + Copy + 'static> ColumnValues<T> for FirstValueWithDefault<T> {
impl<T: PartialOrd + Debug + Send + Sync + Copy + 'static> ColumnValues<T>
for FirstValueWithDefault<T>
{
fn get_val(&self, idx: u32) -> T {
self.column.first(idx).unwrap_or(self.default_value)
}

View File

@@ -1,3 +1,4 @@
use std::fmt::Debug;
use std::io;
use std::io::Write;
use std::sync::Arc;
@@ -33,7 +34,7 @@ pub fn serialize_column_mappable_to_u128<
Ok(())
}
pub fn serialize_column_mappable_to_u64<T: MonotonicallyMappableToU64>(
pub fn serialize_column_mappable_to_u64<T: MonotonicallyMappableToU64 + Debug>(
column_index: SerializableColumnIndex<'_>,
column_values: &impl ColumnValues<T>,
output: &mut impl Write,

View File

@@ -3,16 +3,15 @@ mod optional_index;
mod serialize;
use std::ops::Range;
use std::sync::Arc;
pub use optional_index::{OptionalIndex, SerializableOptionalIndex, Set};
pub use serialize::{open_column_index, serialize_column_index, SerializableColumnIndex};
use crate::column_values::ColumnValues;
use crate::column_index::multivalued_index::MultiValueIndex;
use crate::{Cardinality, RowId};
#[derive(Clone)]
pub enum ColumnIndex<'a> {
pub enum ColumnIndex {
Full,
Optional(OptionalIndex),
// TODO Remove the static by fixing the codec if possible.
@@ -21,10 +20,10 @@ pub enum ColumnIndex<'a> {
///
/// In addition, at index num_rows, an extra value is added
/// containing the overal number of values.
Multivalued(Arc<dyn ColumnValues<RowId> + 'a>),
Multivalued(MultiValueIndex),
}
impl<'a> ColumnIndex<'a> {
impl ColumnIndex {
pub fn get_cardinality(&self) -> Cardinality {
match self {
ColumnIndex::Full => Cardinality::Full,
@@ -43,11 +42,22 @@ impl<'a> ColumnIndex<'a> {
0..0
}
}
ColumnIndex::Multivalued(multivalued_index) => multivalued_index.range(row_id),
}
}
pub fn select_batch_in_place(&self, rank_ids: &mut Vec<RowId>) {
match self {
ColumnIndex::Full => {
// No need to do anything:
// value_idx and row_idx are the same.
}
ColumnIndex::Optional(optional_index) => {
optional_index.select_batch(&mut rank_ids[..]);
}
ColumnIndex::Multivalued(multivalued_index) => {
let multivalued_index_ref = &**multivalued_index;
let start: u32 = multivalued_index_ref.get_val(row_id);
let end: u32 = multivalued_index_ref.get_val(row_id + 1);
start..end
// TODO important: avoid using 0u32, and restart from the beginning all of the time.
multivalued_index.select_batch_in_place(0u32, rank_ids)
}
}
}

View File

@@ -1,5 +1,6 @@
use std::io;
use std::io::Write;
use std::ops::Range;
use std::sync::Arc;
use common::OwnedBytes;
@@ -7,9 +8,6 @@ use common::OwnedBytes;
use crate::column_values::{ColumnValues, FastFieldCodecType};
use crate::RowId;
#[derive(Clone)]
pub struct MultivaluedIndex(Arc<dyn ColumnValues<RowId>>);
pub fn serialize_multivalued_index(
multivalued_index: &dyn ColumnValues<RowId>,
output: &mut impl Write,
@@ -22,8 +20,113 @@ pub fn serialize_multivalued_index(
Ok(())
}
pub fn open_multivalued_index(bytes: OwnedBytes) -> io::Result<Arc<dyn ColumnValues<RowId>>> {
pub fn open_multivalued_index(bytes: OwnedBytes) -> io::Result<MultiValueIndex> {
let start_index_column: Arc<dyn ColumnValues<RowId>> =
crate::column_values::open_u64_mapped(bytes)?;
Ok(start_index_column)
Ok(MultiValueIndex { start_index_column })
}
#[derive(Clone)]
/// Index to resolve value range for given doc_id.
/// Starts at 0.
pub struct MultiValueIndex {
start_index_column: Arc<dyn crate::ColumnValues<RowId>>,
}
impl From<Arc<dyn ColumnValues<RowId>>> for MultiValueIndex {
fn from(start_index_column: Arc<dyn ColumnValues<RowId>>) -> Self {
MultiValueIndex { start_index_column }
}
}
impl MultiValueIndex {
/// Returns `[start, end)`, such that the values associated with
/// the given document are `start..end`.
#[inline]
pub(crate) fn range(&self, row_id: RowId) -> Range<RowId> {
let start = self.start_index_column.get_val(row_id);
let end = self.start_index_column.get_val(row_id + 1);
start..end
}
/// Returns the number of documents in the index.
#[inline]
pub fn num_rows(&self) -> u32 {
self.start_index_column.num_vals() - 1
}
/// Converts a list of ranks (row ids of values) in a 1:n index to the corresponding list of
/// row_ids. Positions are converted inplace to docids.
///
/// Since there is no index for value pos -> docid, but docid -> value pos range, we scan the
/// index.
///
/// Correctness: positions needs to be sorted. idx_reader needs to contain monotonically
/// increasing positions.
///
/// TODO: Instead of a linear scan we can employ a exponential search into binary search to
/// match a docid to its value position.
#[allow(clippy::bool_to_int_with_if)]
pub(crate) fn select_batch_in_place(&self, row_start: RowId, ranks: &mut Vec<u32>) {
if ranks.is_empty() {
return;
}
let mut cur_doc = row_start;
let mut last_doc = None;
assert!(self.start_index_column.get_val(row_start) as u32 <= ranks[0]);
let mut write_doc_pos = 0;
for i in 0..ranks.len() {
let pos = ranks[i];
loop {
let end = self.start_index_column.get_val(cur_doc + 1) as u32;
if end > pos {
ranks[write_doc_pos] = cur_doc;
write_doc_pos += if last_doc == Some(cur_doc) { 0 } else { 1 };
last_doc = Some(cur_doc);
break;
}
cur_doc += 1;
}
}
ranks.truncate(write_doc_pos);
}
}
#[cfg(test)]
mod tests {
use std::ops::Range;
use std::sync::Arc;
use super::MultiValueIndex;
use crate::column_values::IterColumn;
use crate::{ColumnValues, RowId};
fn index_to_pos_helper(
index: &MultiValueIndex,
doc_id_range: Range<u32>,
positions: &[u32],
) -> Vec<u32> {
let mut positions = positions.to_vec();
index.select_batch_in_place(doc_id_range.start, &mut positions);
positions
}
#[test]
fn test_positions_to_docid() {
let offsets: Vec<RowId> = vec![0, 10, 12, 15, 22, 23]; // docid values are [0..10, 10..12, 12..15, etc.]
let column: Arc<dyn ColumnValues<RowId>> = Arc::new(IterColumn::from(offsets.into_iter()));
let index = MultiValueIndex::from(column);
assert_eq!(index.num_rows(), 5);
let positions = &[10u32, 11, 15, 20, 21, 22];
assert_eq!(index_to_pos_helper(&index, 0..5, positions), vec![1, 3, 4]);
assert_eq!(index_to_pos_helper(&index, 1..5, positions), vec![1, 3, 4]);
assert_eq!(index_to_pos_helper(&index, 0..5, &[9]), vec![0]);
assert_eq!(index_to_pos_helper(&index, 1..5, &[10]), vec![1]);
assert_eq!(index_to_pos_helper(&index, 1..5, &[11]), vec![1]);
assert_eq!(index_to_pos_helper(&index, 2..5, &[12]), vec![2]);
assert_eq!(index_to_pos_helper(&index, 2..5, &[12, 14]), vec![2]);
assert_eq!(index_to_pos_helper(&index, 2..5, &[12, 14, 15]), vec![2, 3]);
}
}

View File

@@ -6,7 +6,7 @@ mod set;
mod set_block;
use common::{BinarySerializable, OwnedBytes, VInt};
pub use set::{Set, SetCodec, SelectCursor};
pub use set::{SelectCursor, Set, SetCodec};
use set_block::{
DenseBlock, DenseBlockCodec, SparseBlock, SparseBlockCodec, DENSE_BLOCK_NUM_BYTES,
};
@@ -127,7 +127,6 @@ impl<'a> BlockSelectCursor<'a> {
BlockSelectCursor::Sparse(sparse_select_cursor) => sparse_select_cursor.select(rank),
}
}
}
pub struct OptionalIndexSelectCursor<'a> {
current_block_cursor: BlockSelectCursor<'a>,
@@ -146,7 +145,12 @@ impl<'a> OptionalIndexSelectCursor<'a> {
return;
}
self.current_block_id = self.optional_index.find_block(rank, self.current_block_id);
self.current_block_end_rank = self.optional_index.block_metas.get(self.current_block_id as usize + 1).map(|block_meta| block_meta.non_null_rows_before_block).unwrap_or(u32::MAX);
self.current_block_end_rank = self
.optional_index
.block_metas
.get(self.current_block_id as usize + 1)
.map(|block_meta| block_meta.non_null_rows_before_block)
.unwrap_or(u32::MAX);
self.block_doc_idx_start = (self.current_block_id as u32) * ELEMENTS_PER_BLOCK;
let block_meta = self.optional_index.block_metas[self.current_block_id as usize];
self.num_null_rows_before_block = block_meta.non_null_rows_before_block;
@@ -213,7 +217,9 @@ impl Set<RowId> for OptionalIndex {
fn select_cursor<'b>(&'b self) -> OptionalIndexSelectCursor<'b> {
OptionalIndexSelectCursor {
current_block_cursor: BlockSelectCursor::Sparse(SparseBlockCodec::open(b"").select_cursor()),
current_block_cursor: BlockSelectCursor::Sparse(
SparseBlockCodec::open(b"").select_cursor(),
),
current_block_id: 0u16,
current_block_end_rank: 0u32, //< this is sufficient to force the first load
optional_index: self,
@@ -224,7 +230,6 @@ impl Set<RowId> for OptionalIndex {
}
impl OptionalIndex {
pub fn select_batch(&self, ranks: &mut [RowId]) {
let mut select_cursor = self.select_cursor();
for rank in ranks.iter_mut() {

View File

@@ -13,7 +13,6 @@ pub trait SetCodec {
fn open<'a>(data: &'a [u8]) -> Self::Reader<'a>;
}
/// Stateful object that makes it possible to compute several select in a row,
/// provided the rank passed as argument are increasing.
pub trait SelectCursor<T> {
@@ -23,8 +22,8 @@ pub trait SelectCursor<T> {
}
pub trait Set<T> {
type SelectCursor<'b>: SelectCursor<T> where Self: 'b;
type SelectCursor<'b>: SelectCursor<T>
where Self: 'b;
/// Returns true if the elements is contained in the Set
fn contains(&self, el: T) -> bool;
@@ -41,5 +40,5 @@ pub trait Set<T> {
fn select(&self, rank: T) -> T;
/// Creates a brand new select cursor.
fn select_cursor<'b>(&'b self,) -> Self::SelectCursor<'b>;
fn select_cursor<'b>(&'b self) -> Self::SelectCursor<'b>;
}

View File

@@ -3,7 +3,7 @@ use std::io::{self, Write};
use common::BinarySerializable;
use crate::column_index::optional_index::{Set, SetCodec, SelectCursor, ELEMENTS_PER_BLOCK};
use crate::column_index::optional_index::{SelectCursor, Set, SetCodec, ELEMENTS_PER_BLOCK};
#[inline(always)]
fn get_bit_at(input: u64, n: u16) -> bool {
@@ -113,7 +113,10 @@ pub struct DenseBlockSelectCursor<'a> {
impl<'a> SelectCursor<u16> for DenseBlockSelectCursor<'a> {
#[inline]
fn select(&mut self, rank: u16) -> u16 {
self.block_id = self.dense_block.find_miniblock_containing_rank(rank, self.block_id).unwrap();
self.block_id = self
.dense_block
.find_miniblock_containing_rank(rank, self.block_id)
.unwrap();
let index_block = self.dense_block.mini_block(self.block_id);
let in_block_rank = rank - index_block.rank;
self.block_id * ELEMENTS_PER_MINI_BLOCK + select_u64(index_block.bitvec, in_block_rank)
@@ -154,7 +157,7 @@ impl<'a> Set<u16> for DenseBlock<'a> {
}
#[inline(always)]
fn select_cursor<'b>(&'b self,) -> Self::SelectCursor<'b> {
fn select_cursor<'b>(&'b self) -> Self::SelectCursor<'b> {
DenseBlockSelectCursor {
block_id: 0,
dense_block: *self,

View File

@@ -1,4 +1,4 @@
use crate::column_index::optional_index::{Set, SetCodec, SelectCursor};
use crate::column_index::optional_index::{SelectCursor, Set, SetCodec};
pub struct SparseBlockCodec;
@@ -32,7 +32,6 @@ impl<'a> SelectCursor<u16> for SparseBlock<'a> {
}
impl<'a> Set<u16> for SparseBlock<'a> {
type SelectCursor<'b> = Self where Self: 'b;
#[inline(always)]
@@ -52,10 +51,9 @@ impl<'a> Set<u16> for SparseBlock<'a> {
}
#[inline(always)]
fn select_cursor<'b>(&'b self,) -> Self::SelectCursor<'b> {
fn select_cursor<'b>(&'b self) -> Self::SelectCursor<'b> {
*self
}
}
#[inline(always)]

View File

@@ -2,7 +2,7 @@ use std::collections::HashMap;
use crate::column_index::optional_index::set_block::dense::DENSE_BLOCK_NUM_BYTES;
use crate::column_index::optional_index::set_block::{DenseBlockCodec, SparseBlockCodec};
use crate::column_index::optional_index::{Set, SetCodec, SelectCursor};
use crate::column_index::optional_index::{SelectCursor, Set, SetCodec};
fn test_set_helper<C: SetCodec<Item = u16>>(vals: &[u16]) -> usize {
let mut buffer = Vec::new();

View File

@@ -47,7 +47,7 @@ pub fn serialize_column_index(
Ok(column_index_num_bytes)
}
pub fn open_column_index(mut bytes: OwnedBytes) -> io::Result<ColumnIndex<'static>> {
pub fn open_column_index(mut bytes: OwnedBytes) -> io::Result<ColumnIndex> {
if bytes.is_empty() {
return Err(io::Error::new(
io::ErrorKind::UnexpectedEof,
@@ -64,8 +64,8 @@ pub fn open_column_index(mut bytes: OwnedBytes) -> io::Result<ColumnIndex<'stati
Ok(ColumnIndex::Optional(optional_index))
}
Cardinality::Multivalued => {
let multivalued_index = super::multivalued_index::open_multivalued_index(bytes)?;
Ok(ColumnIndex::Multivalued(multivalued_index))
let multivalue_index = super::multivalued_index::open_multivalued_index(bytes)?;
Ok(ColumnIndex::Multivalued(multivalue_index))
}
}
}

View File

@@ -1,3 +1,4 @@
use std::fmt::Debug;
use std::marker::PhantomData;
use std::ops::{Range, RangeInclusive};
@@ -8,7 +9,7 @@ use crate::column_values::monotonic_mapping::StrictlyMonotonicFn;
/// `ColumnValues` provides access to a dense field column.
///
/// `Column` are just a wrapper over `ColumnValues` and a `ColumnIndex`.
pub trait ColumnValues<T: PartialOrd = u64>: Send + Sync {
pub trait ColumnValues<T: PartialOrd + Debug = u64>: Send + Sync {
/// Return the value associated with the given idx.
///
/// This accessor should return as fast as possible.
@@ -44,7 +45,6 @@ pub trait ColumnValues<T: PartialOrd = u64>: Send + Sync {
positions: &mut Vec<u32>,
) {
let doc_id_range = doc_id_range.start..doc_id_range.end.min(self.num_vals());
for idx in doc_id_range.start..doc_id_range.end {
let val = self.get_val(idx);
if value_range.contains(&val) {
@@ -78,7 +78,7 @@ pub trait ColumnValues<T: PartialOrd = u64>: Send + Sync {
}
}
impl<T: Copy + PartialOrd> ColumnValues<T> for std::sync::Arc<dyn ColumnValues<T>> {
impl<T: Copy + PartialOrd + Debug> ColumnValues<T> for std::sync::Arc<dyn ColumnValues<T>> {
fn get_val(&self, idx: u32) -> T {
self.as_ref().get_val(idx)
}
@@ -104,7 +104,7 @@ impl<T: Copy + PartialOrd> ColumnValues<T> for std::sync::Arc<dyn ColumnValues<T
}
}
impl<'a, C: ColumnValues<T> + ?Sized, T: Copy + PartialOrd> ColumnValues<T> for &'a C {
impl<'a, C: ColumnValues<T> + ?Sized, T: Copy + PartialOrd + Debug> ColumnValues<T> for &'a C {
fn get_val(&self, idx: u32) -> T {
(*self).get_val(idx)
}
@@ -137,7 +137,7 @@ pub struct VecColumn<'a, T = u64> {
pub(crate) max_value: T,
}
impl<'a, T: Copy + PartialOrd + Send + Sync> ColumnValues<T> for VecColumn<'a, T> {
impl<'a, T: Copy + PartialOrd + Send + Sync + Debug> ColumnValues<T> for VecColumn<'a, T> {
fn get_val(&self, position: u32) -> T {
self.values[position as usize]
}
@@ -205,8 +205,8 @@ pub fn monotonic_map_column<C, T, Input, Output>(
where
C: ColumnValues<Input>,
T: StrictlyMonotonicFn<Input, Output> + Send + Sync,
Input: PartialOrd + Send + Sync + Clone,
Output: PartialOrd + Send + Sync + Clone,
Input: PartialOrd + Debug + Send + Sync + Clone,
Output: PartialOrd + Debug + Send + Sync + Clone,
{
MonotonicMappingColumn {
from_column,
@@ -219,8 +219,8 @@ impl<C, T, Input, Output> ColumnValues<Output> for MonotonicMappingColumn<C, T,
where
C: ColumnValues<Input>,
T: StrictlyMonotonicFn<Input, Output> + Send + Sync,
Input: PartialOrd + Send + Sync + Clone,
Output: PartialOrd + Send + Sync + Clone,
Input: PartialOrd + Send + Debug + Sync + Clone,
Output: PartialOrd + Send + Debug + Sync + Clone,
{
#[inline]
fn get_val(&self, idx: u32) -> Output {
@@ -282,7 +282,7 @@ where T: Iterator + Clone + ExactSizeIterator
impl<T> ColumnValues<T::Item> for IterColumn<T>
where
T: Iterator + Clone + ExactSizeIterator + Send + Sync,
T::Item: PartialOrd,
T::Item: PartialOrd + Debug,
{
fn get_val(&self, idx: u32) -> T::Item {
self.0.clone().nth(idx as usize).unwrap()

View File

@@ -10,6 +10,7 @@
#[cfg(test)]
mod tests;
use std::fmt::Debug;
use std::io;
use std::io::Write;
use std::sync::Arc;
@@ -124,7 +125,7 @@ impl U128FastFieldCodecType {
}
/// Returns the correct codec reader wrapped in the `Arc` for the data.
pub fn open_u128_mapped<T: MonotonicallyMappableToU128>(
pub fn open_u128_mapped<T: MonotonicallyMappableToU128 + Debug>(
mut bytes: OwnedBytes,
) -> io::Result<Arc<dyn ColumnValues<T>>> {
let header = U128Header::deserialize(&mut bytes)?;
@@ -137,7 +138,7 @@ pub fn open_u128_mapped<T: MonotonicallyMappableToU128>(
}
/// Returns the correct codec reader wrapped in the `Arc` for the data.
pub fn open_u64_mapped<T: MonotonicallyMappableToU64>(
pub fn open_u64_mapped<T: MonotonicallyMappableToU64 + Debug>(
mut bytes: OwnedBytes,
) -> io::Result<Arc<dyn ColumnValues<T>>> {
let header = Header::deserialize(&mut bytes)?;
@@ -150,7 +151,7 @@ pub fn open_u64_mapped<T: MonotonicallyMappableToU64>(
}
}
fn open_specific_codec<C: FastFieldCodec, Item: MonotonicallyMappableToU64>(
fn open_specific_codec<C: FastFieldCodec, Item: MonotonicallyMappableToU64 + Debug>(
bytes: OwnedBytes,
header: &Header,
) -> io::Result<Arc<dyn ColumnValues<Item>>> {

View File

@@ -1,3 +1,4 @@
use std::fmt::Debug;
use std::marker::PhantomData;
use fastdivide::DividerU64;
@@ -7,7 +8,7 @@ use crate::RowId;
/// Monotonic maps a value to u64 value space.
/// Monotonic mapping enables `PartialOrd` on u64 space without conversion to original space.
pub trait MonotonicallyMappableToU64: 'static + PartialOrd + Copy + Send + Sync {
pub trait MonotonicallyMappableToU64: 'static + PartialOrd + Debug + Copy + Send + Sync {
/// Converts a value to u64.
///
/// Internally all fast field values are encoded as u64.

View File

@@ -1,8 +1,9 @@
use std::fmt::Debug;
use std::net::Ipv6Addr;
/// Montonic maps a value to u128 value space
/// Monotonic mapping enables `PartialOrd` on u128 space without conversion to original space.
pub trait MonotonicallyMappableToU128: 'static + PartialOrd + Copy + Send + Sync {
pub trait MonotonicallyMappableToU128: 'static + PartialOrd + Copy + Debug + Send + Sync {
/// Converts a value to u128.
///
/// Internally all fast field values are encoded as u64.

View File

@@ -17,6 +17,7 @@
// You should have received a copy of the GNU Affero General Public License
// along with this program. If not, see <http://www.gnu.org/licenses/>.
use std::fmt::Debug;
use std::io;
use std::num::NonZeroU64;
@@ -178,7 +179,7 @@ pub fn serialize_column_values_u128<F: Fn() -> I, I: Iterator<Item = u128>>(
}
/// Serializes the column with the codec with the best estimate on the data.
pub fn serialize_column_values<T: MonotonicallyMappableToU64>(
pub fn serialize_column_values<T: MonotonicallyMappableToU64 + Debug>(
typed_column: impl ColumnValues<T>,
codecs: &[FastFieldCodecType],
output: &mut impl io::Write,

View File

@@ -1,7 +1,8 @@
use std::fmt::Debug;
use std::net::Ipv6Addr;
use crate::value::NumericalType;
use crate::{column, Column, DynamicColumn, InvalidData, StrColumn};
use crate::InvalidData;
/// The column type represents the column type and can fit on 6-bits.
///
@@ -91,7 +92,7 @@ impl ColumnType {
}
// TODO remove if possible
pub trait HasAssociatedColumnType: 'static + Send + Sync + Copy + PartialOrd {
pub trait HasAssociatedColumnType: 'static + Debug + Send + Sync + Copy + PartialOrd {
fn column_type() -> ColumnType;
fn default_value() -> Self;
}

View File

@@ -19,6 +19,7 @@ pub(crate) mod utils;
mod value;
pub use column::{BytesColumn, Column, StrColumn};
pub use column_index::ColumnIndex;
pub use column_values::ColumnValues;
pub use columnar::{
merge_columnar, ColumnType, ColumnarReader, ColumnarWriter, HasAssociatedColumnType,

View File

@@ -1,4 +1,4 @@
use crate::{Column, ColumnType, InvalidData};
use crate::InvalidData;
#[derive(Copy, Clone, PartialEq, Debug)]
pub enum NumericalValue {

View File

@@ -13,7 +13,7 @@ use tantivy::aggregation::agg_result::AggregationResults;
use tantivy::aggregation::metric::AverageAggregation;
use tantivy::aggregation::AggregationCollector;
use tantivy::query::TermQuery;
use tantivy::schema::{self, Cardinality, IndexRecordOption, Schema, TextFieldIndexing};
use tantivy::schema::{self, IndexRecordOption, Schema, TextFieldIndexing};
use tantivy::{doc, Index, Term};
fn main() -> tantivy::Result<()> {
@@ -25,7 +25,7 @@ fn main() -> tantivy::Result<()> {
.set_stored();
let text_field = schema_builder.add_text_field("text", text_fieldtype);
let score_fieldtype =
crate::schema::NumericOptions::default().set_fast(Cardinality::SingleValue);
crate::schema::NumericOptions::default().set_fast();
let highscore_field = schema_builder.add_f64_field("highscore", score_fieldtype.clone());
let price_field = schema_builder.add_f64_field("price", score_fieldtype);

View File

@@ -4,7 +4,7 @@
use tantivy::collector::TopDocs;
use tantivy::query::QueryParser;
use tantivy::schema::{Cardinality, DateOptions, Schema, Value, INDEXED, STORED, STRING};
use tantivy::schema::{DateOptions, Schema, Value, INDEXED, STORED, STRING};
use tantivy::Index;
fn main() -> tantivy::Result<()> {
@@ -12,7 +12,7 @@ fn main() -> tantivy::Result<()> {
let mut schema_builder = Schema::builder();
let opts = DateOptions::from(INDEXED)
.set_stored()
.set_fast(Cardinality::SingleValue)
.set_fast()
.set_precision(tantivy::DatePrecision::Seconds);
let occurred_at = schema_builder.add_date_field("occurred_at", opts);
let event_type = schema_builder.add_text_field("event", STRING | STORED);

View File

@@ -14,6 +14,7 @@ repository = "https://github.com/quickwit-oss/tantivy"
[dependencies]
common = { version = "0.5", path = "../common/", package = "tantivy-common" }
tantivy-bitpacker = { version= "0.3", path = "../bitpacker/" }
columnar = { version= "0.1", path="../columnar", package="tantivy-columnar" }
prettytable-rs = {version="0.10.0", optional= true}
rand = {version="0.8.3", optional= true}
fastdivide = "0.4"

View File

@@ -2,81 +2,11 @@ use std::fmt::{self, Debug};
use std::marker::PhantomData;
use std::ops::{Range, RangeInclusive};
pub use columnar::ColumnValues as Column;
use tantivy_bitpacker::minmax;
use crate::monotonic_mapping::StrictlyMonotonicFn;
/// `Column` provides columnar access on a field.
pub trait Column<T: PartialOrd + Debug = u64>: Send + Sync {
/// Return the value associated with the given idx.
///
/// This accessor should return as fast as possible.
///
/// # Panics
///
/// May panic if `idx` is greater than the column length.
fn get_val(&self, idx: u32) -> T;
/// Fills an output buffer with the fast field values
/// associated with the `DocId` going from
/// `start` to `start + output.len()`.
///
/// # Panics
///
/// Must panic if `start + output.len()` is greater than
/// the segment's `maxdoc`.
#[inline]
fn get_range(&self, start: u64, output: &mut [T]) {
for (out, idx) in output.iter_mut().zip(start..) {
*out = self.get_val(idx as u32);
}
}
/// Get the positions of values which are in the provided value range.
///
/// Note that position == docid for single value fast fields
#[inline]
fn get_docids_for_value_range(
&self,
value_range: RangeInclusive<T>,
doc_id_range: Range<u32>,
positions: &mut Vec<u32>,
) {
let doc_id_range = doc_id_range.start..doc_id_range.end.min(self.num_vals());
for idx in doc_id_range.start..doc_id_range.end {
let val = self.get_val(idx);
if value_range.contains(&val) {
positions.push(idx);
}
}
}
/// Returns the minimum value for this fast field.
///
/// This min_value may not be exact.
/// For instance, the min value does not take in account of possible
/// deleted document. All values are however guaranteed to be higher than
/// `.min_value()`.
fn min_value(&self) -> T;
/// Returns the maximum value for this fast field.
///
/// This max_value may not be exact.
/// For instance, the max value does not take in account of possible
/// deleted document. All values are however guaranteed to be higher than
/// `.max_value()`.
fn max_value(&self) -> T;
/// The number of values in the column.
fn num_vals(&self) -> u32;
/// Returns a iterator over the data
fn iter<'a>(&'a self) -> Box<dyn Iterator<Item = T> + 'a> {
Box::new((0..self.num_vals()).map(|idx| self.get_val(idx)))
}
}
/// VecColumn provides `Column` over a slice.
pub struct VecColumn<'a, T = u64> {
values: &'a [T],
@@ -84,32 +14,6 @@ pub struct VecColumn<'a, T = u64> {
max_value: T,
}
impl<'a, C: Column<T>, T: Copy + PartialOrd + fmt::Debug> Column<T> for &'a C {
fn get_val(&self, idx: u32) -> T {
(*self).get_val(idx)
}
fn min_value(&self) -> T {
(*self).min_value()
}
fn max_value(&self) -> T {
(*self).max_value()
}
fn num_vals(&self) -> u32 {
(*self).num_vals()
}
fn iter<'b>(&'b self) -> Box<dyn Iterator<Item = T> + 'b> {
(*self).iter()
}
fn get_range(&self, start: u64, output: &mut [T]) {
(*self).get_range(start, output)
}
}
impl<'a, T: Copy + PartialOrd + Send + Sync + Debug> Column<T> for VecColumn<'a, T> {
fn get_val(&self, position: u32) -> T {
self.values[position as usize]

View File

@@ -15,7 +15,7 @@ use super::metric::{
use super::segment_agg_result::BucketCount;
use super::VecWithNames;
use crate::fastfield::{type_and_cardinality, MultiValuedFastFieldReader};
use crate::schema::{Cardinality, Type};
use crate::schema::Type;
use crate::{InvertedIndexReader, SegmentReader, TantivyError};
#[derive(Clone, Default)]

View File

@@ -43,13 +43,13 @@ mod tests {
use crate::aggregation::agg_result::AggregationResults;
use crate::aggregation::AggregationCollector;
use crate::query::AllQuery;
use crate::schema::{Cardinality, NumericOptions, Schema};
use crate::schema::{NumericOptions, Schema};
use crate::Index;
#[test]
fn test_metric_aggregations() {
let mut schema_builder = Schema::builder();
let field_options = NumericOptions::default().set_fast(Cardinality::SingleValue);
let field_options = NumericOptions::default().set_fast();
let field = schema_builder.add_f64_field("price", field_options);
let index = Index::create_in_ram(schema_builder.build());
let mut index_writer = index.writer_for_tests().unwrap();

View File

@@ -433,13 +433,13 @@ mod tests {
let text_field_id = schema_builder.add_text_field("text_id", text_fieldtype);
let string_field_id = schema_builder.add_text_field("string_id", STRING | FAST);
let score_fieldtype =
crate::schema::NumericOptions::default().set_fast(Cardinality::SingleValue);
crate::schema::NumericOptions::default().set_fast();
let score_field = schema_builder.add_u64_field("score", score_fieldtype.clone());
let score_field_f64 = schema_builder.add_f64_field("score_f64", score_fieldtype.clone());
let score_field_i64 = schema_builder.add_i64_field("score_i64", score_fieldtype);
let fraction_field = schema_builder.add_f64_field(
"fraction_f64",
crate::schema::NumericOptions::default().set_fast(Cardinality::SingleValue),
crate::schema::NumericOptions::default().set_fast(),
);
let index = Index::create_in_ram(schema_builder.build());
{
@@ -657,12 +657,12 @@ mod tests {
let date_field = schema_builder.add_date_field("date", FAST);
schema_builder.add_text_field("dummy_text", STRING);
let score_fieldtype =
crate::schema::NumericOptions::default().set_fast(Cardinality::SingleValue);
crate::schema::NumericOptions::default().set_fast();
let score_field = schema_builder.add_u64_field("score", score_fieldtype.clone());
let score_field_f64 = schema_builder.add_f64_field("score_f64", score_fieldtype.clone());
let multivalue =
crate::schema::NumericOptions::default().set_fast(Cardinality::MultiValues);
crate::schema::NumericOptions::default().set_fast();
let scores_field_i64 = schema_builder.add_i64_field("scores_i64", multivalue);
let score_field_i64 = schema_builder.add_i64_field("score_i64", score_fieldtype);
@@ -1190,7 +1190,7 @@ mod tests {
let text_field_few_terms =
schema_builder.add_text_field("text_few_terms", STRING | FAST);
let score_fieldtype =
crate::schema::NumericOptions::default().set_fast(Cardinality::SingleValue);
crate::schema::NumericOptions::default().set_fast();
let score_field = schema_builder.add_u64_field("score", score_fieldtype.clone());
let score_field_f64 =
schema_builder.add_f64_field("score_f64", score_fieldtype.clone());

View File

@@ -12,10 +12,10 @@
use std::marker::PhantomData;
use std::sync::Arc;
use columnar::{DynamicColumn, HasAssociatedColumnType};
use fastfield_codecs::Column;
use crate::collector::{Collector, SegmentCollector};
use crate::fastfield::FastValue;
use crate::schema::Field;
use crate::{Score, SegmentReader, TantivyError};
@@ -61,7 +61,7 @@ use crate::{Score, SegmentReader, TantivyError};
/// # Ok(())
/// # }
/// ```
pub struct FilterCollector<TCollector, TPredicate, TPredicateValue: FastValue>
pub struct FilterCollector<TCollector, TPredicate, TPredicateValue: Default>
where TPredicate: 'static + Clone
{
field: Field,
@@ -70,7 +70,7 @@ where TPredicate: 'static + Clone
t_predicate_value: PhantomData<TPredicateValue>,
}
impl<TCollector, TPredicate, TPredicateValue: FastValue>
impl<TCollector, TPredicate, TPredicateValue: Default>
FilterCollector<TCollector, TPredicate, TPredicateValue>
where
TCollector: Collector + Send + Sync,
@@ -91,12 +91,13 @@ where
}
}
impl<TCollector, TPredicate, TPredicateValue: FastValue> Collector
impl<TCollector, TPredicate, TPredicateValue: Default> Collector
for FilterCollector<TCollector, TPredicate, TPredicateValue>
where
TCollector: Collector + Send + Sync,
TPredicate: 'static + Fn(TPredicateValue) -> bool + Send + Sync + Clone,
TPredicateValue: FastValue,
TPredicateValue: HasAssociatedColumnType,
DynamicColumn: Into<Option<columnar::Column<TPredicateValue>>>,
{
// That's the type of our result.
// Our standard deviation will be a float.
@@ -117,20 +118,10 @@ where
field_entry.name()
)));
}
let requested_type = TPredicateValue::to_type();
let field_schema_type = field_entry.field_type().value_type();
if requested_type != field_schema_type {
return Err(TantivyError::SchemaError(format!(
"Field {:?} is of type {:?}!={:?}",
field_entry.name(),
requested_type,
field_schema_type
)));
}
let fast_field_reader = segment_reader
.fast_fields()
.typed_fast_field_reader(schema.get_field_name(self.field))?;
.typed_column_first_or_default(schema.get_field_name(self.field))?;
let segment_collector = self
.collector
@@ -159,7 +150,7 @@ where
pub struct FilterSegmentCollector<TSegmentCollector, TPredicate, TPredicateValue>
where
TPredicate: 'static,
TPredicateValue: FastValue,
DynamicColumn: Into<Option<columnar::Column<TPredicateValue>>>,
{
fast_field_reader: Arc<dyn Column<TPredicateValue>>,
segment_collector: TSegmentCollector,
@@ -171,8 +162,9 @@ impl<TSegmentCollector, TPredicate, TPredicateValue> SegmentCollector
for FilterSegmentCollector<TSegmentCollector, TPredicate, TPredicateValue>
where
TSegmentCollector: SegmentCollector,
TPredicateValue: HasAssociatedColumnType,
TPredicate: 'static + Fn(TPredicateValue) -> bool + Send + Sync,
TPredicateValue: FastValue,
DynamicColumn: Into<Option<columnar::Column<TPredicateValue>>>,
{
type Fruit = TSegmentCollector::Fruit;

View File

@@ -4,7 +4,7 @@ use fastdivide::DividerU64;
use fastfield_codecs::Column;
use crate::collector::{Collector, SegmentCollector};
use crate::fastfield::FastValue;
use crate::fastfield::{FastFieldNotAvailableError, FastValue};
use crate::schema::Type;
use crate::{DocId, Score};
@@ -87,14 +87,14 @@ impl HistogramComputer {
}
pub struct SegmentHistogramCollector {
histogram_computer: HistogramComputer,
ff_reader: Arc<dyn Column<u64>>,
column_u64: Arc<dyn Column<u64>>,
}
impl SegmentCollector for SegmentHistogramCollector {
type Fruit = Vec<u64>;
fn collect(&mut self, doc: DocId, _score: Score) {
let value = self.ff_reader.get_val(doc);
let value = self.column_u64.get_val(doc);
self.histogram_computer.add_value(value);
}
@@ -112,14 +112,18 @@ impl Collector for HistogramCollector {
_segment_local_id: crate::SegmentOrdinal,
segment: &crate::SegmentReader,
) -> crate::Result<Self::Child> {
let ff_reader = segment.fast_fields().u64_lenient(&self.field)?;
let column_opt = segment.fast_fields().u64_lenient(&self.field)?;
let column = column_opt.ok_or_else(|| FastFieldNotAvailableError {
field_name: self.field.clone(),
})?;
let column_u64 = column.first_or_default_col(0u64);
Ok(SegmentHistogramCollector {
histogram_computer: HistogramComputer {
counts: vec![0; self.num_buckets],
min_value: self.min_value,
divider: self.divider,
},
ff_reader,
column_u64,
})
}

View File

@@ -104,9 +104,8 @@ pub use self::custom_score_top_collector::{CustomScorer, CustomSegmentScorer};
mod tweak_score_top_collector;
pub use self::tweak_score_top_collector::{ScoreSegmentTweaker, ScoreTweaker};
mod facet_collector;
pub use self::facet_collector::{FacetCollector, FacetCounts};
// mod facet_collector;
// pub use self::facet_collector::{FacetCollector, FacetCounts};
use crate::query::Weight;
mod docset_collector;

View File

@@ -5,7 +5,6 @@ use fastfield_codecs::Column;
use super::*;
use crate::collector::{Count, FilterCollector, TopDocs};
use crate::core::SegmentReader;
use crate::fastfield::BytesFastFieldReader;
use crate::query::{AllQuery, QueryParser};
use crate::schema::{Field, Schema, FAST, TEXT};
use crate::time::format_description::well_known::Rfc3339;
@@ -58,9 +57,10 @@ pub fn test_filter_collector() -> crate::Result<()> {
assert_eq!(filtered_top_docs.len(), 0);
fn date_filter(value: DateTime) -> bool {
(value.into_utc() - OffsetDateTime::parse("2019-04-09T00:00:00+00:00", &Rfc3339).unwrap())
.whole_weeks()
fn date_filter(value: columnar::DateTime) -> bool {
(crate::DateTime::from(value).into_utc()
- OffsetDateTime::parse("2019-04-09T00:00:00+00:00", &Rfc3339).unwrap())
.whole_weeks()
> 0
}
@@ -164,8 +164,10 @@ pub struct FastFieldSegmentCollector {
}
impl FastFieldTestCollector {
pub fn for_field(field: String) -> FastFieldTestCollector {
FastFieldTestCollector { field }
pub fn for_field(field: impl ToString) -> FastFieldTestCollector {
FastFieldTestCollector {
field: field.to_string(),
}
}
}
@@ -210,64 +212,62 @@ impl SegmentCollector for FastFieldSegmentCollector {
}
}
/// Collects in order all of the fast field bytes for all of the
/// docs in the `DocSet`
///
/// This collector is mainly useful for tests.
pub struct BytesFastFieldTestCollector {
field: Field,
}
// /// Collects in order all of the fast field bytes for all of the
// /// docs in the `DocSet`
// ///
// /// This collector is mainly useful for tests.
// pub struct BytesFastFieldTestCollector {
// field: Field,
// }
pub struct BytesFastFieldSegmentCollector {
vals: Vec<u8>,
reader: BytesFastFieldReader,
}
// pub struct BytesFastFieldSegmentCollector {
// vals: Vec<u8>,
// reader: BytesFastFieldReader,
// }
impl BytesFastFieldTestCollector {
pub fn for_field(field: Field) -> BytesFastFieldTestCollector {
BytesFastFieldTestCollector { field }
}
}
// impl BytesFastFieldTestCollector {
// pub fn for_field(field: Field) -> BytesFastFieldTestCollector {
// BytesFastFieldTestCollector { field }
// }
// }
impl Collector for BytesFastFieldTestCollector {
type Fruit = Vec<u8>;
type Child = BytesFastFieldSegmentCollector;
// impl Collector for BytesFastFieldTestCollector {
// type Fruit = Vec<u8>;
// type Child = BytesFastFieldSegmentCollector;
fn for_segment(
&self,
_segment_local_id: u32,
segment_reader: &SegmentReader,
) -> crate::Result<BytesFastFieldSegmentCollector> {
let reader = segment_reader
.fast_fields()
.bytes(segment_reader.schema().get_field_name(self.field))?;
Ok(BytesFastFieldSegmentCollector {
vals: Vec::new(),
reader,
})
}
// fn for_segment(
// &self,
// _segment_local_id: u32,
// segment_reader: &SegmentReader,
// ) -> crate::Result<BytesFastFieldSegmentCollector> {
// let reader = segment_reader.fast_fields().bytes(self.field)?;
// Ok(BytesFastFieldSegmentCollector {
// vals: Vec::new(),
// reader,
// })
// }
fn requires_scoring(&self) -> bool {
false
}
// fn requires_scoring(&self) -> bool {
// false
// }
fn merge_fruits(&self, children: Vec<Vec<u8>>) -> crate::Result<Vec<u8>> {
Ok(children.into_iter().flat_map(|c| c.into_iter()).collect())
}
}
// fn merge_fruits(&self, children: Vec<Vec<u8>>) -> crate::Result<Vec<u8>> {
// Ok(children.into_iter().flat_map(|c| c.into_iter()).collect())
// }
// }
impl SegmentCollector for BytesFastFieldSegmentCollector {
type Fruit = Vec<u8>;
// impl SegmentCollector for BytesFastFieldSegmentCollector {
// type Fruit = Vec<u8>;
fn collect(&mut self, doc: u32, _score: Score) {
let data = self.reader.get_bytes(doc);
self.vals.extend(data);
}
// fn collect(&mut self, doc: u32, _score: Score) {
// let data = self.reader.get_bytes(doc);
// self.vals.extend(data);
// }
fn harvest(self) -> <Self as SegmentCollector>::Fruit {
self.vals
}
}
// fn harvest(self) -> <Self as SegmentCollector>::Fruit {
// self.vals
// }
// }
fn make_test_searcher() -> crate::Result<Searcher> {
let schema = Schema::builder().build();

View File

@@ -12,7 +12,7 @@ use crate::collector::tweak_score_top_collector::TweakedScoreTopCollector;
use crate::collector::{
CustomScorer, CustomSegmentScorer, ScoreSegmentTweaker, ScoreTweaker, SegmentCollector,
};
use crate::fastfield::FastValue;
use crate::fastfield::{FastFieldNotAvailableError, FastValue};
use crate::query::Weight;
use crate::schema::Field;
use crate::{DocAddress, DocId, Score, SegmentOrdinal, SegmentReader, TantivyError};
@@ -22,7 +22,7 @@ struct FastFieldConvertCollector<
TFastValue: FastValue,
> {
pub collector: TCollector,
pub field: Field,
pub field: String,
pub fast_value: std::marker::PhantomData<TFastValue>,
}
@@ -41,7 +41,8 @@ where
segment: &SegmentReader,
) -> crate::Result<Self::Child> {
let schema = segment.schema();
let field_entry = schema.get_field_entry(self.field);
let field = schema.get_field(&self.field)?;
let field_entry = schema.get_field_entry(field);
if !field_entry.is_fast() {
return Err(TantivyError::SchemaError(format!(
"Field {:?} is not a fast field.",
@@ -132,17 +133,17 @@ impl fmt::Debug for TopDocs {
}
struct ScorerByFastFieldReader {
ff_reader: Arc<dyn Column<u64>>,
sort_column: Arc<dyn Column<u64>>,
}
impl CustomSegmentScorer<u64> for ScorerByFastFieldReader {
fn score(&mut self, doc: DocId) -> u64 {
self.ff_reader.get_val(doc)
self.sort_column.get_val(doc)
}
}
struct ScorerByField {
field: Field,
field: String,
}
impl CustomScorer<u64> for ScorerByField {
@@ -154,10 +155,13 @@ impl CustomScorer<u64> for ScorerByField {
// mapping is monotonic, so it is sufficient to compute our top-K docs.
//
// The conversion will then happen only on the top-K docs.
let ff_reader = segment_reader
.fast_fields()
.typed_fast_field_reader(segment_reader.schema().get_field_name(self.field))?;
Ok(ScorerByFastFieldReader { ff_reader })
let sort_column_opt = segment_reader.fast_fields().u64_lenient(&self.field)?;
let sort_column = sort_column_opt
.ok_or_else(|| FastFieldNotAvailableError {
field_name: self.field.clone(),
})?
.first_or_default_col(0u64);
Ok(ScorerByFastFieldReader { sort_column })
}
}
@@ -290,9 +294,14 @@ impl TopDocs {
/// the [.order_by_fast_field(...)](TopDocs::order_by_fast_field) method.
pub fn order_by_u64_field(
self,
field: Field,
field: impl ToString,
) -> impl Collector<Fruit = Vec<(u64, DocAddress)>> {
CustomScoreTopCollector::new(ScorerByField { field }, self.0.into_tscore())
CustomScoreTopCollector::new(
ScorerByField {
field: field.to_string(),
},
self.0.into_tscore(),
)
}
/// Set top-K to rank documents by a given fast field.
@@ -367,15 +376,15 @@ impl TopDocs {
/// ```
pub fn order_by_fast_field<TFastValue>(
self,
fast_field: Field,
fast_field: impl ToString,
) -> impl Collector<Fruit = Vec<(TFastValue, DocAddress)>>
where
TFastValue: FastValue,
{
let u64_collector = self.order_by_u64_field(fast_field);
let u64_collector = self.order_by_u64_field(fast_field.to_string());
FastFieldConvertCollector {
collector: u64_collector,
field: fast_field,
field: fast_field.to_string(),
fast_value: PhantomData,
}
}
@@ -877,7 +886,7 @@ mod tests {
});
let searcher = index.reader()?.searcher();
let top_collector = TopDocs::with_limit(4).order_by_u64_field(size);
let top_collector = TopDocs::with_limit(4).order_by_u64_field(SIZE);
let top_docs: Vec<(u64, DocAddress)> = searcher.search(&query, &top_collector)?;
assert_eq!(
&top_docs[..],
@@ -916,7 +925,7 @@ mod tests {
))?;
index_writer.commit()?;
let searcher = index.reader()?.searcher();
let top_collector = TopDocs::with_limit(3).order_by_fast_field(birthday);
let top_collector = TopDocs::with_limit(3).order_by_fast_field("birthday");
let top_docs: Vec<(DateTime, DocAddress)> = searcher.search(&AllQuery, &top_collector)?;
assert_eq!(
&top_docs[..],
@@ -946,7 +955,7 @@ mod tests {
))?;
index_writer.commit()?;
let searcher = index.reader()?.searcher();
let top_collector = TopDocs::with_limit(3).order_by_fast_field(altitude);
let top_collector = TopDocs::with_limit(3).order_by_fast_field("altitude");
let top_docs: Vec<(i64, DocAddress)> = searcher.search(&AllQuery, &top_collector)?;
assert_eq!(
&top_docs[..],
@@ -976,7 +985,7 @@ mod tests {
))?;
index_writer.commit()?;
let searcher = index.reader()?.searcher();
let top_collector = TopDocs::with_limit(3).order_by_fast_field(altitude);
let top_collector = TopDocs::with_limit(3).order_by_fast_field("altitude");
let top_docs: Vec<(f64, DocAddress)> = searcher.search(&AllQuery, &top_collector)?;
assert_eq!(
&top_docs[..],
@@ -1004,7 +1013,7 @@ mod tests {
.unwrap();
});
let searcher = index.reader().unwrap().searcher();
let top_collector = TopDocs::with_limit(4).order_by_u64_field(Field::from_field_id(2));
let top_collector = TopDocs::with_limit(4).order_by_u64_field("missing_field");
let segment_reader = searcher.segment_reader(0u32);
top_collector
.for_segment(0, segment_reader)
@@ -1022,7 +1031,7 @@ mod tests {
index_writer.commit()?;
let searcher = index.reader()?.searcher();
let segment = searcher.segment_reader(0);
let top_collector = TopDocs::with_limit(4).order_by_u64_field(size);
let top_collector = TopDocs::with_limit(4).order_by_u64_field(SIZE);
let err = top_collector.for_segment(0, segment).err().unwrap();
assert!(matches!(err, crate::TantivyError::SchemaError(_)));
Ok(())
@@ -1039,7 +1048,7 @@ mod tests {
index_writer.commit()?;
let searcher = index.reader()?.searcher();
let segment = searcher.segment_reader(0);
let top_collector = TopDocs::with_limit(4).order_by_fast_field::<i64>(size);
let top_collector = TopDocs::with_limit(4).order_by_fast_field::<i64>(SIZE);
let err = top_collector.for_segment(0, segment).err().unwrap();
assert!(
matches!(err, crate::TantivyError::SchemaError(msg) if msg == "Field \"size\" is not a fast field.")

View File

@@ -19,7 +19,7 @@ use crate::error::{DataCorruption, TantivyError};
use crate::indexer::index_writer::{MAX_NUM_THREAD, MEMORY_ARENA_NUM_BYTES_MIN};
use crate::indexer::segment_updater::save_metas;
use crate::reader::{IndexReader, IndexReaderBuilder};
use crate::schema::{Cardinality, Field, FieldType, Schema};
use crate::schema::{Field, FieldType, Schema};
use crate::tokenizer::{TextAnalyzer, TokenizerManager};
use crate::IndexWriter;
@@ -93,7 +93,7 @@ fn save_new_metas(
/// let body_field = schema_builder.add_text_field("body", TEXT);
/// let number_field = schema_builder.add_u64_field(
/// "number",
/// NumericOptions::default().set_fast(Cardinality::SingleValue),
/// NumericOptions::default().set_fast(),
/// );
///
/// let schema = schema_builder.build();
@@ -245,12 +245,6 @@ impl IndexBuilder {
sort_by_field.field
)));
}
if entry.field_type().fastfield_cardinality() != Some(Cardinality::SingleValue) {
return Err(TantivyError::InvalidArgument(format!(
"Only single value fast field Cardinality supported for sorting index {}",
sort_by_field.field
)));
}
}
Ok(())
} else {

View File

@@ -7,7 +7,7 @@ use fail::fail_point;
use crate::core::{InvertedIndexReader, Segment, SegmentComponent, SegmentId};
use crate::directory::{CompositeFile, FileSlice};
use crate::error::DataCorruption;
use crate::fastfield::{intersect_alive_bitsets, AliveBitSet, FacetReader, FastFieldReaders};
use crate::fastfield::{intersect_alive_bitsets, AliveBitSet, FastFieldReaders};
use crate::fieldnorm::{FieldNormReader, FieldNormReaders};
use crate::schema::{Field, FieldType, IndexRecordOption, Schema};
use crate::space_usage::SegmentSpaceUsage;
@@ -90,25 +90,8 @@ impl SegmentReader {
}
/// Accessor to the `FacetReader` associated with a given `Field`.
pub fn facet_reader(&self, field: Field) -> crate::Result<FacetReader> {
let field_entry = self.schema.get_field_entry(field);
match field_entry.field_type() {
FieldType::Facet(_) => {
let term_ords_reader =
self.fast_fields().u64s(self.schema.get_field_name(field))?;
let termdict = self
.termdict_composite
.open_read(field)
.map(TermDictionary::open)
.unwrap_or_else(|| Ok(TermDictionary::empty()))?;
Ok(FacetReader::new(term_ords_reader, termdict))
}
_ => Err(crate::TantivyError::InvalidArgument(format!(
"Field {:?} is not a facet field.",
field_entry.name()
))),
}
pub fn facet_reader(&self, field: Field) -> crate::Result<()> {
todo!();
}
/// Accessor to the segment's `Field norms`'s reader.
@@ -170,9 +153,7 @@ impl SegmentReader {
let schema = segment.schema();
let fast_fields_data = segment.open_read(SegmentComponent::FastFields)?;
let fast_fields_composite = CompositeFile::open(&fast_fields_data)?;
let fast_fields_readers =
Arc::new(FastFieldReaders::new(schema.clone(), fast_fields_composite));
let fast_fields_readers = Arc::new(FastFieldReaders::open(fast_fields_data)?);
let fieldnorm_data = segment.open_read(SegmentComponent::FieldNorms)?;
let fieldnorm_readers = FieldNormReaders::open(fieldnorm_data)?;

View File

@@ -8,7 +8,7 @@ use crate::schema::FieldEntry;
#[derive(Debug, Error)]
#[error("Fast field not available: '{field_name:?}'")]
pub struct FastFieldNotAvailableError {
field_name: String,
pub(crate) field_name: String,
}
impl FastFieldNotAvailableError {

File diff suppressed because it is too large Load Diff

View File

@@ -38,7 +38,7 @@ mod tests {
let mut schema_builder = Schema::builder();
let field = schema_builder.add_u64_field(
"multifield",
NumericOptions::default().set_fast(Cardinality::MultiValues),
NumericOptions::default().set_fast(),
);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
@@ -74,7 +74,7 @@ mod tests {
let date_field = schema_builder.add_date_field(
"multi_date_field",
DateOptions::default()
.set_fast(Cardinality::MultiValues)
.set_fast()
.set_indexed()
.set_fieldnorm()
.set_stored(),
@@ -215,7 +215,7 @@ mod tests {
let mut schema_builder = Schema::builder();
let field = schema_builder.add_i64_field(
"multifield",
NumericOptions::default().set_fast(Cardinality::MultiValues),
NumericOptions::default().set_fast(),
);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
@@ -246,7 +246,7 @@ mod tests {
let mut schema_builder = Schema::builder();
let bool_field = schema_builder.add_bool_field(
"multifield",
NumericOptions::default().set_fast(Cardinality::MultiValues),
NumericOptions::default().set_fast(),
);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
@@ -278,7 +278,7 @@ mod tests {
let field = schema_builder.add_u64_field(
"multifield",
NumericOptions::default()
.set_fast(Cardinality::MultiValues)
.set_fast()
.set_indexed(),
);
let schema = schema_builder.build();
@@ -424,7 +424,7 @@ mod bench {
let mut builder = crate::schema::SchemaBuilder::new();
let fast_multi =
crate::schema::NumericOptions::default().set_fast(Cardinality::MultiValues);
crate::schema::NumericOptions::default().set_fast();
let multi_field = builder.add_f64_field("f64s", fast_multi);
let index = crate::Index::create_in_ram(builder.build());
@@ -504,7 +504,7 @@ mod bench {
let path = Path::new("test");
let directory: RamDirectory = RamDirectory::create();
let field = {
let options = NumericOptions::default().set_fast(Cardinality::MultiValues);
let options = NumericOptions::default().set_fast();
let mut schema_builder = Schema::builder();
let field = schema_builder.add_u64_field("field", options);
let schema = schema_builder.build();
@@ -562,7 +562,7 @@ mod bench {
b.iter(|| {
let directory: RamDirectory = RamDirectory::create();
let options = NumericOptions::default().set_fast(Cardinality::MultiValues);
let options = NumericOptions::default().set_fast();
let mut schema_builder = Schema::builder();
let field = schema_builder.add_u64_field("field", options);
let schema = schema_builder.build();
@@ -595,7 +595,7 @@ mod bench {
b.iter(|| {
let directory: RamDirectory = RamDirectory::create();
let options = NumericOptions::default().set_fast(Cardinality::MultiValues);
let options = NumericOptions::default().set_fast();
let mut schema_builder = Schema::builder();
let field = schema_builder.add_u64_field("field", options);
let schema = schema_builder.build();

View File

@@ -137,7 +137,7 @@ mod tests {
let date_field = schema_builder.add_date_field(
"multi_date_field",
DateOptions::default()
.set_fast(Cardinality::MultiValues)
.set_fast()
.set_indexed()
.set_fieldnorm()
.set_precision(DatePrecision::Microseconds)
@@ -188,7 +188,7 @@ mod tests {
let date_field = schema_builder.add_date_field(
"multi_date_field",
DateOptions::default()
.set_fast(Cardinality::MultiValues)
.set_fast()
// TODO: Test different precision after fixing https://github.com/quickwit-oss/tantivy/issues/1783
.set_precision(DatePrecision::Microseconds)
.set_indexed()
@@ -307,7 +307,7 @@ mod tests {
let mut schema_builder = Schema::builder();
let field_options = NumericOptions::default()
.set_indexed()
.set_fast(Cardinality::MultiValues);
.set_fast();
let item_field = schema_builder.add_i64_field("items", field_options);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);

View File

@@ -1,12 +1,16 @@
use std::io;
use std::net::Ipv6Addr;
use std::sync::Arc;
use columnar::{
BytesColumn, ColumnType, ColumnValues, ColumnarReader, DynamicColumn, DynamicColumnHandle,
HasAssociatedColumnType, NumericalType, StrColumn,
};
use fastfield_codecs::{open, open_u128, Column};
use super::multivalued::MultiValuedFastFieldReader;
use crate::directory::{CompositeFile, FileSlice};
use crate::fastfield::{BytesFastFieldReader, FastFieldNotAvailableError, FastValue};
use crate::schema::{Cardinality, Field, FieldType, Schema};
use crate::fastfield::{FastFieldNotAvailableError, FastValue};
use crate::schema::{Field, FieldType, Schema};
use crate::space_usage::PerFieldSpaceUsage;
use crate::{DateTime, TantivyError};
@@ -16,315 +20,167 @@ use crate::{DateTime, TantivyError};
/// and just wraps several `HashMap`.
#[derive(Clone)]
pub struct FastFieldReaders {
schema: Schema,
fast_fields_composite: CompositeFile,
}
#[derive(Eq, PartialEq, Debug)]
pub(crate) enum FastType {
I64,
U64,
U128,
F64,
Bool,
Date,
}
pub(crate) fn type_and_cardinality(field_type: &FieldType) -> Option<(FastType, Cardinality)> {
match field_type {
FieldType::U64(options) => options
.get_fastfield_cardinality()
.map(|cardinality| (FastType::U64, cardinality)),
FieldType::I64(options) => options
.get_fastfield_cardinality()
.map(|cardinality| (FastType::I64, cardinality)),
FieldType::F64(options) => options
.get_fastfield_cardinality()
.map(|cardinality| (FastType::F64, cardinality)),
FieldType::Bool(options) => options
.get_fastfield_cardinality()
.map(|cardinality| (FastType::Bool, cardinality)),
FieldType::Date(options) => options
.get_fastfield_cardinality()
.map(|cardinality| (FastType::Date, cardinality)),
FieldType::Facet(_) => Some((FastType::U64, Cardinality::MultiValues)),
FieldType::Str(options) if options.is_fast() => {
Some((FastType::U64, Cardinality::MultiValues))
}
FieldType::IpAddr(options) => options
.get_fastfield_cardinality()
.map(|cardinality| (FastType::U128, cardinality)),
_ => None,
}
columnar: Arc<ColumnarReader>,
}
impl FastFieldReaders {
pub(crate) fn new(schema: Schema, fast_fields_composite: CompositeFile) -> FastFieldReaders {
FastFieldReaders {
schema,
fast_fields_composite,
}
pub(crate) fn open(fast_field_file: FileSlice) -> io::Result<FastFieldReaders> {
let columnar = Arc::new(ColumnarReader::open(fast_field_file)?);
Ok(FastFieldReaders { columnar })
}
pub(crate) fn space_usage(&self) -> PerFieldSpaceUsage {
self.fast_fields_composite.space_usage()
todo!()
}
#[doc(hidden)]
pub fn fast_field_data(&self, field: Field, idx: usize) -> crate::Result<FileSlice> {
self.fast_fields_composite
.open_read_with_idx(field, idx)
.ok_or_else(|| {
let field_name = self.schema.get_field_entry(field).name();
TantivyError::SchemaError(format!("Field({}) data was not found", field_name))
})
}
fn check_type(
pub fn typed_column_opt<T>(
&self,
field: Field,
expected_fast_type: FastType,
expected_cardinality: Cardinality,
) -> crate::Result<()> {
let field_entry = self.schema.get_field_entry(field);
let (fast_type, cardinality) =
type_and_cardinality(field_entry.field_type()).ok_or_else(|| {
crate::TantivyError::SchemaError(format!(
"Field {:?} is not a fast field.",
field_entry.name()
))
})?;
if fast_type != expected_fast_type {
return Err(crate::TantivyError::SchemaError(format!(
"Field {:?} is of type {:?}, expected {:?}.",
field_entry.name(),
fast_type,
expected_fast_type
)));
field_name: &str,
) -> crate::Result<Option<columnar::Column<T>>>
where
T: PartialOrd + Copy + HasAssociatedColumnType + Send + Sync + 'static,
DynamicColumn: Into<Option<columnar::Column<T>>>,
{
let column_type = T::column_type();
let Some(dynamic_column_handle) = self.column_handle(field_name, column_type)?
else {
return Ok(None);
};
let dynamic_column = dynamic_column_handle.open()?;
Ok(dynamic_column.into())
}
pub fn bytes_column_opt(&self, field_name: &str) -> crate::Result<Option<BytesColumn>> {
let Some(dynamic_column_handle) = self.column_handle(field_name, ColumnType::Bytes)?
else {
return Ok(None);
};
let dynamic_column = dynamic_column_handle.open()?;
Ok(dynamic_column.into())
}
pub fn str_column_opt(&self, field_name: &str) -> crate::Result<Option<StrColumn>> {
let Some(dynamic_column_handle) = self.column_handle(field_name, ColumnType::Str)?
else {
return Ok(None);
};
let dynamic_column = dynamic_column_handle.open()?;
Ok(dynamic_column.into())
}
pub fn column_num_bytes(&self, field: &str) -> crate::Result<usize> {
Ok(self
.columnar
.read_columns(field)?
.into_iter()
.map(|column_handle| column_handle.num_bytes())
.sum())
}
pub fn typed_column_first_or_default<T>(&self, field: &str) -> crate::Result<Arc<dyn Column<T>>>
where
T: PartialOrd + Copy + HasAssociatedColumnType + Send + Sync + 'static,
DynamicColumn: Into<Option<columnar::Column<T>>>,
{
let col_opt: Option<columnar::Column<T>> = self.typed_column_opt(field)?;
if let Some(col) = col_opt {
Ok(col.first_or_default_col(T::default_value()))
} else {
todo!();
}
if cardinality != expected_cardinality {
return Err(crate::TantivyError::SchemaError(format!(
"Field {:?} is of cardinality {:?}, expected {:?}.",
field_entry.name(),
cardinality,
expected_cardinality
)));
}
Ok(())
}
pub(crate) fn typed_fast_field_reader_with_idx<TFastValue: FastValue>(
&self,
field_name: &str,
index: usize,
) -> crate::Result<Arc<dyn Column<TFastValue>>> {
let field = self.schema.get_field(field_name)?;
let fast_field_slice = self.fast_field_data(field, index)?;
let bytes = fast_field_slice.read_bytes()?;
let column = fastfield_codecs::open(bytes)?;
Ok(column)
}
pub(crate) fn typed_fast_field_reader<TFastValue: FastValue>(
&self,
field_name: &str,
) -> crate::Result<Arc<dyn Column<TFastValue>>> {
self.typed_fast_field_reader_with_idx(field_name, 0)
}
pub(crate) fn typed_fast_field_multi_reader<TFastValue: FastValue>(
&self,
field_name: &str,
) -> crate::Result<MultiValuedFastFieldReader<TFastValue>> {
let idx_reader = self.typed_fast_field_reader(field_name)?;
let vals_reader = self.typed_fast_field_reader_with_idx(field_name, 1)?;
Ok(MultiValuedFastFieldReader::open(idx_reader, vals_reader))
}
/// Returns the `u64` fast field reader reader associated with `field`.
///
/// If `field` is not a u64 fast field, this method returns an Error.
pub fn u64(&self, field_name: &str) -> crate::Result<Arc<dyn Column<u64>>> {
self.check_type(
self.schema.get_field(field_name)?,
FastType::U64,
Cardinality::SingleValue,
)?;
self.typed_fast_field_reader(field_name)
pub fn u64(&self, field: &str) -> crate::Result<Arc<dyn ColumnValues<u64>>> {
self.typed_column_first_or_default(field)
}
/// Returns the `date` fast field reader reader associated with `field`.
///
/// If `field` is not a date fast field, this method returns an Error.
pub fn date(&self, field: &str) -> crate::Result<Arc<dyn ColumnValues<columnar::DateTime>>> {
self.typed_column_first_or_default(field)
}
/// Returns the `ip` fast field reader reader associated to `field`.
///
/// If `field` is not a u128 fast field, this method returns an Error.
pub fn ip_addr(&self, field_name: &str) -> crate::Result<Arc<dyn Column<Ipv6Addr>>> {
let field = self.schema.get_field(field_name)?;
self.check_type(field, FastType::U128, Cardinality::SingleValue)?;
let bytes = self.fast_field_data(field, 0)?.read_bytes()?;
Ok(open_u128::<Ipv6Addr>(bytes)?)
pub fn ip_addr(&self, field: &str) -> crate::Result<Arc<dyn Column<Ipv6Addr>>> {
self.typed_column_first_or_default(field)
}
/// Returns the `ip` fast field reader reader associated to `field`.
///
/// If `field` is not a u128 fast field, this method returns an Error.
pub fn ip_addrs(
pub fn str(&self, field: &str) -> crate::Result<Option<columnar::StrColumn>> {
self.str_column_opt(field)
}
pub fn bytes(&self, field: &str) -> crate::Result<Option<columnar::BytesColumn>> {
self.bytes_column_opt(field)
}
pub fn column_handle(
&self,
field_name: &str,
) -> crate::Result<MultiValuedFastFieldReader<Ipv6Addr>> {
let field = self.schema.get_field(field_name)?;
self.check_type(field, FastType::U128, Cardinality::MultiValues)?;
let idx_reader: Arc<dyn Column<u64>> = self.typed_fast_field_reader(field_name)?;
let bytes = self.fast_field_data(field, 1)?.read_bytes()?;
let vals_reader = open_u128::<Ipv6Addr>(bytes)?;
Ok(MultiValuedFastFieldReader::open(idx_reader, vals_reader))
column_type: ColumnType,
) -> crate::Result<Option<DynamicColumnHandle>> {
let dynamic_column_handle_opt = self
.columnar
.read_columns(field_name)?
.into_iter()
.filter(|column| column.column_type() == column_type)
.next();
Ok(dynamic_column_handle_opt)
}
/// Returns the `u128` fast field reader reader associated to `field`.
///
/// If `field` is not a u128 fast field, this method returns an Error.
pub(crate) fn u128(&self, field_name: &str) -> crate::Result<Arc<dyn Column<u128>>> {
let field = self.schema.get_field(field_name)?;
self.check_type(field, FastType::U128, Cardinality::SingleValue)?;
let bytes = self.fast_field_data(field, 0)?.read_bytes()?;
Ok(open_u128::<u128>(bytes)?)
}
/// Returns the `u128` multi-valued fast field reader reader associated to `field`.
///
/// If `field` is not a u128 multi-valued fast field, this method returns an Error.
pub fn u128s(&self, field_name: &str) -> crate::Result<MultiValuedFastFieldReader<u128>> {
let field = self.schema.get_field(field_name)?;
self.check_type(field, FastType::U128, Cardinality::MultiValues)?;
let idx_reader: Arc<dyn Column<u64>> =
self.typed_fast_field_reader(self.schema.get_field_name(field))?;
let bytes = self.fast_field_data(field, 1)?.read_bytes()?;
let vals_reader = open_u128::<u128>(bytes)?;
Ok(MultiValuedFastFieldReader::open(idx_reader, vals_reader))
}
/// Returns the `u64` fast field reader reader associated with `field`, regardless of whether
/// the given field is effectively of type `u64` or not.
///
/// If not, the fastfield reader will returns the u64-value associated with the original
/// FastValue.
pub fn u64_lenient(&self, field_name: &str) -> crate::Result<Arc<dyn Column<u64>>> {
self.typed_fast_field_reader(field_name)
pub fn u64_lenient(&self, field_name: &str) -> crate::Result<Option<columnar::Column<u64>>> {
for col in self.columnar.read_columns(field_name)? {
if let Some(col_u64) = col.open_u64_lenient()? {
return Ok(Some(col_u64));
}
}
Ok(None)
}
/// Returns the `i64` fast field reader reader associated with `field`.
///
/// If `field` is not a i64 fast field, this method returns an Error.
pub fn i64(&self, field_name: &str) -> crate::Result<Arc<dyn Column<i64>>> {
let field = self.schema.get_field(field_name)?;
self.check_type(field, FastType::I64, Cardinality::SingleValue)?;
self.typed_fast_field_reader(self.schema.get_field_name(field))
}
/// Returns the `date` fast field reader reader associated with `field`.
///
/// If `field` is not a date fast field, this method returns an Error.
pub fn date(&self, field_name: &str) -> crate::Result<Arc<dyn Column<DateTime>>> {
let field = self.schema.get_field(field_name)?;
self.check_type(field, FastType::Date, Cardinality::SingleValue)?;
self.typed_fast_field_reader(field_name)
self.typed_column_first_or_default(field_name)
}
/// Returns the `f64` fast field reader reader associated with `field`.
///
/// If `field` is not a f64 fast field, this method returns an Error.
pub fn f64(&self, field_name: &str) -> crate::Result<Arc<dyn Column<f64>>> {
let field = self.schema.get_field(field_name)?;
self.check_type(field, FastType::F64, Cardinality::SingleValue)?;
self.typed_fast_field_reader(field_name)
self.typed_column_first_or_default(field_name)
}
/// Returns the `bool` fast field reader reader associated with `field`.
///
/// If `field` is not a bool fast field, this method returns an Error.
pub fn bool(&self, field_name: &str) -> crate::Result<Arc<dyn Column<bool>>> {
let field = self.schema.get_field(field_name)?;
self.check_type(field, FastType::Bool, Cardinality::SingleValue)?;
self.typed_fast_field_reader(field_name)
self.typed_column_first_or_default(field_name)
}
/// Returns a `u64s` multi-valued fast field reader reader associated with `field`.
///
/// If `field` is not a u64 multi-valued fast field, this method returns an Error.
pub fn u64s(&self, field_name: &str) -> crate::Result<MultiValuedFastFieldReader<u64>> {
let field = self.schema.get_field(field_name)?;
self.check_type(field, FastType::U64, Cardinality::MultiValues)?;
self.typed_fast_field_multi_reader(field_name)
}
/// Returns a `u64s` multi-valued fast field reader reader associated with `field`, regardless
/// of whether the given field is effectively of type `u64` or not.
///
/// If `field` is not a u64 multi-valued fast field, this method returns an Error.
pub fn u64s_lenient(&self, field_name: &str) -> crate::Result<MultiValuedFastFieldReader<u64>> {
self.typed_fast_field_multi_reader(field_name)
}
/// Returns a `i64s` multi-valued fast field reader reader associated with `field`.
///
/// If `field` is not a i64 multi-valued fast field, this method returns an Error.
pub fn i64s(&self, field_name: &str) -> crate::Result<MultiValuedFastFieldReader<i64>> {
let field = self.schema.get_field(field_name)?;
self.check_type(field, FastType::I64, Cardinality::MultiValues)?;
self.typed_fast_field_multi_reader(self.schema.get_field_name(field))
}
/// Returns a `f64s` multi-valued fast field reader reader associated with `field`.
///
/// If `field` is not a f64 multi-valued fast field, this method returns an Error.
pub fn f64s(&self, field_name: &str) -> crate::Result<MultiValuedFastFieldReader<f64>> {
let field = self.schema.get_field(field_name)?;
self.check_type(field, FastType::F64, Cardinality::MultiValues)?;
self.typed_fast_field_multi_reader(self.schema.get_field_name(field))
}
/// Returns a `bools` multi-valued fast field reader reader associated with `field`.
///
/// If `field` is not a bool multi-valued fast field, this method returns an Error.
pub fn bools(&self, field_name: &str) -> crate::Result<MultiValuedFastFieldReader<bool>> {
let field = self.schema.get_field(field_name)?;
self.check_type(field, FastType::Bool, Cardinality::MultiValues)?;
self.typed_fast_field_multi_reader(self.schema.get_field_name(field))
}
/// Returns a `time::OffsetDateTime` multi-valued fast field reader reader associated with
/// `field`.
///
/// If `field` is not a `time::OffsetDateTime` multi-valued fast field, this method returns an
/// Error.
pub fn dates(&self, field_name: &str) -> crate::Result<MultiValuedFastFieldReader<DateTime>> {
let field = self.schema.get_field(field_name)?;
self.check_type(field, FastType::Date, Cardinality::MultiValues)?;
self.typed_fast_field_multi_reader(self.schema.get_field_name(field))
}
/// Returns the `bytes` fast field reader associated with `field`.
///
/// If `field` is not a bytes fast field, returns an Error.
pub fn bytes(&self, field_name: &str) -> crate::Result<BytesFastFieldReader> {
let field = self.schema.get_field(field_name)?;
let field_entry = self.schema.get_field_entry(field);
if let FieldType::Bytes(bytes_option) = field_entry.field_type() {
if !bytes_option.is_fast() {
return Err(crate::TantivyError::SchemaError(format!(
"Field {:?} is not a fast field.",
field_entry.name()
)));
}
let fast_field_idx_file = self.fast_field_data(field, 0)?;
let fast_field_idx_bytes = fast_field_idx_file.read_bytes()?;
let idx_reader = open(fast_field_idx_bytes)?;
let data = self.fast_field_data(field, 1)?;
BytesFastFieldReader::open(idx_reader, data)
} else {
Err(FastFieldNotAvailableError::new(field_entry).into())
}
}
// Returns the `bytes` fast field reader associated with `field`.
//
// If `field` is not a bytes fast field, returns an Error.
// pub fn bytes(&self, field: Field) -> crate::Result<BytesFastFieldReader> {
// let field_entry = self.schema.get_field_entry(field);
// if let FieldType::Bytes(bytes_option) = field_entry.field_type() {
// if !bytes_option.is_fast() {
// return Err(crate::TantivyError::SchemaError(format!(
// "Field {:?} is not a fast field.",
// field_entry.name()
// )));
// }
// let fast_field_idx_file = self.fast_field_data(field, 0)?;
// let fast_field_idx_bytes = fast_field_idx_file.read_bytes()?;
// let idx_reader = open(fast_field_idx_bytes)?;
// let data = self.fast_field_data(field, 1)?;
// BytesFastFieldReader::open(idx_reader, data)
// } else {
// Err(FastFieldNotAvailableError::new(field_entry).into())
// }
// }
}

View File

@@ -1,558 +1,150 @@
use std::collections::HashMap;
use std::io;
use columnar::{ColumnType, ColumnarWriter, NumericalType, NumericalValue};
use common;
use fastfield_codecs::{Column, MonotonicallyMappableToU128, MonotonicallyMappableToU64};
use rustc_hash::FxHashMap;
use tantivy_bitpacker::BlockedBitpacker;
use super::multivalued::{MultiValueU128FastFieldWriter, MultiValuedFastFieldWriter};
use super::FastFieldType;
use crate::fastfield::{BytesFastFieldWriter, CompositeFastFieldSerializer};
use crate::fastfield::CompositeFastFieldSerializer;
use crate::indexer::doc_id_mapping::DocIdMapping;
use crate::postings::UnorderedTermId;
use crate::schema::{Cardinality, Document, Field, FieldEntry, FieldType, Schema, Value};
use crate::schema::{Document, Field, FieldEntry, FieldType, Schema, Type, Value};
use crate::termdict::TermOrdinal;
use crate::DatePrecision;
use crate::{DatePrecision, DocId};
/// The `FastFieldsWriter` groups all of the fast field writers.
pub struct FastFieldsWriter {
term_id_writers: Vec<MultiValuedFastFieldWriter>,
single_value_writers: Vec<IntFastFieldWriter>,
u128_value_writers: Vec<U128FastFieldWriter>,
u128_multi_value_writers: Vec<MultiValueU128FastFieldWriter>,
multi_values_writers: Vec<MultiValuedFastFieldWriter>,
bytes_value_writers: Vec<BytesFastFieldWriter>,
}
pub(crate) fn unexpected_value(expected: &str, actual: &Value) -> crate::TantivyError {
crate::TantivyError::SchemaError(format!(
"Expected a {:?} in fast field, but got {:?}",
expected, actual
))
}
fn fast_field_default_value(field_entry: &FieldEntry) -> u64 {
match *field_entry.field_type() {
FieldType::I64(_) | FieldType::Date(_) => common::i64_to_u64(0i64),
FieldType::F64(_) => common::f64_to_u64(0.0f64),
_ => 0u64,
}
columnar_writer: ColumnarWriter,
fast_field_names: Vec<Option<String>>, //< TODO see if we can cash the field name hash too.
date_precisions: Vec<DatePrecision>,
num_docs: DocId,
}
impl FastFieldsWriter {
/// Create all `FastFieldWriter` required by the schema.
pub fn from_schema(schema: &Schema) -> FastFieldsWriter {
let mut u128_value_writers = Vec::new();
let mut u128_multi_value_writers = Vec::new();
let mut single_value_writers = Vec::new();
let mut term_id_writers = Vec::new();
let mut multi_values_writers = Vec::new();
let mut bytes_value_writers = Vec::new();
for (field, field_entry) in schema.fields() {
match field_entry.field_type() {
FieldType::I64(ref int_options)
| FieldType::U64(ref int_options)
| FieldType::F64(ref int_options)
| FieldType::Bool(ref int_options) => {
match int_options.get_fastfield_cardinality() {
Some(Cardinality::SingleValue) => {
let mut fast_field_writer = IntFastFieldWriter::new(field, None);
let default_value = fast_field_default_value(field_entry);
fast_field_writer.set_val_if_missing(default_value);
single_value_writers.push(fast_field_writer);
}
Some(Cardinality::MultiValues) => {
let fast_field_writer = MultiValuedFastFieldWriter::new(
field,
FastFieldType::Numeric,
None,
);
multi_values_writers.push(fast_field_writer);
}
None => {}
}
}
FieldType::Date(ref options) => match options.get_fastfield_cardinality() {
Some(Cardinality::SingleValue) => {
let mut fast_field_writer =
IntFastFieldWriter::new(field, Some(options.get_precision()));
let default_value = fast_field_default_value(field_entry);
fast_field_writer.set_val_if_missing(default_value);
single_value_writers.push(fast_field_writer);
}
Some(Cardinality::MultiValues) => {
let fast_field_writer = MultiValuedFastFieldWriter::new(
field,
FastFieldType::Numeric,
Some(options.get_precision()),
);
multi_values_writers.push(fast_field_writer);
}
None => {}
},
FieldType::Facet(_) => {
let fast_field_writer =
MultiValuedFastFieldWriter::new(field, FastFieldType::Facet, None);
term_id_writers.push(fast_field_writer);
}
FieldType::Str(_) if field_entry.is_fast() => {
let fast_field_writer =
MultiValuedFastFieldWriter::new(field, FastFieldType::String, None);
term_id_writers.push(fast_field_writer);
}
FieldType::Bytes(bytes_option) => {
if bytes_option.is_fast() {
let fast_field_writer = BytesFastFieldWriter::new(field);
bytes_value_writers.push(fast_field_writer);
}
}
FieldType::IpAddr(opt) => {
if opt.is_fast() {
match opt.get_fastfield_cardinality() {
Some(Cardinality::SingleValue) => {
let fast_field_writer = U128FastFieldWriter::new(field);
u128_value_writers.push(fast_field_writer);
}
Some(Cardinality::MultiValues) => {
let fast_field_writer = MultiValueU128FastFieldWriter::new(field);
u128_multi_value_writers.push(fast_field_writer);
}
None => {}
}
}
}
FieldType::Str(_) | FieldType::JsonObject(_) => {}
let mut columnar_writer = ColumnarWriter::default();
let mut fast_fields: Vec<Option<String>> = vec![None; schema.num_fields()];
let mut date_precisions: Vec<DatePrecision> =
std::iter::repeat_with(DatePrecision::default)
.take(schema.num_fields())
.collect();
// TODO see other types
for (field_id, field_entry) in schema.fields() {
if !field_entry.field_type().is_fast() {
continue;
}
fast_fields[field_id.field_id() as usize] = Some(field_entry.name().to_string());
let column_type = match field_entry.field_type().value_type() {
Type::Str => ColumnType::Str,
Type::U64 => ColumnType::U64,
Type::I64 => ColumnType::I64,
Type::F64 => ColumnType::F64,
Type::Bool => ColumnType::Bool,
Type::Date => ColumnType::DateTime,
Type::Facet => ColumnType::Str,
Type::Bytes => ColumnType::Bytes,
Type::Json => {
continue;
}
Type::IpAddr => ColumnType::IpAddr,
};
if let FieldType::Date(date_options) = field_entry.field_type() {
date_precisions[field_id.field_id() as usize] = date_options.get_precision();
}
columnar_writer.record_column_type(field_entry.name(), column_type);
}
FastFieldsWriter {
u128_value_writers,
u128_multi_value_writers,
term_id_writers,
single_value_writers,
multi_values_writers,
bytes_value_writers,
columnar_writer,
fast_field_names: fast_fields,
num_docs: 0u32,
date_precisions,
}
}
/// The memory used (inclusive childs)
pub fn mem_usage(&self) -> usize {
self.term_id_writers
.iter()
.map(|w| w.mem_usage())
.sum::<usize>()
+ self
.single_value_writers
.iter()
.map(|w| w.mem_usage())
.sum::<usize>()
+ self
.multi_values_writers
.iter()
.map(|w| w.mem_usage())
.sum::<usize>()
+ self
.bytes_value_writers
.iter()
.map(|w| w.mem_usage())
.sum::<usize>()
+ self
.u128_value_writers
.iter()
.map(|w| w.mem_usage())
.sum::<usize>()
+ self
.u128_multi_value_writers
.iter()
.map(|w| w.mem_usage())
.sum::<usize>()
self.columnar_writer.mem_usage()
}
/// Get the `FastFieldWriter` associated with a field.
pub fn get_term_id_writer(&self, field: Field) -> Option<&MultiValuedFastFieldWriter> {
// TODO optimize
self.term_id_writers
.iter()
.find(|field_writer| field_writer.field() == field)
}
/// Get the `FastFieldWriter` associated with a field.
pub fn get_field_writer(&self, field: Field) -> Option<&IntFastFieldWriter> {
// TODO optimize
self.single_value_writers
.iter()
.find(|field_writer| field_writer.field() == field)
}
/// Get the `FastFieldWriter` associated with a field.
pub fn get_field_writer_mut(&mut self, field: Field) -> Option<&mut IntFastFieldWriter> {
// TODO optimize
self.single_value_writers
.iter_mut()
.find(|field_writer| field_writer.field() == field)
}
/// Get the `FastFieldWriter` associated with a field.
pub fn get_term_id_writer_mut(
&mut self,
field: Field,
) -> Option<&mut MultiValuedFastFieldWriter> {
// TODO optimize
self.term_id_writers
.iter_mut()
.find(|field_writer| field_writer.field() == field)
}
/// Returns the fast field multi-value writer for the given field.
///
/// Returns `None` if the field does not exist, or is not
/// configured as a multivalued fastfield in the schema.
pub fn get_multivalue_writer_mut(
&mut self,
field: Field,
) -> Option<&mut MultiValuedFastFieldWriter> {
// TODO optimize
self.multi_values_writers
.iter_mut()
.find(|multivalue_writer| multivalue_writer.field() == field)
}
/// Returns the bytes fast field writer for the given field.
///
/// Returns `None` if the field does not exist, or is not
/// configured as a bytes fastfield in the schema.
pub fn get_bytes_writer_mut(&mut self, field: Field) -> Option<&mut BytesFastFieldWriter> {
// TODO optimize
self.bytes_value_writers
.iter_mut()
.find(|field_writer| field_writer.field() == field)
}
/// Indexes all of the fastfields of a new document.
pub fn add_document(&mut self, doc: &Document) -> crate::Result<()> {
for field_writer in &mut self.term_id_writers {
field_writer.add_document(doc)?;
}
for field_writer in &mut self.single_value_writers {
field_writer.add_document(doc)?;
}
for field_writer in &mut self.multi_values_writers {
field_writer.add_document(doc)?;
}
for field_writer in &mut self.bytes_value_writers {
field_writer.add_document(doc)?;
}
for field_writer in &mut self.u128_value_writers {
field_writer.add_document(doc)?;
}
for field_writer in &mut self.u128_multi_value_writers {
field_writer.add_document(doc)?;
let doc_id = self.num_docs;
for field_value in doc.field_values() {
if let Some(field_name) =
self.fast_field_names[field_value.field().field_id() as usize].as_ref()
{
match &field_value.value {
Value::U64(u64_val) => {
self.columnar_writer.record_numerical(
doc_id,
field_name.as_str(),
NumericalValue::from(*u64_val),
);
}
Value::I64(i64_val) => {
self.columnar_writer.record_numerical(
doc_id,
field_name.as_str(),
NumericalValue::from(*i64_val),
);
}
Value::F64(f64_val) => {
self.columnar_writer.record_numerical(
doc_id,
field_name.as_str(),
NumericalValue::from(*f64_val),
);
}
Value::Str(text_val) => {
self.columnar_writer
.record_str(doc_id, field_name.as_str(), text_val);
}
Value::Bytes(bytes_val) => {
self.columnar_writer
.record_bytes(doc_id, field_name.as_str(), bytes_val);
}
Value::PreTokStr(_) => todo!(),
Value::Bool(bool_val) => {
self.columnar_writer
.record_bool(doc_id, field_name.as_str(), *bool_val);
}
Value::Date(datetime) => {
let date_precision =
self.date_precisions[field_value.field().field_id() as usize];
let truncated_datetime = datetime.truncate(date_precision);
self.columnar_writer.record_datetime(
doc_id,
field_name.as_str(),
truncated_datetime.into(),
);
}
Value::Facet(_) => todo!(),
Value::JsonObject(_) => todo!(),
Value::IpAddr(ip_addr) => {
self.columnar_writer
.record_ip_addr(doc_id, field_name.as_str(), *ip_addr);
}
}
}
}
self.num_docs += 1;
Ok(())
}
/// Serializes all of the `FastFieldWriter`s by pushing them in
/// order to the fast field serializer.
pub fn serialize(
self,
serializer: &mut CompositeFastFieldSerializer,
mapping: &HashMap<Field, FxHashMap<UnorderedTermId, TermOrdinal>>,
mut self,
wrt: &mut dyn io::Write,
doc_id_map: Option<&DocIdMapping>,
) -> io::Result<()> {
for field_writer in self.term_id_writers {
let field = field_writer.field();
field_writer.serialize(serializer, mapping.get(&field), doc_id_map)?;
}
for field_writer in &self.single_value_writers {
field_writer.serialize(serializer, doc_id_map)?;
}
for field_writer in self.multi_values_writers {
let field = field_writer.field();
field_writer.serialize(serializer, mapping.get(&field), doc_id_map)?;
}
for field_writer in self.bytes_value_writers {
field_writer.serialize(serializer, doc_id_map)?;
}
for field_writer in self.u128_value_writers {
field_writer.serialize(serializer, doc_id_map)?;
}
for field_writer in self.u128_multi_value_writers {
field_writer.serialize(serializer, doc_id_map)?;
}
assert!(doc_id_map.is_none()); // TODO handle doc id map
let num_docs = self.num_docs;
self.columnar_writer.serialize(num_docs, wrt)?;
Ok(())
}
}
/// Fast field writer for u128 values.
/// The fast field writer just keeps the values in memory.
///
/// Only when the segment writer can be closed and
/// persisted on disk, the fast field writer is
/// sent to a `FastFieldSerializer` via the `.serialize(...)`
/// method.
///
/// We cannot serialize earlier as the values are
/// compressed to a compact number space and the number of
/// bits required for bitpacking can only been known once
/// we have seen all of the values.
pub struct U128FastFieldWriter {
field: Field,
vals: Vec<u128>,
val_count: u32,
}
impl U128FastFieldWriter {
/// Creates a new `IntFastFieldWriter`
pub fn new(field: Field) -> Self {
Self {
field,
vals: vec![],
val_count: 0,
}
}
/// The memory used (inclusive childs)
pub fn mem_usage(&self) -> usize {
self.vals.len() * 16
}
/// Records a new value.
///
/// The n-th value being recorded is implicitely
/// associated to the document with the `DocId` n.
/// (Well, `n-1` actually because of 0-indexing)
pub fn add_val(&mut self, val: u128) {
self.vals.push(val);
}
/// Extract the fast field value from the document
/// (or use the default value) and records it.
///
/// Extract the value associated to the fast field for
/// this document.
pub fn add_document(&mut self, doc: &Document) -> crate::Result<()> {
match doc.get_first(self.field) {
Some(v) => {
let ip_addr = v.as_ip_addr().ok_or_else(|| unexpected_value("ip", v))?;
let value = ip_addr.to_u128();
self.add_val(value);
}
None => {
self.add_val(0); // TODO fix null handling
}
};
self.val_count += 1;
Ok(())
}
/// Push the fast fields value to the `FastFieldWriter`.
pub fn serialize(
&self,
serializer: &mut CompositeFastFieldSerializer,
doc_id_map: Option<&DocIdMapping>,
) -> io::Result<()> {
if let Some(doc_id_map) = doc_id_map {
let iter_gen = || {
doc_id_map
.iter_old_doc_ids()
.map(|idx| self.vals[idx as usize])
};
serializer.create_u128_fast_field_with_idx(self.field, iter_gen, self.val_count, 0)?;
} else {
let iter_gen = || self.vals.iter().cloned();
serializer.create_u128_fast_field_with_idx(self.field, iter_gen, self.val_count, 0)?;
}
Ok(())
}
}
/// Fast field writer for ints.
/// The fast field writer just keeps the values in memory.
///
/// Only when the segment writer can be closed and
/// persisted on disk, the fast field writer is
/// sent to a `FastFieldSerializer` via the `.serialize(...)`
/// method.
///
/// We cannot serialize earlier as the values are
/// bitpacked and the number of bits required for bitpacking
/// can only been known once we have seen all of the values.
///
/// Both u64, i64 and f64 use the same writer.
/// i64 and f64 are just remapped to the `0..2^64 - 1`
/// using `common::i64_to_u64` and `common::f64_to_u64`.
pub struct IntFastFieldWriter {
field: Field,
precision_opt: Option<DatePrecision>,
vals: BlockedBitpacker,
val_count: usize,
val_if_missing: u64,
val_min: u64,
val_max: u64,
}
impl IntFastFieldWriter {
/// Creates a new `IntFastFieldWriter`
pub fn new(field: Field, precision_opt: Option<DatePrecision>) -> IntFastFieldWriter {
IntFastFieldWriter {
field,
precision_opt,
vals: BlockedBitpacker::new(),
val_count: 0,
val_if_missing: 0u64,
val_min: u64::MAX,
val_max: 0,
}
}
/// The memory used (inclusive childs)
pub fn mem_usage(&self) -> usize {
self.vals.mem_usage()
}
/// Returns the field that this writer is targeting.
pub fn field(&self) -> Field {
self.field
}
/// Sets the default value.
///
/// This default value is recorded for documents if
/// a document does not have any value.
fn set_val_if_missing(&mut self, val_if_missing: u64) {
self.val_if_missing = val_if_missing;
}
/// Records a new value.
///
/// The n-th value being recorded is implicitly
/// associated with the document with the `DocId` n.
/// (Well, `n-1` actually because of 0-indexing)
pub fn add_val(&mut self, val: u64) {
self.vals.add(val);
if val > self.val_max {
self.val_max = val;
}
if val < self.val_min {
self.val_min = val;
}
self.val_count += 1;
}
/// Extract the fast field value from the document
/// (or use the default value) and records it.
///
///
/// Extract the value associated with the fast field for
/// this document.
///
/// i64 and f64 are remapped to u64 using the logic
/// in `common::i64_to_u64` and `common::f64_to_u64`.
///
/// If the value is missing, then the default value is used
/// instead.
/// If the document has more than one value for the given field,
/// only the first one is taken in account.
///
/// Values on text fast fields are skipped.
pub fn add_document(&mut self, doc: &Document) -> crate::Result<()> {
match doc.get_first(self.field) {
Some(v) => {
let value = match (self.precision_opt, v) {
(Some(precision), Value::Date(date_val)) => {
date_val.truncate(precision).to_u64()
}
_ => super::value_to_u64(v)?,
};
self.add_val(value);
}
None => {
self.add_val(self.val_if_missing);
}
};
Ok(())
}
/// get iterator over the data
pub(crate) fn iter(&self) -> impl Iterator<Item = u64> + '_ {
self.vals.iter()
}
/// Push the fast fields value to the `FastFieldWriter`.
pub fn serialize(
&self,
serializer: &mut CompositeFastFieldSerializer,
doc_id_map: Option<&DocIdMapping>,
) -> io::Result<()> {
let (min, max) = if self.val_min > self.val_max {
(0, 0)
} else {
(self.val_min, self.val_max)
};
let fastfield_accessor = WriterFastFieldAccessProvider {
doc_id_map,
vals: &self.vals,
min_value: min,
max_value: max,
num_vals: self.val_count as u32,
};
serializer.create_auto_detect_u64_fast_field(self.field, fastfield_accessor)?;
Ok(())
}
}
#[derive(Clone)]
struct WriterFastFieldAccessProvider<'map, 'bitp> {
doc_id_map: Option<&'map DocIdMapping>,
vals: &'bitp BlockedBitpacker,
min_value: u64,
max_value: u64,
num_vals: u32,
}
impl<'map, 'bitp> Column for WriterFastFieldAccessProvider<'map, 'bitp> {
/// Return the value associated with the given doc.
///
/// Whenever possible use the Iterator passed to the fastfield creation instead, for performance
/// reasons.
///
/// # Panics
///
/// May panic if `doc` is greater than the index.
fn get_val(&self, _doc: u32) -> u64 {
unimplemented!()
}
fn iter(&self) -> Box<dyn Iterator<Item = u64> + '_> {
if let Some(doc_id_map) = self.doc_id_map {
Box::new(
doc_id_map
.iter_old_doc_ids()
.map(|doc_id| self.vals.get(doc_id as usize)),
)
} else {
Box::new(self.vals.iter())
}
}
fn min_value(&self) -> u64 {
self.min_value
}
fn max_value(&self) -> u64 {
self.max_value
}
fn num_vals(&self) -> u32 {
self.num_vals
}
}

View File

@@ -113,34 +113,35 @@ pub(crate) fn get_doc_id_mapping_from_field(
sort_by_field: IndexSortByField,
segment_writer: &SegmentWriter,
) -> crate::Result<DocIdMapping> {
let schema = segment_writer.segment_serializer.segment().schema();
let field_id = expect_field_id_for_sort_field(&schema, &sort_by_field)?; // for now expect fastfield, but not strictly required
let fast_field = segment_writer
.fast_field_writers
.get_field_writer(field_id)
.ok_or_else(|| {
TantivyError::InvalidArgument(format!(
"sort index by field is required to be a fast field {:?}",
sort_by_field.field
))
})?;
todo!()
// let schema = segment_writer.segment_serializer.segment().schema();
// let field_id = expect_field_id_for_sort_field(&schema, &sort_by_field)?; // for now expect
// fastfield, but not strictly required let fast_field = segment_writer
// .fast_field_writers
// .get_field_writer(field_id)
// .ok_or_else(|| {
// TantivyError::InvalidArgument(format!(
// "sort index by field is required to be a fast field {:?}",
// sort_by_field.field
// ))
// })?;
// create new doc_id to old doc_id index (used in fast_field_writers)
let mut doc_id_and_data = fast_field
.iter()
.enumerate()
.map(|el| (el.0 as DocId, el.1))
.collect::<Vec<_>>();
if sort_by_field.order == Order::Desc {
doc_id_and_data.sort_by_key(|k| Reverse(k.1));
} else {
doc_id_and_data.sort_by_key(|k| k.1);
}
let new_doc_id_to_old = doc_id_and_data
.into_iter()
.map(|el| el.0)
.collect::<Vec<_>>();
Ok(DocIdMapping::from_new_id_to_old_id(new_doc_id_to_old))
// // create new doc_id to old doc_id index (used in fast_field_writers)
// let mut doc_id_and_data = fast_field
// .iter()
// .enumerate()
// .map(|el| (el.0 as DocId, el.1))
// .collect::<Vec<_>>();
// if sort_by_field.order == Order::Desc {
// doc_id_and_data.sort_by_key(|k| Reverse(k.1));
// } else {
// doc_id_and_data.sort_by_key(|k| k.1);
// }
// let new_doc_id_to_old = doc_id_and_data
// .into_iter()
// .map(|el| el.0)
// .collect::<Vec<_>>();
// Ok(DocIdMapping::from_new_id_to_old_id(new_doc_id_to_old))
}
#[cfg(test)]
@@ -159,15 +160,11 @@ mod tests_indexsorting {
let my_text_field = schema_builder.add_text_field("text_field", text_field_options);
let my_string_field = schema_builder.add_text_field("string_field", STRING | STORED);
let my_number = schema_builder.add_u64_field(
"my_number",
NumericOptions::default().set_fast(Cardinality::SingleValue),
);
let my_number =
schema_builder.add_u64_field("my_number", NumericOptions::default().set_fast());
let multi_numbers = schema_builder.add_u64_field(
"multi_numbers",
NumericOptions::default().set_fast(Cardinality::MultiValues),
);
let multi_numbers =
schema_builder.add_u64_field("multi_numbers", NumericOptions::default().set_fast());
let schema = schema_builder.build();
let mut index_builder = Index::builder().schema(schema);
@@ -441,47 +438,48 @@ mod tests_indexsorting {
Ok(())
}
#[test]
fn test_sort_index_fast_field() -> crate::Result<()> {
let index = create_test_index(
Some(IndexSettings {
sort_by_field: Some(IndexSortByField {
field: "my_number".to_string(),
order: Order::Asc,
}),
..Default::default()
}),
get_text_options(),
)?;
assert_eq!(
index.settings().sort_by_field.as_ref().unwrap().field,
"my_number".to_string()
);
// #[test]
// fn test_sort_index_fast_field() -> crate::Result<()> {
// let index = create_test_index(
// Some(IndexSettings {
// sort_by_field: Some(IndexSortByField {
// field: "my_number".to_string(),
// order: Order::Asc,
// }),
// ..Default::default()
// }),
// get_text_options(),
// )?;
// assert_eq!(
// index.settings().sort_by_field.as_ref().unwrap().field,
// "my_number".to_string()
// );
let searcher = index.reader()?.searcher();
assert_eq!(searcher.segment_readers().len(), 1);
let segment_reader = searcher.segment_reader(0);
let fast_fields = segment_reader.fast_fields();
index.schema().get_field("my_number").unwrap();
// let searcher = index.reader()?.searcher();
// assert_eq!(searcher.segment_readers().len(), 1);
// let segment_reader = searcher.segment_reader(0);
// let fast_fields = segment_reader.fast_fields();
// let my_number = index.schema().get_field("my_number").unwrap();
let fast_field = fast_fields.u64("my_number").unwrap();
assert_eq!(fast_field.get_val(0), 10u64);
assert_eq!(fast_field.get_val(1), 20u64);
assert_eq!(fast_field.get_val(2), 30u64);
// let fast_field = fast_fields.u64(my_number).unwrap();
// assert_eq!(fast_field.get_val(0), 10u64);
// assert_eq!(fast_field.get_val(1), 20u64);
// assert_eq!(fast_field.get_val(2), 30u64);
let multifield = fast_fields.u64s("multi_numbers").unwrap();
let mut vals = vec![];
multifield.get_vals(0u32, &mut vals);
assert_eq!(vals, &[] as &[u64]);
let mut vals = vec![];
multifield.get_vals(1u32, &mut vals);
assert_eq!(vals, &[5, 6]);
// let multi_numbers = index.schema().get_field("multi_numbers").unwrap();
// let multifield = fast_fields.u64s(multi_numbers).unwrap();
// let mut vals = vec![];
// multifield.get_vals(0u32, &mut vals);
// assert_eq!(vals, &[] as &[u64]);
// let mut vals = vec![];
// multifield.get_vals(1u32, &mut vals);
// assert_eq!(vals, &[5, 6]);
let mut vals = vec![];
multifield.get_vals(2u32, &mut vals);
assert_eq!(vals, &[3]);
Ok(())
}
// let mut vals = vec![];
// multifield.get_vals(2u32, &mut vals);
// assert_eq!(vals, &[3]);
// Ok(())
// }
#[test]
fn test_doc_mapping() {

File diff suppressed because it is too large Load Diff

View File

@@ -150,7 +150,6 @@ fn index_json_value(
json_term_writer.term_buffer,
ctx,
indexing_position,
None,
);
}
TextOrDateTime::DateTime(dt) => {

File diff suppressed because it is too large Load Diff

View File

@@ -2,19 +2,17 @@
mod tests {
use crate::collector::TopDocs;
use crate::core::Index;
use crate::fastfield::{AliveBitSet, MultiValuedFastFieldReader};
use crate::fastfield::AliveBitSet;
use crate::query::QueryParser;
use crate::schema::{
self, BytesOptions, Cardinality, Facet, FacetOptions, IndexRecordOption, NumericOptions,
self, BytesOptions, Facet, FacetOptions, IndexRecordOption, NumericOptions,
TextFieldIndexing, TextOptions,
};
use crate::{DocAddress, DocSet, IndexSettings, IndexSortByField, Order, Postings, Term};
fn create_test_index_posting_list_issue(index_settings: Option<IndexSettings>) -> Index {
let mut schema_builder = schema::Schema::builder();
let int_options = NumericOptions::default()
.set_fast(Cardinality::SingleValue)
.set_indexed();
let int_options = NumericOptions::default().set_fast().set_indexed();
let int_field = schema_builder.add_u64_field("intval", int_options);
let facet_field = schema_builder.add_facet_field("facet", FacetOptions::default());
@@ -62,7 +60,7 @@ mod tests {
) -> crate::Result<Index> {
let mut schema_builder = schema::Schema::builder();
let int_options = NumericOptions::default()
.set_fast(Cardinality::SingleValue)
.set_fast()
.set_stored()
.set_indexed();
let int_field = schema_builder.add_u64_field("intval", int_options);
@@ -71,10 +69,8 @@ mod tests {
let bytes_field = schema_builder.add_bytes_field("bytes", bytes_options);
let facet_field = schema_builder.add_facet_field("facet", FacetOptions::default());
let multi_numbers = schema_builder.add_u64_field(
"multi_numbers",
NumericOptions::default().set_fast(Cardinality::MultiValues),
);
let multi_numbers =
schema_builder.add_u64_field("multi_numbers", NumericOptions::default().set_fast());
let text_field_options = TextOptions::default()
.set_indexing_options(
TextFieldIndexing::default()
@@ -349,128 +345,130 @@ mod tests {
}
}
#[test]
fn test_merge_sorted_index_asc() {
let index = create_test_index(
Some(IndexSettings {
sort_by_field: Some(IndexSortByField {
field: "intval".to_string(),
order: Order::Asc,
}),
..Default::default()
}),
false,
)
.unwrap();
// #[test]
// fn test_merge_sorted_index_asc() {
// let index = create_test_index(
// Some(IndexSettings {
// sort_by_field: Some(IndexSortByField {
// field: "intval".to_string(),
// order: Order::Asc,
// }),
// ..Default::default()
// }),
// false,
// )
// .unwrap();
let int_field = index.schema().get_field("intval").unwrap();
let reader = index.reader().unwrap();
let searcher = reader.searcher();
assert_eq!(searcher.segment_readers().len(), 1);
let segment_reader = searcher.segment_readers().last().unwrap();
// let int_field = index.schema().get_field("intval").unwrap();
// let multi_numbers = index.schema().get_field("multi_numbers").unwrap();
// let bytes_field = index.schema().get_field("bytes").unwrap();
// let reader = index.reader().unwrap();
// let searcher = reader.searcher();
// assert_eq!(searcher.segment_readers().len(), 1);
// let segment_reader = searcher.segment_readers().last().unwrap();
let fast_fields = segment_reader.fast_fields();
let fast_field = fast_fields.u64("intval").unwrap();
assert_eq!(fast_field.get_val(0), 1u64);
assert_eq!(fast_field.get_val(1), 2u64);
assert_eq!(fast_field.get_val(2), 3u64);
assert_eq!(fast_field.get_val(3), 10u64);
assert_eq!(fast_field.get_val(4), 20u64);
assert_eq!(fast_field.get_val(5), 1_000u64);
// let fast_fields = segment_reader.fast_fields();
// let fast_field = fast_fields.u64(int_field).unwrap();
// assert_eq!(fast_field.get_val(0), 1u64);
// assert_eq!(fast_field.get_val(1), 2u64);
// assert_eq!(fast_field.get_val(2), 3u64);
// assert_eq!(fast_field.get_val(3), 10u64);
// assert_eq!(fast_field.get_val(4), 20u64);
// assert_eq!(fast_field.get_val(5), 1_000u64);
let get_vals = |fast_field: &MultiValuedFastFieldReader<u64>, doc_id: u32| -> Vec<u64> {
let mut vals = vec![];
fast_field.get_vals(doc_id, &mut vals);
vals
};
let fast_fields = segment_reader.fast_fields();
let fast_field = fast_fields.u64s("multi_numbers").unwrap();
assert_eq!(&get_vals(&fast_field, 0), &[] as &[u64]);
assert_eq!(&get_vals(&fast_field, 1), &[2, 3]);
assert_eq!(&get_vals(&fast_field, 2), &[3, 4]);
assert_eq!(&get_vals(&fast_field, 3), &[10, 11]);
assert_eq!(&get_vals(&fast_field, 4), &[20]);
assert_eq!(&get_vals(&fast_field, 5), &[1001, 1002]);
// let get_vals = |fast_field: &MultiValuedFastFieldReader<u64>, doc_id: u32| -> Vec<u64> {
// let mut vals = vec![];
// fast_field.get_vals(doc_id, &mut vals);
// vals
// };
// let fast_fields = segment_reader.fast_fields();
// let fast_field = fast_fields.u64s(multi_numbers).unwrap();
// assert_eq!(&get_vals(&fast_field, 0), &[] as &[u64]);
// assert_eq!(&get_vals(&fast_field, 1), &[2, 3]);
// assert_eq!(&get_vals(&fast_field, 2), &[3, 4]);
// assert_eq!(&get_vals(&fast_field, 3), &[10, 11]);
// assert_eq!(&get_vals(&fast_field, 4), &[20]);
// assert_eq!(&get_vals(&fast_field, 5), &[1001, 1002]);
let fast_field = fast_fields.bytes("bytes").unwrap();
assert_eq!(fast_field.get_bytes(0), &[] as &[u8]);
assert_eq!(fast_field.get_bytes(2), &[1, 2, 3]);
assert_eq!(fast_field.get_bytes(5), &[5, 5]);
// let fast_field = fast_fields.bytes(bytes_field).unwrap();
// assert_eq!(fast_field.get_bytes(0), &[] as &[u8]);
// assert_eq!(fast_field.get_bytes(2), &[1, 2, 3]);
// assert_eq!(fast_field.get_bytes(5), &[5, 5]);
// test new field norm mapping
{
let my_text_field = index.schema().get_field("text_field").unwrap();
let fieldnorm_reader = segment_reader.get_fieldnorms_reader(my_text_field).unwrap();
assert_eq!(fieldnorm_reader.fieldnorm(0), 0);
assert_eq!(fieldnorm_reader.fieldnorm(1), 4);
assert_eq!(fieldnorm_reader.fieldnorm(2), 2); // some text
assert_eq!(fieldnorm_reader.fieldnorm(3), 1);
assert_eq!(fieldnorm_reader.fieldnorm(5), 3); // the biggest num
}
// // test new field norm mapping
// {
// let my_text_field = index.schema().get_field("text_field").unwrap();
// let fieldnorm_reader = segment_reader.get_fieldnorms_reader(my_text_field).unwrap();
// assert_eq!(fieldnorm_reader.fieldnorm(0), 0);
// assert_eq!(fieldnorm_reader.fieldnorm(1), 4);
// assert_eq!(fieldnorm_reader.fieldnorm(2), 2); // some text
// assert_eq!(fieldnorm_reader.fieldnorm(3), 1);
// assert_eq!(fieldnorm_reader.fieldnorm(5), 3); // the biggest num
// }
let searcher = index.reader().unwrap().searcher();
{
let my_text_field = index.schema().get_field("text_field").unwrap();
// let searcher = index.reader().unwrap().searcher();
// {
// let my_text_field = index.schema().get_field("text_field").unwrap();
let do_search = |term: &str| {
let query = QueryParser::for_index(&index, vec![my_text_field])
.parse_query(term)
.unwrap();
let top_docs: Vec<(f32, DocAddress)> =
searcher.search(&query, &TopDocs::with_limit(3)).unwrap();
// let do_search = |term: &str| {
// let query = QueryParser::for_index(&index, vec![my_text_field])
// .parse_query(term)
// .unwrap();
// let top_docs: Vec<(f32, DocAddress)> =
// searcher.search(&query, &TopDocs::with_limit(3)).unwrap();
top_docs.iter().map(|el| el.1.doc_id).collect::<Vec<_>>()
};
// top_docs.iter().map(|el| el.1.doc_id).collect::<Vec<_>>()
// };
assert_eq!(do_search("some"), vec![2]);
assert_eq!(do_search("blubber"), vec![3]);
assert_eq!(do_search("biggest"), vec![5]);
}
// assert_eq!(do_search("some"), vec![2]);
// assert_eq!(do_search("blubber"), vec![3]);
// assert_eq!(do_search("biggest"), vec![5]);
// }
// postings file
{
let my_text_field = index.schema().get_field("text_field").unwrap();
let term_a = Term::from_field_text(my_text_field, "text");
let inverted_index = segment_reader.inverted_index(my_text_field).unwrap();
let mut postings = inverted_index
.read_postings(&term_a, IndexRecordOption::WithFreqsAndPositions)
.unwrap()
.unwrap();
// // postings file
// {
// let my_text_field = index.schema().get_field("text_field").unwrap();
// let term_a = Term::from_field_text(my_text_field, "text");
// let inverted_index = segment_reader.inverted_index(my_text_field).unwrap();
// let mut postings = inverted_index
// .read_postings(&term_a, IndexRecordOption::WithFreqsAndPositions)
// .unwrap()
// .unwrap();
assert_eq!(postings.doc_freq(), 2);
let fallback_bitset = AliveBitSet::for_test_from_deleted_docs(&[0], 100);
assert_eq!(
postings.doc_freq_given_deletes(
segment_reader.alive_bitset().unwrap_or(&fallback_bitset)
),
2
);
// assert_eq!(postings.doc_freq(), 2);
// let fallback_bitset = AliveBitSet::for_test_from_deleted_docs(&[0], 100);
// assert_eq!(
// postings.doc_freq_given_deletes(
// segment_reader.alive_bitset().unwrap_or(&fallback_bitset)
// ),
// 2
// );
let mut output = vec![];
postings.positions(&mut output);
assert_eq!(output, vec![1, 3]);
postings.advance();
// let mut output = vec![];
// postings.positions(&mut output);
// assert_eq!(output, vec![1, 3]);
// postings.advance();
postings.positions(&mut output);
assert_eq!(output, vec![1]);
}
// postings.positions(&mut output);
// assert_eq!(output, vec![1]);
// }
// access doc store
{
let doc = searcher.doc(DocAddress::new(0, 0)).unwrap();
assert_eq!(doc.get_first(int_field).unwrap().as_u64(), Some(1));
let doc = searcher.doc(DocAddress::new(0, 1)).unwrap();
assert_eq!(doc.get_first(int_field).unwrap().as_u64(), Some(2));
let doc = searcher.doc(DocAddress::new(0, 2)).unwrap();
assert_eq!(doc.get_first(int_field).unwrap().as_u64(), Some(3));
let doc = searcher.doc(DocAddress::new(0, 3)).unwrap();
assert_eq!(doc.get_first(int_field).unwrap().as_u64(), Some(10));
let doc = searcher.doc(DocAddress::new(0, 4)).unwrap();
assert_eq!(doc.get_first(int_field).unwrap().as_u64(), Some(20));
let doc = searcher.doc(DocAddress::new(0, 5)).unwrap();
assert_eq!(doc.get_first(int_field).unwrap().as_u64(), Some(1_000));
}
}
// // access doc store
// {
// let doc = searcher.doc(DocAddress::new(0, 0)).unwrap();
// assert_eq!(doc.get_first(int_field).unwrap().as_u64(), Some(1));
// let doc = searcher.doc(DocAddress::new(0, 1)).unwrap();
// assert_eq!(doc.get_first(int_field).unwrap().as_u64(), Some(2));
// let doc = searcher.doc(DocAddress::new(0, 2)).unwrap();
// assert_eq!(doc.get_first(int_field).unwrap().as_u64(), Some(3));
// let doc = searcher.doc(DocAddress::new(0, 3)).unwrap();
// assert_eq!(doc.get_first(int_field).unwrap().as_u64(), Some(10));
// let doc = searcher.doc(DocAddress::new(0, 4)).unwrap();
// assert_eq!(doc.get_first(int_field).unwrap().as_u64(), Some(20));
// let doc = searcher.doc(DocAddress::new(0, 5)).unwrap();
// assert_eq!(doc.get_first(int_field).unwrap().as_u64(), Some(1_000));
// }
// }
}
#[cfg(all(test, feature = "unstable"))]
@@ -487,9 +485,7 @@ mod bench_sorted_index_merge {
use crate::{IndexSettings, IndexSortByField, IndexWriter, Order};
fn create_index(sort_by_field: Option<IndexSortByField>) -> Index {
let mut schema_builder = Schema::builder();
let int_options = NumericOptions::default()
.set_fast(Cardinality::SingleValue)
.set_indexed();
let int_options = NumericOptions::default().set_fast().set_indexed();
let int_field = schema_builder.add_u64_field("intval", int_options);
let schema = schema_builder.build();

View File

@@ -19,8 +19,8 @@ mod segment_register;
pub mod segment_serializer;
pub mod segment_updater;
mod segment_writer;
mod sorted_doc_id_column;
mod sorted_doc_id_multivalue_column;
// mod sorted_doc_id_column;
// mod sorted_doc_id_multivalue_column;
mod stamper;
use crossbeam_channel as channel;
@@ -58,7 +58,7 @@ type AddBatchReceiver = channel::Receiver<AddBatch>;
#[cfg(test)]
mod tests_mmap {
use crate::collector::Count;
use crate::query::QueryParser;
// use crate::query::QueryParser;
use crate::schema::{JsonObjectOptions, Schema, TEXT};
use crate::{Index, Term};
@@ -79,45 +79,45 @@ mod tests_mmap {
Ok(())
}
#[test]
fn test_json_field_expand_dots_disabled_dot_escaped_required() {
let mut schema_builder = Schema::builder();
let json_field = schema_builder.add_json_field("json", TEXT);
let index = Index::create_in_ram(schema_builder.build());
let mut index_writer = index.writer_for_tests().unwrap();
let json = serde_json::json!({"k8s.container.name": "prometheus", "val": "hello"});
index_writer.add_document(doc!(json_field=>json)).unwrap();
index_writer.commit().unwrap();
let reader = index.reader().unwrap();
let searcher = reader.searcher();
assert_eq!(searcher.num_docs(), 1);
let parse_query = QueryParser::for_index(&index, Vec::new());
let query = parse_query
.parse_query(r#"json.k8s\.container\.name:prometheus"#)
.unwrap();
let num_docs = searcher.search(&query, &Count).unwrap();
assert_eq!(num_docs, 1);
}
// #[test]
// fn test_json_field_expand_dots_disabled_dot_escaped_required() {
// let mut schema_builder = Schema::builder();
// let json_field = schema_builder.add_json_field("json", TEXT);
// let index = Index::create_in_ram(schema_builder.build());
// let mut index_writer = index.writer_for_tests().unwrap();
// let json = serde_json::json!({"k8s.container.name": "prometheus", "val": "hello"});
// index_writer.add_document(doc!(json_field=>json)).unwrap();
// index_writer.commit().unwrap();
// let reader = index.reader().unwrap();
// let searcher = reader.searcher();
// assert_eq!(searcher.num_docs(), 1);
// let parse_query = QueryParser::for_index(&index, Vec::new());
// let query = parse_query
// .parse_query(r#"json.k8s\.container\.name:prometheus"#)
// .unwrap();
// let num_docs = searcher.search(&query, &Count).unwrap();
// assert_eq!(num_docs, 1);
// }
#[test]
fn test_json_field_expand_dots_enabled_dot_escape_not_required() {
let mut schema_builder = Schema::builder();
let json_options: JsonObjectOptions =
JsonObjectOptions::from(TEXT).set_expand_dots_enabled();
let json_field = schema_builder.add_json_field("json", json_options);
let index = Index::create_in_ram(schema_builder.build());
let mut index_writer = index.writer_for_tests().unwrap();
let json = serde_json::json!({"k8s.container.name": "prometheus", "val": "hello"});
index_writer.add_document(doc!(json_field=>json)).unwrap();
index_writer.commit().unwrap();
let reader = index.reader().unwrap();
let searcher = reader.searcher();
assert_eq!(searcher.num_docs(), 1);
let parse_query = QueryParser::for_index(&index, Vec::new());
let query = parse_query
.parse_query(r#"json.k8s.container.name:prometheus"#)
.unwrap();
let num_docs = searcher.search(&query, &Count).unwrap();
assert_eq!(num_docs, 1);
}
// #[test]
// fn test_json_field_expand_dots_enabled_dot_escape_not_required() {
// let mut schema_builder = Schema::builder();
// let json_options: JsonObjectOptions =
// JsonObjectOptions::from(TEXT).set_expand_dots_enabled();
// let json_field = schema_builder.add_json_field("json", json_options);
// let index = Index::create_in_ram(schema_builder.build());
// let mut index_writer = index.writer_for_tests().unwrap();
// let json = serde_json::json!({"k8s.container.name": "prometheus", "val": "hello"});
// index_writer.add_document(doc!(json_field=>json)).unwrap();
// index_writer.commit().unwrap();
// let reader = index.reader().unwrap();
// let searcher = reader.searcher();
// assert_eq!(searcher.num_docs(), 1);
// let parse_query = QueryParser::for_index(&index, Vec::new());
// let query = parse_query
// .parse_query(r#"json.k8s.container.name:prometheus"#)
// .unwrap();
// let num_docs = searcher.search(&query, &Count).unwrap();
// assert_eq!(num_docs, 1);
// }
}

View File

@@ -1,4 +1,7 @@
use common::TerminatingWrite;
use crate::core::{Segment, SegmentComponent};
use crate::directory::WritePtr;
use crate::fastfield::CompositeFastFieldSerializer;
use crate::fieldnorm::FieldNormsSerializer;
use crate::postings::InvertedIndexSerializer;
@@ -9,7 +12,7 @@ use crate::store::StoreWriter;
pub struct SegmentSerializer {
segment: Segment,
pub(crate) store_writer: StoreWriter,
fast_field_serializer: CompositeFastFieldSerializer,
fast_field_write: WritePtr,
fieldnorms_serializer: Option<FieldNormsSerializer>,
postings_serializer: InvertedIndexSerializer,
}
@@ -47,7 +50,6 @@ impl SegmentSerializer {
};
let fast_field_write = segment.open_write(SegmentComponent::FastFields)?;
let fast_field_serializer = CompositeFastFieldSerializer::from_write(fast_field_write)?;
let fieldnorms_write = segment.open_write(SegmentComponent::FieldNorms)?;
let fieldnorms_serializer = FieldNormsSerializer::from_write(fieldnorms_write)?;
@@ -56,7 +58,7 @@ impl SegmentSerializer {
Ok(SegmentSerializer {
segment,
store_writer,
fast_field_serializer,
fast_field_write,
fieldnorms_serializer: Some(fieldnorms_serializer),
postings_serializer,
})
@@ -81,8 +83,8 @@ impl SegmentSerializer {
}
/// Accessor to the `FastFieldSerializer`.
pub fn get_fast_field_serializer(&mut self) -> &mut CompositeFastFieldSerializer {
&mut self.fast_field_serializer
pub fn get_fast_field_write(&mut self) -> &mut WritePtr {
&mut self.fast_field_write
}
/// Extract the field norm serializer.
@@ -102,7 +104,7 @@ impl SegmentSerializer {
if let Some(fieldnorms_serializer) = self.extract_fieldnorms_serializer() {
fieldnorms_serializer.close()?;
}
self.fast_field_serializer.close()?;
self.fast_field_write.terminate()?;
self.postings_serializer.close()?;
self.store_writer.close()?;
Ok(())

View File

@@ -139,7 +139,6 @@ impl SegmentWriter {
self.ctx,
self.fast_field_writers,
&self.fieldnorms_writer,
&self.schema,
self.segment_serializer,
mapping.as_ref(),
)?;
@@ -185,22 +184,15 @@ impl SegmentWriter {
for value in values {
let facet = value.as_facet().ok_or_else(make_schema_error)?;
let facet_str = facet.encoded_str();
let mut unordered_term_id_opt = None;
FacetTokenizer
.token_stream(facet_str)
.process(&mut |token| {
term_buffer.set_text(&token.text);
let unordered_term_id =
postings_writer.subscribe(doc_id, 0u32, term_buffer, ctx);
// TODO pass indexing context directly in subscribe function
unordered_term_id_opt = Some(unordered_term_id);
});
if let Some(unordered_term_id) = unordered_term_id_opt {
self.fast_field_writers
.get_term_id_writer_mut(field)
.expect("writer for facet missing")
.add_val(unordered_term_id);
}
let mut facet_tokenizer = FacetTokenizer.token_stream(facet_str);
let mut indexing_position = IndexingPosition::default();
postings_writer.index_text(
doc_id,
&mut *facet_tokenizer,
term_buffer,
ctx,
&mut indexing_position,
);
}
}
FieldType::Str(_) => {
@@ -227,7 +219,6 @@ impl SegmentWriter {
term_buffer,
ctx,
&mut indexing_position,
self.fast_field_writers.get_term_id_writer_mut(field),
);
}
if field_entry.has_fieldnorms() {
@@ -383,7 +374,6 @@ fn remap_and_write(
ctx: IndexingContext,
fast_field_writers: FastFieldsWriter,
fieldnorms_writer: &FieldNormsWriter,
schema: &Schema,
mut serializer: SegmentSerializer,
doc_id_map: Option<&DocIdMapping>,
) -> crate::Result<()> {
@@ -395,20 +385,15 @@ fn remap_and_write(
.segment()
.open_read(SegmentComponent::FieldNorms)?;
let fieldnorm_readers = FieldNormReaders::open(fieldnorm_data)?;
let term_ord_map = serialize_postings(
serialize_postings(
ctx,
per_field_postings_writers,
fieldnorm_readers,
doc_id_map,
schema,
serializer.get_postings_serializer(),
)?;
debug!("fastfield-serialize");
fast_field_writers.serialize(
serializer.get_fast_field_serializer(),
&term_ord_map,
doc_id_map,
)?;
fast_field_writers.serialize(serializer.get_fast_field_write(), doc_id_map)?;
// finalize temp docstore and create version, which reflects the doc_id_map
if let Some(doc_id_map) = doc_id_map {

View File

@@ -147,6 +147,22 @@ pub struct DateTime {
pub(crate) timestamp_micros: i64,
}
impl From<columnar::DateTime> for DateTime {
fn from(columnar_datetime: columnar::DateTime) -> Self {
DateTime {
timestamp_micros: columnar_datetime.timestamp_micros,
}
}
}
impl From<DateTime> for columnar::DateTime {
fn from(datetime: crate::DateTime) -> Self {
columnar::DateTime {
timestamp_micros: datetime.timestamp_micros,
}
}
}
impl DateTime {
/// Create new from UNIX timestamp in seconds
pub const fn from_timestamp_secs(seconds: i64) -> Self {
@@ -263,7 +279,7 @@ mod indexer;
pub mod error;
pub mod tokenizer;
pub mod aggregation;
// pub mod aggregation;
pub mod collector;
pub mod directory;
pub mod fastfield;

View File

@@ -2,13 +2,10 @@ use std::io;
use stacker::Addr;
use crate::fastfield::MultiValuedFastFieldWriter;
use crate::indexer::doc_id_mapping::DocIdMapping;
use crate::postings::postings_writer::SpecializedPostingsWriter;
use crate::postings::recorder::{BufferLender, DocIdRecorder, Recorder};
use crate::postings::{
FieldSerializer, IndexingContext, IndexingPosition, PostingsWriter, UnorderedTermId,
};
use crate::postings::{FieldSerializer, IndexingContext, IndexingPosition, PostingsWriter};
use crate::schema::term::as_json_path_type_value_bytes;
use crate::schema::Type;
use crate::tokenizer::TokenStream;
@@ -33,8 +30,8 @@ impl<Rec: Recorder> PostingsWriter for JsonPostingsWriter<Rec> {
pos: u32,
term: &crate::Term,
ctx: &mut IndexingContext,
) -> UnorderedTermId {
self.non_str_posting_writer.subscribe(doc, pos, term, ctx)
) {
self.non_str_posting_writer.subscribe(doc, pos, term, ctx);
}
fn index_text(
@@ -44,7 +41,6 @@ impl<Rec: Recorder> PostingsWriter for JsonPostingsWriter<Rec> {
term_buffer: &mut Term,
ctx: &mut IndexingContext,
indexing_position: &mut IndexingPosition,
_fast_field_writer: Option<&mut MultiValuedFastFieldWriter>,
) {
self.str_posting_writer.index_text(
doc_id,
@@ -52,20 +48,19 @@ impl<Rec: Recorder> PostingsWriter for JsonPostingsWriter<Rec> {
term_buffer,
ctx,
indexing_position,
None,
);
}
/// The actual serialization format is handled by the `PostingsSerializer`.
fn serialize(
&self,
term_addrs: &[(Term<&[u8]>, Addr, UnorderedTermId)],
term_addrs: &[(Term<&[u8]>, Addr)],
doc_id_map: Option<&DocIdMapping>,
ctx: &IndexingContext,
serializer: &mut FieldSerializer,
) -> io::Result<()> {
let mut buffer_lender = BufferLender::default();
for (term, addr, _) in term_addrs {
for (term, addr) in term_addrs {
// TODO optimization opportunity here.
if let Some((_, typ, _)) = as_json_path_type_value_bytes(term.value_bytes()) {
if typ == Type::Str {

View File

@@ -6,7 +6,6 @@ use std::ops::Range;
use rustc_hash::FxHashMap;
use stacker::Addr;
use crate::fastfield::MultiValuedFastFieldWriter;
use crate::fieldnorm::FieldNormReaders;
use crate::indexer::doc_id_mapping::DocIdMapping;
use crate::postings::recorder::{BufferLender, Recorder};
@@ -21,12 +20,10 @@ use crate::DocId;
const POSITION_GAP: u32 = 1;
fn make_field_partition(
term_offsets: &[(Term<&[u8]>, Addr, UnorderedTermId)],
) -> Vec<(Field, Range<usize>)> {
fn make_field_partition(term_offsets: &[(Term<&[u8]>, Addr)]) -> Vec<(Field, Range<usize>)> {
let term_offsets_it = term_offsets
.iter()
.map(|(term, _, _)| term.field())
.map(|(term, _)| term.field())
.enumerate();
let mut prev_field_opt = None;
let mut fields = vec![];
@@ -54,48 +51,18 @@ pub(crate) fn serialize_postings(
per_field_postings_writers: &PerFieldPostingsWriter,
fieldnorm_readers: FieldNormReaders,
doc_id_map: Option<&DocIdMapping>,
schema: &Schema,
serializer: &mut InvertedIndexSerializer,
) -> crate::Result<HashMap<Field, FxHashMap<UnorderedTermId, TermOrdinal>>> {
let mut term_offsets: Vec<(Term<&[u8]>, Addr, UnorderedTermId)> =
Vec::with_capacity(ctx.term_index.len());
) -> crate::Result<()> {
let mut term_offsets: Vec<(Term<&[u8]>, Addr)> = Vec::with_capacity(ctx.term_index.len());
term_offsets.extend(
ctx.term_index
.iter()
.map(|(bytes, addr, unordered_id)| (Term::wrap(bytes), addr, unordered_id)),
.map(|(bytes, addr, _unordered_id)| (Term::wrap(bytes), addr)),
);
term_offsets.sort_unstable_by_key(|(k, _, _)| k.clone());
let mut unordered_term_mappings: HashMap<Field, FxHashMap<UnorderedTermId, TermOrdinal>> =
HashMap::new();
term_offsets.sort_unstable_by_key(|(k, _)| k.clone());
let field_offsets = make_field_partition(&term_offsets);
for (field, byte_offsets) in field_offsets {
let field_entry = schema.get_field_entry(field);
match *field_entry.field_type() {
FieldType::Str(_) | FieldType::Facet(_) => {
// populating the (unordered term ord) -> (ordered term ord) mapping
// for the field.
let unordered_term_ids = term_offsets[byte_offsets.clone()]
.iter()
.map(|&(_, _, bucket)| bucket);
let mapping: FxHashMap<UnorderedTermId, TermOrdinal> = unordered_term_ids
.enumerate()
.map(|(term_ord, unord_term_id)| {
(unord_term_id as UnorderedTermId, term_ord as TermOrdinal)
})
.collect();
unordered_term_mappings.insert(field, mapping);
}
FieldType::U64(_)
| FieldType::I64(_)
| FieldType::F64(_)
| FieldType::Date(_)
| FieldType::Bool(_) => {}
FieldType::Bytes(_) => {}
FieldType::JsonObject(_) => {}
FieldType::IpAddr(_) => {}
}
let postings_writer = per_field_postings_writers.get_for_field(field);
let fieldnorm_reader = fieldnorm_readers.get_field(field)?;
let mut field_serializer =
@@ -108,7 +75,7 @@ pub(crate) fn serialize_postings(
)?;
field_serializer.close()?;
}
Ok(unordered_term_mappings)
Ok(())
}
#[derive(Default)]
@@ -129,19 +96,13 @@ pub(crate) trait PostingsWriter: Send + Sync {
/// * term - the term
/// * ctx - Contains a term hashmap and a memory arena to store all necessary posting list
/// information.
fn subscribe(
&mut self,
doc: DocId,
pos: u32,
term: &Term,
ctx: &mut IndexingContext,
) -> UnorderedTermId;
fn subscribe(&mut self, doc: DocId, pos: u32, term: &Term, ctx: &mut IndexingContext);
/// Serializes the postings on disk.
/// The actual serialization format is handled by the `PostingsSerializer`.
fn serialize(
&self,
term_addrs: &[(Term<&[u8]>, Addr, UnorderedTermId)],
term_addrs: &[(Term<&[u8]>, Addr)],
doc_id_map: Option<&DocIdMapping>,
ctx: &IndexingContext,
serializer: &mut FieldSerializer,
@@ -155,7 +116,6 @@ pub(crate) trait PostingsWriter: Send + Sync {
term_buffer: &mut Term,
ctx: &mut IndexingContext,
indexing_position: &mut IndexingPosition,
mut term_id_fast_field_writer_opt: Option<&mut MultiValuedFastFieldWriter>,
) {
let end_of_path_idx = term_buffer.len_bytes();
let mut num_tokens = 0;
@@ -175,11 +135,7 @@ pub(crate) trait PostingsWriter: Send + Sync {
term_buffer.append_bytes(token.text.as_bytes());
let start_position = indexing_position.end_position + token.position as u32;
end_position = end_position.max(start_position + token.position_length as u32);
let unordered_term_id = self.subscribe(doc_id, start_position, term_buffer, ctx);
if let Some(term_id_fast_field_writer) = term_id_fast_field_writer_opt.as_mut() {
term_id_fast_field_writer.add_val(unordered_term_id);
}
self.subscribe(doc_id, start_position, term_buffer, ctx);
num_tokens += 1;
});
@@ -227,13 +183,7 @@ impl<Rec: Recorder> SpecializedPostingsWriter<Rec> {
}
impl<Rec: Recorder> PostingsWriter for SpecializedPostingsWriter<Rec> {
fn subscribe(
&mut self,
doc: DocId,
position: u32,
term: &Term,
ctx: &mut IndexingContext,
) -> UnorderedTermId {
fn subscribe(&mut self, doc: DocId, position: u32, term: &Term, ctx: &mut IndexingContext) {
debug_assert!(term.as_slice().len() >= 4);
self.total_num_tokens += 1;
let (term_index, arena) = (&mut ctx.term_index, &mut ctx.arena);
@@ -252,18 +202,18 @@ impl<Rec: Recorder> PostingsWriter for SpecializedPostingsWriter<Rec> {
recorder.record_position(position, arena);
recorder
}
}) as UnorderedTermId
});
}
fn serialize(
&self,
term_addrs: &[(Term<&[u8]>, Addr, UnorderedTermId)],
term_addrs: &[(Term<&[u8]>, Addr)],
doc_id_map: Option<&DocIdMapping>,
ctx: &IndexingContext,
serializer: &mut FieldSerializer,
) -> io::Result<()> {
let mut buffer_lender = BufferLender::default();
for (term, addr, _) in term_addrs {
for (term, addr) in term_addrs {
Self::serialize_one_term(term, *addr, doc_id_map, &mut buffer_lender, ctx, serializer)?;
}
Ok(())

View File

@@ -50,7 +50,7 @@ pub use self::more_like_this::{MoreLikeThisQuery, MoreLikeThisQueryBuilder};
pub use self::phrase_query::PhraseQuery;
pub use self::query::{EnableScoring, Query, QueryClone};
pub use self::query_parser::{QueryParser, QueryParserError};
pub use self::range_query::RangeQuery;
// pub use self::range_query::RangeQuery;
pub use self::regex_query::RegexQuery;
pub use self::reqopt_scorer::RequiredOptionalScorer;
pub use self::score_combiner::{

View File

@@ -13,10 +13,19 @@ use crate::core::Index;
use crate::indexer::{
convert_to_fast_value_and_get_term, set_string_and_get_terms, JsonTermWriter,
};
use crate::query::range_query::is_type_valid_for_fastfield_range_query;
use crate::query::range_query::{is_type_valid_for_fastfield_range_query, RangeQuery};
use crate::query::{
AllQuery, BooleanQuery, BoostQuery, EmptyQuery, FuzzyTermQuery, Occur, PhraseQuery, Query,
RangeQuery, TermQuery, TermSetQuery,
AllQuery,
BooleanQuery,
BoostQuery,
EmptyQuery,
FuzzyTermQuery,
Occur,
PhraseQuery,
Query,
// RangeQuery,
TermQuery,
TermSetQuery,
};
use crate::schema::{
Facet, FacetParseError, Field, FieldType, IndexRecordOption, IntoIpv6Addr, JsonObjectOptions,

View File

@@ -1,13 +1,13 @@
use core::fmt;
use core::fmt::Debug;
use std::ops::RangeInclusive;
use std::sync::Arc;
use fastfield_codecs::Column;
use columnar::Column;
use crate::fastfield::{MakeZero, MultiValuedFastFieldReader};
use crate::fastfield::MakeZero;
use crate::{DocId, DocSet, TERMINATED};
/// Helper to have a cursor over a vec of docids
#[derive(Debug)]
struct VecCursor {
docs: Vec<u32>,
current_pos: usize,
@@ -40,26 +40,10 @@ impl VecCursor {
}
}
pub(crate) enum FastFieldCardinality<T: MakeZero> {
SingleValue(Arc<dyn Column<T>>),
MultiValue(MultiValuedFastFieldReader<T>),
}
impl<T: MakeZero + PartialOrd + Copy + fmt::Debug> FastFieldCardinality<T> {
fn num_docs(&self) -> u32 {
match self {
FastFieldCardinality::SingleValue(single_value) => single_value.num_vals(),
FastFieldCardinality::MultiValue(multi_value) => {
multi_value.get_index_reader().num_docs()
}
}
}
}
pub(crate) struct RangeDocSet<T: MakeZero> {
/// The range filter on the values.
value_range: RangeInclusive<T>,
fast_field: FastFieldCardinality<T>,
column: Column<T>,
/// The next docid start range to fetch (inclusive).
next_fetch_start: u32,
/// Number of docs range checked in a batch.
@@ -77,11 +61,11 @@ pub(crate) struct RangeDocSet<T: MakeZero> {
}
const DEFAULT_FETCH_HORIZON: u32 = 128;
impl<T: MakeZero + Send + PartialOrd + Copy + fmt::Debug> RangeDocSet<T> {
pub(crate) fn new(value_range: RangeInclusive<T>, fast_field: FastFieldCardinality<T>) -> Self {
impl<T: MakeZero + Send + Sync + PartialOrd + Copy + Debug + 'static> RangeDocSet<T> {
pub(crate) fn new(value_range: RangeInclusive<T>, column: Column<T>) -> Self {
let mut range_docset = Self {
value_range,
fast_field,
column,
loaded_docs: VecCursor::new(),
next_fetch_start: 0,
fetch_horizon: DEFAULT_FETCH_HORIZON,
@@ -122,36 +106,24 @@ impl<T: MakeZero + Send + PartialOrd + Copy + fmt::Debug> RangeDocSet<T> {
fn fetch_horizon(&mut self, horizon: u32) -> bool {
let mut finished_to_end = false;
let limit = self.fast_field.num_docs();
let limit = self.column.values.num_vals();
let mut end = self.next_fetch_start + horizon;
if end >= limit {
end = limit;
finished_to_end = true;
}
match &self.fast_field {
FastFieldCardinality::MultiValue(multi) => {
let last_value = self.loaded_docs.last_value();
multi.get_docids_for_value_range(
self.value_range.clone(),
self.next_fetch_start..end,
self.loaded_docs.get_cleared_data(),
);
// In case of multivalues, we may have an overlap of the same docid between fetching
// blocks
if let Some(last_value) = last_value {
while self.loaded_docs.current() == Some(last_value) {
self.loaded_docs.next();
}
}
}
FastFieldCardinality::SingleValue(single) => {
single.get_docids_for_value_range(
self.value_range.clone(),
self.next_fetch_start..end,
self.loaded_docs.get_cleared_data(),
);
let last_value = self.loaded_docs.last_value();
let doc_buffer: &mut Vec<DocId> = self.loaded_docs.get_cleared_data();
self.column.values.get_docids_for_value_range(
self.value_range.clone(),
self.next_fetch_start..end,
doc_buffer,
);
self.column.idx.select_batch_in_place(doc_buffer);
if let Some(last_value) = last_value {
while self.loaded_docs.current() == Some(last_value) {
self.loaded_docs.next();
}
}
self.next_fetch_start = end;
@@ -160,18 +132,17 @@ impl<T: MakeZero + Send + PartialOrd + Copy + fmt::Debug> RangeDocSet<T> {
}
}
impl<T: MakeZero + Send + PartialOrd + Copy + fmt::Debug> DocSet for RangeDocSet<T> {
impl<T: MakeZero + Send + Sync + PartialOrd + Copy + Debug + 'static> DocSet for RangeDocSet<T> {
#[inline]
fn advance(&mut self) -> DocId {
if let Some(docid) = self.loaded_docs.next() {
docid
} else {
if self.next_fetch_start >= self.fast_field.num_docs() {
return TERMINATED;
}
self.fetch_block();
self.loaded_docs.current().unwrap_or(TERMINATED)
return docid;
}
if self.next_fetch_start >= self.column.values.num_vals() {
return TERMINATED;
}
self.fetch_block();
self.loaded_docs.current().unwrap_or(TERMINATED)
}
#[inline]

View File

@@ -1,8 +1,31 @@
use std::ops::Bound;
use crate::schema::Type;
mod fast_field_range_query;
mod range_query;
mod range_query_ip_fastfield;
mod range_query_u64_fastfield;
pub(crate) use range_query::is_type_valid_for_fastfield_range_query;
pub use self::range_query::RangeQuery;
// TODO is this correct?
pub(crate) fn is_type_valid_for_fastfield_range_query(typ: Type) -> bool {
match typ {
Type::U64 | Type::I64 | Type::F64 | Type::Bool | Type::Date => true,
Type::IpAddr => true,
Type::Str | Type::Facet | Type::Bytes | Type::Json => false,
}
}
fn map_bound<TFrom, TTo, Transform: Fn(&TFrom) -> TTo>(
bound: &Bound<TFrom>,
transform: &Transform,
) -> Bound<TTo> {
use self::Bound::*;
match bound {
Excluded(ref from_val) => Excluded(transform(from_val)),
Included(ref from_val) => Included(transform(from_val)),
Unbounded => Unbounded,
}
}

View File

@@ -3,28 +3,18 @@ use std::ops::{Bound, Range};
use common::{BinarySerializable, BitSet};
use super::map_bound;
use super::range_query_u64_fastfield::FastFieldRangeWeight;
use crate::core::SegmentReader;
use crate::error::TantivyError;
use crate::query::explanation::does_not_match;
use crate::query::range_query::is_type_valid_for_fastfield_range_query;
use crate::query::range_query::range_query_ip_fastfield::IPFastFieldRangeWeight;
use crate::query::{BitSetDocSet, ConstScorer, EnableScoring, Explanation, Query, Scorer, Weight};
use crate::schema::{Field, IndexRecordOption, Term, Type};
use crate::termdict::{TermDictionary, TermStreamer};
use crate::{DateTime, DocId, Score};
pub(crate) fn map_bound<TFrom, TTo, Transform: Fn(&TFrom) -> TTo>(
bound: &Bound<TFrom>,
transform: &Transform,
) -> Bound<TTo> {
use self::Bound::*;
match bound {
Excluded(ref from_val) => Excluded(transform(from_val)),
Included(ref from_val) => Included(transform(from_val)),
Unbounded => Unbounded,
}
}
/// `RangeQuery` matches all documents that have at least one term within a defined range.
///
/// Matched document will all get a constant `Score` of one.
@@ -285,14 +275,6 @@ impl RangeQuery {
}
}
pub(crate) fn is_type_valid_for_fastfield_range_query(typ: Type) -> bool {
match typ {
Type::U64 | Type::I64 | Type::F64 | Type::Bool | Type::Date => true,
Type::IpAddr => true,
Type::Str | Type::Facet | Type::Bytes | Type::Json => false,
}
}
/// Returns true if the type maps to a u64 fast field
pub(crate) fn maps_to_u64_fastfield(typ: Type) -> bool {
match typ {
@@ -462,7 +444,7 @@ mod tests {
let index = Index::create_in_ram(schema);
{
let mut index_writer = index.writer_with_num_threads(2, 6_000_000)?;
let mut index_writer = index.writer_with_num_threads(2, 60_000_000)?;
for i in 1..100 {
let mut doc = Document::new();
@@ -478,6 +460,7 @@ mod tests {
}
let reader = index.reader().unwrap();
let searcher = reader.searcher();
assert_eq!(searcher.segment_readers().len(), 2);
let count_multiples =
|range_query: RangeQuery| searcher.search(&range_query, &Count).unwrap();
@@ -523,7 +506,7 @@ mod tests {
let index = Index::create_in_ram(schema);
{
let mut index_writer = index.writer_with_num_threads(2, 6_000_000).unwrap();
let mut index_writer = index.writer_with_num_threads(2, 60_000_000).unwrap();
for i in 1..100 {
let mut doc = Document::new();
@@ -539,6 +522,7 @@ mod tests {
}
let reader = index.reader()?;
let searcher = reader.searcher();
assert_eq!(searcher.segment_readers().len(), 2);
let count_multiples =
|range_query: RangeQuery| searcher.search(&range_query, &Count).unwrap();
@@ -621,7 +605,7 @@ mod tests {
let ip_addr_2 = IpAddr::from_str("127.0.0.20").unwrap().into_ipv6_addr();
{
let mut index_writer = index.writer(3_000_000).unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
for _ in 0..1_000 {
index_writer
.add_document(doc!(
@@ -638,11 +622,11 @@ mod tests {
))
.unwrap();
}
index_writer.commit().unwrap();
}
let reader = index.reader().unwrap();
let searcher = reader.searcher();
assert_eq!(searcher.segment_readers().len(), 1);
let get_num_hits = |query| {
let (_top_docs, count) = searcher

View File

@@ -5,13 +5,13 @@
use std::net::Ipv6Addr;
use std::ops::{Bound, RangeInclusive};
use columnar::Column;
use common::BinarySerializable;
use fastfield_codecs::MonotonicallyMappableToU128;
use super::fast_field_range_query::{FastFieldCardinality, RangeDocSet};
use super::range_query::map_bound;
use crate::query::{ConstScorer, Explanation, Scorer, Weight};
use crate::schema::Cardinality;
use super::map_bound;
use crate::query::range_query::fast_field_range_query::RangeDocSet;
use crate::query::{ConstScorer, EmptyScorer, Explanation, Scorer, Weight};
use crate::{DocId, DocSet, Score, SegmentReader, TantivyError};
/// `IPFastFieldRangeWeight` uses the ip address fast field to execute range queries.
@@ -22,6 +22,7 @@ pub struct IPFastFieldRangeWeight {
}
impl IPFastFieldRangeWeight {
// TODO fix code smell... why do we end up working with Vec<u8> here?
pub fn new(field: String, left_bound: &Bound<Vec<u8>>, right_bound: &Bound<Vec<u8>>) -> Self {
let parse_ip_from_bytes = |data: &Vec<u8>| {
let ip_u128: u128 =
@@ -40,40 +41,18 @@ impl IPFastFieldRangeWeight {
impl Weight for IPFastFieldRangeWeight {
fn scorer(&self, reader: &SegmentReader, boost: Score) -> crate::Result<Box<dyn Scorer>> {
let field_type = reader
.schema()
.get_field_entry(reader.schema().get_field(&self.field)?)
.field_type();
match field_type.fastfield_cardinality().unwrap() {
Cardinality::SingleValue => {
let ip_addr_fast_field = reader.fast_fields().ip_addr(&self.field)?;
let value_range = bound_to_value_range(
&self.left_bound,
&self.right_bound,
ip_addr_fast_field.min_value(),
ip_addr_fast_field.max_value(),
);
let docset = RangeDocSet::new(
value_range,
FastFieldCardinality::SingleValue(ip_addr_fast_field),
);
Ok(Box::new(ConstScorer::new(docset, boost)))
}
Cardinality::MultiValues => {
let ip_addr_fast_field = reader.fast_fields().ip_addrs(&self.field)?;
let value_range = bound_to_value_range(
&self.left_bound,
&self.right_bound,
ip_addr_fast_field.min_value(),
ip_addr_fast_field.max_value(),
);
let docset = RangeDocSet::new(
value_range,
FastFieldCardinality::MultiValue(ip_addr_fast_field),
);
Ok(Box::new(ConstScorer::new(docset, boost)))
}
}
let Some(ip_addr_column): Option<Column<Ipv6Addr>> = reader.fast_fields()
.typed_column_opt(&self.field)? else {
return Ok(Box::new(EmptyScorer))
};
let value_range = bound_to_value_range(
&self.left_bound,
&self.right_bound,
ip_addr_column.min_value(),
ip_addr_column.max_value(),
);
let docset = RangeDocSet::new(value_range, ip_addr_column);
Ok(Box::new(ConstScorer::new(docset, boost)))
}
fn explain(&self, reader: &SegmentReader, doc: DocId) -> crate::Result<Explanation> {
@@ -85,7 +64,6 @@ impl Weight for IPFastFieldRangeWeight {
)));
}
let explanation = Explanation::new("Const", scorer.score());
Ok(explanation)
}
}
@@ -119,7 +97,7 @@ mod tests {
use super::*;
use crate::collector::Count;
use crate::query::QueryParser;
use crate::schema::{IpAddrOptions, Schema, FAST, STORED, STRING};
use crate::schema::{Schema, FAST, INDEXED, STORED, STRING};
use crate::Index;
#[derive(Clone, Debug)]
@@ -156,19 +134,19 @@ mod tests {
#![proptest_config(ProptestConfig::with_cases(10))]
#[test]
fn test_ip_range_for_docs_prop(ops in proptest::collection::vec(operation_strategy(), 1..1000)) {
assert!(test_ip_range_for_docs(ops).is_ok());
assert!(test_ip_range_for_docs(&ops).is_ok());
}
}
#[test]
fn ip_range_regression1_test() {
let ops = vec![doc_from_id_1(0)];
fn test_ip_range_regression1() {
let ops = &[doc_from_id_1(0)];
assert!(test_ip_range_for_docs(ops).is_ok());
}
#[test]
fn ip_range_regression2_test() {
let ops = vec![
fn test_ip_range_regression2() {
let ops = &[
doc_from_id_1(52),
doc_from_id_1(63),
doc_from_id_1(12),
@@ -179,26 +157,48 @@ mod tests {
}
#[test]
fn ip_range_regression3_test() {
let ops = vec![doc_from_id_1(1), doc_from_id_1(2), doc_from_id_1(3)];
fn test_ip_range_regression3() {
let ops = &[doc_from_id_1(1), doc_from_id_1(2), doc_from_id_1(3)];
assert!(test_ip_range_for_docs(ops).is_ok());
}
pub fn create_index_from_docs(docs: &[Doc]) -> Index {
#[test]
fn test_ip_range_regression3_simple() {
let mut schema_builder = Schema::builder();
let ips_field = schema_builder.add_ip_addr_field("ips", FAST | INDEXED);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut writer = index.writer_for_tests().unwrap();
let ip_addrs: Vec<Ipv6Addr> = [1000, 2000, 3000]
.into_iter()
.map(Ipv6Addr::from_u128)
.collect();
for &ip_addr in &ip_addrs {
writer
.add_document(doc!(ips_field=>ip_addr, ips_field=>ip_addr))
.unwrap();
}
writer.commit().unwrap();
let searcher = index.reader().unwrap().searcher();
let range_weight = IPFastFieldRangeWeight {
field: "ips".to_string(),
left_bound: Bound::Included(ip_addrs[1]),
right_bound: Bound::Included(ip_addrs[2]),
};
let count = range_weight.count(searcher.segment_reader(0)).unwrap();
assert_eq!(count, 2);
}
fn create_index_from_docs(docs: &[Doc]) -> Index {
let mut schema_builder = Schema::builder();
let ip_field = schema_builder.add_ip_addr_field("ip", STORED | FAST);
let ips_field = schema_builder.add_ip_addr_field(
"ips",
IpAddrOptions::default()
.set_fast(Cardinality::MultiValues)
.set_indexed(),
);
let ips_field = schema_builder.add_ip_addr_field("ips", FAST | INDEXED);
let text_field = schema_builder.add_text_field("id", STRING | STORED);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
{
let mut index_writer = index.writer(10_000_000).unwrap();
let mut index_writer = index.writer_with_num_threads(2, 60_000_000).unwrap();
for doc in docs.iter() {
index_writer
.add_document(doc!(
@@ -215,45 +215,50 @@ mod tests {
index
}
fn test_ip_range_for_docs(docs: Vec<Doc>) -> crate::Result<()> {
let index = create_index_from_docs(&docs);
fn test_ip_range_for_docs(docs: &[Doc]) -> crate::Result<()> {
let index = create_index_from_docs(docs);
let reader = index.reader().unwrap();
let searcher = reader.searcher();
let get_num_hits = |query| searcher.search(&query, &(Count)).unwrap();
let get_num_hits = |query| searcher.search(&query, &Count).unwrap();
let query_from_text = |text: &str| {
QueryParser::for_index(&index, vec![])
.parse_query(text)
.unwrap()
};
let gen_query_inclusive = |field: &str, from: Ipv6Addr, to: Ipv6Addr| {
format!("{}:[{} TO {}]", field, &from.to_string(), &to.to_string())
let gen_query_inclusive = |field: &str, ip_range: &RangeInclusive<Ipv6Addr>| {
format!(
"{field}:[{} TO {}]",
ip_range.start().to_string(),
ip_range.end().to_string()
)
};
let test_sample = |sample_docs: Vec<Doc>| {
let test_sample = |sample_docs: &[Doc]| {
let mut ips: Vec<Ipv6Addr> = sample_docs.iter().map(|doc| doc.ip).collect();
ips.sort();
let ip_range = ips[0]..=ips[1];
let expected_num_hits = docs
.iter()
.filter(|doc| (ips[0]..=ips[1]).contains(&doc.ip))
.count();
let query = gen_query_inclusive("ip", ips[0], ips[1]);
let query = gen_query_inclusive("ip", &ip_range);
assert_eq!(get_num_hits(query_from_text(&query)), expected_num_hits);
let query = gen_query_inclusive("ips", ips[0], ips[1]);
let query = gen_query_inclusive("ips", &ip_range);
assert_eq!(get_num_hits(query_from_text(&query)), expected_num_hits);
// Intersection search
let id_filter = sample_docs[0].id.to_string();
let expected_num_hits = docs
.iter()
.filter(|doc| (ips[0]..=ips[1]).contains(&doc.ip) && doc.id == id_filter)
.filter(|doc| ip_range.contains(&doc.ip) && doc.id == id_filter)
.count();
let query = format!(
"{} AND id:{}",
gen_query_inclusive("ip", ips[0], ips[1]),
gen_query_inclusive("ip", &ip_range),
&id_filter
);
assert_eq!(get_num_hits(query_from_text(&query)), expected_num_hits);
@@ -262,19 +267,19 @@ mod tests {
let id_filter = sample_docs[0].id.to_string();
let query = format!(
"{} AND id:{}",
gen_query_inclusive("ips", ips[0], ips[1]),
gen_query_inclusive("ips", &ip_range),
&id_filter
);
assert_eq!(get_num_hits(query_from_text(&query)), expected_num_hits);
};
test_sample(vec![docs[0].clone(), docs[0].clone()]);
test_sample(&[docs[0].clone(), docs[0].clone()]);
if docs.len() > 1 {
test_sample(vec![docs[0].clone(), docs[1].clone()]);
test_sample(vec![docs[1].clone(), docs[1].clone()]);
test_sample(&[docs[0].clone(), docs[1].clone()]);
test_sample(&[docs[1].clone(), docs[1].clone()]);
}
if docs.len() > 2 {
test_sample(vec![docs[1].clone(), docs[2].clone()]);
test_sample(&[docs[1].clone(), docs[2].clone()]);
}
Ok(())

View File

@@ -6,10 +6,9 @@ use std::ops::{Bound, RangeInclusive};
use fastfield_codecs::MonotonicallyMappableToU64;
use super::fast_field_range_query::{FastFieldCardinality, RangeDocSet};
use super::range_query::map_bound;
use crate::query::{ConstScorer, Explanation, Scorer, Weight};
use crate::schema::Cardinality;
use super::fast_field_range_query::RangeDocSet;
use super::map_bound;
use crate::query::{ConstScorer, EmptyScorer, Explanation, Scorer, Weight};
use crate::{DocId, DocSet, Score, SegmentReader, TantivyError};
/// `FastFieldRangeWeight` uses the fast field to execute range queries.
@@ -33,36 +32,21 @@ impl FastFieldRangeWeight {
impl Weight for FastFieldRangeWeight {
fn scorer(&self, reader: &SegmentReader, boost: Score) -> crate::Result<Box<dyn Scorer>> {
let field_type = reader
.schema()
.get_field_entry(reader.schema().get_field(&self.field)?)
.field_type();
match field_type.fastfield_cardinality().unwrap() {
Cardinality::SingleValue => {
let fast_field = reader.fast_fields().u64_lenient(&self.field)?;
let value_range = bound_to_value_range(
&self.left_bound,
&self.right_bound,
fast_field.min_value(),
fast_field.max_value(),
);
let docset =
RangeDocSet::new(value_range, FastFieldCardinality::SingleValue(fast_field));
Ok(Box::new(ConstScorer::new(docset, boost)))
}
Cardinality::MultiValues => {
let fast_field = reader.fast_fields().u64s_lenient(&self.field)?;
let value_range = bound_to_value_range(
&self.left_bound,
&self.right_bound,
fast_field.min_value(),
fast_field.max_value(),
);
let docset =
RangeDocSet::new(value_range, FastFieldCardinality::MultiValue(fast_field));
Ok(Box::new(ConstScorer::new(docset, boost)))
}
let fast_field_reader = reader.fast_fields();
let Some(column) = fast_field_reader.u64_lenient(&self.field)? else {
return Ok(Box::new(EmptyScorer));
};
let value_range = bound_to_value_range(
&self.left_bound,
&self.right_bound,
column.min_value(),
column.max_value(),
);
if value_range.is_empty() {
return Ok(Box::new(EmptyScorer));
}
let docset = RangeDocSet::new(value_range, column);
Ok(Box::new(ConstScorer::new(docset, boost)))
}
fn explain(&self, reader: &SegmentReader, doc: DocId) -> crate::Result<Explanation> {
@@ -85,12 +69,14 @@ fn bound_to_value_range<T: MonotonicallyMappableToU64>(
min_value: T,
max_value: T,
) -> RangeInclusive<T> {
let start_value = match left_bound {
let mut start_value = match left_bound {
Bound::Included(val) => *val,
Bound::Excluded(val) => T::from_u64(val.to_u64() + 1),
Bound::Unbounded => min_value,
};
if start_value.partial_cmp(&min_value) == Some(std::cmp::Ordering::Less) {
start_value = min_value;
}
let end_value = match right_bound {
Bound::Included(val) => *val,
Bound::Excluded(val) => T::from_u64(val.to_u64() - 1),
@@ -101,6 +87,8 @@ fn bound_to_value_range<T: MonotonicallyMappableToU64>(
#[cfg(test)]
mod tests {
use std::ops::{Bound, RangeInclusive};
use proptest::prelude::ProptestConfig;
use proptest::strategy::Strategy;
use proptest::{prop_oneof, proptest};
@@ -108,11 +96,11 @@ mod tests {
use rand::seq::SliceRandom;
use rand::SeedableRng;
use super::*;
use crate::collector::Count;
use crate::query::QueryParser;
use crate::schema::{NumericOptions, Schema, FAST, INDEXED, STORED, STRING};
use crate::Index;
use crate::query::range_query::range_query_u64_fastfield::FastFieldRangeWeight;
use crate::query::{QueryParser, Weight};
use crate::schema::{NumericOptions, Schema, SchemaBuilder, FAST, INDEXED, STORED, STRING};
use crate::{Index, TERMINATED};
#[derive(Clone, Debug)]
pub struct Doc {
@@ -127,7 +115,7 @@ mod tests {
]
}
pub fn doc_from_id_1(id: u64) -> Doc {
fn doc_from_id_1(id: u64) -> Doc {
let id = id * 1000;
Doc {
id_name: id.to_string(),
@@ -142,13 +130,15 @@ mod tests {
}
}
proptest! {
#![proptest_config(ProptestConfig::with_cases(10))]
#[test]
fn test_range_for_docs_prop(ops in proptest::collection::vec(operation_strategy(), 1..1000)) {
assert!(test_id_range_for_docs(ops).is_ok());
}
}
// TODO re-enable once merge is replugged.
//
// proptest! {
// #![proptest_config(ProptestConfig::with_cases(10))]
// #[test]
// fn test_range_for_docs_prop(ops in proptest::collection::vec(operation_strategy(),
// 1..1000)) { assert!(test_id_range_for_docs(ops).is_ok());
// }
// }
#[test]
fn range_regression1_test() {
@@ -157,7 +147,7 @@ mod tests {
}
#[test]
fn range_regression2_test() {
fn test_range_regression2() {
let ops = vec![
doc_from_id_1(52),
doc_from_id_1(63),
@@ -168,6 +158,27 @@ mod tests {
assert!(test_id_range_for_docs(ops).is_ok());
}
#[test]
fn test_range_regression_simplified() {
let mut schema_builder = SchemaBuilder::new();
let field = schema_builder.add_u64_field("test_field", FAST);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut writer = index.writer_for_tests().unwrap();
writer.add_document(doc!(field=>52_000u64)).unwrap();
writer.commit().unwrap();
let searcher = index.reader().unwrap().searcher();
let range_query = FastFieldRangeWeight::new(
"test_field".to_string(),
Bound::Included(50_000),
Bound::Included(50_002),
);
let scorer = range_query
.scorer(searcher.segment_reader(0), 1.0f32)
.unwrap();
assert_eq!(scorer.doc(), TERMINATED);
}
#[test]
fn range_regression3_test() {
let ops = vec![doc_from_id_1(1), doc_from_id_1(2), doc_from_id_1(3)];
@@ -180,30 +191,22 @@ mod tests {
assert!(test_id_range_for_docs(ops).is_ok());
}
pub fn create_index_from_docs(docs: &[Doc]) -> Index {
fn create_index_from_docs(docs: &[Doc]) -> Index {
let mut schema_builder = Schema::builder();
let id_u64_field = schema_builder.add_u64_field("id", INDEXED | STORED | FAST);
let ids_u64_field = schema_builder.add_u64_field(
"ids",
NumericOptions::default()
.set_fast(Cardinality::MultiValues)
.set_indexed(),
);
let ids_u64_field =
schema_builder.add_u64_field("ids", NumericOptions::default().set_fast().set_indexed());
let id_f64_field = schema_builder.add_f64_field("id_f64", INDEXED | STORED | FAST);
let ids_f64_field = schema_builder.add_f64_field(
"ids_f64",
NumericOptions::default()
.set_fast(Cardinality::MultiValues)
.set_indexed(),
NumericOptions::default().set_fast().set_indexed(),
);
let id_i64_field = schema_builder.add_i64_field("id_i64", INDEXED | STORED | FAST);
let ids_i64_field = schema_builder.add_i64_field(
"ids_i64",
NumericOptions::default()
.set_fast(Cardinality::MultiValues)
.set_indexed(),
NumericOptions::default().set_fast().set_indexed(),
);
let text_field = schema_builder.add_text_field("id_name", STRING | STORED);
@@ -241,15 +244,15 @@ mod tests {
let mut rng: StdRng = StdRng::from_seed([1u8; 32]);
let get_num_hits = |query| searcher.search(&query, &(Count)).unwrap();
let get_num_hits = |query| searcher.search(&query, &Count).unwrap();
let query_from_text = |text: &str| {
QueryParser::for_index(&index, vec![])
.parse_query(text)
.unwrap()
};
let gen_query_inclusive = |field: &str, from: u64, to: u64| {
format!("{}:[{} TO {}]", field, &from.to_string(), &to.to_string())
let gen_query_inclusive = |field: &str, range: RangeInclusive<u64>| {
format!("{}:[{} TO {}]", field, range.start(), range.end())
};
let test_sample = |sample_docs: Vec<Doc>| {
@@ -260,10 +263,10 @@ mod tests {
.filter(|doc| (ids[0]..=ids[1]).contains(&doc.id))
.count();
let query = gen_query_inclusive("id", ids[0], ids[1]);
let query = gen_query_inclusive("id", ids[0]..=ids[1]);
assert_eq!(get_num_hits(query_from_text(&query)), expected_num_hits);
let query = gen_query_inclusive("ids", ids[0], ids[1]);
let query = gen_query_inclusive("ids", ids[0]..=ids[1]);
assert_eq!(get_num_hits(query_from_text(&query)), expected_num_hits);
// Intersection search
@@ -274,19 +277,19 @@ mod tests {
.count();
let query = format!(
"{} AND id_name:{}",
gen_query_inclusive("id", ids[0], ids[1]),
gen_query_inclusive("id", ids[0]..=ids[1]),
&id_filter
);
assert_eq!(get_num_hits(query_from_text(&query)), expected_num_hits);
let query = format!(
"{} AND id_name:{}",
gen_query_inclusive("id_f64", ids[0], ids[1]),
gen_query_inclusive("id_f64", ids[0]..=ids[1]),
&id_filter
);
assert_eq!(get_num_hits(query_from_text(&query)), expected_num_hits);
let query = format!(
"{} AND id_name:{}",
gen_query_inclusive("id_i64", ids[0], ids[1]),
gen_query_inclusive("id_i64", ids[0]..=ids[1]),
&id_filter
);
assert_eq!(get_num_hits(query_from_text(&query)), expected_num_hits);
@@ -295,19 +298,19 @@ mod tests {
let id_filter = sample_docs[0].id_name.to_string();
let query = format!(
"{} AND id_name:{}",
gen_query_inclusive("ids", ids[0], ids[1]),
gen_query_inclusive("ids", ids[0]..=ids[1]),
&id_filter
);
assert_eq!(get_num_hits(query_from_text(&query)), expected_num_hits);
let query = format!(
"{} AND id_name:{}",
gen_query_inclusive("ids_f64", ids[0], ids[1]),
gen_query_inclusive("ids_f64", ids[0]..=ids[1]),
&id_filter
);
assert_eq!(get_num_hits(query_from_text(&query)), expected_num_hits);
let query = format!(
"{} AND id_name:{}",
gen_query_inclusive("ids_i64", ids[0], ids[1]),
gen_query_inclusive("ids_i64", ids[0]..=ids[1]),
&id_filter
);
assert_eq!(get_num_hits(query_from_text(&query)), expected_num_hits);
@@ -376,7 +379,7 @@ mod bench {
10..=10
}
fn excute_query(
fn execute_query(
field: &str,
id_range: RangeInclusive<u64>,
suffix: &str,
@@ -407,154 +410,132 @@ mod bench {
#[bench]
fn bench_id_range_hit_90_percent(bench: &mut Bencher) {
let index = get_index_0_to_100();
bench.iter(|| excute_query("id", get_90_percent(), "", &index));
bench.iter(|| execute_query("id", get_90_percent(), "", &index));
}
#[bench]
fn bench_id_range_hit_10_percent(bench: &mut Bencher) {
let index = get_index_0_to_100();
bench.iter(|| excute_query("id", get_10_percent(), "", &index));
bench.iter(|| execute_query("id", get_10_percent(), "", &index));
}
#[bench]
fn bench_id_range_hit_1_percent(bench: &mut Bencher) {
let index = get_index_0_to_100();
bench.iter(|| excute_query("id", get_1_percent(), "", &index));
bench.iter(|| execute_query("id", get_1_percent(), "", &index));
}
#[bench]
fn bench_id_range_hit_10_percent_intersect_with_10_percent(bench: &mut Bencher) {
let index = get_index_0_to_100();
bench.iter(|| excute_query("id", get_10_percent(), "AND id_name:few", &index));
bench.iter(|| execute_query("id", get_10_percent(), "AND id_name:few", &index));
}
#[bench]
fn bench_id_range_hit_1_percent_intersect_with_10_percent(bench: &mut Bencher) {
let index = get_index_0_to_100();
bench.iter(|| excute_query("id", get_1_percent(), "AND id_name:few", &index));
bench.iter(|| execute_query("id", get_1_percent(), "AND id_name:few", &index));
}
#[bench]
fn bench_id_range_hit_1_percent_intersect_with_90_percent(bench: &mut Bencher) {
let index = get_index_0_to_100();
bench.iter(|| excute_query("id", get_1_percent(), "AND id_name:many", &index));
bench.iter(|| execute_query("id", get_1_percent(), "AND id_name:many", &index));
}
#[bench]
fn bench_id_range_hit_1_percent_intersect_with_1_percent(bench: &mut Bencher) {
let index = get_index_0_to_100();
bench.iter(|| excute_query("id", get_1_percent(), "AND id_name:veryfew", &index));
bench.iter(|| execute_query("id", get_1_percent(), "AND id_name:veryfew", &index));
}
#[bench]
fn bench_id_range_hit_10_percent_intersect_with_90_percent(bench: &mut Bencher) {
let index = get_index_0_to_100();
bench.iter(|| excute_query("id", get_10_percent(), "AND id_name:many", &index));
bench.iter(|| execute_query("id", get_10_percent(), "AND id_name:many", &index));
}
#[bench]
fn bench_id_range_hit_90_percent_intersect_with_90_percent(bench: &mut Bencher) {
let index = get_index_0_to_100();
bench.iter(|| excute_query("id", get_90_percent(), "AND id_name:many", &index));
bench.iter(|| execute_query("id", get_90_percent(), "AND id_name:many", &index));
}
#[bench]
fn bench_id_range_hit_90_percent_intersect_with_10_percent(bench: &mut Bencher) {
let index = get_index_0_to_100();
bench.iter(|| excute_query("id", get_90_percent(), "AND id_name:few", &index));
bench.iter(|| execute_query("id", get_90_percent(), "AND id_name:few", &index));
}
#[bench]
fn bench_id_range_hit_90_percent_intersect_with_1_percent(bench: &mut Bencher) {
let index = get_index_0_to_100();
bench.iter(|| excute_query("id", get_90_percent(), "AND id_name:veryfew", &index));
bench.iter(|| execute_query("id", get_90_percent(), "AND id_name:veryfew", &index));
}
#[bench]
fn bench_id_range_hit_90_percent_multi(bench: &mut Bencher) {
let index = get_index_0_to_100();
bench.iter(|| excute_query("ids", get_90_percent(), "", &index));
bench.iter(|| execute_query("ids", get_90_percent(), "", &index));
}
#[bench]
fn bench_id_range_hit_10_percent_multi(bench: &mut Bencher) {
let index = get_index_0_to_100();
bench.iter(|| excute_query("ids", get_10_percent(), "", &index));
bench.iter(|| execute_query("ids", get_10_percent(), "", &index));
}
#[bench]
fn bench_id_range_hit_1_percent_multi(bench: &mut Bencher) {
let index = get_index_0_to_100();
bench.iter(|| excute_query("ids", get_1_percent(), "", &index));
bench.iter(|| execute_query("ids", get_1_percent(), "", &index));
}
#[bench]
fn bench_id_range_hit_10_percent_intersect_with_10_percent_multi(bench: &mut Bencher) {
let index = get_index_0_to_100();
bench.iter(|| excute_query("ids", get_10_percent(), "AND id_name:few", &index));
bench.iter(|| execute_query("ids", get_10_percent(), "AND id_name:few", &index));
}
#[bench]
fn bench_id_range_hit_1_percent_intersect_with_10_percent_multi(bench: &mut Bencher) {
let index = get_index_0_to_100();
bench.iter(|| excute_query("ids", get_1_percent(), "AND id_name:few", &index));
bench.iter(|| execute_query("ids", get_1_percent(), "AND id_name:few", &index));
}
#[bench]
fn bench_id_range_hit_1_percent_intersect_with_90_percent_multi(bench: &mut Bencher) {
let index = get_index_0_to_100();
bench.iter(|| excute_query("ids", get_1_percent(), "AND id_name:many", &index));
bench.iter(|| execute_query("ids", get_1_percent(), "AND id_name:many", &index));
}
#[bench]
fn bench_id_range_hit_1_percent_intersect_with_1_percent_multi(bench: &mut Bencher) {
let index = get_index_0_to_100();
bench.iter(|| excute_query("ids", get_1_percent(), "AND id_name:veryfew", &index));
bench.iter(|| execute_query("ids", get_1_percent(), "AND id_name:veryfew", &index));
}
#[bench]
fn bench_id_range_hit_10_percent_intersect_with_90_percent_multi(bench: &mut Bencher) {
let index = get_index_0_to_100();
bench.iter(|| excute_query("ids", get_10_percent(), "AND id_name:many", &index));
bench.iter(|| execute_query("ids", get_10_percent(), "AND id_name:many", &index));
}
#[bench]
fn bench_id_range_hit_90_percent_intersect_with_90_percent_multi(bench: &mut Bencher) {
let index = get_index_0_to_100();
bench.iter(|| excute_query("ids", get_90_percent(), "AND id_name:many", &index));
bench.iter(|| execute_query("ids", get_90_percent(), "AND id_name:many", &index));
}
#[bench]
fn bench_id_range_hit_90_percent_intersect_with_10_percent_multi(bench: &mut Bencher) {
let index = get_index_0_to_100();
bench.iter(|| excute_query("ids", get_90_percent(), "AND id_name:few", &index));
bench.iter(|| execute_query("ids", get_90_percent(), "AND id_name:few", &index));
}
#[bench]
fn bench_id_range_hit_90_percent_intersect_with_1_percent_multi(bench: &mut Bencher) {
let index = get_index_0_to_100();
bench.iter(|| excute_query("ids", get_90_percent(), "AND id_name:veryfew", &index));
bench.iter(|| execute_query("ids", get_90_percent(), "AND id_name:veryfew", &index));
}
}

View File

@@ -2,14 +2,16 @@ use std::ops::BitOr;
use serde::{Deserialize, Serialize};
use super::Cardinality;
use crate::schema::flags::{FastFlag, IndexedFlag, SchemaFlagList, StoredFlag};
/// DateTime Precision
#[derive(Clone, Copy, Debug, Hash, PartialEq, Eq, PartialOrd, Ord, Serialize, Deserialize)]
#[derive(
Clone, Copy, Debug, Hash, PartialEq, Eq, PartialOrd, Ord, Serialize, Deserialize, Default,
)]
#[serde(rename_all = "lowercase")]
pub enum DatePrecision {
/// Seconds precision
#[default]
Seconds,
/// Milli-seconds precision.
Milliseconds,
@@ -17,20 +19,13 @@ pub enum DatePrecision {
Microseconds,
}
impl Default for DatePrecision {
fn default() -> Self {
DatePrecision::Seconds
}
}
/// Defines how DateTime field should be handled by tantivy.
#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize, Default)]
pub struct DateOptions {
indexed: bool,
// This boolean has no effect if the field is not marked as indexed true.
fieldnorms: bool,
#[serde(skip_serializing_if = "Option::is_none")]
fast: Option<Cardinality>,
fast: bool,
stored: bool,
// Internal storage precision, used to optimize storage
// compression on fast fields.
@@ -54,18 +49,9 @@ impl DateOptions {
self.fieldnorms && self.indexed
}
/// Returns true iff the value is a fast field and multivalue.
pub fn is_multivalue_fast(&self) -> bool {
if let Some(cardinality) = self.fast {
cardinality == Cardinality::MultiValues
} else {
false
}
}
/// Returns true iff the value is a fast field.
pub fn is_fast(&self) -> bool {
self.fast.is_some()
self.fast
}
/// Set the field as stored.
@@ -107,19 +93,11 @@ impl DateOptions {
/// If more than one value is associated with a fast field, only the last one is
/// kept.
#[must_use]
pub fn set_fast(mut self, cardinality: Cardinality) -> DateOptions {
self.fast = Some(cardinality);
pub fn set_fast(mut self) -> DateOptions {
self.fast = true;
self
}
/// Returns the cardinality of the fastfield.
///
/// If the field has not been declared as a fastfield, then
/// the method returns `None`.
pub fn get_fastfield_cardinality(&self) -> Option<Cardinality> {
self.fast
}
/// Sets the precision for this DateTime field.
///
/// Internal storage precision, used to optimize storage
@@ -147,10 +125,7 @@ impl From<()> for DateOptions {
impl From<FastFlag> for DateOptions {
fn from(_: FastFlag) -> Self {
DateOptions {
indexed: false,
fieldnorms: false,
stored: false,
fast: Some(Cardinality::SingleValue),
fast: true,
..Default::default()
}
}
@@ -159,10 +134,7 @@ impl From<FastFlag> for DateOptions {
impl From<StoredFlag> for DateOptions {
fn from(_: StoredFlag) -> Self {
DateOptions {
indexed: false,
fieldnorms: false,
stored: true,
fast: None,
..Default::default()
}
}
@@ -173,8 +145,6 @@ impl From<IndexedFlag> for DateOptions {
DateOptions {
indexed: true,
fieldnorms: true,
stored: false,
fast: None,
..Default::default()
}
}
@@ -189,7 +159,7 @@ impl<T: Into<DateOptions>> BitOr<T> for DateOptions {
indexed: self.indexed | other.indexed,
fieldnorms: self.fieldnorms | other.fieldnorms,
stored: self.stored | other.stored,
fast: self.fast.or(other.fast),
fast: self.fast | other.fast,
precision: self.precision,
}
}

View File

@@ -8,7 +8,7 @@ use serde_json::Value as JsonValue;
use thiserror::Error;
use super::ip_options::IpAddrOptions;
use super::{Cardinality, IntoIpv6Addr};
use super::IntoIpv6Addr;
use crate::schema::bytes_options::BytesOptions;
use crate::schema::facet_options::FacetOptions;
use crate::schema::{
@@ -241,26 +241,6 @@ impl FieldType {
}
}
/// returns true if the field is fast.
pub fn fastfield_cardinality(&self) -> Option<Cardinality> {
match *self {
FieldType::Bytes(ref bytes_options) => {
bytes_options.is_fast().then_some(Cardinality::SingleValue)
}
FieldType::Str(ref text_options) => {
text_options.is_fast().then_some(Cardinality::MultiValues)
}
FieldType::U64(ref int_options)
| FieldType::I64(ref int_options)
| FieldType::F64(ref int_options)
| FieldType::Bool(ref int_options) => int_options.get_fastfield_cardinality(),
FieldType::Date(ref date_options) => date_options.get_fastfield_cardinality(),
FieldType::Facet(_) => Some(Cardinality::MultiValues),
FieldType::JsonObject(_) => None,
FieldType::IpAddr(ref ip_addr_options) => ip_addr_options.get_fastfield_cardinality(),
}
}
/// returns true if the field is normed (see [fieldnorms](crate::fieldnorm)).
pub fn has_fieldnorms(&self) -> bool {
match *self {

View File

@@ -4,7 +4,6 @@ use std::ops::BitOr;
use serde::{Deserialize, Serialize};
use super::flags::{FastFlag, IndexedFlag, SchemaFlagList, StoredFlag};
use super::Cardinality;
/// Trait to convert into an Ipv6Addr.
pub trait IntoIpv6Addr {
@@ -24,8 +23,7 @@ impl IntoIpv6Addr for IpAddr {
/// Define how an ip field should be handled by tantivy.
#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize, Default)]
pub struct IpAddrOptions {
#[serde(skip_serializing_if = "Option::is_none")]
fast: Option<Cardinality>,
fast: bool,
stored: bool,
indexed: bool,
fieldnorms: bool,
@@ -34,7 +32,7 @@ pub struct IpAddrOptions {
impl IpAddrOptions {
/// Returns true iff the value is a fast field.
pub fn is_fast(&self) -> bool {
self.fast.is_some()
self.fast
}
/// Returns `true` if the ip address should be stored in the doc store.
@@ -52,14 +50,6 @@ impl IpAddrOptions {
self.fieldnorms
}
/// Returns the cardinality of the fastfield.
///
/// If the field has not been declared as a fastfield, then
/// the method returns None.
pub fn get_fastfield_cardinality(&self) -> Option<Cardinality> {
self.fast
}
/// Set the field as normed.
///
/// Setting an integer as normed will generate
@@ -97,8 +87,8 @@ impl IpAddrOptions {
/// If more than one value is associated with a fast field, only the last one is
/// kept.
#[must_use]
pub fn set_fast(mut self, cardinality: Cardinality) -> Self {
self.fast = Some(cardinality);
pub fn set_fast(mut self) -> Self {
self.fast = true;
self
}
}
@@ -115,7 +105,7 @@ impl From<FastFlag> for IpAddrOptions {
fieldnorms: false,
indexed: false,
stored: false,
fast: Some(Cardinality::SingleValue),
fast: true,
}
}
}
@@ -126,7 +116,7 @@ impl From<StoredFlag> for IpAddrOptions {
fieldnorms: false,
indexed: false,
stored: true,
fast: None,
fast: false,
}
}
}
@@ -137,7 +127,7 @@ impl From<IndexedFlag> for IpAddrOptions {
fieldnorms: true,
indexed: true,
stored: false,
fast: None,
fast: false,
}
}
}
@@ -151,7 +141,7 @@ impl<T: Into<IpAddrOptions>> BitOr<T> for IpAddrOptions {
fieldnorms: self.fieldnorms | other.fieldnorms,
indexed: self.indexed | other.indexed,
stored: self.stored | other.stored,
fast: self.fast.or(other.fast),
fast: self.fast | other.fast,
}
}
}

View File

@@ -141,9 +141,9 @@ pub use self::index_record_option::IndexRecordOption;
pub use self::ip_options::{IntoIpv6Addr, IpAddrOptions};
pub use self::json_object_options::JsonObjectOptions;
pub use self::named_field_document::NamedFieldDocument;
pub use self::numeric_options::NumericOptions;
#[allow(deprecated)]
pub use self::numeric_options::{Cardinality, IntOptions};
pub use self::numeric_options::IntOptions;
pub use self::numeric_options::NumericOptions;
pub use self::schema::{DocParsingError, Schema, SchemaBuilder};
pub use self::term::Term;
pub use self::text_options::{TextFieldIndexing, TextOptions, STRING, TEXT};

View File

@@ -4,18 +4,6 @@ use serde::{Deserialize, Serialize};
use crate::schema::flags::{FastFlag, IndexedFlag, SchemaFlagList, StoredFlag};
/// Express whether a field is single-value or multi-valued.
#[derive(Clone, Copy, PartialEq, Eq, Debug, Serialize, Deserialize)]
pub enum Cardinality {
/// The document must have exactly one value associated with the document.
#[serde(rename = "single")]
SingleValue,
/// The document can have any number of values associated with the document.
/// This is more memory and CPU expensive than the `SingleValue` solution.
#[serde(rename = "multi")]
MultiValues,
}
#[deprecated(since = "0.17.0", note = "Use NumericOptions instead.")]
/// Deprecated use [`NumericOptions`] instead.
pub type IntOptions = NumericOptions;
@@ -27,8 +15,7 @@ pub struct NumericOptions {
indexed: bool,
// This boolean has no effect if the field is not marked as indexed too.
fieldnorms: bool, // This attribute only has an effect if indexed is true.
#[serde(skip_serializing_if = "Option::is_none")]
fast: Option<Cardinality>,
fast: bool,
stored: bool,
}
@@ -42,8 +29,7 @@ struct NumericOptionsDeser {
indexed: bool,
#[serde(default)]
fieldnorms: Option<bool>, // This attribute only has an effect if indexed is true.
#[serde(default)]
fast: Option<Cardinality>,
fast: bool,
stored: bool,
}
@@ -74,18 +60,9 @@ impl NumericOptions {
self.fieldnorms && self.indexed
}
/// Returns true iff the value is a fast field and multivalue.
pub fn is_multivalue_fast(&self) -> bool {
if let Some(cardinality) = self.fast {
cardinality == Cardinality::MultiValues
} else {
false
}
}
/// Returns true iff the value is a fast field.
pub fn is_fast(&self) -> bool {
self.fast.is_some()
self.fast
}
/// Set the field as stored.
@@ -127,18 +104,10 @@ impl NumericOptions {
/// If more than one value is associated with a fast field, only the last one is
/// kept.
#[must_use]
pub fn set_fast(mut self, cardinality: Cardinality) -> NumericOptions {
self.fast = Some(cardinality);
pub fn set_fast(mut self) -> NumericOptions {
self.fast = true;
self
}
/// Returns the cardinality of the fastfield.
///
/// If the field has not been declared as a fastfield, then
/// the method returns `None`.
pub fn get_fastfield_cardinality(&self) -> Option<Cardinality> {
self.fast
}
}
impl From<()> for NumericOptions {
@@ -153,7 +122,7 @@ impl From<FastFlag> for NumericOptions {
indexed: false,
fieldnorms: false,
stored: false,
fast: Some(Cardinality::SingleValue),
fast: true,
}
}
}
@@ -164,7 +133,7 @@ impl From<StoredFlag> for NumericOptions {
indexed: false,
fieldnorms: false,
stored: true,
fast: None,
fast: false,
}
}
}
@@ -175,7 +144,7 @@ impl From<IndexedFlag> for NumericOptions {
indexed: true,
fieldnorms: true,
stored: false,
fast: None,
fast: false,
}
}
}
@@ -189,7 +158,7 @@ impl<T: Into<NumericOptions>> BitOr<T> for NumericOptions {
indexed: self.indexed | other.indexed,
fieldnorms: self.fieldnorms | other.fieldnorms,
stored: self.stored | other.stored,
fast: self.fast.or(other.fast),
fast: self.fast | other.fast,
}
}
}
@@ -221,7 +190,7 @@ mod tests {
&NumericOptions {
indexed: true,
fieldnorms: true,
fast: None,
fast: false,
stored: false
}
);
@@ -239,7 +208,7 @@ mod tests {
&NumericOptions {
indexed: false,
fieldnorms: false,
fast: None,
fast: false,
stored: false
}
);
@@ -258,7 +227,7 @@ mod tests {
&NumericOptions {
indexed: true,
fieldnorms: false,
fast: None,
fast: false,
stored: false
}
);
@@ -278,7 +247,7 @@ mod tests {
&NumericOptions {
indexed: false,
fieldnorms: true,
fast: None,
fast: false,
stored: false
}
);

View File

@@ -484,7 +484,6 @@ mod tests {
use serde_json;
use crate::schema::field_type::ValueParsingError;
use crate::schema::numeric_options::Cardinality::SingleValue;
use crate::schema::schema::DocParsingError::InvalidJson;
use crate::schema::*;
@@ -506,19 +505,13 @@ mod tests {
#[test]
pub fn test_schema_serialization() {
let mut schema_builder = Schema::builder();
let count_options = NumericOptions::default()
.set_stored()
.set_fast(Cardinality::SingleValue);
let popularity_options = NumericOptions::default()
.set_stored()
.set_fast(Cardinality::SingleValue);
let count_options = NumericOptions::default().set_stored().set_fast();
let popularity_options = NumericOptions::default().set_stored().set_fast();
let score_options = NumericOptions::default()
.set_indexed()
.set_fieldnorm()
.set_fast(Cardinality::SingleValue);
let is_read_options = NumericOptions::default()
.set_stored()
.set_fast(Cardinality::SingleValue);
.set_fast();
let is_read_options = NumericOptions::default().set_stored().set_fast();
schema_builder.add_text_field("title", TEXT);
schema_builder.add_text_field(
"author",
@@ -643,12 +636,8 @@ mod tests {
#[test]
pub fn test_document_to_json() {
let mut schema_builder = Schema::builder();
let count_options = NumericOptions::default()
.set_stored()
.set_fast(Cardinality::SingleValue);
let is_read_options = NumericOptions::default()
.set_stored()
.set_fast(Cardinality::SingleValue);
let count_options = NumericOptions::default().set_stored().set_fast();
let is_read_options = NumericOptions::default().set_stored().set_fast();
schema_builder.add_text_field("title", TEXT);
schema_builder.add_text_field("author", STRING);
schema_builder.add_u64_field("count", count_options);
@@ -748,15 +737,9 @@ mod tests {
#[test]
pub fn test_parse_document() {
let mut schema_builder = Schema::builder();
let count_options = NumericOptions::default()
.set_stored()
.set_fast(Cardinality::SingleValue);
let popularity_options = NumericOptions::default()
.set_stored()
.set_fast(Cardinality::SingleValue);
let score_options = NumericOptions::default()
.set_indexed()
.set_fast(Cardinality::SingleValue);
let count_options = NumericOptions::default().set_stored().set_fast();
let popularity_options = NumericOptions::default().set_stored().set_fast();
let score_options = NumericOptions::default().set_indexed().set_fast();
let title_field = schema_builder.add_text_field("title", TEXT);
let author_field = schema_builder.add_text_field("author", STRING);
let count_field = schema_builder.add_u64_field("count", count_options);
@@ -907,7 +890,7 @@ mod tests {
.set_stored()
.set_indexed()
.set_fieldnorm()
.set_fast(SingleValue);
.set_fast();
schema_builder.add_text_field("_id", id_options);
schema_builder.add_date_field("_timestamp", timestamp_options);