Compare commits

...

13 Commits

Author SHA1 Message Date
Paul Masurel
8303bd8e6d Fixed num_vals 2023-01-19 11:30:45 +09:00
Paul Masurel
1e27387c0f Date fast field working. Precision still broken 2023-01-18 18:23:21 +09:00
Paul Masurel
ae28aab67c Demuxer 2023-01-18 16:13:21 +09:00
Paul Masurel
8bedcab91d blop 2023-01-18 16:13:21 +09:00
Paul Masurel
23ace8bd44 connected datetime 2023-01-18 16:13:21 +09:00
Paul Masurel
58eec2c214 blop 2023-01-18 16:13:21 +09:00
Paul Masurel
7587656f1e Sisiphe work 2023-01-18 16:13:21 +09:00
Paul Masurel
e435b6fdd1 Disconnected facet / fast field merges / examples 2023-01-18 16:13:21 +09:00
Paul Masurel
007168ff4c Changed add_document 2023-01-18 16:13:21 +09:00
Paul Masurel
29c1a76d5a Removed cardinality from fast field options. 2023-01-18 16:13:21 +09:00
Paul Masurel
a0c1ba46c7 added columnar to workspace 2023-01-18 16:13:21 +09:00
Paul Masurel
5110ee7456 Removing cardinality. 2023-01-18 16:13:21 +09:00
Paul Masurel
575931bee4 Added solution to force the type of a column. 2023-01-18 16:13:21 +09:00
75 changed files with 2862 additions and 3685 deletions

View File

@@ -59,6 +59,7 @@ sstable = { version="0.1", path="./sstable", package ="tantivy-sstable", optiona
stacker = { version="0.1", path="./stacker", package ="tantivy-stacker" }
tantivy-query-grammar = { version= "0.19.0", path="./query-grammar" }
tantivy-bitpacker = { version= "0.3", path="./bitpacker" }
columnar = { version= "0.1", path="./columnar", package="tantivy-columnar" }
common = { version= "0.5", path = "./common/", package = "tantivy-common" }
fastfield_codecs = { version= "0.3", path="./fastfield_codecs", default-features = false }
tokenizer-api = { version="0.1", path="./tokenizer-api", package="tantivy-tokenizer-api" }
@@ -107,7 +108,7 @@ unstable = [] # useful for benches.
quickwit = ["sstable"]
[workspace]
members = ["query-grammar", "bitpacker", "common", "fastfield_codecs", "ownedbytes", "stacker", "sstable", "tokenizer-api"]
members = ["query-grammar", "bitpacker", "common", "fastfield_codecs", "ownedbytes", "stacker", "sstable", "tokenizer-api", "columnar"]
# Following the "fail" crate best practises, we isolate
# tests that define specific behavior in fail check points

18
TODO.txt Normal file
View File

@@ -0,0 +1,18 @@
Make schema_builder API fluent.
fix doc serialization and prevent compression problems
u64, etc. should return Result<Option> now that we support optional columns: a missing column is really not an error
remove fastfield codecs
ditch the first_or_default trick. if it is still useful, improve its implementation.
rename FastFieldReaders::open to load
remove fast field reader
find a way to unify the two DateTime.
re-add type check in the filter wrapper
add unit test on columnar list columns.
make sure sort works

View File

@@ -24,9 +24,5 @@ proptest = "1"
more-asserts = "0.3.0"
rand = "0.8.3"
# temporary
[workspace]
members = []
[features]
unstable = []

View File

@@ -26,6 +26,7 @@ Add alignment?
Consider another codec to bridge the gap between few and 5k elements
# Cleanup and rationalization
remove the 6 bit limitation of columntype. use 4 + 4 bits instead.
in benchmark, unify percent vs ratio, f32 vs f64.
investigate if should have better errors? io::Error is overused at the moment.
rename rank/select in unit tests

View File

@@ -6,6 +6,7 @@ use sstable::{Dictionary, VoidSSTable};
use crate::column::Column;
use crate::column_index::ColumnIndex;
use crate::RowId;
/// Dictionary encoded column.
#[derive(Clone)]
@@ -24,6 +25,10 @@ impl BytesColumn {
pub fn term_ords(&self) -> &Column<u64> {
&self.term_ord_column
}
pub fn num_rows(&self) -> RowId {
self.term_ord_column.num_rows()
}
}
impl Deref for BytesColumn {

View File

@@ -8,7 +8,7 @@ use common::BinarySerializable;
pub use dictionary_encoded::BytesColumn;
pub use serialize::{open_column_bytes, open_column_u64, serialize_column_u64};
use crate::column_index::ColumnIndex;
use crate::column_index::{ColumnIndex, Set};
use crate::column_values::ColumnValues;
use crate::{Cardinality, RowId};
@@ -18,9 +18,17 @@ pub struct Column<T> {
pub values: Arc<dyn ColumnValues<T>>,
}
use crate::column_index::Set;
impl<T: PartialOrd> Column<T> {
    /// Returns the number of rows covered by this column.
    ///
    /// For a full column this is the number of stored values; for an
    /// optional column it is the row count reported by the optional index.
    ///
    /// # Panics
    ///
    /// Multivalued columns are not handled yet and hit a `todo!()`.
    pub fn num_rows(&self) -> RowId {
        match &self.idx {
            ColumnIndex::Multivalued(_) => todo!(),
            ColumnIndex::Optional(optional_index) => optional_index.num_rows(),
            ColumnIndex::Full => self.values.num_vals(),
        }
    }
}
impl<T: PartialOrd + Copy + Send + Sync + 'static> Column<T> {
pub fn first(&self, row_id: RowId) -> Option<T> {
match &self.idx {
ColumnIndex::Full => Some(self.values.get_val(row_id)),
@@ -33,6 +41,13 @@ impl<T: PartialOrd> Column<T> {
}
}
}
/// Consumes the column and wraps it into a `ColumnValues` whose `get_val`
/// yields the first value of a row, or `default_value` when the row has
/// no value.
pub fn first_or_default_col(self, default_value: T) -> Arc<dyn ColumnValues<T>> {
    let with_default = FirstValueWithDefault {
        column: self,
        default_value,
    };
    Arc::new(with_default)
}
}
impl<T> Deref for Column<T> {
@@ -54,3 +69,31 @@ impl BinarySerializable for Cardinality {
Ok(cardinality)
}
}
// TODO simplify or optimize
/// Adapter exposing a `Column<T>` as a `ColumnValues<T>`: each row maps to its
/// first value, or to `default_value` when the row has no value.
struct FirstValueWithDefault<T: Copy> {
    column: Column<T>,
    default_value: T,
}

impl<T: PartialOrd + Send + Sync + Copy + 'static> FirstValueWithDefault<T> {
    /// Returns true if `get_val` may fall back to `default_value`, i.e. if the
    /// column is not guaranteed to hold a value for every row.
    fn may_return_default(&self) -> bool {
        match &self.column.idx {
            ColumnIndex::Full => false,
            ColumnIndex::Optional(_) | ColumnIndex::Multivalued(_) => true,
        }
    }
}

impl<T: PartialOrd + Send + Sync + Copy + 'static> ColumnValues<T> for FirstValueWithDefault<T> {
    fn get_val(&self, idx: u32) -> T {
        self.column.first(idx).unwrap_or(self.default_value)
    }

    /// Lower bound of the values returned by `get_val`.
    ///
    /// The default value is folded into the bound whenever it may be returned,
    /// so that the bound stays valid for rows without a value.
    fn min_value(&self) -> T {
        let column_min = self.column.values.min_value();
        if self.may_return_default() && self.default_value < column_min {
            self.default_value
        } else {
            column_min
        }
    }

    /// Upper bound of the values returned by `get_val`.
    ///
    /// The default value is folded into the bound whenever it may be returned,
    /// so that the bound stays valid for rows without a value.
    fn max_value(&self) -> T {
        let column_max = self.column.values.max_value();
        if self.may_return_default() && self.default_value > column_max {
            self.default_value
        } else {
            column_max
        }
    }

    /// One value is exposed per row, so this is the row count of the column.
    fn num_vals(&self) -> u32 {
        self.column.num_rows()
    }
}

View File

@@ -27,14 +27,4 @@ impl<'a> ColumnIndex<'a> {
ColumnIndex::Multivalued(_) => Cardinality::Multivalued,
}
}
pub fn num_rows(&self) -> RowId {
match self {
ColumnIndex::Full => {
todo!()
}
ColumnIndex::Optional(optional_index) => optional_index.num_rows(),
ColumnIndex::Multivalued(multivalued_index) => multivalued_index.num_vals() - 1,
}
}
}

View File

@@ -194,6 +194,20 @@ impl MonotonicallyMappableToU64 for i64 {
}
}
/// Maps a `DateTime` to and from `u64` by converting its microsecond
/// timestamp with `common::i64_to_u64` / `common::u64_to_i64`.
impl MonotonicallyMappableToU64 for crate::DateTime {
    #[inline(always)]
    fn to_u64(self) -> u64 {
        let micros: i64 = self.timestamp_micros;
        common::i64_to_u64(micros)
    }

    #[inline(always)]
    fn from_u64(val: u64) -> Self {
        let timestamp_micros: i64 = common::u64_to_i64(val);
        crate::DateTime { timestamp_micros }
    }
}
impl MonotonicallyMappableToU64 for bool {
#[inline(always)]
fn to_u64(self) -> u64 {

View File

@@ -8,9 +8,10 @@ use crate::InvalidData;
/// - bits[3..6]: Numerical type if necessary.
#[derive(Hash, Eq, PartialEq, Debug, Clone, Copy)]
pub enum ColumnType {
Bytes,
Str,
Numerical(NumericalType),
Bool,
DateTime,
}
impl ColumnType {
@@ -19,7 +20,7 @@ impl ColumnType {
let column_type_category;
let numerical_type_code: u8;
match self {
ColumnType::Bytes => {
ColumnType::Str => {
column_type_category = ColumnTypeCategory::Str;
numerical_type_code = 0u8;
}
@@ -31,6 +32,10 @@ impl ColumnType {
column_type_category = ColumnTypeCategory::Bool;
numerical_type_code = 0u8;
}
ColumnType::DateTime => {
column_type_category = ColumnTypeCategory::DateTime;
numerical_type_code = 0u8;
}
}
place_bits::<0, 3>(column_type_category.to_code()) | place_bits::<3, 6>(numerical_type_code)
}
@@ -53,16 +58,56 @@ impl ColumnType {
if numerical_type_code != 0u8 {
return Err(InvalidData);
}
Ok(ColumnType::Bytes)
Ok(ColumnType::Str)
}
ColumnTypeCategory::Numerical => {
let numerical_type = NumericalType::try_from_code(numerical_type_code)?;
Ok(ColumnType::Numerical(numerical_type))
}
ColumnTypeCategory::DateTime => {
if numerical_type_code != 0u8 {
return Err(InvalidData);
}
Ok(ColumnType::DateTime)
}
}
}
}
/// Implemented by value types that statically map to a single `ColumnType`.
pub trait HasAssociatedColumnType: 'static + Send + Sync + Copy + PartialOrd {
/// Returns the `ColumnType` associated with the implementing type.
fn column_type() -> ColumnType;
}
impl HasAssociatedColumnType for u64 {
fn column_type() -> ColumnType {
ColumnType::Numerical(NumericalType::U64)
}
}
impl HasAssociatedColumnType for i64 {
fn column_type() -> ColumnType {
ColumnType::Numerical(NumericalType::I64)
}
}
impl HasAssociatedColumnType for f64 {
fn column_type() -> ColumnType {
ColumnType::Numerical(NumericalType::F64)
}
}
impl HasAssociatedColumnType for bool {
fn column_type() -> ColumnType {
ColumnType::Bool
}
}
impl HasAssociatedColumnType for crate::DateTime {
fn column_type() -> ColumnType {
ColumnType::DateTime
}
}
/// Column types are grouped into different categories that
/// corresponds to the different types of `JsonValue` types.
///
@@ -76,6 +121,7 @@ pub(crate) enum ColumnTypeCategory {
Bool = 0u8,
Str = 1u8,
Numerical = 2u8,
DateTime = 3u8,
}
impl ColumnTypeCategory {
@@ -88,6 +134,7 @@ impl ColumnTypeCategory {
0u8 => Ok(Self::Bool),
1u8 => Ok(Self::Str),
2u8 => Ok(Self::Numerical),
3u8 => Ok(Self::DateTime),
_ => Err(InvalidData),
}
}
@@ -109,7 +156,7 @@ mod tests {
assert!(column_type_set.insert(column_type));
}
}
assert_eq!(column_type_set.len(), 2 + 3);
assert_eq!(column_type_set.len(), 3 + 3);
}
#[test]

View File

@@ -23,6 +23,6 @@ mod format_version;
mod reader;
mod writer;
pub use column_type::ColumnType;
pub use column_type::{ColumnType, HasAssociatedColumnType};
pub use reader::ColumnarReader;
pub use writer::ColumnarWriter;

View File

@@ -44,7 +44,7 @@ impl ColumnarReader {
})
}
// TODO fix ugly API
// TODO Add unit tests
pub fn list_columns(&self) -> io::Result<Vec<(String, DynamicColumnHandle)>> {
let mut stream = self.column_dictionary.stream()?;
let mut results = Vec::new();
@@ -55,7 +55,8 @@ impl ColumnarReader {
.map_err(|_| io_invalid_data(format!("Unknown column code `{column_code}`")))?;
let range = stream.value().clone();
let column_name =
String::from_utf8_lossy(&key_bytes[..key_bytes.len() - 1]).to_string();
// The last two bytes are respectively the 0u8 separator and the column_type.
String::from_utf8_lossy(&key_bytes[..key_bytes.len() - 2]).to_string();
let file_slice = self
.column_data
.slice(range.start as usize..range.end as usize);

View File

@@ -102,18 +102,29 @@ pub(crate) struct NumericalColumnWriter {
column_writer: ColumnWriter,
}
impl NumericalColumnWriter {
    /// Pins the numerical type of this column to `numerical_type`.
    ///
    /// After this call the set of compatible types is a single static type.
    ///
    /// # Panics
    ///
    /// Panics if `numerical_type` is not accepted by the current state, i.e.
    /// it conflicts with the values seen so far or with a previously forced
    /// type.
    pub fn force_numerical_type(&mut self, numerical_type: NumericalType) {
        assert!(
            self.compatible_numerical_types
                .is_type_accepted(numerical_type),
            "cannot force numerical type {numerical_type:?}: it is incompatible with the values \
             or the type already recorded for this column"
        );
        self.compatible_numerical_types = CompatibleNumericalTypes::StaticType(numerical_type);
    }
}
/// State used to store what types are still acceptable
/// after having seen a set of numerical values.
#[derive(Clone, Copy)]
struct CompatibleNumericalTypes {
all_values_within_i64_range: bool,
all_values_within_u64_range: bool,
// f64 is always acceptable.
enum CompatibleNumericalTypes {
Dynamic {
all_values_within_i64_range: bool,
all_values_within_u64_range: bool,
},
StaticType(NumericalType),
}
impl Default for CompatibleNumericalTypes {
fn default() -> CompatibleNumericalTypes {
CompatibleNumericalTypes {
CompatibleNumericalTypes::Dynamic {
all_values_within_i64_range: true,
all_values_within_u64_range: true,
}
@@ -121,31 +132,54 @@ impl Default for CompatibleNumericalTypes {
}
impl CompatibleNumericalTypes {
/// Returns true if `numerical_type` can still be used for the column.
///
/// With a static type, only that exact type is accepted. In dynamic mode,
/// `I64` and `U64` are accepted as long as the matching range flag is still
/// set, and `F64` is always accepted.
fn is_type_accepted(&self, numerical_type: NumericalType) -> bool {
    let (within_i64, within_u64) = match self {
        CompatibleNumericalTypes::StaticType(forced_type) => {
            return *forced_type == numerical_type;
        }
        CompatibleNumericalTypes::Dynamic {
            all_values_within_i64_range,
            all_values_within_u64_range,
        } => (*all_values_within_i64_range, *all_values_within_u64_range),
    };
    match numerical_type {
        NumericalType::I64 => within_i64,
        NumericalType::U64 => within_u64,
        NumericalType::F64 => true,
    }
}
fn accept_value(&mut self, numerical_value: NumericalValue) {
match numerical_value {
NumericalValue::I64(val_i64) => {
let value_within_u64_range = val_i64 >= 0i64;
self.all_values_within_u64_range &= value_within_u64_range;
}
NumericalValue::U64(val_u64) => {
let value_within_i64_range = val_u64 < i64::MAX as u64;
self.all_values_within_i64_range &= value_within_i64_range;
}
NumericalValue::F64(_) => {
self.all_values_within_i64_range = false;
self.all_values_within_u64_range = false;
match self {
CompatibleNumericalTypes::Dynamic {
all_values_within_i64_range,
all_values_within_u64_range,
} => match numerical_value {
NumericalValue::I64(val_i64) => {
let value_within_u64_range = val_i64 >= 0i64;
*all_values_within_u64_range &= value_within_u64_range;
}
NumericalValue::U64(val_u64) => {
let value_within_i64_range = val_u64 < i64::MAX as u64;
*all_values_within_i64_range &= value_within_i64_range;
}
NumericalValue::F64(_) => {
*all_values_within_i64_range = false;
*all_values_within_u64_range = false;
}
},
CompatibleNumericalTypes::StaticType(typ) => {
assert_eq!(numerical_value.numerical_type(), *typ);
}
}
}
pub fn to_numerical_type(self) -> NumericalType {
if self.all_values_within_i64_range {
NumericalType::I64
} else if self.all_values_within_u64_range {
NumericalType::U64
} else {
NumericalType::F64
for numerical_type in [NumericalType::I64, NumericalType::U64] {
if self.is_type_accepted(numerical_type) {
return numerical_type;
}
}
NumericalType::F64
}
}
@@ -262,4 +296,27 @@ mod tests {
test_column_writer_coercion_aux(&[1i64.into(), 1u64.into()], NumericalType::I64);
test_column_writer_coercion_aux(&[u64::MAX.into(), (-1i64).into()], NumericalType::F64);
}
// Feeding a value of another type to a statically typed state must panic.
#[test]
#[should_panic]
fn test_compatible_numerical_types_static_incompatible_type() {
    let forced_type = NumericalType::U64;
    let mut static_types = CompatibleNumericalTypes::StaticType(forced_type);
    let mismatching_value = NumericalValue::I64(1i64);
    static_types.accept_value(mismatching_value);
}
// A value of the statically forced type is accepted without panicking,
// and the forced type is left untouched.
#[test]
fn test_compatible_numerical_types_static_different_type_forbidden() {
    let mut compatible_numerical_types =
        CompatibleNumericalTypes::StaticType(NumericalType::U64);
    compatible_numerical_types.accept_value(NumericalValue::U64(u64::MAX));
    assert_eq!(
        compatible_numerical_types.to_numerical_type(),
        NumericalType::U64
    );
}
// A statically forced type must be returned as is by `to_numerical_type`,
// for every numerical type.
#[test]
fn test_compatible_numerical_types_static() {
    for typ in [NumericalType::I64, NumericalType::U64, NumericalType::F64] {
        let compatible_numerical_types = CompatibleNumericalTypes::StaticType(typ);
        assert_eq!(compatible_numerical_types.to_numerical_type(), typ);
    }
}
}

View File

@@ -47,6 +47,7 @@ struct SpareBuffers {
/// ```
pub struct ColumnarWriter {
numerical_field_hash_map: ArenaHashMap,
datetime_field_hash_map: ArenaHashMap,
bool_field_hash_map: ArenaHashMap,
bytes_field_hash_map: ArenaHashMap,
arena: MemoryArena,
@@ -61,6 +62,7 @@ impl Default for ColumnarWriter {
numerical_field_hash_map: ArenaHashMap::new(10_000),
bool_field_hash_map: ArenaHashMap::new(10_000),
bytes_field_hash_map: ArenaHashMap::new(10_000),
datetime_field_hash_map: ArenaHashMap::new(10_000),
dictionaries: Vec::new(),
arena: MemoryArena::default(),
buffers: SpareBuffers::default(),
@@ -68,20 +70,90 @@ impl Default for ColumnarWriter {
}
}
/// Applies `updater` to the entry stored under `column_name` in
/// `arena_hash_map`, forwarding to `ArenaHashMap::mutate_or_create`.
///
/// # Panics
///
/// Panics if `column_name` contains a 0 byte.
#[inline]
fn mutate_or_create_column<V, TMutator>(
    arena_hash_map: &mut ArenaHashMap,
    column_name: &str,
    updater: TMutator,
) where
    V: Copy + 'static,
    TMutator: FnMut(Option<V>) -> V,
{
    let key: &[u8] = column_name.as_bytes();
    let has_zero_byte = key.iter().any(|&byte| byte == 0u8);
    assert!(!has_zero_byte, "key may not contain the 0 byte");
    arena_hash_map.mutate_or_create(key, updater);
}
impl ColumnarWriter {
/// Returns the memory usage of the writer: the arena plus every per-type
/// column hash map (numerical, bool, bytes and datetime).
pub fn mem_usage(&self) -> usize {
    // TODO add dictionary builders.
    self.arena.mem_usage()
        + self.numerical_field_hash_map.mem_usage()
        + self.bool_field_hash_map.mem_usage()
        + self.bytes_field_hash_map.mem_usage()
        + self.datetime_field_hash_map.mem_usage()
}
/// Declares a column of the given type, creating its writer if it does not
/// exist yet. No value is recorded.
///
/// For numerical columns, the numerical type is additionally forced (see
/// `force_numerical_type`).
///
/// NOTE(review): the `Str` arm creates writers through `Default`, whereas
/// `record_str` uses a custom initializer ("each column has its own
/// dictionary"). Confirm a default `StrColumnWriter` refers to a valid
/// dictionary.
///
/// # Panics
///
/// Panics if `column_name` contains a 0 byte.
pub fn record_column_type(&mut self, column_name: &str, column_type: ColumnType) {
    match column_type {
        ColumnType::Numerical(numerical_type) => {
            self.force_numerical_type(column_name, numerical_type);
        }
        ColumnType::DateTime => {
            mutate_or_create_column(
                &mut self.datetime_field_hash_map,
                column_name,
                |existing: Option<ColumnWriter>| existing.unwrap_or_default(),
            );
        }
        ColumnType::Bool => {
            mutate_or_create_column(
                &mut self.bool_field_hash_map,
                column_name,
                |existing: Option<ColumnWriter>| existing.unwrap_or_default(),
            );
        }
        ColumnType::Str => {
            mutate_or_create_column(
                &mut self.bytes_field_hash_map,
                column_name,
                |existing: Option<StrColumnWriter>| existing.unwrap_or_default(),
            );
        }
    }
}
/// Forces the numerical type of the column `column_name`, creating the
/// column writer if it does not exist yet.
///
/// # Panics
///
/// Panics if `column_name` contains a 0 byte, or if the column writer
/// rejects `numerical_type`.
pub fn force_numerical_type(&mut self, column_name: &str, numerical_type: NumericalType) {
    let force_type = |existing: Option<NumericalColumnWriter>| {
        let mut writer: NumericalColumnWriter = existing.unwrap_or_default();
        writer.force_numerical_type(numerical_type);
        writer
    };
    mutate_or_create_column(&mut self.numerical_field_hash_map, column_name, force_type);
}
pub fn record_numerical<T: Into<NumericalValue> + Copy>(
&mut self,
doc: RowId,
column_name: &str,
numerical_value: T,
) {
assert!(
!column_name.as_bytes().contains(&0u8),
"key may not contain the 0 byte"
);
let (hash_map, arena) = (&mut self.numerical_field_hash_map, &mut self.arena);
hash_map.mutate_or_create(
column_name.as_bytes(),
mutate_or_create_column(
hash_map,
column_name,
|column_opt: Option<NumericalColumnWriter>| {
let mut column: NumericalColumnWriter = column_opt.unwrap_or_default();
column.record_numerical_value(doc, numerical_value.into(), arena);
@@ -91,33 +163,32 @@ impl ColumnarWriter {
}
pub fn record_bool(&mut self, doc: RowId, column_name: &str, val: bool) {
assert!(
!column_name.as_bytes().contains(&0u8),
"key may not contain the 0 byte"
);
let (hash_map, arena) = (&mut self.bool_field_hash_map, &mut self.arena);
hash_map.mutate_or_create(
column_name.as_bytes(),
|column_opt: Option<ColumnWriter>| {
let mut column: ColumnWriter = column_opt.unwrap_or_default();
column.record(doc, val, arena);
column
},
);
mutate_or_create_column(hash_map, column_name, |column_opt: Option<ColumnWriter>| {
let mut column: ColumnWriter = column_opt.unwrap_or_default();
column.record(doc, val, arena);
column
});
}
/// Records a datetime value for row `doc` in the column `column_name`.
///
/// The value is stored as an `i64` numerical value holding the timestamp
/// in microseconds.
///
/// # Panics
///
/// Panics if `column_name` contains a 0 byte.
pub fn record_datetime(&mut self, doc: RowId, column_name: &str, datetime: crate::DateTime) {
    let arena = &mut self.arena;
    mutate_or_create_column(
        &mut self.datetime_field_hash_map,
        column_name,
        |existing: Option<ColumnWriter>| {
            let mut writer: ColumnWriter = existing.unwrap_or_default();
            let timestamp = NumericalValue::I64(datetime.timestamp_micros);
            writer.record(doc, timestamp, arena);
            writer
        },
    );
}
pub fn record_str(&mut self, doc: RowId, column_name: &str, value: &str) {
assert!(
!column_name.as_bytes().contains(&0u8),
"key may not contain the 0 byte"
);
let (hash_map, arena, dictionaries) = (
&mut self.bytes_field_hash_map,
&mut self.arena,
&mut self.dictionaries,
);
hash_map.mutate_or_create(
column_name.as_bytes(),
mutate_or_create_column(
hash_map,
column_name,
|column_opt: Option<StrColumnWriter>| {
let mut column: StrColumnWriter = column_opt.unwrap_or_else(|| {
// Each column has its own dictionary
@@ -133,25 +204,30 @@ impl ColumnarWriter {
pub fn serialize(&mut self, num_docs: RowId, wrt: &mut dyn io::Write) -> io::Result<()> {
let mut serializer = ColumnarSerializer::new(wrt);
let mut field_columns: Vec<(&[u8], ColumnTypeCategory, Addr)> = self
let mut columns: Vec<(&[u8], ColumnTypeCategory, Addr)> = self
.numerical_field_hash_map
.iter()
.map(|(term, addr, _)| (term, ColumnTypeCategory::Numerical, addr))
.map(|(column_name, addr, _)| (column_name, ColumnTypeCategory::Numerical, addr))
.collect();
field_columns.extend(
columns.extend(
self.bytes_field_hash_map
.iter()
.map(|(term, addr, _)| (term, ColumnTypeCategory::Str, addr)),
.map(|(column_name, addr, _)| (column_name, ColumnTypeCategory::Str, addr)),
);
field_columns.extend(
columns.extend(
self.bool_field_hash_map
.iter()
.map(|(term, addr, _)| (term, ColumnTypeCategory::Bool, addr)),
.map(|(column_name, addr, _)| (column_name, ColumnTypeCategory::Bool, addr)),
);
field_columns.sort_unstable_by_key(|(column_name, col_type, _)| (*column_name, *col_type));
columns.extend(
self.datetime_field_hash_map
.iter()
.map(|(column_name, addr, _)| (column_name, ColumnTypeCategory::DateTime, addr)),
);
columns.sort_unstable_by_key(|(column_name, col_type, _)| (*column_name, *col_type));
let (arena, buffers, dictionaries) = (&self.arena, &mut self.buffers, &self.dictionaries);
let mut symbol_byte_buffer: Vec<u8> = Vec::new();
for (column_name, bytes_or_numerical, addr) in field_columns {
for (column_name, bytes_or_numerical, addr) in columns {
match bytes_or_numerical {
ColumnTypeCategory::Bool => {
let column_writer: ColumnWriter = self.bool_field_hash_map.read(addr);
@@ -172,7 +248,7 @@ impl ColumnarWriter {
&dictionaries[str_column_writer.dictionary_id as usize];
let cardinality = str_column_writer.column_writer.get_cardinality(num_docs);
let mut column_serializer =
serializer.serialize_column(column_name, ColumnType::Bytes);
serializer.serialize_column(column_name, ColumnType::Str);
serialize_bytes_column(
cardinality,
num_docs,
@@ -198,6 +274,20 @@ impl ColumnarWriter {
&mut column_serializer,
)?;
}
ColumnTypeCategory::DateTime => {
let column_writer: ColumnWriter = self.datetime_field_hash_map.read(addr);
let cardinality = column_writer.get_cardinality(num_docs);
let mut column_serializer =
serializer.serialize_column(column_name, ColumnType::DateTime);
serialize_numerical_column(
cardinality,
num_docs,
NumericalType::I64,
column_writer.operation_iterator(arena, &mut symbol_byte_buffer),
buffers,
&mut column_serializer,
)?;
}
};
}
serializer.finalize()?;

View File

@@ -97,10 +97,10 @@ mod tests {
#[test]
fn test_prepare_key_bytes() {
let mut buffer: Vec<u8> = b"somegarbage".to_vec();
prepare_key(b"root\0child", ColumnType::Bytes, &mut buffer);
prepare_key(b"root\0child", ColumnType::Str, &mut buffer);
assert_eq!(buffer.len(), 12);
assert_eq!(&buffer[..10], b"root\0child");
assert_eq!(buffer[10], 0u8);
assert_eq!(buffer[11], ColumnType::Bytes.to_code());
assert_eq!(buffer[11], ColumnType::Str.to_code());
}
}

View File

@@ -6,7 +6,6 @@ use common::{HasLen, OwnedBytes};
use crate::column::{BytesColumn, Column};
use crate::columnar::ColumnType;
use crate::DateTime;
#[derive(Clone)]
pub enum DynamicColumn {
@@ -15,33 +14,35 @@ pub enum DynamicColumn {
U64(Column<u64>),
F64(Column<f64>),
IpAddr(Column<IpAddr>),
DateTime(Column<DateTime>),
Str(BytesColumn),
DateTime(Column<crate::DateTime>),
}
impl From<Column<i64>> for DynamicColumn {
fn from(column_i64: Column<i64>) -> Self {
DynamicColumn::I64(column_i64)
}
/// Generates the conversions between `DynamicColumn` and a statically typed
/// `Column<$typ>` stored in the `$enum_name` variant:
/// - `DynamicColumn -> Option<Column<$typ>>`, `None` if the variant differs,
/// - `Column<$typ> -> DynamicColumn`.
///
/// `From` is implemented rather than `Into`: the standard blanket impl
/// provides the matching `Into`, so existing `.into()` call sites still work.
macro_rules! static_dynamic_conversions {
    ($typ:ty, $enum_name:ident) => {
        impl From<DynamicColumn> for Option<Column<$typ>> {
            fn from(dynamic_column: DynamicColumn) -> Option<Column<$typ>> {
                if let DynamicColumn::$enum_name(col) = dynamic_column {
                    Some(col)
                } else {
                    None
                }
            }
        }

        impl From<Column<$typ>> for DynamicColumn {
            fn from(typed_column: Column<$typ>) -> Self {
                DynamicColumn::$enum_name(typed_column)
            }
        }
    };
}
impl From<Column<u64>> for DynamicColumn {
fn from(column_u64: Column<u64>) -> Self {
DynamicColumn::U64(column_u64)
}
}
impl From<Column<f64>> for DynamicColumn {
fn from(column_f64: Column<f64>) -> Self {
DynamicColumn::F64(column_f64)
}
}
impl From<Column<bool>> for DynamicColumn {
fn from(bool_column: Column<bool>) -> Self {
DynamicColumn::Bool(bool_column)
}
}
static_dynamic_conversions!(bool, Bool);
static_dynamic_conversions!(u64, U64);
static_dynamic_conversions!(i64, I64);
static_dynamic_conversions!(f64, F64);
static_dynamic_conversions!(crate::DateTime, DateTime);
impl From<BytesColumn> for DynamicColumn {
fn from(dictionary_encoded_col: BytesColumn) -> Self {
@@ -56,19 +57,41 @@ pub struct DynamicColumnHandle {
}
impl DynamicColumnHandle {
// TODO rename load
/// Reads the bytes of the column and opens it as a `DynamicColumn`.
pub fn open(&self) -> io::Result<DynamicColumn> {
    let bytes: OwnedBytes = self.file_slice.read_bytes()?;
    let dynamic_column = self.open_internal(bytes)?;
    Ok(dynamic_column)
}
// TODO rename load_async
/// Asynchronous counterpart of `open`: reads the bytes of the column with
/// `read_bytes_async` and opens it as a `DynamicColumn`.
pub async fn open_async(&self) -> io::Result<DynamicColumn> {
    let bytes: OwnedBytes = self.file_slice.read_bytes_async().await?;
    let dynamic_column = self.open_internal(bytes)?;
    Ok(dynamic_column)
}
/// Opens the column as a `Column<u64>`, whatever its actual type, when
/// this is possible.
///
/// - `Str` columns yield their term ordinal column.
/// - Numerical and datetime columns are opened with
///   `open_column_u64::<u64>`, without converting back to the original
///   value type.
/// - `Bool` columns are not supported and yield `Ok(None)`.
///
/// # Errors
///
/// Returns an error if reading or opening the column data fails.
pub fn open_u64_lenient(&self) -> io::Result<Option<Column<u64>>> {
    match self.column_type {
        // Nothing to open: do not read the column bytes at all.
        ColumnType::Bool => Ok(None),
        ColumnType::Str => {
            let column_bytes = self.file_slice.read_bytes()?;
            let column = crate::column::open_column_bytes(column_bytes)?;
            Ok(Some(column.term_ord_column))
        }
        ColumnType::Numerical(_) | ColumnType::DateTime => {
            let column_bytes = self.file_slice.read_bytes()?;
            let column = crate::column::open_column_u64::<u64>(column_bytes)?;
            Ok(Some(column))
        }
    }
}
fn open_internal(&self, column_bytes: OwnedBytes) -> io::Result<DynamicColumn> {
let dynamic_column: DynamicColumn = match self.column_type {
ColumnType::Bytes => crate::column::open_column_bytes(column_bytes)?.into(),
ColumnType::Str => crate::column::open_column_bytes(column_bytes)?.into(),
ColumnType::Numerical(numerical_type) => match numerical_type {
crate::NumericalType::I64 => {
crate::column::open_column_u64::<i64>(column_bytes)?.into()
@@ -81,6 +104,9 @@ impl DynamicColumnHandle {
}
},
ColumnType::Bool => crate::column::open_column_u64::<bool>(column_bytes)?.into(),
ColumnType::DateTime => {
crate::column::open_column_u64::<crate::DateTime>(column_bytes)?.into()
}
};
Ok(dynamic_column)
}

View File

@@ -18,16 +18,18 @@ mod dynamic_column;
pub(crate) mod utils;
mod value;
pub use columnar::{ColumnarReader, ColumnarWriter};
pub use column::Column;
pub use column_values::ColumnValues;
pub use columnar::{ColumnType, ColumnarReader, ColumnarWriter, HasAssociatedColumnType};
pub use value::{NumericalType, NumericalValue};
// pub use self::dynamic_column::DynamicColumnHandle;
pub use self::dynamic_column::{DynamicColumn, DynamicColumnHandle};
pub type RowId = u32;
#[derive(Clone, Copy)]
#[derive(Clone, Copy, PartialOrd, PartialEq, Default, Debug)]
pub struct DateTime {
timestamp_micros: i64,
pub timestamp_micros: i64,
}
#[derive(Copy, Clone, Debug)]

View File

@@ -1,4 +1,4 @@
use crate::InvalidData;
use crate::{Column, ColumnType, InvalidData};
#[derive(Copy, Clone, Debug, PartialEq)]
pub enum NumericalValue {
@@ -106,6 +106,13 @@ impl Coerce for f64 {
}
}
/// Coerces a numerical value into a `DateTime` by first coercing it to an
/// `i64`, which is used as the timestamp in microseconds.
impl Coerce for crate::DateTime {
    fn coerce(value: NumericalValue) -> Self {
        crate::DateTime {
            timestamp_micros: i64::coerce(value),
        }
    }
}
#[cfg(test)]
mod tests {
use super::NumericalType;

View File

@@ -13,7 +13,7 @@ use tantivy::aggregation::agg_result::AggregationResults;
use tantivy::aggregation::metric::AverageAggregation;
use tantivy::aggregation::AggregationCollector;
use tantivy::query::TermQuery;
use tantivy::schema::{self, Cardinality, IndexRecordOption, Schema, TextFieldIndexing};
use tantivy::schema::{self, IndexRecordOption, Schema, TextFieldIndexing};
use tantivy::{doc, Index, Term};
fn main() -> tantivy::Result<()> {
@@ -25,7 +25,7 @@ fn main() -> tantivy::Result<()> {
.set_stored();
let text_field = schema_builder.add_text_field("text", text_fieldtype);
let score_fieldtype =
crate::schema::NumericOptions::default().set_fast(Cardinality::SingleValue);
crate::schema::NumericOptions::default().set_fast();
let highscore_field = schema_builder.add_f64_field("highscore", score_fieldtype.clone());
let price_field = schema_builder.add_f64_field("price", score_fieldtype);

View File

@@ -4,7 +4,7 @@
use tantivy::collector::TopDocs;
use tantivy::query::QueryParser;
use tantivy::schema::{Cardinality, DateOptions, Schema, Value, INDEXED, STORED, STRING};
use tantivy::schema::{DateOptions, Schema, Value, INDEXED, STORED, STRING};
use tantivy::Index;
fn main() -> tantivy::Result<()> {
@@ -12,7 +12,7 @@ fn main() -> tantivy::Result<()> {
let mut schema_builder = Schema::builder();
let opts = DateOptions::from(INDEXED)
.set_stored()
.set_fast(Cardinality::SingleValue)
.set_fast()
.set_precision(tantivy::DatePrecision::Seconds);
let occurred_at = schema_builder.add_date_field("occurred_at", opts);
let event_type = schema_builder.add_text_field("event", STRING | STORED);

View File

@@ -14,6 +14,7 @@ repository = "https://github.com/quickwit-oss/tantivy"
[dependencies]
common = { version = "0.5", path = "../common/", package = "tantivy-common" }
tantivy-bitpacker = { version= "0.3", path = "../bitpacker/" }
columnar = { version= "0.1", path="../columnar", package="tantivy-columnar" }
prettytable-rs = {version="0.10.0", optional= true}
rand = {version="0.8.3", optional= true}
fastdivide = "0.4"

View File

@@ -2,81 +2,11 @@ use std::fmt::{self, Debug};
use std::marker::PhantomData;
use std::ops::{Range, RangeInclusive};
pub use columnar::ColumnValues as Column;
use tantivy_bitpacker::minmax;
use crate::monotonic_mapping::StrictlyMonotonicFn;
/// `Column` provides columnar access on a field.
pub trait Column<T: PartialOrd + Debug = u64>: Send + Sync {
/// Return the value associated with the given idx.
///
/// This accessor should return as fast as possible.
///
/// # Panics
///
/// May panic if `idx` is greater than the column length.
fn get_val(&self, idx: u32) -> T;
/// Fills an output buffer with the fast field values
/// associated with the `DocId` going from
/// `start` to `start + output.len()`.
///
/// # Panics
///
/// Must panic if `start + output.len()` is greater than
/// the segment's `maxdoc`.
#[inline]
fn get_range(&self, start: u64, output: &mut [T]) {
for (out, idx) in output.iter_mut().zip(start..) {
*out = self.get_val(idx as u32);
}
}
/// Get the positions of values which are in the provided value range.
///
/// Note that position == docid for single value fast fields
#[inline]
fn get_docids_for_value_range(
&self,
value_range: RangeInclusive<T>,
doc_id_range: Range<u32>,
positions: &mut Vec<u32>,
) {
let doc_id_range = doc_id_range.start..doc_id_range.end.min(self.num_vals());
for idx in doc_id_range.start..doc_id_range.end {
let val = self.get_val(idx);
if value_range.contains(&val) {
positions.push(idx);
}
}
}
/// Returns the minimum value for this fast field.
///
/// This min_value may not be exact.
/// For instance, the min value does not take in account of possible
/// deleted document. All values are however guaranteed to be higher than
/// `.min_value()`.
fn min_value(&self) -> T;
/// Returns the maximum value for this fast field.
///
/// This max_value may not be exact.
/// For instance, the max value does not take in account of possible
/// deleted document. All values are however guaranteed to be higher than
/// `.max_value()`.
fn max_value(&self) -> T;
/// The number of values in the column.
fn num_vals(&self) -> u32;
/// Returns a iterator over the data
fn iter<'a>(&'a self) -> Box<dyn Iterator<Item = T> + 'a> {
Box::new((0..self.num_vals()).map(|idx| self.get_val(idx)))
}
}
/// VecColumn provides `Column` over a slice.
pub struct VecColumn<'a, T = u64> {
values: &'a [T],
@@ -84,32 +14,6 @@ pub struct VecColumn<'a, T = u64> {
max_value: T,
}
impl<'a, C: Column<T>, T: Copy + PartialOrd + fmt::Debug> Column<T> for &'a C {
fn get_val(&self, idx: u32) -> T {
(*self).get_val(idx)
}
fn min_value(&self) -> T {
(*self).min_value()
}
fn max_value(&self) -> T {
(*self).max_value()
}
fn num_vals(&self) -> u32 {
(*self).num_vals()
}
fn iter<'b>(&'b self) -> Box<dyn Iterator<Item = T> + 'b> {
(*self).iter()
}
fn get_range(&self, start: u64, output: &mut [T]) {
(*self).get_range(start, output)
}
}
impl<'a, T: Copy + PartialOrd + Send + Sync + Debug> Column<T> for VecColumn<'a, T> {
fn get_val(&self, position: u32) -> T {
self.values[position as usize]

View File

@@ -15,7 +15,7 @@ use super::metric::{
use super::segment_agg_result::BucketCount;
use super::VecWithNames;
use crate::fastfield::{type_and_cardinality, MultiValuedFastFieldReader};
use crate::schema::{Cardinality, Type};
use crate::schema::Type;
use crate::{InvertedIndexReader, SegmentReader, TantivyError};
#[derive(Clone, Default)]

View File

@@ -43,13 +43,13 @@ mod tests {
use crate::aggregation::agg_result::AggregationResults;
use crate::aggregation::AggregationCollector;
use crate::query::AllQuery;
use crate::schema::{Cardinality, NumericOptions, Schema};
use crate::schema::{NumericOptions, Schema};
use crate::Index;
#[test]
fn test_metric_aggregations() {
let mut schema_builder = Schema::builder();
let field_options = NumericOptions::default().set_fast(Cardinality::SingleValue);
let field_options = NumericOptions::default().set_fast();
let field = schema_builder.add_f64_field("price", field_options);
let index = Index::create_in_ram(schema_builder.build());
let mut index_writer = index.writer_for_tests().unwrap();

View File

@@ -430,13 +430,13 @@ mod tests {
let text_field_id = schema_builder.add_text_field("text_id", text_fieldtype);
let string_field_id = schema_builder.add_text_field("string_id", STRING | FAST);
let score_fieldtype =
crate::schema::NumericOptions::default().set_fast(Cardinality::SingleValue);
crate::schema::NumericOptions::default().set_fast();
let score_field = schema_builder.add_u64_field("score", score_fieldtype.clone());
let score_field_f64 = schema_builder.add_f64_field("score_f64", score_fieldtype.clone());
let score_field_i64 = schema_builder.add_i64_field("score_i64", score_fieldtype);
let fraction_field = schema_builder.add_f64_field(
"fraction_f64",
crate::schema::NumericOptions::default().set_fast(Cardinality::SingleValue),
crate::schema::NumericOptions::default().set_fast(),
);
let index = Index::create_in_ram(schema_builder.build());
{
@@ -654,12 +654,12 @@ mod tests {
let date_field = schema_builder.add_date_field("date", FAST);
schema_builder.add_text_field("dummy_text", STRING);
let score_fieldtype =
crate::schema::NumericOptions::default().set_fast(Cardinality::SingleValue);
crate::schema::NumericOptions::default().set_fast();
let score_field = schema_builder.add_u64_field("score", score_fieldtype.clone());
let score_field_f64 = schema_builder.add_f64_field("score_f64", score_fieldtype.clone());
let multivalue =
crate::schema::NumericOptions::default().set_fast(Cardinality::MultiValues);
crate::schema::NumericOptions::default().set_fast();
let scores_field_i64 = schema_builder.add_i64_field("scores_i64", multivalue);
let score_field_i64 = schema_builder.add_i64_field("score_i64", score_fieldtype);
@@ -1187,7 +1187,7 @@ mod tests {
let text_field_few_terms =
schema_builder.add_text_field("text_few_terms", STRING | FAST);
let score_fieldtype =
crate::schema::NumericOptions::default().set_fast(Cardinality::SingleValue);
crate::schema::NumericOptions::default().set_fast();
let score_field = schema_builder.add_u64_field("score", score_fieldtype.clone());
let score_field_f64 =
schema_builder.add_f64_field("score_f64", score_fieldtype.clone());

View File

@@ -12,10 +12,10 @@
use std::marker::PhantomData;
use std::sync::Arc;
use columnar::{DynamicColumn, HasAssociatedColumnType};
use fastfield_codecs::Column;
use crate::collector::{Collector, SegmentCollector};
use crate::fastfield::FastValue;
use crate::schema::Field;
use crate::{Score, SegmentReader, TantivyError};
@@ -61,7 +61,7 @@ use crate::{Score, SegmentReader, TantivyError};
/// # Ok(())
/// # }
/// ```
pub struct FilterCollector<TCollector, TPredicate, TPredicateValue: FastValue>
pub struct FilterCollector<TCollector, TPredicate, TPredicateValue: Default>
where TPredicate: 'static + Clone
{
field: Field,
@@ -70,7 +70,7 @@ where TPredicate: 'static + Clone
t_predicate_value: PhantomData<TPredicateValue>,
}
impl<TCollector, TPredicate, TPredicateValue: FastValue>
impl<TCollector, TPredicate, TPredicateValue: Default>
FilterCollector<TCollector, TPredicate, TPredicateValue>
where
TCollector: Collector + Send + Sync,
@@ -91,12 +91,13 @@ where
}
}
impl<TCollector, TPredicate, TPredicateValue: FastValue> Collector
impl<TCollector, TPredicate, TPredicateValue: Default> Collector
for FilterCollector<TCollector, TPredicate, TPredicateValue>
where
TCollector: Collector + Send + Sync,
TPredicate: 'static + Fn(TPredicateValue) -> bool + Send + Sync + Clone,
TPredicateValue: FastValue,
TPredicateValue: HasAssociatedColumnType,
DynamicColumn: Into<Option<columnar::Column<TPredicateValue>>>,
{
// That's the type of our result.
// Our standard deviation will be a float.
@@ -117,20 +118,10 @@ where
field_entry.name()
)));
}
let requested_type = TPredicateValue::to_type();
let field_schema_type = field_entry.field_type().value_type();
if requested_type != field_schema_type {
return Err(TantivyError::SchemaError(format!(
"Field {:?} is of type {:?}!={:?}",
field_entry.name(),
requested_type,
field_schema_type
)));
}
let fast_field_reader = segment_reader
.fast_fields()
.typed_fast_field_reader(schema.get_field_name(self.field))?;
.typed_column_first_or_default(schema.get_field_name(self.field))?;
let segment_collector = self
.collector
@@ -159,7 +150,7 @@ where
pub struct FilterSegmentCollector<TSegmentCollector, TPredicate, TPredicateValue>
where
TPredicate: 'static,
TPredicateValue: FastValue,
DynamicColumn: Into<Option<columnar::Column<TPredicateValue>>>,
{
fast_field_reader: Arc<dyn Column<TPredicateValue>>,
segment_collector: TSegmentCollector,
@@ -171,8 +162,9 @@ impl<TSegmentCollector, TPredicate, TPredicateValue> SegmentCollector
for FilterSegmentCollector<TSegmentCollector, TPredicate, TPredicateValue>
where
TSegmentCollector: SegmentCollector,
TPredicateValue: HasAssociatedColumnType,
TPredicate: 'static + Fn(TPredicateValue) -> bool + Send + Sync,
TPredicateValue: FastValue,
DynamicColumn: Into<Option<columnar::Column<TPredicateValue>>>,
{
type Fruit = TSegmentCollector::Fruit;

View File

@@ -4,7 +4,7 @@ use fastdivide::DividerU64;
use fastfield_codecs::Column;
use crate::collector::{Collector, SegmentCollector};
use crate::fastfield::FastValue;
use crate::fastfield::{FastFieldNotAvailableError, FastValue};
use crate::schema::Type;
use crate::{DocId, Score};
@@ -87,14 +87,14 @@ impl HistogramComputer {
}
pub struct SegmentHistogramCollector {
histogram_computer: HistogramComputer,
ff_reader: Arc<dyn Column<u64>>,
column_u64: Arc<dyn Column<u64>>,
}
impl SegmentCollector for SegmentHistogramCollector {
type Fruit = Vec<u64>;
fn collect(&mut self, doc: DocId, _score: Score) {
let value = self.ff_reader.get_val(doc);
let value = self.column_u64.get_val(doc);
self.histogram_computer.add_value(value);
}
@@ -112,14 +112,18 @@ impl Collector for HistogramCollector {
_segment_local_id: crate::SegmentOrdinal,
segment: &crate::SegmentReader,
) -> crate::Result<Self::Child> {
let ff_reader = segment.fast_fields().u64_lenient(&self.field)?;
let column_opt = segment.fast_fields().u64_lenient(&self.field)?;
let column = column_opt.ok_or_else(|| FastFieldNotAvailableError {
field_name: self.field.clone(),
})?;
let column_u64 = column.first_or_default_col(0u64);
Ok(SegmentHistogramCollector {
histogram_computer: HistogramComputer {
counts: vec![0; self.num_buckets],
min_value: self.min_value,
divider: self.divider,
},
ff_reader,
column_u64,
})
}

View File

@@ -104,9 +104,8 @@ pub use self::custom_score_top_collector::{CustomScorer, CustomSegmentScorer};
mod tweak_score_top_collector;
pub use self::tweak_score_top_collector::{ScoreSegmentTweaker, ScoreTweaker};
mod facet_collector;
pub use self::facet_collector::{FacetCollector, FacetCounts};
// mod facet_collector;
// pub use self::facet_collector::{FacetCollector, FacetCounts};
use crate::query::Weight;
mod docset_collector;

View File

@@ -5,7 +5,6 @@ use fastfield_codecs::Column;
use super::*;
use crate::collector::{Count, FilterCollector, TopDocs};
use crate::core::SegmentReader;
use crate::fastfield::BytesFastFieldReader;
use crate::query::{AllQuery, QueryParser};
use crate::schema::{Field, Schema, FAST, TEXT};
use crate::time::format_description::well_known::Rfc3339;
@@ -58,9 +57,10 @@ pub fn test_filter_collector() -> crate::Result<()> {
assert_eq!(filtered_top_docs.len(), 0);
fn date_filter(value: DateTime) -> bool {
(value.into_utc() - OffsetDateTime::parse("2019-04-09T00:00:00+00:00", &Rfc3339).unwrap())
.whole_weeks()
fn date_filter(value: columnar::DateTime) -> bool {
(crate::DateTime::from(value).into_utc()
- OffsetDateTime::parse("2019-04-09T00:00:00+00:00", &Rfc3339).unwrap())
.whole_weeks()
> 0
}
@@ -164,8 +164,10 @@ pub struct FastFieldSegmentCollector {
}
impl FastFieldTestCollector {
pub fn for_field(field: String) -> FastFieldTestCollector {
FastFieldTestCollector { field }
pub fn for_field(field: impl ToString) -> FastFieldTestCollector {
FastFieldTestCollector {
field: field.to_string(),
}
}
}
@@ -210,64 +212,62 @@ impl SegmentCollector for FastFieldSegmentCollector {
}
}
/// Collects in order all of the fast field bytes for all of the
/// docs in the `DocSet`
///
/// This collector is mainly useful for tests.
pub struct BytesFastFieldTestCollector {
field: Field,
}
// /// Collects in order all of the fast field bytes for all of the
// /// docs in the `DocSet`
// ///
// /// This collector is mainly useful for tests.
// pub struct BytesFastFieldTestCollector {
// field: Field,
// }
pub struct BytesFastFieldSegmentCollector {
vals: Vec<u8>,
reader: BytesFastFieldReader,
}
// pub struct BytesFastFieldSegmentCollector {
// vals: Vec<u8>,
// reader: BytesFastFieldReader,
// }
impl BytesFastFieldTestCollector {
pub fn for_field(field: Field) -> BytesFastFieldTestCollector {
BytesFastFieldTestCollector { field }
}
}
// impl BytesFastFieldTestCollector {
// pub fn for_field(field: Field) -> BytesFastFieldTestCollector {
// BytesFastFieldTestCollector { field }
// }
// }
impl Collector for BytesFastFieldTestCollector {
type Fruit = Vec<u8>;
type Child = BytesFastFieldSegmentCollector;
// impl Collector for BytesFastFieldTestCollector {
// type Fruit = Vec<u8>;
// type Child = BytesFastFieldSegmentCollector;
fn for_segment(
&self,
_segment_local_id: u32,
segment_reader: &SegmentReader,
) -> crate::Result<BytesFastFieldSegmentCollector> {
let reader = segment_reader
.fast_fields()
.bytes(segment_reader.schema().get_field_name(self.field))?;
Ok(BytesFastFieldSegmentCollector {
vals: Vec::new(),
reader,
})
}
// fn for_segment(
// &self,
// _segment_local_id: u32,
// segment_reader: &SegmentReader,
// ) -> crate::Result<BytesFastFieldSegmentCollector> {
// let reader = segment_reader.fast_fields().bytes(self.field)?;
// Ok(BytesFastFieldSegmentCollector {
// vals: Vec::new(),
// reader,
// })
// }
fn requires_scoring(&self) -> bool {
false
}
// fn requires_scoring(&self) -> bool {
// false
// }
fn merge_fruits(&self, children: Vec<Vec<u8>>) -> crate::Result<Vec<u8>> {
Ok(children.into_iter().flat_map(|c| c.into_iter()).collect())
}
}
// fn merge_fruits(&self, children: Vec<Vec<u8>>) -> crate::Result<Vec<u8>> {
// Ok(children.into_iter().flat_map(|c| c.into_iter()).collect())
// }
// }
impl SegmentCollector for BytesFastFieldSegmentCollector {
type Fruit = Vec<u8>;
// impl SegmentCollector for BytesFastFieldSegmentCollector {
// type Fruit = Vec<u8>;
fn collect(&mut self, doc: u32, _score: Score) {
let data = self.reader.get_bytes(doc);
self.vals.extend(data);
}
// fn collect(&mut self, doc: u32, _score: Score) {
// let data = self.reader.get_bytes(doc);
// self.vals.extend(data);
// }
fn harvest(self) -> <Self as SegmentCollector>::Fruit {
self.vals
}
}
// fn harvest(self) -> <Self as SegmentCollector>::Fruit {
// self.vals
// }
// }
fn make_test_searcher() -> crate::Result<Searcher> {
let schema = Schema::builder().build();

View File

@@ -12,7 +12,7 @@ use crate::collector::tweak_score_top_collector::TweakedScoreTopCollector;
use crate::collector::{
CustomScorer, CustomSegmentScorer, ScoreSegmentTweaker, ScoreTweaker, SegmentCollector,
};
use crate::fastfield::FastValue;
use crate::fastfield::{FastFieldNotAvailableError, FastValue};
use crate::query::Weight;
use crate::schema::Field;
use crate::{DocAddress, DocId, Score, SegmentOrdinal, SegmentReader, TantivyError};
@@ -22,7 +22,7 @@ struct FastFieldConvertCollector<
TFastValue: FastValue,
> {
pub collector: TCollector,
pub field: Field,
pub field: String,
pub fast_value: std::marker::PhantomData<TFastValue>,
}
@@ -41,7 +41,8 @@ where
segment: &SegmentReader,
) -> crate::Result<Self::Child> {
let schema = segment.schema();
let field_entry = schema.get_field_entry(self.field);
let field = schema.get_field(&self.field)?;
let field_entry = schema.get_field_entry(field);
if !field_entry.is_fast() {
return Err(TantivyError::SchemaError(format!(
"Field {:?} is not a fast field.",
@@ -132,17 +133,17 @@ impl fmt::Debug for TopDocs {
}
struct ScorerByFastFieldReader {
ff_reader: Arc<dyn Column<u64>>,
sort_column: Arc<dyn Column<u64>>,
}
impl CustomSegmentScorer<u64> for ScorerByFastFieldReader {
fn score(&mut self, doc: DocId) -> u64 {
self.ff_reader.get_val(doc)
self.sort_column.get_val(doc)
}
}
struct ScorerByField {
field: Field,
field: String,
}
impl CustomScorer<u64> for ScorerByField {
@@ -154,10 +155,13 @@ impl CustomScorer<u64> for ScorerByField {
// mapping is monotonic, so it is sufficient to compute our top-K docs.
//
// The conversion will then happen only on the top-K docs.
let ff_reader = segment_reader
.fast_fields()
.typed_fast_field_reader(segment_reader.schema().get_field_name(self.field))?;
Ok(ScorerByFastFieldReader { ff_reader })
let sort_column_opt = segment_reader.fast_fields().u64_lenient(&self.field)?;
let sort_column = sort_column_opt
.ok_or_else(|| FastFieldNotAvailableError {
field_name: self.field.clone(),
})?
.first_or_default_col(0u64);
Ok(ScorerByFastFieldReader { sort_column })
}
}
@@ -290,9 +294,14 @@ impl TopDocs {
/// the [.order_by_fast_field(...)](TopDocs::order_by_fast_field) method.
pub fn order_by_u64_field(
self,
field: Field,
field: impl ToString,
) -> impl Collector<Fruit = Vec<(u64, DocAddress)>> {
CustomScoreTopCollector::new(ScorerByField { field }, self.0.into_tscore())
CustomScoreTopCollector::new(
ScorerByField {
field: field.to_string(),
},
self.0.into_tscore(),
)
}
/// Set top-K to rank documents by a given fast field.
@@ -367,15 +376,15 @@ impl TopDocs {
/// ```
pub fn order_by_fast_field<TFastValue>(
self,
fast_field: Field,
fast_field: impl ToString,
) -> impl Collector<Fruit = Vec<(TFastValue, DocAddress)>>
where
TFastValue: FastValue,
{
let u64_collector = self.order_by_u64_field(fast_field);
let u64_collector = self.order_by_u64_field(fast_field.to_string());
FastFieldConvertCollector {
collector: u64_collector,
field: fast_field,
field: fast_field.to_string(),
fast_value: PhantomData,
}
}
@@ -877,7 +886,7 @@ mod tests {
});
let searcher = index.reader()?.searcher();
let top_collector = TopDocs::with_limit(4).order_by_u64_field(size);
let top_collector = TopDocs::with_limit(4).order_by_u64_field(SIZE);
let top_docs: Vec<(u64, DocAddress)> = searcher.search(&query, &top_collector)?;
assert_eq!(
&top_docs[..],
@@ -916,7 +925,7 @@ mod tests {
))?;
index_writer.commit()?;
let searcher = index.reader()?.searcher();
let top_collector = TopDocs::with_limit(3).order_by_fast_field(birthday);
let top_collector = TopDocs::with_limit(3).order_by_fast_field("birthday");
let top_docs: Vec<(DateTime, DocAddress)> = searcher.search(&AllQuery, &top_collector)?;
assert_eq!(
&top_docs[..],
@@ -946,7 +955,7 @@ mod tests {
))?;
index_writer.commit()?;
let searcher = index.reader()?.searcher();
let top_collector = TopDocs::with_limit(3).order_by_fast_field(altitude);
let top_collector = TopDocs::with_limit(3).order_by_fast_field("altitude");
let top_docs: Vec<(i64, DocAddress)> = searcher.search(&AllQuery, &top_collector)?;
assert_eq!(
&top_docs[..],
@@ -976,7 +985,7 @@ mod tests {
))?;
index_writer.commit()?;
let searcher = index.reader()?.searcher();
let top_collector = TopDocs::with_limit(3).order_by_fast_field(altitude);
let top_collector = TopDocs::with_limit(3).order_by_fast_field("altitude");
let top_docs: Vec<(f64, DocAddress)> = searcher.search(&AllQuery, &top_collector)?;
assert_eq!(
&top_docs[..],
@@ -1004,7 +1013,7 @@ mod tests {
.unwrap();
});
let searcher = index.reader().unwrap().searcher();
let top_collector = TopDocs::with_limit(4).order_by_u64_field(Field::from_field_id(2));
let top_collector = TopDocs::with_limit(4).order_by_u64_field("missing_field");
let segment_reader = searcher.segment_reader(0u32);
top_collector
.for_segment(0, segment_reader)
@@ -1022,7 +1031,7 @@ mod tests {
index_writer.commit()?;
let searcher = index.reader()?.searcher();
let segment = searcher.segment_reader(0);
let top_collector = TopDocs::with_limit(4).order_by_u64_field(size);
let top_collector = TopDocs::with_limit(4).order_by_u64_field(SIZE);
let err = top_collector.for_segment(0, segment).err().unwrap();
assert!(matches!(err, crate::TantivyError::SchemaError(_)));
Ok(())
@@ -1039,7 +1048,7 @@ mod tests {
index_writer.commit()?;
let searcher = index.reader()?.searcher();
let segment = searcher.segment_reader(0);
let top_collector = TopDocs::with_limit(4).order_by_fast_field::<i64>(size);
let top_collector = TopDocs::with_limit(4).order_by_fast_field::<i64>(SIZE);
let err = top_collector.for_segment(0, segment).err().unwrap();
assert!(
matches!(err, crate::TantivyError::SchemaError(msg) if msg == "Field \"size\" is not a fast field.")

View File

@@ -19,7 +19,7 @@ use crate::error::{DataCorruption, TantivyError};
use crate::indexer::index_writer::{MAX_NUM_THREAD, MEMORY_ARENA_NUM_BYTES_MIN};
use crate::indexer::segment_updater::save_metas;
use crate::reader::{IndexReader, IndexReaderBuilder};
use crate::schema::{Cardinality, Field, FieldType, Schema};
use crate::schema::{Field, FieldType, Schema};
use crate::tokenizer::{TextAnalyzer, TokenizerManager};
use crate::IndexWriter;
@@ -93,7 +93,7 @@ fn save_new_metas(
/// let body_field = schema_builder.add_text_field("body", TEXT);
/// let number_field = schema_builder.add_u64_field(
/// "number",
/// NumericOptions::default().set_fast(Cardinality::SingleValue),
/// NumericOptions::default().set_fast(),
/// );
///
/// let schema = schema_builder.build();
@@ -245,12 +245,6 @@ impl IndexBuilder {
sort_by_field.field
)));
}
if entry.field_type().fastfield_cardinality() != Some(Cardinality::SingleValue) {
return Err(TantivyError::InvalidArgument(format!(
"Only single value fast field Cardinality supported for sorting index {}",
sort_by_field.field
)));
}
}
Ok(())
} else {

View File

@@ -7,7 +7,7 @@ use fail::fail_point;
use crate::core::{InvertedIndexReader, Segment, SegmentComponent, SegmentId};
use crate::directory::{CompositeFile, FileSlice};
use crate::error::DataCorruption;
use crate::fastfield::{intersect_alive_bitsets, AliveBitSet, FacetReader, FastFieldReaders};
use crate::fastfield::{intersect_alive_bitsets, AliveBitSet, FastFieldReaders};
use crate::fieldnorm::{FieldNormReader, FieldNormReaders};
use crate::schema::{Field, FieldType, IndexRecordOption, Schema};
use crate::space_usage::SegmentSpaceUsage;
@@ -90,25 +90,8 @@ impl SegmentReader {
}
/// Accessor to the `FacetReader` associated with a given `Field`.
pub fn facet_reader(&self, field: Field) -> crate::Result<FacetReader> {
let field_entry = self.schema.get_field_entry(field);
match field_entry.field_type() {
FieldType::Facet(_) => {
let term_ords_reader =
self.fast_fields().u64s(self.schema.get_field_name(field))?;
let termdict = self
.termdict_composite
.open_read(field)
.map(TermDictionary::open)
.unwrap_or_else(|| Ok(TermDictionary::empty()))?;
Ok(FacetReader::new(term_ords_reader, termdict))
}
_ => Err(crate::TantivyError::InvalidArgument(format!(
"Field {:?} is not a facet field.",
field_entry.name()
))),
}
pub fn facet_reader(&self, field: Field) -> crate::Result<()> {
todo!();
}
/// Accessor to the segment's `Field norms`'s reader.
@@ -170,9 +153,7 @@ impl SegmentReader {
let schema = segment.schema();
let fast_fields_data = segment.open_read(SegmentComponent::FastFields)?;
let fast_fields_composite = CompositeFile::open(&fast_fields_data)?;
let fast_fields_readers =
Arc::new(FastFieldReaders::new(schema.clone(), fast_fields_composite));
let fast_fields_readers = Arc::new(FastFieldReaders::open(fast_fields_data)?);
let fieldnorm_data = segment.open_read(SegmentComponent::FieldNorms)?;
let fieldnorm_readers = FieldNormReaders::open(fieldnorm_data)?;

View File

@@ -8,7 +8,7 @@ use crate::schema::FieldEntry;
#[derive(Debug, Error)]
#[error("Fast field not available: '{field_name:?}'")]
pub struct FastFieldNotAvailableError {
field_name: String,
pub(crate) field_name: String,
}
impl FastFieldNotAvailableError {

File diff suppressed because it is too large Load Diff

View File

@@ -38,7 +38,7 @@ mod tests {
let mut schema_builder = Schema::builder();
let field = schema_builder.add_u64_field(
"multifield",
NumericOptions::default().set_fast(Cardinality::MultiValues),
NumericOptions::default().set_fast(),
);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
@@ -74,7 +74,7 @@ mod tests {
let date_field = schema_builder.add_date_field(
"multi_date_field",
DateOptions::default()
.set_fast(Cardinality::MultiValues)
.set_fast()
.set_indexed()
.set_fieldnorm()
.set_stored(),
@@ -215,7 +215,7 @@ mod tests {
let mut schema_builder = Schema::builder();
let field = schema_builder.add_i64_field(
"multifield",
NumericOptions::default().set_fast(Cardinality::MultiValues),
NumericOptions::default().set_fast(),
);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
@@ -246,7 +246,7 @@ mod tests {
let mut schema_builder = Schema::builder();
let bool_field = schema_builder.add_bool_field(
"multifield",
NumericOptions::default().set_fast(Cardinality::MultiValues),
NumericOptions::default().set_fast(),
);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
@@ -278,7 +278,7 @@ mod tests {
let field = schema_builder.add_u64_field(
"multifield",
NumericOptions::default()
.set_fast(Cardinality::MultiValues)
.set_fast()
.set_indexed(),
);
let schema = schema_builder.build();
@@ -424,7 +424,7 @@ mod bench {
let mut builder = crate::schema::SchemaBuilder::new();
let fast_multi =
crate::schema::NumericOptions::default().set_fast(Cardinality::MultiValues);
crate::schema::NumericOptions::default().set_fast();
let multi_field = builder.add_f64_field("f64s", fast_multi);
let index = crate::Index::create_in_ram(builder.build());
@@ -504,7 +504,7 @@ mod bench {
let path = Path::new("test");
let directory: RamDirectory = RamDirectory::create();
let field = {
let options = NumericOptions::default().set_fast(Cardinality::MultiValues);
let options = NumericOptions::default().set_fast();
let mut schema_builder = Schema::builder();
let field = schema_builder.add_u64_field("field", options);
let schema = schema_builder.build();
@@ -562,7 +562,7 @@ mod bench {
b.iter(|| {
let directory: RamDirectory = RamDirectory::create();
let options = NumericOptions::default().set_fast(Cardinality::MultiValues);
let options = NumericOptions::default().set_fast();
let mut schema_builder = Schema::builder();
let field = schema_builder.add_u64_field("field", options);
let schema = schema_builder.build();
@@ -595,7 +595,7 @@ mod bench {
b.iter(|| {
let directory: RamDirectory = RamDirectory::create();
let options = NumericOptions::default().set_fast(Cardinality::MultiValues);
let options = NumericOptions::default().set_fast();
let mut schema_builder = Schema::builder();
let field = schema_builder.add_u64_field("field", options);
let schema = schema_builder.build();

View File

@@ -137,7 +137,7 @@ mod tests {
let date_field = schema_builder.add_date_field(
"multi_date_field",
DateOptions::default()
.set_fast(Cardinality::MultiValues)
.set_fast()
.set_indexed()
.set_fieldnorm()
.set_precision(DatePrecision::Microseconds)
@@ -188,7 +188,7 @@ mod tests {
let date_field = schema_builder.add_date_field(
"multi_date_field",
DateOptions::default()
.set_fast(Cardinality::MultiValues)
.set_fast()
// TODO: Test different precision after fixing https://github.com/quickwit-oss/tantivy/issues/1783
.set_precision(DatePrecision::Microseconds)
.set_indexed()
@@ -307,7 +307,7 @@ mod tests {
let mut schema_builder = Schema::builder();
let field_options = NumericOptions::default()
.set_indexed()
.set_fast(Cardinality::MultiValues);
.set_fast();
let item_field = schema_builder.add_i64_field("items", field_options);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);

View File

@@ -1,12 +1,16 @@
use std::io;
use std::net::Ipv6Addr;
use std::sync::Arc;
use columnar::{
ColumnType, ColumnValues, ColumnarReader, DynamicColumn, DynamicColumnHandle,
HasAssociatedColumnType, NumericalType,
};
use fastfield_codecs::{open, open_u128, Column};
use super::multivalued::MultiValuedFastFieldReader;
use crate::directory::{CompositeFile, FileSlice};
use crate::fastfield::{BytesFastFieldReader, FastFieldNotAvailableError, FastValue};
use crate::schema::{Cardinality, Field, FieldType, Schema};
use crate::fastfield::{FastFieldNotAvailableError, FastValue};
use crate::schema::{Field, FieldType, Schema};
use crate::space_usage::PerFieldSpaceUsage;
use crate::{DateTime, TantivyError};
@@ -16,315 +20,152 @@ use crate::{DateTime, TantivyError};
/// and just wraps several `HashMap`.
#[derive(Clone)]
pub struct FastFieldReaders {
schema: Schema,
fast_fields_composite: CompositeFile,
}
#[derive(Eq, PartialEq, Debug)]
pub(crate) enum FastType {
I64,
U64,
U128,
F64,
Bool,
Date,
}
pub(crate) fn type_and_cardinality(field_type: &FieldType) -> Option<(FastType, Cardinality)> {
match field_type {
FieldType::U64(options) => options
.get_fastfield_cardinality()
.map(|cardinality| (FastType::U64, cardinality)),
FieldType::I64(options) => options
.get_fastfield_cardinality()
.map(|cardinality| (FastType::I64, cardinality)),
FieldType::F64(options) => options
.get_fastfield_cardinality()
.map(|cardinality| (FastType::F64, cardinality)),
FieldType::Bool(options) => options
.get_fastfield_cardinality()
.map(|cardinality| (FastType::Bool, cardinality)),
FieldType::Date(options) => options
.get_fastfield_cardinality()
.map(|cardinality| (FastType::Date, cardinality)),
FieldType::Facet(_) => Some((FastType::U64, Cardinality::MultiValues)),
FieldType::Str(options) if options.is_fast() => {
Some((FastType::U64, Cardinality::MultiValues))
}
FieldType::IpAddr(options) => options
.get_fastfield_cardinality()
.map(|cardinality| (FastType::U128, cardinality)),
_ => None,
}
columnar: Arc<ColumnarReader>,
}
impl FastFieldReaders {
pub(crate) fn new(schema: Schema, fast_fields_composite: CompositeFile) -> FastFieldReaders {
FastFieldReaders {
schema,
fast_fields_composite,
}
pub(crate) fn open(fast_field_file: FileSlice) -> io::Result<FastFieldReaders> {
let columnar = Arc::new(ColumnarReader::open(fast_field_file)?);
Ok(FastFieldReaders { columnar })
}
pub(crate) fn space_usage(&self) -> PerFieldSpaceUsage {
self.fast_fields_composite.space_usage()
todo!()
}
#[doc(hidden)]
pub fn fast_field_data(&self, field: Field, idx: usize) -> crate::Result<FileSlice> {
self.fast_fields_composite
.open_read_with_idx(field, idx)
.ok_or_else(|| {
let field_name = self.schema.get_field_entry(field).name();
TantivyError::SchemaError(format!("Field({}) data was not found", field_name))
})
}
fn check_type(
pub fn typed_column_opt<T>(
&self,
field: Field,
expected_fast_type: FastType,
expected_cardinality: Cardinality,
) -> crate::Result<()> {
let field_entry = self.schema.get_field_entry(field);
let (fast_type, cardinality) =
type_and_cardinality(field_entry.field_type()).ok_or_else(|| {
crate::TantivyError::SchemaError(format!(
"Field {:?} is not a fast field.",
field_entry.name()
))
})?;
if fast_type != expected_fast_type {
return Err(crate::TantivyError::SchemaError(format!(
"Field {:?} is of type {:?}, expected {:?}.",
field_entry.name(),
fast_type,
expected_fast_type
)));
field_name: &str,
) -> crate::Result<Option<columnar::Column<T>>>
where
T: PartialOrd + Copy + HasAssociatedColumnType + Send + Sync + Default + 'static,
DynamicColumn: Into<Option<columnar::Column<T>>>,
{
let column_type = T::column_type();
let Some(dynamic_column_handle) = self.column_handle(field_name, column_type)?
else {
return Ok(None);
};
let dynamic_column = dynamic_column_handle.open()?;
Ok(dynamic_column.into())
}
/// Returns the total number of bytes of the columns stored under the name `field`.
///
/// The sizes reported by every column handle returned for `field` are added up.
///
/// # Errors
///
/// Propagates the error returned when the columns of `field` cannot be read.
pub fn column_num_bytes(&self, field: &str) -> crate::Result<usize> {
    let column_handles = self.columnar.read_columns(field)?;
    let total_num_bytes: usize = column_handles
        .into_iter()
        .map(|handle| handle.num_bytes())
        .sum();
    Ok(total_num_bytes)
}
pub fn typed_column_first_or_default<T>(&self, field: &str) -> crate::Result<Arc<dyn Column<T>>>
where
T: PartialOrd + Copy + HasAssociatedColumnType + Send + Sync + Default + 'static,
DynamicColumn: Into<Option<columnar::Column<T>>>,
{
let col_opt = self.typed_column_opt(field)?;
if let Some(col) = col_opt {
Ok(col.first_or_default_col(T::default()))
} else {
todo!();
}
if cardinality != expected_cardinality {
return Err(crate::TantivyError::SchemaError(format!(
"Field {:?} is of cardinality {:?}, expected {:?}.",
field_entry.name(),
cardinality,
expected_cardinality
)));
}
Ok(())
}
pub(crate) fn typed_fast_field_reader_with_idx<TFastValue: FastValue>(
&self,
field_name: &str,
index: usize,
) -> crate::Result<Arc<dyn Column<TFastValue>>> {
let field = self.schema.get_field(field_name)?;
let fast_field_slice = self.fast_field_data(field, index)?;
let bytes = fast_field_slice.read_bytes()?;
let column = fastfield_codecs::open(bytes)?;
Ok(column)
}
pub(crate) fn typed_fast_field_reader<TFastValue: FastValue>(
&self,
field_name: &str,
) -> crate::Result<Arc<dyn Column<TFastValue>>> {
self.typed_fast_field_reader_with_idx(field_name, 0)
}
pub(crate) fn typed_fast_field_multi_reader<TFastValue: FastValue>(
&self,
field_name: &str,
) -> crate::Result<MultiValuedFastFieldReader<TFastValue>> {
let idx_reader = self.typed_fast_field_reader(field_name)?;
let vals_reader = self.typed_fast_field_reader_with_idx(field_name, 1)?;
Ok(MultiValuedFastFieldReader::open(idx_reader, vals_reader))
}
/// Returns the `u64` fast field reader associated with `field`.
///
/// If `field` is not a u64 fast field, this method returns an Error.
pub fn u64(&self, field_name: &str) -> crate::Result<Arc<dyn Column<u64>>> {
self.check_type(
self.schema.get_field(field_name)?,
FastType::U64,
Cardinality::SingleValue,
)?;
self.typed_fast_field_reader(field_name)
pub fn u64(&self, field: &str) -> crate::Result<Arc<dyn ColumnValues<u64>>> {
self.typed_column_first_or_default(field)
}
/// Returns the `date` fast field reader associated with `field`.
///
/// If `field` is not a date fast field, this method returns an Error.
pub fn date(&self, field: &str) -> crate::Result<Arc<dyn ColumnValues<columnar::DateTime>>> {
self.typed_column_first_or_default(field)
}
/// Returns the `ip` fast field reader associated with `field`.
///
/// If `field` is not a u128 fast field, this method returns an Error.
pub fn ip_addr(&self, field_name: &str) -> crate::Result<Arc<dyn Column<Ipv6Addr>>> {
let field = self.schema.get_field(field_name)?;
self.check_type(field, FastType::U128, Cardinality::SingleValue)?;
let bytes = self.fast_field_data(field, 0)?.read_bytes()?;
Ok(open_u128::<Ipv6Addr>(bytes)?)
}
/// Returns the `ip` fast field reader associated with `field`.
///
/// If `field` is not a u128 fast field, this method returns an Error.
pub fn ip_addrs(
&self,
field_name: &str,
) -> crate::Result<MultiValuedFastFieldReader<Ipv6Addr>> {
let field = self.schema.get_field(field_name)?;
self.check_type(field, FastType::U128, Cardinality::MultiValues)?;
let idx_reader: Arc<dyn Column<u64>> = self.typed_fast_field_reader(field_name)?;
let bytes = self.fast_field_data(field, 1)?.read_bytes()?;
let vals_reader = open_u128::<Ipv6Addr>(bytes)?;
Ok(MultiValuedFastFieldReader::open(idx_reader, vals_reader))
pub fn ip_addr(&self, field: &str) -> crate::Result<Arc<dyn Column<Ipv6Addr>>> {
todo!();
// self.check_type(field, FastType::U128)?;
// let bytes = self.fast_field_data(field, 0)?.read_bytes()?;
// Ok(open_u128::<Ipv6Addr>(bytes)?)
}
/// Returns the `u128` fast field reader associated with `field`.
///
/// If `field` is not a u128 fast field, this method returns an Error.
pub(crate) fn u128(&self, field_name: &str) -> crate::Result<Arc<dyn Column<u128>>> {
let field = self.schema.get_field(field_name)?;
self.check_type(field, FastType::U128, Cardinality::SingleValue)?;
let bytes = self.fast_field_data(field, 0)?.read_bytes()?;
Ok(open_u128::<u128>(bytes)?)
pub(crate) fn u128(&self, field: &str) -> crate::Result<Arc<dyn Column<u128>>> {
todo!();
}
/// Returns the `u128` multi-valued fast field reader associated with `field`.
///
/// If `field` is not a u128 multi-valued fast field, this method returns an Error.
pub fn u128s(&self, field_name: &str) -> crate::Result<MultiValuedFastFieldReader<u128>> {
let field = self.schema.get_field(field_name)?;
self.check_type(field, FastType::U128, Cardinality::MultiValues)?;
let idx_reader: Arc<dyn Column<u64>> =
self.typed_fast_field_reader(self.schema.get_field_name(field))?;
let bytes = self.fast_field_data(field, 1)?.read_bytes()?;
let vals_reader = open_u128::<u128>(bytes)?;
Ok(MultiValuedFastFieldReader::open(idx_reader, vals_reader))
/// Returns a handle to the column stored under `field_name` whose type is `column_type`.
///
/// Every column read for `field_name` is inspected in order, and the first one
/// whose `column_type()` equals `column_type` is returned. `Ok(None)` is returned
/// when no column matches.
///
/// # Errors
///
/// Propagates any error returned by `read_columns`.
pub fn column_handle(
    &self,
    field_name: &str,
    column_type: ColumnType,
) -> crate::Result<Option<DynamicColumnHandle>> {
    let dynamic_column_handle_opt = self
        .columnar
        .read_columns(field_name)?
        .into_iter()
        // `find` stops at the first match; it is the idiomatic form of `filter(..).next()`.
        .find(|column| column.column_type() == column_type);
    Ok(dynamic_column_handle_opt)
}
/// Returns the `u64` fast field reader associated with `field`, regardless of whether
/// the given field is effectively of type `u64` or not.
///
/// If not, the fast field reader will return the u64-value associated with the original
/// FastValue.
pub fn u64_lenient(&self, field_name: &str) -> crate::Result<Arc<dyn Column<u64>>> {
self.typed_fast_field_reader(field_name)
pub fn u64_lenient(&self, field_name: &str) -> crate::Result<Option<columnar::Column<u64>>> {
    // Inspect every column read for this name, in order. The first one for which
    // `open_u64_lenient` yields a column is returned; errors are propagated as-is.
    let handles = self.columnar.read_columns(field_name)?;
    for handle in handles {
        match handle.open_u64_lenient()? {
            Some(u64_column) => return Ok(Some(u64_column)),
            None => continue,
        }
    }
    // No column could be opened as u64.
    Ok(None)
}
/// Returns the `i64` fast field reader associated with `field_name`.
///
/// # Errors
///
/// Propagates the errors of `self.schema.get_field`, of `check_type` (called with
/// `FastType::I64` and `Cardinality::SingleValue`) and of `typed_fast_field_reader`.
pub fn i64(&self, field_name: &str) -> crate::Result<Arc<dyn Column<i64>>> {
    let field = self.schema.get_field(field_name)?;
    self.check_type(field, FastType::I64, Cardinality::SingleValue)?;
    // The reader is opened with the name the schema reports for the resolved field.
    let schema_field_name = self.schema.get_field_name(field);
    self.typed_fast_field_reader(schema_field_name)
}
/// Returns the `date` fast field reader associated with `field`.
///
/// If `field` is not a date fast field, this method returns an Error.
pub fn date(&self, field_name: &str) -> crate::Result<Arc<dyn Column<DateTime>>> {
let field = self.schema.get_field(field_name)?;
self.check_type(field, FastType::Date, Cardinality::SingleValue)?;
self.typed_fast_field_reader(field_name)
self.typed_column_first_or_default(field_name)
}
/// Returns the `f64` fast field reader associated with `field`.
///
/// If `field` is not a f64 fast field, this method returns an Error.
pub fn f64(&self, field_name: &str) -> crate::Result<Arc<dyn Column<f64>>> {
let field = self.schema.get_field(field_name)?;
self.check_type(field, FastType::F64, Cardinality::SingleValue)?;
self.typed_fast_field_reader(field_name)
self.typed_column_first_or_default(field_name)
}
/// Returns the `bool` fast field reader associated with `field`.
///
/// If `field` is not a bool fast field, this method returns an Error.
pub fn bool(&self, field_name: &str) -> crate::Result<Arc<dyn Column<bool>>> {
let field = self.schema.get_field(field_name)?;
self.check_type(field, FastType::Bool, Cardinality::SingleValue)?;
self.typed_fast_field_reader(field_name)
self.typed_column_first_or_default(field_name)
}
/// Returns the multi-valued `u64` fast field reader associated with `field_name`.
///
/// # Errors
///
/// Propagates the errors of `self.schema.get_field`, of `check_type` (called with
/// `FastType::U64` and `Cardinality::MultiValues`) and of `typed_fast_field_multi_reader`.
pub fn u64s(&self, field_name: &str) -> crate::Result<MultiValuedFastFieldReader<u64>> {
    let field = self.schema.get_field(field_name)?;
    self.check_type(field, FastType::U64, Cardinality::MultiValues)?;
    let reader: MultiValuedFastFieldReader<u64> =
        self.typed_fast_field_multi_reader(field_name)?;
    Ok(reader)
}
/// Returns a multi-valued `u64` fast field reader associated with `field_name`, regardless
/// of whether the field is effectively of type `u64` or not.
///
/// No `check_type` call is made here; any error comes from `typed_fast_field_multi_reader`.
pub fn u64s_lenient(&self, field_name: &str) -> crate::Result<MultiValuedFastFieldReader<u64>> {
    let reader: MultiValuedFastFieldReader<u64> =
        self.typed_fast_field_multi_reader(field_name)?;
    Ok(reader)
}
/// Returns the multi-valued `i64` fast field reader associated with `field_name`.
///
/// # Errors
///
/// Propagates the errors of `self.schema.get_field`, of `check_type` (called with
/// `FastType::I64` and `Cardinality::MultiValues`) and of `typed_fast_field_multi_reader`.
pub fn i64s(&self, field_name: &str) -> crate::Result<MultiValuedFastFieldReader<i64>> {
    let field = self.schema.get_field(field_name)?;
    self.check_type(field, FastType::I64, Cardinality::MultiValues)?;
    // The reader is opened with the name the schema reports for the resolved field.
    let schema_field_name = self.schema.get_field_name(field);
    let reader: MultiValuedFastFieldReader<i64> =
        self.typed_fast_field_multi_reader(schema_field_name)?;
    Ok(reader)
}
/// Returns the multi-valued `f64` fast field reader associated with `field_name`.
///
/// # Errors
///
/// Propagates the errors of `self.schema.get_field`, of `check_type` (called with
/// `FastType::F64` and `Cardinality::MultiValues`) and of `typed_fast_field_multi_reader`.
pub fn f64s(&self, field_name: &str) -> crate::Result<MultiValuedFastFieldReader<f64>> {
    let field = self.schema.get_field(field_name)?;
    self.check_type(field, FastType::F64, Cardinality::MultiValues)?;
    // The reader is opened with the name the schema reports for the resolved field.
    let schema_field_name = self.schema.get_field_name(field);
    let reader: MultiValuedFastFieldReader<f64> =
        self.typed_fast_field_multi_reader(schema_field_name)?;
    Ok(reader)
}
/// Returns the multi-valued `bool` fast field reader associated with `field_name`.
///
/// # Errors
///
/// Propagates the errors of `self.schema.get_field`, of `check_type` (called with
/// `FastType::Bool` and `Cardinality::MultiValues`) and of `typed_fast_field_multi_reader`.
pub fn bools(&self, field_name: &str) -> crate::Result<MultiValuedFastFieldReader<bool>> {
    let field = self.schema.get_field(field_name)?;
    self.check_type(field, FastType::Bool, Cardinality::MultiValues)?;
    // The reader is opened with the name the schema reports for the resolved field.
    let schema_field_name = self.schema.get_field_name(field);
    let reader: MultiValuedFastFieldReader<bool> =
        self.typed_fast_field_multi_reader(schema_field_name)?;
    Ok(reader)
}
/// Returns the multi-valued `DateTime` fast field reader associated with `field_name`.
///
/// # Errors
///
/// Propagates the errors of `self.schema.get_field`, of `check_type` (called with
/// `FastType::Date` and `Cardinality::MultiValues`) and of `typed_fast_field_multi_reader`.
pub fn dates(&self, field_name: &str) -> crate::Result<MultiValuedFastFieldReader<DateTime>> {
    let field = self.schema.get_field(field_name)?;
    self.check_type(field, FastType::Date, Cardinality::MultiValues)?;
    // The reader is opened with the name the schema reports for the resolved field.
    let schema_field_name = self.schema.get_field_name(field);
    let reader: MultiValuedFastFieldReader<DateTime> =
        self.typed_fast_field_multi_reader(schema_field_name)?;
    Ok(reader)
}
/// Returns the `bytes` fast field reader associated with `field_name`.
///
/// # Errors
///
/// - `FastFieldNotAvailableError` (converted into the crate error) when the field type is
///   not `FieldType::Bytes`.
/// - `TantivyError::SchemaError` when the bytes field is not flagged as fast.
/// - Any error raised while resolving the field or opening its two data slices.
pub fn bytes(&self, field_name: &str) -> crate::Result<BytesFastFieldReader> {
    let field = self.schema.get_field(field_name)?;
    let field_entry = self.schema.get_field_entry(field);

    // Guard: only bytes fields are accepted.
    let bytes_option = match field_entry.field_type() {
        FieldType::Bytes(bytes_option) => bytes_option,
        _ => return Err(FastFieldNotAvailableError::new(field_entry).into()),
    };
    // Guard: the bytes field must be a fast field.
    if !bytes_option.is_fast() {
        return Err(crate::TantivyError::SchemaError(format!(
            "Field {:?} is not a fast field.",
            field_entry.name()
        )));
    }

    // Slice 0 is decoded into the index reader, slice 1 is handed over as the data.
    let idx_bytes = self.fast_field_data(field, 0)?.read_bytes()?;
    let idx_reader = open(idx_bytes)?;
    let data = self.fast_field_data(field, 1)?;
    BytesFastFieldReader::open(idx_reader, data)
}
// Returns the `bytes` fast field reader associated with `field`.
//
// If `field` is not a bytes fast field, returns an Error.
// pub fn bytes(&self, field: Field) -> crate::Result<BytesFastFieldReader> {
// let field_entry = self.schema.get_field_entry(field);
// if let FieldType::Bytes(bytes_option) = field_entry.field_type() {
// if !bytes_option.is_fast() {
// return Err(crate::TantivyError::SchemaError(format!(
// "Field {:?} is not a fast field.",
// field_entry.name()
// )));
// }
// let fast_field_idx_file = self.fast_field_data(field, 0)?;
// let fast_field_idx_bytes = fast_field_idx_file.read_bytes()?;
// let idx_reader = open(fast_field_idx_bytes)?;
// let data = self.fast_field_data(field, 1)?;
// BytesFastFieldReader::open(idx_reader, data)
// } else {
// Err(FastFieldNotAvailableError::new(field_entry).into())
// }
// }
}

View File

@@ -1,558 +1,141 @@
use std::collections::HashMap;
use std::io;
use columnar::{ColumnType, ColumnarWriter, NumericalType, NumericalValue};
use common;
use fastfield_codecs::{Column, MonotonicallyMappableToU128, MonotonicallyMappableToU64};
use rustc_hash::FxHashMap;
use tantivy_bitpacker::BlockedBitpacker;
use super::multivalued::{MultiValueU128FastFieldWriter, MultiValuedFastFieldWriter};
use super::FastFieldType;
use crate::fastfield::{BytesFastFieldWriter, CompositeFastFieldSerializer};
use crate::fastfield::CompositeFastFieldSerializer;
use crate::indexer::doc_id_mapping::DocIdMapping;
use crate::postings::UnorderedTermId;
use crate::schema::{Cardinality, Document, Field, FieldEntry, FieldType, Schema, Value};
use crate::schema::{Document, Field, FieldEntry, FieldType, Schema, Type, Value};
use crate::termdict::TermOrdinal;
use crate::DatePrecision;
use crate::{DatePrecision, DocId};
/// The `FastFieldsWriter` groups all of the fast field writers.
pub struct FastFieldsWriter {
term_id_writers: Vec<MultiValuedFastFieldWriter>,
single_value_writers: Vec<IntFastFieldWriter>,
u128_value_writers: Vec<U128FastFieldWriter>,
u128_multi_value_writers: Vec<MultiValueU128FastFieldWriter>,
multi_values_writers: Vec<MultiValuedFastFieldWriter>,
bytes_value_writers: Vec<BytesFastFieldWriter>,
}
/// Builds the `SchemaError` reported when a fast field receives a value of the wrong kind.
///
/// `expected` names the expected kind and `actual` is the offending value; both are
/// rendered with their `Debug` formatting.
pub(crate) fn unexpected_value(expected: &str, actual: &Value) -> crate::TantivyError {
    let message = format!(
        "Expected a {:?} in fast field, but got {:?}",
        expected, actual
    );
    crate::TantivyError::SchemaError(message)
}
fn fast_field_default_value(field_entry: &FieldEntry) -> u64 {
match *field_entry.field_type() {
FieldType::I64(_) | FieldType::Date(_) => common::i64_to_u64(0i64),
FieldType::F64(_) => common::f64_to_u64(0.0f64),
_ => 0u64,
}
columnar_writer: ColumnarWriter,
fast_field_names: Vec<Option<String>>, //< TODO see if we can cache the field name hash too.
date_precisions: Vec<DatePrecision>,
num_docs: DocId,
}
impl FastFieldsWriter {
/// Create all `FastFieldWriter` required by the schema.
pub fn from_schema(schema: &Schema) -> FastFieldsWriter {
let mut u128_value_writers = Vec::new();
let mut u128_multi_value_writers = Vec::new();
let mut single_value_writers = Vec::new();
let mut term_id_writers = Vec::new();
let mut multi_values_writers = Vec::new();
let mut bytes_value_writers = Vec::new();
for (field, field_entry) in schema.fields() {
match field_entry.field_type() {
FieldType::I64(ref int_options)
| FieldType::U64(ref int_options)
| FieldType::F64(ref int_options)
| FieldType::Bool(ref int_options) => {
match int_options.get_fastfield_cardinality() {
Some(Cardinality::SingleValue) => {
let mut fast_field_writer = IntFastFieldWriter::new(field, None);
let default_value = fast_field_default_value(field_entry);
fast_field_writer.set_val_if_missing(default_value);
single_value_writers.push(fast_field_writer);
}
Some(Cardinality::MultiValues) => {
let fast_field_writer = MultiValuedFastFieldWriter::new(
field,
FastFieldType::Numeric,
None,
);
multi_values_writers.push(fast_field_writer);
}
None => {}
}
}
FieldType::Date(ref options) => match options.get_fastfield_cardinality() {
Some(Cardinality::SingleValue) => {
let mut fast_field_writer =
IntFastFieldWriter::new(field, Some(options.get_precision()));
let default_value = fast_field_default_value(field_entry);
fast_field_writer.set_val_if_missing(default_value);
single_value_writers.push(fast_field_writer);
}
Some(Cardinality::MultiValues) => {
let fast_field_writer = MultiValuedFastFieldWriter::new(
field,
FastFieldType::Numeric,
Some(options.get_precision()),
);
multi_values_writers.push(fast_field_writer);
}
None => {}
},
FieldType::Facet(_) => {
let fast_field_writer =
MultiValuedFastFieldWriter::new(field, FastFieldType::Facet, None);
term_id_writers.push(fast_field_writer);
}
FieldType::Str(_) if field_entry.is_fast() => {
let fast_field_writer =
MultiValuedFastFieldWriter::new(field, FastFieldType::String, None);
term_id_writers.push(fast_field_writer);
}
FieldType::Bytes(bytes_option) => {
if bytes_option.is_fast() {
let fast_field_writer = BytesFastFieldWriter::new(field);
bytes_value_writers.push(fast_field_writer);
}
}
FieldType::IpAddr(opt) => {
if opt.is_fast() {
match opt.get_fastfield_cardinality() {
Some(Cardinality::SingleValue) => {
let fast_field_writer = U128FastFieldWriter::new(field);
u128_value_writers.push(fast_field_writer);
}
Some(Cardinality::MultiValues) => {
let fast_field_writer = MultiValueU128FastFieldWriter::new(field);
u128_multi_value_writers.push(fast_field_writer);
}
None => {}
}
}
}
FieldType::Str(_) | FieldType::JsonObject(_) => {}
let mut columnar_writer = ColumnarWriter::default();
let mut fast_fields: Vec<Option<String>> = vec![None; schema.num_fields()];
let mut date_precisions: Vec<DatePrecision> =
std::iter::repeat_with(DatePrecision::default)
.take(schema.num_fields())
.collect();
// TODO see other types
for (field_id, field_entry) in schema.fields() {
if !field_entry.field_type().is_fast() {
continue;
}
fast_fields[field_id.field_id() as usize] = Some(field_entry.name().to_string());
let column_type = match field_entry.field_type().value_type() {
Type::Str => ColumnType::Str,
Type::U64 => ColumnType::Numerical(NumericalType::U64),
Type::I64 => ColumnType::Numerical(NumericalType::I64),
Type::F64 => ColumnType::Numerical(NumericalType::F64),
Type::Bool => ColumnType::Bool,
Type::Date => ColumnType::DateTime,
Type::Facet => ColumnType::Str,
Type::Bytes => todo!(),
Type::Json => {
continue;
}
Type::IpAddr => todo!(),
};
if let FieldType::Date(date_options) = field_entry.field_type() {
date_precisions[field_id.field_id() as usize] = date_options.get_precision();
}
columnar_writer.record_column_type(field_entry.name(), column_type);
}
FastFieldsWriter {
u128_value_writers,
u128_multi_value_writers,
term_id_writers,
single_value_writers,
multi_values_writers,
bytes_value_writers,
columnar_writer,
fast_field_names: fast_fields,
num_docs: 0u32,
date_precisions,
}
}
/// The memory used (inclusive childs)
pub fn mem_usage(&self) -> usize {
self.term_id_writers
.iter()
.map(|w| w.mem_usage())
.sum::<usize>()
+ self
.single_value_writers
.iter()
.map(|w| w.mem_usage())
.sum::<usize>()
+ self
.multi_values_writers
.iter()
.map(|w| w.mem_usage())
.sum::<usize>()
+ self
.bytes_value_writers
.iter()
.map(|w| w.mem_usage())
.sum::<usize>()
+ self
.u128_value_writers
.iter()
.map(|w| w.mem_usage())
.sum::<usize>()
+ self
.u128_multi_value_writers
.iter()
.map(|w| w.mem_usage())
.sum::<usize>()
self.columnar_writer.mem_usage()
}
/// Returns the term-id writer targeting `field`, or `None` if there is none.
///
/// Performs a linear scan over `term_id_writers`.
pub fn get_term_id_writer(&self, field: Field) -> Option<&MultiValuedFastFieldWriter> {
    // TODO optimize
    for writer in self.term_id_writers.iter() {
        if writer.field() == field {
            return Some(writer);
        }
    }
    None
}
/// Returns the single-value writer targeting `field`, or `None` if there is none.
///
/// Performs a linear scan over `single_value_writers`.
pub fn get_field_writer(&self, field: Field) -> Option<&IntFastFieldWriter> {
    // TODO optimize
    for writer in self.single_value_writers.iter() {
        if writer.field() == field {
            return Some(writer);
        }
    }
    None
}
/// Returns a mutable reference to the single-value writer targeting `field`,
/// or `None` if there is none.
///
/// Performs a linear scan over `single_value_writers`.
pub fn get_field_writer_mut(&mut self, field: Field) -> Option<&mut IntFastFieldWriter> {
    // TODO optimize
    for writer in self.single_value_writers.iter_mut() {
        if writer.field() == field {
            return Some(writer);
        }
    }
    None
}
/// Returns a mutable reference to the term-id writer targeting `field`,
/// or `None` if there is none.
///
/// Performs a linear scan over `term_id_writers`.
pub fn get_term_id_writer_mut(
    &mut self,
    field: Field,
) -> Option<&mut MultiValuedFastFieldWriter> {
    // TODO optimize
    for writer in self.term_id_writers.iter_mut() {
        if writer.field() == field {
            return Some(writer);
        }
    }
    None
}
/// Returns the multi-value fast field writer targeting `field`.
///
/// Yields `None` when no writer in `multi_values_writers` targets `field`, i.e. the
/// field does not exist or is not configured as a multivalued fast field in the schema.
pub fn get_multivalue_writer_mut(
    &mut self,
    field: Field,
) -> Option<&mut MultiValuedFastFieldWriter> {
    // TODO optimize
    for writer in self.multi_values_writers.iter_mut() {
        if writer.field() == field {
            return Some(writer);
        }
    }
    None
}
/// Returns the bytes fast field writer targeting `field`.
///
/// Yields `None` when no writer in `bytes_value_writers` targets `field`, i.e. the
/// field does not exist or is not configured as a bytes fast field in the schema.
pub fn get_bytes_writer_mut(&mut self, field: Field) -> Option<&mut BytesFastFieldWriter> {
    // TODO optimize
    for writer in self.bytes_value_writers.iter_mut() {
        if writer.field() == field {
            return Some(writer);
        }
    }
    None
}
/// Indexes all of the fastfields of a new document.
pub fn add_document(&mut self, doc: &Document) -> crate::Result<()> {
for field_writer in &mut self.term_id_writers {
field_writer.add_document(doc)?;
}
for field_writer in &mut self.single_value_writers {
field_writer.add_document(doc)?;
}
for field_writer in &mut self.multi_values_writers {
field_writer.add_document(doc)?;
}
for field_writer in &mut self.bytes_value_writers {
field_writer.add_document(doc)?;
}
for field_writer in &mut self.u128_value_writers {
field_writer.add_document(doc)?;
}
for field_writer in &mut self.u128_multi_value_writers {
field_writer.add_document(doc)?;
let doc_id = self.num_docs;
for field_value in doc.field_values() {
if let Some(field_name) =
self.fast_field_names[field_value.field().field_id() as usize].as_ref()
{
match &field_value.value {
Value::U64(u64_val) => {
self.columnar_writer.record_numerical(
doc_id,
field_name.as_str(),
NumericalValue::from(*u64_val),
);
}
Value::I64(i64_val) => {
self.columnar_writer.record_numerical(
doc_id,
field_name.as_str(),
NumericalValue::from(*i64_val),
);
}
Value::F64(f64_val) => {
self.columnar_writer.record_numerical(
doc_id,
field_name.as_str(),
NumericalValue::from(*f64_val),
);
}
Value::Str(_) => todo!(),
Value::PreTokStr(_) => todo!(),
Value::Bool(bool_val) => {
self.columnar_writer
.record_bool(doc_id, field_name.as_str(), *bool_val);
}
Value::Date(datetime) => {
let date_precision =
self.date_precisions[field_value.field().field_id() as usize];
let truncated_datetime = datetime.truncate(date_precision);
self.columnar_writer.record_datetime(
doc_id,
field_name.as_str(),
truncated_datetime.into(),
);
}
Value::Facet(_) => todo!(),
Value::Bytes(_) => todo!(),
Value::JsonObject(_) => todo!(),
Value::IpAddr(_) => todo!(),
}
}
}
self.num_docs += 1;
Ok(())
}
/// Serializes all of the `FastFieldWriter`s by pushing them in
/// order to the fast field serializer.
pub fn serialize(
self,
serializer: &mut CompositeFastFieldSerializer,
mapping: &HashMap<Field, FxHashMap<UnorderedTermId, TermOrdinal>>,
mut self,
wrt: &mut dyn io::Write,
doc_id_map: Option<&DocIdMapping>,
) -> io::Result<()> {
for field_writer in self.term_id_writers {
let field = field_writer.field();
field_writer.serialize(serializer, mapping.get(&field), doc_id_map)?;
}
for field_writer in &self.single_value_writers {
field_writer.serialize(serializer, doc_id_map)?;
}
for field_writer in self.multi_values_writers {
let field = field_writer.field();
field_writer.serialize(serializer, mapping.get(&field), doc_id_map)?;
}
for field_writer in self.bytes_value_writers {
field_writer.serialize(serializer, doc_id_map)?;
}
for field_writer in self.u128_value_writers {
field_writer.serialize(serializer, doc_id_map)?;
}
for field_writer in self.u128_multi_value_writers {
field_writer.serialize(serializer, doc_id_map)?;
}
assert!(doc_id_map.is_none()); // TODO handle doc id map
let num_docs = self.num_docs;
self.columnar_writer.serialize(num_docs, wrt)?;
Ok(())
}
}
/// Fast field writer for u128 values.
/// The fast field writer just keeps the values in memory.
///
/// Only when the segment writer can be closed and
/// persisted on disk, the fast field writer is
/// sent to a `FastFieldSerializer` via the `.serialize(...)`
/// method.
///
/// We cannot serialize earlier as the values are
/// compressed to a compact number space and the number of
/// bits required for bitpacking can only be known once
/// we have seen all of the values.
pub struct U128FastFieldWriter {
    field: Field,
    vals: Vec<u128>,
    // Number of recorded values; kept in sync with `vals` by `add_val`.
    val_count: u32,
}

impl U128FastFieldWriter {
    /// Creates a new `U128FastFieldWriter` targeting `field`, with no value recorded.
    pub fn new(field: Field) -> Self {
        Self {
            field,
            vals: Vec::new(),
            val_count: 0,
        }
    }

    /// The memory used (inclusive childs)
    pub fn mem_usage(&self) -> usize {
        self.vals.len() * std::mem::size_of::<u128>()
    }

    /// Records a new value.
    ///
    /// The n-th value being recorded is implicitly
    /// associated to the document with the `DocId` n.
    /// (Well, `n-1` actually because of 0-indexing)
    pub fn add_val(&mut self, val: u128) {
        self.vals.push(val);
        // Count here rather than in `add_document`, so that values pushed directly
        // through this public method are also reflected in the serialized count.
        self.val_count += 1;
    }

    /// Extracts the value of this writer's field from `doc` and records it.
    ///
    /// Only the first value of the field is considered. When the document has no
    /// value for the field, `0` is recorded instead.
    ///
    /// # Errors
    ///
    /// Returns the error built by `unexpected_value` when the first value is not an
    /// ip address; nothing is recorded in that case.
    pub fn add_document(&mut self, doc: &Document) -> crate::Result<()> {
        let value = match doc.get_first(self.field) {
            Some(v) => v
                .as_ip_addr()
                .ok_or_else(|| unexpected_value("ip", v))?
                .to_u128(),
            None => 0, // TODO fix null handling
        };
        self.add_val(value);
        Ok(())
    }

    /// Push the fast fields value to the `FastFieldWriter`.
    ///
    /// When `doc_id_map` is provided, values are emitted in the order given by
    /// `iter_old_doc_ids()`; otherwise they are emitted in recording order.
    pub fn serialize(
        &self,
        serializer: &mut CompositeFastFieldSerializer,
        doc_id_map: Option<&DocIdMapping>,
    ) -> io::Result<()> {
        if let Some(doc_id_map) = doc_id_map {
            let iter_gen = || {
                doc_id_map
                    .iter_old_doc_ids()
                    .map(|idx| self.vals[idx as usize])
            };
            serializer.create_u128_fast_field_with_idx(self.field, iter_gen, self.val_count, 0)?;
        } else {
            let iter_gen = || self.vals.iter().copied();
            serializer.create_u128_fast_field_with_idx(self.field, iter_gen, self.val_count, 0)?;
        }
        Ok(())
    }
}
/// Fast field writer for ints.
/// The fast field writer just keeps the values in memory.
///
/// Only when the segment writer can be closed and
/// persisted on disk, the fast field writer is
/// sent to a `FastFieldSerializer` via the `.serialize(...)`
/// method.
///
/// We cannot serialize earlier as the values are
/// bitpacked and the number of bits required for bitpacking
/// can only be known once we have seen all of the values.
///
/// u64, i64 and f64 all use the same writer.
/// i64 and f64 are just remapped to the `0..2^64 - 1`
/// using `common::i64_to_u64` and `common::f64_to_u64`.
pub struct IntFastFieldWriter {
    field: Field,
    precision_opt: Option<DatePrecision>,
    vals: BlockedBitpacker,
    val_count: usize,
    val_if_missing: u64,
    val_min: u64,
    val_max: u64,
}

impl IntFastFieldWriter {
    /// Creates an empty `IntFastFieldWriter` targeting `field`.
    ///
    /// `precision_opt`, when set, is the precision date values are truncated to
    /// in `add_document`.
    pub fn new(field: Field, precision_opt: Option<DatePrecision>) -> IntFastFieldWriter {
        IntFastFieldWriter {
            field,
            precision_opt,
            vals: BlockedBitpacker::new(),
            val_count: 0,
            val_if_missing: 0u64,
            // Inverted bounds (min > max) mark the "no value recorded yet" state.
            val_min: u64::MAX,
            val_max: 0,
        }
    }

    /// The memory used (inclusive childs)
    pub fn mem_usage(&self) -> usize {
        self.vals.mem_usage()
    }

    /// Returns the field that this writer is targeting.
    pub fn field(&self) -> Field {
        self.field
    }

    /// Sets the default value.
    ///
    /// This default value is recorded for documents that
    /// do not have any value for the field.
    fn set_val_if_missing(&mut self, val_if_missing: u64) {
        self.val_if_missing = val_if_missing;
    }

    /// Records a new value and updates the running min / max / count.
    ///
    /// The n-th value being recorded is implicitly
    /// associated with the document with the `DocId` n.
    /// (Well, `n-1` actually because of 0-indexing)
    pub fn add_val(&mut self, val: u64) {
        self.vals.add(val);
        self.val_max = self.val_max.max(val);
        self.val_min = self.val_min.min(val);
        self.val_count += 1;
    }

    /// Extracts the fast field value from the document
    /// (or uses the default value) and records it.
    ///
    /// A date value is truncated to the configured precision (when one is set) before
    /// being mapped to `u64`; every other value goes through `super::value_to_u64`.
    ///
    /// If the value is missing, then the default value is used
    /// instead.
    /// If the document has more than one value for the given field,
    /// only the first one is taken into account.
    pub fn add_document(&mut self, doc: &Document) -> crate::Result<()> {
        let value = if let Some(v) = doc.get_first(self.field) {
            match (self.precision_opt, v) {
                (Some(precision), Value::Date(date_val)) => date_val.truncate(precision).to_u64(),
                _ => super::value_to_u64(v)?,
            }
        } else {
            self.val_if_missing
        };
        self.add_val(value);
        Ok(())
    }

    /// Returns an iterator over the recorded values.
    pub(crate) fn iter(&self) -> impl Iterator<Item = u64> + '_ {
        self.vals.iter()
    }

    /// Push the fast fields value to the `FastFieldWriter`.
    ///
    /// When no value was recorded (min still above max), `(0, 0)` is reported as the
    /// value range.
    pub fn serialize(
        &self,
        serializer: &mut CompositeFastFieldSerializer,
        doc_id_map: Option<&DocIdMapping>,
    ) -> io::Result<()> {
        let (min_value, max_value) = if self.val_min <= self.val_max {
            (self.val_min, self.val_max)
        } else {
            (0, 0)
        };
        let fastfield_accessor = WriterFastFieldAccessProvider {
            doc_id_map,
            vals: &self.vals,
            min_value,
            max_value,
            num_vals: self.val_count as u32,
        };
        serializer.create_auto_detect_u64_fast_field(self.field, fastfield_accessor)?;
        Ok(())
    }
}
/// `Column` view over the values buffered by a writer, optionally reordered through a
/// doc id mapping.
#[derive(Clone)]
struct WriterFastFieldAccessProvider<'map, 'bitp> {
    doc_id_map: Option<&'map DocIdMapping>,
    vals: &'bitp BlockedBitpacker,
    min_value: u64,
    max_value: u64,
    num_vals: u32,
}

impl<'map, 'bitp> Column for WriterFastFieldAccessProvider<'map, 'bitp> {
    /// Random access is not supported by this provider.
    ///
    /// Use the iterator returned by `iter` instead.
    ///
    /// # Panics
    ///
    /// Always panics (`unimplemented!`).
    fn get_val(&self, _doc: u32) -> u64 {
        unimplemented!()
    }

    /// Iterates over the values: in the order of `iter_old_doc_ids()` when a doc id
    /// mapping is present, in recording order otherwise.
    fn iter(&self) -> Box<dyn Iterator<Item = u64> + '_> {
        match self.doc_id_map {
            Some(doc_id_map) => Box::new(
                doc_id_map
                    .iter_old_doc_ids()
                    .map(|old_doc_id| self.vals.get(old_doc_id as usize)),
            ),
            None => Box::new(self.vals.iter()),
        }
    }

    /// Smallest value, as supplied at construction.
    fn min_value(&self) -> u64 {
        self.min_value
    }

    /// Largest value, as supplied at construction.
    fn max_value(&self) -> u64 {
        self.max_value
    }

    /// Number of values, as supplied at construction.
    fn num_vals(&self) -> u32 {
        self.num_vals
    }
}

View File

@@ -113,34 +113,35 @@ pub(crate) fn get_doc_id_mapping_from_field(
sort_by_field: IndexSortByField,
segment_writer: &SegmentWriter,
) -> crate::Result<DocIdMapping> {
let schema = segment_writer.segment_serializer.segment().schema();
let field_id = expect_field_id_for_sort_field(&schema, &sort_by_field)?; // for now expect fastfield, but not strictly required
let fast_field = segment_writer
.fast_field_writers
.get_field_writer(field_id)
.ok_or_else(|| {
TantivyError::InvalidArgument(format!(
"sort index by field is required to be a fast field {:?}",
sort_by_field.field
))
})?;
todo!()
// let schema = segment_writer.segment_serializer.segment().schema();
// let field_id = expect_field_id_for_sort_field(&schema, &sort_by_field)?; // for now expect
// fastfield, but not strictly required let fast_field = segment_writer
// .fast_field_writers
// .get_field_writer(field_id)
// .ok_or_else(|| {
// TantivyError::InvalidArgument(format!(
// "sort index by field is required to be a fast field {:?}",
// sort_by_field.field
// ))
// })?;
// create new doc_id to old doc_id index (used in fast_field_writers)
let mut doc_id_and_data = fast_field
.iter()
.enumerate()
.map(|el| (el.0 as DocId, el.1))
.collect::<Vec<_>>();
if sort_by_field.order == Order::Desc {
doc_id_and_data.sort_by_key(|k| Reverse(k.1));
} else {
doc_id_and_data.sort_by_key(|k| k.1);
}
let new_doc_id_to_old = doc_id_and_data
.into_iter()
.map(|el| el.0)
.collect::<Vec<_>>();
Ok(DocIdMapping::from_new_id_to_old_id(new_doc_id_to_old))
// // create new doc_id to old doc_id index (used in fast_field_writers)
// let mut doc_id_and_data = fast_field
// .iter()
// .enumerate()
// .map(|el| (el.0 as DocId, el.1))
// .collect::<Vec<_>>();
// if sort_by_field.order == Order::Desc {
// doc_id_and_data.sort_by_key(|k| Reverse(k.1));
// } else {
// doc_id_and_data.sort_by_key(|k| k.1);
// }
// let new_doc_id_to_old = doc_id_and_data
// .into_iter()
// .map(|el| el.0)
// .collect::<Vec<_>>();
// Ok(DocIdMapping::from_new_id_to_old_id(new_doc_id_to_old))
}
#[cfg(test)]
@@ -159,15 +160,11 @@ mod tests_indexsorting {
let my_text_field = schema_builder.add_text_field("text_field", text_field_options);
let my_string_field = schema_builder.add_text_field("string_field", STRING | STORED);
let my_number = schema_builder.add_u64_field(
"my_number",
NumericOptions::default().set_fast(Cardinality::SingleValue),
);
let my_number =
schema_builder.add_u64_field("my_number", NumericOptions::default().set_fast());
let multi_numbers = schema_builder.add_u64_field(
"multi_numbers",
NumericOptions::default().set_fast(Cardinality::MultiValues),
);
let multi_numbers =
schema_builder.add_u64_field("multi_numbers", NumericOptions::default().set_fast());
let schema = schema_builder.build();
let mut index_builder = Index::builder().schema(schema);
@@ -441,47 +438,48 @@ mod tests_indexsorting {
Ok(())
}
#[test]
fn test_sort_index_fast_field() -> crate::Result<()> {
let index = create_test_index(
Some(IndexSettings {
sort_by_field: Some(IndexSortByField {
field: "my_number".to_string(),
order: Order::Asc,
}),
..Default::default()
}),
get_text_options(),
)?;
assert_eq!(
index.settings().sort_by_field.as_ref().unwrap().field,
"my_number".to_string()
);
// #[test]
// fn test_sort_index_fast_field() -> crate::Result<()> {
// let index = create_test_index(
// Some(IndexSettings {
// sort_by_field: Some(IndexSortByField {
// field: "my_number".to_string(),
// order: Order::Asc,
// }),
// ..Default::default()
// }),
// get_text_options(),
// )?;
// assert_eq!(
// index.settings().sort_by_field.as_ref().unwrap().field,
// "my_number".to_string()
// );
let searcher = index.reader()?.searcher();
assert_eq!(searcher.segment_readers().len(), 1);
let segment_reader = searcher.segment_reader(0);
let fast_fields = segment_reader.fast_fields();
index.schema().get_field("my_number").unwrap();
// let searcher = index.reader()?.searcher();
// assert_eq!(searcher.segment_readers().len(), 1);
// let segment_reader = searcher.segment_reader(0);
// let fast_fields = segment_reader.fast_fields();
// let my_number = index.schema().get_field("my_number").unwrap();
let fast_field = fast_fields.u64("my_number").unwrap();
assert_eq!(fast_field.get_val(0), 10u64);
assert_eq!(fast_field.get_val(1), 20u64);
assert_eq!(fast_field.get_val(2), 30u64);
// let fast_field = fast_fields.u64(my_number).unwrap();
// assert_eq!(fast_field.get_val(0), 10u64);
// assert_eq!(fast_field.get_val(1), 20u64);
// assert_eq!(fast_field.get_val(2), 30u64);
let multifield = fast_fields.u64s("multi_numbers").unwrap();
let mut vals = vec![];
multifield.get_vals(0u32, &mut vals);
assert_eq!(vals, &[] as &[u64]);
let mut vals = vec![];
multifield.get_vals(1u32, &mut vals);
assert_eq!(vals, &[5, 6]);
// let multi_numbers = index.schema().get_field("multi_numbers").unwrap();
// let multifield = fast_fields.u64s(multi_numbers).unwrap();
// let mut vals = vec![];
// multifield.get_vals(0u32, &mut vals);
// assert_eq!(vals, &[] as &[u64]);
// let mut vals = vec![];
// multifield.get_vals(1u32, &mut vals);
// assert_eq!(vals, &[5, 6]);
let mut vals = vec![];
multifield.get_vals(2u32, &mut vals);
assert_eq!(vals, &[3]);
Ok(())
}
// let mut vals = vec![];
// multifield.get_vals(2u32, &mut vals);
// assert_eq!(vals, &[3]);
// Ok(())
// }
#[test]
fn test_doc_mapping() {

File diff suppressed because it is too large Load Diff

View File

@@ -150,7 +150,6 @@ fn index_json_value(
json_term_writer.term_buffer,
ctx,
indexing_position,
None,
);
}
TextOrDateTime::DateTime(dt) => {

File diff suppressed because it is too large Load Diff

View File

@@ -2,19 +2,17 @@
mod tests {
use crate::collector::TopDocs;
use crate::core::Index;
use crate::fastfield::{AliveBitSet, MultiValuedFastFieldReader};
use crate::fastfield::AliveBitSet;
use crate::query::QueryParser;
use crate::schema::{
self, BytesOptions, Cardinality, Facet, FacetOptions, IndexRecordOption, NumericOptions,
self, BytesOptions, Facet, FacetOptions, IndexRecordOption, NumericOptions,
TextFieldIndexing, TextOptions,
};
use crate::{DocAddress, DocSet, IndexSettings, IndexSortByField, Order, Postings, Term};
fn create_test_index_posting_list_issue(index_settings: Option<IndexSettings>) -> Index {
let mut schema_builder = schema::Schema::builder();
let int_options = NumericOptions::default()
.set_fast(Cardinality::SingleValue)
.set_indexed();
let int_options = NumericOptions::default().set_fast().set_indexed();
let int_field = schema_builder.add_u64_field("intval", int_options);
let facet_field = schema_builder.add_facet_field("facet", FacetOptions::default());
@@ -62,7 +60,7 @@ mod tests {
) -> crate::Result<Index> {
let mut schema_builder = schema::Schema::builder();
let int_options = NumericOptions::default()
.set_fast(Cardinality::SingleValue)
.set_fast()
.set_stored()
.set_indexed();
let int_field = schema_builder.add_u64_field("intval", int_options);
@@ -71,10 +69,8 @@ mod tests {
let bytes_field = schema_builder.add_bytes_field("bytes", bytes_options);
let facet_field = schema_builder.add_facet_field("facet", FacetOptions::default());
let multi_numbers = schema_builder.add_u64_field(
"multi_numbers",
NumericOptions::default().set_fast(Cardinality::MultiValues),
);
let multi_numbers =
schema_builder.add_u64_field("multi_numbers", NumericOptions::default().set_fast());
let text_field_options = TextOptions::default()
.set_indexing_options(
TextFieldIndexing::default()
@@ -349,128 +345,130 @@ mod tests {
}
}
#[test]
fn test_merge_sorted_index_asc() {
let index = create_test_index(
Some(IndexSettings {
sort_by_field: Some(IndexSortByField {
field: "intval".to_string(),
order: Order::Asc,
}),
..Default::default()
}),
false,
)
.unwrap();
// #[test]
// fn test_merge_sorted_index_asc() {
// let index = create_test_index(
// Some(IndexSettings {
// sort_by_field: Some(IndexSortByField {
// field: "intval".to_string(),
// order: Order::Asc,
// }),
// ..Default::default()
// }),
// false,
// )
// .unwrap();
let int_field = index.schema().get_field("intval").unwrap();
let reader = index.reader().unwrap();
let searcher = reader.searcher();
assert_eq!(searcher.segment_readers().len(), 1);
let segment_reader = searcher.segment_readers().last().unwrap();
// let int_field = index.schema().get_field("intval").unwrap();
// let multi_numbers = index.schema().get_field("multi_numbers").unwrap();
// let bytes_field = index.schema().get_field("bytes").unwrap();
// let reader = index.reader().unwrap();
// let searcher = reader.searcher();
// assert_eq!(searcher.segment_readers().len(), 1);
// let segment_reader = searcher.segment_readers().last().unwrap();
let fast_fields = segment_reader.fast_fields();
let fast_field = fast_fields.u64("intval").unwrap();
assert_eq!(fast_field.get_val(0), 1u64);
assert_eq!(fast_field.get_val(1), 2u64);
assert_eq!(fast_field.get_val(2), 3u64);
assert_eq!(fast_field.get_val(3), 10u64);
assert_eq!(fast_field.get_val(4), 20u64);
assert_eq!(fast_field.get_val(5), 1_000u64);
// let fast_fields = segment_reader.fast_fields();
// let fast_field = fast_fields.u64(int_field).unwrap();
// assert_eq!(fast_field.get_val(0), 1u64);
// assert_eq!(fast_field.get_val(1), 2u64);
// assert_eq!(fast_field.get_val(2), 3u64);
// assert_eq!(fast_field.get_val(3), 10u64);
// assert_eq!(fast_field.get_val(4), 20u64);
// assert_eq!(fast_field.get_val(5), 1_000u64);
let get_vals = |fast_field: &MultiValuedFastFieldReader<u64>, doc_id: u32| -> Vec<u64> {
let mut vals = vec![];
fast_field.get_vals(doc_id, &mut vals);
vals
};
let fast_fields = segment_reader.fast_fields();
let fast_field = fast_fields.u64s("multi_numbers").unwrap();
assert_eq!(&get_vals(&fast_field, 0), &[] as &[u64]);
assert_eq!(&get_vals(&fast_field, 1), &[2, 3]);
assert_eq!(&get_vals(&fast_field, 2), &[3, 4]);
assert_eq!(&get_vals(&fast_field, 3), &[10, 11]);
assert_eq!(&get_vals(&fast_field, 4), &[20]);
assert_eq!(&get_vals(&fast_field, 5), &[1001, 1002]);
// let get_vals = |fast_field: &MultiValuedFastFieldReader<u64>, doc_id: u32| -> Vec<u64> {
// let mut vals = vec![];
// fast_field.get_vals(doc_id, &mut vals);
// vals
// };
// let fast_fields = segment_reader.fast_fields();
// let fast_field = fast_fields.u64s(multi_numbers).unwrap();
// assert_eq!(&get_vals(&fast_field, 0), &[] as &[u64]);
// assert_eq!(&get_vals(&fast_field, 1), &[2, 3]);
// assert_eq!(&get_vals(&fast_field, 2), &[3, 4]);
// assert_eq!(&get_vals(&fast_field, 3), &[10, 11]);
// assert_eq!(&get_vals(&fast_field, 4), &[20]);
// assert_eq!(&get_vals(&fast_field, 5), &[1001, 1002]);
let fast_field = fast_fields.bytes("bytes").unwrap();
assert_eq!(fast_field.get_bytes(0), &[] as &[u8]);
assert_eq!(fast_field.get_bytes(2), &[1, 2, 3]);
assert_eq!(fast_field.get_bytes(5), &[5, 5]);
// let fast_field = fast_fields.bytes(bytes_field).unwrap();
// assert_eq!(fast_field.get_bytes(0), &[] as &[u8]);
// assert_eq!(fast_field.get_bytes(2), &[1, 2, 3]);
// assert_eq!(fast_field.get_bytes(5), &[5, 5]);
// test new field norm mapping
{
let my_text_field = index.schema().get_field("text_field").unwrap();
let fieldnorm_reader = segment_reader.get_fieldnorms_reader(my_text_field).unwrap();
assert_eq!(fieldnorm_reader.fieldnorm(0), 0);
assert_eq!(fieldnorm_reader.fieldnorm(1), 4);
assert_eq!(fieldnorm_reader.fieldnorm(2), 2); // some text
assert_eq!(fieldnorm_reader.fieldnorm(3), 1);
assert_eq!(fieldnorm_reader.fieldnorm(5), 3); // the biggest num
}
// // test new field norm mapping
// {
// let my_text_field = index.schema().get_field("text_field").unwrap();
// let fieldnorm_reader = segment_reader.get_fieldnorms_reader(my_text_field).unwrap();
// assert_eq!(fieldnorm_reader.fieldnorm(0), 0);
// assert_eq!(fieldnorm_reader.fieldnorm(1), 4);
// assert_eq!(fieldnorm_reader.fieldnorm(2), 2); // some text
// assert_eq!(fieldnorm_reader.fieldnorm(3), 1);
// assert_eq!(fieldnorm_reader.fieldnorm(5), 3); // the biggest num
// }
let searcher = index.reader().unwrap().searcher();
{
let my_text_field = index.schema().get_field("text_field").unwrap();
// let searcher = index.reader().unwrap().searcher();
// {
// let my_text_field = index.schema().get_field("text_field").unwrap();
let do_search = |term: &str| {
let query = QueryParser::for_index(&index, vec![my_text_field])
.parse_query(term)
.unwrap();
let top_docs: Vec<(f32, DocAddress)> =
searcher.search(&query, &TopDocs::with_limit(3)).unwrap();
// let do_search = |term: &str| {
// let query = QueryParser::for_index(&index, vec![my_text_field])
// .parse_query(term)
// .unwrap();
// let top_docs: Vec<(f32, DocAddress)> =
// searcher.search(&query, &TopDocs::with_limit(3)).unwrap();
top_docs.iter().map(|el| el.1.doc_id).collect::<Vec<_>>()
};
// top_docs.iter().map(|el| el.1.doc_id).collect::<Vec<_>>()
// };
assert_eq!(do_search("some"), vec![2]);
assert_eq!(do_search("blubber"), vec![3]);
assert_eq!(do_search("biggest"), vec![5]);
}
// assert_eq!(do_search("some"), vec![2]);
// assert_eq!(do_search("blubber"), vec![3]);
// assert_eq!(do_search("biggest"), vec![5]);
// }
// postings file
{
let my_text_field = index.schema().get_field("text_field").unwrap();
let term_a = Term::from_field_text(my_text_field, "text");
let inverted_index = segment_reader.inverted_index(my_text_field).unwrap();
let mut postings = inverted_index
.read_postings(&term_a, IndexRecordOption::WithFreqsAndPositions)
.unwrap()
.unwrap();
// // postings file
// {
// let my_text_field = index.schema().get_field("text_field").unwrap();
// let term_a = Term::from_field_text(my_text_field, "text");
// let inverted_index = segment_reader.inverted_index(my_text_field).unwrap();
// let mut postings = inverted_index
// .read_postings(&term_a, IndexRecordOption::WithFreqsAndPositions)
// .unwrap()
// .unwrap();
assert_eq!(postings.doc_freq(), 2);
let fallback_bitset = AliveBitSet::for_test_from_deleted_docs(&[0], 100);
assert_eq!(
postings.doc_freq_given_deletes(
segment_reader.alive_bitset().unwrap_or(&fallback_bitset)
),
2
);
// assert_eq!(postings.doc_freq(), 2);
// let fallback_bitset = AliveBitSet::for_test_from_deleted_docs(&[0], 100);
// assert_eq!(
// postings.doc_freq_given_deletes(
// segment_reader.alive_bitset().unwrap_or(&fallback_bitset)
// ),
// 2
// );
let mut output = vec![];
postings.positions(&mut output);
assert_eq!(output, vec![1, 3]);
postings.advance();
// let mut output = vec![];
// postings.positions(&mut output);
// assert_eq!(output, vec![1, 3]);
// postings.advance();
postings.positions(&mut output);
assert_eq!(output, vec![1]);
}
// postings.positions(&mut output);
// assert_eq!(output, vec![1]);
// }
// access doc store
{
let doc = searcher.doc(DocAddress::new(0, 0)).unwrap();
assert_eq!(doc.get_first(int_field).unwrap().as_u64(), Some(1));
let doc = searcher.doc(DocAddress::new(0, 1)).unwrap();
assert_eq!(doc.get_first(int_field).unwrap().as_u64(), Some(2));
let doc = searcher.doc(DocAddress::new(0, 2)).unwrap();
assert_eq!(doc.get_first(int_field).unwrap().as_u64(), Some(3));
let doc = searcher.doc(DocAddress::new(0, 3)).unwrap();
assert_eq!(doc.get_first(int_field).unwrap().as_u64(), Some(10));
let doc = searcher.doc(DocAddress::new(0, 4)).unwrap();
assert_eq!(doc.get_first(int_field).unwrap().as_u64(), Some(20));
let doc = searcher.doc(DocAddress::new(0, 5)).unwrap();
assert_eq!(doc.get_first(int_field).unwrap().as_u64(), Some(1_000));
}
}
// // access doc store
// {
// let doc = searcher.doc(DocAddress::new(0, 0)).unwrap();
// assert_eq!(doc.get_first(int_field).unwrap().as_u64(), Some(1));
// let doc = searcher.doc(DocAddress::new(0, 1)).unwrap();
// assert_eq!(doc.get_first(int_field).unwrap().as_u64(), Some(2));
// let doc = searcher.doc(DocAddress::new(0, 2)).unwrap();
// assert_eq!(doc.get_first(int_field).unwrap().as_u64(), Some(3));
// let doc = searcher.doc(DocAddress::new(0, 3)).unwrap();
// assert_eq!(doc.get_first(int_field).unwrap().as_u64(), Some(10));
// let doc = searcher.doc(DocAddress::new(0, 4)).unwrap();
// assert_eq!(doc.get_first(int_field).unwrap().as_u64(), Some(20));
// let doc = searcher.doc(DocAddress::new(0, 5)).unwrap();
// assert_eq!(doc.get_first(int_field).unwrap().as_u64(), Some(1_000));
// }
// }
}
#[cfg(all(test, feature = "unstable"))]
@@ -487,9 +485,7 @@ mod bench_sorted_index_merge {
use crate::{IndexSettings, IndexSortByField, IndexWriter, Order};
fn create_index(sort_by_field: Option<IndexSortByField>) -> Index {
let mut schema_builder = Schema::builder();
let int_options = NumericOptions::default()
.set_fast(Cardinality::SingleValue)
.set_indexed();
let int_options = NumericOptions::default().set_fast().set_indexed();
let int_field = schema_builder.add_u64_field("intval", int_options);
let schema = schema_builder.build();

View File

@@ -19,8 +19,8 @@ mod segment_register;
pub mod segment_serializer;
pub mod segment_updater;
mod segment_writer;
mod sorted_doc_id_column;
mod sorted_doc_id_multivalue_column;
// mod sorted_doc_id_column;
// mod sorted_doc_id_multivalue_column;
mod stamper;
use crossbeam_channel as channel;
@@ -58,7 +58,7 @@ type AddBatchReceiver = channel::Receiver<AddBatch>;
#[cfg(test)]
mod tests_mmap {
use crate::collector::Count;
use crate::query::QueryParser;
// use crate::query::QueryParser;
use crate::schema::{JsonObjectOptions, Schema, TEXT};
use crate::{Index, Term};
@@ -79,45 +79,45 @@ mod tests_mmap {
Ok(())
}
#[test]
fn test_json_field_expand_dots_disabled_dot_escaped_required() {
let mut schema_builder = Schema::builder();
let json_field = schema_builder.add_json_field("json", TEXT);
let index = Index::create_in_ram(schema_builder.build());
let mut index_writer = index.writer_for_tests().unwrap();
let json = serde_json::json!({"k8s.container.name": "prometheus", "val": "hello"});
index_writer.add_document(doc!(json_field=>json)).unwrap();
index_writer.commit().unwrap();
let reader = index.reader().unwrap();
let searcher = reader.searcher();
assert_eq!(searcher.num_docs(), 1);
let parse_query = QueryParser::for_index(&index, Vec::new());
let query = parse_query
.parse_query(r#"json.k8s\.container\.name:prometheus"#)
.unwrap();
let num_docs = searcher.search(&query, &Count).unwrap();
assert_eq!(num_docs, 1);
}
// #[test]
// fn test_json_field_expand_dots_disabled_dot_escaped_required() {
// let mut schema_builder = Schema::builder();
// let json_field = schema_builder.add_json_field("json", TEXT);
// let index = Index::create_in_ram(schema_builder.build());
// let mut index_writer = index.writer_for_tests().unwrap();
// let json = serde_json::json!({"k8s.container.name": "prometheus", "val": "hello"});
// index_writer.add_document(doc!(json_field=>json)).unwrap();
// index_writer.commit().unwrap();
// let reader = index.reader().unwrap();
// let searcher = reader.searcher();
// assert_eq!(searcher.num_docs(), 1);
// let parse_query = QueryParser::for_index(&index, Vec::new());
// let query = parse_query
// .parse_query(r#"json.k8s\.container\.name:prometheus"#)
// .unwrap();
// let num_docs = searcher.search(&query, &Count).unwrap();
// assert_eq!(num_docs, 1);
// }
#[test]
fn test_json_field_expand_dots_enabled_dot_escape_not_required() {
let mut schema_builder = Schema::builder();
let json_options: JsonObjectOptions =
JsonObjectOptions::from(TEXT).set_expand_dots_enabled();
let json_field = schema_builder.add_json_field("json", json_options);
let index = Index::create_in_ram(schema_builder.build());
let mut index_writer = index.writer_for_tests().unwrap();
let json = serde_json::json!({"k8s.container.name": "prometheus", "val": "hello"});
index_writer.add_document(doc!(json_field=>json)).unwrap();
index_writer.commit().unwrap();
let reader = index.reader().unwrap();
let searcher = reader.searcher();
assert_eq!(searcher.num_docs(), 1);
let parse_query = QueryParser::for_index(&index, Vec::new());
let query = parse_query
.parse_query(r#"json.k8s.container.name:prometheus"#)
.unwrap();
let num_docs = searcher.search(&query, &Count).unwrap();
assert_eq!(num_docs, 1);
}
// #[test]
// fn test_json_field_expand_dots_enabled_dot_escape_not_required() {
// let mut schema_builder = Schema::builder();
// let json_options: JsonObjectOptions =
// JsonObjectOptions::from(TEXT).set_expand_dots_enabled();
// let json_field = schema_builder.add_json_field("json", json_options);
// let index = Index::create_in_ram(schema_builder.build());
// let mut index_writer = index.writer_for_tests().unwrap();
// let json = serde_json::json!({"k8s.container.name": "prometheus", "val": "hello"});
// index_writer.add_document(doc!(json_field=>json)).unwrap();
// index_writer.commit().unwrap();
// let reader = index.reader().unwrap();
// let searcher = reader.searcher();
// assert_eq!(searcher.num_docs(), 1);
// let parse_query = QueryParser::for_index(&index, Vec::new());
// let query = parse_query
// .parse_query(r#"json.k8s.container.name:prometheus"#)
// .unwrap();
// let num_docs = searcher.search(&query, &Count).unwrap();
// assert_eq!(num_docs, 1);
// }
}

View File

@@ -1,4 +1,7 @@
use common::TerminatingWrite;
use crate::core::{Segment, SegmentComponent};
use crate::directory::WritePtr;
use crate::fastfield::CompositeFastFieldSerializer;
use crate::fieldnorm::FieldNormsSerializer;
use crate::postings::InvertedIndexSerializer;
@@ -9,7 +12,7 @@ use crate::store::StoreWriter;
pub struct SegmentSerializer {
segment: Segment,
pub(crate) store_writer: StoreWriter,
fast_field_serializer: CompositeFastFieldSerializer,
fast_field_write: WritePtr,
fieldnorms_serializer: Option<FieldNormsSerializer>,
postings_serializer: InvertedIndexSerializer,
}
@@ -47,7 +50,6 @@ impl SegmentSerializer {
};
let fast_field_write = segment.open_write(SegmentComponent::FastFields)?;
let fast_field_serializer = CompositeFastFieldSerializer::from_write(fast_field_write)?;
let fieldnorms_write = segment.open_write(SegmentComponent::FieldNorms)?;
let fieldnorms_serializer = FieldNormsSerializer::from_write(fieldnorms_write)?;
@@ -56,7 +58,7 @@ impl SegmentSerializer {
Ok(SegmentSerializer {
segment,
store_writer,
fast_field_serializer,
fast_field_write,
fieldnorms_serializer: Some(fieldnorms_serializer),
postings_serializer,
})
@@ -81,8 +83,8 @@ impl SegmentSerializer {
}
/// Accessor to the `FastFieldSerializer`.
pub fn get_fast_field_serializer(&mut self) -> &mut CompositeFastFieldSerializer {
&mut self.fast_field_serializer
pub fn get_fast_field_write(&mut self) -> &mut WritePtr {
&mut self.fast_field_write
}
/// Extract the field norm serializer.
@@ -102,7 +104,7 @@ impl SegmentSerializer {
if let Some(fieldnorms_serializer) = self.extract_fieldnorms_serializer() {
fieldnorms_serializer.close()?;
}
self.fast_field_serializer.close()?;
self.fast_field_write.terminate()?;
self.postings_serializer.close()?;
self.store_writer.close()?;
Ok(())

View File

@@ -139,7 +139,6 @@ impl SegmentWriter {
self.ctx,
self.fast_field_writers,
&self.fieldnorms_writer,
&self.schema,
self.segment_serializer,
mapping.as_ref(),
)?;
@@ -185,22 +184,15 @@ impl SegmentWriter {
for value in values {
let facet = value.as_facet().ok_or_else(make_schema_error)?;
let facet_str = facet.encoded_str();
let mut unordered_term_id_opt = None;
FacetTokenizer
.token_stream(facet_str)
.process(&mut |token| {
term_buffer.set_text(&token.text);
let unordered_term_id =
postings_writer.subscribe(doc_id, 0u32, term_buffer, ctx);
// TODO pass indexing context directly in subscribe function
unordered_term_id_opt = Some(unordered_term_id);
});
if let Some(unordered_term_id) = unordered_term_id_opt {
self.fast_field_writers
.get_term_id_writer_mut(field)
.expect("writer for facet missing")
.add_val(unordered_term_id);
}
let mut facet_tokenizer = FacetTokenizer.token_stream(facet_str);
let mut indexing_position = IndexingPosition::default();
postings_writer.index_text(
doc_id,
&mut *facet_tokenizer,
term_buffer,
ctx,
&mut indexing_position,
);
}
}
FieldType::Str(_) => {
@@ -227,7 +219,6 @@ impl SegmentWriter {
term_buffer,
ctx,
&mut indexing_position,
self.fast_field_writers.get_term_id_writer_mut(field),
);
}
if field_entry.has_fieldnorms() {
@@ -383,7 +374,6 @@ fn remap_and_write(
ctx: IndexingContext,
fast_field_writers: FastFieldsWriter,
fieldnorms_writer: &FieldNormsWriter,
schema: &Schema,
mut serializer: SegmentSerializer,
doc_id_map: Option<&DocIdMapping>,
) -> crate::Result<()> {
@@ -395,20 +385,15 @@ fn remap_and_write(
.segment()
.open_read(SegmentComponent::FieldNorms)?;
let fieldnorm_readers = FieldNormReaders::open(fieldnorm_data)?;
let term_ord_map = serialize_postings(
serialize_postings(
ctx,
per_field_postings_writers,
fieldnorm_readers,
doc_id_map,
schema,
serializer.get_postings_serializer(),
)?;
debug!("fastfield-serialize");
fast_field_writers.serialize(
serializer.get_fast_field_serializer(),
&term_ord_map,
doc_id_map,
)?;
fast_field_writers.serialize(serializer.get_fast_field_write(), doc_id_map)?;
// finalize temp docstore and create version, which reflects the doc_id_map
if let Some(doc_id_map) = doc_id_map {

View File

@@ -147,6 +147,22 @@ pub struct DateTime {
pub(crate) timestamp_micros: i64,
}
/// Converts a [`columnar::DateTime`] into this crate's [`DateTime`].
///
/// The `timestamp_micros` value of the source is carried over unchanged.
impl From<columnar::DateTime> for DateTime {
    fn from(columnar_datetime: columnar::DateTime) -> Self {
        let timestamp_micros = columnar_datetime.timestamp_micros;
        Self { timestamp_micros }
    }
}
/// Converts this crate's [`DateTime`] into a [`columnar::DateTime`].
///
/// The `timestamp_micros` value of the source is carried over unchanged.
impl From<DateTime> for columnar::DateTime {
    fn from(datetime: crate::DateTime) -> Self {
        let timestamp_micros = datetime.timestamp_micros;
        Self { timestamp_micros }
    }
}
impl DateTime {
/// Create new from UNIX timestamp in seconds
pub const fn from_timestamp_secs(seconds: i64) -> Self {
@@ -263,7 +279,7 @@ mod indexer;
pub mod error;
pub mod tokenizer;
pub mod aggregation;
// pub mod aggregation;
pub mod collector;
pub mod directory;
pub mod fastfield;

View File

@@ -2,13 +2,10 @@ use std::io;
use stacker::Addr;
use crate::fastfield::MultiValuedFastFieldWriter;
use crate::indexer::doc_id_mapping::DocIdMapping;
use crate::postings::postings_writer::SpecializedPostingsWriter;
use crate::postings::recorder::{BufferLender, DocIdRecorder, Recorder};
use crate::postings::{
FieldSerializer, IndexingContext, IndexingPosition, PostingsWriter, UnorderedTermId,
};
use crate::postings::{FieldSerializer, IndexingContext, IndexingPosition, PostingsWriter};
use crate::schema::term::as_json_path_type_value_bytes;
use crate::schema::Type;
use crate::tokenizer::TokenStream;
@@ -33,8 +30,8 @@ impl<Rec: Recorder> PostingsWriter for JsonPostingsWriter<Rec> {
pos: u32,
term: &crate::Term,
ctx: &mut IndexingContext,
) -> UnorderedTermId {
self.non_str_posting_writer.subscribe(doc, pos, term, ctx)
) {
self.non_str_posting_writer.subscribe(doc, pos, term, ctx);
}
fn index_text(
@@ -44,7 +41,6 @@ impl<Rec: Recorder> PostingsWriter for JsonPostingsWriter<Rec> {
term_buffer: &mut Term,
ctx: &mut IndexingContext,
indexing_position: &mut IndexingPosition,
_fast_field_writer: Option<&mut MultiValuedFastFieldWriter>,
) {
self.str_posting_writer.index_text(
doc_id,
@@ -52,20 +48,19 @@ impl<Rec: Recorder> PostingsWriter for JsonPostingsWriter<Rec> {
term_buffer,
ctx,
indexing_position,
None,
);
}
/// The actual serialization format is handled by the `PostingsSerializer`.
fn serialize(
&self,
term_addrs: &[(Term<&[u8]>, Addr, UnorderedTermId)],
term_addrs: &[(Term<&[u8]>, Addr)],
doc_id_map: Option<&DocIdMapping>,
ctx: &IndexingContext,
serializer: &mut FieldSerializer,
) -> io::Result<()> {
let mut buffer_lender = BufferLender::default();
for (term, addr, _) in term_addrs {
for (term, addr) in term_addrs {
// TODO optimization opportunity here.
if let Some((_, typ, _)) = as_json_path_type_value_bytes(term.value_bytes()) {
if typ == Type::Str {

View File

@@ -6,7 +6,6 @@ use std::ops::Range;
use rustc_hash::FxHashMap;
use stacker::Addr;
use crate::fastfield::MultiValuedFastFieldWriter;
use crate::fieldnorm::FieldNormReaders;
use crate::indexer::doc_id_mapping::DocIdMapping;
use crate::postings::recorder::{BufferLender, Recorder};
@@ -21,12 +20,10 @@ use crate::DocId;
const POSITION_GAP: u32 = 1;
fn make_field_partition(
term_offsets: &[(Term<&[u8]>, Addr, UnorderedTermId)],
) -> Vec<(Field, Range<usize>)> {
fn make_field_partition(term_offsets: &[(Term<&[u8]>, Addr)]) -> Vec<(Field, Range<usize>)> {
let term_offsets_it = term_offsets
.iter()
.map(|(term, _, _)| term.field())
.map(|(term, _)| term.field())
.enumerate();
let mut prev_field_opt = None;
let mut fields = vec![];
@@ -54,48 +51,18 @@ pub(crate) fn serialize_postings(
per_field_postings_writers: &PerFieldPostingsWriter,
fieldnorm_readers: FieldNormReaders,
doc_id_map: Option<&DocIdMapping>,
schema: &Schema,
serializer: &mut InvertedIndexSerializer,
) -> crate::Result<HashMap<Field, FxHashMap<UnorderedTermId, TermOrdinal>>> {
let mut term_offsets: Vec<(Term<&[u8]>, Addr, UnorderedTermId)> =
Vec::with_capacity(ctx.term_index.len());
) -> crate::Result<()> {
let mut term_offsets: Vec<(Term<&[u8]>, Addr)> = Vec::with_capacity(ctx.term_index.len());
term_offsets.extend(
ctx.term_index
.iter()
.map(|(bytes, addr, unordered_id)| (Term::wrap(bytes), addr, unordered_id)),
.map(|(bytes, addr, _unordered_id)| (Term::wrap(bytes), addr)),
);
term_offsets.sort_unstable_by_key(|(k, _, _)| k.clone());
let mut unordered_term_mappings: HashMap<Field, FxHashMap<UnorderedTermId, TermOrdinal>> =
HashMap::new();
term_offsets.sort_unstable_by_key(|(k, _)| k.clone());
let field_offsets = make_field_partition(&term_offsets);
for (field, byte_offsets) in field_offsets {
let field_entry = schema.get_field_entry(field);
match *field_entry.field_type() {
FieldType::Str(_) | FieldType::Facet(_) => {
// populating the (unordered term ord) -> (ordered term ord) mapping
// for the field.
let unordered_term_ids = term_offsets[byte_offsets.clone()]
.iter()
.map(|&(_, _, bucket)| bucket);
let mapping: FxHashMap<UnorderedTermId, TermOrdinal> = unordered_term_ids
.enumerate()
.map(|(term_ord, unord_term_id)| {
(unord_term_id as UnorderedTermId, term_ord as TermOrdinal)
})
.collect();
unordered_term_mappings.insert(field, mapping);
}
FieldType::U64(_)
| FieldType::I64(_)
| FieldType::F64(_)
| FieldType::Date(_)
| FieldType::Bool(_) => {}
FieldType::Bytes(_) => {}
FieldType::JsonObject(_) => {}
FieldType::IpAddr(_) => {}
}
let postings_writer = per_field_postings_writers.get_for_field(field);
let fieldnorm_reader = fieldnorm_readers.get_field(field)?;
let mut field_serializer =
@@ -108,7 +75,7 @@ pub(crate) fn serialize_postings(
)?;
field_serializer.close()?;
}
Ok(unordered_term_mappings)
Ok(())
}
#[derive(Default)]
@@ -129,19 +96,13 @@ pub(crate) trait PostingsWriter: Send + Sync {
/// * term - the term
/// * ctx - Contains a term hashmap and a memory arena to store all necessary posting list
/// information.
fn subscribe(
&mut self,
doc: DocId,
pos: u32,
term: &Term,
ctx: &mut IndexingContext,
) -> UnorderedTermId;
fn subscribe(&mut self, doc: DocId, pos: u32, term: &Term, ctx: &mut IndexingContext);
/// Serializes the postings on disk.
/// The actual serialization format is handled by the `PostingsSerializer`.
fn serialize(
&self,
term_addrs: &[(Term<&[u8]>, Addr, UnorderedTermId)],
term_addrs: &[(Term<&[u8]>, Addr)],
doc_id_map: Option<&DocIdMapping>,
ctx: &IndexingContext,
serializer: &mut FieldSerializer,
@@ -155,7 +116,6 @@ pub(crate) trait PostingsWriter: Send + Sync {
term_buffer: &mut Term,
ctx: &mut IndexingContext,
indexing_position: &mut IndexingPosition,
mut term_id_fast_field_writer_opt: Option<&mut MultiValuedFastFieldWriter>,
) {
let end_of_path_idx = term_buffer.len_bytes();
let mut num_tokens = 0;
@@ -175,11 +135,7 @@ pub(crate) trait PostingsWriter: Send + Sync {
term_buffer.append_bytes(token.text.as_bytes());
let start_position = indexing_position.end_position + token.position as u32;
end_position = end_position.max(start_position + token.position_length as u32);
let unordered_term_id = self.subscribe(doc_id, start_position, term_buffer, ctx);
if let Some(term_id_fast_field_writer) = term_id_fast_field_writer_opt.as_mut() {
term_id_fast_field_writer.add_val(unordered_term_id);
}
self.subscribe(doc_id, start_position, term_buffer, ctx);
num_tokens += 1;
});
@@ -227,13 +183,7 @@ impl<Rec: Recorder> SpecializedPostingsWriter<Rec> {
}
impl<Rec: Recorder> PostingsWriter for SpecializedPostingsWriter<Rec> {
fn subscribe(
&mut self,
doc: DocId,
position: u32,
term: &Term,
ctx: &mut IndexingContext,
) -> UnorderedTermId {
fn subscribe(&mut self, doc: DocId, position: u32, term: &Term, ctx: &mut IndexingContext) {
debug_assert!(term.as_slice().len() >= 4);
self.total_num_tokens += 1;
let (term_index, arena) = (&mut ctx.term_index, &mut ctx.arena);
@@ -252,18 +202,18 @@ impl<Rec: Recorder> PostingsWriter for SpecializedPostingsWriter<Rec> {
recorder.record_position(position, arena);
recorder
}
}) as UnorderedTermId
});
}
fn serialize(
&self,
term_addrs: &[(Term<&[u8]>, Addr, UnorderedTermId)],
term_addrs: &[(Term<&[u8]>, Addr)],
doc_id_map: Option<&DocIdMapping>,
ctx: &IndexingContext,
serializer: &mut FieldSerializer,
) -> io::Result<()> {
let mut buffer_lender = BufferLender::default();
for (term, addr, _) in term_addrs {
for (term, addr) in term_addrs {
Self::serialize_one_term(term, *addr, doc_id_map, &mut buffer_lender, ctx, serializer)?;
}
Ok(())

View File

@@ -15,7 +15,7 @@ mod more_like_this;
mod phrase_query;
mod query;
mod query_parser;
mod range_query;
// mod range_query;
mod regex_query;
mod reqopt_scorer;
mod scorer;
@@ -50,7 +50,7 @@ pub use self::more_like_this::{MoreLikeThisQuery, MoreLikeThisQueryBuilder};
pub use self::phrase_query::PhraseQuery;
pub use self::query::{EnableScoring, Query, QueryClone};
pub use self::query_parser::{QueryParser, QueryParserError};
pub use self::range_query::RangeQuery;
// pub use self::range_query::RangeQuery;
pub use self::regex_query::RegexQuery;
pub use self::reqopt_scorer::RequiredOptionalScorer;
pub use self::score_combiner::{

View File

@@ -13,10 +13,19 @@ use crate::core::Index;
use crate::indexer::{
convert_to_fast_value_and_get_term, set_string_and_get_terms, JsonTermWriter,
};
use crate::query::range_query::is_type_valid_for_fastfield_range_query;
// use crate::query::range_query::is_type_valid_for_fastfield_range_query;
use crate::query::{
AllQuery, BooleanQuery, BoostQuery, EmptyQuery, FuzzyTermQuery, Occur, PhraseQuery, Query,
RangeQuery, TermQuery, TermSetQuery,
AllQuery,
BooleanQuery,
BoostQuery,
EmptyQuery,
FuzzyTermQuery,
Occur,
PhraseQuery,
Query,
// RangeQuery,
TermQuery,
TermSetQuery,
};
use crate::schema::{
Facet, FacetParseError, Field, FieldType, IndexRecordOption, IntoIpv6Addr, JsonObjectOptions,
@@ -334,89 +343,90 @@ impl QueryParser {
json_path: &str,
phrase: &str,
) -> Result<Term, QueryParserError> {
let field_entry = self.schema.get_field_entry(field);
let field_type = field_entry.field_type();
let field_supports_ff_range_queries = field_type.is_fast()
&& is_type_valid_for_fastfield_range_query(field_type.value_type());
if !field_type.is_indexed() && !field_supports_ff_range_queries {
return Err(QueryParserError::FieldNotIndexed(
field_entry.name().to_string(),
));
}
if !json_path.is_empty() && field_type.value_type() != Type::Json {
return Err(QueryParserError::UnsupportedQuery(format!(
"Json path is not supported for field {:?}",
field_entry.name()
)));
}
match *field_type {
FieldType::U64(_) => {
let val: u64 = u64::from_str(phrase)?;
Ok(Term::from_field_u64(field, val))
}
FieldType::I64(_) => {
let val: i64 = i64::from_str(phrase)?;
Ok(Term::from_field_i64(field, val))
}
FieldType::F64(_) => {
let val: f64 = f64::from_str(phrase)?;
Ok(Term::from_field_f64(field, val))
}
FieldType::Bool(_) => {
let val: bool = bool::from_str(phrase)?;
Ok(Term::from_field_bool(field, val))
}
FieldType::Date(_) => {
let dt = OffsetDateTime::parse(phrase, &Rfc3339)?;
Ok(Term::from_field_date(field, DateTime::from_utc(dt)))
}
FieldType::Str(ref str_options) => {
let option = str_options.get_indexing_options().ok_or_else(|| {
// This should have been seen earlier really.
QueryParserError::FieldNotIndexed(field_entry.name().to_string())
})?;
let text_analyzer =
self.tokenizer_manager
.get(option.tokenizer())
.ok_or_else(|| QueryParserError::UnknownTokenizer {
field: field_entry.name().to_string(),
tokenizer: option.tokenizer().to_string(),
})?;
let mut terms: Vec<Term> = Vec::new();
let mut token_stream = text_analyzer.token_stream(phrase);
token_stream.process(&mut |token| {
let term = Term::from_field_text(field, &token.text);
terms.push(term);
});
if terms.len() != 1 {
return Err(QueryParserError::UnsupportedQuery(format!(
"Range query boundary cannot have multiple tokens: {phrase:?}."
)));
}
Ok(terms.into_iter().next().unwrap())
}
FieldType::JsonObject(_) => {
// Json range are not supported.
Err(QueryParserError::UnsupportedQuery(
"Range query are not supported on json field.".to_string(),
))
}
FieldType::Facet(_) => match Facet::from_text(phrase) {
Ok(facet) => Ok(Term::from_facet(field, &facet)),
Err(e) => Err(QueryParserError::from(e)),
},
FieldType::Bytes(_) => {
let bytes = BASE64
.decode(phrase)
.map_err(QueryParserError::ExpectedBase64)?;
Ok(Term::from_field_bytes(field, &bytes))
}
FieldType::IpAddr(_) => {
let ip_v6 = IpAddr::from_str(phrase)?.into_ipv6_addr();
Ok(Term::from_field_ip_addr(field, ip_v6))
}
}
todo!();
// let field_entry = self.schema.get_field_entry(field);
// let field_type = field_entry.field_type();
// let field_supports_ff_range_queries = field_type.is_fast()
// && is_type_valid_for_fastfield_range_query(field_type.value_type());
//
// if !field_type.is_indexed() && !field_supports_ff_range_queries {
// return Err(QueryParserError::FieldNotIndexed(
// field_entry.name().to_string(),
// ));
// }
// if !json_path.is_empty() && field_type.value_type() != Type::Json {
// return Err(QueryParserError::UnsupportedQuery(format!(
// "Json path is not supported for field {:?}",
// field_entry.name()
// )));
// }
// match *field_type {
// FieldType::U64(_) => {
// let val: u64 = u64::from_str(phrase)?;
// Ok(Term::from_field_u64(field, val))
// }
// FieldType::I64(_) => {
// let val: i64 = i64::from_str(phrase)?;
// Ok(Term::from_field_i64(field, val))
// }
// FieldType::F64(_) => {
// let val: f64 = f64::from_str(phrase)?;
// Ok(Term::from_field_f64(field, val))
// }
// FieldType::Bool(_) => {
// let val: bool = bool::from_str(phrase)?;
// Ok(Term::from_field_bool(field, val))
// }
// FieldType::Date(_) => {
// let dt = OffsetDateTime::parse(phrase, &Rfc3339)?;
// Ok(Term::from_field_date(field, DateTime::from_utc(dt)))
// }
// FieldType::Str(ref str_options) => {
// let option = str_options.get_indexing_options().ok_or_else(|| {
// This should have been seen earlier really.
// QueryParserError::FieldNotIndexed(field_entry.name().to_string())
// })?;
// let text_analyzer =
// self.tokenizer_manager
// .get(option.tokenizer())
// .ok_or_else(|| QueryParserError::UnknownTokenizer {
// field: field_entry.name().to_string(),
// tokenizer: option.tokenizer().to_string(),
// })?;
// let mut terms: Vec<Term> = Vec::new();
// let mut token_stream = text_analyzer.token_stream(phrase);
// token_stream.process(&mut |token| {
// let term = Term::from_field_text(field, &token.text);
// terms.push(term);
// });
// if terms.len() != 1 {
// return Err(QueryParserError::UnsupportedQuery(format!(
// "Range query boundary cannot have multiple tokens: {phrase:?}."
// )));
// }
// Ok(terms.into_iter().next().unwrap())
// }
// FieldType::JsonObject(_) => {
// Range queries are not supported on JSON fields.
// Err(QueryParserError::UnsupportedQuery(
// "Range query are not supported on json field.".to_string(),
// ))
// }
// FieldType::Facet(_) => match Facet::from_text(phrase) {
// Ok(facet) => Ok(Term::from_facet(field, &facet)),
// Err(e) => Err(QueryParserError::from(e)),
// },
// FieldType::Bytes(_) => {
// let bytes = BASE64
// .decode(phrase)
// .map_err(QueryParserError::ExpectedBase64)?;
// Ok(Term::from_field_bytes(field, &bytes))
// }
// FieldType::IpAddr(_) => {
// let ip_v6 = IpAddr::from_str(phrase)?.into_ipv6_addr();
// Ok(Term::from_field_ip_addr(field, ip_v6))
// }
// }
}
fn compute_logical_ast_for_leaf(
@@ -740,9 +750,12 @@ fn convert_literal_to_query(
value_type,
lower,
upper,
} => Box::new(RangeQuery::new_term_bounds(
field, value_type, &lower, &upper,
)),
} => {
todo!();
// Box::new(RangeQuery::new_term_bounds(
// field, value_type, &lower, &upper,
// ))
}
LogicalLiteral::Set { elements, .. } => Box::new(TermSetQuery::new(elements)),
LogicalLiteral::All => Box::new(AllQuery),
}

View File

@@ -4,7 +4,7 @@ use std::sync::Arc;
use fastfield_codecs::Column;
use crate::fastfield::{MakeZero, MultiValuedFastFieldReader};
use crate::fastfield::MakeZero;
use crate::{DocId, DocSet, TERMINATED};
/// Helper to have a cursor over a vec of docids

View File

@@ -8,10 +8,13 @@ use std::ops::{Bound, RangeInclusive};
use common::BinarySerializable;
use fastfield_codecs::MonotonicallyMappableToU128;
use super::fast_field_range_query::{FastFieldCardinality, RangeDocSet};
use super::range_query::map_bound;
use crate::query::{ConstScorer, Explanation, Scorer, Weight};
<<<<<<< HEAD
use crate::schema::Cardinality;
=======
use crate::schema::Field;
>>>>>>> fd1deefd12 (Disconnected facet / fast field merges / examples)
use crate::{DocId, DocSet, Score, SegmentReader, TantivyError};
/// `IPFastFieldRangeWeight` uses the ip address fast field to execute range queries.
@@ -40,6 +43,7 @@ impl IPFastFieldRangeWeight {
impl Weight for IPFastFieldRangeWeight {
fn scorer(&self, reader: &SegmentReader, boost: Score) -> crate::Result<Box<dyn Scorer>> {
<<<<<<< HEAD
let field_type = reader
.schema()
.get_field_entry(reader.schema().get_field(&self.field)?)
@@ -74,6 +78,40 @@ impl Weight for IPFastFieldRangeWeight {
Ok(Box::new(ConstScorer::new(docset, boost)))
}
}
=======
todo!();
// let field_type = reader.schema().get_field_entry(self.field).field_type();
// match field_type.fastfield_cardinality().unwrap() {
// Cardinality::SingleValue => {
// let ip_addr_fast_field = reader.fast_fields().ip_addr(self.field)?;
// let value_range = bound_to_value_range(
// &self.left_bound,
// &self.right_bound,
// ip_addr_fast_field.min_value(),
// ip_addr_fast_field.max_value(),
// );
// let docset = RangeDocSet::new(
// value_range,
// FastFieldCardinality::SingleValue(ip_addr_fast_field),
// );
// Ok(Box::new(ConstScorer::new(docset, boost)))
// }
// Cardinality::MultiValues => {
// let ip_addr_fast_field = reader.fast_fields().ip_addrs(self.field)?;
// let value_range = bound_to_value_range(
// &self.left_bound,
// &self.right_bound,
// ip_addr_fast_field.min_value(),
// ip_addr_fast_field.max_value(),
// );
// let docset = RangeDocSet::new(
// value_range,
// FastFieldCardinality::MultiValue(ip_addr_fast_field),
// );
// Ok(Box::new(ConstScorer::new(docset, boost)))
// }
// }
>>>>>>> fd1deefd12 (Disconnected facet / fast field merges / examples)
}
fn explain(&self, reader: &SegmentReader, doc: DocId) -> crate::Result<Explanation> {
@@ -190,7 +228,7 @@ mod tests {
let ips_field = schema_builder.add_ip_addr_field(
"ips",
IpAddrOptions::default()
.set_fast(Cardinality::MultiValues)
.set_fast()
.set_indexed(),
);
let text_field = schema_builder.add_text_field("id", STRING | STORED);

View File

@@ -6,10 +6,14 @@ use std::ops::{Bound, RangeInclusive};
use fastfield_codecs::MonotonicallyMappableToU64;
use super::fast_field_range_query::{FastFieldCardinality, RangeDocSet};
use super::fast_field_range_query::RangeDocSet;
use super::range_query::map_bound;
use crate::query::{ConstScorer, Explanation, Scorer, Weight};
<<<<<<< HEAD
use crate::schema::Cardinality;
=======
use crate::schema::Field;
>>>>>>> fd1deefd12 (Disconnected facet / fast field merges / examples)
use crate::{DocId, DocSet, Score, SegmentReader, TantivyError};
/// `FastFieldRangeWeight` uses the fast field to execute range queries.
@@ -33,6 +37,7 @@ impl FastFieldRangeWeight {
impl Weight for FastFieldRangeWeight {
fn scorer(&self, reader: &SegmentReader, boost: Score) -> crate::Result<Box<dyn Scorer>> {
<<<<<<< HEAD
let field_type = reader
.schema()
.get_field_entry(reader.schema().get_field(&self.field)?)
@@ -63,6 +68,36 @@ impl Weight for FastFieldRangeWeight {
Ok(Box::new(ConstScorer::new(docset, boost)))
}
}
=======
todo!();
// let field_type = reader.schema().get_field_entry(self.field).field_type();
// match field_type.fastfield_cardinality().unwrap() {
// Cardinality::SingleValue => {
// let fast_field = reader.fast_fields().u64_lenient(self.field)?;
// let value_range = bound_to_value_range(
// &self.left_bound,
// &self.right_bound,
// fast_field.min_value(),
// fast_field.max_value(),
// );
// let docset =
// RangeDocSet::new(value_range, FastFieldCardinality::SingleValue(fast_field));
// Ok(Box::new(ConstScorer::new(docset, boost)))
// }
// Cardinality::MultiValues => {
// let fast_field = reader.fast_fields().u64s_lenient(self.field)?;
// let value_range = bound_to_value_range(
// &self.left_bound,
// &self.right_bound,
// fast_field.min_value(),
// fast_field.max_value(),
// );
// let docset =
// RangeDocSet::new(value_range, FastFieldCardinality::MultiValue(fast_field));
// Ok(Box::new(ConstScorer::new(docset, boost)))
// }
// }
>>>>>>> fd1deefd12 (Disconnected facet / fast field merges / examples)
}
fn explain(&self, reader: &SegmentReader, doc: DocId) -> crate::Result<Explanation> {
@@ -186,7 +221,7 @@ mod tests {
let ids_u64_field = schema_builder.add_u64_field(
"ids",
NumericOptions::default()
.set_fast(Cardinality::MultiValues)
.set_fast()
.set_indexed(),
);
@@ -194,7 +229,7 @@ mod tests {
let ids_f64_field = schema_builder.add_f64_field(
"ids_f64",
NumericOptions::default()
.set_fast(Cardinality::MultiValues)
.set_fast()
.set_indexed(),
);
@@ -202,7 +237,7 @@ mod tests {
let ids_i64_field = schema_builder.add_i64_field(
"ids_i64",
NumericOptions::default()
.set_fast(Cardinality::MultiValues)
.set_fast()
.set_indexed(),
);

View File

@@ -2,14 +2,16 @@ use std::ops::BitOr;
use serde::{Deserialize, Serialize};
use super::Cardinality;
use crate::schema::flags::{FastFlag, IndexedFlag, SchemaFlagList, StoredFlag};
/// DateTime Precision
#[derive(Clone, Copy, Debug, Hash, PartialEq, Eq, PartialOrd, Ord, Serialize, Deserialize)]
#[derive(
Clone, Copy, Debug, Hash, PartialEq, Eq, PartialOrd, Ord, Serialize, Deserialize, Default,
)]
#[serde(rename_all = "lowercase")]
pub enum DatePrecision {
/// Seconds precision
#[default]
Seconds,
/// Milliseconds precision.
Milliseconds,
@@ -17,20 +19,13 @@ pub enum DatePrecision {
Microseconds,
}
impl Default for DatePrecision {
fn default() -> Self {
DatePrecision::Seconds
}
}
/// Defines how DateTime field should be handled by tantivy.
#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize, Default)]
pub struct DateOptions {
indexed: bool,
// This boolean has no effect if the field is not marked as indexed.
fieldnorms: bool,
#[serde(skip_serializing_if = "Option::is_none")]
fast: Option<Cardinality>,
fast: bool,
stored: bool,
// Internal storage precision, used to optimize storage
// compression on fast fields.
@@ -54,18 +49,9 @@ impl DateOptions {
self.fieldnorms && self.indexed
}
/// Returns true iff the value is a fast field and multivalue.
pub fn is_multivalue_fast(&self) -> bool {
if let Some(cardinality) = self.fast {
cardinality == Cardinality::MultiValues
} else {
false
}
}
/// Returns true iff the value is a fast field.
pub fn is_fast(&self) -> bool {
self.fast.is_some()
self.fast
}
/// Set the field as stored.
@@ -107,19 +93,11 @@ impl DateOptions {
/// If more than one value is associated with a fast field, only the last one is
/// kept.
#[must_use]
pub fn set_fast(mut self, cardinality: Cardinality) -> DateOptions {
self.fast = Some(cardinality);
pub fn set_fast(mut self) -> DateOptions {
self.fast = true;
self
}
/// Returns the cardinality of the fastfield.
///
/// If the field has not been declared as a fastfield, then
/// the method returns `None`.
pub fn get_fastfield_cardinality(&self) -> Option<Cardinality> {
self.fast
}
/// Sets the precision for this DateTime field.
///
/// Internal storage precision, used to optimize storage
@@ -147,10 +125,7 @@ impl From<()> for DateOptions {
impl From<FastFlag> for DateOptions {
fn from(_: FastFlag) -> Self {
DateOptions {
indexed: false,
fieldnorms: false,
stored: false,
fast: Some(Cardinality::SingleValue),
fast: true,
..Default::default()
}
}
@@ -159,10 +134,7 @@ impl From<FastFlag> for DateOptions {
impl From<StoredFlag> for DateOptions {
fn from(_: StoredFlag) -> Self {
DateOptions {
indexed: false,
fieldnorms: false,
stored: true,
fast: None,
..Default::default()
}
}
@@ -173,8 +145,6 @@ impl From<IndexedFlag> for DateOptions {
DateOptions {
indexed: true,
fieldnorms: true,
stored: false,
fast: None,
..Default::default()
}
}
@@ -189,7 +159,7 @@ impl<T: Into<DateOptions>> BitOr<T> for DateOptions {
indexed: self.indexed | other.indexed,
fieldnorms: self.fieldnorms | other.fieldnorms,
stored: self.stored | other.stored,
fast: self.fast.or(other.fast),
fast: self.fast | other.fast,
precision: self.precision,
}
}

View File

@@ -8,7 +8,7 @@ use serde_json::Value as JsonValue;
use thiserror::Error;
use super::ip_options::IpAddrOptions;
use super::{Cardinality, IntoIpv6Addr};
use super::IntoIpv6Addr;
use crate::schema::bytes_options::BytesOptions;
use crate::schema::facet_options::FacetOptions;
use crate::schema::{
@@ -241,26 +241,6 @@ impl FieldType {
}
}
/// Returns the fast field cardinality of the field, or `None` if it has none.
pub fn fastfield_cardinality(&self) -> Option<Cardinality> {
match *self {
FieldType::Bytes(ref bytes_options) => {
bytes_options.is_fast().then_some(Cardinality::SingleValue)
}
FieldType::Str(ref text_options) => {
text_options.is_fast().then_some(Cardinality::MultiValues)
}
FieldType::U64(ref int_options)
| FieldType::I64(ref int_options)
| FieldType::F64(ref int_options)
| FieldType::Bool(ref int_options) => int_options.get_fastfield_cardinality(),
FieldType::Date(ref date_options) => date_options.get_fastfield_cardinality(),
FieldType::Facet(_) => Some(Cardinality::MultiValues),
FieldType::JsonObject(_) => None,
FieldType::IpAddr(ref ip_addr_options) => ip_addr_options.get_fastfield_cardinality(),
}
}
/// returns true if the field is normed (see [fieldnorms](crate::fieldnorm)).
pub fn has_fieldnorms(&self) -> bool {
match *self {

View File

@@ -4,7 +4,6 @@ use std::ops::BitOr;
use serde::{Deserialize, Serialize};
use super::flags::{FastFlag, IndexedFlag, SchemaFlagList, StoredFlag};
use super::Cardinality;
/// Trait to convert into an Ipv6Addr.
pub trait IntoIpv6Addr {
@@ -24,8 +23,7 @@ impl IntoIpv6Addr for IpAddr {
/// Define how an ip field should be handled by tantivy.
#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize, Default)]
pub struct IpAddrOptions {
#[serde(skip_serializing_if = "Option::is_none")]
fast: Option<Cardinality>,
fast: bool,
stored: bool,
indexed: bool,
fieldnorms: bool,
@@ -34,7 +32,7 @@ pub struct IpAddrOptions {
impl IpAddrOptions {
/// Returns true iff the value is a fast field.
pub fn is_fast(&self) -> bool {
self.fast.is_some()
self.fast
}
/// Returns `true` if the ip address should be stored in the doc store.
@@ -52,14 +50,6 @@ impl IpAddrOptions {
self.fieldnorms
}
/// Returns the cardinality of the fastfield.
///
/// If the field has not been declared as a fastfield, then
/// the method returns None.
pub fn get_fastfield_cardinality(&self) -> Option<Cardinality> {
self.fast
}
/// Set the field as normed.
///
/// Setting an integer as normed will generate
@@ -97,8 +87,8 @@ impl IpAddrOptions {
/// If more than one value is associated with a fast field, only the last one is
/// kept.
#[must_use]
pub fn set_fast(mut self, cardinality: Cardinality) -> Self {
self.fast = Some(cardinality);
pub fn set_fast(mut self) -> Self {
self.fast = true;
self
}
}
@@ -115,7 +105,7 @@ impl From<FastFlag> for IpAddrOptions {
fieldnorms: false,
indexed: false,
stored: false,
fast: Some(Cardinality::SingleValue),
fast: true,
}
}
}
@@ -126,7 +116,7 @@ impl From<StoredFlag> for IpAddrOptions {
fieldnorms: false,
indexed: false,
stored: true,
fast: None,
fast: false,
}
}
}
@@ -137,7 +127,7 @@ impl From<IndexedFlag> for IpAddrOptions {
fieldnorms: true,
indexed: true,
stored: false,
fast: None,
fast: false,
}
}
}
@@ -151,7 +141,7 @@ impl<T: Into<IpAddrOptions>> BitOr<T> for IpAddrOptions {
fieldnorms: self.fieldnorms | other.fieldnorms,
indexed: self.indexed | other.indexed,
stored: self.stored | other.stored,
fast: self.fast.or(other.fast),
fast: self.fast | other.fast,
}
}
}

View File

@@ -141,9 +141,9 @@ pub use self::index_record_option::IndexRecordOption;
pub use self::ip_options::{IntoIpv6Addr, IpAddrOptions};
pub use self::json_object_options::JsonObjectOptions;
pub use self::named_field_document::NamedFieldDocument;
pub use self::numeric_options::NumericOptions;
#[allow(deprecated)]
pub use self::numeric_options::{Cardinality, IntOptions};
pub use self::numeric_options::IntOptions;
pub use self::numeric_options::NumericOptions;
pub use self::schema::{DocParsingError, Schema, SchemaBuilder};
pub use self::term::Term;
pub use self::text_options::{TextFieldIndexing, TextOptions, STRING, TEXT};

View File

@@ -4,18 +4,6 @@ use serde::{Deserialize, Serialize};
use crate::schema::flags::{FastFlag, IndexedFlag, SchemaFlagList, StoredFlag};
/// Express whether a field is single-value or multi-valued.
#[derive(Clone, Copy, PartialEq, Eq, Debug, Serialize, Deserialize)]
pub enum Cardinality {
/// The document must have exactly one value associated with the document.
#[serde(rename = "single")]
SingleValue,
/// The document can have any number of values associated with the document.
/// This is more memory and CPU expensive than the `SingleValue` solution.
#[serde(rename = "multi")]
MultiValues,
}
#[deprecated(since = "0.17.0", note = "Use NumericOptions instead.")]
/// Deprecated use [`NumericOptions`] instead.
pub type IntOptions = NumericOptions;
@@ -27,8 +15,7 @@ pub struct NumericOptions {
indexed: bool,
// This boolean has no effect if the field is not marked as indexed too.
fieldnorms: bool, // This attribute only has an effect if indexed is true.
#[serde(skip_serializing_if = "Option::is_none")]
fast: Option<Cardinality>,
fast: bool,
stored: bool,
}
@@ -42,8 +29,7 @@ struct NumericOptionsDeser {
indexed: bool,
#[serde(default)]
fieldnorms: Option<bool>, // This attribute only has an effect if indexed is true.
#[serde(default)]
fast: Option<Cardinality>,
fast: bool,
stored: bool,
}
@@ -74,18 +60,9 @@ impl NumericOptions {
self.fieldnorms && self.indexed
}
/// Returns true iff the value is a fast field and multivalue.
pub fn is_multivalue_fast(&self) -> bool {
if let Some(cardinality) = self.fast {
cardinality == Cardinality::MultiValues
} else {
false
}
}
/// Returns true iff the value is a fast field.
pub fn is_fast(&self) -> bool {
self.fast.is_some()
self.fast
}
/// Set the field as stored.
@@ -127,18 +104,10 @@ impl NumericOptions {
/// If more than one value is associated with a fast field, only the last one is
/// kept.
#[must_use]
pub fn set_fast(mut self, cardinality: Cardinality) -> NumericOptions {
self.fast = Some(cardinality);
pub fn set_fast(mut self) -> NumericOptions {
self.fast = true;
self
}
/// Returns the cardinality of the fastfield.
///
/// If the field has not been declared as a fastfield, then
/// the method returns `None`.
pub fn get_fastfield_cardinality(&self) -> Option<Cardinality> {
self.fast
}
}
impl From<()> for NumericOptions {
@@ -153,7 +122,7 @@ impl From<FastFlag> for NumericOptions {
indexed: false,
fieldnorms: false,
stored: false,
fast: Some(Cardinality::SingleValue),
fast: true,
}
}
}
@@ -164,7 +133,7 @@ impl From<StoredFlag> for NumericOptions {
indexed: false,
fieldnorms: false,
stored: true,
fast: None,
fast: false,
}
}
}
@@ -175,7 +144,7 @@ impl From<IndexedFlag> for NumericOptions {
indexed: true,
fieldnorms: true,
stored: false,
fast: None,
fast: false,
}
}
}
@@ -189,7 +158,7 @@ impl<T: Into<NumericOptions>> BitOr<T> for NumericOptions {
indexed: self.indexed | other.indexed,
fieldnorms: self.fieldnorms | other.fieldnorms,
stored: self.stored | other.stored,
fast: self.fast.or(other.fast),
fast: self.fast | other.fast,
}
}
}
@@ -221,7 +190,7 @@ mod tests {
&NumericOptions {
indexed: true,
fieldnorms: true,
fast: None,
fast: false,
stored: false
}
);
@@ -239,7 +208,7 @@ mod tests {
&NumericOptions {
indexed: false,
fieldnorms: false,
fast: None,
fast: false,
stored: false
}
);
@@ -258,7 +227,7 @@ mod tests {
&NumericOptions {
indexed: true,
fieldnorms: false,
fast: None,
fast: false,
stored: false
}
);
@@ -278,7 +247,7 @@ mod tests {
&NumericOptions {
indexed: false,
fieldnorms: true,
fast: None,
fast: false,
stored: false
}
);

View File

@@ -484,7 +484,6 @@ mod tests {
use serde_json;
use crate::schema::field_type::ValueParsingError;
use crate::schema::numeric_options::Cardinality::SingleValue;
use crate::schema::schema::DocParsingError::InvalidJson;
use crate::schema::*;
@@ -506,19 +505,13 @@ mod tests {
#[test]
pub fn test_schema_serialization() {
let mut schema_builder = Schema::builder();
let count_options = NumericOptions::default()
.set_stored()
.set_fast(Cardinality::SingleValue);
let popularity_options = NumericOptions::default()
.set_stored()
.set_fast(Cardinality::SingleValue);
let count_options = NumericOptions::default().set_stored().set_fast();
let popularity_options = NumericOptions::default().set_stored().set_fast();
let score_options = NumericOptions::default()
.set_indexed()
.set_fieldnorm()
.set_fast(Cardinality::SingleValue);
let is_read_options = NumericOptions::default()
.set_stored()
.set_fast(Cardinality::SingleValue);
.set_fast();
let is_read_options = NumericOptions::default().set_stored().set_fast();
schema_builder.add_text_field("title", TEXT);
schema_builder.add_text_field(
"author",
@@ -643,12 +636,8 @@ mod tests {
#[test]
pub fn test_document_to_json() {
let mut schema_builder = Schema::builder();
let count_options = NumericOptions::default()
.set_stored()
.set_fast(Cardinality::SingleValue);
let is_read_options = NumericOptions::default()
.set_stored()
.set_fast(Cardinality::SingleValue);
let count_options = NumericOptions::default().set_stored().set_fast();
let is_read_options = NumericOptions::default().set_stored().set_fast();
schema_builder.add_text_field("title", TEXT);
schema_builder.add_text_field("author", STRING);
schema_builder.add_u64_field("count", count_options);
@@ -748,15 +737,9 @@ mod tests {
#[test]
pub fn test_parse_document() {
let mut schema_builder = Schema::builder();
let count_options = NumericOptions::default()
.set_stored()
.set_fast(Cardinality::SingleValue);
let popularity_options = NumericOptions::default()
.set_stored()
.set_fast(Cardinality::SingleValue);
let score_options = NumericOptions::default()
.set_indexed()
.set_fast(Cardinality::SingleValue);
let count_options = NumericOptions::default().set_stored().set_fast();
let popularity_options = NumericOptions::default().set_stored().set_fast();
let score_options = NumericOptions::default().set_indexed().set_fast();
let title_field = schema_builder.add_text_field("title", TEXT);
let author_field = schema_builder.add_text_field("author", STRING);
let count_field = schema_builder.add_u64_field("count", count_options);
@@ -907,7 +890,7 @@ mod tests {
.set_stored()
.set_indexed()
.set_fieldnorm()
.set_fast(SingleValue);
.set_fast();
schema_builder.add_text_field("_id", id_options);
schema_builder.add_date_field("_timestamp", timestamp_options);