Compare commits

..

2 Commits

Author | SHA1 | Message | Date
Paul Masurel | 1a72844048 | Added simple columnar CLI program | 2022-12-23 22:25:45 +09:00
Paul Masurel | d91df6cc7e | Added support for dynamic fast field. See README for more information. | 2022-12-23 22:24:40 +09:00
86 changed files with 1398 additions and 2758 deletions

.gitignore
View File

@@ -13,3 +13,5 @@ benchmark
.idea .idea
trace.dat trace.dat
cargo-timing* cargo-timing*
columnar/columnar-cli/*.json
**/perf.data*

View File

@@ -15,7 +15,7 @@ rust-version = "1.62"
[dependencies] [dependencies]
oneshot = "0.1.5" oneshot = "0.1.5"
base64 = "0.21.0" base64 = "0.20.0"
byteorder = "1.4.3" byteorder = "1.4.3"
crc32fast = "1.3.2" crc32fast = "1.3.2"
once_cell = "1.10.0" once_cell = "1.10.0"
@@ -48,7 +48,7 @@ murmurhash32 = "0.2.0"
time = { version = "0.3.10", features = ["serde-well-known"] } time = { version = "0.3.10", features = ["serde-well-known"] }
smallvec = "1.8.0" smallvec = "1.8.0"
rayon = "1.5.2" rayon = "1.5.2"
lru = "0.9.0" lru = "0.7.5"
fastdivide = "0.4.0" fastdivide = "0.4.0"
itertools = "0.10.3" itertools = "0.10.3"
measure_time = "0.8.2" measure_time = "0.8.2"
@@ -61,7 +61,6 @@ tantivy-query-grammar = { version= "0.19.0", path="./query-grammar" }
tantivy-bitpacker = { version= "0.3", path="./bitpacker" } tantivy-bitpacker = { version= "0.3", path="./bitpacker" }
common = { version= "0.5", path = "./common/", package = "tantivy-common" } common = { version= "0.5", path = "./common/", package = "tantivy-common" }
fastfield_codecs = { version= "0.3", path="./fastfield_codecs", default-features = false } fastfield_codecs = { version= "0.3", path="./fastfield_codecs", default-features = false }
tokenizer-api = { version="0.1", path="./tokenizer-api", package="tantivy-tokenizer-api" }
[target.'cfg(windows)'.dependencies] [target.'cfg(windows)'.dependencies]
winapi = "0.3.9" winapi = "0.3.9"
@@ -107,7 +106,7 @@ unstable = [] # useful for benches.
quickwit = ["sstable"] quickwit = ["sstable"]
[workspace] [workspace]
members = ["query-grammar", "bitpacker", "common", "fastfield_codecs", "ownedbytes", "stacker", "sstable", "columnar", "tokenizer-api"] members = ["query-grammar", "bitpacker", "common", "fastfield_codecs", "ownedbytes", "stacker", "sstable", "columnar"]
# Following the "fail" crate best practises, we isolate # Following the "fail" crate best practises, we isolate
# tests that define specific behavior in fail check points # tests that define specific behavior in fail check points

View File

@@ -29,7 +29,7 @@ Your mileage WILL vary depending on the nature of queries and their load.
# Features # Features
- Full-text search - Full-text search
- Configurable tokenizer (stemming available for 17 Latin languages) with third party support for Chinese ([tantivy-jieba](https://crates.io/crates/tantivy-jieba) and [cang-jie](https://crates.io/crates/cang-jie)), Japanese ([lindera](https://github.com/lindera-morphology/lindera-tantivy), [Vaporetto](https://crates.io/crates/vaporetto_tantivy), and [tantivy-tokenizer-tiny-segmenter](https://crates.io/crates/tantivy-tokenizer-tiny-segmenter)) and Korean ([lindera](https://github.com/lindera-morphology/lindera-tantivy) + [lindera-ko-dic-builder](https://github.com/lindera-morphology/lindera-ko-dic-builder)) - Configurable tokenizer (stemming available for 17 Latin languages with third party support for Chinese ([tantivy-jieba](https://crates.io/crates/tantivy-jieba) and [cang-jie](https://crates.io/crates/cang-jie)), Japanese ([lindera](https://github.com/lindera-morphology/lindera-tantivy), [Vaporetto](https://crates.io/crates/vaporetto_tantivy), and [tantivy-tokenizer-tiny-segmenter](https://crates.io/crates/tantivy-tokenizer-tiny-segmenter)) and Korean ([lindera](https://github.com/lindera-morphology/lindera-tantivy) + [lindera-ko-dic-builder](https://github.com/lindera-morphology/lindera-ko-dic-builder))
- Fast (check out the :racehorse: :sparkles: [benchmark](https://tantivy-search.github.io/bench/) :sparkles: :racehorse:) - Fast (check out the :racehorse: :sparkles: [benchmark](https://tantivy-search.github.io/bench/) :sparkles: :racehorse:)
- Tiny startup time (<10ms), perfect for command-line tools - Tiny startup time (<10ms), perfect for command-line tools
- BM25 scoring (the same as Lucene) - BM25 scoring (the same as Lucene)
@@ -42,12 +42,12 @@ Your mileage WILL vary depending on the nature of queries and their load.
- Single valued and multivalued u64, i64, and f64 fast fields (equivalent of doc values in Lucene) - Single valued and multivalued u64, i64, and f64 fast fields (equivalent of doc values in Lucene)
- `&[u8]` fast fields - `&[u8]` fast fields
- Text, i64, u64, f64, dates, and hierarchical facet fields - Text, i64, u64, f64, dates, and hierarchical facet fields
- Compressed document store (LZ4, Zstd, None, Brotli, Snap) - LZ4 compressed document store
- Range queries - Range queries
- Faceted search - Faceted search
- Configurable indexing (optional term frequency and position indexing) - Configurable indexing (optional term frequency and position indexing)
- JSON Field - JSON Field
- Aggregation Collector: histogram, range buckets, average, and stats metrics - Aggregation Collector: range buckets, average, and stats metrics
- LogMergePolicy with deletes - LogMergePolicy with deletes
- Searcher Warmer API - Searcher Warmer API
- Cheesy logo with a horse - Cheesy logo with a horse
@@ -81,10 +81,6 @@ There are many ways to support this project.
We use the GitHub Pull Request workflow: reference a GitHub ticket and/or include a comprehensive commit message when opening a PR. We use the GitHub Pull Request workflow: reference a GitHub ticket and/or include a comprehensive commit message when opening a PR.
## Tokenizer
When implementing a tokenizer for tantivy depend on the `tantivy-tokenizer-api` crate.
## Minimum supported Rust version ## Minimum supported Rust version
Tantivy currently requires at least Rust 1.62 or later to compile. Tantivy currently requires at least Rust 1.62 or later to compile.

View File

@@ -10,6 +10,7 @@ serde_json = "1"
thiserror = "1" thiserror = "1"
fnv = "1" fnv = "1"
sstable = { path = "../sstable", package = "tantivy-sstable" } sstable = { path = "../sstable", package = "tantivy-sstable" }
zstd = "0.12"
common = { path = "../common", package = "tantivy-common" } common = { path = "../common", package = "tantivy-common" }
fastfield_codecs = { path = "../fastfield_codecs"} fastfield_codecs = { path = "../fastfield_codecs"}
itertools = "0.10" itertools = "0.10"

View File

@@ -16,36 +16,42 @@ and different cardinality `(required, optional, multivalued)`.
# Coercion rules # Coercion rules
Users can create a columnar by inserting rows to a `ColumnarWriter`, Users can create a columnar by appending rows to a writer.
and serializing it into a `Write` object. Nothing prevents a user from recording values with different types to the same `column_key`.
Nothing prevents a user from recording values with different type to the same `column_name`.
In that case, `tantivy-columnar`'s behavior is as follows: In that case, `tantivy-columnar`'s behavior is as follows:
- JsonValues are grouped into 3 types (String, Number, bool). - Values that correspond to different JsonValue types are mapped to different columns. For instance, String values are treated independently from Number or boolean values. `tantivy-columnar` will simply emit several columns associated to a given column_name.
Values that correspond to different groups are mapped to different columns. For instance, String values are treated independently - Only one column for a given json value type is emitted. If number values with different number types are recorded (e.g. u64, i64, f64), `tantivy-columnar` will pick the first type that can represent the set of appended values, with the following priority order (`i64`, `u64`, `f64`). `i64` is picked over `u64` as it is likely to yield fewer type changes. Most use cases strictly requiring `u64` show the restriction on 50% of the values (e.g. a 64-bit hash). On the other hand, a lot of use cases can show rare negative values.
from Number or boolean values. `tantivy-columnar` will simply emit several columns associated to a given column_name.
- Only one column for a given json value type is emitted. If number values with different number types are recorded (e.g. u64, i64, f64),
`tantivy-columnar` will pick the first type that can represent the set of appended values, with the following priority order (`i64`, `u64`, `f64`).
`i64` is picked over `u64` as it is likely to yield fewer type changes. Most use cases strictly requiring `u64` show the
restriction on 50% of the values (e.g. a 64-bit hash). On the other hand, a lot of use cases can show rare negative values.
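The priority rule above can be illustrated with a small sketch. The enum names mirror the crate's `NumericalValue`/`NumericalType`, but `coerce_column_type` is purely illustrative; the actual writer tracks compatibility incrementally per column rather than rescanning values:

```rust
#[derive(Clone, Copy)]
enum NumericalValue { I64(i64), U64(u64), F64(f64) }

#[derive(Clone, Copy, Debug, PartialEq)]
enum NumericalType { I64, U64, F64 }

/// Picks the first type, in priority order (i64, u64, f64), that can
/// represent every value recorded for a column.
fn coerce_column_type(values: &[NumericalValue]) -> NumericalType {
    let fits_i64 = values.iter().all(|v| match *v {
        NumericalValue::I64(_) => true,
        NumericalValue::U64(val) => i64::try_from(val).is_ok(),
        NumericalValue::F64(_) => false,
    });
    if fits_i64 {
        return NumericalType::I64;
    }
    let fits_u64 = values.iter().all(|v| match *v {
        NumericalValue::I64(val) => u64::try_from(val).is_ok(),
        NumericalValue::U64(_) => true,
        NumericalValue::F64(_) => false,
    });
    if fits_u64 { NumericalType::U64 } else { NumericalType::F64 }
}

fn main() {
    // A rare negative value coerces the whole column to i64 when everything fits.
    let values = [NumericalValue::U64(12), NumericalValue::I64(-3)];
    assert_eq!(coerce_column_type(&values), NumericalType::I64);
}
```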
# Columnar format # Columnar format
This columnar format may have more than one column (with different types) associated to the same `column_name` (see [Coercion rules](#coercion-rules) above). Because this columnar format tries to avoid some coercion.
The `(column_name, column_type)` pair however uniquely identifies a column. There can be several columns (with different types) associated to a single `column_name`.
That couple is serialized as a column `column_key`. The format of that key is:
Each column is associated to `column_key`.
The format of that key is:
`[column_name][ZERO_BYTE][column_type_header: u8]` `[column_name][ZERO_BYTE][column_type_header: u8]`
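A minimal sketch of assembling such a key (the helper name `column_key` is hypothetical; the actual serializer may build it differently):

```rust
/// Builds a column key: the column name, a zero-byte separator,
/// then the one-byte column type header.
fn column_key(column_name: &str, column_type_header: u8) -> Vec<u8> {
    // Column names may not contain the zero byte, since it is used as the separator.
    assert!(!column_name.as_bytes().contains(&0u8));
    let mut key = Vec::with_capacity(column_name.len() + 2);
    key.extend_from_slice(column_name.as_bytes());
    key.push(0u8);
    key.push(column_type_header);
    key
}
```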
``` ```
COLUMNAR:= COLUMNAR:=
[COLUMNAR_DATA] [COLUMNAR_DATA]
[COLUMNAR_KEY_TO_DATA_INDEX] [COLUMNAR_INDEX]
[COLUMNAR_FOOTER]; [COLUMNAR_FOOTER];
# Columns are sorted by their column key. # Columns are sorted by their column key.
COLUMNAR_DATA:= COLUMNAR_DATA:=
[COLUMN_DATA]+; [COLUMN]+;
COLUMN:=
COMPRESSED_COLUMN | NON_COMPRESSED_COLUMN;
# COLUMN_DATA is compressed when it exceeds a threshold of 100KB.
COMPRESSED_COLUMN := [b'1'][zstd(COLUMN_DATA)]
NON_COMPRESSED_COLUMN:= [b'0'][COLUMN_DATA]
COLUMNAR_INDEX := [RANGE_SSTABLE_BYTES]
COLUMNAR_FOOTER := [RANGE_SSTABLE_BYTES_LEN: 8 bytes little endian] COLUMNAR_FOOTER := [RANGE_SSTABLE_BYTES_LEN: 8 bytes little endian]
@@ -54,10 +60,10 @@ COLUMNAR_FOOTER := [RANGE_SSTABLE_BYTES_LEN: 8 bytes little endian]
The columnar file starts by the actual column data, concatenated one after the other, The columnar file starts by the actual column data, concatenated one after the other,
sorted by column key. sorted by column key.
A sstable associates A quickwit/tantivy style sstable associates
`(column name, column_cardinality, column_type) to range of bytes. `(column names, column_cardinality, column_type) to range of bytes.
Column name may not contain the zero byte `\0`. Column name may not contain the zero byte.
Listing all columns associated to `column_name` can therefore Listing all columns associated to `column_name` can therefore
be done by listing all keys prefixed by be done by listing all keys prefixed by
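The prefix listing described here boils down to an sstable range scan, since the zero byte sorts below every byte allowed in a column name. A sketch of computing that key range, mirroring the `read_columns` logic further down in this diff (the helper name is hypothetical):

```rust
/// All keys for `column_name` lie in the half-open range
/// [column_name ++ 0x00, column_name ++ 0x01).
fn column_key_range(column_name: &str) -> (Vec<u8>, Vec<u8>) {
    let mut start_key = column_name.as_bytes().to_vec();
    start_key.push(0u8);
    let mut end_key = column_name.as_bytes().to_vec();
    end_key.push(1u8);
    (start_key, end_key)
}
```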

View File

@@ -0,0 +1,17 @@
[package]
name = "tantivy-columnar-cli"
version = "0.1.0"
edition = "2021"
license = "MIT"
[dependencies]
columnar = {path="../", package="tantivy-columnar"}
serde_json = "1"
serde_json_borrow = {git="https://github.com/PSeitz/serde_json_borrow/"}
serde = "1"
[workspace]
members = []
[profile.release]
debug = true

View File

@@ -0,0 +1,126 @@
use columnar::ColumnarWriter;
use columnar::NumericalValue;
use serde_json_borrow;
use std::fs::File;
use std::io;
use std::io::BufRead;
use std::io::BufReader;
use std::time::Instant;
#[derive(Default)]
struct JsonStack {
path: String,
stack: Vec<usize>,
}
impl JsonStack {
fn push(&mut self, seg: &str) {
let len = self.path.len();
self.stack.push(len);
self.path.push('.');
self.path.push_str(seg);
}
fn pop(&mut self) {
if let Some(len) = self.stack.pop() {
self.path.truncate(len);
}
}
fn path(&self) -> &str {
&self.path[1..]
}
}
fn append_json_to_columnar(
doc: u32,
json_value: &serde_json_borrow::Value,
columnar: &mut ColumnarWriter,
stack: &mut JsonStack,
) -> usize {
let mut count = 0;
match json_value {
serde_json_borrow::Value::Null => {}
serde_json_borrow::Value::Bool(val) => {
columnar.record_numerical(
doc,
stack.path(),
NumericalValue::from(if *val { 1u64 } else { 0u64 }),
);
count += 1;
}
serde_json_borrow::Value::Number(num) => {
let numerical_value: NumericalValue = if let Some(num_i64) = num.as_i64() {
num_i64.into()
} else if let Some(num_u64) = num.as_u64() {
num_u64.into()
} else if let Some(num_f64) = num.as_f64() {
num_f64.into()
} else {
panic!();
};
count += 1;
columnar.record_numerical(
doc,
stack.path(),
numerical_value,
);
}
serde_json_borrow::Value::Str(msg) => {
columnar.record_str(
doc,
stack.path(),
msg.as_bytes(),
);
count += 1;
},
serde_json_borrow::Value::Array(vals) => {
for val in vals {
count += append_json_to_columnar(doc, val, columnar, stack);
}
},
serde_json_borrow::Value::Object(json_map) => {
for (child_key, child_val) in json_map {
stack.push(child_key);
count += append_json_to_columnar(doc, child_val, columnar, stack);
stack.pop();
}
},
}
count
}
fn main() -> io::Result<()> {
let file = File::open("gh_small.json")?;
let mut reader = BufReader::new(file);
let mut line = String::with_capacity(100);
let mut columnar = columnar::ColumnarWriter::default();
let mut doc = 0;
let start = Instant::now();
let mut stack = JsonStack::default();
let mut total_count = 0;
loop {
line.clear();
let len = reader.read_line(&mut line)?;
if len == 0 {
break;
}
let Ok(json_value) = serde_json::from_str::<serde_json_borrow::Value>(&line) else { continue; };
total_count += append_json_to_columnar(doc, &json_value, &mut columnar, &mut stack);
doc += 1;
}
println!("value count {total_count}");
println!("record {:?}", start.elapsed());
let mut buffer = Vec::new();
columnar.serialize(doc, &mut buffer)?;
println!("num docs: {doc}, {:?}", start.elapsed());
println!("buffer len {} MB", buffer.len() / 1_000_000);
let columnar = columnar::ColumnarReader::open(buffer)?;
for (column_name, typ, offsets, num_bytes) in columnar.list_columns()? {
if num_bytes>1_000_000 {
println!("{column_name} {typ:?} {offsets:?} {}", num_bytes / 1_000_000);
}
}
println!("{} columns", columnar.num_columns());
Ok(())
}
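As a usage note for the CLI above: `JsonStack` flattens nested JSON objects into dotted column paths, so a value nested under `{"repo": {"name": ...}}` is recorded under the column `repo.name`. A small illustrative check (not part of the program):

```rust
let mut stack = JsonStack::default();
stack.push("repo");
stack.push("name");
assert_eq!(stack.path(), "repo.name");
stack.pop();
assert_eq!(stack.path(), "repo");
```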

View File

@@ -1,16 +1,12 @@
use crate::utils::{place_bits, select_bits}; use crate::utils::{place_bits, select_bits};
use crate::value::NumericalType; use crate::value::NumericalType;
use crate::InvalidData;
/// Enum describing the number of values that can exist per document /// Enum describing the number of values that can exist per document
/// (or per row if you will). /// (or per row if you will).
///
/// The cardinality must fit on 2 bits.
#[derive(Clone, Copy, Hash, Default, Debug, PartialEq, Eq, PartialOrd, Ord)] #[derive(Clone, Copy, Hash, Default, Debug, PartialEq, Eq, PartialOrd, Ord)]
#[repr(u8)] #[repr(u8)]
pub enum Cardinality { pub enum Cardinality {
/// All documents contain exactly one value. /// All documents contain exactly one value.
/// Required is the default for auto-detecting the Cardinality, since it is the most strict.
#[default] #[default]
Required = 0, Required = 0,
/// All documents contain at most one value. /// All documents contain at most one value.
@@ -24,20 +20,16 @@ impl Cardinality {
self as u8 self as u8
} }
pub(crate) fn try_from_code(code: u8) -> Result<Cardinality, InvalidData> { pub(crate) fn try_from_code(code: u8) -> Option<Cardinality> {
match code { match code {
0 => Ok(Cardinality::Required), 0 => Some(Cardinality::Required),
1 => Ok(Cardinality::Optional), 1 => Some(Cardinality::Optional),
2 => Ok(Cardinality::Multivalued), 2 => Some(Cardinality::Multivalued),
_ => Err(InvalidData), _ => None,
} }
} }
} }
/// The column type represents the column type and can fit on 6-bits.
///
/// - bits[0..3]: Column category type.
/// - bits[3..6]: Numerical type if necessary.
#[derive(Hash, Eq, PartialEq, Debug, Clone, Copy)] #[derive(Hash, Eq, PartialEq, Debug, Clone, Copy)]
pub enum ColumnType { pub enum ColumnType {
Bytes, Bytes,
@@ -48,79 +40,73 @@ pub enum ColumnType {
impl ColumnType { impl ColumnType {
/// Encoded over 6 bits. /// Encoded over 6 bits.
pub(crate) fn to_code(self) -> u8 { pub(crate) fn to_code(self) -> u8 {
let column_type_category; let high_type;
let numerical_type_code: u8; let low_code: u8;
match self { match self {
ColumnType::Bytes => { ColumnType::Bytes => {
column_type_category = ColumnTypeCategory::Str; high_type = GeneralType::Str;
numerical_type_code = 0u8; low_code = 0u8;
} }
ColumnType::Numerical(numerical_type) => { ColumnType::Numerical(numerical_type) => {
column_type_category = ColumnTypeCategory::Numerical; high_type = GeneralType::Numerical;
numerical_type_code = numerical_type.to_code(); low_code = numerical_type.to_code();
} }
ColumnType::Bool => { ColumnType::Bool => {
column_type_category = ColumnTypeCategory::Bool; high_type = GeneralType::Bool;
numerical_type_code = 0u8; low_code = 0u8;
} }
} }
place_bits::<0, 3>(column_type_category.to_code()) | place_bits::<3, 6>(numerical_type_code) place_bits::<3, 6>(high_type.to_code()) | place_bits::<0, 3>(low_code)
} }
pub(crate) fn try_from_code(code: u8) -> Result<ColumnType, InvalidData> { pub(crate) fn try_from_code(code: u8) -> Option<ColumnType> {
if select_bits::<6, 8>(code) != 0u8 { if select_bits::<6, 8>(code) != 0u8 {
return Err(InvalidData); return None;
} }
let column_type_category_code = select_bits::<0, 3>(code); let high_code = select_bits::<3, 6>(code);
let numerical_type_code = select_bits::<3, 6>(code); let low_code = select_bits::<0, 3>(code);
let column_type_category = ColumnTypeCategory::try_from_code(column_type_category_code)?; let high_type = GeneralType::try_from_code(high_code)?;
match column_type_category { match high_type {
ColumnTypeCategory::Bool => { GeneralType::Bool => {
if numerical_type_code != 0u8 { if low_code != 0u8 {
return Err(InvalidData); return None;
} }
Ok(ColumnType::Bool) Some(ColumnType::Bool)
} }
ColumnTypeCategory::Str => { GeneralType::Str => {
if numerical_type_code != 0u8 { if low_code != 0u8 {
return Err(InvalidData); return None;
} }
Ok(ColumnType::Bytes) Some(ColumnType::Bytes)
} }
ColumnTypeCategory::Numerical => { GeneralType::Numerical => {
let numerical_type = NumericalType::try_from_code(numerical_type_code)?; let numerical_type = NumericalType::try_from_code(low_code)?;
Ok(ColumnType::Numerical(numerical_type)) Some(ColumnType::Numerical(numerical_type))
} }
} }
} }
} }
/// Column types are grouped into different categories that /// This corresponds to the JsonType.
/// corresponds to the different types of `JsonValue` types.
///
/// The columnar writer will apply coercion rules to make sure that
/// at most one column exist per `ColumnTypeCategory`.
///
/// See also [README.md].
#[derive(Copy, Clone, Ord, PartialOrd, Eq, PartialEq, Debug)] #[derive(Copy, Clone, Ord, PartialOrd, Eq, PartialEq, Debug)]
#[repr(u8)] #[repr(u8)]
pub(crate) enum ColumnTypeCategory { pub(crate) enum GeneralType {
Bool = 0u8, Bool = 0u8,
Str = 1u8, Str = 1u8,
Numerical = 2u8, Numerical = 2u8,
} }
impl ColumnTypeCategory { impl GeneralType {
pub fn to_code(self) -> u8 { pub fn to_code(self) -> u8 {
self as u8 self as u8
} }
pub fn try_from_code(code: u8) -> Result<Self, InvalidData> { pub fn try_from_code(code: u8) -> Option<Self> {
match code { match code {
0u8 => Ok(Self::Bool), 0u8 => Some(Self::Bool),
1u8 => Ok(Self::Str), 1u8 => Some(Self::Str),
2u8 => Ok(Self::Numerical), 2u8 => Some(Self::Numerical),
_ => Err(InvalidData), _ => None,
} }
} }
} }
@@ -129,26 +115,26 @@ impl ColumnTypeCategory {
/// This is encoded over one-byte and added to a column key in the /// This is encoded over one-byte and added to a column key in the
/// columnar sstable. /// columnar sstable.
/// ///
/// - [0..6] bits: encodes the column type /// Cardinality is encoded as the first two highest two bits.
/// - [6..8] bits: encodes the cardinality /// The low 6 bits encode the column type.
#[derive(Eq, Hash, PartialEq, Debug, Copy, Clone)] #[derive(Eq, Hash, PartialEq, Debug, Copy, Clone)]
pub struct ColumnTypeAndCardinality { pub struct ColumnTypeAndCardinality {
pub typ: ColumnType,
pub cardinality: Cardinality, pub cardinality: Cardinality,
pub typ: ColumnType,
} }
impl ColumnTypeAndCardinality { impl ColumnTypeAndCardinality {
pub fn to_code(self) -> u8 { pub fn to_code(self) -> u8 {
place_bits::<0, 6>(self.typ.to_code()) | place_bits::<6, 8>(self.cardinality.to_code()) place_bits::<6, 8>(self.cardinality.to_code()) | place_bits::<0, 6>(self.typ.to_code())
} }
pub fn try_from_code(code: u8) -> Result<ColumnTypeAndCardinality, InvalidData> { pub fn try_from_code(code: u8) -> Option<ColumnTypeAndCardinality> {
let typ_code = select_bits::<0, 6>(code); let typ_code = select_bits::<0, 6>(code);
let cardinality_code = select_bits::<6, 8>(code); let cardinality_code = select_bits::<6, 8>(code);
let cardinality = Cardinality::try_from_code(cardinality_code)?; let cardinality = Cardinality::try_from_code(cardinality_code)?;
let typ = ColumnType::try_from_code(typ_code)?; let typ = ColumnType::try_from_code(typ_code)?;
assert_eq!(typ.to_code(), typ_code); assert_eq!(typ.to_code(), typ_code);
Ok(ColumnTypeAndCardinality { cardinality, typ }) Some(ColumnTypeAndCardinality { cardinality, typ })
} }
} }
@@ -163,7 +149,7 @@ mod tests {
fn test_column_type_header_to_code() { fn test_column_type_header_to_code() {
let mut column_type_header_set: HashSet<ColumnTypeAndCardinality> = HashSet::new(); let mut column_type_header_set: HashSet<ColumnTypeAndCardinality> = HashSet::new();
for code in u8::MIN..=u8::MAX { for code in u8::MIN..=u8::MAX {
if let Ok(column_type_header) = ColumnTypeAndCardinality::try_from_code(code) { if let Some(column_type_header) = ColumnTypeAndCardinality::try_from_code(code) {
assert_eq!(column_type_header.to_code(), code); assert_eq!(column_type_header.to_code(), code);
assert!(column_type_header_set.insert(column_type_header)); assert!(column_type_header_set.insert(column_type_header));
} }
@@ -179,7 +165,7 @@ mod tests {
fn test_column_type_to_code() { fn test_column_type_to_code() {
let mut column_type_set: HashSet<ColumnType> = HashSet::new(); let mut column_type_set: HashSet<ColumnType> = HashSet::new();
for code in u8::MIN..=u8::MAX { for code in u8::MIN..=u8::MAX {
if let Ok(column_type) = ColumnType::try_from_code(code) { if let Some(column_type) = ColumnType::try_from_code(code) {
assert_eq!(column_type.to_code(), code); assert_eq!(column_type.to_code(), code);
assert!(column_type_set.insert(column_type)); assert!(column_type_set.insert(column_type));
} }
@@ -191,7 +177,8 @@ mod tests {
fn test_cardinality_to_code() { fn test_cardinality_to_code() {
let mut num_cardinality = 0; let mut num_cardinality = 0;
for code in u8::MIN..=u8::MAX { for code in u8::MIN..=u8::MAX {
if let Ok(cardinality) = Cardinality::try_from_code(code) { let cardinality_opt = Cardinality::try_from_code(code);
if let Some(cardinality) = cardinality_opt {
assert_eq!(cardinality.to_code(), code); assert_eq!(cardinality.to_code(), code);
num_cardinality += 1; num_cardinality += 1;
} }
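The header byte manipulation above relies on `place_bits`/`select_bits` from `crate::utils`, which are not shown in this diff. A plausible sketch of what they do, stated as an assumption (the real helpers may differ):

```rust
const fn mask<const START: u8, const END: u8>() -> u8 {
    (((1u16 << (END - START)) - 1) as u8) << START
}

/// Places `value` into bits [START..END) of a byte.
fn place_bits<const START: u8, const END: u8>(value: u8) -> u8 {
    (value << START) & mask::<START, END>()
}

/// Extracts bits [START..END) of `code`.
fn select_bits<const START: u8, const END: u8>(code: u8) -> u8 {
    (code & mask::<START, END>()) >> START
}

fn main() {
    // Cardinality in the two high bits, column type in the low six bits.
    let code = place_bits::<6, 8>(0b01) | place_bits::<0, 6>(0b000_010);
    assert_eq!(select_bits::<6, 8>(code), 0b01);
    assert_eq!(select_bits::<0, 6>(code), 0b000_010);
}
```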

View File

@@ -3,11 +3,11 @@ use std::io;
use fnv::FnvHashMap; use fnv::FnvHashMap;
use sstable::SSTable; use sstable::SSTable;
pub(crate) struct TermIdMapping { pub(crate) struct IdMapping {
unordered_to_ord: Vec<OrderedId>, unordered_to_ord: Vec<OrderedId>,
} }
impl TermIdMapping { impl IdMapping {
pub fn to_ord(&self, unordered: UnorderedId) -> OrderedId { pub fn to_ord(&self, unordered: UnorderedId) -> OrderedId {
self.unordered_to_ord[unordered.0 as usize] self.unordered_to_ord[unordered.0 as usize]
} }
@@ -48,7 +48,7 @@ impl DictionaryBuilder {
/// Serialize the dictionary into an fst, and returns the /// Serialize the dictionary into an fst, and returns the
/// `UnorderedId -> TermOrdinal` map. /// `UnorderedId -> TermOrdinal` map.
pub fn serialize<'a, W: io::Write + 'a>(&self, wrt: &mut W) -> io::Result<TermIdMapping> { pub fn serialize<'a, W: io::Write + 'a>(&self, wrt: &mut W) -> io::Result<IdMapping> {
let mut terms: Vec<(&[u8], UnorderedId)> = let mut terms: Vec<(&[u8], UnorderedId)> =
self.dict.iter().map(|(k, v)| (k.as_slice(), *v)).collect(); self.dict.iter().map(|(k, v)| (k.as_slice(), *v)).collect();
terms.sort_unstable_by_key(|(key, _)| *key); terms.sort_unstable_by_key(|(key, _)| *key);
@@ -61,7 +61,7 @@ impl DictionaryBuilder {
unordered_to_ord[unordered_id.0 as usize] = ordered_id; unordered_to_ord[unordered_id.0 as usize] = ordered_id;
} }
sstable_builder.finish()?; sstable_builder.finish()?;
Ok(TermIdMapping { unordered_to_ord }) Ok(IdMapping { unordered_to_ord })
} }
} }
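The serialize step above renumbers term ids: ids are handed out in insertion order while recording, then remapped to the ordering of the sorted term bytes. A standalone sketch of that remapping, without the sstable writing:

```rust
use std::collections::HashMap;

/// Maps "unordered" ids (insertion order) to "ordered" ids (sorted term order).
fn build_id_mapping(dict: &HashMap<Vec<u8>, u32>) -> Vec<u32> {
    let mut terms: Vec<(&[u8], u32)> = dict
        .iter()
        .map(|(term, &unordered_id)| (term.as_slice(), unordered_id))
        .collect();
    terms.sort_unstable_by_key(|(term, _)| *term);
    let mut unordered_to_ord = vec![0u32; terms.len()];
    for (ordered_id, (_term, unordered_id)) in terms.into_iter().enumerate() {
        unordered_to_ord[unordered_id as usize] = ordered_id as u32;
    }
    unordered_to_ord
}
```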

View File

@@ -12,9 +12,6 @@ pub use writer::ColumnarWriter;
pub type DocId = u32; pub type DocId = u32;
#[derive(Copy, Clone, Debug)]
pub struct InvalidData;
#[cfg(test)] #[cfg(test)]
mod tests { mod tests {
use std::ops::Range; use std::ops::Range;
@@ -29,8 +26,8 @@ mod tests {
#[test] #[test]
fn test_dataframe_writer_bytes() { fn test_dataframe_writer_bytes() {
let mut dataframe_writer = ColumnarWriter::default(); let mut dataframe_writer = ColumnarWriter::default();
dataframe_writer.record_str(1u32, "my_string", "hello"); dataframe_writer.record_str(1u32, "my_string", b"hello");
dataframe_writer.record_str(3u32, "my_string", "helloeee"); dataframe_writer.record_str(3u32, "my_string", b"helloeee");
let mut buffer: Vec<u8> = Vec::new(); let mut buffer: Vec<u8> = Vec::new();
dataframe_writer.serialize(5, &mut buffer).unwrap(); dataframe_writer.serialize(5, &mut buffer).unwrap();
let columnar_fileslice = FileSlice::from(buffer); let columnar_fileslice = FileSlice::from(buffer);
@@ -39,7 +36,7 @@ mod tests {
let cols: Vec<(ColumnTypeAndCardinality, Range<u64>)> = let cols: Vec<(ColumnTypeAndCardinality, Range<u64>)> =
columnar.read_columns("my_string").unwrap(); columnar.read_columns("my_string").unwrap();
assert_eq!(cols.len(), 1); assert_eq!(cols.len(), 1);
assert_eq!(cols[0].1, 0..158); assert_eq!(cols[0].1, 0..159);
} }
#[test] #[test]
@@ -61,7 +58,7 @@ mod tests {
typ: ColumnType::Bool typ: ColumnType::Bool
} }
); );
assert_eq!(cols[0].1, 0..21); assert_eq!(cols[0].1, 0..22);
} }
#[test] #[test]
@@ -84,6 +81,6 @@ mod tests {
// - vals 8 //< due to padding? could have been 1byte?. // - vals 8 //< due to padding? could have been 1byte?.
// - null footer 6 bytes // - null footer 6 bytes
// - version footer 3 bytes // Should be file-wide // - version footer 3 bytes // Should be file-wide
assert_eq!(cols[0].1, 0..31); assert_eq!(cols[0].1, 0..32);
} }
} }

View File

@@ -8,7 +8,8 @@ use sstable::{Dictionary, RangeSSTable};
use crate::column_type_header::ColumnTypeAndCardinality; use crate::column_type_header::ColumnTypeAndCardinality;
fn io_invalid_data(msg: String) -> io::Error { fn io_invalid_data(msg: String) -> io::Error {
io::Error::new(io::ErrorKind::InvalidData, msg) io::Error::new(io::ErrorKind::InvalidData, msg) // format!("Invalid key found.
// {key_bytes:?}")));
} }
/// The ColumnarReader makes it possible to access a set of columns /// The ColumnarReader makes it possible to access a set of columns
@@ -49,7 +50,7 @@ impl ColumnarReader {
let key_bytes: &[u8] = stream.key(); let key_bytes: &[u8] = stream.key();
let column_code: u8 = key_bytes.last().cloned().unwrap(); let column_code: u8 = key_bytes.last().cloned().unwrap();
let column_type_and_cardinality = ColumnTypeAndCardinality::try_from_code(column_code) let column_type_and_cardinality = ColumnTypeAndCardinality::try_from_code(column_code)
.map_err(|_| io_invalid_data(format!("Unknown column code `{column_code}`")))?; .ok_or_else(|| io_invalid_data(format!("Unknown column code `{column_code}`")))?;
let range = stream.value().clone(); let range = stream.value().clone();
let column_name = String::from_utf8_lossy(&key_bytes[..key_bytes.len() - 1]); let column_name = String::from_utf8_lossy(&key_bytes[..key_bytes.len() - 1]);
let range_len = range.end - range.start; let range_len = range.end - range.start;
@@ -63,26 +64,15 @@ impl ColumnarReader {
Ok(results) Ok(results)
} }
/// Get all columns for the given column name. /// Get all columns for the given field_name.
///
/// There can be more than one column associated to a given column name, provided they have
/// different types.
// TODO fix ugly API // TODO fix ugly API
pub fn read_columns( pub fn read_columns(
&self, &self,
column_name: &str, field_name: &str,
) -> io::Result<Vec<(ColumnTypeAndCardinality, Range<u64>)>> { ) -> io::Result<Vec<(ColumnTypeAndCardinality, Range<u64>)>> {
// Each column is a associated to a given `column_key`, let mut start_key = field_name.to_string();
// that starts by `column_name\0column_header`.
//
// Listing the columns associated to the given column name is therefore equivalent to
// listing `column_key` with the prefix `column_name\0`.
//
// This is in turn equivalent to searching for the range
// `[column_name,\0`..column_name\1)`.
let mut start_key = column_name.to_string();
start_key.push('\0'); start_key.push('\0');
let mut end_key = column_name.to_string(); let mut end_key = field_name.to_string();
end_key.push(1u8 as char); end_key.push(1u8 as char);
let mut stream = self let mut stream = self
.column_dictionary .column_dictionary
@@ -93,10 +83,12 @@ impl ColumnarReader {
let mut results = Vec::new(); let mut results = Vec::new();
while stream.advance() { while stream.advance() {
let key_bytes: &[u8] = stream.key(); let key_bytes: &[u8] = stream.key();
assert!(key_bytes.starts_with(start_key.as_bytes())); if !key_bytes.starts_with(start_key.as_bytes()) {
return Err(io_invalid_data(format!("Invalid key found. {key_bytes:?}")));
}
let column_code: u8 = key_bytes.last().cloned().unwrap(); let column_code: u8 = key_bytes.last().cloned().unwrap();
let column_type_and_cardinality = ColumnTypeAndCardinality::try_from_code(column_code) let column_type_and_cardinality = ColumnTypeAndCardinality::try_from_code(column_code)
.map_err(|_| io_invalid_data(format!("Unknown column code `{column_code}`")))?; .ok_or_else(|| io_invalid_data(format!("Unknown column code `{column_code}`")))?;
let range = stream.value().clone(); let range = stream.value().clone();
results.push((column_type_and_cardinality, range)); results.push((column_type_and_cardinality, range));
} }

View File

@@ -1,5 +1,3 @@
use crate::InvalidData;
#[derive(Copy, Clone, Debug, PartialEq)] #[derive(Copy, Clone, Debug, PartialEq)]
pub enum NumericalValue { pub enum NumericalValue {
I64(i64), I64(i64),
@@ -51,12 +49,12 @@ impl NumericalType {
self as u8 self as u8
} }
pub fn try_from_code(code: u8) -> Result<NumericalType, InvalidData> { pub fn try_from_code(code: u8) -> Option<NumericalType> {
match code { match code {
0 => Ok(NumericalType::I64), 0 => Some(NumericalType::I64),
1 => Ok(NumericalType::U64), 1 => Some(NumericalType::U64),
2 => Ok(NumericalType::F64), 2 => Some(NumericalType::F64),
_ => Err(InvalidData), _ => None,
} }
} }
} }
@@ -64,7 +62,6 @@ impl NumericalType {
/// We voluntarily avoid using `Into` here to keep this /// We voluntarily avoid using `Into` here to keep this
/// implementation quirk as private as possible. /// implementation quirk as private as possible.
/// ///
/// # Panics
/// This coercion trait actually panics if it is used /// This coercion trait actually panics if it is used
/// to convert a loose types to a stricter type. /// to convert a loose types to a stricter type.
/// ///
@@ -114,7 +111,7 @@ mod tests {
fn test_numerical_type_code() { fn test_numerical_type_code() {
let mut num_numerical_type = 0; let mut num_numerical_type = 0;
for code in u8::MIN..=u8::MAX { for code in u8::MIN..=u8::MAX {
if let Ok(numerical_type) = NumericalType::try_from_code(code) { if let Some(numerical_type) = NumericalType::try_from_code(code) {
assert_eq!(numerical_type.to_code(), code); assert_eq!(numerical_type.to_code(), code);
num_numerical_type += 1; num_numerical_type += 1;
} }

View File

@@ -1,110 +1,84 @@
use crate::dictionary::UnorderedId; use crate::dictionary::UnorderedId;
use crate::utils::{place_bits, pop_first_byte, select_bits}; use crate::utils::{place_bits, pop_first_byte, select_bits};
use crate::value::NumericalValue; use crate::value::NumericalValue;
use crate::{DocId, InvalidData, NumericalType}; use crate::{DocId, NumericalType};
/// When we build a columnar dataframe, we first just group /// When we build a columnar dataframe, we first just group
/// all mutations per column, and append them in an append-only buffer /// all mutations per column, and append them in an append-only object.
/// in the stacker.
///
/// These ColumnOperation<T> are therefore serialize/deserialized
/// in memory.
/// ///
/// We represent all of these operations as `ColumnOperation`. /// We represent all of these operations as `ColumnOperation`.
#[derive(Eq, PartialEq, Debug, Clone, Copy)] #[derive(Eq, PartialEq, Debug, Clone, Copy)]
pub(super) enum ColumnOperation<T> { pub(crate) enum ColumnOperation<T> {
NewDoc(DocId), NewDoc(DocId),
Value(T), Value(T),
} }
#[derive(Copy, Clone, Eq, PartialEq, Debug)] #[derive(Copy, Clone, Debug, Eq, PartialEq)]
struct ColumnOperationMetadata { struct ColumnOperationHeader {
op_type: ColumnOperationType, typ_code: u8,
len: u8, len: u8,
} }
impl ColumnOperationMetadata { impl ColumnOperationHeader {
fn to_code(self) -> u8 { fn to_code(self) -> u8 {
place_bits::<0, 4>(self.len) | place_bits::<4, 8>(self.op_type.to_code()) place_bits::<0, 4>(self.len) | place_bits::<4, 8>(self.typ_code)
} }
fn try_from_code(code: u8) -> Result<Self, InvalidData> { fn from_code(code: u8) -> Self {
let len = select_bits::<0, 4>(code); let len = select_bits::<0, 4>(code);
let typ_code = select_bits::<4, 8>(code); let typ_code = select_bits::<4, 8>(code);
let column_type = ColumnOperationType::try_from_code(typ_code)?; ColumnOperationHeader { typ_code, len }
Ok(ColumnOperationMetadata {
op_type: column_type,
len,
})
} }
} }
#[derive(Copy, Clone, Eq, PartialEq, Debug)] const NEW_DOC_CODE: u8 = 0u8;
#[repr(u8)] const NEW_VALUE_CODE: u8 = 1u8;
enum ColumnOperationType {
NewDoc = 0u8,
AddValue = 1u8,
}
impl ColumnOperationType {
pub fn to_code(self) -> u8 {
self as u8
}
pub fn try_from_code(code: u8) -> Result<Self, InvalidData> {
match code {
0 => Ok(Self::NewDoc),
1 => Ok(Self::AddValue),
_ => Err(InvalidData),
}
}
}
impl<V: SymbolValue> ColumnOperation<V> { impl<V: SymbolValue> ColumnOperation<V> {
pub(super) fn serialize(self) -> impl AsRef<[u8]> { pub fn serialize(self) -> impl AsRef<[u8]> {
let mut minibuf = MiniBuffer::default(); let mut minibuf = MiniBuffer::default();
let column_op_metadata = match self { let header = match self {
ColumnOperation::NewDoc(new_doc) => { ColumnOperation::NewDoc(new_doc) => {
let symbol_len = new_doc.serialize(&mut minibuf.bytes[1..]); let symbol_len = new_doc.serialize(&mut minibuf.bytes[1..]);
ColumnOperationMetadata { ColumnOperationHeader {
op_type: ColumnOperationType::NewDoc, typ_code: NEW_DOC_CODE,
len: symbol_len, len: symbol_len,
} }
} }
ColumnOperation::Value(val) => { ColumnOperation::Value(val) => {
let symbol_len = val.serialize(&mut minibuf.bytes[1..]); let symbol_len = val.serialize(&mut minibuf.bytes[1..]);
ColumnOperationMetadata { ColumnOperationHeader {
op_type: ColumnOperationType::AddValue, typ_code: NEW_VALUE_CODE,
len: symbol_len, len: symbol_len,
} }
} }
}; };
minibuf.bytes[0] = column_op_metadata.to_code(); minibuf.bytes[0] = header.to_code();
// +1 for the metadata minibuf.len = 1 + header.len;
minibuf.len = 1 + column_op_metadata.len;
minibuf minibuf
} }
/// Deserialize a column operation. /// Deserialize a column operation.
/// Returns None if the buffer is empty. /// Returns None if the buffer is empty.
/// ///
/// Panics if the payload is invalid: /// Panics if the payload is invalid.
/// this deserialize method is meant to target in memory. pub fn deserialize(bytes: &mut &[u8]) -> Option<Self> {
pub(super) fn deserialize(bytes: &mut &[u8]) -> Option<Self> { let header_byte = pop_first_byte(bytes)?;
let column_op_metadata_byte = pop_first_byte(bytes)?; let column_op_header = ColumnOperationHeader::from_code(header_byte);
let column_op_metadata = ColumnOperationMetadata::try_from_code(column_op_metadata_byte)
.expect("Invalid op metadata byte");
let symbol_bytes: &[u8]; let symbol_bytes: &[u8];
(symbol_bytes, *bytes) = bytes.split_at(column_op_metadata.len as usize); (symbol_bytes, *bytes) = bytes.split_at(column_op_header.len as usize);
match column_op_metadata.op_type { match column_op_header.typ_code {
ColumnOperationType::NewDoc => { NEW_DOC_CODE => {
let new_doc = u32::deserialize(symbol_bytes); let new_doc = u32::deserialize(symbol_bytes);
Some(ColumnOperation::NewDoc(new_doc)) Some(ColumnOperation::NewDoc(new_doc))
} }
ColumnOperationType::AddValue => { NEW_VALUE_CODE => {
let value = V::deserialize(symbol_bytes); let value = V::deserialize(symbol_bytes);
Some(ColumnOperation::Value(value)) Some(ColumnOperation::Value(value))
} }
_ => {
panic!("Unknown code {}", column_op_header.typ_code);
}
} }
} }
} }
@@ -115,25 +89,20 @@ impl<T> From<T> for ColumnOperation<T> {
} }
} }
// Serialization trait very local to the writer.
// As we write fast fields, we accumulate them in "in memory".
// In order to limit memory usage, and in order
// to benefit from the stacker, we do this by serialization our data
// as "Symbols".
#[allow(clippy::from_over_into)] #[allow(clippy::from_over_into)]
pub(super) trait SymbolValue: Clone + Copy { pub(crate) trait SymbolValue: Clone + Copy {
// Serializes the symbol into the given buffer.
// Returns the number of bytes written into the buffer.
/// # Panics
/// May not exceed 9bytes
fn serialize(self, buffer: &mut [u8]) -> u8; fn serialize(self, buffer: &mut [u8]) -> u8;
// Panics if invalid
// Reads the header type and the given bytes.
//
// `bytes` does not contain the header byte.
// This method should advance bytes by the number of bytes that were consumed.
fn deserialize(bytes: &[u8]) -> Self; fn deserialize(bytes: &[u8]) -> Self;
} }
impl SymbolValue for bool { impl SymbolValue for bool {
fn serialize(self, buffer: &mut [u8]) -> u8 { fn serialize(self, buffer: &mut [u8]) -> u8 {
buffer[0] = u8::from(self); buffer[0] = if self { 1u8 } else { 0u8 };
1u8 1u8
} }
@@ -178,9 +147,6 @@ impl SymbolValue for NumericalValue {
} }
} }
/// F64: Serialize with a fixed size of 9 bytes
/// U64: Serialize without leading zeroes
/// I64: ZigZag encoded and serialize without leading zeroes
fn serialize(self, output: &mut [u8]) -> u8 { fn serialize(self, output: &mut [u8]) -> u8 {
match self { match self {
NumericalValue::F64(val) => { NumericalValue::F64(val) => {
@@ -277,14 +243,13 @@ mod tests {
} }
#[test] #[test]
fn test_column_op_metadata_byte_serialization() { fn test_header_byte_serialization() {
for len in 0..=15 { for len in 0..=15 {
for op_type in [ColumnOperationType::AddValue, ColumnOperationType::NewDoc] { for typ_code in 0..=15 {
let column_op_metadata = ColumnOperationMetadata { op_type, len }; let header = ColumnOperationHeader { typ_code, len };
let column_op_metadata_code = column_op_metadata.to_code(); let header_code = header.to_code();
let serdeser_metadata = let serdeser_header = ColumnOperationHeader::from_code(header_code);
ColumnOperationMetadata::try_from_code(column_op_metadata_code).unwrap(); assert_eq!(header, serdeser_header);
assert_eq!(column_op_metadata, serdeser_metadata);
} }
} }
} }
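The comments in the hunks above describe how `NumericalValue` symbols are packed: f64 at a fixed width, u64 without its most-significant zero bytes, and i64 zigzag-encoded first. A sketch of those two building blocks, not the crate's exact byte layout:

```rust
/// Zigzag encoding maps small negative and positive i64 values
/// to small u64 values (-1 -> 1, 1 -> 2, -2 -> 3, ...).
fn zigzag_encode(val: i64) -> u64 {
    ((val as u64) << 1) ^ ((val >> 63) as u64)
}

/// Writes `val` in little-endian order, dropping the most-significant
/// zero bytes, and returns the number of bytes written (0..=8).
fn serialize_u64_compact(val: u64, output: &mut [u8]) -> u8 {
    let num_bytes = (8 - val.leading_zeros() as usize / 8) as u8;
    output[..num_bytes as usize].copy_from_slice(&val.to_le_bytes()[..num_bytes as usize]);
    num_bytes
}
```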

View File

@@ -9,18 +9,18 @@ use crate::{Cardinality, DocId, NumericalType, NumericalValue};
#[derive(Copy, Clone, Debug, Eq, PartialEq)] #[derive(Copy, Clone, Debug, Eq, PartialEq)]
#[repr(u8)] #[repr(u8)]
enum DocumentStep { enum DocumentStep {
Same = 0, SameDoc = 0,
Next = 1, NextDoc = 1,
Skipped = 2, SkippedDoc = 2,
} }
#[inline(always)] #[inline(always)]
fn delta_with_last_doc(last_doc_opt: Option<u32>, doc: u32) -> DocumentStep { fn delta_with_last_doc(last_doc_opt: Option<u32>, doc: u32) -> DocumentStep {
let expected_next_doc = last_doc_opt.map(|last_doc| last_doc + 1).unwrap_or(0u32); let expected_next_doc = last_doc_opt.map(|last_doc| last_doc + 1).unwrap_or(0u32);
match doc.cmp(&expected_next_doc) { match doc.cmp(&expected_next_doc) {
Ordering::Less => DocumentStep::Same, Ordering::Less => DocumentStep::SameDoc,
Ordering::Equal => DocumentStep::Next, Ordering::Equal => DocumentStep::NextDoc,
Ordering::Greater => DocumentStep::Skipped, Ordering::Greater => DocumentStep::SkippedDoc,
} }
} }
@@ -38,7 +38,7 @@ pub struct ColumnWriter {
impl ColumnWriter { impl ColumnWriter {
/// Returns an iterator over the Symbol that have been recorded /// Returns an iterator over the Symbol that have been recorded
/// for the given column. /// for the given column.
pub(super) fn operation_iterator<'a, V: SymbolValue>( pub(crate) fn operation_iterator<'a, V: SymbolValue>(
&self, &self,
arena: &MemoryArena, arena: &MemoryArena,
buffer: &'a mut Vec<u8>, buffer: &'a mut Vec<u8>,
@@ -53,18 +53,18 @@ impl ColumnWriter {
/// ///
/// This function will also update the cardinality of the column /// This function will also update the cardinality of the column
/// if necessary. /// if necessary.
pub(super) fn record<S: SymbolValue>(&mut self, doc: DocId, value: S, arena: &mut MemoryArena) { pub(crate) fn record<S: SymbolValue>(&mut self, doc: DocId, value: S, arena: &mut MemoryArena) {
// Difference between `doc` and the last doc. // Difference between `doc` and the last doc.
match delta_with_last_doc(self.last_doc_opt, doc) { match delta_with_last_doc(self.last_doc_opt, doc) {
DocumentStep::Same => { DocumentStep::SameDoc => {
// This is the last encountered document. // This is the last encountered document.
self.cardinality = Cardinality::Multivalued; self.cardinality = Cardinality::Multivalued;
} }
DocumentStep::Next => { DocumentStep::NextDoc => {
self.last_doc_opt = Some(doc); self.last_doc_opt = Some(doc);
self.write_symbol::<S>(ColumnOperation::NewDoc(doc), arena); self.write_symbol::<S>(ColumnOperation::NewDoc(doc), arena);
} }
DocumentStep::Skipped => { DocumentStep::SkippedDoc => {
self.cardinality = self.cardinality.max(Cardinality::Optional); self.cardinality = self.cardinality.max(Cardinality::Optional);
self.last_doc_opt = Some(doc); self.last_doc_opt = Some(doc);
self.write_symbol::<S>(ColumnOperation::NewDoc(doc), arena); self.write_symbol::<S>(ColumnOperation::NewDoc(doc), arena);
@@ -77,10 +77,10 @@ impl ColumnWriter {
// The overall number of docs in the column is necessary to // The overall number of docs in the column is necessary to
// deal with the case where all docs contain 1 value, except some documents // deal with the case where all docs contain 1 value, except some documents
// at the end of the column. // at the end of the column.
pub(crate) fn get_cardinality(&self, num_docs: DocId) -> Cardinality { pub fn get_cardinality(&self, num_docs: DocId) -> Cardinality {
match delta_with_last_doc(self.last_doc_opt, num_docs) { match delta_with_last_doc(self.last_doc_opt, num_docs) {
DocumentStep::Same | DocumentStep::Next => self.cardinality, DocumentStep::SameDoc | DocumentStep::NextDoc => self.cardinality,
DocumentStep::Skipped => self.cardinality.max(Cardinality::Optional), DocumentStep::SkippedDoc => self.cardinality.max(Cardinality::Optional),
} }
} }
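To summarize the detection logic above in a self-contained sketch (hypothetical helper; the real writer updates this state incrementally as values are recorded):

```rust
#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)]
enum Cardinality { Required, Optional, Multivalued }

/// `recorded_docs` lists the doc id passed with each recorded value, in order.
fn detect_cardinality(recorded_docs: &[u32], num_docs: u32) -> Cardinality {
    let mut cardinality = Cardinality::Required;
    let mut last_doc_opt: Option<u32> = None;
    for &doc in recorded_docs {
        let expected_next_doc = last_doc_opt.map(|d| d + 1).unwrap_or(0);
        if doc < expected_next_doc {
            // Same doc recorded twice: at least one doc holds several values.
            cardinality = Cardinality::Multivalued;
        } else {
            if doc > expected_next_doc {
                // Some docs were skipped: they hold no value.
                cardinality = cardinality.max(Cardinality::Optional);
            }
            last_doc_opt = Some(doc);
        }
    }
    // Docs missing at the very end of the column also make it Optional.
    if num_docs > last_doc_opt.map(|d| d + 1).unwrap_or(0) {
        cardinality = cardinality.max(Cardinality::Optional);
    }
    cardinality
}

fn main() {
    assert_eq!(detect_cardinality(&[0, 1, 2], 3), Cardinality::Required);
    assert_eq!(detect_cardinality(&[0, 2], 3), Cardinality::Optional);
    assert_eq!(detect_cardinality(&[0, 0, 1], 2), Cardinality::Multivalued);
}
```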
@@ -105,7 +105,7 @@ pub(crate) struct NumericalColumnWriter {
/// State used to store what types are still acceptable /// State used to store what types are still acceptable
/// after having seen a set of numerical values. /// after having seen a set of numerical values.
#[derive(Clone, Copy)] #[derive(Clone, Copy)]
struct CompatibleNumericalTypes { pub(crate) struct CompatibleNumericalTypes {
all_values_within_i64_range: bool, all_values_within_i64_range: bool,
all_values_within_u64_range: bool, all_values_within_u64_range: bool,
// f64 is always acceptable. // f64 is always acceptable.
@@ -155,7 +155,6 @@ impl NumericalColumnWriter {
let cardinality = self.column_writer.get_cardinality(num_docs); let cardinality = self.column_writer.get_cardinality(num_docs);
(numerical_type, cardinality) (numerical_type, cardinality)
} }
pub fn record_numerical_value( pub fn record_numerical_value(
&mut self, &mut self,
doc: DocId, doc: DocId,
@@ -166,7 +165,7 @@ impl NumericalColumnWriter {
self.column_writer.record(doc, value, arena); self.column_writer.record(doc, value, arena);
} }
pub(super) fn operation_iterator<'a>( pub fn operation_iterator<'a>(
self, self,
arena: &MemoryArena, arena: &MemoryArena,
buffer: &'a mut Vec<u8>, buffer: &'a mut Vec<u8>,
@@ -176,13 +175,13 @@ impl NumericalColumnWriter {
} }
#[derive(Copy, Clone, Default)] #[derive(Copy, Clone, Default)]
pub(crate) struct StrColumnWriter { pub struct StrColumnWriter {
pub(crate) dictionary_id: u32, pub(crate) dictionary_id: u32,
pub(crate) column_writer: ColumnWriter, pub(crate) column_writer: ColumnWriter,
} }
impl StrColumnWriter { impl StrColumnWriter {
pub(crate) fn with_dictionary_id(dictionary_id: u32) -> StrColumnWriter { pub fn with_dictionary_id(dictionary_id: u32) -> StrColumnWriter {
StrColumnWriter { StrColumnWriter {
dictionary_id, dictionary_id,
column_writer: Default::default(), column_writer: Default::default(),
@@ -200,7 +199,7 @@ impl StrColumnWriter {
self.column_writer.record(doc, unordered_id, arena); self.column_writer.record(doc, unordered_id, arena);
} }
pub(super) fn operation_iterator<'a>( pub(crate) fn operation_iterator<'a>(
&self, &self,
arena: &MemoryArena, arena: &MemoryArena,
byte_buffer: &'a mut Vec<u8>, byte_buffer: &'a mut Vec<u8>,
@@ -215,14 +214,20 @@ mod tests {
#[test] #[test]
fn test_delta_with_last_doc() { fn test_delta_with_last_doc() {
assert_eq!(delta_with_last_doc(None, 0u32), DocumentStep::Next); assert_eq!(delta_with_last_doc(None, 0u32), DocumentStep::NextDoc);
assert_eq!(delta_with_last_doc(None, 1u32), DocumentStep::Skipped); assert_eq!(delta_with_last_doc(None, 1u32), DocumentStep::SkippedDoc);
assert_eq!(delta_with_last_doc(None, 2u32), DocumentStep::Skipped); assert_eq!(delta_with_last_doc(None, 2u32), DocumentStep::SkippedDoc);
assert_eq!(delta_with_last_doc(Some(0u32), 0u32), DocumentStep::Same); assert_eq!(delta_with_last_doc(Some(0u32), 0u32), DocumentStep::SameDoc);
assert_eq!(delta_with_last_doc(Some(1u32), 1u32), DocumentStep::Same); assert_eq!(delta_with_last_doc(Some(1u32), 1u32), DocumentStep::SameDoc);
assert_eq!(delta_with_last_doc(Some(1u32), 2u32), DocumentStep::Next); assert_eq!(delta_with_last_doc(Some(1u32), 2u32), DocumentStep::NextDoc);
assert_eq!(delta_with_last_doc(Some(1u32), 3u32), DocumentStep::Skipped); assert_eq!(
assert_eq!(delta_with_last_doc(Some(1u32), 4u32), DocumentStep::Skipped); delta_with_last_doc(Some(1u32), 3u32),
DocumentStep::SkippedDoc
);
assert_eq!(
delta_with_last_doc(Some(1u32), 4u32),
DocumentStep::SkippedDoc
);
} }
#[track_caller] #[track_caller]

View File

@@ -3,24 +3,27 @@ mod column_writers;
mod serializer; mod serializer;
mod value_index; mod value_index;
use std::io; use std::io::{self, Write};
use column_operation::ColumnOperation; use column_operation::ColumnOperation;
use common::CountingWriter;
use fastfield_codecs::serialize::ValueIndexInfo; use fastfield_codecs::serialize::ValueIndexInfo;
use fastfield_codecs::{Column, MonotonicallyMappableToU64, VecColumn}; use fastfield_codecs::{Column, MonotonicallyMappableToU64, VecColumn};
use serializer::ColumnarSerializer; use serializer::ColumnarSerializer;
use stacker::{Addr, ArenaHashMap, MemoryArena}; use stacker::{Addr, ArenaHashMap, MemoryArena};
use crate::column_type_header::{ColumnType, ColumnTypeAndCardinality, ColumnTypeCategory}; use crate::column_type_header::{ColumnType, ColumnTypeAndCardinality, GeneralType};
use crate::dictionary::{DictionaryBuilder, TermIdMapping, UnorderedId}; use crate::dictionary::{DictionaryBuilder, IdMapping, UnorderedId};
use crate::value::{Coerce, NumericalType, NumericalValue}; use crate::value::{Coerce, NumericalType, NumericalValue};
use crate::writer::column_writers::{ColumnWriter, NumericalColumnWriter, StrColumnWriter}; use crate::writer::column_writers::{ColumnWriter, NumericalColumnWriter, StrColumnWriter};
use crate::writer::value_index::{IndexBuilder, SpareIndexBuilders}; use crate::writer::value_index::{IndexBuilder, SpareIndexBuilders};
use crate::{Cardinality, DocId}; use crate::{Cardinality, DocId};
/// This is a set of buffers that are used to temporarily write the values into before passing them /// Threshold above which a column data will be compressed
/// to the fast field codecs. /// using ZSTD.
const COLUMN_COMPRESSION_THRESHOLD: usize = 100_000;
/// This is a set of buffers that are only here
/// to limit the amount of allocation.
#[derive(Default)] #[derive(Default)]
struct SpareBuffers { struct SpareBuffers {
value_index_builders: SpareIndexBuilders, value_index_builders: SpareIndexBuilders,
@@ -28,21 +31,9 @@ struct SpareBuffers {
u64_values: Vec<u64>, u64_values: Vec<u64>,
f64_values: Vec<f64>, f64_values: Vec<f64>,
bool_values: Vec<bool>, bool_values: Vec<bool>,
column_buffer: Vec<u8>,
} }
/// Makes it possible to create a new columnar.
///
/// ```rust
/// use tantivy_columnar::ColumnarWriter;
///
/// let mut columnar_writer = ColumnarWriter::default();
/// columnar_writer.record_str(0u32 /* doc id */, "product_name", "Red backpack");
/// columnar_writer.record_numerical(0u32 /* doc id */, "price", 10u64);
/// columnar_writer.record_str(1u32 /* doc id */, "product_name", "Apple");
/// columnar_writer.record_numerical(0u32 /* doc id */, "price", 10.5f64); //< uh oh we ended up mixing integer and floats.
/// let mut wrt: Vec<u8> = Vec::new();
/// columnar_writer.serialize(2u32, &mut wrt).unwrap();
/// ```
pub struct ColumnarWriter { pub struct ColumnarWriter {
numerical_field_hash_map: ArenaHashMap, numerical_field_hash_map: ArenaHashMap,
bool_field_hash_map: ArenaHashMap, bool_field_hash_map: ArenaHashMap,
@@ -67,11 +58,11 @@ impl Default for ColumnarWriter {
} }
impl ColumnarWriter { impl ColumnarWriter {
pub fn record_numerical<T: Into<NumericalValue> + Copy>( pub fn record_numerical(
&mut self, &mut self,
doc: DocId, doc: DocId,
column_name: &str, column_name: &str,
numerical_value: T, numerical_value: NumericalValue,
) { ) {
assert!( assert!(
!column_name.as_bytes().contains(&0u8), !column_name.as_bytes().contains(&0u8),
@@ -82,7 +73,7 @@ impl ColumnarWriter {
column_name.as_bytes(), column_name.as_bytes(),
|column_opt: Option<NumericalColumnWriter>| { |column_opt: Option<NumericalColumnWriter>| {
let mut column: NumericalColumnWriter = column_opt.unwrap_or_default(); let mut column: NumericalColumnWriter = column_opt.unwrap_or_default();
column.record_numerical_value(doc, numerical_value.into(), arena); column.record_numerical_value(doc, numerical_value, arena);
column column
}, },
); );
@@ -104,7 +95,7 @@ impl ColumnarWriter {
); );
} }
pub fn record_str(&mut self, doc: DocId, column_name: &str, value: &str) { pub fn record_str(&mut self, doc: DocId, column_name: &str, value: &[u8]) {
assert!( assert!(
!column_name.as_bytes().contains(&0u8), !column_name.as_bytes().contains(&0u8),
"key may not contain the 0 byte" "key may not contain the 0 byte"
@@ -118,12 +109,11 @@ impl ColumnarWriter {
column_name.as_bytes(), column_name.as_bytes(),
|column_opt: Option<StrColumnWriter>| { |column_opt: Option<StrColumnWriter>| {
let mut column: StrColumnWriter = column_opt.unwrap_or_else(|| { let mut column: StrColumnWriter = column_opt.unwrap_or_else(|| {
// Each column has its own dictionary
let dictionary_id = dictionaries.len() as u32; let dictionary_id = dictionaries.len() as u32;
dictionaries.push(DictionaryBuilder::default()); dictionaries.push(DictionaryBuilder::default());
StrColumnWriter::with_dictionary_id(dictionary_id) StrColumnWriter::with_dictionary_id(dictionary_id)
}); });
column.record_bytes(doc, value.as_bytes(), dictionaries, arena); column.record_bytes(doc, value, dictionaries, arena);
column column
}, },
); );
@@ -131,44 +121,44 @@ impl ColumnarWriter {
pub fn serialize(&mut self, num_docs: DocId, wrt: &mut dyn io::Write) -> io::Result<()> { pub fn serialize(&mut self, num_docs: DocId, wrt: &mut dyn io::Write) -> io::Result<()> {
let mut serializer = ColumnarSerializer::new(wrt); let mut serializer = ColumnarSerializer::new(wrt);
let mut field_columns: Vec<(&[u8], ColumnTypeCategory, Addr)> = self let mut field_columns: Vec<(&[u8], GeneralType, Addr)> = self
.numerical_field_hash_map .numerical_field_hash_map
.iter() .iter()
.map(|(term, addr, _)| (term, ColumnTypeCategory::Numerical, addr)) .map(|(term, addr, _)| (term, GeneralType::Numerical, addr))
.collect(); .collect();
field_columns.extend( field_columns.extend(
self.bytes_field_hash_map self.bytes_field_hash_map
.iter() .iter()
.map(|(term, addr, _)| (term, ColumnTypeCategory::Str, addr)), .map(|(term, addr, _)| (term, GeneralType::Str, addr)),
); );
field_columns.extend( field_columns.extend(
self.bool_field_hash_map self.bool_field_hash_map
.iter() .iter()
.map(|(term, addr, _)| (term, ColumnTypeCategory::Bool, addr)), .map(|(term, addr, _)| (term, GeneralType::Bool, addr)),
); );
field_columns.sort_unstable_by_key(|(column_name, col_type, _)| (*column_name, *col_type)); field_columns.sort_unstable_by_key(|(column_name, col_type, _)| (*column_name, *col_type));
let (arena, buffers, dictionaries) = (&self.arena, &mut self.buffers, &self.dictionaries); let (arena, buffers, dictionaries) = (&self.arena, &mut self.buffers, &self.dictionaries);
let mut symbol_byte_buffer: Vec<u8> = Vec::new(); let mut symbol_byte_buffer: Vec<u8> = Vec::new();
for (column_name, bytes_or_numerical, addr) in field_columns { for (column_name, bytes_or_numerical, addr) in field_columns {
match bytes_or_numerical { match bytes_or_numerical {
ColumnTypeCategory::Bool => { GeneralType::Bool => {
let column_writer: ColumnWriter = self.bool_field_hash_map.read(addr); let column_writer: ColumnWriter = self.bool_field_hash_map.read(addr);
let cardinality = column_writer.get_cardinality(num_docs); let cardinality = column_writer.get_cardinality(num_docs);
let column_type_and_cardinality = ColumnTypeAndCardinality { let column_type_and_cardinality = ColumnTypeAndCardinality {
cardinality, cardinality,
typ: ColumnType::Bool, typ: ColumnType::Bool,
}; };
let mut column_serializer = let column_serializer =
serializer.serialize_column(column_name, column_type_and_cardinality); serializer.serialize_column(column_name, column_type_and_cardinality);
serialize_bool_column( serialize_bool_column(
cardinality, cardinality,
num_docs, num_docs,
column_writer.operation_iterator(arena, &mut symbol_byte_buffer), column_writer.operation_iterator(arena, &mut symbol_byte_buffer),
buffers, buffers,
&mut column_serializer, column_serializer,
)?; )?;
} }
ColumnTypeCategory::Str => { GeneralType::Str => {
let str_column_writer: StrColumnWriter = self.bytes_field_hash_map.read(addr); let str_column_writer: StrColumnWriter = self.bytes_field_hash_map.read(addr);
let dictionary_builder = let dictionary_builder =
&dictionaries[str_column_writer.dictionary_id as usize]; &dictionaries[str_column_writer.dictionary_id as usize];
@@ -177,7 +167,7 @@ impl ColumnarWriter {
cardinality, cardinality,
typ: ColumnType::Bytes, typ: ColumnType::Bytes,
}; };
let mut column_serializer = let column_serializer =
serializer.serialize_column(column_name, column_type_and_cardinality); serializer.serialize_column(column_name, column_type_and_cardinality);
serialize_bytes_column( serialize_bytes_column(
cardinality, cardinality,
@@ -185,10 +175,10 @@ impl ColumnarWriter {
dictionary_builder, dictionary_builder,
str_column_writer.operation_iterator(arena, &mut symbol_byte_buffer), str_column_writer.operation_iterator(arena, &mut symbol_byte_buffer),
buffers, buffers,
&mut column_serializer, column_serializer,
)?; )?;
} }
ColumnTypeCategory::Numerical => { GeneralType::Numerical => {
let numerical_column_writer: NumericalColumnWriter = let numerical_column_writer: NumericalColumnWriter =
self.numerical_field_hash_map.read(addr); self.numerical_field_hash_map.read(addr);
let (numerical_type, cardinality) = let (numerical_type, cardinality) =
@@ -197,7 +187,7 @@ impl ColumnarWriter {
cardinality, cardinality,
typ: ColumnType::Numerical(numerical_type), typ: ColumnType::Numerical(numerical_type),
}; };
let mut column_serializer = let column_serializer =
serializer.serialize_column(column_name, column_type_and_cardinality); serializer.serialize_column(column_name, column_type_and_cardinality);
serialize_numerical_column( serialize_numerical_column(
cardinality, cardinality,
@@ -205,7 +195,7 @@ impl ColumnarWriter {
numerical_type, numerical_type,
numerical_column_writer.operation_iterator(arena, &mut symbol_byte_buffer), numerical_column_writer.operation_iterator(arena, &mut symbol_byte_buffer),
buffers, buffers,
&mut column_serializer, column_serializer,
)?; )?;
} }
}; };
@@ -215,28 +205,41 @@ impl ColumnarWriter {
} }
} }
fn serialize_bytes_column( fn compress_and_write_column<W: io::Write>(column_bytes: &[u8], wrt: &mut W) -> io::Result<()> {
if column_bytes.len() >= COLUMN_COMPRESSION_THRESHOLD {
wrt.write_all(&[1])?;
let mut encoder = zstd::Encoder::new(wrt, 3)?;
encoder.write_all(column_bytes)?;
encoder.finish()?;
} else {
wrt.write_all(&[0])?;
wrt.write_all(column_bytes)?;
}
Ok(())
}
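
For orientation, the read side of this framing is the mirror image: one flag byte, then either a zstd frame or the raw column bytes. A minimal sketch, assuming the flag values written above (the helper name `read_column_bytes` is hypothetical and not part of this changeset):

```rust
use std::io;

// Decodes a payload written by `compress_and_write_column`:
// a 1-byte flag (1 = zstd-compressed, 0 = raw) followed by the column bytes.
fn read_column_bytes(data: &[u8]) -> io::Result<Vec<u8>> {
    let (flag, body) = data
        .split_first()
        .ok_or_else(|| io::Error::new(io::ErrorKind::UnexpectedEof, "empty column payload"))?;
    if *flag == 1 {
        zstd::decode_all(body)
    } else {
        Ok(body.to_vec())
    }
}
```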
fn serialize_bytes_column<W: io::Write>(
cardinality: Cardinality, cardinality: Cardinality,
num_docs: DocId, num_docs: DocId,
dictionary_builder: &DictionaryBuilder, dictionary_builder: &DictionaryBuilder,
operation_it: impl Iterator<Item = ColumnOperation<UnorderedId>>, operation_it: impl Iterator<Item = ColumnOperation<UnorderedId>>,
buffers: &mut SpareBuffers, buffers: &mut SpareBuffers,
wrt: impl io::Write, mut wrt: W,
) -> io::Result<()> { ) -> io::Result<()> {
let SpareBuffers { let SpareBuffers {
value_index_builders, value_index_builders,
u64_values, u64_values,
column_buffer,
.. ..
} = buffers; } = buffers;
let mut counting_writer = CountingWriter::wrap(wrt); column_buffer.clear();
let term_id_mapping: TermIdMapping = dictionary_builder.serialize(&mut counting_writer)?; let id_mapping: IdMapping = dictionary_builder.serialize(column_buffer)?;
let dictionary_num_bytes: u32 = counting_writer.written_bytes() as u32; let dictionary_num_bytes: u32 = column_buffer.len() as u32;
let mut wrt = counting_writer.finish();
let operation_iterator = operation_it.map(|symbol: ColumnOperation<UnorderedId>| { let operation_iterator = operation_it.map(|symbol: ColumnOperation<UnorderedId>| {
// We map unordered ids to ordered ids. // We map unordered ids to ordered ids.
match symbol { match symbol {
ColumnOperation::Value(unordered_id) => { ColumnOperation::Value(unordered_id) => {
let ordered_id = term_id_mapping.to_ord(unordered_id); let ordered_id = id_mapping.to_ord(unordered_id);
ColumnOperation::Value(ordered_id.0 as u64) ColumnOperation::Value(ordered_id.0 as u64)
} }
ColumnOperation::NewDoc(doc) => ColumnOperation::NewDoc(doc), ColumnOperation::NewDoc(doc) => ColumnOperation::NewDoc(doc),
@@ -248,27 +251,30 @@ fn serialize_bytes_column(
num_docs, num_docs,
value_index_builders, value_index_builders,
u64_values, u64_values,
&mut wrt, column_buffer,
)?; )?;
wrt.write_all(&dictionary_num_bytes.to_le_bytes()[..])?; column_buffer.write_all(&dictionary_num_bytes.to_le_bytes()[..])?;
compress_and_write_column(column_buffer, &mut wrt)?;
Ok(()) Ok(())
} }
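
With the `column_buffer` variant above, the decompressed payload of a bytes column ends up laid out as `[dictionary][column data][u32 LE = dictionary length]`. A hypothetical helper splitting it back apart (names and error handling are illustrative assumptions):

```rust
fn split_bytes_column(payload: &[u8]) -> Option<(&[u8], &[u8])> {
    // The last 4 bytes hold the dictionary length, little-endian.
    let (body, footer) = payload.split_at(payload.len().checked_sub(4)?);
    let dict_len = u32::from_le_bytes(footer.try_into().ok()?) as usize;
    if dict_len > body.len() {
        return None;
    }
    // The dictionary was serialized first, the column data after it.
    let (dictionary, column) = body.split_at(dict_len);
    Some((dictionary, column))
}
```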
fn serialize_numerical_column( fn serialize_numerical_column<W: io::Write>(
cardinality: Cardinality, cardinality: Cardinality,
num_docs: DocId, num_docs: DocId,
numerical_type: NumericalType, numerical_type: NumericalType,
op_iterator: impl Iterator<Item = ColumnOperation<NumericalValue>>, op_iterator: impl Iterator<Item = ColumnOperation<NumericalValue>>,
buffers: &mut SpareBuffers, buffers: &mut SpareBuffers,
wrt: &mut impl io::Write, mut wrt: W,
) -> io::Result<()> { ) -> io::Result<()> {
let SpareBuffers { let SpareBuffers {
value_index_builders, value_index_builders,
u64_values, u64_values,
i64_values, i64_values,
f64_values, f64_values,
column_buffer,
.. ..
} = buffers; } = buffers;
column_buffer.clear();
match numerical_type { match numerical_type {
NumericalType::I64 => { NumericalType::I64 => {
serialize_column( serialize_column(
@@ -277,7 +283,7 @@ fn serialize_numerical_column(
num_docs, num_docs,
value_index_builders, value_index_builders,
i64_values, i64_values,
wrt, column_buffer,
)?; )?;
} }
NumericalType::U64 => { NumericalType::U64 => {
@@ -287,7 +293,7 @@ fn serialize_numerical_column(
num_docs, num_docs,
value_index_builders, value_index_builders,
u64_values, u64_values,
wrt, column_buffer,
)?; )?;
} }
NumericalType::F64 => { NumericalType::F64 => {
@@ -297,33 +303,37 @@ fn serialize_numerical_column(
num_docs, num_docs,
value_index_builders, value_index_builders,
f64_values, f64_values,
wrt, column_buffer,
)?; )?;
} }
}; };
compress_and_write_column(column_buffer, &mut wrt)?;
Ok(()) Ok(())
} }
fn serialize_bool_column( fn serialize_bool_column<W: io::Write>(
cardinality: Cardinality, cardinality: Cardinality,
num_docs: DocId, num_docs: DocId,
column_operations_it: impl Iterator<Item = ColumnOperation<bool>>, column_operations_it: impl Iterator<Item = ColumnOperation<bool>>,
buffers: &mut SpareBuffers, buffers: &mut SpareBuffers,
wrt: &mut impl io::Write, mut wrt: W,
) -> io::Result<()> { ) -> io::Result<()> {
let SpareBuffers { let SpareBuffers {
value_index_builders, value_index_builders,
bool_values, bool_values,
column_buffer,
.. ..
} = buffers; } = buffers;
column_buffer.clear();
serialize_column( serialize_column(
column_operations_it, column_operations_it,
cardinality, cardinality,
num_docs, num_docs,
value_index_builders, value_index_builders,
bool_values, bool_values,
wrt, column_buffer,
)?; )?;
compress_and_write_column(column_buffer, &mut wrt)?;
Ok(()) Ok(())
} }
@@ -335,7 +345,7 @@ fn serialize_column<
num_docs: DocId, num_docs: DocId,
value_index_builders: &mut SpareIndexBuilders, value_index_builders: &mut SpareIndexBuilders,
values: &mut Vec<T>, values: &mut Vec<T>,
mut wrt: impl io::Write, wrt: &mut Vec<u8>,
) -> io::Result<()> ) -> io::Result<()>
where where
for<'a> VecColumn<'a, T>: Column<T>, for<'a> VecColumn<'a, T>: Column<T>,
@@ -350,7 +360,7 @@ where
); );
fastfield_codecs::serialize( fastfield_codecs::serialize(
VecColumn::from(&values[..]), VecColumn::from(&values[..]),
&mut wrt, wrt,
&fastfield_codecs::ALL_CODEC_TYPES[..], &fastfield_codecs::ALL_CODEC_TYPES[..],
)?; )?;
} }
@@ -361,7 +371,7 @@ where
fastfield_codecs::serialize::serialize_new( fastfield_codecs::serialize::serialize_new(
ValueIndexInfo::SingleValue(Box::new(optional_index)), ValueIndexInfo::SingleValue(Box::new(optional_index)),
VecColumn::from(&values[..]), VecColumn::from(&values[..]),
&mut wrt, wrt,
&fastfield_codecs::ALL_CODEC_TYPES[..], &fastfield_codecs::ALL_CODEC_TYPES[..],
)?; )?;
} }
@@ -372,7 +382,7 @@ where
fastfield_codecs::serialize::serialize_new( fastfield_codecs::serialize::serialize_new(
ValueIndexInfo::MultiValue(Box::new(multivalued_index)), ValueIndexInfo::MultiValue(Box::new(multivalued_index)),
VecColumn::from(&values[..]), VecColumn::from(&values[..]),
&mut wrt, wrt,
&fastfield_codecs::ALL_CODEC_TYPES[..], &fastfield_codecs::ALL_CODEC_TYPES[..],
)?; )?;
} }


@@ -15,10 +15,10 @@ pub struct ColumnarSerializer<W: io::Write> {
/// Returns a key consisting of the concatenation of the key and the column_type_and_cardinality /// Returns a key consisting of the concatenation of the key and the column_type_and_cardinality
/// code. /// code.
fn prepare_key( fn prepare_key<'a>(
key: &[u8], key: &[u8],
column_type_cardinality: ColumnTypeAndCardinality, column_type_cardinality: ColumnTypeAndCardinality,
buffer: &mut Vec<u8>, buffer: &'a mut Vec<u8>,
) { ) {
buffer.clear(); buffer.clear();
buffer.extend_from_slice(key); buffer.extend_from_slice(key);


@@ -5,7 +5,7 @@ use crate::DocId;
/// The `IndexBuilder` interprets a sequence of /// The `IndexBuilder` interprets a sequence of
/// calls of the form: /// calls of the form:
/// (record_doc,record_value+)* /// (record_doc,record_value+)*
/// and can then serialize the results into an index to associate docids with their value[s]. /// and can then serialize the results into an index.
/// ///
/// It has different implementation depending on whether the /// It has different implementation depending on whether the
/// cardinality is required, optional, or multivalued. /// cardinality is required, optional, or multivalued.
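
To make the `(record_doc, record_value+)*` grammar concrete, here is a self-contained toy that mimics the multivalued bookkeeping used further down in this file (the names, and the existence of a `record_value` counterpart, are assumptions for illustration; only `record_doc` appears verbatim in this diff):

```rust
/// Toy model of a multivalued value index: one end offset per document.
#[derive(Default)]
struct ToyMultivaluedIndex {
    end_offsets: Vec<u32>,
    total_num_vals: u32,
}

impl ToyMultivaluedIndex {
    fn record_doc(&mut self, doc: u32) {
        // Documents without values inherit the running total as their end offset.
        self.end_offsets.resize(doc as usize, self.total_num_vals);
    }
    fn record_value(&mut self) {
        self.total_num_vals += 1;
    }
    fn finish(mut self, num_docs: u32) -> Vec<u32> {
        self.end_offsets.resize(num_docs as usize, self.total_num_vals);
        self.end_offsets
    }
}

fn main() {
    let mut builder = ToyMultivaluedIndex::default();
    // doc 0 has two values, doc 1 has none, doc 2 has one value.
    builder.record_doc(0);
    builder.record_value();
    builder.record_value();
    builder.record_doc(2);
    builder.record_value();
    // end_offsets[d] = number of values in docs 0..=d, so doc d's values
    // span end_offsets[d - 1]..end_offsets[d] (with an implicit 0 before doc 0).
    assert_eq!(builder.finish(3), vec![2, 2, 3]);
}
```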
@@ -30,7 +30,6 @@ pub struct OptionalIndexBuilder {
} }
struct SingleValueArrayIndex<'a> { struct SingleValueArrayIndex<'a> {
// DocIds with a value. DocIds are strictly increasing
docs: &'a [DocId], docs: &'a [DocId],
num_docs: DocId, num_docs: DocId,
} }
@@ -84,8 +83,7 @@ impl IndexBuilder for OptionalIndexBuilder {
#[derive(Default)] #[derive(Default)]
pub struct MultivaluedIndexBuilder { pub struct MultivaluedIndexBuilder {
// TODO should we switch to `start_offset`? // TODO should we switch to `start_offset`?
// contains the num values so far for each `DocId`. end_values: Vec<DocId>,
end_offsets: Vec<DocId>,
total_num_vals_seen: u32, total_num_vals_seen: u32,
} }
@@ -113,22 +111,22 @@ impl<'a> MultiValueIndexInfo for MultivaluedValueArrayIndex<'a> {
impl MultivaluedIndexBuilder { impl MultivaluedIndexBuilder {
pub fn finish(&mut self, num_docs: DocId) -> impl MultiValueIndexInfo + '_ { pub fn finish(&mut self, num_docs: DocId) -> impl MultiValueIndexInfo + '_ {
self.end_offsets self.end_values
.resize(num_docs as usize, self.total_num_vals_seen); .resize(num_docs as usize, self.total_num_vals_seen);
MultivaluedValueArrayIndex { MultivaluedValueArrayIndex {
end_offsets: &self.end_offsets[..], end_offsets: &self.end_values[..],
} }
} }
fn reset(&mut self) { fn reset(&mut self) {
self.end_offsets.clear(); self.end_values.clear();
self.total_num_vals_seen = 0; self.total_num_vals_seen = 0;
} }
} }
impl IndexBuilder for MultivaluedIndexBuilder { impl IndexBuilder for MultivaluedIndexBuilder {
fn record_doc(&mut self, doc: DocId) { fn record_doc(&mut self, doc: DocId) {
self.end_offsets self.end_values
.resize(doc as usize, self.total_num_vals_seen); .resize(doc as usize, self.total_num_vals_seen);
} }


@@ -1,166 +0,0 @@
use std::cell::RefCell;
use std::iter::Peekable;
use std::rc::Rc;
pub trait GroupByIteratorExtended: Iterator {
/// Return an `Iterator` that groups iterator elements. Consecutive elements that map to the
/// same key are assigned to the same group.
///
/// The returned Iterator item is `(K, impl Iterator)`, where Iterator are the items of the
/// group.
///
/// ```
/// use tantivy_common::GroupByIteratorExtended;
///
/// // group data into blocks of larger than zero or not.
/// let data: Vec<i32> = vec![1, 3, -2, -2, 1, 0, 1, 2];
/// // groups: |---->|------>|--------->|
///
/// let mut data_grouped = Vec::new();
/// // Note: group is an iterator
/// for (key, group) in data.into_iter().group_by(|val| *val >= 0) {
/// data_grouped.push((key, group.collect()));
/// }
/// assert_eq!(data_grouped, vec![(true, vec![1, 3]), (false, vec![-2, -2]), (true, vec![1, 0, 1, 2])]);
/// ```
fn group_by<K, F>(self, key: F) -> GroupByIterator<Self, F, K>
where
Self: Sized,
F: FnMut(&Self::Item) -> K,
K: PartialEq + Copy,
Self::Item: Copy,
{
GroupByIterator::new(self, key)
}
}
impl<I: Iterator> GroupByIteratorExtended for I {}
pub struct GroupByIterator<I, F, K: Copy>
where
I: Iterator,
F: FnMut(&I::Item) -> K,
{
// I really would like to avoid the Rc<RefCell>, but the Iterator is shared between
// `GroupByIterator` and `GroupIter`. In practice they are used consecutively and

// `GroupByIter` is finished before calling next on `GroupByIterator`. I'm not sure there
// is a solution with lifetimes for that, because we would need to enforce it in the usage
// somehow.
//
// One potential solution would be to replace the iterator approach with something similar.
inner: Rc<RefCell<GroupByShared<I, F, K>>>,
}
struct GroupByShared<I, F, K: Copy>
where
I: Iterator,
F: FnMut(&I::Item) -> K,
{
iter: Peekable<I>,
group_by_fn: F,
}
impl<I, F, K> GroupByIterator<I, F, K>
where
I: Iterator,
F: FnMut(&I::Item) -> K,
K: Copy,
{
fn new(inner: I, group_by_fn: F) -> Self {
let inner = GroupByShared {
iter: inner.peekable(),
group_by_fn,
};
Self {
inner: Rc::new(RefCell::new(inner)),
}
}
}
impl<I, F, K> Iterator for GroupByIterator<I, F, K>
where
I: Iterator,
I::Item: Copy,
F: FnMut(&I::Item) -> K,
K: Copy,
{
type Item = (K, GroupIterator<I, F, K>);
fn next(&mut self) -> Option<Self::Item> {
let mut inner = self.inner.borrow_mut();
let value = *inner.iter.peek()?;
let key = (inner.group_by_fn)(&value);
let inner = self.inner.clone();
let group_iter = GroupIterator {
inner,
group_key: key,
};
Some((key, group_iter))
}
}
pub struct GroupIterator<I, F, K: Copy>
where
I: Iterator,
F: FnMut(&I::Item) -> K,
{
inner: Rc<RefCell<GroupByShared<I, F, K>>>,
group_key: K,
}
impl<I, F, K: PartialEq + Copy> Iterator for GroupIterator<I, F, K>
where
I: Iterator,
I::Item: Copy,
F: FnMut(&I::Item) -> K,
{
type Item = I::Item;
fn next(&mut self) -> Option<Self::Item> {
let mut inner = self.inner.borrow_mut();
// peek if next value is in group
let peek_val = *inner.iter.peek()?;
if (inner.group_by_fn)(&peek_val) == self.group_key {
inner.iter.next()
} else {
None
}
}
}
#[cfg(test)]
mod tests {
use super::*;
fn group_by_collect<I: Iterator<Item = u32>>(iter: I) -> Vec<(I::Item, Vec<I::Item>)> {
iter.group_by(|val| val / 10)
.map(|(el, iter)| (el, iter.collect::<Vec<_>>()))
.collect::<Vec<_>>()
}
#[test]
fn group_by_two_groups() {
let vals = vec![1u32, 4, 15];
let grouped_vals = group_by_collect(vals.into_iter());
assert_eq!(grouped_vals, vec![(0, vec![1, 4]), (1, vec![15])]);
}
#[test]
fn group_by_test_empty() {
let vals = vec![];
let grouped_vals = group_by_collect(vals.into_iter());
assert_eq!(grouped_vals, vec![]);
}
#[test]
fn group_by_three_groups() {
let vals = vec![1u32, 4, 15, 1];
let grouped_vals = group_by_collect(vals.into_iter());
assert_eq!(
grouped_vals,
vec![(0, vec![1, 4]), (1, vec![15]), (0, vec![1])]
);
}
}


@@ -6,12 +6,10 @@ pub use byteorder::LittleEndian as Endianness;
mod bitset; mod bitset;
pub mod file_slice; pub mod file_slice;
mod group_by;
mod serialize; mod serialize;
mod vint; mod vint;
mod writer; mod writer;
pub use bitset::*; pub use bitset::*;
pub use group_by::GroupByIteratorExtended;
pub use ownedbytes::{OwnedBytes, StableDeref}; pub use ownedbytes::{OwnedBytes, StableDeref};
pub use serialize::{BinarySerializable, DeserializeFrom, FixedSize}; pub use serialize::{BinarySerializable, DeserializeFrom, FixedSize};
pub use vint::{ pub use vint::{


@@ -1,17 +1,15 @@
// # Faceted Search // # Basic Example
// //
// This example covers the faceted search functionalities of // This example covers the basic functionalities of
// tantivy. // tantivy.
// //
// We will : // We will :
// - define a text field "name" in our schema // - define our schema
// - define a facet field "classification" in our schema // - create an index in a directory
// - create an index in memory // - index a few documents in our index
// - index a few documents with their respective facets in our index // - search for the best documents matching "sea whale"
// - search for and count the documents whose classification starts with the facet "/Felidae" // - retrieve the best document's original content.
// - search the facet "/Felidae/Pantherinae" and count the documents whose
// classification includes the facet.
//
// --- // ---
// Importing tantivy... // Importing tantivy...
use tantivy::collector::FacetCollector; use tantivy::collector::FacetCollector;
@@ -23,7 +21,7 @@ fn main() -> tantivy::Result<()> {
// Let's create a temporary directory for the sake of this example // Let's create a temporary directory for the sake of this example
let mut schema_builder = Schema::builder(); let mut schema_builder = Schema::builder();
let name = schema_builder.add_text_field("name", TEXT | STORED); let name = schema_builder.add_text_field("felin_name", TEXT | STORED);
// this is our faceted field: its scientific classification // this is our faceted field: its scientific classification
let classification = schema_builder.add_facet_field("classification", FacetOptions::default()); let classification = schema_builder.add_facet_field("classification", FacetOptions::default());
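
The counting flow described in the header would then look roughly like the sketch below, based on the `FacetCollector` already imported by this example; the exact calls are an assumption of this sketch and sit outside this diff, and `searcher` is the searcher the example builds later inside `main() -> tantivy::Result<()>`:

```rust
use tantivy::collector::FacetCollector;
use tantivy::query::AllQuery;

// `classification` is the facet field declared above.
let mut facet_collector = FacetCollector::for_field(classification);
facet_collector.add_facet("/Felidae");
let facet_counts = searcher.search(&AllQuery, &facet_collector)?;
// Document counts per immediate child facet of "/Felidae".
for (facet, count) in facet_counts.get("/Felidae") {
    println!("{facet:?}: {count}");
}
```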


@@ -14,7 +14,7 @@ repository = "https://github.com/quickwit-oss/tantivy"
[dependencies] [dependencies]
common = { version = "0.5", path = "../common/", package = "tantivy-common" } common = { version = "0.5", path = "../common/", package = "tantivy-common" }
tantivy-bitpacker = { version= "0.3", path = "../bitpacker/" } tantivy-bitpacker = { version= "0.3", path = "../bitpacker/" }
prettytable-rs = {version="0.10.0", optional= true} prettytable-rs = {version="0.9.0", optional= true}
rand = {version="0.8.3", optional= true} rand = {version="0.8.3", optional= true}
fastdivide = "0.4" fastdivide = "0.4"
log = "0.4" log = "0.4"


@@ -4,7 +4,7 @@ extern crate test;
#[cfg(test)] #[cfg(test)]
mod tests { mod tests {
use std::ops::RangeInclusive; use std::iter;
use std::sync::Arc; use std::sync::Arc;
use common::OwnedBytes; use common::OwnedBytes;
@@ -71,24 +71,27 @@ mod tests {
}); });
} }
const FIFTY_PERCENT_RANGE: RangeInclusive<u64> = 1..=50; fn get_exp_data() -> Vec<u64> {
const SINGLE_ITEM: u64 = 90;
const SINGLE_ITEM_RANGE: RangeInclusive<u64> = 90..=90;
const ONE_PERCENT_ITEM_RANGE: RangeInclusive<u64> = 49..=49;
fn get_data_50percent_item() -> Vec<u128> {
let mut rng = StdRng::from_seed([1u8; 32]);
let mut data = vec![]; let mut data = vec![];
for _ in 0..300_000 { for i in 0..100 {
let val = rng.gen_range(1..=100); let num = i * i;
data.push(val); data.extend(iter::repeat(i as u64).take(num));
} }
data.push(SINGLE_ITEM); data.shuffle(&mut StdRng::from_seed([1u8; 32]));
data.shuffle(&mut rng); // length = 328350
let data = data.iter().map(|el| *el as u128).collect::<Vec<_>>();
data data
} }
fn get_data_50percent_item() -> (u128, u128, Vec<u128>) {
let mut permutation = get_exp_data();
let major_item = 20;
let minor_item = 10;
permutation.extend(iter::repeat(major_item).take(permutation.len()));
permutation.shuffle(&mut StdRng::from_seed([1u8; 32]));
let permutation = permutation.iter().map(|el| *el as u128).collect::<Vec<_>>();
(major_item as u128, minor_item as u128, permutation)
}
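
As a quick sanity check of the numbers in these generators: `get_exp_data` produces the value `i` repeated `i * i` times for `i` in `0..100`, and duplicating that length with `major_item` is what makes it a 50% hit:

```rust
fn main() {
    // Sum of i*i for i in 0..100 -- the "length = 328350" noted above.
    let n: usize = (0..100usize).map(|i| i * i).sum();
    assert_eq!(n, 328_350);
    // Extending the data with n copies of `major_item` makes that item half of all values.
    assert_eq!(n as f64 / (2 * n) as f64, 0.5);
}
```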
fn get_u128_column_random() -> Arc<dyn Column<u128>> { fn get_u128_column_random() -> Arc<dyn Column<u128>> {
let permutation = generate_random(); let permutation = generate_random();
let permutation = permutation.iter().map(|el| *el as u128).collect::<Vec<_>>(); let permutation = permutation.iter().map(|el| *el as u128).collect::<Vec<_>>();
@@ -103,82 +106,15 @@ mod tests {
open_u128::<u128>(out).unwrap() open_u128::<u128>(out).unwrap()
} }
// U64 RANGE START
#[bench]
fn bench_intfastfield_getrange_u64_50percent_hit(b: &mut Bencher) {
let data = get_data_50percent_item();
let data = data.iter().map(|el| *el as u64).collect::<Vec<_>>();
let column: Arc<dyn Column<u64>> = serialize_and_load(&data);
b.iter(|| {
let mut positions = Vec::new();
column.get_docids_for_value_range(
FIFTY_PERCENT_RANGE,
0..data.len() as u32,
&mut positions,
);
positions
});
}
#[bench]
fn bench_intfastfield_getrange_u64_1percent_hit(b: &mut Bencher) {
let data = get_data_50percent_item();
let data = data.iter().map(|el| *el as u64).collect::<Vec<_>>();
let column: Arc<dyn Column<u64>> = serialize_and_load(&data);
b.iter(|| {
let mut positions = Vec::new();
column.get_docids_for_value_range(
ONE_PERCENT_ITEM_RANGE,
0..data.len() as u32,
&mut positions,
);
positions
});
}
#[bench]
fn bench_intfastfield_getrange_u64_single_hit(b: &mut Bencher) {
let data = get_data_50percent_item();
let data = data.iter().map(|el| *el as u64).collect::<Vec<_>>();
let column: Arc<dyn Column<u64>> = serialize_and_load(&data);
b.iter(|| {
let mut positions = Vec::new();
column.get_docids_for_value_range(
SINGLE_ITEM_RANGE,
0..data.len() as u32,
&mut positions,
);
positions
});
}
#[bench]
fn bench_intfastfield_getrange_u64_hit_all(b: &mut Bencher) {
let data = get_data_50percent_item();
let data = data.iter().map(|el| *el as u64).collect::<Vec<_>>();
let column: Arc<dyn Column<u64>> = serialize_and_load(&data);
b.iter(|| {
let mut positions = Vec::new();
column.get_docids_for_value_range(0..=u64::MAX, 0..data.len() as u32, &mut positions);
positions
});
}
// U64 RANGE END
// U128 RANGE START
#[bench] #[bench]
fn bench_intfastfield_getrange_u128_50percent_hit(b: &mut Bencher) { fn bench_intfastfield_getrange_u128_50percent_hit(b: &mut Bencher) {
let data = get_data_50percent_item(); let (major_item, _minor_item, data) = get_data_50percent_item();
let column = get_u128_column_from_data(&data); let column = get_u128_column_from_data(&data);
b.iter(|| { b.iter(|| {
let mut positions = Vec::new(); let mut positions = Vec::new();
column.get_docids_for_value_range( column.get_docids_for_value_range(
*FIFTY_PERCENT_RANGE.start() as u128..=*FIFTY_PERCENT_RANGE.end() as u128, major_item..=major_item,
0..data.len() as u32, 0..data.len() as u32,
&mut positions, &mut positions,
); );
@@ -188,13 +124,13 @@ mod tests {
#[bench] #[bench]
fn bench_intfastfield_getrange_u128_single_hit(b: &mut Bencher) { fn bench_intfastfield_getrange_u128_single_hit(b: &mut Bencher) {
let data = get_data_50percent_item(); let (_major_item, minor_item, data) = get_data_50percent_item();
let column = get_u128_column_from_data(&data); let column = get_u128_column_from_data(&data);
b.iter(|| { b.iter(|| {
let mut positions = Vec::new(); let mut positions = Vec::new();
column.get_docids_for_value_range( column.get_docids_for_value_range(
*SINGLE_ITEM_RANGE.start() as u128..=*SINGLE_ITEM_RANGE.end() as u128, minor_item..=minor_item,
0..data.len() as u32, 0..data.len() as u32,
&mut positions, &mut positions,
); );
@@ -204,7 +140,7 @@ mod tests {
#[bench] #[bench]
fn bench_intfastfield_getrange_u128_hit_all(b: &mut Bencher) { fn bench_intfastfield_getrange_u128_hit_all(b: &mut Bencher) {
let data = get_data_50percent_item(); let (_major_item, _minor_item, data) = get_data_50percent_item();
let column = get_u128_column_from_data(&data); let column = get_u128_column_from_data(&data);
b.iter(|| { b.iter(|| {
@@ -213,7 +149,6 @@ mod tests {
positions positions
}); });
} }
// U128 RANGE END
#[bench] #[bench]
fn bench_intfastfield_scan_all_fflookup_u128(b: &mut Bencher) { fn bench_intfastfield_scan_all_fflookup_u128(b: &mut Bencher) {

View File

@@ -1,4 +1,3 @@
use std::fmt::{self, Debug};
use std::marker::PhantomData; use std::marker::PhantomData;
use std::ops::{Range, RangeInclusive}; use std::ops::{Range, RangeInclusive};
@@ -7,7 +6,7 @@ use tantivy_bitpacker::minmax;
use crate::monotonic_mapping::StrictlyMonotonicFn; use crate::monotonic_mapping::StrictlyMonotonicFn;
/// `Column` provides columnar access on a field. /// `Column` provides columnar access on a field.
pub trait Column<T: PartialOrd + Debug = u64>: Send + Sync { pub trait Column<T: PartialOrd = u64>: Send + Sync {
/// Return the value associated with the given idx. /// Return the value associated with the given idx.
/// ///
/// This accessor should return as fast as possible. /// This accessor should return as fast as possible.
@@ -35,10 +34,6 @@ pub trait Column<T: PartialOrd + Debug = u64>: Send + Sync {
/// Get the positions of values which are in the provided value range. /// Get the positions of values which are in the provided value range.
/// ///
/// Note that position == docid for single value fast fields /// Note that position == docid for single value fast fields
///
/// # Truncation
/// `DateTime` has a truncation setting. This function should get passed the truncated values
/// to avoid unexpected results.
#[inline] #[inline]
fn get_docids_for_value_range( fn get_docids_for_value_range(
&self, &self,
@@ -88,7 +83,7 @@ pub struct VecColumn<'a, T = u64> {
max_value: T, max_value: T,
} }
impl<'a, C: Column<T>, T: Copy + PartialOrd + fmt::Debug> Column<T> for &'a C { impl<'a, C: Column<T>, T: Copy + PartialOrd> Column<T> for &'a C {
fn get_val(&self, idx: u32) -> T { fn get_val(&self, idx: u32) -> T {
(*self).get_val(idx) (*self).get_val(idx)
} }
@@ -114,7 +109,7 @@ impl<'a, C: Column<T>, T: Copy + PartialOrd + fmt::Debug> Column<T> for &'a C {
} }
} }
impl<'a, T: Copy + PartialOrd + Send + Sync + Debug> Column<T> for VecColumn<'a, T> { impl<'a, T: Copy + PartialOrd + Send + Sync> Column<T> for VecColumn<'a, T> {
fn get_val(&self, position: u32) -> T { fn get_val(&self, position: u32) -> T {
self.values[position as usize] self.values[position as usize]
} }
@@ -141,8 +136,7 @@ impl<'a, T: Copy + PartialOrd + Send + Sync + Debug> Column<T> for VecColumn<'a,
} }
impl<'a, T: Copy + PartialOrd + Default, V> From<&'a V> for VecColumn<'a, T> impl<'a, T: Copy + PartialOrd + Default, V> From<&'a V> for VecColumn<'a, T>
where where V: AsRef<[T]> + ?Sized
V: AsRef<[T]> + ?Sized,
{ {
fn from(values: &'a V) -> Self { fn from(values: &'a V) -> Self {
let values = values.as_ref(); let values = values.as_ref();
@@ -183,8 +177,8 @@ pub fn monotonic_map_column<C, T, Input, Output>(
where where
C: Column<Input>, C: Column<Input>,
T: StrictlyMonotonicFn<Input, Output> + Send + Sync, T: StrictlyMonotonicFn<Input, Output> + Send + Sync,
Input: PartialOrd + Send + Sync + Copy + Debug, Input: PartialOrd + Send + Sync + Clone,
Output: PartialOrd + Send + Sync + Copy + Debug, Output: PartialOrd + Send + Sync + Clone,
{ {
MonotonicMappingColumn { MonotonicMappingColumn {
from_column, from_column,
@@ -197,8 +191,8 @@ impl<C, T, Input, Output> Column<Output> for MonotonicMappingColumn<C, T, Input>
where where
C: Column<Input>, C: Column<Input>,
T: StrictlyMonotonicFn<Input, Output> + Send + Sync, T: StrictlyMonotonicFn<Input, Output> + Send + Sync,
Input: PartialOrd + Send + Sync + Copy + Debug, Input: PartialOrd + Send + Sync + Clone,
Output: PartialOrd + Send + Sync + Copy + Debug, Output: PartialOrd + Send + Sync + Clone,
{ {
#[inline] #[inline]
fn get_val(&self, idx: u32) -> Output { fn get_val(&self, idx: u32) -> Output {
@@ -234,15 +228,12 @@ where
doc_id_range: Range<u32>, doc_id_range: Range<u32>,
positions: &mut Vec<u32>, positions: &mut Vec<u32>,
) { ) {
if range.start() > &self.max_value() || range.end() < &self.min_value() { self.from_column.get_docids_for_value_range(
return; self.monotonic_mapping.inverse(range.start().clone())
} ..=self.monotonic_mapping.inverse(range.end().clone()),
let range = self.monotonic_mapping.inverse_coerce(range); doc_id_range,
if range.start() > range.end() { positions,
return; )
}
self.from_column
.get_docids_for_value_range(range, doc_id_range, positions)
} }
// We voluntarily do not implement get_range as it yields a regression, // We voluntarily do not implement get_range as it yields a regression,
@@ -253,8 +244,7 @@ where
pub struct IterColumn<T>(T); pub struct IterColumn<T>(T);
impl<T> From<T> for IterColumn<T> impl<T> From<T> for IterColumn<T>
where where T: Iterator + Clone + ExactSizeIterator
T: Iterator + Clone + ExactSizeIterator,
{ {
fn from(iter: T) -> Self { fn from(iter: T) -> Self {
IterColumn(iter) IterColumn(iter)
@@ -264,7 +254,7 @@ where
impl<T> Column<T::Item> for IterColumn<T> impl<T> Column<T::Item> for IterColumn<T>
where where
T: Iterator + Clone + ExactSizeIterator + Send + Sync, T: Iterator + Clone + ExactSizeIterator + Send + Sync,
T::Item: PartialOrd + fmt::Debug, T::Item: PartialOrd,
{ {
fn get_val(&self, idx: u32) -> T::Item { fn get_val(&self, idx: u32) -> T::Item {
self.0.clone().nth(idx as usize).unwrap() self.0.clone().nth(idx as usize).unwrap()


@@ -453,8 +453,6 @@ impl CompactSpaceDecompressor {
#[cfg(test)] #[cfg(test)]
mod tests { mod tests {
use std::fmt;
use super::*; use super::*;
use crate::format_version::read_format_version; use crate::format_version::read_format_version;
use crate::null_index_footer::read_null_index_footer; use crate::null_index_footer::read_null_index_footer;
@@ -708,7 +706,7 @@ mod tests {
); );
} }
fn get_positions_for_value_range_helper<C: Column<T> + ?Sized, T: PartialOrd + fmt::Debug>( fn get_positions_for_value_range_helper<C: Column<T> + ?Sized, T: PartialOrd>(
column: &C, column: &C,
value_range: RangeInclusive<T>, value_range: RangeInclusive<T>,
doc_id_range: Range<u32>, doc_id_range: Range<u32>,


@@ -14,9 +14,9 @@ extern crate more_asserts;
#[cfg(all(test, feature = "unstable"))] #[cfg(all(test, feature = "unstable"))]
extern crate test; extern crate test;
use std::io;
use std::io::Write; use std::io::Write;
use std::sync::Arc; use std::sync::Arc;
use std::{fmt, io};
use common::{BinarySerializable, OwnedBytes}; use common::{BinarySerializable, OwnedBytes};
use compact_space::CompactSpaceDecompressor; use compact_space::CompactSpaceDecompressor;
@@ -133,7 +133,7 @@ impl U128FastFieldCodecType {
} }
/// Returns the correct codec reader wrapped in the `Arc` for the data. /// Returns the correct codec reader wrapped in the `Arc` for the data.
pub fn open_u128<Item: MonotonicallyMappableToU128 + fmt::Debug>( pub fn open_u128<Item: MonotonicallyMappableToU128>(
bytes: OwnedBytes, bytes: OwnedBytes,
) -> io::Result<Arc<dyn Column<Item>>> { ) -> io::Result<Arc<dyn Column<Item>>> {
let (bytes, _format_version) = read_format_version(bytes)?; let (bytes, _format_version) = read_format_version(bytes)?;
@@ -147,9 +147,7 @@ pub fn open_u128<Item: MonotonicallyMappableToU128 + fmt::Debug>(
} }
/// Returns the correct codec reader wrapped in the `Arc` for the data. /// Returns the correct codec reader wrapped in the `Arc` for the data.
pub fn open<T: MonotonicallyMappableToU64 + fmt::Debug>( pub fn open<T: MonotonicallyMappableToU64>(bytes: OwnedBytes) -> io::Result<Arc<dyn Column<T>>> {
bytes: OwnedBytes,
) -> io::Result<Arc<dyn Column<T>>> {
let (bytes, _format_version) = read_format_version(bytes)?; let (bytes, _format_version) = read_format_version(bytes)?;
let (mut bytes, _null_index_footer) = read_null_index_footer(bytes)?; let (mut bytes, _null_index_footer) = read_null_index_footer(bytes)?;
let header = Header::deserialize(&mut bytes)?; let header = Header::deserialize(&mut bytes)?;
@@ -162,7 +160,7 @@ pub fn open<T: MonotonicallyMappableToU64 + fmt::Debug>(
} }
} }
fn open_specific_codec<C: FastFieldCodec, Item: MonotonicallyMappableToU64 + fmt::Debug>( fn open_specific_codec<C: FastFieldCodec, Item: MonotonicallyMappableToU64>(
bytes: OwnedBytes, bytes: OwnedBytes,
header: &Header, header: &Header,
) -> io::Result<Arc<dyn Column<Item>>> { ) -> io::Result<Arc<dyn Column<Item>>> {
@@ -323,9 +321,6 @@ mod tests {
pub fn get_codec_test_datasets() -> Vec<(Vec<u64>, &'static str)> { pub fn get_codec_test_datasets() -> Vec<(Vec<u64>, &'static str)> {
let mut data_and_names = vec![]; let mut data_and_names = vec![];
let data = vec![10];
data_and_names.push((data, "minimal test"));
let data = (10..=10_000_u64).collect::<Vec<_>>(); let data = (10..=10_000_u64).collect::<Vec<_>>();
data_and_names.push((data, "simple monotonically increasing")); data_and_names.push((data, "simple monotonically increasing"));
@@ -333,9 +328,6 @@ mod tests {
vec![5, 6, 7, 8, 9, 10, 99, 100], vec![5, 6, 7, 8, 9, 10, 99, 100],
"offset in linear interpol", "offset in linear interpol",
)); ));
data_and_names.push((vec![3, 18446744073709551613, 5], "docid range regression"));
data_and_names.push((vec![5, 50, 3, 13, 1, 1000, 35], "rand small")); data_and_names.push((vec![5, 50, 3, 13, 1, 1000, 35], "rand small"));
data_and_names.push((vec![10], "single value")); data_and_names.push((vec![10], "single value"));


@@ -1,6 +1,4 @@
use std::fmt;
use std::marker::PhantomData; use std::marker::PhantomData;
use std::ops::RangeInclusive;
use fastdivide::DividerU64; use fastdivide::DividerU64;
@@ -8,9 +6,7 @@ use crate::MonotonicallyMappableToU128;
/// Monotonically maps a value to u64 value space. /// Monotonically maps a value to u64 value space.
/// Monotonic mapping enables `PartialOrd` on u64 space without conversion to original space. /// Monotonic mapping enables `PartialOrd` on u64 space without conversion to original space.
pub trait MonotonicallyMappableToU64: pub trait MonotonicallyMappableToU64: 'static + PartialOrd + Copy + Send + Sync {
'static + PartialOrd + Copy + Send + Sync + fmt::Debug
{
/// Converts a value to u64. /// Converts a value to u64.
/// ///
/// Internally all fast field values are encoded as u64. /// Internally all fast field values are encoded as u64.
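
For intuition, the classic order-preserving `i64 -> u64` mapping flips the sign bit; whether this exact formula is the one the crate uses is not visible in this diff, so treat the sketch as an illustration of the trait's contract only:

```rust
fn i64_to_u64(val: i64) -> u64 {
    (val as u64) ^ (1u64 << 63)
}

fn u64_to_i64(val: u64) -> i64 {
    (val ^ (1u64 << 63)) as i64
}

fn main() {
    assert!(i64_to_u64(-1) < i64_to_u64(0)); // `<` survives the sign boundary
    assert_eq!(u64_to_i64(i64_to_u64(-42)), -42); // lossless round trip
}
```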
@@ -33,29 +29,11 @@ pub trait MonotonicallyMappableToU64:
/// mapping from their range to their domain. The `inverse` method is required when opening a codec, /// mapping from their range to their domain. The `inverse` method is required when opening a codec,
/// so a value can be converted back to its original domain (e.g. ip address or f64) from its /// so a value can be converted back to its original domain (e.g. ip address or f64) from its
/// internal representation. /// internal representation.
pub trait StrictlyMonotonicFn<External: Copy, Internal: Copy> { pub trait StrictlyMonotonicFn<External, Internal> {
/// Strictly monotonically maps the value from External to Internal. /// Strictly monotonically maps the value from External to Internal.
fn mapping(&self, inp: External) -> Internal; fn mapping(&self, inp: External) -> Internal;
/// Inverse of `mapping`. Maps the value from Internal to External. /// Inverse of `mapping`. Maps the value from Internal to External.
fn inverse(&self, out: Internal) -> External; fn inverse(&self, out: Internal) -> External;
/// Maps a user-provided value from External to Internal.
/// It may be necessary to coerce the value if it is outside the value space.
/// In that case it tries to find the next greater value in the value space.
///
/// Returns a bool to mark if a value was outside the value space and had to be coerced _up_.
/// With that information we can detect if two values in a range both map outside the same value
/// space.
///
/// coerce_up means the next valid upper value in the value space will be chosen if the value
/// has to be coerced.
fn mapping_coerce(&self, inp: RangeInclusive<External>) -> RangeInclusive<Internal> {
self.mapping(*inp.start())..=self.mapping(*inp.end())
}
/// Inverse of `mapping_coerce`.
fn inverse_coerce(&self, out: RangeInclusive<Internal>) -> RangeInclusive<External> {
self.inverse(*out.start())..=self.inverse(*out.end())
}
} }
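
A minimal picture of the `mapping`/`inverse` contract, using a plain shift-by-minimum mapping similar in spirit to the base-value mapping further down (a standalone sketch, not the crate's trait impl):

```rust
struct ShiftByMin {
    min_value: u64,
}

impl ShiftByMin {
    fn mapping(&self, inp: u64) -> u64 {
        inp - self.min_value
    }
    fn inverse(&self, out: u64) -> u64 {
        out + self.min_value
    }
}

fn main() {
    let m = ShiftByMin { min_value: 100 };
    assert_eq!(m.inverse(m.mapping(142)), 142); // inverse undoes mapping
    assert!(m.mapping(120) < m.mapping(130)); // strictly monotonic
}
```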
/// Inverts a strictly monotonic mapping from `StrictlyMonotonicFn<A, B>` to /// Inverts a strictly monotonic mapping from `StrictlyMonotonicFn<A, B>` to
@@ -76,10 +54,7 @@ impl<T> From<T> for StrictlyMonotonicMappingInverter<T> {
} }
impl<From, To, T> StrictlyMonotonicFn<To, From> for StrictlyMonotonicMappingInverter<T> impl<From, To, T> StrictlyMonotonicFn<To, From> for StrictlyMonotonicMappingInverter<T>
where where T: StrictlyMonotonicFn<From, To>
T: StrictlyMonotonicFn<From, To>,
From: Copy,
To: Copy,
{ {
#[inline(always)] #[inline(always)]
fn mapping(&self, val: To) -> From { fn mapping(&self, val: To) -> From {
@@ -90,15 +65,6 @@ where
fn inverse(&self, val: From) -> To { fn inverse(&self, val: From) -> To {
self.orig_mapping.mapping(val) self.orig_mapping.mapping(val)
} }
#[inline]
fn mapping_coerce(&self, inp: RangeInclusive<To>) -> RangeInclusive<From> {
self.orig_mapping.inverse_coerce(inp)
}
#[inline]
fn inverse_coerce(&self, out: RangeInclusive<From>) -> RangeInclusive<To> {
self.orig_mapping.mapping_coerce(out)
}
} }
/// Applies the strictly monotonic mapping from `T` without any additional changes. /// Applies the strictly monotonic mapping from `T` without any additional changes.
@@ -176,31 +142,6 @@ impl<External: MonotonicallyMappableToU64> StrictlyMonotonicFn<External, u64>
fn inverse(&self, out: u64) -> External { fn inverse(&self, out: u64) -> External {
External::from_u64(self.min_value + out * self.gcd) External::from_u64(self.min_value + out * self.gcd)
} }
#[inline]
#[allow(clippy::reversed_empty_ranges)]
fn mapping_coerce(&self, inp: RangeInclusive<External>) -> RangeInclusive<u64> {
let end = External::to_u64(*inp.end());
if end < self.min_value || inp.end() < inp.start() {
return 1..=0;
}
let map_coerce = |mut inp, coerce_up| {
let inp_lower_bound = self.inverse(0);
if inp < inp_lower_bound {
inp = inp_lower_bound;
}
let val = External::to_u64(inp);
let need_coercion = coerce_up && (val - self.min_value) % self.gcd != 0;
let mut mapped_val = self.mapping(inp);
if need_coercion {
mapped_val += 1;
}
mapped_val
};
let start = map_coerce(*inp.start(), true);
let end = map_coerce(*inp.end(), false);
start..=end
}
} }
/// Strictly monotonic mapping with a base value. /// Strictly monotonic mapping with a base value.
@@ -217,17 +158,6 @@ impl StrictlyMonotonicMappingToInternalBaseval {
impl<External: MonotonicallyMappableToU64> StrictlyMonotonicFn<External, u64> impl<External: MonotonicallyMappableToU64> StrictlyMonotonicFn<External, u64>
for StrictlyMonotonicMappingToInternalBaseval for StrictlyMonotonicMappingToInternalBaseval
{ {
#[inline]
#[allow(clippy::reversed_empty_ranges)]
fn mapping_coerce(&self, inp: RangeInclusive<External>) -> RangeInclusive<u64> {
if External::to_u64(*inp.end()) < self.min_value {
return 1..=0;
}
let start = self.mapping(External::to_u64(*inp.start()).max(self.min_value));
let end = self.mapping(External::to_u64(*inp.end()));
start..=end
}
#[inline(always)] #[inline(always)]
fn mapping(&self, val: External) -> u64 { fn mapping(&self, val: External) -> u64 {
External::to_u64(val) - self.min_value External::to_u64(val) - self.min_value
@@ -311,7 +241,7 @@ mod tests {
test_round_trip::<_, _, u64>(&mapping, 100u64); test_round_trip::<_, _, u64>(&mapping, 100u64);
} }
fn test_round_trip<T: StrictlyMonotonicFn<K, L>, K: std::fmt::Debug + Eq + Copy, L: Copy>( fn test_round_trip<T: StrictlyMonotonicFn<K, L>, K: std::fmt::Debug + Eq + Copy, L>(
mapping: &T, mapping: &T,
test_val: K, test_val: K,
) { ) {


@@ -1,11 +1,8 @@
use std::fmt;
use std::net::Ipv6Addr; use std::net::Ipv6Addr;
/// Monotonically maps a value to u128 value space /// Monotonically maps a value to u128 value space
/// Monotonic mapping enables `PartialOrd` on u128 space without conversion to original space. /// Monotonic mapping enables `PartialOrd` on u128 space without conversion to original space.
pub trait MonotonicallyMappableToU128: pub trait MonotonicallyMappableToU128: 'static + PartialOrd + Copy + Send + Sync {
'static + PartialOrd + Copy + Send + Sync + fmt::Debug
{
/// Converts a value to u128. /// Converts a value to u128.
/// ///
/// Internally all fast field values are encoded as u64. /// Internally all fast field values are encoded as u64.


@@ -31,16 +31,15 @@ const BLOCK_BITVEC_SIZE: usize = 8;
const BLOCK_OFFSET_SIZE: usize = 4; const BLOCK_OFFSET_SIZE: usize = 4;
const SERIALIZED_BLOCK_SIZE: usize = BLOCK_BITVEC_SIZE + BLOCK_OFFSET_SIZE; const SERIALIZED_BLOCK_SIZE: usize = BLOCK_BITVEC_SIZE + BLOCK_OFFSET_SIZE;
/// Interprets the bitvec as a list of 64 bits, from the least significant to the
/// most significant.
///
/// This function returns the number of bits set to 1 within
/// `[0..pos_in_bitvec)`.
#[inline] #[inline]
fn count_ones(bitvec: u64, pos_in_bitvec: u32) -> u32 { fn count_ones(bitvec: u64, pos_in_bitvec: u32) -> u32 {
let mask = (1u64 << pos_in_bitvec) - 1; if pos_in_bitvec == 63 {
let masked_bitvec = bitvec & mask; bitvec.count_ones()
masked_bitvec.count_ones() } else {
let mask = (1u64 << (pos_in_bitvec + 1)) - 1;
let masked_bitvec = bitvec & mask;
masked_bitvec.count_ones()
}
} }
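
The two variants above differ in whether the bit at `pos_in_bitvec` itself is counted. A tiny self-contained check of the exclusive-prefix variant (ones strictly below the position), matching the doc comment:

```rust
fn count_ones(bitvec: u64, pos_in_bitvec: u32) -> u32 {
    let mask = (1u64 << pos_in_bitvec) - 1;
    (bitvec & mask).count_ones()
}

fn main() {
    let block: u64 = 0b101; // bits 0 and 2 set
    assert_eq!(count_ones(block, 0), 0); // nothing strictly before position 0
    assert_eq!(count_ones(block, 1), 1); // bit 0
    assert_eq!(count_ones(block, 3), 2); // bits 0 and 2
}
```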
#[derive(Clone, Copy)] #[derive(Clone, Copy)]
@@ -67,7 +66,9 @@ impl DenseCodec {
pub fn exists(&self, idx: u32) -> bool { pub fn exists(&self, idx: u32) -> bool {
let block_pos = idx / ELEMENTS_PER_BLOCK; let block_pos = idx / ELEMENTS_PER_BLOCK;
let bitvec = self.dense_index_block(block_pos).bitvec; let bitvec = self.dense_index_block(block_pos).bitvec;
let pos_in_bitvec = idx % ELEMENTS_PER_BLOCK; let pos_in_bitvec = idx % ELEMENTS_PER_BLOCK;
get_bit_at(bitvec, pos_in_bitvec) get_bit_at(bitvec, pos_in_bitvec)
} }
#[inline] #[inline]
@@ -89,7 +90,8 @@ impl DenseCodec {
let pos_in_block_bit_vec = idx % ELEMENTS_PER_BLOCK; let pos_in_block_bit_vec = idx % ELEMENTS_PER_BLOCK;
let ones_in_block = count_ones(index_block.bitvec, pos_in_block_bit_vec); let ones_in_block = count_ones(index_block.bitvec, pos_in_block_bit_vec);
if get_bit_at(index_block.bitvec, pos_in_block_bit_vec) { if get_bit_at(index_block.bitvec, pos_in_block_bit_vec) {
Some(index_block.offset + ones_in_block) // -1 is ok, since idx does exist, so there's at least one
Some(index_block.offset + ones_in_block - 1)
} else { } else {
None None
} }
@@ -317,10 +319,9 @@ mod tests {
set_bit_at(&mut block, 0); set_bit_at(&mut block, 0);
set_bit_at(&mut block, 2); set_bit_at(&mut block, 2);
assert_eq!(count_ones(block, 0), 0); assert_eq!(count_ones(block, 0), 1);
assert_eq!(count_ones(block, 1), 1); assert_eq!(count_ones(block, 1), 1);
assert_eq!(count_ones(block, 2), 1); assert_eq!(count_ones(block, 2), 2);
assert_eq!(count_ones(block, 3), 2);
} }
} }
@@ -346,16 +347,11 @@ mod bench {
codec codec
} }
fn random_range_iterator( fn random_range_iterator(start: u32, end: u32, step_size: u32) -> impl Iterator<Item = u32> {
start: u32,
end: u32,
avg_step_size: u32,
avg_deviation: u32,
) -> impl Iterator<Item = u32> {
let mut rng: StdRng = StdRng::from_seed([1u8; 32]); let mut rng: StdRng = StdRng::from_seed([1u8; 32]);
let mut current = start; let mut current = start;
std::iter::from_fn(move || { std::iter::from_fn(move || {
current += rng.gen_range(avg_step_size - avg_deviation..=avg_step_size + avg_deviation); current += rng.gen_range(1..step_size + 1);
if current >= end { if current >= end {
None None
} else { } else {
@@ -364,17 +360,10 @@ mod bench {
}) })
} }
fn n_percent_step_iterator(percent: f32, num_values: u32) -> impl Iterator<Item = u32> { fn walk_over_data(codec: &DenseCodec, max_step_size: u32) -> Option<u32> {
let ratio = percent as f32 / 100.0;
let step_size = (1f32 / ratio) as u32;
let deviation = step_size - 1;
random_range_iterator(0, num_values, step_size, deviation)
}
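
For reference, the percentage-based stride above works out as follows: for `percent = 10.0`, the step size is 10 and steps are drawn uniformly from 1..=19, whose mean is 10, so roughly one position in ten gets visited. A quick check of that arithmetic:

```rust
fn main() {
    // percent = 10.0 -> ratio = 0.1 -> step_size = (1.0 / 0.1) as u32 = 10, deviation = 9.
    let step_size: u32 = 10;
    let deviation = step_size - 1;
    // random_range_iterator then draws steps from 1..=19, averaging `step_size`.
    assert_eq!((step_size - deviation, step_size + deviation), (1, 19));
}
```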
fn walk_over_data(codec: &DenseCodec, avg_step_size: u32) -> Option<u32> {
walk_over_data_from_positions( walk_over_data_from_positions(
codec, codec,
random_range_iterator(0, TOTAL_NUM_VALUES, avg_step_size, 0), random_range_iterator(0, TOTAL_NUM_VALUES, max_step_size),
) )
} }
@@ -390,105 +379,69 @@ mod bench {
} }
#[bench] #[bench]
fn bench_translate_orig_to_codec_1percent_filled_10percent_hit(bench: &mut Bencher) { fn bench_dense_codec_translate_orig_to_codec_90percent_filled_random_stride(
let codec = gen_bools(0.01f64); bench: &mut Bencher,
bench.iter(|| walk_over_data(&codec, 100)); ) {
}
#[bench]
fn bench_translate_orig_to_codec_5percent_filled_10percent_hit(bench: &mut Bencher) {
let codec = gen_bools(0.05f64);
bench.iter(|| walk_over_data(&codec, 100));
}
#[bench]
fn bench_translate_orig_to_codec_5percent_filled_1percent_hit(bench: &mut Bencher) {
let codec = gen_bools(0.05f64);
bench.iter(|| walk_over_data(&codec, 1000));
}
#[bench]
fn bench_translate_orig_to_codec_full_scan_1percent_filled(bench: &mut Bencher) {
let codec = gen_bools(0.01f64);
bench.iter(|| walk_over_data_from_positions(&codec, 0..TOTAL_NUM_VALUES));
}
#[bench]
fn bench_translate_orig_to_codec_full_scan_10percent_filled(bench: &mut Bencher) {
let codec = gen_bools(0.1f64);
bench.iter(|| walk_over_data_from_positions(&codec, 0..TOTAL_NUM_VALUES));
}
#[bench]
fn bench_translate_orig_to_codec_full_scan_90percent_filled(bench: &mut Bencher) {
let codec = gen_bools(0.9f64); let codec = gen_bools(0.9f64);
bench.iter(|| walk_over_data_from_positions(&codec, 0..TOTAL_NUM_VALUES));
}
#[bench]
fn bench_translate_orig_to_codec_10percent_filled_1percent_hit(bench: &mut Bencher) {
let codec = gen_bools(0.1f64);
bench.iter(|| walk_over_data(&codec, 100)); bench.iter(|| walk_over_data(&codec, 100));
} }
#[bench] #[bench]
fn bench_translate_orig_to_codec_50percent_filled_1percent_hit(bench: &mut Bencher) { fn bench_dense_codec_translate_orig_to_codec_50percent_filled_random_stride(
bench: &mut Bencher,
) {
let codec = gen_bools(0.5f64); let codec = gen_bools(0.5f64);
bench.iter(|| walk_over_data(&codec, 100)); bench.iter(|| walk_over_data(&codec, 100));
} }
#[bench] #[bench]
fn bench_translate_orig_to_codec_90percent_filled_1percent_hit(bench: &mut Bencher) { fn bench_dense_codec_translate_orig_to_codec_full_scan_10percent(bench: &mut Bencher) {
let codec = gen_bools(0.1f64);
bench.iter(|| walk_over_data_from_positions(&codec, 0..TOTAL_NUM_VALUES));
}
#[bench]
fn bench_dense_codec_translate_orig_to_codec_full_scan_90percent(bench: &mut Bencher) {
let codec = gen_bools(0.9f64); let codec = gen_bools(0.9f64);
bench.iter(|| walk_over_data_from_positions(&codec, 0..TOTAL_NUM_VALUES));
}
#[bench]
fn bench_dense_codec_translate_orig_to_codec_10percent_filled_random_stride(
bench: &mut Bencher,
) {
let codec = gen_bools(0.1f64);
bench.iter(|| walk_over_data(&codec, 100)); bench.iter(|| walk_over_data(&codec, 100));
} }
#[bench] #[bench]
fn bench_translate_codec_to_orig_1percent_filled_0comma005percent_hit(bench: &mut Bencher) { fn bench_dense_codec_translate_codec_to_orig_90percent_filled_random_stride_big_step(
let codec = gen_bools(0.01f64); bench: &mut Bencher,
let num_non_nulls = codec.num_non_nulls(); ) {
bench.iter(|| { let codec = gen_bools(0.9f64);
codec
.translate_codec_idx_to_original_idx(n_percent_step_iterator(0.005, num_non_nulls))
.last()
});
}
#[bench]
fn bench_translate_codec_to_orig_1percent_filled_10percent_hit(bench: &mut Bencher) {
let codec = gen_bools(0.01f64);
let num_non_nulls = codec.num_non_nulls();
bench.iter(|| {
codec
.translate_codec_idx_to_original_idx(n_percent_step_iterator(10.0, num_non_nulls))
.last()
});
}
#[bench]
fn bench_translate_codec_to_orig_1percent_filled_full_scan(bench: &mut Bencher) {
let codec = gen_bools(0.01f64);
let num_vals = codec.num_non_nulls(); let num_vals = codec.num_non_nulls();
bench.iter(|| { bench.iter(|| {
codec codec
.translate_codec_idx_to_original_idx(0..num_vals) .translate_codec_idx_to_original_idx(random_range_iterator(0, num_vals, 50_000))
.last() .last()
}); });
} }
#[bench] #[bench]
fn bench_translate_codec_to_orig_90percent_filled_0comma005percent_hit(bench: &mut Bencher) { fn bench_dense_codec_translate_codec_to_orig_90percent_filled_random_stride(
let codec = gen_bools(0.90f64); bench: &mut Bencher,
let num_non_nulls = codec.num_non_nulls(); ) {
let codec = gen_bools(0.9f64);
let num_vals = codec.num_non_nulls();
bench.iter(|| { bench.iter(|| {
codec codec
.translate_codec_idx_to_original_idx(n_percent_step_iterator(0.005, num_non_nulls)) .translate_codec_idx_to_original_idx(random_range_iterator(0, num_vals, 100))
.last() .last()
}); });
} }
#[bench] #[bench]
fn bench_translate_codec_to_orig_90percent_filled_full_scan(bench: &mut Bencher) { fn bench_dense_codec_translate_codec_to_orig_90percent_filled_full_scan(bench: &mut Bencher) {
let codec = gen_bools(0.9f64); let codec = gen_bools(0.9f64);
let num_vals = codec.num_non_nulls(); let num_vals = codec.num_non_nulls();
bench.iter(|| { bench.iter(|| {


@@ -1,6 +1,6 @@
use std::io::{self, Write}; use std::io::{self, Write};
use common::{BitSet, GroupByIteratorExtended, OwnedBytes}; use common::{BitSet, OwnedBytes};
use super::{serialize_dense_codec, DenseCodec}; use super::{serialize_dense_codec, DenseCodec};
@@ -78,22 +78,12 @@ struct DenseBlock {
} }
impl DenseBlock { impl DenseBlock {
#[inline]
pub fn exists(&self, idx: u32) -> bool { pub fn exists(&self, idx: u32) -> bool {
self.codec.exists(idx) self.codec.exists(idx)
} }
#[inline]
pub fn translate_to_codec_idx(&self, idx: u32) -> Option<u32> { pub fn translate_to_codec_idx(&self, idx: u32) -> Option<u32> {
self.codec.translate_to_codec_idx(idx) self.codec.translate_to_codec_idx(idx)
} }
#[inline]
pub fn translate_codec_idx_to_original_idx_iter<'a>(
&'a self,
iter: impl Iterator<Item = u32> + 'a,
) -> impl Iterator<Item = u32> + 'a {
self.codec.translate_codec_idx_to_original_idx(iter)
}
#[inline]
pub fn translate_codec_idx_to_original_idx(&self, idx: u32) -> u32 { pub fn translate_codec_idx_to_original_idx(&self, idx: u32) -> u32 {
self.codec self.codec
.translate_codec_idx_to_original_idx(idx..=idx) .translate_codec_idx_to_original_idx(idx..=idx)
@@ -217,7 +207,6 @@ struct ValueAddr {
} }
/// Splits an idx into block index and value in the block /// Splits an idx into block index and value in the block
#[inline]
fn value_addr(idx: u32) -> ValueAddr { fn value_addr(idx: u32) -> ValueAddr {
/// Static assert number elements per block this method expects /// Static assert number elements per block this method expects
#[allow(clippy::assertions_on_constants)] #[allow(clippy::assertions_on_constants)]
@@ -284,7 +273,6 @@ impl SparseCodec {
} }
} }
#[inline]
fn find_block(&self, dense_idx: u32, mut block_pos: u32) -> u32 { fn find_block(&self, dense_idx: u32, mut block_pos: u32) -> u32 {
loop { loop {
let offset = self.blocks[block_pos as usize].offset(); let offset = self.blocks[block_pos as usize].offset();
@@ -296,7 +284,6 @@ impl SparseCodec {
} }
/// Translate positions from the codec index to the original index. /// Translate positions from the codec index to the original index.
/// Correctness: the provided values must be increasing
/// ///
/// # Panics /// # Panics
/// ///
@@ -305,41 +292,35 @@ impl SparseCodec {
&'a self, &'a self,
iter: impl Iterator<Item = u32> + 'a, iter: impl Iterator<Item = u32> + 'a,
) -> impl Iterator<Item = u32> + 'a { ) -> impl Iterator<Item = u32> + 'a {
// TODO: There's a big potential performance gain in using iterators per block instead of
// random access for each element in a block.
// itertools' group_by won't help though, since it requires a temporary local variable.
let mut block_pos = 0u32; let mut block_pos = 0u32;
iter.group_by(move |codec_idx| { iter.map(move |codec_idx| {
block_pos = self.find_block(*codec_idx, block_pos); // update block_pos to limit search scope
block_pos block_pos = self.find_block(codec_idx, block_pos);
})
.flat_map(move |(block_pos, block_iter)| {
let block_doc_idx_start = block_pos * ELEMENTS_PER_BLOCK; let block_doc_idx_start = block_pos * ELEMENTS_PER_BLOCK;
let block = &self.blocks[block_pos as usize]; let block = &self.blocks[block_pos as usize];
let offset = block.offset(); let idx_in_block = codec_idx - block.offset();
let indexes_in_block_iter = block_iter.map(move |codec_idx| codec_idx - offset);
match block { match block {
SparseCodecBlockVariant::Empty { offset: _ } => { SparseCodecBlockVariant::Empty { offset: _ } => {
panic!( panic!(
"invalid input, cannot translate to original index. associated empty \ "invalid input, cannot translate to original index. associated empty \
block with dense idx. block_pos {}, idx_in_block {:?}", block with dense idx. block_pos {}, idx_in_block {}",
block_pos, block_pos, idx_in_block
indexes_in_block_iter.collect::<Vec<_>>()
) )
} }
SparseCodecBlockVariant::Dense(dense) => { SparseCodecBlockVariant::Dense(dense) => {
Box::new(dense.translate_codec_idx_to_original_idx_iter(indexes_in_block_iter)) dense.translate_codec_idx_to_original_idx(idx_in_block) + block_doc_idx_start
as Box<dyn Iterator<Item = u32>>
} }
SparseCodecBlockVariant::Sparse(block) => { SparseCodecBlockVariant::Sparse(block) => {
Box::new(indexes_in_block_iter.map(move |idx_in_block| { block.value_at_idx(&self.data, idx_in_block as u16) as u32 + block_doc_idx_start
block.value_at_idx(&self.data, idx_in_block as u16) as u32
}))
} }
} }
.map(move |idx| idx + block_doc_idx_start)
}) })
} }
} }
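
The per-block grouping in the `group_by`-based variant can be seen in isolation with the `GroupByIteratorExtended` helper defined earlier in this diff; the block size and values here are made up for illustration:

```rust
use tantivy_common::GroupByIteratorExtended;

fn main() {
    // Increasing codec indexes, grouped by the (hypothetical) 1024-element block they fall in,
    // so per-block lookups happen once per group instead of once per element.
    let codec_idxs = [1u32, 5, 7, 1200, 1300, 2500];
    let grouped: Vec<(u32, Vec<u32>)> = codec_idxs
        .into_iter()
        .group_by(|idx| idx / 1024)
        .map(|(block, it)| (block, it.collect()))
        .collect();
    assert_eq!(
        grouped,
        vec![(0, vec![1, 5, 7]), (1, vec![1200, 1300]), (2, vec![2500])]
    );
}
```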
#[inline]
fn is_sparse(num_elem_in_block: u32) -> bool { fn is_sparse(num_elem_in_block: u32) -> bool {
num_elem_in_block < DENSE_BLOCK_THRESHOLD num_elem_in_block < DENSE_BLOCK_THRESHOLD
} }
@@ -614,16 +595,11 @@ mod bench {
codec codec
} }
fn random_range_iterator( fn random_range_iterator(start: u32, end: u32, step_size: u32) -> impl Iterator<Item = u32> {
start: u32,
end: u32,
avg_step_size: u32,
avg_deviation: u32,
) -> impl Iterator<Item = u32> {
let mut rng: StdRng = StdRng::from_seed([1u8; 32]); let mut rng: StdRng = StdRng::from_seed([1u8; 32]);
let mut current = start; let mut current = start;
std::iter::from_fn(move || { std::iter::from_fn(move || {
current += rng.gen_range(avg_step_size - avg_deviation..=avg_step_size + avg_deviation); current += rng.gen_range(1..step_size + 1);
if current >= end { if current >= end {
None None
} else { } else {
@@ -632,17 +608,10 @@ mod bench {
}) })
} }
fn n_percent_step_iterator(percent: f32, num_values: u32) -> impl Iterator<Item = u32> { fn walk_over_data(codec: &SparseCodec, max_step_size: u32) -> Option<u32> {
let ratio = percent as f32 / 100.0;
let step_size = (1f32 / ratio) as u32;
let deviation = step_size - 1;
random_range_iterator(0, num_values, step_size, deviation)
}
fn walk_over_data(codec: &SparseCodec, avg_step_size: u32) -> Option<u32> {
walk_over_data_from_positions( walk_over_data_from_positions(
codec, codec,
random_range_iterator(0, TOTAL_NUM_VALUES, avg_step_size, 0), random_range_iterator(0, TOTAL_NUM_VALUES, max_step_size),
) )
} }
@@ -658,83 +627,83 @@ mod bench {
} }
#[bench] #[bench]
fn bench_translate_orig_to_codec_1percent_filled_10percent_hit(bench: &mut Bencher) { fn bench_sparse_codec_translate_orig_to_codec_1percent_filled_random_stride(
bench: &mut Bencher,
) {
let codec = gen_bools(0.01f64); let codec = gen_bools(0.01f64);
bench.iter(|| walk_over_data(&codec, 100)); bench.iter(|| walk_over_data(&codec, 100));
} }
#[bench] #[bench]
fn bench_translate_orig_to_codec_5percent_filled_10percent_hit(bench: &mut Bencher) { fn bench_sparse_codec_translate_orig_to_codec_5percent_filled_random_stride(
bench: &mut Bencher,
) {
let codec = gen_bools(0.05f64); let codec = gen_bools(0.05f64);
bench.iter(|| walk_over_data(&codec, 100)); bench.iter(|| walk_over_data(&codec, 100));
} }
#[bench] #[bench]
fn bench_translate_orig_to_codec_5percent_filled_1percent_hit(bench: &mut Bencher) { fn bench_sparse_codec_translate_orig_to_codec_full_scan_10percent(bench: &mut Bencher) {
let codec = gen_bools(0.05f64);
bench.iter(|| walk_over_data(&codec, 1000));
}
#[bench]
fn bench_translate_orig_to_codec_full_scan_1percent_filled(bench: &mut Bencher) {
let codec = gen_bools(0.01f64);
bench.iter(|| walk_over_data_from_positions(&codec, 0..TOTAL_NUM_VALUES));
}
#[bench]
fn bench_translate_orig_to_codec_full_scan_10percent_filled(bench: &mut Bencher) {
let codec = gen_bools(0.1f64); let codec = gen_bools(0.1f64);
bench.iter(|| walk_over_data_from_positions(&codec, 0..TOTAL_NUM_VALUES)); bench.iter(|| walk_over_data_from_positions(&codec, 0..TOTAL_NUM_VALUES));
} }
#[bench] #[bench]
fn bench_translate_orig_to_codec_full_scan_90percent_filled(bench: &mut Bencher) { fn bench_sparse_codec_translate_orig_to_codec_full_scan_90percent(bench: &mut Bencher) {
let codec = gen_bools(0.9f64); let codec = gen_bools(0.9f64);
bench.iter(|| walk_over_data_from_positions(&codec, 0..TOTAL_NUM_VALUES)); bench.iter(|| walk_over_data_from_positions(&codec, 0..TOTAL_NUM_VALUES));
} }
#[bench] #[bench]
fn bench_translate_orig_to_codec_10percent_filled_1percent_hit(bench: &mut Bencher) { fn bench_sparse_codec_translate_orig_to_codec_full_scan_1percent(bench: &mut Bencher) {
let codec = gen_bools(0.01f64);
bench.iter(|| walk_over_data_from_positions(&codec, 0..TOTAL_NUM_VALUES));
}
#[bench]
fn bench_sparse_codec_translate_orig_to_codec_10percent_filled_random_stride(
bench: &mut Bencher,
) {
let codec = gen_bools(0.1f64); let codec = gen_bools(0.1f64);
bench.iter(|| walk_over_data(&codec, 100)); bench.iter(|| walk_over_data(&codec, 100));
} }
#[bench] #[bench]
fn bench_translate_orig_to_codec_50percent_filled_1percent_hit(bench: &mut Bencher) { fn bench_sparse_codec_translate_orig_to_codec_90percent_filled_random_stride(
let codec = gen_bools(0.5f64); bench: &mut Bencher,
bench.iter(|| walk_over_data(&codec, 100)); ) {
}
#[bench]
fn bench_translate_orig_to_codec_90percent_filled_1percent_hit(bench: &mut Bencher) {
let codec = gen_bools(0.9f64); let codec = gen_bools(0.9f64);
bench.iter(|| walk_over_data(&codec, 100)); bench.iter(|| walk_over_data(&codec, 100));
} }
#[bench] #[bench]
fn bench_translate_codec_to_orig_1percent_filled_0comma005percent_hit(bench: &mut Bencher) { fn bench_sparse_codec_translate_codec_to_orig_1percent_filled_random_stride_big_step(
bench: &mut Bencher,
) {
let codec = gen_bools(0.01f64); let codec = gen_bools(0.01f64);
let num_non_nulls = codec.num_non_nulls(); let num_vals = codec.num_non_nulls();
bench.iter(|| { bench.iter(|| {
codec codec
.translate_codec_idx_to_original_idx(n_percent_step_iterator(0.005, num_non_nulls)) .translate_codec_idx_to_original_idx(random_range_iterator(0, num_vals, 50_000))
.last() .last()
}); });
} }
#[bench] #[bench]
fn bench_translate_codec_to_orig_1percent_filled_10percent_hit(bench: &mut Bencher) { fn bench_sparse_codec_translate_codec_to_orig_1percent_filled_random_stride(
bench: &mut Bencher,
) {
let codec = gen_bools(0.01f64); let codec = gen_bools(0.01f64);
let num_non_nulls = codec.num_non_nulls(); let num_vals = codec.num_non_nulls();
bench.iter(|| { bench.iter(|| {
codec codec
.translate_codec_idx_to_original_idx(n_percent_step_iterator(10.0, num_non_nulls)) .translate_codec_idx_to_original_idx(random_range_iterator(0, num_vals, 100))
.last() .last()
}); });
} }
#[bench] #[bench]
fn bench_translate_codec_to_orig_1percent_filled_full_scan(bench: &mut Bencher) { fn bench_sparse_codec_translate_codec_to_orig_1percent_filled_full_scan(bench: &mut Bencher) {
let codec = gen_bools(0.01f64); let codec = gen_bools(0.01f64);
let num_vals = codec.num_non_nulls(); let num_vals = codec.num_non_nulls();
bench.iter(|| { bench.iter(|| {
@@ -745,18 +714,33 @@ mod bench {
} }
#[bench] #[bench]
fn bench_translate_codec_to_orig_90percent_filled_0comma005percent_hit(bench: &mut Bencher) { fn bench_sparse_codec_translate_codec_to_orig_90percent_filled_random_stride_big_step(
bench: &mut Bencher,
) {
let codec = gen_bools(0.90f64); let codec = gen_bools(0.90f64);
let num_non_nulls = codec.num_non_nulls(); let num_vals = codec.num_non_nulls();
bench.iter(|| { bench.iter(|| {
codec codec
.translate_codec_idx_to_original_idx(n_percent_step_iterator(0.005, num_non_nulls)) .translate_codec_idx_to_original_idx(random_range_iterator(0, num_vals, 50_000))
.last() .last()
}); });
} }
#[bench] #[bench]
fn bench_translate_codec_to_orig_90percent_filled_full_scan(bench: &mut Bencher) { fn bench_sparse_codec_translate_codec_to_orig_90percent_filled_random_stride(
bench: &mut Bencher,
) {
let codec = gen_bools(0.9f64);
let num_vals = codec.num_non_nulls();
bench.iter(|| {
codec
.translate_codec_idx_to_original_idx(random_range_iterator(0, num_vals, 100))
.last()
});
}
#[bench]
fn bench_sparse_codec_translate_codec_to_orig_90percent_filled_full_scan(bench: &mut Bencher) {
let codec = gen_bools(0.9f64); let codec = gen_bools(0.9f64);
let num_vals = codec.num_non_nulls(); let num_vals = codec.num_non_nulls();
bench.iter(|| { bench.iter(|| {
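
The benchmarks above exercise the two directions of an optional column's null index: mapping an original row id to its dense position among the non-null values ("orig to codec"), and mapping a dense codec index back to the original row id ("codec to orig"). As a self-contained sketch of what is being measured (illustrative only, not the crate's actual data layout):

// Minimal sketch of the mapping the `translate_*` benchmarks measure.
struct SparseIndex {
    // Original row ids that hold a value, sorted ascending.
    non_null_rows: Vec<u32>,
}

impl SparseIndex {
    // Original row id -> dense codec index, if the row holds a value.
    fn translate_orig_to_codec(&self, row: u32) -> Option<u32> {
        self.non_null_rows.binary_search(&row).ok().map(|idx| idx as u32)
    }

    // Dense codec index -> original row id.
    fn translate_codec_to_orig(&self, codec_idx: u32) -> u32 {
        self.non_null_rows[codec_idx as usize]
    }

    fn num_non_nulls(&self) -> u32 {
        self.non_null_rows.len() as u32
    }
}

fn main() {
    let index = SparseIndex { non_null_rows: vec![2, 5, 9] };
    assert_eq!(index.translate_orig_to_codec(5), Some(1));
    assert_eq!(index.translate_orig_to_codec(4), None);
    assert_eq!(index.translate_codec_to_orig(2), 9);
    assert_eq!(index.num_non_nulls(), 3);
}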

@@ -17,9 +17,9 @@
// You should have received a copy of the GNU Affero General Public License // You should have received a copy of the GNU Affero General Public License
// along with this program. If not, see <http://www.gnu.org/licenses/>. // along with this program. If not, see <http://www.gnu.org/licenses/>.
use std::io;
use std::num::NonZeroU64; use std::num::NonZeroU64;
use std::sync::Arc; use std::sync::Arc;
use std::{fmt, io};
use common::{BinarySerializable, OwnedBytes, VInt}; use common::{BinarySerializable, OwnedBytes, VInt};
use log::warn; use log::warn;
@@ -167,7 +167,7 @@ impl BinarySerializable for Header {
/// Return estimated compression for given codec in the value range [0.0..1.0], where 1.0 means no /// Return estimated compression for given codec in the value range [0.0..1.0], where 1.0 means no
/// compression. /// compression.
pub fn estimate<T: MonotonicallyMappableToU64 + fmt::Debug>( pub fn estimate<T: MonotonicallyMappableToU64>(
typed_column: impl Column<T>, typed_column: impl Column<T>,
codec_type: FastFieldCodecType, codec_type: FastFieldCodecType,
) -> Option<f32> { ) -> Option<f32> {
@@ -276,7 +276,7 @@ pub fn serialize_u128_new<F: Fn() -> I, I: Iterator<Item = u128>>(
} }
/// Serializes the column with the codec with the best estimate on the data. /// Serializes the column with the codec with the best estimate on the data.
pub fn serialize<T: MonotonicallyMappableToU64 + fmt::Debug>( pub fn serialize<T: MonotonicallyMappableToU64>(
typed_column: impl Column<T>, typed_column: impl Column<T>,
output: &mut impl io::Write, output: &mut impl io::Write,
codecs: &[FastFieldCodecType], codecs: &[FastFieldCodecType],
@@ -285,7 +285,7 @@ pub fn serialize<T: MonotonicallyMappableToU64 + fmt::Debug>(
} }
/// Serializes the column with the codec with the best estimate on the data. /// Serializes the column with the codec with the best estimate on the data.
pub fn serialize_new<T: MonotonicallyMappableToU64 + fmt::Debug>( pub fn serialize_new<T: MonotonicallyMappableToU64>(
value_index: ValueIndexInfo, value_index: ValueIndexInfo,
typed_column: impl Column<T>, typed_column: impl Column<T>,
output: &mut impl io::Write, output: &mut impl io::Write,
@@ -366,7 +366,7 @@ fn serialize_given_codec(
} }
/// Helper function to serialize a column (autodetect from all codecs) and then open it /// Helper function to serialize a column (autodetect from all codecs) and then open it
pub fn serialize_and_load<T: MonotonicallyMappableToU64 + Ord + Default + fmt::Debug>( pub fn serialize_and_load<T: MonotonicallyMappableToU64 + Ord + Default>(
column: &[T], column: &[T],
) -> Arc<dyn Column<T>> { ) -> Arc<dyn Column<T>> {
let mut buffer = Vec::new(); let mut buffer = Vec::new();
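
The `estimate` and `serialize` signatures touched above implement codec auto-selection: each candidate codec reports an estimated compression ratio in [0.0..1.0] (1.0 means no compression), codecs that cannot encode the data report nothing, and the column is serialized with the smallest estimate. A rough, self-contained sketch of that selection step (the estimate numbers below are placeholders, not the real estimators):

#[derive(Debug, Clone, Copy, PartialEq)]
enum CodecType {
    Bitpacked,
    Linear,
    BlockwiseLinear,
}

// Placeholder estimator: the real ones inspect the value distribution.
fn estimate(values: &[u64], codec: CodecType) -> Option<f32> {
    match codec {
        CodecType::Bitpacked => Some(0.5),
        CodecType::Linear => if values.len() > 2 { Some(0.3) } else { None },
        CodecType::BlockwiseLinear => if values.len() > 512 { Some(0.2) } else { None },
    }
}

// Pick the codec with the best (smallest) estimated ratio.
fn pick_best_codec(values: &[u64], candidates: &[CodecType]) -> Option<CodecType> {
    candidates
        .iter()
        .copied()
        .filter_map(|codec| estimate(values, codec).map(|ratio| (codec, ratio)))
        .min_by(|a, b| a.1.total_cmp(&b.1))
        .map(|(codec, _)| codec)
}

fn main() {
    let values: Vec<u64> = (0..100).collect();
    let candidates = [CodecType::Bitpacked, CodecType::Linear, CodecType::BlockwiseLinear];
    assert_eq!(pick_best_codec(&values, &candidates), Some(CodecType::Linear));
}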

@@ -1208,7 +1208,7 @@ mod tests {
text_field_many_terms => many_terms_data.choose(&mut rng).unwrap().to_string(), text_field_many_terms => many_terms_data.choose(&mut rng).unwrap().to_string(),
text_field_few_terms => few_terms_data.choose(&mut rng).unwrap().to_string(), text_field_few_terms => few_terms_data.choose(&mut rng).unwrap().to_string(),
score_field => val as u64, score_field => val as u64,
score_field_f64 => val, score_field_f64 => val as f64,
score_field_i64 => val as i64, score_field_i64 => val as i64,
))?; ))?;
} }
@@ -1250,7 +1250,10 @@ mod tests {
let collector = AggregationCollector::from_aggs(agg_req_1, None, index.schema()); let collector = AggregationCollector::from_aggs(agg_req_1, None, index.schema());
let searcher = reader.searcher(); let searcher = reader.searcher();
searcher.search(&term_query, &collector).unwrap() let agg_res: AggregationResults =
searcher.search(&term_query, &collector).unwrap().into();
agg_res
}); });
} }
@@ -1278,7 +1281,10 @@ mod tests {
let collector = AggregationCollector::from_aggs(agg_req_1, None, index.schema()); let collector = AggregationCollector::from_aggs(agg_req_1, None, index.schema());
let searcher = reader.searcher(); let searcher = reader.searcher();
searcher.search(&term_query, &collector).unwrap() let agg_res: AggregationResults =
searcher.search(&term_query, &collector).unwrap().into();
agg_res
}); });
} }
@@ -1306,7 +1312,10 @@ mod tests {
let collector = AggregationCollector::from_aggs(agg_req_1, None, index.schema()); let collector = AggregationCollector::from_aggs(agg_req_1, None, index.schema());
let searcher = reader.searcher(); let searcher = reader.searcher();
searcher.search(&term_query, &collector).unwrap() let agg_res: AggregationResults =
searcher.search(&term_query, &collector).unwrap().into();
agg_res
}); });
} }
@@ -1342,7 +1351,10 @@ mod tests {
let collector = AggregationCollector::from_aggs(agg_req_1, None, index.schema()); let collector = AggregationCollector::from_aggs(agg_req_1, None, index.schema());
let searcher = reader.searcher(); let searcher = reader.searcher();
searcher.search(&term_query, &collector).unwrap() let agg_res: AggregationResults =
searcher.search(&term_query, &collector).unwrap().into();
agg_res
}); });
} }
@@ -1368,7 +1380,10 @@ mod tests {
let collector = AggregationCollector::from_aggs(agg_req, None, index.schema()); let collector = AggregationCollector::from_aggs(agg_req, None, index.schema());
let searcher = reader.searcher(); let searcher = reader.searcher();
searcher.search(&AllQuery, &collector).unwrap() let agg_res: AggregationResults =
searcher.search(&AllQuery, &collector).unwrap().into();
agg_res
}); });
} }
@@ -1394,7 +1409,10 @@ mod tests {
let collector = AggregationCollector::from_aggs(agg_req, None, index.schema()); let collector = AggregationCollector::from_aggs(agg_req, None, index.schema());
let searcher = reader.searcher(); let searcher = reader.searcher();
searcher.search(&AllQuery, &collector).unwrap() let agg_res: AggregationResults =
searcher.search(&AllQuery, &collector).unwrap().into();
agg_res
}); });
} }
@@ -1428,7 +1446,10 @@ mod tests {
let collector = AggregationCollector::from_aggs(agg_req_1, None, index.schema()); let collector = AggregationCollector::from_aggs(agg_req_1, None, index.schema());
let searcher = reader.searcher(); let searcher = reader.searcher();
searcher.search(&AllQuery, &collector).unwrap() let agg_res: AggregationResults =
searcher.search(&AllQuery, &collector).unwrap().into();
agg_res
}); });
} }
@@ -1460,7 +1481,10 @@ mod tests {
let collector = AggregationCollector::from_aggs(agg_req_1, None, index.schema()); let collector = AggregationCollector::from_aggs(agg_req_1, None, index.schema());
let searcher = reader.searcher(); let searcher = reader.searcher();
searcher.search(&AllQuery, &collector).unwrap() let agg_res: AggregationResults =
searcher.search(&AllQuery, &collector).unwrap().into();
agg_res
}); });
} }
@@ -1496,7 +1520,10 @@ mod tests {
let collector = AggregationCollector::from_aggs(agg_req_1, None, index.schema()); let collector = AggregationCollector::from_aggs(agg_req_1, None, index.schema());
let searcher = reader.searcher(); let searcher = reader.searcher();
searcher.search(&AllQuery, &collector).unwrap() let agg_res: AggregationResults =
searcher.search(&AllQuery, &collector).unwrap().into();
agg_res
}); });
} }
@@ -1523,7 +1550,10 @@ mod tests {
let collector = AggregationCollector::from_aggs(agg_req_1, None, index.schema()); let collector = AggregationCollector::from_aggs(agg_req_1, None, index.schema());
let searcher = reader.searcher(); let searcher = reader.searcher();
searcher.search(&AllQuery, &collector).unwrap() let agg_res: AggregationResults =
searcher.search(&AllQuery, &collector).unwrap().into();
agg_res
}); });
} }
@@ -1567,7 +1597,7 @@ mod tests {
], ],
..Default::default() ..Default::default()
}), }),
sub_aggregation: sub_agg_req_1, sub_aggregation: sub_agg_req_1.clone(),
}), }),
), ),
] ]
@@ -1577,7 +1607,10 @@ mod tests {
let collector = AggregationCollector::from_aggs(agg_req_1, None, index.schema()); let collector = AggregationCollector::from_aggs(agg_req_1, None, index.schema());
let searcher = reader.searcher(); let searcher = reader.searcher();
searcher.search(&term_query, &collector).unwrap() let agg_res: AggregationResults =
searcher.search(&term_query, &collector).unwrap().into();
agg_res
}); });
} }
} }

@@ -198,10 +198,11 @@ impl Searcher {
collector: &C, collector: &C,
executor: &Executor, executor: &Executor,
) -> crate::Result<C::Fruit> { ) -> crate::Result<C::Fruit> {
let enabled_scoring = if collector.requires_scoring() { let scoring_enabled = collector.requires_scoring();
EnableScoring::enabled_from_searcher(self) let enabled_scoring = if scoring_enabled {
EnableScoring::Enabled(self)
} else { } else {
EnableScoring::disabled_from_searcher(self) EnableScoring::Disabled(self.schema())
}; };
let weight = query.weight(enabled_scoring)?; let weight = query.weight(enabled_scoring)?;
let segment_readers = self.segment_readers(); let segment_readers = self.segment_readers();

@@ -32,7 +32,7 @@ impl LockError {
/// Error that may occur when opening a directory /// Error that may occur when opening a directory
#[derive(Debug, Clone, Error)] #[derive(Debug, Clone, Error)]
pub enum OpenDirectoryError { pub enum OpenDirectoryError {
/// The underlying directory does not exist. /// The underlying directory does not exists.
#[error("Directory does not exist: '{0}'.")] #[error("Directory does not exist: '{0}'.")]
DoesNotExist(PathBuf), DoesNotExist(PathBuf),
/// The path exists but is not a directory. /// The path exists but is not a directory.
@@ -151,8 +151,8 @@ impl fmt::Debug for Incompatibility {
/// Error that may occur when accessing a file read /// Error that may occur when accessing a file read
#[derive(Debug, Clone, Error)] #[derive(Debug, Clone, Error)]
pub enum OpenReadError { pub enum OpenReadError {
/// The file does not exist. /// The file does not exists.
#[error("Files does not exist: {0:?}")] #[error("Files does not exists: {0:?}")]
FileDoesNotExist(PathBuf), FileDoesNotExist(PathBuf),
/// Any kind of io::Error. /// Any kind of io::Error.
#[error( #[error(
@@ -181,8 +181,8 @@ impl OpenReadError {
/// Error that may occur when trying to delete a file /// Error that may occur when trying to delete a file
#[derive(Debug, Clone, Error)] #[derive(Debug, Clone, Error)]
pub enum DeleteError { pub enum DeleteError {
/// The file does not exist. /// The file does not exists.
#[error("File does not exist: '{0}'.")] #[error("File does not exists: '{0}'.")]
FileDoesNotExist(PathBuf), FileDoesNotExist(PathBuf),
/// Any kind of IO error that happens when /// Any kind of IO error that happens when
/// interacting with the underlying IO device. /// interacting with the underlying IO device.

@@ -232,7 +232,7 @@ impl Directory for RamDirectory {
let path_buf = PathBuf::from(path); let path_buf = PathBuf::from(path);
self.fs.write().unwrap().write(path_buf, data); self.fs.write().unwrap().write(path_buf, data);
if path == *META_FILEPATH { if path == *META_FILEPATH {
drop(self.fs.write().unwrap().watch_router.broadcast()); let _ = self.fs.write().unwrap().watch_router.broadcast();
} }
Ok(()) Ok(())
} }

@@ -168,7 +168,7 @@ mod tests {
watch_event_router.broadcast().wait().unwrap(); watch_event_router.broadcast().wait().unwrap();
assert_eq!(2, counter.load(Ordering::SeqCst)); assert_eq!(2, counter.load(Ordering::SeqCst));
mem::drop(handle_a); mem::drop(handle_a);
drop(watch_event_router.broadcast()); let _ = watch_event_router.broadcast();
watch_event_router.broadcast().wait().unwrap(); watch_event_router.broadcast().wait().unwrap();
assert_eq!(2, counter.load(Ordering::SeqCst)); assert_eq!(2, counter.load(Ordering::SeqCst));
} }

@@ -175,7 +175,7 @@ mod bench {
fn get_alive() -> Vec<u32> { fn get_alive() -> Vec<u32> {
let mut data = (0..1_000_000_u32).collect::<Vec<u32>>(); let mut data = (0..1_000_000_u32).collect::<Vec<u32>>();
for _ in 0..1_000_000 / 8 { for _ in 0..(1_000_000) * 1 / 8 {
remove_rand(&mut data); remove_rand(&mut data);
} }
data data

@@ -96,7 +96,7 @@ mod tests {
let term = Term::from_field_bytes(field, b"lucene".as_ref()); let term = Term::from_field_bytes(field, b"lucene".as_ref());
let term_query = TermQuery::new(term, IndexRecordOption::Basic); let term_query = TermQuery::new(term, IndexRecordOption::Basic);
let term_weight_err = let term_weight_err =
term_query.specialized_weight(EnableScoring::disabled_from_schema(searcher.schema())); term_query.specialized_weight(EnableScoring::Disabled(searcher.schema()));
assert!(matches!( assert!(matches!(
term_weight_err, term_weight_err,
Err(crate::TantivyError::SchemaError(_)) Err(crate::TantivyError::SchemaError(_))

@@ -12,15 +12,13 @@
//! //!
//! //!
//! Fields have to be declared as `FAST` in the schema. //! Fields have to be declared as `FAST` in the schema.
//! Currently supported fields are: u64, i64, f64, bytes, ip and text. //! Currently supported fields are: u64, i64, f64, bytes and text.
//! //!
//! Fast fields are stored in with [different codecs](fastfield_codecs). The best codec is detected //! Fast fields are stored in with [different codecs](fastfield_codecs). The best codec is detected
//! automatically, when serializing. //! automatically, when serializing.
//! //!
//! Read access performance is comparable to that of an array lookup. //! Read access performance is comparable to that of an array lookup.
use std::net::Ipv6Addr;
use fastfield_codecs::MonotonicallyMappableToU64; use fastfield_codecs::MonotonicallyMappableToU64;
pub use self::alive_bitset::{intersect_alive_bitsets, write_alive_bitset, AliveBitSet}; pub use self::alive_bitset::{intersect_alive_bitsets, write_alive_bitset, AliveBitSet};
@@ -30,7 +28,7 @@ pub use self::facet_reader::FacetReader;
pub(crate) use self::multivalued::{get_fastfield_codecs_for_multivalue, MultivalueStartIndex}; pub(crate) use self::multivalued::{get_fastfield_codecs_for_multivalue, MultivalueStartIndex};
pub use self::multivalued::{ pub use self::multivalued::{
MultiValueIndex, MultiValueU128FastFieldWriter, MultiValuedFastFieldReader, MultiValueIndex, MultiValueU128FastFieldWriter, MultiValuedFastFieldReader,
MultiValuedFastFieldWriter, MultiValuedFastFieldWriter, MultiValuedU128FastFieldReader,
}; };
pub(crate) use self::readers::type_and_cardinality; pub(crate) use self::readers::type_and_cardinality;
pub use self::readers::FastFieldReaders; pub use self::readers::FastFieldReaders;
@@ -49,33 +47,6 @@ mod readers;
mod serializer; mod serializer;
mod writer; mod writer;
/// Trait for types that provide a zero value.
///
/// The resulting value is never used, just as placeholder, e.g. for `vec.resize()`.
pub trait MakeZero {
/// Build a default value. This default value is never used, so the value does not
/// really matter.
fn make_zero() -> Self;
}
impl<T: FastValue> MakeZero for T {
fn make_zero() -> Self {
T::from_u64(0)
}
}
impl MakeZero for u128 {
fn make_zero() -> Self {
0
}
}
impl MakeZero for Ipv6Addr {
fn make_zero() -> Self {
Ipv6Addr::from(0u128.to_be_bytes())
}
}
/// Trait for types that are allowed for fast fields: /// Trait for types that are allowed for fast fields:
/// (u64, i64 and f64, bool, DateTime). /// (u64, i64 and f64, bool, DateTime).
pub trait FastValue: pub trait FastValue:
@@ -83,6 +54,12 @@ pub trait FastValue:
{ {
/// Returns the `schema::Type` for this FastValue. /// Returns the `schema::Type` for this FastValue.
fn to_type() -> Type; fn to_type() -> Type;
/// Build a default value. This default value is never used, so the value does not
/// really matter.
fn make_zero() -> Self {
Self::from_u64(0u64)
}
} }
impl FastValue for u64 { impl FastValue for u64 {
@@ -124,6 +101,12 @@ impl FastValue for DateTime {
fn to_type() -> Type { fn to_type() -> Type {
Type::Date Type::Date
} }
fn make_zero() -> Self {
DateTime {
timestamp_micros: 0,
}
}
} }
fn value_to_u64(value: &Value) -> crate::Result<u64> { fn value_to_u64(value: &Value) -> crate::Result<u64> {
@@ -162,7 +145,7 @@ impl FastFieldType {
mod tests { mod tests {
use std::collections::HashMap; use std::collections::HashMap;
use std::ops::{Range, RangeInclusive}; use std::ops::Range;
use std::path::Path; use std::path::Path;
use std::sync::Arc; use std::sync::Arc;
@@ -176,9 +159,7 @@ mod tests {
use super::*; use super::*;
use crate::directory::{CompositeFile, Directory, RamDirectory, WritePtr}; use crate::directory::{CompositeFile, Directory, RamDirectory, WritePtr};
use crate::merge_policy::NoMergePolicy; use crate::merge_policy::NoMergePolicy;
use crate::schema::{ use crate::schema::{Cardinality, Document, Field, Schema, SchemaBuilder, FAST, STRING, TEXT};
Cardinality, Document, Field, Schema, SchemaBuilder, FAST, INDEXED, STRING, TEXT,
};
use crate::time::OffsetDateTime; use crate::time::OffsetDateTime;
use crate::{DateOptions, DatePrecision, Index, SegmentId, SegmentReader}; use crate::{DateOptions, DatePrecision, Index, SegmentId, SegmentReader};
@@ -539,6 +520,11 @@ mod tests {
Ok(()) Ok(())
} }
#[test]
fn test_default_date() {
assert_eq!(0, DateTime::make_zero().into_timestamp_secs());
}
fn get_vals_for_docs(ff: &MultiValuedFastFieldReader<u64>, docs: Range<u32>) -> Vec<u64> { fn get_vals_for_docs(ff: &MultiValuedFastFieldReader<u64>, docs: Range<u32>) -> Vec<u64> {
let mut all = vec![]; let mut all = vec![];
@@ -983,117 +969,4 @@ mod tests {
} }
Ok(len) Ok(len)
} }
#[test]
fn test_gcd_bug_regression_1757() {
let mut schema_builder = Schema::builder();
let num_field = schema_builder.add_u64_field("url_norm_hash", FAST | INDEXED);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
{
let mut writer = index.writer_for_tests().unwrap();
writer
.add_document(doc! {
num_field => 100u64,
})
.unwrap();
writer
.add_document(doc! {
num_field => 200u64,
})
.unwrap();
writer
.add_document(doc! {
num_field => 300u64,
})
.unwrap();
writer.commit().unwrap();
}
let reader = index.reader().unwrap();
let searcher = reader.searcher();
let segment = &searcher.segment_readers()[0];
let field = segment.fast_fields().u64(num_field).unwrap();
let numbers = vec![100, 200, 300];
let test_range = |range: RangeInclusive<u64>| {
let expexted_count = numbers.iter().filter(|num| range.contains(num)).count();
let mut vec = vec![];
field.get_docids_for_value_range(range, 0..u32::MAX, &mut vec);
assert_eq!(vec.len(), expexted_count);
};
test_range(50..=50);
test_range(150..=150);
test_range(350..=350);
test_range(100..=250);
test_range(101..=200);
test_range(101..=199);
test_range(100..=300);
test_range(100..=299);
}
#[test]
fn test_mapping_bug_docids_for_value_range() {
let mut schema_builder = Schema::builder();
let num_field = schema_builder.add_u64_field("url_norm_hash", FAST | INDEXED);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
{
// Values without gcd, but with min_value
let mut writer = index.writer_for_tests().unwrap();
writer
.add_document(doc! {
num_field => 1000u64,
})
.unwrap();
writer
.add_document(doc! {
num_field => 1001u64,
})
.unwrap();
writer
.add_document(doc! {
num_field => 1003u64,
})
.unwrap();
writer.commit().unwrap();
}
let reader = index.reader().unwrap();
let searcher = reader.searcher();
let segment = &searcher.segment_readers()[0];
let field = segment.fast_fields().u64(num_field).unwrap();
let numbers = vec![1000, 1001, 1003];
let test_range = |range: RangeInclusive<u64>| {
let expexted_count = numbers.iter().filter(|num| range.contains(num)).count();
let mut vec = vec![];
field.get_docids_for_value_range(range, 0..u32::MAX, &mut vec);
assert_eq!(vec.len(), expexted_count);
};
let test_range_variant = |start, stop| {
let start_range = start..=stop;
test_range(start_range);
let start_range = start..=(stop - 1);
test_range(start_range);
let start_range = start..=(stop + 1);
test_range(start_range);
let start_range = (start - 1)..=stop;
test_range(start_range);
let start_range = (start - 1)..=(stop - 1);
test_range(start_range);
let start_range = (start - 1)..=(stop + 1);
test_range(start_range);
let start_range = (start + 1)..=stop;
test_range(start_range);
let start_range = (start + 1)..=(stop - 1);
test_range(start_range);
let start_range = (start + 1)..=(stop + 1);
test_range(start_range);
};
test_range_variant(50, 50);
test_range_variant(1000, 1000);
test_range_variant(1000, 1002);
}
} }
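
The `MakeZero` trait that this hunk moves around exists only so callers can hand `Vec::resize` a throwaway placeholder before a bulk read overwrites the buffer. A minimal standalone sketch of that pattern (simplified bounds, not the crate's exact trait):

use std::net::Ipv6Addr;

trait MakeZero: Copy {
    // The value is never observed; it only pre-sizes a buffer.
    fn make_zero() -> Self;
}

impl MakeZero for u64 {
    fn make_zero() -> Self { 0 }
}

impl MakeZero for Ipv6Addr {
    fn make_zero() -> Self { Ipv6Addr::from(0u128.to_be_bytes()) }
}

// Copies `len` values starting at `start` into a caller-provided buffer.
fn get_range<T: MakeZero>(column: &[T], start: usize, len: usize, out: &mut Vec<T>) {
    out.resize(len, T::make_zero()); // placeholder values, overwritten below
    out.copy_from_slice(&column[start..start + len]);
}

fn main() {
    let column: Vec<u64> = vec![10, 20, 30, 40];
    let mut out = Vec::new();
    get_range(&column, 1, 2, &mut out);
    assert_eq!(out, vec![20, 30]);
}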

@@ -5,7 +5,7 @@ mod writer;
use fastfield_codecs::FastFieldCodecType; use fastfield_codecs::FastFieldCodecType;
pub use index::MultiValueIndex; pub use index::MultiValueIndex;
pub use self::reader::MultiValuedFastFieldReader; pub use self::reader::{MultiValuedFastFieldReader, MultiValuedU128FastFieldReader};
pub(crate) use self::writer::MultivalueStartIndex; pub(crate) use self::writer::MultivalueStartIndex;
pub use self::writer::{MultiValueU128FastFieldWriter, MultiValuedFastFieldWriter}; pub use self::writer::{MultiValueU128FastFieldWriter, MultiValuedFastFieldWriter};
@@ -525,7 +525,7 @@ mod bench {
serializer.close().unwrap(); serializer.close().unwrap();
field field
}; };
let file = directory.open_read(path).unwrap(); let file = directory.open_read(&path).unwrap();
{ {
let fast_fields_composite = CompositeFile::open(&file).unwrap(); let fast_fields_composite = CompositeFile::open(&file).unwrap();
let data_idx = fast_fields_composite let data_idx = fast_fields_composite

@@ -1,31 +1,107 @@
use core::fmt;
use std::ops::{Range, RangeInclusive}; use std::ops::{Range, RangeInclusive};
use std::sync::Arc; use std::sync::Arc;
use fastfield_codecs::Column; use fastfield_codecs::{Column, MonotonicallyMappableToU128};
use super::MultiValueIndex; use super::MultiValueIndex;
use crate::fastfield::MakeZero; use crate::fastfield::FastValue;
use crate::DocId; use crate::DocId;
/// Reader for a multivalued fast field. /// Reader for a multivalued `u64` fast field.
/// ///
/// The reader is implemented as two fast fields, one u64 fast field for the index and one for the /// The reader is implemented as two `u64` fast field.
/// values.
/// ///
/// The `vals_reader` will access the concatenated list of all values. /// The `vals_reader` will access the concatenated list of all
/// The `idx_reader` associates, for each document, the index of its first value. /// values for all reader.
/// The `idx_reader` associated, for each document, the index of its first value.
/// Stores the start position for each document.
#[derive(Clone)] #[derive(Clone)]
pub struct MultiValuedFastFieldReader<T> { pub struct MultiValuedFastFieldReader<Item: FastValue> {
idx_reader: MultiValueIndex,
vals_reader: Arc<dyn Column<Item>>,
}
impl<Item: FastValue> MultiValuedFastFieldReader<Item> {
pub(crate) fn open(
idx_reader: Arc<dyn Column<u64>>,
vals_reader: Arc<dyn Column<Item>>,
) -> MultiValuedFastFieldReader<Item> {
MultiValuedFastFieldReader {
idx_reader: MultiValueIndex::new(idx_reader),
vals_reader,
}
}
/// Returns the array of values associated with the given `doc`.
#[inline]
fn get_vals_for_range(&self, range: Range<u32>, vals: &mut Vec<Item>) {
let len = (range.end - range.start) as usize;
vals.resize(len, Item::make_zero());
self.vals_reader
.get_range(range.start as u64, &mut vals[..]);
}
/// Returns the array of values associated with the given `doc`.
#[inline]
pub fn get_vals(&self, doc: DocId, vals: &mut Vec<Item>) {
let range = self.idx_reader.range(doc);
self.get_vals_for_range(range, vals);
}
/// returns the multivalue index
pub fn get_index_reader(&self) -> &MultiValueIndex {
&self.idx_reader
}
/// Returns the minimum value for this fast field.
///
/// The min value does not take in account of possible
/// deleted document, and should be considered as a lower bound
/// of the actual minimum value.
pub fn min_value(&self) -> Item {
self.vals_reader.min_value()
}
/// Returns the maximum value for this fast field.
///
/// The max value does not take in account of possible
/// deleted document, and should be considered as an upper bound
/// of the actual maximum value.
pub fn max_value(&self) -> Item {
self.vals_reader.max_value()
}
/// Returns the number of values associated with the document `DocId`.
#[inline]
pub fn num_vals(&self, doc: DocId) -> u32 {
self.idx_reader.num_vals_for_doc(doc)
}
/// Returns the overall number of values in this field.
#[inline]
pub fn total_num_vals(&self) -> u32 {
self.idx_reader.total_num_vals()
}
}
/// Reader for a multivalued `u128` fast field.
///
/// The reader is implemented as a `u64` fast field for the index and a `u128` fast field.
///
/// The `vals_reader` will access the concatenated list of all
/// values for all reader.
/// The `idx_reader` associated, for each document, the index of its first value.
#[derive(Clone)]
pub struct MultiValuedU128FastFieldReader<T: MonotonicallyMappableToU128> {
idx_reader: MultiValueIndex, idx_reader: MultiValueIndex,
vals_reader: Arc<dyn Column<T>>, vals_reader: Arc<dyn Column<T>>,
} }
impl<T: PartialOrd + MakeZero + Copy + fmt::Debug> MultiValuedFastFieldReader<T> { impl<T: MonotonicallyMappableToU128> MultiValuedU128FastFieldReader<T> {
pub(crate) fn open( pub(crate) fn open(
idx_reader: Arc<dyn Column<u64>>, idx_reader: Arc<dyn Column<u64>>,
vals_reader: Arc<dyn Column<T>>, vals_reader: Arc<dyn Column<T>>,
) -> MultiValuedFastFieldReader<T> { ) -> MultiValuedU128FastFieldReader<T> {
Self { Self {
idx_reader: MultiValueIndex::new(idx_reader), idx_reader: MultiValueIndex::new(idx_reader),
vals_reader, vals_reader,
@@ -46,7 +122,7 @@ impl<T: PartialOrd + MakeZero + Copy + fmt::Debug> MultiValuedFastFieldReader<T>
#[inline] #[inline]
fn get_vals_for_range(&self, range: Range<u32>, vals: &mut Vec<T>) { fn get_vals_for_range(&self, range: Range<u32>, vals: &mut Vec<T>) {
let len = (range.end - range.start) as usize; let len = (range.end - range.start) as usize;
vals.resize(len, T::make_zero()); vals.resize(len, T::from_u128(0));
self.vals_reader self.vals_reader
.get_range(range.start as u64, &mut vals[..]); .get_range(range.start as u64, &mut vals[..]);
} }
@@ -123,131 +199,8 @@ impl<T: PartialOrd + MakeZero + Copy + fmt::Debug> MultiValuedFastFieldReader<T>
#[cfg(test)] #[cfg(test)]
mod tests { mod tests {
use time::{Duration, OffsetDateTime};
use crate::collector::Count;
use crate::core::Index; use crate::core::Index;
use crate::query::RangeQuery;
use crate::schema::{Cardinality, Facet, FacetOptions, NumericOptions, Schema}; use crate::schema::{Cardinality, Facet, FacetOptions, NumericOptions, Schema};
use crate::{DateOptions, DatePrecision, DateTime};
#[test]
fn test_multivalued_date_docids_for_value_range_1() -> crate::Result<()> {
let mut schema_builder = Schema::builder();
let date_field = schema_builder.add_date_field(
"multi_date_field",
DateOptions::default()
.set_fast(Cardinality::MultiValues)
.set_indexed()
.set_fieldnorm()
.set_precision(DatePrecision::Microseconds)
.set_stored(),
);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_for_tests()?;
let first_time_stamp = OffsetDateTime::now_utc();
index_writer.add_document(doc!(
date_field => DateTime::from_utc(first_time_stamp),
date_field => DateTime::from_utc(first_time_stamp),
))?;
// add another second
let two_secs_ahead = first_time_stamp + Duration::seconds(2);
index_writer.commit()?;
let reader = index.reader()?;
let searcher = reader.searcher();
let reader = searcher.segment_reader(0);
let date_ff_reader = reader.fast_fields().dates(date_field).unwrap();
let mut docids = vec![];
date_ff_reader.get_docids_for_value_range(
DateTime::from_utc(first_time_stamp)..=DateTime::from_utc(two_secs_ahead),
0..5,
&mut docids,
);
assert_eq!(docids, vec![0]);
let count_multiples =
|range_query: RangeQuery| searcher.search(&range_query, &Count).unwrap();
assert_eq!(
count_multiples(RangeQuery::new_date(
date_field,
DateTime::from_utc(first_time_stamp)..DateTime::from_utc(two_secs_ahead)
)),
1
);
Ok(())
}
#[test]
fn test_multivalued_date_docids_for_value_range_2() -> crate::Result<()> {
let mut schema_builder = Schema::builder();
let date_field = schema_builder.add_date_field(
"multi_date_field",
DateOptions::default()
.set_fast(Cardinality::MultiValues)
// TODO: Test different precision after fixing https://github.com/quickwit-oss/tantivy/issues/1783
.set_precision(DatePrecision::Microseconds)
.set_indexed()
.set_fieldnorm()
.set_stored(),
);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_for_tests()?;
let first_time_stamp = OffsetDateTime::now_utc();
index_writer.add_document(doc!(
date_field => DateTime::from_utc(first_time_stamp),
date_field => DateTime::from_utc(first_time_stamp),
))?;
index_writer.add_document(doc!())?;
// add one second
index_writer.add_document(doc!(
date_field => DateTime::from_utc(first_time_stamp + Duration::seconds(1)),
))?;
// add another second
let two_secs_ahead = first_time_stamp + Duration::seconds(2);
index_writer.add_document(doc!(
date_field => DateTime::from_utc(two_secs_ahead),
date_field => DateTime::from_utc(two_secs_ahead),
date_field => DateTime::from_utc(two_secs_ahead),
))?;
// add three seconds
index_writer.add_document(doc!(
date_field => DateTime::from_utc(first_time_stamp + Duration::seconds(3)),
))?;
index_writer.commit()?;
let reader = index.reader()?;
let searcher = reader.searcher();
let reader = searcher.segment_reader(0);
assert_eq!(reader.num_docs(), 5);
let date_ff_reader = reader.fast_fields().dates(date_field).unwrap();
let mut docids = vec![];
date_ff_reader.get_docids_for_value_range(
DateTime::from_utc(first_time_stamp)..=DateTime::from_utc(two_secs_ahead),
0..5,
&mut docids,
);
assert_eq!(docids, vec![0, 2, 3]);
let count_multiples =
|range_query: RangeQuery| searcher.search(&range_query, &Count).unwrap();
assert_eq!(
count_multiples(RangeQuery::new_date(
date_field,
DateTime::from_utc(first_time_stamp)..DateTime::from_utc(two_secs_ahead)
)),
2
);
Ok(())
}
#[test] #[test]
fn test_multifastfield_reader() -> crate::Result<()> { fn test_multifastfield_reader() -> crate::Result<()> {
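
The doc comments above describe the multivalued reader as two columns: an index column that stores, per document, the start offset of its values, and one concatenated values column shared by all documents. A self-contained sketch of that layout, using plain `Vec`s in place of the codec-backed columns:

// doc's values live at values[start_offsets[doc]..start_offsets[doc + 1]]
struct MultiValued<T> {
    start_offsets: Vec<u32>,
    values: Vec<T>,
}

impl<T: Copy> MultiValued<T> {
    fn range(&self, doc: u32) -> std::ops::Range<usize> {
        let start = self.start_offsets[doc as usize] as usize;
        let end = self.start_offsets[doc as usize + 1] as usize;
        start..end
    }

    fn get_vals(&self, doc: u32, out: &mut Vec<T>) {
        out.clear();
        out.extend_from_slice(&self.values[self.range(doc)]);
    }

    fn num_vals(&self, doc: u32) -> u32 {
        let range = self.range(doc);
        (range.end - range.start) as u32
    }
}

fn main() {
    // doc 0 -> [7], doc 1 -> [], doc 2 -> [3, 5]
    let ff = MultiValued { start_offsets: vec![0, 1, 1, 3], values: vec![7u64, 3, 5] };
    let mut vals = Vec::new();
    ff.get_vals(2, &mut vals);
    assert_eq!(vals, vec![3, 5]);
    assert_eq!(ff.num_vals(1), 0);
}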

@@ -3,9 +3,11 @@ use std::sync::Arc;
use fastfield_codecs::{open, open_u128, Column}; use fastfield_codecs::{open, open_u128, Column};
use super::multivalued::MultiValuedFastFieldReader; use super::multivalued::MultiValuedU128FastFieldReader;
use crate::directory::{CompositeFile, FileSlice}; use crate::directory::{CompositeFile, FileSlice};
use crate::fastfield::{BytesFastFieldReader, FastFieldNotAvailableError, FastValue}; use crate::fastfield::{
BytesFastFieldReader, FastFieldNotAvailableError, FastValue, MultiValuedFastFieldReader,
};
use crate::schema::{Cardinality, Field, FieldType, Schema}; use crate::schema::{Cardinality, Field, FieldType, Schema};
use crate::space_usage::PerFieldSpaceUsage; use crate::space_usage::PerFieldSpaceUsage;
use crate::{DateTime, TantivyError}; use crate::{DateTime, TantivyError};
@@ -159,14 +161,20 @@ impl FastFieldReaders {
/// Returns the `ip` fast field reader reader associated to `field`. /// Returns the `ip` fast field reader reader associated to `field`.
/// ///
/// If `field` is not a u128 fast field, this method returns an Error. /// If `field` is not a u128 fast field, this method returns an Error.
pub fn ip_addrs(&self, field: Field) -> crate::Result<MultiValuedFastFieldReader<Ipv6Addr>> { pub fn ip_addrs(
&self,
field: Field,
) -> crate::Result<MultiValuedU128FastFieldReader<Ipv6Addr>> {
self.check_type(field, FastType::U128, Cardinality::MultiValues)?; self.check_type(field, FastType::U128, Cardinality::MultiValues)?;
let idx_reader: Arc<dyn Column<u64>> = self.typed_fast_field_reader(field)?; let idx_reader: Arc<dyn Column<u64>> = self.typed_fast_field_reader(field)?;
let bytes = self.fast_field_data(field, 1)?.read_bytes()?; let bytes = self.fast_field_data(field, 1)?.read_bytes()?;
let vals_reader = open_u128::<Ipv6Addr>(bytes)?; let vals_reader = open_u128::<Ipv6Addr>(bytes)?;
Ok(MultiValuedFastFieldReader::open(idx_reader, vals_reader)) Ok(MultiValuedU128FastFieldReader::open(
idx_reader,
vals_reader,
))
} }
/// Returns the `u128` fast field reader reader associated to `field`. /// Returns the `u128` fast field reader reader associated to `field`.
@@ -181,14 +189,17 @@ impl FastFieldReaders {
/// Returns the `u128` multi-valued fast field reader reader associated to `field`. /// Returns the `u128` multi-valued fast field reader reader associated to `field`.
/// ///
/// If `field` is not a u128 multi-valued fast field, this method returns an Error. /// If `field` is not a u128 multi-valued fast field, this method returns an Error.
pub fn u128s(&self, field: Field) -> crate::Result<MultiValuedFastFieldReader<u128>> { pub fn u128s(&self, field: Field) -> crate::Result<MultiValuedU128FastFieldReader<u128>> {
self.check_type(field, FastType::U128, Cardinality::MultiValues)?; self.check_type(field, FastType::U128, Cardinality::MultiValues)?;
let idx_reader: Arc<dyn Column<u64>> = self.typed_fast_field_reader(field)?; let idx_reader: Arc<dyn Column<u64>> = self.typed_fast_field_reader(field)?;
let bytes = self.fast_field_data(field, 1)?.read_bytes()?; let bytes = self.fast_field_data(field, 1)?.read_bytes()?;
let vals_reader = open_u128::<u128>(bytes)?; let vals_reader = open_u128::<u128>(bytes)?;
Ok(MultiValuedFastFieldReader::open(idx_reader, vals_reader)) Ok(MultiValuedU128FastFieldReader::open(
idx_reader,
vals_reader,
))
} }
/// Returns the `u64` fast field reader reader associated with `field`, regardless of whether /// Returns the `u64` fast field reader reader associated with `field`, regardless of whether

@@ -1,4 +1,3 @@
use std::fmt;
use std::io::{self, Write}; use std::io::{self, Write};
pub use fastfield_codecs::Column; pub use fastfield_codecs::Column;
@@ -50,7 +49,7 @@ impl CompositeFastFieldSerializer {
/// Serialize data into a new u64 fast field. The best compression codec will be chosen /// Serialize data into a new u64 fast field. The best compression codec will be chosen
/// automatically. /// automatically.
pub fn create_auto_detect_u64_fast_field<T: MonotonicallyMappableToU64 + fmt::Debug>( pub fn create_auto_detect_u64_fast_field<T: MonotonicallyMappableToU64>(
&mut self, &mut self,
field: Field, field: Field,
fastfield_accessor: impl Column<T>, fastfield_accessor: impl Column<T>,
@@ -60,9 +59,7 @@ impl CompositeFastFieldSerializer {
/// Serialize data into a new u64 fast field. The best compression codec will be chosen /// Serialize data into a new u64 fast field. The best compression codec will be chosen
/// automatically. /// automatically.
pub fn create_auto_detect_u64_fast_field_with_idx< pub fn create_auto_detect_u64_fast_field_with_idx<T: MonotonicallyMappableToU64>(
T: MonotonicallyMappableToU64 + fmt::Debug,
>(
&mut self, &mut self,
field: Field, field: Field,
fastfield_accessor: impl Column<T>, fastfield_accessor: impl Column<T>,
@@ -75,9 +72,7 @@ impl CompositeFastFieldSerializer {
/// Serialize data into a new u64 fast field. The best compression codec of the the provided /// Serialize data into a new u64 fast field. The best compression codec of the the provided
/// will be chosen. /// will be chosen.
pub fn create_auto_detect_u64_fast_field_with_idx_and_codecs< pub fn create_auto_detect_u64_fast_field_with_idx_and_codecs<T: MonotonicallyMappableToU64>(
T: MonotonicallyMappableToU64 + fmt::Debug,
>(
&mut self, &mut self,
field: Field, field: Field,
fastfield_accessor: impl Column<T>, fastfield_accessor: impl Column<T>,

@@ -678,7 +678,7 @@ impl IndexWriter {
/// only after calling `commit()`. /// only after calling `commit()`.
#[doc(hidden)] #[doc(hidden)]
pub fn delete_query(&self, query: Box<dyn Query>) -> crate::Result<Opstamp> { pub fn delete_query(&self, query: Box<dyn Query>) -> crate::Result<Opstamp> {
let weight = query.weight(EnableScoring::disabled_from_schema(&self.index.schema()))?; let weight = query.weight(EnableScoring::Disabled(&self.index.schema()))?;
let opstamp = self.stamper.stamp(); let opstamp = self.stamper.stamp();
let delete_operation = DeleteOperation { let delete_operation = DeleteOperation {
opstamp, opstamp,
@@ -759,8 +759,7 @@ impl IndexWriter {
match user_op { match user_op {
UserOperation::Delete(term) => { UserOperation::Delete(term) => {
let query = TermQuery::new(term, IndexRecordOption::Basic); let query = TermQuery::new(term, IndexRecordOption::Basic);
let weight = let weight = query.weight(EnableScoring::Disabled(&self.index.schema()))?;
query.weight(EnableScoring::disabled_from_schema(&self.index.schema()))?;
let delete_operation = DeleteOperation { let delete_operation = DeleteOperation {
opstamp, opstamp,
target: weight, target: weight,

@@ -89,11 +89,11 @@ pub(crate) fn index_json_values<'a>(
Ok(()) Ok(())
} }
fn index_json_object( fn index_json_object<'a>(
doc: DocId, doc: DocId,
json_value: &serde_json::Map<String, serde_json::Value>, json_value: &serde_json::Map<String, serde_json::Value>,
text_analyzer: &TextAnalyzer, text_analyzer: &TextAnalyzer,
json_term_writer: &mut JsonTermWriter, json_term_writer: &mut JsonTermWriter<'a>,
postings_writer: &mut dyn PostingsWriter, postings_writer: &mut dyn PostingsWriter,
ctx: &mut IndexingContext, ctx: &mut IndexingContext,
positions_per_path: &mut IndexingPositionsPerPath, positions_per_path: &mut IndexingPositionsPerPath,
@@ -113,11 +113,11 @@ fn index_json_object(
} }
} }
fn index_json_value( fn index_json_value<'a>(
doc: DocId, doc: DocId,
json_value: &serde_json::Value, json_value: &serde_json::Value,
text_analyzer: &TextAnalyzer, text_analyzer: &TextAnalyzer,
json_term_writer: &mut JsonTermWriter, json_term_writer: &mut JsonTermWriter<'a>,
postings_writer: &mut dyn PostingsWriter, postings_writer: &mut dyn PostingsWriter,
ctx: &mut IndexingContext, ctx: &mut IndexingContext,
positions_per_path: &mut IndexingPositionsPerPath, positions_per_path: &mut IndexingPositionsPerPath,

@@ -13,7 +13,7 @@ use crate::docset::{DocSet, TERMINATED};
use crate::error::DataCorruption; use crate::error::DataCorruption;
use crate::fastfield::{ use crate::fastfield::{
get_fastfield_codecs_for_multivalue, AliveBitSet, Column, CompositeFastFieldSerializer, get_fastfield_codecs_for_multivalue, AliveBitSet, Column, CompositeFastFieldSerializer,
MultiValueIndex, MultiValuedFastFieldReader, MultiValueIndex, MultiValuedFastFieldReader, MultiValuedU128FastFieldReader,
}; };
use crate::fieldnorm::{FieldNormReader, FieldNormReaders, FieldNormsSerializer, FieldNormsWriter}; use crate::fieldnorm::{FieldNormReader, FieldNormReaders, FieldNormsSerializer, FieldNormsWriter};
use crate::indexer::doc_id_mapping::{expect_field_id_for_sort_field, SegmentDocIdMapping}; use crate::indexer::doc_id_mapping::{expect_field_id_for_sort_field, SegmentDocIdMapping};
@@ -331,18 +331,18 @@ impl IndexMerger {
fast_field_serializer: &mut CompositeFastFieldSerializer, fast_field_serializer: &mut CompositeFastFieldSerializer,
doc_id_mapping: &SegmentDocIdMapping, doc_id_mapping: &SegmentDocIdMapping,
) -> crate::Result<()> { ) -> crate::Result<()> {
let segment_and_ff_readers: Vec<(&SegmentReader, MultiValuedFastFieldReader<u128>)> = self let segment_and_ff_readers: Vec<(&SegmentReader, MultiValuedU128FastFieldReader<u128>)> =
.readers self.readers
.iter() .iter()
.map(|segment_reader| { .map(|segment_reader| {
let ff_reader: MultiValuedFastFieldReader<u128> = let ff_reader: MultiValuedU128FastFieldReader<u128> =
segment_reader.fast_fields().u128s(field).expect( segment_reader.fast_fields().u128s(field).expect(
"Failed to find index for multivalued field. This is a bug in tantivy, \ "Failed to find index for multivalued field. This is a bug in \
please report.", tantivy, please report.",
); );
(segment_reader, ff_reader) (segment_reader, ff_reader)
}) })
.collect::<Vec<_>>(); .collect::<Vec<_>>();
Self::write_1_n_fast_field_idx_generic( Self::write_1_n_fast_field_idx_generic(
field, field,

@@ -577,7 +577,7 @@ impl SegmentUpdater {
for merge_operation in merge_candidates { for merge_operation in merge_candidates {
// If a merge cannot be started this is not a fatal error. // If a merge cannot be started this is not a fatal error.
// We do log a warning in `start_merge`. // We do log a warning in `start_merge`.
drop(self.start_merge(merge_operation)); let _ = self.start_merge(merge_operation);
} }
} }

@@ -1,14 +1,17 @@
#![doc(html_logo_url = "http://fulmicoton.com/tantivy-logo/tantivy-logo.png")] #![doc(html_logo_url = "http://fulmicoton.com/tantivy-logo/tantivy-logo.png")]
#![cfg_attr(all(feature = "unstable", test), feature(test))] #![cfg_attr(all(feature = "unstable", test), feature(test))]
#![cfg_attr(
feature = "cargo-clippy",
allow(
clippy::module_inception,
clippy::needless_range_loop,
clippy::bool_assert_comparison
)
)]
#![doc(test(attr(allow(unused_variables), deny(warnings))))] #![doc(test(attr(allow(unused_variables), deny(warnings))))]
#![warn(missing_docs)] #![warn(missing_docs)]
#![allow( #![allow(clippy::len_without_is_empty)]
clippy::len_without_is_empty, #![allow(clippy::derive_partial_eq_without_eq)]
clippy::derive_partial_eq_without_eq,
clippy::module_inception,
clippy::needless_range_loop,
clippy::bool_assert_comparison
)]
//! # `tantivy` //! # `tantivy`
//! //!
@@ -141,7 +144,7 @@ use crate::time::{OffsetDateTime, PrimitiveDateTime, UtcOffset};
/// All constructors and conversions are provided as explicit /// All constructors and conversions are provided as explicit
/// functions and not by implementing any `From`/`Into` traits /// functions and not by implementing any `From`/`Into` traits
/// to prevent unintended usage. /// to prevent unintended usage.
#[derive(Clone, Default, Copy, PartialEq, Eq, PartialOrd, Ord)] #[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
pub struct DateTime { pub struct DateTime {
// Timestamp in microseconds. // Timestamp in microseconds.
pub(crate) timestamp_micros: i64, pub(crate) timestamp_micros: i64,

@@ -291,7 +291,7 @@ pub mod tests {
const PADDING_VALUE: u32 = 234_234_345u32; const PADDING_VALUE: u32 = 234_234_345u32;
let expected_length = 154; let expected_length = 154;
let mut encoder = BlockEncoder::new(); let mut encoder = BlockEncoder::new();
let input: Vec<u32> = (0u32..123u32).map(|i| 4 + i * 7 / 2).collect(); let input: Vec<u32> = (0u32..123u32).map(|i| 4 + i * 7 / 2).into_iter().collect();
for offset in &[0u32, 1u32, 2u32] { for offset in &[0u32, 1u32, 2u32] {
let encoded_data = encoder.compress_vint_sorted(&input, *offset); let encoded_data = encoder.compress_vint_sorted(&input, *offset);
assert!(encoded_data.len() <= expected_length); assert!(encoded_data.len() <= expected_length);

@@ -631,7 +631,7 @@ mod bench {
let mut segment_postings = segment_reader let mut segment_postings = segment_reader
.inverted_index(TERM_A.field()) .inverted_index(TERM_A.field())
.unwrap() .unwrap()
.read_postings(&TERM_A, IndexRecordOption::Basic) .read_postings(&*TERM_A, IndexRecordOption::Basic)
.unwrap() .unwrap()
.unwrap(); .unwrap();
while segment_postings.advance() != TERMINATED {} while segment_postings.advance() != TERMINATED {}
@@ -647,25 +647,25 @@ mod bench {
let segment_postings_a = segment_reader let segment_postings_a = segment_reader
.inverted_index(TERM_A.field()) .inverted_index(TERM_A.field())
.unwrap() .unwrap()
.read_postings(&TERM_A, IndexRecordOption::Basic) .read_postings(&*TERM_A, IndexRecordOption::Basic)
.unwrap() .unwrap()
.unwrap(); .unwrap();
let segment_postings_b = segment_reader let segment_postings_b = segment_reader
.inverted_index(TERM_B.field()) .inverted_index(TERM_B.field())
.unwrap() .unwrap()
.read_postings(&TERM_B, IndexRecordOption::Basic) .read_postings(&*TERM_B, IndexRecordOption::Basic)
.unwrap() .unwrap()
.unwrap(); .unwrap();
let segment_postings_c = segment_reader let segment_postings_c = segment_reader
.inverted_index(TERM_C.field()) .inverted_index(TERM_C.field())
.unwrap() .unwrap()
.read_postings(&TERM_C, IndexRecordOption::Basic) .read_postings(&*TERM_C, IndexRecordOption::Basic)
.unwrap() .unwrap()
.unwrap(); .unwrap();
let segment_postings_d = segment_reader let segment_postings_d = segment_reader
.inverted_index(TERM_D.field()) .inverted_index(TERM_D.field())
.unwrap() .unwrap()
.read_postings(&TERM_D, IndexRecordOption::Basic) .read_postings(&*TERM_D, IndexRecordOption::Basic)
.unwrap() .unwrap()
.unwrap(); .unwrap();
let mut intersection = Intersection::new(vec![ let mut intersection = Intersection::new(vec![
@@ -687,7 +687,7 @@ mod bench {
let mut segment_postings = segment_reader let mut segment_postings = segment_reader
.inverted_index(TERM_A.field()) .inverted_index(TERM_A.field())
.unwrap() .unwrap()
.read_postings(&TERM_A, IndexRecordOption::Basic) .read_postings(&*TERM_A, IndexRecordOption::Basic)
.unwrap() .unwrap()
.unwrap(); .unwrap();
@@ -705,7 +705,7 @@ mod bench {
let mut segment_postings = segment_reader let mut segment_postings = segment_reader
.inverted_index(TERM_A.field()) .inverted_index(TERM_A.field())
.unwrap() .unwrap()
.read_postings(&TERM_A, IndexRecordOption::Basic) .read_postings(&*TERM_A, IndexRecordOption::Basic)
.unwrap() .unwrap()
.unwrap(); .unwrap();
for doc in &existing_docs { for doc in &existing_docs {
@@ -746,7 +746,7 @@ mod bench {
let mut segment_postings = segment_reader let mut segment_postings = segment_reader
.inverted_index(TERM_A.field()) .inverted_index(TERM_A.field())
.unwrap() .unwrap()
.read_postings(&TERM_A, IndexRecordOption::Basic) .read_postings(&*TERM_A, IndexRecordOption::Basic)
.unwrap() .unwrap()
.unwrap(); .unwrap();
let mut s = 0u32; let mut s = 0u32;

@@ -95,7 +95,7 @@ mod tests {
let index = create_test_index()?; let index = create_test_index()?;
let reader = index.reader()?; let reader = index.reader()?;
let searcher = reader.searcher(); let searcher = reader.searcher();
let weight = AllQuery.weight(EnableScoring::disabled_from_schema(&index.schema()))?; let weight = AllQuery.weight(EnableScoring::Disabled(&index.schema()))?;
{ {
let reader = searcher.segment_reader(0); let reader = searcher.segment_reader(0);
let mut scorer = weight.scorer(reader, 1.0)?; let mut scorer = weight.scorer(reader, 1.0)?;
@@ -118,7 +118,7 @@ mod tests {
let index = create_test_index()?; let index = create_test_index()?;
let reader = index.reader()?; let reader = index.reader()?;
let searcher = reader.searcher(); let searcher = reader.searcher();
let weight = AllQuery.weight(EnableScoring::disabled_from_schema(searcher.schema()))?; let weight = AllQuery.weight(EnableScoring::Disabled(searcher.schema()))?;
let reader = searcher.segment_reader(0); let reader = searcher.segment_reader(0);
{ {
let mut scorer = weight.scorer(reader, 2.0)?; let mut scorer = weight.scorer(reader, 2.0)?;

@@ -146,7 +146,7 @@ impl Query for BooleanQuery {
let sub_weights = self let sub_weights = self
.subqueries .subqueries
.iter() .iter()
.map(|(occur, subquery)| Ok((*occur, subquery.weight(enable_scoring)?))) .map(|&(ref occur, ref subquery)| Ok((*occur, subquery.weight(enable_scoring)?)))
.collect::<crate::Result<_>>()?; .collect::<crate::Result<_>>()?;
Ok(Box::new(BooleanWeight::new( Ok(Box::new(BooleanWeight::new(
sub_weights, sub_weights,

@@ -91,7 +91,7 @@ impl<TScoreCombiner: ScoreCombiner> BooleanWeight<TScoreCombiner> {
boost: Score, boost: Score,
) -> crate::Result<HashMap<Occur, Vec<Box<dyn Scorer>>>> { ) -> crate::Result<HashMap<Occur, Vec<Box<dyn Scorer>>>> {
let mut per_occur_scorers: HashMap<Occur, Vec<Box<dyn Scorer>>> = HashMap::new(); let mut per_occur_scorers: HashMap<Occur, Vec<Box<dyn Scorer>>> = HashMap::new();
for (occur, subweight) in &self.weights { for &(ref occur, ref subweight) in &self.weights {
let sub_scorer: Box<dyn Scorer> = subweight.scorer(reader, boost)?; let sub_scorer: Box<dyn Scorer> = subweight.scorer(reader, boost)?;
per_occur_scorers per_occur_scorers
.entry(*occur) .entry(*occur)
@@ -191,7 +191,7 @@ impl<TScoreCombiner: ScoreCombiner + Sync> Weight for BooleanWeight<TScoreCombin
} }
let mut explanation = Explanation::new("BooleanClause. Sum of ...", scorer.score()); let mut explanation = Explanation::new("BooleanClause. Sum of ...", scorer.score());
for (occur, subweight) in &self.weights { for &(ref occur, ref subweight) in &self.weights {
if is_positive_occur(*occur) { if is_positive_occur(*occur) {
if let Ok(child_explanation) = subweight.explain(reader, doc) { if let Ok(child_explanation) = subweight.explain(reader, doc) {
explanation.add_detail(child_explanation); explanation.add_detail(child_explanation);

@@ -98,7 +98,7 @@ mod tests {
} }
{ {
let query = query_parser.parse_query("+a b")?; let query = query_parser.parse_query("+a b")?;
let weight = query.weight(EnableScoring::disabled_from_schema(searcher.schema()))?; let weight = query.weight(EnableScoring::Disabled(searcher.schema()))?;
let scorer = weight.scorer(searcher.segment_reader(0u32), 1.0)?; let scorer = weight.scorer(searcher.segment_reader(0u32), 1.0)?;
assert!(scorer.is::<TermScorer>()); assert!(scorer.is::<TermScorer>());
} }

@@ -1,5 +1,8 @@
use std::collections::HashMap;
use std::ops::Range;
use levenshtein_automata::{Distance, LevenshteinAutomatonBuilder, DFA}; use levenshtein_automata::{Distance, LevenshteinAutomatonBuilder, DFA};
use once_cell::sync::OnceCell; use once_cell::sync::Lazy;
use tantivy_fst::Automaton; use tantivy_fst::Automaton;
use crate::query::{AutomatonWeight, EnableScoring, Query, Weight}; use crate::query::{AutomatonWeight, EnableScoring, Query, Weight};
@@ -31,6 +34,22 @@ impl Automaton for DfaWrapper {
} }
} }
/// A range of Levenshtein distances that we will build DFAs for our terms
/// The computation is exponential, so best keep it to low single digits
const VALID_LEVENSHTEIN_DISTANCE_RANGE: Range<u8> = 0..3;
static LEV_BUILDER: Lazy<HashMap<(u8, bool), LevenshteinAutomatonBuilder>> = Lazy::new(|| {
let mut lev_builder_cache = HashMap::new();
// TODO make population lazy on a `(distance, val)` basis
for distance in VALID_LEVENSHTEIN_DISTANCE_RANGE {
for &transposition in &[false, true] {
let lev_automaton_builder = LevenshteinAutomatonBuilder::new(distance, transposition);
lev_builder_cache.insert((distance, transposition), lev_automaton_builder);
}
}
lev_builder_cache
});
/// A Fuzzy Query matches all of the documents /// A Fuzzy Query matches all of the documents
/// containing a specific term that is within /// containing a specific term that is within
/// Levenshtein distance /// Levenshtein distance
@@ -110,39 +129,30 @@ impl FuzzyTermQuery {
} }
fn specialized_weight(&self) -> crate::Result<AutomatonWeight<DfaWrapper>> { fn specialized_weight(&self) -> crate::Result<AutomatonWeight<DfaWrapper>> {
static AUTOMATON_BUILDER: [[OnceCell<LevenshteinAutomatonBuilder>; 2]; 3] = [ // LEV_BUILDER is a HashMap, whose `get` method returns an Option
[OnceCell::new(), OnceCell::new()], match LEV_BUILDER.get(&(self.distance, self.transposition_cost_one)) {
[OnceCell::new(), OnceCell::new()], // Unwrap the option and build the Ok(AutomatonWeight)
[OnceCell::new(), OnceCell::new()], Some(automaton_builder) => {
]; let term_text = self.term.as_str().ok_or_else(|| {
crate::TantivyError::InvalidArgument(
let automaton_builder = AUTOMATON_BUILDER "The fuzzy term query requires a string term.".to_string(),
.get(self.distance as usize) )
.ok_or_else(|| { })?;
InvalidArgument(format!( let automaton = if self.prefix {
"Levenshtein distance of {} is not allowed. Choose a value less than {}", automaton_builder.build_prefix_dfa(term_text)
self.distance, } else {
AUTOMATON_BUILDER.len() automaton_builder.build_dfa(term_text)
};
Ok(AutomatonWeight::new(
self.term.field(),
DfaWrapper(automaton),
)) ))
})? }
.get(self.transposition_cost_one as usize) None => Err(InvalidArgument(format!(
.unwrap() "Levenshtein distance of {} is not allowed. Choose a value in the {:?} range",
.get_or_init(|| { self.distance, VALID_LEVENSHTEIN_DISTANCE_RANGE
LevenshteinAutomatonBuilder::new(self.distance, self.transposition_cost_one) ))),
}); }
let term_text = self.term.as_str().ok_or_else(|| {
InvalidArgument("The fuzzy term query requires a string term.".to_string())
})?;
let automaton = if self.prefix {
automaton_builder.build_prefix_dfa(term_text)
} else {
automaton_builder.build_dfa(term_text)
};
Ok(AutomatonWeight::new(
self.term.field(),
DfaWrapper(automaton),
))
} }
} }
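
Both sides of this hunk cache one `LevenshteinAutomatonBuilder` per `(distance, transposition)` pair, because constructing the automaton is expensive (roughly exponential in the distance), so only small distances are supported. A standalone sketch of the per-slot lazy cache shape, using `std::sync::OnceLock` (the std counterpart of `once_cell`) and a dummy builder type standing in for the real one:

use std::sync::OnceLock;

// Stand-in for LevenshteinAutomatonBuilder; imagine an expensive DFA construction here.
struct ExpensiveBuilder {
    distance: u8,
    transposition: bool,
}

impl ExpensiveBuilder {
    fn new(distance: u8, transposition: bool) -> Self {
        ExpensiveBuilder { distance, transposition }
    }
}

// One lazily initialized slot per combination: [distance 0..=2][transposition false/true].
static BUILDERS: [[OnceLock<ExpensiveBuilder>; 2]; 3] = [
    [OnceLock::new(), OnceLock::new()],
    [OnceLock::new(), OnceLock::new()],
    [OnceLock::new(), OnceLock::new()],
];

fn builder_for(distance: u8, transposition: bool) -> Option<&'static ExpensiveBuilder> {
    let row = BUILDERS.get(distance as usize)?; // out-of-range distances are rejected
    Some(row[transposition as usize].get_or_init(|| ExpensiveBuilder::new(distance, transposition)))
}

fn main() {
    assert!(builder_for(2, true).is_some());
    assert!(builder_for(9, false).is_none());
    assert_eq!(builder_for(1, false).unwrap().distance, 1);
    assert!(!builder_for(1, false).unwrap().transposition);
}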

@@ -16,6 +16,7 @@ mod phrase_query;
mod query; mod query;
mod query_parser; mod query_parser;
mod range_query; mod range_query;
mod range_query_ip_fastfield;
mod regex_query; mod regex_query;
mod reqopt_scorer; mod reqopt_scorer;
mod scorer; mod scorer;

@@ -45,7 +45,7 @@ impl Query for MoreLikeThisQuery {
fn weight(&self, enable_scoring: EnableScoring<'_>) -> crate::Result<Box<dyn Weight>> { fn weight(&self, enable_scoring: EnableScoring<'_>) -> crate::Result<Box<dyn Weight>> {
let searcher = match enable_scoring { let searcher = match enable_scoring {
EnableScoring::Enabled(searcher) => searcher, EnableScoring::Enabled(searcher) => searcher,
EnableScoring::Disabled { .. } => { EnableScoring::Disabled(_) => {
let err = "MoreLikeThisQuery requires to enable scoring.".to_string(); let err = "MoreLikeThisQuery requires to enable scoring.".to_string();
return Err(crate::TantivyError::InvalidArgument(err)); return Err(crate::TantivyError::InvalidArgument(err));
} }

@@ -80,7 +80,7 @@ pub mod tests {
.collect(); .collect();
let phrase_query = PhraseQuery::new(terms); let phrase_query = PhraseQuery::new(terms);
let phrase_weight = let phrase_weight =
phrase_query.phrase_weight(EnableScoring::disabled_from_schema(searcher.schema()))?; phrase_query.phrase_weight(EnableScoring::Disabled(searcher.schema()))?;
let mut phrase_scorer = phrase_weight.scorer(searcher.segment_reader(0), 1.0)?; let mut phrase_scorer = phrase_weight.scorer(searcher.segment_reader(0), 1.0)?;
assert_eq!(phrase_scorer.doc(), 1); assert_eq!(phrase_scorer.doc(), 1);
assert_eq!(phrase_scorer.advance(), TERMINATED); assert_eq!(phrase_scorer.advance(), TERMINATED);
@@ -361,7 +361,7 @@ pub mod tests {
let query_parser = QueryParser::for_index(&index, vec![json_field]); let query_parser = QueryParser::for_index(&index, vec![json_field]);
let phrase_query = query_parser.parse_query(query).unwrap(); let phrase_query = query_parser.parse_query(query).unwrap();
let phrase_weight = phrase_query let phrase_weight = phrase_query
.weight(EnableScoring::disabled_from_schema(searcher.schema())) .weight(EnableScoring::Disabled(searcher.schema()))
.unwrap(); .unwrap();
let mut phrase_scorer = phrase_weight let mut phrase_scorer = phrase_weight
.scorer(searcher.segment_reader(0), 1.0f32) .scorer(searcher.segment_reader(0), 1.0f32)

@@ -109,7 +109,7 @@ impl PhraseQuery {
let terms = self.phrase_terms(); let terms = self.phrase_terms();
let bm25_weight_opt = match enable_scoring { let bm25_weight_opt = match enable_scoring {
EnableScoring::Enabled(searcher) => Some(Bm25Weight::for_terms(searcher, &terms)?), EnableScoring::Enabled(searcher) => Some(Bm25Weight::for_terms(searcher, &terms)?),
EnableScoring::Disabled { .. } => None, EnableScoring::Disabled(_) => None,
}; };
let mut weight = PhraseWeight::new(self.phrase_terms.clone(), bm25_weight_opt); let mut weight = PhraseWeight::new(self.phrase_terms.clone(), bm25_weight_opt);
if self.slop > 0 { if self.slop > 0 {

@@ -15,55 +15,24 @@ pub enum EnableScoring<'a> {
Enabled(&'a Searcher), Enabled(&'a Searcher),
/// Pass this to disable scoring. /// Pass this to disable scoring.
/// This can improve performance. /// This can improve performance.
Disabled { Disabled(&'a Schema),
/// Schema is required.
schema: &'a Schema,
/// Searcher should be provided if available.
searcher_opt: Option<&'a Searcher>,
},
} }
impl<'a> EnableScoring<'a> { impl<'a> EnableScoring<'a> {
/// Create using [Searcher] with scoring enabled.
pub fn enabled_from_searcher(searcher: &'a Searcher) -> EnableScoring<'a> {
EnableScoring::Enabled(searcher)
}
/// Create using [Searcher] with scoring disabled.
pub fn disabled_from_searcher(searcher: &'a Searcher) -> EnableScoring<'a> {
EnableScoring::Disabled {
schema: searcher.schema(),
searcher_opt: Some(searcher),
}
}
/// Create using [Schema] with scoring disabled.
pub fn disabled_from_schema(schema: &'a Schema) -> EnableScoring<'a> {
Self::Disabled {
schema,
searcher_opt: None,
}
}
/// Returns the searcher if available.
pub fn searcher(&self) -> Option<&Searcher> {
match self {
EnableScoring::Enabled(searcher) => Some(searcher),
EnableScoring::Disabled { searcher_opt, .. } => searcher_opt.to_owned(),
}
}
/// Returns the schema. /// Returns the schema.
pub fn schema(&self) -> &Schema { pub fn schema(&self) -> &Schema {
match self { match self {
EnableScoring::Enabled(searcher) => searcher.schema(), EnableScoring::Enabled(searcher) => searcher.schema(),
EnableScoring::Disabled { schema, .. } => schema, EnableScoring::Disabled(schema) => schema,
} }
} }
/// Returns true if the scoring is enabled. /// Returns true if the scoring is enabled.
pub fn is_scoring_enabled(&self) -> bool { pub fn is_scoring_enabled(&self) -> bool {
matches!(self, EnableScoring::Enabled(..)) match self {
EnableScoring::Enabled(_) => true,
EnableScoring::Disabled(_) => false,
}
} }
} }
@@ -112,14 +81,14 @@ pub trait Query: QueryClone + Send + Sync + downcast_rs::Downcast + fmt::Debug {
/// Returns an `Explanation` for the score of the document. /// Returns an `Explanation` for the score of the document.
fn explain(&self, searcher: &Searcher, doc_address: DocAddress) -> crate::Result<Explanation> { fn explain(&self, searcher: &Searcher, doc_address: DocAddress) -> crate::Result<Explanation> {
let weight = self.weight(EnableScoring::enabled_from_searcher(searcher))?; let weight = self.weight(EnableScoring::Enabled(searcher))?;
let reader = searcher.segment_reader(doc_address.segment_ord); let reader = searcher.segment_reader(doc_address.segment_ord);
weight.explain(reader, doc_address.doc_id) weight.explain(reader, doc_address.doc_id)
} }
/// Returns the number of documents matching the query. /// Returns the number of documents matching the query.
fn count(&self, searcher: &Searcher) -> crate::Result<usize> { fn count(&self, searcher: &Searcher) -> crate::Result<usize> {
let weight = self.weight(EnableScoring::disabled_from_searcher(searcher))?; let weight = self.weight(EnableScoring::Disabled(searcher.schema()))?;
let mut result = 0; let mut result = 0;
for reader in searcher.segment_readers() { for reader in searcher.segment_readers() {
result += weight.count(reader)? as usize; result += weight.count(reader)? as usize;
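A minimal, self-contained sketch of the struct-variant form of `EnableScoring` visible on one side of this hunk, together with the constructor helpers and accessors the diff shows. `Schema` and `Searcher` here are empty stand-ins rather than tantivy's real types, and only the behavior readable from the hunk is reproduced.

```rust
// Stand-in types; tantivy's Schema/Searcher are far richer.
struct Schema;
struct Searcher {
    schema: Schema,
}

impl Searcher {
    fn schema(&self) -> &Schema {
        &self.schema
    }
}

enum EnableScoring<'a> {
    Enabled(&'a Searcher),
    // Struct variant: keeps the schema, plus the searcher when the caller has one.
    Disabled {
        schema: &'a Schema,
        searcher_opt: Option<&'a Searcher>,
    },
}

impl<'a> EnableScoring<'a> {
    fn disabled_from_searcher(searcher: &'a Searcher) -> Self {
        EnableScoring::Disabled {
            schema: searcher.schema(),
            searcher_opt: Some(searcher),
        }
    }

    fn disabled_from_schema(schema: &'a Schema) -> Self {
        EnableScoring::Disabled {
            schema,
            searcher_opt: None,
        }
    }

    fn schema(&self) -> &Schema {
        match self {
            EnableScoring::Enabled(searcher) => searcher.schema(),
            EnableScoring::Disabled { schema, .. } => schema,
        }
    }

    fn is_scoring_enabled(&self) -> bool {
        matches!(self, EnableScoring::Enabled(..))
    }
}

fn main() {
    let searcher = Searcher { schema: Schema };
    // A caller holding only a Schema uses `disabled_from_schema`; a caller
    // holding a Searcher can disable scoring without losing access to it.
    let scoring = EnableScoring::disabled_from_searcher(&searcher);
    assert!(!scoring.is_scoring_enabled());
    let _schema: &Schema = scoring.schema();
    let _also_disabled = EnableScoring::disabled_from_schema(&searcher.schema);
}
```

The upside of the struct variant over `Disabled(&Schema)` is that code paths which happen to own a `Searcher` can still hand it downstream (e.g. for statistics) while keeping scoring off.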

View File

@@ -54,9 +54,9 @@ impl fmt::Debug for LogicalAst {
if clause.is_empty() { if clause.is_empty() {
write!(formatter, "<emptyclause>")?; write!(formatter, "<emptyclause>")?;
} else { } else {
let (occur, subquery) = &clause[0]; let (ref occur, ref subquery) = clause[0];
write!(formatter, "({}{:?}", occur_letter(*occur), subquery)?; write!(formatter, "({}{:?}", occur_letter(*occur), subquery)?;
for (occur, subquery) in &clause[1..] { for &(ref occur, ref subquery) in &clause[1..] {
write!(formatter, " {}{:?}", occur_letter(*occur), subquery)?; write!(formatter, " {}{:?}", occur_letter(*occur), subquery)?;
} }
formatter.write_str(")")?; formatter.write_str(")")?;
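A small standalone illustration of the binding change in this hunk: with match ergonomics, destructuring a borrowed tuple yields references to its fields, so the explicit `ref` patterns on the right-hand side can be dropped. The `clause` vector below is a simplified stand-in for the `(Occur, LogicalAst)` pairs in the real code.

```rust
fn main() {
    let clause: Vec<(char, String)> = vec![
        ('+', "title:foo".to_string()),
        ('-', "body:bar".to_string()),
    ];

    // Older style: `ref` bindings against an indexed place expression.
    let (ref occur_old, ref subquery_old) = clause[0];

    // Newer style: destructure a borrow; both bindings are references.
    let (occur_new, subquery_new) = &clause[0];

    assert_eq!(occur_old, occur_new);
    assert_eq!(subquery_old, subquery_new);

    // The same applies to the `for` loop over the remaining clauses.
    for (occur, subquery) in &clause[1..] {
        println!("{}{:?}", occur, subquery);
    }
}
```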

View File

@@ -1,11 +1,9 @@
use std::collections::HashMap;
use std::net::{AddrParseError, IpAddr}; use std::net::{AddrParseError, IpAddr};
use std::num::{ParseFloatError, ParseIntError}; use std::num::{ParseFloatError, ParseIntError};
use std::ops::Bound; use std::ops::Bound;
use std::str::{FromStr, ParseBoolError}; use std::str::{FromStr, ParseBoolError};
use base64::engine::general_purpose::STANDARD as BASE64;
use base64::Engine;
use rustc_hash::FxHashMap;
use tantivy_query_grammar::{UserInputAst, UserInputBound, UserInputLeaf, UserInputLiteral}; use tantivy_query_grammar::{UserInputAst, UserInputBound, UserInputLeaf, UserInputLiteral};
use super::logical_ast::*; use super::logical_ast::*;
@@ -13,10 +11,9 @@ use crate::core::Index;
use crate::indexer::{ use crate::indexer::{
convert_to_fast_value_and_get_term, set_string_and_get_terms, JsonTermWriter, convert_to_fast_value_and_get_term, set_string_and_get_terms, JsonTermWriter,
}; };
use crate::query::range_query::is_type_valid_for_fastfield_range_query;
use crate::query::{ use crate::query::{
AllQuery, BooleanQuery, BoostQuery, EmptyQuery, FuzzyTermQuery, Occur, PhraseQuery, Query, AllQuery, BooleanQuery, BoostQuery, EmptyQuery, Occur, PhraseQuery, Query, RangeQuery,
RangeQuery, TermQuery, TermSetQuery, TermQuery, TermSetQuery,
}; };
use crate::schema::{ use crate::schema::{
Facet, FacetParseError, Field, FieldType, IndexRecordOption, IntoIpv6Addr, JsonObjectOptions, Facet, FacetParseError, Field, FieldType, IndexRecordOption, IntoIpv6Addr, JsonObjectOptions,
@@ -37,7 +34,7 @@ pub enum QueryParserError {
#[error("Unsupported query: {0}")] #[error("Unsupported query: {0}")]
UnsupportedQuery(String), UnsupportedQuery(String),
/// The query references a field that is not in the schema /// The query references a field that is not in the schema
#[error("Field does not exist: '{0}'")] #[error("Field does not exists: '{0}'")]
FieldDoesNotExist(String), FieldDoesNotExist(String),
/// The query contains a term for a `u64` or `i64`-field, but the value /// The query contains a term for a `u64` or `i64`-field, but the value
/// is neither. /// is neither.
@@ -163,10 +160,6 @@ fn trim_ast(logical_ast: LogicalAst) -> Option<LogicalAst> {
/// word lexicographically between `a` and `c` (inclusive lower bound, exclusive upper bound). /// word lexicographically between `a` and `c` (inclusive lower bound, exclusive upper bound).
/// Inclusive bounds are `[]`, exclusive are `{}`. /// Inclusive bounds are `[]`, exclusive are `{}`.
/// ///
/// * set terms: Using the `IN` operator, a field can be matched against a set of literals, e.g.
/// `title: IN [a b cd]` will match documents where `title` is either `a`, `b` or `cd`, but do so
/// more efficiently than the alternative query `title:a OR title:b OR title:c` does.
///
/// * date values: The query parser supports rfc3339 formatted dates. For example /// * date values: The query parser supports rfc3339 formatted dates. For example
/// `"2002-10-02T15:00:00.05Z"` or `some_date_field:[2002-10-02T15:00:00Z TO /// `"2002-10-02T15:00:00.05Z"` or `some_date_field:[2002-10-02T15:00:00Z TO
/// 2002-10-02T18:00:00Z}` /// 2002-10-02T18:00:00Z}`
@@ -181,9 +174,6 @@ fn trim_ast(logical_ast: LogicalAst) -> Option<LogicalAst> {
/// (See [`set_field_boost(...)`](QueryParser::set_field_boost)). Typically you may want to boost a /// (See [`set_field_boost(...)`](QueryParser::set_field_boost)). Typically you may want to boost a
/// title field. /// title field.
/// ///
/// Additionally, specific fields can be marked to use fuzzy term queries for each literal
/// via the [`QueryParser::set_field_fuzzy`] method.
///
/// Phrase terms support the `~` slop operator which allows to set the phrase's matching /// Phrase terms support the `~` slop operator which allows to set the phrase's matching
/// distance in words. `"big wolf"~1` will return documents containing the phrase `"big bad wolf"`. /// distance in words. `"big wolf"~1` will return documents containing the phrase `"big bad wolf"`.
#[derive(Clone)] #[derive(Clone)]
@@ -192,15 +182,7 @@ pub struct QueryParser {
default_fields: Vec<Field>, default_fields: Vec<Field>,
conjunction_by_default: bool, conjunction_by_default: bool,
tokenizer_manager: TokenizerManager, tokenizer_manager: TokenizerManager,
boost: FxHashMap<Field, Score>, boost: HashMap<Field, Score>,
fuzzy: FxHashMap<Field, Fuzzy>,
}
#[derive(Clone)]
struct Fuzzy {
prefix: bool,
distance: u8,
transpose_cost_one: bool,
} }
fn all_negative(ast: &LogicalAst) -> bool { fn all_negative(ast: &LogicalAst) -> bool {
@@ -228,7 +210,6 @@ impl QueryParser {
tokenizer_manager, tokenizer_manager,
conjunction_by_default: false, conjunction_by_default: false,
boost: Default::default(), boost: Default::default(),
fuzzy: Default::default(),
} }
} }
@@ -266,30 +247,6 @@ impl QueryParser {
self.boost.insert(field, boost); self.boost.insert(field, boost);
} }
/// Sets the given [field][`Field`] to use [fuzzy term queries][`FuzzyTermQuery`]
///
/// If set, the parse will produce queries using fuzzy term queries
/// with the given parameters for each literal matched against the given field.
///
/// See the [`FuzzyTermQuery::new`] and [`FuzzyTermQuery::new_prefix`] methods
/// for the meaning of the individual parameters.
pub fn set_field_fuzzy(
&mut self,
field: Field,
prefix: bool,
distance: u8,
transpose_cost_one: bool,
) {
self.fuzzy.insert(
field,
Fuzzy {
prefix,
distance,
transpose_cost_one,
},
);
}
/// Parse a query /// Parse a query
/// ///
/// Note that `parse_query` returns an error if the input /// Note that `parse_query` returns an error if the input
@@ -302,7 +259,7 @@ impl QueryParser {
/// in [Issue 5](https://github.com/fulmicoton/tantivy/issues/5) /// in [Issue 5](https://github.com/fulmicoton/tantivy/issues/5)
pub fn parse_query(&self, query: &str) -> Result<Box<dyn Query>, QueryParserError> { pub fn parse_query(&self, query: &str) -> Result<Box<dyn Query>, QueryParserError> {
let logical_ast = self.parse_query_to_logical_ast(query)?; let logical_ast = self.parse_query_to_logical_ast(query)?;
Ok(convert_to_query(&self.fuzzy, logical_ast)) Ok(convert_to_query(logical_ast))
} }
/// Parse the user query into an AST. /// Parse the user query into an AST.
@@ -336,10 +293,9 @@ impl QueryParser {
) -> Result<Term, QueryParserError> { ) -> Result<Term, QueryParserError> {
let field_entry = self.schema.get_field_entry(field); let field_entry = self.schema.get_field_entry(field);
let field_type = field_entry.field_type(); let field_type = field_entry.field_type();
let field_supports_ff_range_queries = field_type.is_fast()
&& is_type_valid_for_fastfield_range_query(field_type.value_type());
if !field_type.is_indexed() && !field_supports_ff_range_queries { let is_ip_and_fast = field_type.is_ip_addr() && field_type.is_fast();
if !field_type.is_indexed() && !is_ip_and_fast {
return Err(QueryParserError::FieldNotIndexed( return Err(QueryParserError::FieldNotIndexed(
field_entry.name().to_string(), field_entry.name().to_string(),
)); ));
@@ -407,9 +363,7 @@ impl QueryParser {
Err(e) => Err(QueryParserError::from(e)), Err(e) => Err(QueryParserError::from(e)),
}, },
FieldType::Bytes(_) => { FieldType::Bytes(_) => {
let bytes = BASE64 let bytes = base64::decode(phrase).map_err(QueryParserError::ExpectedBase64)?;
.decode(phrase)
.map_err(QueryParserError::ExpectedBase64)?;
Ok(Term::from_field_bytes(field, &bytes)) Ok(Term::from_field_bytes(field, &bytes))
} }
FieldType::IpAddr(_) => { FieldType::IpAddr(_) => {
@@ -504,9 +458,7 @@ impl QueryParser {
Err(e) => Err(QueryParserError::from(e)), Err(e) => Err(QueryParserError::from(e)),
}, },
FieldType::Bytes(_) => { FieldType::Bytes(_) => {
let bytes = BASE64 let bytes = base64::decode(phrase).map_err(QueryParserError::ExpectedBase64)?;
.decode(phrase)
.map_err(QueryParserError::ExpectedBase64)?;
let bytes_term = Term::from_field_bytes(field, &bytes); let bytes_term = Term::from_field_bytes(field, &bytes);
Ok(vec![LogicalLiteral::Term(bytes_term)]) Ok(vec![LogicalLiteral::Term(bytes_term)])
} }
@@ -708,30 +660,9 @@ impl QueryParser {
} }
} }
fn convert_literal_to_query( fn convert_literal_to_query(logical_literal: LogicalLiteral) -> Box<dyn Query> {
fuzzy: &FxHashMap<Field, Fuzzy>,
logical_literal: LogicalLiteral,
) -> Box<dyn Query> {
match logical_literal { match logical_literal {
LogicalLiteral::Term(term) => { LogicalLiteral::Term(term) => Box::new(TermQuery::new(term, IndexRecordOption::WithFreqs)),
if let Some(fuzzy) = fuzzy.get(&term.field()) {
if fuzzy.prefix {
Box::new(FuzzyTermQuery::new_prefix(
term,
fuzzy.distance,
fuzzy.transpose_cost_one,
))
} else {
Box::new(FuzzyTermQuery::new(
term,
fuzzy.distance,
fuzzy.transpose_cost_one,
))
}
} else {
Box::new(TermQuery::new(term, IndexRecordOption::WithFreqs))
}
}
LogicalLiteral::Phrase(term_with_offsets, slop) => Box::new( LogicalLiteral::Phrase(term_with_offsets, slop) => Box::new(
PhraseQuery::new_with_offset_and_slop(term_with_offsets, slop), PhraseQuery::new_with_offset_and_slop(term_with_offsets, slop),
), ),
@@ -824,12 +755,12 @@ fn generate_literals_for_json_object(
Ok(logical_literals) Ok(logical_literals)
} }
fn convert_to_query(fuzzy: &FxHashMap<Field, Fuzzy>, logical_ast: LogicalAst) -> Box<dyn Query> { fn convert_to_query(logical_ast: LogicalAst) -> Box<dyn Query> {
match trim_ast(logical_ast) { match trim_ast(logical_ast) {
Some(LogicalAst::Clause(trimmed_clause)) => { Some(LogicalAst::Clause(trimmed_clause)) => {
let occur_subqueries = trimmed_clause let occur_subqueries = trimmed_clause
.into_iter() .into_iter()
.map(|(occur, subquery)| (occur, convert_to_query(fuzzy, subquery))) .map(|(occur, subquery)| (occur, convert_to_query(subquery)))
.collect::<Vec<_>>(); .collect::<Vec<_>>();
assert!( assert!(
!occur_subqueries.is_empty(), !occur_subqueries.is_empty(),
@@ -838,10 +769,10 @@ fn convert_to_query(fuzzy: &FxHashMap<Field, Fuzzy>, logical_ast: LogicalAst) ->
Box::new(BooleanQuery::new(occur_subqueries)) Box::new(BooleanQuery::new(occur_subqueries))
} }
Some(LogicalAst::Leaf(trimmed_logical_literal)) => { Some(LogicalAst::Leaf(trimmed_logical_literal)) => {
convert_literal_to_query(fuzzy, *trimmed_logical_literal) convert_literal_to_query(*trimmed_logical_literal)
} }
Some(LogicalAst::Boost(ast, boost)) => { Some(LogicalAst::Boost(ast, boost)) => {
let query = convert_to_query(fuzzy, *ast); let query = convert_to_query(*ast);
let boosted_query = BoostQuery::new(query, boost); let boosted_query = BoostQuery::new(query, boost);
Box::new(boosted_query) Box::new(boosted_query)
} }
@@ -857,7 +788,7 @@ mod test {
use super::{QueryParser, QueryParserError}; use super::{QueryParser, QueryParserError};
use crate::query::Query; use crate::query::Query;
use crate::schema::{ use crate::schema::{
FacetOptions, Field, IndexRecordOption, Schema, Term, TextFieldIndexing, TextOptions, FAST, FacetOptions, Field, IndexRecordOption, Schema, Term, TextFieldIndexing, TextOptions,
INDEXED, STORED, STRING, TEXT, INDEXED, STORED, STRING, TEXT,
}; };
use crate::tokenizer::{ use crate::tokenizer::{
@@ -891,7 +822,6 @@ mod test {
schema_builder.add_json_field("json_not_indexed", STORED); schema_builder.add_json_field("json_not_indexed", STORED);
schema_builder.add_bool_field("bool", INDEXED); schema_builder.add_bool_field("bool", INDEXED);
schema_builder.add_bool_field("notindexed_bool", STORED); schema_builder.add_bool_field("notindexed_bool", STORED);
schema_builder.add_u64_field("u64_ff", FAST);
schema_builder.build() schema_builder.build()
} }
@@ -1347,11 +1277,6 @@ mod test {
r#"(Excluded(Term(type=F64, field=10, -1.5)) TO Excluded(Term(type=F64, field=10, 1.5)))"#, r#"(Excluded(Term(type=F64, field=10, -1.5)) TO Excluded(Term(type=F64, field=10, 1.5)))"#,
false, false,
); );
test_parse_query_to_logical_ast_helper(
"u64_ff:[7 TO 77]",
r#"(Included(Term(type=U64, field=18, 7)) TO Included(Term(type=U64, field=18, 77)))"#,
false,
);
} }
#[test] #[test]
@@ -1643,41 +1568,4 @@ mod test {
false, false,
); );
} }
#[test]
pub fn test_set_field_fuzzy() {
{
let mut query_parser = make_query_parser();
query_parser.set_field_fuzzy(
query_parser.schema.get_field("title").unwrap(),
false,
1,
true,
);
let query = query_parser.parse_query("abc").unwrap();
assert_eq!(
format!("{:?}", query),
"BooleanQuery { subqueries: [(Should, FuzzyTermQuery { term: Term(type=Str, \
field=0, \"abc\"), distance: 1, transposition_cost_one: true, prefix: false }), \
(Should, TermQuery(Term(type=Str, field=1, \"abc\")))] }"
);
}
{
let mut query_parser = make_query_parser();
query_parser.set_field_fuzzy(
query_parser.schema.get_field("text").unwrap(),
true,
2,
false,
);
let query = query_parser.parse_query("abc").unwrap();
assert_eq!(
format!("{:?}", query),
"BooleanQuery { subqueries: [(Should, TermQuery(Term(type=Str, field=0, \
\"abc\"))), (Should, FuzzyTermQuery { term: Term(type=Str, field=1, \"abc\"), \
distance: 2, transposition_cost_one: false, prefix: true })] }"
);
}
}
} }
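A standalone sketch of the per-field fuzzy dispatch that one side of this hunk performs in `convert_literal_to_query`: a map from field to fuzzy settings is consulted, and a literal becomes a fuzzy term query only when its field has an entry. `Field`, `LiteralQuery`, and the plain `HashMap` below are stand-ins for tantivy's `Field`, `FuzzyTermQuery`/`TermQuery`, and `FxHashMap`.

```rust
use std::collections::HashMap;

type Field = u32; // stand-in for tantivy's Field

#[derive(Clone)]
struct Fuzzy {
    prefix: bool,
    distance: u8,
    transpose_cost_one: bool,
}

#[derive(Debug)]
enum LiteralQuery {
    Exact { field: Field, text: String },
    Fuzzy { field: Field, text: String, distance: u8, prefix: bool, transpose_cost_one: bool },
}

fn convert_literal(fuzzy: &HashMap<Field, Fuzzy>, field: Field, text: &str) -> LiteralQuery {
    match fuzzy.get(&field) {
        // A fuzzy configuration registered for this field wins over the
        // default exact term query.
        Some(cfg) => LiteralQuery::Fuzzy {
            field,
            text: text.to_string(),
            distance: cfg.distance,
            prefix: cfg.prefix,
            transpose_cost_one: cfg.transpose_cost_one,
        },
        None => LiteralQuery::Exact {
            field,
            text: text.to_string(),
        },
    }
}

fn main() {
    let mut fuzzy = HashMap::new();
    // Hypothetical field id 0 standing in for a "title" field.
    fuzzy.insert(0, Fuzzy { prefix: false, distance: 1, transpose_cost_one: true });

    println!("{:?}", convert_literal(&fuzzy, 0, "abc")); // fuzzy query
    println!("{:?}", convert_literal(&fuzzy, 1, "abc")); // exact term query
}
```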

View File

@@ -1,17 +1,16 @@
use std::io; use std::io;
use std::ops::{Bound, Range}; use std::ops::{Bound, Range};
use common::{BinarySerializable, BitSet}; use common::BitSet;
use super::range_query_u64_fastfield::FastFieldRangeWeight;
use crate::core::SegmentReader; use crate::core::SegmentReader;
use crate::error::TantivyError; use crate::error::TantivyError;
use crate::query::explanation::does_not_match; use crate::query::explanation::does_not_match;
use crate::query::range_query::range_query_ip_fastfield::IPFastFieldRangeWeight; use crate::query::range_query_ip_fastfield::IPFastFieldRangeWeight;
use crate::query::{BitSetDocSet, ConstScorer, EnableScoring, Explanation, Query, Scorer, Weight}; use crate::query::{BitSetDocSet, ConstScorer, EnableScoring, Explanation, Query, Scorer, Weight};
use crate::schema::{Field, IndexRecordOption, Term, Type}; use crate::schema::{Field, IndexRecordOption, Term, Type};
use crate::termdict::{TermDictionary, TermStreamer}; use crate::termdict::{TermDictionary, TermStreamer};
use crate::{DateTime, DocId, Score}; use crate::{DocId, Score};
pub(crate) fn map_bound<TFrom, TTo, Transform: Fn(&TFrom) -> TTo>( pub(crate) fn map_bound<TFrom, TTo, Transform: Fn(&TFrom) -> TTo>(
bound: &Bound<TFrom>, bound: &Bound<TFrom>,
@@ -204,40 +203,6 @@ impl RangeQuery {
) )
} }
/// Create a new `RangeQuery` over a `date` field.
///
/// The two `Bound` arguments make it possible to create more complex
/// ranges than semi-inclusive range.
///
/// If the field is not of the type `date`, tantivy
/// will panic when the `Weight` object is created.
pub fn new_date_bounds(
field: Field,
left_bound: Bound<DateTime>,
right_bound: Bound<DateTime>,
) -> RangeQuery {
let make_term_val =
|val: &DateTime| Term::from_field_date(field, *val).value_bytes().to_owned();
RangeQuery {
field,
value_type: Type::Date,
left_bound: map_bound(&left_bound, &make_term_val),
right_bound: map_bound(&right_bound, &make_term_val),
}
}
/// Create a new `RangeQuery` over a `date` field.
///
/// If the field is not of the type `date`, tantivy
/// will panic when the `Weight` object is created.
pub fn new_date(field: Field, range: Range<DateTime>) -> RangeQuery {
RangeQuery::new_date_bounds(
field,
Bound::Included(range.start),
Bound::Excluded(range.end),
)
}
/// Create a new `RangeQuery` over a `Str` field. /// Create a new `RangeQuery` over a `Str` field.
/// ///
/// The two `Bound` arguments make it possible to create more complex /// The two `Bound` arguments make it possible to create more complex
@@ -287,23 +252,6 @@ impl RangeQuery {
} }
} }
pub(crate) fn is_type_valid_for_fastfield_range_query(typ: Type) -> bool {
match typ {
Type::U64 | Type::I64 | Type::F64 | Type::Bool | Type::Date => true,
Type::IpAddr => true,
Type::Str | Type::Facet | Type::Bytes | Type::Json => false,
}
}
/// Returns true if the type maps to a u64 fast field
pub(crate) fn maps_to_u64_fastfield(typ: Type) -> bool {
match typ {
Type::U64 | Type::I64 | Type::F64 | Type::Bool | Type::Date => true,
Type::IpAddr => false,
Type::Str | Type::Facet | Type::Bytes | Type::Json => false,
}
}
impl Query for RangeQuery { impl Query for RangeQuery {
fn weight(&self, enable_scoring: EnableScoring<'_>) -> crate::Result<Box<dyn Weight>> { fn weight(&self, enable_scoring: EnableScoring<'_>) -> crate::Result<Box<dyn Weight>> {
let schema = enable_scoring.schema(); let schema = enable_scoring.schema();
@@ -317,29 +265,12 @@ impl Query for RangeQuery {
return Err(TantivyError::SchemaError(err_msg)); return Err(TantivyError::SchemaError(err_msg));
} }
if field_type.is_fast() && is_type_valid_for_fastfield_range_query(self.value_type) { if field_type.is_ip_addr() && field_type.is_fast() {
if field_type.is_ip_addr() { Ok(Box::new(IPFastFieldRangeWeight::new(
Ok(Box::new(IPFastFieldRangeWeight::new( self.field,
self.field, &self.left_bound,
&self.left_bound, &self.right_bound,
&self.right_bound, )))
)))
} else {
// We run the range query on u64 value space for performance reasons and simpicity
// assert the type maps to u64
assert!(maps_to_u64_fastfield(self.value_type));
let parse_from_bytes = |data: &Vec<u8>| {
u64::from_be(BinarySerializable::deserialize(&mut &data[..]).unwrap())
};
let left_bound = map_bound(&self.left_bound, &parse_from_bytes);
let right_bound = map_bound(&self.right_bound, &parse_from_bytes);
Ok(Box::new(FastFieldRangeWeight::new(
self.field,
left_bound,
right_bound,
)))
}
} else { } else {
Ok(Box::new(RangeWeight { Ok(Box::new(RangeWeight {
field: self.field, field: self.field,
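A minimal standalone version of the `map_bound` helper this file relies on: it applies a transform to the value inside a `Bound` while preserving whether the bound is included, excluded, or unbounded. The `parse_u64_be` closure in `main` is only illustrative, assuming an 8-byte big-endian encoding roughly like what the fast-field branch above reconstructs with `BinarySerializable`.

```rust
use std::ops::Bound;

fn map_bound<A, B>(bound: &Bound<A>, transform: impl Fn(&A) -> B) -> Bound<B> {
    match bound {
        Bound::Included(v) => Bound::Included(transform(v)),
        Bound::Excluded(v) => Bound::Excluded(transform(v)),
        Bound::Unbounded => Bound::Unbounded,
    }
}

fn main() {
    // Turn the raw big-endian bytes stored in a term back into a u64,
    // keeping the bound kind intact.
    let parse_u64_be = |data: &Vec<u8>| u64::from_be_bytes(data[..8].try_into().unwrap());

    let left: Bound<Vec<u8>> = Bound::Included(42u64.to_be_bytes().to_vec());
    assert_eq!(map_bound(&left, parse_u64_be), Bound::Included(42u64));
    assert_eq!(map_bound(&Bound::<Vec<u8>>::Unbounded, parse_u64_be), Bound::Unbounded);
}
```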

View File

@@ -1,210 +0,0 @@
use core::fmt;
use std::ops::RangeInclusive;
use std::sync::Arc;
use fastfield_codecs::Column;
use crate::fastfield::{MakeZero, MultiValuedFastFieldReader};
use crate::{DocId, DocSet, TERMINATED};
/// Helper to have a cursor over a vec of docids
struct VecCursor {
docs: Vec<u32>,
current_pos: usize,
}
impl VecCursor {
fn new() -> Self {
Self {
docs: Vec::with_capacity(32),
current_pos: 0,
}
}
fn next(&mut self) -> Option<u32> {
self.current_pos += 1;
self.current()
}
#[inline]
fn current(&self) -> Option<u32> {
self.docs.get(self.current_pos).copied()
}
fn get_cleared_data(&mut self) -> &mut Vec<u32> {
self.docs.clear();
self.current_pos = 0;
&mut self.docs
}
fn last_value(&self) -> Option<u32> {
self.docs.iter().last().cloned()
}
fn is_empty(&self) -> bool {
self.current().is_none()
}
}
pub(crate) enum FastFieldCardinality<T: MakeZero> {
SingleValue(Arc<dyn Column<T>>),
MultiValue(MultiValuedFastFieldReader<T>),
}
impl<T: MakeZero + PartialOrd + Copy + fmt::Debug> FastFieldCardinality<T> {
fn num_docs(&self) -> u32 {
match self {
FastFieldCardinality::SingleValue(single_value) => single_value.num_vals(),
FastFieldCardinality::MultiValue(multi_value) => {
multi_value.get_index_reader().num_docs()
}
}
}
}
pub(crate) struct RangeDocSet<T: MakeZero> {
/// The range filter on the values.
value_range: RangeInclusive<T>,
fast_field: FastFieldCardinality<T>,
/// The next docid start range to fetch (inclusive).
next_fetch_start: u32,
/// Number of docs range checked in a batch.
///
/// There are two patterns.
/// - We do a full scan. => We can load large chunks. We don't know in advance if seek call
/// will come, so we start with small chunks
/// - We load docs, interspersed with seek calls. When there are big jumps in the seek, we
/// should load small chunks. When the seeks are small, we can employ the same strategy as on a
/// full scan.
fetch_horizon: u32,
/// Current batch of loaded docs.
loaded_docs: VecCursor,
last_seek_pos_opt: Option<u32>,
}
const DEFAULT_FETCH_HORIZON: u32 = 128;
impl<T: MakeZero + Send + PartialOrd + Copy + fmt::Debug> RangeDocSet<T> {
pub(crate) fn new(value_range: RangeInclusive<T>, fast_field: FastFieldCardinality<T>) -> Self {
let mut range_docset = Self {
value_range,
fast_field,
loaded_docs: VecCursor::new(),
next_fetch_start: 0,
fetch_horizon: DEFAULT_FETCH_HORIZON,
last_seek_pos_opt: None,
};
range_docset.reset_fetch_range();
range_docset.fetch_block();
range_docset
}
fn reset_fetch_range(&mut self) {
self.fetch_horizon = DEFAULT_FETCH_HORIZON;
}
/// Returns true if more data could be fetched
fn fetch_block(&mut self) {
const MAX_HORIZON: u32 = 100_000;
while self.loaded_docs.is_empty() {
let finished_to_end = self.fetch_horizon(self.fetch_horizon);
if finished_to_end {
break;
}
// Fetch more data, increase horizon. Horizon only gets reset when doing a seek.
self.fetch_horizon = (self.fetch_horizon * 2).min(MAX_HORIZON);
}
}
/// check if the distance between the seek calls is large
fn is_last_seek_distance_large(&self, new_seek: DocId) -> bool {
if let Some(last_seek_pos) = self.last_seek_pos_opt {
(new_seek - last_seek_pos) >= 128
} else {
true
}
}
/// Fetches a block for docid range [next_fetch_start .. next_fetch_start + HORIZON]
fn fetch_horizon(&mut self, horizon: u32) -> bool {
let mut finished_to_end = false;
let limit = self.fast_field.num_docs();
let mut end = self.next_fetch_start + horizon;
if end >= limit {
end = limit;
finished_to_end = true;
}
match &self.fast_field {
FastFieldCardinality::MultiValue(multi) => {
let last_value = self.loaded_docs.last_value();
multi.get_docids_for_value_range(
self.value_range.clone(),
self.next_fetch_start..end,
self.loaded_docs.get_cleared_data(),
);
// In case of multivalues, we may have an overlap of the same docid between fetching
// blocks
if let Some(last_value) = last_value {
while self.loaded_docs.current() == Some(last_value) {
self.loaded_docs.next();
}
}
}
FastFieldCardinality::SingleValue(single) => {
single.get_docids_for_value_range(
self.value_range.clone(),
self.next_fetch_start..end,
self.loaded_docs.get_cleared_data(),
);
}
}
self.next_fetch_start = end;
finished_to_end
}
}
impl<T: MakeZero + Send + PartialOrd + Copy + fmt::Debug> DocSet for RangeDocSet<T> {
#[inline]
fn advance(&mut self) -> DocId {
if let Some(docid) = self.loaded_docs.next() {
docid
} else {
if self.next_fetch_start >= self.fast_field.num_docs() {
return TERMINATED;
}
self.fetch_block();
self.loaded_docs.current().unwrap_or(TERMINATED)
}
}
#[inline]
fn doc(&self) -> DocId {
self.loaded_docs.current().unwrap_or(TERMINATED)
}
/// Advances the `DocSet` forward until reaching the target, or going to the
/// lowest [`DocId`] greater than the target.
///
/// If the end of the `DocSet` is reached, [`TERMINATED`] is returned.
///
/// Calling `.seek(target)` on a terminated `DocSet` is legal. Implementation
/// of `DocSet` should support it.
///
/// Calling `seek(TERMINATED)` is also legal and is the normal way to consume a `DocSet`.
fn seek(&mut self, target: DocId) -> DocId {
if self.is_last_seek_distance_large(target) {
self.reset_fetch_range();
}
if target > self.next_fetch_start {
self.next_fetch_start = target;
}
let mut doc = self.doc();
debug_assert!(doc <= target);
while doc < target {
doc = self.advance();
}
self.last_seek_pos_opt = Some(target);
doc
}
fn size_hint(&self) -> u32 {
0 // heuristic possible by checking number of hits when fetching a block
}
}
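A standalone sketch of the exponentially growing "fetch horizon" loop used by the `RangeDocSet` above. The fast-field column is modeled here as a plain `Vec<u64>` scanned directly; the real code instead delegates the scan of `[next_fetch_start..end)` to the column's `get_docids_for_value_range`, and `RangeScanner` is not a tantivy type.

```rust
use std::ops::RangeInclusive;

const DEFAULT_FETCH_HORIZON: u32 = 128;
const MAX_HORIZON: u32 = 100_000;

struct RangeScanner {
    values: Vec<u64>, // one value per docid (single-valued column stand-in)
    value_range: RangeInclusive<u64>,
    next_fetch_start: u32,
    fetch_horizon: u32,
    loaded_docs: Vec<u32>,
}

impl RangeScanner {
    fn fetch_block(&mut self) {
        self.loaded_docs.clear();
        while self.loaded_docs.is_empty() {
            let limit = self.values.len() as u32;
            let end = (self.next_fetch_start + self.fetch_horizon).min(limit);
            // Check the docids in [next_fetch_start..end) and keep the matches.
            for doc in self.next_fetch_start..end {
                if self.value_range.contains(&self.values[doc as usize]) {
                    self.loaded_docs.push(doc);
                }
            }
            self.next_fetch_start = end;
            if end == limit {
                break; // reached the end of the column
            }
            // Nothing matched yet: double the horizon before trying again.
            // It only gets reset back to the default on a large seek.
            self.fetch_horizon = (self.fetch_horizon * 2).min(MAX_HORIZON);
        }
    }
}

fn main() {
    let mut scanner = RangeScanner {
        values: (0..10_000).map(|i| i % 7).collect(),
        value_range: 5..=6,
        next_fetch_start: 0,
        fetch_horizon: DEFAULT_FETCH_HORIZON,
        loaded_docs: Vec::new(),
    };
    scanner.fetch_block();
    assert_eq!(scanner.loaded_docs.first(), Some(&5));
}
```

Starting with a small horizon keeps a docset cheap when the caller immediately seeks away; doubling it keeps a full sequential scan from paying per-call overhead on every tiny block.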

View File

@@ -1,8 +0,0 @@
mod fast_field_range_query;
mod range_query;
mod range_query_ip_fastfield;
mod range_query_u64_fastfield;
pub(crate) use range_query::is_type_valid_for_fastfield_range_query;
pub use self::range_query::RangeQuery;

View File

@@ -1,557 +0,0 @@
//! Fastfields support efficient scanning for range queries.
//! We use this variant only if the fastfield exists, otherwise the default in `range_query` is
//! used, which uses the term dictionary + postings.
use std::ops::{Bound, RangeInclusive};
use fastfield_codecs::MonotonicallyMappableToU64;
use super::fast_field_range_query::{FastFieldCardinality, RangeDocSet};
use super::range_query::map_bound;
use crate::query::{ConstScorer, Explanation, Scorer, Weight};
use crate::schema::{Cardinality, Field};
use crate::{DocId, DocSet, Score, SegmentReader, TantivyError};
/// `FastFieldRangeWeight` uses the fast field to execute range queries.
pub struct FastFieldRangeWeight {
field: Field,
left_bound: Bound<u64>,
right_bound: Bound<u64>,
}
impl FastFieldRangeWeight {
pub fn new(field: Field, left_bound: Bound<u64>, right_bound: Bound<u64>) -> Self {
let left_bound = map_bound(&left_bound, &|val| *val);
let right_bound = map_bound(&right_bound, &|val| *val);
Self {
field,
left_bound,
right_bound,
}
}
}
impl Weight for FastFieldRangeWeight {
fn scorer(&self, reader: &SegmentReader, boost: Score) -> crate::Result<Box<dyn Scorer>> {
let field_type = reader.schema().get_field_entry(self.field).field_type();
match field_type.fastfield_cardinality().unwrap() {
Cardinality::SingleValue => {
let fast_field = reader.fast_fields().u64_lenient(self.field)?;
let value_range = bound_to_value_range(
&self.left_bound,
&self.right_bound,
fast_field.min_value(),
fast_field.max_value(),
);
let docset =
RangeDocSet::new(value_range, FastFieldCardinality::SingleValue(fast_field));
Ok(Box::new(ConstScorer::new(docset, boost)))
}
Cardinality::MultiValues => {
let fast_field = reader.fast_fields().u64s_lenient(self.field)?;
let value_range = bound_to_value_range(
&self.left_bound,
&self.right_bound,
fast_field.min_value(),
fast_field.max_value(),
);
let docset =
RangeDocSet::new(value_range, FastFieldCardinality::MultiValue(fast_field));
Ok(Box::new(ConstScorer::new(docset, boost)))
}
}
}
fn explain(&self, reader: &SegmentReader, doc: DocId) -> crate::Result<Explanation> {
let mut scorer = self.scorer(reader, 1.0)?;
if scorer.seek(doc) != doc {
return Err(TantivyError::InvalidArgument(format!(
"Document #({}) does not match",
doc
)));
}
let explanation = Explanation::new("Const", scorer.score());
Ok(explanation)
}
}
fn bound_to_value_range<T: MonotonicallyMappableToU64>(
left_bound: &Bound<T>,
right_bound: &Bound<T>,
min_value: T,
max_value: T,
) -> RangeInclusive<T> {
let start_value = match left_bound {
Bound::Included(val) => *val,
Bound::Excluded(val) => T::from_u64(val.to_u64() + 1),
Bound::Unbounded => min_value,
};
let end_value = match right_bound {
Bound::Included(val) => *val,
Bound::Excluded(val) => T::from_u64(val.to_u64() - 1),
Bound::Unbounded => max_value,
};
start_value..=end_value
}
#[cfg(test)]
mod tests {
use proptest::prelude::ProptestConfig;
use proptest::strategy::Strategy;
use proptest::{prop_oneof, proptest};
use rand::rngs::StdRng;
use rand::seq::SliceRandom;
use rand::SeedableRng;
use super::*;
use crate::collector::Count;
use crate::query::QueryParser;
use crate::schema::{NumericOptions, Schema, FAST, INDEXED, STORED, STRING};
use crate::Index;
#[derive(Clone, Debug)]
pub struct Doc {
pub id_name: String,
pub id: u64,
}
fn operation_strategy() -> impl Strategy<Value = Doc> {
prop_oneof![
(0u64..10_000u64).prop_map(doc_from_id_1),
(1u64..10_000u64).prop_map(doc_from_id_2),
]
}
pub fn doc_from_id_1(id: u64) -> Doc {
let id = id * 1000;
Doc {
id_name: id.to_string(),
id,
}
}
fn doc_from_id_2(id: u64) -> Doc {
let id = id * 1000;
Doc {
id_name: (id - 1).to_string(),
id,
}
}
proptest! {
#![proptest_config(ProptestConfig::with_cases(10))]
#[test]
fn test_range_for_docs_prop(ops in proptest::collection::vec(operation_strategy(), 1..1000)) {
assert!(test_id_range_for_docs(ops).is_ok());
}
}
#[test]
fn range_regression1_test() {
let ops = vec![doc_from_id_1(0)];
assert!(test_id_range_for_docs(ops).is_ok());
}
#[test]
fn range_regression2_test() {
let ops = vec![
doc_from_id_1(52),
doc_from_id_1(63),
doc_from_id_1(12),
doc_from_id_2(91),
doc_from_id_2(33),
];
assert!(test_id_range_for_docs(ops).is_ok());
}
#[test]
fn range_regression3_test() {
let ops = vec![doc_from_id_1(1), doc_from_id_1(2), doc_from_id_1(3)];
assert!(test_id_range_for_docs(ops).is_ok());
}
#[test]
fn range_regression4_test() {
let ops = vec![doc_from_id_2(100)];
assert!(test_id_range_for_docs(ops).is_ok());
}
pub fn create_index_from_docs(docs: &[Doc]) -> Index {
let mut schema_builder = Schema::builder();
let id_u64_field = schema_builder.add_u64_field("id", INDEXED | STORED | FAST);
let ids_u64_field = schema_builder.add_u64_field(
"ids",
NumericOptions::default()
.set_fast(Cardinality::MultiValues)
.set_indexed(),
);
let id_f64_field = schema_builder.add_f64_field("id_f64", INDEXED | STORED | FAST);
let ids_f64_field = schema_builder.add_f64_field(
"ids_f64",
NumericOptions::default()
.set_fast(Cardinality::MultiValues)
.set_indexed(),
);
let id_i64_field = schema_builder.add_i64_field("id_i64", INDEXED | STORED | FAST);
let ids_i64_field = schema_builder.add_i64_field(
"ids_i64",
NumericOptions::default()
.set_fast(Cardinality::MultiValues)
.set_indexed(),
);
let text_field = schema_builder.add_text_field("id_name", STRING | STORED);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
{
let mut index_writer = index.writer(3_000_000).unwrap();
for doc in docs.iter() {
index_writer
.add_document(doc!(
ids_i64_field => doc.id as i64,
ids_i64_field => doc.id as i64,
ids_f64_field => doc.id as f64,
ids_f64_field => doc.id as f64,
ids_u64_field => doc.id,
ids_u64_field => doc.id,
id_u64_field => doc.id,
id_f64_field => doc.id as f64,
id_i64_field => doc.id as i64,
text_field => doc.id_name.to_string(),
))
.unwrap();
}
index_writer.commit().unwrap();
}
index
}
fn test_id_range_for_docs(docs: Vec<Doc>) -> crate::Result<()> {
let index = create_index_from_docs(&docs);
let reader = index.reader().unwrap();
let searcher = reader.searcher();
let mut rng: StdRng = StdRng::from_seed([1u8; 32]);
let get_num_hits = |query| searcher.search(&query, &(Count)).unwrap();
let query_from_text = |text: &str| {
QueryParser::for_index(&index, vec![])
.parse_query(text)
.unwrap()
};
let gen_query_inclusive = |field: &str, from: u64, to: u64| {
format!("{}:[{} TO {}]", field, &from.to_string(), &to.to_string())
};
let test_sample = |sample_docs: Vec<Doc>| {
let mut ids: Vec<u64> = sample_docs.iter().map(|doc| doc.id).collect();
ids.sort();
let expected_num_hits = docs
.iter()
.filter(|doc| (ids[0]..=ids[1]).contains(&doc.id))
.count();
let query = gen_query_inclusive("id", ids[0], ids[1]);
assert_eq!(get_num_hits(query_from_text(&query)), expected_num_hits);
let query = gen_query_inclusive("ids", ids[0], ids[1]);
assert_eq!(get_num_hits(query_from_text(&query)), expected_num_hits);
// Intersection search
let id_filter = sample_docs[0].id_name.to_string();
let expected_num_hits = docs
.iter()
.filter(|doc| (ids[0]..=ids[1]).contains(&doc.id) && doc.id_name == id_filter)
.count();
let query = format!(
"{} AND id_name:{}",
gen_query_inclusive("id", ids[0], ids[1]),
&id_filter
);
assert_eq!(get_num_hits(query_from_text(&query)), expected_num_hits);
let query = format!(
"{} AND id_name:{}",
gen_query_inclusive("id_f64", ids[0], ids[1]),
&id_filter
);
assert_eq!(get_num_hits(query_from_text(&query)), expected_num_hits);
let query = format!(
"{} AND id_name:{}",
gen_query_inclusive("id_i64", ids[0], ids[1]),
&id_filter
);
assert_eq!(get_num_hits(query_from_text(&query)), expected_num_hits);
// Intersection search on multivalue id field
let id_filter = sample_docs[0].id_name.to_string();
let query = format!(
"{} AND id_name:{}",
gen_query_inclusive("ids", ids[0], ids[1]),
&id_filter
);
assert_eq!(get_num_hits(query_from_text(&query)), expected_num_hits);
let query = format!(
"{} AND id_name:{}",
gen_query_inclusive("ids_f64", ids[0], ids[1]),
&id_filter
);
assert_eq!(get_num_hits(query_from_text(&query)), expected_num_hits);
let query = format!(
"{} AND id_name:{}",
gen_query_inclusive("ids_i64", ids[0], ids[1]),
&id_filter
);
assert_eq!(get_num_hits(query_from_text(&query)), expected_num_hits);
};
test_sample(vec![docs[0].clone(), docs[0].clone()]);
let samples: Vec<_> = docs.choose_multiple(&mut rng, 3).collect();
if samples.len() > 1 {
test_sample(vec![samples[0].clone(), samples[1].clone()]);
test_sample(vec![samples[1].clone(), samples[1].clone()]);
}
if samples.len() > 2 {
test_sample(vec![samples[1].clone(), samples[2].clone()]);
}
Ok(())
}
}
#[cfg(all(test, feature = "unstable"))]
mod bench {
use rand::rngs::StdRng;
use rand::{Rng, SeedableRng};
use test::Bencher;
use super::tests::*;
use super::*;
use crate::collector::Count;
use crate::query::QueryParser;
use crate::Index;
fn get_index_0_to_100() -> Index {
let mut rng = StdRng::from_seed([1u8; 32]);
let num_vals = 100_000;
let docs: Vec<_> = (0..num_vals)
.map(|_i| {
let id_name = if rng.gen_bool(0.01) {
"veryfew".to_string() // 1%
} else if rng.gen_bool(0.1) {
"few".to_string() // 9%
} else {
"many".to_string() // 90%
};
Doc {
id_name,
id: rng.gen_range(0..100),
}
})
.collect();
create_index_from_docs(&docs)
}
fn get_90_percent() -> RangeInclusive<u64> {
0..=90
}
fn get_10_percent() -> RangeInclusive<u64> {
0..=10
}
fn get_1_percent() -> RangeInclusive<u64> {
10..=10
}
fn excute_query(
field: &str,
id_range: RangeInclusive<u64>,
suffix: &str,
index: &Index,
) -> usize {
let gen_query_inclusive = |from: &u64, to: &u64| {
format!(
"{}:[{} TO {}] {}",
field,
&from.to_string(),
&to.to_string(),
suffix
)
};
let query = gen_query_inclusive(id_range.start(), id_range.end());
let query_from_text = |text: &str| {
QueryParser::for_index(index, vec![])
.parse_query(text)
.unwrap()
};
let query = query_from_text(&query);
let reader = index.reader().unwrap();
let searcher = reader.searcher();
searcher.search(&query, &(Count)).unwrap()
}
#[bench]
fn bench_id_range_hit_90_percent(bench: &mut Bencher) {
let index = get_index_0_to_100();
bench.iter(|| excute_query("id", get_90_percent(), "", &index));
}
#[bench]
fn bench_id_range_hit_10_percent(bench: &mut Bencher) {
let index = get_index_0_to_100();
bench.iter(|| excute_query("id", get_10_percent(), "", &index));
}
#[bench]
fn bench_id_range_hit_1_percent(bench: &mut Bencher) {
let index = get_index_0_to_100();
bench.iter(|| excute_query("id", get_1_percent(), "", &index));
}
#[bench]
fn bench_id_range_hit_10_percent_intersect_with_10_percent(bench: &mut Bencher) {
let index = get_index_0_to_100();
bench.iter(|| excute_query("id", get_10_percent(), "AND id_name:few", &index));
}
#[bench]
fn bench_id_range_hit_1_percent_intersect_with_10_percent(bench: &mut Bencher) {
let index = get_index_0_to_100();
bench.iter(|| excute_query("id", get_1_percent(), "AND id_name:few", &index));
}
#[bench]
fn bench_id_range_hit_1_percent_intersect_with_90_percent(bench: &mut Bencher) {
let index = get_index_0_to_100();
bench.iter(|| excute_query("id", get_1_percent(), "AND id_name:many", &index));
}
#[bench]
fn bench_id_range_hit_1_percent_intersect_with_1_percent(bench: &mut Bencher) {
let index = get_index_0_to_100();
bench.iter(|| excute_query("id", get_1_percent(), "AND id_name:veryfew", &index));
}
#[bench]
fn bench_id_range_hit_10_percent_intersect_with_90_percent(bench: &mut Bencher) {
let index = get_index_0_to_100();
bench.iter(|| excute_query("id", get_10_percent(), "AND id_name:many", &index));
}
#[bench]
fn bench_id_range_hit_90_percent_intersect_with_90_percent(bench: &mut Bencher) {
let index = get_index_0_to_100();
bench.iter(|| excute_query("id", get_90_percent(), "AND id_name:many", &index));
}
#[bench]
fn bench_id_range_hit_90_percent_intersect_with_10_percent(bench: &mut Bencher) {
let index = get_index_0_to_100();
bench.iter(|| excute_query("id", get_90_percent(), "AND id_name:few", &index));
}
#[bench]
fn bench_id_range_hit_90_percent_intersect_with_1_percent(bench: &mut Bencher) {
let index = get_index_0_to_100();
bench.iter(|| excute_query("id", get_90_percent(), "AND id_name:veryfew", &index));
}
#[bench]
fn bench_id_range_hit_90_percent_multi(bench: &mut Bencher) {
let index = get_index_0_to_100();
bench.iter(|| excute_query("ids", get_90_percent(), "", &index));
}
#[bench]
fn bench_id_range_hit_10_percent_multi(bench: &mut Bencher) {
let index = get_index_0_to_100();
bench.iter(|| excute_query("ids", get_10_percent(), "", &index));
}
#[bench]
fn bench_id_range_hit_1_percent_multi(bench: &mut Bencher) {
let index = get_index_0_to_100();
bench.iter(|| excute_query("ids", get_1_percent(), "", &index));
}
#[bench]
fn bench_id_range_hit_10_percent_intersect_with_10_percent_multi(bench: &mut Bencher) {
let index = get_index_0_to_100();
bench.iter(|| excute_query("ids", get_10_percent(), "AND id_name:few", &index));
}
#[bench]
fn bench_id_range_hit_1_percent_intersect_with_10_percent_multi(bench: &mut Bencher) {
let index = get_index_0_to_100();
bench.iter(|| excute_query("ids", get_1_percent(), "AND id_name:few", &index));
}
#[bench]
fn bench_id_range_hit_1_percent_intersect_with_90_percent_multi(bench: &mut Bencher) {
let index = get_index_0_to_100();
bench.iter(|| excute_query("ids", get_1_percent(), "AND id_name:many", &index));
}
#[bench]
fn bench_id_range_hit_1_percent_intersect_with_1_percent_multi(bench: &mut Bencher) {
let index = get_index_0_to_100();
bench.iter(|| excute_query("ids", get_1_percent(), "AND id_name:veryfew", &index));
}
#[bench]
fn bench_id_range_hit_10_percent_intersect_with_90_percent_multi(bench: &mut Bencher) {
let index = get_index_0_to_100();
bench.iter(|| excute_query("ids", get_10_percent(), "AND id_name:many", &index));
}
#[bench]
fn bench_id_range_hit_90_percent_intersect_with_90_percent_multi(bench: &mut Bencher) {
let index = get_index_0_to_100();
bench.iter(|| excute_query("ids", get_90_percent(), "AND id_name:many", &index));
}
#[bench]
fn bench_id_range_hit_90_percent_intersect_with_10_percent_multi(bench: &mut Bencher) {
let index = get_index_0_to_100();
bench.iter(|| excute_query("ids", get_90_percent(), "AND id_name:few", &index));
}
#[bench]
fn bench_id_range_hit_90_percent_intersect_with_1_percent_multi(bench: &mut Bencher) {
let index = get_index_0_to_100();
bench.iter(|| excute_query("ids", get_90_percent(), "AND id_name:veryfew", &index));
}
}
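A standalone sketch of the `bound_to_value_range` helper from the deleted file: it turns a pair of `Bound<u64>` into the inclusive value range the docset scans, falling back to the column's min/max for unbounded ends. As in the original, exclusive bounds are simply shifted by one, so degenerate edge cases (e.g. an exclusive bound at 0) are not handled here.

```rust
use std::ops::{Bound, RangeInclusive};

fn bound_to_value_range(
    left: &Bound<u64>,
    right: &Bound<u64>,
    min_value: u64,
    max_value: u64,
) -> RangeInclusive<u64> {
    let start = match left {
        Bound::Included(v) => *v,
        Bound::Excluded(v) => v + 1,
        Bound::Unbounded => min_value,
    };
    let end = match right {
        Bound::Included(v) => *v,
        Bound::Excluded(v) => v - 1,
        Bound::Unbounded => max_value,
    };
    start..=end
}

fn main() {
    // A `id:[7 TO 77]` style query over a column whose values span 0..=1000.
    assert_eq!(
        bound_to_value_range(&Bound::Included(7), &Bound::Included(77), 0, 1000),
        7..=77
    );
    // An exclusive upper bound is shifted down by one.
    assert_eq!(
        bound_to_value_range(&Bound::Unbounded, &Bound::Excluded(10), 0, 1000),
        0..=9
    );
}
```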

View File

@@ -4,15 +4,16 @@
use std::net::Ipv6Addr; use std::net::Ipv6Addr;
use std::ops::{Bound, RangeInclusive}; use std::ops::{Bound, RangeInclusive};
use std::sync::Arc;
use common::BinarySerializable; use common::BinarySerializable;
use fastfield_codecs::MonotonicallyMappableToU128; use fastfield_codecs::{Column, MonotonicallyMappableToU128};
use super::fast_field_range_query::{FastFieldCardinality, RangeDocSet};
use super::range_query::map_bound; use super::range_query::map_bound;
use crate::query::{ConstScorer, Explanation, Scorer, Weight}; use super::{ConstScorer, Explanation, Scorer, Weight};
use crate::fastfield::MultiValuedU128FastFieldReader;
use crate::schema::{Cardinality, Field}; use crate::schema::{Cardinality, Field};
use crate::{DocId, DocSet, Score, SegmentReader, TantivyError}; use crate::{DocId, DocSet, Score, SegmentReader, TantivyError, TERMINATED};
/// `IPFastFieldRangeWeight` uses the ip address fast field to execute range queries. /// `IPFastFieldRangeWeight` uses the ip address fast field to execute range queries.
pub struct IPFastFieldRangeWeight { pub struct IPFastFieldRangeWeight {
@@ -23,13 +24,13 @@ pub struct IPFastFieldRangeWeight {
impl IPFastFieldRangeWeight { impl IPFastFieldRangeWeight {
pub fn new(field: Field, left_bound: &Bound<Vec<u8>>, right_bound: &Bound<Vec<u8>>) -> Self { pub fn new(field: Field, left_bound: &Bound<Vec<u8>>, right_bound: &Bound<Vec<u8>>) -> Self {
let parse_ip_from_bytes = |data: &Vec<u8>| { let ip_from_bound_raw_data = |data: &Vec<u8>| {
let ip_u128: u128 = let left_ip_u128: u128 =
u128::from_be(BinarySerializable::deserialize(&mut &data[..]).unwrap()); u128::from_be(BinarySerializable::deserialize(&mut &data[..]).unwrap());
Ipv6Addr::from_u128(ip_u128) Ipv6Addr::from_u128(left_ip_u128)
}; };
let left_bound = map_bound(left_bound, &parse_ip_from_bytes); let left_bound = map_bound(left_bound, &ip_from_bound_raw_data);
let right_bound = map_bound(right_bound, &parse_ip_from_bytes); let right_bound = map_bound(right_bound, &ip_from_bound_raw_data);
Self { Self {
field, field,
left_bound, left_bound,
@@ -50,9 +51,9 @@ impl Weight for IPFastFieldRangeWeight {
ip_addr_fast_field.min_value(), ip_addr_fast_field.min_value(),
ip_addr_fast_field.max_value(), ip_addr_fast_field.max_value(),
); );
let docset = RangeDocSet::new( let docset = IpRangeDocSet::new(
value_range, value_range,
FastFieldCardinality::SingleValue(ip_addr_fast_field), IpFastFieldCardinality::SingleValue(ip_addr_fast_field),
); );
Ok(Box::new(ConstScorer::new(docset, boost))) Ok(Box::new(ConstScorer::new(docset, boost)))
} }
@@ -64,9 +65,9 @@ impl Weight for IPFastFieldRangeWeight {
ip_addr_fast_field.min_value(), ip_addr_fast_field.min_value(),
ip_addr_fast_field.max_value(), ip_addr_fast_field.max_value(),
); );
let docset = RangeDocSet::new( let docset = IpRangeDocSet::new(
value_range, value_range,
FastFieldCardinality::MultiValue(ip_addr_fast_field), IpFastFieldCardinality::MultiValue(ip_addr_fast_field),
); );
Ok(Box::new(ConstScorer::new(docset, boost))) Ok(Box::new(ConstScorer::new(docset, boost)))
} }
@@ -107,6 +108,211 @@ fn bound_to_value_range(
start_value..=end_value start_value..=end_value
} }
/// Helper to have a cursor over a vec of docids
struct VecCursor {
docs: Vec<u32>,
current_pos: usize,
}
impl VecCursor {
fn new() -> Self {
Self {
docs: Vec::with_capacity(32),
current_pos: 0,
}
}
fn next(&mut self) -> Option<u32> {
self.current_pos += 1;
self.current()
}
#[inline]
fn current(&self) -> Option<u32> {
self.docs.get(self.current_pos).copied()
}
fn get_cleared_data(&mut self) -> &mut Vec<u32> {
self.docs.clear();
self.current_pos = 0;
&mut self.docs
}
fn last_value(&self) -> Option<u32> {
self.docs.iter().last().cloned()
}
fn is_empty(&self) -> bool {
self.current_pos >= self.docs.len()
}
}
pub(crate) enum IpFastFieldCardinality {
SingleValue(Arc<dyn Column<Ipv6Addr>>),
MultiValue(MultiValuedU128FastFieldReader<Ipv6Addr>),
}
impl IpFastFieldCardinality {
fn num_docs(&self) -> u32 {
match self {
IpFastFieldCardinality::SingleValue(single_value) => single_value.num_vals(),
IpFastFieldCardinality::MultiValue(multi_value) => {
multi_value.get_index_reader().num_docs()
}
}
}
}
struct IpRangeDocSet {
/// The range filter on the values.
value_range: RangeInclusive<Ipv6Addr>,
ip_addr_fast_field: IpFastFieldCardinality,
/// The next docid start range to fetch (inclusive).
next_fetch_start: u32,
/// Number of docs range checked in a batch.
///
/// There are two patterns.
/// - We do a full scan. => We can load large chunks. We don't know in advance if seek call
/// will come, so we start with small chunks
/// - We load docs, interspersed with seek calls. When there are big jumps in the seek, we
/// should load small chunks. When the seeks are small, we can employ the same strategy as on a
/// full scan.
fetch_horizon: u32,
/// Current batch of loaded docs.
loaded_docs: VecCursor,
last_seek_pos_opt: Option<u32>,
}
const DEFAULT_FETCH_HORIZON: u32 = 128;
impl IpRangeDocSet {
fn new(
value_range: RangeInclusive<Ipv6Addr>,
ip_addr_fast_field: IpFastFieldCardinality,
) -> Self {
let mut ip_range_docset = Self {
value_range,
ip_addr_fast_field,
loaded_docs: VecCursor::new(),
next_fetch_start: 0,
fetch_horizon: DEFAULT_FETCH_HORIZON,
last_seek_pos_opt: None,
};
ip_range_docset.reset_fetch_range();
ip_range_docset.fetch_block();
ip_range_docset
}
fn reset_fetch_range(&mut self) {
self.fetch_horizon = DEFAULT_FETCH_HORIZON;
}
/// Returns true if more data could be fetched
fn fetch_block(&mut self) {
const MAX_HORIZON: u32 = 100_000;
while self.loaded_docs.is_empty() {
let finished_to_end = self.fetch_horizon(self.fetch_horizon);
if finished_to_end {
break;
}
// Fetch more data, increase horizon. Horizon only gets reset when doing a seek.
self.fetch_horizon = (self.fetch_horizon * 2).min(MAX_HORIZON);
}
}
/// check if the distance between the seek calls is large
fn is_last_seek_distance_large(&self, new_seek: DocId) -> bool {
if let Some(last_seek_pos) = self.last_seek_pos_opt {
(new_seek - last_seek_pos) >= 128
} else {
true
}
}
/// Fetches a block for docid range [next_fetch_start .. next_fetch_start + HORIZON]
fn fetch_horizon(&mut self, horizon: u32) -> bool {
let mut finished_to_end = false;
let limit = self.ip_addr_fast_field.num_docs();
let mut end = self.next_fetch_start + horizon;
if end >= limit {
end = limit;
finished_to_end = true;
}
match &self.ip_addr_fast_field {
IpFastFieldCardinality::MultiValue(multi) => {
let last_value = self.loaded_docs.last_value();
multi.get_docids_for_value_range(
self.value_range.clone(),
self.next_fetch_start..end,
self.loaded_docs.get_cleared_data(),
);
// In case of multivalues, we may have an overlap of the same docid between fetching
// blocks
if let Some(last_value) = last_value {
while self.loaded_docs.current() == Some(last_value) {
self.loaded_docs.next();
}
}
}
IpFastFieldCardinality::SingleValue(single) => {
single.get_docids_for_value_range(
self.value_range.clone(),
self.next_fetch_start..end,
self.loaded_docs.get_cleared_data(),
);
}
}
self.next_fetch_start = end;
finished_to_end
}
}
impl DocSet for IpRangeDocSet {
#[inline]
fn advance(&mut self) -> DocId {
if let Some(docid) = self.loaded_docs.next() {
docid
} else {
if self.next_fetch_start >= self.ip_addr_fast_field.num_docs() {
return TERMINATED;
}
self.fetch_block();
self.loaded_docs.current().unwrap_or(TERMINATED)
}
}
#[inline]
fn doc(&self) -> DocId {
self.loaded_docs.current().unwrap_or(TERMINATED)
}
/// Advances the `DocSet` forward until reaching the target, or going to the
/// lowest [`DocId`] greater than the target.
///
/// If the end of the `DocSet` is reached, [`TERMINATED`] is returned.
///
/// Calling `.seek(target)` on a terminated `DocSet` is legal. Implementation
/// of `DocSet` should support it.
///
/// Calling `seek(TERMINATED)` is also legal and is the normal way to consume a `DocSet`.
fn seek(&mut self, target: DocId) -> DocId {
if self.is_last_seek_distance_large(target) {
self.reset_fetch_range();
}
if target > self.next_fetch_start {
self.next_fetch_start = target;
}
let mut doc = self.doc();
debug_assert!(doc <= target);
while doc < target {
doc = self.advance();
}
self.last_seek_pos_opt = Some(target);
doc
}
fn size_hint(&self) -> u32 {
0 // heuristic possible by checking number of hits when fetching a block
}
}
#[cfg(test)] #[cfg(test)]
mod tests { mod tests {
use proptest::prelude::ProptestConfig; use proptest::prelude::ProptestConfig;
@@ -195,7 +401,7 @@ mod tests {
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
{ {
let mut index_writer = index.writer(10_000_000).unwrap(); let mut index_writer = index.writer(3_000_000).unwrap();
for doc in docs.iter() { for doc in docs.iter() {
index_writer index_writer
.add_document(doc!( .add_document(doc!(
@@ -313,7 +519,8 @@ mod bench {
}) })
.collect(); .collect();
create_index_from_docs(&docs) let index = create_index_from_docs(&docs);
index
} }
fn get_90_percent() -> RangeInclusive<Ipv6Addr> { fn get_90_percent() -> RangeInclusive<Ipv6Addr> {
@@ -352,7 +559,7 @@ mod bench {
let query = gen_query_inclusive(ip_range.start(), ip_range.end()); let query = gen_query_inclusive(ip_range.start(), ip_range.end());
let query_from_text = |text: &str| { let query_from_text = |text: &str| {
QueryParser::for_index(index, vec![]) QueryParser::for_index(&index, vec![])
.parse_query(text) .parse_query(text)
.unwrap() .unwrap()
}; };
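A minimal sketch of the bound conversion at the top of this hunk: the term stores the address as 16 raw bytes, which are read back as a `u128` and turned into an `Ipv6Addr`. Plain `from_be_bytes` stands in for tantivy's `BinarySerializable`/`MonotonicallyMappableToU128` machinery, and the big-endian assumption mirrors the `u128::from_be` call shown above rather than being asserted about the on-disk format.

```rust
use std::net::Ipv6Addr;

fn ip_from_term_bytes(data: &[u8]) -> Ipv6Addr {
    // Assumes a 16-byte big-endian payload, as suggested by `from_be` above.
    let raw: [u8; 16] = data.try_into().expect("ip term should be 16 bytes");
    Ipv6Addr::from(u128::from_be_bytes(raw))
}

fn main() {
    let ip: Ipv6Addr = "::ffff:127.0.0.1".parse().unwrap();
    let bytes = u128::from(ip).to_be_bytes();
    assert_eq!(ip_from_term_bytes(&bytes), ip);
}
```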

View File

@@ -158,8 +158,7 @@ mod tests {
let term_a = Term::from_field_text(text_field, "a"); let term_a = Term::from_field_text(text_field, "a");
let term_query = TermQuery::new(term_a, IndexRecordOption::Basic); let term_query = TermQuery::new(term_a, IndexRecordOption::Basic);
let searcher = index.reader()?.searcher(); let searcher = index.reader()?.searcher();
let term_weight = let term_weight = term_query.weight(EnableScoring::Disabled(searcher.schema()))?;
term_query.weight(EnableScoring::disabled_from_schema(searcher.schema()))?;
let mut term_scorer = term_weight.scorer(searcher.segment_reader(0u32), 1.0)?; let mut term_scorer = term_weight.scorer(searcher.segment_reader(0u32), 1.0)?;
assert_eq!(term_scorer.doc(), 0u32); assert_eq!(term_scorer.doc(), 0u32);
term_scorer.seek(1u32); term_scorer.seek(1u32);

View File

@@ -99,7 +99,7 @@ impl TermQuery {
EnableScoring::Enabled(searcher) => { EnableScoring::Enabled(searcher) => {
Bm25Weight::for_terms(searcher, &[self.term.clone()])? Bm25Weight::for_terms(searcher, &[self.term.clone()])?
} }
EnableScoring::Disabled { .. } => { EnableScoring::Disabled(_schema) => {
Bm25Weight::new(Explanation::new("<no score>".to_string(), 1.0f32), 1.0f32) Bm25Weight::new(Explanation::new("<no score>".to_string(), 1.0f32), 1.0f32)
} }
}; };

View File

@@ -1,8 +1,6 @@
use std::net::IpAddr; use std::net::IpAddr;
use std::str::FromStr; use std::str::FromStr;
use base64::engine::general_purpose::STANDARD as BASE64;
use base64::Engine;
use serde::{Deserialize, Serialize}; use serde::{Deserialize, Serialize};
use serde_json::Value as JsonValue; use serde_json::Value as JsonValue;
use thiserror::Error; use thiserror::Error;
@@ -360,8 +358,7 @@ impl FieldType {
json: JsonValue::String(field_text), json: JsonValue::String(field_text),
}), }),
FieldType::Facet(_) => Ok(Value::Facet(Facet::from(&field_text))), FieldType::Facet(_) => Ok(Value::Facet(Facet::from(&field_text))),
FieldType::Bytes(_) => BASE64 FieldType::Bytes(_) => base64::decode(&field_text)
.decode(&field_text)
.map(Value::Bytes) .map(Value::Bytes)
.map_err(|_| ValueParsingError::InvalidBase64 { base64: field_text }), .map_err(|_| ValueParsingError::InvalidBase64 { base64: field_text }),
FieldType::JsonObject(_) => Err(ValueParsingError::TypeError { FieldType::JsonObject(_) => Err(ValueParsingError::TypeError {
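A side-by-side sketch of the two base64 call styles visible across this diff: the `Engine`-based form matching the `STANDARD`/`Engine` imports on one side, and the older top-level free functions called on the other. This assumes a `base64` version exposing the `Engine` API (as one side's Cargo.toml pins); the exact versions in which the free functions were deprecated or removed are not asserted here.

```rust
use base64::engine::general_purpose::STANDARD;
use base64::Engine;

fn main() {
    let bytes = b"hello fast fields";

    // Engine-based API, as imported at the top of the affected files.
    let encoded = STANDARD.encode(bytes);
    let decoded = STANDARD.decode(&encoded).expect("valid base64");
    assert_eq!(decoded, bytes.to_vec());

    // Older free-function style, as called on the other side of the diff:
    // let encoded = base64::encode(bytes);
    // let decoded = base64::decode(&encoded).expect("valid base64");
}
```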

View File

@@ -258,7 +258,7 @@ mod tests {
let field = schema.get_field("body").unwrap(); let field = schema.get_field("body").unwrap();
let field_entry = schema.get_field_entry(field); let field_entry = schema.get_field_entry(field);
assert!(matches!(field_entry.field_type(), assert!(matches!(field_entry.field_type(),
FieldType::Str(text_options) &FieldType::Str(ref text_options)
if text_options.get_indexing_options().unwrap().tokenizer() == "default")); if text_options.get_indexing_options().unwrap().tokenizer() == "default"));
} }

View File

@@ -1,8 +1,6 @@
use std::fmt; use std::fmt;
use std::net::Ipv6Addr; use std::net::Ipv6Addr;
use base64::engine::general_purpose::STANDARD as BASE64;
use base64::Engine;
use serde::de::Visitor; use serde::de::Visitor;
use serde::{Deserialize, Deserializer, Serialize, Serializer}; use serde::{Deserialize, Deserializer, Serialize, Serializer};
use serde_json::Map; use serde_json::Map;
@@ -53,7 +51,7 @@ impl Serialize for Value {
Value::Bool(b) => serializer.serialize_bool(b), Value::Bool(b) => serializer.serialize_bool(b),
Value::Date(ref date) => time::serde::rfc3339::serialize(&date.into_utc(), serializer), Value::Date(ref date) => time::serde::rfc3339::serialize(&date.into_utc(), serializer),
Value::Facet(ref facet) => facet.serialize(serializer), Value::Facet(ref facet) => facet.serialize(serializer),
Value::Bytes(ref bytes) => serializer.serialize_str(&BASE64.encode(bytes)), Value::Bytes(ref bytes) => serializer.serialize_str(&base64::encode(bytes)),
Value::JsonObject(ref obj) => obj.serialize(serializer), Value::JsonObject(ref obj) => obj.serialize(serializer),
Value::IpAddr(ref obj) => { Value::IpAddr(ref obj) => {
// Ensure IpV4 addresses get serialized as IpV4, but excluding IpV6 loopback. // Ensure IpV4 addresses get serialized as IpV4, but excluding IpV6 loopback.

View File

@@ -124,9 +124,9 @@ impl Snippet {
/// ///
/// Fragments must be valid in the sense that `&text[fragment.start..fragment.stop]`\ /// Fragments must be valid in the sense that `&text[fragment.start..fragment.stop]`\
/// has to be a valid string. /// has to be a valid string.
fn search_fragments( fn search_fragments<'a>(
tokenizer: &TextAnalyzer, tokenizer: &TextAnalyzer,
text: &str, text: &'a str,
terms: &BTreeMap<String, Score>, terms: &BTreeMap<String, Score>,
max_num_chars: usize, max_num_chars: usize,
) -> Vec<FragmentCandidate> { ) -> Vec<FragmentCandidate> {
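A small illustration of why the explicit lifetime in this hunk is removable: the returned `Vec<FragmentCandidate>` owns its data, so nothing in the output borrows from `text` and no named lifetime is required. The types and field names below are stand-ins, not tantivy's; the second function shows the case where a named lifetime does become necessary.

```rust
struct FragmentCandidate {
    start_offset: usize,
    stop_offset: usize,
}

// No `<'a>` needed: the output does not borrow from the input.
fn search_fragments(text: &str, max_num_chars: usize) -> Vec<FragmentCandidate> {
    text.char_indices()
        .take(max_num_chars)
        .map(|(start, c)| FragmentCandidate {
            start_offset: start,
            stop_offset: start + c.len_utf8(),
        })
        .collect()
}

// An explicit lifetime is only required once the output borrows from one of
// several reference inputs, e.g. when returning slices of `text`.
fn fragment_slices<'a>(text: &'a str, candidates: &[FragmentCandidate]) -> Vec<&'a str> {
    candidates
        .iter()
        .map(|c| &text[c.start_offset..c.stop_offset])
        .collect()
}

fn main() {
    let text = "big bad wolf";
    let candidates = search_fragments(text, 3);
    assert_eq!(fragment_slices(text, &candidates), vec!["b", "i", "g"]);
}
```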

View File

@@ -1,6 +1,5 @@
use std::io; use std::io;
use std::iter::Sum; use std::iter::Sum;
use std::num::NonZeroUsize;
use std::ops::{AddAssign, Range}; use std::ops::{AddAssign, Range};
use std::sync::atomic::{AtomicUsize, Ordering}; use std::sync::atomic::{AtomicUsize, Ordering};
use std::sync::{Arc, Mutex}; use std::sync::{Arc, Mutex};
@@ -34,29 +33,23 @@ pub struct StoreReader {
/// The cache for decompressed blocks. /// The cache for decompressed blocks.
struct BlockCache { struct BlockCache {
cache: Option<Mutex<LruCache<usize, Block>>>, cache: Mutex<LruCache<usize, Block>>,
cache_hits: AtomicUsize, cache_hits: Arc<AtomicUsize>,
cache_misses: AtomicUsize, cache_misses: Arc<AtomicUsize>,
} }
impl BlockCache { impl BlockCache {
fn get_from_cache(&self, pos: usize) -> Option<Block> { fn get_from_cache(&self, pos: usize) -> Option<Block> {
if let Some(block) = self if let Some(block) = self.cache.lock().unwrap().get(&pos) {
.cache
.as_ref()
.and_then(|cache| cache.lock().unwrap().get(&pos).cloned())
{
self.cache_hits.fetch_add(1, Ordering::SeqCst); self.cache_hits.fetch_add(1, Ordering::SeqCst);
return Some(block); return Some(block.clone());
} }
self.cache_misses.fetch_add(1, Ordering::SeqCst); self.cache_misses.fetch_add(1, Ordering::SeqCst);
None None
} }
fn put_into_cache(&self, pos: usize, data: Block) { fn put_into_cache(&self, pos: usize, data: Block) {
if let Some(cache) = self.cache.as_ref() { self.cache.lock().unwrap().put(pos, data);
cache.lock().unwrap().put(pos, data);
}
} }
fn stats(&self) -> CacheStats { fn stats(&self) -> CacheStats {
@@ -66,18 +59,13 @@ impl BlockCache {
num_entries: self.len(), num_entries: self.len(),
} }
} }
fn len(&self) -> usize { fn len(&self) -> usize {
self.cache self.cache.lock().unwrap().len()
.as_ref()
.map_or(0, |cache| cache.lock().unwrap().len())
} }
#[cfg(test)] #[cfg(test)]
fn peek_lru(&self) -> Option<usize> { fn peek_lru(&self) -> Option<usize> {
self.cache self.cache.lock().unwrap().peek_lru().map(|(&k, _)| k)
.as_ref()
.and_then(|cache| cache.lock().unwrap().peek_lru().map(|(&k, _)| k))
} }
} }
@@ -125,8 +113,7 @@ impl StoreReader {
decompressor: footer.decompressor, decompressor: footer.decompressor,
data: data_file, data: data_file,
cache: BlockCache { cache: BlockCache {
cache: NonZeroUsize::new(cache_size) cache: Mutex::new(LruCache::new(cache_size)),
.map(|cache_size| Mutex::new(LruCache::new(cache_size))),
cache_hits: Default::default(), cache_hits: Default::default(),
cache_misses: Default::default(), cache_misses: Default::default(),
}, },
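One side of this hunk wraps the block cache in an `Option` and sizes it with `NonZeroUsize` (the `LruCache` constructor in lru 0.9 takes a non-zero capacity), so a configured cache size of 0 simply disables caching; the other side constructs the cache unconditionally from a plain `usize`, as lru 0.7 allowed. A sketch of the optional-cache pattern, with the cached value type simplified:

```rust
use std::num::NonZeroUsize;
use std::sync::Mutex;

use lru::LruCache;

// A cache size of 0 yields None, disabling the cache instead of panicking;
// any other size builds an LruCache with that capacity (lru 0.9 API).
fn build_block_cache(cache_size: usize) -> Option<Mutex<LruCache<usize, Vec<u8>>>> {
    NonZeroUsize::new(cache_size).map(|capacity| Mutex::new(LruCache::new(capacity)))
}
```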

View File

@@ -113,7 +113,7 @@ mod bench {
} }
/// Create a dictionary of random strings. /// Create a dictionary of random strings.
fn rand_dict(num_terms: usize) -> std::io::Result<TermDictionary> { fn rand_dict(num_terms: usize) -> crate::Result<TermDictionary> {
let buffer: Vec<u8> = { let buffer: Vec<u8> = {
let mut terms = vec![]; let mut terms = vec![];
for _i in 0..num_terms { for _i in 0..num_terms {

View File

@@ -99,7 +99,7 @@ fn test_term_dictionary_stream() -> crate::Result<()> {
.collect(); .collect();
let buffer: Vec<u8> = { let buffer: Vec<u8> = {
let mut term_dictionary_builder = TermDictionaryBuilder::create(vec![]).unwrap(); let mut term_dictionary_builder = TermDictionaryBuilder::create(vec![]).unwrap();
for (id, i) in &ids { for &(ref id, ref i) in &ids {
term_dictionary_builder term_dictionary_builder
.insert(id.as_bytes(), &make_term_info(*i as u64)) .insert(id.as_bytes(), &make_term_info(*i as u64))
.unwrap(); .unwrap();
@@ -112,14 +112,14 @@ fn test_term_dictionary_stream() -> crate::Result<()> {
let mut streamer = term_dictionary.stream()?; let mut streamer = term_dictionary.stream()?;
let mut i = 0; let mut i = 0;
while let Some((streamer_k, streamer_v)) = streamer.next() { while let Some((streamer_k, streamer_v)) = streamer.next() {
let (key, v) = &ids[i]; let &(ref key, ref v) = &ids[i];
assert_eq!(streamer_k, key.as_bytes()); assert_eq!(streamer_k, key.as_bytes());
assert_eq!(streamer_v, &make_term_info(*v as u64)); assert_eq!(streamer_v, &make_term_info(*v as u64));
i += 1; i += 1;
} }
} }
let (key, val) = &ids[2047]; let &(ref key, ref val) = &ids[2047];
assert_eq!( assert_eq!(
term_dictionary.get(key.as_bytes())?, term_dictionary.get(key.as_bytes())?,
Some(make_term_info(*val as u64)) Some(make_term_info(*val as u64))
@@ -160,7 +160,7 @@ fn test_stream_range() -> crate::Result<()> {
.collect(); .collect();
let buffer: Vec<u8> = { let buffer: Vec<u8> = {
let mut term_dictionary_builder = TermDictionaryBuilder::create(vec![]).unwrap(); let mut term_dictionary_builder = TermDictionaryBuilder::create(vec![]).unwrap();
for (id, i) in &ids { for &(ref id, ref i) in &ids {
term_dictionary_builder term_dictionary_builder
.insert(id.as_bytes(), &make_term_info(*i as u64)) .insert(id.as_bytes(), &make_term_info(*i as u64))
.unwrap(); .unwrap();
@@ -173,14 +173,14 @@ fn test_stream_range() -> crate::Result<()> {
let term_dictionary: TermDictionary = TermDictionary::open(file)?; let term_dictionary: TermDictionary = TermDictionary::open(file)?;
{ {
for i in (0..20).chain(6000..8_000) { for i in (0..20).chain(6000..8_000) {
let (target_key, _) = &ids[i]; let &(ref target_key, _) = &ids[i];
let mut streamer = term_dictionary let mut streamer = term_dictionary
.range() .range()
.ge(target_key.as_bytes()) .ge(target_key.as_bytes())
.into_stream()?; .into_stream()?;
for j in 0..3 { for j in 0..3 {
let (streamer_k, streamer_v) = streamer.next().unwrap(); let (streamer_k, streamer_v) = streamer.next().unwrap();
let (key, v) = &ids[i + j]; let &(ref key, ref v) = &ids[i + j];
assert_eq!(str::from_utf8(streamer_k).unwrap(), key); assert_eq!(str::from_utf8(streamer_k).unwrap(), key);
assert_eq!(streamer_v.doc_freq, *v); assert_eq!(streamer_v.doc_freq, *v);
assert_eq!(streamer_v, &make_term_info(*v as u64)); assert_eq!(streamer_v, &make_term_info(*v as u64));
@@ -190,14 +190,14 @@ fn test_stream_range() -> crate::Result<()> {
{ {
for i in (0..20).chain(BLOCK_SIZE - 10..BLOCK_SIZE + 10) { for i in (0..20).chain(BLOCK_SIZE - 10..BLOCK_SIZE + 10) {
let (target_key, _) = &ids[i]; let &(ref target_key, _) = &ids[i];
let mut streamer = term_dictionary let mut streamer = term_dictionary
.range() .range()
.gt(target_key.as_bytes()) .gt(target_key.as_bytes())
.into_stream()?; .into_stream()?;
for j in 0..3 { for j in 0..3 {
let (streamer_k, streamer_v) = streamer.next().unwrap(); let (streamer_k, streamer_v) = streamer.next().unwrap();
let (key, v) = &ids[i + j + 1]; let &(ref key, ref v) = &ids[i + j + 1];
assert_eq!(streamer_k, key.as_bytes()); assert_eq!(streamer_k, key.as_bytes());
assert_eq!(streamer_v.doc_freq, *v); assert_eq!(streamer_v.doc_freq, *v);
} }
@@ -207,8 +207,8 @@ fn test_stream_range() -> crate::Result<()> {
{ {
for i in (0..20).chain(BLOCK_SIZE - 10..BLOCK_SIZE + 10) { for i in (0..20).chain(BLOCK_SIZE - 10..BLOCK_SIZE + 10) {
for j in 0..3 { for j in 0..3 {
let (fst_key, _) = &ids[i]; let &(ref fst_key, _) = &ids[i];
let (last_key, _) = &ids[i + j]; let &(ref last_key, _) = &ids[i + j];
let mut streamer = term_dictionary let mut streamer = term_dictionary
.range() .range()
.ge(fst_key.as_bytes()) .ge(fst_key.as_bytes())
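The repeated edit in these tests swaps `&(ref key, ref v)` patterns for plain `(key, v)`. With Rust 2018 match ergonomics the two spellings bind the same types, so the change is purely cosmetic. A hypothetical side-by-side:

```rust
// Both functions bind `id: &String` inside the closure; the `ref` form
// predates Rust 2018 match ergonomics and is now redundant.
fn keys_modern(ids: &[(String, u32)]) -> Vec<&str> {
    ids.iter().map(|(id, _)| id.as_str()).collect()
}

fn keys_pre_2018(ids: &[(String, u32)]) -> Vec<&str> {
    ids.iter().map(|&(ref id, _)| id.as_str()).collect()
}
```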

View File

@@ -52,8 +52,6 @@
//! remove their inflection. This tokenizer is slower than the default one, //! remove their inflection. This tokenizer is slower than the default one,
//! but is recommended to improve recall. //! but is recommended to improve recall.
//! //!
//! # Custom tokenizer Library
//! Avoid using tantivy as dependency and prefer `tantivy-tokenizer-api` instead.
//! //!
//! # Custom tokenizers //! # Custom tokenizers
//! //!
@@ -126,7 +124,6 @@ mod facet_tokenizer;
mod lower_caser; mod lower_caser;
mod ngram_tokenizer; mod ngram_tokenizer;
mod raw_tokenizer; mod raw_tokenizer;
mod regex_tokenizer;
mod remove_long; mod remove_long;
mod simple_tokenizer; mod simple_tokenizer;
mod split_compound_words; mod split_compound_words;
@@ -137,24 +134,21 @@ mod tokenizer;
mod tokenizer_manager; mod tokenizer_manager;
mod whitespace_tokenizer; mod whitespace_tokenizer;
pub use tokenizer_api::{
BoxTokenFilter, BoxTokenStream, Token, TokenFilter, TokenStream, Tokenizer,
};
pub use self::alphanum_only::AlphaNumOnlyFilter; pub use self::alphanum_only::AlphaNumOnlyFilter;
pub use self::ascii_folding_filter::AsciiFoldingFilter; pub use self::ascii_folding_filter::AsciiFoldingFilter;
pub use self::facet_tokenizer::FacetTokenizer; pub use self::facet_tokenizer::FacetTokenizer;
pub use self::lower_caser::LowerCaser; pub use self::lower_caser::LowerCaser;
pub use self::ngram_tokenizer::NgramTokenizer; pub use self::ngram_tokenizer::NgramTokenizer;
pub use self::raw_tokenizer::RawTokenizer; pub use self::raw_tokenizer::RawTokenizer;
pub use self::regex_tokenizer::RegexTokenizer;
pub use self::remove_long::RemoveLongFilter; pub use self::remove_long::RemoveLongFilter;
pub use self::simple_tokenizer::SimpleTokenizer; pub use self::simple_tokenizer::SimpleTokenizer;
pub use self::split_compound_words::SplitCompoundWords; pub use self::split_compound_words::SplitCompoundWords;
pub use self::stemmer::{Language, Stemmer}; pub use self::stemmer::{Language, Stemmer};
pub use self::stop_word_filter::StopWordFilter; pub use self::stop_word_filter::StopWordFilter;
pub use self::tokenized_string::{PreTokenizedStream, PreTokenizedString}; pub use self::tokenized_string::{PreTokenizedStream, PreTokenizedString};
pub use self::tokenizer::TextAnalyzer; pub use self::tokenizer::{
BoxTokenFilter, BoxTokenStream, TextAnalyzer, Token, TokenFilter, TokenStream, Tokenizer,
};
pub use self::tokenizer_manager::TokenizerManager; pub use self::tokenizer_manager::TokenizerManager;
pub use self::whitespace_tokenizer::WhitespaceTokenizer; pub use self::whitespace_tokenizer::WhitespaceTokenizer;
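Whichever side of this hunk is kept, downstream code is unaffected: one variant defines `Token`, `Tokenizer`, and friends inside `tantivy::tokenizer`, the other re-exports them from the separate tokenizer-api crate, but the public paths are identical. A sketch of a call site that compiles against either layout, mirroring the doc example further down:

```rust
use tantivy::tokenizer::{LowerCaser, RemoveLongFilter, SimpleTokenizer, TextAnalyzer};

// Builds the same analyzer regardless of whether the tokenizer types live in
// tantivy itself or are re-exported from tantivy-tokenizer-api.
fn default_analyzer() -> TextAnalyzer {
    TextAnalyzer::from(SimpleTokenizer)
        .filter(RemoveLongFilter::limit(40))
        .filter(LowerCaser)
}
```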

View File

@@ -303,7 +303,8 @@ mod tests {
use super::{utf8_codepoint_width, CodepointFrontiers, NgramTokenizer, StutteringIterator}; use super::{utf8_codepoint_width, CodepointFrontiers, NgramTokenizer, StutteringIterator};
use crate::tokenizer::tests::assert_token; use crate::tokenizer::tests::assert_token;
use crate::tokenizer::{BoxTokenStream, Token, Tokenizer}; use crate::tokenizer::tokenizer::Tokenizer;
use crate::tokenizer::{BoxTokenStream, Token};
fn test_helper(mut tokenizer: BoxTokenStream) -> Vec<Token> { fn test_helper(mut tokenizer: BoxTokenStream) -> Vec<Token> {
let mut tokens: Vec<Token> = vec![]; let mut tokens: Vec<Token> = vec![];

View File

@@ -1,158 +0,0 @@
use regex::Regex;
use super::{BoxTokenStream, Token, TokenStream, Tokenizer};
use crate::TantivyError;
/// Tokenize the text by using a regex pattern to split.
/// Each match of the regex emits a distinct token, empty tokens will not be emitted. Anchors such
/// as `\A` will match the text from the part where the last token was emitted or the beginning of
/// the complete text if no token was emitted yet.
///
/// Example: `` 'aaa' bbb 'ccc' 'ddd' `` with the pattern `` '(?:\w*)' `` will be tokenized as
/// follows:
///
/// | Term | aaa | ccc | ddd |
/// |----------|------|--------|-------|
/// | Position | 1 | 2 | 3 |
/// | Offsets |0,5 | 10,15 | 16,21 |
///
///
/// # Example
///
/// ```rust
/// use tantivy::tokenizer::*;
///
/// let tokenizer = RegexTokenizer::new(r"'(?:\w*)'").unwrap();
/// let mut stream = tokenizer.token_stream("'aaa' bbb 'ccc' 'ddd'");
/// {
/// let token = stream.next().unwrap();
/// assert_eq!(token.text, "'aaa'");
/// assert_eq!(token.offset_from, 0);
/// assert_eq!(token.offset_to, 5);
/// }
/// {
/// let token = stream.next().unwrap();
/// assert_eq!(token.text, "'ccc'");
/// assert_eq!(token.offset_from, 10);
/// assert_eq!(token.offset_to, 15);
/// }
/// {
/// let token = stream.next().unwrap();
/// assert_eq!(token.text, "'ddd'");
/// assert_eq!(token.offset_from, 16);
/// assert_eq!(token.offset_to, 21);
/// }
/// assert!(stream.next().is_none());
/// ```
#[derive(Clone)]
pub struct RegexTokenizer {
regex: Regex,
}
impl RegexTokenizer {
/// Creates a new RegexTokenizer.
pub fn new(regex_pattern: &str) -> crate::Result<RegexTokenizer> {
Regex::new(regex_pattern)
.map_err(|_| TantivyError::InvalidArgument(regex_pattern.to_owned()))
.map(|regex| Self { regex })
}
}
impl Tokenizer for RegexTokenizer {
fn token_stream<'a>(&self, text: &'a str) -> BoxTokenStream<'a> {
BoxTokenStream::from(RegexTokenStream {
regex: self.regex.clone(),
text,
token: Token::default(),
cursor: 0,
})
}
}
pub struct RegexTokenStream<'a> {
regex: Regex,
text: &'a str,
token: Token,
cursor: usize,
}
impl<'a> TokenStream for RegexTokenStream<'a> {
fn advance(&mut self) -> bool {
let Some(regex_match) = self.regex.find(self.text) else {
return false;
};
if regex_match.as_str().is_empty() {
return false;
}
self.token.text.clear();
self.token.text.push_str(regex_match.as_str());
self.token.offset_from = self.cursor + regex_match.start();
self.cursor += regex_match.end();
self.token.offset_to = self.cursor;
self.token.position = self.token.position.wrapping_add(1);
self.text = &self.text[regex_match.end()..];
true
}
fn token(&self) -> &Token {
&self.token
}
fn token_mut(&mut self) -> &mut Token {
&mut self.token
}
}
#[cfg(test)]
mod tests {
use crate::tokenizer::regex_tokenizer::RegexTokenizer;
use crate::tokenizer::tests::assert_token;
use crate::tokenizer::{TextAnalyzer, Token};
#[test]
fn test_regex_tokenizer() {
let tokens = token_stream_helper("'aaa' bbb 'ccc' 'ddd'", r"'(?:\w*)'");
assert_eq!(tokens.len(), 3);
assert_token(&tokens[0], 0, "'aaa'", 0, 5);
assert_token(&tokens[1], 1, "'ccc'", 10, 15);
assert_token(&tokens[2], 2, "'ddd'", 16, 21);
}
#[test]
fn test_regexp_tokenizer_no_match_on_input_data() {
let tokens = token_stream_helper("aaa", r"'(?:\w*)'");
assert_eq!(tokens.len(), 0);
}
#[test]
fn test_regexp_tokenizer_no_input_data() {
let tokens = token_stream_helper("", r"'(?:\w*)'");
assert_eq!(tokens.len(), 0);
}
#[test]
fn test_regexp_tokenizer_error_on_invalid_regex() {
let tokenizer = RegexTokenizer::new(r"\@");
assert_eq!(tokenizer.is_err(), true);
assert_eq!(
tokenizer.err().unwrap().to_string(),
"An invalid argument was passed: '\\@'"
);
}
fn token_stream_helper(text: &str, pattern: &str) -> Vec<Token> {
let r = RegexTokenizer::new(pattern).unwrap();
let a = TextAnalyzer::from(r);
let mut token_stream = a.token_stream(text);
let mut tokens: Vec<Token> = vec![];
let mut add_token = |token: &Token| {
tokens.push(token.clone());
};
token_stream.process(&mut add_token);
tokens
}
}

View File

@@ -26,7 +26,7 @@ impl<'a> SimpleTokenStream<'a> {
// search for the end of the current token. // search for the end of the current token.
fn search_token_end(&mut self) -> usize { fn search_token_end(&mut self) -> usize {
(&mut self.chars) (&mut self.chars)
.filter(|(_, c)| !c.is_alphanumeric()) .filter(|&(_, ref c)| !c.is_alphanumeric())
.map(|(offset, _)| offset) .map(|(offset, _)| offset)
.next() .next()
.unwrap_or(self.text.len()) .unwrap_or(self.text.len())

View File

@@ -1,9 +1,42 @@
/// The tokenizer module contains all of the tools used to process /// The tokenizer module contains all of the tools used to process
/// text in `tantivy`. /// text in `tantivy`.
use tokenizer_api::{BoxTokenFilter, BoxTokenStream, Tokenizer}; use std::borrow::{Borrow, BorrowMut};
use std::ops::{Deref, DerefMut};
use serde::{Deserialize, Serialize};
use crate::tokenizer::empty_tokenizer::EmptyTokenizer; use crate::tokenizer::empty_tokenizer::EmptyTokenizer;
/// Token
#[derive(Debug, Clone, Serialize, Deserialize, Eq, PartialEq)]
pub struct Token {
/// Offset (byte index) of the first character of the token.
/// Offsets shall not be modified by token filters.
pub offset_from: usize,
/// Offset (byte index) of the last character of the token + 1.
/// The text that generated the token should be obtained by
/// &text[token.offset_from..token.offset_to]
pub offset_to: usize,
/// Position, expressed in number of tokens.
pub position: usize,
/// Actual text content of the token.
pub text: String,
/// Is the length expressed in term of number of original tokens.
pub position_length: usize,
}
impl Default for Token {
fn default() -> Token {
Token {
offset_from: 0,
offset_to: 0,
position: usize::MAX,
text: String::with_capacity(200),
position_length: 1,
}
}
}
/// `TextAnalyzer` tokenizes an input text into tokens and modifies the resulting `TokenStream`. /// `TextAnalyzer` tokenizes an input text into tokens and modifies the resulting `TokenStream`.
/// ///
/// It simply wraps a `Tokenizer` and a list of `TokenFilter` that are applied sequentially. /// It simply wraps a `Tokenizer` and a list of `TokenFilter` that are applied sequentially.
@@ -79,3 +112,200 @@ impl Clone for TextAnalyzer {
} }
} }
} }
/// `Tokenizer` are in charge of splitting text into a stream of token
/// before indexing.
///
/// See the [module documentation](crate::tokenizer) for more detail.
///
/// # Warning
///
/// This API may change to use associated types.
pub trait Tokenizer: 'static + Send + Sync + TokenizerClone {
/// Creates a token stream for a given `str`.
fn token_stream<'a>(&self, text: &'a str) -> BoxTokenStream<'a>;
}
pub trait TokenizerClone {
fn box_clone(&self) -> Box<dyn Tokenizer>;
}
impl<T: Tokenizer + Clone> TokenizerClone for T {
fn box_clone(&self) -> Box<dyn Tokenizer> {
Box::new(self.clone())
}
}
impl<'a> TokenStream for Box<dyn TokenStream + 'a> {
fn advance(&mut self) -> bool {
let token_stream: &mut dyn TokenStream = self.borrow_mut();
token_stream.advance()
}
fn token<'b>(&'b self) -> &'b Token {
let token_stream: &'b (dyn TokenStream + 'a) = self.borrow();
token_stream.token()
}
fn token_mut<'b>(&'b mut self) -> &'b mut Token {
let token_stream: &'b mut (dyn TokenStream + 'a) = self.borrow_mut();
token_stream.token_mut()
}
}
/// Simple wrapper of `Box<dyn TokenStream + 'a>`.
///
/// See [`TokenStream`] for more information.
pub struct BoxTokenStream<'a>(Box<dyn TokenStream + 'a>);
impl<'a, T> From<T> for BoxTokenStream<'a>
where T: TokenStream + 'a
{
fn from(token_stream: T) -> BoxTokenStream<'a> {
BoxTokenStream(Box::new(token_stream))
}
}
impl<'a> Deref for BoxTokenStream<'a> {
type Target = dyn TokenStream + 'a;
fn deref(&self) -> &Self::Target {
&*self.0
}
}
impl<'a> DerefMut for BoxTokenStream<'a> {
fn deref_mut(&mut self) -> &mut Self::Target {
&mut *self.0
}
}
/// Simple wrapper of `Box<dyn TokenFilter + 'a>`.
///
/// See [`TokenFilter`] for more information.
pub struct BoxTokenFilter(Box<dyn TokenFilter>);
impl Deref for BoxTokenFilter {
type Target = dyn TokenFilter;
fn deref(&self) -> &dyn TokenFilter {
&*self.0
}
}
impl<T: TokenFilter> From<T> for BoxTokenFilter {
fn from(tokenizer: T) -> BoxTokenFilter {
BoxTokenFilter(Box::new(tokenizer))
}
}
/// `TokenStream` is the result of the tokenization.
///
/// It consists of a consumable stream of `Token`s.
///
/// # Example
///
/// ```
/// use tantivy::tokenizer::*;
///
/// let tokenizer = TextAnalyzer::from(SimpleTokenizer)
/// .filter(RemoveLongFilter::limit(40))
/// .filter(LowerCaser);
/// let mut token_stream = tokenizer.token_stream("Hello, happy tax payer");
/// {
/// let token = token_stream.next().unwrap();
/// assert_eq!(&token.text, "hello");
/// assert_eq!(token.offset_from, 0);
/// assert_eq!(token.offset_to, 5);
/// assert_eq!(token.position, 0);
/// }
/// {
/// let token = token_stream.next().unwrap();
/// assert_eq!(&token.text, "happy");
/// assert_eq!(token.offset_from, 7);
/// assert_eq!(token.offset_to, 12);
/// assert_eq!(token.position, 1);
/// }
/// ```
pub trait TokenStream {
/// Advance to the next token
///
/// Returns false if there are no other tokens.
fn advance(&mut self) -> bool;
/// Returns a reference to the current token.
fn token(&self) -> &Token;
/// Returns a mutable reference to the current token.
fn token_mut(&mut self) -> &mut Token;
/// Helper to iterate over tokens. It
/// simply combines a call to `.advance()`
/// and `.token()`.
///
/// ```
/// use tantivy::tokenizer::*;
///
/// let tokenizer = TextAnalyzer::from(SimpleTokenizer)
/// .filter(RemoveLongFilter::limit(40))
/// .filter(LowerCaser);
/// let mut token_stream = tokenizer.token_stream("Hello, happy tax payer");
/// while let Some(token) = token_stream.next() {
/// println!("Token {:?}", token.text);
/// }
/// ```
fn next(&mut self) -> Option<&Token> {
if self.advance() {
Some(self.token())
} else {
None
}
}
/// Helper function to consume the entire `TokenStream`
/// and push the tokens to a sink function.
///
/// Remove this.
fn process(&mut self, sink: &mut dyn FnMut(&Token)) {
while self.advance() {
sink(self.token());
}
}
}
pub trait TokenFilterClone {
fn box_clone(&self) -> BoxTokenFilter;
}
/// Trait for the pluggable components of `Tokenizer`s.
pub trait TokenFilter: 'static + Send + Sync + TokenFilterClone {
/// Wraps a token stream and returns the modified one.
fn transform<'a>(&self, token_stream: BoxTokenStream<'a>) -> BoxTokenStream<'a>;
}
impl<T: TokenFilter + Clone> TokenFilterClone for T {
fn box_clone(&self) -> BoxTokenFilter {
BoxTokenFilter::from(self.clone())
}
}
#[cfg(test)]
mod test {
use super::Token;
#[test]
fn clone() {
let t1 = Token {
position: 1,
offset_from: 2,
offset_to: 3,
text: "abc".to_string(),
position_length: 1,
};
let t2 = t1.clone();
assert_eq!(t1.position, t2.position);
assert_eq!(t1.offset_from, t2.offset_from);
assert_eq!(t1.offset_to, t2.offset_to);
assert_eq!(t1.text, t2.text);
}
}
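The traits above are all a custom tokenizer needs: `Tokenizer::token_stream` hands back a `BoxTokenStream`, and `TokenStream::advance` drives iteration while reusing a single `Token` buffer. A minimal sketch with a hypothetical `CommaTokenizer` (not part of tantivy), following the same structure as the `RegexTokenizer` shown earlier:

```rust
use tantivy::tokenizer::{BoxTokenStream, Token, TokenStream, Tokenizer};

/// Hypothetical tokenizer: one token per comma-separated field.
#[derive(Clone)]
struct CommaTokenizer;

struct CommaTokenStream<'a> {
    text: &'a str,
    cursor: usize,
    token: Token,
}

impl Tokenizer for CommaTokenizer {
    fn token_stream<'a>(&self, text: &'a str) -> BoxTokenStream<'a> {
        BoxTokenStream::from(CommaTokenStream {
            text,
            cursor: 0,
            token: Token::default(),
        })
    }
}

impl<'a> TokenStream for CommaTokenStream<'a> {
    fn advance(&mut self) -> bool {
        if self.cursor >= self.text.len() {
            return false;
        }
        let end = self.text[self.cursor..]
            .find(',')
            .map(|i| self.cursor + i)
            .unwrap_or(self.text.len());
        // Empty fields (e.g. in "a,,b") are emitted as empty tokens in this
        // sketch; a production tokenizer would probably skip them.
        self.token.text.clear();
        self.token.text.push_str(&self.text[self.cursor..end]);
        self.token.offset_from = self.cursor;
        self.token.offset_to = end;
        // Token::default() starts the position at usize::MAX, so wrapping_add
        // gives the first emitted token position 0.
        self.token.position = self.token.position.wrapping_add(1);
        self.cursor = end + 1; // move past the comma (or past the end of the text)
        true
    }

    fn token(&self) -> &Token {
        &self.token
    }

    fn token_mut(&mut self) -> &mut Token {
        &mut self.token
    }
}
```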

View File

@@ -26,7 +26,7 @@ impl<'a> WhitespaceTokenStream<'a> {
// search for the end of the current token. // search for the end of the current token.
fn search_token_end(&mut self) -> usize { fn search_token_end(&mut self) -> usize {
(&mut self.chars) (&mut self.chars)
.filter(|(_, c)| c.is_ascii_whitespace()) .filter(|&(_, ref c)| c.is_ascii_whitespace())
.map(|(offset, _)| offset) .map(|(offset, _)| offset)
.next() .next()
.unwrap_or(self.text.len()) .unwrap_or(self.text.len())

View File

@@ -471,8 +471,8 @@ mod test {
fn bound_strategy() -> impl Strategy<Value = Bound<String>> { fn bound_strategy() -> impl Strategy<Value = Bound<String>> {
prop_oneof![ prop_oneof![
Just(Bound::<String>::Unbounded), Just(Bound::<String>::Unbounded),
"[a-c]{0,5}".prop_map(|key| Bound::Included(key)), "[a-d]*".prop_map(|key| Bound::Included(key)),
"[a-c]{0,5}".prop_map(|key| Bound::Excluded(key)), "[a-d]*".prop_map(|key| Bound::Excluded(key)),
] ]
} }
@@ -490,14 +490,14 @@ mod test {
|(left, right)| match (extract_key(left.as_ref()), extract_key(right.as_ref())) { |(left, right)| match (extract_key(left.as_ref()), extract_key(right.as_ref())) {
(None, _) => true, (None, _) => true,
(_, None) => true, (_, None) => true,
(left, right) => left < right, (left, right) => left <= right,
}, },
) )
} }
proptest! { proptest! {
#[test] #[test]
fn test_proptest_sstable_ranges(words in prop::collection::btree_set("[a-c]{0,6}", 1..100), fn test_prop_test_ranges(words in prop::collection::btree_set("[a-d]*", 1..100),
(lower_bound, upper_bound) in bounds_strategy(), (lower_bound, upper_bound) in bounds_strategy(),
) { ) {
// TODO tweak block size. // TODO tweak block size.

View File

@@ -209,7 +209,7 @@ impl ArenaHashMap {
} }
} }
/// `update` create a new entry for a given key if it does not exist /// `update` create a new entry for a given key if it does not exists
/// or updates the existing entry. /// or updates the existing entry.
/// ///
/// The actual logic for this update is define in the `updater` /// The actual logic for this update is define in the `updater`
@@ -237,7 +237,7 @@ impl ArenaHashMap {
let bucket = probe.next_probe(); let bucket = probe.next_probe();
let kv: KeyValue = self.table[bucket]; let kv: KeyValue = self.table[bucket];
if kv.is_empty() { if kv.is_empty() {
// The key does not exist yet. // The key does not exists yet.
let val = updater(None); let val = updater(None);
let num_bytes = std::mem::size_of::<u16>() + key.len() + std::mem::size_of::<V>(); let num_bytes = std::mem::size_of::<u16>() + key.len() + std::mem::size_of::<V>();
let key_addr = self.memory_arena.allocate_space(num_bytes); let key_addr = self.memory_arena.allocate_space(num_bytes);
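The doc comment above describes an insert-or-update entry point driven by an updater closure. As a rough illustration of that contract only (this is not the `ArenaHashMap` API), the same shape restated with a plain `HashMap`:

```rust
use std::collections::HashMap;

// Updater-closure contract: the closure sees None when the key is new and
// Some(current) otherwise, and returns the value to store.
fn upsert<V: Copy>(
    map: &mut HashMap<Vec<u8>, V>,
    key: &[u8],
    updater: impl FnOnce(Option<V>) -> V,
) {
    let new_value = updater(map.get(key).copied());
    map.insert(key.to_vec(), new_value);
}

fn main() {
    let mut counts: HashMap<Vec<u8>, u32> = HashMap::new();
    upsert(&mut counts, b"term", |old| old.map_or(1, |c| c + 1));
    upsert(&mut counts, b"term", |old| old.map_or(1, |c| c + 1));
    assert_eq!(counts[b"term".as_slice()], 2);
}
```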

View File

@@ -1,11 +0,0 @@
[package]
name = "tantivy-tokenizer-api"
version = "0.1.0"
license = "MIT"
edition = "2021"
description = "Tokenizer API of tantivy"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[dependencies]
serde = { version = "1.0.152", features = ["derive"] }

View File

@@ -1,6 +0,0 @@
#Tokenizer-API
An API to interface a tokenizer with tantivy.
The API will be kept stable in order to not break support for existing tokenizers.

View File

@@ -1,197 +0,0 @@
//! Tokenizer are in charge of chopping text into a stream of tokens
//! ready for indexing. This is an seperate crate from tantivy, so implementors don't need to update
//! for each new tantivy version.
//!
//! To add support for a tokenizer, implement the [`Tokenizer`](crate::Tokenizer) trait.
//! Checkout the [tantivy repo](https://github.com/quickwit-oss/tantivy/tree/main/src/tokenizer) for some examples.
use std::borrow::{Borrow, BorrowMut};
use std::ops::{Deref, DerefMut};
use serde::{Deserialize, Serialize};
/// Token
#[derive(Debug, Clone, Serialize, Deserialize, Eq, PartialEq)]
pub struct Token {
/// Offset (byte index) of the first character of the token.
/// Offsets shall not be modified by token filters.
pub offset_from: usize,
/// Offset (byte index) of the last character of the token + 1.
/// The text that generated the token should be obtained by
/// &text[token.offset_from..token.offset_to]
pub offset_to: usize,
/// Position, expressed in number of tokens.
pub position: usize,
/// Actual text content of the token.
pub text: String,
/// Is the length expressed in term of number of original tokens.
pub position_length: usize,
}
impl Default for Token {
fn default() -> Token {
Token {
offset_from: 0,
offset_to: 0,
position: usize::MAX,
text: String::with_capacity(200),
position_length: 1,
}
}
}
/// `Tokenizer` are in charge of splitting text into a stream of token
/// before indexing.
///
/// # Warning
///
/// This API may change to use associated types.
pub trait Tokenizer: 'static + Send + Sync + TokenizerClone {
/// Creates a token stream for a given `str`.
fn token_stream<'a>(&self, text: &'a str) -> BoxTokenStream<'a>;
}
pub trait TokenizerClone {
fn box_clone(&self) -> Box<dyn Tokenizer>;
}
impl<T: Tokenizer + Clone> TokenizerClone for T {
fn box_clone(&self) -> Box<dyn Tokenizer> {
Box::new(self.clone())
}
}
/// Simple wrapper of `Box<dyn TokenStream + 'a>`.
///
/// See [`TokenStream`] for more information.
pub struct BoxTokenStream<'a>(Box<dyn TokenStream + 'a>);
impl<'a, T> From<T> for BoxTokenStream<'a>
where T: TokenStream + 'a
{
fn from(token_stream: T) -> BoxTokenStream<'a> {
BoxTokenStream(Box::new(token_stream))
}
}
impl<'a> Deref for BoxTokenStream<'a> {
type Target = dyn TokenStream + 'a;
fn deref(&self) -> &Self::Target {
&*self.0
}
}
impl<'a> DerefMut for BoxTokenStream<'a> {
fn deref_mut(&mut self) -> &mut Self::Target {
&mut *self.0
}
}
impl<'a> TokenStream for Box<dyn TokenStream + 'a> {
fn advance(&mut self) -> bool {
let token_stream: &mut dyn TokenStream = self.borrow_mut();
token_stream.advance()
}
fn token<'b>(&'b self) -> &'b Token {
let token_stream: &'b (dyn TokenStream + 'a) = self.borrow();
token_stream.token()
}
fn token_mut<'b>(&'b mut self) -> &'b mut Token {
let token_stream: &'b mut (dyn TokenStream + 'a) = self.borrow_mut();
token_stream.token_mut()
}
}
/// `TokenStream` is the result of the tokenization.
///
/// It consists of a consumable stream of `Token`s.
pub trait TokenStream {
/// Advance to the next token
///
/// Returns false if there are no other tokens.
fn advance(&mut self) -> bool;
/// Returns a reference to the current token.
fn token(&self) -> &Token;
/// Returns a mutable reference to the current token.
fn token_mut(&mut self) -> &mut Token;
/// Helper to iterate over tokens. It
/// simply combines a call to `.advance()`
/// and `.token()`.
fn next(&mut self) -> Option<&Token> {
if self.advance() {
Some(self.token())
} else {
None
}
}
/// Helper function to consume the entire `TokenStream`
/// and push the tokens to a sink function.
fn process(&mut self, sink: &mut dyn FnMut(&Token)) {
while self.advance() {
sink(self.token());
}
}
}
/// Simple wrapper of `Box<dyn TokenFilter + 'a>`.
///
/// See [`TokenFilter`] for more information.
pub struct BoxTokenFilter(Box<dyn TokenFilter>);
impl Deref for BoxTokenFilter {
type Target = dyn TokenFilter;
fn deref(&self) -> &dyn TokenFilter {
&*self.0
}
}
impl<T: TokenFilter> From<T> for BoxTokenFilter {
fn from(tokenizer: T) -> BoxTokenFilter {
BoxTokenFilter(Box::new(tokenizer))
}
}
pub trait TokenFilterClone {
fn box_clone(&self) -> BoxTokenFilter;
}
/// Trait for the pluggable components of `Tokenizer`s.
pub trait TokenFilter: 'static + Send + Sync + TokenFilterClone {
/// Wraps a token stream and returns the modified one.
fn transform<'a>(&self, token_stream: BoxTokenStream<'a>) -> BoxTokenStream<'a>;
}
impl<T: TokenFilter + Clone> TokenFilterClone for T {
fn box_clone(&self) -> BoxTokenFilter {
BoxTokenFilter::from(self.clone())
}
}
#[cfg(test)]
mod test {
use super::*;
#[test]
fn clone() {
let t1 = Token {
position: 1,
offset_from: 2,
offset_to: 3,
text: "abc".to_string(),
position_length: 1,
};
let t2 = t1.clone();
assert_eq!(t1.position, t2.position);
assert_eq!(t1.offset_from, t2.offset_from);
assert_eq!(t1.offset_to, t2.offset_to);
assert_eq!(t1.text, t2.text);
}
}