Compare commits

...

73 Commits

Author SHA1 Message Date
Paul Masurel
d5a1e8389c Revert "Exposing min/max value interface on MultiValuedFastField Reader (#1094)"
This reverts commit bb488305c9.
2021-06-23 10:24:28 +09:00
Evance Soumaoro
bb488305c9 Exposing min/max value interface on MultiValuedFastField Reader (#1094)
Exposing min/max value interface on MultiValuedFastField Reader
2021-06-23 08:51:36 +09:00
PSeitz
f05e84f964 add FieldEntry constructor, closes #1086 (#1090) 2021-06-17 10:15:48 +09:00
PSeitz
65546ed22b Merge pull request #1088 from appaquet/fix/store-reader-iterator-take2
Fix corruption in store reader iterator, take 2
2021-06-16 14:44:00 +02:00
Andre-Philippe Paquet
57ae5b27dc fix store reader iterator, take 2 2021-06-16 07:51:39 -04:00
PSeitz
f9531ec3c9 Merge pull request #1085 from PSeitz/fastfieldcompression
use concrete return type, fixes #1084
2021-06-16 13:07:12 +02:00
Pascal Seitz
5b54a32563 use concrete return type, fixes #1084 2021-06-16 12:03:11 +02:00
PSeitz
cd049e28bc Merge pull request #1082 from PSeitz/fastfieldcompression
use dynamic fast field codec for offset index
2021-06-16 11:59:00 +02:00
PSeitz
646e41bec4 Merge pull request #1083 from PSeitz/termdict_block_layout
Move counting writer to common
2021-06-16 08:57:55 +02:00
Pascal Seitz
36528c5e83 move counting writer to common
move counting writer to common
reuse counting writer in fastfield codec
2021-06-16 08:14:04 +02:00
Pascal Seitz
cd169dee23 use dynamic fast field codec for offset index 2021-06-15 13:34:42 +02:00
PSeitz
b5cc60f80b Merge pull request #1080 from PSeitz/more_tests
test all features in github actions
2021-06-15 10:51:57 +02:00
Pascal Seitz
060b83159a use nightly for tests 2021-06-15 10:08:49 +02:00
Pascal Seitz
a40ff35453 test all features 2021-06-15 09:31:39 +02:00
PSeitz
268e6bfe6e update fast field codec readme 2021-06-15 09:19:39 +02:00
PSeitz
f902440b8b Merge pull request #1072 from PSeitz/fastfieldcompression
Enable support for multiple fastfield codecs, add linear interpolation
2021-06-15 09:07:17 +02:00
Pascal Seitz
77a0902605 replace unwrap, use vec in bench 2021-06-14 17:01:46 +02:00
Pascal Seitz
c889ae10e4 add is_applicable to fast field codecs 2021-06-14 16:16:25 +02:00
Pascal Seitz
0a534c6ee0 rename create to serialize 2021-06-14 15:40:07 +02:00
Pascal Seitz
167d88b449 fix tests behind unstable feature flag 2021-06-14 15:31:12 +02:00
Pascal Seitz
1071ed84f2 fix cond compilation 2021-06-14 14:05:04 +02:00
Pascal Seitz
abb5624af2 add contributing guidelines, add codec comparer binary
add contributing guidelines
add codec comparer binary to test codec compressions with different test data sets
2021-06-14 13:56:40 +02:00
Pascal Seitz
1d41b96d32 rename, add codec_tester 2021-06-14 13:56:40 +02:00
Pascal Seitz
ef4665945f rename file 2021-06-14 13:56:40 +02:00
Pascal Seitz
294cd5fd0b streamline traits and tests 2021-06-14 13:56:40 +02:00
Pascal Seitz
f4d271177c add inline, add readme 2021-06-14 13:56:40 +02:00
Pascal Seitz
451538fecf add serialize for bool 2021-06-14 13:56:40 +02:00
Pascal Seitz
e78e0fec59 add multilinearinterpolation
add multilinearinterpolation, which compresses blocks of size 512
add checks for linear interpolation
2021-06-14 13:56:40 +02:00
Pascal Seitz
2e639cebf8 fix bitpacker bug, reset internal value 2021-06-14 13:56:40 +02:00
Pascal Seitz
e296da7ade add debug and failsafes 2021-06-14 13:56:40 +02:00
Pascal Seitz
3b3e26c4b8 use f64 precision for slope calculation 2021-06-14 13:56:40 +02:00
Pascal Seitz
6a4883ac69 use uniform distribution sampling 2021-06-14 13:56:40 +02:00
Pascal Seitz
0ba05df545 add f32::MAX to disable a compressor 2021-06-14 13:56:40 +02:00
Pascal Seitz
aa3c4d4029 use f32 precision, add inline 2021-06-14 13:56:40 +02:00
Pascal Seitz
60df629725 cargo.toml license desc and author 2021-06-14 13:56:40 +02:00
Pascal Seitz
2570b005ac fix estimation test 2021-06-14 13:56:40 +02:00
Pascal Seitz
d5212cd19d fix clippy 2021-06-14 13:56:40 +02:00
Pascal Seitz
2193d85622 fix clippy and common crate tests 2021-06-14 13:56:40 +02:00
Pascal Seitz
dfdbfe9eff add benchmark for fast field codecs
test tests::bench_fastfield_bitpack_create        ... bench:      57,628 ns/iter (+/- 23,486)
test tests::bench_fastfield_bitpack_get           ... bench:      43,323 ns/iter (+/- 4,286)
test tests::bench_fastfield_linearinterpol_create ... bench:     223,625 ns/iter (+/- 33,563)
test tests::bench_fastfield_linearinterpol_get    ... bench:      82,839 ns/iter (+/- 9,575)
2021-06-14 13:56:40 +02:00
Pascal Seitz
b999e836b2 replace BitpackedFastFieldReader, delete FastFieldSerializer trait 2021-06-14 13:56:40 +02:00
Pascal Seitz
be2dd41e69 add interface to create and read codecs
add CodecReader as common interface in fastfield codec crate
add LinearInterpolation to DynamicFastFieldReader
calc estimation and choose best codec
cleanup
2021-06-14 13:56:40 +02:00
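(A hedged sketch of the "calc estimation and choose best codec" step described in this commit, written against the FastFieldCodecSerializer trait shown later in this diff; the pick_codec helper is hypothetical, not tantivy's actual selection code.)
```
use fastfield_codecs::bitpacked::BitpackedFastFieldSerializer;
use fastfield_codecs::linearinterpol::LinearInterpolFastFieldSerializer;
use fastfield_codecs::{FastFieldCodecSerializer, FastFieldStats};

// Hypothetical helper: return the codec id with the best (lowest) estimated
// compression ratio. Bitpacking is always applicable and acts as the baseline.
fn pick_codec(data: &[u64], stats: FastFieldStats) -> u8 {
    let bitpacked_estimate = BitpackedFastFieldSerializer::estimate(&data, stats.clone());
    if LinearInterpolFastFieldSerializer::is_applicable(&data, stats.clone())
        && LinearInterpolFastFieldSerializer::estimate(&data, stats) < bitpacked_estimate
    {
        return LinearInterpolFastFieldSerializer::ID;
    }
    BitpackedFastFieldSerializer::ID
}
```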
Pascal Seitz
483fdb79cc add linear interpolation estimation
add estimation tests
add codec test data in tests
2021-06-14 13:56:40 +02:00
Pascal Seitz
aefd0fc907 refactor, add fastfield metadata to footer
change api to fastfield reader in codec crate
add fastfield metadata to footer
remove old code
merge codec files
2021-06-14 13:56:40 +02:00
Pascal Seitz
3298d6cb71 move common to common crate, create fastfield_codecs crate
move common to common crate
create fastfield_codecs crate
add bitpacker to fast field codecs
add linear interpolation to fast field codecs
add tests
2021-06-14 13:56:40 +02:00
Pascal Seitz
c02c78ea73 implement linear interpol serializer 2021-06-14 13:56:40 +02:00
Pascal Seitz
6bf4fee1ba support multiple codecs
support multiple codecs
prepend codec id to all fast fields
add new api to create fastfields with access to all data
use new fastfield creation api in initial creation and merge
remove unused collect of data in doc_id_mapping
2021-06-14 13:56:40 +02:00
PSeitz
5209238c1b use github actions for tests 2021-06-14 12:51:46 +02:00
Paul Masurel
7ef25ec400 Bump to 0.15.1 to publish bugfix 2021-06-14 18:45:38 +09:00
PSeitz
221e7cbb55 Merge pull request #1076 from appaquet/fix/store-reader-iterator
Fix panic in store reader raw document iterator during segment merge
2021-06-14 11:22:58 +02:00
Pascal Seitz
873ac1a3ac cleanup import 2021-06-14 10:31:45 +02:00
Pascal Seitz
ebe55a7ae1 refactor test, fixes #1077
replace test with smaller test in doc_store
2021-06-14 10:10:05 +02:00
Bernard Swart
9f32d40b27 Misspelling of misspelled was fixed (#1078) 2021-06-14 16:29:12 +09:00
Andre-Philippe Paquet
8ae10a930a fix formatting 2021-06-13 17:23:40 -04:00
Andre-Philippe Paquet
473a346814 remove debugging 2021-06-13 16:49:44 -04:00
Andre-Philippe Paquet
3a8a0fe79a add fuzzy merge test 2021-06-13 16:42:24 -04:00
Andre-Philippe Paquet
511dc8f87f fix store reader iterator 2021-06-13 16:00:13 -04:00
Paul Masurel
3901295329 Bumped query-grammar version 2021-06-07 10:00:14 +09:00
Paul Masurel
f5918c6c74 Completed bitpacker README 2021-06-07 09:57:17 +09:00
Paul Masurel
abe6b4baec Bumped tantivy version to 0.15 2021-06-07 09:52:48 +09:00
Paul Masurel
6e4b61154f Issue/1070 (#1071)
Add a boolean flag in Query::query_terms indicating whether
position information is required.

Closes #1070
2021-06-03 22:33:20 +09:00
PSeitz
2aad0ced77 add inline to bitpacker (#1064) 2021-05-31 23:15:41 +09:00
Stéphane Campinas
41ea14840d add benchmark of term streams merge (#1024)
* add benchmark of term streams merge
* use union based on FST for merging the term dictionaries
* Rename TermMerger benchmark
2021-05-31 23:15:01 +09:00
PSeitz
dff0ffd38a prepare for multiple fastfield codecs (#1063)
* prepare for multiple fastfield codecs

 prepare for multiple fastfield codecs by wrapping the codecs in an enum #1042

* add FastFieldSerializer trait, add DynamicFastFieldSerializer

add FastFieldSerializer trait
add DynamicFastFieldSerializer enum to wrap all implementors of the FastFieldSerializer trait

* add estimation for fastfield bitpacker
2021-05-31 23:14:14 +09:00
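(A minimal sketch of the enum-wrapping approach this commit describes; DynamicReader and its simplified get_u64 dispatch are illustrative stand-ins, using the reader types from the later fastfield_codecs crate for concreteness, not the PR's actual DynamicFastFieldReader/DynamicFastFieldSerializer.)
```
use fastfield_codecs::bitpacked::BitpackedFastFieldReader;
use fastfield_codecs::linearinterpol::LinearInterpolFastFieldReader;
use fastfield_codecs::FastFieldCodecReader;

// Wrapping the codec readers in an enum gives callers one concrete type,
// while dispatch stays static (no trait objects or generics at the call site).
enum DynamicReader {
    Bitpacked(BitpackedFastFieldReader),
    LinearInterpol(LinearInterpolFastFieldReader),
}

impl DynamicReader {
    fn get_u64(&self, doc: u64, data: &[u8]) -> u64 {
        match self {
            DynamicReader::Bitpacked(reader) => reader.get_u64(doc, data),
            DynamicReader::LinearInterpol(reader) => reader.get_u64(doc, data),
        }
    }
}
```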
PSeitz
8d32c3ba3a Change Footer version handling, Make compression dynamic (#1060)
Change Footer version handling, Make compression dynamic

Change Footer version handling
Simplify version handling by switching to JSON instead of binary serialization.
fixes #1058

Make compression dynamic
Instead of choosing the compression during compile time via a feature flag, you can now have multiple compression algorithms enabled and decide during runtime which one to choose via IndexSettings. Changing the compression algorithm on an index is also supported. The information which algorithm was used in the doc store is stored in the DocStoreFooter. The default is the lz4 block format.
fixes #904

Handle merging of different compressors
Fix feature flag names
Add doc store test for all compressors
2021-05-28 14:57:20 +09:00
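(A hedged sketch of picking the doc store compressor at runtime, as this commit describes. The names IndexSettings::docstore_compression, tantivy::store::Compressor, and Index::builder() are assumptions inferred from the description; check the 0.15 docs for the exact API.)
```
use tantivy::schema::{Schema, TEXT};
use tantivy::store::Compressor;
use tantivy::{Index, IndexSettings};

// Assumed API: choose a compression algorithm at runtime through
// IndexSettings instead of a compile-time feature flag.
fn create_index() -> tantivy::Result<Index> {
    let mut schema_builder = Schema::builder();
    schema_builder.add_text_field("body", TEXT);
    let schema = schema_builder.build();
    let settings = IndexSettings {
        docstore_compression: Compressor::Lz4, // the default lz4 block format
        ..Default::default()
    };
    Index::builder()
        .schema(schema)
        .settings(settings)
        .create_in_ram()
}
```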
Moriyoshi Koizumi
4afba005f9 Provide a means to deal with malformed facet text representation for the query parser (#1056)
* Provide a means to deal with malformed facet text representation for the query parser.
* Specific error enum for the facet parse error.
2021-05-27 12:16:49 +09:00
PSeitz
85fb0cc20a cache field norm reader in merge (#1061) 2021-05-25 21:48:02 +09:00
PSeitz
5ef2d56ec2 Avoid docstore stacking for small segments, fixes #1053 (#1055) 2021-05-24 15:38:49 +09:00
Paul Masurel
fd8e5bdf57 Rename more like this 2021-05-21 16:32:39 +09:00
PSeitz
4f8481a1e4 Detect if segments are stackable with sorting, fixes #1038 (#1054)
* Detect if segments are stackable with sorting, fixes #1038

Detect if segments are stackable when their data ranges on the sort property are disjoint.
Presort segments by their min value on merge, to enable easier stacking.

* move code to function
2021-05-21 15:23:17 +09:00
PSeitz
bcd72e5c14 fix and refactor log merge policy, fixes #1035 (#1043)
* fix and refactor log merge policy, fixes #1035

fixes a bug in the log merge policy where a segment was wrongly referenced by its index

* cleanup

* fix sort order, improve method names

* use itertools groupby, fix serialization test

* minor improvements

* update names
2021-05-19 10:48:46 +09:00
PSeitz
249bc6cf72 upgrade lz4_flex to 0.8 (#1049)
* upgrade lz4_flex to 0.8

* fix set_len
2021-05-19 10:46:01 +09:00
PSeitz
1c0af5765d fix doc store iter error handling, fixes #1047 (#1051) 2021-05-18 21:43:57 +09:00
Paul Masurel
7ba771ed1b Replaced RawDocument by OwnedBytes (#1046) 2021-05-18 14:33:36 +09:00
83 changed files with 3648 additions and 1287 deletions

.github/workflows/test.yml (vendored, new file, 30 additions)

@@ -0,0 +1,30 @@
name: Rust
on:
push:
branches: [ main ]
pull_request:
branches: [ main ]
env:
CARGO_TERM_COLOR: always
jobs:
build:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v2
- name: Build
run: cargo build --verbose --workspace
- name: Install latest nightly to test also against unstable feature flag
uses: actions-rs/toolchain@v1
with:
toolchain: nightly
override: true
components: rustfmt
- name: Run tests
run: cargo test --all-features --verbose --workspace
- name: Check Formatting
run: cargo fmt --all -- --check

View File

@@ -1,3 +1,7 @@
Tantivy 0.15.1
=========================
- Major bugfix. DocStore panics when first block is deleted. (@appaquet) #1077
Tantivy 0.15.0
=========================
- API Changes. Using Range instead of (start, end) in the API and internals (`FileSlice`, `OwnedBytes`, `Snippets`, ...)
@@ -8,11 +12,19 @@ Tantivy 0.15.0
- Bugfix consistent tie break handling in facet's topk (@hardikpnsp) #357
- Date field support for range queries (@rihardsk) #516
- Added lz4-flex as the default compression scheme in tantivy (@PSeitz) #1009
- Renamed a lot of symbols to avoid all uppercasing on acronyms, as per new clippy recommendation. For instance, RAMDirectory -> RamDirectory. (@pmasurel)
- Renamed a lot of symbols to avoid all uppercasing on acronyms, as per new clippy recommendation. For instance, RAMDirectory -> RamDirectory. (@fulmicoton)
- Simplified positions index format (@fulmicoton) #1022
- Moved bitpacking to bitpacker subcrate and add BlockedBitpacker, which bitpacks blocks of 128 elements (@PSeitz) #1030
- Added support for more-like-this query in tantivy (@evanxg852000) #1011
- Added support for sorting an index, e.g presorting documents in an index by a timestamp field. This can heavily improve performance for certain scenarios, by utilizing the sorted data (Top-n optimizations). #1026
- Added support for sorting an index, e.g presorting documents in an index by a timestamp field. This can heavily improve performance for certain scenarios, by utilizing the sorted data (Top-n optimizations)(@PSeitz). #1026
- Add iterator over documents in doc store (@PSeitz). #1044
- Fix log merge policy (@PSeitz). #1043
- Add detection to avoid small doc store blocks on merge (@PSeitz). #1054
- Make doc store compression dynamic (@PSeitz). #1060
- Switch to json for footer version handling (@PSeitz). #1060
- Updated TermMerger implementation to rely on the union feature of the FST (@scampi) #469
- Add boolean marking whether position is required in the query_terms API call (@fulmicoton). #1070
Tantivy 0.14.0
=========================

View File

@@ -1,6 +1,6 @@
[package]
name = "tantivy"
version = "0.14.0"
version = "0.15.1"
authors = ["Paul Masurel <paul.masurel@gmail.com>"]
license = "MIT"
categories = ["database-implementations", "data-structures"]
@@ -20,8 +20,7 @@ once_cell = "1.7.2"
regex ={ version = "1.5.4", default-features = false, features = ["std"] }
tantivy-fst = "0.3"
memmap = {version = "0.7", optional=true}
lz4_flex = { version = "0.7.5", default-features = false, features = ["checked-decode"], optional = true }
lz4 = { version = "1.23.2", optional = true }
lz4_flex = { version = "0.8.0", default-features = false, features = ["checked-decode"], optional = true }
brotli = { version = "3.3", optional = true }
snap = { version = "1.0.5", optional = true }
tempfile = { version = "3.2", optional = true }
@@ -34,8 +33,10 @@ levenshtein_automata = "0.2"
uuid = { version = "0.8.2", features = ["v4", "serde"] }
crossbeam = "0.8"
futures = { version = "0.3.15", features = ["thread-pool"] }
tantivy-query-grammar = { version="0.14.0", path="./query-grammar" }
tantivy-query-grammar = { version="0.15.0", path="./query-grammar" }
tantivy-bitpacker = { version="0.1", path="./bitpacker" }
common = { version="0.1", path="./common" }
fastfield_codecs = { version="0.1", path="./fastfield_codecs", default-features = false }
stable_deref_trait = "1.2"
rust-stemmers = "1.2"
downcast-rs = "1.2"
@@ -77,18 +78,19 @@ debug-assertions = true
overflow-checks = true
[features]
default = ["mmap", "lz4-block-compression" ]
default = ["mmap", "lz4-compression" ]
mmap = ["fs2", "tempfile", "memmap"]
brotli-compression = ["brotli"]
lz4-compression = ["lz4"]
lz4-block-compression = ["lz4_flex"]
lz4-compression = ["lz4_flex"]
snappy-compression = ["snap"]
failpoints = ["fail/failpoints"]
unstable = [] # useful for benches.
wasm-bindgen = ["uuid/wasm-bindgen"]
[workspace]
members = ["query-grammar", "bitpacker"]
members = ["query-grammar", "bitpacker", "common", "fastfield_codecs"]
[badges]
travis-ci = { repository = "tantivy-search/tantivy" }

View File

@@ -18,5 +18,6 @@ install:
build: false
test_script:
- REM SET RUST_LOG=tantivy,test & cargo test --all --verbose --no-default-features --features lz4-block-compression --features mmap
- REM SET RUST_LOG=tantivy,test & cargo test --all --verbose --no-default-features --features lz4-compression --features mmap
- REM SET RUST_LOG=tantivy,test & cargo test test_store --verbose --no-default-features --features lz4-compression --features snappy-compression --features brotli-compression --features mmap
- REM SET RUST_BACKTRACE=1 & cargo build --examples

View File

@@ -2,6 +2,13 @@
name = "tantivy-bitpacker"
version = "0.1.0"
edition = "2018"
authors = ["Paul Masurel <paul.masurel@gmail.com>"]
license = "MIT"
categories = []
description = """Tantivy-sub crate: bitpacking"""
repository = "https://github.com/tantivy-search/tantivy"
keywords = []
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

View File

@@ -17,6 +17,7 @@ impl BitPacker {
}
}
#[inline]
pub fn write<TWrite: io::Write>(
&mut self,
val: u64,
@@ -48,6 +49,7 @@ impl BitPacker {
let bytes = self.mini_buffer.to_le_bytes();
output.write_all(&bytes[..num_bytes])?;
self.mini_buffer_written = 0;
self.mini_buffer = 0;
}
Ok(())
}
@@ -60,7 +62,7 @@ impl BitPacker {
}
}
#[derive(Clone)]
#[derive(Clone, Debug, Default)]
pub struct BitUnpacker {
num_bits: u64,
mask: u64,
@@ -79,6 +81,7 @@ impl BitUnpacker {
}
}
#[inline]
pub fn get(&self, idx: u64, data: &[u8]) -> u64 {
if self.num_bits == 0 {
return 0u64;

View File

@@ -80,6 +80,7 @@ impl BlockedBitpacker {
* std::mem::size_of_val(&self.buffer.get(0).cloned().unwrap_or_default())
}
#[inline]
pub fn add(&mut self, val: u64) {
self.buffer.push(val);
if self.buffer.len() == BLOCK_SIZE as usize {
@@ -122,6 +123,7 @@ impl BlockedBitpacker {
.resize(self.compressed_blocks.len() + 8, 0); // add padding for bitpacker
}
}
#[inline]
pub fn get(&self, idx: usize) -> u64 {
let metadata_pos = idx / BLOCK_SIZE as usize;
let pos_in_block = idx % BLOCK_SIZE as usize;

common/Cargo.toml (new file, 12 additions)

@@ -0,0 +1,12 @@
[package]
name = "common"
version = "0.1.0"
authors = ["Paul Masurel <paul@quickwit.io>", "Pascal Seitz <pascal@quickwit.io>"]
license = "MIT"
edition = "2018"
description = "common traits and utility functions used by multiple tantivy subcrates"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[dependencies]
byteorder = "1.4.3"

common/src/lib.rs (new file, 9 additions)

@@ -0,0 +1,9 @@
pub use byteorder::LittleEndian as Endianness;
mod serialize;
mod vint;
mod writer;
pub use serialize::{BinarySerializable, DeserializeFrom, FixedSize};
pub use vint::{read_u32_vint, read_u32_vint_no_advance, serialize_vint_u32, write_u32_vint, VInt};
pub use writer::{AntiCallToken, CountingWriter, TerminatingWrite};

View File

@@ -1,5 +1,5 @@
use crate::common::Endianness;
use crate::common::VInt;
use crate::Endianness;
use crate::VInt;
use byteorder::{ReadBytesExt, WriteBytesExt};
use std::fmt;
use std::io;
@@ -14,6 +14,20 @@ pub trait BinarySerializable: fmt::Debug + Sized {
fn deserialize<R: Read>(reader: &mut R) -> io::Result<Self>;
}
pub trait DeserializeFrom<T: BinarySerializable> {
fn deserialize(&mut self) -> io::Result<T>;
}
/// Implement deserialize from &[u8] for all types which implement BinarySerializable.
///
/// TryFrom would actually be preferable, but not possible because of the orphan
/// rules (not completely sure if this could be resolved)
impl<T: BinarySerializable> DeserializeFrom<T> for &[u8] {
fn deserialize(&mut self) -> io::Result<T> {
T::deserialize(self)
}
}
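// Usage sketch (hypothetical): any type implementing BinarySerializable can
// now be read directly off a byte slice, and the slice advances past the
// consumed bytes because &[u8] implements Read:
//     let mut bytes: &[u8] = &buffer[..];
//     let val: u32 = bytes.deserialize()?;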
/// `FixedSize` marks a `BinarySerializable` as
/// always serializing to the same size.
pub trait FixedSize: BinarySerializable {
@@ -61,6 +75,11 @@ impl<Left: BinarySerializable, Right: BinarySerializable> BinarySerializable for
Ok((Left::deserialize(reader)?, Right::deserialize(reader)?))
}
}
impl<Left: BinarySerializable + FixedSize, Right: BinarySerializable + FixedSize> FixedSize
for (Left, Right)
{
const SIZE_IN_BYTES: usize = Left::SIZE_IN_BYTES + Right::SIZE_IN_BYTES;
}
impl BinarySerializable for u32 {
fn serialize<W: Write>(&self, writer: &mut W) -> io::Result<()> {
@@ -141,6 +160,28 @@ impl FixedSize for u8 {
const SIZE_IN_BYTES: usize = 1;
}
impl BinarySerializable for bool {
fn serialize<W: Write>(&self, writer: &mut W) -> io::Result<()> {
let val = if *self { 1 } else { 0 };
writer.write_u8(val)
}
fn deserialize<R: Read>(reader: &mut R) -> io::Result<bool> {
let val = reader.read_u8()?;
match val {
0 => Ok(false),
1 => Ok(true),
_ => Err(io::Error::new(
io::ErrorKind::InvalidData,
"invalid bool value on deserialization, data corrupted",
)),
}
}
}
impl FixedSize for bool {
const SIZE_IN_BYTES: usize = 1;
}
impl BinarySerializable for String {
fn serialize<W: Write>(&self, writer: &mut W) -> io::Result<()> {
let data: &[u8] = self.as_bytes();
@@ -161,9 +202,9 @@ impl BinarySerializable for String {
#[cfg(test)]
pub mod test {
use super::VInt;
use super::*;
use crate::common::VInt;
use crate::serialize::BinarySerializable;
pub fn fixed_size_test<O: BinarySerializable + FixedSize + Default>() {
let mut buffer = Vec::new();
O::default().serialize(&mut buffer).unwrap();

View File

@@ -175,8 +175,8 @@ impl BinarySerializable for VInt {
mod tests {
use super::serialize_vint_u32;
use super::BinarySerializable;
use super::VInt;
use crate::common::BinarySerializable;
fn aux_test_vint(val: u64) {
let mut v = [14u8; 10];

View File

@@ -1,7 +1,4 @@
use crate::directory::AntiCallToken;
use crate::directory::TerminatingWrite;
use std::io;
use std::io::Write;
use std::io::{self, BufWriter, Write};
pub struct CountingWriter<W> {
underlying: W,
@@ -16,41 +13,87 @@ impl<W: Write> CountingWriter<W> {
}
}
#[inline]
pub fn written_bytes(&self) -> u64 {
self.written_bytes
}
/// Returns the underlying write object.
/// Note that this method does not trigger any flushing.
#[inline]
pub fn finish(self) -> W {
self.underlying
}
}
impl<W: Write> Write for CountingWriter<W> {
#[inline]
fn write(&mut self, buf: &[u8]) -> io::Result<usize> {
let written_size = self.underlying.write(buf)?;
self.written_bytes += written_size as u64;
Ok(written_size)
}
#[inline]
fn write_all(&mut self, buf: &[u8]) -> io::Result<()> {
self.underlying.write_all(buf)?;
self.written_bytes += buf.len() as u64;
Ok(())
}
#[inline]
fn flush(&mut self) -> io::Result<()> {
self.underlying.flush()
}
}
impl<W: TerminatingWrite> TerminatingWrite for CountingWriter<W> {
#[inline]
fn terminate_ref(&mut self, token: AntiCallToken) -> io::Result<()> {
self.underlying.terminate_ref(token)
}
}
/// Struct used to prevent users from calling [`terminate_ref`](trait.TerminatingWrite#method.terminate_ref) directly
///
/// The point is that while the type is public, it cannot be built by anyone
/// outside of this module.
pub struct AntiCallToken(());
/// Trait used to indicate that no more writes need to be done on a writer
pub trait TerminatingWrite: Write {
/// Indicate that the writer will no longer be used. Internally call terminate_ref.
fn terminate(mut self) -> io::Result<()>
where
Self: Sized,
{
self.terminate_ref(AntiCallToken(()))
}
/// You should implement this function to define custom behavior.
/// This function should flush any buffer it may hold.
fn terminate_ref(&mut self, _: AntiCallToken) -> io::Result<()>;
}
impl<W: TerminatingWrite + ?Sized> TerminatingWrite for Box<W> {
fn terminate_ref(&mut self, token: AntiCallToken) -> io::Result<()> {
self.as_mut().terminate_ref(token)
}
}
impl<W: TerminatingWrite> TerminatingWrite for BufWriter<W> {
fn terminate_ref(&mut self, a: AntiCallToken) -> io::Result<()> {
self.flush()?;
self.get_mut().terminate_ref(a)
}
}
impl<'a> TerminatingWrite for &'a mut Vec<u8> {
fn terminate_ref(&mut self, _a: AntiCallToken) -> io::Result<()> {
self.flush()
}
}
#[cfg(test)]
mod test {

View File

@@ -10,7 +10,7 @@
// ---
// Importing tantivy...
use tantivy::collector::{Collector, SegmentCollector};
use tantivy::fastfield::FastFieldReader;
use tantivy::fastfield::{DynamicFastFieldReader, FastFieldReader};
use tantivy::query::QueryParser;
use tantivy::schema::Field;
use tantivy::schema::{Schema, FAST, INDEXED, TEXT};
@@ -98,7 +98,7 @@ impl Collector for StatsCollector {
}
struct StatsSegmentCollector {
fast_field_reader: FastFieldReader<u64>,
fast_field_reader: DynamicFastFieldReader<u64>,
stats: Stats,
}

View File

@@ -90,7 +90,7 @@ fn main() -> tantivy::Result<()> {
let frankenstein_isbn = Term::from_field_text(isbn, "978-9176370711");
// Oops our frankenstein doc seems mispelled
// Oops our frankenstein doc seems misspelled
let frankenstein_doc_misspelled = extract_doc_given_isbn(&reader, &frankenstein_isbn)?.unwrap();
assert_eq!(
schema.to_json(&frankenstein_doc_misspelled),

View File

@@ -92,7 +92,7 @@ fn main() -> tantivy::Result<()> {
// Check the reference doc for different ways to create a `Facet` object.
{
let facet = Facet::from_text("/Felidae/Pantherinae");
let facet = Facet::from("/Felidae/Pantherinae");
let facet_term = Term::from_facet(classification, &facet);
let facet_term_query = TermQuery::new(facet_term, IndexRecordOption::Basic);
let mut facet_collector = FacetCollector::for_field(classification);

View File

@@ -0,0 +1,25 @@
[package]
name = "fastfield_codecs"
version = "0.1.0"
authors = ["Pascal Seitz <pascal@quickwit.io>"]
license = "MIT"
edition = "2018"
description = "Fast field codecs used by tantivy"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[dependencies]
common = { path = "../common/" }
tantivy-bitpacker = { path = "../bitpacker/" }
prettytable-rs = {version="0.8.0", optional= true}
#prettytable-rs = {version="0.8.0" }
rand = "0.8.3"
[dev-dependencies]
more-asserts = "0.2.1"
rand = "0.8.3"
[features]
bin = ["prettytable-rs"]
default = ["bin"]

View File

@@ -0,0 +1,68 @@
# Fast Field Codecs
This crate contains various fast field codecs, used to compress/decompress fast field data in tantivy.
## Contributing
Contributing is pretty straightforward. Since bitpacking is the simplest codec, you can use it as a reference.
A codec needs to implement two traits; a minimal sketch follows the list:
- A reader implementing `FastFieldCodecReader` to decode the stored data.
- A serializer implementing `FastFieldCodecSerializer`, which handles serialization and compression estimation and provides the codec's name and id.
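Below is a minimal, hypothetical sketch of a codec implementing both traits as they are defined in `src/lib.rs`. The `Raw` codec simply stores each value as a little-endian u64 followed by a 16-byte min/max footer; its name and id are invented for this example, and a real codec would need a non-colliding id.
```
use std::convert::TryInto;
use std::io::{self, Write};

use fastfield_codecs::{
    FastFieldCodecReader, FastFieldCodecSerializer, FastFieldDataAccess, FastFieldStats,
};

/// Illustrative "raw" codec reader: values are stored uncompressed as
/// little-endian u64s, followed by a 16-byte footer holding min and max.
pub struct RawReader {
    min: u64,
    max: u64,
}

impl FastFieldCodecReader for RawReader {
    fn open_from_bytes(bytes: &[u8]) -> io::Result<Self> {
        let footer = &bytes[bytes.len() - 16..];
        let min = u64::from_le_bytes(footer[..8].try_into().unwrap());
        let max = u64::from_le_bytes(footer[8..].try_into().unwrap());
        Ok(RawReader { min, max })
    }
    fn get_u64(&self, doc: u64, data: &[u8]) -> u64 {
        let start = doc as usize * 8;
        u64::from_le_bytes(data[start..start + 8].try_into().unwrap())
    }
    fn min_value(&self) -> u64 {
        self.min
    }
    fn max_value(&self) -> u64 {
        self.max
    }
}

pub struct RawSerializer;

impl FastFieldCodecSerializer for RawSerializer {
    const NAME: &'static str = "Raw";
    const ID: u8 = 255; // made up; must not collide with existing codec ids
    fn is_applicable(_: &impl FastFieldDataAccess, _: FastFieldStats) -> bool {
        true
    }
    fn estimate(_: &impl FastFieldDataAccess, _: FastFieldStats) -> f32 {
        1.0 // raw u64s never beat the uncompressed baseline
    }
    fn serialize(
        write: &mut impl Write,
        _accessor: &impl FastFieldDataAccess,
        stats: FastFieldStats,
        data_iter: impl Iterator<Item = u64>,
        _data_iter1: impl Iterator<Item = u64>,
    ) -> io::Result<()> {
        for val in data_iter {
            write.write_all(&val.to_le_bytes())?;
        }
        write.write_all(&stats.min_value.to_le_bytes())?;
        write.write_all(&stats.max_value.to_le_bytes())?;
        Ok(())
    }
}
```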
### Tests
Once the traits are implemented, test and benchmark integration is pretty easy (see `test_with_codec_data_sets` and `bench.rs`).
Make sure to add the codec to `main.rs`, which tests the compression ratio and estimation against different data sets. You can run it with:
```
cargo run --features bin
```
### TODO
- Add real-world data sets to the comparison
- Add a codec to cover sparse data sets
### Codec Comparison
```
+----------------------------------+-------------------+------------------------+
| | Compression Ratio | Compression Estimation |
+----------------------------------+-------------------+------------------------+
| Autoincrement | | |
+----------------------------------+-------------------+------------------------+
| LinearInterpol | 0.000039572664 | 0.000004396963 |
+----------------------------------+-------------------+------------------------+
| MultiLinearInterpol | 0.1477348 | 0.17275847 |
+----------------------------------+-------------------+------------------------+
| Bitpacked | 0.28126493 | 0.28125 |
+----------------------------------+-------------------+------------------------+
| Monotonically increasing concave | | |
+----------------------------------+-------------------+------------------------+
| LinearInterpol | 0.25003937 | 0.26562938 |
+----------------------------------+-------------------+------------------------+
| MultiLinearInterpol | 0.190665 | 0.1883836 |
+----------------------------------+-------------------+------------------------+
| Bitpacked | 0.31251436 | 0.3125 |
+----------------------------------+-------------------+------------------------+
| Monotonically increasing convex | | |
+----------------------------------+-------------------+------------------------+
| LinearInterpol | 0.25003937 | 0.28125438 |
+----------------------------------+-------------------+------------------------+
| MultiLinearInterpol | 0.18676 | 0.2040086 |
+----------------------------------+-------------------+------------------------+
| Bitpacked | 0.31251436 | 0.3125 |
+----------------------------------+-------------------+------------------------+
| Almost monotonically increasing | | |
+----------------------------------+-------------------+------------------------+
| LinearInterpol | 0.14066513 | 0.1562544 |
+----------------------------------+-------------------+------------------------+
| MultiLinearInterpol | 0.16335973 | 0.17275847 |
+----------------------------------+-------------------+------------------------+
| Bitpacked | 0.28126493 | 0.28125 |
+----------------------------------+-------------------+------------------------+
```

View File

@@ -0,0 +1,109 @@
#![feature(test)]
extern crate test;
#[cfg(test)]
mod tests {
use fastfield_codecs::{
bitpacked::{BitpackedFastFieldReader, BitpackedFastFieldSerializer},
linearinterpol::{LinearInterpolFastFieldReader, LinearInterpolFastFieldSerializer},
multilinearinterpol::{
MultiLinearInterpolFastFieldReader, MultiLinearInterpolFastFieldSerializer,
},
*,
};
fn get_data() -> Vec<u64> {
let mut data: Vec<_> = (100..55000_u64)
.map(|num| num + rand::random::<u8>() as u64)
.collect();
data.push(99_000);
data.insert(1000, 2000);
data.insert(2000, 100);
data.insert(3000, 4100);
data.insert(4000, 100);
data.insert(5000, 800);
data
}
fn value_iter() -> impl Iterator<Item = u64> {
let data = (0..20_000).collect::<Vec<_>>();
data.into_iter()
}
fn bench_get<S: FastFieldCodecSerializer, R: FastFieldCodecReader>(
b: &mut Bencher,
data: &[u64],
) {
let mut bytes = vec![];
S::serialize(
&mut bytes,
&data,
stats_from_vec(&data),
data.iter().cloned(),
data.iter().cloned(),
)
.unwrap();
let reader = R::open_from_bytes(&bytes).unwrap();
b.iter(|| {
for pos in value_iter() {
reader.get_u64(pos as u64, &bytes);
}
});
}
fn bench_create<S: FastFieldCodecSerializer>(b: &mut Bencher, data: &[u64]) {
let mut bytes = vec![];
b.iter(|| {
S::serialize(
&mut bytes,
&data,
stats_from_vec(&data),
data.iter().cloned(),
data.iter().cloned(),
)
.unwrap();
});
}
use test::Bencher;
#[bench]
fn bench_fastfield_bitpack_create(b: &mut Bencher) {
let data: Vec<_> = get_data();
bench_create::<BitpackedFastFieldSerializer>(b, &data);
}
#[bench]
fn bench_fastfield_linearinterpol_create(b: &mut Bencher) {
let data: Vec<_> = get_data();
bench_create::<LinearInterpolFastFieldSerializer>(b, &data);
}
#[bench]
fn bench_fastfield_multilinearinterpol_create(b: &mut Bencher) {
let data: Vec<_> = get_data();
bench_create::<MultiLinearInterpolFastFieldSerializer>(b, &data);
}
#[bench]
fn bench_fastfield_bitpack_get(b: &mut Bencher) {
let data: Vec<_> = get_data();
bench_get::<BitpackedFastFieldSerializer, BitpackedFastFieldReader>(b, &data);
}
#[bench]
fn bench_fastfield_linearinterpol_get(b: &mut Bencher) {
let data: Vec<_> = get_data();
bench_get::<LinearInterpolFastFieldSerializer, LinearInterpolFastFieldReader>(b, &data);
}
#[bench]
fn bench_fastfield_multilinearinterpol_get(b: &mut Bencher) {
let data: Vec<_> = get_data();
bench_get::<MultiLinearInterpolFastFieldSerializer, MultiLinearInterpolFastFieldReader>(
b, &data,
);
}
pub fn stats_from_vec(data: &[u64]) -> FastFieldStats {
let min_value = data.iter().cloned().min().unwrap_or(0);
let max_value = data.iter().cloned().max().unwrap_or(0);
FastFieldStats {
min_value,
max_value,
num_vals: data.len() as u64,
}
}
}

View File

@@ -0,0 +1,176 @@
use crate::FastFieldCodecReader;
use crate::FastFieldCodecSerializer;
use crate::FastFieldDataAccess;
use crate::FastFieldStats;
use common::BinarySerializable;
use std::io::{self, Write};
use tantivy_bitpacker::compute_num_bits;
use tantivy_bitpacker::BitPacker;
use tantivy_bitpacker::BitUnpacker;
/// Depending on the field type, a different
/// fast field is required.
#[derive(Clone)]
pub struct BitpackedFastFieldReader {
bit_unpacker: BitUnpacker,
pub min_value_u64: u64,
pub max_value_u64: u64,
}
impl<'data> FastFieldCodecReader for BitpackedFastFieldReader {
/// Opens a fast field given a file.
fn open_from_bytes(bytes: &[u8]) -> io::Result<Self> {
let (_data, mut footer) = bytes.split_at(bytes.len() - 16);
let min_value = u64::deserialize(&mut footer)?;
let amplitude = u64::deserialize(&mut footer)?;
let max_value = min_value + amplitude;
let num_bits = compute_num_bits(amplitude);
let bit_unpacker = BitUnpacker::new(num_bits);
Ok(BitpackedFastFieldReader {
min_value_u64: min_value,
max_value_u64: max_value,
bit_unpacker,
})
}
#[inline]
fn get_u64(&self, doc: u64, data: &[u8]) -> u64 {
self.min_value_u64 + self.bit_unpacker.get(doc, &data)
}
#[inline]
fn min_value(&self) -> u64 {
self.min_value_u64
}
#[inline]
fn max_value(&self) -> u64 {
self.max_value_u64
}
}
pub struct BitpackedFastFieldSerializerLegacy<'a, W: 'a + Write> {
bit_packer: BitPacker,
write: &'a mut W,
min_value: u64,
amplitude: u64,
num_bits: u8,
}
impl<'a, W: Write> BitpackedFastFieldSerializerLegacy<'a, W> {
/// Creates a new fast field serializer.
///
/// The serializer in fact encodes the values by bitpacking
/// `(val - min_value)`.
///
/// It requires a `min_value` and a `max_value` to compute
/// the minimum number of bits required to encode the
/// values.
pub fn open(
write: &'a mut W,
min_value: u64,
max_value: u64,
) -> io::Result<BitpackedFastFieldSerializerLegacy<'a, W>> {
assert!(min_value <= max_value);
let amplitude = max_value - min_value;
let num_bits = compute_num_bits(amplitude);
let bit_packer = BitPacker::new();
Ok(BitpackedFastFieldSerializerLegacy {
bit_packer,
write,
min_value,
amplitude,
num_bits,
})
}
/// Pushes a new value to the currently open u64 fast field.
#[inline]
pub fn add_val(&mut self, val: u64) -> io::Result<()> {
let val_to_write: u64 = val - self.min_value;
self.bit_packer
.write(val_to_write, self.num_bits, &mut self.write)?;
Ok(())
}
pub fn close_field(mut self) -> io::Result<()> {
self.bit_packer.close(&mut self.write)?;
self.min_value.serialize(&mut self.write)?;
self.amplitude.serialize(&mut self.write)?;
Ok(())
}
}
pub struct BitpackedFastFieldSerializer {}
impl FastFieldCodecSerializer for BitpackedFastFieldSerializer {
const NAME: &'static str = "Bitpacked";
const ID: u8 = 1;
/// Serializes data with the BitpackedFastFieldSerializer.
///
/// The serializer in fact encodes the values by bitpacking
/// `(val - min_value)`.
///
/// It requires a `min_value` and a `max_value` to compute
/// the minimum number of bits required to encode the
/// values.
fn serialize(
write: &mut impl Write,
_fastfield_accessor: &impl FastFieldDataAccess,
stats: FastFieldStats,
data_iter: impl Iterator<Item = u64>,
_data_iter1: impl Iterator<Item = u64>,
) -> io::Result<()> {
let mut serializer =
BitpackedFastFieldSerializerLegacy::open(write, stats.min_value, stats.max_value)?;
for val in data_iter {
serializer.add_val(val)?;
}
serializer.close_field()?;
Ok(())
}
fn is_applicable(
_fastfield_accessor: &impl FastFieldDataAccess,
_stats: FastFieldStats,
) -> bool {
true
}
fn estimate(_fastfield_accessor: &impl FastFieldDataAccess, stats: FastFieldStats) -> f32 {
let amplitude = stats.max_value - stats.min_value;
let num_bits = compute_num_bits(amplitude);
let num_bits_uncompressed = 64;
num_bits as f32 / num_bits_uncompressed as f32
}
}
#[cfg(test)]
mod tests {
use super::*;
use crate::tests::get_codec_test_data_sets;
fn create_and_validate(data: &[u64], name: &str) {
crate::tests::create_and_validate::<BitpackedFastFieldSerializer, BitpackedFastFieldReader>(
&data, name,
);
}
#[test]
fn test_with_codec_data_sets() {
let data_sets = get_codec_test_data_sets();
for (mut data, name) in data_sets {
create_and_validate(&data, name);
data.reverse();
create_and_validate(&data, name);
}
}
#[test]
fn bitpacked_fast_field_rand() {
for _ in 0..500 {
let mut data = (0..1 + rand::random::<u8>() as usize)
.map(|_| rand::random::<i64>() as u64 / 2 as u64)
.collect::<Vec<_>>();
create_and_validate(&data, "rand");
data.reverse();
create_and_validate(&data, "rand");
}
}
}

fastfield_codecs/src/lib.rs (new file, 231 additions)

@@ -0,0 +1,231 @@
#[cfg(test)]
#[macro_use]
extern crate more_asserts;
use std::io;
use std::io::Write;
pub mod bitpacked;
pub mod linearinterpol;
pub mod multilinearinterpol;
pub trait FastFieldCodecReader: Sized {
/// reads the metadata and returns the CodecReader
fn open_from_bytes(bytes: &[u8]) -> std::io::Result<Self>;
fn get_u64(&self, doc: u64, data: &[u8]) -> u64;
fn min_value(&self) -> u64;
fn max_value(&self) -> u64;
}
/// The FastFieldCodecSerializer trait is required on all variants
/// of fast field compressions, to decide which one to choose.
pub trait FastFieldCodecSerializer {
/// A codec needs to provide a unique name and id, which are
/// used for debugging and de/serialization.
const NAME: &'static str;
const ID: u8;
/// Check if the Codec is able to compress the data
fn is_applicable(fastfield_accessor: &impl FastFieldDataAccess, stats: FastFieldStats) -> bool;
/// Returns an estimate of the compression ratio.
/// The baseline is uncompressed 64bit data.
///
/// It could make sense to also return a value representing
/// computational complexity.
fn estimate(fastfield_accessor: &impl FastFieldDataAccess, stats: FastFieldStats) -> f32;
/// Serializes the data using the serializer into write.
/// There are multiple iterators, in case the codec needs to read the data multiple times.
/// The iterators should be preferred over using fastfield_accessor for performance reasons.
fn serialize(
write: &mut impl Write,
fastfield_accessor: &impl FastFieldDataAccess,
stats: FastFieldStats,
data_iter: impl Iterator<Item = u64>,
data_iter1: impl Iterator<Item = u64>,
) -> io::Result<()>;
}
/// FastFieldDataAccess is the trait to access fast field data during serialization and estimation.
pub trait FastFieldDataAccess {
/// Return the value associated to the given document.
///
/// Whenever possible use the Iterator passed to the fastfield creation instead, for performance reasons.
///
/// # Panics
///
/// May panic if `doc` is greater than the segment
fn get(&self, doc: u32) -> u64;
}
#[derive(Debug, Clone)]
pub struct FastFieldStats {
pub min_value: u64,
pub max_value: u64,
pub num_vals: u64,
}
impl<'a> FastFieldDataAccess for &'a [u64] {
fn get(&self, doc: u32) -> u64 {
self[doc as usize]
}
}
impl<'a> FastFieldDataAccess for &'a Vec<u64> {
fn get(&self, doc: u32) -> u64 {
self[doc as usize]
}
}
impl FastFieldDataAccess for Vec<u64> {
fn get(&self, doc: u32) -> u64 {
self[doc as usize]
}
}
#[cfg(test)]
mod tests {
use crate::{
bitpacked::{BitpackedFastFieldReader, BitpackedFastFieldSerializer},
linearinterpol::{LinearInterpolFastFieldReader, LinearInterpolFastFieldSerializer},
multilinearinterpol::{
MultiLinearInterpolFastFieldReader, MultiLinearInterpolFastFieldSerializer,
},
};
pub fn create_and_validate<S: FastFieldCodecSerializer, R: FastFieldCodecReader>(
data: &[u64],
name: &str,
) -> (f32, f32) {
if !S::is_applicable(&data, crate::tests::stats_from_vec(&data)) {
return (f32::MAX, 0.0);
}
let estimation = S::estimate(&data, crate::tests::stats_from_vec(&data));
let mut out = vec![];
S::serialize(
&mut out,
&data,
crate::tests::stats_from_vec(&data),
data.iter().cloned(),
data.iter().cloned(),
)
.unwrap();
let reader = R::open_from_bytes(&out).unwrap();
for (doc, orig_val) in data.iter().enumerate() {
let val = reader.get_u64(doc as u64, &out);
if val != *orig_val {
panic!(
"val {:?} does not match orig_val {:?}, in data set {}, data {:?}",
val, orig_val, name, data
);
}
}
let actual_compression = data.len() as f32 / out.len() as f32;
return (estimation, actual_compression);
}
pub fn get_codec_test_data_sets() -> Vec<(Vec<u64>, &'static str)> {
let mut data_and_names = vec![];
let data = (10..=20_u64).collect::<Vec<_>>();
data_and_names.push((data, "simple monotonically increasing"));
data_and_names.push((
vec![5, 6, 7, 8, 9, 10, 99, 100],
"offset in linear interpol",
));
data_and_names.push((vec![5, 50, 3, 13, 1, 1000, 35], "rand small"));
data_and_names.push((vec![10], "single value"));
data_and_names
}
fn test_codec<S: FastFieldCodecSerializer, R: FastFieldCodecReader>() {
let codec_name = S::NAME;
for (data, data_set_name) in get_codec_test_data_sets() {
let (estimate, actual) =
crate::tests::create_and_validate::<S, R>(&data, data_set_name);
let result = if estimate == f32::MAX {
"Disabled".to_string()
} else {
format!("Estimate {:?} Actual {:?} ", estimate, actual)
};
println!(
"Codec {}, DataSet {}, {}",
codec_name, data_set_name, result
);
}
}
#[test]
fn test_codec_bitpacking() {
test_codec::<BitpackedFastFieldSerializer, BitpackedFastFieldReader>();
}
#[test]
fn test_codec_interpolation() {
test_codec::<LinearInterpolFastFieldSerializer, LinearInterpolFastFieldReader>();
}
#[test]
fn test_codec_multi_interpolation() {
test_codec::<MultiLinearInterpolFastFieldSerializer, MultiLinearInterpolFastFieldReader>();
}
use super::*;
pub fn stats_from_vec(data: &[u64]) -> FastFieldStats {
let min_value = data.iter().cloned().min().unwrap_or(0);
let max_value = data.iter().cloned().max().unwrap_or(0);
FastFieldStats {
min_value,
max_value,
num_vals: data.len() as u64,
}
}
#[test]
fn estimation_good_interpolation_case() {
let data = (10..=20000_u64).collect::<Vec<_>>();
let linear_interpol_estimation =
LinearInterpolFastFieldSerializer::estimate(&data, stats_from_vec(&data));
assert_le!(linear_interpol_estimation, 0.01);
let multi_linear_interpol_estimation =
MultiLinearInterpolFastFieldSerializer::estimate(&data, stats_from_vec(&data));
assert_le!(multi_linear_interpol_estimation, 0.2);
assert_le!(linear_interpol_estimation, multi_linear_interpol_estimation);
let bitpacked_estimation =
BitpackedFastFieldSerializer::estimate(&data, stats_from_vec(&data));
assert_le!(linear_interpol_estimation, bitpacked_estimation);
}
#[test]
fn estimation_test_bad_interpolation_case() {
let data = vec![200, 10, 10, 10, 10, 1000, 20];
let linear_interpol_estimation =
LinearInterpolFastFieldSerializer::estimate(&data, stats_from_vec(&data));
assert_le!(linear_interpol_estimation, 0.32);
let bitpacked_estimation =
BitpackedFastFieldSerializer::estimate(&data, stats_from_vec(&data));
assert_le!(bitpacked_estimation, linear_interpol_estimation);
}
#[test]
fn estimation_test_bad_interpolation_case_monotonically_increasing() {
let mut data = (200..=20000_u64).collect::<Vec<_>>();
data.push(1_000_000);
// in this case the linear interpolation can't in fact be worse than bitpacking,
// but the estimator adds some threshold, which leads to a worse estimate
let linear_interpol_estimation =
LinearInterpolFastFieldSerializer::estimate(&data, stats_from_vec(&data));
assert_le!(linear_interpol_estimation, 0.35);
let bitpacked_estimation =
BitpackedFastFieldSerializer::estimate(&data, stats_from_vec(&data));
assert_le!(bitpacked_estimation, 0.32);
assert_le!(bitpacked_estimation, linear_interpol_estimation);
}
}

View File

@@ -0,0 +1,297 @@
use crate::FastFieldCodecReader;
use crate::FastFieldCodecSerializer;
use crate::FastFieldDataAccess;
use crate::FastFieldStats;
use std::io::{self, Read, Write};
use std::ops::Sub;
use tantivy_bitpacker::compute_num_bits;
use tantivy_bitpacker::BitPacker;
use common::BinarySerializable;
use common::FixedSize;
use tantivy_bitpacker::BitUnpacker;
/// Depending on the field type, a different
/// fast field is required.
#[derive(Clone)]
pub struct LinearInterpolFastFieldReader {
bit_unpacker: BitUnpacker,
pub footer: LinearInterpolFooter,
pub slope: f32,
}
#[derive(Clone, Debug)]
pub struct LinearInterpolFooter {
pub relative_max_value: u64,
pub offset: u64,
pub first_val: u64,
pub last_val: u64,
pub num_vals: u64,
pub min_value: u64,
pub max_value: u64,
}
impl BinarySerializable for LinearInterpolFooter {
fn serialize<W: Write>(&self, write: &mut W) -> io::Result<()> {
self.relative_max_value.serialize(write)?;
self.offset.serialize(write)?;
self.first_val.serialize(write)?;
self.last_val.serialize(write)?;
self.num_vals.serialize(write)?;
self.min_value.serialize(write)?;
self.max_value.serialize(write)?;
Ok(())
}
fn deserialize<R: Read>(reader: &mut R) -> io::Result<LinearInterpolFooter> {
Ok(LinearInterpolFooter {
relative_max_value: u64::deserialize(reader)?,
offset: u64::deserialize(reader)?,
first_val: u64::deserialize(reader)?,
last_val: u64::deserialize(reader)?,
num_vals: u64::deserialize(reader)?,
min_value: u64::deserialize(reader)?,
max_value: u64::deserialize(reader)?,
})
}
}
impl FixedSize for LinearInterpolFooter {
const SIZE_IN_BYTES: usize = 56;
}
impl FastFieldCodecReader for LinearInterpolFastFieldReader {
/// Opens a fast field given a file.
fn open_from_bytes(bytes: &[u8]) -> io::Result<Self> {
let (_data, mut footer) = bytes.split_at(bytes.len() - LinearInterpolFooter::SIZE_IN_BYTES);
let footer = LinearInterpolFooter::deserialize(&mut footer)?;
let slope = get_slope(footer.first_val, footer.last_val, footer.num_vals);
let num_bits = compute_num_bits(footer.relative_max_value);
let bit_unpacker = BitUnpacker::new(num_bits);
Ok(LinearInterpolFastFieldReader {
bit_unpacker,
footer,
slope,
})
}
#[inline]
fn get_u64(&self, doc: u64, data: &[u8]) -> u64 {
let calculated_value = get_calculated_value(self.footer.first_val, doc, self.slope);
(calculated_value + self.bit_unpacker.get(doc, &data)) - self.footer.offset
}
#[inline]
fn min_value(&self) -> u64 {
self.footer.min_value
}
#[inline]
fn max_value(&self) -> u64 {
self.footer.max_value
}
}
/// Fastfield serializer, which tries to guess values by linear interpolation
/// and stores the difference bitpacked.
pub struct LinearInterpolFastFieldSerializer {}
#[inline]
fn get_slope(first_val: u64, last_val: u64, num_vals: u64) -> f32 {
if num_vals <= 1 {
return 0.0;
}
// We calculate the slope with f64 high precision and use the result in lower precision f32
// This is done in order to handle estimations for very large values like i64::MAX
((last_val as f64 - first_val as f64) / (num_vals as u64 - 1) as f64) as f32
}
#[inline]
fn get_calculated_value(first_val: u64, pos: u64, slope: f32) -> u64 {
first_val + (pos as f32 * slope) as u64
}
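// Worked example (illustrative, not part of the original source): for data
// [10, 20, 35] we get first_val = 10, last_val = 35, num_vals = 3, so
// slope = (35 - 10) / 2 = 12.5 and the predictions are 10, 22, 35. The only
// value below its prediction is 20 (predicted 22), so offset = 2 and
// relative_max_value = 2; every value is then stored as a 2-bit diff
// (compute_num_bits(2) = 2) plus the footer.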
impl FastFieldCodecSerializer for LinearInterpolFastFieldSerializer {
const NAME: &'static str = "LinearInterpol";
const ID: u8 = 2;
/// Creates a new fast field serializer.
fn serialize(
write: &mut impl Write,
fastfield_accessor: &impl FastFieldDataAccess,
stats: FastFieldStats,
data_iter: impl Iterator<Item = u64>,
data_iter1: impl Iterator<Item = u64>,
) -> io::Result<()> {
assert!(stats.min_value <= stats.max_value);
let first_val = fastfield_accessor.get(0);
let last_val = fastfield_accessor.get(stats.num_vals as u32 - 1);
let slope = get_slope(first_val, last_val, stats.num_vals);
// calculate offset to ensure all values are positive
let mut offset = 0;
let mut rel_positive_max = 0;
for (pos, actual_value) in data_iter1.enumerate() {
let calculated_value = get_calculated_value(first_val, pos as u64, slope);
if calculated_value > actual_value {
// negative value, we need to apply an offset
// we ignore negative values in the max value calculation, because negative values
// will be offset to 0
offset = offset.max(calculated_value - actual_value);
} else {
// positive value, no offset required
rel_positive_max = rel_positive_max.max(actual_value - calculated_value);
}
}
// rel_positive_max will be adjusted by offset
let relative_max_value = rel_positive_max + offset;
let num_bits = compute_num_bits(relative_max_value);
let mut bit_packer = BitPacker::new();
for (pos, val) in data_iter.enumerate() {
let calculated_value = get_calculated_value(first_val, pos as u64, slope);
let diff = (val + offset) - calculated_value;
bit_packer.write(diff, num_bits, write)?;
}
bit_packer.close(write)?;
let footer = LinearInterpolFooter {
relative_max_value,
offset,
first_val,
last_val,
num_vals: stats.num_vals,
min_value: stats.min_value,
max_value: stats.max_value,
};
footer.serialize(write)?;
Ok(())
}
fn is_applicable(
_fastfield_accessor: &impl FastFieldDataAccess,
stats: FastFieldStats,
) -> bool {
if stats.num_vals < 3 {
return false; //disable compressor for this case
}
// On serialisation the offset is added to the actual value.
// We need to make sure this won't run into overflow calculation issues.
// For this we take the maximum theoretical offset and add this to the max value.
// If this doesn't overflow, the algorithm should be fine
let theorethical_maximum_offset = stats.max_value - stats.min_value;
if stats
.max_value
.checked_add(theorethical_maximum_offset)
.is_none()
{
return false;
}
true
}
/// Estimation for linear interpolation is hard because you don't know
/// where the local maxima of the deviation from the calculated value are, and
/// the offset to shift all values to >= 0 is also unknown.
fn estimate(fastfield_accessor: &impl FastFieldDataAccess, stats: FastFieldStats) -> f32 {
let first_val = fastfield_accessor.get(0);
let last_val = fastfield_accessor.get(stats.num_vals as u32 - 1);
let slope = get_slope(first_val, last_val, stats.num_vals);
// let's sample at 0%, 5%, 10% .. 95%
let num_vals = stats.num_vals as f32 / 100.0;
let sample_positions = (0..20)
.map(|pos| (num_vals * pos as f32 * 5.0) as usize)
.collect::<Vec<_>>();
let max_distance = sample_positions
.iter()
.map(|pos| {
let calculated_value = get_calculated_value(first_val, *pos as u64, slope);
let actual_value = fastfield_accessor.get(*pos as u32);
distance(calculated_value, actual_value)
})
.max()
.unwrap_or(0);
// The theory is that we don't have the actual max_distance, but we are close within a 50%
// threshold.
// It is multiplied by 2 because in a worst-case scenario the line would be as much above as
// below, so the offset would equal max_distance.
let relative_max_value = (max_distance as f32 * 1.5) * 2.0;
let num_bits = compute_num_bits(relative_max_value as u64) as u64 * stats.num_vals as u64
+ LinearInterpolFooter::SIZE_IN_BYTES as u64;
let num_bits_uncompressed = 64 * stats.num_vals;
num_bits as f32 / num_bits_uncompressed as f32
}
}
#[inline]
fn distance<T: Sub<Output = T> + Ord>(x: T, y: T) -> T {
if x < y {
y - x
} else {
x - y
}
}
#[cfg(test)]
mod tests {
use super::*;
use crate::tests::get_codec_test_data_sets;
fn create_and_validate(data: &[u64], name: &str) {
crate::tests::create_and_validate::<
LinearInterpolFastFieldSerializer,
LinearInterpolFastFieldReader,
>(&data, name);
}
#[test]
fn test_with_codec_data_sets() {
let data_sets = get_codec_test_data_sets();
for (mut data, name) in data_sets {
create_and_validate(&data, name);
data.reverse();
create_and_validate(&data, name);
}
}
#[test]
fn linear_interpol_fast_field_test_large_amplitude() {
let data = vec![
i64::MAX as u64 / 2,
i64::MAX as u64 / 3,
i64::MAX as u64 / 2,
];
create_and_validate(&data, "large amplitude");
}
#[test]
fn linear_interpol_fast_concave_data() {
let data = vec![0, 1, 2, 5, 8, 10, 20, 50];
create_and_validate(&data, "concave data");
}
#[test]
fn linear_interpol_fast_convex_data() {
let data = vec![0, 40, 60, 70, 75, 77];
create_and_validate(&data, "convex data");
}
#[test]
fn linear_interpol_fast_field_test_simple() {
let data = (10..=20_u64).collect::<Vec<_>>();
create_and_validate(&data, "simple monotonically");
}
#[test]
fn linear_interpol_fast_field_rand() {
for _ in 0..5000 {
let mut data = (0..50 as usize)
.map(|_| rand::random::<u64>())
.collect::<Vec<_>>();
create_and_validate(&data, "random");
data.reverse();
create_and_validate(&data, "random");
}
}
}

View File

@@ -0,0 +1,126 @@
#[macro_use]
extern crate prettytable;
use fastfield_codecs::{
linearinterpol::LinearInterpolFastFieldSerializer,
multilinearinterpol::MultiLinearInterpolFastFieldSerializer, FastFieldCodecSerializer,
FastFieldStats,
};
use prettytable::{Cell, Row, Table};
fn main() {
let mut table = Table::new();
// Add the header row
table.add_row(row!["", "Compression Ratio", "Compression Estimation"]);
for (data, data_set_name) in get_codec_test_data_sets() {
let mut results = vec![];
let res = serialize_with_codec::<LinearInterpolFastFieldSerializer>(&data);
results.push(res);
let res = serialize_with_codec::<MultiLinearInterpolFastFieldSerializer>(&data);
results.push(res);
let res = serialize_with_codec::<fastfield_codecs::bitpacked::BitpackedFastFieldSerializer>(
&data,
);
results.push(res);
//let best_estimation_codec = results
//.iter()
//.min_by(|res1, res2| res1.partial_cmp(&res2).unwrap())
//.unwrap();
let best_compression_ratio_codec = results
.iter()
.min_by(|res1, res2| res1.partial_cmp(&res2).unwrap())
.cloned()
.unwrap();
table.add_row(Row::new(vec![Cell::new(data_set_name).style_spec("Bbb")]));
for (is_applicable, est, comp, name) in results {
let (est_cell, ratio_cell) = if !is_applicable {
("Codec Disabled".to_string(), "".to_string())
} else {
(est.to_string(), comp.to_string())
};
let style = if comp == best_compression_ratio_codec.1 {
"Fb"
} else {
""
};
table.add_row(Row::new(vec![
Cell::new(&name.to_string()).style_spec("bFg"),
Cell::new(&ratio_cell).style_spec(style),
Cell::new(&est_cell).style_spec(""),
]));
}
}
table.printstd();
}
pub fn get_codec_test_data_sets() -> Vec<(Vec<u64>, &'static str)> {
let mut data_and_names = vec![];
let data = (1000..=200_000_u64).collect::<Vec<_>>();
data_and_names.push((data, "Autoincrement"));
let mut current_cumulative = 0;
let data = (1..=200_000_u64)
.map(|num| {
let num = (num as f32 + num as f32).log10() as u64;
current_cumulative += num;
current_cumulative
})
.collect::<Vec<_>>();
//let data = (1..=200000_u64).map(|num| num + num).collect::<Vec<_>>();
data_and_names.push((data, "Monotonically increasing concave"));
let mut current_cumulative = 0;
let data = (1..=200_000_u64)
.map(|num| {
let num = (200_000.0 - num as f32).log10() as u64;
current_cumulative += num;
current_cumulative
})
.collect::<Vec<_>>();
data_and_names.push((data, "Monotonically increasing convex"));
let data = (1000..=200_000_u64)
.map(|num| num + rand::random::<u8>() as u64)
.collect::<Vec<_>>();
data_and_names.push((data, "Almost monotonically increasing"));
data_and_names
}
pub fn serialize_with_codec<S: FastFieldCodecSerializer>(
data: &[u64],
) -> (bool, f32, f32, &'static str) {
let is_applicable = S::is_applicable(&data, stats_from_vec(&data));
if !is_applicable {
return (false, 0.0, 0.0, S::NAME);
}
let estimation = S::estimate(&data, stats_from_vec(&data));
let mut out = vec![];
S::serialize(
&mut out,
&data,
stats_from_vec(&data),
data.iter().cloned(),
data.iter().cloned(),
)
.unwrap();
let actual_compression = out.len() as f32 / (data.len() * 8) as f32;
return (true, estimation, actual_compression, S::NAME);
}
pub fn stats_from_vec(data: &[u64]) -> FastFieldStats {
let min_value = data.iter().cloned().min().unwrap_or(0);
let max_value = data.iter().cloned().max().unwrap_or(0);
FastFieldStats {
min_value,
max_value,
num_vals: data.len() as u64,
}
}

View File

@@ -0,0 +1,410 @@
use crate::FastFieldCodecReader;
use crate::FastFieldCodecSerializer;
use crate::FastFieldDataAccess;
use crate::FastFieldStats;
use common::CountingWriter;
use std::io::{self, Read, Write};
use std::ops::Sub;
use tantivy_bitpacker::compute_num_bits;
use tantivy_bitpacker::BitPacker;
use common::BinarySerializable;
use common::DeserializeFrom;
use tantivy_bitpacker::BitUnpacker;
const CHUNK_SIZE: u64 = 512;
/// Depending on the field type, a different
/// fast field is required.
#[derive(Clone)]
pub struct MultiLinearInterpolFastFieldReader {
pub footer: MultiLinearInterpolFooter,
}
#[derive(Clone, Debug, Default)]
struct Function {
// The offset in the data is required, because we have different bit_widths per block
data_start_offset: u64,
// start_pos in the block will be CHUNK_SIZE * BLOCK_NUM
start_pos: u64,
// only used during serialization, 0 after deserialization
end_pos: u64,
// only used during serialization, 0 after deserialization
value_start_pos: u64,
// only used during serialization, 0 after deserialization
value_end_pos: u64,
slope: f32,
// The offset so that all values are positive when writing them
positive_val_offset: u64,
num_bits: u8,
bit_unpacker: BitUnpacker,
}
impl Function {
fn calc_slope(&mut self) {
let num_vals = self.end_pos - self.start_pos;
self.slope = get_slope(self.value_start_pos, self.value_end_pos, num_vals);
}
// split the interpolation into two functions, change self and return the second split
fn split(&mut self, split_pos: u64, split_pos_value: u64) -> Function {
let mut new_function = Function {
start_pos: split_pos,
end_pos: self.end_pos,
value_start_pos: split_pos_value,
value_end_pos: self.value_end_pos,
..Default::default()
};
new_function.calc_slope();
self.end_pos = split_pos;
self.value_end_pos = split_pos_value;
self.calc_slope();
new_function
}
}
impl BinarySerializable for Function {
fn serialize<W: Write>(&self, write: &mut W) -> io::Result<()> {
self.data_start_offset.serialize(write)?;
self.value_start_pos.serialize(write)?;
self.positive_val_offset.serialize(write)?;
self.slope.serialize(write)?;
self.num_bits.serialize(write)?;
Ok(())
}
fn deserialize<R: Read>(reader: &mut R) -> io::Result<Function> {
let data_start_offset = u64::deserialize(reader)?;
let value_start_pos = u64::deserialize(reader)?;
let offset = u64::deserialize(reader)?;
let slope = f32::deserialize(reader)?;
let num_bits = u8::deserialize(reader)?;
let interpolation = Function {
data_start_offset,
value_start_pos,
positive_val_offset: offset,
num_bits,
bit_unpacker: BitUnpacker::new(num_bits),
slope,
..Default::default()
};
Ok(interpolation)
}
}
#[derive(Clone, Debug)]
pub struct MultiLinearInterpolFooter {
pub num_vals: u64,
pub min_value: u64,
pub max_value: u64,
interpolations: Vec<Function>,
}
impl BinarySerializable for MultiLinearInterpolFooter {
fn serialize<W: Write>(&self, write: &mut W) -> io::Result<()> {
let mut out = vec![];
self.num_vals.serialize(&mut out)?;
self.min_value.serialize(&mut out)?;
self.max_value.serialize(&mut out)?;
self.interpolations.serialize(&mut out)?;
write.write_all(&out)?;
(out.len() as u32).serialize(write)?;
Ok(())
}
fn deserialize<R: Read>(reader: &mut R) -> io::Result<MultiLinearInterpolFooter> {
let mut footer = MultiLinearInterpolFooter {
num_vals: u64::deserialize(reader)?,
min_value: u64::deserialize(reader)?,
max_value: u64::deserialize(reader)?,
interpolations: Vec::<Function>::deserialize(reader)?,
};
for (num, interpol) in footer.interpolations.iter_mut().enumerate() {
interpol.start_pos = CHUNK_SIZE * num as u64;
}
Ok(footer)
}
}
#[inline]
fn get_interpolation_position(doc: u64) -> usize {
let index = doc / CHUNK_SIZE;
index as usize
}
#[inline]
fn get_interpolation_function(doc: u64, interpolations: &[Function]) -> &Function {
&interpolations[get_interpolation_position(doc)]
}
impl FastFieldCodecReader for MultiLinearInterpolFastFieldReader {
/// Opens a fast field given its raw bytes.
fn open_from_bytes(bytes: &[u8]) -> io::Result<Self> {
let footer_len: u32 = (&bytes[bytes.len() - 4..]).deserialize()?;
let (_data, mut footer) = bytes.split_at(bytes.len() - (4 + footer_len) as usize);
let footer = MultiLinearInterpolFooter::deserialize(&mut footer)?;
Ok(MultiLinearInterpolFastFieldReader { footer })
}
#[inline]
fn get_u64(&self, doc: u64, data: &[u8]) -> u64 {
let interpolation = get_interpolation_function(doc, &self.footer.interpolations);
let doc = doc - interpolation.start_pos;
let calculated_value =
get_calculated_value(interpolation.value_start_pos, doc, interpolation.slope);
let diff = interpolation
.bit_unpacker
.get(doc, &data[interpolation.data_start_offset as usize..]);
(calculated_value + diff) - interpolation.positive_val_offset
}
#[inline]
fn min_value(&self) -> u64 {
self.footer.min_value
}
#[inline]
fn max_value(&self) -> u64 {
self.footer.max_value
}
}
#[inline]
fn get_slope(first_val: u64, last_val: u64, num_vals: u64) -> f32 {
((last_val as f64 - first_val as f64) / (num_vals as u64 - 1) as f64) as f32
}
#[inline]
fn get_calculated_value(first_val: u64, pos: u64, slope: f32) -> u64 {
(first_val as i64 + (pos as f32 * slope) as i64) as u64
}
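// Editor's sketch (not part of the diff): a worked example of the two helpers
// above, which drive both serialization and `get_u64`. For the sequence
// 100, 200, ..., 1_100 (11 values) the slope is exactly 100, so the
// interpolation is exact and every stored diff bitpacks to 0 bits.
#[cfg(test)]
#[test]
fn sketch_interpolation_helpers() {
let slope = get_slope(100, 1_100, 11); // (1100 - 100) / (11 - 1) = 100.0
assert_eq!(slope, 100.0);
assert_eq!(get_calculated_value(100, 5, slope), 600); // 100 + 5 * 100
}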
/// Same as LinearInterpolFastFieldSerializer, but working on chunks of CHUNK_SIZE elements.
pub struct MultiLinearInterpolFastFieldSerializer {}
impl FastFieldCodecSerializer for MultiLinearInterpolFastFieldSerializer {
const NAME: &'static str = "MultiLinearInterpol";
const ID: u8 = 3;
/// Serializes the data with the multi linear interpolation codec.
fn serialize(
write: &mut impl Write,
fastfield_accessor: &impl FastFieldDataAccess,
stats: FastFieldStats,
data_iter: impl Iterator<Item = u64>,
_data_iter1: impl Iterator<Item = u64>,
) -> io::Result<()> {
assert!(stats.min_value <= stats.max_value);
let first_val = fastfield_accessor.get(0);
let last_val = fastfield_accessor.get(stats.num_vals as u32 - 1);
let mut first_function = Function {
end_pos: stats.num_vals,
value_start_pos: first_val,
value_end_pos: last_val,
..Default::default()
};
first_function.calc_slope();
let mut interpolations = vec![first_function];
// Since we potentially apply multiple passes over the data, the data is cached.
// Multiple iterations can be expensive (a merge with index sorting can add a lot
// of overhead per iteration).
let data = data_iter.collect::<Vec<_>>();
// Split the data into chunks of CHUNK_SIZE.
for data_pos in (0..data.len() as u64).step_by(CHUNK_SIZE as usize).skip(1) {
let new_fun = {
let current_interpolation = interpolations.last_mut().unwrap();
current_interpolation.split(data_pos, data[data_pos as usize])
};
interpolations.push(new_fun);
}
// Calculate the offset and max (-> num_bits) for each function.
for interpolation in &mut interpolations {
let mut offset = 0;
let mut rel_positive_max = 0;
for (pos, actual_value) in data
[interpolation.start_pos as usize..interpolation.end_pos as usize]
.iter()
.cloned()
.enumerate()
{
let calculated_value = get_calculated_value(
interpolation.value_start_pos,
pos as u64,
interpolation.slope,
);
if calculated_value > actual_value {
// The calculated value overshoots the actual value, i.e. the diff would be
// negative, so we need an offset.
// Such values are ignored in the max calculation, because the offset will
// shift them to 0.
offset = offset.max(calculated_value - actual_value);
} else {
// The calculated value undershoots; no offset required.
rel_positive_max = rel_positive_max.max(actual_value - calculated_value);
}
}
interpolation.positive_val_offset = offset;
interpolation.num_bits = compute_num_bits(rel_positive_max + offset);
}
let mut bit_packer = BitPacker::new();
let write = &mut CountingWriter::wrap(write);
for interpolation in &mut interpolations {
interpolation.data_start_offset = write.written_bytes();
let num_bits = interpolation.num_bits;
for (pos, actual_value) in data
[interpolation.start_pos as usize..interpolation.end_pos as usize]
.iter()
.cloned()
.enumerate()
{
let calculated_value = get_calculated_value(
interpolation.value_start_pos,
pos as u64,
interpolation.slope,
);
let diff = (actual_value + interpolation.positive_val_offset) - calculated_value;
bit_packer.write(diff, num_bits, write)?;
}
bit_packer.flush(write)?;
}
bit_packer.close(write)?;
let footer = MultiLinearInterpolFooter {
num_vals: stats.num_vals,
min_value: stats.min_value,
max_value: stats.max_value,
interpolations,
};
footer.serialize(write)?;
Ok(())
}
fn is_applicable(
_fastfield_accessor: &impl FastFieldDataAccess,
stats: FastFieldStats,
) -> bool {
if stats.num_vals < 5_000 {
return false;
}
// On serialization the offset is added to the actual value.
// We need to make sure this won't run into overflow issues.
// For this we take the maximum theoretical offset and add it to the max value.
// If this doesn't overflow, the algorithm should be fine.
let theoretical_maximum_offset = stats.max_value - stats.min_value;
if stats
.max_value
.checked_add(theoretical_maximum_offset)
.is_none()
{
return false;
}
true
}
/// Estimation for linear interpolation is hard, because you don't know
/// where the local maxima of the deviation from the calculated value are,
/// and the offset is also unknown.
fn estimate(fastfield_accessor: &impl FastFieldDataAccess, stats: FastFieldStats) -> f32 {
let first_val_in_first_block = fastfield_accessor.get(0);
let last_elem_in_first_chunk = CHUNK_SIZE.min(stats.num_vals);
let last_val_in_first_block = fastfield_accessor.get(last_elem_in_first_chunk as u32 - 1);
let slope = get_slope(
first_val_in_first_block,
last_val_in_first_block,
stats.num_vals,
);
// Sample at 0%, 5%, 10% .. 95% of the first block.
let sample_positions = (0..20)
.map(|pos| (last_elem_in_first_chunk as f32 / 100.0 * pos as f32 * 5.0) as usize)
.collect::<Vec<_>>();
let max_distance = sample_positions
.iter()
.map(|pos| {
let calculated_value =
get_calculated_value(first_val_in_first_block, *pos as u64, slope);
let actual_value = fastfield_accessor.get(*pos as u32);
distance(calculated_value, actual_value)
})
.max()
.unwrap();
// Estimate one block and extrapolate the cost to all blocks.
// The assumption is that we don't have the actual max_distance, but that we
// are within a 50% threshold of it (hence the 1.5 factor).
// It is multiplied by 2 because in the worst case the line deviates as much
// above as below, so the offset would equal max_distance.
let relative_max_value = (max_distance as f32 * 1.5) * 2.0;
let num_bits = compute_num_bits(relative_max_value as u64) as u64 * stats.num_vals
// function metadata per block
+ 29 * (stats.num_vals / CHUNK_SIZE);
let num_bits_uncompressed = 64 * stats.num_vals;
num_bits as f32 / num_bits_uncompressed as f32
}
}
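// Editor's sketch (not part of the diff): the overflow guard in `is_applicable`
// in numbers. With min_value = 0 and max_value = u64::MAX - 10, the theoretical
// maximum offset equals max_value itself, so max_value + offset would wrap and
// the codec declines.
#[cfg(test)]
#[test]
fn sketch_overflow_guard() {
let (min_value, max_value) = (0u64, u64::MAX - 10);
let theoretical_maximum_offset = max_value - min_value;
assert!(max_value.checked_add(theoretical_maximum_offset).is_none());
}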
fn distance<T: Sub<Output = T> + Ord>(x: T, y: T) -> T {
if x < y {
y - x
} else {
x - y
}
}
#[cfg(test)]
mod tests {
use super::*;
use crate::tests::get_codec_test_data_sets;
fn create_and_validate(data: &[u64], name: &str) {
crate::tests::create_and_validate::<
MultiLinearInterpolFastFieldSerializer,
MultiLinearInterpolFastFieldReader,
>(&data, name);
}
#[test]
fn test_with_codec_data_sets() {
let data_sets = get_codec_test_data_sets();
for (mut data, name) in data_sets {
create_and_validate(&data, name);
data.reverse();
create_and_validate(&data, name);
}
}
#[test]
fn test_simple() {
let data = (10..=20_u64).collect::<Vec<_>>();
create_and_validate(&data, "simple monotonically");
}
#[test]
fn border_cases_1() {
let data = (0..1024).collect::<Vec<_>>();
create_and_validate(&data, "border case");
}
#[test]
fn border_case_2() {
let data = (0..1025).collect::<Vec<_>>();
create_and_validate(&data, "border case");
}
#[test]
fn rand() {
for _ in 0..10 {
let mut data = (5_000..20_000)
.map(|_| rand::random::<u64>())
.collect::<Vec<_>>();
create_and_validate(&data, "random");
data.reverse();
create_and_validate(&data, "random");
}
}
}
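// Editor's note (not part of the diff): the on-disk layout implied by
// `serialize` and `open_from_bytes` above, from the start of the codec data:
//
// [ bitpacked diffs, one run per CHUNK_SIZE block | footer | footer_len: u32 ]
//
// The footer stores num_vals, min/max and one serialized `Function` per block
// (8 + 8 + 8 + 4 + 1 = 29 bytes each, matching the constant in `estimate`);
// `start_pos` is recomputed as CHUNK_SIZE * block_index on deserialization.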

View File

@@ -1,6 +1,6 @@
[package]
name = "tantivy-query-grammar"
version = "0.14.0"
version = "0.15.0"
authors = ["Paul Masurel <paul.masurel@gmail.com>"]
license = "MIT"
categories = ["database-implementations", "data-structures"]

View File

@@ -539,10 +539,10 @@ mod tests {
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_for_tests().unwrap();
index_writer.add_document(doc!(
facet_field => Facet::from_text(&"/subjects/A/a"),
facet_field => Facet::from_text(&"/subjects/B/a"),
facet_field => Facet::from_text(&"/subjects/A/b"),
facet_field => Facet::from_text(&"/subjects/B/b"),
facet_field => Facet::from_text(&"/subjects/A/a").unwrap(),
facet_field => Facet::from_text(&"/subjects/B/a").unwrap(),
facet_field => Facet::from_text(&"/subjects/A/b").unwrap(),
facet_field => Facet::from_text(&"/subjects/B/b").unwrap(),
));
index_writer.commit().unwrap();
let reader = index.reader().unwrap();
@@ -563,16 +563,16 @@ mod tests {
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(doc!(
facet_field => Facet::from_text(&"/A/A"),
facet_field => Facet::from_text(&"/A/A").unwrap(),
));
index_writer.add_document(doc!(
facet_field => Facet::from_text(&"/A/B"),
facet_field => Facet::from_text(&"/A/B").unwrap(),
));
index_writer.add_document(doc!(
facet_field => Facet::from_text(&"/A/C/A"),
facet_field => Facet::from_text(&"/A/C/A").unwrap(),
));
index_writer.add_document(doc!(
facet_field => Facet::from_text(&"/D/C/A"),
facet_field => Facet::from_text(&"/D/C/A").unwrap(),
));
index_writer.commit()?;
let reader = index.reader()?;
@@ -580,7 +580,7 @@ mod tests {
assert_eq!(searcher.num_docs(), 4);
let count_facet = |facet_str: &str| {
let term = Term::from_facet(facet_field, &Facet::from_text(facet_str));
let term = Term::from_facet(facet_field, &Facet::from_text(facet_str).unwrap());
searcher
.search(&TermQuery::new(term, IndexRecordOption::Basic), &Count)
.unwrap()

View File

@@ -12,7 +12,7 @@
use std::marker::PhantomData;
use crate::collector::{Collector, SegmentCollector};
use crate::fastfield::{FastFieldReader, FastValue};
use crate::fastfield::{DynamicFastFieldReader, FastFieldReader, FastValue};
use crate::schema::Field;
use crate::{Score, SegmentReader, TantivyError};
@@ -155,7 +155,7 @@ where
TPredicate: 'static,
TPredicateValue: FastValue,
{
fast_field_reader: FastFieldReader<TPredicateValue>,
fast_field_reader: DynamicFastFieldReader<TPredicateValue>,
segment_collector: TSegmentCollector,
predicate: TPredicate,
t_predicate_value: PhantomData<TPredicateValue>,

View File

@@ -1,5 +1,5 @@
use crate::collector::{Collector, SegmentCollector};
use crate::fastfield::{FastFieldReader, FastValue};
use crate::fastfield::{DynamicFastFieldReader, FastFieldReader, FastValue};
use crate::schema::{Field, Type};
use crate::{DocId, Score};
use fastdivide::DividerU64;
@@ -84,7 +84,7 @@ impl HistogramComputer {
}
pub struct SegmentHistogramCollector {
histogram_computer: HistogramComputer,
ff_reader: FastFieldReader<u64>,
ff_reader: DynamicFastFieldReader<u64>,
}
impl SegmentCollector for SegmentHistogramCollector {

View File

@@ -1,6 +1,7 @@
use super::*;
use crate::core::SegmentReader;
use crate::fastfield::BytesFastFieldReader;
use crate::fastfield::DynamicFastFieldReader;
use crate::fastfield::FastFieldReader;
use crate::schema::Field;
use crate::DocId;
@@ -162,7 +163,7 @@ pub struct FastFieldTestCollector {
pub struct FastFieldSegmentCollector {
vals: Vec<u64>,
reader: FastFieldReader<u64>,
reader: DynamicFastFieldReader<u64>,
}
impl FastFieldTestCollector {

View File

@@ -4,7 +4,7 @@ use crate::collector::tweak_score_top_collector::TweakedScoreTopCollector;
use crate::collector::{
CustomScorer, CustomSegmentScorer, ScoreSegmentTweaker, ScoreTweaker, SegmentCollector,
};
use crate::fastfield::FastFieldReader;
use crate::fastfield::{DynamicFastFieldReader, FastFieldReader};
use crate::query::Weight;
use crate::schema::Field;
use crate::DocAddress;
@@ -129,7 +129,7 @@ impl fmt::Debug for TopDocs {
}
struct ScorerByFastFieldReader {
ff_reader: FastFieldReader<u64>,
ff_reader: DynamicFastFieldReader<u64>,
}
impl CustomSegmentScorer<u64> for ScorerByFastFieldReader {
@@ -151,7 +151,7 @@ impl CustomScorer<u64> for ScorerByField {
// mapping is monotonic, so it is sufficient to compute our top-K docs.
//
// The conversion will then happen only on the top-K docs.
let ff_reader: FastFieldReader<u64> = segment_reader
let ff_reader = segment_reader
.fast_fields()
.typed_fast_field_reader(self.field)?;
Ok(ScorerByFastFieldReader { ff_reader })
@@ -401,6 +401,7 @@ impl TopDocs {
/// # use tantivy::query::QueryParser;
/// use tantivy::SegmentReader;
/// use tantivy::collector::TopDocs;
/// use tantivy::fastfield::FastFieldReader;
/// use tantivy::schema::Field;
///
/// fn create_schema() -> Schema {
@@ -508,6 +509,7 @@ impl TopDocs {
/// use tantivy::SegmentReader;
/// use tantivy::collector::TopDocs;
/// use tantivy::schema::Field;
/// use tantivy::fastfield::FastFieldReader;
///
/// # fn create_schema() -> Schema {
/// # let mut schema_builder = Schema::builder();

View File

@@ -1,18 +1,15 @@
mod bitset;
mod composite_file;
mod counting_writer;
mod serialize;
mod vint;
pub use self::bitset::BitSet;
pub(crate) use self::bitset::TinySet;
pub(crate) use self::composite_file::{CompositeFile, CompositeWrite};
pub use self::counting_writer::CountingWriter;
pub use self::serialize::{BinarySerializable, FixedSize};
pub use self::vint::{
pub use byteorder::LittleEndian as Endianness;
pub use common::CountingWriter;
pub use common::{
read_u32_vint, read_u32_vint_no_advance, serialize_vint_u32, write_u32_vint, VInt,
};
pub use byteorder::LittleEndian as Endianness;
pub use common::{BinarySerializable, DeserializeFrom, FixedSize};
/// Segment's max doc must be `< MAX_DOC_LIMIT`.
///
@@ -103,8 +100,8 @@ pub fn u64_to_f64(val: u64) -> f64 {
#[cfg(test)]
pub(crate) mod test {
pub use super::serialize::test::fixed_size_test;
use super::{f64_to_u64, i64_to_u64, u64_to_f64, u64_to_i64};
use common::{BinarySerializable, FixedSize};
use proptest::prelude::*;
use std::f64;
use tantivy_bitpacker::compute_num_bits;
@@ -118,6 +115,12 @@ pub(crate) mod test {
assert_eq!(u64_to_f64(f64_to_u64(val)), val);
}
pub fn fixed_size_test<O: BinarySerializable + FixedSize + Default>() {
let mut buffer = Vec::new();
O::default().serialize(&mut buffer).unwrap();
assert_eq!(buffer.len(), O::SIZE_IN_BYTES);
}
proptest! {
#[test]
fn test_f64_converter_monotonicity_proptest((left, right) in (proptest::num::f64::NORMAL, proptest::num::f64::NORMAL)) {

View File

@@ -76,7 +76,7 @@ fn load_metas(
/// );
///
/// let schema = schema_builder.build();
/// let settings = IndexSettings{sort_by_field: Some(IndexSortByField{field:"number".to_string(), order:Order::Asc})};
/// let settings = IndexSettings{sort_by_field: Some(IndexSortByField{field:"number".to_string(), order:Order::Asc}), ..Default::default()};
/// let index = Index::builder().schema(schema).settings(settings).create_in_ram();
///
/// ```
@@ -173,7 +173,7 @@ impl IndexBuilder {
&directory,
)?;
let mut metas = IndexMeta::with_schema(self.get_expect_schema()?);
metas.index_settings = self.index_settings.clone();
metas.index_settings = self.index_settings;
let index = Index::open_from_metas(directory, &metas, SegmentMetaInventory::default());
Ok(index)
}
@@ -460,6 +460,13 @@ impl Index {
pub fn settings(&self) -> &IndexSettings {
&self.settings
}
/// Mutable accessor to the index settings
///
pub fn settings_mut(&mut self) -> &mut IndexSettings {
&mut self.settings
}
/// Accessor to the index schema
///
/// The schema is actually cloned.

View File

@@ -1,7 +1,7 @@
use super::SegmentComponent;
use crate::core::SegmentId;
use crate::schema::Schema;
use crate::Opstamp;
use crate::{core::SegmentId, store::Compressor};
use census::{Inventory, TrackedObject};
use serde::{Deserialize, Serialize};
use std::path::PathBuf;
@@ -233,7 +233,11 @@ impl InnerSegmentMeta {
pub struct IndexSettings {
/// Sorts the documents by information
/// provided in `IndexSortByField`
#[serde(skip_serializing_if = "Option::is_none")]
pub sort_by_field: Option<IndexSortByField>,
/// The `Compressor` used to compress the doc store.
#[serde(default)]
pub docstore_compression: Compressor,
}
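// Editor's sketch (not part of the diff): constructing the extended settings
// explicitly. Compressor's default serializes as "lz4", per the meta.json test
// below.
#[cfg(test)]
#[test]
fn sketch_index_settings_with_compression() {
let settings = IndexSettings {
sort_by_field: None,
docstore_compression: Compressor::default(),
};
assert!(settings.sort_by_field.is_none());
}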
/// Settings to presort the documents in an index
///
@@ -255,6 +259,17 @@ pub enum Order {
/// Descending Order
Desc,
}
impl Order {
/// Returns true if the `Order` is ascending.
pub fn is_asc(&self) -> bool {
self == &Order::Asc
}
/// Returns true if the `Order` is descending.
pub fn is_desc(&self) -> bool {
self == &Order::Desc
}
}
/// Meta information about the `Index`.
///
/// This object is serialized on disk in the `meta.json` file.
@@ -369,6 +384,7 @@ mod tests {
field: "text".to_string(),
order: Order::Asc,
}),
..Default::default()
},
segments: Vec::new(),
schema,
@@ -378,7 +394,7 @@ mod tests {
let json = serde_json::ser::to_string(&index_metas).expect("serialization failed");
assert_eq!(
json,
r#"{"index_settings":{"sort_by_field":{"field":"text","order":"Asc"}},"segments":[],"schema":[{"name":"text","type":"text","options":{"indexing":{"record":"position","tokenizer":"default"},"stored":false}}],"opstamp":0}"#
r#"{"index_settings":{"sort_by_field":{"field":"text","order":"Asc"},"docstore_compression":"lz4"},"segments":[],"schema":[{"name":"text","type":"text","options":{"indexing":{"record":"position","tokenizer":"default"},"stored":false}}],"opstamp":0}"#
);
}
}

View File

@@ -147,6 +147,13 @@ impl FileSlice {
self.slice(from_offset..self.len())
}
/// Returns a slice from the end.
///
/// Equivalent to `.slice(self.len() - from_offset..self.len())`
pub fn slice_from_end(&self, from_offset: usize) -> FileSlice {
self.slice(self.len() - from_offset..self.len())
}
/// Like `.slice(...)` but enforcing only the `to`
/// boundary.
///

View File

@@ -1,69 +1,45 @@
use crate::common::{BinarySerializable, CountingWriter, FixedSize, HasLen, VInt};
use crate::directory::error::Incompatibility;
use crate::directory::FileSlice;
use crate::directory::{AntiCallToken, TerminatingWrite};
use crate::Version;
use crate::{
common::{BinarySerializable, CountingWriter, DeserializeFrom, FixedSize, HasLen},
directory::{AntiCallToken, TerminatingWrite},
Version, INDEX_FORMAT_VERSION,
};
use crc32fast::Hasher;
use serde::{Deserialize, Serialize};
use std::io;
use std::io::Write;
const FOOTER_MAX_LEN: usize = 10_000;
const FOOTER_MAX_LEN: u32 = 50_000;
/// The magic number of the footer, used to identify corruption
/// or an old version of the footer.
const FOOTER_MAGIC_NUMBER: u32 = 1337;
type CrcHashU32 = u32;
#[derive(Debug, Clone, PartialEq)]
/// A Footer is appended to every file
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct Footer {
pub version: Version,
pub meta: String,
pub versioned_footer: VersionedFooter,
}
/// Serialises the footer to a byte-array
/// - versioned_footer_len : 4 bytes
/// - versioned_footer: variable bytes
/// - meta_len: 4 bytes
/// - meta: variable bytes
/// - version_len: 4 bytes
/// - version json: variable bytes
impl BinarySerializable for Footer {
fn serialize<W: io::Write>(&self, writer: &mut W) -> io::Result<()> {
BinarySerializable::serialize(&self.versioned_footer, writer)?;
BinarySerializable::serialize(&self.meta, writer)?;
let version_string =
serde_json::to_string(&self.version).map_err(|_err| io::ErrorKind::InvalidInput)?;
BinarySerializable::serialize(&version_string, writer)?;
Ok(())
}
fn deserialize<R: io::Read>(reader: &mut R) -> io::Result<Self> {
let versioned_footer = VersionedFooter::deserialize(reader)?;
let meta = String::deserialize(reader)?;
let version_json = String::deserialize(reader)?;
let version = serde_json::from_str(&version_json)?;
Ok(Footer {
version,
meta,
versioned_footer,
})
}
pub crc: CrcHashU32,
}
impl Footer {
pub fn new(versioned_footer: VersionedFooter) -> Self {
pub fn new(crc: CrcHashU32) -> Self {
let version = crate::VERSION.clone();
let meta = version.to_string();
Footer {
version,
meta,
versioned_footer,
}
Footer { version, crc }
}
pub fn crc(&self) -> CrcHashU32 {
self.crc
}
pub fn append_footer<W: io::Write>(&self, mut write: &mut W) -> io::Result<()> {
let mut counting_write = CountingWriter::wrap(&mut write);
self.serialize(&mut counting_write)?;
let written_len = counting_write.written_bytes();
(written_len as u32).serialize(write)?;
counting_write.write_all(serde_json::to_string(&self)?.as_ref())?;
let footer_payload_len = counting_write.written_bytes();
BinarySerializable::serialize(&(footer_payload_len as u32), write)?;
BinarySerializable::serialize(&(FOOTER_MAGIC_NUMBER as u32), write)?;
Ok(())
}
@@ -77,12 +53,47 @@ impl Footer {
),
));
}
let (body_footer, footer_len_file) = file.split_from_end(u32::SIZE_IN_BYTES);
let mut footer_len_bytes = footer_len_file.read_bytes()?;
let footer_len = u32::deserialize(&mut footer_len_bytes)? as usize;
let (body, footer) = body_footer.split_from_end(footer_len);
let mut footer_bytes = footer.read_bytes()?;
let footer = Footer::deserialize(&mut footer_bytes)?;
let footer_metadata_len = <(u32, u32)>::SIZE_IN_BYTES;
let (footer_len, footer_magic_byte): (u32, u32) = file
.slice_from_end(footer_metadata_len)
.read_bytes()?
.as_ref()
.deserialize()?;
if footer_magic_byte != FOOTER_MAGIC_NUMBER {
return Err(io::Error::new(
io::ErrorKind::InvalidData,
"Footer magic byte mismatch. File corrupted or index was created using old an tantivy version which is not supported anymore. Please use tantivy 0.15 or above to recreate the index.",
));
}
if footer_len > FOOTER_MAX_LEN {
return Err(io::Error::new(
io::ErrorKind::InvalidData,
format!(
"Footer seems invalid as it suggests a footer len of {}. File is corrupted, \
or the index was created with a different & old version of tantivy.",
footer_len
),
));
}
let total_footer_size = footer_len as usize + footer_metadata_len;
if file.len() < total_footer_size {
return Err(io::Error::new(
io::ErrorKind::UnexpectedEof,
format!(
"File corrupted. The file is smaller than it's footer bytes (len={}).",
total_footer_size
),
));
}
let footer: Footer = serde_json::from_slice(&file.read_bytes_slice(
file.len() - total_footer_size..file.len() - footer_metadata_len as usize,
)?)?;
let body = file.slice_to(file.len() - total_footer_size);
Ok((footer, body))
}
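// Editor's note (not part of the diff): the trailing layout read back by
// `extract_footer` above, as written by `append_footer`:
//
// [ body | footer json | footer_payload_len: u32 LE | FOOTER_MAGIC_NUMBER: u32 LE ]
//
// The last 8 bytes are read first, the magic number and the length are
// validated, and only then is the json footer span deserialized.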
@@ -90,151 +101,16 @@ impl Footer {
/// Has to be called after `extract_footer` to make sure it's not accessing uninitialised memory
pub fn is_compatible(&self) -> Result<(), Incompatibility> {
let library_version = crate::version();
match &self.versioned_footer {
VersionedFooter::V1 {
crc32: _crc,
store_compression,
} => {
if &library_version.store_compression != store_compression {
return Err(Incompatibility::CompressionMismatch {
library_compression_format: library_version.store_compression.to_string(),
index_compression_format: store_compression.to_string(),
});
}
Ok(())
}
VersionedFooter::V2 {
crc32: _crc,
store_compression,
} => {
if &library_version.store_compression != store_compression {
return Err(Incompatibility::CompressionMismatch {
library_compression_format: library_version.store_compression.to_string(),
index_compression_format: store_compression.to_string(),
});
}
Ok(())
}
VersionedFooter::V3 {
crc32: _crc,
store_compression,
} => {
if &library_version.store_compression != store_compression {
return Err(Incompatibility::CompressionMismatch {
library_compression_format: library_version.store_compression.to_string(),
index_compression_format: store_compression.to_string(),
});
}
Ok(())
}
VersionedFooter::UnknownVersion => Err(Incompatibility::IndexMismatch {
if self.version.index_format_version < 4
|| self.version.index_format_version > INDEX_FORMAT_VERSION
{
return Err(Incompatibility::IndexMismatch {
library_version: library_version.clone(),
index_version: self.version.clone(),
}),
});
}
}
}
/// Footer that includes a crc32 hash that enables us to checksum files in the index
#[derive(Debug, Clone, PartialEq)]
pub enum VersionedFooter {
UnknownVersion,
V1 {
crc32: CrcHashU32,
store_compression: String,
},
// Introduction of the Block WAND information.
V2 {
crc32: CrcHashU32,
store_compression: String,
},
// Block WAND max term freq on 1 byte
V3 {
crc32: CrcHashU32,
store_compression: String,
},
}
impl BinarySerializable for VersionedFooter {
fn serialize<W: io::Write>(&self, writer: &mut W) -> io::Result<()> {
let mut buf = Vec::new();
match self {
VersionedFooter::V3 {
crc32,
store_compression: compression,
} => {
// Serializes a valid `VersionedFooter` or panics if the version is unknown
// [ version | crc_hash | compression_mode ]
// [ 0..4 | 4..8 | variable ]
BinarySerializable::serialize(&3u32, &mut buf)?;
BinarySerializable::serialize(crc32, &mut buf)?;
BinarySerializable::serialize(compression, &mut buf)?;
}
VersionedFooter::V2 { .. }
| VersionedFooter::V1 { .. }
| VersionedFooter::UnknownVersion => {
return Err(io::Error::new(
io::ErrorKind::InvalidInput,
"Cannot serialize an unknown versioned footer ",
));
}
}
BinarySerializable::serialize(&VInt(buf.len() as u64), writer)?;
assert!(buf.len() <= FOOTER_MAX_LEN);
writer.write_all(&buf[..])?;
Ok(())
}
fn deserialize<R: io::Read>(reader: &mut R) -> io::Result<Self> {
let len = VInt::deserialize(reader)?.0 as usize;
if len > FOOTER_MAX_LEN {
return Err(io::Error::new(
io::ErrorKind::InvalidData,
format!(
"Footer seems invalid as it suggests a footer len of {}. File is corrupted, \
or the index was created with a different & old version of tantivy.",
len
),
));
}
let mut buf = vec![0u8; len];
reader.read_exact(&mut buf[..])?;
let mut cursor = &buf[..];
let version = u32::deserialize(&mut cursor)?;
if version > 3 {
return Ok(VersionedFooter::UnknownVersion);
}
let crc32 = u32::deserialize(&mut cursor)?;
let store_compression = String::deserialize(&mut cursor)?;
Ok(if version == 1 {
VersionedFooter::V1 {
crc32,
store_compression,
}
} else if version == 2 {
VersionedFooter::V2 {
crc32,
store_compression,
}
} else {
assert_eq!(version, 3);
VersionedFooter::V3 {
crc32,
store_compression,
}
})
}
}
impl VersionedFooter {
pub fn crc(&self) -> Option<CrcHashU32> {
match self {
VersionedFooter::V3 { crc32, .. } => Some(*crc32),
VersionedFooter::V2 { crc32, .. } => Some(*crc32),
VersionedFooter::V1 { crc32, .. } => Some(*crc32),
VersionedFooter::UnknownVersion { .. } => None,
}
}
}
pub(crate) struct FooterProxy<W: TerminatingWrite> {
@@ -268,10 +144,7 @@ impl<W: TerminatingWrite> Write for FooterProxy<W> {
impl<W: TerminatingWrite> TerminatingWrite for FooterProxy<W> {
fn terminate_ref(&mut self, _: AntiCallToken) -> io::Result<()> {
let crc32 = self.hasher.take().unwrap().finalize();
let footer = Footer::new(VersionedFooter::V3 {
crc32,
store_compression: crate::store::COMPRESSION.to_string(),
});
let footer = Footer::new(crc32);
let mut writer = self.writer.take().unwrap();
footer.append_footer(&mut writer)?;
writer.terminate()
@@ -281,140 +154,75 @@ impl<W: TerminatingWrite> TerminatingWrite for FooterProxy<W> {
#[cfg(test)]
mod tests {
use super::CrcHashU32;
use super::FooterProxy;
use crate::common::{BinarySerializable, VInt};
use crate::directory::footer::{Footer, VersionedFooter};
use crate::directory::TerminatingWrite;
use byteorder::{ByteOrder, LittleEndian};
use regex::Regex;
use crate::directory::footer::Footer;
use crate::directory::OwnedBytes;
use crate::{
common::BinarySerializable,
directory::{footer::FOOTER_MAGIC_NUMBER, FileSlice},
};
use std::io;
#[test]
fn test_versioned_footer() {
let mut vec = Vec::new();
let footer_proxy = FooterProxy::new(&mut vec);
assert!(footer_proxy.terminate().is_ok());
if crate::store::COMPRESSION == "lz4" {
assert_eq!(vec.len(), 158);
} else if crate::store::COMPRESSION == "snappy" {
assert_eq!(vec.len(), 167);
} else if crate::store::COMPRESSION == "lz4_block" {
assert_eq!(vec.len(), 176);
}
let footer = Footer::deserialize(&mut &vec[..]).unwrap();
assert!(matches!(
footer.versioned_footer,
VersionedFooter::V3 { store_compression, .. }
if store_compression == crate::store::COMPRESSION
));
assert_eq!(&footer.version, crate::version());
fn test_deserialize_footer() {
let mut buf: Vec<u8> = vec![];
let footer = Footer::new(123);
footer.append_footer(&mut buf).unwrap();
let owned_bytes = OwnedBytes::new(buf);
let fileslice = FileSlice::new(Box::new(owned_bytes));
let (footer_deser, _body) = Footer::extract_footer(fileslice).unwrap();
assert_eq!(footer_deser.crc(), footer.crc());
}
#[test]
fn test_serialize_deserialize_footer() {
let mut buffer = Vec::new();
let crc32 = 123456u32;
let footer: Footer = Footer::new(VersionedFooter::V3 {
crc32,
store_compression: "lz4".to_string(),
});
footer.serialize(&mut buffer).unwrap();
let footer_deser = Footer::deserialize(&mut &buffer[..]).unwrap();
assert_eq!(footer_deser, footer);
fn test_deserialize_footer_missing_magic_byte() {
let mut buf: Vec<u8> = vec![];
BinarySerializable::serialize(&0_u32, &mut buf).unwrap();
let wrong_magic_byte: u32 = 5555;
BinarySerializable::serialize(&wrong_magic_byte, &mut buf).unwrap();
let owned_bytes = OwnedBytes::new(buf);
let fileslice = FileSlice::new(Box::new(owned_bytes));
let err = Footer::extract_footer(fileslice).unwrap_err();
assert_eq!(
err.to_string(),
"Footer magic byte mismatch. File corrupted or index was created using old an tantivy version which \
is not supported anymore. Please use tantivy 0.15 or above to recreate the index."
);
}
#[test]
fn footer_length() {
let crc32 = 1111111u32;
let versioned_footer = VersionedFooter::V3 {
crc32,
store_compression: "lz4".to_string(),
};
let mut buf = Vec::new();
versioned_footer.serialize(&mut buf).unwrap();
assert_eq!(buf.len(), 13);
let footer = Footer::new(versioned_footer);
let regex_ptn = Regex::new(
"tantivy v[0-9]{1,3}\\.[0-9]{1,3}\\.[0-9]{1,3}\\.{0,10}, index_format v[0-9]{1,5}",
)
.unwrap();
assert!(regex_ptn.is_match(&footer.meta));
}
fn test_deserialize_footer_wrong_filesize() {
let mut buf: Vec<u8> = vec![];
BinarySerializable::serialize(&100_u32, &mut buf).unwrap();
BinarySerializable::serialize(&FOOTER_MAGIC_NUMBER, &mut buf).unwrap();
#[test]
fn versioned_footer_from_bytes() {
let v_footer_bytes = vec![
// versioned footer length
12 | 128,
// index format version
3,
0,
0,
0,
// crc 32
12,
35,
89,
18,
// compression format
3 | 128,
b'l',
b'z',
b'4',
];
let mut cursor = &v_footer_bytes[..];
let versioned_footer = VersionedFooter::deserialize(&mut cursor).unwrap();
assert!(cursor.is_empty());
let expected_crc: u32 = LittleEndian::read_u32(&v_footer_bytes[5..9]) as CrcHashU32;
let expected_versioned_footer: VersionedFooter = VersionedFooter::V3 {
crc32: expected_crc,
store_compression: "lz4".to_string(),
};
assert_eq!(versioned_footer, expected_versioned_footer);
let mut buffer = Vec::new();
assert!(versioned_footer.serialize(&mut buffer).is_ok());
assert_eq!(&v_footer_bytes[..], &buffer[..]);
}
let owned_bytes = OwnedBytes::new(buf);
#[test]
fn versioned_footer_panic() {
let v_footer_bytes = vec![6u8 | 128u8, 3u8, 0u8, 0u8, 1u8, 0u8, 0u8];
let mut b = &v_footer_bytes[..];
let versioned_footer = VersionedFooter::deserialize(&mut b).unwrap();
assert!(b.is_empty());
let expected_versioned_footer = VersionedFooter::UnknownVersion;
assert_eq!(versioned_footer, expected_versioned_footer);
let mut buf = Vec::new();
assert!(versioned_footer.serialize(&mut buf).is_err());
}
#[test]
#[cfg(not(feature = "lz4"))]
fn compression_mismatch() {
let crc32 = 1111111u32;
let versioned_footer = VersionedFooter::V1 {
crc32,
store_compression: "lz4".to_string(),
};
let footer = Footer::new(versioned_footer);
let res = footer.is_compatible();
assert!(res.is_err());
let fileslice = FileSlice::new(Box::new(owned_bytes));
let err = Footer::extract_footer(fileslice).unwrap_err();
assert_eq!(err.kind(), io::ErrorKind::UnexpectedEof);
assert_eq!(
err.to_string(),
"File corrupted. The file is smaller than it\'s footer bytes (len=108)."
);
}
#[test]
fn test_deserialize_too_large_footer() {
let mut buf = vec![];
assert!(FooterProxy::new(&mut buf).terminate().is_ok());
let mut long_len_buf = [0u8; 10];
let num_bytes = VInt(super::FOOTER_MAX_LEN as u64 + 1u64).serialize_into(&mut long_len_buf);
buf[0..num_bytes].copy_from_slice(&long_len_buf[..num_bytes]);
let err = Footer::deserialize(&mut &buf[..]).unwrap_err();
let mut buf: Vec<u8> = vec![];
let footer_length = super::FOOTER_MAX_LEN + 1;
BinarySerializable::serialize(&footer_length, &mut buf).unwrap();
BinarySerializable::serialize(&FOOTER_MAGIC_NUMBER, &mut buf).unwrap();
let owned_bytes = OwnedBytes::new(buf);
let fileslice = FileSlice::new(Box::new(owned_bytes));
let err = Footer::extract_footer(fileslice).unwrap_err();
assert_eq!(err.kind(), io::ErrorKind::InvalidData);
assert_eq!(
err.to_string(),
"Footer seems invalid as it suggests a footer len of 10001. File is corrupted, \
or the index was created with a different & old version of tantivy."
"Footer seems invalid as it suggests a footer len of 50001. File is corrupted, \
or the index was created with a different & old version of tantivy."
);
}
}

View File

@@ -245,11 +245,7 @@ impl ManagedDirectory {
let mut hasher = Hasher::new();
hasher.update(bytes.as_slice());
let crc = hasher.finalize();
Ok(footer
.versioned_footer
.crc()
.map(|v| v == crc)
.unwrap_or(false))
Ok(footer.crc() == crc)
}
/// List files for which checksum does not match content

View File

@@ -593,7 +593,7 @@ mod tests {
let mut index_writer = index.writer_for_tests().unwrap();
let mut log_merge_policy = LogMergePolicy::default();
log_merge_policy.set_min_merge_size(3);
log_merge_policy.set_min_num_segments(3);
index_writer.set_merge_policy(Box::new(log_merge_policy));
for _num_commits in 0..10 {
for _ in 0..10 {

View File

@@ -28,7 +28,9 @@ pub use self::file_slice::{FileHandle, FileSlice};
pub use self::owned_bytes::OwnedBytes;
pub use self::ram_directory::RamDirectory;
pub use self::watch_event_router::{WatchCallback, WatchCallbackList, WatchHandle};
use std::io::{self, BufWriter, Write};
pub use common::AntiCallToken;
pub use common::TerminatingWrite;
use std::io::BufWriter;
use std::path::PathBuf;
/// Outcome of the Garbage collection
@@ -50,47 +52,6 @@ pub use self::mmap_directory::MmapDirectory;
pub use self::managed_directory::ManagedDirectory;
/// Struct used to prevent from calling [`terminate_ref`](trait.TerminatingWrite#method.terminate_ref) directly
///
/// The point is that while the type is public, it cannot be built by anyone
/// outside of this module.
pub struct AntiCallToken(());
/// Trait used to indicate when no more write need to be done on a writer
pub trait TerminatingWrite: Write {
/// Indicate that the writer will no longer be used. Internally call terminate_ref.
fn terminate(mut self) -> io::Result<()>
where
Self: Sized,
{
self.terminate_ref(AntiCallToken(()))
}
/// You should implement this function to define custom behavior.
/// This function should flush any buffer it may hold.
fn terminate_ref(&mut self, _: AntiCallToken) -> io::Result<()>;
}
impl<W: TerminatingWrite + ?Sized> TerminatingWrite for Box<W> {
fn terminate_ref(&mut self, token: AntiCallToken) -> io::Result<()> {
self.as_mut().terminate_ref(token)
}
}
impl<W: TerminatingWrite> TerminatingWrite for BufWriter<W> {
fn terminate_ref(&mut self, a: AntiCallToken) -> io::Result<()> {
self.flush()?;
self.get_mut().terminate_ref(a)
}
}
#[cfg(test)]
impl<'a> TerminatingWrite for &'a mut Vec<u8> {
fn terminate_ref(&mut self, _a: AntiCallToken) -> io::Result<()> {
self.flush()
}
}
/// Write object for Directory.
///
/// `WritePtr` are required to implement both Write

View File

@@ -46,7 +46,7 @@ impl Drop for VecWriter {
fn drop(&mut self) {
if !self.is_flushed {
panic!(
"You forgot to flush {:?} before its writter got Drop. Do not rely on drop.",
"You forgot to flush {:?} before its writter got Drop. Do not rely on drop. This also occurs when the indexer crashed, so you may want to check the logs for the root cause.",
self.path
)
}

View File

@@ -1,7 +1,7 @@
use crate::directory::FileSlice;
use crate::directory::OwnedBytes;
use crate::fastfield::FastFieldReader;
use crate::fastfield::{BitpackedFastFieldReader, FastFieldReader, MultiValueLength};
use crate::DocId;
use crate::{directory::FileSlice, fastfield::MultiValueLength};
/// Reader for byte array fast fields
///
@@ -15,13 +15,13 @@ use crate::{directory::FileSlice, fastfield::MultiValueLength};
/// and the start index for the next document, and keeping the bytes in between.
#[derive(Clone)]
pub struct BytesFastFieldReader {
idx_reader: FastFieldReader<u64>,
idx_reader: BitpackedFastFieldReader<u64>,
values: OwnedBytes,
}
impl BytesFastFieldReader {
pub(crate) fn open(
idx_reader: FastFieldReader<u64>,
idx_reader: BitpackedFastFieldReader<u64>,
values_file: FileSlice,
) -> crate::Result<BytesFastFieldReader> {
let values = values_file.read_bytes()?;

View File

@@ -2,7 +2,9 @@ use std::io;
use crate::schema::{Document, Field, Value};
use crate::DocId;
use crate::{fastfield::serializer::FastFieldSerializer, indexer::doc_id_mapping::DocIdMapping};
use crate::{
fastfield::serializer::CompositeFastFieldSerializer, indexer::doc_id_mapping::DocIdMapping,
};
/// Writer for byte array (as in, any number of bytes per document) fast fields
///
@@ -104,7 +106,7 @@ impl BytesFastFieldWriter {
/// Serializes the fast field values by pushing them to the `FastFieldSerializer`.
pub fn serialize(
&self,
serializer: &mut FastFieldSerializer,
serializer: &mut CompositeFastFieldSerializer,
doc_id_map: Option<&DocIdMapping>,
) -> io::Result<()> {
// writing the offset index

View File

@@ -95,7 +95,7 @@ mod tests {
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(doc!(facet_field=>Facet::from_text("/a/b")));
index_writer.add_document(doc!(facet_field=>Facet::from_text("/a/b").unwrap()));
index_writer.commit()?;
let searcher = index.reader()?.searcher();
let facet_reader = searcher
@@ -118,7 +118,7 @@ mod tests {
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(doc!(facet_field=>Facet::from_text("/a/b")));
index_writer.add_document(doc!(facet_field=>Facet::from_text("/a/b").unwrap()));
index_writer.commit()?;
let searcher = index.reader()?.searcher();
let facet_reader = searcher
@@ -141,7 +141,7 @@ mod tests {
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(doc!(facet_field=>Facet::from_text("/a/b")));
index_writer.add_document(doc!(facet_field=>Facet::from_text("/a/b").unwrap()));
index_writer.commit()?;
let searcher = index.reader()?.searcher();
let facet_reader = searcher
@@ -164,7 +164,7 @@ mod tests {
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(doc!(facet_field=>Facet::from_text("/a/b")));
index_writer.add_document(doc!(facet_field=>Facet::from_text("/a/b").unwrap()));
index_writer.commit()?;
let searcher = index.reader()?.searcher();
let facet_reader = searcher
@@ -187,7 +187,7 @@ mod tests {
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(doc!(facet_field=>Facet::from_text("/a/b")));
index_writer.add_document(doc!(facet_field=>Facet::from_text("/a/b").unwrap()));
index_writer.add_document(Document::default());
index_writer.commit()?;
let searcher = index.reader()?.searcher();

View File

@@ -29,9 +29,13 @@ pub use self::delete::DeleteBitSet;
pub use self::error::{FastFieldNotAvailableError, Result};
pub use self::facet_reader::FacetReader;
pub use self::multivalued::{MultiValuedFastFieldReader, MultiValuedFastFieldWriter};
pub(crate) use self::reader::BitpackedFastFieldReader;
pub use self::reader::DynamicFastFieldReader;
pub use self::reader::FastFieldReader;
pub use self::readers::FastFieldReaders;
pub use self::serializer::FastFieldSerializer;
pub use self::serializer::CompositeFastFieldSerializer;
pub use self::serializer::FastFieldDataAccess;
pub use self::serializer::FastFieldStats;
pub use self::writer::{FastFieldsWriter, IntFastFieldWriter};
use crate::schema::Cardinality;
use crate::schema::FieldType;
@@ -57,7 +61,7 @@ mod writer;
pub trait MultiValueLength {
/// returns the num of values associated to a doc_id
fn get_len(&self, doc_id: DocId) -> u64;
/// returns the sum of num of all values for all doc_ids
/// returns the sum of num values for all doc_ids
fn get_total_len(&self) -> u64;
}
@@ -210,15 +214,14 @@ mod tests {
use super::*;
use crate::common::CompositeFile;
use crate::common::HasLen;
use crate::directory::{Directory, RamDirectory, WritePtr};
use crate::fastfield::FastFieldReader;
use crate::merge_policy::NoMergePolicy;
use crate::schema::Field;
use crate::schema::Schema;
use crate::schema::FAST;
use crate::schema::{Document, IntOptions};
use crate::{Index, SegmentId, SegmentReader};
use common::HasLen;
use once_cell::sync::Lazy;
use rand::prelude::SliceRandom;
use rand::rngs::StdRng;
@@ -236,7 +239,7 @@ mod tests {
#[test]
pub fn test_fastfield() {
let test_fastfield = FastFieldReader::<u64>::from(vec![100, 200, 300]);
let test_fastfield = DynamicFastFieldReader::<u64>::from(vec![100, 200, 300]);
assert_eq!(test_fastfield.get(0), 100);
assert_eq!(test_fastfield.get(1), 200);
assert_eq!(test_fastfield.get(2), 300);
@@ -254,7 +257,7 @@ mod tests {
let directory: RamDirectory = RamDirectory::create();
{
let write: WritePtr = directory.open_write(Path::new("test")).unwrap();
let mut serializer = FastFieldSerializer::from_write(write).unwrap();
let mut serializer = CompositeFastFieldSerializer::from_write(write).unwrap();
let mut fast_field_writers = FastFieldsWriter::from_schema(&SCHEMA);
fast_field_writers.add_document(&doc!(*FIELD=>13u64));
fast_field_writers.add_document(&doc!(*FIELD=>14u64));
@@ -265,10 +268,10 @@ mod tests {
serializer.close().unwrap();
}
let file = directory.open_read(&path).unwrap();
assert_eq!(file.len(), 36 as usize);
assert_eq!(file.len(), 37 as usize);
let composite_file = CompositeFile::open(&file)?;
let file = composite_file.open_read(*FIELD).unwrap();
let fast_field_reader = FastFieldReader::<u64>::open(file)?;
let fast_field_reader = DynamicFastFieldReader::<u64>::open(file)?;
assert_eq!(fast_field_reader.get(0), 13u64);
assert_eq!(fast_field_reader.get(1), 14u64);
assert_eq!(fast_field_reader.get(2), 2u64);
@@ -281,7 +284,7 @@ mod tests {
let directory: RamDirectory = RamDirectory::create();
{
let write: WritePtr = directory.open_write(Path::new("test"))?;
let mut serializer = FastFieldSerializer::from_write(write)?;
let mut serializer = CompositeFastFieldSerializer::from_write(write)?;
let mut fast_field_writers = FastFieldsWriter::from_schema(&SCHEMA);
fast_field_writers.add_document(&doc!(*FIELD=>4u64));
fast_field_writers.add_document(&doc!(*FIELD=>14_082_001u64));
@@ -296,11 +299,11 @@ mod tests {
serializer.close()?;
}
let file = directory.open_read(&path)?;
assert_eq!(file.len(), 61 as usize);
assert_eq!(file.len(), 62 as usize);
{
let fast_fields_composite = CompositeFile::open(&file)?;
let data = fast_fields_composite.open_read(*FIELD).unwrap();
let fast_field_reader = FastFieldReader::<u64>::open(data)?;
let fast_field_reader = DynamicFastFieldReader::<u64>::open(data)?;
assert_eq!(fast_field_reader.get(0), 4u64);
assert_eq!(fast_field_reader.get(1), 14_082_001u64);
assert_eq!(fast_field_reader.get(2), 3_052u64);
@@ -321,7 +324,7 @@ mod tests {
{
let write: WritePtr = directory.open_write(Path::new("test")).unwrap();
let mut serializer = FastFieldSerializer::from_write(write).unwrap();
let mut serializer = CompositeFastFieldSerializer::from_write(write).unwrap();
let mut fast_field_writers = FastFieldsWriter::from_schema(&SCHEMA);
for _ in 0..10_000 {
fast_field_writers.add_document(&doc!(*FIELD=>100_000u64));
@@ -332,11 +335,11 @@ mod tests {
serializer.close().unwrap();
}
let file = directory.open_read(&path).unwrap();
assert_eq!(file.len(), 34 as usize);
assert_eq!(file.len(), 35 as usize);
{
let fast_fields_composite = CompositeFile::open(&file).unwrap();
let data = fast_fields_composite.open_read(*FIELD).unwrap();
let fast_field_reader = FastFieldReader::<u64>::open(data)?;
let fast_field_reader = DynamicFastFieldReader::<u64>::open(data)?;
for doc in 0..10_000 {
assert_eq!(fast_field_reader.get(doc), 100_000u64);
}
@@ -351,7 +354,7 @@ mod tests {
{
let write: WritePtr = directory.open_write(Path::new("test")).unwrap();
let mut serializer = FastFieldSerializer::from_write(write).unwrap();
let mut serializer = CompositeFastFieldSerializer::from_write(write).unwrap();
let mut fast_field_writers = FastFieldsWriter::from_schema(&SCHEMA);
// forcing the amplitude to be high
fast_field_writers.add_document(&doc!(*FIELD=>0u64));
@@ -364,11 +367,11 @@ mod tests {
serializer.close().unwrap();
}
let file = directory.open_read(&path).unwrap();
assert_eq!(file.len(), 80042 as usize);
assert_eq!(file.len(), 80043 as usize);
{
let fast_fields_composite = CompositeFile::open(&file)?;
let data = fast_fields_composite.open_read(*FIELD).unwrap();
let fast_field_reader = FastFieldReader::<u64>::open(data)?;
let fast_field_reader = DynamicFastFieldReader::<u64>::open(data)?;
assert_eq!(fast_field_reader.get(0), 0u64);
for doc in 1..10_001 {
assert_eq!(
@@ -390,7 +393,7 @@ mod tests {
let schema = schema_builder.build();
{
let write: WritePtr = directory.open_write(Path::new("test")).unwrap();
let mut serializer = FastFieldSerializer::from_write(write).unwrap();
let mut serializer = CompositeFastFieldSerializer::from_write(write).unwrap();
let mut fast_field_writers = FastFieldsWriter::from_schema(&schema);
for i in -100i64..10_000i64 {
let mut doc = Document::default();
@@ -403,11 +406,12 @@ mod tests {
serializer.close().unwrap();
}
let file = directory.open_read(&path).unwrap();
assert_eq!(file.len(), 17709 as usize);
//assert_eq!(file.len(), 17710 as usize); //bitpacked size
assert_eq!(file.len(), 10175 as usize); // linear interpol size
{
let fast_fields_composite = CompositeFile::open(&file)?;
let data = fast_fields_composite.open_read(i64_field).unwrap();
let fast_field_reader = FastFieldReader::<i64>::open(data)?;
let fast_field_reader = DynamicFastFieldReader::<i64>::open(data)?;
assert_eq!(fast_field_reader.min_value(), -100i64);
assert_eq!(fast_field_reader.max_value(), 9_999i64);
@@ -433,7 +437,7 @@ mod tests {
{
let write: WritePtr = directory.open_write(Path::new("test")).unwrap();
let mut serializer = FastFieldSerializer::from_write(write).unwrap();
let mut serializer = CompositeFastFieldSerializer::from_write(write).unwrap();
let mut fast_field_writers = FastFieldsWriter::from_schema(&schema);
let doc = Document::default();
fast_field_writers.add_document(&doc);
@@ -447,7 +451,7 @@ mod tests {
{
let fast_fields_composite = CompositeFile::open(&file).unwrap();
let data = fast_fields_composite.open_read(i64_field).unwrap();
let fast_field_reader = FastFieldReader::<i64>::open(data)?;
let fast_field_reader = DynamicFastFieldReader::<i64>::open(data)?;
assert_eq!(fast_field_reader.get(0u32), 0i64);
}
Ok(())
@@ -468,7 +472,7 @@ mod tests {
let directory = RamDirectory::create();
{
let write: WritePtr = directory.open_write(Path::new("test"))?;
let mut serializer = FastFieldSerializer::from_write(write)?;
let mut serializer = CompositeFastFieldSerializer::from_write(write)?;
let mut fast_field_writers = FastFieldsWriter::from_schema(&SCHEMA);
for &x in &permutation {
fast_field_writers.add_document(&doc!(*FIELD=>x));
@@ -480,7 +484,7 @@ mod tests {
{
let fast_fields_composite = CompositeFile::open(&file)?;
let data = fast_fields_composite.open_read(*FIELD).unwrap();
let fast_field_reader = FastFieldReader::<u64>::open(data)?;
let fast_field_reader = DynamicFastFieldReader::<u64>::open(data)?;
let mut a = 0u64;
for _ in 0..n {
@@ -624,7 +628,7 @@ mod bench {
let directory: RamDirectory = RamDirectory::create();
{
let write: WritePtr = directory.open_write(Path::new("test")).unwrap();
let mut serializer = FastFieldSerializer::from_write(write).unwrap();
let mut serializer = CompositeFastFieldSerializer::from_write(write).unwrap();
let mut fast_field_writers = FastFieldsWriter::from_schema(&SCHEMA);
for &x in &permutation {
fast_field_writers.add_document(&doc!(*FIELD=>x));
@@ -638,7 +642,7 @@ mod bench {
{
let fast_fields_composite = CompositeFile::open(&file).unwrap();
let data = fast_fields_composite.open_read(*FIELD).unwrap();
let fast_field_reader = FastFieldReader::<u64>::open(data).unwrap();
let fast_field_reader = DynamicFastFieldReader::<u64>::open(data).unwrap();
b.iter(|| {
let n = test::black_box(7000u32);
@@ -658,7 +662,7 @@ mod bench {
let directory: RamDirectory = RamDirectory::create();
{
let write: WritePtr = directory.open_write(Path::new("test")).unwrap();
let mut serializer = FastFieldSerializer::from_write(write).unwrap();
let mut serializer = CompositeFastFieldSerializer::from_write(write).unwrap();
let mut fast_field_writers = FastFieldsWriter::from_schema(&SCHEMA);
for &x in &permutation {
fast_field_writers.add_document(&doc!(*FIELD=>x));
@@ -672,7 +676,7 @@ mod bench {
{
let fast_fields_composite = CompositeFile::open(&file).unwrap();
let data = fast_fields_composite.open_read(*FIELD).unwrap();
let fast_field_reader = FastFieldReader::<u64>::open(data).unwrap();
let fast_field_reader = DynamicFastFieldReader::<u64>::open(data).unwrap();
b.iter(|| {
let n = test::black_box(1000u32);

View File

@@ -1,6 +1,8 @@
use std::ops::Range;
use crate::fastfield::{FastFieldReader, FastValue, MultiValueLength};
use crate::fastfield::{
BitpackedFastFieldReader, DynamicFastFieldReader, FastFieldReader, FastValue, MultiValueLength,
};
use crate::DocId;
/// Reader for a multivalued `u64` fast field.
@@ -13,14 +15,14 @@ use crate::DocId;
///
#[derive(Clone)]
pub struct MultiValuedFastFieldReader<Item: FastValue> {
idx_reader: FastFieldReader<u64>,
vals_reader: FastFieldReader<Item>,
idx_reader: DynamicFastFieldReader<u64>,
vals_reader: BitpackedFastFieldReader<Item>,
}
impl<Item: FastValue> MultiValuedFastFieldReader<Item> {
pub(crate) fn open(
idx_reader: FastFieldReader<u64>,
vals_reader: FastFieldReader<Item>,
idx_reader: DynamicFastFieldReader<u64>,
vals_reader: BitpackedFastFieldReader<Item>,
) -> MultiValuedFastFieldReader<Item> {
MultiValuedFastFieldReader {
idx_reader,

View File

@@ -1,5 +1,5 @@
use crate::fastfield::serializer::FastSingleFieldSerializer;
use crate::fastfield::FastFieldSerializer;
use crate::fastfield::serializer::BitpackedFastFieldSerializerLegacy;
use crate::fastfield::CompositeFastFieldSerializer;
use crate::postings::UnorderedTermId;
use crate::schema::{Document, Field};
use crate::termdict::TermOrdinal;
@@ -134,7 +134,7 @@ impl MultiValuedFastFieldWriter {
///
pub fn serialize(
&self,
serializer: &mut FastFieldSerializer,
serializer: &mut CompositeFastFieldSerializer,
mapping_opt: Option<&FnvHashMap<UnorderedTermId, TermOrdinal>>,
doc_id_map: Option<&DocIdMapping>,
) -> io::Result<()> {
@@ -154,7 +154,7 @@ impl MultiValuedFastFieldWriter {
}
{
// writing the values themselves.
let mut value_serializer: FastSingleFieldSerializer<'_, _>;
let mut value_serializer: BitpackedFastFieldSerializerLegacy<'_, _>;
match mapping_opt {
Some(mapping) => {
value_serializer = serializer.new_u64_fast_field_with_idx(

View File

@@ -4,47 +4,24 @@ use crate::common::CompositeFile;
use crate::directory::FileSlice;
use crate::directory::OwnedBytes;
use crate::directory::{Directory, RamDirectory, WritePtr};
use crate::fastfield::{FastFieldSerializer, FastFieldsWriter};
use crate::fastfield::{CompositeFastFieldSerializer, FastFieldsWriter};
use crate::schema::Schema;
use crate::schema::FAST;
use crate::DocId;
use fastfield_codecs::bitpacked::BitpackedFastFieldReader as BitpackedReader;
use fastfield_codecs::bitpacked::BitpackedFastFieldSerializer;
use fastfield_codecs::linearinterpol::LinearInterpolFastFieldReader;
use fastfield_codecs::linearinterpol::LinearInterpolFastFieldSerializer;
use fastfield_codecs::multilinearinterpol::MultiLinearInterpolFastFieldReader;
use fastfield_codecs::multilinearinterpol::MultiLinearInterpolFastFieldSerializer;
use fastfield_codecs::FastFieldCodecReader;
use fastfield_codecs::FastFieldCodecSerializer;
use std::collections::HashMap;
use std::marker::PhantomData;
use std::path::Path;
use tantivy_bitpacker::compute_num_bits;
use tantivy_bitpacker::BitUnpacker;
/// Trait for accessing a fastfield.
///
/// Depending on the field type, a different
/// fast field is required.
#[derive(Clone)]
pub struct FastFieldReader<Item: FastValue> {
bytes: OwnedBytes,
bit_unpacker: BitUnpacker,
min_value_u64: u64,
max_value_u64: u64,
_phantom: PhantomData<Item>,
}
impl<Item: FastValue> FastFieldReader<Item> {
/// Opens a fast field given a file.
pub fn open(file: FileSlice) -> crate::Result<Self> {
let mut bytes = file.read_bytes()?;
let min_value = u64::deserialize(&mut bytes)?;
let amplitude = u64::deserialize(&mut bytes)?;
let max_value = min_value + amplitude;
let num_bits = compute_num_bits(amplitude);
let bit_unpacker = BitUnpacker::new(num_bits);
Ok(FastFieldReader {
bytes,
min_value_u64: min_value,
max_value_u64: max_value,
bit_unpacker,
_phantom: PhantomData,
})
}
/// FastFieldReader is the trait to access fast field data.
pub trait FastFieldReader<Item: FastValue>: Clone {
/// Return the value associated to the given document.
///
/// This accessor should return as fast as possible.
@@ -52,13 +29,154 @@ impl<Item: FastValue> FastFieldReader<Item> {
/// # Panics
///
/// May panic if `doc` is greater than the segment
/// `maxdoc`.
pub fn get(&self, doc: DocId) -> Item {
self.get_u64(u64::from(doc))
}
fn get(&self, doc: DocId) -> Item;
/// Fills an output buffer with the fast field values
/// associated with the `DocId` going from
/// `start` to `start + output.len()`.
///
/// Regardless of the type of `Item`, this method works by
/// - transmuting the output array
/// - extracting the `Item`s as if they were `u64`
/// - possibly converting the `u64` value to the right type.
///
/// # Panics
///
/// May panic if `start + output.len()` is greater than
/// the segment's `maxdoc`.
fn get_range(&self, start: DocId, output: &mut [Item]);
/// Returns the minimum value for this fast field.
///
/// The min value does not take into account possible
/// deleted documents, and should be considered a lower bound
/// of the actual minimum value.
fn min_value(&self) -> Item;
/// Returns the maximum value for this fast field.
///
/// The max value does not take into account possible
/// deleted documents, and should be considered an upper bound
/// of the actual maximum value.
fn max_value(&self) -> Item;
}
#[derive(Clone)]
/// DynamicFastFieldReader wraps the different readers used to access
/// the various fastfield encodings.
///
pub enum DynamicFastFieldReader<Item: FastValue> {
/// Bitpacked compressed fastfield data.
Bitpacked(FastFieldReaderCodecWrapper<Item, BitpackedReader>),
/// Linear interpolated values + bitpacked
LinearInterpol(FastFieldReaderCodecWrapper<Item, LinearInterpolFastFieldReader>),
/// Blockwise linear interpolated values + bitpacked
MultiLinearInterpol(FastFieldReaderCodecWrapper<Item, MultiLinearInterpolFastFieldReader>),
}
impl<Item: FastValue> DynamicFastFieldReader<Item> {
/// Returns the correct reader, wrapped in the `DynamicFastFieldReader` enum, for the data.
pub fn open(file: FileSlice) -> crate::Result<DynamicFastFieldReader<Item>> {
let mut bytes = file.read_bytes()?;
let id = bytes.read_u8();
let reader = match id {
BitpackedFastFieldSerializer::ID => {
DynamicFastFieldReader::Bitpacked(FastFieldReaderCodecWrapper::<
Item,
BitpackedReader,
>::open_from_bytes(bytes)?)
}
LinearInterpolFastFieldSerializer::ID => {
DynamicFastFieldReader::LinearInterpol(FastFieldReaderCodecWrapper::<
Item,
LinearInterpolFastFieldReader,
>::open_from_bytes(bytes)?)
}
MultiLinearInterpolFastFieldSerializer::ID => {
DynamicFastFieldReader::MultiLinearInterpol(FastFieldReaderCodecWrapper::<
Item,
MultiLinearInterpolFastFieldReader,
>::open_from_bytes(
bytes
)?)
}
_ => {
panic!(
"unknown fastfield id {:?}. Data corrupted or using old tantivy version.",
id
)
}
};
Ok(reader)
}
}
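// A hedged sketch of the layout `open` relies on: the first byte of the field
// data is the codec id, the rest is the codec-specific payload. `first_value`
// is a hypothetical helper illustrating the dispatch.
fn first_value(file: FileSlice) -> crate::Result<u64> {
    // `open` reads the codec id byte and picks the matching reader variant.
    let reader = DynamicFastFieldReader::<u64>::open(file)?;
    Ok(reader.get(0u32))
}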
impl<Item: FastValue> FastFieldReader<Item> for DynamicFastFieldReader<Item> {
fn get(&self, doc: DocId) -> Item {
match self {
Self::Bitpacked(reader) => reader.get(doc),
Self::LinearInterpol(reader) => reader.get(doc),
Self::MultiLinearInterpol(reader) => reader.get(doc),
}
}
fn get_range(&self, start: DocId, output: &mut [Item]) {
match self {
Self::Bitpacked(reader) => reader.get_range(start, output),
Self::LinearInterpol(reader) => reader.get_range(start, output),
Self::MultiLinearInterpol(reader) => reader.get_range(start, output),
}
}
fn min_value(&self) -> Item {
match self {
Self::Bitpacked(reader) => reader.min_value(),
Self::LinearInterpol(reader) => reader.min_value(),
Self::MultiLinearInterpol(reader) => reader.min_value(),
}
}
fn max_value(&self) -> Item {
match self {
Self::Bitpacked(reader) => reader.max_value(),
Self::LinearInterpol(reader) => reader.max_value(),
Self::MultiLinearInterpol(reader) => reader.max_value(),
}
}
}
/// Wrapper for accessing a fastfield.
///
/// Holds the data and the codec used to read it.
#[derive(Clone)]
pub struct FastFieldReaderCodecWrapper<Item: FastValue, CodecReader> {
reader: CodecReader,
bytes: OwnedBytes,
_phantom: PhantomData<Item>,
}
impl<Item: FastValue, C: FastFieldCodecReader> FastFieldReaderCodecWrapper<Item, C> {
/// Opens a fast field given a file.
pub fn open(file: FileSlice) -> crate::Result<Self> {
let mut bytes = file.read_bytes()?;
let id = u8::deserialize(&mut bytes)?;
assert_eq!(
BitpackedFastFieldSerializer::ID,
id,
"Tried to open fast field as bitpacked encoded (id=1), but got serializer with different id"
);
Self::open_from_bytes(bytes)
}
/// Opens a fast field given the bytes.
pub fn open_from_bytes(bytes: OwnedBytes) -> crate::Result<Self> {
let reader = C::open_from_bytes(bytes.as_slice())?;
Ok(FastFieldReaderCodecWrapper {
reader,
bytes,
_phantom: PhantomData,
})
}
pub(crate) fn get_u64(&self, doc: u64) -> Item {
Item::from_u64(self.min_value_u64 + self.bit_unpacker.get(doc, &self.bytes))
Item::from_u64(self.reader.get_u64(doc, self.bytes.as_slice()))
}
/// Internally, `multivalued` fast fields also use single-valued fast fields.
@@ -78,6 +196,22 @@ impl<Item: FastValue> FastFieldReader<Item> {
*out = self.get_u64(start + (i as u64));
}
}
}
impl<Item: FastValue, C: FastFieldCodecReader + Clone> FastFieldReader<Item>
for FastFieldReaderCodecWrapper<Item, C>
{
/// Returns the value associated with the given document.
///
/// This accessor should return as fast as possible.
///
/// # Panics
///
/// May panic if `doc` is greater than the segment's
/// `maxdoc`.
fn get(&self, doc: DocId) -> Item {
self.get_u64(u64::from(doc))
}
/// Fills an output buffer with the fast field values
/// associated with the `DocId` going from
@@ -92,7 +226,7 @@ impl<Item: FastValue> FastFieldReader<Item> {
///
/// May panic if `start + output.len()` is greater than
/// the segment's `maxdoc`.
pub fn get_range(&self, start: DocId, output: &mut [Item]) {
fn get_range(&self, start: DocId, output: &mut [Item]) {
self.get_range_u64(u64::from(start), output);
}
@@ -101,8 +235,8 @@ impl<Item: FastValue> FastFieldReader<Item> {
/// The min value does not take possibly deleted documents
/// into account, and should be considered a lower bound
/// of the actual minimum value.
pub fn min_value(&self) -> Item {
Item::from_u64(self.min_value_u64)
fn min_value(&self) -> Item {
Item::from_u64(self.reader.min_value())
}
/// Returns the maximum value for this fast field.
@@ -110,13 +244,15 @@ impl<Item: FastValue> FastFieldReader<Item> {
/// The max value does not take possibly deleted documents
/// into account, and should be considered an upper bound
/// of the actual maximum value.
pub fn max_value(&self) -> Item {
Item::from_u64(self.max_value_u64)
fn max_value(&self) -> Item {
Item::from_u64(self.reader.max_value())
}
}
impl<Item: FastValue> From<Vec<Item>> for FastFieldReader<Item> {
fn from(vals: Vec<Item>) -> FastFieldReader<Item> {
pub(crate) type BitpackedFastFieldReader<Item> = FastFieldReaderCodecWrapper<Item, BitpackedReader>;
impl<Item: FastValue> From<Vec<Item>> for DynamicFastFieldReader<Item> {
fn from(vals: Vec<Item>) -> DynamicFastFieldReader<Item> {
let mut schema_builder = Schema::builder();
let field = schema_builder.add_u64_field("field", FAST);
let schema = schema_builder.build();
@@ -126,7 +262,7 @@ impl<Item: FastValue> From<Vec<Item>> for FastFieldReader<Item> {
let write: WritePtr = directory
.open_write(path)
.expect("With a RamDirectory, this should never fail.");
let mut serializer = FastFieldSerializer::from_write(write)
let mut serializer = CompositeFastFieldSerializer::from_write(write)
.expect("With a RamDirectory, this should never fail.");
let mut fast_field_writers = FastFieldsWriter::from_schema(&schema);
{
@@ -148,6 +284,6 @@ impl<Item: FastValue> From<Vec<Item>> for FastFieldReader<Item> {
let field_file = composite_file
.open_read(field)
.expect("File component not found");
FastFieldReader::open(field_file).unwrap()
DynamicFastFieldReader::open(field_file).unwrap()
}
}

View File

@@ -1,13 +1,15 @@
use crate::common::CompositeFile;
use crate::directory::FileSlice;
use crate::fastfield::MultiValuedFastFieldReader;
use crate::fastfield::{BitpackedFastFieldReader, FastFieldNotAvailableError};
use crate::fastfield::{BytesFastFieldReader, FastValue};
use crate::fastfield::{FastFieldNotAvailableError, FastFieldReader};
use crate::schema::{Cardinality, Field, FieldType, Schema};
use crate::space_usage::PerFieldSpaceUsage;
use crate::TantivyError;
/// Provides access to all of the FastFieldReader.
use super::reader::DynamicFastFieldReader;
/// Provides access to all of the BitpackedFastFieldReader.
///
/// Internally, `FastFieldReaders` have preloaded fast field readers,
/// and just wraps several `HashMap`.
@@ -100,27 +102,26 @@ impl FastFieldReaders {
pub(crate) fn typed_fast_field_reader<TFastValue: FastValue>(
&self,
field: Field,
) -> crate::Result<FastFieldReader<TFastValue>> {
) -> crate::Result<DynamicFastFieldReader<TFastValue>> {
let fast_field_slice = self.fast_field_data(field, 0)?;
FastFieldReader::open(fast_field_slice)
DynamicFastFieldReader::open(fast_field_slice)
}
pub(crate) fn typed_fast_field_multi_reader<TFastValue: FastValue>(
&self,
field: Field,
) -> crate::Result<MultiValuedFastFieldReader<TFastValue>> {
let fast_field_slice_idx = self.fast_field_data(field, 0)?;
let idx_reader = self.typed_fast_field_reader(field)?;
let fast_field_slice_vals = self.fast_field_data(field, 1)?;
let idx_reader = FastFieldReader::open(fast_field_slice_idx)?;
let vals_reader: FastFieldReader<TFastValue> =
FastFieldReader::open(fast_field_slice_vals)?;
let vals_reader: BitpackedFastFieldReader<TFastValue> =
BitpackedFastFieldReader::open(fast_field_slice_vals)?;
Ok(MultiValuedFastFieldReader::open(idx_reader, vals_reader))
}
/// Returns the `u64` fast field reader associated with `field`.
///
/// If `field` is not a u64 fast field, this method returns an Error.
pub fn u64(&self, field: Field) -> crate::Result<FastFieldReader<u64>> {
pub fn u64(&self, field: Field) -> crate::Result<DynamicFastFieldReader<u64>> {
self.check_type(field, FastType::U64, Cardinality::SingleValue)?;
self.typed_fast_field_reader(field)
}
@@ -129,14 +130,14 @@ impl FastFieldReaders {
/// field is effectively of type `u64` or not.
///
/// If not, the fastfield reader will return the u64 value associated with the original FastValue.
pub fn u64_lenient(&self, field: Field) -> crate::Result<FastFieldReader<u64>> {
pub fn u64_lenient(&self, field: Field) -> crate::Result<DynamicFastFieldReader<u64>> {
self.typed_fast_field_reader(field)
}
/// Returns the `i64` fast field reader associated with `field`.
///
/// If `field` is not an i64 fast field, this method returns an Error.
pub fn i64(&self, field: Field) -> crate::Result<FastFieldReader<i64>> {
pub fn i64(&self, field: Field) -> crate::Result<DynamicFastFieldReader<i64>> {
self.check_type(field, FastType::I64, Cardinality::SingleValue)?;
self.typed_fast_field_reader(field)
}
@@ -144,7 +145,7 @@ impl FastFieldReaders {
/// Returns the `date` fast field reader associated with `field`.
///
/// If `field` is not a date fast field, this method returns an Error.
pub fn date(&self, field: Field) -> crate::Result<FastFieldReader<crate::DateTime>> {
pub fn date(&self, field: Field) -> crate::Result<DynamicFastFieldReader<crate::DateTime>> {
self.check_type(field, FastType::Date, Cardinality::SingleValue)?;
self.typed_fast_field_reader(field)
}
@@ -152,7 +153,7 @@ impl FastFieldReaders {
/// Returns the `f64` fast field reader associated with `field`.
///
/// If `field` is not an f64 fast field, this method returns an Error.
pub fn f64(&self, field: Field) -> crate::Result<FastFieldReader<f64>> {
pub fn f64(&self, field: Field) -> crate::Result<DynamicFastFieldReader<f64>> {
self.check_type(field, FastType::F64, Cardinality::SingleValue)?;
self.typed_fast_field_reader(field)
}
@@ -213,7 +214,7 @@ impl FastFieldReaders {
)));
}
let fast_field_idx_file = self.fast_field_data(field, 0)?;
let idx_reader = FastFieldReader::open(fast_field_idx_file)?;
let idx_reader = BitpackedFastFieldReader::open(fast_field_idx_file)?;
let data = self.fast_field_data(field, 1)?;
BytesFastFieldReader::open(idx_reader, data)
} else {

View File

@@ -1,142 +0,0 @@
use crate::common::BinarySerializable;
use crate::common::CompositeWrite;
use crate::common::CountingWriter;
use crate::directory::WritePtr;
use crate::schema::Field;
use std::io::{self, Write};
use tantivy_bitpacker::compute_num_bits;
use tantivy_bitpacker::BitPacker;
/// `FastFieldSerializer` is in charge of serializing
/// fastfields on disk.
///
/// Fast fields are encoded using bit-packing.
///
/// `FastFieldWriter`s are in charge of pushing the data to
/// the serializer.
/// The serializer expects to receive the following calls.
///
/// * `new_u64_fast_field(...)`
/// * `add_val(...)`
/// * `add_val(...)`
/// * `add_val(...)`
/// * ...
/// * `close_field()`
/// * `new_u64_fast_field(...)`
/// * `add_val(...)`
/// * ...
/// * `close_field()`
/// * `close()`
pub struct FastFieldSerializer {
composite_write: CompositeWrite<WritePtr>,
}
impl FastFieldSerializer {
/// Constructor
pub fn from_write(write: WritePtr) -> io::Result<FastFieldSerializer> {
// just making room for the pointer to header.
let composite_write = CompositeWrite::wrap(write);
Ok(FastFieldSerializer { composite_write })
}
/// Start serializing a new u64 fast field
pub fn new_u64_fast_field(
&mut self,
field: Field,
min_value: u64,
max_value: u64,
) -> io::Result<FastSingleFieldSerializer<'_, CountingWriter<WritePtr>>> {
self.new_u64_fast_field_with_idx(field, min_value, max_value, 0)
}
/// Start serializing a new u64 fast field
pub fn new_u64_fast_field_with_idx(
&mut self,
field: Field,
min_value: u64,
max_value: u64,
idx: usize,
) -> io::Result<FastSingleFieldSerializer<'_, CountingWriter<WritePtr>>> {
let field_write = self.composite_write.for_field_with_idx(field, idx);
FastSingleFieldSerializer::open(field_write, min_value, max_value)
}
/// Start serializing a new [u8] fast field
pub fn new_bytes_fast_field_with_idx(
&mut self,
field: Field,
idx: usize,
) -> FastBytesFieldSerializer<'_, CountingWriter<WritePtr>> {
let field_write = self.composite_write.for_field_with_idx(field, idx);
FastBytesFieldSerializer { write: field_write }
}
/// Closes the serializer
///
/// After this call the data must be persistently saved on disk.
pub fn close(self) -> io::Result<()> {
self.composite_write.close()
}
}
pub struct FastSingleFieldSerializer<'a, W: Write> {
bit_packer: BitPacker,
write: &'a mut W,
min_value: u64,
num_bits: u8,
}
impl<'a, W: Write> FastSingleFieldSerializer<'a, W> {
/// Creates a new fast field serializer.
///
/// The serializer in fact encodes the values by bitpacking
/// `(val - min_value)`.
///
/// It requires a `min_value` and a `max_value` to compute
/// the minimum number of bits required to encode
/// values.
fn open(
write: &'a mut W,
min_value: u64,
max_value: u64,
) -> io::Result<FastSingleFieldSerializer<'a, W>> {
assert!(min_value <= max_value);
min_value.serialize(write)?;
let amplitude = max_value - min_value;
amplitude.serialize(write)?;
let num_bits = compute_num_bits(amplitude);
let bit_packer = BitPacker::new();
Ok(FastSingleFieldSerializer {
bit_packer,
write,
min_value,
num_bits,
})
}
/// Pushes a new value to the currently open u64 fast field.
pub fn add_val(&mut self, val: u64) -> io::Result<()> {
let val_to_write: u64 = val - self.min_value;
self.bit_packer
.write(val_to_write, self.num_bits, &mut self.write)?;
Ok(())
}
pub fn close_field(mut self) -> io::Result<()> {
self.bit_packer.close(&mut self.write)
}
}
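// A hedged sanity check of the width computation above (hypothetical test,
// assuming tantivy_bitpacker::compute_num_bits): for min_value = 100 and
// max_value = 115 the amplitude is 15, so each value is stored as a 4-bit
// offset from 100 (e.g. 107 -> 0b0111).
#[test]
fn num_bits_sketch() {
    assert_eq!(compute_num_bits(15), 4);
    assert_eq!(compute_num_bits(0), 0);
    assert_eq!(compute_num_bits(u64::MAX), 64);
}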
pub struct FastBytesFieldSerializer<'a, W: Write> {
write: &'a mut W,
}
impl<'a, W: Write> FastBytesFieldSerializer<'a, W> {
pub fn write_all(&mut self, vals: &[u8]) -> io::Result<()> {
self.write.write_all(vals)
}
pub fn flush(&mut self) -> io::Result<()> {
self.write.flush()
}
}

View File

@@ -0,0 +1,203 @@
use crate::common::BinarySerializable;
use crate::common::CompositeWrite;
use crate::common::CountingWriter;
use crate::directory::WritePtr;
use crate::schema::Field;
pub use fastfield_codecs::bitpacked::BitpackedFastFieldSerializer;
pub use fastfield_codecs::bitpacked::BitpackedFastFieldSerializerLegacy;
use fastfield_codecs::linearinterpol::LinearInterpolFastFieldSerializer;
use fastfield_codecs::multilinearinterpol::MultiLinearInterpolFastFieldSerializer;
pub use fastfield_codecs::FastFieldCodecSerializer;
pub use fastfield_codecs::FastFieldDataAccess;
pub use fastfield_codecs::FastFieldStats;
use std::io::{self, Write};
/// `CompositeFastFieldSerializer` is in charge of serializing
/// fastfields on disk.
///
/// Fast fields have different encodings like bit-packing.
///
/// `FastFieldWriter`s are in charge of pushing the data to
/// the serializer.
/// The serializer expects to receive the following calls.
///
/// * `new_u64_fast_field(...)`
/// * `add_val(...)`
/// * `add_val(...)`
/// * `add_val(...)`
/// * ...
/// * `close_field()`
/// * `new_u64_fast_field(...)`
/// * `add_val(...)`
/// * ...
/// * `close_field()`
/// * `close()`
pub struct CompositeFastFieldSerializer {
composite_write: CompositeWrite<WritePtr>,
}
// Use this once explicit_generic_args_with_impl_trait is merged and stabilized:
// https://github.com/rust-lang/rust/pull/86176
fn codec_estimation<T: FastFieldCodecSerializer, A: FastFieldDataAccess>(
stats: FastFieldStats,
fastfield_accessor: &A,
estimations: &mut Vec<(f32, &str, u8)>,
) {
if !T::is_applicable(fastfield_accessor, stats.clone()) {
return;
}
let (ratio, name, id) = (
T::estimate(fastfield_accessor, stats.clone()),
T::NAME,
T::ID,
);
estimations.push((ratio, name, id));
}
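// A hedged sketch of how the collected estimations are consumed: invalid
// entries are dropped and the smallest estimated ratio wins. `pick_best` is a
// hypothetical helper mirroring the selection logic in the impl below.
fn pick_best<'a>(mut estimations: Vec<(f32, &'a str, u8)>) -> Option<(f32, &'a str, u8)> {
    // NaN marks a broken estimation; f32::MAX marks a codec that opted out.
    estimations.retain(|estimation| !estimation.0.is_nan() && estimation.0 != f32::MAX);
    estimations.sort_by(|a, b| a.0.partial_cmp(&b.0).unwrap());
    estimations.first().copied()
}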
impl CompositeFastFieldSerializer {
/// Constructor
pub fn from_write(write: WritePtr) -> io::Result<CompositeFastFieldSerializer> {
// just making room for the pointer to header.
let composite_write = CompositeWrite::wrap(write);
Ok(CompositeFastFieldSerializer { composite_write })
}
/// Serialize data into a new u64 fast field. The best compression codec will be chosen automatically.
pub fn create_auto_detect_u64_fast_field(
&mut self,
field: Field,
stats: FastFieldStats,
fastfield_accessor: impl FastFieldDataAccess,
data_iter_1: impl Iterator<Item = u64>,
data_iter_2: impl Iterator<Item = u64>,
) -> io::Result<()> {
let field_write = self.composite_write.for_field_with_idx(field, 0);
let mut estimations = vec![];
codec_estimation::<BitpackedFastFieldSerializer, _>(
stats.clone(),
&fastfield_accessor,
&mut estimations,
);
codec_estimation::<LinearInterpolFastFieldSerializer, _>(
stats.clone(),
&fastfield_accessor,
&mut estimations,
);
codec_estimation::<MultiLinearInterpolFastFieldSerializer, _>(
stats.clone(),
&fastfield_accessor,
&mut estimations,
);
if let Some(broken_estimation) = estimations
.iter()
.find(|estimation| estimation.0.is_nan())
{
warn!(
"broken estimation for fast field codec {}",
broken_estimation.1
);
}
// Remove NaN values for codecs with broken calculations, and f32::MAX values, which disable codecs.
estimations.retain(|estimation| !estimation.0.is_nan() && estimation.0 != f32::MAX);
estimations.sort_by(|a, b| a.0.partial_cmp(&b.0).unwrap());
let (_ratio, name, id) = estimations[0];
debug!(
"choosing fast field codec {} for field_id {:?}",
name, field
); // TODO: print the actual field name
id.serialize(field_write)?;
match name {
BitpackedFastFieldSerializer::NAME => {
BitpackedFastFieldSerializer::serialize(
field_write,
&fastfield_accessor,
stats,
data_iter_1,
data_iter_2,
)?;
}
LinearInterpolFastFieldSerializer::NAME => {
LinearInterpolFastFieldSerializer::serialize(
field_write,
&fastfield_accessor,
stats,
data_iter_1,
data_iter_2,
)?;
}
MultiLinearInterpolFastFieldSerializer::NAME => {
MultiLinearInterpolFastFieldSerializer::serialize(
field_write,
&fastfield_accessor,
stats,
data_iter_1,
data_iter_2,
)?;
}
_ => {
panic!("unknown fastfield serializer {}", name)
}
};
field_write.flush()?;
Ok(())
}
/// Start serializing a new u64 fast field
pub fn new_u64_fast_field(
&mut self,
field: Field,
min_value: u64,
max_value: u64,
) -> io::Result<BitpackedFastFieldSerializerLegacy<'_, CountingWriter<WritePtr>>> {
self.new_u64_fast_field_with_idx(field, min_value, max_value, 0)
}
/// Start serializing a new u64 fast field
pub fn new_u64_fast_field_with_idx(
&mut self,
field: Field,
min_value: u64,
max_value: u64,
idx: usize,
) -> io::Result<BitpackedFastFieldSerializerLegacy<'_, CountingWriter<WritePtr>>> {
let field_write = self.composite_write.for_field_with_idx(field, idx);
// Prepend codec id to field data for compatibility with DynamicFastFieldReader.
let id = BitpackedFastFieldSerializer::ID;
id.serialize(field_write)?;
BitpackedFastFieldSerializerLegacy::open(field_write, min_value, max_value)
}
/// Start serializing a new [u8] fast field
pub fn new_bytes_fast_field_with_idx(
&mut self,
field: Field,
idx: usize,
) -> FastBytesFieldSerializer<'_, CountingWriter<WritePtr>> {
let field_write = self.composite_write.for_field_with_idx(field, idx);
FastBytesFieldSerializer { write: field_write }
}
/// Closes the serializer
///
/// After this call the data must be persistently saved on disk.
pub fn close(self) -> io::Result<()> {
self.composite_write.close()
}
}
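// A hedged usage sketch of the auto-detect path, assuming `&Vec<u64>`
// implements `FastFieldDataAccess` (as the offset index code relies on);
// `serialize_column` is a hypothetical helper, not part of the API.
fn serialize_column(
    serializer: &mut CompositeFastFieldSerializer,
    field: Field,
    vals: Vec<u64>,
) -> io::Result<()> {
    let stats = FastFieldStats {
        min_value: vals.iter().copied().min().unwrap_or(0),
        max_value: vals.iter().copied().max().unwrap_or(0),
        num_vals: vals.len() as u64,
    };
    // Estimates all applicable codecs and writes the best one, prefixed by
    // its codec id byte.
    serializer.create_auto_detect_u64_fast_field(
        field,
        stats,
        &vals,
        vals.iter().copied(),
        vals.iter().copied(),
    )
}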
pub struct FastBytesFieldSerializer<'a, W: Write> {
write: &'a mut W,
}
impl<'a, W: Write> FastBytesFieldSerializer<'a, W> {
pub fn write_all(&mut self, vals: &[u8]) -> io::Result<()> {
self.write.write_all(vals)
}
pub fn flush(&mut self) -> io::Result<()> {
self.write.flush()
}
}

View File

@@ -1,10 +1,13 @@
use super::multivalued::MultiValuedFastFieldWriter;
use super::serializer::FastFieldStats;
use super::FastFieldDataAccess;
use crate::common;
use crate::fastfield::{BytesFastFieldWriter, FastFieldSerializer};
use crate::fastfield::{BytesFastFieldWriter, CompositeFastFieldSerializer};
use crate::indexer::doc_id_mapping::DocIdMapping;
use crate::postings::UnorderedTermId;
use crate::schema::{Cardinality, Document, Field, FieldEntry, FieldType, Schema};
use crate::termdict::TermOrdinal;
use crate::DocId;
use fnv::FnvHashMap;
use std::collections::HashMap;
use std::io;
@@ -148,7 +151,7 @@ impl FastFieldsWriter {
/// order to the fast field serializer.
pub fn serialize(
&self,
serializer: &mut FastFieldSerializer,
serializer: &mut CompositeFastFieldSerializer,
mapping: &HashMap<Field, FnvHashMap<UnorderedTermId, TermOrdinal>>,
doc_id_map: Option<&DocIdMapping>,
) -> io::Result<()> {
@@ -264,15 +267,15 @@ impl IntFastFieldWriter {
self.add_val(val);
}
/// Extract the stored data
pub(crate) fn get_data(&self) -> Vec<u64> {
self.vals.iter().collect::<Vec<u64>>()
/// Returns an iterator over the data.
pub(crate) fn iter(&self) -> impl Iterator<Item = u64> + '_ {
self.vals.iter()
}
/// Pushes the fast field values to the serializer.
pub fn serialize(
&self,
serializer: &mut FastFieldSerializer,
serializer: &mut CompositeFastFieldSerializer,
doc_id_map: Option<&DocIdMapping>,
) -> io::Result<()> {
let (min, max) = if self.val_min > self.val_max {
@@ -280,17 +283,58 @@ impl IntFastFieldWriter {
} else {
(self.val_min, self.val_max)
};
let mut single_field_serializer = serializer.new_u64_fast_field(self.field, min, max)?;
if let Some(doc_id_map) = doc_id_map {
for doc_id in doc_id_map.iter_old_doc_ids() {
single_field_serializer.add_val(self.vals.get(*doc_id as usize))?;
}
} else {
for val in self.vals.iter() {
single_field_serializer.add_val(val)?;
}
let fastfield_accessor = WriterFastFieldAccessProvider {
doc_id_map,
vals: &self.vals,
};
let stats = FastFieldStats {
min_value: min,
max_value: max,
num_vals: self.val_count as u64,
};
single_field_serializer.close_field()
if let Some(doc_id_map) = doc_id_map {
let iter = doc_id_map
.iter_old_doc_ids()
.map(|doc_id| self.vals.get(*doc_id as usize));
serializer.create_auto_detect_u64_fast_field(
self.field,
stats,
fastfield_accessor,
iter.clone(),
iter,
)?;
} else {
serializer.create_auto_detect_u64_fast_field(
self.field,
stats,
fastfield_accessor,
self.vals.iter(),
self.vals.iter(),
)?;
};
Ok(())
}
}
#[derive(Clone)]
struct WriterFastFieldAccessProvider<'map, 'bitp> {
doc_id_map: Option<&'map DocIdMapping>,
vals: &'bitp BlockedBitpacker,
}
impl<'map, 'bitp> FastFieldDataAccess for WriterFastFieldAccessProvider<'map, 'bitp> {
/// Returns the value associated with the given document.
///
/// This accessor should return as fast as possible.
///
/// # Panics
///
/// May panic if `doc` is greater than the segment's `maxdoc`.
fn get(&self, doc: DocId) -> u64 {
if let Some(doc_id_map) = self.doc_id_map {
self.vals.get(doc_id_map.get_old_doc_id(doc) as usize) // consider extra FastFieldReader wrapper for non doc_id_map
} else {
self.vals.get(doc as usize)
}
}
}
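// A minimal sketch of the remapping `get` performs when a doc_id_map is
// present, with a plain slice as a hypothetical stand-in for DocIdMapping:
// with new_to_old = [2, 0, 1] and vals = [10, 20, 30], get(0) yields vals[2] = 30.
fn remapped_get(new_to_old: &[DocId], vals: &[u64], doc: DocId) -> u64 {
    let old_doc = new_to_old[doc as usize];
    vals[old_doc as usize]
}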

View File

@@ -8,7 +8,6 @@ use crate::{
DocId, IndexSortByField, Order, TantivyError,
};
use std::cmp::Reverse;
/// Struct to provide mapping from old doc_id to new doc_id and vice versa
pub struct DocIdMapping {
new_doc_id_to_old: Vec<DocId>,
@@ -61,9 +60,8 @@ pub(crate) fn get_doc_id_mapping_from_field(
})?;
// create new doc_id to old doc_id index (used in fast_field_writers)
let data = fast_field.get_data();
let mut doc_id_and_data = data
.into_iter()
let mut doc_id_and_data = fast_field
.iter()
.enumerate()
.map(|el| (el.0 as DocId, el.1))
.collect::<Vec<_>>();
@@ -92,6 +90,7 @@ pub(crate) fn get_doc_id_mapping_from_field(
#[cfg(test)]
mod tests_indexsorting {
use crate::fastfield::FastFieldReader;
use crate::{collector::TopDocs, query::QueryParser, schema::*};
use crate::{schema::Schema, DocAddress};
use crate::{Index, IndexSettings, IndexSortByField, Order};
@@ -175,6 +174,7 @@ mod tests_indexsorting {
field: "my_number".to_string(),
order: Order::Asc,
}),
..Default::default()
}),
option.clone(),
);
@@ -206,6 +206,7 @@ mod tests_indexsorting {
field: "my_number".to_string(),
order: Order::Desc,
}),
..Default::default()
}),
option.clone(),
);
@@ -264,6 +265,7 @@ mod tests_indexsorting {
field: "my_number".to_string(),
order: Order::Asc,
}),
..Default::default()
}),
get_text_options(),
);
@@ -288,6 +290,7 @@ mod tests_indexsorting {
field: "my_number".to_string(),
order: Order::Desc,
}),
..Default::default()
}),
get_text_options(),
);
@@ -322,6 +325,7 @@ mod tests_indexsorting {
field: "my_number".to_string(),
order: Order::Asc,
}),
..Default::default()
}),
get_text_options(),
);
@@ -352,6 +356,7 @@ mod tests_indexsorting {
field: "my_number".to_string(),
order: Order::Desc,
}),
..Default::default()
}),
get_text_options(),
);
@@ -387,6 +392,7 @@ mod tests_indexsorting {
field: "my_number".to_string(),
order: Order::Asc,
}),
..Default::default()
}),
get_text_options(),
);

View File

@@ -945,7 +945,7 @@ mod tests {
let index_writer = index.writer(3_000_000).unwrap();
assert_eq!(
format!("{:?}", index_writer.get_merge_policy()),
"LogMergePolicy { min_merge_size: 8, max_merge_size: 10000000, min_layer_size: 10000, \
"LogMergePolicy { min_num_segments: 8, max_docs_before_merge: 10000000, min_layer_size: 10000, \
level_log_size: 0.75 }"
);
let merge_policy = Box::new(NoMergePolicy::default());

View File

@@ -1,19 +1,20 @@
use super::merge_policy::{MergeCandidate, MergePolicy};
use crate::core::SegmentMeta;
use itertools::Itertools;
use std::cmp;
use std::f64;
const DEFAULT_LEVEL_LOG_SIZE: f64 = 0.75;
const DEFAULT_MIN_LAYER_SIZE: u32 = 10_000;
const DEFAULT_MIN_MERGE_SIZE: usize = 8;
const DEFAULT_MAX_MERGE_SIZE: usize = 10_000_000;
const DEFAULT_MIN_NUM_SEGMENTS_IN_MERGE: usize = 8;
const DEFAULT_MAX_DOCS_BEFORE_MERGE: usize = 10_000_000;
/// `LogMergePolicy` tries to merge segments that have a similar number of
/// documents.
#[derive(Debug, Clone)]
pub struct LogMergePolicy {
min_merge_size: usize,
max_merge_size: usize,
min_num_segments: usize,
max_docs_before_merge: usize,
min_layer_size: u32,
level_log_size: f64,
}
@@ -23,15 +24,16 @@ impl LogMergePolicy {
cmp::max(self.min_layer_size, size)
}
/// Set the minimum number of segment that may be merge together.
pub fn set_min_merge_size(&mut self, min_merge_size: usize) {
self.min_merge_size = min_merge_size;
/// Set the minimum number of segments that may be merged together.
pub fn set_min_num_segments(&mut self, min_num_segments: usize) {
self.min_num_segments = min_num_segments;
}
/// Set the maximum number of docs in a segment for it to be considered for
/// merging.
pub fn set_max_merge_size(&mut self, max_merge_size: usize) {
self.max_merge_size = max_merge_size;
/// merging. A segment can still end up with more than max_docs by merging many
/// smaller ones.
pub fn set_max_docs_before_merge(&mut self, max_docs_merge_size: usize) {
self.max_docs_before_merge = max_docs_merge_size;
}
/// Set the minimum segment size under which all segments belong
@@ -42,7 +44,7 @@ impl LogMergePolicy {
/// Set the ratio between two consecutive levels.
///
/// Segment are group in levels according to their sizes.
/// Segments are grouped in levels according to their sizes.
/// These levels are defined as intervals of exponentially growing sizes.
/// level_log_size defines the factor by which one should multiply the limit
/// to reach a level, in order to get the limit to reach the following
@@ -54,52 +56,43 @@ impl LogMergePolicy {
impl MergePolicy for LogMergePolicy {
fn compute_merge_candidates(&self, segments: &[SegmentMeta]) -> Vec<MergeCandidate> {
let mut size_sorted_tuples = segments
let mut size_sorted_segments = segments
.iter()
.map(SegmentMeta::num_docs)
.enumerate()
.filter(|(_, s)| s <= &(self.max_merge_size as u32))
.collect::<Vec<(usize, u32)>>();
.filter(|segment_meta| segment_meta.num_docs() <= (self.max_docs_before_merge as u32))
.collect::<Vec<&SegmentMeta>>();
size_sorted_tuples.sort_by(|x, y| y.1.cmp(&(x.1)));
if size_sorted_tuples.len() <= 1 {
return Vec::new();
}
let size_sorted_log_tuples: Vec<_> = size_sorted_tuples
.into_iter()
.map(|(ind, num_docs)| (ind, f64::from(self.clip_min_size(num_docs)).log2()))
.collect();
if let Some(&(first_ind, first_score)) = size_sorted_log_tuples.first() {
let mut current_max_log_size = first_score;
let mut levels = vec![vec![first_ind]];
for &(ind, score) in (&size_sorted_log_tuples).iter().skip(1) {
if score < (current_max_log_size - self.level_log_size) {
current_max_log_size = score;
levels.push(Vec::new());
}
levels.last_mut().unwrap().push(ind);
}
levels
.iter()
.filter(|level| level.len() >= self.min_merge_size)
.map(|ind_vec| {
MergeCandidate(ind_vec.iter().map(|&ind| segments[ind].id()).collect())
})
.collect()
} else {
if size_sorted_segments.len() <= 1 {
return vec![];
}
size_sorted_segments.sort_by_key(|seg| std::cmp::Reverse(seg.num_docs()));
let mut current_max_log_size = f64::MAX;
let mut levels = vec![];
for (_, merge_group) in &size_sorted_segments.into_iter().group_by(|segment| {
let segment_log_size = f64::from(self.clip_min_size(segment.num_docs())).log2();
if segment_log_size < (current_max_log_size - self.level_log_size) {
// update current_max_log_size to create a new group
current_max_log_size = segment_log_size;
}
// return current_max_log_size to be grouped to the current group
current_max_log_size
}) {
levels.push(merge_group.collect::<Vec<&SegmentMeta>>());
}
levels
.iter()
.filter(|level| level.len() >= self.min_num_segments)
.map(|segments| MergeCandidate(segments.iter().map(|&seg| seg.id()).collect()))
.collect()
}
}
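// A worked example of the grouping above, with the defaults
// level_log_size = 0.75 and min_layer_size = 10_000: segments with num_docs
// [2_000_000, 1_300_000, 600_000, 30_000] have log2 sizes of roughly
// [20.9, 20.3, 19.2, 14.9]. Since 20.3 >= 20.9 - 0.75, the first two share a
// level; 19.2 and 14.9 each fall below the running threshold and start new
// levels: [[2_000_000, 1_300_000], [600_000], [30_000]]. Only levels with at
// least min_num_segments segments become merge candidates.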
impl Default for LogMergePolicy {
fn default() -> LogMergePolicy {
LogMergePolicy {
min_merge_size: DEFAULT_MIN_MERGE_SIZE,
max_merge_size: DEFAULT_MAX_MERGE_SIZE,
min_num_segments: DEFAULT_MIN_NUM_SEGMENTS_IN_MERGE,
max_docs_before_merge: DEFAULT_MAX_DOCS_BEFORE_MERGE,
min_layer_size: DEFAULT_MIN_LAYER_SIZE,
level_log_size: DEFAULT_LEVEL_LOG_SIZE,
}
@@ -109,16 +102,79 @@ impl Default for LogMergePolicy {
#[cfg(test)]
mod tests {
use super::*;
use crate::core::{SegmentId, SegmentMeta, SegmentMetaInventory};
use crate::indexer::merge_policy::MergePolicy;
use crate::{
core::{SegmentId, SegmentMeta, SegmentMetaInventory},
schema,
};
use crate::{indexer::merge_policy::MergePolicy, schema::INDEXED};
use once_cell::sync::Lazy;
static INVENTORY: Lazy<SegmentMetaInventory> = Lazy::new(SegmentMetaInventory::default);
use crate::Index;
#[test]
fn create_index_test_max_merge_issue_1035() {
let mut schema_builder = schema::Schema::builder();
let int_field = schema_builder.add_u64_field("intval", INDEXED);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
{
let mut log_merge_policy = LogMergePolicy::default();
log_merge_policy.set_min_num_segments(1);
log_merge_policy.set_max_docs_before_merge(1);
log_merge_policy.set_min_layer_size(0);
let mut index_writer = index.writer_for_tests().unwrap();
index_writer.set_merge_policy(Box::new(log_merge_policy));
// After every commit the merge checker is started; it will merge only segments with 1
// element in them, because of max_docs_before_merge.
index_writer.add_document(doc!(int_field=>1_u64));
assert!(index_writer.commit().is_ok());
index_writer.add_document(doc!(int_field=>2_u64));
assert!(index_writer.commit().is_ok());
index_writer.add_document(doc!(int_field=>3_u64));
assert!(index_writer.commit().is_ok());
index_writer.add_document(doc!(int_field=>4_u64));
assert!(index_writer.commit().is_ok());
index_writer.add_document(doc!(int_field=>5_u64));
assert!(index_writer.commit().is_ok());
index_writer.add_document(doc!(int_field=>6_u64));
assert!(index_writer.commit().is_ok());
index_writer.add_document(doc!(int_field=>7_u64));
assert!(index_writer.commit().is_ok());
index_writer.add_document(doc!(int_field=>8_u64));
assert!(index_writer.commit().is_ok());
}
let _segment_ids = index
.searchable_segment_ids()
.expect("Searchable segments failed.");
let reader = index.reader().unwrap();
let searcher = reader.searcher();
let segment_readers = searcher.segment_readers();
for segment in segment_readers {
if segment.num_docs() > 2 {
panic!("a segment can't have more than two docs");
} // We don't know how to wait for the merge to finish; otherwise this could be a simple equality check.
}
}
fn test_merge_policy() -> LogMergePolicy {
let mut log_merge_policy = LogMergePolicy::default();
log_merge_policy.set_min_merge_size(3);
log_merge_policy.set_max_merge_size(100_000);
log_merge_policy.set_min_num_segments(3);
log_merge_policy.set_max_docs_before_merge(100_000);
log_merge_policy.set_min_layer_size(2);
log_merge_policy
}

View File

@@ -1,7 +1,11 @@
use super::doc_id_mapping::DocIdMapping;
use crate::error::DataCorruption;
use crate::fastfield::CompositeFastFieldSerializer;
use crate::fastfield::DeleteBitSet;
use crate::fastfield::DynamicFastFieldReader;
use crate::fastfield::FastFieldDataAccess;
use crate::fastfield::FastFieldReader;
use crate::fastfield::FastFieldSerializer;
use crate::fastfield::FastFieldStats;
use crate::fastfield::MultiValuedFastFieldReader;
use crate::fieldnorm::FieldNormsSerializer;
use crate::fieldnorm::FieldNormsWriter;
@@ -86,7 +90,7 @@ pub struct IndexMerger {
}
fn compute_min_max_val(
u64_reader: &FastFieldReader<u64>,
u64_reader: &impl FastFieldReader<u64>,
max_doc: DocId,
delete_bitset_opt: Option<&DeleteBitSet>,
) -> Option<(u64, u64)> {
@@ -182,6 +186,10 @@ impl IndexMerger {
readers.push(reader);
}
}
if let Some(sort_by_field) = index_settings.sort_by_field.as_ref() {
readers = Self::sort_readers_by_min_sort_field(readers, sort_by_field)?;
}
// sort segments by their natural sort setting
if max_doc >= MAX_DOC_LIMIT {
let err_msg = format!(
"The segment resulting from this merge would have {} docs,\
@@ -191,13 +199,37 @@ impl IndexMerger {
return Err(crate::TantivyError::InvalidArgument(err_msg));
}
Ok(IndexMerger {
schema,
index_settings,
schema,
readers,
max_doc,
})
}
fn sort_readers_by_min_sort_field(
readers: Vec<SegmentReader>,
sort_by_field: &IndexSortByField,
) -> crate::Result<Vec<SegmentReader>> {
// presort the readers by their min_values, so that when they are disjunct, we can use
// the regular merge logic (implicitly sorted)
let mut readers_with_min_sort_values = readers
.into_iter()
.map(|reader| {
let accessor = Self::get_sort_field_accessor(&reader, &sort_by_field)?;
Ok((reader, accessor.min_value()))
})
.collect::<crate::Result<Vec<_>>>()?;
if sort_by_field.order.is_asc() {
readers_with_min_sort_values.sort_by_key(|(_, min_val)| *min_val);
} else {
readers_with_min_sort_values.sort_by_key(|(_, min_val)| std::cmp::Reverse(*min_val));
}
Ok(readers_with_min_sort_values
.into_iter()
.map(|(reader, _)| reader)
.collect())
}
fn write_fieldnorms(
&self,
mut fieldnorms_serializer: FieldNormsSerializer,
@@ -208,9 +240,14 @@ impl IndexMerger {
for field in fields {
fieldnorms_data.clear();
if let Some(doc_id_mapping) = doc_id_mapping {
let fieldnorms_readers: Vec<FieldNormReader> = self
.readers
.iter()
.map(|reader| reader.get_fieldnorms_reader(field))
.collect::<Result<_, _>>()?;
for (doc_id, reader_with_ordinal) in doc_id_mapping {
let fieldnorms_reader =
reader_with_ordinal.reader.get_fieldnorms_reader(field)?;
&fieldnorms_readers[reader_with_ordinal.ordinal as usize];
let fieldnorm_id = fieldnorms_reader.fieldnorm_id(*doc_id);
fieldnorms_data.push(fieldnorm_id);
}
@@ -231,7 +268,7 @@ impl IndexMerger {
fn write_fast_fields(
&self,
fast_field_serializer: &mut FastFieldSerializer,
fast_field_serializer: &mut CompositeFastFieldSerializer,
mut term_ord_mappings: HashMap<Field, TermOrdinalMapping>,
doc_id_mapping: &Option<Vec<(DocId, SegmentReaderWithOrdinal)>>,
) -> crate::Result<()> {
@@ -281,11 +318,11 @@ impl IndexMerger {
fn write_single_fast_field(
&self,
field: Field,
fast_field_serializer: &mut FastFieldSerializer,
fast_field_serializer: &mut CompositeFastFieldSerializer,
doc_id_mapping: &Option<Vec<(DocId, SegmentReaderWithOrdinal)>>,
) -> crate::Result<()> {
let (min_value, max_value) = self.readers.iter().map(|reader|{
let u64_reader: FastFieldReader<u64> = reader
let u64_reader: DynamicFastFieldReader<u64> = reader
.fast_fields()
.typed_fast_field_reader(field)
.expect("Failed to find a reader for single fast field. This is a tantivy bug and it should never happen.");
@@ -300,7 +337,7 @@ impl IndexMerger {
.readers
.iter()
.map(|reader| {
let u64_reader: FastFieldReader<u64> = reader
let u64_reader: DynamicFastFieldReader<u64> = reader
.fast_fields()
.typed_fast_field_reader(field)
.expect("Failed to find a reader for single fast field. This is a tantivy bug and it should never happen.");
@@ -308,27 +345,44 @@ impl IndexMerger {
})
.collect::<Vec<_>>();
if let Some(doc_id_mapping) = doc_id_mapping {
let sorted_doc_ids = doc_id_mapping.iter().map(|(doc_id, reader_with_ordinal)| {
(
doc_id,
&fast_field_readers[reader_with_ordinal.ordinal as usize],
)
});
// add values in order of the new doc_ids
let mut fast_single_field_serializer =
fast_field_serializer.new_u64_fast_field(field, min_value, max_value)?;
for (doc_id, field_reader) in sorted_doc_ids {
let val = field_reader.get(*doc_id);
fast_single_field_serializer.add_val(val)?;
#[derive(Clone)]
struct SortedDocidFieldAccessProvider<'a> {
doc_id_mapping: &'a Vec<(DocId, SegmentReaderWithOrdinal<'a>)>,
fast_field_readers: &'a Vec<DynamicFastFieldReader<u64>>,
}
impl<'a> FastFieldDataAccess for SortedDocidFieldAccessProvider<'a> {
fn get(&self, doc: DocId) -> u64 {
let (doc_id, reader_with_ordinal) = self.doc_id_mapping[doc as usize];
self.fast_field_readers[reader_with_ordinal.ordinal as usize].get(doc_id)
}
}
let stats = FastFieldStats {
min_value,
max_value,
num_vals: doc_id_mapping.len() as u64,
};
let fastfield_accessor = SortedDocidFieldAccessProvider {
doc_id_mapping,
fast_field_readers: &fast_field_readers,
};
let iter = doc_id_mapping.iter().map(|(doc_id, reader_with_ordinal)| {
let fast_field_reader = &fast_field_readers[reader_with_ordinal.ordinal as usize];
fast_field_reader.get(*doc_id)
});
fast_field_serializer.create_auto_detect_u64_fast_field(
field,
stats,
fastfield_accessor,
iter.clone(),
iter,
)?;
fast_single_field_serializer.close_field()?;
Ok(())
} else {
let u64_readers = self.readers.iter()
.filter(|reader|reader.max_doc() != reader.delete_bitset().map(|bit_set|bit_set.len() as u32).unwrap_or(0))
.map(|reader|{
let u64_reader: FastFieldReader<u64> = reader
let u64_reader: DynamicFastFieldReader<u64> = reader
.fast_fields()
.typed_fast_field_reader(field)
.expect("Failed to find a reader for single fast field. This is a tantivy bug and it should never happen.");
@@ -354,6 +408,60 @@ impl IndexMerger {
}
}
/// Checks if the readers are disjunct for their sort property and in the correct order to be
/// able to just stack them.
pub(crate) fn is_disjunct_and_sorted_on_sort_property(
&self,
sort_by_field: &IndexSortByField,
) -> crate::Result<bool> {
let reader_and_field_accessors = self.get_reader_with_sort_field_accessor(sort_by_field)?;
let everything_is_in_order = reader_and_field_accessors
.into_iter()
.map(|reader| reader.1)
.tuple_windows()
.all(|(field_accessor1, field_accessor2)| {
if sort_by_field.order.is_asc() {
field_accessor1.max_value() <= field_accessor2.min_value()
} else {
field_accessor1.min_value() >= field_accessor2.max_value()
}
});
Ok(everything_is_in_order)
}
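// A worked example for the check above: with ascending sort, segments whose
// (min, max) sort values are (1, 3), (10, 20), (50, 500) satisfy
// max(i) <= min(i + 1) for every adjacent pair, so they can simply be
// stacked; (1, 3) followed by (2, 20) would overlap and require a doc id
// mapping.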
pub(crate) fn get_sort_field_accessor(
reader: &SegmentReader,
sort_by_field: &IndexSortByField,
) -> crate::Result<impl FastFieldReader<u64>> {
let field_id = expect_field_id_for_sort_field(&reader.schema(), &sort_by_field)?; // for now expect fastfield, but not strictly required
let value_accessor = reader.fast_fields().u64_lenient(field_id)?;
Ok(value_accessor)
}
/// Collecting value_accessors into a vec to bind the lifetime.
pub(crate) fn get_reader_with_sort_field_accessor<'a, 'b>(
&'a self,
sort_by_field: &'b IndexSortByField,
) -> crate::Result<
Vec<(
SegmentReaderWithOrdinal<'a>,
impl FastFieldReader<u64> + Clone,
)>,
> {
let reader_and_field_accessors = self
.readers
.iter()
.enumerate()
.map(Into::into)
.map(|reader_with_ordinal: SegmentReaderWithOrdinal| {
let value_accessor =
Self::get_sort_field_accessor(reader_with_ordinal.reader, sort_by_field)?;
Ok((reader_with_ordinal, value_accessor))
})
.collect::<crate::Result<Vec<_>>>()?;
Ok(reader_and_field_accessors)
}
/// Generates the doc_id mapping, where the position in the vec is the new
/// doc_id.
/// ReaderWithOrdinal will include the ordinal position of the
@@ -362,42 +470,26 @@ impl IndexMerger {
&self,
sort_by_field: &IndexSortByField,
) -> crate::Result<Vec<(DocId, SegmentReaderWithOrdinal)>> {
let reader_and_field_accessors = self
.readers
.iter()
.enumerate()
.map(|reader| {
let reader_with_ordinal: SegmentReaderWithOrdinal = reader.into();
let field_id = expect_field_id_for_sort_field(
&reader_with_ordinal.reader.schema(),
&sort_by_field,
)?; // for now expect fastfield, but not strictly required
let value_accessor = reader_with_ordinal
.reader
.fast_fields()
.u64_lenient(field_id)?;
Ok((reader_with_ordinal, value_accessor))
})
.collect::<crate::Result<Vec<_>>>()?; // Collecting to bind the lifetime of value_accessor into the vec, or can't be used as a reference.
// Loading the field accessor on demand causes a 15x regression
let reader_and_field_accessors = self.get_reader_with_sort_field_accessor(sort_by_field)?;
// Loading the field accessor on demand causes a 15x regression
// create iterators over segment/sort_accessor/doc_id tuple
let doc_id_reader_pair = reader_and_field_accessors
.iter()
.map(|reader_and_field_accessor| {
reader_and_field_accessor
.0
.reader
.doc_ids_alive()
.map(move |doc_id| {
(
doc_id,
reader_and_field_accessor.0,
&reader_and_field_accessor.1,
)
})
})
.collect::<Vec<_>>();
let doc_id_reader_pair =
reader_and_field_accessors
.iter()
.map(|reader_and_field_accessor| {
reader_and_field_accessor
.0
.reader
.doc_ids_alive()
.map(move |doc_id| {
(
doc_id,
reader_and_field_accessor.0,
&reader_and_field_accessor.1,
)
})
});
// create iterator tuple of (old doc_id, reader) in order of the new doc_ids
let sorted_doc_ids: Vec<(DocId, SegmentReaderWithOrdinal)> = doc_id_reader_pair
@@ -422,19 +514,21 @@ impl IndexMerger {
// Important: reader_and_field_accessor needs
// to have the same order as self.readers since ReaderWithOrdinal
// is used to index the reader_and_field_accessors vec.
fn write_1_n_fast_field_idx_generic(
fn write_1_n_fast_field_idx_generic<T: MultiValueLength>(
field: Field,
fast_field_serializer: &mut FastFieldSerializer,
fast_field_serializer: &mut CompositeFastFieldSerializer,
doc_id_mapping: &Option<Vec<(DocId, SegmentReaderWithOrdinal)>>,
reader_and_field_accessors: &[(&SegmentReader, impl MultiValueLength)],
reader_and_field_accessors: &[(&SegmentReader, T)],
) -> crate::Result<()> {
let mut total_num_vals = 0u64;
// In the first pass, we compute the total number of vals.
//
// This is required by the bitpacker, as it needs to know
// what bit length to use for bitpacking.
let mut idx_num_vals = 0;
for (reader, u64s_reader) in reader_and_field_accessors.iter() {
if let Some(delete_bitset) = reader.delete_bitset() {
idx_num_vals += reader.max_doc() as u64 - delete_bitset.len() as u64;
for doc in 0u32..reader.max_doc() {
if delete_bitset.is_alive(doc) {
let num_vals = u64s_reader.get_len(doc) as u64;
@@ -442,44 +536,66 @@ impl IndexMerger {
}
}
} else {
idx_num_vals += reader.max_doc() as u64;
total_num_vals += u64s_reader.get_total_len();
}
}
let stats = FastFieldStats {
max_value: total_num_vals,
num_vals: idx_num_vals,
min_value: 0,
};
// We can now create our `idx` serializer, and in a second pass,
// can effectively push the different indexes.
if let Some(doc_id_mapping) = doc_id_mapping {
let mut serialize_idx =
fast_field_serializer.new_u64_fast_field_with_idx(field, 0, total_num_vals, 0)?;
// Copying into a temp vec is not ideal, but the fast field codec API requires random
// access, which is used in the estimation. It's possible to 1. calculate random
// access on the fly or 2. change the codec API to make random access optional, but
// both options also have major drawbacks.
let mut offsets = vec![];
let mut offset = 0;
for (doc_id, reader) in doc_id_mapping {
let reader = &reader_and_field_accessors[reader.ordinal as usize].1;
serialize_idx.add_val(offset)?;
offsets.push(offset);
offset += reader.get_len(*doc_id) as u64;
}
serialize_idx.add_val(offset as u64)?;
offsets.push(offset);
serialize_idx.close_field()?;
fast_field_serializer.create_auto_detect_u64_fast_field(
field,
stats,
&offsets,
offsets.iter().cloned(),
offsets.iter().cloned(),
)?;
} else {
let mut serialize_idx =
fast_field_serializer.new_u64_fast_field_with_idx(field, 0, total_num_vals, 0)?;
let mut idx = 0;
let mut offsets = vec![];
let mut offset = 0;
for (segment_reader, u64s_reader) in reader_and_field_accessors.iter() {
for doc in segment_reader.doc_ids_alive() {
serialize_idx.add_val(idx)?;
idx += u64s_reader.get_len(doc) as u64;
offsets.push(offset);
offset += u64s_reader.get_len(doc) as u64;
}
}
serialize_idx.add_val(idx)?;
serialize_idx.close_field()?;
offsets.push(offset);
fast_field_serializer.create_auto_detect_u64_fast_field(
field,
stats,
&offsets,
offsets.iter().cloned(),
offsets.iter().cloned(),
)?;
}
Ok(())
}
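// A worked example of the idx fast field built above: documents with
// multi-value lengths [2, 0, 3] yield offsets [0, 2, 2, 5]; the values of
// doc `d` live in vals[offsets[d]..offsets[d + 1]], and the trailing offset
// equals the total number of values.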
fn write_multi_value_fast_field_idx(
&self,
field: Field,
fast_field_serializer: &mut FastFieldSerializer,
fast_field_serializer: &mut CompositeFastFieldSerializer,
doc_id_mapping: &Option<Vec<(DocId, SegmentReaderWithOrdinal)>>,
) -> crate::Result<()> {
let reader_and_field_accessors = self.readers.iter().map(|reader|{
@@ -501,7 +617,7 @@ impl IndexMerger {
&self,
field: Field,
term_ordinal_mappings: &TermOrdinalMapping,
fast_field_serializer: &mut FastFieldSerializer,
fast_field_serializer: &mut CompositeFastFieldSerializer,
doc_id_mapping: &Option<Vec<(DocId, SegmentReaderWithOrdinal)>>,
) -> crate::Result<()> {
// Multifastfield consists in 2 fastfields.
@@ -564,7 +680,7 @@ impl IndexMerger {
fn write_multi_fast_field(
&self,
field: Field,
fast_field_serializer: &mut FastFieldSerializer,
fast_field_serializer: &mut CompositeFastFieldSerializer,
doc_id_mapping: &Option<Vec<(DocId, SegmentReaderWithOrdinal)>>,
) -> crate::Result<()> {
// Multifastfield consists in 2 fastfields.
@@ -651,7 +767,7 @@ impl IndexMerger {
fn write_bytes_fast_field(
&self,
field: Field,
fast_field_serializer: &mut FastFieldSerializer,
fast_field_serializer: &mut CompositeFastFieldSerializer,
doc_id_mapping: &Option<Vec<(DocId, SegmentReaderWithOrdinal)>>,
) -> crate::Result<()> {
let reader_and_field_accessors = self
@@ -797,13 +913,11 @@ impl IndexMerger {
let mut total_doc_freq = 0;
// Let's compute the list of non-empty posting lists
for heap_item in merged_terms.current_kvs() {
let segment_ord = heap_item.segment_ord;
let term_info = heap_item.streamer.value();
let segment_reader = &self.readers[heap_item.segment_ord];
for (segment_ord, term_info) in merged_terms.current_segment_ordinals_and_term_infos() {
let segment_reader = &self.readers[segment_ord];
let inverted_index: &InvertedIndexReader = &*field_readers[segment_ord];
let segment_postings = inverted_index
.read_postings_from_terminfo(term_info, segment_postings_option)?;
.read_postings_from_terminfo(&term_info, segment_postings_option)?;
let delete_bitset_opt = segment_reader.delete_bitset();
let doc_freq = if let Some(delete_bitset) = delete_bitset_opt {
segment_postings.doc_freq_given_deletes(delete_bitset)
@@ -927,19 +1041,41 @@ impl IndexMerger {
.collect();
if let Some(doc_id_mapping) = doc_id_mapping {
for (old_doc_id, reader_with_ordinal) in doc_id_mapping {
let store_reader = &mut document_iterators[reader_with_ordinal.ordinal as usize];
let raw_doc = store_reader.next().expect(&format!(
"unexpected missing document in docstore on merge, doc id {:?}",
old_doc_id
))?;
store_writer.store_bytes(raw_doc.get_bytes())?;
let doc_bytes_it = &mut document_iterators[reader_with_ordinal.ordinal as usize];
if let Some(doc_bytes_res) = doc_bytes_it.next() {
let doc_bytes = doc_bytes_res?;
store_writer.store_bytes(&doc_bytes)?;
} else {
return Err(DataCorruption::comment_only(&format!(
"unexpected missing document in docstore on merge, doc id {:?}",
old_doc_id
))
.into());
}
}
} else {
for reader in &self.readers {
let store_reader = reader.get_store_reader()?;
if reader.num_deleted_docs() > 0 {
for raw_doc in store_reader.iter_raw(reader.delete_bitset()) {
store_writer.store_bytes(raw_doc?.get_bytes())?;
if reader.num_deleted_docs() > 0
// If there is not enough data in the store, we avoid stacking in order to
// avoid creating many small blocks in the doc store. Once we have 5 full blocks,
// we start stacking. In the worst case 2/7 of the blocks would be very small.
// [segment 1 - {1 doc}][segment 2 - {fullblock * 5}{1doc}]
// => 5 * full blocks, 2 * 1 document blocks
//
// In a more realistic scenario the segments are of the same size, so 1/6 of
// the doc stores would on average be half full, given total randomness (which
// is not the case here, though it's unclear how it behaves exactly).
//
// https://github.com/tantivy-search/tantivy/issues/1053
//
// Take 7 in order not to walk over all checkpoints.
|| store_reader.block_checkpoints().take(7).count() < 6
|| store_reader.compressor() != store_writer.compressor()
{
for doc_bytes_res in store_reader.iter_raw(reader.delete_bitset()) {
let doc_bytes = doc_bytes_res?;
store_writer.store_bytes(&doc_bytes)?;
}
} else {
store_writer.stack(&store_reader)?;
@@ -958,7 +1094,13 @@ impl SerializableSegment for IndexMerger {
) -> crate::Result<u32> {
let doc_id_mapping = if let Some(sort_by_field) = self.index_settings.sort_by_field.as_ref()
{
Some(self.generate_doc_id_mapping(sort_by_field)?)
// If the documents are already sorted and stackable, we ignore the mapping and execute
// it as if there was no sorting
if self.is_disjunct_and_sorted_on_sort_property(sort_by_field)? {
None
} else {
Some(self.generate_doc_id_mapping(sort_by_field)?)
}
} else {
None
};
@@ -993,6 +1135,7 @@ mod tests {
use crate::collector::tests::{BytesFastFieldTestCollector, FastFieldTestCollector};
use crate::collector::{Count, FacetCollector};
use crate::core::Index;
use crate::fastfield::FastFieldReader;
use crate::query::AllQuery;
use crate::query::BooleanQuery;
use crate::query::Scorer;
@@ -1470,31 +1613,65 @@ mod tests {
}
#[test]
fn test_merge_facets_sort_none() {
test_merge_facets(None)
test_merge_facets(None, true)
}
#[test]
fn test_merge_facets_sort_asc() {
// the data is already sorted asc, so this should have no effect, but go through the docid
// mapping code
test_merge_facets(Some(IndexSettings {
sort_by_field: Some(IndexSortByField {
field: "intval".to_string(),
order: Order::Asc,
// In the merge case this will go through the docid mapping code
test_merge_facets(
Some(IndexSettings {
sort_by_field: Some(IndexSortByField {
field: "intval".to_string(),
order: Order::Desc,
}),
..Default::default()
}),
}));
true,
);
// In the merge case this will not go through the docid mapping code, because the data is
// sorted and disjunct
test_merge_facets(
Some(IndexSettings {
sort_by_field: Some(IndexSortByField {
field: "intval".to_string(),
order: Order::Desc,
}),
..Default::default()
}),
false,
);
}
#[test]
fn test_merge_facets_sort_desc() {
test_merge_facets(Some(IndexSettings {
sort_by_field: Some(IndexSortByField {
field: "intval".to_string(),
order: Order::Desc,
// In the merge case this will go through the docid mapping code
test_merge_facets(
Some(IndexSettings {
sort_by_field: Some(IndexSortByField {
field: "intval".to_string(),
order: Order::Desc,
}),
..Default::default()
}),
}));
true,
);
// In the merge case this will not go through the docid mapping code, because the data is
// sorted and disjunct
test_merge_facets(
Some(IndexSettings {
sort_by_field: Some(IndexSortByField {
field: "intval".to_string(),
order: Order::Desc,
}),
..Default::default()
}),
false,
);
}
fn test_merge_facets(index_settings: Option<IndexSettings>) {
// force_segment_value_overlap forces the int value used for sorting to have overlapping min and max
// ranges between segments, so that the merge algorithm can't apply certain optimizations
fn test_merge_facets(index_settings: Option<IndexSettings>, force_segment_value_overlap: bool) {
let mut schema_builder = schema::Schema::builder();
let facet_field = schema_builder.add_facet_field("facet", INDEXED);
let int_options = IntOptions::default()
@@ -1511,32 +1688,47 @@ mod tests {
let mut int_val = 0;
{
let mut index_writer = index.writer_for_tests().unwrap();
let mut index_doc = |index_writer: &mut IndexWriter, doc_facets: &[&str]| {
let mut doc = Document::default();
for facet in doc_facets {
doc.add_facet(facet_field, Facet::from(facet));
}
doc.add_u64(int_field, int_val);
int_val += 1;
index_writer.add_document(doc);
};
let index_doc =
|index_writer: &mut IndexWriter, doc_facets: &[&str], int_val: &mut u64| {
let mut doc = Document::default();
for facet in doc_facets {
doc.add_facet(facet_field, Facet::from(facet));
}
doc.add_u64(int_field, *int_val);
*int_val += 1;
index_writer.add_document(doc);
};
index_doc(&mut index_writer, &["/top/a/firstdoc", "/top/b"]);
index_doc(&mut index_writer, &["/top/a/firstdoc", "/top/b", "/top/c"]);
index_doc(&mut index_writer, &["/top/a", "/top/b"]);
index_doc(&mut index_writer, &["/top/a"]);
index_doc(
&mut index_writer,
&["/top/a/firstdoc", "/top/b"],
&mut int_val,
);
index_doc(
&mut index_writer,
&["/top/a/firstdoc", "/top/b", "/top/c"],
&mut int_val,
);
index_doc(&mut index_writer, &["/top/a", "/top/b"], &mut int_val);
index_doc(&mut index_writer, &["/top/a"], &mut int_val);
index_doc(&mut index_writer, &["/top/b", "/top/d"]);
index_doc(&mut index_writer, &["/top/d"]);
index_doc(&mut index_writer, &["/top/e"]);
index_doc(&mut index_writer, &["/top/b", "/top/d"], &mut int_val);
if force_segment_value_overlap {
index_doc(&mut index_writer, &["/top/d"], &mut 0);
index_doc(&mut index_writer, &["/top/e"], &mut 10);
index_writer.commit().expect("committed");
index_doc(&mut index_writer, &["/top/a"], &mut 5); // 5 is between 0 - 10 so the segments don' have disjunct ranges
} else {
index_doc(&mut index_writer, &["/top/d"], &mut int_val);
index_doc(&mut index_writer, &["/top/e"], &mut int_val);
index_writer.commit().expect("committed");
index_doc(&mut index_writer, &["/top/a"], &mut int_val);
}
index_doc(&mut index_writer, &["/top/b"], &mut int_val);
index_doc(&mut index_writer, &["/top/c"], &mut int_val);
index_writer.commit().expect("committed");
index_doc(&mut index_writer, &["/top/a"]);
index_doc(&mut index_writer, &["/top/b"]);
index_doc(&mut index_writer, &["/top/c"]);
index_writer.commit().expect("committed");
index_doc(&mut index_writer, &["/top/e", "/top/f"]);
index_doc(&mut index_writer, &["/top/e", "/top/f"], &mut int_val);
index_writer.commit().expect("committed");
}
@@ -1821,7 +2013,7 @@ mod tests {
// Make sure we'll attempt to merge every created segment
let mut policy = crate::indexer::LogMergePolicy::default();
policy.set_min_merge_size(2);
policy.set_min_num_segments(2);
writer.set_merge_policy(Box::new(policy));
for i in 0..100 {

View File

@@ -1,5 +1,6 @@
#[cfg(test)]
mod tests {
use crate::fastfield::FastFieldReader;
use crate::{
collector::TopDocs,
schema::{Cardinality, TextFieldIndexing},
@@ -39,6 +40,7 @@ mod tests {
let mut index_writer = index.writer_for_tests().unwrap();
index_writer.add_document(doc!(int_field=>3_u64, facet_field=> Facet::from("/crime")));
index_writer.add_document(doc!(int_field=>6_u64, facet_field=> Facet::from("/crime")));
assert!(index_writer.commit().is_ok());
index_writer.add_document(doc!(int_field=>5_u64, facet_field=> Facet::from("/fanta")));
@@ -58,7 +60,12 @@ mod tests {
index
}
fn create_test_index(index_settings: Option<IndexSettings>) -> Index {
// force_disjunct_segment_sort_values forces the field by which the index is sorted to have disjunct
// ranges between segments, e.g. values in segments [1-3] [10-20] [50-500]
fn create_test_index(
index_settings: Option<IndexSettings>,
force_disjunct_segment_sort_values: bool,
) -> Index {
let mut schema_builder = schema::Schema::builder();
let int_options = IntOptions::default()
.set_fast(Cardinality::SingleValue)
@@ -92,6 +99,7 @@ mod tests {
{
let mut index_writer = index.writer_for_tests().unwrap();
// segment 1 - range 1-3
index_writer.add_document(doc!(int_field=>1_u64));
index_writer.add_document(
doc!(int_field=>3_u64, multi_numbers => 3_u64, multi_numbers => 4_u64, bytes_field => vec![1, 2, 3], text_field => "some text", facet_field=> Facet::from("/book/crime")),
@@ -102,13 +110,26 @@ mod tests {
);
assert!(index_writer.commit().is_ok());
// segment 2 - range 1-20, with force_disjunct_segment_sort_values 10-20
index_writer.add_document(doc!(int_field=>20_u64, multi_numbers => 20_u64));
index_writer.add_document(doc!(int_field=>1_u64, text_field=> "deleteme", facet_field=> Facet::from("/book/crime")));
let in_val = if force_disjunct_segment_sort_values {
10_u64
} else {
1
};
index_writer.add_document(doc!(int_field=>in_val, text_field=> "deleteme", facet_field=> Facet::from("/book/crime")));
assert!(index_writer.commit().is_ok());
index_writer.add_document(
doc!(int_field=>10_u64, multi_numbers => 10_u64, multi_numbers => 11_u64, text_field=> "blubber", facet_field=> Facet::from("/book/fantasy")),
// segment 3 - range 5-1000, with force_disjunct_segment_sort_values 50-1000
let int_vals = if force_disjunct_segment_sort_values {
[100_u64, 50]
} else {
[10, 5]
};
index_writer.add_document( // position of this doc after delete in desc sorting = [2], in disjunct case [1]
doc!(int_field=>int_vals[0], multi_numbers => 10_u64, multi_numbers => 11_u64, text_field=> "blubber", facet_field=> Facet::from("/book/fantasy")),
);
index_writer.add_document(doc!(int_field=>5_u64, text_field=> "deleteme"));
index_writer.add_document(doc!(int_field=>int_vals[1], text_field=> "deleteme"));
index_writer.add_document(
doc!(int_field=>1_000u64, multi_numbers => 1001_u64, multi_numbers => 1002_u64, bytes_field => vec![5, 5],text_field => "the biggest num")
);
@@ -136,17 +157,30 @@ mod tests {
field: "intval".to_string(),
order: Order::Desc,
}),
..Default::default()
}));
}
#[test]
fn test_merge_sorted_index_desc() {
let index = create_test_index(Some(IndexSettings {
sort_by_field: Some(IndexSortByField {
field: "intval".to_string(),
order: Order::Desc,
fn test_merge_sorted_index_desc_not_disjunct() {
test_merge_sorted_index_desc_(false);
}
#[test]
fn test_merge_sorted_index_desc_disjunct() {
test_merge_sorted_index_desc_(true);
}
fn test_merge_sorted_index_desc_(force_disjunct_segment_sort_values: bool) {
let index = create_test_index(
Some(IndexSettings {
sort_by_field: Some(IndexSortByField {
field: "intval".to_string(),
order: Order::Desc,
}),
..Default::default()
}),
}));
force_disjunct_segment_sort_values,
);
let int_field = index.schema().get_field("intval").unwrap();
let reader = index.reader().unwrap();
@@ -160,8 +194,13 @@ mod tests {
assert_eq!(fast_field.get(5u32), 1u64);
assert_eq!(fast_field.get(4u32), 2u64);
assert_eq!(fast_field.get(3u32), 3u64);
assert_eq!(fast_field.get(2u32), 10u64);
assert_eq!(fast_field.get(1u32), 20u64);
if force_disjunct_segment_sort_values {
assert_eq!(fast_field.get(2u32), 20u64);
assert_eq!(fast_field.get(1u32), 100u64);
} else {
assert_eq!(fast_field.get(2u32), 10u64);
assert_eq!(fast_field.get(1u32), 20u64);
}
assert_eq!(fast_field.get(0u32), 1_000u64);
// test new field norm mapping
@@ -169,8 +208,13 @@ mod tests {
let my_text_field = index.schema().get_field("text_field").unwrap();
let fieldnorm_reader = segment_reader.get_fieldnorms_reader(my_text_field).unwrap();
assert_eq!(fieldnorm_reader.fieldnorm(0), 3); // the biggest num
assert_eq!(fieldnorm_reader.fieldnorm(1), 0);
assert_eq!(fieldnorm_reader.fieldnorm(2), 1); // blubber
if force_disjunct_segment_sort_values {
assert_eq!(fieldnorm_reader.fieldnorm(1), 1); // blubber
assert_eq!(fieldnorm_reader.fieldnorm(2), 0);
} else {
assert_eq!(fieldnorm_reader.fieldnorm(1), 0);
assert_eq!(fieldnorm_reader.fieldnorm(2), 1); // blubber
}
assert_eq!(fieldnorm_reader.fieldnorm(3), 2); // some text
assert_eq!(fieldnorm_reader.fieldnorm(5), 0);
}
@@ -191,13 +235,22 @@ mod tests {
};
assert_eq!(do_search("some"), vec![3]);
assert_eq!(do_search("blubber"), vec![2]);
if force_disjunct_segment_sort_values {
assert_eq!(do_search("blubber"), vec![1]);
} else {
assert_eq!(do_search("blubber"), vec![2]);
}
assert_eq!(do_search("biggest"), vec![0]);
}
// access doc store
{
let doc = searcher.doc(DocAddress::new(0, 2)).unwrap();
let blubber_pos = if force_disjunct_segment_sort_values {
1
} else {
2
};
let doc = searcher.doc(DocAddress::new(0, blubber_pos)).unwrap();
assert_eq!(
doc.get_first(my_text_field).unwrap().text(),
Some("blubber")
@@ -209,12 +262,16 @@ mod tests {
#[test]
fn test_merge_sorted_index_asc() {
let index = create_test_index(Some(IndexSettings {
sort_by_field: Some(IndexSortByField {
field: "intval".to_string(),
order: Order::Asc,
let index = create_test_index(
Some(IndexSettings {
sort_by_field: Some(IndexSortByField {
field: "intval".to_string(),
order: Order::Asc,
}),
..Default::default()
}),
}));
false,
);
let int_field = index.schema().get_field("intval").unwrap();
let multi_numbers = index.schema().get_field("multi_numbers").unwrap();
@@ -305,6 +362,7 @@ mod bench_sorted_index_merge {
use crate::core::Index;
//use crate::schema;
use crate::fastfield::DynamicFastFieldReader;
use crate::fastfield::FastFieldReader;
use crate::indexer::merger::IndexMerger;
use crate::schema::Cardinality;
@@ -315,7 +373,6 @@ mod bench_sorted_index_merge {
use crate::IndexSortByField;
use crate::IndexWriter;
use crate::Order;
use futures::executor::block_on;
use test::{self, Bencher};
fn create_index(sort_by_field: Option<IndexSortByField>) -> Index {
let mut schema_builder = Schema::builder();
@@ -323,12 +380,12 @@ mod bench_sorted_index_merge {
.set_fast(Cardinality::SingleValue)
.set_indexed();
let int_field = schema_builder.add_u64_field("intval", int_options);
let int_field = schema_builder.add_u64_field("intval", int_options);
let schema = schema_builder.build();
let index_builder = Index::builder()
.schema(schema)
.settings(IndexSettings { sort_by_field });
let index_builder = Index::builder().schema(schema).settings(IndexSettings {
sort_by_field,
..Default::default()
});
let index = index_builder.create_in_ram().unwrap();
{
@@ -366,7 +423,7 @@ mod bench_sorted_index_merge {
b.iter(|| {
let sorted_doc_ids = doc_id_mapping.iter().map(|(doc_id, reader)|{
let u64_reader: FastFieldReader<u64> = reader
let u64_reader: DynamicFastFieldReader<u64> = reader.reader
.fast_fields()
.typed_fast_field_reader(field)
.expect("Failed to find a reader for single fast field. This is a tantivy bug and it should never happen.");
@@ -391,7 +448,7 @@ mod bench_sorted_index_merge {
order: Order::Desc,
};
let index = create_index(Some(sort_by_field.clone()));
let field = index.schema().get_field("intval").unwrap();
//let field = index.schema().get_field("intval").unwrap();
let segments = index.searchable_segments().unwrap();
let merger: IndexMerger =
IndexMerger::open(index.schema(), index.settings().clone(), &segments[..])?;

View File

@@ -1,6 +1,6 @@
use crate::core::Segment;
use crate::core::SegmentComponent;
use crate::fastfield::FastFieldSerializer;
use crate::fastfield::CompositeFastFieldSerializer;
use crate::fieldnorm::FieldNormsSerializer;
use crate::postings::InvertedIndexSerializer;
use crate::store::StoreWriter;
@@ -10,7 +10,7 @@ use crate::store::StoreWriter;
pub struct SegmentSerializer {
segment: Segment,
pub(crate) store_writer: StoreWriter,
fast_field_serializer: FastFieldSerializer,
fast_field_serializer: CompositeFastFieldSerializer,
fieldnorms_serializer: Option<FieldNormsSerializer>,
postings_serializer: InvertedIndexSerializer,
}
@@ -33,15 +33,16 @@ impl SegmentSerializer {
let store_write = segment.open_write(store_component)?;
let fast_field_write = segment.open_write(SegmentComponent::FastFields)?;
let fast_field_serializer = FastFieldSerializer::from_write(fast_field_write)?;
let fast_field_serializer = CompositeFastFieldSerializer::from_write(fast_field_write)?;
let fieldnorms_write = segment.open_write(SegmentComponent::FieldNorms)?;
let fieldnorms_serializer = FieldNormsSerializer::from_write(fieldnorms_write)?;
let postings_serializer = InvertedIndexSerializer::open(&mut segment)?;
let compressor = segment.index().settings().docstore_compression;
Ok(SegmentSerializer {
segment,
store_writer: StoreWriter::new(store_write),
store_writer: StoreWriter::new(store_write, compressor),
fast_field_serializer,
fieldnorms_serializer: Some(fieldnorms_serializer),
postings_serializer,
@@ -67,7 +68,7 @@ impl SegmentSerializer {
}
/// Accessor to the `FastFieldSerializer`.
pub fn get_fast_field_serializer(&mut self) -> &mut FastFieldSerializer {
pub fn get_fast_field_serializer(&mut self) -> &mut CompositeFastFieldSerializer {
&mut self.fast_field_serializer
}

View File

@@ -345,8 +345,11 @@ fn write(
let store_write = serializer
.segment_mut()
.open_write(SegmentComponent::Store)?;
let old_store_writer =
std::mem::replace(&mut serializer.store_writer, StoreWriter::new(store_write));
let compressor = serializer.segment().index().settings().docstore_compression;
let old_store_writer = std::mem::replace(
&mut serializer.store_writer,
StoreWriter::new(store_write, compressor),
);
old_store_writer.close()?;
let store_read = StoreReader::open(
serializer
@@ -354,12 +357,9 @@ fn write(
.open_read(SegmentComponent::TempStore)?,
)?;
for old_doc_id in doc_id_map.iter_old_doc_ids() {
let raw_doc = store_read.get_raw(*old_doc_id)?;
serializer
.get_store_writer()
.store_bytes(raw_doc.get_bytes())?;
let doc_bytes = store_read.get_document_bytes(*old_doc_id)?;
serializer.get_store_writer().store_bytes(&doc_bytes)?;
}
// TODO delete temp store
}
serializer.close()?;
Ok(())

View File

@@ -178,7 +178,7 @@ use once_cell::sync::Lazy;
use serde::{Deserialize, Serialize};
/// Index format version.
const INDEX_FORMAT_VERSION: u32 = 3;
const INDEX_FORMAT_VERSION: u32 = 4;
/// Structure version for the index.
#[derive(Clone, PartialEq, Eq, Serialize, Deserialize)]
@@ -187,7 +187,6 @@ pub struct Version {
minor: u32,
patch: u32,
index_format_version: u32,
store_compression: String,
}
impl fmt::Debug for Version {
@@ -201,14 +200,13 @@ static VERSION: Lazy<Version> = Lazy::new(|| Version {
minor: env!("CARGO_PKG_VERSION_MINOR").parse().unwrap(),
patch: env!("CARGO_PKG_VERSION_PATCH").parse().unwrap(),
index_format_version: INDEX_FORMAT_VERSION,
store_compression: crate::store::COMPRESSION.to_string(),
});
impl ToString for Version {
fn to_string(&self) -> String {
format!(
"tantivy v{}.{}.{}, index_format v{}, store_compression: {}",
self.major, self.minor, self.patch, self.index_format_version, self.store_compression
"tantivy v{}.{}.{}, index_format v{}",
self.major, self.minor, self.patch, self.index_format_version
)
}
}
@@ -293,6 +291,7 @@ mod tests {
use crate::collector::tests::TEST_COLLECTOR_WITH_SCORE;
use crate::core::SegmentReader;
use crate::docset::{DocSet, TERMINATED};
use crate::fastfield::FastFieldReader;
use crate::query::BooleanQuery;
use crate::schema::*;
use crate::DocAddress;

View File

@@ -354,18 +354,18 @@ mod bench {
});
}
#[test]
fn test_all_docs_compression_numbits() {
for expected_num_bits in 0u8.. {
let mut data = [0u32; 128];
if expected_num_bits > 0 {
data[0] = (1u64 << (expected_num_bits as usize) - 1) as u32;
}
let mut encoder = BlockEncoder::new();
let (num_bits, compressed) = encoder.compress_block_unsorted(&data);
assert_eq!(compressed.len(), compressed_block_size(num_bits));
}
}
//#[test]
//fn test_all_docs_compression_numbits() {
//for expected_num_bits in 0u8.. {
//let mut data = [0u32; 128];
//if expected_num_bits > 0 {
//data[0] = (1u64 << (expected_num_bits as usize) - 1) as u32;
//}
//let mut encoder = BlockEncoder::new();
//let (num_bits, compressed) = encoder.compress_block_unsorted(&data);
//assert_eq!(compressed.len(), compressed_block_size(num_bits));
//}
//}
const NUM_INTS_BENCH_VINT: usize = 10;

View File

@@ -6,7 +6,7 @@ use crate::query::Weight;
use crate::schema::IndexRecordOption;
use crate::schema::Term;
use crate::Searcher;
use std::collections::BTreeSet;
use std::collections::BTreeMap;
/// The boolean query returns a set of documents
/// that matches the Boolean combination of constituent subqueries.
@@ -159,9 +159,9 @@ impl Query for BooleanQuery {
Ok(Box::new(BooleanWeight::new(sub_weights, scoring_enabled)))
}
fn query_terms(&self, term_set: &mut BTreeSet<Term>) {
fn query_terms(&self, terms: &mut BTreeMap<Term, bool>) {
for (_occur, subquery) in &self.subqueries {
subquery.query_terms(term_set);
subquery.query_terms(terms);
}
}
}

View File

@@ -2,7 +2,7 @@ use crate::fastfield::DeleteBitSet;
use crate::query::explanation::does_not_match;
use crate::query::{Explanation, Query, Scorer, Weight};
use crate::{DocId, DocSet, Score, Searcher, SegmentReader, Term};
use std::collections::BTreeSet;
use std::collections::BTreeMap;
use std::fmt;
/// `BoostQuery` is a wrapper over a query used to boost its score.
@@ -48,8 +48,8 @@ impl Query for BoostQuery {
Ok(boosted_weight)
}
fn query_terms(&self, term_set: &mut BTreeSet<Term>) {
self.query.query_terms(term_set)
fn query_terms(&self, terms: &mut BTreeMap<Term, bool>) {
self.query.query_terms(terms)
}
}

View File

@@ -11,7 +11,7 @@ mod exclude;
mod explanation;
mod fuzzy_query;
mod intersection;
mod mlt;
mod more_like_this;
mod phrase_query;
mod query;
mod query_parser;
@@ -46,7 +46,7 @@ pub use self::explanation::Explanation;
pub(crate) use self::fuzzy_query::DfaWrapper;
pub use self::fuzzy_query::FuzzyTermQuery;
pub use self::intersection::intersect_scorers;
pub use self::mlt::{MoreLikeThisQuery, MoreLikeThisQueryBuilder};
pub use self::more_like_this::{MoreLikeThisQuery, MoreLikeThisQueryBuilder};
pub use self::phrase_query::PhraseQuery;
pub use self::query::{Query, QueryClone};
pub use self::query_parser::QueryParser;
@@ -66,7 +66,7 @@ mod tests {
use crate::schema::{Schema, TEXT};
use crate::Index;
use crate::Term;
use std::collections::BTreeSet;
use std::collections::BTreeMap;
#[test]
fn test_query_terms() {
@@ -78,49 +78,49 @@ mod tests {
let term_a = Term::from_field_text(text_field, "a");
let term_b = Term::from_field_text(text_field, "b");
{
let mut terms_set: BTreeSet<Term> = BTreeSet::new();
let mut terms: BTreeMap<Term, bool> = Default::default();
query_parser
.parse_query("a")
.unwrap()
.query_terms(&mut terms_set);
let terms: Vec<&Term> = terms_set.iter().collect();
assert_eq!(vec![&term_a], terms);
.query_terms(&mut terms);
let terms: Vec<(&Term, &bool)> = terms.iter().collect();
assert_eq!(vec![(&term_a, &false)], terms);
}
{
let mut terms_set: BTreeSet<Term> = BTreeSet::new();
let mut terms: BTreeMap<Term, bool> = Default::default();
query_parser
.parse_query("a b")
.unwrap()
.query_terms(&mut terms_set);
let terms: Vec<&Term> = terms_set.iter().collect();
assert_eq!(vec![&term_a, &term_b], terms);
.query_terms(&mut terms);
let terms: Vec<(&Term, &bool)> = terms.iter().collect();
assert_eq!(vec![(&term_a, &false), (&term_b, &false)], terms);
}
{
let mut terms_set: BTreeSet<Term> = BTreeSet::new();
let mut terms: BTreeMap<Term, bool> = Default::default();
query_parser
.parse_query("\"a b\"")
.unwrap()
.query_terms(&mut terms_set);
let terms: Vec<&Term> = terms_set.iter().collect();
assert_eq!(vec![&term_a, &term_b], terms);
.query_terms(&mut terms);
let terms: Vec<(&Term, &bool)> = terms.iter().collect();
assert_eq!(vec![(&term_a, &true), (&term_b, &true)], terms);
}
{
let mut terms_set: BTreeSet<Term> = BTreeSet::new();
let mut terms: BTreeMap<Term, bool> = Default::default();
query_parser
.parse_query("a a a a a")
.unwrap()
.query_terms(&mut terms_set);
let terms: Vec<&Term> = terms_set.iter().collect();
assert_eq!(vec![&term_a], terms);
.query_terms(&mut terms);
let terms: Vec<(&Term, &bool)> = terms.iter().collect();
assert_eq!(vec![(&term_a, &false)], terms);
}
{
let mut terms_set: BTreeSet<Term> = BTreeSet::new();
let mut terms: BTreeMap<Term, bool> = Default::default();
query_parser
.parse_query("a -b")
.unwrap()
.query_terms(&mut terms_set);
let terms: Vec<&Term> = terms_set.iter().collect();
assert_eq!(vec![&term_a, &term_b], terms);
.query_terms(&mut terms);
let terms: Vec<(&Term, &bool)> = terms.iter().collect();
assert_eq!(vec![(&term_a, &false), (&term_b, &false)], terms);
}
}
}

View File

@@ -1,5 +1,5 @@
mod mlt;
mod more_like_this;
mod query;
pub use self::mlt::MoreLikeThis;
pub use self::more_like_this::MoreLikeThis;
pub use self::query::{MoreLikeThisQuery, MoreLikeThisQueryBuilder};

View File

@@ -233,10 +233,9 @@ impl MoreLikeThis {
}
FieldType::U64(_) => {
for field_value in field_values {
let val = field_value
.value()
.u64_value()
.ok_or(TantivyError::InvalidArgument("invalid value".to_string()))?;
let val = field_value.value().u64_value().ok_or_else(|| {
TantivyError::InvalidArgument("invalid value".to_string())
})?;
if !self.is_noise_word(val.to_string()) {
let term = Term::from_field_u64(field, val);
*term_frequencies.entry(term).or_insert(0) += 1;
@@ -249,7 +248,7 @@ impl MoreLikeThis {
let val = field_value
.value()
.date_value()
.ok_or(TantivyError::InvalidArgument("invalid value".to_string()))?
.ok_or_else(|| TantivyError::InvalidArgument("invalid value".to_string()))?
.timestamp();
if !self.is_noise_word(val.to_string()) {
let term = Term::from_field_i64(field, val);
@@ -259,10 +258,9 @@ impl MoreLikeThis {
}
FieldType::I64(_) => {
for field_value in field_values {
let val = field_value
.value()
.i64_value()
.ok_or(TantivyError::InvalidArgument("invalid value".to_string()))?;
let val = field_value.value().i64_value().ok_or_else(|| {
TantivyError::InvalidArgument("invalid value".to_string())
})?;
if !self.is_noise_word(val.to_string()) {
let term = Term::from_field_i64(field, val);
*term_frequencies.entry(term).or_insert(0) += 1;
@@ -271,10 +269,9 @@ impl MoreLikeThis {
}
FieldType::F64(_) => {
for field_value in field_values {
let val = field_value
.value()
.f64_value()
.ok_or(TantivyError::InvalidArgument("invalid value".to_string()))?;
let val = field_value.value().f64_value().ok_or_else(|| {
TantivyError::InvalidArgument("invalid value".to_string())
})?;
if !self.is_noise_word(val.to_string()) {
let term = Term::from_field_f64(field, val);
*term_frequencies.entry(term).or_insert(0) += 1;
@@ -306,7 +303,7 @@ impl MoreLikeThis {
{
return true;
}
return self.stop_words.contains(&word);
self.stop_words.contains(&word)
}
/// Computes the score for each term while ignoring terms that are not useful
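The `ok_or` → `ok_or_else` changes above avoid allocating the error `String` on the happy path. A minimal standalone sketch of the difference (plain Rust, not tantivy code; `first_even` is illustrative):

```rust
// `ok_or` builds its error argument eagerly, allocating the String even
// when the Option is `Some`; `ok_or_else` defers that work to the `None`
// path via a closure.
fn first_even(values: &[u64]) -> Result<u64, String> {
    values
        .iter()
        .copied()
        .find(|v| v % 2 == 0)
        .ok_or_else(|| "no even value found".to_string())
}

fn main() {
    assert_eq!(first_even(&[1, 4, 5]), Ok(4));
    assert!(first_even(&[1, 3]).is_err());
}
```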

View File

@@ -1,3 +1,5 @@
use std::collections::BTreeMap;
use super::PhraseWeight;
use crate::core::searcher::Searcher;
use crate::query::bm25::Bm25Weight;
@@ -5,7 +7,6 @@ use crate::query::Query;
use crate::query::Weight;
use crate::schema::IndexRecordOption;
use crate::schema::{Field, Term};
use std::collections::BTreeSet;
/// `PhraseQuery` matches a specific sequence of words.
///
@@ -113,9 +114,9 @@ impl Query for PhraseQuery {
Ok(Box::new(phrase_weight))
}
fn query_terms(&self, term_set: &mut BTreeSet<Term>) {
for (_, query_term) in &self.phrase_terms {
term_set.insert(query_term.clone());
fn query_terms(&self, terms: &mut BTreeMap<Term, bool>) {
for (_, term) in &self.phrase_terms {
terms.insert(term.clone(), true);
}
}
}

View File

@@ -4,7 +4,7 @@ use crate::query::Explanation;
use crate::DocAddress;
use crate::Term;
use downcast_rs::impl_downcast;
use std::collections::BTreeSet;
use std::collections::BTreeMap;
use std::fmt;
/// The `Query` trait defines a set of documents and a scoring method
@@ -68,7 +68,10 @@ pub trait Query: QueryClone + Send + Sync + downcast_rs::Downcast + fmt::Debug {
/// Extract all of the terms associated to the query and insert them in the
/// term set given in arguments.
fn query_terms(&self, _term_set: &mut BTreeSet<Term>) {}
///
/// Each term is associated with a boolean indicating whether
/// positions are required or not.
fn query_terms(&self, _term_set: &mut BTreeMap<Term, bool>) {}
}
/// Implements `box_clone`.
@@ -95,8 +98,8 @@ impl Query for Box<dyn Query> {
self.as_ref().count(searcher)
}
fn query_terms(&self, term_set: &mut BTreeSet<Term<Vec<u8>>>) {
self.as_ref().query_terms(term_set);
fn query_terms(&self, terms: &mut BTreeMap<Term, bool>) {
self.as_ref().query_terms(terms);
}
}
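For downstream `Query` implementations, the signature change means inserting each term with a positions-needed flag instead of just adding it to a set. A hypothetical sketch (the `ExactPlusPhraseQuery` struct below is illustrative, not part of the diff):

```rust
use std::collections::BTreeMap;
use tantivy::Term;

// A made-up query holding one plain term and one term that is part of a
// phrase, showing how the new flag is filled in.
struct ExactPlusPhraseQuery {
    exact: Term,
    phrase_term: Term,
}

impl ExactPlusPhraseQuery {
    fn query_terms(&self, terms: &mut BTreeMap<Term, bool>) {
        // `false`: matching this term does not require position data.
        terms.insert(self.exact.clone(), false);
        // `true`: phrase matching needs positions for this term. Note
        // that a later `insert` for the same Term overwrites the earlier
        // flag, as in the `PhraseQuery` implementation above.
        terms.insert(self.phrase_term.clone(), true);
    }
}
```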

View File

@@ -8,7 +8,7 @@ use crate::query::Query;
use crate::query::RangeQuery;
use crate::query::TermQuery;
use crate::query::{AllQuery, BoostQuery};
use crate::schema::{Facet, IndexRecordOption};
use crate::schema::{Facet, FacetParseError, IndexRecordOption};
use crate::schema::{Field, Schema};
use crate::schema::{FieldType, Term};
use crate::tokenizer::TokenizerManager;
@@ -68,6 +68,9 @@ pub enum QueryParserError {
/// The format for the date field is not RFC 3339 compliant.
#[error("The date field has an invalid format")]
DateFormatError(chrono::ParseError),
/// The format for the facet field is invalid.
#[error("The facet field is malformed: {0}")]
FacetFormatError(FacetParseError),
}
impl From<ParseIntError> for QueryParserError {
@@ -88,6 +91,12 @@ impl From<chrono::ParseError> for QueryParserError {
}
}
impl From<FacetParseError> for QueryParserError {
fn from(err: FacetParseError) -> QueryParserError {
QueryParserError::FacetFormatError(err)
}
}
/// Recursively remove empty clause from the AST
///
/// Returns `None` iff the `logical_ast` ended up being empty.
@@ -358,10 +367,10 @@ impl QueryParser {
))
}
}
FieldType::HierarchicalFacet(_) => {
let facet = Facet::from_text(phrase);
Ok(vec![(0, Term::from_field_text(field, facet.encoded_str()))])
}
FieldType::HierarchicalFacet(_) => match Facet::from_text(phrase) {
Ok(facet) => Ok(vec![(0, Term::from_field_text(field, facet.encoded_str()))]),
Err(e) => Err(QueryParserError::from(e)),
},
FieldType::Bytes(_) => {
let bytes = base64::decode(phrase).map_err(QueryParserError::ExpectedBase64)?;
let term = Term::from_field_bytes(field, &bytes);
@@ -1027,6 +1036,19 @@ mod test {
.is_ok());
}
#[test]
pub fn test_query_parser_expected_facet() {
let query_parser = make_query_parser();
match query_parser.parse_query("facet:INVALID") {
Ok(_) => panic!("should never succeed"),
Err(e) => assert_eq!(
"The facet field is malformed: Failed to parse the facet string: 'INVALID'",
format!("{}", e)
),
}
assert!(query_parser.parse_query("facet:\"/foo/bar\"").is_ok());
}
#[test]
pub fn test_query_parser_not_empty_but_no_tokens() {
let query_parser = make_query_parser();

View File

@@ -5,7 +5,7 @@ use crate::query::{Explanation, Query};
use crate::schema::IndexRecordOption;
use crate::Searcher;
use crate::Term;
use std::collections::BTreeSet;
use std::collections::BTreeMap;
use std::fmt;
/// A Term query matches all of the documents
@@ -127,7 +127,7 @@ impl Query for TermQuery {
self.specialized_weight(searcher, scoring_enabled)?,
))
}
fn query_terms(&self, term_set: &mut BTreeSet<Term>) {
term_set.insert(self.term.clone());
fn query_terms(&self, terms: &mut BTreeMap<Term, bool>) {
terms.insert(self.term.clone(), false);
}
}

View File

@@ -20,6 +20,14 @@ pub const FACET_SEP_BYTE: u8 = 0u8;
/// representation of facets. (It is the null codepoint.)
pub const FACET_SEP_CHAR: char = '\u{0}';
/// An error enum for facet parser.
#[derive(Debug, PartialEq, Eq, Error)]
pub enum FacetParseError {
/// The facet text representation is unparsable.
#[error("Failed to parse the facet string: '{0}'")]
FacetParseError(String),
}
/// A Facet represent a point in a given hierarchy.
///
/// They are typically represented similarly to a filepath.
@@ -75,11 +83,47 @@ impl Facet {
/// Conceptually, if one of the steps of this path
/// contains a `/` or a `\`, it should be escaped
/// using an antislash `\`.
pub fn from_text<T>(path: &T) -> Facet
pub fn from_text<T>(path: &T) -> Result<Facet, FacetParseError>
where
T: ?Sized + AsRef<str>,
{
From::from(path)
#[derive(Copy, Clone)]
enum State {
Escaped,
Idle,
}
let path_ref = path.as_ref();
if path_ref.is_empty() {
return Err(FacetParseError::FacetParseError(path_ref.to_string()));
}
if !path_ref.starts_with('/') {
return Err(FacetParseError::FacetParseError(path_ref.to_string()));
}
let mut facet_encoded = String::new();
let mut state = State::Idle;
let path_bytes = path_ref.as_bytes();
let mut last_offset = 1;
for i in 1..path_bytes.len() {
let c = path_bytes[i];
match (state, c) {
(State::Idle, ESCAPE_BYTE) => {
facet_encoded.push_str(&path_ref[last_offset..i]);
last_offset = i + 1;
state = State::Escaped
}
(State::Idle, SLASH_BYTE) => {
facet_encoded.push_str(&path_ref[last_offset..i]);
facet_encoded.push(FACET_SEP_CHAR);
last_offset = i + 1;
}
(State::Escaped, _escaped_char) => {
state = State::Idle;
}
(State::Idle, _any_char) => {}
}
}
facet_encoded.push_str(&path_ref[last_offset..]);
Ok(Facet(facet_encoded))
}
/// Returns a `Facet` from an iterator over the different
@@ -137,39 +181,7 @@ impl Borrow<str> for Facet {
impl<'a, T: ?Sized + AsRef<str>> From<&'a T> for Facet {
fn from(path_asref: &'a T) -> Facet {
#[derive(Copy, Clone)]
enum State {
Escaped,
Idle,
}
let path: &str = path_asref.as_ref();
assert!(!path.is_empty());
assert!(path.starts_with('/'));
let mut facet_encoded = String::new();
let mut state = State::Idle;
let path_bytes = path.as_bytes();
let mut last_offset = 1;
for i in 1..path_bytes.len() {
let c = path_bytes[i];
match (state, c) {
(State::Idle, ESCAPE_BYTE) => {
facet_encoded.push_str(&path[last_offset..i]);
last_offset = i + 1;
state = State::Escaped
}
(State::Idle, SLASH_BYTE) => {
facet_encoded.push_str(&path[last_offset..i]);
facet_encoded.push(FACET_SEP_CHAR);
last_offset = i + 1;
}
(State::Escaped, _escaped_char) => {
state = State::Idle;
}
(State::Idle, _any_char) => {}
}
}
facet_encoded.push_str(&path[last_offset..]);
Facet(facet_encoded)
Facet::from_text(path_asref).unwrap()
}
}
@@ -226,7 +238,7 @@ impl Debug for Facet {
#[cfg(test)]
mod tests {
use super::Facet;
use super::{Facet, FacetParseError};
#[test]
fn test_root() {
@@ -288,4 +300,12 @@ mod tests {
let facet = Facet::from_path(v.iter());
assert_eq!(facet.to_path_string(), "/");
}
#[test]
fn test_from_text() {
assert_eq!(
Err(FacetParseError::FacetParseError("INVALID".to_string())),
Facet::from_text("INVALID")
);
}
}
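Call sites that previously relied on the infallible `From<&str>` conversion now get a `Result`. A small usage sketch under the new API (`facet_or_root` is an illustrative helper):

```rust
use tantivy::schema::Facet;

// Fall back to the root facet when the user-supplied path is malformed
// (a valid path must be non-empty and start with '/').
fn facet_or_root(path: &str) -> Facet {
    Facet::from_text(path).unwrap_or_else(|_| Facet::root())
}

fn main() {
    assert_eq!(facet_or_root("/book/crime").to_path_string(), "/book/crime");
    assert_eq!(facet_or_root("INVALID").to_path_string(), "/");
}
```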

View File

@@ -23,6 +23,15 @@ pub struct FieldEntry {
}
impl FieldEntry {
/// Creates a new field entry given a name and a field type
pub fn new(field_name: String, field_type: FieldType) -> FieldEntry {
assert!(is_valid_field_name(&field_name));
FieldEntry {
name: field_name,
field_type,
}
}
/// Creates a new text field entry in the schema, given
/// a name and some options.
pub fn new_text(field_name: String, text_options: TextOptions) -> FieldEntry {
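A short usage sketch of the new constructor (`FieldType::Str` is shown as one possible field type; the assert inside `new` means an invalid field name panics):

```rust
use tantivy::schema::{FieldEntry, FieldType, TextOptions};

fn main() {
    // Build a field entry directly from a name and a field type.
    let entry = FieldEntry::new(
        "body".to_string(),
        FieldType::Str(TextOptions::default()),
    );
    assert_eq!(entry.name(), "body");
}
```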

View File

@@ -128,6 +128,7 @@ pub use self::schema::{Schema, SchemaBuilder};
pub use self::value::Value;
pub use self::facet::Facet;
pub use self::facet::FacetParseError;
pub(crate) use self::facet::FACET_SEP_BYTE;
pub use self::facet_options::FacetOptions;

View File

@@ -7,7 +7,6 @@ use crate::{Document, Score};
use htmlescape::encode_minimal;
use std::cmp::Ordering;
use std::collections::BTreeMap;
use std::collections::BTreeSet;
use std::ops::Range;
const DEFAULT_MAX_NUM_CHARS: usize = 150;
@@ -239,10 +238,10 @@ impl SnippetGenerator {
query: &dyn Query,
field: Field,
) -> crate::Result<SnippetGenerator> {
let mut terms = BTreeSet::new();
let mut terms = BTreeMap::new();
query.query_terms(&mut terms);
let mut terms_text: BTreeMap<String, Score> = Default::default();
for term in terms {
for (term, _) in terms {
if term.field() != field {
continue;
}

View File

@@ -1,10 +1,6 @@
use std::io;
/// Name of the compression scheme used in the doc store.
///
/// This name is appended to the version string of tantivy.
pub const COMPRESSION: &'static str = "brotli";
#[inline]
pub fn compress(mut uncompressed: &[u8], compressed: &mut Vec<u8>) -> io::Result<()> {
let mut params = brotli::enc::BrotliEncoderParams::default();
params.quality = 5;
@@ -13,6 +9,7 @@ pub fn compress(mut uncompressed: &[u8], compressed: &mut Vec<u8>) -> io::Result
Ok(())
}
#[inline]
pub fn decompress(mut compressed: &[u8], decompressed: &mut Vec<u8>) -> io::Result<()> {
decompressed.clear();
brotli::BrotliDecompress(&mut compressed, decompressed)?;

View File

@@ -1,22 +0,0 @@
use std::io::{self, Read, Write};
/// Name of the compression scheme used in the doc store.
///
/// This name is appended to the version string of tantivy.
pub const COMPRESSION: &str = "lz4";
pub fn compress(uncompressed: &[u8], compressed: &mut Vec<u8>) -> io::Result<()> {
compressed.clear();
let mut encoder = lz4::EncoderBuilder::new().build(compressed)?;
encoder.write_all(&uncompressed)?;
let (_, encoder_result) = encoder.finish();
encoder_result?;
Ok(())
}
pub fn decompress(compressed: &[u8], decompressed: &mut Vec<u8>) -> io::Result<()> {
decompressed.clear();
let mut decoder = lz4::Decoder::new(compressed)?;
decoder.read_to_end(decompressed)?;
Ok(())
}

View File

@@ -2,38 +2,46 @@ use std::io::{self};
use core::convert::TryInto;
use lz4_flex::{compress_into, decompress_into};
/// Name of the compression scheme used in the doc store.
///
/// This name is appended to the version string of tantivy.
pub const COMPRESSION: &str = "lz4_block";
#[inline]
pub fn compress(uncompressed: &[u8], compressed: &mut Vec<u8>) -> io::Result<()> {
compressed.clear();
let maximum_ouput_size = lz4_flex::block::get_maximum_output_size(uncompressed.len());
compressed.reserve(maximum_ouput_size);
compressed.extend_from_slice(&[0, 0, 0, 0]);
compress_into(uncompressed, compressed);
unsafe {
compressed.set_len(maximum_ouput_size + 4);
}
let bytes_written = compress_into(uncompressed, compressed, 4)
.map_err(|err| io::Error::new(io::ErrorKind::InvalidData, err.to_string()))?;
let num_bytes = uncompressed.len() as u32;
compressed[0..4].copy_from_slice(&num_bytes.to_le_bytes());
unsafe {
compressed.set_len(bytes_written + 4);
}
Ok(())
}
#[inline]
pub fn decompress(compressed: &[u8], decompressed: &mut Vec<u8>) -> io::Result<()> {
decompressed.clear();
//the next lz4_flex version will accept a slice as the input parameter,
//which will make this usage much less ugly
let uncompressed_size_bytes: &[u8; 4] = compressed
.get(..4)
.ok_or(io::ErrorKind::InvalidData)?
.try_into()
.unwrap();
let uncompressed_size = u32::from_le_bytes(*uncompressed_size_bytes) as usize;
// reserve more than required, because blocked writes may write out of bounds, will be improved
// with lz4_flex 1.0
decompressed.reserve(uncompressed_size + 4 + 24);
decompressed.reserve(uncompressed_size);
unsafe {
decompressed.set_len(uncompressed_size);
}
decompress_into(&compressed[4..], decompressed)
let bytes_written = decompress_into(&compressed[4..], decompressed, 0)
.map_err(|err| io::Error::new(io::ErrorKind::InvalidData, err.to_string()))?;
if bytes_written != uncompressed_size {
return Err(io::Error::new(
io::ErrorKind::InvalidData,
"doc store block not completely decompressed, data corruption".to_string(),
));
}
Ok(())
}
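The framing convention used above can be illustrated without lz4 itself: a little-endian `u32` carrying the uncompressed length is prepended, so the reader can size its buffer up front and verify that decompression produced exactly that many bytes. A standalone sketch (`frame`/`read_frame` are illustrative names):

```rust
// Prepend the uncompressed length as a 4-byte little-endian header.
fn frame(uncompressed_len: u32, payload: &[u8]) -> Vec<u8> {
    let mut out = Vec::with_capacity(4 + payload.len());
    out.extend_from_slice(&uncompressed_len.to_le_bytes());
    out.extend_from_slice(payload);
    out
}

// Split the header off again and recover the expected length.
fn read_frame(framed: &[u8]) -> (u32, &[u8]) {
    let mut len_bytes = [0u8; 4];
    len_bytes.copy_from_slice(&framed[..4]);
    (u32::from_le_bytes(len_bytes), &framed[4..])
}

fn main() {
    let framed = frame(1024, b"compressed-bytes");
    let (expected_len, payload) = read_frame(&framed);
    assert_eq!(expected_len, 1024);
    assert_eq!(payload, &b"compressed-bytes"[..]);
}
```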

View File

@@ -1,10 +1,6 @@
use std::io::{self, Read, Write};
/// Name of the compression scheme used in the doc store.
///
/// This name is appended to the version string of tantivy.
pub const COMPRESSION: &str = "snappy";
#[inline]
pub fn compress(uncompressed: &[u8], compressed: &mut Vec<u8>) -> io::Result<()> {
compressed.clear();
let mut encoder = snap::write::FrameEncoder::new(compressed);
@@ -13,6 +9,7 @@ pub fn compress(uncompressed: &[u8], compressed: &mut Vec<u8>) -> io::Result<()>
Ok(())
}
#[inline]
pub fn decompress(compressed: &[u8], decompressed: &mut Vec<u8>) -> io::Result<()> {
decompressed.clear();
snap::read::FrameDecoder::new(compressed).read_to_end(decompressed)?;

134
src/store/compressors.rs Normal file
View File

@@ -0,0 +1,134 @@
use serde::{Deserialize, Serialize};
use std::io;
pub trait StoreCompressor {
fn compress(&self, uncompressed: &[u8], compressed: &mut Vec<u8>) -> io::Result<()>;
fn decompress(&self, compressed: &[u8], decompressed: &mut Vec<u8>) -> io::Result<()>;
fn get_compressor_id() -> u8;
}
/// Compressor can be used on `IndexSettings` to choose
/// the compressor used to compress the doc store.
///
/// The default is Lz4Block, but also depends on the enabled feature flags.
#[derive(Clone, Debug, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub enum Compressor {
#[serde(rename = "lz4")]
/// Use the lz4 compressor (block format)
Lz4,
#[serde(rename = "brotli")]
/// Use the brotli compressor
Brotli,
#[serde(rename = "snappy")]
/// Use the snap compressor
Snappy,
}
impl Default for Compressor {
fn default() -> Self {
if cfg!(feature = "lz4-compression") {
Compressor::Lz4
} else if cfg!(feature = "brotli-compression") {
Compressor::Brotli
} else if cfg!(feature = "snappy-compression") {
Compressor::Snappy
} else {
panic!(
"all compressor feature flags like are disabled (e.g. lz4-compression), can't choose default compressor"
);
}
}
}
impl Compressor {
pub(crate) fn from_id(id: u8) -> Compressor {
match id {
1 => Compressor::Lz4,
2 => Compressor::Brotli,
3 => Compressor::Snappy,
_ => panic!("unknown compressor id {:?}", id),
}
}
pub(crate) fn get_id(&self) -> u8 {
match self {
Self::Lz4 => 1,
Self::Brotli => 2,
Self::Snappy => 3,
}
}
#[inline]
pub(crate) fn compress(&self, uncompressed: &[u8], compressed: &mut Vec<u8>) -> io::Result<()> {
match self {
Self::Lz4 => {
#[cfg(feature = "lz4-compression")]
{
super::compression_lz4_block::compress(uncompressed, compressed)
}
#[cfg(not(feature = "lz4-compression"))]
{
panic!("lz4-compression feature flag not activated");
}
}
Self::Brotli => {
#[cfg(feature = "brotli-compression")]
{
super::compression_brotli::compress(uncompressed, compressed)
}
#[cfg(not(feature = "brotli-compression"))]
{
panic!("brotli-compression-compression feature flag not activated");
}
}
Self::Snappy => {
#[cfg(feature = "snappy-compression")]
{
super::compression_snap::compress(uncompressed, compressed)
}
#[cfg(not(feature = "snappy-compression"))]
{
panic!("snappy-compression feature flag not activated");
}
}
}
}
#[inline]
pub(crate) fn decompress(
&self,
compressed: &[u8],
decompressed: &mut Vec<u8>,
) -> io::Result<()> {
match self {
Self::Lz4 => {
#[cfg(feature = "lz4-compression")]
{
super::compression_lz4_block::decompress(compressed, decompressed)
}
#[cfg(not(feature = "lz4-compression"))]
{
panic!("lz4-compression feature flag not activated");
}
}
Self::Brotli => {
#[cfg(feature = "brotli-compression")]
{
super::compression_brotli::decompress(compressed, decompressed)
}
#[cfg(not(feature = "brotli-compression"))]
{
panic!("brotli-compression feature flag not activated");
}
}
Self::Snappy => {
#[cfg(feature = "snappy-compression")]
{
super::compression_snap::decompress(compressed, decompressed)
}
#[cfg(not(feature = "snappy-compression"))]
{
panic!("snappy-compression feature flag not activated");
}
}
}
}
}
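With the compressor now a per-index setting rather than a compile-time choice, picking one looks roughly like this (a sketch assuming the snappy feature flag is enabled):

```rust
use tantivy::store::Compressor;
use tantivy::IndexSettings;

fn main() {
    let settings = IndexSettings {
        docstore_compression: Compressor::Snappy,
        ..Default::default()
    };
    // Every segment serialized under these settings records the
    // compressor id (3 for snappy) in its doc store footer.
    assert_eq!(settings.docstore_compression, Compressor::Snappy);
}
```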

69
src/store/footer.rs Normal file
View File

@@ -0,0 +1,69 @@
use crate::{
common::{BinarySerializable, FixedSize, HasLen},
directory::FileSlice,
store::Compressor,
};
use std::io;
#[derive(Debug, Clone, PartialEq)]
pub struct DocStoreFooter {
pub offset: u64,
pub compressor: Compressor,
}
/// Serialises the footer to a byte array:
/// - offset: 8 bytes
/// - compressor id: 1 byte
/// - reserved for future use: 15 bytes
impl BinarySerializable for DocStoreFooter {
fn serialize<W: io::Write>(&self, writer: &mut W) -> io::Result<()> {
BinarySerializable::serialize(&self.offset, writer)?;
BinarySerializable::serialize(&self.compressor.get_id(), writer)?;
writer.write_all(&[0; 15])?;
Ok(())
}
fn deserialize<R: io::Read>(reader: &mut R) -> io::Result<Self> {
let offset = u64::deserialize(reader)?;
let compressor_id = u8::deserialize(reader)?;
let mut skip_buf = [0; 15];
reader.read_exact(&mut skip_buf)?;
Ok(DocStoreFooter {
offset,
compressor: Compressor::from_id(compressor_id),
})
}
}
impl FixedSize for DocStoreFooter {
const SIZE_IN_BYTES: usize = 24;
}
impl DocStoreFooter {
pub fn new(offset: u64, compressor: Compressor) -> Self {
DocStoreFooter { offset, compressor }
}
pub fn extract_footer(file: FileSlice) -> io::Result<(DocStoreFooter, FileSlice)> {
if file.len() < DocStoreFooter::SIZE_IN_BYTES {
return Err(io::Error::new(
io::ErrorKind::UnexpectedEof,
format!(
"File corrupted. The file is smaller than Footer::SIZE_IN_BYTES (len={}).",
file.len()
),
));
}
let (body, footer_slice) = file.split_from_end(DocStoreFooter::SIZE_IN_BYTES);
let mut footer_bytes = footer_slice.read_bytes()?;
let footer = DocStoreFooter::deserialize(&mut footer_bytes)?;
Ok((footer, body))
}
}
#[test]
fn doc_store_footer_test() {
// This test is just to safeguard changes to the footer.
// When the doc store footer is updated, make sure to also update the serialize/deserialize methods
assert_eq!(core::mem::size_of::<DocStoreFooter>(), 16);
}
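Since the `footer` module is private, here is a standalone sketch of the 24-byte layout it serializes (little-endian is shown for illustration; the actual byte order comes from tantivy's `BinarySerializable`, and the helper names are invented):

```rust
// Layout: 8-byte offset | 1-byte compressor id | 15 reserved zero bytes.
fn serialize_footer(offset: u64, compressor_id: u8) -> [u8; 24] {
    let mut bytes = [0u8; 24];
    bytes[..8].copy_from_slice(&offset.to_le_bytes());
    bytes[8] = compressor_id;
    bytes
}

fn deserialize_footer(bytes: &[u8; 24]) -> (u64, u8) {
    let mut offset_bytes = [0u8; 8];
    offset_bytes.copy_from_slice(&bytes[..8]);
    (u64::from_le_bytes(offset_bytes), bytes[8])
}

fn main() {
    let bytes = serialize_footer(4096, 2); // 2 == brotli
    assert_eq!(deserialize_footer(&bytes), (4096, 2));
}
```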

View File

@@ -33,73 +33,32 @@ and should rely on either
!*/
mod compressors;
mod footer;
mod index;
mod reader;
mod writer;
pub use self::reader::RawDocument;
pub use self::compressors::Compressor;
pub use self::reader::StoreReader;
pub use self::writer::StoreWriter;
// compile_error doesn't scale very well, enum like feature flags would be great to have in Rust
#[cfg(all(feature = "lz4", feature = "brotli"))]
compile_error!("feature `lz4` or `brotli` must not be enabled together.");
#[cfg(all(feature = "lz4_block", feature = "brotli"))]
compile_error!("feature `lz4_block` or `brotli` must not be enabled together.");
#[cfg(all(feature = "lz4_block", feature = "lz4"))]
compile_error!("feature `lz4_block` or `lz4` must not be enabled together.");
#[cfg(all(feature = "lz4_block", feature = "snap"))]
compile_error!("feature `lz4_block` or `snap` must not be enabled together.");
#[cfg(all(feature = "lz4", feature = "snap"))]
compile_error!("feature `lz4` or `snap` must not be enabled together.");
#[cfg(all(feature = "brotli", feature = "snap"))]
compile_error!("feature `brotli` or `snap` must not be enabled together.");
#[cfg(not(any(
feature = "lz4",
feature = "brotli",
feature = "lz4_flex",
feature = "snap"
)))]
compile_error!("all compressors are deactivated via feature-flags, check Cargo.toml for available decompressors.");
#[cfg(feature = "lz4_flex")]
#[cfg(feature = "lz4-compression")]
mod compression_lz4_block;
#[cfg(feature = "lz4_flex")]
pub use self::compression_lz4_block::COMPRESSION;
#[cfg(feature = "lz4_flex")]
use self::compression_lz4_block::{compress, decompress};
#[cfg(feature = "lz4")]
mod compression_lz4;
#[cfg(feature = "lz4")]
pub use self::compression_lz4::COMPRESSION;
#[cfg(feature = "lz4")]
use self::compression_lz4::{compress, decompress};
#[cfg(feature = "brotli")]
#[cfg(feature = "brotli-compression")]
mod compression_brotli;
#[cfg(feature = "brotli")]
pub use self::compression_brotli::COMPRESSION;
#[cfg(feature = "brotli")]
use self::compression_brotli::{compress, decompress};
#[cfg(feature = "snap")]
#[cfg(feature = "snappy-compression")]
mod compression_snap;
#[cfg(feature = "snap")]
pub use self::compression_snap::COMPRESSION;
#[cfg(feature = "snap")]
use self::compression_snap::{compress, decompress};
#[cfg(test)]
pub mod tests {
use futures::executor::block_on;
use super::*;
use crate::schema::{self, FieldValue, TextFieldIndexing};
use crate::fastfield::DeleteBitSet;
use crate::schema::{self, FieldValue, TextFieldIndexing, STORED, TEXT};
use crate::schema::{Document, TextOptions};
use crate::{
directory::{Directory, RamDirectory, WritePtr},
@@ -108,28 +67,31 @@ pub mod tests {
use crate::{schema::Schema, Index};
use std::path::Path;
pub fn write_lorem_ipsum_store(writer: WritePtr, num_docs: usize) -> Schema {
let mut schema_builder = Schema::builder();
let field_body = schema_builder.add_text_field("body", TextOptions::default().set_stored());
let field_title =
schema_builder.add_text_field("title", TextOptions::default().set_stored());
let schema = schema_builder.build();
let lorem = String::from(
"Doc Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed \
const LOREM: &str = "Doc Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed \
do eiusmod tempor incididunt ut labore et dolore magna aliqua. \
Ut enim ad minim veniam, quis nostrud exercitation ullamco \
laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure \
dolor in reprehenderit in voluptate velit esse cillum dolore eu \
fugiat nulla pariatur. Excepteur sint occaecat cupidatat non \
proident, sunt in culpa qui officia deserunt mollit anim id est \
laborum.",
);
laborum.";
pub fn write_lorem_ipsum_store(
writer: WritePtr,
num_docs: usize,
compressor: Compressor,
) -> Schema {
let mut schema_builder = Schema::builder();
let field_body = schema_builder.add_text_field("body", TextOptions::default().set_stored());
let field_title =
schema_builder.add_text_field("title", TextOptions::default().set_stored());
let schema = schema_builder.build();
{
let mut store_writer = StoreWriter::new(writer);
let mut store_writer = StoreWriter::new(writer, compressor);
for i in 0..num_docs {
let mut fields: Vec<FieldValue> = Vec::new();
{
let field_value = FieldValue::new(field_body, From::from(lorem.clone()));
let field_value = FieldValue::new(field_body, From::from(LOREM.to_string()));
fields.push(field_value);
}
{
@@ -146,16 +108,61 @@ pub mod tests {
schema
}
const NUM_DOCS: usize = 1_000;
#[test]
fn test_store() -> crate::Result<()> {
fn test_doc_store_iter_with_delete_bug_1077() -> crate::Result<()> {
// this will cover deletion of the first element in a checkpoint
let deleted_docids = (200..300).collect::<Vec<_>>();
let delete_bitset = DeleteBitSet::for_test(&deleted_docids, NUM_DOCS as u32);
let path = Path::new("store");
let directory = RamDirectory::create();
let store_wrt = directory.open_write(path)?;
let schema = write_lorem_ipsum_store(store_wrt, 1_000);
let schema = write_lorem_ipsum_store(store_wrt, NUM_DOCS, Compressor::Lz4);
let field_title = schema.get_field("title").unwrap();
let store_file = directory.open_read(path)?;
let store = StoreReader::open(store_file)?;
for i in 0..1_000 {
for i in 0..NUM_DOCS as u32 {
assert_eq!(
*store
.get(i)?
.get_first(field_title)
.unwrap()
.text()
.unwrap(),
format!("Doc {}", i)
);
}
for (_, doc) in store.iter(Some(&delete_bitset)).enumerate() {
let doc = doc?;
let title_content = doc.get_first(field_title).unwrap().text().unwrap();
if !title_content.starts_with("Doc ") {
panic!("unexpected title_content {}", title_content);
}
let id = title_content
.strip_prefix("Doc ")
.unwrap()
.parse::<u32>()
.unwrap();
if delete_bitset.is_deleted(id) {
panic!("unexpected deleted document {}", id);
}
}
Ok(())
}
fn test_store(compressor: Compressor) -> crate::Result<()> {
let path = Path::new("store");
let directory = RamDirectory::create();
let store_wrt = directory.open_write(path)?;
let schema = write_lorem_ipsum_store(store_wrt, NUM_DOCS, compressor);
let field_title = schema.get_field("title").unwrap();
let store_file = directory.open_read(path)?;
let store = StoreReader::open(store_file)?;
for i in 0..NUM_DOCS as u32 {
assert_eq!(
*store
.get(i)?
@@ -175,6 +182,22 @@ pub mod tests {
Ok(())
}
#[cfg(feature = "lz4-compression")]
#[test]
fn test_store_lz4_block() -> crate::Result<()> {
test_store(Compressor::Lz4)
}
#[cfg(feature = "snappy-compression")]
#[test]
fn test_store_snap() -> crate::Result<()> {
test_store(Compressor::Snappy)
}
#[cfg(feature = "brotli-compression")]
#[test]
fn test_store_brotli() -> crate::Result<()> {
test_store(Compressor::Brotli)
}
#[test]
fn test_store_with_delete() -> crate::Result<()> {
let mut schema_builder = schema::Schema::builder();
@@ -215,6 +238,108 @@ pub mod tests {
}
Ok(())
}
#[cfg(feature = "snappy-compression")]
#[cfg(feature = "lz4-compression")]
#[test]
fn test_merge_with_changed_compressor() -> crate::Result<()> {
let mut schema_builder = schema::Schema::builder();
let text_field = schema_builder.add_text_field("text_field", TEXT | STORED);
let schema = schema_builder.build();
let index_builder = Index::builder().schema(schema);
let mut index = index_builder.create_in_ram().unwrap();
index.settings_mut().docstore_compression = Compressor::Lz4;
{
let mut index_writer = index.writer_for_tests().unwrap();
// put in enough data to create enough blocks in the doc store to be considered for stacking
for _ in 0..200 {
index_writer.add_document(doc!(text_field=> LOREM));
}
assert!(index_writer.commit().is_ok());
for _ in 0..200 {
index_writer.add_document(doc!(text_field=> LOREM));
}
assert!(index_writer.commit().is_ok());
}
assert_eq!(
index.reader().unwrap().searcher().segment_readers()[0]
.get_store_reader()
.unwrap()
.compressor(),
Compressor::Lz4
);
// Change the compressor; this disables stacking on merging
let index_settings = index.settings_mut();
index_settings.docstore_compression = Compressor::Snappy;
// Merging the segments
{
let segment_ids = index
.searchable_segment_ids()
.expect("Searchable segments failed.");
let mut index_writer = index.writer_for_tests().unwrap();
assert!(block_on(index_writer.merge(&segment_ids)).is_ok());
assert!(index_writer.wait_merging_threads().is_ok());
}
let searcher = index.reader().unwrap().searcher();
assert_eq!(searcher.segment_readers().len(), 1);
let reader = searcher.segment_readers().iter().last().unwrap();
let store = reader.get_store_reader().unwrap();
for doc in store.iter(reader.delete_bitset()).take(50) {
assert_eq!(
*doc?.get_first(text_field).unwrap().text().unwrap(),
LOREM.to_string()
);
}
assert_eq!(store.compressor(), Compressor::Snappy);
Ok(())
}
#[test]
fn test_merge_of_small_segments() -> crate::Result<()> {
let mut schema_builder = schema::Schema::builder();
let text_field = schema_builder.add_text_field("text_field", TEXT | STORED);
let schema = schema_builder.build();
let index_builder = Index::builder().schema(schema);
let index = index_builder.create_in_ram().unwrap();
{
let mut index_writer = index.writer_for_tests().unwrap();
index_writer.add_document(doc!(text_field=> "1"));
assert!(index_writer.commit().is_ok());
index_writer.add_document(doc!(text_field=> "2"));
assert!(index_writer.commit().is_ok());
index_writer.add_document(doc!(text_field=> "3"));
assert!(index_writer.commit().is_ok());
index_writer.add_document(doc!(text_field=> "4"));
assert!(index_writer.commit().is_ok());
index_writer.add_document(doc!(text_field=> "5"));
assert!(index_writer.commit().is_ok());
}
// Merging the segments
{
let segment_ids = index
.searchable_segment_ids()
.expect("Searchable segments failed.");
let mut index_writer = index.writer_for_tests().unwrap();
assert!(block_on(index_writer.merge(&segment_ids)).is_ok());
assert!(index_writer.wait_merging_threads().is_ok());
}
let searcher = index.reader().unwrap().searcher();
assert_eq!(searcher.segment_readers().len(), 1);
let reader = searcher.segment_readers().iter().last().unwrap();
let store = reader.get_store_reader().unwrap();
assert_eq!(store.block_checkpoints().count(), 1);
Ok(())
}
}
#[cfg(all(test, feature = "unstable"))]
@@ -223,6 +348,7 @@ mod bench {
use super::tests::write_lorem_ipsum_store;
use crate::directory::Directory;
use crate::directory::RamDirectory;
use crate::store::Compressor;
use crate::store::StoreReader;
use std::path::Path;
use test::Bencher;
@@ -233,7 +359,11 @@ mod bench {
let directory = RamDirectory::create();
let path = Path::new("store");
b.iter(|| {
write_lorem_ipsum_store(directory.open_write(path).unwrap(), 1_000);
write_lorem_ipsum_store(
directory.open_write(path).unwrap(),
1_000,
Compressor::default(),
);
directory.delete(path).unwrap();
});
}
@@ -242,11 +372,13 @@ mod bench {
fn bench_store_decode(b: &mut Bencher) {
let directory = RamDirectory::create();
let path = Path::new("store");
write_lorem_ipsum_store(directory.open_write(path).unwrap(), 1_000);
write_lorem_ipsum_store(
directory.open_write(path).unwrap(),
1_000,
Compressor::default(),
);
let store_file = directory.open_read(path).unwrap();
let store = StoreReader::open(store_file).unwrap();
b.iter(|| {
store.get(12).unwrap();
});
b.iter(|| store.iter(None).collect::<Vec<_>>());
}
}

View File

@@ -1,26 +1,29 @@
use super::decompress;
use super::index::SkipIndex;
use crate::common::{BinarySerializable, HasLen};
use super::Compressor;
use super::{footer::DocStoreFooter, index::SkipIndex};
use crate::directory::{FileSlice, OwnedBytes};
use crate::schema::Document;
use crate::space_usage::StoreSpaceUsage;
use crate::store::index::Checkpoint;
use crate::DocId;
use crate::{common::VInt, fastfield::DeleteBitSet};
use crate::{
common::{BinarySerializable, HasLen, VInt},
error::DataCorruption,
fastfield::DeleteBitSet,
};
use lru::LruCache;
use std::io;
use std::mem::size_of;
use std::sync::atomic::{AtomicUsize, Ordering};
use std::sync::{Arc, Mutex};
const LRU_CACHE_CAPACITY: usize = 100;
type Block = Arc<Vec<u8>>;
type Block = OwnedBytes;
type BlockCache = Arc<Mutex<LruCache<usize, Block>>>;
/// Reads document off tantivy's [`Store`](./index.html)
pub struct StoreReader {
compressor: Compressor,
data: FileSlice,
cache: BlockCache,
cache_hits: Arc<AtomicUsize>,
@@ -32,11 +35,14 @@ pub struct StoreReader {
impl StoreReader {
/// Opens a store reader
pub fn open(store_file: FileSlice) -> io::Result<StoreReader> {
let (data_file, offset_index_file) = split_file(store_file)?;
let (footer, data_and_offset) = DocStoreFooter::extract_footer(store_file)?;
let (data_file, offset_index_file) = data_and_offset.split(footer.offset as usize);
let index_data = offset_index_file.read_bytes()?;
let space_usage = StoreSpaceUsage::new(data_file.len(), offset_index_file.len());
let skip_index = SkipIndex::open(index_data);
Ok(StoreReader {
compressor: footer.compressor,
data: data_file,
cache: Arc::new(Mutex::new(LruCache::new(LRU_CACHE_CAPACITY))),
cache_hits: Default::default(),
@@ -50,6 +56,10 @@ impl StoreReader {
self.skip_index.checkpoints()
}
pub(crate) fn compressor(&self) -> Compressor {
self.compressor
}
fn block_checkpoint(&self, doc_id: DocId) -> Option<Checkpoint> {
self.skip_index.seek(doc_id)
}
@@ -72,9 +82,10 @@ impl StoreReader {
let compressed_block = self.compressed_block(checkpoint)?;
let mut decompressed_block = vec![];
decompress(compressed_block.as_slice(), &mut decompressed_block)?;
self.compressor
.decompress(compressed_block.as_slice(), &mut decompressed_block)?;
let block = Arc::new(decompressed_block);
let block = OwnedBytes::new(decompressed_block);
self.cache
.lock()
.unwrap()
@@ -93,9 +104,8 @@ impl StoreReader {
/// It should not be called to score documents
/// for instance.
pub fn get(&self, doc_id: DocId) -> crate::Result<Document> {
let raw_doc = self.get_raw(doc_id)?;
let mut cursor = raw_doc.get_bytes();
Ok(Document::deserialize(&mut cursor)?)
let mut doc_bytes = self.get_document_bytes(doc_id)?;
Ok(Document::deserialize(&mut doc_bytes)?)
}
/// Reads raw bytes of a given document. Returns `RawDocument`, which contains the block of a document and its start and end
@@ -106,7 +116,7 @@ impl StoreReader {
/// so accessing docs from the same compressed block should be faster.
/// For that reason a store reader should be kept and reused.
///
pub fn get_raw(&self, doc_id: DocId) -> crate::Result<RawDocument> {
pub fn get_document_bytes(&self, doc_id: DocId) -> crate::Result<OwnedBytes> {
let checkpoint = self.block_checkpoint(doc_id).ok_or_else(|| {
crate::TantivyError::InvalidArgument(format!("Failed to lookup Doc #{}.", doc_id))
})?;
@@ -121,11 +131,7 @@ impl StoreReader {
let doc_length = VInt::deserialize(&mut cursor)?.val() as usize;
let start_pos = cursor_len_before - cursor.len();
let end_pos = cursor_len_before - cursor.len() + doc_length;
Ok(RawDocument {
block,
start_pos,
end_pos,
})
Ok(block.slice(start_pos..end_pos))
}
/// Iterator over all Documents in their order as they are stored in the doc store.
@@ -135,10 +141,9 @@ impl StoreReader {
&'b self,
delete_bitset: Option<&'a DeleteBitSet>,
) -> impl Iterator<Item = crate::Result<Document>> + 'b {
self.iter_raw(delete_bitset).map(|raw_doc| {
let raw_doc = raw_doc?;
let mut cursor = raw_doc.get_bytes();
Ok(Document::deserialize(&mut cursor)?)
self.iter_raw(delete_bitset).map(|doc_bytes_res| {
let mut doc_bytes = doc_bytes_res?;
Ok(Document::deserialize(&mut doc_bytes)?)
})
}
@@ -148,7 +153,7 @@ impl StoreReader {
pub(crate) fn iter_raw<'a: 'b, 'b>(
&'b self,
delete_bitset: Option<&'a DeleteBitSet>,
) -> impl Iterator<Item = crate::Result<RawDocument>> + 'b {
) -> impl Iterator<Item = crate::Result<OwnedBytes>> + 'b {
let last_docid = self
.block_checkpoints()
.last()
@@ -158,48 +163,55 @@ impl StoreReader {
let mut curr_checkpoint = checkpoint_block_iter.next();
let mut curr_block = curr_checkpoint
.as_ref()
.map(|checkpoint| self.read_block(&checkpoint));
.map(|checkpoint| self.read_block(&checkpoint).map_err(|e| e.kind())); // map error in order to enable cloning
let mut block_start_pos = 0;
let mut num_skipped = 0;
let mut reset_block_pos = false;
(0..last_docid)
.filter_map(move |doc_id| {
// filter_map is only used to resolve lifetime issues between the two closures on
// the outer variables
let alive = delete_bitset.map_or(true, |bitset| bitset.is_alive(doc_id));
if !alive {
// we keep the number of skipped documents to move forward in the map block
num_skipped += 1;
}
// check move to next checkpoint
let mut reset_block_pos = false;
if doc_id >= curr_checkpoint.as_ref().unwrap().doc_range.end {
curr_checkpoint = checkpoint_block_iter.next();
curr_block = curr_checkpoint
.as_ref()
.map(|checkpoint| self.read_block(&checkpoint));
.map(|checkpoint| self.read_block(&checkpoint).map_err(|e| e.kind()));
reset_block_pos = true;
num_skipped = 0;
}
let alive = delete_bitset.map_or(true, |bitset| bitset.is_alive(doc_id));
if alive {
let ret = Some((
curr_block.as_ref().unwrap().as_ref().unwrap().clone(), // todo forward errors
num_skipped,
reset_block_pos,
));
let ret = Some((curr_block.clone(), num_skipped, reset_block_pos));
// the map block will move over the num_skipped, so we reset to 0
num_skipped = 0;
reset_block_pos = false;
ret
} else {
// we keep the number of skipped documents to move forward in the map block
num_skipped += 1;
None
}
})
.map(move |(block, num_skipped, reset_block_pos)| {
let block = block
.ok_or_else(|| {
DataCorruption::comment_only(
"the current checkpoint in the doc store iterator is none, this should never happen",
)
})?
.map_err(|error_kind| {
std::io::Error::new(error_kind, "error when reading block in doc store")
})?;
// this flag is set when filter_map moved to the next block
if reset_block_pos {
block_start_pos = 0;
}
let mut cursor = &block[block_start_pos..];
let mut pos = 0;
// move forward 1 doc + num_skipped in block and return length of current doc
let doc_length = loop {
let doc_length = VInt::deserialize(&mut cursor)?.val() as usize;
let num_bytes_read = block[block_start_pos..].len() - cursor.len();
@@ -214,13 +226,9 @@ impl StoreReader {
}
};
let end_pos = block_start_pos + doc_length;
let raw_doc = RawDocument {
block,
start_pos: block_start_pos,
end_pos,
};
let doc_bytes = block.slice(block_start_pos..end_pos);
block_start_pos = end_pos;
Ok(raw_doc)
Ok(doc_bytes)
})
}
@@ -230,31 +238,6 @@ impl StoreReader {
}
}
/// Get the bytes of a serialized `Document` in a decompressed block.
pub struct RawDocument {
/// the block of data containing multiple documents
block: Arc<Vec<u8>>,
/// start position of the document in the block
start_pos: usize,
/// end position of the document in the block
end_pos: usize,
}
impl RawDocument {
/// Get the bytes of a serialized `Document` in a decompressed block.
pub fn get_bytes(&self) -> &[u8] {
&self.block[self.start_pos..self.end_pos]
}
}
fn split_file(data: FileSlice) -> io::Result<(FileSlice, FileSlice)> {
let (data, footer_len_bytes) = data.split_from_end(size_of::<u64>());
let serialized_offset: OwnedBytes = footer_len_bytes.read_bytes()?;
let mut serialized_offset_buf = serialized_offset.as_slice();
let offset = u64::deserialize(&mut serialized_offset_buf)? as usize;
Ok(data.split(offset))
}
#[cfg(test)]
mod tests {
use super::*;
@@ -272,7 +255,7 @@ mod tests {
let directory = RamDirectory::create();
let path = Path::new("store");
let writer = directory.open_write(path)?;
let schema = write_lorem_ipsum_store(writer, 500);
let schema = write_lorem_ipsum_store(writer, 500, Compressor::default());
let title = schema.get_field("title").unwrap();
let store_file = directory.open_read(path)?;
let store = StoreReader::open(store_file)?;
@@ -327,7 +310,7 @@ mod tests {
.unwrap()
.peek_lru()
.map(|(&k, _)| k as usize),
Some(9249)
Some(9210)
);
Ok(())

View File

@@ -1,6 +1,6 @@
use super::compress;
use super::index::SkipIndexBuilder;
use super::StoreReader;
use super::{compressors::Compressor, footer::DocStoreFooter};
use crate::common::CountingWriter;
use crate::common::{BinarySerializable, VInt};
use crate::directory::TerminatingWrite;
@@ -21,6 +21,7 @@ const BLOCK_SIZE: usize = 16_384;
/// The skip list index on the other hand, is built in memory.
///
pub struct StoreWriter {
compressor: Compressor,
doc: DocId,
first_doc_in_block: DocId,
offset_index_writer: SkipIndexBuilder,
@@ -34,8 +35,9 @@ impl StoreWriter {
///
/// The store writer will write blocks on disc as
/// documents are added.
pub fn new(writer: WritePtr) -> StoreWriter {
pub fn new(writer: WritePtr, compressor: Compressor) -> StoreWriter {
StoreWriter {
compressor,
doc: 0,
first_doc_in_block: 0,
offset_index_writer: SkipIndexBuilder::new(),
@@ -45,6 +47,10 @@ impl StoreWriter {
}
}
pub(crate) fn compressor(&self) -> Compressor {
self.compressor
}
/// The memory used (inclusive of children)
pub fn mem_usage(&self) -> usize {
self.intermediary_buffer.capacity() + self.current_block.capacity()
@@ -125,7 +131,8 @@ impl StoreWriter {
fn write_and_compress_block(&mut self) -> io::Result<()> {
assert!(self.doc > 0);
self.intermediary_buffer.clear();
compress(&self.current_block[..], &mut self.intermediary_buffer)?;
self.compressor
.compress(&self.current_block[..], &mut self.intermediary_buffer)?;
let start_offset = self.writer.written_bytes() as usize;
self.writer.write_all(&self.intermediary_buffer)?;
let end_offset = self.writer.written_bytes() as usize;
@@ -147,8 +154,9 @@ impl StoreWriter {
self.write_and_compress_block()?;
}
let header_offset: u64 = self.writer.written_bytes() as u64;
let footer = DocStoreFooter::new(header_offset, self.compressor);
self.offset_index_writer.write(&mut self.writer)?;
header_offset.serialize(&mut self.writer)?;
footer.serialize(&mut self.writer)?;
self.writer.terminate()
}
}
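After `close()`, the doc store file ends with the skip index followed by the fixed-size footer. A standalone sketch of how a reader splits such a file, mirroring `DocStoreFooter::extract_footer` and `StoreReader::open` above (byte order illustrative; `split_store` is an invented helper):

```rust
const FOOTER_LEN: usize = 24;

// Returns the (compressed blocks, skip index, footer) slices.
fn split_store(file: &[u8]) -> (&[u8], &[u8], &[u8]) {
    let (body, footer) = file.split_at(file.len() - FOOTER_LEN);
    let mut offset_bytes = [0u8; 8];
    offset_bytes.copy_from_slice(&footer[..8]);
    let offset = u64::from_le_bytes(offset_bytes) as usize;
    let (blocks, skip_index) = body.split_at(offset);
    (blocks, skip_index, footer)
}

fn main() {
    // Fake store: 5 bytes of blocks, 3 bytes of skip index, then footer.
    let mut file = vec![1u8; 5];
    file.extend_from_slice(&[2u8; 3]);
    file.extend_from_slice(&5u64.to_le_bytes()); // footer.offset = 5
    file.push(1); // compressor id (1 == lz4)
    file.extend_from_slice(&[0u8; 15]); // reserved
    let (blocks, skip_index, footer) = split_store(&file);
    assert_eq!((blocks.len(), skip_index.len(), footer.len()), (5, 3, 24));
}
```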

View File

@@ -78,8 +78,8 @@ pub struct TermStreamer<'a, A = AlwaysMatch>
where
A: Automaton,
{
fst_map: &'a TermDictionary,
stream: Stream<'a, A>,
pub(crate) fst_map: &'a TermDictionary,
pub(crate) stream: Stream<'a, A>,
term_ord: TermOrdinal,
current_key: Vec<u8>,
current_value: TermInfo,

View File

@@ -1,32 +1,11 @@
use crate::postings::TermInfo;
use crate::termdict::TermDictionary;
use crate::termdict::TermOrdinal;
use crate::termdict::TermStreamer;
use std::cmp::Ordering;
use std::collections::BinaryHeap;
pub struct HeapItem<'a> {
pub streamer: TermStreamer<'a>,
pub segment_ord: usize,
}
impl<'a> PartialEq for HeapItem<'a> {
fn eq(&self, other: &Self) -> bool {
self.segment_ord == other.segment_ord
}
}
impl<'a> Eq for HeapItem<'a> {}
impl<'a> PartialOrd for HeapItem<'a> {
fn partial_cmp(&self, other: &HeapItem<'a>) -> Option<Ordering> {
Some(self.cmp(other))
}
}
impl<'a> Ord for HeapItem<'a> {
fn cmp(&self, other: &HeapItem<'a>) -> Ordering {
(&other.streamer.key(), &other.segment_ord).cmp(&(&self.streamer.key(), &self.segment_ord))
}
}
use tantivy_fst::map::OpBuilder;
use tantivy_fst::map::Union;
use tantivy_fst::raw::IndexedValue;
use tantivy_fst::Streamer;
/// Given a list of sorted term streams,
/// returns an iterator over sorted unique terms.
@@ -34,61 +13,50 @@ impl<'a> Ord for HeapItem<'a> {
/// The item yielded is actually a pair with
/// - the term
/// - a slice with the ordinal of the segments containing
/// the terms.
/// the term.
pub struct TermMerger<'a> {
heap: BinaryHeap<HeapItem<'a>>,
current_streamers: Vec<HeapItem<'a>>,
dictionaries: Vec<&'a TermDictionary>,
union: Union<'a>,
current_key: Vec<u8>,
current_segment_and_term_ordinals: Vec<IndexedValue>,
}
impl<'a> TermMerger<'a> {
/// Stream of merged term dictionaries
///
pub fn new(streams: Vec<TermStreamer<'a>>) -> TermMerger<'a> {
let mut op_builder = OpBuilder::new();
let mut dictionaries = vec![];
for streamer in streams {
op_builder.push(streamer.stream);
dictionaries.push(streamer.fst_map);
}
TermMerger {
heap: BinaryHeap::new(),
current_streamers: streams
.into_iter()
.enumerate()
.map(|(ord, streamer)| HeapItem {
streamer,
segment_ord: ord,
})
.collect(),
dictionaries,
union: op_builder.union(),
current_key: vec![],
current_segment_and_term_ordinals: vec![],
}
}
pub(crate) fn matching_segments<'b: 'a>(
&'b self,
) -> impl 'b + Iterator<Item = (usize, TermOrdinal)> {
self.current_streamers
pub fn matching_segments<'b: 'a>(&'b self) -> impl 'b + Iterator<Item = (usize, TermOrdinal)> {
self.current_segment_and_term_ordinals
.iter()
.map(|heap_item| (heap_item.segment_ord, heap_item.streamer.term_ord()))
}
fn advance_segments(&mut self) {
let streamers = &mut self.current_streamers;
let heap = &mut self.heap;
for mut heap_item in streamers.drain(..) {
if heap_item.streamer.advance() {
heap.push(heap_item);
}
}
.map(|iv| (iv.index, iv.value))
}
/// Advance the term iterator to the next term.
/// Returns `true` if there is another term,
/// `false` if there is none.
pub fn advance(&mut self) -> bool {
self.advance_segments();
if let Some(head) = self.heap.pop() {
self.current_streamers.push(head);
while let Some(next_streamer) = self.heap.peek() {
if self.current_streamers[0].streamer.key() != next_streamer.streamer.key() {
break;
}
let next_heap_it = self.heap.pop().unwrap(); // safe : we peeked beforehand
self.current_streamers.push(next_heap_it);
}
if let Some((k, values)) = self.union.next() {
self.current_key.clear();
self.current_key.extend_from_slice(k);
self.current_segment_and_term_ordinals.clear();
self.current_segment_and_term_ordinals
.extend_from_slice(values);
self.current_segment_and_term_ordinals
.sort_by_key(|iv| iv.index);
true
} else {
false
@@ -101,16 +69,85 @@ impl<'a> TermMerger<'a> {
/// iff advance() has been called before
/// and "true" was returned.
pub fn key(&self) -> &[u8] {
self.current_streamers[0].streamer.key()
&self.current_key
}
/// Returns the sorted list of segment ordinals
/// that include the current term.
/// Iterator over (segment ordinal, TermInfo) pairs, sorted by segment ordinal.
///
/// This method may be called
/// iff advance() has been called before
/// and "true" was returned.
pub fn current_kvs(&self) -> &[HeapItem<'a>] {
&self.current_streamers[..]
pub fn current_segment_ordinals_and_term_infos<'b: 'a>(
&'b self,
) -> impl 'b + Iterator<Item = (usize, TermInfo)> {
self.current_segment_and_term_ordinals
.iter()
.map(move |iv| {
(
iv.index,
self.dictionaries[iv.index].term_info_from_ord(iv.value),
)
})
}
}
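The rewrite delegates the k-way merge to tantivy_fst's union: instead of popping a hand-rolled BinaryHeap, advance pulls the next key together with the (stream index, value) pairs of every stream containing it. A plain-Rust model of those semantics, not the fst implementation:

use std::collections::BTreeMap;

// Model of what the fst union yields: each distinct key in sorted order,
// paired with (stream ordinal, value) for every stream that holds the key.
// tantivy_fst's IndexedValue carries exactly this (index, value) pair.
fn union_model(streams: Vec<Vec<(Vec<u8>, u64)>>) -> Vec<(Vec<u8>, Vec<(usize, u64)>)> {
    let mut merged: BTreeMap<Vec<u8>, Vec<(usize, u64)>> = BTreeMap::new();
    for (ord, stream) in streams.into_iter().enumerate() {
        for (key, value) in stream {
            merged.entry(key).or_default().push((ord, value));
        }
    }
    // BTreeMap iteration returns keys in sorted order; each value list is
    // already ordered by stream ordinal because streams were pushed in order.
    merged.into_iter().collect()
}

Pushing the merge into the fst union removes the HeapItem Ord/PartialOrd plumbing entirely; TermMerger::advance only has to copy the yielded key and IndexedValues into its current_key and current_segment_and_term_ordinals buffers.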
#[cfg(all(test, feature = "unstable"))]
mod bench {
use super::TermMerger;
use crate::directory::FileSlice;
use crate::postings::TermInfo;
use crate::termdict::{TermDictionary, TermDictionaryBuilder};
use rand::distributions::Alphanumeric;
use rand::{thread_rng, Rng};
use test::{self, Bencher};
fn make_term_info(term_ord: u64) -> TermInfo {
let offset = |term_ord: u64| (term_ord * 100 + term_ord * term_ord) as usize;
TermInfo {
doc_freq: term_ord as u32,
postings_range: offset(term_ord)..offset(term_ord + 1),
positions_range: offset(term_ord)..offset(term_ord + 1),
}
}
/// Create a dictionary of random strings.
fn rand_dict(num_terms: usize) -> crate::Result<TermDictionary> {
let buffer: Vec<u8> = {
let mut terms = vec![];
for _i in 0..num_terms {
let rand_string: String = thread_rng()
.sample_iter(&Alphanumeric)
.take(thread_rng().gen_range(30..42))
.map(char::from)
.collect();
terms.push(rand_string);
}
terms.sort();
let mut term_dictionary_builder = TermDictionaryBuilder::create(Vec::new())?;
for i in 0..num_terms {
term_dictionary_builder.insert(terms[i].as_bytes(), &make_term_info(i as u64))?;
}
term_dictionary_builder.finish()?
};
let file = FileSlice::from(buffer);
TermDictionary::open(file)
}
#[bench]
fn bench_termmerger(b: &mut Bencher) -> crate::Result<()> {
let dict1 = rand_dict(100_000)?;
let dict2 = rand_dict(100_000)?;
b.iter(|| -> crate::Result<u32> {
let stream1 = dict1.stream()?;
let stream2 = dict2.stream()?;
let mut merger = TermMerger::new(vec![stream1, stream2]);
let mut count = 0;
while merger.advance() {
count += 1;
}
Ok(count)
});
Ok(())
}
}