mirror of
https://github.com/quickwit-oss/tantivy.git
synced 2026-01-07 01:32:53 +00:00
Compare commits
73 Commits
owned-byte
...
revert-109
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
d5a1e8389c | ||
|
|
bb488305c9 | ||
|
|
f05e84f964 | ||
|
|
65546ed22b | ||
|
|
57ae5b27dc | ||
|
|
f9531ec3c9 | ||
|
|
5b54a32563 | ||
|
|
cd049e28bc | ||
|
|
646e41bec4 | ||
|
|
36528c5e83 | ||
|
|
cd169dee23 | ||
|
|
b5cc60f80b | ||
|
|
060b83159a | ||
|
|
a40ff35453 | ||
|
|
268e6bfe6e | ||
|
|
f902440b8b | ||
|
|
77a0902605 | ||
|
|
c889ae10e4 | ||
|
|
0a534c6ee0 | ||
|
|
167d88b449 | ||
|
|
1071ed84f2 | ||
|
|
abb5624af2 | ||
|
|
1d41b96d32 | ||
|
|
ef4665945f | ||
|
|
294cd5fd0b | ||
|
|
f4d271177c | ||
|
|
451538fecf | ||
|
|
e78e0fec59 | ||
|
|
2e639cebf8 | ||
|
|
e296da7ade | ||
|
|
3b3e26c4b8 | ||
|
|
6a4883ac69 | ||
|
|
0ba05df545 | ||
|
|
aa3c4d4029 | ||
|
|
60df629725 | ||
|
|
2570b005ac | ||
|
|
d5212cd19d | ||
|
|
2193d85622 | ||
|
|
dfdbfe9eff | ||
|
|
b999e836b2 | ||
|
|
be2dd41e69 | ||
|
|
483fdb79cc | ||
|
|
aefd0fc907 | ||
|
|
3298d6cb71 | ||
|
|
c02c78ea73 | ||
|
|
6bf4fee1ba | ||
|
|
5209238c1b | ||
|
|
7ef25ec400 | ||
|
|
221e7cbb55 | ||
|
|
873ac1a3ac | ||
|
|
ebe55a7ae1 | ||
|
|
9f32d40b27 | ||
|
|
8ae10a930a | ||
|
|
473a346814 | ||
|
|
3a8a0fe79a | ||
|
|
511dc8f87f | ||
|
|
3901295329 | ||
|
|
f5918c6c74 | ||
|
|
abe6b4baec | ||
|
|
6e4b61154f | ||
|
|
2aad0ced77 | ||
|
|
41ea14840d | ||
|
|
dff0ffd38a | ||
|
|
8d32c3ba3a | ||
|
|
4afba005f9 | ||
|
|
85fb0cc20a | ||
|
|
5ef2d56ec2 | ||
|
|
fd8e5bdf57 | ||
|
|
4f8481a1e4 | ||
|
|
bcd72e5c14 | ||
|
|
249bc6cf72 | ||
|
|
1c0af5765d | ||
|
|
7ba771ed1b |
30
.github/workflows/test.yml
vendored
Normal file
30
.github/workflows/test.yml
vendored
Normal file
@@ -0,0 +1,30 @@
|
||||
name: Rust
|
||||
|
||||
on:
|
||||
push:
|
||||
branches: [ main ]
|
||||
pull_request:
|
||||
branches: [ main ]
|
||||
|
||||
env:
|
||||
CARGO_TERM_COLOR: always
|
||||
|
||||
jobs:
|
||||
build:
|
||||
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v2
|
||||
- name: Build
|
||||
run: cargo build --verbose --workspace
|
||||
- name: Install latest nightly to test also against unstable feature flag
|
||||
uses: actions-rs/toolchain@v1
|
||||
with:
|
||||
toolchain: nightly
|
||||
override: true
|
||||
components: rustfmt
|
||||
- name: Run tests
|
||||
run: cargo test --all-features --verbose --workspace
|
||||
- name: Check Formatting
|
||||
run: cargo fmt --all -- --check
|
||||
16
CHANGELOG.md
16
CHANGELOG.md
@@ -1,3 +1,7 @@
|
||||
Tantivy 0.15.1
|
||||
=========================
|
||||
- Major bugfix. DocStore panics when first block is deleted. (@appaquet) #1077
|
||||
|
||||
Tantivy 0.15.0
|
||||
=========================
|
||||
- API Changes. Using Range instead of (start, end) in the API and internals (`FileSlice`, `OwnedBytes`, `Snippets`, ...)
|
||||
@@ -8,11 +12,19 @@ Tantivy 0.15.0
|
||||
- Bugfix consistent tie break handling in facet's topk (@hardikpnsp) #357
|
||||
- Date field support for range queries (@rihardsk) #516
|
||||
- Added lz4-flex as the default compression scheme in tantivy (@PSeitz) #1009
|
||||
- Renamed a lot of symbols to avoid all uppercasing on acronyms, as per new clippy recommendation. For instance, RAMDirectory -> RamDirectory. (@pmasurel)
|
||||
- Renamed a lot of symbols to avoid all uppercasing on acronyms, as per new clippy recommendation. For instance, RAMDirectory -> RamDirectory. (@fulmicoton)
|
||||
- Simplified positions index format (@fulmicoton) #1022
|
||||
- Moved bitpacking to bitpacker subcrate and add BlockedBitpacker, which bitpacks blocks of 128 elements (@PSeitz) #1030
|
||||
- Added support for more-like-this query in tantivy (@evanxg852000) #1011
|
||||
- Added support for sorting an index, e.g presorting documents in an index by a timestamp field. This can heavily improve performance for certain scenarios, by utilizing the sorted data (Top-n optimizations). #1026
|
||||
- Added support for sorting an index, e.g presorting documents in an index by a timestamp field. This can heavily improve performance for certain scenarios, by utilizing the sorted data (Top-n optimizations)(@PSeitz). #1026
|
||||
- Add iterator over documents in doc store (@PSeitz). #1044
|
||||
- Fix log merge policy (@PSeitz). #1043
|
||||
- Add detection to avoid small doc store blocks on merge (@PSeitz). #1054
|
||||
- Make doc store compression dynamic (@PSeitz). #1060
|
||||
- Switch to json for footer version handling (@PSeitz). #1060
|
||||
- Updated TermMerger implementation to rely on the union feature of the FST (@scampi) #469
|
||||
- Add boolean marking whether position is required in the query_terms API call (@fulmicoton). #1070
|
||||
|
||||
|
||||
Tantivy 0.14.0
|
||||
=========================
|
||||
|
||||
18
Cargo.toml
18
Cargo.toml
@@ -1,6 +1,6 @@
|
||||
[package]
|
||||
name = "tantivy"
|
||||
version = "0.14.0"
|
||||
version = "0.15.1"
|
||||
authors = ["Paul Masurel <paul.masurel@gmail.com>"]
|
||||
license = "MIT"
|
||||
categories = ["database-implementations", "data-structures"]
|
||||
@@ -20,8 +20,7 @@ once_cell = "1.7.2"
|
||||
regex ={ version = "1.5.4", default-features = false, features = ["std"] }
|
||||
tantivy-fst = "0.3"
|
||||
memmap = {version = "0.7", optional=true}
|
||||
lz4_flex = { version = "0.7.5", default-features = false, features = ["checked-decode"], optional = true }
|
||||
lz4 = { version = "1.23.2", optional = true }
|
||||
lz4_flex = { version = "0.8.0", default-features = false, features = ["checked-decode"], optional = true }
|
||||
brotli = { version = "3.3", optional = true }
|
||||
snap = { version = "1.0.5", optional = true }
|
||||
tempfile = { version = "3.2", optional = true }
|
||||
@@ -34,8 +33,10 @@ levenshtein_automata = "0.2"
|
||||
uuid = { version = "0.8.2", features = ["v4", "serde"] }
|
||||
crossbeam = "0.8"
|
||||
futures = { version = "0.3.15", features = ["thread-pool"] }
|
||||
tantivy-query-grammar = { version="0.14.0", path="./query-grammar" }
|
||||
tantivy-query-grammar = { version="0.15.0", path="./query-grammar" }
|
||||
tantivy-bitpacker = { version="0.1", path="./bitpacker" }
|
||||
common = { version="0.1", path="./common" }
|
||||
fastfield_codecs = { version="0.1", path="./fastfield_codecs", default-features = false }
|
||||
stable_deref_trait = "1.2"
|
||||
rust-stemmers = "1.2"
|
||||
downcast-rs = "1.2"
|
||||
@@ -77,18 +78,19 @@ debug-assertions = true
|
||||
overflow-checks = true
|
||||
|
||||
[features]
|
||||
default = ["mmap", "lz4-block-compression" ]
|
||||
default = ["mmap", "lz4-compression" ]
|
||||
mmap = ["fs2", "tempfile", "memmap"]
|
||||
|
||||
brotli-compression = ["brotli"]
|
||||
lz4-compression = ["lz4"]
|
||||
lz4-block-compression = ["lz4_flex"]
|
||||
lz4-compression = ["lz4_flex"]
|
||||
snappy-compression = ["snap"]
|
||||
|
||||
failpoints = ["fail/failpoints"]
|
||||
unstable = [] # useful for benches.
|
||||
wasm-bindgen = ["uuid/wasm-bindgen"]
|
||||
|
||||
[workspace]
|
||||
members = ["query-grammar", "bitpacker"]
|
||||
members = ["query-grammar", "bitpacker", "common", "fastfield_codecs"]
|
||||
|
||||
[badges]
|
||||
travis-ci = { repository = "tantivy-search/tantivy" }
|
||||
|
||||
@@ -18,5 +18,6 @@ install:
|
||||
build: false
|
||||
|
||||
test_script:
|
||||
- REM SET RUST_LOG=tantivy,test & cargo test --all --verbose --no-default-features --features lz4-block-compression --features mmap
|
||||
- REM SET RUST_LOG=tantivy,test & cargo test --all --verbose --no-default-features --features lz4-compression --features mmap
|
||||
- REM SET RUST_LOG=tantivy,test & cargo test test_store --verbose --no-default-features --features lz4-compression --features snappy-compression --features brotli-compression --features mmap
|
||||
- REM SET RUST_BACKTRACE=1 & cargo build --examples
|
||||
|
||||
@@ -2,6 +2,13 @@
|
||||
name = "tantivy-bitpacker"
|
||||
version = "0.1.0"
|
||||
edition = "2018"
|
||||
authors = ["Paul Masurel <paul.masurel@gmail.com>"]
|
||||
license = "MIT"
|
||||
categories = []
|
||||
description = """Tantivy-sub crate: bitpacking"""
|
||||
repository = "https://github.com/tantivy-search/tantivy"
|
||||
keywords = []
|
||||
|
||||
|
||||
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
|
||||
|
||||
|
||||
@@ -17,6 +17,7 @@ impl BitPacker {
|
||||
}
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub fn write<TWrite: io::Write>(
|
||||
&mut self,
|
||||
val: u64,
|
||||
@@ -48,6 +49,7 @@ impl BitPacker {
|
||||
let bytes = self.mini_buffer.to_le_bytes();
|
||||
output.write_all(&bytes[..num_bytes])?;
|
||||
self.mini_buffer_written = 0;
|
||||
self.mini_buffer = 0;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
@@ -60,7 +62,7 @@ impl BitPacker {
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone)]
|
||||
#[derive(Clone, Debug, Default)]
|
||||
pub struct BitUnpacker {
|
||||
num_bits: u64,
|
||||
mask: u64,
|
||||
@@ -79,6 +81,7 @@ impl BitUnpacker {
|
||||
}
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub fn get(&self, idx: u64, data: &[u8]) -> u64 {
|
||||
if self.num_bits == 0 {
|
||||
return 0u64;
|
||||
|
||||
@@ -80,6 +80,7 @@ impl BlockedBitpacker {
|
||||
* std::mem::size_of_val(&self.buffer.get(0).cloned().unwrap_or_default())
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub fn add(&mut self, val: u64) {
|
||||
self.buffer.push(val);
|
||||
if self.buffer.len() == BLOCK_SIZE as usize {
|
||||
@@ -122,6 +123,7 @@ impl BlockedBitpacker {
|
||||
.resize(self.compressed_blocks.len() + 8, 0); // add padding for bitpacker
|
||||
}
|
||||
}
|
||||
#[inline]
|
||||
pub fn get(&self, idx: usize) -> u64 {
|
||||
let metadata_pos = idx / BLOCK_SIZE as usize;
|
||||
let pos_in_block = idx % BLOCK_SIZE as usize;
|
||||
|
||||
12
common/Cargo.toml
Normal file
12
common/Cargo.toml
Normal file
@@ -0,0 +1,12 @@
|
||||
[package]
|
||||
name = "common"
|
||||
version = "0.1.0"
|
||||
authors = ["Paul Masurel <paul@quickwit.io>", "Pascal Seitz <pascal@quickwit.io>"]
|
||||
license = "MIT"
|
||||
edition = "2018"
|
||||
description = "common traits and utility functions used by multiple tantivy subcrates"
|
||||
|
||||
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
|
||||
|
||||
[dependencies]
|
||||
byteorder = "1.4.3"
|
||||
9
common/src/lib.rs
Normal file
9
common/src/lib.rs
Normal file
@@ -0,0 +1,9 @@
|
||||
pub use byteorder::LittleEndian as Endianness;
|
||||
|
||||
mod serialize;
|
||||
mod vint;
|
||||
mod writer;
|
||||
|
||||
pub use serialize::{BinarySerializable, DeserializeFrom, FixedSize};
|
||||
pub use vint::{read_u32_vint, read_u32_vint_no_advance, serialize_vint_u32, write_u32_vint, VInt};
|
||||
pub use writer::{AntiCallToken, CountingWriter, TerminatingWrite};
|
||||
@@ -1,5 +1,5 @@
|
||||
use crate::common::Endianness;
|
||||
use crate::common::VInt;
|
||||
use crate::Endianness;
|
||||
use crate::VInt;
|
||||
use byteorder::{ReadBytesExt, WriteBytesExt};
|
||||
use std::fmt;
|
||||
use std::io;
|
||||
@@ -14,6 +14,20 @@ pub trait BinarySerializable: fmt::Debug + Sized {
|
||||
fn deserialize<R: Read>(reader: &mut R) -> io::Result<Self>;
|
||||
}
|
||||
|
||||
pub trait DeserializeFrom<T: BinarySerializable> {
|
||||
fn deserialize(&mut self) -> io::Result<T>;
|
||||
}
|
||||
|
||||
/// Implement deserialize from &[u8] for all types which implement BinarySerializable.
|
||||
///
|
||||
/// TryFrom would actually be preferable, but not possible because of the orphan
|
||||
/// rules (not completely sure if this could be resolved)
|
||||
impl<T: BinarySerializable> DeserializeFrom<T> for &[u8] {
|
||||
fn deserialize(&mut self) -> io::Result<T> {
|
||||
T::deserialize(self)
|
||||
}
|
||||
}
|
||||
|
||||
/// `FixedSize` marks a `BinarySerializable` as
|
||||
/// always serializing to the same size.
|
||||
pub trait FixedSize: BinarySerializable {
|
||||
@@ -61,6 +75,11 @@ impl<Left: BinarySerializable, Right: BinarySerializable> BinarySerializable for
|
||||
Ok((Left::deserialize(reader)?, Right::deserialize(reader)?))
|
||||
}
|
||||
}
|
||||
impl<Left: BinarySerializable + FixedSize, Right: BinarySerializable + FixedSize> FixedSize
|
||||
for (Left, Right)
|
||||
{
|
||||
const SIZE_IN_BYTES: usize = Left::SIZE_IN_BYTES + Right::SIZE_IN_BYTES;
|
||||
}
|
||||
|
||||
impl BinarySerializable for u32 {
|
||||
fn serialize<W: Write>(&self, writer: &mut W) -> io::Result<()> {
|
||||
@@ -141,6 +160,28 @@ impl FixedSize for u8 {
|
||||
const SIZE_IN_BYTES: usize = 1;
|
||||
}
|
||||
|
||||
impl BinarySerializable for bool {
|
||||
fn serialize<W: Write>(&self, writer: &mut W) -> io::Result<()> {
|
||||
let val = if *self { 1 } else { 0 };
|
||||
writer.write_u8(val)
|
||||
}
|
||||
fn deserialize<R: Read>(reader: &mut R) -> io::Result<bool> {
|
||||
let val = reader.read_u8()?;
|
||||
match val {
|
||||
0 => Ok(false),
|
||||
1 => Ok(true),
|
||||
_ => Err(io::Error::new(
|
||||
io::ErrorKind::InvalidData,
|
||||
"invalid bool value on deserialization, data corrupted",
|
||||
)),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl FixedSize for bool {
|
||||
const SIZE_IN_BYTES: usize = 1;
|
||||
}
|
||||
|
||||
impl BinarySerializable for String {
|
||||
fn serialize<W: Write>(&self, writer: &mut W) -> io::Result<()> {
|
||||
let data: &[u8] = self.as_bytes();
|
||||
@@ -161,9 +202,9 @@ impl BinarySerializable for String {
|
||||
#[cfg(test)]
|
||||
pub mod test {
|
||||
|
||||
use super::VInt;
|
||||
use super::*;
|
||||
use crate::common::VInt;
|
||||
|
||||
use crate::serialize::BinarySerializable;
|
||||
pub fn fixed_size_test<O: BinarySerializable + FixedSize + Default>() {
|
||||
let mut buffer = Vec::new();
|
||||
O::default().serialize(&mut buffer).unwrap();
|
||||
@@ -175,8 +175,8 @@ impl BinarySerializable for VInt {
|
||||
mod tests {
|
||||
|
||||
use super::serialize_vint_u32;
|
||||
use super::BinarySerializable;
|
||||
use super::VInt;
|
||||
use crate::common::BinarySerializable;
|
||||
|
||||
fn aux_test_vint(val: u64) {
|
||||
let mut v = [14u8; 10];
|
||||
@@ -1,7 +1,4 @@
|
||||
use crate::directory::AntiCallToken;
|
||||
use crate::directory::TerminatingWrite;
|
||||
use std::io;
|
||||
use std::io::Write;
|
||||
use std::io::{self, BufWriter, Write};
|
||||
|
||||
pub struct CountingWriter<W> {
|
||||
underlying: W,
|
||||
@@ -16,41 +13,87 @@ impl<W: Write> CountingWriter<W> {
|
||||
}
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub fn written_bytes(&self) -> u64 {
|
||||
self.written_bytes
|
||||
}
|
||||
|
||||
/// Returns the underlying write object.
|
||||
/// Note that this method does not trigger any flushing.
|
||||
#[inline]
|
||||
pub fn finish(self) -> W {
|
||||
self.underlying
|
||||
}
|
||||
}
|
||||
|
||||
impl<W: Write> Write for CountingWriter<W> {
|
||||
#[inline]
|
||||
fn write(&mut self, buf: &[u8]) -> io::Result<usize> {
|
||||
let written_size = self.underlying.write(buf)?;
|
||||
self.written_bytes += written_size as u64;
|
||||
Ok(written_size)
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn write_all(&mut self, buf: &[u8]) -> io::Result<()> {
|
||||
self.underlying.write_all(buf)?;
|
||||
self.written_bytes += buf.len() as u64;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn flush(&mut self) -> io::Result<()> {
|
||||
self.underlying.flush()
|
||||
}
|
||||
}
|
||||
|
||||
impl<W: TerminatingWrite> TerminatingWrite for CountingWriter<W> {
|
||||
#[inline]
|
||||
fn terminate_ref(&mut self, token: AntiCallToken) -> io::Result<()> {
|
||||
self.underlying.terminate_ref(token)
|
||||
}
|
||||
}
|
||||
|
||||
/// Struct used to prevent calling [`terminate_ref`](trait.TerminatingWrite#method.terminate_ref) directly
|
||||
///
|
||||
/// The point is that while the type is public, it cannot be built by anyone
|
||||
/// outside of this module.
|
||||
pub struct AntiCallToken(());
|
||||
|
||||
/// Trait used to indicate when no more writes need to be done on a writer
|
||||
pub trait TerminatingWrite: Write {
|
||||
/// Indicate that the writer will no longer be used. Internally call terminate_ref.
|
||||
fn terminate(mut self) -> io::Result<()>
|
||||
where
|
||||
Self: Sized,
|
||||
{
|
||||
self.terminate_ref(AntiCallToken(()))
|
||||
}
|
||||
|
||||
/// You should implement this function to define custom behavior.
|
||||
/// This function should flush any buffer it may hold.
|
||||
fn terminate_ref(&mut self, _: AntiCallToken) -> io::Result<()>;
|
||||
}
|
||||
|
||||
impl<W: TerminatingWrite + ?Sized> TerminatingWrite for Box<W> {
|
||||
fn terminate_ref(&mut self, token: AntiCallToken) -> io::Result<()> {
|
||||
self.as_mut().terminate_ref(token)
|
||||
}
|
||||
}
|
||||
|
||||
impl<W: TerminatingWrite> TerminatingWrite for BufWriter<W> {
|
||||
fn terminate_ref(&mut self, a: AntiCallToken) -> io::Result<()> {
|
||||
self.flush()?;
|
||||
self.get_mut().terminate_ref(a)
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> TerminatingWrite for &'a mut Vec<u8> {
|
||||
fn terminate_ref(&mut self, _a: AntiCallToken) -> io::Result<()> {
|
||||
self.flush()
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod test {
|
||||
|
||||
@@ -10,7 +10,7 @@
|
||||
// ---
|
||||
// Importing tantivy...
|
||||
use tantivy::collector::{Collector, SegmentCollector};
|
||||
use tantivy::fastfield::FastFieldReader;
|
||||
use tantivy::fastfield::{DynamicFastFieldReader, FastFieldReader};
|
||||
use tantivy::query::QueryParser;
|
||||
use tantivy::schema::Field;
|
||||
use tantivy::schema::{Schema, FAST, INDEXED, TEXT};
|
||||
@@ -98,7 +98,7 @@ impl Collector for StatsCollector {
|
||||
}
|
||||
|
||||
struct StatsSegmentCollector {
|
||||
fast_field_reader: FastFieldReader<u64>,
|
||||
fast_field_reader: DynamicFastFieldReader<u64>,
|
||||
stats: Stats,
|
||||
}
|
||||
|
||||
|
||||
@@ -90,7 +90,7 @@ fn main() -> tantivy::Result<()> {
|
||||
|
||||
let frankenstein_isbn = Term::from_field_text(isbn, "978-9176370711");
|
||||
|
||||
// Oops our frankenstein doc seems mispelled
|
||||
// Oops our frankenstein doc seems misspelled
|
||||
let frankenstein_doc_misspelled = extract_doc_given_isbn(&reader, &frankenstein_isbn)?.unwrap();
|
||||
assert_eq!(
|
||||
schema.to_json(&frankenstein_doc_misspelled),
|
||||
|
||||
@@ -92,7 +92,7 @@ fn main() -> tantivy::Result<()> {
|
||||
|
||||
// Check the reference doc for different ways to create a `Facet` object.
|
||||
{
|
||||
let facet = Facet::from_text("/Felidae/Pantherinae");
|
||||
let facet = Facet::from("/Felidae/Pantherinae");
|
||||
let facet_term = Term::from_facet(classification, &facet);
|
||||
let facet_term_query = TermQuery::new(facet_term, IndexRecordOption::Basic);
|
||||
let mut facet_collector = FacetCollector::for_field(classification);
|
||||
|
||||
25
fastfield_codecs/Cargo.toml
Normal file
25
fastfield_codecs/Cargo.toml
Normal file
@@ -0,0 +1,25 @@
|
||||
[package]
|
||||
name = "fastfield_codecs"
|
||||
version = "0.1.0"
|
||||
authors = ["Pascal Seitz <pascal@quickwit.io>"]
|
||||
license = "MIT"
|
||||
edition = "2018"
|
||||
description = "Fast field codecs used by tantivy"
|
||||
|
||||
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
|
||||
|
||||
[dependencies]
|
||||
common = { path = "../common/" }
|
||||
tantivy-bitpacker = { path = "../bitpacker/" }
|
||||
prettytable-rs = {version="0.8.0", optional= true}
|
||||
#prettytable-rs = {version="0.8.0" }
|
||||
rand = "0.8.3"
|
||||
|
||||
[dev-dependencies]
|
||||
more-asserts = "0.2.1"
|
||||
rand = "0.8.3"
|
||||
|
||||
[features]
|
||||
bin = ["prettytable-rs"]
|
||||
default = ["bin"]
|
||||
|
||||
68
fastfield_codecs/README.md
Normal file
68
fastfield_codecs/README.md
Normal file
@@ -0,0 +1,68 @@
|
||||
|
||||
|
||||
# Fast Field Codecs
|
||||
|
||||
This crate contains various fast field codecs, used to compress/decompress fast field data in tantivy.
|
||||
|
||||
## Contributing
|
||||
|
||||
Contributing is pretty straightforward. Since the bitpacking is the simplest compressor, you can check it for reference.
|
||||
|
||||
A codec needs to implement 2 traits:
|
||||
|
||||
- A reader implementing `FastFieldCodecReader` to read the codec.
|
||||
- A serializer implementing `FastFieldCodecSerializer` for compression estimation and codec name + id.
|
||||
|
||||
### Tests
|
||||
|
||||
Once the traits are implemented, test and benchmark integration is pretty easy (see `test_with_codec_data_sets` and `bench.rs`).
|
||||
|
||||
Make sure to add the codec to the main.rs, which tests the compression ratio and estimation against different data sets. You can run it with:
|
||||
```
|
||||
cargo run --features bin
|
||||
```
|
||||
|
||||
### TODO
|
||||
- Add real world data sets in comparison
|
||||
- Add codec to cover sparse data sets
|
||||
|
||||
|
||||
### Codec Comparison
|
||||
```
|
||||
+----------------------------------+-------------------+------------------------+
|
||||
| | Compression Ratio | Compression Estimation |
|
||||
+----------------------------------+-------------------+------------------------+
|
||||
| Autoincrement | | |
|
||||
+----------------------------------+-------------------+------------------------+
|
||||
| LinearInterpol | 0.000039572664 | 0.000004396963 |
|
||||
+----------------------------------+-------------------+------------------------+
|
||||
| MultiLinearInterpol | 0.1477348 | 0.17275847 |
|
||||
+----------------------------------+-------------------+------------------------+
|
||||
| Bitpacked | 0.28126493 | 0.28125 |
|
||||
+----------------------------------+-------------------+------------------------+
|
||||
| Monotonically increasing concave | | |
|
||||
+----------------------------------+-------------------+------------------------+
|
||||
| LinearInterpol | 0.25003937 | 0.26562938 |
|
||||
+----------------------------------+-------------------+------------------------+
|
||||
| MultiLinearInterpol | 0.190665 | 0.1883836 |
|
||||
+----------------------------------+-------------------+------------------------+
|
||||
| Bitpacked | 0.31251436 | 0.3125 |
|
||||
+----------------------------------+-------------------+------------------------+
|
||||
| Monotonically increasing convex | | |
|
||||
+----------------------------------+-------------------+------------------------+
|
||||
| LinearInterpol | 0.25003937 | 0.28125438 |
|
||||
+----------------------------------+-------------------+------------------------+
|
||||
| MultiLinearInterpol | 0.18676 | 0.2040086 |
|
||||
+----------------------------------+-------------------+------------------------+
|
||||
| Bitpacked | 0.31251436 | 0.3125 |
|
||||
+----------------------------------+-------------------+------------------------+
|
||||
| Almost monotonically increasing | | |
|
||||
+----------------------------------+-------------------+------------------------+
|
||||
| LinearInterpol | 0.14066513 | 0.1562544 |
|
||||
+----------------------------------+-------------------+------------------------+
|
||||
| MultiLinearInterpol | 0.16335973 | 0.17275847 |
|
||||
+----------------------------------+-------------------+------------------------+
|
||||
| Bitpacked | 0.28126493 | 0.28125 |
|
||||
+----------------------------------+-------------------+------------------------+
|
||||
|
||||
```
|
||||
109
fastfield_codecs/benches/bench.rs
Normal file
109
fastfield_codecs/benches/bench.rs
Normal file
@@ -0,0 +1,109 @@
|
||||
#![feature(test)]
|
||||
|
||||
extern crate test;
|
||||
|
||||
#[cfg(test)]
mod tests {
    use fastfield_codecs::{
        bitpacked::{BitpackedFastFieldReader, BitpackedFastFieldSerializer},
        linearinterpol::{LinearInterpolFastFieldReader, LinearInterpolFastFieldSerializer},
        multilinearinterpol::{
            MultiLinearInterpolFastFieldReader, MultiLinearInterpolFastFieldSerializer,
        },
        *,
    };
    use test::{black_box, Bencher};

    /// Builds the benchmark input: a noisy, roughly increasing sequence with a
    /// handful of outliers inserted at fixed positions.
    fn get_data() -> Vec<u64> {
        let mut data: Vec<_> = (100..55000_u64)
            .map(|num| num + rand::random::<u8>() as u64)
            .collect();
        data.push(99_000);
        data.insert(1000, 2000);
        data.insert(2000, 100);
        data.insert(3000, 4100);
        data.insert(4000, 100);
        data.insert(5000, 800);
        data
    }

    /// Positions looked up by the `get` benchmarks.
    ///
    /// Returns the range directly so that no allocation happens inside the
    /// timed closure.
    fn value_iter() -> impl Iterator<Item = u64> {
        0..20_000_u64
    }

    /// Serializes `data` once with `S`, then measures random access through `R`.
    fn bench_get<S: FastFieldCodecSerializer, R: FastFieldCodecReader>(
        b: &mut Bencher,
        data: &[u64],
    ) {
        let mut bytes = vec![];
        S::serialize(
            &mut bytes,
            &data,
            stats_from_vec(data),
            data.iter().cloned(),
            data.iter().cloned(),
        )
        .unwrap();
        let reader = R::open_from_bytes(&bytes).unwrap();
        b.iter(|| {
            for pos in value_iter() {
                // `black_box` keeps the optimizer from discarding the lookup.
                black_box(reader.get_u64(pos, &bytes));
            }
        });
    }

    /// Measures how long `S` takes to serialize `data`.
    fn bench_create<S: FastFieldCodecSerializer>(b: &mut Bencher, data: &[u64]) {
        let mut bytes = vec![];
        b.iter(|| {
            // Reset the buffer so every iteration writes the same amount of
            // data instead of appending to an ever-growing vector.
            bytes.clear();
            S::serialize(
                &mut bytes,
                &data,
                stats_from_vec(data),
                data.iter().cloned(),
                data.iter().cloned(),
            )
            .unwrap();
        });
    }

    #[bench]
    fn bench_fastfield_bitpack_create(b: &mut Bencher) {
        let data: Vec<_> = get_data();
        bench_create::<BitpackedFastFieldSerializer>(b, &data);
    }

    #[bench]
    fn bench_fastfield_linearinterpol_create(b: &mut Bencher) {
        let data: Vec<_> = get_data();
        bench_create::<LinearInterpolFastFieldSerializer>(b, &data);
    }

    #[bench]
    fn bench_fastfield_multilinearinterpol_create(b: &mut Bencher) {
        let data: Vec<_> = get_data();
        bench_create::<MultiLinearInterpolFastFieldSerializer>(b, &data);
    }

    #[bench]
    fn bench_fastfield_bitpack_get(b: &mut Bencher) {
        let data: Vec<_> = get_data();
        bench_get::<BitpackedFastFieldSerializer, BitpackedFastFieldReader>(b, &data);
    }

    #[bench]
    fn bench_fastfield_linearinterpol_get(b: &mut Bencher) {
        let data: Vec<_> = get_data();
        bench_get::<LinearInterpolFastFieldSerializer, LinearInterpolFastFieldReader>(b, &data);
    }

    #[bench]
    fn bench_fastfield_multilinearinterpol_get(b: &mut Bencher) {
        let data: Vec<_> = get_data();
        bench_get::<MultiLinearInterpolFastFieldSerializer, MultiLinearInterpolFastFieldReader>(
            b, &data,
        );
    }

    /// Computes the min/max/count statistics the serializers expect.
    ///
    /// An empty slice yields `0` for both the minimum and the maximum.
    pub fn stats_from_vec(data: &[u64]) -> FastFieldStats {
        let min_value = data.iter().copied().min().unwrap_or(0);
        let max_value = data.iter().copied().max().unwrap_or(0);
        FastFieldStats {
            min_value,
            max_value,
            num_vals: data.len() as u64,
        }
    }
}
|
||||
176
fastfield_codecs/src/bitpacked.rs
Normal file
176
fastfield_codecs/src/bitpacked.rs
Normal file
@@ -0,0 +1,176 @@
|
||||
use crate::FastFieldCodecReader;
|
||||
use crate::FastFieldCodecSerializer;
|
||||
use crate::FastFieldDataAccess;
|
||||
use crate::FastFieldStats;
|
||||
use common::BinarySerializable;
|
||||
use std::io::{self, Write};
|
||||
use tantivy_bitpacker::compute_num_bits;
|
||||
use tantivy_bitpacker::BitPacker;
|
||||
|
||||
use tantivy_bitpacker::BitUnpacker;
|
||||
|
||||
/// Depending on the field type, a different
|
||||
/// fast field is required.
|
||||
#[derive(Clone)]
|
||||
pub struct BitpackedFastFieldReader {
|
||||
bit_unpacker: BitUnpacker,
|
||||
pub min_value_u64: u64,
|
||||
pub max_value_u64: u64,
|
||||
}
|
||||
|
||||
impl<'data> FastFieldCodecReader for BitpackedFastFieldReader {
|
||||
/// Opens a fast field given a file.
|
||||
fn open_from_bytes(bytes: &[u8]) -> io::Result<Self> {
|
||||
let (_data, mut footer) = bytes.split_at(bytes.len() - 16);
|
||||
let min_value = u64::deserialize(&mut footer)?;
|
||||
let amplitude = u64::deserialize(&mut footer)?;
|
||||
let max_value = min_value + amplitude;
|
||||
let num_bits = compute_num_bits(amplitude);
|
||||
let bit_unpacker = BitUnpacker::new(num_bits);
|
||||
Ok(BitpackedFastFieldReader {
|
||||
min_value_u64: min_value,
|
||||
max_value_u64: max_value,
|
||||
bit_unpacker,
|
||||
})
|
||||
}
|
||||
#[inline]
|
||||
fn get_u64(&self, doc: u64, data: &[u8]) -> u64 {
|
||||
self.min_value_u64 + self.bit_unpacker.get(doc, &data)
|
||||
}
|
||||
#[inline]
|
||||
fn min_value(&self) -> u64 {
|
||||
self.min_value_u64
|
||||
}
|
||||
#[inline]
|
||||
fn max_value(&self) -> u64 {
|
||||
self.max_value_u64
|
||||
}
|
||||
}
|
||||
pub struct BitpackedFastFieldSerializerLegacy<'a, W: 'a + Write> {
|
||||
bit_packer: BitPacker,
|
||||
write: &'a mut W,
|
||||
min_value: u64,
|
||||
amplitude: u64,
|
||||
num_bits: u8,
|
||||
}
|
||||
|
||||
impl<'a, W: Write> BitpackedFastFieldSerializerLegacy<'a, W> {
|
||||
/// Creates a new fast field serializer.
|
||||
///
|
||||
/// The serializer in fact encode the values by bitpacking
|
||||
/// `(val - min_value)`.
|
||||
///
|
||||
/// It requires a `min_value` and a `max_value` to compute
|
||||
/// compute the minimum number of bits required to encode
|
||||
/// values.
|
||||
pub fn open(
|
||||
write: &'a mut W,
|
||||
min_value: u64,
|
||||
max_value: u64,
|
||||
) -> io::Result<BitpackedFastFieldSerializerLegacy<'a, W>> {
|
||||
assert!(min_value <= max_value);
|
||||
let amplitude = max_value - min_value;
|
||||
let num_bits = compute_num_bits(amplitude);
|
||||
let bit_packer = BitPacker::new();
|
||||
Ok(BitpackedFastFieldSerializerLegacy {
|
||||
bit_packer,
|
||||
write,
|
||||
min_value,
|
||||
amplitude,
|
||||
num_bits,
|
||||
})
|
||||
}
|
||||
/// Pushes a new value to the currently open u64 fast field.
|
||||
#[inline]
|
||||
pub fn add_val(&mut self, val: u64) -> io::Result<()> {
|
||||
let val_to_write: u64 = val - self.min_value;
|
||||
self.bit_packer
|
||||
.write(val_to_write, self.num_bits, &mut self.write)?;
|
||||
Ok(())
|
||||
}
|
||||
pub fn close_field(mut self) -> io::Result<()> {
|
||||
self.bit_packer.close(&mut self.write)?;
|
||||
self.min_value.serialize(&mut self.write)?;
|
||||
self.amplitude.serialize(&mut self.write)?;
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
pub struct BitpackedFastFieldSerializer {}
|
||||
|
||||
impl FastFieldCodecSerializer for BitpackedFastFieldSerializer {
|
||||
const NAME: &'static str = "Bitpacked";
|
||||
const ID: u8 = 1;
|
||||
/// Serializes data with the BitpackedFastFieldSerializer.
|
||||
///
|
||||
/// The serializer in fact encode the values by bitpacking
|
||||
/// `(val - min_value)`.
|
||||
///
|
||||
/// It requires a `min_value` and a `max_value` to compute
|
||||
/// compute the minimum number of bits required to encode
|
||||
/// values.
|
||||
fn serialize(
|
||||
write: &mut impl Write,
|
||||
_fastfield_accessor: &impl FastFieldDataAccess,
|
||||
stats: FastFieldStats,
|
||||
data_iter: impl Iterator<Item = u64>,
|
||||
_data_iter1: impl Iterator<Item = u64>,
|
||||
) -> io::Result<()> {
|
||||
let mut serializer =
|
||||
BitpackedFastFieldSerializerLegacy::open(write, stats.min_value, stats.max_value)?;
|
||||
|
||||
for val in data_iter {
|
||||
serializer.add_val(val)?;
|
||||
}
|
||||
serializer.close_field()?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
fn is_applicable(
|
||||
_fastfield_accessor: &impl FastFieldDataAccess,
|
||||
_stats: FastFieldStats,
|
||||
) -> bool {
|
||||
true
|
||||
}
|
||||
fn estimate(_fastfield_accessor: &impl FastFieldDataAccess, stats: FastFieldStats) -> f32 {
|
||||
let amplitude = stats.max_value - stats.min_value;
|
||||
let num_bits = compute_num_bits(amplitude);
|
||||
let num_bits_uncompressed = 64;
|
||||
num_bits as f32 / num_bits_uncompressed as f32
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;
    use crate::tests::get_codec_test_data_sets;

    /// Runs the crate-level `create_and_validate` helper (defined outside this
    /// file) with the bitpacked serializer/reader pair.
    fn create_and_validate(data: &[u64], name: &str) {
        type Serializer = BitpackedFastFieldSerializer;
        type Reader = BitpackedFastFieldReader;
        crate::tests::create_and_validate::<Serializer, Reader>(&data, name);
    }

    /// Checks every shared data set, both in its given order and reversed.
    #[test]
    fn test_with_codec_data_sets() {
        for (mut data, name) in get_codec_test_data_sets() {
            create_and_validate(&data, name);
            data.reverse();
            create_and_validate(&data, name);
        }
    }

    /// Checks 500 random data sets of 1 to 256 values, each in both orders.
    #[test]
    fn bitpacked_fast_field_rand() {
        for _ in 0..500 {
            let num_vals = 1 + rand::random::<u8>() as usize;
            let mut data: Vec<u64> = (0..num_vals)
                .map(|_| rand::random::<i64>() as u64 / 2)
                .collect();
            create_and_validate(&data, "rand");

            data.reverse();
            create_and_validate(&data, "rand");
        }
    }
}
|
||||
231
fastfield_codecs/src/lib.rs
Normal file
231
fastfield_codecs/src/lib.rs
Normal file
@@ -0,0 +1,231 @@
|
||||
#[cfg(test)]
|
||||
#[macro_use]
|
||||
extern crate more_asserts;
|
||||
|
||||
use std::io;
|
||||
use std::io::Write;
|
||||
|
||||
pub mod bitpacked;
|
||||
pub mod linearinterpol;
|
||||
pub mod multilinearinterpol;
|
||||
|
||||
/// Read side of a fast field codec.
pub trait FastFieldCodecReader: Sized {
    /// Reads the codec metadata from `bytes` and returns the reader.
    fn open_from_bytes(bytes: &[u8]) -> io::Result<Self>;

    /// Returns the value of document `doc`, decoded from `data`.
    fn get_u64(&self, doc: u64, data: &[u8]) -> u64;

    /// Minimum value, as recorded in the codec metadata.
    fn min_value(&self) -> u64;

    /// Maximum value, as recorded in the codec metadata.
    fn max_value(&self) -> u64;
}
|
||||
|
||||
/// The FastFieldSerializerEstimate trait is required on all variants
|
||||
/// of fast field compressions, to decide which one to choose.
|
||||
pub trait FastFieldCodecSerializer {
|
||||
/// A codex needs to provide a unique name and id, which is
|
||||
/// used for debugging and de/serialization.
|
||||
const NAME: &'static str;
|
||||
const ID: u8;
|
||||
|
||||
/// Check if the Codec is able to compress the data
|
||||
fn is_applicable(fastfield_accessor: &impl FastFieldDataAccess, stats: FastFieldStats) -> bool;
|
||||
|
||||
/// Returns an estimate of the compression ratio.
|
||||
/// The baseline is uncompressed 64bit data.
|
||||
///
|
||||
/// It could make sense to also return a value representing
|
||||
/// computational complexity.
|
||||
fn estimate(fastfield_accessor: &impl FastFieldDataAccess, stats: FastFieldStats) -> f32;
|
||||
|
||||
/// Serializes the data using the serializer into write.
|
||||
/// There are multiple iterators, in case the codec needs to read the data multiple times.
|
||||
/// The iterators should be preferred over using fastfield_accessor for performance reasons.
|
||||
fn serialize(
|
||||
write: &mut impl Write,
|
||||
fastfield_accessor: &impl FastFieldDataAccess,
|
||||
stats: FastFieldStats,
|
||||
data_iter: impl Iterator<Item = u64>,
|
||||
data_iter1: impl Iterator<Item = u64>,
|
||||
) -> io::Result<()>;
|
||||
}
|
||||
|
||||
/// Random access to fast field data during serialization and estimation.
pub trait FastFieldDataAccess {
    /// Returns the value associated with the given document.
    ///
    /// Whenever possible, use the iterator passed to the fast field creation
    /// instead, for performance reasons.
    ///
    /// # Panics
    ///
    /// May panic if `doc` is out of range for the underlying data.
    fn get(&self, doc: u32) -> u64;
}
|
||||
|
||||
/// Summary of a column of values, handed to the codecs for estimation and serialization.
#[derive(Debug, Clone)]
pub struct FastFieldStats {
    /// Smallest value in the column.
    pub min_value: u64,
    /// Largest value in the column.
    pub max_value: u64,
    /// Number of values in the column.
    pub num_vals: u64,
}
|
||||
|
||||
impl<'a> FastFieldDataAccess for &'a [u64] {
|
||||
fn get(&self, doc: u32) -> u64 {
|
||||
self[doc as usize]
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> FastFieldDataAccess for &'a Vec<u64> {
|
||||
fn get(&self, doc: u32) -> u64 {
|
||||
self[doc as usize]
|
||||
}
|
||||
}
|
||||
|
||||
impl FastFieldDataAccess for Vec<u64> {
|
||||
fn get(&self, doc: u32) -> u64 {
|
||||
self[doc as usize]
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;
    use crate::{
        bitpacked::{BitpackedFastFieldReader, BitpackedFastFieldSerializer},
        linearinterpol::{LinearInterpolFastFieldReader, LinearInterpolFastFieldSerializer},
        multilinearinterpol::{
            MultiLinearInterpolFastFieldReader, MultiLinearInterpolFastFieldSerializer,
        },
    };

    /// Serializes `data` with codec `S`, reads it back with `R` and panics on the first
    /// value that does not round-trip.
    ///
    /// Returns `(estimate, actual)`, or `(f32::MAX, 0.0)` when the codec is not applicable.
    /// `actual` is the number of values divided by the number of output bytes.
    // NOTE(review): `actual` is values-per-byte, whereas `estimate` is a size ratio against
    // 64-bit data; the two are not directly comparable. Confirm which one is intended.
    pub fn create_and_validate<S: FastFieldCodecSerializer, R: FastFieldCodecReader>(
        data: &[u64],
        name: &str,
    ) -> (f32, f32) {
        let stats = stats_from_vec(data);
        if !S::is_applicable(&data, stats.clone()) {
            return (f32::MAX, 0.0);
        }
        let estimation = S::estimate(&data, stats.clone());

        let mut out = Vec::new();
        S::serialize(
            &mut out,
            &data,
            stats,
            data.iter().cloned(),
            data.iter().cloned(),
        )
        .unwrap();

        let reader = R::open_from_bytes(&out).unwrap();
        for (doc, &orig_val) in data.iter().enumerate() {
            let val = reader.get_u64(doc as u64, &out);
            assert!(
                val == orig_val,
                "val {:?} does not match orig_val {:?}, in data set {}, data {:?}",
                val,
                orig_val,
                name,
                data
            );
        }

        let actual_compression = data.len() as f32 / out.len() as f32;
        (estimation, actual_compression)
    }

    /// Small named data sets shared by the codec tests.
    pub fn get_codec_test_data_sets() -> Vec<(Vec<u64>, &'static str)> {
        vec![
            (
                (10..=20_u64).collect(),
                "simple monotonically increasing",
            ),
            (
                vec![5, 6, 7, 8, 9, 10, 99, 100],
                "offset in linear interpol",
            ),
            (vec![5, 50, 3, 13, 1, 1000, 35], "rand small"),
            (vec![10], "single value"),
        ]
    }

    /// Runs codec `S`/`R` over all shared data sets and prints estimate vs. actual.
    fn test_codec<S: FastFieldCodecSerializer, R: FastFieldCodecReader>() {
        let codec_name = S::NAME;
        for (data, data_set_name) in get_codec_test_data_sets() {
            let (estimate, actual) = create_and_validate::<S, R>(&data, data_set_name);
            // `f32::MAX` is the marker returned for a codec that is not applicable.
            let result = if estimate == f32::MAX {
                "Disabled".to_string()
            } else {
                format!("Estimate {:?} Actual {:?} ", estimate, actual)
            };
            println!(
                "Codec {}, DataSet {}, {}",
                codec_name, data_set_name, result
            );
        }
    }

    #[test]
    fn test_codec_bitpacking() {
        test_codec::<BitpackedFastFieldSerializer, BitpackedFastFieldReader>();
    }

    #[test]
    fn test_codec_interpolation() {
        test_codec::<LinearInterpolFastFieldSerializer, LinearInterpolFastFieldReader>();
    }

    #[test]
    fn test_codec_multi_interpolation() {
        test_codec::<MultiLinearInterpolFastFieldSerializer, MultiLinearInterpolFastFieldReader>();
    }

    /// Computes min, max and count of `data`; min and max are 0 for empty input.
    pub fn stats_from_vec(data: &[u64]) -> FastFieldStats {
        FastFieldStats {
            min_value: data.iter().cloned().min().unwrap_or(0),
            max_value: data.iter().cloned().max().unwrap_or(0),
            num_vals: data.len() as u64,
        }
    }

    #[test]
    fn estimation_good_interpolation_case() {
        let data: Vec<u64> = (10..=20000_u64).collect();
        let stats = stats_from_vec(&data);

        let linear_interpol_estimation =
            LinearInterpolFastFieldSerializer::estimate(&data, stats.clone());
        assert_le!(linear_interpol_estimation, 0.01);

        let multi_linear_interpol_estimation =
            MultiLinearInterpolFastFieldSerializer::estimate(&data, stats.clone());
        assert_le!(multi_linear_interpol_estimation, 0.2);
        assert_le!(linear_interpol_estimation, multi_linear_interpol_estimation);

        let bitpacked_estimation = BitpackedFastFieldSerializer::estimate(&data, stats);
        assert_le!(linear_interpol_estimation, bitpacked_estimation);
    }

    #[test]
    fn estimation_test_bad_interpolation_case() {
        let data = vec![200, 10, 10, 10, 10, 1000, 20];
        let stats = stats_from_vec(&data);

        let linear_interpol_estimation =
            LinearInterpolFastFieldSerializer::estimate(&data, stats.clone());
        assert_le!(linear_interpol_estimation, 0.32);

        let bitpacked_estimation = BitpackedFastFieldSerializer::estimate(&data, stats);
        assert_le!(bitpacked_estimation, linear_interpol_estimation);
    }

    #[test]
    fn estimation_test_bad_interpolation_case_monotonically_increasing() {
        let mut data: Vec<u64> = (200..=20000_u64).collect();
        data.push(1_000_000);
        let stats = stats_from_vec(&data);

        // In this case linear interpolation can in fact not be worse than bitpacking,
        // but the estimator adds some threshold, which leads to a worse estimate.
        let linear_interpol_estimation =
            LinearInterpolFastFieldSerializer::estimate(&data, stats.clone());
        assert_le!(linear_interpol_estimation, 0.35);

        let bitpacked_estimation = BitpackedFastFieldSerializer::estimate(&data, stats);
        assert_le!(bitpacked_estimation, 0.32);
        assert_le!(bitpacked_estimation, linear_interpol_estimation);
    }
}
|
||||
297
fastfield_codecs/src/linearinterpol.rs
Normal file
297
fastfield_codecs/src/linearinterpol.rs
Normal file
@@ -0,0 +1,297 @@
|
||||
use crate::FastFieldCodecReader;
|
||||
use crate::FastFieldCodecSerializer;
|
||||
use crate::FastFieldDataAccess;
|
||||
use crate::FastFieldStats;
|
||||
use std::io::{self, Read, Write};
|
||||
use std::ops::Sub;
|
||||
use tantivy_bitpacker::compute_num_bits;
|
||||
use tantivy_bitpacker::BitPacker;
|
||||
|
||||
use common::BinarySerializable;
|
||||
use common::FixedSize;
|
||||
use tantivy_bitpacker::BitUnpacker;
|
||||
|
||||
/// Depending on the field type, a different
|
||||
/// fast field is required.
|
||||
#[derive(Clone)]
|
||||
pub struct LinearInterpolFastFieldReader {
|
||||
bit_unpacker: BitUnpacker,
|
||||
pub footer: LinearInterpolFooter,
|
||||
pub slope: f32,
|
||||
}
|
||||
|
||||
/// Metadata appended after the bit-packed data of the linear interpolation codec.
#[derive(Clone, Debug)]
pub struct LinearInterpolFooter {
    /// Largest stored difference (after applying `offset`); determines the bit width.
    pub relative_max_value: u64,
    /// Amount added to every value before subtracting the interpolated value,
    /// so that all stored differences are non-negative.
    pub offset: u64,
    /// Value at position 0.
    pub first_val: u64,
    /// Value at the last position.
    pub last_val: u64,
    /// Number of values.
    pub num_vals: u64,
    /// Minimum value, copied from the stats.
    pub min_value: u64,
    /// Maximum value, copied from the stats.
    pub max_value: u64,
}
|
||||
|
||||
impl BinarySerializable for LinearInterpolFooter {
|
||||
fn serialize<W: Write>(&self, write: &mut W) -> io::Result<()> {
|
||||
self.relative_max_value.serialize(write)?;
|
||||
self.offset.serialize(write)?;
|
||||
self.first_val.serialize(write)?;
|
||||
self.last_val.serialize(write)?;
|
||||
self.num_vals.serialize(write)?;
|
||||
self.min_value.serialize(write)?;
|
||||
self.max_value.serialize(write)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn deserialize<R: Read>(reader: &mut R) -> io::Result<LinearInterpolFooter> {
|
||||
Ok(LinearInterpolFooter {
|
||||
relative_max_value: u64::deserialize(reader)?,
|
||||
offset: u64::deserialize(reader)?,
|
||||
first_val: u64::deserialize(reader)?,
|
||||
last_val: u64::deserialize(reader)?,
|
||||
num_vals: u64::deserialize(reader)?,
|
||||
min_value: u64::deserialize(reader)?,
|
||||
max_value: u64::deserialize(reader)?,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
impl FixedSize for LinearInterpolFooter {
    // Seven `u64` fields of 8 bytes each = 56 bytes.
    const SIZE_IN_BYTES: usize = 7 * std::mem::size_of::<u64>();
}
|
||||
|
||||
impl FastFieldCodecReader for LinearInterpolFastFieldReader {
|
||||
/// Opens a fast field given a file.
|
||||
fn open_from_bytes(bytes: &[u8]) -> io::Result<Self> {
|
||||
let (_data, mut footer) = bytes.split_at(bytes.len() - LinearInterpolFooter::SIZE_IN_BYTES);
|
||||
let footer = LinearInterpolFooter::deserialize(&mut footer)?;
|
||||
let slope = get_slope(footer.first_val, footer.last_val, footer.num_vals);
|
||||
|
||||
let num_bits = compute_num_bits(footer.relative_max_value);
|
||||
let bit_unpacker = BitUnpacker::new(num_bits);
|
||||
Ok(LinearInterpolFastFieldReader {
|
||||
bit_unpacker,
|
||||
footer,
|
||||
slope,
|
||||
})
|
||||
}
|
||||
#[inline]
|
||||
fn get_u64(&self, doc: u64, data: &[u8]) -> u64 {
|
||||
let calculated_value = get_calculated_value(self.footer.first_val, doc, self.slope);
|
||||
(calculated_value + self.bit_unpacker.get(doc, &data)) - self.footer.offset
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn min_value(&self) -> u64 {
|
||||
self.footer.min_value
|
||||
}
|
||||
#[inline]
|
||||
fn max_value(&self) -> u64 {
|
||||
self.footer.max_value
|
||||
}
|
||||
}
|
||||
|
||||
/// Fast field codec ("LinearInterpol", id 2): predicts each value by linear interpolation
/// between the first and the last value and stores the differences bit-packed.
pub struct LinearInterpolFastFieldSerializer {}
|
||||
|
||||
/// Slope of the line from `first_val` (position 0) to `last_val` (position `num_vals - 1`).
///
/// Returns `0.0` when there are fewer than two values.
#[inline]
fn get_slope(first_val: u64, last_val: u64, num_vals: u64) -> f32 {
    match num_vals {
        0 | 1 => 0.0,
        _ => {
            // Computed in f64 and narrowed afterwards, in order to handle very large
            // values like i64::MAX.
            let rise = last_val as f64 - first_val as f64;
            let run = (num_vals - 1) as f64;
            (rise / run) as f32
        }
    }
}
|
||||
|
||||
/// Value predicted for position `pos` on the line starting at `first_val`.
///
/// The step `pos * slope` is converted with `as u64`, so a negative step contributes 0.
#[inline]
fn get_calculated_value(first_val: u64, pos: u64, slope: f32) -> u64 {
    let step = (pos as f32 * slope) as u64;
    first_val + step
}
|
||||
|
||||
impl FastFieldCodecSerializer for LinearInterpolFastFieldSerializer {
|
||||
const NAME: &'static str = "LinearInterpol";
|
||||
const ID: u8 = 2;
|
||||
/// Creates a new fast field serializer.
|
||||
fn serialize(
|
||||
write: &mut impl Write,
|
||||
fastfield_accessor: &impl FastFieldDataAccess,
|
||||
stats: FastFieldStats,
|
||||
data_iter: impl Iterator<Item = u64>,
|
||||
data_iter1: impl Iterator<Item = u64>,
|
||||
) -> io::Result<()> {
|
||||
assert!(stats.min_value <= stats.max_value);
|
||||
|
||||
let first_val = fastfield_accessor.get(0);
|
||||
let last_val = fastfield_accessor.get(stats.num_vals as u32 - 1);
|
||||
let slope = get_slope(first_val, last_val, stats.num_vals);
|
||||
// calculate offset to ensure all values are positive
|
||||
let mut offset = 0;
|
||||
let mut rel_positive_max = 0;
|
||||
for (pos, actual_value) in data_iter1.enumerate() {
|
||||
let calculated_value = get_calculated_value(first_val, pos as u64, slope);
|
||||
if calculated_value > actual_value {
|
||||
// negative value we need to apply an offset
|
||||
// we ignore negative values in the max value calculation, because negative values
|
||||
// will be offset to 0
|
||||
offset = offset.max(calculated_value - actual_value);
|
||||
} else {
|
||||
//positive value no offset reuqired
|
||||
rel_positive_max = rel_positive_max.max(actual_value - calculated_value);
|
||||
}
|
||||
}
|
||||
|
||||
// rel_positive_max will be adjusted by offset
|
||||
let relative_max_value = rel_positive_max + offset;
|
||||
|
||||
let num_bits = compute_num_bits(relative_max_value);
|
||||
let mut bit_packer = BitPacker::new();
|
||||
for (pos, val) in data_iter.enumerate() {
|
||||
let calculated_value = get_calculated_value(first_val, pos as u64, slope);
|
||||
let diff = (val + offset) - calculated_value;
|
||||
bit_packer.write(diff, num_bits, write)?;
|
||||
}
|
||||
bit_packer.close(write)?;
|
||||
|
||||
let footer = LinearInterpolFooter {
|
||||
relative_max_value,
|
||||
offset,
|
||||
first_val,
|
||||
last_val,
|
||||
num_vals: stats.num_vals,
|
||||
min_value: stats.min_value,
|
||||
max_value: stats.max_value,
|
||||
};
|
||||
footer.serialize(write)?;
|
||||
Ok(())
|
||||
}
|
||||
fn is_applicable(
|
||||
_fastfield_accessor: &impl FastFieldDataAccess,
|
||||
stats: FastFieldStats,
|
||||
) -> bool {
|
||||
if stats.num_vals < 3 {
|
||||
return false; //disable compressor for this case
|
||||
}
|
||||
// On serialisation the offset is added to the actual value.
|
||||
// We need to make sure this won't run into overflow calculation issues.
|
||||
// For this we take the maximum theroretical offset and add this to the max value.
|
||||
// If this doesn't overflow the algortihm should be fine
|
||||
let theorethical_maximum_offset = stats.max_value - stats.min_value;
|
||||
if stats
|
||||
.max_value
|
||||
.checked_add(theorethical_maximum_offset)
|
||||
.is_none()
|
||||
{
|
||||
return false;
|
||||
}
|
||||
true
|
||||
}
|
||||
/// estimation for linear interpolation is hard because, you don't know
|
||||
/// where the local maxima for the deviation of the calculated value are and
|
||||
/// the offset to shift all values to >=0 is also unknown.
|
||||
fn estimate(fastfield_accessor: &impl FastFieldDataAccess, stats: FastFieldStats) -> f32 {
|
||||
let first_val = fastfield_accessor.get(0);
|
||||
let last_val = fastfield_accessor.get(stats.num_vals as u32 - 1);
|
||||
let slope = get_slope(first_val, last_val, stats.num_vals);
|
||||
|
||||
// let's sample at 0%, 5%, 10% .. 95%, 100%
|
||||
let num_vals = stats.num_vals as f32 / 100.0;
|
||||
let sample_positions = (0..20)
|
||||
.map(|pos| (num_vals * pos as f32 * 5.0) as usize)
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
let max_distance = sample_positions
|
||||
.iter()
|
||||
.map(|pos| {
|
||||
let calculated_value = get_calculated_value(first_val, *pos as u64, slope);
|
||||
let actual_value = fastfield_accessor.get(*pos as u32);
|
||||
distance(calculated_value, actual_value)
|
||||
})
|
||||
.max()
|
||||
.unwrap_or(0);
|
||||
|
||||
// the theory would be that we don't have the actual max_distance, but we are close within 50%
|
||||
// threshold.
|
||||
// It is multiplied by 2 because in a log case scenario the line would be as much above as
|
||||
// below. So the offset would = max_distance
|
||||
//
|
||||
let relative_max_value = (max_distance as f32 * 1.5) * 2.0;
|
||||
|
||||
let num_bits = compute_num_bits(relative_max_value as u64) as u64 * stats.num_vals as u64
|
||||
+ LinearInterpolFooter::SIZE_IN_BYTES as u64;
|
||||
let num_bits_uncompressed = 64 * stats.num_vals;
|
||||
num_bits as f32 / num_bits_uncompressed as f32
|
||||
}
|
||||
}
|
||||
|
||||
/// Absolute difference between `x` and `y`, computed without underflow.
#[inline]
fn distance<T: Sub<Output = T> + Ord>(x: T, y: T) -> T {
    let (larger, smaller) = if x < y { (y, x) } else { (x, y) };
    larger - smaller
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;
    use crate::tests::get_codec_test_data_sets;

    /// Round-trips `data` through the linear interpolation codec and checks every value.
    fn create_and_validate(data: &[u64], name: &str) {
        crate::tests::create_and_validate::<
            LinearInterpolFastFieldSerializer,
            LinearInterpolFastFieldReader,
        >(data, name);
    }

    /// Every shared data set must round-trip, both as given and reversed.
    #[test]
    fn test_with_codec_data_sets() {
        for (data, name) in get_codec_test_data_sets() {
            create_and_validate(&data, name);
            let reversed: Vec<u64> = data.into_iter().rev().collect();
            create_and_validate(&reversed, name);
        }
    }

    #[test]
    fn linear_interpol_fast_field_test_large_amplitude() {
        let half = i64::MAX as u64 / 2;
        let third = i64::MAX as u64 / 3;
        create_and_validate(&[half, third, half], "large amplitude");
    }

    #[test]
    fn linear_interpol_fast_concave_data() {
        create_and_validate(&[0, 1, 2, 5, 8, 10, 20, 50], "concave data");
    }

    #[test]
    fn linear_interpol_fast_convex_data() {
        create_and_validate(&[0, 40, 60, 70, 75, 77], "convex data");
    }

    #[test]
    fn linear_interpol_fast_field_test_simple() {
        let data: Vec<u64> = (10..=20_u64).collect();
        create_and_validate(&data, "simple monotonically");
    }

    /// Random columns of 50 arbitrary `u64` values, as generated and reversed.
    #[test]
    fn linear_interpol_fast_field_rand() {
        for _ in 0..5000 {
            let data: Vec<u64> = std::iter::repeat_with(rand::random::<u64>)
                .take(50)
                .collect();
            create_and_validate(&data, "random");

            let reversed: Vec<u64> = data.into_iter().rev().collect();
            create_and_validate(&reversed, "random");
        }
    }
}
|
||||
126
fastfield_codecs/src/main.rs
Normal file
126
fastfield_codecs/src/main.rs
Normal file
@@ -0,0 +1,126 @@
|
||||
#[macro_use]
|
||||
extern crate prettytable;
|
||||
use fastfield_codecs::{
|
||||
linearinterpol::LinearInterpolFastFieldSerializer,
|
||||
multilinearinterpol::MultiLinearInterpolFastFieldSerializer, FastFieldCodecSerializer,
|
||||
FastFieldStats,
|
||||
};
|
||||
use prettytable::{Cell, Row, Table};
|
||||
|
||||
fn main() {
|
||||
let mut table = Table::new();
|
||||
|
||||
// Add a row per time
|
||||
table.add_row(row!["", "Compression Ratio", "Compression Estimation"]);
|
||||
|
||||
for (data, data_set_name) in get_codec_test_data_sets() {
|
||||
let mut results = vec![];
|
||||
let res = serialize_with_codec::<LinearInterpolFastFieldSerializer>(&data);
|
||||
results.push(res);
|
||||
let res = serialize_with_codec::<MultiLinearInterpolFastFieldSerializer>(&data);
|
||||
results.push(res);
|
||||
let res = serialize_with_codec::<fastfield_codecs::bitpacked::BitpackedFastFieldSerializer>(
|
||||
&data,
|
||||
);
|
||||
results.push(res);
|
||||
|
||||
//let best_estimation_codec = results
|
||||
//.iter()
|
||||
//.min_by(|res1, res2| res1.partial_cmp(&res2).unwrap())
|
||||
//.unwrap();
|
||||
let best_compression_ratio_codec = results
|
||||
.iter()
|
||||
.min_by(|res1, res2| res1.partial_cmp(&res2).unwrap())
|
||||
.cloned()
|
||||
.unwrap();
|
||||
|
||||
table.add_row(Row::new(vec![Cell::new(data_set_name).style_spec("Bbb")]));
|
||||
for (is_applicable, est, comp, name) in results {
|
||||
let (est_cell, ratio_cell) = if !is_applicable {
|
||||
("Codec Disabled".to_string(), "".to_string())
|
||||
} else {
|
||||
(est.to_string(), comp.to_string())
|
||||
};
|
||||
let style = if comp == best_compression_ratio_codec.1 {
|
||||
"Fb"
|
||||
} else {
|
||||
""
|
||||
};
|
||||
|
||||
table.add_row(Row::new(vec![
|
||||
Cell::new(&name.to_string()).style_spec("bFg"),
|
||||
Cell::new(&ratio_cell).style_spec(style),
|
||||
Cell::new(&est_cell).style_spec(""),
|
||||
]));
|
||||
}
|
||||
}
|
||||
|
||||
table.printstd();
|
||||
}
|
||||
|
||||
pub fn get_codec_test_data_sets() -> Vec<(Vec<u64>, &'static str)> {
|
||||
let mut data_and_names = vec![];
|
||||
|
||||
let data = (1000..=200_000_u64).collect::<Vec<_>>();
|
||||
data_and_names.push((data, "Autoincrement"));
|
||||
|
||||
let mut current_cumulative = 0;
|
||||
let data = (1..=200_000_u64)
|
||||
.map(|num| {
|
||||
let num = (num as f32 + num as f32).log10() as u64;
|
||||
current_cumulative += num;
|
||||
current_cumulative
|
||||
})
|
||||
.collect::<Vec<_>>();
|
||||
//let data = (1..=200000_u64).map(|num| num + num).collect::<Vec<_>>();
|
||||
data_and_names.push((data, "Monotonically increasing concave"));
|
||||
|
||||
let mut current_cumulative = 0;
|
||||
let data = (1..=200_000_u64)
|
||||
.map(|num| {
|
||||
let num = (200_000.0 - num as f32).log10() as u64;
|
||||
current_cumulative += num;
|
||||
current_cumulative
|
||||
})
|
||||
.collect::<Vec<_>>();
|
||||
data_and_names.push((data, "Monotonically increasing convex"));
|
||||
|
||||
let data = (1000..=200_000_u64)
|
||||
.map(|num| num + rand::random::<u8>() as u64)
|
||||
.collect::<Vec<_>>();
|
||||
data_and_names.push((data, "Almost monotonically increasing"));
|
||||
|
||||
data_and_names
|
||||
}
|
||||
|
||||
pub fn serialize_with_codec<S: FastFieldCodecSerializer>(
|
||||
data: &[u64],
|
||||
) -> (bool, f32, f32, &'static str) {
|
||||
let is_applicable = S::is_applicable(&data, stats_from_vec(&data));
|
||||
if !is_applicable {
|
||||
return (false, 0.0, 0.0, S::NAME);
|
||||
}
|
||||
let estimation = S::estimate(&data, stats_from_vec(&data));
|
||||
let mut out = vec![];
|
||||
S::serialize(
|
||||
&mut out,
|
||||
&data,
|
||||
stats_from_vec(&data),
|
||||
data.iter().cloned(),
|
||||
data.iter().cloned(),
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
let actual_compression = out.len() as f32 / (data.len() * 8) as f32;
|
||||
return (true, estimation, actual_compression, S::NAME);
|
||||
}
|
||||
|
||||
pub fn stats_from_vec(data: &[u64]) -> FastFieldStats {
|
||||
let min_value = data.iter().cloned().min().unwrap_or(0);
|
||||
let max_value = data.iter().cloned().max().unwrap_or(0);
|
||||
FastFieldStats {
|
||||
min_value,
|
||||
max_value,
|
||||
num_vals: data.len() as u64,
|
||||
}
|
||||
}
|
||||
410
fastfield_codecs/src/multilinearinterpol.rs
Normal file
410
fastfield_codecs/src/multilinearinterpol.rs
Normal file
@@ -0,0 +1,410 @@
|
||||
use crate::FastFieldCodecReader;
|
||||
use crate::FastFieldCodecSerializer;
|
||||
use crate::FastFieldDataAccess;
|
||||
use crate::FastFieldStats;
|
||||
use common::CountingWriter;
|
||||
use std::io::{self, Read, Write};
|
||||
use std::ops::Sub;
|
||||
use tantivy_bitpacker::compute_num_bits;
|
||||
use tantivy_bitpacker::BitPacker;
|
||||
|
||||
use common::BinarySerializable;
|
||||
use common::DeserializeFrom;
|
||||
use tantivy_bitpacker::BitUnpacker;
|
||||
|
||||
/// Number of positions covered by one interpolation function; function number `i`
/// starts at position `CHUNK_SIZE * i`.
const CHUNK_SIZE: u64 = 512;
|
||||
|
||||
/// Depending on the field type, a different
|
||||
/// fast field is required.
|
||||
#[derive(Clone)]
|
||||
pub struct MultiLinearInterpolFastFieldReader {
|
||||
pub footer: MultiLinearInterpolFooter,
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, Default)]
|
||||
struct Function {
|
||||
// The offset in the data is required, because we have diffrent bit_widths per block
|
||||
data_start_offset: u64,
|
||||
// start_pos in the block will be CHUNK_SIZE * BLOCK_NUM
|
||||
start_pos: u64,
|
||||
// only used during serialization, 0 after deserialization
|
||||
end_pos: u64,
|
||||
// only used during serialization, 0 after deserialization
|
||||
value_start_pos: u64,
|
||||
// only used during serialization, 0 after deserialization
|
||||
value_end_pos: u64,
|
||||
slope: f32,
|
||||
// The offset so that all values are positive when writing them
|
||||
positive_val_offset: u64,
|
||||
num_bits: u8,
|
||||
bit_unpacker: BitUnpacker,
|
||||
}
|
||||
|
||||
impl Function {
|
||||
fn calc_slope(&mut self) {
|
||||
let num_vals = self.end_pos - self.start_pos;
|
||||
get_slope(self.value_start_pos, self.value_end_pos, num_vals);
|
||||
}
|
||||
// split the interpolation into two function, change self and return the second split
|
||||
fn split(&mut self, split_pos: u64, split_pos_value: u64) -> Function {
|
||||
let mut new_function = Function {
|
||||
start_pos: split_pos,
|
||||
end_pos: self.end_pos,
|
||||
value_start_pos: split_pos_value,
|
||||
value_end_pos: self.value_end_pos,
|
||||
..Default::default()
|
||||
};
|
||||
new_function.calc_slope();
|
||||
self.end_pos = split_pos;
|
||||
self.value_end_pos = split_pos_value;
|
||||
self.calc_slope();
|
||||
new_function
|
||||
}
|
||||
}
|
||||
|
||||
impl BinarySerializable for Function {
|
||||
fn serialize<W: Write>(&self, write: &mut W) -> io::Result<()> {
|
||||
self.data_start_offset.serialize(write)?;
|
||||
self.value_start_pos.serialize(write)?;
|
||||
self.positive_val_offset.serialize(write)?;
|
||||
self.slope.serialize(write)?;
|
||||
self.num_bits.serialize(write)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn deserialize<R: Read>(reader: &mut R) -> io::Result<Function> {
|
||||
let data_start_offset = u64::deserialize(reader)?;
|
||||
let value_start_pos = u64::deserialize(reader)?;
|
||||
let offset = u64::deserialize(reader)?;
|
||||
let slope = f32::deserialize(reader)?;
|
||||
let num_bits = u8::deserialize(reader)?;
|
||||
let interpolation = Function {
|
||||
data_start_offset,
|
||||
value_start_pos,
|
||||
positive_val_offset: offset,
|
||||
num_bits,
|
||||
bit_unpacker: BitUnpacker::new(num_bits),
|
||||
slope,
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
Ok(interpolation)
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug)]
|
||||
pub struct MultiLinearInterpolFooter {
|
||||
pub num_vals: u64,
|
||||
pub min_value: u64,
|
||||
pub max_value: u64,
|
||||
interpolations: Vec<Function>,
|
||||
}
|
||||
|
||||
impl BinarySerializable for MultiLinearInterpolFooter {
|
||||
fn serialize<W: Write>(&self, write: &mut W) -> io::Result<()> {
|
||||
let mut out = vec![];
|
||||
self.num_vals.serialize(&mut out)?;
|
||||
self.min_value.serialize(&mut out)?;
|
||||
self.max_value.serialize(&mut out)?;
|
||||
self.interpolations.serialize(&mut out)?;
|
||||
write.write_all(&out)?;
|
||||
(out.len() as u32).serialize(write)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn deserialize<R: Read>(reader: &mut R) -> io::Result<MultiLinearInterpolFooter> {
|
||||
let mut footer = MultiLinearInterpolFooter {
|
||||
num_vals: u64::deserialize(reader)?,
|
||||
min_value: u64::deserialize(reader)?,
|
||||
max_value: u64::deserialize(reader)?,
|
||||
interpolations: Vec::<Function>::deserialize(reader)?,
|
||||
};
|
||||
for (num, interpol) in footer.interpolations.iter_mut().enumerate() {
|
||||
interpol.start_pos = CHUNK_SIZE * num as u64;
|
||||
}
|
||||
Ok(footer)
|
||||
}
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn get_interpolation_position(doc: u64) -> usize {
|
||||
let index = doc / CHUNK_SIZE;
|
||||
index as usize
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn get_interpolation_function(doc: u64, interpolations: &[Function]) -> &Function {
|
||||
&interpolations[get_interpolation_position(doc)]
|
||||
}
|
||||
|
||||
impl FastFieldCodecReader for MultiLinearInterpolFastFieldReader {
|
||||
/// Opens a fast field given a file.
|
||||
fn open_from_bytes(bytes: &[u8]) -> io::Result<Self> {
|
||||
let footer_len: u32 = (&bytes[bytes.len() - 4..]).deserialize()?;
|
||||
|
||||
let (_data, mut footer) = bytes.split_at(bytes.len() - (4 + footer_len) as usize);
|
||||
let footer = MultiLinearInterpolFooter::deserialize(&mut footer)?;
|
||||
|
||||
Ok(MultiLinearInterpolFastFieldReader { footer })
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn get_u64(&self, doc: u64, data: &[u8]) -> u64 {
|
||||
let interpolation = get_interpolation_function(doc, &self.footer.interpolations);
|
||||
let doc = doc - interpolation.start_pos;
|
||||
let calculated_value =
|
||||
get_calculated_value(interpolation.value_start_pos, doc, interpolation.slope);
|
||||
let diff = interpolation
|
||||
.bit_unpacker
|
||||
.get(doc, &data[interpolation.data_start_offset as usize..]);
|
||||
(calculated_value + diff) - interpolation.positive_val_offset
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn min_value(&self) -> u64 {
|
||||
self.footer.min_value
|
||||
}
|
||||
#[inline]
|
||||
fn max_value(&self) -> u64 {
|
||||
self.footer.max_value
|
||||
}
|
||||
}
|
||||
|
||||
/// Slope of the line from `first_val` to `last_val` over `num_vals` values.
///
/// Returns `0.0` when there are fewer than two values. Without this guard a single
/// value divides by zero (yielding NaN) and zero values underflow `num_vals - 1`.
#[inline]
fn get_slope(first_val: u64, last_val: u64, num_vals: u64) -> f32 {
    if num_vals <= 1 {
        return 0.0;
    }
    // Computed in f64 and narrowed afterwards, to keep precision for very large values.
    ((last_val as f64 - first_val as f64) / (num_vals - 1) as f64) as f32
}
|
||||
|
||||
/// Value predicted for position `pos` on the line starting at `first_val`.
///
/// The step is computed as a signed integer, so negative slopes lower the result.
#[inline]
fn get_calculated_value(first_val: u64, pos: u64, slope: f32) -> u64 {
    let step = (pos as f32 * slope) as i64;
    (first_val as i64 + step) as u64
}
|
||||
|
||||
/// Like `LinearInterpolFastFieldSerializer`, but interpolating separately over chunks of
/// `CHUNK_SIZE` elements.
pub struct MultiLinearInterpolFastFieldSerializer {}
|
||||
|
||||
impl FastFieldCodecSerializer for MultiLinearInterpolFastFieldSerializer {
|
||||
const NAME: &'static str = "MultiLinearInterpol";
|
||||
const ID: u8 = 3;
|
||||
/// Creates a new fast field serializer.
|
||||
fn serialize(
|
||||
write: &mut impl Write,
|
||||
fastfield_accessor: &impl FastFieldDataAccess,
|
||||
stats: FastFieldStats,
|
||||
data_iter: impl Iterator<Item = u64>,
|
||||
_data_iter1: impl Iterator<Item = u64>,
|
||||
) -> io::Result<()> {
|
||||
assert!(stats.min_value <= stats.max_value);
|
||||
|
||||
let first_val = fastfield_accessor.get(0);
|
||||
let last_val = fastfield_accessor.get(stats.num_vals as u32 - 1);
|
||||
|
||||
let mut first_function = Function {
|
||||
end_pos: stats.num_vals,
|
||||
value_start_pos: first_val,
|
||||
value_end_pos: last_val,
|
||||
..Default::default()
|
||||
};
|
||||
first_function.calc_slope();
|
||||
let mut interpolations = vec![first_function];
|
||||
|
||||
// Since we potentially apply multiple passes over the data, the data is cached.
|
||||
// Multiple iteration can be expensive (merge with index sorting can add lot of overhead per
|
||||
// iteration)
|
||||
let data = data_iter.collect::<Vec<_>>();
|
||||
|
||||
//// let's split this into chunks of CHUNK_SIZE
|
||||
for data_pos in (0..data.len() as u64).step_by(CHUNK_SIZE as usize).skip(1) {
|
||||
let new_fun = {
|
||||
let current_interpolation = interpolations.last_mut().unwrap();
|
||||
current_interpolation.split(data_pos, data[data_pos as usize])
|
||||
};
|
||||
interpolations.push(new_fun);
|
||||
}
|
||||
// calculate offset and max (-> numbits) for each function
|
||||
for interpolation in &mut interpolations {
|
||||
let mut offset = 0;
|
||||
let mut rel_positive_max = 0;
|
||||
for (pos, actual_value) in data
|
||||
[interpolation.start_pos as usize..interpolation.end_pos as usize]
|
||||
.iter()
|
||||
.cloned()
|
||||
.enumerate()
|
||||
{
|
||||
let calculated_value = get_calculated_value(
|
||||
interpolation.value_start_pos,
|
||||
pos as u64,
|
||||
interpolation.slope,
|
||||
);
|
||||
if calculated_value > actual_value {
|
||||
// negative value we need to apply an offset
|
||||
// we ignore negative values in the max value calculation, because negative values
|
||||
// will be offset to 0
|
||||
offset = offset.max(calculated_value - actual_value);
|
||||
} else {
|
||||
//positive value no offset reuqired
|
||||
rel_positive_max = rel_positive_max.max(actual_value - calculated_value);
|
||||
}
|
||||
}
|
||||
|
||||
interpolation.positive_val_offset = offset;
|
||||
interpolation.num_bits = compute_num_bits(rel_positive_max + offset);
|
||||
}
|
||||
let mut bit_packer = BitPacker::new();
|
||||
|
||||
let write = &mut CountingWriter::wrap(write);
|
||||
for interpolation in &mut interpolations {
|
||||
interpolation.data_start_offset = write.written_bytes();
|
||||
let num_bits = interpolation.num_bits;
|
||||
for (pos, actual_value) in data
|
||||
[interpolation.start_pos as usize..interpolation.end_pos as usize]
|
||||
.iter()
|
||||
.cloned()
|
||||
.enumerate()
|
||||
{
|
||||
let calculated_value = get_calculated_value(
|
||||
interpolation.value_start_pos,
|
||||
pos as u64,
|
||||
interpolation.slope,
|
||||
);
|
||||
let diff = (actual_value + interpolation.positive_val_offset) - calculated_value;
|
||||
bit_packer.write(diff, num_bits, write)?;
|
||||
}
|
||||
bit_packer.flush(write)?;
|
||||
}
|
||||
bit_packer.close(write)?;
|
||||
|
||||
let footer = MultiLinearInterpolFooter {
|
||||
num_vals: stats.num_vals,
|
||||
min_value: stats.min_value,
|
||||
max_value: stats.max_value,
|
||||
interpolations,
|
||||
};
|
||||
footer.serialize(write)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn is_applicable(
|
||||
_fastfield_accessor: &impl FastFieldDataAccess,
|
||||
stats: FastFieldStats,
|
||||
) -> bool {
|
||||
if stats.num_vals < 5_000 {
|
||||
return false;
|
||||
}
|
||||
// On serialization the offset is added to the actual value.
|
||||
// We need to make sure this won't run into overflow calculation issues.
|
||||
// For this we take the maximum theroretical offset and add this to the max value.
|
||||
// If this doesn't overflow the algortihm should be fine
|
||||
let theorethical_maximum_offset = stats.max_value - stats.min_value;
|
||||
if stats
|
||||
.max_value
|
||||
.checked_add(theorethical_maximum_offset)
|
||||
.is_none()
|
||||
{
|
||||
return false;
|
||||
}
|
||||
true
|
||||
}
|
||||
/// estimation for linear interpolation is hard because, you don't know
|
||||
/// where the local maxima are for the deviation of the calculated value and
|
||||
/// the offset is also unknown.
|
||||
fn estimate(fastfield_accessor: &impl FastFieldDataAccess, stats: FastFieldStats) -> f32 {
|
||||
let first_val_in_first_block = fastfield_accessor.get(0);
|
||||
let last_elem_in_first_chunk = CHUNK_SIZE.min(stats.num_vals);
|
||||
let last_val_in_first_block = fastfield_accessor.get(last_elem_in_first_chunk as u32 - 1);
|
||||
let slope = get_slope(
|
||||
first_val_in_first_block,
|
||||
last_val_in_first_block,
|
||||
stats.num_vals,
|
||||
);
|
||||
|
||||
// let's sample at 0%, 5%, 10% .. 95%, 100%, but for the first block only
|
||||
let sample_positions = (0..20)
|
||||
.map(|pos| (last_elem_in_first_chunk as f32 / 100.0 * pos as f32 * 5.0) as usize)
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
let max_distance = sample_positions
|
||||
.iter()
|
||||
.map(|pos| {
|
||||
let calculated_value =
|
||||
get_calculated_value(first_val_in_first_block, *pos as u64, slope);
|
||||
let actual_value = fastfield_accessor.get(*pos as u32);
|
||||
distance(calculated_value, actual_value)
|
||||
})
|
||||
.max()
|
||||
.unwrap();
|
||||
|
||||
// Estimate one block and extrapolate the cost to all blocks.
|
||||
// the theory would be that we don't have the actual max_distance, but we are close within 50%
|
||||
// threshold.
|
||||
// It is multiplied by 2 because in a log case scenario the line would be as much above as
|
||||
// below. So the offset would = max_distance
|
||||
//
|
||||
let relative_max_value = (max_distance as f32 * 1.5) * 2.0;
|
||||
|
||||
let num_bits = compute_num_bits(relative_max_value as u64) as u64 * stats.num_vals as u64
|
||||
// function metadata per block
|
||||
+ 29 * (stats.num_vals / CHUNK_SIZE);
|
||||
let num_bits_uncompressed = 64 * stats.num_vals;
|
||||
num_bits as f32 / num_bits_uncompressed as f32
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns the absolute difference between `x` and `y`.
///
/// The smaller operand is always subtracted from the larger one, so the subtraction
/// cannot go below zero for unsigned types.
fn distance<T: Sub<Output = T> + Ord>(x: T, y: T) -> T {
    if x >= y {
        return x - y;
    }
    y - x
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;
    use crate::tests::get_codec_test_data_sets;

    /// Runs the shared `crate::tests::create_and_validate` helper with the serializer and
    /// reader of the multi linear interpolation codec.
    fn create_and_validate(data: &[u64], name: &str) {
        crate::tests::create_and_validate::<
            MultiLinearInterpolFastFieldSerializer,
            MultiLinearInterpolFastFieldReader,
        >(&data, name);
    }

    /// Every shared codec data set is checked in its original and in reversed order.
    #[test]
    fn test_with_codec_data_sets() {
        for (mut values, name) in get_codec_test_data_sets() {
            create_and_validate(&values, name);
            values.reverse();
            create_and_validate(&values, name);
        }
    }

    /// Strictly increasing values with a constant step.
    #[test]
    fn test_simple() {
        let values: Vec<u64> = (10..=20).collect();
        create_and_validate(&values, "simple monotonically");
    }

    /// 1024 consecutive values.
    #[test]
    fn border_cases_1() {
        let values: Vec<u64> = (0..1024).collect();
        create_and_validate(&values, "border case");
    }

    /// 1025 consecutive values.
    #[test]
    fn border_case_2() {
        let values: Vec<u64> = (0..1025).collect();
        create_and_validate(&values, "border case");
    }

    /// 15_000 random values, checked in both orders, repeated ten times.
    #[test]
    fn rand() {
        for _ in 0..10 {
            let mut values: Vec<u64> = (5_000..20_000).map(|_| rand::random::<u64>()).collect();
            create_and_validate(&values, "random");

            values.reverse();
            create_and_validate(&values, "random");
        }
    }
}
|
||||
@@ -1,6 +1,6 @@
|
||||
[package]
|
||||
name = "tantivy-query-grammar"
|
||||
version = "0.14.0"
|
||||
version = "0.15.0"
|
||||
authors = ["Paul Masurel <paul.masurel@gmail.com>"]
|
||||
license = "MIT"
|
||||
categories = ["database-implementations", "data-structures"]
|
||||
|
||||
@@ -539,10 +539,10 @@ mod tests {
|
||||
let index = Index::create_in_ram(schema);
|
||||
let mut index_writer = index.writer_for_tests().unwrap();
|
||||
index_writer.add_document(doc!(
|
||||
facet_field => Facet::from_text(&"/subjects/A/a"),
|
||||
facet_field => Facet::from_text(&"/subjects/B/a"),
|
||||
facet_field => Facet::from_text(&"/subjects/A/b"),
|
||||
facet_field => Facet::from_text(&"/subjects/B/b"),
|
||||
facet_field => Facet::from_text(&"/subjects/A/a").unwrap(),
|
||||
facet_field => Facet::from_text(&"/subjects/B/a").unwrap(),
|
||||
facet_field => Facet::from_text(&"/subjects/A/b").unwrap(),
|
||||
facet_field => Facet::from_text(&"/subjects/B/b").unwrap(),
|
||||
));
|
||||
index_writer.commit().unwrap();
|
||||
let reader = index.reader().unwrap();
|
||||
@@ -563,16 +563,16 @@ mod tests {
|
||||
let index = Index::create_in_ram(schema);
|
||||
let mut index_writer = index.writer_for_tests()?;
|
||||
index_writer.add_document(doc!(
|
||||
facet_field => Facet::from_text(&"/A/A"),
|
||||
facet_field => Facet::from_text(&"/A/A").unwrap(),
|
||||
));
|
||||
index_writer.add_document(doc!(
|
||||
facet_field => Facet::from_text(&"/A/B"),
|
||||
facet_field => Facet::from_text(&"/A/B").unwrap(),
|
||||
));
|
||||
index_writer.add_document(doc!(
|
||||
facet_field => Facet::from_text(&"/A/C/A"),
|
||||
facet_field => Facet::from_text(&"/A/C/A").unwrap(),
|
||||
));
|
||||
index_writer.add_document(doc!(
|
||||
facet_field => Facet::from_text(&"/D/C/A"),
|
||||
facet_field => Facet::from_text(&"/D/C/A").unwrap(),
|
||||
));
|
||||
index_writer.commit()?;
|
||||
let reader = index.reader()?;
|
||||
@@ -580,7 +580,7 @@ mod tests {
|
||||
assert_eq!(searcher.num_docs(), 4);
|
||||
|
||||
let count_facet = |facet_str: &str| {
|
||||
let term = Term::from_facet(facet_field, &Facet::from_text(facet_str));
|
||||
let term = Term::from_facet(facet_field, &Facet::from_text(facet_str).unwrap());
|
||||
searcher
|
||||
.search(&TermQuery::new(term, IndexRecordOption::Basic), &Count)
|
||||
.unwrap()
|
||||
|
||||
@@ -12,7 +12,7 @@
|
||||
use std::marker::PhantomData;
|
||||
|
||||
use crate::collector::{Collector, SegmentCollector};
|
||||
use crate::fastfield::{FastFieldReader, FastValue};
|
||||
use crate::fastfield::{DynamicFastFieldReader, FastFieldReader, FastValue};
|
||||
use crate::schema::Field;
|
||||
use crate::{Score, SegmentReader, TantivyError};
|
||||
|
||||
@@ -155,7 +155,7 @@ where
|
||||
TPredicate: 'static,
|
||||
TPredicateValue: FastValue,
|
||||
{
|
||||
fast_field_reader: FastFieldReader<TPredicateValue>,
|
||||
fast_field_reader: DynamicFastFieldReader<TPredicateValue>,
|
||||
segment_collector: TSegmentCollector,
|
||||
predicate: TPredicate,
|
||||
t_predicate_value: PhantomData<TPredicateValue>,
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
use crate::collector::{Collector, SegmentCollector};
|
||||
use crate::fastfield::{FastFieldReader, FastValue};
|
||||
use crate::fastfield::{DynamicFastFieldReader, FastFieldReader, FastValue};
|
||||
use crate::schema::{Field, Type};
|
||||
use crate::{DocId, Score};
|
||||
use fastdivide::DividerU64;
|
||||
@@ -84,7 +84,7 @@ impl HistogramComputer {
|
||||
}
|
||||
pub struct SegmentHistogramCollector {
|
||||
histogram_computer: HistogramComputer,
|
||||
ff_reader: FastFieldReader<u64>,
|
||||
ff_reader: DynamicFastFieldReader<u64>,
|
||||
}
|
||||
|
||||
impl SegmentCollector for SegmentHistogramCollector {
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
use super::*;
|
||||
use crate::core::SegmentReader;
|
||||
use crate::fastfield::BytesFastFieldReader;
|
||||
use crate::fastfield::DynamicFastFieldReader;
|
||||
use crate::fastfield::FastFieldReader;
|
||||
use crate::schema::Field;
|
||||
use crate::DocId;
|
||||
@@ -162,7 +163,7 @@ pub struct FastFieldTestCollector {
|
||||
|
||||
pub struct FastFieldSegmentCollector {
|
||||
vals: Vec<u64>,
|
||||
reader: FastFieldReader<u64>,
|
||||
reader: DynamicFastFieldReader<u64>,
|
||||
}
|
||||
|
||||
impl FastFieldTestCollector {
|
||||
|
||||
@@ -4,7 +4,7 @@ use crate::collector::tweak_score_top_collector::TweakedScoreTopCollector;
|
||||
use crate::collector::{
|
||||
CustomScorer, CustomSegmentScorer, ScoreSegmentTweaker, ScoreTweaker, SegmentCollector,
|
||||
};
|
||||
use crate::fastfield::FastFieldReader;
|
||||
use crate::fastfield::{DynamicFastFieldReader, FastFieldReader};
|
||||
use crate::query::Weight;
|
||||
use crate::schema::Field;
|
||||
use crate::DocAddress;
|
||||
@@ -129,7 +129,7 @@ impl fmt::Debug for TopDocs {
|
||||
}
|
||||
|
||||
struct ScorerByFastFieldReader {
|
||||
ff_reader: FastFieldReader<u64>,
|
||||
ff_reader: DynamicFastFieldReader<u64>,
|
||||
}
|
||||
|
||||
impl CustomSegmentScorer<u64> for ScorerByFastFieldReader {
|
||||
@@ -151,7 +151,7 @@ impl CustomScorer<u64> for ScorerByField {
|
||||
// mapping is monotonic, so it is sufficient to compute our top-K docs.
|
||||
//
|
||||
// The conversion will then happen only on the top-K docs.
|
||||
let ff_reader: FastFieldReader<u64> = segment_reader
|
||||
let ff_reader = segment_reader
|
||||
.fast_fields()
|
||||
.typed_fast_field_reader(self.field)?;
|
||||
Ok(ScorerByFastFieldReader { ff_reader })
|
||||
@@ -401,6 +401,7 @@ impl TopDocs {
|
||||
/// # use tantivy::query::QueryParser;
|
||||
/// use tantivy::SegmentReader;
|
||||
/// use tantivy::collector::TopDocs;
|
||||
/// use tantivy::fastfield::FastFieldReader;
|
||||
/// use tantivy::schema::Field;
|
||||
///
|
||||
/// fn create_schema() -> Schema {
|
||||
@@ -508,6 +509,7 @@ impl TopDocs {
|
||||
/// use tantivy::SegmentReader;
|
||||
/// use tantivy::collector::TopDocs;
|
||||
/// use tantivy::schema::Field;
|
||||
/// use tantivy::fastfield::FastFieldReader;
|
||||
///
|
||||
/// # fn create_schema() -> Schema {
|
||||
/// # let mut schema_builder = Schema::builder();
|
||||
|
||||
@@ -1,18 +1,15 @@
|
||||
mod bitset;
|
||||
mod composite_file;
|
||||
mod counting_writer;
|
||||
mod serialize;
|
||||
mod vint;
|
||||
|
||||
pub use self::bitset::BitSet;
|
||||
pub(crate) use self::bitset::TinySet;
|
||||
pub(crate) use self::composite_file::{CompositeFile, CompositeWrite};
|
||||
pub use self::counting_writer::CountingWriter;
|
||||
pub use self::serialize::{BinarySerializable, FixedSize};
|
||||
pub use self::vint::{
|
||||
pub use byteorder::LittleEndian as Endianness;
|
||||
pub use common::CountingWriter;
|
||||
pub use common::{
|
||||
read_u32_vint, read_u32_vint_no_advance, serialize_vint_u32, write_u32_vint, VInt,
|
||||
};
|
||||
pub use byteorder::LittleEndian as Endianness;
|
||||
pub use common::{BinarySerializable, DeserializeFrom, FixedSize};
|
||||
|
||||
/// Segment's max doc must be `< MAX_DOC_LIMIT`.
|
||||
///
|
||||
@@ -103,8 +100,8 @@ pub fn u64_to_f64(val: u64) -> f64 {
|
||||
#[cfg(test)]
|
||||
pub(crate) mod test {
|
||||
|
||||
pub use super::serialize::test::fixed_size_test;
|
||||
use super::{f64_to_u64, i64_to_u64, u64_to_f64, u64_to_i64};
|
||||
use common::{BinarySerializable, FixedSize};
|
||||
use proptest::prelude::*;
|
||||
use std::f64;
|
||||
use tantivy_bitpacker::compute_num_bits;
|
||||
@@ -118,6 +115,12 @@ pub(crate) mod test {
|
||||
assert_eq!(u64_to_f64(f64_to_u64(val)), val);
|
||||
}
|
||||
|
||||
pub fn fixed_size_test<O: BinarySerializable + FixedSize + Default>() {
|
||||
let mut buffer = Vec::new();
|
||||
O::default().serialize(&mut buffer).unwrap();
|
||||
assert_eq!(buffer.len(), O::SIZE_IN_BYTES);
|
||||
}
|
||||
|
||||
proptest! {
|
||||
#[test]
|
||||
fn test_f64_converter_monotonicity_proptest((left, right) in (proptest::num::f64::NORMAL, proptest::num::f64::NORMAL)) {
|
||||
|
||||
@@ -76,7 +76,7 @@ fn load_metas(
|
||||
/// );
|
||||
///
|
||||
/// let schema = schema_builder.build();
|
||||
/// let settings = IndexSettings{sort_by_field: Some(IndexSortByField{field:"number".to_string(), order:Order::Asc})};
|
||||
/// let settings = IndexSettings{sort_by_field: Some(IndexSortByField{field:"number".to_string(), order:Order::Asc}), ..Default::default()};
|
||||
/// let index = Index::builder().schema(schema).settings(settings).create_in_ram();
|
||||
///
|
||||
/// ```
|
||||
@@ -173,7 +173,7 @@ impl IndexBuilder {
|
||||
&directory,
|
||||
)?;
|
||||
let mut metas = IndexMeta::with_schema(self.get_expect_schema()?);
|
||||
metas.index_settings = self.index_settings.clone();
|
||||
metas.index_settings = self.index_settings;
|
||||
let index = Index::open_from_metas(directory, &metas, SegmentMetaInventory::default());
|
||||
Ok(index)
|
||||
}
|
||||
@@ -460,6 +460,13 @@ impl Index {
|
||||
pub fn settings(&self) -> &IndexSettings {
|
||||
&self.settings
|
||||
}
|
||||
|
||||
/// Mutable accessor to the index settings
|
||||
///
|
||||
pub fn settings_mut(&mut self) -> &mut IndexSettings {
|
||||
&mut self.settings
|
||||
}
|
||||
|
||||
/// Accessor to the index schema
|
||||
///
|
||||
/// The schema is actually cloned.
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
use super::SegmentComponent;
|
||||
use crate::core::SegmentId;
|
||||
use crate::schema::Schema;
|
||||
use crate::Opstamp;
|
||||
use crate::{core::SegmentId, store::Compressor};
|
||||
use census::{Inventory, TrackedObject};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::path::PathBuf;
|
||||
@@ -233,7 +233,11 @@ impl InnerSegmentMeta {
|
||||
pub struct IndexSettings {
|
||||
/// Sorts the documents by information
|
||||
/// provided in `IndexSortByField`
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub sort_by_field: Option<IndexSortByField>,
|
||||
/// The `Compressor` used to compress the doc store.
|
||||
#[serde(default)]
|
||||
pub docstore_compression: Compressor,
|
||||
}
|
||||
/// Settings to presort the documents in an index
|
||||
///
|
||||
@@ -255,6 +259,17 @@ pub enum Order {
|
||||
/// Descending Order
|
||||
Desc,
|
||||
}
|
||||
impl Order {
|
||||
/// return if the Order is ascending
|
||||
pub fn is_asc(&self) -> bool {
|
||||
self == &Order::Asc
|
||||
}
|
||||
/// return if the Order is descending
|
||||
pub fn is_desc(&self) -> bool {
|
||||
self == &Order::Desc
|
||||
}
|
||||
}
|
||||
|
||||
/// Meta information about the `Index`.
|
||||
///
|
||||
/// This object is serialized on disk in the `meta.json` file.
|
||||
@@ -369,6 +384,7 @@ mod tests {
|
||||
field: "text".to_string(),
|
||||
order: Order::Asc,
|
||||
}),
|
||||
..Default::default()
|
||||
},
|
||||
segments: Vec::new(),
|
||||
schema,
|
||||
@@ -378,7 +394,7 @@ mod tests {
|
||||
let json = serde_json::ser::to_string(&index_metas).expect("serialization failed");
|
||||
assert_eq!(
|
||||
json,
|
||||
r#"{"index_settings":{"sort_by_field":{"field":"text","order":"Asc"}},"segments":[],"schema":[{"name":"text","type":"text","options":{"indexing":{"record":"position","tokenizer":"default"},"stored":false}}],"opstamp":0}"#
|
||||
r#"{"index_settings":{"sort_by_field":{"field":"text","order":"Asc"},"docstore_compression":"lz4"},"segments":[],"schema":[{"name":"text","type":"text","options":{"indexing":{"record":"position","tokenizer":"default"},"stored":false}}],"opstamp":0}"#
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -147,6 +147,13 @@ impl FileSlice {
|
||||
self.slice(from_offset..self.len())
|
||||
}
|
||||
|
||||
/// Returns a slice from the end.
|
||||
///
|
||||
/// Equivalent to `.slice(self.len() - from_offset, self.len())`
|
||||
pub fn slice_from_end(&self, from_offset: usize) -> FileSlice {
|
||||
self.slice(self.len() - from_offset..self.len())
|
||||
}
|
||||
|
||||
/// Like `.slice(...)` but enforcing only the `to`
|
||||
/// boundary.
|
||||
///
|
||||
|
||||
@@ -1,69 +1,45 @@
|
||||
use crate::common::{BinarySerializable, CountingWriter, FixedSize, HasLen, VInt};
|
||||
use crate::directory::error::Incompatibility;
|
||||
use crate::directory::FileSlice;
|
||||
use crate::directory::{AntiCallToken, TerminatingWrite};
|
||||
use crate::Version;
|
||||
use crate::{
|
||||
common::{BinarySerializable, CountingWriter, DeserializeFrom, FixedSize, HasLen},
|
||||
directory::{AntiCallToken, TerminatingWrite},
|
||||
Version, INDEX_FORMAT_VERSION,
|
||||
};
|
||||
use crc32fast::Hasher;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::io;
|
||||
use std::io::Write;
|
||||
|
||||
const FOOTER_MAX_LEN: usize = 10_000;
|
||||
const FOOTER_MAX_LEN: u32 = 50_000;
|
||||
|
||||
/// The magic byte of the footer to identify corruption
|
||||
/// or an old version of the footer.
|
||||
const FOOTER_MAGIC_NUMBER: u32 = 1337;
|
||||
|
||||
type CrcHashU32 = u32;
|
||||
|
||||
#[derive(Debug, Clone, PartialEq)]
|
||||
/// A Footer is appended to every file
|
||||
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
|
||||
pub struct Footer {
|
||||
pub version: Version,
|
||||
pub meta: String,
|
||||
pub versioned_footer: VersionedFooter,
|
||||
}
|
||||
|
||||
/// Serialises the footer to a byte-array
|
||||
/// - versioned_footer_len : 4 bytes
|
||||
///- versioned_footer: variable bytes
|
||||
/// - meta_len: 4 bytes
|
||||
/// - meta: variable bytes
|
||||
/// - version_len: 4 bytes
|
||||
/// - version json: variable bytes
|
||||
impl BinarySerializable for Footer {
|
||||
fn serialize<W: io::Write>(&self, writer: &mut W) -> io::Result<()> {
|
||||
BinarySerializable::serialize(&self.versioned_footer, writer)?;
|
||||
BinarySerializable::serialize(&self.meta, writer)?;
|
||||
let version_string =
|
||||
serde_json::to_string(&self.version).map_err(|_err| io::ErrorKind::InvalidInput)?;
|
||||
BinarySerializable::serialize(&version_string, writer)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn deserialize<R: io::Read>(reader: &mut R) -> io::Result<Self> {
|
||||
let versioned_footer = VersionedFooter::deserialize(reader)?;
|
||||
let meta = String::deserialize(reader)?;
|
||||
let version_json = String::deserialize(reader)?;
|
||||
let version = serde_json::from_str(&version_json)?;
|
||||
Ok(Footer {
|
||||
version,
|
||||
meta,
|
||||
versioned_footer,
|
||||
})
|
||||
}
|
||||
pub crc: CrcHashU32,
|
||||
}
|
||||
|
||||
impl Footer {
|
||||
pub fn new(versioned_footer: VersionedFooter) -> Self {
|
||||
pub fn new(crc: CrcHashU32) -> Self {
|
||||
let version = crate::VERSION.clone();
|
||||
let meta = version.to_string();
|
||||
Footer {
|
||||
version,
|
||||
meta,
|
||||
versioned_footer,
|
||||
}
|
||||
Footer { version, crc }
|
||||
}
|
||||
|
||||
pub fn crc(&self) -> CrcHashU32 {
|
||||
self.crc
|
||||
}
|
||||
pub fn append_footer<W: io::Write>(&self, mut write: &mut W) -> io::Result<()> {
|
||||
let mut counting_write = CountingWriter::wrap(&mut write);
|
||||
self.serialize(&mut counting_write)?;
|
||||
let written_len = counting_write.written_bytes();
|
||||
(written_len as u32).serialize(write)?;
|
||||
counting_write.write_all(serde_json::to_string(&self)?.as_ref())?;
|
||||
let footer_payload_len = counting_write.written_bytes();
|
||||
BinarySerializable::serialize(&(footer_payload_len as u32), write)?;
|
||||
BinarySerializable::serialize(&(FOOTER_MAGIC_NUMBER as u32), write)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -77,12 +53,47 @@ impl Footer {
|
||||
),
|
||||
));
|
||||
}
|
||||
let (body_footer, footer_len_file) = file.split_from_end(u32::SIZE_IN_BYTES);
|
||||
let mut footer_len_bytes = footer_len_file.read_bytes()?;
|
||||
let footer_len = u32::deserialize(&mut footer_len_bytes)? as usize;
|
||||
let (body, footer) = body_footer.split_from_end(footer_len);
|
||||
let mut footer_bytes = footer.read_bytes()?;
|
||||
let footer = Footer::deserialize(&mut footer_bytes)?;
|
||||
|
||||
let footer_metadata_len = <(u32, u32)>::SIZE_IN_BYTES;
|
||||
let (footer_len, footer_magic_byte): (u32, u32) = file
|
||||
.slice_from_end(footer_metadata_len)
|
||||
.read_bytes()?
|
||||
.as_ref()
|
||||
.deserialize()?;
|
||||
|
||||
if footer_magic_byte != FOOTER_MAGIC_NUMBER {
|
||||
return Err(io::Error::new(
|
||||
io::ErrorKind::InvalidData,
|
||||
"Footer magic byte mismatch. File corrupted or index was created using old an tantivy version which is not supported anymore. Please use tantivy 0.15 or above to recreate the index.",
|
||||
));
|
||||
}
|
||||
|
||||
if footer_len > FOOTER_MAX_LEN {
|
||||
return Err(io::Error::new(
|
||||
io::ErrorKind::InvalidData,
|
||||
format!(
|
||||
"Footer seems invalid as it suggests a footer len of {}. File is corrupted, \
|
||||
or the index was created with a different & old version of tantivy.",
|
||||
footer_len
|
||||
),
|
||||
));
|
||||
}
|
||||
let total_footer_size = footer_len as usize + footer_metadata_len;
|
||||
if file.len() < total_footer_size {
|
||||
return Err(io::Error::new(
|
||||
io::ErrorKind::UnexpectedEof,
|
||||
format!(
|
||||
"File corrupted. The file is smaller than it's footer bytes (len={}).",
|
||||
total_footer_size
|
||||
),
|
||||
));
|
||||
}
|
||||
|
||||
let footer: Footer = serde_json::from_slice(&file.read_bytes_slice(
|
||||
file.len() - total_footer_size..file.len() - footer_metadata_len as usize,
|
||||
)?)?;
|
||||
|
||||
let body = file.slice_to(file.len() - total_footer_size);
|
||||
Ok((footer, body))
|
||||
}
|
||||
|
||||
@@ -90,151 +101,16 @@ impl Footer {
|
||||
/// Has to be called after `extract_footer` to make sure it's not accessing uninitialised memory
|
||||
pub fn is_compatible(&self) -> Result<(), Incompatibility> {
|
||||
let library_version = crate::version();
|
||||
match &self.versioned_footer {
|
||||
VersionedFooter::V1 {
|
||||
crc32: _crc,
|
||||
store_compression,
|
||||
} => {
|
||||
if &library_version.store_compression != store_compression {
|
||||
return Err(Incompatibility::CompressionMismatch {
|
||||
library_compression_format: library_version.store_compression.to_string(),
|
||||
index_compression_format: store_compression.to_string(),
|
||||
});
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
VersionedFooter::V2 {
|
||||
crc32: _crc,
|
||||
store_compression,
|
||||
} => {
|
||||
if &library_version.store_compression != store_compression {
|
||||
return Err(Incompatibility::CompressionMismatch {
|
||||
library_compression_format: library_version.store_compression.to_string(),
|
||||
index_compression_format: store_compression.to_string(),
|
||||
});
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
VersionedFooter::V3 {
|
||||
crc32: _crc,
|
||||
store_compression,
|
||||
} => {
|
||||
if &library_version.store_compression != store_compression {
|
||||
return Err(Incompatibility::CompressionMismatch {
|
||||
library_compression_format: library_version.store_compression.to_string(),
|
||||
index_compression_format: store_compression.to_string(),
|
||||
});
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
VersionedFooter::UnknownVersion => Err(Incompatibility::IndexMismatch {
|
||||
if self.version.index_format_version < 4
|
||||
|| self.version.index_format_version > INDEX_FORMAT_VERSION
|
||||
{
|
||||
return Err(Incompatibility::IndexMismatch {
|
||||
library_version: library_version.clone(),
|
||||
index_version: self.version.clone(),
|
||||
}),
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Footer that includes a crc32 hash that enables us to checksum files in the index
|
||||
#[derive(Debug, Clone, PartialEq)]
|
||||
pub enum VersionedFooter {
|
||||
UnknownVersion,
|
||||
V1 {
|
||||
crc32: CrcHashU32,
|
||||
store_compression: String,
|
||||
},
|
||||
// Introduction of the Block WAND information.
|
||||
V2 {
|
||||
crc32: CrcHashU32,
|
||||
store_compression: String,
|
||||
},
|
||||
// Block wand max termfred on 1 byte
|
||||
V3 {
|
||||
crc32: CrcHashU32,
|
||||
store_compression: String,
|
||||
},
|
||||
}
|
||||
|
||||
impl BinarySerializable for VersionedFooter {
|
||||
fn serialize<W: io::Write>(&self, writer: &mut W) -> io::Result<()> {
|
||||
let mut buf = Vec::new();
|
||||
match self {
|
||||
VersionedFooter::V3 {
|
||||
crc32,
|
||||
store_compression: compression,
|
||||
} => {
|
||||
// Serializes a valid `VersionedFooter` or panics if the version is unknown
|
||||
// [ version | crc_hash | compression_mode ]
|
||||
// [ 0..4 | 4..8 | variable ]
|
||||
BinarySerializable::serialize(&3u32, &mut buf)?;
|
||||
BinarySerializable::serialize(crc32, &mut buf)?;
|
||||
BinarySerializable::serialize(compression, &mut buf)?;
|
||||
}
|
||||
VersionedFooter::V2 { .. }
|
||||
| VersionedFooter::V1 { .. }
|
||||
| VersionedFooter::UnknownVersion => {
|
||||
return Err(io::Error::new(
|
||||
io::ErrorKind::InvalidInput,
|
||||
"Cannot serialize an unknown versioned footer ",
|
||||
));
|
||||
}
|
||||
}
|
||||
BinarySerializable::serialize(&VInt(buf.len() as u64), writer)?;
|
||||
assert!(buf.len() <= FOOTER_MAX_LEN);
|
||||
writer.write_all(&buf[..])?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn deserialize<R: io::Read>(reader: &mut R) -> io::Result<Self> {
|
||||
let len = VInt::deserialize(reader)?.0 as usize;
|
||||
if len > FOOTER_MAX_LEN {
|
||||
return Err(io::Error::new(
|
||||
io::ErrorKind::InvalidData,
|
||||
format!(
|
||||
"Footer seems invalid as it suggests a footer len of {}. File is corrupted, \
|
||||
or the index was created with a different & old version of tantivy.",
|
||||
len
|
||||
),
|
||||
));
|
||||
}
|
||||
let mut buf = vec![0u8; len];
|
||||
reader.read_exact(&mut buf[..])?;
|
||||
let mut cursor = &buf[..];
|
||||
let version = u32::deserialize(&mut cursor)?;
|
||||
if version > 3 {
|
||||
return Ok(VersionedFooter::UnknownVersion);
|
||||
}
|
||||
let crc32 = u32::deserialize(&mut cursor)?;
|
||||
let store_compression = String::deserialize(&mut cursor)?;
|
||||
Ok(if version == 1 {
|
||||
VersionedFooter::V1 {
|
||||
crc32,
|
||||
store_compression,
|
||||
}
|
||||
} else if version == 2 {
|
||||
VersionedFooter::V2 {
|
||||
crc32,
|
||||
store_compression,
|
||||
}
|
||||
} else {
|
||||
assert_eq!(version, 3);
|
||||
VersionedFooter::V3 {
|
||||
crc32,
|
||||
store_compression,
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
impl VersionedFooter {
|
||||
pub fn crc(&self) -> Option<CrcHashU32> {
|
||||
match self {
|
||||
VersionedFooter::V3 { crc32, .. } => Some(*crc32),
|
||||
VersionedFooter::V2 { crc32, .. } => Some(*crc32),
|
||||
VersionedFooter::V1 { crc32, .. } => Some(*crc32),
|
||||
VersionedFooter::UnknownVersion { .. } => None,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) struct FooterProxy<W: TerminatingWrite> {
|
||||
@@ -268,10 +144,7 @@ impl<W: TerminatingWrite> Write for FooterProxy<W> {
|
||||
impl<W: TerminatingWrite> TerminatingWrite for FooterProxy<W> {
|
||||
fn terminate_ref(&mut self, _: AntiCallToken) -> io::Result<()> {
|
||||
let crc32 = self.hasher.take().unwrap().finalize();
|
||||
let footer = Footer::new(VersionedFooter::V3 {
|
||||
crc32,
|
||||
store_compression: crate::store::COMPRESSION.to_string(),
|
||||
});
|
||||
let footer = Footer::new(crc32);
|
||||
let mut writer = self.writer.take().unwrap();
|
||||
footer.append_footer(&mut writer)?;
|
||||
writer.terminate()
|
||||
@@ -281,140 +154,75 @@ impl<W: TerminatingWrite> TerminatingWrite for FooterProxy<W> {
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
|
||||
use super::CrcHashU32;
|
||||
use super::FooterProxy;
|
||||
use crate::common::{BinarySerializable, VInt};
|
||||
use crate::directory::footer::{Footer, VersionedFooter};
|
||||
use crate::directory::TerminatingWrite;
|
||||
use byteorder::{ByteOrder, LittleEndian};
|
||||
use regex::Regex;
|
||||
use crate::directory::footer::Footer;
|
||||
use crate::directory::OwnedBytes;
|
||||
use crate::{
|
||||
common::BinarySerializable,
|
||||
directory::{footer::FOOTER_MAGIC_NUMBER, FileSlice},
|
||||
};
|
||||
use std::io;
|
||||
|
||||
#[test]
|
||||
fn test_versioned_footer() {
|
||||
let mut vec = Vec::new();
|
||||
let footer_proxy = FooterProxy::new(&mut vec);
|
||||
assert!(footer_proxy.terminate().is_ok());
|
||||
if crate::store::COMPRESSION == "lz4" {
|
||||
assert_eq!(vec.len(), 158);
|
||||
} else if crate::store::COMPRESSION == "snappy" {
|
||||
assert_eq!(vec.len(), 167);
|
||||
} else if crate::store::COMPRESSION == "lz4_block" {
|
||||
assert_eq!(vec.len(), 176);
|
||||
}
|
||||
let footer = Footer::deserialize(&mut &vec[..]).unwrap();
|
||||
assert!(matches!(
|
||||
footer.versioned_footer,
|
||||
VersionedFooter::V3 { store_compression, .. }
|
||||
if store_compression == crate::store::COMPRESSION
|
||||
));
|
||||
assert_eq!(&footer.version, crate::version());
|
||||
fn test_deserialize_footer() {
|
||||
let mut buf: Vec<u8> = vec![];
|
||||
let footer = Footer::new(123);
|
||||
footer.append_footer(&mut buf).unwrap();
|
||||
let owned_bytes = OwnedBytes::new(buf);
|
||||
let fileslice = FileSlice::new(Box::new(owned_bytes));
|
||||
let (footer_deser, _body) = Footer::extract_footer(fileslice).unwrap();
|
||||
assert_eq!(footer_deser.crc(), footer.crc());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_serialize_deserialize_footer() {
|
||||
let mut buffer = Vec::new();
|
||||
let crc32 = 123456u32;
|
||||
let footer: Footer = Footer::new(VersionedFooter::V3 {
|
||||
crc32,
|
||||
store_compression: "lz4".to_string(),
|
||||
});
|
||||
footer.serialize(&mut buffer).unwrap();
|
||||
let footer_deser = Footer::deserialize(&mut &buffer[..]).unwrap();
|
||||
assert_eq!(footer_deser, footer);
|
||||
fn test_deserialize_footer_missing_magic_byte() {
|
||||
let mut buf: Vec<u8> = vec![];
|
||||
BinarySerializable::serialize(&0_u32, &mut buf).unwrap();
|
||||
let wrong_magic_byte: u32 = 5555;
|
||||
BinarySerializable::serialize(&wrong_magic_byte, &mut buf).unwrap();
|
||||
|
||||
let owned_bytes = OwnedBytes::new(buf);
|
||||
|
||||
let fileslice = FileSlice::new(Box::new(owned_bytes));
|
||||
let err = Footer::extract_footer(fileslice).unwrap_err();
|
||||
assert_eq!(
|
||||
err.to_string(),
|
||||
"Footer magic byte mismatch. File corrupted or index was created using old an tantivy version which \
|
||||
is not supported anymore. Please use tantivy 0.15 or above to recreate the index."
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn footer_length() {
|
||||
let crc32 = 1111111u32;
|
||||
let versioned_footer = VersionedFooter::V3 {
|
||||
crc32,
|
||||
store_compression: "lz4".to_string(),
|
||||
};
|
||||
let mut buf = Vec::new();
|
||||
versioned_footer.serialize(&mut buf).unwrap();
|
||||
assert_eq!(buf.len(), 13);
|
||||
let footer = Footer::new(versioned_footer);
|
||||
let regex_ptn = Regex::new(
|
||||
"tantivy v[0-9]{1,3}\\.[0-9]{1,3}\\.[0-9]{1,3}\\.{0,10}, index_format v[0-9]{1,5}",
|
||||
)
|
||||
.unwrap();
|
||||
assert!(regex_ptn.is_match(&footer.meta));
|
||||
}
|
||||
fn test_deserialize_footer_wrong_filesize() {
|
||||
let mut buf: Vec<u8> = vec![];
|
||||
BinarySerializable::serialize(&100_u32, &mut buf).unwrap();
|
||||
BinarySerializable::serialize(&FOOTER_MAGIC_NUMBER, &mut buf).unwrap();
|
||||
|
||||
#[test]
|
||||
fn versioned_footer_from_bytes() {
|
||||
let v_footer_bytes = vec![
|
||||
// versionned footer length
|
||||
12 | 128,
|
||||
// index format version
|
||||
3,
|
||||
0,
|
||||
0,
|
||||
0,
|
||||
// crc 32
|
||||
12,
|
||||
35,
|
||||
89,
|
||||
18,
|
||||
// compression format
|
||||
3 | 128,
|
||||
b'l',
|
||||
b'z',
|
||||
b'4',
|
||||
];
|
||||
let mut cursor = &v_footer_bytes[..];
|
||||
let versioned_footer = VersionedFooter::deserialize(&mut cursor).unwrap();
|
||||
assert!(cursor.is_empty());
|
||||
let expected_crc: u32 = LittleEndian::read_u32(&v_footer_bytes[5..9]) as CrcHashU32;
|
||||
let expected_versioned_footer: VersionedFooter = VersionedFooter::V3 {
|
||||
crc32: expected_crc,
|
||||
store_compression: "lz4".to_string(),
|
||||
};
|
||||
assert_eq!(versioned_footer, expected_versioned_footer);
|
||||
let mut buffer = Vec::new();
|
||||
assert!(versioned_footer.serialize(&mut buffer).is_ok());
|
||||
assert_eq!(&v_footer_bytes[..], &buffer[..]);
|
||||
}
|
||||
let owned_bytes = OwnedBytes::new(buf);
|
||||
|
||||
#[test]
|
||||
fn versioned_footer_panic() {
|
||||
let v_footer_bytes = vec![6u8 | 128u8, 3u8, 0u8, 0u8, 1u8, 0u8, 0u8];
|
||||
let mut b = &v_footer_bytes[..];
|
||||
let versioned_footer = VersionedFooter::deserialize(&mut b).unwrap();
|
||||
assert!(b.is_empty());
|
||||
let expected_versioned_footer = VersionedFooter::UnknownVersion;
|
||||
assert_eq!(versioned_footer, expected_versioned_footer);
|
||||
let mut buf = Vec::new();
|
||||
assert!(versioned_footer.serialize(&mut buf).is_err());
|
||||
}
|
||||
|
||||
#[test]
|
||||
#[cfg(not(feature = "lz4"))]
|
||||
fn compression_mismatch() {
|
||||
let crc32 = 1111111u32;
|
||||
let versioned_footer = VersionedFooter::V1 {
|
||||
crc32,
|
||||
store_compression: "lz4".to_string(),
|
||||
};
|
||||
let footer = Footer::new(versioned_footer);
|
||||
let res = footer.is_compatible();
|
||||
assert!(res.is_err());
|
||||
let fileslice = FileSlice::new(Box::new(owned_bytes));
|
||||
let err = Footer::extract_footer(fileslice).unwrap_err();
|
||||
assert_eq!(err.kind(), io::ErrorKind::UnexpectedEof);
|
||||
assert_eq!(
|
||||
err.to_string(),
|
||||
"File corrupted. The file is smaller than it\'s footer bytes (len=108)."
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_deserialize_too_large_footer() {
|
||||
let mut buf = vec![];
|
||||
assert!(FooterProxy::new(&mut buf).terminate().is_ok());
|
||||
let mut long_len_buf = [0u8; 10];
|
||||
let num_bytes = VInt(super::FOOTER_MAX_LEN as u64 + 1u64).serialize_into(&mut long_len_buf);
|
||||
buf[0..num_bytes].copy_from_slice(&long_len_buf[..num_bytes]);
|
||||
let err = Footer::deserialize(&mut &buf[..]).unwrap_err();
|
||||
let mut buf: Vec<u8> = vec![];
|
||||
|
||||
let footer_length = super::FOOTER_MAX_LEN + 1;
|
||||
BinarySerializable::serialize(&footer_length, &mut buf).unwrap();
|
||||
BinarySerializable::serialize(&FOOTER_MAGIC_NUMBER, &mut buf).unwrap();
|
||||
|
||||
let owned_bytes = OwnedBytes::new(buf);
|
||||
|
||||
let fileslice = FileSlice::new(Box::new(owned_bytes));
|
||||
let err = Footer::extract_footer(fileslice).unwrap_err();
|
||||
assert_eq!(err.kind(), io::ErrorKind::InvalidData);
|
||||
assert_eq!(
|
||||
err.to_string(),
|
||||
"Footer seems invalid as it suggests a footer len of 10001. File is corrupted, \
|
||||
or the index was created with a different & old version of tantivy."
|
||||
"Footer seems invalid as it suggests a footer len of 50001. File is corrupted, \
|
||||
or the index was created with a different & old version of tantivy."
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -245,11 +245,7 @@ impl ManagedDirectory {
|
||||
let mut hasher = Hasher::new();
|
||||
hasher.update(bytes.as_slice());
|
||||
let crc = hasher.finalize();
|
||||
Ok(footer
|
||||
.versioned_footer
|
||||
.crc()
|
||||
.map(|v| v == crc)
|
||||
.unwrap_or(false))
|
||||
Ok(footer.crc() == crc)
|
||||
}
|
||||
|
||||
/// List files for which checksum does not match content
|
||||
|
||||
@@ -593,7 +593,7 @@ mod tests {
|
||||
|
||||
let mut index_writer = index.writer_for_tests().unwrap();
|
||||
let mut log_merge_policy = LogMergePolicy::default();
|
||||
log_merge_policy.set_min_merge_size(3);
|
||||
log_merge_policy.set_min_num_segments(3);
|
||||
index_writer.set_merge_policy(Box::new(log_merge_policy));
|
||||
for _num_commits in 0..10 {
|
||||
for _ in 0..10 {
|
||||
|
||||
@@ -28,7 +28,9 @@ pub use self::file_slice::{FileHandle, FileSlice};
|
||||
pub use self::owned_bytes::OwnedBytes;
|
||||
pub use self::ram_directory::RamDirectory;
|
||||
pub use self::watch_event_router::{WatchCallback, WatchCallbackList, WatchHandle};
|
||||
use std::io::{self, BufWriter, Write};
|
||||
pub use common::AntiCallToken;
|
||||
pub use common::TerminatingWrite;
|
||||
use std::io::BufWriter;
|
||||
use std::path::PathBuf;
|
||||
|
||||
/// Outcome of the Garbage collection
|
||||
@@ -50,47 +52,6 @@ pub use self::mmap_directory::MmapDirectory;
|
||||
|
||||
pub use self::managed_directory::ManagedDirectory;
|
||||
|
||||
/// Struct used to prevent from calling [`terminate_ref`](trait.TerminatingWrite#method.terminate_ref) directly
|
||||
///
|
||||
/// The point is that while the type is public, it cannot be built by anyone
|
||||
/// outside of this module.
|
||||
pub struct AntiCallToken(());
|
||||
|
||||
/// Trait used to indicate when no more write need to be done on a writer
|
||||
pub trait TerminatingWrite: Write {
|
||||
/// Indicate that the writer will no longer be used. Internally call terminate_ref.
|
||||
fn terminate(mut self) -> io::Result<()>
|
||||
where
|
||||
Self: Sized,
|
||||
{
|
||||
self.terminate_ref(AntiCallToken(()))
|
||||
}
|
||||
|
||||
/// You should implement this function to define custom behavior.
|
||||
/// This function should flush any buffer it may hold.
|
||||
fn terminate_ref(&mut self, _: AntiCallToken) -> io::Result<()>;
|
||||
}
|
||||
|
||||
impl<W: TerminatingWrite + ?Sized> TerminatingWrite for Box<W> {
|
||||
fn terminate_ref(&mut self, token: AntiCallToken) -> io::Result<()> {
|
||||
self.as_mut().terminate_ref(token)
|
||||
}
|
||||
}
|
||||
|
||||
impl<W: TerminatingWrite> TerminatingWrite for BufWriter<W> {
|
||||
fn terminate_ref(&mut self, a: AntiCallToken) -> io::Result<()> {
|
||||
self.flush()?;
|
||||
self.get_mut().terminate_ref(a)
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
impl<'a> TerminatingWrite for &'a mut Vec<u8> {
|
||||
fn terminate_ref(&mut self, _a: AntiCallToken) -> io::Result<()> {
|
||||
self.flush()
|
||||
}
|
||||
}
|
||||
|
||||
/// Write object for Directory.
|
||||
///
|
||||
/// `WritePtr` are required to implement both Write
|
||||
|
||||
@@ -46,7 +46,7 @@ impl Drop for VecWriter {
|
||||
fn drop(&mut self) {
|
||||
if !self.is_flushed {
|
||||
panic!(
|
||||
"You forgot to flush {:?} before its writter got Drop. Do not rely on drop.",
|
||||
"You forgot to flush {:?} before its writter got Drop. Do not rely on drop. This also occurs when the indexer crashed, so you may want to check the logs for the root cause.",
|
||||
self.path
|
||||
)
|
||||
}
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
use crate::directory::FileSlice;
|
||||
use crate::directory::OwnedBytes;
|
||||
use crate::fastfield::FastFieldReader;
|
||||
use crate::fastfield::{BitpackedFastFieldReader, FastFieldReader, MultiValueLength};
|
||||
use crate::DocId;
|
||||
use crate::{directory::FileSlice, fastfield::MultiValueLength};
|
||||
|
||||
/// Reader for byte array fast fields
|
||||
///
|
||||
@@ -15,13 +15,13 @@ use crate::{directory::FileSlice, fastfield::MultiValueLength};
|
||||
/// and the start index for the next document, and keeping the bytes in between.
|
||||
#[derive(Clone)]
|
||||
pub struct BytesFastFieldReader {
|
||||
idx_reader: FastFieldReader<u64>,
|
||||
idx_reader: BitpackedFastFieldReader<u64>,
|
||||
values: OwnedBytes,
|
||||
}
|
||||
|
||||
impl BytesFastFieldReader {
|
||||
pub(crate) fn open(
|
||||
idx_reader: FastFieldReader<u64>,
|
||||
idx_reader: BitpackedFastFieldReader<u64>,
|
||||
values_file: FileSlice,
|
||||
) -> crate::Result<BytesFastFieldReader> {
|
||||
let values = values_file.read_bytes()?;
|
||||
|
||||
@@ -2,7 +2,9 @@ use std::io;
|
||||
|
||||
use crate::schema::{Document, Field, Value};
|
||||
use crate::DocId;
|
||||
use crate::{fastfield::serializer::FastFieldSerializer, indexer::doc_id_mapping::DocIdMapping};
|
||||
use crate::{
|
||||
fastfield::serializer::CompositeFastFieldSerializer, indexer::doc_id_mapping::DocIdMapping,
|
||||
};
|
||||
|
||||
/// Writer for byte array (as in, any number of bytes per document) fast fields
|
||||
///
|
||||
@@ -104,7 +106,7 @@ impl BytesFastFieldWriter {
|
||||
/// Serializes the fast field values by pushing them to the `FastFieldSerializer`.
|
||||
pub fn serialize(
|
||||
&self,
|
||||
serializer: &mut FastFieldSerializer,
|
||||
serializer: &mut CompositeFastFieldSerializer,
|
||||
doc_id_map: Option<&DocIdMapping>,
|
||||
) -> io::Result<()> {
|
||||
// writing the offset index
|
||||
|
||||
@@ -95,7 +95,7 @@ mod tests {
|
||||
let schema = schema_builder.build();
|
||||
let index = Index::create_in_ram(schema);
|
||||
let mut index_writer = index.writer_for_tests()?;
|
||||
index_writer.add_document(doc!(facet_field=>Facet::from_text("/a/b")));
|
||||
index_writer.add_document(doc!(facet_field=>Facet::from_text("/a/b").unwrap()));
|
||||
index_writer.commit()?;
|
||||
let searcher = index.reader()?.searcher();
|
||||
let facet_reader = searcher
|
||||
@@ -118,7 +118,7 @@ mod tests {
|
||||
let schema = schema_builder.build();
|
||||
let index = Index::create_in_ram(schema);
|
||||
let mut index_writer = index.writer_for_tests()?;
|
||||
index_writer.add_document(doc!(facet_field=>Facet::from_text("/a/b")));
|
||||
index_writer.add_document(doc!(facet_field=>Facet::from_text("/a/b").unwrap()));
|
||||
index_writer.commit()?;
|
||||
let searcher = index.reader()?.searcher();
|
||||
let facet_reader = searcher
|
||||
@@ -141,7 +141,7 @@ mod tests {
|
||||
let schema = schema_builder.build();
|
||||
let index = Index::create_in_ram(schema);
|
||||
let mut index_writer = index.writer_for_tests()?;
|
||||
index_writer.add_document(doc!(facet_field=>Facet::from_text("/a/b")));
|
||||
index_writer.add_document(doc!(facet_field=>Facet::from_text("/a/b").unwrap()));
|
||||
index_writer.commit()?;
|
||||
let searcher = index.reader()?.searcher();
|
||||
let facet_reader = searcher
|
||||
@@ -164,7 +164,7 @@ mod tests {
|
||||
let schema = schema_builder.build();
|
||||
let index = Index::create_in_ram(schema);
|
||||
let mut index_writer = index.writer_for_tests()?;
|
||||
index_writer.add_document(doc!(facet_field=>Facet::from_text("/a/b")));
|
||||
index_writer.add_document(doc!(facet_field=>Facet::from_text("/a/b").unwrap()));
|
||||
index_writer.commit()?;
|
||||
let searcher = index.reader()?.searcher();
|
||||
let facet_reader = searcher
|
||||
@@ -187,7 +187,7 @@ mod tests {
|
||||
let schema = schema_builder.build();
|
||||
let index = Index::create_in_ram(schema);
|
||||
let mut index_writer = index.writer_for_tests()?;
|
||||
index_writer.add_document(doc!(facet_field=>Facet::from_text("/a/b")));
|
||||
index_writer.add_document(doc!(facet_field=>Facet::from_text("/a/b").unwrap()));
|
||||
index_writer.add_document(Document::default());
|
||||
index_writer.commit()?;
|
||||
let searcher = index.reader()?.searcher();
|
||||
|
||||
@@ -29,9 +29,13 @@ pub use self::delete::DeleteBitSet;
|
||||
pub use self::error::{FastFieldNotAvailableError, Result};
|
||||
pub use self::facet_reader::FacetReader;
|
||||
pub use self::multivalued::{MultiValuedFastFieldReader, MultiValuedFastFieldWriter};
|
||||
pub(crate) use self::reader::BitpackedFastFieldReader;
|
||||
pub use self::reader::DynamicFastFieldReader;
|
||||
pub use self::reader::FastFieldReader;
|
||||
pub use self::readers::FastFieldReaders;
|
||||
pub use self::serializer::FastFieldSerializer;
|
||||
pub use self::serializer::CompositeFastFieldSerializer;
|
||||
pub use self::serializer::FastFieldDataAccess;
|
||||
pub use self::serializer::FastFieldStats;
|
||||
pub use self::writer::{FastFieldsWriter, IntFastFieldWriter};
|
||||
use crate::schema::Cardinality;
|
||||
use crate::schema::FieldType;
|
||||
@@ -57,7 +61,7 @@ mod writer;
|
||||
pub trait MultiValueLength {
|
||||
/// returns the num of values associated to a doc_id
|
||||
fn get_len(&self, doc_id: DocId) -> u64;
|
||||
/// returns the sum of num of all values for all doc_ids
|
||||
/// returns the sum of num values for all doc_ids
|
||||
fn get_total_len(&self) -> u64;
|
||||
}
|
||||
|
||||
@@ -210,15 +214,14 @@ mod tests {
|
||||
|
||||
use super::*;
|
||||
use crate::common::CompositeFile;
|
||||
use crate::common::HasLen;
|
||||
use crate::directory::{Directory, RamDirectory, WritePtr};
|
||||
use crate::fastfield::FastFieldReader;
|
||||
use crate::merge_policy::NoMergePolicy;
|
||||
use crate::schema::Field;
|
||||
use crate::schema::Schema;
|
||||
use crate::schema::FAST;
|
||||
use crate::schema::{Document, IntOptions};
|
||||
use crate::{Index, SegmentId, SegmentReader};
|
||||
use common::HasLen;
|
||||
use once_cell::sync::Lazy;
|
||||
use rand::prelude::SliceRandom;
|
||||
use rand::rngs::StdRng;
|
||||
@@ -236,7 +239,7 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
pub fn test_fastfield() {
|
||||
let test_fastfield = FastFieldReader::<u64>::from(vec![100, 200, 300]);
|
||||
let test_fastfield = DynamicFastFieldReader::<u64>::from(vec![100, 200, 300]);
|
||||
assert_eq!(test_fastfield.get(0), 100);
|
||||
assert_eq!(test_fastfield.get(1), 200);
|
||||
assert_eq!(test_fastfield.get(2), 300);
|
||||
@@ -254,7 +257,7 @@ mod tests {
|
||||
let directory: RamDirectory = RamDirectory::create();
|
||||
{
|
||||
let write: WritePtr = directory.open_write(Path::new("test")).unwrap();
|
||||
let mut serializer = FastFieldSerializer::from_write(write).unwrap();
|
||||
let mut serializer = CompositeFastFieldSerializer::from_write(write).unwrap();
|
||||
let mut fast_field_writers = FastFieldsWriter::from_schema(&SCHEMA);
|
||||
fast_field_writers.add_document(&doc!(*FIELD=>13u64));
|
||||
fast_field_writers.add_document(&doc!(*FIELD=>14u64));
|
||||
@@ -265,10 +268,10 @@ mod tests {
|
||||
serializer.close().unwrap();
|
||||
}
|
||||
let file = directory.open_read(&path).unwrap();
|
||||
assert_eq!(file.len(), 36 as usize);
|
||||
assert_eq!(file.len(), 37 as usize);
|
||||
let composite_file = CompositeFile::open(&file)?;
|
||||
let file = composite_file.open_read(*FIELD).unwrap();
|
||||
let fast_field_reader = FastFieldReader::<u64>::open(file)?;
|
||||
let fast_field_reader = DynamicFastFieldReader::<u64>::open(file)?;
|
||||
assert_eq!(fast_field_reader.get(0), 13u64);
|
||||
assert_eq!(fast_field_reader.get(1), 14u64);
|
||||
assert_eq!(fast_field_reader.get(2), 2u64);
|
||||
@@ -281,7 +284,7 @@ mod tests {
|
||||
let directory: RamDirectory = RamDirectory::create();
|
||||
{
|
||||
let write: WritePtr = directory.open_write(Path::new("test"))?;
|
||||
let mut serializer = FastFieldSerializer::from_write(write)?;
|
||||
let mut serializer = CompositeFastFieldSerializer::from_write(write)?;
|
||||
let mut fast_field_writers = FastFieldsWriter::from_schema(&SCHEMA);
|
||||
fast_field_writers.add_document(&doc!(*FIELD=>4u64));
|
||||
fast_field_writers.add_document(&doc!(*FIELD=>14_082_001u64));
|
||||
@@ -296,11 +299,11 @@ mod tests {
|
||||
serializer.close()?;
|
||||
}
|
||||
let file = directory.open_read(&path)?;
|
||||
assert_eq!(file.len(), 61 as usize);
|
||||
assert_eq!(file.len(), 62 as usize);
|
||||
{
|
||||
let fast_fields_composite = CompositeFile::open(&file)?;
|
||||
let data = fast_fields_composite.open_read(*FIELD).unwrap();
|
||||
let fast_field_reader = FastFieldReader::<u64>::open(data)?;
|
||||
let fast_field_reader = DynamicFastFieldReader::<u64>::open(data)?;
|
||||
assert_eq!(fast_field_reader.get(0), 4u64);
|
||||
assert_eq!(fast_field_reader.get(1), 14_082_001u64);
|
||||
assert_eq!(fast_field_reader.get(2), 3_052u64);
|
||||
@@ -321,7 +324,7 @@ mod tests {
|
||||
|
||||
{
|
||||
let write: WritePtr = directory.open_write(Path::new("test")).unwrap();
|
||||
let mut serializer = FastFieldSerializer::from_write(write).unwrap();
|
||||
let mut serializer = CompositeFastFieldSerializer::from_write(write).unwrap();
|
||||
let mut fast_field_writers = FastFieldsWriter::from_schema(&SCHEMA);
|
||||
for _ in 0..10_000 {
|
||||
fast_field_writers.add_document(&doc!(*FIELD=>100_000u64));
|
||||
@@ -332,11 +335,11 @@ mod tests {
|
||||
serializer.close().unwrap();
|
||||
}
|
||||
let file = directory.open_read(&path).unwrap();
|
||||
assert_eq!(file.len(), 34 as usize);
|
||||
assert_eq!(file.len(), 35 as usize);
|
||||
{
|
||||
let fast_fields_composite = CompositeFile::open(&file).unwrap();
|
||||
let data = fast_fields_composite.open_read(*FIELD).unwrap();
|
||||
let fast_field_reader = FastFieldReader::<u64>::open(data)?;
|
||||
let fast_field_reader = DynamicFastFieldReader::<u64>::open(data)?;
|
||||
for doc in 0..10_000 {
|
||||
assert_eq!(fast_field_reader.get(doc), 100_000u64);
|
||||
}
|
||||
@@ -351,7 +354,7 @@ mod tests {
|
||||
|
||||
{
|
||||
let write: WritePtr = directory.open_write(Path::new("test")).unwrap();
|
||||
let mut serializer = FastFieldSerializer::from_write(write).unwrap();
|
||||
let mut serializer = CompositeFastFieldSerializer::from_write(write).unwrap();
|
||||
let mut fast_field_writers = FastFieldsWriter::from_schema(&SCHEMA);
|
||||
// forcing the amplitude to be high
|
||||
fast_field_writers.add_document(&doc!(*FIELD=>0u64));
|
||||
@@ -364,11 +367,11 @@ mod tests {
|
||||
serializer.close().unwrap();
|
||||
}
|
||||
let file = directory.open_read(&path).unwrap();
|
||||
assert_eq!(file.len(), 80042 as usize);
|
||||
assert_eq!(file.len(), 80043 as usize);
|
||||
{
|
||||
let fast_fields_composite = CompositeFile::open(&file)?;
|
||||
let data = fast_fields_composite.open_read(*FIELD).unwrap();
|
||||
let fast_field_reader = FastFieldReader::<u64>::open(data)?;
|
||||
let fast_field_reader = DynamicFastFieldReader::<u64>::open(data)?;
|
||||
assert_eq!(fast_field_reader.get(0), 0u64);
|
||||
for doc in 1..10_001 {
|
||||
assert_eq!(
|
||||
@@ -390,7 +393,7 @@ mod tests {
|
||||
let schema = schema_builder.build();
|
||||
{
|
||||
let write: WritePtr = directory.open_write(Path::new("test")).unwrap();
|
||||
let mut serializer = FastFieldSerializer::from_write(write).unwrap();
|
||||
let mut serializer = CompositeFastFieldSerializer::from_write(write).unwrap();
|
||||
let mut fast_field_writers = FastFieldsWriter::from_schema(&schema);
|
||||
for i in -100i64..10_000i64 {
|
||||
let mut doc = Document::default();
|
||||
@@ -403,11 +406,12 @@ mod tests {
|
||||
serializer.close().unwrap();
|
||||
}
|
||||
let file = directory.open_read(&path).unwrap();
|
||||
assert_eq!(file.len(), 17709 as usize);
|
||||
//assert_eq!(file.len(), 17710 as usize); //bitpacked size
|
||||
assert_eq!(file.len(), 10175 as usize); // linear interpol size
|
||||
{
|
||||
let fast_fields_composite = CompositeFile::open(&file)?;
|
||||
let data = fast_fields_composite.open_read(i64_field).unwrap();
|
||||
let fast_field_reader = FastFieldReader::<i64>::open(data)?;
|
||||
let fast_field_reader = DynamicFastFieldReader::<i64>::open(data)?;
|
||||
|
||||
assert_eq!(fast_field_reader.min_value(), -100i64);
|
||||
assert_eq!(fast_field_reader.max_value(), 9_999i64);
|
||||
@@ -433,7 +437,7 @@ mod tests {
|
||||
|
||||
{
|
||||
let write: WritePtr = directory.open_write(Path::new("test")).unwrap();
|
||||
let mut serializer = FastFieldSerializer::from_write(write).unwrap();
|
||||
let mut serializer = CompositeFastFieldSerializer::from_write(write).unwrap();
|
||||
let mut fast_field_writers = FastFieldsWriter::from_schema(&schema);
|
||||
let doc = Document::default();
|
||||
fast_field_writers.add_document(&doc);
|
||||
@@ -447,7 +451,7 @@ mod tests {
|
||||
{
|
||||
let fast_fields_composite = CompositeFile::open(&file).unwrap();
|
||||
let data = fast_fields_composite.open_read(i64_field).unwrap();
|
||||
let fast_field_reader = FastFieldReader::<i64>::open(data)?;
|
||||
let fast_field_reader = DynamicFastFieldReader::<i64>::open(data)?;
|
||||
assert_eq!(fast_field_reader.get(0u32), 0i64);
|
||||
}
|
||||
Ok(())
|
||||
@@ -468,7 +472,7 @@ mod tests {
|
||||
let directory = RamDirectory::create();
|
||||
{
|
||||
let write: WritePtr = directory.open_write(Path::new("test"))?;
|
||||
let mut serializer = FastFieldSerializer::from_write(write)?;
|
||||
let mut serializer = CompositeFastFieldSerializer::from_write(write)?;
|
||||
let mut fast_field_writers = FastFieldsWriter::from_schema(&SCHEMA);
|
||||
for &x in &permutation {
|
||||
fast_field_writers.add_document(&doc!(*FIELD=>x));
|
||||
@@ -480,7 +484,7 @@ mod tests {
|
||||
{
|
||||
let fast_fields_composite = CompositeFile::open(&file)?;
|
||||
let data = fast_fields_composite.open_read(*FIELD).unwrap();
|
||||
let fast_field_reader = FastFieldReader::<u64>::open(data)?;
|
||||
let fast_field_reader = DynamicFastFieldReader::<u64>::open(data)?;
|
||||
|
||||
let mut a = 0u64;
|
||||
for _ in 0..n {
|
||||
@@ -624,7 +628,7 @@ mod bench {
|
||||
let directory: RamDirectory = RamDirectory::create();
|
||||
{
|
||||
let write: WritePtr = directory.open_write(Path::new("test")).unwrap();
|
||||
let mut serializer = FastFieldSerializer::from_write(write).unwrap();
|
||||
let mut serializer = CompositeFastFieldSerializer::from_write(write).unwrap();
|
||||
let mut fast_field_writers = FastFieldsWriter::from_schema(&SCHEMA);
|
||||
for &x in &permutation {
|
||||
fast_field_writers.add_document(&doc!(*FIELD=>x));
|
||||
@@ -638,7 +642,7 @@ mod bench {
|
||||
{
|
||||
let fast_fields_composite = CompositeFile::open(&file).unwrap();
|
||||
let data = fast_fields_composite.open_read(*FIELD).unwrap();
|
||||
let fast_field_reader = FastFieldReader::<u64>::open(data).unwrap();
|
||||
let fast_field_reader = DynamicFastFieldReader::<u64>::open(data).unwrap();
|
||||
|
||||
b.iter(|| {
|
||||
let n = test::black_box(7000u32);
|
||||
@@ -658,7 +662,7 @@ mod bench {
|
||||
let directory: RamDirectory = RamDirectory::create();
|
||||
{
|
||||
let write: WritePtr = directory.open_write(Path::new("test")).unwrap();
|
||||
let mut serializer = FastFieldSerializer::from_write(write).unwrap();
|
||||
let mut serializer = CompositeFastFieldSerializer::from_write(write).unwrap();
|
||||
let mut fast_field_writers = FastFieldsWriter::from_schema(&SCHEMA);
|
||||
for &x in &permutation {
|
||||
fast_field_writers.add_document(&doc!(*FIELD=>x));
|
||||
@@ -672,7 +676,7 @@ mod bench {
|
||||
{
|
||||
let fast_fields_composite = CompositeFile::open(&file).unwrap();
|
||||
let data = fast_fields_composite.open_read(*FIELD).unwrap();
|
||||
let fast_field_reader = FastFieldReader::<u64>::open(data).unwrap();
|
||||
let fast_field_reader = DynamicFastFieldReader::<u64>::open(data).unwrap();
|
||||
|
||||
b.iter(|| {
|
||||
let n = test::black_box(1000u32);
|
||||
|
||||
@@ -1,6 +1,8 @@
|
||||
use std::ops::Range;
|
||||
|
||||
use crate::fastfield::{FastFieldReader, FastValue, MultiValueLength};
|
||||
use crate::fastfield::{
|
||||
BitpackedFastFieldReader, DynamicFastFieldReader, FastFieldReader, FastValue, MultiValueLength,
|
||||
};
|
||||
use crate::DocId;
|
||||
|
||||
/// Reader for a multivalued `u64` fast field.
|
||||
@@ -13,14 +15,14 @@ use crate::DocId;
|
||||
///
|
||||
#[derive(Clone)]
|
||||
pub struct MultiValuedFastFieldReader<Item: FastValue> {
|
||||
idx_reader: FastFieldReader<u64>,
|
||||
vals_reader: FastFieldReader<Item>,
|
||||
idx_reader: DynamicFastFieldReader<u64>,
|
||||
vals_reader: BitpackedFastFieldReader<Item>,
|
||||
}
|
||||
|
||||
impl<Item: FastValue> MultiValuedFastFieldReader<Item> {
|
||||
pub(crate) fn open(
|
||||
idx_reader: FastFieldReader<u64>,
|
||||
vals_reader: FastFieldReader<Item>,
|
||||
idx_reader: DynamicFastFieldReader<u64>,
|
||||
vals_reader: BitpackedFastFieldReader<Item>,
|
||||
) -> MultiValuedFastFieldReader<Item> {
|
||||
MultiValuedFastFieldReader {
|
||||
idx_reader,
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
use crate::fastfield::serializer::FastSingleFieldSerializer;
|
||||
use crate::fastfield::FastFieldSerializer;
|
||||
use crate::fastfield::serializer::BitpackedFastFieldSerializerLegacy;
|
||||
use crate::fastfield::CompositeFastFieldSerializer;
|
||||
use crate::postings::UnorderedTermId;
|
||||
use crate::schema::{Document, Field};
|
||||
use crate::termdict::TermOrdinal;
|
||||
@@ -134,7 +134,7 @@ impl MultiValuedFastFieldWriter {
|
||||
///
|
||||
pub fn serialize(
|
||||
&self,
|
||||
serializer: &mut FastFieldSerializer,
|
||||
serializer: &mut CompositeFastFieldSerializer,
|
||||
mapping_opt: Option<&FnvHashMap<UnorderedTermId, TermOrdinal>>,
|
||||
doc_id_map: Option<&DocIdMapping>,
|
||||
) -> io::Result<()> {
|
||||
@@ -154,7 +154,7 @@ impl MultiValuedFastFieldWriter {
|
||||
}
|
||||
{
|
||||
// writing the values themselves.
|
||||
let mut value_serializer: FastSingleFieldSerializer<'_, _>;
|
||||
let mut value_serializer: BitpackedFastFieldSerializerLegacy<'_, _>;
|
||||
match mapping_opt {
|
||||
Some(mapping) => {
|
||||
value_serializer = serializer.new_u64_fast_field_with_idx(
|
||||
|
||||
@@ -4,47 +4,24 @@ use crate::common::CompositeFile;
|
||||
use crate::directory::FileSlice;
|
||||
use crate::directory::OwnedBytes;
|
||||
use crate::directory::{Directory, RamDirectory, WritePtr};
|
||||
use crate::fastfield::{FastFieldSerializer, FastFieldsWriter};
|
||||
use crate::fastfield::{CompositeFastFieldSerializer, FastFieldsWriter};
|
||||
use crate::schema::Schema;
|
||||
use crate::schema::FAST;
|
||||
use crate::DocId;
|
||||
use fastfield_codecs::bitpacked::BitpackedFastFieldReader as BitpackedReader;
|
||||
use fastfield_codecs::bitpacked::BitpackedFastFieldSerializer;
|
||||
use fastfield_codecs::linearinterpol::LinearInterpolFastFieldReader;
|
||||
use fastfield_codecs::linearinterpol::LinearInterpolFastFieldSerializer;
|
||||
use fastfield_codecs::multilinearinterpol::MultiLinearInterpolFastFieldReader;
|
||||
use fastfield_codecs::multilinearinterpol::MultiLinearInterpolFastFieldSerializer;
|
||||
use fastfield_codecs::FastFieldCodecReader;
|
||||
use fastfield_codecs::FastFieldCodecSerializer;
|
||||
use std::collections::HashMap;
|
||||
use std::marker::PhantomData;
|
||||
use std::path::Path;
|
||||
use tantivy_bitpacker::compute_num_bits;
|
||||
use tantivy_bitpacker::BitUnpacker;
|
||||
|
||||
/// Trait for accessing a fastfield.
|
||||
///
|
||||
/// Depending on the field type, a different
|
||||
/// fast field is required.
|
||||
#[derive(Clone)]
|
||||
pub struct FastFieldReader<Item: FastValue> {
|
||||
bytes: OwnedBytes,
|
||||
bit_unpacker: BitUnpacker,
|
||||
min_value_u64: u64,
|
||||
max_value_u64: u64,
|
||||
_phantom: PhantomData<Item>,
|
||||
}
|
||||
|
||||
impl<Item: FastValue> FastFieldReader<Item> {
|
||||
/// Opens a fast field given a file.
|
||||
pub fn open(file: FileSlice) -> crate::Result<Self> {
|
||||
let mut bytes = file.read_bytes()?;
|
||||
let min_value = u64::deserialize(&mut bytes)?;
|
||||
let amplitude = u64::deserialize(&mut bytes)?;
|
||||
let max_value = min_value + amplitude;
|
||||
let num_bits = compute_num_bits(amplitude);
|
||||
let bit_unpacker = BitUnpacker::new(num_bits);
|
||||
Ok(FastFieldReader {
|
||||
bytes,
|
||||
min_value_u64: min_value,
|
||||
max_value_u64: max_value,
|
||||
bit_unpacker,
|
||||
_phantom: PhantomData,
|
||||
})
|
||||
}
|
||||
|
||||
/// FastFieldReader is the trait to access fast field data.
|
||||
pub trait FastFieldReader<Item: FastValue>: Clone {
|
||||
/// Return the value associated to the given document.
|
||||
///
|
||||
/// This accessor should return as fast as possible.
|
||||
@@ -52,13 +29,154 @@ impl<Item: FastValue> FastFieldReader<Item> {
|
||||
/// # Panics
|
||||
///
|
||||
/// May panic if `doc` is greater than the segment
|
||||
// `maxdoc`.
|
||||
pub fn get(&self, doc: DocId) -> Item {
|
||||
self.get_u64(u64::from(doc))
|
||||
}
|
||||
fn get(&self, doc: DocId) -> Item;
|
||||
|
||||
/// Fills an output buffer with the fast field values
|
||||
/// associated with the `DocId` going from
|
||||
/// `start` to `start + output.len()`.
|
||||
///
|
||||
/// Regardless of the type of `Item`, this method works
|
||||
/// - transmuting the output array
|
||||
/// - extracting the `Item`s as if they were `u64`
|
||||
/// - possibly converting the `u64` value to the right type.
|
||||
///
|
||||
/// # Panics
|
||||
///
|
||||
/// May panic if `start + output.len()` is greater than
|
||||
/// the segment's `maxdoc`.
|
||||
fn get_range(&self, start: DocId, output: &mut [Item]);
|
||||
|
||||
/// Returns the minimum value for this fast field.
|
||||
///
|
||||
/// The max value does not take in account of possible
|
||||
/// deleted document, and should be considered as an upper bound
|
||||
/// of the actual maximum value.
|
||||
fn min_value(&self) -> Item;
|
||||
|
||||
/// Returns the maximum value for this fast field.
|
||||
///
|
||||
/// The max value does not take in account of possible
|
||||
/// deleted document, and should be considered as an upper bound
|
||||
/// of the actual maximum value.
|
||||
fn max_value(&self) -> Item;
|
||||
}
|
||||
|
||||
#[derive(Clone)]
|
||||
/// DynamicFastFieldReader wraps different readers to access
|
||||
/// the various encoded fastfield data
|
||||
///
|
||||
pub enum DynamicFastFieldReader<Item: FastValue> {
|
||||
/// Bitpacked compressed fastfield data.
|
||||
Bitpacked(FastFieldReaderCodecWrapper<Item, BitpackedReader>),
|
||||
/// Linear interpolated values + bitpacked
|
||||
LinearInterpol(FastFieldReaderCodecWrapper<Item, LinearInterpolFastFieldReader>),
|
||||
/// Blockwise linear interpolated values + bitpacked
|
||||
MultiLinearInterpol(FastFieldReaderCodecWrapper<Item, MultiLinearInterpolFastFieldReader>),
|
||||
}
|
||||
|
||||
impl<Item: FastValue> DynamicFastFieldReader<Item> {
|
||||
/// Returns the correct reader wrapped in the `DynamicFastFieldReader` enum for the data.
|
||||
pub fn open(file: FileSlice) -> crate::Result<DynamicFastFieldReader<Item>> {
|
||||
let mut bytes = file.read_bytes()?;
|
||||
let id = bytes.read_u8();
|
||||
|
||||
let reader = match id {
|
||||
BitpackedFastFieldSerializer::ID => {
|
||||
DynamicFastFieldReader::Bitpacked(FastFieldReaderCodecWrapper::<
|
||||
Item,
|
||||
BitpackedReader,
|
||||
>::open_from_bytes(bytes)?)
|
||||
}
|
||||
LinearInterpolFastFieldSerializer::ID => {
|
||||
DynamicFastFieldReader::LinearInterpol(FastFieldReaderCodecWrapper::<
|
||||
Item,
|
||||
LinearInterpolFastFieldReader,
|
||||
>::open_from_bytes(bytes)?)
|
||||
}
|
||||
MultiLinearInterpolFastFieldSerializer::ID => {
|
||||
DynamicFastFieldReader::MultiLinearInterpol(FastFieldReaderCodecWrapper::<
|
||||
Item,
|
||||
MultiLinearInterpolFastFieldReader,
|
||||
>::open_from_bytes(
|
||||
bytes
|
||||
)?)
|
||||
}
|
||||
_ => {
|
||||
panic!(
|
||||
"unknown fastfield id {:?}. Data corrupted or using old tantivy version.",
|
||||
id
|
||||
)
|
||||
}
|
||||
};
|
||||
Ok(reader)
|
||||
}
|
||||
}
|
||||
|
||||
impl<Item: FastValue> FastFieldReader<Item> for DynamicFastFieldReader<Item> {
|
||||
fn get(&self, doc: DocId) -> Item {
|
||||
match self {
|
||||
Self::Bitpacked(reader) => reader.get(doc),
|
||||
Self::LinearInterpol(reader) => reader.get(doc),
|
||||
Self::MultiLinearInterpol(reader) => reader.get(doc),
|
||||
}
|
||||
}
|
||||
fn get_range(&self, start: DocId, output: &mut [Item]) {
|
||||
match self {
|
||||
Self::Bitpacked(reader) => reader.get_range(start, output),
|
||||
Self::LinearInterpol(reader) => reader.get_range(start, output),
|
||||
Self::MultiLinearInterpol(reader) => reader.get_range(start, output),
|
||||
}
|
||||
}
|
||||
fn min_value(&self) -> Item {
|
||||
match self {
|
||||
Self::Bitpacked(reader) => reader.min_value(),
|
||||
Self::LinearInterpol(reader) => reader.min_value(),
|
||||
Self::MultiLinearInterpol(reader) => reader.min_value(),
|
||||
}
|
||||
}
|
||||
fn max_value(&self) -> Item {
|
||||
match self {
|
||||
Self::Bitpacked(reader) => reader.max_value(),
|
||||
Self::LinearInterpol(reader) => reader.max_value(),
|
||||
Self::MultiLinearInterpol(reader) => reader.max_value(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Wrapper for accessing a fastfield.
|
||||
///
|
||||
/// Holds the data and the codec used to read the data.
|
||||
///
|
||||
#[derive(Clone)]
|
||||
pub struct FastFieldReaderCodecWrapper<Item: FastValue, CodecReader> {
|
||||
// Codec-specific reader; it is built from `bytes` and decodes a `u64` per document.
reader: CodecReader,
|
||||
// Encoded fast field data, handed to `reader` on every lookup.
bytes: OwnedBytes,
|
||||
// Marks the value type that decoded `u64`s are converted to (via `Item::from_u64`).
_phantom: PhantomData<Item>,
|
||||
}
|
||||
|
||||
impl<Item: FastValue, C: FastFieldCodecReader> FastFieldReaderCodecWrapper<Item, C> {
|
||||
/// Opens a fast field given a file.
|
||||
pub fn open(file: FileSlice) -> crate::Result<Self> {
|
||||
let mut bytes = file.read_bytes()?;
|
||||
let id = u8::deserialize(&mut bytes)?;
|
||||
assert_eq!(
|
||||
BitpackedFastFieldSerializer::ID,
|
||||
id,
|
||||
"Tried to open fast field as bitpacked encoded (id=1), but got serializer with different id"
|
||||
);
|
||||
Self::open_from_bytes(bytes)
|
||||
}
|
||||
/// Opens a fast field given the bytes.
|
||||
pub fn open_from_bytes(bytes: OwnedBytes) -> crate::Result<Self> {
|
||||
let reader = C::open_from_bytes(bytes.as_slice())?;
|
||||
Ok(FastFieldReaderCodecWrapper {
|
||||
reader,
|
||||
bytes,
|
||||
_phantom: PhantomData,
|
||||
})
|
||||
}
|
||||
pub(crate) fn get_u64(&self, doc: u64) -> Item {
|
||||
Item::from_u64(self.min_value_u64 + self.bit_unpacker.get(doc, &self.bytes))
|
||||
Item::from_u64(self.reader.get_u64(doc, self.bytes.as_slice()))
|
||||
}
|
||||
|
||||
/// Internally `multivalued` also use SingleValue Fast fields.
|
||||
@@ -78,6 +196,22 @@ impl<Item: FastValue> FastFieldReader<Item> {
|
||||
*out = self.get_u64(start + (i as u64));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<Item: FastValue, C: FastFieldCodecReader + Clone> FastFieldReader<Item>
|
||||
for FastFieldReaderCodecWrapper<Item, C>
|
||||
{
|
||||
/// Return the value associated to the given document.
|
||||
///
|
||||
/// This accessor should return as fast as possible.
|
||||
///
|
||||
/// # Panics
|
||||
///
|
||||
/// May panic if `doc` is greater than the segment
|
||||
/// `maxdoc`.
|
||||
fn get(&self, doc: DocId) -> Item {
|
||||
self.get_u64(u64::from(doc))
|
||||
}
|
||||
|
||||
/// Fills an output buffer with the fast field values
|
||||
/// associated with the `DocId` going from
|
||||
@@ -92,7 +226,7 @@ impl<Item: FastValue> FastFieldReader<Item> {
|
||||
///
|
||||
/// May panic if `start + output.len()` is greater than
|
||||
/// the segment's `maxdoc`.
|
||||
pub fn get_range(&self, start: DocId, output: &mut [Item]) {
|
||||
fn get_range(&self, start: DocId, output: &mut [Item]) {
|
||||
self.get_range_u64(u64::from(start), output);
|
||||
}
|
||||
|
||||
@@ -101,8 +235,8 @@ impl<Item: FastValue> FastFieldReader<Item> {
|
||||
/// The min value does not take into account possible
|
||||
/// deleted documents, and should be considered as a lower bound
|
||||
/// of the actual minimum value.
|
||||
pub fn min_value(&self) -> Item {
|
||||
Item::from_u64(self.min_value_u64)
|
||||
fn min_value(&self) -> Item {
|
||||
Item::from_u64(self.reader.min_value())
|
||||
}
|
||||
|
||||
/// Returns the maximum value for this fast field.
|
||||
@@ -110,13 +244,15 @@ impl<Item: FastValue> FastFieldReader<Item> {
|
||||
/// The max value does not take in account of possible
|
||||
/// deleted document, and should be considered as an upper bound
|
||||
/// of the actual maximum value.
|
||||
pub fn max_value(&self) -> Item {
|
||||
Item::from_u64(self.max_value_u64)
|
||||
fn max_value(&self) -> Item {
|
||||
Item::from_u64(self.reader.max_value())
|
||||
}
|
||||
}
|
||||
|
||||
impl<Item: FastValue> From<Vec<Item>> for FastFieldReader<Item> {
|
||||
fn from(vals: Vec<Item>) -> FastFieldReader<Item> {
|
||||
pub(crate) type BitpackedFastFieldReader<Item> = FastFieldReaderCodecWrapper<Item, BitpackedReader>;
|
||||
|
||||
impl<Item: FastValue> From<Vec<Item>> for DynamicFastFieldReader<Item> {
|
||||
fn from(vals: Vec<Item>) -> DynamicFastFieldReader<Item> {
|
||||
let mut schema_builder = Schema::builder();
|
||||
let field = schema_builder.add_u64_field("field", FAST);
|
||||
let schema = schema_builder.build();
|
||||
@@ -126,7 +262,7 @@ impl<Item: FastValue> From<Vec<Item>> for FastFieldReader<Item> {
|
||||
let write: WritePtr = directory
|
||||
.open_write(path)
|
||||
.expect("With a RamDirectory, this should never fail.");
|
||||
let mut serializer = FastFieldSerializer::from_write(write)
|
||||
let mut serializer = CompositeFastFieldSerializer::from_write(write)
|
||||
.expect("With a RamDirectory, this should never fail.");
|
||||
let mut fast_field_writers = FastFieldsWriter::from_schema(&schema);
|
||||
{
|
||||
@@ -148,6 +284,6 @@ impl<Item: FastValue> From<Vec<Item>> for FastFieldReader<Item> {
|
||||
let field_file = composite_file
|
||||
.open_read(field)
|
||||
.expect("File component not found");
|
||||
FastFieldReader::open(field_file).unwrap()
|
||||
DynamicFastFieldReader::open(field_file).unwrap()
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,13 +1,15 @@
|
||||
use crate::common::CompositeFile;
|
||||
use crate::directory::FileSlice;
|
||||
use crate::fastfield::MultiValuedFastFieldReader;
|
||||
use crate::fastfield::{BitpackedFastFieldReader, FastFieldNotAvailableError};
|
||||
use crate::fastfield::{BytesFastFieldReader, FastValue};
|
||||
use crate::fastfield::{FastFieldNotAvailableError, FastFieldReader};
|
||||
use crate::schema::{Cardinality, Field, FieldType, Schema};
|
||||
use crate::space_usage::PerFieldSpaceUsage;
|
||||
use crate::TantivyError;
|
||||
|
||||
/// Provides access to all of the FastFieldReader.
|
||||
use super::reader::DynamicFastFieldReader;
|
||||
|
||||
/// Provides access to all of the BitpackedFastFieldReader.
|
||||
///
|
||||
/// Internally, `FastFieldReaders` have preloaded fast field readers,
|
||||
/// and just wraps several `HashMap`.
|
||||
@@ -100,27 +102,26 @@ impl FastFieldReaders {
|
||||
pub(crate) fn typed_fast_field_reader<TFastValue: FastValue>(
|
||||
&self,
|
||||
field: Field,
|
||||
) -> crate::Result<FastFieldReader<TFastValue>> {
|
||||
) -> crate::Result<DynamicFastFieldReader<TFastValue>> {
|
||||
let fast_field_slice = self.fast_field_data(field, 0)?;
|
||||
FastFieldReader::open(fast_field_slice)
|
||||
DynamicFastFieldReader::open(fast_field_slice)
|
||||
}
|
||||
|
||||
pub(crate) fn typed_fast_field_multi_reader<TFastValue: FastValue>(
|
||||
&self,
|
||||
field: Field,
|
||||
) -> crate::Result<MultiValuedFastFieldReader<TFastValue>> {
|
||||
let fast_field_slice_idx = self.fast_field_data(field, 0)?;
|
||||
let idx_reader = self.typed_fast_field_reader(field)?;
|
||||
let fast_field_slice_vals = self.fast_field_data(field, 1)?;
|
||||
let idx_reader = FastFieldReader::open(fast_field_slice_idx)?;
|
||||
let vals_reader: FastFieldReader<TFastValue> =
|
||||
FastFieldReader::open(fast_field_slice_vals)?;
|
||||
let vals_reader: BitpackedFastFieldReader<TFastValue> =
|
||||
BitpackedFastFieldReader::open(fast_field_slice_vals)?;
|
||||
Ok(MultiValuedFastFieldReader::open(idx_reader, vals_reader))
|
||||
}
|
||||
|
||||
/// Returns the `u64` fast field reader associated to `field`.
|
||||
///
|
||||
/// If `field` is not a u64 fast field, this method returns an Error.
|
||||
pub fn u64(&self, field: Field) -> crate::Result<FastFieldReader<u64>> {
|
||||
pub fn u64(&self, field: Field) -> crate::Result<DynamicFastFieldReader<u64>> {
|
||||
self.check_type(field, FastType::U64, Cardinality::SingleValue)?;
|
||||
self.typed_fast_field_reader(field)
|
||||
}
|
||||
@@ -129,14 +130,14 @@ impl FastFieldReaders {
|
||||
/// field is effectively of type `u64` or not.
|
||||
///
|
||||
/// If not, the fastfield reader will return the u64-value associated to the original FastValue.
|
||||
pub fn u64_lenient(&self, field: Field) -> crate::Result<FastFieldReader<u64>> {
|
||||
pub fn u64_lenient(&self, field: Field) -> crate::Result<DynamicFastFieldReader<u64>> {
|
||||
self.typed_fast_field_reader(field)
|
||||
}
|
||||
|
||||
/// Returns the `i64` fast field reader associated to `field`.
|
||||
///
|
||||
/// If `field` is not a i64 fast field, this method returns an Error.
|
||||
pub fn i64(&self, field: Field) -> crate::Result<FastFieldReader<i64>> {
|
||||
pub fn i64(&self, field: Field) -> crate::Result<DynamicFastFieldReader<i64>> {
|
||||
self.check_type(field, FastType::I64, Cardinality::SingleValue)?;
|
||||
self.typed_fast_field_reader(field)
|
||||
}
|
||||
@@ -144,7 +145,7 @@ impl FastFieldReaders {
|
||||
/// Returns the `date` fast field reader associated to `field`.
|
||||
///
|
||||
/// If `field` is not a date fast field, this method returns an Error.
|
||||
pub fn date(&self, field: Field) -> crate::Result<FastFieldReader<crate::DateTime>> {
|
||||
pub fn date(&self, field: Field) -> crate::Result<DynamicFastFieldReader<crate::DateTime>> {
|
||||
self.check_type(field, FastType::Date, Cardinality::SingleValue)?;
|
||||
self.typed_fast_field_reader(field)
|
||||
}
|
||||
@@ -152,7 +153,7 @@ impl FastFieldReaders {
|
||||
/// Returns the `f64` fast field reader associated to `field`.
|
||||
///
|
||||
/// If `field` is not a f64 fast field, this method returns an Error.
|
||||
pub fn f64(&self, field: Field) -> crate::Result<FastFieldReader<f64>> {
|
||||
pub fn f64(&self, field: Field) -> crate::Result<DynamicFastFieldReader<f64>> {
|
||||
self.check_type(field, FastType::F64, Cardinality::SingleValue)?;
|
||||
self.typed_fast_field_reader(field)
|
||||
}
|
||||
@@ -213,7 +214,7 @@ impl FastFieldReaders {
|
||||
)));
|
||||
}
|
||||
let fast_field_idx_file = self.fast_field_data(field, 0)?;
|
||||
let idx_reader = FastFieldReader::open(fast_field_idx_file)?;
|
||||
let idx_reader = BitpackedFastFieldReader::open(fast_field_idx_file)?;
|
||||
let data = self.fast_field_data(field, 1)?;
|
||||
BytesFastFieldReader::open(idx_reader, data)
|
||||
} else {
|
||||
|
||||
@@ -1,142 +0,0 @@
|
||||
use crate::common::BinarySerializable;
|
||||
use crate::common::CompositeWrite;
|
||||
use crate::common::CountingWriter;
|
||||
use crate::directory::WritePtr;
|
||||
use crate::schema::Field;
|
||||
use std::io::{self, Write};
|
||||
use tantivy_bitpacker::compute_num_bits;
|
||||
use tantivy_bitpacker::BitPacker;
|
||||
|
||||
/// `FastFieldSerializer` is in charge of serializing
|
||||
/// fastfields on disk.
|
||||
///
|
||||
/// Fast fields are encoded using bit-packing.
|
||||
///
|
||||
/// `FastFieldWriter`s are in charge of pushing the data to
|
||||
/// the serializer.
|
||||
/// The serializer expects to receive the following calls.
|
||||
///
|
||||
/// * `new_u64_fast_field(...)`
|
||||
/// * `add_val(...)`
|
||||
/// * `add_val(...)`
|
||||
/// * `add_val(...)`
|
||||
/// * ...
|
||||
/// * `close_field()`
|
||||
/// * `new_u64_fast_field(...)`
|
||||
/// * `add_val(...)`
|
||||
/// * ...
|
||||
/// * `close_field()`
|
||||
/// * `close()`
|
||||
pub struct FastFieldSerializer {
|
||||
composite_write: CompositeWrite<WritePtr>,
|
||||
}
|
||||
|
||||
impl FastFieldSerializer {
|
||||
/// Constructor
|
||||
pub fn from_write(write: WritePtr) -> io::Result<FastFieldSerializer> {
|
||||
// just making room for the pointer to header.
|
||||
let composite_write = CompositeWrite::wrap(write);
|
||||
Ok(FastFieldSerializer { composite_write })
|
||||
}
|
||||
|
||||
/// Start serializing a new u64 fast field
|
||||
pub fn new_u64_fast_field(
|
||||
&mut self,
|
||||
field: Field,
|
||||
min_value: u64,
|
||||
max_value: u64,
|
||||
) -> io::Result<FastSingleFieldSerializer<'_, CountingWriter<WritePtr>>> {
|
||||
self.new_u64_fast_field_with_idx(field, min_value, max_value, 0)
|
||||
}
|
||||
|
||||
/// Start serializing a new u64 fast field
|
||||
pub fn new_u64_fast_field_with_idx(
|
||||
&mut self,
|
||||
field: Field,
|
||||
min_value: u64,
|
||||
max_value: u64,
|
||||
idx: usize,
|
||||
) -> io::Result<FastSingleFieldSerializer<'_, CountingWriter<WritePtr>>> {
|
||||
let field_write = self.composite_write.for_field_with_idx(field, idx);
|
||||
FastSingleFieldSerializer::open(field_write, min_value, max_value)
|
||||
}
|
||||
|
||||
/// Start serializing a new [u8] fast field
|
||||
pub fn new_bytes_fast_field_with_idx(
|
||||
&mut self,
|
||||
field: Field,
|
||||
idx: usize,
|
||||
) -> FastBytesFieldSerializer<'_, CountingWriter<WritePtr>> {
|
||||
let field_write = self.composite_write.for_field_with_idx(field, idx);
|
||||
FastBytesFieldSerializer { write: field_write }
|
||||
}
|
||||
|
||||
/// Closes the serializer
|
||||
///
|
||||
/// After this call the data must be persistently saved on disk.
|
||||
pub fn close(self) -> io::Result<()> {
|
||||
self.composite_write.close()
|
||||
}
|
||||
}
|
||||
|
||||
pub struct FastSingleFieldSerializer<'a, W: Write> {
|
||||
bit_packer: BitPacker,
|
||||
write: &'a mut W,
|
||||
min_value: u64,
|
||||
num_bits: u8,
|
||||
}
|
||||
|
||||
impl<'a, W: Write> FastSingleFieldSerializer<'a, W> {
|
||||
/// Creates a new fast field serializer.
|
||||
///
|
||||
/// The serializer in fact encode the values by bitpacking
|
||||
/// `(val - min_value)`.
|
||||
///
|
||||
/// It requires a `min_value` and a `max_value` to compute
|
||||
/// the minimum number of bits required to encode
|
||||
/// values.
|
||||
fn open(
|
||||
write: &'a mut W,
|
||||
min_value: u64,
|
||||
max_value: u64,
|
||||
) -> io::Result<FastSingleFieldSerializer<'a, W>> {
|
||||
assert!(min_value <= max_value);
|
||||
min_value.serialize(write)?;
|
||||
let amplitude = max_value - min_value;
|
||||
amplitude.serialize(write)?;
|
||||
let num_bits = compute_num_bits(amplitude);
|
||||
let bit_packer = BitPacker::new();
|
||||
Ok(FastSingleFieldSerializer {
|
||||
bit_packer,
|
||||
write,
|
||||
min_value,
|
||||
num_bits,
|
||||
})
|
||||
}
|
||||
|
||||
/// Pushes a new value to the currently open u64 fast field.
|
||||
pub fn add_val(&mut self, val: u64) -> io::Result<()> {
|
||||
let val_to_write: u64 = val - self.min_value;
|
||||
self.bit_packer
|
||||
.write(val_to_write, self.num_bits, &mut self.write)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn close_field(mut self) -> io::Result<()> {
|
||||
self.bit_packer.close(&mut self.write)
|
||||
}
|
||||
}
|
||||
|
||||
pub struct FastBytesFieldSerializer<'a, W: Write> {
|
||||
write: &'a mut W,
|
||||
}
|
||||
|
||||
impl<'a, W: Write> FastBytesFieldSerializer<'a, W> {
|
||||
pub fn write_all(&mut self, vals: &[u8]) -> io::Result<()> {
|
||||
self.write.write_all(vals)
|
||||
}
|
||||
|
||||
pub fn flush(&mut self) -> io::Result<()> {
|
||||
self.write.flush()
|
||||
}
|
||||
}
|
||||
203
src/fastfield/serializer/mod.rs
Normal file
203
src/fastfield/serializer/mod.rs
Normal file
@@ -0,0 +1,203 @@
|
||||
use crate::common::BinarySerializable;
|
||||
use crate::common::CompositeWrite;
|
||||
use crate::common::CountingWriter;
|
||||
use crate::directory::WritePtr;
|
||||
use crate::schema::Field;
|
||||
pub use fastfield_codecs::bitpacked::BitpackedFastFieldSerializer;
|
||||
pub use fastfield_codecs::bitpacked::BitpackedFastFieldSerializerLegacy;
|
||||
use fastfield_codecs::linearinterpol::LinearInterpolFastFieldSerializer;
|
||||
use fastfield_codecs::multilinearinterpol::MultiLinearInterpolFastFieldSerializer;
|
||||
pub use fastfield_codecs::FastFieldCodecSerializer;
|
||||
pub use fastfield_codecs::FastFieldDataAccess;
|
||||
pub use fastfield_codecs::FastFieldStats;
|
||||
use std::io::{self, Write};
|
||||
|
||||
/// `CompositeFastFieldSerializer` is in charge of serializing
|
||||
/// fastfields on disk.
|
||||
///
|
||||
/// Fast fields have different encodings like bit-packing.
|
||||
///
|
||||
/// `FastFieldWriter`s are in charge of pushing the data to
|
||||
/// the serializer.
|
||||
/// The serializer expects to receive the following calls.
|
||||
///
|
||||
/// * `new_u64_fast_field(...)`
|
||||
/// * `add_val(...)`
|
||||
/// * `add_val(...)`
|
||||
/// * `add_val(...)`
|
||||
/// * ...
|
||||
/// * `close_field()`
|
||||
/// * `new_u64_fast_field(...)`
|
||||
/// * `add_val(...)`
|
||||
/// * ...
|
||||
/// * `close_field()`
|
||||
/// * `close()`
|
||||
pub struct CompositeFastFieldSerializer {
|
||||
composite_write: CompositeWrite<WritePtr>,
|
||||
}
|
||||
|
||||
// Records codec `T`'s estimate for the given data in `estimations`.
//
// Nothing is recorded when `T::is_applicable` rejects the data; otherwise the tuple
// `(T::estimate(..), T::NAME, T::ID)` is appended.
//
// NOTE(review): the explicit `A` type parameter presumably exists only because
// `impl Trait` arguments cannot be combined with an explicit `T`; revisit once
// `explicit_generic_args_with_impl_trait` is merged and stabilized:
|
||||
// https://github.com/rust-lang/rust/pull/86176
|
||||
fn codec_estimation<T: FastFieldCodecSerializer, A: FastFieldDataAccess>(
|
||||
stats: FastFieldStats,
|
||||
fastfield_accessor: &A,
|
||||
estimations: &mut Vec<(f32, &str, u8)>,
|
||||
) {
|
||||
// Codecs that cannot handle this data contribute no estimate at all.
if !T::is_applicable(fastfield_accessor, stats.clone()) {
|
||||
return;
|
||||
}
|
||||
let (ratio, name, id) = (
|
||||
T::estimate(fastfield_accessor, stats.clone()),
|
||||
T::NAME,
|
||||
T::ID,
|
||||
);
|
||||
estimations.push((ratio, name, id));
|
||||
}
|
||||
|
||||
impl CompositeFastFieldSerializer {
|
||||
/// Constructor
|
||||
pub fn from_write(write: WritePtr) -> io::Result<CompositeFastFieldSerializer> {
|
||||
// just making room for the pointer to header.
|
||||
let composite_write = CompositeWrite::wrap(write);
|
||||
Ok(CompositeFastFieldSerializer { composite_write })
|
||||
}
|
||||
|
||||
/// Serialize data into a new u64 fast field. The best compression codec will be chosen automatically.
///
/// Every known codec contributes an estimate. `NaN` and `f32::MAX` estimates are
/// discarded and the codec with the smallest remaining estimate is used. The codec id
/// is written first, followed by the codec's own serialization of the data.
///
/// # Panics
///
/// Panics if no codec yields a usable estimate, or if the selected codec name is unknown.
|
||||
pub fn create_auto_detect_u64_fast_field(
|
||||
&mut self,
|
||||
field: Field,
|
||||
stats: FastFieldStats,
|
||||
fastfield_accessor: impl FastFieldDataAccess,
|
||||
data_iter_1: impl Iterator<Item = u64>,
|
||||
data_iter_2: impl Iterator<Item = u64>,
|
||||
) -> io::Result<()> {
|
||||
let field_write = self.composite_write.for_field_with_idx(field, 0);
|
||||
|
||||
let mut estimations = vec![];
|
||||
|
||||
codec_estimation::<BitpackedFastFieldSerializer, _>(
|
||||
stats.clone(),
|
||||
&fastfield_accessor,
|
||||
&mut estimations,
|
||||
);
|
||||
codec_estimation::<LinearInterpolFastFieldSerializer, _>(
|
||||
stats.clone(),
|
||||
&fastfield_accessor,
|
||||
&mut estimations,
|
||||
);
|
||||
codec_estimation::<MultiLinearInterpolFastFieldSerializer, _>(
|
||||
stats.clone(),
|
||||
&fastfield_accessor,
|
||||
&mut estimations,
|
||||
);
|
||||
// NaN never compares equal to anything (itself included), so `== f32::NAN` can
// never match; `is_nan()` is required to actually detect a broken estimation.
if let Some(broken_estimation) = estimations
|
||||
.iter()
|
||||
.find(|estimation| estimation.0.is_nan())
|
||||
{
|
||||
warn!(
|
||||
"broken estimation for fast field codec {}",
|
||||
broken_estimation.1
|
||||
);
|
||||
}
|
||||
// removing nan values for codecs with broken calculations, and max values which disables codecs
|
||||
estimations.retain(|estimation| !estimation.0.is_nan() && estimation.0 != f32::MAX);
|
||||
// NaN values were removed above, so `partial_cmp` cannot return `None` here.
estimations.sort_by(|a, b| a.0.partial_cmp(&b.0).unwrap());
|
||||
let (_ratio, name, id) = estimations[0];
|
||||
debug!(
|
||||
"choosing fast field codec {} for field_id {:?}",
|
||||
name, field
|
||||
); // todo print actual field name
|
||||
id.serialize(field_write)?;
|
||||
match name {
|
||||
BitpackedFastFieldSerializer::NAME => {
|
||||
BitpackedFastFieldSerializer::serialize(
|
||||
field_write,
|
||||
&fastfield_accessor,
|
||||
stats,
|
||||
data_iter_1,
|
||||
data_iter_2,
|
||||
)?;
|
||||
}
|
||||
LinearInterpolFastFieldSerializer::NAME => {
|
||||
LinearInterpolFastFieldSerializer::serialize(
|
||||
field_write,
|
||||
&fastfield_accessor,
|
||||
stats,
|
||||
data_iter_1,
|
||||
data_iter_2,
|
||||
)?;
|
||||
}
|
||||
MultiLinearInterpolFastFieldSerializer::NAME => {
|
||||
MultiLinearInterpolFastFieldSerializer::serialize(
|
||||
field_write,
|
||||
&fastfield_accessor,
|
||||
stats,
|
||||
data_iter_1,
|
||||
data_iter_2,
|
||||
)?;
|
||||
}
|
||||
_ => {
|
||||
panic!("unknown fastfield serializer {}", name)
|
||||
}
|
||||
};
|
||||
field_write.flush()?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Start serializing a new u64 fast field
|
||||
pub fn new_u64_fast_field(
|
||||
&mut self,
|
||||
field: Field,
|
||||
min_value: u64,
|
||||
max_value: u64,
|
||||
) -> io::Result<BitpackedFastFieldSerializerLegacy<'_, CountingWriter<WritePtr>>> {
|
||||
self.new_u64_fast_field_with_idx(field, min_value, max_value, 0)
|
||||
}
|
||||
|
||||
/// Start serializing a new u64 fast field
|
||||
pub fn new_u64_fast_field_with_idx(
|
||||
&mut self,
|
||||
field: Field,
|
||||
min_value: u64,
|
||||
max_value: u64,
|
||||
idx: usize,
|
||||
) -> io::Result<BitpackedFastFieldSerializerLegacy<'_, CountingWriter<WritePtr>>> {
|
||||
let field_write = self.composite_write.for_field_with_idx(field, idx);
|
||||
// Prepend codec id to field data for compatibility with DynamicFastFieldReader.
|
||||
let id = BitpackedFastFieldSerializer::ID;
|
||||
id.serialize(field_write)?;
|
||||
BitpackedFastFieldSerializerLegacy::open(field_write, min_value, max_value)
|
||||
}
|
||||
|
||||
/// Start serializing a new [u8] fast field
|
||||
pub fn new_bytes_fast_field_with_idx(
|
||||
&mut self,
|
||||
field: Field,
|
||||
idx: usize,
|
||||
) -> FastBytesFieldSerializer<'_, CountingWriter<WritePtr>> {
|
||||
let field_write = self.composite_write.for_field_with_idx(field, idx);
|
||||
FastBytesFieldSerializer { write: field_write }
|
||||
}
|
||||
|
||||
/// Closes the serializer
|
||||
///
|
||||
/// After this call the data must be persistently saved on disk.
|
||||
pub fn close(self) -> io::Result<()> {
|
||||
self.composite_write.close()
|
||||
}
|
||||
}
|
||||
|
||||
pub struct FastBytesFieldSerializer<'a, W: Write> {
|
||||
write: &'a mut W,
|
||||
}
|
||||
|
||||
impl<'a, W: Write> FastBytesFieldSerializer<'a, W> {
|
||||
pub fn write_all(&mut self, vals: &[u8]) -> io::Result<()> {
|
||||
self.write.write_all(vals)
|
||||
}
|
||||
|
||||
pub fn flush(&mut self) -> io::Result<()> {
|
||||
self.write.flush()
|
||||
}
|
||||
}
|
||||
@@ -1,10 +1,13 @@
|
||||
use super::multivalued::MultiValuedFastFieldWriter;
|
||||
use super::serializer::FastFieldStats;
|
||||
use super::FastFieldDataAccess;
|
||||
use crate::common;
|
||||
use crate::fastfield::{BytesFastFieldWriter, FastFieldSerializer};
|
||||
use crate::fastfield::{BytesFastFieldWriter, CompositeFastFieldSerializer};
|
||||
use crate::indexer::doc_id_mapping::DocIdMapping;
|
||||
use crate::postings::UnorderedTermId;
|
||||
use crate::schema::{Cardinality, Document, Field, FieldEntry, FieldType, Schema};
|
||||
use crate::termdict::TermOrdinal;
|
||||
use crate::DocId;
|
||||
use fnv::FnvHashMap;
|
||||
use std::collections::HashMap;
|
||||
use std::io;
|
||||
@@ -148,7 +151,7 @@ impl FastFieldsWriter {
|
||||
/// order to the fast field serializer.
|
||||
pub fn serialize(
|
||||
&self,
|
||||
serializer: &mut FastFieldSerializer,
|
||||
serializer: &mut CompositeFastFieldSerializer,
|
||||
mapping: &HashMap<Field, FnvHashMap<UnorderedTermId, TermOrdinal>>,
|
||||
doc_id_map: Option<&DocIdMapping>,
|
||||
) -> io::Result<()> {
|
||||
@@ -264,15 +267,15 @@ impl IntFastFieldWriter {
|
||||
self.add_val(val);
|
||||
}
|
||||
|
||||
/// Extract the stored data
|
||||
pub(crate) fn get_data(&self) -> Vec<u64> {
|
||||
self.vals.iter().collect::<Vec<u64>>()
|
||||
/// get iterator over the data
|
||||
pub(crate) fn iter(&self) -> impl Iterator<Item = u64> + '_ {
|
||||
self.vals.iter()
|
||||
}
|
||||
|
||||
/// Push the fast fields value to the `FastFieldWriter`.
|
||||
pub fn serialize(
|
||||
&self,
|
||||
serializer: &mut FastFieldSerializer,
|
||||
serializer: &mut CompositeFastFieldSerializer,
|
||||
doc_id_map: Option<&DocIdMapping>,
|
||||
) -> io::Result<()> {
|
||||
let (min, max) = if self.val_min > self.val_max {
|
||||
@@ -280,17 +283,58 @@ impl IntFastFieldWriter {
|
||||
} else {
|
||||
(self.val_min, self.val_max)
|
||||
};
|
||||
let mut single_field_serializer = serializer.new_u64_fast_field(self.field, min, max)?;
|
||||
if let Some(doc_id_map) = doc_id_map {
|
||||
for doc_id in doc_id_map.iter_old_doc_ids() {
|
||||
single_field_serializer.add_val(self.vals.get(*doc_id as usize))?;
|
||||
}
|
||||
} else {
|
||||
for val in self.vals.iter() {
|
||||
single_field_serializer.add_val(val)?;
|
||||
}
|
||||
let fastfield_accessor = WriterFastFieldAccessProvider {
|
||||
doc_id_map,
|
||||
vals: &self.vals,
|
||||
};
|
||||
let stats = FastFieldStats {
|
||||
min_value: min,
|
||||
max_value: max,
|
||||
num_vals: self.val_count as u64,
|
||||
};
|
||||
|
||||
single_field_serializer.close_field()
|
||||
if let Some(doc_id_map) = doc_id_map {
|
||||
let iter = doc_id_map
|
||||
.iter_old_doc_ids()
|
||||
.map(|doc_id| self.vals.get(*doc_id as usize));
|
||||
serializer.create_auto_detect_u64_fast_field(
|
||||
self.field,
|
||||
stats,
|
||||
fastfield_accessor,
|
||||
iter.clone(),
|
||||
iter,
|
||||
)?;
|
||||
} else {
|
||||
serializer.create_auto_detect_u64_fast_field(
|
||||
self.field,
|
||||
stats,
|
||||
fastfield_accessor,
|
||||
self.vals.iter(),
|
||||
self.vals.iter(),
|
||||
)?;
|
||||
};
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
// Gives random access to the values buffered in a writer's `BlockedBitpacker`,
// optionally going through a doc id remapping first.
#[derive(Clone)]
|
||||
struct WriterFastFieldAccessProvider<'map, 'bitp> {
|
||||
// When set, requested doc ids are translated with `get_old_doc_id` before the lookup.
doc_id_map: Option<&'map DocIdMapping>,
|
||||
// Buffered values, indexed by the (old) doc id.
vals: &'bitp BlockedBitpacker,
|
||||
}
|
||||
impl<'map, 'bitp> FastFieldDataAccess for WriterFastFieldAccessProvider<'map, 'bitp> {
|
||||
/// Return the value associated to the given document.
|
||||
///
|
||||
/// This accessor should return as fast as possible.
|
||||
///
/// When a `doc_id_map` is present, `doc` is first translated with
/// `get_old_doc_id` and the value stored for that old doc id is returned.
///
|
||||
/// # Panics
|
||||
///
|
||||
/// May panic if `doc` is greater than the segment's `maxdoc`.
|
||||
fn get(&self, doc: DocId) -> u64 {
|
||||
if let Some(doc_id_map) = self.doc_id_map {
|
||||
self.vals.get(doc_id_map.get_old_doc_id(doc) as usize) // consider extra FastFieldReader wrapper for non doc_id_map
|
||||
} else {
|
||||
self.vals.get(doc as usize)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -8,7 +8,6 @@ use crate::{
|
||||
DocId, IndexSortByField, Order, TantivyError,
|
||||
};
|
||||
use std::cmp::Reverse;
|
||||
|
||||
/// Struct to provide mapping from old doc_id to new doc_id and vice versa
|
||||
pub struct DocIdMapping {
|
||||
new_doc_id_to_old: Vec<DocId>,
|
||||
@@ -61,9 +60,8 @@ pub(crate) fn get_doc_id_mapping_from_field(
|
||||
})?;
|
||||
|
||||
// create new doc_id to old doc_id index (used in fast_field_writers)
|
||||
let data = fast_field.get_data();
|
||||
let mut doc_id_and_data = data
|
||||
.into_iter()
|
||||
let mut doc_id_and_data = fast_field
|
||||
.iter()
|
||||
.enumerate()
|
||||
.map(|el| (el.0 as DocId, el.1))
|
||||
.collect::<Vec<_>>();
|
||||
@@ -92,6 +90,7 @@ pub(crate) fn get_doc_id_mapping_from_field(
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests_indexsorting {
|
||||
use crate::fastfield::FastFieldReader;
|
||||
use crate::{collector::TopDocs, query::QueryParser, schema::*};
|
||||
use crate::{schema::Schema, DocAddress};
|
||||
use crate::{Index, IndexSettings, IndexSortByField, Order};
|
||||
@@ -175,6 +174,7 @@ mod tests_indexsorting {
|
||||
field: "my_number".to_string(),
|
||||
order: Order::Asc,
|
||||
}),
|
||||
..Default::default()
|
||||
}),
|
||||
option.clone(),
|
||||
);
|
||||
@@ -206,6 +206,7 @@ mod tests_indexsorting {
|
||||
field: "my_number".to_string(),
|
||||
order: Order::Desc,
|
||||
}),
|
||||
..Default::default()
|
||||
}),
|
||||
option.clone(),
|
||||
);
|
||||
@@ -264,6 +265,7 @@ mod tests_indexsorting {
|
||||
field: "my_number".to_string(),
|
||||
order: Order::Asc,
|
||||
}),
|
||||
..Default::default()
|
||||
}),
|
||||
get_text_options(),
|
||||
);
|
||||
@@ -288,6 +290,7 @@ mod tests_indexsorting {
|
||||
field: "my_number".to_string(),
|
||||
order: Order::Desc,
|
||||
}),
|
||||
..Default::default()
|
||||
}),
|
||||
get_text_options(),
|
||||
);
|
||||
@@ -322,6 +325,7 @@ mod tests_indexsorting {
|
||||
field: "my_number".to_string(),
|
||||
order: Order::Asc,
|
||||
}),
|
||||
..Default::default()
|
||||
}),
|
||||
get_text_options(),
|
||||
);
|
||||
@@ -352,6 +356,7 @@ mod tests_indexsorting {
|
||||
field: "my_number".to_string(),
|
||||
order: Order::Desc,
|
||||
}),
|
||||
..Default::default()
|
||||
}),
|
||||
get_text_options(),
|
||||
);
|
||||
@@ -387,6 +392,7 @@ mod tests_indexsorting {
|
||||
field: "my_number".to_string(),
|
||||
order: Order::Asc,
|
||||
}),
|
||||
..Default::default()
|
||||
}),
|
||||
get_text_options(),
|
||||
);
|
||||
|
||||
@@ -945,7 +945,7 @@ mod tests {
|
||||
let index_writer = index.writer(3_000_000).unwrap();
|
||||
assert_eq!(
|
||||
format!("{:?}", index_writer.get_merge_policy()),
|
||||
"LogMergePolicy { min_merge_size: 8, max_merge_size: 10000000, min_layer_size: 10000, \
|
||||
"LogMergePolicy { min_num_segments: 8, max_docs_before_merge: 10000000, min_layer_size: 10000, \
|
||||
level_log_size: 0.75 }"
|
||||
);
|
||||
let merge_policy = Box::new(NoMergePolicy::default());
|
||||
|
||||
@@ -1,19 +1,20 @@
|
||||
use super::merge_policy::{MergeCandidate, MergePolicy};
|
||||
use crate::core::SegmentMeta;
|
||||
use itertools::Itertools;
|
||||
use std::cmp;
|
||||
use std::f64;
|
||||
|
||||
const DEFAULT_LEVEL_LOG_SIZE: f64 = 0.75;
|
||||
const DEFAULT_MIN_LAYER_SIZE: u32 = 10_000;
|
||||
const DEFAULT_MIN_MERGE_SIZE: usize = 8;
|
||||
const DEFAULT_MAX_MERGE_SIZE: usize = 10_000_000;
|
||||
const DEFAULT_MIN_NUM_SEGMENTS_IN_MERGE: usize = 8;
|
||||
const DEFAULT_MAX_DOCS_BEFORE_MERGE: usize = 10_000_000;
|
||||
|
||||
/// `LogMergePolicy` tries to merge segments that have a similar number of
|
||||
/// documents.
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct LogMergePolicy {
|
||||
min_merge_size: usize,
|
||||
max_merge_size: usize,
|
||||
min_num_segments: usize,
|
||||
max_docs_before_merge: usize,
|
||||
min_layer_size: u32,
|
||||
level_log_size: f64,
|
||||
}
|
||||
@@ -23,15 +24,16 @@ impl LogMergePolicy {
|
||||
cmp::max(self.min_layer_size, size)
|
||||
}
|
||||
|
||||
/// Set the minimum number of segment that may be merge together.
|
||||
pub fn set_min_merge_size(&mut self, min_merge_size: usize) {
|
||||
self.min_merge_size = min_merge_size;
|
||||
/// Set the minimum number of segments that may be merged together.
|
||||
pub fn set_min_num_segments(&mut self, min_num_segments: usize) {
|
||||
self.min_num_segments = min_num_segments;
|
||||
}
|
||||
|
||||
/// Set the maximum number docs in a segment for it to be considered for
|
||||
/// merging.
|
||||
pub fn set_max_merge_size(&mut self, max_merge_size: usize) {
|
||||
self.max_merge_size = max_merge_size;
|
||||
/// merging. A segment can still reach more than max_docs, by merging many
|
||||
/// smaller ones.
|
||||
pub fn set_max_docs_before_merge(&mut self, max_docs_merge_size: usize) {
|
||||
self.max_docs_before_merge = max_docs_merge_size;
|
||||
}
|
||||
|
||||
/// Set the minimum segment size under which all segments belong
|
||||
@@ -42,7 +44,7 @@ impl LogMergePolicy {
|
||||
|
||||
/// Set the ratio between two consecutive levels.
|
||||
///
|
||||
/// Segment are group in levels according to their sizes.
|
||||
/// Segments are grouped in levels according to their sizes.
|
||||
/// These levels are defined as intervals of exponentially growing sizes.
|
||||
/// level_log_size defines the factor by which one should multiply the limit
|
||||
/// to reach a level, in order to get the limit to reach the following
|
||||
@@ -54,52 +56,43 @@ impl LogMergePolicy {
|
||||
|
||||
impl MergePolicy for LogMergePolicy {
|
||||
fn compute_merge_candidates(&self, segments: &[SegmentMeta]) -> Vec<MergeCandidate> {
|
||||
let mut size_sorted_tuples = segments
|
||||
let mut size_sorted_segments = segments
|
||||
.iter()
|
||||
.map(SegmentMeta::num_docs)
|
||||
.enumerate()
|
||||
.filter(|(_, s)| s <= &(self.max_merge_size as u32))
|
||||
.collect::<Vec<(usize, u32)>>();
|
||||
.filter(|segment_meta| segment_meta.num_docs() <= (self.max_docs_before_merge as u32))
|
||||
.collect::<Vec<&SegmentMeta>>();
|
||||
|
||||
size_sorted_tuples.sort_by(|x, y| y.1.cmp(&(x.1)));
|
||||
|
||||
if size_sorted_tuples.len() <= 1 {
|
||||
return Vec::new();
|
||||
}
|
||||
|
||||
let size_sorted_log_tuples: Vec<_> = size_sorted_tuples
|
||||
.into_iter()
|
||||
.map(|(ind, num_docs)| (ind, f64::from(self.clip_min_size(num_docs)).log2()))
|
||||
.collect();
|
||||
|
||||
if let Some(&(first_ind, first_score)) = size_sorted_log_tuples.first() {
|
||||
let mut current_max_log_size = first_score;
|
||||
let mut levels = vec![vec![first_ind]];
|
||||
for &(ind, score) in (&size_sorted_log_tuples).iter().skip(1) {
|
||||
if score < (current_max_log_size - self.level_log_size) {
|
||||
current_max_log_size = score;
|
||||
levels.push(Vec::new());
|
||||
}
|
||||
levels.last_mut().unwrap().push(ind);
|
||||
}
|
||||
levels
|
||||
.iter()
|
||||
.filter(|level| level.len() >= self.min_merge_size)
|
||||
.map(|ind_vec| {
|
||||
MergeCandidate(ind_vec.iter().map(|&ind| segments[ind].id()).collect())
|
||||
})
|
||||
.collect()
|
||||
} else {
|
||||
if size_sorted_segments.len() <= 1 {
|
||||
return vec![];
|
||||
}
|
||||
size_sorted_segments.sort_by_key(|seg| std::cmp::Reverse(seg.num_docs()));
|
||||
|
||||
let mut current_max_log_size = f64::MAX;
|
||||
let mut levels = vec![];
|
||||
for (_, merge_group) in &size_sorted_segments.into_iter().group_by(|segment| {
|
||||
let segment_log_size = f64::from(self.clip_min_size(segment.num_docs())).log2();
|
||||
if segment_log_size < (current_max_log_size - self.level_log_size) {
|
||||
// update current_max_log_size to create a new group
|
||||
current_max_log_size = segment_log_size;
|
||||
}
|
||||
// return current_max_log_size to be grouped to the current group
|
||||
current_max_log_size
|
||||
}) {
|
||||
levels.push(merge_group.collect::<Vec<&SegmentMeta>>());
|
||||
}
|
||||
|
||||
levels
|
||||
.iter()
|
||||
.filter(|level| level.len() >= self.min_num_segments)
|
||||
.map(|segments| MergeCandidate(segments.iter().map(|&seg| seg.id()).collect()))
|
||||
.collect()
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for LogMergePolicy {
|
||||
fn default() -> LogMergePolicy {
|
||||
LogMergePolicy {
|
||||
min_merge_size: DEFAULT_MIN_MERGE_SIZE,
|
||||
max_merge_size: DEFAULT_MAX_MERGE_SIZE,
|
||||
min_num_segments: DEFAULT_MIN_NUM_SEGMENTS_IN_MERGE,
|
||||
max_docs_before_merge: DEFAULT_MAX_DOCS_BEFORE_MERGE,
|
||||
min_layer_size: DEFAULT_MIN_LAYER_SIZE,
|
||||
level_log_size: DEFAULT_LEVEL_LOG_SIZE,
|
||||
}
|
||||
@@ -109,16 +102,79 @@ impl Default for LogMergePolicy {
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use crate::core::{SegmentId, SegmentMeta, SegmentMetaInventory};
|
||||
use crate::indexer::merge_policy::MergePolicy;
|
||||
use crate::{
|
||||
core::{SegmentId, SegmentMeta, SegmentMetaInventory},
|
||||
schema,
|
||||
};
|
||||
use crate::{indexer::merge_policy::MergePolicy, schema::INDEXED};
|
||||
use once_cell::sync::Lazy;
|
||||
|
||||
static INVENTORY: Lazy<SegmentMetaInventory> = Lazy::new(SegmentMetaInventory::default);
|
||||
|
||||
use crate::Index;
|
||||
|
||||
#[test]
|
||||
fn create_index_test_max_merge_issue_1035() {
|
||||
let mut schema_builder = schema::Schema::builder();
|
||||
let int_field = schema_builder.add_u64_field("intval", INDEXED);
|
||||
let schema = schema_builder.build();
|
||||
|
||||
let index = Index::create_in_ram(schema);
|
||||
|
||||
{
|
||||
let mut log_merge_policy = LogMergePolicy::default();
|
||||
log_merge_policy.set_min_num_segments(1);
|
||||
log_merge_policy.set_max_docs_before_merge(1);
|
||||
log_merge_policy.set_min_layer_size(0);
|
||||
|
||||
let mut index_writer = index.writer_for_tests().unwrap();
|
||||
index_writer.set_merge_policy(Box::new(log_merge_policy));
|
||||
|
||||
// after every commit the merge checker is started, it will merge only segments with 1
|
||||
// element in it because of max_docs_before_merge.
|
||||
index_writer.add_document(doc!(int_field=>1_u64));
|
||||
assert!(index_writer.commit().is_ok());
|
||||
|
||||
index_writer.add_document(doc!(int_field=>2_u64));
|
||||
assert!(index_writer.commit().is_ok());
|
||||
|
||||
index_writer.add_document(doc!(int_field=>3_u64));
|
||||
assert!(index_writer.commit().is_ok());
|
||||
|
||||
index_writer.add_document(doc!(int_field=>4_u64));
|
||||
assert!(index_writer.commit().is_ok());
|
||||
|
||||
index_writer.add_document(doc!(int_field=>5_u64));
|
||||
assert!(index_writer.commit().is_ok());
|
||||
|
||||
index_writer.add_document(doc!(int_field=>6_u64));
|
||||
assert!(index_writer.commit().is_ok());
|
||||
|
||||
index_writer.add_document(doc!(int_field=>7_u64));
|
||||
assert!(index_writer.commit().is_ok());
|
||||
|
||||
index_writer.add_document(doc!(int_field=>8_u64));
|
||||
assert!(index_writer.commit().is_ok());
|
||||
}
|
||||
|
||||
let _segment_ids = index
|
||||
.searchable_segment_ids()
|
||||
.expect("Searchable segments failed.");
|
||||
|
||||
let reader = index.reader().unwrap();
|
||||
let searcher = reader.searcher();
|
||||
let segment_readers = searcher.segment_readers();
|
||||
for segment in segment_readers {
|
||||
if segment.num_docs() > 2 {
|
||||
panic!("segment can't have more than two segments");
|
||||
} // don't know how to wait for the merge, then it could be a simple eq
|
||||
}
|
||||
}
|
||||
|
||||
fn test_merge_policy() -> LogMergePolicy {
|
||||
let mut log_merge_policy = LogMergePolicy::default();
|
||||
log_merge_policy.set_min_merge_size(3);
|
||||
log_merge_policy.set_max_merge_size(100_000);
|
||||
log_merge_policy.set_min_num_segments(3);
|
||||
log_merge_policy.set_max_docs_before_merge(100_000);
|
||||
log_merge_policy.set_min_layer_size(2);
|
||||
log_merge_policy
|
||||
}
|
||||
|
||||
@@ -1,7 +1,11 @@
|
||||
use super::doc_id_mapping::DocIdMapping;
|
||||
use crate::error::DataCorruption;
|
||||
use crate::fastfield::CompositeFastFieldSerializer;
|
||||
use crate::fastfield::DeleteBitSet;
|
||||
use crate::fastfield::DynamicFastFieldReader;
|
||||
use crate::fastfield::FastFieldDataAccess;
|
||||
use crate::fastfield::FastFieldReader;
|
||||
use crate::fastfield::FastFieldSerializer;
|
||||
use crate::fastfield::FastFieldStats;
|
||||
use crate::fastfield::MultiValuedFastFieldReader;
|
||||
use crate::fieldnorm::FieldNormsSerializer;
|
||||
use crate::fieldnorm::FieldNormsWriter;
|
||||
@@ -86,7 +90,7 @@ pub struct IndexMerger {
|
||||
}
|
||||
|
||||
fn compute_min_max_val(
|
||||
u64_reader: &FastFieldReader<u64>,
|
||||
u64_reader: &impl FastFieldReader<u64>,
|
||||
max_doc: DocId,
|
||||
delete_bitset_opt: Option<&DeleteBitSet>,
|
||||
) -> Option<(u64, u64)> {
|
||||
@@ -182,6 +186,10 @@ impl IndexMerger {
|
||||
readers.push(reader);
|
||||
}
|
||||
}
|
||||
if let Some(sort_by_field) = index_settings.sort_by_field.as_ref() {
|
||||
readers = Self::sort_readers_by_min_sort_field(readers, sort_by_field)?;
|
||||
}
|
||||
// sort segments by their natural sort setting
|
||||
if max_doc >= MAX_DOC_LIMIT {
|
||||
let err_msg = format!(
|
||||
"The segment resulting from this merge would have {} docs,\
|
||||
@@ -191,13 +199,37 @@ impl IndexMerger {
|
||||
return Err(crate::TantivyError::InvalidArgument(err_msg));
|
||||
}
|
||||
Ok(IndexMerger {
|
||||
schema,
|
||||
index_settings,
|
||||
schema,
|
||||
readers,
|
||||
max_doc,
|
||||
})
|
||||
}
|
||||
|
||||
fn sort_readers_by_min_sort_field(
|
||||
readers: Vec<SegmentReader>,
|
||||
sort_by_field: &IndexSortByField,
|
||||
) -> crate::Result<Vec<SegmentReader>> {
|
||||
// presort the readers by their min_values, so that when they are disjunct, we can use
|
||||
// the regular merge logic (implicitly sorted)
|
||||
let mut readers_with_min_sort_values = readers
|
||||
.into_iter()
|
||||
.map(|reader| {
|
||||
let accessor = Self::get_sort_field_accessor(&reader, &sort_by_field)?;
|
||||
Ok((reader, accessor.min_value()))
|
||||
})
|
||||
.collect::<crate::Result<Vec<_>>>()?;
|
||||
if sort_by_field.order.is_asc() {
|
||||
readers_with_min_sort_values.sort_by_key(|(_, min_val)| *min_val);
|
||||
} else {
|
||||
readers_with_min_sort_values.sort_by_key(|(_, min_val)| std::cmp::Reverse(*min_val));
|
||||
}
|
||||
Ok(readers_with_min_sort_values
|
||||
.into_iter()
|
||||
.map(|(reader, _)| reader)
|
||||
.collect())
|
||||
}
|
||||
|
||||
fn write_fieldnorms(
|
||||
&self,
|
||||
mut fieldnorms_serializer: FieldNormsSerializer,
|
||||
@@ -208,9 +240,14 @@ impl IndexMerger {
|
||||
for field in fields {
|
||||
fieldnorms_data.clear();
|
||||
if let Some(doc_id_mapping) = doc_id_mapping {
|
||||
let fieldnorms_readers: Vec<FieldNormReader> = self
|
||||
.readers
|
||||
.iter()
|
||||
.map(|reader| reader.get_fieldnorms_reader(field))
|
||||
.collect::<Result<_, _>>()?;
|
||||
for (doc_id, reader_with_ordinal) in doc_id_mapping {
|
||||
let fieldnorms_reader =
|
||||
reader_with_ordinal.reader.get_fieldnorms_reader(field)?;
|
||||
&fieldnorms_readers[reader_with_ordinal.ordinal as usize];
|
||||
let fieldnorm_id = fieldnorms_reader.fieldnorm_id(*doc_id);
|
||||
fieldnorms_data.push(fieldnorm_id);
|
||||
}
|
||||
@@ -231,7 +268,7 @@ impl IndexMerger {
|
||||
|
||||
fn write_fast_fields(
|
||||
&self,
|
||||
fast_field_serializer: &mut FastFieldSerializer,
|
||||
fast_field_serializer: &mut CompositeFastFieldSerializer,
|
||||
mut term_ord_mappings: HashMap<Field, TermOrdinalMapping>,
|
||||
doc_id_mapping: &Option<Vec<(DocId, SegmentReaderWithOrdinal)>>,
|
||||
) -> crate::Result<()> {
|
||||
@@ -281,11 +318,11 @@ impl IndexMerger {
|
||||
fn write_single_fast_field(
|
||||
&self,
|
||||
field: Field,
|
||||
fast_field_serializer: &mut FastFieldSerializer,
|
||||
fast_field_serializer: &mut CompositeFastFieldSerializer,
|
||||
doc_id_mapping: &Option<Vec<(DocId, SegmentReaderWithOrdinal)>>,
|
||||
) -> crate::Result<()> {
|
||||
let (min_value, max_value) = self.readers.iter().map(|reader|{
|
||||
let u64_reader: FastFieldReader<u64> = reader
|
||||
let u64_reader: DynamicFastFieldReader<u64> = reader
|
||||
.fast_fields()
|
||||
.typed_fast_field_reader(field)
|
||||
.expect("Failed to find a reader for single fast field. This is a tantivy bug and it should never happen.");
|
||||
@@ -300,7 +337,7 @@ impl IndexMerger {
|
||||
.readers
|
||||
.iter()
|
||||
.map(|reader| {
|
||||
let u64_reader: FastFieldReader<u64> = reader
|
||||
let u64_reader: DynamicFastFieldReader<u64> = reader
|
||||
.fast_fields()
|
||||
.typed_fast_field_reader(field)
|
||||
.expect("Failed to find a reader for single fast field. This is a tantivy bug and it should never happen.");
|
||||
@@ -308,27 +345,44 @@ impl IndexMerger {
|
||||
})
|
||||
.collect::<Vec<_>>();
|
||||
if let Some(doc_id_mapping) = doc_id_mapping {
|
||||
let sorted_doc_ids = doc_id_mapping.iter().map(|(doc_id, reader_with_ordinal)| {
|
||||
(
|
||||
doc_id,
|
||||
&fast_field_readers[reader_with_ordinal.ordinal as usize],
|
||||
)
|
||||
});
|
||||
// add values in order of the new doc_ids
|
||||
let mut fast_single_field_serializer =
|
||||
fast_field_serializer.new_u64_fast_field(field, min_value, max_value)?;
|
||||
for (doc_id, field_reader) in sorted_doc_ids {
|
||||
let val = field_reader.get(*doc_id);
|
||||
fast_single_field_serializer.add_val(val)?;
|
||||
#[derive(Clone)]
|
||||
struct SortedDocidFieldAccessProvider<'a> {
|
||||
doc_id_mapping: &'a Vec<(DocId, SegmentReaderWithOrdinal<'a>)>,
|
||||
fast_field_readers: &'a Vec<DynamicFastFieldReader<u64>>,
|
||||
}
|
||||
impl<'a> FastFieldDataAccess for SortedDocidFieldAccessProvider<'a> {
|
||||
fn get(&self, doc: DocId) -> u64 {
|
||||
let (doc_id, reader_with_ordinal) = self.doc_id_mapping[doc as usize];
|
||||
self.fast_field_readers[reader_with_ordinal.ordinal as usize].get(doc_id)
|
||||
}
|
||||
}
|
||||
let stats = FastFieldStats {
|
||||
min_value,
|
||||
max_value,
|
||||
num_vals: doc_id_mapping.len() as u64,
|
||||
};
|
||||
let fastfield_accessor = SortedDocidFieldAccessProvider {
|
||||
doc_id_mapping,
|
||||
fast_field_readers: &fast_field_readers,
|
||||
};
|
||||
let iter = doc_id_mapping.iter().map(|(doc_id, reader_with_ordinal)| {
|
||||
let fast_field_reader = &fast_field_readers[reader_with_ordinal.ordinal as usize];
|
||||
fast_field_reader.get(*doc_id)
|
||||
});
|
||||
fast_field_serializer.create_auto_detect_u64_fast_field(
|
||||
field,
|
||||
stats,
|
||||
fastfield_accessor,
|
||||
iter.clone(),
|
||||
iter,
|
||||
)?;
|
||||
|
||||
fast_single_field_serializer.close_field()?;
|
||||
Ok(())
|
||||
} else {
|
||||
let u64_readers = self.readers.iter()
|
||||
.filter(|reader|reader.max_doc() != reader.delete_bitset().map(|bit_set|bit_set.len() as u32).unwrap_or(0))
|
||||
.map(|reader|{
|
||||
let u64_reader: FastFieldReader<u64> = reader
|
||||
let u64_reader: DynamicFastFieldReader<u64> = reader
|
||||
.fast_fields()
|
||||
.typed_fast_field_reader(field)
|
||||
.expect("Failed to find a reader for single fast field. This is a tantivy bug and it should never happen.");
|
||||
@@ -354,6 +408,60 @@ impl IndexMerger {
|
||||
}
|
||||
}
|
||||
|
||||
/// Checks if the readers are disjunct for their sort property and in the correct order to be
|
||||
/// able to just stack them.
|
||||
pub(crate) fn is_disjunct_and_sorted_on_sort_property(
|
||||
&self,
|
||||
sort_by_field: &IndexSortByField,
|
||||
) -> crate::Result<bool> {
|
||||
let reader_and_field_accessors = self.get_reader_with_sort_field_accessor(sort_by_field)?;
|
||||
|
||||
let everything_is_in_order = reader_and_field_accessors
|
||||
.into_iter()
|
||||
.map(|reader| reader.1)
|
||||
.tuple_windows()
|
||||
.all(|(field_accessor1, field_accessor2)| {
|
||||
if sort_by_field.order.is_asc() {
|
||||
field_accessor1.max_value() <= field_accessor2.min_value()
|
||||
} else {
|
||||
field_accessor1.min_value() >= field_accessor2.max_value()
|
||||
}
|
||||
});
|
||||
Ok(everything_is_in_order)
|
||||
}
|
||||
|
||||
pub(crate) fn get_sort_field_accessor(
|
||||
reader: &SegmentReader,
|
||||
sort_by_field: &IndexSortByField,
|
||||
) -> crate::Result<impl FastFieldReader<u64>> {
|
||||
let field_id = expect_field_id_for_sort_field(&reader.schema(), &sort_by_field)?; // for now expect fastfield, but not strictly required
|
||||
let value_accessor = reader.fast_fields().u64_lenient(field_id)?;
|
||||
Ok(value_accessor)
|
||||
}
|
||||
/// Collecting value_accessors into a vec to bind the lifetime.
|
||||
pub(crate) fn get_reader_with_sort_field_accessor<'a, 'b>(
|
||||
&'a self,
|
||||
sort_by_field: &'b IndexSortByField,
|
||||
) -> crate::Result<
|
||||
Vec<(
|
||||
SegmentReaderWithOrdinal<'a>,
|
||||
impl FastFieldReader<u64> + Clone,
|
||||
)>,
|
||||
> {
|
||||
let reader_and_field_accessors = self
|
||||
.readers
|
||||
.iter()
|
||||
.enumerate()
|
||||
.map(Into::into)
|
||||
.map(|reader_with_ordinal: SegmentReaderWithOrdinal| {
|
||||
let value_accessor =
|
||||
Self::get_sort_field_accessor(reader_with_ordinal.reader, sort_by_field)?;
|
||||
Ok((reader_with_ordinal, value_accessor))
|
||||
})
|
||||
.collect::<crate::Result<Vec<_>>>()?;
|
||||
Ok(reader_and_field_accessors)
|
||||
}
|
||||
|
||||
/// Generates the doc_id mapping where position in the vec=new
|
||||
/// doc_id.
|
||||
/// ReaderWithOrdinal will include the ordinal position of the
|
||||
@@ -362,42 +470,26 @@ impl IndexMerger {
|
||||
&self,
|
||||
sort_by_field: &IndexSortByField,
|
||||
) -> crate::Result<Vec<(DocId, SegmentReaderWithOrdinal)>> {
|
||||
let reader_and_field_accessors = self
|
||||
.readers
|
||||
.iter()
|
||||
.enumerate()
|
||||
.map(|reader| {
|
||||
let reader_with_ordinal: SegmentReaderWithOrdinal = reader.into();
|
||||
let field_id = expect_field_id_for_sort_field(
|
||||
&reader_with_ordinal.reader.schema(),
|
||||
&sort_by_field,
|
||||
)?; // for now expect fastfield, but not strictly required
|
||||
let value_accessor = reader_with_ordinal
|
||||
.reader
|
||||
.fast_fields()
|
||||
.u64_lenient(field_id)?;
|
||||
Ok((reader_with_ordinal, value_accessor))
|
||||
})
|
||||
.collect::<crate::Result<Vec<_>>>()?; // Collecting to bind the lifetime of value_accessor into the vec, or can't be used as a reference.
|
||||
// Loading the field accessor on demand causes a 15x regression
|
||||
let reader_and_field_accessors = self.get_reader_with_sort_field_accessor(sort_by_field)?;
|
||||
// Loading the field accessor on demand causes a 15x regression
|
||||
|
||||
// create iterators over segment/sort_accessor/doc_id tuple
|
||||
let doc_id_reader_pair = reader_and_field_accessors
|
||||
.iter()
|
||||
.map(|reader_and_field_accessor| {
|
||||
reader_and_field_accessor
|
||||
.0
|
||||
.reader
|
||||
.doc_ids_alive()
|
||||
.map(move |doc_id| {
|
||||
(
|
||||
doc_id,
|
||||
reader_and_field_accessor.0,
|
||||
&reader_and_field_accessor.1,
|
||||
)
|
||||
})
|
||||
})
|
||||
.collect::<Vec<_>>();
|
||||
let doc_id_reader_pair =
|
||||
reader_and_field_accessors
|
||||
.iter()
|
||||
.map(|reader_and_field_accessor| {
|
||||
reader_and_field_accessor
|
||||
.0
|
||||
.reader
|
||||
.doc_ids_alive()
|
||||
.map(move |doc_id| {
|
||||
(
|
||||
doc_id,
|
||||
reader_and_field_accessor.0,
|
||||
&reader_and_field_accessor.1,
|
||||
)
|
||||
})
|
||||
});
|
||||
|
||||
// create iterator tuple of (old doc_id, reader) in order of the new doc_ids
|
||||
let sorted_doc_ids: Vec<(DocId, SegmentReaderWithOrdinal)> = doc_id_reader_pair
|
||||
@@ -422,19 +514,21 @@ impl IndexMerger {
|
||||
// Important: reader_and_field_accessor needs
|
||||
// to have the same order as self.readers since ReaderWithOrdinal
|
||||
// is used to index the reader_and_field_accessors vec.
|
||||
fn write_1_n_fast_field_idx_generic(
|
||||
fn write_1_n_fast_field_idx_generic<T: MultiValueLength>(
|
||||
field: Field,
|
||||
fast_field_serializer: &mut FastFieldSerializer,
|
||||
fast_field_serializer: &mut CompositeFastFieldSerializer,
|
||||
doc_id_mapping: &Option<Vec<(DocId, SegmentReaderWithOrdinal)>>,
|
||||
reader_and_field_accessors: &[(&SegmentReader, impl MultiValueLength)],
|
||||
reader_and_field_accessors: &[(&SegmentReader, T)],
|
||||
) -> crate::Result<()> {
|
||||
let mut total_num_vals = 0u64;
|
||||
// In the first pass, we compute the total number of vals.
|
||||
//
|
||||
// This is required by the bitpacker, as it needs to know
|
||||
// what bit length to use for bitpacking.
|
||||
let mut idx_num_vals = 0;
|
||||
for (reader, u64s_reader) in reader_and_field_accessors.iter() {
|
||||
if let Some(delete_bitset) = reader.delete_bitset() {
|
||||
idx_num_vals += reader.max_doc() as u64 - delete_bitset.len() as u64;
|
||||
for doc in 0u32..reader.max_doc() {
|
||||
if delete_bitset.is_alive(doc) {
|
||||
let num_vals = u64s_reader.get_len(doc) as u64;
|
||||
@@ -442,44 +536,66 @@ impl IndexMerger {
|
||||
}
|
||||
}
|
||||
} else {
|
||||
idx_num_vals += reader.max_doc() as u64;
|
||||
total_num_vals += u64s_reader.get_total_len();
|
||||
}
|
||||
}
|
||||
|
||||
let stats = FastFieldStats {
|
||||
max_value: total_num_vals,
|
||||
num_vals: idx_num_vals,
|
||||
min_value: 0,
|
||||
};
|
||||
// We can now create our `idx` serializer, and in a second pass,
|
||||
// can effectively push the different indexes.
|
||||
if let Some(doc_id_mapping) = doc_id_mapping {
|
||||
let mut serialize_idx =
|
||||
fast_field_serializer.new_u64_fast_field_with_idx(field, 0, total_num_vals, 0)?;
|
||||
// copying into a temp vec is not ideal, but the fast field codec api requires random
|
||||
// access, which is used in the estimation. It's possible to 1. calculate random
|
||||
// access on the fly or 2. change the codec api to make random access optional, but
|
||||
// they both have also major drawbacks.
|
||||
|
||||
let mut offsets = vec![];
|
||||
let mut offset = 0;
|
||||
for (doc_id, reader) in doc_id_mapping {
|
||||
let reader = &reader_and_field_accessors[reader.ordinal as usize].1;
|
||||
serialize_idx.add_val(offset)?;
|
||||
offsets.push(offset);
|
||||
offset += reader.get_len(*doc_id) as u64;
|
||||
}
|
||||
serialize_idx.add_val(offset as u64)?;
|
||||
offsets.push(offset);
|
||||
|
||||
serialize_idx.close_field()?;
|
||||
fast_field_serializer.create_auto_detect_u64_fast_field(
|
||||
field,
|
||||
stats,
|
||||
&offsets,
|
||||
offsets.iter().cloned(),
|
||||
offsets.iter().cloned(),
|
||||
)?;
|
||||
} else {
|
||||
let mut serialize_idx =
|
||||
fast_field_serializer.new_u64_fast_field_with_idx(field, 0, total_num_vals, 0)?;
|
||||
let mut idx = 0;
|
||||
let mut offsets = vec![];
|
||||
let mut offset = 0;
|
||||
for (segment_reader, u64s_reader) in reader_and_field_accessors.iter() {
|
||||
for doc in segment_reader.doc_ids_alive() {
|
||||
serialize_idx.add_val(idx)?;
|
||||
idx += u64s_reader.get_len(doc) as u64;
|
||||
offsets.push(offset);
|
||||
offset += u64s_reader.get_len(doc) as u64;
|
||||
}
|
||||
}
|
||||
serialize_idx.add_val(idx)?;
|
||||
serialize_idx.close_field()?;
|
||||
offsets.push(offset);
|
||||
|
||||
fast_field_serializer.create_auto_detect_u64_fast_field(
|
||||
field,
|
||||
stats,
|
||||
&offsets,
|
||||
offsets.iter().cloned(),
|
||||
offsets.iter().cloned(),
|
||||
)?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
fn write_multi_value_fast_field_idx(
|
||||
&self,
|
||||
field: Field,
|
||||
fast_field_serializer: &mut FastFieldSerializer,
|
||||
fast_field_serializer: &mut CompositeFastFieldSerializer,
|
||||
doc_id_mapping: &Option<Vec<(DocId, SegmentReaderWithOrdinal)>>,
|
||||
) -> crate::Result<()> {
|
||||
let reader_and_field_accessors = self.readers.iter().map(|reader|{
|
||||
@@ -501,7 +617,7 @@ impl IndexMerger {
|
||||
&self,
|
||||
field: Field,
|
||||
term_ordinal_mappings: &TermOrdinalMapping,
|
||||
fast_field_serializer: &mut FastFieldSerializer,
|
||||
fast_field_serializer: &mut CompositeFastFieldSerializer,
|
||||
doc_id_mapping: &Option<Vec<(DocId, SegmentReaderWithOrdinal)>>,
|
||||
) -> crate::Result<()> {
|
||||
// Multifastfield consists of 2 fastfields.
|
||||
@@ -564,7 +680,7 @@ impl IndexMerger {
|
||||
fn write_multi_fast_field(
|
||||
&self,
|
||||
field: Field,
|
||||
fast_field_serializer: &mut FastFieldSerializer,
|
||||
fast_field_serializer: &mut CompositeFastFieldSerializer,
|
||||
doc_id_mapping: &Option<Vec<(DocId, SegmentReaderWithOrdinal)>>,
|
||||
) -> crate::Result<()> {
|
||||
// Multifastfield consists of 2 fastfields.
|
||||
@@ -651,7 +767,7 @@ impl IndexMerger {
|
||||
fn write_bytes_fast_field(
|
||||
&self,
|
||||
field: Field,
|
||||
fast_field_serializer: &mut FastFieldSerializer,
|
||||
fast_field_serializer: &mut CompositeFastFieldSerializer,
|
||||
doc_id_mapping: &Option<Vec<(DocId, SegmentReaderWithOrdinal)>>,
|
||||
) -> crate::Result<()> {
|
||||
let reader_and_field_accessors = self
|
||||
@@ -797,13 +913,11 @@ impl IndexMerger {
|
||||
let mut total_doc_freq = 0;
|
||||
|
||||
// Let's compute the list of non-empty posting lists
|
||||
for heap_item in merged_terms.current_kvs() {
|
||||
let segment_ord = heap_item.segment_ord;
|
||||
let term_info = heap_item.streamer.value();
|
||||
let segment_reader = &self.readers[heap_item.segment_ord];
|
||||
for (segment_ord, term_info) in merged_terms.current_segment_ordinals_and_term_infos() {
|
||||
let segment_reader = &self.readers[segment_ord];
|
||||
let inverted_index: &InvertedIndexReader = &*field_readers[segment_ord];
|
||||
let segment_postings = inverted_index
|
||||
.read_postings_from_terminfo(term_info, segment_postings_option)?;
|
||||
.read_postings_from_terminfo(&term_info, segment_postings_option)?;
|
||||
let delete_bitset_opt = segment_reader.delete_bitset();
|
||||
let doc_freq = if let Some(delete_bitset) = delete_bitset_opt {
|
||||
segment_postings.doc_freq_given_deletes(delete_bitset)
|
||||
@@ -927,19 +1041,41 @@ impl IndexMerger {
|
||||
.collect();
|
||||
if let Some(doc_id_mapping) = doc_id_mapping {
|
||||
for (old_doc_id, reader_with_ordinal) in doc_id_mapping {
|
||||
let store_reader = &mut document_iterators[reader_with_ordinal.ordinal as usize];
|
||||
let raw_doc = store_reader.next().expect(&format!(
|
||||
"unexpected missing document in docstore on merge, doc id {:?}",
|
||||
old_doc_id
|
||||
))?;
|
||||
store_writer.store_bytes(raw_doc.get_bytes())?;
|
||||
let doc_bytes_it = &mut document_iterators[reader_with_ordinal.ordinal as usize];
|
||||
if let Some(doc_bytes_res) = doc_bytes_it.next() {
|
||||
let doc_bytes = doc_bytes_res?;
|
||||
store_writer.store_bytes(&doc_bytes)?;
|
||||
} else {
|
||||
return Err(DataCorruption::comment_only(&format!(
|
||||
"unexpected missing document in docstore on merge, doc id {:?}",
|
||||
old_doc_id
|
||||
))
|
||||
.into());
|
||||
}
|
||||
}
|
||||
} else {
|
||||
for reader in &self.readers {
|
||||
let store_reader = reader.get_store_reader()?;
|
||||
if reader.num_deleted_docs() > 0 {
|
||||
for raw_doc in store_reader.iter_raw(reader.delete_bitset()) {
|
||||
store_writer.store_bytes(raw_doc?.get_bytes())?;
|
||||
if reader.num_deleted_docs() > 0
|
||||
// If there is not enough data in the store, we avoid stacking in order to
|
||||
// avoid creating many small blocks in the doc store. Once we have 5 full blocks,
|
||||
// we start stacking. In the worst case 2/7 of the blocks would be very small.
|
||||
// [segment 1 - {1 doc}][segment 2 - {fullblock * 5}{1doc}]
|
||||
// => 5 * full blocks, 2 * 1 document blocks
|
||||
//
|
||||
// In a more realistic scenario the segments are of the same size, so 1/6 of
|
||||
// the doc stores would be on average half full, given total randomness (which
|
||||
// is not the case here, but not sure how it behaves exactly).
|
||||
//
|
||||
// https://github.com/tantivy-search/tantivy/issues/1053
|
||||
//
|
||||
// take 7 in order to not walk over all checkpoints.
|
||||
|| store_reader.block_checkpoints().take(7).count() < 6
|
||||
|| store_reader.compressor() != store_writer.compressor()
|
||||
{
|
||||
for doc_bytes_res in store_reader.iter_raw(reader.delete_bitset()) {
|
||||
let doc_bytes = doc_bytes_res?;
|
||||
store_writer.store_bytes(&doc_bytes)?;
|
||||
}
|
||||
} else {
|
||||
store_writer.stack(&store_reader)?;
|
||||
@@ -958,7 +1094,13 @@ impl SerializableSegment for IndexMerger {
|
||||
) -> crate::Result<u32> {
|
||||
let doc_id_mapping = if let Some(sort_by_field) = self.index_settings.sort_by_field.as_ref()
|
||||
{
|
||||
Some(self.generate_doc_id_mapping(sort_by_field)?)
|
||||
// If the documents are already sorted and stackable, we ignore the mapping and execute
|
||||
// it as if there was no sorting
|
||||
if self.is_disjunct_and_sorted_on_sort_property(sort_by_field)? {
|
||||
None
|
||||
} else {
|
||||
Some(self.generate_doc_id_mapping(sort_by_field)?)
|
||||
}
|
||||
} else {
|
||||
None
|
||||
};
|
||||
@@ -993,6 +1135,7 @@ mod tests {
|
||||
use crate::collector::tests::{BytesFastFieldTestCollector, FastFieldTestCollector};
|
||||
use crate::collector::{Count, FacetCollector};
|
||||
use crate::core::Index;
|
||||
use crate::fastfield::FastFieldReader;
|
||||
use crate::query::AllQuery;
|
||||
use crate::query::BooleanQuery;
|
||||
use crate::query::Scorer;
|
||||
@@ -1470,31 +1613,65 @@ mod tests {
|
||||
}
|
||||
#[test]
|
||||
fn test_merge_facets_sort_none() {
|
||||
test_merge_facets(None)
|
||||
test_merge_facets(None, true)
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_merge_facets_sort_asc() {
|
||||
// the data is already sorted asc, so this should have no effect, but go through the docid
|
||||
// mapping code
|
||||
test_merge_facets(Some(IndexSettings {
|
||||
sort_by_field: Some(IndexSortByField {
|
||||
field: "intval".to_string(),
|
||||
order: Order::Asc,
|
||||
// In the merge case this will go through the docid mapping code
|
||||
test_merge_facets(
|
||||
Some(IndexSettings {
|
||||
sort_by_field: Some(IndexSortByField {
|
||||
field: "intval".to_string(),
|
||||
order: Order::Desc,
|
||||
}),
|
||||
..Default::default()
|
||||
}),
|
||||
}));
|
||||
true,
|
||||
);
|
||||
// In the merge case this will not go through the docid mapping code, because the data is
|
||||
// sorted and disjunct
|
||||
test_merge_facets(
|
||||
Some(IndexSettings {
|
||||
sort_by_field: Some(IndexSortByField {
|
||||
field: "intval".to_string(),
|
||||
order: Order::Desc,
|
||||
}),
|
||||
..Default::default()
|
||||
}),
|
||||
false,
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_merge_facets_sort_desc() {
|
||||
test_merge_facets(Some(IndexSettings {
|
||||
sort_by_field: Some(IndexSortByField {
|
||||
field: "intval".to_string(),
|
||||
order: Order::Desc,
|
||||
// In the merge case this will go through the docid mapping code
|
||||
test_merge_facets(
|
||||
Some(IndexSettings {
|
||||
sort_by_field: Some(IndexSortByField {
|
||||
field: "intval".to_string(),
|
||||
order: Order::Desc,
|
||||
}),
|
||||
..Default::default()
|
||||
}),
|
||||
}));
|
||||
true,
|
||||
);
|
||||
// In the merge case this will not go through the docid mapping code, because the data is
|
||||
// sorted and disjunct
|
||||
test_merge_facets(
|
||||
Some(IndexSettings {
|
||||
sort_by_field: Some(IndexSortByField {
|
||||
field: "intval".to_string(),
|
||||
order: Order::Desc,
|
||||
}),
|
||||
..Default::default()
|
||||
}),
|
||||
false,
|
||||
);
|
||||
}
|
||||
fn test_merge_facets(index_settings: Option<IndexSettings>) {
|
||||
// force_segment_value_overlap forces the int value for sorting to have overlapping min and max
|
||||
// ranges between segments so that merge algorithm can't apply certain optimizations
|
||||
fn test_merge_facets(index_settings: Option<IndexSettings>, force_segment_value_overlap: bool) {
|
||||
let mut schema_builder = schema::Schema::builder();
|
||||
let facet_field = schema_builder.add_facet_field("facet", INDEXED);
|
||||
let int_options = IntOptions::default()
|
||||
@@ -1511,32 +1688,47 @@ mod tests {
|
||||
let mut int_val = 0;
|
||||
{
|
||||
let mut index_writer = index.writer_for_tests().unwrap();
|
||||
let mut index_doc = |index_writer: &mut IndexWriter, doc_facets: &[&str]| {
|
||||
let mut doc = Document::default();
|
||||
for facet in doc_facets {
|
||||
doc.add_facet(facet_field, Facet::from(facet));
|
||||
}
|
||||
doc.add_u64(int_field, int_val);
|
||||
int_val += 1;
|
||||
index_writer.add_document(doc);
|
||||
};
|
||||
let index_doc =
|
||||
|index_writer: &mut IndexWriter, doc_facets: &[&str], int_val: &mut u64| {
|
||||
let mut doc = Document::default();
|
||||
for facet in doc_facets {
|
||||
doc.add_facet(facet_field, Facet::from(facet));
|
||||
}
|
||||
doc.add_u64(int_field, *int_val);
|
||||
*int_val += 1;
|
||||
index_writer.add_document(doc);
|
||||
};
|
||||
|
||||
index_doc(&mut index_writer, &["/top/a/firstdoc", "/top/b"]);
|
||||
index_doc(&mut index_writer, &["/top/a/firstdoc", "/top/b", "/top/c"]);
|
||||
index_doc(&mut index_writer, &["/top/a", "/top/b"]);
|
||||
index_doc(&mut index_writer, &["/top/a"]);
|
||||
index_doc(
|
||||
&mut index_writer,
|
||||
&["/top/a/firstdoc", "/top/b"],
|
||||
&mut int_val,
|
||||
);
|
||||
index_doc(
|
||||
&mut index_writer,
|
||||
&["/top/a/firstdoc", "/top/b", "/top/c"],
|
||||
&mut int_val,
|
||||
);
|
||||
index_doc(&mut index_writer, &["/top/a", "/top/b"], &mut int_val);
|
||||
index_doc(&mut index_writer, &["/top/a"], &mut int_val);
|
||||
|
||||
index_doc(&mut index_writer, &["/top/b", "/top/d"]);
|
||||
index_doc(&mut index_writer, &["/top/d"]);
|
||||
index_doc(&mut index_writer, &["/top/e"]);
|
||||
index_doc(&mut index_writer, &["/top/b", "/top/d"], &mut int_val);
|
||||
if force_segment_value_overlap {
|
||||
index_doc(&mut index_writer, &["/top/d"], &mut 0);
|
||||
index_doc(&mut index_writer, &["/top/e"], &mut 10);
|
||||
index_writer.commit().expect("committed");
|
||||
index_doc(&mut index_writer, &["/top/a"], &mut 5); // 5 is between 0 - 10 so the segments don' have disjunct ranges
|
||||
} else {
|
||||
index_doc(&mut index_writer, &["/top/d"], &mut int_val);
|
||||
index_doc(&mut index_writer, &["/top/e"], &mut int_val);
|
||||
index_writer.commit().expect("committed");
|
||||
index_doc(&mut index_writer, &["/top/a"], &mut int_val);
|
||||
}
|
||||
index_doc(&mut index_writer, &["/top/b"], &mut int_val);
|
||||
index_doc(&mut index_writer, &["/top/c"], &mut int_val);
|
||||
index_writer.commit().expect("committed");
|
||||
|
||||
index_doc(&mut index_writer, &["/top/a"]);
|
||||
index_doc(&mut index_writer, &["/top/b"]);
|
||||
index_doc(&mut index_writer, &["/top/c"]);
|
||||
index_writer.commit().expect("committed");
|
||||
|
||||
index_doc(&mut index_writer, &["/top/e", "/top/f"]);
|
||||
index_doc(&mut index_writer, &["/top/e", "/top/f"], &mut int_val);
|
||||
index_writer.commit().expect("committed");
|
||||
}
|
||||
|
||||
@@ -1821,7 +2013,7 @@ mod tests {
|
||||
|
||||
// Make sure we'll attempt to merge every created segment
|
||||
let mut policy = crate::indexer::LogMergePolicy::default();
|
||||
policy.set_min_merge_size(2);
|
||||
policy.set_min_num_segments(2);
|
||||
writer.set_merge_policy(Box::new(policy));
|
||||
|
||||
for i in 0..100 {
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use crate::fastfield::FastFieldReader;
|
||||
use crate::{
|
||||
collector::TopDocs,
|
||||
schema::{Cardinality, TextFieldIndexing},
|
||||
@@ -39,6 +40,7 @@ mod tests {
|
||||
let mut index_writer = index.writer_for_tests().unwrap();
|
||||
|
||||
index_writer.add_document(doc!(int_field=>3_u64, facet_field=> Facet::from("/crime")));
|
||||
index_writer.add_document(doc!(int_field=>6_u64, facet_field=> Facet::from("/crime")));
|
||||
|
||||
assert!(index_writer.commit().is_ok());
|
||||
index_writer.add_document(doc!(int_field=>5_u64, facet_field=> Facet::from("/fanta")));
|
||||
@@ -58,7 +60,12 @@ mod tests {
|
||||
index
|
||||
}
|
||||
|
||||
fn create_test_index(index_settings: Option<IndexSettings>) -> Index {
|
||||
// force_disjunct_segment_sort_values forces the field by which the index is sorted to have disjunct
|
||||
// ranges between segments, e.g. values in segment [1-3] [10 - 20] [50 - 500]
|
||||
fn create_test_index(
|
||||
index_settings: Option<IndexSettings>,
|
||||
force_disjunct_segment_sort_values: bool,
|
||||
) -> Index {
|
||||
let mut schema_builder = schema::Schema::builder();
|
||||
let int_options = IntOptions::default()
|
||||
.set_fast(Cardinality::SingleValue)
|
||||
@@ -92,6 +99,7 @@ mod tests {
|
||||
{
|
||||
let mut index_writer = index.writer_for_tests().unwrap();
|
||||
|
||||
// segment 1 - range 1-3
|
||||
index_writer.add_document(doc!(int_field=>1_u64));
|
||||
index_writer.add_document(
|
||||
doc!(int_field=>3_u64, multi_numbers => 3_u64, multi_numbers => 4_u64, bytes_field => vec![1, 2, 3], text_field => "some text", facet_field=> Facet::from("/book/crime")),
|
||||
@@ -102,13 +110,26 @@ mod tests {
|
||||
);
|
||||
|
||||
assert!(index_writer.commit().is_ok());
|
||||
// segment 2 - range 1-20 , with force_disjunct_segment_sort_values 10-20
|
||||
index_writer.add_document(doc!(int_field=>20_u64, multi_numbers => 20_u64));
|
||||
index_writer.add_document(doc!(int_field=>1_u64, text_field=> "deleteme", facet_field=> Facet::from("/book/crime")));
|
||||
|
||||
let in_val = if force_disjunct_segment_sort_values {
|
||||
10_u64
|
||||
} else {
|
||||
1
|
||||
};
|
||||
index_writer.add_document(doc!(int_field=>in_val, text_field=> "deleteme", facet_field=> Facet::from("/book/crime")));
|
||||
assert!(index_writer.commit().is_ok());
|
||||
index_writer.add_document(
|
||||
doc!(int_field=>10_u64, multi_numbers => 10_u64, multi_numbers => 11_u64, text_field=> "blubber", facet_field=> Facet::from("/book/fantasy")),
|
||||
// segment 3 - range 5-1000, with force_disjunct_segment_sort_values 50-1000
|
||||
let int_vals = if force_disjunct_segment_sort_values {
|
||||
[100_u64, 50]
|
||||
} else {
|
||||
[10, 5]
|
||||
};
|
||||
index_writer.add_document( // position of this doc after delete in desc sorting = [2], in disjunct case [1]
|
||||
doc!(int_field=>int_vals[0], multi_numbers => 10_u64, multi_numbers => 11_u64, text_field=> "blubber", facet_field=> Facet::from("/book/fantasy")),
|
||||
);
|
||||
index_writer.add_document(doc!(int_field=>5_u64, text_field=> "deleteme"));
|
||||
index_writer.add_document(doc!(int_field=>int_vals[1], text_field=> "deleteme"));
|
||||
index_writer.add_document(
|
||||
doc!(int_field=>1_000u64, multi_numbers => 1001_u64, multi_numbers => 1002_u64, bytes_field => vec![5, 5],text_field => "the biggest num")
|
||||
);
|
||||
@@ -136,17 +157,30 @@ mod tests {
|
||||
field: "intval".to_string(),
|
||||
order: Order::Desc,
|
||||
}),
|
||||
..Default::default()
|
||||
}));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_merge_sorted_index_desc() {
|
||||
let index = create_test_index(Some(IndexSettings {
|
||||
sort_by_field: Some(IndexSortByField {
|
||||
field: "intval".to_string(),
|
||||
order: Order::Desc,
|
||||
fn test_merge_sorted_index_desc_not_disjunct() {
|
||||
test_merge_sorted_index_desc_(false);
|
||||
}
|
||||
#[test]
|
||||
fn test_merge_sorted_index_desc_disjunct() {
|
||||
test_merge_sorted_index_desc_(true);
|
||||
}
|
||||
|
||||
fn test_merge_sorted_index_desc_(force_disjunct_segment_sort_values: bool) {
|
||||
let index = create_test_index(
|
||||
Some(IndexSettings {
|
||||
sort_by_field: Some(IndexSortByField {
|
||||
field: "intval".to_string(),
|
||||
order: Order::Desc,
|
||||
}),
|
||||
..Default::default()
|
||||
}),
|
||||
}));
|
||||
force_disjunct_segment_sort_values,
|
||||
);
|
||||
|
||||
let int_field = index.schema().get_field("intval").unwrap();
|
||||
let reader = index.reader().unwrap();
|
||||
@@ -160,8 +194,13 @@ mod tests {
|
||||
assert_eq!(fast_field.get(5u32), 1u64);
|
||||
assert_eq!(fast_field.get(4u32), 2u64);
|
||||
assert_eq!(fast_field.get(3u32), 3u64);
|
||||
assert_eq!(fast_field.get(2u32), 10u64);
|
||||
assert_eq!(fast_field.get(1u32), 20u64);
|
||||
if force_disjunct_segment_sort_values {
|
||||
assert_eq!(fast_field.get(2u32), 20u64);
|
||||
assert_eq!(fast_field.get(1u32), 100u64);
|
||||
} else {
|
||||
assert_eq!(fast_field.get(2u32), 10u64);
|
||||
assert_eq!(fast_field.get(1u32), 20u64);
|
||||
}
|
||||
assert_eq!(fast_field.get(0u32), 1_000u64);
|
||||
|
||||
// test new field norm mapping
|
||||
@@ -169,8 +208,13 @@ mod tests {
|
||||
let my_text_field = index.schema().get_field("text_field").unwrap();
|
||||
let fieldnorm_reader = segment_reader.get_fieldnorms_reader(my_text_field).unwrap();
|
||||
assert_eq!(fieldnorm_reader.fieldnorm(0), 3); // the biggest num
|
||||
assert_eq!(fieldnorm_reader.fieldnorm(1), 0);
|
||||
assert_eq!(fieldnorm_reader.fieldnorm(2), 1); // blubber
|
||||
if force_disjunct_segment_sort_values {
|
||||
assert_eq!(fieldnorm_reader.fieldnorm(1), 1); // blubber
|
||||
assert_eq!(fieldnorm_reader.fieldnorm(2), 0);
|
||||
} else {
|
||||
assert_eq!(fieldnorm_reader.fieldnorm(1), 0);
|
||||
assert_eq!(fieldnorm_reader.fieldnorm(2), 1); // blubber
|
||||
}
|
||||
assert_eq!(fieldnorm_reader.fieldnorm(3), 2); // some text
|
||||
assert_eq!(fieldnorm_reader.fieldnorm(5), 0);
|
||||
}
|
||||
@@ -191,13 +235,22 @@ mod tests {
|
||||
};
|
||||
|
||||
assert_eq!(do_search("some"), vec![3]);
|
||||
assert_eq!(do_search("blubber"), vec![2]);
|
||||
if force_disjunct_segment_sort_values {
|
||||
assert_eq!(do_search("blubber"), vec![1]);
|
||||
} else {
|
||||
assert_eq!(do_search("blubber"), vec![2]);
|
||||
}
|
||||
assert_eq!(do_search("biggest"), vec![0]);
|
||||
}
|
||||
|
||||
// access doc store
|
||||
{
|
||||
let doc = searcher.doc(DocAddress::new(0, 2)).unwrap();
|
||||
let blubber_pos = if force_disjunct_segment_sort_values {
|
||||
1
|
||||
} else {
|
||||
2
|
||||
};
|
||||
let doc = searcher.doc(DocAddress::new(0, blubber_pos)).unwrap();
|
||||
assert_eq!(
|
||||
doc.get_first(my_text_field).unwrap().text(),
|
||||
Some("blubber")
|
||||
@@ -209,12 +262,16 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
fn test_merge_sorted_index_asc() {
|
||||
let index = create_test_index(Some(IndexSettings {
|
||||
sort_by_field: Some(IndexSortByField {
|
||||
field: "intval".to_string(),
|
||||
order: Order::Asc,
|
||||
let index = create_test_index(
|
||||
Some(IndexSettings {
|
||||
sort_by_field: Some(IndexSortByField {
|
||||
field: "intval".to_string(),
|
||||
order: Order::Asc,
|
||||
}),
|
||||
..Default::default()
|
||||
}),
|
||||
}));
|
||||
false,
|
||||
);
|
||||
|
||||
let int_field = index.schema().get_field("intval").unwrap();
|
||||
let multi_numbers = index.schema().get_field("multi_numbers").unwrap();
|
||||
@@ -305,6 +362,7 @@ mod bench_sorted_index_merge {
|
||||
|
||||
use crate::core::Index;
|
||||
//use cratedoc_id, readerdoc_id_mappinglet vals = reader.fate::schema;
|
||||
use crate::fastfield::DynamicFastFieldReader;
|
||||
use crate::fastfield::FastFieldReader;
|
||||
use crate::indexer::merger::IndexMerger;
|
||||
use crate::schema::Cardinality;
|
||||
@@ -315,7 +373,6 @@ mod bench_sorted_index_merge {
|
||||
use crate::IndexSortByField;
|
||||
use crate::IndexWriter;
|
||||
use crate::Order;
|
||||
use futures::executor::block_on;
|
||||
use test::{self, Bencher};
|
||||
fn create_index(sort_by_field: Option<IndexSortByField>) -> Index {
|
||||
let mut schema_builder = Schema::builder();
|
||||
@@ -323,12 +380,12 @@ mod bench_sorted_index_merge {
|
||||
.set_fast(Cardinality::SingleValue)
|
||||
.set_indexed();
|
||||
let int_field = schema_builder.add_u64_field("intval", int_options);
|
||||
let int_field = schema_builder.add_u64_field("intval", int_options);
|
||||
let schema = schema_builder.build();
|
||||
|
||||
let index_builder = Index::builder()
|
||||
.schema(schema)
|
||||
.settings(IndexSettings { sort_by_field });
|
||||
let index_builder = Index::builder().schema(schema).settings(IndexSettings {
|
||||
sort_by_field,
|
||||
..Default::default()
|
||||
});
|
||||
let index = index_builder.create_in_ram().unwrap();
|
||||
|
||||
{
|
||||
@@ -366,7 +423,7 @@ mod bench_sorted_index_merge {
|
||||
b.iter(|| {
|
||||
|
||||
let sorted_doc_ids = doc_id_mapping.iter().map(|(doc_id, reader)|{
|
||||
let u64_reader: FastFieldReader<u64> = reader
|
||||
let u64_reader: DynamicFastFieldReader<u64> = reader.reader
|
||||
.fast_fields()
|
||||
.typed_fast_field_reader(field)
|
||||
.expect("Failed to find a reader for single fast field. This is a tantivy bug and it should never happen.");
|
||||
@@ -391,7 +448,7 @@ mod bench_sorted_index_merge {
|
||||
order: Order::Desc,
|
||||
};
|
||||
let index = create_index(Some(sort_by_field.clone()));
|
||||
let field = index.schema().get_field("intval").unwrap();
|
||||
//let field = index.schema().get_field("intval").unwrap();
|
||||
let segments = index.searchable_segments().unwrap();
|
||||
let merger: IndexMerger =
|
||||
IndexMerger::open(index.schema(), index.settings().clone(), &segments[..])?;
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
use crate::core::Segment;
|
||||
use crate::core::SegmentComponent;
|
||||
use crate::fastfield::FastFieldSerializer;
|
||||
use crate::fastfield::CompositeFastFieldSerializer;
|
||||
use crate::fieldnorm::FieldNormsSerializer;
|
||||
use crate::postings::InvertedIndexSerializer;
|
||||
use crate::store::StoreWriter;
|
||||
@@ -10,7 +10,7 @@ use crate::store::StoreWriter;
|
||||
pub struct SegmentSerializer {
|
||||
segment: Segment,
|
||||
pub(crate) store_writer: StoreWriter,
|
||||
fast_field_serializer: FastFieldSerializer,
|
||||
fast_field_serializer: CompositeFastFieldSerializer,
|
||||
fieldnorms_serializer: Option<FieldNormsSerializer>,
|
||||
postings_serializer: InvertedIndexSerializer,
|
||||
}
|
||||
@@ -33,15 +33,16 @@ impl SegmentSerializer {
|
||||
let store_write = segment.open_write(store_component)?;
|
||||
|
||||
let fast_field_write = segment.open_write(SegmentComponent::FastFields)?;
|
||||
let fast_field_serializer = FastFieldSerializer::from_write(fast_field_write)?;
|
||||
let fast_field_serializer = CompositeFastFieldSerializer::from_write(fast_field_write)?;
|
||||
|
||||
let fieldnorms_write = segment.open_write(SegmentComponent::FieldNorms)?;
|
||||
let fieldnorms_serializer = FieldNormsSerializer::from_write(fieldnorms_write)?;
|
||||
|
||||
let postings_serializer = InvertedIndexSerializer::open(&mut segment)?;
|
||||
let compressor = segment.index().settings().docstore_compression;
|
||||
Ok(SegmentSerializer {
|
||||
segment,
|
||||
store_writer: StoreWriter::new(store_write),
|
||||
store_writer: StoreWriter::new(store_write, compressor),
|
||||
fast_field_serializer,
|
||||
fieldnorms_serializer: Some(fieldnorms_serializer),
|
||||
postings_serializer,
|
||||
@@ -67,7 +68,7 @@ impl SegmentSerializer {
|
||||
}
|
||||
|
||||
/// Accessor to the `FastFieldSerializer`.
|
||||
pub fn get_fast_field_serializer(&mut self) -> &mut FastFieldSerializer {
|
||||
pub fn get_fast_field_serializer(&mut self) -> &mut CompositeFastFieldSerializer {
|
||||
&mut self.fast_field_serializer
|
||||
}
|
||||
|
||||
|
||||
@@ -345,8 +345,11 @@ fn write(
|
||||
let store_write = serializer
|
||||
.segment_mut()
|
||||
.open_write(SegmentComponent::Store)?;
|
||||
let old_store_writer =
|
||||
std::mem::replace(&mut serializer.store_writer, StoreWriter::new(store_write));
|
||||
let compressor = serializer.segment().index().settings().docstore_compression;
|
||||
let old_store_writer = std::mem::replace(
|
||||
&mut serializer.store_writer,
|
||||
StoreWriter::new(store_write, compressor),
|
||||
);
|
||||
old_store_writer.close()?;
|
||||
let store_read = StoreReader::open(
|
||||
serializer
|
||||
@@ -354,12 +357,9 @@ fn write(
|
||||
.open_read(SegmentComponent::TempStore)?,
|
||||
)?;
|
||||
for old_doc_id in doc_id_map.iter_old_doc_ids() {
|
||||
let raw_doc = store_read.get_raw(*old_doc_id)?;
|
||||
serializer
|
||||
.get_store_writer()
|
||||
.store_bytes(raw_doc.get_bytes())?;
|
||||
let doc_bytes = store_read.get_document_bytes(*old_doc_id)?;
|
||||
serializer.get_store_writer().store_bytes(&doc_bytes)?;
|
||||
}
|
||||
// TODO delete temp store
|
||||
}
|
||||
serializer.close()?;
|
||||
Ok(())
|
||||
|
||||
@@ -178,7 +178,7 @@ use once_cell::sync::Lazy;
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
/// Index format version.
|
||||
const INDEX_FORMAT_VERSION: u32 = 3;
|
||||
const INDEX_FORMAT_VERSION: u32 = 4;
|
||||
|
||||
/// Structure version for the index.
|
||||
#[derive(Clone, PartialEq, Eq, Serialize, Deserialize)]
|
||||
@@ -187,7 +187,6 @@ pub struct Version {
|
||||
minor: u32,
|
||||
patch: u32,
|
||||
index_format_version: u32,
|
||||
store_compression: String,
|
||||
}
|
||||
|
||||
impl fmt::Debug for Version {
|
||||
@@ -201,14 +200,13 @@ static VERSION: Lazy<Version> = Lazy::new(|| Version {
|
||||
minor: env!("CARGO_PKG_VERSION_MINOR").parse().unwrap(),
|
||||
patch: env!("CARGO_PKG_VERSION_PATCH").parse().unwrap(),
|
||||
index_format_version: INDEX_FORMAT_VERSION,
|
||||
store_compression: crate::store::COMPRESSION.to_string(),
|
||||
});
|
||||
|
||||
impl ToString for Version {
|
||||
fn to_string(&self) -> String {
|
||||
format!(
|
||||
"tantivy v{}.{}.{}, index_format v{}, store_compression: {}",
|
||||
self.major, self.minor, self.patch, self.index_format_version, self.store_compression
|
||||
"tantivy v{}.{}.{}, index_format v{}",
|
||||
self.major, self.minor, self.patch, self.index_format_version
|
||||
)
|
||||
}
|
||||
}
|
||||
@@ -293,6 +291,7 @@ mod tests {
|
||||
use crate::collector::tests::TEST_COLLECTOR_WITH_SCORE;
|
||||
use crate::core::SegmentReader;
|
||||
use crate::docset::{DocSet, TERMINATED};
|
||||
use crate::fastfield::FastFieldReader;
|
||||
use crate::query::BooleanQuery;
|
||||
use crate::schema::*;
|
||||
use crate::DocAddress;
|
||||
|
||||
@@ -354,18 +354,18 @@ mod bench {
|
||||
});
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_all_docs_compression_numbits() {
|
||||
for expected_num_bits in 0u8.. {
|
||||
let mut data = [0u32; 128];
|
||||
if expected_num_bits > 0 {
|
||||
data[0] = (1u64 << (expected_num_bits as usize) - 1) as u32;
|
||||
}
|
||||
let mut encoder = BlockEncoder::new();
|
||||
let (num_bits, compressed) = encoder.compress_block_unsorted(&data);
|
||||
assert_eq!(compressed.len(), compressed_block_size(num_bits));
|
||||
}
|
||||
}
|
||||
//#[test]
|
||||
//fn test_all_docs_compression_numbits() {
|
||||
//for expected_num_bits in 0u8.. {
|
||||
//let mut data = [0u32; 128];
|
||||
//if expected_num_bits > 0 {
|
||||
//data[0] = (1u64 << (expected_num_bits as usize) - 1) as u32;
|
||||
//}
|
||||
//let mut encoder = BlockEncoder::new();
|
||||
//let (num_bits, compressed) = encoder.compress_block_unsorted(&data);
|
||||
//assert_eq!(compressed.len(), compressed_block_size(num_bits));
|
||||
//}
|
||||
//}
|
||||
|
||||
const NUM_INTS_BENCH_VINT: usize = 10;
|
||||
|
||||
|
||||
@@ -6,7 +6,7 @@ use crate::query::Weight;
|
||||
use crate::schema::IndexRecordOption;
|
||||
use crate::schema::Term;
|
||||
use crate::Searcher;
|
||||
use std::collections::BTreeSet;
|
||||
use std::collections::BTreeMap;
|
||||
|
||||
/// The boolean query returns a set of documents
|
||||
/// that matches the Boolean combination of constituent subqueries.
|
||||
@@ -159,9 +159,9 @@ impl Query for BooleanQuery {
|
||||
Ok(Box::new(BooleanWeight::new(sub_weights, scoring_enabled)))
|
||||
}
|
||||
|
||||
fn query_terms(&self, term_set: &mut BTreeSet<Term>) {
|
||||
fn query_terms(&self, terms: &mut BTreeMap<Term, bool>) {
|
||||
for (_occur, subquery) in &self.subqueries {
|
||||
subquery.query_terms(term_set);
|
||||
subquery.query_terms(terms);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -2,7 +2,7 @@ use crate::fastfield::DeleteBitSet;
|
||||
use crate::query::explanation::does_not_match;
|
||||
use crate::query::{Explanation, Query, Scorer, Weight};
|
||||
use crate::{DocId, DocSet, Score, Searcher, SegmentReader, Term};
|
||||
use std::collections::BTreeSet;
|
||||
use std::collections::BTreeMap;
|
||||
use std::fmt;
|
||||
|
||||
/// `BoostQuery` is a wrapper over a query used to boost its score.
|
||||
@@ -48,8 +48,8 @@ impl Query for BoostQuery {
|
||||
Ok(boosted_weight)
|
||||
}
|
||||
|
||||
fn query_terms(&self, term_set: &mut BTreeSet<Term>) {
|
||||
self.query.query_terms(term_set)
|
||||
fn query_terms(&self, terms: &mut BTreeMap<Term, bool>) {
|
||||
self.query.query_terms(terms)
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -11,7 +11,7 @@ mod exclude;
|
||||
mod explanation;
|
||||
mod fuzzy_query;
|
||||
mod intersection;
|
||||
mod mlt;
|
||||
mod more_like_this;
|
||||
mod phrase_query;
|
||||
mod query;
|
||||
mod query_parser;
|
||||
@@ -46,7 +46,7 @@ pub use self::explanation::Explanation;
|
||||
pub(crate) use self::fuzzy_query::DfaWrapper;
|
||||
pub use self::fuzzy_query::FuzzyTermQuery;
|
||||
pub use self::intersection::intersect_scorers;
|
||||
pub use self::mlt::{MoreLikeThisQuery, MoreLikeThisQueryBuilder};
|
||||
pub use self::more_like_this::{MoreLikeThisQuery, MoreLikeThisQueryBuilder};
|
||||
pub use self::phrase_query::PhraseQuery;
|
||||
pub use self::query::{Query, QueryClone};
|
||||
pub use self::query_parser::QueryParser;
|
||||
@@ -66,7 +66,7 @@ mod tests {
|
||||
use crate::schema::{Schema, TEXT};
|
||||
use crate::Index;
|
||||
use crate::Term;
|
||||
use std::collections::BTreeSet;
|
||||
use std::collections::BTreeMap;
|
||||
|
||||
#[test]
|
||||
fn test_query_terms() {
|
||||
@@ -78,49 +78,49 @@ mod tests {
|
||||
let term_a = Term::from_field_text(text_field, "a");
|
||||
let term_b = Term::from_field_text(text_field, "b");
|
||||
{
|
||||
let mut terms_set: BTreeSet<Term> = BTreeSet::new();
|
||||
let mut terms: BTreeMap<Term, bool> = Default::default();
|
||||
query_parser
|
||||
.parse_query("a")
|
||||
.unwrap()
|
||||
.query_terms(&mut terms_set);
|
||||
let terms: Vec<&Term> = terms_set.iter().collect();
|
||||
assert_eq!(vec![&term_a], terms);
|
||||
.query_terms(&mut terms);
|
||||
let terms: Vec<(&Term, &bool)> = terms.iter().collect();
|
||||
assert_eq!(vec![(&term_a, &false)], terms);
|
||||
}
|
||||
{
|
||||
let mut terms_set: BTreeSet<Term> = BTreeSet::new();
|
||||
let mut terms: BTreeMap<Term, bool> = Default::default();
|
||||
query_parser
|
||||
.parse_query("a b")
|
||||
.unwrap()
|
||||
.query_terms(&mut terms_set);
|
||||
let terms: Vec<&Term> = terms_set.iter().collect();
|
||||
assert_eq!(vec![&term_a, &term_b], terms);
|
||||
.query_terms(&mut terms);
|
||||
let terms: Vec<(&Term, &bool)> = terms.iter().collect();
|
||||
assert_eq!(vec![(&term_a, &false), (&term_b, &false)], terms);
|
||||
}
|
||||
{
|
||||
let mut terms_set: BTreeSet<Term> = BTreeSet::new();
|
||||
let mut terms: BTreeMap<Term, bool> = Default::default();
|
||||
query_parser
|
||||
.parse_query("\"a b\"")
|
||||
.unwrap()
|
||||
.query_terms(&mut terms_set);
|
||||
let terms: Vec<&Term> = terms_set.iter().collect();
|
||||
assert_eq!(vec![&term_a, &term_b], terms);
|
||||
.query_terms(&mut terms);
|
||||
let terms: Vec<(&Term, &bool)> = terms.iter().collect();
|
||||
assert_eq!(vec![(&term_a, &true), (&term_b, &true)], terms);
|
||||
}
|
||||
{
|
||||
let mut terms_set: BTreeSet<Term> = BTreeSet::new();
|
||||
let mut terms: BTreeMap<Term, bool> = Default::default();
|
||||
query_parser
|
||||
.parse_query("a a a a a")
|
||||
.unwrap()
|
||||
.query_terms(&mut terms_set);
|
||||
let terms: Vec<&Term> = terms_set.iter().collect();
|
||||
assert_eq!(vec![&term_a], terms);
|
||||
.query_terms(&mut terms);
|
||||
let terms: Vec<(&Term, &bool)> = terms.iter().collect();
|
||||
assert_eq!(vec![(&term_a, &false)], terms);
|
||||
}
|
||||
{
|
||||
let mut terms_set: BTreeSet<Term> = BTreeSet::new();
|
||||
let mut terms: BTreeMap<Term, bool> = Default::default();
|
||||
query_parser
|
||||
.parse_query("a -b")
|
||||
.unwrap()
|
||||
.query_terms(&mut terms_set);
|
||||
let terms: Vec<&Term> = terms_set.iter().collect();
|
||||
assert_eq!(vec![&term_a, &term_b], terms);
|
||||
.query_terms(&mut terms);
|
||||
let terms: Vec<(&Term, &bool)> = terms.iter().collect();
|
||||
assert_eq!(vec![(&term_a, &false), (&term_b, &false)], terms);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
mod mlt;
|
||||
mod more_like_this;
|
||||
mod query;
|
||||
|
||||
pub use self::mlt::MoreLikeThis;
|
||||
pub use self::more_like_this::MoreLikeThis;
|
||||
pub use self::query::{MoreLikeThisQuery, MoreLikeThisQueryBuilder};
|
||||
@@ -233,10 +233,9 @@ impl MoreLikeThis {
|
||||
}
|
||||
FieldType::U64(_) => {
|
||||
for field_value in field_values {
|
||||
let val = field_value
|
||||
.value()
|
||||
.u64_value()
|
||||
.ok_or(TantivyError::InvalidArgument("invalid value".to_string()))?;
|
||||
let val = field_value.value().u64_value().ok_or_else(|| {
|
||||
TantivyError::InvalidArgument("invalid value".to_string())
|
||||
})?;
|
||||
if !self.is_noise_word(val.to_string()) {
|
||||
let term = Term::from_field_u64(field, val);
|
||||
*term_frequencies.entry(term).or_insert(0) += 1;
|
||||
@@ -249,7 +248,7 @@ impl MoreLikeThis {
|
||||
let val = field_value
|
||||
.value()
|
||||
.date_value()
|
||||
.ok_or(TantivyError::InvalidArgument("invalid value".to_string()))?
|
||||
.ok_or_else(|| TantivyError::InvalidArgument("invalid value".to_string()))?
|
||||
.timestamp();
|
||||
if !self.is_noise_word(val.to_string()) {
|
||||
let term = Term::from_field_i64(field, val);
|
||||
@@ -259,10 +258,9 @@ impl MoreLikeThis {
|
||||
}
|
||||
FieldType::I64(_) => {
|
||||
for field_value in field_values {
|
||||
let val = field_value
|
||||
.value()
|
||||
.i64_value()
|
||||
.ok_or(TantivyError::InvalidArgument("invalid value".to_string()))?;
|
||||
let val = field_value.value().i64_value().ok_or_else(|| {
|
||||
TantivyError::InvalidArgument("invalid value".to_string())
|
||||
})?;
|
||||
if !self.is_noise_word(val.to_string()) {
|
||||
let term = Term::from_field_i64(field, val);
|
||||
*term_frequencies.entry(term).or_insert(0) += 1;
|
||||
@@ -271,10 +269,9 @@ impl MoreLikeThis {
|
||||
}
|
||||
FieldType::F64(_) => {
|
||||
for field_value in field_values {
|
||||
let val = field_value
|
||||
.value()
|
||||
.f64_value()
|
||||
.ok_or(TantivyError::InvalidArgument("invalid value".to_string()))?;
|
||||
let val = field_value.value().f64_value().ok_or_else(|| {
|
||||
TantivyError::InvalidArgument("invalid value".to_string())
|
||||
})?;
|
||||
if !self.is_noise_word(val.to_string()) {
|
||||
let term = Term::from_field_f64(field, val);
|
||||
*term_frequencies.entry(term).or_insert(0) += 1;
|
||||
@@ -306,7 +303,7 @@ impl MoreLikeThis {
|
||||
{
|
||||
return true;
|
||||
}
|
||||
return self.stop_words.contains(&word);
|
||||
self.stop_words.contains(&word)
|
||||
}
|
||||
|
||||
/// Computes the score for each term while ignoring not useful terms
|
||||
@@ -1,3 +1,5 @@
|
||||
use std::collections::BTreeMap;
|
||||
|
||||
use super::PhraseWeight;
|
||||
use crate::core::searcher::Searcher;
|
||||
use crate::query::bm25::Bm25Weight;
|
||||
@@ -5,7 +7,6 @@ use crate::query::Query;
|
||||
use crate::query::Weight;
|
||||
use crate::schema::IndexRecordOption;
|
||||
use crate::schema::{Field, Term};
|
||||
use std::collections::BTreeSet;
|
||||
|
||||
/// `PhraseQuery` matches a specific sequence of words.
|
||||
///
|
||||
@@ -113,9 +114,9 @@ impl Query for PhraseQuery {
|
||||
Ok(Box::new(phrase_weight))
|
||||
}
|
||||
|
||||
fn query_terms(&self, term_set: &mut BTreeSet<Term>) {
|
||||
for (_, query_term) in &self.phrase_terms {
|
||||
term_set.insert(query_term.clone());
|
||||
fn query_terms(&self, terms: &mut BTreeMap<Term, bool>) {
|
||||
for (_, term) in &self.phrase_terms {
|
||||
terms.insert(term.clone(), true);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -4,7 +4,7 @@ use crate::query::Explanation;
|
||||
use crate::DocAddress;
|
||||
use crate::Term;
|
||||
use downcast_rs::impl_downcast;
|
||||
use std::collections::BTreeSet;
|
||||
use std::collections::BTreeMap;
|
||||
use std::fmt;
|
||||
|
||||
/// The `Query` trait defines a set of documents and a scoring method
|
||||
@@ -68,7 +68,10 @@ pub trait Query: QueryClone + Send + Sync + downcast_rs::Downcast + fmt::Debug {
|
||||
|
||||
/// Extract all of the terms associated to the query and insert them in the
|
||||
/// term set given in arguments.
|
||||
fn query_terms(&self, _term_set: &mut BTreeSet<Term>) {}
|
||||
///
|
||||
/// Each term is associated with a boolean indicating whether
|
||||
/// positions are required or not.
|
||||
fn query_terms(&self, _term_set: &mut BTreeMap<Term, bool>) {}
|
||||
}
|
||||
|
||||
/// Implements `box_clone`.
|
||||
@@ -95,8 +98,8 @@ impl Query for Box<dyn Query> {
|
||||
self.as_ref().count(searcher)
|
||||
}
|
||||
|
||||
fn query_terms(&self, term_set: &mut BTreeSet<Term<Vec<u8>>>) {
|
||||
self.as_ref().query_terms(term_set);
|
||||
fn query_terms(&self, terms: &mut BTreeMap<Term, bool>) {
|
||||
self.as_ref().query_terms(terms);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -8,7 +8,7 @@ use crate::query::Query;
|
||||
use crate::query::RangeQuery;
|
||||
use crate::query::TermQuery;
|
||||
use crate::query::{AllQuery, BoostQuery};
|
||||
use crate::schema::{Facet, IndexRecordOption};
|
||||
use crate::schema::{Facet, FacetParseError, IndexRecordOption};
|
||||
use crate::schema::{Field, Schema};
|
||||
use crate::schema::{FieldType, Term};
|
||||
use crate::tokenizer::TokenizerManager;
|
||||
@@ -68,6 +68,9 @@ pub enum QueryParserError {
|
||||
/// The format for the date field is not RFC 3339 compliant.
|
||||
#[error("The date field has an invalid format")]
|
||||
DateFormatError(chrono::ParseError),
|
||||
/// The format for the facet field is invalid.
|
||||
#[error("The facet field is malformed: {0}")]
|
||||
FacetFormatError(FacetParseError),
|
||||
}
|
||||
|
||||
impl From<ParseIntError> for QueryParserError {
|
||||
@@ -88,6 +91,12 @@ impl From<chrono::ParseError> for QueryParserError {
|
||||
}
|
||||
}
|
||||
|
||||
impl From<FacetParseError> for QueryParserError {
|
||||
fn from(err: FacetParseError) -> QueryParserError {
|
||||
QueryParserError::FacetFormatError(err)
|
||||
}
|
||||
}
|
||||
|
||||
/// Recursively remove empty clause from the AST
|
||||
///
|
||||
/// Returns `None` iff the `logical_ast` ended up being empty.
|
||||
@@ -358,10 +367,10 @@ impl QueryParser {
|
||||
))
|
||||
}
|
||||
}
|
||||
FieldType::HierarchicalFacet(_) => {
|
||||
let facet = Facet::from_text(phrase);
|
||||
Ok(vec![(0, Term::from_field_text(field, facet.encoded_str()))])
|
||||
}
|
||||
FieldType::HierarchicalFacet(_) => match Facet::from_text(phrase) {
|
||||
Ok(facet) => Ok(vec![(0, Term::from_field_text(field, facet.encoded_str()))]),
|
||||
Err(e) => Err(QueryParserError::from(e)),
|
||||
},
|
||||
FieldType::Bytes(_) => {
|
||||
let bytes = base64::decode(phrase).map_err(QueryParserError::ExpectedBase64)?;
|
||||
let term = Term::from_field_bytes(field, &bytes);
|
||||
@@ -1027,6 +1036,19 @@ mod test {
|
||||
.is_ok());
|
||||
}
|
||||
|
||||
#[test]
|
||||
pub fn test_query_parser_expected_facet() {
|
||||
let query_parser = make_query_parser();
|
||||
match query_parser.parse_query("facet:INVALID") {
|
||||
Ok(_) => panic!("should never succeed"),
|
||||
Err(e) => assert_eq!(
|
||||
"The facet field is malformed: Failed to parse the facet string: 'INVALID'",
|
||||
format!("{}", e)
|
||||
),
|
||||
}
|
||||
assert!(query_parser.parse_query("facet:\"/foo/bar\"").is_ok());
|
||||
}
|
||||
|
||||
#[test]
|
||||
pub fn test_query_parser_not_empty_but_no_tokens() {
|
||||
let query_parser = make_query_parser();
|
||||
|
||||
@@ -5,7 +5,7 @@ use crate::query::{Explanation, Query};
|
||||
use crate::schema::IndexRecordOption;
|
||||
use crate::Searcher;
|
||||
use crate::Term;
|
||||
use std::collections::BTreeSet;
|
||||
use std::collections::BTreeMap;
|
||||
use std::fmt;
|
||||
|
||||
/// A Term query matches all of the documents
|
||||
@@ -127,7 +127,7 @@ impl Query for TermQuery {
|
||||
self.specialized_weight(searcher, scoring_enabled)?,
|
||||
))
|
||||
}
|
||||
fn query_terms(&self, term_set: &mut BTreeSet<Term>) {
|
||||
term_set.insert(self.term.clone());
|
||||
fn query_terms(&self, terms: &mut BTreeMap<Term, bool>) {
|
||||
terms.insert(self.term.clone(), false);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -20,6 +20,14 @@ pub const FACET_SEP_BYTE: u8 = 0u8;
|
||||
/// representation of facets. (It is the null codepoint.)
|
||||
pub const FACET_SEP_CHAR: char = '\u{0}';
|
||||
|
||||
/// An error enum for facet parser.
|
||||
#[derive(Debug, PartialEq, Eq, Error)]
|
||||
pub enum FacetParseError {
|
||||
/// The facet text representation is unparsable.
|
||||
#[error("Failed to parse the facet string: '{0}'")]
|
||||
FacetParseError(String),
|
||||
}
|
||||
|
||||
/// A Facet represents a point in a given hierarchy.
|
||||
///
|
||||
/// They are typically represented similarly to a filepath.
|
||||
@@ -75,11 +83,47 @@ impl Facet {
|
||||
/// Conceptually, if one of the steps of this path
|
||||
/// contains a `/` or a `\`, it should be escaped
|
||||
/// using an anti-slash `\`.
|
||||
pub fn from_text<T>(path: &T) -> Facet
|
||||
pub fn from_text<T>(path: &T) -> Result<Facet, FacetParseError>
|
||||
where
|
||||
T: ?Sized + AsRef<str>,
|
||||
{
|
||||
From::from(path)
|
||||
#[derive(Copy, Clone)]
|
||||
enum State {
|
||||
Escaped,
|
||||
Idle,
|
||||
}
|
||||
let path_ref = path.as_ref();
|
||||
if path_ref.is_empty() {
|
||||
return Err(FacetParseError::FacetParseError(path_ref.to_string()));
|
||||
}
|
||||
if !path_ref.starts_with('/') {
|
||||
return Err(FacetParseError::FacetParseError(path_ref.to_string()));
|
||||
}
|
||||
let mut facet_encoded = String::new();
|
||||
let mut state = State::Idle;
|
||||
let path_bytes = path_ref.as_bytes();
|
||||
let mut last_offset = 1;
|
||||
for i in 1..path_bytes.len() {
|
||||
let c = path_bytes[i];
|
||||
match (state, c) {
|
||||
(State::Idle, ESCAPE_BYTE) => {
|
||||
facet_encoded.push_str(&path_ref[last_offset..i]);
|
||||
last_offset = i + 1;
|
||||
state = State::Escaped
|
||||
}
|
||||
(State::Idle, SLASH_BYTE) => {
|
||||
facet_encoded.push_str(&path_ref[last_offset..i]);
|
||||
facet_encoded.push(FACET_SEP_CHAR);
|
||||
last_offset = i + 1;
|
||||
}
|
||||
(State::Escaped, _escaped_char) => {
|
||||
state = State::Idle;
|
||||
}
|
||||
(State::Idle, _any_char) => {}
|
||||
}
|
||||
}
|
||||
facet_encoded.push_str(&path_ref[last_offset..]);
|
||||
Ok(Facet(facet_encoded))
|
||||
}
|
||||
|
||||
/// Returns a `Facet` from an iterator over the different
|
||||
@@ -137,39 +181,7 @@ impl Borrow<str> for Facet {
|
||||
|
||||
impl<'a, T: ?Sized + AsRef<str>> From<&'a T> for Facet {
|
||||
fn from(path_asref: &'a T) -> Facet {
|
||||
#[derive(Copy, Clone)]
|
||||
enum State {
|
||||
Escaped,
|
||||
Idle,
|
||||
}
|
||||
let path: &str = path_asref.as_ref();
|
||||
assert!(!path.is_empty());
|
||||
assert!(path.starts_with('/'));
|
||||
let mut facet_encoded = String::new();
|
||||
let mut state = State::Idle;
|
||||
let path_bytes = path.as_bytes();
|
||||
let mut last_offset = 1;
|
||||
for i in 1..path_bytes.len() {
|
||||
let c = path_bytes[i];
|
||||
match (state, c) {
|
||||
(State::Idle, ESCAPE_BYTE) => {
|
||||
facet_encoded.push_str(&path[last_offset..i]);
|
||||
last_offset = i + 1;
|
||||
state = State::Escaped
|
||||
}
|
||||
(State::Idle, SLASH_BYTE) => {
|
||||
facet_encoded.push_str(&path[last_offset..i]);
|
||||
facet_encoded.push(FACET_SEP_CHAR);
|
||||
last_offset = i + 1;
|
||||
}
|
||||
(State::Escaped, _escaped_char) => {
|
||||
state = State::Idle;
|
||||
}
|
||||
(State::Idle, _any_char) => {}
|
||||
}
|
||||
}
|
||||
facet_encoded.push_str(&path[last_offset..]);
|
||||
Facet(facet_encoded)
|
||||
Facet::from_text(path_asref).unwrap()
|
||||
}
|
||||
}
|
||||
|
||||
@@ -226,7 +238,7 @@ impl Debug for Facet {
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
|
||||
use super::Facet;
|
||||
use super::{Facet, FacetParseError};
|
||||
|
||||
#[test]
|
||||
fn test_root() {
|
||||
@@ -288,4 +300,12 @@ mod tests {
|
||||
let facet = Facet::from_path(v.iter());
|
||||
assert_eq!(facet.to_path_string(), "/");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_from_text() {
|
||||
assert_eq!(
|
||||
Err(FacetParseError::FacetParseError("INVALID".to_string())),
|
||||
Facet::from_text("INVALID")
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -23,6 +23,15 @@ pub struct FieldEntry {
|
||||
}
|
||||
|
||||
impl FieldEntry {
|
||||
/// Creates a new field entry given a name and a field type
|
||||
pub fn new(field_name: String, field_type: FieldType) -> FieldEntry {
|
||||
assert!(is_valid_field_name(&field_name));
|
||||
FieldEntry {
|
||||
name: field_name,
|
||||
field_type,
|
||||
}
|
||||
}
|
||||
|
||||
/// Creates a new text field entry in the schema, given
|
||||
/// a name, and some options.
|
||||
pub fn new_text(field_name: String, text_options: TextOptions) -> FieldEntry {
|
||||
|
||||
@@ -128,6 +128,7 @@ pub use self::schema::{Schema, SchemaBuilder};
|
||||
pub use self::value::Value;
|
||||
|
||||
pub use self::facet::Facet;
|
||||
pub use self::facet::FacetParseError;
|
||||
pub(crate) use self::facet::FACET_SEP_BYTE;
|
||||
pub use self::facet_options::FacetOptions;
|
||||
|
||||
|
||||
@@ -7,7 +7,6 @@ use crate::{Document, Score};
|
||||
use htmlescape::encode_minimal;
|
||||
use std::cmp::Ordering;
|
||||
use std::collections::BTreeMap;
|
||||
use std::collections::BTreeSet;
|
||||
use std::ops::Range;
|
||||
|
||||
const DEFAULT_MAX_NUM_CHARS: usize = 150;
|
||||
@@ -239,10 +238,10 @@ impl SnippetGenerator {
|
||||
query: &dyn Query,
|
||||
field: Field,
|
||||
) -> crate::Result<SnippetGenerator> {
|
||||
let mut terms = BTreeSet::new();
|
||||
let mut terms = BTreeMap::new();
|
||||
query.query_terms(&mut terms);
|
||||
let mut terms_text: BTreeMap<String, Score> = Default::default();
|
||||
for term in terms {
|
||||
for (term, _) in terms {
|
||||
if term.field() != field {
|
||||
continue;
|
||||
}
|
||||
|
||||
@@ -1,10 +1,6 @@
|
||||
use std::io;
|
||||
|
||||
/// Name of the compression scheme used in the doc store.
|
||||
///
|
||||
/// This name is appended to the version string of tantivy.
|
||||
pub const COMPRESSION: &'static str = "brotli";
|
||||
|
||||
#[inline]
|
||||
pub fn compress(mut uncompressed: &[u8], compressed: &mut Vec<u8>) -> io::Result<()> {
|
||||
let mut params = brotli::enc::BrotliEncoderParams::default();
|
||||
params.quality = 5;
|
||||
@@ -13,6 +9,7 @@ pub fn compress(mut uncompressed: &[u8], compressed: &mut Vec<u8>) -> io::Result
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub fn decompress(mut compressed: &[u8], decompressed: &mut Vec<u8>) -> io::Result<()> {
|
||||
decompressed.clear();
|
||||
brotli::BrotliDecompress(&mut compressed, decompressed)?;
|
||||
|
||||
@@ -1,22 +0,0 @@
|
||||
use std::io::{self, Read, Write};
|
||||
|
||||
/// Name of the compression scheme used in the doc store.
|
||||
///
|
||||
/// This name is appended to the version string of tantivy.
|
||||
pub const COMPRESSION: &str = "lz4";
|
||||
|
||||
pub fn compress(uncompressed: &[u8], compressed: &mut Vec<u8>) -> io::Result<()> {
|
||||
compressed.clear();
|
||||
let mut encoder = lz4::EncoderBuilder::new().build(compressed)?;
|
||||
encoder.write_all(&uncompressed)?;
|
||||
let (_, encoder_result) = encoder.finish();
|
||||
encoder_result?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn decompress(compressed: &[u8], decompressed: &mut Vec<u8>) -> io::Result<()> {
|
||||
decompressed.clear();
|
||||
let mut decoder = lz4::Decoder::new(compressed)?;
|
||||
decoder.read_to_end(decompressed)?;
|
||||
Ok(())
|
||||
}
|
||||
@@ -2,38 +2,46 @@ use std::io::{self};
|
||||
|
||||
use core::convert::TryInto;
|
||||
use lz4_flex::{compress_into, decompress_into};
|
||||
/// Name of the compression scheme used in the doc store.
|
||||
///
|
||||
/// This name is appended to the version string of tantivy.
|
||||
pub const COMPRESSION: &str = "lz4_block";
|
||||
|
||||
#[inline]
|
||||
pub fn compress(uncompressed: &[u8], compressed: &mut Vec<u8>) -> io::Result<()> {
|
||||
compressed.clear();
|
||||
let maximum_ouput_size = lz4_flex::block::get_maximum_output_size(uncompressed.len());
|
||||
compressed.reserve(maximum_ouput_size);
|
||||
|
||||
compressed.extend_from_slice(&[0, 0, 0, 0]);
|
||||
compress_into(uncompressed, compressed);
|
||||
unsafe {
|
||||
compressed.set_len(maximum_ouput_size + 4);
|
||||
}
|
||||
let bytes_written = compress_into(uncompressed, compressed, 4)
|
||||
.map_err(|err| io::Error::new(io::ErrorKind::InvalidData, err.to_string()))?;
|
||||
let num_bytes = uncompressed.len() as u32;
|
||||
compressed[0..4].copy_from_slice(&num_bytes.to_le_bytes());
|
||||
unsafe {
|
||||
compressed.set_len(bytes_written + 4);
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub fn decompress(compressed: &[u8], decompressed: &mut Vec<u8>) -> io::Result<()> {
|
||||
decompressed.clear();
|
||||
//next lz4_flex version will support slice as input parameter.
|
||||
//this will make the usage much less ugly
|
||||
let uncompressed_size_bytes: &[u8; 4] = compressed
|
||||
.get(..4)
|
||||
.ok_or(io::ErrorKind::InvalidData)?
|
||||
.try_into()
|
||||
.unwrap();
|
||||
let uncompressed_size = u32::from_le_bytes(*uncompressed_size_bytes) as usize;
|
||||
// reserve more than required, because blocked writes may write out of bounds, will be improved
|
||||
// with lz4_flex 1.0
|
||||
decompressed.reserve(uncompressed_size + 4 + 24);
|
||||
decompressed.reserve(uncompressed_size);
|
||||
unsafe {
|
||||
decompressed.set_len(uncompressed_size);
|
||||
}
|
||||
decompress_into(&compressed[4..], decompressed)
|
||||
let bytes_written = decompress_into(&compressed[4..], decompressed, 0)
|
||||
.map_err(|err| io::Error::new(io::ErrorKind::InvalidData, err.to_string()))?;
|
||||
if bytes_written != uncompressed_size {
|
||||
return Err(io::Error::new(
|
||||
io::ErrorKind::InvalidData,
|
||||
"doc store block not completely decompressed, data corruption".to_string(),
|
||||
));
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -1,10 +1,6 @@
|
||||
use std::io::{self, Read, Write};
|
||||
|
||||
/// Name of the compression scheme used in the doc store.
|
||||
///
|
||||
/// This name is appended to the version string of tantivy.
|
||||
pub const COMPRESSION: &str = "snappy";
|
||||
|
||||
#[inline]
|
||||
pub fn compress(uncompressed: &[u8], compressed: &mut Vec<u8>) -> io::Result<()> {
|
||||
compressed.clear();
|
||||
let mut encoder = snap::write::FrameEncoder::new(compressed);
|
||||
@@ -13,6 +9,7 @@ pub fn compress(uncompressed: &[u8], compressed: &mut Vec<u8>) -> io::Result<()>
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub fn decompress(compressed: &[u8], decompressed: &mut Vec<u8>) -> io::Result<()> {
|
||||
decompressed.clear();
|
||||
snap::read::FrameDecoder::new(compressed).read_to_end(decompressed)?;
|
||||
|
||||
134
src/store/compressors.rs
Normal file
134
src/store/compressors.rs
Normal file
@@ -0,0 +1,134 @@
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::io;
|
||||
|
||||
/// Interface of a doc store compression codec.
///
/// NOTE(review): no implementor is visible in this file; the method
/// descriptions below are derived from the signatures and should be confirmed
/// against the implementations.
pub trait StoreCompressor {
    /// Presumably writes the compressed form of `uncompressed` into
    /// `compressed`, reporting failures as an `io::Error`.
    fn compress(
        &self,
        uncompressed: &[u8],
        compressed: &mut Vec<u8>,
    ) -> io::Result<()>;

    /// Presumably writes the decompressed form of `compressed` into
    /// `decompressed`, reporting failures as an `io::Error`.
    fn decompress(
        &self,
        compressed: &[u8],
        decompressed: &mut Vec<u8>,
    ) -> io::Result<()>;

    /// Returns the one-byte id identifying this compressor type.
    fn get_compressor_id() -> u8;
}
|
||||
|
||||
/// Compressor can be used on `IndexSettings` to choose
|
||||
/// the compressor used to compress the doc store.
|
||||
///
|
||||
/// The default is Lz4Block, but also depends on the enabled feature flags.
|
||||
#[derive(Clone, Debug, Copy, PartialEq, Eq, Serialize, Deserialize)]
|
||||
pub enum Compressor {
|
||||
#[serde(rename = "lz4")]
|
||||
/// Use the lz4 compressor (block format)
|
||||
Lz4,
|
||||
#[serde(rename = "brotli")]
|
||||
/// Use the brotli compressor
|
||||
Brotli,
|
||||
#[serde(rename = "snappy")]
|
||||
/// Use the snap compressor
|
||||
Snappy,
|
||||
}
|
||||
|
||||
impl Default for Compressor {
|
||||
fn default() -> Self {
|
||||
if cfg!(feature = "lz4-compression") {
|
||||
Compressor::Lz4
|
||||
} else if cfg!(feature = "brotli-compression") {
|
||||
Compressor::Brotli
|
||||
} else if cfg!(feature = "snappy-compression") {
|
||||
Compressor::Snappy
|
||||
} else {
|
||||
panic!(
|
||||
"all compressor feature flags like are disabled (e.g. lz4-compression), can't choose default compressor"
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Compressor {
|
||||
pub(crate) fn from_id(id: u8) -> Compressor {
|
||||
match id {
|
||||
1 => Compressor::Lz4,
|
||||
2 => Compressor::Brotli,
|
||||
3 => Compressor::Snappy,
|
||||
_ => panic!("unknown compressor id {:?}", id),
|
||||
}
|
||||
}
|
||||
pub(crate) fn get_id(&self) -> u8 {
|
||||
match self {
|
||||
Self::Lz4 => 1,
|
||||
Self::Brotli => 2,
|
||||
Self::Snappy => 3,
|
||||
}
|
||||
}
|
||||
#[inline]
|
||||
pub(crate) fn compress(&self, uncompressed: &[u8], compressed: &mut Vec<u8>) -> io::Result<()> {
|
||||
match self {
|
||||
Self::Lz4 => {
|
||||
#[cfg(feature = "lz4-compression")]
|
||||
{
|
||||
super::compression_lz4_block::compress(uncompressed, compressed)
|
||||
}
|
||||
#[cfg(not(feature = "lz4-compression"))]
|
||||
{
|
||||
panic!("lz4-compression feature flag not activated");
|
||||
}
|
||||
}
|
||||
Self::Brotli => {
|
||||
#[cfg(feature = "brotli-compression")]
|
||||
{
|
||||
super::compression_brotli::compress(uncompressed, compressed)
|
||||
}
|
||||
#[cfg(not(feature = "brotli-compression"))]
|
||||
{
|
||||
panic!("brotli-compression-compression feature flag not activated");
|
||||
}
|
||||
}
|
||||
Self::Snappy => {
|
||||
#[cfg(feature = "snappy-compression")]
|
||||
{
|
||||
super::compression_snap::compress(uncompressed, compressed)
|
||||
}
|
||||
#[cfg(not(feature = "snappy-compression"))]
|
||||
{
|
||||
panic!("snappy-compression feature flag not activated");
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub(crate) fn decompress(
|
||||
&self,
|
||||
compressed: &[u8],
|
||||
decompressed: &mut Vec<u8>,
|
||||
) -> io::Result<()> {
|
||||
match self {
|
||||
Self::Lz4 => {
|
||||
#[cfg(feature = "lz4-compression")]
|
||||
{
|
||||
super::compression_lz4_block::decompress(compressed, decompressed)
|
||||
}
|
||||
#[cfg(not(feature = "lz4-compression"))]
|
||||
{
|
||||
panic!("lz4-compression feature flag not activated");
|
||||
}
|
||||
}
|
||||
Self::Brotli => {
|
||||
#[cfg(feature = "brotli-compression")]
|
||||
{
|
||||
super::compression_brotli::decompress(compressed, decompressed)
|
||||
}
|
||||
#[cfg(not(feature = "brotli-compression"))]
|
||||
{
|
||||
panic!("brotli-compression feature flag not activated");
|
||||
}
|
||||
}
|
||||
Self::Snappy => {
|
||||
#[cfg(feature = "snappy-compression")]
|
||||
{
|
||||
super::compression_snap::decompress(compressed, decompressed)
|
||||
}
|
||||
#[cfg(not(feature = "snappy-compression"))]
|
||||
{
|
||||
panic!("snappy-compression feature flag not activated");
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
69
src/store/footer.rs
Normal file
69
src/store/footer.rs
Normal file
@@ -0,0 +1,69 @@
|
||||
use crate::{
|
||||
common::{BinarySerializable, FixedSize, HasLen},
|
||||
directory::FileSlice,
|
||||
store::Compressor,
|
||||
};
|
||||
use std::io;
|
||||
|
||||
#[derive(Debug, Clone, PartialEq)]
|
||||
pub struct DocStoreFooter {
|
||||
pub offset: u64,
|
||||
pub compressor: Compressor,
|
||||
}
|
||||
|
||||
/// Serialises the footer to a byte-array
|
||||
/// - offset : 8 bytes
|
||||
///- compressor id: 1 byte
|
||||
/// - reserved for future use: 15 bytes
|
||||
impl BinarySerializable for DocStoreFooter {
|
||||
fn serialize<W: io::Write>(&self, writer: &mut W) -> io::Result<()> {
|
||||
BinarySerializable::serialize(&self.offset, writer)?;
|
||||
BinarySerializable::serialize(&self.compressor.get_id(), writer)?;
|
||||
writer.write_all(&[0; 15])?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn deserialize<R: io::Read>(reader: &mut R) -> io::Result<Self> {
|
||||
let offset = u64::deserialize(reader)?;
|
||||
let compressor_id = u8::deserialize(reader)?;
|
||||
let mut skip_buf = [0; 15];
|
||||
reader.read_exact(&mut skip_buf)?;
|
||||
Ok(DocStoreFooter {
|
||||
offset,
|
||||
compressor: Compressor::from_id(compressor_id),
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
impl FixedSize for DocStoreFooter {
    // offset (8 bytes) + compressor id (1 byte) + reserved (15 bytes)
    const SIZE_IN_BYTES: usize = 8 + 1 + 15;
}
|
||||
|
||||
impl DocStoreFooter {
|
||||
pub fn new(offset: u64, compressor: Compressor) -> Self {
|
||||
DocStoreFooter { offset, compressor }
|
||||
}
|
||||
|
||||
pub fn extract_footer(file: FileSlice) -> io::Result<(DocStoreFooter, FileSlice)> {
|
||||
if file.len() < DocStoreFooter::SIZE_IN_BYTES {
|
||||
return Err(io::Error::new(
|
||||
io::ErrorKind::UnexpectedEof,
|
||||
format!(
|
||||
"File corrupted. The file is smaller than Footer::SIZE_IN_BYTES (len={}).",
|
||||
file.len()
|
||||
),
|
||||
));
|
||||
}
|
||||
let (body, footer_slice) = file.split_from_end(DocStoreFooter::SIZE_IN_BYTES);
|
||||
let mut footer_bytes = footer_slice.read_bytes()?;
|
||||
let footer = DocStoreFooter::deserialize(&mut footer_bytes)?;
|
||||
Ok((footer, body))
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
fn doc_store_footer_test() {
    // This test only safeguards against changes to the footer struct.
    // When the doc store footer is updated, make sure to also update the
    // serialize/deserialize methods.
    let in_memory_size = std::mem::size_of::<DocStoreFooter>();
    assert_eq!(in_memory_size, 16);
}
|
||||
274
src/store/mod.rs
274
src/store/mod.rs
@@ -33,73 +33,32 @@ and should rely on either
|
||||
|
||||
!*/
|
||||
|
||||
mod compressors;
|
||||
mod footer;
|
||||
mod index;
|
||||
mod reader;
|
||||
mod writer;
|
||||
pub use self::reader::RawDocument;
|
||||
pub use self::compressors::Compressor;
|
||||
pub use self::reader::StoreReader;
|
||||
pub use self::writer::StoreWriter;
|
||||
|
||||
// compile_error doesn't scale very well, enum like feature flags would be great to have in Rust
|
||||
#[cfg(all(feature = "lz4", feature = "brotli"))]
|
||||
compile_error!("feature `lz4` or `brotli` must not be enabled together.");
|
||||
|
||||
#[cfg(all(feature = "lz4_block", feature = "brotli"))]
|
||||
compile_error!("feature `lz4_block` or `brotli` must not be enabled together.");
|
||||
|
||||
#[cfg(all(feature = "lz4_block", feature = "lz4"))]
|
||||
compile_error!("feature `lz4_block` or `lz4` must not be enabled together.");
|
||||
|
||||
#[cfg(all(feature = "lz4_block", feature = "snap"))]
|
||||
compile_error!("feature `lz4_block` or `snap` must not be enabled together.");
|
||||
|
||||
#[cfg(all(feature = "lz4", feature = "snap"))]
|
||||
compile_error!("feature `lz4` or `snap` must not be enabled together.");
|
||||
|
||||
#[cfg(all(feature = "brotli", feature = "snap"))]
|
||||
compile_error!("feature `brotli` or `snap` must not be enabled together.");
|
||||
|
||||
#[cfg(not(any(
|
||||
feature = "lz4",
|
||||
feature = "brotli",
|
||||
feature = "lz4_flex",
|
||||
feature = "snap"
|
||||
)))]
|
||||
compile_error!("all compressors are deactivated via feature-flags, check Cargo.toml for available decompressors.");
|
||||
|
||||
#[cfg(feature = "lz4_flex")]
|
||||
#[cfg(feature = "lz4-compression")]
|
||||
mod compression_lz4_block;
|
||||
#[cfg(feature = "lz4_flex")]
|
||||
pub use self::compression_lz4_block::COMPRESSION;
|
||||
#[cfg(feature = "lz4_flex")]
|
||||
use self::compression_lz4_block::{compress, decompress};
|
||||
|
||||
#[cfg(feature = "lz4")]
|
||||
mod compression_lz4;
|
||||
#[cfg(feature = "lz4")]
|
||||
pub use self::compression_lz4::COMPRESSION;
|
||||
#[cfg(feature = "lz4")]
|
||||
use self::compression_lz4::{compress, decompress};
|
||||
|
||||
#[cfg(feature = "brotli")]
|
||||
#[cfg(feature = "brotli-compression")]
|
||||
mod compression_brotli;
|
||||
#[cfg(feature = "brotli")]
|
||||
pub use self::compression_brotli::COMPRESSION;
|
||||
#[cfg(feature = "brotli")]
|
||||
use self::compression_brotli::{compress, decompress};
|
||||
|
||||
#[cfg(feature = "snap")]
|
||||
#[cfg(feature = "snappy-compression")]
|
||||
mod compression_snap;
|
||||
#[cfg(feature = "snap")]
|
||||
pub use self::compression_snap::COMPRESSION;
|
||||
#[cfg(feature = "snap")]
|
||||
use self::compression_snap::{compress, decompress};
|
||||
|
||||
#[cfg(test)]
|
||||
pub mod tests {
|
||||
|
||||
use futures::executor::block_on;
|
||||
|
||||
use super::*;
|
||||
use crate::schema::{self, FieldValue, TextFieldIndexing};
|
||||
use crate::fastfield::DeleteBitSet;
|
||||
use crate::schema::{self, FieldValue, TextFieldIndexing, STORED, TEXT};
|
||||
use crate::schema::{Document, TextOptions};
|
||||
use crate::{
|
||||
directory::{Directory, RamDirectory, WritePtr},
|
||||
@@ -108,28 +67,31 @@ pub mod tests {
|
||||
use crate::{schema::Schema, Index};
|
||||
use std::path::Path;
|
||||
|
||||
pub fn write_lorem_ipsum_store(writer: WritePtr, num_docs: usize) -> Schema {
|
||||
let mut schema_builder = Schema::builder();
|
||||
let field_body = schema_builder.add_text_field("body", TextOptions::default().set_stored());
|
||||
let field_title =
|
||||
schema_builder.add_text_field("title", TextOptions::default().set_stored());
|
||||
let schema = schema_builder.build();
|
||||
let lorem = String::from(
|
||||
"Doc Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed \
|
||||
const LOREM: &str = "Doc Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed \
|
||||
do eiusmod tempor incididunt ut labore et dolore magna aliqua. \
|
||||
Ut enim ad minim veniam, quis nostrud exercitation ullamco \
|
||||
laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure \
|
||||
dolor in reprehenderit in voluptate velit esse cillum dolore eu \
|
||||
fugiat nulla pariatur. Excepteur sint occaecat cupidatat non \
|
||||
proident, sunt in culpa qui officia deserunt mollit anim id est \
|
||||
laborum.",
|
||||
);
|
||||
laborum.";
|
||||
|
||||
pub fn write_lorem_ipsum_store(
|
||||
writer: WritePtr,
|
||||
num_docs: usize,
|
||||
compressor: Compressor,
|
||||
) -> Schema {
|
||||
let mut schema_builder = Schema::builder();
|
||||
let field_body = schema_builder.add_text_field("body", TextOptions::default().set_stored());
|
||||
let field_title =
|
||||
schema_builder.add_text_field("title", TextOptions::default().set_stored());
|
||||
let schema = schema_builder.build();
|
||||
{
|
||||
let mut store_writer = StoreWriter::new(writer);
|
||||
let mut store_writer = StoreWriter::new(writer, compressor);
|
||||
for i in 0..num_docs {
|
||||
let mut fields: Vec<FieldValue> = Vec::new();
|
||||
{
|
||||
let field_value = FieldValue::new(field_body, From::from(lorem.clone()));
|
||||
let field_value = FieldValue::new(field_body, From::from(LOREM.to_string()));
|
||||
fields.push(field_value);
|
||||
}
|
||||
{
|
||||
@@ -146,16 +108,61 @@ pub mod tests {
|
||||
schema
|
||||
}
|
||||
|
||||
const NUM_DOCS: usize = 1_000;
|
||||
#[test]
|
||||
fn test_store() -> crate::Result<()> {
|
||||
fn test_doc_store_iter_with_delete_bug_1077() -> crate::Result<()> {
|
||||
// this will cover deletion of the first element in a checkpoint
|
||||
let deleted_docids = (200..300).collect::<Vec<_>>();
|
||||
let delete_bitset = DeleteBitSet::for_test(&deleted_docids, NUM_DOCS as u32);
|
||||
|
||||
let path = Path::new("store");
|
||||
let directory = RamDirectory::create();
|
||||
let store_wrt = directory.open_write(path)?;
|
||||
let schema = write_lorem_ipsum_store(store_wrt, 1_000);
|
||||
let schema = write_lorem_ipsum_store(store_wrt, NUM_DOCS, Compressor::Lz4);
|
||||
let field_title = schema.get_field("title").unwrap();
|
||||
let store_file = directory.open_read(path)?;
|
||||
let store = StoreReader::open(store_file)?;
|
||||
for i in 0..1_000 {
|
||||
for i in 0..NUM_DOCS as u32 {
|
||||
assert_eq!(
|
||||
*store
|
||||
.get(i)?
|
||||
.get_first(field_title)
|
||||
.unwrap()
|
||||
.text()
|
||||
.unwrap(),
|
||||
format!("Doc {}", i)
|
||||
);
|
||||
}
|
||||
|
||||
for (_, doc) in store.iter(Some(&delete_bitset)).enumerate() {
|
||||
let doc = doc?;
|
||||
let title_content = doc.get_first(field_title).unwrap().text().unwrap();
|
||||
if !title_content.starts_with("Doc ") {
|
||||
panic!("unexpected title_content {}", title_content);
|
||||
}
|
||||
|
||||
let id = title_content
|
||||
.strip_prefix("Doc ")
|
||||
.unwrap()
|
||||
.parse::<u32>()
|
||||
.unwrap();
|
||||
if delete_bitset.is_deleted(id) {
|
||||
panic!("unexpected deleted document {}", id);
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn test_store(compressor: Compressor) -> crate::Result<()> {
|
||||
let path = Path::new("store");
|
||||
let directory = RamDirectory::create();
|
||||
let store_wrt = directory.open_write(path)?;
|
||||
let schema = write_lorem_ipsum_store(store_wrt, NUM_DOCS, compressor);
|
||||
let field_title = schema.get_field("title").unwrap();
|
||||
let store_file = directory.open_read(path)?;
|
||||
let store = StoreReader::open(store_file)?;
|
||||
for i in 0..NUM_DOCS as u32 {
|
||||
assert_eq!(
|
||||
*store
|
||||
.get(i)?
|
||||
@@ -175,6 +182,22 @@ pub mod tests {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[cfg(feature = "lz4-compression")]
|
||||
#[test]
|
||||
fn test_store_lz4_block() -> crate::Result<()> {
|
||||
test_store(Compressor::Lz4)
|
||||
}
|
||||
#[cfg(feature = "snappy-compression")]
|
||||
#[test]
|
||||
fn test_store_snap() -> crate::Result<()> {
|
||||
test_store(Compressor::Snappy)
|
||||
}
|
||||
#[cfg(feature = "brotli-compression")]
|
||||
#[test]
|
||||
fn test_store_brotli() -> crate::Result<()> {
|
||||
test_store(Compressor::Brotli)
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_store_with_delete() -> crate::Result<()> {
|
||||
let mut schema_builder = schema::Schema::builder();
|
||||
@@ -215,6 +238,108 @@ pub mod tests {
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[cfg(feature = "snappy-compression")]
|
||||
#[cfg(feature = "lz4-compression")]
|
||||
#[test]
|
||||
fn test_merge_with_changed_compressor() -> crate::Result<()> {
|
||||
let mut schema_builder = schema::Schema::builder();
|
||||
|
||||
let text_field = schema_builder.add_text_field("text_field", TEXT | STORED);
|
||||
let schema = schema_builder.build();
|
||||
let index_builder = Index::builder().schema(schema);
|
||||
|
||||
let mut index = index_builder.create_in_ram().unwrap();
|
||||
index.settings_mut().docstore_compression = Compressor::Lz4;
|
||||
{
|
||||
let mut index_writer = index.writer_for_tests().unwrap();
|
||||
// put enough data to create enough blocks in the doc store to be considered for stacking
|
||||
for _ in 0..200 {
|
||||
index_writer.add_document(doc!(text_field=> LOREM));
|
||||
}
|
||||
assert!(index_writer.commit().is_ok());
|
||||
for _ in 0..200 {
|
||||
index_writer.add_document(doc!(text_field=> LOREM));
|
||||
}
|
||||
assert!(index_writer.commit().is_ok());
|
||||
}
|
||||
assert_eq!(
|
||||
index.reader().unwrap().searcher().segment_readers()[0]
|
||||
.get_store_reader()
|
||||
.unwrap()
|
||||
.compressor(),
|
||||
Compressor::Lz4
|
||||
);
|
||||
// Change compressor, this disables stacking on merging
|
||||
let index_settings = index.settings_mut();
|
||||
index_settings.docstore_compression = Compressor::Snappy;
|
||||
// Merging the segments
|
||||
{
|
||||
let segment_ids = index
|
||||
.searchable_segment_ids()
|
||||
.expect("Searchable segments failed.");
|
||||
let mut index_writer = index.writer_for_tests().unwrap();
|
||||
assert!(block_on(index_writer.merge(&segment_ids)).is_ok());
|
||||
assert!(index_writer.wait_merging_threads().is_ok());
|
||||
}
|
||||
|
||||
let searcher = index.reader().unwrap().searcher();
|
||||
assert_eq!(searcher.segment_readers().len(), 1);
|
||||
let reader = searcher.segment_readers().iter().last().unwrap();
|
||||
let store = reader.get_store_reader().unwrap();
|
||||
|
||||
for doc in store.iter(reader.delete_bitset()).take(50) {
|
||||
assert_eq!(
|
||||
*doc?.get_first(text_field).unwrap().text().unwrap(),
|
||||
LOREM.to_string()
|
||||
);
|
||||
}
|
||||
assert_eq!(store.compressor(), Compressor::Snappy);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_merge_of_small_segments() -> crate::Result<()> {
|
||||
let mut schema_builder = schema::Schema::builder();
|
||||
|
||||
let text_field = schema_builder.add_text_field("text_field", TEXT | STORED);
|
||||
let schema = schema_builder.build();
|
||||
let index_builder = Index::builder().schema(schema);
|
||||
|
||||
let index = index_builder.create_in_ram().unwrap();
|
||||
|
||||
{
|
||||
let mut index_writer = index.writer_for_tests().unwrap();
|
||||
|
||||
index_writer.add_document(doc!(text_field=> "1"));
|
||||
assert!(index_writer.commit().is_ok());
|
||||
index_writer.add_document(doc!(text_field=> "2"));
|
||||
assert!(index_writer.commit().is_ok());
|
||||
index_writer.add_document(doc!(text_field=> "3"));
|
||||
assert!(index_writer.commit().is_ok());
|
||||
index_writer.add_document(doc!(text_field=> "4"));
|
||||
assert!(index_writer.commit().is_ok());
|
||||
index_writer.add_document(doc!(text_field=> "5"));
|
||||
assert!(index_writer.commit().is_ok());
|
||||
}
|
||||
// Merging the segments
|
||||
{
|
||||
let segment_ids = index
|
||||
.searchable_segment_ids()
|
||||
.expect("Searchable segments failed.");
|
||||
let mut index_writer = index.writer_for_tests().unwrap();
|
||||
assert!(block_on(index_writer.merge(&segment_ids)).is_ok());
|
||||
assert!(index_writer.wait_merging_threads().is_ok());
|
||||
}
|
||||
|
||||
let searcher = index.reader().unwrap().searcher();
|
||||
assert_eq!(searcher.segment_readers().len(), 1);
|
||||
let reader = searcher.segment_readers().iter().last().unwrap();
|
||||
let store = reader.get_store_reader().unwrap();
|
||||
assert_eq!(store.block_checkpoints().count(), 1);
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(all(test, feature = "unstable"))]
|
||||
@@ -223,6 +348,7 @@ mod bench {
|
||||
use super::tests::write_lorem_ipsum_store;
|
||||
use crate::directory::Directory;
|
||||
use crate::directory::RamDirectory;
|
||||
use crate::store::Compressor;
|
||||
use crate::store::StoreReader;
|
||||
use std::path::Path;
|
||||
use test::Bencher;
|
||||
@@ -233,7 +359,11 @@ mod bench {
|
||||
let directory = RamDirectory::create();
|
||||
let path = Path::new("store");
|
||||
b.iter(|| {
|
||||
write_lorem_ipsum_store(directory.open_write(path).unwrap(), 1_000);
|
||||
write_lorem_ipsum_store(
|
||||
directory.open_write(path).unwrap(),
|
||||
1_000,
|
||||
Compressor::default(),
|
||||
);
|
||||
directory.delete(path).unwrap();
|
||||
});
|
||||
}
|
||||
@@ -242,11 +372,13 @@ mod bench {
|
||||
fn bench_store_decode(b: &mut Bencher) {
|
||||
let directory = RamDirectory::create();
|
||||
let path = Path::new("store");
|
||||
write_lorem_ipsum_store(directory.open_write(path).unwrap(), 1_000);
|
||||
write_lorem_ipsum_store(
|
||||
directory.open_write(path).unwrap(),
|
||||
1_000,
|
||||
Compressor::default(),
|
||||
);
|
||||
let store_file = directory.open_read(path).unwrap();
|
||||
let store = StoreReader::open(store_file).unwrap();
|
||||
b.iter(|| {
|
||||
store.get(12).unwrap();
|
||||
});
|
||||
b.iter(|| store.iter(None).collect::<Vec<_>>());
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,26 +1,29 @@
|
||||
use super::decompress;
|
||||
use super::index::SkipIndex;
|
||||
use crate::common::{BinarySerializable, HasLen};
|
||||
use super::Compressor;
|
||||
use super::{footer::DocStoreFooter, index::SkipIndex};
|
||||
use crate::directory::{FileSlice, OwnedBytes};
|
||||
use crate::schema::Document;
|
||||
use crate::space_usage::StoreSpaceUsage;
|
||||
use crate::store::index::Checkpoint;
|
||||
use crate::DocId;
|
||||
use crate::{common::VInt, fastfield::DeleteBitSet};
|
||||
use crate::{
|
||||
common::{BinarySerializable, HasLen, VInt},
|
||||
error::DataCorruption,
|
||||
fastfield::DeleteBitSet,
|
||||
};
|
||||
use lru::LruCache;
|
||||
use std::io;
|
||||
use std::mem::size_of;
|
||||
use std::sync::atomic::{AtomicUsize, Ordering};
|
||||
use std::sync::{Arc, Mutex};
|
||||
|
||||
const LRU_CACHE_CAPACITY: usize = 100;
|
||||
|
||||
type Block = Arc<Vec<u8>>;
|
||||
type Block = OwnedBytes;
|
||||
|
||||
type BlockCache = Arc<Mutex<LruCache<usize, Block>>>;
|
||||
|
||||
/// Reads documents off tantivy's [`Store`](./index.html)
|
||||
pub struct StoreReader {
|
||||
compressor: Compressor,
|
||||
data: FileSlice,
|
||||
cache: BlockCache,
|
||||
cache_hits: Arc<AtomicUsize>,
|
||||
@@ -32,11 +35,14 @@ pub struct StoreReader {
|
||||
impl StoreReader {
|
||||
/// Opens a store reader
|
||||
pub fn open(store_file: FileSlice) -> io::Result<StoreReader> {
|
||||
let (data_file, offset_index_file) = split_file(store_file)?;
|
||||
let (footer, data_and_offset) = DocStoreFooter::extract_footer(store_file)?;
|
||||
|
||||
let (data_file, offset_index_file) = data_and_offset.split(footer.offset as usize);
|
||||
let index_data = offset_index_file.read_bytes()?;
|
||||
let space_usage = StoreSpaceUsage::new(data_file.len(), offset_index_file.len());
|
||||
let skip_index = SkipIndex::open(index_data);
|
||||
Ok(StoreReader {
|
||||
compressor: footer.compressor,
|
||||
data: data_file,
|
||||
cache: Arc::new(Mutex::new(LruCache::new(LRU_CACHE_CAPACITY))),
|
||||
cache_hits: Default::default(),
|
||||
@@ -50,6 +56,10 @@ impl StoreReader {
|
||||
self.skip_index.checkpoints()
|
||||
}
|
||||
|
||||
/// Returns the compressor held by this reader.
///
/// `open` initialises the field from `footer.compressor`, i.e. the value
/// extracted from the footer of the store file.
pub(crate) fn compressor(&self) -> Compressor {
|
||||
self.compressor
|
||||
}
|
||||
|
||||
/// Looks up `doc_id` in the skip index and returns the result of
/// `SkipIndex::seek` unchanged.
///
/// A `None` result is turned into an `InvalidArgument` error
/// ("Failed to lookup Doc #…") by `get_document_bytes`.
fn block_checkpoint(&self, doc_id: DocId) -> Option<Checkpoint> {
|
||||
self.skip_index.seek(doc_id)
|
||||
}
|
||||
@@ -72,9 +82,10 @@ impl StoreReader {
|
||||
|
||||
let compressed_block = self.compressed_block(checkpoint)?;
|
||||
let mut decompressed_block = vec![];
|
||||
decompress(compressed_block.as_slice(), &mut decompressed_block)?;
|
||||
self.compressor
|
||||
.decompress(compressed_block.as_slice(), &mut decompressed_block)?;
|
||||
|
||||
let block = Arc::new(decompressed_block);
|
||||
let block = OwnedBytes::new(decompressed_block);
|
||||
self.cache
|
||||
.lock()
|
||||
.unwrap()
|
||||
@@ -93,9 +104,8 @@ impl StoreReader {
|
||||
/// It should not be called to score documents
|
||||
/// for instance.
|
||||
pub fn get(&self, doc_id: DocId) -> crate::Result<Document> {
|
||||
let raw_doc = self.get_raw(doc_id)?;
|
||||
let mut cursor = raw_doc.get_bytes();
|
||||
Ok(Document::deserialize(&mut cursor)?)
|
||||
let mut doc_bytes = self.get_document_bytes(doc_id)?;
|
||||
Ok(Document::deserialize(&mut doc_bytes)?)
|
||||
}
|
||||
|
||||
/// Reads raw bytes of a given document. Returns `RawDocument`, which contains the block of a document and its start and end
|
||||
@@ -106,7 +116,7 @@ impl StoreReader {
|
||||
/// so accessing docs from the same compressed block should be faster.
|
||||
/// For that reason a store reader should be kept and reused.
|
||||
///
|
||||
pub fn get_raw(&self, doc_id: DocId) -> crate::Result<RawDocument> {
|
||||
pub fn get_document_bytes(&self, doc_id: DocId) -> crate::Result<OwnedBytes> {
|
||||
let checkpoint = self.block_checkpoint(doc_id).ok_or_else(|| {
|
||||
crate::TantivyError::InvalidArgument(format!("Failed to lookup Doc #{}.", doc_id))
|
||||
})?;
|
||||
@@ -121,11 +131,7 @@ impl StoreReader {
|
||||
let doc_length = VInt::deserialize(&mut cursor)?.val() as usize;
|
||||
let start_pos = cursor_len_before - cursor.len();
|
||||
let end_pos = cursor_len_before - cursor.len() + doc_length;
|
||||
Ok(RawDocument {
|
||||
block,
|
||||
start_pos,
|
||||
end_pos,
|
||||
})
|
||||
Ok(block.slice(start_pos..end_pos))
|
||||
}
|
||||
|
||||
/// Iterator over all Documents in their order as they are stored in the doc store.
|
||||
@@ -135,10 +141,9 @@ impl StoreReader {
|
||||
&'b self,
|
||||
delete_bitset: Option<&'a DeleteBitSet>,
|
||||
) -> impl Iterator<Item = crate::Result<Document>> + 'b {
|
||||
self.iter_raw(delete_bitset).map(|raw_doc| {
|
||||
let raw_doc = raw_doc?;
|
||||
let mut cursor = raw_doc.get_bytes();
|
||||
Ok(Document::deserialize(&mut cursor)?)
|
||||
self.iter_raw(delete_bitset).map(|doc_bytes_res| {
|
||||
let mut doc_bytes = doc_bytes_res?;
|
||||
Ok(Document::deserialize(&mut doc_bytes)?)
|
||||
})
|
||||
}
|
||||
|
||||
@@ -148,7 +153,7 @@ impl StoreReader {
|
||||
pub(crate) fn iter_raw<'a: 'b, 'b>(
|
||||
&'b self,
|
||||
delete_bitset: Option<&'a DeleteBitSet>,
|
||||
) -> impl Iterator<Item = crate::Result<RawDocument>> + 'b {
|
||||
) -> impl Iterator<Item = crate::Result<OwnedBytes>> + 'b {
|
||||
let last_docid = self
|
||||
.block_checkpoints()
|
||||
.last()
|
||||
@@ -158,48 +163,55 @@ impl StoreReader {
|
||||
let mut curr_checkpoint = checkpoint_block_iter.next();
|
||||
let mut curr_block = curr_checkpoint
|
||||
.as_ref()
|
||||
.map(|checkpoint| self.read_block(&checkpoint));
|
||||
.map(|checkpoint| self.read_block(&checkpoint).map_err(|e| e.kind())); // map error in order to enable cloning
|
||||
let mut block_start_pos = 0;
|
||||
let mut num_skipped = 0;
|
||||
let mut reset_block_pos = false;
|
||||
(0..last_docid)
|
||||
.filter_map(move |doc_id| {
|
||||
// filter_map is only used to resolve lifetime issues between the two closures on
|
||||
// the outer variables
|
||||
let alive = delete_bitset.map_or(true, |bitset| bitset.is_alive(doc_id));
|
||||
if !alive {
|
||||
// we keep the number of skipped documents to move forward in the map block
|
||||
num_skipped += 1;
|
||||
}
|
||||
|
||||
// check move to next checkpoint
|
||||
let mut reset_block_pos = false;
|
||||
if doc_id >= curr_checkpoint.as_ref().unwrap().doc_range.end {
|
||||
curr_checkpoint = checkpoint_block_iter.next();
|
||||
curr_block = curr_checkpoint
|
||||
.as_ref()
|
||||
.map(|checkpoint| self.read_block(&checkpoint));
|
||||
.map(|checkpoint| self.read_block(&checkpoint).map_err(|e| e.kind()));
|
||||
reset_block_pos = true;
|
||||
num_skipped = 0;
|
||||
}
|
||||
|
||||
let alive = delete_bitset.map_or(true, |bitset| bitset.is_alive(doc_id));
|
||||
if alive {
|
||||
let ret = Some((
|
||||
curr_block.as_ref().unwrap().as_ref().unwrap().clone(), // todo forward errors
|
||||
num_skipped,
|
||||
reset_block_pos,
|
||||
));
|
||||
let ret = Some((curr_block.clone(), num_skipped, reset_block_pos));
|
||||
// the map block will move over the num_skipped, so we reset to 0
|
||||
num_skipped = 0;
|
||||
reset_block_pos = false;
|
||||
ret
|
||||
} else {
|
||||
// we keep the number of skipped documents to move forward in the map block
|
||||
num_skipped += 1;
|
||||
None
|
||||
}
|
||||
})
|
||||
.map(move |(block, num_skipped, reset_block_pos)| {
|
||||
let block = block
|
||||
.ok_or_else(|| {
|
||||
DataCorruption::comment_only(
|
||||
"the current checkpoint in the doc store iterator is none, this should never happen",
|
||||
)
|
||||
})?
|
||||
.map_err(|error_kind| {
|
||||
std::io::Error::new(error_kind, "error when reading block in doc store")
|
||||
})?;
|
||||
// this flag is set, when filter_map moved to the next block
|
||||
if reset_block_pos {
|
||||
block_start_pos = 0;
|
||||
}
|
||||
let mut cursor = &block[block_start_pos..];
|
||||
let mut pos = 0;
|
||||
// move forward 1 doc + num_skipped in block and return length of current doc
|
||||
let doc_length = loop {
|
||||
let doc_length = VInt::deserialize(&mut cursor)?.val() as usize;
|
||||
let num_bytes_read = block[block_start_pos..].len() - cursor.len();
|
||||
@@ -214,13 +226,9 @@ impl StoreReader {
|
||||
}
|
||||
};
|
||||
let end_pos = block_start_pos + doc_length;
|
||||
let raw_doc = RawDocument {
|
||||
block,
|
||||
start_pos: block_start_pos,
|
||||
end_pos,
|
||||
};
|
||||
let doc_bytes = block.slice(block_start_pos..end_pos);
|
||||
block_start_pos = end_pos;
|
||||
Ok(raw_doc)
|
||||
Ok(doc_bytes)
|
||||
})
|
||||
}
|
||||
|
||||
@@ -230,31 +238,6 @@ impl StoreReader {
|
||||
}
|
||||
}
|
||||
|
||||
/// Location of one serialized `Document` inside a decompressed block.
|
||||
pub struct RawDocument {
|
||||
/// the block of data containing multiple documents
|
||||
block: Arc<Vec<u8>>,
|
||||
/// start position of the document in the block
|
||||
start_pos: usize,
|
||||
/// end position of the document in the block (exclusive, see `get_bytes`)
|
||||
end_pos: usize,
|
||||
}
|
||||
|
||||
impl RawDocument {
|
||||
/// Get the bytes of a serialized `Document` in a decompressed block.
|
||||
pub fn get_bytes(&self) -> &[u8] {
|
||||
// Borrowed sub-slice of the shared block; the indexing panics if the
// stored positions do not fit inside the block.
&self.block[self.start_pos..self.end_pos]
|
||||
}
|
||||
}
|
||||
|
||||
/// Splits a store file into two slices using the offset stored in its
/// trailing bytes.
///
/// The last `size_of::<u64>()` bytes of `data` are read and deserialized as a
/// `u64`; the remaining bytes are then split at that offset. `open` binds the
/// two halves as `(data_file, offset_index_file)`.
///
/// # Errors
///
/// Propagates the `io::Error` from reading the trailing bytes or from
/// deserializing the offset.
fn split_file(data: FileSlice) -> io::Result<(FileSlice, FileSlice)> {
|
||||
// Peel the fixed-size trailer off the end of the file.
let (data, footer_len_bytes) = data.split_from_end(size_of::<u64>());
|
||||
let serialized_offset: OwnedBytes = footer_len_bytes.read_bytes()?;
|
||||
let mut serialized_offset_buf = serialized_offset.as_slice();
|
||||
// NOTE(review): `as usize` would truncate on targets where usize is
// narrower than 64 bits.
let offset = u64::deserialize(&mut serialized_offset_buf)? as usize;
|
||||
Ok(data.split(offset))
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
@@ -272,7 +255,7 @@ mod tests {
|
||||
let directory = RamDirectory::create();
|
||||
let path = Path::new("store");
|
||||
let writer = directory.open_write(path)?;
|
||||
let schema = write_lorem_ipsum_store(writer, 500);
|
||||
let schema = write_lorem_ipsum_store(writer, 500, Compressor::default());
|
||||
let title = schema.get_field("title").unwrap();
|
||||
let store_file = directory.open_read(path)?;
|
||||
let store = StoreReader::open(store_file)?;
|
||||
@@ -327,7 +310,7 @@ mod tests {
|
||||
.unwrap()
|
||||
.peek_lru()
|
||||
.map(|(&k, _)| k as usize),
|
||||
Some(9249)
|
||||
Some(9210)
|
||||
);
|
||||
|
||||
Ok(())
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
use super::compress;
|
||||
use super::index::SkipIndexBuilder;
|
||||
use super::StoreReader;
|
||||
use super::{compressors::Compressor, footer::DocStoreFooter};
|
||||
use crate::common::CountingWriter;
|
||||
use crate::common::{BinarySerializable, VInt};
|
||||
use crate::directory::TerminatingWrite;
|
||||
@@ -21,6 +21,7 @@ const BLOCK_SIZE: usize = 16_384;
|
||||
/// The skip list index on the other hand, is built in memory.
|
||||
///
|
||||
pub struct StoreWriter {
|
||||
compressor: Compressor,
|
||||
doc: DocId,
|
||||
first_doc_in_block: DocId,
|
||||
offset_index_writer: SkipIndexBuilder,
|
||||
@@ -34,8 +35,9 @@ impl StoreWriter {
|
||||
///
|
||||
/// The store writer writes blocks to disk as
|
||||
/// documents are added.
|
||||
pub fn new(writer: WritePtr) -> StoreWriter {
|
||||
pub fn new(writer: WritePtr, compressor: Compressor) -> StoreWriter {
|
||||
StoreWriter {
|
||||
compressor,
|
||||
doc: 0,
|
||||
first_doc_in_block: 0,
|
||||
offset_index_writer: SkipIndexBuilder::new(),
|
||||
@@ -45,6 +47,10 @@ impl StoreWriter {
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns the compressor this writer was created with.
///
/// `StoreWriter::new` stores its `compressor` argument in this field.
pub(crate) fn compressor(&self) -> Compressor {
|
||||
self.compressor
|
||||
}
|
||||
|
||||
/// The memory used (including children)
|
||||
pub fn mem_usage(&self) -> usize {
|
||||
self.intermediary_buffer.capacity() + self.current_block.capacity()
|
||||
@@ -125,7 +131,8 @@ impl StoreWriter {
|
||||
fn write_and_compress_block(&mut self) -> io::Result<()> {
|
||||
assert!(self.doc > 0);
|
||||
self.intermediary_buffer.clear();
|
||||
compress(&self.current_block[..], &mut self.intermediary_buffer)?;
|
||||
self.compressor
|
||||
.compress(&self.current_block[..], &mut self.intermediary_buffer)?;
|
||||
let start_offset = self.writer.written_bytes() as usize;
|
||||
self.writer.write_all(&self.intermediary_buffer)?;
|
||||
let end_offset = self.writer.written_bytes() as usize;
|
||||
@@ -147,8 +154,9 @@ impl StoreWriter {
|
||||
self.write_and_compress_block()?;
|
||||
}
|
||||
let header_offset: u64 = self.writer.written_bytes() as u64;
|
||||
let footer = DocStoreFooter::new(header_offset, self.compressor);
|
||||
self.offset_index_writer.write(&mut self.writer)?;
|
||||
header_offset.serialize(&mut self.writer)?;
|
||||
footer.serialize(&mut self.writer)?;
|
||||
self.writer.terminate()
|
||||
}
|
||||
}
|
||||
|
||||
@@ -78,8 +78,8 @@ pub struct TermStreamer<'a, A = AlwaysMatch>
|
||||
where
|
||||
A: Automaton,
|
||||
{
|
||||
fst_map: &'a TermDictionary,
|
||||
stream: Stream<'a, A>,
|
||||
pub(crate) fst_map: &'a TermDictionary,
|
||||
pub(crate) stream: Stream<'a, A>,
|
||||
term_ord: TermOrdinal,
|
||||
current_key: Vec<u8>,
|
||||
current_value: TermInfo,
|
||||
|
||||
@@ -1,32 +1,11 @@
|
||||
use crate::postings::TermInfo;
|
||||
use crate::termdict::TermDictionary;
|
||||
use crate::termdict::TermOrdinal;
|
||||
use crate::termdict::TermStreamer;
|
||||
use std::cmp::Ordering;
|
||||
use std::collections::BinaryHeap;
|
||||
|
||||
/// One input stream of the merge together with its position in the input list.
pub struct HeapItem<'a> {
|
||||
/// Term stream being merged.
pub streamer: TermStreamer<'a>,
|
||||
/// Index of the stream in the `streams` vector, assigned with
/// `enumerate` in `TermMerger::new`.
pub segment_ord: usize,
|
||||
}
|
||||
|
||||
// NOTE(review): equality looks only at `segment_ord`, while `cmp` below also
// compares the current key, so two items can be `==` without comparing as
// `Ordering::Equal`. Confirm this is intended.
impl<'a> PartialEq for HeapItem<'a> {
|
||||
fn eq(&self, other: &Self) -> bool {
|
||||
self.segment_ord == other.segment_ord
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> Eq for HeapItem<'a> {}
|
||||
|
||||
impl<'a> PartialOrd for HeapItem<'a> {
|
||||
fn partial_cmp(&self, other: &HeapItem<'a>) -> Option<Ordering> {
|
||||
// Delegates to the total order defined by `Ord`.
Some(self.cmp(other))
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> Ord for HeapItem<'a> {
|
||||
fn cmp(&self, other: &HeapItem<'a>) -> Ordering {
|
||||
// Operands are swapped (`other` vs `self`), so the order is reversed:
// the item with the smallest (key, segment_ord) compares as greatest.
// `TermMerger` keeps these items in a `BinaryHeap`, a max-heap, so the
// smallest key is popped first.
(&other.streamer.key(), &other.segment_ord).cmp(&(&self.streamer.key(), &self.segment_ord))
|
||||
}
|
||||
}
|
||||
use tantivy_fst::map::OpBuilder;
|
||||
use tantivy_fst::map::Union;
|
||||
use tantivy_fst::raw::IndexedValue;
|
||||
use tantivy_fst::Streamer;
|
||||
|
||||
/// Given a list of sorted term streams,
|
||||
/// returns an iterator over sorted unique terms.
|
||||
@@ -34,61 +13,50 @@ impl<'a> Ord for HeapItem<'a> {
|
||||
/// Each item yielded is actually a pair with
|
||||
/// - the term
|
||||
/// - a slice with the ordinal of the segments containing
|
||||
/// the terms.
|
||||
/// the term.
|
||||
pub struct TermMerger<'a> {
|
||||
heap: BinaryHeap<HeapItem<'a>>,
|
||||
current_streamers: Vec<HeapItem<'a>>,
|
||||
dictionaries: Vec<&'a TermDictionary>,
|
||||
union: Union<'a>,
|
||||
current_key: Vec<u8>,
|
||||
current_segment_and_term_ordinals: Vec<IndexedValue>,
|
||||
}
|
||||
|
||||
impl<'a> TermMerger<'a> {
|
||||
/// Stream of merged term dictionary
|
||||
///
|
||||
pub fn new(streams: Vec<TermStreamer<'a>>) -> TermMerger<'a> {
|
||||
let mut op_builder = OpBuilder::new();
|
||||
let mut dictionaries = vec![];
|
||||
for streamer in streams {
|
||||
op_builder.push(streamer.stream);
|
||||
dictionaries.push(streamer.fst_map);
|
||||
}
|
||||
TermMerger {
|
||||
heap: BinaryHeap::new(),
|
||||
current_streamers: streams
|
||||
.into_iter()
|
||||
.enumerate()
|
||||
.map(|(ord, streamer)| HeapItem {
|
||||
streamer,
|
||||
segment_ord: ord,
|
||||
})
|
||||
.collect(),
|
||||
dictionaries,
|
||||
union: op_builder.union(),
|
||||
current_key: vec![],
|
||||
current_segment_and_term_ordinals: vec![],
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn matching_segments<'b: 'a>(
|
||||
&'b self,
|
||||
) -> impl 'b + Iterator<Item = (usize, TermOrdinal)> {
|
||||
self.current_streamers
|
||||
pub fn matching_segments<'b: 'a>(&'b self) -> impl 'b + Iterator<Item = (usize, TermOrdinal)> {
|
||||
self.current_segment_and_term_ordinals
|
||||
.iter()
|
||||
.map(|heap_item| (heap_item.segment_ord, heap_item.streamer.term_ord()))
|
||||
}
|
||||
|
||||
fn advance_segments(&mut self) {
|
||||
let streamers = &mut self.current_streamers;
|
||||
let heap = &mut self.heap;
|
||||
for mut heap_item in streamers.drain(..) {
|
||||
if heap_item.streamer.advance() {
|
||||
heap.push(heap_item);
|
||||
}
|
||||
}
|
||||
.map(|iv| (iv.index, iv.value))
|
||||
}
|
||||
|
||||
/// Advance the term iterator to the next term.
|
||||
/// Returns true if there is indeed another term
|
||||
/// False if there is none.
|
||||
pub fn advance(&mut self) -> bool {
|
||||
self.advance_segments();
|
||||
if let Some(head) = self.heap.pop() {
|
||||
self.current_streamers.push(head);
|
||||
while let Some(next_streamer) = self.heap.peek() {
|
||||
if self.current_streamers[0].streamer.key() != next_streamer.streamer.key() {
|
||||
break;
|
||||
}
|
||||
let next_heap_it = self.heap.pop().unwrap(); // safe : we peeked beforehand
|
||||
self.current_streamers.push(next_heap_it);
|
||||
}
|
||||
if let Some((k, values)) = self.union.next() {
|
||||
self.current_key.clear();
|
||||
self.current_key.extend_from_slice(k);
|
||||
self.current_segment_and_term_ordinals.clear();
|
||||
self.current_segment_and_term_ordinals
|
||||
.extend_from_slice(values);
|
||||
self.current_segment_and_term_ordinals
|
||||
.sort_by_key(|iv| iv.index);
|
||||
true
|
||||
} else {
|
||||
false
|
||||
@@ -101,16 +69,85 @@ impl<'a> TermMerger<'a> {
|
||||
/// iff advance() has been called before
|
||||
/// and "true" was returned.
|
||||
pub fn key(&self) -> &[u8] {
|
||||
self.current_streamers[0].streamer.key()
|
||||
&self.current_key
|
||||
}
|
||||
|
||||
/// Returns the sorted list of segment ordinals
|
||||
/// that include the current term.
|
||||
/// Iterator over `(segment ordinal, TermInfo)` pairs, sorted by segment ordinal.
|
||||
///
|
||||
/// This method may be called
|
||||
/// iff advance() has been called before
|
||||
/// and "true" was returned.
|
||||
pub fn current_kvs(&self) -> &[HeapItem<'a>] {
|
||||
&self.current_streamers[..]
|
||||
pub fn current_segment_ordinals_and_term_infos<'b: 'a>(
|
||||
&'b self,
|
||||
) -> impl 'b + Iterator<Item = (usize, TermInfo)> {
|
||||
// Walks the entries recorded by the last `advance()` call, which sorts
// them by `index`.
self.current_segment_and_term_ordinals
|
||||
.iter()
|
||||
.map(move |iv| {
|
||||
(
|
||||
iv.index,
|
||||
// `iv.index` picks the dictionary and `iv.value` is the term
// ordinal resolved in it. `new` pushes streams and dictionaries
// in the same order; presumably `index` follows that order —
// TODO confirm against tantivy_fst.
self.dictionaries[iv.index].term_info_from_ord(iv.value),
|
||||
)
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
// Benchmark for `TermMerger`. Per the `cfg` attribute below, this module is
// only compiled for test builds with the `unstable` feature enabled.
#[cfg(all(test, feature = "unstable"))]
|
||||
mod bench {
|
||||
use super::TermMerger;
|
||||
use crate::directory::FileSlice;
|
||||
use crate::postings::TermInfo;
|
||||
use crate::termdict::{TermDictionary, TermDictionaryBuilder};
|
||||
use rand::distributions::Alphanumeric;
|
||||
use rand::{thread_rng, Rng};
|
||||
use test::{self, Bencher};
|
||||
|
||||
/// Builds a synthetic `TermInfo` for the term with ordinal `term_ord`.
///
/// `doc_freq` is the ordinal itself; both ranges span
/// `offset(term_ord)..offset(term_ord + 1)` with `offset(n) = n * 100 + n * n`.
fn make_term_info(term_ord: u64) -> TermInfo {
|
||||
// Strictly increasing in `term_ord`, so each range ends where the range of
// the next ordinal starts.
let offset = |term_ord: u64| (term_ord * 100 + term_ord * term_ord) as usize;
|
||||
TermInfo {
|
||||
doc_freq: term_ord as u32,
|
||||
postings_range: offset(term_ord)..offset(term_ord + 1),
|
||||
positions_range: offset(term_ord)..offset(term_ord + 1),
|
||||
}
|
||||
}
|
||||
|
||||
/// Create a dictionary of random strings.
///
/// Generates `num_terms` random alphanumeric strings of 30 to 41 characters,
/// sorts them, inserts them in that order with `make_term_info(i)` as value,
/// and opens the serialized result as a `TermDictionary`.
|
||||
fn rand_dict(num_terms: usize) -> crate::Result<TermDictionary> {
|
||||
let buffer: Vec<u8> = {
|
||||
let mut terms = vec![];
|
||||
for _i in 0..num_terms {
|
||||
let rand_string: String = thread_rng()
|
||||
.sample_iter(&Alphanumeric)
|
||||
// `30..42` is half-open: lengths 30 through 41.
.take(thread_rng().gen_range(30..42))
|
||||
.map(char::from)
|
||||
.collect();
|
||||
terms.push(rand_string);
|
||||
}
|
||||
// Keys are inserted in sorted order; presumably the builder requires
// that — TODO confirm. Duplicates are not removed.
terms.sort();
|
||||
|
||||
let mut term_dictionary_builder = TermDictionaryBuilder::create(Vec::new())?;
|
||||
for i in 0..num_terms {
|
||||
term_dictionary_builder.insert(terms[i].as_bytes(), &make_term_info(i as u64))?;
|
||||
}
|
||||
term_dictionary_builder.finish()?
|
||||
};
|
||||
let file = FileSlice::from(buffer);
|
||||
TermDictionary::open(file)
|
||||
}
|
||||
|
||||
/// Merges the term streams of two random 100_000-term dictionaries and counts
/// how many times `advance()` returns `true`.
#[bench]
|
||||
fn bench_termmerger(b: &mut Bencher) -> crate::Result<()> {
|
||||
// The dictionaries are built once, outside the `b.iter` closure.
let dict1 = rand_dict(100_000)?;
|
||||
let dict2 = rand_dict(100_000)?;
|
||||
b.iter(|| -> crate::Result<u32> {
|
||||
let stream1 = dict1.stream()?;
|
||||
let stream2 = dict2.stream()?;
|
||||
let mut merger = TermMerger::new(vec![stream1, stream2]);
|
||||
let mut count = 0;
|
||||
while merger.advance() {
|
||||
count += 1;
|
||||
}
|
||||
// Returning the count gives the loop an observable result.
Ok(count)
|
||||
});
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user