Mirror of https://github.com/quickwit-oss/tantivy.git (synced 2026-01-04 16:22:55 +00:00)

Compare commits: owned-byte...0.15.1 (27 commits)
Commit SHA1s, newest first:

5209238c1b
7ef25ec400
221e7cbb55
873ac1a3ac
ebe55a7ae1
9f32d40b27
8ae10a930a
473a346814
3a8a0fe79a
511dc8f87f
3901295329
f5918c6c74
abe6b4baec
6e4b61154f
2aad0ced77
41ea14840d
dff0ffd38a
8d32c3ba3a
4afba005f9
85fb0cc20a
5ef2d56ec2
fd8e5bdf57
4f8481a1e4
bcd72e5c14
249bc6cf72
1c0af5765d
7ba771ed1b
.github/workflows/test.yml (vendored, new file, 24 lines)

@@ -0,0 +1,24 @@
+name: Rust
+
+on:
+  push:
+    branches: [ main ]
+  pull_request:
+    branches: [ main ]
+
+env:
+  CARGO_TERM_COLOR: always
+
+jobs:
+  build:
+
+    runs-on: ubuntu-latest
+
+    steps:
+    - uses: actions/checkout@v2
+    - name: Build
+      run: cargo build --verbose --workspace
+    - name: Run tests
+      run: cargo test --verbose --workspace
+    - name: Check Formatting
+      run: cargo fmt --all -- --check
CHANGELOG.md (16 lines changed)

@@ -1,3 +1,7 @@
+Tantivy 0.15.1
+=========================
+- Major bugfix. DocStore panics when first block is deleted. (@appaquet) #1077
+
 Tantivy 0.15.0
 =========================
 - API Changes. Using Range instead of (start, end) in the API and internals (`FileSlice`, `OwnedBytes`, `Snippets`, ...)
@@ -8,11 +12,19 @@ Tantivy 0.15.0
 - Bugfix consistent tie break handling in facet's topk (@hardikpnsp) #357
 - Date field support for range queries (@rihardsk) #516
 - Added lz4-flex as the default compression scheme in tantivy (@PSeitz) #1009
-- Renamed a lot of symbols to avoid all uppercasing on acronyms, as per new clippy recommendation. For instance, RAMDirectory -> RamDirectory. (@pmasurel)
+- Renamed a lot of symbols to avoid all uppercasing on acronyms, as per new clippy recommendation. For instance, RAMDirectory -> RamDirectory. (@fulmicoton)
 - Simplified positions index format (@fulmicoton) #1022
 - Moved bitpacking to bitpacker subcrate and add BlockedBitpacker, which bitpacks blocks of 128 elements (@PSeitz) #1030
 - Added support for more-like-this query in tantivy (@evanxg852000) #1011
-- Added support for sorting an index, e.g. presorting documents in an index by a timestamp field. This can heavily improve performance for certain scenarios, by utilizing the sorted data (Top-n optimizations). #1026
+- Added support for sorting an index, e.g. presorting documents in an index by a timestamp field. This can heavily improve performance for certain scenarios, by utilizing the sorted data (Top-n optimizations) (@PSeitz). #1026
+- Add iterator over documents in doc store (@PSeitz). #1044
+- Fix log merge policy (@PSeitz). #1043
+- Add detection to avoid small doc store blocks on merge (@PSeitz). #1054
+- Make doc store compression dynamic (@PSeitz). #1060
+- Switch to json for footer version handling (@PSeitz). #1060
+- Updated TermMerger implementation to rely on the union feature of the FST (@scampi) #469
+- Add boolean marking whether position is required in the query_terms API call (@fulmicoton). #1070


 Tantivy 0.14.0
 =========================
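The sorted-index feature (#1026) and its settings surface run through the rest of this diff; a minimal sketch of creating a presorted index, assuming the root re-exports used by the `Index` doc comment further below:

    use tantivy::schema::{Schema, FAST, INDEXED};
    use tantivy::{Index, IndexSettings, IndexSortByField, Order};

    let mut schema_builder = Schema::builder();
    schema_builder.add_u64_field("timestamp", FAST | INDEXED);
    let schema = schema_builder.build();
    // Presort every segment by the "timestamp" fast field (enables Top-n optimizations).
    let settings = IndexSettings {
        sort_by_field: Some(IndexSortByField {
            field: "timestamp".to_string(),
            order: Order::Asc,
        }),
        ..Default::default()
    };
    let index = Index::builder().schema(schema).settings(settings).create_in_ram();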
Cargo.toml (14 lines changed)

@@ -1,6 +1,6 @@
 [package]
 name = "tantivy"
-version = "0.14.0"
+version = "0.15.1"
 authors = ["Paul Masurel <paul.masurel@gmail.com>"]
 license = "MIT"
 categories = ["database-implementations", "data-structures"]
@@ -20,8 +20,7 @@ once_cell = "1.7.2"
 regex ={ version = "1.5.4", default-features = false, features = ["std"] }
 tantivy-fst = "0.3"
 memmap = {version = "0.7", optional=true}
-lz4_flex = { version = "0.7.5", default-features = false, features = ["checked-decode"], optional = true }
-lz4 = { version = "1.23.2", optional = true }
+lz4_flex = { version = "0.8.0", default-features = false, features = ["checked-decode"], optional = true }
 brotli = { version = "3.3", optional = true }
 snap = { version = "1.0.5", optional = true }
 tempfile = { version = "3.2", optional = true }
@@ -34,7 +33,7 @@ levenshtein_automata = "0.2"
 uuid = { version = "0.8.2", features = ["v4", "serde"] }
 crossbeam = "0.8"
 futures = { version = "0.3.15", features = ["thread-pool"] }
-tantivy-query-grammar = { version="0.14.0", path="./query-grammar" }
+tantivy-query-grammar = { version="0.15.0", path="./query-grammar" }
 tantivy-bitpacker = { version="0.1", path="./bitpacker" }
 stable_deref_trait = "1.2"
 rust-stemmers = "1.2"
@@ -77,12 +76,13 @@ debug-assertions = true
 overflow-checks = true

 [features]
-default = ["mmap", "lz4-block-compression" ]
+default = ["mmap", "lz4-compression" ]
 mmap = ["fs2", "tempfile", "memmap"]

 brotli-compression = ["brotli"]
-lz4-compression = ["lz4"]
-lz4-block-compression = ["lz4_flex"]
+lz4-compression = ["lz4_flex"]
 snappy-compression = ["snap"]

 failpoints = ["fail/failpoints"]
 unstable = [] # useful for benches.
 wasm-bindgen = ["uuid/wasm-bindgen"]
@@ -18,5 +18,6 @@ install:
 build: false

 test_script:
-  - REM SET RUST_LOG=tantivy,test & cargo test --all --verbose --no-default-features --features lz4-block-compression --features mmap
+  - REM SET RUST_LOG=tantivy,test & cargo test --all --verbose --no-default-features --features lz4-compression --features mmap
+  - REM SET RUST_LOG=tantivy,test & cargo test test_store --verbose --no-default-features --features lz4-compression --features snappy-compression --features brotli-compression --features mmap
   - REM SET RUST_BACKTRACE=1 & cargo build --examples
@@ -2,6 +2,13 @@
 name = "tantivy-bitpacker"
 version = "0.1.0"
+edition = "2018"
 authors = ["Paul Masurel <paul.masurel@gmail.com>"]
+license = "MIT"
+categories = []
+description = """Tantivy-sub crate: bitpacking"""
+repository = "https://github.com/tantivy-search/tantivy"
+keywords = []
+

 # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
@@ -17,6 +17,7 @@ impl BitPacker {
         }
     }

+    #[inline]
     pub fn write<TWrite: io::Write>(
         &mut self,
         val: u64,
@@ -79,6 +80,7 @@ impl BitUnpacker {
         }
     }

+    #[inline]
     pub fn get(&self, idx: u64, data: &[u8]) -> u64 {
         if self.num_bits == 0 {
             return 0u64;
@@ -80,6 +80,7 @@ impl BlockedBitpacker {
             * std::mem::size_of_val(&self.buffer.get(0).cloned().unwrap_or_default())
     }

+    #[inline]
     pub fn add(&mut self, val: u64) {
         self.buffer.push(val);
         if self.buffer.len() == BLOCK_SIZE as usize {
@@ -122,6 +123,7 @@ impl BlockedBitpacker {
                 .resize(self.compressed_blocks.len() + 8, 0); // add padding for bitpacker
         }
     }
+    #[inline]
     pub fn get(&self, idx: usize) -> u64 {
         let metadata_pos = idx / BLOCK_SIZE as usize;
         let pos_in_block = idx % BLOCK_SIZE as usize;
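These `#[inline]` hints sit on the hot accessors of `BlockedBitpacker`, the 128-value blocked bitpacker from changelog entry #1030. A minimal usage sketch of the subcrate API visible above; the `new()` constructor is an assumption:

    use tantivy_bitpacker::BlockedBitpacker;

    let mut bitpacker = BlockedBitpacker::new(); // assumed constructor
    for val in 0u64..200 {
        bitpacker.add(val); // buffered, then flushed as bitpacked blocks of 128 values
    }
    assert_eq!(bitpacker.get(5), 5); // random access via `get(idx)`
    assert_eq!(bitpacker.get(150), 150);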
@@ -10,7 +10,7 @@
 // ---
 // Importing tantivy...
 use tantivy::collector::{Collector, SegmentCollector};
-use tantivy::fastfield::FastFieldReader;
+use tantivy::fastfield::{DynamicFastFieldReader, FastFieldReader};
 use tantivy::query::QueryParser;
 use tantivy::schema::Field;
 use tantivy::schema::{Schema, FAST, INDEXED, TEXT};
@@ -98,7 +98,7 @@ impl Collector for StatsCollector {
 }

 struct StatsSegmentCollector {
-    fast_field_reader: FastFieldReader<u64>,
+    fast_field_reader: DynamicFastFieldReader<u64>,
     stats: Stats,
 }
@@ -90,7 +90,7 @@ fn main() -> tantivy::Result<()> {

     let frankenstein_isbn = Term::from_field_text(isbn, "978-9176370711");

-    // Oops our frankenstein doc seems mispelled
+    // Oops our frankenstein doc seems misspelled
     let frankenstein_doc_misspelled = extract_doc_given_isbn(&reader, &frankenstein_isbn)?.unwrap();
     assert_eq!(
         schema.to_json(&frankenstein_doc_misspelled),
@@ -92,7 +92,7 @@ fn main() -> tantivy::Result<()> {

     // Check the reference doc for different ways to create a `Facet` object.
     {
-        let facet = Facet::from_text("/Felidae/Pantherinae");
+        let facet = Facet::from("/Felidae/Pantherinae");
         let facet_term = Term::from_facet(classification, &facet);
         let facet_term_query = TermQuery::new(facet_term, IndexRecordOption::Basic);
         let mut facet_collector = FacetCollector::for_field(classification);
@@ -1,6 +1,6 @@
 [package]
 name = "tantivy-query-grammar"
-version = "0.14.0"
+version = "0.15.0"
 authors = ["Paul Masurel <paul.masurel@gmail.com>"]
 license = "MIT"
 categories = ["database-implementations", "data-structures"]
@@ -539,10 +539,10 @@ mod tests {
         let index = Index::create_in_ram(schema);
         let mut index_writer = index.writer_for_tests().unwrap();
         index_writer.add_document(doc!(
-            facet_field => Facet::from_text(&"/subjects/A/a"),
-            facet_field => Facet::from_text(&"/subjects/B/a"),
-            facet_field => Facet::from_text(&"/subjects/A/b"),
-            facet_field => Facet::from_text(&"/subjects/B/b"),
+            facet_field => Facet::from_text(&"/subjects/A/a").unwrap(),
+            facet_field => Facet::from_text(&"/subjects/B/a").unwrap(),
+            facet_field => Facet::from_text(&"/subjects/A/b").unwrap(),
+            facet_field => Facet::from_text(&"/subjects/B/b").unwrap(),
         ));
         index_writer.commit().unwrap();
         let reader = index.reader().unwrap();
@@ -563,16 +563,16 @@ mod tests {
         let index = Index::create_in_ram(schema);
         let mut index_writer = index.writer_for_tests()?;
         index_writer.add_document(doc!(
-            facet_field => Facet::from_text(&"/A/A"),
+            facet_field => Facet::from_text(&"/A/A").unwrap(),
         ));
         index_writer.add_document(doc!(
-            facet_field => Facet::from_text(&"/A/B"),
+            facet_field => Facet::from_text(&"/A/B").unwrap(),
         ));
         index_writer.add_document(doc!(
-            facet_field => Facet::from_text(&"/A/C/A"),
+            facet_field => Facet::from_text(&"/A/C/A").unwrap(),
         ));
         index_writer.add_document(doc!(
-            facet_field => Facet::from_text(&"/D/C/A"),
+            facet_field => Facet::from_text(&"/D/C/A").unwrap(),
         ));
         index_writer.commit()?;
         let reader = index.reader()?;
@@ -580,7 +580,7 @@ mod tests {
         assert_eq!(searcher.num_docs(), 4);

         let count_facet = |facet_str: &str| {
-            let term = Term::from_facet(facet_field, &Facet::from_text(facet_str));
+            let term = Term::from_facet(facet_field, &Facet::from_text(facet_str).unwrap());
             searcher
                 .search(&TermQuery::new(term, IndexRecordOption::Basic), &Count)
                 .unwrap()
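The pattern running through these test hunks is the 0.15 API change: `Facet::from_text` is now fallible. A minimal migration sketch, assuming the `From<&str>` impl used in the facet example earlier:

    use tantivy::schema::Facet;

    // 0.14: Facet::from_text("/a/b") returned a Facet directly.
    // 0.15: it returns a Result, so call sites unwrap or propagate the error.
    let facet = Facet::from_text("/a/b").unwrap();
    // `Facet::from` remains available for infallible construction from literals.
    let same: Facet = Facet::from("/a/b");
    assert_eq!(facet, same);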
@@ -12,7 +12,7 @@
 use std::marker::PhantomData;

 use crate::collector::{Collector, SegmentCollector};
-use crate::fastfield::{FastFieldReader, FastValue};
+use crate::fastfield::{DynamicFastFieldReader, FastFieldReader, FastValue};
 use crate::schema::Field;
 use crate::{Score, SegmentReader, TantivyError};

@@ -155,7 +155,7 @@ where
     TPredicate: 'static,
     TPredicateValue: FastValue,
 {
-    fast_field_reader: FastFieldReader<TPredicateValue>,
+    fast_field_reader: DynamicFastFieldReader<TPredicateValue>,
     segment_collector: TSegmentCollector,
     predicate: TPredicate,
     t_predicate_value: PhantomData<TPredicateValue>,
@@ -1,5 +1,5 @@
 use crate::collector::{Collector, SegmentCollector};
-use crate::fastfield::{FastFieldReader, FastValue};
+use crate::fastfield::{DynamicFastFieldReader, FastFieldReader, FastValue};
 use crate::schema::{Field, Type};
 use crate::{DocId, Score};
 use fastdivide::DividerU64;
@@ -84,7 +84,7 @@ impl HistogramComputer {
 }
 pub struct SegmentHistogramCollector {
     histogram_computer: HistogramComputer,
-    ff_reader: FastFieldReader<u64>,
+    ff_reader: DynamicFastFieldReader<u64>,
 }

 impl SegmentCollector for SegmentHistogramCollector {
@@ -1,6 +1,7 @@
 use super::*;
 use crate::core::SegmentReader;
 use crate::fastfield::BytesFastFieldReader;
+use crate::fastfield::DynamicFastFieldReader;
 use crate::fastfield::FastFieldReader;
 use crate::schema::Field;
 use crate::DocId;
@@ -162,7 +163,7 @@ pub struct FastFieldTestCollector {

 pub struct FastFieldSegmentCollector {
     vals: Vec<u64>,
-    reader: FastFieldReader<u64>,
+    reader: DynamicFastFieldReader<u64>,
 }

 impl FastFieldTestCollector {
@@ -4,7 +4,7 @@ use crate::collector::tweak_score_top_collector::TweakedScoreTopCollector;
 use crate::collector::{
     CustomScorer, CustomSegmentScorer, ScoreSegmentTweaker, ScoreTweaker, SegmentCollector,
 };
-use crate::fastfield::FastFieldReader;
+use crate::fastfield::{DynamicFastFieldReader, FastFieldReader};
 use crate::query::Weight;
 use crate::schema::Field;
 use crate::DocAddress;
@@ -129,7 +129,7 @@ impl fmt::Debug for TopDocs {
 }

 struct ScorerByFastFieldReader {
-    ff_reader: FastFieldReader<u64>,
+    ff_reader: DynamicFastFieldReader<u64>,
 }

 impl CustomSegmentScorer<u64> for ScorerByFastFieldReader {
@@ -151,7 +151,7 @@ impl CustomScorer<u64> for ScorerByField {
         // mapping is monotonic, so it is sufficient to compute our top-K docs.
         //
         // The conversion will then happen only on the top-K docs.
-        let ff_reader: FastFieldReader<u64> = segment_reader
+        let ff_reader = segment_reader
             .fast_fields()
             .typed_fast_field_reader(self.field)?;
         Ok(ScorerByFastFieldReader { ff_reader })
@@ -401,6 +401,7 @@ impl TopDocs {
     /// # use tantivy::query::QueryParser;
     /// use tantivy::SegmentReader;
     /// use tantivy::collector::TopDocs;
+    /// use tantivy::fastfield::FastFieldReader;
     /// use tantivy::schema::Field;
     ///
     /// fn create_schema() -> Schema {
@@ -508,6 +509,7 @@ impl TopDocs {
     /// use tantivy::SegmentReader;
     /// use tantivy::collector::TopDocs;
     /// use tantivy::schema::Field;
+    /// use tantivy::fastfield::FastFieldReader;
     ///
     /// # fn create_schema() -> Schema {
     /// # let mut schema_builder = Schema::builder();
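The two doc-comment hunks only add the `FastFieldReader` trait import, which the doctests now need because `get` moved onto the trait. For plain sorting by a fast field, the collector-side entry point is unchanged; a minimal sketch, assuming the `order_by_u64_field` builder tantivy exposes on `TopDocs`:

    use tantivy::collector::{Collector, TopDocs};
    use tantivy::schema::Field;

    fn top_by_rating(rating_field: Field) -> impl Collector {
        // collect the 10 best documents ordered by the "rating" u64 fast field;
        // internally this goes through ScorerByFastFieldReader from the hunk above
        TopDocs::with_limit(10).order_by_u64_field(rating_field)
    }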
@@ -8,7 +8,7 @@ pub use self::bitset::BitSet;
 pub(crate) use self::bitset::TinySet;
 pub(crate) use self::composite_file::{CompositeFile, CompositeWrite};
 pub use self::counting_writer::CountingWriter;
-pub use self::serialize::{BinarySerializable, FixedSize};
+pub use self::serialize::{BinarySerializable, DeserializeFrom, FixedSize};
 pub use self::vint::{
     read_u32_vint, read_u32_vint_no_advance, serialize_vint_u32, write_u32_vint, VInt,
 };
@@ -14,6 +14,20 @@ pub trait BinarySerializable: fmt::Debug + Sized {
     fn deserialize<R: Read>(reader: &mut R) -> io::Result<Self>;
 }

+pub trait DeserializeFrom<T: BinarySerializable> {
+    fn deserialize(&mut self) -> io::Result<T>;
+}
+
+/// Implement deserialize from &[u8] for all types which implement BinarySerializable.
+///
+/// TryFrom would actually be preferable, but not possible because of the orphan
+/// rules (not completely sure if this could be resolved)
+impl<T: BinarySerializable> DeserializeFrom<T> for &[u8] {
+    fn deserialize(&mut self) -> io::Result<T> {
+        T::deserialize(self)
+    }
+}
+
 /// `FixedSize` marks a `BinarySerializable` as
 /// always serializing to the same size.
 pub trait FixedSize: BinarySerializable {
@@ -61,6 +75,11 @@ impl<Left: BinarySerializable, Right: BinarySerializable> BinarySerializable for
         Ok((Left::deserialize(reader)?, Right::deserialize(reader)?))
     }
 }
+impl<Left: BinarySerializable + FixedSize, Right: BinarySerializable + FixedSize> FixedSize
+    for (Left, Right)
+{
+    const SIZE_IN_BYTES: usize = Left::SIZE_IN_BYTES + Right::SIZE_IN_BYTES;
+}

 impl BinarySerializable for u32 {
     fn serialize<W: Write>(&self, writer: &mut W) -> io::Result<()> {
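The new `DeserializeFrom` trait makes any `BinarySerializable` type readable straight off a byte slice (the footer code later in this diff uses it to pull two `u32`s at once). A round-trip sketch with these crate-internal traits, assuming it runs inside tantivy where `crate::common` is visible:

    use crate::common::{BinarySerializable, DeserializeFrom};
    use std::io;

    fn roundtrip() -> io::Result<()> {
        let mut buffer: Vec<u8> = Vec::new();
        BinarySerializable::serialize(&42u32, &mut buffer)?;
        let mut cursor: &[u8] = &buffer[..];
        // `DeserializeFrom<u32> for &[u8]` advances the slice as it reads
        let value: u32 = cursor.deserialize()?;
        assert_eq!(value, 42);
        Ok(())
    }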
@@ -76,7 +76,7 @@ fn load_metas(
 /// );
 ///
 /// let schema = schema_builder.build();
-/// let settings = IndexSettings{sort_by_field: Some(IndexSortByField{field:"number".to_string(), order:Order::Asc})};
+/// let settings = IndexSettings{sort_by_field: Some(IndexSortByField{field:"number".to_string(), order:Order::Asc}), ..Default::default()};
 /// let index = Index::builder().schema(schema).settings(settings).create_in_ram();
 ///
 /// ```
@@ -173,7 +173,7 @@ impl IndexBuilder {
             &directory,
         )?;
         let mut metas = IndexMeta::with_schema(self.get_expect_schema()?);
-        metas.index_settings = self.index_settings.clone();
+        metas.index_settings = self.index_settings;
         let index = Index::open_from_metas(directory, &metas, SegmentMetaInventory::default());
         Ok(index)
     }
@@ -460,6 +460,13 @@
     pub fn settings(&self) -> &IndexSettings {
         &self.settings
     }
+
+    /// Accessor to the index settings
+    ///
+    pub fn settings_mut(&mut self) -> &mut IndexSettings {
+        &mut self.settings
+    }
+
     /// Accessor to the index schema
     ///
     /// The schema is actually cloned.
@@ -1,7 +1,7 @@
 use super::SegmentComponent;
-use crate::core::SegmentId;
 use crate::schema::Schema;
 use crate::Opstamp;
+use crate::{core::SegmentId, store::Compressor};
 use census::{Inventory, TrackedObject};
 use serde::{Deserialize, Serialize};
 use std::path::PathBuf;
@@ -233,7 +233,11 @@ impl InnerSegmentMeta {
 pub struct IndexSettings {
     /// Sorts the documents by information
     /// provided in `IndexSortByField`
+    #[serde(skip_serializing_if = "Option::is_none")]
     pub sort_by_field: Option<IndexSortByField>,
+    /// The `Compressor` used to compress the doc store.
+    #[serde(default)]
+    pub docstore_compression: Compressor,
 }
 /// Settings to presort the documents in an index
 ///
@@ -255,6 +259,17 @@ pub enum Order {
     /// Descending Order
     Desc,
 }
+impl Order {
+    /// return if the Order is ascending
+    pub fn is_asc(&self) -> bool {
+        self == &Order::Asc
+    }
+    /// return if the Order is descending
+    pub fn is_desc(&self) -> bool {
+        self == &Order::Desc
+    }
+}
+
 /// Meta information about the `Index`.
 ///
 /// This object is serialized on disk in the `meta.json` file.
@@ -369,6 +384,7 @@ mod tests {
                     field: "text".to_string(),
                     order: Order::Asc,
                 }),
+                ..Default::default()
             },
             segments: Vec::new(),
             schema,
@@ -378,7 +394,7 @@ mod tests {
         let json = serde_json::ser::to_string(&index_metas).expect("serialization failed");
         assert_eq!(
             json,
-            r#"{"index_settings":{"sort_by_field":{"field":"text","order":"Asc"}},"segments":[],"schema":[{"name":"text","type":"text","options":{"indexing":{"record":"position","tokenizer":"default"},"stored":false}}],"opstamp":0}"#
+            r#"{"index_settings":{"sort_by_field":{"field":"text","order":"Asc"},"docstore_compression":"lz4"},"segments":[],"schema":[{"name":"text","type":"text","options":{"indexing":{"record":"position","tokenizer":"default"},"stored":false}}],"opstamp":0}"#
         );
     }
 }
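The new `Order::is_asc`/`is_desc` helpers pair naturally with the existing `Index::settings` accessor; a small sketch, assuming an `index` built with the sort settings from the changelog sketch earlier:

    use tantivy::Index;

    fn is_sorted_ascending(index: &Index) -> bool {
        index
            .settings()
            .sort_by_field
            .as_ref()
            .map(|sort_by_field| sort_by_field.order.is_asc())
            .unwrap_or(false)
    }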
@@ -147,6 +147,13 @@ impl FileSlice {
         self.slice(from_offset..self.len())
     }

+    /// Returns a slice from the end.
+    ///
+    /// Equivalent to `.slice(self.len() - from_offset, self.len())`
+    pub fn slice_from_end(&self, from_offset: usize) -> FileSlice {
+        self.slice(self.len() - from_offset..self.len())
+    }
+
     /// Like `.slice(...)` but enforcing only the `to`
     /// boundary.
     ///
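`slice_from_end` complements the existing `slice_from`; a small usage sketch, assuming `FileSlice` and `OwnedBytes` are exported from `tantivy::directory` as the footer tests below suggest:

    use tantivy::directory::{FileSlice, OwnedBytes};

    let file = FileSlice::new(Box::new(OwnedBytes::new(vec![1u8, 2, 3, 4, 5])));
    // keep only the last two bytes, i.e. .slice(3..5)
    let tail = file.slice_from_end(2);
    assert_eq!(tail.read_bytes().unwrap().as_slice(), &[4, 5]);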
@@ -1,69 +1,45 @@
-use crate::common::{BinarySerializable, CountingWriter, FixedSize, HasLen, VInt};
 use crate::directory::error::Incompatibility;
 use crate::directory::FileSlice;
-use crate::directory::{AntiCallToken, TerminatingWrite};
-use crate::Version;
+use crate::{
+    common::{BinarySerializable, CountingWriter, DeserializeFrom, FixedSize, HasLen},
+    directory::{AntiCallToken, TerminatingWrite},
+    Version, INDEX_FORMAT_VERSION,
+};
 use crc32fast::Hasher;
+use serde::{Deserialize, Serialize};
 use std::io;
 use std::io::Write;

-const FOOTER_MAX_LEN: usize = 10_000;
+const FOOTER_MAX_LEN: u32 = 50_000;
+
+/// The magic byte of the footer to identify corruption
+/// or an old version of the footer.
+const FOOTER_MAGIC_NUMBER: u32 = 1337;

 type CrcHashU32 = u32;

-#[derive(Debug, Clone, PartialEq)]
+/// A Footer is appended to every file
+#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
 pub struct Footer {
     pub version: Version,
-    pub meta: String,
-    pub versioned_footer: VersionedFooter,
-}
-
-/// Serialises the footer to a byte-array
-/// - versioned_footer_len : 4 bytes
-/// - versioned_footer: variable bytes
-/// - meta_len: 4 bytes
-/// - meta: variable bytes
-/// - version_len: 4 bytes
-/// - version json: variable bytes
-impl BinarySerializable for Footer {
-    fn serialize<W: io::Write>(&self, writer: &mut W) -> io::Result<()> {
-        BinarySerializable::serialize(&self.versioned_footer, writer)?;
-        BinarySerializable::serialize(&self.meta, writer)?;
-        let version_string =
-            serde_json::to_string(&self.version).map_err(|_err| io::ErrorKind::InvalidInput)?;
-        BinarySerializable::serialize(&version_string, writer)?;
-        Ok(())
-    }
-
-    fn deserialize<R: io::Read>(reader: &mut R) -> io::Result<Self> {
-        let versioned_footer = VersionedFooter::deserialize(reader)?;
-        let meta = String::deserialize(reader)?;
-        let version_json = String::deserialize(reader)?;
-        let version = serde_json::from_str(&version_json)?;
-        Ok(Footer {
-            version,
-            meta,
-            versioned_footer,
-        })
-    }
+    pub crc: CrcHashU32,
 }

 impl Footer {
-    pub fn new(versioned_footer: VersionedFooter) -> Self {
+    pub fn new(crc: CrcHashU32) -> Self {
         let version = crate::VERSION.clone();
-        let meta = version.to_string();
-        Footer {
-            version,
-            meta,
-            versioned_footer,
-        }
+        Footer { version, crc }
     }

+    pub fn crc(&self) -> CrcHashU32 {
+        self.crc
+    }
+
     pub fn append_footer<W: io::Write>(&self, mut write: &mut W) -> io::Result<()> {
         let mut counting_write = CountingWriter::wrap(&mut write);
-        self.serialize(&mut counting_write)?;
-        let written_len = counting_write.written_bytes();
-        (written_len as u32).serialize(write)?;
+        counting_write.write_all(serde_json::to_string(&self)?.as_ref())?;
+        let footer_payload_len = counting_write.written_bytes();
+        BinarySerializable::serialize(&(footer_payload_len as u32), write)?;
+        BinarySerializable::serialize(&(FOOTER_MAGIC_NUMBER as u32), write)?;
         Ok(())
     }
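With the new `append_footer` above, the on-disk layout becomes a json payload followed by a fixed-size trailer; a sketch reconstructed from the code, not an authoritative spec:

    // [ body bytes | serde_json(Footer) | footer_payload_len: u32 | FOOTER_MAGIC_NUMBER: u32 ]
    //
    // `extract_footer` (next hunk) reads the trailing 8 bytes first, checks the
    // magic number, then parses `footer_payload_len` bytes of json just before them.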
@@ -77,12 +53,47 @@ impl Footer {
                 ),
             ));
         }
-        let (body_footer, footer_len_file) = file.split_from_end(u32::SIZE_IN_BYTES);
-        let mut footer_len_bytes = footer_len_file.read_bytes()?;
-        let footer_len = u32::deserialize(&mut footer_len_bytes)? as usize;
-        let (body, footer) = body_footer.split_from_end(footer_len);
-        let mut footer_bytes = footer.read_bytes()?;
-        let footer = Footer::deserialize(&mut footer_bytes)?;
+
+        let footer_metadata_len = <(u32, u32)>::SIZE_IN_BYTES;
+        let (footer_len, footer_magic_byte): (u32, u32) = file
+            .slice_from_end(footer_metadata_len)
+            .read_bytes()?
+            .as_ref()
+            .deserialize()?;
+
+        if footer_magic_byte != FOOTER_MAGIC_NUMBER {
+            return Err(io::Error::new(
+                io::ErrorKind::InvalidData,
+                "Footer magic byte mismatch. File corrupted or index was created using an old tantivy version which is not supported anymore. Please use tantivy 0.15 or above to recreate the index.",
+            ));
+        }
+
+        if footer_len > FOOTER_MAX_LEN {
+            return Err(io::Error::new(
+                io::ErrorKind::InvalidData,
+                format!(
+                    "Footer seems invalid as it suggests a footer len of {}. File is corrupted, \
+                     or the index was created with a different & old version of tantivy.",
+                    footer_len
+                ),
+            ));
+        }
+
+        let total_footer_size = footer_len as usize + footer_metadata_len;
+        if file.len() < total_footer_size {
+            return Err(io::Error::new(
+                io::ErrorKind::UnexpectedEof,
+                format!(
+                    "File corrupted. The file is smaller than its footer bytes (len={}).",
+                    total_footer_size
+                ),
+            ));
+        }
+
+        let footer: Footer = serde_json::from_slice(&file.read_bytes_slice(
+            file.len() - total_footer_size..file.len() - footer_metadata_len as usize,
+        )?)?;
+
+        let body = file.slice_to(file.len() - total_footer_size);
         Ok((footer, body))
     }
@@ -90,151 +101,16 @@ impl Footer {
     /// Has to be called after `extract_footer` to make sure it's not accessing uninitialised memory
     pub fn is_compatible(&self) -> Result<(), Incompatibility> {
         let library_version = crate::version();
-        match &self.versioned_footer {
-            VersionedFooter::V1 {
-                crc32: _crc,
-                store_compression,
-            } => {
-                if &library_version.store_compression != store_compression {
-                    return Err(Incompatibility::CompressionMismatch {
-                        library_compression_format: library_version.store_compression.to_string(),
-                        index_compression_format: store_compression.to_string(),
-                    });
-                }
-                Ok(())
-            }
-            VersionedFooter::V2 {
-                crc32: _crc,
-                store_compression,
-            } => {
-                if &library_version.store_compression != store_compression {
-                    return Err(Incompatibility::CompressionMismatch {
-                        library_compression_format: library_version.store_compression.to_string(),
-                        index_compression_format: store_compression.to_string(),
-                    });
-                }
-                Ok(())
-            }
-            VersionedFooter::V3 {
-                crc32: _crc,
-                store_compression,
-            } => {
-                if &library_version.store_compression != store_compression {
-                    return Err(Incompatibility::CompressionMismatch {
-                        library_compression_format: library_version.store_compression.to_string(),
-                        index_compression_format: store_compression.to_string(),
-                    });
-                }
-                Ok(())
-            }
-            VersionedFooter::UnknownVersion => Err(Incompatibility::IndexMismatch {
-                library_version: library_version.clone(),
-                index_version: self.version.clone(),
-            }),
-        }
+        if self.version.index_format_version < 4
+            || self.version.index_format_version > INDEX_FORMAT_VERSION
+        {
+            return Err(Incompatibility::IndexMismatch {
+                library_version: library_version.clone(),
+                index_version: self.version.clone(),
+            });
+        }
+        Ok(())
     }
 }

-/// Footer that includes a crc32 hash that enables us to checksum files in the index
-#[derive(Debug, Clone, PartialEq)]
-pub enum VersionedFooter {
-    UnknownVersion,
-    V1 {
-        crc32: CrcHashU32,
-        store_compression: String,
-    },
-    // Introduction of the Block WAND information.
-    V2 {
-        crc32: CrcHashU32,
-        store_compression: String,
-    },
-    // Block wand max termfreq on 1 byte
-    V3 {
-        crc32: CrcHashU32,
-        store_compression: String,
-    },
-}
-
-impl BinarySerializable for VersionedFooter {
-    fn serialize<W: io::Write>(&self, writer: &mut W) -> io::Result<()> {
-        let mut buf = Vec::new();
-        match self {
-            VersionedFooter::V3 {
-                crc32,
-                store_compression: compression,
-            } => {
-                // Serializes a valid `VersionedFooter` or panics if the version is unknown
-                // [ version | crc_hash | compression_mode ]
-                // [ 0..4    | 4..8     | variable         ]
-                BinarySerializable::serialize(&3u32, &mut buf)?;
-                BinarySerializable::serialize(crc32, &mut buf)?;
-                BinarySerializable::serialize(compression, &mut buf)?;
-            }
-            VersionedFooter::V2 { .. }
-            | VersionedFooter::V1 { .. }
-            | VersionedFooter::UnknownVersion => {
-                return Err(io::Error::new(
-                    io::ErrorKind::InvalidInput,
-                    "Cannot serialize an unknown versioned footer",
-                ));
-            }
-        }
-        BinarySerializable::serialize(&VInt(buf.len() as u64), writer)?;
-        assert!(buf.len() <= FOOTER_MAX_LEN);
-        writer.write_all(&buf[..])?;
-        Ok(())
-    }
-
-    fn deserialize<R: io::Read>(reader: &mut R) -> io::Result<Self> {
-        let len = VInt::deserialize(reader)?.0 as usize;
-        if len > FOOTER_MAX_LEN {
-            return Err(io::Error::new(
-                io::ErrorKind::InvalidData,
-                format!(
-                    "Footer seems invalid as it suggests a footer len of {}. File is corrupted, \
-                     or the index was created with a different & old version of tantivy.",
-                    len
-                ),
-            ));
-        }
-        let mut buf = vec![0u8; len];
-        reader.read_exact(&mut buf[..])?;
-        let mut cursor = &buf[..];
-        let version = u32::deserialize(&mut cursor)?;
-        if version > 3 {
-            return Ok(VersionedFooter::UnknownVersion);
-        }
-        let crc32 = u32::deserialize(&mut cursor)?;
-        let store_compression = String::deserialize(&mut cursor)?;
-        Ok(if version == 1 {
-            VersionedFooter::V1 {
-                crc32,
-                store_compression,
-            }
-        } else if version == 2 {
-            VersionedFooter::V2 {
-                crc32,
-                store_compression,
-            }
-        } else {
-            assert_eq!(version, 3);
-            VersionedFooter::V3 {
-                crc32,
-                store_compression,
-            }
-        })
-    }
-}
-
-impl VersionedFooter {
-    pub fn crc(&self) -> Option<CrcHashU32> {
-        match self {
-            VersionedFooter::V3 { crc32, .. } => Some(*crc32),
-            VersionedFooter::V2 { crc32, .. } => Some(*crc32),
-            VersionedFooter::V1 { crc32, .. } => Some(*crc32),
-            VersionedFooter::UnknownVersion { .. } => None,
-        }
-    }
-}
-
 pub(crate) struct FooterProxy<W: TerminatingWrite> {

@@ -268,10 +144,7 @@ impl<W: TerminatingWrite> Write for FooterProxy<W> {
 impl<W: TerminatingWrite> TerminatingWrite for FooterProxy<W> {
     fn terminate_ref(&mut self, _: AntiCallToken) -> io::Result<()> {
         let crc32 = self.hasher.take().unwrap().finalize();
-        let footer = Footer::new(VersionedFooter::V3 {
-            crc32,
-            store_compression: crate::store::COMPRESSION.to_string(),
-        });
+        let footer = Footer::new(crc32);
         let mut writer = self.writer.take().unwrap();
         footer.append_footer(&mut writer)?;
         writer.terminate()
@@ -281,140 +154,75 @@ impl<W: TerminatingWrite> TerminatingWrite for FooterProxy<W> {
 #[cfg(test)]
 mod tests {

-    use super::CrcHashU32;
-    use super::FooterProxy;
-    use crate::common::{BinarySerializable, VInt};
-    use crate::directory::footer::{Footer, VersionedFooter};
-    use crate::directory::TerminatingWrite;
-    use byteorder::{ByteOrder, LittleEndian};
-    use regex::Regex;
+    use crate::directory::footer::Footer;
+    use crate::directory::OwnedBytes;
+    use crate::{
+        common::BinarySerializable,
+        directory::{footer::FOOTER_MAGIC_NUMBER, FileSlice},
+    };
+    use std::io;

     #[test]
-    fn test_versioned_footer() {
-        let mut vec = Vec::new();
-        let footer_proxy = FooterProxy::new(&mut vec);
-        assert!(footer_proxy.terminate().is_ok());
-        if crate::store::COMPRESSION == "lz4" {
-            assert_eq!(vec.len(), 158);
-        } else if crate::store::COMPRESSION == "snappy" {
-            assert_eq!(vec.len(), 167);
-        } else if crate::store::COMPRESSION == "lz4_block" {
-            assert_eq!(vec.len(), 176);
-        }
-        let footer = Footer::deserialize(&mut &vec[..]).unwrap();
-        assert!(matches!(
-            footer.versioned_footer,
-            VersionedFooter::V3 { store_compression, .. }
-            if store_compression == crate::store::COMPRESSION
-        ));
-        assert_eq!(&footer.version, crate::version());
+    fn test_deserialize_footer() {
+        let mut buf: Vec<u8> = vec![];
+        let footer = Footer::new(123);
+        footer.append_footer(&mut buf).unwrap();
+        let owned_bytes = OwnedBytes::new(buf);
+        let fileslice = FileSlice::new(Box::new(owned_bytes));
+        let (footer_deser, _body) = Footer::extract_footer(fileslice).unwrap();
+        assert_eq!(footer_deser.crc(), footer.crc());
     }

     #[test]
-    fn test_serialize_deserialize_footer() {
-        let mut buffer = Vec::new();
-        let crc32 = 123456u32;
-        let footer: Footer = Footer::new(VersionedFooter::V3 {
-            crc32,
-            store_compression: "lz4".to_string(),
-        });
-        footer.serialize(&mut buffer).unwrap();
-        let footer_deser = Footer::deserialize(&mut &buffer[..]).unwrap();
-        assert_eq!(footer_deser, footer);
+    fn test_deserialize_footer_missing_magic_byte() {
+        let mut buf: Vec<u8> = vec![];
+        BinarySerializable::serialize(&0_u32, &mut buf).unwrap();
+        let wrong_magic_byte: u32 = 5555;
+        BinarySerializable::serialize(&wrong_magic_byte, &mut buf).unwrap();
+
+        let owned_bytes = OwnedBytes::new(buf);
+
+        let fileslice = FileSlice::new(Box::new(owned_bytes));
+        let err = Footer::extract_footer(fileslice).unwrap_err();
+        assert_eq!(
+            err.to_string(),
+            "Footer magic byte mismatch. File corrupted or index was created using an old tantivy version which \
+             is not supported anymore. Please use tantivy 0.15 or above to recreate the index."
+        );
     }

     #[test]
-    fn footer_length() {
-        let crc32 = 1111111u32;
-        let versioned_footer = VersionedFooter::V3 {
-            crc32,
-            store_compression: "lz4".to_string(),
-        };
-        let mut buf = Vec::new();
-        versioned_footer.serialize(&mut buf).unwrap();
-        assert_eq!(buf.len(), 13);
-        let footer = Footer::new(versioned_footer);
-        let regex_ptn = Regex::new(
-            "tantivy v[0-9]{1,3}\\.[0-9]{1,3}\\.[0-9]{1,3}\\.{0,10}, index_format v[0-9]{1,5}",
-        )
-        .unwrap();
-        assert!(regex_ptn.is_match(&footer.meta));
-    }
+    fn test_deserialize_footer_wrong_filesize() {
+        let mut buf: Vec<u8> = vec![];
+        BinarySerializable::serialize(&100_u32, &mut buf).unwrap();
+        BinarySerializable::serialize(&FOOTER_MAGIC_NUMBER, &mut buf).unwrap();

-    #[test]
-    fn versioned_footer_from_bytes() {
-        let v_footer_bytes = vec![
-            // versioned footer length
-            12 | 128,
-            // index format version
-            3,
-            0,
-            0,
-            0,
-            // crc 32
-            12,
-            35,
-            89,
-            18,
-            // compression format
-            3 | 128,
-            b'l',
-            b'z',
-            b'4',
-        ];
-        let mut cursor = &v_footer_bytes[..];
-        let versioned_footer = VersionedFooter::deserialize(&mut cursor).unwrap();
-        assert!(cursor.is_empty());
-        let expected_crc: u32 = LittleEndian::read_u32(&v_footer_bytes[5..9]) as CrcHashU32;
-        let expected_versioned_footer: VersionedFooter = VersionedFooter::V3 {
-            crc32: expected_crc,
-            store_compression: "lz4".to_string(),
-        };
-        assert_eq!(versioned_footer, expected_versioned_footer);
-        let mut buffer = Vec::new();
-        assert!(versioned_footer.serialize(&mut buffer).is_ok());
-        assert_eq!(&v_footer_bytes[..], &buffer[..]);
-    }
+        let owned_bytes = OwnedBytes::new(buf);

-    #[test]
-    fn versioned_footer_panic() {
-        let v_footer_bytes = vec![6u8 | 128u8, 3u8, 0u8, 0u8, 1u8, 0u8, 0u8];
-        let mut b = &v_footer_bytes[..];
-        let versioned_footer = VersionedFooter::deserialize(&mut b).unwrap();
-        assert!(b.is_empty());
-        let expected_versioned_footer = VersionedFooter::UnknownVersion;
-        assert_eq!(versioned_footer, expected_versioned_footer);
-        let mut buf = Vec::new();
-        assert!(versioned_footer.serialize(&mut buf).is_err());
-    }
-
-    #[test]
-    #[cfg(not(feature = "lz4"))]
-    fn compression_mismatch() {
-        let crc32 = 1111111u32;
-        let versioned_footer = VersionedFooter::V1 {
-            crc32,
-            store_compression: "lz4".to_string(),
-        };
-        let footer = Footer::new(versioned_footer);
-        let res = footer.is_compatible();
-        assert!(res.is_err());
+        let fileslice = FileSlice::new(Box::new(owned_bytes));
+        let err = Footer::extract_footer(fileslice).unwrap_err();
+        assert_eq!(err.kind(), io::ErrorKind::UnexpectedEof);
+        assert_eq!(
+            err.to_string(),
+            "File corrupted. The file is smaller than its footer bytes (len=108)."
+        );
     }

     #[test]
     fn test_deserialize_too_large_footer() {
-        let mut buf = vec![];
-        assert!(FooterProxy::new(&mut buf).terminate().is_ok());
-        let mut long_len_buf = [0u8; 10];
-        let num_bytes = VInt(super::FOOTER_MAX_LEN as u64 + 1u64).serialize_into(&mut long_len_buf);
-        buf[0..num_bytes].copy_from_slice(&long_len_buf[..num_bytes]);
-        let err = Footer::deserialize(&mut &buf[..]).unwrap_err();
+        let mut buf: Vec<u8> = vec![];
+
+        let footer_length = super::FOOTER_MAX_LEN + 1;
+        BinarySerializable::serialize(&footer_length, &mut buf).unwrap();
+        BinarySerializable::serialize(&FOOTER_MAGIC_NUMBER, &mut buf).unwrap();
+
+        let owned_bytes = OwnedBytes::new(buf);
+
+        let fileslice = FileSlice::new(Box::new(owned_bytes));
+        let err = Footer::extract_footer(fileslice).unwrap_err();
         assert_eq!(err.kind(), io::ErrorKind::InvalidData);
         assert_eq!(
             err.to_string(),
-            "Footer seems invalid as it suggests a footer len of 10001. File is corrupted, \
-             or the index was created with a different & old version of tantivy."
+            "Footer seems invalid as it suggests a footer len of 50001. File is corrupted, \
+             or the index was created with a different & old version of tantivy."
         );
     }
 }
@@ -245,11 +245,7 @@ impl ManagedDirectory {
         let mut hasher = Hasher::new();
         hasher.update(bytes.as_slice());
         let crc = hasher.finalize();
-        Ok(footer
-            .versioned_footer
-            .crc()
-            .map(|v| v == crc)
-            .unwrap_or(false))
+        Ok(footer.crc() == crc)
     }

     /// List files for which checksum does not match content
@@ -593,7 +593,7 @@ mod tests {

         let mut index_writer = index.writer_for_tests().unwrap();
         let mut log_merge_policy = LogMergePolicy::default();
-        log_merge_policy.set_min_merge_size(3);
+        log_merge_policy.set_min_num_segments(3);
         index_writer.set_merge_policy(Box::new(log_merge_policy));
-        for _num_commits in 0..10 {
+        for _ in 0..10 {
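This is the test-side view of the log merge policy fix (#1043), which also renames the knob the test turns. A minimal sketch of installing the policy on a writer, using only calls visible in the hunk above plus `LogMergePolicy::default`:

    use tantivy::merge_policy::LogMergePolicy;
    use tantivy::IndexWriter;

    fn configure_merge_policy(index_writer: &mut IndexWriter) {
        let mut log_merge_policy = LogMergePolicy::default();
        // `set_min_num_segments` replaces the old `set_min_merge_size`
        log_merge_policy.set_min_num_segments(3);
        index_writer.set_merge_policy(Box::new(log_merge_policy));
    }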
@@ -1,7 +1,7 @@
+use crate::directory::FileSlice;
 use crate::directory::OwnedBytes;
-use crate::fastfield::FastFieldReader;
+use crate::fastfield::{BitpackedFastFieldReader, FastFieldReader, MultiValueLength};
 use crate::DocId;
-use crate::{directory::FileSlice, fastfield::MultiValueLength};

 /// Reader for byte array fast fields
 ///
@@ -15,13 +15,13 @@ use crate::{directory::FileSlice, fastfield::MultiValueLength};
 /// and the start index for the next document, and keeping the bytes in between.
 #[derive(Clone)]
 pub struct BytesFastFieldReader {
-    idx_reader: FastFieldReader<u64>,
+    idx_reader: BitpackedFastFieldReader<u64>,
     values: OwnedBytes,
 }

 impl BytesFastFieldReader {
     pub(crate) fn open(
-        idx_reader: FastFieldReader<u64>,
+        idx_reader: BitpackedFastFieldReader<u64>,
         values_file: FileSlice,
     ) -> crate::Result<BytesFastFieldReader> {
         let values = values_file.read_bytes()?;
@@ -1,8 +1,11 @@
 use std::io;

+use crate::fastfield::serializer::FastFieldSerializer;
 use crate::schema::{Document, Field, Value};
 use crate::DocId;
-use crate::{fastfield::serializer::FastFieldSerializer, indexer::doc_id_mapping::DocIdMapping};
+use crate::{
+    fastfield::serializer::CompositeFastFieldSerializer, indexer::doc_id_mapping::DocIdMapping,
+};

 /// Writer for byte array (as in, any number of bytes per document) fast fields
 ///
@@ -104,7 +107,7 @@ impl BytesFastFieldWriter {
     /// Serializes the fast field values by pushing them to the `FastFieldSerializer`.
     pub fn serialize(
         &self,
-        serializer: &mut FastFieldSerializer,
+        serializer: &mut CompositeFastFieldSerializer,
         doc_id_map: Option<&DocIdMapping>,
     ) -> io::Result<()> {
         // writing the offset index
@@ -95,7 +95,7 @@ mod tests {
         let schema = schema_builder.build();
         let index = Index::create_in_ram(schema);
         let mut index_writer = index.writer_for_tests()?;
-        index_writer.add_document(doc!(facet_field=>Facet::from_text("/a/b")));
+        index_writer.add_document(doc!(facet_field=>Facet::from_text("/a/b").unwrap()));
         index_writer.commit()?;
         let searcher = index.reader()?.searcher();
         let facet_reader = searcher
@@ -118,7 +118,7 @@ mod tests {
         let schema = schema_builder.build();
         let index = Index::create_in_ram(schema);
         let mut index_writer = index.writer_for_tests()?;
-        index_writer.add_document(doc!(facet_field=>Facet::from_text("/a/b")));
+        index_writer.add_document(doc!(facet_field=>Facet::from_text("/a/b").unwrap()));
         index_writer.commit()?;
         let searcher = index.reader()?.searcher();
         let facet_reader = searcher
@@ -141,7 +141,7 @@ mod tests {
         let schema = schema_builder.build();
         let index = Index::create_in_ram(schema);
         let mut index_writer = index.writer_for_tests()?;
-        index_writer.add_document(doc!(facet_field=>Facet::from_text("/a/b")));
+        index_writer.add_document(doc!(facet_field=>Facet::from_text("/a/b").unwrap()));
         index_writer.commit()?;
         let searcher = index.reader()?.searcher();
         let facet_reader = searcher
@@ -164,7 +164,7 @@ mod tests {
         let schema = schema_builder.build();
         let index = Index::create_in_ram(schema);
         let mut index_writer = index.writer_for_tests()?;
-        index_writer.add_document(doc!(facet_field=>Facet::from_text("/a/b")));
+        index_writer.add_document(doc!(facet_field=>Facet::from_text("/a/b").unwrap()));
         index_writer.commit()?;
         let searcher = index.reader()?.searcher();
         let facet_reader = searcher
@@ -187,7 +187,7 @@ mod tests {
         let schema = schema_builder.build();
         let index = Index::create_in_ram(schema);
         let mut index_writer = index.writer_for_tests()?;
-        index_writer.add_document(doc!(facet_field=>Facet::from_text("/a/b")));
+        index_writer.add_document(doc!(facet_field=>Facet::from_text("/a/b").unwrap()));
         index_writer.add_document(Document::default());
         index_writer.commit()?;
         let searcher = index.reader()?.searcher();
@@ -29,8 +29,11 @@ pub use self::delete::DeleteBitSet;
 pub use self::error::{FastFieldNotAvailableError, Result};
 pub use self::facet_reader::FacetReader;
 pub use self::multivalued::{MultiValuedFastFieldReader, MultiValuedFastFieldWriter};
+pub use self::reader::BitpackedFastFieldReader;
+pub use self::reader::DynamicFastFieldReader;
 pub use self::reader::FastFieldReader;
 pub use self::readers::FastFieldReaders;
+pub use self::serializer::CompositeFastFieldSerializer;
 pub use self::serializer::FastFieldSerializer;
 pub use self::writer::{FastFieldsWriter, IntFastFieldWriter};
 use crate::schema::Cardinality;
@@ -57,7 +60,7 @@ mod writer;
 pub trait MultiValueLength {
     /// returns the num of values associated to a doc_id
     fn get_len(&self, doc_id: DocId) -> u64;
-    /// returns the sum of num of all values for all doc_ids
+    /// returns the sum of num values for all doc_ids
     fn get_total_len(&self) -> u64;
 }
@@ -211,7 +214,7 @@ mod tests {
     use super::*;
     use crate::common::CompositeFile;
     use crate::directory::{Directory, RamDirectory, WritePtr};
-    use crate::fastfield::FastFieldReader;
+    use crate::fastfield::BitpackedFastFieldReader;
     use crate::merge_policy::NoMergePolicy;
     use crate::schema::Field;
     use crate::schema::Schema;
@@ -236,7 +239,7 @@ mod tests {

     #[test]
     pub fn test_fastfield() {
-        let test_fastfield = FastFieldReader::<u64>::from(vec![100, 200, 300]);
+        let test_fastfield = BitpackedFastFieldReader::<u64>::from(vec![100, 200, 300]);
         assert_eq!(test_fastfield.get(0), 100);
         assert_eq!(test_fastfield.get(1), 200);
         assert_eq!(test_fastfield.get(2), 300);
@@ -254,7 +257,7 @@ mod tests {
         let directory: RamDirectory = RamDirectory::create();
         {
             let write: WritePtr = directory.open_write(Path::new("test")).unwrap();
-            let mut serializer = FastFieldSerializer::from_write(write).unwrap();
+            let mut serializer = CompositeFastFieldSerializer::from_write(write).unwrap();
             let mut fast_field_writers = FastFieldsWriter::from_schema(&SCHEMA);
             fast_field_writers.add_document(&doc!(*FIELD=>13u64));
             fast_field_writers.add_document(&doc!(*FIELD=>14u64));
@@ -268,7 +271,7 @@ mod tests {
         assert_eq!(file.len(), 36 as usize);
         let composite_file = CompositeFile::open(&file)?;
         let file = composite_file.open_read(*FIELD).unwrap();
-        let fast_field_reader = FastFieldReader::<u64>::open(file)?;
+        let fast_field_reader = BitpackedFastFieldReader::<u64>::open(file)?;
         assert_eq!(fast_field_reader.get(0), 13u64);
         assert_eq!(fast_field_reader.get(1), 14u64);
         assert_eq!(fast_field_reader.get(2), 2u64);
@@ -281,7 +284,7 @@ mod tests {
         let directory: RamDirectory = RamDirectory::create();
         {
             let write: WritePtr = directory.open_write(Path::new("test"))?;
-            let mut serializer = FastFieldSerializer::from_write(write)?;
+            let mut serializer = CompositeFastFieldSerializer::from_write(write)?;
             let mut fast_field_writers = FastFieldsWriter::from_schema(&SCHEMA);
             fast_field_writers.add_document(&doc!(*FIELD=>4u64));
             fast_field_writers.add_document(&doc!(*FIELD=>14_082_001u64));
@@ -300,7 +303,7 @@ mod tests {
         {
             let fast_fields_composite = CompositeFile::open(&file)?;
             let data = fast_fields_composite.open_read(*FIELD).unwrap();
-            let fast_field_reader = FastFieldReader::<u64>::open(data)?;
+            let fast_field_reader = BitpackedFastFieldReader::<u64>::open(data)?;
             assert_eq!(fast_field_reader.get(0), 4u64);
             assert_eq!(fast_field_reader.get(1), 14_082_001u64);
             assert_eq!(fast_field_reader.get(2), 3_052u64);
@@ -321,7 +324,7 @@ mod tests {

         {
             let write: WritePtr = directory.open_write(Path::new("test")).unwrap();
-            let mut serializer = FastFieldSerializer::from_write(write).unwrap();
+            let mut serializer = CompositeFastFieldSerializer::from_write(write).unwrap();
             let mut fast_field_writers = FastFieldsWriter::from_schema(&SCHEMA);
             for _ in 0..10_000 {
                 fast_field_writers.add_document(&doc!(*FIELD=>100_000u64));
@@ -336,7 +339,7 @@ mod tests {
         {
             let fast_fields_composite = CompositeFile::open(&file).unwrap();
             let data = fast_fields_composite.open_read(*FIELD).unwrap();
-            let fast_field_reader = FastFieldReader::<u64>::open(data)?;
+            let fast_field_reader = BitpackedFastFieldReader::<u64>::open(data)?;
             for doc in 0..10_000 {
                 assert_eq!(fast_field_reader.get(doc), 100_000u64);
             }
@@ -351,7 +354,7 @@ mod tests {

         {
             let write: WritePtr = directory.open_write(Path::new("test")).unwrap();
-            let mut serializer = FastFieldSerializer::from_write(write).unwrap();
+            let mut serializer = CompositeFastFieldSerializer::from_write(write).unwrap();
             let mut fast_field_writers = FastFieldsWriter::from_schema(&SCHEMA);
             // forcing the amplitude to be high
             fast_field_writers.add_document(&doc!(*FIELD=>0u64));
@@ -368,7 +371,7 @@ mod tests {
         {
             let fast_fields_composite = CompositeFile::open(&file)?;
             let data = fast_fields_composite.open_read(*FIELD).unwrap();
-            let fast_field_reader = FastFieldReader::<u64>::open(data)?;
+            let fast_field_reader = BitpackedFastFieldReader::<u64>::open(data)?;
             assert_eq!(fast_field_reader.get(0), 0u64);
             for doc in 1..10_001 {
                 assert_eq!(
@@ -390,7 +393,7 @@ mod tests {
         let schema = schema_builder.build();
         {
             let write: WritePtr = directory.open_write(Path::new("test")).unwrap();
-            let mut serializer = FastFieldSerializer::from_write(write).unwrap();
+            let mut serializer = CompositeFastFieldSerializer::from_write(write).unwrap();
             let mut fast_field_writers = FastFieldsWriter::from_schema(&schema);
             for i in -100i64..10_000i64 {
                 let mut doc = Document::default();
@@ -407,7 +410,7 @@ mod tests {
         {
             let fast_fields_composite = CompositeFile::open(&file)?;
             let data = fast_fields_composite.open_read(i64_field).unwrap();
-            let fast_field_reader = FastFieldReader::<i64>::open(data)?;
+            let fast_field_reader = BitpackedFastFieldReader::<i64>::open(data)?;

             assert_eq!(fast_field_reader.min_value(), -100i64);
             assert_eq!(fast_field_reader.max_value(), 9_999i64);
@@ -433,7 +436,7 @@ mod tests {

         {
             let write: WritePtr = directory.open_write(Path::new("test")).unwrap();
-            let mut serializer = FastFieldSerializer::from_write(write).unwrap();
+            let mut serializer = CompositeFastFieldSerializer::from_write(write).unwrap();
             let mut fast_field_writers = FastFieldsWriter::from_schema(&schema);
             let doc = Document::default();
             fast_field_writers.add_document(&doc);
@@ -447,7 +450,7 @@ mod tests {
         {
             let fast_fields_composite = CompositeFile::open(&file).unwrap();
             let data = fast_fields_composite.open_read(i64_field).unwrap();
-            let fast_field_reader = FastFieldReader::<i64>::open(data)?;
+            let fast_field_reader = BitpackedFastFieldReader::<i64>::open(data)?;
             assert_eq!(fast_field_reader.get(0u32), 0i64);
         }
         Ok(())
@@ -468,7 +471,7 @@ mod tests {
         let directory = RamDirectory::create();
         {
             let write: WritePtr = directory.open_write(Path::new("test"))?;
-            let mut serializer = FastFieldSerializer::from_write(write)?;
+            let mut serializer = CompositeFastFieldSerializer::from_write(write)?;
             let mut fast_field_writers = FastFieldsWriter::from_schema(&SCHEMA);
             for &x in &permutation {
                 fast_field_writers.add_document(&doc!(*FIELD=>x));
@@ -480,7 +483,7 @@ mod tests {
         {
             let fast_fields_composite = CompositeFile::open(&file)?;
             let data = fast_fields_composite.open_read(*FIELD).unwrap();
-            let fast_field_reader = FastFieldReader::<u64>::open(data)?;
+            let fast_field_reader = BitpackedFastFieldReader::<u64>::open(data)?;

             let mut a = 0u64;
             for _ in 0..n {
@@ -1,6 +1,6 @@
 use std::ops::Range;

-use crate::fastfield::{FastFieldReader, FastValue, MultiValueLength};
+use crate::fastfield::{BitpackedFastFieldReader, FastFieldReader, FastValue, MultiValueLength};
 use crate::DocId;

 /// Reader for a multivalued `u64` fast field.
@@ -13,14 +13,14 @@ use crate::DocId;
 ///
 #[derive(Clone)]
 pub struct MultiValuedFastFieldReader<Item: FastValue> {
-    idx_reader: FastFieldReader<u64>,
-    vals_reader: FastFieldReader<Item>,
+    idx_reader: BitpackedFastFieldReader<u64>,
+    vals_reader: BitpackedFastFieldReader<Item>,
 }

 impl<Item: FastValue> MultiValuedFastFieldReader<Item> {
     pub(crate) fn open(
-        idx_reader: FastFieldReader<u64>,
-        vals_reader: FastFieldReader<Item>,
+        idx_reader: BitpackedFastFieldReader<u64>,
+        vals_reader: BitpackedFastFieldReader<Item>,
     ) -> MultiValuedFastFieldReader<Item> {
         MultiValuedFastFieldReader {
             idx_reader,
@@ -1,5 +1,6 @@
-use crate::fastfield::serializer::FastSingleFieldSerializer;
-use crate::fastfield::FastFieldSerializer;
+use crate::fastfield::serializer::DynamicFastFieldSerializer;
+use crate::fastfield::serializer::FastFieldSerializer;
+use crate::fastfield::CompositeFastFieldSerializer;
 use crate::postings::UnorderedTermId;
 use crate::schema::{Document, Field};
 use crate::termdict::TermOrdinal;
@@ -134,7 +135,7 @@ impl MultiValuedFastFieldWriter {
     ///
     pub fn serialize(
         &self,
-        serializer: &mut FastFieldSerializer,
+        serializer: &mut CompositeFastFieldSerializer,
         mapping_opt: Option<&FnvHashMap<UnorderedTermId, TermOrdinal>>,
         doc_id_map: Option<&DocIdMapping>,
     ) -> io::Result<()> {
@@ -154,7 +155,7 @@ impl MultiValuedFastFieldWriter {
         }
         {
             // writing the values themselves.
-            let mut value_serializer: FastSingleFieldSerializer<'_, _>;
+            let mut value_serializer: DynamicFastFieldSerializer<'_, _>;
             match mapping_opt {
                 Some(mapping) => {
                     value_serializer = serializer.new_u64_fast_field_with_idx(
@@ -4,7 +4,7 @@ use crate::common::CompositeFile;
 use crate::directory::FileSlice;
 use crate::directory::OwnedBytes;
 use crate::directory::{Directory, RamDirectory, WritePtr};
-use crate::fastfield::{FastFieldSerializer, FastFieldsWriter};
+use crate::fastfield::{CompositeFastFieldSerializer, FastFieldsWriter};
 use crate::schema::Schema;
 use crate::schema::FAST;
 use crate::DocId;
@@ -14,12 +14,94 @@ use std::path::Path;
 use tantivy_bitpacker::compute_num_bits;
 use tantivy_bitpacker::BitUnpacker;

+/// FastFieldReader is the trait to access fast field data.
+pub trait FastFieldReader<Item: FastValue>: Clone {
+    /// Return the value associated to the given document.
+    ///
+    /// This accessor should return as fast as possible.
+    ///
+    /// # Panics
+    ///
+    /// May panic if `doc` is greater than the segment's `maxdoc`.
+    fn get(&self, doc: DocId) -> Item;
+
+    /// Fills an output buffer with the fast field values
+    /// associated with the `DocId` going from
+    /// `start` to `start + output.len()`.
+    ///
+    /// Regardless of the type of `Item`, this method works by
+    /// - transmuting the output array
+    /// - extracting the `Item`s as if they were `u64`
+    /// - possibly converting the `u64` value to the right type.
+    ///
+    /// # Panics
+    ///
+    /// May panic if `start + output.len()` is greater than
+    /// the segment's `maxdoc`.
+    fn get_range(&self, start: DocId, output: &mut [Item]);
+
+    /// Returns the minimum value for this fast field.
+    ///
+    /// The min value does not take in account of possible
+    /// deleted document, and should be considered as a lower bound
+    /// of the actual minimum value.
+    fn min_value(&self) -> Item;
+
+    /// Returns the maximum value for this fast field.
+    ///
+    /// The max value does not take in account of possible
+    /// deleted document, and should be considered as an upper bound
+    /// of the actual maximum value.
+    fn max_value(&self) -> Item;
+}
+
+#[derive(Clone)]
+/// DynamicFastFieldReader wraps different readers to access
+/// the various encoded fastfield data
+pub enum DynamicFastFieldReader<Item: FastValue> {
+    /// Bitpacked compressed fastfield data.
+    Bitpacked(BitpackedFastFieldReader<Item>),
+}
+
+impl<Item: FastValue> DynamicFastFieldReader<Item> {
+    /// Returns the correct reader, wrapped in the `DynamicFastFieldReader` enum, for the data.
+    pub fn open(file: FileSlice) -> crate::Result<DynamicFastFieldReader<Item>> {
+        Ok(DynamicFastFieldReader::Bitpacked(
+            BitpackedFastFieldReader::open(file)?,
+        ))
+    }
+}
+
+impl<Item: FastValue> FastFieldReader<Item> for DynamicFastFieldReader<Item> {
+    fn get(&self, doc: DocId) -> Item {
+        match self {
+            Self::Bitpacked(reader) => reader.get(doc),
+        }
+    }
+    fn get_range(&self, start: DocId, output: &mut [Item]) {
+        match self {
+            Self::Bitpacked(reader) => reader.get_range(start, output),
+        }
+    }
+    fn min_value(&self) -> Item {
+        match self {
+            Self::Bitpacked(reader) => reader.min_value(),
+        }
+    }
+    fn max_value(&self) -> Item {
+        match self {
+            Self::Bitpacked(reader) => reader.max_value(),
+        }
+    }
+}
+
 /// Trait for accessing a fastfield.
 ///
 /// Depending on the field type, a different
 /// fast field is required.
 #[derive(Clone)]
-pub struct FastFieldReader<Item: FastValue> {
+pub struct BitpackedFastFieldReader<Item: FastValue> {
     bytes: OwnedBytes,
     bit_unpacker: BitUnpacker,
     min_value_u64: u64,
@@ -27,7 +109,7 @@ pub struct FastFieldReader<Item: FastValue> {
     _phantom: PhantomData<Item>,
 }

-impl<Item: FastValue> FastFieldReader<Item> {
+impl<Item: FastValue> BitpackedFastFieldReader<Item> {
     /// Opens a fast field given a file.
     pub fn open(file: FileSlice) -> crate::Result<Self> {
         let mut bytes = file.read_bytes()?;
@@ -36,7 +118,7 @@ impl<Item: FastValue> FastFieldReader<Item> {
         let max_value = min_value + amplitude;
         let num_bits = compute_num_bits(amplitude);
         let bit_unpacker = BitUnpacker::new(num_bits);
-        Ok(FastFieldReader {
+        Ok(BitpackedFastFieldReader {
             bytes,
             min_value_u64: min_value,
             max_value_u64: max_value,
@@ -44,19 +126,6 @@ impl<Item: FastValue> FastFieldReader<Item> {
             _phantom: PhantomData,
         })
     }

-    /// Return the value associated to the given document.
-    ///
-    /// This accessor should return as fast as possible.
-    ///
-    /// # Panics
-    ///
-    /// May panic if `doc` is greater than the segment
-    // `maxdoc`.
-    pub fn get(&self, doc: DocId) -> Item {
-        self.get_u64(u64::from(doc))
-    }
-
     pub(crate) fn get_u64(&self, doc: u64) -> Item {
         Item::from_u64(self.min_value_u64 + self.bit_unpacker.get(doc, &self.bytes))
     }
@@ -78,6 +147,20 @@ impl<Item: FastValue> FastFieldReader<Item> {
             *out = self.get_u64(start + (i as u64));
         }
     }
 }

+impl<Item: FastValue> FastFieldReader<Item> for BitpackedFastFieldReader<Item> {
+    /// Return the value associated to the given document.
+    ///
+    /// This accessor should return as fast as possible.
+    ///
+    /// # Panics
+    ///
+    /// May panic if `doc` is greater than the segment's `maxdoc`.
+    fn get(&self, doc: DocId) -> Item {
+        self.get_u64(u64::from(doc))
+    }

     /// Fills an output buffer with the fast field values
     /// associated with the `DocId` going from
@@ -92,7 +175,7 @@ impl<Item: FastValue> FastFieldReader<Item> {
     ///
     /// May panic if `start + output.len()` is greater than
     /// the segment's `maxdoc`.
-    pub fn get_range(&self, start: DocId, output: &mut [Item]) {
+    fn get_range(&self, start: DocId, output: &mut [Item]) {
         self.get_range_u64(u64::from(start), output);
     }

@@ -101,7 +184,7 @@ impl<Item: FastValue> FastFieldReader<Item> {
     /// The min value does not take in account of possible
     /// deleted document, and should be considered as a lower bound
     /// of the actual minimum value.
-    pub fn min_value(&self) -> Item {
+    fn min_value(&self) -> Item {
         Item::from_u64(self.min_value_u64)
     }

@@ -110,13 +193,13 @@ impl<Item: FastValue> FastFieldReader<Item> {
     /// The max value does not take in account of possible
     /// deleted document, and should be considered as an upper bound
     /// of the actual maximum value.
-    pub fn max_value(&self) -> Item {
+    fn max_value(&self) -> Item {
         Item::from_u64(self.max_value_u64)
     }
 }

-impl<Item: FastValue> From<Vec<Item>> for FastFieldReader<Item> {
-    fn from(vals: Vec<Item>) -> FastFieldReader<Item> {
+impl<Item: FastValue> From<Vec<Item>> for BitpackedFastFieldReader<Item> {
+    fn from(vals: Vec<Item>) -> BitpackedFastFieldReader<Item> {
         let mut schema_builder = Schema::builder();
         let field = schema_builder.add_u64_field("field", FAST);
         let schema = schema_builder.build();
@@ -126,7 +209,7 @@ impl<Item: FastValue> From<Vec<Item>> for FastFieldReader<Item> {
         let write: WritePtr = directory
             .open_write(path)
             .expect("With a RamDirectory, this should never fail.");
-        let mut serializer = FastFieldSerializer::from_write(write)
+        let mut serializer = CompositeFastFieldSerializer::from_write(write)
             .expect("With a RamDirectory, this should never fail.");
         let mut fast_field_writers = FastFieldsWriter::from_schema(&schema);
         {
@@ -148,6 +231,6 @@ impl<Item: FastValue> From<Vec<Item>> for FastFieldReader<Item> {
         let field_file = composite_file
             .open_read(field)
             .expect("File component not found");
-        FastFieldReader::open(field_file).unwrap()
+        BitpackedFastFieldReader::open(field_file).unwrap()
     }
 }
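
To make the bitpacked layout above concrete, here is a hedged, self-contained sketch of how a value is resolved (illustrative code, not tantivy's BitUnpacker; it only handles num_bits up to 57, assumes the data slice is padded so the 8-byte read never runs past the end, and omits the bounds checks the real reader needs):

    // Each value is stored as (val - min_value) on num_bits bits, back to back.
    fn get_bitpacked(doc: u64, min_value: u64, num_bits: u8, data: &[u8]) -> u64 {
        let bit_pos = doc * num_bits as u64;
        let byte_pos = (bit_pos / 8) as usize;
        // Read 8 bytes; any run of <= 57 bits starting inside byte_pos fits in them.
        let mut buf = [0u8; 8];
        buf.copy_from_slice(&data[byte_pos..byte_pos + 8]);
        let shifted = u64::from_le_bytes(buf) >> (bit_pos % 8);
        let mask = (1u64 << num_bits) - 1;
        min_value + (shifted & mask)
    }

The enum-dispatch design of `DynamicFastFieldReader` (as opposed to a `Box<dyn ...>`) keeps the per-document `get` call monomorphized and branch-predictable, which matters since fast fields sit on the hot path of collectors.
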
@@ -1,13 +1,16 @@
 use crate::common::CompositeFile;
 use crate::directory::FileSlice;
 use crate::fastfield::MultiValuedFastFieldReader;
+use crate::fastfield::{BitpackedFastFieldReader, FastFieldNotAvailableError};
 use crate::fastfield::{BytesFastFieldReader, FastValue};
-use crate::fastfield::{FastFieldNotAvailableError, FastFieldReader};
 use crate::schema::{Cardinality, Field, FieldType, Schema};
 use crate::space_usage::PerFieldSpaceUsage;
 use crate::TantivyError;

-/// Provides access to all of the FastFieldReader.
+use super::reader::DynamicFastFieldReader;
+use super::FastFieldReader;
+
+/// Provides access to all of the BitpackedFastFieldReader.
 ///
 /// Internally, `FastFieldReaders` have preloaded fast field readers,
 /// and just wraps several `HashMap`.
@@ -100,9 +103,9 @@ impl FastFieldReaders {
     pub(crate) fn typed_fast_field_reader<TFastValue: FastValue>(
         &self,
         field: Field,
-    ) -> crate::Result<FastFieldReader<TFastValue>> {
+    ) -> crate::Result<DynamicFastFieldReader<TFastValue>> {
         let fast_field_slice = self.fast_field_data(field, 0)?;
-        FastFieldReader::open(fast_field_slice)
+        DynamicFastFieldReader::open(fast_field_slice)
     }

     pub(crate) fn typed_fast_field_multi_reader<TFastValue: FastValue>(
@@ -111,16 +114,16 @@ impl FastFieldReaders {
     ) -> crate::Result<MultiValuedFastFieldReader<TFastValue>> {
         let fast_field_slice_idx = self.fast_field_data(field, 0)?;
         let fast_field_slice_vals = self.fast_field_data(field, 1)?;
-        let idx_reader = FastFieldReader::open(fast_field_slice_idx)?;
-        let vals_reader: FastFieldReader<TFastValue> =
-            FastFieldReader::open(fast_field_slice_vals)?;
+        let idx_reader = BitpackedFastFieldReader::open(fast_field_slice_idx)?;
+        let vals_reader: BitpackedFastFieldReader<TFastValue> =
+            BitpackedFastFieldReader::open(fast_field_slice_vals)?;
         Ok(MultiValuedFastFieldReader::open(idx_reader, vals_reader))
     }

     /// Returns the `u64` fast field reader associated to `field`.
     ///
     /// If `field` is not a u64 fast field, this method returns an Error.
-    pub fn u64(&self, field: Field) -> crate::Result<FastFieldReader<u64>> {
+    pub fn u64(&self, field: Field) -> crate::Result<DynamicFastFieldReader<u64>> {
         self.check_type(field, FastType::U64, Cardinality::SingleValue)?;
         self.typed_fast_field_reader(field)
     }
@@ -129,14 +132,14 @@ impl FastFieldReaders {
     /// field is effectively of type `u64` or not.
     ///
     /// If not, the fastfield reader will return the u64-value associated to the original FastValue.
-    pub fn u64_lenient(&self, field: Field) -> crate::Result<FastFieldReader<u64>> {
+    pub fn u64_lenient(&self, field: Field) -> crate::Result<DynamicFastFieldReader<u64>> {
         self.typed_fast_field_reader(field)
     }

     /// Returns the `i64` fast field reader associated to `field`.
     ///
     /// If `field` is not an i64 fast field, this method returns an Error.
-    pub fn i64(&self, field: Field) -> crate::Result<FastFieldReader<i64>> {
+    pub fn i64(&self, field: Field) -> crate::Result<impl FastFieldReader<i64>> {
         self.check_type(field, FastType::I64, Cardinality::SingleValue)?;
         self.typed_fast_field_reader(field)
     }
@@ -144,7 +147,7 @@ impl FastFieldReaders {
     /// Returns the `date` fast field reader associated to `field`.
     ///
     /// If `field` is not a date fast field, this method returns an Error.
-    pub fn date(&self, field: Field) -> crate::Result<FastFieldReader<crate::DateTime>> {
+    pub fn date(&self, field: Field) -> crate::Result<impl FastFieldReader<crate::DateTime>> {
         self.check_type(field, FastType::Date, Cardinality::SingleValue)?;
         self.typed_fast_field_reader(field)
     }
@@ -152,7 +155,7 @@ impl FastFieldReaders {
     /// Returns the `f64` fast field reader associated to `field`.
     ///
     /// If `field` is not a f64 fast field, this method returns an Error.
-    pub fn f64(&self, field: Field) -> crate::Result<FastFieldReader<f64>> {
+    pub fn f64(&self, field: Field) -> crate::Result<impl FastFieldReader<f64>> {
         self.check_type(field, FastType::F64, Cardinality::SingleValue)?;
         self.typed_fast_field_reader(field)
     }
@@ -213,7 +216,7 @@ impl FastFieldReaders {
                 )));
             }
             let fast_field_idx_file = self.fast_field_data(field, 0)?;
-            let idx_reader = FastFieldReader::open(fast_field_idx_file)?;
+            let idx_reader = BitpackedFastFieldReader::open(fast_field_idx_file)?;
             let data = self.fast_field_data(field, 1)?;
             BytesFastFieldReader::open(idx_reader, data)
         } else {
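
Since `u64()` and friends now return either the concrete `DynamicFastFieldReader` or an `impl FastFieldReader`, call sites keep the same shape as before the refactoring. A hedged usage sketch (it assumes a `searcher` over a segment with a FAST u64 `field` is already set up):

    // The FastFieldReader trait must be in scope for get() to resolve.
    let fast_fields = searcher.segment_reader(0).fast_fields();
    let reader = fast_fields.u64(field)?; // DynamicFastFieldReader<u64>
    let value: u64 = reader.get(0u32);    // value of the field for doc 0
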
@@ -7,10 +7,10 @@ use std::io::{self, Write};
 use tantivy_bitpacker::compute_num_bits;
 use tantivy_bitpacker::BitPacker;

-/// `FastFieldSerializer` is in charge of serializing
+/// `CompositeFastFieldSerializer` is in charge of serializing
 /// fastfields on disk.
 ///
-/// Fast fields are encoded using bit-packing.
+/// Fast fields have different encodings like bit-packing.
 ///
 /// `FastFieldWriter`s are in charge of pushing the data to
 /// the serializer.
@@ -27,16 +27,16 @@ use tantivy_bitpacker::BitPacker;
 /// * ...
 /// * `close_field()`
 /// * `close()`
-pub struct FastFieldSerializer {
+pub struct CompositeFastFieldSerializer {
     composite_write: CompositeWrite<WritePtr>,
 }

-impl FastFieldSerializer {
+impl CompositeFastFieldSerializer {
     /// Constructor
-    pub fn from_write(write: WritePtr) -> io::Result<FastFieldSerializer> {
+    pub fn from_write(write: WritePtr) -> io::Result<CompositeFastFieldSerializer> {
         // just making room for the pointer to header.
         let composite_write = CompositeWrite::wrap(write);
-        Ok(FastFieldSerializer { composite_write })
+        Ok(CompositeFastFieldSerializer { composite_write })
     }

     /// Start serializing a new u64 fast field
@@ -45,7 +45,7 @@ impl FastFieldSerializer {
         field: Field,
         min_value: u64,
         max_value: u64,
-    ) -> io::Result<FastSingleFieldSerializer<'_, CountingWriter<WritePtr>>> {
+    ) -> io::Result<DynamicFastFieldSerializer<'_, CountingWriter<WritePtr>>> {
         self.new_u64_fast_field_with_idx(field, min_value, max_value, 0)
     }

@@ -56,9 +56,9 @@ impl FastFieldSerializer {
         min_value: u64,
         max_value: u64,
         idx: usize,
-    ) -> io::Result<FastSingleFieldSerializer<'_, CountingWriter<WritePtr>>> {
+    ) -> io::Result<DynamicFastFieldSerializer<'_, CountingWriter<WritePtr>>> {
         let field_write = self.composite_write.for_field_with_idx(field, idx);
-        FastSingleFieldSerializer::open(field_write, min_value, max_value)
+        DynamicFastFieldSerializer::open(field_write, min_value, max_value)
     }

     /// Start serializing a new [u8] fast field
@@ -79,14 +79,111 @@ impl FastFieldSerializer {
     }
 }

-pub struct FastSingleFieldSerializer<'a, W: Write> {
+#[derive(Debug, Clone)]
+pub struct EstimationStats {
+    min_value: u64,
+    max_value: u64,
+}
+
+/// The FastFieldSerializer trait is the common interface
+/// implemented by every fastfield serializer variant.
+///
+/// `DynamicFastFieldSerializer` is the enum wrapping all variants.
+/// It is used to create a serializer instance.
+pub trait FastFieldSerializer {
+    /// add a value to the serializer
+    fn add_val(&mut self, val: u64) -> io::Result<()>;
+    /// finish serializing a field.
+    fn close_field(self) -> io::Result<()>;
+}
+
+/// The FastFieldSerializerEstimate trait is required on all variants
+/// of fast field compressions, to decide which one to choose.
+pub trait FastFieldSerializerEstimate {
+    /// returns an estimate of the compression ratio.
+    fn estimate(
+        /*fastfield_accessor: impl FastFieldReader<u64>,*/ stats: EstimationStats,
+    ) -> (f32, &'static str);
+    /// the unique name of the compressor
+    fn name() -> &'static str;
+}
+
+pub enum DynamicFastFieldSerializer<'a, W: Write> {
+    Bitpacked(BitpackedFastFieldSerializer<'a, W>),
+}
+
+impl<'a, W: Write> DynamicFastFieldSerializer<'a, W> {
+    /// Creates a new fast field serializer.
+    ///
+    /// The serializer in fact encodes the values by bitpacking
+    /// `(val - min_value)`.
+    ///
+    /// It requires a `min_value` and a `max_value` to compute
+    /// the minimum number of bits required to encode values.
+    pub fn open(
+        write: &'a mut W,
+        min_value: u64,
+        max_value: u64,
+    ) -> io::Result<DynamicFastFieldSerializer<'a, W>> {
+        let stats = EstimationStats {
+            min_value,
+            max_value,
+        };
+        let (_ratio, name) = (
+            BitpackedFastFieldSerializer::<Vec<u8>>::estimate(stats),
+            BitpackedFastFieldSerializer::<Vec<u8>>::name(),
+        );
+        Self::open_from_name(write, min_value, max_value, name)
+    }
+
+    /// Creates a new fast field serializer for the variant matching `name`.
+    pub fn open_from_name(
+        write: &'a mut W,
+        min_value: u64,
+        max_value: u64,
+        name: &str,
+    ) -> io::Result<DynamicFastFieldSerializer<'a, W>> {
+        // Weirdly the W generic on BitpackedFastFieldSerializer needs to be set,
+        // although name() doesn't use it
+        let variant = if name == BitpackedFastFieldSerializer::<Vec<u8>>::name() {
+            DynamicFastFieldSerializer::Bitpacked(BitpackedFastFieldSerializer::open(
+                write, min_value, max_value,
+            )?)
+        } else {
+            panic!("unknown fastfield serializer {}", name);
+        };
+
+        Ok(variant)
+    }
+}
+
+impl<'a, W: Write> FastFieldSerializer for DynamicFastFieldSerializer<'a, W> {
+    fn add_val(&mut self, val: u64) -> io::Result<()> {
+        match self {
+            Self::Bitpacked(serializer) => serializer.add_val(val),
+        }
+    }
+    fn close_field(self) -> io::Result<()> {
+        match self {
+            Self::Bitpacked(serializer) => serializer.close_field(),
+        }
+    }
+}
+
+pub struct BitpackedFastFieldSerializer<'a, W: Write> {
     bit_packer: BitPacker,
     write: &'a mut W,
     min_value: u64,
     num_bits: u8,
 }

-impl<'a, W: Write> FastSingleFieldSerializer<'a, W> {
+impl<'a, W: Write> BitpackedFastFieldSerializer<'a, W> {
     /// Creates a new fast field serializer.
     ///
     /// The serializer in fact encodes the values by bitpacking
@@ -99,34 +196,51 @@ impl<'a, W: Write> FastSingleFieldSerializer<'a, W> {
         write: &'a mut W,
         min_value: u64,
         max_value: u64,
-    ) -> io::Result<FastSingleFieldSerializer<'a, W>> {
+    ) -> io::Result<BitpackedFastFieldSerializer<'a, W>> {
         assert!(min_value <= max_value);
         min_value.serialize(write)?;
         let amplitude = max_value - min_value;
         amplitude.serialize(write)?;
         let num_bits = compute_num_bits(amplitude);
         let bit_packer = BitPacker::new();
-        Ok(FastSingleFieldSerializer {
+        Ok(BitpackedFastFieldSerializer {
             bit_packer,
             write,
             min_value,
             num_bits,
         })
     }
+}

+impl<'a, W: 'a + Write> FastFieldSerializer for BitpackedFastFieldSerializer<'a, W> {
     /// Pushes a new value to the currently open u64 fast field.
-    pub fn add_val(&mut self, val: u64) -> io::Result<()> {
+    fn add_val(&mut self, val: u64) -> io::Result<()> {
         let val_to_write: u64 = val - self.min_value;
         self.bit_packer
             .write(val_to_write, self.num_bits, &mut self.write)?;
         Ok(())
     }

-    pub fn close_field(mut self) -> io::Result<()> {
+    fn close_field(mut self) -> io::Result<()> {
         self.bit_packer.close(&mut self.write)
     }
 }

+impl<'a, W: 'a + Write> FastFieldSerializerEstimate for BitpackedFastFieldSerializer<'a, W> {
+    fn estimate(
+        /*_fastfield_accessor: impl FastFieldReader<u64>, */ stats: EstimationStats,
+    ) -> (f32, &'static str) {
+        let amplitude = stats.max_value - stats.min_value;
+        let num_bits = compute_num_bits(amplitude);
+        let num_bits_uncompressed = 64;
+        let ratio = num_bits as f32 / num_bits_uncompressed as f32;
+        let name = Self::name();
+        (ratio, name)
+    }
+    fn name() -> &'static str {
+        "Bitpacked"
+    }
+}
+
 pub struct FastBytesFieldSerializer<'a, W: Write> {
     write: &'a mut W,
 }
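
The `estimate` above boils down to `num_bits / 64`. A worked example of that arithmetic (standalone sketch, not tantivy code): values in [1_000, 1_255] span an amplitude of 255, which bitpacks into 8 bits, giving a ratio of 8/64 = 0.125:

    fn bitpacked_ratio(min_value: u64, max_value: u64) -> f32 {
        let amplitude = max_value - min_value;
        // same idea as tantivy_bitpacker::compute_num_bits
        let num_bits = 64 - amplitude.leading_zeros();
        num_bits as f32 / 64.0
    }

    fn main() {
        assert_eq!(bitpacked_ratio(1_000, 1_255), 0.125);
    }

With only the bitpacked variant implemented so far, the estimate is effectively a placeholder, but it gives later compressors a hook to compete on ratio.
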
@@ -1,6 +1,7 @@
 use super::multivalued::MultiValuedFastFieldWriter;
 use crate::common;
-use crate::fastfield::{BytesFastFieldWriter, FastFieldSerializer};
+use crate::fastfield::serializer::FastFieldSerializer;
+use crate::fastfield::{BytesFastFieldWriter, CompositeFastFieldSerializer};
 use crate::indexer::doc_id_mapping::DocIdMapping;
 use crate::postings::UnorderedTermId;
 use crate::schema::{Cardinality, Document, Field, FieldEntry, FieldType, Schema};
@@ -148,7 +149,7 @@ impl FastFieldsWriter {
     /// order to the fast field serializer.
     pub fn serialize(
         &self,
-        serializer: &mut FastFieldSerializer,
+        serializer: &mut CompositeFastFieldSerializer,
         mapping: &HashMap<Field, FnvHashMap<UnorderedTermId, TermOrdinal>>,
         doc_id_map: Option<&DocIdMapping>,
     ) -> io::Result<()> {
@@ -272,7 +273,7 @@ impl IntFastFieldWriter {
     /// Push the fast fields value to the `FastFieldWriter`.
     pub fn serialize(
         &self,
-        serializer: &mut FastFieldSerializer,
+        serializer: &mut CompositeFastFieldSerializer,
         doc_id_map: Option<&DocIdMapping>,
     ) -> io::Result<()> {
         let (min, max) = if self.val_min > self.val_max {
@@ -8,7 +8,6 @@ use crate::{
     DocId, IndexSortByField, Order, TantivyError,
 };
-use std::cmp::Reverse;

 /// Struct to provide mapping from old doc_id to new doc_id and vice versa
 pub struct DocIdMapping {
     new_doc_id_to_old: Vec<DocId>,
@@ -92,6 +91,7 @@ pub(crate) fn get_doc_id_mapping_from_field(

 #[cfg(test)]
 mod tests_indexsorting {
+    use crate::fastfield::FastFieldReader;
     use crate::{collector::TopDocs, query::QueryParser, schema::*};
     use crate::{schema::Schema, DocAddress};
     use crate::{Index, IndexSettings, IndexSortByField, Order};
@@ -175,6 +175,7 @@ mod tests_indexsorting {
                 field: "my_number".to_string(),
                 order: Order::Asc,
             }),
+            ..Default::default()
         }),
         option.clone(),
     );
@@ -206,6 +207,7 @@ mod tests_indexsorting {
                 field: "my_number".to_string(),
                 order: Order::Desc,
             }),
+            ..Default::default()
         }),
         option.clone(),
     );
@@ -264,6 +266,7 @@ mod tests_indexsorting {
                 field: "my_number".to_string(),
                 order: Order::Asc,
             }),
+            ..Default::default()
         }),
         get_text_options(),
     );
@@ -288,6 +291,7 @@ mod tests_indexsorting {
                 field: "my_number".to_string(),
                 order: Order::Desc,
             }),
+            ..Default::default()
         }),
         get_text_options(),
     );
@@ -322,6 +326,7 @@ mod tests_indexsorting {
                 field: "my_number".to_string(),
                 order: Order::Asc,
             }),
+            ..Default::default()
         }),
         get_text_options(),
     );
@@ -352,6 +357,7 @@ mod tests_indexsorting {
                 field: "my_number".to_string(),
                 order: Order::Desc,
             }),
+            ..Default::default()
         }),
         get_text_options(),
     );
@@ -387,6 +393,7 @@ mod tests_indexsorting {
                 field: "my_number".to_string(),
                 order: Order::Asc,
             }),
+            ..Default::default()
         }),
         get_text_options(),
     );
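
The repeated `..Default::default()` additions above are the mechanical consequence of `IndexSettings` gaining new fields in this release; struct-update syntax keeps every call site compiling as further settings are added. A hedged sketch of the pattern (field values illustrative, mirroring the test code):

    let settings = IndexSettings {
        sort_by_field: Some(IndexSortByField {
            field: "my_number".to_string(),
            order: Order::Asc,
        }),
        // any newly introduced settings keep their defaults
        ..Default::default()
    };
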
@@ -945,7 +945,7 @@ mod tests {
         let index_writer = index.writer(3_000_000).unwrap();
         assert_eq!(
             format!("{:?}", index_writer.get_merge_policy()),
-            "LogMergePolicy { min_merge_size: 8, max_merge_size: 10000000, min_layer_size: 10000, \
+            "LogMergePolicy { min_num_segments: 8, max_docs_before_merge: 10000000, min_layer_size: 10000, \
             level_log_size: 0.75 }"
         );
         let merge_policy = Box::new(NoMergePolicy::default());
@@ -1,19 +1,20 @@
 use super::merge_policy::{MergeCandidate, MergePolicy};
 use crate::core::SegmentMeta;
+use itertools::Itertools;
 use std::cmp;
 use std::f64;

 const DEFAULT_LEVEL_LOG_SIZE: f64 = 0.75;
 const DEFAULT_MIN_LAYER_SIZE: u32 = 10_000;
-const DEFAULT_MIN_MERGE_SIZE: usize = 8;
-const DEFAULT_MAX_MERGE_SIZE: usize = 10_000_000;
+const DEFAULT_MIN_NUM_SEGMENTS_IN_MERGE: usize = 8;
+const DEFAULT_MAX_DOCS_BEFORE_MERGE: usize = 10_000_000;

 /// `LogMergePolicy` tries to merge segments that have a similar number of
 /// documents.
 #[derive(Debug, Clone)]
 pub struct LogMergePolicy {
-    min_merge_size: usize,
-    max_merge_size: usize,
+    min_num_segments: usize,
+    max_docs_before_merge: usize,
     min_layer_size: u32,
     level_log_size: f64,
 }
@@ -23,15 +24,16 @@ impl LogMergePolicy {
         cmp::max(self.min_layer_size, size)
     }

-    /// Set the minimum number of segment that may be merge together.
-    pub fn set_min_merge_size(&mut self, min_merge_size: usize) {
-        self.min_merge_size = min_merge_size;
+    /// Set the minimum number of segments that may be merged together.
+    pub fn set_min_num_segments(&mut self, min_num_segments: usize) {
+        self.min_num_segments = min_num_segments;
     }

     /// Set the maximum number of docs in a segment for it to be considered for
-    /// merging.
-    pub fn set_max_merge_size(&mut self, max_merge_size: usize) {
-        self.max_merge_size = max_merge_size;
+    /// merging. A segment can still reach more than max_docs, by merging many
+    /// smaller ones.
+    pub fn set_max_docs_before_merge(&mut self, max_docs_merge_size: usize) {
+        self.max_docs_before_merge = max_docs_merge_size;
     }

     /// Set the minimum segment size under which all segment belong
@@ -42,7 +44,7 @@ impl LogMergePolicy {

     /// Set the ratio between two consecutive levels.
     ///
-    /// Segment are group in levels according to their sizes.
+    /// Segments are grouped in levels according to their sizes.
     /// These levels are defined as intervals of exponentially growing sizes.
     /// level_log_size define the factor by which one should multiply the limit
     /// to reach a level, in order to get the limit to reach the following
@@ -54,52 +56,43 @@ impl LogMergePolicy {

 impl MergePolicy for LogMergePolicy {
     fn compute_merge_candidates(&self, segments: &[SegmentMeta]) -> Vec<MergeCandidate> {
-        let mut size_sorted_tuples = segments
+        let mut size_sorted_segments = segments
             .iter()
-            .map(SegmentMeta::num_docs)
-            .enumerate()
-            .filter(|(_, s)| s <= &(self.max_merge_size as u32))
-            .collect::<Vec<(usize, u32)>>();
-
-        size_sorted_tuples.sort_by(|x, y| y.1.cmp(&(x.1)));
-
-        if size_sorted_tuples.len() <= 1 {
-            return Vec::new();
-        }
-
-        let size_sorted_log_tuples: Vec<_> = size_sorted_tuples
-            .into_iter()
-            .map(|(ind, num_docs)| (ind, f64::from(self.clip_min_size(num_docs)).log2()))
-            .collect();
-
-        if let Some(&(first_ind, first_score)) = size_sorted_log_tuples.first() {
-            let mut current_max_log_size = first_score;
-            let mut levels = vec![vec![first_ind]];
-            for &(ind, score) in (&size_sorted_log_tuples).iter().skip(1) {
-                if score < (current_max_log_size - self.level_log_size) {
-                    current_max_log_size = score;
-                    levels.push(Vec::new());
-                }
-                levels.last_mut().unwrap().push(ind);
-            }
-            levels
-                .iter()
-                .filter(|level| level.len() >= self.min_merge_size)
-                .map(|ind_vec| {
-                    MergeCandidate(ind_vec.iter().map(|&ind| segments[ind].id()).collect())
-                })
-                .collect()
-        } else {
-            Vec::new()
-        }
+            .filter(|segment_meta| segment_meta.num_docs() <= (self.max_docs_before_merge as u32))
+            .collect::<Vec<&SegmentMeta>>();
+
+        if size_sorted_segments.len() <= 1 {
+            return vec![];
+        }
+        size_sorted_segments.sort_by_key(|seg| std::cmp::Reverse(seg.num_docs()));
+
+        let mut current_max_log_size = f64::MAX;
+        let mut levels = vec![];
+        for (_, merge_group) in &size_sorted_segments.into_iter().group_by(|segment| {
+            let segment_log_size = f64::from(self.clip_min_size(segment.num_docs())).log2();
+            if segment_log_size < (current_max_log_size - self.level_log_size) {
+                // update current_max_log_size to create a new group
+                current_max_log_size = segment_log_size;
+            }
+            // return current_max_log_size to be grouped to the current group
+            current_max_log_size
+        }) {
+            levels.push(merge_group.collect::<Vec<&SegmentMeta>>());
+        }
+
+        levels
+            .iter()
+            .filter(|level| level.len() >= self.min_num_segments)
+            .map(|segments| MergeCandidate(segments.iter().map(|&seg| seg.id()).collect()))
+            .collect()
     }
 }

 impl Default for LogMergePolicy {
     fn default() -> LogMergePolicy {
         LogMergePolicy {
-            min_merge_size: DEFAULT_MIN_MERGE_SIZE,
-            max_merge_size: DEFAULT_MAX_MERGE_SIZE,
+            min_num_segments: DEFAULT_MIN_NUM_SEGMENTS_IN_MERGE,
+            max_docs_before_merge: DEFAULT_MAX_DOCS_BEFORE_MERGE,
             min_layer_size: DEFAULT_MIN_LAYER_SIZE,
             level_log_size: DEFAULT_LEVEL_LOG_SIZE,
         }
@@ -109,16 +102,79 @@ impl Default for LogMergePolicy {
 #[cfg(test)]
 mod tests {
     use super::*;
-    use crate::core::{SegmentId, SegmentMeta, SegmentMetaInventory};
-    use crate::indexer::merge_policy::MergePolicy;
+    use crate::{
+        core::{SegmentId, SegmentMeta, SegmentMetaInventory},
+        schema,
+    };
+    use crate::{indexer::merge_policy::MergePolicy, schema::INDEXED};
     use once_cell::sync::Lazy;

     static INVENTORY: Lazy<SegmentMetaInventory> = Lazy::new(SegmentMetaInventory::default);

+    use crate::Index;
+
+    #[test]
+    fn create_index_test_max_merge_issue_1035() {
+        let mut schema_builder = schema::Schema::builder();
+        let int_field = schema_builder.add_u64_field("intval", INDEXED);
+        let schema = schema_builder.build();
+
+        let index = Index::create_in_ram(schema);
+
+        {
+            let mut log_merge_policy = LogMergePolicy::default();
+            log_merge_policy.set_min_num_segments(1);
+            log_merge_policy.set_max_docs_before_merge(1);
+            log_merge_policy.set_min_layer_size(0);
+
+            let mut index_writer = index.writer_for_tests().unwrap();
+            index_writer.set_merge_policy(Box::new(log_merge_policy));
+
+            // after every commit the merge checker is started; it will merge only segments with 1
+            // element in them because of max_docs_before_merge.
+            index_writer.add_document(doc!(int_field=>1_u64));
+            assert!(index_writer.commit().is_ok());
+
+            index_writer.add_document(doc!(int_field=>2_u64));
+            assert!(index_writer.commit().is_ok());
+
+            index_writer.add_document(doc!(int_field=>3_u64));
+            assert!(index_writer.commit().is_ok());
+
+            index_writer.add_document(doc!(int_field=>4_u64));
+            assert!(index_writer.commit().is_ok());
+
+            index_writer.add_document(doc!(int_field=>5_u64));
+            assert!(index_writer.commit().is_ok());
+
+            index_writer.add_document(doc!(int_field=>6_u64));
+            assert!(index_writer.commit().is_ok());
+
+            index_writer.add_document(doc!(int_field=>7_u64));
+            assert!(index_writer.commit().is_ok());
+
+            index_writer.add_document(doc!(int_field=>8_u64));
+            assert!(index_writer.commit().is_ok());
+        }
+
+        let _segment_ids = index
+            .searchable_segment_ids()
+            .expect("Searchable segments failed.");
+
+        let reader = index.reader().unwrap();
+        let searcher = reader.searcher();
+        let segment_readers = searcher.segment_readers();
+        for segment in segment_readers {
+            if segment.num_docs() > 2 {
+                panic!("segment can't have more than two docs");
+            } // don't know how to wait for the merge, then it could be a simple eq
+        }
+    }
+
     fn test_merge_policy() -> LogMergePolicy {
         let mut log_merge_policy = LogMergePolicy::default();
-        log_merge_policy.set_min_merge_size(3);
-        log_merge_policy.set_max_merge_size(100_000);
+        log_merge_policy.set_min_num_segments(3);
+        log_merge_policy.set_max_docs_before_merge(100_000);
         log_merge_policy.set_min_layer_size(2);
         log_merge_policy
     }
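
The grouping logic above is easier to see on concrete numbers. A standalone sketch (not tantivy code) of the same level computation: with `level_log_size = 0.75` and `min_layer_size = 0`, the sizes `[100_000, 90_000, 1_000, 900, 10]` split into levels `[100_000, 90_000]`, `[1_000, 900]`, `[10]`, because a new level starts whenever log2(size) drops more than 0.75 below the largest segment of the current level:

    fn levels(mut sizes: Vec<u32>, level_log_size: f64, min_layer_size: u32) -> Vec<Vec<u32>> {
        sizes.sort_by_key(|s| std::cmp::Reverse(*s));
        let mut result: Vec<Vec<u32>> = Vec::new();
        let mut current_max_log = f64::MAX;
        for size in sizes {
            let log = f64::from(size.max(min_layer_size)).log2();
            if log < current_max_log - level_log_size {
                // a drop of more than level_log_size opens a new level
                current_max_log = log;
                result.push(Vec::new());
            }
            result.last_mut().unwrap().push(size);
        }
        result
    }
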
@@ -1,5 +1,8 @@
 use super::doc_id_mapping::DocIdMapping;
+use crate::error::DataCorruption;
+use crate::fastfield::CompositeFastFieldSerializer;
 use crate::fastfield::DeleteBitSet;
+use crate::fastfield::DynamicFastFieldReader;
 use crate::fastfield::FastFieldReader;
-use crate::fastfield::FastFieldSerializer;
 use crate::fastfield::MultiValuedFastFieldReader;
@@ -86,7 +89,7 @@ pub struct IndexMerger {
 }

 fn compute_min_max_val(
-    u64_reader: &FastFieldReader<u64>,
+    u64_reader: &impl FastFieldReader<u64>,
     max_doc: DocId,
     delete_bitset_opt: Option<&DeleteBitSet>,
 ) -> Option<(u64, u64)> {
@@ -182,6 +185,10 @@ impl IndexMerger {
                 readers.push(reader);
             }
         }
+        if let Some(sort_by_field) = index_settings.sort_by_field.as_ref() {
+            readers = Self::sort_readers_by_min_sort_field(readers, sort_by_field)?;
+        }
+        // sort segments by their natural sort setting
         if max_doc >= MAX_DOC_LIMIT {
             let err_msg = format!(
                 "The segment resulting from this merge would have {} docs,\
@@ -191,13 +198,37 @@ impl IndexMerger {
             return Err(crate::TantivyError::InvalidArgument(err_msg));
         }
         Ok(IndexMerger {
-            schema,
             index_settings,
+            schema,
             readers,
             max_doc,
         })
     }

+    fn sort_readers_by_min_sort_field(
+        readers: Vec<SegmentReader>,
+        sort_by_field: &IndexSortByField,
+    ) -> crate::Result<Vec<SegmentReader>> {
+        // presort the readers by their min_values, so that when they are disjunct, we can use
+        // the regular merge logic (implicitly sorted)
+        let mut readers_with_min_sort_values = readers
+            .into_iter()
+            .map(|reader| {
+                let accessor = Self::get_sort_field_accessor(&reader, &sort_by_field)?;
+                Ok((reader, accessor.min_value()))
+            })
+            .collect::<crate::Result<Vec<_>>>()?;
+        if sort_by_field.order.is_asc() {
+            readers_with_min_sort_values.sort_by_key(|(_, min_val)| *min_val);
+        } else {
+            readers_with_min_sort_values.sort_by_key(|(_, min_val)| std::cmp::Reverse(*min_val));
+        }
+        Ok(readers_with_min_sort_values
+            .into_iter()
+            .map(|(reader, _)| reader)
+            .collect())
+    }
+
     fn write_fieldnorms(
         &self,
         mut fieldnorms_serializer: FieldNormsSerializer,
@@ -208,9 +239,14 @@ impl IndexMerger {
         for field in fields {
             fieldnorms_data.clear();
             if let Some(doc_id_mapping) = doc_id_mapping {
+                let fieldnorms_readers: Vec<FieldNormReader> = self
+                    .readers
+                    .iter()
+                    .map(|reader| reader.get_fieldnorms_reader(field))
+                    .collect::<Result<_, _>>()?;
                 for (doc_id, reader_with_ordinal) in doc_id_mapping {
                     let fieldnorms_reader =
-                        reader_with_ordinal.reader.get_fieldnorms_reader(field)?;
+                        &fieldnorms_readers[reader_with_ordinal.ordinal as usize];
                     let fieldnorm_id = fieldnorms_reader.fieldnorm_id(*doc_id);
                     fieldnorms_data.push(fieldnorm_id);
                 }
@@ -231,7 +267,7 @@ impl IndexMerger {

     fn write_fast_fields(
         &self,
-        fast_field_serializer: &mut FastFieldSerializer,
+        fast_field_serializer: &mut CompositeFastFieldSerializer,
         mut term_ord_mappings: HashMap<Field, TermOrdinalMapping>,
         doc_id_mapping: &Option<Vec<(DocId, SegmentReaderWithOrdinal)>>,
     ) -> crate::Result<()> {
@@ -281,11 +317,11 @@ impl IndexMerger {
     fn write_single_fast_field(
         &self,
         field: Field,
-        fast_field_serializer: &mut FastFieldSerializer,
+        fast_field_serializer: &mut CompositeFastFieldSerializer,
         doc_id_mapping: &Option<Vec<(DocId, SegmentReaderWithOrdinal)>>,
     ) -> crate::Result<()> {
         let (min_value, max_value) = self.readers.iter().map(|reader|{
-            let u64_reader: FastFieldReader<u64> = reader
+            let u64_reader: DynamicFastFieldReader<u64> = reader
                 .fast_fields()
                 .typed_fast_field_reader(field)
                 .expect("Failed to find a reader for single fast field. This is a tantivy bug and it should never happen.");
@@ -300,7 +336,7 @@ impl IndexMerger {
             .readers
             .iter()
             .map(|reader| {
-                let u64_reader: FastFieldReader<u64> = reader
+                let u64_reader: DynamicFastFieldReader<u64> = reader
                     .fast_fields()
                     .typed_fast_field_reader(field)
                     .expect("Failed to find a reader for single fast field. This is a tantivy bug and it should never happen.");
@@ -328,7 +364,7 @@ impl IndexMerger {
         let u64_readers = self.readers.iter()
             .filter(|reader|reader.max_doc() != reader.delete_bitset().map(|bit_set|bit_set.len() as u32).unwrap_or(0))
             .map(|reader|{
-                let u64_reader: FastFieldReader<u64> = reader
+                let u64_reader: DynamicFastFieldReader<u64> = reader
                     .fast_fields()
                     .typed_fast_field_reader(field)
                     .expect("Failed to find a reader for single fast field. This is a tantivy bug and it should never happen.");
@@ -354,6 +390,60 @@ impl IndexMerger {
         }
     }

+    /// Checks if the readers are disjunct for their sort property and in the correct order to be
+    /// able to just stack them.
+    pub(crate) fn is_disjunct_and_sorted_on_sort_property(
+        &self,
+        sort_by_field: &IndexSortByField,
+    ) -> crate::Result<bool> {
+        let reader_and_field_accessors = self.get_reader_with_sort_field_accessor(sort_by_field)?;
+
+        let everything_is_in_order = reader_and_field_accessors
+            .into_iter()
+            .map(|reader| reader.1)
+            .tuple_windows()
+            .all(|(field_accessor1, field_accessor2)| {
+                if sort_by_field.order.is_asc() {
+                    field_accessor1.max_value() <= field_accessor2.min_value()
+                } else {
+                    field_accessor1.min_value() >= field_accessor2.max_value()
+                }
+            });
+        Ok(everything_is_in_order)
+    }
+
+    pub(crate) fn get_sort_field_accessor(
+        reader: &SegmentReader,
+        sort_by_field: &IndexSortByField,
+    ) -> crate::Result<impl FastFieldReader<u64>> {
+        let field_id = expect_field_id_for_sort_field(&reader.schema(), &sort_by_field)?; // for now expect fastfield, but not strictly required
+        let value_accessor = reader.fast_fields().u64_lenient(field_id)?;
+        Ok(value_accessor)
+    }
+    /// Collecting value_accessors into a vec to bind the lifetime.
+    pub(crate) fn get_reader_with_sort_field_accessor<'a, 'b>(
+        &'a self,
+        sort_by_field: &'b IndexSortByField,
+    ) -> crate::Result<
+        Vec<(
+            SegmentReaderWithOrdinal<'a>,
+            impl FastFieldReader<u64> + Clone,
+        )>,
+    > {
+        let reader_and_field_accessors = self
+            .readers
+            .iter()
+            .enumerate()
+            .map(Into::into)
+            .map(|reader_with_ordinal: SegmentReaderWithOrdinal| {
+                let value_accessor =
+                    Self::get_sort_field_accessor(reader_with_ordinal.reader, sort_by_field)?;
+                Ok((reader_with_ordinal, value_accessor))
+            })
+            .collect::<crate::Result<Vec<_>>>()?;
+        Ok(reader_and_field_accessors)
+    }
+
     /// Generates the doc_id mapping where position in the vec=new
     /// doc_id.
     /// ReaderWithOrdinal will include the ordinal position of the
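
`is_disjunct_and_sorted_on_sort_property` reduces to a pairwise range check over consecutive segments. A standalone sketch of the ascending case (illustrative, not tantivy code): given each segment's `(min, max)` sort-field range, stacking is safe when every segment's max stays at or below the next segment's min:

    fn is_disjunct_asc(segment_ranges: &[(u64, u64)]) -> bool {
        segment_ranges
            .windows(2)
            .all(|pair| pair[0].1 <= pair[1].0) // prev.max <= next.min
    }

    #[test]
    fn disjunct_examples() {
        // [1-3], [10-20], [50-500] stack cleanly; overlapping [1-30], [10-20] does not.
        assert!(is_disjunct_asc(&[(1, 3), (10, 20), (50, 500)]));
        assert!(!is_disjunct_asc(&[(1, 30), (10, 20)]));
    }
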
@@ -362,42 +452,26 @@ impl IndexMerger {
         &self,
         sort_by_field: &IndexSortByField,
     ) -> crate::Result<Vec<(DocId, SegmentReaderWithOrdinal)>> {
-        let reader_and_field_accessors = self
-            .readers
-            .iter()
-            .enumerate()
-            .map(|reader| {
-                let reader_with_ordinal: SegmentReaderWithOrdinal = reader.into();
-                let field_id = expect_field_id_for_sort_field(
-                    &reader_with_ordinal.reader.schema(),
-                    &sort_by_field,
-                )?; // for now expect fastfield, but not strictly required
-                let value_accessor = reader_with_ordinal
-                    .reader
-                    .fast_fields()
-                    .u64_lenient(field_id)?;
-                Ok((reader_with_ordinal, value_accessor))
-            })
-            .collect::<crate::Result<Vec<_>>>()?; // Collecting to bind the lifetime of value_accessor into the vec, or can't be used as a reference.
-        // Loading the field accessor on demand causes a 15x regression
+        let reader_and_field_accessors = self.get_reader_with_sort_field_accessor(sort_by_field)?;
+        // Loading the field accessor on demand causes a 15x regression

         // create iterators over segment/sort_accessor/doc_id tuple
-        let doc_id_reader_pair = reader_and_field_accessors
-            .iter()
-            .map(|reader_and_field_accessor| {
-                reader_and_field_accessor
-                    .0
-                    .reader
-                    .doc_ids_alive()
-                    .map(move |doc_id| {
-                        (
-                            doc_id,
-                            reader_and_field_accessor.0,
-                            &reader_and_field_accessor.1,
-                        )
-                    })
-            })
-            .collect::<Vec<_>>();
+        let doc_id_reader_pair =
+            reader_and_field_accessors
+                .iter()
+                .map(|reader_and_field_accessor| {
+                    reader_and_field_accessor
+                        .0
+                        .reader
+                        .doc_ids_alive()
+                        .map(move |doc_id| {
+                            (
+                                doc_id,
+                                reader_and_field_accessor.0,
+                                &reader_and_field_accessor.1,
+                            )
+                        })
+                });

         // create iterator tuple of (old doc_id, reader) in order of the new doc_ids
         let sorted_doc_ids: Vec<(DocId, SegmentReaderWithOrdinal)> = doc_id_reader_pair
@@ -424,7 +498,7 @@ impl IndexMerger {
     // is used to index the reader_and_field_accessors vec.
     fn write_1_n_fast_field_idx_generic(
         field: Field,
-        fast_field_serializer: &mut FastFieldSerializer,
+        fast_field_serializer: &mut CompositeFastFieldSerializer,
         doc_id_mapping: &Option<Vec<(DocId, SegmentReaderWithOrdinal)>>,
         reader_and_field_accessors: &[(&SegmentReader, impl MultiValueLength)],
     ) -> crate::Result<()> {
@@ -479,7 +553,7 @@ impl IndexMerger {
     fn write_multi_value_fast_field_idx(
         &self,
         field: Field,
-        fast_field_serializer: &mut FastFieldSerializer,
+        fast_field_serializer: &mut CompositeFastFieldSerializer,
         doc_id_mapping: &Option<Vec<(DocId, SegmentReaderWithOrdinal)>>,
     ) -> crate::Result<()> {
         let reader_and_field_accessors = self.readers.iter().map(|reader|{
@@ -501,7 +575,7 @@ impl IndexMerger {
         &self,
         field: Field,
         term_ordinal_mappings: &TermOrdinalMapping,
-        fast_field_serializer: &mut FastFieldSerializer,
+        fast_field_serializer: &mut CompositeFastFieldSerializer,
         doc_id_mapping: &Option<Vec<(DocId, SegmentReaderWithOrdinal)>>,
     ) -> crate::Result<()> {
         // Multifastfield consists in 2 fastfields.
@@ -564,7 +638,7 @@ impl IndexMerger {
     fn write_multi_fast_field(
         &self,
         field: Field,
-        fast_field_serializer: &mut FastFieldSerializer,
+        fast_field_serializer: &mut CompositeFastFieldSerializer,
         doc_id_mapping: &Option<Vec<(DocId, SegmentReaderWithOrdinal)>>,
     ) -> crate::Result<()> {
         // Multifastfield consists in 2 fastfields.
@@ -651,7 +725,7 @@ impl IndexMerger {
     fn write_bytes_fast_field(
         &self,
         field: Field,
-        fast_field_serializer: &mut FastFieldSerializer,
+        fast_field_serializer: &mut CompositeFastFieldSerializer,
         doc_id_mapping: &Option<Vec<(DocId, SegmentReaderWithOrdinal)>>,
     ) -> crate::Result<()> {
         let reader_and_field_accessors = self
@@ -797,13 +871,11 @@ impl IndexMerger {
         let mut total_doc_freq = 0;

         // Let's compute the list of non-empty posting lists
-        for heap_item in merged_terms.current_kvs() {
-            let segment_ord = heap_item.segment_ord;
-            let term_info = heap_item.streamer.value();
-            let segment_reader = &self.readers[heap_item.segment_ord];
+        for (segment_ord, term_info) in merged_terms.current_segment_ordinals_and_term_infos() {
+            let segment_reader = &self.readers[segment_ord];
             let inverted_index: &InvertedIndexReader = &*field_readers[segment_ord];
             let segment_postings = inverted_index
-                .read_postings_from_terminfo(term_info, segment_postings_option)?;
+                .read_postings_from_terminfo(&term_info, segment_postings_option)?;
             let delete_bitset_opt = segment_reader.delete_bitset();
             let doc_freq = if let Some(delete_bitset) = delete_bitset_opt {
                 segment_postings.doc_freq_given_deletes(delete_bitset)
@@ -927,19 +999,41 @@ impl IndexMerger {
             .collect();
         if let Some(doc_id_mapping) = doc_id_mapping {
             for (old_doc_id, reader_with_ordinal) in doc_id_mapping {
-                let store_reader = &mut document_iterators[reader_with_ordinal.ordinal as usize];
-                let raw_doc = store_reader.next().expect(&format!(
-                    "unexpected missing document in docstore on merge, doc id {:?}",
-                    old_doc_id
-                ))?;
-                store_writer.store_bytes(raw_doc.get_bytes())?;
+                let doc_bytes_it = &mut document_iterators[reader_with_ordinal.ordinal as usize];
+                if let Some(doc_bytes_res) = doc_bytes_it.next() {
+                    let doc_bytes = doc_bytes_res?;
+                    store_writer.store_bytes(&doc_bytes)?;
+                } else {
+                    return Err(DataCorruption::comment_only(&format!(
+                        "unexpected missing document in docstore on merge, doc id {:?}",
+                        old_doc_id
+                    ))
+                    .into());
+                }
             }
         } else {
             for reader in &self.readers {
                 let store_reader = reader.get_store_reader()?;
-                if reader.num_deleted_docs() > 0 {
-                    for raw_doc in store_reader.iter_raw(reader.delete_bitset()) {
-                        store_writer.store_bytes(raw_doc?.get_bytes())?;
+                if reader.num_deleted_docs() > 0
+                    // If there is not enough data in the store, we avoid stacking in order to
+                    // avoid creating many small blocks in the doc store. Once we have 5 full blocks,
+                    // we start stacking. In the worst case 2/7 of the blocks would be very small.
+                    // [segment 1 - {1 doc}][segment 2 - {fullblock * 5}{1doc}]
+                    // => 5 * full blocks, 2 * 1 document blocks
+                    //
+                    // In a more realistic scenario the segments are of the same size, so 1/6 of
+                    // the doc stores would be on average half full, given total randomness (which
+                    // is not the case here, but not sure how it behaves exactly).
+                    //
+                    // https://github.com/tantivy-search/tantivy/issues/1053
+                    //
+                    // take 7 in order to not walk over all checkpoints.
+                    || store_reader.block_checkpoints().take(7).count() < 6
+                    || store_reader.compressor() != store_writer.compressor()
+                {
+                    for doc_bytes_res in store_reader.iter_raw(reader.delete_bitset()) {
+                        let doc_bytes = doc_bytes_res?;
+                        store_writer.store_bytes(&doc_bytes)?;
                     }
                 } else {
                     store_writer.stack(&store_reader)?;
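
Condensed, the branch above decides between re-serializing documents one by one and stacking a segment's doc store wholesale. A hedged distillation of that predicate (names illustrative, not tantivy's API):

    // Stacking is only safe and worthwhile when nothing forces a rewrite:
    // no deleted docs to filter out, enough full blocks (at least 6 of the
    // first 7 checkpoints), and an identical compressor on both sides.
    fn can_stack(num_deleted_docs: u32, full_blocks_seen: usize, same_compressor: bool) -> bool {
        num_deleted_docs == 0 && full_blocks_seen >= 6 && same_compressor
    }
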
@@ -958,7 +1052,13 @@ impl SerializableSegment for IndexMerger {
|
||||
) -> crate::Result<u32> {
|
||||
let doc_id_mapping = if let Some(sort_by_field) = self.index_settings.sort_by_field.as_ref()
|
||||
{
|
||||
Some(self.generate_doc_id_mapping(sort_by_field)?)
|
||||
// If the documents are already sorted and stackable, we ignore the mapping and execute
|
||||
// it as if there was no sorting
|
||||
if self.is_disjunct_and_sorted_on_sort_property(sort_by_field)? {
|
||||
None
|
||||
} else {
|
||||
Some(self.generate_doc_id_mapping(sort_by_field)?)
|
||||
}
|
||||
} else {
|
||||
None
|
||||
};
|
||||
@@ -993,6 +1093,7 @@ mod tests {
|
||||
use crate::collector::tests::{BytesFastFieldTestCollector, FastFieldTestCollector};
|
||||
use crate::collector::{Count, FacetCollector};
|
||||
use crate::core::Index;
|
||||
use crate::fastfield::FastFieldReader;
|
||||
use crate::query::AllQuery;
|
||||
use crate::query::BooleanQuery;
|
||||
use crate::query::Scorer;
|
||||
@@ -1470,31 +1571,65 @@ mod tests {
|
||||
}
|
||||
#[test]
|
||||
fn test_merge_facets_sort_none() {
|
||||
test_merge_facets(None)
|
||||
test_merge_facets(None, true)
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_merge_facets_sort_asc() {
|
||||
// the data is already sorted asc, so this should have no effect, but go through the docid
|
||||
// mapping code
|
||||
test_merge_facets(Some(IndexSettings {
|
||||
sort_by_field: Some(IndexSortByField {
|
||||
field: "intval".to_string(),
|
||||
order: Order::Asc,
|
||||
// In the merge case this will go through the docid mapping code
|
||||
test_merge_facets(
|
||||
Some(IndexSettings {
|
||||
sort_by_field: Some(IndexSortByField {
|
||||
field: "intval".to_string(),
|
||||
order: Order::Desc,
|
||||
}),
|
||||
..Default::default()
|
||||
}),
|
||||
}));
|
||||
true,
|
||||
);
|
||||
// In the merge case this will not go through the docid mapping code, because the data is
|
||||
// sorted and disjunct
|
||||
test_merge_facets(
|
||||
Some(IndexSettings {
|
||||
sort_by_field: Some(IndexSortByField {
|
||||
field: "intval".to_string(),
|
||||
order: Order::Desc,
|
||||
}),
|
||||
..Default::default()
|
||||
}),
|
||||
false,
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_merge_facets_sort_desc() {
|
||||
test_merge_facets(Some(IndexSettings {
|
||||
sort_by_field: Some(IndexSortByField {
|
||||
field: "intval".to_string(),
|
||||
order: Order::Desc,
|
||||
// In the merge case this will go through the docid mapping code
|
||||
test_merge_facets(
|
||||
Some(IndexSettings {
|
||||
sort_by_field: Some(IndexSortByField {
|
||||
field: "intval".to_string(),
|
||||
order: Order::Desc,
|
||||
}),
|
||||
..Default::default()
|
||||
}),
|
||||
}));
|
||||
true,
|
||||
);
|
||||
// In the merge case this will not go through the docid mapping code, because the data is
|
||||
// sorted and disjunct
|
||||
test_merge_facets(
|
||||
Some(IndexSettings {
|
||||
sort_by_field: Some(IndexSortByField {
|
||||
field: "intval".to_string(),
|
||||
order: Order::Desc,
|
||||
}),
|
||||
..Default::default()
|
||||
}),
|
||||
false,
|
||||
);
|
||||
}
|
||||
fn test_merge_facets(index_settings: Option<IndexSettings>) {
|
||||
// force_segment_value_overlap forces the int value for sorting to have overlapping min and max
|
||||
// ranges between segments so that merge algorithm can't apply certain optimizations
|
||||
fn test_merge_facets(index_settings: Option<IndexSettings>, force_segment_value_overlap: bool) {
|
||||
let mut schema_builder = schema::Schema::builder();
|
||||
let facet_field = schema_builder.add_facet_field("facet", INDEXED);
|
||||
let int_options = IntOptions::default()
|
||||
@@ -1511,32 +1646,47 @@ mod tests {
|
||||
let mut int_val = 0;
|
||||
{
|
||||
let mut index_writer = index.writer_for_tests().unwrap();
|
||||
let mut index_doc = |index_writer: &mut IndexWriter, doc_facets: &[&str]| {
|
||||
let mut doc = Document::default();
|
||||
for facet in doc_facets {
|
||||
doc.add_facet(facet_field, Facet::from(facet));
|
||||
}
|
||||
doc.add_u64(int_field, int_val);
|
||||
int_val += 1;
|
||||
index_writer.add_document(doc);
|
||||
};
|
||||
let index_doc =
|
||||
|index_writer: &mut IndexWriter, doc_facets: &[&str], int_val: &mut u64| {
|
||||
let mut doc = Document::default();
|
||||
for facet in doc_facets {
|
||||
doc.add_facet(facet_field, Facet::from(facet));
|
||||
}
|
||||
doc.add_u64(int_field, *int_val);
|
||||
*int_val += 1;
|
||||
index_writer.add_document(doc);
|
||||
};
|
||||
|
||||
index_doc(&mut index_writer, &["/top/a/firstdoc", "/top/b"]);
|
||||
index_doc(&mut index_writer, &["/top/a/firstdoc", "/top/b", "/top/c"]);
|
||||
index_doc(&mut index_writer, &["/top/a", "/top/b"]);
|
||||
index_doc(&mut index_writer, &["/top/a"]);
|
||||
index_doc(
|
||||
&mut index_writer,
|
||||
&["/top/a/firstdoc", "/top/b"],
|
||||
&mut int_val,
|
||||
);
|
||||
index_doc(
|
||||
&mut index_writer,
|
||||
&["/top/a/firstdoc", "/top/b", "/top/c"],
|
||||
&mut int_val,
|
||||
);
|
||||
index_doc(&mut index_writer, &["/top/a", "/top/b"], &mut int_val);
|
||||
index_doc(&mut index_writer, &["/top/a"], &mut int_val);
|
||||
|
||||
index_doc(&mut index_writer, &["/top/b", "/top/d"]);
|
||||
index_doc(&mut index_writer, &["/top/d"]);
|
||||
index_doc(&mut index_writer, &["/top/e"]);
|
||||
index_doc(&mut index_writer, &["/top/b", "/top/d"], &mut int_val);
|
||||
if force_segment_value_overlap {
|
||||
index_doc(&mut index_writer, &["/top/d"], &mut 0);
|
||||
index_doc(&mut index_writer, &["/top/e"], &mut 10);
|
||||
index_writer.commit().expect("committed");
|
||||
index_doc(&mut index_writer, &["/top/a"], &mut 5); // 5 is between 0 - 10 so the segments don' have disjunct ranges
|
||||
} else {
|
||||
index_doc(&mut index_writer, &["/top/d"], &mut int_val);
|
||||
index_doc(&mut index_writer, &["/top/e"], &mut int_val);
|
||||
index_writer.commit().expect("committed");
|
||||
index_doc(&mut index_writer, &["/top/a"], &mut int_val);
|
||||
}
|
||||
index_doc(&mut index_writer, &["/top/b"], &mut int_val);
|
||||
index_doc(&mut index_writer, &["/top/c"], &mut int_val);
|
||||
index_writer.commit().expect("committed");
|
||||
|
||||
index_doc(&mut index_writer, &["/top/a"]);
|
||||
index_doc(&mut index_writer, &["/top/b"]);
|
||||
index_doc(&mut index_writer, &["/top/c"]);
|
||||
index_writer.commit().expect("committed");
|
||||
|
||||
index_doc(&mut index_writer, &["/top/e", "/top/f"]);
|
||||
index_doc(&mut index_writer, &["/top/e", "/top/f"], &mut int_val);
|
||||
index_writer.commit().expect("committed");
|
||||
}
|
||||
|
||||
@@ -1821,7 +1971,7 @@ mod tests {
|
||||
|
||||
// Make sure we'll attempt to merge every created segment
|
||||
let mut policy = crate::indexer::LogMergePolicy::default();
|
||||
policy.set_min_merge_size(2);
|
||||
policy.set_min_num_segments(2);
|
||||
writer.set_merge_policy(Box::new(policy));
|
||||
|
||||
for i in 0..100 {
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use crate::fastfield::FastFieldReader;
|
||||
use crate::{
|
||||
collector::TopDocs,
|
||||
schema::{Cardinality, TextFieldIndexing},
|
||||
@@ -39,6 +40,7 @@ mod tests {
|
||||
let mut index_writer = index.writer_for_tests().unwrap();
|
||||
|
||||
index_writer.add_document(doc!(int_field=>3_u64, facet_field=> Facet::from("/crime")));
|
||||
index_writer.add_document(doc!(int_field=>6_u64, facet_field=> Facet::from("/crime")));
|
||||
|
||||
assert!(index_writer.commit().is_ok());
|
||||
index_writer.add_document(doc!(int_field=>5_u64, facet_field=> Facet::from("/fanta")));
|
||||
@@ -58,7 +60,12 @@ mod tests {
|
||||
index
|
||||
}
|
||||
|
||||
fn create_test_index(index_settings: Option<IndexSettings>) -> Index {
// force_disjunct_segment_sort_values forces the field by which the index is sorted to have disjunct
// ranges between segments, e.g. values in segment [1-3] [10 - 20] [50 - 500]
fn create_test_index(
index_settings: Option<IndexSettings>,
force_disjunct_segment_sort_values: bool,
) -> Index {
let mut schema_builder = schema::Schema::builder();
let int_options = IntOptions::default()
.set_fast(Cardinality::SingleValue)
@@ -92,6 +99,7 @@ mod tests {
{
let mut index_writer = index.writer_for_tests().unwrap();

// segment 1 - range 1-3
index_writer.add_document(doc!(int_field=>1_u64));
index_writer.add_document(
doc!(int_field=>3_u64, multi_numbers => 3_u64, multi_numbers => 4_u64, bytes_field => vec![1, 2, 3], text_field => "some text", facet_field=> Facet::from("/book/crime")),
@@ -102,13 +110,26 @@ mod tests {
);

assert!(index_writer.commit().is_ok());
// segment 2 - range 1-20, with force_disjunct_segment_sort_values 10-20
index_writer.add_document(doc!(int_field=>20_u64, multi_numbers => 20_u64));
index_writer.add_document(doc!(int_field=>1_u64, text_field=> "deleteme", facet_field=> Facet::from("/book/crime")));

let in_val = if force_disjunct_segment_sort_values {
10_u64
} else {
1
};
index_writer.add_document(doc!(int_field=>in_val, text_field=> "deleteme", facet_field=> Facet::from("/book/crime")));
assert!(index_writer.commit().is_ok());
index_writer.add_document(
doc!(int_field=>10_u64, multi_numbers => 10_u64, multi_numbers => 11_u64, text_field=> "blubber", facet_field=> Facet::from("/book/fantasy")),
// segment 3 - range 5-1000, with force_disjunct_segment_sort_values 50-1000
let int_vals = if force_disjunct_segment_sort_values {
[100_u64, 50]
} else {
[10, 5]
};
index_writer.add_document( // position of this doc after delete in desc sorting = [2], in disjunct case [1]
doc!(int_field=>int_vals[0], multi_numbers => 10_u64, multi_numbers => 11_u64, text_field=> "blubber", facet_field=> Facet::from("/book/fantasy")),
);
index_writer.add_document(doc!(int_field=>5_u64, text_field=> "deleteme"));
index_writer.add_document(doc!(int_field=>int_vals[1], text_field=> "deleteme"));
index_writer.add_document(
doc!(int_field=>1_000u64, multi_numbers => 1001_u64, multi_numbers => 1002_u64, bytes_field => vec![5, 5], text_field => "the biggest num")
);
@@ -136,17 +157,30 @@ mod tests {
field: "intval".to_string(),
order: Order::Desc,
}),
..Default::default()
}));
}

#[test]
fn test_merge_sorted_index_desc() {
let index = create_test_index(Some(IndexSettings {
sort_by_field: Some(IndexSortByField {
field: "intval".to_string(),
order: Order::Desc,
fn test_merge_sorted_index_desc_not_disjunct() {
test_merge_sorted_index_desc_(false);
}
#[test]
fn test_merge_sorted_index_desc_disjunct() {
test_merge_sorted_index_desc_(true);
}

fn test_merge_sorted_index_desc_(force_disjunct_segment_sort_values: bool) {
let index = create_test_index(
Some(IndexSettings {
sort_by_field: Some(IndexSortByField {
field: "intval".to_string(),
order: Order::Desc,
}),
..Default::default()
}),
}));
force_disjunct_segment_sort_values,
);

let int_field = index.schema().get_field("intval").unwrap();
let reader = index.reader().unwrap();
@@ -160,8 +194,13 @@ mod tests {
assert_eq!(fast_field.get(5u32), 1u64);
assert_eq!(fast_field.get(4u32), 2u64);
assert_eq!(fast_field.get(3u32), 3u64);
assert_eq!(fast_field.get(2u32), 10u64);
assert_eq!(fast_field.get(1u32), 20u64);
if force_disjunct_segment_sort_values {
assert_eq!(fast_field.get(2u32), 20u64);
assert_eq!(fast_field.get(1u32), 100u64);
} else {
assert_eq!(fast_field.get(2u32), 10u64);
assert_eq!(fast_field.get(1u32), 20u64);
}
assert_eq!(fast_field.get(0u32), 1_000u64);

// test new field norm mapping
@@ -169,8 +208,13 @@ mod tests {
let my_text_field = index.schema().get_field("text_field").unwrap();
let fieldnorm_reader = segment_reader.get_fieldnorms_reader(my_text_field).unwrap();
assert_eq!(fieldnorm_reader.fieldnorm(0), 3); // the biggest num
assert_eq!(fieldnorm_reader.fieldnorm(1), 0);
assert_eq!(fieldnorm_reader.fieldnorm(2), 1); // blubber
if force_disjunct_segment_sort_values {
assert_eq!(fieldnorm_reader.fieldnorm(1), 1); // blubber
assert_eq!(fieldnorm_reader.fieldnorm(2), 0);
} else {
assert_eq!(fieldnorm_reader.fieldnorm(1), 0);
assert_eq!(fieldnorm_reader.fieldnorm(2), 1); // blubber
}
assert_eq!(fieldnorm_reader.fieldnorm(3), 2); // some text
assert_eq!(fieldnorm_reader.fieldnorm(5), 0);
}
@@ -191,13 +235,22 @@ mod tests {
};

assert_eq!(do_search("some"), vec![3]);
assert_eq!(do_search("blubber"), vec![2]);
if force_disjunct_segment_sort_values {
assert_eq!(do_search("blubber"), vec![1]);
} else {
assert_eq!(do_search("blubber"), vec![2]);
}
assert_eq!(do_search("biggest"), vec![0]);
}

// access doc store
{
let doc = searcher.doc(DocAddress::new(0, 2)).unwrap();
let blubber_pos = if force_disjunct_segment_sort_values {
1
} else {
2
};
let doc = searcher.doc(DocAddress::new(0, blubber_pos)).unwrap();
assert_eq!(
doc.get_first(my_text_field).unwrap().text(),
Some("blubber")
@@ -209,12 +262,16 @@ mod tests {

#[test]
fn test_merge_sorted_index_asc() {
let index = create_test_index(Some(IndexSettings {
sort_by_field: Some(IndexSortByField {
field: "intval".to_string(),
order: Order::Asc,
let index = create_test_index(
Some(IndexSettings {
sort_by_field: Some(IndexSortByField {
field: "intval".to_string(),
order: Order::Asc,
}),
..Default::default()
}),
}));
false,
);

let int_field = index.schema().get_field("intval").unwrap();
let multi_numbers = index.schema().get_field("multi_numbers").unwrap();
@@ -315,7 +372,6 @@ mod bench_sorted_index_merge {
use crate::IndexSortByField;
use crate::IndexWriter;
use crate::Order;
use futures::executor::block_on;
use test::{self, Bencher};
fn create_index(sort_by_field: Option<IndexSortByField>) -> Index {
let mut schema_builder = Schema::builder();
@@ -323,12 +379,12 @@ mod bench_sorted_index_merge {
.set_fast(Cardinality::SingleValue)
.set_indexed();
let int_field = schema_builder.add_u64_field("intval", int_options);
let int_field = schema_builder.add_u64_field("intval", int_options);
let schema = schema_builder.build();

let index_builder = Index::builder()
.schema(schema)
.settings(IndexSettings { sort_by_field });
let index_builder = Index::builder().schema(schema).settings(IndexSettings {
sort_by_field,
..Default::default()
});
let index = index_builder.create_in_ram().unwrap();

{
@@ -366,7 +422,7 @@ mod bench_sorted_index_merge {
b.iter(|| {

let sorted_doc_ids = doc_id_mapping.iter().map(|(doc_id, reader)|{
let u64_reader: FastFieldReader<u64> = reader
let u64_reader: FastFieldReader<u64> = reader.reader
.fast_fields()
.typed_fast_field_reader(field)
.expect("Failed to find a reader for single fast field. This is a tantivy bug and it should never happen.");
@@ -391,7 +447,7 @@ mod bench_sorted_index_merge {
order: Order::Desc,
};
let index = create_index(Some(sort_by_field.clone()));
let field = index.schema().get_field("intval").unwrap();
//let field = index.schema().get_field("intval").unwrap();
let segments = index.searchable_segments().unwrap();
let merger: IndexMerger =
IndexMerger::open(index.schema(), index.settings().clone(), &segments[..])?;

@@ -1,6 +1,6 @@
use crate::core::Segment;
use crate::core::SegmentComponent;
use crate::fastfield::FastFieldSerializer;
use crate::fastfield::CompositeFastFieldSerializer;
use crate::fieldnorm::FieldNormsSerializer;
use crate::postings::InvertedIndexSerializer;
use crate::store::StoreWriter;
@@ -10,7 +10,7 @@ use crate::store::StoreWriter;
pub struct SegmentSerializer {
segment: Segment,
pub(crate) store_writer: StoreWriter,
fast_field_serializer: FastFieldSerializer,
fast_field_serializer: CompositeFastFieldSerializer,
fieldnorms_serializer: Option<FieldNormsSerializer>,
postings_serializer: InvertedIndexSerializer,
}
@@ -33,15 +33,16 @@ impl SegmentSerializer {
let store_write = segment.open_write(store_component)?;

let fast_field_write = segment.open_write(SegmentComponent::FastFields)?;
let fast_field_serializer = FastFieldSerializer::from_write(fast_field_write)?;
let fast_field_serializer = CompositeFastFieldSerializer::from_write(fast_field_write)?;

let fieldnorms_write = segment.open_write(SegmentComponent::FieldNorms)?;
let fieldnorms_serializer = FieldNormsSerializer::from_write(fieldnorms_write)?;

let postings_serializer = InvertedIndexSerializer::open(&mut segment)?;
let compressor = segment.index().settings().docstore_compression;
Ok(SegmentSerializer {
segment,
store_writer: StoreWriter::new(store_write),
store_writer: StoreWriter::new(store_write, compressor),
fast_field_serializer,
fieldnorms_serializer: Some(fieldnorms_serializer),
postings_serializer,
@@ -67,7 +68,7 @@ impl SegmentSerializer {
}

/// Accessor to the `FastFieldSerializer`.
pub fn get_fast_field_serializer(&mut self) -> &mut FastFieldSerializer {
pub fn get_fast_field_serializer(&mut self) -> &mut CompositeFastFieldSerializer {
&mut self.fast_field_serializer
}

@@ -345,8 +345,11 @@ fn write(
let store_write = serializer
.segment_mut()
.open_write(SegmentComponent::Store)?;
let old_store_writer =
std::mem::replace(&mut serializer.store_writer, StoreWriter::new(store_write));
let compressor = serializer.segment().index().settings().docstore_compression;
let old_store_writer = std::mem::replace(
&mut serializer.store_writer,
StoreWriter::new(store_write, compressor),
);
old_store_writer.close()?;
let store_read = StoreReader::open(
serializer
@@ -354,12 +357,9 @@ fn write(
.open_read(SegmentComponent::TempStore)?,
)?;
for old_doc_id in doc_id_map.iter_old_doc_ids() {
let raw_doc = store_read.get_raw(*old_doc_id)?;
serializer
.get_store_writer()
.store_bytes(raw_doc.get_bytes())?;
let doc_bytes = store_read.get_document_bytes(*old_doc_id)?;
serializer.get_store_writer().store_bytes(&doc_bytes)?;
}
// TODO delete temp store
}
serializer.close()?;
Ok(())

@@ -178,7 +178,7 @@ use once_cell::sync::Lazy;
use serde::{Deserialize, Serialize};

/// Index format version.
const INDEX_FORMAT_VERSION: u32 = 3;
const INDEX_FORMAT_VERSION: u32 = 4;

/// Structure version for the index.
#[derive(Clone, PartialEq, Eq, Serialize, Deserialize)]
@@ -187,7 +187,6 @@ pub struct Version {
minor: u32,
patch: u32,
index_format_version: u32,
store_compression: String,
}

impl fmt::Debug for Version {
@@ -201,14 +200,13 @@ static VERSION: Lazy<Version> = Lazy::new(|| Version {
minor: env!("CARGO_PKG_VERSION_MINOR").parse().unwrap(),
patch: env!("CARGO_PKG_VERSION_PATCH").parse().unwrap(),
index_format_version: INDEX_FORMAT_VERSION,
store_compression: crate::store::COMPRESSION.to_string(),
});

impl ToString for Version {
fn to_string(&self) -> String {
format!(
"tantivy v{}.{}.{}, index_format v{}, store_compression: {}",
self.major, self.minor, self.patch, self.index_format_version, self.store_compression
"tantivy v{}.{}.{}, index_format v{}",
self.major, self.minor, self.patch, self.index_format_version
)
}
}
@@ -293,6 +291,7 @@ mod tests {
use crate::collector::tests::TEST_COLLECTOR_WITH_SCORE;
use crate::core::SegmentReader;
use crate::docset::{DocSet, TERMINATED};
use crate::fastfield::FastFieldReader;
use crate::query::BooleanQuery;
use crate::schema::*;
use crate::DocAddress;

@@ -6,7 +6,7 @@ use crate::query::Weight;
use crate::schema::IndexRecordOption;
use crate::schema::Term;
use crate::Searcher;
use std::collections::BTreeSet;
use std::collections::BTreeMap;

/// The boolean query returns a set of documents
/// that matches the Boolean combination of constituent subqueries.
@@ -159,9 +159,9 @@ impl Query for BooleanQuery {
Ok(Box::new(BooleanWeight::new(sub_weights, scoring_enabled)))
}

fn query_terms(&self, term_set: &mut BTreeSet<Term>) {
fn query_terms(&self, terms: &mut BTreeMap<Term, bool>) {
for (_occur, subquery) in &self.subqueries {
subquery.query_terms(term_set);
subquery.query_terms(terms);
}
}
}

@@ -2,7 +2,7 @@ use crate::fastfield::DeleteBitSet;
use crate::query::explanation::does_not_match;
use crate::query::{Explanation, Query, Scorer, Weight};
use crate::{DocId, DocSet, Score, Searcher, SegmentReader, Term};
use std::collections::BTreeSet;
use std::collections::BTreeMap;
use std::fmt;

/// `BoostQuery` is a wrapper over a query used to boost its score.
@@ -48,8 +48,8 @@ impl Query for BoostQuery {
Ok(boosted_weight)
}

fn query_terms(&self, term_set: &mut BTreeSet<Term>) {
self.query.query_terms(term_set)
fn query_terms(&self, terms: &mut BTreeMap<Term, bool>) {
self.query.query_terms(terms)
}
}

@@ -11,7 +11,7 @@ mod exclude;
mod explanation;
mod fuzzy_query;
mod intersection;
mod mlt;
mod more_like_this;
mod phrase_query;
mod query;
mod query_parser;
@@ -46,7 +46,7 @@ pub use self::explanation::Explanation;
pub(crate) use self::fuzzy_query::DfaWrapper;
pub use self::fuzzy_query::FuzzyTermQuery;
pub use self::intersection::intersect_scorers;
pub use self::mlt::{MoreLikeThisQuery, MoreLikeThisQueryBuilder};
pub use self::more_like_this::{MoreLikeThisQuery, MoreLikeThisQueryBuilder};
pub use self::phrase_query::PhraseQuery;
pub use self::query::{Query, QueryClone};
pub use self::query_parser::QueryParser;
@@ -66,7 +66,7 @@ mod tests {
use crate::schema::{Schema, TEXT};
use crate::Index;
use crate::Term;
use std::collections::BTreeSet;
use std::collections::BTreeMap;

#[test]
fn test_query_terms() {
@@ -78,49 +78,49 @@ mod tests {
let term_a = Term::from_field_text(text_field, "a");
let term_b = Term::from_field_text(text_field, "b");
{
let mut terms_set: BTreeSet<Term> = BTreeSet::new();
let mut terms: BTreeMap<Term, bool> = Default::default();
query_parser
.parse_query("a")
.unwrap()
.query_terms(&mut terms_set);
let terms: Vec<&Term> = terms_set.iter().collect();
assert_eq!(vec![&term_a], terms);
.query_terms(&mut terms);
let terms: Vec<(&Term, &bool)> = terms.iter().collect();
assert_eq!(vec![(&term_a, &false)], terms);
}
{
let mut terms_set: BTreeSet<Term> = BTreeSet::new();
let mut terms: BTreeMap<Term, bool> = Default::default();
query_parser
.parse_query("a b")
.unwrap()
.query_terms(&mut terms_set);
let terms: Vec<&Term> = terms_set.iter().collect();
assert_eq!(vec![&term_a, &term_b], terms);
.query_terms(&mut terms);
let terms: Vec<(&Term, &bool)> = terms.iter().collect();
assert_eq!(vec![(&term_a, &false), (&term_b, &false)], terms);
}
{
let mut terms_set: BTreeSet<Term> = BTreeSet::new();
let mut terms: BTreeMap<Term, bool> = Default::default();
query_parser
.parse_query("\"a b\"")
.unwrap()
.query_terms(&mut terms_set);
let terms: Vec<&Term> = terms_set.iter().collect();
assert_eq!(vec![&term_a, &term_b], terms);
.query_terms(&mut terms);
let terms: Vec<(&Term, &bool)> = terms.iter().collect();
assert_eq!(vec![(&term_a, &true), (&term_b, &true)], terms);
}
{
let mut terms_set: BTreeSet<Term> = BTreeSet::new();
let mut terms: BTreeMap<Term, bool> = Default::default();
query_parser
.parse_query("a a a a a")
.unwrap()
.query_terms(&mut terms_set);
let terms: Vec<&Term> = terms_set.iter().collect();
assert_eq!(vec![&term_a], terms);
.query_terms(&mut terms);
let terms: Vec<(&Term, &bool)> = terms.iter().collect();
assert_eq!(vec![(&term_a, &false)], terms);
}
{
let mut terms_set: BTreeSet<Term> = BTreeSet::new();
let mut terms: BTreeMap<Term, bool> = Default::default();
query_parser
.parse_query("a -b")
.unwrap()
.query_terms(&mut terms_set);
let terms: Vec<&Term> = terms_set.iter().collect();
assert_eq!(vec![&term_a, &term_b], terms);
.query_terms(&mut terms);
let terms: Vec<(&Term, &bool)> = terms.iter().collect();
assert_eq!(vec![(&term_a, &false), (&term_b, &false)], terms);
}
}
}

@@ -1,5 +1,5 @@
mod mlt;
mod more_like_this;
mod query;

pub use self::mlt::MoreLikeThis;
pub use self::more_like_this::MoreLikeThis;
pub use self::query::{MoreLikeThisQuery, MoreLikeThisQueryBuilder};
@@ -233,10 +232,9 @@ impl MoreLikeThis {
}
FieldType::U64(_) => {
for field_value in field_values {
let val = field_value
.value()
.u64_value()
.ok_or(TantivyError::InvalidArgument("invalid value".to_string()))?;
let val = field_value.value().u64_value().ok_or_else(|| {
TantivyError::InvalidArgument("invalid value".to_string())
})?;
if !self.is_noise_word(val.to_string()) {
let term = Term::from_field_u64(field, val);
*term_frequencies.entry(term).or_insert(0) += 1;
@@ -249,7 +248,7 @@ impl MoreLikeThis {
let val = field_value
.value()
.date_value()
.ok_or(TantivyError::InvalidArgument("invalid value".to_string()))?
.ok_or_else(|| TantivyError::InvalidArgument("invalid value".to_string()))?
.timestamp();
if !self.is_noise_word(val.to_string()) {
let term = Term::from_field_i64(field, val);
@@ -259,10 +258,9 @@ impl MoreLikeThis {
}
FieldType::I64(_) => {
for field_value in field_values {
let val = field_value
.value()
.i64_value()
.ok_or(TantivyError::InvalidArgument("invalid value".to_string()))?;
let val = field_value.value().i64_value().ok_or_else(|| {
TantivyError::InvalidArgument("invalid value".to_string())
})?;
if !self.is_noise_word(val.to_string()) {
let term = Term::from_field_i64(field, val);
*term_frequencies.entry(term).or_insert(0) += 1;
@@ -271,10 +269,9 @@ impl MoreLikeThis {
}
FieldType::F64(_) => {
for field_value in field_values {
let val = field_value
.value()
.f64_value()
.ok_or(TantivyError::InvalidArgument("invalid value".to_string()))?;
let val = field_value.value().f64_value().ok_or_else(|| {
TantivyError::InvalidArgument("invalid value".to_string())
})?;
if !self.is_noise_word(val.to_string()) {
let term = Term::from_field_f64(field, val);
*term_frequencies.entry(term).or_insert(0) += 1;
@@ -306,7 +303,7 @@ impl MoreLikeThis {
{
return true;
}
return self.stop_words.contains(&word);
self.stop_words.contains(&word)
}

/// Computes the score for each term while ignoring terms that are not useful
@@ -1,3 +1,5 @@
use std::collections::BTreeMap;

use super::PhraseWeight;
use crate::core::searcher::Searcher;
use crate::query::bm25::Bm25Weight;
@@ -5,7 +7,6 @@ use crate::query::Query;
use crate::query::Weight;
use crate::schema::IndexRecordOption;
use crate::schema::{Field, Term};
use std::collections::BTreeSet;

/// `PhraseQuery` matches a specific sequence of words.
///
@@ -113,9 +114,9 @@ impl Query for PhraseQuery {
Ok(Box::new(phrase_weight))
}

fn query_terms(&self, term_set: &mut BTreeSet<Term>) {
for (_, query_term) in &self.phrase_terms {
term_set.insert(query_term.clone());
fn query_terms(&self, terms: &mut BTreeMap<Term, bool>) {
for (_, term) in &self.phrase_terms {
terms.insert(term.clone(), true);
}
}
}

@@ -4,7 +4,7 @@ use crate::query::Explanation;
use crate::DocAddress;
use crate::Term;
use downcast_rs::impl_downcast;
use std::collections::BTreeSet;
use std::collections::BTreeMap;
use std::fmt;

/// The `Query` trait defines a set of documents and a scoring method
@@ -68,7 +68,10 @@ pub trait Query: QueryClone + Send + Sync + downcast_rs::Downcast + fmt::Debug {

/// Extract all of the terms associated to the query and insert them in the
/// term set given in arguments.
fn query_terms(&self, _term_set: &mut BTreeSet<Term>) {}
///
/// Each term is associated with a boolean indicating whether
/// positions are required or not.
fn query_terms(&self, _term_set: &mut BTreeMap<Term, bool>) {}
}

/// Implements `box_clone`.
@@ -95,8 +98,8 @@ impl Query for Box<dyn Query> {
self.as_ref().count(searcher)
}

fn query_terms(&self, term_set: &mut BTreeSet<Term<Vec<u8>>>) {
self.as_ref().query_terms(term_set);
fn query_terms(&self, terms: &mut BTreeMap<Term, bool>) {
self.as_ref().query_terms(terms);
}
}

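For callers, the switch from a BTreeSet to a BTreeMap means each term now arrives paired with its position requirement. A minimal sketch of collecting them from an arbitrary query; the free function and its name are illustrative, not part of the patch:

    use std::collections::BTreeMap;
    use tantivy::query::Query;
    use tantivy::Term;

    // Collects every term of a query; `true` marks terms that need positions,
    // e.g. terms contributed by a PhraseQuery.
    fn collect_terms(query: &dyn Query) -> BTreeMap<Term, bool> {
        let mut terms: BTreeMap<Term, bool> = BTreeMap::new();
        query.query_terms(&mut terms);
        terms
    }
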
@@ -8,7 +8,7 @@ use crate::query::Query;
use crate::query::RangeQuery;
use crate::query::TermQuery;
use crate::query::{AllQuery, BoostQuery};
use crate::schema::{Facet, IndexRecordOption};
use crate::schema::{Facet, FacetParseError, IndexRecordOption};
use crate::schema::{Field, Schema};
use crate::schema::{FieldType, Term};
use crate::tokenizer::TokenizerManager;
@@ -68,6 +68,9 @@ pub enum QueryParserError {
/// The format for the date field is not RFC 3339 compliant.
#[error("The date field has an invalid format")]
DateFormatError(chrono::ParseError),
/// The format for the facet field is invalid.
#[error("The facet field is malformed: {0}")]
FacetFormatError(FacetParseError),
}

impl From<ParseIntError> for QueryParserError {
@@ -88,6 +91,12 @@ impl From<chrono::ParseError> for QueryParserError {
}
}

impl From<FacetParseError> for QueryParserError {
fn from(err: FacetParseError) -> QueryParserError {
QueryParserError::FacetFormatError(err)
}
}

/// Recursively remove empty clause from the AST
///
/// Returns `None` iff the `logical_ast` ended up being empty.
@@ -358,10 +367,10 @@ impl QueryParser {
))
}
}
FieldType::HierarchicalFacet(_) => {
let facet = Facet::from_text(phrase);
Ok(vec![(0, Term::from_field_text(field, facet.encoded_str()))])
}
FieldType::HierarchicalFacet(_) => match Facet::from_text(phrase) {
Ok(facet) => Ok(vec![(0, Term::from_field_text(field, facet.encoded_str()))]),
Err(e) => Err(QueryParserError::from(e)),
},
FieldType::Bytes(_) => {
let bytes = base64::decode(phrase).map_err(QueryParserError::ExpectedBase64)?;
let term = Term::from_field_bytes(field, &bytes);
@@ -1027,6 +1036,19 @@ mod test {
.is_ok());
}

#[test]
pub fn test_query_parser_expected_facet() {
let query_parser = make_query_parser();
match query_parser.parse_query("facet:INVALID") {
Ok(_) => panic!("should never succeed"),
Err(e) => assert_eq!(
"The facet field is malformed: Failed to parse the facet string: 'INVALID'",
format!("{}", e)
),
}
assert!(query_parser.parse_query("facet:\"/foo/bar\"").is_ok());
}

#[test]
pub fn test_query_parser_not_empty_but_no_tokens() {
let query_parser = make_query_parser();

@@ -5,7 +5,7 @@ use crate::query::{Explanation, Query};
use crate::schema::IndexRecordOption;
use crate::Searcher;
use crate::Term;
use std::collections::BTreeSet;
use std::collections::BTreeMap;
use std::fmt;

/// A Term query matches all of the documents
@@ -127,7 +127,7 @@ impl Query for TermQuery {
self.specialized_weight(searcher, scoring_enabled)?,
))
}
fn query_terms(&self, term_set: &mut BTreeSet<Term>) {
term_set.insert(self.term.clone());
fn query_terms(&self, terms: &mut BTreeMap<Term, bool>) {
terms.insert(self.term.clone(), false);
}
}

@@ -20,6 +20,14 @@ pub const FACET_SEP_BYTE: u8 = 0u8;
/// representation of facets. (It is the null codepoint.)
pub const FACET_SEP_CHAR: char = '\u{0}';

/// An error enum for facet parser.
#[derive(Debug, PartialEq, Eq, Error)]
pub enum FacetParseError {
/// The facet text representation is unparsable.
#[error("Failed to parse the facet string: '{0}'")]
FacetParseError(String),
}

/// A Facet represents a point in a given hierarchy.
///
/// They are typically represented similarly to a filepath.
@@ -75,11 +83,47 @@ impl Facet {
/// Conceptually, if one of the steps of this path
/// contains a `/` or a `\`, it should be escaped
/// using a backslash `\`.
pub fn from_text<T>(path: &T) -> Facet
pub fn from_text<T>(path: &T) -> Result<Facet, FacetParseError>
where
T: ?Sized + AsRef<str>,
{
From::from(path)
#[derive(Copy, Clone)]
enum State {
Escaped,
Idle,
}
let path_ref = path.as_ref();
if path_ref.is_empty() {
return Err(FacetParseError::FacetParseError(path_ref.to_string()));
}
if !path_ref.starts_with('/') {
return Err(FacetParseError::FacetParseError(path_ref.to_string()));
}
let mut facet_encoded = String::new();
let mut state = State::Idle;
let path_bytes = path_ref.as_bytes();
let mut last_offset = 1;
for i in 1..path_bytes.len() {
let c = path_bytes[i];
match (state, c) {
(State::Idle, ESCAPE_BYTE) => {
facet_encoded.push_str(&path_ref[last_offset..i]);
last_offset = i + 1;
state = State::Escaped
}
(State::Idle, SLASH_BYTE) => {
facet_encoded.push_str(&path_ref[last_offset..i]);
facet_encoded.push(FACET_SEP_CHAR);
last_offset = i + 1;
}
(State::Escaped, _escaped_char) => {
state = State::Idle;
}
(State::Idle, _any_char) => {}
}
}
facet_encoded.push_str(&path_ref[last_offset..]);
Ok(Facet(facet_encoded))
}

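A short usage sketch of the now-fallible parser (the facet paths are illustrative):

    use tantivy::schema::Facet;

    fn main() {
        // Facet paths must be non-empty and start with a '/'.
        let facet = Facet::from_text("/book/crime").unwrap();
        assert_eq!(facet.to_path_string(), "/book/crime");
        // Invalid input now surfaces as a FacetParseError instead of panicking.
        assert!(Facet::from_text("INVALID").is_err());
    }
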
/// Returns a `Facet` from an iterator over the different
@@ -137,39 +181,7 @@ impl Borrow<str> for Facet {

impl<'a, T: ?Sized + AsRef<str>> From<&'a T> for Facet {
fn from(path_asref: &'a T) -> Facet {
#[derive(Copy, Clone)]
enum State {
Escaped,
Idle,
}
let path: &str = path_asref.as_ref();
assert!(!path.is_empty());
assert!(path.starts_with('/'));
let mut facet_encoded = String::new();
let mut state = State::Idle;
let path_bytes = path.as_bytes();
let mut last_offset = 1;
for i in 1..path_bytes.len() {
let c = path_bytes[i];
match (state, c) {
(State::Idle, ESCAPE_BYTE) => {
facet_encoded.push_str(&path[last_offset..i]);
last_offset = i + 1;
state = State::Escaped
}
(State::Idle, SLASH_BYTE) => {
facet_encoded.push_str(&path[last_offset..i]);
facet_encoded.push(FACET_SEP_CHAR);
last_offset = i + 1;
}
(State::Escaped, _escaped_char) => {
state = State::Idle;
}
(State::Idle, _any_char) => {}
}
}
facet_encoded.push_str(&path[last_offset..]);
Facet(facet_encoded)
Facet::from_text(path_asref).unwrap()
}
}

@@ -226,7 +238,7 @@ impl Debug for Facet {
#[cfg(test)]
mod tests {

use super::Facet;
use super::{Facet, FacetParseError};

#[test]
fn test_root() {
@@ -288,4 +300,12 @@ mod tests {
let facet = Facet::from_path(v.iter());
assert_eq!(facet.to_path_string(), "/");
}

#[test]
fn test_from_text() {
assert_eq!(
Err(FacetParseError::FacetParseError("INVALID".to_string())),
Facet::from_text("INVALID")
);
}
}

@@ -128,6 +128,7 @@ pub use self::schema::{Schema, SchemaBuilder};
pub use self::value::Value;

pub use self::facet::Facet;
pub use self::facet::FacetParseError;
pub(crate) use self::facet::FACET_SEP_BYTE;
pub use self::facet_options::FacetOptions;

@@ -7,7 +7,6 @@ use crate::{Document, Score};
use htmlescape::encode_minimal;
use std::cmp::Ordering;
use std::collections::BTreeMap;
use std::collections::BTreeSet;
use std::ops::Range;

const DEFAULT_MAX_NUM_CHARS: usize = 150;
@@ -239,10 +238,10 @@ impl SnippetGenerator {
query: &dyn Query,
field: Field,
) -> crate::Result<SnippetGenerator> {
let mut terms = BTreeSet::new();
let mut terms = BTreeMap::new();
query.query_terms(&mut terms);
let mut terms_text: BTreeMap<String, Score> = Default::default();
for term in terms {
for (term, _) in terms {
if term.field() != field {
continue;
}

@@ -1,10 +1,6 @@
use std::io;

/// Name of the compression scheme used in the doc store.
///
/// This name is appended to the version string of tantivy.
pub const COMPRESSION: &'static str = "brotli";

#[inline]
pub fn compress(mut uncompressed: &[u8], compressed: &mut Vec<u8>) -> io::Result<()> {
let mut params = brotli::enc::BrotliEncoderParams::default();
params.quality = 5;
@@ -13,6 +9,7 @@ pub fn compress(mut uncompressed: &[u8], compressed: &mut Vec<u8>) -> io::Result
Ok(())
}

#[inline]
pub fn decompress(mut compressed: &[u8], decompressed: &mut Vec<u8>) -> io::Result<()> {
decompressed.clear();
brotli::BrotliDecompress(&mut compressed, decompressed)?;

@@ -1,22 +0,0 @@
use std::io::{self, Read, Write};

/// Name of the compression scheme used in the doc store.
///
/// This name is appended to the version string of tantivy.
pub const COMPRESSION: &str = "lz4";

pub fn compress(uncompressed: &[u8], compressed: &mut Vec<u8>) -> io::Result<()> {
compressed.clear();
let mut encoder = lz4::EncoderBuilder::new().build(compressed)?;
encoder.write_all(&uncompressed)?;
let (_, encoder_result) = encoder.finish();
encoder_result?;
Ok(())
}

pub fn decompress(compressed: &[u8], decompressed: &mut Vec<u8>) -> io::Result<()> {
decompressed.clear();
let mut decoder = lz4::Decoder::new(compressed)?;
decoder.read_to_end(decompressed)?;
Ok(())
}
@@ -2,38 +2,46 @@ use std::io::{self};

use core::convert::TryInto;
use lz4_flex::{compress_into, decompress_into};
/// Name of the compression scheme used in the doc store.
///
/// This name is appended to the version string of tantivy.
pub const COMPRESSION: &str = "lz4_block";

#[inline]
pub fn compress(uncompressed: &[u8], compressed: &mut Vec<u8>) -> io::Result<()> {
compressed.clear();
let maximum_ouput_size = lz4_flex::block::get_maximum_output_size(uncompressed.len());
compressed.reserve(maximum_ouput_size);

compressed.extend_from_slice(&[0, 0, 0, 0]);
compress_into(uncompressed, compressed);
unsafe {
compressed.set_len(maximum_ouput_size + 4);
}
let bytes_written = compress_into(uncompressed, compressed, 4)
.map_err(|err| io::Error::new(io::ErrorKind::InvalidData, err.to_string()))?;
let num_bytes = uncompressed.len() as u32;
compressed[0..4].copy_from_slice(&num_bytes.to_le_bytes());
unsafe {
compressed.set_len(bytes_written + 4);
}
Ok(())
}

#[inline]
pub fn decompress(compressed: &[u8], decompressed: &mut Vec<u8>) -> io::Result<()> {
decompressed.clear();
// the next lz4_flex version will support a slice as input parameter,
// which will make this usage much less ugly
let uncompressed_size_bytes: &[u8; 4] = compressed
.get(..4)
.ok_or(io::ErrorKind::InvalidData)?
.try_into()
.unwrap();
let uncompressed_size = u32::from_le_bytes(*uncompressed_size_bytes) as usize;
// reserve more than required, because blocked writes may write out of bounds, will be improved
// with lz4_flex 1.0
decompressed.reserve(uncompressed_size + 4 + 24);
decompressed.reserve(uncompressed_size);
unsafe {
decompressed.set_len(uncompressed_size);
}
decompress_into(&compressed[4..], decompressed)
let bytes_written = decompress_into(&compressed[4..], decompressed, 0)
.map_err(|err| io::Error::new(io::ErrorKind::InvalidData, err.to_string()))?;
if bytes_written != uncompressed_size {
return Err(io::Error::new(
io::ErrorKind::InvalidData,
"doc store block not completely decompressed, data corruption".to_string(),
));
}
Ok(())
}

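The framing scheme is worth spelling out: the uncompressed length is prepended as a little-endian u32 so the decompressor can size its output buffer before decoding, and a length mismatch is treated as corruption. A minimal sketch of that framing idea, independent of lz4_flex:

    use std::convert::TryInto;

    // Prefixes a payload with its length as a little-endian u32.
    fn frame(payload: &[u8]) -> Vec<u8> {
        let mut framed = Vec::with_capacity(4 + payload.len());
        framed.extend_from_slice(&(payload.len() as u32).to_le_bytes());
        framed.extend_from_slice(payload);
        framed
    }

    // Recovers the payload, validating the length prefix.
    fn unframe(framed: &[u8]) -> Option<&[u8]> {
        let len_bytes: [u8; 4] = framed.get(..4)?.try_into().ok()?;
        let len = u32::from_le_bytes(len_bytes) as usize;
        framed.get(4..4 + len)
    }
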
@@ -1,10 +1,6 @@
use std::io::{self, Read, Write};

/// Name of the compression scheme used in the doc store.
///
/// This name is appended to the version string of tantivy.
pub const COMPRESSION: &str = "snappy";

#[inline]
pub fn compress(uncompressed: &[u8], compressed: &mut Vec<u8>) -> io::Result<()> {
compressed.clear();
let mut encoder = snap::write::FrameEncoder::new(compressed);
@@ -13,6 +9,7 @@ pub fn compress(uncompressed: &[u8], compressed: &mut Vec<u8>) -> io::Result<()>
Ok(())
}

#[inline]
pub fn decompress(compressed: &[u8], decompressed: &mut Vec<u8>) -> io::Result<()> {
decompressed.clear();
snap::read::FrameDecoder::new(compressed).read_to_end(decompressed)?;

134
src/store/compressors.rs
Normal file
@@ -0,0 +1,134 @@
use serde::{Deserialize, Serialize};
use std::io;

pub trait StoreCompressor {
fn compress(&self, uncompressed: &[u8], compressed: &mut Vec<u8>) -> io::Result<()>;
fn decompress(&self, compressed: &[u8], decompressed: &mut Vec<u8>) -> io::Result<()>;
fn get_compressor_id() -> u8;
}

/// Compressor can be used on `IndexSettings` to choose
/// the compressor used to compress the doc store.
///
/// The default is Lz4Block, but also depends on the enabled feature flags.
#[derive(Clone, Debug, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub enum Compressor {
#[serde(rename = "lz4")]
/// Use the lz4 compressor (block format)
Lz4,
#[serde(rename = "brotli")]
/// Use the brotli compressor
Brotli,
#[serde(rename = "snappy")]
/// Use the snap compressor
Snappy,
}

impl Default for Compressor {
fn default() -> Self {
if cfg!(feature = "lz4-compression") {
Compressor::Lz4
} else if cfg!(feature = "brotli-compression") {
Compressor::Brotli
} else if cfg!(feature = "snappy-compression") {
Compressor::Snappy
} else {
panic!(
"all compressor feature flags are disabled (e.g. lz4-compression), can't choose default compressor"
);
}
}
}

impl Compressor {
pub(crate) fn from_id(id: u8) -> Compressor {
match id {
1 => Compressor::Lz4,
2 => Compressor::Brotli,
3 => Compressor::Snappy,
_ => panic!("unknown compressor id {:?}", id),
}
}
pub(crate) fn get_id(&self) -> u8 {
match self {
Self::Lz4 => 1,
Self::Brotli => 2,
Self::Snappy => 3,
}
}
#[inline]
pub(crate) fn compress(&self, uncompressed: &[u8], compressed: &mut Vec<u8>) -> io::Result<()> {
match self {
Self::Lz4 => {
#[cfg(feature = "lz4-compression")]
{
super::compression_lz4_block::compress(uncompressed, compressed)
}
#[cfg(not(feature = "lz4-compression"))]
{
panic!("lz4-compression feature flag not activated");
}
}
Self::Brotli => {
#[cfg(feature = "brotli-compression")]
{
super::compression_brotli::compress(uncompressed, compressed)
}
#[cfg(not(feature = "brotli-compression"))]
{
panic!("brotli-compression feature flag not activated");
}
}
Self::Snappy => {
#[cfg(feature = "snappy-compression")]
{
super::compression_snap::compress(uncompressed, compressed)
}
#[cfg(not(feature = "snappy-compression"))]
{
panic!("snappy-compression feature flag not activated");
}
}
}
}

#[inline]
pub(crate) fn decompress(
&self,
compressed: &[u8],
decompressed: &mut Vec<u8>,
) -> io::Result<()> {
match self {
Self::Lz4 => {
#[cfg(feature = "lz4-compression")]
{
super::compression_lz4_block::decompress(compressed, decompressed)
}
#[cfg(not(feature = "lz4-compression"))]
{
panic!("lz4-compression feature flag not activated");
}
}
Self::Brotli => {
#[cfg(feature = "brotli-compression")]
{
super::compression_brotli::decompress(compressed, decompressed)
}
#[cfg(not(feature = "brotli-compression"))]
{
panic!("brotli-compression feature flag not activated");
}
}
Self::Snappy => {
#[cfg(feature = "snappy-compression")]
{
super::compression_snap::decompress(compressed, decompressed)
}
#[cfg(not(feature = "snappy-compression"))]
{
panic!("snappy-compression feature flag not activated");
}
}
}
}
}
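To show the new setting in context, here is a sketch of creating an index with an explicit doc store compressor. It assumes `docstore_compression` is an accessible field of `IndexSettings`, as the in-crate tests further below suggest:

    use tantivy::schema::{Schema, STORED, TEXT};
    use tantivy::store::Compressor;
    use tantivy::{Index, IndexSettings};

    fn main() {
        let mut schema_builder = Schema::builder();
        schema_builder.add_text_field("body", TEXT | STORED);
        let schema = schema_builder.build();
        // Assumption: `docstore_compression` is settable here;
        // the remaining settings keep their defaults.
        let settings = IndexSettings {
            docstore_compression: Compressor::Brotli,
            ..Default::default()
        };
        let _index = Index::builder()
            .schema(schema)
            .settings(settings)
            .create_in_ram()
            .unwrap();
    }
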
69
src/store/footer.rs
Normal file
@@ -0,0 +1,69 @@
use crate::{
common::{BinarySerializable, FixedSize, HasLen},
directory::FileSlice,
store::Compressor,
};
use std::io;

#[derive(Debug, Clone, PartialEq)]
pub struct DocStoreFooter {
pub offset: u64,
pub compressor: Compressor,
}

/// Serializes the footer to a byte array:
/// - offset: 8 bytes
/// - compressor id: 1 byte
/// - reserved for future use: 15 bytes
impl BinarySerializable for DocStoreFooter {
fn serialize<W: io::Write>(&self, writer: &mut W) -> io::Result<()> {
BinarySerializable::serialize(&self.offset, writer)?;
BinarySerializable::serialize(&self.compressor.get_id(), writer)?;
writer.write_all(&[0; 15])?;
Ok(())
}

fn deserialize<R: io::Read>(reader: &mut R) -> io::Result<Self> {
let offset = u64::deserialize(reader)?;
let compressor_id = u8::deserialize(reader)?;
let mut skip_buf = [0; 15];
reader.read_exact(&mut skip_buf)?;
Ok(DocStoreFooter {
offset,
compressor: Compressor::from_id(compressor_id),
})
}
}

impl FixedSize for DocStoreFooter {
const SIZE_IN_BYTES: usize = 24;
}

impl DocStoreFooter {
pub fn new(offset: u64, compressor: Compressor) -> Self {
DocStoreFooter { offset, compressor }
}

pub fn extract_footer(file: FileSlice) -> io::Result<(DocStoreFooter, FileSlice)> {
if file.len() < DocStoreFooter::SIZE_IN_BYTES {
return Err(io::Error::new(
io::ErrorKind::UnexpectedEof,
format!(
"File corrupted. The file is smaller than Footer::SIZE_IN_BYTES (len={}).",
file.len()
),
));
}
let (body, footer_slice) = file.split_from_end(DocStoreFooter::SIZE_IN_BYTES);
let mut footer_bytes = footer_slice.read_bytes()?;
let footer = DocStoreFooter::deserialize(&mut footer_bytes)?;
Ok((footer, body))
}
}

#[test]
fn doc_store_footer_test() {
// This test is just to safeguard changes on the footer.
// When the doc store footer is updated, make sure to also update the serialize/deserialize methods.
assert_eq!(core::mem::size_of::<DocStoreFooter>(), 16);
}
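As a reading aid, the 24-byte layout documented above (8-byte offset, 1-byte compressor id, 15 reserved bytes) round-trips as follows; this is an in-crate sketch, since DocStoreFooter is not part of the public API:

    // Serialize a footer and read it back (in-crate sketch).
    let footer = DocStoreFooter::new(42, Compressor::Lz4);
    let mut buffer: Vec<u8> = Vec::new();
    footer.serialize(&mut buffer).unwrap();
    assert_eq!(buffer.len(), DocStoreFooter::SIZE_IN_BYTES); // 8 + 1 + 15 = 24
    let decoded = DocStoreFooter::deserialize(&mut &buffer[..]).unwrap();
    assert_eq!(footer, decoded);
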
264
src/store/mod.rs
@@ -33,73 +33,32 @@ and should rely on either

!*/

mod compressors;
mod footer;
mod index;
mod reader;
mod writer;
pub use self::reader::RawDocument;
pub use self::compressors::Compressor;
pub use self::reader::StoreReader;
pub use self::writer::StoreWriter;

// compile_error doesn't scale very well, enum-like feature flags would be great to have in Rust
#[cfg(all(feature = "lz4", feature = "brotli"))]
compile_error!("feature `lz4` or `brotli` must not be enabled together.");

#[cfg(all(feature = "lz4_block", feature = "brotli"))]
compile_error!("feature `lz4_block` or `brotli` must not be enabled together.");

#[cfg(all(feature = "lz4_block", feature = "lz4"))]
compile_error!("feature `lz4_block` or `lz4` must not be enabled together.");

#[cfg(all(feature = "lz4_block", feature = "snap"))]
compile_error!("feature `lz4_block` or `snap` must not be enabled together.");

#[cfg(all(feature = "lz4", feature = "snap"))]
compile_error!("feature `lz4` or `snap` must not be enabled together.");

#[cfg(all(feature = "brotli", feature = "snap"))]
compile_error!("feature `brotli` or `snap` must not be enabled together.");

#[cfg(not(any(
feature = "lz4",
feature = "brotli",
feature = "lz4_flex",
feature = "snap"
)))]
compile_error!("all compressors are deactivated via feature-flags, check Cargo.toml for available decompressors.");

#[cfg(feature = "lz4_flex")]
#[cfg(feature = "lz4-compression")]
mod compression_lz4_block;
#[cfg(feature = "lz4_flex")]
pub use self::compression_lz4_block::COMPRESSION;
#[cfg(feature = "lz4_flex")]
use self::compression_lz4_block::{compress, decompress};

#[cfg(feature = "lz4")]
mod compression_lz4;
#[cfg(feature = "lz4")]
pub use self::compression_lz4::COMPRESSION;
#[cfg(feature = "lz4")]
use self::compression_lz4::{compress, decompress};

#[cfg(feature = "brotli")]
#[cfg(feature = "brotli-compression")]
mod compression_brotli;
#[cfg(feature = "brotli")]
pub use self::compression_brotli::COMPRESSION;
#[cfg(feature = "brotli")]
use self::compression_brotli::{compress, decompress};

#[cfg(feature = "snap")]
#[cfg(feature = "snappy-compression")]
mod compression_snap;
#[cfg(feature = "snap")]
pub use self::compression_snap::COMPRESSION;
#[cfg(feature = "snap")]
use self::compression_snap::{compress, decompress};

#[cfg(test)]
pub mod tests {

use futures::executor::block_on;

use super::*;
use crate::schema::{self, FieldValue, TextFieldIndexing};
use crate::fastfield::DeleteBitSet;
use crate::schema::{self, FieldValue, TextFieldIndexing, STORED, TEXT};
use crate::schema::{Document, TextOptions};
use crate::{
directory::{Directory, RamDirectory, WritePtr},
@@ -108,28 +67,31 @@ pub mod tests {
use crate::{schema::Schema, Index};
use std::path::Path;

pub fn write_lorem_ipsum_store(writer: WritePtr, num_docs: usize) -> Schema {
let mut schema_builder = Schema::builder();
let field_body = schema_builder.add_text_field("body", TextOptions::default().set_stored());
let field_title =
schema_builder.add_text_field("title", TextOptions::default().set_stored());
let schema = schema_builder.build();
let lorem = String::from(
"Doc Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed \
const LOREM: &str = "Doc Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed \
do eiusmod tempor incididunt ut labore et dolore magna aliqua. \
Ut enim ad minim veniam, quis nostrud exercitation ullamco \
laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure \
dolor in reprehenderit in voluptate velit esse cillum dolore eu \
fugiat nulla pariatur. Excepteur sint occaecat cupidatat non \
proident, sunt in culpa qui officia deserunt mollit anim id est \
laborum.",
);
laborum.";

pub fn write_lorem_ipsum_store(
writer: WritePtr,
num_docs: usize,
compressor: Compressor,
) -> Schema {
let mut schema_builder = Schema::builder();
let field_body = schema_builder.add_text_field("body", TextOptions::default().set_stored());
let field_title =
schema_builder.add_text_field("title", TextOptions::default().set_stored());
let schema = schema_builder.build();
{
let mut store_writer = StoreWriter::new(writer);
let mut store_writer = StoreWriter::new(writer, compressor);
for i in 0..num_docs {
let mut fields: Vec<FieldValue> = Vec::new();
{
let field_value = FieldValue::new(field_body, From::from(lorem.clone()));
let field_value = FieldValue::new(field_body, From::from(LOREM.to_string()));
fields.push(field_value);
}
{
@@ -146,16 +108,51 @@ pub mod tests {
schema
}

const NUM_DOCS: usize = 1_000;
#[test]
fn test_store() -> crate::Result<()> {
fn test_doc_store_iter_with_delete_bug_1077() -> crate::Result<()> {
// this will cover deletion of the first element in a checkpoint
let deleted_docids = (200..300).collect::<Vec<_>>();
let delete_bitset = DeleteBitSet::for_test(&deleted_docids, NUM_DOCS as u32);

let path = Path::new("store");
let directory = RamDirectory::create();
let store_wrt = directory.open_write(path)?;
let schema = write_lorem_ipsum_store(store_wrt, 1_000);
let schema = write_lorem_ipsum_store(store_wrt, NUM_DOCS, Compressor::Lz4);
let field_title = schema.get_field("title").unwrap();
let store_file = directory.open_read(path)?;
let store = StoreReader::open(store_file)?;
for i in 0..1_000 {
for i in 0..NUM_DOCS as u32 {
assert_eq!(
*store
.get(i)?
.get_first(field_title)
.unwrap()
.text()
.unwrap(),
format!("Doc {}", i)
);
}
for (_, doc) in store.iter(Some(&delete_bitset)).enumerate() {
let doc = doc?;
let title_content = doc.get_first(field_title).unwrap().text().unwrap();
if !title_content.starts_with("Doc ") {
panic!("unexpected title_content {}", title_content);
}
}

Ok(())
}

fn test_store(compressor: Compressor) -> crate::Result<()> {
let path = Path::new("store");
let directory = RamDirectory::create();
let store_wrt = directory.open_write(path)?;
let schema = write_lorem_ipsum_store(store_wrt, NUM_DOCS, compressor);
let field_title = schema.get_field("title").unwrap();
let store_file = directory.open_read(path)?;
let store = StoreReader::open(store_file)?;
for i in 0..NUM_DOCS as u32 {
assert_eq!(
*store
.get(i)?
@@ -175,6 +172,22 @@ pub mod tests {
Ok(())
}

#[cfg(feature = "lz4-compression")]
#[test]
fn test_store_lz4_block() -> crate::Result<()> {
test_store(Compressor::Lz4)
}
#[cfg(feature = "snappy-compression")]
#[test]
fn test_store_snap() -> crate::Result<()> {
test_store(Compressor::Snappy)
}
#[cfg(feature = "brotli-compression")]
#[test]
fn test_store_brotli() -> crate::Result<()> {
test_store(Compressor::Brotli)
}

#[test]
fn test_store_with_delete() -> crate::Result<()> {
let mut schema_builder = schema::Schema::builder();
@@ -215,6 +228,108 @@ pub mod tests {
}
Ok(())
}

#[cfg(feature = "snappy-compression")]
|
||||
#[cfg(feature = "lz4-compression")]
|
||||
#[test]
|
||||
fn test_merge_with_changed_compressor() -> crate::Result<()> {
|
||||
let mut schema_builder = schema::Schema::builder();
|
||||
|
||||
let text_field = schema_builder.add_text_field("text_field", TEXT | STORED);
|
||||
let schema = schema_builder.build();
|
||||
let index_builder = Index::builder().schema(schema);
|
||||
|
||||
let mut index = index_builder.create_in_ram().unwrap();
|
||||
index.settings_mut().docstore_compression = Compressor::Lz4;
|
||||
{
|
||||
let mut index_writer = index.writer_for_tests().unwrap();
|
||||
// put enough data create enough blocks in the doc store to be considered for stacking
|
||||
for _ in 0..200 {
|
||||
index_writer.add_document(doc!(text_field=> LOREM));
|
||||
}
|
||||
assert!(index_writer.commit().is_ok());
|
||||
for _ in 0..200 {
|
||||
index_writer.add_document(doc!(text_field=> LOREM));
|
||||
}
|
||||
assert!(index_writer.commit().is_ok());
|
||||
}
|
||||
assert_eq!(
|
||||
index.reader().unwrap().searcher().segment_readers()[0]
|
||||
.get_store_reader()
|
||||
.unwrap()
|
||||
.compressor(),
|
||||
Compressor::Lz4
|
||||
);
|
||||
// Change compressor, this disables stacking on merging
|
||||
let index_settings = index.settings_mut();
|
||||
index_settings.docstore_compression = Compressor::Snappy;
|
||||
// Merging the segments
|
||||
{
|
||||
let segment_ids = index
|
||||
.searchable_segment_ids()
|
||||
.expect("Searchable segments failed.");
|
||||
let mut index_writer = index.writer_for_tests().unwrap();
|
||||
assert!(block_on(index_writer.merge(&segment_ids)).is_ok());
|
||||
assert!(index_writer.wait_merging_threads().is_ok());
|
||||
}
|
||||
|
||||
let searcher = index.reader().unwrap().searcher();
|
||||
assert_eq!(searcher.segment_readers().len(), 1);
|
||||
let reader = searcher.segment_readers().iter().last().unwrap();
|
||||
let store = reader.get_store_reader().unwrap();
|
||||
|
||||
for doc in store.iter(reader.delete_bitset()).take(50) {
|
||||
assert_eq!(
|
||||
*doc?.get_first(text_field).unwrap().text().unwrap(),
|
||||
LOREM.to_string()
|
||||
);
|
||||
}
|
||||
assert_eq!(store.compressor(), Compressor::Snappy);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_merge_of_small_segments() -> crate::Result<()> {
|
||||
let mut schema_builder = schema::Schema::builder();
|
||||
|
||||
let text_field = schema_builder.add_text_field("text_field", TEXT | STORED);
|
||||
let schema = schema_builder.build();
|
||||
let index_builder = Index::builder().schema(schema);
|
||||
|
||||
let index = index_builder.create_in_ram().unwrap();
|
||||
|
||||
{
|
||||
let mut index_writer = index.writer_for_tests().unwrap();
|
||||
|
||||
index_writer.add_document(doc!(text_field=> "1"));
|
||||
assert!(index_writer.commit().is_ok());
|
||||
index_writer.add_document(doc!(text_field=> "2"));
|
||||
assert!(index_writer.commit().is_ok());
|
||||
index_writer.add_document(doc!(text_field=> "3"));
|
||||
assert!(index_writer.commit().is_ok());
|
||||
index_writer.add_document(doc!(text_field=> "4"));
|
||||
assert!(index_writer.commit().is_ok());
|
||||
index_writer.add_document(doc!(text_field=> "5"));
|
||||
assert!(index_writer.commit().is_ok());
|
||||
}
|
||||
// Merging the segments
|
||||
{
|
||||
let segment_ids = index
|
||||
.searchable_segment_ids()
|
||||
.expect("Searchable segments failed.");
|
||||
let mut index_writer = index.writer_for_tests().unwrap();
|
||||
assert!(block_on(index_writer.merge(&segment_ids)).is_ok());
|
||||
assert!(index_writer.wait_merging_threads().is_ok());
|
||||
}
|
||||
|
||||
let searcher = index.reader().unwrap().searcher();
|
||||
assert_eq!(searcher.segment_readers().len(), 1);
|
||||
let reader = searcher.segment_readers().iter().last().unwrap();
|
||||
let store = reader.get_store_reader().unwrap();
|
||||
assert_eq!(store.block_checkpoints().count(), 1);
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(all(test, feature = "unstable"))]
|
||||
@@ -223,6 +338,7 @@ mod bench {
|
||||
use super::tests::write_lorem_ipsum_store;
|
||||
use crate::directory::Directory;
|
||||
use crate::directory::RamDirectory;
|
||||
use crate::store::Compressor;
|
||||
use crate::store::StoreReader;
|
||||
use std::path::Path;
|
||||
use test::Bencher;
|
||||
@@ -233,7 +349,11 @@ mod bench {
|
||||
let directory = RamDirectory::create();
|
||||
let path = Path::new("store");
|
||||
b.iter(|| {
|
||||
write_lorem_ipsum_store(directory.open_write(path).unwrap(), 1_000);
|
||||
write_lorem_ipsum_store(
|
||||
directory.open_write(path).unwrap(),
|
||||
1_000,
|
||||
Compressor::default(),
|
||||
);
|
||||
directory.delete(path).unwrap();
|
||||
});
|
||||
}
|
||||
@@ -242,11 +362,13 @@ mod bench {
|
||||
fn bench_store_decode(b: &mut Bencher) {
|
||||
let directory = RamDirectory::create();
|
||||
let path = Path::new("store");
|
||||
write_lorem_ipsum_store(directory.open_write(path).unwrap(), 1_000);
|
||||
write_lorem_ipsum_store(
|
||||
directory.open_write(path).unwrap(),
|
||||
1_000,
|
||||
Compressor::default(),
|
||||
);
|
||||
let store_file = directory.open_read(path).unwrap();
|
||||
let store = StoreReader::open(store_file).unwrap();
|
||||
b.iter(|| {
|
||||
store.get(12).unwrap();
|
||||
});
|
||||
b.iter(|| store.iter(None).collect::<Vec<_>>());
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,26 +1,29 @@
-use super::decompress;
-use super::index::SkipIndex;
-use crate::common::{BinarySerializable, HasLen};
+use super::Compressor;
+use super::{footer::DocStoreFooter, index::SkipIndex};
 use crate::directory::{FileSlice, OwnedBytes};
 use crate::schema::Document;
 use crate::space_usage::StoreSpaceUsage;
 use crate::store::index::Checkpoint;
 use crate::DocId;
-use crate::{common::VInt, fastfield::DeleteBitSet};
+use crate::{
+    common::{BinarySerializable, HasLen, VInt},
+    error::DataCorruption,
+    fastfield::DeleteBitSet,
+};
 use lru::LruCache;
 use std::io;
-use std::mem::size_of;
 use std::sync::atomic::{AtomicUsize, Ordering};
 use std::sync::{Arc, Mutex};

 const LRU_CACHE_CAPACITY: usize = 100;

-type Block = Arc<Vec<u8>>;
+type Block = OwnedBytes;

 type BlockCache = Arc<Mutex<LruCache<usize, Block>>>;

 /// Reads document off tantivy's [`Store`](./index.html)
 pub struct StoreReader {
+    compressor: Compressor,
     data: FileSlice,
     cache: BlockCache,
     cache_hits: Arc<AtomicUsize>,
@@ -32,11 +35,14 @@ pub struct StoreReader {
 impl StoreReader {
     /// Opens a store reader
     pub fn open(store_file: FileSlice) -> io::Result<StoreReader> {
-        let (data_file, offset_index_file) = split_file(store_file)?;
+        let (footer, data_and_offset) = DocStoreFooter::extract_footer(store_file)?;
+
+        let (data_file, offset_index_file) = data_and_offset.split(footer.offset as usize);
         let index_data = offset_index_file.read_bytes()?;
         let space_usage = StoreSpaceUsage::new(data_file.len(), offset_index_file.len());
         let skip_index = SkipIndex::open(index_data);
         Ok(StoreReader {
+            compressor: footer.compressor,
             data: data_file,
             cache: Arc::new(Mutex::new(LruCache::new(LRU_CACHE_CAPACITY))),
             cache_hits: Default::default(),
@@ -50,6 +56,10 @@ impl StoreReader {
         self.skip_index.checkpoints()
     }

+    pub(crate) fn compressor(&self) -> Compressor {
+        self.compressor
+    }
+
     fn block_checkpoint(&self, doc_id: DocId) -> Option<Checkpoint> {
         self.skip_index.seek(doc_id)
     }
@@ -72,9 +82,10 @@ impl StoreReader {

         let compressed_block = self.compressed_block(checkpoint)?;
         let mut decompressed_block = vec![];
-        decompress(compressed_block.as_slice(), &mut decompressed_block)?;
+        self.compressor
+            .decompress(compressed_block.as_slice(), &mut decompressed_block)?;

-        let block = Arc::new(decompressed_block);
+        let block = OwnedBytes::new(decompressed_block);
         self.cache
             .lock()
             .unwrap()
@@ -93,9 +104,8 @@ impl StoreReader {
     /// It should not be called to score documents
     /// for instance.
     pub fn get(&self, doc_id: DocId) -> crate::Result<Document> {
-        let raw_doc = self.get_raw(doc_id)?;
-        let mut cursor = raw_doc.get_bytes();
-        Ok(Document::deserialize(&mut cursor)?)
+        let mut doc_bytes = self.get_document_bytes(doc_id)?;
+        Ok(Document::deserialize(&mut doc_bytes)?)
     }

     /// Reads raw bytes of a given document. Returns `RawDocument`, which contains the block of a document and its start and end
@@ -106,7 +116,7 @@
     /// so accessing docs from the same compressed block should be faster.
     /// For that reason a store reader should be kept and reused.
     ///
-    pub fn get_raw(&self, doc_id: DocId) -> crate::Result<RawDocument> {
+    pub fn get_document_bytes(&self, doc_id: DocId) -> crate::Result<OwnedBytes> {
         let checkpoint = self.block_checkpoint(doc_id).ok_or_else(|| {
             crate::TantivyError::InvalidArgument(format!("Failed to lookup Doc #{}.", doc_id))
         })?;
@@ -121,11 +131,7 @@
         let doc_length = VInt::deserialize(&mut cursor)?.val() as usize;
         let start_pos = cursor_len_before - cursor.len();
         let end_pos = cursor_len_before - cursor.len() + doc_length;
-        Ok(RawDocument {
-            block,
-            start_pos,
-            end_pos,
-        })
+        Ok(block.slice(start_pos..end_pos))
     }

     /// Iterator over all Documents in their order as they are stored in the doc store.
@@ -135,10 +141,9 @@
         &'b self,
         delete_bitset: Option<&'a DeleteBitSet>,
     ) -> impl Iterator<Item = crate::Result<Document>> + 'b {
-        self.iter_raw(delete_bitset).map(|raw_doc| {
-            let raw_doc = raw_doc?;
-            let mut cursor = raw_doc.get_bytes();
-            Ok(Document::deserialize(&mut cursor)?)
+        self.iter_raw(delete_bitset).map(|doc_bytes_res| {
+            let mut doc_bytes = doc_bytes_res?;
+            Ok(Document::deserialize(&mut doc_bytes)?)
         })
     }

@@ -148,7 +153,7 @@
     pub(crate) fn iter_raw<'a: 'b, 'b>(
         &'b self,
         delete_bitset: Option<&'a DeleteBitSet>,
-    ) -> impl Iterator<Item = crate::Result<RawDocument>> + 'b {
+    ) -> impl Iterator<Item = crate::Result<OwnedBytes>> + 'b {
         let last_docid = self
             .block_checkpoints()
             .last()
@@ -158,9 +163,10 @@
         let mut curr_checkpoint = checkpoint_block_iter.next();
         let mut curr_block = curr_checkpoint
             .as_ref()
-            .map(|checkpoint| self.read_block(&checkpoint));
+            .map(|checkpoint| self.read_block(&checkpoint).map_err(|e| e.kind())); // map error in order to enable cloning
         let mut block_start_pos = 0;
         let mut num_skipped = 0;
+        let mut reset_block_pos = false;
         (0..last_docid)
             .filter_map(move |doc_id| {
                 // filter_map is only used to resolve lifetime issues between the two closures on
@@ -170,36 +176,44 @@
                     // we keep the number of skipped documents to move forward in the map block
                     num_skipped += 1;
                 }

                 // check move to next checkpoint
-                let mut reset_block_pos = false;
                 if doc_id >= curr_checkpoint.as_ref().unwrap().doc_range.end {
                     curr_checkpoint = checkpoint_block_iter.next();
                     curr_block = curr_checkpoint
                         .as_ref()
-                        .map(|checkpoint| self.read_block(&checkpoint));
+                        .map(|checkpoint| self.read_block(&checkpoint).map_err(|e| e.kind()));
                     reset_block_pos = true;
                     num_skipped = 0;
                 }

                 if alive {
-                    let ret = Some((
-                        curr_block.as_ref().unwrap().as_ref().unwrap().clone(), // todo forward errors
-                        num_skipped,
-                        reset_block_pos,
-                    ));
+                    let ret = Some((curr_block.clone(), num_skipped, reset_block_pos));
                     // the map block will move over the num_skipped, so we reset to 0
                     num_skipped = 0;
+                    reset_block_pos = false;
                     ret
                 } else {
                     None
                 }
             })
             .map(move |(block, num_skipped, reset_block_pos)| {
+                let block = block
+                    .ok_or_else(|| {
+                        DataCorruption::comment_only(
+                            "the current checkpoint in the doc store iterator is none, this should never happen",
+                        )
+                    })?
+                    .map_err(|error_kind| {
+                        std::io::Error::new(error_kind, "error when reading block in doc store")
+                    })?;
                 // this flag is set, when filter_map moved to the next block
                 if reset_block_pos {
                     block_start_pos = 0;
                 }
                 let mut cursor = &block[block_start_pos..];
                 let mut pos = 0;
                 // move forward 1 doc + num_skipped in block and return length of current doc
                 let doc_length = loop {
                     let doc_length = VInt::deserialize(&mut cursor)?.val() as usize;
                     let num_bytes_read = block[block_start_pos..].len() - cursor.len();
@@ -214,13 +228,9 @@
                     }
                 };
                 let end_pos = block_start_pos + doc_length;
-                let raw_doc = RawDocument {
-                    block,
-                    start_pos: block_start_pos,
-                    end_pos,
-                };
+                let doc_bytes = block.slice(block_start_pos..end_pos);
                 block_start_pos = end_pos;
-                Ok(raw_doc)
+                Ok(doc_bytes)
             })
     }

@@ -230,31 +240,6 @@
     }
 }

-/// Get the bytes of a serialized `Document` in a decompressed block.
-pub struct RawDocument {
-    /// the block of data containing multiple documents
-    block: Arc<Vec<u8>>,
-    /// start position of the document in the block
-    start_pos: usize,
-    /// end position of the document in the block
-    end_pos: usize,
-}
-
-impl RawDocument {
-    /// Get the bytes of a serialized `Document` in a decompressed block.
-    pub fn get_bytes(&self) -> &[u8] {
-        &self.block[self.start_pos..self.end_pos]
-    }
-}
-
-fn split_file(data: FileSlice) -> io::Result<(FileSlice, FileSlice)> {
-    let (data, footer_len_bytes) = data.split_from_end(size_of::<u64>());
-    let serialized_offset: OwnedBytes = footer_len_bytes.read_bytes()?;
-    let mut serialized_offset_buf = serialized_offset.as_slice();
-    let offset = u64::deserialize(&mut serialized_offset_buf)? as usize;
-    Ok(data.split(offset))
-}
-
 #[cfg(test)]
 mod tests {
     use super::*;
@@ -272,7 +257,7 @@ mod tests {
     let directory = RamDirectory::create();
     let path = Path::new("store");
     let writer = directory.open_write(path)?;
-    let schema = write_lorem_ipsum_store(writer, 500);
+    let schema = write_lorem_ipsum_store(writer, 500, Compressor::default());
     let title = schema.get_field("title").unwrap();
     let store_file = directory.open_read(path)?;
     let store = StoreReader::open(store_file)?;
@@ -327,7 +312,7 @@ mod tests {
             .unwrap()
             .peek_lru()
             .map(|(&k, _)| k as usize),
-        Some(9249)
+        Some(9210)
     );

     Ok(())

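Taken together, `get_raw`/`RawDocument` is replaced by `get_document_bytes`, which returns an `OwnedBytes` sliced straight out of the cached decompressed block, so the start/end bookkeeping no longer leaks to callers. A minimal sketch of the new call path (assuming a `store: StoreReader` and an existing `doc_id`; `OwnedBytes` can be read from directly, which is what lets `Document::deserialize` consume it, exactly as `get` does above):

    // The returned OwnedBytes keeps the decompressed block alive,
    // so slicing a document out of it copies no document bytes.
    let mut doc_bytes = store.get_document_bytes(doc_id)?;
    let doc = Document::deserialize(&mut doc_bytes)?;
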
@@ -1,6 +1,6 @@
-use super::compress;
 use super::index::SkipIndexBuilder;
 use super::StoreReader;
+use super::{compressors::Compressor, footer::DocStoreFooter};
 use crate::common::CountingWriter;
 use crate::common::{BinarySerializable, VInt};
 use crate::directory::TerminatingWrite;
@@ -21,6 +21,7 @@ const BLOCK_SIZE: usize = 16_384;
 /// The skip list index on the other hand, is built in memory.
 ///
 pub struct StoreWriter {
+    compressor: Compressor,
     doc: DocId,
     first_doc_in_block: DocId,
     offset_index_writer: SkipIndexBuilder,
@@ -34,8 +35,9 @@ impl StoreWriter {
     ///
     /// The store writer will writes blocks on disc as
     /// document are added.
-    pub fn new(writer: WritePtr) -> StoreWriter {
+    pub fn new(writer: WritePtr, compressor: Compressor) -> StoreWriter {
         StoreWriter {
+            compressor,
             doc: 0,
             first_doc_in_block: 0,
             offset_index_writer: SkipIndexBuilder::new(),
@@ -45,6 +47,10 @@ impl StoreWriter {
         }
     }

+    pub(crate) fn compressor(&self) -> Compressor {
+        self.compressor
+    }
+
     /// The memory used (inclusive childs)
     pub fn mem_usage(&self) -> usize {
         self.intermediary_buffer.capacity() + self.current_block.capacity()
@@ -125,7 +131,8 @@ impl StoreWriter {
     fn write_and_compress_block(&mut self) -> io::Result<()> {
         assert!(self.doc > 0);
         self.intermediary_buffer.clear();
-        compress(&self.current_block[..], &mut self.intermediary_buffer)?;
+        self.compressor
+            .compress(&self.current_block[..], &mut self.intermediary_buffer)?;
         let start_offset = self.writer.written_bytes() as usize;
         self.writer.write_all(&self.intermediary_buffer)?;
         let end_offset = self.writer.written_bytes() as usize;
@@ -147,8 +154,9 @@
             self.write_and_compress_block()?;
         }
         let header_offset: u64 = self.writer.written_bytes() as u64;
+        let footer = DocStoreFooter::new(header_offset, self.compressor);
         self.offset_index_writer.write(&mut self.writer)?;
-        header_offset.serialize(&mut self.writer)?;
+        footer.serialize(&mut self.writer)?;
         self.writer.terminate()
     }
 }

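On the writer side, the compression scheme is now chosen when the `StoreWriter` is created, and finalizing the store writes the skip index followed by a `DocStoreFooter` recording both the offset and the compressor; `DocStoreFooter::extract_footer` in `StoreReader::open` above is its exact mirror. A hedged construction sketch (assuming crate-internal code with a `directory: RamDirectory`; the name `close` for the finalizing method is an assumption here, since only its body is shown in the hunk above):

    use std::path::Path;

    let wrt = directory.open_write(Path::new("store"))?;
    // Compressor::default() picks the crate's default scheme; any other
    // Compressor variant could be passed instead.
    let mut store_writer = StoreWriter::new(wrt, Compressor::default());
    // ... store documents ...
    store_writer.close()?; // skip index first, then the DocStoreFooter
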
@@ -78,8 +78,8 @@ pub struct TermStreamer<'a, A = AlwaysMatch>
 where
     A: Automaton,
 {
-    fst_map: &'a TermDictionary,
-    stream: Stream<'a, A>,
+    pub(crate) fst_map: &'a TermDictionary,
+    pub(crate) stream: Stream<'a, A>,
     term_ord: TermOrdinal,
     current_key: Vec<u8>,
     current_value: TermInfo,

@@ -1,32 +1,11 @@
 use crate::postings::TermInfo;
 use crate::termdict::TermDictionary;
 use crate::termdict::TermOrdinal;
 use crate::termdict::TermStreamer;
-use std::cmp::Ordering;
-use std::collections::BinaryHeap;
-
-pub struct HeapItem<'a> {
-    pub streamer: TermStreamer<'a>,
-    pub segment_ord: usize,
-}
-
-impl<'a> PartialEq for HeapItem<'a> {
-    fn eq(&self, other: &Self) -> bool {
-        self.segment_ord == other.segment_ord
-    }
-}
-
-impl<'a> Eq for HeapItem<'a> {}
-
-impl<'a> PartialOrd for HeapItem<'a> {
-    fn partial_cmp(&self, other: &HeapItem<'a>) -> Option<Ordering> {
-        Some(self.cmp(other))
-    }
-}
-
-impl<'a> Ord for HeapItem<'a> {
-    fn cmp(&self, other: &HeapItem<'a>) -> Ordering {
-        (&other.streamer.key(), &other.segment_ord).cmp(&(&self.streamer.key(), &self.segment_ord))
-    }
-}
+use tantivy_fst::map::OpBuilder;
+use tantivy_fst::map::Union;
+use tantivy_fst::raw::IndexedValue;
+use tantivy_fst::Streamer;

 /// Given a list of sorted term streams,
 /// returns an iterator over sorted unique terms.
@@ -34,61 +13,50 @@ impl<'a> Ord for HeapItem<'a> {
 /// The item yield is actually a pair with
 /// - the term
 /// - a slice with the ordinal of the segments containing
-///   the terms.
+///   the term.
 pub struct TermMerger<'a> {
-    heap: BinaryHeap<HeapItem<'a>>,
-    current_streamers: Vec<HeapItem<'a>>,
+    dictionaries: Vec<&'a TermDictionary>,
+    union: Union<'a>,
+    current_key: Vec<u8>,
+    current_segment_and_term_ordinals: Vec<IndexedValue>,
 }

 impl<'a> TermMerger<'a> {
     /// Stream of merged term dictionary
     ///
     pub fn new(streams: Vec<TermStreamer<'a>>) -> TermMerger<'a> {
+        let mut op_builder = OpBuilder::new();
+        let mut dictionaries = vec![];
+        for streamer in streams {
+            op_builder.push(streamer.stream);
+            dictionaries.push(streamer.fst_map);
+        }
         TermMerger {
-            heap: BinaryHeap::new(),
-            current_streamers: streams
-                .into_iter()
-                .enumerate()
-                .map(|(ord, streamer)| HeapItem {
-                    streamer,
-                    segment_ord: ord,
-                })
-                .collect(),
+            dictionaries,
+            union: op_builder.union(),
+            current_key: vec![],
+            current_segment_and_term_ordinals: vec![],
         }
     }

-    pub(crate) fn matching_segments<'b: 'a>(
-        &'b self,
-    ) -> impl 'b + Iterator<Item = (usize, TermOrdinal)> {
-        self.current_streamers
+    pub fn matching_segments<'b: 'a>(&'b self) -> impl 'b + Iterator<Item = (usize, TermOrdinal)> {
+        self.current_segment_and_term_ordinals
             .iter()
-            .map(|heap_item| (heap_item.segment_ord, heap_item.streamer.term_ord()))
-    }
-
-    fn advance_segments(&mut self) {
-        let streamers = &mut self.current_streamers;
-        let heap = &mut self.heap;
-        for mut heap_item in streamers.drain(..) {
-            if heap_item.streamer.advance() {
-                heap.push(heap_item);
-            }
-        }
+            .map(|iv| (iv.index, iv.value))
     }

     /// Advance the term iterator to the next term.
     /// Returns true if there is indeed another term
     /// False if there is none.
     pub fn advance(&mut self) -> bool {
-        self.advance_segments();
-        if let Some(head) = self.heap.pop() {
-            self.current_streamers.push(head);
-            while let Some(next_streamer) = self.heap.peek() {
-                if self.current_streamers[0].streamer.key() != next_streamer.streamer.key() {
-                    break;
-                }
-                let next_heap_it = self.heap.pop().unwrap(); // safe : we peeked beforehand
-                self.current_streamers.push(next_heap_it);
-            }
+        if let Some((k, values)) = self.union.next() {
+            self.current_key.clear();
+            self.current_key.extend_from_slice(k);
+            self.current_segment_and_term_ordinals.clear();
+            self.current_segment_and_term_ordinals
+                .extend_from_slice(values);
+            self.current_segment_and_term_ordinals
+                .sort_by_key(|iv| iv.index);
             true
         } else {
             false
@@ -101,16 +69,85 @@ impl<'a> TermMerger<'a> {
     /// iff advance() has been called before
     /// and "true" was returned.
     pub fn key(&self) -> &[u8] {
-        self.current_streamers[0].streamer.key()
+        &self.current_key
     }

-    /// Returns the sorted list of segment ordinals
-    /// that include the current term.
+    /// Iterator over (segment ordinal, TermInfo) pairs iterator sorted by the ordinal.
     ///
     /// This method may be called
     /// iff advance() has been called before
     /// and "true" was returned.
-    pub fn current_kvs(&self) -> &[HeapItem<'a>] {
-        &self.current_streamers[..]
+    pub fn current_segment_ordinals_and_term_infos<'b: 'a>(
+        &'b self,
+    ) -> impl 'b + Iterator<Item = (usize, TermInfo)> {
+        self.current_segment_and_term_ordinals
+            .iter()
+            .map(move |iv| {
+                (
+                    iv.index,
+                    self.dictionaries[iv.index].term_info_from_ord(iv.value),
+                )
+            })
     }
 }

+#[cfg(all(test, feature = "unstable"))]
+mod bench {
+    use super::TermMerger;
+    use crate::directory::FileSlice;
+    use crate::postings::TermInfo;
+    use crate::termdict::{TermDictionary, TermDictionaryBuilder};
+    use rand::distributions::Alphanumeric;
+    use rand::{thread_rng, Rng};
+    use test::{self, Bencher};
+
+    fn make_term_info(term_ord: u64) -> TermInfo {
+        let offset = |term_ord: u64| (term_ord * 100 + term_ord * term_ord) as usize;
+        TermInfo {
+            doc_freq: term_ord as u32,
+            postings_range: offset(term_ord)..offset(term_ord + 1),
+            positions_range: offset(term_ord)..offset(term_ord + 1),
+        }
+    }
+
+    /// Create a dictionary of random strings.
+    fn rand_dict(num_terms: usize) -> crate::Result<TermDictionary> {
+        let buffer: Vec<u8> = {
+            let mut terms = vec![];
+            for _i in 0..num_terms {
+                let rand_string: String = thread_rng()
+                    .sample_iter(&Alphanumeric)
+                    .take(thread_rng().gen_range(30..42))
+                    .map(char::from)
+                    .collect();
+                terms.push(rand_string);
+            }
+            terms.sort();
+
+            let mut term_dictionary_builder = TermDictionaryBuilder::create(Vec::new())?;
+            for i in 0..num_terms {
+                term_dictionary_builder.insert(terms[i].as_bytes(), &make_term_info(i as u64))?;
+            }
+            term_dictionary_builder.finish()?
+        };
+        let file = FileSlice::from(buffer);
+        TermDictionary::open(file)
+    }
+
+    #[bench]
+    fn bench_termmerger(b: &mut Bencher) -> crate::Result<()> {
+        let dict1 = rand_dict(100_000)?;
+        let dict2 = rand_dict(100_000)?;
+        b.iter(|| -> crate::Result<u32> {
+            let stream1 = dict1.stream()?;
+            let stream2 = dict2.stream()?;
+            let mut merger = TermMerger::new(vec![stream1, stream2]);
+            let mut count = 0;
+            while merger.advance() {
+                count += 1;
+            }
+            Ok(count)
+        });
+        Ok(())
+    }
+}

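The union-based rewrite keeps the merger's external contract: callers still drive it with advance()/key(), but the segments matching the current term now come from the FST union's IndexedValue entries instead of a heap of streamers. A minimal consumption sketch, in the same style as the benchmark above (assuming two TermDictionary values dict1 and dict2):

    let mut merger = TermMerger::new(vec![dict1.stream()?, dict2.stream()?]);
    while merger.advance() {
        // The current unique term, valid until the next advance().
        let _term: &[u8] = merger.key();
        // (segment ordinal, TermInfo) pairs, sorted by segment ordinal,
        // one per segment that contains the current term.
        for (_segment_ord, _term_info) in merger.current_segment_ordinals_and_term_infos() {
            // merge postings for the current term here ...
        }
    }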