Compare commits


48 Commits

Author SHA1 Message Date
Paul Masurel
784717749f Removing unused imports. 2021-02-05 23:04:17 +09:00
Paul Masurel
945bcc5bd3 Bump tantivy-grammar version 2021-02-05 22:58:21 +09:00
Paul Masurel
51aa9c319e Bumped version to 0.14 2021-02-05 22:55:26 +09:00
Paul Masurel
74d8d2946b Merge pull request #980 from lengyijun/patch-8
Update segment_postings.rs
2021-02-05 22:52:29 +09:00
lyj
0a160cc16e Update segment_postings.rs 2021-02-05 21:32:25 +08:00
Paul Masurel
f099f97daa Merge pull request #979 from slckl/main
FacetCounts are now pub use in tantivy::collector (Closes #978)
2021-02-05 17:05:20 +09:00
alif
769e9ba14d added simple docs to FacetCounts now-public API 2021-02-05 09:18:20 +02:00
alif
a482c0e966 pub use FacetCounts in tantivy::collector module 2021-02-05 09:00:48 +02:00
Paul Masurel
86d92a72e7 Renaming MultiValueIntFastField* to MultiValuedIntFastField* 2021-01-21 22:47:00 +09:00
Paul Masurel
ef618a5999 Made fast field reader clonable. 2021-01-21 22:15:24 +09:00
Paul Masurel
94d3d7a89a Rename FastFieldReaders::load_all 2021-01-21 18:38:48 +09:00
Paul Masurel
aa9e79f957 Clippy warnings. 2021-01-21 18:23:20 +09:00
Paul Masurel
84a2f534db Merge pull request #976 from tantivy-search/issue/fastfield_no_load
Fast fields are not loaded on the opening of a segment.
2021-01-21 18:14:55 +09:00
Paul Masurel
1b4be24dca Fast fields are not loaded on the opening of a segment.
They are instead loaded lazily when they are requested.
2021-01-21 18:13:08 +09:00
Paul Masurel
824ccc37ae Merge pull request #975 from jamescorbett/patch-1
Change from serde::export to std::marker
2021-01-12 10:04:23 +09:00
Paul Masurel
5231651020 Closes #974 2021-01-12 10:03:37 +09:00
James Corbett
fa2c6f80c7 Change from serde::export to std::marker
For some reason, under a Docker build only, I get a build error saying that `serde::export` is private. This fixes it for me.

```
error[E0603]: module `export` is private
   --> /usr/local/cargo/registry/src/github.com-1ecc6299db9ec823/tantivy-0.13.2/src/collector/top_collector.rs:5:12
    |
5   | use serde::export::PhantomData;
    |            ^^^^^^ private module
    |
note: the module `export` is defined here
   --> /usr/local/cargo/registry/src/github.com-1ecc6299db9ec823/serde-1.0.119/src/lib.rs:275:5
    |
275 | use self::__private as export;
    |     ^^^^^^^^^^^^^^^^^^^^^^^^^
```
2021-01-12 00:25:54 +00:00
Paul Masurel
43c7b3bfec Bugfix in the RAMDirectory.
There was a state where the meta.json was empty.
2021-01-11 14:11:42 +09:00
Paul Masurel
b17a10546a Minor change in unit test. 2021-01-11 11:33:59 +09:00
Paul Masurel
bf6e6e8a7c Merge pull request #972 from tantivy-search/issue/969
Issue/969
2021-01-07 22:49:31 +09:00
Paul Masurel
203b0256a3 Minor renaming 2021-01-07 22:47:57 +09:00
Paul Masurel
caf2a38b7e Closes #969.
The segment stacking optimization is not updating "first_doc_in_block".
2021-01-07 22:43:56 +09:00
Paul Masurel
96f24b078e Added failing unit test. 2021-01-07 22:43:28 +09:00
Paul Masurel
332b50a4eb Merge pull request #970 from tantivy-search/functional-test-store
Added a functional long running test to test store merging.
2021-01-07 14:27:08 +09:00
Paul Masurel
8ca0954b3b Added a functional long running test to test store merging. 2021-01-07 14:07:15 +09:00
Paul Masurel
36343e2de8 Merge pull request #968 from tantivy-search/add-bench-analyzer
added a simple bench for the default analyzer
2021-01-06 21:33:39 +09:00
Paul Masurel
2f14a892ca added a simple bench for the default analyzer 2021-01-06 19:11:26 +09:00
Paul Masurel
9c3cabce40 Updated version of the rand crate. 2021-01-06 18:09:00 +09:00
Paul Masurel
f8d71c2b10 Merge pull request #964 from mosuka/deserializable
Make NamedFieldDocument deserializable
2021-01-06 17:43:53 +09:00
Paul Masurel
394dfb24f1 Merge pull request #965 from lewisdiamond/patch-1
Fix spelling
2021-01-06 13:38:31 +09:00
Lewis Diamond
b0549a229d Fix spelling 2021-01-05 22:34:56 -05:00
Minoru Osuka
670b6eaff6 Make NamedFieldDocument deserializable 2020-12-21 16:51:31 +09:00
Paul Masurel
a4f33d3823 Added comment to f64 conversion to u64.
- Added proptest
- Added comment to Lemire blog post.
2020-12-15 13:40:31 +09:00
Paul Masurel
c7841e3da5 Merge pull request #953 from barrotsteindev/filter-collector-tpredicatevalue
Generic filter collector
2020-12-14 10:35:46 +09:00
barrotsteindev
e7b4a12bba cargo fmt 2020-12-10 14:10:55 +02:00
barrotsteindev
0aaa929d6e Merge branch 'main' into filter-collector-tpredicatevalue 2020-12-10 11:27:19 +02:00
barrotsteindev
1112797c18 added a line to CHANGELOG.md 2020-12-10 11:25:08 +02:00
barrotsteindev
920481e1c1 change unit test 2020-12-10 11:24:53 +02:00
Paul Masurel
55f7b84966 Merge pull request #952 from tantivy-search/bm25-on-onebyte
Encode blockwand on a single byte.
2020-12-10 18:09:31 +09:00
Paul Masurel
09ab4df1fe Encode blockwand on a single byte. 2020-12-10 18:08:52 +09:00
barrotsteindev
0c2cf81b37 cargo fmt 2020-12-10 11:08:35 +02:00
barrotsteindev
d864430bda final edits 2020-12-10 11:08:15 +02:00
Paul Masurel
de60540e06 fixing compilation 2020-12-10 10:36:21 +02:00
Paul Masurel
c3e311e6b8 Removed 'static in compression_lz4. 2020-12-09 15:30:52 +09:00
barrotsteindev
ac704f2f22 WIP generic filter collector 2020-12-08 14:36:52 +02:00
Paul Masurel
be626083a0 Reorganized and added termdict unit tests. 2020-12-07 12:50:36 +09:00
Paul Masurel
b68fcca1e0 Minor changes
- Open{Write,Read}Error::wrap_io_error made public
- Arc<PathBuf> -> Arc<Path> in file_watcher.
2020-12-03 23:31:50 +09:00
Paul Masurel
af6dfa1856 Small refactoring 2020-12-03 14:27:05 +09:00
58 changed files with 4564 additions and 2359 deletions

View File

@@ -1,18 +1,23 @@
Tantivy 0.14.0
=========================
- Remove dependency to atomicwrites #833. (Implemented by @pmasurel upon suggestion and research from @asafigan).
- Remove dependency to atomicwrites #833. (Implemented by @fulmicoton upon suggestion and research from @asafigan).
- Migrated tantivy error from the now deprecated `failure` crate to `thiserror` #760. (@hirevo)
- API Change. Accessing the typed value off a `Schema::Value` now returns an Option instead of panicking if the type does not match.
- Large API change in the Directory API. Tantivy used to assume that all files could somehow be memory mapped. After this change, `Directory` returns a `FileSlice` that can be reduced and eventually read into an `OwnedBytes` object. Long and blocking IO operations are still required, but they no longer span the entire file (see the short sketch at the end of this section).
- Added support for Brotli compression in the DocStore. (@ppodolsky)
- Added helper for building intersections and unions in BooleanQuery (@guilload)
- Bugfix in `Query::explain`
- Removed dependency on `notify` #924. Replaced with `FileWatcher` struct that polls meta file every 500ms in background thread. (@halvorboe @guilload)
- Added `FilterCollector`, which wraps another collector and filters docs using a predicate over a fast field (@barrotsteindev)
- Simplified the encoding of the skip reader struct. BlockWAND max tf is now encoded over a single byte. (@fulmicoton)
- `FilterCollector` now supports all Fast Field value types (@barrotsteindev)
- Fast fields are not all loaded when opening the segment reader. (@fulmicoton)
This version breaks compatibility and requires users to reindex everything.
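A minimal sketch of the new `Directory` contract (the directory handle, the `"some_file"` path, and the exact method signatures are assumptions for illustration, not verbatim 0.14 API):

```rust
use std::path::Path;
use tantivy::directory::{Directory, OwnedBytes};

// Sketch only: open a handle, narrow it to a byte range, and read just that range.
fn read_first_bytes(dir: &dyn Directory) -> OwnedBytes {
    let file_slice = dir.open_read(Path::new("some_file")).unwrap();
    // Only the sliced range is actually read; the rest of the file is never touched.
    file_slice.slice(0, 8).read_bytes().unwrap()
}
```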
Tantivy 0.13.2
===================
Bugfix. Acquiring a facet reader on a segment that does not contain any
doc with this facet returns `None`. (#896)
Tantivy 0.13.1
@@ -23,7 +28,7 @@ Updated misc dependency versions.
Tantivy 0.13.0
======================
Tantivy 0.13 introduces a change in the index format that will require
you to reindex your index (BlockWAND information is added in the skiplist).
The index size increase is minor as this information is only added for
full blocks.
If you have a massive index for which reindexing is not an option, please contact me
@@ -32,7 +37,7 @@ so that we can discuss possible solutions.
- Bugfix in `FuzzyTermQuery` not matching terms by prefix when it should (@Peachball)
- Relaxed constraints on the custom/tweak score functions. At the segment level, they can be mut, and they are not required to be Sync + Send.
- `MMapDirectory::open` does not return a `Result` anymore.
- Change in the DocSet and Scorer API. (@fulmicoton).
A freshly created DocSet points directly to its first doc. A sentinel value called TERMINATED marks the end of a DocSet.
`.advance()` returns the new DocId. `Scorer::skip(target)` has been replaced by `Scorer::seek(target)` and returns the resulting DocId.
As a result, iterating through DocSet now looks as follows
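A sketch of the resulting pattern (illustrative only; `docset` is any `DocSet`):

```rust
let mut doc = docset.doc();      // a freshly created DocSet already points at its first doc
while doc != TERMINATED {
    // ... process `doc` ...
    doc = docset.advance();      // advance() returns the new DocId
}
```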
@@ -46,7 +51,7 @@ while doc != TERMINATED {
The change made it possible to greatly simplify a lot of the docset's code.
- Misc internal optimization and introduction of the `Scorer::for_each_pruning` function. (@fulmicoton)
- Added an offset option to the Top(.*)Collectors. (@robyoung)
- Added Block WAND. Performance on TOP-K on term-unions should be greatly increased. (@fulmicoton, and special thanks
to the PISA team for answering all my questions!)
Tantivy 0.12.0
@@ -54,14 +59,14 @@ Tantivy 0.12.0
- Removing static dispatch in tokenizers for simplicity. (#762)
- Added backward iteration for `TermDictionary` stream. (@halvorboe)
- Fixed a performance issue when searching for the posting lists of a missing term (@audunhalland)
- Added a configurable maximum number of docs (10M by default) for a segment to be considered for merge (@hntd187, landed by @halvorboe #713)
- Important Bugfix #777, causing tantivy to retain memory mapping. (diagnosed by @poljar)
- Added support for field boosting. (#547, @fulmicoton)
## How to update?
Crates relying on custom tokenizer, or registering tokenizer in the manager will require some
minor changes. Check https://github.com/tantivy-search/tantivy/blob/master/examples/custom_tokenizer.rs
Crates relying on custom tokenizer, or registering tokenizer in the manager will require some
minor changes. Check https://github.com/tantivy-search/tantivy/blob/main/examples/custom_tokenizer.rs
to check for some code sample.
Tantivy 0.11.3
@@ -97,7 +102,7 @@ Tantivy 0.11.0
## How to update?
- The index format is changed. You are required to reindex your data to use tantivy 0.11.
- `Box<dyn BoxableTokenizer>` has been replaced by a `BoxedTokenizer` struct.
- Regex are now compiled when the `RegexQuery` instance is built. As a result, it can now return
an error and handling the `Result` is required.
@@ -121,26 +126,26 @@ Tantivy 0.10.0
*Tantivy 0.10.0 index format is compatible with the index format in 0.9.0.*
- Added an API to easily tweak or entirely replace the
default score. See `TopDocs::tweak_score` and `TopScore::custom_score` (@pmasurel)
- Added an API to easily tweak or entirely replace the
default score. See `TopDocs::tweak_score` and `TopScore::custom_score` (@fulmicoton)
- Added an ASCII folding filter (@drusellers)
- Bugfix in `query.count` in presence of deletes (@pmasurel)
- Added `.explain(...)` in `Query` and `Weight` to (@pmasurel)
- Added an efficient way to `delete_all_documents` in `IndexWriter` (@petr-tik).
- Bugfix in `query.count` in presence of deletes (@fulmicoton)
- Added `.explain(...)` in `Query` and `Weight` to (@fulmicoton)
- Added an efficient way to `delete_all_documents` in `IndexWriter` (@petr-tik).
All segments are simply removed.
Minor
---------
- Switched to Rust 2018 (@uvd)
- Small simplification of the code.
Calling .freq() or .doc() when .advance() has never been called
on segment postings should panic from now on.
- Tokens exceeding `u16::max_value() - 4` chars are discarded silently instead of panicking.
- Fast fields are now preloaded when the `SegmentReader` is created.
- `IndexMeta` is now public. (@hntd187)
- `IndexWriter` `add_document`, `delete_term`. `IndexWriter` is `Sync`, making it possible to use it with an
`Arc<RwLock<IndexWriter>>`. `add_document` and `delete_term` only require a read lock. (@pmasurel)
`Arc<RwLock<IndexWriter>>`. `add_document` and `delete_term` only require a read lock. (@fulmicoton)
- Introducing `Opstamp` as an expressive type alias for `u64`. (@petr-tik)
- Stamper now relies on `AtomicU64` on all platforms (@petr-tik)
- Bugfix - Files get deleted slightly earlier
@@ -154,7 +159,7 @@ Your program should be usable as is.
Fast fields used to be accessed directly from the `SegmentReader`.
The API changed, you are now required to acquire your fast field reader via the
`segment_reader.fast_fields()`, and use one of the typed methods (a short sketch follows this list):
- `.u64()`, `.i64()` if your field is single-valued ;
- `.u64s()`, `.i64s()` if your field is multi-valued ;
- `.bytes()` if your field is bytes fast field.
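For instance, a minimal sketch (the `price` and `scores` field handles and `doc_id` are placeholders; in 0.10 these accessors return an `Option`, hence the `unwrap`):

```rust
let fast_fields = segment_reader.fast_fields();

// Single-valued u64 fast field.
let price_reader = fast_fields.u64(price).unwrap();
let price_value: u64 = price_reader.get(doc_id);

// Multi-valued u64 fast field: the values of one document are written into a buffer.
let scores_reader = fast_fields.u64s(scores).unwrap();
let mut scores_for_doc = Vec::new();
scores_reader.get_vals(doc_id, &mut scores_for_doc);
```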
@@ -163,16 +168,16 @@ The API changed, you are now required to acquire your fast field reader via the
Tantivy 0.9.0
=====================
*0.9.0 index format is not compatible with the
previous index format.*
- MAJOR BUGFIX:
Some `Mmap` objects were being leaked, and would never get released. (@fulmicoton)
- Removed most unsafe (@fulmicoton)
- Indexer memory footprint improved (VInt compression, inlining of the first block). (@fulmicoton)
- Stemming in other language possible (@pentlander)
- Segments with no docs are deleted earlier (@barrotsteindev)
- Added grouped add and delete operations.
They are guaranteed to happen together (i.e. they cannot be split by a commit).
In addition, adds are guaranteed to happen on the same segment. (@elbow-jason)
- Removed `INT_STORED` and `INT_INDEXED`. It is now possible to use `STORED` and `INDEXED`
for int fields. (@fulmicoton)
@@ -186,26 +191,26 @@ tantivy 0.9 brought some API breaking change.
To update from tantivy 0.8, you will need to go through the following steps.
- `schema::INT_INDEXED` and `schema::INT_STORED` should be replaced by `schema::INDEXED` and `schema::STORED`.
- The index no longer holds the pool of searchers. You are required to create an intermediary object called
`IndexReader` for this.
```rust
// Create the reader. You typically need to create one reader for the entire
// lifetime of your program.
let reader = index.reader()?;
// Acquiring a searcher (previously `index.searcher()`) is now written:
let searcher = reader.searcher();
// With the default setting of the reader, you are not required to
// call `index.load_searchers()` anymore.
//
// The IndexReader will pick up that change automatically, regardless
// of whether the update was done in a different process or not.
// If this behavior is not wanted, you can create your reader with
// the `ReloadPolicy::Manual`, and manually decide when to reload the index
// by calling `reader.reload()?`.
```
@@ -220,7 +225,7 @@ Tantivy 0.8.1
=====================
Hotfix of #476.
Merge was reflecting deletes before commit was passed.
Thanks @barrotsteindev for reporting the bug.
@@ -228,7 +233,7 @@ Tantivy 0.8.0
=====================
*No change in the index format*
- API Breaking change in the collector API. (@jwolfe, @fulmicoton)
- Multithreaded search (@jwolfe, @fulmicoton)
Tantivy 0.7.1
@@ -256,7 +261,7 @@ Tantivy 0.6.1
- Exclusive `field:{startExcl to endExcl}`
- Mixed `field:[startIncl to endExcl}` and vice versa
- Unbounded `field:[start to *]`, `field:[* to end]`
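A hedged sketch of issuing one of these ranges through the query parser (the `price` field, `index`, and `searcher` are made-up placeholders; the syntax is taken verbatim from the list above):

```rust
let query_parser = QueryParser::for_index(&index, vec![price]);
// Inclusive lower bound, unbounded upper bound on the `price` field.
let query = query_parser.parse_query("price:[10 to *]").unwrap();
let top_docs = searcher.search(&query, &TopDocs::with_limit(10)).unwrap();
```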
Tantivy 0.6
==========================
@@ -264,10 +269,10 @@ Tantivy 0.6
Special thanks to @drusellers and @jason-wolfe for their contributions
to this release!
- Removed C code. Tantivy is now pure Rust. (@pmasurel)
- BM25 (@pmasurel)
- Approximate field norms encoded over 1 byte. (@pmasurel)
- Compiles on stable rust (@pmasurel)
- Removed C code. Tantivy is now pure Rust. (@fulmicoton)
- BM25 (@fulmicoton)
- Approximate field norms encoded over 1 byte. (@fulmicoton)
- Compiles on stable rust (@fulmicoton)
- Add &[u8] fastfield for associating arbitrary bytes to each document (@jason-wolfe) (#270)
- Completely uncompressed
- Internally: One u64 fast field for indexes, one fast field for the bytes themselves.
@@ -275,7 +280,7 @@ to this release!
- Add Stopword Filter support (@drusellers)
- Add a FuzzyTermQuery (@drusellers)
- Add a RegexQuery (@drusellers)
- Various performance improvements (@pmasurel)
- Various performance improvements (@fulmicoton)
Tantivy 0.5.2

View File

@@ -1,6 +1,6 @@
[package]
name = "tantivy"
version = "0.14.0-dev"
version = "0.14.0"
authors = ["Paul Masurel <paul.masurel@gmail.com>"]
license = "MIT"
categories = ["database-implementations", "data-structures"]
@@ -26,7 +26,6 @@ snap = "1"
tempfile = {version="3", optional=true}
log = "0.4"
serde = {version="1", features=["derive"]}
serde_cbor = "0.11"
serde_json = "1"
num_cpus = "1"
fs2={version="0.4", optional=true}
@@ -34,7 +33,7 @@ levenshtein_automata = "0.2"
uuid = { version = "0.8", features = ["v4", "serde"] }
crossbeam = "0.8"
futures = {version = "0.3", features=["thread-pool"] }
tantivy-query-grammar = { version="0.14.0-dev", path="./query-grammar" }
tantivy-query-grammar = { version="0.14.0", path="./query-grammar" }
stable_deref_trait = "1"
rust-stemmers = "1"
downcast-rs = "1"
@@ -54,10 +53,11 @@ lru = "0.6"
winapi = "0.3"
[dev-dependencies]
rand = "0.7"
rand = "0.8"
maplit = "1"
matches = "0.1.8"
proptest = "0.10"
criterion = "0.3"
[dev-dependencies.fail]
version = "0.4"
@@ -98,3 +98,7 @@ travis-ci = { repository = "tantivy-search/tantivy" }
name = "failpoints"
path = "tests/failpoints/mod.rs"
required-features = ["fail/failpoints"]
[[bench]]
name = "analyzer"
harness = false

View File

@@ -1,9 +1,9 @@
[![Build Status](https://travis-ci.org/tantivy-search/tantivy.svg?branch=master)](https://travis-ci.org/tantivy-search/tantivy)
[![codecov](https://codecov.io/gh/tantivy-search/tantivy/branch/master/graph/badge.svg)](https://codecov.io/gh/tantivy-search/tantivy)
[![Build Status](https://travis-ci.org/tantivy-search/tantivy.svg?branch=main)](https://travis-ci.org/tantivy-search/tantivy)
[![codecov](https://codecov.io/gh/tantivy-search/tantivy/branch/main/graph/badge.svg)](https://codecov.io/gh/tantivy-search/tantivy)
[![Join the chat at https://gitter.im/tantivy-search/tantivy](https://badges.gitter.im/tantivy-search/tantivy.svg)](https://gitter.im/tantivy-search/tantivy?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge)
[![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
[![Build status](https://ci.appveyor.com/api/projects/status/r7nb13kj23u8m9pj/branch/master?svg=true)](https://ci.appveyor.com/project/fulmicoton/tantivy/branch/master)
[![Build status](https://ci.appveyor.com/api/projects/status/r7nb13kj23u8m9pj/branch/main?svg=true)](https://ci.appveyor.com/project/fulmicoton/tantivy/branch/main)
[![Crates.io](https://img.shields.io/crates/v/tantivy.svg)](https://crates.io/crates/tantivy)
![Tantivy](https://tantivy-search.github.io/logo/tantivy-logo.png)

3774
benches/alice.txt Normal file

File diff suppressed because it is too large

22
benches/analyzer.rs Normal file
View File

@@ -0,0 +1,22 @@
use criterion::{criterion_group, criterion_main, Criterion};
use tantivy::tokenizer::TokenizerManager;
const ALICE_TXT: &'static str = include_str!("alice.txt");
pub fn criterion_benchmark(c: &mut Criterion) {
let tokenizer_manager = TokenizerManager::default();
let tokenizer = tokenizer_manager.get("default").unwrap();
c.bench_function("default-tokenize-alice", |b| {
b.iter(|| {
let mut word_count = 0;
let mut token_stream = tokenizer.token_stream(ALICE_TXT);
while token_stream.advance() {
word_count += 1;
}
assert_eq!(word_count, 30_731);
})
});
}
criterion_group!(benches, criterion_benchmark);
criterion_main!(benches);

View File

@@ -14,7 +14,7 @@ use tantivy::fastfield::FastFieldReader;
use tantivy::query::QueryParser;
use tantivy::schema::Field;
use tantivy::schema::{Schema, FAST, INDEXED, TEXT};
use tantivy::{doc, Index, Score, SegmentReader, TantivyError};
use tantivy::{doc, Index, Score, SegmentReader};
#[derive(Default)]
struct Stats {
@@ -72,16 +72,7 @@ impl Collector for StatsCollector {
_segment_local_id: u32,
segment_reader: &SegmentReader,
) -> tantivy::Result<StatsSegmentCollector> {
let fast_field_reader = segment_reader
.fast_fields()
.u64(self.field)
.ok_or_else(|| {
let field_name = segment_reader.schema().get_field_name(self.field);
TantivyError::SchemaError(format!(
"Field {:?} is not a u64 fast field.",
field_name
))
})?;
let fast_field_reader = segment_reader.fast_fields().u64(self.field)?;
Ok(StatsSegmentCollector {
fast_field_reader,
stats: Stats::default(),

View File

@@ -1,6 +1,6 @@
[package]
name = "tantivy-query-grammar"
version = "0.14.0-dev"
version = "0.14.0"
authors = ["Paul Masurel <paul.masurel@gmail.com>"]
license = "MIT"
categories = ["database-implementations", "data-structures"]

View File

@@ -398,6 +398,8 @@ impl<'a> Iterator for FacetChildIterator<'a> {
}
impl FacetCounts {
/// Returns an iterator over all of the facet count pairs inside this result.
/// See the documentation for `FacetCollector` for a usage example.
pub fn get<T>(&self, facet_from: T) -> FacetChildIterator<'_>
where
Facet: From<T>,
@@ -417,6 +419,8 @@ impl FacetCounts {
FacetChildIterator { underlying }
}
/// Returns a vector of top `k` facets with their counts, sorted highest-to-lowest by counts.
/// See the documentation for `FacetCollector` for a usage example.
pub fn top_k<T>(&self, facet: T, k: usize) -> Vec<(&Facet, u64)>
where
Facet: From<T>,

View File

@@ -9,8 +9,10 @@
// ---
// Importing tantivy...
use std::marker::PhantomData;
use crate::collector::{Collector, SegmentCollector};
use crate::fastfield::FastFieldReader;
use crate::fastfield::{FastFieldReader, FastValue};
use crate::schema::Field;
use crate::{Score, SegmentReader, TantivyError};
@@ -41,78 +43,98 @@ use crate::{Score, SegmentReader, TantivyError};
///
/// let query_parser = QueryParser::for_index(&index, vec![title]);
/// let query = query_parser.parse_query("diary").unwrap();
/// let no_filter_collector = FilterCollector::new(price, &|value| value > 20_120u64, TopDocs::with_limit(2));
/// let no_filter_collector = FilterCollector::new(price, &|value: u64| value > 20_120u64, TopDocs::with_limit(2));
/// let top_docs = searcher.search(&query, &no_filter_collector).unwrap();
///
/// assert_eq!(top_docs.len(), 1);
/// assert_eq!(top_docs[0].1, DocAddress(0, 1));
///
/// let filter_all_collector = FilterCollector::new(price, &|value| value < 5u64, TopDocs::with_limit(2));
/// let filter_all_collector: FilterCollector<_, _, u64> = FilterCollector::new(price, &|value| value < 5u64, TopDocs::with_limit(2));
/// let filtered_top_docs = searcher.search(&query, &filter_all_collector).unwrap();
///
/// assert_eq!(filtered_top_docs.len(), 0);
/// ```
pub struct FilterCollector<TCollector, TPredicate>
pub struct FilterCollector<TCollector, TPredicate, TPredicateValue: FastValue>
where
TPredicate: 'static,
{
field: Field,
collector: TCollector,
predicate: &'static TPredicate,
t_predicate_value: PhantomData<TPredicateValue>,
}
impl<TCollector, TPredicate> FilterCollector<TCollector, TPredicate>
impl<TCollector, TPredicate, TPredicateValue: FastValue>
FilterCollector<TCollector, TPredicate, TPredicateValue>
where
TCollector: Collector + Send + Sync,
TPredicate: Fn(u64) -> bool + Send + Sync,
TPredicate: Fn(TPredicateValue) -> bool + Send + Sync,
{
/// Create a new FilterCollector.
pub fn new(
field: Field,
predicate: &'static TPredicate,
collector: TCollector,
) -> FilterCollector<TCollector, TPredicate> {
) -> FilterCollector<TCollector, TPredicate, TPredicateValue> {
FilterCollector {
field,
predicate,
collector,
t_predicate_value: PhantomData,
}
}
}
impl<TCollector, TPredicate> Collector for FilterCollector<TCollector, TPredicate>
impl<TCollector, TPredicate, TPredicateValue: FastValue> Collector
for FilterCollector<TCollector, TPredicate, TPredicateValue>
where
TCollector: Collector + Send + Sync,
TPredicate: 'static + Fn(u64) -> bool + Send + Sync,
TPredicate: 'static + Fn(TPredicateValue) -> bool + Send + Sync,
TPredicateValue: 'static + FastValue,
{
// That's the type of our result.
// Our standard deviation will be a float.
type Fruit = TCollector::Fruit;
type Child = FilterSegmentCollector<TCollector::Child, TPredicate>;
type Child = FilterSegmentCollector<TCollector::Child, TPredicate, TPredicateValue>;
fn for_segment(
&self,
segment_local_id: u32,
segment_reader: &SegmentReader,
) -> crate::Result<FilterSegmentCollector<TCollector::Child, TPredicate>> {
) -> crate::Result<FilterSegmentCollector<TCollector::Child, TPredicate, TPredicateValue>> {
let schema = segment_reader.schema();
let field_entry = schema.get_field_entry(self.field);
if !field_entry.is_fast() {
return Err(TantivyError::SchemaError(format!(
"Field {:?} is not a fast field.",
field_entry.name()
)));
}
let requested_type = TPredicateValue::to_type();
let field_schema_type = field_entry.field_type().value_type();
if requested_type != field_schema_type {
return Err(TantivyError::SchemaError(format!(
"Field {:?} is of type {:?}!={:?}",
field_entry.name(),
requested_type,
field_schema_type
)));
}
let fast_field_reader = segment_reader
.fast_fields()
.u64(self.field)
.ok_or_else(|| {
let field_name = segment_reader.schema().get_field_name(self.field);
TantivyError::SchemaError(format!(
"Field {:?} is not a u64 fast field.",
field_name
))
})?;
.typed_fast_field_reader(self.field)?;
let segment_collector = self
.collector
.for_segment(segment_local_id, segment_reader)?;
Ok(FilterSegmentCollector {
fast_field_reader,
segment_collector,
predicate: self.predicate,
t_predicate_value: PhantomData,
})
}
@@ -128,20 +150,23 @@ where
}
}
pub struct FilterSegmentCollector<TSegmentCollector, TPredicate>
pub struct FilterSegmentCollector<TSegmentCollector, TPredicate, TPredicateValue>
where
TPredicate: 'static,
TPredicateValue: 'static + FastValue,
{
fast_field_reader: FastFieldReader<u64>,
fast_field_reader: FastFieldReader<TPredicateValue>,
segment_collector: TSegmentCollector,
predicate: &'static TPredicate,
t_predicate_value: PhantomData<TPredicateValue>,
}
impl<TSegmentCollector, TPredicate> SegmentCollector
for FilterSegmentCollector<TSegmentCollector, TPredicate>
impl<TSegmentCollector, TPredicate, TPredicateValue> SegmentCollector
for FilterSegmentCollector<TSegmentCollector, TPredicate, TPredicateValue>
where
TSegmentCollector: SegmentCollector,
TPredicate: 'static + Fn(u64) -> bool + Send + Sync,
TPredicate: 'static + Fn(TPredicateValue) -> bool + Send + Sync,
TPredicateValue: 'static + FastValue,
{
type Fruit = TSegmentCollector::Fruit;

View File

@@ -109,6 +109,7 @@ pub use self::tweak_score_top_collector::{ScoreSegmentTweaker, ScoreTweaker};
mod facet_collector;
pub use self::facet_collector::FacetCollector;
pub use self::facet_collector::FacetCounts;
use crate::query::Weight;
mod docset_collector;

View File

@@ -8,6 +8,13 @@ use crate::DocId;
use crate::Score;
use crate::SegmentLocalId;
use crate::collector::{FilterCollector, TopDocs};
use crate::query::QueryParser;
use crate::schema::{Schema, FAST, TEXT};
use crate::DateTime;
use crate::{doc, Index};
use std::str::FromStr;
pub const TEST_COLLECTOR_WITH_SCORE: TestCollector = TestCollector {
compute_score: true,
};
@@ -16,6 +23,54 @@ pub const TEST_COLLECTOR_WITHOUT_SCORE: TestCollector = TestCollector {
compute_score: true,
};
#[test]
pub fn test_filter_collector() {
let mut schema_builder = Schema::builder();
let title = schema_builder.add_text_field("title", TEXT);
let price = schema_builder.add_u64_field("price", FAST);
let date = schema_builder.add_date_field("date", FAST);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_with_num_threads(1, 10_000_000).unwrap();
index_writer.add_document(doc!(title => "The Name of the Wind", price => 30_200u64, date => DateTime::from_str("1898-04-09T00:00:00+00:00").unwrap()));
index_writer.add_document(doc!(title => "The Diary of Muadib", price => 29_240u64, date => DateTime::from_str("2020-04-09T00:00:00+00:00").unwrap()));
index_writer.add_document(doc!(title => "The Diary of Anne Frank", price => 18_240u64, date => DateTime::from_str("2019-04-20T00:00:00+00:00").unwrap()));
index_writer.add_document(doc!(title => "A Dairy Cow", price => 21_240u64, date => DateTime::from_str("2019-04-09T00:00:00+00:00").unwrap()));
index_writer.add_document(doc!(title => "The Diary of a Young Girl", price => 20_120u64, date => DateTime::from_str("2018-04-09T00:00:00+00:00").unwrap()));
assert!(index_writer.commit().is_ok());
let reader = index.reader().unwrap();
let searcher = reader.searcher();
let query_parser = QueryParser::for_index(&index, vec![title]);
let query = query_parser.parse_query("diary").unwrap();
let filter_some_collector = FilterCollector::new(
price,
&|value: u64| value > 20_120u64,
TopDocs::with_limit(2),
);
let top_docs = searcher.search(&query, &filter_some_collector).unwrap();
assert_eq!(top_docs.len(), 1);
assert_eq!(top_docs[0].1, DocAddress(0, 1));
let filter_all_collector: FilterCollector<_, _, u64> =
FilterCollector::new(price, &|value| value < 5u64, TopDocs::with_limit(2));
let filtered_top_docs = searcher.search(&query, &filter_all_collector).unwrap();
assert_eq!(filtered_top_docs.len(), 0);
fn date_filter(value: DateTime) -> bool {
(value - DateTime::from_str("2019-04-09T00:00:00+00:00").unwrap()).num_weeks() > 0
}
let filter_dates_collector = FilterCollector::new(date, &date_filter, TopDocs::with_limit(5));
let filtered_date_docs = searcher.search(&query, &filter_dates_collector).unwrap();
assert_eq!(filtered_date_docs.len(), 2);
}
/// Stores all of the doc ids.
/// This collector is only used for tests.
/// It is unusable in pr
@@ -185,12 +240,7 @@ impl Collector for BytesFastFieldTestCollector {
_segment_local_id: u32,
segment_reader: &SegmentReader,
) -> crate::Result<BytesFastFieldSegmentCollector> {
let reader = segment_reader
.fast_fields()
.bytes(self.field)
.ok_or_else(|| {
crate::TantivyError::InvalidArgument("Field is not a bytes fast field.".to_string())
})?;
let reader = segment_reader.fast_fields().bytes(self.field)?;
Ok(BytesFastFieldSegmentCollector {
vals: Vec::new(),
reader,

View File

@@ -2,9 +2,9 @@ use crate::DocAddress;
use crate::DocId;
use crate::SegmentLocalId;
use crate::SegmentReader;
use serde::export::PhantomData;
use std::cmp::Ordering;
use std::collections::BinaryHeap;
use std::marker::PhantomData;
/// Contains a feature (field, score, etc.) of a document along with the document address.
///

View File

@@ -146,15 +146,14 @@ impl CustomScorer<u64> for ScorerByField {
type Child = ScorerByFastFieldReader;
fn segment_scorer(&self, segment_reader: &SegmentReader) -> crate::Result<Self::Child> {
let ff_reader = segment_reader
// We interpret this field as u64, regardless of its type, that way,
// we avoid needless conversion. Regardless of the fast field type, the
// mapping is monotonic, so it is sufficient to compute our top-K docs.
//
// The conversion will then happen only on the top-K docs.
let ff_reader: FastFieldReader<u64> = segment_reader
.fast_fields()
.u64_lenient(self.field)
.ok_or_else(|| {
crate::TantivyError::SchemaError(format!(
"Field requested ({:?}) is not a fast field.",
self.field
))
})?;
.typed_fast_field_reader(self.field)?;
Ok(ScorerByFastFieldReader { ff_reader })
}
}
@@ -232,7 +231,7 @@ impl TopDocs {
/// # let title = schema_builder.add_text_field("title", TEXT);
/// # let rating = schema_builder.add_u64_field("rating", FAST);
/// # let schema = schema_builder.build();
/// #
/// #
/// # let index = Index::create_in_ram(schema);
/// # let mut index_writer = index.writer_with_num_threads(1, 10_000_000)?;
/// # index_writer.add_document(doc!(title => "The Name of the Wind", rating => 92u64));
@@ -262,7 +261,7 @@ impl TopDocs {
/// let top_books_by_rating = TopDocs
/// ::with_limit(10)
/// .order_by_u64_field(rating_field);
///
///
/// // ... and here are our documents. Note this is a simple vec.
/// // The `u64` in the pair is the value of our fast field for
/// // each document.
@@ -272,13 +271,13 @@ impl TopDocs {
/// // query.
/// let resulting_docs: Vec<(u64, DocAddress)> =
/// searcher.search(query, &top_books_by_rating)?;
///
///
/// Ok(resulting_docs)
/// }
/// ```
///
/// # See also
///
///
/// To comfortably work with `u64`s, `i64`s, `f64`s, or `date`s, please refer to
/// [.order_by_fast_field(...)](#method.order_by_fast_field) method.
pub fn order_by_u64_field(
@@ -290,7 +289,7 @@ impl TopDocs {
/// Set top-K to rank documents by a given fast field.
///
/// If the field is not a fast field, or its field type does not match the generic type, this method does not panic,
/// but an explicit error will be returned at the moment of collection.
///
/// Note that this method is generic. The requested fast field type will often be
@@ -314,7 +313,7 @@ impl TopDocs {
/// # let title = schema_builder.add_text_field("company", TEXT);
/// # let rating = schema_builder.add_i64_field("revenue", FAST);
/// # let schema = schema_builder.build();
/// #
/// #
/// # let index = Index::create_in_ram(schema);
/// # let mut index_writer = index.writer_with_num_threads(1, 10_000_000)?;
/// # index_writer.add_document(doc!(title => "MadCow Inc.", rating => 92_000_000i64));
@@ -343,7 +342,7 @@ impl TopDocs {
/// let top_company_by_revenue = TopDocs
/// ::with_limit(2)
/// .order_by_fast_field(revenue_field);
///
///
/// // ... and here are our documents. Note this is a simple vec.
/// // The `i64` in the pair is the value of our fast field for
/// // each document.
@@ -353,7 +352,7 @@ impl TopDocs {
/// // query.
/// let resulting_docs: Vec<(i64, DocAddress)> =
/// searcher.search(query, &top_company_by_revenue)?;
///
///
/// Ok(resulting_docs)
/// }
/// ```
@@ -392,7 +391,7 @@ impl TopDocs {
///
/// In the following example we will tweak our ranking a bit by
/// boosting popular products a notch.
///
///
/// In a more serious application, this tweaking could involve running a
/// learning-to-rank model over various features
///
@@ -523,7 +522,7 @@ impl TopDocs {
/// # let index = Index::create_in_ram(schema);
/// # let mut index_writer = index.writer_with_num_threads(1, 10_000_000)?;
/// # let product_name = index.schema().get_field("product_name").unwrap();
/// #
/// #
/// let popularity: Field = index.schema().get_field("popularity").unwrap();
/// let boosted: Field = index.schema().get_field("boosted").unwrap();
/// # index_writer.add_document(doc!(boosted=>1u64, product_name => "The Diary of Muadib", popularity => 1u64));
@@ -557,7 +556,7 @@ impl TopDocs {
/// segment_reader.fast_fields().u64(popularity).unwrap();
/// let boosted_reader =
/// segment_reader.fast_fields().u64(boosted).unwrap();
///
///
/// // We can now define our actual scoring function
/// move |doc: DocId| {
/// let popularity: u64 = popularity_reader.get(doc);
@@ -994,9 +993,7 @@ mod tests {
let segment = searcher.segment_reader(0);
let top_collector = TopDocs::with_limit(4).order_by_u64_field(size);
let err = top_collector.for_segment(0, segment).err().unwrap();
assert!(
matches!(err, crate::TantivyError::SchemaError(msg) if msg == "Field requested (Field(0)) is not a fast field.")
);
assert!(matches!(err, crate::TantivyError::SchemaError(_)));
Ok(())
}

View File

@@ -115,11 +115,16 @@ pub fn u64_to_i64(val: u64) -> i64 {
/// For simplicity, tantivy internally handles `f64` as `u64`.
/// The mapping is defined by this function.
///
/// Maps `f64` to `u64` so that lexical order is preserved.
/// Maps `f64` to `u64` in a monotonic manner, so that bytes lexical order is preserved.
///
/// This is more suited than simply casting (`val as u64`)
/// which would truncate the result
///
/// # Reference
///
/// Daniel Lemire's [blog post](https://lemire.me/blog/2020/12/14/converting-floating-point-numbers-to-integers-while-preserving-order/)
/// explains the mapping in a clear manner.
///
/// # See also
/// The [reverse mapping is `u64_to_f64`](./fn.u64_to_f64.html).
#[inline(always)]
@@ -148,6 +153,7 @@ pub(crate) mod test {
pub use super::minmax;
pub use super::serialize::test::fixed_size_test;
use super::{compute_num_bits, f64_to_u64, i64_to_u64, u64_to_f64, u64_to_i64};
use proptest::prelude::*;
use std::f64;
fn test_i64_converter_helper(val: i64) {
@@ -158,6 +164,15 @@ pub(crate) mod test {
assert_eq!(u64_to_f64(f64_to_u64(val)), val);
}
proptest! {
#[test]
fn test_f64_converter_monotonicity_proptest((left, right) in (proptest::num::f64::NORMAL, proptest::num::f64::NORMAL)) {
let left_u64 = f64_to_u64(left);
let right_u64 = f64_to_u64(right);
assert_eq!(left_u64 < right_u64, left < right);
}
}
#[test]
fn test_i64_converter() {
assert_eq!(i64_to_u64(i64::min_value()), u64::min_value());
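The hunk above documents the monotonic `f64`→`u64` mapping and property-tests it. For reference, the usual order-preserving encoding looks roughly like this (a sketch of the idea only, not necessarily tantivy's exact `f64_to_u64` implementation):

```rust
fn f64_to_u64_monotonic(val: f64) -> u64 {
    let bits = val.to_bits();
    if val.is_sign_negative() {
        !bits                 // negative floats: flipping every bit reverses their descending bit order
    } else {
        bits ^ (1u64 << 63)   // non-negative floats: flipping the sign bit moves them above all negatives
    }
}
```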

View File

@@ -35,12 +35,21 @@ fn load_metas(
inventory: &SegmentMetaInventory,
) -> crate::Result<IndexMeta> {
let meta_data = directory.atomic_read(&META_FILEPATH)?;
let meta_string = String::from_utf8_lossy(&meta_data);
let meta_string = String::from_utf8(meta_data).map_err(|_utf8_err| {
error!("Meta data is not valid utf8.");
DataCorruption::new(
META_FILEPATH.to_path_buf(),
"Meta file does not contain valid utf8 file.".to_string(),
)
})?;
IndexMeta::deserialize(&meta_string, &inventory)
.map_err(|e| {
DataCorruption::new(
META_FILEPATH.to_path_buf(),
format!("Meta file cannot be deserialized. {:?}.", e),
format!(
"Meta file cannot be deserialized. {:?}. Content: {:?}",
e, meta_string
),
)
})
.map_err(From::from)

View File

@@ -114,12 +114,7 @@ impl SegmentReader {
field_entry.name()
)));
}
let term_ords_reader = self.fast_fields().u64s(field).ok_or_else(|| {
DataCorruption::comment_only(format!(
"Cannot find data for hierarchical facet {:?}",
field_entry.name()
))
})?;
let term_ords_reader = self.fast_fields().u64s(field)?;
let termdict = self
.termdict_composite
.open_read(field)
@@ -183,8 +178,10 @@ impl SegmentReader {
let fast_fields_data = segment.open_read(SegmentComponent::FASTFIELDS)?;
let fast_fields_composite = CompositeFile::open(&fast_fields_data)?;
let fast_field_readers =
Arc::new(FastFieldReaders::load_all(&schema, &fast_fields_composite)?);
let fast_field_readers = Arc::new(FastFieldReaders::new(
schema.clone(),
fast_fields_composite,
)?);
let fieldnorm_data = segment.open_read(SegmentComponent::FIELDNORMS)?;
let fieldnorm_readers = FieldNormReaders::open(fieldnorm_data)?;
@@ -310,7 +307,7 @@ impl SegmentReader {
}
/// Returns an iterator that will iterate over the alive document ids
pub fn doc_ids_alive<'a>(&'a self) -> impl Iterator<Item = DocId> + 'a {
pub fn doc_ids_alive(&self) -> impl Iterator<Item = DocId> + '_ {
(0u32..self.max_doc).filter(move |doc| !self.is_deleted(*doc))
}

View File

@@ -58,7 +58,8 @@ pub enum OpenWriteError {
}
impl OpenWriteError {
pub(crate) fn wrap_io_error(io_error: io::Error, filepath: PathBuf) -> Self {
/// Wraps an io error.
pub fn wrap_io_error(io_error: io::Error, filepath: PathBuf) -> Self {
Self::IOError { io_error, filepath }
}
}
@@ -143,7 +144,8 @@ pub enum OpenReadError {
}
impl OpenReadError {
pub(crate) fn wrap_io_error(io_error: io::Error, filepath: PathBuf) -> Self {
/// Wraps an io error.
pub fn wrap_io_error(io_error: io::Error, filepath: PathBuf) -> Self {
Self::IOError { io_error, filepath }
}
}

View File

@@ -3,7 +3,7 @@ use crc32fast::Hasher;
use std::fs;
use std::io;
use std::io::BufRead;
use std::path::PathBuf;
use std::path::Path;
use std::sync::atomic::{AtomicUsize, Ordering};
use std::sync::Arc;
use std::thread;
@@ -13,15 +13,15 @@ pub const POLLING_INTERVAL: Duration = Duration::from_millis(if cfg!(test) { 1 }
// Watches a file and executes registered callbacks when the file is modified.
pub struct FileWatcher {
path: Arc<PathBuf>,
path: Arc<Path>,
callbacks: Arc<WatchCallbackList>,
state: Arc<AtomicUsize>, // 0: new, 1: runnable, 2: terminated
}
impl FileWatcher {
pub fn new(path: &PathBuf) -> FileWatcher {
pub fn new(path: &Path) -> FileWatcher {
FileWatcher {
path: Arc::new(path.clone()),
path: Arc::from(path),
callbacks: Default::default(),
state: Default::default(),
}
@@ -63,7 +63,7 @@ impl FileWatcher {
handle
}
fn compute_checksum(path: &PathBuf) -> Result<u32, io::Error> {
fn compute_checksum(path: &Path) -> Result<u32, io::Error> {
let reader = match fs::File::open(path) {
Ok(f) => io::BufReader::new(f),
Err(e) => {

View File

@@ -115,6 +115,18 @@ impl Footer {
}
Ok(())
}
VersionedFooter::V3 {
crc32: _crc,
store_compression,
} => {
if &library_version.store_compression != store_compression {
return Err(Incompatibility::CompressionMismatch {
library_compression_format: library_version.store_compression.to_string(),
index_compression_format: store_compression.to_string(),
});
}
Ok(())
}
VersionedFooter::UnknownVersion => Err(Incompatibility::IndexMismatch {
library_version: library_version.clone(),
index_version: self.version.clone(),
@@ -136,24 +148,31 @@ pub enum VersionedFooter {
crc32: CrcHashU32,
store_compression: String,
},
// Block wand max term freq on 1 byte
V3 {
crc32: CrcHashU32,
store_compression: String,
},
}
impl BinarySerializable for VersionedFooter {
fn serialize<W: io::Write>(&self, writer: &mut W) -> io::Result<()> {
let mut buf = Vec::new();
match self {
VersionedFooter::V2 {
VersionedFooter::V3 {
crc32,
store_compression: compression,
} => {
// Serializes a valid `VersionedFooter` or panics if the version is unknown
// [ version | crc_hash | compression_mode ]
// [ 0..4 | 4..8 | variable ]
BinarySerializable::serialize(&2u32, &mut buf)?;
BinarySerializable::serialize(&3u32, &mut buf)?;
BinarySerializable::serialize(crc32, &mut buf)?;
BinarySerializable::serialize(compression, &mut buf)?;
}
VersionedFooter::V1 { .. } | VersionedFooter::UnknownVersion => {
VersionedFooter::V2 { .. }
| VersionedFooter::V1 { .. }
| VersionedFooter::UnknownVersion => {
return Err(io::Error::new(
io::ErrorKind::InvalidInput,
"Cannot serialize an unknown versioned footer ",
@@ -182,7 +201,7 @@ impl BinarySerializable for VersionedFooter {
reader.read_exact(&mut buf[..])?;
let mut cursor = &buf[..];
let version = u32::deserialize(&mut cursor)?;
if version != 1 && version != 2 {
if version > 3 {
return Ok(VersionedFooter::UnknownVersion);
}
let crc32 = u32::deserialize(&mut cursor)?;
@@ -192,12 +211,17 @@ impl BinarySerializable for VersionedFooter {
crc32,
store_compression,
}
} else {
assert_eq!(version, 2);
} else if version == 2 {
VersionedFooter::V2 {
crc32,
store_compression,
}
} else {
assert_eq!(version, 3);
VersionedFooter::V3 {
crc32,
store_compression,
}
})
}
}
@@ -205,6 +229,7 @@ impl BinarySerializable for VersionedFooter {
impl VersionedFooter {
pub fn crc(&self) -> Option<CrcHashU32> {
match self {
VersionedFooter::V3 { crc32, .. } => Some(*crc32),
VersionedFooter::V2 { crc32, .. } => Some(*crc32),
VersionedFooter::V1 { crc32, .. } => Some(*crc32),
VersionedFooter::UnknownVersion { .. } => None,
@@ -243,7 +268,7 @@ impl<W: TerminatingWrite> Write for FooterProxy<W> {
impl<W: TerminatingWrite> TerminatingWrite for FooterProxy<W> {
fn terminate_ref(&mut self, _: AntiCallToken) -> io::Result<()> {
let crc32 = self.hasher.take().unwrap().finalize();
let footer = Footer::new(VersionedFooter::V2 {
let footer = Footer::new(VersionedFooter::V3 {
crc32,
store_compression: crate::store::COMPRESSION.to_string(),
});
@@ -278,7 +303,7 @@ mod tests {
let footer = Footer::deserialize(&mut &vec[..]).unwrap();
assert!(matches!(
footer.versioned_footer,
VersionedFooter::V2 { store_compression, .. }
VersionedFooter::V3 { store_compression, .. }
if store_compression == crate::store::COMPRESSION
));
assert_eq!(&footer.version, crate::version());
@@ -288,7 +313,7 @@ mod tests {
fn test_serialize_deserialize_footer() {
let mut buffer = Vec::new();
let crc32 = 123456u32;
let footer: Footer = Footer::new(VersionedFooter::V2 {
let footer: Footer = Footer::new(VersionedFooter::V3 {
crc32,
store_compression: "lz4".to_string(),
});
@@ -300,7 +325,7 @@ mod tests {
#[test]
fn footer_length() {
let crc32 = 1111111u32;
let versioned_footer = VersionedFooter::V2 {
let versioned_footer = VersionedFooter::V3 {
crc32,
store_compression: "lz4".to_string(),
};
@@ -321,7 +346,7 @@ mod tests {
// versioned footer length
12 | 128,
// index format version
2,
3,
0,
0,
0,
@@ -340,7 +365,7 @@ mod tests {
let versioned_footer = VersionedFooter::deserialize(&mut cursor).unwrap();
assert!(cursor.is_empty());
let expected_crc: u32 = LittleEndian::read_u32(&v_footer_bytes[5..9]) as CrcHashU32;
let expected_versioned_footer: VersionedFooter = VersionedFooter::V2 {
let expected_versioned_footer: VersionedFooter = VersionedFooter::V3 {
crc32: expected_crc,
store_compression: "lz4".to_string(),
};

View File

@@ -226,13 +226,9 @@ impl Directory for RAMDirectory {
)));
let path_buf = PathBuf::from(path);
// Reserve the path to prevent calls to .write() to succeed.
self.fs.write().unwrap().write(path_buf.clone(), &[]);
self.fs.write().unwrap().write(path_buf, data);
let mut vec_writer = VecWriter::new(path_buf, self.clone());
vec_writer.write_all(data)?;
vec_writer.flush()?;
if path == Path::new(&*META_FILEPATH) {
if path == *META_FILEPATH {
let _ = self.fs.write().unwrap().watch_router.broadcast();
}
Ok(())

View File

@@ -1,4 +1,4 @@
use super::MultiValueIntFastFieldReader;
use super::MultiValuedFastFieldReader;
use crate::error::DataCorruption;
use crate::schema::Facet;
use crate::termdict::TermDictionary;
@@ -20,7 +20,7 @@ use std::str;
/// list of facets. This ordinal is segment local and
/// only makes sense for a given segment.
pub struct FacetReader {
term_ords: MultiValueIntFastFieldReader<u64>,
term_ords: MultiValuedFastFieldReader<u64>,
term_dict: TermDictionary,
buffer: Vec<u8>,
}
@@ -29,12 +29,12 @@ impl FacetReader {
/// Creates a new `FacetReader`.
///
/// A facet reader just wraps :
/// - a `MultiValueIntFastFieldReader` that makes it possible to
/// - a `MultiValuedFastFieldReader` that makes it possible to
/// access the list of facet ords for a given document.
/// - a `TermDictionary` that helps associating a facet to
/// an ordinal and vice versa.
pub fn new(
term_ords: MultiValueIntFastFieldReader<u64>,
term_ords: MultiValuedFastFieldReader<u64>,
term_dict: TermDictionary,
) -> FacetReader {
FacetReader {

View File

@@ -28,7 +28,7 @@ pub use self::delete::write_delete_bitset;
pub use self::delete::DeleteBitSet;
pub use self::error::{FastFieldNotAvailableError, Result};
pub use self::facet_reader::FacetReader;
pub use self::multivalued::{MultiValueIntFastFieldReader, MultiValueIntFastFieldWriter};
pub use self::multivalued::{MultiValuedFastFieldReader, MultiValuedFastFieldWriter};
pub use self::reader::FastFieldReader;
pub use self::readers::FastFieldReaders;
pub use self::serializer::FastFieldSerializer;

View File

@@ -1,8 +1,8 @@
mod reader;
mod writer;
pub use self::reader::MultiValueIntFastFieldReader;
pub use self::writer::MultiValueIntFastFieldWriter;
pub use self::reader::MultiValuedFastFieldReader;
pub use self::writer::MultiValuedFastFieldWriter;
#[cfg(test)]
mod tests {

View File

@@ -10,29 +10,22 @@ use crate::DocId;
/// The `idx_reader` associates, for each document, the index of its first value.
///
#[derive(Clone)]
pub struct MultiValueIntFastFieldReader<Item: FastValue> {
pub struct MultiValuedFastFieldReader<Item: FastValue> {
idx_reader: FastFieldReader<u64>,
vals_reader: FastFieldReader<Item>,
}
impl<Item: FastValue> MultiValueIntFastFieldReader<Item> {
impl<Item: FastValue> MultiValuedFastFieldReader<Item> {
pub(crate) fn open(
idx_reader: FastFieldReader<u64>,
vals_reader: FastFieldReader<Item>,
) -> MultiValueIntFastFieldReader<Item> {
MultiValueIntFastFieldReader {
) -> MultiValuedFastFieldReader<Item> {
MultiValuedFastFieldReader {
idx_reader,
vals_reader,
}
}
pub(crate) fn into_u64s_reader(self) -> MultiValueIntFastFieldReader<u64> {
MultiValueIntFastFieldReader {
idx_reader: self.idx_reader,
vals_reader: self.vals_reader.into_u64_reader(),
}
}
/// Returns `(start, stop)`, such that the values associated
/// to the given document are `start..stop`.
fn range(&self, doc: DocId) -> (u64, u64) {

View File

@@ -18,7 +18,7 @@ use std::io;
/// in your schema
/// - add your document simply by calling `.add_document(...)`.
///
/// The `MultiValueIntFastFieldWriter` can be acquired from the
/// The `MultiValuedFastFieldWriter` can be acquired from the
/// fastfield writer, by calling [`.get_multivalue_writer(...)`](./struct.FastFieldsWriter.html#method.get_multivalue_writer).
///
/// Once acquired, writing is done by calling calls to
@@ -29,17 +29,17 @@ use std::io;
/// This makes it possible to push unordered term ids,
/// during indexing and remap them to their respective
/// term ids when the segment is getting serialized.
pub struct MultiValueIntFastFieldWriter {
pub struct MultiValuedFastFieldWriter {
field: Field,
vals: Vec<UnorderedTermId>,
doc_index: Vec<u64>,
is_facet: bool,
}
impl MultiValueIntFastFieldWriter {
impl MultiValuedFastFieldWriter {
/// Creates a new `IntFastFieldWriter`
pub(crate) fn new(field: Field, is_facet: bool) -> Self {
MultiValueIntFastFieldWriter {
MultiValuedFastFieldWriter {
field,
vals: Vec::new(),
doc_index: Vec::new(),
@@ -47,7 +47,7 @@ impl MultiValueIntFastFieldWriter {
}
}
/// Access the field associated to the `MultiValueIntFastFieldWriter`
/// Access the field associated to the `MultiValuedFastFieldWriter`
pub fn field(&self) -> Field {
self.field
}

View File

@@ -42,15 +42,6 @@ impl<Item: FastValue> FastFieldReader<Item> {
})
}
pub(crate) fn into_u64_reader(self) -> FastFieldReader<u64> {
FastFieldReader {
bit_unpacker: self.bit_unpacker,
min_value_u64: self.min_value_u64,
max_value_u64: self.max_value_u64,
_phantom: PhantomData,
}
}
/// Return the value associated to the given document.
///
/// This accessor should return as fast as possible.

View File

@@ -1,28 +1,22 @@
use crate::common::CompositeFile;
use crate::fastfield::BytesFastFieldReader;
use crate::fastfield::MultiValueIntFastFieldReader;
use crate::directory::FileSlice;
use crate::fastfield::MultiValuedFastFieldReader;
use crate::fastfield::{BytesFastFieldReader, FastValue};
use crate::fastfield::{FastFieldNotAvailableError, FastFieldReader};
use crate::schema::{Cardinality, Field, FieldType, Schema};
use crate::space_usage::PerFieldSpaceUsage;
use std::collections::HashMap;
use crate::TantivyError;
/// Provides access to all of the FastFieldReader.
///
/// Internally, `FastFieldReaders` have preloaded fast field readers,
/// and just wraps several `HashMap`.
#[derive(Clone)]
pub struct FastFieldReaders {
fast_field_i64: HashMap<Field, FastFieldReader<i64>>,
fast_field_u64: HashMap<Field, FastFieldReader<u64>>,
fast_field_f64: HashMap<Field, FastFieldReader<f64>>,
fast_field_date: HashMap<Field, FastFieldReader<crate::DateTime>>,
fast_field_i64s: HashMap<Field, MultiValueIntFastFieldReader<i64>>,
fast_field_u64s: HashMap<Field, MultiValueIntFastFieldReader<u64>>,
fast_field_f64s: HashMap<Field, MultiValueIntFastFieldReader<f64>>,
fast_field_dates: HashMap<Field, MultiValueIntFastFieldReader<crate::DateTime>>,
fast_bytes: HashMap<Field, BytesFastFieldReader>,
schema: Schema,
fast_fields_composite: CompositeFile,
}
#[derive(Eq, PartialEq, Debug)]
enum FastType {
I64,
U64,
@@ -50,228 +44,167 @@ fn type_and_cardinality(field_type: &FieldType) -> Option<(FastType, Cardinality
}
impl FastFieldReaders {
pub(crate) fn load_all(
schema: &Schema,
fast_fields_composite: &CompositeFile,
pub(crate) fn new(
schema: Schema,
fast_fields_composite: CompositeFile,
) -> crate::Result<FastFieldReaders> {
let mut fast_field_readers = FastFieldReaders {
fast_field_i64: Default::default(),
fast_field_u64: Default::default(),
fast_field_f64: Default::default(),
fast_field_date: Default::default(),
fast_field_i64s: Default::default(),
fast_field_u64s: Default::default(),
fast_field_f64s: Default::default(),
fast_field_dates: Default::default(),
fast_bytes: Default::default(),
fast_fields_composite: fast_fields_composite.clone(),
};
for (field, field_entry) in schema.fields() {
let field_type = field_entry.field_type();
if let FieldType::Bytes(bytes_option) = field_type {
if !bytes_option.is_fast() {
continue;
}
let fast_field_idx_file = fast_fields_composite
.open_read_with_idx(field, 0)
.ok_or_else(|| FastFieldNotAvailableError::new(field_entry))?;
let idx_reader = FastFieldReader::open(fast_field_idx_file)?;
let data = fast_fields_composite
.open_read_with_idx(field, 1)
.ok_or_else(|| FastFieldNotAvailableError::new(field_entry))?;
let bytes_fast_field_reader = BytesFastFieldReader::open(idx_reader, data)?;
fast_field_readers
.fast_bytes
.insert(field, bytes_fast_field_reader);
} else if let Some((fast_type, cardinality)) = type_and_cardinality(field_type) {
match cardinality {
Cardinality::SingleValue => {
if let Some(fast_field_data) = fast_fields_composite.open_read(field) {
match fast_type {
FastType::U64 => {
let fast_field_reader = FastFieldReader::open(fast_field_data)?;
fast_field_readers
.fast_field_u64
.insert(field, fast_field_reader);
}
FastType::I64 => {
let fast_field_reader =
FastFieldReader::open(fast_field_data.clone())?;
fast_field_readers
.fast_field_i64
.insert(field, fast_field_reader);
}
FastType::F64 => {
let fast_field_reader =
FastFieldReader::open(fast_field_data.clone())?;
fast_field_readers
.fast_field_f64
.insert(field, fast_field_reader);
}
FastType::Date => {
let fast_field_reader =
FastFieldReader::open(fast_field_data.clone())?;
fast_field_readers
.fast_field_date
.insert(field, fast_field_reader);
}
}
} else {
return Err(From::from(FastFieldNotAvailableError::new(field_entry)));
}
}
Cardinality::MultiValues => {
let idx_opt = fast_fields_composite.open_read_with_idx(field, 0);
let data_opt = fast_fields_composite.open_read_with_idx(field, 1);
if let (Some(fast_field_idx), Some(fast_field_data)) = (idx_opt, data_opt) {
let idx_reader = FastFieldReader::open(fast_field_idx)?;
match fast_type {
FastType::I64 => {
let vals_reader = FastFieldReader::open(fast_field_data)?;
let multivalued_int_fast_field =
MultiValueIntFastFieldReader::open(idx_reader, vals_reader);
fast_field_readers
.fast_field_i64s
.insert(field, multivalued_int_fast_field);
}
FastType::U64 => {
let vals_reader = FastFieldReader::open(fast_field_data)?;
let multivalued_int_fast_field =
MultiValueIntFastFieldReader::open(idx_reader, vals_reader);
fast_field_readers
.fast_field_u64s
.insert(field, multivalued_int_fast_field);
}
FastType::F64 => {
let vals_reader = FastFieldReader::open(fast_field_data)?;
let multivalued_int_fast_field =
MultiValueIntFastFieldReader::open(idx_reader, vals_reader);
fast_field_readers
.fast_field_f64s
.insert(field, multivalued_int_fast_field);
}
FastType::Date => {
let vals_reader = FastFieldReader::open(fast_field_data)?;
let multivalued_int_fast_field =
MultiValueIntFastFieldReader::open(idx_reader, vals_reader);
fast_field_readers
.fast_field_dates
.insert(field, multivalued_int_fast_field);
}
}
} else {
return Err(From::from(FastFieldNotAvailableError::new(field_entry)));
}
}
}
}
}
Ok(fast_field_readers)
Ok(FastFieldReaders {
fast_fields_composite,
schema,
})
}
pub(crate) fn space_usage(&self) -> PerFieldSpaceUsage {
self.fast_fields_composite.space_usage()
}
fn fast_field_data(&self, field: Field, idx: usize) -> crate::Result<FileSlice> {
self.fast_fields_composite
.open_read_with_idx(field, idx)
.ok_or_else(|| {
let field_name = self.schema.get_field_entry(field).name();
TantivyError::SchemaError(format!("Field({}) data was not found", field_name))
})
}
fn check_type(
&self,
field: Field,
expected_fast_type: FastType,
expected_cardinality: Cardinality,
) -> crate::Result<()> {
let field_entry = self.schema.get_field_entry(field);
let (fast_type, cardinality) =
type_and_cardinality(field_entry.field_type()).ok_or_else(|| {
crate::TantivyError::SchemaError(format!(
"Field {:?} is not a fast field.",
field_entry.name()
))
})?;
if fast_type != expected_fast_type {
return Err(crate::TantivyError::SchemaError(format!(
"Field {:?} is of type {:?}, expected {:?}.",
field_entry.name(),
fast_type,
expected_fast_type
)));
}
if cardinality != expected_cardinality {
return Err(crate::TantivyError::SchemaError(format!(
"Field {:?} is of cardinality {:?}, expected {:?}.",
field_entry.name(),
cardinality,
expected_cardinality
)));
}
Ok(())
}
pub(crate) fn typed_fast_field_reader<TFastValue: FastValue>(
&self,
field: Field,
) -> crate::Result<FastFieldReader<TFastValue>> {
let fast_field_slice = self.fast_field_data(field, 0)?;
FastFieldReader::open(fast_field_slice)
}
pub(crate) fn typed_fast_field_multi_reader<TFastValue: FastValue>(
&self,
field: Field,
) -> crate::Result<MultiValuedFastFieldReader<TFastValue>> {
let fast_field_slice_idx = self.fast_field_data(field, 0)?;
let fast_field_slice_vals = self.fast_field_data(field, 1)?;
let idx_reader = FastFieldReader::open(fast_field_slice_idx)?;
let vals_reader: FastFieldReader<TFastValue> =
FastFieldReader::open(fast_field_slice_vals)?;
Ok(MultiValuedFastFieldReader::open(idx_reader, vals_reader))
}
/// Returns the `u64` fast field reader associated to `field`.
///
/// If `field` is not a u64 fast field, this method returns `None`.
pub fn u64(&self, field: Field) -> Option<FastFieldReader<u64>> {
self.fast_field_u64.get(&field).cloned()
}
/// If the field is a u64 fast field, return the associated reader.
/// If the field is an i64 fast field, return the associated u64 reader. Values are
/// mapped from i64 to u64 using the unique order-preserving (monotonic) mapping.
///
/// This method is useful when merging segment readers.
pub(crate) fn u64_lenient(&self, field: Field) -> Option<FastFieldReader<u64>> {
if let Some(u64_ff_reader) = self.u64(field) {
return Some(u64_ff_reader);
}
if let Some(i64_ff_reader) = self.i64(field) {
return Some(i64_ff_reader.into_u64_reader());
}
if let Some(f64_ff_reader) = self.f64(field) {
return Some(f64_ff_reader.into_u64_reader());
}
if let Some(date_ff_reader) = self.date(field) {
return Some(date_ff_reader.into_u64_reader());
}
None
pub fn u64(&self, field: Field) -> crate::Result<FastFieldReader<u64>> {
self.check_type(field, FastType::U64, Cardinality::SingleValue)?;
self.typed_fast_field_reader(field)
}
/// Returns the `i64` fast field reader associated to `field`.
///
/// If `field` is not an i64 fast field, this method returns `None`.
pub fn i64(&self, field: Field) -> Option<FastFieldReader<i64>> {
self.fast_field_i64.get(&field).cloned()
pub fn i64(&self, field: Field) -> crate::Result<FastFieldReader<i64>> {
self.check_type(field, FastType::I64, Cardinality::SingleValue)?;
self.typed_fast_field_reader(field)
}
/// Returns the `date` fast field reader associated to `field`.
///
/// If `field` is not a date fast field, this method returns `None`.
pub fn date(&self, field: Field) -> Option<FastFieldReader<crate::DateTime>> {
self.fast_field_date.get(&field).cloned()
pub fn date(&self, field: Field) -> crate::Result<FastFieldReader<crate::DateTime>> {
self.check_type(field, FastType::Date, Cardinality::SingleValue)?;
self.typed_fast_field_reader(field)
}
/// Returns the `f64` fast field reader associated to `field`.
///
/// If `field` is not an f64 fast field, this method returns `None`.
pub fn f64(&self, field: Field) -> Option<FastFieldReader<f64>> {
self.fast_field_f64.get(&field).cloned()
pub fn f64(&self, field: Field) -> crate::Result<FastFieldReader<f64>> {
self.check_type(field, FastType::F64, Cardinality::SingleValue)?;
self.typed_fast_field_reader(field)
}
/// Returns a `u64s` multi-valued fast field reader associated to `field`.
///
/// If `field` is not a u64 multi-valued fast field, this method returns `None`.
pub fn u64s(&self, field: Field) -> Option<MultiValueIntFastFieldReader<u64>> {
self.fast_field_u64s.get(&field).cloned()
}
/// If the field is a u64s fast field, return the associated reader.
/// If the field is an i64s fast field, return the associated u64s reader. Values are
/// mapped from i64 to u64 using the unique order-preserving (monotonic) mapping.
///
/// This method is useful when merging segment readers.
pub(crate) fn u64s_lenient(&self, field: Field) -> Option<MultiValueIntFastFieldReader<u64>> {
if let Some(u64s_ff_reader) = self.u64s(field) {
return Some(u64s_ff_reader);
}
if let Some(i64s_ff_reader) = self.i64s(field) {
return Some(i64s_ff_reader.into_u64s_reader());
}
if let Some(f64s_ff_reader) = self.f64s(field) {
return Some(f64s_ff_reader.into_u64s_reader());
}
None
pub fn u64s(&self, field: Field) -> crate::Result<MultiValuedFastFieldReader<u64>> {
self.check_type(field, FastType::U64, Cardinality::MultiValues)?;
self.typed_fast_field_multi_reader(field)
}
/// Returns an `i64s` multi-valued fast field reader associated to `field`.
///
/// If `field` is not an i64 multi-valued fast field, this method returns `None`.
pub fn i64s(&self, field: Field) -> Option<MultiValueIntFastFieldReader<i64>> {
self.fast_field_i64s.get(&field).cloned()
pub fn i64s(&self, field: Field) -> crate::Result<MultiValuedFastFieldReader<i64>> {
self.check_type(field, FastType::I64, Cardinality::MultiValues)?;
self.typed_fast_field_multi_reader(field)
}
/// Returns an `f64s` multi-valued fast field reader associated to `field`.
///
/// If `field` is not an f64 multi-valued fast field, this method returns `None`.
pub fn f64s(&self, field: Field) -> Option<MultiValueIntFastFieldReader<f64>> {
self.fast_field_f64s.get(&field).cloned()
pub fn f64s(&self, field: Field) -> crate::Result<MultiValuedFastFieldReader<f64>> {
self.check_type(field, FastType::F64, Cardinality::MultiValues)?;
self.typed_fast_field_multi_reader(field)
}
/// Returns a `crate::DateTime` multi-valued fast field reader associated to `field`.
///
/// If `field` is not a `crate::DateTime` multi-valued fast field, this method returns `None`.
pub fn dates(&self, field: Field) -> Option<MultiValueIntFastFieldReader<crate::DateTime>> {
self.fast_field_dates.get(&field).cloned()
pub fn dates(
&self,
field: Field,
) -> crate::Result<MultiValuedFastFieldReader<crate::DateTime>> {
self.check_type(field, FastType::Date, Cardinality::MultiValues)?;
self.typed_fast_field_multi_reader(field)
}
/// Returns the `bytes` fast field reader associated to `field`.
///
/// If `field` is not a bytes fast field, returns `None`.
pub fn bytes(&self, field: Field) -> Option<BytesFastFieldReader> {
self.fast_bytes.get(&field).cloned()
pub fn bytes(&self, field: Field) -> crate::Result<BytesFastFieldReader> {
let field_entry = self.schema.get_field_entry(field);
if let FieldType::Bytes(bytes_option) = field_entry.field_type() {
if !bytes_option.is_fast() {
return Err(crate::TantivyError::SchemaError(format!(
"Field {:?} is not a fast field.",
field_entry.name()
)));
}
let fast_field_idx_file = self.fast_field_data(field, 0)?;
let idx_reader = FastFieldReader::open(fast_field_idx_file)?;
let data = self.fast_field_data(field, 1)?;
BytesFastFieldReader::open(idx_reader, data)
} else {
Err(FastFieldNotAvailableError::new(field_entry).into())
}
}
}
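
Taken together, the changes above turn every `FastFieldReaders` accessor from an `Option` into a `crate::Result`, with `check_type` reporting a `SchemaError` when the requested type or cardinality does not match the schema. A minimal usage sketch, assuming the 0.14 API shown in this diff and the crate-root re-exports; the `popularity` field and the `first_popularity` helper are purely illustrative:

```rust
use tantivy::fastfield::FastFieldReader;
use tantivy::schema::Field;
use tantivy::{Result, SegmentReader};

/// Reads the value of a single-valued u64 fast field for doc 0.
/// If `popularity` is not a single-valued u64 fast field, the accessor
/// now returns a SchemaError instead of `None`.
fn first_popularity(segment_reader: &SegmentReader, popularity: Field) -> Result<u64> {
    let reader: FastFieldReader<u64> = segment_reader.fast_fields().u64(popularity)?;
    Ok(reader.get(0))
}
```

The same pattern applies to the `i64`, `f64`, `date`, multi-valued, and `bytes` accessors, as exercised by the updated tests later in this diff.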

View File

@@ -1,4 +1,4 @@
use super::multivalued::MultiValueIntFastFieldWriter;
use super::multivalued::MultiValuedFastFieldWriter;
use crate::common;
use crate::common::BinarySerializable;
use crate::common::VInt;
@@ -13,7 +13,7 @@ use std::io;
/// The `FastFieldsWriter` groups all of the fast field writers.
pub struct FastFieldsWriter {
single_value_writers: Vec<IntFastFieldWriter>,
multi_values_writers: Vec<MultiValueIntFastFieldWriter>,
multi_values_writers: Vec<MultiValuedFastFieldWriter>,
bytes_value_writers: Vec<BytesFastFieldWriter>,
}
@@ -46,14 +46,14 @@ impl FastFieldsWriter {
single_value_writers.push(fast_field_writer);
}
Some(Cardinality::MultiValues) => {
let fast_field_writer = MultiValueIntFastFieldWriter::new(field, false);
let fast_field_writer = MultiValuedFastFieldWriter::new(field, false);
multi_values_writers.push(fast_field_writer);
}
None => {}
}
}
FieldType::HierarchicalFacet => {
let fast_field_writer = MultiValueIntFastFieldWriter::new(field, true);
let fast_field_writer = MultiValuedFastFieldWriter::new(field, true);
multi_values_writers.push(fast_field_writer);
}
FieldType::Bytes(bytes_option) => {
@@ -87,7 +87,7 @@ impl FastFieldsWriter {
pub fn get_multivalue_writer(
&mut self,
field: Field,
) -> Option<&mut MultiValueIntFastFieldWriter> {
) -> Option<&mut MultiValuedFastFieldWriter> {
// TODO optimize
self.multi_values_writers
.iter_mut()

View File

@@ -1,45 +1,93 @@
use rand::thread_rng;
use std::collections::HashSet;
use crate::schema::*;
use crate::Index;
use crate::Searcher;
use crate::{doc, schema::*};
use rand::thread_rng;
use rand::Rng;
use std::collections::HashSet;
fn check_index_content(searcher: &Searcher, vals: &HashSet<u64>) {
fn check_index_content(searcher: &Searcher, vals: &[u64]) -> crate::Result<()> {
assert!(searcher.segment_readers().len() < 20);
assert_eq!(searcher.num_docs() as usize, vals.len());
for segment_reader in searcher.segment_readers() {
let store_reader = segment_reader.get_store_reader()?;
for doc_id in 0..segment_reader.max_doc() {
let _doc = store_reader.get(doc_id)?;
}
}
Ok(())
}
#[test]
#[ignore]
fn test_indexing() {
fn test_functional_store() -> crate::Result<()> {
let mut schema_builder = Schema::builder();
let id_field = schema_builder.add_u64_field("id", INDEXED | STORED);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let reader = index.reader()?;
let mut rng = thread_rng();
let mut index_writer = index.writer_with_num_threads(3, 12_000_000)?;
let mut doc_set: Vec<u64> = Vec::new();
let mut doc_id = 0u64;
for iteration in 0..500 {
dbg!(iteration);
let num_docs: usize = rng.gen_range(0..4);
if doc_set.len() >= 1 {
let doc_to_remove_id = rng.gen_range(0..doc_set.len());
let removed_doc_id = doc_set.swap_remove(doc_to_remove_id);
index_writer.delete_term(Term::from_field_u64(id_field, removed_doc_id));
}
for _ in 0..num_docs {
doc_set.push(doc_id);
index_writer.add_document(doc!(id_field=>doc_id));
doc_id += 1;
}
index_writer.commit()?;
reader.reload()?;
let searcher = reader.searcher();
check_index_content(&searcher, &doc_set)?;
}
Ok(())
}
#[test]
#[ignore]
fn test_functional_indexing() -> crate::Result<()> {
let mut schema_builder = Schema::builder();
let id_field = schema_builder.add_u64_field("id", INDEXED);
let multiples_field = schema_builder.add_u64_field("multiples", INDEXED);
let schema = schema_builder.build();
let index = Index::create_from_tempdir(schema).unwrap();
let reader = index.reader().unwrap();
let index = Index::create_from_tempdir(schema)?;
let reader = index.reader()?;
let mut rng = thread_rng();
let mut index_writer = index.writer_with_num_threads(3, 120_000_000).unwrap();
let mut index_writer = index.writer_with_num_threads(3, 120_000_000)?;
let mut committed_docs: HashSet<u64> = HashSet::new();
let mut uncommitted_docs: HashSet<u64> = HashSet::new();
for _ in 0..200 {
let random_val = rng.gen_range(0, 20);
let random_val = rng.gen_range(0..20);
if random_val == 0 {
index_writer.commit().expect("Commit failed");
index_writer.commit()?;
committed_docs.extend(&uncommitted_docs);
uncommitted_docs.clear();
reader.reload().unwrap();
reader.reload()?;
let searcher = reader.searcher();
// check that everything is correct.
check_index_content(&searcher, &committed_docs);
check_index_content(
&searcher,
&committed_docs.iter().cloned().collect::<Vec<u64>>(),
)?;
} else {
if committed_docs.remove(&random_val) || uncommitted_docs.remove(&random_val) {
let doc_id_term = Term::from_field_u64(id_field, random_val);
@@ -55,4 +103,5 @@ fn test_indexing() {
}
}
}
Ok(())
}

View File

@@ -8,7 +8,7 @@ const DEFAULT_MIN_LAYER_SIZE: u32 = 10_000;
const DEFAULT_MIN_MERGE_SIZE: usize = 8;
const DEFAULT_MAX_MERGE_SIZE: usize = 10_000_000;
/// `LogMergePolicy` tries tries to merge segments that have a similar number of
/// `LogMergePolicy` tries to merge segments that have a similar number of
/// documents.
#[derive(Debug, Clone)]
pub struct LogMergePolicy {

View File

@@ -7,7 +7,7 @@ use crate::fastfield::BytesFastFieldReader;
use crate::fastfield::DeleteBitSet;
use crate::fastfield::FastFieldReader;
use crate::fastfield::FastFieldSerializer;
use crate::fastfield::MultiValueIntFastFieldReader;
use crate::fastfield::MultiValuedFastFieldReader;
use crate::fieldnorm::FieldNormsSerializer;
use crate::fieldnorm::FieldNormsWriter;
use crate::fieldnorm::{FieldNormReader, FieldNormReaders};
@@ -246,7 +246,7 @@ impl IndexMerger {
for reader in &self.readers {
let u64_reader: FastFieldReader<u64> = reader
.fast_fields()
.u64_lenient(field)
.typed_fast_field_reader(field)
.expect("Failed to find a reader for single fast field. This is a tantivy bug and it should never happen.");
if let Some((seg_min_val, seg_max_val)) =
compute_min_max_val(&u64_reader, reader.max_doc(), reader.delete_bitset())
@@ -290,7 +290,7 @@ impl IndexMerger {
fast_field_serializer: &mut FastFieldSerializer,
) -> crate::Result<()> {
let mut total_num_vals = 0u64;
let mut u64s_readers: Vec<MultiValueIntFastFieldReader<u64>> = Vec::new();
let mut u64s_readers: Vec<MultiValuedFastFieldReader<u64>> = Vec::new();
// In the first pass, we compute the total number of vals.
//
@@ -298,9 +298,8 @@ impl IndexMerger {
// what bit length should be used for bitpacking.
for reader in &self.readers {
let u64s_reader = reader.fast_fields()
.u64s_lenient(field)
.typed_fast_field_multi_reader(field)
.expect("Failed to find index for multivalued field. This is a bug in tantivy, please report.");
if let Some(delete_bitset) = reader.delete_bitset() {
for doc in 0u32..reader.max_doc() {
if delete_bitset.is_alive(doc) {
@@ -353,7 +352,7 @@ impl IndexMerger {
for (segment_ord, segment_reader) in self.readers.iter().enumerate() {
let term_ordinal_mapping: &[TermOrdinal] =
term_ordinal_mappings.get_segment(segment_ord);
let ff_reader: MultiValueIntFastFieldReader<u64> = segment_reader
let ff_reader: MultiValuedFastFieldReader<u64> = segment_reader
.fast_fields()
.u64s(field)
.expect("Could not find multivalued u64 fast value reader.");
@@ -397,8 +396,10 @@ impl IndexMerger {
// We go through a complete first pass to compute the minimum and the
// maximum value and initialize our Serializer.
for reader in &self.readers {
let ff_reader: MultiValueIntFastFieldReader<u64> =
reader.fast_fields().u64s_lenient(field).expect(
let ff_reader: MultiValuedFastFieldReader<u64> = reader
.fast_fields()
.typed_fast_field_multi_reader(field)
.expect(
"Failed to find multivalued fast field reader. This is a bug in \
tantivy. Please report.",
);
@@ -445,11 +446,7 @@ impl IndexMerger {
let mut bytes_readers: Vec<BytesFastFieldReader> = Vec::new();
for reader in &self.readers {
let bytes_reader = reader.fast_fields().bytes(field).ok_or_else(|| {
crate::TantivyError::InvalidArgument(
"Bytes fast field {:?} not found in segment.".to_string(),
)
})?;
let bytes_reader = reader.fast_fields().bytes(field)?;
if let Some(delete_bitset) = reader.delete_bitset() {
for doc in 0u32..reader.max_doc() {
if delete_bitset.is_alive(doc) {
@@ -512,12 +509,10 @@ impl IndexMerger {
.collect::<crate::Result<Vec<_>>>()?;
let mut field_term_streams = Vec::new();
for field_reader in &field_readers {
let terms = field_reader.terms();
field_term_streams.push(terms.stream()?);
max_term_ords.push(terms.num_terms() as u64);
let term_stream = terms.stream()?;
field_term_streams.push(term_stream);
}
let mut term_ord_mapping_opt = if *field_type == FieldType::HierarchicalFacet {

View File

@@ -96,7 +96,7 @@
//! A good place for you to get started is to check out
//! the example code (
//! [literate programming](https://tantivy-search.github.io/examples/basic_search.html) /
//! [source code](https://github.com/tantivy-search/tantivy/blob/master/examples/basic_search.rs))
//! [source code](https://github.com/tantivy-search/tantivy/blob/main/examples/basic_search.rs))
#[cfg_attr(test, macro_use)]
extern crate serde_json;
@@ -174,7 +174,7 @@ use once_cell::sync::Lazy;
use serde::{Deserialize, Serialize};
/// Index format version.
const INDEX_FORMAT_VERSION: u32 = 2;
const INDEX_FORMAT_VERSION: u32 = 3;
/// Structure version for the index.
#[derive(Clone, PartialEq, Eq, Serialize, Deserialize)]
@@ -866,39 +866,39 @@ mod tests {
let searcher = reader.searcher();
let segment_reader: &SegmentReader = searcher.segment_reader(0);
{
let fast_field_reader_opt = segment_reader.fast_fields().u64(text_field);
assert!(fast_field_reader_opt.is_none());
let fast_field_reader_res = segment_reader.fast_fields().u64(text_field);
assert!(fast_field_reader_res.is_err());
}
{
let fast_field_reader_opt = segment_reader.fast_fields().u64(stored_int_field);
assert!(fast_field_reader_opt.is_none());
assert!(fast_field_reader_opt.is_err());
}
{
let fast_field_reader_opt = segment_reader.fast_fields().u64(fast_field_signed);
assert!(fast_field_reader_opt.is_none());
assert!(fast_field_reader_opt.is_err());
}
{
let fast_field_reader_opt = segment_reader.fast_fields().u64(fast_field_float);
assert!(fast_field_reader_opt.is_none());
assert!(fast_field_reader_opt.is_err());
}
{
let fast_field_reader_opt = segment_reader.fast_fields().u64(fast_field_unsigned);
assert!(fast_field_reader_opt.is_some());
assert!(fast_field_reader_opt.is_ok());
let fast_field_reader = fast_field_reader_opt.unwrap();
assert_eq!(fast_field_reader.get(0), 4u64)
}
{
let fast_field_reader_opt = segment_reader.fast_fields().i64(fast_field_signed);
assert!(fast_field_reader_opt.is_some());
let fast_field_reader = fast_field_reader_opt.unwrap();
let fast_field_reader_res = segment_reader.fast_fields().i64(fast_field_signed);
assert!(fast_field_reader_res.is_ok());
let fast_field_reader = fast_field_reader_res.unwrap();
assert_eq!(fast_field_reader.get(0), 4i64)
}
{
let fast_field_reader_opt = segment_reader.fast_fields().f64(fast_field_float);
assert!(fast_field_reader_opt.is_some());
let fast_field_reader = fast_field_reader_opt.unwrap();
let fast_field_reader_res = segment_reader.fast_fields().f64(fast_field_float);
assert!(fast_field_reader_res.is_ok());
let fast_field_reader = fast_field_reader_res.unwrap();
assert_eq!(fast_field_reader.get(0), 4f64)
}
Ok(())

View File

@@ -132,7 +132,7 @@ impl PositionReader {
"offset arguments should be increasing."
);
let delta_to_block_offset = offset as i64 - self.block_offset as i64;
if delta_to_block_offset < 0 || delta_to_block_offset >= 128 {
if !(0..128).contains(&delta_to_block_offset) {
// The first position is not within the first block.
// We need to decompress the first block.
let delta_to_anchor_offset = offset - self.anchor_offset;

View File

@@ -1,14 +1,11 @@
use crate::common::HasLen;
use crate::directory::FileSlice;
use crate::docset::DocSet;
use crate::fastfield::DeleteBitSet;
use crate::positions::PositionReader;
use crate::postings::compression::COMPRESSION_BLOCK_SIZE;
use crate::postings::serializer::PostingsSerializer;
use crate::postings::BlockSearcher;
use crate::postings::BlockSegmentPostings;
use crate::postings::Postings;
use crate::schema::IndexRecordOption;
use crate::{DocId, TERMINATED};
/// `SegmentPostings` represents the inverted list or postings associated to
@@ -68,7 +65,11 @@ impl SegmentPostings {
/// It serializes the doc ids using tantivy's codec
/// and returns a `SegmentPostings` object that embeds a
/// buffer with the serialized data.
#[cfg(test)]
pub fn create_from_docs(docs: &[u32]) -> SegmentPostings {
use crate::directory::FileSlice;
use crate::postings::serializer::PostingsSerializer;
use crate::schema::IndexRecordOption;
let mut buffer = Vec::new();
{
let mut postings_serializer =
@@ -97,6 +98,9 @@ impl SegmentPostings {
doc_and_tfs: &[(u32, u32)],
fieldnorms: Option<&[u32]>,
) -> SegmentPostings {
use crate::directory::FileSlice;
use crate::postings::serializer::PostingsSerializer;
use crate::schema::IndexRecordOption;
use crate::fieldnorm::FieldNormReader;
use crate::Score;
let mut buffer: Vec<u8> = Vec::new();

View File

@@ -1,32 +1,46 @@
use crate::common::{read_u32_vint_no_advance, serialize_vint_u32, BinarySerializable};
use std::convert::TryInto;
use crate::directory::OwnedBytes;
use crate::postings::compression::{compressed_block_size, COMPRESSION_BLOCK_SIZE};
use crate::query::BM25Weight;
use crate::schema::IndexRecordOption;
use crate::{DocId, Score, TERMINATED};
#[inline(always)]
fn encode_block_wand_max_tf(max_tf: u32) -> u8 {
max_tf.min(u8::MAX as u32) as u8
}
#[inline(always)]
fn decode_block_wand_max_tf(max_tf_code: u8) -> u32 {
if max_tf_code == u8::MAX {
u32::MAX
} else {
max_tf_code as u32
}
}
#[inline(always)]
fn read_u32(data: &[u8]) -> u32 {
u32::from_le_bytes(data[..4].try_into().unwrap())
}
#[inline(always)]
fn write_u32(val: u32, buf: &mut Vec<u8>) {
buf.extend_from_slice(&val.to_le_bytes());
}
pub struct SkipSerializer {
buffer: Vec<u8>,
prev_doc: DocId,
}
impl SkipSerializer {
pub fn new() -> SkipSerializer {
SkipSerializer {
buffer: Vec::new(),
prev_doc: 0u32,
}
SkipSerializer { buffer: Vec::new() }
}
pub fn write_doc(&mut self, last_doc: DocId, doc_num_bits: u8) {
assert!(
last_doc > self.prev_doc,
"write_doc(...) called with non-increasing doc ids. \
Did you forget to call clear maybe?"
);
let delta_doc = last_doc - self.prev_doc;
self.prev_doc = last_doc;
delta_doc.serialize(&mut self.buffer).unwrap();
write_u32(last_doc, &mut self.buffer);
self.buffer.push(doc_num_bits);
}
@@ -35,16 +49,13 @@ impl SkipSerializer {
}
pub fn write_total_term_freq(&mut self, tf_sum: u32) {
tf_sum
.serialize(&mut self.buffer)
.expect("Should never fail");
write_u32(tf_sum, &mut self.buffer);
}
pub fn write_blockwand_max(&mut self, fieldnorm_id: u8, term_freq: u32) {
self.buffer.push(fieldnorm_id);
let mut buf = [0u8; 8];
let bytes = serialize_vint_u32(term_freq, &mut buf);
self.buffer.extend_from_slice(bytes);
let block_wand_tf = encode_block_wand_max_tf(term_freq);
self.buffer
.extend_from_slice(&[fieldnorm_id, block_wand_tf]);
}
pub fn data(&self) -> &[u8] {
@@ -52,7 +63,6 @@ impl SkipSerializer {
}
pub fn clear(&mut self) {
self.prev_doc = 0u32;
self.buffer.clear();
}
}
@@ -159,18 +169,13 @@ impl SkipReader {
}
fn read_block_info(&mut self) {
let doc_delta = {
let bytes = self.owned_read.as_slice();
let mut buf = [0; 4];
buf.copy_from_slice(&bytes[..4]);
u32::from_le_bytes(buf)
};
self.last_doc_in_block += doc_delta as DocId;
let doc_num_bits = self.owned_read.as_slice()[4];
let bytes = self.owned_read.as_slice();
let advance_len: usize;
self.last_doc_in_block = read_u32(bytes);
let doc_num_bits = bytes[4];
match self.skip_info {
IndexRecordOption::Basic => {
self.owned_read.advance(5);
advance_len = 5;
self.block_info = BlockInfo::BitPacked {
doc_num_bits,
tf_num_bits: 0,
@@ -180,11 +185,10 @@ impl SkipReader {
};
}
IndexRecordOption::WithFreqs => {
let bytes = self.owned_read.as_slice();
let tf_num_bits = bytes[5];
let block_wand_fieldnorm_id = bytes[6];
let (block_wand_term_freq, num_bytes) = read_u32_vint_no_advance(&bytes[7..]);
self.owned_read.advance(7 + num_bytes);
let block_wand_term_freq = decode_block_wand_max_tf(bytes[7]);
advance_len = 8;
self.block_info = BlockInfo::BitPacked {
doc_num_bits,
tf_num_bits,
@@ -194,16 +198,11 @@ impl SkipReader {
};
}
IndexRecordOption::WithFreqsAndPositions => {
let bytes = self.owned_read.as_slice();
let tf_num_bits = bytes[5];
let tf_sum = {
let mut buf = [0; 4];
buf.copy_from_slice(&bytes[6..10]);
u32::from_le_bytes(buf)
};
let tf_sum = read_u32(&bytes[6..10]);
let block_wand_fieldnorm_id = bytes[10];
let (block_wand_term_freq, num_bytes) = read_u32_vint_no_advance(&bytes[11..]);
self.owned_read.advance(11 + num_bytes);
let block_wand_term_freq = decode_block_wand_max_tf(bytes[11]);
advance_len = 12;
self.block_info = BlockInfo::BitPacked {
doc_num_bits,
tf_num_bits,
@@ -213,6 +212,7 @@ impl SkipReader {
};
}
}
self.owned_read.advance(advance_len);
}
pub fn block_info(&self) -> BlockInfo {
@@ -274,6 +274,24 @@ mod tests {
use crate::directory::OwnedBytes;
use crate::postings::compression::COMPRESSION_BLOCK_SIZE;
#[test]
fn test_encode_block_wand_max_tf() {
for tf in 0..255 {
assert_eq!(super::encode_block_wand_max_tf(tf), tf as u8);
}
for &tf in &[255, 256, 1_000_000, u32::MAX] {
assert_eq!(super::encode_block_wand_max_tf(tf), 255);
}
}
#[test]
fn test_decode_block_wand_max_tf() {
for tf in 0..255 {
assert_eq!(super::decode_block_wand_max_tf(tf), tf as u32);
}
assert_eq!(super::decode_block_wand_max_tf(255), u32::MAX);
}
#[test]
fn test_skip_with_freq() {
let buf = {

View File

@@ -1,5 +1,3 @@
use rayon::iter::IntoParallelRefIterator;
use crate::core::SegmentReader;
use crate::postings::FreqReadingOption;
use crate::query::explanation::does_not_match;
@@ -24,7 +22,7 @@ enum SpecializedScorer {
fn scorer_union<TScoreCombiner>(scorers: Vec<Box<dyn Scorer>>) -> SpecializedScorer
where
TScoreCombiner: ScoreCombiner + Send,
TScoreCombiner: ScoreCombiner,
{
assert!(!scorers.is_empty());
if scorers.len() == 1 {
@@ -54,7 +52,7 @@ where
SpecializedScorer::Other(Box::new(Union::<_, TScoreCombiner>::from(scorers)))
}
fn into_box_scorer<TScoreCombiner: ScoreCombiner + Send>(scorer: SpecializedScorer) -> Box<dyn Scorer> {
fn into_box_scorer<TScoreCombiner: ScoreCombiner>(scorer: SpecializedScorer) -> Box<dyn Scorer> {
match scorer {
SpecializedScorer::TermUnion(term_scorers) => {
let union_scorer = Union::<TermScorer, TScoreCombiner>::from(term_scorers);
@@ -82,32 +80,18 @@ impl BooleanWeight {
reader: &SegmentReader,
boost: Score,
) -> crate::Result<HashMap<Occur, Vec<Box<dyn Scorer>>>> {
use rayon::iter::ParallelIterator;
use rayon::iter::IndexedParallelIterator;
let mut per_occur_scorers: HashMap<Occur, Vec<Box<dyn Scorer>>> = HashMap::new();
let mut items_res: Vec<crate::Result<(Occur, Box<dyn Scorer>)>> = Vec::new();
let pool = rayon::ThreadPoolBuilder::new().num_threads(self.weights.len()).build().unwrap();
pool.install(|| {
self.weights.iter()
.collect::<Vec<_>>()
.par_iter()
.map(|(occur, subweight)| {
let sub_scorer: Box<dyn Scorer> = subweight.scorer(reader, boost)?;
Ok((*occur, sub_scorer))
})
.collect_into_vec(&mut items_res);
});
for item_res in items_res {
let (occur, sub_scorer) = item_res?;
for &(ref occur, ref subweight) in &self.weights {
let sub_scorer: Box<dyn Scorer> = subweight.scorer(reader, boost)?;
per_occur_scorers
.entry(occur)
.entry(*occur)
.or_insert_with(Vec::new)
.push(sub_scorer);
}
Ok(per_occur_scorers)
}
fn complex_scorer<TScoreCombiner: ScoreCombiner >(
fn complex_scorer<TScoreCombiner: ScoreCombiner>(
&self,
reader: &SegmentReader,
boost: Score,

View File

@@ -302,7 +302,7 @@ mod tests {
let mut rng = rand::thread_rng();
writer.set_merge_policy(Box::new(NoMergePolicy));
for _ in 0..3_000 {
let term_freq = rng.gen_range(1, 10000);
let term_freq = rng.gen_range(1..10000);
let words: Vec<&str> = std::iter::repeat("bbbb").take(term_freq).collect();
let text = words.join(" ");
writer.add_document(doc!(text_field=>text));

View File

@@ -1,5 +1,5 @@
use crate::schema::Value;
use serde::Serialize;
use serde::{Deserialize, Serialize};
use std::collections::BTreeMap;
/// Internal representation of a document used for JSON
@@ -8,5 +8,5 @@ use std::collections::BTreeMap;
/// A `NamedFieldDocument` is a simple representation of a document
/// as a `BTreeMap<String, Vec<Value>>`.
///
#[derive(Serialize)]
#[derive(Debug, Deserialize, Serialize)]
pub struct NamedFieldDocument(pub BTreeMap<String, Vec<Value>>);

View File

@@ -3,7 +3,7 @@ use std::io::{self, Read, Write};
/// Name of the compression scheme used in the doc store.
///
/// This name is appended to the version string of tantivy.
pub const COMPRESSION: &'static str = "lz4";
pub const COMPRESSION: &str = "lz4";
pub fn compress(uncompressed: &[u8], compressed: &mut Vec<u8>) -> io::Result<()> {
compressed.clear();

View File

@@ -43,6 +43,9 @@ impl CheckpointBlock {
/// Adding another checkpoint in the block.
pub fn push(&mut self, checkpoint: Checkpoint) {
if let Some(prev_checkpoint) = self.checkpoints.last() {
assert!(checkpoint.follows(prev_checkpoint));
}
self.checkpoints.push(checkpoint);
}

View File

@@ -26,6 +26,12 @@ pub struct Checkpoint {
pub end_offset: u64,
}
impl Checkpoint {
pub(crate) fn follows(&self, other: &Checkpoint) -> bool {
(self.start_doc == other.end_doc) && (self.start_offset == other.end_offset)
}
}
impl fmt::Debug for Checkpoint {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
write!(
@@ -39,13 +45,16 @@ impl fmt::Debug for Checkpoint {
#[cfg(test)]
mod tests {
use std::io;
use std::{io, iter};
use futures::executor::block_on;
use proptest::strategy::{BoxedStrategy, Strategy};
use crate::directory::OwnedBytes;
use crate::indexer::NoMergePolicy;
use crate::schema::{SchemaBuilder, STORED, STRING};
use crate::store::index::Checkpoint;
use crate::DocId;
use crate::{DocAddress, DocId, Index, Term};
use super::{SkipIndex, SkipIndexBuilder};
@@ -54,7 +63,7 @@ mod tests {
let mut output: Vec<u8> = Vec::new();
let skip_index_builder: SkipIndexBuilder = SkipIndexBuilder::new();
skip_index_builder.write(&mut output)?;
let skip_index: SkipIndex = SkipIndex::from(OwnedBytes::new(output));
let skip_index: SkipIndex = SkipIndex::open(OwnedBytes::new(output));
let mut skip_cursor = skip_index.checkpoints();
assert!(skip_cursor.next().is_none());
Ok(())
@@ -72,7 +81,7 @@ mod tests {
};
skip_index_builder.insert(checkpoint);
skip_index_builder.write(&mut output)?;
let skip_index: SkipIndex = SkipIndex::from(OwnedBytes::new(output));
let skip_index: SkipIndex = SkipIndex::open(OwnedBytes::new(output));
let mut skip_cursor = skip_index.checkpoints();
assert_eq!(skip_cursor.next(), Some(checkpoint));
assert_eq!(skip_cursor.next(), None);
@@ -86,7 +95,7 @@ mod tests {
Checkpoint {
start_doc: 0,
end_doc: 3,
start_offset: 4,
start_offset: 0,
end_offset: 9,
},
Checkpoint {
@@ -121,7 +130,7 @@ mod tests {
}
skip_index_builder.write(&mut output)?;
let skip_index: SkipIndex = SkipIndex::from(OwnedBytes::new(output));
let skip_index: SkipIndex = SkipIndex::open(OwnedBytes::new(output));
assert_eq!(
&skip_index.checkpoints().collect::<Vec<_>>()[..],
&checkpoints[..]
@@ -133,6 +142,40 @@ mod tests {
(doc as u64) * (doc as u64)
}
#[test]
fn test_merge_store_with_stacking_reproducing_issue969() -> crate::Result<()> {
let mut schema_builder = SchemaBuilder::default();
let text = schema_builder.add_text_field("text", STORED | STRING);
let body = schema_builder.add_text_field("body", STORED);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_for_tests()?;
index_writer.set_merge_policy(Box::new(NoMergePolicy));
let long_text: String = iter::repeat("abcdefghijklmnopqrstuvwxyz")
.take(1_000)
.collect();
for _ in 0..20 {
index_writer.add_document(doc!(body=>long_text.clone()));
}
index_writer.commit()?;
index_writer.add_document(doc!(text=>"testb"));
for _ in 0..10 {
index_writer.add_document(doc!(text=>"testd", body=>long_text.clone()));
}
index_writer.commit()?;
index_writer.delete_term(Term::from_field_text(text, "testb"));
index_writer.commit()?;
let segment_ids = index.searchable_segment_ids()?;
block_on(index_writer.merge(&segment_ids))?;
let reader = index.reader()?;
let searcher = reader.searcher();
assert_eq!(searcher.num_docs(), 30);
for i in 0..searcher.num_docs() as u32 {
let _doc = searcher.doc(DocAddress(0u32, i))?;
}
Ok(())
}
#[test]
fn test_skip_index_long() -> io::Result<()> {
let mut output: Vec<u8> = Vec::new();
@@ -150,26 +193,28 @@ mod tests {
}
skip_index_builder.write(&mut output)?;
assert_eq!(output.len(), 4035);
let resulting_checkpoints: Vec<Checkpoint> = SkipIndex::from(OwnedBytes::new(output))
let resulting_checkpoints: Vec<Checkpoint> = SkipIndex::open(OwnedBytes::new(output))
.checkpoints()
.collect();
assert_eq!(&resulting_checkpoints, &checkpoints);
Ok(())
}
fn integrate_delta(mut vals: Vec<u64>) -> Vec<u64> {
fn integrate_delta(vals: Vec<u64>) -> Vec<u64> {
let mut output = Vec::with_capacity(vals.len() + 1);
output.push(0u64);
let mut prev = 0u64;
for val in vals.iter_mut() {
let new_val = *val + prev;
for val in vals {
let new_val = val + prev;
prev = new_val;
*val = new_val;
output.push(new_val);
}
vals
output
}
// Generates a sequence of n valid checkpoints, with n < max_len.
fn monotonic_checkpoints(max_len: usize) -> BoxedStrategy<Vec<Checkpoint>> {
(1..max_len)
(0..max_len)
.prop_flat_map(move |len: usize| {
(
proptest::collection::vec(1u64..20u64, len as usize).prop_map(integrate_delta),
@@ -221,7 +266,7 @@ mod tests {
}
let mut buffer = Vec::new();
skip_index_builder.write(&mut buffer).unwrap();
let skip_index = SkipIndex::from(OwnedBytes::new(buffer));
let skip_index = SkipIndex::open(OwnedBytes::new(buffer));
let iter_checkpoints: Vec<Checkpoint> = skip_index.checkpoints().collect();
assert_eq!(&checkpoints[..], &iter_checkpoints[..]);
test_skip_index_aux(skip_index, &checkpoints[..]);

View File

@@ -35,11 +35,11 @@ struct Layer {
}
impl Layer {
fn cursor<'a>(&'a self) -> impl Iterator<Item = Checkpoint> + 'a {
fn cursor(&self) -> impl Iterator<Item = Checkpoint> + '_ {
self.cursor_at_offset(0u64)
}
fn cursor_at_offset<'a>(&'a self, start_offset: u64) -> impl Iterator<Item = Checkpoint> + 'a {
fn cursor_at_offset(&self, start_offset: u64) -> impl Iterator<Item = Checkpoint> + '_ {
let data = &self.data.as_slice();
LayerCursor {
remaining: &data[start_offset as usize..],
@@ -59,7 +59,25 @@ pub struct SkipIndex {
}
impl SkipIndex {
pub(crate) fn checkpoints<'a>(&'a self) -> impl Iterator<Item = Checkpoint> + 'a {
pub fn open(mut data: OwnedBytes) -> SkipIndex {
let offsets: Vec<u64> = Vec::<VInt>::deserialize(&mut data)
.unwrap()
.into_iter()
.map(|el| el.0)
.collect();
let mut start_offset = 0;
let mut layers = Vec::new();
for end_offset in offsets {
let layer = Layer {
data: data.slice(start_offset as usize, end_offset as usize),
};
layers.push(layer);
start_offset = end_offset;
}
SkipIndex { layers }
}
pub(crate) fn checkpoints(&self) -> impl Iterator<Item = Checkpoint> + '_ {
self.layers
.last()
.into_iter()
@@ -90,22 +108,3 @@ impl SkipIndex {
Some(cur_checkpoint)
}
}
impl From<OwnedBytes> for SkipIndex {
fn from(mut data: OwnedBytes) -> SkipIndex {
let offsets: Vec<u64> = Vec::<VInt>::deserialize(&mut data)
.unwrap()
.into_iter()
.map(|el| el.0)
.collect();
let mut start_offset = 0;
let mut layers = Vec::new();
for end_offset in offsets {
layers.push(Layer {
data: data.slice(start_offset as usize, end_offset as usize),
});
start_offset = end_offset;
}
SkipIndex { layers }
}
}
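
As the new `SkipIndex::open` above shows, the serialized skip index begins with a `Vec<VInt>` of layer end offsets, followed by the concatenated layer payloads; each layer spans the byte range between the previous end offset and its own. A self-contained sketch of the same slicing scheme, using fixed-width little-endian `u64` offsets in place of tantivy's `VInt` encoding (an assumption made only to keep the example short):

```rust
use std::convert::TryInto;

/// Splits `data` into layers: an 8-byte layer count, one 8-byte end offset per
/// layer, then the layer payloads back to back. Each layer spans
/// `[previous_end, end)` within the payload, mirroring `SkipIndex::open`.
fn split_layers(data: &[u8]) -> Vec<&[u8]> {
    let count = u64::from_le_bytes(data[..8].try_into().unwrap()) as usize;
    let offsets: Vec<usize> = (0..count)
        .map(|i| {
            let pos = 8 + i * 8;
            u64::from_le_bytes(data[pos..pos + 8].try_into().unwrap()) as usize
        })
        .collect();
    let payload = &data[8 + count * 8..];
    let mut layers = Vec::new();
    let mut start = 0usize;
    for end in offsets {
        layers.push(&payload[start..end]);
        start = end;
    }
    layers
}
```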

View File

@@ -28,18 +28,20 @@ impl LayerBuilder {
///
/// If the block was empty to begin with, simply return None.
fn flush_block(&mut self) -> Option<Checkpoint> {
self.block.doc_interval().map(|(start_doc, end_doc)| {
if let Some((start_doc, end_doc)) = self.block.doc_interval() {
let start_offset = self.buffer.len() as u64;
self.block.serialize(&mut self.buffer);
let end_offset = self.buffer.len() as u64;
self.block.clear();
Checkpoint {
Some(Checkpoint {
start_doc,
end_doc,
start_offset,
end_offset,
}
})
})
} else {
None
}
}
fn push(&mut self, checkpoint: Checkpoint) {
@@ -48,7 +50,7 @@ impl LayerBuilder {
fn insert(&mut self, checkpoint: Checkpoint) -> Option<Checkpoint> {
self.push(checkpoint);
let emit_skip_info = (self.block.len() % CHECKPOINT_PERIOD) == 0;
let emit_skip_info = self.block.len() >= CHECKPOINT_PERIOD;
if emit_skip_info {
self.flush_block()
} else {

View File

@@ -35,7 +35,7 @@ impl StoreReader {
let (data_file, offset_index_file) = split_file(store_file)?;
let index_data = offset_index_file.read_bytes()?;
let space_usage = StoreSpaceUsage::new(data_file.len(), offset_index_file.len());
let skip_index = SkipIndex::from(index_data);
let skip_index = SkipIndex::open(index_data);
Ok(StoreReader {
data: data_file,
cache: Arc::new(Mutex::new(LruCache::new(LRU_CACHE_CAPACITY))),
@@ -46,7 +46,7 @@ impl StoreReader {
})
}
pub(crate) fn block_checkpoints<'a>(&'a self) -> impl Iterator<Item = Checkpoint> + 'a {
pub(crate) fn block_checkpoints(&self) -> impl Iterator<Item = Checkpoint> + '_ {
self.skip_index.checkpoints()
}

View File

@@ -72,6 +72,7 @@ impl StoreWriter {
if !self.current_block.is_empty() {
self.write_and_compress_block()?;
}
assert_eq!(self.first_doc_in_block, self.doc);
let doc_shift = self.doc;
let start_shift = self.writer.written_bytes() as u64;
@@ -86,12 +87,17 @@ impl StoreWriter {
checkpoint.end_doc += doc_shift;
checkpoint.start_offset += start_shift;
checkpoint.end_offset += start_shift;
self.offset_index_writer.insert(checkpoint);
self.doc = checkpoint.end_doc;
self.register_checkpoint(checkpoint);
}
Ok(())
}
fn register_checkpoint(&mut self, checkpoint: Checkpoint) {
self.offset_index_writer.insert(checkpoint);
self.first_doc_in_block = checkpoint.end_doc;
self.doc = checkpoint.end_doc;
}
fn write_and_compress_block(&mut self) -> io::Result<()> {
assert!(self.doc > 0);
self.intermediary_buffer.clear();
@@ -100,14 +106,13 @@ impl StoreWriter {
self.writer.write_all(&self.intermediary_buffer)?;
let end_offset = self.writer.written_bytes();
let end_doc = self.doc;
self.offset_index_writer.insert(Checkpoint {
self.register_checkpoint(Checkpoint {
start_doc: self.first_doc_in_block,
end_doc,
start_offset,
end_offset,
});
self.current_block.clear();
self.first_doc_in_block = self.doc;
Ok(())
}
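
The change in this file keeps `first_doc_in_block` in sync when checkpoints are stacked from another store: the old loop only advanced `self.doc`, so a block written after stacking could start from a stale `first_doc_in_block` and emit a checkpoint that does not follow its predecessor, which is what `test_merge_store_with_stacking_reproducing_issue969` earlier in this diff exercises. The contiguity invariant that `Checkpoint::follows` now asserts can be illustrated with a simplified stand-in type (field names mirror the real struct, but this is only a sketch):

```rust
/// Simplified stand-in for the store checkpoint: a doc range and a byte range.
#[derive(Debug, Clone, Copy, PartialEq)]
struct Checkpoint {
    start_doc: u32,
    end_doc: u32,
    start_offset: u64,
    end_offset: u64,
}

impl Checkpoint {
    /// Two consecutive checkpoints must be contiguous in both doc ids and bytes.
    fn follows(&self, other: &Checkpoint) -> bool {
        self.start_doc == other.end_doc && self.start_offset == other.end_offset
    }
}

fn main() {
    let first = Checkpoint { start_doc: 0, end_doc: 20, start_offset: 0, end_offset: 512 };
    // After stacking, the next block must start exactly where the previous one ended,
    // which is why register_checkpoint advances both first_doc_in_block and doc.
    let stacked = Checkpoint { start_doc: 20, end_doc: 31, start_offset: 512, end_offset: 800 };
    assert!(stacked.follows(&first));
}
```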

View File

@@ -22,10 +22,8 @@ A second datastructure makes it possible to access a [`TermInfo`](../postings/st
use tantivy_fst::automaton::AlwaysMatch;
// mod fst_termdict;
// use fst_termdict as termdict;
mod sstable_termdict;
use sstable_termdict as termdict;
mod fst_termdict;
use fst_termdict as termdict;
mod merger;

View File

@@ -1,148 +0,0 @@
use std::io;
mod sstable;
mod streamer;
mod termdict;
use self::sstable::value::{ValueReader, ValueWriter};
use self::sstable::{BlockReader, SSTable};
use crate::common::VInt;
use crate::postings::TermInfo;
pub use self::streamer::{TermStreamer, TermStreamerBuilder};
pub use self::termdict::{TermDictionary, TermDictionaryBuilder};
pub struct TermSSTable;
impl SSTable for TermSSTable {
type Value = TermInfo;
type Reader = TermInfoReader;
type Writer = TermInfoWriter;
}
#[derive(Default)]
pub struct TermInfoReader {
term_infos: Vec<TermInfo>,
}
impl ValueReader for TermInfoReader {
type Value = TermInfo;
fn value(&self, idx: usize) -> &TermInfo {
&self.term_infos[idx]
}
fn read(&mut self, reader: &mut BlockReader) -> io::Result<()> {
self.term_infos.clear();
let num_els = VInt::deserialize_u64(reader)?;
let mut start_offset = VInt::deserialize_u64(reader)?;
let mut positions_idx = 0;
for _ in 0..num_els {
let doc_freq = VInt::deserialize_u64(reader)? as u32;
let posting_num_bytes = VInt::deserialize_u64(reader)?;
let stop_offset = start_offset + posting_num_bytes;
let delta_positions_idx = VInt::deserialize_u64(reader)?;
positions_idx += delta_positions_idx;
let term_info = TermInfo {
doc_freq,
postings_start_offset: start_offset,
postings_stop_offset: stop_offset,
positions_idx,
};
self.term_infos.push(term_info);
start_offset = stop_offset;
}
Ok(())
}
}
#[derive(Default)]
pub struct TermInfoWriter {
term_infos: Vec<TermInfo>,
}
impl ValueWriter for TermInfoWriter {
type Value = TermInfo;
fn write(&mut self, term_info: &TermInfo) {
self.term_infos.push(term_info.clone());
}
fn write_block(&mut self, buffer: &mut Vec<u8>) {
VInt(self.term_infos.len() as u64).serialize_into_vec(buffer);
if self.term_infos.is_empty() {
return;
}
let mut prev_position_idx = 0u64;
VInt(self.term_infos[0].postings_start_offset).serialize_into_vec(buffer);
for term_info in &self.term_infos {
VInt(term_info.doc_freq as u64).serialize_into_vec(buffer);
VInt(term_info.postings_stop_offset - term_info.postings_start_offset)
.serialize_into_vec(buffer);
VInt(term_info.positions_idx - prev_position_idx).serialize_into_vec(buffer);
prev_position_idx = term_info.positions_idx;
}
self.term_infos.clear();
}
}
#[cfg(test)]
mod tests {
use std::io;
use super::BlockReader;
use crate::directory::OwnedBytes;
use crate::postings::TermInfo;
use crate::termdict::sstable_termdict::sstable::value::{ValueReader, ValueWriter};
use crate::termdict::sstable_termdict::TermInfoReader;
#[test]
fn test_block_terminfos() -> io::Result<()> {
let mut term_info_writer = super::TermInfoWriter::default();
term_info_writer.write(&TermInfo {
doc_freq: 120u32,
postings_start_offset: 17u64,
postings_stop_offset: 45u64,
positions_idx: 10u64,
});
term_info_writer.write(&TermInfo {
doc_freq: 10u32,
postings_start_offset: 45u64,
postings_stop_offset: 450u64,
positions_idx: 104u64,
});
term_info_writer.write(&TermInfo {
doc_freq: 17u32,
postings_start_offset: 450u64,
postings_stop_offset: 462u64,
positions_idx: 210u64,
});
let mut buffer = Vec::new();
term_info_writer.write_block(&mut buffer);
let mut block_reader = make_block_reader(&buffer[..]);
let mut term_info_reader = TermInfoReader::default();
term_info_reader.read(&mut block_reader)?;
assert_eq!(
term_info_reader.value(0),
&TermInfo {
doc_freq: 120u32,
postings_start_offset: 17u64,
postings_stop_offset: 45u64,
positions_idx: 10u64
}
);
assert!(block_reader.buffer().is_empty());
Ok(())
}
fn make_block_reader(data: &[u8]) -> BlockReader {
let mut buffer = (data.len() as u32).to_le_bytes().to_vec();
buffer.extend_from_slice(data);
let owned_bytes = OwnedBytes::new(buffer);
let mut block_reader = BlockReader::new(Box::new(owned_bytes));
block_reader.read_block().unwrap();
block_reader
}
}

View File

@@ -1,84 +0,0 @@
use byteorder::{LittleEndian, ReadBytesExt};
use std::io::{self, Read};
pub struct BlockReader<'a> {
buffer: Vec<u8>,
reader: Box<dyn io::Read + 'a>,
offset: usize,
}
impl<'a> BlockReader<'a> {
pub fn new(reader: Box<dyn io::Read + 'a>) -> BlockReader<'a> {
BlockReader {
buffer: Vec::new(),
reader,
offset: 0,
}
}
pub fn deserialize_u64(&mut self) -> u64 {
let (num_bytes, val) = super::vint::deserialize_read(self.buffer());
self.advance(num_bytes);
val
}
#[inline(always)]
pub fn buffer_from_to(&self, start: usize, end: usize) -> &[u8] {
&self.buffer[start..end]
}
pub fn buffer_from(&self, start: usize) -> &[u8] {
&self.buffer[start..]
}
pub fn read_block(&mut self) -> io::Result<bool> {
self.offset = 0;
let block_len_res = self.reader.read_u32::<LittleEndian>();
if let Err(err) = &block_len_res {
if err.kind() == io::ErrorKind::UnexpectedEof {
return Ok(false);
}
}
let block_len = block_len_res?;
if block_len == 0u32 {
self.buffer.clear();
return Ok(false);
}
self.buffer.resize(block_len as usize, 0u8);
self.reader.read_exact(&mut self.buffer[..])?;
Ok(true)
}
pub fn offset(&self) -> usize {
self.offset
}
pub fn advance(&mut self, num_bytes: usize) {
self.offset += num_bytes;
}
pub fn buffer(&self) -> &[u8] {
&self.buffer[self.offset..]
}
}
impl<'a> io::Read for BlockReader<'a> {
fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
let len = self.buffer().read(buf)?;
self.advance(len);
Ok(len)
}
fn read_to_end(&mut self, buf: &mut Vec<u8>) -> io::Result<usize> {
let len = self.buffer.len();
buf.extend_from_slice(self.buffer());
self.advance(len);
Ok(len)
}
fn read_exact(&mut self, buf: &mut [u8]) -> io::Result<()> {
self.buffer().read_exact(buf)?;
self.advance(buf.len());
Ok(())
}
}

View File

@@ -1,203 +0,0 @@
use std::io::{self, BufWriter, Write};
use crate::common::CountingWriter;
use super::value::ValueWriter;
use super::{value, vint, BlockReader};
const FOUR_BIT_LIMITS: usize = 1 << 4;
const VINT_MODE: u8 = 1u8;
const BLOCK_LEN: usize = 256_000;
pub struct DeltaWriter<W, TValueWriter>
where
W: io::Write,
{
block: Vec<u8>,
write: CountingWriter<BufWriter<W>>,
value_writer: TValueWriter,
}
impl<W, TValueWriter> DeltaWriter<W, TValueWriter>
where
W: io::Write,
TValueWriter: ValueWriter,
{
pub fn new(wrt: W) -> Self {
DeltaWriter {
block: Vec::with_capacity(BLOCK_LEN * 2),
write: CountingWriter::wrap(BufWriter::new(wrt)),
value_writer: TValueWriter::default(),
}
}
}
impl<W, TValueWriter> DeltaWriter<W, TValueWriter>
where
W: io::Write,
TValueWriter: value::ValueWriter,
{
pub fn flush_block(&mut self) -> io::Result<Option<(u64, u64)>> {
if self.block.is_empty() {
return Ok(None);
}
let start_offset = self.write.written_bytes();
// TODO avoid buffer allocation
let mut buffer = Vec::new();
self.value_writer.write_block(&mut buffer);
let block_len = buffer.len() + self.block.len();
self.write.write_all(&(block_len as u32).to_le_bytes())?;
self.write.write_all(&buffer[..])?;
self.write.write_all(&mut self.block[..])?;
let end_offset = self.write.written_bytes();
self.block.clear();
Ok(Some((start_offset, end_offset)))
}
fn encode_keep_add(&mut self, keep_len: usize, add_len: usize) {
if keep_len < FOUR_BIT_LIMITS && add_len < FOUR_BIT_LIMITS {
let b = (keep_len | add_len << 4) as u8;
self.block.extend_from_slice(&[b])
} else {
let mut buf = [VINT_MODE; 20];
let mut len = 1 + vint::serialize(keep_len as u64, &mut buf[1..]);
len += vint::serialize(add_len as u64, &mut buf[len..]);
self.block.extend_from_slice(&mut buf[..len])
}
}
pub(crate) fn write_suffix(&mut self, common_prefix_len: usize, suffix: &[u8]) {
let keep_len = common_prefix_len;
let add_len = suffix.len();
self.encode_keep_add(keep_len, add_len);
self.block.extend_from_slice(suffix);
}
pub(crate) fn write_value(&mut self, value: &TValueWriter::Value) {
self.value_writer.write(value);
}
pub fn write_delta(
&mut self,
common_prefix_len: usize,
suffix: &[u8],
value: &TValueWriter::Value,
) {
self.write_suffix(common_prefix_len, suffix);
self.write_value(value);
}
pub fn flush_block_if_required(&mut self) -> io::Result<Option<(u64, u64)>> {
if self.block.len() > BLOCK_LEN {
return self.flush_block();
}
Ok(None)
}
pub fn finalize(mut self) -> CountingWriter<BufWriter<W>> {
self.write
}
}
pub struct DeltaReader<'a, TValueReader> {
common_prefix_len: usize,
suffix_start: usize,
suffix_end: usize,
value_reader: TValueReader,
block_reader: BlockReader<'a>,
idx: usize,
}
impl<'a, TValueReader> DeltaReader<'a, TValueReader>
where
TValueReader: value::ValueReader,
{
pub fn new<R: io::Read + 'a>(reader: R) -> Self {
DeltaReader {
idx: 0,
common_prefix_len: 0,
suffix_start: 0,
suffix_end: 0,
value_reader: TValueReader::default(),
block_reader: BlockReader::new(Box::new(reader)),
}
}
fn deserialize_vint(&mut self) -> u64 {
self.block_reader.deserialize_u64()
}
fn read_keep_add(&mut self) -> Option<(usize, usize)> {
let b = {
let buf = &self.block_reader.buffer();
if buf.is_empty() {
return None;
}
buf[0]
};
self.block_reader.advance(1);
match b {
VINT_MODE => {
let keep = self.deserialize_vint() as usize;
let add = self.deserialize_vint() as usize;
Some((keep, add))
}
b => {
let keep = (b & 0b1111) as usize;
let add = (b >> 4) as usize;
Some((keep, add))
}
}
}
fn read_delta_key(&mut self) -> bool {
if let Some((keep, add)) = self.read_keep_add() {
self.common_prefix_len = keep;
self.suffix_start = self.block_reader.offset();
self.suffix_end = self.suffix_start + add;
self.block_reader.advance(add);
true
} else {
false
}
}
pub fn advance(&mut self) -> io::Result<bool> {
if self.block_reader.buffer().is_empty() {
if !self.block_reader.read_block()? {
return Ok(false);
}
self.value_reader.read(&mut self.block_reader)?;
self.idx = 0;
} else {
self.idx += 1;
}
if !self.read_delta_key() {
return Ok(false);
}
Ok(true)
}
pub fn common_prefix_len(&self) -> usize {
self.common_prefix_len
}
pub fn suffix(&self) -> &[u8] {
&self
.block_reader
.buffer_from_to(self.suffix_start, self.suffix_end)
}
pub fn suffix_from(&self, offset: usize) -> &[u8] {
&self.block_reader.buffer_from_to(
self.suffix_start
.wrapping_add(offset)
.wrapping_sub(self.common_prefix_len),
self.suffix_end,
)
}
pub fn value(&self) -> &TValueReader::Value {
self.value_reader.value(self.idx)
}
}

View File

@@ -1,72 +0,0 @@
use crate::termdict::sstable_termdict::sstable::{Reader, SSTable, Writer};
use super::SingleValueMerger;
use super::ValueMerger;
use std::cmp::Ordering;
use std::collections::binary_heap::PeekMut;
use std::collections::BinaryHeap;
use std::io;
struct HeapItem<B: AsRef<[u8]>>(B);
impl<B: AsRef<[u8]>> Ord for HeapItem<B> {
fn cmp(&self, other: &Self) -> Ordering {
other.0.as_ref().cmp(self.0.as_ref())
}
}
impl<B: AsRef<[u8]>> PartialOrd for HeapItem<B> {
fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
Some(other.0.as_ref().cmp(self.0.as_ref()))
}
}
impl<B: AsRef<[u8]>> Eq for HeapItem<B> {}
impl<B: AsRef<[u8]>> PartialEq for HeapItem<B> {
fn eq(&self, other: &Self) -> bool {
self.0.as_ref() == other.0.as_ref()
}
}
pub fn merge_sstable<SST: SSTable, W: io::Write, M: ValueMerger<SST::Value>>(
readers: Vec<Reader<SST::Reader>>,
mut writer: Writer<W, SST::Writer>,
mut merger: M,
) -> io::Result<()> {
let mut heap: BinaryHeap<HeapItem<Reader<SST::Reader>>> =
BinaryHeap::with_capacity(readers.len());
for mut reader in readers {
if reader.advance()? {
heap.push(HeapItem(reader));
}
}
loop {
let len = heap.len();
let mut value_merger;
if let Some(mut head) = heap.peek_mut() {
writer.write_key(head.0.key());
value_merger = merger.new_value(head.0.value());
if !head.0.advance()? {
PeekMut::pop(head);
}
} else {
break;
}
for _ in 0..len - 1 {
if let Some(mut head) = heap.peek_mut() {
if head.0.key() == writer.current_key() {
value_merger.add(head.0.value());
if !head.0.advance()? {
PeekMut::pop(head);
}
continue;
}
}
break;
}
let value = value_merger.finish();
writer.write_value(&value);
writer.flush_block_if_required()?;
}
writer.finalize()?;
Ok(())
}

View File

@@ -1,184 +0,0 @@
mod heap_merge;
pub use self::heap_merge::merge_sstable;
pub trait SingleValueMerger<V> {
fn add(&mut self, v: &V);
fn finish(self) -> V;
}
pub trait ValueMerger<V> {
type TSingleValueMerger: SingleValueMerger<V>;
fn new_value(&mut self, v: &V) -> Self::TSingleValueMerger;
}
#[derive(Default)]
pub struct KeepFirst;
pub struct FirstVal<V>(V);
impl<V: Clone> ValueMerger<V> for KeepFirst {
type TSingleValueMerger = FirstVal<V>;
fn new_value(&mut self, v: &V) -> FirstVal<V> {
FirstVal(v.clone())
}
}
impl<V> SingleValueMerger<V> for FirstVal<V> {
fn add(&mut self, _: &V) {}
fn finish(self) -> V {
self.0
}
}
pub struct VoidMerge;
impl ValueMerger<()> for VoidMerge {
type TSingleValueMerger = ();
fn new_value(&mut self, _: &()) -> () {
()
}
}
pub struct U64Merge;
impl ValueMerger<u64> for U64Merge {
type TSingleValueMerger = u64;
fn new_value(&mut self, val: &u64) -> u64 {
*val
}
}
impl SingleValueMerger<u64> for u64 {
fn add(&mut self, val: &u64) {
*self += *val;
}
fn finish(self) -> u64 {
self
}
}
impl SingleValueMerger<()> for () {
fn add(&mut self, _: &()) {}
fn finish(self) -> () {
()
}
}
#[cfg(test)]
mod tests {
use super::super::SSTable;
use super::super::{SSTableMonotonicU64, VoidSSTable};
use super::U64Merge;
use super::VoidMerge;
use std::collections::{BTreeMap, BTreeSet};
use std::str;
fn write_sstable(keys: &[&'static str]) -> Vec<u8> {
let mut buffer: Vec<u8> = vec![];
{
let mut sstable_writer = VoidSSTable::writer(&mut buffer);
for &key in keys {
assert!(sstable_writer.write(key.as_bytes(), &()).is_ok());
}
assert!(sstable_writer.finalize().is_ok());
}
dbg!(&buffer);
buffer
}
fn write_sstable_u64(keys: &[(&'static str, u64)]) -> Vec<u8> {
let mut buffer: Vec<u8> = vec![];
{
let mut sstable_writer = SSTableMonotonicU64::writer(&mut buffer);
for (key, val) in keys {
assert!(sstable_writer.write(key.as_bytes(), val).is_ok());
}
assert!(sstable_writer.finalize().is_ok());
}
buffer
}
fn merge_test_aux(arrs: &[&[&'static str]]) {
let sstables = arrs.iter().cloned().map(write_sstable).collect::<Vec<_>>();
let sstables_ref: Vec<&[u8]> = sstables.iter().map(|s| s.as_ref()).collect();
let mut merged = BTreeSet::new();
for &arr in arrs.iter() {
for &s in arr {
merged.insert(s.to_string());
}
}
let mut w = Vec::new();
assert!(VoidSSTable::merge(sstables_ref, &mut w, VoidMerge).is_ok());
let mut reader = VoidSSTable::reader(&w[..]);
for k in merged {
assert!(reader.advance().unwrap());
assert_eq!(reader.key(), k.as_bytes());
}
assert!(!reader.advance().unwrap());
}
fn merge_test_u64_monotonic_aux(arrs: &[&[(&'static str, u64)]]) {
let sstables = arrs
.iter()
.cloned()
.map(write_sstable_u64)
.collect::<Vec<_>>();
let sstables_ref: Vec<&[u8]> = sstables.iter().map(|s| s.as_ref()).collect();
let mut merged = BTreeMap::new();
for &arr in arrs.iter() {
for (key, val) in arr {
let entry = merged.entry(key.to_string()).or_insert(0u64);
*entry += val;
}
}
let mut w = Vec::new();
assert!(SSTableMonotonicU64::merge(sstables_ref, &mut w, U64Merge).is_ok());
let mut reader = SSTableMonotonicU64::reader(&w[..]);
for (k, v) in merged {
assert!(reader.advance().unwrap());
assert_eq!(reader.key(), k.as_bytes());
assert_eq!(reader.value(), &v);
}
assert!(!reader.advance().unwrap());
}
#[test]
fn test_merge_simple_reproduce() {
let sstable_data = write_sstable(&["a"]);
let mut reader = VoidSSTable::reader(&sstable_data[..]);
assert!(reader.advance().unwrap());
assert_eq!(reader.key(), b"a");
assert!(!reader.advance().unwrap());
}
#[test]
fn test_merge() {
merge_test_aux(&[]);
merge_test_aux(&[&["a"]]);
merge_test_aux(&[&["a", "b"], &["ab"]]); // a, ab, b
merge_test_aux(&[&["a", "b"], &["a", "b"]]);
merge_test_aux(&[
&["happy", "hello", "payer", "tax"],
&["habitat", "hello", "zoo"],
&[],
&["a"],
]);
merge_test_aux(&[&["a"]]);
merge_test_aux(&[&["a", "b"], &["ab"]]);
merge_test_aux(&[&["a", "b"], &["a", "b"]]);
}
#[test]
fn test_merge_u64() {
merge_test_u64_monotonic_aux(&[]);
merge_test_u64_monotonic_aux(&[&[("a", 1u64)]]);
merge_test_u64_monotonic_aux(&[&[("a", 1u64), ("b", 3u64)], &[("ab", 2u64)]]); // a, ab, b
merge_test_u64_monotonic_aux(&[&[("a", 1u64), ("b", 2u64)], &[("a", 16u64), ("b", 23u64)]]);
}
}

View File

@@ -1,365 +0,0 @@
use merge::ValueMerger;
use std::io::{self, Write};
use std::usize;
mod delta;
pub mod merge;
pub mod value;
pub(crate) mod sstable_index;
pub(crate) use self::sstable_index::{SSTableIndex, SSTableIndexBuilder};
pub(crate) mod vint;
mod block_reader;
pub use self::delta::DeltaReader;
use self::delta::DeltaWriter;
use self::value::{U64MonotonicReader, U64MonotonicWriter, ValueReader, ValueWriter};
pub use self::block_reader::BlockReader;
pub use self::merge::VoidMerge;
const DEFAULT_KEY_CAPACITY: usize = 50;
pub(crate) fn common_prefix_len(left: &[u8], right: &[u8]) -> usize {
left.iter()
.cloned()
.zip(right.iter().cloned())
.take_while(|(left, right)| left == right)
.count()
}
pub trait SSTable: Sized {
type Value;
type Reader: ValueReader<Value = Self::Value>;
type Writer: ValueWriter<Value = Self::Value>;
fn delta_writer<W: io::Write>(write: W) -> DeltaWriter<W, Self::Writer> {
DeltaWriter::new(write)
}
fn writer<W: io::Write>(write: W) -> Writer<W, Self::Writer> {
Writer {
previous_key: Vec::with_capacity(DEFAULT_KEY_CAPACITY),
num_terms: 0u64,
index_builder: SSTableIndexBuilder::default(),
delta_writer: Self::delta_writer(write),
first_ordinal_of_the_block: 0u64,
}
}
fn delta_reader<'a, R: io::Read + 'a>(reader: R) -> DeltaReader<'a, Self::Reader> {
DeltaReader::new(reader)
}
fn reader<'a, R: io::Read + 'a>(reader: R) -> Reader<'a, Self::Reader> {
Reader {
key: Vec::with_capacity(DEFAULT_KEY_CAPACITY),
delta_reader: Self::delta_reader(reader),
}
}
fn merge<R: io::Read, W: io::Write, M: ValueMerger<Self::Value>>(
io_readers: Vec<R>,
w: W,
merger: M,
) -> io::Result<()> {
let readers: Vec<_> = io_readers.into_iter().map(Self::reader).collect();
let writer = Self::writer(w);
merge::merge_sstable::<Self, _, _>(readers, writer, merger)
}
}
pub struct VoidSSTable;
impl SSTable for VoidSSTable {
type Value = ();
type Reader = value::VoidReader;
type Writer = value::VoidWriter;
}
pub struct SSTableMonotonicU64;
impl SSTable for SSTableMonotonicU64 {
type Value = u64;
type Reader = U64MonotonicReader;
type Writer = U64MonotonicWriter;
}
pub struct Reader<'a, TValueReader> {
key: Vec<u8>,
delta_reader: DeltaReader<'a, TValueReader>,
}
impl<'a, TValueReader> Reader<'a, TValueReader>
where
TValueReader: ValueReader,
{
pub fn advance(&mut self) -> io::Result<bool> {
if !self.delta_reader.advance()? {
return Ok(false);
}
let common_prefix_len = self.delta_reader.common_prefix_len();
let suffix = self.delta_reader.suffix();
let new_len = self.delta_reader.common_prefix_len() + suffix.len();
self.key.resize(new_len, 0u8);
self.key[common_prefix_len..].copy_from_slice(suffix);
Ok(true)
}
pub fn key(&self) -> &[u8] {
&self.key
}
pub fn value(&self) -> &TValueReader::Value {
self.delta_reader.value()
}
pub(crate) fn into_delta_reader(self) -> DeltaReader<'a, TValueReader> {
assert!(self.key.is_empty());
self.delta_reader
}
}
impl<'a, TValueReader> AsRef<[u8]> for Reader<'a, TValueReader> {
fn as_ref(&self) -> &[u8] {
&self.key
}
}
pub struct Writer<W, TValueWriter>
where
W: io::Write,
{
previous_key: Vec<u8>,
index_builder: SSTableIndexBuilder,
delta_writer: DeltaWriter<W, TValueWriter>,
num_terms: u64,
first_ordinal_of_the_block: u64,
}
impl<W, TValueWriter> Writer<W, TValueWriter>
where
W: io::Write,
TValueWriter: value::ValueWriter,
{
pub(crate) fn current_key(&self) -> &[u8] {
&self.previous_key[..]
}
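/// Writes a key using prefix compression: only the length of the prefix
/// shared with the previous key and the remaining suffix are forwarded to the
/// `DeltaWriter`. Writing "abcd" then "abe", for instance, records a shared
/// prefix length of 2 and the suffix "e".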
pub fn write_key(&mut self, key: &[u8]) {
let keep_len = common_prefix_len(&self.previous_key, key);
let add_len = key.len() - keep_len;
let increasing_keys = add_len > 0 && (self.previous_key.len() == keep_len)
|| self.previous_key.is_empty()
|| self.previous_key[keep_len] < key[keep_len];
assert!(
increasing_keys,
"Keys should be increasing. ({:?} > {:?})",
self.previous_key, key
);
self.previous_key.resize(key.len(), 0u8);
self.previous_key[keep_len..].copy_from_slice(&key[keep_len..]);
self.delta_writer.write_suffix(keep_len, &key[keep_len..]);
}
pub(crate) fn into_delta_writer(self) -> DeltaWriter<W, TValueWriter> {
self.delta_writer
}
pub fn write(&mut self, key: &[u8], value: &TValueWriter::Value) -> io::Result<()> {
self.write_key(key);
self.write_value(value)?;
Ok(())
}
pub fn write_value(&mut self, value: &TValueWriter::Value) -> io::Result<()> {
self.delta_writer.write_value(value);
self.num_terms += 1u64;
self.flush_block_if_required()
}
pub fn flush_block_if_required(&mut self) -> io::Result<()> {
if let Some((start_offset, end_offset)) = self.delta_writer.flush_block_if_required()? {
self.index_builder.add_block(
&self.previous_key[..],
start_offset,
end_offset,
self.first_ordinal_of_the_block,
);
self.first_ordinal_of_the_block = self.num_terms;
self.previous_key.clear();
}
Ok(())
}
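/// Flushes the last block, then appends the footer: a `0u32` "no more blocks"
/// marker, the serialized block index, the byte offset of that index, and the
/// number of terms, the last two as little-endian `u64`s.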
pub fn finalize(mut self) -> io::Result<W> {
if let Some((start_offset, end_offset)) = self.delta_writer.flush_block()? {
self.index_builder.add_block(
&self.previous_key[..],
start_offset,
end_offset,
self.first_ordinal_of_the_block,
);
self.first_ordinal_of_the_block = self.num_terms;
}
let mut wrt = self.delta_writer.finalize();
wrt.write_all(&0u32.to_le_bytes())?;
let offset = wrt.written_bytes();
self.index_builder.serialize(&mut wrt)?;
wrt.write_all(&offset.to_le_bytes())?;
wrt.write_all(&self.num_terms.to_le_bytes())?;
let wrt = wrt.finish();
Ok(wrt.into_inner()?)
}
}
#[cfg(test)]
mod test {
use std::io;
use super::SSTable;
use super::VoidMerge;
use super::VoidSSTable;
use super::{common_prefix_len, SSTableMonotonicU64};
fn aux_test_common_prefix_len(left: &str, right: &str, expect_len: usize) {
assert_eq!(
common_prefix_len(left.as_bytes(), right.as_bytes()),
expect_len
);
assert_eq!(
common_prefix_len(right.as_bytes(), left.as_bytes()),
expect_len
);
}
#[test]
fn test_common_prefix_len() {
aux_test_common_prefix_len("a", "ab", 1);
aux_test_common_prefix_len("", "ab", 0);
aux_test_common_prefix_len("ab", "abc", 2);
aux_test_common_prefix_len("abde", "abce", 2);
}
#[test]
fn test_long_key_diff() {
let long_key = (0..1_024).map(|x| (x % 255) as u8).collect::<Vec<_>>();
let long_key2 = (1..300).map(|x| (x % 255) as u8).collect::<Vec<_>>();
let mut buffer = vec![];
{
let mut sstable_writer = VoidSSTable::writer(&mut buffer);
assert!(sstable_writer.write(&long_key[..], &()).is_ok());
assert!(sstable_writer.write(&[0, 3, 4], &()).is_ok());
assert!(sstable_writer.write(&long_key2[..], &()).is_ok());
assert!(sstable_writer.finalize().is_ok());
}
let mut sstable_reader = VoidSSTable::reader(&buffer[..]);
assert!(sstable_reader.advance().unwrap());
assert_eq!(sstable_reader.key(), &long_key[..]);
assert!(sstable_reader.advance().unwrap());
assert_eq!(sstable_reader.key(), &[0, 3, 4]);
assert!(sstable_reader.advance().unwrap());
assert_eq!(sstable_reader.key(), &long_key2[..]);
assert!(!sstable_reader.advance().unwrap());
}
#[test]
fn test_simple_sstable() {
let mut buffer = vec![];
{
let mut sstable_writer = VoidSSTable::writer(&mut buffer);
assert!(sstable_writer.write(&[17u8], &()).is_ok());
assert!(sstable_writer.write(&[17u8, 18u8, 19u8], &()).is_ok());
assert!(sstable_writer.write(&[17u8, 20u8], &()).is_ok());
assert!(sstable_writer.finalize().is_ok());
}
assert_eq!(
&buffer,
&[
// block len
7u8, 0u8, 0u8, 0u8, // keep 0 push 1 | ""
16u8, 17u8, // keep 1 push 2 | 18 19
33u8, 18u8, 19u8, // keep 1 push 1 | 20
17u8, 20u8, 0u8, 0u8, 0u8, 0u8, // no more blocks
// index
161, 102, 98, 108, 111, 99, 107, 115, 129, 162, 104, 108, 97, 115, 116, 95, 107,
101, 121, 130, 17, 20, 106, 98, 108, 111, 99, 107, 95, 97, 100, 100, 114, 163, 108,
115, 116, 97, 114, 116, 95, 111, 102, 102, 115, 101, 116, 0, 106, 101, 110, 100,
95, 111, 102, 102, 115, 101, 116, 11, 109, 102, 105, 114, 115, 116, 95, 111, 114,
100, 105, 110, 97, 108, 0, 15, 0, 0, 0, 0, 0, 0, 0, // offset for the index
3u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8 // num terms
]
);
let mut sstable_reader = VoidSSTable::reader(&buffer[..]);
assert!(sstable_reader.advance().unwrap());
assert_eq!(sstable_reader.key(), &[17u8]);
assert!(sstable_reader.advance().unwrap());
assert_eq!(sstable_reader.key(), &[17u8, 18u8, 19u8]);
assert!(sstable_reader.advance().unwrap());
assert_eq!(sstable_reader.key(), &[17u8, 20u8]);
assert!(!sstable_reader.advance().unwrap());
}
#[test]
#[should_panic]
fn test_simple_sstable_non_increasing_key() {
let mut buffer = vec![];
let mut sstable_writer = VoidSSTable::writer(&mut buffer);
assert!(sstable_writer.write(&[17u8], &()).is_ok());
assert!(sstable_writer.write(&[16u8], &()).is_ok());
}
#[test]
fn test_merge_abcd_abe() {
let mut buffer = Vec::new();
{
let mut writer = VoidSSTable::writer(&mut buffer);
writer.write(b"abcd", &()).unwrap();
writer.write(b"abe", &()).unwrap();
writer.finalize().unwrap();
}
let mut output = Vec::new();
assert!(VoidSSTable::merge(vec![&buffer[..], &buffer[..]], &mut output, VoidMerge).is_ok());
assert_eq!(&output[..], &buffer[..]);
}
#[test]
fn test_sstable() {
let mut buffer = Vec::new();
{
let mut writer = VoidSSTable::writer(&mut buffer);
writer.write(b"abcd", &()).unwrap();
writer.write(b"abe", &()).unwrap();
writer.finalize().unwrap();
}
let mut output = Vec::new();
assert!(VoidSSTable::merge(vec![&buffer[..], &buffer[..]], &mut output, VoidMerge).is_ok());
assert_eq!(&output[..], &buffer[..]);
}
#[test]
fn test_sstable_u64() -> io::Result<()> {
let mut buffer = Vec::new();
let mut writer = SSTableMonotonicU64::writer(&mut buffer);
writer.write(b"abcd", &1u64)?;
writer.write(b"abe", &4u64)?;
writer.write(b"gogo", &4324234234234234u64)?;
writer.finalize()?;
let mut reader = SSTableMonotonicU64::reader(&buffer[..]);
assert!(reader.advance()?);
assert_eq!(reader.key(), b"abcd");
assert_eq!(reader.value(), &1u64);
assert!(reader.advance()?);
assert_eq!(reader.key(), b"abe");
assert_eq!(reader.value(), &4u64);
assert!(reader.advance()?);
assert_eq!(reader.key(), b"gogo");
assert_eq!(reader.value(), &4324234234234234u64);
assert!(!reader.advance()?);
Ok(())
}
}


@@ -1,90 +0,0 @@
use std::io;
use serde;
use serde::{Deserialize, Serialize};
#[derive(Default, Debug, Serialize, Deserialize)]
pub struct SSTableIndex {
blocks: Vec<BlockMeta>,
}
impl SSTableIndex {
pub fn load(data: &[u8]) -> SSTableIndex {
// TODO
serde_cbor::de::from_slice(data).unwrap()
}
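/// Returns the address of the block that may contain `key`. Blocks are added
/// in key order, so the first block whose `last_key` is greater than or equal
/// to the searched key is the only candidate.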
pub fn search(&self, key: &[u8]) -> Option<BlockAddr> {
self.blocks
.iter()
.find(|block| &block.last_key[..] >= key)
.map(|block| block.block_addr)
}
}
#[derive(Clone, Eq, PartialEq, Debug, Copy, Serialize, Deserialize)]
pub struct BlockAddr {
pub start_offset: u64,
pub end_offset: u64,
pub first_ordinal: u64,
}
#[derive(Debug, Serialize, Deserialize)]
struct BlockMeta {
pub last_key: Vec<u8>,
pub block_addr: BlockAddr,
}
#[derive(Default)]
pub struct SSTableIndexBuilder {
index: SSTableIndex,
}
impl SSTableIndexBuilder {
pub fn add_block(
&mut self,
last_key: &[u8],
start_offset: u64,
stop_offset: u64,
first_ordinal: u64,
) {
self.index.blocks.push(BlockMeta {
last_key: last_key.to_vec(),
block_addr: BlockAddr {
start_offset,
end_offset: stop_offset,
first_ordinal,
},
})
}
pub fn serialize(&self, wrt: &mut dyn io::Write) -> io::Result<()> {
serde_cbor::ser::to_writer(wrt, &self.index).unwrap();
Ok(())
}
}
#[cfg(test)]
mod tests {
use super::{BlockAddr, SSTableIndex, SSTableIndexBuilder};
#[test]
fn test_sstable_index() {
let mut sstable_builder = SSTableIndexBuilder::default();
sstable_builder.add_block(b"aaa", 10u64, 20u64, 0u64);
sstable_builder.add_block(b"bbbbbbb", 20u64, 30u64, 564);
sstable_builder.add_block(b"ccc", 30u64, 40u64, 10u64);
sstable_builder.add_block(b"dddd", 40u64, 50u64, 15u64);
let mut buffer: Vec<u8> = Vec::new();
sstable_builder.serialize(&mut buffer).unwrap();
let sstable = SSTableIndex::load(&buffer[..]);
assert_eq!(
sstable.search(b"bbbde"),
Some(BlockAddr {
first_ordinal: 10u64,
start_offset: 30u64,
end_offset: 40u64
})
);
}
}


@@ -1,94 +0,0 @@
use super::{vint, BlockReader};
use std::io;
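/// Decodes the values of one block at a time: `read` loads a block from the
/// `BlockReader`, after which `value(idx)` returns the idx-th value of that
/// block.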
pub trait ValueReader: Default {
type Value;
fn value(&self, idx: usize) -> &Self::Value;
fn read(&mut self, reader: &mut BlockReader) -> io::Result<()>;
}
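/// Buffers values passed to `write` and serializes them as a single block in
/// `write_block`.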
pub trait ValueWriter: Default {
type Value;
fn write(&mut self, val: &Self::Value);
fn write_block(&mut self, writer: &mut Vec<u8>);
}
#[derive(Default)]
pub struct VoidReader;
impl ValueReader for VoidReader {
type Value = ();
fn value(&self, _idx: usize) -> &() {
&()
}
fn read(&mut self, _reader: &mut BlockReader) -> io::Result<()> {
Ok(())
}
}
#[derive(Default)]
pub struct VoidWriter;
impl ValueWriter for VoidWriter {
type Value = ();
fn write(&mut self, _val: &()) {}
fn write_block(&mut self, _writer: &mut Vec<u8>) {}
}
#[derive(Default)]
pub struct U64MonotonicWriter {
vals: Vec<u64>,
}
impl ValueWriter for U64MonotonicWriter {
type Value = u64;
fn write(&mut self, val: &Self::Value) {
self.vals.push(*val);
}
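// A block is encoded as the number of values followed by the deltas between
// consecutive values, all as vints. Values are assumed to be monotonically
// increasing, so the deltas cannot underflow.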
fn write_block(&mut self, writer: &mut Vec<u8>) {
let mut prev_val = 0u64;
vint::serialize_into_vec(self.vals.len() as u64, writer);
for &val in &self.vals {
let delta = val - prev_val;
vint::serialize_into_vec(delta, writer);
prev_val = val;
}
self.vals.clear();
}
}
#[derive(Default)]
pub struct U64MonotonicReader {
vals: Vec<u64>,
}
impl ValueReader for U64MonotonicReader {
type Value = u64;
fn value(&self, idx: usize) -> &Self::Value {
&self.vals[idx]
}
fn read(&mut self, reader: &mut BlockReader) -> io::Result<()> {
let len = reader.deserialize_u64() as usize;
self.vals.clear();
let mut prev_val = 0u64;
for _ in 0..len {
let delta = reader.deserialize_u64() as u64;
let val = prev_val + delta;
self.vals.push(val);
prev_val = val;
}
Ok(())
}
}


@@ -1,74 +0,0 @@
use super::BlockReader;
const CONTINUE_BIT: u8 = 128u8;
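// LEB128-style varint: each byte carries 7 bits of the value, with the high
// bit set on every byte except the last. For example, 300 (0b1_0010_1100)
// encodes to [0xAC, 0x02].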
pub fn serialize(mut val: u64, buffer: &mut [u8]) -> usize {
for (i, b) in buffer.iter_mut().enumerate() {
let next_byte: u8 = (val & 127u64) as u8;
val = val >> 7;
if val == 0u64 {
*b = next_byte;
return i + 1;
} else {
*b = next_byte | CONTINUE_BIT;
}
}
10 //< actually unreachable
}
pub fn serialize_into_vec(val: u64, buffer: &mut Vec<u8>) {
let mut buf = [0u8; 10];
let num_bytes = serialize(val, &mut buf[..]);
buffer.extend_from_slice(&buf[..num_bytes]);
}
// super slow but we don't care
pub fn deserialize_read(buf: &[u8]) -> (usize, u64) {
let mut result = 0u64;
let mut shift = 0u64;
let mut consumed = 0;
for &b in buf {
consumed += 1;
result |= u64::from(b % 128u8) << shift;
if b < CONTINUE_BIT {
break;
}
shift += 7;
}
(consumed, result)
}
pub fn deserialize_from_block(block: &mut BlockReader) -> u64 {
let (num_bytes, val) = deserialize_read(block.buffer());
block.advance(num_bytes);
val
}
#[cfg(test)]
mod tests {
use super::{deserialize_read, serialize};
use std::u64;
fn aux_test_int(val: u64, expect_len: usize) {
let mut buffer = [0u8; 14];
assert_eq!(serialize(val, &mut buffer[..]), expect_len);
assert_eq!(deserialize_read(&buffer), (expect_len, val));
}
#[test]
fn test_vint() {
aux_test_int(0u64, 1);
aux_test_int(17u64, 1);
aux_test_int(127u64, 1);
aux_test_int(128u64, 2);
aux_test_int(123423418u64, 4);
for i in 1..63 {
let power_of_two = 1u64 << i;
aux_test_int(power_of_two + 1, (i / 7) + 1);
aux_test_int(power_of_two, (i / 7) + 1);
aux_test_int(power_of_two - 1, ((i - 1) / 7) + 1);
}
aux_test_int(u64::MAX, 10);
}
}


@@ -1,227 +0,0 @@
use super::TermDictionary;
use crate::postings::TermInfo;
use crate::termdict::sstable_termdict::TermInfoReader;
use crate::termdict::TermOrdinal;
use std::io;
use std::ops::Bound;
use tantivy_fst::automaton::AlwaysMatch;
use tantivy_fst::Automaton;
/// `TermStreamerBuilder` is a helper object used to define
/// a range of terms that should be streamed.
pub struct TermStreamerBuilder<'a, A = AlwaysMatch>
where
A: Automaton,
A::State: Clone,
{
term_dict: &'a TermDictionary,
automaton: A,
lower: Bound<Vec<u8>>,
upper: Bound<Vec<u8>>,
}
impl<'a, A> TermStreamerBuilder<'a, A>
where
A: Automaton,
A::State: Clone,
{
pub(crate) fn new(term_dict: &'a TermDictionary, automaton: A) -> Self {
TermStreamerBuilder {
term_dict,
automaton,
lower: Bound::Unbounded,
upper: Bound::Unbounded,
}
}
/// Limit the range to terms greater or equal to the bound
pub fn ge<T: AsRef<[u8]>>(mut self, bound: T) -> Self {
self.lower = Bound::Included(bound.as_ref().to_owned());
self
}
/// Limit the range to terms strictly greater than the bound
pub fn gt<T: AsRef<[u8]>>(mut self, bound: T) -> Self {
self.lower = Bound::Excluded(bound.as_ref().to_owned());
self
}
/// Limit the range to terms lesser or equal to the bound
pub fn le<T: AsRef<[u8]>>(mut self, bound: T) -> Self {
self.upper = Bound::Included(bound.as_ref().to_owned());
self
}
/// Limit the range to terms strictly lesser than the bound
pub fn lt<T: AsRef<[u8]>>(mut self, bound: T) -> Self {
self.upper = Bound::Excluded(bound.as_ref().to_owned());
self
}
pub fn backward(mut self) -> Self {
unimplemented!()
}
/// Creates the stream corresponding to the range
/// of terms defined using the `TermStreamerBuilder`.
pub fn into_stream(self) -> io::Result<TermStreamer<'a, A>> {
let start_state = self.automaton.start();
let delta_reader = self.term_dict.sstable_delta_reader()?;
Ok(TermStreamer {
automaton: self.automaton,
states: vec![start_state],
delta_reader,
key: Vec::new(),
term_ord: 0u64,
})
}
}
/// `TermStreamer` acts as a cursor over a range of terms of a segment.
/// Terms are guaranteed to be sorted.
pub struct TermStreamer<'a, A = AlwaysMatch>
where
A: Automaton,
A::State: Clone,
{
automaton: A,
states: Vec<A::State>,
delta_reader: super::sstable::DeltaReader<'a, TermInfoReader>,
key: Vec<u8>,
term_ord: TermOrdinal,
}
impl<'a, A> TermStreamer<'a, A>
where
A: Automaton,
A::State: Clone,
{
/// Advances the stream to the next item.
/// Before the first call to `.advance()`, the stream
/// is in an uninitialized state.
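///
/// One automaton state is kept per byte of the current key, so that when the
/// next key shares a prefix with the previous one, matching resumes from the
/// state reached at the end of the common prefix instead of restarting from
/// the initial state.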
pub fn advance(&mut self) -> bool {
while self.delta_reader.advance().unwrap() {
self.term_ord += 1u64;
let common_prefix_len = self.delta_reader.common_prefix_len();
self.states.truncate(common_prefix_len + 1);
self.key.truncate(common_prefix_len);
let mut state: A::State = self.states.last().unwrap().clone();
for &b in self.delta_reader.suffix() {
state = self.automaton.accept(&state, b);
self.states.push(state.clone());
}
self.key.extend_from_slice(self.delta_reader.suffix());
if self.automaton.is_match(&state) {
return true;
}
}
false
}
/// Returns the `TermOrdinal` of the current term.
///
/// May panic if `.advance()` has never
/// been called before.
pub fn term_ord(&self) -> TermOrdinal {
self.term_ord
}
/// Accesses the current key.
///
/// `.key()` returns the key that was last returned
/// by the `.next()` method.
///
/// If the end of the stream has been reached, and `.next()`
/// has returned `None`, `.key()` keeps
/// the value of the last key encountered.
///
/// Before any call to `.next()`, `.key()` returns an empty array.
pub fn key(&self) -> &[u8] {
&self.key
}
/// Accesses the current value.
///
/// Calling `.value()` after the end of the stream will return the
/// last `.value()` encountered.
///
/// Calling `.value()` before the first call to `.advance()` returns
/// a default `TermInfo`.
pub fn value(&self) -> &TermInfo {
self.delta_reader.value()
}
/// Return the next `(key, value)` pair.
#[cfg_attr(feature = "cargo-clippy", allow(clippy::should_implement_trait))]
pub fn next(&mut self) -> Option<(&[u8], &TermInfo)> {
if self.advance() {
Some((self.key(), self.value()))
} else {
None
}
}
}
#[cfg(test)]
mod tests {
use super::super::TermDictionary;
use crate::directory::OwnedBytes;
use crate::postings::TermInfo;
fn make_term_info(i: u64) -> TermInfo {
TermInfo {
doc_freq: 1000u32 + i as u32,
positions_idx: i * 500,
postings_start_offset: (i + 10) * (i * 10),
postings_stop_offset: ((i + 1) + 10) * ((i + 1) * 10),
}
}
fn create_test_term_dictionary() -> crate::Result<TermDictionary> {
let mut term_dict_builder = super::super::TermDictionaryBuilder::create(Vec::new())?;
term_dict_builder.insert(b"abaisance", &make_term_info(0u64))?;
term_dict_builder.insert(b"abalation", &make_term_info(1u64))?;
term_dict_builder.insert(b"abalienate", &make_term_info(2u64))?;
term_dict_builder.insert(b"abandon", &make_term_info(3u64))?;
let buffer = term_dict_builder.finish()?;
let owned_bytes = OwnedBytes::new(buffer);
TermDictionary::from_bytes(owned_bytes)
}
#[test]
fn test_sstable_stream() -> crate::Result<()> {
let term_dict = create_test_term_dictionary()?;
let mut term_streamer = term_dict.stream()?;
assert!(term_streamer.advance());
assert_eq!(term_streamer.key(), b"abaisance");
assert_eq!(term_streamer.value().doc_freq, 1000u32);
assert!(term_streamer.advance());
assert_eq!(term_streamer.key(), b"abalation");
assert_eq!(term_streamer.value().doc_freq, 1001u32);
assert!(term_streamer.advance());
assert_eq!(term_streamer.key(), b"abalienate");
assert_eq!(term_streamer.value().doc_freq, 1002u32);
assert!(term_streamer.advance());
assert_eq!(term_streamer.key(), b"abandon");
assert_eq!(term_streamer.value().doc_freq, 1003u32);
assert!(!term_streamer.advance());
Ok(())
}
#[test]
fn test_sstable_search() -> crate::Result<()> {
let term_dict = create_test_term_dictionary()?;
let ptn = tantivy_fst::Regex::new("ab.*t.*").unwrap();
let mut term_streamer = term_dict.search(ptn).into_stream()?;
assert!(term_streamer.advance());
assert_eq!(term_streamer.key(), b"abalation");
assert_eq!(term_streamer.value().doc_freq, 1001u32);
assert!(term_streamer.advance());
assert_eq!(term_streamer.key(), b"abalienate");
assert_eq!(term_streamer.value().doc_freq, 1002u32);
assert!(!term_streamer.advance());
Ok(())
}
}


@@ -1,228 +0,0 @@
use std::io;
use crate::common::BinarySerializable;
use crate::directory::{FileSlice, OwnedBytes};
use crate::postings::TermInfo;
use crate::termdict::sstable_termdict::sstable::sstable_index::BlockAddr;
use crate::termdict::sstable_termdict::sstable::Writer;
use crate::termdict::sstable_termdict::sstable::{DeltaReader, SSTable};
use crate::termdict::sstable_termdict::sstable::{Reader, SSTableIndex};
use crate::termdict::sstable_termdict::{
TermInfoReader, TermInfoWriter, TermSSTable, TermStreamer, TermStreamerBuilder,
};
use crate::termdict::TermOrdinal;
use crate::HasLen;
use once_cell::sync::Lazy;
use tantivy_fst::automaton::AlwaysMatch;
use tantivy_fst::Automaton;
pub struct TermInfoSSTable;
impl SSTable for TermInfoSSTable {
type Value = TermInfo;
type Reader = TermInfoReader;
type Writer = TermInfoWriter;
}
pub struct TermDictionaryBuilder<W: io::Write> {
sstable_writer: Writer<W, TermInfoWriter>,
}
impl<W: io::Write> TermDictionaryBuilder<W> {
/// Creates a new `TermDictionaryBuilder`
pub fn create(w: W) -> io::Result<Self> {
let sstable_writer = TermSSTable::writer(w);
Ok(TermDictionaryBuilder { sstable_writer })
}
/// Inserts a `(key, value)` pair in the term dictionary.
///
/// *Keys have to be inserted in order.*
pub fn insert<K: AsRef<[u8]>>(&mut self, key_ref: K, value: &TermInfo) -> io::Result<()> {
let key = key_ref.as_ref();
self.insert_key(key)?;
self.insert_value(value)?;
Ok(())
}
/// # Warning
/// Horribly dangerous internal API
///
/// If used, it must be used by systematically alternating calls
/// to insert_key and insert_value.
///
/// Prefer using `.insert(key, value)`
pub(crate) fn insert_key(&mut self, key: &[u8]) -> io::Result<()> {
self.sstable_writer.write_key(key);
Ok(())
}
/// # Warning
///
/// Horribly dangerous internal API. See `.insert_key(...)`.
pub(crate) fn insert_value(&mut self, term_info: &TermInfo) -> io::Result<()> {
self.sstable_writer.write_value(term_info);
Ok(())
}
/// Finalize writing the builder, and returns the underlying
/// `Write` object.
pub fn finish(self) -> io::Result<W> {
self.sstable_writer.finalize()
}
}
static EMPTY_TERM_DICT_FILE: Lazy<FileSlice> = Lazy::new(|| {
let term_dictionary_data: Vec<u8> = TermDictionaryBuilder::create(Vec::<u8>::new())
.expect("Creating a TermDictionaryBuilder in a Vec<u8> should never fail")
.finish()
.expect("Writing in a Vec<u8> should never fail");
FileSlice::from(term_dictionary_data)
});
/// The term dictionary contains all of the terms of
/// a tantivy index, in sorted order.
///
/// It is stored as an SSTable. Each term is mapped to its
/// `TermOrdinal` (its rank in the sorted list of terms), and the
/// associated `TermInfo` can be fetched for any term.
pub struct TermDictionary {
sstable_slice: FileSlice,
sstable_index: SSTableIndex,
num_terms: u64,
}
impl TermDictionary {
pub(crate) fn sstable_reader(&self) -> io::Result<Reader<'static, TermInfoReader>> {
let data = self.sstable_slice.read_bytes()?;
Ok(TermInfoSSTable::reader(data))
}
pub(crate) fn sstable_reader_block(
&self,
block_addr: BlockAddr,
) -> io::Result<Reader<'static, TermInfoReader>> {
let data = self.sstable_slice.read_bytes_slice(
block_addr.start_offset as usize,
block_addr.end_offset as usize,
)?;
Ok(TermInfoSSTable::reader(data))
}
pub(crate) fn sstable_delta_reader(&self) -> io::Result<DeltaReader<'static, TermInfoReader>> {
let data = self.sstable_slice.read_bytes()?;
Ok(TermInfoSSTable::delta_reader(data))
}
/// Opens a `TermDictionary`.
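///
/// The last 16 bytes of the file form a footer holding the byte offset of the
/// serialized block index and the number of terms, both little-endian `u64`s
/// (the counterpart of `Writer::finalize`).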
pub fn open(term_dictionary_file: FileSlice) -> crate::Result<Self> {
let (main_slice, footer_len_slice) = term_dictionary_file.split_from_end(16);
let mut footer_len_bytes: OwnedBytes = footer_len_slice.read_bytes()?;
let index_offset = u64::deserialize(&mut footer_len_bytes)?;
let num_terms = u64::deserialize(&mut footer_len_bytes)?;
let (sstable_slice, index_slice) = main_slice.split(index_offset as usize);
let sstable_index_bytes = index_slice.read_bytes()?;
let sstable_index = SSTableIndex::load(sstable_index_bytes.as_slice());
Ok(TermDictionary {
sstable_slice,
sstable_index,
num_terms,
})
}
pub fn from_bytes(owned_bytes: OwnedBytes) -> crate::Result<TermDictionary> {
TermDictionary::open(FileSlice::new(Box::new(owned_bytes)))
}
/// Creates an empty term dictionary which contains no terms.
pub fn empty() -> Self {
TermDictionary::open(EMPTY_TERM_DICT_FILE.clone()).unwrap()
}
/// Returns the number of terms in the dictionary.
/// Term ordinals range from 0 to `num_terms() - 1`.
pub fn num_terms(&self) -> usize {
self.num_terms as usize
}
/// Returns the ordinal associated to a given term.
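///
/// Note: this is implemented as a linear scan from the first term.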
pub fn term_ord<K: AsRef<[u8]>>(&self, key: K) -> io::Result<Option<TermOrdinal>> {
let mut term_ord = 0u64;
let key_bytes = key.as_ref();
let mut sstable_reader = self.sstable_reader()?;
while sstable_reader.advance().unwrap_or(false) {
if sstable_reader.key() == key_bytes {
return Ok(Some(term_ord));
}
term_ord += 1;
}
Ok(None)
}
/// Returns the term associated to a given term ordinal.
///
/// Term ordinals are defined as the position of the term in
/// the sorted list of terms.
///
/// Returns true iff the term has been found.
///
/// Regardless of whether the term is found or not,
/// the buffer may be modified.
pub fn ord_to_term(&self, ord: TermOrdinal, bytes: &mut Vec<u8>) -> io::Result<bool> {
let mut sstable_reader = self.sstable_reader()?;
bytes.clear();
for _ in 0..(ord + 1) {
if !sstable_reader.advance().unwrap_or(false) {
return Ok(false);
}
}
bytes.extend_from_slice(sstable_reader.key());
Ok(true)
}
/// Returns the `TermInfo` associated with the given term ordinal.
pub fn term_info_from_ord(&self, term_ord: TermOrdinal) -> io::Result<TermInfo> {
let mut sstable_reader = self.sstable_reader()?;
for _ in 0..(term_ord + 1) {
if !sstable_reader.advance().unwrap_or(false) {
return Ok(TermInfo::default());
}
}
Ok(sstable_reader.value().clone())
}
/// Lookups the value corresponding to the key.
pub fn get<K: AsRef<[u8]>>(&self, key: K) -> io::Result<Option<TermInfo>> {
if let Some(block_addr) = self.sstable_index.search(key.as_ref()) {
let mut sstable_reader = self.sstable_reader_block(block_addr)?;
let key_bytes = key.as_ref();
while sstable_reader.advance().unwrap_or(false) {
if sstable_reader.key() == key_bytes {
let term_info = sstable_reader.value().clone();
return Ok(Some(term_info));
}
}
}
Ok(None)
}
/// Returns a range builder, to stream all of the terms
/// within an interval.
pub fn range(&self) -> TermStreamerBuilder<'_> {
TermStreamerBuilder::new(self, AlwaysMatch)
}
/// A stream of all the sorted terms.
pub fn stream(&self) -> io::Result<TermStreamer<'_>> {
self.range().into_stream()
}
/// Returns a search builder, to stream all of the terms
/// matching the given `Automaton`.
pub fn search<'a, A: Automaton + 'a>(&'a self, automaton: A) -> TermStreamerBuilder<'a, A>
where
A::State: Clone,
{
TermStreamerBuilder::<A>::new(self, automaton)
}
}
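For orientation, here is a minimal sketch (not part of the diff) of how the builder, the dictionary and the streamer fit together. It assumes it lives inside the crate next to these modules, and `make_term_info` stands in for the helper defined in the streamer tests above.

use crate::directory::OwnedBytes;

fn term_dict_round_trip() -> crate::Result<()> {
    // Keys must be inserted in sorted order.
    let mut builder = TermDictionaryBuilder::create(Vec::new())?;
    builder.insert(b"abaisance", &make_term_info(0u64))?;
    builder.insert(b"abandon", &make_term_info(1u64))?;
    let dict = TermDictionary::from_bytes(OwnedBytes::new(builder.finish()?))?;
    assert_eq!(dict.num_terms(), 2);
    // Point lookup: the SSTableIndex narrows the search down to a single block.
    assert!(dict.get(b"abandon")?.is_some());
    // Sorted iteration over all of the terms.
    let mut stream = dict.stream()?;
    while let Some((_key, _term_info)) = stream.next() {}
    Ok(())
}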


@@ -249,8 +249,7 @@ fn test_empty_string() -> crate::Result<()> {
Ok(())
}
#[test]
fn test_stream_range_boundaries() -> crate::Result<()> {
fn stream_range_test_dict() -> crate::Result<TermDictionary> {
let buffer: Vec<u8> = {
let mut term_dictionary_builder = TermDictionaryBuilder::create(Vec::new())?;
for i in 0u8..10u8 {
@@ -260,84 +259,96 @@ fn test_stream_range_boundaries() -> crate::Result<()> {
term_dictionary_builder.finish()?
};
let file = FileSlice::from(buffer);
let term_dictionary: TermDictionary = TermDictionary::open(file)?;
TermDictionary::open(file)
}
let value_list = |mut streamer: TermStreamer<'_>, backwards: bool| {
#[test]
fn test_stream_range_boundaries_forward() -> crate::Result<()> {
let term_dictionary = stream_range_test_dict()?;
let value_list = |mut streamer: TermStreamer<'_>| {
let mut res: Vec<u32> = vec![];
while let Some((_, ref v)) = streamer.next() {
res.push(v.doc_freq);
}
if backwards {
res.reverse();
}
res
};
{
let range = term_dictionary.range().backward().into_stream()?;
assert_eq!(
value_list(range, true),
vec![0u32, 1u32, 2u32, 3u32, 4u32, 5u32, 6u32, 7u32, 8u32, 9u32]
);
}
{
let range = term_dictionary.range().ge([2u8]).into_stream()?;
assert_eq!(
value_list(range, false),
vec![2u32, 3u32, 4u32, 5u32, 6u32, 7u32, 8u32, 9u32]
);
}
{
let range = term_dictionary.range().ge([2u8]).backward().into_stream()?;
assert_eq!(
value_list(range, true),
value_list(range),
vec![2u32, 3u32, 4u32, 5u32, 6u32, 7u32, 8u32, 9u32]
);
}
{
let range = term_dictionary.range().gt([2u8]).into_stream()?;
assert_eq!(
value_list(range, false),
vec![3u32, 4u32, 5u32, 6u32, 7u32, 8u32, 9u32]
);
}
{
let range = term_dictionary.range().gt([2u8]).backward().into_stream()?;
assert_eq!(
value_list(range, true),
value_list(range),
vec![3u32, 4u32, 5u32, 6u32, 7u32, 8u32, 9u32]
);
}
{
let range = term_dictionary.range().lt([6u8]).into_stream()?;
assert_eq!(
value_list(range, false),
vec![0u32, 1u32, 2u32, 3u32, 4u32, 5u32]
);
}
{
let range = term_dictionary.range().lt([6u8]).backward().into_stream()?;
assert_eq!(
value_list(range, true),
vec![0u32, 1u32, 2u32, 3u32, 4u32, 5u32]
);
assert_eq!(value_list(range), vec![0u32, 1u32, 2u32, 3u32, 4u32, 5u32]);
}
{
let range = term_dictionary.range().le([6u8]).into_stream()?;
assert_eq!(
value_list(range, false),
vec![0u32, 1u32, 2u32, 3u32, 4u32, 5u32, 6u32]
);
}
{
let range = term_dictionary.range().le([6u8]).backward().into_stream()?;
assert_eq!(
value_list(range, true),
value_list(range),
vec![0u32, 1u32, 2u32, 3u32, 4u32, 5u32, 6u32]
);
}
{
let range = term_dictionary.range().ge([0u8]).lt([5u8]).into_stream()?;
assert_eq!(value_list(range, false), vec![0u32, 1u32, 2u32, 3u32, 4u32]);
assert_eq!(value_list(range), vec![0u32, 1u32, 2u32, 3u32, 4u32]);
}
Ok(())
}
#[test]
fn test_stream_range_boundaries_backward() -> crate::Result<()> {
let term_dictionary = stream_range_test_dict()?;
let value_list_backward = |mut streamer: TermStreamer<'_>| {
let mut res: Vec<u32> = vec![];
while let Some((_, ref v)) = streamer.next() {
res.push(v.doc_freq);
}
res.reverse();
res
};
{
let range = term_dictionary.range().backward().into_stream()?;
assert_eq!(
value_list_backward(range),
vec![0u32, 1u32, 2u32, 3u32, 4u32, 5u32, 6u32, 7u32, 8u32, 9u32]
);
}
{
let range = term_dictionary.range().ge([2u8]).backward().into_stream()?;
assert_eq!(
value_list_backward(range),
vec![2u32, 3u32, 4u32, 5u32, 6u32, 7u32, 8u32, 9u32]
);
}
{
let range = term_dictionary.range().gt([2u8]).backward().into_stream()?;
assert_eq!(
value_list_backward(range),
vec![3u32, 4u32, 5u32, 6u32, 7u32, 8u32, 9u32]
);
}
{
let range = term_dictionary.range().lt([6u8]).backward().into_stream()?;
assert_eq!(
value_list_backward(range),
vec![0u32, 1u32, 2u32, 3u32, 4u32, 5u32]
);
}
{
let range = term_dictionary.range().le([6u8]).backward().into_stream()?;
assert_eq!(
value_list_backward(range),
vec![0u32, 1u32, 2u32, 3u32, 4u32, 5u32, 6u32]
);
}
{
let range = term_dictionary
@@ -346,11 +357,38 @@ fn test_stream_range_boundaries() -> crate::Result<()> {
.lt([5u8])
.backward()
.into_stream()?;
assert_eq!(value_list(range, true), vec![0u32, 1u32, 2u32, 3u32, 4u32]);
assert_eq!(
value_list_backward(range),
vec![0u32, 1u32, 2u32, 3u32, 4u32]
);
}
Ok(())
}
#[test]
fn test_ord_to_term() -> crate::Result<()> {
let termdict = stream_range_test_dict()?;
let mut bytes = vec![];
for b in 0u8..10u8 {
termdict.ord_to_term(b as u64, &mut bytes)?;
assert_eq!(&bytes, &[b]);
}
Ok(())
}
#[test]
fn test_stream_term_ord() -> crate::Result<()> {
let termdict = stream_range_test_dict()?;
let mut stream = termdict.stream()?;
for b in 0u8..10u8 {
assert!(stream.advance());
assert_eq!(stream.term_ord(), b as u64);
assert_eq!(stream.key(), &[b]);
}
assert!(!stream.advance());
Ok(())
}
#[test]
fn test_automaton_search() -> crate::Result<()> {
use crate::query::DFAWrapper;