Mirror of https://github.com/quickwit-oss/tantivy.git
Synced 2026-01-04 16:22:55 +00:00

Compare commits: softcommit ... issue/weba (2 commits)
- 507e46f814
- 3d3da2d66f
@@ -29,7 +29,7 @@ addons:
 matrix:
 include:
 # Android
-- env: TARGET=aarch64-linux-android DISABLE_TESTS
+- env: TARGET=aarch64-linux-android DISABLE_TESTS=1
 #- env: TARGET=arm-linux-androideabi DISABLE_TESTS=1
 #- env: TARGET=armv7-linux-androideabi DISABLE_TESTS=1
 #- env: TARGET=i686-linux-android DISABLE_TESTS=1
@@ -77,4 +77,4 @@ before_cache:

 notifications:
 email:
 on_success: never
CHANGELOG.md (46 changed lines)
@@ -1,49 +1,3 @@
-Tantivy 0.9.0
-=====================
-*0.9.0 index format is not compatible with the
-previous index format.*
-- MAJOR BUGFIX :
-Some `Mmap` objects were being leaked, and would never get released. (@fulmicoton)
-- Removed most unsafe (@fulmicoton)
-- Indexer memory footprint improved. (VInt comp, inlining the first block. (@fulmicoton)
-- Stemming in other language possible (@pentlander)
-- Segments with no docs are deleted earlier (@barrotsteindev)
-- Added grouped add and delete operations.
-They are guaranteed to happen together (i.e. they cannot be split by a commit).
-In addition, adds are guaranteed to happen on the same segment. (@elbow-jason)
-- Removed `INT_STORED` and `INT_INDEXED`. It is now possible to use `STORED` and `INDEXED`
-for int fields. (@fulmicoton)
-- Added DateTime field (@barrotsteindev)
-- Added IndexReader. By default, index is reloaded automatically upon new commits (@fulmicoton)
-- SIMD linear search within blocks (@fulmicoton)
-
-Tantivy 0.8.2
-=====================
-Fixing build for x86_64 platforms. (#496)
-No need to update from 0.8.1 if tantivy
-is building on your platform.
-
-
-Tantivy 0.8.1
-=====================
-Hotfix of #476.
-
-Merge was reflecting deletes before commit was passed.
-Thanks @barrotsteindev for reporting the bug.
-
-
-Tantivy 0.8.0
-=====================
-*No change in the index format*
-- API Breaking change in the collector API. (@jwolfe, @fulmicoton)
-- Multithreaded search (@jwolfe, @fulmicoton)
-
-
-Tantivy 0.7.1
-=====================
-*No change in the index format*
-- Bugfix: NGramTokenizer panics on non ascii chars
-- Added a space usage API
-
 Tantivy 0.7
 =====================
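The removed 0.9.0 notes above mention a new `IndexReader` that reloads automatically on commit. As a minimal sketch of what that entry refers to, assembled only from API calls that appear in the example hunks later in this compare (the schema, the in-RAM index, and the single document are placeholders, not part of the diff):

```rust
#[macro_use]
extern crate tantivy;

use tantivy::schema::{Schema, TEXT};
use tantivy::{Index, ReloadPolicy};

fn main() -> tantivy::Result<()> {
    // Placeholder schema and in-RAM index, just to make the sketch runnable.
    let mut schema_builder = Schema::builder();
    let title = schema_builder.add_text_field("title", TEXT);
    let index = Index::create_in_ram(schema_builder.build());

    let mut index_writer = index.writer(50_000_000)?;
    index_writer.add_document(doc!(title => "Of Mice and Men"));
    index_writer.commit()?;

    // 0.7-era flow (the `+` side of the example hunks below):
    //     index.load_searchers()?;
    //     let searcher = index.searcher();

    // 0.9-era flow (the `-` side): an IndexReader that reloads on every commit.
    let reader = index
        .reader_builder()
        .reload_policy(ReloadPolicy::OnCommit)
        .try_into()?;
    let _searcher = reader.searcher();
    Ok(())
}
```

The commented lines show the older explicit-reload flow that the `+` side of the example hunks below reintroduces.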
Cargo.toml (42 changed lines)
@@ -1,6 +1,6 @@
 [package]
 name = "tantivy"
-version = "0.9.0"
+version = "0.7.0"
 authors = ["Paul Masurel <paul.masurel@gmail.com>"]
 license = "MIT"
 categories = ["database-implementations", "data-structures"]
@@ -12,12 +12,12 @@ readme = "README.md"
 keywords = ["search", "information", "retrieval"]

 [dependencies]
-base64 = "0.10.0"
+base64 = "0.9.1"
 byteorder = "1.0"
 lazy_static = "1"
 regex = "1.0"
-tantivy-fst = "0.1"
-memmap = {version = "0.7", optional=true}
+fst = {version="0.3", default-features=false}
+fst-regex = { version="0.2", optional=true}
 lz4 = {version="1.20", optional=true}
 snap = {version="0.2"}
 atomicwrites = {version="0.2.2", optional=true}
@@ -29,38 +29,33 @@ serde = "1.0"
 serde_derive = "1.0"
 serde_json = "1.0"
 num_cpus = "1.2"
-fs2={version="0.4", optional=true}
-itertools = "0.8"
+itertools = "0.7"
 levenshtein_automata = {version="0.1", features=["fst_automaton"]}
-notify = {version="4", optional=true}
 bit-set = "0.5"
-uuid = { version = "0.7.2", features = ["v4", "serde"] }
-crossbeam = "0.5"
+uuid = { version = "0.7", features = ["v4", "serde"] }
+crossbeam = "0.4"
+crossbeam-channel = "0.2"
 futures = "0.1"
 futures-cpupool = "0.1"
 owning_ref = "0.4"
 stable_deref_trait = "1.0.0"
-rust-stemmers = "1.1"
-downcast-rs = { version="1.0" }
-bitpacking = "0.6"
-census = "0.2"
+rust-stemmers = "1"
+downcast = { version="0.9" }
+matches = "0.1"
+bitpacking = "0.5"
+census = "0.1"
 fnv = "1.0.6"
 owned-read = "0.4"
 failure = "0.1"
 htmlescape = "0.3.1"
 fail = "0.2"
-scoped-pool = "1.0"
-murmurhash32 = "0.2"
-chrono = "0.4"

 [target.'cfg(windows)'.dependencies]
 winapi = "0.2"

 [dev-dependencies]
-rand = "0.6"
+rand = "0.5"
 maplit = "1"
-matches = "0.1.8"
-time = "0.1.42"

 [profile.release]
 opt-level = 3
@@ -73,12 +68,13 @@ overflow-checks = true

 [features]
 # by default no-fail is disabled. We manually enable it when running test.
-default = ["mmap", "no_fail"]
-mmap = ["atomicwrites", "fs2", "memmap", "notify"]
+default = ["mmap", "no_fail", "regex_query"]
+mmap = ["fst/mmap", "atomicwrites"]
+regex_query = ["fst-regex"]
 lz4-compression = ["lz4"]
 no_fail = ["fail/no_fail"]
-unstable = [] # useful for benches.
-wasm-bindgen = ["uuid/wasm-bindgen"]

 [badges]
 travis-ci = { repository = "tantivy-search/tantivy" }

README.md (27 changed lines)
@@ -17,29 +17,19 @@
 [](https://sourcerer.io/fame/fulmicoton/tantivy-search/tantivy/links/6)
 [](https://sourcerer.io/fame/fulmicoton/tantivy-search/tantivy/links/7)

-[](https://www.patreon.com/fulmicoton)


 **Tantivy** is a **full text search engine library** written in rust.

-It is closer to [Apache Lucene](https://lucene.apache.org/) than to [Elasticsearch](https://www.elastic.co/products/elasticsearch) and [Apache Solr](https://lucene.apache.org/solr/) in the sense it is not
+It is closer to Lucene than to Elastic Search and Solr in the sense it is not
 an off-the-shelf search engine server, but rather a crate that can be used
 to build such a search engine.

 Tantivy is, in fact, strongly inspired by Lucene's design.

-# Benchmark
-
-Tantivy is typically faster than Lucene, but the results will depend on
-the nature of the queries in your workload.
-
-The following [benchmark](https://tantivy-search.github.io/bench/) break downs
-performance for different type of queries / collection.
-
 # Features

 - Full-text search
-- Configurable tokenizer. (stemming available for 17 latin languages. Third party support for Chinese ([tantivy-jieba](https://crates.io/crates/tantivy-jieba) and [cang-jie](https://crates.io/crates/cang-jie)) and [Japanese](https://crates.io/crates/tantivy-tokenizer-tiny-segmenter)
 - Fast (check out the :racehorse: :sparkles: [benchmark](https://tantivy-search.github.io/bench/) :sparkles: :racehorse:)
 - Tiny startup time (<10ms), perfect for command line tools
 - BM25 scoring (the same as lucene)
@@ -51,7 +41,6 @@ performance for different type of queries / collection.
 - SIMD integer compression when the platform/CPU includes the SSE2 instruction set.
 - Single valued and multivalued u64 and i64 fast fields (equivalent of doc values in Lucene)
 - `&[u8]` fast fields
-- Text, i64, u64, dates and hierarchical facet fields
 - LZ4 compressed document store
 - Range queries
 - Faceted search
@@ -87,7 +76,7 @@ It will walk you through getting a wikipedia search engine up and running in a f
 Tantivy compiles on stable rust but requires `Rust >= 1.27`.
 To check out and run tests, you can simply run :

-git clone https://github.com/tantivy-search/tantivy.git
+git clone git@github.com:tantivy-search/tantivy.git
 cd tantivy
 cargo build

@@ -96,14 +85,6 @@ To check out and run tests, you can simply run :
 Some tests will not run with just `cargo test` because of `fail-rs`.
 To run the tests exhaustively, run `./run-tests.sh`.

-# How can I support this project ?
+# Contribute

-There are many ways to support this project.
+Send me an email (paul.masurel at gmail.com) if you want to contribute to tantivy.

-- If you use tantivy, tell us about your experience on [gitter](https://gitter.im/tantivy-search/tantivy) or by email (paul.masurel@gmail.com)
-- Report bugs
-- Write a blog post
-- Complete documentation
-- Contribute code (you can join [our gitter](https://gitter.im/tantivy-search/tantivy) )
-- Talk about tantivy around you
-- Drop a word on on [](https://saythanks.io/to/fulmicoton) or even [](https://www.patreon.com/fulmicoton)
@@ -16,12 +16,10 @@ extern crate tempdir;
 // Importing tantivy...
 #[macro_use]
 extern crate tantivy;
-use tantivy::collector::TopDocs;
+use tantivy::collector::TopCollector;
 use tantivy::query::QueryParser;
 use tantivy::schema::*;
 use tantivy::Index;
-use tantivy::ReloadPolicy;
-use tempdir::TempDir;

 fn main() -> tantivy::Result<()> {
 // Let's create a temporary directory for the
@@ -36,7 +34,7 @@ fn main() -> tantivy::Result<()> {
 // be indexed".

 // first we need to define a schema ...
-let mut schema_builder = Schema::builder();
+let mut schema_builder = SchemaBuilder::default();

 // Our first field is title.
 // We want full-text search for it, and we also want
@@ -107,37 +105,37 @@ fn main() -> tantivy::Result<()> {
 // For convenience, tantivy also comes with a macro to
 // reduce the boilerplate above.
 index_writer.add_document(doc!(
 title => "Of Mice and Men",
 body => "A few miles south of Soledad, the Salinas River drops in close to the hillside \
 bank and runs deep and green. The water is warm too, for it has slipped twinkling \
 over the yellow sands in the sunlight before reaching the narrow pool. On one \
 side of the river the golden foothill slopes curve up to the strong and rocky \
 Gabilan Mountains, but on the valley side the water is lined with trees—willows \
 fresh and green with every spring, carrying in their lower leaf junctures the \
 debris of the winter’s flooding; and sycamores with mottled, white, recumbent \
 limbs and branches that arch over the pool"
 ));

 index_writer.add_document(doc!(
 title => "Of Mice and Men",
 body => "A few miles south of Soledad, the Salinas River drops in close to the hillside \
 bank and runs deep and green. The water is warm too, for it has slipped twinkling \
 over the yellow sands in the sunlight before reaching the narrow pool. On one \
 side of the river the golden foothill slopes curve up to the strong and rocky \
 Gabilan Mountains, but on the valley side the water is lined with trees—willows \
 fresh and green with every spring, carrying in their lower leaf junctures the \
 debris of the winter’s flooding; and sycamores with mottled, white, recumbent \
 limbs and branches that arch over the pool"
 ));

 // Multivalued field just need to be repeated.
 index_writer.add_document(doc!(
 title => "Frankenstein",
 title => "The Modern Prometheus",
 body => "You will rejoice to hear that no disaster has accompanied the commencement of an \
 enterprise which you have regarded with such evil forebodings. I arrived here \
 yesterday, and my first task is to assure my dear sister of my welfare and \
 increasing confidence in the success of my undertaking."
 ));

 // This is an example, so we will only index 3 documents
@@ -171,33 +169,24 @@ fn main() -> tantivy::Result<()> {
 //
 // ### Searcher
 //
-// A reader is required to get search the index.
-// It acts as a `Searcher` pool that reloads itself,
-// depending on a `ReloadPolicy`.
-//
-// For a search server you will typically create one reader for the entire lifetime of your
-// program, and acquire a new searcher for every single request.
-//
-// In the code below, we rely on the 'ON_COMMIT' policy: the reader
-// will reload the index automatically after each commit.
-let reader = index
-.reader_builder()
-.reload_policy(ReloadPolicy::OnCommit)
-.try_into()?;
+// Let's search our index. Start by reloading
+// searchers in the index. This should be done
+// after every `commit()`.
+index.load_searchers()?;

 // We now need to acquire a searcher.
-//
-// A searcher points to snapshotted, immutable version of the index.
-//
 // Some search experience might require more than
-// one query. Using the same searcher ensures that all of these queries will run on the
-// same version of the index.
+// one query.
+//
+// The searcher ensure that we get to work
+// with a consistent version of the index.
 //
 // Acquiring a `searcher` is very cheap.
 //
-// You should acquire a searcher every time you start processing a request and
+// You should acquire a searcher every time you
+// start processing a request and
 // and release it right after your query is finished.
-let searcher = reader.searcher();
+let searcher = index.searcher();

 // ### Query

@@ -223,10 +212,15 @@ fn main() -> tantivy::Result<()> {
 //
 // We are not interested in all of the documents but
 // only in the top 10. Keeping track of our top 10 best documents
-// is the role of the TopDocs.
+// is the role of the TopCollector.
+let mut top_collector = TopCollector::with_limit(10);

 // We can now perform our query.
-let top_docs = searcher.search(&query, &TopDocs::with_limit(10))?;
+searcher.search(&*query, &mut top_collector)?;

+// Our top collector now contains the 10
+// most relevant doc ids...
+let doc_addresses = top_collector.docs();

 // The actual documents still need to be
 // retrieved from Tantivy's store.
@@ -234,10 +228,13 @@ fn main() -> tantivy::Result<()> {
 // Since the body field was not configured as stored,
 // the document returned will only contain
 // a title.
-for (_score, doc_address) in top_docs {
+for doc_address in doc_addresses {
 let retrieved_doc = searcher.doc(doc_address)?;
 println!("{}", schema.to_json(&retrieved_doc));
 }

 Ok(())
 }

+use tempdir::TempDir;
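The hunks above swap the 0.9-era `TopDocs` collector for the older `TopCollector`. Below is a minimal, self-contained sketch of the two collector styles, built only from calls that appear in this compare; the schema, the single document, and the query string "mice" are placeholders:

```rust
#[macro_use]
extern crate tantivy;

use tantivy::collector::TopDocs;
use tantivy::query::QueryParser;
use tantivy::schema::{Schema, STORED, TEXT};
use tantivy::Index;

fn main() -> tantivy::Result<()> {
    // Placeholder schema, document, and query.
    let mut schema_builder = Schema::builder();
    let title = schema_builder.add_text_field("title", TEXT | STORED);
    let schema = schema_builder.build();
    let index = Index::create_in_ram(schema.clone());

    let mut index_writer = index.writer(50_000_000)?;
    index_writer.add_document(doc!(title => "Of Mice and Men"));
    index_writer.commit()?;

    let reader = index.reader()?;
    let searcher = reader.searcher();
    let query = QueryParser::for_index(&index, vec![title]).parse_query("mice")?;

    // 0.9-era collector API (the `-` side above): the search call returns the
    // collector's result directly, as (score, address) pairs.
    let top_docs = searcher.search(&query, &TopDocs::with_limit(10))?;
    for (_score, doc_address) in top_docs {
        let retrieved_doc = searcher.doc(doc_address)?;
        println!("{}", schema.to_json(&retrieved_doc));
    }

    // 0.7-era collector API (the `+` side), for comparison:
    //     let mut top_collector = TopCollector::with_limit(10);
    //     searcher.search(&*query, &mut top_collector)?;
    //     let doc_addresses = top_collector.docs();
    Ok(())
}
```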
@@ -1,187 +0,0 @@
-// # Custom collector example
-//
-// This example shows how you can implement your own
-// collector. As an example, we will compute a collector
-// that computes the standard deviation of a given fast field.
-//
-// Of course, you can have a look at the tantivy's built-in collectors
-// such as the `CountCollector` for more examples.
-
-extern crate tempdir;
-
-// ---
-// Importing tantivy...
-#[macro_use]
-extern crate tantivy;
-use tantivy::collector::{Collector, SegmentCollector};
-use tantivy::fastfield::FastFieldReader;
-use tantivy::query::QueryParser;
-use tantivy::schema::Field;
-use tantivy::schema::{Schema, FAST, INDEXED, TEXT};
-use tantivy::Index;
-use tantivy::SegmentReader;
-
-#[derive(Default)]
-struct Stats {
-count: usize,
-sum: f64,
-squared_sum: f64,
-}
-
-impl Stats {
-pub fn count(&self) -> usize {
-self.count
-}
-
-pub fn mean(&self) -> f64 {
-self.sum / (self.count as f64)
-}
-
-fn square_mean(&self) -> f64 {
-self.squared_sum / (self.count as f64)
-}
-
-pub fn standard_deviation(&self) -> f64 {
-let mean = self.mean();
-(self.square_mean() - mean * mean).sqrt()
-}
-
-fn non_zero_count(self) -> Option<Stats> {
-if self.count == 0 {
-None
-} else {
-Some(self)
-}
-}
-}
-
-struct StatsCollector {
-field: Field,
-}
-
-impl StatsCollector {
-fn with_field(field: Field) -> StatsCollector {
-StatsCollector { field }
-}
-}
-
-impl Collector for StatsCollector {
-// That's the type of our result.
-// Our standard deviation will be a float.
-type Fruit = Option<Stats>;
-
-type Child = StatsSegmentCollector;
-
-fn for_segment(
-&self,
-_segment_local_id: u32,
-segment: &SegmentReader,
-) -> tantivy::Result<StatsSegmentCollector> {
-let fast_field_reader = segment.fast_field_reader(self.field)?;
-Ok(StatsSegmentCollector {
-fast_field_reader,
-stats: Stats::default(),
-})
-}
-
-fn requires_scoring(&self) -> bool {
-// this collector does not care about score.
-false
-}
-
-fn merge_fruits(&self, segment_stats: Vec<Option<Stats>>) -> tantivy::Result<Option<Stats>> {
-let mut stats = Stats::default();
-for segment_stats_opt in segment_stats {
-if let Some(segment_stats) = segment_stats_opt {
-stats.count += segment_stats.count;
-stats.sum += segment_stats.sum;
-stats.squared_sum += segment_stats.squared_sum;
-}
-}
-Ok(stats.non_zero_count())
-}
-}
-
-struct StatsSegmentCollector {
-fast_field_reader: FastFieldReader<u64>,
-stats: Stats,
-}
-
-impl SegmentCollector for StatsSegmentCollector {
-type Fruit = Option<Stats>;
-
-fn collect(&mut self, doc: u32, _score: f32) {
-let value = self.fast_field_reader.get(doc) as f64;
-self.stats.count += 1;
-self.stats.sum += value;
-self.stats.squared_sum += value * value;
-}
-
-fn harvest(self) -> <Self as SegmentCollector>::Fruit {
-self.stats.non_zero_count()
-}
-}
-
-fn main() -> tantivy::Result<()> {
-// # Defining the schema
-//
-// The Tantivy index requires a very strict schema.
-// The schema declares which fields are in the index,
-// and for each field, its type and "the way it should
-// be indexed".
-
-// first we need to define a schema ...
-let mut schema_builder = Schema::builder();
-
-// We'll assume a fictional index containing
-// products, and with a name, a description, and a price.
-let product_name = schema_builder.add_text_field("name", TEXT);
-let product_description = schema_builder.add_text_field("description", TEXT);
-let price = schema_builder.add_u64_field("price", INDEXED | FAST);
-let schema = schema_builder.build();
-
-// # Indexing documents
-//
-// Lets index a bunch of fake documents for the sake of
-// this example.
-let index = Index::create_in_ram(schema.clone());
-
-let mut index_writer = index.writer(50_000_000)?;
-index_writer.add_document(doc!(
-product_name => "Super Broom 2000",
-product_description => "While it is ok for short distance travel, this broom \
-was designed quiditch. It will up your game.",
-price => 30_200u64
-));
-index_writer.add_document(doc!(
-product_name => "Turbulobroom",
-product_description => "You might have heard of this broom before : it is the sponsor of the Wales team.\
-You'll enjoy its sharp turns, and rapid acceleration",
-price => 29_240u64
-));
-index_writer.add_document(doc!(
-product_name => "Broomio",
-product_description => "Great value for the price. This broom is a market favorite",
-price => 21_240u64
-));
-index_writer.add_document(doc!(
-product_name => "Whack a Mole",
-product_description => "Prime quality bat.",
-price => 5_200u64
-));
-index_writer.commit()?;
-
-let reader = index.reader()?;
-let searcher = reader.searcher();
-let query_parser = QueryParser::for_index(&index, vec![product_name, product_description]);
-
-// here we want to get a hit on the 'ken' in Frankenstein
-let query = query_parser.parse_query("broom")?;
-if let Some(stats) = searcher.search(&query, &StatsCollector::with_field(price))? {
-println!("count: {}", stats.count());
-println!("mean: {}", stats.mean());
-println!("standard deviation: {}", stats.standard_deviation());
-}
-
-Ok(())
-}
@@ -5,7 +5,7 @@

 #[macro_use]
 extern crate tantivy;
-use tantivy::collector::TopDocs;
+use tantivy::collector::TopCollector;
 use tantivy::query::QueryParser;
 use tantivy::schema::*;
 use tantivy::tokenizer::NgramTokenizer;
@@ -20,7 +20,7 @@ fn main() -> tantivy::Result<()> {
 // be indexed".

 // first we need to define a schema ...
-let mut schema_builder = Schema::builder();
+let mut schema_builder = SchemaBuilder::default();

 // Our first field is title.
 // In this example we want to use NGram searching
@@ -68,12 +68,12 @@ fn main() -> tantivy::Result<()> {
 // heap for the indexer can increase its throughput.
 let mut index_writer = index.writer(50_000_000)?;
 index_writer.add_document(doc!(
 title => "The Old Man and the Sea",
 body => "He was an old man who fished alone in a skiff in the Gulf Stream and \
 he had gone eighty-four days now without taking a fish."
 ));
 index_writer.add_document(doc!(
 title => "Of Mice and Men",
 body => r#"A few miles south of Soledad, the Salinas River drops in close to the hillside
 bank and runs deep and green. The water is warm too, for it has slipped twinkling
 over the yellow sands in the sunlight before reaching the narrow pool. On one
@@ -84,16 +84,16 @@ fn main() -> tantivy::Result<()> {
 limbs and branches that arch over the pool"#
 ));
 index_writer.add_document(doc!(
 title => "Frankenstein",
 body => r#"You will rejoice to hear that no disaster has accompanied the commencement of an
 enterprise which you have regarded with such evil forebodings. I arrived here
 yesterday, and my first task is to assure my dear sister of my welfare and
 increasing confidence in the success of my undertaking."#
 ));
 index_writer.commit()?;
+index.load_searchers()?;

-let reader = index.reader()?;
-let searcher = reader.searcher();
+let searcher = index.searcher();

 // The query parser can interpret human queries.
 // Here, if the user does not specify which
@@ -104,9 +104,11 @@ fn main() -> tantivy::Result<()> {
 // here we want to get a hit on the 'ken' in Frankenstein
 let query = query_parser.parse_query("ken")?;

-let top_docs = searcher.search(&query, &TopDocs::with_limit(10))?;
+let mut top_collector = TopCollector::with_limit(10);
+searcher.search(&*query, &mut top_collector)?;

-for (_, doc_address) in top_docs {
+let doc_addresses = top_collector.docs();
+for doc_address in doc_addresses {
 let retrieved_doc = searcher.doc(doc_address)?;
 println!("{}", schema.to_json(&retrieved_doc));
 }
@@ -10,20 +10,16 @@
 // Importing tantivy...
 #[macro_use]
 extern crate tantivy;
-use tantivy::collector::TopDocs;
+use tantivy::collector::TopCollector;
 use tantivy::query::TermQuery;
 use tantivy::schema::*;
 use tantivy::Index;
-use tantivy::IndexReader;

 // A simple helper function to fetch a single document
 // given its id from our index.
 // It will be helpful to check our work.
-fn extract_doc_given_isbn(
-reader: &IndexReader,
-isbn_term: &Term,
-) -> tantivy::Result<Option<Document>> {
-let searcher = reader.searcher();
+fn extract_doc_given_isbn(index: &Index, isbn_term: &Term) -> tantivy::Result<Option<Document>> {
+let searcher = index.searcher();

 // This is the simplest query you can think of.
 // It matches all of the documents containing a specific term.
@@ -31,9 +27,10 @@ fn extract_doc_given_isbn(
 // The second argument is here to tell we don't care about decoding positions,
 // or term frequencies.
 let term_query = TermQuery::new(isbn_term.clone(), IndexRecordOption::Basic);
-let top_docs = searcher.search(&term_query, &TopDocs::with_limit(1))?;
+let mut top_collector = TopCollector::with_limit(1);
+searcher.search(&term_query, &mut top_collector)?;

-if let Some((_score, doc_address)) = top_docs.first() {
+if let Some(doc_address) = top_collector.docs().first() {
 let doc = searcher.doc(*doc_address)?;
 Ok(Some(doc))
 } else {
@@ -47,7 +44,7 @@ fn main() -> tantivy::Result<()> {
 //
 // Check out the *basic_search* example if this makes
 // small sense to you.
-let mut schema_builder = Schema::builder();
+let mut schema_builder = SchemaBuilder::default();

 // Tantivy does not really have a notion of primary id.
 // This may change in the future.
@@ -89,12 +86,12 @@ fn main() -> tantivy::Result<()> {
 isbn => "978-9176370711",
 ));
 index_writer.commit()?;
-let reader = index.reader()?;
+index.load_searchers()?;

 let frankenstein_isbn = Term::from_field_text(isbn, "978-9176370711");

 // Oops our frankenstein doc seems mispelled
-let frankenstein_doc_misspelled = extract_doc_given_isbn(&reader, &frankenstein_isbn)?.unwrap();
+let frankenstein_doc_misspelled = extract_doc_given_isbn(&index, &frankenstein_isbn)?.unwrap();
 assert_eq!(
 schema.to_json(&frankenstein_doc_misspelled),
 r#"{"isbn":["978-9176370711"],"title":["Frankentein"]}"#,
@@ -133,10 +130,10 @@ fn main() -> tantivy::Result<()> {
 // Everything happened as if the document was updated.
 index_writer.commit()?;
 // We reload our searcher to make our change available to clients.
-reader.reload()?;
+index.load_searchers()?;

 // No more typo!
-let frankenstein_new_doc = extract_doc_given_isbn(&reader, &frankenstein_isbn)?.unwrap();
+let frankenstein_new_doc = extract_doc_given_isbn(&index, &frankenstein_isbn)?.unwrap();
 assert_eq!(
 schema.to_json(&frankenstein_new_doc),
 r#"{"isbn":["978-9176370711"],"title":["Frankenstein"]}"#,
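The hunk above switches the deleting/updating example between `reader.reload()` and `index.load_searchers()`. The sketch below is a hedged, self-contained illustration of the update-as-delete-plus-add flow using the reader API from the `-` side; the schema, the reused ISBN value, and the `delete_term` call are illustrative assumptions rather than lines taken from this hunk:

```rust
#[macro_use]
extern crate tantivy;

use tantivy::schema::{Schema, Term, STORED, STRING, TEXT};
use tantivy::Index;

fn main() -> tantivy::Result<()> {
    // Illustrative schema: an indexed, stored id plus a title (assumption).
    let mut schema_builder = Schema::builder();
    let isbn = schema_builder.add_text_field("isbn", STRING | STORED);
    let title = schema_builder.add_text_field("title", TEXT | STORED);
    let index = Index::create_in_ram(schema_builder.build());

    let mut index_writer = index.writer(50_000_000)?;
    index_writer.add_document(doc!(isbn => "978-9176370711", title => "Frankentein"));
    index_writer.commit()?;

    let reader = index.reader()?;

    // "Update" = delete by term, then re-add, then commit (assumed helper calls).
    let isbn_term = Term::from_field_text(isbn, "978-9176370711");
    index_writer.delete_term(isbn_term);
    index_writer.add_document(doc!(isbn => "978-9176370711", title => "Frankenstein"));
    index_writer.commit()?;

    // Make the change visible to new searchers (the `-` side of the hunk).
    reader.reload()?;
    let _searcher = reader.searcher();
    Ok(())
}
```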
@@ -25,7 +25,7 @@ fn main() -> tantivy::Result<()> {
 // Let's create a temporary directory for the
 // sake of this example
 let index_path = TempDir::new("tantivy_facet_example_dir")?;
-let mut schema_builder = Schema::builder();
+let mut schema_builder = SchemaBuilder::default();

 schema_builder.add_text_field("name", TEXT | STORED);

@@ -55,17 +55,18 @@ fn main() -> tantivy::Result<()> {

 index_writer.commit()?;

-let reader = index.reader()?;
+index.load_searchers()?;

-let searcher = reader.searcher();
+let searcher = index.searcher();

 let mut facet_collector = FacetCollector::for_field(tags);
 facet_collector.add_facet("/pools");

-let facet_counts = searcher.search(&AllQuery, &facet_collector).unwrap();
+searcher.search(&AllQuery, &mut facet_collector).unwrap();

+let counts = facet_collector.harvest();
 // This lists all of the facet counts
-let facets: Vec<(&Facet, u64)> = facet_counts.get("/pools").collect();
+let facets: Vec<(&Facet, u64)> = counts.get("/pools").collect();
 assert_eq!(
 facets,
 vec![
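The facet hunk above changes `search` from returning the facet counts directly (the `-` side) to a mutate-then-`harvest` flow (the `+` side). A hedged sketch of the `-` side follows; the facet field declaration (`add_facet_field`) and the sample facet value are assumptions added for illustration and are not taken from this diff:

```rust
#[macro_use]
extern crate tantivy;

use tantivy::collector::FacetCollector;
use tantivy::query::AllQuery;
use tantivy::schema::{Facet, Schema, TEXT};
use tantivy::Index;

fn main() -> tantivy::Result<()> {
    let mut schema_builder = Schema::builder();
    let name = schema_builder.add_text_field("name", TEXT);
    let tags = schema_builder.add_facet_field("tags"); // assumed field setup
    let index = Index::create_in_ram(schema_builder.build());

    let mut index_writer = index.writer(50_000_000)?;
    index_writer.add_document(doc!(
        name => "a public pool",
        tags => Facet::from("/pools/north") // assumed sample value
    ));
    index_writer.commit()?;

    let reader = index.reader()?;
    let searcher = reader.searcher();

    let mut facet_collector = FacetCollector::for_field(tags);
    facet_collector.add_facet("/pools");

    // `-` side: the facet counts are returned by the search call itself.
    let facet_counts = searcher.search(&AllQuery, &facet_collector)?;
    let facets: Vec<(&Facet, u64)> = facet_counts.get("/pools").collect();
    assert_eq!(facets.len(), 1);

    // `+` side, for comparison:
    //     searcher.search(&AllQuery, &mut facet_collector).unwrap();
    //     let counts = facet_collector.harvest();
    Ok(())
}
```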
@@ -1,43 +0,0 @@
-// # Searching a range on an indexed int field.
-//
-// Below is an example of creating an indexed integer field in your schema
-// You can use RangeQuery to get a Count of all occurrences in a given range.
-
-#[macro_use]
-extern crate tantivy;
-use tantivy::collector::Count;
-use tantivy::query::RangeQuery;
-use tantivy::schema::{Schema, INDEXED};
-use tantivy::Index;
-use tantivy::Result;
-
-fn run() -> Result<()> {
-// For the sake of simplicity, this schema will only have 1 field
-let mut schema_builder = Schema::builder();
-
-// `INDEXED` is a short-hand to indicate that our field should be "searchable".
-let year_field = schema_builder.add_u64_field("year", INDEXED);
-let schema = schema_builder.build();
-let index = Index::create_in_ram(schema);
-let reader = index.reader()?;
-{
-let mut index_writer = index.writer_with_num_threads(1, 6_000_000)?;
-for year in 1950u64..2019u64 {
-index_writer.add_document(doc!(year_field => year));
-}
-index_writer.commit()?;
-// The index will be a range of years
-}
-reader.reload()?;
-let searcher = reader.searcher();
-// The end is excluded i.e. here we are searching up to 1969
-let docs_in_the_sixties = RangeQuery::new_u64(year_field, 1960..1970);
-// Uses a Count collector to sum the total number of docs in the range
-let num_60s_books = searcher.search(&docs_in_the_sixties, &Count)?;
-assert_eq!(num_60s_books, 10);
-Ok(())
-}
-
-fn main() {
-run().unwrap()
-}
@@ -18,7 +18,7 @@ use tantivy::{DocId, DocSet, Postings};
 fn main() -> tantivy::Result<()> {
 // We first create a schema for the sake of the
 // example. Check the `basic_search` example for more information.
-let mut schema_builder = Schema::builder();
+let mut schema_builder = SchemaBuilder::default();

 // For this example, we need to make sure to index positions for our title
 // field. `TEXT` precisely does this.
@@ -33,9 +33,9 @@ fn main() -> tantivy::Result<()> {
 index_writer.add_document(doc!(title => "The modern Promotheus"));
 index_writer.commit()?;

-let reader = index.reader()?;
+index.load_searchers()?;

-let searcher = reader.searcher();
+let searcher = index.searcher();

 // A tantivy index is actually a collection of segments.
 // Similarly, a searcher just wraps a list `segment_reader`.
@@ -10,11 +10,11 @@ extern crate tempdir;
 // Importing tantivy...
 #[macro_use]
 extern crate tantivy;
-use tantivy::collector::TopDocs;
+use tantivy::collector::TopCollector;
 use tantivy::query::QueryParser;
 use tantivy::schema::*;
 use tantivy::Index;
-use tantivy::{Snippet, SnippetGenerator};
+use tantivy::SnippetGenerator;
 use tempdir::TempDir;

 fn main() -> tantivy::Result<()> {
@@ -23,7 +23,7 @@ fn main() -> tantivy::Result<()> {
 let index_path = TempDir::new("tantivy_example_dir")?;

 // # Defining the schema
-let mut schema_builder = Schema::builder();
+let mut schema_builder = SchemaBuilder::default();
 let title = schema_builder.add_text_field("title", TEXT | STORED);
 let body = schema_builder.add_text_field("body", TEXT | STORED);
 let schema = schema_builder.build();
@@ -35,52 +35,37 @@ fn main() -> tantivy::Result<()> {

 // we'll only need one doc for this example.
 index_writer.add_document(doc!(
 title => "Of Mice and Men",
 body => "A few miles south of Soledad, the Salinas River drops in close to the hillside \
 bank and runs deep and green. The water is warm too, for it has slipped twinkling \
 over the yellow sands in the sunlight before reaching the narrow pool. On one \
 side of the river the golden foothill slopes curve up to the strong and rocky \
 Gabilan Mountains, but on the valley side the water is lined with trees—willows \
 fresh and green with every spring, carrying in their lower leaf junctures the \
 debris of the winter’s flooding; and sycamores with mottled, white, recumbent \
 limbs and branches that arch over the pool"
 ));
 // ...
 index_writer.commit()?;

-let reader = index.reader()?;
-let searcher = reader.searcher();
+index.load_searchers()?;
+let searcher = index.searcher();
 let query_parser = QueryParser::for_index(&index, vec![title, body]);
 let query = query_parser.parse_query("sycamore spring")?;

-let top_docs = searcher.search(&query, &TopDocs::with_limit(10))?;
+let mut top_collector = TopCollector::with_limit(10);
+searcher.search(&*query, &mut top_collector)?;

-let snippet_generator = SnippetGenerator::create(&searcher, &*query, body)?;
+let snippet_generator = SnippetGenerator::new(&searcher, &*query, body)?;

-for (score, doc_address) in top_docs {
+let doc_addresses = top_collector.docs();
+for doc_address in doc_addresses {
 let doc = searcher.doc(doc_address)?;
 let snippet = snippet_generator.snippet_from_doc(&doc);
-println!("Document score {}:", score);
 println!("title: {}", doc.get_first(title).unwrap().text().unwrap());
 println!("snippet: {}", snippet.to_html());
-println!("custom highlighting: {}", highlight(snippet));
 }

 Ok(())
 }

-fn highlight(snippet: Snippet) -> String {
-let mut result = String::new();
-let mut start_from = 0;
-
-for (start, end) in snippet.highlighted().iter().map(|h| h.bounds()) {
-result.push_str(&snippet.fragments()[start_from..start]);
-result.push_str(" --> ");
-result.push_str(&snippet.fragments()[start..end]);
-result.push_str(" <-- ");
-start_from = end;
-}
-
-result.push_str(&snippet.fragments()[start_from..]);
-result
-}
@@ -15,7 +15,7 @@ extern crate tempdir;
 // Importing tantivy...
 #[macro_use]
 extern crate tantivy;
-use tantivy::collector::TopDocs;
+use tantivy::collector::TopCollector;
 use tantivy::query::QueryParser;
 use tantivy::schema::*;
 use tantivy::tokenizer::*;
@@ -23,7 +23,7 @@ use tantivy::Index;

 fn main() -> tantivy::Result<()> {
 // this example assumes you understand the content in `basic_search`
-let mut schema_builder = Schema::builder();
+let mut schema_builder = SchemaBuilder::default();

 // This configures your custom options for how tantivy will
 // store and process your content in the index; The key
@@ -72,44 +72,48 @@ fn main() -> tantivy::Result<()> {
 title => "The Old Man and the Sea",
 body => "He was an old man who fished alone in a skiff in the Gulf Stream and \
 he had gone eighty-four days now without taking a fish."
 ));

 index_writer.add_document(doc!(
 title => "Of Mice and Men",
 body => "A few miles south of Soledad, the Salinas River drops in close to the hillside \
 bank and runs deep and green. The water is warm too, for it has slipped twinkling \
 over the yellow sands in the sunlight before reaching the narrow pool. On one \
 side of the river the golden foothill slopes curve up to the strong and rocky \
 Gabilan Mountains, but on the valley side the water is lined with trees—willows \
 fresh and green with every spring, carrying in their lower leaf junctures the \
 debris of the winter’s flooding; and sycamores with mottled, white, recumbent \
 limbs and branches that arch over the pool"
 ));

 index_writer.add_document(doc!(
 title => "Frankenstein",
 body => "You will rejoice to hear that no disaster has accompanied the commencement of an \
 enterprise which you have regarded with such evil forebodings. I arrived here \
 yesterday, and my first task is to assure my dear sister of my welfare and \
 increasing confidence in the success of my undertaking."
 ));

 index_writer.commit()?;

-let reader = index.reader()?;
+index.load_searchers()?;

-let searcher = reader.searcher();
+let searcher = index.searcher();

 let query_parser = QueryParser::for_index(&index, vec![title, body]);

 // stop words are applied on the query as well.
 // The following will be equivalent to `title:frankenstein`
 let query = query_parser.parse_query("title:\"the Frankenstein\"")?;
-let top_docs = searcher.search(&query, &TopDocs::with_limit(10))?;

-for (score, doc_address) in top_docs {
+let mut top_collector = TopCollector::with_limit(10);
+searcher.search(&*query, &mut top_collector)?;

+let doc_addresses = top_collector.docs();
+for doc_address in doc_addresses {
 let retrieved_doc = searcher.doc(doc_address)?;
-println!("\n==\nDocument score {}:", score);
 println!("{}", schema.to_json(&retrieved_doc));
 }

@@ -9,10 +9,10 @@ fn main() -> tantivy::Result<()> {
 // Check out the basic example if this is confusing to you.
 //
 // first we need to define a schema ...
-let mut schema_builder = Schema::builder();
+let mut schema_builder = SchemaBuilder::default();
 schema_builder.add_text_field("title", TEXT | STORED);
 schema_builder.add_text_field("body", TEXT);
-schema_builder.add_u64_field("year", INDEXED);
+schema_builder.add_u64_field("year", INT_INDEXED);
 let schema = schema_builder.build();

 // Let's assume we have a json-serialized document.
src/collector/chained_collector.rs (new file, 142 lines added)
@@ -0,0 +1,142 @@
+use collector::Collector;
+use DocId;
+use Result;
+use Score;
+use SegmentLocalId;
+use SegmentReader;
+
+/// Collector that does nothing.
+/// This is used in the chain Collector and will hopefully
+/// be optimized away by the compiler.
+pub struct DoNothingCollector;
+impl Collector for DoNothingCollector {
+    #[inline]
+    fn set_segment(&mut self, _: SegmentLocalId, _: &SegmentReader) -> Result<()> {
+        Ok(())
+    }
+    #[inline]
+    fn collect(&mut self, _doc: DocId, _score: Score) {}
+    #[inline]
+    fn requires_scoring(&self) -> bool {
+        false
+    }
+}
+
+/// Zero-cost abstraction used to collect on multiple collectors.
+/// This contraption is only usable if the type of your collectors
+/// are known at compile time.
+///
+/// ```rust
+/// #[macro_use]
+/// extern crate tantivy;
+/// use tantivy::schema::{SchemaBuilder, TEXT};
+/// use tantivy::{Index, Result};
+/// use tantivy::collector::{CountCollector, TopCollector, chain};
+/// use tantivy::query::QueryParser;
+///
+/// # fn main() { example().unwrap(); }
+/// fn example() -> Result<()> {
+///     let mut schema_builder = SchemaBuilder::new();
+///     let title = schema_builder.add_text_field("title", TEXT);
+///     let schema = schema_builder.build();
+///     let index = Index::create_in_ram(schema);
+///     {
+///         let mut index_writer = index.writer(3_000_000)?;
+///         index_writer.add_document(doc!(
+///             title => "The Name of the Wind",
+///         ));
+///         index_writer.add_document(doc!(
+///             title => "The Diary of Muadib",
+///         ));
+///         index_writer.add_document(doc!(
+///             title => "A Dairy Cow",
+///         ));
+///         index_writer.add_document(doc!(
+///             title => "The Diary of a Young Girl",
+///         ));
+///         index_writer.commit().unwrap();
+///     }
+///
+///     index.load_searchers()?;
+///     let searcher = index.searcher();
+///
+///     {
+///         let mut top_collector = TopCollector::with_limit(2);
+///         let mut count_collector = CountCollector::default();
+///         {
+///             let mut collectors = chain().push(&mut top_collector).push(&mut count_collector);
+///             let query_parser = QueryParser::for_index(&index, vec![title]);
+///             let query = query_parser.parse_query("diary")?;
+///             searcher.search(&*query, &mut collectors).unwrap();
+///         }
+///         assert_eq!(count_collector.count(), 2);
+///         assert!(top_collector.at_capacity());
+///     }
+///
+///     Ok(())
+/// }
+/// ```
+pub struct ChainedCollector<Left: Collector, Right: Collector> {
+    left: Left,
+    right: Right,
+}
+
+impl<Left: Collector, Right: Collector> ChainedCollector<Left, Right> {
+    /// Adds a collector
+    pub fn push<C: Collector>(self, new_collector: &mut C) -> ChainedCollector<Self, &mut C> {
+        ChainedCollector {
+            left: self,
+            right: new_collector,
+        }
+    }
+}
+
+impl<Left: Collector, Right: Collector> Collector for ChainedCollector<Left, Right> {
+    fn set_segment(
+        &mut self,
+        segment_local_id: SegmentLocalId,
+        segment: &SegmentReader,
+    ) -> Result<()> {
+        self.left.set_segment(segment_local_id, segment)?;
+        self.right.set_segment(segment_local_id, segment)?;
+        Ok(())
+    }
+
+    fn collect(&mut self, doc: DocId, score: Score) {
+        self.left.collect(doc, score);
+        self.right.collect(doc, score);
+    }
+
+    fn requires_scoring(&self) -> bool {
+        self.left.requires_scoring() || self.right.requires_scoring()
+    }
+}
+
+/// Creates a `ChainedCollector`
+pub fn chain() -> ChainedCollector<DoNothingCollector, DoNothingCollector> {
+    ChainedCollector {
+        left: DoNothingCollector,
+        right: DoNothingCollector,
+    }
+}
+
+#[cfg(test)]
+mod tests {
+
+    use super::*;
+    use collector::{Collector, CountCollector, TopCollector};
+
+    #[test]
+    fn test_chained_collector() {
+        let mut top_collector = TopCollector::with_limit(2);
+        let mut count_collector = CountCollector::default();
+        {
+            let mut collectors = chain().push(&mut top_collector).push(&mut count_collector);
+            collectors.collect(1, 0.2);
+            collectors.collect(2, 0.1);
+            collectors.collect(3, 0.5);
+        }
+        assert_eq!(count_collector.count(), 3);
+        assert!(top_collector.at_capacity());
+    }
+}
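The `chain()` API above builds the whole collector pipeline in the type system: every `push` wraps the previous chain in a new `ChainedCollector`, so the compiler sees one concrete type and can inline the fan-out of `collect` calls. A dependency-free sketch of that pattern, with a simplified trait and made-up names rather than the real tantivy types, including the `&mut C` blanket impl the crate relies on to let borrowed collectors be pushed:

```rust
// Sketch of the chaining pattern used by ChainedCollector (not tantivy API).
trait Collect {
    fn collect(&mut self, doc: u32);
}

struct DoNothing;
impl Collect for DoNothing {
    fn collect(&mut self, _doc: u32) {}
}

// A mutable reference to a collector is itself a collector: it delegates.
impl<'a, C: Collect> Collect for &'a mut C {
    fn collect(&mut self, doc: u32) {
        (**self).collect(doc)
    }
}

struct Chained<L, R> {
    left: L,
    right: R,
}

impl<L: Collect, R: Collect> Chained<L, R> {
    fn push<C: Collect>(self, next: &mut C) -> Chained<Self, &mut C> {
        Chained { left: self, right: next }
    }
}

impl<L: Collect, R: Collect> Collect for Chained<L, R> {
    fn collect(&mut self, doc: u32) {
        self.left.collect(doc);
        self.right.collect(doc);
    }
}

fn chain() -> Chained<DoNothing, DoNothing> {
    Chained { left: DoNothing, right: DoNothing }
}

struct Count(usize);
impl Collect for Count {
    fn collect(&mut self, _doc: u32) {
        self.0 += 1;
    }
}

fn main() {
    let mut a = Count(0);
    let mut b = Count(0);
    // The full type is Chained<Chained<Chained<DoNothing, DoNothing>, &mut Count>, &mut Count>,
    // known at compile time, which is what makes the dispatch "zero-cost".
    let mut chained = chain().push(&mut a).push(&mut b);
    for doc in 0..3 {
        chained.collect(doc);
    }
    drop(chained);
    assert_eq!(a.0, 3);
    assert_eq!(b.0, 3);
}
```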
@@ -1,5 +1,4 @@
 use super::Collector;
-use collector::SegmentCollector;
 use DocId;
 use Result;
 use Score;
@@ -12,14 +11,14 @@ use SegmentReader;
 /// ```rust
 /// #[macro_use]
 /// extern crate tantivy;
-/// use tantivy::schema::{Schema, TEXT};
+/// use tantivy::schema::{SchemaBuilder, TEXT};
 /// use tantivy::{Index, Result};
-/// use tantivy::collector::Count;
+/// use tantivy::collector::CountCollector;
 /// use tantivy::query::QueryParser;
 ///
 /// # fn main() { example().unwrap(); }
 /// fn example() -> Result<()> {
-///     let mut schema_builder = Schema::builder();
+///     let mut schema_builder = SchemaBuilder::new();
 ///     let title = schema_builder.add_text_field("title", TEXT);
 ///     let schema = schema_builder.build();
 ///     let index = Index::create_in_ram(schema);
@@ -40,90 +39,63 @@ use SegmentReader;
 ///     index_writer.commit().unwrap();
 /// }
 ///
-/// let reader = index.reader()?;
-/// let searcher = reader.searcher();
+/// index.load_searchers()?;
+/// let searcher = index.searcher();
 ///
 /// {
+///     let mut count_collector = CountCollector::default();
 ///     let query_parser = QueryParser::for_index(&index, vec![title]);
 ///     let query = query_parser.parse_query("diary")?;
-///     let count = searcher.search(&query, &Count).unwrap();
+///     searcher.search(&*query, &mut count_collector).unwrap();
 ///
-///     assert_eq!(count, 2);
+///     assert_eq!(count_collector.count(), 2);
 /// }
 ///
 /// Ok(())
 /// }
 /// ```
-pub struct Count;
-
-impl Collector for Count {
-    type Fruit = usize;
-
-    type Child = SegmentCountCollector;
-
-    fn for_segment(&self, _: SegmentLocalId, _: &SegmentReader) -> Result<SegmentCountCollector> {
-        Ok(SegmentCountCollector::default())
-    }
-
-    fn requires_scoring(&self) -> bool {
-        false
-    }
-
-    fn merge_fruits(&self, segment_counts: Vec<usize>) -> Result<usize> {
-        Ok(segment_counts.into_iter().sum())
-    }
-}
-
 #[derive(Default)]
-pub struct SegmentCountCollector {
+pub struct CountCollector {
     count: usize,
 }

-impl SegmentCollector for SegmentCountCollector {
-    type Fruit = usize;
+impl CountCollector {
+    /// Returns the count of documents that were
+    /// collected.
+    pub fn count(&self) -> usize {
+        self.count
+    }
+}
+
+impl Collector for CountCollector {
+    fn set_segment(&mut self, _: SegmentLocalId, _: &SegmentReader) -> Result<()> {
+        Ok(())
+    }

     fn collect(&mut self, _: DocId, _: Score) {
         self.count += 1;
     }

-    fn harvest(self) -> usize {
-        self.count
+    fn requires_scoring(&self) -> bool {
+        false
     }
 }

 #[cfg(test)]
 mod tests {
-    use super::{Count, SegmentCountCollector};
-    use collector::Collector;
-    use collector::SegmentCollector;
+    use collector::{Collector, CountCollector};

     #[test]
-    fn test_count_collect_does_not_requires_scoring() {
-        assert!(!Count.requires_scoring());
-    }
-
-    #[test]
-    fn test_segment_count_collector() {
-        {
-            let count_collector = SegmentCountCollector::default();
-            assert_eq!(count_collector.harvest(), 0);
-        }
-        {
-            let mut count_collector = SegmentCountCollector::default();
-            count_collector.collect(0u32, 1f32);
-            assert_eq!(count_collector.harvest(), 1);
-        }
-        {
-            let mut count_collector = SegmentCountCollector::default();
-            count_collector.collect(0u32, 1f32);
-            assert_eq!(count_collector.harvest(), 1);
-        }
-        {
-            let mut count_collector = SegmentCountCollector::default();
-            count_collector.collect(0u32, 1f32);
-            count_collector.collect(1u32, 1f32);
-            assert_eq!(count_collector.harvest(), 2);
-        }
+    fn test_count_collector() {
+        let mut count_collector = CountCollector::default();
+        assert_eq!(count_collector.count(), 0);
+        count_collector.collect(0u32, 1f32);
+        assert_eq!(count_collector.count(), 1);
+        assert_eq!(count_collector.count(), 1);
+        count_collector.collect(1u32, 1f32);
+        assert_eq!(count_collector.count(), 2);
+        assert!(!count_collector.requires_scoring());
     }

 }
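The removed side of this hunk is the 0.9-style split between a per-segment collector (`SegmentCountCollector`, whose `harvest` yields the segment's count) and a top-level `Count` whose `merge_fruits` just sums those per-segment results. A tiny dependency-free sketch of that harvest-then-merge shape, under the assumption that each segment is counted independently:

```rust
// Sketch of the harvest/merge pattern: count each segment separately, then sum.
#[derive(Default)]
struct SegmentCount {
    count: usize,
}

impl SegmentCount {
    fn collect(&mut self) {
        self.count += 1;
    }
    fn harvest(self) -> usize {
        self.count
    }
}

fn merge_fruits(segment_counts: Vec<usize>) -> usize {
    segment_counts.into_iter().sum()
}

fn main() {
    // Two "segments" with 3 and 2 matching documents.
    let mut seg0 = SegmentCount::default();
    let mut seg1 = SegmentCount::default();
    (0..3).for_each(|_| seg0.collect());
    (0..2).for_each(|_| seg1.collect());
    assert_eq!(merge_fruits(vec![seg0.harvest(), seg1.harvest()]), 5);
}
```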
@@ -1,17 +1,20 @@
 use collector::Collector;
-use collector::SegmentCollector;
 use docset::SkipResult;
 use fastfield::FacetReader;
 use schema::Facet;
 use schema::Field;
-use std::cmp::Ordering;
+use std::cell::UnsafeCell;
 use std::collections::btree_map;
 use std::collections::BTreeMap;
 use std::collections::BTreeSet;
 use std::collections::BinaryHeap;
 use std::collections::Bound;
 use std::iter::Peekable;
+use std::mem;
 use std::{u64, usize};
+use termdict::TermMerger;
+
+use std::cmp::Ordering;
 use DocId;
 use Result;
 use Score;
@@ -43,6 +46,12 @@ impl<'a> Ord for Hit<'a> {
     }
 }

+struct SegmentFacetCounter {
+    pub facet_reader: FacetReader,
+    pub facet_ords: Vec<u64>,
+    pub facet_counts: Vec<u64>,
+}
+
 fn facet_depth(facet_bytes: &[u8]) -> usize {
     if facet_bytes.is_empty() {
         0
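The `SegmentFacetCounter` added here carries, per segment, the facet dictionary reader plus two parallel vectors: the collapsed facet ordinals and their counts. Further down in the same file, `collect` maps every facet ordinal of a document through a collapse mapping and bumps the matching counter, skipping consecutive duplicates so that a document carrying two facets under the same collapsed prefix is counted once. A small dependency-free sketch of that counting step (the mapping values here are made up for illustration):

```rust
// Sketch of per-document facet counting with a collapse mapping.
// `collapse_mapping[facet_ord]` gives the collapsed bucket; bucket 0 means "not tracked".
fn count_doc_facets(doc_facet_ords: &[u64], collapse_mapping: &[usize], counts: &mut [u64]) {
    let mut previous = usize::MAX;
    for &ord in doc_facet_ords {
        let collapsed = collapse_mapping[ord as usize];
        // A document contributes at most 1 to a given collapsed bucket,
        // assuming its facet ordinals arrive sorted (duplicates are adjacent).
        if collapsed != previous {
            counts[collapsed] += 1;
        }
        previous = collapsed;
    }
}

fn main() {
    // 5 facet ordinals collapsing into 3 buckets (bucket 0 = untracked).
    let collapse_mapping = vec![0, 1, 1, 2, 0];
    let mut counts = vec![0u64; 3];
    // One document tagged with facets 1 and 2: both collapse to bucket 1 -> counted once.
    count_doc_facets(&[1, 2], &collapse_mapping, &mut counts);
    // Another document tagged with facet 3 -> bucket 2.
    count_doc_facets(&[3], &collapse_mapping, &mut counts);
    assert_eq!(counts, vec![0, 1, 1]);
}
```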
@@ -82,14 +91,14 @@ fn facet_depth(facet_bytes: &[u8]) -> usize {
|
|||||||
/// ```rust
|
/// ```rust
|
||||||
/// #[macro_use]
|
/// #[macro_use]
|
||||||
/// extern crate tantivy;
|
/// extern crate tantivy;
|
||||||
/// use tantivy::schema::{Facet, Schema, TEXT};
|
/// use tantivy::schema::{Facet, SchemaBuilder, TEXT};
|
||||||
/// use tantivy::{Index, Result};
|
/// use tantivy::{Index, Result};
|
||||||
/// use tantivy::collector::FacetCollector;
|
/// use tantivy::collector::FacetCollector;
|
||||||
/// use tantivy::query::AllQuery;
|
/// use tantivy::query::AllQuery;
|
||||||
///
|
///
|
||||||
/// # fn main() { example().unwrap(); }
|
/// # fn main() { example().unwrap(); }
|
||||||
/// fn example() -> Result<()> {
|
/// fn example() -> Result<()> {
|
||||||
/// let mut schema_builder = Schema::builder();
|
/// let mut schema_builder = SchemaBuilder::new();
|
||||||
///
|
///
|
||||||
/// // Facet have their own specific type.
|
/// // Facet have their own specific type.
|
||||||
/// // It is not a bad practise to put all of your
|
/// // It is not a bad practise to put all of your
|
||||||
@@ -122,19 +131,23 @@ fn facet_depth(facet_bytes: &[u8]) -> usize {
|
|||||||
/// facet => Facet::from("/lang/en"),
|
/// facet => Facet::from("/lang/en"),
|
||||||
/// facet => Facet::from("/category/biography")
|
/// facet => Facet::from("/category/biography")
|
||||||
/// ));
|
/// ));
|
||||||
/// index_writer.commit()?;
|
/// index_writer.commit().unwrap();
|
||||||
/// }
|
/// }
|
||||||
/// let reader = index.reader()?;
|
///
|
||||||
/// let searcher = reader.searcher();
|
/// index.load_searchers()?;
|
||||||
|
/// let searcher = index.searcher();
|
||||||
///
|
///
|
||||||
/// {
|
/// {
|
||||||
/// let mut facet_collector = FacetCollector::for_field(facet);
|
/// let mut facet_collector = FacetCollector::for_field(facet);
|
||||||
/// facet_collector.add_facet("/lang");
|
/// facet_collector.add_facet("/lang");
|
||||||
/// facet_collector.add_facet("/category");
|
/// facet_collector.add_facet("/category");
|
||||||
/// let facet_counts = searcher.search(&AllQuery, &facet_collector)?;
|
/// searcher.search(&AllQuery, &mut facet_collector).unwrap();
|
||||||
|
///
|
||||||
|
/// // this object contains count aggregate for all of the facets.
|
||||||
|
/// let counts = facet_collector.harvest();
|
||||||
///
|
///
|
||||||
/// // This lists all of the facet counts
|
/// // This lists all of the facet counts
|
||||||
/// let facets: Vec<(&Facet, u64)> = facet_counts
|
/// let facets: Vec<(&Facet, u64)> = counts
|
||||||
/// .get("/category")
|
/// .get("/category")
|
||||||
/// .collect();
|
/// .collect();
|
||||||
/// assert_eq!(facets, vec![
|
/// assert_eq!(facets, vec![
|
||||||
@@ -146,10 +159,13 @@ fn facet_depth(facet_bytes: &[u8]) -> usize {
|
|||||||
/// {
|
/// {
|
||||||
/// let mut facet_collector = FacetCollector::for_field(facet);
|
/// let mut facet_collector = FacetCollector::for_field(facet);
|
||||||
/// facet_collector.add_facet("/category/fiction");
|
/// facet_collector.add_facet("/category/fiction");
|
||||||
/// let facet_counts = searcher.search(&AllQuery, &facet_collector)?;
|
/// searcher.search(&AllQuery, &mut facet_collector).unwrap();
|
||||||
|
///
|
||||||
|
/// // this object contains count aggregate for all of the facets.
|
||||||
|
/// let counts = facet_collector.harvest();
|
||||||
///
|
///
|
||||||
/// // This lists all of the facet counts
|
/// // This lists all of the facet counts
|
||||||
/// let facets: Vec<(&Facet, u64)> = facet_counts
|
/// let facets: Vec<(&Facet, u64)> = counts
|
||||||
/// .get("/category/fiction")
|
/// .get("/category/fiction")
|
||||||
/// .collect();
|
/// .collect();
|
||||||
/// assert_eq!(facets, vec![
|
/// assert_eq!(facets, vec![
|
||||||
@@ -162,10 +178,13 @@ fn facet_depth(facet_bytes: &[u8]) -> usize {
|
|||||||
/// {
|
/// {
|
||||||
/// let mut facet_collector = FacetCollector::for_field(facet);
|
/// let mut facet_collector = FacetCollector::for_field(facet);
|
||||||
/// facet_collector.add_facet("/category/fiction");
|
/// facet_collector.add_facet("/category/fiction");
|
||||||
/// let facet_counts = searcher.search(&AllQuery, &facet_collector)?;
|
/// searcher.search(&AllQuery, &mut facet_collector).unwrap();
|
||||||
|
///
|
||||||
|
/// // this object contains count aggregate for all of the facets.
|
||||||
|
/// let counts = facet_collector.harvest();
|
||||||
///
|
///
|
||||||
/// // This lists all of the facet counts
|
/// // This lists all of the facet counts
|
||||||
/// let facets: Vec<(&Facet, u64)> = facet_counts.top_k("/category/fiction", 1);
|
/// let facets: Vec<(&Facet, u64)> = counts.top_k("/category/fiction", 1);
|
||||||
/// assert_eq!(facets, vec![
|
/// assert_eq!(facets, vec![
|
||||||
/// (&Facet::from("/category/fiction/fantasy"), 2)
|
/// (&Facet::from("/category/fiction/fantasy"), 2)
|
||||||
/// ]);
|
/// ]);
|
||||||
@@ -175,19 +194,19 @@ fn facet_depth(facet_bytes: &[u8]) -> usize {
|
|||||||
/// }
|
/// }
|
||||||
/// ```
|
/// ```
|
||||||
pub struct FacetCollector {
|
pub struct FacetCollector {
|
||||||
|
facet_ords: Vec<u64>,
|
||||||
field: Field,
|
field: Field,
|
||||||
facets: BTreeSet<Facet>,
|
ff_reader: Option<UnsafeCell<FacetReader>>,
|
||||||
}
|
segment_counters: Vec<SegmentFacetCounter>,
|
||||||
|
|
||||||
pub struct FacetSegmentCollector {
|
|
||||||
reader: FacetReader,
|
|
||||||
facet_ords_buf: Vec<u64>,
|
|
||||||
// facet_ord -> collapse facet_id
|
// facet_ord -> collapse facet_id
|
||||||
collapse_mapping: Vec<usize>,
|
current_segment_collapse_mapping: Vec<usize>,
|
||||||
// collapse facet_id -> count
|
// collapse facet_id -> count
|
||||||
counts: Vec<u64>,
|
current_segment_counts: Vec<u64>,
|
||||||
// collapse facet_id -> facet_ord
|
// collapse facet_id -> facet_ord
|
||||||
collapse_facet_ords: Vec<u64>,
|
current_collapse_facet_ords: Vec<u64>,
|
||||||
|
|
||||||
|
facets: BTreeSet<Facet>,
|
||||||
}
|
}
|
||||||
|
|
||||||
fn skip<'a, I: Iterator<Item = &'a Facet>>(
|
fn skip<'a, I: Iterator<Item = &'a Facet>>(
|
||||||
@@ -196,7 +215,7 @@ fn skip<'a, I: Iterator<Item = &'a Facet>>(
|
|||||||
) -> SkipResult {
|
) -> SkipResult {
|
||||||
loop {
|
loop {
|
||||||
match collapse_it.peek() {
|
match collapse_it.peek() {
|
||||||
Some(facet_bytes) => match facet_bytes.encoded_str().as_bytes().cmp(target) {
|
Some(facet_bytes) => match facet_bytes.encoded_bytes().cmp(target) {
|
||||||
Ordering::Less => {}
|
Ordering::Less => {}
|
||||||
Ordering::Greater => {
|
Ordering::Greater => {
|
||||||
return SkipResult::OverStep;
|
return SkipResult::OverStep;
|
||||||
@@ -221,8 +240,15 @@ impl FacetCollector {
|
|||||||
/// is of the proper type.
|
/// is of the proper type.
|
||||||
pub fn for_field(field: Field) -> FacetCollector {
|
pub fn for_field(field: Field) -> FacetCollector {
|
||||||
FacetCollector {
|
FacetCollector {
|
||||||
|
facet_ords: Vec::with_capacity(255),
|
||||||
|
segment_counters: Vec::new(),
|
||||||
field,
|
field,
|
||||||
facets: BTreeSet::default(),
|
ff_reader: None,
|
||||||
|
facets: BTreeSet::new(),
|
||||||
|
|
||||||
|
current_segment_collapse_mapping: Vec::new(),
|
||||||
|
current_collapse_facet_ords: Vec::new(),
|
||||||
|
current_segment_counts: Vec::new(),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -252,100 +278,143 @@ impl FacetCollector {
|
|||||||
}
|
}
|
||||||
self.facets.insert(facet);
|
self.facets.insert(facet);
|
||||||
}
|
}
|
||||||
}
|
|
||||||
|
|
||||||
impl Collector for FacetCollector {
|
|
||||||
type Fruit = FacetCounts;
|
|
||||||
|
|
||||||
type Child = FacetSegmentCollector;
|
|
||||||
|
|
||||||
fn for_segment(
|
|
||||||
&self,
|
|
||||||
_: SegmentLocalId,
|
|
||||||
reader: &SegmentReader,
|
|
||||||
) -> Result<FacetSegmentCollector> {
|
|
||||||
let facet_reader = reader.facet_reader(self.field)?;
|
|
||||||
|
|
||||||
let mut collapse_mapping = Vec::new();
|
|
||||||
let mut counts = Vec::new();
|
|
||||||
let mut collapse_facet_ords = Vec::new();
|
|
||||||
|
|
||||||
|
fn set_collapse_mapping(&mut self, facet_reader: &FacetReader) {
|
||||||
|
self.current_segment_collapse_mapping.clear();
|
||||||
|
self.current_collapse_facet_ords.clear();
|
||||||
|
self.current_segment_counts.clear();
|
||||||
let mut collapse_facet_it = self.facets.iter().peekable();
|
let mut collapse_facet_it = self.facets.iter().peekable();
|
||||||
collapse_facet_ords.push(0);
|
self.current_collapse_facet_ords.push(0);
|
||||||
{
|
let mut facet_streamer = facet_reader.facet_dict().range().into_stream();
|
||||||
let mut facet_streamer = facet_reader.facet_dict().range().into_stream();
|
if !facet_streamer.advance() {
|
||||||
if facet_streamer.advance() {
|
return;
|
||||||
'outer: loop {
|
}
|
||||||
// at the begining of this loop, facet_streamer
|
'outer: loop {
|
||||||
// is positionned on a term that has not been processed yet.
|
// at the begining of this loop, facet_streamer
|
||||||
let skip_result = skip(facet_streamer.key(), &mut collapse_facet_it);
|
// is positionned on a term that has not been processed yet.
|
||||||
match skip_result {
|
let skip_result = skip(facet_streamer.key(), &mut collapse_facet_it);
|
||||||
SkipResult::Reached => {
|
match skip_result {
|
||||||
// we reach a facet we decided to collapse.
|
SkipResult::Reached => {
|
||||||
let collapse_depth = facet_depth(facet_streamer.key());
|
// we reach a facet we decided to collapse.
|
||||||
let mut collapsed_id = 0;
|
let collapse_depth = facet_depth(facet_streamer.key());
|
||||||
collapse_mapping.push(0);
|
let mut collapsed_id = 0;
|
||||||
while facet_streamer.advance() {
|
self.current_segment_collapse_mapping.push(0);
|
||||||
let depth = facet_depth(facet_streamer.key());
|
while facet_streamer.advance() {
|
||||||
if depth <= collapse_depth {
|
let depth = facet_depth(facet_streamer.key());
|
||||||
continue 'outer;
|
if depth <= collapse_depth {
|
||||||
}
|
continue 'outer;
|
||||||
if depth == collapse_depth + 1 {
|
|
||||||
collapsed_id = collapse_facet_ords.len();
|
|
||||||
collapse_facet_ords.push(facet_streamer.term_ord());
|
|
||||||
collapse_mapping.push(collapsed_id);
|
|
||||||
} else {
|
|
||||||
collapse_mapping.push(collapsed_id);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
break;
|
|
||||||
}
|
}
|
||||||
SkipResult::End | SkipResult::OverStep => {
|
if depth == collapse_depth + 1 {
|
||||||
collapse_mapping.push(0);
|
collapsed_id = self.current_collapse_facet_ords.len();
|
||||||
if !facet_streamer.advance() {
|
self.current_collapse_facet_ords
|
||||||
break;
|
.push(facet_streamer.term_ord());
|
||||||
}
|
self.current_segment_collapse_mapping.push(collapsed_id);
|
||||||
|
} else {
|
||||||
|
self.current_segment_collapse_mapping.push(collapsed_id);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
SkipResult::End | SkipResult::OverStep => {
|
||||||
|
self.current_segment_collapse_mapping.push(0);
|
||||||
|
if !facet_streamer.advance() {
|
||||||
|
break;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
counts.resize(collapse_facet_ords.len(), 0);
|
|
||||||
|
|
||||||
Ok(FacetSegmentCollector {
|
|
||||||
reader: facet_reader,
|
|
||||||
facet_ords_buf: Vec::with_capacity(255),
|
|
||||||
collapse_mapping,
|
|
||||||
counts,
|
|
||||||
collapse_facet_ords,
|
|
||||||
})
|
|
||||||
}
|
}
|
||||||
|
|
||||||
fn requires_scoring(&self) -> bool {
|
fn finalize_segment(&mut self) {
|
||||||
false
|
if self.ff_reader.is_some() {
|
||||||
|
self.segment_counters.push(SegmentFacetCounter {
|
||||||
|
facet_reader: self.ff_reader.take().unwrap().into_inner(),
|
||||||
|
facet_ords: mem::replace(&mut self.current_collapse_facet_ords, Vec::new()),
|
||||||
|
facet_counts: mem::replace(&mut self.current_segment_counts, Vec::new()),
|
||||||
|
});
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
fn merge_fruits(&self, segments_facet_counts: Vec<FacetCounts>) -> Result<FacetCounts> {
|
/// Returns the results of the collection.
|
||||||
let mut facet_counts: BTreeMap<Facet, u64> = BTreeMap::new();
|
///
|
||||||
for segment_facet_counts in segments_facet_counts {
|
/// This method does not just return the counters,
|
||||||
for (facet, count) in segment_facet_counts.facet_counts {
|
/// it also translates the facet ordinals of the last segment.
|
||||||
*(facet_counts.entry(facet).or_insert(0)) += count;
|
pub fn harvest(mut self) -> FacetCounts {
|
||||||
|
self.finalize_segment();
|
||||||
|
|
||||||
|
let collapsed_facet_ords: Vec<&[u64]> = self
|
||||||
|
.segment_counters
|
||||||
|
.iter()
|
||||||
|
.map(|segment_counter| &segment_counter.facet_ords[..])
|
||||||
|
.collect();
|
||||||
|
let collapsed_facet_counts: Vec<&[u64]> = self
|
||||||
|
.segment_counters
|
||||||
|
.iter()
|
||||||
|
.map(|segment_counter| &segment_counter.facet_counts[..])
|
||||||
|
.collect();
|
||||||
|
|
||||||
|
let facet_streams = self
|
||||||
|
.segment_counters
|
||||||
|
.iter()
|
||||||
|
.map(|seg_counts| seg_counts.facet_reader.facet_dict().range().into_stream())
|
||||||
|
.collect::<Vec<_>>();
|
||||||
|
|
||||||
|
let mut facet_merger = TermMerger::new(facet_streams);
|
||||||
|
let mut facet_counts = BTreeMap::new();
|
||||||
|
|
||||||
|
while facet_merger.advance() {
|
||||||
|
let count = facet_merger
|
||||||
|
.current_kvs()
|
||||||
|
.iter()
|
||||||
|
.map(|it| {
|
||||||
|
let seg_ord = it.segment_ord;
|
||||||
|
let term_ord = it.streamer.term_ord();
|
||||||
|
collapsed_facet_ords[seg_ord]
|
||||||
|
.binary_search(&term_ord)
|
||||||
|
.map(|collapsed_term_id| {
|
||||||
|
if collapsed_term_id == 0 {
|
||||||
|
0
|
||||||
|
} else {
|
||||||
|
collapsed_facet_counts[seg_ord][collapsed_term_id]
|
||||||
|
}
|
||||||
|
}).unwrap_or(0)
|
||||||
|
}).sum();
|
||||||
|
if count > 0u64 {
|
||||||
|
let bytes: Vec<u8> = facet_merger.key().to_owned();
|
||||||
|
// may create an corrupted facet if the term dicitonary is corrupted
|
||||||
|
let facet = unsafe { Facet::from_encoded(bytes) };
|
||||||
|
facet_counts.insert(facet, count);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
Ok(FacetCounts { facet_counts })
|
FacetCounts { facet_counts }
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
impl SegmentCollector for FacetSegmentCollector {
|
impl Collector for FacetCollector {
|
||||||
type Fruit = FacetCounts;
|
fn set_segment(&mut self, _: SegmentLocalId, reader: &SegmentReader) -> Result<()> {
|
||||||
|
self.finalize_segment();
|
||||||
|
let facet_reader = reader.facet_reader(self.field)?;
|
||||||
|
self.set_collapse_mapping(&facet_reader);
|
||||||
|
self.current_segment_counts
|
||||||
|
.resize(self.current_collapse_facet_ords.len(), 0);
|
||||||
|
self.ff_reader = Some(UnsafeCell::new(facet_reader));
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
fn collect(&mut self, doc: DocId, _: Score) {
|
fn collect(&mut self, doc: DocId, _: Score) {
|
||||||
self.reader.facet_ords(doc, &mut self.facet_ords_buf);
|
let facet_reader: &mut FacetReader = unsafe {
|
||||||
|
&mut *self
|
||||||
|
.ff_reader
|
||||||
|
.as_ref()
|
||||||
|
.expect("collect() was called before set_segment. This should never happen.")
|
||||||
|
.get()
|
||||||
|
};
|
||||||
|
facet_reader.facet_ords(doc, &mut self.facet_ords);
|
||||||
let mut previous_collapsed_ord: usize = usize::MAX;
|
let mut previous_collapsed_ord: usize = usize::MAX;
|
||||||
for &facet_ord in &self.facet_ords_buf {
|
for &facet_ord in &self.facet_ords {
|
||||||
let collapsed_ord = self.collapse_mapping[facet_ord as usize];
|
let collapsed_ord = self.current_segment_collapse_mapping[facet_ord as usize];
|
||||||
self.counts[collapsed_ord] += if collapsed_ord == previous_collapsed_ord {
|
self.current_segment_counts[collapsed_ord] += if collapsed_ord == previous_collapsed_ord
|
||||||
|
{
|
||||||
0
|
0
|
||||||
} else {
|
} else {
|
||||||
1
|
1
|
||||||
@@ -354,24 +423,8 @@ impl SegmentCollector for FacetSegmentCollector {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Returns the results of the collection.
|
fn requires_scoring(&self) -> bool {
|
||||||
///
|
false
|
||||||
/// This method does not just return the counters,
|
|
||||||
/// it also translates the facet ordinals of the last segment.
|
|
||||||
fn harvest(self) -> FacetCounts {
|
|
||||||
let mut facet_counts = BTreeMap::new();
|
|
||||||
let facet_dict = self.reader.facet_dict();
|
|
||||||
for (collapsed_facet_ord, count) in self.counts.iter().cloned().enumerate() {
|
|
||||||
if count == 0 {
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
let mut facet = vec![];
|
|
||||||
let facet_ord = self.collapse_facet_ords[collapsed_facet_ord];
|
|
||||||
facet_dict.ord_to_term(facet_ord as u64, &mut facet);
|
|
||||||
// TODO
|
|
||||||
facet_counts.insert(Facet::from_encoded(facet).unwrap(), count);
|
|
||||||
}
|
|
||||||
FacetCounts { facet_counts }
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -403,9 +456,9 @@ impl FacetCounts {
|
|||||||
let right_bound = if facet.is_root() {
|
let right_bound = if facet.is_root() {
|
||||||
Bound::Unbounded
|
Bound::Unbounded
|
||||||
} else {
|
} else {
|
||||||
let mut facet_after_bytes: String = facet.encoded_str().to_owned();
|
let mut facet_after_bytes: Vec<u8> = facet.encoded_bytes().to_owned();
|
||||||
facet_after_bytes.push('\u{1}');
|
facet_after_bytes.push(1u8);
|
||||||
let facet_after = Facet::from_encoded_string(facet_after_bytes);
|
let facet_after = unsafe { Facet::from_encoded(facet_after_bytes) }; // ok logic
|
||||||
Bound::Excluded(facet_after)
|
Bound::Excluded(facet_after)
|
||||||
};
|
};
|
||||||
let underlying: btree_map::Range<_, _> = self.facet_counts.range((left_bound, right_bound));
|
let underlying: btree_map::Range<_, _> = self.facet_counts.range((left_bound, right_bound));
|
||||||
@@ -452,14 +505,14 @@ mod tests {
|
|||||||
use core::Index;
|
use core::Index;
|
||||||
use query::AllQuery;
|
use query::AllQuery;
|
||||||
use rand::distributions::Uniform;
|
use rand::distributions::Uniform;
|
||||||
use rand::prelude::SliceRandom;
|
|
||||||
use rand::{thread_rng, Rng};
|
use rand::{thread_rng, Rng};
|
||||||
use schema::{Document, Facet, Field, Schema};
|
use schema::Field;
|
||||||
|
use schema::{Document, Facet, SchemaBuilder};
|
||||||
use std::iter;
|
use std::iter;
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn test_facet_collector_drilldown() {
|
fn test_facet_collector_drilldown() {
|
||||||
let mut schema_builder = Schema::builder();
|
let mut schema_builder = SchemaBuilder::new();
|
||||||
let facet_field = schema_builder.add_facet_field("facet");
|
let facet_field = schema_builder.add_facet_field("facet");
|
||||||
let schema = schema_builder.build();
|
let schema = schema_builder.build();
|
||||||
let index = Index::create_in_ram(schema);
|
let index = Index::create_in_ram(schema);
|
||||||
@@ -474,20 +527,21 @@ mod tests {
|
|||||||
n /= 4;
|
n /= 4;
|
||||||
let leaf = n % 5;
|
let leaf = n % 5;
|
||||||
Facet::from(&format!("/top{}/mid{}/leaf{}", top, mid, leaf))
|
Facet::from(&format!("/top{}/mid{}/leaf{}", top, mid, leaf))
|
||||||
})
|
}).collect();
|
||||||
.collect();
|
|
||||||
for i in 0..num_facets * 10 {
|
for i in 0..num_facets * 10 {
|
||||||
let mut doc = Document::new();
|
let mut doc = Document::new();
|
||||||
doc.add_facet(facet_field, facets[i % num_facets].clone());
|
doc.add_facet(facet_field, facets[i % num_facets].clone());
|
||||||
index_writer.add_document(doc);
|
index_writer.add_document(doc);
|
||||||
}
|
}
|
||||||
index_writer.commit().unwrap();
|
index_writer.commit().unwrap();
|
||||||
let reader = index.reader().unwrap();
|
index.load_searchers().unwrap();
|
||||||
let searcher = reader.searcher();
|
let searcher = index.searcher();
|
||||||
|
|
||||||
let mut facet_collector = FacetCollector::for_field(facet_field);
|
let mut facet_collector = FacetCollector::for_field(facet_field);
|
||||||
facet_collector.add_facet(Facet::from("/top1"));
|
facet_collector.add_facet(Facet::from("/top1"));
|
||||||
let counts = searcher.search(&AllQuery, &facet_collector).unwrap();
|
searcher.search(&AllQuery, &mut facet_collector).unwrap();
|
||||||
|
|
||||||
|
let counts: FacetCounts = facet_collector.harvest();
|
||||||
{
|
{
|
||||||
let facets: Vec<(String, u64)> = counts
|
let facets: Vec<(String, u64)> = counts
|
||||||
.get("/top1")
|
.get("/top1")
|
||||||
@@ -501,16 +555,18 @@ mod tests {
|
|||||||
("/top1/mid2", 50),
|
("/top1/mid2", 50),
|
||||||
("/top1/mid3", 50),
|
("/top1/mid3", 50),
|
||||||
]
|
]
|
||||||
.iter()
|
.iter()
|
||||||
.map(|&(facet_str, count)| (String::from(facet_str), count))
|
.map(|&(facet_str, count)| (String::from(facet_str), count))
|
||||||
.collect::<Vec<_>>()
|
.collect::<Vec<_>>()
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
#[should_panic(expected = "Tried to add a facet which is a descendant of \
|
#[should_panic(
|
||||||
an already added facet.")]
|
expected = "Tried to add a facet which is a descendant of \
|
||||||
|
an already added facet."
|
||||||
|
)]
|
||||||
fn test_misused_facet_collector() {
|
fn test_misused_facet_collector() {
|
||||||
let mut facet_collector = FacetCollector::for_field(Field(0));
|
let mut facet_collector = FacetCollector::for_field(Field(0));
|
||||||
facet_collector.add_facet(Facet::from("/country"));
|
facet_collector.add_facet(Facet::from("/country"));
|
||||||
@@ -519,7 +575,7 @@ mod tests {
|
|||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn test_doc_unsorted_multifacet() {
|
fn test_doc_unsorted_multifacet() {
|
||||||
let mut schema_builder = Schema::builder();
|
let mut schema_builder = SchemaBuilder::new();
|
||||||
let facet_field = schema_builder.add_facet_field("facets");
|
let facet_field = schema_builder.add_facet_field("facets");
|
||||||
let schema = schema_builder.build();
|
let schema = schema_builder.build();
|
||||||
let index = Index::create_in_ram(schema);
|
let index = Index::create_in_ram(schema);
|
||||||
@@ -531,12 +587,13 @@ mod tests {
|
|||||||
facet_field => Facet::from_text(&"/subjects/B/b"),
|
facet_field => Facet::from_text(&"/subjects/B/b"),
|
||||||
));
|
));
|
||||||
index_writer.commit().unwrap();
|
index_writer.commit().unwrap();
|
||||||
let reader = index.reader().unwrap();
|
index.load_searchers().unwrap();
|
||||||
let searcher = reader.searcher();
|
let searcher = index.searcher();
|
||||||
assert_eq!(searcher.num_docs(), 1);
|
assert_eq!(searcher.num_docs(), 1);
|
||||||
let mut facet_collector = FacetCollector::for_field(facet_field);
|
let mut facet_collector = FacetCollector::for_field(facet_field);
|
||||||
facet_collector.add_facet("/subjects");
|
facet_collector.add_facet("/subjects");
|
||||||
let counts = searcher.search(&AllQuery, &facet_collector).unwrap();
|
searcher.search(&AllQuery, &mut facet_collector).unwrap();
|
||||||
|
let counts = facet_collector.harvest();
|
||||||
let facets: Vec<(&Facet, u64)> = counts.get("/subjects").collect();
|
let facets: Vec<(&Facet, u64)> = counts.get("/subjects").collect();
|
||||||
assert_eq!(facets[0].1, 1);
|
assert_eq!(facets[0].1, 1);
|
||||||
}
|
}
|
||||||
@@ -550,7 +607,7 @@ mod tests {
|
|||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn test_facet_collector_topk() {
|
fn test_facet_collector_topk() {
|
||||||
let mut schema_builder = Schema::builder();
|
let mut schema_builder = SchemaBuilder::new();
|
||||||
let facet_field = schema_builder.add_facet_field("facet");
|
let facet_field = schema_builder.add_facet_field("facet");
|
||||||
let schema = schema_builder.build();
|
let schema = schema_builder.build();
|
||||||
let index = Index::create_in_ram(schema);
|
let index = Index::create_in_ram(schema);
|
||||||
@@ -562,28 +619,29 @@ mod tests {
|
|||||||
let facet = Facet::from(&format!("/facet/{}", c));
|
let facet = Facet::from(&format!("/facet/{}", c));
|
||||||
let doc = doc!(facet_field => facet);
|
let doc = doc!(facet_field => facet);
|
||||||
iter::repeat(doc).take(count)
|
iter::repeat(doc).take(count)
|
||||||
})
|
}).map(|mut doc| {
|
||||||
.map(|mut doc| {
|
|
||||||
doc.add_facet(
|
doc.add_facet(
|
||||||
facet_field,
|
facet_field,
|
||||||
&format!("/facet/{}", thread_rng().sample(&uniform)),
|
&format!("/facet/{}", thread_rng().sample(&uniform)),
|
||||||
);
|
);
|
||||||
doc
|
doc
|
||||||
})
|
}).collect();
|
||||||
.collect();
|
thread_rng().shuffle(&mut docs[..]);
|
||||||
docs[..].shuffle(&mut thread_rng());
|
|
||||||
|
|
||||||
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
|
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
|
||||||
for doc in docs {
|
for doc in docs {
|
||||||
index_writer.add_document(doc);
|
index_writer.add_document(doc);
|
||||||
}
|
}
|
||||||
index_writer.commit().unwrap();
|
index_writer.commit().unwrap();
|
||||||
let searcher = index.reader().unwrap().searcher();
|
index.load_searchers().unwrap();
|
||||||
|
|
||||||
|
let searcher = index.searcher();
|
||||||
|
|
||||||
let mut facet_collector = FacetCollector::for_field(facet_field);
|
let mut facet_collector = FacetCollector::for_field(facet_field);
|
||||||
facet_collector.add_facet("/facet");
|
facet_collector.add_facet("/facet");
|
||||||
let counts: FacetCounts = searcher.search(&AllQuery, &facet_collector).unwrap();
|
searcher.search(&AllQuery, &mut facet_collector).unwrap();
|
||||||
|
|
||||||
|
let counts: FacetCounts = facet_collector.harvest();
|
||||||
{
|
{
|
||||||
let facets: Vec<(&Facet, u64)> = counts.top_k("/facet", 3);
|
let facets: Vec<(&Facet, u64)> = counts.top_k("/facet", 3);
|
||||||
assert_eq!(
|
assert_eq!(
|
||||||
@@ -606,13 +664,13 @@ mod bench {
|
|||||||
use query::AllQuery;
|
use query::AllQuery;
|
||||||
use rand::{thread_rng, Rng};
|
use rand::{thread_rng, Rng};
|
||||||
use schema::Facet;
|
use schema::Facet;
|
||||||
use schema::Schema;
|
use schema::SchemaBuilder;
|
||||||
use test::Bencher;
|
use test::Bencher;
|
||||||
use Index;
|
use Index;
|
||||||
|
|
||||||
#[bench]
|
#[bench]
|
||||||
fn bench_facet_collector(b: &mut Bencher) {
|
fn bench_facet_collector(b: &mut Bencher) {
|
||||||
let mut schema_builder = Schema::builder();
|
let mut schema_builder = SchemaBuilder::new();
|
||||||
let facet_field = schema_builder.add_facet_field("facet");
|
let facet_field = schema_builder.add_facet_field("facet");
|
||||||
let schema = schema_builder.build();
|
let schema = schema_builder.build();
|
||||||
let index = Index::create_in_ram(schema);
|
let index = Index::create_in_ram(schema);
|
||||||
@@ -632,11 +690,12 @@ mod bench {
|
|||||||
index_writer.add_document(doc);
|
index_writer.add_document(doc);
|
||||||
}
|
}
|
||||||
index_writer.commit().unwrap();
|
index_writer.commit().unwrap();
|
||||||
let reader = index.reader().unwrap();
|
index.load_searchers().unwrap();
|
||||||
|
|
||||||
b.iter(|| {
|
b.iter(|| {
|
||||||
let searcher = index.searcher();
|
let searcher = index.searcher();
|
||||||
let facet_collector = FacetCollector::for_field(facet_field);
|
let mut facet_collector = FacetCollector::for_field(facet_field);
|
||||||
searcher.search(&AllQuery, &facet_collector).unwrap();
|
searcher.search(&AllQuery, &mut facet_collector).unwrap();
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
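One detail of the facet counts API above that is easy to miss is how a facet prefix such as `/category` is turned into a `BTreeMap` range in `FacetCounts::get`: the lower bound is the prefix itself and the upper bound is the prefix with a `\u{1}` byte appended, which excludes everything that does not lie underneath it. A dependency-free sketch of that trick over plain strings (the separator byte and map layout are simplifications for illustration, not tantivy's exact facet encoding):

```rust
use std::collections::BTreeMap;
use std::ops::Bound;

// Return all entries whose key lies "under" the prefix, using an exclusive
// upper bound built by appending a low byte to the prefix.
fn prefix_range<'a>(
    counts: &'a BTreeMap<String, u64>,
    prefix: &str,
) -> Vec<(&'a String, u64)> {
    let mut upper = prefix.to_owned();
    upper.push('\u{1}');
    counts
        .range((Bound::Included(prefix.to_owned()), Bound::Excluded(upper)))
        .map(|(k, &v)| (k, v))
        .collect()
}

fn main() {
    let mut counts = BTreeMap::new();
    counts.insert("/category\u{0}fiction".to_string(), 3u64);
    counts.insert("/category\u{0}biography".to_string(), 1u64);
    counts.insert("/lang\u{0}en".to_string(), 4u64);
    let under_category = prefix_range(&counts, "/category");
    assert_eq!(under_category.len(), 2);
}
```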
@@ -79,7 +79,7 @@ mod tests {
     // make sure we have facet counters correctly filled
     fn test_facet_collector_results() {

-        let mut schema_builder = schema::Schema::builder();
+        let mut schema_builder = schema::SchemaBuilder::new();
         let num_field_i64 = schema_builder.add_i64_field("num_i64", FAST);
         let num_field_u64 = schema_builder.add_u64_field("num_u64", FAST);
         let text_field = schema_builder.add_text_field("text", STRING);
@@ -88,7 +88,7 @@ mod tests {
         let index = Index::create_in_ram(schema.clone());

         {
-            let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
+            let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
             {
                 for i in 0u64..10u64 {
                     index_writer.add_document(doc!(
@@ -101,7 +101,8 @@ mod tests {
             assert_eq!(index_writer.commit().unwrap(), 10u64);
         }

-        let searcher = index.reader().searcher();
+        index.load_searchers().unwrap();
+        let searcher = index.searcher();
         let mut ffvf_i64: IntFacetCollector<I64FastFieldReader> = IntFacetCollector::new(num_field_i64);
         let mut ffvf_u64: IntFacetCollector<U64FastFieldReader> = IntFacetCollector::new(num_field_u64);
@@ -1,91 +1,7 @@
 /*!
+Defines how the documents matching a search query should be processed.
-# Collectors
-
-Collectors define the information you want to extract from the documents matching the queries.
-In tantivy jargon, we call this information your search "fruit".
-
-Your fruit could for instance be :
-- [the count of matching documents](./struct.Count.html)
-- [the top 10 documents, by relevancy or by a fast field](./struct.TopDocs.html)
-- [facet counts](./struct.FacetCollector.html)
-
-At one point in your code, you will trigger the actual search operation by calling
-[the `search(...)` method of your `Searcher` object](../struct.Searcher.html#method.search).
-This call will look like this.
-
-```verbatim
-let fruit = searcher.search(&query, &collector)?;
-```
-
-Here the type of fruit is actually determined as an associated type of the collector (`Collector::Fruit`).
-
-# Combining several collectors
-
-A rich search experience often requires to run several collectors on your search query.
-For instance,
-- selecting the top-K products matching your query
-- counting the matching documents
-- computing several facets
-- computing statistics about the matching product prices
-
-A simple and efficient way to do that is to pass your collectors as one tuple.
-The resulting `Fruit` will then be a typed tuple with each collector's original fruits
-in their respective position.
-
-```rust
-# extern crate tantivy;
-# use tantivy::schema::*;
-# use tantivy::*;
-# use tantivy::query::*;
-use tantivy::collector::{Count, TopDocs};
-#
-# fn main() -> tantivy::Result<()> {
-# let mut schema_builder = Schema::builder();
-# let title = schema_builder.add_text_field("title", TEXT);
-# let schema = schema_builder.build();
-# let index = Index::create_in_ram(schema);
-# let mut index_writer = index.writer(3_000_000)?;
-# index_writer.add_document(doc!(
-# title => "The Name of the Wind",
-# ));
-# index_writer.add_document(doc!(
-# title => "The Diary of Muadib",
-# ));
-# index_writer.commit()?;
-# let reader = index.reader()?;
-# let searcher = reader.searcher();
-# let query_parser = QueryParser::for_index(&index, vec![title]);
-# let query = query_parser.parse_query("diary")?;
-let (doc_count, top_docs): (usize, Vec<(Score, DocAddress)>) =
-    searcher.search(&query, &(Count, TopDocs::with_limit(2)))?;
-# Ok(())
-# }
-```
-
-The `Collector` trait is implemented for up to 4 collectors.
-If you have more than 4 collectors, you can either group them into
-tuples of tuples `(a,(b,(c,d)))`, or rely on `MultiCollector`'s.
-
-# Combining several collectors dynamically
-
-Combining collectors into a tuple is a zero-cost abstraction: everything
-happens as if you had manually implemented a single collector
-combining all of our features.
-
-Unfortunately it requires you to know at compile time your collector types.
-If on the other hand, the collectors depend on some query parameter,
-you can rely on `MultiCollector`'s.
-
-# Implementing your own collectors.
-
-See the `custom_collector` example.
-
 */

-use downcast_rs;
 use DocId;
 use Result;
 use Score;
@@ -93,7 +9,7 @@ use SegmentLocalId;
 use SegmentReader;

 mod count_collector;
-pub use self::count_collector::Count;
+pub use self::count_collector::CountCollector;

 mod multi_collector;
 pub use self::multi_collector::MultiCollector;
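The removed module documentation above explains that collector tuples compose: a 2-tuple of collectors is itself a collector whose fruit is a 2-tuple, which is why more than four collectors can be grouped as `(a, (b, (c, d)))`. A dependency-free sketch of that composition rule with a stripped-down trait (associated `Fruit` only, no segments or scores; not the real tantivy trait):

```rust
// Minimal model of "a pair of collectors is a collector whose fruit is a pair".
trait MiniCollector {
    type Fruit;
    fn collect(&mut self, doc: u32);
    fn harvest(self) -> Self::Fruit;
}

struct Count(usize);
impl MiniCollector for Count {
    type Fruit = usize;
    fn collect(&mut self, _doc: u32) {
        self.0 += 1;
    }
    fn harvest(self) -> usize {
        self.0
    }
}

impl<L: MiniCollector, R: MiniCollector> MiniCollector for (L, R) {
    type Fruit = (L::Fruit, R::Fruit);
    fn collect(&mut self, doc: u32) {
        self.0.collect(doc);
        self.1.collect(doc);
    }
    fn harvest(self) -> Self::Fruit {
        (self.0.harvest(), self.1.harvest())
    }
}

fn main() {
    // Grouping more than two collectors by nesting tuples, as the docs suggest.
    let mut collectors = (Count(0), (Count(0), Count(0)));
    for doc in 0..4 {
        collectors.collect(doc);
    }
    // The fruit keeps the same nested shape: (usize, (usize, usize)).
    let (a, (b, c)) = collectors.harvest();
    assert_eq!((a, b, c), (4, 4, 4));
}
```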
@@ -101,264 +17,237 @@ pub use self::multi_collector::MultiCollector;
|
|||||||
mod top_collector;
|
mod top_collector;
|
||||||
|
|
||||||
mod top_score_collector;
|
mod top_score_collector;
|
||||||
pub use self::top_score_collector::TopDocs;
|
pub use self::top_score_collector::TopScoreCollector;
|
||||||
|
#[deprecated]
|
||||||
|
pub use self::top_score_collector::TopScoreCollector as TopCollector;
|
||||||
|
|
||||||
mod top_field_collector;
|
mod top_field_collector;
|
||||||
pub use self::top_field_collector::TopDocsByField;
|
pub use self::top_field_collector::TopFieldCollector;
|
||||||
|
|
||||||
mod facet_collector;
|
mod facet_collector;
|
||||||
pub use self::facet_collector::FacetCollector;
|
pub use self::facet_collector::FacetCollector;
|
||||||
|
|
||||||
/// `Fruit` is the type for the result of our collection.
|
mod chained_collector;
|
||||||
/// e.g. `usize` for the `Count` collector.
|
pub use self::chained_collector::{chain, ChainedCollector};
|
||||||
pub trait Fruit: Send + downcast_rs::Downcast {}
|
|
||||||
|
|
||||||
impl<T> Fruit for T where T: Send + downcast_rs::Downcast {}
|
|
||||||
|
|
||||||
/// Collectors are in charge of collecting and retaining relevant
|
/// Collectors are in charge of collecting and retaining relevant
|
||||||
/// information from the document found and scored by the query.
|
/// information from the document found and scored by the query.
|
||||||
///
|
///
|
||||||
|
///
|
||||||
/// For instance,
|
/// For instance,
|
||||||
///
|
///
|
||||||
/// - keeping track of the top 10 best documents
|
/// - keeping track of the top 10 best documents
|
||||||
/// - computing a breakdown over a fast field
|
/// - computing a breakdown over a fast field
|
||||||
/// - computing the number of documents matching the query
|
/// - computing the number of documents matching the query
|
||||||
///
|
///
|
||||||
/// Our search index is in fact a collection of segments, so
|
/// Queries are in charge of pushing the `DocSet` to the collector.
|
||||||
/// a `Collector` trait is actually more of a factory to instance
|
|
||||||
/// `SegmentCollector`s for each segments.
|
|
||||||
///
|
///
|
||||||
/// The collection logic itself is in the `SegmentCollector`.
|
/// As they work on multiple segments, they first inform
|
||||||
|
/// the collector of a change in a segment and then
|
||||||
|
/// call the `collect` method to push the document to the collector.
|
||||||
|
///
|
||||||
|
/// Temporally, our collector will receive calls
|
||||||
|
/// - `.set_segment(0, segment_reader_0)`
|
||||||
|
/// - `.collect(doc0_of_segment_0)`
|
||||||
|
/// - `.collect(...)`
|
||||||
|
/// - `.collect(last_doc_of_segment_0)`
|
||||||
|
/// - `.set_segment(1, segment_reader_1)`
|
||||||
|
/// - `.collect(doc0_of_segment_1)`
|
||||||
|
/// - `.collect(...)`
|
||||||
|
/// - `.collect(last_doc_of_segment_1)`
|
||||||
|
/// - `...`
|
||||||
|
/// - `.collect(last_doc_of_last_segment)`
|
||||||
///
|
///
|
||||||
/// Segments are not guaranteed to be visited in any specific order.
|
/// Segments are not guaranteed to be visited in any specific order.
|
||||||
pub trait Collector: Sync {
|
pub trait Collector {
|
||||||
/// `Fruit` is the type for the result of our collection.
|
|
||||||
/// e.g. `usize` for the `Count` collector.
|
|
||||||
type Fruit: Fruit;
|
|
||||||
|
|
||||||
/// Type of the `SegmentCollector` associated to this collector.
|
|
||||||
type Child: SegmentCollector<Fruit = Self::Fruit>;
|
|
||||||
|
|
||||||
/// `set_segment` is called before beginning to enumerate
|
/// `set_segment` is called before beginning to enumerate
|
||||||
/// on this segment.
|
/// on this segment.
|
||||||
fn for_segment(
|
fn set_segment(
|
||||||
&self,
|
&mut self,
|
||||||
segment_local_id: SegmentLocalId,
|
segment_local_id: SegmentLocalId,
|
||||||
segment: &SegmentReader,
|
segment: &SegmentReader,
|
||||||
) -> Result<Self::Child>;
|
) -> Result<()>;
|
||||||
|
|
||||||
/// Returns true iff the collector requires to compute scores for documents.
|
|
||||||
fn requires_scoring(&self) -> bool;
|
|
||||||
|
|
||||||
/// Combines the fruit associated to the collection of each segments
|
|
||||||
/// into one fruit.
|
|
||||||
fn merge_fruits(&self, segment_fruits: Vec<Self::Fruit>) -> Result<Self::Fruit>;
|
|
||||||
}
|
|
||||||
|
|
||||||
/// The `SegmentCollector` is the trait in charge of defining the
|
|
||||||
/// collect operation at the scale of the segment.
|
|
||||||
///
|
|
||||||
/// `.collect(doc, score)` will be called for every documents
|
|
||||||
/// matching the query.
|
|
||||||
pub trait SegmentCollector: 'static {
|
|
||||||
/// `Fruit` is the type for the result of our collection.
|
|
||||||
/// e.g. `usize` for the `Count` collector.
|
|
||||||
type Fruit: Fruit;
|
|
||||||
|
|
||||||
/// The query pushes the scored document to the collector via this method.
|
/// The query pushes the scored document to the collector via this method.
|
||||||
fn collect(&mut self, doc: DocId, score: Score);
|
fn collect(&mut self, doc: DocId, score: Score);
|
||||||
|
|
||||||
/// Extract the fruit of the collection from the `SegmentCollector`.
|
/// Returns true iff the collector requires to compute scores for documents.
|
||||||
fn harvest(self) -> Self::Fruit;
|
fn requires_scoring(&self) -> bool;
|
||||||
}
|
}
|
||||||
|
|
||||||
// -----------------------------------------------
|
impl<'a, C: Collector> Collector for &'a mut C {
|
||||||
// Tuple implementations.
|
fn set_segment(
|
||||||
|
&mut self,
|
||||||
impl<Left, Right> Collector for (Left, Right)
|
segment_local_id: SegmentLocalId,
|
||||||
where
|
segment: &SegmentReader,
|
||||||
Left: Collector,
|
) -> Result<()> {
|
||||||
Right: Collector,
|
(*self).set_segment(segment_local_id, segment)
|
||||||
{
|
}
|
||||||
type Fruit = (Left::Fruit, Right::Fruit);
|
/// The query pushes the scored document to the collector via this method.
|
||||||
type Child = (Left::Child, Right::Child);
|
fn collect(&mut self, doc: DocId, score: Score) {
|
||||||
|
C::collect(self, doc, score)
|
||||||
fn for_segment(&self, segment_local_id: u32, segment: &SegmentReader) -> Result<Self::Child> {
|
|
||||||
let left = self.0.for_segment(segment_local_id, segment)?;
|
|
||||||
let right = self.1.for_segment(segment_local_id, segment)?;
|
|
||||||
Ok((left, right))
|
|
||||||
}
|
}
|
||||||
|
|
||||||
fn requires_scoring(&self) -> bool {
|
fn requires_scoring(&self) -> bool {
|
||||||
self.0.requires_scoring() || self.1.requires_scoring()
|
C::requires_scoring(self)
|
||||||
}
|
|
||||||
|
|
||||||
fn merge_fruits(
|
|
||||||
&self,
|
|
||||||
children: Vec<(Left::Fruit, Right::Fruit)>,
|
|
||||||
) -> Result<(Left::Fruit, Right::Fruit)> {
|
|
||||||
let mut left_fruits = vec![];
|
|
||||||
let mut right_fruits = vec![];
|
|
||||||
for (left_fruit, right_fruit) in children {
|
|
||||||
left_fruits.push(left_fruit);
|
|
||||||
right_fruits.push(right_fruit);
|
|
||||||
}
|
|
||||||
Ok((
|
|
||||||
self.0.merge_fruits(left_fruits)?,
|
|
||||||
self.1.merge_fruits(right_fruits)?,
|
|
||||||
))
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<Left, Right> SegmentCollector for (Left, Right)
|
|
||||||
where
|
|
||||||
Left: SegmentCollector,
|
|
||||||
Right: SegmentCollector,
|
|
||||||
{
|
|
||||||
type Fruit = (Left::Fruit, Right::Fruit);
|
|
||||||
|
|
||||||
fn collect(&mut self, doc: DocId, score: Score) {
|
|
||||||
self.0.collect(doc, score);
|
|
||||||
self.1.collect(doc, score);
|
|
||||||
}
|
|
||||||
|
|
||||||
fn harvest(self) -> <Self as SegmentCollector>::Fruit {
|
|
||||||
(self.0.harvest(), self.1.harvest())
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// 3-Tuple
|
|
||||||
|
|
||||||
impl<One, Two, Three> Collector for (One, Two, Three)
|
|
||||||
where
|
|
||||||
One: Collector,
|
|
||||||
Two: Collector,
|
|
||||||
Three: Collector,
|
|
||||||
{
|
|
||||||
type Fruit = (One::Fruit, Two::Fruit, Three::Fruit);
|
|
||||||
type Child = (One::Child, Two::Child, Three::Child);
|
|
||||||
|
|
||||||
fn for_segment(&self, segment_local_id: u32, segment: &SegmentReader) -> Result<Self::Child> {
|
|
||||||
let one = self.0.for_segment(segment_local_id, segment)?;
|
|
||||||
let two = self.1.for_segment(segment_local_id, segment)?;
|
|
||||||
let three = self.2.for_segment(segment_local_id, segment)?;
|
|
||||||
Ok((one, two, three))
|
|
||||||
}
|
|
||||||
|
|
||||||
fn requires_scoring(&self) -> bool {
|
|
||||||
self.0.requires_scoring() || self.1.requires_scoring() || self.2.requires_scoring()
|
|
||||||
}
|
|
||||||
|
|
||||||
fn merge_fruits(&self, children: Vec<Self::Fruit>) -> Result<Self::Fruit> {
|
|
||||||
let mut one_fruits = vec![];
|
|
||||||
let mut two_fruits = vec![];
|
|
||||||
let mut three_fruits = vec![];
|
|
||||||
for (one_fruit, two_fruit, three_fruit) in children {
|
|
||||||
one_fruits.push(one_fruit);
|
|
||||||
two_fruits.push(two_fruit);
|
|
||||||
three_fruits.push(three_fruit);
|
|
||||||
}
|
|
||||||
Ok((
|
|
||||||
self.0.merge_fruits(one_fruits)?,
|
|
||||||
self.1.merge_fruits(two_fruits)?,
|
|
||||||
self.2.merge_fruits(three_fruits)?,
|
|
||||||
))
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl<One, Two, Three> SegmentCollector for (One, Two, Three)
|
|
||||||
where
|
|
||||||
One: SegmentCollector,
|
|
||||||
Two: SegmentCollector,
|
|
||||||
Three: SegmentCollector,
|
|
||||||
{
|
|
||||||
type Fruit = (One::Fruit, Two::Fruit, Three::Fruit);
|
|
||||||
|
|
||||||
fn collect(&mut self, doc: DocId, score: Score) {
|
|
||||||
self.0.collect(doc, score);
|
|
||||||
self.1.collect(doc, score);
|
|
||||||
self.2.collect(doc, score);
|
|
||||||
}
|
|
||||||
|
|
||||||
fn harvest(self) -> <Self as SegmentCollector>::Fruit {
|
|
||||||
(self.0.harvest(), self.1.harvest(), self.2.harvest())
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// 4-Tuple
|
|
||||||
|
|
||||||
impl<One, Two, Three, Four> Collector for (One, Two, Three, Four)
|
|
||||||
where
|
|
||||||
One: Collector,
|
|
||||||
Two: Collector,
|
|
||||||
Three: Collector,
|
|
||||||
Four: Collector,
|
|
||||||
{
|
|
||||||
type Fruit = (One::Fruit, Two::Fruit, Three::Fruit, Four::Fruit);
|
|
||||||
type Child = (One::Child, Two::Child, Three::Child, Four::Child);
|
|
||||||
|
|
||||||
fn for_segment(&self, segment_local_id: u32, segment: &SegmentReader) -> Result<Self::Child> {
|
|
||||||
let one = self.0.for_segment(segment_local_id, segment)?;
|
|
||||||
let two = self.1.for_segment(segment_local_id, segment)?;
|
|
||||||
let three = self.2.for_segment(segment_local_id, segment)?;
|
|
||||||
let four = self.3.for_segment(segment_local_id, segment)?;
|
|
||||||
Ok((one, two, three, four))
|
|
||||||
}
|
|
||||||
|
|
||||||
fn requires_scoring(&self) -> bool {
|
|
||||||
self.0.requires_scoring()
|
|
||||||
|| self.1.requires_scoring()
|
|
||||||
|| self.2.requires_scoring()
|
|
||||||
|| self.3.requires_scoring()
|
|
||||||
}
|
|
||||||
|
|
||||||
fn merge_fruits(&self, children: Vec<Self::Fruit>) -> Result<Self::Fruit> {
|
|
||||||
let mut one_fruits = vec![];
|
|
||||||
let mut two_fruits = vec![];
|
|
||||||
let mut three_fruits = vec![];
|
|
||||||
let mut four_fruits = vec![];
|
|
||||||
for (one_fruit, two_fruit, three_fruit, four_fruit) in children {
|
|
||||||
one_fruits.push(one_fruit);
|
|
||||||
two_fruits.push(two_fruit);
|
|
||||||
three_fruits.push(three_fruit);
|
|
||||||
four_fruits.push(four_fruit);
|
|
||||||
}
|
|
||||||
Ok((
|
|
||||||
self.0.merge_fruits(one_fruits)?,
|
|
||||||
self.1.merge_fruits(two_fruits)?,
|
|
||||||
self.2.merge_fruits(three_fruits)?,
|
|
||||||
self.3.merge_fruits(four_fruits)?,
|
|
||||||
))
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl<One, Two, Three, Four> SegmentCollector for (One, Two, Three, Four)
|
|
||||||
where
|
|
||||||
One: SegmentCollector,
|
|
||||||
Two: SegmentCollector,
|
|
||||||
Three: SegmentCollector,
|
|
||||||
Four: SegmentCollector,
|
|
||||||
{
|
|
||||||
type Fruit = (One::Fruit, Two::Fruit, Three::Fruit, Four::Fruit);
|
|
||||||
|
|
||||||
fn collect(&mut self, doc: DocId, score: Score) {
|
|
||||||
self.0.collect(doc, score);
|
|
||||||
self.1.collect(doc, score);
|
|
||||||
self.2.collect(doc, score);
|
|
||||||
self.3.collect(doc, score);
|
|
||||||
}
|
|
||||||
|
|
||||||
fn harvest(self) -> <Self as SegmentCollector>::Fruit {
|
|
||||||
(
|
|
||||||
self.0.harvest(),
|
|
||||||
self.1.harvest(),
|
|
||||||
self.2.harvest(),
|
|
||||||
self.3.harvest(),
|
|
||||||
)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl_downcast!(Fruit);
|
|
||||||
|
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
pub mod tests;
|
pub mod tests {
|
||||||
|
|
||||||
|
use super::*;
|
||||||
|
use core::SegmentReader;
|
||||||
|
use fastfield::BytesFastFieldReader;
|
||||||
|
use fastfield::FastFieldReader;
|
||||||
|
use schema::Field;
|
||||||
|
use DocId;
|
||||||
|
use Score;
|
||||||
|
use SegmentLocalId;
|
||||||
|
|
||||||
|
/// Stores all of the doc ids.
|
||||||
|
/// This collector is only used for tests.
|
||||||
|
/// It is unusable in practise, as it does not store
|
||||||
|
/// the segment ordinals
|
||||||
|
pub struct TestCollector {
|
||||||
|
offset: DocId,
|
||||||
|
segment_max_doc: DocId,
|
||||||
|
docs: Vec<DocId>,
|
||||||
|
scores: Vec<Score>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl TestCollector {
|
||||||
|
/// Return the exhalist of documents.
|
||||||
|
pub fn docs(self) -> Vec<DocId> {
|
||||||
|
self.docs
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn scores(self) -> Vec<Score> {
|
||||||
|
self.scores
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Default for TestCollector {
|
||||||
|
fn default() -> TestCollector {
|
||||||
|
TestCollector {
|
||||||
|
offset: 0,
|
||||||
|
segment_max_doc: 0,
|
||||||
|
docs: Vec::new(),
|
||||||
|
scores: Vec::new(),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Collector for TestCollector {
|
||||||
|
fn set_segment(&mut self, _: SegmentLocalId, reader: &SegmentReader) -> Result<()> {
|
||||||
|
self.offset += self.segment_max_doc;
|
||||||
|
self.segment_max_doc = reader.max_doc();
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
fn collect(&mut self, doc: DocId, score: Score) {
|
||||||
|
self.docs.push(doc + self.offset);
|
||||||
|
self.scores.push(score);
|
||||||
|
}
|
||||||
|
|
||||||
|
fn requires_scoring(&self) -> bool {
|
||||||
|
true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Collects in order all of the fast fields for all of the
|
||||||
|
/// doc in the `DocSet`
|
||||||
|
///
|
||||||
|
/// This collector is mainly useful for tests.
|
||||||
|
pub struct FastFieldTestCollector {
|
||||||
|
vals: Vec<u64>,
|
||||||
|
field: Field,
|
||||||
|
ff_reader: Option<FastFieldReader<u64>>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl FastFieldTestCollector {
|
||||||
|
pub fn for_field(field: Field) -> FastFieldTestCollector {
|
||||||
|
FastFieldTestCollector {
|
||||||
|
vals: Vec::new(),
|
||||||
|
field,
|
||||||
|
ff_reader: None,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn vals(self) -> Vec<u64> {
|
||||||
|
self.vals
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Collector for FastFieldTestCollector {
|
||||||
|
fn set_segment(&mut self, _: SegmentLocalId, reader: &SegmentReader) -> Result<()> {
|
||||||
|
self.ff_reader = Some(reader.fast_field_reader(self.field)?);
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
fn collect(&mut self, doc: DocId, _score: Score) {
|
||||||
|
let val = self.ff_reader.as_ref().unwrap().get(doc);
|
||||||
|
self.vals.push(val);
|
||||||
|
}
|
||||||
|
fn requires_scoring(&self) -> bool {
|
||||||
|
false
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Collects in order all of the fast field bytes for all of the
|
||||||
|
/// docs in the `DocSet`
|
||||||
|
///
|
||||||
|
/// This collector is mainly useful for tests.
|
||||||
|
pub struct BytesFastFieldTestCollector {
|
||||||
|
vals: Vec<u8>,
|
||||||
|
field: Field,
|
||||||
|
ff_reader: Option<BytesFastFieldReader>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl BytesFastFieldTestCollector {
|
||||||
|
pub fn for_field(field: Field) -> BytesFastFieldTestCollector {
|
||||||
|
BytesFastFieldTestCollector {
|
||||||
|
vals: Vec::new(),
|
||||||
|
field,
|
||||||
|
ff_reader: None,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn vals(self) -> Vec<u8> {
|
||||||
|
self.vals
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Collector for BytesFastFieldTestCollector {
|
||||||
|
fn set_segment(&mut self, _segment_local_id: u32, segment: &SegmentReader) -> Result<()> {
|
||||||
|
self.ff_reader = Some(segment.bytes_fast_field_reader(self.field)?);
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
fn collect(&mut self, doc: u32, _score: f32) {
|
||||||
|
let val = self.ff_reader.as_ref().unwrap().get_val(doc);
|
||||||
|
self.vals.extend(val);
|
||||||
|
}
|
||||||
|
|
||||||
|
fn requires_scoring(&self) -> bool {
|
||||||
|
false
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(all(test, feature = "unstable"))]
|
||||||
|
mod bench {
|
||||||
|
use collector::{Collector, CountCollector};
|
||||||
|
use test::Bencher;
|
||||||
|
|
||||||
|
#[bench]
|
||||||
|
fn build_collector(b: &mut Bencher) {
|
||||||
|
b.iter(|| {
|
||||||
|
let mut count_collector = CountCollector::default();
|
||||||
|
let docs: Vec<u32> = (0..1_000_000).collect();
|
||||||
|
for doc in docs {
|
||||||
|
count_collector.collect(doc, 1f32);
|
||||||
|
}
|
||||||
|
count_collector.count()
|
||||||
|
});
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|||||||
@@ -1,119 +1,26 @@
|
|||||||
use super::Collector;
|
use super::Collector;
|
||||||
use super::SegmentCollector;
|
|
||||||
use collector::Fruit;
|
|
||||||
use std::marker::PhantomData;
|
|
||||||
use DocId;
|
use DocId;
|
||||||
use Result;
|
use Result;
|
||||||
use Score;
|
use Score;
|
||||||
use SegmentLocalId;
|
use SegmentLocalId;
|
||||||
use SegmentReader;
|
use SegmentReader;
|
||||||
use TantivyError;
|
|
||||||
|
|
||||||
pub struct MultiFruit {
|
|
||||||
sub_fruits: Vec<Option<Box<Fruit>>>,
|
|
||||||
}
|
|
||||||
|
|
||||||
pub struct CollectorWrapper<TCollector: Collector>(TCollector);
|
|
||||||
|
|
||||||
impl<TCollector: Collector> Collector for CollectorWrapper<TCollector> {
|
|
||||||
type Fruit = Box<Fruit>;
|
|
||||||
type Child = Box<BoxableSegmentCollector>;
|
|
||||||
|
|
||||||
fn for_segment(
|
|
||||||
&self,
|
|
||||||
segment_local_id: u32,
|
|
||||||
reader: &SegmentReader,
|
|
||||||
) -> Result<Box<BoxableSegmentCollector>> {
|
|
||||||
let child = self.0.for_segment(segment_local_id, reader)?;
|
|
||||||
Ok(Box::new(SegmentCollectorWrapper(child)))
|
|
||||||
}
|
|
||||||
|
|
||||||
fn requires_scoring(&self) -> bool {
|
|
||||||
self.0.requires_scoring()
|
|
||||||
}
|
|
||||||
|
|
||||||
fn merge_fruits(&self, children: Vec<<Self as Collector>::Fruit>) -> Result<Box<Fruit>> {
|
|
||||||
let typed_fruit: Vec<TCollector::Fruit> = children
|
|
||||||
.into_iter()
|
|
||||||
.map(|untyped_fruit| {
|
|
||||||
untyped_fruit
|
|
||||||
.downcast::<TCollector::Fruit>()
|
|
||||||
.map(|boxed_but_typed| *boxed_but_typed)
|
|
||||||
.map_err(|_| {
|
|
||||||
TantivyError::InvalidArgument("Failed to cast child fruit.".to_string())
|
|
||||||
})
|
|
||||||
})
|
|
||||||
.collect::<Result<_>>()?;
|
|
||||||
let merged_fruit = self.0.merge_fruits(typed_fruit)?;
|
|
||||||
Ok(Box::new(merged_fruit))
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl SegmentCollector for Box<BoxableSegmentCollector> {
|
|
||||||
type Fruit = Box<Fruit>;
|
|
||||||
|
|
||||||
fn collect(&mut self, doc: u32, score: f32) {
|
|
||||||
self.as_mut().collect(doc, score);
|
|
||||||
}
|
|
||||||
|
|
||||||
fn harvest(self) -> Box<Fruit> {
|
|
||||||
BoxableSegmentCollector::harvest_from_box(self)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
pub trait BoxableSegmentCollector {
|
|
||||||
fn collect(&mut self, doc: u32, score: f32);
|
|
||||||
fn harvest_from_box(self: Box<Self>) -> Box<Fruit>;
|
|
||||||
}
|
|
||||||
|
|
||||||
pub struct SegmentCollectorWrapper<TSegmentCollector: SegmentCollector>(TSegmentCollector);
|
|
||||||
|
|
||||||
impl<TSegmentCollector: SegmentCollector> BoxableSegmentCollector
|
|
||||||
for SegmentCollectorWrapper<TSegmentCollector>
|
|
||||||
{
|
|
||||||
fn collect(&mut self, doc: u32, score: f32) {
|
|
||||||
self.0.collect(doc, score);
|
|
||||||
}
|
|
||||||
|
|
||||||
fn harvest_from_box(self: Box<Self>) -> Box<Fruit> {
|
|
||||||
Box::new(self.0.harvest())
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
pub struct FruitHandle<TFruit: Fruit> {
|
|
||||||
pos: usize,
|
|
||||||
_phantom: PhantomData<TFruit>,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl<TFruit: Fruit> FruitHandle<TFruit> {
|
|
||||||
pub fn extract(self, fruits: &mut MultiFruit) -> TFruit {
|
|
||||||
let boxed_fruit = fruits.sub_fruits[self.pos].take().expect("");
|
|
||||||
*boxed_fruit
|
|
||||||
.downcast::<TFruit>()
|
|
||||||
.map_err(|_| ())
|
|
||||||
.expect("Failed to downcast collector fruit.")
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Multicollector makes it possible to collect on more than one collector.
|
/// Multicollector makes it possible to collect on more than one collector.
|
||||||
/// It should only be used for use cases where the Collector types is unknown
|
/// It should only be used for use cases where the Collector types is unknown
|
||||||
/// at compile time.
|
/// at compile time.
|
||||||
///
|
/// If the type of the collectors is known, you should prefer to use `ChainedCollector`.
|
||||||
/// If the type of the collectors is known, you can just group yours collectors
|
|
||||||
/// in a tuple. See the
|
|
||||||
/// [Combining several collectors section of the collector documentation](./index.html#combining-several-collectors).
|
|
||||||
///
|
///
|
||||||
/// ```rust
|
/// ```rust
|
||||||
/// #[macro_use]
|
/// #[macro_use]
|
||||||
/// extern crate tantivy;
|
/// extern crate tantivy;
|
||||||
/// use tantivy::schema::{Schema, TEXT};
|
/// use tantivy::schema::{SchemaBuilder, TEXT};
|
||||||
/// use tantivy::{Index, Result};
|
/// use tantivy::{Index, Result};
|
||||||
/// use tantivy::collector::{Count, TopDocs, MultiCollector};
|
/// use tantivy::collector::{CountCollector, TopCollector, MultiCollector};
|
||||||
/// use tantivy::query::QueryParser;
|
/// use tantivy::query::QueryParser;
|
||||||
///
|
///
|
||||||
/// # fn main() { example().unwrap(); }
|
/// # fn main() { example().unwrap(); }
|
||||||
/// fn example() -> Result<()> {
|
/// fn example() -> Result<()> {
|
||||||
/// let mut schema_builder = Schema::builder();
|
/// let mut schema_builder = SchemaBuilder::new();
|
||||||
/// let title = schema_builder.add_text_field("title", TEXT);
|
/// let title = schema_builder.add_text_field("title", TEXT);
|
||||||
/// let schema = schema_builder.build();
|
/// let schema = schema_builder.build();
|
||||||
/// let index = Index::create_in_ram(schema);
|
/// let index = Index::create_in_ram(schema);
|
||||||
@@ -134,118 +41,58 @@ impl<TFruit: Fruit> FruitHandle<TFruit> {
|
|||||||
/// index_writer.commit().unwrap();
|
/// index_writer.commit().unwrap();
|
||||||
/// }
|
/// }
|
||||||
///
|
///
|
||||||
/// let reader = index.reader()?;
|
/// index.load_searchers()?;
|
||||||
/// let searcher = reader.searcher();
|
/// let searcher = index.searcher();
|
||||||
///
|
///
|
||||||
/// let mut collectors = MultiCollector::new();
|
/// {
|
||||||
/// let top_docs_handle = collectors.add_collector(TopDocs::with_limit(2));
|
/// let mut top_collector = TopCollector::with_limit(2);
|
||||||
/// let count_handle = collectors.add_collector(Count);
|
/// let mut count_collector = CountCollector::default();
|
||||||
/// let query_parser = QueryParser::for_index(&index, vec![title]);
|
/// {
|
||||||
/// let query = query_parser.parse_query("diary")?;
|
/// let mut collectors =
|
||||||
/// let mut multi_fruit = searcher.search(&query, &collectors)?;
|
/// MultiCollector::from(vec![&mut top_collector, &mut count_collector]);
|
||||||
///
|
/// let query_parser = QueryParser::for_index(&index, vec![title]);
|
||||||
/// let count = count_handle.extract(&mut multi_fruit);
|
/// let query = query_parser.parse_query("diary")?;
|
||||||
/// let top_docs = top_docs_handle.extract(&mut multi_fruit);
|
/// searcher.search(&*query, &mut collectors).unwrap();
|
||||||
///
|
/// }
|
||||||
/// # assert_eq!(count, 2);
|
/// assert_eq!(count_collector.count(), 2);
|
||||||
/// # assert_eq!(top_docs.len(), 2);
|
/// assert!(top_collector.at_capacity());
|
||||||
|
/// }
|
||||||
///
|
///
|
||||||
/// Ok(())
|
/// Ok(())
|
||||||
/// }
|
/// }
|
||||||
/// ```
|
/// ```
|
||||||
#[allow(clippy::type_complexity)]
|
|
||||||
#[derive(Default)]
|
|
||||||
pub struct MultiCollector<'a> {
|
pub struct MultiCollector<'a> {
|
||||||
collector_wrappers:
|
collectors: Vec<&'a mut Collector>,
|
||||||
Vec<Box<Collector<Child = Box<BoxableSegmentCollector>, Fruit = Box<Fruit>> + 'a>>,
|
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<'a> MultiCollector<'a> {
|
impl<'a> MultiCollector<'a> {
|
||||||
/// Create a new `MultiCollector`
|
/// Constructor
|
||||||
pub fn new() -> Self {
|
pub fn from(collectors: Vec<&'a mut Collector>) -> MultiCollector {
|
||||||
Default::default()
|
MultiCollector { collectors }
|
||||||
}
|
|
||||||
|
|
||||||
/// Add a new collector to our `MultiCollector`.
|
|
||||||
pub fn add_collector<'b: 'a, TCollector: Collector + 'b>(
|
|
||||||
&mut self,
|
|
||||||
collector: TCollector,
|
|
||||||
) -> FruitHandle<TCollector::Fruit> {
|
|
||||||
let pos = self.collector_wrappers.len();
|
|
||||||
self.collector_wrappers
|
|
||||||
.push(Box::new(CollectorWrapper(collector)));
|
|
||||||
FruitHandle {
|
|
||||||
pos,
|
|
||||||
_phantom: PhantomData,
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<'a> Collector for MultiCollector<'a> {
|
impl<'a> Collector for MultiCollector<'a> {
|
||||||
type Fruit = MultiFruit;
|
fn set_segment(
|
||||||
type Child = MultiCollectorChild;
|
&mut self,
|
||||||
|
|
||||||
fn for_segment(
|
|
||||||
&self,
|
|
||||||
segment_local_id: SegmentLocalId,
|
segment_local_id: SegmentLocalId,
|
||||||
segment: &SegmentReader,
|
segment: &SegmentReader,
|
||||||
) -> Result<MultiCollectorChild> {
|
) -> Result<()> {
|
||||||
let children = self
|
for collector in &mut self.collectors {
|
||||||
.collector_wrappers
|
collector.set_segment(segment_local_id, segment)?;
|
||||||
.iter()
|
|
||||||
.map(|collector_wrapper| collector_wrapper.for_segment(segment_local_id, segment))
|
|
||||||
.collect::<Result<Vec<_>>>()?;
|
|
||||||
Ok(MultiCollectorChild { children })
|
|
||||||
}
|
|
||||||
|
|
||||||
fn requires_scoring(&self) -> bool {
|
|
||||||
self.collector_wrappers.iter().any(|c| c.requires_scoring())
|
|
||||||
}
|
|
||||||
|
|
||||||
fn merge_fruits(&self, segments_multifruits: Vec<MultiFruit>) -> Result<MultiFruit> {
|
|
||||||
let mut segment_fruits_list: Vec<Vec<Box<Fruit>>> = (0..self.collector_wrappers.len())
|
|
||||||
.map(|_| Vec::with_capacity(segments_multifruits.len()))
|
|
||||||
.collect::<Vec<_>>();
|
|
||||||
for segment_multifruit in segments_multifruits {
|
|
||||||
for (idx, segment_fruit_opt) in segment_multifruit.sub_fruits.into_iter().enumerate() {
|
|
||||||
if let Some(segment_fruit) = segment_fruit_opt {
|
|
||||||
segment_fruits_list[idx].push(segment_fruit);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
let sub_fruits = self
|
Ok(())
|
||||||
.collector_wrappers
|
|
||||||
.iter()
|
|
||||||
.zip(segment_fruits_list)
|
|
||||||
.map(|(child_collector, segment_fruits)| {
|
|
||||||
Ok(Some(child_collector.merge_fruits(segment_fruits)?))
|
|
||||||
})
|
|
||||||
.collect::<Result<_>>()?;
|
|
||||||
Ok(MultiFruit { sub_fruits })
|
|
||||||
}
|
}
|
||||||
}
|
|
||||||
|
|
||||||
pub struct MultiCollectorChild {
|
|
||||||
children: Vec<Box<BoxableSegmentCollector>>,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl SegmentCollector for MultiCollectorChild {
|
|
||||||
type Fruit = MultiFruit;
|
|
||||||
|
|
||||||
fn collect(&mut self, doc: DocId, score: Score) {
|
fn collect(&mut self, doc: DocId, score: Score) {
|
||||||
for child in &mut self.children {
|
for collector in &mut self.collectors {
|
||||||
child.collect(doc, score);
|
collector.collect(doc, score);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
fn requires_scoring(&self) -> bool {
|
||||||
fn harvest(self) -> MultiFruit {
|
self.collectors
|
||||||
MultiFruit {
|
.iter()
|
||||||
sub_fruits: self
|
.any(|collector| collector.requires_scoring())
|
||||||
.children
|
|
||||||
.into_iter()
|
|
||||||
.map(|child| Some(child.harvest()))
|
|
||||||
.collect(),
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -253,41 +100,20 @@ impl SegmentCollector for MultiCollectorChild {
|
|||||||
mod tests {
|
mod tests {
|
||||||
|
|
||||||
use super::*;
|
use super::*;
|
||||||
use collector::{Count, TopDocs};
|
use collector::{Collector, CountCollector, TopScoreCollector};
|
||||||
use query::TermQuery;
|
|
||||||
use schema::IndexRecordOption;
|
|
||||||
use schema::{Schema, TEXT};
|
|
||||||
use Index;
|
|
||||||
use Term;
|
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn test_multi_collector() {
|
fn test_multi_collector() {
|
||||||
let mut schema_builder = Schema::builder();
|
let mut top_collector = TopScoreCollector::with_limit(2);
|
||||||
let text = schema_builder.add_text_field("text", TEXT);
|
let mut count_collector = CountCollector::default();
|
||||||
let schema = schema_builder.build();
|
|
||||||
|
|
||||||
let index = Index::create_in_ram(schema);
|
|
||||||
{
|
{
|
||||||
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
|
let mut collectors =
|
||||||
index_writer.add_document(doc!(text=>"abc"));
|
MultiCollector::from(vec![&mut top_collector, &mut count_collector]);
|
||||||
index_writer.add_document(doc!(text=>"abc abc abc"));
|
collectors.collect(1, 0.2);
|
||||||
index_writer.add_document(doc!(text=>"abc abc"));
|
collectors.collect(2, 0.1);
|
||||||
index_writer.commit().unwrap();
|
collectors.collect(3, 0.5);
|
||||||
index_writer.add_document(doc!(text=>""));
|
|
||||||
index_writer.add_document(doc!(text=>"abc abc abc abc"));
|
|
||||||
index_writer.add_document(doc!(text=>"abc"));
|
|
||||||
index_writer.commit().unwrap();
|
|
||||||
}
|
}
|
||||||
let searcher = index.reader().unwrap().searcher();
|
assert_eq!(count_collector.count(), 3);
|
||||||
let term = Term::from_field_text(text, "abc");
|
assert!(top_collector.at_capacity());
|
||||||
let query = TermQuery::new(term, IndexRecordOption::Basic);
|
|
||||||
|
|
||||||
let mut collectors = MultiCollector::new();
|
|
||||||
let topdocs_handler = collectors.add_collector(TopDocs::with_limit(2));
|
|
||||||
let count_handler = collectors.add_collector(Count);
|
|
||||||
let mut multifruits = searcher.search(&query, &mut collectors).unwrap();
|
|
||||||
|
|
||||||
assert_eq!(count_handler.extract(&mut multifruits), 5);
|
|
||||||
assert_eq!(topdocs_handler.extract(&mut multifruits).len(), 2);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,201 +0,0 @@
|
|||||||
use super::*;
|
|
||||||
use core::SegmentReader;
|
|
||||||
use fastfield::BytesFastFieldReader;
|
|
||||||
use fastfield::FastFieldReader;
|
|
||||||
use schema::Field;
|
|
||||||
use DocAddress;
|
|
||||||
use DocId;
|
|
||||||
use Score;
|
|
||||||
use SegmentLocalId;
|
|
||||||
|
|
||||||
/// Stores all of the doc ids.
|
|
||||||
/// This collector is only used for tests.
|
|
||||||
/// It is unusable in pr
|
|
||||||
///
|
|
||||||
/// actise, as it does not store
|
|
||||||
/// the segment ordinals
|
|
||||||
pub struct TestCollector;
|
|
||||||
|
|
||||||
pub struct TestSegmentCollector {
|
|
||||||
segment_id: SegmentLocalId,
|
|
||||||
fruit: TestFruit,
|
|
||||||
}
|
|
||||||
|
|
||||||
#[derive(Default)]
|
|
||||||
pub struct TestFruit {
|
|
||||||
docs: Vec<DocAddress>,
|
|
||||||
scores: Vec<Score>,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl TestFruit {
|
|
||||||
/// Return the list of matching documents exhaustively.
|
|
||||||
pub fn docs(&self) -> &[DocAddress] {
|
|
||||||
&self.docs[..]
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn scores(&self) -> &[Score] {
|
|
||||||
&self.scores[..]
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl Collector for TestCollector {
|
|
||||||
type Fruit = TestFruit;
|
|
||||||
type Child = TestSegmentCollector;
|
|
||||||
|
|
||||||
fn for_segment(
|
|
||||||
&self,
|
|
||||||
segment_id: SegmentLocalId,
|
|
||||||
_reader: &SegmentReader,
|
|
||||||
) -> Result<TestSegmentCollector> {
|
|
||||||
Ok(TestSegmentCollector {
|
|
||||||
segment_id,
|
|
||||||
fruit: TestFruit::default(),
|
|
||||||
})
|
|
||||||
}
|
|
||||||
|
|
||||||
fn requires_scoring(&self) -> bool {
|
|
||||||
true
|
|
||||||
}
|
|
||||||
|
|
||||||
fn merge_fruits(&self, mut children: Vec<TestFruit>) -> Result<TestFruit> {
|
|
||||||
children.sort_by_key(|fruit| {
|
|
||||||
if fruit.docs().is_empty() {
|
|
||||||
0
|
|
||||||
} else {
|
|
||||||
fruit.docs()[0].segment_ord()
|
|
||||||
}
|
|
||||||
});
|
|
||||||
let mut docs = vec![];
|
|
||||||
let mut scores = vec![];
|
|
||||||
for child in children {
|
|
||||||
docs.extend(child.docs());
|
|
||||||
scores.extend(child.scores);
|
|
||||||
}
|
|
||||||
Ok(TestFruit { docs, scores })
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl SegmentCollector for TestSegmentCollector {
|
|
||||||
type Fruit = TestFruit;
|
|
||||||
|
|
||||||
fn collect(&mut self, doc: DocId, score: Score) {
|
|
||||||
self.fruit.docs.push(DocAddress(self.segment_id, doc));
|
|
||||||
self.fruit.scores.push(score);
|
|
||||||
}
|
|
||||||
|
|
||||||
fn harvest(self) -> <Self as SegmentCollector>::Fruit {
|
|
||||||
self.fruit
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Collects in order all of the fast fields for all of the
|
|
||||||
/// doc in the `DocSet`
|
|
||||||
///
|
|
||||||
/// This collector is mainly useful for tests.
|
|
||||||
pub struct FastFieldTestCollector {
|
|
||||||
field: Field,
|
|
||||||
}
|
|
||||||
|
|
||||||
pub struct FastFieldSegmentCollector {
|
|
||||||
vals: Vec<u64>,
|
|
||||||
reader: FastFieldReader<u64>,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl FastFieldTestCollector {
|
|
||||||
pub fn for_field(field: Field) -> FastFieldTestCollector {
|
|
||||||
FastFieldTestCollector { field }
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl Collector for FastFieldTestCollector {
|
|
||||||
type Fruit = Vec<u64>;
|
|
||||||
type Child = FastFieldSegmentCollector;
|
|
||||||
|
|
||||||
fn for_segment(
|
|
||||||
&self,
|
|
||||||
_: SegmentLocalId,
|
|
||||||
reader: &SegmentReader,
|
|
||||||
) -> Result<FastFieldSegmentCollector> {
|
|
||||||
Ok(FastFieldSegmentCollector {
|
|
||||||
vals: Vec::new(),
|
|
||||||
reader: reader.fast_field_reader(self.field)?,
|
|
||||||
})
|
|
||||||
}
|
|
||||||
|
|
||||||
fn requires_scoring(&self) -> bool {
|
|
||||||
false
|
|
||||||
}
|
|
||||||
|
|
||||||
fn merge_fruits(&self, children: Vec<Vec<u64>>) -> Result<Vec<u64>> {
|
|
||||||
Ok(children.into_iter().flat_map(|v| v.into_iter()).collect())
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl SegmentCollector for FastFieldSegmentCollector {
|
|
||||||
type Fruit = Vec<u64>;
|
|
||||||
|
|
||||||
fn collect(&mut self, doc: DocId, _score: Score) {
|
|
||||||
let val = self.reader.get(doc);
|
|
||||||
self.vals.push(val);
|
|
||||||
}
|
|
||||||
|
|
||||||
fn harvest(self) -> Vec<u64> {
|
|
||||||
self.vals
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Collects in order all of the fast field bytes for all of the
|
|
||||||
/// docs in the `DocSet`
|
|
||||||
///
|
|
||||||
/// This collector is mainly useful for tests.
|
|
||||||
pub struct BytesFastFieldTestCollector {
|
|
||||||
field: Field,
|
|
||||||
}
|
|
||||||
|
|
||||||
pub struct BytesFastFieldSegmentCollector {
|
|
||||||
vals: Vec<u8>,
|
|
||||||
reader: BytesFastFieldReader,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl BytesFastFieldTestCollector {
|
|
||||||
pub fn for_field(field: Field) -> BytesFastFieldTestCollector {
|
|
||||||
BytesFastFieldTestCollector { field }
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl Collector for BytesFastFieldTestCollector {
|
|
||||||
type Fruit = Vec<u8>;
|
|
||||||
type Child = BytesFastFieldSegmentCollector;
|
|
||||||
|
|
||||||
fn for_segment(
|
|
||||||
&self,
|
|
||||||
_segment_local_id: u32,
|
|
||||||
segment: &SegmentReader,
|
|
||||||
) -> Result<BytesFastFieldSegmentCollector> {
|
|
||||||
Ok(BytesFastFieldSegmentCollector {
|
|
||||||
vals: Vec::new(),
|
|
||||||
reader: segment.bytes_fast_field_reader(self.field)?,
|
|
||||||
})
|
|
||||||
}
|
|
||||||
|
|
||||||
fn requires_scoring(&self) -> bool {
|
|
||||||
false
|
|
||||||
}
|
|
||||||
|
|
||||||
fn merge_fruits(&self, children: Vec<Vec<u8>>) -> Result<Vec<u8>> {
|
|
||||||
Ok(children.into_iter().flat_map(|c| c.into_iter()).collect())
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl SegmentCollector for BytesFastFieldSegmentCollector {
|
|
||||||
type Fruit = Vec<u8>;
|
|
||||||
|
|
||||||
fn collect(&mut self, doc: u32, _score: f32) {
|
|
||||||
let data = self.reader.get_val(doc);
|
|
||||||
self.vals.extend(data);
|
|
||||||
}
|
|
||||||
|
|
||||||
fn harvest(self) -> <Self as SegmentCollector>::Fruit {
|
|
||||||
self.vals
|
|
||||||
}
|
|
||||||
}
|
|
||||||
@@ -1,59 +1,56 @@
|
|||||||
use serde::export::PhantomData;
|
|
||||||
use std::cmp::Ordering;
|
use std::cmp::Ordering;
|
||||||
use std::collections::BinaryHeap;
|
use std::collections::BinaryHeap;
|
||||||
use DocAddress;
|
use DocAddress;
|
||||||
use DocId;
|
use DocId;
|
||||||
use Result;
|
|
||||||
use SegmentLocalId;
|
use SegmentLocalId;
|
||||||
use SegmentReader;
|
|
||||||
|
|
||||||
/// Contains a feature (field, score, etc.) of a document along with the document address.
|
/// Contains a feature (field, score, etc.) of a document along with the document address.
|
||||||
///
|
///
|
||||||
/// It has a custom implementation of `PartialOrd` that reverses the order. This is because the
|
/// It has a custom implementation of `PartialOrd` that reverses the order. This is because the
|
||||||
/// default Rust heap is a max heap, whereas a min heap is needed.
|
/// default Rust heap is a max heap, whereas a min heap is needed.
|
||||||
///
|
#[derive(Clone, Copy)]
|
||||||
/// WARNING: equality is not what you would expect here.
|
pub struct ComparableDoc<T> {
|
||||||
/// Two elements are equal if their feature is equal, and regardless of whether `doc`
|
|
||||||
/// is equal. This should be perfectly fine for this usage, but let's make sure this
|
|
||||||
/// struct is never public.
|
|
||||||
struct ComparableDoc<T, D> {
|
|
||||||
feature: T,
|
feature: T,
|
||||||
doc: D,
|
doc_address: DocAddress,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<T: PartialOrd, D> PartialOrd for ComparableDoc<T, D> {
|
impl<T: PartialOrd> PartialOrd for ComparableDoc<T> {
|
||||||
fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
|
fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
|
||||||
Some(self.cmp(other))
|
Some(self.cmp(other))
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<T: PartialOrd, D> Ord for ComparableDoc<T, D> {
|
impl<T: PartialOrd> Ord for ComparableDoc<T> {
|
||||||
#[inline]
|
#[inline]
|
||||||
fn cmp(&self, other: &Self) -> Ordering {
|
fn cmp(&self, other: &Self) -> Ordering {
|
||||||
other
|
other
|
||||||
.feature
|
.feature
|
||||||
.partial_cmp(&self.feature)
|
.partial_cmp(&self.feature)
|
||||||
.unwrap_or_else(|| Ordering::Equal)
|
.unwrap_or_else(|| other.doc_address.cmp(&self.doc_address))
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<T: PartialOrd, D> PartialEq for ComparableDoc<T, D> {
|
impl<T: PartialOrd> PartialEq for ComparableDoc<T> {
|
||||||
fn eq(&self, other: &Self) -> bool {
|
fn eq(&self, other: &Self) -> bool {
|
||||||
self.cmp(other) == Ordering::Equal
|
self.cmp(other) == Ordering::Equal
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<T: PartialOrd, D> Eq for ComparableDoc<T, D> {}
|
impl<T: PartialOrd> Eq for ComparableDoc<T> {}
|
||||||
|
|
||||||
pub(crate) struct TopCollector<T> {
|
/// The Top Collector keeps track of the K documents
|
||||||
|
/// sorted by type `T`.
|
||||||
|
///
|
||||||
|
/// The implementation is based on a `BinaryHeap`.
|
||||||
|
/// The theorical complexity for collecting the top `K` out of `n` documents
|
||||||
|
/// is `O(n log K)`.
|
||||||
|
pub struct TopCollector<T> {
|
||||||
limit: usize,
|
limit: usize,
|
||||||
_marker: PhantomData<T>,
|
heap: BinaryHeap<ComparableDoc<T>>,
|
||||||
|
segment_id: u32,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<T> TopCollector<T>
|
impl<T: PartialOrd + Clone> TopCollector<T> {
|
||||||
where
|
|
||||||
T: PartialOrd + Clone,
|
|
||||||
{
|
|
||||||
/// Creates a top collector, with a number of documents equal to "limit".
|
/// Creates a top collector, with a number of documents equal to "limit".
|
||||||
///
|
///
|
||||||
/// # Panics
|
/// # Panics
|
||||||
@@ -63,157 +60,128 @@ where
|
|||||||
panic!("Limit must be strictly greater than 0.");
|
panic!("Limit must be strictly greater than 0.");
|
||||||
}
|
}
|
||||||
TopCollector {
|
TopCollector {
|
||||||
limit,
|
|
||||||
_marker: PhantomData,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn limit(&self) -> usize {
|
|
||||||
self.limit
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn merge_fruits(
|
|
||||||
&self,
|
|
||||||
children: Vec<Vec<(T, DocAddress)>>,
|
|
||||||
) -> Result<Vec<(T, DocAddress)>> {
|
|
||||||
if self.limit == 0 {
|
|
||||||
return Ok(Vec::new());
|
|
||||||
}
|
|
||||||
let mut top_collector = BinaryHeap::new();
|
|
||||||
for child_fruit in children {
|
|
||||||
for (feature, doc) in child_fruit {
|
|
||||||
if top_collector.len() < self.limit {
|
|
||||||
top_collector.push(ComparableDoc { feature, doc });
|
|
||||||
} else if let Some(mut head) = top_collector.peek_mut() {
|
|
||||||
if head.feature < feature {
|
|
||||||
*head = ComparableDoc { feature, doc };
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
Ok(top_collector
|
|
||||||
.into_sorted_vec()
|
|
||||||
.into_iter()
|
|
||||||
.map(|cdoc| (cdoc.feature, cdoc.doc))
|
|
||||||
.collect())
|
|
||||||
}
|
|
||||||
|
|
||||||
pub(crate) fn for_segment(
|
|
||||||
&self,
|
|
||||||
segment_id: SegmentLocalId,
|
|
||||||
_: &SegmentReader,
|
|
||||||
) -> Result<TopSegmentCollector<T>> {
|
|
||||||
Ok(TopSegmentCollector::new(segment_id, self.limit))
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/// The Top Collector keeps track of the K documents
|
|
||||||
/// sorted by type `T`.
|
|
||||||
///
|
|
||||||
/// The implementation is based on a `BinaryHeap`.
|
|
||||||
/// The theorical complexity for collecting the top `K` out of `n` documents
|
|
||||||
/// is `O(n log K)`.
|
|
||||||
pub(crate) struct TopSegmentCollector<T> {
|
|
||||||
limit: usize,
|
|
||||||
heap: BinaryHeap<ComparableDoc<T, DocId>>,
|
|
||||||
segment_id: u32,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl<T: PartialOrd> TopSegmentCollector<T> {
|
|
||||||
fn new(segment_id: SegmentLocalId, limit: usize) -> TopSegmentCollector<T> {
|
|
||||||
TopSegmentCollector {
|
|
||||||
limit,
|
limit,
|
||||||
heap: BinaryHeap::with_capacity(limit),
|
heap: BinaryHeap::with_capacity(limit),
|
||||||
segment_id,
|
segment_id: 0,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
|
||||||
|
|
||||||
impl<T: PartialOrd + Clone> TopSegmentCollector<T> {
|
/// Returns K best documents sorted in decreasing order.
|
||||||
pub fn harvest(self) -> Vec<(T, DocAddress)> {
|
///
|
||||||
let segment_id = self.segment_id;
|
/// Calling this method triggers the sort.
|
||||||
self.heap
|
/// The result of the sort is not cached.
|
||||||
.into_sorted_vec()
|
pub fn docs(&self) -> Vec<DocAddress> {
|
||||||
|
self.top_docs()
|
||||||
.into_iter()
|
.into_iter()
|
||||||
.map(|comparable_doc| {
|
.map(|(_feature, doc)| doc)
|
||||||
(
|
|
||||||
comparable_doc.feature,
|
|
||||||
DocAddress(segment_id, comparable_doc.doc),
|
|
||||||
)
|
|
||||||
})
|
|
||||||
.collect()
|
.collect()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Returns K best FeatureDocuments sorted in decreasing order.
|
||||||
|
///
|
||||||
|
/// Calling this method triggers the sort.
|
||||||
|
/// The result of the sort is not cached.
|
||||||
|
pub fn top_docs(&self) -> Vec<(T, DocAddress)> {
|
||||||
|
let mut feature_docs: Vec<ComparableDoc<T>> = self.heap.iter().cloned().collect();
|
||||||
|
feature_docs.sort();
|
||||||
|
feature_docs
|
||||||
|
.into_iter()
|
||||||
|
.map(
|
||||||
|
|ComparableDoc {
|
||||||
|
feature,
|
||||||
|
doc_address,
|
||||||
|
}| (feature, doc_address),
|
||||||
|
).collect()
|
||||||
|
}
|
||||||
|
|
||||||
/// Return true iff at least K documents have gone through
|
/// Return true iff at least K documents have gone through
|
||||||
/// the collector.
|
/// the collector.
|
||||||
#[inline(always)]
|
#[inline]
|
||||||
pub(crate) fn at_capacity(&self) -> bool {
|
pub fn at_capacity(&self) -> bool {
|
||||||
self.heap.len() >= self.limit
|
self.heap.len() >= self.limit
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Sets the segment local ID for the collector
|
||||||
|
pub fn set_segment_id(&mut self, segment_id: SegmentLocalId) {
|
||||||
|
self.segment_id = segment_id;
|
||||||
|
}
|
||||||
|
|
||||||
/// Collects a document scored by the given feature
|
/// Collects a document scored by the given feature
|
||||||
///
|
///
|
||||||
/// It collects documents until it has reached the max capacity. Once it reaches capacity, it
|
/// It collects documents until it has reached the max capacity. Once it reaches capacity, it
|
||||||
/// will compare the lowest scoring item with the given one and keep whichever is greater.
|
/// will compare the lowest scoring item with the given one and keep whichever is greater.
|
||||||
#[inline(always)]
|
|
||||||
pub fn collect(&mut self, doc: DocId, feature: T) {
|
pub fn collect(&mut self, doc: DocId, feature: T) {
|
||||||
if self.at_capacity() {
|
if self.at_capacity() {
|
||||||
// It's ok to unwrap as long as a limit of 0 is forbidden.
|
// It's ok to unwrap as long as a limit of 0 is forbidden.
|
||||||
if let Some(limit_feature) = self.heap.peek().map(|head| head.feature.clone()) {
|
let limit_doc: ComparableDoc<T> = self
|
||||||
if limit_feature < feature {
|
.heap
|
||||||
if let Some(mut head) = self.heap.peek_mut() {
|
.peek()
|
||||||
head.feature = feature;
|
.expect("Top collector with size 0 is forbidden")
|
||||||
head.doc = doc;
|
.clone();
|
||||||
}
|
if limit_doc.feature < feature {
|
||||||
}
|
let mut mut_head = self
|
||||||
|
.heap
|
||||||
|
.peek_mut()
|
||||||
|
.expect("Top collector with size 0 is forbidden");
|
||||||
|
mut_head.feature = feature;
|
||||||
|
mut_head.doc_address = DocAddress(self.segment_id, doc);
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
// we have not reached capacity yet, so we can just push the
|
let wrapped_doc = ComparableDoc {
|
||||||
// element.
|
feature,
|
||||||
self.heap.push(ComparableDoc { feature, doc });
|
doc_address: DocAddress(self.segment_id, doc),
|
||||||
|
};
|
||||||
|
self.heap.push(wrapped_doc);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
mod tests {
|
mod tests {
|
||||||
use super::{TopCollector, TopSegmentCollector};
|
use super::*;
|
||||||
use DocAddress;
|
use DocId;
|
||||||
use Score;
|
use Score;
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn test_top_collector_not_at_capacity() {
|
fn test_top_collector_not_at_capacity() {
|
||||||
let mut top_collector = TopSegmentCollector::new(0, 4);
|
let mut top_collector = TopCollector::with_limit(4);
|
||||||
top_collector.collect(1, 0.8);
|
top_collector.collect(1, 0.8);
|
||||||
top_collector.collect(3, 0.2);
|
top_collector.collect(3, 0.2);
|
||||||
top_collector.collect(5, 0.3);
|
top_collector.collect(5, 0.3);
|
||||||
assert_eq!(
|
assert!(!top_collector.at_capacity());
|
||||||
top_collector.harvest(),
|
let score_docs: Vec<(Score, DocId)> = top_collector
|
||||||
vec![
|
.top_docs()
|
||||||
(0.8, DocAddress(0, 1)),
|
.into_iter()
|
||||||
(0.3, DocAddress(0, 5)),
|
.map(|(score, doc_address)| (score, doc_address.doc()))
|
||||||
(0.2, DocAddress(0, 3))
|
.collect();
|
||||||
]
|
assert_eq!(score_docs, vec![(0.8, 1), (0.3, 5), (0.2, 3)]);
|
||||||
);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn test_top_collector_at_capacity() {
|
fn test_top_collector_at_capacity() {
|
||||||
let mut top_collector = TopSegmentCollector::new(0, 4);
|
let mut top_collector = TopCollector::with_limit(4);
|
||||||
top_collector.collect(1, 0.8);
|
top_collector.collect(1, 0.8);
|
||||||
top_collector.collect(3, 0.2);
|
top_collector.collect(3, 0.2);
|
||||||
top_collector.collect(5, 0.3);
|
top_collector.collect(5, 0.3);
|
||||||
top_collector.collect(7, 0.9);
|
top_collector.collect(7, 0.9);
|
||||||
top_collector.collect(9, -0.2);
|
top_collector.collect(9, -0.2);
|
||||||
assert_eq!(
|
assert!(top_collector.at_capacity());
|
||||||
top_collector.harvest(),
|
{
|
||||||
vec![
|
let score_docs: Vec<(Score, DocId)> = top_collector
|
||||||
(0.9, DocAddress(0, 7)),
|
.top_docs()
|
||||||
(0.8, DocAddress(0, 1)),
|
.into_iter()
|
||||||
(0.3, DocAddress(0, 5)),
|
.map(|(score, doc_address)| (score, doc_address.doc()))
|
||||||
(0.2, DocAddress(0, 3))
|
.collect();
|
||||||
]
|
assert_eq!(score_docs, vec![(0.9, 7), (0.8, 1), (0.3, 5), (0.2, 3)]);
|
||||||
);
|
}
|
||||||
|
{
|
||||||
|
let docs: Vec<DocId> = top_collector
|
||||||
|
.docs()
|
||||||
|
.into_iter()
|
||||||
|
.map(|doc_address| doc_address.doc())
|
||||||
|
.collect();
|
||||||
|
assert_eq!(docs, vec![7, 1, 5, 3]);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
@@ -221,4 +189,5 @@ mod tests {
|
|||||||
fn test_top_0() {
|
fn test_top_0() {
|
||||||
let _collector: TopCollector<Score> = TopCollector::with_limit(0);
|
let _collector: TopCollector<Score> = TopCollector::with_limit(0);
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,13 +1,12 @@
|
|||||||
use super::Collector;
|
use super::Collector;
|
||||||
use collector::top_collector::TopCollector;
|
use collector::top_collector::TopCollector;
|
||||||
use collector::top_collector::TopSegmentCollector;
|
|
||||||
use collector::SegmentCollector;
|
|
||||||
use fastfield::FastFieldReader;
|
use fastfield::FastFieldReader;
|
||||||
use fastfield::FastValue;
|
use fastfield::FastValue;
|
||||||
use schema::Field;
|
use schema::Field;
|
||||||
use DocAddress;
|
use DocAddress;
|
||||||
|
use DocId;
|
||||||
use Result;
|
use Result;
|
||||||
use SegmentLocalId;
|
use Score;
|
||||||
use SegmentReader;
|
use SegmentReader;
|
||||||
|
|
||||||
/// The Top Field Collector keeps track of the K documents
|
/// The Top Field Collector keeps track of the K documents
|
||||||
@@ -20,139 +19,136 @@ use SegmentReader;
|
|||||||
/// ```rust
|
/// ```rust
|
||||||
/// #[macro_use]
|
/// #[macro_use]
|
||||||
/// extern crate tantivy;
|
/// extern crate tantivy;
|
||||||
/// # use tantivy::schema::{Schema, Field, FAST, TEXT};
|
/// use tantivy::schema::{SchemaBuilder, TEXT, FAST};
|
||||||
/// # use tantivy::{Index, Result, DocAddress};
|
/// use tantivy::{Index, Result, DocId};
|
||||||
/// # use tantivy::query::{Query, QueryParser};
|
/// use tantivy::collector::TopFieldCollector;
|
||||||
/// use tantivy::Searcher;
|
/// use tantivy::query::QueryParser;
|
||||||
/// use tantivy::collector::TopDocs;
|
|
||||||
///
|
///
|
||||||
/// # fn main() -> tantivy::Result<()> {
|
/// # fn main() { example().unwrap(); }
|
||||||
/// # let mut schema_builder = Schema::builder();
|
/// fn example() -> Result<()> {
|
||||||
/// # let title = schema_builder.add_text_field("title", TEXT);
|
/// let mut schema_builder = SchemaBuilder::new();
|
||||||
/// # let rating = schema_builder.add_u64_field("rating", FAST);
|
/// let title = schema_builder.add_text_field("title", TEXT);
|
||||||
/// # let schema = schema_builder.build();
|
/// let rating = schema_builder.add_u64_field("rating", FAST);
|
||||||
/// # let index = Index::create_in_ram(schema);
|
/// let schema = schema_builder.build();
|
||||||
/// # let mut index_writer = index.writer_with_num_threads(1, 3_000_000)?;
|
/// let index = Index::create_in_ram(schema);
|
||||||
/// # index_writer.add_document(doc!(
|
/// {
|
||||||
/// # title => "The Name of the Wind",
|
/// let mut index_writer = index.writer_with_num_threads(1, 3_000_000)?;
|
||||||
/// # rating => 92u64,
|
/// index_writer.add_document(doc!(
|
||||||
/// # ));
|
/// title => "The Name of the Wind",
|
||||||
/// # index_writer.add_document(doc!(title => "The Diary of Muadib", rating => 97u64));
|
/// rating => 92u64,
|
||||||
/// # index_writer.add_document(doc!(title => "A Dairy Cow", rating => 63u64));
|
/// ));
|
||||||
/// # index_writer.add_document(doc!(title => "The Diary of a Young Girl", rating => 80u64));
|
/// index_writer.add_document(doc!(
|
||||||
/// # index_writer.commit()?;
|
/// title => "The Diary of Muadib",
|
||||||
/// # let reader = index.reader()?;
|
/// rating => 97u64,
|
||||||
/// # let query = QueryParser::for_index(&index, vec![title]).parse_query("diary")?;
|
/// ));
|
||||||
/// # let top_docs = docs_sorted_by_rating(&reader.searcher(), &query, rating)?;
|
/// index_writer.add_document(doc!(
|
||||||
/// # assert_eq!(top_docs,
|
/// title => "A Dairy Cow",
|
||||||
/// # vec![(97u64, DocAddress(0u32, 1)),
|
/// rating => 63u64,
|
||||||
/// # (80u64, DocAddress(0u32, 3))]);
|
/// ));
|
||||||
/// # Ok(())
|
/// index_writer.add_document(doc!(
|
||||||
/// # }
|
/// title => "The Diary of a Young Girl",
|
||||||
/// #
|
/// rating => 80u64,
|
||||||
/// /// Searches the document matching the given query, and
|
/// ));
|
||||||
/// /// collects the top 10 documents, order by the `field`
|
/// index_writer.commit().unwrap();
|
||||||
/// /// given in argument.
|
/// }
|
||||||
/// ///
|
|
||||||
/// /// `field` is required to be a FAST field.
|
|
||||||
/// fn docs_sorted_by_rating(searcher: &Searcher,
|
|
||||||
/// query: &Query,
|
|
||||||
/// sort_by_field: Field)
|
|
||||||
/// -> Result<Vec<(u64, DocAddress)>> {
|
|
||||||
///
|
///
|
||||||
/// // This is where we build our collector!
|
/// index.load_searchers()?;
|
||||||
/// let top_docs_by_rating = TopDocs::with_limit(2).order_by_field(sort_by_field);
|
/// let searcher = index.searcher();
|
||||||
///
|
///
|
||||||
/// // ... and here is our documents. Not this is a simple vec.
|
/// {
|
||||||
/// // The `u64` in the pair is the value of our fast field for each documents.
|
/// let mut top_collector = TopFieldCollector::with_limit(rating, 2);
|
||||||
/// searcher.search(query, &top_docs_by_rating)
|
/// let query_parser = QueryParser::for_index(&index, vec![title]);
|
||||||
|
/// let query = query_parser.parse_query("diary")?;
|
||||||
|
/// searcher.search(&*query, &mut top_collector).unwrap();
|
||||||
|
///
|
||||||
|
/// let score_docs: Vec<(u64, DocId)> = top_collector
|
||||||
|
/// .top_docs()
|
||||||
|
/// .into_iter()
|
||||||
|
/// .map(|(field, doc_address)| (field, doc_address.doc()))
|
||||||
|
/// .collect();
|
||||||
|
///
|
||||||
|
/// assert_eq!(score_docs, vec![(97u64, 1), (80, 3)]);
|
||||||
|
/// }
|
||||||
|
///
|
||||||
|
/// Ok(())
|
||||||
/// }
|
/// }
|
||||||
/// ```
|
/// ```
|
||||||
pub struct TopDocsByField<T> {
|
pub struct TopFieldCollector<T: FastValue> {
|
||||||
collector: TopCollector<T>,
|
|
||||||
field: Field,
|
field: Field,
|
||||||
|
collector: TopCollector<T>,
|
||||||
|
fast_field: Option<FastFieldReader<T>>,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<T: FastValue + PartialOrd + Clone> TopDocsByField<T> {
|
impl<T: FastValue + PartialOrd + Clone> TopFieldCollector<T> {
|
||||||
/// Creates a top field collector, with a number of documents equal to "limit".
|
/// Creates a top field collector, with a number of documents equal to "limit".
|
||||||
///
|
///
|
||||||
/// The given field name must be a fast field, otherwise the collector have an error while
|
/// The given field name must be a fast field, otherwise the collector have an error while
|
||||||
/// collecting results.
|
/// collecting results.
|
||||||
///
|
///
|
||||||
/// This constructor is crate-private. Client are supposed to call
|
|
||||||
/// build `TopDocsByField` object using the `TopDocs` API.
|
|
||||||
///
|
|
||||||
/// e.g.:
|
|
||||||
/// `TopDocs::with_limit(2).order_by_field(sort_by_field)`
|
|
||||||
///
|
|
||||||
/// # Panics
|
/// # Panics
|
||||||
/// The method panics if limit is 0
|
/// The method panics if limit is 0
|
||||||
pub(crate) fn new(field: Field, limit: usize) -> TopDocsByField<T> {
|
pub fn with_limit(field: Field, limit: usize) -> Self {
|
||||||
TopDocsByField {
|
TopFieldCollector {
|
||||||
collector: TopCollector::with_limit(limit),
|
|
||||||
field,
|
field,
|
||||||
|
collector: TopCollector::with_limit(limit),
|
||||||
|
fast_field: None,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Returns K best documents sorted the given field name in decreasing order.
|
||||||
|
///
|
||||||
|
/// Calling this method triggers the sort.
|
||||||
|
/// The result of the sort is not cached.
|
||||||
|
pub fn docs(&self) -> Vec<DocAddress> {
|
||||||
|
self.collector.docs()
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Returns K best FieldDocuments sorted in decreasing order.
|
||||||
|
///
|
||||||
|
/// Calling this method triggers the sort.
|
||||||
|
/// The result of the sort is not cached.
|
||||||
|
pub fn top_docs(&self) -> Vec<(T, DocAddress)> {
|
||||||
|
self.collector.top_docs()
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Return true iff at least K documents have gone through
|
||||||
|
/// the collector.
|
||||||
|
#[inline]
|
||||||
|
pub fn at_capacity(&self) -> bool {
|
||||||
|
self.collector.at_capacity()
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<T: FastValue + PartialOrd + Send + Sync + 'static> Collector for TopDocsByField<T> {
|
impl<T: FastValue + PartialOrd + Clone> Collector for TopFieldCollector<T> {
|
||||||
type Fruit = Vec<(T, DocAddress)>;
|
fn set_segment(&mut self, segment_id: u32, segment: &SegmentReader) -> Result<()> {
|
||||||
|
self.collector.set_segment_id(segment_id);
|
||||||
|
self.fast_field = Some(segment.fast_field_reader(self.field)?);
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
type Child = TopFieldSegmentCollector<T>;
|
fn collect(&mut self, doc: DocId, _score: Score) {
|
||||||
|
let field_value = self
|
||||||
fn for_segment(
|
.fast_field
|
||||||
&self,
|
.as_ref()
|
||||||
segment_local_id: SegmentLocalId,
|
.expect("collect() was called before set_segment. This should never happen.")
|
||||||
reader: &SegmentReader,
|
.get(doc);
|
||||||
) -> Result<TopFieldSegmentCollector<T>> {
|
self.collector.collect(doc, field_value);
|
||||||
let collector = self.collector.for_segment(segment_local_id, reader)?;
|
|
||||||
let reader = reader.fast_field_reader(self.field)?;
|
|
||||||
Ok(TopFieldSegmentCollector { collector, reader })
|
|
||||||
}
|
}
|
||||||
|
|
||||||
fn requires_scoring(&self) -> bool {
|
fn requires_scoring(&self) -> bool {
|
||||||
false
|
false
|
||||||
}
|
}
|
||||||
|
|
||||||
fn merge_fruits(
|
|
||||||
&self,
|
|
||||||
segment_fruits: Vec<Vec<(T, DocAddress)>>,
|
|
||||||
) -> Result<Vec<(T, DocAddress)>> {
|
|
||||||
self.collector.merge_fruits(segment_fruits)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
pub struct TopFieldSegmentCollector<T: FastValue + PartialOrd> {
|
|
||||||
collector: TopSegmentCollector<T>,
|
|
||||||
reader: FastFieldReader<T>,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl<T: FastValue + PartialOrd + Send + Sync + 'static> SegmentCollector
|
|
||||||
for TopFieldSegmentCollector<T>
|
|
||||||
{
|
|
||||||
type Fruit = Vec<(T, DocAddress)>;
|
|
||||||
|
|
||||||
fn collect(&mut self, doc: u32, _score: f32) {
|
|
||||||
let field_value = self.reader.get(doc);
|
|
||||||
self.collector.collect(doc, field_value);
|
|
||||||
}
|
|
||||||
|
|
||||||
fn harvest(self) -> Vec<(T, DocAddress)> {
|
|
||||||
self.collector.harvest()
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
mod tests {
|
mod tests {
|
||||||
use super::TopDocsByField;
|
use super::*;
|
||||||
use collector::Collector;
|
|
||||||
use collector::TopDocs;
|
|
||||||
use query::Query;
|
use query::Query;
|
||||||
use query::QueryParser;
|
use query::QueryParser;
|
||||||
use schema::Field;
|
use schema::Field;
|
||||||
use schema::IntOptions;
|
use schema::IntOptions;
|
||||||
use schema::{Schema, FAST, TEXT};
|
use schema::Schema;
|
||||||
use DocAddress;
|
use schema::{SchemaBuilder, FAST, TEXT};
|
||||||
use Index;
|
use Index;
|
||||||
use IndexWriter;
|
use IndexWriter;
|
||||||
use TantivyError;
|
use TantivyError;
|
||||||
@@ -162,7 +158,7 @@ mod tests {
|
|||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn test_top_collector_not_at_capacity() {
|
fn test_top_collector_not_at_capacity() {
|
||||||
let mut schema_builder = Schema::builder();
|
let mut schema_builder = SchemaBuilder::new();
|
||||||
let title = schema_builder.add_text_field(TITLE, TEXT);
|
let title = schema_builder.add_text_field(TITLE, TEXT);
|
||||||
let size = schema_builder.add_u64_field(SIZE, FAST);
|
let size = schema_builder.add_u64_field(SIZE, FAST);
|
||||||
let schema = schema_builder.build();
|
let schema = schema_builder.build();
|
||||||
@@ -180,24 +176,24 @@ mod tests {
|
|||||||
size => 16u64,
|
size => 16u64,
|
||||||
));
|
));
|
||||||
});
|
});
|
||||||
let searcher = index.reader().unwrap().searcher();
|
let searcher = index.searcher();
|
||||||
|
|
||||||
let top_collector = TopDocs::with_limit(4).order_by_field(size);
|
let mut top_collector = TopFieldCollector::with_limit(size, 4);
|
||||||
let top_docs: Vec<(u64, DocAddress)> = searcher.search(&query, &top_collector).unwrap();
|
searcher.search(&*query, &mut top_collector).unwrap();
|
||||||
assert_eq!(
|
assert!(!top_collector.at_capacity());
|
||||||
top_docs,
|
|
||||||
vec![
|
let score_docs: Vec<(u64, DocId)> = top_collector
|
||||||
(64, DocAddress(0, 1)),
|
.top_docs()
|
||||||
(16, DocAddress(0, 2)),
|
.into_iter()
|
||||||
(12, DocAddress(0, 0))
|
.map(|(field, doc_address)| (field, doc_address.doc()))
|
||||||
]
|
.collect();
|
||||||
);
|
assert_eq!(score_docs, vec![(64, 1), (16, 2), (12, 0)]);
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
#[should_panic]
|
#[should_panic]
|
||||||
fn test_field_does_not_exist() {
|
fn test_field_does_not_exist() {
|
||||||
let mut schema_builder = Schema::builder();
|
let mut schema_builder = SchemaBuilder::new();
|
||||||
let title = schema_builder.add_text_field(TITLE, TEXT);
|
let title = schema_builder.add_text_field(TITLE, TEXT);
|
||||||
let size = schema_builder.add_u64_field(SIZE, FAST);
|
let size = schema_builder.add_u64_field(SIZE, FAST);
|
||||||
let schema = schema_builder.build();
|
let schema = schema_builder.build();
|
||||||
@@ -207,17 +203,15 @@ mod tests {
|
|||||||
size => 12u64,
|
size => 12u64,
|
||||||
));
|
));
|
||||||
});
|
});
|
||||||
let searcher = index.reader().unwrap().searcher();
|
let searcher = index.searcher();
|
||||||
let top_collector: TopDocsByField<u64> = TopDocs::with_limit(4).order_by_field(Field(2));
|
let segment = searcher.segment_reader(0);
|
||||||
let segment_reader = searcher.segment_reader(0u32);
|
let mut top_collector: TopFieldCollector<u64> = TopFieldCollector::with_limit(Field(2), 4);
|
||||||
top_collector
|
let _ = top_collector.set_segment(0, segment);
|
||||||
.for_segment(0, segment_reader)
|
|
||||||
.expect("should panic");
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn test_field_not_fast_field() {
|
fn test_field_not_fast_field() {
|
||||||
let mut schema_builder = Schema::builder();
|
let mut schema_builder = SchemaBuilder::new();
|
||||||
let title = schema_builder.add_text_field(TITLE, TEXT);
|
let title = schema_builder.add_text_field(TITLE, TEXT);
|
||||||
let size = schema_builder.add_u64_field(SIZE, IntOptions::default());
|
let size = schema_builder.add_u64_field(SIZE, IntOptions::default());
|
||||||
let schema = schema_builder.build();
|
let schema = schema_builder.build();
|
||||||
@@ -227,18 +221,28 @@ mod tests {
|
|||||||
size => 12u64,
|
size => 12u64,
|
||||||
));
|
));
|
||||||
});
|
});
|
||||||
let searcher = index.reader().unwrap().searcher();
|
let searcher = index.searcher();
|
||||||
let segment = searcher.segment_reader(0);
|
let segment = searcher.segment_reader(0);
|
||||||
let top_collector: TopDocsByField<u64> = TopDocs::with_limit(4).order_by_field(size);
|
let mut top_collector: TopFieldCollector<u64> = TopFieldCollector::with_limit(size, 4);
|
||||||
assert_matches!(
|
assert_matches!(
|
||||||
top_collector
|
top_collector.set_segment(0, segment),
|
||||||
.for_segment(0, segment)
|
Err(TantivyError::FastFieldError(_))
|
||||||
.map(|_| ())
|
|
||||||
.unwrap_err(),
|
|
||||||
TantivyError::FastFieldError(_)
|
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
#[should_panic]
|
||||||
|
fn test_collect_before_set_segment() {
|
||||||
|
let mut top_collector: TopFieldCollector<u64> = TopFieldCollector::with_limit(Field(0), 4);
|
||||||
|
top_collector.collect(0, 0f32);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
#[should_panic]
|
||||||
|
fn test_top_0() {
|
||||||
|
let _: TopFieldCollector<u64> = TopFieldCollector::with_limit(Field(0), 0);
|
||||||
|
}
|
||||||
|
|
||||||
fn index(
|
fn index(
|
||||||
query: &str,
|
query: &str,
|
||||||
query_field: Field,
|
query_field: Field,
|
||||||
@@ -250,6 +254,8 @@ mod tests {
|
|||||||
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
|
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
|
||||||
doc_adder(&mut index_writer);
|
doc_adder(&mut index_writer);
|
||||||
index_writer.commit().unwrap();
|
index_writer.commit().unwrap();
|
||||||
|
index.load_searchers().unwrap();
|
||||||
|
|
||||||
let query_parser = QueryParser::for_index(&index, vec![query_field]);
|
let query_parser = QueryParser::for_index(&index, vec![query_field]);
|
||||||
let query = query_parser.parse_query(query).unwrap();
|
let query = query_parser.parse_query(query).unwrap();
|
||||||
(index, query)
|
(index, query)
|
||||||
|
|||||||
@@ -1,10 +1,5 @@
 use super::Collector;
 use collector::top_collector::TopCollector;
-use collector::top_collector::TopSegmentCollector;
-use collector::SegmentCollector;
-use collector::TopDocsByField;
-use fastfield::FastValue;
-use schema::Field;
 use DocAddress;
 use DocId;
 use Result;
@@ -22,15 +17,14 @@ use SegmentReader;
 /// ```rust
 /// #[macro_use]
 /// extern crate tantivy;
-/// use tantivy::DocAddress;
-/// use tantivy::schema::{Schema, TEXT};
-/// use tantivy::{Index, Result};
-/// use tantivy::collector::TopDocs;
+/// use tantivy::schema::{SchemaBuilder, TEXT};
+/// use tantivy::{Index, Result, DocId, Score};
+/// use tantivy::collector::TopScoreCollector;
 /// use tantivy::query::QueryParser;
 ///
 /// # fn main() { example().unwrap(); }
 /// fn example() -> Result<()> {
-/// let mut schema_builder = Schema::builder();
+/// let mut schema_builder = SchemaBuilder::new();
 /// let title = schema_builder.add_text_field("title", TEXT);
 /// let schema = schema_builder.build();
 /// let index = Index::create_in_ram(schema);
@@ -51,153 +45,143 @@ use SegmentReader;
 /// index_writer.commit().unwrap();
 /// }
 ///
-/// let reader = index.reader()?;
-/// let searcher = reader.searcher();
+/// index.load_searchers()?;
+/// let searcher = index.searcher();
 ///
-/// let query_parser = QueryParser::for_index(&index, vec![title]);
-/// let query = query_parser.parse_query("diary")?;
-/// let top_docs = searcher.search(&query, &TopDocs::with_limit(2))?;
+/// {
+/// let mut top_collector = TopScoreCollector::with_limit(2);
+/// let query_parser = QueryParser::for_index(&index, vec![title]);
+/// let query = query_parser.parse_query("diary")?;
+/// searcher.search(&*query, &mut top_collector).unwrap();
 ///
-/// assert_eq!(&top_docs[0], &(0.7261542, DocAddress(0, 1)));
-/// assert_eq!(&top_docs[1], &(0.6099695, DocAddress(0, 3)));
+/// let score_docs: Vec<(Score, DocId)> = top_collector
+/// .top_docs()
+/// .into_iter()
+/// .map(|(score, doc_address)| (score, doc_address.doc()))
+/// .collect();
+///
+/// assert_eq!(score_docs, vec![(0.7261542, 1), (0.6099695, 3)]);
+/// }
 ///
 /// Ok(())
 /// }
 /// ```
-pub struct TopDocs(TopCollector<Score>);
+pub struct TopScoreCollector {
+collector: TopCollector<Score>,
+}

-impl TopDocs {
+impl TopScoreCollector {
 /// Creates a top score collector, with a number of documents equal to "limit".
 ///
 /// # Panics
 /// The method panics if limit is 0
-pub fn with_limit(limit: usize) -> TopDocs {
-TopDocs(TopCollector::with_limit(limit))
+pub fn with_limit(limit: usize) -> TopScoreCollector {
+TopScoreCollector {
+collector: TopCollector::with_limit(limit),
+}
 }

-/// Set top-K to rank documents by a given fast field.
+/// Returns K best scored documents sorted in decreasing order.
 ///
-/// (By default, `TopDocs` collects the top-K documents sorted by
-/// the similarity score.)
-pub fn order_by_field<T: PartialOrd + FastValue + Clone>(
-self,
-field: Field,
-) -> TopDocsByField<T> {
-TopDocsByField::new(field, self.0.limit())
+/// Calling this method triggers the sort.
+/// The result of the sort is not cached.
+pub fn docs(&self) -> Vec<DocAddress> {
+self.collector.docs()
+}
+
+/// Returns K best ScoredDocuments sorted in decreasing order.
+///
+/// Calling this method triggers the sort.
+/// The result of the sort is not cached.
+pub fn top_docs(&self) -> Vec<(Score, DocAddress)> {
+self.collector.top_docs()
+}
+
+/// Returns K best ScoredDocuments sorted in decreasing order.
+///
+/// Calling this method triggers the sort.
+/// The result of the sort is not cached.
+#[deprecated]
+pub fn score_docs(&self) -> Vec<(Score, DocAddress)> {
+self.collector.top_docs()
+}
+
+/// Return true iff at least K documents have gone through
+/// the collector.
+#[inline]
+pub fn at_capacity(&self) -> bool {
+self.collector.at_capacity()
 }
 }

-impl Collector for TopDocs {
-type Fruit = Vec<(Score, DocAddress)>;
+impl Collector for TopScoreCollector {
+fn set_segment(&mut self, segment_id: SegmentLocalId, _: &SegmentReader) -> Result<()> {
+self.collector.set_segment_id(segment_id);
+Ok(())
+}

-type Child = TopScoreSegmentCollector;
-fn for_segment(
-&self,
-segment_local_id: SegmentLocalId,
-reader: &SegmentReader,
-) -> Result<Self::Child> {
-let collector = self.0.for_segment(segment_local_id, reader)?;
-Ok(TopScoreSegmentCollector(collector))
+fn collect(&mut self, doc: DocId, score: Score) {
+self.collector.collect(doc, score);
 }

 fn requires_scoring(&self) -> bool {
 true
 }

-fn merge_fruits(&self, child_fruits: Vec<Vec<(Score, DocAddress)>>) -> Result<Self::Fruit> {
-self.0.merge_fruits(child_fruits)
-}
-}
-
-/// Segment Collector associated to `TopDocs`.
-pub struct TopScoreSegmentCollector(TopSegmentCollector<Score>);
-
-impl SegmentCollector for TopScoreSegmentCollector {
-type Fruit = Vec<(Score, DocAddress)>;
-
-fn collect(&mut self, doc: DocId, score: Score) {
-self.0.collect(doc, score)
-}
-
-fn harvest(self) -> Vec<(Score, DocAddress)> {
-self.0.harvest()
-}
 }

 #[cfg(test)]
 mod tests {
-use super::TopDocs;
-use query::QueryParser;
-use schema::Schema;
-use schema::TEXT;
-use DocAddress;
-use Index;
+use super::*;
+use collector::Collector;
+use DocId;
 use Score;

-fn make_index() -> Index {
-let mut schema_builder = Schema::builder();
-let text_field = schema_builder.add_text_field("text", TEXT);
-let schema = schema_builder.build();
-let index = Index::create_in_ram(schema);
-{
-// writing the segment
-let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
-index_writer.add_document(doc!(text_field=>"Hello happy tax payer."));
-index_writer.add_document(doc!(text_field=>"Droopy says hello happy tax payer"));
-index_writer.add_document(doc!(text_field=>"I like Droopy"));
-assert!(index_writer.commit().is_ok());
-}
-index
-}
-
 #[test]
 fn test_top_collector_not_at_capacity() {
-let index = make_index();
-let field = index.schema().get_field("text").unwrap();
-let query_parser = QueryParser::for_index(&index, vec![field]);
-let text_query = query_parser.parse_query("droopy tax").unwrap();
-let score_docs: Vec<(Score, DocAddress)> = index
-.reader()
-.unwrap()
-.searcher()
-.search(&text_query, &TopDocs::with_limit(4))
-.unwrap();
-assert_eq!(
-score_docs,
-vec![
-(0.81221175, DocAddress(0u32, 1)),
-(0.5376842, DocAddress(0u32, 2)),
-(0.48527452, DocAddress(0, 0))
-]
-);
+let mut top_collector = TopScoreCollector::with_limit(4);
+top_collector.collect(1, 0.8);
+top_collector.collect(3, 0.2);
+top_collector.collect(5, 0.3);
+assert!(!top_collector.at_capacity());
+let score_docs: Vec<(Score, DocId)> = top_collector
+.top_docs()
+.into_iter()
+.map(|(score, doc_address)| (score, doc_address.doc()))
+.collect();
+assert_eq!(score_docs, vec![(0.8, 1), (0.3, 5), (0.2, 3)]);
 }

 #[test]
 fn test_top_collector_at_capacity() {
-let index = make_index();
-let field = index.schema().get_field("text").unwrap();
-let query_parser = QueryParser::for_index(&index, vec![field]);
-let text_query = query_parser.parse_query("droopy tax").unwrap();
-let score_docs: Vec<(Score, DocAddress)> = index
-.reader()
-.unwrap()
-.searcher()
-.search(&text_query, &TopDocs::with_limit(2))
-.unwrap();
-assert_eq!(
-score_docs,
-vec![
-(0.81221175, DocAddress(0u32, 1)),
-(0.5376842, DocAddress(0u32, 2)),
-]
-);
+let mut top_collector = TopScoreCollector::with_limit(4);
+top_collector.collect(1, 0.8);
+top_collector.collect(3, 0.2);
+top_collector.collect(5, 0.3);
+top_collector.collect(7, 0.9);
+top_collector.collect(9, -0.2);
+assert!(top_collector.at_capacity());
+{
+let score_docs: Vec<(Score, DocId)> = top_collector
+.top_docs()
+.into_iter()
+.map(|(score, doc_address)| (score, doc_address.doc()))
+.collect();
+assert_eq!(score_docs, vec![(0.9, 7), (0.8, 1), (0.3, 5), (0.2, 3)]);
+}
+{
+let docs: Vec<DocId> = top_collector
+.docs()
+.into_iter()
+.map(|doc_address| doc_address.doc())
+.collect();
+assert_eq!(docs, vec![7, 1, 5, 3]);
+}
 }

 #[test]
 #[should_panic]
 fn test_top_0() {
-TopDocs::with_limit(0);
+TopScoreCollector::with_limit(0);
 }

 }
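Both sides of this hunk delegate the actual top-K bookkeeping to a generic `TopCollector<Score>`. As a rough sketch of that idea only, not tantivy's implementation and with hypothetical names, a fixed-capacity top-K buffer can be kept with a min-heap so that inserting one document costs O(log K):

```rust
use std::cmp::Reverse;
use std::collections::BinaryHeap;

/// Minimal top-K buffer: keeps the K highest-scored (score, doc) pairs.
/// Illustration only; names and types are hypothetical.
struct TopK {
    limit: usize,
    // Min-heap: the weakest kept entry sits on top and is evicted first.
    heap: BinaryHeap<Reverse<(u32, u32)>>,
}

impl TopK {
    fn with_limit(limit: usize) -> TopK {
        assert!(limit > 0, "limit must be strictly positive");
        TopK { limit, heap: BinaryHeap::with_capacity(limit) }
    }

    fn collect(&mut self, doc: u32, score: f32) {
        // For non-negative finite scores, the bit pattern orders like the value.
        let key = score.to_bits();
        if self.heap.len() < self.limit {
            self.heap.push(Reverse((key, doc)));
        } else if let Some(&Reverse((weakest, _))) = self.heap.peek() {
            if key > weakest {
                self.heap.pop();
                self.heap.push(Reverse((key, doc)));
            }
        }
    }

    /// Returns (score, doc) pairs sorted by decreasing score.
    fn top_docs(self) -> Vec<(f32, u32)> {
        let mut docs: Vec<(f32, u32)> = self
            .heap
            .into_iter()
            .map(|Reverse((bits, doc))| (f32::from_bits(bits), doc))
            .collect();
        docs.sort_by(|a, b| b.0.partial_cmp(&a.0).unwrap());
        docs
    }
}

fn main() {
    let mut top = TopK::with_limit(2);
    for (doc, score) in [(1u32, 0.8f32), (3, 0.2), (5, 0.3)] {
        top.collect(doc, score);
    }
    assert_eq!(top.top_docs(), vec![(0.8, 1), (0.3, 5)]);
}
```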
@@ -1,6 +1,9 @@
-use byteorder::{ByteOrder, LittleEndian, WriteBytesExt};
+use common::serialize::BinarySerializable;
 use std::io;
+use std::io::Write;
+use std::mem;
 use std::ops::Deref;
+use std::ptr;

 pub(crate) struct BitPacker {
 mini_buffer: u64,
@@ -15,7 +18,7 @@ impl BitPacker {
 }
 }

-pub fn write<TWrite: io::Write>(
+pub fn write<TWrite: Write>(
 &mut self,
 val: u64,
 num_bits: u8,
@@ -25,14 +28,14 @@ impl BitPacker {
 let num_bits = num_bits as usize;
 if self.mini_buffer_written + num_bits > 64 {
 self.mini_buffer |= val_u64.wrapping_shl(self.mini_buffer_written as u32);
-output.write_u64::<LittleEndian>(self.mini_buffer)?;
+self.mini_buffer.serialize(output)?;
 self.mini_buffer = val_u64.wrapping_shr((64 - self.mini_buffer_written) as u32);
 self.mini_buffer_written = self.mini_buffer_written + num_bits - 64;
 } else {
 self.mini_buffer |= val_u64 << self.mini_buffer_written;
 self.mini_buffer_written += num_bits;
 if self.mini_buffer_written == 64 {
-output.write_u64::<LittleEndian>(self.mini_buffer)?;
+self.mini_buffer.serialize(output)?;
 self.mini_buffer_written = 0;
 self.mini_buffer = 0u64;
 }
@@ -40,18 +43,17 @@ impl BitPacker {
 Ok(())
 }

-pub fn flush<TWrite: io::Write>(&mut self, output: &mut TWrite) -> io::Result<()> {
+pub fn flush<TWrite: Write>(&mut self, output: &mut TWrite) -> io::Result<()> {
 if self.mini_buffer_written > 0 {
 let num_bytes = (self.mini_buffer_written + 7) / 8;
-let mut arr: [u8; 8] = [0u8; 8];
-LittleEndian::write_u64(&mut arr, self.mini_buffer);
+let arr: [u8; 8] = unsafe { mem::transmute::<u64, [u8; 8]>(self.mini_buffer.to_le()) };
 output.write_all(&arr[..num_bytes])?;
 self.mini_buffer_written = 0;
 }
 Ok(())
 }

-pub fn close<TWrite: io::Write>(&mut self, output: &mut TWrite) -> io::Result<()> {
+pub fn close<TWrite: Write>(&mut self, output: &mut TWrite) -> io::Result<()> {
 self.flush(output)?;
 // Padding the write file to simplify reads.
 output.write_all(&[0u8; 7])?;
@@ -64,7 +66,7 @@ pub struct BitUnpacker<Data>
 where
 Data: Deref<Target = [u8]>,
 {
-num_bits: u64,
+num_bits: usize,
 mask: u64,
 data: Data,
 }
@@ -80,13 +82,13 @@ where
 (1u64 << num_bits) - 1u64
 };
 BitUnpacker {
-num_bits: u64::from(num_bits),
+num_bits: num_bits as usize,
 mask,
 data,
 }
 }

-pub fn get(&self, idx: u64) -> u64 {
+pub fn get(&self, idx: usize) -> u64 {
 if self.num_bits == 0 {
 return 0u64;
 }
@@ -97,13 +99,42 @@ where
 let addr = addr_in_bits >> 3;
 let bit_shift = addr_in_bits & 7;
 debug_assert!(
-addr + 8 <= data.len() as u64,
+addr + 8 <= data.len(),
 "The fast field field should have been padded with 7 bytes."
 );
-let val_unshifted_unmasked: u64 = LittleEndian::read_u64(&data[(addr as usize)..]);
+#[cfg_attr(feature = "cargo-clippy", allow(clippy::cast_ptr_alignment))]
+let val_unshifted_unmasked: u64 =
+u64::from_le(unsafe { ptr::read_unaligned(data[addr..].as_ptr() as *const u64) });
 let val_shifted = (val_unshifted_unmasked >> bit_shift) as u64;
 val_shifted & mask
 }

+/// Reads a range of values from the fast field.
+///
+/// The range of values read is from
+/// `[start..start + output.len()[`
+pub fn get_range(&self, start: u32, output: &mut [u64]) {
+if self.num_bits == 0 {
+for val in output.iter_mut() {
+*val = 0u64;
+}
+} else {
+let data: &[u8] = &*self.data;
+let num_bits = self.num_bits;
+let mask = self.mask;
+let mut addr_in_bits = (start as usize) * num_bits;
+for output_val in output.iter_mut() {
+let addr = addr_in_bits >> 3;
+let bit_shift = addr_in_bits & 7;
+#[cfg_attr(feature = "cargo-clippy", allow(clippy::cast_ptr_alignment))]
+let val_unshifted_unmasked: u64 =
+unsafe { ptr::read_unaligned(data[addr..].as_ptr() as *const u64) };
+let val_shifted = (val_unshifted_unmasked >> bit_shift) as u64;
+*output_val = val_shifted & mask;
+addr_in_bits += num_bits;
+}
+}
+}
 }

 #[cfg(test)]
@@ -129,7 +160,7 @@ mod test {
 fn test_bitpacker_util(len: usize, num_bits: u8) {
 let (bitunpacker, vals) = create_fastfield_bitpacker(len, num_bits);
 for (i, val) in vals.iter().enumerate() {
-assert_eq!(bitunpacker.get(i as u64), *val);
+assert_eq!(bitunpacker.get(i), *val);
 }
 }

@@ -141,4 +172,17 @@ mod test {
 test_bitpacker_util(6, 14);
 test_bitpacker_util(1000, 14);
 }

+#[test]
+fn test_bitpacker_range() {
+let (bitunpacker, vals) = create_fastfield_bitpacker(100_000, 12);
+let buffer_len = 100;
+let mut buffer = vec![0u64; buffer_len];
+for start in vec![0, 10, 20, 100, 1_000] {
+bitunpacker.get_range(start as u32, &mut buffer[..]);
+for i in 0..buffer_len {
+assert_eq!(buffer[i], vals[start + i]);
+}
+}
+}
 }
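For readers following the `get` and `get_range` arithmetic above: every value occupies exactly `num_bits` bits, value `i` starts at bit `i * num_bits`, and a lookup loads 8 bytes at `bit_offset / 8`, shifts by `bit_offset % 8`, and masks. Here is a small, safe-code sketch of that layout, assuming little-endian byte order and `num_bits` of at most 57 so a value never spans more than 8 bytes; it is an illustration of the arithmetic, not the tantivy implementation:

```rust
/// Packs values of `num_bits` bits each, LSB first, and pads with 7 bytes
/// so that an 8-byte read at any value's byte address stays in bounds.
fn pack(vals: &[u64], num_bits: u8) -> Vec<u8> {
    assert!(num_bits >= 1 && num_bits <= 57);
    let mut out = Vec::new();
    let mut mini_buffer: u64 = 0;
    let mut written: u32 = 0;
    for &val in vals {
        mini_buffer |= val << written;
        written += u32::from(num_bits);
        while written >= 8 {
            out.push((mini_buffer & 0xFF) as u8);
            mini_buffer >>= 8;
            written -= 8;
        }
    }
    if written > 0 {
        out.push((mini_buffer & 0xFF) as u8);
    }
    out.extend_from_slice(&[0u8; 7]); // padding, mirroring `close()` above
    out
}

/// Mirrors the `get` arithmetic: byte address, bit shift, then mask.
fn get(data: &[u8], idx: usize, num_bits: u8) -> u64 {
    let mask = (1u64 << num_bits) - 1;
    let addr_in_bits = idx * num_bits as usize;
    let addr = addr_in_bits >> 3;
    let bit_shift = addr_in_bits & 7;
    let mut eight = [0u8; 8];
    eight.copy_from_slice(&data[addr..addr + 8]);
    (u64::from_le_bytes(eight) >> bit_shift) & mask
}

fn main() {
    let vals: Vec<u64> = (0..1_000u64).map(|i| i % (1 << 12)).collect();
    let packed = pack(&vals, 12);
    for (i, &v) in vals.iter().enumerate() {
        assert_eq!(get(&packed, i, 12), v);
    }
}
```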
@@ -4,8 +4,6 @@ use common::VInt;
 use directory::ReadOnlySource;
 use directory::WritePtr;
 use schema::Field;
-use space_usage::FieldUsage;
-use space_usage::PerFieldSpaceUsage;
 use std::collections::HashMap;
 use std::io::Write;
 use std::io::{self, Read};
@@ -39,7 +37,7 @@ impl BinarySerializable for FileAddr {
 /// A `CompositeWrite` is used to write a `CompositeFile`.
 pub struct CompositeWrite<W = WritePtr> {
 write: CountingWriter<W>,
-offsets: HashMap<FileAddr, u64>,
+offsets: HashMap<FileAddr, usize>,
 }

 impl<W: Write> CompositeWrite<W> {
@@ -168,17 +166,6 @@ impl CompositeFile {
 .get(&FileAddr { field, idx })
 .map(|&(from, to)| self.data.slice(from, to))
 }

-pub fn space_usage(&self) -> PerFieldSpaceUsage {
-let mut fields = HashMap::new();
-for (&field_addr, &(start, end)) in self.offsets_index.iter() {
-fields
-.entry(field_addr.field)
-.or_insert_with(|| FieldUsage::empty(field_addr.field))
-.add_field_idx(field_addr.idx, end - start);
-}
-PerFieldSpaceUsage::new(fields)
-}
 }

 #[cfg(test)]
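The `space_usage` method that appears only on the left-hand side of this hunk is a straightforward use of the `HashMap` entry API: group per-field byte counts, creating each field's bucket lazily on first sight. A stripped-down, self-contained sketch of that aggregation pattern, with plain integers standing in for tantivy's `FieldUsage`:

```rust
use std::collections::HashMap;

fn main() {
    // (field id, slice length in bytes) records, e.g. one per stored sub-file.
    let records = [(0u32, 128usize), (1, 64), (0, 32), (2, 256)];

    let mut per_field: HashMap<u32, usize> = HashMap::new();
    for (field, num_bytes) in records {
        // Create the bucket on first sight of the field, then accumulate.
        *per_field.entry(field).or_insert(0) += num_bytes;
    }

    assert_eq!(per_field[&0], 160);
    assert_eq!(per_field[&2], 256);
}
```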
@@ -3,7 +3,7 @@ use std::io::Write;

 pub struct CountingWriter<W> {
 underlying: W,
-written_bytes: u64,
+written_bytes: usize,
 }

 impl<W: Write> CountingWriter<W> {
@@ -14,11 +14,11 @@ impl<W: Write> CountingWriter<W> {
 }
 }

-pub fn written_bytes(&self) -> u64 {
+pub fn written_bytes(&self) -> usize {
 self.written_bytes
 }

-pub fn finish(mut self) -> io::Result<(W, u64)> {
+pub fn finish(mut self) -> io::Result<(W, usize)> {
 self.flush()?;
 Ok((self.underlying, self.written_bytes))
 }
@@ -27,16 +27,10 @@ impl<W: Write> CountingWriter<W> {
 impl<W: Write> Write for CountingWriter<W> {
 fn write(&mut self, buf: &[u8]) -> io::Result<usize> {
 let written_size = self.underlying.write(buf)?;
-self.written_bytes += written_size as u64;
+self.written_bytes += written_size;
 Ok(written_size)
 }

-fn write_all(&mut self, buf: &[u8]) -> io::Result<()> {
-self.underlying.write_all(buf)?;
-self.written_bytes += buf.len() as u64;
-Ok(())
-}
-
 fn flush(&mut self) -> io::Result<()> {
 self.underlying.flush()
 }
@@ -54,8 +48,8 @@ mod test {
 let mut counting_writer = CountingWriter::wrap(buffer);
 let bytes = (0u8..10u8).collect::<Vec<u8>>();
 counting_writer.write_all(&bytes).unwrap();
-let (w, len): (Vec<u8>, u64) = counting_writer.finish().unwrap();
-assert_eq!(len, 10u64);
+let (w, len): (Vec<u8>, usize) = counting_writer.finish().unwrap();
+assert_eq!(len, 10);
 assert_eq!(w.len(), 10);
 }
 }
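The change above only switches the counter from `u64` to `usize`; the wrapper itself is a classic decorator over `io::Write`. A minimal sketch of the same pattern, with a hypothetical name rather than the tantivy type:

```rust
use std::io::{self, Write};

/// Counts how many bytes were accepted by the wrapped writer.
struct ByteCounter<W> {
    inner: W,
    written: usize,
}

impl<W: Write> Write for ByteCounter<W> {
    fn write(&mut self, buf: &[u8]) -> io::Result<usize> {
        let n = self.inner.write(buf)?;
        self.written += n; // only count what the inner writer actually took
        Ok(n)
    }
    fn flush(&mut self) -> io::Result<()> {
        self.inner.flush()
    }
}

fn main() -> io::Result<()> {
    let mut counter = ByteCounter { inner: Vec::new(), written: 0 };
    counter.write_all(&[1u8, 2, 3, 4, 5])?;
    assert_eq!(counter.written, 5);
    assert_eq!(counter.inner.len(), 5);
    Ok(())
}
```

Overriding `write_all`, as one side of the hunk does, is an optimization rather than a correctness fix: the default `write_all` provided by `std::io::Write` loops over `write`, so the count taken inside `write` already sees every byte.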
@@ -10,14 +10,10 @@ pub(crate) use self::bitset::TinySet;
 pub(crate) use self::composite_file::{CompositeFile, CompositeWrite};
 pub use self::counting_writer::CountingWriter;
 pub use self::serialize::{BinarySerializable, FixedSize};
-pub use self::vint::{read_u32_vint, serialize_vint_u32, write_u32_vint, VInt};
+pub use self::vint::VInt;
 pub use byteorder::LittleEndian as Endianness;

+use std::io;
-/// Segment's max doc must be `< MAX_DOC_LIMIT`.
-///
-/// We do not allow segments with more than
-pub const MAX_DOC_LIMIT: u32 = 1 << 31;

 /// Computes the number of bits that will be used for bitpacking.
 ///
@@ -56,6 +52,11 @@ pub(crate) fn is_power_of_2(n: usize) -> bool {
 (n > 0) && (n & (n - 1) == 0)
 }

+/// Create a default io error given a string.
+pub(crate) fn make_io_err(msg: String) -> io::Error {
+io::Error::new(io::ErrorKind::Other, msg)
+}

 /// Has length trait
 pub trait HasLen {
 /// Return length
@@ -133,11 +134,4 @@ pub(crate) mod test {
 assert_eq!(compute_num_bits(256), 9u8);
 assert_eq!(compute_num_bits(5_000_000_000), 33u8);
 }

-#[test]
-fn test_max_doc() {
-// this is the first time I write a unit test for a constant.
-assert!(((super::MAX_DOC_LIMIT - 1) as i32) >= 0);
-assert!((super::MAX_DOC_LIMIT as i32) < 0);
-}
 }
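`compute_num_bits`, whose test assertions appear just above, answers "how many bits does bit-packing need for values up to `n`", i.e. the position of the highest set bit. A one-line sketch, written only to mirror the expected values in those tests and not copied from the tantivy source:

```rust
/// Number of bits needed to represent `n` (0 needs 0 bits).
/// Sketch only; mirrors the assertions in the tests above.
fn compute_num_bits(n: u64) -> u8 {
    (64 - n.leading_zeros()) as u8
}

fn main() {
    assert_eq!(compute_num_bits(0), 0u8);
    assert_eq!(compute_num_bits(256), 9u8);
    assert_eq!(compute_num_bits(5_000_000_000), 33u8);
}
```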
@@ -1,5 +1,4 @@
 use super::BinarySerializable;
-use byteorder::{ByteOrder, LittleEndian};
 use std::io;
 use std::io::Read;
 use std::io::Write;
@@ -10,100 +9,6 @@ pub struct VInt(pub u64);

 const STOP_BIT: u8 = 128;

-pub fn serialize_vint_u32(val: u32) -> (u64, usize) {
-const START_2: u64 = 1 << 7;
-const START_3: u64 = 1 << 14;
-const START_4: u64 = 1 << 21;
-const START_5: u64 = 1 << 28;
-
-const STOP_1: u64 = START_2 - 1;
-const STOP_2: u64 = START_3 - 1;
-const STOP_3: u64 = START_4 - 1;
-const STOP_4: u64 = START_5 - 1;
-
-const MASK_1: u64 = 127;
-const MASK_2: u64 = MASK_1 << 7;
-const MASK_3: u64 = MASK_2 << 7;
-const MASK_4: u64 = MASK_3 << 7;
-const MASK_5: u64 = MASK_4 << 7;
-
-let val = u64::from(val);
-const STOP_BIT: u64 = 128u64;
-match val {
-0...STOP_1 => (val | STOP_BIT, 1),
-START_2...STOP_2 => (
-(val & MASK_1) | ((val & MASK_2) << 1) | (STOP_BIT << (8)),
-2,
-),
-START_3...STOP_3 => (
-(val & MASK_1) | ((val & MASK_2) << 1) | ((val & MASK_3) << 2) | (STOP_BIT << (8 * 2)),
-3,
-),
-START_4...STOP_4 => (
-(val & MASK_1)
-| ((val & MASK_2) << 1)
-| ((val & MASK_3) << 2)
-| ((val & MASK_4) << 3)
-| (STOP_BIT << (8 * 3)),
-4,
-),
-_ => (
-(val & MASK_1)
-| ((val & MASK_2) << 1)
-| ((val & MASK_3) << 2)
-| ((val & MASK_4) << 3)
-| ((val & MASK_5) << 4)
-| (STOP_BIT << (8 * 4)),
-5,
-),
-}
-}
-
-/// Returns the number of bytes covered by a
-/// serialized vint `u32`.
-///
-/// Expects a buffer data that starts
-/// by the serialized `vint`, scans at most 5 bytes ahead until
-/// it finds the vint final byte.
-///
-/// # May Panic
-/// If the payload does not start by a valid `vint`
-fn vint_len(data: &[u8]) -> usize {
-for (i, &val) in data.iter().enumerate().take(5) {
-if val >= STOP_BIT {
-return i + 1;
-}
-}
-panic!("Corrupted data. Invalid VInt 32");
-}
-
-/// Reads a vint `u32` from a buffer, and
-/// consumes its payload data.
-///
-/// # Panics
-///
-/// If the buffer does not start by a valid
-/// vint payload
-pub fn read_u32_vint(data: &mut &[u8]) -> u32 {
-let vlen = vint_len(*data);
-let mut result = 0u32;
-let mut shift = 0u64;
-for &b in &data[..vlen] {
-result |= u32::from(b & 127u8) << shift;
-shift += 7;
-}
-*data = &data[vlen..];
-result
-}
-
-/// Write a `u32` as a vint payload.
-pub fn write_u32_vint<W: io::Write>(val: u32, writer: &mut W) -> io::Result<()> {
-let (val, num_bytes) = serialize_vint_u32(val);
-let mut buffer = [0u8; 8];
-LittleEndian::write_u64(&mut buffer, val);
-writer.write_all(&buffer[..num_bytes])
-}

 impl VInt {
 pub fn val(&self) -> u64 {
 self.0
@@ -119,7 +24,7 @@ impl VInt {
 output.extend(&buffer[0..num_bytes]);
 }

-pub fn serialize_into(&self, buffer: &mut [u8; 10]) -> usize {
+fn serialize_into(&self, buffer: &mut [u8; 10]) -> usize {
 let mut remaining = self.0;
 for (i, b) in buffer.iter_mut().enumerate() {
 let next_byte: u8 = (remaining % 128u64) as u8;
@@ -159,7 +64,7 @@ impl BinarySerializable for VInt {
 return Err(io::Error::new(
 io::ErrorKind::InvalidData,
 "Reach end of buffer while reading VInt",
-));
+))
 }
 }
 }
@@ -169,9 +74,7 @@ impl BinarySerializable for VInt {
 #[cfg(test)]
 mod tests {

-use super::serialize_vint_u32;
 use super::VInt;
-use byteorder::{ByteOrder, LittleEndian};
 use common::BinarySerializable;

 fn aux_test_vint(val: u64) {
@@ -205,28 +108,4 @@ mod tests {
 }
 aux_test_vint(10);
 }
-
-fn aux_test_serialize_vint_u32(val: u32) {
-let mut buffer = [0u8; 10];
-let mut buffer2 = [0u8; 10];
-let len_vint = VInt(val as u64).serialize_into(&mut buffer);
-let (vint, len) = serialize_vint_u32(val);
-assert_eq!(len, len_vint, "len wrong for val {}", val);
-LittleEndian::write_u64(&mut buffer2, vint);
-assert_eq!(&buffer[..len], &buffer2[..len], "array wrong for {}", val);
-}
-
-#[test]
-fn test_vint_u32() {
-aux_test_serialize_vint_u32(0);
-aux_test_serialize_vint_u32(1);
-aux_test_serialize_vint_u32(5);
-for i in 1..3 {
-let power_of_128 = 1u32 << (7 * i);
-aux_test_serialize_vint_u32(power_of_128 - 1u32);
-aux_test_serialize_vint_u32(power_of_128);
-aux_test_serialize_vint_u32(power_of_128 + 1u32);
-}
-aux_test_serialize_vint_u32(u32::max_value());
-}
 }
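The block present on only one side of this hunk is a specialized, at-most-5-byte encoder for `u32`; the surviving generic `VInt` loop uses the same wire format. That format is a base-128 varint with 7 payload bits per byte, low-order group first, and the high bit set on the final byte, as `vint_len` above checks. A compact sketch of encode and decode under that convention, not the exact tantivy routines:

```rust
const STOP_BIT: u8 = 128;

/// Encode `val` as 7-bit groups, least significant first; the final byte
/// carries the stop bit, matching the convention of `vint_len` above.
fn encode_vint_u32(mut val: u32, out: &mut Vec<u8>) {
    loop {
        let byte = (val & 127) as u8;
        val >>= 7;
        if val == 0 {
            out.push(byte | STOP_BIT);
            return;
        }
        out.push(byte);
    }
}

/// Decode one vint from the front of `data`, returning (value, bytes consumed).
fn decode_vint_u32(data: &[u8]) -> (u32, usize) {
    let mut result = 0u32;
    let mut shift = 0u32;
    for (i, &b) in data.iter().enumerate().take(5) {
        result |= u32::from(b & 127) << shift;
        if b >= STOP_BIT {
            return (result, i + 1);
        }
        shift += 7;
    }
    panic!("invalid vint: no stop bit within 5 bytes");
}

fn main() {
    for val in [0u32, 1, 127, 128, 16_383, 16_384, u32::max_value()] {
        let mut buf = Vec::new();
        encode_vint_u32(val, &mut buf);
        assert!(buf.len() <= 5);
        assert_eq!(decode_vint_u32(&buf), (val, buf.len()));
    }
}
```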
@@ -1,136 +0,0 @@
-use crossbeam::channel;
-use scoped_pool::{Pool, ThreadConfig};
-use Result;
-
-/// Search executor whether search request are single thread or multithread.
-///
-/// We don't expose Rayon thread pool directly here for several reasons.
-///
-/// First dependency hell. It is not a good idea to expose the
-/// API of a dependency, knowing it might conflict with a different version
-/// used by the client. Second, we may stop using rayon in the future.
-pub enum Executor {
-SingleThread,
-ThreadPool(Pool),
-}
-
-impl Executor {
-/// Creates an Executor that performs all task in the caller thread.
-pub fn single_thread() -> Executor {
-Executor::SingleThread
-}
-
-// Creates an Executor that dispatches the tasks in a thread pool.
-pub fn multi_thread(num_threads: usize, prefix: &'static str) -> Executor {
-let thread_config = ThreadConfig::new().prefix(prefix);
-let pool = Pool::with_thread_config(num_threads, thread_config);
-Executor::ThreadPool(pool)
-}
-
-// Perform a map in the thread pool.
-//
-// Regardless of the executor (`SingleThread` or `ThreadPool`), panics in the task
-// will propagate to the caller.
-pub fn map<
-A: Send,
-R: Send,
-AIterator: Iterator<Item = A>,
-F: Sized + Sync + Fn(A) -> Result<R>,
->(
-&self,
-f: F,
-args: AIterator,
-) -> Result<Vec<R>> {
-match self {
-Executor::SingleThread => args.map(f).collect::<Result<_>>(),
-Executor::ThreadPool(pool) => {
-let args_with_indices: Vec<(usize, A)> = args.enumerate().collect();
-let num_fruits = args_with_indices.len();
-let fruit_receiver = {
-let (fruit_sender, fruit_receiver) = channel::unbounded();
-pool.scoped(|scope| {
-for arg_with_idx in args_with_indices {
-scope.execute(|| {
-let (idx, arg) = arg_with_idx;
-let fruit = f(arg);
-if let Err(err) = fruit_sender.send((idx, fruit)) {
-error!("Failed to send search task. It probably means all search threads have panicked. {:?}", err);
-}
-});
-}
-});
-fruit_receiver
-// This ends the scope of fruit_sender.
-// This is important as it makes it possible for the fruit_receiver iteration to
-// terminate.
-};
-// This is lame, but safe.
-let mut results_with_position = Vec::with_capacity(num_fruits);
-for (pos, fruit_res) in fruit_receiver {
-let fruit = fruit_res?;
-results_with_position.push((pos, fruit));
-}
-results_with_position.sort_by_key(|(pos, _)| *pos);
-assert_eq!(results_with_position.len(), num_fruits);
-Ok(results_with_position
-.into_iter()
-.map(|(_, fruit)| fruit)
-.collect::<Vec<_>>())
-}
-}
-}
-}
-
-#[cfg(test)]
-mod tests {
-
-use super::Executor;
-
-#[test]
-#[should_panic(expected = "panic should propagate")]
-fn test_panic_propagates_single_thread() {
-let _result: Vec<usize> = Executor::single_thread()
-.map(
-|_| {
-panic!("panic should propagate");
-},
-vec![0].into_iter(),
-)
-.unwrap();
-}
-
-#[test]
-#[should_panic] //< unfortunately the panic message is not propagated
-fn test_panic_propagates_multi_thread() {
-let _result: Vec<usize> = Executor::multi_thread(1, "search-test")
-.map(
-|_| {
-panic!("panic should propagate");
-},
-vec![0].into_iter(),
-)
-.unwrap();
-}
-
-#[test]
-fn test_map_singlethread() {
-let result: Vec<usize> = Executor::single_thread()
-.map(|i| Ok(i * 2), 0..1_000)
-.unwrap();
-assert_eq!(result.len(), 1_000);
-for i in 0..1_000 {
-assert_eq!(result[i], i * 2);
-}
-}
-
-#[test]
-fn test_map_multithread() {
-let result: Vec<usize> = Executor::multi_thread(3, "search-test")
-.map(|i| Ok(i * 2), 0..10)
-.unwrap();
-assert_eq!(result.len(), 10);
-for i in 0..10 {
-assert_eq!(result[i], i * 2);
-}
-}
-}
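The single-thread arm of the deleted `Executor::map` relies on a standard library trick: an iterator of `Result<T, E>` can be collected directly into `Result<Vec<T>, E>`, preserving input order and stopping at the first error. A self-contained sketch of that behavior using plain `std` types rather than the deleted type:

```rust
fn double_if_even(i: usize) -> Result<usize, String> {
    if i % 2 == 0 {
        Ok(i * 2)
    } else {
        Err(format!("odd input: {}", i))
    }
}

fn main() {
    // All inputs succeed: results come back in input order.
    let ok: Result<Vec<usize>, String> = (0..5).map(|i| double_if_even(i * 2)).collect();
    assert_eq!(ok.unwrap(), vec![0, 4, 8, 12, 16]);

    // The first failure short-circuits the whole map, as the deleted code did.
    let err: Result<Vec<usize>, String> = (0..5).map(double_if_even).collect();
    assert_eq!(err.unwrap_err(), "odd input: 1");
}
```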
@@ -1,31 +1,31 @@
+use super::pool::LeasedItem;
+use super::pool::Pool;
 use super::segment::create_segment;
 use super::segment::Segment;
-use core::Executor;
+use core::searcher::Searcher;
 use core::IndexMeta;
 use core::SegmentId;
 use core::SegmentMeta;
+use core::SegmentReader;
 use core::META_FILEPATH;
 use directory::ManagedDirectory;
 #[cfg(feature = "mmap")]
 use directory::MmapDirectory;
-use directory::INDEX_WRITER_LOCK;
 use directory::{Directory, RAMDirectory};
-use error::DataCorruption;
 use error::TantivyError;
 use indexer::index_writer::open_index_writer;
 use indexer::index_writer::HEAP_SIZE_MIN;
 use indexer::segment_updater::save_new_metas;
+use indexer::LockType;
 use num_cpus;
-use reader::IndexReader;
-use reader::IndexReaderBuilder;
 use schema::Field;
 use schema::FieldType;
 use schema::Schema;
 use serde_json;
 use std::borrow::BorrowMut;
 use std::fmt;
-#[cfg(feature = "mmap")]
 use std::path::Path;
+use std::sync::atomic::{AtomicUsize, Ordering};
 use std::sync::Arc;
 use tokenizer::BoxedTokenizer;
 use tokenizer::TokenizerManager;
@@ -36,53 +36,19 @@ fn load_metas(directory: &Directory) -> Result<IndexMeta> {
 let meta_data = directory.atomic_read(&META_FILEPATH)?;
 let meta_string = String::from_utf8_lossy(&meta_data);
 serde_json::from_str(&meta_string)
-.map_err(|e| {
-DataCorruption::new(
-META_FILEPATH.clone(),
-format!("Meta file cannot be deserialized. {:?}.", e),
-)
-})
-.map_err(From::from)
+.map_err(|_| TantivyError::CorruptedFile(META_FILEPATH.clone()))
 }

 /// Search Index
-#[derive(Clone)]
 pub struct Index {
 directory: ManagedDirectory,
 schema: Schema,
-executor: Arc<Executor>,
+num_searchers: Arc<AtomicUsize>,
+searcher_pool: Arc<Pool<Searcher>>,
 tokenizers: TokenizerManager,
 }

 impl Index {
-/// Examines the director to see if it contains an index
-pub fn exists<Dir: Directory>(dir: &Dir) -> bool {
-dir.exists(&META_FILEPATH)
-}
-
-/// Accessor to the search executor.
-///
-/// This pool is used by default when calling `searcher.search(...)`
-/// to perform search on the individual segments.
-///
-/// By default the executor is single thread, and simply runs in the calling thread.
-pub fn search_executor(&self) -> &Executor {
-self.executor.as_ref()
-}
-
-/// Replace the default single thread search executor pool
-/// by a thread pool with a given number of threads.
-pub fn set_multithread_executor(&mut self, num_threads: usize) {
-self.executor = Arc::new(Executor::multi_thread(num_threads, "thrd-tantivy-search-"));
-}
-
-/// Replace the default single thread search executor pool
-/// by a thread pool with a given number of threads.
-pub fn set_default_multithread_executor(&mut self) {
-let default_num_threads = num_cpus::get();
-self.set_multithread_executor(default_num_threads);
-}
-
 /// Creates a new index using the `RAMDirectory`.
 ///
 /// The index will be allocated in anonymous memory.
@@ -99,29 +65,9 @@ impl Index {
 #[cfg(feature = "mmap")]
 pub fn create_in_dir<P: AsRef<Path>>(directory_path: P, schema: Schema) -> Result<Index> {
 let mmap_directory = MmapDirectory::open(directory_path)?;
-if Index::exists(&mmap_directory) {
-return Err(TantivyError::IndexAlreadyExists);
-}
-
 Index::create(mmap_directory, schema)
 }

-/// Opens or creates a new index in the provided directory
-pub fn open_or_create<Dir: Directory>(dir: Dir, schema: Schema) -> Result<Index> {
-if Index::exists(&dir) {
-let index = Index::open(dir)?;
-if index.schema() == schema {
-Ok(index)
-} else {
-Err(TantivyError::SchemaError(
-"An index exists but the schema does not match.".to_string(),
-))
-}
-} else {
-Index::create(dir, schema)
-}
-}
-
 /// Creates a new index in a temp directory.
 ///
 /// The index will use the `MMapDirectory` in a newly created directory.
@@ -138,15 +84,13 @@ impl Index {

 /// Creates a new index given an implementation of the trait `Directory`
 pub fn create<Dir: Directory>(dir: Dir, schema: Schema) -> Result<Index> {
-let directory = ManagedDirectory::wrap(dir)?;
+let directory = ManagedDirectory::new(dir)?;
 Index::from_directory(directory, schema)
 }

 /// Create a new index from a directory.
-///
-/// This will overwrite existing meta.json
 fn from_directory(mut directory: ManagedDirectory, schema: Schema) -> Result<Index> {
-save_new_metas(schema.clone(), directory.borrow_mut())?;
+save_new_metas(schema.clone(), 0, directory.borrow_mut())?;
 let metas = IndexMeta::with_schema(schema);
 Index::create_from_metas(directory, &metas)
 }
@@ -154,12 +98,15 @@ impl Index {
 /// Creates a new index given a directory and an `IndexMeta`.
 fn create_from_metas(directory: ManagedDirectory, metas: &IndexMeta) -> Result<Index> {
 let schema = metas.schema.clone();
+let n_cpus = num_cpus::get();
 let index = Index {
 directory,
 schema,
+num_searchers: Arc::new(AtomicUsize::new(n_cpus)),
+searcher_pool: Arc::new(Pool::new()),
 tokenizers: TokenizerManager::default(),
-executor: Arc::new(Executor::single_thread()),
 };
+index.load_searchers()?;
 Ok(index)
 }

@@ -189,22 +136,6 @@ impl Index {
 }
 }

-/// Create a default `IndexReader` for the given index.
-///
-/// See [`Index.reader_builder()`](#method.reader_builder).
-pub fn reader(&self) -> Result<IndexReader> {
-self.reader_builder().try_into()
-}
-
-/// Create a `IndexReader` for the given index.
-///
-/// Most project should create at most one reader for a given index.
-/// This method is typically called only once per `Index` instance,
-/// over the lifetime of most problem.
-pub fn reader_builder(&self) -> IndexReaderBuilder {
-IndexReaderBuilder::new(self.clone())
-}
-
 /// Opens a new directory from an index path.
 #[cfg(feature = "mmap")]
 pub fn open_in_dir<P: AsRef<Path>>(directory_path: P) -> Result<Index> {
@@ -214,7 +145,7 @@ impl Index {

 /// Open the index using the provided directory
 pub fn open<D: Directory>(directory: D) -> Result<Index> {
-let directory = ManagedDirectory::wrap(directory)?;
+let directory = ManagedDirectory::new(directory)?;
 let metas = load_metas(&directory)?;
 Index::create_from_metas(directory, &metas)
 }
@@ -240,8 +171,7 @@ impl Index {
 /// Each thread will receive a budget of `overall_heap_size_in_bytes / num_threads`.
 ///
 /// # Errors
-/// If the lockfile already exists, returns `Error::DirectoryLockBusy` or an `Error::IOError`.
-///
+/// If the lockfile already exists, returns `Error::FileAlreadyExists`.
 /// # Panics
 /// If the heap size per thread is too small, panics.
 pub fn writer_with_num_threads(
@@ -249,21 +179,7 @@ impl Index {
 num_threads: usize,
 overall_heap_size_in_bytes: usize,
 ) -> Result<IndexWriter> {
-let directory_lock = self
-.directory
-.acquire_lock(&INDEX_WRITER_LOCK)
-.map_err(|err| {
-TantivyError::LockFailure(
-err,
-Some(
-"Failed to acquire index lock. If you are using\
-a regular directory, this means there is already an \
-`IndexWriter` working on this `Directory`, in this process \
-or in a different process."
-.to_string(),
-),
-)
-})?;
+let directory_lock = LockType::IndexWriterLock.acquire_lock(&self.directory)?;
 let heap_size_in_bytes_per_thread = overall_heap_size_in_bytes / num_threads;
 open_index_writer(
 self,
@@ -343,6 +259,53 @@ impl Index {
 .map(|segment_meta| segment_meta.id())
 .collect())
 }

+/// Sets the number of searchers to use
+///
+/// Only works after the next call to `load_searchers`
+pub fn set_num_searchers(&mut self, num_searchers: usize) {
+self.num_searchers.store(num_searchers, Ordering::Release);
+}
+
+/// Update searchers so that they reflect the state of the last
+/// `.commit()`.
+///
+/// If indexing happens in the same process as searching,
+/// you most likely want to call `.load_searchers()` right after each
+/// successful call to `.commit()`.
+///
+/// If indexing and searching happen in different processes, the way to
+/// get the freshest `index` at all time, is to watch `meta.json` and
+/// call `load_searchers` whenever a changes happen.
+pub fn load_searchers(&self) -> Result<()> {
+let _meta_lock = LockType::MetaLock.acquire_lock(self.directory())?;
+let searchable_segments = self.searchable_segments()?;
+let segment_readers: Vec<SegmentReader> = searchable_segments
+.iter()
+.map(SegmentReader::open)
+.collect::<Result<_>>()?;
+let schema = self.schema();
+let num_searchers: usize = self.num_searchers.load(Ordering::Acquire);
+let searchers = (0..num_searchers)
+.map(|_| Searcher::new(schema.clone(), self.clone(), segment_readers.clone()))
+.collect();
+self.searcher_pool.publish_new_generation(searchers);
+Ok(())
+}
+
+/// Returns a searcher
+///
+/// This method should be called every single time a search
+/// query is performed.
+/// The searchers are taken from a pool of `num_searchers` searchers.
+/// If no searcher is available
+/// this may block.
+///
+/// The same searcher must be used for a given query, as it ensures
+/// the use of a consistent segment set.
+pub fn searcher(&self) -> LeasedItem<Searcher> {
+self.searcher_pool.acquire()
+}
 }

 impl fmt::Debug for Index {
@@ -351,22 +314,27 @@ impl fmt::Debug for Index {
 }
 }

+impl Clone for Index {
+fn clone(&self) -> Index {
+Index {
+directory: self.directory.clone(),
+schema: self.schema.clone(),
+num_searchers: Arc::clone(&self.num_searchers),
+searcher_pool: Arc::clone(&self.searcher_pool),
+tokenizers: self.tokenizers.clone(),
+}
+}
+}
+
 #[cfg(test)]
 mod tests {
-use directory::RAMDirectory;
-use schema::Field;
-use schema::{Schema, INDEXED, TEXT};
-use std::thread;
-use std::time::Duration;
+use schema::{SchemaBuilder, INT_INDEXED, TEXT};
 use Index;
-use IndexReader;
-use IndexWriter;
-use ReloadPolicy;

 #[test]
 fn test_indexer_for_field() {
-let mut schema_builder = Schema::builder();
-let num_likes_field = schema_builder.add_u64_field("num_likes", INDEXED);
+let mut schema_builder = SchemaBuilder::default();
+let num_likes_field = schema_builder.add_u64_field("num_likes", INT_INDEXED);
 let body_field = schema_builder.add_text_field("body", TEXT);
 let schema = schema_builder.build();
 let index = Index::create_in_ram(schema);
@@ -377,164 +345,4 @@ mod tests {
 );
 }

-#[test]
-fn test_index_exists() {
-let directory = RAMDirectory::create();
-assert!(!Index::exists(&directory));
-assert!(Index::create(directory.clone(), throw_away_schema()).is_ok());
-assert!(Index::exists(&directory));
-}
-
-#[test]
-fn open_or_create_should_create() {
-let directory = RAMDirectory::create();
-assert!(!Index::exists(&directory));
-assert!(Index::open_or_create(directory.clone(), throw_away_schema()).is_ok());
-assert!(Index::exists(&directory));
-}
-
-#[test]
-fn open_or_create_should_open() {
-let directory = RAMDirectory::create();
-assert!(Index::create(directory.clone(), throw_away_schema()).is_ok());
-assert!(Index::exists(&directory));
-assert!(Index::open_or_create(directory, throw_away_schema()).is_ok());
-}
-
-#[test]
-fn create_should_wipeoff_existing() {
-let directory = RAMDirectory::create();
-assert!(Index::create(directory.clone(), throw_away_schema()).is_ok());
-assert!(Index::exists(&directory));
-assert!(Index::create(directory.clone(), Schema::builder().build()).is_ok());
-}
-
-#[test]
-fn open_or_create_exists_but_schema_does_not_match() {
-let directory = RAMDirectory::create();
-assert!(Index::create(directory.clone(), throw_away_schema()).is_ok());
-assert!(Index::exists(&directory));
-assert!(Index::open_or_create(directory.clone(), throw_away_schema()).is_ok());
-let err = Index::open_or_create(directory, Schema::builder().build());
-assert_eq!(
-format!("{:?}", err.unwrap_err()),
-"SchemaError(\"An index exists but the schema does not match.\")"
-);
-}
-
-fn throw_away_schema() -> Schema {
-let mut schema_builder = Schema::builder();
-let _ = schema_builder.add_u64_field("num_likes", INDEXED);
-schema_builder.build()
-}
-
-#[test]
-fn test_index_on_commit_reload_policy() {
-let schema = throw_away_schema();
-let field = schema.get_field("num_likes").unwrap();
-let index = Index::create_in_ram(schema);
-let reader = index
-.reader_builder()
-.reload_policy(ReloadPolicy::OnCommit)
-.try_into()
-.unwrap();
-assert_eq!(reader.searcher().num_docs(), 0);
-let mut writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
-test_index_on_commit_reload_policy_aux(field, &mut writer, &reader);
-}
-
-#[cfg(feature = "mmap")]
-mod mmap_specific {
-
-use super::*;
-use std::path::PathBuf;
-use tempdir::TempDir;
-
-#[test]
-fn test_index_on_commit_reload_policy_mmap() {
-let schema = throw_away_schema();
-let field = schema.get_field("num_likes").unwrap();
-let tempdir = TempDir::new("index").unwrap();
-let tempdir_path = PathBuf::from(tempdir.path());
-let index = Index::create_in_dir(&tempdir_path, schema).unwrap();
-let mut writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
-writer.commit().unwrap();
-let reader = index
-.reader_builder()
-.reload_policy(ReloadPolicy::OnCommit)
-.try_into()
-.unwrap();
-assert_eq!(reader.searcher().num_docs(), 0);
-test_index_on_commit_reload_policy_aux(field, &mut writer, &reader);
-}
-
-#[test]
-fn test_index_manual_policy_mmap() {
-let schema = throw_away_schema();
-let field = schema.get_field("num_likes").unwrap();
-let index = Index::create_from_tempdir(schema).unwrap();
-let mut writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
-writer.commit().unwrap();
-let reader = index
-.reader_builder()
-.reload_policy(ReloadPolicy::Manual)
-.try_into()
-.unwrap();
-assert_eq!(reader.searcher().num_docs(), 0);
-writer.add_document(doc!(field=>1u64));
-writer.commit().unwrap();
-thread::sleep(Duration::from_millis(500));
-assert_eq!(reader.searcher().num_docs(), 0);
-reader.reload().unwrap();
-assert_eq!(reader.searcher().num_docs(), 1);
-}
-
-#[test]
-fn test_index_on_commit_reload_policy_different_directories() {
-let schema = throw_away_schema();
-let field = schema.get_field("num_likes").unwrap();
-let tempdir = TempDir::new("index").unwrap();
-let tempdir_path = PathBuf::from(tempdir.path());
-let write_index = Index::create_in_dir(&tempdir_path, schema).unwrap();
-let read_index = Index::open_in_dir(&tempdir_path).unwrap();
-let reader = read_index
-.reader_builder()
-.reload_policy(ReloadPolicy::OnCommit)
-.try_into()
-.unwrap();
-assert_eq!(reader.searcher().num_docs(), 0);
-let mut writer = write_index.writer_with_num_threads(1, 3_000_000).unwrap();
-test_index_on_commit_reload_policy_aux(field, &mut writer, &reader);
-}
-}
-
-fn test_index_on_commit_reload_policy_aux(
-field: Field,
-writer: &mut IndexWriter,
-reader: &IndexReader,
-) {
-assert_eq!(reader.searcher().num_docs(), 0);
-writer.add_document(doc!(field=>1u64));
-writer.commit().unwrap();
-let mut count = 0;
-for _ in 0..100 {
-count = reader.searcher().num_docs();
-if count > 0 {
-break;
-}
-thread::sleep(Duration::from_millis(100));
-}
-assert_eq!(count, 1);
-writer.add_document(doc!(field=>2u64));
-writer.commit().unwrap();
-let mut count = 0;
-for _ in 0..10 {
-count = reader.searcher().num_docs();
-if count > 1 {
-break;
-}
-thread::sleep(Duration::from_millis(100));
-}
-assert_eq!(count, 2);
-}
 }
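The `load_searchers` / `searcher()` pair added in this hunk follows a publish-and-lease pattern: a reload builds a complete set of `SegmentReader`s and swaps it in as a new generation, while each query leases one fixed generation so it never observes a half-updated segment list. Here is a std-only sketch of that idea, with hypothetical names and none of tantivy's pooling details:

```rust
use std::sync::{Arc, RwLock};

/// Illustrative stand-in for a "searcher": a frozen list of segment names.
#[derive(Clone)]
struct Snapshot {
    segments: Arc<Vec<String>>,
}

/// Readers grab the current snapshot; a reload atomically swaps in a new one,
/// so every query keeps seeing a consistent segment set. Sketch only.
struct Reader {
    current: RwLock<Snapshot>,
}

impl Reader {
    fn new() -> Reader {
        Reader { current: RwLock::new(Snapshot { segments: Arc::new(Vec::new()) }) }
    }

    /// Equivalent in spirit to `load_searchers`: publish a new generation.
    fn reload(&self, segments: Vec<String>) {
        let snapshot = Snapshot { segments: Arc::new(segments) };
        *self.current.write().unwrap() = snapshot;
    }

    /// Equivalent in spirit to `searcher()`: lease the current generation.
    fn searcher(&self) -> Snapshot {
        self.current.read().unwrap().clone()
    }
}

fn main() {
    let reader = Reader::new();
    assert_eq!(reader.searcher().segments.len(), 0);

    reader.reload(vec!["seg-a".to_string(), "seg-b".to_string()]);
    let leased = reader.searcher();

    // A later reload does not disturb the already-leased snapshot.
    reader.reload(vec!["seg-c".to_string()]);
    assert_eq!(leased.segments.len(), 2);
    assert_eq!(reader.searcher().segments.len(), 1);
}
```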
@@ -46,13 +46,13 @@ impl fmt::Debug for IndexMeta {
 mod tests {

 use super::IndexMeta;
-use schema::{Schema, TEXT};
+use schema::{SchemaBuilder, TEXT};
 use serde_json;

 #[test]
 fn test_serialize_metas() {
 let schema = {
-let mut schema_builder = Schema::builder();
+let mut schema_builder = SchemaBuilder::new();
 schema_builder.add_text_field("text", TEXT);
 schema_builder.build()
 };
@@ -32,7 +32,10 @@ pub struct InvertedIndexReader {
 }
 
 impl InvertedIndexReader {
-    #[cfg_attr(feature = "cargo-clippy", allow(clippy::needless_pass_by_value))] // for symetry
+    #[cfg_attr(
+        feature = "cargo-clippy",
+        allow(clippy::needless_pass_by_value)
+    )] // for symetry
     pub(crate) fn new(
         termdict: TermDictionary,
         postings_source: ReadOnlySource,
@@ -1,7 +1,7 @@
-mod executor;
 pub mod index;
 mod index_meta;
 mod inverted_index_reader;
+mod pool;
 pub mod searcher;
 mod segment;
 mod segment_component;
@@ -9,7 +9,6 @@ mod segment_id;
 mod segment_meta;
 mod segment_reader;
 
-pub use self::executor::Executor;
 pub use self::index::Index;
 pub use self::index_meta::IndexMeta;
 pub use self::inverted_index_reader::InvertedIndexReader;
@@ -24,7 +23,6 @@ pub use self::segment_reader::SegmentReader;
 use std::path::PathBuf;
 
 lazy_static! {
-
     /// The meta file contains all the information about the list of segments and the schema
     /// of the index.
     pub static ref META_FILEPATH: PathBuf = PathBuf::from("meta.json");
@@ -1,5 +1,5 @@
-use crossbeam::crossbeam_channel::unbounded;
-use crossbeam::{Receiver, RecvError, Sender};
+use crossbeam::queue::MsQueue;
+use std::mem;
 use std::ops::{Deref, DerefMut};
 use std::sync::atomic::AtomicUsize;
 use std::sync::atomic::Ordering;
@@ -10,52 +10,15 @@ pub struct GenerationItem<T> {
     item: T,
 }
 
-/// Queue implementation for the Object Pool below
-/// Uses the unbounded Linked-List type queue from crossbeam-channel
-/// Splits the Queue into sender and receiver
-struct Queue<T> {
-    sender: Sender<T>,
-    receiver: Receiver<T>,
-}
-
-impl<T> Queue<T> {
-    fn new() -> Self {
-        let (s, r) = unbounded();
-        Queue {
-            sender: s,
-            receiver: r,
-        }
-    }
-
-    /// Sender trait returns a Result type, which is ignored.
-    /// The Result is not handled at the moment
-    fn push(&self, elem: T) {
-        self.sender
-            .send(elem)
-            .expect("Sending an item to crossbeam-queue shouldn't fail");
-    }
-
-    /// Relies on the underlying crossbeam-channel Receiver
-    /// to block on empty queue
-    fn pop(&self) -> Result<T, RecvError> {
-        self.receiver.recv()
-    }
-}
-
-/// An object pool
-///
-/// This is used in tantivy to create a pool of `Searcher`.
-/// Object are wrapped in a `LeasedItem` wrapper and are
-/// released automatically back into the pool on `Drop`.
 pub struct Pool<T> {
-    queue: Arc<Queue<GenerationItem<T>>>,
+    queue: Arc<MsQueue<GenerationItem<T>>>,
     freshest_generation: AtomicUsize,
     next_generation: AtomicUsize,
 }
 
 impl<T> Pool<T> {
     pub fn new() -> Pool<T> {
-        let queue = Arc::new(Queue::new());
+        let queue = Arc::new(MsQueue::new());
         Pool {
             queue,
             freshest_generation: AtomicUsize::default(),
@@ -63,10 +26,6 @@ impl<T> Pool<T> {
         }
     }
 
-    /// Publishes a new generation of `Searcher`.
-    ///
-    /// After publish, all new `Searcher` acquired will be
-    /// of the new generation.
     pub fn publish_new_generation(&self, items: Vec<T>) {
         let next_generation = self.next_generation.fetch_add(1, Ordering::SeqCst) + 1;
         for item in items {
@@ -102,14 +61,10 @@ impl<T> Pool<T> {
         self.freshest_generation.load(Ordering::Acquire)
     }
 
-    /// Acquires a new searcher.
-    ///
-    /// If no searcher is available, this methods block until
-    /// a searcher is released.
     pub fn acquire(&self) -> LeasedItem<T> {
         let generation = self.generation();
         loop {
-            let gen_item = self.queue.pop().unwrap();
+            let gen_item = self.queue.pop();
             if gen_item.generation >= generation {
                 return LeasedItem {
                     gen_item: Some(gen_item),
@@ -125,7 +80,7 @@ impl<T> Pool<T> {
 
 pub struct LeasedItem<T> {
     gen_item: Option<GenerationItem<T>>,
-    recycle_queue: Arc<Queue<GenerationItem<T>>>,
+    recycle_queue: Arc<MsQueue<GenerationItem<T>>>,
 }
 
 impl<T> Deref for LeasedItem<T> {
@@ -152,9 +107,9 @@ impl<T> DerefMut for LeasedItem<T> {
 
 impl<T> Drop for LeasedItem<T> {
     fn drop(&mut self) {
-        if let Some(gen_item) = self.gen_item.take() {
-            self.recycle_queue.push(gen_item);
-        }
+        let gen_item: GenerationItem<T> = mem::replace(&mut self.gen_item, None)
+            .expect("Unwrapping a leased item should never fail");
+        self.recycle_queue.push(gen_item);
     }
 }
 
@@ -162,7 +117,6 @@ impl<T> Drop for LeasedItem<T> {
 mod tests {
 
     use super::Pool;
-    use super::Queue;
    use std::iter;
 
     #[test]
@@ -179,47 +133,4 @@ mod tests {
         assert_eq!(*pool.acquire(), 11);
     }
 }
-
-    #[test]
-    fn test_queue() {
-        let q = Queue::new();
-        let elem = 5;
-        q.push(elem);
-        let res = q.pop();
-        assert_eq!(res.unwrap(), elem);
-    }
-
-    #[test]
-    fn test_pool_dont_panic_on_empty_pop() {
-        // When the object pool is exhausted, it shouldn't panic on pop()
-        use std::sync::Arc;
-        use std::{thread, time};
-
-        // Wrap the pool in an Arc, same way as its used in `core/index.rs`
-        let pool = Arc::new(Pool::new());
-        // clone pools outside the move scope of each new thread
-        let pool1 = Arc::clone(&pool);
-        let pool2 = Arc::clone(&pool);
-        let elements_for_pool = vec![1, 2];
-        pool.publish_new_generation(elements_for_pool);
-
-        let mut threads = vec![];
-        let sleep_dur = time::Duration::from_millis(10);
-        // spawn one more thread than there are elements in the pool
-        threads.push(thread::spawn(move || {
-            // leasing to make sure it's not dropped before sleep is called
-            let _leased_searcher = &pool.acquire();
-            thread::sleep(sleep_dur);
-        }));
-        threads.push(thread::spawn(move || {
-            // leasing to make sure it's not dropped before sleep is called
-            let _leased_searcher = &pool1.acquire();
-            thread::sleep(sleep_dur);
-        }));
-        threads.push(thread::spawn(move || {
-            // leasing to make sure it's not dropped before sleep is called
-            let _leased_searcher = &pool2.acquire();
-            thread::sleep(sleep_dur);
-        }));
-    }
-}
 }
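In the hunk above, the left-hand (0.9) side replaces the crossbeam `MsQueue` with a small `Queue` wrapper built on an unbounded crossbeam-channel, so that `acquire()` can still block on an empty pool instead of calling `pop().unwrap()`. The following is a rough standalone sketch of that idea, not the crate's actual code; it assumes the standalone `crossbeam-channel` crate as a dependency rather than the `crossbeam` facade used in the hunk.

use crossbeam_channel::{unbounded, Receiver, RecvError, Sender};

/// Minimal blocking queue: `push` never blocks, `pop` blocks until an item arrives.
struct Queue<T> {
    sender: Sender<T>,
    receiver: Receiver<T>,
}

impl<T> Queue<T> {
    fn new() -> Self {
        let (sender, receiver) = unbounded();
        Queue { sender, receiver }
    }

    fn push(&self, elem: T) {
        // Sending only fails if every receiver is gone, which cannot happen
        // here because the struct keeps one receiver alive.
        self.sender.send(elem).expect("send should not fail");
    }

    fn pop(&self) -> Result<T, RecvError> {
        // Blocks while the queue is empty.
        self.receiver.recv()
    }
}

fn main() {
    let q = Queue::new();
    q.push(42);
    assert_eq!(q.pop().unwrap(), 42);
}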
@@ -1,43 +1,17 @@
 use collector::Collector;
-use collector::SegmentCollector;
-use core::Executor;
 use core::InvertedIndexReader;
 use core::SegmentReader;
 use query::Query;
-use query::Scorer;
-use query::Weight;
 use schema::Document;
 use schema::Schema;
 use schema::{Field, Term};
-use space_usage::SearcherSpaceUsage;
 use std::fmt;
 use std::sync::Arc;
-use store::StoreReader;
 use termdict::TermMerger;
 use DocAddress;
 use Index;
 use Result;
 
-fn collect_segment<C: Collector>(
-    collector: &C,
-    weight: &Weight,
-    segment_ord: u32,
-    segment_reader: &SegmentReader,
-) -> Result<C::Fruit> {
-    let mut scorer = weight.scorer(segment_reader)?;
-    let mut segment_collector = collector.for_segment(segment_ord as u32, segment_reader)?;
-    if let Some(delete_bitset) = segment_reader.delete_bitset() {
-        scorer.for_each(&mut |doc, score| {
-            if !delete_bitset.is_deleted(doc) {
-                segment_collector.collect(doc, score);
-            }
-        });
-    } else {
-        scorer.for_each(&mut |doc, score| segment_collector.collect(doc, score));
-    }
-    Ok(segment_collector.harvest())
-}
-
 /// Holds a list of `SegmentReader`s ready for search.
 ///
 /// It guarantees that the `Segment` will not be removed before
@@ -47,7 +21,6 @@ pub struct Searcher {
     schema: Schema,
     index: Index,
     segment_readers: Vec<SegmentReader>,
-    store_readers: Vec<StoreReader>,
 }
 
 impl Searcher {
@@ -57,15 +30,10 @@ impl Searcher {
         index: Index,
         segment_readers: Vec<SegmentReader>,
     ) -> Searcher {
-        let store_readers = segment_readers
-            .iter()
-            .map(|segment_reader| segment_reader.get_store_reader())
-            .collect();
         Searcher {
             schema,
             index,
             segment_readers,
-            store_readers,
         }
     }
 
@@ -80,8 +48,8 @@ impl Searcher {
     /// the request to the right `Segment`.
     pub fn doc(&self, doc_address: DocAddress) -> Result<Document> {
         let DocAddress(segment_local_id, doc_id) = doc_address;
-        let store_reader = &self.store_readers[segment_local_id as usize];
-        store_reader.get(doc_id)
+        let segment_reader = &self.segment_readers[segment_local_id as usize];
+        segment_reader.doc(doc_id)
     }
 
     /// Access the schema associated to the index of this searcher.
@@ -104,8 +72,7 @@ impl Searcher {
             .iter()
             .map(|segment_reader| {
                 u64::from(segment_reader.inverted_index(term.field()).doc_freq(term))
-            })
-            .sum::<u64>()
+            }).sum::<u64>()
     }
 
     /// Return the list of segment readers
@@ -118,58 +85,9 @@ impl Searcher {
         &self.segment_readers[segment_ord as usize]
     }
 
-    /// Runs a query on the segment readers wrapped by the searcher.
-    ///
-    /// Search works as follows :
-    ///
-    /// First the weight object associated to the query is created.
-    ///
-    /// Then, the query loops over the segments and for each segment :
-    /// - setup the collector and informs it that the segment being processed has changed.
-    /// - creates a SegmentCollector for collecting documents associated to the segment
-    /// - creates a `Scorer` object associated for this segment
-    /// - iterate through the matched documents and push them to the segment collector.
-    ///
-    /// Finally, the Collector merges each of the child collectors into itself for result usability
-    /// by the caller.
-    pub fn search<C: Collector>(&self, query: &Query, collector: &C) -> Result<C::Fruit> {
-        let executor = self.index.search_executor();
-        self.search_with_executor(query, collector, executor)
-    }
-
-    /// Same as [`search(...)`](#method.search) but multithreaded.
-    ///
-    /// The current implementation is rather naive :
-    /// multithreading is by splitting search into as many task
-    /// as there are segments.
-    ///
-    /// It is powerless at making search faster if your index consists in
-    /// one large segment.
-    ///
-    /// Also, keep in my multithreading a single query on several
-    /// threads will not improve your throughput. It can actually
-    /// hurt it. It will however, decrease the average response time.
-    pub fn search_with_executor<C: Collector>(
-        &self,
-        query: &Query,
-        collector: &C,
-        executor: &Executor,
-    ) -> Result<C::Fruit> {
-        let scoring_enabled = collector.requires_scoring();
-        let weight = query.weight(self, scoring_enabled)?;
-        let segment_readers = self.segment_readers();
-        let fruits = executor.map(
-            |(segment_ord, segment_reader)| {
-                collect_segment(
-                    collector,
-                    weight.as_ref(),
-                    segment_ord as u32,
-                    segment_reader,
-                )
-            },
-            segment_readers.iter().enumerate(),
-        )?;
-        collector.merge_fruits(fruits)
+    /// Runs a query on the segment readers wrapped by the searcher
+    pub fn search<C: Collector>(&self, query: &Query, collector: &mut C) -> Result<()> {
+        query.search(self, collector)
     }
 
     /// Return the field searcher associated to a `Field`.
@@ -181,15 +99,6 @@ impl Searcher {
             .collect::<Vec<_>>();
         FieldSearcher::new(inv_index_readers)
     }
-
-    /// Summarize total space usage of this searcher.
-    pub fn space_usage(&self) -> SearcherSpaceUsage {
-        let mut space_usage = SearcherSpaceUsage::new();
-        for segment_reader in self.segment_readers.iter() {
-            space_usage.add_segment(segment_reader.space_usage());
-        }
-        space_usage
-    }
 }
 
 pub struct FieldSearcher {
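The hunk above swaps the old `search(&self, query, &mut collector) -> Result<()>` entry point for the fruit-returning, per-segment API of the left-hand branch. A rough sketch of how a caller drives the newer style follows; the collector and query-parser types are the usual tantivy ones, and the index, field and query string are assumed to exist already.

use tantivy::collector::TopDocs;
use tantivy::query::QueryParser;

fn top_hits(index: &tantivy::Index, field: tantivy::schema::Field) -> tantivy::Result<()> {
    let reader = index.reader()?;
    let searcher = reader.searcher();
    let query_parser = QueryParser::for_index(index, vec![field]);
    let query = query_parser.parse_query("diary").expect("valid query");
    // `search` borrows the collector immutably and returns its fruit.
    let top_docs = searcher.search(&*query, &TopDocs::with_limit(10))?;
    for (_score, doc_address) in top_docs {
        let retrieved = searcher.doc(doc_address)?;
        println!("{}", index.schema().to_json(&retrieved));
    }
    Ok(())
}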
@@ -41,6 +41,6 @@ impl SegmentComponent {
             SegmentComponent::STORE,
             SegmentComponent::DELETE,
         ];
-        SEGMENT_COMPONENTS.iter()
+        SEGMENT_COMPONENTS.into_iter()
     }
 }
@@ -19,7 +19,7 @@ pub struct SegmentId(Uuid);
 #[cfg(test)]
 lazy_static! {
     static ref AUTO_INC_COUNTER: atomic::AtomicUsize = atomic::AtomicUsize::default();
-    static ref ZERO_ARRAY: [u8; 8] = [0u8; 8];
+    static ref EMPTY_ARR: [u8; 8] = [0u8; 8];
 }
 
 // During tests, we generate the segment id in a autoincrement manner
@@ -30,7 +30,7 @@ lazy_static! {
 #[cfg(test)]
 fn create_uuid() -> Uuid {
     let new_auto_inc_id = (*AUTO_INC_COUNTER).fetch_add(1, atomic::Ordering::SeqCst);
-    Uuid::from_fields(new_auto_inc_id as u32, 0, 0, &*ZERO_ARRAY).unwrap()
+    Uuid::from_fields(new_auto_inc_id as u32, 0, 0, &*EMPTY_ARR).unwrap()
 }
 
 #[cfg(not(test))]
@@ -4,7 +4,6 @@ use core::InvertedIndexReader;
 use core::Segment;
 use core::SegmentComponent;
 use core::SegmentId;
-use directory::ReadOnlySource;
 use error::TantivyError;
 use fastfield::DeleteBitSet;
 use fastfield::FacetReader;
@@ -13,10 +12,10 @@ use fastfield::{self, FastFieldNotAvailableError};
 use fastfield::{BytesFastFieldReader, FastValue, MultiValueIntFastFieldReader};
 use fieldnorm::FieldNormReader;
 use schema::Cardinality;
+use schema::Document;
 use schema::Field;
 use schema::FieldType;
 use schema::Schema;
-use space_usage::SegmentSpaceUsage;
 use std::collections::HashMap;
 use std::fmt;
 use std::sync::Arc;
@@ -54,7 +53,7 @@ pub struct SegmentReader {
     fast_fields_composite: CompositeFile,
     fieldnorms_composite: CompositeFile,
 
-    store_source: ReadOnlySource,
+    store_reader: StoreReader,
     delete_bitset_opt: Option<DeleteBitSet>,
     schema: Schema,
 }
@@ -197,7 +196,8 @@ impl SegmentReader {
     /// Accessor to the segment's `Field norms`'s reader.
     ///
     /// Field norms are the length (in tokens) of the fields.
-    /// It is used in the computation of the [TfIdf](https://fulmicoton.gitbooks.io/tantivy-doc/content/tfidf.html).
+    /// It is used in the computation of the [TfIdf]
+    /// (https://fulmicoton.gitbooks.io/tantivy-doc/content/tfidf.html).
     ///
     /// They are simply stored as a fast field, serialized in
     /// the `.fieldnorm` file of the segment.
@@ -215,8 +215,8 @@ impl SegmentReader {
     }
 
     /// Accessor to the segment's `StoreReader`.
-    pub fn get_store_reader(&self) -> StoreReader {
-        StoreReader::from_source(self.store_source.clone())
+    pub fn get_store_reader(&self) -> &StoreReader {
+        &self.store_reader
     }
 
     /// Open a new segment for reading.
@@ -225,6 +225,7 @@ impl SegmentReader {
         let termdict_composite = CompositeFile::open(&termdict_source)?;
 
         let store_source = segment.open_read(SegmentComponent::STORE)?;
+        let store_reader = StoreReader::from_source(store_source);
 
         fail_point!("SegmentReader::open#middle");
 
@@ -270,7 +271,7 @@ impl SegmentReader {
             fast_fields_composite,
             fieldnorms_composite,
             segment_id: segment.id(),
-            store_source,
+            store_reader,
             delete_bitset_opt,
             positions_composite,
             positions_idx_composite,
@@ -349,6 +350,14 @@ impl SegmentReader {
         inv_idx_reader
     }
 
+    /// Returns the document (or to be accurate, its stored field)
+    /// bearing the given doc id.
+    /// This method is slow and should seldom be called from
+    /// within a collector.
+    pub fn doc(&self, doc_id: DocId) -> Result<Document> {
+        self.store_reader.get(doc_id)
+    }
+
     /// Returns the segment id
     pub fn segment_id(&self) -> SegmentId {
         self.segment_id
@@ -372,24 +381,6 @@ impl SegmentReader {
     pub fn doc_ids_alive(&self) -> SegmentReaderAliveDocsIterator {
         SegmentReaderAliveDocsIterator::new(&self)
     }
-
-    /// Summarize total space usage of this segment.
-    pub fn space_usage(&self) -> SegmentSpaceUsage {
-        SegmentSpaceUsage::new(
-            self.num_docs(),
-            self.termdict_composite.space_usage(),
-            self.postings_composite.space_usage(),
-            self.positions_composite.space_usage(),
-            self.positions_idx_composite.space_usage(),
-            self.fast_fields_composite.space_usage(),
-            self.fieldnorms_composite.space_usage(),
-            self.get_store_reader().space_usage(),
-            self.delete_bitset_opt
-                .as_ref()
-                .map(|x| x.space_usage())
-                .unwrap_or(0),
-        )
-    }
 }
 
 impl fmt::Debug for SegmentReader {
@@ -447,12 +438,12 @@ impl<'a> Iterator for SegmentReaderAliveDocsIterator<'a> {
 #[cfg(test)]
 mod test {
     use core::Index;
-    use schema::{Schema, Term, STORED, TEXT};
+    use schema::{SchemaBuilder, Term, STORED, TEXT};
     use DocId;
 
     #[test]
     fn test_alive_docs_iterator() {
-        let mut schema_builder = Schema::builder();
+        let mut schema_builder = SchemaBuilder::new();
         schema_builder.add_text_field("name", TEXT | STORED);
         let schema = schema_builder.build();
         let index = Index::create_in_ram(schema.clone());
@@ -477,7 +468,9 @@ mod test {
             // ok, now we should have a deleted doc
            index_writer2.commit().unwrap();
         }
-        let searcher = index.reader().unwrap().searcher();
+        index.load_searchers().unwrap();
+        let searcher = index.searcher();
         let docs: Vec<DocId> = searcher.segment_reader(0).doc_ids_alive().collect();
         assert_eq!(vec![0u32, 2u32], docs);
     }
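The right-hand side of the hunk above adds `SegmentReader::doc`, which goes straight to the store; its doc comment warns that the call is slow and should rarely happen inside a collector. A small sketch of how that method pairs with `doc_ids_alive()` (which exists on both branches) to dump the stored fields of every live document in one segment, assuming the right-hand branch where `SegmentReader::doc` is available:

use tantivy::SegmentReader;

// Dump the stored fields of every live document in a segment.
// Slow path: each `doc` call hits the store, which is fine for debugging or export.
fn dump_segment(segment_reader: &SegmentReader) -> tantivy::Result<()> {
    for doc_id in segment_reader.doc_ids_alive() {
        let stored_doc = segment_reader.doc(doc_id)?;
        println!("{:?}", stored_doc);
    }
    Ok(())
}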
@@ -1,104 +1,11 @@
-use directory::directory_lock::Lock;
-use directory::error::LockError;
 use directory::error::{DeleteError, OpenReadError, OpenWriteError};
-use directory::WatchCallback;
-use directory::WatchHandle;
 use directory::{ReadOnlySource, WritePtr};
 use std::fmt;
 use std::io;
-use std::io::Write;
 use std::marker::Send;
 use std::marker::Sync;
 use std::path::Path;
-use std::path::PathBuf;
 use std::result;
-use std::thread;
-use std::time::Duration;
 
-/// Retry the logic of acquiring locks is pretty simple.
-/// We just retry `n` times after a given `duratio`, both
-/// depending on the type of lock.
-struct RetryPolicy {
-    num_retries: usize,
-    wait_in_ms: u64,
-}
-
-impl RetryPolicy {
-    fn no_retry() -> RetryPolicy {
-        RetryPolicy {
-            num_retries: 0,
-            wait_in_ms: 0,
-        }
-    }
-
-    fn wait_and_retry(&mut self) -> bool {
-        if self.num_retries == 0 {
-            false
-        } else {
-            self.num_retries -= 1;
-            let wait_duration = Duration::from_millis(self.wait_in_ms);
-            thread::sleep(wait_duration);
-            true
-        }
-    }
-}
-
-/// The `DirectoryLock` is an object that represents a file lock.
-/// See [`LockType`](struct.LockType.html)
-///
-/// It is transparently associated to a lock file, that gets deleted
-/// on `Drop.` The lock is released automatically on `Drop`.
-pub struct DirectoryLock(Box<Drop + Send + 'static>);
-
-struct DirectoryLockGuard {
-    directory: Box<Directory>,
-    path: PathBuf,
-}
-
-impl<T: Drop + Send + 'static> From<Box<T>> for DirectoryLock {
-    fn from(underlying: Box<T>) -> Self {
-        DirectoryLock(underlying)
-    }
-}
-
-impl Drop for DirectoryLockGuard {
-    fn drop(&mut self) {
-        if let Err(e) = self.directory.delete(&*self.path) {
-            error!("Failed to remove the lock file. {:?}", e);
-        }
-    }
-}
-
-enum TryAcquireLockError {
-    FileExists,
-    IOError(io::Error),
-}
-
-fn try_acquire_lock(
-    filepath: &Path,
-    directory: &mut Directory,
-) -> Result<DirectoryLock, TryAcquireLockError> {
-    let mut write = directory.open_write(filepath).map_err(|e| match e {
-        OpenWriteError::FileAlreadyExists(_) => TryAcquireLockError::FileExists,
-        OpenWriteError::IOError(io_error) => TryAcquireLockError::IOError(io_error.into()),
-    })?;
-    write.flush().map_err(TryAcquireLockError::IOError)?;
-    Ok(DirectoryLock::from(Box::new(DirectoryLockGuard {
-        directory: directory.box_clone(),
-        path: filepath.to_owned(),
-    })))
-}
-
-fn retry_policy(is_blocking: bool) -> RetryPolicy {
-    if is_blocking {
-        RetryPolicy {
-            num_retries: 100,
-            wait_in_ms: 100,
-        }
-    } else {
-        RetryPolicy::no_retry()
-    }
-}
-
 /// Write-once read many (WORM) abstraction for where
 /// tantivy's data should be stored.
@@ -166,55 +73,6 @@ pub trait Directory: DirectoryClone + fmt::Debug + Send + Sync + 'static {
     ///
     /// The file may or may not previously exist.
     fn atomic_write(&mut self, path: &Path, data: &[u8]) -> io::Result<()>;
-
-    /// Acquire a lock in the given directory.
-    ///
-    /// The method is blocking or not depending on the `Lock` object.
-    fn acquire_lock(&self, lock: &Lock) -> Result<DirectoryLock, LockError> {
-        let mut box_directory = self.box_clone();
-        let mut retry_policy = retry_policy(lock.is_blocking);
-        loop {
-            match try_acquire_lock(&lock.filepath, &mut *box_directory) {
-                Ok(result) => {
-                    return Ok(result);
-                }
-                Err(TryAcquireLockError::FileExists) => {
-                    if !retry_policy.wait_and_retry() {
-                        return Err(LockError::LockBusy);
-                    }
-                }
-                Err(TryAcquireLockError::IOError(io_error)) => {
-                    return Err(LockError::IOError(io_error));
-                }
-            }
-        }
-    }
-
-    /// Registers a callback that will be called whenever a change on the `meta.json`
-    /// using the `atomic_write` API is detected.
-    ///
-    /// The behavior when using `.watch()` on a file using `.open_write(...)` is, on the other
-    /// hand, undefined.
-    ///
-    /// The file will be watched for the lifetime of the returned `WatchHandle`. The caller is
-    /// required to keep it.
-    /// It does not override previous callbacks. When the file is modified, all callback that are
-    /// registered (and whose `WatchHandle` is still alive) are triggered.
-    ///
-    /// Internally, tantivy only uses this API to detect new commits to implement the
-    /// `OnCommit` `ReloadPolicy`. Not implementing watch in a `Directory` only prevents the
-    /// `OnCommit` `ReloadPolicy` to work properly.
-    fn watch(&self, watch_callback: WatchCallback) -> WatchHandle;
-
-    /// Ensure that all volatile files reach are persisted (in directory where that makes sense.)
-    ///
-    /// In order to make Near Real Time efficient, tantivy introduced the notion of soft_commit vs
-    /// commit. Commit will call `.flush()`, while softcommit won't.
-    ///
-    /// `meta.json` should be the last file to be flushed.
-    fn flush(&self) -> io::Result<()> {
-        Ok(())
-    }
 }
 
 /// DirectoryClone
@@ -1,56 +0,0 @@
-use std::path::PathBuf;
-
-/// A directory lock.
-///
-/// A lock is associated to a specific path and some
-/// [`LockParams`](./enum.LockParams.html).
-/// Tantivy itself uses only two locks but client application
-/// can use the directory facility to define their own locks.
-/// - [INDEX_WRITER_LOCK](./struct.INDEX_WRITER_LOCK.html)
-/// - [META_LOCK](./struct.META_LOCK.html)
-///
-/// Check out these locks documentation for more information.
-///
-#[derive(Debug)]
-pub struct Lock {
-    /// The lock needs to be associated with its own file `path`.
-    /// Depending on the platform, the lock might rely on the creation
-    /// and deletion of this filepath.
-    pub filepath: PathBuf,
-    /// `lock_params` describes whether acquiring the lock is meant
-    /// to be a blocking operation or a non-blocking.
-    ///
-    /// Acquiring a blocking lock blocks until the lock is
-    /// available.
-    /// Acquiring a blocking lock returns rapidly, either successfully
-    /// or with an error signifying that someone is already holding
-    /// the lock.
-    pub is_blocking: bool,
-}
-
-lazy_static! {
-    /// Only one process should be able to write tantivy's index at a time.
-    /// This lock file, when present, is in charge of preventing other processes to open an IndexWriter.
-    ///
-    /// If the process is killed and this file remains, it is safe to remove it manually.
-    ///
-    /// Failing to acquire this lock usually means a misuse of tantivy's API,
-    /// (creating more than one instance of the `IndexWriter`), are a spurious
-    /// lock file remaining after a crash. In the latter case, removing the file after
-    /// checking no process running tantivy is running is safe.
-    pub static ref INDEX_WRITER_LOCK: Lock = Lock {
-        filepath: PathBuf::from(".tantivy-writer.lock"),
-        is_blocking: false
-    };
-    /// The meta lock file is here to protect the segment files being opened by
-    /// `IndexReader::reload()` from being garbage collected.
-    /// It makes it possible for another process to safely consume
-    /// our index in-writing. Ideally, we may have prefered `RWLock` semantics
-    /// here, but it is difficult to achieve on Windows.
-    ///
-    /// Opening segment readers is a very fast process.
-    pub static ref META_LOCK: Lock = Lock {
-        filepath: PathBuf::from(".tantivy-meta.lock"),
-        is_blocking: true
-    };
-}
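On the left-hand branch, a lock is plain data (`Lock { filepath, is_blocking }`), and any `Directory` can take it through the default `acquire_lock` implementation removed in the previous hunk. The sketch below shows how client code could define and hold its own lock under that API; it assumes `Lock` is re-exported from `tantivy::directory` as the removed doc comment suggests, and the lock-file name and index path are made up for the example.

use std::path::{Path, PathBuf};
use tantivy::directory::{Lock, MmapDirectory};
use tantivy::Directory;

fn main() {
    let dir = MmapDirectory::open(Path::new("/tmp/my_index")).expect("open directory");
    // A custom, non-blocking lock: acquisition fails fast if the lock file already exists.
    let my_lock = Lock {
        filepath: PathBuf::from(".my-app.lock"),
        is_blocking: false,
    };
    let _guard = dir
        .acquire_lock(&my_lock)
        .expect("another process holds the lock");
    // ... work that must not run concurrently with another process ...
    // `_guard` is dropped here: the lock file is deleted and the lock released.
}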
@@ -3,22 +3,6 @@ use std::fmt;
 use std::io;
 use std::path::PathBuf;
 
-/// Error while trying to acquire a directory lock.
-#[derive(Debug, Fail)]
-pub enum LockError {
-    /// Failed to acquired a lock as it is already hold by another
-    /// client.
-    /// - In the context of a blocking lock, this means the lock was not released within some `timeout` period.
-    /// - In the context of a non-blocking lock, this means the lock was busy at the moment of the call.
-    #[fail(
-        display = "Could not acquire lock as it is already held, possibly by a different process."
-    )]
-    LockBusy,
-    /// Trying to acquire a lock failed with an `IOError`
-    #[fail(display = "Failed to acquire the lock due to an io:Error.")]
-    IOError(io::Error),
-}
-
 /// General IO error with an optional path to the offending file.
 #[derive(Debug)]
 pub struct IOError {
@@ -26,12 +10,6 @@ pub struct IOError {
     err: io::Error,
 }
 
-impl Into<io::Error> for IOError {
-    fn into(self) -> io::Error {
-        self.err
-    }
-}
-
 impl fmt::Display for IOError {
     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
         match self.path {
@@ -73,14 +51,6 @@ pub enum OpenDirectoryError {
     DoesNotExist(PathBuf),
     /// The path exists but is not a directory.
     NotADirectory(PathBuf),
-    /// IoError
-    IoError(io::Error),
-}
-
-impl From<io::Error> for OpenDirectoryError {
-    fn from(io_err: io::Error) -> Self {
-        OpenDirectoryError::IoError(io_err)
-    }
 }
 
 impl fmt::Display for OpenDirectoryError {
@@ -92,11 +62,6 @@ impl fmt::Display for OpenDirectoryError {
             OpenDirectoryError::NotADirectory(ref path) => {
                 write!(f, "the path '{:?}' exists but is not a directory", path)
             }
-            OpenDirectoryError::IoError(ref err) => write!(
-                f,
-                "IOError while trying to open/create the directory. {:?}",
-                err
-            ),
         }
     }
 }
@@ -1,11 +1,8 @@
 use core::MANAGED_FILEPATH;
-use directory::error::{DeleteError, IOError, LockError, OpenReadError, OpenWriteError};
-use directory::DirectoryLock;
-use directory::Lock;
-use directory::META_LOCK;
+use directory::error::{DeleteError, IOError, OpenReadError, OpenWriteError};
 use directory::{ReadOnlySource, WritePtr};
-use directory::{WatchCallback, WatchHandle};
-use error::DataCorruption;
+use error::TantivyError;
+use indexer::LockType;
 use serde_json;
 use std::collections::HashSet;
 use std::io;
@@ -62,17 +59,12 @@ fn save_managed_paths(
 
 impl ManagedDirectory {
     /// Wraps a directory as managed directory.
-    pub fn wrap<Dir: Directory>(directory: Dir) -> Result<ManagedDirectory> {
+    pub fn new<Dir: Directory>(directory: Dir) -> Result<ManagedDirectory> {
         match directory.atomic_read(&MANAGED_FILEPATH) {
             Ok(data) => {
                 let managed_files_json = String::from_utf8_lossy(&data);
                 let managed_files: HashSet<PathBuf> = serde_json::from_str(&managed_files_json)
-                    .map_err(|e| {
-                        DataCorruption::new(
-                            MANAGED_FILEPATH.clone(),
-                            format!("Managed file cannot be deserialized: {:?}. ", e),
-                        )
-                    })?;
+                    .map_err(|_| TantivyError::CorruptedFile(MANAGED_FILEPATH.clone()))?;
                 Ok(ManagedDirectory {
                     directory: Box::new(directory),
                     meta_informations: Arc::new(RwLock::new(MetaInformation {
@@ -95,9 +87,6 @@ impl ManagedDirectory {
     ///
     /// * `living_files` - List of files that are still used by the index.
     ///
-    /// The use a callback ensures that the list of living_files is computed
-    /// while we hold the lock on meta.
-    ///
     /// This method does not panick nor returns errors.
     /// If a file cannot be deleted (for permission reasons for instance)
     /// an error is simply logged, and the file remains in the list of managed
@@ -128,7 +117,7 @@ impl ManagedDirectory {
         // 2) writer change meta.json (for instance after a merge or a commit)
         // 3) gc kicks in.
         // 4) gc removes a file that was useful for process B, before process B opened it.
-        if let Ok(_meta_lock) = self.acquire_lock(&META_LOCK) {
+        if let Ok(_meta_lock) = LockType::MetaLock.acquire_lock(self) {
             let living_files = get_living_files();
             for managed_path in &meta_informations_rlock.managed_paths {
                 if !living_files.contains(managed_path) {
@@ -238,14 +227,6 @@ impl Directory for ManagedDirectory {
     fn exists(&self, path: &Path) -> bool {
         self.directory.exists(path)
     }
-
-    fn acquire_lock(&self, lock: &Lock) -> result::Result<DirectoryLock, LockError> {
-        self.directory.acquire_lock(lock)
-    }
-
-    fn watch(&self, watch_callback: WatchCallback) -> WatchHandle {
-        self.directory.watch(watch_callback)
-    }
 }
 
 impl Clone for ManagedDirectory {
@@ -260,98 +241,95 @@ impl Clone for ManagedDirectory {
 #[cfg(test)]
 mod tests {
 
+    use super::*;
     #[cfg(feature = "mmap")]
-    mod mmap_specific {
+    use directory::MmapDirectory;
+    use std::io::Write;
+    use std::path::Path;
+    use tempdir::TempDir;
 
-        use super::super::*;
-        use std::path::Path;
-        use tempdir::TempDir;
-
-        lazy_static! {
-            static ref TEST_PATH1: &'static Path = Path::new("some_path_for_test");
-            static ref TEST_PATH2: &'static Path = Path::new("some_path_for_test2");
-        }
-
-        use directory::MmapDirectory;
-        use std::io::Write;
-
-        #[test]
-        fn test_managed_directory() {
-            let tempdir = TempDir::new("index").unwrap();
-            let tempdir_path = PathBuf::from(tempdir.path());
-            {
-                let mmap_directory = MmapDirectory::open(&tempdir_path).unwrap();
-                let mut managed_directory = ManagedDirectory::wrap(mmap_directory).unwrap();
-                {
-                    let mut write_file = managed_directory.open_write(*TEST_PATH1).unwrap();
-                    write_file.flush().unwrap();
-                }
-                {
-                    managed_directory
-                        .atomic_write(*TEST_PATH2, &vec![0u8, 1u8])
-                        .unwrap();
-                }
-                {
-                    assert!(managed_directory.exists(*TEST_PATH1));
-                    assert!(managed_directory.exists(*TEST_PATH2));
-                }
-                {
-                    let living_files: HashSet<PathBuf> =
-                        [TEST_PATH1.to_owned()].into_iter().cloned().collect();
-                    managed_directory.garbage_collect(|| living_files);
-                }
-                {
-                    assert!(managed_directory.exists(*TEST_PATH1));
-                    assert!(!managed_directory.exists(*TEST_PATH2));
-                }
-            }
-            {
-                let mmap_directory = MmapDirectory::open(&tempdir_path).unwrap();
-                let mut managed_directory = ManagedDirectory::wrap(mmap_directory).unwrap();
-                {
-                    assert!(managed_directory.exists(*TEST_PATH1));
-                    assert!(!managed_directory.exists(*TEST_PATH2));
-                }
-                {
-                    let living_files: HashSet<PathBuf> = HashSet::new();
-                    managed_directory.garbage_collect(|| living_files);
-                }
-                {
-                    assert!(!managed_directory.exists(*TEST_PATH1));
-                    assert!(!managed_directory.exists(*TEST_PATH2));
-                }
-            }
-        }
-
-        #[test]
-        fn test_managed_directory_gc_while_mmapped() {
-            let tempdir = TempDir::new("index").unwrap();
-            let tempdir_path = PathBuf::from(tempdir.path());
-            let living_files = HashSet::new();
-
+    lazy_static! {
+        static ref TEST_PATH1: &'static Path = Path::new("some_path_for_test");
+        static ref TEST_PATH2: &'static Path = Path::new("some_path_for_test2");
+    }
+
+    #[test]
+    #[cfg(feature = "mmap")]
+    fn test_managed_directory() {
+        let tempdir = TempDir::new("index").unwrap();
+        let tempdir_path = PathBuf::from(tempdir.path());
+        {
             let mmap_directory = MmapDirectory::open(&tempdir_path).unwrap();
-            let mut managed_directory = ManagedDirectory::wrap(mmap_directory).unwrap();
-            managed_directory
-                .atomic_write(*TEST_PATH1, &vec![0u8, 1u8])
-                .unwrap();
-            assert!(managed_directory.exists(*TEST_PATH1));
-
-            let _mmap_read = managed_directory.open_read(*TEST_PATH1).unwrap();
-            managed_directory.garbage_collect(|| living_files.clone());
-            if cfg!(target_os = "windows") {
-                // On Windows, gc should try and fail the file as it is mmapped.
+            let mut managed_directory = ManagedDirectory::new(mmap_directory).unwrap();
+            {
+                let mut write_file = managed_directory.open_write(*TEST_PATH1).unwrap();
+                write_file.flush().unwrap();
+            }
+            {
+                managed_directory
+                    .atomic_write(*TEST_PATH2, &vec![0u8, 1u8])
+                    .unwrap();
+            }
+            {
                 assert!(managed_directory.exists(*TEST_PATH1));
-                // unmap should happen here.
-                drop(_mmap_read);
-                // The file should still be in the list of managed file and
-                // eventually be deleted once mmap is released.
+                assert!(managed_directory.exists(*TEST_PATH2));
+            }
+            {
+                let living_files: HashSet<PathBuf> =
+                    [TEST_PATH1.to_owned()].into_iter().cloned().collect();
                 managed_directory.garbage_collect(|| living_files);
-                assert!(!managed_directory.exists(*TEST_PATH1));
-            } else {
-                assert!(!managed_directory.exists(*TEST_PATH1));
+            }
+            {
+                assert!(managed_directory.exists(*TEST_PATH1));
+                assert!(!managed_directory.exists(*TEST_PATH2));
             }
         }
+        {
+            let mmap_directory = MmapDirectory::open(&tempdir_path).unwrap();
+            let mut managed_directory = ManagedDirectory::new(mmap_directory).unwrap();
+            {
+                assert!(managed_directory.exists(*TEST_PATH1));
+                assert!(!managed_directory.exists(*TEST_PATH2));
+            }
+            {
+                let living_files: HashSet<PathBuf> = HashSet::new();
+                managed_directory.garbage_collect(|| living_files);
+            }
+            {
+                assert!(!managed_directory.exists(*TEST_PATH1));
+                assert!(!managed_directory.exists(*TEST_PATH2));
+            }
+        }
+    }
+
+    #[test]
+    #[cfg(feature = "mmap ")]
+    fn test_managed_directory_gc_while_mmapped() {
+        let tempdir = TempDir::new("index").unwrap();
+        let tempdir_path = PathBuf::from(tempdir.path());
+        let living_files = HashSet::new();
+
+        let mmap_directory = MmapDirectory::open(&tempdir_path).unwrap();
+        let mut managed_directory = ManagedDirectory::new(mmap_directory).unwrap();
+        managed_directory
+            .atomic_write(*TEST_PATH1, &vec![0u8, 1u8])
+            .unwrap();
+        assert!(managed_directory.exists(*TEST_PATH1));
+
+        let _mmap_read = managed_directory.open_read(*TEST_PATH1).unwrap();
+        managed_directory.garbage_collect(|| living_files.clone());
+        if cfg!(target_os = "windows") {
+            // On Windows, gc should try and fail the file as it is mmapped.
+            assert!(managed_directory.exists(*TEST_PATH1));
+            // unmap should happen here.
+            drop(_mmap_read);
+            // The file should still be in the list of managed file and
+            // eventually be deleted once mmap is released.
+            managed_directory.garbage_collect(|| living_files);
+            assert!(!managed_directory.exists(*TEST_PATH1));
+        } else {
+            assert!(!managed_directory.exists(*TEST_PATH1));
+        }
     }
 
 }
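Both branches call `garbage_collect` with a closure rather than a precomputed set; the doc comment removed on the left-hand side explains why: the closure runs only after the meta lock has been taken, so the list of living files cannot go stale in between. A minimal illustration of the call shape, written in the same in-crate style as the tests above (the file name is made up, and `ManagedDirectory` is the type from this hunk):

use std::collections::HashSet;
use std::path::PathBuf;

// `managed_directory` is a ManagedDirectory as in the tests above.
fn gc_example(managed_directory: &mut ManagedDirectory) {
    // The closure is evaluated by garbage_collect itself, under the meta lock,
    // so the set of files to keep is computed at the latest possible moment.
    managed_directory.garbage_collect(|| {
        let mut living: HashSet<PathBuf> = HashSet::new();
        living.insert(PathBuf::from("some_segment_file.idx"));
        living
    });
}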
@@ -1,24 +1,12 @@
|
|||||||
extern crate fs2;
|
|
||||||
extern crate notify;
|
|
||||||
|
|
||||||
use self::fs2::FileExt;
|
|
||||||
use self::notify::RawEvent;
|
|
||||||
use self::notify::RecursiveMode;
|
|
||||||
use self::notify::Watcher;
|
|
||||||
use atomicwrites;
|
use atomicwrites;
|
||||||
use core::META_FILEPATH;
|
use common::make_io_err;
|
||||||
use directory::error::LockError;
|
|
||||||
use directory::error::{DeleteError, IOError, OpenDirectoryError, OpenReadError, OpenWriteError};
|
use directory::error::{DeleteError, IOError, OpenDirectoryError, OpenReadError, OpenWriteError};
|
||||||
use directory::read_only_source::BoxedData;
|
use directory::shared_vec_slice::SharedVecSlice;
|
||||||
use directory::Directory;
|
use directory::Directory;
|
||||||
use directory::DirectoryLock;
|
|
||||||
use directory::Lock;
|
|
||||||
use directory::ReadOnlySource;
|
use directory::ReadOnlySource;
|
||||||
use directory::WatchCallback;
|
|
||||||
use directory::WatchCallbackList;
|
|
||||||
use directory::WatchHandle;
|
|
||||||
use directory::WritePtr;
|
use directory::WritePtr;
|
||||||
use memmap::Mmap;
|
use fst::raw::MmapReadOnly;
|
||||||
|
use std::collections::hash_map::Entry as HashMapEntry;
|
||||||
use std::collections::HashMap;
|
use std::collections::HashMap;
|
||||||
use std::convert::From;
|
use std::convert::From;
|
||||||
use std::fmt;
|
use std::fmt;
|
||||||
@@ -28,22 +16,14 @@ use std::io::{self, Seek, SeekFrom};
|
|||||||
use std::io::{BufWriter, Read, Write};
|
use std::io::{BufWriter, Read, Write};
|
||||||
use std::path::{Path, PathBuf};
|
use std::path::{Path, PathBuf};
|
||||||
use std::result;
|
use std::result;
|
||||||
use std::sync::mpsc::{channel, Receiver, Sender};
|
|
||||||
use std::sync::Arc;
|
use std::sync::Arc;
|
||||||
use std::sync::Mutex;
|
|
||||||
use std::sync::RwLock;
|
use std::sync::RwLock;
|
||||||
use std::sync::Weak;
|
|
||||||
use std::thread;
|
|
||||||
use tempdir::TempDir;
|
use tempdir::TempDir;
|
||||||
|
|
||||||
/// Create a default io error given a string.
|
|
||||||
pub(crate) fn make_io_err(msg: String) -> io::Error {
|
|
||||||
io::Error::new(io::ErrorKind::Other, msg)
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Returns None iff the file exists, can be read, but is empty (and hence
|
/// Returns None iff the file exists, can be read, but is empty (and hence
|
||||||
/// cannot be mmapped)
|
/// cannot be mmapped).
|
||||||
fn open_mmap(full_path: &Path) -> result::Result<Option<Mmap>, OpenReadError> {
|
///
|
||||||
|
fn open_mmap(full_path: &Path) -> result::Result<Option<MmapReadOnly>, OpenReadError> {
|
||||||
let file = File::open(full_path).map_err(|e| {
|
let file = File::open(full_path).map_err(|e| {
|
||||||
if e.kind() == io::ErrorKind::NotFound {
|
if e.kind() == io::ErrorKind::NotFound {
|
||||||
OpenReadError::FileDoesNotExist(full_path.to_owned())
|
OpenReadError::FileDoesNotExist(full_path.to_owned())
|
||||||
@@ -62,7 +42,7 @@ fn open_mmap(full_path: &Path) -> result::Result<Option<Mmap>, OpenReadError> {
|
|||||||
return Ok(None);
|
return Ok(None);
|
||||||
}
|
}
|
||||||
unsafe {
|
unsafe {
|
||||||
memmap::Mmap::map(&file)
|
MmapReadOnly::open(&file)
|
||||||
.map(Some)
|
.map(Some)
|
||||||
.map_err(|e| From::from(IOError::with_path(full_path.to_owned(), e)))
|
.map_err(|e| From::from(IOError::with_path(full_path.to_owned(), e)))
|
||||||
}
|
}
|
||||||
@@ -85,7 +65,7 @@ pub struct CacheInfo {
|
|||||||
|
|
||||||
struct MmapCache {
|
struct MmapCache {
|
||||||
counters: CacheCounters,
|
counters: CacheCounters,
|
||||||
cache: HashMap<PathBuf, Weak<BoxedData>>,
|
cache: HashMap<PathBuf, MmapReadOnly>,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl Default for MmapCache {
|
 impl Default for MmapCache {
@@ -98,7 +78,12 @@ impl Default for MmapCache {
 }
 
 impl MmapCache {
-    fn get_info(&self) -> CacheInfo {
+    /// Removes a `MmapReadOnly` entry from the mmap cache.
+    fn discard_from_cache(&mut self, full_path: &Path) -> bool {
+        self.cache.remove(full_path).is_some()
+    }
+
+    fn get_info(&mut self) -> CacheInfo {
         let paths: Vec<PathBuf> = self.cache.keys().cloned().collect();
         CacheInfo {
             counters: self.counters.clone(),
@@ -106,105 +91,23 @@ impl MmapCache {
         }
     }
 
-    fn remove_weak_ref(&mut self) {
-        let keys_to_remove: Vec<PathBuf> = self
-            .cache
-            .iter()
-            .filter(|(_, mmap_weakref)| mmap_weakref.upgrade().is_none())
-            .map(|(key, _)| key.clone())
-            .collect();
-        for key in keys_to_remove {
-            self.cache.remove(&key);
-        }
-    }
-
-    // Returns None if the file exists but as a len of 0 (and hence is not mmappable).
-    fn get_mmap(&mut self, full_path: &Path) -> Result<Option<Arc<BoxedData>>, OpenReadError> {
-        if let Some(mmap_weak) = self.cache.get(full_path) {
-            if let Some(mmap_arc) = mmap_weak.upgrade() {
-                self.counters.hit += 1;
-                return Ok(Some(mmap_arc));
-            }
-        }
-        self.cache.remove(full_path);
-        self.counters.miss += 1;
-        Ok(if let Some(mmap) = open_mmap(full_path)? {
-            let mmap_arc: Arc<BoxedData> = Arc::new(Box::new(mmap));
-            let mmap_weak = Arc::downgrade(&mmap_arc);
-            self.cache.insert(full_path.to_owned(), mmap_weak);
-            Some(mmap_arc)
-        } else {
-            None
-        })
-    }
-}
-
-struct InnerWatcherWrapper {
-    _watcher: Mutex<notify::RecommendedWatcher>,
-    watcher_router: WatchCallbackList,
-}
-
-impl InnerWatcherWrapper {
-    pub fn new(path: &Path) -> Result<(Self, Receiver<notify::RawEvent>), notify::Error> {
-        let (tx, watcher_recv): (Sender<RawEvent>, Receiver<RawEvent>) = channel();
-        // We need to initialize the
-        let mut watcher = notify::raw_watcher(tx)?;
-        watcher.watch(path, RecursiveMode::Recursive)?;
-        let inner = InnerWatcherWrapper {
-            _watcher: Mutex::new(watcher),
-            watcher_router: Default::default(),
-        };
-        Ok((inner, watcher_recv))
-    }
-}
-
-#[derive(Clone)]
-pub(crate) struct WatcherWrapper {
-    inner: Arc<InnerWatcherWrapper>,
-}
-
-impl WatcherWrapper {
-    pub fn new(path: &Path) -> Result<Self, OpenDirectoryError> {
-        let (inner, watcher_recv) = InnerWatcherWrapper::new(path).map_err(|err| match err {
-            notify::Error::PathNotFound => OpenDirectoryError::DoesNotExist(path.to_owned()),
-            _ => {
-                panic!("Unknown error while starting watching directory {:?}", path);
-            }
-        })?;
-        let watcher_wrapper = WatcherWrapper {
-            inner: Arc::new(inner),
-        };
-        let watcher_wrapper_clone = watcher_wrapper.clone();
-        thread::Builder::new()
-            .name("meta-file-watch-thread".to_string())
-            .spawn(move || {
-                loop {
-                    match watcher_recv.recv().map(|evt| evt.path) {
-                        Ok(Some(changed_path)) => {
-                            // ... Actually subject to false positive.
-                            // We might want to be more accurate than this at one point.
-                            if let Some(filename) = changed_path.file_name() {
-                                if filename == *META_FILEPATH {
-                                    watcher_wrapper_clone.inner.watcher_router.broadcast();
-                                }
-                            }
-                        }
-                        Ok(None) => {
-                            // not an event we are interested in.
-                        }
-                        Err(_e) => {
-                            // the watch send channel was dropped
-                            break;
-                        }
-                    }
-                }
-            })
-            .expect("Failed to spawn thread to watch meta.json");
-        Ok(watcher_wrapper)
-    }
-
-    pub fn watch(&mut self, watch_callback: WatchCallback) -> WatchHandle {
-        self.inner.watcher_router.subscribe(watch_callback)
-    }
-}
+    fn get_mmap(&mut self, full_path: &Path) -> Result<Option<MmapReadOnly>, OpenReadError> {
+        Ok(match self.cache.entry(full_path.to_owned()) {
+            HashMapEntry::Occupied(occupied_entry) => {
+                let mmap = occupied_entry.get();
+                self.counters.hit += 1;
+                Some(mmap.clone())
+            }
+            HashMapEntry::Vacant(vacant_entry) => {
+                self.counters.miss += 1;
+                if let Some(mmap) = open_mmap(full_path)? {
+                    vacant_entry.insert(mmap.clone());
+                    Some(mmap)
+                } else {
+                    None
+                }
+            }
+        })
+    }
+}
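The removed (left-hand) cache above keeps `Weak` references so that a mapping is released as soon as the last reader drops it, which is why `remove_weak_ref` has to sweep dead entries before cache statistics are reported. Below is a minimal, self-contained sketch of that weak-reference caching pattern; the names and the `Vec<u8>` payloads standing in for mmaps are illustrative, not tantivy's API.

use std::collections::HashMap;
use std::path::{Path, PathBuf};
use std::sync::{Arc, Weak};

#[derive(Default)]
struct WeakCache {
    cache: HashMap<PathBuf, Weak<Vec<u8>>>,
}

impl WeakCache {
    // Returns a shared handle, reloading the data only when no reader keeps it alive.
    fn get_or_load(&mut self, path: &Path, load: impl Fn() -> Vec<u8>) -> Arc<Vec<u8>> {
        if let Some(weak) = self.cache.get(path) {
            if let Some(alive) = weak.upgrade() {
                return alive; // cache hit: another reader still holds a strong Arc
            }
        }
        let fresh = Arc::new(load());
        self.cache.insert(path.to_owned(), Arc::downgrade(&fresh));
        fresh
    }

    // Drops entries whose strong references are all gone, like `remove_weak_ref`.
    fn sweep(&mut self) {
        self.cache.retain(|_, weak| weak.upgrade().is_some());
    }
}

fn main() {
    let mut cache = WeakCache::default();
    let data = cache.get_or_load(Path::new("seg.idx"), || vec![1, 2, 3]);
    assert_eq!(cache.cache.len(), 1);
    drop(data);
    cache.sweep();
    assert_eq!(cache.cache.len(), 0);
}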
@@ -212,72 +115,33 @@ impl WatcherWrapper {
 ///
 /// The Mmap objects are cached to limit the
 /// system calls.
-///
-/// In the `MmapDirectory`, locks are implemented using the `fs2` crate definition of locks.
-///
-/// On MacOS & linux, it relies on `flock` (aka `BSD Lock`). These locks solve most of the
-/// problems related to POSIX Locks, but their contract may not be respected on `NFS`,
-/// depending on the implementation.
-///
-/// On Windows the semantics are again different.
 #[derive(Clone)]
 pub struct MmapDirectory {
-    inner: Arc<MmapDirectoryInner>,
-}
-
-struct MmapDirectoryInner {
     root_path: PathBuf,
-    mmap_cache: RwLock<MmapCache>,
-    _temp_directory: Option<TempDir>,
-    watcher: RwLock<WatcherWrapper>,
-}
-
-impl MmapDirectoryInner {
-    fn new(
-        root_path: PathBuf,
-        temp_directory: Option<TempDir>,
-    ) -> Result<MmapDirectoryInner, OpenDirectoryError> {
-        let watch_wrapper = WatcherWrapper::new(&root_path)?;
-        let mmap_directory_inner = MmapDirectoryInner {
-            root_path,
-            mmap_cache: Default::default(),
-            _temp_directory: temp_directory,
-            watcher: RwLock::new(watch_wrapper),
-        };
-        Ok(mmap_directory_inner)
-    }
-
-    fn watch(&self, watch_callback: WatchCallback) -> WatchHandle {
-        let mut wlock = self.watcher.write().unwrap();
-        wlock.watch(watch_callback)
-    }
+    mmap_cache: Arc<RwLock<MmapCache>>,
+    _temp_directory: Arc<Option<TempDir>>,
 }
 
 impl fmt::Debug for MmapDirectory {
     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
-        write!(f, "MmapDirectory({:?})", self.inner.root_path)
+        write!(f, "MmapDirectory({:?})", self.root_path)
     }
 }
 
 impl MmapDirectory {
-    fn new(
-        root_path: PathBuf,
-        temp_directory: Option<TempDir>,
-    ) -> Result<MmapDirectory, OpenDirectoryError> {
-        let inner = MmapDirectoryInner::new(root_path, temp_directory)?;
-        Ok(MmapDirectory {
-            inner: Arc::new(inner),
-        })
-    }
-
     /// Creates a new MmapDirectory in a temporary directory.
     ///
     /// This is mostly useful to test the MmapDirectory itself.
     /// For your unit tests, prefer the RAMDirectory.
-    pub fn create_from_tempdir() -> Result<MmapDirectory, OpenDirectoryError> {
-        let tempdir = TempDir::new("index").map_err(OpenDirectoryError::IoError)?;
+    pub fn create_from_tempdir() -> io::Result<MmapDirectory> {
+        let tempdir = TempDir::new("index")?;
         let tempdir_path = PathBuf::from(tempdir.path());
-        MmapDirectory::new(tempdir_path, Some(tempdir))
+        let directory = MmapDirectory {
+            root_path: tempdir_path,
+            mmap_cache: Arc::new(RwLock::new(MmapCache::default())),
+            _temp_directory: Arc::new(Some(tempdir)),
+        };
+        Ok(directory)
     }
 
     /// Opens a MmapDirectory in a directory.
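Both sides of this hunk expose the same two entry points, `create_from_tempdir` for throwaway indices and `open` for an existing directory; only the internal representation differs. A hedged usage sketch, assuming the right-hand (0.7-style) signatures shown above and the `tantivy::directory` re-exports from this crate:

use tantivy::directory::MmapDirectory;

fn main() -> std::io::Result<()> {
    // Backed by a TempDir that is removed once the directory is dropped.
    let dir = MmapDirectory::create_from_tempdir()?;
    // Clones are cheap: the mmap cache and the temp dir handle sit behind `Arc`s.
    let _clone = dir.clone();
    println!("{:?}", dir); // uses the Debug impl shown in the diff
    Ok(())
}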
@@ -295,14 +159,18 @@ impl MmapDirectory {
                 directory_path,
             )))
         } else {
-            Ok(MmapDirectory::new(PathBuf::from(directory_path), None)?)
+            Ok(MmapDirectory {
+                root_path: PathBuf::from(directory_path),
+                mmap_cache: Arc::new(RwLock::new(MmapCache::default())),
+                _temp_directory: Arc::new(None),
+            })
         }
     }
 
     /// Joins a relative_path to the directory `root_path`
     /// to create a proper complete `filepath`.
     fn resolve_path(&self, relative_path: &Path) -> PathBuf {
-        self.inner.root_path.join(relative_path)
+        self.root_path.join(relative_path)
     }
 
     /// Sync the root directory.
@@ -327,7 +195,7 @@ impl MmapDirectory {
                 .custom_flags(winbase::FILE_FLAG_BACKUP_SEMANTICS);
         }
 
-        let fd = open_opts.open(&self.inner.root_path)?;
+        let fd = open_opts.open(&self.root_path)?;
         fd.sync_all()?;
         Ok(())
     }
@@ -337,38 +205,17 @@ impl MmapDirectory {
     ///
     /// The `MmapDirectory` embeds a `MmapCache`
     /// to avoid multiplying the `mmap` system calls.
-    pub fn get_cache_info(&self) -> CacheInfo {
-        self.inner
-            .mmap_cache
-            .write()
-            .expect("mmap cache lock is poisoned")
-            .remove_weak_ref();
-        self.inner
-            .mmap_cache
-            .read()
-            .expect("Mmap cache lock is poisoned.")
-            .get_info()
+    pub fn get_cache_info(&mut self) -> CacheInfo {
+        self.mmap_cache
+            .write()
+            .expect("Mmap cache lock is poisoned.")
+            .get_info()
     }
 }
 
-/// We rely on fs2 for file locking. On Windows & MacOS this
-/// uses BSD locks (`flock`). The lock is actually released when
-/// the `File` object is dropped and its associated file descriptor
-/// is closed.
-struct ReleaseLockFile {
-    _file: File,
-    path: PathBuf,
-}
-
-impl Drop for ReleaseLockFile {
-    fn drop(&mut self) {
-        debug!("Releasing lock {:?}", self.path);
-    }
-}
-
 /// This Write wraps a File, but has the specificity of
 /// calling `sync_all` on flush.
-pub struct SafeFileWriter(File);
+struct SafeFileWriter(File);
 
 impl SafeFileWriter {
     fn new(file: File) -> SafeFileWriter {
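The deleted comment block above describes the locking model that `acquire_lock` (further down in this diff) implements with the `fs2` crate: an exclusive BSD-style `flock` that is released when the owning `File` handle is dropped. A rough standalone sketch of that mechanism, assuming the `fs2` crate named in the removed comment and a hypothetical `index.lock` path; this is not tantivy's exact wrapper:

use fs2::FileExt;
use std::fs::OpenOptions;
use std::io;

fn main() -> io::Result<()> {
    // Create the lock file if needed, then take an exclusive flock on it.
    let file = OpenOptions::new()
        .write(true)
        .create(true)
        .open("index.lock")?;
    if file.try_lock_exclusive().is_err() {
        // Another process holds the lock; block until it is released.
        file.lock_exclusive()?;
    }
    // ... critical section ...
    // Dropping `file` closes the descriptor and releases the flock.
    drop(file);
    Ok(())
}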
@@ -398,7 +245,7 @@ impl Directory for MmapDirectory {
|
|||||||
debug!("Open Read {:?}", path);
|
debug!("Open Read {:?}", path);
|
||||||
let full_path = self.resolve_path(path);
|
let full_path = self.resolve_path(path);
|
||||||
|
|
||||||
let mut mmap_cache = self.inner.mmap_cache.write().map_err(|_| {
|
let mut mmap_cache = self.mmap_cache.write().map_err(|_| {
|
||||||
let msg = format!(
|
let msg = format!(
|
||||||
"Failed to acquired write lock \
|
"Failed to acquired write lock \
|
||||||
on mmap cache while reading {:?}",
|
on mmap cache while reading {:?}",
|
||||||
@@ -406,34 +253,11 @@ impl Directory for MmapDirectory {
|
|||||||
);
|
);
|
||||||
IOError::with_path(path.to_owned(), make_io_err(msg))
|
IOError::with_path(path.to_owned(), make_io_err(msg))
|
||||||
})?;
|
})?;
|
||||||
|
|
||||||
Ok(mmap_cache
|
Ok(mmap_cache
|
||||||
.get_mmap(&full_path)?
|
.get_mmap(&full_path)?
|
||||||
.map(ReadOnlySource::from)
|
.map(ReadOnlySource::Mmap)
|
||||||
.unwrap_or_else(ReadOnlySource::empty))
|
.unwrap_or_else(|| ReadOnlySource::Anonymous(SharedVecSlice::empty())))
|
||||||
}
|
|
||||||
|
|
||||||
/// Any entry associated to the path in the mmap will be
|
|
||||||
/// removed before the file is deleted.
|
|
||||||
fn delete(&self, path: &Path) -> result::Result<(), DeleteError> {
|
|
||||||
debug!("Deleting file {:?}", path);
|
|
||||||
let full_path = self.resolve_path(path);
|
|
||||||
match fs::remove_file(&full_path) {
|
|
||||||
Ok(_) => self
|
|
||||||
.sync_directory()
|
|
||||||
.map_err(|e| IOError::with_path(path.to_owned(), e).into()),
|
|
||||||
Err(e) => {
|
|
||||||
if e.kind() == io::ErrorKind::NotFound {
|
|
||||||
Err(DeleteError::FileDoesNotExist(path.to_owned()))
|
|
||||||
} else {
|
|
||||||
Err(IOError::with_path(path.to_owned(), e).into())
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
fn exists(&self, path: &Path) -> bool {
|
|
||||||
let full_path = self.resolve_path(path);
|
|
||||||
full_path.exists()
|
|
||||||
}
|
}
|
||||||
|
|
||||||
fn open_write(&mut self, path: &Path) -> Result<WritePtr, OpenWriteError> {
|
fn open_write(&mut self, path: &Path) -> Result<WritePtr, OpenWriteError> {
|
||||||
@@ -466,6 +290,44 @@ impl Directory for MmapDirectory {
|
|||||||
Ok(BufWriter::new(Box::new(writer)))
|
Ok(BufWriter::new(Box::new(writer)))
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Any entry associated to the path in the mmap will be
|
||||||
|
/// removed before the file is deleted.
|
||||||
|
fn delete(&self, path: &Path) -> result::Result<(), DeleteError> {
|
||||||
|
debug!("Deleting file {:?}", path);
|
||||||
|
let full_path = self.resolve_path(path);
|
||||||
|
let mut mmap_cache = self.mmap_cache.write().map_err(|_| {
|
||||||
|
let msg = format!(
|
||||||
|
"Failed to acquired write lock \
|
||||||
|
on mmap cache while deleting {:?}",
|
||||||
|
path
|
||||||
|
);
|
||||||
|
IOError::with_path(path.to_owned(), make_io_err(msg))
|
||||||
|
})?;
|
||||||
|
mmap_cache.discard_from_cache(path);
|
||||||
|
|
||||||
|
// Removing the entry in the MMap cache.
|
||||||
|
// The munmap will appear on Drop,
|
||||||
|
// when the last reference is gone.
|
||||||
|
mmap_cache.cache.remove(&full_path);
|
||||||
|
match fs::remove_file(&full_path) {
|
||||||
|
Ok(_) => self
|
||||||
|
.sync_directory()
|
||||||
|
.map_err(|e| IOError::with_path(path.to_owned(), e).into()),
|
||||||
|
Err(e) => {
|
||||||
|
if e.kind() == io::ErrorKind::NotFound {
|
||||||
|
Err(DeleteError::FileDoesNotExist(path.to_owned()))
|
||||||
|
} else {
|
||||||
|
Err(IOError::with_path(path.to_owned(), e).into())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn exists(&self, path: &Path) -> bool {
|
||||||
|
let full_path = self.resolve_path(path);
|
||||||
|
full_path.exists()
|
||||||
|
}
|
||||||
|
|
||||||
fn atomic_read(&self, path: &Path) -> Result<Vec<u8>, OpenReadError> {
|
fn atomic_read(&self, path: &Path) -> Result<Vec<u8>, OpenReadError> {
|
||||||
let full_path = self.resolve_path(path);
|
let full_path = self.resolve_path(path);
|
||||||
let mut buffer = Vec::new();
|
let mut buffer = Vec::new();
|
||||||
@@ -492,30 +354,6 @@ impl Directory for MmapDirectory {
|
|||||||
meta_file.write(|f| f.write_all(data))?;
|
meta_file.write(|f| f.write_all(data))?;
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
fn acquire_lock(&self, lock: &Lock) -> Result<DirectoryLock, LockError> {
|
|
||||||
let full_path = self.resolve_path(&lock.filepath);
|
|
||||||
// We make sure that the file exists.
|
|
||||||
let file: File = OpenOptions::new()
|
|
||||||
.write(true)
|
|
||||||
.create(true) //< if the file does not exist yet, create it.
|
|
||||||
.open(&full_path)
|
|
||||||
.map_err(LockError::IOError)?;
|
|
||||||
if lock.is_blocking {
|
|
||||||
file.lock_exclusive().map_err(LockError::IOError)?;
|
|
||||||
} else {
|
|
||||||
file.try_lock_exclusive().map_err(|_| LockError::LockBusy)?
|
|
||||||
}
|
|
||||||
// dropping the file handle will release the lock.
|
|
||||||
Ok(DirectoryLock::from(Box::new(ReleaseLockFile {
|
|
||||||
path: lock.filepath.clone(),
|
|
||||||
_file: file,
|
|
||||||
})))
|
|
||||||
}
|
|
||||||
|
|
||||||
fn watch(&self, watch_callback: WatchCallback) -> WatchHandle {
|
|
||||||
self.inner.watch(watch_callback)
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
@@ -525,18 +363,6 @@ mod tests {
|
|||||||
// The following tests are specific to the MmapDirectory
|
// The following tests are specific to the MmapDirectory
|
||||||
|
|
||||||
use super::*;
|
use super::*;
|
||||||
use schema::{Schema, SchemaBuilder, TEXT};
|
|
||||||
use std::fs;
|
|
||||||
use std::sync::atomic::{AtomicUsize, Ordering};
|
|
||||||
use std::thread;
|
|
||||||
use std::time::Duration;
|
|
||||||
use Index;
|
|
||||||
use ReloadPolicy;
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn test_open_non_existant_path() {
|
|
||||||
assert!(MmapDirectory::open(PathBuf::from("./nowhere")).is_err());
|
|
||||||
}
|
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn test_open_empty() {
|
fn test_open_empty() {
|
||||||
@@ -556,7 +382,7 @@ mod tests {
|
|||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn test_cache() {
|
fn test_cache() {
|
||||||
let content = b"abc";
|
let content = "abc".as_bytes();
|
||||||
|
|
||||||
// here we test if the cache releases
|
// here we test if the cache releases
|
||||||
// mmaps correctly.
|
// mmaps correctly.
|
||||||
@@ -572,104 +398,26 @@ mod tests {
|
|||||||
w.flush().unwrap();
|
w.flush().unwrap();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
{
|
||||||
let mut keep = vec![];
|
for (i, path) in paths.iter().enumerate() {
|
||||||
for (i, path) in paths.iter().enumerate() {
|
let _r = mmap_directory.open_read(path).unwrap();
|
||||||
keep.push(mmap_directory.open_read(path).unwrap());
|
assert_eq!(mmap_directory.get_cache_info().mmapped.len(), i + 1);
|
||||||
assert_eq!(mmap_directory.get_cache_info().mmapped.len(), i + 1);
|
}
|
||||||
}
|
for path in paths.iter() {
|
||||||
assert_eq!(mmap_directory.get_cache_info().counters.hit, 0);
|
let _r = mmap_directory.open_read(path).unwrap();
|
||||||
assert_eq!(mmap_directory.get_cache_info().counters.miss, 10);
|
assert_eq!(mmap_directory.get_cache_info().mmapped.len(), num_paths);
|
||||||
assert_eq!(mmap_directory.get_cache_info().mmapped.len(), 10);
|
}
|
||||||
for path in paths.iter() {
|
for (i, path) in paths.iter().enumerate() {
|
||||||
let _r = mmap_directory.open_read(path).unwrap();
|
mmap_directory.delete(path).unwrap();
|
||||||
assert_eq!(mmap_directory.get_cache_info().mmapped.len(), num_paths);
|
assert_eq!(
|
||||||
|
mmap_directory.get_cache_info().mmapped.len(),
|
||||||
|
num_paths - i - 1
|
||||||
|
);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
assert_eq!(mmap_directory.get_cache_info().counters.hit, 10);
|
assert_eq!(mmap_directory.get_cache_info().counters.hit, 10);
|
||||||
assert_eq!(mmap_directory.get_cache_info().counters.miss, 10);
|
assert_eq!(mmap_directory.get_cache_info().counters.miss, 10);
|
||||||
assert_eq!(mmap_directory.get_cache_info().mmapped.len(), 10);
|
|
||||||
|
|
||||||
for path in paths.iter() {
|
|
||||||
let _r = mmap_directory.open_read(path).unwrap();
|
|
||||||
assert_eq!(mmap_directory.get_cache_info().mmapped.len(), 10);
|
|
||||||
}
|
|
||||||
|
|
||||||
assert_eq!(mmap_directory.get_cache_info().counters.hit, 20);
|
|
||||||
assert_eq!(mmap_directory.get_cache_info().counters.miss, 10);
|
|
||||||
assert_eq!(mmap_directory.get_cache_info().mmapped.len(), 10);
|
|
||||||
drop(keep);
|
|
||||||
for path in paths.iter() {
|
|
||||||
let _r = mmap_directory.open_read(path).unwrap();
|
|
||||||
assert_eq!(mmap_directory.get_cache_info().mmapped.len(), 1);
|
|
||||||
}
|
|
||||||
assert_eq!(mmap_directory.get_cache_info().counters.hit, 20);
|
|
||||||
assert_eq!(mmap_directory.get_cache_info().counters.miss, 20);
|
|
||||||
assert_eq!(mmap_directory.get_cache_info().mmapped.len(), 0);
|
|
||||||
|
|
||||||
for path in &paths {
|
|
||||||
mmap_directory.delete(path).unwrap();
|
|
||||||
}
|
|
||||||
assert_eq!(mmap_directory.get_cache_info().counters.hit, 20);
|
|
||||||
assert_eq!(mmap_directory.get_cache_info().counters.miss, 20);
|
|
||||||
assert_eq!(mmap_directory.get_cache_info().mmapped.len(), 0);
|
|
||||||
for path in paths.iter() {
|
|
||||||
assert!(mmap_directory.open_read(path).is_err());
|
|
||||||
}
|
|
||||||
assert_eq!(mmap_directory.get_cache_info().counters.hit, 20);
|
|
||||||
assert_eq!(mmap_directory.get_cache_info().counters.miss, 30);
|
|
||||||
assert_eq!(mmap_directory.get_cache_info().mmapped.len(), 0);
|
assert_eq!(mmap_directory.get_cache_info().mmapped.len(), 0);
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn test_watch_wrapper() {
|
|
||||||
let counter: Arc<AtomicUsize> = Default::default();
|
|
||||||
let counter_clone = counter.clone();
|
|
||||||
let tmp_dir: TempDir = tempdir::TempDir::new("test_watch_wrapper").unwrap();
|
|
||||||
let tmp_dirpath = tmp_dir.path().to_owned();
|
|
||||||
let mut watch_wrapper = WatcherWrapper::new(&tmp_dirpath).unwrap();
|
|
||||||
let tmp_file = tmp_dirpath.join("coucou");
|
|
||||||
let _handle = watch_wrapper.watch(Box::new(move || {
|
|
||||||
counter_clone.fetch_add(1, Ordering::SeqCst);
|
|
||||||
}));
|
|
||||||
assert_eq!(counter.load(Ordering::SeqCst), 0);
|
|
||||||
fs::write(&tmp_file, b"whateverwilldo").unwrap();
|
|
||||||
thread::sleep(Duration::new(0, 1_000u32));
|
|
||||||
}
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn test_mmap_released() {
|
|
||||||
let mmap_directory = MmapDirectory::create_from_tempdir().unwrap();
|
|
||||||
let mut schema_builder: SchemaBuilder = Schema::builder();
|
|
||||||
let text_field = schema_builder.add_text_field("text", TEXT);
|
|
||||||
let schema = schema_builder.build();
|
|
||||||
{
|
|
||||||
let index = Index::create(mmap_directory.clone(), schema).unwrap();
|
|
||||||
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
|
|
||||||
for _num_commits in 0..16 {
|
|
||||||
for _ in 0..10 {
|
|
||||||
index_writer.add_document(doc!(text_field=>"abc"));
|
|
||||||
}
|
|
||||||
index_writer.commit().unwrap();
|
|
||||||
}
|
|
||||||
let reader = index
|
|
||||||
.reader_builder()
|
|
||||||
.reload_policy(ReloadPolicy::Manual)
|
|
||||||
.try_into()
|
|
||||||
.unwrap();
|
|
||||||
for _ in 0..30 {
|
|
||||||
index_writer.add_document(doc!(text_field=>"abc"));
|
|
||||||
index_writer.commit().unwrap();
|
|
||||||
reader.reload().unwrap();
|
|
||||||
}
|
|
||||||
index_writer.wait_merging_threads().unwrap();
|
|
||||||
reader.reload().unwrap();
|
|
||||||
let num_segments = reader.searcher().segment_readers().len();
|
|
||||||
assert_eq!(num_segments, 4);
|
|
||||||
assert_eq!(
|
|
||||||
num_segments * 7,
|
|
||||||
mmap_directory.get_cache_info().mmapped.len()
|
|
||||||
);
|
|
||||||
}
|
|
||||||
assert_eq!(mmap_directory.get_cache_info().mmapped.len(), 0);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -8,24 +8,21 @@ WORM directory abstraction.
 mod mmap_directory;
 
 mod directory;
-mod directory_lock;
 mod managed_directory;
 mod ram_directory;
 mod read_only_source;
-mod watch_event_router;
-mod nrt_directory;
+mod shared_vec_slice;
+mod static_dictionnary;
 
 /// Errors specific to the directory module.
 pub mod error;
 
-pub use self::directory::DirectoryLock;
+use std::io::{BufWriter, Seek, Write};
 
 pub use self::directory::{Directory, DirectoryClone};
-pub use self::directory_lock::{Lock, INDEX_WRITER_LOCK, META_LOCK};
 pub use self::ram_directory::RAMDirectory;
 pub use self::read_only_source::ReadOnlySource;
-pub(crate) use self::watch_event_router::WatchCallbackList;
-pub use self::watch_event_router::{WatchCallback, WatchHandle};
-use std::io::{BufWriter, Seek, Write};
+pub use self::static_dictionnary::StaticDirectory;
 
 #[cfg(feature = "mmap")]
 pub use self::mmap_directory::MmapDirectory;
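On the removed (left-hand) side the module also re-exports the watch API (`WatchCallback`, `WatchHandle`) used to notify readers when meta.json changes. A small hedged sketch of how a callback is registered against a `Directory`, assuming that 0.9-side API as it appears elsewhere in this diff:

use std::sync::atomic::{AtomicUsize, Ordering};
use std::sync::Arc;
use tantivy::directory::{Directory, RAMDirectory};

fn main() {
    let directory = RAMDirectory::create();
    let counter = Arc::new(AtomicUsize::new(0));
    let counter_clone = counter.clone();
    // The returned handle must be kept alive: dropping it unsubscribes the callback.
    // On the 0.9 side, an atomic_write to meta.json would then bump the counter.
    let _handle = directory.watch(Box::new(move || {
        counter_clone.fetch_add(1, Ordering::SeqCst);
    }));
}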
@@ -43,4 +40,128 @@ impl<T: Seek + Write> SeekableWrite for T {}
|
|||||||
pub type WritePtr = BufWriter<Box<SeekableWrite>>;
|
pub type WritePtr = BufWriter<Box<SeekableWrite>>;
|
||||||
|
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
mod tests;
|
mod tests {
|
||||||
|
|
||||||
|
use super::*;
|
||||||
|
use std::io::{Seek, SeekFrom, Write};
|
||||||
|
use std::path::Path;
|
||||||
|
|
||||||
|
lazy_static! {
|
||||||
|
static ref TEST_PATH: &'static Path = Path::new("some_path_for_test");
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_ram_directory() {
|
||||||
|
let mut ram_directory = RAMDirectory::create();
|
||||||
|
test_directory(&mut ram_directory);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
#[cfg(feature = "mmap")]
|
||||||
|
fn test_mmap_directory() {
|
||||||
|
let mut mmap_directory = MmapDirectory::create_from_tempdir().unwrap();
|
||||||
|
test_directory(&mut mmap_directory);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
#[should_panic]
|
||||||
|
fn ram_directory_panics_if_flush_forgotten() {
|
||||||
|
let mut ram_directory = RAMDirectory::create();
|
||||||
|
let mut write_file = ram_directory.open_write(*TEST_PATH).unwrap();
|
||||||
|
assert!(write_file.write_all(&[4]).is_ok());
|
||||||
|
}
|
||||||
|
|
||||||
|
fn test_simple(directory: &mut Directory) {
|
||||||
|
{
|
||||||
|
{
|
||||||
|
let mut write_file = directory.open_write(*TEST_PATH).unwrap();
|
||||||
|
assert!(directory.exists(*TEST_PATH));
|
||||||
|
write_file.write_all(&[4]).unwrap();
|
||||||
|
write_file.write_all(&[3]).unwrap();
|
||||||
|
write_file.write_all(&[7, 3, 5]).unwrap();
|
||||||
|
write_file.flush().unwrap();
|
||||||
|
}
|
||||||
|
let read_file = directory.open_read(*TEST_PATH).unwrap();
|
||||||
|
let data: &[u8] = &*read_file;
|
||||||
|
assert_eq!(data, &[4u8, 3u8, 7u8, 3u8, 5u8]);
|
||||||
|
}
|
||||||
|
|
||||||
|
assert!(directory.delete(*TEST_PATH).is_ok());
|
||||||
|
assert!(!directory.exists(*TEST_PATH));
|
||||||
|
}
|
||||||
|
|
||||||
|
fn test_seek(directory: &mut Directory) {
|
||||||
|
{
|
||||||
|
{
|
||||||
|
let mut write_file = directory.open_write(*TEST_PATH).unwrap();
|
||||||
|
write_file.write_all(&[4, 3, 7, 3, 5]).unwrap();
|
||||||
|
write_file.seek(SeekFrom::Start(0)).unwrap();
|
||||||
|
write_file.write_all(&[3, 1]).unwrap();
|
||||||
|
write_file.flush().unwrap();
|
||||||
|
}
|
||||||
|
let read_file = directory.open_read(*TEST_PATH).unwrap();
|
||||||
|
let data: &[u8] = &*read_file;
|
||||||
|
assert_eq!(data, &[3u8, 1u8, 7u8, 3u8, 5u8]);
|
||||||
|
}
|
||||||
|
|
||||||
|
assert!(directory.delete(*TEST_PATH).is_ok());
|
||||||
|
}
|
||||||
|
|
||||||
|
fn test_rewrite_forbidden(directory: &mut Directory) {
|
||||||
|
{
|
||||||
|
directory.open_write(*TEST_PATH).unwrap();
|
||||||
|
assert!(directory.exists(*TEST_PATH));
|
||||||
|
}
|
||||||
|
{
|
||||||
|
assert!(directory.open_write(*TEST_PATH).is_err());
|
||||||
|
}
|
||||||
|
assert!(directory.delete(*TEST_PATH).is_ok());
|
||||||
|
}
|
||||||
|
|
||||||
|
fn test_write_create_the_file(directory: &mut Directory) {
|
||||||
|
{
|
||||||
|
assert!(directory.open_read(*TEST_PATH).is_err());
|
||||||
|
let _w = directory.open_write(*TEST_PATH).unwrap();
|
||||||
|
assert!(directory.exists(*TEST_PATH));
|
||||||
|
assert!(directory.open_read(*TEST_PATH).is_ok());
|
||||||
|
assert!(directory.delete(*TEST_PATH).is_ok());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn test_directory_delete(directory: &mut Directory) {
|
||||||
|
assert!(directory.open_read(*TEST_PATH).is_err());
|
||||||
|
let mut write_file = directory.open_write(*TEST_PATH).unwrap();
|
||||||
|
write_file.write_all(&[1, 2, 3, 4]).unwrap();
|
||||||
|
write_file.flush().unwrap();
|
||||||
|
{
|
||||||
|
let read_handle = directory.open_read(*TEST_PATH).unwrap();
|
||||||
|
{
|
||||||
|
assert_eq!(&*read_handle, &[1u8, 2u8, 3u8, 4u8]);
|
||||||
|
|
||||||
|
// Mapped files can't be deleted on Windows
|
||||||
|
if !cfg!(windows) {
|
||||||
|
assert!(directory.delete(*TEST_PATH).is_ok());
|
||||||
|
assert_eq!(&*read_handle, &[1u8, 2u8, 3u8, 4u8]);
|
||||||
|
}
|
||||||
|
|
||||||
|
assert!(directory.delete(Path::new("SomeOtherPath")).is_err());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if cfg!(windows) {
|
||||||
|
assert!(directory.delete(*TEST_PATH).is_ok());
|
||||||
|
}
|
||||||
|
|
||||||
|
assert!(directory.open_read(*TEST_PATH).is_err());
|
||||||
|
assert!(directory.delete(*TEST_PATH).is_err());
|
||||||
|
}
|
||||||
|
|
||||||
|
fn test_directory(directory: &mut Directory) {
|
||||||
|
test_simple(directory);
|
||||||
|
test_seek(directory);
|
||||||
|
test_rewrite_forbidden(directory);
|
||||||
|
test_write_create_the_file(directory);
|
||||||
|
test_directory_delete(directory);
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|||||||
@@ -1,195 +0,0 @@
|
|||||||
use directory::Directory;
|
|
||||||
use std::path::{PathBuf, Path};
|
|
||||||
use directory::ReadOnlySource;
|
|
||||||
use directory::error::OpenReadError;
|
|
||||||
use directory::error::DeleteError;
|
|
||||||
use std::io::{BufWriter, Cursor};
|
|
||||||
use directory::SeekableWrite;
|
|
||||||
use directory::error::OpenWriteError;
|
|
||||||
use directory::WatchHandle;
|
|
||||||
use directory::ram_directory::InnerRamDirectory;
|
|
||||||
use std::sync::RwLock;
|
|
||||||
use std::sync::Arc;
|
|
||||||
use directory::WatchCallback;
|
|
||||||
use std::fmt;
|
|
||||||
use std::io;
|
|
||||||
use std::io::{Seek, Write};
|
|
||||||
use directory::DirectoryClone;
|
|
||||||
|
|
||||||
|
|
||||||
const BUFFER_LEN: usize = 1_000_000;
|
|
||||||
|
|
||||||
|
|
||||||
pub enum NRTWriter {
|
|
||||||
InRam {
|
|
||||||
buffer: Cursor<Vec<u8>>,
|
|
||||||
path: PathBuf,
|
|
||||||
nrt_directory: NRTDirectory
|
|
||||||
},
|
|
||||||
UnderlyingFile(BufWriter<Box<SeekableWrite>>)
|
|
||||||
}
|
|
||||||
|
|
||||||
impl NRTWriter {
|
|
||||||
pub fn new(path: PathBuf, nrt_directory: NRTDirectory) -> NRTWriter {
|
|
||||||
NRTWriter::InRam {
|
|
||||||
buffer: Cursor::new(Vec::with_capacity(BUFFER_LEN)),
|
|
||||||
path,
|
|
||||||
nrt_directory,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl io::Seek for NRTWriter {
|
|
||||||
fn seek(&mut self, pos: io::SeekFrom) -> io::Result<u64> {
|
|
||||||
match self {
|
|
||||||
NRTWriter::InRam { buffer, path, nrt_directory } => {
|
|
||||||
buffer.seek(pos)
|
|
||||||
}
|
|
||||||
NRTWriter::UnderlyingFile(file) => {
|
|
||||||
file.seek(pos)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl io::Write for NRTWriter {
|
|
||||||
fn write(&mut self, buf: &[u8]) -> io::Result<usize> {
|
|
||||||
self.write_all(buf)?;
|
|
||||||
Ok(buf.len())
|
|
||||||
}
|
|
||||||
|
|
||||||
fn flush(&mut self) -> io::Result<()> {
|
|
||||||
match self {
|
|
||||||
NRTWriter::InRam { buffer, path, nrt_directory } => {
|
|
||||||
let mut cache_wlock = nrt_directory.cache.write().unwrap();
|
|
||||||
cache_wlock.write(path.clone(), buffer.get_ref());
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
NRTWriter::UnderlyingFile(file) => {
|
|
||||||
file.flush()
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
fn write_all(&mut self, buf: &[u8]) -> io::Result<()> {
|
|
||||||
// Working around the borrow checker.
|
|
||||||
let mut underlying_write_opt: Option<BufWriter<Box<SeekableWrite>>> = None;
|
|
||||||
if let NRTWriter::InRam { buffer, path, nrt_directory } = self {
|
|
||||||
if buffer.get_ref().len() + buf.len() > BUFFER_LEN {
|
|
||||||
// We can't keep this in RAM. Let's move it to the underlying directory.
|
|
||||||
underlying_write_opt = Some(nrt_directory.open_write(path)
|
|
||||||
.map_err(|open_err| {
|
|
||||||
io::Error::new(io::ErrorKind::Other, open_err)
|
|
||||||
})?);
|
|
||||||
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if let Some(underlying_write) = underlying_write_opt {
|
|
||||||
*self = NRTWriter::UnderlyingFile(underlying_write);
|
|
||||||
}
|
|
||||||
match self {
|
|
||||||
NRTWriter::InRam { buffer, path, nrt_directory } => {
|
|
||||||
assert!(buffer.get_ref().len() + buf.len() <= BUFFER_LEN);
|
|
||||||
buffer.write_all(buf)
|
|
||||||
}
|
|
||||||
NRTWriter::UnderlyingFile(file) => {
|
|
||||||
file.write_all(buf)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
pub struct NRTDirectory {
|
|
||||||
underlying: Box<Directory>,
|
|
||||||
cache: Arc<RwLock<InnerRamDirectory>>,
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
impl Clone for NRTDirectory {
|
|
||||||
fn clone(&self) -> Self {
|
|
||||||
NRTDirectory {
|
|
||||||
underlying: self.underlying.box_clone(),
|
|
||||||
cache: self.cache.clone()
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl NRTDirectory {
|
|
||||||
fn wrap(underlying: Box<Directory>) -> NRTDirectory {
|
|
||||||
NRTDirectory {
|
|
||||||
underlying,
|
|
||||||
cache: Default::default()
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl fmt::Debug for NRTDirectory {
|
|
||||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
|
||||||
write!(f, "NRTDirectory({:?})", self.underlying)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl Directory for NRTDirectory {
|
|
||||||
fn open_read(&self, path: &Path) -> Result<ReadOnlySource, OpenReadError> {
|
|
||||||
unimplemented!()
|
|
||||||
}
|
|
||||||
|
|
||||||
fn delete(&self, path: &Path) -> Result<(), DeleteError> {
|
|
||||||
// We explicitly release the lock, to prevent a panic on the underlying directory
|
|
||||||
// to poison the lock.
|
|
||||||
//
|
|
||||||
// File can only go from cache to underlying so the result does not lead to
|
|
||||||
// any inconsistency.
|
|
||||||
{
|
|
||||||
let mut cache_wlock = self.cache.write().unwrap();
|
|
||||||
if cache_wlock.exists(path) {
|
|
||||||
return cache_wlock.delete(path);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
self.underlying.delete(path)
|
|
||||||
}
|
|
||||||
|
|
||||||
fn exists(&self, path: &Path) -> bool {
|
|
||||||
// We explicitly release the lock, to prevent a panic on the underlying directory
|
|
||||||
// to poison the lock.
|
|
||||||
//
|
|
||||||
// File can only go from cache to underlying so the result does not lead to
|
|
||||||
// any inconsistency.
|
|
||||||
{
|
|
||||||
let rlock_cache = self.cache.read().unwrap();
|
|
||||||
if rlock_cache.exists(path) {
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
self.underlying.exists(path)
|
|
||||||
}
|
|
||||||
|
|
||||||
fn open_write(&mut self, path: &Path) -> Result<BufWriter<Box<SeekableWrite>>, OpenWriteError> {
|
|
||||||
let mut cache_wlock = self.cache.write().unwrap();
|
|
||||||
// TODO might poison our lock. I don't know have a sound solution yet.
|
|
||||||
let path_buf = path.to_owned();
|
|
||||||
if self.underlying.exists(path) {
|
|
||||||
return Err(OpenWriteError::FileAlreadyExists(path_buf));
|
|
||||||
}
|
|
||||||
let exists = cache_wlock.write(path_buf.clone(), &[]);
|
|
||||||
// force the creation of the file to mimic the MMap directory.
|
|
||||||
if exists {
|
|
||||||
Err(OpenWriteError::FileAlreadyExists(path_buf))
|
|
||||||
} else {
|
|
||||||
let vec_writer = NRTWriter::new(path_buf.clone(), self.clone());
|
|
||||||
Ok(BufWriter::new(Box::new(vec_writer)))
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
fn atomic_read(&self, path: &Path) -> Result<Vec<u8>, OpenReadError> {
|
|
||||||
self.underlying.atomic_read(path)
|
|
||||||
}
|
|
||||||
|
|
||||||
fn atomic_write(&mut self, path: &Path, data: &[u8]) -> io::Result<()> {
|
|
||||||
self.underlying.atomic_write(path, data)
|
|
||||||
}
|
|
||||||
|
|
||||||
fn watch(&self, watch_callback: WatchCallback) -> WatchHandle {
|
|
||||||
self.underlying.watch(watch_callback)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
@@ -1,8 +1,8 @@
|
|||||||
use core::META_FILEPATH;
|
use super::shared_vec_slice::SharedVecSlice;
|
||||||
use directory::error::{DeleteError, OpenReadError, OpenWriteError};
|
use common::make_io_err;
|
||||||
use directory::WatchCallbackList;
|
use directory::error::{DeleteError, IOError, OpenReadError, OpenWriteError};
|
||||||
use directory::WritePtr;
|
use directory::WritePtr;
|
||||||
use directory::{Directory, ReadOnlySource, WatchCallback, WatchHandle};
|
use directory::{Directory, ReadOnlySource};
|
||||||
use std::collections::HashMap;
|
use std::collections::HashMap;
|
||||||
use std::fmt;
|
use std::fmt;
|
||||||
use std::io::{self, BufWriter, Cursor, Seek, SeekFrom, Write};
|
use std::io::{self, BufWriter, Cursor, Seek, SeekFrom, Write};
|
||||||
@@ -22,13 +22,13 @@ use std::sync::{Arc, RwLock};
|
|||||||
///
|
///
|
||||||
struct VecWriter {
|
struct VecWriter {
|
||||||
path: PathBuf,
|
path: PathBuf,
|
||||||
shared_directory: RAMDirectory,
|
shared_directory: InnerDirectory,
|
||||||
data: Cursor<Vec<u8>>,
|
data: Cursor<Vec<u8>>,
|
||||||
is_flushed: bool,
|
is_flushed: bool,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl VecWriter {
|
impl VecWriter {
|
||||||
fn new(path_buf: PathBuf, shared_directory: RAMDirectory) -> VecWriter {
|
fn new(path_buf: PathBuf, shared_directory: InnerDirectory) -> VecWriter {
|
||||||
VecWriter {
|
VecWriter {
|
||||||
path: path_buf,
|
path: path_buf,
|
||||||
data: Cursor::new(Vec::new()),
|
data: Cursor::new(Vec::new()),
|
||||||
@@ -64,44 +64,73 @@ impl Write for VecWriter {
|
|||||||
|
|
||||||
fn flush(&mut self) -> io::Result<()> {
|
fn flush(&mut self) -> io::Result<()> {
|
||||||
self.is_flushed = true;
|
self.is_flushed = true;
|
||||||
let mut fs = self.shared_directory.fs.write().unwrap();
|
self.shared_directory
|
||||||
fs.write(self.path.clone(), self.data.get_ref());
|
.write(self.path.clone(), self.data.get_ref())?;
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Default)]
|
#[derive(Clone)]
|
||||||
pub(crate) struct InnerRamDirectory {
|
struct InnerDirectory(Arc<RwLock<HashMap<PathBuf, Arc<Vec<u8>>>>>);
|
||||||
fs: HashMap<PathBuf, ReadOnlySource>,
|
|
||||||
watch_router: WatchCallbackList,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl InnerRamDirectory {
|
impl InnerDirectory {
|
||||||
pub fn write(&mut self, path: PathBuf, data: &[u8]) -> bool {
|
fn new() -> InnerDirectory {
|
||||||
let data = ReadOnlySource::new(Vec::from(data));
|
InnerDirectory(Arc::new(RwLock::new(HashMap::new())))
|
||||||
self.fs.insert(path, data).is_some()
|
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn open_read(&self, path: &Path) -> Result<ReadOnlySource, OpenReadError> {
|
fn write(&self, path: PathBuf, data: &[u8]) -> io::Result<bool> {
|
||||||
self.fs
|
let mut map = self.0.write().map_err(|_| {
|
||||||
.get(path)
|
make_io_err(format!(
|
||||||
.ok_or_else(|| OpenReadError::FileDoesNotExist(PathBuf::from(path)))
|
"Failed to lock the directory, when trying to write {:?}",
|
||||||
.map(|el| el.clone())
|
path
|
||||||
|
))
|
||||||
|
})?;
|
||||||
|
let prev_value = map.insert(path, Arc::new(Vec::from(data)));
|
||||||
|
Ok(prev_value.is_some())
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn delete(&mut self, path: &Path) -> result::Result<(), DeleteError> {
|
fn open_read(&self, path: &Path) -> Result<ReadOnlySource, OpenReadError> {
|
||||||
match self.fs.remove(path) {
|
self.0
|
||||||
Some(_) => Ok(()),
|
.read()
|
||||||
None => Err(DeleteError::FileDoesNotExist(PathBuf::from(path))),
|
.map_err(|_| {
|
||||||
}
|
let msg = format!(
|
||||||
|
"Failed to acquire read lock for the \
|
||||||
|
directory when trying to read {:?}",
|
||||||
|
path
|
||||||
|
);
|
||||||
|
let io_err = make_io_err(msg);
|
||||||
|
OpenReadError::IOError(IOError::with_path(path.to_owned(), io_err))
|
||||||
|
}).and_then(|readable_map| {
|
||||||
|
readable_map
|
||||||
|
.get(path)
|
||||||
|
.ok_or_else(|| OpenReadError::FileDoesNotExist(PathBuf::from(path)))
|
||||||
|
.map(Arc::clone)
|
||||||
|
.map(|data| ReadOnlySource::Anonymous(SharedVecSlice::new(data)))
|
||||||
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn exists(&self, path: &Path) -> bool {
|
fn delete(&self, path: &Path) -> result::Result<(), DeleteError> {
|
||||||
self.fs.contains_key(path)
|
self.0
|
||||||
|
.write()
|
||||||
|
.map_err(|_| {
|
||||||
|
let msg = format!(
|
||||||
|
"Failed to acquire write lock for the \
|
||||||
|
directory when trying to delete {:?}",
|
||||||
|
path
|
||||||
|
);
|
||||||
|
let io_err = make_io_err(msg);
|
||||||
|
DeleteError::IOError(IOError::with_path(path.to_owned(), io_err))
|
||||||
|
}).and_then(|mut writable_map| match writable_map.remove(path) {
|
||||||
|
Some(_) => Ok(()),
|
||||||
|
None => Err(DeleteError::FileDoesNotExist(PathBuf::from(path))),
|
||||||
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn watch(&mut self, watch_handle: WatchCallback) -> WatchHandle {
|
fn exists(&self, path: &Path) -> bool {
|
||||||
self.watch_router.subscribe(watch_handle)
|
self.0
|
||||||
|
.read()
|
||||||
|
.expect("Failed to get read lock directory.")
|
||||||
|
.contains_key(path)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -116,36 +145,33 @@ impl fmt::Debug for RAMDirectory {
|
|||||||
/// It is mainly meant for unit testing.
|
/// It is mainly meant for unit testing.
|
||||||
/// Writes are only made visible upon flushing.
|
/// Writes are only made visible upon flushing.
|
||||||
///
|
///
|
||||||
#[derive(Clone, Default)]
|
#[derive(Clone)]
|
||||||
pub struct RAMDirectory {
|
pub struct RAMDirectory {
|
||||||
fs: Arc<RwLock<InnerRamDirectory>>,
|
fs: InnerDirectory,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl RAMDirectory {
|
impl RAMDirectory {
|
||||||
/// Constructor
|
/// Constructor
|
||||||
pub fn create() -> RAMDirectory {
|
pub fn create() -> RAMDirectory {
|
||||||
Self::default()
|
RAMDirectory {
|
||||||
|
fs: InnerDirectory::new(),
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
impl Directory for RAMDirectory {
|
impl Directory for RAMDirectory {
|
||||||
fn open_read(&self, path: &Path) -> result::Result<ReadOnlySource, OpenReadError> {
|
fn open_read(&self, path: &Path) -> result::Result<ReadOnlySource, OpenReadError> {
|
||||||
self.fs.read().unwrap().open_read(path)
|
self.fs.open_read(path)
|
||||||
}
|
|
||||||
|
|
||||||
fn delete(&self, path: &Path) -> result::Result<(), DeleteError> {
|
|
||||||
self.fs.write().unwrap().delete(path)
|
|
||||||
}
|
|
||||||
|
|
||||||
fn exists(&self, path: &Path) -> bool {
|
|
||||||
self.fs.read().unwrap().exists(path)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
fn open_write(&mut self, path: &Path) -> Result<WritePtr, OpenWriteError> {
|
fn open_write(&mut self, path: &Path) -> Result<WritePtr, OpenWriteError> {
|
||||||
let mut fs = self.fs.write().unwrap();
|
|
||||||
let path_buf = PathBuf::from(path);
|
let path_buf = PathBuf::from(path);
|
||||||
let vec_writer = VecWriter::new(path_buf.clone(), self.clone());
|
let vec_writer = VecWriter::new(path_buf.clone(), self.fs.clone());
|
||||||
let exists = fs.write(path_buf.clone(), &[]);
|
|
||||||
|
let exists = self
|
||||||
|
.fs
|
||||||
|
.write(path_buf.clone(), &Vec::new())
|
||||||
|
.map_err(|err| IOError::with_path(path.to_owned(), err))?;
|
||||||
// force the creation of the file to mimic the MMap directory.
|
// force the creation of the file to mimic the MMap directory.
|
||||||
if exists {
|
if exists {
|
||||||
Err(OpenWriteError::FileAlreadyExists(path_buf))
|
Err(OpenWriteError::FileAlreadyExists(path_buf))
|
||||||
@@ -154,8 +180,17 @@ impl Directory for RAMDirectory {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
fn delete(&self, path: &Path) -> result::Result<(), DeleteError> {
|
||||||
|
self.fs.delete(path)
|
||||||
|
}
|
||||||
|
|
||||||
|
fn exists(&self, path: &Path) -> bool {
|
||||||
|
self.fs.exists(path)
|
||||||
|
}
|
||||||
|
|
||||||
fn atomic_read(&self, path: &Path) -> Result<Vec<u8>, OpenReadError> {
|
fn atomic_read(&self, path: &Path) -> Result<Vec<u8>, OpenReadError> {
|
||||||
Ok(self.open_read(path)?.as_slice().to_owned())
|
let read = self.open_read(path)?;
|
||||||
|
Ok(read.as_slice().to_owned())
|
||||||
}
|
}
|
||||||
|
|
||||||
fn atomic_write(&mut self, path: &Path, data: &[u8]) -> io::Result<()> {
|
fn atomic_write(&mut self, path: &Path, data: &[u8]) -> io::Result<()> {
|
||||||
@@ -164,20 +199,10 @@ impl Directory for RAMDirectory {
|
|||||||
msg.unwrap_or("Undefined".to_string())
|
msg.unwrap_or("Undefined".to_string())
|
||||||
)));
|
)));
|
||||||
let path_buf = PathBuf::from(path);
|
let path_buf = PathBuf::from(path);
|
||||||
|
let mut vec_writer = VecWriter::new(path_buf.clone(), self.fs.clone());
|
||||||
// Reserve the path to prevent calls to .write() to succeed.
|
self.fs.write(path_buf, &Vec::new())?;
|
||||||
self.fs.write().unwrap().write(path_buf.clone(), &[]);
|
|
||||||
|
|
||||||
let mut vec_writer = VecWriter::new(path_buf.clone(), self.clone());
|
|
||||||
vec_writer.write_all(data)?;
|
vec_writer.write_all(data)?;
|
||||||
vec_writer.flush()?;
|
vec_writer.flush()?;
|
||||||
if path == Path::new(&*META_FILEPATH) {
|
|
||||||
self.fs.write().unwrap().watch_router.broadcast();
|
|
||||||
}
|
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
fn watch(&self, watch_callback: WatchCallback) -> WatchHandle {
|
|
||||||
self.fs.write().unwrap().watch(watch_callback)
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,9 +1,12 @@
|
|||||||
|
use super::shared_vec_slice::SharedVecSlice;
|
||||||
use common::HasLen;
|
use common::HasLen;
|
||||||
|
#[cfg(feature = "mmap")]
|
||||||
|
use fst::raw::MmapReadOnly;
|
||||||
use stable_deref_trait::{CloneStableDeref, StableDeref};
|
use stable_deref_trait::{CloneStableDeref, StableDeref};
|
||||||
use std::ops::Deref;
|
use std::ops::Deref;
|
||||||
use std::sync::Arc;
|
|
||||||
|
|
||||||
pub type BoxedData = Box<Deref<Target = [u8]> + Send + Sync + 'static>;
|
|
||||||
|
const EMPTY_SLICE: [u8; 0] = [];
|
||||||
|
|
||||||
/// Read object that represents files in tantivy.
|
/// Read object that represents files in tantivy.
|
||||||
///
|
///
|
||||||
@@ -11,10 +14,14 @@ pub type BoxedData = Box<Deref<Target = [u8]> + Send + Sync + 'static>;
|
|||||||
/// the data in the form of a constant read-only `&[u8]`.
|
/// the data in the form of a constant read-only `&[u8]`.
|
||||||
/// Whatever happens to the directory file, the data
|
/// Whatever happens to the directory file, the data
|
||||||
/// hold by this object should never be altered or destroyed.
|
/// hold by this object should never be altered or destroyed.
|
||||||
pub struct ReadOnlySource {
|
pub enum ReadOnlySource {
|
||||||
data: Arc<BoxedData>,
|
/// Mmap source of data
|
||||||
start: usize,
|
#[cfg(feature = "mmap")]
|
||||||
stop: usize,
|
Mmap(MmapReadOnly),
|
||||||
|
/// Wrapping a `Vec<u8>`
|
||||||
|
Anonymous(SharedVecSlice),
|
||||||
|
/// Wrapping a static slice
|
||||||
|
Static(&'static [u8])
|
||||||
}
|
}
|
||||||
|
|
||||||
unsafe impl StableDeref for ReadOnlySource {}
|
unsafe impl StableDeref for ReadOnlySource {}
|
||||||
@@ -28,38 +35,20 @@ impl Deref for ReadOnlySource {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
impl From<Arc<BoxedData>> for ReadOnlySource {
|
|
||||||
fn from(data: Arc<BoxedData>) -> Self {
|
|
||||||
let len = data.len();
|
|
||||||
ReadOnlySource {
|
|
||||||
data,
|
|
||||||
start: 0,
|
|
||||||
stop: len,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl ReadOnlySource {
|
impl ReadOnlySource {
|
||||||
pub(crate) fn new<D>(data: D) -> ReadOnlySource
|
|
||||||
where
|
|
||||||
D: Deref<Target = [u8]> + Send + Sync + 'static,
|
|
||||||
{
|
|
||||||
let len = data.len();
|
|
||||||
ReadOnlySource {
|
|
||||||
data: Arc::new(Box::new(data)),
|
|
||||||
start: 0,
|
|
||||||
stop: len,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Creates an empty ReadOnlySource
|
/// Creates an empty ReadOnlySource
|
||||||
pub fn empty() -> ReadOnlySource {
|
pub fn empty() -> ReadOnlySource {
|
||||||
ReadOnlySource::new(&[][..])
|
ReadOnlySource::Static(&EMPTY_SLICE)
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Returns the data underlying the ReadOnlySource object.
|
/// Returns the data underlying the ReadOnlySource object.
|
||||||
pub fn as_slice(&self) -> &[u8] {
|
pub fn as_slice(&self) -> &[u8] {
|
||||||
&self.data[self.start..self.stop]
|
match *self {
|
||||||
|
#[cfg(feature = "mmap")]
|
||||||
|
ReadOnlySource::Mmap(ref mmap_read_only) => mmap_read_only.as_slice(),
|
||||||
|
ReadOnlySource::Anonymous(ref shared_vec) => shared_vec.as_slice(),
|
||||||
|
ReadOnlySource::Static(data) => data,
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Splits into 2 `ReadOnlySource`, at the offset given
|
/// Splits into 2 `ReadOnlySource`, at the offset given
|
||||||
@@ -80,18 +69,25 @@ impl ReadOnlySource {
|
|||||||
/// worth of data in anonymous memory, and only a
|
/// worth of data in anonymous memory, and only a
|
||||||
/// 1KB slice is remaining, the whole `500MBs`
|
/// 1KB slice is remaining, the whole `500MBs`
|
||||||
/// are retained in memory.
|
/// are retained in memory.
|
||||||
pub fn slice(&self, start: usize, stop: usize) -> ReadOnlySource {
|
pub fn slice(&self, from_offset: usize, to_offset: usize) -> ReadOnlySource {
|
||||||
assert!(
|
assert!(
|
||||||
start <= stop,
|
from_offset <= to_offset,
|
||||||
"Requested negative slice [{}..{}]",
|
"Requested negative slice [{}..{}]",
|
||||||
start,
|
from_offset,
|
||||||
stop
|
to_offset
|
||||||
);
|
);
|
||||||
assert!(stop <= self.len());
|
match *self {
|
||||||
ReadOnlySource {
|
#[cfg(feature = "mmap")]
|
||||||
data: self.data.clone(),
|
ReadOnlySource::Mmap(ref mmap_read_only) => {
|
||||||
start: self.start + start,
|
let sliced_mmap = mmap_read_only.range(from_offset, to_offset - from_offset);
|
||||||
stop: self.start + stop,
|
ReadOnlySource::Mmap(sliced_mmap)
|
||||||
|
}
|
||||||
|
ReadOnlySource::Anonymous(ref shared_vec) => {
|
||||||
|
ReadOnlySource::Anonymous(shared_vec.slice(from_offset, to_offset))
|
||||||
|
}
|
||||||
|
ReadOnlySource::Static(data) => {
|
||||||
|
ReadOnlySource::Static(&data[from_offset..to_offset])
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -100,7 +96,8 @@ impl ReadOnlySource {
|
|||||||
///
|
///
|
||||||
/// Equivalent to `.slice(from_offset, self.len())`
|
/// Equivalent to `.slice(from_offset, self.len())`
|
||||||
pub fn slice_from(&self, from_offset: usize) -> ReadOnlySource {
|
pub fn slice_from(&self, from_offset: usize) -> ReadOnlySource {
|
||||||
self.slice(from_offset, self.len())
|
let len = self.len();
|
||||||
|
self.slice(from_offset, len)
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Like `.slice(...)` but enforcing only the `to`
|
/// Like `.slice(...)` but enforcing only the `to`
|
||||||
@@ -114,18 +111,25 @@ impl ReadOnlySource {
|
|||||||
|
|
||||||
impl HasLen for ReadOnlySource {
|
impl HasLen for ReadOnlySource {
|
||||||
fn len(&self) -> usize {
|
fn len(&self) -> usize {
|
||||||
self.stop - self.start
|
self.as_slice().len()
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
impl Clone for ReadOnlySource {
|
impl Clone for ReadOnlySource {
|
||||||
fn clone(&self) -> Self {
|
fn clone(&self) -> Self {
|
||||||
self.slice_from(0)
|
self.slice(0, self.len())
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
impl From<Vec<u8>> for ReadOnlySource {
|
impl From<Vec<u8>> for ReadOnlySource {
|
||||||
fn from(data: Vec<u8>) -> ReadOnlySource {
|
fn from(data: Vec<u8>) -> ReadOnlySource {
|
||||||
ReadOnlySource::new(data)
|
let shared_data = SharedVecSlice::from(data);
|
||||||
|
ReadOnlySource::Anonymous(shared_data)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl From<&'static [u8]> for ReadOnlySource {
|
||||||
|
fn from(data: &'static [u8]) -> ReadOnlySource {
|
||||||
|
ReadOnlySource::Static(data)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
41
src/directory/shared_vec_slice.rs
Normal file
@@ -0,0 +1,41 @@
|
|||||||
|
use std::sync::Arc;
|
||||||
|
|
||||||
|
#[derive(Clone)]
|
||||||
|
pub struct SharedVecSlice {
|
||||||
|
pub data: Arc<Vec<u8>>,
|
||||||
|
pub start: usize,
|
||||||
|
pub len: usize,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl SharedVecSlice {
|
||||||
|
pub fn empty() -> SharedVecSlice {
|
||||||
|
SharedVecSlice::new(Arc::new(Vec::new()))
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn new(data: Arc<Vec<u8>>) -> SharedVecSlice {
|
||||||
|
let data_len = data.len();
|
||||||
|
SharedVecSlice {
|
||||||
|
data,
|
||||||
|
start: 0,
|
||||||
|
len: data_len,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn as_slice(&self) -> &[u8] {
|
||||||
|
&self.data[self.start..self.start + self.len]
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn slice(&self, from_offset: usize, to_offset: usize) -> SharedVecSlice {
|
||||||
|
SharedVecSlice {
|
||||||
|
data: Arc::clone(&self.data),
|
||||||
|
start: self.start + from_offset,
|
||||||
|
len: to_offset - from_offset,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl From<Vec<u8>> for SharedVecSlice {
|
||||||
|
fn from(data: Vec<u8>) -> SharedVecSlice {
|
||||||
|
SharedVecSlice::new(Arc::new(data))
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -1,222 +0,0 @@
|
|||||||
use super::*;
|
|
||||||
use std::io::{Seek, SeekFrom, Write};
|
|
||||||
use std::mem;
|
|
||||||
use std::path::{Path, PathBuf};
|
|
||||||
use std::sync::atomic::AtomicUsize;
|
|
||||||
use std::sync::atomic::Ordering;
|
|
||||||
use std::sync::Arc;
|
|
||||||
use std::thread;
|
|
||||||
use std::time;
|
|
||||||
use std::time::Duration;
|
|
||||||
|
|
||||||
lazy_static! {
|
|
||||||
static ref TEST_PATH: &'static Path = Path::new("some_path_for_test");
|
|
||||||
}
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn test_ram_directory() {
|
|
||||||
let mut ram_directory = RAMDirectory::create();
|
|
||||||
test_directory(&mut ram_directory);
|
|
||||||
}
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
#[cfg(feature = "mmap")]
|
|
||||||
fn test_mmap_directory() {
|
|
||||||
let mut mmap_directory = MmapDirectory::create_from_tempdir().unwrap();
|
|
||||||
test_directory(&mut mmap_directory);
|
|
||||||
}
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
#[should_panic]
|
|
||||||
fn ram_directory_panics_if_flush_forgotten() {
|
|
||||||
let mut ram_directory = RAMDirectory::create();
|
|
||||||
let mut write_file = ram_directory.open_write(*TEST_PATH).unwrap();
|
|
||||||
assert!(write_file.write_all(&[4]).is_ok());
|
|
||||||
}
|
|
||||||
|
|
||||||
fn test_simple(directory: &mut Directory) {
|
|
||||||
{
|
|
||||||
let mut write_file = directory.open_write(*TEST_PATH).unwrap();
|
|
||||||
assert!(directory.exists(*TEST_PATH));
|
|
||||||
write_file.write_all(&[4]).unwrap();
|
|
||||||
write_file.write_all(&[3]).unwrap();
|
|
||||||
write_file.write_all(&[7, 3, 5]).unwrap();
|
|
||||||
write_file.flush().unwrap();
|
|
||||||
}
|
|
||||||
{
|
|
||||||
let read_file = directory.open_read(*TEST_PATH).unwrap();
|
|
||||||
let data: &[u8] = &*read_file;
|
|
||||||
assert_eq!(data, &[4u8, 3u8, 7u8, 3u8, 5u8]);
|
|
||||||
}
|
|
||||||
assert!(directory.delete(*TEST_PATH).is_ok());
|
|
||||||
assert!(!directory.exists(*TEST_PATH));
|
|
||||||
}
|
|
||||||
|
|
||||||
fn test_seek(directory: &mut Directory) {
|
|
||||||
{
|
|
||||||
{
|
|
||||||
let mut write_file = directory.open_write(*TEST_PATH).unwrap();
|
|
||||||
write_file.write_all(&[4, 3, 7, 3, 5]).unwrap();
|
|
||||||
write_file.seek(SeekFrom::Start(0)).unwrap();
|
|
||||||
        write_file.write_all(&[3, 1]).unwrap();
        write_file.flush().unwrap();
    }
    {
        let read_file = directory.open_read(*TEST_PATH).unwrap();
        let data: &[u8] = &*read_file;
        assert_eq!(data, &[3u8, 1u8, 7u8, 3u8, 5u8]);
    }

    assert!(directory.delete(*TEST_PATH).is_ok());
}

fn test_rewrite_forbidden(directory: &mut Directory) {
    {
        directory.open_write(*TEST_PATH).unwrap();
        assert!(directory.exists(*TEST_PATH));
    }
    {
        assert!(directory.open_write(*TEST_PATH).is_err());
    }
    assert!(directory.delete(*TEST_PATH).is_ok());
}

fn test_write_create_the_file(directory: &mut Directory) {
    {
        assert!(directory.open_read(*TEST_PATH).is_err());
        let _w = directory.open_write(*TEST_PATH).unwrap();
        assert!(directory.exists(*TEST_PATH));
        assert!(directory.open_read(*TEST_PATH).is_ok());
        assert!(directory.delete(*TEST_PATH).is_ok());
    }
}

fn test_directory_delete(directory: &mut Directory) {
    assert!(directory.open_read(*TEST_PATH).is_err());
    let mut write_file = directory.open_write(*TEST_PATH).unwrap();
    write_file.write_all(&[1, 2, 3, 4]).unwrap();
    write_file.flush().unwrap();
    {
        let read_handle = directory.open_read(*TEST_PATH).unwrap();
        {
            assert_eq!(&*read_handle, &[1u8, 2u8, 3u8, 4u8]);

            // Mapped files can't be deleted on Windows
            if !cfg!(windows) {
                assert!(directory.delete(*TEST_PATH).is_ok());
                assert_eq!(&*read_handle, &[1u8, 2u8, 3u8, 4u8]);
            }

            assert!(directory.delete(Path::new("SomeOtherPath")).is_err());
        }
    }

    if cfg!(windows) {
        assert!(directory.delete(*TEST_PATH).is_ok());
    }

    assert!(directory.open_read(*TEST_PATH).is_err());
    assert!(directory.delete(*TEST_PATH).is_err());
}

fn test_directory(directory: &mut Directory) {
    test_simple(directory);
    test_seek(directory);
    test_rewrite_forbidden(directory);
    test_write_create_the_file(directory);
    test_directory_delete(directory);
    test_lock_non_blocking(directory);
    test_lock_blocking(directory);
    test_watch(directory);
}

fn test_watch(directory: &mut Directory) {
    let counter: Arc<AtomicUsize> = Default::default();
    let counter_clone = counter.clone();
    let watch_callback = Box::new(move || {
        counter_clone.fetch_add(1, Ordering::SeqCst);
    });
    assert!(directory
        .atomic_write(Path::new("meta.json"), b"random_test_data")
        .is_ok());
    thread::sleep(Duration::new(0, 10_000));
    assert_eq!(0, counter.load(Ordering::SeqCst));

    let watch_handle = directory.watch(watch_callback);
    for i in 0..10 {
        assert_eq!(i, counter.load(Ordering::SeqCst));
        assert!(directory
            .atomic_write(Path::new("meta.json"), b"random_test_data_2")
            .is_ok());
        for _ in 0..100 {
            if counter.load(Ordering::SeqCst) > i {
                break;
            }
            thread::sleep(Duration::from_millis(10));
        }
        assert_eq!(i + 1, counter.load(Ordering::SeqCst));
    }
    mem::drop(watch_handle);
    assert!(directory
        .atomic_write(Path::new("meta.json"), b"random_test_data")
        .is_ok());
    thread::sleep(Duration::from_millis(200));
    assert_eq!(10, counter.load(Ordering::SeqCst));
}

fn test_lock_non_blocking(directory: &mut Directory) {
    {
        let lock_a_res = directory.acquire_lock(&Lock {
            filepath: PathBuf::from("a.lock"),
            is_blocking: false,
        });
        assert!(lock_a_res.is_ok());
        let lock_b_res = directory.acquire_lock(&Lock {
            filepath: PathBuf::from("b.lock"),
            is_blocking: false,
        });
        assert!(lock_b_res.is_ok());
        let lock_a_res2 = directory.acquire_lock(&Lock {
            filepath: PathBuf::from("a.lock"),
            is_blocking: false,
        });
        assert!(lock_a_res2.is_err());
    }
    let lock_a_res = directory.acquire_lock(&Lock {
        filepath: PathBuf::from("a.lock"),
        is_blocking: false,
    });
    assert!(lock_a_res.is_ok());
}

fn test_lock_blocking(directory: &mut Directory) {
    let lock_a_res = directory.acquire_lock(&Lock {
        filepath: PathBuf::from("a.lock"),
        is_blocking: true,
    });
    assert!(lock_a_res.is_ok());
    std::thread::spawn(move || {
        //< lock_a_res is sent to the thread.
        std::thread::sleep(time::Duration::from_millis(10));
        // explicitly dropping lock_a_res. It would have been sufficient to just force it
        // to be part of the move, but the intent seems clearer that way.
        drop(lock_a_res);
    });
    {
        // A non-blocking call should fail, as the thread is running and holding the lock.
        let lock_a_res = directory.acquire_lock(&Lock {
            filepath: PathBuf::from("a.lock"),
            is_blocking: false,
        });
        assert!(lock_a_res.is_err());
    }
    {
        // the blocking call should wait for at least 10ms.
        let start = time::Instant::now();
        let lock_a_res = directory.acquire_lock(&Lock {
            filepath: PathBuf::from("a.lock"),
            is_blocking: true,
        });
        assert!(lock_a_res.is_ok());
        assert!(start.elapsed().subsec_millis() >= 10);
    }
}
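The `test_watch` helper above exercises the `Directory` watch API: a callback is registered with `watch`, each `atomic_write` of `meta.json` eventually fires it on a background thread, and dropping the `WatchHandle` unregisters it. A minimal sketch of the same pattern outside the test harness is shown below; the `RAMDirectory` choice, the crate import paths, and the polling loop are illustrative assumptions, not part of the diff.

```rust
use std::path::Path;
use std::sync::atomic::{AtomicUsize, Ordering};
use std::sync::Arc;

// Import paths assumed; in-crate code uses `directory::{Directory, RAMDirectory}`.
use tantivy::directory::{Directory, RAMDirectory};

fn main() {
    let mut directory = RAMDirectory::create();
    let counter = Arc::new(AtomicUsize::new(0));
    let counter_clone = counter.clone();

    // Keep the handle alive: dropping it silently unregisters the callback.
    let _watch_handle = directory.watch(Box::new(move || {
        counter_clone.fetch_add(1, Ordering::SeqCst);
    }));

    // Atomically rewriting meta.json is what triggers the watch callback.
    directory
        .atomic_write(Path::new("meta.json"), b"{}")
        .expect("atomic_write failed");

    // Callbacks run on a separate thread, so the counter is only eventually updated.
    while counter.load(Ordering::SeqCst) == 0 {
        std::thread::sleep(std::time::Duration::from_millis(10));
    }
}
```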
@@ -1,156 +0,0 @@
use std::sync::Arc;
use std::sync::RwLock;
use std::sync::Weak;

/// Type alias for callbacks registered when watching files of a `Directory`.
pub type WatchCallback = Box<Fn() -> () + Sync + Send>;

/// Helper struct to implement the watch method in `Directory` implementations.
///
/// It registers callbacks (See `.subscribe(...)`) and
/// calls them upon calls to `.broadcast(...)`.
#[derive(Default)]
pub struct WatchCallbackList {
    router: RwLock<Vec<Weak<WatchCallback>>>,
}

/// Controls how long a directory should watch for a file change.
///
/// After all the clones of `WatchHandle` are dropped, the associated callback will not be
/// called when a file change is detected.
#[must_use = "This `WatchHandle` controls the lifetime of the watch and should therefore be used."]
#[derive(Clone)]
pub struct WatchHandle(Arc<WatchCallback>);

impl WatchCallbackList {
    /// Subscribes a new callback and returns a handle that controls the lifetime of the callback.
    pub fn subscribe(&self, watch_callback: WatchCallback) -> WatchHandle {
        let watch_callback_arc = Arc::new(watch_callback);
        let watch_callback_weak = Arc::downgrade(&watch_callback_arc);
        self.router.write().unwrap().push(watch_callback_weak);
        WatchHandle(watch_callback_arc)
    }

    fn list_callback(&self) -> Vec<Arc<WatchCallback>> {
        let mut callbacks = vec![];
        let mut router_wlock = self.router.write().unwrap();
        let mut i = 0;
        while i < router_wlock.len() {
            if let Some(watch) = router_wlock[i].upgrade() {
                callbacks.push(watch);
                i += 1;
            } else {
                router_wlock.swap_remove(i);
            }
        }
        callbacks
    }

    /// Triggers all callbacks
    pub fn broadcast(&self) {
        let callbacks = self.list_callback();
        let spawn_res = std::thread::Builder::new()
            .name("watch-callbacks".to_string())
            .spawn(move || {
                for callback in callbacks {
                    callback();
                }
            });
        if let Err(err) = spawn_res {
            error!(
                "Failed to spawn thread to call watch callbacks. Cause: {:?}",
                err
            );
        }
    }
}

#[cfg(test)]
mod tests {
    use directory::WatchCallbackList;
    use std::mem;
    use std::sync::atomic::{AtomicUsize, Ordering};
    use std::sync::Arc;
    use std::thread;
    use std::time::Duration;

    const WAIT_TIME: u64 = 20;

    #[test]
    fn test_watch_event_router_simple() {
        let watch_event_router = WatchCallbackList::default();
        let counter: Arc<AtomicUsize> = Default::default();
        let counter_clone = counter.clone();
        let inc_callback = Box::new(move || {
            counter_clone.fetch_add(1, Ordering::SeqCst);
        });
        watch_event_router.broadcast();
        assert_eq!(0, counter.load(Ordering::SeqCst));
        let handle_a = watch_event_router.subscribe(inc_callback);
        thread::sleep(Duration::from_millis(WAIT_TIME));
        assert_eq!(0, counter.load(Ordering::SeqCst));
        watch_event_router.broadcast();
        thread::sleep(Duration::from_millis(WAIT_TIME));
        assert_eq!(1, counter.load(Ordering::SeqCst));
        watch_event_router.broadcast();
        watch_event_router.broadcast();
        watch_event_router.broadcast();
        thread::sleep(Duration::from_millis(WAIT_TIME));
        assert_eq!(4, counter.load(Ordering::SeqCst));
        mem::drop(handle_a);
        watch_event_router.broadcast();
        thread::sleep(Duration::from_millis(WAIT_TIME));
        assert_eq!(4, counter.load(Ordering::SeqCst));
    }

    #[test]
    fn test_watch_event_router_multiple_callback_same_key() {
        let watch_event_router = WatchCallbackList::default();
        let counter: Arc<AtomicUsize> = Default::default();
        let inc_callback = |inc: usize| {
            let counter_clone = counter.clone();
            Box::new(move || {
                counter_clone.fetch_add(inc, Ordering::SeqCst);
            })
        };
        let handle_a = watch_event_router.subscribe(inc_callback(1));
        let handle_a2 = watch_event_router.subscribe(inc_callback(10));
        thread::sleep(Duration::from_millis(WAIT_TIME));
        assert_eq!(0, counter.load(Ordering::SeqCst));
        watch_event_router.broadcast();
        watch_event_router.broadcast();
        thread::sleep(Duration::from_millis(WAIT_TIME));
        assert_eq!(22, counter.load(Ordering::SeqCst));
        mem::drop(handle_a);
        watch_event_router.broadcast();
        thread::sleep(Duration::from_millis(WAIT_TIME));
        assert_eq!(32, counter.load(Ordering::SeqCst));
        mem::drop(handle_a2);
        watch_event_router.broadcast();
        watch_event_router.broadcast();
        thread::sleep(Duration::from_millis(WAIT_TIME));
        assert_eq!(32, counter.load(Ordering::SeqCst));
    }

    #[test]
    fn test_watch_event_router_multiple_callback_different_key() {
        let watch_event_router = WatchCallbackList::default();
        let counter: Arc<AtomicUsize> = Default::default();
        let counter_clone = counter.clone();
        let inc_callback = Box::new(move || {
            counter_clone.fetch_add(1, Ordering::SeqCst);
        });
        let handle_a = watch_event_router.subscribe(inc_callback);
        assert_eq!(0, counter.load(Ordering::SeqCst));
        watch_event_router.broadcast();
        watch_event_router.broadcast();
        thread::sleep(Duration::from_millis(WAIT_TIME));
        assert_eq!(2, counter.load(Ordering::SeqCst));
        thread::sleep(Duration::from_millis(WAIT_TIME));
        mem::drop(handle_a);
        watch_event_router.broadcast();
        thread::sleep(Duration::from_millis(WAIT_TIME));
        assert_eq!(2, counter.load(Ordering::SeqCst));
    }
}
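The helper above keeps only `Weak` references to the subscribed callbacks, so a callback stays registered exactly as long as its `WatchHandle` (the strong `Arc`) is alive, and `broadcast` invokes the surviving callbacks on a dedicated thread. A small usage sketch mirroring the unit tests; the import path for `WatchCallbackList` is assumed, and the sleeps are only there because the callbacks run asynchronously.

```rust
use std::sync::atomic::{AtomicUsize, Ordering};
use std::sync::Arc;
use std::thread;
use std::time::Duration;

// Assumes `WatchCallbackList` is in scope (it lives in the `directory` module here).
fn demo(watch_list: &WatchCallbackList) {
    let counter = Arc::new(AtomicUsize::new(0));
    let counter_clone = counter.clone();

    // The handle is the only strong reference to the boxed callback.
    let handle = watch_list.subscribe(Box::new(move || {
        counter_clone.fetch_add(1, Ordering::SeqCst);
    }));

    watch_list.broadcast();
    thread::sleep(Duration::from_millis(20)); // callbacks run on a helper thread
    assert_eq!(counter.load(Ordering::SeqCst), 1);

    // Dropping the handle lets the Weak pointer die: later broadcasts are no-ops.
    drop(handle);
    watch_list.broadcast();
    thread::sleep(Duration::from_millis(20));
    assert_eq!(counter.load(Ordering::SeqCst), 1);
}
```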
src/error.rs (83 changed lines)
@@ -2,93 +2,53 @@
|
|||||||
|
|
||||||
use std::io;
|
use std::io;
|
||||||
|
|
||||||
use directory::error::LockError;
|
|
||||||
use directory::error::{IOError, OpenDirectoryError, OpenReadError, OpenWriteError};
|
use directory::error::{IOError, OpenDirectoryError, OpenReadError, OpenWriteError};
|
||||||
use fastfield::FastFieldNotAvailableError;
|
use fastfield::FastFieldNotAvailableError;
|
||||||
|
use indexer::LockType;
|
||||||
use query;
|
use query;
|
||||||
use schema;
|
use schema;
|
||||||
use serde_json;
|
use serde_json;
|
||||||
use std::fmt;
|
|
||||||
use std::path::PathBuf;
|
use std::path::PathBuf;
|
||||||
use std::sync::PoisonError;
|
use std::sync::PoisonError;
|
||||||
|
|
||||||
pub struct DataCorruption {
|
|
||||||
filepath: Option<PathBuf>,
|
|
||||||
comment: String,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl DataCorruption {
|
|
||||||
pub fn new(filepath: PathBuf, comment: String) -> DataCorruption {
|
|
||||||
DataCorruption {
|
|
||||||
filepath: Some(filepath),
|
|
||||||
comment,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn comment_only(comment: String) -> DataCorruption {
|
|
||||||
DataCorruption {
|
|
||||||
filepath: None,
|
|
||||||
comment,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl fmt::Debug for DataCorruption {
|
|
||||||
fn fmt(&self, f: &mut fmt::Formatter) -> Result<(), fmt::Error> {
|
|
||||||
write!(f, "Data corruption: ")?;
|
|
||||||
if let Some(ref filepath) = &self.filepath {
|
|
||||||
write!(f, "(in file `{:?}`)", filepath)?;
|
|
||||||
}
|
|
||||||
write!(f, ": {}.", self.comment)?;
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/// The library's failure based error enum
|
/// The library's failure based error enum
|
||||||
#[derive(Debug, Fail)]
|
#[derive(Debug, Fail)]
|
||||||
pub enum TantivyError {
|
pub enum TantivyError {
|
||||||
/// Path does not exist.
|
/// Path does not exist.
|
||||||
#[fail(display = "Path does not exist: '{:?}'", _0)]
|
#[fail(display = "path does not exist: '{:?}'", _0)]
|
||||||
PathDoesNotExist(PathBuf),
|
PathDoesNotExist(PathBuf),
|
||||||
/// File already exists, this is a problem when we try to write into a new file.
|
/// File already exists, this is a problem when we try to write into a new file.
|
||||||
#[fail(display = "File already exists: '{:?}'", _0)]
|
#[fail(display = "file already exists: '{:?}'", _0)]
|
||||||
FileAlreadyExists(PathBuf),
|
FileAlreadyExists(PathBuf),
|
||||||
/// Index already exists in this directory
|
|
||||||
#[fail(display = "Index already exists")]
|
|
||||||
IndexAlreadyExists,
|
|
||||||
/// Failed to acquire file lock
|
/// Failed to acquire file lock
|
||||||
#[fail(display = "Failed to acquire Lockfile: {:?}. {:?}", _0, _1)]
|
#[fail(
|
||||||
LockFailure(LockError, Option<String>),
|
display = "Failed to acquire Lockfile: {:?}. Possible causes: another IndexWriter instance or panic during previous lock drop.",
|
||||||
|
_0
|
||||||
|
)]
|
||||||
|
LockFailure(LockType),
|
||||||
/// IO Error.
|
/// IO Error.
|
||||||
#[fail(display = "An IO error occurred: '{}'", _0)]
|
#[fail(display = "an IO error occurred: '{}'", _0)]
|
||||||
IOError(#[cause] IOError),
|
IOError(#[cause] IOError),
|
||||||
/// Data corruption.
|
/// The data within is corrupted.
|
||||||
#[fail(display = "{:?}", _0)]
|
///
|
||||||
DataCorruption(DataCorruption),
|
/// For instance, it contains invalid JSON.
|
||||||
|
#[fail(display = "file contains corrupted data: '{:?}'", _0)]
|
||||||
|
CorruptedFile(PathBuf),
|
||||||
/// A thread holding the locked panicked and poisoned the lock.
|
/// A thread holding the locked panicked and poisoned the lock.
|
||||||
#[fail(display = "A thread holding the locked panicked and poisoned the lock")]
|
#[fail(display = "a thread holding the locked panicked and poisoned the lock")]
|
||||||
Poisoned,
|
Poisoned,
|
||||||
/// Invalid argument was passed by the user.
|
/// Invalid argument was passed by the user.
|
||||||
#[fail(display = "An invalid argument was passed: '{}'", _0)]
|
#[fail(display = "an invalid argument was passed: '{}'", _0)]
|
||||||
InvalidArgument(String),
|
InvalidArgument(String),
|
||||||
/// An Error happened in one of the thread.
|
/// An Error happened in one of the thread.
|
||||||
#[fail(display = "An error occurred in a thread: '{}'", _0)]
|
#[fail(display = "an error occurred in a thread: '{}'", _0)]
|
||||||
ErrorInThread(String),
|
ErrorInThread(String),
|
||||||
/// An Error appeared related to the schema.
|
/// An Error appeared related to the schema.
|
||||||
#[fail(display = "Schema error: '{}'", _0)]
|
#[fail(display = "Schema error: '{}'", _0)]
|
||||||
SchemaError(String),
|
SchemaError(String),
|
||||||
/// Tried to access a fastfield reader for a field not configured accordingly.
|
/// Tried to access a fastfield reader for a field not configured accordingly.
|
||||||
#[fail(display = "Fast field not available: '{:?}'", _0)]
|
#[fail(display = "fast field not available: '{:?}'", _0)]
|
||||||
FastFieldError(#[cause] FastFieldNotAvailableError),
|
FastFieldError(#[cause] FastFieldNotAvailableError),
|
||||||
/// System error. (e.g.: We failed spawning a new thread)
|
|
||||||
#[fail(display = "System error.'{}'", _0)]
|
|
||||||
SystemError(String),
|
|
||||||
}
|
|
||||||
|
|
||||||
impl From<DataCorruption> for TantivyError {
|
|
||||||
fn from(data_corruption: DataCorruption) -> TantivyError {
|
|
||||||
TantivyError::DataCorruption(data_corruption)
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
impl From<FastFieldNotAvailableError> for TantivyError {
|
impl From<FastFieldNotAvailableError> for TantivyError {
|
||||||
@@ -97,12 +57,6 @@ impl From<FastFieldNotAvailableError> for TantivyError {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
impl From<LockError> for TantivyError {
|
|
||||||
fn from(lock_error: LockError) -> TantivyError {
|
|
||||||
TantivyError::LockFailure(lock_error, None)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl From<IOError> for TantivyError {
|
impl From<IOError> for TantivyError {
|
||||||
fn from(io_error: IOError) -> TantivyError {
|
fn from(io_error: IOError) -> TantivyError {
|
||||||
TantivyError::IOError(io_error)
|
TantivyError::IOError(io_error)
|
||||||
@@ -162,7 +116,6 @@ impl From<OpenDirectoryError> for TantivyError {
|
|||||||
OpenDirectoryError::NotADirectory(directory_path) => {
|
OpenDirectoryError::NotADirectory(directory_path) => {
|
||||||
TantivyError::InvalidArgument(format!("{:?} is not a directory", directory_path))
|
TantivyError::InvalidArgument(format!("{:?} is not a directory", directory_path))
|
||||||
}
|
}
|
||||||
OpenDirectoryError::IoError(err) => TantivyError::IOError(IOError::from(err)),
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
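The hunk above changes how corruption is reported: one side funnels it through a dedicated `DataCorruption` struct (an optional file path plus a free-form comment) that converts into `TantivyError::DataCorruption`, while the other side only has a `TantivyError::CorruptedFile(PathBuf)` variant. A hedged sketch of the struct-based variant follows; the file name and messages are made up, and `DataCorruption`/`TantivyError` are assumed to be in scope (they live in the crate's error module and may not be publicly re-exported).

```rust
use std::path::PathBuf;

fn report_corruption() -> Result<(), TantivyError> {
    // Attach the offending file when it is known...
    let err = DataCorruption::new(
        PathBuf::from("meta.json"),
        "meta.json does not contain valid JSON".to_string(),
    );
    // ...or build a path-less report otherwise.
    let _generic = DataCorruption::comment_only("checksum mismatch".to_string());

    // `impl From<DataCorruption> for TantivyError` allows `?`-style conversion.
    Err(TantivyError::from(err))
}
```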
@@ -6,12 +6,12 @@ pub use self::writer::BytesFastFieldWriter;
 
 #[cfg(test)]
 mod tests {
-    use schema::Schema;
+    use schema::SchemaBuilder;
     use Index;
 
     #[test]
     fn test_bytes() {
-        let mut schema_builder = Schema::builder();
+        let mut schema_builder = SchemaBuilder::default();
         let field = schema_builder.add_bytes_field("bytesfield");
         let schema = schema_builder.build();
         let index = Index::create_in_ram(schema);
@@ -22,7 +22,9 @@ mod tests {
         index_writer.add_document(doc!(field=>vec![1u8, 3, 5, 7, 9]));
         index_writer.add_document(doc!(field=>vec![0u8; 1000]));
         assert!(index_writer.commit().is_ok());
-        let searcher = index.reader().unwrap().searcher();
+        index.load_searchers().unwrap();
+        let searcher = index.searcher();
         let reader = searcher.segment_reader(0);
         let bytes_reader = reader.bytes_fast_field_reader(field).unwrap();
 
@@ -2,7 +2,6 @@ use bit_set::BitSet;
|
|||||||
use common::HasLen;
|
use common::HasLen;
|
||||||
use directory::ReadOnlySource;
|
use directory::ReadOnlySource;
|
||||||
use directory::WritePtr;
|
use directory::WritePtr;
|
||||||
use space_usage::ByteCount;
|
|
||||||
use std::io;
|
use std::io;
|
||||||
use std::io::Write;
|
use std::io::Write;
|
||||||
use DocId;
|
use DocId;
|
||||||
@@ -64,11 +63,6 @@ impl DeleteBitSet {
|
|||||||
b & (1u8 << shift) != 0
|
b & (1u8 << shift) != 0
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Summarize total space usage of this bitset.
|
|
||||||
pub fn space_usage(&self) -> ByteCount {
|
|
||||||
self.data.len()
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
impl HasLen for DeleteBitSet {
|
impl HasLen for DeleteBitSet {
|
||||||
|
|||||||
@@ -1,6 +1,5 @@
|
|||||||
use super::MultiValueIntFastFieldReader;
|
use super::MultiValueIntFastFieldReader;
|
||||||
use schema::Facet;
|
use schema::Facet;
|
||||||
use std::str;
|
|
||||||
use termdict::TermDictionary;
|
use termdict::TermDictionary;
|
||||||
use termdict::TermOrdinal;
|
use termdict::TermOrdinal;
|
||||||
use DocId;
|
use DocId;
|
||||||
@@ -21,7 +20,6 @@ use DocId;
|
|||||||
pub struct FacetReader {
|
pub struct FacetReader {
|
||||||
term_ords: MultiValueIntFastFieldReader<u64>,
|
term_ords: MultiValueIntFastFieldReader<u64>,
|
||||||
term_dict: TermDictionary,
|
term_dict: TermDictionary,
|
||||||
buffer: Vec<u8>,
|
|
||||||
}
|
}
|
||||||
|
|
||||||
impl FacetReader {
|
impl FacetReader {
|
||||||
@@ -39,7 +37,6 @@ impl FacetReader {
|
|||||||
FacetReader {
|
FacetReader {
|
||||||
term_ords,
|
term_ords,
|
||||||
term_dict,
|
term_dict,
|
||||||
buffer: vec![],
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -58,18 +55,11 @@ impl FacetReader {
|
|||||||
}
|
}
|
||||||
|
|
||||||
/// Given a term ordinal returns the term associated to it.
|
/// Given a term ordinal returns the term associated to it.
|
||||||
pub fn facet_from_ord(
|
pub fn facet_from_ord(&self, facet_ord: TermOrdinal, output: &mut Facet) {
|
||||||
&mut self,
|
|
||||||
facet_ord: TermOrdinal,
|
|
||||||
output: &mut Facet,
|
|
||||||
) -> Result<(), str::Utf8Error> {
|
|
||||||
let found_term = self
|
let found_term = self
|
||||||
.term_dict
|
.term_dict
|
||||||
.ord_to_term(facet_ord as u64, &mut self.buffer);
|
.ord_to_term(facet_ord as u64, output.inner_buffer_mut());
|
||||||
assert!(found_term, "Term ordinal {} no found.", facet_ord);
|
assert!(found_term, "Term ordinal {} no found.", facet_ord);
|
||||||
let facet_str = str::from_utf8(&self.buffer[..])?;
|
|
||||||
output.set_facet_str(facet_str);
|
|
||||||
Ok(())
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Return the list of facet ordinals associated to a document.
|
/// Return the list of facet ordinals associated to a document.
|
||||||
|
|||||||
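In the hunk above, one version of `facet_from_ord` buffers the raw term bytes itself and returns a `Result`, because decoding the stored term as UTF-8 can fail, while the other writes directly into the `Facet`'s internal buffer and cannot fail. With the `Result`-returning signature, typical use looks like this sketch; the `FacetReader` value is assumed to come from `segment_reader.facet_reader(facet_field)`, and import paths are assumptions.

```rust
use tantivy::schema::Facet;

// `facet_reader` would typically come from
// `segment_reader.facet_reader(facet_field).unwrap()`.
fn facet_for_ord(facet_reader: &mut FacetReader, ord: u64) -> Facet {
    let mut facet = Facet::root();
    // Fails only if the bytes stored in the term dictionary are not valid UTF-8.
    facet_reader
        .facet_from_ord(ord, &mut facet)
        .expect("corrupted facet term");
    facet
}
```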
@@ -127,19 +127,19 @@ mod tests {
|
|||||||
use common::CompositeFile;
|
use common::CompositeFile;
|
||||||
use directory::{Directory, RAMDirectory, WritePtr};
|
use directory::{Directory, RAMDirectory, WritePtr};
|
||||||
use fastfield::FastFieldReader;
|
use fastfield::FastFieldReader;
|
||||||
use rand::prelude::SliceRandom;
|
use rand::Rng;
|
||||||
use rand::rngs::StdRng;
|
|
||||||
use rand::SeedableRng;
|
use rand::SeedableRng;
|
||||||
|
use rand::XorShiftRng;
|
||||||
use schema::Document;
|
use schema::Document;
|
||||||
use schema::Field;
|
use schema::Field;
|
||||||
use schema::Schema;
|
|
||||||
use schema::FAST;
|
use schema::FAST;
|
||||||
|
use schema::{Schema, SchemaBuilder};
|
||||||
use std::collections::HashMap;
|
use std::collections::HashMap;
|
||||||
use std::path::Path;
|
use std::path::Path;
|
||||||
|
|
||||||
lazy_static! {
|
lazy_static! {
|
||||||
pub static ref SCHEMA: Schema = {
|
pub static ref SCHEMA: Schema = {
|
||||||
let mut schema_builder = Schema::builder();
|
let mut schema_builder = SchemaBuilder::default();
|
||||||
schema_builder.add_u64_field("field", FAST);
|
schema_builder.add_u64_field("field", FAST);
|
||||||
schema_builder.build()
|
schema_builder.build()
|
||||||
};
|
};
|
||||||
@@ -298,7 +298,7 @@ mod tests {
|
|||||||
fn test_signed_intfastfield() {
|
fn test_signed_intfastfield() {
|
||||||
let path = Path::new("test");
|
let path = Path::new("test");
|
||||||
let mut directory: RAMDirectory = RAMDirectory::create();
|
let mut directory: RAMDirectory = RAMDirectory::create();
|
||||||
let mut schema_builder = Schema::builder();
|
let mut schema_builder = SchemaBuilder::new();
|
||||||
|
|
||||||
let i64_field = schema_builder.add_i64_field("field", FAST);
|
let i64_field = schema_builder.add_i64_field("field", FAST);
|
||||||
let schema = schema_builder.build();
|
let schema = schema_builder.build();
|
||||||
@@ -342,7 +342,7 @@ mod tests {
|
|||||||
fn test_signed_intfastfield_default_val() {
|
fn test_signed_intfastfield_default_val() {
|
||||||
let path = Path::new("test");
|
let path = Path::new("test");
|
||||||
let mut directory: RAMDirectory = RAMDirectory::create();
|
let mut directory: RAMDirectory = RAMDirectory::create();
|
||||||
let mut schema_builder = Schema::builder();
|
let mut schema_builder = SchemaBuilder::new();
|
||||||
let i64_field = schema_builder.add_i64_field("field", FAST);
|
let i64_field = schema_builder.add_i64_field("field", FAST);
|
||||||
let schema = schema_builder.build();
|
let schema = schema_builder.build();
|
||||||
|
|
||||||
@@ -367,10 +367,11 @@ mod tests {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Warning: this generates the same permutation at each call
|
|
||||||
pub fn generate_permutation() -> Vec<u64> {
|
pub fn generate_permutation() -> Vec<u64> {
|
||||||
|
let seed: [u8; 16] = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16];
|
||||||
|
let mut rng = XorShiftRng::from_seed(seed);
|
||||||
let mut permutation: Vec<u64> = (0u64..100_000u64).collect();
|
let mut permutation: Vec<u64> = (0u64..100_000u64).collect();
|
||||||
permutation.shuffle(&mut StdRng::from_seed([1u8; 32]));
|
rng.shuffle(&mut permutation);
|
||||||
permutation
|
permutation
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
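One side of the hunk above pins the test permutation to a fixed seed with `StdRng` and the `SliceRandom` trait, instead of the older `XorShiftRng::shuffle` call. The pattern in isolation, assuming the same newer `rand` API, with a tiny check that the output really is deterministic:

```rust
use rand::prelude::SliceRandom;
use rand::rngs::StdRng;
use rand::SeedableRng;

/// Returns the same pseudo-random permutation of 0..100_000 on every call,
/// because the RNG seed is a constant.
pub fn generate_permutation() -> Vec<u64> {
    let mut permutation: Vec<u64> = (0u64..100_000u64).collect();
    permutation.shuffle(&mut StdRng::from_seed([1u8; 32]));
    permutation
}

fn main() {
    let p1 = generate_permutation();
    let p2 = generate_permutation();
    assert_eq!(p1, p2); // fixed seed, identical permutations
}
```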
@@ -7,20 +7,14 @@ pub use self::writer::MultiValueIntFastFieldWriter;
|
|||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
mod tests {
|
mod tests {
|
||||||
|
|
||||||
extern crate time;
|
|
||||||
|
|
||||||
use self::time::Duration;
|
|
||||||
use collector::TopDocs;
|
|
||||||
use query::QueryParser;
|
|
||||||
use schema::Cardinality;
|
use schema::Cardinality;
|
||||||
use schema::Facet;
|
|
||||||
use schema::IntOptions;
|
use schema::IntOptions;
|
||||||
use schema::Schema;
|
use schema::SchemaBuilder;
|
||||||
use Index;
|
use Index;
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn test_multivalued_u64() {
|
fn test_multivalued_u64() {
|
||||||
let mut schema_builder = Schema::builder();
|
let mut schema_builder = SchemaBuilder::default();
|
||||||
let field = schema_builder.add_u64_field(
|
let field = schema_builder.add_u64_field(
|
||||||
"multifield",
|
"multifield",
|
||||||
IntOptions::default().set_fast(Cardinality::MultiValues),
|
IntOptions::default().set_fast(Cardinality::MultiValues),
|
||||||
@@ -34,12 +28,11 @@ mod tests {
|
|||||||
index_writer.add_document(doc!(field=>5u64, field=>20u64,field=>1u64));
|
index_writer.add_document(doc!(field=>5u64, field=>20u64,field=>1u64));
|
||||||
assert!(index_writer.commit().is_ok());
|
assert!(index_writer.commit().is_ok());
|
||||||
|
|
||||||
let searcher = index.reader().unwrap().searcher();
|
index.load_searchers().unwrap();
|
||||||
let segment_reader = searcher.segment_reader(0);
|
let searcher = index.searcher();
|
||||||
|
let reader = searcher.segment_reader(0);
|
||||||
let mut vals = Vec::new();
|
let mut vals = Vec::new();
|
||||||
let multi_value_reader = segment_reader
|
let multi_value_reader = reader.multi_fast_field_reader::<u64>(field).unwrap();
|
||||||
.multi_fast_field_reader::<u64>(field)
|
|
||||||
.unwrap();
|
|
||||||
{
|
{
|
||||||
multi_value_reader.get_vals(2, &mut vals);
|
multi_value_reader.get_vals(2, &mut vals);
|
||||||
assert_eq!(&vals, &[4u64]);
|
assert_eq!(&vals, &[4u64]);
|
||||||
@@ -54,136 +47,9 @@ mod tests {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn test_multivalued_date() {
|
|
||||||
let mut schema_builder = Schema::builder();
|
|
||||||
let date_field = schema_builder.add_date_field(
|
|
||||||
"multi_date_field",
|
|
||||||
IntOptions::default()
|
|
||||||
.set_fast(Cardinality::MultiValues)
|
|
||||||
.set_indexed()
|
|
||||||
.set_stored(),
|
|
||||||
);
|
|
||||||
let time_i =
|
|
||||||
schema_builder.add_i64_field("time_stamp_i", IntOptions::default().set_stored());
|
|
||||||
let schema = schema_builder.build();
|
|
||||||
let index = Index::create_in_ram(schema);
|
|
||||||
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
|
|
||||||
let first_time_stamp = chrono::Utc::now();
|
|
||||||
index_writer.add_document(
|
|
||||||
doc!(date_field=>first_time_stamp, date_field=>first_time_stamp, time_i=>1i64),
|
|
||||||
);
|
|
||||||
index_writer.add_document(doc!(time_i=>0i64));
|
|
||||||
// add one second
|
|
||||||
index_writer
|
|
||||||
.add_document(doc!(date_field=>first_time_stamp + Duration::seconds(1), time_i=>2i64));
|
|
||||||
// add another second
|
|
||||||
let two_secs_ahead = first_time_stamp + Duration::seconds(2);
|
|
||||||
index_writer.add_document(doc!(date_field=>two_secs_ahead, date_field=>two_secs_ahead,date_field=>two_secs_ahead, time_i=>3i64));
|
|
||||||
assert!(index_writer.commit().is_ok());
|
|
||||||
|
|
||||||
let reader = index.reader().unwrap();
|
|
||||||
let searcher = reader.searcher();
|
|
||||||
let reader = searcher.segment_reader(0);
|
|
||||||
assert_eq!(reader.num_docs(), 4);
|
|
||||||
|
|
||||||
{
|
|
||||||
let parser = QueryParser::for_index(&index, vec![date_field]);
|
|
||||||
let query = parser
|
|
||||||
.parse_query(&format!("\"{}\"", first_time_stamp.to_rfc3339()).to_string())
|
|
||||||
.expect("could not parse query");
|
|
||||||
let results = searcher
|
|
||||||
.search(&query, &TopDocs::with_limit(5))
|
|
||||||
.expect("could not query index");
|
|
||||||
|
|
||||||
assert_eq!(results.len(), 1);
|
|
||||||
for (_score, doc_address) in results {
|
|
||||||
let retrieved_doc = searcher.doc(doc_address).expect("cannot fetch doc");
|
|
||||||
assert_eq!(
|
|
||||||
retrieved_doc
|
|
||||||
.get_first(date_field)
|
|
||||||
.expect("cannot find value")
|
|
||||||
.date_value()
|
|
||||||
.timestamp(),
|
|
||||||
first_time_stamp.timestamp()
|
|
||||||
);
|
|
||||||
assert_eq!(
|
|
||||||
retrieved_doc
|
|
||||||
.get_first(time_i)
|
|
||||||
.expect("cannot find value")
|
|
||||||
.i64_value(),
|
|
||||||
1i64
|
|
||||||
);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
{
|
|
||||||
let parser = QueryParser::for_index(&index, vec![date_field]);
|
|
||||||
let query = parser
|
|
||||||
.parse_query(&format!("\"{}\"", two_secs_ahead.to_rfc3339()).to_string())
|
|
||||||
.expect("could not parse query");
|
|
||||||
let results = searcher
|
|
||||||
.search(&query, &TopDocs::with_limit(5))
|
|
||||||
.expect("could not query index");
|
|
||||||
|
|
||||||
assert_eq!(results.len(), 1);
|
|
||||||
|
|
||||||
for (_score, doc_address) in results {
|
|
||||||
let retrieved_doc = searcher.doc(doc_address).expect("cannot fetch doc");
|
|
||||||
assert_eq!(
|
|
||||||
retrieved_doc
|
|
||||||
.get_first(date_field)
|
|
||||||
.expect("cannot find value")
|
|
||||||
.date_value()
|
|
||||||
.timestamp(),
|
|
||||||
two_secs_ahead.timestamp()
|
|
||||||
);
|
|
||||||
assert_eq!(
|
|
||||||
retrieved_doc
|
|
||||||
.get_first(time_i)
|
|
||||||
.expect("cannot find value")
|
|
||||||
.i64_value(),
|
|
||||||
3i64
|
|
||||||
);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// TODO: support Date range queries
|
|
||||||
// {
|
|
||||||
// let parser = QueryParser::for_index(&index, vec![date_field]);
|
|
||||||
// let range_q = format!("\"{}\"..\"{}\"",
|
|
||||||
// (first_time_stamp + Duration::seconds(1)).to_rfc3339(),
|
|
||||||
// (first_time_stamp + Duration::seconds(3)).to_rfc3339()
|
|
||||||
// );
|
|
||||||
// let query = parser.parse_query(&range_q)
|
|
||||||
// .expect("could not parse query");
|
|
||||||
// let results = searcher.search(&query, &TopDocs::with_limit(5))
|
|
||||||
// .expect("could not query index");
|
|
||||||
//
|
|
||||||
//
|
|
||||||
// assert_eq!(results.len(), 2);
|
|
||||||
// for (i, doc_pair) in results.iter().enumerate() {
|
|
||||||
// let retrieved_doc = searcher.doc(doc_pair.1).expect("cannot fetch doc");
|
|
||||||
// let offset_sec = match i {
|
|
||||||
// 0 => 1,
|
|
||||||
// 1 => 3,
|
|
||||||
// _ => panic!("should not have more than 2 docs")
|
|
||||||
// };
|
|
||||||
// let time_i_val = match i {
|
|
||||||
// 0 => 2,
|
|
||||||
// 1 => 3,
|
|
||||||
// _ => panic!("should not have more than 2 docs")
|
|
||||||
// };
|
|
||||||
// assert_eq!(retrieved_doc.get_first(date_field).expect("cannot find value").date_value().timestamp(),
|
|
||||||
// (first_time_stamp + Duration::seconds(offset_sec)).timestamp());
|
|
||||||
// assert_eq!(retrieved_doc.get_first(time_i).expect("cannot find value").i64_value(), time_i_val);
|
|
||||||
// }
|
|
||||||
// }
|
|
||||||
}
|
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn test_multivalued_i64() {
|
fn test_multivalued_i64() {
|
||||||
let mut schema_builder = Schema::builder();
|
let mut schema_builder = SchemaBuilder::default();
|
||||||
let field = schema_builder.add_i64_field(
|
let field = schema_builder.add_i64_field(
|
||||||
"multifield",
|
"multifield",
|
||||||
IntOptions::default().set_fast(Cardinality::MultiValues),
|
IntOptions::default().set_fast(Cardinality::MultiValues),
|
||||||
@@ -197,7 +63,8 @@ mod tests {
|
|||||||
index_writer.add_document(doc!(field=> -5i64, field => -20i64, field=>1i64));
|
index_writer.add_document(doc!(field=> -5i64, field => -20i64, field=>1i64));
|
||||||
assert!(index_writer.commit().is_ok());
|
assert!(index_writer.commit().is_ok());
|
||||||
|
|
||||||
let searcher = index.reader().unwrap().searcher();
|
index.load_searchers().unwrap();
|
||||||
|
let searcher = index.searcher();
|
||||||
let reader = searcher.segment_reader(0);
|
let reader = searcher.segment_reader(0);
|
||||||
let mut vals = Vec::new();
|
let mut vals = Vec::new();
|
||||||
let multi_value_reader = reader.multi_fast_field_reader::<i64>(field).unwrap();
|
let multi_value_reader = reader.multi_fast_field_reader::<i64>(field).unwrap();
|
||||||
@@ -218,17 +85,4 @@ mod tests {
|
|||||||
assert_eq!(&vals, &[-5i64, -20i64, 1i64]);
|
assert_eq!(&vals, &[-5i64, -20i64, 1i64]);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
#[test]
|
|
||||||
#[ignore]
|
|
||||||
fn test_many_facets() {
|
|
||||||
let mut schema_builder = Schema::builder();
|
|
||||||
let field = schema_builder.add_facet_field("facetfield");
|
|
||||||
let schema = schema_builder.build();
|
|
||||||
let index = Index::create_in_ram(schema);
|
|
||||||
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
|
|
||||||
for i in 0..100_000 {
|
|
||||||
index_writer.add_document(doc!(field=> Facet::from(format!("/lang/{}", i).as_str())));
|
|
||||||
}
|
|
||||||
assert!(index_writer.commit().is_ok());
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|||||||
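Both versions of the test above follow the same pattern for multivalued fast fields: declare the field with `Cardinality::MultiValues`, add several values for the same field within one document, then read them back per document through `multi_fast_field_reader`. A condensed sketch using the `index.reader()` variant from the left column; import paths and the memory budget are assumptions, and error handling is reduced to `unwrap`.

```rust
use tantivy::schema::{Cardinality, IntOptions, Schema};
use tantivy::{doc, Index};

fn main() {
    let mut schema_builder = Schema::builder();
    let field = schema_builder.add_u64_field(
        "multifield",
        IntOptions::default().set_fast(Cardinality::MultiValues),
    );
    let index = Index::create_in_ram(schema_builder.build());

    let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
    // Repeating the field in `doc!` stores several values for one document.
    index_writer.add_document(doc!(field => 5u64, field => 20u64, field => 1u64));
    assert!(index_writer.commit().is_ok());

    let searcher = index.reader().unwrap().searcher();
    let segment_reader = searcher.segment_reader(0);
    let multi = segment_reader.multi_fast_field_reader::<u64>(field).unwrap();

    let mut vals = Vec::new();
    multi.get_vals(0, &mut vals); // doc 0 holds [5, 20, 1]
    assert_eq!(&vals, &[5u64, 20u64, 1u64]);
}
```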
@@ -39,7 +39,7 @@ impl<Item: FastValue> MultiValueIntFastFieldReader<Item> {
|
|||||||
let (start, stop) = self.range(doc);
|
let (start, stop) = self.range(doc);
|
||||||
let len = (stop - start) as usize;
|
let len = (stop - start) as usize;
|
||||||
vals.resize(len, Item::default());
|
vals.resize(len, Item::default());
|
||||||
self.vals_reader.get_range_u64(start, &mut vals[..]);
|
self.vals_reader.get_range(start as u32, &mut vals[..]);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -47,11 +47,11 @@ impl<Item: FastValue> MultiValueIntFastFieldReader<Item> {
|
|||||||
mod tests {
|
mod tests {
|
||||||
|
|
||||||
use core::Index;
|
use core::Index;
|
||||||
use schema::{Document, Facet, Schema};
|
use schema::{Document, Facet, SchemaBuilder};
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn test_multifastfield_reader() {
|
fn test_multifastfield_reader() {
|
||||||
let mut schema_builder = Schema::builder();
|
let mut schema_builder = SchemaBuilder::new();
|
||||||
let facet_field = schema_builder.add_facet_field("facets");
|
let facet_field = schema_builder.add_facet_field("facets");
|
||||||
let schema = schema_builder.build();
|
let schema = schema_builder.build();
|
||||||
let index = Index::create_in_ram(schema);
|
let index = Index::create_in_ram(schema);
|
||||||
@@ -75,26 +75,27 @@ mod tests {
|
|||||||
index_writer.add_document(doc);
|
index_writer.add_document(doc);
|
||||||
}
|
}
|
||||||
index_writer.commit().expect("Commit failed");
|
index_writer.commit().expect("Commit failed");
|
||||||
let searcher = index.reader().unwrap().searcher();
|
index.load_searchers().expect("Reloading searchers");
|
||||||
|
let searcher = index.searcher();
|
||||||
let segment_reader = searcher.segment_reader(0);
|
let segment_reader = searcher.segment_reader(0);
|
||||||
let mut facet_reader = segment_reader.facet_reader(facet_field).unwrap();
|
let mut facet_reader = segment_reader.facet_reader(facet_field).unwrap();
|
||||||
|
|
||||||
let mut facet = Facet::root();
|
let mut facet = Facet::root();
|
||||||
{
|
{
|
||||||
facet_reader.facet_from_ord(1, &mut facet).unwrap();
|
facet_reader.facet_from_ord(1, &mut facet);
|
||||||
assert_eq!(facet, Facet::from("/category"));
|
assert_eq!(facet, Facet::from("/category"));
|
||||||
}
|
}
|
||||||
{
|
{
|
||||||
facet_reader.facet_from_ord(2, &mut facet).unwrap();
|
facet_reader.facet_from_ord(2, &mut facet);
|
||||||
assert_eq!(facet, Facet::from("/category/cat1"));
|
assert_eq!(facet, Facet::from("/category/cat1"));
|
||||||
}
|
}
|
||||||
{
|
{
|
||||||
facet_reader.facet_from_ord(3, &mut facet).unwrap();
|
facet_reader.facet_from_ord(3, &mut facet);
|
||||||
assert_eq!(format!("{}", facet), "/category/cat2");
|
assert_eq!(format!("{}", facet), "/category/cat2");
|
||||||
assert_eq!(facet, Facet::from("/category/cat2"));
|
assert_eq!(facet, Facet::from("/category/cat2"));
|
||||||
}
|
}
|
||||||
{
|
{
|
||||||
facet_reader.facet_from_ord(4, &mut facet).unwrap();
|
facet_reader.facet_from_ord(4, &mut facet);
|
||||||
assert_eq!(facet, Facet::from("/category/cat3"));
|
assert_eq!(facet, Facet::from("/category/cat3"));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -32,7 +32,7 @@ use DocId;
|
|||||||
/// term ids when the segment is getting serialized.
|
/// term ids when the segment is getting serialized.
|
||||||
pub struct MultiValueIntFastFieldWriter {
|
pub struct MultiValueIntFastFieldWriter {
|
||||||
field: Field,
|
field: Field,
|
||||||
vals: Vec<UnorderedTermId>,
|
vals: Vec<u64>,
|
||||||
doc_index: Vec<u64>,
|
doc_index: Vec<u64>,
|
||||||
is_facet: bool,
|
is_facet: bool,
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -7,7 +7,7 @@ use directory::ReadOnlySource;
|
|||||||
use directory::{Directory, RAMDirectory, WritePtr};
|
use directory::{Directory, RAMDirectory, WritePtr};
|
||||||
use fastfield::{FastFieldSerializer, FastFieldsWriter};
|
use fastfield::{FastFieldSerializer, FastFieldsWriter};
|
||||||
use owning_ref::OwningRef;
|
use owning_ref::OwningRef;
|
||||||
use schema::Schema;
|
use schema::SchemaBuilder;
|
||||||
use schema::FAST;
|
use schema::FAST;
|
||||||
use std::collections::HashMap;
|
use std::collections::HashMap;
|
||||||
use std::marker::PhantomData;
|
use std::marker::PhantomData;
|
||||||
@@ -59,29 +59,7 @@ impl<Item: FastValue> FastFieldReader<Item> {
|
|||||||
/// May panic if `doc` is greater than the segment
|
/// May panic if `doc` is greater than the segment
|
||||||
// `maxdoc`.
|
// `maxdoc`.
|
||||||
pub fn get(&self, doc: DocId) -> Item {
|
pub fn get(&self, doc: DocId) -> Item {
|
||||||
self.get_u64(u64::from(doc))
|
Item::from_u64(self.min_value_u64 + self.bit_unpacker.get(doc as usize))
|
||||||
}
|
|
||||||
|
|
||||||
pub(crate) fn get_u64(&self, doc: u64) -> Item {
|
|
||||||
Item::from_u64(self.min_value_u64 + self.bit_unpacker.get(doc))
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Internally `multivalued` also use SingleValue Fast fields.
|
|
||||||
/// It works as follows... A first column contains the list of start index
|
|
||||||
/// for each document, a second column contains the actual values.
|
|
||||||
///
|
|
||||||
/// The values associated to a given doc, are then
|
|
||||||
/// `second_column[first_column.get(doc)..first_column.get(doc+1)]`.
|
|
||||||
///
|
|
||||||
/// Which means single value fast field reader can be indexed internally with
|
|
||||||
/// something different from a `DocId`. For this use case, we want to use `u64`
|
|
||||||
/// values.
|
|
||||||
///
|
|
||||||
/// See `get_range` for an actual documentation about this method.
|
|
||||||
pub(crate) fn get_range_u64(&self, start: u64, output: &mut [Item]) {
|
|
||||||
for (i, out) in output.iter_mut().enumerate() {
|
|
||||||
*out = self.get_u64(start + (i as u64));
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Fills an output buffer with the fast field values
|
/// Fills an output buffer with the fast field values
|
||||||
@@ -97,8 +75,16 @@ impl<Item: FastValue> FastFieldReader<Item> {
|
|||||||
///
|
///
|
||||||
/// May panic if `start + output.len()` is greater than
|
/// May panic if `start + output.len()` is greater than
|
||||||
/// the segment's `maxdoc`.
|
/// the segment's `maxdoc`.
|
||||||
pub fn get_range(&self, start: DocId, output: &mut [Item]) {
|
///
|
||||||
self.get_range_u64(u64::from(start), output);
|
// TODO change start to `u64`.
|
||||||
|
// For multifastfield, start is an index in a second fastfield, not a `DocId`
|
||||||
|
pub fn get_range(&self, start: u32, output: &mut [Item]) {
|
||||||
|
// ok: Item is either `u64` or `i64`
|
||||||
|
let output_u64: &mut [u64] = unsafe { &mut *(output as *mut [Item] as *mut [u64]) };
|
||||||
|
self.bit_unpacker.get_range(start, output_u64);
|
||||||
|
for out in output_u64.iter_mut() {
|
||||||
|
*out = Item::from_u64(*out + self.min_value_u64).as_u64();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Returns the minimum value for this fast field.
|
/// Returns the minimum value for this fast field.
|
||||||
@@ -122,7 +108,7 @@ impl<Item: FastValue> FastFieldReader<Item> {
|
|||||||
|
|
||||||
impl<Item: FastValue> From<Vec<Item>> for FastFieldReader<Item> {
|
impl<Item: FastValue> From<Vec<Item>> for FastFieldReader<Item> {
|
||||||
fn from(vals: Vec<Item>) -> FastFieldReader<Item> {
|
fn from(vals: Vec<Item>) -> FastFieldReader<Item> {
|
||||||
let mut schema_builder = Schema::builder();
|
let mut schema_builder = SchemaBuilder::default();
|
||||||
let field = schema_builder.add_u64_field("field", FAST);
|
let field = schema_builder.add_u64_field("field", FAST);
|
||||||
let schema = schema_builder.build();
|
let schema = schema_builder.build();
|
||||||
let path = Path::new("__dummy__");
|
let path = Path::new("__dummy__");
|
||||||
|
|||||||
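The doc comment removed in the hunk below describes how multivalued fast fields are laid out: a first single-valued column stores, for every document, the start offset into a second column that holds the flattened values, so the values of `doc` live in `second[first[doc]..first[doc + 1]]`. The addressing scheme itself can be illustrated with plain vectors; this is only a model of the layout, not tantivy code.

```rust
// Offsets column: one entry per document plus a final sentinel.
// Values column: all documents' values, flattened.
fn vals_for_doc(offsets: &[u64], values: &[u64], doc: usize) -> &[u64] {
    let start = offsets[doc] as usize;
    let stop = offsets[doc + 1] as usize;
    &values[start..stop]
}

fn main() {
    // doc 0 -> [5, 20, 1], doc 1 -> [], doc 2 -> [4]
    let offsets = vec![0u64, 3, 3, 4];
    let values = vec![5u64, 20, 1, 4];
    assert_eq!(vals_for_doc(&offsets, &values, 0).to_vec(), vec![5u64, 20, 1]);
    assert!(vals_for_doc(&offsets, &values, 1).is_empty());
    assert_eq!(vals_for_doc(&offsets, &values, 2).to_vec(), vec![4u64]);
}
```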
@@ -15,7 +15,7 @@
 //! precompute computationally expensive functions of the fieldnorm
 //! in a very short array.
 //!
-//! This trick is used by the BM25 similarity.
+//! This trick is used by the [BM25 similarity]().
 mod code;
 mod reader;
 mod serializer;
@@ -1,6 +1,7 @@
|
|||||||
use rand::thread_rng;
|
use rand::thread_rng;
|
||||||
use std::collections::HashSet;
|
use std::collections::HashSet;
|
||||||
|
|
||||||
|
use rand::distributions::Range;
|
||||||
use rand::Rng;
|
use rand::Rng;
|
||||||
use schema::*;
|
use schema::*;
|
||||||
use Index;
|
use Index;
|
||||||
@@ -13,16 +14,17 @@ fn check_index_content(searcher: &Searcher, vals: &HashSet<u64>) {
|
|||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
#[ignore]
|
#[ignore]
|
||||||
|
#[cfg(feature = "mmap")]
|
||||||
fn test_indexing() {
|
fn test_indexing() {
|
||||||
let mut schema_builder = Schema::builder();
|
let mut schema_builder = SchemaBuilder::default();
|
||||||
|
|
||||||
let id_field = schema_builder.add_u64_field("id", INDEXED);
|
let id_field = schema_builder.add_u64_field("id", INT_INDEXED);
|
||||||
let multiples_field = schema_builder.add_u64_field("multiples", INDEXED);
|
let multiples_field = schema_builder.add_u64_field("multiples", INT_INDEXED);
|
||||||
let schema = schema_builder.build();
|
let schema = schema_builder.build();
|
||||||
|
|
||||||
let index = Index::create_from_tempdir(schema).unwrap();
|
let index = Index::create_from_tempdir(schema).unwrap();
|
||||||
let reader = index.reader().unwrap();
|
|
||||||
|
|
||||||
|
let universe = Range::new(0u64, 20u64);
|
||||||
let mut rng = thread_rng();
|
let mut rng = thread_rng();
|
||||||
|
|
||||||
let mut index_writer = index.writer_with_num_threads(3, 120_000_000).unwrap();
|
let mut index_writer = index.writer_with_num_threads(3, 120_000_000).unwrap();
|
||||||
@@ -31,13 +33,13 @@ fn test_indexing() {
|
|||||||
let mut uncommitted_docs: HashSet<u64> = HashSet::new();
|
let mut uncommitted_docs: HashSet<u64> = HashSet::new();
|
||||||
|
|
||||||
for _ in 0..200 {
|
for _ in 0..200 {
|
||||||
let random_val = rng.gen_range(0, 20);
|
let random_val = rng.sample(&universe);
|
||||||
if random_val == 0 {
|
if random_val == 0 {
|
||||||
index_writer.commit().expect("Commit failed");
|
index_writer.commit().expect("Commit failed");
|
||||||
committed_docs.extend(&uncommitted_docs);
|
committed_docs.extend(&uncommitted_docs);
|
||||||
uncommitted_docs.clear();
|
uncommitted_docs.clear();
|
||||||
reader.reload().unwrap();
|
index.load_searchers().unwrap();
|
||||||
let searcher = reader.searcher();
|
let searcher = index.searcher();
|
||||||
// check that everything is correct.
|
// check that everything is correct.
|
||||||
check_index_content(&searcher, &committed_docs);
|
check_index_content(&searcher, &committed_docs);
|
||||||
} else {
|
} else {
|
||||||
|
|||||||
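The change above swaps between the `IndexReader`-based workflow on one side of the hunk and `Index::load_searchers` on the other: the reader is created once, explicitly reloaded after a commit when the test needs to see the new segments, and a fresh `Searcher` is taken from it. In isolation the pattern looks like the sketch below; the schema, field name, and memory budget are only for illustration.

```rust
use tantivy::schema::{Schema, INDEXED};
use tantivy::{doc, Index};

fn main() -> tantivy::Result<()> {
    let mut schema_builder = Schema::builder();
    let id_field = schema_builder.add_u64_field("id", INDEXED);
    let index = Index::create_in_ram(schema_builder.build());

    // One reader for the life of the index.
    let reader = index.reader()?;

    let mut index_writer = index.writer_with_num_threads(1, 3_000_000)?;
    index_writer.add_document(doc!(id_field => 1u64));
    index_writer.commit()?;

    // Until the reader reloads, searchers obtained from it still reflect
    // the pre-commit state of the index.
    reader.reload()?;
    let searcher = reader.searcher();
    assert_eq!(searcher.num_docs(), 1);
    Ok(())
}
```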
@@ -179,11 +179,6 @@ pub struct DeleteCursor {
|
|||||||
}
|
}
|
||||||
|
|
||||||
impl DeleteCursor {
|
impl DeleteCursor {
|
||||||
|
|
||||||
pub fn empty() -> DeleteCursor {
|
|
||||||
DeleteQueue::new().cursor()
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Skips operations and position it so that
|
/// Skips operations and position it so that
|
||||||
/// - either all of the delete operation currently in the
|
/// - either all of the delete operation currently in the
|
||||||
/// queue are consume and the next get will return None.
|
/// queue are consume and the next get will return None.
|
||||||
@@ -196,7 +191,10 @@ impl DeleteCursor {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#[cfg_attr(feature = "cargo-clippy", allow(clippy::wrong_self_convention))]
|
#[cfg_attr(
|
||||||
|
feature = "cargo-clippy",
|
||||||
|
allow(clippy::wrong_self_convention)
|
||||||
|
)]
|
||||||
fn is_behind_opstamp(&mut self, target_opstamp: u64) -> bool {
|
fn is_behind_opstamp(&mut self, target_opstamp: u64) -> bool {
|
||||||
self.get()
|
self.get()
|
||||||
.map(|operation| operation.opstamp < target_opstamp)
|
.map(|operation| operation.opstamp < target_opstamp)
|
||||||
|
|||||||
src/indexer/directory_lock.rs (new file, 131 lines)
@@ -0,0 +1,131 @@
use directory::error::OpenWriteError;
use std::io::Write;
use std::path::{Path, PathBuf};
use std::thread;
use std::time::Duration;
use Directory;
use TantivyError;

#[derive(Debug, Clone, Copy)]
pub enum LockType {
    /// Only one process should be able to write tantivy's index at a time.
    /// This lock file, when present, is in charge of preventing other processes from opening an IndexWriter.
    ///
    /// If the process is killed and this file remains, it is safe to remove it manually.
    ///
    /// Failing to acquire this lock usually means a misuse of tantivy's API
    /// (creating more than one instance of the `IndexWriter`), or a spurious
    /// lock file remaining after a crash. In the latter case, removing the file after
    /// checking that no tantivy process is still running is safe.
    IndexWriterLock,
    /// The meta lock file is here to protect the segment files being opened by
    /// `.load_searchers()` from being garbage collected.
    /// It makes it possible for another process to safely consume
    /// our index in-writing. Ideally, we may have preferred `RWLock` semantics
    /// here, but it is difficult to achieve on Windows.
    ///
    /// Opening segment readers is a very fast process.
    /// Right now, if the lock cannot be acquired on the first attempt, the logic
    /// is very simplistic. We retry after `100ms` until we effectively
    /// acquire the lock.
    /// This lock should not have much contention in normal usage.
    MetaLock,
}

/// The retry logic for acquiring locks is pretty simple.
/// We just retry `n` times after a given `duration`, both
/// depending on the type of lock.
struct RetryPolicy {
    num_retries: usize,
    wait_in_ms: u64,
}

impl RetryPolicy {
    fn no_retry() -> RetryPolicy {
        RetryPolicy {
            num_retries: 0,
            wait_in_ms: 0,
        }
    }

    fn wait_and_retry(&mut self) -> bool {
        if self.num_retries == 0 {
            false
        } else {
            self.num_retries -= 1;
            let wait_duration = Duration::from_millis(self.wait_in_ms);
            thread::sleep(wait_duration);
            true
        }
    }
}

impl LockType {
    fn retry_policy(self) -> RetryPolicy {
        match self {
            LockType::IndexWriterLock => RetryPolicy::no_retry(),
            LockType::MetaLock => RetryPolicy {
                num_retries: 100,
                wait_in_ms: 100,
            },
        }
    }

    fn try_acquire_lock(self, directory: &mut Directory) -> Result<DirectoryLock, TantivyError> {
        let path = self.filename();
        let mut write = directory.open_write(path).map_err(|e| match e {
            OpenWriteError::FileAlreadyExists(_) => TantivyError::LockFailure(self),
            OpenWriteError::IOError(io_error) => TantivyError::IOError(io_error),
        })?;
        write.flush()?;
        Ok(DirectoryLock {
            directory: directory.box_clone(),
            path: path.to_owned(),
        })
    }

    /// Acquire a lock in the given directory.
    pub fn acquire_lock(self, directory: &Directory) -> Result<DirectoryLock, TantivyError> {
        let mut box_directory = directory.box_clone();
        let mut retry_policy = self.retry_policy();
        loop {
            let lock_result = self.try_acquire_lock(&mut *box_directory);
            match lock_result {
                Ok(result) => {
                    return Ok(result);
                }
                Err(TantivyError::LockFailure(ref filepath)) => {
                    if !retry_policy.wait_and_retry() {
                        return Err(TantivyError::LockFailure(filepath.to_owned()));
                    }
                }
                Err(_) => {}
            }
        }
    }

    fn filename(&self) -> &Path {
        match *self {
            LockType::MetaLock => Path::new(".tantivy-meta.lock"),
            LockType::IndexWriterLock => Path::new(".tantivy-indexer.lock"),
        }
    }
}

/// The `DirectoryLock` is an object that represents a file lock.
/// See [`LockType`](struct.LockType.html)
///
/// It is transparently associated to a lock file, that gets deleted
/// on `Drop`. The lock is released automatically on `Drop`.
pub struct DirectoryLock {
    directory: Box<Directory>,
    path: PathBuf,
}

impl Drop for DirectoryLock {
    fn drop(&mut self) {
        if let Err(e) = self.directory.delete(&*self.path) {
            error!("Failed to remove the lock file. {:?}", e);
        }
    }
}
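Taken together, `LockType::acquire_lock` above gives RAII-style locking: the returned `DirectoryLock` keeps the lock file alive and deletes it on drop, the `MetaLock` variant polls every 100ms for up to 100 attempts, and the `IndexWriterLock` fails immediately. A usage sketch, assuming `LockType`, `DirectoryLock`, `TantivyError`, and a `&Directory` trait object are in scope:

```rust
fn with_meta_lock(directory: &Directory) -> Result<(), TantivyError> {
    // Blocks (by polling) until `.tantivy-meta.lock` can be created.
    let _meta_lock = LockType::MetaLock.acquire_lock(directory)?;

    // ... read or copy the segment files safely here ...

    Ok(())
    // `_meta_lock` is dropped here and the lock file is deleted.
}

fn try_open_writer_lock(directory: &Directory) -> bool {
    // Fails fast if another IndexWriter already holds `.tantivy-indexer.lock`.
    match LockType::IndexWriterLock.acquire_lock(directory) {
        Ok(_lock) => true, // note: dropping `_lock` releases the lock immediately
        Err(TantivyError::LockFailure(_)) => false,
        Err(_) => false,
    }
}
```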
@@ -1,4 +1,4 @@
|
|||||||
use super::operation::{AddOperation, UserOperation};
|
use super::operation::AddOperation;
|
||||||
use super::segment_updater::SegmentUpdater;
|
use super::segment_updater::SegmentUpdater;
|
||||||
use super::PreparedCommit;
|
use super::PreparedCommit;
|
||||||
use bit_set::BitSet;
|
use bit_set::BitSet;
|
||||||
@@ -8,16 +8,16 @@ use core::SegmentComponent;
|
|||||||
use core::SegmentId;
|
use core::SegmentId;
|
||||||
use core::SegmentMeta;
|
use core::SegmentMeta;
|
||||||
use core::SegmentReader;
|
use core::SegmentReader;
|
||||||
use crossbeam::channel;
|
use crossbeam_channel as channel;
|
||||||
use directory::DirectoryLock;
|
|
||||||
use docset::DocSet;
|
use docset::DocSet;
|
||||||
use error::TantivyError;
|
use error::TantivyError;
|
||||||
use fastfield::write_delete_bitset;
|
use fastfield::write_delete_bitset;
|
||||||
use futures::{Canceled, Future};
|
use futures::sync::oneshot::Receiver;
|
||||||
use indexer::delete_queue::{DeleteCursor, DeleteQueue};
|
use indexer::delete_queue::{DeleteCursor, DeleteQueue};
|
||||||
use indexer::doc_opstamp_mapping::DocToOpstampMapping;
|
use indexer::doc_opstamp_mapping::DocToOpstampMapping;
|
||||||
use indexer::operation::DeleteOperation;
|
use indexer::operation::DeleteOperation;
|
||||||
use indexer::stamper::Stamper;
|
use indexer::stamper::Stamper;
|
||||||
|
use indexer::DirectoryLock;
|
||||||
use indexer::MergePolicy;
|
use indexer::MergePolicy;
|
||||||
use indexer::SegmentEntry;
|
use indexer::SegmentEntry;
|
||||||
use indexer::SegmentWriter;
|
use indexer::SegmentWriter;
|
||||||
@@ -26,8 +26,7 @@ use schema::Document;
|
|||||||
use schema::IndexRecordOption;
|
use schema::IndexRecordOption;
|
||||||
use schema::Term;
|
use schema::Term;
|
||||||
use std::mem;
|
use std::mem;
|
||||||
use std::ops::Range;
|
use std::mem::swap;
|
||||||
use std::sync::Arc;
|
|
||||||
use std::thread;
|
use std::thread;
|
||||||
use std::thread::JoinHandle;
|
use std::thread::JoinHandle;
|
||||||
use Result;
|
use Result;
|
||||||
@@ -44,8 +43,8 @@ pub const HEAP_SIZE_MAX: usize = u32::max_value() as usize - MARGIN_IN_BYTES;
|
|||||||
// reaches `PIPELINE_MAX_SIZE_IN_DOCS`
|
// reaches `PIPELINE_MAX_SIZE_IN_DOCS`
|
||||||
const PIPELINE_MAX_SIZE_IN_DOCS: usize = 10_000;
|
const PIPELINE_MAX_SIZE_IN_DOCS: usize = 10_000;
|
||||||
|
|
||||||
type OperationSender = channel::Sender<Vec<AddOperation>>;
|
type DocumentSender = channel::Sender<AddOperation>;
|
||||||
type OperationReceiver = channel::Receiver<Vec<AddOperation>>;
|
type DocumentReceiver = channel::Receiver<AddOperation>;
|
||||||
|
|
||||||
/// Split the thread memory budget into
|
/// Split the thread memory budget into
|
||||||
/// - the heap size
|
/// - the heap size
|
||||||
@@ -53,19 +52,16 @@ type OperationReceiver = channel::Receiver<Vec<AddOperation>>;
|
|||||||
///
|
///
|
||||||
/// Returns (the heap size in bytes, the hash table size in number of bits)
|
/// Returns (the heap size in bytes, the hash table size in number of bits)
|
||||||
fn initial_table_size(per_thread_memory_budget: usize) -> usize {
|
fn initial_table_size(per_thread_memory_budget: usize) -> usize {
|
||||||
assert!(per_thread_memory_budget > 1_000);
|
|
||||||
let table_size_limit: usize = per_thread_memory_budget / 3;
|
let table_size_limit: usize = per_thread_memory_budget / 3;
|
||||||
if let Some(limit) = (1..)
|
(1..)
|
||||||
.take_while(|num_bits: &usize| compute_table_size(*num_bits) < table_size_limit)
|
.take_while(|num_bits: &usize| compute_table_size(*num_bits) < table_size_limit)
|
||||||
.last()
|
.last()
|
||||||
{
|
.unwrap_or_else(|| {
|
||||||
limit.min(19) // we cap it at 2^19 = 512K.
|
panic!(
|
||||||
} else {
|
"Per thread memory is too small: {}",
|
||||||
unreachable!(
|
per_thread_memory_budget
|
||||||
"Per thread memory is too small: {}",
|
)
|
||||||
per_thread_memory_budget
|
}).min(19) // we cap it at 512K
|
||||||
);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
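The left column's `initial_table_size` above picks the largest power-of-two hash table that still fits in a third of the per-thread memory budget, capped at 2^19 buckets. The real `compute_table_size` is defined elsewhere in the crate; the sketch below substitutes a made-up 12-bytes-per-bucket cost purely to make the arithmetic concrete.

```rust
/// Illustrative stand-in for tantivy's `compute_table_size`:
/// assume each of the 2^num_bits buckets costs a fixed number of bytes.
fn compute_table_size(num_bits: usize) -> usize {
    (1usize << num_bits) * 12 // 12 bytes per bucket is an assumption, not the real figure
}

fn initial_table_size(per_thread_memory_budget: usize) -> usize {
    assert!(per_thread_memory_budget > 1_000);
    let table_size_limit: usize = per_thread_memory_budget / 3;
    (1..)
        .take_while(|num_bits: &usize| compute_table_size(*num_bits) < table_size_limit)
        .last()
        .expect("per-thread memory budget too small")
        .min(19) // cap at 2^19 buckets
}

fn main() {
    // With a 30MB budget, a third (10MB) goes to the table:
    // 2^19 * 12 bytes (about 6.3MB) still fits, while 2^20 does not,
    // so 19 bits is chosen.
    assert_eq!(initial_table_size(30_000_000), 19);
}
```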
|
|
||||||
/// `IndexWriter` is the user entry-point to add document to an index.
|
/// `IndexWriter` is the user entry-point to add document to an index.
|
||||||
@@ -85,8 +81,8 @@ pub struct IndexWriter {
|
|||||||
|
|
||||||
workers_join_handle: Vec<JoinHandle<Result<()>>>,
|
workers_join_handle: Vec<JoinHandle<Result<()>>>,
|
||||||
|
|
||||||
operation_receiver: OperationReceiver,
|
document_receiver: DocumentReceiver,
|
||||||
operation_sender: OperationSender,
|
document_sender: DocumentSender,
|
||||||
|
|
||||||
segment_updater: SegmentUpdater,
|
segment_updater: SegmentUpdater,
|
||||||
|
|
||||||
@@ -133,7 +129,7 @@ pub fn open_index_writer(
|
|||||||
let err_msg = format!("The heap size per thread cannot exceed {}", HEAP_SIZE_MAX);
|
let err_msg = format!("The heap size per thread cannot exceed {}", HEAP_SIZE_MAX);
|
||||||
return Err(TantivyError::InvalidArgument(err_msg));
|
return Err(TantivyError::InvalidArgument(err_msg));
|
||||||
}
|
}
|
||||||
let (document_sender, document_receiver): (OperationSender, OperationReceiver) =
|
let (document_sender, document_receiver): (DocumentSender, DocumentReceiver) =
|
||||||
channel::bounded(PIPELINE_MAX_SIZE_IN_DOCS);
|
channel::bounded(PIPELINE_MAX_SIZE_IN_DOCS);
|
||||||
|
|
||||||
let delete_queue = DeleteQueue::new();
|
let delete_queue = DeleteQueue::new();
|
||||||
@@ -143,7 +139,7 @@ pub fn open_index_writer(
|
|||||||
let stamper = Stamper::new(current_opstamp);
|
let stamper = Stamper::new(current_opstamp);
|
||||||
|
|
||||||
let segment_updater =
|
let segment_updater =
|
||||||
SegmentUpdater::create(index.clone(), stamper.clone(), &delete_queue.cursor())?;
|
SegmentUpdater::new(index.clone(), stamper.clone(), &delete_queue.cursor())?;
|
||||||
|
|
||||||
let mut index_writer = IndexWriter {
|
let mut index_writer = IndexWriter {
|
||||||
_directory_lock: Some(directory_lock),
|
_directory_lock: Some(directory_lock),
|
||||||
@@ -151,8 +147,8 @@ pub fn open_index_writer(
|
|||||||
heap_size_in_bytes_per_thread,
|
heap_size_in_bytes_per_thread,
|
||||||
index: index.clone(),
|
index: index.clone(),
|
||||||
|
|
||||||
operation_receiver: document_receiver,
|
document_receiver,
|
||||||
operation_sender: document_sender,
|
document_sender,
|
||||||
|
|
||||||
segment_updater,
|
segment_updater,
|
||||||
|
|
||||||
@@ -259,7 +255,7 @@ pub fn advance_deletes(
             write_delete_bitset(&delete_bitset, &mut delete_file)?;
         }
     }
-    segment_entry.set_meta(target_opstamp, segment.meta().clone());
+    segment_entry.set_meta((*segment.meta()).clone());
     Ok(())
 }

@@ -267,7 +263,7 @@ fn index_documents(
     memory_budget: usize,
     segment: &Segment,
     generation: usize,
-    document_iterator: &mut Iterator<Item = Vec<AddOperation>>,
+    document_iterator: &mut Iterator<Item = AddOperation>,
     segment_updater: &mut SegmentUpdater,
     mut delete_cursor: DeleteCursor,
 ) -> Result<bool> {
@@ -275,11 +271,11 @@ fn index_documents(
     let segment_id = segment.id();
     let table_size = initial_table_size(memory_budget);
     let mut segment_writer = SegmentWriter::for_segment(table_size, segment.clone(), &schema)?;
-    for documents in document_iterator {
-        for doc in documents {
-            segment_writer.add_document(doc, &schema)?;
-        }
+    for doc in document_iterator {
+        segment_writer.add_document(doc, &schema)?;
         let mem_usage = segment_writer.mem_usage();

         if mem_usage >= memory_budget - MARGIN_IN_BYTES {
             info!(
                 "Buffer limit reached, flushing segment with maxdoc={}.",
@@ -305,7 +301,7 @@ fn index_documents(

     let last_docstamp: u64 = *(doc_opstamps.last().unwrap());

-    let delete_bitset_opt = if delete_cursor.get().is_some() {
+    let segment_entry: SegmentEntry = if delete_cursor.get().is_some() {
         let doc_to_opstamps = DocToOpstampMapping::from(doc_opstamps);
         let segment_reader = SegmentReader::open(segment)?;
         let mut deleted_bitset = BitSet::with_capacity(num_docs as usize);
@@ -316,22 +312,18 @@ fn index_documents(
             &doc_to_opstamps,
             last_docstamp,
         )?;
-        if may_have_deletes {
-            Some(deleted_bitset)
-        } else {
-            None
-        }
+        SegmentEntry::new(segment_meta, delete_cursor, {
+            if may_have_deletes {
+                Some(deleted_bitset)
+            } else {
+                None
+            }
+        })
     } else {
         // if there are no delete operation in the queue, no need
         // to even open the segment.
-        None
+        SegmentEntry::new(segment_meta, delete_cursor, None)
     };
-    let segment_entry = SegmentEntry::new(
-        segment_meta,
-        delete_cursor,
-        delete_bitset_opt,
-        last_docstamp,
-    );
     Ok(segment_updater.add_segment(generation, segment_entry))
 }

@@ -340,7 +332,7 @@ impl IndexWriter {
     pub fn wait_merging_threads(mut self) -> Result<()> {
         // this will stop the indexing thread,
        // dropping the last reference to the segment_updater.
-        drop(self.operation_sender);
+        drop(self.document_sender);

         let former_workers_handles = mem::replace(&mut self.workers_join_handle, vec![]);
         for join_handle in former_workers_handles {
@@ -366,30 +358,27 @@ impl IndexWriter {
     }

     #[doc(hidden)]
-    pub fn add_segment(&mut self, segment_meta: SegmentMeta, opstamp: u64) {
+    pub fn add_segment(&mut self, segment_meta: SegmentMeta) {
         let delete_cursor = self.delete_queue.cursor();
-        let segment_entry = SegmentEntry::new(segment_meta, delete_cursor, None, opstamp);
+        let segment_entry = SegmentEntry::new(segment_meta, delete_cursor, None);
         self.segment_updater
             .add_segment(self.generation, segment_entry);
     }

-    /// Creates a new segment.
+    /// *Experimental & Advanced API* Creates a new segment.
+    /// and marks it as currently in write.
     ///
     /// This method is useful only for users trying to do complex
     /// operations, like converting an index format to another.
-    ///
-    /// It is safe to start writing file associated to the new `Segment`.
-    /// These will not be garbage collected as long as an instance object of
-    /// `SegmentMeta` object associated to the new `Segment` is "alive".
     pub fn new_segment(&self) -> Segment {
-        self.index.new_segment()
+        self.segment_updater.new_segment()
     }

     /// Spawns a new worker thread for indexing.
     /// The thread consumes documents from the pipeline.
     ///
     fn add_indexing_worker(&mut self) -> Result<()> {
-        let document_receiver_clone = self.operation_receiver.clone();
+        let document_receiver_clone = self.document_receiver.clone();
         let mut segment_updater = self.segment_updater.clone();

         let generation = self.generation;
@@ -397,16 +386,13 @@ impl IndexWriter {
         let mut delete_cursor = self.delete_queue.cursor();

         let mem_budget = self.heap_size_in_bytes_per_thread;
-        let index = self.index.clone();
         let join_handle: JoinHandle<Result<()>> = thread::Builder::new()
             .name(format!(
-                "thrd-tantivy-index{}-gen{}",
+                "indexing thread {} for gen {}",
                 self.worker_id, generation
-            ))
-            .spawn(move || {
+            )).spawn(move || {
                 loop {
-                    let mut document_iterator =
-                        document_receiver_clone.clone().into_iter().peekable();
+                    let mut document_iterator = document_receiver_clone.clone().peekable();

                     // the peeking here is to avoid
                     // creating a new segment's files
@@ -415,19 +401,15 @@ impl IndexWriter {
                    // this is a valid guarantee as the
                    // peeked document now belongs to
                    // our local iterator.
-                    if let Some(operations) = document_iterator.peek() {
-                        if let Some(first) = operations.first() {
-                            delete_cursor.skip_to(first.opstamp);
-                        } else {
-                            return Ok(());
-                        }
+                    if let Some(operation) = document_iterator.peek() {
+                        delete_cursor.skip_to(operation.opstamp);
                     } else {
                         // No more documents.
                         // Happens when there is a commit, or if the `IndexWriter`
                         // was dropped.
                         return Ok(());
                     }
-                    let segment = index.new_segment();
+                    let segment = segment_updater.new_segment();
                     index_documents(
                         mem_budget,
                         &segment,
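The worker loop above peeks at the operation channel before allocating a new segment, so a commit with nothing queued never leaves empty segment files behind. A minimal sketch of that peek-before-setup pattern; it uses std::sync::mpsc purely for illustration, whereas the code in this diff uses a different bounded-channel crate:

```rust
// Sketch only: peek at a channel before doing expensive per-batch setup.
use std::sync::mpsc::sync_channel;
use std::thread;

fn main() {
    let (sender, receiver) = sync_channel::<u64>(4);
    let worker = thread::spawn(move || {
        let mut iter = receiver.into_iter().peekable();
        // Only pay the "new segment" cost once at least one operation is queued.
        if iter.peek().is_some() {
            println!("creating a segment before consuming operations");
            for opstamp in iter {
                println!("indexing operation {}", opstamp);
            }
        }
    });
    for opstamp in 0..3 {
        sender.send(opstamp).unwrap();
    }
    drop(sender); // closing the channel ends the worker loop, as on commit/drop
    worker.join().unwrap();
}
```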
@@ -444,7 +426,7 @@ impl IndexWriter {
     }

     /// Accessor to the merge policy.
-    pub fn get_merge_policy(&self) -> Arc<Box<MergePolicy>> {
+    pub fn get_merge_policy(&self) -> Box<MergePolicy> {
         self.segment_updater.get_merge_policy()
     }

@@ -469,10 +451,7 @@ impl IndexWriter {
     /// Merges a given list of segments
     ///
     /// `segment_ids` is required to be non-empty.
-    pub fn merge(
-        &mut self,
-        segment_ids: &[SegmentId],
-    ) -> Result<impl Future<Item = SegmentMeta, Error = Canceled>> {
+    pub fn merge(&mut self, segment_ids: &[SegmentId]) -> Result<Receiver<SegmentMeta>> {
         self.segment_updater.start_merge(segment_ids)
     }

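One side of this hunk hands back a futures 0.1 future, the other a channel receiver. A hedged sketch of how the future-returning variant could be driven to completion; the index setup, the `searchable_segment_ids` call, and the re-export paths are assumptions made for illustration:

```rust
// Sketch only: assumes the future-returning merge() signature shown above
// and the futures 0.1 `Future::wait` adapter.
use futures::Future;
use tantivy::{Index, Result, SegmentMeta};

fn force_merge(index: &Index) -> Result<SegmentMeta> {
    let mut writer = index.writer(50_000_000)?;
    let segment_ids = index.searchable_segment_ids()?;
    // `merge` starts the merge on the segment updater thread and returns a future.
    let merge_future = writer.merge(&segment_ids)?;
    let merged_meta = merge_future.wait().expect("merge cancelled");
    Ok(merged_meta)
}
```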
@@ -484,11 +463,14 @@ impl IndexWriter {
     /// when no documents are remaining.
     ///
     /// Returns the former segment_ready channel.
-    fn recreate_document_channel(&mut self) -> OperationReceiver {
-        let (document_sender, document_receiver): (OperationSender, OperationReceiver) =
-            channel::bounded(PIPELINE_MAX_SIZE_IN_DOCS);
-        mem::replace(&mut self.operation_sender, document_sender);
-        mem::replace(&mut self.operation_receiver, document_receiver)
+    fn recreate_document_channel(&mut self) -> DocumentReceiver {
+        let (mut document_sender, mut document_receiver): (
+            DocumentSender,
+            DocumentReceiver,
+        ) = channel::bounded(PIPELINE_MAX_SIZE_IN_DOCS);
+        swap(&mut self.document_sender, &mut document_sender);
+        swap(&mut self.document_receiver, &mut document_receiver);
+        document_receiver
     }

     /// Rollback to the last commit
@@ -506,7 +488,7 @@ impl IndexWriter {
         // segment updates will be ignored.
         self.segment_updater.kill();

-        let document_receiver = self.operation_receiver.clone();
+        let document_receiver = self.document_receiver.clone();

         // take the directory lock to create a new index_writer.
         let directory_lock = self
@@ -532,7 +514,7 @@ impl IndexWriter {
         //
         // This will reach an end as the only document_sender
         // was dropped with the index_writer.
-        for _ in document_receiver.iter() {}
+        for _ in document_receiver.clone() {}

         Ok(())
     }
@@ -559,16 +541,6 @@ impl IndexWriter {
     /// using this API.
     /// See [`PreparedCommit::set_payload()`](PreparedCommit.html)
     pub fn prepare_commit(&mut self) -> Result<PreparedCommit> {
-        info!("Preparing commit");
-        self.prepare_commit_internal(false)
-    }
-
-    pub fn prepare_commit_soft(&mut self) -> Result<PreparedCommit> {
-        info!("Preparing soft commit");
-        self.prepare_commit_internal(true)
-    }
-
-    pub(crate) fn prepare_commit_internal(&mut self, soft: bool) -> Result<PreparedCommit> {
         // Here, because we join all of the worker threads,
         // all of the segment update for this commit have been
         // sent.
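`prepare_commit` splits a commit into a prepare step and a final commit step, which lets callers attach a payload before the commit becomes visible. A minimal hedged sketch of that flow, assuming the `prepare_commit()` / `set_payload()` / `commit()` sequence shown in this diff:

```rust
// Sketch only: a two-phase commit with a payload, not a definitive API reference.
use tantivy::schema::{Schema, TEXT};
use tantivy::{Document, Index};

fn main() -> tantivy::Result<()> {
    let mut schema_builder = Schema::builder();
    let text = schema_builder.add_text_field("text", TEXT);
    let index = Index::create_in_ram(schema_builder.build());
    let mut writer = index.writer(3_000_000)?;

    let mut document = Document::default();
    document.add_text(text, "hello");
    writer.add_document(document);

    let mut prepared = writer.prepare_commit()?;
    prepared.set_payload("first commit"); // opaque marker persisted with the commit
    prepared.commit()?;
    Ok(())
}
```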
@@ -585,19 +557,24 @@ impl IndexWriter {
         // and recreate a new one channels.
         self.recreate_document_channel();

-        let former_workers_join_handle = mem::replace(&mut self.workers_join_handle, Vec::new());
+        let mut former_workers_join_handle = Vec::new();
+        swap(
+            &mut former_workers_join_handle,
+            &mut self.workers_join_handle,
+        );

         for worker_handle in former_workers_join_handle {
             let indexing_worker_result = worker_handle
                 .join()
                 .map_err(|e| TantivyError::ErrorInThread(format!("{:?}", e)))?;
-            // add a new worker for the next generation, whether the worker failed or not.
-            self.add_indexing_worker()?;
             indexing_worker_result?;
+            // add a new worker for the next generation.
+            self.add_indexing_worker()?;
         }

         let commit_opstamp = self.stamper.stamp();
-        let prepared_commit = PreparedCommit::new(self, commit_opstamp, soft);
+        let prepared_commit = PreparedCommit::new(self, commit_opstamp);
         info!("Prepared commit {}", commit_opstamp);
         Ok(prepared_commit)
     }
@@ -620,11 +597,6 @@ impl IndexWriter {
         self.prepare_commit()?.commit()
     }

-    pub fn soft_commit(&mut self) -> Result<u64> {
-        self.prepare_commit_soft()?.commit()
-    }
-
     pub(crate) fn segment_updater(&self) -> &SegmentUpdater {
         &self.segment_updater
     }
@@ -668,189 +640,42 @@ impl IndexWriter {
     pub fn add_document(&mut self, document: Document) -> u64 {
         let opstamp = self.stamper.stamp();
         let add_operation = AddOperation { opstamp, document };
-        let send_result = self.operation_sender.send(vec![add_operation]);
-        if let Err(e) = send_result {
-            panic!("Failed to index document. Sending to indexing channel failed. This probably means all of the indexing threads have panicked. {:?}", e);
-        }
+        self.document_sender.send(add_operation);
         opstamp
     }

-    /// Gets a range of stamps from the stamper and "pops" the last stamp
-    /// from the range returning a tuple of the last optstamp and the popped
-    /// range.
-    ///
-    /// The total number of stamps generated by this method is `count + 1`;
-    /// each operation gets a stamp from the `stamps` iterator and `last_opstamp`
-    /// is for the batch itself.
-    fn get_batch_opstamps(&mut self, count: u64) -> (u64, Range<u64>) {
-        let Range { start, end } = self.stamper.stamps(count + 1u64);
-        let last_opstamp = end - 1;
-        let stamps = Range {
-            start,
-            end: last_opstamp,
-        };
-        (last_opstamp, stamps)
-    }
-
-    /// Runs a group of document operations ensuring that the operations are
-    /// assigned contigous u64 opstamps and that add operations of the same
-    /// group are flushed into the same segment.
-    ///
-    /// If the indexing pipeline is full, this call may block.
-    ///
-    /// Each operation of the given `user_operations` will receive an in-order,
-    /// contiguous u64 opstamp. The entire batch itself is also given an
-    /// opstamp that is 1 greater than the last given operation. This
-    /// `batch_opstamp` is the return value of `run`. An empty group of
-    /// `user_operations`, an empty `Vec<UserOperation>`, still receives
-    /// a valid opstamp even though no changes were _actually_ made to the index.
-    ///
-    /// Like adds and deletes (see `IndexWriter.add_document` and
-    /// `IndexWriter.delete_term`), the changes made by calling `run` will be
-    /// visible to readers only after calling `commit()`.
-    pub fn run(&mut self, user_operations: Vec<UserOperation>) -> u64 {
-        let count = user_operations.len() as u64;
-        if count == 0 {
-            return self.stamper.stamp();
-        }
-        let (batch_opstamp, stamps) = self.get_batch_opstamps(count);
-
-        let mut adds: Vec<AddOperation> = Vec::new();
-
-        for (user_op, opstamp) in user_operations.into_iter().zip(stamps) {
-            match user_op {
-                UserOperation::Delete(term) => {
-                    let delete_operation = DeleteOperation { opstamp, term };
-                    self.delete_queue.push(delete_operation);
-                }
-                UserOperation::Add(document) => {
-                    let add_operation = AddOperation { opstamp, document };
-                    adds.push(add_operation);
-                }
-            }
-        }
-        let send_result = self.operation_sender.send(adds);
-        if let Err(e) = send_result {
-            panic!("Failed to index document. Sending to indexing channel failed. This probably means all of the indexing threads have panicked. {:?}", e);
-        };
-
-        batch_opstamp
-    }
 }

 #[cfg(test)]
 mod tests {

-    use super::super::operation::UserOperation;
     use super::initial_table_size;
-    use collector::TopDocs;
-    use directory::error::LockError;
     use error::*;
     use indexer::NoMergePolicy;
-    use query::TermQuery;
-    use schema::{self, IndexRecordOption};
+    use schema::{self, Document};
     use Index;
-    use ReloadPolicy;
     use Term;
-    use IndexReader;

-    #[test]
-    fn test_operations_group() {
-        // an operations group with 2 items should cause 3 opstamps 0, 1, and 2.
-        let mut schema_builder = schema::Schema::builder();
-        let text_field = schema_builder.add_text_field("text", schema::TEXT);
-        let index = Index::create_in_ram(schema_builder.build());
-        let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
-        let operations = vec![
-            UserOperation::Add(doc!(text_field=>"a")),
-            UserOperation::Add(doc!(text_field=>"b")),
-        ];
-        let batch_opstamp1 = index_writer.run(operations);
-        assert_eq!(batch_opstamp1, 2u64);
-    }
-
-    #[test]
-    fn test_ordered_batched_operations() {
-        // * one delete for `doc!(field=>"a")`
-        // * one add for `doc!(field=>"a")`
-        // * one add for `doc!(field=>"b")`
-        // * one delete for `doc!(field=>"b")`
-        // after commit there is one doc with "a" and 0 doc with "b"
-        let mut schema_builder = schema::Schema::builder();
-        let text_field = schema_builder.add_text_field("text", schema::TEXT);
-        let index = Index::create_in_ram(schema_builder.build());
-        let reader = index
-            .reader_builder()
-            .reload_policy(ReloadPolicy::Manual)
-            .try_into()
-            .unwrap();
-        let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
-        let a_term = Term::from_field_text(text_field, "a");
-        let b_term = Term::from_field_text(text_field, "b");
-        let operations = vec![
-            UserOperation::Delete(a_term),
-            UserOperation::Add(doc!(text_field=>"a")),
-            UserOperation::Add(doc!(text_field=>"b")),
-            UserOperation::Delete(b_term),
-        ];
-
-        index_writer.run(operations);
-        index_writer.commit().expect("failed to commit");
-        reader.reload().expect("failed to load searchers");
-
-        let a_term = Term::from_field_text(text_field, "a");
-        let b_term = Term::from_field_text(text_field, "b");
-
-        let a_query = TermQuery::new(a_term, IndexRecordOption::Basic);
-        let b_query = TermQuery::new(b_term, IndexRecordOption::Basic);
-
-        let searcher = reader.searcher();
-
-        let a_docs = searcher
-            .search(&a_query, &TopDocs::with_limit(1))
-            .expect("search for a failed");
-
-        let b_docs = searcher
-            .search(&b_query, &TopDocs::with_limit(1))
-            .expect("search for b failed");
-
-        assert_eq!(a_docs.len(), 1);
-        assert_eq!(b_docs.len(), 0);
-    }
-
-    #[test]
-    fn test_empty_operations_group() {
-        let schema_builder = schema::Schema::builder();
-        let index = Index::create_in_ram(schema_builder.build());
-        let mut index_writer = index.writer(3_000_000).unwrap();
-        let operations1 = vec![];
-        let batch_opstamp1 = index_writer.run(operations1);
-        assert_eq!(batch_opstamp1, 0u64);
-        let operations2 = vec![];
-        let batch_opstamp2 = index_writer.run(operations2);
-        assert_eq!(batch_opstamp2, 1u64);
-    }
-
     #[test]
     fn test_lockfile_stops_duplicates() {
-        let schema_builder = schema::Schema::builder();
+        let schema_builder = schema::SchemaBuilder::default();
         let index = Index::create_in_ram(schema_builder.build());
-        let _index_writer = index.writer(3_000_000).unwrap();
-        match index.writer(3_000_000) {
-            Err(TantivyError::LockFailure(LockError::LockBusy, _)) => {}
-            _ => panic!("Expected a `LockFailure` error"),
+        let _index_writer = index.writer(40_000_000).unwrap();
+        match index.writer(40_000_000) {
+            Err(TantivyError::LockFailure(_)) => {}
+            _ => panic!("Expected FileAlreadyExists error"),
         }
     }

     #[test]
     fn test_lockfile_already_exists_error_msg() {
-        let schema_builder = schema::Schema::builder();
+        let schema_builder = schema::SchemaBuilder::default();
         let index = Index::create_in_ram(schema_builder.build());
         let _index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
         match index.writer_with_num_threads(1, 3_000_000) {
             Err(err) => {
                 let err_msg = err.to_string();
-                assert!(err_msg.contains("already an `IndexWriter`"));
+                assert!(err_msg.contains("Lockfile"));
+                assert!(err_msg.contains("Possible causes:"))
             }
             _ => panic!("Expected LockfileAlreadyExists error"),
         }
@@ -858,9 +683,9 @@ mod tests {

     #[test]
     fn test_set_merge_policy() {
-        let schema_builder = schema::Schema::builder();
+        let schema_builder = schema::SchemaBuilder::default();
         let index = Index::create_in_ram(schema_builder.build());
-        let index_writer = index.writer(3_000_000).unwrap();
+        let index_writer = index.writer(40_000_000).unwrap();
         assert_eq!(
             format!("{:?}", index_writer.get_merge_policy()),
             "LogMergePolicy { min_merge_size: 8, min_layer_size: 10000, \
@@ -876,133 +701,90 @@ mod tests {

     #[test]
     fn test_lockfile_released_on_drop() {
-        let schema_builder = schema::Schema::builder();
+        let schema_builder = schema::SchemaBuilder::default();
         let index = Index::create_in_ram(schema_builder.build());
         {
-            let _index_writer = index.writer(3_000_000).unwrap();
+            let _index_writer = index.writer(40_000_000).unwrap();
             // the lock should be released when the
             // index_writer leaves the scope.
         }
-        let _index_writer_two = index.writer(3_000_000).unwrap();
-    }
-
-    fn num_docs_containing_text(reader: &IndexReader, term: &str) -> u64 {
-        let searcher = reader.searcher();
-        let text_field = reader.schema().get_field("text").unwrap();
-        let term = Term::from_field_text(text_field, term);
-        searcher.doc_freq(&term)
+        let _index_writer_two = index.writer(40_000_000).unwrap();
     }

     #[test]
     fn test_commit_and_rollback() {
-        let mut schema_builder = schema::Schema::builder();
+        let mut schema_builder = schema::SchemaBuilder::default();
         let text_field = schema_builder.add_text_field("text", schema::TEXT);
         let index = Index::create_in_ram(schema_builder.build());
-        let reader = index
-            .reader_builder()
-            .reload_policy(ReloadPolicy::Manual)
-            .try_into()
-            .unwrap();
         let num_docs_containing = |s: &str| {
-            let searcher = reader.searcher();
+            let searcher = index.searcher();
             let term = Term::from_field_text(text_field, s);
             searcher.doc_freq(&term)
         };

-        let mut index_writer = index.writer(3_000_000).unwrap();
-
-        assert_eq!(index_writer.commit_opstamp(), 0u64);
-        assert_eq!(num_docs_containing_text(&reader, "a"), 0);
         {
             // writing the segment
+            let mut index_writer = index.writer(3_000_000).unwrap();
             index_writer.add_document(doc!(text_field=>"a"));
             index_writer.rollback().unwrap();

             assert_eq!(index_writer.commit_opstamp(), 0u64);
             assert_eq!(num_docs_containing("a"), 0);
             {
                 index_writer.add_document(doc!(text_field=>"b"));
                 index_writer.add_document(doc!(text_field=>"c"));
             }
-            assert!(index_writer.commit().is_ok());
-            reader.reload().unwrap();
+            assert_eq!(index_writer.commit().unwrap(), 2u64);
+            index.load_searchers().unwrap();
             assert_eq!(num_docs_containing("a"), 0);
             assert_eq!(num_docs_containing("b"), 1);
             assert_eq!(num_docs_containing("c"), 1);
         }
-        reader.reload().unwrap();
-        reader.searcher();
-    }
-
-    #[test]
-    fn test_softcommit_and_rollback() {
-        let mut schema_builder = schema::Schema::builder();
-        let text_field = schema_builder.add_text_field("text", schema::TEXT);
-        let index = Index::create_in_ram(schema_builder.build());
-        let reader = index.reader().unwrap();
-        // writing the segment
-        let mut index_writer = index.writer(3_000_000).unwrap();
-        index_writer.add_document(doc!(text_field=>"a"));
-        index_writer.rollback().unwrap();
-
-        assert_eq!(index_writer.commit_opstamp(), 0u64);
-        assert_eq!(num_docs_containing_text(&reader, "a"), 0u64);
-        {
-            index_writer.add_document(doc!(text_field=>"b"));
-            index_writer.add_document(doc!(text_field=>"c"));
-        }
-        assert!(index_writer.soft_commit().is_ok());
-        reader.reload().unwrap(); // we need to load soft committed stuff.
-        assert_eq!(num_docs_containing_text(&reader, "a"), 0u64);
-        assert_eq!(num_docs_containing_text(&reader, "b"), 1u64);
-        assert_eq!(num_docs_containing_text(&reader, "c"), 1u64);
-        index_writer.rollback().unwrap();
-        reader.reload().unwrap();
-        assert_eq!(num_docs_containing_text(&reader, "a"), 0u64);
-        assert_eq!(num_docs_containing_text(&reader, "b"), 0u64);
-        assert_eq!(num_docs_containing_text(&reader, "c"), 0u64);
+        index.load_searchers().unwrap();
+        index.searcher();
     }

     #[test]
     fn test_with_merges() {
-        let mut schema_builder = schema::Schema::builder();
+        let mut schema_builder = schema::SchemaBuilder::default();
         let text_field = schema_builder.add_text_field("text", schema::TEXT);
         let index = Index::create_in_ram(schema_builder.build());
-        let reader = index
-            .reader_builder()
-            .reload_policy(ReloadPolicy::Manual)
-            .try_into()
-            .unwrap();
         let num_docs_containing = |s: &str| {
+            let searcher = index.searcher();
             let term_a = Term::from_field_text(text_field, s);
-            reader.searcher().doc_freq(&term_a)
+            searcher.doc_freq(&term_a)
         };
         {
             // writing the segment
             let mut index_writer = index.writer(12_000_000).unwrap();
             // create 8 segments with 100 tiny docs
             for _doc in 0..100 {
-                index_writer.add_document(doc!(text_field=>"a"));
+                let mut doc = Document::default();
+                doc.add_text(text_field, "a");
+                index_writer.add_document(doc);
             }
             index_writer.commit().expect("commit failed");
             for _doc in 0..100 {
-                index_writer.add_document(doc!(text_field=>"a"));
+                let mut doc = Document::default();
+                doc.add_text(text_field, "a");
+                index_writer.add_document(doc);
             }
             // this should create 8 segments and trigger a merge.
             index_writer.commit().expect("commit failed");
             index_writer
                 .wait_merging_threads()
                 .expect("waiting merging thread failed");
+            index.load_searchers().unwrap();

-            reader.reload().unwrap();
-
-            assert_eq!(num_docs_containing_text(&reader, "a"), 200);
+            assert_eq!(num_docs_containing("a"), 200);
             assert!(index.searchable_segments().unwrap().len() < 8);
         }
     }

     #[test]
     fn test_prepare_with_commit_message() {
-        let mut schema_builder = schema::Schema::builder();
+        let mut schema_builder = schema::SchemaBuilder::default();
         let text_field = schema_builder.add_text_field("text", schema::TEXT);
         let index = Index::create_in_ram(schema_builder.build());

@@ -1016,6 +798,7 @@ mod tests {
         {
             let mut prepared_commit = index_writer.prepare_commit().expect("commit failed");
             prepared_commit.set_payload("first commit");
+            assert_eq!(prepared_commit.opstamp(), 100);
             prepared_commit.commit().expect("commit failed");
         }
         {
@@ -1035,10 +818,10 @@ mod tests {

     #[test]
     fn test_prepare_but_rollback() {
-        let mut schema_builder = schema::Schema::builder();
+        let mut schema_builder = schema::SchemaBuilder::default();
         let text_field = schema_builder.add_text_field("text", schema::TEXT);
         let index = Index::create_in_ram(schema_builder.build());
-        let reader = index.reader();
         {
             // writing the segment
             let mut index_writer = index.writer_with_num_threads(4, 12_000_000).unwrap();
@@ -1049,6 +832,7 @@ mod tests {
         {
             let mut prepared_commit = index_writer.prepare_commit().expect("commit failed");
             prepared_commit.set_payload("first commit");
+            assert_eq!(prepared_commit.opstamp(), 100);
             prepared_commit.abort().expect("commit failed");
         }
         {
@@ -1060,15 +844,11 @@ mod tests {
             }
             index_writer.commit().unwrap();
         }
+        index.load_searchers().unwrap();
         let num_docs_containing = |s: &str| {
+            let searcher = index.searcher();
             let term_a = Term::from_field_text(text_field, s);
-            index
-                .reader_builder()
-                .reload_policy(ReloadPolicy::Manual)
-                .try_into()
-                .unwrap()
-                .searcher()
-                .doc_freq(&term_a)
+            searcher.doc_freq(&term_a)
         };
         assert_eq!(num_docs_containing("a"), 0);
         assert_eq!(num_docs_containing("b"), 100);
@@ -1076,9 +856,9 @@ mod tests {

     #[test]
     fn test_hashmap_size() {
-        assert_eq!(initial_table_size(100_000), 11);
-        assert_eq!(initial_table_size(1_000_000), 14);
-        assert_eq!(initial_table_size(10_000_000), 17);
+        assert_eq!(initial_table_size(100_000), 12);
+        assert_eq!(initial_table_size(1_000_000), 15);
+        assert_eq!(initial_table_size(10_000_000), 18);
         assert_eq!(initial_table_size(1_000_000_000), 19);
     }

@@ -1086,7 +866,7 @@ mod tests {
     #[test]
     fn test_write_commit_fails() {
         use fail;
-        let mut schema_builder = schema::Schema::builder();
+        let mut schema_builder = schema::SchemaBuilder::default();
         let text_field = schema_builder.add_text_field("text", schema::TEXT);
         let index = Index::create_in_ram(schema_builder.build());

@@ -1100,9 +880,11 @@ mod tests {
             index_writer.add_document(doc!(text_field => "b"));
         }
         assert!(index_writer.commit().is_err());
+        index.load_searchers().unwrap();
         let num_docs_containing = |s: &str| {
+            let searcher = index.searcher();
             let term_a = Term::from_field_text(text_field, s);
-            index.reader().unwrap().searcher().doc_freq(&term_a)
+            searcher.doc_freq(&term_a)
         };
         assert_eq!(num_docs_containing("a"), 100);
         assert_eq!(num_docs_containing("b"), 0);
|||||||
@@ -1,64 +0,0 @@
|
|||||||
use census::{Inventory, TrackedObject};
|
|
||||||
use std::collections::HashSet;
|
|
||||||
use SegmentId;
|
|
||||||
|
|
||||||
#[derive(Default)]
|
|
||||||
pub struct MergeOperationInventory(Inventory<InnerMergeOperation>);
|
|
||||||
|
|
||||||
impl MergeOperationInventory {
|
|
||||||
pub fn segment_in_merge(&self) -> HashSet<SegmentId> {
|
|
||||||
let mut segment_in_merge = HashSet::default();
|
|
||||||
for merge_op in self.0.list() {
|
|
||||||
for &segment_id in &merge_op.segment_ids {
|
|
||||||
segment_in_merge.insert(segment_id);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
segment_in_merge
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/// A `MergeOperation` has two role.
|
|
||||||
/// It carries all of the information required to describe a merge :
|
|
||||||
/// - `target_opstamp` is the opstamp up to which we want to consume the
|
|
||||||
/// delete queue and reflect their deletes.
|
|
||||||
/// - `segment_ids` is the list of segment to be merged.
|
|
||||||
///
|
|
||||||
/// The second role is to ensure keep track of the fact that these
|
|
||||||
/// segments are in merge and avoid starting a merge operation that
|
|
||||||
/// may conflict with this one.
|
|
||||||
///
|
|
||||||
/// This works by tracking merge operations. When considering computing
|
|
||||||
/// merge candidates, we simply list tracked merge operations and remove
|
|
||||||
/// their segments from possible merge candidates.
|
|
||||||
pub struct MergeOperation {
|
|
||||||
inner: TrackedObject<InnerMergeOperation>,
|
|
||||||
}
|
|
||||||
|
|
||||||
struct InnerMergeOperation {
|
|
||||||
target_opstamp: u64,
|
|
||||||
segment_ids: Vec<SegmentId>,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl MergeOperation {
|
|
||||||
pub fn new(
|
|
||||||
inventory: &MergeOperationInventory,
|
|
||||||
target_opstamp: u64,
|
|
||||||
segment_ids: Vec<SegmentId>,
|
|
||||||
) -> MergeOperation {
|
|
||||||
let inner_merge_operation = InnerMergeOperation {
|
|
||||||
target_opstamp,
|
|
||||||
segment_ids,
|
|
||||||
};
|
|
||||||
MergeOperation {
|
|
||||||
inner: inventory.0.track(inner_merge_operation),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn target_opstamp(&self) -> u64 {
|
|
||||||
self.inner.target_opstamp
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn segment_ids(&self) -> &[SegmentId] {
|
|
||||||
&self.inner.segment_ids[..]
|
|
||||||
}
|
|
||||||
}
|
|
||||||
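The removed file's inventory leans on the census crate: a merge operation remains visible to `segment_in_merge()` only while its `TrackedObject` handle is alive. A small hedged sketch of that tracking pattern, using only the `track` and `list` calls that appear above; the `Op` type and its field are made up for illustration:

```rust
// Sketch only: live-object tracking with census, mirroring MergeOperationInventory.
use census::Inventory;

struct Op {
    segment: u32,
}

fn main() {
    let inventory: Inventory<Op> = Inventory::new();
    // While this handle is alive, the operation shows up in `list()`.
    let _tracked = inventory.track(Op { segment: 7 });
    let in_flight: Vec<u32> = inventory.list().iter().map(|op| op.segment).collect();
    println!("segments currently in merge: {:?}", in_flight);
}
```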
@@ -11,7 +11,7 @@ pub struct MergeCandidate(pub Vec<SegmentId>);
 ///
 /// Every time a the list of segments changes, the segment updater
 /// asks the merge policy if some segments should be merged.
-pub trait MergePolicy: marker::Send + marker::Sync + Debug {
+pub trait MergePolicy: MergePolicyClone + marker::Send + marker::Sync + Debug {
     /// Given the list of segment metas, returns the list of merge candidates.
     ///
     /// This call happens on the segment updater thread, and will block
@@ -19,6 +19,21 @@ pub trait MergePolicy: marker::Send + marker::Sync + Debug {
     fn compute_merge_candidates(&self, segments: &[SegmentMeta]) -> Vec<MergeCandidate>;
 }

+/// MergePolicyClone
+pub trait MergePolicyClone {
+    /// Returns a boxed clone of the MergePolicy.
+    fn box_clone(&self) -> Box<MergePolicy>;
+}
+
+impl<T> MergePolicyClone for T
+where
+    T: 'static + MergePolicy + Clone,
+{
+    fn box_clone(&self) -> Box<MergePolicy> {
+        Box::new(self.clone())
+    }
+}
+
 /// Never merge segments.
 #[derive(Debug, Clone)]
 pub struct NoMergePolicy;
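A hedged sketch of a custom policy against the trait shown above: it only has to implement `compute_merge_candidates`, and deriving `Clone` satisfies the `MergePolicyClone` blanket impl introduced on one side of this hunk. The `use` paths are assumptions; only the trait signature and the `MergeCandidate(pub Vec<SegmentId>)` tuple struct come from the diff itself:

```rust
// Sketch only: a trivial policy that proposes merging everything whenever
// more than one segment exists. Import paths are assumed, not verified.
use tantivy::merge_policy::{MergeCandidate, MergePolicy};
use tantivy::SegmentMeta;

#[derive(Debug, Clone)]
struct MergeEverything;

impl MergePolicy for MergeEverything {
    fn compute_merge_candidates(&self, segments: &[SegmentMeta]) -> Vec<MergeCandidate> {
        if segments.len() <= 1 {
            return Vec::new();
        }
        // One candidate containing every live segment id.
        let ids: Vec<_> = segments.iter().map(|meta| meta.id()).collect();
        vec![MergeCandidate(ids)]
    }
}
```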
|
|||||||
@@ -1,4 +1,3 @@
|
|||||||
use common::MAX_DOC_LIMIT;
|
|
||||||
use core::Segment;
|
use core::Segment;
|
||||||
use core::SegmentReader;
|
use core::SegmentReader;
|
||||||
use core::SerializableSegment;
|
use core::SerializableSegment;
|
||||||
@@ -24,7 +23,6 @@ use termdict::TermMerger;
|
|||||||
use termdict::TermOrdinal;
|
use termdict::TermOrdinal;
|
||||||
use DocId;
|
use DocId;
|
||||||
use Result;
|
use Result;
|
||||||
use TantivyError;
|
|
||||||
|
|
||||||
fn compute_total_num_tokens(readers: &[SegmentReader], field: Field) -> u64 {
|
fn compute_total_num_tokens(readers: &[SegmentReader], field: Field) -> u64 {
|
||||||
let mut total_tokens = 0u64;
|
let mut total_tokens = 0u64;
|
||||||
@@ -42,15 +40,13 @@ fn compute_total_num_tokens(readers: &[SegmentReader], field: Field) -> u64 {
             total_tokens += reader.inverted_index(field).total_num_tokens();
         }
     }
-    total_tokens
-        + count
-            .iter()
-            .cloned()
-            .enumerate()
-            .map(|(fieldnorm_ord, count)| {
-                count as u64 * u64::from(FieldNormReader::id_to_fieldnorm(fieldnorm_ord as u8))
-            })
-            .sum::<u64>()
+    total_tokens + count
+        .iter()
+        .cloned()
+        .enumerate()
+        .map(|(fieldnorm_ord, count)| {
+            count as u64 * u64::from(FieldNormReader::id_to_fieldnorm(fieldnorm_ord as u8))
+        }).sum::<u64>()
 }

 pub struct IndexMerger {
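The fallback term in this function reconstructs token counts from fieldnorms: `count[ord]` holds how many documents have fieldnorm ordinal `ord`, each ordinal maps back to an approximate token count, and the products are summed. A tiny self-contained sketch of just that arithmetic; the identity ordinal-to-fieldnorm closure stands in for `FieldNormReader::id_to_fieldnorm` and is an assumption for illustration:

```rust
// Sketch only: the fieldnorm-histogram arithmetic, detached from tantivy's types.
fn tokens_from_fieldnorm_counts(count: &[u64], id_to_fieldnorm: impl Fn(u8) -> u32) -> u64 {
    count
        .iter()
        .cloned()
        .enumerate()
        .map(|(ord, c)| c * u64::from(id_to_fieldnorm(ord as u8)))
        .sum()
}

fn main() {
    // Three docs at fieldnorm id 2 (≈2 tokens each) and one doc at id 3 (≈3 tokens):
    let counts = [0u64, 0, 3, 1];
    let total = tokens_from_fieldnorm_counts(&counts, |id| u32::from(id));
    assert_eq!(total, 2 * 3 + 3 * 1); // 9 tokens in total
}
```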
@@ -152,14 +148,6 @@ impl IndexMerger {
                 readers.push(reader);
             }
         }
-        if max_doc >= MAX_DOC_LIMIT {
-            let err_msg = format!(
-                "The segment resulting from this merge would have {} docs,\
-                 which exceeds the limit {}.",
-                max_doc, MAX_DOC_LIMIT
-            );
-            return Err(TantivyError::InvalidArgument(err_msg));
-        }
         Ok(IndexMerger {
             schema,
             readers,
@@ -204,17 +192,17 @@ impl IndexMerger {
                     fast_field_serializer,
                 )?;
             }
-            FieldType::U64(ref options)
-            | FieldType::I64(ref options)
-            | FieldType::Date(ref options) => match options.get_fastfield_cardinality() {
-                Some(Cardinality::SingleValue) => {
-                    self.write_single_fast_field(field, fast_field_serializer)?;
+            FieldType::U64(ref options) | FieldType::I64(ref options) => {
+                match options.get_fastfield_cardinality() {
+                    Some(Cardinality::SingleValue) => {
+                        self.write_single_fast_field(field, fast_field_serializer)?;
+                    }
+                    Some(Cardinality::MultiValues) => {
+                        self.write_multi_fast_field(field, fast_field_serializer)?;
+                    }
+                    None => {}
                 }
-                Some(Cardinality::MultiValues) => {
-                    self.write_multi_fast_field(field, fast_field_serializer)?;
-                }
-                None => {}
-            },
+            }
             FieldType::Str(_) => {
                 // We don't handle str fast field for the moment
                 // They can be implemented using what is done
@@ -535,8 +523,7 @@ impl IndexMerger {
                     }
                 }
                 None
-            })
-            .collect();
+            }).collect();

         // At this point, `segment_postings` contains the posting list
         // of all of the segments containing the given term.
@@ -627,7 +614,7 @@ impl IndexMerger {
                 store_writer.store(&doc)?;
             }
         } else {
-            store_writer.stack(&store_reader)?;
+            store_writer.stack(store_reader)?;
         }
     }
     Ok(())
@@ -648,9 +635,10 @@ impl SerializableSegment for IndexMerger {
 #[cfg(test)]
 mod tests {
     use byteorder::{BigEndian, ReadBytesExt, WriteBytesExt};
+    use collector::chain;
     use collector::tests::TestCollector;
     use collector::tests::{BytesFastFieldTestCollector, FastFieldTestCollector};
-    use collector::{Count, FacetCollector};
+    use collector::FacetCollector;
     use core::Index;
     use futures::Future;
     use query::AllQuery;
@@ -659,12 +647,10 @@ mod tests {
     use schema;
     use schema::Cardinality;
     use schema::Document;
-    use schema::Facet;
     use schema::IndexRecordOption;
     use schema::IntOptions;
     use schema::Term;
     use schema::TextFieldIndexing;
-    use schema::INDEXED;
     use std::io::Cursor;
     use DocAddress;
     use IndexWriter;
@@ -672,22 +658,19 @@ mod tests {

     #[test]
     fn test_index_merger_no_deletes() {
-        let mut schema_builder = schema::Schema::builder();
+        let mut schema_builder = schema::SchemaBuilder::default();
         let text_fieldtype = schema::TextOptions::default()
             .set_indexing_options(
                 TextFieldIndexing::default()
                     .set_tokenizer("default")
                     .set_index_option(IndexRecordOption::WithFreqs),
-            )
-            .set_stored();
+            ).set_stored();
         let text_field = schema_builder.add_text_field("text", text_fieldtype);
-        let date_field = schema_builder.add_date_field("date", INDEXED);
         let score_fieldtype = schema::IntOptions::default().set_fast(Cardinality::SingleValue);
         let score_field = schema_builder.add_u64_field("score", score_fieldtype);
         let bytes_score_field = schema_builder.add_bytes_field("score_bytes");
         let index = Index::create_in_ram(schema_builder.build());
-        let reader = index.reader().unwrap();
-        let curr_time = chrono::Utc::now();
         let add_score_bytes = |doc: &mut Document, score: u32| {
             let mut bytes = Vec::new();
             bytes
@@ -704,7 +687,6 @@ mod tests {
             let mut doc = Document::default();
             doc.add_text(text_field, "af b");
             doc.add_u64(score_field, 3);
-            doc.add_date(date_field, &curr_time);
             add_score_bytes(&mut doc, 3);
             index_writer.add_document(doc);
         }
@@ -730,7 +712,6 @@ mod tests {
         {
             let mut doc = Document::default();
             doc.add_text(text_field, "af b");
-            doc.add_date(date_field, &curr_time);
             doc.add_u64(score_field, 11);
             add_score_bytes(&mut doc, 11);
             index_writer.add_document(doc);
@@ -758,39 +739,30 @@ mod tests {
             index_writer.wait_merging_threads().unwrap();
         }
         {
-            reader.reload().unwrap();
-            let searcher = reader.searcher();
+            index.load_searchers().unwrap();
+            let searcher = index.searcher();
             let get_doc_ids = |terms: Vec<Term>| {
+                let mut collector = TestCollector::default();
                 let query = BooleanQuery::new_multiterms_query(terms);
-                let top_docs = searcher.search(&query, &TestCollector).unwrap();
-                top_docs.docs().to_vec()
+                assert!(searcher.search(&query, &mut collector).is_ok());
+                collector.docs()
             };
             {
                 assert_eq!(
                     get_doc_ids(vec![Term::from_field_text(text_field, "a")]),
-                    vec![DocAddress(0, 1), DocAddress(0, 2), DocAddress(0, 4)]
+                    vec![1, 2, 4]
                 );
                 assert_eq!(
                     get_doc_ids(vec![Term::from_field_text(text_field, "af")]),
-                    vec![DocAddress(0, 0), DocAddress(0, 3)]
+                    vec![0, 3]
                 );
                 assert_eq!(
                     get_doc_ids(vec![Term::from_field_text(text_field, "g")]),
-                    vec![DocAddress(0, 4)]
+                    vec![4]
                 );
                 assert_eq!(
                     get_doc_ids(vec![Term::from_field_text(text_field, "b")]),
-                    vec![
-                        DocAddress(0, 0),
-                        DocAddress(0, 1),
-                        DocAddress(0, 2),
-                        DocAddress(0, 3),
-                        DocAddress(0, 4)
-                    ]
-                );
-                assert_eq!(
-                    get_doc_ids(vec![Term::from_field_date(date_field, &curr_time)]),
-                    vec![DocAddress(0, 0), DocAddress(0, 3)]
+                    vec![0, 1, 2, 3, 4]
                 );
             }
             {
@@ -816,18 +788,17 @@ mod tests {
             {
                 let get_fast_vals = |terms: Vec<Term>| {
                     let query = BooleanQuery::new_multiterms_query(terms);
-                    searcher
-                        .search(&query, &FastFieldTestCollector::for_field(score_field))
-                        .unwrap()
+                    let mut collector = FastFieldTestCollector::for_field(score_field);
+                    assert!(searcher.search(&query, &mut collector).is_ok());
+                    collector.vals()
                 };
                 let get_fast_vals_bytes = |terms: Vec<Term>| {
                     let query = BooleanQuery::new_multiterms_query(terms);
+                    let mut collector = BytesFastFieldTestCollector::for_field(bytes_score_field);
                     searcher
-                        .search(
-                            &query,
-                            &BytesFastFieldTestCollector::for_field(bytes_score_field),
-                        )
-                        .expect("failed to search")
+                        .search(&query, &mut collector)
+                        .expect("failed to search");
+                    collector.vals()
                 };
                 assert_eq!(
                     get_fast_vals(vec![Term::from_field_text(text_field, "a")]),
@@ -843,27 +814,34 @@ mod tests {

     #[test]
     fn test_index_merger_with_deletes() {
-        let mut schema_builder = schema::Schema::builder();
+        let mut schema_builder = schema::SchemaBuilder::default();
         let text_fieldtype = schema::TextOptions::default()
             .set_indexing_options(
                 TextFieldIndexing::default().set_index_option(IndexRecordOption::WithFreqs),
-            )
-            .set_stored();
+            ).set_stored();
         let text_field = schema_builder.add_text_field("text", text_fieldtype);
         let score_fieldtype = schema::IntOptions::default().set_fast(Cardinality::SingleValue);
         let score_field = schema_builder.add_u64_field("score", score_fieldtype);
         let bytes_score_field = schema_builder.add_bytes_field("score_bytes");
         let index = Index::create_in_ram(schema_builder.build());
-        let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
-        let reader = index.reader().unwrap();
+        let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
         let search_term = |searcher: &Searcher, term: Term| {
-            let collector = FastFieldTestCollector::for_field(score_field);
-            let bytes_collector = BytesFastFieldTestCollector::for_field(bytes_score_field);
+            let mut collector = FastFieldTestCollector::for_field(score_field);
+            let mut bytes_collector = BytesFastFieldTestCollector::for_field(bytes_score_field);
             let term_query = TermQuery::new(term, IndexRecordOption::Basic);
-            let (scores, bytes) = searcher
-                .search(&term_query, &(collector, bytes_collector))
-                .unwrap();
-            let mut score_bytes = Cursor::new(bytes);
+            {
+                let mut combined_collector =
+                    chain().push(&mut collector).push(&mut bytes_collector);
+                searcher
+                    .search(&term_query, &mut combined_collector)
+                    .unwrap();
+            }
+
+            let scores = collector.vals();
+
+            let mut score_bytes = Cursor::new(bytes_collector.vals());
             for &score in &scores {
                 assert_eq!(score as u32, score_bytes.read_u32::<BigEndian>().unwrap());
             }
@@ -876,24 +854,24 @@ mod tests {
         {
             // a first commit
             index_writer.add_document(doc!(
                 text_field => "a b d",
                 score_field => 1u64,
                 bytes_score_field => vec![0u8, 0, 0, 1],
             ));
             index_writer.add_document(doc!(
                 text_field => "b c",
                 score_field => 2u64,
                 bytes_score_field => vec![0u8, 0, 0, 2],
             ));
             index_writer.delete_term(Term::from_field_text(text_field, "c"));
             index_writer.add_document(doc!(
                 text_field => "c d",
                 score_field => 3u64,
                 bytes_score_field => vec![0u8, 0, 0, 3],
             ));
             index_writer.commit().expect("committed");
-            reader.reload().unwrap();
-            let searcher = reader.searcher();
+            index.load_searchers().unwrap();
+            let ref searcher = *index.searcher();
             assert_eq!(searcher.num_docs(), 2);
             assert_eq!(searcher.segment_readers()[0].num_docs(), 2);
             assert_eq!(searcher.segment_readers()[0].max_doc(), 3);
@@ -917,37 +895,37 @@ mod tests {
         {
             // a second commit
             index_writer.add_document(doc!(
                 text_field => "a d e",
                 score_field => 4_000u64,
                 bytes_score_field => vec![0u8, 0, 0, 4],
             ));
             index_writer.add_document(doc!(
                 text_field => "e f",
                 score_field => 5_000u64,
                 bytes_score_field => vec![0u8, 0, 0, 5],
             ));
             index_writer.delete_term(Term::from_field_text(text_field, "a"));
             index_writer.delete_term(Term::from_field_text(text_field, "f"));
             index_writer.add_document(doc!(
                 text_field => "f g",
                 score_field => 6_000u64,
                 bytes_score_field => vec![0u8, 0, 23, 112],
             ));
             index_writer.add_document(doc!(
                 text_field => "g h",
                 score_field => 7_000u64,
                 bytes_score_field => vec![0u8, 0, 27, 88],
             ));
             index_writer.commit().expect("committed");
-            reader.reload().unwrap();
-            let searcher = reader.searcher();
+            index.load_searchers().unwrap();
+            let searcher = index.searcher();

             assert_eq!(searcher.segment_readers().len(), 2);
             assert_eq!(searcher.num_docs(), 3);
-            assert_eq!(searcher.segment_readers()[0].num_docs(), 2);
-            assert_eq!(searcher.segment_readers()[0].max_doc(), 4);
-            assert_eq!(searcher.segment_readers()[1].num_docs(), 1);
-            assert_eq!(searcher.segment_readers()[1].max_doc(), 3);
+            assert_eq!(searcher.segment_readers()[0].num_docs(), 1);
+            assert_eq!(searcher.segment_readers()[0].max_doc(), 3);
+            assert_eq!(searcher.segment_readers()[1].num_docs(), 2);
+            assert_eq!(searcher.segment_readers()[1].max_doc(), 4);
             assert_eq!(
                 search_term(&searcher, Term::from_field_text(text_field, "a")),
                 empty_vec
@@ -981,15 +959,15 @@ mod tests {
                 .segment_reader(0)
                 .fast_field_reader::<u64>(score_field)
                 .unwrap();
-            assert_eq!(score_field_reader.min_value(), 4000);
-            assert_eq!(score_field_reader.max_value(), 7000);
+            assert_eq!(score_field_reader.min_value(), 1);
+            assert_eq!(score_field_reader.max_value(), 3);

             let score_field_reader = searcher
                 .segment_reader(1)
                 .fast_field_reader::<u64>(score_field)
                 .unwrap();
-            assert_eq!(score_field_reader.min_value(), 1);
-            assert_eq!(score_field_reader.max_value(), 3);
+            assert_eq!(score_field_reader.min_value(), 4000);
+            assert_eq!(score_field_reader.max_value(), 7000);
         }
         {
             // merging the segments
@@ -1001,8 +979,8 @@ mod tests {
                 .expect("Failed to initiate merge")
                 .wait()
                 .expect("Merging failed");
-            reader.reload().unwrap();
-            let searcher = reader.searcher();
+            index.load_searchers().unwrap();
+            let ref searcher = *index.searcher();
             assert_eq!(searcher.segment_readers().len(), 1);
             assert_eq!(searcher.num_docs(), 3);
             assert_eq!(searcher.segment_readers()[0].num_docs(), 3);
@@ -1047,8 +1025,8 @@ mod tests {
             index_writer.delete_term(Term::from_field_text(text_field, "c"));
             index_writer.commit().unwrap();

-            reader.reload().unwrap();
-            let searcher = reader.searcher();
+            index.load_searchers().unwrap();
+            let ref searcher = *index.searcher();
             assert_eq!(searcher.segment_readers().len(), 1);
             assert_eq!(searcher.num_docs(), 2);
             assert_eq!(searcher.segment_readers()[0].num_docs(), 2);
@@ -1098,9 +1076,9 @@ mod tests {
                 .expect("Failed to initiate merge")
                 .wait()
                 .expect("Merging failed");
-            reader.reload().unwrap();
+            index.load_searchers().unwrap();

-            let searcher = reader.searcher();
+            let ref searcher = *index.searcher();
             assert_eq!(searcher.segment_readers().len(), 1);
             assert_eq!(searcher.num_docs(), 2);
             assert_eq!(searcher.segment_readers()[0].num_docs(), 2);
@@ -1144,27 +1122,30 @@ mod tests {
         {
            // Test removing all docs
            index_writer.delete_term(Term::from_field_text(text_field, "g"));
-           index_writer.commit().unwrap();
            let segment_ids = index
                .searchable_segment_ids()
                .expect("Searchable segments failed.");
-           reader.reload().unwrap();
+           index_writer
+               .merge(&segment_ids)
+               .expect("Failed to initiate merge")
+               .wait()
+               .expect("Merging failed");
+           index.load_searchers().unwrap();

-           let searcher = reader.searcher();
-           assert!(segment_ids.is_empty());
-           assert!(searcher.segment_readers().is_empty());
+           let ref searcher = *index.searcher();
+           assert_eq!(searcher.segment_readers().len(), 1);
            assert_eq!(searcher.num_docs(), 0);
         }
     }

     #[test]
     fn test_merge_facets() {
-        let mut schema_builder = schema::Schema::builder();
+        let mut schema_builder = schema::SchemaBuilder::default();
         let facet_field = schema_builder.add_facet_field("facet");
         let index = Index::create_in_ram(schema_builder.build());
-        let reader = index.reader().unwrap();
+        use schema::Facet;
         {
-            let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
+            let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
             let index_doc = |index_writer: &mut IndexWriter, doc_facets: &[&str]| {
                 let mut doc = Document::default();
                 for facet in doc_facets {
@@ -1191,16 +1172,20 @@ mod tests {
             index_doc(&mut index_writer, &["/top/e", "/top/f"]);
             index_writer.commit().expect("committed");
         }
-        reader.reload().unwrap();
+        index.load_searchers().unwrap();
         let test_searcher = |expected_num_docs: usize, expected: &[(&str, u64)]| {
-            let searcher = reader.searcher();
+            let searcher = index.searcher();
             let mut facet_collector = FacetCollector::for_field(facet_field);
             facet_collector.add_facet(Facet::from("/top"));
-            let (count, facet_counts) = searcher
-                .search(&AllQuery, &(Count, facet_collector))
-                .unwrap();
-            assert_eq!(count, expected_num_docs);
+            use collector::{CountCollector, MultiCollector};
+            let mut count_collector = CountCollector::default();
+            {
+                let mut multi_collectors =
+                    MultiCollector::from(vec![&mut count_collector, &mut facet_collector]);
+                searcher.search(&AllQuery, &mut multi_collectors).unwrap();
+            }
+            assert_eq!(count_collector.count(), expected_num_docs);
+            let facet_counts = facet_collector.harvest();
             let facets: Vec<(String, u64)> = facet_counts
                 .get("/top")
                 .map(|(facet, count)| (facet.to_string(), count))
@@ -1224,19 +1209,21 @@ mod tests {
                 ("/top/f", 1),
             ],
         );

         // Merging the segments
         {
             let segment_ids = index
                 .searchable_segment_ids()
                 .expect("Searchable segments failed.");
-            let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
+            let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
             index_writer
                 .merge(&segment_ids)
                 .expect("Failed to initiate merge")
                 .wait()
                 .expect("Merging failed");
             index_writer.wait_merging_threads().unwrap();
-            reader.reload().unwrap();
+            index.load_searchers().unwrap();
             test_searcher(
                 11,
                 &[
|
|||||||
|
|
||||||
// Deleting one term
|
// Deleting one term
|
||||||
{
|
{
|
||||||
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
|
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
|
||||||
let facet = Facet::from_path(vec!["top", "a", "firstdoc"]);
|
let facet = Facet::from_path(vec!["top", "a", "firstdoc"]);
|
||||||
let facet_term = Term::from_facet(facet_field, &facet);
|
let facet_term = Term::from_facet(facet_field, &facet);
|
||||||
index_writer.delete_term(facet_term);
|
index_writer.delete_term(facet_term);
|
||||||
index_writer.commit().unwrap();
|
index_writer.commit().unwrap();
|
||||||
reader.reload().unwrap();
|
index.load_searchers().unwrap();
|
||||||
test_searcher(
|
test_searcher(
|
||||||
9,
|
9,
|
||||||
&[
|
&[
|
||||||
@@ -1272,45 +1259,17 @@ mod tests {
         }
     }

-    #[test]
-    fn test_bug_merge() {
-        let mut schema_builder = schema::Schema::builder();
-        let int_field = schema_builder.add_u64_field("intvals", INDEXED);
-        let index = Index::create_in_ram(schema_builder.build());
-        let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
-        index_writer.add_document(doc!(int_field => 1u64));
-        index_writer.commit().expect("commit failed");
-        index_writer.add_document(doc!(int_field => 1u64));
-        index_writer.commit().expect("commit failed");
-        let reader = index.reader().unwrap();
-        let searcher = reader.searcher();
-        assert_eq!(searcher.num_docs(), 2);
-        index_writer.delete_term(Term::from_field_u64(int_field, 1));
-        let segment_ids = index
-            .searchable_segment_ids()
-            .expect("Searchable segments failed.");
-        index_writer
-            .merge(&segment_ids)
-            .expect("Failed to initiate merge")
-            .wait()
-            .expect("Merging failed");
-        reader.reload().unwrap();
-        // commit has not been called yet. The document should still be
-        // there.
-        assert_eq!(reader.searcher().num_docs(), 2);
-    }
-
     #[test]
     fn test_merge_multivalued_int_fields_all_deleted() {
-        let mut schema_builder = schema::Schema::builder();
+        let mut schema_builder = schema::SchemaBuilder::default();
         let int_options = IntOptions::default()
             .set_fast(Cardinality::MultiValues)
             .set_indexed();
         let int_field = schema_builder.add_u64_field("intvals", int_options);
         let index = Index::create_in_ram(schema_builder.build());
-        let reader = index.reader().unwrap();
         {
-            let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
+            let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
             let mut doc = Document::default();
             doc.add_u64(int_field, 1);
             index_writer.add_document(doc.clone());
@@ -1318,34 +1277,32 @@ mod tests {
            index_writer.add_document(doc);
            index_writer.commit().expect("commit failed");
            index_writer.delete_term(Term::from_field_u64(int_field, 1));
+           index_writer.commit().expect("commit failed");
+        }
+        index.load_searchers().unwrap();
+        let searcher = index.searcher();
+        assert_eq!(searcher.num_docs(), 0);
+        // Merging the segments
+        {
            let segment_ids = index
                .searchable_segment_ids()
                .expect("Searchable segments failed.");
+           let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
            index_writer
                .merge(&segment_ids)
                .expect("Failed to initiate merge")
                .wait()
                .expect("Merging failed");

-           // assert delete has not been committed
-           reader.reload().expect("failed to load searcher 1");
-           let searcher = reader.searcher();
-           assert_eq!(searcher.num_docs(), 2);
-
-           index_writer.commit().unwrap();
-
            index_writer.wait_merging_threads().unwrap();
         }
-        reader.reload().unwrap();
-        let searcher = reader.searcher();
+        index.load_searchers().unwrap();
+        let searcher = index.searcher();
         assert_eq!(searcher.num_docs(), 0);
     }

     #[test]
-    fn test_merge_multivalued_int_fields_simple() {
-        let mut schema_builder = schema::Schema::builder();
+    fn test_merge_multivalued_int_fields() {
+        let mut schema_builder = schema::SchemaBuilder::default();
         let int_options = IntOptions::default()
             .set_fast(Cardinality::MultiValues)
             .set_indexed();
@@ -1353,7 +1310,7 @@ mod tests {
         let index = Index::create_in_ram(schema_builder.build());

         {
-            let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
+            let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
             let index_doc = |index_writer: &mut IndexWriter, int_vals: &[u64]| {
                 let mut doc = Document::default();
                 for &val in int_vals {
@@ -1361,6 +1318,7 @@ mod tests {
                 }
                 index_writer.add_document(doc);
             };

             index_doc(&mut index_writer, &[1, 2]);
             index_doc(&mut index_writer, &[1, 2, 3]);
             index_doc(&mut index_writer, &[4, 5]);
@@ -1369,14 +1327,19 @@ mod tests {
             index_doc(&mut index_writer, &[3]);
             index_doc(&mut index_writer, &[17]);
             index_writer.commit().expect("committed");

             index_doc(&mut index_writer, &[20]);
             index_writer.commit().expect("committed");

             index_doc(&mut index_writer, &[28, 27]);
             index_doc(&mut index_writer, &[1_000]);

             index_writer.commit().expect("committed");
         }
-        let reader = index.reader().unwrap();
-        let searcher = reader.searcher();
+        index.load_searchers().unwrap();
+
+        let searcher = index.searcher();

         let mut vals: Vec<u64> = Vec::new();

         {
@@ -1405,59 +1368,41 @@ mod tests {
             assert_eq!(&vals, &[17]);
         }

-        println!(
-            "{:?}",
-            searcher
-                .segment_readers()
-                .iter()
-                .map(|reader| reader.max_doc())
-                .collect::<Vec<_>>()
-        );

         {
             let segment = searcher.segment_reader(1u32);
             let ff_reader = segment.multi_fast_field_reader(int_field).unwrap();
             ff_reader.get_vals(0, &mut vals);
+            assert_eq!(&vals, &[20]);
+        }
+
+        {
+            let segment = searcher.segment_reader(2u32);
+            let ff_reader = segment.multi_fast_field_reader(int_field).unwrap();
+            ff_reader.get_vals(0, &mut vals);
             assert_eq!(&vals, &[28, 27]);

             ff_reader.get_vals(1, &mut vals);
             assert_eq!(&vals, &[1_000]);
         }

-        {
-            let segment = searcher.segment_reader(2u32);
-            let ff_reader = segment.multi_fast_field_reader(int_field).unwrap();
-            ff_reader.get_vals(0, &mut vals);
-            assert_eq!(&vals, &[20]);
-        }

         // Merging the segments
         {
             let segment_ids = index
                 .searchable_segment_ids()
                 .expect("Searchable segments failed.");
-            let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
+            let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
             index_writer
                 .merge(&segment_ids)
                 .expect("Failed to initiate merge")
                 .wait()
                 .expect("Merging failed");
-            index_writer
-                .wait_merging_threads()
-                .expect("Wait for merging threads");
+            index_writer.wait_merging_threads().unwrap();
         }
-        reader.reload().expect("Load searcher");
+
+        index.load_searchers().unwrap();

         {
-            let searcher = reader.searcher();
-            println!(
-                "{:?}",
-                searcher
-                    .segment_readers()
-                    .iter()
-                    .map(|reader| reader.max_doc())
-                    .collect::<Vec<_>>()
-            );
+            let searcher = index.searcher();
             let segment = searcher.segment_reader(0u32);
             let ff_reader = segment.multi_fast_field_reader(int_field).unwrap();

@@ -1483,13 +1428,13 @@ mod tests {
             assert_eq!(&vals, &[17]);

             ff_reader.get_vals(7, &mut vals);
-            assert_eq!(&vals, &[28, 27]);
+            assert_eq!(&vals, &[20]);

             ff_reader.get_vals(8, &mut vals);
-            assert_eq!(&vals, &[1_000]);
+            assert_eq!(&vals, &[28, 27]);

             ff_reader.get_vals(9, &mut vals);
-            assert_eq!(&vals, &[20]);
+            assert_eq!(&vals, &[1_000]);
         }
     }
 }
@@ -1,9 +1,8 @@
 pub mod delete_queue;
+mod directory_lock;
 mod doc_opstamp_mapping;
 pub mod index_writer;
 mod log_merge_policy;
-mod merge_operation;
 pub mod merge_policy;
 pub mod merger;
 pub mod operation;
@@ -16,12 +15,14 @@ pub mod segment_updater;
 mod segment_writer;
 mod stamper;

+pub(crate) use self::directory_lock::DirectoryLock;
+pub use self::directory_lock::LockType;
+
 pub use self::index_writer::IndexWriter;
 pub use self::log_merge_policy::LogMergePolicy;
-pub use self::merge_operation::{MergeOperation, MergeOperationInventory};
 pub use self::merge_policy::{MergeCandidate, MergePolicy, NoMergePolicy};
 pub use self::prepared_commit::PreparedCommit;
-pub use self::segment_entry::SegmentEntry;
+pub use self::segment_entry::{SegmentEntry, SegmentState};
 pub use self::segment_manager::SegmentManager;
 pub use self::segment_serializer::SegmentSerializer;
 pub use self::segment_writer::SegmentWriter;
@@ -14,10 +14,3 @@ pub struct AddOperation {
     pub opstamp: u64,
     pub document: Document,
 }
-
-/// UserOperation is an enum type that encapsulates other operation types.
-#[derive(Eq, PartialEq, Debug)]
-pub enum UserOperation {
-    Add(Document),
-    Delete(Term),
-}
@@ -6,20 +6,14 @@ pub struct PreparedCommit<'a> {
     index_writer: &'a mut IndexWriter,
     payload: Option<String>,
     opstamp: u64,
-    soft: bool,
 }

 impl<'a> PreparedCommit<'a> {
-    pub(crate) fn new(
-        index_writer: &'a mut IndexWriter,
-        opstamp: u64,
-        soft: bool,
-    ) -> PreparedCommit {
+    pub(crate) fn new(index_writer: &'a mut IndexWriter, opstamp: u64) -> PreparedCommit {
         PreparedCommit {
             index_writer,
             payload: None,
             opstamp,
-            soft,
         }
     }

@@ -39,7 +33,7 @@ impl<'a> PreparedCommit<'a> {
         info!("committing {}", self.opstamp);
         self.index_writer
             .segment_updater()
-            .commit(self.opstamp, self.payload, self.soft)?;
+            .commit(self.opstamp, self.payload)?;
         Ok(self.opstamp)
     }
 }
@@ -4,6 +4,21 @@ use core::SegmentMeta;
 use indexer::delete_queue::DeleteCursor;
 use std::fmt;

+#[derive(Clone, Copy, PartialEq, Eq, Debug)]
+pub enum SegmentState {
+    Ready,
+    InMerge,
+}
+
+impl SegmentState {
+    pub fn letter_code(self) -> char {
+        match self {
+            SegmentState::InMerge => 'M',
+            SegmentState::Ready => 'R',
+        }
+    }
+}
+
 /// A segment entry describes the state of
 /// a given segment, at a given instant.
 ///
@@ -20,9 +35,9 @@ use std::fmt;
 #[derive(Clone)]
 pub struct SegmentEntry {
     meta: SegmentMeta,
+    state: SegmentState,
     delete_bitset: Option<BitSet>,
     delete_cursor: DeleteCursor,
-    opstamp: u64,
 }

 impl SegmentEntry {
@@ -31,20 +46,15 @@ impl SegmentEntry {
         segment_meta: SegmentMeta,
         delete_cursor: DeleteCursor,
         delete_bitset: Option<BitSet>,
-        opstamp: u64,
     ) -> SegmentEntry {
         SegmentEntry {
             meta: segment_meta,
+            state: SegmentState::Ready,
             delete_bitset,
             delete_cursor,
-            opstamp,
         }
     }

-    pub fn opstamp(&self) -> u64 {
-        self.opstamp
-    }
-
     /// Return a reference to the segment entry deleted bitset.
     ///
     /// `DocId` in this bitset are flagged as deleted.
@@ -53,8 +63,7 @@ impl SegmentEntry {
     }

     /// Set the `SegmentMeta` for this segment.
-    pub fn set_meta(&mut self, opstamp: u64, segment_meta: SegmentMeta) {
-        self.opstamp = opstamp;
+    pub fn set_meta(&mut self, segment_meta: SegmentMeta) {
         self.meta = segment_meta;
     }

@@ -63,6 +72,14 @@ impl SegmentEntry {
         &mut self.delete_cursor
     }

+    /// Return the `SegmentEntry`.
+    ///
+    /// The state describes whether the segment is available for
+    /// a merge or not.
+    pub fn state(&self) -> SegmentState {
+        self.state
+    }
+
     /// Returns the segment id.
     pub fn segment_id(&self) -> SegmentId {
         self.meta.id()
@@ -72,10 +89,33 @@ impl SegmentEntry {
     pub fn meta(&self) -> &SegmentMeta {
         &self.meta
     }

+    /// Mark the `SegmentEntry` as in merge.
+    ///
+    /// Only segments that are not already
+    /// in a merge are elligible for future merge.
+    pub fn start_merge(&mut self) {
+        self.state = SegmentState::InMerge;
+    }
+
+    /// Cancel a merge
+    ///
+    /// If a merge fails, it is important to switch
+    /// the segment back to a idle state, so that it
+    /// may be elligible for future merges.
+    pub fn cancel_merge(&mut self) {
+        self.state = SegmentState::Ready;
+    }
+
+    /// Returns true iff a segment should
+    /// be considered for a merge.
+    pub fn is_ready(&self) -> bool {
+        self.state == SegmentState::Ready
+    }
 }

 impl fmt::Debug for SegmentEntry {
     fn fmt(&self, formatter: &mut fmt::Formatter) -> fmt::Result {
-        write!(formatter, "SegmentEntry({:?})", self.meta)
+        write!(formatter, "SegmentEntry({:?}, {:?})", self.meta, self.state)
     }
 }
@@ -11,47 +11,12 @@ use std::path::PathBuf;
|
|||||||
use std::sync::RwLock;
|
use std::sync::RwLock;
|
||||||
use std::sync::{RwLockReadGuard, RwLockWriteGuard};
|
use std::sync::{RwLockReadGuard, RwLockWriteGuard};
|
||||||
use Result as TantivyResult;
|
use Result as TantivyResult;
|
||||||
use std::sync::Arc;
|
|
||||||
use std::collections::HashMap;
|
|
||||||
|
|
||||||
/// Provides a read-only view of the available segments.
|
|
||||||
#[derive(Clone)]
|
|
||||||
pub struct AvailableSegments {
|
|
||||||
registers: Arc<RwLock<SegmentRegisters>>,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl AvailableSegments {
|
|
||||||
pub fn committed(&self) -> Vec<SegmentMeta> {
|
|
||||||
self.registers
|
|
||||||
.read()
|
|
||||||
.unwrap()
|
|
||||||
.committed
|
|
||||||
.segment_metas()
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn soft_committed(&self) -> Vec<SegmentMeta> {
|
|
||||||
self.registers
|
|
||||||
.read()
|
|
||||||
.unwrap()
|
|
||||||
.soft_committed
|
|
||||||
.segment_metas()
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
|
#[derive(Default)]
|
||||||
struct SegmentRegisters {
|
struct SegmentRegisters {
|
||||||
uncommitted: HashMap<SegmentId, SegmentEntry>,
|
uncommitted: SegmentRegister,
|
||||||
committed: SegmentRegister,
|
committed: SegmentRegister,
|
||||||
/// soft commits can advance committed segment to a future delete
|
writing: HashSet<SegmentId>,
|
||||||
/// opstamp.
|
|
||||||
///
|
|
||||||
/// In that case the same `SegmentId` can appear in both `committed`
|
|
||||||
/// and in `committed_in_the_future`.
|
|
||||||
///
|
|
||||||
/// We do not consider these segments for merges.
|
|
||||||
soft_committed: SegmentRegister,
|
|
||||||
/// `DeleteCursor`, positionned on the soft commit.
|
|
||||||
delete_cursor: DeleteCursor,
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/// The segment manager stores the list of segments
|
/// The segment manager stores the list of segments
|
||||||
@@ -59,8 +24,9 @@ struct SegmentRegisters {
|
|||||||
///
|
///
|
||||||
/// It guarantees the atomicity of the
|
/// It guarantees the atomicity of the
|
||||||
/// changes (merges especially)
|
/// changes (merges especially)
|
||||||
|
#[derive(Default)]
|
||||||
pub struct SegmentManager {
|
pub struct SegmentManager {
|
||||||
registers: Arc<RwLock<SegmentRegisters>>
|
registers: RwLock<SegmentRegisters>,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl Debug for SegmentManager {
|
impl Debug for SegmentManager {
|
||||||
@@ -75,23 +41,12 @@ impl Debug for SegmentManager {
|
|||||||
}
|
}
|
||||||
|
|
||||||
pub fn get_mergeable_segments(
|
pub fn get_mergeable_segments(
|
||||||
in_merge_segment_ids: &HashSet<SegmentId>,
|
|
||||||
segment_manager: &SegmentManager,
|
segment_manager: &SegmentManager,
|
||||||
) -> (Vec<SegmentMeta>, Vec<SegmentMeta>) {
|
) -> (Vec<SegmentMeta>, Vec<SegmentMeta>) {
|
||||||
let registers_lock = segment_manager.read();
|
let registers_lock = segment_manager.read();
|
||||||
(
|
(
|
||||||
registers_lock
|
registers_lock.committed.get_mergeable_segments(),
|
||||||
.soft_committed
|
registers_lock.uncommitted.get_mergeable_segments(),
|
||||||
.get_mergeable_segments(in_merge_segment_ids),
|
|
||||||
registers_lock
|
|
||||||
.uncommitted
|
|
||||||
.values()
|
|
||||||
.map(|segment_entry| segment_entry.meta())
|
|
||||||
.filter(|segment_meta| {
|
|
||||||
!in_merge_segment_ids.contains(&segment_meta.id())
|
|
||||||
})
|
|
||||||
.cloned()
|
|
||||||
.collect::<Vec<_>>()
|
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -99,22 +54,28 @@ impl SegmentManager {
|
|||||||
pub fn from_segments(
|
pub fn from_segments(
|
||||||
segment_metas: Vec<SegmentMeta>,
|
segment_metas: Vec<SegmentMeta>,
|
||||||
delete_cursor: &DeleteCursor,
|
delete_cursor: &DeleteCursor,
|
||||||
opstamp: u64,
|
|
||||||
) -> SegmentManager {
|
) -> SegmentManager {
|
||||||
SegmentManager {
|
SegmentManager {
|
||||||
registers: Arc::new(RwLock::new(SegmentRegisters {
|
registers: RwLock::new(SegmentRegisters {
|
||||||
uncommitted: HashMap::default(),
|
uncommitted: SegmentRegister::default(),
|
||||||
committed: SegmentRegister::new(segment_metas.clone(), opstamp),
|
committed: SegmentRegister::new(segment_metas, delete_cursor),
|
||||||
soft_committed: SegmentRegister::new(segment_metas, opstamp),
|
writing: HashSet::new(),
|
||||||
delete_cursor: delete_cursor.clone(),
|
}),
|
||||||
}))
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn available_segments_view(&self) -> AvailableSegments {
|
/// Returns all of the segment entries (committed or uncommitted)
|
||||||
AvailableSegments {
|
pub fn segment_entries(&self) -> Vec<SegmentEntry> {
|
||||||
registers: self.registers.clone()
|
let registers_lock = self.read();
|
||||||
}
|
let mut segment_entries = registers_lock.uncommitted.segment_entries();
|
||||||
|
segment_entries.extend(registers_lock.committed.segment_entries());
|
||||||
|
segment_entries
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Returns the overall number of segments in the `SegmentManager`
|
||||||
|
pub fn num_segments(&self) -> usize {
|
||||||
|
let registers_lock = self.read();
|
||||||
|
registers_lock.committed.len() + registers_lock.uncommitted.len()
|
||||||
}
|
}
|
||||||
|
|
||||||
/// List the files that are useful to the index.
|
/// List the files that are useful to the index.
|
||||||
@@ -145,84 +106,40 @@ impl SegmentManager {
|
|||||||
.expect("Failed to acquire write lock on SegmentManager.")
|
.expect("Failed to acquire write lock on SegmentManager.")
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Deletes all empty segments
|
pub fn commit(&self, segment_entries: Vec<SegmentEntry>) {
|
||||||
fn remove_empty_segments(&self) {
|
|
||||||
let mut registers_lock = self.write();
|
|
||||||
registers_lock
|
|
||||||
.committed
|
|
||||||
.segment_metas()
|
|
||||||
.iter()
|
|
||||||
.filter(|segment_meta| segment_meta.num_docs() == 0)
|
|
||||||
.for_each(|segment_meta| {
|
|
||||||
registers_lock
|
|
||||||
.committed
|
|
||||||
.remove_segment(&segment_meta.id())
|
|
||||||
});
|
|
||||||
registers_lock
|
|
||||||
.soft_committed
|
|
||||||
.segment_metas()
|
|
||||||
.iter()
|
|
||||||
.filter(|segment_meta| segment_meta.num_docs() == 0)
|
|
||||||
.for_each(|segment_meta| {
|
|
||||||
registers_lock
|
|
||||||
.committed
|
|
||||||
.remove_segment(&segment_meta.id())
|
|
||||||
});
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Returns all of the segment entries (soft committed or uncommitted)
|
|
||||||
pub fn segment_entries(&self) -> Vec<SegmentEntry> {
|
|
||||||
let registers_lock = self.read();
|
|
||||||
let mut segment_entries: Vec<SegmentEntry > = registers_lock.uncommitted.values().cloned().collect();
|
|
||||||
segment_entries.extend(registers_lock.soft_committed.segment_entries(®isters_lock.delete_cursor).into_iter());
|
|
||||||
segment_entries
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
pub fn commit(&self, opstamp: u64, segment_entries: Vec<SegmentEntry>) {
|
|
||||||
let mut registers_lock = self.write();
|
let mut registers_lock = self.write();
|
||||||
|
registers_lock.committed.clear();
|
||||||
registers_lock.uncommitted.clear();
|
registers_lock.uncommitted.clear();
|
||||||
registers_lock
|
for segment_entry in segment_entries {
|
||||||
.committed
|
registers_lock.committed.add_segment_entry(segment_entry);
|
||||||
.set_commit(opstamp, segment_entries.clone());
|
}
|
||||||
registers_lock
|
|
||||||
.soft_committed
|
|
||||||
.set_commit(opstamp, segment_entries);
|
|
||||||
registers_lock.delete_cursor.skip_to(opstamp);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn soft_commit(&self, opstamp: u64, segment_entries: Vec<SegmentEntry>) {
|
/// Marks a list of segments as in merge.
|
||||||
let mut registers_lock = self.write();
|
|
||||||
registers_lock.uncommitted.clear();
|
|
||||||
registers_lock
|
|
||||||
.soft_committed
|
|
||||||
.set_commit(opstamp, segment_entries);
|
|
||||||
registers_lock.delete_cursor.skip_to(opstamp);
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Gets the list of segment_entries associated to a list of `segment_ids`.
|
|
||||||
/// This method is used when starting a merge operations.
|
|
||||||
///
|
///
|
||||||
/// Returns an error if some segments are missing, or if
|
/// Returns an error if some segments are missing, or if
|
||||||
/// the `segment_ids` are not either all soft_committed or all
|
/// the `segment_ids` are not either all committed or all
|
||||||
/// uncommitted.
|
/// uncommitted.
|
||||||
pub fn start_merge(&self, segment_ids: &[SegmentId]) -> TantivyResult<Vec<SegmentEntry>> {
|
pub fn start_merge(&self, segment_ids: &[SegmentId]) -> TantivyResult<Vec<SegmentEntry>> {
|
||||||
let registers_lock = self.read();
|
let mut registers_lock = self.write();
|
||||||
let mut segment_entries = vec![];
|
let mut segment_entries = vec![];
|
||||||
if segment_ids.iter().all(|segment_id| registers_lock.uncommitted.contains_key(segment_id)) {
|
if registers_lock.uncommitted.contains_all(segment_ids) {
|
||||||
for segment_id in segment_ids {
|
for segment_id in segment_ids {
|
||||||
let segment_entry = registers_lock.uncommitted
|
let segment_entry = registers_lock.uncommitted
|
||||||
.get(segment_id)
|
.start_merge(segment_id)
|
||||||
.expect("Segment id not found {}. Should never happen because of the contains all if-block.");
|
|
||||||
segment_entries.push(segment_entry.clone());
|
|
||||||
}
|
|
||||||
} else if registers_lock.soft_committed.contains_all(segment_ids) {
|
|
||||||
for segment_id in segment_ids {
|
|
||||||
let segment_entry = registers_lock.soft_committed
|
|
||||||
.get(segment_id, ®isters_lock.delete_cursor)
|
|
||||||
.expect("Segment id not found {}. Should never happen because of the contains all if-block.");
|
.expect("Segment id not found {}. Should never happen because of the contains all if-block.");
|
||||||
segment_entries.push(segment_entry);
|
segment_entries.push(segment_entry);
|
||||||
}
|
}
|
||||||
|
} else if registers_lock.committed.contains_all(segment_ids) {
|
||||||
|
for segment_id in segment_ids {
|
||||||
|
let segment_entry = registers_lock.committed
|
||||||
|
.start_merge(segment_id)
|
||||||
|
.expect("Segment id not found {}. Should never happen because of the contains all if-block.");
|
||||||
|
segment_entries.push(segment_entry);
|
||||||
|
}
|
||||||
|
for segment_id in segment_ids {
|
||||||
|
registers_lock.committed.start_merge(segment_id);
|
||||||
|
}
|
||||||
} else {
|
} else {
|
||||||
let error_msg = "Merge operation sent for segments that are not \
|
let error_msg = "Merge operation sent for segments that are not \
|
||||||
all uncommited or commited."
|
all uncommited or commited."
|
||||||
@@ -232,38 +149,86 @@ impl SegmentManager {
|
|||||||
Ok(segment_entries)
|
Ok(segment_entries)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub fn cancel_merge(
|
||||||
|
&self,
|
||||||
|
before_merge_segment_ids: &[SegmentId],
|
||||||
|
after_merge_segment_id: SegmentId,
|
||||||
|
) {
|
||||||
|
let mut registers_lock = self.write();
|
||||||
|
|
||||||
|
// we mark all segments are ready for merge.
|
||||||
|
{
|
||||||
|
let target_segment_register: &mut SegmentRegister;
|
||||||
|
target_segment_register = {
|
||||||
|
if registers_lock
|
||||||
|
.uncommitted
|
||||||
|
.contains_all(before_merge_segment_ids)
|
||||||
|
{
|
||||||
|
&mut registers_lock.uncommitted
|
||||||
|
} else if registers_lock
|
||||||
|
.committed
|
||||||
|
.contains_all(before_merge_segment_ids)
|
||||||
|
{
|
||||||
|
&mut registers_lock.committed
|
||||||
|
} else {
|
||||||
|
warn!("couldn't find segment in SegmentManager");
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
for segment_id in before_merge_segment_ids {
|
||||||
|
target_segment_register.cancel_merge(segment_id);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// ... and we make sure the target segment entry
|
||||||
|
// can be garbage collected.
|
||||||
|
registers_lock.writing.remove(&after_merge_segment_id);
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn write_segment(&self, segment_id: SegmentId) {
|
||||||
|
let mut registers_lock = self.write();
|
||||||
|
registers_lock.writing.insert(segment_id);
|
||||||
|
}
|
||||||
|
|
||||||
pub fn add_segment(&self, segment_entry: SegmentEntry) {
|
pub fn add_segment(&self, segment_entry: SegmentEntry) {
|
||||||
let mut registers_lock = self.write();
|
let mut registers_lock = self.write();
|
||||||
registers_lock
|
registers_lock.writing.remove(&segment_entry.segment_id());
|
||||||
.uncommitted
|
registers_lock.uncommitted.add_segment_entry(segment_entry);
|
||||||
.insert(segment_entry.segment_id(), segment_entry);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn end_merge(
|
pub fn end_merge(
|
||||||
&self,
|
&self,
|
||||||
before_merge_segment_ids: &[SegmentId],
|
before_merge_segment_ids: &[SegmentId],
|
||||||
after_merge_segment_entry: SegmentEntry
|
after_merge_segment_entry: SegmentEntry,
|
||||||
) {
|
) {
|
||||||
let mut registers_lock = self.write();
|
let mut registers_lock = self.write();
|
||||||
|
registers_lock
|
||||||
|
.writing
|
||||||
|
.remove(&after_merge_segment_entry.segment_id());
|
||||||
|
|
||||||
if before_merge_segment_ids.iter().all(|seg_id|
|
let target_register: &mut SegmentRegister = {
|
||||||
registers_lock
|
if registers_lock
|
||||||
.uncommitted
|
.uncommitted
|
||||||
.contains_key(seg_id))
|
.contains_all(before_merge_segment_ids)
|
||||||
{
|
{
|
||||||
for segment_id in before_merge_segment_ids {
|
&mut registers_lock.uncommitted
|
||||||
registers_lock.uncommitted.remove(&segment_id);
|
} else if registers_lock
|
||||||
|
.committed
|
||||||
|
.contains_all(before_merge_segment_ids)
|
||||||
|
{
|
||||||
|
&mut registers_lock.committed
|
||||||
|
} else {
|
||||||
|
warn!("couldn't find segment in SegmentManager");
|
||||||
|
return;
|
||||||
}
|
}
|
||||||
registers_lock.uncommitted.insert(after_merge_segment_entry.segment_id(),
|
};
|
||||||
after_merge_segment_entry);
|
for segment_id in before_merge_segment_ids {
|
||||||
} else {
|
target_register.remove_segment(segment_id);
|
||||||
registers_lock.committed.receive_merge(&before_merge_segment_ids, &after_merge_segment_entry);
|
|
||||||
registers_lock.soft_committed.receive_merge(&before_merge_segment_ids, &after_merge_segment_entry)
|
|
||||||
}
|
}
|
||||||
|
target_register.add_segment_entry(after_merge_segment_entry);
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn committed_segment_metas(&self) -> Vec<SegmentMeta> {
|
pub fn committed_segment_metas(&self) -> Vec<SegmentMeta> {
|
||||||
self.remove_empty_segments();
|
|
||||||
let registers_lock = self.read();
|
let registers_lock = self.read();
|
||||||
registers_lock.committed.segment_metas()
|
registers_lock.committed.segment_metas()
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -3,7 +3,6 @@ use core::SegmentMeta;
|
|||||||
use indexer::delete_queue::DeleteCursor;
|
use indexer::delete_queue::DeleteCursor;
|
||||||
use indexer::segment_entry::SegmentEntry;
|
use indexer::segment_entry::SegmentEntry;
|
||||||
use std::collections::HashMap;
|
use std::collections::HashMap;
|
||||||
use std::collections::HashSet;
|
|
||||||
use std::fmt::{self, Debug, Formatter};
|
use std::fmt::{self, Debug, Formatter};
|
||||||
|
|
||||||
/// The segment register keeps track
|
/// The segment register keeps track
|
||||||
@@ -16,15 +15,14 @@ use std::fmt::{self, Debug, Formatter};
|
|||||||
/// merge candidates.
|
/// merge candidates.
|
||||||
#[derive(Default)]
|
#[derive(Default)]
|
||||||
pub struct SegmentRegister {
|
pub struct SegmentRegister {
|
||||||
segment_states: HashMap<SegmentId, SegmentMeta>,
|
segment_states: HashMap<SegmentId, SegmentEntry>,
|
||||||
opstamp_constraint: u64,
|
|
||||||
}
|
}
|
||||||
|
|
||||||
impl Debug for SegmentRegister {
|
impl Debug for SegmentRegister {
|
||||||
fn fmt(&self, f: &mut Formatter) -> Result<(), fmt::Error> {
|
fn fmt(&self, f: &mut Formatter) -> Result<(), fmt::Error> {
|
||||||
write!(f, "SegmentRegister(")?;
|
write!(f, "SegmentRegister(")?;
|
||||||
for k in self.segment_states.keys() {
|
for (k, v) in &self.segment_states {
|
||||||
write!(f, "{}, ", k.short_uuid_string())?;
|
write!(f, "{}:{}, ", k.short_uuid_string(), v.state().letter_code())?;
|
||||||
}
|
}
|
||||||
write!(f, ")")?;
|
write!(f, ")")?;
|
||||||
Ok(())
|
Ok(())
|
||||||
@@ -36,113 +34,76 @@ impl SegmentRegister {
|
|||||||
self.segment_states.clear();
|
self.segment_states.clear();
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn get_mergeable_segments(
|
pub fn len(&self) -> usize {
|
||||||
&self,
|
self.segment_states.len()
|
||||||
in_merge_segment_ids: &HashSet<SegmentId>,
|
}
|
||||||
) -> Vec<SegmentMeta> {
|
|
||||||
|
pub fn get_mergeable_segments(&self) -> Vec<SegmentMeta> {
|
||||||
self.segment_states
|
self.segment_states
|
||||||
.values()
|
.values()
|
||||||
.filter(|segment_meta| !in_merge_segment_ids.contains(&segment_meta.id()))
|
.filter(|segment_entry| segment_entry.is_ready())
|
||||||
.cloned()
|
.map(|segment_entry| segment_entry.meta().clone())
|
||||||
.collect()
|
.collect()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub fn segment_entries(&self) -> Vec<SegmentEntry> {
|
||||||
|
self.segment_states.values().cloned().collect()
|
||||||
|
}
|
||||||
|
|
||||||
pub fn segment_metas(&self) -> Vec<SegmentMeta> {
|
pub fn segment_metas(&self) -> Vec<SegmentMeta> {
|
||||||
let mut segment_metas: Vec<SegmentMeta> = self
|
let mut segment_ids: Vec<SegmentMeta> = self
|
||||||
.segment_states
|
.segment_states
|
||||||
.values()
|
.values()
|
||||||
.cloned()
|
.map(|segment_entry| segment_entry.meta().clone())
|
||||||
.collect();
|
.collect();
|
||||||
segment_metas.sort_by_key(|meta| meta.id());
|
segment_ids.sort_by_key(|meta| meta.id());
|
||||||
segment_metas
|
segment_ids
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn segment_entries(&self, delete_cursor: &DeleteCursor) -> Vec<SegmentEntry> {
|
pub fn contains_all(&mut self, segment_ids: &[SegmentId]) -> bool {
|
||||||
self.segment_states
|
|
||||||
.values()
|
|
||||||
.map(|segment_meta| {
|
|
||||||
SegmentEntry::new(segment_meta.clone(), delete_cursor.clone(), None, self.opstamp_constraint)
|
|
||||||
})
|
|
||||||
.collect()
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn contains_all(&self, segment_ids: &[SegmentId]) -> bool {
|
|
||||||
segment_ids
|
segment_ids
|
||||||
.iter()
|
.iter()
|
||||||
.all(|segment_id| self.segment_states.contains_key(segment_id))
|
.all(|segment_id| self.segment_states.contains_key(segment_id))
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn receive_merge(&mut self,
|
pub fn add_segment_entry(&mut self, segment_entry: SegmentEntry) {
|
||||||
before_merge_segment_ids: &[SegmentId],
|
|
||||||
after_merge_segment_entry: &SegmentEntry) {
|
|
||||||
if after_merge_segment_entry.opstamp() != self.opstamp_constraint {
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
if !self.contains_all(before_merge_segment_ids) {
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
for segment_id in before_merge_segment_ids {
|
|
||||||
self.segment_states.remove(segment_id);
|
|
||||||
}
|
|
||||||
self.register_segment_entry(after_merge_segment_entry.clone());
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Registers a `SegmentEntry`.
|
|
||||||
///
|
|
||||||
/// If a segment entry associated to this `SegmentId` is already there,
|
|
||||||
/// override it with the new `SegmentEntry`.
|
|
||||||
pub fn register_segment_entry(&mut self, segment_entry: SegmentEntry) {
|
|
||||||
if self.opstamp_constraint != segment_entry.opstamp() {
|
|
||||||
panic!(format!(
|
|
||||||
"Invalid segment. Expect opstamp {}, got {}.",
|
|
||||||
self.opstamp_constraint,
|
|
||||||
segment_entry.opstamp()
|
|
||||||
));
|
|
||||||
}
|
|
||||||
if segment_entry.meta().num_docs() == 0 {
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
let segment_id = segment_entry.segment_id();
|
let segment_id = segment_entry.segment_id();
|
||||||
// Check that we are ok with deletes.
|
self.segment_states.insert(segment_id, segment_entry);
|
||||||
self.segment_states.insert(segment_id, segment_entry.meta().clone());
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn set_commit(&mut self, opstamp: u64, segment_entries: Vec<SegmentEntry>) {
|
|
||||||
self.segment_states.clear();
|
|
||||||
self.opstamp_constraint = opstamp;
|
|
||||||
for segment_entry in segment_entries {
|
|
||||||
self.register_segment_entry(segment_entry);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn remove_segment(&mut self, segment_id: &SegmentId) {
|
pub fn remove_segment(&mut self, segment_id: &SegmentId) {
|
||||||
self.segment_states.remove(&segment_id);
|
self.segment_states.remove(segment_id);
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn get(&self, segment_id: &SegmentId, delete_cursor: &DeleteCursor) -> Option<SegmentEntry> {
|
pub fn cancel_merge(&mut self, segment_id: &SegmentId) {
|
||||||
self.segment_states
|
self.segment_states
|
||||||
.get(&segment_id)
|
.get_mut(segment_id)
|
||||||
.map(|segment_meta|
|
.expect("Received a merge notification for a segment that is not registered")
|
||||||
SegmentEntry::new(
|
.cancel_merge();
|
||||||
segment_meta.clone(),
|
|
||||||
delete_cursor.clone(),
|
|
||||||
None,
|
|
||||||
self.opstamp_constraint
|
|
||||||
))
|
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn new(
|
pub fn start_merge(&mut self, segment_id: &SegmentId) -> Option<SegmentEntry> {
|
||||||
segment_metas: Vec<SegmentMeta>,
|
if let Some(segment_entry) = self.segment_states.get_mut(segment_id) {
|
||||||
opstamp: u64,
|
segment_entry.start_merge();
|
||||||
) -> SegmentRegister {
|
Some(segment_entry.clone())
|
||||||
|
} else {
|
||||||
|
None
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn new(segment_metas: Vec<SegmentMeta>, delete_cursor: &DeleteCursor) -> SegmentRegister {
|
||||||
let mut segment_states = HashMap::new();
|
let mut segment_states = HashMap::new();
|
||||||
for segment_meta in segment_metas {
|
for segment_meta in segment_metas {
|
||||||
segment_states.insert(segment_meta.id(), segment_meta);
|
let segment_id = segment_meta.id();
|
||||||
}
|
let segment_entry = SegmentEntry::new(segment_meta, delete_cursor.clone(), None);
|
||||||
SegmentRegister {
|
segment_states.insert(segment_id, segment_entry);
|
||||||
segment_states,
|
|
||||||
opstamp_constraint: opstamp,
|
|
||||||
}
|
}
|
||||||
|
SegmentRegister { segment_states }
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(test)]
|
||||||
|
pub fn segment_entry(&self, segment_id: &SegmentId) -> Option<SegmentEntry> {
|
||||||
|
self.segment_states.get(segment_id).cloned()
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -152,6 +113,7 @@ mod tests {
|
|||||||
use core::SegmentId;
|
use core::SegmentId;
|
||||||
use core::SegmentMeta;
|
use core::SegmentMeta;
|
||||||
use indexer::delete_queue::*;
|
use indexer::delete_queue::*;
|
||||||
|
use indexer::SegmentState;
|
||||||
|
|
||||||
fn segment_ids(segment_register: &SegmentRegister) -> Vec<SegmentId> {
|
fn segment_ids(segment_register: &SegmentRegister) -> Vec<SegmentId> {
|
||||||
segment_register
|
segment_register
|
||||||
@@ -171,22 +133,52 @@ mod tests {
         let segment_id_merged = SegmentId::generate_random();
 
         {
-            let segment_meta = SegmentMeta::new(segment_id_a, 1u32);
-            let segment_entry = SegmentEntry::new(segment_meta, delete_queue.cursor(), None, 0u64);
-            segment_register.register_segment_entry(segment_entry);
+            let segment_meta = SegmentMeta::new(segment_id_a, 0u32);
+            let segment_entry = SegmentEntry::new(segment_meta, delete_queue.cursor(), None);
+            segment_register.add_segment_entry(segment_entry);
         }
+        assert_eq!(
+            segment_register
+                .segment_entry(&segment_id_a)
+                .unwrap()
+                .state(),
+            SegmentState::Ready
+        );
         assert_eq!(segment_ids(&segment_register), vec![segment_id_a]);
         {
-            let segment_meta = SegmentMeta::new(segment_id_b, 2u32);
-            let segment_entry = SegmentEntry::new(segment_meta, delete_queue.cursor(), None, 0u64);
-            segment_register.register_segment_entry(segment_entry);
+            let segment_meta = SegmentMeta::new(segment_id_b, 0u32);
+            let segment_entry = SegmentEntry::new(segment_meta, delete_queue.cursor(), None);
+            segment_register.add_segment_entry(segment_entry);
         }
+        assert_eq!(
+            segment_register
+                .segment_entry(&segment_id_b)
+                .unwrap()
+                .state(),
+            SegmentState::Ready
+        );
+        segment_register.start_merge(&segment_id_a);
+        segment_register.start_merge(&segment_id_b);
+        assert_eq!(
+            segment_register
+                .segment_entry(&segment_id_a)
+                .unwrap()
+                .state(),
+            SegmentState::InMerge
+        );
+        assert_eq!(
+            segment_register
+                .segment_entry(&segment_id_b)
+                .unwrap()
+                .state(),
+            SegmentState::InMerge
+        );
+        segment_register.remove_segment(&segment_id_a);
+        segment_register.remove_segment(&segment_id_b);
         {
-            let segment_meta_merged = SegmentMeta::new(segment_id_merged, 3u32);
-            let segment_entry =
-                SegmentEntry::new(segment_meta_merged, delete_queue.cursor(), None, 0u64);
-            segment_register.receive_merge(&[segment_id_a, segment_id_b], &segment_entry);
-            segment_register.register_segment_entry(segment_entry);
+            let segment_meta_merged = SegmentMeta::new(segment_id_merged, 0u32);
+            let segment_entry = SegmentEntry::new(segment_meta_merged, delete_queue.cursor(), None);
+            segment_register.add_segment_entry(segment_entry);
         }
         assert_eq!(segment_ids(&segment_register), vec![segment_id_merged]);
     }
@@ -16,10 +16,9 @@ use futures_cpupool::CpuFuture;
 use futures_cpupool::CpuPool;
 use indexer::delete_queue::DeleteCursor;
 use indexer::index_writer::advance_deletes;
-use indexer::merge_operation::MergeOperationInventory;
 use indexer::merger::IndexMerger;
 use indexer::stamper::Stamper;
-use indexer::MergeOperation;
+use indexer::MergeCandidate;
 use indexer::SegmentEntry;
 use indexer::SegmentSerializer;
 use indexer::{DefaultMergePolicy, MergePolicy};
@@ -27,7 +26,6 @@ use schema::Schema;
 use serde_json;
 use std::borrow::BorrowMut;
 use std::collections::HashMap;
-use std::collections::HashSet;
 use std::io::Write;
 use std::mem;
 use std::ops::DerefMut;
@@ -47,30 +45,33 @@ use Result;
 /// and flushed.
 ///
 /// This method is not part of tantivy's public API
-pub fn save_new_metas(schema: Schema, directory: &mut Directory) -> Result<()> {
-    save_metas(
-        &IndexMeta {
-            segments: Vec::new(),
-            schema,
-            opstamp: 0u64,
-            payload: None,
-        },
-        directory,
-    )
+pub fn save_new_metas(schema: Schema, opstamp: u64, directory: &mut Directory) -> Result<()> {
+    save_metas(vec![], schema, opstamp, None, directory)
 }
 
 /// Save the index meta file.
 /// This operation is atomic:
 /// Either
-/// - it fails, in which case an error is returned,
+// - it fails, in which case an error is returned,
 /// and the `meta.json` remains untouched,
 /// - it success, and `meta.json` is written
 /// and flushed.
 ///
 /// This method is not part of tantivy's public API
-fn save_metas(metas: &IndexMeta, directory: &mut Directory) -> Result<()> {
-    let mut buffer = serde_json::to_vec_pretty(metas)?;
-    // Just adding a new line at the end of the buffer.
+pub fn save_metas(
+    segment_metas: Vec<SegmentMeta>,
+    schema: Schema,
+    opstamp: u64,
+    payload: Option<String>,
+    directory: &mut Directory,
+) -> Result<()> {
+    let metas = IndexMeta {
+        segments: segment_metas,
+        schema,
+        opstamp,
+        payload,
+    };
+    let mut buffer = serde_json::to_vec_pretty(&metas)?;
     writeln!(&mut buffer)?;
     directory.atomic_write(&META_FILEPATH, &buffer[..])?;
     debug!("Saved metas {:?}", serde_json::to_string_pretty(&metas));
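Both versions of `save_metas` lean on `Directory::atomic_write` so that `meta.json` is either fully replaced or left untouched. A minimal sketch of the usual temp-file-and-rename trick behind such an API is below; the function name and error handling are illustrative, not tantivy's actual implementation.

```rust
use std::fs;
use std::io::Write;
use std::path::Path;

// Illustrative only: one common way to make a small metadata write atomic is
// to write the full payload to a sibling temporary file and then rename it
// over the target, since a rename within one filesystem is atomic on POSIX.
fn atomic_write(path: &Path, payload: &[u8]) -> std::io::Result<()> {
    let tmp_path = path.with_extension("tmp");
    {
        let mut tmp_file = fs::File::create(&tmp_path)?;
        tmp_file.write_all(payload)?;
        tmp_file.sync_all()?; // flush the data before the rename
    }
    fs::rename(&tmp_path, path) // readers see either the old file or the new one
}
```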
@@ -82,21 +83,16 @@ fn save_metas(metas: &IndexMeta, directory: &mut Directory) -> Result<()> {
 //
 // All this processing happens on a single thread
 // consuming a common queue.
-//
-// We voluntarily pass a merge_operation ref to guarantee that
-// the merge_operation is alive during the process
 #[derive(Clone)]
 pub struct SegmentUpdater(Arc<InnerSegmentUpdater>);
 
 fn perform_merge(
-    merge_operation: &MergeOperation,
     index: &Index,
     mut segment_entries: Vec<SegmentEntry>,
+    mut merged_segment: Segment,
+    target_opstamp: u64,
 ) -> Result<SegmentEntry> {
-    let target_opstamp = merge_operation.target_opstamp();
-
     // first we need to apply deletes to our segment.
-    let mut merged_segment = index.new_segment();
-
     // TODO add logging
     let schema = index.schema();
@@ -125,68 +121,60 @@ fn perform_merge(
 
     let segment_meta = SegmentMeta::new(merged_segment.id(), num_docs);
 
-    let after_merge_segment_entry = SegmentEntry::new(segment_meta.clone(), delete_cursor, None, target_opstamp);
+    let after_merge_segment_entry = SegmentEntry::new(segment_meta.clone(), delete_cursor, None);
     Ok(after_merge_segment_entry)
 }
 
 struct InnerSegmentUpdater {
-    // we keep a copy of the current active IndexMeta to
-    // avoid loading the file everytime we need it in the
-    // `SegmentUpdater`.
-    //
-    // This should be up to date as all update happen through
-    // the unique active `SegmentUpdater`.
-    active_metas: RwLock<Arc<IndexMeta>>,
     pool: CpuPool,
     index: Index,
     segment_manager: SegmentManager,
-    merge_policy: RwLock<Arc<Box<MergePolicy>>>,
+    merge_policy: RwLock<Box<MergePolicy>>,
     merging_thread_id: AtomicUsize,
     merging_threads: RwLock<HashMap<usize, JoinHandle<Result<()>>>>,
     generation: AtomicUsize,
     killed: AtomicBool,
     stamper: Stamper,
-    merge_operations: MergeOperationInventory,
 }
 
 impl SegmentUpdater {
-    pub fn create(
+    pub fn new(
         index: Index,
         stamper: Stamper,
         delete_cursor: &DeleteCursor,
     ) -> Result<SegmentUpdater> {
-        let index_meta = index.load_metas()?;
         let segments = index.searchable_segment_metas()?;
-        let opstamp = index_meta.opstamp;
-        let segment_manager = SegmentManager::from_segments(segments, delete_cursor, opstamp);
+        let segment_manager = SegmentManager::from_segments(segments, delete_cursor);
         let pool = CpuPoolBuilder::new()
             .name_prefix("segment_updater")
             .pool_size(1)
             .create();
-        let index_meta = index.load_metas()?;
         Ok(SegmentUpdater(Arc::new(InnerSegmentUpdater {
-            active_metas: RwLock::new(Arc::new(index_meta)),
             pool,
             index,
             segment_manager,
-            merge_policy: RwLock::new(Arc::new(Box::new(DefaultMergePolicy::default()))),
+            merge_policy: RwLock::new(Box::new(DefaultMergePolicy::default())),
             merging_thread_id: AtomicUsize::default(),
             merging_threads: RwLock::new(HashMap::new()),
             generation: AtomicUsize::default(),
             killed: AtomicBool::new(false),
             stamper,
-            merge_operations: Default::default(),
         })))
     }
 
-    pub fn get_merge_policy(&self) -> Arc<Box<MergePolicy>> {
-        self.0.merge_policy.read().unwrap().clone()
+    pub fn new_segment(&self) -> Segment {
+        let new_segment = self.0.index.new_segment();
+        let segment_id = new_segment.id();
+        self.0.segment_manager.write_segment(segment_id);
+        new_segment
+    }
+
+    pub fn get_merge_policy(&self) -> Box<MergePolicy> {
+        self.0.merge_policy.read().unwrap().box_clone()
     }
 
     pub fn set_merge_policy(&self, merge_policy: Box<MergePolicy>) {
-        let arc_merge_policy = Arc::new(merge_policy);
-        *self.0.merge_policy.write().unwrap() = arc_merge_policy;
+        *self.0.merge_policy.write().unwrap() = merge_policy;
     }
 
     fn get_merging_thread_id(&self) -> usize {
@@ -207,8 +195,7 @@ impl SegmentUpdater {
             segment_updater.0.segment_manager.add_segment(segment_entry);
             segment_updater.consider_merge_options();
             true
-        })
-        .forget();
+        }).forget();
         true
     } else {
         false
@@ -240,39 +227,20 @@ impl SegmentUpdater {
         if self.is_alive() {
             let index = &self.0.index;
             let directory = index.directory();
-            let mut commited_segment_metas = self.0.segment_manager.committed_segment_metas();
-
-            // We sort segment_readers by number of documents.
-            // This is an heuristic to make multithreading more efficient.
-            //
-            // This is not done at the searcher level because I had a strange
-            // use case in which I was dealing with a large static index,
-            // dispatched over 5 SSD drives.
-            //
-            // A `UnionDirectory` makes it possible to read from these
-            // 5 different drives and creates a meta.json on the fly.
-            // In order to optimize the throughput, it creates a lasagna of segments
-            // from the different drives.
-            //
-            // Segment 1 from disk 1, Segment 1 from disk 2, etc.
-            commited_segment_metas.sort_by_key(|segment_meta| -(segment_meta.max_doc() as i32));
-            let index_meta = IndexMeta {
-                segments: commited_segment_metas,
-                schema: index.schema(),
+            save_metas(
+                self.0.segment_manager.committed_segment_metas(),
+                index.schema(),
                 opstamp,
-                payload: commit_message,
-            };
-            save_metas(&index_meta, directory.box_clone().borrow_mut())
-                .expect("Could not save metas.");
-            self.store_meta(&index_meta);
+                commit_message,
+                directory.box_clone().borrow_mut(),
+            ).expect("Could not save metas.");
         }
     }
 
     pub fn garbage_collect_files(&self) -> Result<()> {
         self.run_async(move |segment_updater| {
             segment_updater.garbage_collect_files_exec();
-        })
-        .wait()
+        }).wait()
     }
 
     fn garbage_collect_files_exec(&self) {
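The comment block removed in this hunk documents a heuristic: committed segment metas are sorted by descending `max_doc` so that multithreaded search picks up the large segments first. A standalone sketch of the same ordering using `std::cmp::Reverse` instead of the `-(x as i32)` cast used in the removed line; `Meta` here is a stand-in struct, not tantivy's `SegmentMeta`.

```rust
use std::cmp::Reverse;

// Illustrative stand-in for a segment meta: only the doc count matters here.
struct Meta {
    max_doc: u32,
}

// Sort largest segments first; Reverse avoids the signed cast, which would
// misbehave for doc counts above i32::MAX.
fn sort_largest_first(metas: &mut Vec<Meta>) {
    metas.sort_by_key(|meta| Reverse(meta.max_doc));
}

fn main() {
    let mut metas = vec![Meta { max_doc: 10 }, Meta { max_doc: 5_000 }, Meta { max_doc: 300 }];
    sort_largest_first(&mut metas);
    assert_eq!(metas[0].max_doc, 5_000);
}
```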
@@ -283,94 +251,65 @@ impl SegmentUpdater {
             .garbage_collect(|| self.0.segment_manager.list_files());
     }
 
-    pub fn commit(&self, opstamp: u64, payload: Option<String>, soft: bool) -> Result<()> {
+    pub fn commit(&self, opstamp: u64, payload: Option<String>) -> Result<()> {
         self.run_async(move |segment_updater| {
             if segment_updater.is_alive() {
                 let segment_entries = segment_updater
                     .purge_deletes(opstamp)
                     .expect("Failed purge deletes");
-                if soft {
-                    // Soft commit.
-                    //
-                    // The list `segment_entries` above is what we might want to use as searchable
-                    // segment. However, we do not want to mark them as committed, and we want
-                    // to keep the current set of committed segment.
-                    segment_updater.0.segment_manager.soft_commit(opstamp, segment_entries);
-                    // ... We do not save the meta file.
-                } else {
-                    // Hard_commit. We register the new segment entries as committed.
-                    segment_updater
-                        .0
-                        .segment_manager
-                        .commit(opstamp, segment_entries);
-                    // TODO error handling.
-                    segment_updater.save_metas(opstamp, payload);
-                    segment_updater.0.index.directory().flush().unwrap();
-                }
+                segment_updater.0.segment_manager.commit(segment_entries);
+                segment_updater.save_metas(opstamp, payload);
                 segment_updater.garbage_collect_files_exec();
                 segment_updater.consider_merge_options();
             }
-        })
-        .wait()
+        }).wait()
     }
 
     pub fn start_merge(&self, segment_ids: &[SegmentId]) -> Result<Receiver<SegmentMeta>> {
-        let commit_opstamp = self.load_metas().opstamp;
-        let merge_operation = MergeOperation::new(
-            &self.0.merge_operations,
-            commit_opstamp,
-            segment_ids.to_vec(),
-        );
-        self.run_async(move |segment_updater| segment_updater.start_merge_impl(merge_operation))
-            .wait()?
-    }
-
-    fn store_meta(&self, index_meta: &IndexMeta) {
-        *self.0.active_metas.write().unwrap() = Arc::new(index_meta.clone());
-    }
-
-    fn load_metas(&self) -> Arc<IndexMeta> {
-        self.0.active_metas.read().unwrap().clone()
+        //let future_merged_segment = */
+        let segment_ids_vec = segment_ids.to_vec();
+        self.run_async(move |segment_updater| {
+            segment_updater.start_merge_impl(&segment_ids_vec[..])
+        }).wait()?
     }
 
     // `segment_ids` is required to be non-empty.
-    fn start_merge_impl(&self, merge_operation: MergeOperation) -> Result<Receiver<SegmentMeta>> {
-        assert!(
-            !merge_operation.segment_ids().is_empty(),
-            "Segment_ids cannot be empty."
-        );
+    fn start_merge_impl(&self, segment_ids: &[SegmentId]) -> Result<Receiver<SegmentMeta>> {
+        assert!(!segment_ids.is_empty(), "Segment_ids cannot be empty.");
 
         let segment_updater_clone = self.clone();
-        let segment_entries: Vec<SegmentEntry> = self
-            .0
-            .segment_manager
-            .start_merge(merge_operation.segment_ids())?;
+        let segment_entries: Vec<SegmentEntry> = self.0.segment_manager.start_merge(segment_ids)?;
 
-        // let segment_ids_vec = merge_operation.segment_ids.to_vec();
+        let segment_ids_vec = segment_ids.to_vec();
 
         let merging_thread_id = self.get_merging_thread_id();
         info!(
             "Starting merge thread #{} - {:?}",
-            merging_thread_id,
-            merge_operation.segment_ids()
+            merging_thread_id, segment_ids
         );
         let (merging_future_send, merging_future_recv) = oneshot();
 
+        let target_opstamp = self.0.stamper.stamp();
+
         // first we need to apply deletes to our segment.
         let merging_join_handle = thread::Builder::new()
             .name(format!("mergingthread-{}", merging_thread_id))
             .spawn(move || {
                 // first we need to apply deletes to our segment.
+                let merged_segment = segment_updater_clone.new_segment();
+                let merged_segment_id = merged_segment.id();
                 let merge_result = perform_merge(
-                    &merge_operation,
                     &segment_updater_clone.0.index,
                     segment_entries,
+                    merged_segment,
+                    target_opstamp,
                 );
 
                 match merge_result {
                     Ok(after_merge_segment_entry) => {
                         let merged_segment_meta = after_merge_segment_entry.meta().clone();
                         segment_updater_clone
-                            .end_merge(merge_operation, after_merge_segment_entry)
+                            .end_merge(segment_ids_vec, after_merge_segment_entry)
                             .expect("Segment updater thread is corrupted.");
 
                         // the future may fail if the listener of the oneshot future
@@ -381,18 +320,13 @@ impl SegmentUpdater {
                         let _merging_future_res = merging_future_send.send(merged_segment_meta);
                     }
                     Err(e) => {
-                        warn!(
-                            "Merge of {:?} was cancelled: {:?}",
-                            merge_operation.segment_ids(),
-                            e
-                        );
+                        warn!("Merge of {:?} was cancelled: {:?}", segment_ids_vec, e);
                         // ... cancel merge
                         if cfg!(test) {
                             panic!("Merge failed.");
                         }
-                        // As `merge_operation` will be dropped, the segment in merge state will
-                        // be available for merge again.
-                        // `merging_future_send` will be dropped, sending an error to the future.
+                        segment_updater_clone.cancel_merge(&segment_ids_vec, merged_segment_id);
+                        // merging_future_send will be dropped, sending an error to the future.
                     }
                 }
                 segment_updater_clone
@@ -402,8 +336,7 @@ impl SegmentUpdater {
                     .unwrap()
                     .remove(&merging_thread_id);
                 Ok(())
-            })
-            .expect("Failed to spawn a thread.");
+            }).expect("Failed to spawn a thread.");
         self.0
             .merging_threads
             .write()
@@ -413,35 +346,16 @@ impl SegmentUpdater {
     }
 
     fn consider_merge_options(&self) {
-        let merge_segment_ids: HashSet<SegmentId> = self.0.merge_operations.segment_in_merge();
         let (committed_segments, uncommitted_segments) =
-            get_mergeable_segments(&merge_segment_ids, &self.0.segment_manager);
+            get_mergeable_segments(&self.0.segment_manager);
 
         // Committed segments cannot be merged with uncommitted_segments.
        // We therefore consider merges using these two sets of segments independently.
         let merge_policy = self.get_merge_policy();
-        let current_opstamp = self.0.stamper.stamp();
-        let mut merge_candidates: Vec<MergeOperation> = merge_policy
-            .compute_merge_candidates(&uncommitted_segments)
-            .into_iter()
-            .map(|merge_candidate| {
-                MergeOperation::new(&self.0.merge_operations, current_opstamp, merge_candidate.0)
-            })
-            .collect();
-
-        let commit_opstamp = self.load_metas().opstamp;
-        let committed_merge_candidates = merge_policy
-            .compute_merge_candidates(&committed_segments)
-            .into_iter()
-            .map(|merge_candidate| {
-                MergeOperation::new(&self.0.merge_operations, commit_opstamp, merge_candidate.0)
-            })
-            .collect::<Vec<_>>();
-        merge_candidates.extend(committed_merge_candidates.into_iter());
-
-        for merge_operation in merge_candidates {
-            match self.start_merge_impl(merge_operation) {
+        let mut merge_candidates = merge_policy.compute_merge_candidates(&uncommitted_segments);
+        let committed_merge_candidates = merge_policy.compute_merge_candidates(&committed_segments);
+        merge_candidates.extend_from_slice(&committed_merge_candidates[..]);
+        for MergeCandidate(segment_metas) in merge_candidates {
+            match self.start_merge_impl(&segment_metas) {
                 Ok(merge_future) => {
                     if let Err(e) = merge_future.fuse().poll() {
                         error!("The merge task failed quickly after starting: {:?}", e);
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
fn cancel_merge(
|
||||||
|
&self,
|
||||||
|
before_merge_segment_ids: &[SegmentId],
|
||||||
|
after_merge_segment_entry: SegmentId,
|
||||||
|
) {
|
||||||
|
self.0
|
||||||
|
.segment_manager
|
||||||
|
.cancel_merge(before_merge_segment_ids, after_merge_segment_entry);
|
||||||
|
}
|
||||||
|
|
||||||
fn end_merge(
|
fn end_merge(
|
||||||
&self,
|
&self,
|
||||||
merge_operation: MergeOperation,
|
before_merge_segment_ids: Vec<SegmentId>,
|
||||||
mut after_merge_segment_entry: SegmentEntry,
|
mut after_merge_segment_entry: SegmentEntry,
|
||||||
) -> Result<()> {
|
) -> Result<()> {
|
||||||
self.run_async(move |segment_updater| {
|
self.run_async(move |segment_updater| {
|
||||||
info!("End merge {:?}", after_merge_segment_entry.meta());
|
info!("End merge {:?}", after_merge_segment_entry.meta());
|
||||||
let mut delete_cursor = after_merge_segment_entry.delete_cursor().clone();
|
let mut delete_cursor = after_merge_segment_entry.delete_cursor().clone();
|
||||||
if let Some(delete_operation) = delete_cursor.get() {
|
if let Some(delete_operation) = delete_cursor.get() {
|
||||||
let committed_opstamp = segment_updater.load_metas().opstamp;
|
let committed_opstamp = segment_updater
|
||||||
|
.0
|
||||||
|
.index
|
||||||
|
.load_metas()
|
||||||
|
.expect("Failed to read opstamp")
|
||||||
|
.opstamp;
|
||||||
if delete_operation.opstamp < committed_opstamp {
|
if delete_operation.opstamp < committed_opstamp {
|
||||||
let index = &segment_updater.0.index;
|
let index = &segment_updater.0.index;
|
||||||
let segment = index.segment(after_merge_segment_entry.meta().clone());
|
let segment = index.segment(after_merge_segment_entry.meta().clone());
|
||||||
@@ -475,15 +404,16 @@ impl SegmentUpdater {
|
|||||||
{
|
{
|
||||||
error!(
|
error!(
|
||||||
"Merge of {:?} was cancelled (advancing deletes failed): {:?}",
|
"Merge of {:?} was cancelled (advancing deletes failed): {:?}",
|
||||||
merge_operation.segment_ids(),
|
before_merge_segment_ids, e
|
||||||
e
|
|
||||||
);
|
);
|
||||||
|
// ... cancel merge
|
||||||
if cfg!(test) {
|
if cfg!(test) {
|
||||||
panic!("Merge failed.");
|
panic!("Merge failed.");
|
||||||
}
|
}
|
||||||
// ... cancel merge
|
segment_updater.cancel_merge(
|
||||||
// `merge_operations` are tracked. As it is dropped, the
|
&before_merge_segment_ids,
|
||||||
// the segment_ids will be available again for merge.
|
after_merge_segment_entry.segment_id(),
|
||||||
|
);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -491,14 +421,13 @@ impl SegmentUpdater {
|
|||||||
segment_updater
|
segment_updater
|
||||||
.0
|
.0
|
||||||
.segment_manager
|
.segment_manager
|
||||||
.end_merge(merge_operation.segment_ids(), after_merge_segment_entry);
|
.end_merge(&before_merge_segment_ids, after_merge_segment_entry);
|
||||||
segment_updater.consider_merge_options();
|
segment_updater.consider_merge_options();
|
||||||
info!("save metas");
|
info!("save metas");
|
||||||
let previous_metas = segment_updater.load_metas();
|
let previous_metas = segment_updater.0.index.load_metas().unwrap();
|
||||||
segment_updater.save_metas(previous_metas.opstamp, previous_metas.payload.clone());
|
segment_updater.save_metas(previous_metas.opstamp, previous_metas.payload);
|
||||||
segment_updater.garbage_collect_files_exec();
|
segment_updater.garbage_collect_files_exec();
|
||||||
})
|
}).wait()
|
||||||
.wait()
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Wait for current merging threads.
|
/// Wait for current merging threads.
|
||||||
@@ -517,25 +446,32 @@ impl SegmentUpdater {
|
|||||||
/// Obsolete files will eventually be cleaned up
|
/// Obsolete files will eventually be cleaned up
|
||||||
/// by the directory garbage collector.
|
/// by the directory garbage collector.
|
||||||
pub fn wait_merging_thread(&self) -> Result<()> {
|
pub fn wait_merging_thread(&self) -> Result<()> {
|
||||||
|
let mut num_segments: usize;
|
||||||
loop {
|
loop {
|
||||||
let merging_threads: HashMap<usize, JoinHandle<Result<()>>> = {
|
num_segments = self.0.segment_manager.num_segments();
|
||||||
|
|
||||||
|
let mut new_merging_threads = HashMap::new();
|
||||||
|
{
|
||||||
let mut merging_threads = self.0.merging_threads.write().unwrap();
|
let mut merging_threads = self.0.merging_threads.write().unwrap();
|
||||||
mem::replace(merging_threads.deref_mut(), HashMap::new())
|
mem::swap(&mut new_merging_threads, merging_threads.deref_mut());
|
||||||
};
|
|
||||||
if merging_threads.is_empty() {
|
|
||||||
return Ok(());
|
|
||||||
}
|
}
|
||||||
debug!("wait merging thread {}", merging_threads.len());
|
debug!("wait merging thread {}", new_merging_threads.len());
|
||||||
for (_, merging_thread_handle) in merging_threads {
|
for (_, merging_thread_handle) in new_merging_threads {
|
||||||
merging_thread_handle
|
merging_thread_handle
|
||||||
.join()
|
.join()
|
||||||
.map(|_| ())
|
.map(|_| ())
|
||||||
.map_err(|_| TantivyError::ErrorInThread("Merging thread failed.".into()))?;
|
.map_err(|_| TantivyError::ErrorInThread("Merging thread failed.".into()))?;
|
||||||
}
|
}
|
||||||
// Our merging thread may have queued their completed merged segment.
|
// Our merging thread may have queued their completed
|
||||||
// Let's wait for that too.
|
|
||||||
self.run_async(move |_| {}).wait()?;
|
self.run_async(move |_| {}).wait()?;
|
||||||
|
|
||||||
|
let new_num_segments = self.0.segment_manager.num_segments();
|
||||||
|
|
||||||
|
if new_num_segments >= num_segments {
|
||||||
|
break;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
Ok(())
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -548,14 +484,14 @@ mod tests {
|
|||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn test_delete_during_merge() {
|
fn test_delete_during_merge() {
|
||||||
let mut schema_builder = Schema::builder();
|
let mut schema_builder = SchemaBuilder::default();
|
||||||
let text_field = schema_builder.add_text_field("text", TEXT);
|
let text_field = schema_builder.add_text_field("text", TEXT);
|
||||||
let schema = schema_builder.build();
|
let schema = schema_builder.build();
|
||||||
|
|
||||||
let index = Index::create_in_ram(schema);
|
let index = Index::create_in_ram(schema);
|
||||||
|
|
||||||
// writing the segment
|
// writing the segment
|
||||||
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
|
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
|
||||||
index_writer.set_merge_policy(Box::new(MergeWheneverPossible));
|
index_writer.set_merge_policy(Box::new(MergeWheneverPossible));
|
||||||
|
|
||||||
{
|
{
|
||||||
@@ -585,8 +521,9 @@ mod tests {
|
|||||||
index_writer.delete_term(term);
|
index_writer.delete_term(term);
|
||||||
assert!(index_writer.commit().is_ok());
|
assert!(index_writer.commit().is_ok());
|
||||||
}
|
}
|
||||||
let reader = index.reader().unwrap();
|
|
||||||
assert_eq!(reader.searcher().num_docs(), 302);
|
index.load_searchers().unwrap();
|
||||||
|
assert_eq!(index.searcher().num_docs(), 302);
|
||||||
|
|
||||||
{
|
{
|
||||||
index_writer
|
index_writer
|
||||||
@@ -594,79 +531,8 @@ mod tests {
|
|||||||
.expect("waiting for merging threads");
|
.expect("waiting for merging threads");
|
||||||
}
|
}
|
||||||
|
|
||||||
reader.reload().unwrap();
|
index.load_searchers().unwrap();
|
||||||
assert_eq!(reader.searcher().segment_readers().len(), 1);
|
assert_eq!(index.searcher().segment_readers().len(), 1);
|
||||||
assert_eq!(reader.searcher().num_docs(), 302);
|
assert_eq!(index.searcher().num_docs(), 302);
|
||||||
}
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn delete_all_docs() {
|
|
||||||
let mut schema_builder = Schema::builder();
|
|
||||||
let text_field = schema_builder.add_text_field("text", TEXT);
|
|
||||||
let schema = schema_builder.build();
|
|
||||||
|
|
||||||
let index = Index::create_in_ram(schema);
|
|
||||||
|
|
||||||
// writing the segment
|
|
||||||
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
|
|
||||||
|
|
||||||
{
|
|
||||||
for _ in 0..100 {
|
|
||||||
index_writer.add_document(doc!(text_field=>"a"));
|
|
||||||
index_writer.add_document(doc!(text_field=>"b"));
|
|
||||||
}
|
|
||||||
assert!(index_writer.commit().is_ok());
|
|
||||||
}
|
|
||||||
|
|
||||||
{
|
|
||||||
for _ in 0..100 {
|
|
||||||
index_writer.add_document(doc!(text_field=>"c"));
|
|
||||||
index_writer.add_document(doc!(text_field=>"d"));
|
|
||||||
}
|
|
||||||
assert!(index_writer.commit().is_ok());
|
|
||||||
}
|
|
||||||
|
|
||||||
{
|
|
||||||
index_writer.add_document(doc!(text_field=>"e"));
|
|
||||||
index_writer.add_document(doc!(text_field=>"f"));
|
|
||||||
assert!(index_writer.commit().is_ok());
|
|
||||||
}
|
|
||||||
|
|
||||||
{
|
|
||||||
let seg_ids = index
|
|
||||||
.searchable_segment_ids()
|
|
||||||
.expect("Searchable segments failed.");
|
|
||||||
// docs exist, should have at least 1 segment
|
|
||||||
assert!(seg_ids.len() > 0);
|
|
||||||
}
|
|
||||||
|
|
||||||
{
|
|
||||||
let term_vals = vec!["a", "b", "c", "d", "e", "f"];
|
|
||||||
for term_val in term_vals {
|
|
||||||
let term = Term::from_field_text(text_field, term_val);
|
|
||||||
index_writer.delete_term(term);
|
|
||||||
assert!(index_writer.commit().is_ok());
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
{
|
|
||||||
index_writer
|
|
||||||
.wait_merging_threads()
|
|
||||||
.expect("waiting for merging threads");
|
|
||||||
}
|
|
||||||
|
|
||||||
let reader = index.reader().unwrap();
|
|
||||||
assert_eq!(reader.searcher().num_docs(), 0);
|
|
||||||
|
|
||||||
let seg_ids = index
|
|
||||||
.searchable_segment_ids()
|
|
||||||
.expect("Searchable segments failed.");
|
|
||||||
assert!(seg_ids.is_empty());
|
|
||||||
|
|
||||||
reader.reload().unwrap();
|
|
||||||
assert_eq!(reader.searcher().num_docs(), 0);
|
|
||||||
// empty segments should be erased
|
|
||||||
assert!(index.searchable_segment_metas().unwrap().is_empty());
|
|
||||||
assert!(reader.searcher().segment_readers().is_empty());
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -62,8 +62,7 @@ impl SegmentWriter {
                 segment.index().tokenizers().get(tokenizer_name)
             }),
             _ => None,
-        })
-        .collect();
+        }).collect();
         Ok(SegmentWriter {
             max_doc: 0,
             multifield_postings,
@@ -111,18 +110,18 @@ impl SegmentWriter {
             }
             match *field_options.field_type() {
                 FieldType::HierarchicalFacet => {
-                    let facets: Vec<&str> = field_values
+                    let facets: Vec<&[u8]> = field_values
                         .iter()
                         .flat_map(|field_value| match *field_value.value() {
-                            Value::Facet(ref facet) => Some(facet.encoded_str()),
+                            Value::Facet(ref facet) => Some(facet.encoded_bytes()),
                             _ => {
                                 panic!("Expected hierarchical facet");
                             }
-                        })
-                        .collect();
+                        }).collect();
                     let mut term = Term::for_field(field); // we set the Term
-                    for fake_str in facets {
+                    for facet_bytes in facets {
                         let mut unordered_term_id_opt = None;
+                        let fake_str = unsafe { str::from_utf8_unchecked(facet_bytes) };
                         FacetTokenizer.token_stream(fake_str).process(&mut |token| {
                             term.set_text(&token.text);
                             let unordered_term_id =
@@ -146,8 +145,7 @@ impl SegmentWriter {
                         .flat_map(|field_value| match *field_value.value() {
                             Value::Str(ref text) => Some(text.as_str()),
                             _ => None,
-                        })
-                        .collect();
+                        }).collect();
                     if texts.is_empty() {
                         0
                     } else {
@@ -171,17 +169,6 @@ impl SegmentWriter {
                     }
                 }
             }
-                FieldType::Date(ref int_option) => {
-                    if int_option.is_indexed() {
-                        for field_value in field_values {
-                            let term = Term::from_field_i64(
-                                field_value.field(),
-                                field_value.value().date_value().timestamp(),
-                            );
-                            self.multifield_postings.subscribe(doc_id, &term);
-                        }
-                    }
-                }
                 FieldType::I64(ref int_option) => {
                     if int_option.is_indexed() {
                         for field_value in field_values {
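The right-hand side of the facet hunk recovers a `&str` from the facet's encoded bytes with `str::from_utf8_unchecked`, which is only sound when those bytes are guaranteed to be valid UTF-8. A standalone sketch of the checked versus unchecked conversion, outside tantivy, with an illustrative facet path:

```rust
fn main() {
    let encoded_bytes: &[u8] = b"/category/electronics"; // hypothetical facet encoding

    // Checked conversion: validates UTF-8 and fails loudly otherwise.
    let checked: &str = std::str::from_utf8(encoded_bytes).expect("facet bytes must be UTF-8");

    // Unchecked conversion: skips validation, so the caller must uphold the
    // UTF-8 invariant itself (undefined behavior otherwise).
    let unchecked: &str = unsafe { std::str::from_utf8_unchecked(encoded_bytes) };

    assert_eq!(checked, unchecked);
}
```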
@@ -1,77 +1,50 @@
-use std::ops::Range;
-use std::sync::atomic::Ordering;
-use std::sync::Arc;
-
 // AtomicU64 have not landed in stable.
 // For the moment let's just use AtomicUsize on
 // x86/64 bit platform, and a mutex on other platform.
-#[cfg(target_arch = "x86_64")]
+#[cfg(target = "x86_64")]
 mod archicture_impl {
 
     use std::sync::atomic::{AtomicUsize, Ordering};
+    use std::sync::Arc;
 
-    #[derive(Default)]
-    pub struct AtomicU64Ersatz(AtomicUsize);
+    #[derive(Clone, Default)]
+    pub struct Stamper(Arc<AtomicU64>);
 
-    impl AtomicU64Ersatz {
-        pub fn new(first_opstamp: u64) -> AtomicU64Ersatz {
-            AtomicU64Ersatz(AtomicUsize::new(first_opstamp as usize))
+    impl Stamper {
+        pub fn new(first_opstamp: u64) -> Stamper {
+            Stamper(Arc::new(AtomicU64::new(first_opstamp)))
         }
 
-        pub fn fetch_add(&self, val: u64, order: Ordering) -> u64 {
-            self.0.fetch_add(val as usize, order) as u64
+        pub fn stamp(&self) -> u64 {
+            self.0.fetch_add(1u64, Ordering::SeqCst) as u64
         }
     }
 }
 
-#[cfg(not(target_arch = "x86_64"))]
+#[cfg(not(target = "x86_64"))]
 mod archicture_impl {
 
-    use std::sync::atomic::Ordering;
-    /// Under other architecture, we rely on a mutex.
-    use std::sync::RwLock;
+    use std::sync::{Arc, Mutex};
 
-    #[derive(Default)]
-    pub struct AtomicU64Ersatz(RwLock<u64>);
+    #[derive(Clone, Default)]
+    pub struct Stamper(Arc<Mutex<u64>>);
 
-    impl AtomicU64Ersatz {
-        pub fn new(first_opstamp: u64) -> AtomicU64Ersatz {
-            AtomicU64Ersatz(RwLock::new(first_opstamp))
+    impl Stamper {
+        pub fn new(first_opstamp: u64) -> Stamper {
+            Stamper(Arc::new(Mutex::new(first_opstamp)))
         }
 
-        pub fn fetch_add(&self, incr: u64, _order: Ordering) -> u64 {
-            let mut lock = self.0.write().unwrap();
-            let previous_val = *lock;
-            *lock = previous_val + incr;
+        pub fn stamp(&self) -> u64 {
+            let mut guard = self.0.lock().expect("Failed to lock the stamper");
+            let previous_val = *guard;
+            *guard = previous_val + 1;
             previous_val
         }
     }
 }
 
-use self::archicture_impl::AtomicU64Ersatz;
-
-#[derive(Clone, Default)]
-pub struct Stamper(Arc<AtomicU64Ersatz>);
-
-impl Stamper {
-    pub fn new(first_opstamp: u64) -> Stamper {
-        Stamper(Arc::new(AtomicU64Ersatz::new(first_opstamp)))
-    }
-
-    pub fn stamp(&self) -> u64 {
-        self.0.fetch_add(1u64, Ordering::SeqCst) as u64
-    }
-
-    /// Given a desired count `n`, `stamps` returns an iterator that
-    /// will supply `n` number of u64 stamps.
-    pub fn stamps(&self, n: u64) -> Range<u64> {
-        let start = self.0.fetch_add(n, Ordering::SeqCst);
-        Range {
-            start,
-            end: start + n,
-        }
-    }
-}
+pub use self::archicture_impl::Stamper;
 
 #[cfg(test)]
 mod test {
@@ -89,7 +62,5 @@ mod test {
 
         assert_eq!(stamper.stamp(), 10u64);
         assert_eq!(stamper_clone.stamp(), 11u64);
-        assert_eq!(stamper.stamps(3u64), (12..15));
-        assert_eq!(stamper.stamp(), 15u64);
     }
 }
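Both variants of the stamper expose the same tiny API: `stamp()` hands out strictly increasing opstamps, and the type stays cheap to clone because the counter sits behind an `Arc`. Below is a standalone sketch of the same idea on current stable Rust (where `AtomicU64` is available directly), including a `stamps(n)`-style bulk reservation; this `Stamper` is a stand-in, not tantivy's actual type.

```rust
use std::ops::Range;
use std::sync::atomic::{AtomicU64, Ordering};
use std::sync::Arc;
use std::thread;

// Minimal stand-in for the stamper above.
#[derive(Clone, Default)]
struct Stamper(Arc<AtomicU64>);

impl Stamper {
    fn new(first_opstamp: u64) -> Stamper {
        Stamper(Arc::new(AtomicU64::new(first_opstamp)))
    }

    // Each call returns a unique, monotonically increasing opstamp.
    fn stamp(&self) -> u64 {
        self.0.fetch_add(1, Ordering::SeqCst)
    }

    // Reserving n stamps at once is a single fetch_add as well.
    fn stamps(&self, n: u64) -> Range<u64> {
        let start = self.0.fetch_add(n, Ordering::SeqCst);
        start..start + n
    }
}

fn main() {
    let stamper = Stamper::new(0);
    let handles: Vec<_> = (0..4)
        .map(|_| {
            let stamper = stamper.clone();
            thread::spawn(move || (0..1000).map(|_| stamper.stamp()).collect::<Vec<u64>>())
        })
        .collect();
    let mut all: Vec<u64> = handles.into_iter().flat_map(|h| h.join().unwrap()).collect();
    all.sort();
    all.dedup();
    assert_eq!(all.len(), 4000); // no two threads ever observe the same opstamp
    assert_eq!(stamper.stamps(3), 4000..4003);
}
```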
460 src/lib.rs
@@ -1,5 +1,6 @@
 #![doc(html_logo_url = "http://fulmicoton.com/tantivy-logo/tantivy-logo.png")]
 #![cfg_attr(all(feature = "unstable", test), feature(test))]
+#![cfg_attr(feature = "cargo-clippy", feature(tool_lints))]
 #![cfg_attr(feature = "cargo-clippy", allow(clippy::module_inception))]
 #![doc(test(attr(allow(unused_variables), deny(warnings))))]
 #![warn(missing_docs)]
@@ -23,8 +24,7 @@
 //! # use tempdir::TempDir;
 //! # use tantivy::Index;
 //! # use tantivy::schema::*;
-//! # use tantivy::{Score, DocAddress};
-//! # use tantivy::collector::TopDocs;
+//! # use tantivy::collector::TopCollector;
 //! # use tantivy::query::QueryParser;
 //! #
 //! # fn main() {
@@ -46,7 +46,7 @@
 //! // in a compressed, row-oriented key-value store.
 //! // This store is useful to reconstruct the
 //! // documents that were selected during the search phase.
-//! let mut schema_builder = Schema::builder();
+//! let mut schema_builder = SchemaBuilder::default();
 //! let title = schema_builder.add_text_field("title", TEXT | STORED);
 //! let body = schema_builder.add_text_field("body", TEXT);
 //! let schema = schema_builder.build();
@@ -75,9 +75,9 @@
 //!
 //! // # Searching
 //!
-//! let reader = index.reader()?;
+//! index.load_searchers()?;
 //!
-//! let searcher = reader.searcher();
+//! let searcher = index.searcher();
 //!
 //! let query_parser = QueryParser::for_index(&index, vec![title, body]);
 //!
@@ -86,13 +86,13 @@
 //! // A ticket has been opened regarding this problem.
 //! let query = query_parser.parse_query("sea whale")?;
 //!
-//! // Perform search.
-//! // `topdocs` contains the 10 most relevant doc ids, sorted by decreasing scores...
-//! let top_docs: Vec<(Score, DocAddress)> =
-//!     searcher.search(&query, &TopDocs::with_limit(10))?;
+//! let mut top_collector = TopCollector::with_limit(10);
+//! searcher.search(&*query, &mut top_collector)?;
 //!
-//! for (_score, doc_address) in top_docs {
-//!     // Retrieve the actual content of documents given its `doc_address`.
+//! // Our top collector now contains the 10
+//! // most relevant doc ids...
+//! let doc_addresses = top_collector.docs();
+//! for doc_address in doc_addresses {
 //!     let retrieved_doc = searcher.doc(doc_address)?;
 //!     println!("{}", schema.to_json(&retrieved_doc));
 //! }
@@ -129,28 +129,31 @@ extern crate base64;
 extern crate bit_set;
 extern crate bitpacking;
 extern crate byteorder;
 extern crate combine;
 extern crate crossbeam;
+extern crate crossbeam_channel;
 extern crate fnv;
+extern crate fst;
 extern crate futures;
 extern crate futures_cpupool;
 extern crate htmlescape;
 extern crate itertools;
 extern crate levenshtein_automata;
-#[cfg(feature = "mmap")]
-extern crate memmap;
 extern crate num_cpus;
 extern crate owning_ref;
 extern crate regex;
 extern crate rust_stemmers;
-extern crate scoped_pool;
 extern crate serde;
 extern crate stable_deref_trait;
-extern crate tantivy_fst;
 extern crate tempdir;
 extern crate tempfile;
 extern crate uuid;
 
 
 
 #[cfg(test)]
 #[macro_use]
 extern crate matches;
@@ -169,12 +172,11 @@ extern crate maplit;
 extern crate test;
 
 #[macro_use]
-extern crate downcast_rs;
+extern crate downcast;
 
 #[macro_use]
 extern crate fail;
 
-#[cfg(feature = "mmap")]
 #[cfg(test)]
 mod functional_test;
 
@@ -183,19 +185,18 @@ mod macros;
 
 pub use error::TantivyError;
 
-#[deprecated(since = "0.7.0", note = "please use `tantivy::TantivyError` instead")]
+#[deprecated(
+    since = "0.7.0",
+    note = "please use `tantivy::TantivyError` instead"
+)]
 pub use error::TantivyError as Error;
 
 extern crate census;
-pub extern crate chrono;
 extern crate owned_read;
 
 /// Tantivy result.
 pub type Result<T> = std::result::Result<T, error::TantivyError>;
 
-/// Tantivy DateTime
-pub type DateTime = chrono::DateTime<chrono::Utc>;
-
 mod common;
 mod core;
 mod indexer;
@@ -212,15 +213,11 @@ pub(crate) mod positions;
 pub mod postings;
 pub mod query;
 pub mod schema;
-pub mod space_usage;
 pub mod store;
 pub mod termdict;
 
-mod reader;
-
-pub use self::reader::{IndexReader, IndexReaderBuilder, ReloadPolicy};
 mod snippet;
-pub use self::snippet::{Snippet, SnippetGenerator};
+pub use self::snippet::SnippetGenerator;
 
 mod docset;
 pub use self::docset::{DocSet, SkipResult};
@@ -238,7 +235,11 @@ pub use common::{i64_to_u64, u64_to_i64};
 /// Expose the current version of tantivy, as well
 /// whether it was compiled with the simd compression.
 pub fn version() -> &'static str {
-    env!("CARGO_PKG_VERSION")
+    if cfg!(feature = "simdcompression") {
+        concat!(env!("CARGO_PKG_VERSION"), "-simd")
+    } else {
+        concat!(env!("CARGO_PKG_VERSION"), "-nosimd")
+    }
 }
 
 /// Defines tantivy's merging strategy
@@ -299,15 +300,12 @@ mod tests {
     use docset::DocSet;
     use query::BooleanQuery;
     use rand::distributions::Bernoulli;
-    use rand::distributions::Uniform;
-    use rand::rngs::StdRng;
-    use rand::{Rng, SeedableRng};
+    use rand::distributions::Range;
+    use rand::{Rng, SeedableRng, XorShiftRng};
     use schema::*;
-    use DocAddress;
     use Index;
     use IndexWriter;
     use Postings;
-    use ReloadPolicy;
 
     pub fn assert_nearly_equals(expected: f32, val: f32) {
         assert!(
@@ -323,15 +321,16 @@ mod tests {
     }
 
     pub fn generate_nonunique_unsorted(max_value: u32, n_elems: usize) -> Vec<u32> {
-        let seed: [u8; 32] = [1; 32];
-        StdRng::from_seed(seed)
-            .sample_iter(&Uniform::new(0u32, max_value))
+        let seed: [u8; 16] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15];
+        XorShiftRng::from_seed(seed)
+            .sample_iter(&Range::new(0u32, max_value))
             .take(n_elems)
             .collect::<Vec<u32>>()
     }
 
     pub fn sample_with_seed(n: u32, ratio: f64, seed_val: u8) -> Vec<u32> {
-        StdRng::from_seed([seed_val; 32])
+        let seed: [u8; 16] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, seed_val];
+        XorShiftRng::from_seed(seed)
             .sample_iter(&Bernoulli::new(ratio))
             .take(n as usize)
             .enumerate()
@@ -346,13 +345,13 @@ mod tests {
     #[test]
     #[cfg(feature = "mmap")]
     fn test_indexing() {
-        let mut schema_builder = Schema::builder();
+        let mut schema_builder = SchemaBuilder::default();
         let text_field = schema_builder.add_text_field("text", TEXT);
         let schema = schema_builder.build();
         let index = Index::create_from_tempdir(schema).unwrap();
         {
             // writing the segment
-            let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
+            let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
             {
                 let doc = doc!(text_field=>"af b");
                 index_writer.add_document(doc);
@@ -371,10 +370,10 @@ mod tests {
 
     #[test]
     fn test_docfreq1() {
-        let mut schema_builder = Schema::builder();
+        let mut schema_builder = SchemaBuilder::default();
         let text_field = schema_builder.add_text_field("text", TEXT);
         let index = Index::create_in_ram(schema_builder.build());
-        let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
+        let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
         {
             index_writer.add_document(doc!(text_field=>"a b c"));
             index_writer.commit().unwrap();
@@ -396,8 +395,8 @@ mod tests {
             index_writer.commit().unwrap();
         }
         {
-            let reader = index.reader().unwrap();
-            let searcher = reader.searcher();
+            index.load_searchers().unwrap();
+            let searcher = index.searcher();
             let term_a = Term::from_field_text(text_field, "a");
             assert_eq!(searcher.doc_freq(&term_a), 3);
             let term_b = Term::from_field_text(text_field, "b");
@@ -411,12 +410,12 @@ mod tests {
 
     #[test]
     fn test_fieldnorm_no_docs_with_field() {
-        let mut schema_builder = Schema::builder();
+        let mut schema_builder = SchemaBuilder::default();
         let title_field = schema_builder.add_text_field("title", TEXT);
         let text_field = schema_builder.add_text_field("text", TEXT);
         let index = Index::create_in_ram(schema_builder.build());
         {
-            let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
+            let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
             {
                 let doc = doc!(text_field=>"a b c");
                 index_writer.add_document(doc);
@@ -424,8 +423,8 @@ mod tests {
             index_writer.commit().unwrap();
         }
         {
-            let index_reader = index.reader().unwrap();
-            let searcher = index_reader.searcher();
+            index.load_searchers().unwrap();
+            let searcher = index.searcher();
             let reader = searcher.segment_reader(0);
             {
                 let fieldnorm_reader = reader.get_fieldnorms_reader(text_field);
@@ -440,11 +439,11 @@ mod tests {
 
     #[test]
     fn test_fieldnorm() {
-        let mut schema_builder = Schema::builder();
+        let mut schema_builder = SchemaBuilder::default();
         let text_field = schema_builder.add_text_field("text", TEXT);
         let index = Index::create_in_ram(schema_builder.build());
         {
-            let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
+            let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
             {
                 let doc = doc!(text_field=>"a b c");
                 index_writer.add_document(doc);
@@ -460,8 +459,8 @@ mod tests {
             index_writer.commit().unwrap();
         }
         {
-            let reader = index.reader().unwrap();
-            let searcher = reader.searcher();
+            index.load_searchers().unwrap();
+            let searcher = index.searcher();
             let segment_reader: &SegmentReader = searcher.segment_reader(0);
             let fieldnorms_reader = segment_reader.get_fieldnorms_reader(text_field);
             assert_eq!(fieldnorms_reader.fieldnorm(0), 3);
@@ -481,7 +480,7 @@ mod tests {
 
     #[test]
     fn test_delete_postings1() {
-        let mut schema_builder = Schema::builder();
+        let mut schema_builder = SchemaBuilder::default();
         let text_field = schema_builder.add_text_field("text", TEXT);
         let term_abcd = Term::from_field_text(text_field, "abcd");
         let term_a = Term::from_field_text(text_field, "a");
@@ -489,151 +488,183 @@ mod tests {
         let term_c = Term::from_field_text(text_field, "c");
         let schema = schema_builder.build();
         let index = Index::create_in_ram(schema);
-        let reader = index
-            .reader_builder()
-            .reload_policy(ReloadPolicy::Manual)
-            .try_into()
-            .unwrap();
         {
             // writing the segment
-            let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
-            // 0
-            index_writer.add_document(doc!(text_field=>"a b"));
-            // 1
-            index_writer.add_document(doc!(text_field=>" a c"));
-            // 2
-            index_writer.add_document(doc!(text_field=>" b c"));
-            // 3
-            index_writer.add_document(doc!(text_field=>" b d"));
-
-            index_writer.delete_term(Term::from_field_text(text_field, "c"));
-            index_writer.delete_term(Term::from_field_text(text_field, "a"));
-            // 4
-            index_writer.add_document(doc!(text_field=>" b c"));
-            // 5
-            index_writer.add_document(doc!(text_field=>" a"));
+            let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
+            {
+                // 0
+                let doc = doc!(text_field=>"a b");
+                index_writer.add_document(doc);
+            }
+            {
+                // 1
+                let doc = doc!(text_field=>" a c");
+                index_writer.add_document(doc);
+            }
+            {
+                // 2
+                let doc = doc!(text_field=>" b c");
+                index_writer.add_document(doc);
+            }
+            {
+                // 3
+                let doc = doc!(text_field=>" b d");
+                index_writer.add_document(doc);
+            }
+            {
+                index_writer.delete_term(Term::from_field_text(text_field, "c"));
+            }
+            {
+                index_writer.delete_term(Term::from_field_text(text_field, "a"));
+            }
+            {
+                // 4
+                let doc = doc!(text_field=>" b c");
+                index_writer.add_document(doc);
+            }
+            {
+                // 5
+                let doc = doc!(text_field=>" a");
+                index_writer.add_document(doc);
+            }
             index_writer.commit().unwrap();
         }
         {
-            reader.reload().unwrap();
-            let searcher = reader.searcher();
+            index.load_searchers().unwrap();
+            let searcher = index.searcher();
             let segment_reader = searcher.segment_reader(0);
|
let reader = searcher.segment_reader(0);
|
||||||
let inverted_index = segment_reader.inverted_index(text_field);
|
let inverted_index = reader.inverted_index(text_field);
|
||||||
assert!(inverted_index
|
assert!(
|
||||||
.read_postings(&term_abcd, IndexRecordOption::WithFreqsAndPositions)
|
inverted_index
|
||||||
.is_none());
|
.read_postings(&term_abcd, IndexRecordOption::WithFreqsAndPositions)
|
||||||
|
.is_none()
|
||||||
|
);
|
||||||
{
|
{
|
||||||
let mut postings = inverted_index
|
let mut postings = inverted_index
|
||||||
.read_postings(&term_a, IndexRecordOption::WithFreqsAndPositions)
|
.read_postings(&term_a, IndexRecordOption::WithFreqsAndPositions)
|
||||||
.unwrap();
|
.unwrap();
|
||||||
assert!(advance_undeleted(&mut postings, segment_reader));
|
assert!(advance_undeleted(&mut postings, reader));
|
||||||
assert_eq!(postings.doc(), 5);
|
assert_eq!(postings.doc(), 5);
|
||||||
assert!(!advance_undeleted(&mut postings, segment_reader));
|
assert!(!advance_undeleted(&mut postings, reader));
|
||||||
}
|
}
|
||||||
{
|
{
|
||||||
let mut postings = inverted_index
|
let mut postings = inverted_index
|
||||||
.read_postings(&term_b, IndexRecordOption::WithFreqsAndPositions)
|
.read_postings(&term_b, IndexRecordOption::WithFreqsAndPositions)
|
||||||
.unwrap();
|
.unwrap();
|
||||||
assert!(advance_undeleted(&mut postings, segment_reader));
|
assert!(advance_undeleted(&mut postings, reader));
|
||||||
assert_eq!(postings.doc(), 3);
|
assert_eq!(postings.doc(), 3);
|
||||||
assert!(advance_undeleted(&mut postings, segment_reader));
|
assert!(advance_undeleted(&mut postings, reader));
|
||||||
assert_eq!(postings.doc(), 4);
|
assert_eq!(postings.doc(), 4);
|
||||||
assert!(!advance_undeleted(&mut postings, segment_reader));
|
assert!(!advance_undeleted(&mut postings, reader));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
{
|
{
|
||||||
// writing the segment
|
// writing the segment
|
||||||
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
|
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
|
||||||
// 0
|
{
|
||||||
index_writer.add_document(doc!(text_field=>"a b"));
|
// 0
|
||||||
// 1
|
let doc = doc!(text_field=>"a b");
|
||||||
index_writer.delete_term(Term::from_field_text(text_field, "c"));
|
index_writer.add_document(doc);
|
||||||
|
}
|
||||||
|
{
|
||||||
|
// 1
|
||||||
|
index_writer.delete_term(Term::from_field_text(text_field, "c"));
|
||||||
|
}
|
||||||
index_writer.rollback().unwrap();
|
index_writer.rollback().unwrap();
|
||||||
}
|
}
|
||||||
{
|
{
|
||||||
reader.reload().unwrap();
|
index.load_searchers().unwrap();
|
||||||
let searcher = reader.searcher();
|
let searcher = index.searcher();
|
||||||
let seg_reader = searcher.segment_reader(0);
|
let reader = searcher.segment_reader(0);
|
||||||
let inverted_index = seg_reader.inverted_index(term_abcd.field());
|
let inverted_index = reader.inverted_index(term_abcd.field());
|
||||||
|
|
||||||
assert!(inverted_index
|
assert!(
|
||||||
.read_postings(&term_abcd, IndexRecordOption::WithFreqsAndPositions)
|
inverted_index
|
||||||
.is_none());
|
.read_postings(&term_abcd, IndexRecordOption::WithFreqsAndPositions)
|
||||||
|
.is_none()
|
||||||
|
);
|
||||||
{
|
{
|
||||||
let mut postings = inverted_index
|
let mut postings = inverted_index
|
||||||
.read_postings(&term_a, IndexRecordOption::WithFreqsAndPositions)
|
.read_postings(&term_a, IndexRecordOption::WithFreqsAndPositions)
|
||||||
.unwrap();
|
.unwrap();
|
||||||
assert!(advance_undeleted(&mut postings, seg_reader));
|
assert!(advance_undeleted(&mut postings, reader));
|
||||||
assert_eq!(postings.doc(), 5);
|
assert_eq!(postings.doc(), 5);
|
||||||
assert!(!advance_undeleted(&mut postings, seg_reader));
|
assert!(!advance_undeleted(&mut postings, reader));
|
||||||
}
|
}
|
||||||
{
|
{
|
||||||
let mut postings = inverted_index
|
let mut postings = inverted_index
|
||||||
.read_postings(&term_b, IndexRecordOption::WithFreqsAndPositions)
|
.read_postings(&term_b, IndexRecordOption::WithFreqsAndPositions)
|
||||||
.unwrap();
|
.unwrap();
|
||||||
assert!(advance_undeleted(&mut postings, seg_reader));
|
assert!(advance_undeleted(&mut postings, reader));
|
||||||
assert_eq!(postings.doc(), 3);
|
assert_eq!(postings.doc(), 3);
|
||||||
assert!(advance_undeleted(&mut postings, seg_reader));
|
assert!(advance_undeleted(&mut postings, reader));
|
||||||
assert_eq!(postings.doc(), 4);
|
assert_eq!(postings.doc(), 4);
|
||||||
assert!(!advance_undeleted(&mut postings, seg_reader));
|
assert!(!advance_undeleted(&mut postings, reader));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
{
|
{
|
||||||
// writing the segment
|
// writing the segment
|
||||||
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
|
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
|
||||||
index_writer.add_document(doc!(text_field=>"a b"));
|
{
|
||||||
index_writer.delete_term(Term::from_field_text(text_field, "c"));
|
let doc = doc!(text_field=>"a b");
|
||||||
|
index_writer.add_document(doc);
|
||||||
|
}
|
||||||
|
{
|
||||||
|
index_writer.delete_term(Term::from_field_text(text_field, "c"));
|
||||||
|
}
|
||||||
index_writer.rollback().unwrap();
|
index_writer.rollback().unwrap();
|
||||||
index_writer.delete_term(Term::from_field_text(text_field, "a"));
|
index_writer.delete_term(Term::from_field_text(text_field, "a"));
|
||||||
index_writer.commit().unwrap();
|
index_writer.commit().unwrap();
|
||||||
}
|
}
|
||||||
{
|
{
|
||||||
reader.reload().unwrap();
|
index.load_searchers().unwrap();
|
||||||
let searcher = reader.searcher();
|
let searcher = index.searcher();
|
||||||
let segment_reader = searcher.segment_reader(0);
|
let reader = searcher.segment_reader(0);
|
||||||
let inverted_index = segment_reader.inverted_index(term_abcd.field());
|
let inverted_index = reader.inverted_index(term_abcd.field());
|
||||||
assert!(inverted_index
|
assert!(
|
||||||
.read_postings(&term_abcd, IndexRecordOption::WithFreqsAndPositions)
|
inverted_index
|
||||||
.is_none());
|
.read_postings(&term_abcd, IndexRecordOption::WithFreqsAndPositions)
|
||||||
|
.is_none()
|
||||||
|
);
|
||||||
{
|
{
|
||||||
let mut postings = inverted_index
|
let mut postings = inverted_index
|
||||||
.read_postings(&term_a, IndexRecordOption::WithFreqsAndPositions)
|
.read_postings(&term_a, IndexRecordOption::WithFreqsAndPositions)
|
||||||
.unwrap();
|
.unwrap();
|
||||||
assert!(!advance_undeleted(&mut postings, segment_reader));
|
assert!(!advance_undeleted(&mut postings, reader));
|
||||||
}
|
}
|
||||||
{
|
{
|
||||||
let mut postings = inverted_index
|
let mut postings = inverted_index
|
||||||
.read_postings(&term_b, IndexRecordOption::WithFreqsAndPositions)
|
.read_postings(&term_b, IndexRecordOption::WithFreqsAndPositions)
|
||||||
.unwrap();
|
.unwrap();
|
||||||
assert!(advance_undeleted(&mut postings, segment_reader));
|
assert!(advance_undeleted(&mut postings, reader));
|
||||||
assert_eq!(postings.doc(), 3);
|
assert_eq!(postings.doc(), 3);
|
||||||
assert!(advance_undeleted(&mut postings, segment_reader));
|
assert!(advance_undeleted(&mut postings, reader));
|
||||||
assert_eq!(postings.doc(), 4);
|
assert_eq!(postings.doc(), 4);
|
||||||
assert!(!advance_undeleted(&mut postings, segment_reader));
|
assert!(!advance_undeleted(&mut postings, reader));
|
||||||
}
|
}
|
||||||
{
|
{
|
||||||
let mut postings = inverted_index
|
let mut postings = inverted_index
|
||||||
.read_postings(&term_c, IndexRecordOption::WithFreqsAndPositions)
|
.read_postings(&term_c, IndexRecordOption::WithFreqsAndPositions)
|
||||||
.unwrap();
|
.unwrap();
|
||||||
assert!(advance_undeleted(&mut postings, segment_reader));
|
assert!(advance_undeleted(&mut postings, reader));
|
||||||
assert_eq!(postings.doc(), 4);
|
assert_eq!(postings.doc(), 4);
|
||||||
assert!(!advance_undeleted(&mut postings, segment_reader));
|
assert!(!advance_undeleted(&mut postings, reader));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn test_indexed_u64() {
|
fn test_indexed_u64() {
|
||||||
let mut schema_builder = Schema::builder();
|
let mut schema_builder = SchemaBuilder::default();
|
||||||
let field = schema_builder.add_u64_field("value", INDEXED);
|
let field = schema_builder.add_u64_field("value", INT_INDEXED);
|
||||||
let schema = schema_builder.build();
|
let schema = schema_builder.build();
|
||||||
|
|
||||||
let index = Index::create_in_ram(schema);
|
let index = Index::create_in_ram(schema);
|
||||||
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
|
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
|
||||||
index_writer.add_document(doc!(field=>1u64));
|
index_writer.add_document(doc!(field=>1u64));
|
||||||
index_writer.commit().unwrap();
|
index_writer.commit().unwrap();
|
||||||
let reader = index.reader().unwrap();
|
index.load_searchers().unwrap();
|
||||||
let searcher = reader.searcher();
|
let searcher = index.searcher();
|
||||||
let term = Term::from_field_u64(field, 1u64);
|
let term = Term::from_field_u64(field, 1u64);
|
||||||
let mut postings = searcher
|
let mut postings = searcher
|
||||||
.segment_reader(0)
|
.segment_reader(0)
|
||||||
@@ -647,17 +678,17 @@ mod tests {
|
|||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn test_indexed_i64() {
|
fn test_indexed_i64() {
|
||||||
let mut schema_builder = Schema::builder();
|
let mut schema_builder = SchemaBuilder::default();
|
||||||
let value_field = schema_builder.add_i64_field("value", INDEXED);
|
let value_field = schema_builder.add_i64_field("value", INT_INDEXED);
|
||||||
let schema = schema_builder.build();
|
let schema = schema_builder.build();
|
||||||
|
|
||||||
let index = Index::create_in_ram(schema);
|
let index = Index::create_in_ram(schema);
|
||||||
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
|
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
|
||||||
let negative_val = -1i64;
|
let negative_val = -1i64;
|
||||||
index_writer.add_document(doc!(value_field => negative_val));
|
index_writer.add_document(doc!(value_field => negative_val));
|
||||||
index_writer.commit().unwrap();
|
index_writer.commit().unwrap();
|
||||||
let reader = index.reader().unwrap();
|
index.load_searchers().unwrap();
|
||||||
let searcher = reader.searcher();
|
let searcher = index.searcher();
|
||||||
let term = Term::from_field_i64(value_field, negative_val);
|
let term = Term::from_field_i64(value_field, negative_val);
|
||||||
let mut postings = searcher
|
let mut postings = searcher
|
||||||
.segment_reader(0)
|
.segment_reader(0)
|
||||||
@@ -671,34 +702,29 @@ mod tests {
|
|||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn test_indexedfield_not_in_documents() {
|
fn test_indexedfield_not_in_documents() {
|
||||||
let mut schema_builder = Schema::builder();
|
let mut schema_builder = SchemaBuilder::default();
|
||||||
let text_field = schema_builder.add_text_field("text", TEXT);
|
let text_field = schema_builder.add_text_field("text", TEXT);
|
||||||
let absent_field = schema_builder.add_text_field("text", TEXT);
|
let absent_field = schema_builder.add_text_field("text", TEXT);
|
||||||
let schema = schema_builder.build();
|
let schema = schema_builder.build();
|
||||||
let index = Index::create_in_ram(schema);
|
let index = Index::create_in_ram(schema);
|
||||||
let mut index_writer = index.writer_with_num_threads(2, 6_000_000).unwrap();
|
let mut index_writer = index.writer_with_num_threads(2, 40_000_000).unwrap();
|
||||||
index_writer.add_document(doc!(text_field=>"a"));
|
index_writer.add_document(doc!(text_field=>"a"));
|
||||||
assert!(index_writer.commit().is_ok());
|
assert!(index_writer.commit().is_ok());
|
||||||
let reader = index.reader().unwrap();
|
assert!(index.load_searchers().is_ok());
|
||||||
let searcher = reader.searcher();
|
let searcher = index.searcher();
|
||||||
let segment_reader = searcher.segment_reader(0);
|
let segment_reader = searcher.segment_reader(0);
|
||||||
segment_reader.inverted_index(absent_field); //< should not panic
|
segment_reader.inverted_index(absent_field); //< should not panic
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn test_delete_postings2() {
|
fn test_delete_postings2() {
|
||||||
let mut schema_builder = Schema::builder();
|
let mut schema_builder = SchemaBuilder::default();
|
||||||
let text_field = schema_builder.add_text_field("text", TEXT);
|
let text_field = schema_builder.add_text_field("text", TEXT);
|
||||||
let schema = schema_builder.build();
|
let schema = schema_builder.build();
|
||||||
let index = Index::create_in_ram(schema);
|
let index = Index::create_in_ram(schema);
|
||||||
let reader = index
|
|
||||||
.reader_builder()
|
|
||||||
.reload_policy(ReloadPolicy::Manual)
|
|
||||||
.try_into()
|
|
||||||
.unwrap();
|
|
||||||
|
|
||||||
// writing the segment
|
// writing the segment
|
||||||
let mut index_writer = index.writer_with_num_threads(2, 6_000_000).unwrap();
|
let mut index_writer = index.writer_with_num_threads(2, 40_000_000).unwrap();
|
||||||
|
|
||||||
let add_document = |index_writer: &mut IndexWriter, val: &'static str| {
|
let add_document = |index_writer: &mut IndexWriter, val: &'static str| {
|
||||||
let doc = doc!(text_field=>val);
|
let doc = doc!(text_field=>val);
|
||||||
@@ -721,20 +747,20 @@ mod tests {
|
|||||||
remove_document(&mut index_writer, "38");
|
remove_document(&mut index_writer, "38");
|
||||||
remove_document(&mut index_writer, "34");
|
remove_document(&mut index_writer, "34");
|
||||||
index_writer.commit().unwrap();
|
index_writer.commit().unwrap();
|
||||||
reader.reload().unwrap();
|
index.load_searchers().unwrap();
|
||||||
let searcher = reader.searcher();
|
let searcher = index.searcher();
|
||||||
assert_eq!(searcher.num_docs(), 6);
|
assert_eq!(searcher.num_docs(), 6);
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn test_termfreq() {
|
fn test_termfreq() {
|
||||||
let mut schema_builder = Schema::builder();
|
let mut schema_builder = SchemaBuilder::default();
|
||||||
let text_field = schema_builder.add_text_field("text", TEXT);
|
let text_field = schema_builder.add_text_field("text", TEXT);
|
||||||
let schema = schema_builder.build();
|
let schema = schema_builder.build();
|
||||||
let index = Index::create_in_ram(schema);
|
let index = Index::create_in_ram(schema);
|
||||||
{
|
{
|
||||||
// writing the segment
|
// writing the segment
|
||||||
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
|
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
|
||||||
{
|
{
|
||||||
let doc = doc!(text_field=>"af af af bc bc");
|
let doc = doc!(text_field=>"af af af bc bc");
|
||||||
index_writer.add_document(doc);
|
index_writer.add_document(doc);
|
||||||
@@ -742,14 +768,16 @@ mod tests {
|
|||||||
index_writer.commit().unwrap();
|
index_writer.commit().unwrap();
|
||||||
}
|
}
|
||||||
{
|
{
|
||||||
let index_reader = index.reader().unwrap();
|
index.load_searchers().unwrap();
|
||||||
let searcher = index_reader.searcher();
|
let searcher = index.searcher();
|
||||||
let reader = searcher.segment_reader(0);
|
let reader = searcher.segment_reader(0);
|
||||||
let inverted_index = reader.inverted_index(text_field);
|
let inverted_index = reader.inverted_index(text_field);
|
||||||
let term_abcd = Term::from_field_text(text_field, "abcd");
|
let term_abcd = Term::from_field_text(text_field, "abcd");
|
||||||
assert!(inverted_index
|
assert!(
|
||||||
.read_postings(&term_abcd, IndexRecordOption::WithFreqsAndPositions)
|
inverted_index
|
||||||
.is_none());
|
.read_postings(&term_abcd, IndexRecordOption::WithFreqsAndPositions)
|
||||||
|
.is_none()
|
||||||
|
);
|
||||||
let term_af = Term::from_field_text(text_field, "af");
|
let term_af = Term::from_field_text(text_field, "af");
|
||||||
let mut postings = inverted_index
|
let mut postings = inverted_index
|
||||||
.read_postings(&term_af, IndexRecordOption::WithFreqsAndPositions)
|
.read_postings(&term_af, IndexRecordOption::WithFreqsAndPositions)
|
||||||
@@ -763,84 +791,109 @@ mod tests {
|
|||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn test_searcher_1() {
|
fn test_searcher_1() {
|
||||||
let mut schema_builder = Schema::builder();
|
let mut schema_builder = SchemaBuilder::default();
|
||||||
let text_field = schema_builder.add_text_field("text", TEXT);
|
let text_field = schema_builder.add_text_field("text", TEXT);
|
||||||
let schema = schema_builder.build();
|
let schema = schema_builder.build();
|
||||||
let index = Index::create_in_ram(schema);
|
let index = Index::create_in_ram(schema);
|
||||||
let reader = index.reader().unwrap();
|
|
||||||
{
|
{
|
||||||
// writing the segment
|
// writing the segment
|
||||||
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
|
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
|
||||||
index_writer.add_document(doc!(text_field=>"af af af b"));
|
{
|
||||||
index_writer.add_document(doc!(text_field=>"a b c"));
|
let doc = doc!(text_field=>"af af af b");
|
||||||
index_writer.add_document(doc!(text_field=>"a b c d"));
|
index_writer.add_document(doc);
|
||||||
|
}
|
||||||
|
{
|
||||||
|
let doc = doc!(text_field=>"a b c");
|
||||||
|
index_writer.add_document(doc);
|
||||||
|
}
|
||||||
|
{
|
||||||
|
let doc = doc!(text_field=>"a b c d");
|
||||||
|
index_writer.add_document(doc);
|
||||||
|
}
|
||||||
index_writer.commit().unwrap();
|
index_writer.commit().unwrap();
|
||||||
}
|
}
|
||||||
{
|
{
|
||||||
reader.reload().unwrap();
|
index.load_searchers().unwrap();
|
||||||
let searcher = reader.searcher();
|
let searcher = index.searcher();
|
||||||
let get_doc_ids = |terms: Vec<Term>| {
|
let get_doc_ids = |terms: Vec<Term>| {
|
||||||
let query = BooleanQuery::new_multiterms_query(terms);
|
let query = BooleanQuery::new_multiterms_query(terms);
|
||||||
let topdocs = searcher.search(&query, &TestCollector).unwrap();
|
let mut collector = TestCollector::default();
|
||||||
topdocs.docs().to_vec()
|
assert!(searcher.search(&query, &mut collector).is_ok());
|
||||||
|
collector.docs()
|
||||||
};
|
};
|
||||||
assert_eq!(
|
{
|
||||||
get_doc_ids(vec![Term::from_field_text(text_field, "a")]),
|
assert_eq!(
|
||||||
vec![DocAddress(0, 1), DocAddress(0, 2)]
|
get_doc_ids(vec![Term::from_field_text(text_field, "a")]),
|
||||||
);
|
vec![1, 2]
|
||||||
assert_eq!(
|
);
|
||||||
get_doc_ids(vec![Term::from_field_text(text_field, "af")]),
|
}
|
||||||
vec![DocAddress(0, 0)]
|
{
|
||||||
);
|
assert_eq!(
|
||||||
assert_eq!(
|
get_doc_ids(vec![Term::from_field_text(text_field, "af")]),
|
||||||
get_doc_ids(vec![Term::from_field_text(text_field, "b")]),
|
vec![0]
|
||||||
vec![DocAddress(0, 0), DocAddress(0, 1), DocAddress(0, 2)]
|
);
|
||||||
);
|
}
|
||||||
assert_eq!(
|
{
|
||||||
get_doc_ids(vec![Term::from_field_text(text_field, "c")]),
|
assert_eq!(
|
||||||
vec![DocAddress(0, 1), DocAddress(0, 2)]
|
get_doc_ids(vec![Term::from_field_text(text_field, "b")]),
|
||||||
);
|
vec![0, 1, 2]
|
||||||
assert_eq!(
|
);
|
||||||
get_doc_ids(vec![Term::from_field_text(text_field, "d")]),
|
}
|
||||||
vec![DocAddress(0, 2)]
|
{
|
||||||
);
|
assert_eq!(
|
||||||
assert_eq!(
|
get_doc_ids(vec![Term::from_field_text(text_field, "c")]),
|
||||||
get_doc_ids(vec![
|
vec![1, 2]
|
||||||
Term::from_field_text(text_field, "b"),
|
);
|
||||||
Term::from_field_text(text_field, "a"),
|
}
|
||||||
]),
|
{
|
||||||
vec![DocAddress(0, 0), DocAddress(0, 1), DocAddress(0, 2)]
|
assert_eq!(
|
||||||
);
|
get_doc_ids(vec![Term::from_field_text(text_field, "d")]),
|
||||||
|
vec![2]
|
||||||
|
);
|
||||||
|
}
|
||||||
|
{
|
||||||
|
assert_eq!(
|
||||||
|
get_doc_ids(vec![
|
||||||
|
Term::from_field_text(text_field, "b"),
|
||||||
|
Term::from_field_text(text_field, "a"),
|
||||||
|
]),
|
||||||
|
vec![0, 1, 2]
|
||||||
|
);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn test_searcher_2() {
|
fn test_searcher_2() {
|
||||||
let mut schema_builder = Schema::builder();
|
let mut schema_builder = SchemaBuilder::default();
|
||||||
let text_field = schema_builder.add_text_field("text", TEXT);
|
let text_field = schema_builder.add_text_field("text", TEXT);
|
||||||
let schema = schema_builder.build();
|
let schema = schema_builder.build();
|
||||||
let index = Index::create_in_ram(schema);
|
let index = Index::create_in_ram(schema);
|
||||||
let reader = index
|
|
||||||
.reader_builder()
|
|
||||||
.reload_policy(ReloadPolicy::Manual)
|
|
||||||
.try_into()
|
|
||||||
.unwrap();
|
|
||||||
assert_eq!(reader.searcher().num_docs(), 0u64);
|
|
||||||
{
|
{
|
||||||
// writing the segment
|
// writing the segment
|
||||||
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
|
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
|
||||||
index_writer.add_document(doc!(text_field=>"af b"));
|
{
|
||||||
index_writer.add_document(doc!(text_field=>"a b c"));
|
let doc = doc!(text_field=>"af b");
|
||||||
index_writer.add_document(doc!(text_field=>"a b c d"));
|
index_writer.add_document(doc);
|
||||||
|
}
|
||||||
|
{
|
||||||
|
let doc = doc!(text_field=>"a b c");
|
||||||
|
index_writer.add_document(doc);
|
||||||
|
}
|
||||||
|
{
|
||||||
|
let doc = doc!(text_field=>"a b c d");
|
||||||
|
index_writer.add_document(doc);
|
||||||
|
}
|
||||||
index_writer.commit().unwrap();
|
index_writer.commit().unwrap();
|
||||||
}
|
}
|
||||||
reader.reload().unwrap();
|
index.searcher();
|
||||||
assert_eq!(reader.searcher().num_docs(), 3u64);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn test_doc_macro() {
|
fn test_doc_macro() {
|
||||||
let mut schema_builder = Schema::builder();
|
let mut schema_builder = SchemaBuilder::default();
|
||||||
let text_field = schema_builder.add_text_field("text", TEXT);
|
let text_field = schema_builder.add_text_field("text", TEXT);
|
||||||
let other_text_field = schema_builder.add_text_field("text2", TEXT);
|
let other_text_field = schema_builder.add_text_field("text2", TEXT);
|
||||||
let document = doc!(text_field => "tantivy",
|
let document = doc!(text_field => "tantivy",
|
||||||
@@ -858,11 +911,11 @@ mod tests {
|
|||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn test_wrong_fast_field_type() {
|
fn test_wrong_fast_field_type() {
|
||||||
let mut schema_builder = Schema::builder();
|
let mut schema_builder = SchemaBuilder::default();
|
||||||
let fast_field_unsigned = schema_builder.add_u64_field("unsigned", FAST);
|
let fast_field_unsigned = schema_builder.add_u64_field("unsigned", FAST);
|
||||||
let fast_field_signed = schema_builder.add_i64_field("signed", FAST);
|
let fast_field_signed = schema_builder.add_i64_field("signed", FAST);
|
||||||
let text_field = schema_builder.add_text_field("text", TEXT);
|
let text_field = schema_builder.add_text_field("text", TEXT);
|
||||||
let stored_int_field = schema_builder.add_u64_field("text", STORED);
|
let stored_int_field = schema_builder.add_u64_field("text", INT_STORED);
|
||||||
let schema = schema_builder.build();
|
let schema = schema_builder.build();
|
||||||
|
|
||||||
let index = Index::create_in_ram(schema);
|
let index = Index::create_in_ram(schema);
|
||||||
@@ -872,8 +925,9 @@ mod tests {
|
|||||||
index_writer.add_document(document);
|
index_writer.add_document(document);
|
||||||
index_writer.commit().unwrap();
|
index_writer.commit().unwrap();
|
||||||
}
|
}
|
||||||
let reader = index.reader().unwrap();
|
|
||||||
let searcher = reader.searcher();
|
index.load_searchers().unwrap();
|
||||||
|
let searcher = index.searcher();
|
||||||
let segment_reader: &SegmentReader = searcher.segment_reader(0);
|
let segment_reader: &SegmentReader = searcher.segment_reader(0);
|
||||||
{
|
{
|
||||||
let fast_field_reader_res = segment_reader.fast_field_reader::<u64>(text_field);
|
let fast_field_reader_res = segment_reader.fast_field_reader::<u64>(text_field);
|
||||||
|
|||||||
@@ -26,12 +26,12 @@
 /// #[macro_use]
 /// extern crate tantivy;
 ///
-/// use tantivy::schema::{Schema, TEXT, FAST};
+/// use tantivy::schema::{SchemaBuilder, TEXT, FAST};
 ///
 /// //...
 ///
 /// # fn main() {
-/// let mut schema_builder = Schema::builder();
+/// let mut schema_builder = SchemaBuilder::new();
 /// let title = schema_builder.add_text_field("title", TEXT);
 /// let author = schema_builder.add_text_field("text", TEXT);
 /// let likes = schema_builder.add_u64_field("num_u64", FAST);
@@ -61,39 +61,39 @@ macro_rules! doc(
|
|||||||
};
|
};
|
||||||
// if there is a trailing comma retry with the trailing comma stripped.
|
// if there is a trailing comma retry with the trailing comma stripped.
|
||||||
($($field:expr => $value:expr),+ ,) => {
|
($($field:expr => $value:expr),+ ,) => {
|
||||||
doc!( $( $field => $value ), *)
|
doc!( $( $field => $value ), *);
|
||||||
};
|
};
|
||||||
);
|
);
|
||||||
|
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
mod test {
|
mod test {
|
||||||
use schema::{Schema, FAST, TEXT};
|
use schema::{SchemaBuilder, FAST, TEXT};
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn test_doc_basic() {
|
fn test_doc_basic() {
|
||||||
let mut schema_builder = Schema::builder();
|
let mut schema_builder = SchemaBuilder::new();
|
||||||
let title = schema_builder.add_text_field("title", TEXT);
|
let title = schema_builder.add_text_field("title", TEXT);
|
||||||
let author = schema_builder.add_text_field("text", TEXT);
|
let author = schema_builder.add_text_field("text", TEXT);
|
||||||
let likes = schema_builder.add_u64_field("num_u64", FAST);
|
let likes = schema_builder.add_u64_field("num_u64", FAST);
|
||||||
let _schema = schema_builder.build();
|
let _schema = schema_builder.build();
|
||||||
let _doc = doc!(
|
let _doc = doc!(
|
||||||
title => "Life Aquatic",
|
title => "Life Aquatic",
|
||||||
author => "Wes Anderson",
|
author => "Wes Anderson",
|
||||||
likes => 4u64
|
likes => 4u64
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn test_doc_trailing_comma() {
|
fn test_doc_trailing_comma() {
|
||||||
let mut schema_builder = Schema::builder();
|
let mut schema_builder = SchemaBuilder::new();
|
||||||
let title = schema_builder.add_text_field("title", TEXT);
|
let title = schema_builder.add_text_field("title", TEXT);
|
||||||
let author = schema_builder.add_text_field("text", TEXT);
|
let author = schema_builder.add_text_field("text", TEXT);
|
||||||
let likes = schema_builder.add_u64_field("num_u64", FAST);
|
let likes = schema_builder.add_u64_field("num_u64", FAST);
|
||||||
let _schema = schema_builder.build();
|
let _schema = schema_builder.build();
|
||||||
let _doc = doc!(
|
let _doc = doc!(
|
||||||
title => "Life Aquatic",
|
title => "Life Aquatic",
|
||||||
author => "Wes Anderson",
|
author => "Wes Anderson",
|
||||||
likes => 4u64,
|
likes => 4u64,
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -34,6 +34,10 @@ const COMPRESSION_BLOCK_SIZE: usize = BitPacker4x::BLOCK_LEN;
 const LONG_SKIP_IN_BLOCKS: usize = 1_024;
 const LONG_SKIP_INTERVAL: u64 = (LONG_SKIP_IN_BLOCKS * COMPRESSION_BLOCK_SIZE) as u64;
 
+lazy_static! {
+    static ref BIT_PACKER: BitPacker4x = BitPacker4x::new();
+}
+
 #[cfg(test)]
 pub mod tests {
 
@@ -1,23 +1,4 @@
-/// Positions works as a long sequence of compressed block.
-/// All terms are chained one after the other.
-///
-/// When accessing the position of a term, we get a positions_idx from the `Terminfo`.
-/// This means we need to skip to the `nth` positions efficiently.
-///
-/// This is done thanks to two levels of skiping that we refer to in the code
-/// as `long_skip` and `short_skip`.
-///
-/// The `long_skip` makes it possible to skip every 1_024 compression blocks (= 131_072 positions).
-/// Skipping offset are simply stored one after as an offset stored over 8 bytes.
-///
-/// We find the number of long skips, as `n / long_skip`.
-///
-/// Blocks are compressed using bitpacking, so `skip_read` contains the number of bytes
-/// (values can go from 0bit to 32 bits) required to decompressed every block.
-///
-/// A given block obviously takes `(128 x num_bit_for_the_block / num_bits_in_a_byte)`,
-/// so skipping a block without decompressing it is just a matter of advancing that many
-/// bytes.
+use super::BIT_PACKER;
 use bitpacking::{BitPacker, BitPacker4x};
 use common::{BinarySerializable, FixedSize};
 use directory::ReadOnlySource;
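// Hedged sketch, not from the diff, making the arithmetic in the doc comment above
// concrete: a bit-packed block of 128 positions encoded with `num_bits` bits per value
// occupies 128 * num_bits / 8 bytes, and one long-skip entry covers 1_024 such blocks
// (131_072 positions). Constant names mirror the ones used in this module.
const COMPRESSION_BLOCK_SIZE: usize = 128;
const LONG_SKIP_IN_BLOCKS: usize = 1_024;
const LONG_SKIP_INTERVAL: u64 = (LONG_SKIP_IN_BLOCKS * COMPRESSION_BLOCK_SIZE) as u64;

fn compressed_block_size(num_bits: u8) -> usize {
    COMPRESSION_BLOCK_SIZE * (num_bits as usize) / 8
}

fn long_skip_example() {
    assert_eq!(LONG_SKIP_INTERVAL, 131_072);
    // A block packed with 11 bits per position takes 176 bytes, so skipping it
    // without decompressing means advancing 176 bytes in the position stream.
    assert_eq!(compressed_block_size(11), 176);
    // For a global position offset `n`, the long-skip entry and the residual skip are:
    let n: u64 = 300_000;
    let long_skip_id = n / LONG_SKIP_INTERVAL;
    let small_skip = n % LONG_SKIP_INTERVAL;
    assert_eq!((long_skip_id, small_skip), (2, 37_856));
}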
@@ -27,65 +8,9 @@ use positions::LONG_SKIP_INTERVAL;
|
|||||||
use positions::LONG_SKIP_IN_BLOCKS;
|
use positions::LONG_SKIP_IN_BLOCKS;
|
||||||
use postings::compression::compressed_block_size;
|
use postings::compression::compressed_block_size;
|
||||||
|
|
||||||
struct Positions {
|
|
||||||
bit_packer: BitPacker4x,
|
|
||||||
skip_source: ReadOnlySource,
|
|
||||||
position_source: ReadOnlySource,
|
|
||||||
long_skip_source: ReadOnlySource,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl Positions {
|
|
||||||
pub fn new(position_source: ReadOnlySource, skip_source: ReadOnlySource) -> Positions {
|
|
||||||
let skip_len = skip_source.len();
|
|
||||||
let (body, footer) = skip_source.split(skip_len - u32::SIZE_IN_BYTES);
|
|
||||||
let num_long_skips = u32::deserialize(&mut footer.as_slice()).expect("Index corrupted");
|
|
||||||
let body_split = body.len() - u64::SIZE_IN_BYTES * (num_long_skips as usize);
|
|
||||||
let (skip_source, long_skip_source) = body.split(body_split);
|
|
||||||
Positions {
|
|
||||||
bit_packer: BitPacker4x::new(),
|
|
||||||
skip_source,
|
|
||||||
long_skip_source,
|
|
||||||
position_source,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Returns the offset of the block associated to the given `long_skip_id`.
|
|
||||||
///
|
|
||||||
/// One `long_skip_id` means `LONG_SKIP_IN_BLOCKS` blocks.
|
|
||||||
fn long_skip(&self, long_skip_id: usize) -> u64 {
|
|
||||||
if long_skip_id == 0 {
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
let long_skip_slice = self.long_skip_source.as_slice();
|
|
||||||
let mut long_skip_blocks: &[u8] = &long_skip_slice[(long_skip_id - 1) * 8..][..8];
|
|
||||||
u64::deserialize(&mut long_skip_blocks).expect("Index corrupted")
|
|
||||||
}
|
|
||||||
|
|
||||||
fn reader(&self, offset: u64) -> PositionReader {
|
|
||||||
let long_skip_id = (offset / LONG_SKIP_INTERVAL) as usize;
|
|
||||||
let small_skip = (offset % LONG_SKIP_INTERVAL) as usize;
|
|
||||||
let offset_num_bytes: u64 = self.long_skip(long_skip_id);
|
|
||||||
let mut position_read = OwnedRead::new(self.position_source.clone());
|
|
||||||
position_read.advance(offset_num_bytes as usize);
|
|
||||||
let mut skip_read = OwnedRead::new(self.skip_source.clone());
|
|
||||||
skip_read.advance(long_skip_id * LONG_SKIP_IN_BLOCKS);
|
|
||||||
let mut position_reader = PositionReader {
|
|
||||||
bit_packer: self.bit_packer,
|
|
||||||
skip_read,
|
|
||||||
position_read,
|
|
||||||
inner_offset: 0,
|
|
||||||
buffer: Box::new([0u32; 128]),
|
|
||||||
ahead: None,
|
|
||||||
};
|
|
||||||
position_reader.skip(small_skip);
|
|
||||||
position_reader
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
pub struct PositionReader {
|
pub struct PositionReader {
|
||||||
skip_read: OwnedRead,
|
skip_read: OwnedRead,
|
||||||
position_read: OwnedRead,
|
position_read: OwnedRead,
|
||||||
bit_packer: BitPacker4x,
|
|
||||||
inner_offset: usize,
|
inner_offset: usize,
|
||||||
buffer: Box<[u32; 128]>,
|
buffer: Box<[u32; 128]>,
|
||||||
ahead: Option<usize>, // if None, no block is loaded.
|
ahead: Option<usize>, // if None, no block is loaded.
|
||||||
@@ -102,7 +27,6 @@ pub struct PositionReader {
|
|||||||
// If the requested number of els ends exactly at a given block, the next
|
// If the requested number of els ends exactly at a given block, the next
|
||||||
// block is not decompressed.
|
// block is not decompressed.
|
||||||
fn read_impl(
|
fn read_impl(
|
||||||
bit_packer: BitPacker4x,
|
|
||||||
mut position: &[u8],
|
mut position: &[u8],
|
||||||
buffer: &mut [u32; 128],
|
buffer: &mut [u32; 128],
|
||||||
mut inner_offset: usize,
|
mut inner_offset: usize,
|
||||||
@@ -113,23 +37,21 @@ fn read_impl(
|
|||||||
let mut output_len = output.len();
|
let mut output_len = output.len();
|
||||||
let mut ahead = 0;
|
let mut ahead = 0;
|
||||||
loop {
|
loop {
|
||||||
let available_len = COMPRESSION_BLOCK_SIZE - inner_offset;
|
let available_len = 128 - inner_offset;
|
||||||
// We have enough elements in the current block.
|
|
||||||
// Let's copy the requested elements in the output buffer,
|
|
||||||
// and return.
|
|
||||||
if output_len <= available_len {
|
if output_len <= available_len {
|
||||||
output[output_start..].copy_from_slice(&buffer[inner_offset..][..output_len]);
|
output[output_start..].copy_from_slice(&buffer[inner_offset..][..output_len]);
|
||||||
return ahead;
|
return ahead;
|
||||||
|
} else {
|
||||||
|
output[output_start..][..available_len].copy_from_slice(&buffer[inner_offset..]);
|
||||||
|
output_len -= available_len;
|
||||||
|
output_start += available_len;
|
||||||
|
inner_offset = 0;
|
||||||
|
let num_bits = num_bits[ahead];
|
||||||
|
BitPacker4x::new().decompress(position, &mut buffer[..], num_bits);
|
||||||
|
let block_len = compressed_block_size(num_bits);
|
||||||
|
position = &position[block_len..];
|
||||||
|
ahead += 1;
|
||||||
}
|
}
|
||||||
output[output_start..][..available_len].copy_from_slice(&buffer[inner_offset..]);
|
|
||||||
output_len -= available_len;
|
|
||||||
output_start += available_len;
|
|
||||||
inner_offset = 0;
|
|
||||||
let num_bits = num_bits[ahead];
|
|
||||||
bit_packer.decompress(position, &mut buffer[..], num_bits);
|
|
||||||
let block_len = compressed_block_size(num_bits);
|
|
||||||
position = &position[block_len..];
|
|
||||||
ahead += 1;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -139,7 +61,35 @@ impl PositionReader {
         skip_source: ReadOnlySource,
         offset: u64,
     ) -> PositionReader {
-        Positions::new(position_source, skip_source).reader(offset)
+        let skip_len = skip_source.len();
+        let (body, footer) = skip_source.split(skip_len - u32::SIZE_IN_BYTES);
+        let num_long_skips = u32::deserialize(&mut footer.as_slice()).expect("Index corrupted");
+        let body_split = body.len() - u64::SIZE_IN_BYTES * (num_long_skips as usize);
+        let (skip_body, long_skips) = body.split(body_split);
+        let long_skip_id = (offset / LONG_SKIP_INTERVAL) as usize;
+        let small_skip = (offset - (long_skip_id as u64) * (LONG_SKIP_INTERVAL as u64)) as usize;
+        let offset_num_bytes: u64 = {
+            if long_skip_id > 0 {
+                let mut long_skip_blocks: &[u8] =
+                    &long_skips.as_slice()[(long_skip_id - 1) * 8..][..8];
+                u64::deserialize(&mut long_skip_blocks).expect("Index corrupted") * 16
+            } else {
+                0
+            }
+        };
+        let mut position_read = OwnedRead::new(position_source);
+        position_read.advance(offset_num_bytes as usize);
+        let mut skip_read = OwnedRead::new(skip_body);
+        skip_read.advance(long_skip_id * LONG_SKIP_IN_BLOCKS);
+        let mut position_reader = PositionReader {
+            skip_read,
+            position_read,
+            inner_offset: 0,
+            buffer: Box::new([0u32; 128]),
+            ahead: None,
+        };
+        position_reader.skip(small_skip);
+        position_reader
     }
 
     /// Fills a buffer with the next `output.len()` integers.
@@ -151,13 +101,10 @@ impl PositionReader {
|
|||||||
if self.ahead != Some(0) {
|
if self.ahead != Some(0) {
|
||||||
// the block currently available is not the block
|
// the block currently available is not the block
|
||||||
// for the current position
|
// for the current position
|
||||||
self.bit_packer
|
BIT_PACKER.decompress(position_data, self.buffer.as_mut(), num_bits);
|
||||||
.decompress(position_data, self.buffer.as_mut(), num_bits);
|
|
||||||
self.ahead = Some(0);
|
|
||||||
}
|
}
|
||||||
let block_len = compressed_block_size(num_bits);
|
let block_len = compressed_block_size(num_bits);
|
||||||
self.ahead = Some(read_impl(
|
self.ahead = Some(read_impl(
|
||||||
self.bit_packer,
|
|
||||||
&position_data[block_len..],
|
&position_data[block_len..],
|
||||||
self.buffer.as_mut(),
|
self.buffer.as_mut(),
|
||||||
self.inner_offset,
|
self.inner_offset,
|
||||||
@@ -186,13 +133,14 @@ impl PositionReader {
|
|||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
|
||||||
let skip_len_in_bits = self.skip_read.as_ref()[..num_blocks_to_advance]
|
let skip_len = self.skip_read.as_ref()[..num_blocks_to_advance]
|
||||||
.iter()
|
.iter()
|
||||||
.map(|num_bits| *num_bits as usize)
|
.cloned()
|
||||||
|
.map(|num_bit| num_bit as usize)
|
||||||
.sum::<usize>()
|
.sum::<usize>()
|
||||||
* COMPRESSION_BLOCK_SIZE;
|
* (COMPRESSION_BLOCK_SIZE / 8);
|
||||||
let skip_len_in_bytes = skip_len_in_bits / 8;
|
|
||||||
self.skip_read.advance(num_blocks_to_advance);
|
self.skip_read.advance(num_blocks_to_advance);
|
||||||
self.position_read.advance(skip_len_in_bytes);
|
self.position_read.advance(skip_len);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
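// Side note (hedged, not part of the diff): the two variants of the byte-skip
// computation in the hunk above are arithmetically equivalent for 128-value blocks.
// Summing the per-block bit widths and multiplying by COMPRESSION_BLOCK_SIZE gives a
// length in bits, divided by 8 for bytes; multiplying directly by
// COMPRESSION_BLOCK_SIZE / 8 (= 16) folds the division in.
fn skipped_bytes(num_bits_per_block: &[u8]) -> usize {
    const COMPRESSION_BLOCK_SIZE: usize = 128;
    let skip_len_in_bits: usize = num_bits_per_block
        .iter()
        .map(|&num_bits| num_bits as usize)
        .sum::<usize>()
        * COMPRESSION_BLOCK_SIZE;
    skip_len_in_bits / 8
}

fn skip_len_example() {
    // Three blocks packed with 5, 7 and 12 bits per value:
    // (5 + 7 + 12) * 128 / 8 = 384 bytes, the same as (5 + 7 + 12) * 16.
    assert_eq!(skipped_bytes(&[5, 7, 12]), 384);
}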
@@ -1,30 +1,29 @@
+use super::BIT_PACKER;
 use bitpacking::BitPacker;
-use bitpacking::BitPacker4x;
 use common::BinarySerializable;
-use common::CountingWriter;
 use positions::{COMPRESSION_BLOCK_SIZE, LONG_SKIP_INTERVAL};
-use std::io::{self, Write};
+use std::io;
 
 pub struct PositionSerializer<W: io::Write> {
-    bit_packer: BitPacker4x,
-    write_stream: CountingWriter<W>,
+    write_stream: W,
     write_skiplist: W,
     block: Vec<u32>,
     buffer: Vec<u8>,
     num_ints: u64,
     long_skips: Vec<u64>,
+    cumulated_num_bits: u64,
 }
 
 impl<W: io::Write> PositionSerializer<W> {
     pub fn new(write_stream: W, write_skiplist: W) -> PositionSerializer<W> {
         PositionSerializer {
-            bit_packer: BitPacker4x::new(),
-            write_stream: CountingWriter::wrap(write_stream),
+            write_stream,
             write_skiplist,
             block: Vec::with_capacity(128),
            buffer: vec![0u8; 128 * 4],
            num_ints: 0u64,
            long_skips: Vec::new(),
+            cumulated_num_bits: 0u64,
         }
     }
 
@@ -51,15 +50,14 @@ impl<W: io::Write> PositionSerializer<W> {
     }
 
     fn flush_block(&mut self) -> io::Result<()> {
-        let num_bits = self.bit_packer.num_bits(&self.block[..]);
+        let num_bits = BIT_PACKER.num_bits(&self.block[..]);
+        self.cumulated_num_bits += u64::from(num_bits);
         self.write_skiplist.write_all(&[num_bits])?;
-        let written_len = self
-            .bit_packer
-            .compress(&self.block[..], &mut self.buffer, num_bits);
+        let written_len = BIT_PACKER.compress(&self.block[..], &mut self.buffer, num_bits);
         self.write_stream.write_all(&self.buffer[..written_len])?;
         self.block.clear();
         if (self.num_ints % LONG_SKIP_INTERVAL) == 0u64 {
-            self.long_skips.push(self.write_stream.written_bytes());
+            self.long_skips.push(self.cumulated_num_bits);
         }
         Ok(())
     }
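// Hedged sketch, not from the diff: the two flavors of `long_skips` bookkeeping above
// store the same information in different units. One side records the absolute number
// of bytes already written to the position stream; the other records the cumulated bit
// widths of the flushed blocks, which the reader converts back to bytes by multiplying
// by 16 (each block holds 128 values, so one bit of width costs 128 / 8 = 16 bytes).
fn long_skip_units_example() {
    let num_bits_of_flushed_blocks: [u64; 3] = [5, 7, 12];
    let cumulated_num_bits: u64 = num_bits_of_flushed_blocks.iter().sum();
    let written_bytes: u64 = num_bits_of_flushed_blocks.iter().map(|b| b * 16).sum();
    assert_eq!(cumulated_num_bits * 16, written_bytes);
}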
|||||||
@@ -1,229 +0,0 @@
|
|||||||
/// This modules define the logic used to search for a doc in a given
|
|
||||||
/// block. (at most 128 docs)
|
|
||||||
///
|
|
||||||
/// Searching within a block is a hotspot when running intersection.
|
|
||||||
/// so it was worth defining it in its own module.
|
|
||||||
|
|
||||||
#[cfg(target_arch = "x86_64")]
|
|
||||||
mod sse2 {
|
|
||||||
use postings::compression::COMPRESSION_BLOCK_SIZE;
|
|
||||||
use std::arch::x86_64::__m128i as DataType;
|
|
||||||
use std::arch::x86_64::_mm_add_epi32 as op_add;
|
|
||||||
use std::arch::x86_64::_mm_cmplt_epi32 as op_lt;
|
|
||||||
use std::arch::x86_64::_mm_load_si128 as op_load; // requires 128-bits alignment
|
|
||||||
use std::arch::x86_64::_mm_set1_epi32 as set1;
|
|
||||||
use std::arch::x86_64::_mm_setzero_si128 as set0;
|
|
||||||
use std::arch::x86_64::_mm_sub_epi32 as op_sub;
|
|
||||||
use std::arch::x86_64::{_mm_cvtsi128_si32, _mm_shuffle_epi32};
|
|
||||||
|
|
||||||
const MASK1: i32 = 78;
|
|
||||||
const MASK2: i32 = 177;
|
|
||||||
|
|
||||||
/// Performs an exhaustive linear search over the
|
|
||||||
///
|
|
||||||
/// There is no early exit here. We simply count the
|
|
||||||
/// number of elements that are `< target`.
|
|
||||||
pub fn linear_search_sse2_128(arr: &[u32], target: u32) -> usize {
|
|
||||||
unsafe {
|
|
||||||
let ptr = arr.as_ptr() as *const DataType;
|
|
||||||
let vkey = set1(target as i32);
|
|
||||||
let mut cnt = set0();
|
|
||||||
// We work over 4 `__m128i` at a time.
|
|
||||||
// A single `__m128i` actual contains 4 `u32`.
|
|
||||||
for i in 0..(COMPRESSION_BLOCK_SIZE as isize) / (4 * 4) {
|
|
||||||
let cmp1 = op_lt(op_load(ptr.offset(i * 4)), vkey);
|
|
||||||
let cmp2 = op_lt(op_load(ptr.offset(i * 4 + 1)), vkey);
|
|
||||||
let cmp3 = op_lt(op_load(ptr.offset(i * 4 + 2)), vkey);
|
|
||||||
let cmp4 = op_lt(op_load(ptr.offset(i * 4 + 3)), vkey);
|
|
||||||
let sum = op_add(op_add(cmp1, cmp2), op_add(cmp3, cmp4));
|
|
||||||
cnt = op_sub(cnt, sum);
|
|
||||||
}
|
|
||||||
cnt = op_add(cnt, _mm_shuffle_epi32(cnt, MASK1));
|
|
||||||
cnt = op_add(cnt, _mm_shuffle_epi32(cnt, MASK2));
|
|
||||||
_mm_cvtsi128_si32(cnt) as usize
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
#[cfg(test)]
|
|
||||||
mod test {
|
|
||||||
use super::linear_search_sse2_128;
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn test_linear_search_sse2_128_u32() {
|
|
||||||
for i in 0..23 {
|
|
||||||
dbg!(i);
|
|
||||||
let arr: Vec<u32> = (0..128).map(|el| el * 2 + 1 << 18).collect();
|
|
||||||
assert_eq!(linear_search_sse2_128(&arr, arr[64] + 1), 65);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/// This `linear search` browser exhaustively through the array.
|
|
||||||
/// but the early exit is very difficult to predict.
|
|
||||||
///
|
|
||||||
/// Coupled with `exponential search` this function is likely
|
|
||||||
/// to be called with the same `len`
|
|
||||||
fn linear_search(arr: &[u32], target: u32) -> usize {
|
|
||||||
arr.iter().map(|&el| if el < target { 1 } else { 0 }).sum()
|
|
||||||
}
|
|
||||||
|
|
||||||
fn exponential_search(arr: &[u32], target: u32) -> (usize, usize) {
|
|
||||||
let end = arr.len();
|
|
||||||
let mut begin = 0;
|
|
||||||
for &pivot in &[1, 3, 7, 15, 31, 63] {
|
|
||||||
if pivot >= end {
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
if arr[pivot] > target {
|
|
||||||
return (begin, pivot);
|
|
||||||
}
|
|
||||||
begin = pivot;
|
|
||||||
}
|
|
||||||
(begin, end)
|
|
||||||
}
|
|
||||||
|
|
||||||
fn galloping(block_docs: &[u32], target: u32) -> usize {
|
|
||||||
let (start, end) = exponential_search(&block_docs, target);
|
|
||||||
start + linear_search(&block_docs[start..end], target)
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Tantivy may rely on SIMD instructions to search for a specific document within
|
|
||||||
/// a given block.
|
|
||||||
#[derive(Clone, Copy, PartialEq)]
|
|
||||||
pub enum BlockSearcher {
|
|
||||||
#[cfg(target_arch = "x86_64")]
|
|
||||||
SSE2,
|
|
||||||
Scalar,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl BlockSearcher {
|
|
||||||
/// Search the first index containing an element greater or equal to
|
|
||||||
/// the target.
|
|
||||||
///
|
|
||||||
/// The results should be equivalent to
|
|
||||||
/// ```ignore
|
|
||||||
/// block[..]
|
|
||||||
// .iter()
|
|
||||||
// .take_while(|&&val| val < target)
|
|
||||||
// .count()
|
|
||||||
/// ```
|
|
||||||
///
|
|
||||||
/// The `start` argument is just used to hint that the response is
|
|
||||||
/// greater than beyond `start`. The implementation may or may not use
|
|
||||||
/// it for optimization.
|
|
||||||
///
|
|
||||||
/// # Assumption
|
|
||||||
///
|
|
||||||
/// The array len is > start.
|
|
||||||
/// The block is sorted
|
|
||||||
/// The target is assumed greater or equal to the `arr[start]`.
|
|
||||||
/// The target is assumed smaller or equal to the last element of the block.
|
|
||||||
///
|
|
||||||
/// Currently the scalar implementation starts by an exponential search, and
|
|
||||||
/// then operates a linear search in the result subarray.
|
|
||||||
///
|
|
||||||
/// If SSE2 instructions are available in the `(platform, running CPU)`,
|
|
||||||
/// then we use a different implementation that does an exhaustive linear search over
|
|
||||||
/// the full block whenever the block is full (`len == 128`). It is surprisingly faster, most likely because of the lack
|
|
||||||
/// of branch.
|
|
||||||
pub fn search_in_block(&self, block_docs: &[u32], start: usize, target: u32) -> usize {
|
|
||||||
#[cfg(target_arch = "x86_64")]
|
|
||||||
{
|
|
||||||
use postings::compression::COMPRESSION_BLOCK_SIZE;
|
|
||||||
if *self == BlockSearcher::SSE2 {
|
|
||||||
if block_docs.len() == COMPRESSION_BLOCK_SIZE {
|
|
||||||
return sse2::linear_search_sse2_128(block_docs, target);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
start + galloping(&block_docs[start..], target)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
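// Hedged illustration, not part of the diff, of the contract documented above for
// `search_in_block`: whichever path is taken (SSE2 exhaustive scan or scalar
// exponential + linear search), the result must match the trivial reference scan below.
fn search_in_block_reference(block_docs: &[u32], target: u32) -> usize {
    block_docs.iter().take_while(|&&doc| doc < target).count()
}

fn search_in_block_contract_example() {
    let block: Vec<u32> = (0..128u32).map(|i| i * 2).collect();
    // Index of the first element >= 9 in [0, 2, 4, 6, 8, 10, ...] is 5.
    assert_eq!(search_in_block_reference(&block, 9), 5);
}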
|
|
||||||
impl Default for BlockSearcher {
|
|
||||||
fn default() -> BlockSearcher {
|
|
||||||
#[cfg(target_arch = "x86_64")]
|
|
||||||
{
|
|
||||||
if is_x86_feature_detected!("sse2") {
|
|
||||||
return BlockSearcher::SSE2;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
BlockSearcher::Scalar
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
#[cfg(test)]
|
|
||||||
mod tests {
|
|
||||||
use super::exponential_search;
|
|
||||||
use super::linear_search;
|
|
||||||
use super::BlockSearcher;
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn test_linear_search() {
|
|
||||||
let len: usize = 50;
|
|
||||||
let arr: Vec<u32> = (0..len).map(|el| 1u32 + (el as u32) * 2).collect();
|
|
||||||
for target in 1..*arr.last().unwrap() {
|
|
||||||
let res = linear_search(&arr[..], target);
|
|
||||||
if res > 0 {
|
|
||||||
assert!(arr[res - 1] < target);
|
|
||||||
}
|
|
||||||
if res < len {
|
|
||||||
assert!(arr[res] >= target);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn test_exponentiel_search() {
|
|
||||||
assert_eq!(exponential_search(&[1, 2], 0), (0, 1));
|
|
||||||
assert_eq!(exponential_search(&[1, 2], 1), (0, 1));
|
|
||||||
assert_eq!(
|
|
||||||
exponential_search(&[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11], 7),
|
|
||||||
(3, 7)
|
|
||||||
);
|
|
||||||
}
|
|
||||||
|
|
||||||
fn util_test_search_in_block(block_searcher: BlockSearcher, block: &[u32], target: u32) {
|
|
||||||
let cursor = search_in_block_trivial_but_slow(block, target);
|
|
||||||
for i in 0..cursor {
|
|
||||||
assert_eq!(block_searcher.search_in_block(block, i, target), cursor);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
fn util_test_search_in_block_all(block_searcher: BlockSearcher, block: &[u32]) {
|
|
||||||
use std::collections::HashSet;
|
|
||||||
let mut targets = HashSet::new();
|
|
||||||
for (i, val) in block.iter().cloned().enumerate() {
|
|
||||||
if i > 0 {
|
|
||||||
targets.insert(val - 1);
|
|
||||||
}
|
|
||||||
targets.insert(val);
|
|
||||||
}
|
|
||||||
for target in targets {
|
|
||||||
util_test_search_in_block(block_searcher, block, target);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
fn search_in_block_trivial_but_slow(block: &[u32], target: u32) -> usize {
|
|
||||||
block.iter().take_while(|&&val| val < target).count()
|
|
||||||
}
|
|
||||||
|
|
||||||
fn test_search_in_block_util(block_searcher: BlockSearcher) {
|
|
||||||
for len in 1u32..128u32 {
|
|
||||||
let v: Vec<u32> = (0..len).map(|i| i * 2).collect();
|
|
||||||
util_test_search_in_block_all(block_searcher, &v[..]);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn test_search_in_block_scalar() {
|
|
||||||
test_search_in_block_util(BlockSearcher::Scalar);
|
|
||||||
}
|
|
||||||
|
|
||||||
#[cfg(target_arch = "x86_64")]
|
|
||||||
#[test]
|
|
||||||
fn test_search_in_block_sse2() {
|
|
||||||
test_search_in_block_util(BlockSearcher::SSE2);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
@@ -43,14 +43,9 @@ impl BlockEncoder {
     }
 }
 
-/// We ensure that the OutputBuffer is align on 128 bits
-/// in order to run SSE2 linear search on it.
-#[repr(align(128))]
-struct OutputBuffer([u32; COMPRESSION_BLOCK_SIZE + 1]);
-
 pub struct BlockDecoder {
     bitpacker: BitPacker4x,
-    output: OutputBuffer,
+    pub output: [u32; COMPRESSION_BLOCK_SIZE + 1],
     pub output_len: usize,
 }
 
@@ -64,7 +59,7 @@ impl BlockDecoder {
         output[COMPRESSION_BLOCK_SIZE] = 0u32;
         BlockDecoder {
             bitpacker: BitPacker4x::new(),
-            output: OutputBuffer(output),
+            output,
             output_len: 0,
         }
     }
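// Hedged sketch, not from the diff: the `OutputBuffer` wrapper removed above exists to
// force the decoder's scratch buffer onto an aligned boundary so that aligned SIMD loads
// (e.g. `_mm_load_si128`) can scan it. Note that `#[repr(align(N))]` takes bytes, so
// align(128) yields 128-byte alignment, which in particular satisfies the 16-byte
// alignment SSE2 loads require. `AlignedBuf` below is an illustrative stand-in.
#[repr(align(128))]
struct AlignedBuf([u32; 129]); // COMPRESSION_BLOCK_SIZE + 1 scratch slots

fn alignment_example() {
    assert_eq!(std::mem::align_of::<AlignedBuf>(), 128);
    let buf = AlignedBuf([0u32; 129]);
    // The array sits at offset 0, so its address is a multiple of 128.
    assert_eq!(buf.0.as_ptr() as usize % 128, 0);
}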
@@ -77,23 +72,23 @@ impl BlockDecoder {
|
|||||||
) -> usize {
|
) -> usize {
|
||||||
self.output_len = COMPRESSION_BLOCK_SIZE;
|
self.output_len = COMPRESSION_BLOCK_SIZE;
|
||||||
self.bitpacker
|
self.bitpacker
|
||||||
.decompress_sorted(offset, &compressed_data, &mut self.output.0, num_bits)
|
.decompress_sorted(offset, &compressed_data, &mut self.output, num_bits)
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn uncompress_block_unsorted(&mut self, compressed_data: &[u8], num_bits: u8) -> usize {
|
pub fn uncompress_block_unsorted(&mut self, compressed_data: &[u8], num_bits: u8) -> usize {
|
||||||
self.output_len = COMPRESSION_BLOCK_SIZE;
|
self.output_len = COMPRESSION_BLOCK_SIZE;
|
||||||
self.bitpacker
|
self.bitpacker
|
||||||
.decompress(&compressed_data, &mut self.output.0, num_bits)
|
.decompress(&compressed_data, &mut self.output, num_bits)
|
||||||
}
|
}
|
||||||
|
|
||||||
#[inline]
|
#[inline]
|
||||||
pub fn output_array(&self) -> &[u32] {
|
pub fn output_array(&self) -> &[u32] {
|
||||||
&self.output.0[..self.output_len]
|
&self.output[..self.output_len]
|
||||||
}
|
}
|
||||||
|
|
||||||
#[inline]
|
#[inline]
|
||||||
pub fn output(&self, idx: usize) -> u32 {
|
pub fn output(&self, idx: usize) -> u32 {
|
||||||
self.output.0[idx]
|
self.output[idx]
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -164,12 +159,12 @@ impl VIntDecoder for BlockDecoder {
|
|||||||
num_els: usize,
|
num_els: usize,
|
||||||
) -> usize {
|
) -> usize {
|
||||||
self.output_len = num_els;
|
self.output_len = num_els;
|
||||||
vint::uncompress_sorted(compressed_data, &mut self.output.0[..num_els], offset)
|
vint::uncompress_sorted(compressed_data, &mut self.output[..num_els], offset)
|
||||||
}
|
}
|
||||||
|
|
||||||
fn uncompress_vint_unsorted<'a>(&mut self, compressed_data: &'a [u8], num_els: usize) -> usize {
|
fn uncompress_vint_unsorted<'a>(&mut self, compressed_data: &'a [u8], num_els: usize) -> usize {
|
||||||
self.output_len = num_els;
|
self.output_len = num_els;
|
||||||
vint::uncompress_unsorted(compressed_data, &mut self.output.0[..num_els])
|
vint::uncompress_unsorted(compressed_data, &mut self.output[..num_els])
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -271,17 +266,21 @@ pub mod tests {
|
|||||||
mod bench {
|
mod bench {
|
||||||
|
|
||||||
use super::*;
|
use super::*;
|
||||||
|
use rand::Rng;
|
||||||
use rand::SeedableRng;
|
use rand::SeedableRng;
|
||||||
use rand::{Rng, XorShiftRng};
|
use rand::XorShiftRng;
|
||||||
use test::Bencher;
|
use test::Bencher;
|
||||||
|
|
||||||
fn generate_array_with_seed(n: usize, ratio: f64, seed_val: u8) -> Vec<u32> {
|
fn generate_array_with_seed(n: usize, ratio: f32, seed_val: u32) -> Vec<u32> {
|
||||||
let seed: &[u8; 16] = &[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, seed_val];
|
let seed: &[u32; 4] = &[1, 2, 3, seed_val];
|
||||||
let mut rng: XorShiftRng = XorShiftRng::from_seed(*seed);
|
let mut rng: XorShiftRng = XorShiftRng::from_seed(*seed);
|
||||||
(0u32..).filter(|_| rng.gen_bool(ratio)).take(n).collect()
|
(0..u32::max_value())
|
||||||
|
.filter(|_| rng.next_f32() < ratio)
|
||||||
|
.take(n)
|
||||||
|
.collect()
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn generate_array(n: usize, ratio: f64) -> Vec<u32> {
|
pub fn generate_array(n: usize, ratio: f32) -> Vec<u32> {
|
||||||
generate_array_with_seed(n, ratio, 4)
|
generate_array_with_seed(n, ratio, 4)
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -298,23 +297,24 @@ mod bench {
|
|||||||
fn bench_uncompress(b: &mut Bencher) {
|
fn bench_uncompress(b: &mut Bencher) {
|
||||||
let mut encoder = BlockEncoder::new();
|
let mut encoder = BlockEncoder::new();
|
||||||
let data = generate_array(COMPRESSION_BLOCK_SIZE, 0.1);
|
let data = generate_array(COMPRESSION_BLOCK_SIZE, 0.1);
|
||||||
let (num_bits, compressed) = encoder.compress_block_sorted(&data, 0u32);
|
let (_, compressed) = encoder.compress_block_sorted(&data, 0u32);
|
||||||
let mut decoder = BlockDecoder::new();
|
let mut decoder = BlockDecoder::new();
|
||||||
b.iter(|| {
|
b.iter(|| {
|
||||||
decoder.uncompress_block_sorted(compressed, 0u32, num_bits);
|
decoder.uncompress_block_sorted(compressed, 0u32);
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn test_all_docs_compression_numbits() {
|
fn test_all_docs_compression_numbits() {
|
||||||
for expected_num_bits in 0u8.. {
|
for num_bits in 0..33 {
|
||||||
let mut data = [0u32; 128];
|
let mut data = [0u32; 128];
|
||||||
if expected_num_bits > 0 {
|
if num_bits > 0 {
|
||||||
data[0] = (1u64 << (expected_num_bits as usize) - 1) as u32;
|
data[0] = 1 << (num_bits - 1);
|
||||||
}
|
}
|
||||||
let mut encoder = BlockEncoder::new();
|
let mut encoder = BlockEncoder::new();
|
||||||
let (num_bits, compressed) = encoder.compress_block_unsorted(&data);
|
let compressed = encoder.compress_block_unsorted(&data);
|
||||||
assert_eq!(compressed.len(), compressed_block_size(num_bits));
|
assert_eq!(compressed[0] as usize, num_bits);
|
||||||
|
assert_eq!(compressed.len(), compressed_block_size(compressed[0]));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -2,7 +2,6 @@
 Postings module (also called inverted index)
 */
 
-mod block_search;
 pub(crate) mod compression;
 /// Postings module
 ///
@@ -17,8 +16,6 @@ mod skip;
 mod stacker;
 mod term_info;
 
-pub(crate) use self::block_search::BlockSearcher;
-
 pub(crate) use self::postings_writer::MultiFieldPostingsWriter;
 pub use self::serializer::{FieldSerializer, InvertedIndexSerializer};
 
@@ -34,6 +31,7 @@ pub(crate) use self::stacker::compute_table_size;
 pub use common::HasLen;
 
 pub(crate) const USE_SKIP_INFO_LIMIT: u32 = COMPRESSION_BLOCK_SIZE as u32;
 
 pub(crate) type UnorderedTermId = u64;
+
 #[cfg_attr(feature = "cargo-clippy", allow(clippy::enum_variant_names))]
@@ -56,18 +54,17 @@ pub mod tests {
     use indexer::operation::AddOperation;
     use indexer::SegmentWriter;
     use query::Scorer;
-    use rand::rngs::StdRng;
-    use rand::{Rng, SeedableRng};
+    use rand::{Rng, SeedableRng, XorShiftRng};
     use schema::Field;
     use schema::IndexRecordOption;
-    use schema::{Document, Schema, Term, INDEXED, STRING, TEXT};
+    use schema::{Document, SchemaBuilder, Term, INT_INDEXED, STRING, TEXT};
     use std::iter;
     use DocId;
     use Score;
 
     #[test]
     pub fn test_position_write() {
-        let mut schema_builder = Schema::builder();
+        let mut schema_builder = SchemaBuilder::default();
         let text_field = schema_builder.add_text_field("text", TEXT);
         let schema = schema_builder.build();
         let index = Index::create_in_ram(schema);
@@ -91,7 +88,7 @@ pub mod tests {
 
     #[test]
     pub fn test_skip_positions() {
-        let mut schema_builder = Schema::builder();
+        let mut schema_builder = SchemaBuilder::new();
         let title = schema_builder.add_text_field("title", TEXT);
         let schema = schema_builder.build();
         let index = Index::create_in_ram(schema);
@@ -103,11 +100,14 @@ pub mod tests {
         }
         index_writer.add_document(doc!(title => r#"abc be be be be abc"#));
         index_writer.commit().unwrap();
-        let searcher = index.reader().unwrap().searcher();
+        index.load_searchers().unwrap();
+
+        let searcher = index.searcher();
         let inverted_index = searcher.segment_reader(0u32).inverted_index(title);
         let term = Term::from_field_text(title, "abc");
 
         let mut positions = Vec::new();
 
         {
             let mut postings = inverted_index
                 .read_postings(&term, IndexRecordOption::WithFreqsAndPositions)
@@ -163,7 +163,7 @@ pub mod tests {
     #[test]
     pub fn test_position_and_fieldnorm1() {
         let mut positions = Vec::new();
-        let mut schema_builder = Schema::builder();
+        let mut schema_builder = SchemaBuilder::default();
         let text_field = schema_builder.add_text_field("text", TEXT);
         let schema = schema_builder.build();
         let index = Index::create_in_ram(schema.clone());
@@ -220,10 +220,12 @@ pub mod tests {
         }
         {
             let term_a = Term::from_field_text(text_field, "abcdef");
-            assert!(segment_reader
-                .inverted_index(term_a.field())
-                .read_postings(&term_a, IndexRecordOption::WithFreqsAndPositions)
-                .is_none());
+            assert!(
+                segment_reader
+                    .inverted_index(term_a.field())
+                    .read_postings(&term_a, IndexRecordOption::WithFreqsAndPositions)
+                    .is_none()
+            );
         }
         {
             let term_a = Term::from_field_text(text_field, "a");
@@ -274,12 +276,12 @@ pub mod tests {
     #[test]
     pub fn test_position_and_fieldnorm2() {
         let mut positions: Vec<u32> = Vec::new();
-        let mut schema_builder = Schema::builder();
+        let mut schema_builder = SchemaBuilder::default();
         let text_field = schema_builder.add_text_field("text", TEXT);
         let schema = schema_builder.build();
         let index = Index::create_in_ram(schema);
         {
-            let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
+            let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
             {
                 let mut doc = Document::default();
                 doc.add_text(text_field, "g b b d c g c");
@@ -292,8 +294,9 @@ pub mod tests {
             }
             assert!(index_writer.commit().is_ok());
         }
+        index.load_searchers().unwrap();
         let term_a = Term::from_field_text(text_field, "a");
-        let searcher = index.reader().unwrap().searcher();
+        let searcher = index.searcher();
         let segment_reader = searcher.segment_reader(0);
         let mut postings = segment_reader
             .inverted_index(text_field)
@@ -314,13 +317,13 @@ pub mod tests {
         let num_docs = 300u32;
 
         let index = {
-            let mut schema_builder = Schema::builder();
-            let value_field = schema_builder.add_u64_field("value", INDEXED);
+            let mut schema_builder = SchemaBuilder::default();
+            let value_field = schema_builder.add_u64_field("value", INT_INDEXED);
             let schema = schema_builder.build();
 
             let index = Index::create_in_ram(schema);
             {
-                let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
+                let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
                 for i in 0..num_docs {
                     let mut doc = Document::default();
                     doc.add_u64(value_field, 2);
@@ -330,9 +333,10 @@ pub mod tests {
                 }
                 assert!(index_writer.commit().is_ok());
            }
+            index.load_searchers().unwrap();
            index
        };
-        let searcher = index.reader().unwrap().searcher();
+        let searcher = index.searcher();
        let segment_reader = searcher.segment_reader(0);
 
        // check that the basic usage works
@@ -396,11 +400,12 @@ pub mod tests {
 
        // delete some of the documents
        {
-            let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
+            let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
            index_writer.delete_term(term_0);
            assert!(index_writer.commit().is_ok());
        }
-        let searcher = index.reader().unwrap().searcher();
+        index.load_searchers().unwrap();
+        let searcher = index.searcher();
        let segment_reader = searcher.segment_reader(0);
 
        // make sure seeking still works
@@ -445,19 +450,33 @@ pub mod tests {
 
        // delete everything else
        {
-            let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
+            let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
            index_writer.delete_term(term_1);
 
            assert!(index_writer.commit().is_ok());
        }
-        let searcher = index.reader().unwrap().searcher();
+        index.load_searchers().unwrap();
+
+        let searcher = index.searcher();
+        let segment_reader = searcher.segment_reader(0);
+
        // finally, check that it's empty
        {
-            let searchable_segment_ids = index
-                .searchable_segment_ids()
-                .expect("could not get index segment ids");
-            assert!(searchable_segment_ids.is_empty());
-            assert_eq!(searcher.num_docs(), 0);
+            let mut segment_postings = segment_reader
+                .inverted_index(term_2.field())
+                .read_postings(&term_2, IndexRecordOption::Basic)
+                .unwrap();
+
+            assert_eq!(segment_postings.skip_next(0), SkipResult::Reached);
+            assert_eq!(segment_postings.doc(), 0);
+            assert!(segment_reader.is_deleted(0));
+
+            let mut segment_postings = segment_reader
+                .inverted_index(term_2.field())
+                .read_postings(&term_2, IndexRecordOption::Basic)
+                .unwrap();
+
+            assert_eq!(segment_postings.skip_next(num_docs), SkipResult::End);
        }
    }
 
@@ -479,16 +498,17 @@ pub mod tests {
            Term::from_field_text(field, "d")
        };
        pub static ref INDEX: Index = {
-            let mut schema_builder = Schema::builder();
+            let mut schema_builder = SchemaBuilder::default();
            let text_field = schema_builder.add_text_field("text", STRING);
            let schema = schema_builder.build();
 
-            let mut rng: StdRng = StdRng::from_seed([1u8; 32]);
+            let seed: [u8; 16] = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16];
+            let mut rng: XorShiftRng = XorShiftRng::from_seed(seed);
 
            let index = Index::create_in_ram(schema);
            let posting_list_size = 1_000_000;
            {
-                let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
+                let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
                for _ in 0..posting_list_size {
                    let mut doc = Document::default();
                    if rng.gen_bool(1f64 / 15f64) {
@@ -505,6 +525,7 @@ pub mod tests {
                }
                assert!(index_writer.commit().is_ok());
            }
+            index.load_searchers().unwrap();
            index
        };
    }
@@ -633,7 +654,7 @@ mod bench {
        });
    }
 
-    fn bench_skip_next(p: f64, b: &mut Bencher) {
+    fn bench_skip_next(p: f32, b: &mut Bencher) {
        let searcher = INDEX.searcher();
        let segment_reader = searcher.segment_reader(0);
        let docs = tests::sample(segment_reader.num_docs(), p);
@@ -1,8 +1,6 @@
 use super::stacker::{Addr, MemoryArena, TermHashMap};
 
-use postings::recorder::{
-    BufferLender, NothingRecorder, Recorder, TFAndPositionRecorder, TermFrequencyRecorder,
-};
+use postings::recorder::{NothingRecorder, Recorder, TFAndPositionRecorder, TermFrequencyRecorder};
 use postings::UnorderedTermId;
 use postings::{FieldSerializer, InvertedIndexSerializer};
 use schema::IndexRecordOption;
@@ -31,12 +29,10 @@ fn posting_from_field_entry(field_entry: &FieldEntry) -> Box<PostingsWriter> {
             IndexRecordOption::WithFreqsAndPositions => {
                 SpecializedPostingsWriter::<TFAndPositionRecorder>::new_boxed()
             }
-        })
-        .unwrap_or_else(|| SpecializedPostingsWriter::<NothingRecorder>::new_boxed()),
-        FieldType::U64(_)
-        | FieldType::I64(_)
-        | FieldType::Date(_)
-        | FieldType::HierarchicalFacet => SpecializedPostingsWriter::<NothingRecorder>::new_boxed(),
+        }).unwrap_or_else(|| SpecializedPostingsWriter::<NothingRecorder>::new_boxed()),
+        FieldType::U64(_) | FieldType::I64(_) | FieldType::HierarchicalFacet => {
+            SpecializedPostingsWriter::<NothingRecorder>::new_boxed()
+        }
         FieldType::Bytes => {
             // FieldType::Bytes cannot actually be indexed.
             // TODO fix during the indexer refactoring described in #276
@@ -52,31 +48,6 @@ pub struct MultiFieldPostingsWriter {
     per_field_postings_writers: Vec<Box<PostingsWriter>>,
 }
 
-fn make_field_partition(
-    term_offsets: &[(&[u8], Addr, UnorderedTermId)],
-) -> Vec<(Field, usize, usize)> {
-    let term_offsets_it = term_offsets
-        .iter()
-        .map(|(key, _, _)| Term::wrap(key).field())
-        .enumerate();
-    let mut prev_field = Field(u32::max_value());
-    let mut fields = vec![];
-    let mut offsets = vec![];
-    for (offset, field) in term_offsets_it {
-        if field != prev_field {
-            prev_field = field;
-            fields.push(field);
-            offsets.push(offset);
-        }
-    }
-    offsets.push(term_offsets.len());
-    let mut field_offsets = vec![];
-    for i in 0..fields.len() {
-        field_offsets.push((fields[i], offsets[i], offsets[i + 1]));
-    }
-    field_offsets
-}
-
 impl MultiFieldPostingsWriter {
     /// Create a new `MultiFieldPostingsWriter` given
     /// a schema and a heap.
@@ -122,16 +93,38 @@ impl MultiFieldPostingsWriter {
         &self,
         serializer: &mut InvertedIndexSerializer,
     ) -> Result<HashMap<Field, HashMap<UnorderedTermId, TermOrdinal>>> {
-        let mut term_offsets: Vec<(&[u8], Addr, UnorderedTermId)> =
-            self.term_index.iter().collect();
+        let mut term_offsets: Vec<(&[u8], Addr, UnorderedTermId)> = self
+            .term_index
+            .iter()
+            .map(|(term_bytes, addr, bucket_id)| (term_bytes, addr, bucket_id as UnorderedTermId))
+            .collect();
         term_offsets.sort_unstable_by_key(|&(k, _, _)| k);
 
-        let mut unordered_term_mappings: HashMap<Field, HashMap<UnorderedTermId, TermOrdinal>> =
-            HashMap::new();
+        let mut offsets: Vec<(Field, usize)> = vec![];
+        let term_offsets_it = term_offsets
+            .iter()
+            .cloned()
+            .map(|(key, _, _)| Term::wrap(key).field())
+            .enumerate();
 
-        let field_offsets = make_field_partition(&term_offsets);
+        let mut unordered_term_mappings: HashMap<
+            Field,
+            HashMap<UnorderedTermId, TermOrdinal>,
+        > = HashMap::new();
+
+        let mut prev_field = Field(u32::max_value());
+        for (offset, field) in term_offsets_it {
+            if field != prev_field {
+                offsets.push((field, offset));
+                prev_field = field;
+            }
+        }
+        offsets.push((Field(0), term_offsets.len()));
+
+        for i in 0..(offsets.len() - 1) {
+            let (field, start) = offsets[i];
+            let (_, stop) = offsets[i + 1];
 
-        for (field, start, stop) in field_offsets {
             let field_entry = self.schema.get_field_entry(field);
 
             match *field_entry.field_type() {
@@ -145,11 +138,10 @@ impl MultiFieldPostingsWriter {
                         .enumerate()
                         .map(|(term_ord, unord_term_id)| {
                             (unord_term_id as UnorderedTermId, term_ord as TermOrdinal)
-                        })
-                        .collect();
+                        }).collect();
                     unordered_term_mappings.insert(field, mapping);
                 }
-                FieldType::U64(_) | FieldType::I64(_) | FieldType::Date(_) => {}
+                FieldType::U64(_) | FieldType::I64(_) => {}
                 FieldType::Bytes => {}
             }
 
@@ -221,7 +213,7 @@ pub trait PostingsWriter {
 
 /// The `SpecializedPostingsWriter` is just here to remove dynamic
 /// dispatch to the recorder information.
-pub(crate) struct SpecializedPostingsWriter<Rec: Recorder + 'static> {
+pub struct SpecializedPostingsWriter<Rec: Recorder + 'static> {
     total_num_tokens: u64,
     _recorder_type: PhantomData<Rec>,
 }
@@ -253,7 +245,8 @@ impl<Rec: Recorder + 'static> PostingsWriter for SpecializedPostingsWriter<Rec>
         debug_assert!(term.as_slice().len() >= 4);
         self.total_num_tokens += 1;
         term_index.mutate_or_create(term, |opt_recorder: Option<Rec>| {
-            if let Some(mut recorder) = opt_recorder {
+            if opt_recorder.is_some() {
+                let mut recorder = opt_recorder.unwrap();
                 let current_doc = recorder.current_doc();
                 if current_doc != doc {
                     recorder.close_doc(heap);
@@ -262,7 +255,7 @@ impl<Rec: Recorder + 'static> PostingsWriter for SpecializedPostingsWriter<Rec>
                 recorder.record_position(position, heap);
                 recorder
             } else {
-                let mut recorder = Rec::new();
+                let mut recorder = Rec::new(heap);
                 recorder.new_doc(doc, heap);
                 recorder.record_position(position, heap);
                 recorder
@@ -277,11 +270,10 @@ impl<Rec: Recorder + 'static> PostingsWriter for SpecializedPostingsWriter<Rec>
         termdict_heap: &MemoryArena,
         heap: &MemoryArena,
     ) -> io::Result<()> {
-        let mut buffer_lender = BufferLender::default();
         for &(term_bytes, addr, _) in term_addrs {
-            let recorder: Rec = termdict_heap.read(addr);
+            let recorder: Rec = unsafe { termdict_heap.read(addr) };
             serializer.new_term(&term_bytes[4..])?;
-            recorder.serialize(&mut buffer_lender, serializer, heap)?;
+            recorder.serialize(serializer, heap)?;
             serializer.close_term()?;
         }
         Ok(())
@@ -1,50 +1,10 @@
 use super::stacker::{ExpUnrolledLinkedList, MemoryArena};
-use common::{read_u32_vint, write_u32_vint};
 use postings::FieldSerializer;
-use std::io;
+use std::{self, io};
 use DocId;
 
-const POSITION_END: u32 = 0;
+const EMPTY_ARRAY: [u32; 0] = [0u32; 0];
+const POSITION_END: u32 = std::u32::MAX;
 
-#[derive(Default)]
-pub(crate) struct BufferLender {
-    buffer_u8: Vec<u8>,
-    buffer_u32: Vec<u32>,
-}
-
-impl BufferLender {
-    pub fn lend_u8(&mut self) -> &mut Vec<u8> {
-        self.buffer_u8.clear();
-        &mut self.buffer_u8
-    }
-    pub fn lend_all(&mut self) -> (&mut Vec<u8>, &mut Vec<u32>) {
-        self.buffer_u8.clear();
-        self.buffer_u32.clear();
-        (&mut self.buffer_u8, &mut self.buffer_u32)
-    }
-}
-
-pub struct VInt32Reader<'a> {
-    data: &'a [u8],
-}
-
-impl<'a> VInt32Reader<'a> {
-    fn new(data: &'a [u8]) -> VInt32Reader<'a> {
-        VInt32Reader { data }
-    }
-}
-
-impl<'a> Iterator for VInt32Reader<'a> {
-    type Item = u32;
-
-    fn next(&mut self) -> Option<u32> {
-        if self.data.is_empty() {
-            None
-        } else {
-            Some(read_u32_vint(&mut self.data))
-        }
-    }
-}
-
 /// Recorder is in charge of recording relevant information about
 /// the presence of a term in a document.
@@ -55,9 +15,9 @@ impl<'a> Iterator for VInt32Reader<'a> {
 /// * the document id
 /// * the term frequency
 /// * the term positions
-pub(crate) trait Recorder: Copy + 'static {
+pub trait Recorder: Copy {
     ///
-    fn new() -> Self;
+    fn new(heap: &mut MemoryArena) -> Self;
     /// Returns the current document
     fn current_doc(&self) -> u32;
     /// Starts recording information about a new document
@@ -69,12 +29,7 @@ pub(crate) trait Recorder: Copy + 'static {
     /// Close the document. It will help record the term frequency.
     fn close_doc(&mut self, heap: &mut MemoryArena);
     /// Pushes the postings information to the serializer.
-    fn serialize(
-        &self,
-        buffer_lender: &mut BufferLender,
-        serializer: &mut FieldSerializer,
-        heap: &MemoryArena,
-    ) -> io::Result<()>;
+    fn serialize(&self, serializer: &mut FieldSerializer, heap: &MemoryArena) -> io::Result<()>;
 }
 
 /// Only records the doc ids
@@ -85,9 +40,9 @@ pub struct NothingRecorder {
 }
 
 impl Recorder for NothingRecorder {
-    fn new() -> Self {
+    fn new(heap: &mut MemoryArena) -> Self {
         NothingRecorder {
-            stack: ExpUnrolledLinkedList::new(),
+            stack: ExpUnrolledLinkedList::new(heap),
             current_doc: u32::max_value(),
         }
     }
@@ -98,23 +53,16 @@ impl Recorder for NothingRecorder {
 
     fn new_doc(&mut self, doc: DocId, heap: &mut MemoryArena) {
         self.current_doc = doc;
-        let _ = write_u32_vint(doc, &mut self.stack.writer(heap));
+        self.stack.push(doc, heap);
     }
 
     fn record_position(&mut self, _position: u32, _heap: &mut MemoryArena) {}
 
     fn close_doc(&mut self, _heap: &mut MemoryArena) {}
 
-    fn serialize(
-        &self,
-        buffer_lender: &mut BufferLender,
-        serializer: &mut FieldSerializer,
-        heap: &MemoryArena,
-    ) -> io::Result<()> {
-        let buffer = buffer_lender.lend_u8();
-        self.stack.read_to_end(heap, buffer);
-        for doc in VInt32Reader::new(&buffer[..]) {
-            serializer.write_doc(doc as u32, 0u32, &[][..])?;
+    fn serialize(&self, serializer: &mut FieldSerializer, heap: &MemoryArena) -> io::Result<()> {
+        for doc in self.stack.iter(heap) {
+            serializer.write_doc(doc, 0u32, &EMPTY_ARRAY)?;
         }
         Ok(())
     }
@@ -129,9 +77,9 @@ pub struct TermFrequencyRecorder {
 }
 
 impl Recorder for TermFrequencyRecorder {
-    fn new() -> Self {
+    fn new(heap: &mut MemoryArena) -> Self {
         TermFrequencyRecorder {
-            stack: ExpUnrolledLinkedList::new(),
+            stack: ExpUnrolledLinkedList::new(heap),
             current_doc: u32::max_value(),
             current_tf: 0u32,
         }
@@ -143,7 +91,7 @@ impl Recorder for TermFrequencyRecorder {
 
     fn new_doc(&mut self, doc: DocId, heap: &mut MemoryArena) {
         self.current_doc = doc;
-        let _ = write_u32_vint(doc, &mut self.stack.writer(heap));
+        self.stack.push(doc, heap);
     }
 
     fn record_position(&mut self, _position: u32, _heap: &mut MemoryArena) {
@@ -152,24 +100,24 @@ impl Recorder for TermFrequencyRecorder {
 
     fn close_doc(&mut self, heap: &mut MemoryArena) {
         debug_assert!(self.current_tf > 0);
-        let _ = write_u32_vint(self.current_tf, &mut self.stack.writer(heap));
+        self.stack.push(self.current_tf, heap);
         self.current_tf = 0;
     }
 
-    fn serialize(
-        &self,
-        buffer_lender: &mut BufferLender,
-        serializer: &mut FieldSerializer,
-        heap: &MemoryArena,
-    ) -> io::Result<()> {
-        let buffer = buffer_lender.lend_u8();
-        self.stack.read_to_end(heap, buffer);
-        let mut u32_it = VInt32Reader::new(&buffer[..]);
-        while let Some(doc) = u32_it.next() {
-            let term_freq = u32_it.next().unwrap_or(self.current_tf);
-            serializer.write_doc(doc as u32, term_freq, &[][..])?;
-        }
+    fn serialize(&self, serializer: &mut FieldSerializer, heap: &MemoryArena) -> io::Result<()> {
+        // the last document has not been closed...
+        // its term freq is self.current_tf.
+        let mut doc_iter = self
+            .stack
+            .iter(heap)
+            .chain(Some(self.current_tf).into_iter());
+
+        while let Some(doc) = doc_iter.next() {
+            let term_freq = doc_iter
+                .next()
+                .expect("The IndexWriter recorded a doc without a term freq.");
+            serializer.write_doc(doc, term_freq, &EMPTY_ARRAY)?;
+        }
         Ok(())
     }
 }
@@ -180,10 +128,11 @@ pub struct TFAndPositionRecorder {
     stack: ExpUnrolledLinkedList,
     current_doc: DocId,
 }
 
 impl Recorder for TFAndPositionRecorder {
-    fn new() -> Self {
+    fn new(heap: &mut MemoryArena) -> Self {
         TFAndPositionRecorder {
-            stack: ExpUnrolledLinkedList::new(),
+            stack: ExpUnrolledLinkedList::new(heap),
             current_doc: u32::max_value(),
         }
     }
@@ -194,88 +143,33 @@ impl Recorder for TFAndPositionRecorder {
 
     fn new_doc(&mut self, doc: DocId, heap: &mut MemoryArena) {
         self.current_doc = doc;
-        let _ = write_u32_vint(doc, &mut self.stack.writer(heap));
+        self.stack.push(doc, heap);
     }
 
     fn record_position(&mut self, position: u32, heap: &mut MemoryArena) {
-        let _ = write_u32_vint(position + 1u32, &mut self.stack.writer(heap));
+        self.stack.push(position, heap);
    }
 
    fn close_doc(&mut self, heap: &mut MemoryArena) {
-        let _ = write_u32_vint(POSITION_END, &mut self.stack.writer(heap));
+        self.stack.push(POSITION_END, heap);
    }
 
-    fn serialize(
-        &self,
-        buffer_lender: &mut BufferLender,
-        serializer: &mut FieldSerializer,
-        heap: &MemoryArena,
-    ) -> io::Result<()> {
-        let (buffer_u8, buffer_positions) = buffer_lender.lend_all();
-        self.stack.read_to_end(heap, buffer_u8);
-        let mut u32_it = VInt32Reader::new(&buffer_u8[..]);
-        while let Some(doc) = u32_it.next() {
-            let mut prev_position_plus_one = 1u32;
-            buffer_positions.clear();
-            loop {
-                match u32_it.next() {
-                    Some(POSITION_END) | None => {
-                        break;
-                    }
-                    Some(position_plus_one) => {
-                        let delta_position = position_plus_one - prev_position_plus_one;
-                        buffer_positions.push(delta_position);
-                        prev_position_plus_one = position_plus_one;
-                    }
-                }
+    fn serialize(&self, serializer: &mut FieldSerializer, heap: &MemoryArena) -> io::Result<()> {
+        let mut doc_positions = Vec::with_capacity(100);
+        let mut positions_iter = self.stack.iter(heap);
+        while let Some(doc) = positions_iter.next() {
+            let mut prev_position = 0;
+            doc_positions.clear();
+            for position in &mut positions_iter {
+                if position == POSITION_END {
+                    break;
+                } else {
+                    doc_positions.push(position - prev_position);
+                    prev_position = position;
+                }
            }
-            serializer.write_doc(doc, buffer_positions.len() as u32, &buffer_positions)?;
+            serializer.write_doc(doc, doc_positions.len() as u32, &doc_positions)?;
        }
        Ok(())
    }
 }
-
-#[cfg(test)]
-mod tests {
-
-    use super::write_u32_vint;
-    use super::BufferLender;
-    use super::VInt32Reader;
-
-    #[test]
-    fn test_buffer_lender() {
-        let mut buffer_lender = BufferLender::default();
-        {
-            let buf = buffer_lender.lend_u8();
-            assert!(buf.is_empty());
-            buf.push(1u8);
-        }
-        {
-            let buf = buffer_lender.lend_u8();
-            assert!(buf.is_empty());
-            buf.push(1u8);
-        }
-        {
-            let (_, buf) = buffer_lender.lend_all();
-            assert!(buf.is_empty());
-            buf.push(1u32);
-        }
-        {
-            let (_, buf) = buffer_lender.lend_all();
-            assert!(buf.is_empty());
-            buf.push(1u32);
-        }
-    }
-
-    #[test]
-    fn test_vint_u32() {
-        let mut buffer = vec![];
-        let vals = [0, 1, 324_234_234, u32::max_value()];
-        for &i in &vals {
-            assert!(write_u32_vint(i, &mut buffer).is_ok());
-        }
-        assert_eq!(buffer.len(), 1 + 1 + 5 + 5);
-        let res: Vec<u32> = VInt32Reader::new(&buffer[..]).collect();
-        assert_eq!(&res[..], &vals[..]);
-    }
-}
@@ -2,21 +2,22 @@ use common::BitSet;
 use common::HasLen;
 use common::{BinarySerializable, VInt};
 use docset::{DocSet, SkipResult};
+use fst::Streamer;
 use owned_read::OwnedRead;
 use positions::PositionReader;
 use postings::compression::compressed_block_size;
 use postings::compression::{BlockDecoder, VIntDecoder, COMPRESSION_BLOCK_SIZE};
 use postings::serializer::PostingsSerializer;
-use postings::BlockSearcher;
 use postings::FreqReadingOption;
 use postings::Postings;
 use postings::SkipReader;
 use postings::USE_SKIP_INFO_LIMIT;
 use schema::IndexRecordOption;
 use std::cmp::Ordering;
-use tantivy_fst::Streamer;
 use DocId;
 
+const EMPTY_ARR: [u8; 0] = [];
+
 struct PositionComputer {
     // store the amount of position int
     // before reading positions.
@@ -61,7 +62,6 @@ pub struct SegmentPostings {
     block_cursor: BlockSegmentPostings,
     cur: usize,
     position_computer: Option<PositionComputer>,
-    block_searcher: BlockSearcher,
 }
 
 impl SegmentPostings {
@@ -72,7 +72,6 @@ impl SegmentPostings {
             block_cursor: empty_block_cursor,
             cur: COMPRESSION_BLOCK_SIZE,
             position_computer: None,
-            block_searcher: BlockSearcher::default(),
         }
     }
 
@@ -120,31 +119,46 @@ impl SegmentPostings {
             block_cursor: segment_block_postings,
             cur: COMPRESSION_BLOCK_SIZE, // cursor within the block
             position_computer: positions_stream_opt.map(PositionComputer::new),
-            block_searcher: BlockSearcher::default(),
         }
     }
 }
 
-impl DocSet for SegmentPostings {
-    // goes to the next element.
-    // next needs to be called a first time to point to the correct element.
-    #[inline]
-    fn advance(&mut self) -> bool {
-        if self.position_computer.is_some() {
-            let term_freq = self.term_freq() as usize;
-            self.position_computer.as_mut().unwrap().add_skip(term_freq);
+fn exponential_search(target: u32, arr: &[u32]) -> (usize, usize) {
+    let mut start = 0;
+    let end = arr.len();
+    debug_assert!(target >= arr[start]);
+    debug_assert!(target <= arr[end - 1]);
+    let mut jump = 1;
+    loop {
+        let new = start + jump;
+        if new >= end {
+            return (start, end);
         }
-        self.cur += 1;
-        if self.cur >= self.block_cursor.block_len() {
-            self.cur = 0;
-            if !self.block_cursor.advance() {
-                self.cur = COMPRESSION_BLOCK_SIZE;
-                return false;
-            }
+        if arr[new] > target {
+            return (start, new);
         }
-        true
+        start = new;
+        jump *= 2;
     }
+}
+
+/// Search the first index containing an element greater or equal to the target.
+///
+/// # Assumption
+///
+/// The array is assumed non empty.
+/// The target is assumed greater or equal to the first element.
+/// The target is assumed smaller or equal to the last element.
+fn search_within_block(block_docs: &[u32], target: u32) -> usize {
+    let (start, end) = exponential_search(target, block_docs);
+    start.wrapping_add(
+        block_docs[start..end]
+            .binary_search(&target)
+            .unwrap_or_else(|e| e),
+    )
+}
+
+impl DocSet for SegmentPostings {
     fn skip_next(&mut self, target: DocId) -> SkipResult {
         if !self.advance() {
             return SkipResult::End;
@@ -202,9 +216,11 @@ impl DocSet for SegmentPostings {
 
             // we're in the right block now, start with an exponential search
             let block_docs = self.block_cursor.docs();
+
+            debug_assert!(target >= self.doc());
             let new_cur = self
-                .block_searcher
-                .search_in_block(&block_docs, self.cur, target);
+                .cur
+                .wrapping_add(search_within_block(&block_docs[self.cur..], target));
             if need_positions {
                 sum_freqs_skipped += self.block_cursor.freqs()[self.cur..new_cur]
                     .iter()
@@ -226,6 +242,29 @@ impl DocSet for SegmentPostings {
         }
     }
 
+    // goes to the next element.
+    // next needs to be called a first time to point to the correct element.
+    #[inline]
+    fn advance(&mut self) -> bool {
+        if self.position_computer.is_some() {
+            let term_freq = self.term_freq() as usize;
+            self.position_computer.as_mut().unwrap().add_skip(term_freq);
+        }
+        self.cur += 1;
+        if self.cur >= self.block_cursor.block_len() {
+            self.cur = 0;
+            if !self.block_cursor.advance() {
+                self.cur = COMPRESSION_BLOCK_SIZE;
+                return false;
+            }
+        }
+        true
+    }
+
+    fn size_hint(&self) -> u32 {
+        self.len() as u32
+    }
+
     /// Return the current document's `DocId`.
     #[inline]
     fn doc(&self) -> DocId {
@@ -237,10 +276,6 @@ impl DocSet for SegmentPostings {
         docs[self.cur]
     }
 
-    fn size_hint(&self) -> u32 {
-        self.len() as u32
-    }
-
     fn append_to_bitset(&mut self, bitset: &mut BitSet) {
         // finish the current block
         if self.advance() {
@@ -339,7 +374,7 @@ impl BlockSegmentPostings {
         let (skip_data_opt, postings_data) = split_into_skips_and_postings(doc_freq, data);
         let skip_reader = match skip_data_opt {
             Some(skip_data) => SkipReader::new(skip_data, record_option),
-            None => SkipReader::new(OwnedRead::new(&[][..]), record_option),
+            None => SkipReader::new(OwnedRead::new(&EMPTY_ARR[..]), record_option),
         };
         let doc_freq = doc_freq as usize;
         let num_vint_docs = doc_freq % COMPRESSION_BLOCK_SIZE;
@@ -373,7 +408,7 @@ impl BlockSegmentPostings {
         if let Some(skip_data) = skip_data_opt {
             self.skip_reader.reset(skip_data);
         } else {
-            self.skip_reader.reset(OwnedRead::new(&[][..]))
+            self.skip_reader.reset(OwnedRead::new(&EMPTY_ARR[..]))
         }
         self.doc_offset = 0;
         self.doc_freq = doc_freq as usize;
@@ -498,8 +533,7 @@ impl BlockSegmentPostings {
                 } else {
                     BlockSegmentPostingsSkipResult::Terminated
                 }
-            })
-            .unwrap_or(BlockSegmentPostingsSkipResult::Terminated);
+            }).unwrap_or(BlockSegmentPostingsSkipResult::Terminated);
         }
         BlockSegmentPostingsSkipResult::Terminated
     }
@@ -586,19 +620,20 @@ impl<'b> Streamer<'b> for BlockSegmentPostings {
 
 #[cfg(test)]
 mod tests {
 
+    use super::search_within_block;
     use super::BlockSegmentPostings;
     use super::BlockSegmentPostingsSkipResult;
     use super::SegmentPostings;
     use common::HasLen;
     use core::Index;
     use docset::DocSet;
+    use fst::Streamer;
     use schema::IndexRecordOption;
-    use schema::Schema;
+    use schema::SchemaBuilder;
     use schema::Term;
-    use schema::INDEXED;
-    use tantivy_fst::Streamer;
+    use schema::INT_INDEXED;
     use DocId;
-    use SkipResult;
 
     #[test]
     fn test_empty_segment_postings() {
@@ -615,9 +650,49 @@ mod tests {
         assert_eq!(postings.doc_freq(), 0);
     }
 
+    fn search_within_block_trivial_but_slow(block: &[u32], target: u32) -> usize {
+        block
+            .iter()
+            .cloned()
+            .enumerate()
+            .filter(|&(_, ref val)| *val >= target)
+            .next()
+            .unwrap()
+            .0
+    }
+
+    fn util_test_search_within_block(block: &[u32], target: u32) {
+        assert_eq!(
+            search_within_block(block, target),
+            search_within_block_trivial_but_slow(block, target)
+        );
+    }
+
+    fn util_test_search_within_block_all(block: &[u32]) {
+        use std::collections::HashSet;
+        let mut targets = HashSet::new();
+        for (i, val) in block.iter().cloned().enumerate() {
+            if i > 0 {
+                targets.insert(val - 1);
+            }
+            targets.insert(val);
+        }
+        for target in targets {
+            util_test_search_within_block(block, target);
+        }
+    }
+
+    #[test]
+    fn test_search_within_block() {
+        for len in 1u32..128u32 {
+            let v: Vec<u32> = (0..len).map(|i| i * 2).collect();
+            util_test_search_within_block_all(&v[..]);
+        }
+    }
+
     #[test]
     fn test_block_segment_postings() {
-        let mut block_segments = build_block_postings(&(0..100_000).collect::<Vec<u32>>());
+        let mut block_segments = build_block_postings((0..100_000).collect::<Vec<u32>>());
         let mut offset: u32 = 0u32;
         // checking that the block before calling advance is empty
         assert!(block_segments.docs().is_empty());
@@ -631,44 +706,14 @@ mod tests {
         }
     }
 
-    #[test]
-    fn test_skip_right_at_new_block() {
-        let mut doc_ids = (0..128).collect::<Vec<u32>>();
-        doc_ids.push(129);
-        doc_ids.push(130);
-        {
-            let block_segments = build_block_postings(&doc_ids);
-            let mut docset = SegmentPostings::from_block_postings(block_segments, None);
-            assert_eq!(docset.skip_next(128), SkipResult::OverStep);
-            assert_eq!(docset.doc(), 129);
-            assert!(docset.advance());
-            assert_eq!(docset.doc(), 130);
-            assert!(!docset.advance());
-        }
-        {
-            let block_segments = build_block_postings(&doc_ids);
-            let mut docset = SegmentPostings::from_block_postings(block_segments, None);
-            assert_eq!(docset.skip_next(129), SkipResult::Reached);
-            assert_eq!(docset.doc(), 129);
-            assert!(docset.advance());
-            assert_eq!(docset.doc(), 130);
-            assert!(!docset.advance());
-        }
-        {
-            let block_segments = build_block_postings(&doc_ids);
-            let mut docset = SegmentPostings::from_block_postings(block_segments, None);
-            assert_eq!(docset.skip_next(131), SkipResult::End);
-        }
-    }
-
-    fn build_block_postings(docs: &[DocId]) -> BlockSegmentPostings {
-        let mut schema_builder = Schema::builder();
-        let int_field = schema_builder.add_u64_field("id", INDEXED);
+    fn build_block_postings(docs: Vec<DocId>) -> BlockSegmentPostings {
+        let mut schema_builder = SchemaBuilder::default();
+        let int_field = schema_builder.add_u64_field("id", INT_INDEXED);
         let schema = schema_builder.build();
         let index = Index::create_in_ram(schema);
-        let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
+        let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
         let mut last_doc = 0u32;
-        for &doc in docs {
+        for doc in docs {
             for _ in last_doc..doc {
                 index_writer.add_document(doc!(int_field=>1u64));
             }
@@ -676,7 +721,8 @@ mod tests {
             last_doc = doc + 1;
         }
         index_writer.commit().unwrap();
-        let searcher = index.reader().unwrap().searcher();
+        index.load_searchers().unwrap();
+        let searcher = index.searcher();
         let segment_reader = searcher.segment_reader(0);
         let inverted_index = segment_reader.inverted_index(int_field);
         let term = Term::from_field_u64(int_field, 0u64);
@@ -687,7 +733,7 @@ mod tests {
     #[test]
     fn test_block_segment_postings_skip() {
         for i in 0..4 {
-            let mut block_postings = build_block_postings(&[3]);
+            let mut block_postings = build_block_postings(vec![3]);
             assert_eq!(
                 block_postings.skip_to(i),
                 BlockSegmentPostingsSkipResult::Success(0u32)
@@ -697,7 +743,7 @@ mod tests {
                 BlockSegmentPostingsSkipResult::Terminated
             );
         }
-        let mut block_postings = build_block_postings(&[3]);
+        let mut block_postings = build_block_postings(vec![3]);
         assert_eq!(
             block_postings.skip_to(4u32),
             BlockSegmentPostingsSkipResult::Terminated
@@ -710,7 +756,7 @@ mod tests {
         for i in 0..1300 {
             docs.push((i * i / 100) + i);
         }
-        let mut block_postings = build_block_postings(&docs[..]);
+        let mut block_postings = build_block_postings(docs.clone());
        for i in vec![0, 424, 10000] {
            assert_eq!(
                block_postings.skip_to(i),
@@ -732,11 +778,11 @@ mod tests {
 
    #[test]
    fn test_reset_block_segment_postings() {
-        let mut schema_builder = Schema::builder();
-        let int_field = schema_builder.add_u64_field("id", INDEXED);
+        let mut schema_builder = SchemaBuilder::default();
+        let int_field = schema_builder.add_u64_field("id", INT_INDEXED);
        let schema = schema_builder.build();
        let index = Index::create_in_ram(schema);
-        let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
+        let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
        // create two postings list, one containg even number,
        // the other containing odd numbers.
        for i in 0..6 {
@@ -744,7 +790,8 @@ mod tests {
            index_writer.add_document(doc);
        }
        index_writer.commit().unwrap();
-        let searcher = index.reader().unwrap().searcher();
+        index.load_searchers().unwrap();
+        let searcher = index.searcher();
        let segment_reader = searcher.segment_reader(0);
 
        let mut block_segments;
|
|||||||
@@ -55,7 +55,7 @@ pub struct InvertedIndexSerializer {

 impl InvertedIndexSerializer {
     /// Open a new `PostingsSerializer` for the given segment
-    fn create(
+    fn new(
         terms_write: CompositeWrite<WritePtr>,
         postings_write: CompositeWrite<WritePtr>,
         positions_write: CompositeWrite<WritePtr>,
@@ -74,7 +74,7 @@ impl InvertedIndexSerializer {
     /// Open a new `PostingsSerializer` for the given segment
     pub fn open(segment: &mut Segment) -> Result<InvertedIndexSerializer> {
         use SegmentComponent::{POSITIONS, POSITIONSSKIP, POSTINGS, TERMS};
-        InvertedIndexSerializer::create(
+        InvertedIndexSerializer::new(
             CompositeWrite::wrap(segment.open_write(TERMS)?),
             CompositeWrite::wrap(segment.open_write(POSTINGS)?),
             CompositeWrite::wrap(segment.open_write(POSITIONS)?),
@@ -99,7 +99,7 @@ impl InvertedIndexSerializer {
         let positions_write = self.positions_write.for_field(field);
         let positionsidx_write = self.positionsidx_write.for_field(field);
         let field_type: FieldType = (*field_entry.field_type()).clone();
-        FieldSerializer::create(
+        FieldSerializer::new(
             &field_type,
             term_dictionary_write,
             postings_write,
@@ -130,7 +130,7 @@ pub struct FieldSerializer<'a> {
 }

 impl<'a> FieldSerializer<'a> {
-    fn create(
+    fn new(
         field_type: &FieldType,
         term_dictionary_write: &'a mut CountingWriter<WritePtr>,
         postings_write: &'a mut CountingWriter<WritePtr>,
@@ -152,7 +152,7 @@ impl<'a> FieldSerializer<'a> {
             _ => (false, false),
         };
         let term_dictionary_builder =
-            TermDictionaryBuilder::create(term_dictionary_write, &field_type)?;
+            TermDictionaryBuilder::new(term_dictionary_write, &field_type)?;
         let postings_serializer =
             PostingsSerializer::new(postings_write, term_freq_enabled, position_enabled);
         let positions_serializer_opt = if position_enabled {
@@ -1,37 +1,28 @@
 use super::{Addr, MemoryArena};

-use postings::stacker::memory_arena::load;
-use postings::stacker::memory_arena::store;
-use std::io;
+use common::is_power_of_2;
 use std::mem;

 const MAX_BLOCK_LEN: u32 = 1u32 << 15;
-const FIRST_BLOCK: usize = 16;
-const INLINED_BLOCK_LEN: usize = FIRST_BLOCK + mem::size_of::<Addr>();
+const FIRST_BLOCK: u32 = 4u32;

-enum CapacityResult {
-    Available(u32),
-    NeedAlloc(u32),
-}
-
-fn len_to_capacity(len: u32) -> CapacityResult {
+#[inline]
+pub fn jump_needed(len: u32) -> Option<usize> {
     match len {
-        0...15 => CapacityResult::Available(FIRST_BLOCK as u32 - len),
-        16...MAX_BLOCK_LEN => {
-            let cap = 1 << (32u32 - (len - 1u32).leading_zeros());
-            let available = cap - len;
-            if available == 0 {
-                CapacityResult::NeedAlloc(len)
+        0...3 => None,
+        4...MAX_BLOCK_LEN => {
+            if is_power_of_2(len as usize) {
+                Some(len as usize)
             } else {
-                CapacityResult::Available(available)
+                None
             }
         }
         n => {
-            let available = n % MAX_BLOCK_LEN;
-            if available == 0 {
-                CapacityResult::NeedAlloc(MAX_BLOCK_LEN)
+            if n % MAX_BLOCK_LEN == 0 {
+                Some(MAX_BLOCK_LEN as usize)
             } else {
-                CapacityResult::Available(MAX_BLOCK_LEN - available)
+                None
             }
         }
     }
@@ -61,119 +52,82 @@ fn len_to_capacity(len: u32) -> CapacityResult {
 #[derive(Debug, Clone, Copy)]
 pub struct ExpUnrolledLinkedList {
     len: u32,
+    head: Addr,
     tail: Addr,
-    inlined_data: [u8; INLINED_BLOCK_LEN as usize],
-}
-
-pub struct ExpUnrolledLinkedListWriter<'a> {
-    eull: &'a mut ExpUnrolledLinkedList,
-    heap: &'a mut MemoryArena,
-}
-
-fn ensure_capacity<'a>(
-    eull: &'a mut ExpUnrolledLinkedList,
-    heap: &'a mut MemoryArena,
-) -> &'a mut [u8] {
-    if eull.len <= FIRST_BLOCK as u32 {
-        // We are still hitting the inline block.
-        if eull.len < FIRST_BLOCK as u32 {
-            return &mut eull.inlined_data[eull.len as usize..FIRST_BLOCK];
-        }
-        // We need to allocate a new block!
-        let new_block_addr: Addr = heap.allocate_space(FIRST_BLOCK + mem::size_of::<Addr>());
-        store(&mut eull.inlined_data[FIRST_BLOCK..], new_block_addr);
-        eull.tail = new_block_addr;
-        return heap.slice_mut(eull.tail, FIRST_BLOCK);
-    }
-    let len = match len_to_capacity(eull.len) {
-        CapacityResult::NeedAlloc(new_block_len) => {
-            let new_block_addr: Addr =
-                heap.allocate_space(new_block_len as usize + mem::size_of::<Addr>());
-            heap.write_at(eull.tail, new_block_addr);
-            eull.tail = new_block_addr;
-            new_block_len
-        }
-        CapacityResult::Available(available) => available,
-    };
-    heap.slice_mut(eull.tail, len as usize)
-}
-
-impl<'a> ExpUnrolledLinkedListWriter<'a> {
-    pub fn extend_from_slice(&mut self, mut buf: &[u8]) {
-        if buf.is_empty() {
-            // we need to cut early, because `ensure_capacity`
-            // allocates if there is no capacity at all right now.
-            return;
-        }
-        while !buf.is_empty() {
-            let add_len: usize;
-            {
-                let output_buf = ensure_capacity(self.eull, self.heap);
-                add_len = buf.len().min(output_buf.len());
-                output_buf[..add_len].copy_from_slice(&buf[..add_len]);
-            }
-            self.eull.len += add_len as u32;
-            self.eull.tail = self.eull.tail.offset(add_len as u32);
-            buf = &buf[add_len..];
-        }
-    }
-}
-
-impl<'a> io::Write for ExpUnrolledLinkedListWriter<'a> {
-    fn write(&mut self, buf: &[u8]) -> io::Result<usize> {
-        // There is no use case to only write the capacity.
-        // This is not IO after all, so we write the whole
-        // buffer even if the contract of `.write` is looser.
-        self.extend_from_slice(buf);
-        Ok(buf.len())
-    }
-
-    fn write_all(&mut self, buf: &[u8]) -> io::Result<()> {
-        self.extend_from_slice(buf);
-        Ok(())
-    }
-
-    fn flush(&mut self) -> io::Result<()> {
-        Ok(())
-    }
 }

 impl ExpUnrolledLinkedList {
-    pub fn new() -> ExpUnrolledLinkedList {
+    pub fn new(heap: &mut MemoryArena) -> ExpUnrolledLinkedList {
+        let addr = heap.allocate_space((FIRST_BLOCK as usize) * mem::size_of::<u32>());
         ExpUnrolledLinkedList {
             len: 0u32,
-            tail: Addr::null_pointer(),
-            inlined_data: [0u8; INLINED_BLOCK_LEN as usize],
+            head: addr,
+            tail: addr,
         }
     }

-    #[inline(always)]
-    pub fn writer<'a>(&'a mut self, heap: &'a mut MemoryArena) -> ExpUnrolledLinkedListWriter<'a> {
-        ExpUnrolledLinkedListWriter { eull: self, heap }
+    pub fn iter<'a>(&self, heap: &'a MemoryArena) -> ExpUnrolledLinkedListIterator<'a> {
+        ExpUnrolledLinkedListIterator {
+            heap,
+            addr: self.head,
+            len: self.len,
+            consumed: 0,
+        }
     }

-    pub fn read_to_end(&self, heap: &MemoryArena, output: &mut Vec<u8>) {
-        let len = self.len as usize;
-        if len <= FIRST_BLOCK {
-            output.extend_from_slice(&self.inlined_data[..len]);
-            return;
-        }
-        output.extend_from_slice(&self.inlined_data[..FIRST_BLOCK]);
-        let mut cur = FIRST_BLOCK;
-        let mut addr = load(&self.inlined_data[FIRST_BLOCK..]);
-        loop {
-            let cap = match len_to_capacity(cur as u32) {
-                CapacityResult::Available(capacity) => capacity,
-                CapacityResult::NeedAlloc(capacity) => capacity,
-            } as usize;
-            let data = heap.slice(addr, cap);
-            if cur + cap >= len {
-                output.extend_from_slice(&data[..(len - cur)]);
-                return;
-            }
-            output.extend_from_slice(data);
-            cur += cap;
-            addr = heap.read(addr.offset(cap as u32));
+    /// Appends a new element to the current stack.
+    ///
+    /// If the current block end is reached, a new block is allocated.
+    pub fn push(&mut self, val: u32, heap: &mut MemoryArena) {
+        self.len += 1;
+        if let Some(new_block_len) = jump_needed(self.len) {
+            // We need to allocate another block.
+            // We also allocate an extra `u32` to store the pointer
+            // to the future next block.
+            let new_block_size: usize = (new_block_len + 1) * mem::size_of::<u32>();
+            let new_block_addr: Addr = heap.allocate_space(new_block_size);
+            unsafe {
+                // logic
+                heap.write(self.tail, new_block_addr)
+            };
+            self.tail = new_block_addr;
+        }
+        unsafe {
+            // logic
+            heap.write(self.tail, val);
+            self.tail = self.tail.offset(mem::size_of::<u32>() as u32);
+        }
+    }
+}
+
+pub struct ExpUnrolledLinkedListIterator<'a> {
+    heap: &'a MemoryArena,
+    addr: Addr,
+    len: u32,
+    consumed: u32,
+}
+
+impl<'a> Iterator for ExpUnrolledLinkedListIterator<'a> {
+    type Item = u32;
+
+    fn next(&mut self) -> Option<u32> {
+        if self.consumed == self.len {
+            None
+        } else {
+            self.consumed += 1;
+            let addr: Addr = if jump_needed(self.consumed).is_some() {
+                unsafe {
+                    // logic
+                    self.heap.read(self.addr)
+                }
+            } else {
+                self.addr
+            };
+            self.addr = addr.offset(mem::size_of::<u32>() as u32);
+            Some(unsafe {
+                // logic
+                self.heap.read(addr)
+            })
         }
     }
 }
@@ -182,134 +136,46 @@ impl ExpUnrolledLinkedList {
 mod tests {

     use super::super::MemoryArena;
-    use super::len_to_capacity;
+    use super::jump_needed;
     use super::*;
-    use byteorder::{ByteOrder, LittleEndian, WriteBytesExt};

     #[test]
     fn test_stack() {
         let mut heap = MemoryArena::new();
-        let mut stack = ExpUnrolledLinkedList::new();
-        stack.writer(&mut heap).extend_from_slice(&[1u8]);
-        stack.writer(&mut heap).extend_from_slice(&[2u8]);
-        stack.writer(&mut heap).extend_from_slice(&[3u8, 4u8]);
-        stack.writer(&mut heap).extend_from_slice(&[5u8]);
+        let mut stack = ExpUnrolledLinkedList::new(&mut heap);
+        stack.push(1u32, &mut heap);
+        stack.push(2u32, &mut heap);
+        stack.push(4u32, &mut heap);
+        stack.push(8u32, &mut heap);
         {
-            let mut buffer = Vec::new();
-            stack.read_to_end(&heap, &mut buffer);
-            assert_eq!(&buffer[..], &[1u8, 2u8, 3u8, 4u8, 5u8]);
+            let mut it = stack.iter(&heap);
+            assert_eq!(it.next().unwrap(), 1u32);
+            assert_eq!(it.next().unwrap(), 2u32);
+            assert_eq!(it.next().unwrap(), 4u32);
+            assert_eq!(it.next().unwrap(), 8u32);
+            assert!(it.next().is_none());
         }
     }

-    #[test]
-    fn test_stack_long() {
-        let mut heap = MemoryArena::new();
-        let mut stack = ExpUnrolledLinkedList::new();
-        let source: Vec<u32> = (0..100).collect();
-        for &el in &source {
-            assert!(stack
-                .writer(&mut heap)
-                .write_u32::<LittleEndian>(el)
-                .is_ok());
-        }
-        let mut buffer = Vec::new();
-        stack.read_to_end(&heap, &mut buffer);
-        let mut result = vec![];
-        let mut remaining = &buffer[..];
-        while !remaining.is_empty() {
-            result.push(LittleEndian::read_u32(&remaining[..4]));
-            remaining = &remaining[4..];
-        }
-        assert_eq!(&result[..], &source[..]);
-    }
-
-    #[test]
-    fn test_stack_interlaced() {
-        let mut heap = MemoryArena::new();
-        let mut stack = ExpUnrolledLinkedList::new();
-        let mut stack2 = ExpUnrolledLinkedList::new();
-
-        let mut vec1: Vec<u8> = vec![];
-        let mut vec2: Vec<u8> = vec![];
-
-        for i in 0..9 {
-            assert!(stack.writer(&mut heap).write_u32::<LittleEndian>(i).is_ok());
-            assert!(vec1.write_u32::<LittleEndian>(i).is_ok());
-            if i % 2 == 0 {
-                assert!(stack2
-                    .writer(&mut heap)
-                    .write_u32::<LittleEndian>(i)
-                    .is_ok());
-                assert!(vec2.write_u32::<LittleEndian>(i).is_ok());
-            }
-        }
-        let mut res1 = vec![];
-        let mut res2 = vec![];
-        stack.read_to_end(&heap, &mut res1);
-        stack2.read_to_end(&heap, &mut res2);
-        assert_eq!(&vec1[..], &res1[..]);
-        assert_eq!(&vec2[..], &res2[..]);
-    }
-
     #[test]
     fn test_jump_if_needed() {
-        let mut available = 16u32;
-        for i in 0..10_000_000 {
-            match len_to_capacity(i) {
-                CapacityResult::NeedAlloc(cap) => {
-                    assert_eq!(available, 0, "Failed len={}: Expected 0 got {}", i, cap);
-                    available = cap;
-                }
-                CapacityResult::Available(cap) => {
-                    assert_eq!(
-                        available, cap,
-                        "Failed len={}: Expected {} Got {}",
-                        i, available, cap
-                    );
-                }
-            }
-            available -= 1;
+        let mut block_len = 4u32;
+        let mut i = 0;
+        while i < 10_000_000 {
+            assert!(jump_needed(i + block_len - 1).is_none());
+            assert!(jump_needed(i + block_len + 1).is_none());
+            assert!(jump_needed(i + block_len).is_some());
+            let new_block_len = jump_needed(i + block_len).unwrap();
+            i += block_len;
+            block_len = new_block_len as u32;
         }
     }
-
-    #[test]
-    fn test_jump_if_needed_progression() {
-        let mut v = vec![];
-        for i in 0.. {
-            if v.len() >= 10 {
-                break;
-            }
-            match len_to_capacity(i) {
-                CapacityResult::NeedAlloc(cap) => {
-                    v.push((i, cap));
-                }
-                _ => {}
-            }
-        }
-        assert_eq!(
-            &v[..],
-            &[
-                (16, 16),
-                (32, 32),
-                (64, 64),
-                (128, 128),
-                (256, 256),
-                (512, 512),
-                (1024, 1024),
-                (2048, 2048),
-                (4096, 4096),
-                (8192, 8192)
-            ]
-        );
-    }
 }

 #[cfg(all(test, feature = "unstable"))]
 mod bench {
-    use super::super::MemoryArena;
     use super::ExpUnrolledLinkedList;
-    use byteorder::{NativeEndian, WriteBytesExt};
+    use tantivy_memory_arena::MemoryArena;
     use test::Bencher;

     const NUM_STACK: usize = 10_000;
@@ -333,19 +199,20 @@ mod bench {

     #[bench]
     fn bench_push_stack(bench: &mut Bencher) {
+        let heap = MemoryArena::new();
         bench.iter(|| {
-            let mut heap = MemoryArena::new();
             let mut stacks = Vec::with_capacity(100);
             for _ in 0..NUM_STACK {
-                let mut stack = ExpUnrolledLinkedList::new();
+                let (_, stack) = heap.allocate_object::<ExpUnrolledLinkedList>();
                 stacks.push(stack);
             }
             for s in 0..NUM_STACK {
                 for i in 0u32..STACK_SIZE {
                     let t = s * 392017 % NUM_STACK;
-                    let _ = stacks[t].writer(&mut heap).write_u32::<NativeEndian>(i);
+                    stacks[t].push(i, &heap);
                 }
             }
+            heap.clear();
         });
     }
 }
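Aside: the block-growth rule behind jump_needed above allocates blocks of 4 values, doubles the block size at every power of two up to 2^15 values, and then keeps allocating fixed 2^15-value blocks. A minimal standalone sketch of that rule, with names local to this example rather than taken from the crate:

const FIRST_BLOCK: u32 = 4;
const MAX_BLOCK_LEN: u32 = 1 << 15;

/// Returns the size of the next block to allocate when `len` elements
/// have been pushed and the current block is exactly full, `None` otherwise.
fn next_block_len(len: u32) -> Option<usize> {
    if len < FIRST_BLOCK {
        // Still inside the very first 4-element block.
        None
    } else if len <= MAX_BLOCK_LEN {
        // Blocks double: a jump happens exactly at 4, 8, 16, ..., 32768 elements.
        if len.is_power_of_two() {
            Some(len as usize)
        } else {
            None
        }
    } else if len % MAX_BLOCK_LEN == 0 {
        // Past 32768 elements, blocks stay at the maximum size.
        Some(MAX_BLOCK_LEN as usize)
    } else {
        None
    }
}

fn main() {
    assert_eq!(next_block_len(3), None);
    assert_eq!(next_block_len(4), Some(4));
    assert_eq!(next_block_len(8), Some(8));
    assert_eq!(next_block_len(9), None);
    assert_eq!(next_block_len(1 << 15), Some(1 << 15));
    assert_eq!(next_block_len(3 * (1 << 15)), Some(1 << 15));
}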
@@ -37,7 +37,7 @@ const PAGE_SIZE: usize = 1 << NUM_BITS_PAGE_ADDR; // pages are 1 MB large
 /// page of memory.
 ///
 /// The last 20 bits are an address within this page of memory.
-#[derive(Copy, Clone, Debug)]
+#[derive(Clone, Copy, Debug)]
 pub struct Addr(u32);

 impl Addr {
@@ -69,16 +69,32 @@ impl Addr {
     }
 }

-pub fn store<Item: Copy + 'static>(dest: &mut [u8], val: Item) {
-    assert_eq!(dest.len(), std::mem::size_of::<Item>());
-    unsafe {
-        ptr::write_unaligned(dest.as_mut_ptr() as *mut Item, val);
-    }
+/// Trait required for an object to be `storable`.
+///
+/// # Warning
+///
+/// Most of the time you should not implement this trait,
+/// and only use the `MemoryArena` with object implementing `Copy`.
+///
+/// `ArenaStorable` is used in `tantivy` to force
+/// a `Copy` object and a `slice` of data to be stored contiguously.
+pub trait ArenaStorable {
+    fn num_bytes(&self) -> usize;
+    unsafe fn write_into(self, arena: &mut MemoryArena, addr: Addr);
 }

-pub fn load<Item: Copy + 'static>(data: &[u8]) -> Item {
-    assert_eq!(data.len(), std::mem::size_of::<Item>());
-    unsafe { ptr::read_unaligned(data.as_ptr() as *const Item) }
+impl<V> ArenaStorable for V
+where
+    V: Copy,
+{
+    fn num_bytes(&self) -> usize {
+        mem::size_of::<V>()
+    }
+
+    unsafe fn write_into(self, arena: &mut MemoryArena, addr: Addr) {
+        let dst_ptr = arena.get_mut_ptr(addr) as *mut V;
+        ptr::write_unaligned(dst_ptr, self);
+    }
 }

 /// The `MemoryArena`
@@ -110,9 +126,47 @@ impl MemoryArena {
         self.pages.len() * PAGE_SIZE
     }

-    pub fn write_at<Item: Copy + 'static>(&mut self, addr: Addr, val: Item) {
-        let dest = self.slice_mut(addr, std::mem::size_of::<Item>());
-        store(dest, val);
+    /// Writes a slice at the given address, assuming the
+    /// memory was allocated beforehands.
+    ///
+    /// # Panics
+    ///
+    /// May panic or corrupt the heap if he space was not
+    /// properly allocated beforehands.
+    pub fn write_bytes<B: AsRef<[u8]>>(&mut self, addr: Addr, data: B) {
+        let bytes = data.as_ref();
+        self.pages[addr.page_id()]
+            .get_mut_slice(addr.page_local_addr(), bytes.len())
+            .copy_from_slice(bytes);
+    }
+
+    /// Returns the `len` bytes starting at `addr`
+    ///
+    /// # Panics
+    ///
+    /// Panics if the memory has not been allocated beforehands.
+    pub fn read_slice(&self, addr: Addr, len: usize) -> &[u8] {
+        self.pages[addr.page_id()].get_slice(addr.page_local_addr(), len)
+    }
+
+    unsafe fn get_mut_ptr(&mut self, addr: Addr) -> *mut u8 {
+        self.pages[addr.page_id()].get_mut_ptr(addr.page_local_addr())
+    }
+
+    /// Stores an item's data in the heap
+    ///
+    /// It allocates the `Item` beforehands.
+    pub fn store<Item: ArenaStorable>(&mut self, val: Item) -> Addr {
+        let num_bytes = val.num_bytes();
+        let addr = self.allocate_space(num_bytes);
+        unsafe {
+            self.write(addr, val);
+        };
+        addr
+    }
+
+    pub unsafe fn write<Item: ArenaStorable>(&mut self, addr: Addr, val: Item) {
+        val.write_into(self, addr)
     }

     /// Read an item in the heap at the given `address`.
@@ -120,21 +174,9 @@ impl MemoryArena {
     /// # Panics
     ///
     /// If the address is erroneous
-    pub fn read<Item: Copy + 'static>(&self, addr: Addr) -> Item {
-        load(self.slice(addr, mem::size_of::<Item>()))
-    }
-
-    pub fn slice(&self, addr: Addr, len: usize) -> &[u8] {
-        self.pages[addr.page_id()].slice(addr.page_local_addr(), len)
-    }
-
-    pub fn slice_from(&self, addr: Addr) -> &[u8] {
-        self.pages[addr.page_id()].slice_from(addr.page_local_addr())
-    }
-
-    #[inline(always)]
-    pub fn slice_mut(&mut self, addr: Addr, len: usize) -> &mut [u8] {
-        self.pages[addr.page_id()].slice_mut(addr.page_local_addr(), len)
+    pub unsafe fn read<Item: Copy>(&self, addr: Addr) -> Item {
+        let ptr = self.pages[addr.page_id()].get_ptr(addr.page_local_addr());
+        ptr::read_unaligned(ptr as *const Item)
     }

     /// Allocates `len` bytes and returns the allocated address.
@@ -155,10 +197,14 @@ struct Page {

 impl Page {
     fn new(page_id: usize) -> Page {
+        let mut data: Vec<u8> = Vec::with_capacity(PAGE_SIZE);
+        unsafe {
+            data.set_len(PAGE_SIZE);
+        } // avoid initializing page
         Page {
             page_id,
             len: 0,
-            data: vec![0u8; PAGE_SIZE].into_boxed_slice(),
+            data: data.into_boxed_slice(),
         }
     }

@@ -167,18 +213,14 @@ impl Page {
         len + self.len <= PAGE_SIZE
     }

-    fn slice(&self, local_addr: usize, len: usize) -> &[u8] {
-        &self.slice_from(local_addr)[..len]
-    }
-
-    fn slice_from(&self, local_addr: usize) -> &[u8] {
-        &self.data[local_addr..]
-    }
-
-    fn slice_mut(&mut self, local_addr: usize, len: usize) -> &mut [u8] {
+    fn get_mut_slice(&mut self, local_addr: usize, len: usize) -> &mut [u8] {
         &mut self.data[local_addr..][..len]
     }

+    fn get_slice(&self, local_addr: usize, len: usize) -> &[u8] {
+        &self.data[local_addr..][..len]
+    }
+
     fn allocate_space(&mut self, len: usize) -> Option<Addr> {
         if self.is_available(len) {
             let addr = Addr::new(self.page_id, self.len);
@@ -188,6 +230,16 @@ impl Page {
             None
         }
     }
+
+    #[inline(always)]
+    pub(crate) unsafe fn get_ptr(&self, addr: usize) -> *const u8 {
+        self.data.as_ptr().add(addr)
+    }
+
+    #[inline(always)]
+    pub(crate) unsafe fn get_mut_ptr(&mut self, addr: usize) -> *mut u8 {
+        self.data.as_mut_ptr().add(addr)
+    }
 }

 #[cfg(test)]
@@ -202,13 +254,13 @@ mod tests {
         let b = b"happy tax payer";

         let addr_a = arena.allocate_space(a.len());
-        arena.slice_mut(addr_a, a.len()).copy_from_slice(a);
+        arena.write_bytes(addr_a, a);

         let addr_b = arena.allocate_space(b.len());
-        arena.slice_mut(addr_b, b.len()).copy_from_slice(b);
+        arena.write_bytes(addr_b, b);

-        assert_eq!(arena.slice(addr_a, a.len()), a);
-        assert_eq!(arena.slice(addr_b, b.len()), b);
+        assert_eq!(arena.read_slice(addr_a, a.len()), a);
+        assert_eq!(arena.read_slice(addr_b, b.len()), b);
     }

     #[derive(Clone, Copy, Debug, Eq, PartialEq)]
@@ -231,15 +283,9 @@ mod tests {
             b: 221,
             c: 12,
         };
-        let num_bytes = std::mem::size_of::<MyTest>();
-        let addr_a = arena.allocate_space(num_bytes);
-        arena.write_at(addr_a, a);
-
-        let addr_b = arena.allocate_space(num_bytes);
-        arena.write_at(addr_b, b);
-
-        assert_eq!(arena.read::<MyTest>(addr_a), a);
-        assert_eq!(arena.read::<MyTest>(addr_b), b);
+        let addr_a = arena.store(a);
+        let addr_b = arena.store(b);
+        assert_eq!(unsafe { arena.read::<MyTest>(addr_a) }, a);
+        assert_eq!(unsafe { arena.read::<MyTest>(addr_b) }, b);
     }
 }
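Aside: the Addr type handled throughout this file packs a page id and an offset into a single u32, with 1 MB pages addressed by the low 20 bits (see the doc comment in the hunk above). A rough sketch of that packing, assuming NUM_BITS_PAGE_ADDR is 20; the real struct's method signatures may differ:

const NUM_BITS_PAGE_ADDR: usize = 20;
const PAGE_SIZE: usize = 1 << NUM_BITS_PAGE_ADDR; // 1 MB pages

#[derive(Clone, Copy, Debug, PartialEq)]
struct Addr(u32);

impl Addr {
    fn new(page_id: usize, local_addr: usize) -> Addr {
        // Page id in the high bits, 20-bit offset within the page in the low bits.
        Addr(((page_id << NUM_BITS_PAGE_ADDR) | local_addr) as u32)
    }
    fn page_id(self) -> usize {
        (self.0 as usize) >> NUM_BITS_PAGE_ADDR
    }
    fn page_local_addr(self) -> usize {
        (self.0 as usize) & (PAGE_SIZE - 1)
    }
}

fn main() {
    let addr = Addr::new(3, 0x1234);
    assert_eq!(addr.page_id(), 3);
    assert_eq!(addr.page_local_addr(), 0x1234);
}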
@@ -1,7 +1,9 @@
 mod expull;
 mod memory_arena;
+mod murmurhash2;
 mod term_hashmap;

 pub use self::expull::ExpUnrolledLinkedList;
-pub use self::memory_arena::{Addr, MemoryArena};
+pub use self::memory_arena::{Addr, ArenaStorable, MemoryArena};
+use self::murmurhash2::murmurhash2;
 pub use self::term_hashmap::{compute_table_size, TermHashMap};
87 src/postings/stacker/murmurhash2.rs Normal file
@@ -0,0 +1,87 @@
+use std::ptr;
+const SEED: u32 = 3_242_157_231u32;
+const M: u32 = 0x5bd1_e995;
+
+#[inline(always)]
+pub fn murmurhash2(key: &[u8]) -> u32 {
+    #[cfg_attr(feature = "cargo-clippy", allow(clippy::cast_ptr_alignment))]
+    let mut key_ptr: *const u32 = key.as_ptr() as *const u32;
+    let len = key.len() as u32;
+    let mut h: u32 = SEED ^ len;
+
+    let num_blocks = len >> 2;
+    for _ in 0..num_blocks {
+        let mut k: u32 = unsafe { ptr::read_unaligned(key_ptr) }; // ok because of num_blocks definition
+        k = k.wrapping_mul(M);
+        k ^= k >> 24;
+        k = k.wrapping_mul(M);
+        h = h.wrapping_mul(M);
+        h ^= k;
+        key_ptr = key_ptr.wrapping_offset(1);
+    }
+
+    // Handle the last few bytes of the input array
+    let remaining: &[u8] = &key[key.len() & !3..];
+    match remaining.len() {
+        3 => {
+            h ^= u32::from(remaining[2]) << 16;
+            h ^= u32::from(remaining[1]) << 8;
+            h ^= u32::from(remaining[0]);
+            h = h.wrapping_mul(M);
+        }
+        2 => {
+            h ^= u32::from(remaining[1]) << 8;
+            h ^= u32::from(remaining[0]);
+            h = h.wrapping_mul(M);
+        }
+        1 => {
+            h ^= u32::from(remaining[0]);
+            h = h.wrapping_mul(M);
+        }
+        _ => {}
+    }
+    h ^= h >> 13;
+    h = h.wrapping_mul(M);
+    h ^ (h >> 15)
+}
+
+#[cfg(test)]
+mod test {
+
+    use super::murmurhash2;
+    use std::collections::HashSet;
+
+    #[test]
+    fn test_murmur() {
+        let s1 = "abcdef";
+        let s2 = "abcdeg";
+        for i in 0..5 {
+            assert_eq!(
+                murmurhash2(&s1[i..5].as_bytes()),
+                murmurhash2(&s2[i..5].as_bytes())
+            );
+        }
+    }
+
+    #[test]
+    fn test_murmur_against_reference_impl() {
+        assert_eq!(murmurhash2("".as_bytes()), 3632506080);
+        assert_eq!(murmurhash2("a".as_bytes()), 455683869);
+        assert_eq!(murmurhash2("ab".as_bytes()), 2448092234);
+        assert_eq!(murmurhash2("abc".as_bytes()), 2066295634);
+        assert_eq!(murmurhash2("abcd".as_bytes()), 2588571162);
+        assert_eq!(murmurhash2("abcde".as_bytes()), 2988696942);
+        assert_eq!(murmurhash2("abcdefghijklmnop".as_bytes()), 2350868870);
+    }
+
+    #[test]
+    fn test_murmur_collisions() {
+        let mut set: HashSet<u32> = HashSet::default();
+        for i in 0..10_000 {
+            let s = format!("hash{}", i);
+            let hash = murmurhash2(s.as_bytes());
+            set.insert(hash);
+        }
+        assert_eq!(set.len(), 10_000);
+    }
+}
@@ -1,15 +1,37 @@
-extern crate murmurhash32;
-
-use self::murmurhash32::murmurhash2;
-
-use super::{Addr, MemoryArena};
-use byteorder::{ByteOrder, NativeEndian};
-use postings::stacker::memory_arena::store;
-use postings::UnorderedTermId;
+use super::murmurhash2;
+use super::{Addr, ArenaStorable, MemoryArena};
 use std::iter;
 use std::mem;
 use std::slice;

+pub type BucketId = usize;
+
+struct KeyBytesValue<'a, V> {
+    key: &'a [u8],
+    value: V,
+}
+
+impl<'a, V> KeyBytesValue<'a, V> {
+    fn new(key: &'a [u8], value: V) -> KeyBytesValue<'a, V> {
+        KeyBytesValue { key, value }
+    }
+}
+
+impl<'a, V> ArenaStorable for KeyBytesValue<'a, V>
+where
+    V: ArenaStorable,
+{
+    fn num_bytes(&self) -> usize {
+        0u16.num_bytes() + self.key.len() + self.value.num_bytes()
+    }
+
+    unsafe fn write_into(self, arena: &mut MemoryArena, addr: Addr) {
+        arena.write(addr, self.key.len() as u16);
+        arena.write_bytes(addr.offset(2), self.key);
+        arena.write(addr.offset(2 + self.key.len() as u32), self.value);
+    }
+}
+
 /// Returns the actual memory size in bytes
 /// required to create a table of size $2^num_bits$.
 pub fn compute_table_size(num_bits: usize) -> usize {
@@ -27,7 +49,6 @@ pub fn compute_table_size(num_bits: usize) -> usize {
 struct KeyValue {
     key_value_addr: Addr,
     hash: u32,
-    unordered_term_id: UnorderedTermId,
 }

 impl Default for KeyValue {
@@ -35,7 +56,6 @@ impl Default for KeyValue {
         KeyValue {
             key_value_addr: Addr::null_pointer(),
             hash: 0u32,
-            unordered_term_id: UnorderedTermId::default(),
         }
     }
 }
@@ -60,7 +80,6 @@ pub struct TermHashMap {
     pub heap: MemoryArena,
     mask: usize,
     occupied: Vec<usize>,
-    len: usize,
 }

 struct QuadraticProbing {
@@ -87,13 +106,14 @@ pub struct Iter<'a> {
 }

 impl<'a> Iterator for Iter<'a> {
-    type Item = (&'a [u8], Addr, UnorderedTermId);
+    type Item = (&'a [u8], Addr, BucketId);

     fn next(&mut self) -> Option<Self::Item> {
         self.inner.next().cloned().map(move |bucket: usize| {
             let kv = self.hashmap.table[bucket];
-            let (key, offset): (&'a [u8], Addr) = self.hashmap.get_key_value(kv.key_value_addr);
-            (key, offset, kv.unordered_term_id)
+            let (key, offset): (&'a [u8], Addr) =
+                unsafe { self.hashmap.get_key_value(kv.key_value_addr) };
+            (key, offset, bucket as BucketId)
         })
     }
 }
@@ -108,7 +128,6 @@ impl TermHashMap {
             heap,
             mask: table_size - 1,
             occupied: Vec::with_capacity(table_size / 2),
-            len: 0,
         }
     }

@@ -124,34 +143,20 @@ impl TermHashMap {
         self.table.len() < self.occupied.len() * 3
     }

-    #[inline(always)]
-    fn get_key_value(&self, addr: Addr) -> (&[u8], Addr) {
-        let data = self.heap.slice_from(addr);
-        let key_bytes_len = NativeEndian::read_u16(data) as usize;
-        let key_bytes: &[u8] = &data[2..][..key_bytes_len];
-        (key_bytes, addr.offset(2u32 + key_bytes_len as u32))
+    unsafe fn get_key_value(&self, addr: Addr) -> (&[u8], Addr) {
+        let key_bytes_len = self.heap.read::<u16>(addr) as usize;
+        let key_addr = addr.offset(2u32);
+        let key_bytes: &[u8] = self.heap.read_slice(key_addr, key_bytes_len);
+        let val_addr: Addr = key_addr.offset(key_bytes.len() as u32);
+        (key_bytes, val_addr)
     }

-    #[inline(always)]
-    fn get_value_addr_if_key_match(&self, target_key: &[u8], addr: Addr) -> Option<Addr> {
-        let (stored_key, value_addr) = self.get_key_value(addr);
-        if stored_key == target_key {
-            Some(value_addr)
-        } else {
-            None
-        }
-    }
-
-    fn set_bucket(&mut self, hash: u32, key_value_addr: Addr, bucket: usize) -> UnorderedTermId {
+    pub fn set_bucket(&mut self, hash: u32, key_value_addr: Addr, bucket: usize) {
         self.occupied.push(bucket);
-        let unordered_term_id = self.len as UnorderedTermId;
-        self.len += 1;
         self.table[bucket] = KeyValue {
             key_value_addr,
             hash,
-            unordered_term_id,
         };
-        unordered_term_id
     }

     pub fn iter(&self) -> Iter {
@@ -191,53 +196,64 @@ impl TermHashMap {
     /// will be in charge of returning a default value.
     /// If the key already as an associated value, then it will be passed
     /// `Some(previous_value)`.
-    pub fn mutate_or_create<S, V, TMutator>(
-        &mut self,
-        key: S,
-        mut updater: TMutator,
-    ) -> UnorderedTermId
+    pub fn mutate_or_create<S, V, TMutator>(&mut self, key: S, mut updater: TMutator) -> BucketId
     where
         S: AsRef<[u8]>,
-        V: Copy + 'static,
+        V: Copy,
         TMutator: FnMut(Option<V>) -> V,
     {
         if self.is_saturated() {
             self.resize();
         }
         let key_bytes: &[u8] = key.as_ref();
-        let hash = murmurhash2(key.as_ref());
+        let hash = murmurhash2::murmurhash2(key.as_ref());
         let mut probe = self.probe(hash);
         loop {
             let bucket = probe.next_probe();
             let kv: KeyValue = self.table[bucket];
             if kv.is_empty() {
-                // The key does not exists yet.
                 let val = updater(None);
-                let num_bytes =
-                    std::mem::size_of::<u16>() + key_bytes.len() + std::mem::size_of::<V>();
-                let key_addr = self.heap.allocate_space(num_bytes);
-                {
-                    let data = self.heap.slice_mut(key_addr, num_bytes);
-                    NativeEndian::write_u16(data, key_bytes.len() as u16);
-                    let stop = 2 + key_bytes.len();
-                    data[2..stop].copy_from_slice(key_bytes);
-                    store(&mut data[stop..], val);
-                }
-                return self.set_bucket(hash, key_addr, bucket);
+                let key_addr = self.heap.store(KeyBytesValue::new(key_bytes, val));
+                self.set_bucket(hash, key_addr, bucket);
+                return bucket as BucketId;
             } else if kv.hash == hash {
-                if let Some(val_addr) =
-                    self.get_value_addr_if_key_match(key_bytes, kv.key_value_addr)
-                {
-                    let v = self.heap.read(val_addr);
-                    let new_v = updater(Some(v));
-                    self.heap.write_at(val_addr, new_v);
-                    return kv.unordered_term_id;
+                let (key_matches, val_addr) = {
+                    let (stored_key, val_addr): (&[u8], Addr) =
+                        unsafe { self.get_key_value(kv.key_value_addr) };
+                    (stored_key == key_bytes, val_addr)
+                };
+                if key_matches {
+                    unsafe {
+                        // logic
+                        let v = self.heap.read(val_addr);
+                        let new_v = updater(Some(v));
+                        self.heap.write(val_addr, new_v);
+                    };
+                    return bucket as BucketId;
                 }
             }
         }
     }
 }

+#[cfg(all(test, feature = "unstable"))]
+mod bench {
+    use super::murmurhash2::murmurhash2;
+    use test::Bencher;
+
+    #[bench]
+    fn bench_murmurhash2(b: &mut Bencher) {
+        let keys: [&'static str; 3] = ["wer qwe qwe qwe ", "werbq weqweqwe2 ", "weraq weqweqwe3 "];
+        b.iter(|| {
+            let mut s = 0;
+            for &key in &keys {
+                s ^= murmurhash2(key.as_bytes());
+            }
+            s
+        });
+    }
+}
+
 #[cfg(test)]
 mod tests {

@@ -269,7 +285,10 @@ mod tests {
         let mut vanilla_hash_map = HashMap::new();
         let mut iter_values = hash_map.iter();
         while let Some((key, addr, _)) = iter_values.next() {
-            let val: u32 = hash_map.heap.read(addr);
+            let val: u32 = unsafe {
+                // test
+                hash_map.heap.read(addr)
+            };
             vanilla_hash_map.insert(key.to_owned(), val);
         }
         assert_eq!(vanilla_hash_map.len(), 2);
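Aside: KeyBytesValue::write_into and get_key_value above agree on one contiguous record per key in the arena: a u16 key length, the key bytes, then the value. A simplified sketch of that layout over a plain Vec<u8>; the byte order and the Vec-backed "arena" here are assumptions of the example (the real code writes unaligned, native-endian values into the MemoryArena):

// Append a record (u16 key length, key bytes, u32 value) and return its start offset.
fn write_record(arena: &mut Vec<u8>, key: &[u8], value: u32) -> usize {
    let addr = arena.len();
    arena.extend_from_slice(&(key.len() as u16).to_le_bytes());
    arena.extend_from_slice(key);
    arena.extend_from_slice(&value.to_le_bytes());
    addr
}

// Read the key and value back from the record starting at `addr`.
fn read_record(arena: &[u8], addr: usize) -> (&[u8], u32) {
    let key_len = u16::from_le_bytes([arena[addr], arena[addr + 1]]) as usize;
    let key = &arena[addr + 2..addr + 2 + key_len];
    let val_start = addr + 2 + key_len;
    let mut val_bytes = [0u8; 4];
    val_bytes.copy_from_slice(&arena[val_start..val_start + 4]);
    (key, u32::from_le_bytes(val_bytes))
}

fn main() {
    let mut arena = Vec::new();
    let addr = write_record(&mut arena, b"hello", 42u32);
    let (key, val) = read_record(&arena, addr);
    assert_eq!(key, b"hello");
    assert_eq!(val, 42);
}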
@@ -86,12 +86,12 @@ mod tests {

     use super::AllQuery;
     use query::Query;
-    use schema::{Schema, TEXT};
+    use schema::{SchemaBuilder, TEXT};
     use Index;

     #[test]
     fn test_all_query() {
-        let mut schema_builder = Schema::builder();
+        let mut schema_builder = SchemaBuilder::default();
         let field = schema_builder.add_text_field("text", TEXT);
         let schema = schema_builder.build();
         let index = Index::create_in_ram(schema);
@@ -101,9 +101,8 @@ mod tests {
         index_writer.commit().unwrap();
         index_writer.add_document(doc!(field=>"ccc"));
         index_writer.commit().unwrap();
-        let reader = index.reader().unwrap();
-        reader.reload().unwrap();
-        let searcher = reader.searcher();
+        index.load_searchers().unwrap();
+        let searcher = index.searcher();
         let weight = AllQuery.weight(&searcher, false).unwrap();
         {
             let reader = searcher.segment_reader(0);
@@ -1,17 +1,17 @@
 use common::BitSet;
 use core::SegmentReader;
+use fst::Automaton;
 use query::BitSetDocSet;
 use query::ConstScorer;
 use query::{Scorer, Weight};
 use schema::{Field, IndexRecordOption};
-use tantivy_fst::Automaton;
 use termdict::{TermDictionary, TermStreamer};
 use Result;

 /// A weight struct for Fuzzy Term and Regex Queries
 pub struct AutomatonWeight<A>
 where
-    A: Automaton + Send + Sync + 'static,
+    A: Automaton,
 {
     field: Field,
     automaton: A,
@@ -19,7 +19,7 @@ where

 impl<A> AutomatonWeight<A>
 where
-    A: Automaton + Send + Sync + 'static,
+    A: Automaton,
 {
     /// Create a new AutomationWeight
     pub fn new(field: Field, automaton: A) -> AutomatonWeight<A> {
@@ -34,7 +34,7 @@ where

 impl<A> Weight for AutomatonWeight<A>
 where
-    A: Automaton + Send + Sync + 'static,
+    A: Automaton,
 {
     fn scorer(&self, reader: &SegmentReader) -> Result<Box<Scorer>> {
         let max_doc = reader.max_doc();
@@ -63,8 +63,7 @@ impl BM25Weight {
             .map(|term| {
                 let term_doc_freq = searcher.doc_freq(term);
                 idf(term_doc_freq, total_num_docs)
-            })
-            .sum::<f32>();
+            }).sum::<f32>();
         BM25Weight::new(idf, average_fieldnorm)
     }

@@ -47,8 +47,7 @@ impl Query for BooleanQuery {
             .iter()
             .map(|&(ref occur, ref subquery)| {
                 Ok((*occur, subquery.weight(searcher, scoring_enabled)?))
-            })
-            .collect::<Result<_>>()?;
+            }).collect::<Result<_>>()?;
         Ok(Box::new(BooleanWeight::new(sub_weights, scoring_enabled)))
     }

@@ -69,8 +68,7 @@ impl BooleanQuery {
                 let term_query: Box<Query> =
                     Box::new(TermQuery::new(term, IndexRecordOption::WithFreqs));
                 (Occur::Should, term_query)
-            })
-            .collect();
+            }).collect();
         BooleanQuery::from(occur_term_queries)
     }

@@ -1,4 +1,5 @@
 use core::SegmentReader;
+use downcast::Downcast;
 use query::intersect_scorers;
 use query::score_combiner::{DoNothingCombiner, ScoreCombiner, SumWithCoordsCombiner};
 use query::term_query::TermScorer;
@@ -9,6 +10,7 @@ use query::RequiredOptionalScorer;
 use query::Scorer;
 use query::Union;
 use query::Weight;
+use std::borrow::Borrow;
 use std::collections::HashMap;
 use Result;

@@ -22,11 +24,14 @@ where
 }

 {
-    let is_all_term_queries = scorers.iter().all(|scorer| scorer.is::<TermScorer>());
+    let is_all_term_queries = scorers.iter().all(|scorer| {
+        let scorer_ref: &Scorer = scorer.borrow();
+        Downcast::<TermScorer>::is_type(scorer_ref)
+    });
     if is_all_term_queries {
         let scorers: Vec<TermScorer> = scorers
             .into_iter()
-            .map(|scorer| *(scorer.downcast::<TermScorer>().map_err(|_| ()).unwrap()))
+            .map(|scorer| *Downcast::<TermScorer>::downcast(scorer).unwrap())
             .collect();
         let scorer: Box<Scorer> = Box::new(Union::<TermScorer, TScoreCombiner>::from(scorers));
         return scorer;
@@ -8,6 +8,7 @@ mod tests {

     use super::*;
     use collector::tests::TestCollector;
+    use downcast::Downcast;
     use query::score_combiner::SumWithCoordsCombiner;
     use query::term_query::TermScorer;
     use query::Intersection;
@@ -18,17 +19,16 @@ mod tests {
     use query::Scorer;
     use query::TermQuery;
     use schema::*;
-    use DocId;
     use Index;

     fn aux_test_helper() -> (Index, Field) {
-        let mut schema_builder = Schema::builder();
+        let mut schema_builder = SchemaBuilder::default();
         let text_field = schema_builder.add_text_field("text", TEXT);
         let schema = schema_builder.build();
         let index = Index::create_in_ram(schema);
         {
             // writing the segment
-            let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
+            let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
             {
                 let doc = doc!(text_field => "a b c");
                 index_writer.add_document(doc);
@@ -51,6 +51,7 @@ mod tests {
             }
             assert!(index_writer.commit().is_ok());
         }
+        index.load_searchers().unwrap();
         (index, text_field)
     }

@@ -59,8 +60,7 @@ mod tests {
         let (index, text_field) = aux_test_helper();
         let query_parser = QueryParser::for_index(&index, vec![text_field]);
         let query = query_parser.parse_query("(+a +b) d").unwrap();
-        let searcher = index.reader().unwrap().searcher();
-        assert_eq!(query.count(&searcher).unwrap(), 3);
+        assert_eq!(query.count(&*index.searcher()).unwrap(), 3);
     }

     #[test]
@@ -68,28 +68,28 @@ mod tests {
         let (index, text_field) = aux_test_helper();
         let query_parser = QueryParser::for_index(&index, vec![text_field]);
         let query = query_parser.parse_query("+a").unwrap();
-        let searcher = index.reader().unwrap().searcher();
+        let searcher = index.searcher();
         let weight = query.weight(&searcher, true).unwrap();
         let scorer = weight.scorer(searcher.segment_reader(0u32)).unwrap();
-        assert!(scorer.is::<TermScorer>());
+        assert!(Downcast::<TermScorer>::is_type(&*scorer));
     }

     #[test]
     pub fn test_boolean_termonly_intersection() {
         let (index, text_field) = aux_test_helper();
         let query_parser = QueryParser::for_index(&index, vec![text_field]);
-        let searcher = index.reader().unwrap().searcher();
+        let searcher = index.searcher();
         {
             let query = query_parser.parse_query("+a +b +c").unwrap();
             let weight = query.weight(&searcher, true).unwrap();
             let scorer = weight.scorer(searcher.segment_reader(0u32)).unwrap();
-            assert!(scorer.is::<Intersection<TermScorer>>());
+            assert!(Downcast::<Intersection<TermScorer>>::is_type(&*scorer));
         }
         {
             let query = query_parser.parse_query("+a +(b c)").unwrap();
             let weight = query.weight(&searcher, true).unwrap();
             let scorer = weight.scorer(searcher.segment_reader(0u32)).unwrap();
-            assert!(scorer.is::<Intersection<Box<Scorer>>>());
+            assert!(Downcast::<Intersection<Box<Scorer>>>::is_type(&*scorer));
         }
     }

@@ -97,19 +97,21 @@ mod tests {
     pub fn test_boolean_reqopt() {
         let (index, text_field) = aux_test_helper();
         let query_parser = QueryParser::for_index(&index, vec![text_field]);
-        let searcher = index.reader().unwrap().searcher();
+        let searcher = index.searcher();
         {
             let query = query_parser.parse_query("+a b").unwrap();
             let weight = query.weight(&searcher, true).unwrap();
             let scorer = weight.scorer(searcher.segment_reader(0u32)).unwrap();
-            assert!(scorer
-                .is::<RequiredOptionalScorer<Box<Scorer>, Box<Scorer>, SumWithCoordsCombiner>>());
+            assert!(Downcast::<
+                RequiredOptionalScorer<Box<Scorer>, Box<Scorer>, SumWithCoordsCombiner>,
+            >::is_type(&*scorer));
         }
         {
             let query = query_parser.parse_query("+a b").unwrap();
             let weight = query.weight(&searcher, false).unwrap();
             let scorer = weight.scorer(searcher.segment_reader(0u32)).unwrap();
-            assert!(scorer.is::<TermScorer>());
+            println!("{:?}", scorer.type_name());
+            assert!(Downcast::<TermScorer>::is_type(&*scorer));
         }
     }

@@ -126,18 +128,11 @@ mod tests {
             query
         };

-        let reader = index.reader().unwrap();

         let matching_docs = |boolean_query: &Query| {
-            reader
-                .searcher()
-                .search(boolean_query, &TestCollector)
-                .unwrap()
-                .docs()
-                .iter()
-                .cloned()
-                .map(|doc| doc.1)
-                .collect::<Vec<DocId>>()
+            let searcher = index.searcher();
+            let mut test_collector = TestCollector::default();
+            searcher.search(boolean_query, &mut test_collector).unwrap();
+            test_collector.docs()
         };

         {
@@ -188,13 +183,12 @@ mod tests {
             let query: Box<Query> = Box::new(term_query);
             query
         };
-        let reader = index.reader().unwrap();
         let score_docs = |boolean_query: &Query| {
-            let fruit = reader
-                .searcher()
-                .search(boolean_query, &TestCollector)
-                .unwrap();
-            fruit.scores().to_vec()
+            let searcher = index.searcher();
+            let mut test_collector = TestCollector::default();
+            searcher.search(boolean_query, &mut test_collector).unwrap();
+            test_collector.scores()
        };

        {
|||||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user