mirror of https://github.com/quickwit-oss/tantivy.git
synced 2026-01-07 01:32:53 +00:00

Compare commits: 1 commit (4cbcc59e8f)

.gitignore (vendored): 1 line changed
@@ -1,4 +1,3 @@
-tantivy.iml
 *.swp
 target
 target/debug

CHANGELOG.md: 16 lines changed
@@ -1,24 +1,10 @@
-Tantivy 0.8.1
-=====================
-*No change in the index format*
-- API Breaking change in the collector API. (@jwolfe, @fulmicoton)
-- Multithreaded search (@jwolfe, @fulmicoton)
-
-
-Tantivy 0.7.1
-=====================
-*No change in the index format*
-- Bugfix: NGramTokenizer panics on non ascii chars
-- Added a space usage API
+
 
 Tantivy 0.7
 =====================
 - Skip data for doc ids and positions (@fulmicoton),
   greatly improving performance
 - Tantivy error now rely on the failure crate (@drusellers)
-- Added support for `AND`, `OR`, `NOT` syntax in addition to the `+`,`-` syntax
-- Added a snippet generator with highlight (@vigneshsarma, @fulmicoton)
-- Added a `TopFieldCollector` (@pentlander)
 
 Tantivy 0.6.1
 =========================

Cargo.toml: 28 lines changed
@@ -1,6 +1,6 @@
 [package]
 name = "tantivy"
-version = "0.8.0-dev"
+version = "0.7.0-dev"
 authors = ["Paul Masurel <paul.masurel@gmail.com>"]
 license = "MIT"
 categories = ["database-implementations", "data-structures"]
@@ -12,9 +12,10 @@ readme = "README.md"
 keywords = ["search", "information", "retrieval"]
 
 [dependencies]
-base64 = "0.10.0"
+base64 = "0.9.1"
 byteorder = "1.0"
 lazy_static = "1"
+tinysegmenter = "0.1.0"
 regex = "1.0"
 fst = {version="0.3", default-features=false}
 fst-regex = { version="0.2" }
@@ -32,8 +33,9 @@ num_cpus = "1.2"
 itertools = "0.7"
 levenshtein_automata = {version="0.1", features=["fst_automaton"]}
 bit-set = "0.5"
-uuid = { version = "0.7", features = ["v4", "serde"] }
-crossbeam = "0.5"
+uuid = { version = "0.6", features = ["v4", "serde"] }
+crossbeam = "0.4"
+crossbeam-channel = "0.2"
 futures = "0.1"
 futures-cpupool = "0.1"
 owning_ref = "0.4"
@@ -46,36 +48,24 @@ census = "0.1"
 fnv = "1.0.6"
 owned-read = "0.4"
 failure = "0.1"
-htmlescape = "0.3.1"
-fail = "0.2"
-scoped-pool = "1.0"
-aho-corasick = "0.6"
 
 [target.'cfg(windows)'.dependencies]
 winapi = "0.2"
 
 [dev-dependencies]
-rand = "0.6"
-maplit = "1"
+rand = "0.5"
 
 [profile.release]
 opt-level = 3
 debug = false
+lto = true
 debug-assertions = false
 
-[profile.test]
-debug-assertions = true
-overflow-checks = true
-
 [features]
-# by default no-fail is disabled. We manually enable it when running test.
-default = ["mmap", "no_fail"]
+default = ["mmap"]
 mmap = ["fst/mmap", "atomicwrites"]
 lz4-compression = ["lz4"]
-no_fail = ["fail/no_fail"]
-unstable = [] # useful for benches.
 
 [badges]
 travis-ci = { repository = "tantivy-search/tantivy" }
 
-

README.md: 17 lines changed
@@ -4,7 +4,6 @@
 [](https://gitter.im/tantivy-search/tantivy?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge)
 [](https://opensource.org/licenses/MIT)
 [](https://ci.appveyor.com/project/fulmicoton/tantivy/branch/master)
-[](https://saythanks.io/to/fulmicoton)
 
 
 
@@ -21,7 +20,7 @@
 
 **Tantivy** is a **full text search engine library** written in rust.
 
-It is closer to [Apache Lucene](https://lucene.apache.org/) than to [Elastic Search](https://www.elastic.co/products/elasticsearch) and [Apache Solr](https://lucene.apache.org/solr/) in the sense it is not
+It is closer to Lucene than to Elastic Search and Solr in the sense it is not
 an off-the-shelf search engine server, but rather a crate that can be used
 to build such a search engine.
 
@@ -33,8 +32,8 @@ Tantivy is, in fact, strongly inspired by Lucene's design.
 - Fast (check out the :racehorse: :sparkles: [benchmark](https://tantivy-search.github.io/bench/) :sparkles: :racehorse:)
 - Tiny startup time (<10ms), perfect for command line tools
 - BM25 scoring (the same as lucene)
-- Natural query language `(michael AND jackson) OR "king of pop"`
-- Phrase queries search (`"michael jackson"`)
+- Basic query language (`+michael +jackson`)
+- Phrase queries search (\"michael jackson\"`)
 - Incremental indexing
 - Multithreaded indexing (indexing English Wikipedia takes < 3 minutes on my desktop)
 - Mmap directory
@@ -44,14 +43,12 @@ Tantivy is, in fact, strongly inspired by Lucene's design.
 - LZ4 compressed document store
 - Range queries
 - Faceted search
-- Configurable indexing (optional term frequency and position indexing)
+- Configurable indexing (optional term frequency and position indexing
 - Cheesy logo with a horse
 
 # Non-features
 
-- Distributed search is out of the scope of tantivy. That being said, tantivy is meant as a
-library upon which one could build a distributed search. Serializable/mergeable collector state for instance,
-are within the scope of tantivy.
+- Distributed search and will not be in the scope of tantivy.
 
 
 # Supported OS and compiler
@@ -80,10 +77,6 @@ To check out and run tests, you can simply run :
 cd tantivy
 cargo build
 
-## Running tests
-
-Some tests will not run with just `cargo test` because of `fail-rs`.
-To run the tests exhaustively, run `./run-tests.sh`.
 
 # Contribute
 

AppVeyor CI configuration (file name not shown in this diff view)
@@ -18,5 +18,5 @@ install:
 build: false
 
 test_script:
-  - REM SET RUST_LOG=tantivy,test & cargo test --verbose --no-default-features --features mmap -- --test-threads 1
+  - REM SET RUST_LOG=tantivy,test & cargo test --verbose
   - REM SET RUST_BACKTRACE=1 & cargo build --examples

Cross-compilation CI script (file name not shown in this diff view)
@@ -11,11 +11,12 @@ main() {
     else
         echo "Build"
         cross build --target $TARGET
+        cross build --target $TARGET --release
         if [ ! -z $DISABLE_TESTS ]; then
             return
         fi
         echo "Test"
-        cross test --target $TARGET --no-default-features --features mmap -- --test-threads 1
+        cross test --target $TARGET
     fi
     for example in $(ls examples/*.rs)
     do

Documentation summary page (file name not shown in this diff view)
@@ -9,7 +9,6 @@
 - [Facetting](./facetting.md)
 - [Innerworkings](./innerworkings.md)
 - [Inverted index](./inverted_index.md)
-- [Best practise](./inverted_index.md)
 
 [Frequently Asked Questions](./faq.md)
 [Examples](./examples.md)

Documentation: introduction page (file name not shown in this diff view)
@@ -2,8 +2,8 @@
 
 > Tantivy is a **search** engine **library** for Rust.
 
-If you are familiar with Lucene, it's an excellent approximation to consider tantivy as Lucene for rust. tantivy is heavily inspired by Lucene's design and
-they both have the same scope and targetted use cases.
+If you are familiar with Lucene, tantivy is heavily inspired by Lucene's design and
+they both have the same scope and targetted users.
 
 If you are not familiar with Lucene, let's break down our little tagline.
 
@@ -17,18 +17,15 @@ relevancy, collapsing, highlighting, spatial search.
 experience. But keep in mind this is just a toolbox.
 Which bring us to the second keyword...
 
-- **Library** means that you will have to write code. tantivy is not an *all-in-one* server solution like elastic search for instance.
+- **Library** means that you will have to write code. tantivy is not an *all-in-one* server solution.
 
-Sometimes a functionality will not be available in tantivy because it is too
-specific to your use case. By design, tantivy should make it possible to extend
-the available set of features using the existing rock-solid datastructures.
+Sometimes a functionality will not be available in tantivy because it is too specific to your use case. By design, tantivy should make it possible to extend
+the available set of features using the existing rock-solid datastructures.
 
-Most frequently this will mean writing your own `Collector`, your own `Scorer` or your own
-`TokenFilter`... Some of your requirements may also be related to
-something closer to architecture or operations. For instance, you may
-want to build a large corpus on Hadoop, fine-tune the merge policy to keep your
-index sharded in a time-wise fashion, or you may want to convert and existing
-index from a different format.
-
-Tantivy exposes a lot of low level API to do all of these things.
-
+Most frequently this will mean writing your own `Collector`, your own `Scorer` or your own
+`Tokenizer/TokenFilter`... But some of your requirement may also be related to
+architecture or operations. For instance, you may want to build a large corpus on Hadoop,
+fine-tune the merge policy to keep your index sharded in a time-wise fashion, or you may want
+to convert and existing index from a different format.
+
+Tantivy exposes its API to do all of these things.

Documentation: index file layout chapter (file name not shown in this diff view)
@@ -2,76 +2,47 @@
 
 ## Straight from disk
 
-Tantivy accesses its data using an abstracting trait called `Directory`.
-In theory, one can come and override the data access logic. In practise, the
-trait somewhat assumes that your data can be mapped to memory, and tantivy
-seems deeply married to using `mmap` for its io [^1], and the only persisting
-directory shipped with tantivy is the `MmapDirectory`.
-
-While this design has some downsides, this greatly simplifies the source code of
-tantivy. Caching is also entirely delegated to the OS.
-
-`tantivy` works entirely (or almost) by directly reading the datastructures as they are layed on disk. As a result, the act of opening an indexing does not involve loading different datastructures from the disk into random access memory : starting a process, opening an index, and performing your first query can typically be done in a matter of milliseconds.
-
-This is an interesting property for a command line search engine, or for some multi-tenant log search engine : spawning a new process for each new query can be a perfectly sensible solution in some use case.
+By default, tantivy accesses its data using its `MMapDirectory`.
+While this design has some downsides, this greatly simplifies the source code of tantivy,
+and entirely delegates the caching to the OS.
+
+`tantivy` works entirely (or almost) by directly reading the datastructures as they are layed on disk.
+As a result, the act of opening an indexing does not involve loading different datastructures
+from the disk into random access memory : starting a process, opening an index, and performing a query
+can typically be done in a matter of milliseconds.
+
+This is an interesting property for a command line search engine, or for some multi-tenant log search engine.
+Spawning a new process for each new query can be a perfectly sensible solution in some use case.
 
 In later chapters, we will discuss tantivy's inverted index data layout.
 One key take away is that to achieve great performance, search indexes are extremely compact.
 Of course this is crucial to reduce IO, and ensure that as much of our index can sit in RAM.
 
-Also, whenever possible its data is accessed sequentially. Of course, this is an amazing property when tantivy needs to access the data from your spinning hard disk, but this is also
-critical for performance, if your data is read from and an `SSD` or even already in your pagecache.
+Also, whenever possible the data is accessed sequentially. Of course, this is an amazing property when tantivy needs to access
+the data from your spinning hard disk, but this is also a great property when working with `SSD` or `RAM`,
+as it makes our read patterns very predictable for the CPU.
 
 
 ## Segments, and the log method
 
-That kind of compact layout comes at one cost: it prevents our datastructures from being dynamic.
-In fact, the `Directory` trait does not even allow you to modify part of a file.
+That kind compact layout comes at one cost: it prevents our datastructures from being dynamic.
+In fact, a trait called `Directory` is in charge of abstracting all of tantivy's data access
+and its API does not even allow editing these file once they are written.
 
 To allow the addition / deletion of documents, and create the illusion that
-your index is dynamic (i.e.: adding and deleting documents), tantivy uses a common database trick sometimes referred to as the *log method*.
+your index is dynamic (i.e.: adding and deleting documents), tantivy uses a common database trick sometimes
+referred to as the *log method*.
 
-Let's forget about deletes for a moment.
-
-As you add documents, these documents are processed and stored in a dedicated datastructure, in a `RAM` buffer. This datastructure is not ready for search, but it is useful to receive your data and rearrange it very rapidly.
-
-As you add documents, this buffer will reach its capacity and tantivy will transparently stop adding document to it and start converting this datastructure to its final read-only format on disk. Once written, an brand empty buffer is available to resume adding documents.
+Let's forget about deletes for a moment. As you add documents, these documents are processed and stored in
+a dedicated datastructure, in a `RAM` buffer. This datastructure is designed to be dynamic but
+cannot be accessed for search. As you add documents, this buffer will reach its capacity and tantivy will
+transparently stop adding document to it and start converting this datastructure to its final
+read-only format on disk. Once written, an brand empty buffer is available to resume adding documents.
 
 The resulting chunk of index obtained after this serialization is called a `Segment`.
 
-> A segment is a self-contained atomic piece of index. It is identified with a UUID, and all of its files are identified using the naming scheme : `<UUID>.*`.
-Which brings us to the nature of a tantivy `Index`.
-
-> A tantivy `Index` is a collection of `Segments`.
-
-Physically, this really just means and index is a bunch of segment files in a given `Directory`,
-linked together by a `meta.json` file. This transparency can become extremely handy
-to get tantivy to fit your use case:
-
-*Example 1* You could for instance use hadoop to build a very large search index in a timely manner, copy all of the resulting segment files in the same directory and edit the `meta.json` to get a functional index.[^2]
-
-*Example 2* You could also disable your merge policy and enforce daily segments. Removing data after one week can then be done very efficiently by just editing the `meta.json` and deleting the files associated to segment `D-7`.
-
-
-# Merging
-
-As you index more and more data, your index will accumulate more and more segments.
-Having a lot of small segments is not really optimal. There is a bit of redundancy in having
-all these term dictionary. Also when searching, we will need to do term lookups as many times as we have segments. It can hurt search performance a bit.
-
-That's where merging or compacting comes into place. Tantivy will continuously consider merge
-opportunities and start merging segments in the background.
-
-
-# Indexing throughput, number of indexing threads
-
-
-
-
-[^1]: This may eventually change.
-
-[^2]: Be careful however. By default these files will not be considered as *managed* by tantivy. This means they will never be garbage collected by tantivy, regardless of whether they become obsolete or not.
+> A segment is a self-contained atomic piece of index. It is identified with a UUID, and all of its files
+are identified using the naming scheme : `<UUID>.*`.
+
+
+> A tantivy `Index` is a collection of `Segments`.
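
The chapter above (in both the removed and the added wording) describes the *log method*: documents accumulate in a dynamic RAM buffer that, once full, is frozen into an immutable on-disk segment. The sketch below is a hypothetical, deliberately simplified illustration of that idea only; the types and field names are invented for the illustration and are not tantivy's actual implementation.

```rust
// Illustrative sketch of the "log method" described in the chapter above.
// These types are hypothetical and simplified; they are not tantivy's API.
struct Segment {
    // Stands in for an immutable, compactly serialized chunk of index on disk.
    docs: Vec<String>,
}

struct ToyIndex {
    segments: Vec<Segment>,  // read-only segments, each written exactly once
    ram_buffer: Vec<String>, // dynamic buffer that receives new documents
    buffer_capacity: usize,
}

impl ToyIndex {
    fn add_document(&mut self, doc: String) {
        self.ram_buffer.push(doc);
        if self.ram_buffer.len() >= self.buffer_capacity {
            // The buffer reached capacity: "freeze" it into a new segment
            // and resume adding documents into a brand new empty buffer.
            let frozen = std::mem::replace(&mut self.ram_buffer, Vec::new());
            self.segments.push(Segment { docs: frozen });
        }
    }
}

fn main() {
    let mut index = ToyIndex {
        segments: Vec::new(),
        ram_buffer: Vec::new(),
        buffer_capacity: 2,
    };
    for doc in ["old man", "sea", "mice", "men"].iter() {
        index.add_document(doc.to_string());
    }
    // Two segments of two documents each were "flushed".
    assert_eq!(index.segments.len(), 2);
}
```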

Documentation: examples page (file name not shown in this diff view)
@@ -1,3 +1 @@
 # Examples
-
-- [Basic search](/examples/basic_search.html)

Basic search example (examples/*.rs; exact file name not shown in this diff view)
@@ -10,17 +10,17 @@
 // - search for the best document matchings "sea whale"
 // - retrieve the best document original content.
 
+
 extern crate tempdir;
 
 // ---
 // Importing tantivy...
 #[macro_use]
 extern crate tantivy;
-use tantivy::collector::TopDocs;
+use tantivy::collector::TopCollector;
 use tantivy::query::QueryParser;
 use tantivy::schema::*;
 use tantivy::Index;
-use tempdir::TempDir;
 
 fn main() -> tantivy::Result<()> {
     // Let's create a temporary directory for the
@@ -35,7 +35,7 @@ fn main() -> tantivy::Result<()> {
     // be indexed".
 
     // first we need to define a schema ...
-    let mut schema_builder = Schema::builder();
+    let mut schema_builder = SchemaBuilder::default();
 
     // Our first field is title.
     // We want full-text search for it, and we also want
@@ -213,10 +213,15 @@ fn main() -> tantivy::Result<()> {
     //
     // We are not interested in all of the documents but
     // only in the top 10. Keeping track of our top 10 best documents
-    // is the role of the TopDocs.
+    // is the role of the TopCollector.
+    let mut top_collector = TopCollector::with_limit(10);
 
     // We can now perform our query.
-    let top_docs = searcher.search(&query, &TopDocs::with_limit(10))?;
+    searcher.search(&*query, &mut top_collector)?;
 
+    // Our top collector now contains the 10
+    // most relevant doc ids...
+    let doc_addresses = top_collector.docs();
+
     // The actual documents still need to be
     // retrieved from Tantivy's store.
@@ -225,10 +230,14 @@ fn main() -> tantivy::Result<()> {
     // the document returned will only contain
     // a title.
 
-    for (_score, doc_address) in top_docs {
-        let retrieved_doc = searcher.doc(doc_address)?;
+    for doc_address in doc_addresses {
+        let retrieved_doc = searcher.doc(&doc_address)?;
         println!("{}", schema.to_json(&retrieved_doc));
     }
 
 
     Ok(())
 }
 
 
+use tempdir::TempDir;
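
The collector change visible in this example (and repeated in several examples below), with `TopCollector` filled by `search` and read afterwards versus `TopDocs` whose result is returned directly, matches the "API Breaking change in the collector API" noted in the removed CHANGELOG entry. The toy types below are a hypothetical illustration of the difference in calling convention only; they are not tantivy's real traits or types.

```rust
// A toy model of the two collector styles seen in the diffs above.
// Nothing here is tantivy's real API; it only mirrors the calling shapes.

// Style A (the `TopCollector` side): the collector is mutable state that the
// search fills in, and the caller reads the results back from it afterwards.
struct TopCollectorLike {
    limit: usize,
    docs: Vec<u32>,
}

impl TopCollectorLike {
    fn with_limit(limit: usize) -> Self {
        TopCollectorLike { limit, docs: Vec::new() }
    }
    fn docs(&self) -> &[u32] {
        &self.docs
    }
}

// Style B (the `TopDocs` side): the collector is passed by shared reference
// and the search call itself returns the harvested result.
struct TopDocsLike {
    limit: usize,
}

impl TopDocsLike {
    fn with_limit(limit: usize) -> Self {
        TopDocsLike { limit }
    }
}

fn search_style_a(matching: &[u32], collector: &mut TopCollectorLike) {
    collector.docs = matching.iter().copied().take(collector.limit).collect();
}

fn search_style_b(matching: &[u32], collector: &TopDocsLike) -> Vec<u32> {
    matching.iter().copied().take(collector.limit).collect()
}

fn main() {
    let matching = [1u32, 2, 3, 4];

    // Style A: mutate, then read back.
    let mut top_collector = TopCollectorLike::with_limit(2);
    search_style_a(&matching, &mut top_collector);
    assert_eq!(top_collector.docs(), &[1, 2]);

    // Style B: the result comes straight out of the search call.
    let top_docs = search_style_b(&matching, &TopDocsLike::with_limit(2));
    assert_eq!(top_docs, vec![1, 2]);
}
```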

Custom collector example, removed in this diff (examples/*.rs; exact file name not shown in this view)
@@ -1,187 +0,0 @@
-// # Custom collector example
-//
-// This example shows how you can implement your own
-// collector. As an example, we will compute a collector
-// that computes the standard deviation of a given fast field.
-//
-// Of course, you can have a look at the tantivy's built-in collectors
-// such as the `CountCollector` for more examples.
-
-extern crate tempdir;
-
-// ---
-// Importing tantivy...
-#[macro_use]
-extern crate tantivy;
-use tantivy::collector::{Collector, SegmentCollector};
-use tantivy::fastfield::FastFieldReader;
-use tantivy::query::QueryParser;
-use tantivy::schema::Field;
-use tantivy::schema::{Schema, FAST, INT_INDEXED, TEXT};
-use tantivy::Index;
-use tantivy::SegmentReader;
-
-#[derive(Default)]
-struct Stats {
-    count: usize,
-    sum: f64,
-    squared_sum: f64,
-}
-
-impl Stats {
-    pub fn count(&self) -> usize {
-        self.count
-    }
-
-    pub fn mean(&self) -> f64 {
-        self.sum / (self.count as f64)
-    }
-
-    fn square_mean(&self) -> f64 {
-        self.squared_sum / (self.count as f64)
-    }
-
-    pub fn standard_deviation(&self) -> f64 {
-        let mean = self.mean();
-        (self.square_mean() - mean * mean).sqrt()
-    }
-
-    fn non_zero_count(self) -> Option<Stats> {
-        if self.count == 0 {
-            None
-        } else {
-            Some(self)
-        }
-    }
-}
-
-struct StatsCollector {
-    field: Field,
-}
-
-impl StatsCollector {
-    fn with_field(field: Field) -> StatsCollector {
-        StatsCollector { field }
-    }
-}
-
-impl Collector for StatsCollector {
-    // That's the type of our result.
-    // Our standard deviation will be a float.
-    type Fruit = Option<Stats>;
-
-    type Child = StatsSegmentCollector;
-
-    fn for_segment(
-        &self,
-        _segment_local_id: u32,
-        segment: &SegmentReader,
-    ) -> tantivy::Result<StatsSegmentCollector> {
-        let fast_field_reader = segment.fast_field_reader(self.field)?;
-        Ok(StatsSegmentCollector {
-            fast_field_reader,
-            stats: Stats::default(),
-        })
-    }
-
-    fn requires_scoring(&self) -> bool {
-        // this collector does not care about score.
-        false
-    }
-
-    fn merge_fruits(&self, segment_stats: Vec<Option<Stats>>) -> tantivy::Result<Option<Stats>> {
-        let mut stats = Stats::default();
-        for segment_stats_opt in segment_stats {
-            if let Some(segment_stats) = segment_stats_opt {
-                stats.count += segment_stats.count;
-                stats.sum += segment_stats.sum;
-                stats.squared_sum += segment_stats.squared_sum;
-            }
-        }
-        Ok(stats.non_zero_count())
-    }
-}
-
-struct StatsSegmentCollector {
-    fast_field_reader: FastFieldReader<u64>,
-    stats: Stats,
-}
-
-impl SegmentCollector for StatsSegmentCollector {
-    type Fruit = Option<Stats>;
-
-    fn collect(&mut self, doc: u32, _score: f32) {
-        let value = self.fast_field_reader.get(doc) as f64;
-        self.stats.count += 1;
-        self.stats.sum += value;
-        self.stats.squared_sum += value * value;
-    }
-
-    fn harvest(self) -> <Self as SegmentCollector>::Fruit {
-        self.stats.non_zero_count()
-    }
-}
-
-fn main() -> tantivy::Result<()> {
-    // # Defining the schema
-    //
-    // The Tantivy index requires a very strict schema.
-    // The schema declares which fields are in the index,
-    // and for each field, its type and "the way it should
-    // be indexed".
-
-    // first we need to define a schema ...
-    let mut schema_builder = Schema::builder();
-
-    // We'll assume a fictional index containing
-    // products, and with a name, a description, and a price.
-    let product_name = schema_builder.add_text_field("name", TEXT);
-    let product_description = schema_builder.add_text_field("description", TEXT);
-    let price = schema_builder.add_u64_field("price", INT_INDEXED | FAST);
-    let schema = schema_builder.build();
-
-    // # Indexing documents
-    //
-    // Lets index a bunch of fake documents for the sake of
-    // this example.
-    let index = Index::create_in_ram(schema.clone());
-
-    let mut index_writer = index.writer(50_000_000)?;
-    index_writer.add_document(doc!(
-        product_name => "Super Broom 2000",
-        product_description => "While it is ok for short distance travel, this broom \
-        was designed quiditch. It will up your game.",
-        price => 30_200u64
-    ));
-    index_writer.add_document(doc!(
-        product_name => "Turbulobroom",
-        product_description => "You might have heard of this broom before : it is the sponsor of the Wales team.\
-        You'll enjoy its sharp turns, and rapid acceleration",
-        price => 29_240u64
-    ));
-    index_writer.add_document(doc!(
-        product_name => "Broomio",
-        product_description => "Great value for the price. This broom is a market favorite",
-        price => 21_240u64
-    ));
-    index_writer.add_document(doc!(
-        product_name => "Whack a Mole",
-        product_description => "Prime quality bat.",
-        price => 5_200u64
-    ));
-    index_writer.commit()?;
-    index.load_searchers()?;
-
-    let searcher = index.searcher();
-    let query_parser = QueryParser::for_index(&index, vec![product_name, product_description]);
-
-    // here we want to get a hit on the 'ken' in Frankenstein
-    let query = query_parser.parse_query("broom")?;
-    if let Some(stats) = searcher.search(&query, &StatsCollector::with_field(price))? {
-        println!("count: {}", stats.count());
-        println!("mean: {}", stats.mean());
-        println!("standard deviation: {}", stats.standard_deviation());
-    }
-
-    Ok(())
-}

NGram tokenizer example (examples/*.rs; exact file name not shown in this diff view)
@@ -3,14 +3,16 @@
 // In this example, we'll see how to define a tokenizer pipeline
 // by aligning a bunch of `TokenFilter`.
 
 
 #[macro_use]
 extern crate tantivy;
-use tantivy::collector::TopDocs;
+use tantivy::collector::TopCollector;
 use tantivy::query::QueryParser;
 use tantivy::schema::*;
 use tantivy::tokenizer::NgramTokenizer;
 use tantivy::Index;
 
 
 fn main() -> tantivy::Result<()> {
     // # Defining the schema
     //
@@ -20,7 +22,7 @@ fn main() -> tantivy::Result<()> {
     // be indexed".
 
     // first we need to define a schema ...
-    let mut schema_builder = Schema::builder();
+    let mut schema_builder = SchemaBuilder::default();
 
     // Our first field is title.
     // In this example we want to use NGram searching
@@ -104,10 +106,12 @@ fn main() -> tantivy::Result<()> {
     // here we want to get a hit on the 'ken' in Frankenstein
     let query = query_parser.parse_query("ken")?;
 
-    let top_docs = searcher.search(&query, &TopDocs::with_limit(10))?;
+    let mut top_collector = TopCollector::with_limit(10);
+    searcher.search(&*query, &mut top_collector)?;
 
-    for (_, doc_address) in top_docs {
-        let retrieved_doc = searcher.doc(doc_address)?;
+    let doc_addresses = top_collector.docs();
+    for doc_address in doc_addresses {
+        let retrieved_doc = searcher.doc(&doc_address)?;
         println!("{}", schema.to_json(&retrieved_doc));
     }
 

TermQuery lookup example, fetching a document by ISBN (examples/*.rs; exact file name not shown in this diff view)
@@ -10,10 +10,11 @@
 // Importing tantivy...
 #[macro_use]
 extern crate tantivy;
-use tantivy::collector::TopDocs;
-use tantivy::query::TermQuery;
+use tantivy::collector::TopCollector;
 use tantivy::schema::*;
 use tantivy::Index;
+use tantivy::query::TermQuery;
 
+
 // A simple helper function to fetch a single document
 // given its id from our index.
@@ -27,10 +28,11 @@ fn extract_doc_given_isbn(index: &Index, isbn_term: &Term) -> tantivy::Result<Op
     // The second argument is here to tell we don't care about decoding positions,
     // or term frequencies.
     let term_query = TermQuery::new(isbn_term.clone(), IndexRecordOption::Basic);
-    let top_docs = searcher.search(&term_query, &TopDocs::with_limit(1))?;
+    let mut top_collector = TopCollector::with_limit(1);
+    searcher.search(&term_query, &mut top_collector)?;
 
-    if let Some((_score, doc_address)) = top_docs.first() {
-        let doc = searcher.doc(*doc_address)?;
+    if let Some(doc_address) = top_collector.docs().first() {
+        let doc = searcher.doc(doc_address)?;
         Ok(Some(doc))
     } else {
         // no doc matching this ID.
@@ -39,11 +41,12 @@ fn extract_doc_given_isbn(index: &Index, isbn_term: &Term) -> tantivy::Result<Op
 }
 
 fn main() -> tantivy::Result<()> {
+
     // # Defining the schema
     //
     // Check out the *basic_search* example if this makes
     // small sense to you.
-    let mut schema_builder = Schema::builder();
+    let mut schema_builder = SchemaBuilder::default();
 
     // Tantivy does not really have a notion of primary id.
     // This may change in the future.
@@ -123,6 +126,7 @@ fn main() -> tantivy::Result<()> {
         isbn => "978-9176370711",
     ));
 
+
     // You are guaranteed that your clients will only observe your index in
     // the state it was in after a commit.
     // In this example, your search engine will at no point be missing the *Frankenstein* document.
@@ -139,4 +143,4 @@ fn main() -> tantivy::Result<()> {
     );
 
     Ok(())
 }

Faceted search example (examples/*.rs; exact file name not shown in this diff view)
@@ -22,59 +22,60 @@ use tantivy::schema::*;
 use tantivy::Index;
 
 fn main() -> tantivy::Result<()> {
     // Let's create a temporary directory for the
     // sake of this example
     let index_path = TempDir::new("tantivy_facet_example_dir")?;
-    let mut schema_builder = Schema::builder();
+    let mut schema_builder = SchemaBuilder::default();
 
     schema_builder.add_text_field("name", TEXT | STORED);
 
     // this is our faceted field
     schema_builder.add_facet_field("tags");
 
     let schema = schema_builder.build();
 
     let index = Index::create_in_dir(&index_path, schema.clone())?;
 
     let mut index_writer = index.writer(50_000_000)?;
 
     let name = schema.get_field("name").unwrap();
     let tags = schema.get_field("tags").unwrap();
 
     // For convenience, tantivy also comes with a macro to
     // reduce the boilerplate above.
     index_writer.add_document(doc!(
         name => "the ditch",
         tags => Facet::from("/pools/north")
     ));
 
     index_writer.add_document(doc!(
         name => "little stacey",
         tags => Facet::from("/pools/south")
     ));
 
     index_writer.commit()?;
 
     index.load_searchers()?;
 
     let searcher = index.searcher();
 
     let mut facet_collector = FacetCollector::for_field(tags);
     facet_collector.add_facet("/pools");
 
-    let facet_counts = searcher.search(&AllQuery, &facet_collector).unwrap();
+    searcher.search(&AllQuery, &mut facet_collector).unwrap();
 
+    let counts = facet_collector.harvest();
     // This lists all of the facet counts
-    let facets: Vec<(&Facet, u64)> = facet_counts.get("/pools").collect();
+    let facets: Vec<(&Facet, u64)> = counts.get("/pools").collect();
     assert_eq!(
         facets,
         vec![
             (&Facet::from("/pools/north"), 1),
-            (&Facet::from("/pools/south"), 1),
+            (&Facet::from("/pools/south"), 1)
         ]
     );
 
     Ok(())
 }
 
 use tempdir::TempDir;

Postings iteration example (examples/*.rs; exact file name not shown in this diff view)
@@ -7,18 +7,21 @@
 // the list of documents containing a term, getting
 // its term frequency, and accessing its positions.
 
 
 // ---
 // Importing tantivy...
 #[macro_use]
 extern crate tantivy;
 use tantivy::schema::*;
 use tantivy::Index;
-use tantivy::{DocId, DocSet, Postings};
+use tantivy::{DocSet, DocId, Postings};
 
 fn main() -> tantivy::Result<()> {
 
 
     // We first create a schema for the sake of the
     // example. Check the `basic_search` example for more information.
-    let mut schema_builder = Schema::builder();
+    let mut schema_builder = SchemaBuilder::default();
 
     // For this example, we need to make sure to index positions for our title
     // field. `TEXT` precisely does this.
@@ -44,6 +47,7 @@ fn main() -> tantivy::Result<()> {
     // there is actually only one segment here, but let's iterate through the list
     // anyway)
     for segment_reader in searcher.segment_readers() {
+
         // A segment contains different data structure.
         // Inverted index stands for the combination of
         // - the term dictionary
@@ -54,18 +58,19 @@ fn main() -> tantivy::Result<()> {
         // Let's go through all docs containing the term `title:the` and access their position
         let term_the = Term::from_field_text(title, "the");
 
 
         // This segment posting object is like a cursor over the documents matching the term.
         // The `IndexRecordOption` arguments tells tantivy we will be interested in both term frequencies
         // and positions.
         //
         // If you don't need all this information, you may get better performance by decompressing less
         // information.
-        if let Some(mut segment_postings) =
-            inverted_index.read_postings(&term_the, IndexRecordOption::WithFreqsAndPositions)
-        {
+        if let Some(mut segment_postings) = inverted_index.read_postings(&term_the, IndexRecordOption::WithFreqsAndPositions) {
             // this buffer will be used to request for positions
             let mut positions: Vec<u32> = Vec::with_capacity(100);
             while segment_postings.advance() {
 
                 // the number of time the term appears in the document.
                 let doc_id: DocId = segment_postings.doc(); //< do not try to access this before calling advance once.
 
@@ -93,6 +98,7 @@ fn main() -> tantivy::Result<()> {
         }
     }
 
+
     // A `Term` is a text token associated with a field.
     // Let's go through all docs containing the term `title:the` and access their position
     let term_the = Term::from_field_text(title, "the");
@@ -105,6 +111,7 @@ fn main() -> tantivy::Result<()> {
     // Also, for some VERY specific high performance use case like an OLAP analysis of logs,
     // you can get better performance by accessing directly the blocks of doc ids.
     for segment_reader in searcher.segment_readers() {
+
         // A segment contains different data structure.
         // Inverted index stands for the combination of
         // - the term dictionary
@@ -117,9 +124,7 @@ fn main() -> tantivy::Result<()> {
         //
         // If you don't need all this information, you may get better performance by decompressing less
         // information.
-        if let Some(mut block_segment_postings) =
-            inverted_index.read_block_postings(&term_the, IndexRecordOption::Basic)
-        {
+        if let Some(mut block_segment_postings) = inverted_index.read_block_postings(&term_the, IndexRecordOption::Basic) {
             while block_segment_postings.advance() {
                 // Once again these docs MAY contains deleted documents as well.
                 let docs = block_segment_postings.docs();
@@ -131,3 +136,4 @@ fn main() -> tantivy::Result<()> {
 
     Ok(())
 }
+

Snippet example, removed in this diff (examples/*.rs; exact file name not shown in this view)
@@ -1,87 +0,0 @@
-// # Snippet example
-//
-// This example shows how to return a representative snippet of
-// your hit result.
-// Snippet are an extracted of a target document, and returned in HTML format.
-// The keyword searched by the user are highlighted with a `<b>` tag.
-extern crate tempdir;
-
-// ---
-// Importing tantivy...
-#[macro_use]
-extern crate tantivy;
-use tantivy::collector::TopDocs;
-use tantivy::query::QueryParser;
-use tantivy::schema::*;
-use tantivy::Index;
-use tantivy::{Snippet, SnippetGenerator};
-use tempdir::TempDir;
-
-fn main() -> tantivy::Result<()> {
-    // Let's create a temporary directory for the
-    // sake of this example
-    let index_path = TempDir::new("tantivy_example_dir")?;
-
-    // # Defining the schema
-    let mut schema_builder = Schema::builder();
-    let title = schema_builder.add_text_field("title", TEXT | STORED);
-    let body = schema_builder.add_text_field("body", TEXT | STORED);
-    let schema = schema_builder.build();
-
-    // # Indexing documents
-    let index = Index::create_in_dir(&index_path, schema.clone())?;
-
-    let mut index_writer = index.writer(50_000_000)?;
-
-    // we'll only need one doc for this example.
-    index_writer.add_document(doc!(
-        title => "Of Mice and Men",
-        body => "A few miles south of Soledad, the Salinas River drops in close to the hillside \
-        bank and runs deep and green. The water is warm too, for it has slipped twinkling \
-        over the yellow sands in the sunlight before reaching the narrow pool. On one \
-        side of the river the golden foothill slopes curve up to the strong and rocky \
-        Gabilan Mountains, but on the valley side the water is lined with trees—willows \
-        fresh and green with every spring, carrying in their lower leaf junctures the \
-        debris of the winter’s flooding; and sycamores with mottled, white, recumbent \
-        limbs and branches that arch over the pool"
-    ));
-    // ...
-    index_writer.commit()?;
-
-    index.load_searchers()?;
-
-    let searcher = index.searcher();
-    let query_parser = QueryParser::for_index(&index, vec![title, body]);
-    let query = query_parser.parse_query("sycamore spring")?;
-
-    let top_docs = searcher.search(&query, &TopDocs::with_limit(10))?;
-
-    let snippet_generator = SnippetGenerator::new(&searcher, &*query, body)?;
-
-    for (score, doc_address) in top_docs {
-        let doc = searcher.doc(doc_address)?;
-        let snippet = snippet_generator.snippet_from_doc(&doc);
-        println!("Document score {}:", score);
-        println!("title: {}", doc.get_first(title).unwrap().text().unwrap());
-        println!("snippet: {}", snippet.to_html());
-        println!("custom highlighting: {}", highlight(snippet));
-    }
-
-    Ok(())
-}
-
-fn highlight(snippet: Snippet) -> String {
-    let mut result = String::new();
-    let mut start_from = 0;
-
-    for (start, end) in snippet.highlighted().iter().map(|h| h.bounds()) {
-        result.push_str(&snippet.fragments()[start_from..start]);
-        result.push_str(" --> ");
-        result.push_str(&snippet.fragments()[start..end]);
-        result.push_str(" <-- ");
-        start_from = end;
-    }
-
-    result.push_str(&snippet.fragments()[start_from..]);
-    result
-}
@@ -15,78 +15,79 @@ extern crate tempdir;
|
|||||||
// Importing tantivy...
|
// Importing tantivy...
|
||||||
#[macro_use]
|
#[macro_use]
|
||||||
extern crate tantivy;
|
extern crate tantivy;
|
||||||
use tantivy::collector::TopDocs;
|
use tantivy::collector::TopCollector;
|
||||||
use tantivy::query::QueryParser;
|
use tantivy::query::QueryParser;
|
||||||
use tantivy::schema::*;
|
use tantivy::schema::*;
|
||||||
use tantivy::tokenizer::*;
|
use tantivy::tokenizer::*;
|
||||||
use tantivy::Index;
|
use tantivy::Index;
|
||||||
|
|
||||||
fn main() -> tantivy::Result<()> {
|
fn main() -> tantivy::Result<()> {
|
||||||
// this example assumes you understand the content in `basic_search`
|
// this example assumes you understand the content in `basic_search`
|
||||||
let mut schema_builder = Schema::builder();
|
let index_path = TempDir::new("tantivy_stopwords_example_dir")?;
|
||||||
|
let mut schema_builder = SchemaBuilder::default();
|
||||||
|
|
||||||
// This configures your custom options for how tantivy will
|
// This configures your custom options for how tantivy will
|
||||||
// store and process your content in the index; The key
|
// store and process your content in the index; The key
|
||||||
// to note is that we are setting the tokenizer to `stoppy`
|
// to note is that we are setting the tokenizer to `stoppy`
|
||||||
// which will be defined and registered below.
|
// which will be defined and registered below.
|
||||||
let text_field_indexing = TextFieldIndexing::default()
|
let text_field_indexing = TextFieldIndexing::default()
|
||||||
.set_tokenizer("stoppy")
|
.set_tokenizer("stoppy")
|
||||||
.set_index_option(IndexRecordOption::WithFreqsAndPositions);
|
.set_index_option(IndexRecordOption::WithFreqsAndPositions);
|
||||||
let text_options = TextOptions::default()
|
let text_options = TextOptions::default()
|
||||||
.set_indexing_options(text_field_indexing)
|
.set_indexing_options(text_field_indexing)
|
||||||
.set_stored();
|
.set_stored();
|
||||||
|
|
||||||
// Our first field is title.
|
// Our first field is title.
|
||||||
schema_builder.add_text_field("title", text_options);
|
schema_builder.add_text_field("title", text_options);
|
||||||
|
|
||||||
// Our second field is body.
|
// Our second field is body.
|
||||||
let text_field_indexing = TextFieldIndexing::default()
|
let text_field_indexing = TextFieldIndexing::default()
|
||||||
.set_tokenizer("stoppy")
|
.set_tokenizer("stoppy")
|
||||||
.set_index_option(IndexRecordOption::WithFreqsAndPositions);
|
.set_index_option(IndexRecordOption::WithFreqsAndPositions);
     let text_options = TextOptions::default()
         .set_indexing_options(text_field_indexing)
         .set_stored();
     schema_builder.add_text_field("body", text_options);

     let schema = schema_builder.build();

-    let index = Index::create_in_ram(schema.clone());
+    let index = Index::create_in_dir(&index_path, schema.clone())?;

     // This tokenizer lowers all of the text (to help with stop word matching)
     // then removes all instances of `the` and `and` from the corpus
     let tokenizer = SimpleTokenizer
         .filter(LowerCaser)
         .filter(StopWordFilter::remove(vec![
             "the".to_string(),
             "and".to_string(),
         ]));

     index.tokenizers().register("stoppy", tokenizer);

     let mut index_writer = index.writer(50_000_000)?;

     let title = schema.get_field("title").unwrap();
     let body = schema.get_field("body").unwrap();

     index_writer.add_document(doc!(
         title => "The Old Man and the Sea",
         body => "He was an old man who fished alone in a skiff in the Gulf Stream and \
                  he had gone eighty-four days now without taking a fish."
     ));

     index_writer.add_document(doc!(
         title => "Of Mice and Men",
         body => "A few miles south of Soledad, the Salinas River drops in close to the hillside \
                  bank and runs deep and green. The water is warm too, for it has slipped twinkling \
                  over the yellow sands in the sunlight before reaching the narrow pool. On one \
                  side of the river the golden foothill slopes curve up to the strong and rocky \
                  Gabilan Mountains, but on the valley side the water is lined with trees—willows \
                  fresh and green with every spring, carrying in their lower leaf junctures the \
                  debris of the winter’s flooding; and sycamores with mottled, white, recumbent \
                  limbs and branches that arch over the pool"
     ));

     index_writer.add_document(doc!(
         title => "Frankenstein",
         body => "You will rejoice to hear that no disaster has accompanied the commencement of an \
                  enterprise which you have regarded with such evil forebodings. I arrived here \
@@ -94,24 +95,35 @@ fn main() -> tantivy::Result<()> {
                  increasing confidence in the success of my undertaking."
     ));

     index_writer.commit()?;

     index.load_searchers()?;

     let searcher = index.searcher();

     let query_parser = QueryParser::for_index(&index, vec![title, body]);

-    // stop words are applied on the query as well.
-    // The following will be equivalent to `title:frankenstein`
-    let query = query_parser.parse_query("title:\"the Frankenstein\"")?;
-    let top_docs = searcher.search(&query, &TopDocs::with_limit(10))?;
-
-    for (score, doc_address) in top_docs {
-        let retrieved_doc = searcher.doc(doc_address)?;
-        println!("\n==\nDocument score {}:", score);
-        println!("{}", schema.to_json(&retrieved_doc));
-    }
-
-    Ok(())
+    // this will have NO hits because it was filtered out
+    // because the query is run through the analyzer you
+    // actually will get an error here because the query becomes
+    // empty
+    assert!(query_parser.parse_query("the").is_err());
+
+    // this will have hits
+    let query = query_parser.parse_query("is")?;
+
+    let mut top_collector = TopCollector::with_limit(10);
+
+    searcher.search(&*query, &mut top_collector)?;
+
+    let doc_addresses = top_collector.docs();
+
+    for doc_address in doc_addresses {
+        let retrieved_doc = searcher.doc(&doc_address)?;
+        println!("{}", schema.to_json(&retrieved_doc));
+    }
+
+    Ok(())
 }
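Editor's note (not part of the diff): the hunk above registers the lowercasing and stop-word tokenizer chain under the name "stoppy", but the schema wiring that points the body field at it sits outside this excerpt. A minimal sketch of that wiring, assuming tantivy's `TextFieldIndexing` builder and the `text_field_indexing` variable referenced at the top of the hunk:

// Editor's sketch: the tokenizer name must match the one registered on the index.
use tantivy::schema::{IndexRecordOption, TextFieldIndexing, TextOptions};

fn body_field_options() -> TextOptions {
    let text_field_indexing = TextFieldIndexing::default()
        .set_tokenizer("stoppy")
        .set_index_option(IndexRecordOption::WithFreqsAndPositions);
    TextOptions::default()
        .set_indexing_options(text_field_indexing)
        .set_stored()
}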
+use tempdir::TempDir;

@@ -9,7 +9,7 @@ fn main() -> tantivy::Result<()> {
     // Check out the basic example if this is confusing to you.
     //
     // first we need to define a schema ...
-    let mut schema_builder = Schema::builder();
+    let mut schema_builder = SchemaBuilder::default();
     schema_builder.add_text_field("title", TEXT | STORED);
     schema_builder.add_text_field("body", TEXT);
     schema_builder.add_u64_field("year", INT_INDEXED);
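Editor's sketch (not part of the diff): the schema above declares an indexed `year` u64 field next to the text fields. Assuming `Field` handles retrieved with `schema.get_field(..)`, adding a document with all three fields through the `doc!` macro would look like this; the concrete values are purely illustrative.

let title = schema.get_field("title").unwrap();
let body = schema.get_field("body").unwrap();
let year = schema.get_field("year").unwrap();
index_writer.add_document(doc!(
    title => "The Old Man and the Sea",
    body => "He was an old man who fished alone in a skiff in the Gulf Stream.",
    year => 1952u64
));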
@@ -1,2 +0,0 @@
-#!/bin/bash
-cargo test --no-default-features --features mmap -- --test-threads 1
src/collector/chained_collector.rs (new file, 142 lines)
@@ -0,0 +1,142 @@
+use collector::Collector;
+use DocId;
+use Result;
+use Score;
+use SegmentLocalId;
+use SegmentReader;
+
+/// Collector that does nothing.
+/// This is used in the chain Collector and will hopefully
+/// be optimized away by the compiler.
+pub struct DoNothingCollector;
+impl Collector for DoNothingCollector {
+    #[inline]
+    fn set_segment(&mut self, _: SegmentLocalId, _: &SegmentReader) -> Result<()> {
+        Ok(())
+    }
+    #[inline]
+    fn collect(&mut self, _doc: DocId, _score: Score) {}
+    #[inline]
+    fn requires_scoring(&self) -> bool {
+        false
+    }
+}
+
+/// Zero-cost abstraction used to collect on multiple collectors.
+/// This contraption is only usable if the types of your collectors
+/// are known at compile time.
+///
+/// ```rust
+/// #[macro_use]
+/// extern crate tantivy;
+/// use tantivy::schema::{SchemaBuilder, TEXT};
+/// use tantivy::{Index, Result};
+/// use tantivy::collector::{CountCollector, TopCollector, chain};
+/// use tantivy::query::QueryParser;
+///
+/// # fn main() { example().unwrap(); }
+/// fn example() -> Result<()> {
+///     let mut schema_builder = SchemaBuilder::new();
+///     let title = schema_builder.add_text_field("title", TEXT);
+///     let schema = schema_builder.build();
+///     let index = Index::create_in_ram(schema);
+///     {
+///         let mut index_writer = index.writer(3_000_000)?;
+///         index_writer.add_document(doc!(
+///             title => "The Name of the Wind",
+///         ));
+///         index_writer.add_document(doc!(
+///             title => "The Diary of Muadib",
+///         ));
+///         index_writer.add_document(doc!(
+///             title => "A Dairy Cow",
+///         ));
+///         index_writer.add_document(doc!(
+///             title => "The Diary of a Young Girl",
+///         ));
+///         index_writer.commit().unwrap();
+///     }
+///
+///     index.load_searchers()?;
+///     let searcher = index.searcher();
+///
+///     {
+///         let mut top_collector = TopCollector::with_limit(2);
+///         let mut count_collector = CountCollector::default();
+///         {
+///             let mut collectors = chain().push(&mut top_collector).push(&mut count_collector);
+///             let query_parser = QueryParser::for_index(&index, vec![title]);
+///             let query = query_parser.parse_query("diary")?;
+///             searcher.search(&*query, &mut collectors).unwrap();
+///         }
+///         assert_eq!(count_collector.count(), 2);
+///         assert!(top_collector.at_capacity());
+///     }
+///
+///     Ok(())
+/// }
+/// ```
+pub struct ChainedCollector<Left: Collector, Right: Collector> {
+    left: Left,
+    right: Right,
+}
+
+impl<Left: Collector, Right: Collector> ChainedCollector<Left, Right> {
+    /// Adds a collector
+    pub fn push<C: Collector>(self, new_collector: &mut C) -> ChainedCollector<Self, &mut C> {
+        ChainedCollector {
+            left: self,
+            right: new_collector,
+        }
+    }
+}
+
+impl<Left: Collector, Right: Collector> Collector for ChainedCollector<Left, Right> {
+    fn set_segment(
+        &mut self,
+        segment_local_id: SegmentLocalId,
+        segment: &SegmentReader,
+    ) -> Result<()> {
+        self.left.set_segment(segment_local_id, segment)?;
+        self.right.set_segment(segment_local_id, segment)?;
+        Ok(())
+    }
+
+    fn collect(&mut self, doc: DocId, score: Score) {
+        self.left.collect(doc, score);
+        self.right.collect(doc, score);
+    }
+
+    fn requires_scoring(&self) -> bool {
+        self.left.requires_scoring() || self.right.requires_scoring()
+    }
+}
+
+/// Creates a `ChainedCollector`
+pub fn chain() -> ChainedCollector<DoNothingCollector, DoNothingCollector> {
+    ChainedCollector {
+        left: DoNothingCollector,
+        right: DoNothingCollector,
+    }
+}
+
+#[cfg(test)]
+mod tests {
+
+    use super::*;
+    use collector::{Collector, CountCollector, TopCollector};
+
+    #[test]
+    fn test_chained_collector() {
+        let mut top_collector = TopCollector::with_limit(2);
+        let mut count_collector = CountCollector::default();
+        {
+            let mut collectors = chain().push(&mut top_collector).push(&mut count_collector);
+            collectors.collect(1, 0.2);
+            collectors.collect(2, 0.1);
+            collectors.collect(3, 0.5);
+        }
+        assert_eq!(count_collector.count(), 3);
+        assert!(top_collector.at_capacity());
+    }
+}
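Editor's note (not part of the diff): each `push` above wraps the existing chain in another `ChainedCollector`, so the whole collector stack is a single concrete type known at compile time. A short sketch of how the type nests:

// Editor's sketch, using the file's own types:
let mut top_collector = TopCollector::with_limit(2);
let mut count_collector = CountCollector::default();
// chain()                     -> ChainedCollector<DoNothingCollector, DoNothingCollector>
// .push(&mut top_collector)   -> ChainedCollector<that chain, &mut TopCollector>
// .push(&mut count_collector) -> one more ChainedCollector layer around the previous type
let collectors = chain().push(&mut top_collector).push(&mut count_collector);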
@@ -1,5 +1,4 @@
 use super::Collector;
-use collector::SegmentCollector;
 use DocId;
 use Result;
 use Score;
@@ -12,14 +11,14 @@ use SegmentReader;
 /// ```rust
 /// #[macro_use]
 /// extern crate tantivy;
-/// use tantivy::schema::{Schema, TEXT};
+/// use tantivy::schema::{SchemaBuilder, TEXT};
 /// use tantivy::{Index, Result};
-/// use tantivy::collector::Count;
+/// use tantivy::collector::CountCollector;
 /// use tantivy::query::QueryParser;
 ///
 /// # fn main() { example().unwrap(); }
 /// fn example() -> Result<()> {
-///     let mut schema_builder = Schema::builder();
+///     let mut schema_builder = SchemaBuilder::new();
 ///     let title = schema_builder.add_text_field("title", TEXT);
 ///     let schema = schema_builder.build();
 ///     let index = Index::create_in_ram(schema);
@@ -44,86 +43,59 @@ use SegmentReader;
 ///     let searcher = index.searcher();
 ///
 ///     {
+///         let mut count_collector = CountCollector::default();
 ///         let query_parser = QueryParser::for_index(&index, vec![title]);
 ///         let query = query_parser.parse_query("diary")?;
-///         let count = searcher.search(&query, &Count).unwrap();
+///         searcher.search(&*query, &mut count_collector).unwrap();
 ///
-///         assert_eq!(count, 2);
+///         assert_eq!(count_collector.count(), 2);
 ///     }
 ///
 ///     Ok(())
 /// }
 /// ```
-pub struct Count;
-
-impl Collector for Count {
-    type Fruit = usize;
-
-    type Child = SegmentCountCollector;
-
-    fn for_segment(&self, _: SegmentLocalId, _: &SegmentReader) -> Result<SegmentCountCollector> {
-        Ok(SegmentCountCollector::default())
-    }
-
-    fn requires_scoring(&self) -> bool {
-        false
-    }
-
-    fn merge_fruits(&self, segment_counts: Vec<usize>) -> Result<usize> {
-        Ok(segment_counts.into_iter().sum())
-    }
-}
-
 #[derive(Default)]
-pub struct SegmentCountCollector {
+pub struct CountCollector {
     count: usize,
 }

-impl SegmentCollector for SegmentCountCollector {
-    type Fruit = usize;
+impl CountCollector {
+    /// Returns the count of documents that were
+    /// collected.
+    pub fn count(&self) -> usize {
+        self.count
+    }
+}
+
+impl Collector for CountCollector {
+    fn set_segment(&mut self, _: SegmentLocalId, _: &SegmentReader) -> Result<()> {
+        Ok(())
+    }

     fn collect(&mut self, _: DocId, _: Score) {
         self.count += 1;
     }

-    fn harvest(self) -> usize {
-        self.count
+    fn requires_scoring(&self) -> bool {
+        false
     }
 }

 #[cfg(test)]
 mod tests {
-    use super::{Count, SegmentCountCollector};
-    use collector::Collector;
-    use collector::SegmentCollector;
+    use collector::{Collector, CountCollector};

     #[test]
-    fn test_count_collect_does_not_requires_scoring() {
-        assert!(!Count.requires_scoring());
-    }
-
-    #[test]
-    fn test_segment_count_collector() {
-        {
-            let count_collector = SegmentCountCollector::default();
-            assert_eq!(count_collector.harvest(), 0);
-        }
-        {
-            let mut count_collector = SegmentCountCollector::default();
-            count_collector.collect(0u32, 1f32);
-            assert_eq!(count_collector.harvest(), 1);
-        }
-        {
-            let mut count_collector = SegmentCountCollector::default();
-            count_collector.collect(0u32, 1f32);
-            assert_eq!(count_collector.harvest(), 1);
-        }
-        {
-            let mut count_collector = SegmentCountCollector::default();
-            count_collector.collect(0u32, 1f32);
-            count_collector.collect(1u32, 1f32);
-            assert_eq!(count_collector.harvest(), 2);
-        }
+    fn test_count_collector() {
+        let mut count_collector = CountCollector::default();
+        assert_eq!(count_collector.count(), 0);
+        count_collector.collect(0u32, 1f32);
+        assert_eq!(count_collector.count(), 1);
+        assert_eq!(count_collector.count(), 1);
+        count_collector.collect(1u32, 1f32);
+        assert_eq!(count_collector.count(), 2);
+        assert!(!count_collector.requires_scoring());
     }
 }
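Editor's note (not part of the diff): on the removed side above, counting is split between a per-segment `SegmentCountCollector` and a `Count` collector whose `merge_fruits` simply sums the per-segment results. A hypothetical extra test, written in the style of the file's own test module, illustrating that flow:

#[test]
fn sketch_count_merge_fruits() {
    // Two "segments", counted independently, then merged by Count::merge_fruits.
    let mut seg_a = SegmentCountCollector::default();
    seg_a.collect(0u32, 1f32);
    seg_a.collect(1u32, 1f32);
    let mut seg_b = SegmentCountCollector::default();
    seg_b.collect(0u32, 1f32);
    let total = Count.merge_fruits(vec![seg_a.harvest(), seg_b.harvest()]).unwrap();
    assert_eq!(total, 3);
}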
@@ -1,17 +1,20 @@
|
|||||||
use collector::Collector;
|
use collector::Collector;
|
||||||
use collector::SegmentCollector;
|
|
||||||
use docset::SkipResult;
|
use docset::SkipResult;
|
||||||
use fastfield::FacetReader;
|
use fastfield::FacetReader;
|
||||||
use schema::Facet;
|
use schema::Facet;
|
||||||
use schema::Field;
|
use schema::Field;
|
||||||
use std::cmp::Ordering;
|
use std::cell::UnsafeCell;
|
||||||
use std::collections::btree_map;
|
use std::collections::btree_map;
|
||||||
use std::collections::BTreeMap;
|
use std::collections::BTreeMap;
|
||||||
use std::collections::BTreeSet;
|
use std::collections::BTreeSet;
|
||||||
use std::collections::BinaryHeap;
|
use std::collections::BinaryHeap;
|
||||||
use std::collections::Bound;
|
use std::collections::Bound;
|
||||||
use std::iter::Peekable;
|
use std::iter::Peekable;
|
||||||
|
use std::mem;
|
||||||
use std::{u64, usize};
|
use std::{u64, usize};
|
||||||
|
use termdict::TermMerger;
|
||||||
|
|
||||||
|
use std::cmp::Ordering;
|
||||||
use DocId;
|
use DocId;
|
||||||
use Result;
|
use Result;
|
||||||
use Score;
|
use Score;
|
||||||
@@ -43,6 +46,12 @@ impl<'a> Ord for Hit<'a> {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
struct SegmentFacetCounter {
|
||||||
|
pub facet_reader: FacetReader,
|
||||||
|
pub facet_ords: Vec<u64>,
|
||||||
|
pub facet_counts: Vec<u64>,
|
||||||
|
}
|
||||||
|
|
||||||
fn facet_depth(facet_bytes: &[u8]) -> usize {
|
fn facet_depth(facet_bytes: &[u8]) -> usize {
|
||||||
if facet_bytes.is_empty() {
|
if facet_bytes.is_empty() {
|
||||||
0
|
0
|
||||||
@@ -82,14 +91,14 @@ fn facet_depth(facet_bytes: &[u8]) -> usize {
|
|||||||
/// ```rust
|
/// ```rust
|
||||||
/// #[macro_use]
|
/// #[macro_use]
|
||||||
/// extern crate tantivy;
|
/// extern crate tantivy;
|
||||||
/// use tantivy::schema::{Facet, Schema, TEXT};
|
/// use tantivy::schema::{Facet, SchemaBuilder, TEXT};
|
||||||
/// use tantivy::{Index, Result};
|
/// use tantivy::{Index, Result};
|
||||||
/// use tantivy::collector::FacetCollector;
|
/// use tantivy::collector::FacetCollector;
|
||||||
/// use tantivy::query::AllQuery;
|
/// use tantivy::query::AllQuery;
|
||||||
///
|
///
|
||||||
/// # fn main() { example().unwrap(); }
|
/// # fn main() { example().unwrap(); }
|
||||||
/// fn example() -> Result<()> {
|
/// fn example() -> Result<()> {
|
||||||
/// let mut schema_builder = Schema::builder();
|
/// let mut schema_builder = SchemaBuilder::new();
|
||||||
///
|
///
|
||||||
/// // Facet have their own specific type.
|
/// // Facet have their own specific type.
|
||||||
/// // It is not a bad practice to put all of your
|
/// // It is not a bad practice to put all of your
|
||||||
@@ -132,10 +141,13 @@ fn facet_depth(facet_bytes: &[u8]) -> usize {
|
|||||||
/// let mut facet_collector = FacetCollector::for_field(facet);
|
/// let mut facet_collector = FacetCollector::for_field(facet);
|
||||||
/// facet_collector.add_facet("/lang");
|
/// facet_collector.add_facet("/lang");
|
||||||
/// facet_collector.add_facet("/category");
|
/// facet_collector.add_facet("/category");
|
||||||
/// let facet_counts = searcher.search(&AllQuery, &facet_collector).unwrap();
|
/// searcher.search(&AllQuery, &mut facet_collector).unwrap();
|
||||||
|
///
|
||||||
|
/// // this object contains count aggregate for all of the facets.
|
||||||
|
/// let counts = facet_collector.harvest();
|
||||||
///
|
///
|
||||||
/// // This lists all of the facet counts
|
/// // This lists all of the facet counts
|
||||||
/// let facets: Vec<(&Facet, u64)> = facet_counts
|
/// let facets: Vec<(&Facet, u64)> = counts
|
||||||
/// .get("/category")
|
/// .get("/category")
|
||||||
/// .collect();
|
/// .collect();
|
||||||
/// assert_eq!(facets, vec![
|
/// assert_eq!(facets, vec![
|
||||||
@@ -147,10 +159,13 @@ fn facet_depth(facet_bytes: &[u8]) -> usize {
|
|||||||
/// {
|
/// {
|
||||||
/// let mut facet_collector = FacetCollector::for_field(facet);
|
/// let mut facet_collector = FacetCollector::for_field(facet);
|
||||||
/// facet_collector.add_facet("/category/fiction");
|
/// facet_collector.add_facet("/category/fiction");
|
||||||
/// let facet_counts = searcher.search(&AllQuery, &facet_collector).unwrap();
|
/// searcher.search(&AllQuery, &mut facet_collector).unwrap();
|
||||||
|
///
|
||||||
|
/// // this object contains count aggregate for all of the facets.
|
||||||
|
/// let counts = facet_collector.harvest();
|
||||||
///
|
///
|
||||||
/// // This lists all of the facet counts
|
/// // This lists all of the facet counts
|
||||||
/// let facets: Vec<(&Facet, u64)> = facet_counts
|
/// let facets: Vec<(&Facet, u64)> = counts
|
||||||
/// .get("/category/fiction")
|
/// .get("/category/fiction")
|
||||||
/// .collect();
|
/// .collect();
|
||||||
/// assert_eq!(facets, vec![
|
/// assert_eq!(facets, vec![
|
||||||
@@ -163,10 +178,13 @@ fn facet_depth(facet_bytes: &[u8]) -> usize {
|
|||||||
/// {
|
/// {
|
||||||
/// let mut facet_collector = FacetCollector::for_field(facet);
|
/// let mut facet_collector = FacetCollector::for_field(facet);
|
||||||
/// facet_collector.add_facet("/category/fiction");
|
/// facet_collector.add_facet("/category/fiction");
|
||||||
/// let facet_counts = searcher.search(&AllQuery, &facet_collector).unwrap();
|
/// searcher.search(&AllQuery, &mut facet_collector).unwrap();
|
||||||
|
///
|
||||||
|
/// // this object contains count aggregate for all of the facets.
|
||||||
|
/// let counts = facet_collector.harvest();
|
||||||
///
|
///
|
||||||
/// // This lists all of the facet counts
|
/// // This lists all of the facet counts
|
||||||
/// let facets: Vec<(&Facet, u64)> = facet_counts.top_k("/category/fiction", 1);
|
/// let facets: Vec<(&Facet, u64)> = counts.top_k("/category/fiction", 1);
|
||||||
/// assert_eq!(facets, vec![
|
/// assert_eq!(facets, vec![
|
||||||
/// (&Facet::from("/category/fiction/fantasy"), 2)
|
/// (&Facet::from("/category/fiction/fantasy"), 2)
|
||||||
/// ]);
|
/// ]);
|
||||||
@@ -176,19 +194,19 @@ fn facet_depth(facet_bytes: &[u8]) -> usize {
|
|||||||
/// }
|
/// }
|
||||||
/// ```
|
/// ```
|
||||||
pub struct FacetCollector {
|
pub struct FacetCollector {
|
||||||
|
facet_ords: Vec<u64>,
|
||||||
field: Field,
|
field: Field,
|
||||||
facets: BTreeSet<Facet>,
|
ff_reader: Option<UnsafeCell<FacetReader>>,
|
||||||
}
|
segment_counters: Vec<SegmentFacetCounter>,
|
||||||
|
|
||||||
pub struct FacetSegmentCollector {
|
|
||||||
reader: FacetReader,
|
|
||||||
facet_ords_buf: Vec<u64>,
|
|
||||||
// facet_ord -> collapse facet_id
|
// facet_ord -> collapse facet_id
|
||||||
collapse_mapping: Vec<usize>,
|
current_segment_collapse_mapping: Vec<usize>,
|
||||||
// collapse facet_id -> count
|
// collapse facet_id -> count
|
||||||
counts: Vec<u64>,
|
current_segment_counts: Vec<u64>,
|
||||||
// collapse facet_id -> facet_ord
|
// collapse facet_id -> facet_ord
|
||||||
collapse_facet_ords: Vec<u64>,
|
current_collapse_facet_ords: Vec<u64>,
|
||||||
|
|
||||||
|
facets: BTreeSet<Facet>,
|
||||||
}
|
}
|
||||||
|
|
||||||
fn skip<'a, I: Iterator<Item = &'a Facet>>(
|
fn skip<'a, I: Iterator<Item = &'a Facet>>(
|
||||||
@@ -222,8 +240,15 @@ impl FacetCollector {
|
|||||||
/// is of the proper type.
|
/// is of the proper type.
|
||||||
pub fn for_field(field: Field) -> FacetCollector {
|
pub fn for_field(field: Field) -> FacetCollector {
|
||||||
FacetCollector {
|
FacetCollector {
|
||||||
|
facet_ords: Vec::with_capacity(255),
|
||||||
|
segment_counters: Vec::new(),
|
||||||
field,
|
field,
|
||||||
facets: BTreeSet::default(),
|
ff_reader: None,
|
||||||
|
facets: BTreeSet::new(),
|
||||||
|
|
||||||
|
current_segment_collapse_mapping: Vec::new(),
|
||||||
|
current_collapse_facet_ords: Vec::new(),
|
||||||
|
current_segment_counts: Vec::new(),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -253,100 +278,141 @@ impl FacetCollector {
|
|||||||
}
|
}
|
||||||
self.facets.insert(facet);
|
self.facets.insert(facet);
|
||||||
}
|
}
|
||||||
}
|
|
||||||
|
|
||||||
impl Collector for FacetCollector {
|
|
||||||
type Fruit = FacetCounts;
|
|
||||||
|
|
||||||
type Child = FacetSegmentCollector;
|
|
||||||
|
|
||||||
fn for_segment(
|
|
||||||
&self,
|
|
||||||
_: SegmentLocalId,
|
|
||||||
reader: &SegmentReader,
|
|
||||||
) -> Result<FacetSegmentCollector> {
|
|
||||||
let facet_reader = reader.facet_reader(self.field)?;
|
|
||||||
|
|
||||||
let mut collapse_mapping = Vec::new();
|
|
||||||
let mut counts = Vec::new();
|
|
||||||
let mut collapse_facet_ords = Vec::new();
|
|
||||||
|
|
||||||
|
fn set_collapse_mapping(&mut self, facet_reader: &FacetReader) {
|
||||||
|
self.current_segment_collapse_mapping.clear();
|
||||||
|
self.current_collapse_facet_ords.clear();
|
||||||
|
self.current_segment_counts.clear();
|
||||||
let mut collapse_facet_it = self.facets.iter().peekable();
|
let mut collapse_facet_it = self.facets.iter().peekable();
|
||||||
collapse_facet_ords.push(0);
|
self.current_collapse_facet_ords.push(0);
|
||||||
{
|
let mut facet_streamer = facet_reader.facet_dict().range().into_stream();
|
||||||
let mut facet_streamer = facet_reader.facet_dict().range().into_stream();
|
if !facet_streamer.advance() {
|
||||||
if facet_streamer.advance() {
|
return;
|
||||||
'outer: loop {
|
}
|
||||||
// at the beginning of this loop, facet_streamer
|
'outer: loop {
|
||||||
// is positioned on a term that has not been processed yet.
|
// at the beginning of this loop, facet_streamer
|
||||||
let skip_result = skip(facet_streamer.key(), &mut collapse_facet_it);
|
// is positioned on a term that has not been processed yet.
|
||||||
match skip_result {
|
let skip_result = skip(facet_streamer.key(), &mut collapse_facet_it);
|
||||||
SkipResult::Reached => {
|
match skip_result {
|
||||||
// we reach a facet we decided to collapse.
|
SkipResult::Reached => {
|
||||||
let collapse_depth = facet_depth(facet_streamer.key());
|
// we reach a facet we decided to collapse.
|
||||||
let mut collapsed_id = 0;
|
let collapse_depth = facet_depth(facet_streamer.key());
|
||||||
collapse_mapping.push(0);
|
let mut collapsed_id = 0;
|
||||||
while facet_streamer.advance() {
|
self.current_segment_collapse_mapping.push(0);
|
||||||
let depth = facet_depth(facet_streamer.key());
|
while facet_streamer.advance() {
|
||||||
if depth <= collapse_depth {
|
let depth = facet_depth(facet_streamer.key());
|
||||||
continue 'outer;
|
if depth <= collapse_depth {
|
||||||
}
|
continue 'outer;
|
||||||
if depth == collapse_depth + 1 {
|
|
||||||
collapsed_id = collapse_facet_ords.len();
|
|
||||||
collapse_facet_ords.push(facet_streamer.term_ord());
|
|
||||||
collapse_mapping.push(collapsed_id);
|
|
||||||
} else {
|
|
||||||
collapse_mapping.push(collapsed_id);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
break;
|
|
||||||
}
|
}
|
||||||
SkipResult::End | SkipResult::OverStep => {
|
if depth == collapse_depth + 1 {
|
||||||
collapse_mapping.push(0);
|
collapsed_id = self.current_collapse_facet_ords.len();
|
||||||
if !facet_streamer.advance() {
|
self.current_collapse_facet_ords
|
||||||
break;
|
.push(facet_streamer.term_ord());
|
||||||
}
|
self.current_segment_collapse_mapping.push(collapsed_id);
|
||||||
|
} else {
|
||||||
|
self.current_segment_collapse_mapping.push(collapsed_id);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
SkipResult::End | SkipResult::OverStep => {
|
||||||
|
self.current_segment_collapse_mapping.push(0);
|
||||||
|
if !facet_streamer.advance() {
|
||||||
|
break;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
counts.resize(collapse_facet_ords.len(), 0);
|
|
||||||
|
|
||||||
Ok(FacetSegmentCollector {
|
|
||||||
reader: facet_reader,
|
|
||||||
facet_ords_buf: Vec::with_capacity(255),
|
|
||||||
collapse_mapping,
|
|
||||||
counts,
|
|
||||||
collapse_facet_ords,
|
|
||||||
})
|
|
||||||
}
|
}
|
||||||
|
|
||||||
fn requires_scoring(&self) -> bool {
|
fn finalize_segment(&mut self) {
|
||||||
false
|
if self.ff_reader.is_some() {
|
||||||
|
self.segment_counters.push(SegmentFacetCounter {
|
||||||
|
facet_reader: self.ff_reader.take().unwrap().into_inner(),
|
||||||
|
facet_ords: mem::replace(&mut self.current_collapse_facet_ords, Vec::new()),
|
||||||
|
facet_counts: mem::replace(&mut self.current_segment_counts, Vec::new()),
|
||||||
|
});
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
fn merge_fruits(&self, segments_facet_counts: Vec<FacetCounts>) -> Result<FacetCounts> {
|
/// Returns the results of the collection.
|
||||||
let mut facet_counts: BTreeMap<Facet, u64> = BTreeMap::new();
|
///
|
||||||
for segment_facet_counts in segments_facet_counts {
|
/// This method does not just return the counters,
|
||||||
for (facet, count) in segment_facet_counts.facet_counts {
|
/// it also translates the facet ordinals of the last segment.
|
||||||
*(facet_counts.entry(facet).or_insert(0)) += count;
|
pub fn harvest(mut self) -> FacetCounts {
|
||||||
|
self.finalize_segment();
|
||||||
|
|
||||||
|
let collapsed_facet_ords: Vec<&[u64]> = self.segment_counters
|
||||||
|
.iter()
|
||||||
|
.map(|segment_counter| &segment_counter.facet_ords[..])
|
||||||
|
.collect();
|
||||||
|
let collapsed_facet_counts: Vec<&[u64]> = self.segment_counters
|
||||||
|
.iter()
|
||||||
|
.map(|segment_counter| &segment_counter.facet_counts[..])
|
||||||
|
.collect();
|
||||||
|
|
||||||
|
let facet_streams = self.segment_counters
|
||||||
|
.iter()
|
||||||
|
.map(|seg_counts| seg_counts.facet_reader.facet_dict().range().into_stream())
|
||||||
|
.collect::<Vec<_>>();
|
||||||
|
|
||||||
|
let mut facet_merger = TermMerger::new(facet_streams);
|
||||||
|
let mut facet_counts = BTreeMap::new();
|
||||||
|
|
||||||
|
while facet_merger.advance() {
|
||||||
|
let count = facet_merger
|
||||||
|
.current_kvs()
|
||||||
|
.iter()
|
||||||
|
.map(|it| {
|
||||||
|
let seg_ord = it.segment_ord;
|
||||||
|
let term_ord = it.streamer.term_ord();
|
||||||
|
collapsed_facet_ords[seg_ord]
|
||||||
|
.binary_search(&term_ord)
|
||||||
|
.map(|collapsed_term_id| {
|
||||||
|
if collapsed_term_id == 0 {
|
||||||
|
0
|
||||||
|
} else {
|
||||||
|
collapsed_facet_counts[seg_ord][collapsed_term_id]
|
||||||
|
}
|
||||||
|
})
|
||||||
|
.unwrap_or(0)
|
||||||
|
})
|
||||||
|
.sum();
|
||||||
|
if count > 0u64 {
|
||||||
|
let bytes: Vec<u8> = facet_merger.key().to_owned();
|
||||||
|
// may create a corrupted facet if the term dictionary is corrupted
|
||||||
|
let facet = unsafe { Facet::from_encoded(bytes) };
|
||||||
|
facet_counts.insert(facet, count);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
Ok(FacetCounts { facet_counts })
|
FacetCounts { facet_counts }
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
impl SegmentCollector for FacetSegmentCollector {
|
impl Collector for FacetCollector {
|
||||||
type Fruit = FacetCounts;
|
fn set_segment(&mut self, _: SegmentLocalId, reader: &SegmentReader) -> Result<()> {
|
||||||
|
self.finalize_segment();
|
||||||
|
let facet_reader = reader.facet_reader(self.field)?;
|
||||||
|
self.set_collapse_mapping(&facet_reader);
|
||||||
|
self.current_segment_counts
|
||||||
|
.resize(self.current_collapse_facet_ords.len(), 0);
|
||||||
|
self.ff_reader = Some(UnsafeCell::new(facet_reader));
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
fn collect(&mut self, doc: DocId, _: Score) {
|
fn collect(&mut self, doc: DocId, _: Score) {
|
||||||
self.reader.facet_ords(doc, &mut self.facet_ords_buf);
|
let facet_reader: &mut FacetReader = unsafe {
|
||||||
|
&mut *self.ff_reader
|
||||||
|
.as_ref()
|
||||||
|
.expect("collect() was called before set_segment. This should never happen.")
|
||||||
|
.get()
|
||||||
|
};
|
||||||
|
facet_reader.facet_ords(doc, &mut self.facet_ords);
|
||||||
let mut previous_collapsed_ord: usize = usize::MAX;
|
let mut previous_collapsed_ord: usize = usize::MAX;
|
||||||
for &facet_ord in &self.facet_ords_buf {
|
for &facet_ord in &self.facet_ords {
|
||||||
let collapsed_ord = self.collapse_mapping[facet_ord as usize];
|
let collapsed_ord = self.current_segment_collapse_mapping[facet_ord as usize];
|
||||||
self.counts[collapsed_ord] += if collapsed_ord == previous_collapsed_ord {
|
self.current_segment_counts[collapsed_ord] += if collapsed_ord == previous_collapsed_ord
|
||||||
|
{
|
||||||
0
|
0
|
||||||
} else {
|
} else {
|
||||||
1
|
1
|
||||||
@@ -355,23 +421,8 @@ impl SegmentCollector for FacetSegmentCollector {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Returns the results of the collection.
|
fn requires_scoring(&self) -> bool {
|
||||||
///
|
false
|
||||||
/// This method does not just return the counters,
|
|
||||||
/// it also translates the facet ordinals of the last segment.
|
|
||||||
fn harvest(self) -> FacetCounts {
|
|
||||||
let mut facet_counts = BTreeMap::new();
|
|
||||||
let facet_dict = self.reader.facet_dict();
|
|
||||||
for (collapsed_facet_ord, count) in self.counts.iter().cloned().enumerate() {
|
|
||||||
if count == 0 {
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
let mut facet = vec![];
|
|
||||||
let facet_ord = self.collapse_facet_ords[collapsed_facet_ord];
|
|
||||||
facet_dict.ord_to_term(facet_ord as u64, &mut facet);
|
|
||||||
facet_counts.insert(unsafe { Facet::from_encoded(facet) }, count);
|
|
||||||
}
|
|
||||||
FacetCounts { facet_counts }
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -425,8 +476,9 @@ impl FacetCounts {
|
|||||||
heap.push(Hit { count, facet });
|
heap.push(Hit { count, facet });
|
||||||
}
|
}
|
||||||
|
|
||||||
-        let mut lowest_count: u64 = heap.peek().map(|hit| hit.count).unwrap_or(u64::MIN); //< the `unwrap_or` case may be triggered but the value
-        // is never used in that case.
+        let mut lowest_count: u64 = heap.peek().map(|hit| hit.count)
+            .unwrap_or(u64::MIN); //< the `unwrap_or` case may be triggered but the value
+                                  // is never used in that case.
|
|
||||||
for (facet, count) in it {
|
for (facet, count) in it {
|
||||||
if count > lowest_count {
|
if count > lowest_count {
|
||||||
@@ -452,14 +504,14 @@ mod tests {
|
|||||||
use core::Index;
|
use core::Index;
|
||||||
use query::AllQuery;
|
use query::AllQuery;
|
||||||
use rand::distributions::Uniform;
|
use rand::distributions::Uniform;
|
||||||
use rand::prelude::SliceRandom;
|
|
||||||
use rand::{thread_rng, Rng};
|
use rand::{thread_rng, Rng};
|
||||||
use schema::{Document, Facet, Field, Schema};
|
use schema::Field;
|
||||||
|
use schema::{Document, Facet, SchemaBuilder};
|
||||||
use std::iter;
|
use std::iter;
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn test_facet_collector_drilldown() {
|
fn test_facet_collector_drilldown() {
|
||||||
let mut schema_builder = Schema::builder();
|
let mut schema_builder = SchemaBuilder::new();
|
||||||
let facet_field = schema_builder.add_facet_field("facet");
|
let facet_field = schema_builder.add_facet_field("facet");
|
||||||
let schema = schema_builder.build();
|
let schema = schema_builder.build();
|
||||||
let index = Index::create_in_ram(schema);
|
let index = Index::create_in_ram(schema);
|
||||||
@@ -474,7 +526,8 @@ mod tests {
|
|||||||
n /= 4;
|
n /= 4;
|
||||||
let leaf = n % 5;
|
let leaf = n % 5;
|
||||||
Facet::from(&format!("/top{}/mid{}/leaf{}", top, mid, leaf))
|
Facet::from(&format!("/top{}/mid{}/leaf{}", top, mid, leaf))
|
||||||
}).collect();
|
})
|
||||||
|
.collect();
|
||||||
for i in 0..num_facets * 10 {
|
for i in 0..num_facets * 10 {
|
||||||
let mut doc = Document::new();
|
let mut doc = Document::new();
|
||||||
doc.add_facet(facet_field, facets[i % num_facets].clone());
|
doc.add_facet(facet_field, facets[i % num_facets].clone());
|
||||||
@@ -483,10 +536,12 @@ mod tests {
|
|||||||
index_writer.commit().unwrap();
|
index_writer.commit().unwrap();
|
||||||
index.load_searchers().unwrap();
|
index.load_searchers().unwrap();
|
||||||
let searcher = index.searcher();
|
let searcher = index.searcher();
|
||||||
|
|
||||||
let mut facet_collector = FacetCollector::for_field(facet_field);
|
let mut facet_collector = FacetCollector::for_field(facet_field);
|
||||||
facet_collector.add_facet(Facet::from("/top1"));
|
facet_collector.add_facet(Facet::from("/top1"));
|
||||||
let counts = searcher.search(&AllQuery, &facet_collector).unwrap();
|
searcher.search(&AllQuery, &mut facet_collector).unwrap();
|
||||||
|
|
||||||
|
let counts: FacetCounts = facet_collector.harvest();
|
||||||
{
|
{
|
||||||
let facets: Vec<(String, u64)> = counts
|
let facets: Vec<(String, u64)> = counts
|
||||||
.get("/top1")
|
.get("/top1")
|
||||||
@@ -499,8 +554,7 @@ mod tests {
|
|||||||
("/top1/mid1", 50),
|
("/top1/mid1", 50),
|
||||||
("/top1/mid2", 50),
|
("/top1/mid2", 50),
|
||||||
("/top1/mid3", 50),
|
("/top1/mid3", 50),
|
||||||
]
|
].iter()
|
||||||
.iter()
|
|
||||||
.map(|&(facet_str, count)| (String::from(facet_str), count))
|
.map(|&(facet_str, count)| (String::from(facet_str), count))
|
||||||
.collect::<Vec<_>>()
|
.collect::<Vec<_>>()
|
||||||
);
|
);
|
||||||
@@ -520,7 +574,7 @@ mod tests {
|
|||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn test_doc_unsorted_multifacet() {
|
fn test_doc_unsorted_multifacet() {
|
||||||
let mut schema_builder = Schema::builder();
|
let mut schema_builder = SchemaBuilder::new();
|
||||||
let facet_field = schema_builder.add_facet_field("facets");
|
let facet_field = schema_builder.add_facet_field("facets");
|
||||||
let schema = schema_builder.build();
|
let schema = schema_builder.build();
|
||||||
let index = Index::create_in_ram(schema);
|
let index = Index::create_in_ram(schema);
|
||||||
@@ -537,7 +591,8 @@ mod tests {
|
|||||||
assert_eq!(searcher.num_docs(), 1);
|
assert_eq!(searcher.num_docs(), 1);
|
||||||
let mut facet_collector = FacetCollector::for_field(facet_field);
|
let mut facet_collector = FacetCollector::for_field(facet_field);
|
||||||
facet_collector.add_facet("/subjects");
|
facet_collector.add_facet("/subjects");
|
||||||
let counts = searcher.search(&AllQuery, &facet_collector).unwrap();
|
searcher.search(&AllQuery, &mut facet_collector).unwrap();
|
||||||
|
let counts = facet_collector.harvest();
|
||||||
let facets: Vec<(&Facet, u64)> = counts.get("/subjects").collect();
|
let facets: Vec<(&Facet, u64)> = counts.get("/subjects").collect();
|
||||||
assert_eq!(facets[0].1, 1);
|
assert_eq!(facets[0].1, 1);
|
||||||
}
|
}
|
||||||
@@ -551,7 +606,7 @@ mod tests {
|
|||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn test_facet_collector_topk() {
|
fn test_facet_collector_topk() {
|
||||||
let mut schema_builder = Schema::builder();
|
let mut schema_builder = SchemaBuilder::new();
|
||||||
let facet_field = schema_builder.add_facet_field("facet");
|
let facet_field = schema_builder.add_facet_field("facet");
|
||||||
let schema = schema_builder.build();
|
let schema = schema_builder.build();
|
||||||
let index = Index::create_in_ram(schema);
|
let index = Index::create_in_ram(schema);
|
||||||
@@ -563,14 +618,10 @@ mod tests {
|
|||||||
let facet = Facet::from(&format!("/facet/{}", c));
|
let facet = Facet::from(&format!("/facet/{}", c));
|
||||||
let doc = doc!(facet_field => facet);
|
let doc = doc!(facet_field => facet);
|
||||||
             iter::repeat(doc).take(count)
-        }).map(|mut doc| {
-            doc.add_facet(
-                facet_field,
-                &format!("/facet/{}", thread_rng().sample(&uniform)),
-            );
-            doc
-        }).collect();
-        docs[..].shuffle(&mut thread_rng());
+        })
+        .map(|mut doc| { doc.add_facet(facet_field, &format!("/facet/{}", thread_rng().sample(&uniform) )); doc})
+        .collect();
+        thread_rng().shuffle(&mut docs[..]);
|
|
||||||
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
|
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
|
||||||
for doc in docs {
|
for doc in docs {
|
||||||
@@ -583,8 +634,9 @@ mod tests {
|
|||||||
|
|
||||||
let mut facet_collector = FacetCollector::for_field(facet_field);
|
let mut facet_collector = FacetCollector::for_field(facet_field);
|
||||||
facet_collector.add_facet("/facet");
|
facet_collector.add_facet("/facet");
|
||||||
let counts: FacetCounts = searcher.search(&AllQuery, &facet_collector).unwrap();
|
searcher.search(&AllQuery, &mut facet_collector).unwrap();
|
||||||
|
|
||||||
|
let counts: FacetCounts = facet_collector.harvest();
|
||||||
{
|
{
|
||||||
let facets: Vec<(&Facet, u64)> = counts.top_k("/facet", 3);
|
let facets: Vec<(&Facet, u64)> = counts.top_k("/facet", 3);
|
||||||
assert_eq!(
|
assert_eq!(
|
||||||
@@ -607,13 +659,13 @@ mod bench {
|
|||||||
use query::AllQuery;
|
use query::AllQuery;
|
||||||
use rand::{thread_rng, Rng};
|
use rand::{thread_rng, Rng};
|
||||||
use schema::Facet;
|
use schema::Facet;
|
||||||
use schema::Schema;
|
use schema::SchemaBuilder;
|
||||||
use test::Bencher;
|
use test::Bencher;
|
||||||
use Index;
|
use Index;
|
||||||
|
|
||||||
#[bench]
|
#[bench]
|
||||||
fn bench_facet_collector(b: &mut Bencher) {
|
fn bench_facet_collector(b: &mut Bencher) {
|
||||||
let mut schema_builder = Schema::builder();
|
let mut schema_builder = SchemaBuilder::new();
|
||||||
let facet_field = schema_builder.add_facet_field("facet");
|
let facet_field = schema_builder.add_facet_field("facet");
|
||||||
let schema = schema_builder.build();
|
let schema = schema_builder.build();
|
||||||
let index = Index::create_in_ram(schema);
|
let index = Index::create_in_ram(schema);
|
||||||
@@ -637,8 +689,8 @@ mod bench {
|
|||||||
|
|
||||||
b.iter(|| {
|
b.iter(|| {
|
||||||
let searcher = index.searcher();
|
let searcher = index.searcher();
|
||||||
let facet_collector = FacetCollector::for_field(facet_field);
|
let mut facet_collector = FacetCollector::for_field(facet_field);
|
||||||
searcher.search(&AllQuery, &facet_collector).unwrap();
|
searcher.search(&AllQuery, &mut facet_collector).unwrap();
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -79,7 +79,7 @@ mod tests {
     // make sure we have facet counters correctly filled
     fn test_facet_collector_results() {

-        let mut schema_builder = schema::Schema::builder();
+        let mut schema_builder = schema::SchemaBuilder::new();
         let num_field_i64 = schema_builder.add_i64_field("num_i64", FAST);
         let num_field_u64 = schema_builder.add_u64_field("num_u64", FAST);
         let text_field = schema_builder.add_text_field("text", STRING);
@@ -1,91 +1,7 @@
 /*!
-# Collectors
-
-Collectors define the information you want to extract from the documents matching the queries.
-In tantivy jargon, we call this information your search "fruit".
-
-Your fruit could for instance be :
-- [the count of matching documents](./struct.Count.html)
-- [the top 10 documents, by relevancy or by a fast field](./struct.TopDocs.html)
-- [facet counts](./struct.FacetCollector.html)
-
-At one point in your code, you will trigger the actual search operation by calling
-[the `search(...)` method of your `Searcher` object](../struct.Searcher.html#method.search).
-This call will look like this.
-
-```verbatim
-let fruit = searcher.search(&query, &collector)?;
-```
-
-Here the type of fruit is actually determined as an associated type of the collector (`Collector::Fruit`).
-
-
-# Combining several collectors
-
-A rich search experience often requires to run several collectors on your search query.
-For instance,
-- selecting the top-K products matching your query
-- counting the matching documents
-- computing several facets
-- computing statistics about the matching product prices
-
-A simple and efficient way to do that is to pass your collectors as one tuple.
-The resulting `Fruit` will then be a typed tuple with each collector's original fruits
-in their respective position.
-
-```rust
-# extern crate tantivy;
-# use tantivy::schema::*;
-# use tantivy::*;
-# use tantivy::query::*;
-use tantivy::collector::{Count, TopDocs};
-#
-# fn main() -> tantivy::Result<()> {
-# let mut schema_builder = Schema::builder();
-# let title = schema_builder.add_text_field("title", TEXT);
-# let schema = schema_builder.build();
-# let index = Index::create_in_ram(schema);
-# let mut index_writer = index.writer(3_000_000)?;
-# index_writer.add_document(doc!(
-# title => "The Name of the Wind",
-# ));
-# index_writer.add_document(doc!(
-# title => "The Diary of Muadib",
-# ));
-# index_writer.commit().unwrap();
-# index.load_searchers()?;
-# let searcher = index.searcher();
-# let query_parser = QueryParser::for_index(&index, vec![title]);
-# let query = query_parser.parse_query("diary")?;
-let (doc_count, top_docs): (usize, Vec<(Score, DocAddress)>) =
-    searcher.search(&query, &(Count, TopDocs::with_limit(2)))?;
-# Ok(())
-# }
-```
-
-The `Collector` trait is implemented for up to 4 collectors.
-If you have more than 4 collectors, you can either group them into
-tuples of tuples `(a,(b,(c,d)))`, or rely on `MultiCollector`'s.
-
-# Combining several collectors dynamically
-
-Combining collectors into a tuple is a zero-cost abstraction: everything
-happens as if you had manually implemented a single collector
-combining all of our features.
-
-Unfortunately it requires you to know at compile time your collector types.
-If on the other hand, the collectors depend on some query parameter,
-you can rely on `MultiCollector`'s.
-
-
-# Implementing your own collectors.
-
-See the `custom_collector` example.
-
+Defines how the documents matching a search query should be processed.
 */

-use downcast;
 use DocId;
 use Result;
 use Score;
@@ -93,275 +9,238 @@ use SegmentLocalId;
 use SegmentReader;

 mod count_collector;
-pub use self::count_collector::Count;
+pub use self::count_collector::CountCollector;

 mod multi_collector;
 pub use self::multi_collector::MultiCollector;

 mod top_collector;
-
-mod top_score_collector;
-pub use self::top_score_collector::TopDocs;
-
-mod top_field_collector;
-pub use self::top_field_collector::TopDocsByField;
+pub use self::top_collector::TopCollector;

 mod facet_collector;
 pub use self::facet_collector::FacetCollector;

-/// `Fruit` is the type for the result of our collection.
-/// e.g. `usize` for the `Count` collector.
-pub trait Fruit: Send + downcast::Any {}
-
-impl<T> Fruit for T where T: Send + downcast::Any {}
+mod chained_collector;
+pub use self::chained_collector::{chain, ChainedCollector};

 /// Collectors are in charge of collecting and retaining relevant
 /// information from the document found and scored by the query.
 ///
+///
 /// For instance,
 ///
 /// - keeping track of the top 10 best documents
 /// - computing a breakdown over a fast field
 /// - computing the number of documents matching the query
 ///
-/// Our search index is in fact a collection of segments, so
-/// a `Collector` trait is actually more of a factory to instance
-/// `SegmentCollector`s for each segments.
+/// Queries are in charge of pushing the `DocSet` to the collector.
 ///
-/// The collection logic itself is in the `SegmentCollector`.
+/// As they work on multiple segments, they first inform
+/// the collector of a change in a segment and then
+/// call the `collect` method to push the document to the collector.
+///
+/// Temporally, our collector will receive calls
+/// - `.set_segment(0, segment_reader_0)`
+/// - `.collect(doc0_of_segment_0)`
+/// - `.collect(...)`
+/// - `.collect(last_doc_of_segment_0)`
+/// - `.set_segment(1, segment_reader_1)`
+/// - `.collect(doc0_of_segment_1)`
+/// - `.collect(...)`
+/// - `.collect(last_doc_of_segment_1)`
+/// - `...`
+/// - `.collect(last_doc_of_last_segment)`
 ///
 /// Segments are not guaranteed to be visited in any specific order.
-pub trait Collector: Sync {
-    /// `Fruit` is the type for the result of our collection.
-    /// e.g. `usize` for the `Count` collector.
-    type Fruit: Fruit;
-
-    /// Type of the `SegmentCollector` associated to this collector.
-    type Child: SegmentCollector<Fruit = Self::Fruit>;
-
+pub trait Collector {
     /// `set_segment` is called before beginning to enumerate
     /// on this segment.
-    fn for_segment(
-        &self,
+    fn set_segment(
+        &mut self,
         segment_local_id: SegmentLocalId,
         segment: &SegmentReader,
-    ) -> Result<Self::Child>;
+    ) -> Result<()>;

-    /// Returns true iff the collector requires to compute scores for documents.
-    fn requires_scoring(&self) -> bool;
-
-    /// Combines the fruit associated to the collection of each segments
-    /// into one fruit.
-    fn merge_fruits(&self, segment_fruits: Vec<Self::Fruit>) -> Result<Self::Fruit>;
-}
-
-/// The `SegmentCollector` is the trait in charge of defining the
-/// collect operation at the scale of the segment.
-///
-/// `.collect(doc, score)` will be called for every documents
-/// matching the query.
-pub trait SegmentCollector: 'static {
-    /// `Fruit` is the type for the result of our collection.
-    /// e.g. `usize` for the `Count` collector.
-    type Fruit: Fruit;
-
     /// The query pushes the scored document to the collector via this method.
     fn collect(&mut self, doc: DocId, score: Score);

-    /// Extract the fruit of the collection from the `SegmentCollector`.
-    fn harvest(self) -> Self::Fruit;
+    /// Returns true iff the collector requires to compute scores for documents.
+    fn requires_scoring(&self) -> bool;
 }

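Editor's note (not part of the diff): the trait kept on the added side above is small, so a custom collector only needs three methods. A minimal sketch against exactly that signature, reusing the crate-level imports already at the top of this module; the `MaxScoreCollector` name and struct are hypothetical, for illustration only.

// Editor's sketch: tracks the highest score seen across all segments.
#[derive(Default)]
pub struct MaxScoreCollector {
    max_score: Score,
}

impl Collector for MaxScoreCollector {
    fn set_segment(&mut self, _: SegmentLocalId, _: &SegmentReader) -> Result<()> {
        // Nothing segment-specific to prepare.
        Ok(())
    }

    fn collect(&mut self, _doc: DocId, score: Score) {
        if score > self.max_score {
            self.max_score = score;
        }
    }

    fn requires_scoring(&self) -> bool {
        // We read the score, so the query must compute it.
        true
    }
}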
// -----------------------------------------------
|
impl<'a, C: Collector> Collector for &'a mut C {
|
||||||
// Tuple implementations.
|
fn set_segment(
|
||||||
|
&mut self,
|
||||||
impl<Left, Right> Collector for (Left, Right)
|
segment_local_id: SegmentLocalId,
|
||||||
where
|
segment: &SegmentReader,
|
||||||
Left: Collector,
|
) -> Result<()> {
|
||||||
Right: Collector,
|
(*self).set_segment(segment_local_id, segment)
|
||||||
{
|
}
|
||||||
type Fruit = (Left::Fruit, Right::Fruit);
|
/// The query pushes the scored document to the collector via this method.
|
||||||
type Child = (Left::Child, Right::Child);
|
fn collect(&mut self, doc: DocId, score: Score) {
|
||||||
|
C::collect(self, doc, score)
|
||||||
fn for_segment(&self, segment_local_id: u32, segment: &SegmentReader) -> Result<Self::Child> {
|
|
||||||
let left = self.0.for_segment(segment_local_id, segment)?;
|
|
||||||
let right = self.1.for_segment(segment_local_id, segment)?;
|
|
||||||
Ok((left, right))
|
|
||||||
}
|
}
|
||||||
|
|
||||||
fn requires_scoring(&self) -> bool {
|
fn requires_scoring(&self) -> bool {
|
||||||
self.0.requires_scoring() || self.1.requires_scoring()
|
C::requires_scoring(self)
|
||||||
}
|
}
|
||||||
|
|
||||||
fn merge_fruits(
|
|
||||||
&self,
|
|
||||||
children: Vec<(Left::Fruit, Right::Fruit)>,
|
|
||||||
) -> Result<(Left::Fruit, Right::Fruit)> {
|
|
||||||
let mut left_fruits = vec![];
|
|
||||||
let mut right_fruits = vec![];
|
|
||||||
for (left_fruit, right_fruit) in children {
|
|
||||||
left_fruits.push(left_fruit);
|
|
||||||
right_fruits.push(right_fruit);
|
|
||||||
}
|
|
||||||
Ok((
|
|
||||||
self.0.merge_fruits(left_fruits)?,
|
|
||||||
self.1.merge_fruits(right_fruits)?,
|
|
||||||
))
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl<Left, Right> SegmentCollector for (Left, Right)
|
|
||||||
where
|
|
||||||
Left: SegmentCollector,
|
|
||||||
Right: SegmentCollector,
|
|
||||||
{
|
|
||||||
type Fruit = (Left::Fruit, Right::Fruit);
|
|
||||||
|
|
||||||
fn collect(&mut self, doc: DocId, score: Score) {
|
|
||||||
self.0.collect(doc, score);
|
|
||||||
self.1.collect(doc, score);
|
|
||||||
}
|
|
||||||
|
|
||||||
fn harvest(self) -> <Self as SegmentCollector>::Fruit {
|
|
||||||
(self.0.harvest(), self.1.harvest())
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// 3-Tuple
|
|
||||||
|
|
||||||
impl<One, Two, Three> Collector for (One, Two, Three)
|
|
||||||
where
|
|
||||||
One: Collector,
|
|
||||||
Two: Collector,
|
|
||||||
Three: Collector,
|
|
||||||
{
|
|
||||||
type Fruit = (One::Fruit, Two::Fruit, Three::Fruit);
|
|
||||||
    type Child = (One::Child, Two::Child, Three::Child);

    fn for_segment(&self, segment_local_id: u32, segment: &SegmentReader) -> Result<Self::Child> {
        let one = self.0.for_segment(segment_local_id, segment)?;
        let two = self.1.for_segment(segment_local_id, segment)?;
        let three = self.2.for_segment(segment_local_id, segment)?;
        Ok((one, two, three))
    }

    fn requires_scoring(&self) -> bool {
        self.0.requires_scoring() || self.1.requires_scoring() || self.2.requires_scoring()
    }

    fn merge_fruits(&self, children: Vec<Self::Fruit>) -> Result<Self::Fruit> {
        let mut one_fruits = vec![];
        let mut two_fruits = vec![];
        let mut three_fruits = vec![];
        for (one_fruit, two_fruit, three_fruit) in children {
            one_fruits.push(one_fruit);
            two_fruits.push(two_fruit);
            three_fruits.push(three_fruit);
        }
        Ok((
            self.0.merge_fruits(one_fruits)?,
            self.1.merge_fruits(two_fruits)?,
            self.2.merge_fruits(three_fruits)?,
        ))
    }
}

impl<One, Two, Three> SegmentCollector for (One, Two, Three)
where
    One: SegmentCollector,
    Two: SegmentCollector,
    Three: SegmentCollector,
{
    type Fruit = (One::Fruit, Two::Fruit, Three::Fruit);

    fn collect(&mut self, doc: DocId, score: Score) {
        self.0.collect(doc, score);
        self.1.collect(doc, score);
        self.2.collect(doc, score);
    }

    fn harvest(self) -> <Self as SegmentCollector>::Fruit {
        (self.0.harvest(), self.1.harvest(), self.2.harvest())
    }
}

// 4-Tuple

impl<One, Two, Three, Four> Collector for (One, Two, Three, Four)
where
    One: Collector,
    Two: Collector,
    Three: Collector,
    Four: Collector,
{
    type Fruit = (One::Fruit, Two::Fruit, Three::Fruit, Four::Fruit);
    type Child = (One::Child, Two::Child, Three::Child, Four::Child);

    fn for_segment(&self, segment_local_id: u32, segment: &SegmentReader) -> Result<Self::Child> {
        let one = self.0.for_segment(segment_local_id, segment)?;
        let two = self.1.for_segment(segment_local_id, segment)?;
        let three = self.2.for_segment(segment_local_id, segment)?;
        let four = self.3.for_segment(segment_local_id, segment)?;
        Ok((one, two, three, four))
    }

    fn requires_scoring(&self) -> bool {
        self.0.requires_scoring()
            || self.1.requires_scoring()
            || self.2.requires_scoring()
            || self.3.requires_scoring()
    }

    fn merge_fruits(&self, children: Vec<Self::Fruit>) -> Result<Self::Fruit> {
        let mut one_fruits = vec![];
        let mut two_fruits = vec![];
        let mut three_fruits = vec![];
        let mut four_fruits = vec![];
        for (one_fruit, two_fruit, three_fruit, four_fruit) in children {
            one_fruits.push(one_fruit);
            two_fruits.push(two_fruit);
            three_fruits.push(three_fruit);
            four_fruits.push(four_fruit);
        }
        Ok((
            self.0.merge_fruits(one_fruits)?,
            self.1.merge_fruits(two_fruits)?,
            self.2.merge_fruits(three_fruits)?,
            self.3.merge_fruits(four_fruits)?,
        ))
    }
}

impl<One, Two, Three, Four> SegmentCollector for (One, Two, Three, Four)
where
    One: SegmentCollector,
    Two: SegmentCollector,
    Three: SegmentCollector,
    Four: SegmentCollector,
{
    type Fruit = (One::Fruit, Two::Fruit, Three::Fruit, Four::Fruit);

    fn collect(&mut self, doc: DocId, score: Score) {
        self.0.collect(doc, score);
        self.1.collect(doc, score);
        self.2.collect(doc, score);
        self.3.collect(doc, score);
    }

    fn harvest(self) -> <Self as SegmentCollector>::Fruit {
        (
            self.0.harvest(),
            self.1.harvest(),
            self.2.harvest(),
            self.3.harvest(),
        )
    }
}

#[allow(missing_docs)]
mod downcast_impl {
    downcast!(super::Fruit);
}
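To make the tuple implementations above concrete, here is a minimal usage sketch. It assumes the `Count` and `TopDocs` collectors and the `searcher.search(&query, &collector)` call that appear in the doc examples later in this diff, plus the matching 2-tuple implementation that precedes this excerpt; it is an illustration, not part of the patch.

// Illustration only: two collectors run in a single pass by grouping them in a tuple.
#[macro_use]
extern crate tantivy;
use tantivy::collector::{Count, TopDocs};
use tantivy::query::QueryParser;
use tantivy::schema::{Schema, TEXT};
use tantivy::{Index, Result};

fn main() { example().unwrap(); }

fn example() -> Result<()> {
    let mut schema_builder = Schema::builder();
    let title = schema_builder.add_text_field("title", TEXT);
    let schema = schema_builder.build();
    let index = Index::create_in_ram(schema);
    {
        let mut index_writer = index.writer_with_num_threads(1, 3_000_000)?;
        index_writer.add_document(doc!(title => "The Diary of Muadib"));
        index_writer.add_document(doc!(title => "The Diary of a Young Girl"));
        index_writer.commit()?;
    }
    index.load_searchers()?;
    let searcher = index.searcher();
    let query = QueryParser::for_index(&index, vec![title]).parse_query("diary")?;
    // A tuple of collectors is itself a `Collector`; its fruit is the tuple of fruits.
    let (count, top_docs) = searcher.search(&query, &(Count, TopDocs::with_limit(2)))?;
    assert_eq!(count, 2);
    assert_eq!(top_docs.len(), 2);
    Ok(())
}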
#[cfg(test)]
pub mod tests {

    use super::*;
    use core::SegmentReader;
    use fastfield::BytesFastFieldReader;
    use fastfield::FastFieldReader;
    use schema::Field;
    use DocId;
    use Score;
    use SegmentLocalId;

    /// Stores all of the doc ids.
    /// This collector is only used for tests.
    /// It is unusable in practise, as it does not store
    /// the segment ordinals
    pub struct TestCollector {
        offset: DocId,
        segment_max_doc: DocId,
        docs: Vec<DocId>,
        scores: Vec<Score>,
    }

    impl TestCollector {
        /// Return the exhaustive list of documents.
        pub fn docs(self) -> Vec<DocId> {
            self.docs
        }

        pub fn scores(self) -> Vec<Score> {
            self.scores
        }
    }

    impl Default for TestCollector {
        fn default() -> TestCollector {
            TestCollector {
                offset: 0,
                segment_max_doc: 0,
                docs: Vec::new(),
                scores: Vec::new(),
            }
        }
    }

    impl Collector for TestCollector {
        fn set_segment(&mut self, _: SegmentLocalId, reader: &SegmentReader) -> Result<()> {
            self.offset += self.segment_max_doc;
            self.segment_max_doc = reader.max_doc();
            Ok(())
        }

        fn collect(&mut self, doc: DocId, score: Score) {
            self.docs.push(doc + self.offset);
            self.scores.push(score);
        }

        fn requires_scoring(&self) -> bool {
            true
        }
    }

    /// Collects in order all of the fast fields for all of the
    /// docs in the `DocSet`
    ///
    /// This collector is mainly useful for tests.
    pub struct FastFieldTestCollector {
        vals: Vec<u64>,
        field: Field,
        ff_reader: Option<FastFieldReader<u64>>,
    }

    impl FastFieldTestCollector {
        pub fn for_field(field: Field) -> FastFieldTestCollector {
            FastFieldTestCollector {
                vals: Vec::new(),
                field,
                ff_reader: None,
            }
        }

        pub fn vals(self) -> Vec<u64> {
            self.vals
        }
    }

    impl Collector for FastFieldTestCollector {
        fn set_segment(&mut self, _: SegmentLocalId, reader: &SegmentReader) -> Result<()> {
            self.ff_reader = Some(reader.fast_field_reader(self.field)?);
            Ok(())
        }

        fn collect(&mut self, doc: DocId, _score: Score) {
            let val = self.ff_reader.as_ref().unwrap().get(doc);
            self.vals.push(val);
        }

        fn requires_scoring(&self) -> bool {
            false
        }
    }

    /// Collects in order all of the fast field bytes for all of the
    /// docs in the `DocSet`
    ///
    /// This collector is mainly useful for tests.
    pub struct BytesFastFieldTestCollector {
        vals: Vec<u8>,
        field: Field,
        ff_reader: Option<BytesFastFieldReader>,
    }

    impl BytesFastFieldTestCollector {
        pub fn for_field(field: Field) -> BytesFastFieldTestCollector {
            BytesFastFieldTestCollector {
                vals: Vec::new(),
                field,
                ff_reader: None,
            }
        }

        pub fn vals(self) -> Vec<u8> {
            self.vals
        }
    }

    impl Collector for BytesFastFieldTestCollector {
        fn set_segment(&mut self, _segment_local_id: u32, segment: &SegmentReader) -> Result<()> {
            self.ff_reader = Some(segment.bytes_fast_field_reader(self.field)?);
            Ok(())
        }

        fn collect(&mut self, doc: u32, _score: f32) {
            let val = self.ff_reader.as_ref().unwrap().get_val(doc);
            self.vals.extend(val);
        }

        fn requires_scoring(&self) -> bool {
            false
        }
    }
}

#[cfg(all(test, feature = "unstable"))]
mod bench {
    use collector::{Collector, CountCollector};
    use test::Bencher;

    #[bench]
    fn build_collector(b: &mut Bencher) {
        b.iter(|| {
            let mut count_collector = CountCollector::default();
            let docs: Vec<u32> = (0..1_000_000).collect();
            for doc in docs {
                count_collector.collect(doc, 1f32);
            }
            count_collector.count()
        });
    }
}
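The test collectors above all follow the same `set_segment`-style `Collector` contract: `set_segment` is called once per segment, `collect` once per matching document, and `requires_scoring` tells the searcher whether scores are needed at all. As a minimal sketch of that contract, a collector that only counts matches could look like the snippet below; it is essentially what the built-in `CountCollector` used in the benchmark does. The crate-root re-exports are assumed to match the module above, and any other trait items are omitted.

// Sketch only: the smallest useful collector under the set_segment-style contract above.
extern crate tantivy;
use tantivy::collector::Collector;
use tantivy::{DocId, Result, Score, SegmentLocalId, SegmentReader};

#[derive(Default)]
pub struct DocCountCollector {
    count: usize,
}

impl Collector for DocCountCollector {
    fn set_segment(&mut self, _segment: SegmentLocalId, _reader: &SegmentReader) -> Result<()> {
        // Nothing segment-specific is needed to count documents.
        Ok(())
    }

    fn collect(&mut self, _doc: DocId, _score: Score) {
        self.count += 1;
    }

    fn requires_scoring(&self) -> bool {
        // Counting does not look at scores, so scoring can be skipped entirely.
        false
    }
}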
@@ -1,96 +1,9 @@
use super::Collector;
use super::SegmentCollector;
use collector::Fruit;
use downcast::Downcast;
use std::marker::PhantomData;
use DocId;
use Result;
use Score;
use SegmentLocalId;
use SegmentReader;
use TantivyError;

pub struct MultiFruit {
    sub_fruits: Vec<Option<Box<Fruit>>>,
}

pub struct CollectorWrapper<TCollector: Collector>(TCollector);

impl<TCollector: Collector> Collector for CollectorWrapper<TCollector> {
    type Fruit = Box<Fruit>;
    type Child = Box<BoxableSegmentCollector>;

    fn for_segment(
        &self,
        segment_local_id: u32,
        reader: &SegmentReader,
    ) -> Result<Box<BoxableSegmentCollector>> {
        let child = self.0.for_segment(segment_local_id, reader)?;
        Ok(Box::new(SegmentCollectorWrapper(child)))
    }

    fn requires_scoring(&self) -> bool {
        self.0.requires_scoring()
    }

    fn merge_fruits(&self, children: Vec<<Self as Collector>::Fruit>) -> Result<Box<Fruit>> {
        let typed_fruit: Vec<TCollector::Fruit> = children
            .into_iter()
            .map(|untyped_fruit| {
                Downcast::<TCollector::Fruit>::downcast(untyped_fruit)
                    .map(|boxed_but_typed| *boxed_but_typed)
                    .map_err(|e| {
                        let err_msg = format!("Failed to cast child collector fruit. {:?}", e);
                        TantivyError::InvalidArgument(err_msg)
                    })
            }).collect::<Result<_>>()?;
        let merged_fruit = self.0.merge_fruits(typed_fruit)?;
        Ok(Box::new(merged_fruit))
    }
}

impl SegmentCollector for Box<BoxableSegmentCollector> {
    type Fruit = Box<Fruit>;

    fn collect(&mut self, doc: u32, score: f32) {
        self.as_mut().collect(doc, score);
    }

    fn harvest(self) -> Box<Fruit> {
        BoxableSegmentCollector::harvest_from_box(self)
    }
}

pub trait BoxableSegmentCollector {
    fn collect(&mut self, doc: u32, score: f32);
    fn harvest_from_box(self: Box<Self>) -> Box<Fruit>;
}

pub struct SegmentCollectorWrapper<TSegmentCollector: SegmentCollector>(TSegmentCollector);

impl<TSegmentCollector: SegmentCollector> BoxableSegmentCollector
    for SegmentCollectorWrapper<TSegmentCollector>
{
    fn collect(&mut self, doc: u32, score: f32) {
        self.0.collect(doc, score);
    }

    fn harvest_from_box(self: Box<Self>) -> Box<Fruit> {
        Box::new(self.0.harvest())
    }
}

pub struct FruitHandle<TFruit: Fruit> {
    pos: usize,
    _phantom: PhantomData<TFruit>,
}

impl<TFruit: Fruit> FruitHandle<TFruit> {
    pub fn extract(self, fruits: &mut MultiFruit) -> TFruit {
        let boxed_fruit = fruits.sub_fruits[self.pos].take().expect("");
        *Downcast::<TFruit>::downcast(boxed_fruit).expect("Failed")
    }
}

/// Multicollector makes it possible to collect on more than one collector.
/// It should only be used for use cases where the Collector types is unknown
@@ -100,14 +13,14 @@ impl<TFruit: Fruit> FruitHandle<TFruit> {
/// ```rust
/// #[macro_use]
/// extern crate tantivy;
/// use tantivy::schema::{Schema, TEXT};
/// use tantivy::{Index, Result};
/// use tantivy::collector::{Count, TopDocs, MultiCollector};
/// use tantivy::query::QueryParser;
///
/// # fn main() { example().unwrap(); }
/// fn example() -> Result<()> {
///     let mut schema_builder = Schema::builder();
///     let title = schema_builder.add_text_field("title", TEXT);
///     let schema = schema_builder.build();
///     let index = Index::create_in_ram(schema);
@@ -131,114 +44,55 @@ impl<TFruit: Fruit> FruitHandle<TFruit> {
///     index.load_searchers()?;
///     let searcher = index.searcher();
///
///     let mut collectors = MultiCollector::new();
///     let top_docs_handle = collectors.add_collector(TopDocs::with_limit(2));
///     let count_handle = collectors.add_collector(Count);
///     let query_parser = QueryParser::for_index(&index, vec![title]);
///     let query = query_parser.parse_query("diary")?;
///     let mut multi_fruit = searcher.search(&query, &collectors)?;
///
///     let count = count_handle.extract(&mut multi_fruit);
///     let top_docs = top_docs_handle.extract(&mut multi_fruit);
///
/// #   assert_eq!(count, 2);
/// #   assert_eq!(top_docs.len(), 2);
///
///     Ok(())
/// }
/// ```
pub struct MultiCollector<'a> {
    collector_wrappers:
        Vec<Box<Collector<Child = Box<BoxableSegmentCollector>, Fruit = Box<Fruit>> + 'a>>,
}

impl<'a> MultiCollector<'a> {
    /// Create a new `MultiCollector`
    pub fn new() -> MultiCollector<'a> {
        MultiCollector {
            collector_wrappers: Vec::new(),
        }
    }

    /// Add a new collector to our `MultiCollector`.
    pub fn add_collector<'b: 'a, TCollector: Collector + 'b>(
        &mut self,
        collector: TCollector,
    ) -> FruitHandle<TCollector::Fruit> {
        let pos = self.collector_wrappers.len();
        self.collector_wrappers
            .push(Box::new(CollectorWrapper(collector)));
        FruitHandle {
            pos,
            _phantom: PhantomData,
        }
    }
}

impl<'a> Collector for MultiCollector<'a> {
    type Fruit = MultiFruit;
    type Child = MultiCollectorChild;

    fn for_segment(
        &self,
        segment_local_id: SegmentLocalId,
        segment: &SegmentReader,
    ) -> Result<MultiCollectorChild> {
        let children = self
            .collector_wrappers
            .iter()
            .map(|collector_wrapper| collector_wrapper.for_segment(segment_local_id, segment))
            .collect::<Result<Vec<_>>>()?;
        Ok(MultiCollectorChild { children })
    }

    fn requires_scoring(&self) -> bool {
        self.collector_wrappers.iter().any(|c| c.requires_scoring())
    }

    fn merge_fruits(&self, segments_multifruits: Vec<MultiFruit>) -> Result<MultiFruit> {
        let mut segment_fruits_list: Vec<Vec<Box<Fruit>>> = (0..self.collector_wrappers.len())
            .map(|_| Vec::with_capacity(segments_multifruits.len()))
            .collect::<Vec<_>>();
        for segment_multifruit in segments_multifruits {
            for (idx, segment_fruit_opt) in segment_multifruit.sub_fruits.into_iter().enumerate() {
                if let Some(segment_fruit) = segment_fruit_opt {
                    segment_fruits_list[idx].push(segment_fruit);
                }
            }
        }
        let sub_fruits = self
            .collector_wrappers
            .iter()
            .zip(segment_fruits_list)
            .map(|(child_collector, segment_fruits)| {
                Ok(Some(child_collector.merge_fruits(segment_fruits)?))
            }).collect::<Result<_>>()?;
        Ok(MultiFruit { sub_fruits })
    }
}

pub struct MultiCollectorChild {
    children: Vec<Box<BoxableSegmentCollector>>,
}

impl SegmentCollector for MultiCollectorChild {
    type Fruit = MultiFruit;

    fn collect(&mut self, doc: DocId, score: Score) {
        for child in &mut self.children {
            child.collect(doc, score);
        }
    }

    fn harvest(self) -> MultiFruit {
        MultiFruit {
            sub_fruits: self
                .children
                .into_iter()
                .map(|child| Some(child.harvest()))
                .collect(),
        }
    }
}
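The wrapper types above (`CollectorWrapper`, `BoxableSegmentCollector`, `FruitHandle`) exist so that `MultiCollector` can hold collectors of different fruit types in one `Vec`: each fruit is erased to a `Box<Fruit>` and later recovered through a position-aware, typed handle. The same idea in a standalone form, using `std::any::Any` instead of the `downcast` crate, is sketched below; it is illustrative and not tantivy code.

// Standalone illustration of the type-erasure pattern behind MultiCollector/FruitHandle.
use std::any::Any;
use std::marker::PhantomData;

struct Handle<T> {
    pos: usize,
    _phantom: PhantomData<T>,
}

struct Fruits {
    slots: Vec<Option<Box<dyn Any>>>,
}

impl Fruits {
    // Store a value of any 'static type and hand back a handle that remembers its type.
    fn push<T: 'static>(&mut self, value: T) -> Handle<T> {
        self.slots.push(Some(Box::new(value)));
        Handle { pos: self.slots.len() - 1, _phantom: PhantomData }
    }
}

impl<T: 'static> Handle<T> {
    // Take the erased value back out and downcast it to the type recorded in the handle.
    fn extract(self, fruits: &mut Fruits) -> T {
        let boxed = fruits.slots[self.pos].take().expect("fruit already extracted");
        let typed = boxed.downcast::<T>().unwrap_or_else(|_| panic!("fruit type mismatch"));
        *typed
    }
}

fn main() {
    let mut fruits = Fruits { slots: Vec::new() };
    let count_handle = fruits.push(42usize);              // e.g. a Count-style fruit
    let docs_handle = fruits.push(vec![(0.8f32, 1u32)]);  // e.g. a TopDocs-style fruit
    assert_eq!(count_handle.extract(&mut fruits), 42);
    assert_eq!(docs_handle.extract(&mut fruits).len(), 1);
}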
@@ -246,42 +100,20 @@ impl SegmentCollector for MultiCollectorChild {
mod tests {

    use super::*;
    use collector::{Count, TopDocs};
    use query::TermQuery;
    use schema::IndexRecordOption;
    use schema::{Schema, TEXT};
    use Index;
    use Term;

    #[test]
    fn test_multi_collector() {
        let mut schema_builder = Schema::builder();
        let text = schema_builder.add_text_field("text", TEXT);
        let schema = schema_builder.build();

        let index = Index::create_in_ram(schema);
        {
            let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
            index_writer.add_document(doc!(text=>"abc"));
            index_writer.add_document(doc!(text=>"abc abc abc"));
            index_writer.add_document(doc!(text=>"abc abc"));
            index_writer.commit().unwrap();
            index_writer.add_document(doc!(text=>""));
            index_writer.add_document(doc!(text=>"abc abc abc abc"));
            index_writer.add_document(doc!(text=>"abc"));
            index_writer.commit().unwrap();
        }
        index.load_searchers().unwrap();
        let searcher = index.searcher();
        let term = Term::from_field_text(text, "abc");
        let query = TermQuery::new(term, IndexRecordOption::Basic);

        let mut collectors = MultiCollector::new();
        let topdocs_handler = collectors.add_collector(TopDocs::with_limit(2));
        let count_handler = collectors.add_collector(Count);
        let mut multifruits = searcher.search(&query, &mut collectors).unwrap();

        assert_eq!(count_handler.extract(&mut multifruits), 5);
        assert_eq!(topdocs_handler.extract(&mut multifruits).len(), 2);
    }
}
@@ -1,201 +0,0 @@
use super::*;
use core::SegmentReader;
use fastfield::BytesFastFieldReader;
use fastfield::FastFieldReader;
use schema::Field;
use DocAddress;
use DocId;
use Score;
use SegmentLocalId;

/// Stores all of the doc ids.
/// This collector is only used for tests.
/// It is unusable in practise, as it does not store
/// the segment ordinals
pub struct TestCollector;

pub struct TestSegmentCollector {
    segment_id: SegmentLocalId,
    fruit: TestFruit,
}

#[derive(Default)]
pub struct TestFruit {
    docs: Vec<DocAddress>,
    scores: Vec<Score>,
}

impl TestFruit {
    /// Return the list of matching documents exhaustively.
    pub fn docs(&self) -> &[DocAddress] {
        &self.docs[..]
    }

    pub fn scores(&self) -> &[Score] {
        &self.scores[..]
    }
}

impl Collector for TestCollector {
    type Fruit = TestFruit;
    type Child = TestSegmentCollector;

    fn for_segment(
        &self,
        segment_id: SegmentLocalId,
        _reader: &SegmentReader,
    ) -> Result<TestSegmentCollector> {
        Ok(TestSegmentCollector {
            segment_id,
            fruit: TestFruit::default(),
        })
    }

    fn requires_scoring(&self) -> bool {
        true
    }

    fn merge_fruits(&self, mut children: Vec<TestFruit>) -> Result<TestFruit> {
        children.sort_by_key(|fruit| {
            if fruit.docs().is_empty() {
                0
            } else {
                fruit.docs()[0].segment_ord()
            }
        });
        let mut docs = vec![];
        let mut scores = vec![];
        for child in children {
            docs.extend(child.docs());
            scores.extend(child.scores);
        }
        Ok(TestFruit { docs, scores })
    }
}

impl SegmentCollector for TestSegmentCollector {
    type Fruit = TestFruit;

    fn collect(&mut self, doc: DocId, score: Score) {
        self.fruit.docs.push(DocAddress(self.segment_id, doc));
        self.fruit.scores.push(score);
    }

    fn harvest(self) -> <Self as SegmentCollector>::Fruit {
        self.fruit
    }
}

/// Collects in order all of the fast fields for all of the
/// docs in the `DocSet`
///
/// This collector is mainly useful for tests.
pub struct FastFieldTestCollector {
    field: Field,
}

pub struct FastFieldSegmentCollector {
    vals: Vec<u64>,
    reader: FastFieldReader<u64>,
}

impl FastFieldTestCollector {
    pub fn for_field(field: Field) -> FastFieldTestCollector {
        FastFieldTestCollector { field }
    }
}

impl Collector for FastFieldTestCollector {
    type Fruit = Vec<u64>;
    type Child = FastFieldSegmentCollector;

    fn for_segment(
        &self,
        _: SegmentLocalId,
        reader: &SegmentReader,
    ) -> Result<FastFieldSegmentCollector> {
        Ok(FastFieldSegmentCollector {
            vals: Vec::new(),
            reader: reader.fast_field_reader(self.field)?,
        })
    }

    fn requires_scoring(&self) -> bool {
        false
    }

    fn merge_fruits(&self, children: Vec<Vec<u64>>) -> Result<Vec<u64>> {
        Ok(children.into_iter().flat_map(|v| v.into_iter()).collect())
    }
}

impl SegmentCollector for FastFieldSegmentCollector {
    type Fruit = Vec<u64>;

    fn collect(&mut self, doc: DocId, _score: Score) {
        let val = self.reader.get(doc);
        self.vals.push(val);
    }

    fn harvest(self) -> Vec<u64> {
        self.vals
    }
}

/// Collects in order all of the fast field bytes for all of the
/// docs in the `DocSet`
///
/// This collector is mainly useful for tests.
pub struct BytesFastFieldTestCollector {
    field: Field,
}

pub struct BytesFastFieldSegmentCollector {
    vals: Vec<u8>,
    reader: BytesFastFieldReader,
}

impl BytesFastFieldTestCollector {
    pub fn for_field(field: Field) -> BytesFastFieldTestCollector {
        BytesFastFieldTestCollector { field }
    }
}

impl Collector for BytesFastFieldTestCollector {
    type Fruit = Vec<u8>;
    type Child = BytesFastFieldSegmentCollector;

    fn for_segment(
        &self,
        _segment_local_id: u32,
        segment: &SegmentReader,
    ) -> Result<BytesFastFieldSegmentCollector> {
        Ok(BytesFastFieldSegmentCollector {
            vals: Vec::new(),
            reader: segment.bytes_fast_field_reader(self.field)?,
        })
    }

    fn requires_scoring(&self) -> bool {
        false
    }

    fn merge_fruits(&self, children: Vec<Vec<u8>>) -> Result<Vec<u8>> {
        Ok(children.into_iter().flat_map(|c| c.into_iter()).collect())
    }
}

impl SegmentCollector for BytesFastFieldSegmentCollector {
    type Fruit = Vec<u8>;

    fn collect(&mut self, doc: u32, _score: f32) {
        let data = self.reader.get_val(doc);
        self.vals.extend(data);
    }

    fn harvest(self) -> <Self as SegmentCollector>::Fruit {
        self.vals
    }
}
@@ -1,225 +1,244 @@
use serde::export::PhantomData;
use std::cmp::Ordering;
use std::collections::BinaryHeap;
use DocAddress;
use DocId;
use Result;
use SegmentLocalId;
use SegmentReader;

/// Contains a feature (field, score, etc.) of a document along with the document address.
///
/// It has a custom implementation of `PartialOrd` that reverses the order. This is because the
/// default Rust heap is a max heap, whereas a min heap is needed.
///
/// WARNING: equality is not what you would expect here.
/// Two elements are equal if their feature is equal, and regardless of whether `doc`
/// is equal. This should be perfectly fine for this usage, but let's make sure this
/// struct is never public.
struct ComparableDoc<T, D> {
    feature: T,
    doc: D,
}

impl<T: PartialOrd, D> PartialOrd for ComparableDoc<T, D> {
    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
        Some(self.cmp(other))
    }
}

impl<T: PartialOrd, D> Ord for ComparableDoc<T, D> {
    #[inline]
    fn cmp(&self, other: &Self) -> Ordering {
        other
            .feature
            .partial_cmp(&self.feature)
            .unwrap_or_else(|| Ordering::Equal)
    }
}

impl<T: PartialOrd, D> PartialEq for ComparableDoc<T, D> {
    fn eq(&self, other: &Self) -> bool {
        self.cmp(other) == Ordering::Equal
    }
}

impl<T: PartialOrd, D> Eq for ComparableDoc<T, D> {}

pub(crate) struct TopCollector<T> {
    limit: usize,
    _marker: PhantomData<T>,
}

impl<T> TopCollector<T>
where
    T: PartialOrd + Clone,
{
    /// Creates a top collector, with a number of documents equal to "limit".
    ///
    /// # Panics
    /// The method panics if limit is 0
    pub fn with_limit(limit: usize) -> TopCollector<T> {
        if limit < 1 {
            panic!("Limit must be strictly greater than 0.");
        }
        TopCollector {
            limit,
            _marker: PhantomData,
        }
    }

    pub fn limit(&self) -> usize {
        self.limit
    }

    pub fn merge_fruits(
        &self,
        children: Vec<Vec<(T, DocAddress)>>,
    ) -> Result<Vec<(T, DocAddress)>> {
        if self.limit == 0 {
            return Ok(Vec::new());
        }
        let mut top_collector = BinaryHeap::new();
        for child_fruit in children {
            for (feature, doc) in child_fruit {
                if top_collector.len() < self.limit {
                    top_collector.push(ComparableDoc { feature, doc });
                } else {
                    if let Some(mut head) = top_collector.peek_mut() {
                        if head.feature < feature {
                            *head = ComparableDoc { feature, doc };
                        }
                    }
                }
            }
        }
        Ok(top_collector
            .into_sorted_vec()
            .into_iter()
            .map(|cdoc| (cdoc.feature, cdoc.doc))
            .collect())
    }

    pub(crate) fn for_segment(
        &self,
        segment_id: SegmentLocalId,
        _: &SegmentReader,
    ) -> Result<TopSegmentCollector<T>> {
        Ok(TopSegmentCollector::new(segment_id, self.limit))
    }
}

/// The Top Collector keeps track of the K documents
/// sorted by type `T`.
///
/// The implementation is based on a `BinaryHeap`.
/// The theoretical complexity for collecting the top `K` out of `n` documents
/// is `O(n log K)`.
pub(crate) struct TopSegmentCollector<T> {
    limit: usize,
    heap: BinaryHeap<ComparableDoc<T, DocId>>,
    segment_id: u32,
}

impl<T: PartialOrd> TopSegmentCollector<T> {
    fn new(segment_id: SegmentLocalId, limit: usize) -> TopSegmentCollector<T> {
        TopSegmentCollector {
            limit,
            heap: BinaryHeap::with_capacity(limit),
            segment_id,
        }
    }
}

impl<T: PartialOrd + Clone> TopSegmentCollector<T> {
    pub fn harvest(self) -> Vec<(T, DocAddress)> {
        let segment_id = self.segment_id;
        self.heap
            .into_sorted_vec()
            .into_iter()
            .map(|comparable_doc| {
                (
                    comparable_doc.feature,
                    DocAddress(segment_id, comparable_doc.doc),
                )
            }).collect()
    }

    /// Return true iff at least K documents have gone through
    /// the collector.
    #[inline(always)]
    pub(crate) fn at_capacity(&self) -> bool {
        self.heap.len() >= self.limit
    }

    /// Collects a document scored by the given feature
    ///
    /// It collects documents until it has reached the max capacity. Once it reaches capacity, it
    /// will compare the lowest scoring item with the given one and keep whichever is greater.
    #[inline(always)]
    pub fn collect(&mut self, doc: DocId, feature: T) {
        if self.at_capacity() {
            // It's ok to unwrap as long as a limit of 0 is forbidden.
            if let Some(limit_feature) = self.heap.peek().map(|head| head.feature.clone()) {
                if limit_feature < feature {
                    if let Some(mut head) = self.heap.peek_mut() {
                        head.feature = feature;
                        head.doc = doc;
                    }
                }
            }
        } else {
            // we have not reached capacity yet, so we can just push the
            // element.
            self.heap.push(ComparableDoc { feature, doc });
        }
    }
}

#[cfg(test)]
mod tests {
    use super::{TopCollector, TopSegmentCollector};
    use DocAddress;
    use Score;

    #[test]
    fn test_top_collector_not_at_capacity() {
        let mut top_collector = TopSegmentCollector::new(0, 4);
        top_collector.collect(1, 0.8);
        top_collector.collect(3, 0.2);
        top_collector.collect(5, 0.3);
        assert_eq!(
            top_collector.harvest(),
            vec![
                (0.8, DocAddress(0, 1)),
                (0.3, DocAddress(0, 5)),
                (0.2, DocAddress(0, 3))
            ]
        );
    }

    #[test]
    fn test_top_collector_at_capacity() {
        let mut top_collector = TopSegmentCollector::new(0, 4);
        top_collector.collect(1, 0.8);
        top_collector.collect(3, 0.2);
        top_collector.collect(5, 0.3);
        top_collector.collect(7, 0.9);
        top_collector.collect(9, -0.2);
        assert_eq!(
            top_collector.harvest(),
            vec![
                (0.9, DocAddress(0, 7)),
                (0.8, DocAddress(0, 1)),
                (0.3, DocAddress(0, 5)),
                (0.2, DocAddress(0, 3))
            ]
        );
    }

    #[test]
    #[should_panic]
    fn test_top_0() {
        let _collector: TopCollector<Score> = TopCollector::with_limit(0);
    }
}
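The `ComparableDoc` comment above spells out the trick this file relies on: Rust's `BinaryHeap` is a max-heap, so reversing the comparison keeps the worst of the current top-K candidates at the root, where it can be evicted in `O(log K)`. A standalone sketch of that technique follows; it is illustrative only and not tantivy code.

// Standalone sketch of top-K selection with a reversed-order BinaryHeap.
use std::cmp::Ordering;
use std::collections::BinaryHeap;

struct Candidate(f32, u32); // (feature, doc id)

impl PartialEq for Candidate {
    fn eq(&self, other: &Self) -> bool {
        self.cmp(other) == Ordering::Equal
    }
}
impl Eq for Candidate {}
impl PartialOrd for Candidate {
    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
        Some(self.cmp(other))
    }
}
impl Ord for Candidate {
    fn cmp(&self, other: &Self) -> Ordering {
        // Reversed: the heap's "greatest" element is the one with the smallest feature.
        other.0.partial_cmp(&self.0).unwrap_or(Ordering::Equal)
    }
}

fn top_k(docs: impl Iterator<Item = (f32, u32)>, k: usize) -> Vec<(f32, u32)> {
    let mut heap: BinaryHeap<Candidate> = BinaryHeap::with_capacity(k);
    for (feature, doc) in docs {
        if heap.len() < k {
            heap.push(Candidate(feature, doc));
        } else if let Some(mut head) = heap.peek_mut() {
            // `head` is the worst of the current top-K; replace it if the new doc is better.
            if head.0 < feature {
                *head = Candidate(feature, doc);
            }
        }
    }
    // With the reversed ordering, the sorted vec comes out best-feature first.
    heap.into_sorted_vec().into_iter().map(|c| (c.0, c.1)).collect()
}

fn main() {
    let docs = vec![(0.8, 1), (0.2, 3), (0.3, 5), (0.9, 7), (-0.2, 9)];
    let expected: Vec<(f32, u32)> = vec![(0.9_f32, 7_u32), (0.8_f32, 1_u32)];
    assert_eq!(top_k(docs.into_iter(), 2), expected);
}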
@@ -1,250 +0,0 @@
use super::Collector;
use collector::top_collector::TopCollector;
use collector::top_collector::TopSegmentCollector;
use collector::SegmentCollector;
use fastfield::FastFieldReader;
use fastfield::FastValue;
use schema::Field;
use DocAddress;
use Result;
use SegmentLocalId;
use SegmentReader;

/// The Top Field Collector keeps track of the K documents
/// sorted by a fast field in the index
///
/// The implementation is based on a `BinaryHeap`.
/// The theoretical complexity for collecting the top `K` out of `n` documents
/// is `O(n log K)`.
///
/// ```rust
/// #[macro_use]
/// extern crate tantivy;
/// # use tantivy::schema::{Schema, Field, FAST, TEXT};
/// # use tantivy::{Index, Result, DocAddress};
/// # use tantivy::query::{Query, QueryParser};
/// use tantivy::collector::TopDocs;
///
/// # fn main() {
/// # let mut schema_builder = Schema::builder();
/// # let title = schema_builder.add_text_field("title", TEXT);
/// # let rating = schema_builder.add_u64_field("rating", FAST);
/// # let schema = schema_builder.build();
/// # let index = Index::create_in_ram(schema);
/// # let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
/// # index_writer.add_document(doc!(
/// #     title => "The Name of the Wind",
/// #     rating => 92u64,
/// # ));
/// # index_writer.add_document(doc!(title => "The Diary of Muadib", rating => 97u64));
/// # index_writer.add_document(doc!(title => "A Dairy Cow", rating => 63u64));
/// # index_writer.add_document(doc!(title => "The Diary of a Young Girl", rating => 80u64));
/// # index_writer.commit().unwrap();
/// # index.load_searchers().unwrap();
/// # let query = QueryParser::for_index(&index, vec![title]).parse_query("diary").unwrap();
/// # let top_docs = docs_sorted_by_rating(&index, &query, rating).unwrap();
/// # assert_eq!(top_docs,
/// #     vec![(97u64, DocAddress(0u32, 1)),
/// #          (80u64, DocAddress(0u32, 3))]);
/// # }
/// #
/// /// Searches the documents matching the given query, and
/// /// collects the top 10 documents, ordered by the `field`
/// /// given in argument.
/// ///
/// /// `field` is required to be a FAST field.
/// fn docs_sorted_by_rating(index: &Index, query: &Query, sort_by_field: Field)
///     -> Result<Vec<(u64, DocAddress)>> {
///
///     // This is where we build our collector!
///     let top_docs_by_rating = TopDocs::with_limit(2).order_by_field(sort_by_field);
///
///     // ... and here are our documents. Note this is a simple vec.
///     // The `u64` in the pair is the value of our fast field for each document.
///     index.searcher()
///         .search(query, &top_docs_by_rating)
/// }
/// ```
pub struct TopDocsByField<T> {
    collector: TopCollector<T>,
    field: Field,
}

impl<T: FastValue + PartialOrd + Clone> TopDocsByField<T> {
    /// Creates a top field collector, with a number of documents equal to "limit".
    ///
    /// The given field name must be a fast field, otherwise the collector yields an error while
    /// collecting results.
    ///
    /// # Panics
    /// The method panics if limit is 0
    pub(crate) fn new(field: Field, limit: usize) -> TopDocsByField<T> {
        TopDocsByField {
            collector: TopCollector::with_limit(limit),
            field,
        }
    }
}

impl<T: FastValue + PartialOrd + Send + Sync + 'static> Collector for TopDocsByField<T> {
    type Fruit = Vec<(T, DocAddress)>;

    type Child = TopFieldSegmentCollector<T>;

    fn for_segment(
        &self,
        segment_local_id: SegmentLocalId,
        reader: &SegmentReader,
    ) -> Result<TopFieldSegmentCollector<T>> {
        let collector = self.collector.for_segment(segment_local_id, reader)?;
        let reader = reader.fast_field_reader(self.field)?;
        Ok(TopFieldSegmentCollector { collector, reader })
    }

    fn requires_scoring(&self) -> bool {
        false
    }

    fn merge_fruits(
        &self,
        segment_fruits: Vec<Vec<(T, DocAddress)>>,
    ) -> Result<Vec<(T, DocAddress)>> {
        self.collector.merge_fruits(segment_fruits)
    }
}

pub struct TopFieldSegmentCollector<T: FastValue + PartialOrd> {
    collector: TopSegmentCollector<T>,
    reader: FastFieldReader<T>,
}

impl<T: FastValue + PartialOrd + Send + Sync + 'static> SegmentCollector
    for TopFieldSegmentCollector<T>
{
    type Fruit = Vec<(T, DocAddress)>;

    fn collect(&mut self, doc: u32, _score: f32) {
        let field_value = self.reader.get(doc);
        self.collector.collect(doc, field_value);
    }

    fn harvest(self) -> Vec<(T, DocAddress)> {
        self.collector.harvest()
    }
}

#[cfg(test)]
mod tests {
    use super::TopDocsByField;
    use collector::Collector;
    use collector::TopDocs;
    use query::Query;
    use query::QueryParser;
    use schema::Field;
    use schema::IntOptions;
    use schema::{Schema, FAST, TEXT};
    use DocAddress;
    use Index;
    use IndexWriter;
    use TantivyError;

    const TITLE: &str = "title";
    const SIZE: &str = "size";

    #[test]
    fn test_top_collector_not_at_capacity() {
        let mut schema_builder = Schema::builder();
        let title = schema_builder.add_text_field(TITLE, TEXT);
        let size = schema_builder.add_u64_field(SIZE, FAST);
        let schema = schema_builder.build();
        let (index, query) = index("beer", title, schema, |index_writer| {
            index_writer.add_document(doc!(
                title => "bottle of beer",
                size => 12u64,
            ));
            index_writer.add_document(doc!(
                title => "growler of beer",
                size => 64u64,
            ));
            index_writer.add_document(doc!(
                title => "pint of beer",
                size => 16u64,
            ));
        });
        let searcher = index.searcher();

        let top_collector = TopDocs::with_limit(4).order_by_field(size);
        let top_docs: Vec<(u64, DocAddress)> = searcher.search(&query, &top_collector).unwrap();
        assert_eq!(
            top_docs,
            vec![
                (64, DocAddress(0, 1)),
                (16, DocAddress(0, 2)),
                (12, DocAddress(0, 0))
            ]
        );
    }

    #[test]
    #[should_panic]
    fn test_field_does_not_exist() {
        let mut schema_builder = Schema::builder();
        let title = schema_builder.add_text_field(TITLE, TEXT);
        let size = schema_builder.add_u64_field(SIZE, FAST);
        let schema = schema_builder.build();
        let (index, _) = index("beer", title, schema, |index_writer| {
            index_writer.add_document(doc!(
                title => "bottle of beer",
                size => 12u64,
            ));
        });
        let searcher = index.searcher();
        let top_collector: TopDocsByField<u64> = TopDocs::with_limit(4).order_by_field(Field(2));
        let segment_reader = searcher.segment_reader(0u32);
        top_collector
            .for_segment(0, segment_reader)
            .expect("should panic");
    }

    #[test]
    fn test_field_not_fast_field() {
        let mut schema_builder = Schema::builder();
        let title = schema_builder.add_text_field(TITLE, TEXT);
        let size = schema_builder.add_u64_field(SIZE, IntOptions::default());
        let schema = schema_builder.build();
        let (index, _) = index("beer", title, schema, |index_writer| {
            index_writer.add_document(doc!(
                title => "bottle of beer",
                size => 12u64,
            ));
        });
        let searcher = index.searcher();
        let segment = searcher.segment_reader(0);
        let top_collector: TopDocsByField<u64> = TopDocs::with_limit(4).order_by_field(size);
        assert_matches!(
            top_collector
                .for_segment(0, segment)
                .map(|_| ())
                .unwrap_err(),
            TantivyError::FastFieldError(_)
        );
    }

    fn index(
        query: &str,
        query_field: Field,
        schema: Schema,
        mut doc_adder: impl FnMut(&mut IndexWriter) -> (),
    ) -> (Index, Box<Query>) {
        let index = Index::create_in_ram(schema);

        let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
        doc_adder(&mut index_writer);
        index_writer.commit().unwrap();
        index.load_searchers().unwrap();

        let query_parser = QueryParser::for_index(&index, vec![query_field]);
        let query = query_parser.parse_query(query).unwrap();
        (index, query)
    }
}
@@ -1,200 +0,0 @@
|
|||||||
use super::Collector;
|
|
||||||
use collector::top_collector::TopCollector;
|
|
||||||
use collector::top_collector::TopSegmentCollector;
|
|
||||||
use collector::SegmentCollector;
|
|
||||||
use collector::TopDocsByField;
|
|
||||||
use fastfield::FastValue;
|
|
||||||
use schema::Field;
|
|
||||||
use DocAddress;
|
|
||||||
use DocId;
|
|
||||||
use Result;
|
|
||||||
use Score;
|
|
||||||
use SegmentLocalId;
|
|
||||||
use SegmentReader;
|
|
||||||
|
|
||||||
/// The Top Score Collector keeps track of the K documents
|
|
||||||
/// sorted by their score.
|
|
||||||
///
|
|
||||||
/// The implementation is based on a `BinaryHeap`.
|
|
||||||
/// The theorical complexity for collecting the top `K` out of `n` documents
|
|
||||||
/// is `O(n log K)`.
|
|
||||||
///
|
|
||||||
/// ```rust
|
|
||||||
/// #[macro_use]
|
|
||||||
/// extern crate tantivy;
|
|
||||||
/// use tantivy::DocAddress;
|
|
||||||
/// use tantivy::schema::{Schema, TEXT};
|
|
||||||
/// use tantivy::{Index, Result};
|
|
||||||
/// use tantivy::collector::TopDocs;
|
|
||||||
/// use tantivy::query::QueryParser;
|
|
||||||
///
|
|
||||||
/// # fn main() { example().unwrap(); }
|
|
||||||
/// fn example() -> Result<()> {
|
|
||||||
/// let mut schema_builder = Schema::builder();
|
|
||||||
/// let title = schema_builder.add_text_field("title", TEXT);
|
|
||||||
/// let schema = schema_builder.build();
|
|
||||||
/// let index = Index::create_in_ram(schema);
|
|
||||||
/// {
|
|
||||||
/// let mut index_writer = index.writer_with_num_threads(1, 3_000_000)?;
|
|
||||||
/// index_writer.add_document(doc!(
|
|
||||||
/// title => "The Name of the Wind",
|
|
||||||
/// ));
|
|
||||||
/// index_writer.add_document(doc!(
|
|
||||||
/// title => "The Diary of Muadib",
|
|
||||||
/// ));
|
|
||||||
/// index_writer.add_document(doc!(
|
|
||||||
/// title => "A Dairy Cow",
|
|
||||||
/// ));
|
|
||||||
/// index_writer.add_document(doc!(
|
|
||||||
/// title => "The Diary of a Young Girl",
|
|
||||||
/// ));
|
|
||||||
/// index_writer.commit().unwrap();
|
|
||||||
/// }
|
|
||||||
///
|
|
||||||
/// index.load_searchers()?;
|
|
||||||
/// let searcher = index.searcher();
|
|
||||||
///
|
|
||||||
/// let query_parser = QueryParser::for_index(&index, vec![title]);
|
|
||||||
/// let query = query_parser.parse_query("diary")?;
|
|
||||||
/// let top_docs = searcher.search(&query, &TopDocs::with_limit(2))?;
|
|
||||||
///
|
|
||||||
/// assert_eq!(&top_docs[0], &(0.7261542, DocAddress(0, 1)));
|
|
||||||
/// assert_eq!(&top_docs[1], &(0.6099695, DocAddress(0, 3)));
|
|
||||||
///
|
|
||||||
/// Ok(())
|
|
||||||
/// }
|
|
||||||
/// ```
|
|
||||||
pub struct TopDocs(TopCollector<Score>);
|
|
||||||
|
|
||||||
impl TopDocs {
|
|
||||||
/// Creates a top score collector, with a number of documents equal to "limit".
|
|
||||||
///
|
|
||||||
/// # Panics
|
|
||||||
/// The method panics if limit is 0
|
|
||||||
pub fn with_limit(limit: usize) -> TopDocs {
|
|
||||||
TopDocs(TopCollector::with_limit(limit))
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Set top-K to rank documents by a given fast field.
|
|
||||||
///
|
|
||||||
/// (By default, `TopDocs` collects the top-K documents sorted by
|
|
||||||
/// the similarity score.)
|
|
||||||
-    pub fn order_by_field<T: PartialOrd + FastValue + Clone>(
-        self,
-        field: Field,
-    ) -> TopDocsByField<T> {
-        TopDocsByField::new(field, self.0.limit())
-    }
-}
-
-impl Collector for TopDocs {
-    type Fruit = Vec<(Score, DocAddress)>;
-
-    type Child = TopScoreSegmentCollector;
-
-    fn for_segment(
-        &self,
-        segment_local_id: SegmentLocalId,
-        reader: &SegmentReader,
-    ) -> Result<Self::Child> {
-        let collector = self.0.for_segment(segment_local_id, reader)?;
-        Ok(TopScoreSegmentCollector(collector))
-    }
-
-    fn requires_scoring(&self) -> bool {
-        true
-    }
-
-    fn merge_fruits(&self, child_fruits: Vec<Vec<(Score, DocAddress)>>) -> Result<Self::Fruit> {
-        self.0.merge_fruits(child_fruits)
-    }
-}
-
-/// Segment Collector associated to `TopDocs`.
-pub struct TopScoreSegmentCollector(TopSegmentCollector<Score>);
-
-impl SegmentCollector for TopScoreSegmentCollector {
-    type Fruit = Vec<(Score, DocAddress)>;
-
-    fn collect(&mut self, doc: DocId, score: Score) {
-        self.0.collect(doc, score)
-    }
-
-    fn harvest(self) -> Vec<(Score, DocAddress)> {
-        self.0.harvest()
-    }
-}
-
-#[cfg(test)]
-mod tests {
-    use super::TopDocs;
-    use query::QueryParser;
-    use schema::Schema;
-    use schema::TEXT;
-    use DocAddress;
-    use Index;
-    use Score;
-
-    fn make_index() -> Index {
-        let mut schema_builder = Schema::builder();
-        let text_field = schema_builder.add_text_field("text", TEXT);
-        let schema = schema_builder.build();
-        let index = Index::create_in_ram(schema);
-        {
-            // writing the segment
-            let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
-            index_writer.add_document(doc!(text_field=>"Hello happy tax payer."));
-            index_writer.add_document(doc!(text_field=>"Droopy says hello happy tax payer"));
-            index_writer.add_document(doc!(text_field=>"I like Droopy"));
-            assert!(index_writer.commit().is_ok());
-        }
-        index.load_searchers().unwrap();
-        index
-    }
-
-    #[test]
-    fn test_top_collector_not_at_capacity() {
-        let index = make_index();
-        let field = index.schema().get_field("text").unwrap();
-        let query_parser = QueryParser::for_index(&index, vec![field]);
-        let text_query = query_parser.parse_query("droopy tax").unwrap();
-        let score_docs: Vec<(Score, DocAddress)> = index
-            .searcher()
-            .search(&text_query, &TopDocs::with_limit(4))
-            .unwrap();
-        assert_eq!(
-            score_docs,
-            vec![
-                (0.81221175, DocAddress(0u32, 1)),
-                (0.5376842, DocAddress(0u32, 2)),
-                (0.48527452, DocAddress(0, 0))
-            ]
-        );
-    }
-
-    #[test]
-    fn test_top_collector_at_capacity() {
-        let index = make_index();
-        let field = index.schema().get_field("text").unwrap();
-        let query_parser = QueryParser::for_index(&index, vec![field]);
-        let text_query = query_parser.parse_query("droopy tax").unwrap();
-        let score_docs: Vec<(Score, DocAddress)> = index
-            .searcher()
-            .search(&text_query, &TopDocs::with_limit(2))
-            .unwrap();
-        assert_eq!(
-            score_docs,
-            vec![
-                (0.81221175, DocAddress(0u32, 1)),
-                (0.5376842, DocAddress(0u32, 2)),
-            ]
-        );
-    }
-
-    #[test]
-    #[should_panic]
-    fn test_top_0() {
-        TopDocs::with_limit(0);
-    }
-
-}
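The `TopDocs` collector above gathers a per-segment top-k in each `TopScoreSegmentCollector` and then combines the per-segment lists in `merge_fruits`. A rough sketch of what such a merge amounts to (an illustration with made-up names, not tantivy's implementation):

```rust
/// Merges several per-segment (score, doc) lists, each sorted by descending
/// score, into a single global top-`limit` list. Hypothetical helper.
fn merge_top_k(per_segment: Vec<Vec<(f32, u32)>>, limit: usize) -> Vec<(f32, u32)> {
    let mut all: Vec<(f32, u32)> = per_segment.into_iter().flatten().collect();
    // Highest score first. `partial_cmp` is fine because BM25-style scores
    // are never NaN in practice (assumption of this sketch).
    all.sort_by(|a, b| b.0.partial_cmp(&a.0).unwrap());
    all.truncate(limit);
    all
}

fn main() {
    let seg0 = vec![(0.81, 1), (0.48, 0)];
    let seg1 = vec![(0.53, 2)];
    assert_eq!(merge_top_k(vec![seg0, seg1], 2), vec![(0.81, 1), (0.53, 2)]);
}
```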
@@ -102,7 +102,6 @@ where
             addr + 8 <= data.len(),
             "The fast field field should have been padded with 7 bytes."
         );
-        #[cfg_attr(feature = "cargo-clippy", allow(clippy::cast_ptr_alignment))]
         let val_unshifted_unmasked: u64 =
             u64::from_le(unsafe { ptr::read_unaligned(data[addr..].as_ptr() as *const u64) });
         let val_shifted = (val_unshifted_unmasked >> bit_shift) as u64;
@@ -126,7 +125,6 @@ where
         for output_val in output.iter_mut() {
             let addr = addr_in_bits >> 3;
             let bit_shift = addr_in_bits & 7;
-            #[cfg_attr(feature = "cargo-clippy", allow(clippy::cast_ptr_alignment))]
            let val_unshifted_unmasked: u64 =
                unsafe { ptr::read_unaligned(data[addr..].as_ptr() as *const u64) };
            let val_shifted = (val_unshifted_unmasked >> bit_shift) as u64;
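Both hunks above read a bit-packed value by loading 8 bytes at `addr = addr_in_bits >> 3`, shifting right by `addr_in_bits & 7`, and masking. A safe, standalone sketch of that access pattern (the helper name and the `num_bits <= 56` restriction are assumptions of this illustration, not tantivy code):

```rust
use std::convert::TryInto;

/// Reads `num_bits` bits starting at `addr_in_bits` from a little-endian
/// bit-packed buffer that has been padded with at least 7 extra bytes.
/// Valid for num_bits <= 56, so a value never straddles more than 8 bytes.
fn read_bits(data: &[u8], addr_in_bits: usize, num_bits: u8) -> u64 {
    assert!(num_bits <= 56);
    let addr = addr_in_bits >> 3; // byte containing the first bit
    let bit_shift = (addr_in_bits & 7) as u32; // offset of that bit in the byte
    let word = u64::from_le_bytes(data[addr..addr + 8].try_into().expect("padded buffer"));
    let mask = (1u64 << num_bits) - 1;
    (word >> bit_shift) & mask
}

fn main() {
    // 3-bit values [5, 3, 6] packed least-significant-bit first.
    let data = [0b1001_1101u8, 0b0000_0001, 0, 0, 0, 0, 0, 0, 0, 0];
    assert_eq!(read_bits(&data, 0, 3), 5);
    assert_eq!(read_bits(&data, 3, 3), 3);
    assert_eq!(read_bits(&data, 6, 3), 6);
}
```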
@@ -34,17 +34,17 @@ impl TinySet
     }

     /// Returns the complement of the set in `[0, 64[`.
-    fn complement(self) -> TinySet {
+    fn complement(&self) -> TinySet {
         TinySet(!self.0)
     }

     /// Returns true iff the `TinySet` contains the element `el`.
-    pub fn contains(self, el: u32) -> bool {
+    pub fn contains(&self, el: u32) -> bool {
         !self.intersect(TinySet::singleton(el)).is_empty()
     }

     /// Returns the intersection of `self` and `other`
-    pub fn intersect(self, other: TinySet) -> TinySet {
+    pub fn intersect(&self, other: TinySet) -> TinySet {
         TinySet(self.0 & other.0)
     }

@@ -77,7 +77,7 @@ impl TinySet {

     /// Returns true iff the `TinySet` is empty.
     #[inline(always)]
-    pub fn is_empty(self) -> bool {
+    pub fn is_empty(&self) -> bool {
         self.0 == 0u64
     }

@@ -114,7 +114,7 @@ impl TinySet {
         self.0 = 0u64;
     }

-    pub fn len(self) -> u32 {
+    pub fn len(&self) -> u32 {
         self.0.count_ones()
     }
 }
@@ -266,14 +266,14 @@ mod tests {

     #[test]
     fn test_bitset_large() {
-        let arr = generate_nonunique_unsorted(100_000, 5_000);
+        let arr = generate_nonunique_unsorted(1_000_000, 50_000);
         let mut btreeset: BTreeSet<u32> = BTreeSet::new();
-        let mut bitset = BitSet::with_max_value(100_000);
+        let mut bitset = BitSet::with_max_value(1_000_000);
         for el in arr {
             btreeset.insert(el);
             bitset.insert(el);
         }
-        for i in 0..100_000 {
+        for i in 0..1_000_000 {
             assert_eq!(btreeset.contains(&i), bitset.contains(i));
         }
         assert_eq!(btreeset.len(), bitset.len());
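The `TinySet` hunks above only switch the receivers between `self` and `&self`; for a `Copy` newtype over a `u64` the two are behaviorally equivalent, as this standalone sketch (not tantivy's code) illustrates:

```rust
/// Minimal stand-in for a 64-element bitset backed by a single u64.
#[derive(Clone, Copy, PartialEq, Eq, Debug)]
struct TinySet(u64);

impl TinySet {
    fn singleton(el: u32) -> TinySet {
        TinySet(1u64 << el)
    }
    // Taking `self` by value is free for a Copy type: the u64 is copied
    // into the call, exactly as `&self` would read it.
    fn intersect(self, other: TinySet) -> TinySet {
        TinySet(self.0 & other.0)
    }
    fn is_empty(self) -> bool {
        self.0 == 0
    }
    fn contains(self, el: u32) -> bool {
        !self.intersect(TinySet::singleton(el)).is_empty()
    }
}

fn main() {
    let set = TinySet(0b1010);
    assert!(set.contains(1) && set.contains(3));
    assert!(!set.contains(2));
}
```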
@@ -4,8 +4,6 @@ use common::VInt;
 use directory::ReadOnlySource;
 use directory::WritePtr;
 use schema::Field;
-use space_usage::FieldUsage;
-use space_usage::PerFieldSpaceUsage;
 use std::collections::HashMap;
 use std::io::Write;
 use std::io::{self, Read};
@@ -74,8 +72,7 @@ impl<W: Write> CompositeWrite<W> {
         let footer_offset = self.write.written_bytes();
         VInt(self.offsets.len() as u64).serialize(&mut self.write)?;

-        let mut offset_fields: Vec<_> = self
-            .offsets
+        let mut offset_fields: Vec<_> = self.offsets
             .iter()
             .map(|(file_addr, offset)| (*offset, *file_addr))
             .collect();
@@ -168,17 +165,6 @@ impl CompositeFile {
             .get(&FileAddr { field, idx })
             .map(|&(from, to)| self.data.slice(from, to))
     }
-
-    pub fn space_usage(&self) -> PerFieldSpaceUsage {
-        let mut fields = HashMap::new();
-        for (&field_addr, &(start, end)) in self.offsets_index.iter() {
-            fields
-                .entry(field_addr.field)
-                .or_insert_with(|| FieldUsage::empty(field_addr.field))
-                .add_field_idx(field_addr.idx, end - start);
-        }
-        PerFieldSpaceUsage::new(fields)
-    }
 }

 #[cfg(test)]
@@ -10,6 +10,8 @@ pub struct VInt(pub u64);
 const STOP_BIT: u8 = 128;

 impl VInt {
+
+
     pub fn val(&self) -> u64 {
         self.0
     }
@@ -18,13 +20,14 @@ impl VInt {
         VInt::deserialize(reader).map(|vint| vint.0)
     }

-    pub fn serialize_into_vec(&self, output: &mut Vec<u8>) {
+    pub fn serialize_into_vec(&self, output: &mut Vec<u8>){
         let mut buffer = [0u8; 10];
         let num_bytes = self.serialize_into(&mut buffer);
         output.extend(&buffer[0..num_bytes]);
     }

     fn serialize_into(&self, buffer: &mut [u8; 10]) -> usize {
+
         let mut remaining = self.0;
         for (i, b) in buffer.iter_mut().enumerate() {
             let next_byte: u8 = (remaining % 128u64) as u8;
@@ -71,6 +74,7 @@ impl BinarySerializable for VInt {
     }
 }

+
 #[cfg(test)]
 mod tests {

@@ -85,10 +89,10 @@ mod tests {
         }
         assert!(num_bytes > 0);
         if num_bytes < 10 {
-            assert!(1u64 << (7 * num_bytes) > val);
+            assert!(1u64 << (7*num_bytes) > val);
         }
         if num_bytes > 1 {
-            assert!(1u64 << (7 * (num_bytes - 1)) <= val);
+            assert!(1u64 << (7*(num_bytes-1)) <= val);
         }
         let serdeser_val = VInt::deserialize(&mut &v[..]).unwrap();
         assert_eq!(val, serdeser_val.0);
@@ -101,11 +105,11 @@ mod tests {
         aux_test_vint(5);
         aux_test_vint(u64::max_value());
         for i in 1..9 {
-            let power_of_128 = 1u64 << (7 * i);
+            let power_of_128 = 1u64 << (7*i);
             aux_test_vint(power_of_128 - 1u64);
-            aux_test_vint(power_of_128);
+            aux_test_vint(power_of_128 );
             aux_test_vint(power_of_128 + 1u64);
         }
         aux_test_vint(10);
     }
 }
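`VInt` stores an integer as little-endian groups of 7 bits; judging from the `STOP_BIT` constant and the `remaining % 128` loop above, the high bit marks the final byte. A self-contained sketch of that scheme (my own illustration, not tantivy's exact implementation):

```rust
const STOP_BIT: u8 = 128;

/// Encodes `val` as 7-bit groups, least significant group first;
/// the stop bit flags the last byte.
fn vint_encode(mut val: u64, out: &mut Vec<u8>) {
    loop {
        let byte = (val % 128) as u8;
        val /= 128;
        if val == 0 {
            out.push(byte | STOP_BIT);
            return;
        }
        out.push(byte);
    }
}

/// Decodes a value produced by `vint_encode`, returning (value, bytes read).
fn vint_decode(bytes: &[u8]) -> (u64, usize) {
    let mut val = 0u64;
    for (i, &b) in bytes.iter().enumerate() {
        val |= u64::from(b & 127) << (7 * i);
        if b & STOP_BIT != 0 {
            return (val, i + 1);
        }
    }
    panic!("truncated vint");
}

fn main() {
    let mut buf = Vec::new();
    vint_encode(300, &mut buf); // 300 = 44 + 2 * 128 -> [44, 2 | STOP_BIT]
    assert_eq!(buf, vec![44, 0x82]);
    assert_eq!(vint_decode(&buf), (300, 2));
}
```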
@@ -1,134 +0,0 @@
-use crossbeam::channel;
-use scoped_pool::{Pool, ThreadConfig};
-use Result;
-
-/// Search executor whether search request are single thread or multithread.
-///
-/// We don't expose Rayon thread pool directly here for several reasons.
-///
-/// First dependency hell. It is not a good idea to expose the
-/// API of a dependency, knowing it might conflict with a different version
-/// used by the client. Second, we may stop using rayon in the future.
-pub enum Executor {
-    SingleThread,
-    ThreadPool(Pool),
-}
-
-impl Executor {
-    /// Creates an Executor that performs all task in the caller thread.
-    pub fn single_thread() -> Executor {
-        Executor::SingleThread
-    }
-
-    // Creates an Executor that dispatches the tasks in a thread pool.
-    pub fn multi_thread(num_threads: usize, prefix: &'static str) -> Executor {
-        let thread_config = ThreadConfig::new().prefix(prefix);
-        let pool = Pool::with_thread_config(num_threads, thread_config);
-        Executor::ThreadPool(pool)
-    }
-
-    // Perform a map in the thread pool.
-    //
-    // Regardless of the executor (`SingleThread` or `ThreadPool`), panics in the task
-    // will propagate to the caller.
-    pub fn map<
-        A: Send,
-        R: Send,
-        AIterator: Iterator<Item = A>,
-        F: Sized + Sync + Fn(A) -> Result<R>,
-    >(
-        &self,
-        f: F,
-        args: AIterator,
-    ) -> Result<Vec<R>> {
-        match self {
-            Executor::SingleThread => args.map(f).collect::<Result<_>>(),
-            Executor::ThreadPool(pool) => {
-                let args_with_indices: Vec<(usize, A)> = args.enumerate().collect();
-                let num_fruits = args_with_indices.len();
-                let fruit_receiver = {
-                    let (fruit_sender, fruit_receiver) = channel::unbounded();
-                    pool.scoped(|scope| {
-                        for arg_with_idx in args_with_indices {
-                            scope.execute(|| {
-                                let (idx, arg) = arg_with_idx;
-                                let fruit = f(arg);
-                                if let Err(err) = fruit_sender.send((idx, fruit)) {
-                                    error!("Failed to send search task. It probably means all search threads have panicked. {:?}", err);
-                                }
-                            });
-                        }
-                    });
-                    fruit_receiver
-                    // This ends the scope of fruit_sender.
-                    // This is important as it makes it possible for the fruit_receiver iteration to
-                    // terminate.
-                };
-                let mut results = Vec::with_capacity(num_fruits);
-                unsafe { results.set_len(num_fruits) };
-                let mut num_items = 0;
-                for (pos, fruit_res) in fruit_receiver {
-                    results[pos] = fruit_res?;
-                    num_items += 1;
-                }
-                // this checks ensures that we filled of this
-                // uninitialized memory.
-                assert_eq!(num_items, results.len());
-                Ok(results)
-            }
-        }
-    }
-}
-
-#[cfg(test)]
-mod tests {
-
-    use super::Executor;
-
-    #[test]
-    #[should_panic(expected = "panic should propagate")]
-    fn test_panic_propagates_single_thread() {
-        let _result: Vec<usize> = Executor::single_thread()
-            .map(
-                |_| {
-                    panic!("panic should propagate");
-                },
-                vec![0].into_iter(),
-            ).unwrap();
-    }
-
-    #[test]
-    #[should_panic] //< unfortunately the panic message is not propagated
-    fn test_panic_propagates_multi_thread() {
-        let _result: Vec<usize> = Executor::multi_thread(1, "search-test")
-            .map(
-                |_| {
-                    panic!("panic should propagate");
-                },
-                vec![0].into_iter(),
-            ).unwrap();
-    }
-
-    #[test]
-    fn test_map_singlethread() {
-        let result: Vec<usize> = Executor::single_thread()
-            .map(|i| Ok(i * 2), 0..1_000)
-            .unwrap();
-        assert_eq!(result.len(), 1_000);
-        for i in 0..1_000 {
-            assert_eq!(result[i], i * 2);
-        }
-    }
-
-    }
-
-    #[test]
-    fn test_map_multithread() {
-        let result: Vec<usize> = Executor::multi_thread(3, "search-test")
-            .map(|i| Ok(i * 2), 0..10)
-            .unwrap();
-        assert_eq!(result.len(), 10);
-        for i in 0..10 {
-            assert_eq!(result[i], i * 2);
-        }
-    }
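The removed `Executor::map` fans tasks out to a scoped thread pool and reassembles the results in submission order. The same shape can be sketched with only the standard library (`std::thread::scope` instead of the `scoped_pool`/`crossbeam` machinery used above; an illustration, not the removed implementation):

```rust
use std::thread;

/// Doubles every element in parallel, one scoped thread per chunk,
/// and reassembles the results in input order.
fn parallel_double(items: &[usize]) -> Vec<usize> {
    thread::scope(|scope| {
        let handles: Vec<_> = items
            .chunks(4) // fan out in small batches
            .map(|chunk| scope.spawn(move || chunk.iter().map(|i| i * 2).collect::<Vec<_>>()))
            .collect();
        // Joining in spawn order keeps the output aligned with the input.
        handles
            .into_iter()
            .flat_map(|h| h.join().expect("worker panicked"))
            .collect()
    })
}

fn main() {
    let input: Vec<usize> = (0..10).collect();
    assert_eq!(parallel_double(&input), (0..10).map(|i| i * 2).collect::<Vec<_>>());
}
```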
@@ -1,37 +1,34 @@
+use core::SegmentId;
+use error::TantivyError;
+use schema::Schema;
+use serde_json;
+use std::borrow::BorrowMut;
+use std::fmt;
+use std::sync::atomic::{AtomicUsize, Ordering};
+use std::sync::Arc;
+use Result;
+
 use super::pool::LeasedItem;
 use super::pool::Pool;
 use super::segment::create_segment;
 use super::segment::Segment;
 use core::searcher::Searcher;
-use core::Executor;
 use core::IndexMeta;
-use core::SegmentId;
 use core::SegmentMeta;
 use core::SegmentReader;
 use core::META_FILEPATH;
-use directory::ManagedDirectory;
 #[cfg(feature = "mmap")]
 use directory::MmapDirectory;
 use directory::{Directory, RAMDirectory};
-use error::TantivyError;
+use directory::{DirectoryClone, ManagedDirectory};
 use indexer::index_writer::open_index_writer;
 use indexer::index_writer::HEAP_SIZE_MIN;
 use indexer::segment_updater::save_new_metas;
-use indexer::LockType;
+use indexer::DirectoryLock;
 use num_cpus;
-use schema::Field;
-use schema::FieldType;
-use schema::Schema;
-use serde_json;
-use std::borrow::BorrowMut;
-use std::fmt;
 use std::path::Path;
-use std::sync::atomic::{AtomicUsize, Ordering};
-use std::sync::Arc;
-use tokenizer::BoxedTokenizer;
 use tokenizer::TokenizerManager;
 use IndexWriter;
-use Result;

 fn load_metas(directory: &Directory) -> Result<IndexMeta> {
     let meta_data = directory.atomic_read(&META_FILEPATH)?;
@@ -46,39 +43,10 @@ pub struct Index {
     schema: Schema,
     num_searchers: Arc<AtomicUsize>,
     searcher_pool: Arc<Pool<Searcher>>,
-    executor: Arc<Executor>,
     tokenizers: TokenizerManager,
 }

 impl Index {
-    /// Examines the director to see if it contains an index
-    pub fn exists<Dir: Directory>(dir: &Dir) -> bool {
-        dir.exists(&META_FILEPATH)
-    }
-
-    /// Accessor to the search executor.
-    ///
-    /// This pool is used by default when calling `searcher.search(...)`
-    /// to perform search on the individual segments.
-    ///
-    /// By default the executor is single thread, and simply runs in the calling thread.
-    pub fn search_executor(&self) -> &Executor {
-        self.executor.as_ref()
-    }
-
-    /// Replace the default single thread search executor pool
-    /// by a thread pool with a given number of threads.
-    pub fn set_multithread_executor(&mut self, num_threads: usize) {
-        self.executor = Arc::new(Executor::multi_thread(num_threads, "thrd-tantivy-search-"));
-    }
-
-    /// Replace the default single thread search executor pool
-    /// by a thread pool with a given number of threads.
-    pub fn set_default_multithread_executor(&mut self) {
-        let default_num_threads = num_cpus::get();
-        self.set_multithread_executor(default_num_threads);
-    }
-
     /// Creates a new index using the `RAMDirectory`.
     ///
     /// The index will be allocated in anonymous memory.
@@ -95,30 +63,9 @@ impl Index {
     #[cfg(feature = "mmap")]
     pub fn create_in_dir<P: AsRef<Path>>(directory_path: P, schema: Schema) -> Result<Index> {
         let mmap_directory = MmapDirectory::open(directory_path)?;
-        if Index::exists(&mmap_directory) {
-            return Err(TantivyError::IndexAlreadyExists);
-        }
-
         Index::create(mmap_directory, schema)
     }

-    /// Opens or creates a new index in the provided directory
-    #[cfg(feature = "mmap")]
-    pub fn open_or_create<Dir: Directory>(dir: Dir, schema: Schema) -> Result<Index> {
-        if Index::exists(&dir) {
-            let index = Index::open(dir)?;
-            if index.schema() == schema {
-                Ok(index)
-            } else {
-                Err(TantivyError::SchemaError(
-                    "An index exists but the schema does not match.".to_string(),
-                ))
-            }
-        } else {
-            Index::create(dir, schema)
-        }
-    }
-
     /// Creates a new index in a temp directory.
     ///
     /// The index will use the `MMapDirectory` in a newly created directory.
@@ -140,8 +87,6 @@ impl Index {
     }

     /// Create a new index from a directory.
-    ///
-    /// This will overwrite existing meta.json
     fn from_directory(mut directory: ManagedDirectory, schema: Schema) -> Result<Index> {
         save_new_metas(schema.clone(), 0, directory.borrow_mut())?;
         let metas = IndexMeta::with_schema(schema);
@@ -158,7 +103,6 @@ impl Index {
             num_searchers: Arc::new(AtomicUsize::new(n_cpus)),
             searcher_pool: Arc::new(Pool::new()),
             tokenizers: TokenizerManager::default(),
-            executor: Arc::new(Executor::single_thread()),
         };
         index.load_searchers()?;
         Ok(index)
@@ -169,27 +113,6 @@ impl Index {
         &self.tokenizers
     }

-    /// Helper to access the tokenizer associated to a specific field.
-    pub fn tokenizer_for_field(&self, field: Field) -> Result<Box<BoxedTokenizer>> {
-        let field_entry = self.schema.get_field_entry(field);
-        let field_type = field_entry.field_type();
-        let tokenizer_manager: &TokenizerManager = self.tokenizers();
-        let tokenizer_name_opt: Option<Box<BoxedTokenizer>> = match field_type {
-            FieldType::Str(text_options) => text_options
-                .get_indexing_options()
-                .map(|text_indexing_options| text_indexing_options.tokenizer().to_string())
-                .and_then(|tokenizer_name| tokenizer_manager.get(&tokenizer_name)),
-            _ => None,
-        };
-        match tokenizer_name_opt {
-            Some(tokenizer) => Ok(tokenizer),
-            None => Err(TantivyError::SchemaError(format!(
-                "{:?} is not a text field.",
-                field_entry.name()
-            ))),
-        }
-    }
-
     /// Opens a new directory from an index path.
     #[cfg(feature = "mmap")]
     pub fn open_in_dir<P: AsRef<Path>>(directory_path: P) -> Result<Index> {
@@ -233,7 +156,7 @@ impl Index {
         num_threads: usize,
         overall_heap_size_in_bytes: usize,
     ) -> Result<IndexWriter> {
-        let directory_lock = LockType::IndexWriterLock.acquire_lock(&self.directory)?;
+        let directory_lock = DirectoryLock::lock(self.directory().box_clone())?;
         let heap_size_in_bytes_per_thread = overall_heap_size_in_bytes / num_threads;
         open_index_writer(
             self,
@@ -271,8 +194,7 @@ impl Index {

     /// Returns the list of segments that are searchable
     pub fn searchable_segments(&self) -> Result<Vec<Segment>> {
-        Ok(self
-            .searchable_segment_metas()?
+        Ok(self.searchable_segment_metas()?
             .into_iter()
             .map(|segment_meta| self.segment(segment_meta))
             .collect())
@@ -307,8 +229,7 @@ impl Index {

     /// Returns the list of segment ids that are searchable.
     pub fn searchable_segment_ids(&self) -> Result<Vec<SegmentId>> {
-        Ok(self
-            .searchable_segment_metas()?
+        Ok(self.searchable_segment_metas()?
             .iter()
             .map(|segment_meta| segment_meta.id())
             .collect())
@@ -321,18 +242,13 @@ impl Index {
         self.num_searchers.store(num_searchers, Ordering::Release);
     }

-    /// Update searchers so that they reflect the state of the last
-    /// `.commit()`.
+    /// Creates a new generation of searchers after
+    /// a change of the set of searchable indexes.
     ///
-    /// If indexing happens in the same process as searching,
-    /// you most likely want to call `.load_searchers()` right after each
-    /// successful call to `.commit()`.
-    ///
-    /// If indexing and searching happen in different processes, the way to
-    /// get the freshest `index` at all time, is to watch `meta.json` and
-    /// call `load_searchers` whenever a changes happen.
+    /// This needs to be called when a new segment has been
+    /// published or after a merge.
     pub fn load_searchers(&self) -> Result<()> {
-        let _meta_lock = LockType::MetaLock.acquire_lock(self.directory())?;
         let searchable_segments = self.searchable_segments()?;
         let segment_readers: Vec<SegmentReader> = searchable_segments
             .iter()
@@ -341,7 +257,7 @@ impl Index {
         let schema = self.schema();
         let num_searchers: usize = self.num_searchers.load(Ordering::Acquire);
         let searchers = (0..num_searchers)
-            .map(|_| Searcher::new(schema.clone(), self.clone(), segment_readers.clone()))
+            .map(|_| Searcher::new(schema.clone(), segment_readers.clone()))
             .collect();
         self.searcher_pool.publish_new_generation(searchers);
         Ok(())
@@ -376,79 +292,6 @@ impl Clone for Index {
             num_searchers: Arc::clone(&self.num_searchers),
             searcher_pool: Arc::clone(&self.searcher_pool),
             tokenizers: self.tokenizers.clone(),
-            executor: self.executor.clone(),
         }
     }
 }
-
-#[cfg(test)]
-mod tests {
-    use directory::RAMDirectory;
-    use schema::{Schema, INT_INDEXED, TEXT};
-    use Index;
-
-    #[test]
-    fn test_indexer_for_field() {
-        let mut schema_builder = Schema::builder();
-        let num_likes_field = schema_builder.add_u64_field("num_likes", INT_INDEXED);
-        let body_field = schema_builder.add_text_field("body", TEXT);
-        let schema = schema_builder.build();
-        let index = Index::create_in_ram(schema);
-        assert!(index.tokenizer_for_field(body_field).is_ok());
-        assert_eq!(
-            format!("{:?}", index.tokenizer_for_field(num_likes_field).err()),
-            "Some(SchemaError(\"\\\"num_likes\\\" is not a text field.\"))"
-        );
-    }
-
-    #[test]
-    fn test_index_exists() {
-        let directory = RAMDirectory::create();
-        assert!(!Index::exists(&directory));
-        assert!(Index::create(directory.clone(), throw_away_schema()).is_ok());
-        assert!(Index::exists(&directory));
-    }
-
-    #[test]
-    fn open_or_create_should_create() {
-        let directory = RAMDirectory::create();
-        assert!(!Index::exists(&directory));
-        assert!(Index::open_or_create(directory.clone(), throw_away_schema()).is_ok());
-        assert!(Index::exists(&directory));
-    }
-
-    #[test]
-    fn open_or_create_should_open() {
-        let directory = RAMDirectory::create();
-        assert!(Index::create(directory.clone(), throw_away_schema()).is_ok());
-        assert!(Index::exists(&directory));
-        assert!(Index::open_or_create(directory, throw_away_schema()).is_ok());
-    }
-
-    #[test]
-    fn create_should_wipeoff_existing() {
-        let directory = RAMDirectory::create();
-        assert!(Index::create(directory.clone(), throw_away_schema()).is_ok());
-        assert!(Index::exists(&directory));
-        assert!(Index::create(directory.clone(), Schema::builder().build()).is_ok());
-    }
-
-    #[test]
-    fn open_or_create_exists_but_schema_does_not_match() {
-        let directory = RAMDirectory::create();
-        assert!(Index::create(directory.clone(), throw_away_schema()).is_ok());
-        assert!(Index::exists(&directory));
-        assert!(Index::open_or_create(directory.clone(), throw_away_schema()).is_ok());
-        let err = Index::open_or_create(directory, Schema::builder().build());
-        assert_eq!(
-            format!("{:?}", err.unwrap_err()),
-            "SchemaError(\"An index exists but the schema does not match.\")"
-        );
-    }
-
-    fn throw_away_schema() -> Schema {
-        let mut schema_builder = Schema::builder();
-        let _ = schema_builder.add_u64_field("num_likes", INT_INDEXED);
-        schema_builder.build()
-    }
-}
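The reworded doc comment above is the user-facing contract for `load_searchers`: after a successful `commit`, searchers must be reloaded before new documents become visible. A minimal usage sketch built only from calls that appear in this diff (`create_in_ram`, `writer_with_num_threads`, `add_document`, `commit`, `load_searchers`, `searcher`); error handling is left out and the exact schema-builder constructor varies between the two versions shown:

```rust
#[macro_use]
extern crate tantivy;

use tantivy::schema::{SchemaBuilder, TEXT};
use tantivy::Index;

fn main() -> tantivy::Result<()> {
    let mut schema_builder = SchemaBuilder::new();
    let text = schema_builder.add_text_field("text", TEXT);
    let index = Index::create_in_ram(schema_builder.build());

    let mut index_writer = index.writer_with_num_threads(1, 40_000_000)?;
    index_writer.add_document(doc!(text => "hello happy tax payer"));
    index_writer.commit()?;

    // Without this call, the searcher pool keeps serving the previous
    // generation of segment readers and the new document stays invisible.
    index.load_searchers()?;
    assert_eq!(index.searcher().num_docs(), 1);
    Ok(())
}
```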
@@ -46,19 +46,19 @@ impl fmt::Debug for IndexMeta {
 mod tests {

     use super::IndexMeta;
-    use schema::{Schema, TEXT};
+    use schema::{SchemaBuilder, TEXT};
     use serde_json;

     #[test]
     fn test_serialize_metas() {
         let schema = {
-            let mut schema_builder = Schema::builder();
+            let mut schema_builder = SchemaBuilder::new();
             schema_builder.add_text_field("text", TEXT);
             schema_builder.build()
         };
         let index_metas = IndexMeta {
             segments: Vec::new(),
-            schema,
+            schema: schema,
             opstamp: 0u64,
             payload: None,
         };
@@ -1,13 +1,13 @@
 use common::BinarySerializable;
 use directory::ReadOnlySource;
-use owned_read::OwnedRead;
-use positions::PositionReader;
 use postings::TermInfo;
 use postings::{BlockSegmentPostings, SegmentPostings};
 use schema::FieldType;
 use schema::IndexRecordOption;
 use schema::Term;
 use termdict::TermDictionary;
+use owned_read::OwnedRead;
+use positions::PositionReader;

 /// The inverted index reader is in charge of accessing
 /// the inverted index associated to a specific field.
@@ -32,10 +32,6 @@ pub struct InvertedIndexReader {
 }

 impl InvertedIndexReader {
-    #[cfg_attr(
-        feature = "cargo-clippy",
-        allow(clippy::needless_pass_by_value)
-    )] // for symetry
     pub(crate) fn new(
         termdict: TermDictionary,
         postings_source: ReadOnlySource,
@@ -58,12 +54,12 @@ impl InvertedIndexReader {

     /// Creates an empty `InvertedIndexReader` object, which
     /// contains no terms at all.
-    pub fn empty(field_type: &FieldType) -> InvertedIndexReader {
+    pub fn empty(field_type: FieldType) -> InvertedIndexReader {
         let record_option = field_type
             .get_index_record_option()
             .unwrap_or(IndexRecordOption::Basic);
         InvertedIndexReader {
-            termdict: TermDictionary::empty(&field_type),
+            termdict: TermDictionary::empty(field_type),
             postings_source: ReadOnlySource::empty(),
             positions_source: ReadOnlySource::empty(),
             positions_idx_source: ReadOnlySource::empty(),
@@ -104,6 +100,7 @@ impl InvertedIndexReader {
         block_postings.reset(term_info.doc_freq, postings_reader);
     }

+
     /// Returns a block postings given a `Term`.
     /// This method is for an advanced usage only.
     ///
@@ -114,7 +111,7 @@ impl InvertedIndexReader {
         option: IndexRecordOption,
     ) -> Option<BlockSegmentPostings> {
         self.get_term_info(term)
-            .map(move |term_info| self.read_block_postings_from_terminfo(&term_info, option))
+            .map(move|term_info| self.read_block_postings_from_terminfo(&term_info, option))
     }

     /// Returns a block postings given a `term_info`.
@@ -150,8 +147,7 @@ impl InvertedIndexReader {
         if option.has_positions() {
             let position_reader = self.positions_source.clone();
             let skip_reader = self.positions_idx_source.clone();
-            let position_reader =
-                PositionReader::new(position_reader, skip_reader, term_info.positions_idx);
+            let position_reader = PositionReader::new(position_reader, skip_reader, term_info.positions_idx);
             Some(position_reader)
         } else {
             None
@@ -1,4 +1,3 @@
-mod executor;
 pub mod index;
 mod index_meta;
 mod inverted_index_reader;
@@ -10,7 +9,6 @@ mod segment_id;
 mod segment_meta;
 mod segment_reader;

-pub use self::executor::Executor;
 pub use self::index::Index;
 pub use self::index_meta::IndexMeta;
 pub use self::inverted_index_reader::InvertedIndexReader;
@@ -35,4 +33,10 @@ lazy_static! {
     /// Removing this file is safe, but will prevent the garbage collection of all of the file that
     /// are currently in the directory
     pub static ref MANAGED_FILEPATH: PathBuf = PathBuf::from(".managed.json");
+
+    /// Only one process should be able to write tantivy's index at a time.
+    /// This file, when present, is in charge of preventing other processes to open an IndexWriter.
+    ///
+    /// If the process is killed and this file remains, it is safe to remove it manually.
+    pub static ref LOCKFILE_FILEPATH: PathBuf = PathBuf::from(".tantivy-indexer.lock");
 }
@@ -87,8 +87,7 @@ impl<T> Deref for LeasedItem<T> {
     type Target = T;

     fn deref(&self) -> &T {
-        &self
-            .gen_item
+        &self.gen_item
             .as_ref()
             .expect("Unwrapping a leased item should never fail")
             .item // unwrap is safe here
@@ -97,8 +96,7 @@ impl<T> Deref for LeasedItem<T> {

 impl<T> DerefMut for LeasedItem<T> {
     fn deref_mut(&mut self) -> &mut T {
-        &mut self
-            .gen_item
+        &mut self.gen_item
             .as_mut()
             .expect("Unwrapping a mut leased item should never fail")
             .item // unwrap is safe here
@@ -1,43 +1,16 @@
 use collector::Collector;
-use collector::SegmentCollector;
-use core::Executor;
 use core::InvertedIndexReader;
 use core::SegmentReader;
 use query::Query;
-use query::Scorer;
-use query::Weight;
 use schema::Document;
 use schema::Schema;
 use schema::{Field, Term};
-use space_usage::SearcherSpaceUsage;
 use std::fmt;
 use std::sync::Arc;
-use store::StoreReader;
 use termdict::TermMerger;
 use DocAddress;
-use Index;
 use Result;

-fn collect_segment<C: Collector>(
-    collector: &C,
-    weight: &Weight,
-    segment_ord: u32,
-    segment_reader: &SegmentReader,
-) -> Result<C::Fruit> {
-    let mut scorer = weight.scorer(segment_reader)?;
-    let mut segment_collector = collector.for_segment(segment_ord as u32, segment_reader)?;
-    if let Some(delete_bitset) = segment_reader.delete_bitset() {
-        scorer.for_each(&mut |doc, score| {
-            if !delete_bitset.is_deleted(doc) {
-                segment_collector.collect(doc, score);
-            }
-        });
-    } else {
-        scorer.for_each(&mut |doc, score| segment_collector.collect(doc, score));
-    }
-    Ok(segment_collector.harvest())
-}
-
 /// Holds a list of `SegmentReader`s ready for search.
 ///
 /// It guarantees that the `Segment` will not be removed before
@@ -45,43 +18,25 @@ fn collect_segment<C: Collector>(
 ///
 pub struct Searcher {
     schema: Schema,
-    index: Index,
     segment_readers: Vec<SegmentReader>,
-    store_readers: Vec<StoreReader>,
 }

 impl Searcher {
     /// Creates a new `Searcher`
-    pub(crate) fn new(
-        schema: Schema,
-        index: Index,
-        segment_readers: Vec<SegmentReader>,
-    ) -> Searcher {
-        let store_readers = segment_readers
-            .iter()
-            .map(|segment_reader| segment_reader.get_store_reader())
-            .collect();
+    pub(crate) fn new(schema: Schema, segment_readers: Vec<SegmentReader>) -> Searcher {
         Searcher {
             schema,
-            index,
             segment_readers,
-            store_readers,
         }
     }

-    /// Returns the `Index` associated to the `Searcher`
-    pub fn index(&self) -> &Index {
-        &self.index
-    }
-
     /// Fetches a document from tantivy's store given a `DocAddress`.
     ///
     /// The searcher uses the segment ordinal to route the
     /// the request to the right `Segment`.
-    pub fn doc(&self, doc_address: DocAddress) -> Result<Document> {
-        let DocAddress(segment_local_id, doc_id) = doc_address;
-        let store_reader = &self.store_readers[segment_local_id as usize];
-        store_reader.get(doc_id)
+    pub fn doc(&self, doc_address: &DocAddress) -> Result<Document> {
+        let DocAddress(segment_local_id, doc_id) = *doc_address;
+        let segment_reader = &self.segment_readers[segment_local_id as usize];
+        segment_reader.doc(doc_id)
     }

     /// Access the schema associated to the index of this searcher.
@@ -93,7 +48,7 @@ impl Searcher {
     pub fn num_docs(&self) -> u64 {
         self.segment_readers
             .iter()
-            .map(|segment_reader| u64::from(segment_reader.num_docs()))
+            .map(|segment_reader| segment_reader.num_docs() as u64)
             .sum::<u64>()
     }

@@ -102,9 +57,8 @@ impl Searcher {
     pub fn doc_freq(&self, term: &Term) -> u64 {
         self.segment_readers
             .iter()
-            .map(|segment_reader| {
-                u64::from(segment_reader.inverted_index(term.field()).doc_freq(term))
-            }).sum::<u64>()
+            .map(|segment_reader| segment_reader.inverted_index(term.field()).doc_freq(term) as u64)
+            .sum::<u64>()
     }

     /// Return the list of segment readers
@@ -117,78 +71,19 @@ impl Searcher {
         &self.segment_readers[segment_ord as usize]
     }

-    /// Runs a query on the segment readers wrapped by the searcher.
-    ///
-    /// Search works as follows :
-    ///
-    /// First the weight object associated to the query is created.
-    ///
-    /// Then, the query loops over the segments and for each segment :
-    /// - setup the collector and informs it that the segment being processed has changed.
-    /// - creates a SegmentCollector for collecting documents associated to the segment
-    /// - creates a `Scorer` object associated for this segment
-    /// - iterate through the matched documents and push them to the segment collector.
-    ///
-    /// Finally, the Collector merges each of the child collectors into itself for result usability
-    /// by the caller.
-    pub fn search<C: Collector>(&self, query: &Query, collector: &C) -> Result<C::Fruit> {
-        let executor = self.index.search_executor();
-        self.search_with_executor(query, collector, executor)
-    }
-
-    /// Same as [`search(...)`](#method.search) but multithreaded.
-    ///
-    /// The current implementation is rather naive :
-    /// multithreading is by splitting search into as many task
-    /// as there are segments.
-    ///
-    /// It is powerless at making search faster if your index consists in
-    /// one large segment.
-    ///
-    /// Also, keep in my multithreading a single query on several
-    /// threads will not improve your throughput. It can actually
-    /// hurt it. It will however, decrease the average response time.
-    pub fn search_with_executor<C: Collector>(
-        &self,
-        query: &Query,
-        collector: &C,
-        executor: &Executor,
-    ) -> Result<C::Fruit> {
-        let scoring_enabled = collector.requires_scoring();
-        let weight = query.weight(self, scoring_enabled)?;
-        let segment_readers = self.segment_readers();
-        let fruits = executor.map(
-            |(segment_ord, segment_reader)| {
-                collect_segment(
-                    collector,
-                    weight.as_ref(),
-                    segment_ord as u32,
-                    segment_reader,
-                )
-            },
-            segment_readers.iter().enumerate(),
-        )?;
-        collector.merge_fruits(fruits)
+    /// Runs a query on the segment readers wrapped by the searcher
+    pub fn search<C: Collector>(&self, query: &Query, collector: &mut C) -> Result<()> {
+        query.search(self, collector)
     }

     /// Return the field searcher associated to a `Field`.
     pub fn field(&self, field: Field) -> FieldSearcher {
-        let inv_index_readers = self
-            .segment_readers
+        let inv_index_readers = self.segment_readers
             .iter()
             .map(|segment_reader| segment_reader.inverted_index(field))
             .collect::<Vec<_>>();
         FieldSearcher::new(inv_index_readers)
     }

-    /// Summarize total space usage of this searcher.
-    pub fn space_usage(&self) -> SearcherSpaceUsage {
-        let mut space_usage = SearcherSpaceUsage::new();
-        for segment_reader in self.segment_readers.iter() {
-            space_usage.add_segment(segment_reader.space_usage());
-        }
-        space_usage
-    }
 }

 pub struct FieldSearcher {
@@ -203,8 +98,7 @@ impl FieldSearcher {
     /// Returns a Stream over all of the sorted unique terms of
     /// for the given field.
     pub fn terms(&self) -> TermMerger {
-        let term_streamers: Vec<_> = self
-            .inv_index_readers
+        let term_streamers: Vec<_> = self.inv_index_readers
             .iter()
             .map(|inverted_index| inverted_index.terms().stream())
             .collect();
@@ -214,8 +108,7 @@ impl FieldSearcher {

 impl fmt::Debug for Searcher {
     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
-        let segment_ids = self
-            .segment_readers
+        let segment_ids = self.segment_readers
             .iter()
             .map(|segment_reader| segment_reader.segment_id())
             .collect::<Vec<_>>();
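The removed doc comment on `search` describes the flow: build a weight, then for each segment create a scorer and a segment collector, feed the matching documents into it, and finally merge the per-segment fruits. The stand-in traits below mirror only that shape; they are not tantivy's actual definitions:

```rust
// Minimal stand-ins for the real traits; only the control flow matters here.
trait SegmentCollector {
    type Fruit;
    fn collect(&mut self, doc: u32, score: f32);
    fn harvest(self) -> Self::Fruit;
}

trait Collector {
    type Child: SegmentCollector;
    fn for_segment(&self, segment_ord: u32) -> Self::Child;
    fn merge_fruits(&self, fruits: Vec<<Self::Child as SegmentCollector>::Fruit>)
        -> <Self::Child as SegmentCollector>::Fruit;
}

struct CountCollector;
struct CountSegmentCollector(usize);

impl SegmentCollector for CountSegmentCollector {
    type Fruit = usize;
    fn collect(&mut self, _doc: u32, _score: f32) {
        self.0 += 1;
    }
    fn harvest(self) -> usize {
        self.0
    }
}

impl Collector for CountCollector {
    type Child = CountSegmentCollector;
    fn for_segment(&self, _segment_ord: u32) -> CountSegmentCollector {
        CountSegmentCollector(0)
    }
    fn merge_fruits(&self, fruits: Vec<usize>) -> usize {
        fruits.into_iter().sum()
    }
}

/// One "segment" is just a list of (doc, score) matches in this sketch.
fn search<C: Collector>(
    segments: &[Vec<(u32, f32)>],
    collector: &C,
) -> <C::Child as SegmentCollector>::Fruit {
    let fruits = segments
        .iter()
        .enumerate()
        .map(|(ord, matches)| {
            let mut child = collector.for_segment(ord as u32);
            for &(doc, score) in matches {
                child.collect(doc, score);
            }
            child.harvest()
        })
        .collect();
    collector.merge_fruits(fruits)
}

fn main() {
    let segments = vec![vec![(0, 1.0), (2, 0.5)], vec![(1, 0.7)]];
    assert_eq!(search(&segments, &CountCollector), 3);
}
```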
@@ -52,12 +52,12 @@ impl SegmentId {
     /// Picking the first 8 chars is ok to identify
     /// segments in a display message.
     pub fn short_uuid_string(&self) -> String {
-        (&self.0.to_simple_ref().to_string()[..8]).to_string()
+        (&self.0.simple().to_string()[..8]).to_string()
     }

     /// Returns a segment uuid string.
     pub fn uuid_string(&self) -> String {
-        self.0.to_simple_ref().to_string()
+        self.0.simple().to_string()
     }
 }

@@ -50,7 +50,7 @@ impl<'a> serde::Deserialize<'a> for SegmentMeta {
     {
         let inner = InnerSegmentMeta::deserialize(deserializer)?;
         let tracked = INVENTORY.track(inner);
-        Ok(SegmentMeta { tracked })
+        Ok(SegmentMeta { tracked: tracked })
     }
 }

@@ -4,7 +4,7 @@ use core::InvertedIndexReader;
 use core::Segment;
 use core::SegmentComponent;
 use core::SegmentId;
-use directory::ReadOnlySource;
+use core::SegmentMeta;
 use error::TantivyError;
 use fastfield::DeleteBitSet;
 use fastfield::FacetReader;
@@ -13,10 +13,10 @@ use fastfield::{self, FastFieldNotAvailableError};
 use fastfield::{BytesFastFieldReader, FastValue, MultiValueIntFastFieldReader};
 use fieldnorm::FieldNormReader;
 use schema::Cardinality;
+use schema::Document;
 use schema::Field;
 use schema::FieldType;
 use schema::Schema;
-use space_usage::SegmentSpaceUsage;
 use std::collections::HashMap;
 use std::fmt;
 use std::sync::Arc;
@@ -44,8 +44,7 @@ pub struct SegmentReader {
     inv_idx_reader_cache: Arc<RwLock<HashMap<Field, Arc<InvertedIndexReader>>>>,

     segment_id: SegmentId,
-    max_doc: DocId,
-    num_docs: DocId,
+    segment_meta: SegmentMeta,

     termdict_composite: CompositeFile,
     postings_composite: CompositeFile,
@@ -54,7 +53,7 @@ pub struct SegmentReader {
     fast_fields_composite: CompositeFile,
     fieldnorms_composite: CompositeFile,

-    store_source: ReadOnlySource,
+    store_reader: StoreReader,
     delete_bitset_opt: Option<DeleteBitSet>,
     schema: Schema,
 }
@@ -65,7 +64,7 @@ impl SegmentReader {
     /// Today, `tantivy` does not handle deletes, so it happens
     /// to also be the number of documents in the index.
     pub fn max_doc(&self) -> DocId {
-        self.max_doc
+        self.segment_meta.max_doc()
     }

     /// Returns the number of documents.
@@ -74,7 +73,7 @@ impl SegmentReader {
     /// Today, `tantivy` does not handle deletes so max doc and
     /// num_docs are the same.
     pub fn num_docs(&self) -> DocId {
-        self.num_docs
+        self.segment_meta.num_docs()
     }

     /// Returns the schema of the index this segment belongs to.
@@ -154,17 +153,15 @@ impl SegmentReader {
     /// Accessor to the `BytesFastFieldReader` associated to a given `Field`.
     pub fn bytes_fast_field_reader(&self, field: Field) -> fastfield::Result<BytesFastFieldReader> {
         let field_entry = self.schema.get_field_entry(field);
-        match *field_entry.field_type() {
-            FieldType::Bytes => {}
+        match field_entry.field_type() {
+            &FieldType::Bytes => {}
             _ => return Err(FastFieldNotAvailableError::new(field_entry)),
         }
-        let idx_reader = self
-            .fast_fields_composite
+        let idx_reader = self.fast_fields_composite
             .open_read_with_idx(field, 0)
             .ok_or_else(|| FastFieldNotAvailableError::new(field_entry))
             .map(FastFieldReader::open)?;
-        let values = self
-            .fast_fields_composite
+        let values = self.fast_fields_composite
             .open_read_with_idx(field, 1)
             .ok_or_else(|| FastFieldNotAvailableError::new(field_entry))?;
         Ok(BytesFastFieldReader::open(idx_reader, values))
@@ -178,7 +175,7 @@ impl SegmentReader {
                 "The field {:?} is not a \
                  hierarchical facet.",
                 field_entry
-            )));
+            )).into());
         }
         let term_ords_reader = self.multi_fast_field_reader(field)?;
         let termdict_source = self.termdict_composite.open_read(field).ok_or_else(|| {
@@ -189,7 +186,7 @@ impl SegmentReader {
                 field_entry.name()
             ))
         })?;
-        let termdict = TermDictionary::from_source(&termdict_source);
+        let termdict = TermDictionary::from_source(termdict_source);
         let facet_reader = FacetReader::new(term_ords_reader, termdict);
         Ok(facet_reader)
     }
@@ -197,7 +194,8 @@ impl SegmentReader {
     /// Accessor to the segment's `Field norms`'s reader.
     ///
     /// Field norms are the length (in tokens) of the fields.
-    /// It is used in the computation of the [TfIdf](https://fulmicoton.gitbooks.io/tantivy-doc/content/tfidf.html).
+    /// It is used in the computation of the [TfIdf]
+    /// (https://fulmicoton.gitbooks.io/tantivy-doc/content/tfidf.html).
     ///
     /// They are simply stored as a fast field, serialized in
     /// the `.fieldnorm` file of the segment.
@@ -215,8 +213,8 @@ impl SegmentReader {
     }

     /// Accessor to the segment's `StoreReader`.
-    pub fn get_store_reader(&self) -> StoreReader {
-        StoreReader::from_source(self.store_source.clone())
+    pub fn get_store_reader(&self) -> &StoreReader {
+        &self.store_reader
     }

     /// Open a new segment for reading.
@@ -225,8 +223,7 @@ impl SegmentReader {
         let termdict_composite = CompositeFile::open(&termdict_source)?;

         let store_source = segment.open_read(SegmentComponent::STORE)?;
-        fail_point!("SegmentReader::open#middle");
+        let store_reader = StoreReader::from_source(store_source);

         let postings_source = segment.open_read(SegmentComponent::POSTINGS)?;
         let postings_composite = CompositeFile::open(&postings_source)?;
@@ -263,14 +260,13 @@ impl SegmentReader {
         let schema = segment.schema();
         Ok(SegmentReader {
             inv_idx_reader_cache: Arc::new(RwLock::new(HashMap::new())),
-            max_doc: segment.meta().max_doc(),
-            num_docs: segment.meta().num_docs(),
+            segment_meta: segment.meta().clone(),
             termdict_composite,
             postings_composite,
             fast_fields_composite,
             fieldnorms_composite,
             segment_id: segment.id(),
-            store_source,
+            store_reader,
             delete_bitset_opt,
             positions_composite,
             positions_idx_composite,
@@ -286,8 +282,7 @@ impl SegmentReader {
     /// term dictionary associated to a specific field,
     /// and opening the posting list associated to any term.
     pub fn inverted_index(&self, field: Field) -> Arc<InvertedIndexReader> {
-        if let Some(inv_idx_reader) = self
-            .inv_idx_reader_cache
+        if let Some(inv_idx_reader) = self.inv_idx_reader_cache
             .read()
             .expect("Lock poisoned. This should never happen")
             .get(&field)
@@ -311,28 +306,25 @@ impl SegmentReader {
             // As a result, no data is associated to the inverted index.
             //
             // Returns an empty inverted index.
-            return Arc::new(InvertedIndexReader::empty(field_type));
+            return Arc::new(InvertedIndexReader::empty(field_type.clone()));
         }

        let postings_source = postings_source_opt.unwrap();

-        let termdict_source = self
-            .termdict_composite
+        let termdict_source = self.termdict_composite
            .open_read(field)
            .expect("Failed to open field term dictionary in composite file. Is the field indexed");

-        let positions_source = self
-            .positions_composite
+        let positions_source = self.positions_composite
            .open_read(field)
            .expect("Index corrupted. Failed to open field positions in composite file.");

-        let positions_idx_source = self
-            .positions_idx_composite
+        let positions_idx_source = self.positions_idx_composite
.positions_idx_composite
|
|
||||||
.open_read(field)
|
.open_read(field)
|
||||||
.expect("Index corrupted. Failed to open field positions in composite file.");
|
.expect("Index corrupted. Failed to open field positions in composite file.");
|
||||||
|
|
||||||
let inv_idx_reader = Arc::new(InvertedIndexReader::new(
|
let inv_idx_reader = Arc::new(InvertedIndexReader::new(
|
||||||
TermDictionary::from_source(&termdict_source),
|
TermDictionary::from_source(termdict_source),
|
||||||
postings_source,
|
postings_source,
|
||||||
positions_source,
|
positions_source,
|
||||||
positions_idx_source,
|
positions_idx_source,
|
||||||
@@ -349,6 +341,14 @@ impl SegmentReader {
|
|||||||
inv_idx_reader
|
inv_idx_reader
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Returns the document (or to be accurate, its stored field)
|
||||||
|
/// bearing the given doc id.
|
||||||
|
/// This method is slow and should seldom be called from
|
||||||
|
/// within a collector.
|
||||||
|
pub fn doc(&self, doc_id: DocId) -> Result<Document> {
|
||||||
|
self.store_reader.get(doc_id)
|
||||||
|
}
|
||||||
|
|
||||||
/// Returns the segment id
|
/// Returns the segment id
|
||||||
pub fn segment_id(&self) -> SegmentId {
|
pub fn segment_id(&self) -> SegmentId {
|
||||||
self.segment_id
|
self.segment_id
|
||||||
@@ -372,24 +372,6 @@ impl SegmentReader {
|
|||||||
pub fn doc_ids_alive(&self) -> SegmentReaderAliveDocsIterator {
|
pub fn doc_ids_alive(&self) -> SegmentReaderAliveDocsIterator {
|
||||||
SegmentReaderAliveDocsIterator::new(&self)
|
SegmentReaderAliveDocsIterator::new(&self)
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Summarize total space usage of this segment.
|
|
||||||
pub fn space_usage(&self) -> SegmentSpaceUsage {
|
|
||||||
SegmentSpaceUsage::new(
|
|
||||||
self.num_docs(),
|
|
||||||
self.termdict_composite.space_usage(),
|
|
||||||
self.postings_composite.space_usage(),
|
|
||||||
self.positions_composite.space_usage(),
|
|
||||||
self.positions_idx_composite.space_usage(),
|
|
||||||
self.fast_fields_composite.space_usage(),
|
|
||||||
self.fieldnorms_composite.space_usage(),
|
|
||||||
self.get_store_reader().space_usage(),
|
|
||||||
self.delete_bitset_opt
|
|
||||||
.as_ref()
|
|
||||||
.map(|x| x.space_usage())
|
|
||||||
.unwrap_or(0),
|
|
||||||
)
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
impl fmt::Debug for SegmentReader {
|
impl fmt::Debug for SegmentReader {
|
||||||
@@ -409,7 +391,7 @@ pub struct SegmentReaderAliveDocsIterator<'a> {
|
|||||||
impl<'a> SegmentReaderAliveDocsIterator<'a> {
|
impl<'a> SegmentReaderAliveDocsIterator<'a> {
|
||||||
pub fn new(reader: &'a SegmentReader) -> SegmentReaderAliveDocsIterator<'a> {
|
pub fn new(reader: &'a SegmentReader) -> SegmentReaderAliveDocsIterator<'a> {
|
||||||
SegmentReaderAliveDocsIterator {
|
SegmentReaderAliveDocsIterator {
|
||||||
reader,
|
reader: reader,
|
||||||
max_doc: reader.max_doc(),
|
max_doc: reader.max_doc(),
|
||||||
current: 0,
|
current: 0,
|
||||||
}
|
}
|
||||||
@@ -447,12 +429,12 @@ impl<'a> Iterator for SegmentReaderAliveDocsIterator<'a> {
|
|||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
mod test {
|
mod test {
|
||||||
use core::Index;
|
use core::Index;
|
||||||
use schema::{Schema, Term, STORED, TEXT};
|
use schema::{SchemaBuilder, Term, STORED, TEXT};
|
||||||
use DocId;
|
use DocId;
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn test_alive_docs_iterator() {
|
fn test_alive_docs_iterator() {
|
||||||
let mut schema_builder = Schema::builder();
|
let mut schema_builder = SchemaBuilder::new();
|
||||||
schema_builder.add_text_field("name", TEXT | STORED);
|
schema_builder.add_text_field("name", TEXT | STORED);
|
||||||
let schema = schema_builder.build();
|
let schema = schema_builder.build();
|
||||||
let index = Index::create_in_ram(schema.clone());
|
let index = Index::create_in_ram(schema.clone());
|
||||||
|
|||||||
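The hunk above adds a `doc` accessor that reads a document's stored fields through the segment-local `StoreReader`. A minimal sketch of how a caller might combine it with `doc_ids_alive()` follows; it assumes the 0.7-era crate-root re-exports (`tantivy::SegmentReader`, `tantivy::Result`) and `Schema::to_json` for display, so treat it as an illustration rather than the crate's documented API.

```rust
// Sketch only: assumes `doc_ids_alive()` and the `doc()` accessor shown in
// the hunk above, plus `Schema::to_json` for pretty-printing.
use tantivy::schema::Schema;
use tantivy::SegmentReader;

fn print_alive_docs(reader: &SegmentReader, schema: &Schema) -> tantivy::Result<()> {
    // Skip deleted documents, then fetch the stored fields of each live one.
    for doc_id in reader.doc_ids_alive() {
        let stored = reader.doc(doc_id)?;
        println!("{}", schema.to_json(&stored));
    }
    Ok(())
}
```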
@@ -77,15 +77,15 @@ pub trait Directory: DirectoryClone + fmt::Debug + Send + Sync + 'static {

 /// DirectoryClone
 pub trait DirectoryClone {
    /// Clones the directory and boxes the clone
    fn box_clone(&self) -> Box<Directory>;
 }

 impl<T> DirectoryClone for T
 where
    T: 'static + Directory + Clone,
 {
    fn box_clone(&self) -> Box<Directory> {
        Box::new(self.clone())
    }
 }
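The `box_clone` blanket impl shown above is the standard trick for cloning through a trait object. The self-contained sketch below illustrates the same pattern with a toy `Directory` trait (only a `name` method, not tantivy's real interface) so the mechanics are easy to see in isolation.

```rust
// Generic illustration of the box_clone pattern, not tantivy's actual trait.
use std::fmt::Debug;

trait Directory: DirectoryClone + Debug {
    fn name(&self) -> String;
}

trait DirectoryClone {
    fn box_clone(&self) -> Box<dyn Directory>;
}

// Any concrete directory that is `Clone` gets `box_clone` for free.
impl<T> DirectoryClone for T
where
    T: 'static + Directory + Clone,
{
    fn box_clone(&self) -> Box<dyn Directory> {
        Box::new(self.clone())
    }
}

// Which in turn lets the boxed trait object itself implement `Clone`.
impl Clone for Box<dyn Directory> {
    fn clone(&self) -> Box<dyn Directory> {
        self.box_clone()
    }
}

#[derive(Clone, Debug)]
struct RamDir;

impl Directory for RamDir {
    fn name(&self) -> String {
        "ram".to_string()
    }
}

fn main() {
    let dir: Box<dyn Directory> = Box::new(RamDir);
    let copy = dir.clone();
    println!("{}", copy.name());
}
```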
@@ -2,7 +2,6 @@ use core::MANAGED_FILEPATH;
 use directory::error::{DeleteError, IOError, OpenReadError, OpenWriteError};
 use directory::{ReadOnlySource, WritePtr};
 use error::TantivyError;
-use indexer::LockType;
 use serde_json;
 use std::collections::HashSet;
 use std::io;
@@ -14,17 +13,6 @@ use std::sync::{Arc, RwLock};
 use Directory;
 use Result;

-/// Returns true iff the file is "managed".
-/// Non-managed file are not subject to garbage collection.
-///
-/// Filenames that starts by a "." -typically locks-
-/// are not managed.
-fn is_managed(path: &Path) -> bool {
-    path.to_str()
-        .map(|p_str| !p_str.starts_with('.'))
-        .unwrap_or(true)
-}
-
 /// Wrapper of directories that keeps track of files created by Tantivy.
 ///
 /// A managed directory is just a wrapper of a directory
@@ -52,7 +40,7 @@ fn save_managed_paths(
    wlock: &RwLockWriteGuard<MetaInformation>,
 ) -> io::Result<()> {
    let mut w = serde_json::to_vec(&wlock.managed_paths)?;
-    writeln!(&mut w)?;
+    write!(&mut w, "\n")?;
    directory.atomic_write(&MANAGED_FILEPATH, &w[..])?;
    Ok(())
 }
@@ -94,35 +82,25 @@ impl ManagedDirectory {
    pub fn garbage_collect<L: FnOnce() -> HashSet<PathBuf>>(&mut self, get_living_files: L) {
        info!("Garbage collect");
        let mut files_to_delete = vec![];

-        // It is crucial to get the living files after acquiring the
-        // read lock of meta informations. That way, we
-        // avoid the following scenario.
-        //
-        // 1) we get the list of living files.
-        // 2) someone creates a new file.
-        // 3) we start garbage collection and remove this file
-        //    even though it is a living file.
-        //
-        // releasing the lock as .delete() will use it too.
        {
-            let meta_informations_rlock = self
-                .meta_informations
+            // releasing the lock as .delete() will use it too.
+            let meta_informations_rlock = self.meta_informations
                .read()
                .expect("Managed directory rlock poisoned in garbage collect.");

-            // The point of this second "file" lock is to enforce the following scenario
-            //  1) process B tries to load a new set of searcher.
-            //     The list of segments is loaded
-            //  2) writer change meta.json (for instance after a merge or a commit)
-            //  3) gc kicks in.
-            //  4) gc removes a file that was useful for process B, before process B opened it.
-            if let Ok(_meta_lock) = LockType::MetaLock.acquire_lock(self) {
-                let living_files = get_living_files();
-                for managed_path in &meta_informations_rlock.managed_paths {
-                    if !living_files.contains(managed_path) {
-                        files_to_delete.push(managed_path.clone());
-                    }
+            // It is crucial to get the living files after acquiring the
+            // read lock of meta informations. That way, we
+            // avoid the following scenario.
+            //
+            // 1) we get the list of living files.
+            // 2) someone creates a new file.
+            // 3) we start garbage collection and remove this file
+            //    even though it is a living file.
+            let living_files = get_living_files();
+
+            for managed_path in &meta_informations_rlock.managed_paths {
+                if !living_files.contains(managed_path) {
+                    files_to_delete.push(managed_path.clone());
                }
            }
        }
@@ -156,8 +134,7 @@ impl ManagedDirectory {
        if !deleted_files.is_empty() {
            // update the list of managed files by removing
            // the file that were removed.
-            let mut meta_informations_wlock = self
-                .meta_informations
+            let mut meta_informations_wlock = self.meta_informations
                .write()
                .expect("Managed directory wlock poisoned (2).");
            {
@@ -179,17 +156,8 @@ impl ManagedDirectory {
    /// registering the filepath and creating the file
    /// will not lead to garbage files that will
    /// never get removed.
-    ///
-    /// File starting by "." are reserved to locks.
-    /// They are not managed and cannot be subjected
-    /// to garbage collection.
    fn register_file_as_managed(&mut self, filepath: &Path) -> io::Result<()> {
-        // Files starting by "." (e.g. lock files) are not managed.
-        if !is_managed(filepath) {
-            return Ok(());
-        }
-        let mut meta_wlock = self
-            .meta_informations
+        let mut meta_wlock = self.meta_informations
            .write()
            .expect("Managed file lock poisoned");
        let has_changed = meta_wlock.managed_paths.insert(filepath.to_owned());
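The `is_managed` helper on the left-hand side encodes a simple rule: any file whose name starts with a dot (the lock files) never enters the managed set, so it can never be garbage collected. A standalone restatement of that rule, easy to run and test on its own:

```rust
// Standalone sketch of the `is_managed` rule removed on the right-hand side.
use std::path::Path;

fn is_managed(path: &Path) -> bool {
    // Non-UTF-8 paths are conservatively treated as managed.
    path.to_str()
        .map(|p_str| !p_str.starts_with('.'))
        .unwrap_or(true)
}

fn main() {
    assert!(is_managed(Path::new("meta.json")));
    assert!(!is_managed(Path::new(".tantivy-meta.lock")));
    println!("lock files are kept out of the managed set");
}
```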
@@ -32,8 +32,7 @@ fn open_mmap(full_path: &Path) -> result::Result<Option<MmapReadOnly>, OpenReadE
        }
    })?;

-    let meta_data = file
-        .metadata()
+    let meta_data = file.metadata()
        .map_err(|e| IOError::with_path(full_path.to_owned(), e))?;
    if meta_data.len() == 0 {
        // if the file size is 0, it will not be possible
@@ -310,8 +309,7 @@ impl Directory for MmapDirectory {
        // when the last reference is gone.
        mmap_cache.cache.remove(&full_path);
        match fs::remove_file(&full_path) {
-            Ok(_) => self
-                .sync_directory()
+            Ok(_) => self.sync_directory()
                .map_err(|e| IOError::with_path(path.to_owned(), e).into()),
            Err(e) => {
                if e.kind() == io::ErrorKind::NotFound {
@@ -364,11 +362,6 @@ mod tests {

    use super::*;

-    #[test]
-    fn test_open_non_existant_path() {
-        assert!(MmapDirectory::open(PathBuf::from("./nowhere")).is_err());
-    }
-
    #[test]
    fn test_open_empty() {
        // empty file is actually an edge case because those
@@ -100,7 +100,8 @@ impl InnerDirectory {
                );
                let io_err = make_io_err(msg);
                OpenReadError::IOError(IOError::with_path(path.to_owned(), io_err))
-            }).and_then(|readable_map| {
+            })
+            .and_then(|readable_map| {
                readable_map
                    .get(path)
                    .ok_or_else(|| OpenReadError::FileDoesNotExist(PathBuf::from(path)))
@@ -120,7 +121,8 @@ impl InnerDirectory {
                );
                let io_err = make_io_err(msg);
                DeleteError::IOError(IOError::with_path(path.to_owned(), io_err))
-            }).and_then(|mut writable_map| match writable_map.remove(path) {
+            })
+            .and_then(|mut writable_map| match writable_map.remove(path) {
                Some(_) => Ok(()),
                None => Err(DeleteError::FileDoesNotExist(PathBuf::from(path))),
            })
@@ -168,10 +170,10 @@ impl Directory for RAMDirectory {
        let path_buf = PathBuf::from(path);
        let vec_writer = VecWriter::new(path_buf.clone(), self.fs.clone());

-        let exists = self
-            .fs
+        let exists = self.fs
            .write(path_buf.clone(), &Vec::new())
            .map_err(|err| IOError::with_path(path.to_owned(), err))?;

        // force the creation of the file to mimic the MMap directory.
        if exists {
            Err(OpenWriteError::FileAlreadyExists(path_buf))
@@ -194,10 +196,6 @@ impl Directory for RAMDirectory {
    }

    fn atomic_write(&mut self, path: &Path, data: &[u8]) -> io::Result<()> {
-        fail_point!("RAMDirectory::atomic_write", |msg| Err(io::Error::new(
-            io::ErrorKind::Other,
-            msg.unwrap_or("Undefined".to_string())
-        )));
        let path_buf = PathBuf::from(path);
        let mut vec_writer = VecWriter::new(path_buf.clone(), self.fs.clone());
        self.fs.write(path_buf, &Vec::new())?;
@@ -5,6 +5,7 @@ use fst::raw::MmapReadOnly;
 use stable_deref_trait::{CloneStableDeref, StableDeref};
 use std::ops::Deref;

+
 /// Read object that represents files in tantivy.
 ///
 /// These read objects are only in charge to deliver
63 src/error.rs
@@ -4,7 +4,6 @@ use std::io;

 use directory::error::{IOError, OpenDirectoryError, OpenReadError, OpenWriteError};
 use fastfield::FastFieldNotAvailableError;
-use indexer::LockType;
 use query;
 use schema;
 use serde_json;
@@ -15,88 +14,80 @@ use std::sync::PoisonError;
 #[derive(Debug, Fail)]
 pub enum TantivyError {
    /// Path does not exist.
-    #[fail(display = "Path does not exist: '{:?}'", _0)]
+    #[fail(display = "path does not exist: '{:?}'", _0)]
    PathDoesNotExist(PathBuf),
    /// File already exists, this is a problem when we try to write into a new file.
-    #[fail(display = "File already exists: '{:?}'", _0)]
+    #[fail(display = "file already exists: '{:?}'", _0)]
    FileAlreadyExists(PathBuf),
-    /// Index already exists in this directory
-    #[fail(display = "Index already exists")]
-    IndexAlreadyExists,
-    /// Failed to acquire file lock
-    #[fail(
-        display = "Failed to acquire Lockfile: {:?}. Possible causes: another IndexWriter instance or panic during previous lock drop.",
-        _0
-    )]
-    LockFailure(LockType),
    /// IO Error.
-    #[fail(display = "An IO error occurred: '{}'", _0)]
+    #[fail(display = "an IO error occurred: '{}'", _0)]
    IOError(#[cause] IOError),
-    /// Data corruption.
-    #[fail(display = "File contains corrupted data: '{:?}'", _0)]
+    /// The data within is corrupted.
+    ///
+    /// For instance, it contains invalid JSON.
+    #[fail(display = "file contains corrupted data: '{:?}'", _0)]
    CorruptedFile(PathBuf),
    /// A thread holding the locked panicked and poisoned the lock.
-    #[fail(display = "A thread holding the locked panicked and poisoned the lock")]
+    #[fail(display = "a thread holding the locked panicked and poisoned the lock")]
    Poisoned,
    /// Invalid argument was passed by the user.
-    #[fail(display = "An invalid argument was passed: '{}'", _0)]
+    #[fail(display = "an invalid argument was passed: '{}'", _0)]
    InvalidArgument(String),
    /// An Error happened in one of the thread.
-    #[fail(display = "An error occurred in a thread: '{}'", _0)]
+    #[fail(display = "an error occurred in a thread: '{}'", _0)]
    ErrorInThread(String),
    /// An Error appeared related to the schema.
    #[fail(display = "Schema error: '{}'", _0)]
    SchemaError(String),
    /// Tried to access a fastfield reader for a field not configured accordingly.
-    #[fail(display = "Fast field not available: '{:?}'", _0)]
+    #[fail(display = "fast field not available: '{:?}'", _0)]
    FastFieldError(#[cause] FastFieldNotAvailableError),
-    /// System error. (e.g.: We failed spawning a new thread)
-    #[fail(display = "System error.'{}'", _0)]
-    SystemError(String),
 }

 impl From<FastFieldNotAvailableError> for TantivyError {
    fn from(fastfield_error: FastFieldNotAvailableError) -> TantivyError {
-        TantivyError::FastFieldError(fastfield_error)
+        TantivyError::FastFieldError(fastfield_error).into()
    }
 }

 impl From<IOError> for TantivyError {
    fn from(io_error: IOError) -> TantivyError {
-        TantivyError::IOError(io_error)
+        TantivyError::IOError(io_error).into()
    }
 }

 impl From<io::Error> for TantivyError {
    fn from(io_error: io::Error) -> TantivyError {
-        TantivyError::IOError(io_error.into())
+        TantivyError::IOError(io_error.into()).into()
    }
 }

 impl From<query::QueryParserError> for TantivyError {
    fn from(parsing_error: query::QueryParserError) -> TantivyError {
-        TantivyError::InvalidArgument(format!("Query is invalid. {:?}", parsing_error))
+        TantivyError::InvalidArgument(format!("Query is invalid. {:?}", parsing_error)).into()
    }
 }

 impl<Guard> From<PoisonError<Guard>> for TantivyError {
    fn from(_: PoisonError<Guard>) -> TantivyError {
-        TantivyError::Poisoned
+        TantivyError::Poisoned.into()
    }
 }

 impl From<OpenReadError> for TantivyError {
    fn from(error: OpenReadError) -> TantivyError {
        match error {
-            OpenReadError::FileDoesNotExist(filepath) => TantivyError::PathDoesNotExist(filepath),
-            OpenReadError::IOError(io_error) => TantivyError::IOError(io_error),
+            OpenReadError::FileDoesNotExist(filepath) => {
+                TantivyError::PathDoesNotExist(filepath).into()
+            }
+            OpenReadError::IOError(io_error) => TantivyError::IOError(io_error).into(),
        }
    }
 }

 impl From<schema::DocParsingError> for TantivyError {
    fn from(error: schema::DocParsingError) -> TantivyError {
-        TantivyError::InvalidArgument(format!("Failed to parse document {:?}", error))
+        TantivyError::InvalidArgument(format!("Failed to parse document {:?}", error)).into()
    }
 }

@@ -107,7 +98,7 @@ impl From<OpenWriteError> for TantivyError {
                TantivyError::FileAlreadyExists(filepath)
            }
            OpenWriteError::IOError(io_error) => TantivyError::IOError(io_error),
-        }
+        }.into()
    }
 }

@@ -115,11 +106,11 @@ impl From<OpenDirectoryError> for TantivyError {
    fn from(error: OpenDirectoryError) -> TantivyError {
        match error {
            OpenDirectoryError::DoesNotExist(directory_path) => {
-                TantivyError::PathDoesNotExist(directory_path)
-            }
-            OpenDirectoryError::NotADirectory(directory_path) => {
-                TantivyError::InvalidArgument(format!("{:?} is not a directory", directory_path))
+                TantivyError::PathDoesNotExist(directory_path).into()
            }
+            OpenDirectoryError::NotADirectory(directory_path) => TantivyError::InvalidArgument(
+                format!("{:?} is not a directory", directory_path),
+            ).into(),
        }
    }
 }
@@ -127,6 +118,6 @@ impl From<OpenDirectoryError> for TantivyError {
 impl From<serde_json::Error> for TantivyError {
    fn from(error: serde_json::Error) -> TantivyError {
        let io_err = io::Error::from(error);
-        TantivyError::IOError(io_err.into())
+        TantivyError::IOError(io_err.into()).into()
    }
 }
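Most of the left-hand-side changes in this file drop trailing `.into()` calls: once the expression already has type `TantivyError`, the extra conversion is a no-op, and `?` handles conversion at call sites through the `From` impls. A minimal, self-contained model of that chain (the `MyError` enum and `open_config` are invented for the example, not tantivy's types):

```rust
// Minimal model of the From + `?` conversion pattern in the hunks above.
use std::fmt;
use std::fs::File;
use std::io;

#[derive(Debug)]
enum MyError {
    IOError(io::Error),
}

impl fmt::Display for MyError {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        match *self {
            MyError::IOError(ref e) => write!(f, "an IO error occurred: '{}'", e),
        }
    }
}

impl From<io::Error> for MyError {
    fn from(io_error: io::Error) -> MyError {
        // Already a MyError; a trailing `.into()` here would be redundant.
        MyError::IOError(io_error)
    }
}

fn open_config(path: &str) -> Result<File, MyError> {
    // `?` invokes `From<io::Error> for MyError` automatically.
    Ok(File::open(path)?)
}

fn main() {
    if let Err(e) = open_config("/no/such/file") {
        println!("{}", e);
    }
}
```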
@@ -6,12 +6,12 @@ pub use self::writer::BytesFastFieldWriter;

 #[cfg(test)]
 mod tests {
-    use schema::Schema;
+    use schema::SchemaBuilder;
    use Index;

    #[test]
    fn test_bytes() {
-        let mut schema_builder = Schema::builder();
+        let mut schema_builder = SchemaBuilder::default();
        let field = schema_builder.add_bytes_field("bytesfield");
        let schema = schema_builder.build();
        let index = Index::create_in_ram(schema);
@@ -51,7 +51,7 @@ impl BytesFastFieldWriter {
        self.next_doc();
        for field_value in doc.field_values() {
            if field_value.field() == self.field {
-                if let Value::Bytes(ref bytes) = *field_value.value() {
+                if let &Value::Bytes(ref bytes) = field_value.value() {
                    self.vals.extend_from_slice(bytes);
                } else {
                    panic!(
@@ -2,7 +2,6 @@ use bit_set::BitSet;
 use common::HasLen;
 use directory::ReadOnlySource;
 use directory::WritePtr;
-use space_usage::ByteCount;
 use std::io;
 use std::io::Write;
 use DocId;
@@ -42,8 +41,7 @@ pub struct DeleteBitSet {
 impl DeleteBitSet {
    /// Opens a delete bitset given its data source.
    pub fn open(data: ReadOnlySource) -> DeleteBitSet {
-        let num_deleted: usize = data
-            .as_slice()
+        let num_deleted: usize = data.as_slice()
            .iter()
            .map(|b| b.count_ones() as usize)
            .sum();
@@ -64,11 +62,6 @@ impl DeleteBitSet {
            b & (1u8 << shift) != 0
        }
    }

-    /// Summarize total space usage of this bitset.
-    pub fn space_usage(&self) -> ByteCount {
-        self.data.len()
-    }
 }

 impl HasLen for DeleteBitSet {
@@ -56,8 +56,7 @@ impl FacetReader {

    /// Given a term ordinal returns the term associated to it.
    pub fn facet_from_ord(&self, facet_ord: TermOrdinal, output: &mut Facet) {
-        let found_term = self
-            .term_dict
+        let found_term = self.term_dict
            .ord_to_term(facet_ord as u64, output.inner_buffer_mut());
        assert!(found_term, "Term ordinal {} no found.", facet_ord);
    }
@@ -127,19 +127,19 @@ mod tests {
    use common::CompositeFile;
    use directory::{Directory, RAMDirectory, WritePtr};
    use fastfield::FastFieldReader;
-    use rand::prelude::SliceRandom;
-    use rand::rngs::StdRng;
+    use rand::Rng;
    use rand::SeedableRng;
+    use rand::XorShiftRng;
    use schema::Document;
    use schema::Field;
-    use schema::Schema;
    use schema::FAST;
+    use schema::{Schema, SchemaBuilder};
    use std::collections::HashMap;
    use std::path::Path;

    lazy_static! {
        pub static ref SCHEMA: Schema = {
-            let mut schema_builder = Schema::builder();
+            let mut schema_builder = SchemaBuilder::default();
            schema_builder.add_u64_field("field", FAST);
            schema_builder.build()
        };
@@ -298,7 +298,7 @@ mod tests {
    fn test_signed_intfastfield() {
        let path = Path::new("test");
        let mut directory: RAMDirectory = RAMDirectory::create();
-        let mut schema_builder = Schema::builder();
+        let mut schema_builder = SchemaBuilder::new();

        let i64_field = schema_builder.add_i64_field("field", FAST);
        let schema = schema_builder.build();
@@ -342,7 +342,7 @@ mod tests {
    fn test_signed_intfastfield_default_val() {
        let path = Path::new("test");
        let mut directory: RAMDirectory = RAMDirectory::create();
-        let mut schema_builder = Schema::builder();
+        let mut schema_builder = SchemaBuilder::new();
        let i64_field = schema_builder.add_i64_field("field", FAST);
        let schema = schema_builder.build();

@@ -367,10 +367,11 @@ mod tests {
        }
    }

-    // Warning: this generates the same permutation at each call
    pub fn generate_permutation() -> Vec<u64> {
-        let mut permutation: Vec<u64> = (0u64..100_000u64).collect();
-        permutation.shuffle(&mut StdRng::from_seed([1u8; 32]));
+        let seed: [u8; 16] = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16];
+        let mut rng = XorShiftRng::from_seed(seed);
+        let mut permutation: Vec<u64> = (0u64..1_000_000u64).collect();
+        rng.shuffle(&mut permutation);
        permutation
    }

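Both sides of the `generate_permutation` hunk build the same reproducible test fixture; only the rand API differs. A runnable sketch of the left-hand-side variant, assuming the rand 0.6 API (`StdRng::from_seed` taking a 32-byte seed, the `SliceRandom` trait), which makes the "same permutation at each call" behaviour easy to check:

```rust
// Sketch assuming rand 0.6 (`StdRng::from_seed`, `SliceRandom::shuffle`).
extern crate rand;

use rand::prelude::SliceRandom;
use rand::rngs::StdRng;
use rand::SeedableRng;

pub fn generate_permutation() -> Vec<u64> {
    let mut permutation: Vec<u64> = (0u64..100_000u64).collect();
    // Fixed seed: every call produces the exact same shuffle.
    permutation.shuffle(&mut StdRng::from_seed([1u8; 32]));
    permutation
}

fn main() {
    assert_eq!(generate_permutation(), generate_permutation());
    println!("deterministic permutation of {} values", generate_permutation().len());
}
```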
@@ -9,12 +9,12 @@ mod tests {

    use schema::Cardinality;
    use schema::IntOptions;
-    use schema::Schema;
+    use schema::SchemaBuilder;
    use Index;

    #[test]
    fn test_multivalued_u64() {
-        let mut schema_builder = Schema::builder();
+        let mut schema_builder = SchemaBuilder::default();
        let field = schema_builder.add_u64_field(
            "multifield",
            IntOptions::default().set_fast(Cardinality::MultiValues),
@@ -49,7 +49,7 @@ mod tests {

    #[test]
    fn test_multivalued_i64() {
-        let mut schema_builder = Schema::builder();
+        let mut schema_builder = SchemaBuilder::default();
        let field = schema_builder.add_i64_field(
            "multifield",
            IntOptions::default().set_fast(Cardinality::MultiValues),
@@ -47,11 +47,11 @@ impl<Item: FastValue> MultiValueIntFastFieldReader<Item> {
 mod tests {

    use core::Index;
-    use schema::{Document, Facet, Schema};
+    use schema::{Document, Facet, SchemaBuilder};

    #[test]
    fn test_multifastfield_reader() {
-        let mut schema_builder = Schema::builder();
+        let mut schema_builder = SchemaBuilder::new();
        let facet_field = schema_builder.add_facet_field("facets");
        let schema = schema_builder.build();
        let index = Index::create_in_ram(schema);
@@ -132,8 +132,7 @@ impl MultiValueIntFastFieldWriter {
        );

        let mut doc_vals: Vec<u64> = Vec::with_capacity(100);
-        for (start, stop) in self
-            .doc_index
+        for (start, stop) in self.doc_index
            .windows(2)
            .map(|interval| (interval[0], interval[1]))
            .chain(Some(last_interval).into_iter())
@@ -149,6 +148,7 @@ impl MultiValueIntFastFieldWriter {
                        value_serializer.add_val(val)?;
                    }
                }
+
            }
            None => {
                let val_min_max = self.vals.iter().cloned().minmax();
@@ -7,10 +7,11 @@ use directory::ReadOnlySource;
 use directory::{Directory, RAMDirectory, WritePtr};
 use fastfield::{FastFieldSerializer, FastFieldsWriter};
 use owning_ref::OwningRef;
-use schema::Schema;
+use schema::SchemaBuilder;
 use schema::FAST;
 use std::collections::HashMap;
 use std::marker::PhantomData;
+use std::mem;
 use std::path::Path;
 use DocId;

@@ -79,8 +80,7 @@ impl<Item: FastValue> FastFieldReader<Item> {
    // TODO change start to `u64`.
    // For multifastfield, start is an index in a second fastfield, not a `DocId`
    pub fn get_range(&self, start: u32, output: &mut [Item]) {
-        // ok: Item is either `u64` or `i64`
-        let output_u64: &mut [u64] = unsafe { &mut *(output as *mut [Item] as *mut [u64]) };
+        let output_u64: &mut [u64] = unsafe { mem::transmute(output) }; // ok: Item is either `u64` or `i64`
        self.bit_unpacker.get_range(start, output_u64);
        for out in output_u64.iter_mut() {
            *out = Item::from_u64(*out + self.min_value_u64).as_u64();
@@ -108,7 +108,7 @@ impl<Item: FastValue> FastFieldReader<Item> {

 impl<Item: FastValue> From<Vec<Item>> for FastFieldReader<Item> {
    fn from(vals: Vec<Item>) -> FastFieldReader<Item> {
-        let mut schema_builder = Schema::builder();
+        let mut schema_builder = SchemaBuilder::default();
        let field = schema_builder.add_u64_field("field", FAST);
        let schema = schema_builder.build();
        let path = Path::new("__dummy__");
@@ -10,28 +10,27 @@ pub fn fieldnorm_to_id(fieldnorm: u32) -> u8 {
        .unwrap_or_else(|idx| idx - 1) as u8
 }

-#[cfg_attr(feature = "cargo-clippy", allow(clippy::unreadable_literal))]
 pub const FIELD_NORMS_TABLE: [u32; 256] = [
    0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25,
    26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 42, 44, 46, 48, 50, 52, 54, 56, 60,
    64, 68, 72, 76, 80, 84, 88, 96, 104, 112, 120, 128, 136, 144, 152, 168, 184, 200, 216, 232,
-    248, 264, 280, 312, 344, 376, 408, 440, 472, 504, 536, 600, 664, 728, 792, 856, 920, 984,
-    1_048, 1176, 1304, 1432, 1560, 1688, 1816, 1944, 2072, 2328, 2584, 2840, 3096, 3352, 3608,
-    3864, 4120, 4632, 5144, 5656, 6168, 6680, 7192, 7704, 8216, 9240, 10264, 11288, 12312, 13336,
-    14360, 15384, 16408, 18456, 20504, 22552, 24600, 26648, 28696, 30744, 32792, 36888, 40984,
-    45080, 49176, 53272, 57368, 61464, 65560, 73752, 81944, 90136, 98328, 106520, 114712, 122904,
-    131096, 147480, 163864, 180248, 196632, 213016, 229400, 245784, 262168, 294936, 327704, 360472,
-    393240, 426008, 458776, 491544, 524312, 589848, 655384, 720920, 786456, 851992, 917528, 983064,
-    1048600, 1179672, 1310744, 1441816, 1572888, 1703960, 1835032, 1966104, 2097176, 2359320,
-    2621464, 2883608, 3145752, 3407896, 3670040, 3932184, 4194328, 4718616, 5242904, 5767192,
-    6291480, 6815768, 7340056, 7864344, 8388632, 9437208, 10485784, 11534360, 12582936, 13631512,
-    14680088, 15728664, 16777240, 18874392, 20971544, 23068696, 25165848, 27263000, 29360152,
-    31457304, 33554456, 37748760, 41943064, 46137368, 50331672, 54525976, 58720280, 62914584,
-    67108888, 75497496, 83886104, 92274712, 100663320, 109051928, 117440536, 125829144, 134217752,
-    150994968, 167772184, 184549400, 201326616, 218103832, 234881048, 251658264, 268435480,
-    301989912, 335544344, 369098776, 402653208, 436207640, 469762072, 503316504, 536870936,
-    603979800, 671088664, 738197528, 805306392, 872415256, 939524120, 1006632984, 1073741848,
-    1207959576, 1342177304, 1476395032, 1610612760, 1744830488, 1879048216, 2013265944,
+    248, 264, 280, 312, 344, 376, 408, 440, 472, 504, 536, 600, 664, 728, 792, 856, 920, 984, 1048,
+    1176, 1304, 1432, 1560, 1688, 1816, 1944, 2072, 2328, 2584, 2840, 3096, 3352, 3608, 3864, 4120,
+    4632, 5144, 5656, 6168, 6680, 7192, 7704, 8216, 9240, 10264, 11288, 12312, 13336, 14360, 15384,
+    16408, 18456, 20504, 22552, 24600, 26648, 28696, 30744, 32792, 36888, 40984, 45080, 49176,
+    53272, 57368, 61464, 65560, 73752, 81944, 90136, 98328, 106520, 114712, 122904, 131096, 147480,
+    163864, 180248, 196632, 213016, 229400, 245784, 262168, 294936, 327704, 360472, 393240, 426008,
+    458776, 491544, 524312, 589848, 655384, 720920, 786456, 851992, 917528, 983064, 1048600,
+    1179672, 1310744, 1441816, 1572888, 1703960, 1835032, 1966104, 2097176, 2359320, 2621464,
+    2883608, 3145752, 3407896, 3670040, 3932184, 4194328, 4718616, 5242904, 5767192, 6291480,
+    6815768, 7340056, 7864344, 8388632, 9437208, 10485784, 11534360, 12582936, 13631512, 14680088,
+    15728664, 16777240, 18874392, 20971544, 23068696, 25165848, 27263000, 29360152, 31457304,
+    33554456, 37748760, 41943064, 46137368, 50331672, 54525976, 58720280, 62914584, 67108888,
+    75497496, 83886104, 92274712, 100663320, 109051928, 117440536, 125829144, 134217752, 150994968,
+    167772184, 184549400, 201326616, 218103832, 234881048, 251658264, 268435480, 301989912,
+    335544344, 369098776, 402653208, 436207640, 469762072, 503316504, 536870936, 603979800,
+    671088664, 738197528, 805306392, 872415256, 939524120, 1006632984, 1073741848, 1207959576,
+    1342177304, 1476395032, 1610612760, 1744830488, 1879048216, 2013265944,
 ];

 #[cfg(test)]
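The hunk above only reflows the table, but the scheme behind it is worth spelling out: `fieldnorm_to_id` maps a field length to the index of the largest table entry that does not exceed it, so one byte per document suffices and decoding yields a lower bound of the original length. A self-contained sketch with a deliberately tiny 8-entry table (illustrative only, not the real 256-entry table):

```rust
// Sketch of the fieldnorm compression scheme with a toy table.
const TABLE: [u32; 8] = [0, 1, 2, 3, 4, 6, 8, 12];

fn fieldnorm_to_id(fieldnorm: u32) -> u8 {
    // Exact hit -> its index; otherwise take the entry just below.
    TABLE
        .binary_search(&fieldnorm)
        .unwrap_or_else(|idx| idx - 1) as u8
}

fn id_to_fieldnorm(id: u8) -> u32 {
    TABLE[id as usize]
}

fn main() {
    for fieldnorm in 0u32..16u32 {
        let id = fieldnorm_to_id(fieldnorm);
        let decoded = id_to_fieldnorm(id);
        assert!(decoded <= fieldnorm);
        println!("{:>2} -> id {} -> {}", fieldnorm, id, decoded);
    }
}
```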
@@ -15,7 +15,7 @@
 //! precompute computationally expensive functions of the fieldnorm
 //! in a very short array.
 //!
-//! This trick is used by the BM25 similarity.
+//! This trick is used by the [BM25 similarity]().
 mod code;
 mod reader;
 mod serializer;
@@ -2,6 +2,7 @@ use rand::thread_rng;
 use std::collections::HashSet;

 use rand::Rng;
+use rand::distributions::Range;
 use schema::*;
 use Index;
 use Searcher;
@@ -15,7 +16,7 @@ fn check_index_content(searcher: &Searcher, vals: &HashSet<u64>) {
 #[ignore]
 #[cfg(feature = "mmap")]
 fn test_indexing() {
-    let mut schema_builder = Schema::builder();
+    let mut schema_builder = SchemaBuilder::default();

    let id_field = schema_builder.add_u64_field("id", INT_INDEXED);
    let multiples_field = schema_builder.add_u64_field("multiples", INT_INDEXED);
@@ -23,6 +24,7 @@ fn test_indexing() {

    let index = Index::create_from_tempdir(schema).unwrap();

+    let universe = Range::new(0u64, 20u64);
    let mut rng = thread_rng();

    let mut index_writer = index.writer_with_num_threads(3, 120_000_000).unwrap();
@@ -31,7 +33,7 @@ fn test_indexing() {
    let mut uncommitted_docs: HashSet<u64> = HashSet::new();

    for _ in 0..200 {
-        let random_val = rng.gen_range(0, 20);
+        let random_val = rng.sample(&universe);
        if random_val == 0 {
            index_writer.commit().expect("Commit failed");
            committed_docs.extend(&uncommitted_docs);
@@ -52,8 +52,7 @@ impl DeleteQueue {
    //
    // Past delete operations are not accessible.
    pub fn cursor(&self) -> DeleteCursor {
-        let last_block = self
-            .inner
+        let last_block = self.inner
            .read()
            .expect("Read lock poisoned when opening delete queue cursor")
            .last_block
@@ -93,8 +92,7 @@ impl DeleteQueue {
    // be some unflushed operations.
    //
    fn flush(&self) -> Option<Arc<Block>> {
-        let mut self_wlock = self
-            .inner
+        let mut self_wlock = self.inner
            .write()
            .expect("Failed to acquire write lock on delete queue writer");

@@ -134,8 +132,7 @@ impl From<DeleteQueue> for NextBlock {
 impl NextBlock {
    fn next_block(&self) -> Option<Arc<Block>> {
        {
-            let next_read_lock = self
-                .0
+            let next_read_lock = self.0
                .read()
                .expect("Failed to acquire write lock in delete queue");
            if let InnerNextBlock::Closed(ref block) = *next_read_lock {
@@ -144,8 +141,7 @@ impl NextBlock {
        }
        let next_block;
        {
-            let mut next_write_lock = self
-                .0
+            let mut next_write_lock = self.0
                .write()
                .expect("Failed to acquire write lock in delete queue");
            match *next_write_lock {
@@ -186,21 +182,19 @@ impl DeleteCursor {
    /// `opstamp >= target_opstamp`.
    pub fn skip_to(&mut self, target_opstamp: u64) {
        // TODO Can be optimize as we work with block.
-        while self.is_behind_opstamp(target_opstamp) {
+        #[cfg_attr(feature = "cargo-clippy", allow(while_let_loop))]
+        loop {
+            if let Some(operation) = self.get() {
+                if operation.opstamp >= target_opstamp {
+                    break;
+                }
+            } else {
+                break;
+            }
            self.advance();
        }
    }

-    #[cfg_attr(
-        feature = "cargo-clippy",
-        allow(clippy::wrong_self_convention)
-    )]
-    fn is_behind_opstamp(&mut self, target_opstamp: u64) -> bool {
-        self.get()
-            .map(|operation| operation.opstamp < target_opstamp)
-            .unwrap_or(false)
-    }

    /// If the current block has been entirely
    /// consumed, try to load the next one.
    ///
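The `skip_to` hunk above trades a manual `loop`/`break` for a `while` whose condition is factored into `is_behind_opstamp`. The self-contained model below reproduces that shape with a toy cursor over a sorted list of opstamps (`Cursor` is a stand-in, not the real `DeleteCursor`):

```rust
// Toy model of the skip_to rewrite; Cursor stands in for DeleteCursor.
struct Cursor {
    opstamps: Vec<u64>,
    position: usize,
}

impl Cursor {
    fn get(&self) -> Option<u64> {
        self.opstamps.get(self.position).cloned()
    }

    fn advance(&mut self) {
        self.position += 1;
    }

    fn is_behind_opstamp(&self, target_opstamp: u64) -> bool {
        self.get()
            .map(|opstamp| opstamp < target_opstamp)
            .unwrap_or(false)
    }

    /// Skips to the first operation whose opstamp is >= `target_opstamp`.
    fn skip_to(&mut self, target_opstamp: u64) {
        while self.is_behind_opstamp(target_opstamp) {
            self.advance();
        }
    }
}

fn main() {
    let mut cursor = Cursor {
        opstamps: vec![3, 7, 9, 12],
        position: 0,
    };
    cursor.skip_to(8);
    assert_eq!(cursor.get(), Some(9));
    println!("cursor now at opstamp {:?}", cursor.get());
}
```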
@@ -1,130 +1,26 @@
+use core::LOCKFILE_FILEPATH;
 use directory::error::OpenWriteError;
-use std::io::Write;
-use std::path::{Path, PathBuf};
-use std::thread;
-use std::time::Duration;
 use Directory;
-use TantivyError;

-#[derive(Debug, Clone, Copy)]
-pub enum LockType {
-    /// Only one process should be able to write tantivy's index at a time.
-    /// This lock file, when present, is in charge of preventing other processes to open an IndexWriter.
-    ///
-    /// If the process is killed and this file remains, it is safe to remove it manually.
-    ///
-    /// Failing to acquire this lock usually means a misuse of tantivy's API,
-    /// (creating more than one instance of the `IndexWriter`), are a spurious
-    /// lock file remaining after a crash. In the latter case, removing the file after
-    /// checking no process running tantivy is running is safe.
-    IndexWriterLock,
-    /// The meta lock file is here to protect the segment files being opened by
-    /// `.load_searchers()` from being garbage collected.
-    /// It makes it possible for another process to safely consume
-    /// our index in-writing. Ideally, we may have prefered `RWLock` semantics
-    /// here, but it is difficult to achieve on Windows.
-    ///
-    /// Opening segment readers is a very fast process.
-    /// Right now if the lock cannot be acquire on the first attempt, the logic
-    /// is very simplistic. We retry after `100ms` until we effectively
-    /// acquire the lock.
-    /// This lock should not have much contention in normal usage.
-    MetaLock,
-}
-
-/// Retry the logic of acquiring locks is pretty simple.
-/// We just retry `n` times after a given `duratio`, both
-/// depending on the type of lock.
-struct RetryPolicy {
-    num_retries: usize,
-    wait_in_ms: u64,
-}
-
-impl RetryPolicy {
-    fn no_retry() -> RetryPolicy {
-        RetryPolicy {
-            num_retries: 0,
-            wait_in_ms: 0,
-        }
-    }
-
-    fn wait_and_retry(&mut self) -> bool {
-        if self.num_retries == 0 {
-            false
-        } else {
-            self.num_retries -= 1;
-            let wait_duration = Duration::from_millis(self.wait_in_ms);
-            thread::sleep(wait_duration);
-            true
-        }
-    }
-}
-
-impl LockType {
-    fn retry_policy(self) -> RetryPolicy {
-        match self {
-            LockType::IndexWriterLock => RetryPolicy::no_retry(),
-            LockType::MetaLock => RetryPolicy {
-                num_retries: 100,
-                wait_in_ms: 100,
-            },
-        }
-    }
-
-    fn try_acquire_lock(self, directory: &mut Directory) -> Result<DirectoryLock, TantivyError> {
-        let path = self.filename();
-        let mut write = directory.open_write(path).map_err(|e| match e {
-            OpenWriteError::FileAlreadyExists(_) => TantivyError::LockFailure(self),
-            OpenWriteError::IOError(io_error) => TantivyError::IOError(io_error),
-        })?;
-        write.flush()?;
-        Ok(DirectoryLock {
-            directory: directory.box_clone(),
-            path: path.to_owned(),
-        })
-    }
-
-    /// Acquire a lock in the given directory.
-    pub fn acquire_lock(self, directory: &Directory) -> Result<DirectoryLock, TantivyError> {
-        let mut box_directory = directory.box_clone();
-        let mut retry_policy = self.retry_policy();
-        loop {
-            let lock_result = self.try_acquire_lock(&mut *box_directory);
-            match lock_result {
-                Ok(result) => {
-                    return Ok(result);
-                }
-                Err(TantivyError::LockFailure(ref filepath)) => {
-                    if !retry_policy.wait_and_retry() {
-                        return Err(TantivyError::LockFailure(filepath.to_owned()));
-                    }
-                }
-                Err(_) => {}
-            }
-        }
-    }
-
-    fn filename(&self) -> &Path {
-        match *self {
-            LockType::MetaLock => Path::new(".tantivy-meta.lock"),
-            LockType::IndexWriterLock => Path::new(".tantivy-indexer.lock"),
-        }
-    }
-}
-
-/// The `DirectoryLock` is an object that represents a file lock.
-/// See [`LockType`](struct.LockType.html)
+/// The directory lock is a mechanism used to
+/// prevent the creation of two [`IndexWriter`](struct.IndexWriter.html)
 ///
-/// It is transparently associated to a lock file, that gets deleted
-/// on `Drop.` The lock is release automatically on `Drop`.
+/// Only one lock can exist at a time for a given directory.
+/// The lock is release automatically on `Drop`.
 pub struct DirectoryLock {
    directory: Box<Directory>,
-    path: PathBuf,
+}
+
+impl DirectoryLock {
+    pub fn lock(mut directory: Box<Directory>) -> Result<DirectoryLock, OpenWriteError> {
+        directory.open_write(&*LOCKFILE_FILEPATH)?;
+        Ok(DirectoryLock { directory })
+    }
 }

 impl Drop for DirectoryLock {
    fn drop(&mut self) {
-        if let Err(e) = self.directory.delete(&*self.path) {
+        if let Err(e) = self.directory.delete(&*LOCKFILE_FILEPATH) {
            error!("Failed to remove the lock file. {:?}", e);
        }
    }
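The left-hand side of the hunk above introduces per-lock retry behaviour: the meta lock waits 100ms between a bounded number of attempts, while the index-writer lock fails on the first conflict. A standalone model of that policy, where `try_acquire` is a placeholder closure standing in for the real lock-file creation:

```rust
// Standalone model of the RetryPolicy removed on the right-hand side.
use std::thread;
use std::time::Duration;

struct RetryPolicy {
    num_retries: usize,
    wait_in_ms: u64,
}

impl RetryPolicy {
    fn wait_and_retry(&mut self) -> bool {
        if self.num_retries == 0 {
            false
        } else {
            self.num_retries -= 1;
            thread::sleep(Duration::from_millis(self.wait_in_ms));
            true
        }
    }
}

fn acquire_with_retry<F: FnMut() -> bool>(mut try_acquire: F, mut policy: RetryPolicy) -> bool {
    loop {
        if try_acquire() {
            return true;
        }
        if !policy.wait_and_retry() {
            return false;
        }
    }
}

fn main() {
    let mut attempts = 0;
    let acquired = acquire_with_retry(
        || {
            attempts += 1;
            attempts >= 3 // succeed on the third attempt
        },
        RetryPolicy { num_retries: 5, wait_in_ms: 10 },
    );
    println!("acquired: {} after {} attempts", acquired, attempts);
}
```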
@@ -8,7 +8,7 @@ use core::SegmentComponent;
 use core::SegmentId;
 use core::SegmentMeta;
 use core::SegmentReader;
-use crossbeam::channel;
+use crossbeam_channel as channel;
 use docset::DocSet;
 use error::TantivyError;
 use fastfield::write_delete_bitset;
@@ -54,14 +54,14 @@ type DocumentReceiver = channel::Receiver<AddOperation>;
 fn initial_table_size(per_thread_memory_budget: usize) -> usize {
     let table_size_limit: usize = per_thread_memory_budget / 3;
     (1..)
+        .into_iter()
         .take_while(|num_bits: &usize| compute_table_size(*num_bits) < table_size_limit)
         .last()
-        .unwrap_or_else(|| {
-            panic!(
-                "Per thread memory is too small: {}",
-                per_thread_memory_budget
-            )
-        }).min(19) // we cap it at 512K
+        .expect(&format!(
+            "Per thread memory is too small: {}",
+            per_thread_memory_budget
+        ))
+        .min(19) // we cap it at 512K
 }

 /// `IndexWriter` is the user entry-point to add document to an index.
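Both versions of `initial_table_size` above pick the largest power-of-two hash table that still fits in a third of the per-thread memory budget, capped at 2^19 buckets (the `// we cap it at 512K` comment). A small self-contained sketch of that sizing rule; the 16-bytes-per-bucket figure in `compute_table_size` is an assumed cost model for illustration, not tantivy's real constant:

```rust
/// Assumed cost model: each bucket costs 16 bytes. Illustrative only.
fn compute_table_size(num_bits: usize) -> usize {
    (1usize << num_bits) * 16
}

/// Largest `num_bits` whose table fits in a third of the budget, capped at 19 bits.
fn initial_table_size(per_thread_memory_budget: usize) -> usize {
    let table_size_limit = per_thread_memory_budget / 3;
    (1..)
        .take_while(|num_bits: &usize| compute_table_size(*num_bits) < table_size_limit)
        .last()
        .expect("per-thread memory budget too small")
        .min(19)
}

fn main() {
    // With a 10 MB budget and the assumed cost model this prints 17;
    // a very large budget saturates at the 19-bit (524_288 bucket) cap.
    println!("{}", initial_table_size(10_000_000));
    println!("{}", initial_table_size(1_000_000_000));
}
```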
@@ -177,7 +177,7 @@ pub fn compute_deleted_bitset(
 ) -> Result<bool> {
     let mut might_have_changed = false;

-    #[cfg_attr(feature = "cargo-clippy", allow(clippy::while_let_loop))]
+    #[cfg_attr(feature = "cargo-clippy", allow(while_let_loop))]
     loop {
         if let Some(delete_op) = delete_cursor.get() {
             if delete_op.opstamp > target_opstamp {
@@ -301,29 +301,25 @@ fn index_documents(

     let last_docstamp: u64 = *(doc_opstamps.last().unwrap());

-    let segment_entry: SegmentEntry = if delete_cursor.get().is_some() {
-        let doc_to_opstamps = DocToOpstampMapping::from(doc_opstamps);
-        let segment_reader = SegmentReader::open(segment)?;
-        let mut deleted_bitset = BitSet::with_capacity(num_docs as usize);
-        let may_have_deletes = compute_deleted_bitset(
-            &mut deleted_bitset,
-            &segment_reader,
-            &mut delete_cursor,
-            &doc_to_opstamps,
-            last_docstamp,
-        )?;
-        SegmentEntry::new(segment_meta, delete_cursor, {
+    let doc_to_opstamps = DocToOpstampMapping::from(doc_opstamps);
+    let segment_reader = SegmentReader::open(segment)?;
+    let mut deleted_bitset = BitSet::with_capacity(num_docs as usize);
+    let may_have_deletes = compute_deleted_bitset(
+        &mut deleted_bitset,
+        &segment_reader,
+        &mut delete_cursor,
+        &doc_to_opstamps,
+        last_docstamp,
+    )?;
+
+    let segment_entry = SegmentEntry::new(segment_meta, delete_cursor, {
         if may_have_deletes {
             Some(deleted_bitset)
         } else {
             None
         }
-        })
-    } else {
-        // if there are no delete operation in the queue, no need
-        // to even open the segment.
-        SegmentEntry::new(segment_meta, delete_cursor, None)
-    };
+    });
     Ok(segment_updater.add_segment(generation, segment_entry))
 }

@@ -345,8 +341,7 @@ impl IndexWriter {
         }
         drop(self.workers_join_handle);

-        let result = self
-            .segment_updater
+        let result = self.segment_updater
             .wait_merging_thread()
             .map_err(|_| TantivyError::ErrorInThread("Failed to join merging thread.".into()));

@@ -388,9 +383,10 @@ impl IndexWriter {
         let mem_budget = self.heap_size_in_bytes_per_thread;
         let join_handle: JoinHandle<Result<()>> = thread::Builder::new()
             .name(format!(
-                "thrd-tantivy-index{}-gen{}",
+                "indexing thread {} for gen {}",
                 self.worker_id, generation
-            )).spawn(move || {
+            ))
+            .spawn(move || {
                 loop {
                     let mut document_iterator =
                         document_receiver_clone.clone().into_iter().peekable();
@@ -492,8 +488,7 @@ impl IndexWriter {
         let document_receiver = self.document_receiver.clone();

         // take the directory lock to create a new index_writer.
-        let directory_lock = self
-            ._directory_lock
+        let directory_lock = self._directory_lock
             .take()
             .expect("The IndexWriter does not have any lock. This is a bug, please report.");

@@ -641,10 +636,7 @@ impl IndexWriter {
     pub fn add_document(&mut self, document: Document) -> u64 {
         let opstamp = self.stamper.stamp();
         let add_operation = AddOperation { opstamp, document };
-        let send_result = self.document_sender.send(add_operation);
-        if let Err(e) = send_result {
-            panic!("Failed to index document. Sending to indexing channel failed. This probably means all of the indexing threads have panicked. {:?}", e);
-        }
+        self.document_sender.send(add_operation);
         opstamp
     }
 }
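Both sides of the `add_document` hunk above stamp the operation before handing it to the indexing threads and return that opstamp to the caller. A short usage sketch, assuming the 0.7-style API on the right-hand side of these hunks (heap size and field names are arbitrary):

```rust
#[macro_use]
extern crate tantivy;

use tantivy::schema::{SchemaBuilder, TEXT};
use tantivy::Index;

fn main() -> tantivy::Result<()> {
    let mut schema_builder = SchemaBuilder::default();
    let text_field = schema_builder.add_text_field("text", TEXT);
    let index = Index::create_in_ram(schema_builder.build());

    let mut index_writer = index.writer(50_000_000)?;
    // Each operation receives a monotonically increasing opstamp from the stamper.
    let opstamp_a = index_writer.add_document(doc!(text_field => "hello"));
    let opstamp_b = index_writer.add_document(doc!(text_field => "world"));
    assert!(opstamp_b > opstamp_a);
    // commit() returns the opstamp up to which operations have been persisted.
    let commit_opstamp = index_writer.commit()?;
    assert!(commit_opstamp >= opstamp_b);
    Ok(())
}
```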
@@ -661,33 +653,18 @@ mod tests {

     #[test]
     fn test_lockfile_stops_duplicates() {
-        let schema_builder = schema::Schema::builder();
+        let schema_builder = schema::SchemaBuilder::default();
         let index = Index::create_in_ram(schema_builder.build());
         let _index_writer = index.writer(40_000_000).unwrap();
         match index.writer(40_000_000) {
-            Err(TantivyError::LockFailure(_)) => {}
+            Err(TantivyError::FileAlreadyExists(_)) => {}
             _ => panic!("Expected FileAlreadyExists error"),
         }
     }

-    #[test]
-    fn test_lockfile_already_exists_error_msg() {
-        let schema_builder = schema::Schema::builder();
-        let index = Index::create_in_ram(schema_builder.build());
-        let _index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
-        match index.writer_with_num_threads(1, 3_000_000) {
-            Err(err) => {
-                let err_msg = err.to_string();
-                assert!(err_msg.contains("Lockfile"));
-                assert!(err_msg.contains("Possible causes:"))
-            }
-            _ => panic!("Expected LockfileAlreadyExists error"),
-        }
-    }
-
     #[test]
     fn test_set_merge_policy() {
-        let schema_builder = schema::Schema::builder();
+        let schema_builder = schema::SchemaBuilder::default();
         let index = Index::create_in_ram(schema_builder.build());
         let index_writer = index.writer(40_000_000).unwrap();
         assert_eq!(
@@ -705,7 +682,7 @@ mod tests {

     #[test]
     fn test_lockfile_released_on_drop() {
-        let schema_builder = schema::Schema::builder();
+        let schema_builder = schema::SchemaBuilder::default();
         let index = Index::create_in_ram(schema_builder.build());
         {
             let _index_writer = index.writer(40_000_000).unwrap();
@@ -717,7 +694,7 @@ mod tests {

     #[test]
     fn test_commit_and_rollback() {
-        let mut schema_builder = schema::Schema::builder();
+        let mut schema_builder = schema::SchemaBuilder::default();
         let text_field = schema_builder.add_text_field("text", schema::TEXT);
         let index = Index::create_in_ram(schema_builder.build());

@@ -751,7 +728,7 @@ mod tests {

     #[test]
     fn test_with_merges() {
-        let mut schema_builder = schema::Schema::builder();
+        let mut schema_builder = schema::SchemaBuilder::default();
         let text_field = schema_builder.add_text_field("text", schema::TEXT);
         let index = Index::create_in_ram(schema_builder.build());
         let num_docs_containing = |s: &str| {
@@ -788,7 +765,7 @@ mod tests {

     #[test]
     fn test_prepare_with_commit_message() {
-        let mut schema_builder = schema::Schema::builder();
+        let mut schema_builder = schema::SchemaBuilder::default();
         let text_field = schema_builder.add_text_field("text", schema::TEXT);
         let index = Index::create_in_ram(schema_builder.build());

@@ -822,7 +799,7 @@ mod tests {

     #[test]
     fn test_prepare_but_rollback() {
-        let mut schema_builder = schema::Schema::builder();
+        let mut schema_builder = schema::SchemaBuilder::default();
         let text_field = schema_builder.add_text_field("text", schema::TEXT);
         let index = Index::create_in_ram(schema_builder.build());

@@ -866,32 +843,4 @@ mod tests {
         assert_eq!(initial_table_size(1_000_000_000), 19);
     }

-    #[cfg(not(feature = "no_fail"))]
-    #[test]
-    fn test_write_commit_fails() {
-        use fail;
-        let mut schema_builder = schema::Schema::builder();
-        let text_field = schema_builder.add_text_field("text", schema::TEXT);
-        let index = Index::create_in_ram(schema_builder.build());
-
-        let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
-        for _ in 0..100 {
-            index_writer.add_document(doc!(text_field => "a"));
-        }
-        index_writer.commit().unwrap();
-        fail::cfg("RAMDirectory::atomic_write", "return(error_write_failed)").unwrap();
-        for _ in 0..100 {
-            index_writer.add_document(doc!(text_field => "b"));
-        }
-        assert!(index_writer.commit().is_err());
-        index.load_searchers().unwrap();
-        let num_docs_containing = |s: &str| {
-            let searcher = index.searcher();
-            let term_a = Term::from_field_text(text_field, s);
-            searcher.doc_freq(&term_a)
-        };
-        assert_eq!(num_docs_containing("a"), 100);
-        assert_eq!(num_docs_containing("b"), 0);
-        fail::cfg("RAMDirectory::atomic_write", "off").unwrap();
-    }
 }
@@ -21,17 +21,17 @@ pub trait MergePolicy: MergePolicyClone + marker::Send + marker::Sync + Debug {

 /// MergePolicyClone
 pub trait MergePolicyClone {
     /// Returns a boxed clone of the MergePolicy.
     fn box_clone(&self) -> Box<MergePolicy>;
 }

 impl<T> MergePolicyClone for T
 where
     T: 'static + MergePolicy + Clone,
 {
     fn box_clone(&self) -> Box<MergePolicy> {
         Box::new(self.clone())
     }
 }

 /// Never merge segments.
@@ -40,13 +40,15 @@ fn compute_total_num_tokens(readers: &[SegmentReader], field: Field) -> u64 {
             total_tokens += reader.inverted_index(field).total_num_tokens();
         }
     }
-    total_tokens + count
-        .iter()
-        .cloned()
-        .enumerate()
-        .map(|(fieldnorm_ord, count)| {
-            count as u64 * u64::from(FieldNormReader::id_to_fieldnorm(fieldnorm_ord as u8))
-        }).sum::<u64>()
+    total_tokens
+        + count
+            .iter()
+            .cloned()
+            .enumerate()
+            .map(|(fieldnorm_ord, count)| {
+                count as u64 * FieldNormReader::id_to_fieldnorm(fieldnorm_ord as u8) as u64
+            })
+            .sum::<u64>()
 }

 pub struct IndexMerger {
@@ -109,7 +111,7 @@ impl TermOrdinalMapping {
             .iter()
             .flat_map(|term_ordinals| term_ordinals.iter().cloned().max())
             .max()
-            .unwrap_or_else(TermOrdinal::default)
+            .unwrap_or(TermOrdinal::default())
     }
 }

@@ -188,7 +190,7 @@ impl IndexMerger {
                 `term_ordinal_mapping`.");
             self.write_hierarchical_facet_field(
                 field,
-                &term_ordinal_mapping,
+                term_ordinal_mapping,
                 fast_field_serializer,
             )?;
         }
@@ -312,7 +314,7 @@ impl IndexMerger {
     fn write_hierarchical_facet_field(
         &self,
         field: Field,
-        term_ordinal_mappings: &TermOrdinalMapping,
+        term_ordinal_mappings: TermOrdinalMapping,
         fast_field_serializer: &mut FastFieldSerializer,
     ) -> Result<()> {
         // Multifastfield consists in 2 fastfields.
@@ -391,8 +393,8 @@ impl IndexMerger {

         // We can now initialize our serializer, and push it the different values
         {
-            let mut serialize_vals = fast_field_serializer
-                .new_u64_fast_field_with_idx(field, min_value, max_value, 1)?;
+            let mut serialize_vals =
+                fast_field_serializer.new_u64_fast_field_with_idx(field, min_value, max_value, 1)?;
             for reader in &self.readers {
                 let ff_reader: MultiValueIntFastFieldReader<u64> =
                     reader.multi_fast_field_reader(field)?;
@@ -438,8 +440,7 @@ impl IndexMerger {
     ) -> Result<Option<TermOrdinalMapping>> {
         let mut positions_buffer: Vec<u32> = Vec::with_capacity(1_000);
         let mut delta_computer = DeltaComputer::new();
-        let field_readers = self
-            .readers
+        let field_readers = self.readers
             .iter()
             .map(|reader| reader.inverted_index(indexed_field))
             .collect::<Vec<_>>();
@@ -523,7 +524,8 @@ impl IndexMerger {
                     }
                 }
                 None
-            }).collect();
+            })
+            .collect();

         // At this point, `segment_postings` contains the posting list
         // of all of the segments containing the given term.
@@ -614,7 +616,7 @@ impl IndexMerger {
                     store_writer.store(&doc)?;
                 }
             } else {
-                store_writer.stack(&store_reader)?;
+                store_writer.stack(store_reader)?;
             }
         }
         Ok(())
@@ -635,9 +637,10 @@ impl SerializableSegment for IndexMerger {
|
|||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
mod tests {
|
mod tests {
|
||||||
use byteorder::{BigEndian, ReadBytesExt, WriteBytesExt};
|
use byteorder::{BigEndian, ReadBytesExt, WriteBytesExt};
|
||||||
|
use collector::chain;
|
||||||
use collector::tests::TestCollector;
|
use collector::tests::TestCollector;
|
||||||
use collector::tests::{BytesFastFieldTestCollector, FastFieldTestCollector};
|
use collector::tests::{BytesFastFieldTestCollector, FastFieldTestCollector};
|
||||||
use collector::{Count, FacetCollector};
|
use collector::FacetCollector;
|
||||||
use core::Index;
|
use core::Index;
|
||||||
use futures::Future;
|
use futures::Future;
|
||||||
use query::AllQuery;
|
use query::AllQuery;
|
||||||
@@ -646,7 +649,6 @@ mod tests {
|
|||||||
use schema;
|
use schema;
|
||||||
use schema::Cardinality;
|
use schema::Cardinality;
|
||||||
use schema::Document;
|
use schema::Document;
|
||||||
use schema::Facet;
|
|
||||||
use schema::IndexRecordOption;
|
use schema::IndexRecordOption;
|
||||||
use schema::IntOptions;
|
use schema::IntOptions;
|
||||||
use schema::Term;
|
use schema::Term;
|
||||||
@@ -658,13 +660,14 @@ mod tests {
|
|||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn test_index_merger_no_deletes() {
|
fn test_index_merger_no_deletes() {
|
||||||
let mut schema_builder = schema::Schema::builder();
|
let mut schema_builder = schema::SchemaBuilder::default();
|
||||||
let text_fieldtype = schema::TextOptions::default()
|
let text_fieldtype = schema::TextOptions::default()
|
||||||
.set_indexing_options(
|
.set_indexing_options(
|
||||||
TextFieldIndexing::default()
|
TextFieldIndexing::default()
|
||||||
.set_tokenizer("default")
|
.set_tokenizer("default")
|
||||||
.set_index_option(IndexRecordOption::WithFreqs),
|
.set_index_option(IndexRecordOption::WithFreqs),
|
||||||
).set_stored();
|
)
|
||||||
|
.set_stored();
|
||||||
let text_field = schema_builder.add_text_field("text", text_fieldtype);
|
let text_field = schema_builder.add_text_field("text", text_fieldtype);
|
||||||
let score_fieldtype = schema::IntOptions::default().set_fast(Cardinality::SingleValue);
|
let score_fieldtype = schema::IntOptions::default().set_fast(Cardinality::SingleValue);
|
||||||
let score_field = schema_builder.add_u64_field("score", score_fieldtype);
|
let score_field = schema_builder.add_u64_field("score", score_fieldtype);
|
||||||
@@ -742,68 +745,63 @@ mod tests {
|
|||||||
index.load_searchers().unwrap();
|
index.load_searchers().unwrap();
|
||||||
let searcher = index.searcher();
|
let searcher = index.searcher();
|
||||||
let get_doc_ids = |terms: Vec<Term>| {
|
let get_doc_ids = |terms: Vec<Term>| {
|
||||||
|
let mut collector = TestCollector::default();
|
||||||
let query = BooleanQuery::new_multiterms_query(terms);
|
let query = BooleanQuery::new_multiterms_query(terms);
|
||||||
let top_docs = searcher.search(&query, &TestCollector).unwrap();
|
assert!(searcher.search(&query, &mut collector).is_ok());
|
||||||
top_docs.docs().to_vec()
|
collector.docs()
|
||||||
};
|
};
|
||||||
{
|
{
|
||||||
assert_eq!(
|
assert_eq!(
|
||||||
get_doc_ids(vec![Term::from_field_text(text_field, "a")]),
|
get_doc_ids(vec![Term::from_field_text(text_field, "a")]),
|
||||||
vec![DocAddress(0, 1), DocAddress(0, 2), DocAddress(0, 4)]
|
vec![1, 2, 4]
|
||||||
);
|
);
|
||||||
assert_eq!(
|
assert_eq!(
|
||||||
get_doc_ids(vec![Term::from_field_text(text_field, "af")]),
|
get_doc_ids(vec![Term::from_field_text(text_field, "af")]),
|
||||||
vec![DocAddress(0, 0), DocAddress(0, 3)]
|
vec![0, 3]
|
||||||
);
|
);
|
||||||
assert_eq!(
|
assert_eq!(
|
||||||
get_doc_ids(vec![Term::from_field_text(text_field, "g")]),
|
get_doc_ids(vec![Term::from_field_text(text_field, "g")]),
|
||||||
vec![DocAddress(0, 4)]
|
vec![4]
|
||||||
);
|
);
|
||||||
assert_eq!(
|
assert_eq!(
|
||||||
get_doc_ids(vec![Term::from_field_text(text_field, "b")]),
|
get_doc_ids(vec![Term::from_field_text(text_field, "b")]),
|
||||||
vec![
|
vec![0, 1, 2, 3, 4]
|
||||||
DocAddress(0, 0),
|
|
||||||
DocAddress(0, 1),
|
|
||||||
DocAddress(0, 2),
|
|
||||||
DocAddress(0, 3),
|
|
||||||
DocAddress(0, 4)
|
|
||||||
]
|
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
{
|
{
|
||||||
let doc = searcher.doc(DocAddress(0, 0)).unwrap();
|
let doc = searcher.doc(&DocAddress(0, 0)).unwrap();
|
||||||
assert_eq!(doc.get_first(text_field).unwrap().text(), Some("af b"));
|
assert_eq!(doc.get_first(text_field).unwrap().text(), "af b");
|
||||||
}
|
}
|
||||||
{
|
{
|
||||||
let doc = searcher.doc(DocAddress(0, 1)).unwrap();
|
let doc = searcher.doc(&DocAddress(0, 1)).unwrap();
|
||||||
assert_eq!(doc.get_first(text_field).unwrap().text(), Some("a b c"));
|
assert_eq!(doc.get_first(text_field).unwrap().text(), "a b c");
|
||||||
}
|
}
|
||||||
{
|
{
|
||||||
let doc = searcher.doc(DocAddress(0, 2)).unwrap();
|
let doc = searcher.doc(&DocAddress(0, 2)).unwrap();
|
||||||
assert_eq!(doc.get_first(text_field).unwrap().text(), Some("a b c d"));
|
assert_eq!(doc.get_first(text_field).unwrap().text(), "a b c d");
|
||||||
}
|
}
|
||||||
{
|
{
|
||||||
let doc = searcher.doc(DocAddress(0, 3)).unwrap();
|
let doc = searcher.doc(&DocAddress(0, 3)).unwrap();
|
||||||
assert_eq!(doc.get_first(text_field).unwrap().text(), Some("af b"));
|
assert_eq!(doc.get_first(text_field).unwrap().text(), "af b");
|
||||||
}
|
}
|
||||||
{
|
{
|
||||||
let doc = searcher.doc(DocAddress(0, 4)).unwrap();
|
let doc = searcher.doc(&DocAddress(0, 4)).unwrap();
|
||||||
assert_eq!(doc.get_first(text_field).unwrap().text(), Some("a b c g"));
|
assert_eq!(doc.get_first(text_field).unwrap().text(), "a b c g");
|
||||||
}
|
}
|
||||||
{
|
{
|
||||||
let get_fast_vals = |terms: Vec<Term>| {
|
let get_fast_vals = |terms: Vec<Term>| {
|
||||||
let query = BooleanQuery::new_multiterms_query(terms);
|
let query = BooleanQuery::new_multiterms_query(terms);
|
||||||
searcher
|
let mut collector = FastFieldTestCollector::for_field(score_field);
|
||||||
.search(&query, &FastFieldTestCollector::for_field(score_field))
|
assert!(searcher.search(&query, &mut collector).is_ok());
|
||||||
.unwrap()
|
collector.vals()
|
||||||
};
|
};
|
||||||
let get_fast_vals_bytes = |terms: Vec<Term>| {
|
let get_fast_vals_bytes = |terms: Vec<Term>| {
|
||||||
let query = BooleanQuery::new_multiterms_query(terms);
|
let query = BooleanQuery::new_multiterms_query(terms);
|
||||||
|
let mut collector = BytesFastFieldTestCollector::for_field(bytes_score_field);
|
||||||
searcher
|
searcher
|
||||||
.search(
|
.search(&query, &mut collector)
|
||||||
&query,
|
.expect("failed to search");
|
||||||
&BytesFastFieldTestCollector::for_field(bytes_score_field),
|
collector.vals()
|
||||||
).expect("failed to search")
|
|
||||||
};
|
};
|
||||||
assert_eq!(
|
assert_eq!(
|
||||||
get_fast_vals(vec![Term::from_field_text(text_field, "a")]),
|
get_fast_vals(vec![Term::from_field_text(text_field, "a")]),
|
||||||
@@ -819,11 +817,12 @@ mod tests {
|
|||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn test_index_merger_with_deletes() {
|
fn test_index_merger_with_deletes() {
|
||||||
let mut schema_builder = schema::Schema::builder();
|
let mut schema_builder = schema::SchemaBuilder::default();
|
||||||
let text_fieldtype = schema::TextOptions::default()
|
let text_fieldtype = schema::TextOptions::default()
|
||||||
.set_indexing_options(
|
.set_indexing_options(
|
||||||
TextFieldIndexing::default().set_index_option(IndexRecordOption::WithFreqs),
|
TextFieldIndexing::default().set_index_option(IndexRecordOption::WithFreqs),
|
||||||
).set_stored();
|
)
|
||||||
|
.set_stored();
|
||||||
let text_field = schema_builder.add_text_field("text", text_fieldtype);
|
let text_field = schema_builder.add_text_field("text", text_fieldtype);
|
||||||
let score_fieldtype = schema::IntOptions::default().set_fast(Cardinality::SingleValue);
|
let score_fieldtype = schema::IntOptions::default().set_fast(Cardinality::SingleValue);
|
||||||
let score_field = schema_builder.add_u64_field("score", score_fieldtype);
|
let score_field = schema_builder.add_u64_field("score", score_fieldtype);
|
||||||
@@ -832,13 +831,21 @@ mod tests {
|
|||||||
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
|
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
|
||||||
|
|
||||||
let search_term = |searcher: &Searcher, term: Term| {
|
let search_term = |searcher: &Searcher, term: Term| {
|
||||||
let collector = FastFieldTestCollector::for_field(score_field);
|
let mut collector = FastFieldTestCollector::for_field(score_field);
|
||||||
let bytes_collector = BytesFastFieldTestCollector::for_field(bytes_score_field);
|
let mut bytes_collector = BytesFastFieldTestCollector::for_field(bytes_score_field);
|
||||||
let term_query = TermQuery::new(term, IndexRecordOption::Basic);
|
let term_query = TermQuery::new(term, IndexRecordOption::Basic);
|
||||||
let (scores, bytes) = searcher
|
|
||||||
.search(&term_query, &(collector, bytes_collector))
|
{
|
||||||
.unwrap();
|
let mut combined_collector =
|
||||||
let mut score_bytes = Cursor::new(bytes);
|
chain().push(&mut collector).push(&mut bytes_collector);
|
||||||
|
searcher
|
||||||
|
.search(&term_query, &mut combined_collector)
|
||||||
|
.unwrap();
|
||||||
|
}
|
||||||
|
|
||||||
|
let scores = collector.vals();
|
||||||
|
|
||||||
|
let mut score_bytes = Cursor::new(bytes_collector.vals());
|
||||||
for &score in &scores {
|
for &score in &scores {
|
||||||
assert_eq!(score as u32, score_bytes.read_u32::<BigEndian>().unwrap());
|
assert_eq!(score as u32, score_bytes.read_u32::<BigEndian>().unwrap());
|
||||||
}
|
}
|
||||||
@@ -919,10 +926,10 @@ mod tests {
|
|||||||
|
|
||||||
assert_eq!(searcher.segment_readers().len(), 2);
|
assert_eq!(searcher.segment_readers().len(), 2);
|
||||||
assert_eq!(searcher.num_docs(), 3);
|
assert_eq!(searcher.num_docs(), 3);
|
||||||
assert_eq!(searcher.segment_readers()[0].num_docs(), 2);
|
assert_eq!(searcher.segment_readers()[0].num_docs(), 1);
|
||||||
assert_eq!(searcher.segment_readers()[0].max_doc(), 4);
|
assert_eq!(searcher.segment_readers()[0].max_doc(), 3);
|
||||||
assert_eq!(searcher.segment_readers()[1].num_docs(), 1);
|
assert_eq!(searcher.segment_readers()[1].num_docs(), 2);
|
||||||
assert_eq!(searcher.segment_readers()[1].max_doc(), 3);
|
assert_eq!(searcher.segment_readers()[1].max_doc(), 4);
|
||||||
assert_eq!(
|
assert_eq!(
|
||||||
search_term(&searcher, Term::from_field_text(text_field, "a")),
|
search_term(&searcher, Term::from_field_text(text_field, "a")),
|
||||||
empty_vec
|
empty_vec
|
||||||
@@ -956,15 +963,15 @@ mod tests {
|
|||||||
.segment_reader(0)
|
.segment_reader(0)
|
||||||
.fast_field_reader::<u64>(score_field)
|
.fast_field_reader::<u64>(score_field)
|
||||||
.unwrap();
|
.unwrap();
|
||||||
assert_eq!(score_field_reader.min_value(), 4000);
|
assert_eq!(score_field_reader.min_value(), 1);
|
||||||
assert_eq!(score_field_reader.max_value(), 7000);
|
assert_eq!(score_field_reader.max_value(), 3);
|
||||||
|
|
||||||
let score_field_reader = searcher
|
let score_field_reader = searcher
|
||||||
.segment_reader(1)
|
.segment_reader(1)
|
||||||
.fast_field_reader::<u64>(score_field)
|
.fast_field_reader::<u64>(score_field)
|
||||||
.unwrap();
|
.unwrap();
|
||||||
assert_eq!(score_field_reader.min_value(), 1);
|
assert_eq!(score_field_reader.min_value(), 4000);
|
||||||
assert_eq!(score_field_reader.max_value(), 3);
|
assert_eq!(score_field_reader.max_value(), 7000);
|
||||||
}
|
}
|
||||||
{
|
{
|
||||||
// merging the segments
|
// merging the segments
|
||||||
@@ -1137,9 +1144,10 @@ mod tests {
|
|||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn test_merge_facets() {
|
fn test_merge_facets() {
|
||||||
let mut schema_builder = schema::Schema::builder();
|
let mut schema_builder = schema::SchemaBuilder::default();
|
||||||
let facet_field = schema_builder.add_facet_field("facet");
|
let facet_field = schema_builder.add_facet_field("facet");
|
||||||
let index = Index::create_in_ram(schema_builder.build());
|
let index = Index::create_in_ram(schema_builder.build());
|
||||||
|
use schema::Facet;
|
||||||
{
|
{
|
||||||
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
|
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
|
||||||
let index_doc = |index_writer: &mut IndexWriter, doc_facets: &[&str]| {
|
let index_doc = |index_writer: &mut IndexWriter, doc_facets: &[&str]| {
|
||||||
@@ -1168,16 +1176,20 @@ mod tests {
|
|||||||
index_doc(&mut index_writer, &["/top/e", "/top/f"]);
|
index_doc(&mut index_writer, &["/top/e", "/top/f"]);
|
||||||
index_writer.commit().expect("committed");
|
index_writer.commit().expect("committed");
|
||||||
}
|
}
|
||||||
|
|
||||||
index.load_searchers().unwrap();
|
index.load_searchers().unwrap();
|
||||||
let test_searcher = |expected_num_docs: usize, expected: &[(&str, u64)]| {
|
let test_searcher = |expected_num_docs: usize, expected: &[(&str, u64)]| {
|
||||||
let searcher = index.searcher();
|
let searcher = index.searcher();
|
||||||
let mut facet_collector = FacetCollector::for_field(facet_field);
|
let mut facet_collector = FacetCollector::for_field(facet_field);
|
||||||
facet_collector.add_facet(Facet::from("/top"));
|
facet_collector.add_facet(Facet::from("/top"));
|
||||||
let (count, facet_counts) = searcher
|
use collector::{CountCollector, MultiCollector};
|
||||||
.search(&AllQuery, &(Count, facet_collector))
|
let mut count_collector = CountCollector::default();
|
||||||
.unwrap();
|
{
|
||||||
assert_eq!(count, expected_num_docs);
|
let mut multi_collectors =
|
||||||
|
MultiCollector::from(vec![&mut count_collector, &mut facet_collector]);
|
||||||
|
searcher.search(&AllQuery, &mut multi_collectors).unwrap();
|
||||||
|
}
|
||||||
|
assert_eq!(count_collector.count(), expected_num_docs);
|
||||||
|
let facet_counts = facet_collector.harvest();
|
||||||
let facets: Vec<(String, u64)> = facet_counts
|
let facets: Vec<(String, u64)> = facet_counts
|
||||||
.get("/top")
|
.get("/top")
|
||||||
.map(|(facet, count)| (facet.to_string(), count))
|
.map(|(facet, count)| (facet.to_string(), count))
|
||||||
@@ -1201,6 +1213,7 @@ mod tests {
|
|||||||
("/top/f", 1),
|
("/top/f", 1),
|
||||||
],
|
],
|
||||||
);
|
);
|
||||||
|
|
||||||
// Merging the segments
|
// Merging the segments
|
||||||
{
|
{
|
||||||
let segment_ids = index
|
let segment_ids = index
|
||||||
@@ -1213,6 +1226,7 @@ mod tests {
|
|||||||
.wait()
|
.wait()
|
||||||
.expect("Merging failed");
|
.expect("Merging failed");
|
||||||
index_writer.wait_merging_threads().unwrap();
|
index_writer.wait_merging_threads().unwrap();
|
||||||
|
|
||||||
index.load_searchers().unwrap();
|
index.load_searchers().unwrap();
|
||||||
test_searcher(
|
test_searcher(
|
||||||
11,
|
11,
|
||||||
@@ -1251,7 +1265,7 @@ mod tests {
|
|||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn test_merge_multivalued_int_fields_all_deleted() {
|
fn test_merge_multivalued_int_fields_all_deleted() {
|
||||||
let mut schema_builder = schema::Schema::builder();
|
let mut schema_builder = schema::SchemaBuilder::default();
|
||||||
let int_options = IntOptions::default()
|
let int_options = IntOptions::default()
|
||||||
.set_fast(Cardinality::MultiValues)
|
.set_fast(Cardinality::MultiValues)
|
||||||
.set_indexed();
|
.set_indexed();
|
||||||
@@ -1292,7 +1306,7 @@ mod tests {
|
|||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn test_merge_multivalued_int_fields() {
|
fn test_merge_multivalued_int_fields() {
|
||||||
let mut schema_builder = schema::Schema::builder();
|
let mut schema_builder = schema::SchemaBuilder::default();
|
||||||
let int_options = IntOptions::default()
|
let int_options = IntOptions::default()
|
||||||
.set_fast(Cardinality::MultiValues)
|
.set_fast(Cardinality::MultiValues)
|
||||||
.set_indexed();
|
.set_indexed();
|
||||||
@@ -1358,30 +1372,21 @@ mod tests {
|
|||||||
assert_eq!(&vals, &[17]);
|
assert_eq!(&vals, &[17]);
|
||||||
}
|
}
|
||||||
|
|
||||||
println!(
|
|
||||||
"{:?}",
|
|
||||||
searcher
|
|
||||||
.segment_readers()
|
|
||||||
.iter()
|
|
||||||
.map(|reader| reader.max_doc())
|
|
||||||
.collect::<Vec<_>>()
|
|
||||||
);
|
|
||||||
|
|
||||||
{
|
{
|
||||||
let segment = searcher.segment_reader(1u32);
|
let segment = searcher.segment_reader(1u32);
|
||||||
let ff_reader = segment.multi_fast_field_reader(int_field).unwrap();
|
let ff_reader = segment.multi_fast_field_reader(int_field).unwrap();
|
||||||
ff_reader.get_vals(0, &mut vals);
|
ff_reader.get_vals(0, &mut vals);
|
||||||
assert_eq!(&vals, &[28, 27]);
|
assert_eq!(&vals, &[20]);
|
||||||
|
|
||||||
ff_reader.get_vals(1, &mut vals);
|
|
||||||
assert_eq!(&vals, &[1_000]);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
{
|
{
|
||||||
let segment = searcher.segment_reader(2u32);
|
let segment = searcher.segment_reader(2u32);
|
||||||
let ff_reader = segment.multi_fast_field_reader(int_field).unwrap();
|
let ff_reader = segment.multi_fast_field_reader(int_field).unwrap();
|
||||||
ff_reader.get_vals(0, &mut vals);
|
ff_reader.get_vals(0, &mut vals);
|
||||||
assert_eq!(&vals, &[20]);
|
assert_eq!(&vals, &[28, 27]);
|
||||||
|
|
||||||
|
ff_reader.get_vals(1, &mut vals);
|
||||||
|
assert_eq!(&vals, &[1_000]);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Merging the segments
|
// Merging the segments
|
||||||
@@ -1402,14 +1407,6 @@ mod tests {
|
|||||||
|
|
||||||
{
|
{
|
||||||
let searcher = index.searcher();
|
let searcher = index.searcher();
|
||||||
println!(
|
|
||||||
"{:?}",
|
|
||||||
searcher
|
|
||||||
.segment_readers()
|
|
||||||
.iter()
|
|
||||||
.map(|reader| reader.max_doc())
|
|
||||||
.collect::<Vec<_>>()
|
|
||||||
);
|
|
||||||
let segment = searcher.segment_reader(0u32);
|
let segment = searcher.segment_reader(0u32);
|
||||||
let ff_reader = segment.multi_fast_field_reader(int_field).unwrap();
|
let ff_reader = segment.multi_fast_field_reader(int_field).unwrap();
|
||||||
|
|
||||||
@@ -1435,13 +1432,13 @@ mod tests {
|
|||||||
assert_eq!(&vals, &[17]);
|
assert_eq!(&vals, &[17]);
|
||||||
|
|
||||||
ff_reader.get_vals(7, &mut vals);
|
ff_reader.get_vals(7, &mut vals);
|
||||||
assert_eq!(&vals, &[28, 27]);
|
assert_eq!(&vals, &[20]);
|
||||||
|
|
||||||
ff_reader.get_vals(8, &mut vals);
|
ff_reader.get_vals(8, &mut vals);
|
||||||
assert_eq!(&vals, &[1_000]);
|
assert_eq!(&vals, &[28, 27]);
|
||||||
|
|
||||||
ff_reader.get_vals(9, &mut vals);
|
ff_reader.get_vals(9, &mut vals);
|
||||||
assert_eq!(&vals, &[20]);
|
assert_eq!(&vals, &[1_000]);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -16,8 +16,6 @@ mod segment_writer;
|
|||||||
mod stamper;
|
mod stamper;
|
||||||
|
|
||||||
pub(crate) use self::directory_lock::DirectoryLock;
|
pub(crate) use self::directory_lock::DirectoryLock;
|
||||||
pub use self::directory_lock::LockType;
|
|
||||||
|
|
||||||
pub use self::index_writer::IndexWriter;
|
pub use self::index_writer::IndexWriter;
|
||||||
pub use self::log_merge_policy::LogMergePolicy;
|
pub use self::log_merge_policy::LogMergePolicy;
|
||||||
pub use self::merge_policy::{MergeCandidate, MergePolicy, NoMergePolicy};
|
pub use self::merge_policy::{MergeCandidate, MergePolicy, NoMergePolicy};
|
||||||
|
|||||||
@@ -11,8 +11,8 @@ pub enum SegmentState {
|
|||||||
}
|
}
|
||||||
|
|
||||||
impl SegmentState {
|
impl SegmentState {
|
||||||
pub fn letter_code(self) -> char {
|
pub fn letter_code(&self) -> char {
|
||||||
match self {
|
match *self {
|
||||||
SegmentState::InMerge => 'M',
|
SegmentState::InMerge => 'M',
|
||||||
SegmentState::Ready => 'R',
|
SegmentState::Ready => 'R',
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,7 +1,7 @@
|
|||||||
use super::segment_register::SegmentRegister;
|
use super::segment_register::SegmentRegister;
|
||||||
use core::SegmentId;
|
use core::SegmentId;
|
||||||
use core::SegmentMeta;
|
use core::SegmentMeta;
|
||||||
use core::META_FILEPATH;
|
use core::{LOCKFILE_FILEPATH, META_FILEPATH};
|
||||||
use error::TantivyError;
|
use error::TantivyError;
|
||||||
use indexer::delete_queue::DeleteCursor;
|
use indexer::delete_queue::DeleteCursor;
|
||||||
use indexer::SegmentEntry;
|
use indexer::SegmentEntry;
|
||||||
@@ -78,13 +78,10 @@ impl SegmentManager {
|
|||||||
registers_lock.committed.len() + registers_lock.uncommitted.len()
|
registers_lock.committed.len() + registers_lock.uncommitted.len()
|
||||||
}
|
}
|
||||||
|
|
||||||
/// List the files that are useful to the index.
|
|
||||||
///
|
|
||||||
/// This does not include lock files, or files that are obsolete
|
|
||||||
/// but have not yet been deleted by the garbage collector.
|
|
||||||
pub fn list_files(&self) -> HashSet<PathBuf> {
|
pub fn list_files(&self) -> HashSet<PathBuf> {
|
||||||
let mut files = HashSet::new();
|
let mut files = HashSet::new();
|
||||||
files.insert(META_FILEPATH.clone());
|
files.insert(META_FILEPATH.clone());
|
||||||
|
files.insert(LOCKFILE_FILEPATH.clone());
|
||||||
for segment_meta in SegmentMeta::all() {
|
for segment_meta in SegmentMeta::all() {
|
||||||
files.extend(segment_meta.list_files());
|
files.extend(segment_meta.list_files());
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -51,8 +51,7 @@ impl SegmentRegister {
|
|||||||
}
|
}
|
||||||
|
|
||||||
pub fn segment_metas(&self) -> Vec<SegmentMeta> {
|
pub fn segment_metas(&self) -> Vec<SegmentMeta> {
|
||||||
let mut segment_ids: Vec<SegmentMeta> = self
|
let mut segment_ids: Vec<SegmentMeta> = self.segment_states
|
||||||
.segment_states
|
|
||||||
.values()
|
.values()
|
||||||
.map(|segment_entry| segment_entry.meta().clone())
|
.map(|segment_entry| segment_entry.meta().clone())
|
||||||
.collect();
|
.collect();
|
||||||
|
|||||||
@@ -72,7 +72,7 @@ pub fn save_metas(
|
|||||||
payload,
|
payload,
|
||||||
};
|
};
|
||||||
let mut buffer = serde_json::to_vec_pretty(&metas)?;
|
let mut buffer = serde_json::to_vec_pretty(&metas)?;
|
||||||
writeln!(&mut buffer)?;
|
write!(&mut buffer, "\n")?;
|
||||||
directory.atomic_write(&META_FILEPATH, &buffer[..])?;
|
directory.atomic_write(&META_FILEPATH, &buffer[..])?;
|
||||||
debug!("Saved metas {:?}", serde_json::to_string_pretty(&metas));
|
debug!("Saved metas {:?}", serde_json::to_string_pretty(&metas));
|
||||||
Ok(())
|
Ok(())
|
||||||
@@ -227,24 +227,8 @@ impl SegmentUpdater {
|
|||||||
if self.is_alive() {
|
if self.is_alive() {
|
||||||
let index = &self.0.index;
|
let index = &self.0.index;
|
||||||
let directory = index.directory();
|
let directory = index.directory();
|
||||||
let mut commited_segment_metas = self.0.segment_manager.committed_segment_metas();
|
|
||||||
|
|
||||||
// We sort segment_readers by number of documents.
|
|
||||||
// This is an heuristic to make multithreading more efficient.
|
|
||||||
//
|
|
||||||
// This is not done at the searcher level because I had a strange
|
|
||||||
// use case in which I was dealing with a large static index,
|
|
||||||
// dispatched over 5 SSD drives.
|
|
||||||
//
|
|
||||||
// A `UnionDirectory` makes it possible to read from these
|
|
||||||
// 5 different drives and creates a meta.json on the fly.
|
|
||||||
// In order to optimize the throughput, it creates a lasagna of segments
|
|
||||||
// from the different drives.
|
|
||||||
//
|
|
||||||
// Segment 1 from disk 1, Segment 1 from disk 2, etc.
|
|
||||||
commited_segment_metas.sort_by_key(|segment_meta| -(segment_meta.max_doc() as i32));
|
|
||||||
save_metas(
|
save_metas(
|
||||||
commited_segment_metas,
|
self.0.segment_manager.committed_segment_metas(),
|
||||||
index.schema(),
|
index.schema(),
|
||||||
opstamp,
|
opstamp,
|
||||||
commit_message,
|
commit_message,
|
||||||
@@ -352,7 +336,8 @@ impl SegmentUpdater {
|
|||||||
.unwrap()
|
.unwrap()
|
||||||
.remove(&merging_thread_id);
|
.remove(&merging_thread_id);
|
||||||
Ok(())
|
Ok(())
|
||||||
}).expect("Failed to spawn a thread.");
|
})
|
||||||
|
.expect("Failed to spawn a thread.");
|
||||||
self.0
|
self.0
|
||||||
.merging_threads
|
.merging_threads
|
||||||
.write()
|
.write()
|
||||||
@@ -500,7 +485,7 @@ mod tests {
|
|||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn test_delete_during_merge() {
|
fn test_delete_during_merge() {
|
||||||
let mut schema_builder = Schema::builder();
|
let mut schema_builder = SchemaBuilder::default();
|
||||||
let text_field = schema_builder.add_text_field("text", TEXT);
|
let text_field = schema_builder.add_text_field("text", TEXT);
|
||||||
let schema = schema_builder.build();
|
let schema = schema_builder.build();
|
||||||
|
|
||||||
|
|||||||
@@ -49,20 +49,20 @@ impl SegmentWriter {
|
|||||||
) -> Result<SegmentWriter> {
|
) -> Result<SegmentWriter> {
|
||||||
let segment_serializer = SegmentSerializer::for_segment(&mut segment)?;
|
let segment_serializer = SegmentSerializer::for_segment(&mut segment)?;
|
||||||
let multifield_postings = MultiFieldPostingsWriter::new(schema, table_bits);
|
let multifield_postings = MultiFieldPostingsWriter::new(schema, table_bits);
|
||||||
let tokenizers =
|
let tokenizers = schema
|
||||||
schema
|
.fields()
|
||||||
.fields()
|
.iter()
|
||||||
.iter()
|
.map(|field_entry| field_entry.field_type())
|
||||||
.map(|field_entry| field_entry.field_type())
|
.map(|field_type| match *field_type {
|
||||||
.map(|field_type| match *field_type {
|
FieldType::Str(ref text_options) => text_options.get_indexing_options().and_then(
|
||||||
FieldType::Str(ref text_options) => text_options
|
|text_index_option| {
|
||||||
.get_indexing_options()
|
let tokenizer_name = &text_index_option.tokenizer();
|
||||||
.and_then(|text_index_option| {
|
segment.index().tokenizers().get(tokenizer_name)
|
||||||
let tokenizer_name = &text_index_option.tokenizer();
|
},
|
||||||
segment.index().tokenizers().get(tokenizer_name)
|
),
|
||||||
}),
|
_ => None,
|
||||||
_ => None,
|
})
|
||||||
}).collect();
|
.collect();
|
||||||
Ok(SegmentWriter {
|
Ok(SegmentWriter {
|
||||||
max_doc: 0,
|
max_doc: 0,
|
||||||
multifield_postings,
|
multifield_postings,
|
||||||
@@ -117,7 +117,8 @@ impl SegmentWriter {
|
|||||||
_ => {
|
_ => {
|
||||||
panic!("Expected hierarchical facet");
|
panic!("Expected hierarchical facet");
|
||||||
}
|
}
|
||||||
}).collect();
|
})
|
||||||
|
.collect();
|
||||||
let mut term = Term::for_field(field); // we set the Term
|
let mut term = Term::for_field(field); // we set the Term
|
||||||
for facet_bytes in facets {
|
for facet_bytes in facets {
|
||||||
let mut unordered_term_id_opt = None;
|
let mut unordered_term_id_opt = None;
|
||||||
@@ -145,7 +146,8 @@ impl SegmentWriter {
|
|||||||
.flat_map(|field_value| match *field_value.value() {
|
.flat_map(|field_value| match *field_value.value() {
|
||||||
Value::Str(ref text) => Some(text.as_str()),
|
Value::Str(ref text) => Some(text.as_str()),
|
||||||
_ => None,
|
_ => None,
|
||||||
}).collect();
|
})
|
||||||
|
.collect();
|
||||||
if texts.is_empty() {
|
if texts.is_empty() {
|
||||||
0
|
0
|
||||||
} else {
|
} else {
|
||||||
|
|||||||
src/lib.rs | 260 changed lines | Executable file → Normal file
@@ -1,8 +1,11 @@
 #![doc(html_logo_url = "http://fulmicoton.com/tantivy-logo/tantivy-logo.png")]
+#![cfg_attr(feature = "cargo-clippy", allow(module_inception))]
+#![cfg_attr(feature = "cargo-clippy", allow(inline_always))]
 #![cfg_attr(all(feature = "unstable", test), feature(test))]
-#![cfg_attr(feature = "cargo-clippy", feature(tool_lints))]
-#![cfg_attr(feature = "cargo-clippy", allow(clippy::module_inception))]
 #![doc(test(attr(allow(unused_variables), deny(warnings))))]
+#![allow(unknown_lints)]
+#![allow(new_without_default)]
+#![allow(decimal_literal_representation)]
 #![warn(missing_docs)]
 #![recursion_limit = "80"]

@@ -24,8 +27,7 @@
 //! # use tempdir::TempDir;
 //! # use tantivy::Index;
 //! # use tantivy::schema::*;
-//! # use tantivy::{Score, DocAddress};
-//! # use tantivy::collector::TopDocs;
+//! # use tantivy::collector::TopCollector;
 //! # use tantivy::query::QueryParser;
 //! #
 //! # fn main() {
@@ -47,7 +49,7 @@
 //! // in a compressed, row-oriented key-value store.
 //! // This store is useful to reconstruct the
 //! // documents that were selected during the search phase.
-//! let mut schema_builder = Schema::builder();
+//! let mut schema_builder = SchemaBuilder::default();
 //! let title = schema_builder.add_text_field("title", TEXT | STORED);
 //! let body = schema_builder.add_text_field("body", TEXT);
 //! let schema = schema_builder.build();
@@ -87,14 +89,14 @@
 //! // A ticket has been opened regarding this problem.
 //! let query = query_parser.parse_query("sea whale")?;
 //!
-//! // Perform search.
-//! // `topdocs` contains the 10 most relevant doc ids, sorted by decreasing scores...
-//! let top_docs: Vec<(Score, DocAddress)> =
-//!     searcher.search(&query, &TopDocs::with_limit(10))?;
+//! let mut top_collector = TopCollector::with_limit(10);
+//! searcher.search(&*query, &mut top_collector)?;
 //!
-//! for (_score, doc_address) in top_docs {
-//!     // Retrieve the actual content of documents given its `doc_address`.
-//!     let retrieved_doc = searcher.doc(doc_address)?;
+//! // Our top collector now contains the 10
+//! // most relevant doc ids...
+//! let doc_addresses = top_collector.docs();
+//! for doc_address in doc_addresses {
+//!     let retrieved_doc = searcher.doc(&doc_address)?;
 //!     println!("{}", schema.to_json(&retrieved_doc));
 //! }
 //!
@@ -130,17 +132,17 @@ extern crate base64;
 extern crate bit_set;
 extern crate bitpacking;
 extern crate byteorder;
-extern crate scoped_pool;

+#[macro_use]
 extern crate combine;

 extern crate crossbeam;
+extern crate crossbeam_channel;
 extern crate fnv;
 extern crate fst;
 extern crate fst_regex;
 extern crate futures;
 extern crate futures_cpupool;
-extern crate htmlescape;
 extern crate itertools;
 extern crate levenshtein_automata;
 extern crate num_cpus;
@@ -163,18 +165,13 @@ extern crate winapi;
 #[cfg(test)]
 extern crate rand;

-#[cfg(test)]
-#[macro_use]
-extern crate maplit;
-
 #[cfg(all(test, feature = "unstable"))]
 extern crate test;

-#[macro_use]
-extern crate downcast;
+extern crate tinysegmenter;

 #[macro_use]
-extern crate fail;
+extern crate downcast;

 #[cfg(test)]
 mod functional_test;
@@ -184,10 +181,7 @@ mod macros;

 pub use error::TantivyError;

-#[deprecated(
-    since = "0.7.0",
-    note = "please use `tantivy::TantivyError` instead"
-)]
+#[deprecated(since="0.7.0", note="please use `tantivy::TantivyError` instead")]
 pub use error::TantivyError as Error;

 extern crate census;
@@ -212,13 +206,9 @@ pub(crate) mod positions;
 pub mod postings;
 pub mod query;
 pub mod schema;
-pub mod space_usage;
 pub mod store;
 pub mod termdict;

-mod snippet;
-pub use self::snippet::{SnippetGenerator, Snippet};
-
 mod docset;
 pub use self::docset::{DocSet, SkipResult};

@@ -271,12 +261,12 @@ impl DocAddress {
     /// The segment ordinal is an id identifying the segment
     /// hosting the document. It is only meaningful, in the context
     /// of a searcher.
-    pub fn segment_ord(self) -> SegmentLocalId {
+    pub fn segment_ord(&self) -> SegmentLocalId {
         self.0
     }

     /// Return the segment local `DocId`
-    pub fn doc(self) -> DocId {
+    pub fn doc(&self) -> DocId {
         self.1
     }
 }
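As the accessors above show, `DocAddress` is a plain pair of a segment ordinal and a segment-local doc id. A standalone sketch that mirrors those accessors (the type aliases are included here only so the snippet compiles without tantivy):

```rust
// DocAddress pairs a segment ordinal with a doc id local to that segment.
type SegmentLocalId = u32;
type DocId = u32;

#[derive(Clone, Copy, Debug, PartialEq)]
struct DocAddress(SegmentLocalId, DocId);

impl DocAddress {
    fn segment_ord(&self) -> SegmentLocalId {
        self.0
    }
    fn doc(&self) -> DocId {
        self.1
    }
}

fn main() {
    // The tests in this diff build addresses the same way, e.g. DocAddress(0, 4).
    let addr = DocAddress(0, 4);
    assert_eq!(addr.segment_ord(), 0);
    assert_eq!(addr.doc(), 4);
    println!("segment {} / doc {}", addr.segment_ord(), addr.doc());
}
```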
@@ -300,11 +290,9 @@ mod tests {
|
|||||||
use docset::DocSet;
|
use docset::DocSet;
|
||||||
use query::BooleanQuery;
|
use query::BooleanQuery;
|
||||||
use rand::distributions::Bernoulli;
|
use rand::distributions::Bernoulli;
|
||||||
use rand::distributions::Uniform;
|
use rand::distributions::Range;
|
||||||
use rand::rngs::StdRng;
|
use rand::{Rng, SeedableRng, XorShiftRng};
|
||||||
use rand::{Rng, SeedableRng};
|
|
||||||
use schema::*;
|
use schema::*;
|
||||||
use DocAddress;
|
|
||||||
use Index;
|
use Index;
|
||||||
use IndexWriter;
|
use IndexWriter;
|
||||||
use Postings;
|
use Postings;
|
||||||
@@ -323,15 +311,16 @@ mod tests {
|
|||||||
}
|
}
|
||||||
|
|
||||||
pub fn generate_nonunique_unsorted(max_value: u32, n_elems: usize) -> Vec<u32> {
|
pub fn generate_nonunique_unsorted(max_value: u32, n_elems: usize) -> Vec<u32> {
|
||||||
let seed: [u8; 32] = [1; 32];
|
let seed: [u8; 16] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15];
|
||||||
StdRng::from_seed(seed)
|
XorShiftRng::from_seed(seed)
|
||||||
.sample_iter(&Uniform::new(0u32, max_value))
|
.sample_iter(&Range::new(0u32, max_value))
|
||||||
.take(n_elems)
|
.take(n_elems)
|
||||||
.collect::<Vec<u32>>()
|
.collect::<Vec<u32>>()
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn sample_with_seed(n: u32, ratio: f64, seed_val: u8) -> Vec<u32> {
|
pub fn sample_with_seed(n: u32, ratio: f64, seed_val: u8) -> Vec<u32> {
|
||||||
StdRng::from_seed([seed_val; 32])
|
let seed: [u8; 16] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, seed_val];
|
||||||
|
XorShiftRng::from_seed(seed)
|
||||||
.sample_iter(&Bernoulli::new(ratio))
|
.sample_iter(&Bernoulli::new(ratio))
|
||||||
.take(n as usize)
|
.take(n as usize)
|
||||||
.enumerate()
|
.enumerate()
|
||||||
@@ -346,7 +335,7 @@ mod tests {
     #[test]
     #[cfg(feature = "mmap")]
     fn test_indexing() {
-        let mut schema_builder = Schema::builder();
+        let mut schema_builder = SchemaBuilder::default();
         let text_field = schema_builder.add_text_field("text", TEXT);
         let schema = schema_builder.build();
         let index = Index::create_from_tempdir(schema).unwrap();
@@ -371,7 +360,7 @@ mod tests {
 
     #[test]
     fn test_docfreq1() {
-        let mut schema_builder = Schema::builder();
+        let mut schema_builder = SchemaBuilder::default();
         let text_field = schema_builder.add_text_field("text", TEXT);
         let index = Index::create_in_ram(schema_builder.build());
         let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
@@ -411,7 +400,7 @@ mod tests {
 
     #[test]
     fn test_fieldnorm_no_docs_with_field() {
-        let mut schema_builder = Schema::builder();
+        let mut schema_builder = SchemaBuilder::default();
         let title_field = schema_builder.add_text_field("title", TEXT);
         let text_field = schema_builder.add_text_field("text", TEXT);
         let index = Index::create_in_ram(schema_builder.build());
@@ -440,7 +429,7 @@ mod tests {
 
     #[test]
     fn test_fieldnorm() {
-        let mut schema_builder = Schema::builder();
+        let mut schema_builder = SchemaBuilder::default();
         let text_field = schema_builder.add_text_field("text", TEXT);
         let index = Index::create_in_ram(schema_builder.build());
         {
@@ -481,7 +470,7 @@ mod tests {
 
     #[test]
     fn test_delete_postings1() {
-        let mut schema_builder = Schema::builder();
+        let mut schema_builder = SchemaBuilder::default();
         let text_field = schema_builder.add_text_field("text", TEXT);
         let term_abcd = Term::from_field_text(text_field, "abcd");
         let term_a = Term::from_field_text(text_field, "a");
@@ -492,21 +481,42 @@ mod tests {
         {
             // writing the segment
             let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
-            // 0
-            index_writer.add_document(doc!(text_field=>"a b"));
-            // 1
-            index_writer.add_document(doc!(text_field=>" a c"));
-            // 2
-            index_writer.add_document(doc!(text_field=>" b c"));
-            // 3
-            index_writer.add_document(doc!(text_field=>" b d"));
-            index_writer.delete_term(Term::from_field_text(text_field, "c"));
-            index_writer.delete_term(Term::from_field_text(text_field, "a"));
-            // 4
-            index_writer.add_document(doc!(text_field=>" b c"));
-            // 5
-            index_writer.add_document(doc!(text_field=>" a"));
+            {
+                // 0
+                let doc = doc!(text_field=>"a b");
+                index_writer.add_document(doc);
+            }
+            {
+                // 1
+                let doc = doc!(text_field=>" a c");
+                index_writer.add_document(doc);
+            }
+            {
+                // 2
+                let doc = doc!(text_field=>" b c");
+                index_writer.add_document(doc);
+            }
+            {
+                // 3
+                let doc = doc!(text_field=>" b d");
+                index_writer.add_document(doc);
+            }
+            {
+                index_writer.delete_term(Term::from_field_text(text_field, "c"));
+            }
+            {
+                index_writer.delete_term(Term::from_field_text(text_field, "a"));
+            }
+            {
+                // 4
+                let doc = doc!(text_field=>" b c");
+                index_writer.add_document(doc);
+            }
+            {
+                // 5
+                let doc = doc!(text_field=>" a");
+                index_writer.add_document(doc);
+            }
             index_writer.commit().unwrap();
         }
         {
@@ -541,10 +551,15 @@ mod tests {
         {
             // writing the segment
             let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
-            // 0
-            index_writer.add_document(doc!(text_field=>"a b"));
-            // 1
-            index_writer.delete_term(Term::from_field_text(text_field, "c"));
+            {
+                // 0
+                let doc = doc!(text_field=>"a b");
+                index_writer.add_document(doc);
+            }
+            {
+                // 1
+                index_writer.delete_term(Term::from_field_text(text_field, "c"));
+            }
             index_writer.rollback().unwrap();
         }
         {
@@ -580,8 +595,13 @@ mod tests {
         {
             // writing the segment
             let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
-            index_writer.add_document(doc!(text_field=>"a b"));
-            index_writer.delete_term(Term::from_field_text(text_field, "c"));
+            {
+                let doc = doc!(text_field=>"a b");
+                index_writer.add_document(doc);
+            }
+            {
+                index_writer.delete_term(Term::from_field_text(text_field, "c"));
+            }
             index_writer.rollback().unwrap();
             index_writer.delete_term(Term::from_field_text(text_field, "a"));
             index_writer.commit().unwrap();
@@ -625,7 +645,7 @@ mod tests {
 
     #[test]
     fn test_indexed_u64() {
-        let mut schema_builder = Schema::builder();
+        let mut schema_builder = SchemaBuilder::default();
         let field = schema_builder.add_u64_field("value", INT_INDEXED);
         let schema = schema_builder.build();
 
@@ -648,7 +668,7 @@ mod tests {
 
     #[test]
     fn test_indexed_i64() {
-        let mut schema_builder = Schema::builder();
+        let mut schema_builder = SchemaBuilder::default();
         let value_field = schema_builder.add_i64_field("value", INT_INDEXED);
         let schema = schema_builder.build();
 
@@ -672,7 +692,7 @@ mod tests {
 
     #[test]
     fn test_indexedfield_not_in_documents() {
-        let mut schema_builder = Schema::builder();
+        let mut schema_builder = SchemaBuilder::default();
         let text_field = schema_builder.add_text_field("text", TEXT);
         let absent_field = schema_builder.add_text_field("text", TEXT);
         let schema = schema_builder.build();
@@ -688,7 +708,7 @@ mod tests {
 
     #[test]
     fn test_delete_postings2() {
-        let mut schema_builder = Schema::builder();
+        let mut schema_builder = SchemaBuilder::default();
         let text_field = schema_builder.add_text_field("text", TEXT);
         let schema = schema_builder.build();
         let index = Index::create_in_ram(schema);
@@ -724,7 +744,7 @@ mod tests {
 
     #[test]
     fn test_termfreq() {
-        let mut schema_builder = Schema::builder();
+        let mut schema_builder = SchemaBuilder::default();
         let text_field = schema_builder.add_text_field("text", TEXT);
         let schema = schema_builder.build();
         let index = Index::create_in_ram(schema);
@@ -761,7 +781,7 @@ mod tests {
 
     #[test]
     fn test_searcher_1() {
-        let mut schema_builder = Schema::builder();
+        let mut schema_builder = SchemaBuilder::default();
         let text_field = schema_builder.add_text_field("text", TEXT);
         let schema = schema_builder.build();
         let index = Index::create_in_ram(schema);
@@ -769,9 +789,18 @@ mod tests {
         {
             // writing the segment
             let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
-            index_writer.add_document(doc!(text_field=>"af af af b"));
-            index_writer.add_document(doc!(text_field=>"a b c"));
-            index_writer.add_document(doc!(text_field=>"a b c d"));
+            {
+                let doc = doc!(text_field=>"af af af b");
+                index_writer.add_document(doc);
+            }
+            {
+                let doc = doc!(text_field=>"a b c");
+                index_writer.add_document(doc);
+            }
+            {
+                let doc = doc!(text_field=>"a b c d");
+                index_writer.add_document(doc);
+            }
             index_writer.commit().unwrap();
         }
         {
@@ -779,42 +808,55 @@ mod tests {
             let searcher = index.searcher();
             let get_doc_ids = |terms: Vec<Term>| {
                 let query = BooleanQuery::new_multiterms_query(terms);
-                let topdocs = searcher.search(&query, &TestCollector).unwrap();
-                topdocs.docs().to_vec()
+                let mut collector = TestCollector::default();
+                assert!(searcher.search(&query, &mut collector).is_ok());
+                collector.docs()
             };
-            assert_eq!(
-                get_doc_ids(vec![Term::from_field_text(text_field, "a")]),
-                vec![DocAddress(0, 1), DocAddress(0, 2)]
-            );
-            assert_eq!(
-                get_doc_ids(vec![Term::from_field_text(text_field, "af")]),
-                vec![DocAddress(0, 0)]
-            );
-            assert_eq!(
-                get_doc_ids(vec![Term::from_field_text(text_field, "b")]),
-                vec![DocAddress(0, 0), DocAddress(0, 1), DocAddress(0, 2)]
-            );
-            assert_eq!(
-                get_doc_ids(vec![Term::from_field_text(text_field, "c")]),
-                vec![DocAddress(0, 1), DocAddress(0, 2)]
-            );
-            assert_eq!(
-                get_doc_ids(vec![Term::from_field_text(text_field, "d")]),
-                vec![DocAddress(0, 2)]
-            );
-            assert_eq!(
-                get_doc_ids(vec![
-                    Term::from_field_text(text_field, "b"),
-                    Term::from_field_text(text_field, "a"),
-                ]),
-                vec![DocAddress(0, 0), DocAddress(0, 1), DocAddress(0, 2)]
-            );
+            {
+                assert_eq!(
+                    get_doc_ids(vec![Term::from_field_text(text_field, "a")]),
+                    vec![1, 2]
+                );
+            }
+            {
+                assert_eq!(
+                    get_doc_ids(vec![Term::from_field_text(text_field, "af")]),
+                    vec![0]
+                );
+            }
+            {
+                assert_eq!(
+                    get_doc_ids(vec![Term::from_field_text(text_field, "b")]),
+                    vec![0, 1, 2]
+                );
+            }
+            {
+                assert_eq!(
+                    get_doc_ids(vec![Term::from_field_text(text_field, "c")]),
+                    vec![1, 2]
+                );
+            }
+            {
+                assert_eq!(
+                    get_doc_ids(vec![Term::from_field_text(text_field, "d")]),
+                    vec![2]
+                );
+            }
+            {
+                assert_eq!(
+                    get_doc_ids(vec![
+                        Term::from_field_text(text_field, "b"),
+                        Term::from_field_text(text_field, "a"),
+                    ]),
+                    vec![0, 1, 2]
+                );
+            }
         }
     }
 
     #[test]
     fn test_searcher_2() {
-        let mut schema_builder = Schema::builder();
+        let mut schema_builder = SchemaBuilder::default();
         let text_field = schema_builder.add_text_field("text", TEXT);
         let schema = schema_builder.build();
         let index = Index::create_in_ram(schema);
@@ -841,7 +883,7 @@ mod tests {
 
     #[test]
     fn test_doc_macro() {
-        let mut schema_builder = Schema::builder();
+        let mut schema_builder = SchemaBuilder::default();
         let text_field = schema_builder.add_text_field("text", TEXT);
         let other_text_field = schema_builder.add_text_field("text2", TEXT);
         let document = doc!(text_field => "tantivy",
@@ -850,16 +892,16 @@ mod tests {
         assert_eq!(document.len(), 3);
         let values = document.get_all(text_field);
         assert_eq!(values.len(), 2);
-        assert_eq!(values[0].text(), Some("tantivy"));
-        assert_eq!(values[1].text(), Some("some other value"));
+        assert_eq!(values[0].text(), "tantivy");
+        assert_eq!(values[1].text(), "some other value");
         let values = document.get_all(other_text_field);
         assert_eq!(values.len(), 1);
-        assert_eq!(values[0].text(), Some("short"));
+        assert_eq!(values[0].text(), "short");
     }
 
     #[test]
     fn test_wrong_fast_field_type() {
-        let mut schema_builder = Schema::builder();
+        let mut schema_builder = SchemaBuilder::default();
         let fast_field_unsigned = schema_builder.add_u64_field("unsigned", FAST);
         let fast_field_signed = schema_builder.add_i64_field("signed", FAST);
         let text_field = schema_builder.add_text_field("text", TEXT);
@@ -26,12 +26,12 @@
 /// #[macro_use]
 /// extern crate tantivy;
 ///
-/// use tantivy::schema::{Schema, TEXT, FAST};
+/// use tantivy::schema::{SchemaBuilder, TEXT, FAST};
 ///
 /// //...
 ///
 /// # fn main() {
-/// let mut schema_builder = Schema::builder();
+/// let mut schema_builder = SchemaBuilder::new();
 /// let title = schema_builder.add_text_field("title", TEXT);
 /// let author = schema_builder.add_text_field("text", TEXT);
 /// let likes = schema_builder.add_u64_field("num_u64", FAST);
@@ -67,11 +67,11 @@ macro_rules! doc(
 
 #[cfg(test)]
 mod test {
-    use schema::{Schema, FAST, TEXT};
+    use schema::{SchemaBuilder, FAST, TEXT};
 
     #[test]
     fn test_doc_basic() {
-        let mut schema_builder = Schema::builder();
+        let mut schema_builder = SchemaBuilder::new();
         let title = schema_builder.add_text_field("title", TEXT);
         let author = schema_builder.add_text_field("text", TEXT);
         let likes = schema_builder.add_u64_field("num_u64", FAST);
@@ -85,7 +85,7 @@ mod test {
 
     #[test]
     fn test_doc_trailing_comma() {
-        let mut schema_builder = Schema::builder();
+        let mut schema_builder = SchemaBuilder::new();
         let title = schema_builder.add_text_field("title", TEXT);
         let author = schema_builder.add_text_field("text", TEXT);
         let likes = schema_builder.add_u64_field("num_u64", FAST);
@@ -1,3 +1,4 @@
+
 /// Positions are stored in three parts and over two files.
 //
 /// The `SegmentComponent::POSITIONS` file contains all of the bitpacked positions delta,
@@ -23,12 +24,13 @@
 /// The long skip structure makes it possible to skip rapidly to the a checkpoint close to this
 /// value, and then skip normally.
 ///
 
 mod reader;
 mod serializer;
 
 pub use self::reader::PositionReader;
 pub use self::serializer::PositionSerializer;
-use bitpacking::{BitPacker, BitPacker4x};
+use bitpacking::{BitPacker4x, BitPacker};
 
 const COMPRESSION_BLOCK_SIZE: usize = BitPacker4x::BLOCK_LEN;
 const LONG_SKIP_IN_BLOCKS: usize = 1_024;
@@ -41,10 +43,10 @@ lazy_static! {
 #[cfg(test)]
 pub mod tests {
 
-    use super::{PositionReader, PositionSerializer};
+    use std::iter;
+    use super::{PositionSerializer, PositionReader};
     use directory::ReadOnlySource;
     use positions::COMPRESSION_BLOCK_SIZE;
-    use std::iter;
 
     fn create_stream_buffer(vals: &[u32]) -> (ReadOnlySource, ReadOnlySource) {
         let mut skip_buffer = vec![];
@@ -57,10 +59,7 @@ pub mod tests {
             }
             serializer.close().unwrap();
         }
-        (
-            ReadOnlySource::from(stream_buffer),
-            ReadOnlySource::from(skip_buffer),
-        )
+        (ReadOnlySource::from(stream_buffer), ReadOnlySource::from(skip_buffer))
     }
 
     #[test]
@@ -104,7 +103,7 @@ pub mod tests {
         assert_eq!(skip.len(), 12);
         assert_eq!(stream.len(), 1168);
 
-        let mut position_reader = PositionReader::new(stream, skip, 0u64);
+        let mut position_reader = PositionReader::new(stream,skip, 0u64);
         let mut buf = [0u32; 7];
         let mut c = 0;
         for _ in 0..100 {
@@ -126,7 +125,7 @@ pub mod tests {
         let (stream, skip) = create_stream_buffer(&v[..]);
         assert_eq!(skip.len(), 15_749);
         assert_eq!(stream.len(), 1_000_000);
-        let mut position_reader = PositionReader::new(stream, skip, 128 * 1024);
+        let mut position_reader = PositionReader::new(stream,skip, 128 * 1024);
         let mut buf = [0u32; 1];
         position_reader.read(&mut buf);
         assert_eq!(buf[0], CONST_VAL);
@@ -138,17 +137,12 @@ pub mod tests {
         let (stream, skip) = create_stream_buffer(&v[..]);
         assert_eq!(skip.len(), 15_749);
         assert_eq!(stream.len(), 4_987_872);
-        for &offset in &[
-            10,
-            128 * 1024,
-            128 * 1024 - 1,
-            128 * 1024 + 7,
-            128 * 10 * 1024 + 10,
-        ] {
-            let mut position_reader = PositionReader::new(stream.clone(), skip.clone(), offset);
+        for &offset in &[10, 128 * 1024, 128 * 1024 - 1, 128 * 1024 + 7, 128 * 10 * 1024 + 10] {
+            let mut position_reader = PositionReader::new(stream.clone(),skip.clone(), offset);
             let mut buf = [0u32; 1];
             position_reader.read(&mut buf);
             assert_eq!(buf[0], offset as u32);
         }
     }
 }
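The doc comments in the hunks above describe the positions layout: bitpacked 128-value blocks plus a "long skip" structure that lets a reader jump to a checkpoint near an absolute offset and then skip block by block. A small hedged sketch of that offset split, using the constants shown in the diff (COMPRESSION_BLOCK_SIZE = 128, LONG_SKIP_IN_BLOCKS = 1_024); treating LONG_SKIP_INTERVAL as their product is my assumption for illustration:

    // Illustrative constants mirroring the ones in the hunk above; the real crate
    // derives LONG_SKIP_INTERVAL internally, so its exact definition is assumed here.
    const COMPRESSION_BLOCK_SIZE: u64 = 128;
    const LONG_SKIP_IN_BLOCKS: u64 = 1_024;
    const LONG_SKIP_INTERVAL: u64 = COMPRESSION_BLOCK_SIZE * LONG_SKIP_IN_BLOCKS;

    /// Split an absolute position offset into the long-skip checkpoint to jump to
    /// and the number of positions still to be skipped normally.
    fn split_offset(offset: u64) -> (usize, usize) {
        let long_skip_id = (offset / LONG_SKIP_INTERVAL) as usize;
        let small_skip = (offset - long_skip_id as u64 * LONG_SKIP_INTERVAL) as usize;
        (long_skip_id, small_skip)
    }

    fn main() {
        // 128 * 1024 + 7 lands exactly one checkpoint in, with 7 positions left to skip.
        assert_eq!(split_offset(128 * 1024 + 7), (1, 7));
        assert_eq!(split_offset(10), (0, 10));
    }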
@@ -1,12 +1,12 @@
-use super::BIT_PACKER;
-use bitpacking::{BitPacker, BitPacker4x};
-use common::{BinarySerializable, FixedSize};
-use directory::ReadOnlySource;
+use bitpacking::{BitPacker4x, BitPacker};
 use owned_read::OwnedRead;
-use positions::COMPRESSION_BLOCK_SIZE;
-use positions::LONG_SKIP_INTERVAL;
-use positions::LONG_SKIP_IN_BLOCKS;
+use common::{BinarySerializable, FixedSize};
 use postings::compression::compressed_block_size;
+use directory::ReadOnlySource;
+use positions::COMPRESSION_BLOCK_SIZE;
+use positions::LONG_SKIP_IN_BLOCKS;
+use positions::LONG_SKIP_INTERVAL;
+use super::BIT_PACKER;
 
 pub struct PositionReader {
     skip_read: OwnedRead,
@@ -18,6 +18,7 @@ pub struct PositionReader {
     // of the block of the next int to read.
 }
 
+
 // `ahead` represents the offset of the block currently loaded
 // compared to the cursor of the actual stream.
 //
@@ -31,8 +32,7 @@ fn read_impl(
     buffer: &mut [u32; 128],
     mut inner_offset: usize,
     num_bits: &[u8],
-    output: &mut [u32],
-) -> usize {
+    output: &mut [u32]) -> usize {
     let mut output_start = 0;
     let mut output_len = output.len();
     let mut ahead = 0;
@@ -47,7 +47,8 @@ fn read_impl(
             output_start += available_len;
             inner_offset = 0;
             let num_bits = num_bits[ahead];
-            BitPacker4x::new().decompress(position, &mut buffer[..], num_bits);
+            BitPacker4x::new()
+                .decompress(position, &mut buffer[..], num_bits);
             let block_len = compressed_block_size(num_bits);
             position = &position[block_len..];
             ahead += 1;
@@ -55,12 +56,11 @@ fn read_impl(
         }
     }
 
 
 impl PositionReader {
-    pub fn new(
-        position_source: ReadOnlySource,
-        skip_source: ReadOnlySource,
-        offset: u64,
-    ) -> PositionReader {
+    pub fn new(position_source: ReadOnlySource,
+               skip_source: ReadOnlySource,
+               offset: u64) -> PositionReader {
         let skip_len = skip_source.len();
         let (body, footer) = skip_source.split(skip_len - u32::SIZE_IN_BYTES);
         let num_long_skips = u32::deserialize(&mut footer.as_slice()).expect("Index corrupted");
@@ -70,8 +70,7 @@ impl PositionReader {
         let small_skip = (offset - (long_skip_id as u64) * (LONG_SKIP_INTERVAL as u64)) as usize;
         let offset_num_bytes: u64 = {
             if long_skip_id > 0 {
-                let mut long_skip_blocks: &[u8] =
-                    &long_skips.as_slice()[(long_skip_id - 1) * 8..][..8];
+                let mut long_skip_blocks: &[u8] = &long_skips.as_slice()[(long_skip_id - 1) * 8..][..8];
                 u64::deserialize(&mut long_skip_blocks).expect("Index corrupted") * 16
             } else {
                 0
@@ -80,13 +79,13 @@ impl PositionReader {
         let mut position_read = OwnedRead::new(position_source);
         position_read.advance(offset_num_bytes as usize);
         let mut skip_read = OwnedRead::new(skip_body);
         skip_read.advance(long_skip_id * LONG_SKIP_IN_BLOCKS);
         let mut position_reader = PositionReader {
             skip_read,
             position_read,
             inner_offset: 0,
             buffer: Box::new([0u32; 128]),
-            ahead: None,
+            ahead: None
         };
         position_reader.skip(small_skip);
         position_reader
@@ -109,8 +108,7 @@ impl PositionReader {
             self.buffer.as_mut(),
             self.inner_offset,
             &skip_data[1..],
-            output,
-        ));
+            output));
     }
 
     /// Skip the next `skip_len` integer.
@@ -120,25 +118,27 @@ impl PositionReader {
     ///
     /// May panic if the end of the stream is reached.
     pub fn skip(&mut self, skip_len: usize) {
+
         let skip_len_plus_inner_offset = skip_len + self.inner_offset;
 
         let num_blocks_to_advance = skip_len_plus_inner_offset / COMPRESSION_BLOCK_SIZE;
         self.inner_offset = skip_len_plus_inner_offset % COMPRESSION_BLOCK_SIZE;
 
-        self.ahead = self.ahead.and_then(|num_blocks| {
-            if num_blocks >= num_blocks_to_advance {
-                Some(num_blocks - num_blocks_to_advance)
-            } else {
-                None
-            }
-        });
+        self.ahead = self.ahead
+            .and_then(|num_blocks| {
+                if num_blocks >= num_blocks_to_advance {
+                    Some(num_blocks_to_advance - num_blocks_to_advance)
+                } else {
+                    None
+                }
+            });
 
-        let skip_len = self.skip_read.as_ref()[..num_blocks_to_advance]
+        let skip_len = self.skip_read
+            .as_ref()[..num_blocks_to_advance]
             .iter()
             .cloned()
             .map(|num_bit| num_bit as usize)
-            .sum::<usize>()
-            * (COMPRESSION_BLOCK_SIZE / 8);
+            .sum::<usize>() * (COMPRESSION_BLOCK_SIZE / 8);
 
         self.skip_read.advance(num_blocks_to_advance);
         self.position_read.advance(skip_len);
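The `skip` method above reduces a skip request to "advance N whole 128-value blocks, then remember an offset inside the new block". That arithmetic in isolation, as a minimal sketch (the function name is mine, not tantivy's):

    // COMPRESSION_BLOCK_SIZE matches BitPacker4x::BLOCK_LEN (128) from the diff above.
    const COMPRESSION_BLOCK_SIZE: usize = 128;

    /// Returns (whole blocks to advance, new offset inside the current block).
    fn advance_by(inner_offset: usize, skip_len: usize) -> (usize, usize) {
        let total = inner_offset + skip_len;
        (total / COMPRESSION_BLOCK_SIZE, total % COMPRESSION_BLOCK_SIZE)
    }

    fn main() {
        // Skipping 300 positions from offset 5 crosses two block boundaries
        // and leaves the cursor 49 positions into the new block.
        assert_eq!(advance_by(5, 300), (2, 49));
        // A short skip stays inside the current block.
        assert_eq!(advance_by(5, 7), (0, 12));
    }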
@@ -1,8 +1,8 @@
-use super::BIT_PACKER;
-use bitpacking::BitPacker;
-use common::BinarySerializable;
-use positions::{COMPRESSION_BLOCK_SIZE, LONG_SKIP_INTERVAL};
 use std::io;
+use bitpacking::BitPacker;
+use positions::{COMPRESSION_BLOCK_SIZE, LONG_SKIP_INTERVAL};
+use common::BinarySerializable;
+use super::BIT_PACKER;
 
 pub struct PositionSerializer<W: io::Write> {
     write_stream: W,
@@ -23,7 +23,7 @@ impl<W: io::Write> PositionSerializer<W> {
             buffer: vec![0u8; 128 * 4],
             num_ints: 0u64,
             long_skips: Vec::new(),
-            cumulated_num_bits: 0u64,
+            cumulated_num_bits: 0u64
         }
     }
 
@@ -31,6 +31,7 @@ impl<W: io::Write> PositionSerializer<W> {
         self.num_ints
     }
 
+
     fn remaining_block_len(&self) -> usize {
         COMPRESSION_BLOCK_SIZE - self.block.len()
     }
@@ -51,8 +52,8 @@ impl<W: io::Write> PositionSerializer<W> {
 
     fn flush_block(&mut self) -> io::Result<()> {
         let num_bits = BIT_PACKER.num_bits(&self.block[..]);
-        self.cumulated_num_bits += u64::from(num_bits);
-        self.write_skiplist.write_all(&[num_bits])?;
+        self.cumulated_num_bits += num_bits as u64;
+        self.write_skiplist.write(&[num_bits])?;
         let written_len = BIT_PACKER.compress(&self.block[..], &mut self.buffer, num_bits);
         self.write_stream.write_all(&self.buffer[..written_len])?;
         self.block.clear();
@@ -28,16 +28,14 @@ impl BlockEncoder {
 
     pub fn compress_block_sorted(&mut self, block: &[u32], offset: u32) -> (u8, &[u8]) {
         let num_bits = self.bitpacker.num_bits_sorted(offset, block);
-        let written_size =
-            self.bitpacker
-                .compress_sorted(offset, block, &mut self.output[..], num_bits);
+        let written_size = self.bitpacker
+            .compress_sorted(offset, block, &mut self.output[..], num_bits);
         (num_bits, &self.output[..written_size])
     }
 
     pub fn compress_block_unsorted(&mut self, block: &[u32]) -> (u8, &[u8]) {
         let num_bits = self.bitpacker.num_bits(block);
-        let written_size = self
-            .bitpacker
+        let written_size = self.bitpacker
             .compress(block, &mut self.output[..], num_bits);
         (num_bits, &self.output[..written_size])
     }
@@ -64,21 +62,19 @@ impl BlockDecoder {
         }
     }
 
-    pub fn uncompress_block_sorted(
-        &mut self,
-        compressed_data: &[u8],
-        offset: u32,
-        num_bits: u8,
-    ) -> usize {
+    pub fn uncompress_block_sorted(&mut self, compressed_data: &[u8], offset: u32, num_bits: u8) -> usize {
         self.output_len = COMPRESSION_BLOCK_SIZE;
-        self.bitpacker
-            .decompress_sorted(offset, &compressed_data, &mut self.output, num_bits)
+        self.bitpacker.decompress_sorted(
+            offset,
+            &compressed_data,
+            &mut self.output,
+            num_bits,
+        )
     }
 
     pub fn uncompress_block_unsorted(&mut self, compressed_data: &[u8], num_bits: u8) -> usize {
         self.output_len = COMPRESSION_BLOCK_SIZE;
-        self.bitpacker
-            .decompress(&compressed_data, &mut self.output, num_bits)
+        self.bitpacker.decompress(&compressed_data, &mut self.output, num_bits)
     }
 
     #[inline]
@@ -92,6 +88,7 @@ impl BlockDecoder {
     }
 }
 
+
 pub trait VIntEncoder {
     /// Compresses an array of `u32` integers,
     /// using [delta-encoding](https://en.wikipedia.org/wiki/Delta_ encoding)
@@ -266,17 +263,21 @@ pub mod tests {
 mod bench {
 
     use super::*;
+    use rand::Rng;
     use rand::SeedableRng;
-    use rand::{Rng, XorShiftRng};
+    use rand::XorShiftRng;
     use test::Bencher;
 
-    fn generate_array_with_seed(n: usize, ratio: f64, seed_val: u8) -> Vec<u32> {
-        let seed: &[u8; 16] = &[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, seed_val];
+    fn generate_array_with_seed(n: usize, ratio: f32, seed_val: u32) -> Vec<u32> {
+        let seed: &[u32; 4] = &[1, 2, 3, seed_val];
         let mut rng: XorShiftRng = XorShiftRng::from_seed(*seed);
-        (0u32..).filter(|_| rng.gen_bool(ratio)).take(n).collect()
+        (0..u32::max_value())
+            .filter(|_| rng.next_f32() < ratio)
+            .take(n)
+            .collect()
     }
 
-    pub fn generate_array(n: usize, ratio: f64) -> Vec<u32> {
+    pub fn generate_array(n: usize, ratio: f32) -> Vec<u32> {
         generate_array_with_seed(n, ratio, 4)
     }
 
@@ -293,23 +294,24 @@ mod bench {
     fn bench_uncompress(b: &mut Bencher) {
         let mut encoder = BlockEncoder::new();
         let data = generate_array(COMPRESSION_BLOCK_SIZE, 0.1);
-        let (num_bits, compressed) = encoder.compress_block_sorted(&data, 0u32);
+        let (_, compressed) = encoder.compress_block_sorted(&data, 0u32);
         let mut decoder = BlockDecoder::new();
         b.iter(|| {
-            decoder.uncompress_block_sorted(compressed, 0u32, num_bits);
+            decoder.uncompress_block_sorted(compressed, 0u32);
         });
     }
 
     #[test]
     fn test_all_docs_compression_numbits() {
-        for expected_num_bits in 0u8.. {
+        for num_bits in 0..33 {
             let mut data = [0u32; 128];
-            if expected_num_bits > 0 {
-                data[0] = (1u64 << (expected_num_bits as usize) - 1) as u32;
+            if num_bits > 0 {
+                data[0] = 1 << (num_bits - 1);
             }
             let mut encoder = BlockEncoder::new();
-            let (num_bits, compressed) = encoder.compress_block_unsorted(&data);
-            assert_eq!(compressed.len(), compressed_block_size(num_bits));
+            let compressed = encoder.compress_block_unsorted(&data);
+            assert_eq!(compressed[0] as usize, num_bits);
+            assert_eq!(compressed.len(), compressed_block_size(compressed[0]));
         }
     }
 
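The `test_all_docs_compression_numbits` change above revolves around one invariant: a block of 128 integers bit-packed at `num_bits` bits per value occupies `num_bits * 128 / 8` bytes. A hedged, standalone illustration of that arithmetic (my `bits_needed` stands in for `BitPacker::num_bits`; `compressed_block_size` mirrors the helper named in the hunk but is reimplemented here as an assumption):

    const BLOCK_LEN: usize = 128;

    // Smallest bit width that can represent every value in the block.
    fn bits_needed(block: &[u32]) -> u8 {
        let max = block.iter().cloned().max().unwrap_or(0);
        (32 - max.leading_zeros()) as u8
    }

    // Size in bytes of a bit-packed block of 128 values.
    fn compressed_block_size(num_bits: u8) -> usize {
        num_bits as usize * BLOCK_LEN / 8
    }

    fn main() {
        let mut data = [0u32; BLOCK_LEN];
        data[0] = 1 << 20; // forces 21 bits per value
        assert_eq!(bits_needed(&data), 21);
        assert_eq!(compressed_block_size(21), 21 * 16); // 336 bytes
    }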
@@ -1,5 +1,9 @@
 #[inline(always)]
-pub fn compress_sorted<'a>(input: &[u32], output: &'a mut [u8], mut offset: u32) -> &'a [u8] {
+pub fn compress_sorted<'a>(
+    input: &[u32],
+    output: &'a mut [u8],
+    mut offset: u32,
+) -> &'a [u8] {
     let mut byte_written = 0;
     for &v in input {
         let mut to_encode: u32 = v - offset;
@@ -42,41 +46,47 @@ pub(crate) fn compress_unsorted<'a>(input: &[u32], output: &'a mut [u8]) -> &'a
 }
 
 #[inline(always)]
-pub fn uncompress_sorted<'a>(compressed_data: &'a [u8], output: &mut [u32], offset: u32) -> usize {
+pub fn uncompress_sorted<'a>(
+    compressed_data: &'a [u8],
+    output: &mut [u32],
+    offset: u32,
+) -> usize {
     let mut read_byte = 0;
     let mut result = offset;
-    for output_mut in output.iter_mut() {
+    let num_els = output.len();
+    for i in 0..num_els {
         let mut shift = 0u32;
         loop {
             let cur_byte = compressed_data[read_byte];
             read_byte += 1;
-            result += u32::from(cur_byte % 128u8) << shift;
+            result += ((cur_byte % 128u8) as u32) << shift;
             if cur_byte & 128u8 != 0u8 {
                 break;
             }
             shift += 7;
         }
-        *output_mut = result;
+        output[i] = result;
     }
     read_byte
 }
 
 #[inline(always)]
-pub(crate) fn uncompress_unsorted(compressed_data: &[u8], output_arr: &mut [u32]) -> usize {
+pub(crate) fn uncompress_unsorted<'a>(compressed_data: &'a [u8], output: &mut [u32]) -> usize {
     let mut read_byte = 0;
-    for output_mut in output_arr.iter_mut() {
+    let num_els = output.len();
+    for i in 0..num_els {
         let mut result = 0u32;
         let mut shift = 0u32;
         loop {
             let cur_byte = compressed_data[read_byte];
             read_byte += 1;
-            result += u32::from(cur_byte % 128u8) << shift;
+            result += ((cur_byte % 128u8) as u32) << shift;
             if cur_byte & 128u8 != 0u8 {
                 break;
             }
             shift += 7;
         }
-        *output_mut = result;
+        output[i] = result;
     }
     read_byte
 }
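The vint decoders above read 7 payload bits per byte and stop at the byte whose high bit is set. A minimal encode/decode pair for a single u32 that follows the same convention, so the loop is easy to see in isolation (names are mine, not tantivy's):

    fn vint_encode(mut value: u32, out: &mut Vec<u8>) {
        loop {
            let bits = (value % 128) as u8;
            value /= 128;
            if value == 0 {
                out.push(bits | 128u8); // high bit set marks the last byte
                return;
            }
            out.push(bits);
        }
    }

    fn vint_decode(bytes: &[u8]) -> (u32, usize) {
        let mut result = 0u32;
        let mut shift = 0u32;
        let mut read = 0usize;
        loop {
            let cur = bytes[read];
            read += 1;
            result += u32::from(cur % 128) << shift;
            if cur & 128 != 0 {
                return (result, read);
            }
            shift += 7;
        }
    }

    fn main() {
        let mut buf = Vec::new();
        vint_encode(300, &mut buf);
        assert_eq!(buf.len(), 2); // 300 needs two 7-bit groups
        assert_eq!(vint_decode(&buf), (300, 2));
    }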
@@ -2,7 +2,6 @@
 Postings module (also called inverted index)
 */
 
-pub(crate) mod compression;
 /// Postings module
 ///
 /// Postings, also called inverted lists, is the key datastructure
@@ -12,17 +11,18 @@ mod postings_writer;
 mod recorder;
 mod segment_postings;
 mod serializer;
-mod skip;
+pub(crate) mod compression;
 mod stacker;
 mod term_info;
+mod skip;
 
 pub(crate) use self::postings_writer::MultiFieldPostingsWriter;
 pub use self::serializer::{FieldSerializer, InvertedIndexSerializer};
 
-use self::compression::COMPRESSION_BLOCK_SIZE;
 pub use self::postings::Postings;
-pub(crate) use self::skip::SkipReader;
 pub use self::term_info::TermInfo;
+pub(crate) use self::skip::SkipReader;
+use self::compression::{COMPRESSION_BLOCK_SIZE};
 
 pub use self::segment_postings::{BlockSegmentPostings, SegmentPostings};
 
@@ -34,7 +34,7 @@ pub(crate) const USE_SKIP_INFO_LIMIT: u32 = COMPRESSION_BLOCK_SIZE as u32;
 
 pub(crate) type UnorderedTermId = u64;
 
-#[cfg_attr(feature = "cargo-clippy", allow(clippy::enum_variant_names))]
+#[allow(enum_variant_names)]
 #[derive(Debug, PartialEq, Clone, Copy, Eq)]
 pub(crate) enum FreqReadingOption {
     NoFreq,
@@ -54,25 +54,25 @@ pub mod tests {
     use indexer::operation::AddOperation;
     use indexer::SegmentWriter;
     use query::Scorer;
-    use rand::rngs::StdRng;
-    use rand::{Rng, SeedableRng};
+    use rand::{Rng, SeedableRng, XorShiftRng};
     use schema::Field;
     use schema::IndexRecordOption;
-    use schema::{Document, Schema, Term, INT_INDEXED, STRING, TEXT};
+    use schema::{Document, SchemaBuilder, Term, INT_INDEXED, STRING, TEXT};
     use std::iter;
     use DocId;
     use Score;
 
     #[test]
     pub fn test_position_write() {
-        let mut schema_builder = Schema::builder();
+        let mut schema_builder = SchemaBuilder::default();
         let text_field = schema_builder.add_text_field("text", TEXT);
         let schema = schema_builder.build();
         let index = Index::create_in_ram(schema);
         let mut segment = index.new_segment();
         let mut posting_serializer = InvertedIndexSerializer::open(&mut segment).unwrap();
         {
-            let mut field_serializer = posting_serializer.new_field(text_field, 120 * 4).unwrap();
+            let mut field_serializer = posting_serializer
+                .new_field(text_field, 120 * 4).unwrap();
             field_serializer.new_term("abc".as_bytes()).unwrap();
             for doc_id in 0u32..120u32 {
                 let delta_positions = vec![1, 2, 3, 2];
@@ -89,7 +89,7 @@ pub mod tests {
 
     #[test]
     pub fn test_skip_positions() {
-        let mut schema_builder = Schema::builder();
+        let mut schema_builder = SchemaBuilder::new();
         let title = schema_builder.add_text_field("title", TEXT);
         let schema = schema_builder.build();
         let index = Index::create_in_ram(schema);
@@ -164,7 +164,7 @@ pub mod tests {
     #[test]
     pub fn test_position_and_fieldnorm1() {
         let mut positions = Vec::new();
-        let mut schema_builder = Schema::builder();
+        let mut schema_builder = SchemaBuilder::default();
         let text_field = schema_builder.add_text_field("text", TEXT);
         let schema = schema_builder.build();
         let index = Index::create_in_ram(schema.clone());
@@ -277,7 +277,7 @@ pub mod tests {
     #[test]
     pub fn test_position_and_fieldnorm2() {
         let mut positions: Vec<u32> = Vec::new();
-        let mut schema_builder = Schema::builder();
+        let mut schema_builder = SchemaBuilder::default();
         let text_field = schema_builder.add_text_field("text", TEXT);
         let schema = schema_builder.build();
         let index = Index::create_in_ram(schema);
@@ -318,7 +318,7 @@ pub mod tests {
         let num_docs = 300u32;
 
         let index = {
-            let mut schema_builder = Schema::builder();
+            let mut schema_builder = SchemaBuilder::default();
             let value_field = schema_builder.add_u64_field("value", INT_INDEXED);
             let schema = schema_builder.build();
 
@@ -499,11 +499,12 @@ pub mod tests {
             Term::from_field_text(field, "d")
         };
         pub static ref INDEX: Index = {
-            let mut schema_builder = Schema::builder();
+            let mut schema_builder = SchemaBuilder::default();
             let text_field = schema_builder.add_text_field("text", STRING);
             let schema = schema_builder.build();
 
-            let mut rng: StdRng = StdRng::from_seed([1u8; 32]);
+            let seed: [u8; 16] = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16];
+            let mut rng: XorShiftRng = XorShiftRng::from_seed(seed);
 
             let index = Index::create_in_ram(schema);
             let posting_list_size = 1_000_000;
@@ -511,13 +512,13 @@ pub mod tests {
             let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
             for _ in 0..posting_list_size {
                 let mut doc = Document::default();
-                if rng.gen_bool(1f64 / 15f64) {
+                if rng.gen_bool(1f64/ 15f64) {
                     doc.add_text(text_field, "a");
                 }
-                if rng.gen_bool(1f64 / 10f64) {
+                if rng.gen_bool(1f64/ 10f64) {
                     doc.add_text(text_field, "b");
                 }
-                if rng.gen_bool(1f64 / 5f64) {
+                if rng.gen_bool(1f64/ 5f64) {
                     doc.add_text(text_field, "c");
                 }
                 doc.add_text(text_field, "d");
@@ -654,7 +655,7 @@ mod bench {
         });
     }
 
-    fn bench_skip_next(p: f64, b: &mut Bencher) {
+    fn bench_skip_next(p: f32, b: &mut Bencher) {
         let searcher = INDEX.searcher();
         let segment_reader = searcher.segment_reader(0);
         let docs = tests::sample(segment_reader.num_docs(), p);
@@ -15,7 +15,7 @@ use tokenizer::TokenStream;
 use DocId;
 use Result;
 
-fn posting_from_field_entry(field_entry: &FieldEntry) -> Box<PostingsWriter> {
+fn posting_from_field_entry<'a>(field_entry: &FieldEntry) -> Box<PostingsWriter> {
     match *field_entry.field_type() {
         FieldType::Str(ref text_options) => text_options
             .get_indexing_options()
@@ -29,7 +29,8 @@ fn posting_from_field_entry(field_entry: &FieldEntry) -> Box<PostingsWriter> {
                 IndexRecordOption::WithFreqsAndPositions => {
                     SpecializedPostingsWriter::<TFAndPositionRecorder>::new_boxed()
                 }
-            }).unwrap_or_else(|| SpecializedPostingsWriter::<NothingRecorder>::new_boxed()),
+            })
+            .unwrap_or_else(|| SpecializedPostingsWriter::<NothingRecorder>::new_boxed()),
         FieldType::U64(_) | FieldType::I64(_) | FieldType::HierarchicalFacet => {
             SpecializedPostingsWriter::<NothingRecorder>::new_boxed()
         }
@@ -93,12 +94,11 @@ impl MultiFieldPostingsWriter {
         &self,
         serializer: &mut InvertedIndexSerializer,
     ) -> Result<HashMap<Field, HashMap<UnorderedTermId, TermOrdinal>>> {
-        let mut term_offsets: Vec<(&[u8], Addr, UnorderedTermId)> = self
-            .term_index
+        let mut term_offsets: Vec<(&[u8], Addr, UnorderedTermId)> = self.term_index
             .iter()
             .map(|(term_bytes, addr, bucket_id)| (term_bytes, addr, bucket_id as UnorderedTermId))
             .collect();
-        term_offsets.sort_unstable_by_key(|&(k, _, _)| k);
+        term_offsets.sort_by_key(|&(k, _, _)| k);
 
         let mut offsets: Vec<(Field, usize)> = vec![];
         let term_offsets_it = term_offsets
@@ -127,8 +127,8 @@ impl MultiFieldPostingsWriter {
 
             let field_entry = self.schema.get_field_entry(field);
 
-            match *field_entry.field_type() {
-                FieldType::Str(_) | FieldType::HierarchicalFacet => {
+            match field_entry.field_type() {
+                &FieldType::Str(_) | &FieldType::HierarchicalFacet => {
                     // populating the (unordered term ord) -> (ordered term ord) mapping
                     // for the field.
                     let mut unordered_term_ids = term_offsets[start..stop]
@@ -138,11 +138,12 @@ impl MultiFieldPostingsWriter {
                         .enumerate()
                         .map(|(term_ord, unord_term_id)| {
                             (unord_term_id as UnorderedTermId, term_ord as TermOrdinal)
-                        }).collect();
+                        })
+                        .collect();
                     unordered_term_mappings.insert(field, mapping);
                 }
-                FieldType::U64(_) | FieldType::I64(_) => {}
-                FieldType::Bytes => {}
+                &FieldType::U64(_) | &FieldType::I64(_) => {}
+                &FieldType::Bytes => {}
             }
 
             let postings_writer = &self.per_field_postings_writers[field.0 as usize];
@@ -201,11 +202,14 @@ pub trait PostingsWriter {
         heap: &mut MemoryArena,
     ) -> u32 {
         let mut term = Term::for_field(field);
-        let mut sink = |token: &Token| {
-            term.set_text(token.text.as_str());
-            self.subscribe(term_index, doc_id, token.position as u32, &term, heap);
+        let num_tokens = {
+            let mut sink = |token: &Token| {
+                term.set_text(token.text.as_str());
+                self.subscribe(term_index, doc_id, token.position as u32, &term, heap);
+            };
+            token_stream.process(&mut sink)
         };
-        token_stream.process(&mut sink)
+        num_tokens
     }
 
     fn total_num_tokens(&self) -> u64;
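The last hunk above re-introduces an inner block around the token sink. That pattern is the classic pre-NLL borrow-checker workaround: a closure that mutably borrows a local has to go out of scope before anything else touches that local or the computed value. A self-contained illustration with plain types (nothing here is tantivy code):

    fn process(values: &[u32]) -> (u32, Vec<u32>) {
        let mut collected = Vec::new();
        let count = {
            // `sink` mutably borrows `collected`; ending the block ends the borrow.
            let mut sink = |v: u32| collected.push(v * 2);
            let mut n = 0u32;
            for &v in values {
                sink(v);
                n += 1;
            }
            n
        };
        // `collected` is usable again here because the closure has been dropped.
        (count, collected)
    }

    fn main() {
        assert_eq!(process(&[1, 2, 3]), (3, vec![2, 4, 6]));
    }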
@@ -107,8 +107,7 @@ impl Recorder for TermFrequencyRecorder {
     fn serialize(&self, serializer: &mut FieldSerializer, heap: &MemoryArena) -> io::Result<()> {
         // the last document has not been closed...
         // its term freq is self.current_tf.
-        let mut doc_iter = self
-            .stack
+        let mut doc_iter = self.stack
             .iter(heap)
             .chain(Some(self.current_tf).into_iter());
 
@@ -1,20 +1,20 @@
+use postings::compression::{BlockDecoder, VIntDecoder, COMPRESSION_BLOCK_SIZE};
+use DocId;
 use common::BitSet;
 use common::HasLen;
-use common::{BinarySerializable, VInt};
+use postings::compression::compressed_block_size;
 use docset::{DocSet, SkipResult};
 use fst::Streamer;
-use owned_read::OwnedRead;
-use positions::PositionReader;
-use postings::compression::compressed_block_size;
-use postings::compression::{BlockDecoder, VIntDecoder, COMPRESSION_BLOCK_SIZE};
 use postings::serializer::PostingsSerializer;
 use postings::FreqReadingOption;
 use postings::Postings;
-use postings::SkipReader;
+use owned_read::OwnedRead;
+use common::{VInt, BinarySerializable};
 use postings::USE_SKIP_INFO_LIMIT;
+use postings::SkipReader;
 use schema::IndexRecordOption;
+use positions::PositionReader;
 use std::cmp::Ordering;
-use DocId;
 
 const EMPTY_ARR: [u8; 0] = [];
 
@@ -98,7 +98,7 @@ impl SegmentPostings {
             docs.len() as u32,
             OwnedRead::new(buffer),
             IndexRecordOption::Basic,
-            IndexRecordOption::Basic,
+            IndexRecordOption::Basic
         );
         SegmentPostings::from_block_postings(block_segment_postings, None)
     }
@@ -151,11 +151,7 @@ fn exponential_search(target: u32, arr: &[u32]) -> (usize, usize) {
 /// The target is assumed smaller or equal to the last element.
 fn search_within_block(block_docs: &[u32], target: u32) -> usize {
     let (start, end) = exponential_search(target, block_docs);
-    start.wrapping_add(
-        block_docs[start..end]
-            .binary_search(&target)
-            .unwrap_or_else(|e| e),
-    )
+    start.wrapping_add(block_docs[start..end].binary_search(&target).unwrap_or_else(|e| e))
 }
 
 impl DocSet for SegmentPostings {
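`search_within_block` above narrows the range with an exponential probe and finishes with a binary search. A standalone sketch of that strategy over a sorted slice; the two function bodies are a simplified re-implementation of the idea, not tantivy's exact code, and assume the last element is >= the target:

    fn exponential_search(target: u32, arr: &[u32]) -> (usize, usize) {
        // Double the probe index until it overshoots the target, then return the
        // bracketing window [end / 2, end).
        let mut end = 1;
        while end < arr.len() && arr[end] < target {
            end *= 2;
        }
        (end / 2, end.min(arr.len()))
    }

    fn search_within_block(block_docs: &[u32], target: u32) -> usize {
        let (start, end) = exponential_search(target, block_docs);
        // Index of the first element >= target.
        start + block_docs[start..end]
            .binary_search(&target)
            .unwrap_or_else(|e| e)
    }

    fn main() {
        let docs = [2u32, 3, 5, 8, 13, 21, 34, 55];
        assert_eq!(search_within_block(&docs, 13), 4);
        assert_eq!(search_within_block(&docs, 14), 5); // first doc >= 14 is 21
    }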
@@ -183,20 +179,21 @@ impl DocSet for SegmentPostings {
         // check if we need to go to the next block
         let need_positions = self.position_computer.is_some();
         let mut sum_freqs_skipped: u32 = 0;
-        if !self
-            .block_cursor
-            .docs()
-            .last()
-            .map(|doc| *doc >= target)
-            .unwrap_or(false)
-        // there should always be at least a document in the block
-        // since advance returned.
+        if !self.block_cursor
+            .docs()
+            .last()
+            .map(|doc| *doc >= target)
+            .unwrap_or(false) // there should always be at least a document in the block
+            // since advance returned.
         {
             // we are not in the right block.
             //
             // First compute all of the freqs skipped from the current block.
             if need_positions {
-                sum_freqs_skipped = self.block_cursor.freqs()[self.cur..].iter().sum();
+                sum_freqs_skipped = self.block_cursor
+                    .freqs()[self.cur..]
+                    .iter()
+                    .sum();
                 match self.block_cursor.skip_to(target) {
                     BlockSegmentPostingsSkipResult::Success(block_skip_freqs) => {
                         sum_freqs_skipped += block_skip_freqs;
@@ -205,11 +202,11 @@ impl DocSet for SegmentPostings {
                         return SkipResult::End;
                     }
                 }
-            } else if self.block_cursor.skip_to(target)
-                == BlockSegmentPostingsSkipResult::Terminated
-            {
+            } else {
                 // no positions needed. no need to sum freqs.
-                return SkipResult::End;
+                if self.block_cursor.skip_to(target) == BlockSegmentPostingsSkipResult::Terminated {
+                    return SkipResult::End;
+                }
             }
             self.cur = 0;
         }
@@ -218,13 +215,9 @@ impl DocSet for SegmentPostings {
         let block_docs = self.block_cursor.docs();
 
         debug_assert!(target >= self.doc());
-        let new_cur = self
-            .cur
-            .wrapping_add(search_within_block(&block_docs[self.cur..], target));
+        let new_cur = self.cur.wrapping_add(search_within_block(&block_docs[self.cur..], target));
         if need_positions {
-            sum_freqs_skipped += self.block_cursor.freqs()[self.cur..new_cur]
-                .iter()
-                .sum::<u32>();
+            sum_freqs_skipped += self.block_cursor.freqs()[self.cur..new_cur].iter().sum::<u32>();
             self.position_computer
                 .as_mut()
                 .unwrap()
@@ -236,9 +229,9 @@ impl DocSet for SegmentPostings {
         let doc = block_docs[new_cur];
         debug_assert!(doc >= target);
         if doc == target {
-            SkipResult::Reached
+            return SkipResult::Reached;
         } else {
-            SkipResult::OverStep
+            return SkipResult::OverStep;
         }
     }
 
@@ -337,10 +330,7 @@ pub struct BlockSegmentPostings {
|
|||||||
skip_reader: SkipReader,
|
skip_reader: SkipReader,
|
||||||
}
|
}
|
||||||
|
|
||||||
fn split_into_skips_and_postings(
|
fn split_into_skips_and_postings(doc_freq: u32, mut data: OwnedRead) -> (Option<OwnedRead>, OwnedRead) {
|
||||||
doc_freq: u32,
|
|
||||||
mut data: OwnedRead,
|
|
||||||
) -> (Option<OwnedRead>, OwnedRead) {
|
|
||||||
if doc_freq >= USE_SKIP_INFO_LIMIT {
|
if doc_freq >= USE_SKIP_INFO_LIMIT {
|
||||||
let skip_len = VInt::deserialize(&mut data).expect("Data corrupted").0 as usize;
|
let skip_len = VInt::deserialize(&mut data).expect("Data corrupted").0 as usize;
|
||||||
let mut postings_data = data.clone();
|
let mut postings_data = data.clone();
|
||||||
@@ -355,7 +345,7 @@ fn split_into_skips_and_postings(
|
|||||||
#[derive(Debug, Eq, PartialEq)]
|
#[derive(Debug, Eq, PartialEq)]
|
||||||
pub enum BlockSegmentPostingsSkipResult {
|
pub enum BlockSegmentPostingsSkipResult {
|
||||||
Terminated,
|
Terminated,
|
||||||
Success(u32), //< number of term freqs to skip
|
Success(u32) //< number of term freqs to skip
|
||||||
}
|
}
|
||||||
|
|
||||||
impl BlockSegmentPostings {
|
impl BlockSegmentPostings {
|
||||||
@@ -363,7 +353,7 @@ impl BlockSegmentPostings {
|
|||||||
doc_freq: u32,
|
doc_freq: u32,
|
||||||
data: OwnedRead,
|
data: OwnedRead,
|
||||||
record_option: IndexRecordOption,
|
record_option: IndexRecordOption,
|
||||||
requested_option: IndexRecordOption,
|
requested_option: IndexRecordOption
|
||||||
) -> BlockSegmentPostings {
|
) -> BlockSegmentPostings {
|
||||||
let freq_reading_option = match (record_option, requested_option) {
|
let freq_reading_option = match (record_option, requested_option) {
|
||||||
(IndexRecordOption::Basic, _) => FreqReadingOption::NoFreq,
|
(IndexRecordOption::Basic, _) => FreqReadingOption::NoFreq,
|
||||||
@@ -372,10 +362,11 @@ impl BlockSegmentPostings {
|
|||||||
};
|
};
|
||||||
|
|
||||||
let (skip_data_opt, postings_data) = split_into_skips_and_postings(doc_freq, data);
|
let (skip_data_opt, postings_data) = split_into_skips_and_postings(doc_freq, data);
|
||||||
let skip_reader = match skip_data_opt {
|
let skip_reader =
|
||||||
Some(skip_data) => SkipReader::new(skip_data, record_option),
|
match skip_data_opt {
|
||||||
None => SkipReader::new(OwnedRead::new(&EMPTY_ARR[..]), record_option),
|
Some(skip_data) => SkipReader::new(skip_data, record_option),
|
||||||
};
|
None => SkipReader::new(OwnedRead::new(&EMPTY_ARR[..]), record_option)
|
||||||
|
};
|
||||||
let doc_freq = doc_freq as usize;
|
let doc_freq = doc_freq as usize;
|
||||||
let num_vint_docs = doc_freq % COMPRESSION_BLOCK_SIZE;
|
let num_vint_docs = doc_freq % COMPRESSION_BLOCK_SIZE;
|
||||||
BlockSegmentPostings {
|
BlockSegmentPostings {
|
||||||
@@ -459,6 +450,7 @@ impl BlockSegmentPostings {
|
|||||||
self.doc_decoder.output_len
|
self.doc_decoder.output_len
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
/// position on a block that may contains `doc_id`.
|
/// position on a block that may contains `doc_id`.
|
||||||
/// Always advance the current block.
|
/// Always advance the current block.
|
||||||
///
|
///
|
||||||
@@ -469,7 +461,9 @@ impl BlockSegmentPostings {
|
|||||||
/// Returns false iff all of the document remaining are smaller than
|
/// Returns false iff all of the document remaining are smaller than
|
||||||
/// `doc_id`. In that case, all of these document are consumed.
|
/// `doc_id`. In that case, all of these document are consumed.
|
||||||
///
|
///
|
||||||
pub fn skip_to(&mut self, target_doc: DocId) -> BlockSegmentPostingsSkipResult {
|
pub fn skip_to(&mut self,
|
||||||
|
target_doc: DocId) -> BlockSegmentPostingsSkipResult {
|
||||||
|
|
||||||
let mut skip_freqs = 0u32;
|
let mut skip_freqs = 0u32;
|
||||||
while self.skip_reader.advance() {
|
while self.skip_reader.advance() {
|
||||||
if self.skip_reader.doc() >= target_doc {
|
if self.skip_reader.doc() >= target_doc {
|
||||||
@@ -478,11 +472,11 @@ impl BlockSegmentPostings {
|
|||||||
//
|
//
|
||||||
// We found our block!
|
// We found our block!
|
||||||
let num_bits = self.skip_reader.doc_num_bits();
|
let num_bits = self.skip_reader.doc_num_bits();
|
||||||
let num_consumed_bytes = self.doc_decoder.uncompress_block_sorted(
|
let num_consumed_bytes = self.doc_decoder
|
||||||
self.remaining_data.as_ref(),
|
.uncompress_block_sorted(
|
||||||
self.doc_offset,
|
self.remaining_data.as_ref(),
|
||||||
num_bits,
|
self.doc_offset,
|
||||||
);
|
num_bits);
|
||||||
self.remaining_data.advance(num_consumed_bytes);
|
self.remaining_data.advance(num_consumed_bytes);
|
||||||
let tf_num_bits = self.skip_reader.tf_num_bits();
|
let tf_num_bits = self.skip_reader.tf_num_bits();
|
||||||
match self.freq_reading_option {
|
match self.freq_reading_option {
|
||||||
@@ -492,9 +486,9 @@ impl BlockSegmentPostings {
|
|||||||
self.remaining_data.advance(num_bytes_to_skip);
|
self.remaining_data.advance(num_bytes_to_skip);
|
||||||
}
|
}
|
||||||
FreqReadingOption::ReadFreq => {
|
FreqReadingOption::ReadFreq => {
|
||||||
let num_consumed_bytes = self
|
let num_consumed_bytes = self.freq_decoder
|
||||||
.freq_decoder
|
.uncompress_block_unsorted(self.remaining_data.as_ref(),
|
||||||
.uncompress_block_unsorted(self.remaining_data.as_ref(), tf_num_bits);
|
tf_num_bits);
|
||||||
self.remaining_data.advance(num_consumed_bytes);
|
self.remaining_data.advance(num_consumed_bytes);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -524,8 +518,7 @@ impl BlockSegmentPostings {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
self.num_vint_docs = 0;
|
self.num_vint_docs = 0;
|
||||||
return self
|
return self.docs()
|
||||||
.docs()
|
|
||||||
.last()
|
.last()
|
||||||
.map(|last_doc| {
|
.map(|last_doc| {
|
||||||
if *last_doc >= target_doc {
|
if *last_doc >= target_doc {
|
||||||
@@ -533,7 +526,8 @@ impl BlockSegmentPostings {
|
|||||||
} else {
|
} else {
|
||||||
BlockSegmentPostingsSkipResult::Terminated
|
BlockSegmentPostingsSkipResult::Terminated
|
||||||
}
|
}
|
||||||
}).unwrap_or(BlockSegmentPostingsSkipResult::Terminated);
|
})
|
||||||
|
.unwrap_or(BlockSegmentPostingsSkipResult::Terminated);
|
||||||
}
|
}
|
||||||
BlockSegmentPostingsSkipResult::Terminated
|
BlockSegmentPostingsSkipResult::Terminated
|
||||||
}
|
}
|
||||||
@@ -544,11 +538,11 @@ impl BlockSegmentPostings {
|
|||||||
pub fn advance(&mut self) -> bool {
|
pub fn advance(&mut self) -> bool {
|
||||||
if self.skip_reader.advance() {
|
if self.skip_reader.advance() {
|
||||||
let num_bits = self.skip_reader.doc_num_bits();
|
let num_bits = self.skip_reader.doc_num_bits();
|
||||||
let num_consumed_bytes = self.doc_decoder.uncompress_block_sorted(
|
let num_consumed_bytes = self.doc_decoder
|
||||||
self.remaining_data.as_ref(),
|
.uncompress_block_sorted(
|
||||||
self.doc_offset,
|
self.remaining_data.as_ref(),
|
||||||
num_bits,
|
self.doc_offset,
|
||||||
);
|
num_bits);
|
||||||
self.remaining_data.advance(num_consumed_bytes);
|
self.remaining_data.advance(num_consumed_bytes);
|
||||||
let tf_num_bits = self.skip_reader.tf_num_bits();
|
let tf_num_bits = self.skip_reader.tf_num_bits();
|
||||||
match self.freq_reading_option {
|
match self.freq_reading_option {
|
||||||
@@ -558,9 +552,9 @@ impl BlockSegmentPostings {
|
|||||||
self.remaining_data.advance(num_bytes_to_skip);
|
self.remaining_data.advance(num_bytes_to_skip);
|
||||||
}
|
}
|
||||||
FreqReadingOption::ReadFreq => {
|
FreqReadingOption::ReadFreq => {
|
||||||
let num_consumed_bytes = self
|
let num_consumed_bytes = self.freq_decoder
|
||||||
.freq_decoder
|
.uncompress_block_unsorted(self.remaining_data.as_ref(),
|
||||||
.uncompress_block_unsorted(self.remaining_data.as_ref(), tf_num_bits);
|
tf_num_bits);
|
||||||
self.remaining_data.advance(num_consumed_bytes);
|
self.remaining_data.advance(num_consumed_bytes);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -600,6 +594,7 @@ impl BlockSegmentPostings {
|
|||||||
doc_offset: 0,
|
doc_offset: 0,
|
||||||
doc_freq: 0,
|
doc_freq: 0,
|
||||||
|
|
||||||
|
|
||||||
remaining_data: OwnedRead::new(vec![]),
|
remaining_data: OwnedRead::new(vec![]),
|
||||||
skip_reader: SkipReader::new(OwnedRead::new(vec![]), IndexRecordOption::Basic),
|
skip_reader: SkipReader::new(OwnedRead::new(vec![]), IndexRecordOption::Basic),
|
||||||
}
|
}
|
||||||
@@ -621,19 +616,19 @@ impl<'b> Streamer<'b> for BlockSegmentPostings {
|
|||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
mod tests {
|
mod tests {
|
||||||
|
|
||||||
use super::search_within_block;
|
|
||||||
use super::BlockSegmentPostings;
|
use super::BlockSegmentPostings;
|
||||||
use super::BlockSegmentPostingsSkipResult;
|
|
||||||
use super::SegmentPostings;
|
use super::SegmentPostings;
|
||||||
use common::HasLen;
|
use common::HasLen;
|
||||||
use core::Index;
|
use core::Index;
|
||||||
use docset::DocSet;
|
use docset::DocSet;
|
||||||
use fst::Streamer;
|
use fst::Streamer;
|
||||||
use schema::IndexRecordOption;
|
use schema::IndexRecordOption;
|
||||||
use schema::Schema;
|
use schema::SchemaBuilder;
|
||||||
use schema::Term;
|
use schema::Term;
|
||||||
use schema::INT_INDEXED;
|
use schema::INT_INDEXED;
|
||||||
|
use super::BlockSegmentPostingsSkipResult;
|
||||||
use DocId;
|
use DocId;
|
||||||
|
use super::search_within_block;
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn test_empty_segment_postings() {
|
fn test_empty_segment_postings() {
|
||||||
@@ -650,6 +645,7 @@ mod tests {
|
|||||||
assert_eq!(postings.doc_freq(), 0);
|
assert_eq!(postings.doc_freq(), 0);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
fn search_within_block_trivial_but_slow(block: &[u32], target: u32) -> usize {
|
fn search_within_block_trivial_but_slow(block: &[u32], target: u32) -> usize {
|
||||||
block
|
block
|
||||||
.iter()
|
.iter()
|
||||||
@@ -657,15 +653,11 @@ mod tests {
|
|||||||
.enumerate()
|
.enumerate()
|
||||||
.filter(|&(_, ref val)| *val >= target)
|
.filter(|&(_, ref val)| *val >= target)
|
||||||
.next()
|
.next()
|
||||||
.unwrap()
|
.unwrap().0
|
||||||
.0
|
|
||||||
}
|
}
|
||||||
|
|
||||||
fn util_test_search_within_block(block: &[u32], target: u32) {
|
fn util_test_search_within_block(block: &[u32], target: u32) {
|
||||||
assert_eq!(
|
assert_eq!(search_within_block(block, target), search_within_block_trivial_but_slow(block, target));
|
||||||
search_within_block(block, target),
|
|
||||||
search_within_block_trivial_but_slow(block, target)
|
|
||||||
);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
fn util_test_search_within_block_all(block: &[u32]) {
|
fn util_test_search_within_block_all(block: &[u32]) {
|
||||||
@@ -685,7 +677,7 @@ mod tests {
|
|||||||
#[test]
|
#[test]
|
||||||
fn test_search_within_block() {
|
fn test_search_within_block() {
|
||||||
for len in 1u32..128u32 {
|
for len in 1u32..128u32 {
|
||||||
let v: Vec<u32> = (0..len).map(|i| i * 2).collect();
|
let v: Vec<u32> = (0..len).map(|i| i*2).collect();
|
||||||
util_test_search_within_block_all(&v[..]);
|
util_test_search_within_block_all(&v[..]);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -707,7 +699,7 @@ mod tests {
|
|||||||
}
|
}
|
||||||
|
|
||||||
fn build_block_postings(docs: Vec<DocId>) -> BlockSegmentPostings {
|
fn build_block_postings(docs: Vec<DocId>) -> BlockSegmentPostings {
|
||||||
let mut schema_builder = Schema::builder();
|
let mut schema_builder = SchemaBuilder::default();
|
||||||
let int_field = schema_builder.add_u64_field("id", INT_INDEXED);
|
let int_field = schema_builder.add_u64_field("id", INT_INDEXED);
|
||||||
let schema = schema_builder.build();
|
let schema = schema_builder.build();
|
||||||
let index = Index::create_in_ram(schema);
|
let index = Index::create_in_ram(schema);
|
||||||
@@ -734,22 +726,14 @@ mod tests {
|
|||||||
fn test_block_segment_postings_skip() {
|
fn test_block_segment_postings_skip() {
|
||||||
for i in 0..4 {
|
for i in 0..4 {
|
||||||
let mut block_postings = build_block_postings(vec![3]);
|
let mut block_postings = build_block_postings(vec![3]);
|
||||||
assert_eq!(
|
assert_eq!(block_postings.skip_to(i), BlockSegmentPostingsSkipResult::Success(0u32));
|
||||||
block_postings.skip_to(i),
|
assert_eq!(block_postings.skip_to(i), BlockSegmentPostingsSkipResult::Terminated);
|
||||||
BlockSegmentPostingsSkipResult::Success(0u32)
|
|
||||||
);
|
|
||||||
assert_eq!(
|
|
||||||
block_postings.skip_to(i),
|
|
||||||
BlockSegmentPostingsSkipResult::Terminated
|
|
||||||
);
|
|
||||||
}
|
}
|
||||||
let mut block_postings = build_block_postings(vec![3]);
|
let mut block_postings = build_block_postings(vec![3]);
|
||||||
assert_eq!(
|
assert_eq!(block_postings.skip_to(4u32), BlockSegmentPostingsSkipResult::Terminated);
|
||||||
block_postings.skip_to(4u32),
|
|
||||||
BlockSegmentPostingsSkipResult::Terminated
|
|
||||||
);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn test_block_segment_postings_skip2() {
|
fn test_block_segment_postings_skip2() {
|
||||||
let mut docs = vec![0];
|
let mut docs = vec![0];
|
||||||
@@ -757,28 +741,19 @@ mod tests {
|
|||||||
docs.push((i * i / 100) + i);
|
docs.push((i * i / 100) + i);
|
||||||
}
|
}
|
||||||
let mut block_postings = build_block_postings(docs.clone());
|
let mut block_postings = build_block_postings(docs.clone());
|
||||||
for i in vec![0, 424, 10000] {
|
for i in vec![0, 424, 10000] {
|
||||||
assert_eq!(
|
assert_eq!(block_postings.skip_to(i), BlockSegmentPostingsSkipResult::Success(0u32));
|
||||||
block_postings.skip_to(i),
|
|
||||||
BlockSegmentPostingsSkipResult::Success(0u32)
|
|
||||||
);
|
|
||||||
let docs = block_postings.docs();
|
let docs = block_postings.docs();
|
||||||
assert!(docs[0] <= i);
|
assert!(docs[0] <= i);
|
||||||
assert!(docs.last().cloned().unwrap_or(0u32) >= i);
|
assert!(docs.last().cloned().unwrap_or(0u32) >= i);
|
||||||
}
|
}
|
||||||
assert_eq!(
|
assert_eq!(block_postings.skip_to(100_000), BlockSegmentPostingsSkipResult::Terminated);
|
||||||
block_postings.skip_to(100_000),
|
assert_eq!(block_postings.skip_to(101_000), BlockSegmentPostingsSkipResult::Terminated);
|
||||||
BlockSegmentPostingsSkipResult::Terminated
|
|
||||||
);
|
|
||||||
assert_eq!(
|
|
||||||
block_postings.skip_to(101_000),
|
|
||||||
BlockSegmentPostingsSkipResult::Terminated
|
|
||||||
);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn test_reset_block_segment_postings() {
|
fn test_reset_block_segment_postings() {
|
||||||
let mut schema_builder = Schema::builder();
|
let mut schema_builder = SchemaBuilder::default();
|
||||||
let int_field = schema_builder.add_u64_field("id", INT_INDEXED);
|
let int_field = schema_builder.add_u64_field("id", INT_INDEXED);
|
||||||
let schema = schema_builder.build();
|
let schema = schema_builder.build();
|
||||||
let index = Index::create_in_ram(schema);
|
let index = Index::create_in_ram(schema);
|
||||||
|
|||||||
@@ -1,18 +1,18 @@
use super::TermInfo;
use common::{BinarySerializable, VInt};
use common::{VInt, BinarySerializable};
use common::{CompositeWrite, CountingWriter};
use postings::compression::{VIntEncoder, BlockEncoder, COMPRESSION_BLOCK_SIZE};
use core::Segment;
use directory::WritePtr;
use positions::PositionSerializer;
use postings::compression::{BlockEncoder, VIntEncoder, COMPRESSION_BLOCK_SIZE};
use postings::skip::SkipSerializer;
use postings::USE_SKIP_INFO_LIMIT;
use schema::Schema;
use schema::{Field, FieldEntry, FieldType};
use std::io::{self, Write};
use termdict::{TermDictionaryBuilder, TermOrdinal};
use DocId;
use Result;
use postings::USE_SKIP_INFO_LIMIT;
use postings::skip::SkipSerializer;
use positions::PositionSerializer;

/// `PostingsSerializer` is in charge of serializing
/// postings on disk, in the
@@ -100,11 +100,11 @@ impl InvertedIndexSerializer {
let positionsidx_write = self.positionsidx_write.for_field(field);
let field_type: FieldType = (*field_entry.field_type()).clone();
FieldSerializer::new(
&field_type,
field_type,
term_dictionary_write,
postings_write,
positions_write,
positionsidx_write,
positionsidx_write
)
}

@@ -131,11 +131,11 @@ pub struct FieldSerializer<'a> {

impl<'a> FieldSerializer<'a> {
fn new(
field_type: &FieldType,
field_type: FieldType,
term_dictionary_write: &'a mut CountingWriter<WritePtr>,
postings_write: &'a mut CountingWriter<WritePtr>,
positions_write: &'a mut CountingWriter<WritePtr>,
positionsidx_write: &'a mut CountingWriter<WritePtr>,
positionsidx_write: &'a mut CountingWriter<WritePtr>
) -> io::Result<FieldSerializer<'a>> {
let (term_freq_enabled, position_enabled): (bool, bool) = match field_type {
FieldType::Str(ref text_options) => {
@@ -152,9 +152,8 @@ impl<'a> FieldSerializer<'a> {
_ => (false, false),
};
let term_dictionary_builder =
TermDictionaryBuilder::new(term_dictionary_write, &field_type)?;
TermDictionaryBuilder::new(term_dictionary_write, field_type)?;
let postings_serializer =
let postings_serializer = PostingsSerializer::new(postings_write, term_freq_enabled, position_enabled);
PostingsSerializer::new(postings_write, term_freq_enabled, position_enabled);
let positions_serializer_opt = if position_enabled {
Some(PositionSerializer::new(positions_write, positionsidx_write))
} else {
@@ -172,15 +171,14 @@ impl<'a> FieldSerializer<'a> {
}

fn current_term_info(&self) -> TermInfo {
let positions_idx = self
let positions_idx = self.positions_serializer_opt
.positions_serializer_opt
.as_ref()
.map(|positions_serializer| positions_serializer.positions_idx())
.unwrap_or(0u64);
TermInfo {
doc_freq: 0,
postings_offset: self.postings_serializer.addr(),
positions_idx,
positions_idx
}
}

@@ -255,7 +253,7 @@ impl<'a> FieldSerializer<'a> {
struct Block {
doc_ids: [DocId; COMPRESSION_BLOCK_SIZE],
term_freqs: [u32; COMPRESSION_BLOCK_SIZE],
len: usize,
len: usize
}

impl Block {
@@ -263,7 +261,7 @@ impl Block {
Block {
doc_ids: [0u32; COMPRESSION_BLOCK_SIZE],
term_freqs: [0u32; COMPRESSION_BLOCK_SIZE],
len: 0,
len: 0
}
}

@@ -314,12 +312,9 @@ pub struct PostingsSerializer<W: Write> {
termfreq_sum_enabled: bool,
}

impl<W: Write> PostingsSerializer<W> {
pub fn new(
pub fn new(write: W, termfreq_enabled: bool, termfreq_sum_enabled: bool) -> PostingsSerializer<W> {
write: W,
termfreq_enabled: bool,
termfreq_sum_enabled: bool,
) -> PostingsSerializer<W> {
PostingsSerializer {
output_write: CountingWriter::wrap(write),

@@ -342,16 +337,14 @@ impl<W: Write> PostingsSerializer<W> {
.block_encoder
.compress_block_sorted(&self.block.doc_ids(), self.last_doc_id_encoded);
self.last_doc_id_encoded = self.block.last_doc();
self.skip_write
self.skip_write.write_doc(self.last_doc_id_encoded, num_bits);
.write_doc(self.last_doc_id_encoded, num_bits);
// last el block 0, offset block 1,
self.postings_write.extend(block_encoded);
}
if self.termfreq_enabled {
// encode the term_freqs
let (num_bits, block_encoded): (u8, &[u8]) = self
let (num_bits, block_encoded): (u8, &[u8]) =
.block_encoder
self.block_encoder.compress_block_unsorted(&self.block.term_freqs());
.compress_block_unsorted(&self.block.term_freqs());
self.postings_write.extend(block_encoded);
self.skip_write.write_term_freq(num_bits);
if self.termfreq_sum_enabled {
@@ -382,15 +375,13 @@ impl<W: Write> PostingsSerializer<W> {
// In that case, the remaining part is encoded
// using variable int encoding.
{
let block_encoded = self
let block_encoded = self.block_encoder
.block_encoder
.compress_vint_sorted(&self.block.doc_ids(), self.last_doc_id_encoded);
self.postings_write.write_all(block_encoded)?;
}
// ... Idem for term frequencies
if self.termfreq_enabled {
let block_encoded = self
let block_encoded = self.block_encoder
.block_encoder
.compress_vint_unsorted(self.block.term_freqs());
self.postings_write.write_all(block_encoded)?;
}
@@ -401,6 +392,7 @@ impl<W: Write> PostingsSerializer<W> {
VInt(skip_data.len() as u64).serialize(&mut self.output_write)?;
self.output_write.write_all(skip_data)?;
self.output_write.write_all(&self.postings_write[..])?;

} else {
self.output_write.write_all(&self.postings_write[..])?;
}
@@ -1,8 +1,8 @@
use DocId;
use common::BinarySerializable;
use owned_read::OwnedRead;
use postings::compression::COMPRESSION_BLOCK_SIZE;
use schema::IndexRecordOption;
use DocId;

pub struct SkipSerializer {
buffer: Vec<u8>,
@@ -18,11 +18,8 @@ impl SkipSerializer {
}

pub fn write_doc(&mut self, last_doc: DocId, doc_num_bits: u8) {
assert!(
assert!(last_doc > self.prev_doc, "write_doc(...) called with non-increasing doc ids. \
last_doc > self.prev_doc,
Did you forget to call clear maybe?");
"write_doc(...) called with non-increasing doc ids. \
Did you forget to call clear maybe?"
);
let delta_doc = last_doc - self.prev_doc;
self.prev_doc = last_doc;
delta_doc.serialize(&mut self.buffer).unwrap();
@@ -33,10 +30,9 @@ impl SkipSerializer {
self.buffer.push(tf_num_bits);
}

pub fn write_total_term_freq(&mut self, tf_sum: u32) {
tf_sum
tf_sum.serialize(&mut self.buffer).expect("Should never fail");
.serialize(&mut self.buffer)
.expect("Should never fail");
}

pub fn data(&self) -> &[u8] {
@@ -107,32 +103,33 @@ impl SkipReader {
} else {
let doc_delta = u32::deserialize(&mut self.owned_read).expect("Skip data corrupted");
self.doc += doc_delta as DocId;
self.doc_num_bits = self.owned_read.get(0);
match self.skip_info {
IndexRecordOption::Basic => {
self.owned_read.advance(1);
}
IndexRecordOption::WithFreqs => {
IndexRecordOption::WithFreqs=> {
self.tf_num_bits = self.owned_read.get(1);
self.owned_read.advance(2);
}
IndexRecordOption::WithFreqsAndPositions => {
self.tf_num_bits = self.owned_read.get(1);
self.owned_read.advance(2);
self.tf_sum =
self.tf_sum = u32::deserialize(&mut self.owned_read)
u32::deserialize(&mut self.owned_read).expect("Failed reading tf_sum");
.expect("Failed reading tf_sum");
}
}
true
}

}
}

#[cfg(test)]
mod tests {

use super::IndexRecordOption;
use super::{SkipReader, SkipSerializer};
use super::IndexRecordOption;
use owned_read::OwnedRead;

#[test]
@@ -174,4 +171,4 @@ mod tests {
assert_eq!(skip_reader.doc_num_bits(), 5u8);
assert!(!skip_reader.advance());
}
}
@@ -174,8 +174,8 @@ mod tests {

#[cfg(all(test, feature = "unstable"))]
mod bench {
use super::super::MemoryArena;
use super::ExpUnrolledLinkedList;
use tantivy_memory_arena::MemoryArena;
use test::Bencher;

const NUM_STACK: usize = 10_000;
@@ -199,19 +199,20 @@ mod bench {

#[bench]
fn bench_push_stack(bench: &mut Bencher) {
let heap = MemoryArena::new();
bench.iter(|| {
let mut heap = MemoryArena::new();
let mut stacks = Vec::with_capacity(100);
for _ in 0..NUM_STACK {
let mut stack = ExpUnrolledLinkedList::new(&mut heap);
let (_, stack) = heap.allocate_object::<ExpUnrolledLinkedList>();
stacks.push(stack);
}
for s in 0..NUM_STACK {
for i in 0u32..STACK_SIZE {
let t = s * 392017 % NUM_STACK;
stacks[t].push(i, &mut heap);
stacks[t].push(i, &heap);
}
}
heap.clear();
});
}
}
@@ -47,7 +47,7 @@ impl Addr {
}

/// Returns the `Addr` object for `addr + offset`
pub fn offset(self, offset: u32) -> Addr {
pub fn offset(&self, offset: u32) -> Addr {
Addr(self.0.wrapping_add(offset))
}

@@ -55,16 +55,16 @@ impl Addr {
Addr((page_id << NUM_BITS_PAGE_ADDR | local_addr) as u32)
}

fn page_id(self) -> usize {
fn page_id(&self) -> usize {
(self.0 as usize) >> NUM_BITS_PAGE_ADDR
}

fn page_local_addr(self) -> usize {
fn page_local_addr(&self) -> usize {
(self.0 as usize) & (PAGE_SIZE - 1)
}

/// Returns true if and only if the `Addr` is null.
pub fn is_null(self) -> bool {
pub fn is_null(&self) -> bool {
self.0 == u32::max_value()
}
}
@@ -233,12 +233,12 @@ impl Page {

#[inline(always)]
pub(crate) unsafe fn get_ptr(&self, addr: usize) -> *const u8 {
self.data.as_ptr().add(addr)
self.data.as_ptr().offset(addr as isize)
}

#[inline(always)]
pub(crate) unsafe fn get_mut_ptr(&mut self, addr: usize) -> *mut u8 {
self.data.as_mut_ptr().add(addr)
self.data.as_mut_ptr().offset(addr as isize)
}
}
@@ -4,7 +4,6 @@ const M: u32 = 0x5bd1_e995;

#[inline(always)]
pub fn murmurhash2(key: &[u8]) -> u32 {
#[cfg_attr(feature = "cargo-clippy", allow(clippy::cast_ptr_alignment))]
let mut key_ptr: *const u32 = key.as_ptr() as *const u32;
let len = key.len() as u32;
let mut h: u32 = SEED ^ len;
@@ -61,7 +61,7 @@ impl Default for KeyValue {
}

impl KeyValue {
fn is_empty(self) -> bool {
fn is_empty(&self) -> bool {
self.key_value_addr.is_null()
}
}
@@ -59,10 +59,10 @@ impl DocSet for AllScorer {
}
}
if self.doc < self.max_doc {
true
return true;
} else {
self.state = State::Finished;
false
return false;
}
}

@@ -86,12 +86,12 @@ mod tests {

use super::AllQuery;
use query::Query;
use schema::{Schema, TEXT};
use schema::{SchemaBuilder, TEXT};
use Index;

#[test]
fn test_all_query() {
let mut schema_builder = Schema::builder();
let mut schema_builder = SchemaBuilder::default();
let field = schema_builder.add_text_field("text", TEXT);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
@@ -11,7 +11,7 @@ use Result;
/// A weight struct for Fuzzy Term and Regex Queries
pub struct AutomatonWeight<A>
where
A: Automaton + Send + Sync + 'static,
A: Automaton,
{
field: Field,
automaton: A,
@@ -19,7 +19,7 @@ where

impl<A> AutomatonWeight<A>
where
A: Automaton + Send + Sync + 'static,
A: Automaton,
{
/// Create a new AutomationWeight
pub fn new(field: Field, automaton: A) -> AutomatonWeight<A> {
@@ -34,7 +34,7 @@ where

impl<A> Weight for AutomatonWeight<A>
where
A: Automaton + Send + Sync + 'static,
A: Automaton,
{
fn scorer(&self, reader: &SegmentReader) -> Result<Box<Scorer>> {
let max_doc = reader.max_doc();
@@ -17,9 +17,9 @@ fn cached_tf_component(fieldnorm: u32, average_fieldnorm: f32) -> f32 {

fn compute_tf_cache(average_fieldnorm: f32) -> [f32; 256] {
let mut cache = [0f32; 256];
for (fieldnorm_id, cache_mut) in cache.iter_mut().enumerate() {
for fieldnorm_id in 0..256 {
let fieldnorm = FieldNormReader::id_to_fieldnorm(fieldnorm_id as u8);
*cache_mut = cached_tf_component(fieldnorm, average_fieldnorm);
cache[fieldnorm_id] = cached_tf_component(fieldnorm, average_fieldnorm);
}
cache
}
@@ -54,7 +54,7 @@ impl BM25Weight {
for segment_reader in searcher.segment_readers() {
let inverted_index = segment_reader.inverted_index(field);
total_num_tokens += inverted_index.total_num_tokens();
total_num_docs += u64::from(segment_reader.max_doc());
total_num_docs += segment_reader.max_doc() as u64;
}
let average_fieldnorm = total_num_tokens as f32 / total_num_docs as f32;

@@ -63,7 +63,8 @@ impl BM25Weight {
.map(|term| {
let term_doc_freq = searcher.doc_freq(term);
idf(term_doc_freq, total_num_docs)
}).sum::<f32>();
})
.sum::<f32>();
BM25Weight::new(idf, average_fieldnorm)
}
@@ -5,7 +5,6 @@ use query::TermQuery;
use query::Weight;
use schema::IndexRecordOption;
use schema::Term;
use std::collections::BTreeSet;
use Result;
use Searcher;

@@ -28,7 +27,7 @@ impl Clone for BooleanQuery {
fn clone(&self) -> Self {
self.subqueries
.iter()
.map(|(occur, subquery)| (*occur, subquery.box_clone()))
.map(|(x, y)| (x.clone(), y.box_clone()))
.collect::<Vec<_>>()
.into()
}
@@ -42,20 +41,14 @@ impl From<Vec<(Occur, Box<Query>)>> for BooleanQuery {

impl Query for BooleanQuery {
fn weight(&self, searcher: &Searcher, scoring_enabled: bool) -> Result<Box<Weight>> {
let sub_weights = self
let sub_weights = self.subqueries
.subqueries
.iter()
.map(|&(ref occur, ref subquery)| {
Ok((*occur, subquery.weight(searcher, scoring_enabled)?))
}).collect::<Result<_>>()?;
})
.collect::<Result<_>>()?;
Ok(Box::new(BooleanWeight::new(sub_weights, scoring_enabled)))
}

fn query_terms(&self, term_set: &mut BTreeSet<Term>) {
for (_occur, subquery) in &self.subqueries {
subquery.query_terms(term_set);
}
}
}

impl BooleanQuery {
@@ -68,7 +61,8 @@ impl BooleanQuery {
let term_query: Box<Query> =
Box::new(TermQuery::new(term, IndexRecordOption::WithFreqs));
(Occur::Should, term_query)
}).collect();
})
.collect();
BooleanQuery::from(occur_term_queries)
}
@@ -39,7 +39,7 @@ where
}

let scorer: Box<Scorer> = Box::new(Union::<_, TScoreCombiner>::from(scorers));
scorer
return scorer;
}

pub struct BooleanWeight {
@@ -19,11 +19,10 @@ mod tests {
use query::Scorer;
use query::TermQuery;
use schema::*;
use DocId;
use Index;

fn aux_test_helper() -> (Index, Field) {
let mut schema_builder = Schema::builder();
let mut schema_builder = SchemaBuilder::default();
let text_field = schema_builder.add_text_field("text", TEXT);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
@@ -70,7 +69,7 @@ mod tests {
let query_parser = QueryParser::for_index(&index, vec![text_field]);
let query = query_parser.parse_query("+a").unwrap();
let searcher = index.searcher();
let weight = query.weight(&searcher, true).unwrap();
let weight = query.weight(&*searcher, true).unwrap();
let scorer = weight.scorer(searcher.segment_reader(0u32)).unwrap();
assert!(Downcast::<TermScorer>::is_type(&*scorer));
}
@@ -82,13 +81,13 @@ mod tests {
let searcher = index.searcher();
{
let query = query_parser.parse_query("+a +b +c").unwrap();
let weight = query.weight(&searcher, true).unwrap();
let weight = query.weight(&*searcher, true).unwrap();
let scorer = weight.scorer(searcher.segment_reader(0u32)).unwrap();
assert!(Downcast::<Intersection<TermScorer>>::is_type(&*scorer));
}
{
let query = query_parser.parse_query("+a +(b c)").unwrap();
let weight = query.weight(&searcher, true).unwrap();
let weight = query.weight(&*searcher, true).unwrap();
let scorer = weight.scorer(searcher.segment_reader(0u32)).unwrap();
assert!(Downcast::<Intersection<Box<Scorer>>>::is_type(&*scorer));
}
@@ -101,7 +100,7 @@ mod tests {
let searcher = index.searcher();
{
let query = query_parser.parse_query("+a b").unwrap();
let weight = query.weight(&searcher, true).unwrap();
let weight = query.weight(&*searcher, true).unwrap();
let scorer = weight.scorer(searcher.segment_reader(0u32)).unwrap();
assert!(Downcast::<
RequiredOptionalScorer<Box<Scorer>, Box<Scorer>, SumWithCoordsCombiner>,
@@ -109,7 +108,7 @@ mod tests {
}
{
let query = query_parser.parse_query("+a b").unwrap();
let weight = query.weight(&searcher, false).unwrap();
let weight = query.weight(&*searcher, false).unwrap();
let scorer = weight.scorer(searcher.segment_reader(0u32)).unwrap();
println!("{:?}", scorer.type_name());
assert!(Downcast::<TermScorer>::is_type(&*scorer));
@@ -131,13 +130,9 @@ mod tests {

let matching_docs = |boolean_query: &Query| {
let searcher = index.searcher();
let test_docs = searcher.search(boolean_query, &TestCollector).unwrap();
let mut test_collector = TestCollector::default();
test_docs
searcher.search(boolean_query, &mut test_collector).unwrap();
.docs()
test_collector.docs()
.iter()
.cloned()
.map(|doc| doc.1)
.collect::<Vec<DocId>>()
};

{
@@ -191,8 +186,9 @@ mod tests {

let score_docs = |boolean_query: &Query| {
let searcher = index.searcher();
let fruit = searcher.search(boolean_query, &TestCollector).unwrap();
let mut test_collector = TestCollector::default();
fruit.scores().to_vec()
searcher.search(boolean_query, &mut test_collector).unwrap();
test_collector.scores()
};

{
@@ -1,11 +1,11 @@
use super::Scorer;
use query::Query;
use query::Weight;
use DocId;
use DocSet;
use Result;
use Score;
use DocId;
use query::Query;
use Result;
use Searcher;
use query::Weight;
use SegmentReader;

/// `EmptyQuery` is a dummy `Query` in which no document matches.
96 src/query/fastfield_filter/fastfield_filter_query.rs Normal file
@@ -0,0 +1,96 @@
use super::FastFieldFilterWeight;
use query::Query;
use query::Weight;
use Result;
use Searcher;
use schema::Field;
use super::RangeU64;
use std::collections::Bound;
use common::i64_to_u64;
use schema::Schema;
use schema::FieldEntry;
use TantivyError;
use schema::Type;

#[derive(Debug, Copy, Clone)]
enum TypeInt {
U64, I64
}

impl TypeInt {
fn value_type(self) -> Type {
match self {
TypeInt::I64 => Type::I64,
TypeInt::U64 => Type::U64
}
}
}

//< TODO i64 range Debug string will not look good in the
// current implementation. Defer conversion to the scorer, or
// back convert values for Debug.
#[derive(Debug, Clone)]
pub struct FastFieldFilterQuery {
field: Field,
range: RangeU64,
int_type: TypeInt, //< just here to check the schema at runtime, as we call `.weight`
}

fn convert_bound_to_u64(bound: Bound<i64>) -> Bound<u64> {
match bound {
Bound::Included(val) =>
Bound::Excluded(i64_to_u64(val)),
Bound::Excluded(val) =>
Bound::Excluded(i64_to_u64(val)),
Bound::Unbounded => Bound::Unbounded
}
}

impl FastFieldFilterQuery {

pub fn new_u64(field: Field, low: Bound<u64>, high: Bound<u64>) -> FastFieldFilterQuery {
FastFieldFilterQuery {
field: field,
range: RangeU64 { low, high },
int_type: TypeInt::U64
}
}

pub fn new_i64(field: Field, low: Bound<i64>, high: Bound<i64>) -> FastFieldFilterQuery {
FastFieldFilterQuery {
field: field,
range: RangeU64 {
low: convert_bound_to_u64(low),
high: convert_bound_to_u64(high)
},
int_type: TypeInt::I64
}
}

fn validate_schema(&self, schema: &Schema) -> Result<()> {
let field_entry: &FieldEntry = schema.get_field_entry(self.field);
if !field_entry.is_int_fast() {
return Err(TantivyError::SchemaError(format!(
"Field {:?} is not an int fast field",
field_entry.name()
)));
}
let expected_value_type = self.int_type.value_type();
if field_entry.field_type().value_type() != self.int_type.value_type() {
return Err(TantivyError::SchemaError(format!(
"Field {:?} is not a {:?}",
field_entry.name(),
expected_value_type
)));
}
Ok(())
}
}

impl Query for FastFieldFilterQuery {
fn weight(&self, searcher: &Searcher, _scoring_enabled: bool) -> Result<Box<Weight>> {
self.validate_schema(searcher.schema())?;
Ok(Box::new(FastFieldFilterWeight::new(self.field, self.range.clone())))
}
}
58 src/query/fastfield_filter/fastfield_filter_scorer.rs Normal file
@@ -0,0 +1,58 @@
use query::Scorer;
use fastfield::FastFieldReader;
use DocId;
use DocSet;
use query::fastfield_filter::RangeU64;

pub(crate) struct FastFieldFilterScorer {
fastfield_reader: FastFieldReader<u64>,
range: RangeU64,
max_doc: DocId,
doc: DocId,
}

impl FastFieldFilterScorer {
pub fn new(fastfield_reader: FastFieldReader<u64>,
range: RangeU64,
max_doc: DocId) -> FastFieldFilterScorer {
FastFieldFilterScorer {
fastfield_reader,
range,
max_doc,
doc: 0u32,
}
}

fn within_range(&self, doc: DocId) -> bool {
let val = self.fastfield_reader.get(doc);
self.range.contains(val)
}

}

impl DocSet for FastFieldFilterScorer {
fn advance(&mut self) -> bool {
for doc in (self.doc + 1)..self.max_doc {
if self.within_range(doc) {
self.doc = doc;
return true;
}
}
self.doc = self.max_doc;
return false;
}

fn doc(&self) -> u32 {
self.doc
}

fn size_hint(&self) -> u32 {
self.max_doc
}
}

impl Scorer for FastFieldFilterScorer {
fn score(&mut self) -> f32 {
1f32
}
}
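A minimal usage sketch for the FastFieldFilterQuery/FastFieldFilterScorer pair added above, assuming the APIs that appear elsewhere in this diff (SchemaBuilder, Index::create_in_ram, Query::weight, Weight::scorer, DocSet::advance). The FAST schema flag, the doc! macro, writer(50_000_000), load_searchers() and the visibility/re-export of FastFieldFilterQuery are assumptions, not something this diff shows:

use std::collections::Bound;

// Hypothetical set-up: `FAST`, `doc!`, `writer(...)` and `load_searchers()` are
// assumed tantivy APIs of this era, not shown in this diff.
let mut schema_builder = SchemaBuilder::default();
let id_field = schema_builder.add_u64_field("id", FAST);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer(50_000_000).unwrap();
for id in 0u64..100u64 {
    index_writer.add_document(doc!(id_field => id));
}
index_writer.commit().unwrap();
index.load_searchers().unwrap();
let searcher = index.searcher();

// Keep only documents whose fast-field value lies in [10, 20).
let query = FastFieldFilterQuery::new_u64(id_field, Bound::Included(10), Bound::Excluded(20));
let weight = query.weight(&*searcher, false).unwrap();
let mut scorer = weight.scorer(searcher.segment_reader(0u32)).unwrap();
while scorer.advance() {
    println!("matching doc: {}", scorer.doc());
}

Because the scorer walks every doc id and checks the fast-field value, this behaves as a filter rather than an inverted-index lookup, and every matching document gets the constant score 1.0.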
Some files were not shown because too many files have changed in this diff.