Mirror of https://github.com/quickwit-oss/tantivy.git (synced 2026-01-03 15:52:55 +00:00)

Compare commits: 0.10.0...incrementa (47 commits)
- 45da5829bc
- e2f7aab39f
- 1b9cbdb672
- a8f3cf9679
- 4b9c1dce69
- d74f71bbef
- 5196ca41d8
- 4959e06151
- c1635c13f6
- 135e0ea2e9
- f283bfd7ab
- 9f74786db2
- 32e5d7a0c7
- 84c615cff1
- 039c0a0863
- b3b0138b82
- ea56160cdc
- 028b0a749c
- 941f06eb9f
- 04832a86eb
- beb8e990cd
- 001af3876f
- f428f344da
- 143f78eced
- 754b55eee5
- 280ea1209c
- 0154dbe477
- efd1af1325
- c91eb7fba7
- 6eb4e08636
- c3231ca252
- 7211df6719
- f27ce6412c
- 8197a9921f
- b0e23b5715
- 0167151f5b
- 0668949390
- 94d0e52786
- 818a0abbee
- 4e6dcf3cbe
- af7ea1422a
- 498057c5b7
- 5095e6b010
- 1aebc87ee3
- 9fb5058b29
- 158e0a28ba
- 3576a006f7
.travis.yml (23)
@@ -38,9 +38,8 @@ matrix:
    # Linux
    #- env: TARGET=aarch64-unknown-linux-gnu
    #- env: TARGET=i686-unknown-linux-gnu
    - env: TARGET=x86_64-unknown-linux-gnu CODECOV=1 UPLOAD_DOCS=1
    - env: TARGET=x86_64-unknown-linux-gnu CODECOV=1 #UPLOAD_DOCS=1
    # - env: TARGET=x86_64-unknown-linux-musl CODECOV=1

    # OSX
    #- env: TARGET=x86_64-apple-darwin
    #  os: osx
@@ -48,6 +47,7 @@ matrix:
before_install:
  - set -e
  - rustup self update
  - rustup component add rustfmt

install:
  - sh ci/install.sh
@@ -61,6 +61,7 @@ before_script:

script:
  - bash ci/script.sh
  - cargo fmt --all -- --check

before_deploy:
  - sh ci/before_deploy.sh
@@ -70,15 +71,15 @@ after_success:
  - if [[ -v GH_TOKEN ]]; then echo "GH TOKEN IS SET"; else echo "GH TOKEN NOT SET"; fi
  - if [[ -v UPLOAD_DOCS ]]; then cargo doc; cargo doc-upload; else echo "doc upload disabled."; fi

cache: cargo
before_cache:
  # Travis can't cache files that are not readable by "others"
  - chmod -R a+r $HOME/.cargo
  - find ./target/debug -type f -maxdepth 1 -delete
  - rm -f ./target/.rustc_info.json
  - rm -fr ./target/debug/{deps,.fingerprint}/tantivy*
  - rm -r target/debug/examples/
  - ls -1 examples/ | sed -e 's/\.rs$//' | xargs -I "{}" find target/* -name "*{}*" -type f -delete
#cache: cargo
#before_cache:
# # Travis can't cache files that are not readable by "others"
# - chmod -R a+r $HOME/.cargo
# - find ./target/debug -type f -maxdepth 1 -delete
# - rm -f ./target/.rustc_info.json
# - rm -fr ./target/debug/{deps,.fingerprint}/tantivy*
# - rm -r target/debug/examples/
# - ls -1 examples/ | sed -e 's/\.rs$//' | xargs -I "{}" find target/* -name "*{}*" -type f -delete

#branches:
#  only:
CHANGELOG.md (25)
@@ -1,3 +1,28 @@

Tantivy 0.11.0
=====================

- Added an f64 field. Internally it reuses the u64 code, the same way i64 does. (@fdb-hiroshima)
- Various bugfixes in the query parser.
  - Better handling of hyphens in the query parser. (#609)
  - Better handling of whitespace.
- Closes #498 - add support for Elastic-style unbounded range queries for alphanumeric types, e.g. "title:>hello", "weight:>=70.5", "height:<200". (@petr-tik)
- API change around `Box<BoxableTokenizer>`. See details in #629.
- Avoid rebuilding the Regex automaton whenever a regex query is reused. #630 (@brainlock)

## How to update?

- `Box<dyn BoxableTokenizer>` has been replaced by a `BoxedTokenizer` struct.
- Regexes are now compiled when the `RegexQuery` instance is built. As a result, building one can
  now return an error, and handling the `Result` is required.
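As an illustration of the new Elastic-style range syntax, here is a minimal sketch of feeding such queries through the query parser; the `title` and `weight` fields are made up for the example:

```rust
use tantivy::query::QueryParser;
use tantivy::schema::{Schema, INDEXED, TEXT};
use tantivy::Index;

fn main() -> tantivy::Result<()> {
    // Hypothetical schema, just to have something to parse against.
    let mut schema_builder = Schema::builder();
    let title = schema_builder.add_text_field("title", TEXT);
    let _weight = schema_builder.add_u64_field("weight", INDEXED);
    let index = Index::create_in_ram(schema_builder.build());

    let query_parser = QueryParser::for_index(&index, vec![title]);
    // Unbounded range queries now parse with the Elastic-style comparison operators.
    let _above = query_parser.parse_query("weight:>=70")?;
    let _below = query_parser.parse_query("title:<hello")?;
    Ok(())
}
```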

Tantivy 0.10.1
=====================

- Closes #544. A few users experienced problems with the directory watching system.
  Avoid watching the mmap directory until someone effectively creates a reader that uses
  this functionality.


Tantivy 0.10.0
=====================

Cargo.toml (41)
@@ -1,6 +1,6 @@
[package]
name = "tantivy"
version = "0.10.0"
version = "0.11.0"
authors = ["Paul Masurel <paul.masurel@gmail.com>"]
license = "MIT"
categories = ["database-implementations", "data-structures"]
@@ -15,44 +15,45 @@ edition = "2018"
[dependencies]
base64 = "0.10.0"
byteorder = "1.0"
once_cell = "0.2"
regex = "1.0"
once_cell = "1.0"
regex = {version = "1.3.0", default-features = false, features = ["std"]}
tantivy-fst = "0.1"
memmap = {version = "0.7", optional=true}
lz4 = {version="1.20", optional=true}
snap = {version="0.2"}
derive_builder = "0.7"
atomicwrites = {version="0.2.2", optional=true}
tempfile = "3.0"
log = "0.4"
combine = ">=3.6.0,<4.0.0"
tempdir = "0.3"
serde = "1.0"
serde_derive = "1.0"
serde_json = "1.0"
num_cpus = "1.2"
fs2 = {version="0.4", optional=true}
itertools = "0.8"
levenshtein_automata = {version="0.1", features=["fst_automaton"]}
levenshtein_automata = "0.1"
notify = {version="4", optional=true}
bit-set = "0.5"
uuid = { version = "0.7.2", features = ["v4", "serde"] }
crossbeam = "0.5"
crossbeam = "0.7"
futures = "0.1"
futures-cpupool = "0.1"
owning_ref = "0.4"
stable_deref_trait = "1.0.0"
rust-stemmers = "1.1"
downcast-rs = { version="1.0" }
tantivy-query-grammar = { path="./query-grammar" }
bitpacking = {version="0.8", default-features = false, features=["bitpacker4x"]}
census = "0.2"
fnv = "1.0.6"
owned-read = "0.4"
failure = "0.1"
htmlescape = "0.3.1"
fail = "0.2"
fail = "0.3"
scoped-pool = "1.0"
murmurhash32 = "0.2"
chrono = "0.4"
smallvec = "0.6"

[target.'cfg(windows)'.dependencies]
winapi = "0.3"
@@ -73,13 +74,31 @@ debug-assertions = true
overflow-checks = true

[features]
# By default, no_fail is disabled. We manually enable it when running tests.
default = ["mmap", "no_fail"]
default = ["mmap"]
mmap = ["atomicwrites", "fs2", "memmap", "notify"]
lz4-compression = ["lz4"]
no_fail = ["fail/no_fail"]
failpoints = ["fail/failpoints"]
unstable = [] # useful for benches.
wasm-bindgen = ["uuid/wasm-bindgen"]

[workspace]
members = ["query-grammar", "incremental-search"]

[badges]
travis-ci = { repository = "tantivy-search/tantivy" }

[dev-dependencies.fail]
version = "0.3"
features = ["failpoints"]

# Following the "fail" crate best practices, we isolate
# tests that define specific behavior at fail check points
# in a different binary.
#
# We do that because fail relies on a global definition of
# failpoint behavior and is hence incompatible with
# multithreading.
[[test]]
name = "failpoints"
path = "tests/failpoints/mod.rs"
required-features = ["fail/failpoints"]
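The comment above is the rationale for the separate `failpoints` test binary. Below is a rough sketch of what such an isolated test can look like with the `fail` crate; the fail-point name and helper function are illustrative, not taken from tantivy's actual test suite:

```rust
// tests/failpoints/mod.rs (sketch) -- only built with the fail/failpoints feature.
use fail::fail_point;

fn write_segment() -> Result<(), String> {
    // The fail point is a no-op unless a behavior has been configured for it.
    fail_point!("segment-write", |_| Err("injected I/O failure".to_string()));
    Ok(())
}

#[test]
fn test_write_failure_is_reported() {
    // Configuring a fail point mutates process-global state, which is why
    // these tests live in their own binary, away from the multithreaded suite.
    fail::cfg("segment-write", "return").unwrap();
    assert!(write_segment().is_err());
    fail::cfg("segment-write", "off").unwrap();
}
```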
Makefile (3, new file)
@@ -0,0 +1,3 @@
test:
	echo "Run test only... No examples."
	cargo test --all --tests --lib
README.md (10)
@@ -50,9 +50,9 @@ performance for different type of queries / collection.
- Multithreaded indexing (indexing English Wikipedia takes < 3 minutes on my desktop)
- Mmap directory
- SIMD integer compression when the platform/CPU includes the SSE2 instruction set.
- Single valued and multivalued u64 and i64 fast fields (equivalent of doc values in Lucene)
- Single valued and multivalued u64, i64 and f64 fast fields (equivalent of doc values in Lucene)
- `&[u8]` fast fields
- Text, i64, u64, dates and hierarchical facet fields
- Text, i64, u64, f64, dates and hierarchical facet fields
- LZ4 compressed document store
- Range queries
- Faceted search
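The f64 mentions above are the new part of this release. A minimal sketch of declaring such a field, assuming an `add_f64_field` method that mirrors the existing `add_u64_field`/`add_i64_field`:

```rust
use tantivy::schema::{Schema, FAST};

fn main() {
    let mut schema_builder = Schema::builder();
    // FAST requests a fast field: column-oriented storage, tantivy's equivalent of Lucene doc values.
    // `add_f64_field` is assumed here to behave like `add_u64_field`.
    let _weight = schema_builder.add_f64_field("weight", FAST);
    let _schema = schema_builder.build();
}
```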
@@ -71,14 +71,12 @@ Tantivy works on stable rust (>= 1.27) and supports Linux, MacOS and Windows.

# Getting started

- [tantivy's simple search example](http://fulmicoton.com/tantivy-examples/simple_search.html)
- [tantivy's simple search example](https://tantivy-search.github.io/examples/basic_search.html)
- [tantivy-cli and its tutorial](https://github.com/tantivy-search/tantivy-cli).
  `tantivy-cli` is an actual command line interface that makes it easy for you to create a search engine,
  index documents and search via the CLI or a small server with a REST API.
  It will walk you through getting a wikipedia search engine up and running in a few minutes.
- [reference doc]
  - [For the last released version](https://docs.rs/tantivy/)
  - [For the last master branch](https://tantivy-search.github.io/tantivy/tantivy/index.html)
- [reference doc for the last released version](https://docs.rs/tantivy/)

# How can I support this project?

@@ -18,5 +18,5 @@ install:
build: false

test_script:
  - REM SET RUST_LOG=tantivy,test & cargo test --verbose --no-default-features --features mmap -- --test-threads 1
  - REM SET RUST_LOG=tantivy,test & cargo test --verbose --no-default-features --features mmap
  - REM SET RUST_BACKTRACE=1 & cargo build --examples
@@ -7,7 +7,7 @@ set -ex
main() {
    if [ ! -z $CODECOV ]; then
        echo "Codecov"
        cargo build --verbose && cargo coverage --verbose && bash <(curl -s https://codecov.io/bash) -s target/kcov
        cargo build --verbose && cargo coverage --verbose --all && bash <(curl -s https://codecov.io/bash) -s target/kcov
    else
        echo "Build"
        cross build --target $TARGET
@@ -15,7 +15,8 @@ main() {
            return
        fi
        echo "Test"
        cross test --target $TARGET --no-default-features --features mmap -- --test-threads 1
        cross test --target $TARGET --no-default-features --features mmap
        cross test --target $TARGET --no-default-features --features mmap query-grammar
    fi
    for example in $(ls examples/*.rs)
    do
@@ -5,26 +5,23 @@
|
||||
//
|
||||
// We will :
|
||||
// - define our schema
|
||||
// = create an index in a directory
|
||||
// - index few documents in our index
|
||||
// - search for the best document matchings "sea whale"
|
||||
// - retrieve the best document original content.
|
||||
// - create an index in a directory
|
||||
// - index a few documents into our index
|
||||
// - search for the best document matching a basic query
|
||||
// - retrieve the best document's original content.
|
||||
|
||||
// ---
|
||||
// Importing tantivy...
|
||||
#[macro_use]
|
||||
extern crate tantivy;
|
||||
use tantivy::collector::TopDocs;
|
||||
use tantivy::query::QueryParser;
|
||||
use tantivy::schema::*;
|
||||
use tantivy::Index;
|
||||
use tantivy::ReloadPolicy;
|
||||
use tempdir::TempDir;
|
||||
use tantivy::{doc, Index, ReloadPolicy};
|
||||
use tempfile::TempDir;
|
||||
|
||||
fn main() -> tantivy::Result<()> {
|
||||
// Let's create a temporary directory for the
|
||||
// sake of this example
|
||||
let index_path = TempDir::new("tantivy_example_dir")?;
|
||||
let index_path = TempDir::new()?;
|
||||
|
||||
// # Defining the schema
|
||||
//
|
||||
@@ -33,7 +30,7 @@ fn main() -> tantivy::Result<()> {
|
||||
// and for each field, its type and "the way it should
|
||||
// be indexed".
|
||||
|
||||
// first we need to define a schema ...
|
||||
// First we need to define a schema ...
|
||||
let mut schema_builder = Schema::builder();
|
||||
|
||||
// Our first field is title.
|
||||
@@ -48,7 +45,7 @@ fn main() -> tantivy::Result<()> {
|
||||
//
|
||||
// `STORED` means that the field will also be saved
|
||||
// in a compressed, row-oriented key-value store.
|
||||
// This store is useful to reconstruct the
|
||||
// This store is useful for reconstructing the
|
||||
// documents that were selected during the search phase.
|
||||
schema_builder.add_text_field("title", TEXT | STORED);
|
||||
|
||||
@@ -57,8 +54,7 @@ fn main() -> tantivy::Result<()> {
|
||||
// need to be able to be able to retrieve it
|
||||
// for our application.
|
||||
//
|
||||
// We can make our index lighter and
|
||||
// by omitting `STORED` flag.
|
||||
// We can make our index lighter by omitting the `STORED` flag.
|
||||
schema_builder.add_text_field("body", TEXT);
|
||||
|
||||
let schema = schema_builder.build();
|
||||
@@ -71,7 +67,7 @@ fn main() -> tantivy::Result<()> {
|
||||
// with our schema in the directory.
|
||||
let index = Index::create_in_dir(&index_path, schema.clone())?;
|
||||
|
||||
// To insert document we need an index writer.
|
||||
// To insert a document we will need an index writer.
|
||||
// There must be only one writer at a time.
|
||||
// This single `IndexWriter` is already
|
||||
// multithreaded.
|
||||
@@ -149,8 +145,8 @@ fn main() -> tantivy::Result<()> {
|
||||
// At this point our documents are not searchable.
|
||||
//
|
||||
//
|
||||
// We need to call .commit() explicitly to force the
|
||||
// index_writer to finish processing the documents in the queue,
|
||||
// We need to call `.commit()` explicitly to force the
|
||||
// `index_writer` to finish processing the documents in the queue,
|
||||
// flush the current index to the disk, and advertise
|
||||
// the existence of new documents.
|
||||
//
|
||||
@@ -162,14 +158,14 @@ fn main() -> tantivy::Result<()> {
|
||||
// persistently indexed.
|
||||
//
|
||||
// In the scenario of a crash or a power failure,
|
||||
// tantivy behaves as if has rolled back to its last
|
||||
// tantivy behaves as if it has rolled back to its last
|
||||
// commit.
|
||||
|
||||
// # Searching
|
||||
//
|
||||
// ### Searcher
|
||||
//
|
||||
// A reader is required to get search the index.
|
||||
// A reader is required first in order to search an index.
|
||||
// It acts as a `Searcher` pool that reloads itself,
|
||||
// depending on a `ReloadPolicy`.
|
||||
//
|
||||
@@ -185,7 +181,7 @@ fn main() -> tantivy::Result<()> {
|
||||
|
||||
// We now need to acquire a searcher.
|
||||
//
|
||||
// A searcher points to snapshotted, immutable version of the index.
|
||||
// A searcher points to a snapshotted, immutable version of the index.
|
||||
//
|
||||
// Some search experience might require more than
|
||||
// one query. Using the same searcher ensures that all of these queries will run on the
|
||||
@@ -205,7 +201,7 @@ fn main() -> tantivy::Result<()> {
|
||||
// in both title and body.
|
||||
let query_parser = QueryParser::for_index(&index, vec![title, body]);
|
||||
|
||||
// QueryParser may fail if the query is not in the right
|
||||
// `QueryParser` may fail if the query is not in the right
|
||||
// format. For user facing applications, this can be a problem.
|
||||
// A ticket has been opened regarding this problem.
|
||||
let query = query_parser.parse_query("sea whale")?;
|
||||
@@ -221,7 +217,7 @@ fn main() -> tantivy::Result<()> {
|
||||
//
|
||||
// We are not interested in all of the documents but
|
||||
// only in the top 10. Keeping track of our top 10 best documents
|
||||
// is the role of the TopDocs.
|
||||
// is the role of the `TopDocs` collector.
|
||||
|
||||
// We can now perform our query.
|
||||
let top_docs = searcher.search(&query, &TopDocs::with_limit(10))?;
|
||||
|
||||
@@ -9,15 +9,12 @@
|
||||
|
||||
// ---
|
||||
// Importing tantivy...
|
||||
#[macro_use]
|
||||
extern crate tantivy;
|
||||
use tantivy::collector::{Collector, SegmentCollector};
|
||||
use tantivy::fastfield::FastFieldReader;
|
||||
use tantivy::query::QueryParser;
|
||||
use tantivy::schema::Field;
|
||||
use tantivy::schema::{Schema, FAST, INDEXED, TEXT};
|
||||
use tantivy::SegmentReader;
|
||||
use tantivy::{Index, TantivyError};
|
||||
use tantivy::{doc, Index, SegmentReader, TantivyError};
|
||||
|
||||
#[derive(Default)]
|
||||
struct Stats {
|
||||
|
||||
@@ -2,14 +2,11 @@
|
||||
//
|
||||
// In this example, we'll see how to define a tokenizer pipeline
|
||||
// by aligning a bunch of `TokenFilter`.
|
||||
|
||||
#[macro_use]
|
||||
extern crate tantivy;
|
||||
use tantivy::collector::TopDocs;
|
||||
use tantivy::query::QueryParser;
|
||||
use tantivy::schema::*;
|
||||
use tantivy::tokenizer::NgramTokenizer;
|
||||
use tantivy::Index;
|
||||
use tantivy::{doc, Index};
|
||||
|
||||
fn main() -> tantivy::Result<()> {
|
||||
// # Defining the schema
|
||||
|
||||
@@ -8,13 +8,10 @@
|
||||
//
|
||||
// ---
|
||||
// Importing tantivy...
|
||||
#[macro_use]
|
||||
extern crate tantivy;
|
||||
use tantivy::collector::TopDocs;
|
||||
use tantivy::query::TermQuery;
|
||||
use tantivy::schema::*;
|
||||
use tantivy::Index;
|
||||
use tantivy::IndexReader;
|
||||
use tantivy::{doc, Index, IndexReader};
|
||||
|
||||
// A simple helper function to fetch a single document
|
||||
// given its id from our index.
|
||||
|
||||
@@ -12,17 +12,16 @@
|
||||
|
||||
// ---
|
||||
// Importing tantivy...
|
||||
#[macro_use]
|
||||
extern crate tantivy;
|
||||
use tantivy::collector::FacetCollector;
|
||||
use tantivy::query::AllQuery;
|
||||
use tantivy::schema::*;
|
||||
use tantivy::Index;
|
||||
use tantivy::{doc, Index};
|
||||
use tempfile::TempDir;
|
||||
|
||||
fn main() -> tantivy::Result<()> {
|
||||
// Let's create a temporary directory for the
|
||||
// sake of this example
|
||||
let index_path = TempDir::new("tantivy_facet_example_dir")?;
|
||||
let index_path = TempDir::new()?;
|
||||
let mut schema_builder = Schema::builder();
|
||||
|
||||
schema_builder.add_text_field("name", TEXT | STORED);
|
||||
@@ -74,5 +73,3 @@ fn main() -> tantivy::Result<()> {
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
use tempdir::TempDir;
|
||||
|
||||
@@ -2,14 +2,10 @@
|
||||
//
|
||||
// Below is an example of creating an indexed integer field in your schema
|
||||
// You can use RangeQuery to get a Count of all occurrences in a given range.
|
||||
|
||||
#[macro_use]
|
||||
extern crate tantivy;
|
||||
use tantivy::collector::Count;
|
||||
use tantivy::query::RangeQuery;
|
||||
use tantivy::schema::{Schema, INDEXED};
|
||||
use tantivy::Index;
|
||||
use tantivy::Result;
|
||||
use tantivy::{doc, Index, Result};
|
||||
|
||||
fn run() -> Result<()> {
|
||||
// For the sake of simplicity, this schema will only have 1 field
|
||||
|
||||
@@ -9,11 +9,8 @@
|
||||
|
||||
// ---
|
||||
// Importing tantivy...
|
||||
#[macro_use]
|
||||
extern crate tantivy;
|
||||
use tantivy::schema::*;
|
||||
use tantivy::Index;
|
||||
use tantivy::{DocId, DocSet, Postings};
|
||||
use tantivy::{doc, DocId, DocSet, Index, Postings};
|
||||
|
||||
fn main() -> tantivy::Result<()> {
|
||||
// We first create a schema for the sake of the
|
||||
|
||||
@@ -25,14 +25,11 @@
|
||||
|
||||
// ---
|
||||
// Importing tantivy...
|
||||
#[macro_use]
|
||||
extern crate tantivy;
|
||||
use std::sync::{Arc, RwLock};
|
||||
use std::thread;
|
||||
use std::time::Duration;
|
||||
use tantivy::schema::{Schema, STORED, TEXT};
|
||||
use tantivy::Opstamp;
|
||||
use tantivy::{Index, IndexWriter};
|
||||
use tantivy::{doc, Index, IndexWriter, Opstamp};
|
||||
|
||||
fn main() -> tantivy::Result<()> {
|
||||
// # Defining the schema
|
||||
@@ -49,10 +46,9 @@ fn main() -> tantivy::Result<()> {
|
||||
thread::spawn(move || {
|
||||
// we index 100 times the document... for the sake of the example.
|
||||
for i in 0..100 {
|
||||
let opstamp = {
|
||||
// A read lock is sufficient here.
|
||||
let index_writer_rlock = index_writer_clone_1.read().unwrap();
|
||||
index_writer_rlock.add_document(
|
||||
let opstamp = index_writer_clone_1
|
||||
.read().unwrap() //< A read lock is sufficient here.
|
||||
.add_document(
|
||||
doc!(
|
||||
title => "Of Mice and Men",
|
||||
body => "A few miles south of Soledad, the Salinas River drops in close to the hillside \
|
||||
@@ -63,8 +59,7 @@ fn main() -> tantivy::Result<()> {
|
||||
fresh and green with every spring, carrying in their lower leaf junctures the \
|
||||
debris of the winter’s flooding; and sycamores with mottled, white, recumbent \
|
||||
limbs and branches that arch over the pool"
|
||||
))
|
||||
};
|
||||
));
|
||||
println!("add doc {} from thread 1 - opstamp {}", i, opstamp);
|
||||
thread::sleep(Duration::from_millis(20));
|
||||
}
|
||||
|
||||
@@ -7,19 +7,16 @@
|
||||
|
||||
// ---
|
||||
// Importing tantivy...
|
||||
#[macro_use]
|
||||
extern crate tantivy;
|
||||
use tantivy::collector::TopDocs;
|
||||
use tantivy::query::QueryParser;
|
||||
use tantivy::schema::*;
|
||||
use tantivy::Index;
|
||||
use tantivy::{Snippet, SnippetGenerator};
|
||||
use tempdir::TempDir;
|
||||
use tantivy::{doc, Index, Snippet, SnippetGenerator};
|
||||
use tempfile::TempDir;
|
||||
|
||||
fn main() -> tantivy::Result<()> {
|
||||
// Let's create a temporary directory for the
|
||||
// sake of this example
|
||||
let index_path = TempDir::new("tantivy_example_dir")?;
|
||||
let index_path = TempDir::new()?;
|
||||
|
||||
// # Defining the schema
|
||||
let mut schema_builder = Schema::builder();
|
||||
|
||||
@@ -11,13 +11,11 @@
|
||||
|
||||
// ---
|
||||
// Importing tantivy...
|
||||
#[macro_use]
|
||||
extern crate tantivy;
|
||||
use tantivy::collector::TopDocs;
|
||||
use tantivy::query::QueryParser;
|
||||
use tantivy::schema::*;
|
||||
use tantivy::tokenizer::*;
|
||||
use tantivy::Index;
|
||||
use tantivy::{doc, Index};
|
||||
|
||||
fn main() -> tantivy::Result<()> {
|
||||
// this example assumes you understand the content in `basic_search`
|
||||
|
||||
incremental-search/Cargo.toml (10, new file)
@@ -0,0 +1,10 @@
[package]
name = "incremental-search"
version = "0.11.0"
authors = ["Paul Masurel <paul.masurel@gmail.com>"]
edition = "2018"


[dependencies]
derive_builder = "0.7"
tantivy = {path = ".."}
incremental-search/src/bitset.rs (395, new file)
@@ -0,0 +1,395 @@
|
||||
use std::fmt;
|
||||
use std::u64;
|
||||
|
||||
#[derive(Clone, Copy, Eq, PartialEq)]
|
||||
pub(crate) struct TinySet(u64);
|
||||
|
||||
impl fmt::Debug for TinySet {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
self.into_iter().collect::<Vec<u32>>().fmt(f)
|
||||
}
|
||||
}
|
||||
|
||||
pub struct TinySetIterator(TinySet);
|
||||
impl Iterator for TinySetIterator {
|
||||
type Item = u32;
|
||||
|
||||
fn next(&mut self) -> Option<Self::Item> {
|
||||
self.0.pop_lowest()
|
||||
}
|
||||
}
|
||||
|
||||
impl IntoIterator for TinySet {
|
||||
type Item = u32;
|
||||
type IntoIter = TinySetIterator;
|
||||
fn into_iter(self) -> Self::IntoIter {
|
||||
TinySetIterator(self)
|
||||
}
|
||||
}
|
||||
|
||||
impl TinySet {
|
||||
/// Returns an empty `TinySet`.
|
||||
pub fn empty() -> TinySet {
|
||||
TinySet(0u64)
|
||||
}
|
||||
|
||||
/// Returns the complement of the set in `[0, 64[`.
|
||||
fn complement(self) -> TinySet {
|
||||
TinySet(!self.0)
|
||||
}
|
||||
|
||||
/// Returns true iff the `TinySet` contains the element `el`.
|
||||
pub fn contains(self, el: u32) -> bool {
|
||||
!self.intersect(TinySet::singleton(el)).is_empty()
|
||||
}
|
||||
|
||||
/// Returns the intersection of `self` and `other`
|
||||
pub fn intersect(self, other: TinySet) -> TinySet {
|
||||
TinySet(self.0 & other.0)
|
||||
}
|
||||
|
||||
/// Creates a new `TinySet` containing only one element
|
||||
/// within `[0; 64[`
|
||||
#[inline(always)]
|
||||
pub fn singleton(el: u32) -> TinySet {
|
||||
TinySet(1u64 << u64::from(el))
|
||||
}
|
||||
|
||||
/// Insert a new element within [0..64[
|
||||
#[inline(always)]
|
||||
pub fn insert(self, el: u32) -> TinySet {
|
||||
self.union(TinySet::singleton(el))
|
||||
}
|
||||
|
||||
/// Insert a new element within [0..64[
|
||||
#[inline(always)]
|
||||
pub fn insert_mut(&mut self, el: u32) -> bool {
|
||||
let old = *self;
|
||||
*self = old.insert(el);
|
||||
old != *self
|
||||
}
|
||||
|
||||
/// Returns the union of two tinysets
|
||||
#[inline(always)]
|
||||
pub fn union(self, other: TinySet) -> TinySet {
|
||||
TinySet(self.0 | other.0)
|
||||
}
|
||||
|
||||
/// Returns true iff the `TinySet` is empty.
|
||||
#[inline(always)]
|
||||
pub fn is_empty(self) -> bool {
|
||||
self.0 == 0u64
|
||||
}
|
||||
|
||||
/// Returns the lowest element in the `TinySet`
|
||||
/// and removes it.
|
||||
#[inline(always)]
|
||||
pub fn pop_lowest(&mut self) -> Option<u32> {
|
||||
if self.is_empty() {
|
||||
None
|
||||
} else {
|
||||
let lowest = self.0.trailing_zeros() as u32;
|
||||
self.0 ^= TinySet::singleton(lowest).0;
|
||||
Some(lowest)
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns a `TinySet` than contains all values up
|
||||
/// to limit excluded.
|
||||
///
|
||||
/// The limit is assumed to be strictly lower than 64.
|
||||
pub fn range_lower(upper_bound: u32) -> TinySet {
|
||||
TinySet((1u64 << u64::from(upper_bound % 64u32)) - 1u64)
|
||||
}
|
||||
|
||||
/// Returns a `TinySet` that contains all values greater
|
||||
/// or equal to the given limit, included. (and up to 63)
|
||||
///
|
||||
/// The limit is assumed to be strictly lower than 64.
|
||||
pub fn range_greater_or_equal(from_included: u32) -> TinySet {
|
||||
TinySet::range_lower(from_included).complement()
|
||||
}
|
||||
|
||||
pub fn clear(&mut self) {
|
||||
self.0 = 0u64;
|
||||
}
|
||||
|
||||
pub fn len(self) -> u32 {
|
||||
self.0.count_ones()
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone)]
|
||||
pub struct BitSet {
|
||||
tinysets: Box<[TinySet]>,
|
||||
len: usize, //< Technically it should be u32, but we
|
||||
// count multiple inserts.
|
||||
// `usize` guards us from overflow.
|
||||
max_value: u32,
|
||||
}
|
||||
|
||||
fn num_buckets(max_val: u32) -> u32 {
|
||||
(max_val + 63u32) / 64u32
|
||||
}
|
||||
|
||||
impl BitSet {
|
||||
/// Create a new `BitSet` that may contain elements
|
||||
/// within `[0, max_val[`.
|
||||
pub fn with_max_value(max_value: u32) -> BitSet {
|
||||
let num_buckets = num_buckets(max_value);
|
||||
let tinybisets = vec![TinySet::empty(); num_buckets as usize].into_boxed_slice();
|
||||
BitSet {
|
||||
tinysets: tinybisets,
|
||||
len: 0,
|
||||
max_value,
|
||||
}
|
||||
}
|
||||
|
||||
/// Removes all elements from the `BitSet`.
|
||||
pub fn clear(&mut self) {
|
||||
for tinyset in self.tinysets.iter_mut() {
|
||||
*tinyset = TinySet::empty();
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns the number of elements in the `BitSet`.
|
||||
pub fn len(&self) -> usize {
|
||||
self.len
|
||||
}
|
||||
|
||||
/// Inserts an element in the `BitSet`
|
||||
pub fn insert(&mut self, el: u32) {
|
||||
// we do not check saturated els.
|
||||
let higher = el / 64u32;
|
||||
let lower = el % 64u32;
|
||||
self.len += if self.tinysets[higher as usize].insert_mut(lower) {
|
||||
1
|
||||
} else {
|
||||
0
|
||||
};
|
||||
}
|
||||
|
||||
/// Returns true iff the elements is in the `BitSet`.
|
||||
pub fn contains(&self, el: u32) -> bool {
|
||||
self.tinyset(el / 64u32).contains(el % 64)
|
||||
}
|
||||
|
||||
/// Returns the first non-empty `TinySet` associated to a bucket lower
|
||||
/// or greater than bucket.
|
||||
///
|
||||
/// Reminder: the tiny set with the bucket `bucket`, represents the
|
||||
/// elements from `bucket * 64` to `(bucket+1) * 64`.
|
||||
pub(crate) fn first_non_empty_bucket(&self, bucket: u32) -> Option<u32> {
|
||||
self.tinysets[bucket as usize..]
|
||||
.iter()
|
||||
.cloned()
|
||||
.position(|tinyset| !tinyset.is_empty())
|
||||
.map(|delta_bucket| bucket + delta_bucket as u32)
|
||||
}
|
||||
|
||||
pub fn max_value(&self) -> u32 {
|
||||
self.max_value
|
||||
}
|
||||
|
||||
/// Returns the tiny bitset representing the
|
||||
/// the set restricted to the number range from
|
||||
/// `bucket * 64` to `(bucket + 1) * 64`.
|
||||
pub(crate) fn tinyset(&self, bucket: u32) -> TinySet {
|
||||
self.tinysets[bucket as usize]
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
|
||||
use super::BitSet;
|
||||
use super::TinySet;
|
||||
use crate::docset::DocSet;
|
||||
use crate::query::BitSetDocSet;
|
||||
use crate::tests;
|
||||
use crate::tests::generate_nonunique_unsorted;
|
||||
use std::collections::BTreeSet;
|
||||
use std::collections::HashSet;
|
||||
|
||||
#[test]
|
||||
fn test_tiny_set() {
|
||||
assert!(TinySet::empty().is_empty());
|
||||
{
|
||||
let mut u = TinySet::empty().insert(1u32);
|
||||
assert_eq!(u.pop_lowest(), Some(1u32));
|
||||
assert!(u.pop_lowest().is_none())
|
||||
}
|
||||
{
|
||||
let mut u = TinySet::empty().insert(1u32).insert(1u32);
|
||||
assert_eq!(u.pop_lowest(), Some(1u32));
|
||||
assert!(u.pop_lowest().is_none())
|
||||
}
|
||||
{
|
||||
let mut u = TinySet::empty().insert(2u32);
|
||||
assert_eq!(u.pop_lowest(), Some(2u32));
|
||||
u.insert_mut(1u32);
|
||||
assert_eq!(u.pop_lowest(), Some(1u32));
|
||||
assert!(u.pop_lowest().is_none());
|
||||
}
|
||||
{
|
||||
let mut u = TinySet::empty().insert(63u32);
|
||||
assert_eq!(u.pop_lowest(), Some(63u32));
|
||||
assert!(u.pop_lowest().is_none());
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_bitset() {
|
||||
let test_against_hashset = |els: &[u32], max_value: u32| {
|
||||
let mut hashset: HashSet<u32> = HashSet::new();
|
||||
let mut bitset = BitSet::with_max_value(max_value);
|
||||
for &el in els {
|
||||
assert!(el < max_value);
|
||||
hashset.insert(el);
|
||||
bitset.insert(el);
|
||||
}
|
||||
for el in 0..max_value {
|
||||
assert_eq!(hashset.contains(&el), bitset.contains(el));
|
||||
}
|
||||
assert_eq!(bitset.max_value(), max_value);
|
||||
};
|
||||
|
||||
test_against_hashset(&[], 0);
|
||||
test_against_hashset(&[], 1);
|
||||
test_against_hashset(&[0u32], 1);
|
||||
test_against_hashset(&[0u32], 100);
|
||||
test_against_hashset(&[1u32, 2u32], 4);
|
||||
test_against_hashset(&[99u32], 100);
|
||||
test_against_hashset(&[63u32], 64);
|
||||
test_against_hashset(&[62u32, 63u32], 64);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_bitset_large() {
|
||||
let arr = generate_nonunique_unsorted(100_000, 5_000);
|
||||
let mut btreeset: BTreeSet<u32> = BTreeSet::new();
|
||||
let mut bitset = BitSet::with_max_value(100_000);
|
||||
for el in arr {
|
||||
btreeset.insert(el);
|
||||
bitset.insert(el);
|
||||
}
|
||||
for i in 0..100_000 {
|
||||
assert_eq!(btreeset.contains(&i), bitset.contains(i));
|
||||
}
|
||||
assert_eq!(btreeset.len(), bitset.len());
|
||||
let mut bitset_docset = BitSetDocSet::from(bitset);
|
||||
for el in btreeset.into_iter() {
|
||||
bitset_docset.advance();
|
||||
assert_eq!(bitset_docset.doc(), el);
|
||||
}
|
||||
assert!(!bitset_docset.advance());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_bitset_num_buckets() {
|
||||
use super::num_buckets;
|
||||
assert_eq!(num_buckets(0u32), 0);
|
||||
assert_eq!(num_buckets(1u32), 1);
|
||||
assert_eq!(num_buckets(64u32), 1);
|
||||
assert_eq!(num_buckets(65u32), 2);
|
||||
assert_eq!(num_buckets(128u32), 2);
|
||||
assert_eq!(num_buckets(129u32), 3);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_tinyset_range() {
|
||||
assert_eq!(
|
||||
TinySet::range_lower(3).into_iter().collect::<Vec<u32>>(),
|
||||
[0, 1, 2]
|
||||
);
|
||||
assert!(TinySet::range_lower(0).is_empty());
|
||||
assert_eq!(
|
||||
TinySet::range_lower(63).into_iter().collect::<Vec<u32>>(),
|
||||
(0u32..63u32).collect::<Vec<_>>()
|
||||
);
|
||||
assert_eq!(
|
||||
TinySet::range_lower(1).into_iter().collect::<Vec<u32>>(),
|
||||
[0]
|
||||
);
|
||||
assert_eq!(
|
||||
TinySet::range_lower(2).into_iter().collect::<Vec<u32>>(),
|
||||
[0, 1]
|
||||
);
|
||||
assert_eq!(
|
||||
TinySet::range_greater_or_equal(3)
|
||||
.into_iter()
|
||||
.collect::<Vec<u32>>(),
|
||||
(3u32..64u32).collect::<Vec<_>>()
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_bitset_len() {
|
||||
let mut bitset = BitSet::with_max_value(1_000);
|
||||
assert_eq!(bitset.len(), 0);
|
||||
bitset.insert(3u32);
|
||||
assert_eq!(bitset.len(), 1);
|
||||
bitset.insert(103u32);
|
||||
assert_eq!(bitset.len(), 2);
|
||||
bitset.insert(3u32);
|
||||
assert_eq!(bitset.len(), 2);
|
||||
bitset.insert(103u32);
|
||||
assert_eq!(bitset.len(), 2);
|
||||
bitset.insert(104u32);
|
||||
assert_eq!(bitset.len(), 3);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_bitset_clear() {
|
||||
let mut bitset = BitSet::with_max_value(1_000);
|
||||
let els = tests::sample(1_000, 0.01f64);
|
||||
for &el in &els {
|
||||
bitset.insert(el);
|
||||
}
|
||||
assert!(els.iter().all(|el| bitset.contains(*el)));
|
||||
bitset.clear();
|
||||
for el in 0u32..1000u32 {
|
||||
assert!(!bitset.contains(el));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(all(test, feature = "unstable"))]
|
||||
mod bench {
|
||||
|
||||
use super::BitSet;
|
||||
use super::TinySet;
|
||||
use test;
|
||||
|
||||
#[bench]
|
||||
fn bench_tinyset_pop(b: &mut test::Bencher) {
|
||||
b.iter(|| {
|
||||
let mut tinyset = TinySet::singleton(test::black_box(31u32));
|
||||
tinyset.pop_lowest();
|
||||
tinyset.pop_lowest();
|
||||
tinyset.pop_lowest();
|
||||
tinyset.pop_lowest();
|
||||
tinyset.pop_lowest();
|
||||
tinyset.pop_lowest();
|
||||
});
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_tinyset_sum(b: &mut test::Bencher) {
|
||||
let tiny_set = TinySet::empty().insert(10u32).insert(14u32).insert(21u32);
|
||||
b.iter(|| {
|
||||
assert_eq!(test::black_box(tiny_set).into_iter().sum::<u32>(), 45u32);
|
||||
});
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_tinyarr_sum(b: &mut test::Bencher) {
|
||||
let v = [10u32, 14u32, 21u32];
|
||||
b.iter(|| test::black_box(v).iter().cloned().sum::<u32>());
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_bitset_initialize(b: &mut test::Bencher) {
|
||||
b.iter(|| BitSet::with_max_value(1_000_000));
|
||||
}
|
||||
}
|
||||
incremental-search/src/lib.rs (266, new file)
@@ -0,0 +1,266 @@
|
||||
use tantivy::query::{BooleanQuery, FuzzyTermQuery, EmptyQuery};
|
||||
use derive_builder::Builder;
|
||||
use std::str::FromStr;
|
||||
use tantivy::query::{FuzzyConfiguration, FuzzyConfigurationBuilder, Query, Occur};
|
||||
use tantivy::schema::Field;
|
||||
use tantivy::{Searcher, TantivyError, DocAddress, Term, Document};
|
||||
use tantivy::collector::TopDocs;
|
||||
use std::ops::Deref;
|
||||
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct IncrementalSearchQuery {
|
||||
pub terms: Vec<String>,
|
||||
pub last_is_prefix: bool,
|
||||
}
|
||||
|
||||
impl IncrementalSearchQuery {
|
||||
pub fn fuzzy_configurations(&self) -> Vec<FuzzyConfigurations> {
|
||||
if self.terms.is_empty() {
|
||||
return Vec::default();
|
||||
}
|
||||
let single_term_confs: Vec<FuzzyConfigurationBuilder> = (0u8..3u8)
|
||||
.map(|d: u8| {
|
||||
let mut builder = FuzzyConfigurationBuilder::default();
|
||||
builder.distance(d).transposition_cost_one(true);
|
||||
builder
|
||||
})
|
||||
.collect();
|
||||
let mut configurations: Vec<Vec<FuzzyConfigurationBuilder>> = single_term_confs
|
||||
.iter()
|
||||
.map(|conf| vec![conf.clone()])
|
||||
.collect();
|
||||
let mut new_configurations = Vec::new();
|
||||
for _ in 1..self.terms.len() {
|
||||
new_configurations.clear();
|
||||
for single_term_conf in &single_term_confs {
|
||||
for configuration in &configurations {
|
||||
let mut new_configuration: Vec<FuzzyConfigurationBuilder> = configuration.clone();
|
||||
new_configuration.push(single_term_conf.clone());
|
||||
new_configurations.push(new_configuration);
|
||||
}
|
||||
}
|
||||
std::mem::swap(&mut configurations, &mut new_configurations);
|
||||
}
|
||||
if self.last_is_prefix {
|
||||
for configuration in &mut configurations {
|
||||
if let Some(last_conf) = configuration.last_mut() {
|
||||
last_conf.prefix(true);
|
||||
}
|
||||
}
|
||||
}
|
||||
let mut fuzzy_configurations: Vec<FuzzyConfigurations> = configurations
|
||||
.into_iter()
|
||||
.map(FuzzyConfigurations::from)
|
||||
.collect();
|
||||
fuzzy_configurations.sort_by(|left, right| left.cost.partial_cmp(&right.cost).unwrap());
|
||||
fuzzy_configurations
|
||||
}
|
||||
|
||||
fn search_query(&self, fields: &[Field], configurations: FuzzyConfigurations) -> Box<dyn Query> {
|
||||
if self.terms.is_empty() {
|
||||
Box::new(EmptyQuery)
|
||||
} else if self.terms.len() == 1 {
|
||||
build_query_for_fields(fields, &self.terms[0], &configurations.configurations[0])
|
||||
} else {
|
||||
Box::new(BooleanQuery::from(self.terms.iter()
|
||||
.zip(configurations.configurations.iter())
|
||||
.map(|(term, configuration)|
|
||||
(Occur::Must, build_query_for_fields(fields, &term, &configuration))
|
||||
)
|
||||
.collect::<Vec<_>>()))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct FuzzyConfigurations {
|
||||
configurations: Vec<FuzzyConfiguration>,
|
||||
cost: f64,
|
||||
}
|
||||
|
||||
|
||||
fn compute_cost(fuzzy_confs: &[FuzzyConfiguration]) -> f64 {
|
||||
fuzzy_confs
|
||||
.iter()
|
||||
.map(|fuzzy_conf| {
|
||||
let weight = if fuzzy_conf.prefix { 30f64 } else { 5f64 };
|
||||
weight * f64::from(fuzzy_conf.distance)
|
||||
})
|
||||
.sum()
|
||||
}
|
||||
|
||||
impl From<Vec<FuzzyConfigurationBuilder>> for FuzzyConfigurations {
|
||||
fn from(fuzzy_conf_builder: Vec<FuzzyConfigurationBuilder>) -> FuzzyConfigurations {
|
||||
let configurations = fuzzy_conf_builder
|
||||
.into_iter()
|
||||
.map(|conf| conf.build().unwrap())
|
||||
.collect::<Vec<FuzzyConfiguration>>();
|
||||
let cost = compute_cost(&configurations);
|
||||
FuzzyConfigurations {
|
||||
configurations,
|
||||
cost,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct ParseIncrementalQueryError;
|
||||
|
||||
impl Into<TantivyError> for ParseIncrementalQueryError {
|
||||
fn into(self) -> TantivyError {
|
||||
TantivyError::InvalidArgument(format!("Invalid query: {:?}", self))
|
||||
}
|
||||
}
|
||||
|
||||
impl FromStr for IncrementalSearchQuery {
|
||||
type Err = ParseIncrementalQueryError;
|
||||
|
||||
fn from_str(query_str: &str) -> Result<Self, Self::Err> {
|
||||
let terms: Vec<String> = query_str
|
||||
.split_whitespace()
|
||||
.map(ToString::to_string)
|
||||
.collect();
|
||||
Ok(IncrementalSearchQuery {
|
||||
terms,
|
||||
last_is_prefix: query_str
|
||||
.chars()
|
||||
.last()
|
||||
.map(|c| !c.is_whitespace())
|
||||
.unwrap_or(false),
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
fn build_query_for_fields(fields: &[Field], term_text: &str, conf: &FuzzyConfiguration) -> Box<dyn Query> {
|
||||
assert!(fields.len() > 0);
|
||||
if fields.len() > 1 {
|
||||
let term_queries: Vec<(Occur, Box<dyn Query>)> = fields
|
||||
.iter()
|
||||
.map(|&field| {
|
||||
let term = Term::from_field_text(field, term_text);
|
||||
let query = FuzzyTermQuery::new_from_configuration(term, conf.clone());
|
||||
let boxed_query: Box<dyn Query> = Box::new(query);
|
||||
(Occur::Must, boxed_query)
|
||||
})
|
||||
.collect();
|
||||
Box::new(BooleanQuery::from(term_queries))
|
||||
} else {
|
||||
let term = Term::from_field_text(fields[0], term_text);
|
||||
Box::new( FuzzyTermQuery::new_from_configuration(term, conf.clone()))
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
pub struct IncrementalSearchResult {
|
||||
pub docs: Vec<Document>
|
||||
}
|
||||
|
||||
#[derive(Builder, Default)]
|
||||
pub struct IncrementalSearch {
|
||||
nhits: usize,
|
||||
#[builder(default)]
|
||||
search_fields: Vec<Field>,
|
||||
#[builder(default)]
|
||||
return_fields: Vec<Field>,
|
||||
}
|
||||
|
||||
impl IncrementalSearch {
|
||||
|
||||
pub fn search<S: Deref<Target=Searcher>>(
|
||||
&self,
|
||||
query: &str,
|
||||
searcher: &S,
|
||||
) -> tantivy::Result<IncrementalSearchResult> {
|
||||
let searcher = searcher.deref();
|
||||
let inc_search_query: IncrementalSearchQuery =
|
||||
FromStr::from_str(query).map_err(Into::<TantivyError>::into)?;
|
||||
|
||||
let mut results: Vec<DocAddress> = Vec::default();
|
||||
let mut remaining = self.nhits;
|
||||
for fuzzy_conf in inc_search_query.fuzzy_configurations() {
|
||||
if remaining == 0 {
|
||||
break;
|
||||
}
|
||||
let query = inc_search_query.search_query(&self.search_fields[..], fuzzy_conf);
|
||||
let new_docs = searcher.search(query.as_ref(), &TopDocs::with_limit(remaining))?;
|
||||
// TODO(pmasurel) remove already added docs.
|
||||
results.extend(new_docs.into_iter()
|
||||
.map(|(_, doc_address)| doc_address));
|
||||
remaining = self.nhits - results.len();
|
||||
if remaining == 0 {
|
||||
break;
|
||||
}
|
||||
}
|
||||
let docs: Vec<Document> = results.into_iter()
|
||||
.map(|doc_address: DocAddress| searcher.doc(doc_address))
|
||||
.collect::<tantivy::Result<_>>()?;
|
||||
Ok(IncrementalSearchResult {
|
||||
docs
|
||||
})
|
||||
}
|
||||
}
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use tantivy::doc;
|
||||
use crate::{IncrementalSearch, IncrementalSearchBuilder, IncrementalSearchQuery};
|
||||
use std::str::FromStr;
|
||||
use tantivy::schema::{SchemaBuilder, TEXT, STORED};
|
||||
use tantivy::Index;
|
||||
|
||||
#[test]
|
||||
fn test_incremental_search() {
|
||||
let incremental_search = IncrementalSearchBuilder::default()
|
||||
.nhits(10)
|
||||
.build()
|
||||
.unwrap();
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_incremental_search_query_parse_empty() {
|
||||
let query = IncrementalSearchQuery::from_str("").unwrap();
|
||||
assert_eq!(query.terms, Vec::<String>::new());
|
||||
assert!(!query.last_is_prefix);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_incremental_search_query_parse_trailing_whitespace() {
|
||||
let query = IncrementalSearchQuery::from_str("hello happy tax pa ").unwrap();
|
||||
assert_eq!(query.terms, vec!["hello", "happy", "tax", "pa"]);
|
||||
assert!(!query.last_is_prefix);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_incremental_search_query_parse_unicode_whitespace() {
|
||||
let query = IncrementalSearchQuery::from_str("hello happy tax pa ").unwrap();
|
||||
assert_eq!(query.terms, vec!["hello", "happy", "tax", "pa"]);
|
||||
assert!(!query.last_is_prefix);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_incremental_search_query_parse() {
|
||||
let query = IncrementalSearchQuery::from_str("hello happy tax pa").unwrap();
|
||||
assert_eq!(query.terms, vec!["hello", "happy", "tax", "pa"]);
|
||||
assert!(query.last_is_prefix);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_blop() {
|
||||
let mut schema_builder = SchemaBuilder::new();
|
||||
let body = schema_builder.add_text_field("body", TEXT | STORED);
|
||||
let schema = schema_builder.build();
|
||||
let index = Index::create_in_ram(schema);
|
||||
let mut index_writer = index.writer_with_num_threads(1, 30_000_000).unwrap();
|
||||
index_writer.add_document(doc!(body=> "hello happy tax payer"));
|
||||
index_writer.commit().unwrap();
|
||||
let reader = index.reader().unwrap();
|
||||
let searcher = reader.searcher();
|
||||
let incremental_search: IncrementalSearch = IncrementalSearchBuilder::default()
|
||||
.nhits(1)
|
||||
.search_fields(vec![body])
|
||||
.build()
|
||||
.unwrap();
|
||||
let top_docs = incremental_search.search("hello hapy t", &searcher).unwrap();
|
||||
assert_eq!(top_docs.docs.len(), 1);
|
||||
}
|
||||
}
|
||||
query-grammar/Cargo.toml (16, new file)
@@ -0,0 +1,16 @@
[package]
name = "tantivy-query-grammar"
version = "0.11.0"
authors = ["Paul Masurel <paul.masurel@gmail.com>"]
license = "MIT"
categories = ["database-implementations", "data-structures"]
description = """Search engine library"""
documentation = "https://tantivy-search.github.io/tantivy/tantivy/index.html"
homepage = "https://github.com/tantivy-search/tantivy"
repository = "https://github.com/tantivy-search/tantivy"
readme = "README.md"
keywords = ["search", "information", "retrieval"]
edition = "2018"

[dependencies]
combine = ">=3.6.0,<4.0.0"
query-grammar/src/lib.rs (17, new file)
@@ -0,0 +1,17 @@
#![recursion_limit = "100"]

mod occur;
mod query_grammar;
mod user_input_ast;
use combine::parser::Parser;

pub use crate::occur::Occur;
use crate::query_grammar::parse_to_ast;
pub use crate::user_input_ast::{UserInputAST, UserInputBound, UserInputLeaf, UserInputLiteral};

pub struct Error;

pub fn parse_query(query: &str) -> Result<UserInputAST, Error> {
    let (user_input_ast, _remaining) = parse_to_ast().parse(query).map_err(|_| Error)?;
    Ok(user_input_ast)
}
@@ -25,24 +25,24 @@ impl Occur {
|
||||
Occur::MustNot => '-',
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Compose two occur values.
|
||||
pub fn compose_occur(left: Occur, right: Occur) -> Occur {
|
||||
match left {
|
||||
Occur::Should => right,
|
||||
Occur::Must => {
|
||||
if right == Occur::MustNot {
|
||||
Occur::MustNot
|
||||
} else {
|
||||
Occur::Must
|
||||
/// Compose two occur values.
|
||||
pub fn compose(left: Occur, right: Occur) -> Occur {
|
||||
match left {
|
||||
Occur::Should => right,
|
||||
Occur::Must => {
|
||||
if right == Occur::MustNot {
|
||||
Occur::MustNot
|
||||
} else {
|
||||
Occur::Must
|
||||
}
|
||||
}
|
||||
}
|
||||
Occur::MustNot => {
|
||||
if right == Occur::MustNot {
|
||||
Occur::Must
|
||||
} else {
|
||||
Occur::MustNot
|
||||
Occur::MustNot => {
|
||||
if right == Occur::MustNot {
|
||||
Occur::Must
|
||||
} else {
|
||||
Occur::MustNot
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
query-grammar/src/query_grammar.rs (380, new file)
@@ -0,0 +1,380 @@
|
||||
use super::user_input_ast::*;
|
||||
use crate::Occur;
|
||||
use combine::char::*;
|
||||
use combine::error::StreamError;
|
||||
use combine::stream::StreamErrorFor;
|
||||
use combine::*;
|
||||
|
||||
parser! {
|
||||
fn field[I]()(I) -> String
|
||||
where [I: Stream<Item = char>] {
|
||||
(
|
||||
letter(),
|
||||
many(satisfy(|c: char| c.is_alphanumeric() || c == '_')),
|
||||
).skip(char(':')).map(|(s1, s2): (char, String)| format!("{}{}", s1, s2))
|
||||
}
|
||||
}
|
||||
|
||||
parser! {
|
||||
fn word[I]()(I) -> String
|
||||
where [I: Stream<Item = char>] {
|
||||
(
|
||||
satisfy(|c: char| !c.is_whitespace() && !['-', '`', ':', '{', '}', '"', '[', ']', '(',')'].contains(&c) ),
|
||||
many(satisfy(|c: char| !c.is_whitespace() && ![':', '{', '}', '"', '[', ']', '(',')'].contains(&c)))
|
||||
)
|
||||
.map(|(s1, s2): (char, String)| format!("{}{}", s1, s2))
|
||||
.and_then(|s: String|
|
||||
match s.as_str() {
|
||||
"OR" => Err(StreamErrorFor::<I>::unexpected_static_message("OR")),
|
||||
"AND" => Err(StreamErrorFor::<I>::unexpected_static_message("AND")),
|
||||
"NOT" => Err(StreamErrorFor::<I>::unexpected_static_message("NOT")),
|
||||
_ => Ok(s)
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
parser! {
|
||||
fn literal[I]()(I) -> UserInputLeaf
|
||||
where [I: Stream<Item = char>]
|
||||
{
|
||||
let term_val = || {
|
||||
let phrase = char('"').with(many1(satisfy(|c| c != '"'))).skip(char('"'));
|
||||
phrase.or(word())
|
||||
};
|
||||
let term_val_with_field = negative_number().or(term_val());
|
||||
let term_query =
|
||||
(field(), term_val_with_field)
|
||||
.map(|(field_name, phrase)| UserInputLiteral {
|
||||
field_name: Some(field_name),
|
||||
phrase,
|
||||
});
|
||||
let term_default_field = term_val().map(|phrase| UserInputLiteral {
|
||||
field_name: None,
|
||||
phrase,
|
||||
});
|
||||
attempt(term_query)
|
||||
.or(term_default_field)
|
||||
.map(UserInputLeaf::from)
|
||||
}
|
||||
}
|
||||
|
||||
parser! {
|
||||
fn negative_number[I]()(I) -> String
|
||||
where [I: Stream<Item = char>]
|
||||
{
|
||||
(char('-'), many1(satisfy(char::is_numeric)),
|
||||
optional((char('.'), many1(satisfy(char::is_numeric)))))
|
||||
.map(|(s1, s2, s3): (char, String, Option<(char, String)>)| {
|
||||
if let Some(('.', s3)) = s3 {
|
||||
format!("{}{}.{}", s1, s2, s3)
|
||||
} else {
|
||||
format!("{}{}", s1, s2)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
parser! {
|
||||
fn spaces1[I]()(I) -> ()
|
||||
where [I: Stream<Item = char>] {
|
||||
skip_many1(space())
|
||||
}
|
||||
}
|
||||
|
||||
parser! {
|
||||
/// Function that parses a range out of a Stream
|
||||
/// Supports ranges like:
|
||||
/// [5 TO 10], {5 TO 10}, [* TO 10], [10 TO *], {10 TO *], >5, <=10
|
||||
/// [a TO *], [a TO c], [abc TO bcd}
|
||||
fn range[I]()(I) -> UserInputLeaf
|
||||
where [I: Stream<Item = char>] {
|
||||
let range_term_val = || {
|
||||
word().or(negative_number()).or(char('*').with(value("*".to_string())))
|
||||
};
|
||||
|
||||
// check for unbounded range in the form of <5, <=10, >5, >=5
|
||||
let elastic_unbounded_range = (choice([attempt(string(">=")),
|
||||
attempt(string("<=")),
|
||||
attempt(string("<")),
|
||||
attempt(string(">"))])
|
||||
.skip(spaces()),
|
||||
range_term_val()).
|
||||
map(|(comparison_sign, bound): (&str, String)|
|
||||
match comparison_sign {
|
||||
">=" => (UserInputBound::Inclusive(bound), UserInputBound::Unbounded),
|
||||
"<=" => (UserInputBound::Unbounded, UserInputBound::Inclusive(bound)),
|
||||
"<" => (UserInputBound::Unbounded, UserInputBound::Exclusive(bound)),
|
||||
">" => (UserInputBound::Exclusive(bound), UserInputBound::Unbounded),
|
||||
// default case
|
||||
_ => (UserInputBound::Unbounded, UserInputBound::Unbounded)
|
||||
});
|
||||
let lower_bound = (one_of("{[".chars()), range_term_val())
|
||||
.map(|(boundary_char, lower_bound): (char, String)|
|
||||
if lower_bound == "*" {
|
||||
UserInputBound::Unbounded
|
||||
} else if boundary_char == '{' {
|
||||
UserInputBound::Exclusive(lower_bound)
|
||||
} else {
|
||||
UserInputBound::Inclusive(lower_bound)
|
||||
});
|
||||
let upper_bound = (range_term_val(), one_of("}]".chars()))
|
||||
.map(|(higher_bound, boundary_char): (String, char)|
|
||||
if higher_bound == "*" {
|
||||
UserInputBound::Unbounded
|
||||
} else if boundary_char == '}' {
|
||||
UserInputBound::Exclusive(higher_bound)
|
||||
} else {
|
||||
UserInputBound::Inclusive(higher_bound)
|
||||
});
|
||||
// return only lower and upper
|
||||
let lower_to_upper = (lower_bound.
|
||||
skip((spaces(),
|
||||
string("TO"),
|
||||
spaces())),
|
||||
upper_bound);
|
||||
|
||||
(optional(field()).skip(spaces()),
|
||||
// try elastic first, if it matches, the range is unbounded
|
||||
attempt(elastic_unbounded_range).or(lower_to_upper))
|
||||
.map(|(field, (lower, upper))|
|
||||
// Construct the leaf from extracted field (optional)
|
||||
// and bounds
|
||||
UserInputLeaf::Range {
|
||||
field,
|
||||
lower,
|
||||
upper
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
fn negate(expr: UserInputAST) -> UserInputAST {
|
||||
expr.unary(Occur::MustNot)
|
||||
}
|
||||
|
||||
fn must(expr: UserInputAST) -> UserInputAST {
|
||||
expr.unary(Occur::Must)
|
||||
}
|
||||
|
||||
parser! {
|
||||
fn leaf[I]()(I) -> UserInputAST
|
||||
where [I: Stream<Item = char>] {
|
||||
char('-').with(leaf()).map(negate)
|
||||
.or(char('+').with(leaf()).map(must))
|
||||
.or(char('(').with(ast()).skip(char(')')))
|
||||
.or(char('*').map(|_| UserInputAST::from(UserInputLeaf::All)))
|
||||
.or(attempt(string("NOT").skip(spaces1()).with(leaf()).map(negate)))
|
||||
.or(attempt(range().map(UserInputAST::from)))
|
||||
.or(literal().map(UserInputAST::from))
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone, Copy)]
|
||||
enum BinaryOperand {
|
||||
Or,
|
||||
And,
|
||||
}
|
||||
|
||||
parser! {
|
||||
fn binary_operand[I]()(I) -> BinaryOperand
|
||||
where [I: Stream<Item = char>]
|
||||
{
|
||||
string("AND").with(value(BinaryOperand::And))
|
||||
.or(string("OR").with(value(BinaryOperand::Or)))
|
||||
}
|
||||
}
|
||||
|
||||
fn aggregate_binary_expressions(
|
||||
left: UserInputAST,
|
||||
others: Vec<(BinaryOperand, UserInputAST)>,
|
||||
) -> UserInputAST {
|
||||
let mut dnf: Vec<Vec<UserInputAST>> = vec![vec![left]];
|
||||
for (operator, operand_ast) in others {
|
||||
match operator {
|
||||
BinaryOperand::And => {
|
||||
if let Some(last) = dnf.last_mut() {
|
||||
last.push(operand_ast);
|
||||
}
|
||||
}
|
||||
BinaryOperand::Or => {
|
||||
dnf.push(vec![operand_ast]);
|
||||
}
|
||||
}
|
||||
}
|
||||
if dnf.len() == 1 {
|
||||
UserInputAST::and(dnf.into_iter().next().unwrap()) //< safe
|
||||
} else {
|
||||
let conjunctions = dnf.into_iter().map(UserInputAST::and).collect();
|
||||
UserInputAST::or(conjunctions)
|
||||
}
|
||||
}
|
||||
|
||||
parser! {
|
||||
pub fn ast[I]()(I) -> UserInputAST
|
||||
where [I: Stream<Item = char>]
|
||||
{
|
||||
let operand_leaf = (binary_operand().skip(spaces()), leaf().skip(spaces()));
|
||||
let boolean_expr = (leaf().skip(spaces().silent()), many1(operand_leaf)).map(
|
||||
|(left, right)| aggregate_binary_expressions(left,right));
|
||||
let whitespace_separated_leaves = many1(leaf().skip(spaces().silent()))
|
||||
.map(|subqueries: Vec<UserInputAST>|
|
||||
if subqueries.len() == 1 {
|
||||
subqueries.into_iter().next().unwrap()
|
||||
} else {
|
||||
UserInputAST::Clause(subqueries.into_iter().collect())
|
||||
});
|
||||
let expr = attempt(boolean_expr).or(whitespace_separated_leaves);
|
||||
spaces().with(expr).skip(spaces())
|
||||
}
|
||||
}
|
||||
|
||||
parser! {
|
||||
pub fn parse_to_ast[I]()(I) -> UserInputAST
|
||||
where [I: Stream<Item = char>]
|
||||
{
|
||||
spaces().with(optional(ast()).skip(eof())).map(|opt_ast| opt_ast.unwrap_or_else(UserInputAST::empty_query))
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod test {
|
||||
|
||||
use super::*;
|
||||
|
||||
fn test_parse_query_to_ast_helper(query: &str, expected: &str) {
|
||||
let query = parse_to_ast().parse(query).unwrap().0;
|
||||
let query_str = format!("{:?}", query);
|
||||
assert_eq!(query_str, expected);
|
||||
}
|
||||
|
||||
fn test_is_parse_err(query: &str) {
|
||||
assert!(parse_to_ast().parse(query).is_err());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_parse_empty_to_ast() {
|
||||
test_parse_query_to_ast_helper("", "<emptyclause>");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_parse_query_to_ast_hyphen() {
|
||||
test_parse_query_to_ast_helper("\"www-form-encoded\"", "\"www-form-encoded\"");
|
||||
test_parse_query_to_ast_helper("www-form-encoded", "\"www-form-encoded\"");
|
||||
test_parse_query_to_ast_helper("www-form-encoded", "\"www-form-encoded\"");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_parse_query_to_ast_not_op() {
|
||||
assert_eq!(
|
||||
format!("{:?}", parse_to_ast().parse("NOT")),
|
||||
"Err(UnexpectedParse)"
|
||||
);
|
||||
test_parse_query_to_ast_helper("NOTa", "\"NOTa\"");
|
||||
test_parse_query_to_ast_helper("NOT a", "-(\"a\")");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_parse_query_to_ast_binary_op() {
|
||||
test_parse_query_to_ast_helper("a AND b", "(+(\"a\") +(\"b\"))");
|
||||
test_parse_query_to_ast_helper("a OR b", "(?(\"a\") ?(\"b\"))");
|
||||
test_parse_query_to_ast_helper("a OR b AND c", "(?(\"a\") ?((+(\"b\") +(\"c\"))))");
|
||||
test_parse_query_to_ast_helper("a AND b AND c", "(+(\"a\") +(\"b\") +(\"c\"))");
|
||||
assert_eq!(
|
||||
format!("{:?}", parse_to_ast().parse("a OR b aaa")),
|
||||
"Err(UnexpectedParse)"
|
||||
);
|
||||
assert_eq!(
|
||||
format!("{:?}", parse_to_ast().parse("a AND b aaa")),
|
||||
"Err(UnexpectedParse)"
|
||||
);
|
||||
assert_eq!(
|
||||
format!("{:?}", parse_to_ast().parse("aaa a OR b ")),
|
||||
"Err(UnexpectedParse)"
|
||||
);
|
||||
assert_eq!(
|
||||
format!("{:?}", parse_to_ast().parse("aaa ccc a OR b ")),
|
||||
"Err(UnexpectedParse)"
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_parse_elastic_query_ranges() {
|
||||
test_parse_query_to_ast_helper("title: >a", "title:{\"a\" TO \"*\"}");
|
||||
test_parse_query_to_ast_helper("title:>=a", "title:[\"a\" TO \"*\"}");
|
||||
test_parse_query_to_ast_helper("title: <a", "title:{\"*\" TO \"a\"}");
|
||||
test_parse_query_to_ast_helper("title:<=a", "title:{\"*\" TO \"a\"]");
|
||||
test_parse_query_to_ast_helper("title:<=bsd", "title:{\"*\" TO \"bsd\"]");
|
||||
|
||||
test_parse_query_to_ast_helper("weight: >70", "weight:{\"70\" TO \"*\"}");
|
||||
test_parse_query_to_ast_helper("weight:>=70", "weight:[\"70\" TO \"*\"}");
|
||||
test_parse_query_to_ast_helper("weight: <70", "weight:{\"*\" TO \"70\"}");
|
||||
test_parse_query_to_ast_helper("weight:<=70", "weight:{\"*\" TO \"70\"]");
|
||||
test_parse_query_to_ast_helper("weight: >60.7", "weight:{\"60.7\" TO \"*\"}");
|
||||
|
||||
test_parse_query_to_ast_helper("weight: <= 70", "weight:{\"*\" TO \"70\"]");
|
||||
|
||||
test_parse_query_to_ast_helper("weight: <= 70.5", "weight:{\"*\" TO \"70.5\"]");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_range_parser() {
|
||||
// testing the range() parser separately
|
||||
let res = range().parse("title: <hello").unwrap().0;
|
||||
let expected = UserInputLeaf::Range {
|
||||
field: Some("title".to_string()),
|
||||
lower: UserInputBound::Unbounded,
|
||||
upper: UserInputBound::Exclusive("hello".to_string()),
|
||||
};
|
||||
let res2 = range().parse("title:{* TO hello}").unwrap().0;
|
||||
assert_eq!(res, expected);
|
||||
assert_eq!(res2, expected);
|
||||
let expected_weight = UserInputLeaf::Range {
|
||||
field: Some("weight".to_string()),
|
||||
lower: UserInputBound::Inclusive("71.2".to_string()),
|
||||
upper: UserInputBound::Unbounded,
|
||||
};
|
||||
|
||||
let res3 = range().parse("weight: >=71.2").unwrap().0;
|
||||
let res4 = range().parse("weight:[71.2 TO *}").unwrap().0;
|
||||
assert_eq!(res3, expected_weight);
|
||||
assert_eq!(res4, expected_weight);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_parse_query_to_triming_spaces() {
|
||||
test_parse_query_to_ast_helper(" abc", "\"abc\"");
|
||||
test_parse_query_to_ast_helper("abc ", "\"abc\"");
|
||||
test_parse_query_to_ast_helper("( a OR abc)", "(?(\"a\") ?(\"abc\"))");
|
||||
test_parse_query_to_ast_helper("(a OR abc)", "(?(\"a\") ?(\"abc\"))");
|
||||
test_parse_query_to_ast_helper("(a OR abc)", "(?(\"a\") ?(\"abc\"))");
|
||||
test_parse_query_to_ast_helper("a OR abc ", "(?(\"a\") ?(\"abc\"))");
|
||||
test_parse_query_to_ast_helper("(a OR abc )", "(?(\"a\") ?(\"abc\"))");
|
||||
test_parse_query_to_ast_helper("(a OR abc) ", "(?(\"a\") ?(\"abc\"))");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_parse_query_to_ast() {
|
||||
test_parse_query_to_ast_helper("abc", "\"abc\"");
|
||||
test_parse_query_to_ast_helper("a b", "(\"a\" \"b\")");
|
||||
test_parse_query_to_ast_helper("+(a b)", "+((\"a\" \"b\"))");
|
||||
test_parse_query_to_ast_helper("+d", "+(\"d\")");
|
||||
test_parse_query_to_ast_helper("+(a b) +d", "(+((\"a\" \"b\")) +(\"d\"))");
|
||||
test_parse_query_to_ast_helper("(+a +b) d", "((+(\"a\") +(\"b\")) \"d\")");
|
||||
test_parse_query_to_ast_helper("(+a)", "+(\"a\")");
|
||||
test_parse_query_to_ast_helper("(+a +b)", "(+(\"a\") +(\"b\"))");
|
||||
test_parse_query_to_ast_helper("abc:toto", "abc:\"toto\"");
|
||||
test_parse_query_to_ast_helper("abc:1.1", "abc:\"1.1\"");
|
||||
test_parse_query_to_ast_helper("+abc:toto", "+(abc:\"toto\")");
|
||||
test_parse_query_to_ast_helper("(+abc:toto -titi)", "(+(abc:\"toto\") -(\"titi\"))");
|
||||
test_parse_query_to_ast_helper("-abc:toto", "-(abc:\"toto\")");
|
||||
test_parse_query_to_ast_helper("abc:a b", "(abc:\"a\" \"b\")");
|
||||
test_parse_query_to_ast_helper("abc:\"a b\"", "abc:\"a b\"");
|
||||
test_parse_query_to_ast_helper("foo:[1 TO 5]", "foo:[\"1\" TO \"5\"]");
|
||||
test_parse_query_to_ast_helper("[1 TO 5]", "[\"1\" TO \"5\"]");
|
||||
test_parse_query_to_ast_helper("foo:{a TO z}", "foo:{\"a\" TO \"z\"}");
|
||||
test_parse_query_to_ast_helper("foo:[1 TO toto}", "foo:[\"1\" TO \"toto\"}");
|
||||
test_parse_query_to_ast_helper("foo:[* TO toto}", "foo:{\"*\" TO \"toto\"}");
|
||||
test_parse_query_to_ast_helper("foo:[1 TO *}", "foo:[\"1\" TO \"*\"}");
|
||||
test_parse_query_to_ast_helper("foo:[1.1 TO *}", "foo:[\"1.1\" TO \"*\"}");
|
||||
test_is_parse_err("abc + ");
|
||||
}
|
||||
}
|
||||
@@ -1,8 +1,9 @@
|
||||
use std::fmt;
|
||||
use std::fmt::{Debug, Formatter};
|
||||
|
||||
use crate::query::Occur;
|
||||
use crate::Occur;
|
||||
|
||||
#[derive(PartialEq)]
|
||||
pub enum UserInputLeaf {
|
||||
Literal(UserInputLiteral),
|
||||
All,
|
||||
@@ -35,6 +36,7 @@ impl Debug for UserInputLeaf {
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(PartialEq)]
|
||||
pub struct UserInputLiteral {
|
||||
pub field_name: Option<String>,
|
||||
pub phrase: String,
|
||||
@@ -49,9 +51,11 @@ impl fmt::Debug for UserInputLiteral {
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(PartialEq)]
|
||||
pub enum UserInputBound {
|
||||
Inclusive(String),
|
||||
Exclusive(String),
|
||||
Unbounded,
|
||||
}
|
||||
|
||||
impl UserInputBound {
|
||||
@@ -59,6 +63,7 @@ impl UserInputBound {
|
||||
match *self {
|
||||
UserInputBound::Inclusive(ref word) => write!(formatter, "[\"{}\"", word),
|
||||
UserInputBound::Exclusive(ref word) => write!(formatter, "{{\"{}\"", word),
|
||||
UserInputBound::Unbounded => write!(formatter, "{{\"*\""),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -66,6 +71,7 @@ impl UserInputBound {
|
||||
match *self {
|
||||
UserInputBound::Inclusive(ref word) => write!(formatter, "\"{}\"]", word),
|
||||
UserInputBound::Exclusive(ref word) => write!(formatter, "\"{}\"}}", word),
|
||||
UserInputBound::Unbounded => write!(formatter, "\"*\"}}"),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -73,6 +79,7 @@ impl UserInputBound {
|
||||
match *self {
|
||||
UserInputBound::Inclusive(ref contents) => contents,
|
||||
UserInputBound::Exclusive(ref contents) => contents,
|
||||
UserInputBound::Unbounded => &"*",
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -80,9 +87,6 @@ impl UserInputBound {
|
||||
pub enum UserInputAST {
|
||||
Clause(Vec<UserInputAST>),
|
||||
Unary(Occur, Box<UserInputAST>),
|
||||
// Not(Box<UserInputAST>),
|
||||
// Should(Box<UserInputAST>),
|
||||
// Must(Box<UserInputAST>),
|
||||
Leaf(Box<UserInputLeaf>),
|
||||
}
|
||||
|
||||
@@ -92,7 +96,7 @@ impl UserInputAST {
|
||||
}
|
||||
|
||||
fn compose(occur: Occur, asts: Vec<UserInputAST>) -> UserInputAST {
|
||||
assert!(occur != Occur::MustNot);
|
||||
assert_ne!(occur, Occur::MustNot);
|
||||
assert!(!asts.is_empty());
|
||||
if asts.len() == 1 {
|
||||
asts.into_iter().next().unwrap() //< safe
|
||||
@@ -105,6 +109,10 @@ impl UserInputAST {
|
||||
}
|
||||
}
|
||||
|
||||
pub fn empty_query() -> UserInputAST {
|
||||
UserInputAST::Clause(Vec::default())
|
||||
}
|
||||
|
||||
pub fn and(asts: Vec<UserInputAST>) -> UserInputAST {
|
||||
UserInputAST::compose(Occur::Must, asts)
|
||||
}
|
||||
@@ -114,42 +122,6 @@ impl UserInputAST {
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
impl UserInputAST {
|
||||
|
||||
fn compose_occur(self, occur: Occur) -> UserInputAST {
|
||||
match self {
|
||||
UserInputAST::Not(other) => {
|
||||
let new_occur = compose_occur(Occur::MustNot, occur);
|
||||
other.simplify()
|
||||
}
|
||||
_ => {
|
||||
self
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub fn simplify(self) -> UserInputAST {
|
||||
match self {
|
||||
UserInputAST::Clause(els) => {
|
||||
if els.len() == 1 {
|
||||
return els.into_iter().next().unwrap();
|
||||
} else {
|
||||
return self;
|
||||
}
|
||||
}
|
||||
UserInputAST::Not(els) => {
|
||||
if els.len() == 1 {
|
||||
return els.into_iter().next().unwrap();
|
||||
} else {
|
||||
return self;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
*/
|
||||
|
||||
impl From<UserInputLiteral> for UserInputLeaf {
|
||||
fn from(literal: UserInputLiteral) -> UserInputLeaf {
|
||||
UserInputLeaf::Literal(literal)
|
||||
@@ -1,2 +1,2 @@
|
||||
#!/bin/bash
|
||||
cargo test --no-default-features --features mmap -- --test-threads 1
|
||||
cargo test
|
||||
|
||||
@@ -10,12 +10,10 @@ use crate::SegmentReader;
|
||||
/// documents match the query.
|
||||
///
|
||||
/// ```rust
|
||||
/// #[macro_use]
|
||||
/// extern crate tantivy;
|
||||
/// use tantivy::schema::{Schema, TEXT};
|
||||
/// use tantivy::{Index, Result};
|
||||
/// use tantivy::collector::Count;
|
||||
/// use tantivy::query::QueryParser;
|
||||
/// use tantivy::schema::{Schema, TEXT};
|
||||
/// use tantivy::{doc, Index, Result};
|
||||
///
|
||||
/// # fn main() { example().unwrap(); }
|
||||
/// fn example() -> Result<()> {
|
||||
|
||||
@@ -81,12 +81,10 @@ fn facet_depth(facet_bytes: &[u8]) -> usize {
|
||||
///
|
||||
///
|
||||
/// ```rust
|
||||
/// #[macro_use]
|
||||
/// extern crate tantivy;
|
||||
/// use tantivy::schema::{Facet, Schema, TEXT};
|
||||
/// use tantivy::{Index, Result};
|
||||
/// use tantivy::collector::FacetCollector;
|
||||
/// use tantivy::query::AllQuery;
|
||||
/// use tantivy::schema::{Facet, Schema, TEXT};
|
||||
/// use tantivy::{doc, Index, Result};
|
||||
///
|
||||
/// # fn main() { example().unwrap(); }
|
||||
/// fn example() -> Result<()> {
|
||||
|
||||
@@ -82,6 +82,7 @@ mod tests {
|
||||
let mut schema_builder = schema::Schema::builder();
|
||||
let num_field_i64 = schema_builder.add_i64_field("num_i64", FAST);
|
||||
let num_field_u64 = schema_builder.add_u64_field("num_u64", FAST);
|
||||
let num_field_f64 = schema_builder.add_f64_field("num_f64", FAST);
|
||||
let text_field = schema_builder.add_text_field("text", STRING);
|
||||
let schema = schema_builder.build();
|
||||
|
||||
@@ -94,6 +95,7 @@ mod tests {
|
||||
index_writer.add_document(doc!(
|
||||
num_field_i64 => ((i as i64) % 3i64) as i64,
|
||||
num_field_u64 => (i % 2u64) as u64,
|
||||
num_field_f64 => (i % 4u64) as f64,
|
||||
text_field => "text"
|
||||
));
|
||||
}
|
||||
@@ -104,10 +106,11 @@ mod tests {
|
||||
let searcher = index.reader().searcher();
|
||||
let mut ffvf_i64: IntFacetCollector<I64FastFieldReader> = IntFacetCollector::new(num_field_i64);
|
||||
let mut ffvf_u64: IntFacetCollector<U64FastFieldReader> = IntFacetCollector::new(num_field_u64);
|
||||
let mut ffvf_f64: IntFacetCollector<F64FastFieldReader> = IntFacetCollector::new(num_field_f64);
|
||||
|
||||
{
|
||||
// perform the query
|
||||
let mut facet_collectors = chain().push(&mut ffvf_i64).push(&mut ffvf_u64);
|
||||
let mut facet_collectors = chain().push(&mut ffvf_i64).push(&mut ffvf_u64).push(&mut ffvf_f64);
|
||||
let mut query_parser = QueryParser::for_index(index, vec![text_field]);
|
||||
let query = query_parser.parse_query("text:text").unwrap();
|
||||
query.search(&searcher, &mut facet_collectors).unwrap();
|
||||
@@ -117,6 +120,8 @@ mod tests {
|
||||
assert_eq!(ffvf_u64.counters[&1], 5);
|
||||
assert_eq!(ffvf_i64.counters[&0], 4);
|
||||
assert_eq!(ffvf_i64.counters[&1], 3);
|
||||
assert_eq!(ffvf_f64.counters[&0.0], 3);
|
||||
assert_eq!(ffvf_f64.counters[&2.0], 2);
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
@@ -35,7 +35,6 @@ The resulting `Fruit` will then be a typed tuple with each collector's original
|
||||
in their respective position.
|
||||
|
||||
```rust
|
||||
# extern crate tantivy;
|
||||
# use tantivy::schema::*;
|
||||
# use tantivy::*;
|
||||
# use tantivy::query::*;
|
||||
|
||||
@@ -105,12 +105,10 @@ impl<TFruit: Fruit> FruitHandle<TFruit> {
|
||||
/// [Combining several collectors section of the collector documentation](./index.html#combining-several-collectors).
|
||||
///
|
||||
/// ```rust
|
||||
/// #[macro_use]
|
||||
/// extern crate tantivy;
|
||||
/// use tantivy::schema::{Schema, TEXT};
|
||||
/// use tantivy::{Index, Result};
|
||||
/// use tantivy::collector::{Count, TopDocs, MultiCollector};
|
||||
/// use tantivy::query::QueryParser;
|
||||
/// use tantivy::schema::{Schema, TEXT};
|
||||
/// use tantivy::{doc, Index, Result};
|
||||
///
|
||||
/// # fn main() { example().unwrap(); }
|
||||
/// fn example() -> Result<()> {
|
||||
|
||||
@@ -8,13 +8,23 @@ use crate::DocId;
|
||||
use crate::Score;
|
||||
use crate::SegmentLocalId;
|
||||
|
||||
pub const TEST_COLLECTOR_WITH_SCORE: TestCollector = TestCollector {
|
||||
compute_score: true,
|
||||
};
|
||||
|
||||
pub const TEST_COLLECTOR_WITHOUT_SCORE: TestCollector = TestCollector {
|
||||
compute_score: false,
|
||||
};
|
||||
|
||||
/// Stores all of the doc ids.
|
||||
/// This collector is only used for tests.
|
||||
/// It is unusable in practice, as it does not store
/// the segment ordinals.
|
||||
pub struct TestCollector;
|
||||
pub struct TestCollector {
|
||||
pub compute_score: bool,
|
||||
}
|
||||
|
||||
pub struct TestSegmentCollector {
|
||||
segment_id: SegmentLocalId,
|
||||
@@ -32,7 +42,6 @@ impl TestFruit {
|
||||
pub fn docs(&self) -> &[DocAddress] {
|
||||
&self.docs[..]
|
||||
}
|
||||
|
||||
pub fn scores(&self) -> &[Score] {
|
||||
&self.scores[..]
|
||||
}
|
||||
@@ -54,7 +63,7 @@ impl Collector for TestCollector {
|
||||
}
|
||||
|
||||
fn requires_scoring(&self) -> bool {
|
||||
true
|
||||
self.compute_score
|
||||
}
|
||||
|
||||
fn merge_fruits(&self, mut children: Vec<TestFruit>) -> Result<TestFruit> {
|
||||
|
||||
@@ -13,6 +13,7 @@ use crate::Result;
|
||||
use crate::Score;
|
||||
use crate::SegmentLocalId;
|
||||
use crate::SegmentReader;
|
||||
use std::fmt;
|
||||
|
||||
/// The Top Score Collector keeps track of the K documents
|
||||
/// sorted by their score.
|
||||
@@ -22,13 +23,10 @@ use crate::SegmentReader;
|
||||
/// is `O(n log K)`.
|
||||
///
|
||||
/// ```rust
|
||||
/// #[macro_use]
|
||||
/// extern crate tantivy;
|
||||
/// use tantivy::DocAddress;
|
||||
/// use tantivy::schema::{Schema, TEXT};
|
||||
/// use tantivy::{Index, Result};
|
||||
/// use tantivy::collector::TopDocs;
|
||||
/// use tantivy::query::QueryParser;
|
||||
/// use tantivy::schema::{Schema, TEXT};
|
||||
/// use tantivy::{doc, DocAddress, Index, Result};
|
||||
///
|
||||
/// # fn main() { example().unwrap(); }
|
||||
/// fn example() -> Result<()> {
|
||||
@@ -68,6 +66,12 @@ use crate::SegmentReader;
|
||||
/// ```
|
||||
pub struct TopDocs(TopCollector<Score>);
|
||||
|
||||
impl fmt::Debug for TopDocs {
|
||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||
write!(f, "TopDocs({})", self.0.limit())
|
||||
}
|
||||
}
|
||||
|
||||
impl TopDocs {
|
||||
/// Creates a top score collector, with a number of documents equal to "limit".
|
||||
///
|
||||
@@ -80,10 +84,8 @@ impl TopDocs {
|
||||
/// Set top-K to rank documents by a given fast field.
|
||||
///
|
||||
/// ```rust
|
||||
/// #[macro_use]
|
||||
/// extern crate tantivy;
|
||||
/// # use tantivy::schema::{Schema, FAST, TEXT};
|
||||
/// # use tantivy::{Index, Result, DocAddress};
|
||||
/// # use tantivy::{doc, Index, Result, DocAddress};
|
||||
/// # use tantivy::query::{Query, QueryParser};
|
||||
/// use tantivy::Searcher;
|
||||
/// use tantivy::collector::TopDocs;
|
||||
@@ -121,7 +123,7 @@ impl TopDocs {
|
||||
/// ///
|
||||
/// /// `field` is required to be a FAST field.
|
||||
/// fn docs_sorted_by_rating(searcher: &Searcher,
|
||||
/// query: &Query,
|
||||
/// query: &dyn Query,
|
||||
/// sort_by_field: Field)
|
||||
/// -> Result<Vec<(u64, DocAddress)>> {
|
||||
///
|
||||
@@ -160,6 +162,7 @@ impl TopDocs {
|
||||
.fast_fields()
|
||||
.u64(field)
|
||||
.expect("Field requested is not a i64/u64 fast field.");
|
||||
// TODO: the error message mismatches the actual behavior for i64.
|
||||
move |doc: DocId| ff_reader.get(doc)
|
||||
})
|
||||
}
|
||||
@@ -189,10 +192,8 @@ impl TopDocs {
|
||||
/// learning-to-rank model over various features
|
||||
///
|
||||
/// ```rust
|
||||
/// #[macro_use]
|
||||
/// extern crate tantivy;
|
||||
/// # use tantivy::schema::{Schema, FAST, TEXT};
|
||||
/// # use tantivy::{Index, DocAddress, DocId, Score};
|
||||
/// # use tantivy::{doc, Index, DocAddress, DocId, Score};
|
||||
/// # use tantivy::query::QueryParser;
|
||||
/// use tantivy::SegmentReader;
|
||||
/// use tantivy::collector::TopDocs;
|
||||
@@ -294,10 +295,8 @@ impl TopDocs {
|
||||
/// # Example
|
||||
///
|
||||
/// ```rust
|
||||
/// # #[macro_use]
|
||||
/// # extern crate tantivy;
|
||||
/// # use tantivy::schema::{Schema, FAST, TEXT};
|
||||
/// # use tantivy::{Index, DocAddress, DocId};
|
||||
/// # use tantivy::{doc, Index, DocAddress, DocId};
|
||||
/// # use tantivy::query::QueryParser;
|
||||
/// use tantivy::SegmentReader;
|
||||
/// use tantivy::collector::TopDocs;
|
||||
@@ -583,7 +582,7 @@ mod tests {
|
||||
query_field: Field,
|
||||
schema: Schema,
|
||||
mut doc_adder: impl FnMut(&mut IndexWriter) -> (),
|
||||
) -> (Index, Box<Query>) {
|
||||
) -> (Index, Box<dyn Query>) {
|
||||
let index = Index::create_in_ram(schema);
|
||||
|
||||
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
|
||||
|
||||
@@ -99,16 +99,53 @@ pub fn u64_to_i64(val: u64) -> i64 {
|
||||
(val ^ HIGHEST_BIT) as i64
|
||||
}
|
||||
|
||||
/// Maps a `f64` to `u64`
|
||||
///
|
||||
/// For simplicity, tantivy internally handles `f64` as `u64`.
|
||||
/// The mapping is defined by this function.
|
||||
///
|
||||
/// Maps `f64` to `u64` so that lexical order is preserved.
|
||||
///
|
||||
/// This is better suited than a simple cast (`val as u64`),
/// which would truncate the value.
|
||||
///
|
||||
/// # See also
|
||||
/// The [reverse mapping is `u64_to_f64`](./fn.u64_to_f64.html).
|
||||
#[inline(always)]
|
||||
pub fn f64_to_u64(val: f64) -> u64 {
|
||||
let bits = val.to_bits();
|
||||
if val.is_sign_positive() {
|
||||
bits ^ HIGHEST_BIT
|
||||
} else {
|
||||
!bits
|
||||
}
|
||||
}
|
||||
|
||||
/// Reverse the mapping given by [`f64_to_u64`](./fn.f64_to_u64.html).
|
||||
#[inline(always)]
|
||||
pub fn u64_to_f64(val: u64) -> f64 {
|
||||
f64::from_bits(if val & HIGHEST_BIT != 0 {
|
||||
val ^ HIGHEST_BIT
|
||||
} else {
|
||||
!val
|
||||
})
|
||||
}
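To make the trick above concrete (a hedged sketch, not part of the patch): positive values only get their sign bit flipped, negative values get all bits flipped, so the resulting `u64` keys compare in the same order as the original `f64` values and the mapping is reversible.

// Illustrative sketch; assumes the `f64_to_u64` / `u64_to_f64` functions above.
fn demo_f64_mapping() {
    // The mapping is lossless.
    assert_eq!(u64_to_f64(f64_to_u64(-3.5)), -3.5);
    // Ordering is preserved: -2.0 < -1.0 < 0.0 < 1.5 still holds after mapping.
    let mapped: Vec<u64> = [-2.0f64, -1.0, 0.0, 1.5]
        .iter()
        .map(|&v| f64_to_u64(v))
        .collect();
    assert!(mapped.windows(2).all(|w| w[0] < w[1]));
}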
|
||||
|
||||
#[cfg(test)]
|
||||
pub(crate) mod test {
|
||||
|
||||
pub use super::serialize::test::fixed_size_test;
|
||||
use super::{compute_num_bits, i64_to_u64, u64_to_i64};
|
||||
use super::{compute_num_bits, f64_to_u64, i64_to_u64, u64_to_f64, u64_to_i64};
|
||||
use std::f64;
|
||||
|
||||
fn test_i64_converter_helper(val: i64) {
|
||||
assert_eq!(u64_to_i64(i64_to_u64(val)), val);
|
||||
}
|
||||
|
||||
fn test_f64_converter_helper(val: f64) {
|
||||
assert_eq!(u64_to_f64(f64_to_u64(val)), val);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_i64_converter() {
|
||||
assert_eq!(i64_to_u64(i64::min_value()), u64::min_value());
|
||||
@@ -121,6 +158,29 @@ pub(crate) mod test {
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_f64_converter() {
|
||||
test_f64_converter_helper(f64::INFINITY);
|
||||
test_f64_converter_helper(f64::NEG_INFINITY);
|
||||
test_f64_converter_helper(0.0);
|
||||
test_f64_converter_helper(-0.0);
|
||||
test_f64_converter_helper(1.0);
|
||||
test_f64_converter_helper(-1.0);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_f64_order() {
|
||||
assert!(!(f64_to_u64(f64::NEG_INFINITY)..f64_to_u64(f64::INFINITY))
|
||||
.contains(&f64_to_u64(f64::NAN))); //nan is not a number
|
||||
assert!(f64_to_u64(1.5) > f64_to_u64(1.0)); //same exponent, different mantissa
|
||||
assert!(f64_to_u64(2.0) > f64_to_u64(1.0)); //same mantissa, different exponent
|
||||
assert!(f64_to_u64(2.0) > f64_to_u64(1.5)); //different exponent and mantissa
|
||||
assert!(f64_to_u64(1.0) > f64_to_u64(-1.0)); // pos > neg
|
||||
assert!(f64_to_u64(-1.5) < f64_to_u64(-1.0));
|
||||
assert!(f64_to_u64(-2.0) < f64_to_u64(1.0));
|
||||
assert!(f64_to_u64(-2.0) < f64_to_u64(-1.5));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_compute_num_bits() {
|
||||
assert_eq!(compute_num_bits(1), 1u8);
|
||||
|
||||
@@ -102,6 +102,19 @@ impl FixedSize for i64 {
|
||||
const SIZE_IN_BYTES: usize = 8;
|
||||
}
|
||||
|
||||
impl BinarySerializable for f64 {
|
||||
fn serialize<W: Write>(&self, writer: &mut W) -> io::Result<()> {
|
||||
writer.write_f64::<Endianness>(*self)
|
||||
}
|
||||
fn deserialize<R: Read>(reader: &mut R) -> io::Result<Self> {
|
||||
reader.read_f64::<Endianness>()
|
||||
}
|
||||
}
|
||||
|
||||
impl FixedSize for f64 {
|
||||
const SIZE_IN_BYTES: usize = 8;
|
||||
}
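A hedged round-trip sketch for the two `f64` impls above (illustrative only; assumes the surrounding module's `BinarySerializable` and `FixedSize` traits are in scope):

// Illustrative only: write an f64 and read it back.
use std::io::Cursor;

fn roundtrip_f64(value: f64) -> std::io::Result<f64> {
    let mut buffer: Vec<u8> = Vec::new();
    value.serialize(&mut buffer)?;
    // f64 is a fixed-size type: always 8 bytes on the wire.
    assert_eq!(buffer.len(), <f64 as FixedSize>::SIZE_IN_BYTES);
    f64::deserialize(&mut Cursor::new(buffer))
}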
|
||||
|
||||
impl BinarySerializable for u8 {
|
||||
fn serialize<W: Write>(&self, writer: &mut W) -> io::Result<()> {
|
||||
writer.write_u8(*self)
|
||||
@@ -172,6 +185,11 @@ pub mod test {
|
||||
fixed_size_test::<i64>();
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_serialize_f64() {
|
||||
fixed_size_test::<f64>();
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_serialize_u64() {
|
||||
fixed_size_test::<u64>();
|
||||
|
||||
@@ -4,6 +4,7 @@ use crate::core::Executor;
|
||||
use crate::core::IndexMeta;
|
||||
use crate::core::SegmentId;
|
||||
use crate::core::SegmentMeta;
|
||||
use crate::core::SegmentMetaInventory;
|
||||
use crate::core::META_FILEPATH;
|
||||
use crate::directory::ManagedDirectory;
|
||||
#[cfg(feature = "mmap")]
|
||||
@@ -12,7 +13,6 @@ use crate::directory::INDEX_WRITER_LOCK;
|
||||
use crate::directory::{Directory, RAMDirectory};
|
||||
use crate::error::DataCorruption;
|
||||
use crate::error::TantivyError;
|
||||
use crate::indexer::index_writer::open_index_writer;
|
||||
use crate::indexer::index_writer::HEAP_SIZE_MIN;
|
||||
use crate::indexer::segment_updater::save_new_metas;
|
||||
use crate::reader::IndexReader;
|
||||
@@ -25,17 +25,16 @@ use crate::tokenizer::TokenizerManager;
|
||||
use crate::IndexWriter;
|
||||
use crate::Result;
|
||||
use num_cpus;
|
||||
use serde_json;
|
||||
use std::borrow::BorrowMut;
|
||||
use std::fmt;
|
||||
#[cfg(feature = "mmap")]
|
||||
use std::path::Path;
|
||||
use std::sync::Arc;
|
||||
|
||||
fn load_metas(directory: &dyn Directory) -> Result<IndexMeta> {
|
||||
fn load_metas(directory: &dyn Directory, inventory: &SegmentMetaInventory) -> Result<IndexMeta> {
|
||||
let meta_data = directory.atomic_read(&META_FILEPATH)?;
|
||||
let meta_string = String::from_utf8_lossy(&meta_data);
|
||||
serde_json::from_str(&meta_string)
|
||||
IndexMeta::deserialize(&meta_string, &inventory)
|
||||
.map_err(|e| {
|
||||
DataCorruption::new(
|
||||
META_FILEPATH.to_path_buf(),
|
||||
@@ -52,6 +51,7 @@ pub struct Index {
|
||||
schema: Schema,
|
||||
executor: Arc<Executor>,
|
||||
tokenizers: TokenizerManager,
|
||||
inventory: SegmentMetaInventory,
|
||||
}
|
||||
|
||||
impl Index {
|
||||
@@ -148,19 +148,23 @@ impl Index {
|
||||
fn from_directory(mut directory: ManagedDirectory, schema: Schema) -> Result<Index> {
|
||||
save_new_metas(schema.clone(), directory.borrow_mut())?;
|
||||
let metas = IndexMeta::with_schema(schema);
|
||||
Index::create_from_metas(directory, &metas)
|
||||
Index::create_from_metas(directory, &metas, SegmentMetaInventory::default())
|
||||
}
|
||||
|
||||
/// Creates a new index given a directory and an `IndexMeta`.
|
||||
fn create_from_metas(directory: ManagedDirectory, metas: &IndexMeta) -> Result<Index> {
|
||||
fn create_from_metas(
|
||||
directory: ManagedDirectory,
|
||||
metas: &IndexMeta,
|
||||
inventory: SegmentMetaInventory,
|
||||
) -> Result<Index> {
|
||||
let schema = metas.schema.clone();
|
||||
let index = Index {
|
||||
Ok(Index {
|
||||
directory,
|
||||
schema,
|
||||
tokenizers: TokenizerManager::default(),
|
||||
executor: Arc::new(Executor::single_thread()),
|
||||
};
|
||||
Ok(index)
|
||||
inventory,
|
||||
})
|
||||
}
|
||||
|
||||
/// Accessor for the tokenizer manager.
|
||||
@@ -169,11 +173,11 @@ impl Index {
|
||||
}
|
||||
|
||||
/// Helper to access the tokenizer associated with a specific field.
|
||||
pub fn tokenizer_for_field(&self, field: Field) -> Result<Box<dyn BoxedTokenizer>> {
|
||||
pub fn tokenizer_for_field(&self, field: Field) -> Result<BoxedTokenizer> {
|
||||
let field_entry = self.schema.get_field_entry(field);
|
||||
let field_type = field_entry.field_type();
|
||||
let tokenizer_manager: &TokenizerManager = self.tokenizers();
|
||||
let tokenizer_name_opt: Option<Box<dyn BoxedTokenizer>> = match field_type {
|
||||
let tokenizer_name_opt: Option<BoxedTokenizer> = match field_type {
|
||||
FieldType::Str(text_options) => text_options
|
||||
.get_indexing_options()
|
||||
.map(|text_indexing_options| text_indexing_options.tokenizer().to_string())
|
||||
@@ -212,16 +216,35 @@ impl Index {
|
||||
Index::open(mmap_directory)
|
||||
}
|
||||
|
||||
/// Returns the list of the segment metas tracked by the index.
|
||||
///
|
||||
/// Such segments may be part of the index, but they can also be segments
/// that are currently being built or in the middle of a merge operation.
|
||||
pub fn list_all_segment_metas(&self) -> Vec<SegmentMeta> {
|
||||
self.inventory.all()
|
||||
}
|
||||
|
||||
/// Creates a new segment_meta (Advanced user only).
|
||||
///
|
||||
/// As long as the `SegmentMeta` lives, the files associated with the
|
||||
/// `SegmentMeta` are guaranteed to not be garbage collected, regardless of
|
||||
/// whether the segment is recorded as part of the index or not.
|
||||
pub fn new_segment_meta(&self, segment_id: SegmentId, max_doc: u32) -> SegmentMeta {
|
||||
self.inventory.new_segment_meta(segment_id, max_doc)
|
||||
}
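A hedged sketch of how the two accessors above fit together (`index` is a placeholder for an existing `Index`):

// Hypothetical usage, not part of the patch.
let meta = index.new_segment_meta(SegmentId::generate_random(), 0);
// While `meta` is alive its files cannot be garbage collected, and it is
// visible in the inventory listing even though it is not committed.
assert!(index
    .list_all_segment_metas()
    .iter()
    .any(|tracked| tracked.id() == meta.id()));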
|
||||
|
||||
/// Open the index using the provided directory
|
||||
pub fn open<D: Directory>(directory: D) -> Result<Index> {
|
||||
let directory = ManagedDirectory::wrap(directory)?;
|
||||
let metas = load_metas(&directory)?;
|
||||
Index::create_from_metas(directory, &metas)
|
||||
let inventory = SegmentMetaInventory::default();
|
||||
let metas = load_metas(&directory, &inventory)?;
|
||||
Index::create_from_metas(directory, &metas, inventory)
|
||||
}
|
||||
|
||||
/// Reads the index meta file from the directory.
|
||||
pub fn load_metas(&self) -> Result<IndexMeta> {
|
||||
load_metas(self.directory())
|
||||
load_metas(self.directory(), &self.inventory)
|
||||
}
|
||||
|
||||
/// Open a new index writer. Attempts to acquire a lockfile.
|
||||
@@ -265,7 +288,7 @@ impl Index {
|
||||
)
|
||||
})?;
|
||||
let heap_size_in_bytes_per_thread = overall_heap_size_in_bytes / num_threads;
|
||||
open_index_writer(
|
||||
IndexWriter::new(
|
||||
self,
|
||||
num_threads,
|
||||
heap_size_in_bytes_per_thread,
|
||||
@@ -315,7 +338,9 @@ impl Index {
|
||||
|
||||
/// Creates a new segment.
|
||||
pub fn new_segment(&self) -> Segment {
|
||||
let segment_meta = SegmentMeta::new(SegmentId::generate_random(), 0);
|
||||
let segment_meta = self
|
||||
.inventory
|
||||
.new_segment_meta(SegmentId::generate_random(), 0);
|
||||
self.segment(segment_meta)
|
||||
}
|
||||
|
||||
@@ -448,13 +473,13 @@ mod tests {
|
||||
|
||||
use super::*;
|
||||
use std::path::PathBuf;
|
||||
use tempdir::TempDir;
|
||||
use tempfile::TempDir;
|
||||
|
||||
#[test]
|
||||
fn test_index_on_commit_reload_policy_mmap() {
|
||||
let schema = throw_away_schema();
|
||||
let field = schema.get_field("num_likes").unwrap();
|
||||
let tempdir = TempDir::new("index").unwrap();
|
||||
let tempdir = TempDir::new().unwrap();
|
||||
let tempdir_path = PathBuf::from(tempdir.path());
|
||||
let index = Index::create_in_dir(&tempdir_path, schema).unwrap();
|
||||
let mut writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
|
||||
@@ -493,7 +518,7 @@ mod tests {
|
||||
fn test_index_on_commit_reload_policy_different_directories() {
|
||||
let schema = throw_away_schema();
|
||||
let field = schema.get_field("num_likes").unwrap();
|
||||
let tempdir = TempDir::new("index").unwrap();
|
||||
let tempdir = TempDir::new().unwrap();
|
||||
let tempdir_path = PathBuf::from(tempdir.path());
|
||||
let write_index = Index::create_in_dir(&tempdir_path, schema).unwrap();
|
||||
let read_index = Index::open_in_dir(&tempdir_path).unwrap();
|
||||
|
||||
@@ -1,8 +1,184 @@
|
||||
use crate::core::SegmentMeta;
|
||||
use super::SegmentComponent;
|
||||
use crate::core::SegmentId;
|
||||
use crate::schema::Schema;
|
||||
use crate::Opstamp;
|
||||
use census::{Inventory, TrackedObject};
|
||||
use serde;
|
||||
use serde_json;
|
||||
use std::collections::HashSet;
|
||||
use std::fmt;
|
||||
use std::path::PathBuf;
|
||||
|
||||
#[derive(Clone, Debug, Serialize, Deserialize)]
|
||||
struct DeleteMeta {
|
||||
num_deleted_docs: u32,
|
||||
opstamp: Opstamp,
|
||||
}
|
||||
|
||||
#[derive(Clone, Default)]
|
||||
pub struct SegmentMetaInventory {
|
||||
inventory: Inventory<InnerSegmentMeta>,
|
||||
}
|
||||
|
||||
impl SegmentMetaInventory {
|
||||
/// Lists all living `SegmentMeta` objects at the time of the call.
|
||||
pub fn all(&self) -> Vec<SegmentMeta> {
|
||||
self.inventory
|
||||
.list()
|
||||
.into_iter()
|
||||
.map(SegmentMeta::from)
|
||||
.collect::<Vec<_>>()
|
||||
}
|
||||
|
||||
pub fn new_segment_meta(&self, segment_id: SegmentId, max_doc: u32) -> SegmentMeta {
|
||||
let inner = InnerSegmentMeta {
|
||||
segment_id,
|
||||
max_doc,
|
||||
deletes: None,
|
||||
};
|
||||
SegmentMeta::from(self.inventory.track(inner))
|
||||
}
|
||||
}
|
||||
|
||||
/// `SegmentMeta` contains simple meta information about a segment.
|
||||
///
|
||||
/// For instance the number of docs it contains,
|
||||
/// how many are deleted, etc.
|
||||
#[derive(Clone)]
|
||||
pub struct SegmentMeta {
|
||||
tracked: TrackedObject<InnerSegmentMeta>,
|
||||
}
|
||||
|
||||
impl fmt::Debug for SegmentMeta {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> Result<(), fmt::Error> {
|
||||
self.tracked.fmt(f)
|
||||
}
|
||||
}
|
||||
|
||||
impl serde::Serialize for SegmentMeta {
|
||||
fn serialize<S>(
|
||||
&self,
|
||||
serializer: S,
|
||||
) -> Result<<S as serde::Serializer>::Ok, <S as serde::Serializer>::Error>
|
||||
where
|
||||
S: serde::Serializer,
|
||||
{
|
||||
self.tracked.serialize(serializer)
|
||||
}
|
||||
}
|
||||
|
||||
impl From<TrackedObject<InnerSegmentMeta>> for SegmentMeta {
|
||||
fn from(tracked: TrackedObject<InnerSegmentMeta>) -> SegmentMeta {
|
||||
SegmentMeta { tracked }
|
||||
}
|
||||
}
|
||||
|
||||
impl SegmentMeta {
|
||||
// Creates a new `SegmentMeta` object.
|
||||
|
||||
/// Returns the segment id.
|
||||
pub fn id(&self) -> SegmentId {
|
||||
self.tracked.segment_id
|
||||
}
|
||||
|
||||
/// Returns the number of deleted documents.
|
||||
pub fn num_deleted_docs(&self) -> u32 {
|
||||
self.tracked
|
||||
.deletes
|
||||
.as_ref()
|
||||
.map(|delete_meta| delete_meta.num_deleted_docs)
|
||||
.unwrap_or(0u32)
|
||||
}
|
||||
|
||||
/// Returns the list of files that
|
||||
/// are required for the segment meta.
|
||||
///
|
||||
/// This is useful because tantivy removes files by deleting
/// every file it has created that is no longer used by any segment.
|
||||
pub fn list_files(&self) -> HashSet<PathBuf> {
|
||||
SegmentComponent::iterator()
|
||||
.map(|component| self.relative_path(*component))
|
||||
.collect::<HashSet<PathBuf>>()
|
||||
}
|
||||
|
||||
/// Returns the relative path of a component of our segment.
|
||||
///
|
||||
/// It just joins the segment id with the extension
|
||||
/// associated to a segment component.
|
||||
pub fn relative_path(&self, component: SegmentComponent) -> PathBuf {
|
||||
let mut path = self.id().uuid_string();
|
||||
path.push_str(&*match component {
|
||||
SegmentComponent::POSTINGS => ".idx".to_string(),
|
||||
SegmentComponent::POSITIONS => ".pos".to_string(),
|
||||
SegmentComponent::POSITIONSSKIP => ".posidx".to_string(),
|
||||
SegmentComponent::TERMS => ".term".to_string(),
|
||||
SegmentComponent::STORE => ".store".to_string(),
|
||||
SegmentComponent::FASTFIELDS => ".fast".to_string(),
|
||||
SegmentComponent::FIELDNORMS => ".fieldnorm".to_string(),
|
||||
SegmentComponent::DELETE => format!(".{}.del", self.delete_opstamp().unwrap_or(0)),
|
||||
});
|
||||
PathBuf::from(path)
|
||||
}
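For example (a sketch under the naming scheme above, with `inventory` and `segment_id` as placeholders), the postings file of a segment is simply its uuid plus the `.idx` extension:

// Illustrative only.
let meta = inventory.new_segment_meta(segment_id, 0);
assert_eq!(
    meta.relative_path(SegmentComponent::POSTINGS),
    PathBuf::from(format!("{}.idx", segment_id.uuid_string()))
);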
|
||||
|
||||
/// Return the highest doc id + 1
|
||||
///
|
||||
/// If there are no deletes, then num_docs = max_doc
/// and all the doc ids contained in this segment
/// are exactly (0..max_doc).
|
||||
pub fn max_doc(&self) -> u32 {
|
||||
self.tracked.max_doc
|
||||
}
|
||||
|
||||
/// Return the number of documents in the segment.
|
||||
pub fn num_docs(&self) -> u32 {
|
||||
self.max_doc() - self.num_deleted_docs()
|
||||
}
|
||||
|
||||
/// Returns the `Opstamp` of the last delete operation
|
||||
/// taken into account in this segment.
|
||||
pub fn delete_opstamp(&self) -> Option<Opstamp> {
|
||||
self.tracked
|
||||
.deletes
|
||||
.as_ref()
|
||||
.map(|delete_meta| delete_meta.opstamp)
|
||||
}
|
||||
|
||||
/// Returns true iff the segment meta contains
|
||||
/// delete information.
|
||||
pub fn has_deletes(&self) -> bool {
|
||||
self.num_deleted_docs() > 0
|
||||
}
|
||||
|
||||
#[doc(hidden)]
|
||||
pub fn with_delete_meta(self, num_deleted_docs: u32, opstamp: Opstamp) -> SegmentMeta {
|
||||
let delete_meta = DeleteMeta {
|
||||
num_deleted_docs,
|
||||
opstamp,
|
||||
};
|
||||
let tracked = self.tracked.map(move |inner_meta| InnerSegmentMeta {
|
||||
segment_id: inner_meta.segment_id,
|
||||
max_doc: inner_meta.max_doc,
|
||||
deletes: Some(delete_meta),
|
||||
});
|
||||
SegmentMeta { tracked }
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
struct InnerSegmentMeta {
|
||||
segment_id: SegmentId,
|
||||
max_doc: u32,
|
||||
deletes: Option<DeleteMeta>,
|
||||
}
|
||||
|
||||
impl InnerSegmentMeta {
|
||||
pub fn track(self, inventory: &SegmentMetaInventory) -> SegmentMeta {
|
||||
SegmentMeta {
|
||||
tracked: inventory.inventory.track(self),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Meta information about the `Index`.
|
||||
///
|
||||
@@ -12,7 +188,7 @@ use std::fmt;
|
||||
/// * the index `docstamp`
|
||||
/// * the schema
|
||||
///
|
||||
#[derive(Clone, Serialize, Deserialize)]
|
||||
#[derive(Clone, Serialize)]
|
||||
pub struct IndexMeta {
|
||||
/// List of the `SegmentMeta` information associated with each finalized segment of the index.
|
||||
pub segments: Vec<SegmentMeta>,
|
||||
@@ -29,6 +205,30 @@ pub struct IndexMeta {
|
||||
pub payload: Option<String>,
|
||||
}
|
||||
|
||||
#[derive(Deserialize)]
|
||||
struct UntrackedIndexMeta {
|
||||
pub segments: Vec<InnerSegmentMeta>,
|
||||
pub schema: Schema,
|
||||
pub opstamp: Opstamp,
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub payload: Option<String>,
|
||||
}
|
||||
|
||||
impl UntrackedIndexMeta {
|
||||
pub fn track(self, inventory: &SegmentMetaInventory) -> IndexMeta {
|
||||
IndexMeta {
|
||||
segments: self
|
||||
.segments
|
||||
.into_iter()
|
||||
.map(|inner_seg_meta| inner_seg_meta.track(inventory))
|
||||
.collect::<Vec<SegmentMeta>>(),
|
||||
schema: self.schema,
|
||||
opstamp: self.opstamp,
|
||||
payload: self.payload,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl IndexMeta {
|
||||
/// Create an `IndexMeta` object representing a brand new `Index`
|
||||
/// with the given index.
|
||||
@@ -43,6 +243,14 @@ impl IndexMeta {
|
||||
payload: None,
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn deserialize(
|
||||
meta_json: &str,
|
||||
inventory: &SegmentMetaInventory,
|
||||
) -> serde_json::Result<IndexMeta> {
|
||||
let untracked_meta_json: UntrackedIndexMeta = serde_json::from_str(meta_json)?;
|
||||
Ok(untracked_meta_json.track(inventory))
|
||||
}
|
||||
}
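A minimal sketch of the load path this enables (hypothetical caller, mirroring what `load_metas` does in `index.rs`): the JSON is first parsed into an `UntrackedIndexMeta`, then every `InnerSegmentMeta` is registered with the inventory so that it becomes a tracked `SegmentMeta`.

// Illustrative only.
fn parse_meta(meta_json: &str) -> serde_json::Result<IndexMeta> {
    let inventory = SegmentMetaInventory::default();
    let meta = IndexMeta::deserialize(meta_json, &inventory)?;
    // Every segment listed in meta.json is now alive in the inventory.
    assert_eq!(inventory.all().len(), meta.segments.len());
    Ok(meta)
}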
|
||||
|
||||
impl fmt::Debug for IndexMeta {
|
||||
|
||||
@@ -32,7 +32,7 @@ pub struct InvertedIndexReader {
|
||||
}
|
||||
|
||||
impl InvertedIndexReader {
|
||||
#[cfg_attr(feature = "cargo-clippy", allow(clippy::needless_pass_by_value))] // for symetry
|
||||
#[cfg_attr(feature = "cargo-clippy", allow(clippy::needless_pass_by_value))] // for symmetry
|
||||
pub(crate) fn new(
|
||||
termdict: TermDictionary,
|
||||
postings_source: ReadOnlySource,
|
||||
|
||||
@@ -6,19 +6,17 @@ pub mod searcher;
|
||||
mod segment;
|
||||
mod segment_component;
|
||||
mod segment_id;
|
||||
mod segment_meta;
|
||||
mod segment_reader;
|
||||
|
||||
pub use self::executor::Executor;
|
||||
pub use self::index::Index;
|
||||
pub use self::index_meta::IndexMeta;
|
||||
pub use self::index_meta::{IndexMeta, SegmentMeta, SegmentMetaInventory};
|
||||
pub use self::inverted_index_reader::InvertedIndexReader;
|
||||
pub use self::searcher::Searcher;
|
||||
pub use self::segment::Segment;
|
||||
pub use self::segment::SerializableSegment;
|
||||
pub use self::segment_component::SegmentComponent;
|
||||
pub use self::segment_id::SegmentId;
|
||||
pub use self::segment_meta::SegmentMeta;
|
||||
pub use self::segment_reader::SegmentReader;
|
||||
|
||||
use once_cell::sync::Lazy;
|
||||
|
||||
@@ -4,6 +4,8 @@ use uuid::Uuid;
|
||||
|
||||
#[cfg(test)]
|
||||
use once_cell::sync::Lazy;
|
||||
use std::error::Error;
|
||||
use std::str::FromStr;
|
||||
#[cfg(test)]
|
||||
use std::sync::atomic;
|
||||
|
||||
@@ -52,15 +54,51 @@ impl SegmentId {
|
||||
/// and the rest is random.
|
||||
///
|
||||
/// Picking the first 8 chars is ok to identify
|
||||
/// segments in a display message.
|
||||
/// segments in a display message (e.g. a5c4dfcb).
|
||||
pub fn short_uuid_string(&self) -> String {
|
||||
(&self.0.to_simple_ref().to_string()[..8]).to_string()
|
||||
}
|
||||
|
||||
/// Returns a segment uuid string.
|
||||
///
|
||||
/// It consists of 32 lowercase hexadecimal chars
|
||||
/// (e.g. a5c4dfcbdfe645089129e308e26d5523)
|
||||
pub fn uuid_string(&self) -> String {
|
||||
self.0.to_simple_ref().to_string()
|
||||
}
|
||||
|
||||
/// Build a `SegmentId` string from the full uuid string.
|
||||
///
|
||||
/// E.g. "a5c4dfcbdfe645089129e308e26d5523"
|
||||
pub fn from_uuid_string(uuid_string: &str) -> Result<SegmentId, SegmentIdParseError> {
|
||||
FromStr::from_str(uuid_string)
|
||||
}
|
||||
}
|
||||
|
||||
/// Error type used when parsing a `SegmentId` from a string fails.
|
||||
pub struct SegmentIdParseError(uuid::parser::ParseError);
|
||||
|
||||
impl Error for SegmentIdParseError {}
|
||||
|
||||
impl fmt::Debug for SegmentIdParseError {
|
||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||
self.0.fmt(f)
|
||||
}
|
||||
}
|
||||
|
||||
impl fmt::Display for SegmentIdParseError {
|
||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||
self.0.fmt(f)
|
||||
}
|
||||
}
|
||||
|
||||
impl FromStr for SegmentId {
|
||||
type Err = SegmentIdParseError;
|
||||
|
||||
fn from_str(uuid_string: &str) -> Result<Self, SegmentIdParseError> {
|
||||
let uuid = Uuid::parse_str(uuid_string).map_err(SegmentIdParseError)?;
|
||||
Ok(SegmentId(uuid))
|
||||
}
|
||||
}
|
||||
|
||||
impl fmt::Debug for SegmentId {
|
||||
@@ -80,3 +118,18 @@ impl Ord for SegmentId {
|
||||
self.0.as_bytes().cmp(other.0.as_bytes())
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::SegmentId;
|
||||
|
||||
#[test]
|
||||
fn test_to_uuid_string() {
|
||||
let full_uuid = "a5c4dfcbdfe645089129e308e26d5523";
|
||||
let segment_id = SegmentId::from_uuid_string(full_uuid).unwrap();
|
||||
assert_eq!(segment_id.uuid_string(), full_uuid);
|
||||
assert_eq!(segment_id.short_uuid_string(), "a5c4dfcb");
|
||||
// one extra char
|
||||
assert!(SegmentId::from_uuid_string("a5c4dfcbdfe645089129e308e26d5523b").is_err());
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,174 +0,0 @@
|
||||
use super::SegmentComponent;
|
||||
use crate::core::SegmentId;
|
||||
use crate::Opstamp;
|
||||
use census::{Inventory, TrackedObject};
|
||||
use once_cell::sync::Lazy;
|
||||
use serde;
|
||||
use std::collections::HashSet;
|
||||
use std::fmt;
|
||||
use std::path::PathBuf;
|
||||
|
||||
static INVENTORY: Lazy<Inventory<InnerSegmentMeta>> = Lazy::new(Inventory::new);
|
||||
|
||||
#[derive(Clone, Debug, Serialize, Deserialize)]
|
||||
struct DeleteMeta {
|
||||
num_deleted_docs: u32,
|
||||
opstamp: Opstamp,
|
||||
}
|
||||
|
||||
/// `SegmentMeta` contains simple meta information about a segment.
|
||||
///
|
||||
/// For instance the number of docs it contains,
|
||||
/// how many are deleted, etc.
|
||||
#[derive(Clone)]
|
||||
pub struct SegmentMeta {
|
||||
tracked: TrackedObject<InnerSegmentMeta>,
|
||||
}
|
||||
|
||||
impl fmt::Debug for SegmentMeta {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> Result<(), fmt::Error> {
|
||||
self.tracked.fmt(f)
|
||||
}
|
||||
}
|
||||
|
||||
impl serde::Serialize for SegmentMeta {
|
||||
fn serialize<S>(
|
||||
&self,
|
||||
serializer: S,
|
||||
) -> Result<<S as serde::Serializer>::Ok, <S as serde::Serializer>::Error>
|
||||
where
|
||||
S: serde::Serializer,
|
||||
{
|
||||
self.tracked.serialize(serializer)
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> serde::Deserialize<'a> for SegmentMeta {
|
||||
fn deserialize<D>(deserializer: D) -> Result<Self, <D as serde::Deserializer<'a>>::Error>
|
||||
where
|
||||
D: serde::Deserializer<'a>,
|
||||
{
|
||||
let inner = InnerSegmentMeta::deserialize(deserializer)?;
|
||||
let tracked = INVENTORY.track(inner);
|
||||
Ok(SegmentMeta { tracked })
|
||||
}
|
||||
}
|
||||
|
||||
impl SegmentMeta {
|
||||
/// Lists all living `SegmentMeta` object at the time of the call.
|
||||
pub fn all() -> Vec<SegmentMeta> {
|
||||
INVENTORY
|
||||
.list()
|
||||
.into_iter()
|
||||
.map(|inner| SegmentMeta { tracked: inner })
|
||||
.collect::<Vec<_>>()
|
||||
}
|
||||
|
||||
/// Creates a new `SegmentMeta` object.
|
||||
#[doc(hidden)]
|
||||
pub fn new(segment_id: SegmentId, max_doc: u32) -> SegmentMeta {
|
||||
let inner = InnerSegmentMeta {
|
||||
segment_id,
|
||||
max_doc,
|
||||
deletes: None,
|
||||
};
|
||||
SegmentMeta {
|
||||
tracked: INVENTORY.track(inner),
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns the segment id.
|
||||
pub fn id(&self) -> SegmentId {
|
||||
self.tracked.segment_id
|
||||
}
|
||||
|
||||
/// Returns the number of deleted documents.
|
||||
pub fn num_deleted_docs(&self) -> u32 {
|
||||
self.tracked
|
||||
.deletes
|
||||
.as_ref()
|
||||
.map(|delete_meta| delete_meta.num_deleted_docs)
|
||||
.unwrap_or(0u32)
|
||||
}
|
||||
|
||||
/// Returns the list of files that
|
||||
/// are required for the segment meta.
|
||||
///
|
||||
/// This is useful as the way tantivy removes files
|
||||
/// is by removing all files that have been created by tantivy
|
||||
/// and are not used by any segment anymore.
|
||||
pub fn list_files(&self) -> HashSet<PathBuf> {
|
||||
SegmentComponent::iterator()
|
||||
.map(|component| self.relative_path(*component))
|
||||
.collect::<HashSet<PathBuf>>()
|
||||
}
|
||||
|
||||
/// Returns the relative path of a component of our segment.
|
||||
///
|
||||
/// It just joins the segment id with the extension
|
||||
/// associated to a segment component.
|
||||
pub fn relative_path(&self, component: SegmentComponent) -> PathBuf {
|
||||
let mut path = self.id().uuid_string();
|
||||
path.push_str(&*match component {
|
||||
SegmentComponent::POSTINGS => ".idx".to_string(),
|
||||
SegmentComponent::POSITIONS => ".pos".to_string(),
|
||||
SegmentComponent::POSITIONSSKIP => ".posidx".to_string(),
|
||||
SegmentComponent::TERMS => ".term".to_string(),
|
||||
SegmentComponent::STORE => ".store".to_string(),
|
||||
SegmentComponent::FASTFIELDS => ".fast".to_string(),
|
||||
SegmentComponent::FIELDNORMS => ".fieldnorm".to_string(),
|
||||
SegmentComponent::DELETE => format!(".{}.del", self.delete_opstamp().unwrap_or(0)),
|
||||
});
|
||||
PathBuf::from(path)
|
||||
}
|
||||
|
||||
/// Return the highest doc id + 1
|
||||
///
|
||||
/// If there are no deletes, then num_docs = max_docs
|
||||
/// and all the doc ids contains in this segment
|
||||
/// are exactly (0..max_doc).
|
||||
pub fn max_doc(&self) -> u32 {
|
||||
self.tracked.max_doc
|
||||
}
|
||||
|
||||
/// Return the number of documents in the segment.
|
||||
pub fn num_docs(&self) -> u32 {
|
||||
self.max_doc() - self.num_deleted_docs()
|
||||
}
|
||||
|
||||
/// Returns the `Opstamp` of the last delete operation
|
||||
/// taken in account in this segment.
|
||||
pub fn delete_opstamp(&self) -> Option<Opstamp> {
|
||||
self.tracked
|
||||
.deletes
|
||||
.as_ref()
|
||||
.map(|delete_meta| delete_meta.opstamp)
|
||||
}
|
||||
|
||||
/// Returns true iff the segment meta contains
|
||||
/// delete information.
|
||||
pub fn has_deletes(&self) -> bool {
|
||||
self.num_deleted_docs() > 0
|
||||
}
|
||||
|
||||
#[doc(hidden)]
|
||||
pub fn with_delete_meta(self, num_deleted_docs: u32, opstamp: Opstamp) -> SegmentMeta {
|
||||
let delete_meta = DeleteMeta {
|
||||
num_deleted_docs,
|
||||
opstamp,
|
||||
};
|
||||
let tracked = self.tracked.map(move |inner_meta| InnerSegmentMeta {
|
||||
segment_id: inner_meta.segment_id,
|
||||
max_doc: inner_meta.max_doc,
|
||||
deletes: Some(delete_meta),
|
||||
});
|
||||
SegmentMeta { tracked }
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, Serialize, Deserialize)]
|
||||
struct InnerSegmentMeta {
|
||||
segment_id: SegmentId,
|
||||
max_doc: u32,
|
||||
deletes: Option<DeleteMeta>,
|
||||
}
|
||||
@@ -48,14 +48,14 @@ impl RetryPolicy {
|
||||
///
|
||||
/// It is transparently associated with a lock file that gets deleted
/// on `Drop`. The lock is released automatically on `Drop`.
|
||||
pub struct DirectoryLock(Box<dyn Drop + Send + Sync + 'static>);
|
||||
pub struct DirectoryLock(Box<dyn Send + Sync + 'static>);
|
||||
|
||||
struct DirectoryLockGuard {
|
||||
directory: Box<dyn Directory>,
|
||||
path: PathBuf,
|
||||
}
|
||||
|
||||
impl<T: Drop + Send + Sync + 'static> From<Box<T>> for DirectoryLock {
|
||||
impl<T: Send + Sync + 'static> From<Box<T>> for DirectoryLock {
|
||||
fn from(underlying: Box<T>) -> Self {
|
||||
DirectoryLock(underlying)
|
||||
}
|
||||
@@ -204,7 +204,7 @@ pub trait Directory: DirectoryClone + fmt::Debug + Send + Sync + 'static {
|
||||
/// Internally, tantivy only uses this API to detect new commits to implement the
|
||||
/// `OnCommit` `ReloadPolicy`. Not implementing watch in a `Directory` only prevents the
|
||||
/// `OnCommit` `ReloadPolicy` from working properly.
|
||||
fn watch(&self, watch_callback: WatchCallback) -> WatchHandle;
|
||||
fn watch(&self, watch_callback: WatchCallback) -> crate::Result<WatchHandle>;
|
||||
}
|
||||
|
||||
/// DirectoryClone
|
||||
|
||||
@@ -135,28 +135,28 @@ impl ManagedDirectory {
|
||||
files_to_delete.push(managed_path.clone());
|
||||
}
|
||||
}
|
||||
} else {
|
||||
error!("Failed to acquire lock for GC");
|
||||
}
|
||||
}
|
||||
|
||||
let mut deleted_files = vec![];
|
||||
{
|
||||
for file_to_delete in files_to_delete {
|
||||
match self.delete(&file_to_delete) {
|
||||
Ok(_) => {
|
||||
info!("Deleted {:?}", file_to_delete);
|
||||
deleted_files.push(file_to_delete);
|
||||
}
|
||||
Err(file_error) => {
|
||||
match file_error {
|
||||
DeleteError::FileDoesNotExist(_) => {
|
||||
deleted_files.push(file_to_delete);
|
||||
}
|
||||
DeleteError::IOError(_) => {
|
||||
if !cfg!(target_os = "windows") {
|
||||
// On windows, delete is expected to fail if the file
|
||||
// is mmapped.
|
||||
error!("Failed to delete {:?}", file_to_delete);
|
||||
}
|
||||
for file_to_delete in files_to_delete {
|
||||
match self.delete(&file_to_delete) {
|
||||
Ok(_) => {
|
||||
info!("Deleted {:?}", file_to_delete);
|
||||
deleted_files.push(file_to_delete);
|
||||
}
|
||||
Err(file_error) => {
|
||||
match file_error {
|
||||
DeleteError::FileDoesNotExist(_) => {
|
||||
deleted_files.push(file_to_delete);
|
||||
}
|
||||
DeleteError::IOError(_) => {
|
||||
if !cfg!(target_os = "windows") {
|
||||
// On windows, delete is expected to fail if the file
|
||||
// is mmapped.
|
||||
error!("Failed to delete {:?}", file_to_delete);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -171,11 +171,9 @@ impl ManagedDirectory {
|
||||
.meta_informations
|
||||
.write()
|
||||
.expect("Managed directory wlock poisoned (2).");
|
||||
{
|
||||
let managed_paths_write = &mut meta_informations_wlock.managed_paths;
|
||||
for delete_file in &deleted_files {
|
||||
managed_paths_write.remove(delete_file);
|
||||
}
|
||||
let managed_paths_write = &mut meta_informations_wlock.managed_paths;
|
||||
for delete_file in &deleted_files {
|
||||
managed_paths_write.remove(delete_file);
|
||||
}
|
||||
if save_managed_paths(self.directory.as_mut(), &meta_informations_wlock).is_err() {
|
||||
error!("Failed to save the list of managed files.");
|
||||
@@ -243,7 +241,7 @@ impl Directory for ManagedDirectory {
|
||||
self.directory.acquire_lock(lock)
|
||||
}
|
||||
|
||||
fn watch(&self, watch_callback: WatchCallback) -> WatchHandle {
|
||||
fn watch(&self, watch_callback: WatchCallback) -> crate::Result<WatchHandle> {
|
||||
self.directory.watch(watch_callback)
|
||||
}
|
||||
}
|
||||
@@ -257,100 +255,80 @@ impl Clone for ManagedDirectory {
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(feature = "mmap")]
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
mod tests_mmap_specific {
|
||||
|
||||
#[cfg(feature = "mmap")]
|
||||
mod mmap_specific {
|
||||
use crate::directory::{Directory, ManagedDirectory, MmapDirectory};
|
||||
use std::collections::HashSet;
|
||||
use std::io::Write;
|
||||
use std::path::{Path, PathBuf};
|
||||
use tempfile::TempDir;
|
||||
|
||||
use super::super::*;
|
||||
use once_cell::sync::Lazy;
|
||||
use std::path::Path;
|
||||
use tempdir::TempDir;
|
||||
|
||||
static TEST_PATH1: Lazy<&'static Path> = Lazy::new(|| Path::new("some_path_for_test"));
|
||||
static TEST_PATH2: Lazy<&'static Path> = Lazy::new(|| Path::new("some_path_for_test2"));
|
||||
|
||||
use crate::directory::MmapDirectory;
|
||||
use std::io::Write;
|
||||
|
||||
#[test]
|
||||
fn test_managed_directory() {
|
||||
let tempdir = TempDir::new("index").unwrap();
|
||||
let tempdir_path = PathBuf::from(tempdir.path());
|
||||
{
|
||||
let mmap_directory = MmapDirectory::open(&tempdir_path).unwrap();
|
||||
let mut managed_directory = ManagedDirectory::wrap(mmap_directory).unwrap();
|
||||
{
|
||||
let mut write_file = managed_directory.open_write(*TEST_PATH1).unwrap();
|
||||
write_file.flush().unwrap();
|
||||
}
|
||||
{
|
||||
managed_directory
|
||||
.atomic_write(*TEST_PATH2, &vec![0u8, 1u8])
|
||||
.unwrap();
|
||||
}
|
||||
{
|
||||
assert!(managed_directory.exists(*TEST_PATH1));
|
||||
assert!(managed_directory.exists(*TEST_PATH2));
|
||||
}
|
||||
{
|
||||
let living_files: HashSet<PathBuf> =
|
||||
[TEST_PATH1.to_owned()].into_iter().cloned().collect();
|
||||
managed_directory.garbage_collect(|| living_files);
|
||||
}
|
||||
{
|
||||
assert!(managed_directory.exists(*TEST_PATH1));
|
||||
assert!(!managed_directory.exists(*TEST_PATH2));
|
||||
}
|
||||
}
|
||||
{
|
||||
let mmap_directory = MmapDirectory::open(&tempdir_path).unwrap();
|
||||
let mut managed_directory = ManagedDirectory::wrap(mmap_directory).unwrap();
|
||||
{
|
||||
assert!(managed_directory.exists(*TEST_PATH1));
|
||||
assert!(!managed_directory.exists(*TEST_PATH2));
|
||||
}
|
||||
{
|
||||
let living_files: HashSet<PathBuf> = HashSet::new();
|
||||
managed_directory.garbage_collect(|| living_files);
|
||||
}
|
||||
{
|
||||
assert!(!managed_directory.exists(*TEST_PATH1));
|
||||
assert!(!managed_directory.exists(*TEST_PATH2));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_managed_directory_gc_while_mmapped() {
|
||||
let tempdir = TempDir::new("index").unwrap();
|
||||
let tempdir_path = PathBuf::from(tempdir.path());
|
||||
let living_files = HashSet::new();
|
||||
#[test]
|
||||
fn test_managed_directory() {
|
||||
let tempdir = TempDir::new().unwrap();
|
||||
let tempdir_path = PathBuf::from(tempdir.path());
|
||||
|
||||
let test_path1: &'static Path = Path::new("some_path_for_test");
|
||||
let test_path2: &'static Path = Path::new("some_path_for_test_2");
|
||||
{
|
||||
let mmap_directory = MmapDirectory::open(&tempdir_path).unwrap();
|
||||
let mut managed_directory = ManagedDirectory::wrap(mmap_directory).unwrap();
|
||||
let mut write_file = managed_directory.open_write(test_path1).unwrap();
|
||||
write_file.flush().unwrap();
|
||||
managed_directory
|
||||
.atomic_write(*TEST_PATH1, &vec![0u8, 1u8])
|
||||
.atomic_write(test_path2, &[0u8, 1u8])
|
||||
.unwrap();
|
||||
assert!(managed_directory.exists(*TEST_PATH1));
|
||||
|
||||
let _mmap_read = managed_directory.open_read(*TEST_PATH1).unwrap();
|
||||
managed_directory.garbage_collect(|| living_files.clone());
|
||||
if cfg!(target_os = "windows") {
|
||||
// On Windows, gc should try and fail the file as it is mmapped.
|
||||
assert!(managed_directory.exists(*TEST_PATH1));
|
||||
// unmap should happen here.
|
||||
drop(_mmap_read);
|
||||
// The file should still be in the list of managed file and
|
||||
// eventually be deleted once mmap is released.
|
||||
managed_directory.garbage_collect(|| living_files);
|
||||
assert!(!managed_directory.exists(*TEST_PATH1));
|
||||
} else {
|
||||
assert!(!managed_directory.exists(*TEST_PATH1));
|
||||
}
|
||||
assert!(managed_directory.exists(test_path1));
|
||||
assert!(managed_directory.exists(test_path2));
|
||||
let living_files: HashSet<PathBuf> =
|
||||
[test_path1.to_owned()].into_iter().cloned().collect();
|
||||
managed_directory.garbage_collect(|| living_files);
|
||||
assert!(managed_directory.exists(test_path1));
|
||||
assert!(!managed_directory.exists(test_path2));
|
||||
}
|
||||
{
|
||||
let mmap_directory = MmapDirectory::open(&tempdir_path).unwrap();
|
||||
let mut managed_directory = ManagedDirectory::wrap(mmap_directory).unwrap();
|
||||
assert!(managed_directory.exists(test_path1));
|
||||
assert!(!managed_directory.exists(test_path2));
|
||||
let living_files: HashSet<PathBuf> = HashSet::new();
|
||||
managed_directory.garbage_collect(|| living_files);
|
||||
assert!(!managed_directory.exists(test_path1));
|
||||
assert!(!managed_directory.exists(test_path2));
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_managed_directory_gc_while_mmapped() {
|
||||
let test_path1: &'static Path = Path::new("some_path_for_test");
|
||||
|
||||
let tempdir = TempDir::new().unwrap();
|
||||
let tempdir_path = PathBuf::from(tempdir.path());
|
||||
let living_files = HashSet::new();
|
||||
|
||||
let mmap_directory = MmapDirectory::open(&tempdir_path).unwrap();
|
||||
let mut managed_directory = ManagedDirectory::wrap(mmap_directory).unwrap();
|
||||
managed_directory
|
||||
.atomic_write(test_path1, &vec![0u8, 1u8])
|
||||
.unwrap();
|
||||
assert!(managed_directory.exists(test_path1));
|
||||
|
||||
let _mmap_read = managed_directory.open_read(test_path1).unwrap();
|
||||
managed_directory.garbage_collect(|| living_files.clone());
|
||||
if cfg!(target_os = "windows") {
|
||||
// On Windows, gc should try, and fail, to delete the file as it is mmapped.
|
||||
assert!(managed_directory.exists(test_path1));
|
||||
// unmap should happen here.
|
||||
drop(_mmap_read);
|
||||
// The file should still be in the list of managed files and
|
||||
// eventually be deleted once mmap is released.
|
||||
managed_directory.garbage_collect(|| living_files);
|
||||
assert!(!managed_directory.exists(test_path1));
|
||||
} else {
|
||||
assert!(!managed_directory.exists(test_path1));
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@@ -36,7 +36,7 @@ use std::sync::Mutex;
|
||||
use std::sync::RwLock;
|
||||
use std::sync::Weak;
|
||||
use std::thread;
|
||||
use tempdir::TempDir;
|
||||
use tempfile::TempDir;
|
||||
|
||||
/// Create a default io error given a string.
|
||||
pub(crate) fn make_io_err(msg: String) -> io::Error {
|
||||
@@ -161,7 +161,7 @@ impl InnerWatcherWrapper {
|
||||
}
|
||||
|
||||
#[derive(Clone)]
|
||||
pub(crate) struct WatcherWrapper {
|
||||
struct WatcherWrapper {
|
||||
inner: Arc<InnerWatcherWrapper>,
|
||||
}
|
||||
|
||||
@@ -231,7 +231,7 @@ struct MmapDirectoryInner {
|
||||
root_path: PathBuf,
|
||||
mmap_cache: RwLock<MmapCache>,
|
||||
_temp_directory: Option<TempDir>,
|
||||
watcher: RwLock<WatcherWrapper>,
|
||||
watcher: RwLock<Option<WatcherWrapper>>,
|
||||
}
|
||||
|
||||
impl MmapDirectoryInner {
|
||||
@@ -239,19 +239,36 @@ impl MmapDirectoryInner {
        root_path: PathBuf,
        temp_directory: Option<TempDir>,
    ) -> Result<MmapDirectoryInner, OpenDirectoryError> {
        let watch_wrapper = WatcherWrapper::new(&root_path)?;
        let mmap_directory_inner = MmapDirectoryInner {
            root_path,
            mmap_cache: Default::default(),
            _temp_directory: temp_directory,
            watcher: RwLock::new(watch_wrapper),
            watcher: RwLock::new(None),
        };
        Ok(mmap_directory_inner)
    }

    fn watch(&self, watch_callback: WatchCallback) -> WatchHandle {
        let mut wlock = self.watcher.write().unwrap();
        wlock.watch(watch_callback)
    fn watch(&self, watch_callback: WatchCallback) -> crate::Result<WatchHandle> {
        // a lot of juggling here, to ensure we don't do anything that panics
        // while the rwlock is held. That way we ensure that the rwlock cannot
        // be poisoned.
        //
        // The downside is that we might create a watch wrapper that is not useful.
        let need_initialization = self.watcher.read().unwrap().is_none();
        if need_initialization {
            let watch_wrapper = WatcherWrapper::new(&self.root_path)?;
            let mut watch_wlock = self.watcher.write().unwrap();
            // the watcher could have been initialized when we released the lock, and
            // we do not want to lose the watched files that were set.
            if watch_wlock.is_none() {
                *watch_wlock = Some(watch_wrapper);
            }
        }
        if let Some(watch_wrapper) = self.watcher.write().unwrap().as_mut() {
            return Ok(watch_wrapper.watch(watch_callback));
        } else {
            unreachable!("At this point, watch wrapper is supposed to be initialized");
        }
    }
}
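A minimal, self-contained sketch of the initialize-once-under-RwLock pattern used by the new `watch` above. The `Watcher` type and `build_watcher` function here are hypothetical stand-ins, not tantivy APIs: the point is the cheap read-lock check first, the fallible construction outside any lock, and a re-check under the write lock so a concurrent initializer is not overwritten.

    use std::sync::RwLock;

    struct Watcher; // hypothetical stand-in for WatcherWrapper

    fn build_watcher() -> Result<Watcher, String> {
        // fallible work done outside of any lock, so an error or panic cannot poison it
        Ok(Watcher)
    }

    struct LazyWatcher {
        watcher: RwLock<Option<Watcher>>,
    }

    impl LazyWatcher {
        fn get_or_init(&self) -> Result<(), String> {
            // 1. Cheap check under the read lock.
            let need_initialization = self.watcher.read().unwrap().is_none();
            if need_initialization {
                // 2. Build the watcher without holding any lock.
                let new_watcher = build_watcher()?;
                // 3. Re-check under the write lock: another thread may have won the race.
                let mut wlock = self.watcher.write().unwrap();
                if wlock.is_none() {
                    *wlock = Some(new_watcher);
                }
            }
            Ok(())
        }
    }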
@@ -277,7 +294,7 @@ impl MmapDirectory {
|
||||
/// This is mostly useful to test the MmapDirectory itself.
|
||||
/// For your unit tests, prefer the RAMDirectory.
|
||||
pub fn create_from_tempdir() -> Result<MmapDirectory, OpenDirectoryError> {
|
||||
let tempdir = TempDir::new("index").map_err(OpenDirectoryError::IoError)?;
|
||||
let tempdir = TempDir::new().map_err(OpenDirectoryError::IoError)?;
|
||||
let tempdir_path = PathBuf::from(tempdir.path());
|
||||
MmapDirectory::new(tempdir_path, Some(tempdir))
|
||||
}
|
||||
@@ -417,7 +434,6 @@ impl Directory for MmapDirectory {
|
||||
/// Any entry associated to the path in the mmap will be
|
||||
/// removed before the file is deleted.
|
||||
fn delete(&self, path: &Path) -> result::Result<(), DeleteError> {
|
||||
debug!("Deleting file {:?}", path);
|
||||
let full_path = self.resolve_path(path);
|
||||
match fs::remove_file(&full_path) {
|
||||
Ok(_) => self
|
||||
@@ -515,7 +531,7 @@ impl Directory for MmapDirectory {
|
||||
})))
|
||||
}
|
||||
|
||||
fn watch(&self, watch_callback: WatchCallback) -> WatchHandle {
|
||||
fn watch(&self, watch_callback: WatchCallback) -> crate::Result<WatchHandle> {
|
||||
self.inner.watch(watch_callback)
|
||||
}
|
||||
}
|
||||
@@ -626,7 +642,7 @@ mod tests {
    fn test_watch_wrapper() {
        let counter: Arc<AtomicUsize> = Default::default();
        let counter_clone = counter.clone();
        let tmp_dir: TempDir = tempdir::TempDir::new("test_watch_wrapper").unwrap();
        let tmp_dir = tempfile::TempDir::new().unwrap();
        let tmp_dirpath = tmp_dir.path().to_owned();
        let mut watch_wrapper = WatcherWrapper::new(&tmp_dirpath).unwrap();
        let tmp_file = tmp_dirpath.join("coucou");

@@ -29,7 +29,7 @@ use std::io::{BufWriter, Write};
|
||||
#[cfg(feature = "mmap")]
|
||||
pub use self::mmap_directory::MmapDirectory;
|
||||
|
||||
pub(crate) use self::managed_directory::ManagedDirectory;
|
||||
pub use self::managed_directory::ManagedDirectory;
|
||||
|
||||
/// Write object for Directory.
|
||||
///
|
||||
|
||||
@@ -145,6 +145,11 @@ impl Directory for RAMDirectory {
|
||||
}
|
||||
|
||||
fn delete(&self, path: &Path) -> result::Result<(), DeleteError> {
|
||||
fail_point!("RAMDirectory::delete", |_| {
|
||||
use crate::directory::error::IOError;
|
||||
let io_error = IOError::from(io::Error::from(io::ErrorKind::Other));
|
||||
Err(DeleteError::from(io_error))
|
||||
});
|
||||
self.fs.write().unwrap().delete(path)
|
||||
}
|
||||
|
||||
@@ -172,7 +177,7 @@ impl Directory for RAMDirectory {
|
||||
fn atomic_write(&mut self, path: &Path, data: &[u8]) -> io::Result<()> {
|
||||
fail_point!("RAMDirectory::atomic_write", |msg| Err(io::Error::new(
|
||||
io::ErrorKind::Other,
|
||||
msg.unwrap_or("Undefined".to_string())
|
||||
msg.unwrap_or_else(|| "Undefined".to_string())
|
||||
)));
|
||||
let path_buf = PathBuf::from(path);
|
||||
|
||||
@@ -188,7 +193,7 @@ impl Directory for RAMDirectory {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn watch(&self, watch_callback: WatchCallback) -> WatchHandle {
|
||||
self.fs.write().unwrap().watch(watch_callback)
|
||||
fn watch(&self, watch_callback: WatchCallback) -> crate::Result<WatchHandle> {
|
||||
Ok(self.fs.write().unwrap().watch(watch_callback))
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,5 +1,4 @@
|
||||
use super::*;
|
||||
use once_cell::sync::Lazy;
|
||||
use std::io::Write;
|
||||
use std::mem;
|
||||
use std::path::{Path, PathBuf};
|
||||
@@ -10,8 +9,6 @@ use std::thread;
|
||||
use std::time;
|
||||
use std::time::Duration;
|
||||
|
||||
static TEST_PATH: Lazy<&'static Path> = Lazy::new(|| Path::new("some_path_for_test"));
|
||||
|
||||
#[test]
|
||||
fn test_ram_directory() {
|
||||
let mut ram_directory = RAMDirectory::create();
|
||||
@@ -28,76 +25,78 @@ fn test_mmap_directory() {
|
||||
#[test]
|
||||
#[should_panic]
|
||||
fn ram_directory_panics_if_flush_forgotten() {
|
||||
let test_path: &'static Path = Path::new("some_path_for_test");
|
||||
let mut ram_directory = RAMDirectory::create();
|
||||
let mut write_file = ram_directory.open_write(*TEST_PATH).unwrap();
|
||||
let mut write_file = ram_directory.open_write(test_path).unwrap();
|
||||
assert!(write_file.write_all(&[4]).is_ok());
|
||||
}
|
||||
|
||||
fn test_simple(directory: &mut dyn Directory) {
|
||||
let test_path: &'static Path = Path::new("some_path_for_test");
|
||||
{
|
||||
let mut write_file = directory.open_write(*TEST_PATH).unwrap();
|
||||
assert!(directory.exists(*TEST_PATH));
|
||||
let mut write_file = directory.open_write(test_path).unwrap();
|
||||
assert!(directory.exists(test_path));
|
||||
write_file.write_all(&[4]).unwrap();
|
||||
write_file.write_all(&[3]).unwrap();
|
||||
write_file.write_all(&[7, 3, 5]).unwrap();
|
||||
write_file.flush().unwrap();
|
||||
}
|
||||
{
|
||||
let read_file = directory.open_read(*TEST_PATH).unwrap();
|
||||
let read_file = directory.open_read(test_path).unwrap();
|
||||
let data: &[u8] = &*read_file;
|
||||
assert_eq!(data, &[4u8, 3u8, 7u8, 3u8, 5u8]);
|
||||
}
|
||||
assert!(directory.delete(*TEST_PATH).is_ok());
|
||||
assert!(!directory.exists(*TEST_PATH));
|
||||
assert!(directory.delete(test_path).is_ok());
|
||||
assert!(!directory.exists(test_path));
|
||||
}
|
||||
|
||||
fn test_rewrite_forbidden(directory: &mut dyn Directory) {
|
||||
let test_path: &'static Path = Path::new("some_path_for_test");
|
||||
{
|
||||
directory.open_write(*TEST_PATH).unwrap();
|
||||
assert!(directory.exists(*TEST_PATH));
|
||||
directory.open_write(test_path).unwrap();
|
||||
assert!(directory.exists(test_path));
|
||||
}
|
||||
{
|
||||
assert!(directory.open_write(*TEST_PATH).is_err());
|
||||
assert!(directory.open_write(test_path).is_err());
|
||||
}
|
||||
assert!(directory.delete(*TEST_PATH).is_ok());
|
||||
assert!(directory.delete(test_path).is_ok());
|
||||
}
|
||||
|
||||
fn test_write_create_the_file(directory: &mut dyn Directory) {
|
||||
let test_path: &'static Path = Path::new("some_path_for_test");
|
||||
{
|
||||
assert!(directory.open_read(*TEST_PATH).is_err());
|
||||
let _w = directory.open_write(*TEST_PATH).unwrap();
|
||||
assert!(directory.exists(*TEST_PATH));
|
||||
assert!(directory.open_read(*TEST_PATH).is_ok());
|
||||
assert!(directory.delete(*TEST_PATH).is_ok());
|
||||
assert!(directory.open_read(test_path).is_err());
|
||||
let _w = directory.open_write(test_path).unwrap();
|
||||
assert!(directory.exists(test_path));
|
||||
assert!(directory.open_read(test_path).is_ok());
|
||||
assert!(directory.delete(test_path).is_ok());
|
||||
}
|
||||
}
|
||||
|
||||
fn test_directory_delete(directory: &mut dyn Directory) {
|
||||
assert!(directory.open_read(*TEST_PATH).is_err());
|
||||
let mut write_file = directory.open_write(*TEST_PATH).unwrap();
|
||||
let test_path: &'static Path = Path::new("some_path_for_test");
|
||||
assert!(directory.open_read(test_path).is_err());
|
||||
let mut write_file = directory.open_write(&test_path).unwrap();
|
||||
write_file.write_all(&[1, 2, 3, 4]).unwrap();
|
||||
write_file.flush().unwrap();
|
||||
{
|
||||
let read_handle = directory.open_read(*TEST_PATH).unwrap();
|
||||
{
|
||||
let read_handle = directory.open_read(&test_path).unwrap();
|
||||
assert_eq!(&*read_handle, &[1u8, 2u8, 3u8, 4u8]);
|
||||
// Mapped files can't be deleted on Windows
|
||||
if !cfg!(windows) {
|
||||
assert!(directory.delete(&test_path).is_ok());
|
||||
assert_eq!(&*read_handle, &[1u8, 2u8, 3u8, 4u8]);
|
||||
|
||||
// Mapped files can't be deleted on Windows
|
||||
if !cfg!(windows) {
|
||||
assert!(directory.delete(*TEST_PATH).is_ok());
|
||||
assert_eq!(&*read_handle, &[1u8, 2u8, 3u8, 4u8]);
|
||||
}
|
||||
|
||||
assert!(directory.delete(Path::new("SomeOtherPath")).is_err());
|
||||
}
|
||||
|
||||
assert!(directory.delete(Path::new("SomeOtherPath")).is_err());
|
||||
}
|
||||
|
||||
if cfg!(windows) {
|
||||
assert!(directory.delete(*TEST_PATH).is_ok());
|
||||
assert!(directory.delete(&test_path).is_ok());
|
||||
}
|
||||
|
||||
assert!(directory.open_read(*TEST_PATH).is_err());
|
||||
assert!(directory.delete(*TEST_PATH).is_err());
|
||||
assert!(directory.open_read(&test_path).is_err());
|
||||
assert!(directory.delete(&test_path).is_err());
|
||||
}
|
||||
|
||||
fn test_directory(directory: &mut dyn Directory) {
|
||||
@@ -122,7 +121,7 @@ fn test_watch(directory: &mut dyn Directory) {
|
||||
thread::sleep(Duration::new(0, 10_000));
|
||||
assert_eq!(0, counter.load(Ordering::SeqCst));
|
||||
|
||||
let watch_handle = directory.watch(watch_callback);
|
||||
let watch_handle = directory.watch(watch_callback).unwrap();
|
||||
for i in 0..10 {
|
||||
assert_eq!(i, counter.load(Ordering::SeqCst));
|
||||
assert!(directory
|
||||
|
||||
@@ -48,7 +48,7 @@ mod readers;
|
||||
mod serializer;
|
||||
mod writer;
|
||||
|
||||
/// Trait for types that are allowed for fast fields: (u64 or i64).
|
||||
/// Trait for types that are allowed for fast fields: (u64, i64 and f64).
|
||||
pub trait FastValue: Default + Clone + Copy + Send + Sync + PartialOrd {
|
||||
/// Converts a value from u64
|
||||
///
|
||||
@@ -114,11 +114,33 @@ impl FastValue for i64 {
|
||||
}
|
||||
}
|
||||
|
||||
impl FastValue for f64 {
|
||||
fn from_u64(val: u64) -> Self {
|
||||
common::u64_to_f64(val)
|
||||
}
|
||||
|
||||
fn to_u64(&self) -> u64 {
|
||||
common::f64_to_u64(*self)
|
||||
}
|
||||
|
||||
fn fast_field_cardinality(field_type: &FieldType) -> Option<Cardinality> {
|
||||
match *field_type {
|
||||
FieldType::F64(ref integer_options) => integer_options.get_fastfield_cardinality(),
|
||||
_ => None,
|
||||
}
|
||||
}
|
||||
|
||||
fn as_u64(&self) -> u64 {
|
||||
self.to_bits()
|
||||
}
|
||||
}
|
||||
|
||||
fn value_to_u64(value: &Value) -> u64 {
|
||||
match *value {
|
||||
Value::U64(ref val) => *val,
|
||||
Value::I64(ref val) => common::i64_to_u64(*val),
|
||||
_ => panic!("Expected a u64/i64 field, got {:?} ", value),
|
||||
Value::F64(ref val) => common::f64_to_u64(*val),
|
||||
_ => panic!("Expected a u64/i64/f64 field, got {:?} ", value),
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
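Both the i64 and the new f64 fast fields piggyback on the u64 machinery through an order-preserving mapping into the full u64 range. The actual conversions live in tantivy's `common::i64_to_u64` and `common::f64_to_u64`, which are not shown in this diff; the sketch below is only an assumption about the standard shape of such mappings, so that `value_to_u64` above can treat u64, i64 and f64 values uniformly before bit-packing.

    // Assumed shape of the order-preserving conversions; the real helpers are in
    // tantivy's `common` module and may differ in detail.
    fn i64_to_u64_sketch(val: i64) -> u64 {
        // flips the sign bit so that i64 ordering matches u64 ordering
        (val as u64) ^ (1u64 << 63)
    }

    fn f64_to_u64_sketch(val: f64) -> u64 {
        let bits = val.to_bits();
        if bits >> 63 == 0 {
            // positive floats: set the sign bit so they sort above all negatives
            bits | (1u64 << 63)
        } else {
            // negative floats: flip every bit so that "more negative" sorts lower
            !bits
        }
    }

    fn u64_to_f64_sketch(val: u64) -> f64 {
        let bits = if val >> 63 == 1 {
            val & !(1u64 << 63)
        } else {
            !val
        };
        f64::from_bits(bits)
    }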
@@ -14,8 +14,10 @@ use std::collections::HashMap;
|
||||
pub struct FastFieldReaders {
|
||||
fast_field_i64: HashMap<Field, FastFieldReader<i64>>,
|
||||
fast_field_u64: HashMap<Field, FastFieldReader<u64>>,
|
||||
fast_field_f64: HashMap<Field, FastFieldReader<f64>>,
|
||||
fast_field_i64s: HashMap<Field, MultiValueIntFastFieldReader<i64>>,
|
||||
fast_field_u64s: HashMap<Field, MultiValueIntFastFieldReader<u64>>,
|
||||
fast_field_f64s: HashMap<Field, MultiValueIntFastFieldReader<f64>>,
|
||||
fast_bytes: HashMap<Field, BytesFastFieldReader>,
|
||||
fast_fields_composite: CompositeFile,
|
||||
}
|
||||
@@ -23,6 +25,7 @@ pub struct FastFieldReaders {
|
||||
enum FastType {
|
||||
I64,
|
||||
U64,
|
||||
F64,
|
||||
}
|
||||
|
||||
fn type_and_cardinality(field_type: &FieldType) -> Option<(FastType, Cardinality)> {
|
||||
@@ -33,6 +36,9 @@ fn type_and_cardinality(field_type: &FieldType) -> Option<(FastType, Cardinality
|
||||
FieldType::I64(options) => options
|
||||
.get_fastfield_cardinality()
|
||||
.map(|cardinality| (FastType::I64, cardinality)),
|
||||
FieldType::F64(options) => options
|
||||
.get_fastfield_cardinality()
|
||||
.map(|cardinality| (FastType::F64, cardinality)),
|
||||
FieldType::HierarchicalFacet => Some((FastType::U64, Cardinality::MultiValues)),
|
||||
_ => None,
|
||||
}
|
||||
@@ -46,8 +52,10 @@ impl FastFieldReaders {
|
||||
let mut fast_field_readers = FastFieldReaders {
|
||||
fast_field_i64: Default::default(),
|
||||
fast_field_u64: Default::default(),
|
||||
fast_field_f64: Default::default(),
|
||||
fast_field_i64s: Default::default(),
|
||||
fast_field_u64s: Default::default(),
|
||||
fast_field_f64s: Default::default(),
|
||||
fast_bytes: Default::default(),
|
||||
fast_fields_composite: fast_fields_composite.clone(),
|
||||
};
|
||||
@@ -82,6 +90,12 @@ impl FastFieldReaders {
|
||||
FastFieldReader::open(fast_field_data.clone()),
|
||||
);
|
||||
}
|
||||
FastType::F64 => {
|
||||
fast_field_readers.fast_field_f64.insert(
|
||||
field,
|
||||
FastFieldReader::open(fast_field_data.clone()),
|
||||
);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
return Err(From::from(FastFieldNotAvailableError::new(field_entry)));
|
||||
@@ -109,6 +123,14 @@ impl FastFieldReaders {
|
||||
.fast_field_u64s
|
||||
.insert(field, multivalued_int_fast_field);
|
||||
}
|
||||
FastType::F64 => {
|
||||
let vals_reader = FastFieldReader::open(fast_field_data);
|
||||
let multivalued_int_fast_field =
|
||||
MultiValueIntFastFieldReader::open(idx_reader, vals_reader);
|
||||
fast_field_readers
|
||||
.fast_field_f64s
|
||||
.insert(field, multivalued_int_fast_field);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
return Err(From::from(FastFieldNotAvailableError::new(field_entry)));
|
||||
@@ -135,6 +157,8 @@ impl FastFieldReaders {
    /// If the field is an i64 fast field, return the associated u64 reader. Values are
    /// mapped from i64 to u64 using the (unique) strictly monotonic mapping.
    ///
    /// TODO: should it also be lenient with f64?
    ///
    /// This method is useful when merging segment readers.
    pub(crate) fn u64_lenient(&self, field: Field) -> Option<FastFieldReader<u64>> {
        if let Some(u64_ff_reader) = self.u64(field) {
@@ -153,6 +177,13 @@ impl FastFieldReaders {
|
||||
self.fast_field_i64.get(&field).cloned()
|
||||
}
|
||||
|
||||
    /// Returns the `f64` fast field reader associated to `field`.
    ///
    /// If `field` is not an f64 fast field, this method returns `None`.
    pub fn f64(&self, field: Field) -> Option<FastFieldReader<f64>> {
        self.fast_field_f64.get(&field).cloned()
    }

    /// Returns a `u64s` multi-valued fast field reader associated to `field`.
    ///
    /// If `field` is not a u64 multi-valued fast field, this method returns `None`.
@@ -182,6 +213,13 @@ impl FastFieldReaders {
        self.fast_field_i64s.get(&field).cloned()
    }

    /// Returns a `f64s` multi-valued fast field reader associated to `field`.
    ///
    /// If `field` is not an f64 multi-valued fast field, this method returns `None`.
    pub fn f64s(&self, field: Field) -> Option<MultiValueIntFastFieldReader<f64>> {
        self.fast_field_f64s.get(&field).cloned()
    }

/// Returns the `bytes` fast field reader associated to `field`.
|
||||
///
|
||||
/// If `field` is not a bytes fast field, returns `None`.
|
||||
|
||||
@@ -25,13 +25,15 @@ impl FastFieldsWriter {
|
||||
|
||||
for (field_id, field_entry) in schema.fields().iter().enumerate() {
|
||||
let field = Field(field_id as u32);
|
||||
let default_value = if let FieldType::I64(_) = *field_entry.field_type() {
|
||||
common::i64_to_u64(0i64)
|
||||
} else {
|
||||
0u64
|
||||
let default_value = match *field_entry.field_type() {
|
||||
FieldType::I64(_) => common::i64_to_u64(0i64),
|
||||
FieldType::F64(_) => common::f64_to_u64(0.0f64),
|
||||
_ => 0u64,
|
||||
};
|
||||
match *field_entry.field_type() {
|
||||
FieldType::I64(ref int_options) | FieldType::U64(ref int_options) => {
|
||||
FieldType::I64(ref int_options)
|
||||
| FieldType::U64(ref int_options)
|
||||
| FieldType::F64(ref int_options) => {
|
||||
match int_options.get_fastfield_cardinality() {
|
||||
Some(Cardinality::SingleValue) => {
|
||||
let mut fast_field_writer = IntFastFieldWriter::new(field);
|
||||
@@ -142,9 +144,9 @@ impl FastFieldsWriter {
|
||||
/// bitpacked and the number of bits required for bitpacking
|
||||
/// can only been known once we have seen all of the values.
|
||||
///
|
||||
/// Both u64, and i64 use the same writer.
|
||||
/// i64 are just remapped to the `0..2^64 - 1`
|
||||
/// using `common::i64_to_u64`.
|
||||
/// Both u64, i64 and f64 use the same writer.
|
||||
/// i64 and f64 are just remapped to the `0..2^64 - 1`
|
||||
/// using `common::i64_to_u64` and `common::f64_to_u64`.
|
||||
pub struct IntFastFieldWriter {
|
||||
field: Field,
|
||||
vals: Vec<u8>,
|
||||
@@ -203,8 +205,8 @@ impl IntFastFieldWriter {
|
||||
/// Extract the value associated to the fast field for
|
||||
/// this document.
|
||||
///
|
||||
/// i64 are remapped to u64 using the logic
|
||||
/// in `common::i64_to_u64`.
|
||||
/// i64 and f64 are remapped to u64 using the logic
|
||||
/// in `common::i64_to_u64` and `common::f64_to_u64`.
|
||||
///
|
||||
/// If the value is missing, then the default value is used
|
||||
/// instead.
|
||||
|
||||
@@ -10,28 +10,263 @@ pub fn fieldnorm_to_id(fieldnorm: u32) -> u8 {
|
||||
.unwrap_or_else(|idx| idx - 1) as u8
|
||||
}
|
||||
|
||||
#[cfg_attr(feature = "cargo-clippy", allow(clippy::unreadable_literal))]
|
||||
pub const FIELD_NORMS_TABLE: [u32; 256] = [
|
||||
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25,
|
||||
26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 42, 44, 46, 48, 50, 52, 54, 56, 60,
|
||||
64, 68, 72, 76, 80, 84, 88, 96, 104, 112, 120, 128, 136, 144, 152, 168, 184, 200, 216, 232,
|
||||
248, 264, 280, 312, 344, 376, 408, 440, 472, 504, 536, 600, 664, 728, 792, 856, 920, 984,
|
||||
1_048, 1176, 1304, 1432, 1560, 1688, 1816, 1944, 2072, 2328, 2584, 2840, 3096, 3352, 3608,
|
||||
3864, 4120, 4632, 5144, 5656, 6168, 6680, 7192, 7704, 8216, 9240, 10264, 11288, 12312, 13336,
|
||||
14360, 15384, 16408, 18456, 20504, 22552, 24600, 26648, 28696, 30744, 32792, 36888, 40984,
|
||||
45080, 49176, 53272, 57368, 61464, 65560, 73752, 81944, 90136, 98328, 106520, 114712, 122904,
|
||||
131096, 147480, 163864, 180248, 196632, 213016, 229400, 245784, 262168, 294936, 327704, 360472,
|
||||
393240, 426008, 458776, 491544, 524312, 589848, 655384, 720920, 786456, 851992, 917528, 983064,
|
||||
1048600, 1179672, 1310744, 1441816, 1572888, 1703960, 1835032, 1966104, 2097176, 2359320,
|
||||
2621464, 2883608, 3145752, 3407896, 3670040, 3932184, 4194328, 4718616, 5242904, 5767192,
|
||||
6291480, 6815768, 7340056, 7864344, 8388632, 9437208, 10485784, 11534360, 12582936, 13631512,
|
||||
14680088, 15728664, 16777240, 18874392, 20971544, 23068696, 25165848, 27263000, 29360152,
|
||||
31457304, 33554456, 37748760, 41943064, 46137368, 50331672, 54525976, 58720280, 62914584,
|
||||
67108888, 75497496, 83886104, 92274712, 100663320, 109051928, 117440536, 125829144, 134217752,
|
||||
150994968, 167772184, 184549400, 201326616, 218103832, 234881048, 251658264, 268435480,
|
||||
301989912, 335544344, 369098776, 402653208, 436207640, 469762072, 503316504, 536870936,
|
||||
603979800, 671088664, 738197528, 805306392, 872415256, 939524120, 1006632984, 1073741848,
|
||||
1207959576, 1342177304, 1476395032, 1610612760, 1744830488, 1879048216, 2013265944,
|
||||
0,
|
||||
1,
|
||||
2,
|
||||
3,
|
||||
4,
|
||||
5,
|
||||
6,
|
||||
7,
|
||||
8,
|
||||
9,
|
||||
10,
|
||||
11,
|
||||
12,
|
||||
13,
|
||||
14,
|
||||
15,
|
||||
16,
|
||||
17,
|
||||
18,
|
||||
19,
|
||||
20,
|
||||
21,
|
||||
22,
|
||||
23,
|
||||
24,
|
||||
25,
|
||||
26,
|
||||
27,
|
||||
28,
|
||||
29,
|
||||
30,
|
||||
31,
|
||||
32,
|
||||
33,
|
||||
34,
|
||||
35,
|
||||
36,
|
||||
37,
|
||||
38,
|
||||
39,
|
||||
40,
|
||||
42,
|
||||
44,
|
||||
46,
|
||||
48,
|
||||
50,
|
||||
52,
|
||||
54,
|
||||
56,
|
||||
60,
|
||||
64,
|
||||
68,
|
||||
72,
|
||||
76,
|
||||
80,
|
||||
84,
|
||||
88,
|
||||
96,
|
||||
104,
|
||||
112,
|
||||
120,
|
||||
128,
|
||||
136,
|
||||
144,
|
||||
152,
|
||||
168,
|
||||
184,
|
||||
200,
|
||||
216,
|
||||
232,
|
||||
248,
|
||||
264,
|
||||
280,
|
||||
312,
|
||||
344,
|
||||
376,
|
||||
408,
|
||||
440,
|
||||
472,
|
||||
504,
|
||||
536,
|
||||
600,
|
||||
664,
|
||||
728,
|
||||
792,
|
||||
856,
|
||||
920,
|
||||
984,
|
||||
1_048,
|
||||
1_176,
|
||||
1_304,
|
||||
1_432,
|
||||
1_560,
|
||||
1_688,
|
||||
1_816,
|
||||
1_944,
|
||||
2_072,
|
||||
2_328,
|
||||
2_584,
|
||||
2_840,
|
||||
3_096,
|
||||
3_352,
|
||||
3_608,
|
||||
3_864,
|
||||
4_120,
|
||||
4_632,
|
||||
5_144,
|
||||
5_656,
|
||||
6_168,
|
||||
6_680,
|
||||
7_192,
|
||||
7_704,
|
||||
8_216,
|
||||
9_240,
|
||||
10_264,
|
||||
11_288,
|
||||
12_312,
|
||||
13_336,
|
||||
14_360,
|
||||
15_384,
|
||||
16_408,
|
||||
18_456,
|
||||
20_504,
|
||||
22_552,
|
||||
24_600,
|
||||
26_648,
|
||||
28_696,
|
||||
30_744,
|
||||
32_792,
|
||||
36_888,
|
||||
40_984,
|
||||
45_080,
|
||||
49_176,
|
||||
53_272,
|
||||
57_368,
|
||||
61_464,
|
||||
65_560,
|
||||
73_752,
|
||||
81_944,
|
||||
90_136,
|
||||
98_328,
|
||||
106_520,
|
||||
114_712,
|
||||
122_904,
|
||||
131_096,
|
||||
147_480,
|
||||
163_864,
|
||||
180_248,
|
||||
196_632,
|
||||
213_016,
|
||||
229_400,
|
||||
245_784,
|
||||
262_168,
|
||||
294_936,
|
||||
327_704,
|
||||
360_472,
|
||||
393_240,
|
||||
426_008,
|
||||
458_776,
|
||||
491_544,
|
||||
524_312,
|
||||
589_848,
|
||||
655_384,
|
||||
720_920,
|
||||
786_456,
|
||||
851_992,
|
||||
917_528,
|
||||
983_064,
|
||||
1_048_600,
|
||||
1_179_672,
|
||||
1_310_744,
|
||||
1_441_816,
|
||||
1_572_888,
|
||||
1_703_960,
|
||||
1_835_032,
|
||||
1_966_104,
|
||||
2_097_176,
|
||||
2_359_320,
|
||||
2_621_464,
|
||||
2_883_608,
|
||||
3_145_752,
|
||||
3_407_896,
|
||||
3_670_040,
|
||||
3_932_184,
|
||||
4_194_328,
|
||||
4_718_616,
|
||||
5_242_904,
|
||||
5_767_192,
|
||||
6_291_480,
|
||||
6_815_768,
|
||||
7_340_056,
|
||||
7_864_344,
|
||||
8_388_632,
|
||||
9_437_208,
|
||||
10_485_784,
|
||||
11_534_360,
|
||||
12_582_936,
|
||||
13_631_512,
|
||||
14_680_088,
|
||||
15_728_664,
|
||||
16_777_240,
|
||||
18_874_392,
|
||||
20_971_544,
|
||||
23_068_696,
|
||||
25_165_848,
|
||||
27_263_000,
|
||||
29_360_152,
|
||||
31_457_304,
|
||||
33_554_456,
|
||||
37_748_760,
|
||||
41_943_064,
|
||||
46_137_368,
|
||||
50_331_672,
|
||||
54_525_976,
|
||||
58_720_280,
|
||||
62_914_584,
|
||||
67_108_888,
|
||||
75_497_496,
|
||||
83_886_104,
|
||||
92_274_712,
|
||||
100_663_320,
|
||||
109_051_928,
|
||||
117_440_536,
|
||||
125_829_144,
|
||||
134_217_752,
|
||||
150_994_968,
|
||||
167_772_184,
|
||||
184_549_400,
|
||||
201_326_616,
|
||||
218_103_832,
|
||||
234_881_048,
|
||||
251_658_264,
|
||||
268_435_480,
|
||||
301_989_912,
|
||||
335_544_344,
|
||||
369_098_776,
|
||||
402_653_208,
|
||||
436_207_640,
|
||||
469_762_072,
|
||||
503_316_504,
|
||||
536_870_936,
|
||||
603_979_800,
|
||||
671_088_664,
|
||||
738_197_528,
|
||||
805_306_392,
|
||||
872_415_256,
|
||||
939_524_120,
|
||||
1_006_632_984,
|
||||
1_073_741_848,
|
||||
1_207_959_576,
|
||||
1_342_177_304,
|
||||
1_476_395_032,
|
||||
1_610_612_760,
|
||||
1_744_830_488,
|
||||
1_879_048_216,
|
||||
2_013_265_944,
|
||||
];
|
||||
|
||||
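The table above, together with `fieldnorm_to_id` at the top of the file, quantizes a 32-bit fieldnorm down to a single byte: the byte is the index of the largest table entry that does not exceed the value, so decoding always rounds down. A small sanity test one could write against it, assuming `fieldnorm_to_id` performs a binary search over FIELD_NORMS_TABLE as the snippet above suggests (this test is not part of the diff):

    #[test]
    fn test_fieldnorm_quantization_rounds_down() {
        for &fieldnorm in &[0u32, 40, 41, 1_048, 2_013_265_944, u32::max_value()] {
            let id = fieldnorm_to_id(fieldnorm);
            let decoded = FIELD_NORMS_TABLE[id as usize];
            // the decoded value never exceeds the original fieldnorm
            assert!(decoded <= fieldnorm);
        }
    }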
#[cfg(test)]
|
||||
|
||||
@@ -24,7 +24,7 @@ struct InnerDeleteQueue {
|
||||
last_block: Option<Arc<Block>>,
|
||||
}
|
||||
|
||||
#[derive(Clone, Default)]
|
||||
#[derive(Clone)]
|
||||
pub struct DeleteQueue {
|
||||
inner: Arc<RwLock<InnerDeleteQueue>>,
|
||||
}
|
||||
@@ -37,6 +37,7 @@ impl DeleteQueue {
|
||||
};
|
||||
|
||||
let next_block = NextBlock::from(delete_queue.clone());
|
||||
|
||||
{
|
||||
let mut delete_queue_wlock = delete_queue.inner.write().unwrap();
|
||||
delete_queue_wlock.last_block = Some(Arc::new(Block {
|
||||
|
||||
@@ -1,6 +1,5 @@
|
||||
use crate::DocId;
|
||||
use crate::Opstamp;
|
||||
use std::sync::Arc;
|
||||
|
||||
// Doc to opstamp is used to identify which
|
||||
// document should be deleted.
|
||||
@@ -18,18 +17,18 @@ use std::sync::Arc;
|
||||
// This mapping is (for the moment) strictly increasing
// because of the way document ids are allocated.
|
||||
#[derive(Clone)]
|
||||
pub enum DocToOpstampMapping {
|
||||
WithMap(Arc<Vec<u64>>),
|
||||
pub enum DocToOpstampMapping<'a> {
|
||||
WithMap(&'a [Opstamp]),
|
||||
None,
|
||||
}
|
||||
|
||||
impl From<Vec<u64>> for DocToOpstampMapping {
|
||||
fn from(opstamps: Vec<Opstamp>) -> DocToOpstampMapping {
|
||||
DocToOpstampMapping::WithMap(Arc::new(opstamps))
|
||||
impl<'a> From<&'a [u64]> for DocToOpstampMapping<'a> {
|
||||
fn from(opstamps: &[Opstamp]) -> DocToOpstampMapping {
|
||||
DocToOpstampMapping::WithMap(opstamps)
|
||||
}
|
||||
}
|
||||
|
||||
impl DocToOpstampMapping {
|
||||
impl<'a> DocToOpstampMapping<'a> {
|
||||
/// Given an opstamp return the limit doc id L
|
||||
/// such that all doc id D such that
|
||||
// D >= L iff opstamp(D) >= than `target_opstamp`.
|
||||
@@ -65,17 +64,18 @@ mod tests {
|
||||
#[test]
|
||||
fn test_doc_to_opstamp_mapping_complex() {
|
||||
{
|
||||
let doc_to_opstamp_mapping = DocToOpstampMapping::from(vec![]);
|
||||
let doc_to_opstamp_mapping = DocToOpstampMapping::from(&[][..]);
|
||||
assert_eq!(doc_to_opstamp_mapping.compute_doc_limit(0u64), 0);
|
||||
assert_eq!(doc_to_opstamp_mapping.compute_doc_limit(2u64), 0);
|
||||
}
|
||||
{
|
||||
let doc_to_opstamp_mapping = DocToOpstampMapping::from(vec![1u64]);
|
||||
let doc_to_opstamp_mapping = DocToOpstampMapping::from(&[1u64][..]);
|
||||
assert_eq!(doc_to_opstamp_mapping.compute_doc_limit(0u64), 0);
|
||||
assert_eq!(doc_to_opstamp_mapping.compute_doc_limit(2u64), 1);
|
||||
}
|
||||
{
|
||||
let doc_to_opstamp_mapping = DocToOpstampMapping::from(vec![1u64, 12u64, 17u64, 23u64]);
|
||||
let doc_to_opstamp_mapping =
|
||||
DocToOpstampMapping::from(&[1u64, 12u64, 17u64, 23u64][..]);
|
||||
assert_eq!(doc_to_opstamp_mapping.compute_doc_limit(0u64), 0);
|
||||
for i in 2u64..13u64 {
|
||||
assert_eq!(doc_to_opstamp_mapping.compute_doc_limit(i), 1);
|
||||
|
||||
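For reference, `compute_doc_limit` can be read as a plain binary search over the (strictly increasing) opstamp slice: it returns the first doc id whose opstamp is greater than or equal to the target. A sketch under that assumption, consistent with the expectations in the tests above:

    fn compute_doc_limit_sketch(doc_opstamps: &[u64], target_opstamp: u64) -> u32 {
        // doc_opstamps is strictly increasing, so Ok and Err both point at the
        // first element >= target_opstamp.
        match doc_opstamps.binary_search(&target_opstamp) {
            Ok(doc) | Err(doc) => doc as u32,
        }
    }

    // e.g. compute_doc_limit_sketch(&[1, 12, 17, 23], 2) == 1
    //      compute_doc_limit_sketch(&[], 2) == 0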
@@ -18,7 +18,6 @@ use crate::indexer::stamper::Stamper;
|
||||
use crate::indexer::MergePolicy;
|
||||
use crate::indexer::SegmentEntry;
|
||||
use crate::indexer::SegmentWriter;
|
||||
use crate::postings::compute_table_size;
|
||||
use crate::schema::Document;
|
||||
use crate::schema::IndexRecordOption;
|
||||
use crate::schema::Term;
|
||||
@@ -27,6 +26,8 @@ use crate::Result;
|
||||
use bit_set::BitSet;
|
||||
use crossbeam::channel;
|
||||
use futures::{Canceled, Future};
|
||||
use smallvec::smallvec;
|
||||
use smallvec::SmallVec;
|
||||
use std::mem;
|
||||
use std::ops::Range;
|
||||
use std::sync::Arc;
|
||||
@@ -45,29 +46,15 @@ pub const HEAP_SIZE_MAX: usize = u32::max_value() as usize - MARGIN_IN_BYTES;
|
||||
// reaches `PIPELINE_MAX_SIZE_IN_DOCS`
|
||||
const PIPELINE_MAX_SIZE_IN_DOCS: usize = 10_000;
|
||||
|
||||
type OperationSender = channel::Sender<Vec<AddOperation>>;
|
||||
type OperationReceiver = channel::Receiver<Vec<AddOperation>>;
|
||||
|
||||
/// Split the thread memory budget into
|
||||
/// - the heap size
|
||||
/// - the hash table "table" itself.
|
||||
///
|
||||
/// Returns (the heap size in bytes, the hash table size in number of bits)
|
||||
fn initial_table_size(per_thread_memory_budget: usize) -> usize {
|
||||
assert!(per_thread_memory_budget > 1_000);
|
||||
let table_size_limit: usize = per_thread_memory_budget / 3;
|
||||
if let Some(limit) = (1..)
|
||||
.take_while(|num_bits: &usize| compute_table_size(*num_bits) < table_size_limit)
|
||||
.last()
|
||||
{
|
||||
limit.min(19) // we cap it at 2^19 = 512K.
|
||||
} else {
|
||||
unreachable!(
|
||||
"Per thread memory is too small: {}",
|
||||
per_thread_memory_budget
|
||||
);
|
||||
}
|
||||
}
|
||||
// Group of operations.
// Most of the time, users will send operations one by one, but it can be useful to
// send them as a small block to ensure that
// - all docs in the block land in the same segment, with contiguous doc ids.
// - all operations in the group are committed at the same time, making the group
//   atomic.
|
||||
type OperationGroup = SmallVec<[AddOperation; 4]>;
|
||||
type OperationSender = channel::Sender<OperationGroup>;
|
||||
type OperationReceiver = channel::Receiver<OperationGroup>;
|
||||
|
||||
/// `IndexWriter` is the user entry-point to add document to an index.
|
||||
///
|
||||
@@ -95,85 +82,13 @@ pub struct IndexWriter {
|
||||
|
||||
num_threads: usize,
|
||||
|
||||
generation: usize,
|
||||
|
||||
delete_queue: DeleteQueue,
|
||||
|
||||
stamper: Stamper,
|
||||
committed_opstamp: Opstamp,
|
||||
}
|
||||
|
||||
/// Open a new index writer. Attempts to acquire a lockfile.
|
||||
///
|
||||
/// The lockfile should be deleted on drop, but it is possible
|
||||
/// that due to a panic or other error, a stale lockfile will be
|
||||
/// left in the index directory. If you are sure that no other
|
||||
/// `IndexWriter` on the system is accessing the index directory,
|
||||
/// it is safe to manually delete the lockfile.
|
||||
///
|
||||
/// `num_threads` specifies the number of indexing workers that
|
||||
/// should work at the same time.
|
||||
/// # Errors
|
||||
/// If the lockfile already exists, returns `Error::FileAlreadyExists`.
|
||||
/// # Panics
|
||||
/// If the heap size per thread is too small, panics.
|
||||
pub fn open_index_writer(
|
||||
index: &Index,
|
||||
num_threads: usize,
|
||||
heap_size_in_bytes_per_thread: usize,
|
||||
directory_lock: DirectoryLock,
|
||||
) -> Result<IndexWriter> {
|
||||
if heap_size_in_bytes_per_thread < HEAP_SIZE_MIN {
|
||||
let err_msg = format!(
|
||||
"The heap size per thread needs to be at least {}.",
|
||||
HEAP_SIZE_MIN
|
||||
);
|
||||
return Err(TantivyError::InvalidArgument(err_msg));
|
||||
}
|
||||
if heap_size_in_bytes_per_thread >= HEAP_SIZE_MAX {
|
||||
let err_msg = format!("The heap size per thread cannot exceed {}", HEAP_SIZE_MAX);
|
||||
return Err(TantivyError::InvalidArgument(err_msg));
|
||||
}
|
||||
let (document_sender, document_receiver): (OperationSender, OperationReceiver) =
|
||||
channel::bounded(PIPELINE_MAX_SIZE_IN_DOCS);
|
||||
|
||||
let delete_queue = DeleteQueue::new();
|
||||
|
||||
let current_opstamp = index.load_metas()?.opstamp;
|
||||
|
||||
let stamper = Stamper::new(current_opstamp);
|
||||
|
||||
let segment_updater =
|
||||
SegmentUpdater::create(index.clone(), stamper.clone(), &delete_queue.cursor())?;
|
||||
|
||||
let mut index_writer = IndexWriter {
|
||||
_directory_lock: Some(directory_lock),
|
||||
|
||||
heap_size_in_bytes_per_thread,
|
||||
index: index.clone(),
|
||||
|
||||
operation_receiver: document_receiver,
|
||||
operation_sender: document_sender,
|
||||
|
||||
segment_updater,
|
||||
|
||||
workers_join_handle: vec![],
|
||||
num_threads,
|
||||
|
||||
delete_queue,
|
||||
|
||||
committed_opstamp: current_opstamp,
|
||||
stamper,
|
||||
|
||||
generation: 0,
|
||||
|
||||
worker_id: 0,
|
||||
};
|
||||
index_writer.start_workers()?;
|
||||
Ok(index_writer)
|
||||
}
|
||||
|
||||
pub fn compute_deleted_bitset(
|
||||
fn compute_deleted_bitset(
|
||||
delete_bitset: &mut BitSet,
|
||||
segment_reader: &SegmentReader,
|
||||
delete_cursor: &mut DeleteCursor,
|
||||
@@ -181,35 +96,30 @@ pub fn compute_deleted_bitset(
|
||||
target_opstamp: Opstamp,
|
||||
) -> Result<bool> {
|
||||
let mut might_have_changed = false;
|
||||
|
||||
#[cfg_attr(feature = "cargo-clippy", allow(clippy::while_let_loop))]
|
||||
loop {
|
||||
if let Some(delete_op) = delete_cursor.get() {
|
||||
if delete_op.opstamp > target_opstamp {
|
||||
break;
|
||||
} else {
|
||||
// A delete operation should only affect
// documents that were inserted before it.
|
||||
//
|
||||
// Limit doc helps identify the first document
|
||||
// that may be affected by the delete operation.
|
||||
let limit_doc = doc_opstamps.compute_doc_limit(delete_op.opstamp);
|
||||
let inverted_index = segment_reader.inverted_index(delete_op.term.field());
|
||||
if let Some(mut docset) =
|
||||
inverted_index.read_postings(&delete_op.term, IndexRecordOption::Basic)
|
||||
{
|
||||
while docset.advance() {
|
||||
let deleted_doc = docset.doc();
|
||||
if deleted_doc < limit_doc {
|
||||
delete_bitset.insert(deleted_doc as usize);
|
||||
might_have_changed = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
} else {
|
||||
while let Some(delete_op) = delete_cursor.get() {
|
||||
if delete_op.opstamp > target_opstamp {
|
||||
break;
|
||||
}
|
||||
|
||||
// A delete operation should only affect
// documents that were inserted before it.
|
||||
//
|
||||
// Limit doc helps identify the first document
|
||||
// that may be affected by the delete operation.
|
||||
let limit_doc = doc_opstamps.compute_doc_limit(delete_op.opstamp);
|
||||
let inverted_index = segment_reader.inverted_index(delete_op.term.field());
|
||||
if let Some(mut docset) =
|
||||
inverted_index.read_postings(&delete_op.term, IndexRecordOption::Basic)
|
||||
{
|
||||
while docset.advance() {
|
||||
let deleted_doc = docset.doc();
|
||||
if deleted_doc < limit_doc {
|
||||
delete_bitset.insert(deleted_doc as usize);
|
||||
might_have_changed = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
delete_cursor.advance();
|
||||
}
|
||||
Ok(might_have_changed)
|
||||
@@ -217,7 +127,7 @@ pub fn compute_deleted_bitset(
|
||||
|
||||
/// Advance delete for the given segment up
|
||||
/// to the target opstamp.
|
||||
pub fn advance_deletes(
|
||||
pub(crate) fn advance_deletes(
|
||||
mut segment: Segment,
|
||||
segment_entry: &mut SegmentEntry,
|
||||
target_opstamp: Opstamp,
|
||||
@@ -229,8 +139,8 @@ pub fn advance_deletes(
|
||||
}
|
||||
|
||||
let segment_reader = SegmentReader::open(&segment)?;
|
||||
let max_doc = segment_reader.max_doc();
|
||||
|
||||
let max_doc = segment_reader.max_doc();
|
||||
let mut delete_bitset: BitSet = match segment_entry.delete_bitset() {
|
||||
Some(previous_delete_bitset) => (*previous_delete_bitset).clone(),
|
||||
None => BitSet::with_capacity(max_doc as usize),
|
||||
@@ -267,17 +177,15 @@ pub fn advance_deletes(
|
||||
fn index_documents(
|
||||
memory_budget: usize,
|
||||
segment: &Segment,
|
||||
generation: usize,
|
||||
document_iterator: &mut dyn Iterator<Item = Vec<AddOperation>>,
|
||||
grouped_document_iterator: &mut dyn Iterator<Item = OperationGroup>,
|
||||
segment_updater: &mut SegmentUpdater,
|
||||
mut delete_cursor: DeleteCursor,
|
||||
) -> Result<bool> {
|
||||
let schema = segment.schema();
|
||||
let segment_id = segment.id();
|
||||
let table_size = initial_table_size(memory_budget);
|
||||
let mut segment_writer = SegmentWriter::for_segment(table_size, segment.clone(), &schema)?;
|
||||
for documents in document_iterator {
|
||||
for doc in documents {
|
||||
let mut segment_writer = SegmentWriter::for_segment(memory_budget, segment.clone(), &schema)?;
|
||||
for document_group in grouped_document_iterator {
|
||||
for doc in document_group {
|
||||
segment_writer.add_document(doc, &schema)?;
|
||||
}
|
||||
let mem_usage = segment_writer.mem_usage();
|
||||
@@ -301,37 +209,114 @@ fn index_documents(
|
||||
assert!(num_docs > 0);
|
||||
|
||||
let doc_opstamps: Vec<Opstamp> = segment_writer.finalize()?;
|
||||
|
||||
let segment_meta = SegmentMeta::new(segment_id, num_docs);
|
||||
let segment_meta = segment.index().new_segment_meta(segment_id, num_docs);
|
||||
|
||||
let last_docstamp: Opstamp = *(doc_opstamps.last().unwrap());
|
||||
|
||||
let delete_bitset_opt = if delete_cursor.get().is_some() {
|
||||
let doc_to_opstamps = DocToOpstampMapping::from(doc_opstamps);
|
||||
let segment_reader = SegmentReader::open(segment)?;
|
||||
let mut deleted_bitset = BitSet::with_capacity(num_docs as usize);
|
||||
let may_have_deletes = compute_deleted_bitset(
|
||||
&mut deleted_bitset,
|
||||
&segment_reader,
|
||||
&mut delete_cursor,
|
||||
&doc_to_opstamps,
|
||||
last_docstamp,
|
||||
)?;
|
||||
if may_have_deletes {
|
||||
Some(deleted_bitset)
|
||||
} else {
|
||||
None
|
||||
}
|
||||
} else {
|
||||
let delete_bitset_opt =
|
||||
apply_deletes(&segment, &mut delete_cursor, &doc_opstamps, last_docstamp)?;
|
||||
|
||||
let segment_entry = SegmentEntry::new(segment_meta, delete_cursor, delete_bitset_opt);
|
||||
Ok(segment_updater.add_segment(segment_entry))
|
||||
}
|
||||
|
||||
fn apply_deletes(
|
||||
segment: &Segment,
|
||||
mut delete_cursor: &mut DeleteCursor,
|
||||
doc_opstamps: &[Opstamp],
|
||||
last_docstamp: Opstamp,
|
||||
) -> Result<Option<BitSet<u32>>> {
|
||||
if delete_cursor.get().is_none() {
|
||||
// if there are no delete operation in the queue, no need
|
||||
// to even open the segment.
|
||||
return Ok(None);
|
||||
}
|
||||
let segment_reader = SegmentReader::open(segment)?;
|
||||
let doc_to_opstamps = DocToOpstampMapping::from(doc_opstamps);
|
||||
let mut deleted_bitset = BitSet::with_capacity(segment_reader.max_doc() as usize);
|
||||
let may_have_deletes = compute_deleted_bitset(
|
||||
&mut deleted_bitset,
|
||||
&segment_reader,
|
||||
&mut delete_cursor,
|
||||
&doc_to_opstamps,
|
||||
last_docstamp,
|
||||
)?;
|
||||
Ok(if may_have_deletes {
|
||||
Some(deleted_bitset)
|
||||
} else {
|
||||
None
|
||||
};
|
||||
let segment_entry = SegmentEntry::new(segment_meta, delete_cursor, delete_bitset_opt);
|
||||
Ok(segment_updater.add_segment(generation, segment_entry))
|
||||
})
|
||||
}
|
||||
|
||||
impl IndexWriter {
|
||||
/// Create a new index writer. Attempts to acquire a lockfile.
|
||||
///
|
||||
/// The lockfile should be deleted on drop, but it is possible
|
||||
/// that due to a panic or other error, a stale lockfile will be
|
||||
/// left in the index directory. If you are sure that no other
|
||||
/// `IndexWriter` on the system is accessing the index directory,
|
||||
/// it is safe to manually delete the lockfile.
|
||||
///
|
||||
/// `num_threads` specifies the number of indexing workers that
|
||||
/// should work at the same time.
|
||||
/// # Errors
|
||||
/// If the lockfile already exists, returns `Error::FileAlreadyExists`.
|
||||
/// # Panics
|
||||
/// If the heap size per thread is too small, panics.
|
||||
pub(crate) fn new(
|
||||
index: &Index,
|
||||
num_threads: usize,
|
||||
heap_size_in_bytes_per_thread: usize,
|
||||
directory_lock: DirectoryLock,
|
||||
) -> Result<IndexWriter> {
|
||||
if heap_size_in_bytes_per_thread < HEAP_SIZE_MIN {
|
||||
let err_msg = format!(
|
||||
"The heap size per thread needs to be at least {}.",
|
||||
HEAP_SIZE_MIN
|
||||
);
|
||||
return Err(TantivyError::InvalidArgument(err_msg));
|
||||
}
|
||||
if heap_size_in_bytes_per_thread >= HEAP_SIZE_MAX {
|
||||
let err_msg = format!("The heap size per thread cannot exceed {}", HEAP_SIZE_MAX);
|
||||
return Err(TantivyError::InvalidArgument(err_msg));
|
||||
}
|
||||
let (document_sender, document_receiver): (OperationSender, OperationReceiver) =
|
||||
channel::bounded(PIPELINE_MAX_SIZE_IN_DOCS);
|
||||
|
||||
let delete_queue = DeleteQueue::new();
|
||||
|
||||
let current_opstamp = index.load_metas()?.opstamp;
|
||||
|
||||
let stamper = Stamper::new(current_opstamp);
|
||||
|
||||
let segment_updater =
|
||||
SegmentUpdater::create(index.clone(), stamper.clone(), &delete_queue.cursor())?;
|
||||
|
||||
let mut index_writer = IndexWriter {
|
||||
_directory_lock: Some(directory_lock),
|
||||
|
||||
heap_size_in_bytes_per_thread,
|
||||
index: index.clone(),
|
||||
|
||||
operation_receiver: document_receiver,
|
||||
operation_sender: document_sender,
|
||||
|
||||
segment_updater,
|
||||
|
||||
workers_join_handle: vec![],
|
||||
num_threads,
|
||||
|
||||
delete_queue,
|
||||
|
||||
committed_opstamp: current_opstamp,
|
||||
stamper,
|
||||
|
||||
worker_id: 0,
|
||||
};
|
||||
index_writer.start_workers()?;
|
||||
Ok(index_writer)
|
||||
}
|
||||
|
||||
/// If there are some merging threads, blocks until they all finish their work and
|
||||
/// then drop the `IndexWriter`.
|
||||
pub fn wait_merging_threads(mut self) -> Result<()> {
|
||||
@@ -366,8 +351,7 @@ impl IndexWriter {
|
||||
pub fn add_segment(&mut self, segment_meta: SegmentMeta) {
|
||||
let delete_cursor = self.delete_queue.cursor();
|
||||
let segment_entry = SegmentEntry::new(segment_meta, delete_cursor, None);
|
||||
self.segment_updater
|
||||
.add_segment(self.generation, segment_entry);
|
||||
self.segment_updater.add_segment(segment_entry);
|
||||
}
|
||||
|
||||
/// Creates a new segment.
|
||||
@@ -388,17 +372,12 @@ impl IndexWriter {
|
||||
let document_receiver_clone = self.operation_receiver.clone();
|
||||
let mut segment_updater = self.segment_updater.clone();
|
||||
|
||||
let generation = self.generation;
|
||||
|
||||
let mut delete_cursor = self.delete_queue.cursor();
|
||||
|
||||
let mem_budget = self.heap_size_in_bytes_per_thread;
|
||||
let index = self.index.clone();
|
||||
let join_handle: JoinHandle<Result<()>> = thread::Builder::new()
|
||||
.name(format!(
|
||||
"thrd-tantivy-index{}-gen{}",
|
||||
self.worker_id, generation
|
||||
))
|
||||
.name(format!("thrd-tantivy-index{}", self.worker_id))
|
||||
.spawn(move || {
|
||||
loop {
|
||||
let mut document_iterator =
|
||||
@@ -427,7 +406,6 @@ impl IndexWriter {
|
||||
index_documents(
|
||||
mem_budget,
|
||||
&segment,
|
||||
generation,
|
||||
&mut document_iterator,
|
||||
&mut segment_updater,
|
||||
delete_cursor.clone(),
|
||||
@@ -459,7 +437,7 @@ impl IndexWriter {
|
||||
/// Detects and removes the files that
|
||||
/// are not used by the index anymore.
|
||||
pub fn garbage_collect_files(&mut self) -> Result<()> {
|
||||
self.segment_updater.garbage_collect_files()
|
||||
self.segment_updater.garbage_collect_files().wait()
|
||||
}
|
||||
|
||||
/// Deletes all documents from the index
|
||||
@@ -469,12 +447,10 @@ impl IndexWriter {
|
||||
/// by clearing and resubmitting necessary documents
|
||||
///
|
||||
/// ```rust
|
||||
/// #[macro_use]
|
||||
/// extern crate tantivy;
|
||||
/// use tantivy::query::QueryParser;
|
||||
/// use tantivy::collector::TopDocs;
|
||||
/// use tantivy::query::QueryParser;
|
||||
/// use tantivy::schema::*;
|
||||
/// use tantivy::Index;
|
||||
/// use tantivy::{doc, Index};
|
||||
///
|
||||
/// fn main() -> tantivy::Result<()> {
|
||||
/// let mut schema_builder = Schema::builder();
|
||||
@@ -559,7 +535,7 @@ impl IndexWriter {
|
||||
.take()
|
||||
.expect("The IndexWriter does not have any lock. This is a bug, please report.");
|
||||
|
||||
let new_index_writer: IndexWriter = open_index_writer(
|
||||
let new_index_writer: IndexWriter = IndexWriter::new(
|
||||
&self.index,
|
||||
self.num_threads,
|
||||
self.heap_size_in_bytes_per_thread,
|
||||
@@ -577,7 +553,7 @@ impl IndexWriter {
|
||||
//
|
||||
// This will reach an end as the only document_sender
|
||||
// was dropped with the index_writer.
|
||||
for _ in document_receiver.clone() {}
|
||||
for _ in document_receiver {}
|
||||
|
||||
Ok(self.committed_opstamp)
|
||||
}
|
||||
@@ -608,10 +584,10 @@ impl IndexWriter {
|
||||
// all of the segment update for this commit have been
|
||||
// sent.
|
||||
//
|
||||
// No document belonging to the next generation have been
|
||||
// No document belonging to the next commit have been
|
||||
// pushed too, because add_document can only happen
|
||||
// on this thread.
|
||||
|
||||
//
|
||||
// This will move uncommitted segments to the state of
|
||||
// committed segments.
|
||||
info!("Preparing commit");
|
||||
@@ -627,7 +603,6 @@ impl IndexWriter {
|
||||
.join()
|
||||
.map_err(|e| TantivyError::ErrorInThread(format!("{:?}", e)))?;
|
||||
indexing_worker_result?;
|
||||
// add a new worker for the next generation.
|
||||
self.add_indexing_worker()?;
|
||||
}
|
||||
|
||||
@@ -698,7 +673,7 @@ impl IndexWriter {
|
||||
pub fn add_document(&self, document: Document) -> Opstamp {
|
||||
let opstamp = self.stamper.stamp();
|
||||
let add_operation = AddOperation { opstamp, document };
|
||||
let send_result = self.operation_sender.send(vec![add_operation]);
|
||||
let send_result = self.operation_sender.send(smallvec![add_operation]);
|
||||
if let Err(e) = send_result {
|
||||
panic!("Failed to index document. Sending to indexing channel failed. This probably means all of the indexing threads have panicked. {:?}", e);
|
||||
}
|
||||
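`add_document` above wraps a single stamped operation into a one-element group. A batch variant would stamp each document and push the whole `OperationGroup` through the channel in one send, which is what keeps a group on the same segment with contiguous doc ids. A hedged sketch follows; this exact helper does not exist in the diff, it only reuses the `stamper`, `operation_sender` and `AddOperation` pieces shown above.

    /// Hypothetical batch counterpart of `add_document`: one send, one group.
    pub fn add_documents_grouped(&self, documents: Vec<Document>) -> Opstamp {
        let mut group = OperationGroup::default();
        let mut last_opstamp: Opstamp = 0;
        for document in documents {
            let opstamp = self.stamper.stamp();
            last_opstamp = opstamp;
            group.push(AddOperation { opstamp, document });
        }
        // A single send keeps every operation of the group on the same
        // indexing worker, hence in the same segment.
        if let Err(e) = self.operation_sender.send(group) {
            panic!("Failed to index documents. Sending to indexing channel failed. {:?}", e);
        }
        last_opstamp
    }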
@@ -745,7 +720,7 @@ impl IndexWriter {
|
||||
}
|
||||
let (batch_opstamp, stamps) = self.get_batch_opstamps(count);
|
||||
|
||||
let mut adds: Vec<AddOperation> = Vec::new();
|
||||
let mut adds = OperationGroup::default();
|
||||
|
||||
for (user_op, opstamp) in user_operations.into_iter().zip(stamps) {
|
||||
match user_op {
|
||||
@@ -772,7 +747,6 @@ impl IndexWriter {
|
||||
mod tests {
|
||||
|
||||
use super::super::operation::UserOperation;
|
||||
use super::initial_table_size;
|
||||
use crate::collector::TopDocs;
|
||||
use crate::directory::error::LockError;
|
||||
use crate::error::*;
|
||||
@@ -1064,41 +1038,6 @@ mod tests {
|
||||
assert_eq!(num_docs_containing("b"), 100);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_hashmap_size() {
|
||||
assert_eq!(initial_table_size(100_000), 11);
|
||||
assert_eq!(initial_table_size(1_000_000), 14);
|
||||
assert_eq!(initial_table_size(10_000_000), 17);
|
||||
assert_eq!(initial_table_size(1_000_000_000), 19);
|
||||
}
|
||||
|
||||
#[cfg(not(feature = "no_fail"))]
|
||||
#[test]
|
||||
fn test_write_commit_fails() {
|
||||
use fail;
|
||||
let mut schema_builder = schema::Schema::builder();
|
||||
let text_field = schema_builder.add_text_field("text", schema::TEXT);
|
||||
let index = Index::create_in_ram(schema_builder.build());
|
||||
|
||||
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
|
||||
for _ in 0..100 {
|
||||
index_writer.add_document(doc!(text_field => "a"));
|
||||
}
|
||||
index_writer.commit().unwrap();
|
||||
fail::cfg("RAMDirectory::atomic_write", "return(error_write_failed)").unwrap();
|
||||
for _ in 0..100 {
|
||||
index_writer.add_document(doc!(text_field => "b"));
|
||||
}
|
||||
assert!(index_writer.commit().is_err());
|
||||
let num_docs_containing = |s: &str| {
|
||||
let term_a = Term::from_field_text(text_field, s);
|
||||
index.reader().unwrap().searcher().doc_freq(&term_a)
|
||||
};
|
||||
assert_eq!(num_docs_containing("a"), 100);
|
||||
assert_eq!(num_docs_containing("b"), 0);
|
||||
fail::cfg("RAMDirectory::atomic_write", "off").unwrap();
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_add_then_delete_all_documents() {
|
||||
let mut schema_builder = schema::Schema::builder();
|
||||
|
||||
@@ -95,8 +95,11 @@ impl Default for LogMergePolicy {
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use crate::core::{SegmentId, SegmentMeta};
|
||||
use crate::core::{SegmentId, SegmentMeta, SegmentMetaInventory};
|
||||
use crate::indexer::merge_policy::MergePolicy;
|
||||
use once_cell::sync::Lazy;
|
||||
|
||||
static INVENTORY: Lazy<SegmentMetaInventory> = Lazy::new(SegmentMetaInventory::default);
|
||||
|
||||
fn test_merge_policy() -> LogMergePolicy {
|
||||
let mut log_merge_policy = LogMergePolicy::default();
|
||||
@@ -113,7 +116,7 @@ mod tests {
|
||||
}
|
||||
|
||||
fn create_random_segment_meta(num_docs: u32) -> SegmentMeta {
|
||||
SegmentMeta::new(SegmentId::generate_random(), num_docs)
|
||||
INVENTORY.new_segment_meta(SegmentId::generate_random(), num_docs)
|
||||
}
|
||||
|
||||
#[test]
|
||||
|
||||
@@ -207,6 +207,7 @@ impl IndexMerger {
|
||||
}
|
||||
FieldType::U64(ref options)
|
||||
| FieldType::I64(ref options)
|
||||
| FieldType::F64(ref options)
|
||||
| FieldType::Date(ref options) => match options.get_fastfield_cardinality() {
|
||||
Some(Cardinality::SingleValue) => {
|
||||
self.write_single_fast_field(field, fast_field_serializer)?;
|
||||
@@ -692,7 +693,7 @@ impl SerializableSegment for IndexMerger {
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use crate::collector::tests::TestCollector;
|
||||
use crate::collector::tests::TEST_COLLECTOR_WITH_SCORE;
|
||||
use crate::collector::tests::{BytesFastFieldTestCollector, FastFieldTestCollector};
|
||||
use crate::collector::{Count, FacetCollector};
|
||||
use crate::core::Index;
|
||||
@@ -807,7 +808,7 @@ mod tests {
|
||||
let searcher = reader.searcher();
|
||||
let get_doc_ids = |terms: Vec<Term>| {
|
||||
let query = BooleanQuery::new_multiterms_query(terms);
|
||||
let top_docs = searcher.search(&query, &TestCollector).unwrap();
|
||||
let top_docs = searcher.search(&query, &TEST_COLLECTOR_WITH_SCORE).unwrap();
|
||||
top_docs.docs().to_vec()
|
||||
};
|
||||
{
|
||||
|
||||
@@ -1,14 +1,12 @@
|
||||
use super::segment_register::SegmentRegister;
|
||||
use crate::core::SegmentId;
|
||||
use crate::core::SegmentMeta;
|
||||
use crate::core::META_FILEPATH;
|
||||
use crate::error::TantivyError;
|
||||
use crate::indexer::delete_queue::DeleteCursor;
|
||||
use crate::indexer::SegmentEntry;
|
||||
use crate::Result as TantivyResult;
|
||||
use std::collections::hash_set::HashSet;
|
||||
use std::fmt::{self, Debug, Formatter};
|
||||
use std::path::PathBuf;
|
||||
use std::sync::RwLock;
|
||||
use std::sync::{RwLockReadGuard, RwLockWriteGuard};
|
||||
|
||||
@@ -75,19 +73,6 @@ impl SegmentManager {
|
||||
segment_entries
|
||||
}
|
||||
|
||||
/// List the files that are useful to the index.
|
||||
///
|
||||
/// This does not include lock files, or files that are obsolete
|
||||
/// but have not yet been deleted by the garbage collector.
|
||||
pub fn list_files(&self) -> HashSet<PathBuf> {
|
||||
let mut files = HashSet::new();
|
||||
files.insert(META_FILEPATH.to_path_buf());
|
||||
for segment_meta in SegmentMeta::all() {
|
||||
files.extend(segment_meta.list_files());
|
||||
}
|
||||
files
|
||||
}
|
||||
|
||||
// Lock poisoning should never happen :
|
||||
// The lock is acquired and released within this class,
|
||||
// and the operations cannot panic.
|
||||
|
||||
@@ -93,8 +93,7 @@ impl SegmentRegister {
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use crate::core::SegmentId;
|
||||
use crate::core::SegmentMeta;
|
||||
use crate::core::{SegmentId, SegmentMetaInventory};
|
||||
use crate::indexer::delete_queue::*;
|
||||
|
||||
fn segment_ids(segment_register: &SegmentRegister) -> Vec<SegmentId> {
|
||||
@@ -107,6 +106,7 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
fn test_segment_register() {
|
||||
let inventory = SegmentMetaInventory::default();
|
||||
let delete_queue = DeleteQueue::new();
|
||||
|
||||
let mut segment_register = SegmentRegister::default();
|
||||
@@ -115,20 +115,20 @@ mod tests {
|
||||
let segment_id_merged = SegmentId::generate_random();
|
||||
|
||||
{
|
||||
let segment_meta = SegmentMeta::new(segment_id_a, 0u32);
|
||||
let segment_meta = inventory.new_segment_meta(segment_id_a, 0u32);
|
||||
let segment_entry = SegmentEntry::new(segment_meta, delete_queue.cursor(), None);
|
||||
segment_register.add_segment_entry(segment_entry);
|
||||
}
|
||||
assert_eq!(segment_ids(&segment_register), vec![segment_id_a]);
|
||||
{
|
||||
let segment_meta = SegmentMeta::new(segment_id_b, 0u32);
|
||||
let segment_meta = inventory.new_segment_meta(segment_id_b, 0u32);
|
||||
let segment_entry = SegmentEntry::new(segment_meta, delete_queue.cursor(), None);
|
||||
segment_register.add_segment_entry(segment_entry);
|
||||
}
|
||||
segment_register.remove_segment(&segment_id_a);
|
||||
segment_register.remove_segment(&segment_id_b);
|
||||
{
|
||||
let segment_meta_merged = SegmentMeta::new(segment_id_merged, 0u32);
|
||||
let segment_meta_merged = inventory.new_segment_meta(segment_id_merged, 0u32);
|
||||
let segment_entry = SegmentEntry::new(segment_meta_merged, delete_queue.cursor(), None);
|
||||
segment_register.add_segment_entry(segment_entry);
|
||||
}
|
||||
|
||||
@@ -33,6 +33,7 @@ use std::collections::HashSet;
|
||||
use std::io::Write;
|
||||
use std::mem;
|
||||
use std::ops::DerefMut;
|
||||
use std::path::PathBuf;
|
||||
use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering};
|
||||
use std::sync::Arc;
|
||||
use std::sync::RwLock;
|
||||
@@ -125,7 +126,7 @@ fn perform_merge(
|
||||
|
||||
let num_docs = merger.write(segment_serializer)?;
|
||||
|
||||
let segment_meta = SegmentMeta::new(merged_segment.id(), num_docs);
|
||||
let segment_meta = index.new_segment_meta(merged_segment.id(), num_docs);
|
||||
|
||||
let after_merge_segment_entry = SegmentEntry::new(segment_meta.clone(), delete_cursor, None);
|
||||
Ok(after_merge_segment_entry)
|
||||
@@ -145,7 +146,6 @@ struct InnerSegmentUpdater {
|
||||
merge_policy: RwLock<Arc<Box<dyn MergePolicy>>>,
|
||||
merging_thread_id: AtomicUsize,
|
||||
merging_threads: RwLock<HashMap<usize, JoinHandle<Result<()>>>>,
|
||||
generation: AtomicUsize,
|
||||
killed: AtomicBool,
|
||||
stamper: Stamper,
|
||||
merge_operations: MergeOperationInventory,
|
||||
@@ -172,7 +172,6 @@ impl SegmentUpdater {
|
||||
merge_policy: RwLock::new(Arc::new(Box::new(DefaultMergePolicy::default()))),
|
||||
merging_thread_id: AtomicUsize::default(),
|
||||
merging_threads: RwLock::new(HashMap::new()),
|
||||
generation: AtomicUsize::default(),
|
||||
killed: AtomicBool::new(false),
|
||||
stamper,
|
||||
merge_operations: Default::default(),
|
||||
@@ -200,18 +199,14 @@ impl SegmentUpdater {
|
||||
self.0.pool.spawn_fn(move || Ok(f(me_clone)))
|
||||
}
|
||||
|
||||
pub fn add_segment(&self, generation: usize, segment_entry: SegmentEntry) -> bool {
|
||||
if generation >= self.0.generation.load(Ordering::Acquire) {
|
||||
self.run_async(|segment_updater| {
|
||||
segment_updater.0.segment_manager.add_segment(segment_entry);
|
||||
segment_updater.consider_merge_options();
|
||||
true
|
||||
})
|
||||
.forget();
|
||||
pub fn add_segment(&self, segment_entry: SegmentEntry) -> bool {
|
||||
self.run_async(|segment_updater| {
|
||||
segment_updater.0.segment_manager.add_segment(segment_entry);
|
||||
segment_updater.consider_merge_options();
|
||||
true
|
||||
} else {
|
||||
false
|
||||
}
|
||||
})
|
||||
.forget();
|
||||
true
|
||||
}
|
||||
|
||||
/// Orders `SegmentManager` to remove all segments
|
||||
@@ -272,19 +267,29 @@ impl SegmentUpdater {
|
||||
}
|
||||
}
|
||||
|
||||
pub fn garbage_collect_files(&self) -> Result<()> {
|
||||
pub fn garbage_collect_files(&self) -> CpuFuture<(), TantivyError> {
|
||||
self.run_async(move |segment_updater| {
|
||||
segment_updater.garbage_collect_files_exec();
|
||||
})
|
||||
.wait()
|
||||
}
|
||||
|
||||
/// List the files that are useful to the index.
|
||||
///
|
||||
/// This does not include lock files, or files that are obsolete
|
||||
/// but have not yet been deleted by the garbage collector.
|
||||
fn list_files(&self) -> HashSet<PathBuf> {
|
||||
let mut files = HashSet::new();
|
||||
files.insert(META_FILEPATH.to_path_buf());
|
||||
for segment_meta in self.0.index.list_all_segment_metas() {
|
||||
files.extend(segment_meta.list_files());
|
||||
}
|
||||
files
|
||||
}
|
||||
|
||||
fn garbage_collect_files_exec(&self) {
|
||||
info!("Running garbage collection");
|
||||
let mut index = self.0.index.clone();
|
||||
index
|
||||
.directory_mut()
|
||||
.garbage_collect(|| self.0.segment_manager.list_files());
|
||||
index.directory_mut().garbage_collect(|| self.list_files());
|
||||
}
|
||||
|
||||
pub fn commit(&self, opstamp: Opstamp, payload: Option<String>) -> Result<()> {
|
||||
|
||||
@@ -4,6 +4,7 @@ use crate::core::SerializableSegment;
|
||||
use crate::fastfield::FastFieldsWriter;
|
||||
use crate::fieldnorm::FieldNormsWriter;
|
||||
use crate::indexer::segment_serializer::SegmentSerializer;
|
||||
use crate::postings::compute_table_size;
|
||||
use crate::postings::MultiFieldPostingsWriter;
|
||||
use crate::schema::FieldEntry;
|
||||
use crate::schema::FieldType;
|
||||
@@ -16,9 +17,26 @@ use crate::tokenizer::{TokenStream, Tokenizer};
|
||||
use crate::DocId;
|
||||
use crate::Opstamp;
|
||||
use crate::Result;
|
||||
use crate::TantivyError;
|
||||
use std::io;
|
||||
use std::str;
|
||||
|
||||
/// Computes the initial size of the hash table.
|
||||
///
|
||||
/// Returns a number of bit `b`, such that the recommended initial table size is 2^b.
|
||||
fn initial_table_size(per_thread_memory_budget: usize) -> Result<usize> {
|
||||
let table_memory_upper_bound = per_thread_memory_budget / 3;
|
||||
if let Some(limit) = (10..)
|
||||
.take_while(|num_bits: &usize| compute_table_size(*num_bits) < table_memory_upper_bound)
|
||||
.last()
|
||||
{
|
||||
Ok(limit.min(19)) // we cap it at 2^19 = 512K.
|
||||
} else {
|
||||
Err(TantivyError::InvalidArgument(
|
||||
format!("per thread memory budget (={}) is too small. Raise the memory budget or lower the number of threads.", per_thread_memory_budget)))
|
||||
}
|
||||
}
|
||||
|
||||
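The budget split above is deliberately coarse: the hash table may take at most a third of the per-thread memory, and its size is capped at 2^19 buckets no matter how large the budget. Assuming `compute_table_size(b)` reports the byte footprint of a table with 2^b buckets, a quick worked example, consistent with the `test_hashmap_size` test at the bottom of this file:

    fn example() -> Result<()> {
        // ~10 MB per indexing thread => table upper bound ~3.3 MB => 17 bits,
        // i.e. 2^17 = 131_072 hash-table buckets.
        let num_bits = initial_table_size(10_000_000)?;
        assert_eq!(num_bits, 17);
        Ok(())
    }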
/// A `SegmentWriter` is in charge of creating segment index from a
/// set of documents.
///
@@ -31,7 +49,7 @@ pub struct SegmentWriter {
    fast_field_writers: FastFieldsWriter,
    fieldnorms_writer: FieldNormsWriter,
    doc_opstamps: Vec<Opstamp>,
    tokenizers: Vec<Option<Box<dyn BoxedTokenizer>>>,
    tokenizers: Vec<Option<BoxedTokenizer>>,
}

impl SegmentWriter {
@@ -45,12 +63,13 @@ impl SegmentWriter {
    /// - segment: The segment being written
    /// - schema
    pub fn for_segment(
        table_bits: usize,
        memory_budget: usize,
        mut segment: Segment,
        schema: &Schema,
    ) -> Result<SegmentWriter> {
        let table_num_bits = initial_table_size(memory_budget)?;
        let segment_serializer = SegmentSerializer::for_segment(&mut segment)?;
        let multifield_postings = MultiFieldPostingsWriter::new(schema, table_bits);
        let multifield_postings = MultiFieldPostingsWriter::new(schema, table_num_bits);
        let tokenizers =
            schema
                .fields()
@@ -195,6 +214,17 @@ impl SegmentWriter {
                    }
                }
            }
            FieldType::F64(ref int_option) => {
                if int_option.is_indexed() {
                    for field_value in field_values {
                        let term = Term::from_field_f64(
                            field_value.field(),
                            field_value.value().f64_value(),
                        );
                        self.multifield_postings.subscribe(doc_id, &term);
                    }
                }
            }
            FieldType::Bytes => {
                // Do nothing. Bytes only supports fast fields.
            }
@@ -254,3 +284,17 @@ impl SerializableSegment for SegmentWriter {
        Ok(max_doc)
    }
}
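The new `FieldType::F64` arm above indexes a float by way of `Term::from_field_f64`, i.e. through the same 8-byte, order-preserving `u64` encoding that `i64` fields already use. A minimal sketch of what such a term looks like; the length assertion reflects that encoding and is an illustration, not something this hunk spells out:

```rust
use tantivy::schema::{Schema, INDEXED};
use tantivy::Term;

fn main() {
    let mut schema_builder = Schema::builder();
    let value = schema_builder.add_f64_field("value", INDEXED);
    let _schema = schema_builder.build();

    // An f64 term is the target field plus the 8-byte, order-preserving
    // u64 encoding of the value (assumed layout, for illustration).
    let term = Term::from_field_f64(value, 2.5);
    assert_eq!(term.field(), value);
    assert_eq!(term.value_bytes().len(), 8);
}
```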
#[cfg(test)]
mod tests {
    use super::initial_table_size;

    #[test]
    fn test_hashmap_size() {
        assert_eq!(initial_table_size(100_000).unwrap(), 11);
        assert_eq!(initial_table_size(1_000_000).unwrap(), 14);
        assert_eq!(initial_table_size(10_000_000).unwrap(), 17);
        assert_eq!(initial_table_size(1_000_000_000).unwrap(), 19);
    }

}
src/lib.rs
@@ -3,7 +3,6 @@
#![cfg_attr(feature = "cargo-clippy", allow(clippy::module_inception))]
#![doc(test(attr(allow(unused_variables), deny(warnings))))]
#![warn(missing_docs)]
#![recursion_limit = "80"]

//! # `tantivy`
//!
@@ -11,26 +10,17 @@
//! Think `Lucene`, but in Rust.
//!
//! ```rust

//! # extern crate tempdir;
//! #
//! #[macro_use]
//! extern crate tantivy;
//!
//! // ...
//!
//! # use std::path::Path;
//! # use tempdir::TempDir;
//! # use tantivy::Index;
//! # use tantivy::schema::*;
//! # use tantivy::{Score, DocAddress};
//! # use tempfile::TempDir;
//! # use tantivy::collector::TopDocs;
//! # use tantivy::query::QueryParser;
//! # use tantivy::schema::*;
//! # use tantivy::{doc, DocAddress, Index, Score};
//! #
//! # fn main() {
//! #   // Let's create a temporary directory for the
//! #   // sake of this example
//! #   if let Ok(dir) = TempDir::new("tantivy_example_dir") {
//! #   if let Ok(dir) = TempDir::new() {
//! #     run_example(dir.path()).unwrap();
//! #     dir.close().unwrap();
//! #   }
@@ -105,8 +95,8 @@
//!
//! A good place for you to get started is to check out
//! the example code (
//! [literate programming](http://fulmicoton.com/tantivy-examples/simple_search.html) /
//! [source code](https://github.com/fulmicoton/tantivy/blob/master/examples/simple_search.rs))
//! [literate programming](https://tantivy-search.github.io/examples/basic_search.html) /
//! [source code](https://github.com/tantivy-search/tantivy/blob/master/examples/basic_search.rs))

#[macro_use]
extern crate serde_derive;
@@ -171,16 +161,16 @@ pub use self::snippet::{Snippet, SnippetGenerator};
mod docset;
pub use self::docset::{DocSet, SkipResult};

pub use crate::common::{f64_to_u64, i64_to_u64, u64_to_f64, u64_to_i64};
pub use crate::core::SegmentComponent;
pub use crate::core::{Index, IndexMeta, Searcher, Segment, SegmentId, SegmentMeta};
pub use crate::core::{InvertedIndexReader, SegmentReader};
pub use crate::directory::Directory;
pub use crate::indexer::IndexWriter;
pub use crate::postings::Postings;
pub use crate::reader::LeasedItem;
pub use crate::schema::{Document, Term};

pub use crate::common::{i64_to_u64, u64_to_i64};
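The newly exported `f64_to_u64` / `u64_to_f64` pair is what lets `f64` fields piggyback on the existing `u64` code paths: the mapping is lossless and order-preserving, so term ordering and range bounds keep their meaning. A small sketch of those two properties, assuming the functions are used straight from the crate root as the `pub use` above suggests:

```rust
use tantivy::{f64_to_u64, u64_to_f64};

fn main() {
    // Lossless: a value survives the round trip unchanged.
    let x = -2.5f64;
    assert_eq!(u64_to_f64(f64_to_u64(x)), x);

    // Order-preserving: comparing the mapped u64s agrees with comparing
    // the original f64s, which is what keeps f64 term ordering and
    // range queries correct.
    assert!(f64_to_u64(-2.5) < f64_to_u64(-1.0));
    assert!(f64_to_u64(-1.0) < f64_to_u64(0.0));
    assert!(f64_to_u64(0.0) < f64_to_u64(3.5));
}
```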
/// Expose the current version of tantivy, as well
|
||||
/// whether it was compiled with the simd compression.
|
||||
pub fn version() -> &'static str {
|
||||
@@ -250,7 +240,7 @@ pub struct DocAddress(pub SegmentLocalId, pub DocId);
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
|
||||
use crate::collector::tests::TestCollector;
|
||||
use crate::collector::tests::TEST_COLLECTOR_WITH_SCORE;
|
||||
use crate::core::SegmentReader;
|
||||
use crate::docset::DocSet;
|
||||
use crate::query::BooleanQuery;
|
||||
@@ -625,6 +615,30 @@ mod tests {
|
||||
assert!(!postings.advance());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_indexed_f64() {
|
||||
let mut schema_builder = Schema::builder();
|
||||
let value_field = schema_builder.add_f64_field("value", INDEXED);
|
||||
let schema = schema_builder.build();
|
||||
|
||||
let index = Index::create_in_ram(schema);
|
||||
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
|
||||
let val = std::f64::consts::PI;
|
||||
index_writer.add_document(doc!(value_field => val));
|
||||
index_writer.commit().unwrap();
|
||||
let reader = index.reader().unwrap();
|
||||
let searcher = reader.searcher();
|
||||
let term = Term::from_field_f64(value_field, val);
|
||||
let mut postings = searcher
|
||||
.segment_reader(0)
|
||||
.inverted_index(term.field())
|
||||
.read_postings(&term, IndexRecordOption::Basic)
|
||||
.unwrap();
|
||||
assert!(postings.advance());
|
||||
assert_eq!(postings.doc(), 0);
|
||||
assert!(!postings.advance());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_indexedfield_not_in_documents() {
|
||||
let mut schema_builder = Schema::builder();
|
||||
@@ -737,7 +751,7 @@ mod tests {
|
||||
let searcher = reader.searcher();
|
||||
let get_doc_ids = |terms: Vec<Term>| {
|
||||
let query = BooleanQuery::new_multiterms_query(terms);
|
||||
let topdocs = searcher.search(&query, &TestCollector).unwrap();
|
||||
let topdocs = searcher.search(&query, &TEST_COLLECTOR_WITH_SCORE).unwrap();
|
||||
topdocs.docs().to_vec()
|
||||
};
|
||||
assert_eq!(
|
||||
@@ -817,6 +831,7 @@ mod tests {
|
||||
let mut schema_builder = Schema::builder();
|
||||
let fast_field_unsigned = schema_builder.add_u64_field("unsigned", FAST);
|
||||
let fast_field_signed = schema_builder.add_i64_field("signed", FAST);
|
||||
let fast_field_float = schema_builder.add_f64_field("float", FAST);
|
||||
let text_field = schema_builder.add_text_field("text", TEXT);
|
||||
let stored_int_field = schema_builder.add_u64_field("text", STORED);
|
||||
let schema = schema_builder.build();
|
||||
@@ -824,7 +839,8 @@ mod tests {
|
||||
let index = Index::create_in_ram(schema);
|
||||
let mut index_writer = index.writer_with_num_threads(1, 50_000_000).unwrap();
|
||||
{
|
||||
let document = doc!(fast_field_unsigned => 4u64, fast_field_signed=>4i64);
|
||||
let document =
|
||||
doc!(fast_field_unsigned => 4u64, fast_field_signed=>4i64, fast_field_float=>4f64);
|
||||
index_writer.add_document(document);
|
||||
index_writer.commit().unwrap();
|
||||
}
|
||||
@@ -844,10 +860,14 @@ mod tests {
|
||||
assert!(fast_field_reader_opt.is_none());
|
||||
}
|
||||
{
|
||||
let fast_field_reader_opt = segment_reader.fast_fields().i64(fast_field_signed);
|
||||
let fast_field_reader_opt = segment_reader.fast_fields().u64(fast_field_float);
|
||||
assert!(fast_field_reader_opt.is_none());
|
||||
}
|
||||
{
|
||||
let fast_field_reader_opt = segment_reader.fast_fields().u64(fast_field_unsigned);
|
||||
assert!(fast_field_reader_opt.is_some());
|
||||
let fast_field_reader = fast_field_reader_opt.unwrap();
|
||||
assert_eq!(fast_field_reader.get(0), 4i64)
|
||||
assert_eq!(fast_field_reader.get(0), 4u64)
|
||||
}
|
||||
|
||||
{
|
||||
@@ -856,5 +876,12 @@ mod tests {
|
||||
let fast_field_reader = fast_field_reader_opt.unwrap();
|
||||
assert_eq!(fast_field_reader.get(0), 4i64)
|
||||
}
|
||||
|
||||
{
|
||||
let fast_field_reader_opt = segment_reader.fast_fields().f64(fast_field_float);
|
||||
assert!(fast_field_reader_opt.is_some());
|
||||
let fast_field_reader = fast_field_reader_opt.unwrap();
|
||||
assert_eq!(fast_field_reader.get(0), 4f64)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -22,11 +22,9 @@
|
||||
///
|
||||
/// # Example
|
||||
///
|
||||
/// ```
|
||||
/// #[macro_use]
|
||||
/// extern crate tantivy;
|
||||
///
|
||||
/// ```rust
|
||||
/// use tantivy::schema::{Schema, TEXT, FAST};
|
||||
/// use tantivy::doc;
|
||||
///
|
||||
/// //...
|
||||
///
|
||||
|
||||
@@ -220,7 +220,7 @@ pub mod tests {
|
||||
|
||||
{
|
||||
let mut segment_writer =
|
||||
SegmentWriter::for_segment(18, segment.clone(), &schema).unwrap();
|
||||
SegmentWriter::for_segment(3_000_000, segment.clone(), &schema).unwrap();
|
||||
{
|
||||
let mut doc = Document::default();
|
||||
// checking that position works if the field has two values
|
||||
|
||||
@@ -35,6 +35,7 @@ fn posting_from_field_entry(field_entry: &FieldEntry) -> Box<dyn PostingsWriter>
|
||||
.unwrap_or_else(|| SpecializedPostingsWriter::<NothingRecorder>::new_boxed()),
|
||||
FieldType::U64(_)
|
||||
| FieldType::I64(_)
|
||||
| FieldType::F64(_)
|
||||
| FieldType::Date(_)
|
||||
| FieldType::HierarchicalFacet => SpecializedPostingsWriter::<NothingRecorder>::new_boxed(),
|
||||
FieldType::Bytes => {
|
||||
@@ -154,7 +155,7 @@ impl MultiFieldPostingsWriter {
|
||||
.collect();
|
||||
unordered_term_mappings.insert(field, mapping);
|
||||
}
|
||||
FieldType::U64(_) | FieldType::I64(_) | FieldType::Date(_) => {}
|
||||
FieldType::U64(_) | FieldType::I64(_) | FieldType::F64(_) | FieldType::Date(_) => {}
|
||||
FieldType::Bytes => {}
|
||||
}
|
||||
|
||||
|
||||
@@ -8,15 +8,13 @@ use crate::termdict::{TermDictionary, TermStreamer};
|
||||
use crate::DocId;
|
||||
use crate::TantivyError;
|
||||
use crate::{Result, SkipResult};
|
||||
use std::sync::Arc;
|
||||
use tantivy_fst::Automaton;
|
||||
|
||||
/// A weight struct for Fuzzy Term and Regex Queries
|
||||
pub struct AutomatonWeight<A>
|
||||
where
|
||||
A: Automaton + Send + Sync + 'static,
|
||||
{
|
||||
pub struct AutomatonWeight<A> {
|
||||
field: Field,
|
||||
automaton: A,
|
||||
automaton: Arc<A>,
|
||||
}
|
||||
|
||||
impl<A> AutomatonWeight<A>
|
||||
@@ -24,12 +22,16 @@ where
|
||||
A: Automaton + Send + Sync + 'static,
|
||||
{
|
||||
/// Create a new AutomationWeight
|
||||
pub fn new(field: Field, automaton: A) -> AutomatonWeight<A> {
|
||||
AutomatonWeight { field, automaton }
|
||||
pub fn new<IntoArcA: Into<Arc<A>>>(field: Field, automaton: IntoArcA) -> AutomatonWeight<A> {
|
||||
AutomatonWeight {
|
||||
field,
|
||||
automaton: automaton.into(),
|
||||
}
|
||||
}
|
||||
|
||||
fn automaton_stream<'a>(&'a self, term_dict: &'a TermDictionary) -> TermStreamer<'a, &'a A> {
|
||||
let term_stream_builder = term_dict.search(&self.automaton);
|
||||
let automaton: &A = &*self.automaton;
|
||||
let term_stream_builder = term_dict.search(automaton);
|
||||
term_stream_builder.into_stream()
|
||||
}
|
||||
}
|
||||
|
||||
@@ -7,7 +7,7 @@ pub use self::boolean_query::BooleanQuery;
|
||||
mod tests {
|
||||
|
||||
use super::*;
|
||||
use crate::collector::tests::TestCollector;
|
||||
use crate::collector::tests::TEST_COLLECTOR_WITH_SCORE;
|
||||
use crate::query::score_combiner::SumWithCoordsCombiner;
|
||||
use crate::query::term_query::TermScorer;
|
||||
use crate::query::Intersection;
|
||||
@@ -134,7 +134,7 @@ mod tests {
|
||||
let matching_docs = |boolean_query: &dyn Query| {
|
||||
reader
|
||||
.searcher()
|
||||
.search(boolean_query, &TestCollector)
|
||||
.search(boolean_query, &TEST_COLLECTOR_WITH_SCORE)
|
||||
.unwrap()
|
||||
.docs()
|
||||
.iter()
|
||||
@@ -195,7 +195,7 @@ mod tests {
|
||||
let score_docs = |boolean_query: &dyn Query| {
|
||||
let fruit = reader
|
||||
.searcher()
|
||||
.search(boolean_query, &TestCollector)
|
||||
.search(boolean_query, &TEST_COLLECTOR_WITH_SCORE)
|
||||
.unwrap();
|
||||
fruit.scores().to_vec()
|
||||
};
|
||||
|
||||
@@ -1,15 +1,23 @@
use crate::error::TantivyError::InvalidArgument;
use crate::query::{AutomatonWeight, Query, Weight};
use crate::schema::Term;
use crate::termdict::WrappedDFA;
use crate::Result;
use crate::Searcher;
use levenshtein_automata::{LevenshteinAutomatonBuilder, DFA};
use levenshtein_automata::{Distance, LevenshteinAutomatonBuilder, DFA};
use once_cell::sync::Lazy;
use std::collections::HashMap;
use std::ops::Range;
use derive_builder::Builder;

/// The range of Levenshtein distances for which we will build DFAs for our terms.
/// The computation is exponential, so best keep it to low single digits.
const VALID_LEVENSHTEIN_DISTANCE_RANGE: Range<u8> = (0..3);

static LEV_BUILDER: Lazy<HashMap<(u8, bool), LevenshteinAutomatonBuilder>> = Lazy::new(|| {
    let mut lev_builder_cache = HashMap::new();
    // TODO make population lazy on a `(distance, val)` basis
    for distance in 0..3 {
    for distance in VALID_LEVENSHTEIN_DISTANCE_RANGE {
        for &transposition in &[false, true] {
            let lev_automaton_builder = LevenshteinAutomatonBuilder::new(distance, transposition);
            lev_builder_cache.insert((distance, transposition), lev_automaton_builder);
@@ -18,16 +26,46 @@ static LEV_BUILDER: Lazy<HashMap<(u8, bool), LevenshteinAutomatonBuilder>> = Laz
    lev_builder_cache
});

#[derive(Builder, Default, Clone, Debug)]
pub struct FuzzyConfiguration {
    /// How many changes are we going to allow
    pub distance: u8,
    /// Should a transposition cost 1 or 2?
    #[builder(default)]
    pub transposition_cost_one: bool,
    #[builder(default)]
    pub prefix: bool,
    /// If true, only terms at a Levenshtein distance of exactly `distance` will match.
    /// If false, terms at a distance of at most `distance` will match.
    #[builder(default)]
    pub exact_distance: bool,
}

fn build_dfa(fuzzy_configuration: &FuzzyConfiguration, term_text: &str) -> Result<DFA> {
    let automaton_builder = LEV_BUILDER
        .get(&(fuzzy_configuration.distance, fuzzy_configuration.transposition_cost_one))
        .ok_or_else(|| {
            InvalidArgument(format!(
                "Levenshtein distance of {} is not allowed. Choose a value in the {:?} range",
                fuzzy_configuration.distance, VALID_LEVENSHTEIN_DISTANCE_RANGE
            ))
        })?;
    if fuzzy_configuration.prefix {
        Ok(automaton_builder.build_prefix_dfa(term_text))
    } else {
        Ok(automaton_builder.build_dfa(term_text))
    }
}
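`FuzzyConfigurationBuilder` (generated by `derive_builder`) together with `FuzzyTermQuery::new_from_configuration` replaces the old positional constructor arguments. A minimal usage sketch mirroring the tests later in this changeset; the indexed documents and the expected hit count are illustrative assumptions:

```rust
use tantivy::collector::TopDocs;
use tantivy::query::{FuzzyConfigurationBuilder, FuzzyTermQuery};
use tantivy::schema::{Schema, TEXT};
use tantivy::{doc, Index, Term};

fn main() -> tantivy::Result<()> {
    let mut schema_builder = Schema::builder();
    let country = schema_builder.add_text_field("country", TEXT);
    let index = Index::create_in_ram(schema_builder.build());

    let mut writer = index.writer(3_000_000)?;
    writer.add_document(doc!(country => "japan"));
    writer.add_document(doc!(country => "korea"));
    writer.commit()?;

    // Match terms within one edit of "japon"; `prefix` and `exact_distance`
    // keep their defaults (false).
    let config = FuzzyConfigurationBuilder::default()
        .distance(1)
        .transposition_cost_one(true)
        .build()
        .unwrap();
    let term = Term::from_field_text(country, "japon");
    let query = FuzzyTermQuery::new_from_configuration(term, config);

    let searcher = index.reader()?.searcher();
    let top_docs = searcher.search(&query, &TopDocs::with_limit(2))?;
    assert_eq!(top_docs.len(), 1); // only "japan" is within distance 1
    Ok(())
}
```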
/// A Fuzzy Query matches all of the documents
/// containing a specific term that is within
/// a given Levenshtein distance of the query term.
|
||||
/// ```rust
|
||||
/// #[macro_use]
|
||||
/// extern crate tantivy;
|
||||
/// use tantivy::schema::{Schema, TEXT};
|
||||
/// use tantivy::{Index, Result, Term};
|
||||
/// use tantivy::collector::{Count, TopDocs};
|
||||
/// use tantivy::query::FuzzyTermQuery;
|
||||
/// use tantivy::schema::{Schema, TEXT};
|
||||
/// use tantivy::{doc, Index, Result, Term};
|
||||
///
|
||||
/// # fn main() { example().unwrap(); }
|
||||
/// fn example() -> Result<()> {
|
||||
@@ -37,32 +75,19 @@ static LEV_BUILDER: Lazy<HashMap<(u8, bool), LevenshteinAutomatonBuilder>> = Laz
|
||||
/// let index = Index::create_in_ram(schema);
|
||||
/// {
|
||||
/// let mut index_writer = index.writer(3_000_000)?;
|
||||
/// index_writer.add_document(doc!(
|
||||
/// title => "The Name of the Wind",
|
||||
/// ));
|
||||
/// index_writer.add_document(doc!(
|
||||
/// title => "The Diary of Muadib",
|
||||
/// ));
|
||||
/// index_writer.add_document(doc!(
|
||||
/// title => "A Dairy Cow",
|
||||
/// ));
|
||||
/// index_writer.add_document(doc!(
|
||||
/// title => "The Diary of a Young Girl",
|
||||
/// ));
|
||||
/// index_writer.add_document(doc!(title => "The Name of the Wind"));
|
||||
/// index_writer.add_document(doc!(title => "The Diary of Muadib"));
|
||||
/// index_writer.add_document(doc!(title => "A Dairy Cow"));
|
||||
/// index_writer.add_document(doc!(title => "The Diary of a Young Girl"));
|
||||
/// index_writer.commit().unwrap();
|
||||
/// }
|
||||
/// let reader = index.reader()?;
|
||||
/// let searcher = reader.searcher();
|
||||
///
|
||||
/// {
|
||||
///
|
||||
/// let term = Term::from_field_text(title, "Diary");
|
||||
/// let query = FuzzyTermQuery::new(term, 1, true);
|
||||
/// let (top_docs, count) = searcher.search(&query, &(TopDocs::with_limit(2), Count)).unwrap();
|
||||
/// assert_eq!(count, 2);
|
||||
/// assert_eq!(top_docs.len(), 2);
|
||||
/// }
|
||||
///
|
||||
/// let term = Term::from_field_text(title, "Diary");
|
||||
/// let query = FuzzyTermQuery::new(term, 1, true);
|
||||
/// let (top_docs, count) = searcher.search(&query, &(TopDocs::with_limit(2), Count)).unwrap();
|
||||
/// assert_eq!(count, 2);
|
||||
/// assert_eq!(top_docs.len(), 2);
|
||||
/// Ok(())
|
||||
/// }
|
||||
/// ```
|
||||
@@ -70,46 +95,58 @@ static LEV_BUILDER: Lazy<HashMap<(u8, bool), LevenshteinAutomatonBuilder>> = Laz
|
||||
pub struct FuzzyTermQuery {
|
||||
/// What term are we searching
|
||||
term: Term,
|
||||
/// How many changes are we going to allow
|
||||
distance: u8,
|
||||
/// Should a transposition cost 1 or 2?
|
||||
transposition_cost_one: bool,
|
||||
///
|
||||
prefix: bool,
|
||||
configuration: FuzzyConfiguration
|
||||
}
|
||||
|
||||
impl FuzzyTermQuery {
|
||||
pub fn new_from_configuration(term: Term, configuration: FuzzyConfiguration) -> FuzzyTermQuery {
|
||||
FuzzyTermQuery {
|
||||
term,
|
||||
configuration
|
||||
}
|
||||
}
|
||||
|
||||
/// Creates a new Fuzzy Query
|
||||
pub fn new(term: Term, distance: u8, transposition_cost_one: bool) -> FuzzyTermQuery {
|
||||
FuzzyTermQuery {
|
||||
term,
|
||||
distance,
|
||||
transposition_cost_one,
|
||||
prefix: false,
|
||||
configuration: FuzzyConfiguration {
|
||||
distance,
|
||||
transposition_cost_one,
|
||||
prefix: false,
|
||||
exact_distance: false
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Creates a new Fuzzy Query that only matches the prefix of the term
|
||||
pub fn new_prefix(term: Term, distance: u8, transposition_cost_one: bool) -> FuzzyTermQuery {
|
||||
FuzzyTermQuery {
|
||||
term,
|
||||
distance,
|
||||
transposition_cost_one,
|
||||
prefix: true,
|
||||
}
|
||||
}
|
||||
|
||||
fn specialized_weight(&self) -> Result<AutomatonWeight<DFA>> {
|
||||
let automaton = LEV_BUILDER.get(&(self.distance, false))
|
||||
.unwrap() // TODO return an error
|
||||
.build_dfa(self.term.text());
|
||||
Ok(AutomatonWeight::new(self.term.field(), automaton))
|
||||
}
|
||||
}
|
||||
|
||||
impl Query for FuzzyTermQuery {
|
||||
fn weight(&self, _searcher: &Searcher, _scoring_enabled: bool) -> Result<Box<dyn Weight>> {
|
||||
Ok(Box::new(self.specialized_weight()?))
|
||||
let dfa = build_dfa(&self.configuration, self.term.text())?;
|
||||
// TODO optimize for distance = 0 and possibly prefix
|
||||
if self.configuration.exact_distance {
|
||||
let target_distance = self.configuration.distance;
|
||||
let wrapped_dfa = WrappedDFA {
|
||||
dfa,
|
||||
condition: move |distance: Distance| distance == Distance::Exact(target_distance),
|
||||
};
|
||||
Ok(Box::new(AutomatonWeight::new(
|
||||
self.term.field(),
|
||||
wrapped_dfa,
|
||||
)))
|
||||
} else {
|
||||
let wrapped_dfa = WrappedDFA {
|
||||
dfa,
|
||||
condition: move |distance: Distance| match distance {
|
||||
Distance::Exact(_) => true,
|
||||
Distance::AtLeast(_) => false,
|
||||
},
|
||||
};
|
||||
Ok(Box::new(AutomatonWeight::new(
|
||||
self.term.field(),
|
||||
wrapped_dfa,
|
||||
)))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
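The two branches of `weight` above differ only in the acceptance condition attached to the `WrappedDFA`. A small sketch of those two conditions in isolation, using the `Distance` enum from `levenshtein_automata` (the sample distances are made up for illustration):

```rust
use levenshtein_automata::Distance;

fn main() {
    // `exact_distance == true`: accept only terms at exactly the requested distance.
    let target_distance = 2u8;
    let exact = move |d: Distance| d == Distance::Exact(target_distance);

    // `exact_distance == false`: accept any term for which the DFA reports an
    // exact distance, i.e. anything within the configured maximum.
    let at_most = |d: Distance| match d {
        Distance::Exact(_) => true,
        Distance::AtLeast(_) => false,
    };

    assert!(exact(Distance::Exact(2)));
    assert!(!exact(Distance::Exact(1)));
    assert!(at_most(Distance::Exact(1)));
    assert!(!at_most(Distance::AtLeast(3)));
}
```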
@@ -122,6 +159,7 @@ mod test {
|
||||
use crate::tests::assert_nearly_equals;
|
||||
use crate::Index;
|
||||
use crate::Term;
|
||||
use super::FuzzyConfigurationBuilder;
|
||||
|
||||
#[test]
|
||||
pub fn test_fuzzy_term() {
|
||||
@@ -143,7 +181,6 @@ mod test {
|
||||
let searcher = reader.searcher();
|
||||
{
|
||||
let term = Term::from_field_text(country_field, "japon");
|
||||
|
||||
let fuzzy_query = FuzzyTermQuery::new(term, 1, true);
|
||||
let top_docs = searcher
|
||||
.search(&fuzzy_query, &TopDocs::with_limit(2))
|
||||
@@ -152,5 +189,73 @@ mod test {
|
||||
let (score, _) = top_docs[0];
|
||||
assert_nearly_equals(1f32, score);
|
||||
}
|
||||
{
|
||||
let term = Term::from_field_text(country_field, "japon");
|
||||
let fuzzy_conf = FuzzyConfigurationBuilder::default()
|
||||
.distance(2)
|
||||
.exact_distance(true)
|
||||
.build()
|
||||
.unwrap();
|
||||
let fuzzy_query = FuzzyTermQuery::new_from_configuration(term, fuzzy_conf);
|
||||
let top_docs = searcher
|
||||
.search(&fuzzy_query, &TopDocs::with_limit(2))
|
||||
.unwrap();
|
||||
assert!(top_docs.is_empty());
|
||||
}
|
||||
{
|
||||
let term = Term::from_field_text(country_field, "japon");
|
||||
let fuzzy_conf = FuzzyConfigurationBuilder::default()
|
||||
.distance(1)
|
||||
.exact_distance(true)
|
||||
.build()
|
||||
.unwrap();
|
||||
let fuzzy_query = FuzzyTermQuery::new_from_configuration(term, fuzzy_conf);
|
||||
let top_docs = searcher
|
||||
.search(&fuzzy_query, &TopDocs::with_limit(2))
|
||||
.unwrap();
|
||||
assert_eq!(top_docs.len(), 1);
|
||||
}
|
||||
{
|
||||
let term = Term::from_field_text(country_field, "jpp");
|
||||
let fuzzy_conf = FuzzyConfigurationBuilder::default()
|
||||
.distance(1)
|
||||
.prefix(true)
|
||||
.build()
|
||||
.unwrap();
|
||||
let fuzzy_query = FuzzyTermQuery::new_from_configuration(term, fuzzy_conf);
|
||||
let top_docs = searcher
|
||||
.search(&fuzzy_query, &TopDocs::with_limit(2))
|
||||
.unwrap();
|
||||
assert_eq!(top_docs.len(), 1);
|
||||
}
|
||||
{
|
||||
let term = Term::from_field_text(country_field, "jpaan");
|
||||
let fuzzy_conf = FuzzyConfigurationBuilder::default()
|
||||
.distance(1)
|
||||
.exact_distance(true)
|
||||
.transposition_cost_one(true)
|
||||
.build()
|
||||
.unwrap();
|
||||
let fuzzy_query = FuzzyTermQuery::new_from_configuration(term, fuzzy_conf);
|
||||
let top_docs = searcher
|
||||
.search(&fuzzy_query, &TopDocs::with_limit(2))
|
||||
.unwrap();
|
||||
assert_eq!(top_docs.len(), 1);
|
||||
}
|
||||
{
|
||||
let term = Term::from_field_text(country_field, "jpaan");
|
||||
let fuzzy_conf = FuzzyConfigurationBuilder::default()
|
||||
.distance(2)
|
||||
.exact_distance(true)
|
||||
.transposition_cost_one(false)
|
||||
.build()
|
||||
.unwrap();
|
||||
let fuzzy_query = FuzzyTermQuery::new_from_configuration(term, fuzzy_conf);
|
||||
let top_docs = searcher
|
||||
.search(&fuzzy_query, &TopDocs::with_limit(2))
|
||||
.unwrap();
|
||||
assert_eq!(top_docs.len(), 1);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@@ -91,7 +91,6 @@ impl<TDocSet: DocSet, TOtherDocSet: DocSet> Intersection<TDocSet, TOtherDocSet>
|
||||
}
|
||||
|
||||
impl<TDocSet: DocSet, TOtherDocSet: DocSet> DocSet for Intersection<TDocSet, TOtherDocSet> {
|
||||
#[cfg_attr(feature = "cargo-clippy", allow(clippy::never_loop))]
|
||||
fn advance(&mut self) -> bool {
|
||||
let (left, right) = (&mut self.left, &mut self.right);
|
||||
|
||||
|
||||
@@ -12,7 +12,6 @@ mod exclude;
|
||||
mod explanation;
|
||||
mod fuzzy_query;
|
||||
mod intersection;
|
||||
mod occur;
|
||||
mod phrase_query;
|
||||
mod query;
|
||||
mod query_parser;
|
||||
@@ -41,9 +40,8 @@ pub use self::boolean_query::BooleanQuery;
|
||||
pub use self::empty_query::{EmptyQuery, EmptyScorer, EmptyWeight};
|
||||
pub use self::exclude::Exclude;
|
||||
pub use self::explanation::Explanation;
|
||||
pub use self::fuzzy_query::FuzzyTermQuery;
|
||||
pub use self::fuzzy_query::{FuzzyTermQuery, FuzzyConfiguration, FuzzyConfigurationBuilder};
|
||||
pub use self::intersection::intersect_scorers;
|
||||
pub use self::occur::Occur;
|
||||
pub use self::phrase_query::PhraseQuery;
|
||||
pub use self::query::Query;
|
||||
pub use self::query_parser::QueryParser;
|
||||
@@ -55,6 +53,7 @@ pub use self::scorer::ConstScorer;
|
||||
pub use self::scorer::Scorer;
|
||||
pub use self::term_query::TermQuery;
|
||||
pub use self::weight::Weight;
|
||||
pub use tantivy_query_grammar::Occur;
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
|
||||
@@ -10,13 +10,13 @@ pub use self::phrase_weight::PhraseWeight;
|
||||
mod tests {
|
||||
|
||||
use super::*;
|
||||
use crate::collector::tests::TestCollector;
|
||||
use crate::collector::tests::{TEST_COLLECTOR_WITHOUT_SCORE, TEST_COLLECTOR_WITH_SCORE};
|
||||
use crate::core::Index;
|
||||
use crate::error::TantivyError;
|
||||
use crate::schema::{Schema, Term, TEXT};
|
||||
use crate::tests::assert_nearly_equals;
|
||||
use crate::DocAddress;
|
||||
use crate::DocId;
|
||||
use crate::{DocAddress, DocSet};
|
||||
|
||||
fn create_index(texts: &[&'static str]) -> Index {
|
||||
let mut schema_builder = Schema::builder();
|
||||
@@ -53,7 +53,7 @@ mod tests {
|
||||
.collect();
|
||||
let phrase_query = PhraseQuery::new(terms);
|
||||
let test_fruits = searcher
|
||||
.search(&phrase_query, &TestCollector)
|
||||
.search(&phrase_query, &TEST_COLLECTOR_WITH_SCORE)
|
||||
.expect("search should succeed");
|
||||
test_fruits
|
||||
.docs()
|
||||
@@ -68,6 +68,64 @@ mod tests {
|
||||
assert!(test_query(vec!["g", "a"]).is_empty());
|
||||
}
|
||||
|
||||
#[test]
|
||||
pub fn test_phrase_query_no_score() {
|
||||
let index = create_index(&[
|
||||
"b b b d c g c",
|
||||
"a b b d c g c",
|
||||
"a b a b c",
|
||||
"c a b a d ga a",
|
||||
"a b c",
|
||||
]);
|
||||
let schema = index.schema();
|
||||
let text_field = schema.get_field("text").unwrap();
|
||||
let searcher = index.reader().unwrap().searcher();
|
||||
let test_query = |texts: Vec<&str>| {
|
||||
let terms: Vec<Term> = texts
|
||||
.iter()
|
||||
.map(|text| Term::from_field_text(text_field, text))
|
||||
.collect();
|
||||
let phrase_query = PhraseQuery::new(terms);
|
||||
let test_fruits = searcher
|
||||
.search(&phrase_query, &TEST_COLLECTOR_WITHOUT_SCORE)
|
||||
.expect("search should succeed");
|
||||
test_fruits
|
||||
.docs()
|
||||
.iter()
|
||||
.map(|docaddr| docaddr.1)
|
||||
.collect::<Vec<_>>()
|
||||
};
|
||||
assert_eq!(test_query(vec!["a", "b", "c"]), vec![2, 4]);
|
||||
assert_eq!(test_query(vec!["a", "b"]), vec![1, 2, 3, 4]);
|
||||
assert_eq!(test_query(vec!["b", "b"]), vec![0, 1]);
|
||||
assert!(test_query(vec!["g", "ewrwer"]).is_empty());
|
||||
assert!(test_query(vec!["g", "a"]).is_empty());
|
||||
}
|
||||
|
||||
#[test]
|
||||
pub fn test_phrase_count() {
|
||||
let index = create_index(&["a c", "a a b d a b c", " a b"]);
|
||||
let schema = index.schema();
|
||||
let text_field = schema.get_field("text").unwrap();
|
||||
let searcher = index.reader().unwrap().searcher();
|
||||
let phrase_query = PhraseQuery::new(vec![
|
||||
Term::from_field_text(text_field, "a"),
|
||||
Term::from_field_text(text_field, "b"),
|
||||
]);
|
||||
let phrase_weight = phrase_query.phrase_weight(&searcher, true).unwrap();
|
||||
let mut phrase_scorer = phrase_weight
|
||||
.phrase_scorer(searcher.segment_reader(0u32))
|
||||
.unwrap()
|
||||
.unwrap();
|
||||
assert!(phrase_scorer.advance());
|
||||
assert_eq!(phrase_scorer.doc(), 1);
|
||||
assert_eq!(phrase_scorer.phrase_count(), 2);
|
||||
assert!(phrase_scorer.advance());
|
||||
assert_eq!(phrase_scorer.doc(), 2);
|
||||
assert_eq!(phrase_scorer.phrase_count(), 1);
|
||||
assert!(!phrase_scorer.advance());
|
||||
}
|
||||
|
||||
#[test]
|
||||
pub fn test_phrase_query_no_positions() {
|
||||
let mut schema_builder = Schema::builder();
|
||||
@@ -93,17 +151,20 @@ mod tests {
|
||||
Term::from_field_text(text_field, "a"),
|
||||
Term::from_field_text(text_field, "b"),
|
||||
]);
|
||||
if let TantivyError::SchemaError(ref msg) = searcher
|
||||
.search(&phrase_query, &TestCollector)
|
||||
match searcher
|
||||
.search(&phrase_query, &TEST_COLLECTOR_WITH_SCORE)
|
||||
.map(|_| ())
|
||||
.unwrap_err()
|
||||
{
|
||||
assert_eq!(
|
||||
"Applied phrase query on field \"text\", which does not have positions indexed",
|
||||
msg.as_str()
|
||||
);
|
||||
} else {
|
||||
panic!("Should have returned an error");
|
||||
TantivyError::SchemaError(ref msg) => {
|
||||
assert_eq!(
|
||||
"Applied phrase query on field \"text\", which does not have positions indexed",
|
||||
msg.as_str()
|
||||
);
|
||||
}
|
||||
_ => {
|
||||
panic!("Should have returned an error");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -120,7 +181,7 @@ mod tests {
|
||||
.collect();
|
||||
let phrase_query = PhraseQuery::new(terms);
|
||||
searcher
|
||||
.search(&phrase_query, &TestCollector)
|
||||
.search(&phrase_query, &TEST_COLLECTOR_WITH_SCORE)
|
||||
.expect("search should succeed")
|
||||
.scores()
|
||||
.to_vec()
|
||||
@@ -152,7 +213,7 @@ mod tests {
|
||||
.collect();
|
||||
let phrase_query = PhraseQuery::new(terms);
|
||||
searcher
|
||||
.search(&phrase_query, &TestCollector)
|
||||
.search(&phrase_query, &TEST_COLLECTOR_WITH_SCORE)
|
||||
.expect("search should succeed")
|
||||
.docs()
|
||||
.to_vec()
|
||||
@@ -180,7 +241,7 @@ mod tests {
|
||||
.collect();
|
||||
let phrase_query = PhraseQuery::new_with_offset(terms);
|
||||
searcher
|
||||
.search(&phrase_query, &TestCollector)
|
||||
.search(&phrase_query, &TEST_COLLECTOR_WITH_SCORE)
|
||||
.expect("search should succeed")
|
||||
.docs()
|
||||
.iter()
|
||||
|
||||
@@ -72,13 +72,16 @@ impl PhraseQuery {
|
||||
.map(|(_, term)| term.clone())
|
||||
.collect::<Vec<Term>>()
|
||||
}
|
||||
}
|
||||
|
||||
impl Query for PhraseQuery {
|
||||
/// Create the weight associated to a query.
|
||||
/// Returns the `PhraseWeight` for the given phrase query given a specific `searcher`.
|
||||
///
|
||||
/// See [`Weight`](./trait.Weight.html).
|
||||
fn weight(&self, searcher: &Searcher, scoring_enabled: bool) -> Result<Box<dyn Weight>> {
|
||||
/// This function is the same as `.weight(...)` except it returns
|
||||
/// a specialized type `PhraseWeight` instead of a Boxed trait.
|
||||
pub(crate) fn phrase_weight(
|
||||
&self,
|
||||
searcher: &Searcher,
|
||||
scoring_enabled: bool,
|
||||
) -> Result<PhraseWeight> {
|
||||
let schema = searcher.schema();
|
||||
let field_entry = schema.get_field_entry(self.field);
|
||||
let has_positions = field_entry
|
||||
@@ -95,9 +98,20 @@ impl Query for PhraseQuery {
|
||||
}
|
||||
let terms = self.phrase_terms();
|
||||
let bm25_weight = BM25Weight::for_terms(searcher, &terms);
|
||||
Ok(PhraseWeight::new(
|
||||
self.phrase_terms.clone(),
|
||||
bm25_weight,
|
||||
scoring_enabled,
|
||||
))
|
||||
}
|
||||
}
|
||||
|
||||
let phrase_weight: PhraseWeight =
|
||||
PhraseWeight::new(self.phrase_terms.clone(), bm25_weight, scoring_enabled);
|
||||
impl Query for PhraseQuery {
|
||||
/// Create the weight associated to a query.
|
||||
///
|
||||
/// See [`Weight`](./trait.Weight.html).
|
||||
fn weight(&self, searcher: &Searcher, scoring_enabled: bool) -> Result<Box<dyn Weight>> {
|
||||
let phrase_weight = self.phrase_weight(searcher, scoring_enabled)?;
|
||||
Ok(Box::new(phrase_weight))
|
||||
}
|
||||
|
||||
|
||||
@@ -163,11 +163,9 @@ impl<TPostings: Postings> PhraseScorer<TPostings> {
|
||||
}
|
||||
|
||||
fn phrase_exists(&mut self) -> bool {
|
||||
{
|
||||
self.intersection_docset
|
||||
.docset_mut_specialized(0)
|
||||
.positions(&mut self.left);
|
||||
}
|
||||
self.intersection_docset
|
||||
.docset_mut_specialized(0)
|
||||
.positions(&mut self.left);
|
||||
let mut intersection_len = self.left.len();
|
||||
for i in 1..self.num_terms - 1 {
|
||||
{
|
||||
|
||||
@@ -37,7 +37,7 @@ impl PhraseWeight {
|
||||
reader.get_fieldnorms_reader(field)
|
||||
}
|
||||
|
||||
fn phrase_scorer(
|
||||
pub fn phrase_scorer(
|
||||
&self,
|
||||
reader: &SegmentReader,
|
||||
) -> Result<Option<PhraseScorer<SegmentPostings>>> {
|
||||
|
||||
@@ -18,7 +18,6 @@ pub enum LogicalLiteral {
|
||||
All,
|
||||
}
|
||||
|
||||
#[derive(Clone)]
|
||||
pub enum LogicalAST {
|
||||
Clause(Vec<(Occur, LogicalAST)>),
|
||||
Leaf(Box<LogicalLiteral>),
|
||||
|
||||
@@ -1,6 +1,4 @@
|
||||
mod query_grammar;
|
||||
mod query_parser;
|
||||
mod user_input_ast;
|
||||
|
||||
pub mod logical_ast;
|
||||
pub use self::query_parser::QueryParser;
|
||||
|
||||
@@ -1,285 +0,0 @@
|
||||
#![cfg_attr(feature = "cargo-clippy", allow(clippy::unneeded_field_pattern))]
|
||||
#![cfg_attr(feature = "cargo-clippy", allow(clippy::toplevel_ref_arg))]
|
||||
|
||||
use super::query_grammar;
|
||||
use super::user_input_ast::*;
|
||||
use crate::query::occur::Occur;
|
||||
use crate::query::query_parser::user_input_ast::UserInputBound;
|
||||
use combine::char::*;
|
||||
use combine::error::StreamError;
|
||||
use combine::stream::StreamErrorFor;
|
||||
use combine::*;
|
||||
|
||||
parser! {
|
||||
fn field[I]()(I) -> String
|
||||
where [I: Stream<Item = char>] {
|
||||
(
|
||||
letter(),
|
||||
many(satisfy(|c: char| c.is_alphanumeric() || c == '_')),
|
||||
).map(|(s1, s2): (char, String)| format!("{}{}", s1, s2))
|
||||
}
|
||||
}
|
||||
|
||||
parser! {
|
||||
fn word[I]()(I) -> String
|
||||
where [I: Stream<Item = char>] {
|
||||
many1(satisfy(char::is_alphanumeric))
|
||||
.and_then(|s: String| {
|
||||
match s.as_str() {
|
||||
"OR" => Err(StreamErrorFor::<I>::unexpected_static_message("OR")),
|
||||
"AND" => Err(StreamErrorFor::<I>::unexpected_static_message("AND")),
|
||||
"NOT" => Err(StreamErrorFor::<I>::unexpected_static_message("NOT")),
|
||||
_ => Ok(s)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
parser! {
|
||||
fn literal[I]()(I) -> UserInputLeaf
|
||||
where [I: Stream<Item = char>]
|
||||
{
|
||||
let term_val = || {
|
||||
let phrase = (char('"'), many1(satisfy(|c| c != '"')), char('"')).map(|(_, s, _)| s);
|
||||
phrase.or(word())
|
||||
};
|
||||
let term_val_with_field = negative_number().or(term_val());
|
||||
let term_query =
|
||||
(field(), char(':'), term_val_with_field).map(|(field_name, _, phrase)| UserInputLiteral {
|
||||
field_name: Some(field_name),
|
||||
phrase,
|
||||
});
|
||||
let term_default_field = term_val().map(|phrase| UserInputLiteral {
|
||||
field_name: None,
|
||||
phrase,
|
||||
});
|
||||
attempt(term_query)
|
||||
.or(term_default_field)
|
||||
.map(UserInputLeaf::from)
|
||||
}
|
||||
}
|
||||
|
||||
parser! {
|
||||
fn negative_number[I]()(I) -> String
|
||||
where [I: Stream<Item = char>]
|
||||
{
|
||||
(char('-'), many1(satisfy(char::is_numeric)))
|
||||
.map(|(s1, s2): (char, String)| format!("{}{}", s1, s2))
|
||||
}
|
||||
}
|
||||
|
||||
parser! {
|
||||
fn spaces1[I]()(I) -> ()
|
||||
where [I: Stream<Item = char>] {
|
||||
skip_many1(space())
|
||||
}
|
||||
}
|
||||
|
||||
parser! {
|
||||
fn range[I]()(I) -> UserInputLeaf
|
||||
where [I: Stream<Item = char>] {
|
||||
let term_val = || {
|
||||
word().or(negative_number()).or(char('*').map(|_| "*".to_string()))
|
||||
};
|
||||
let lower_bound = {
|
||||
let excl = (char('{'), term_val()).map(|(_, w)| UserInputBound::Exclusive(w));
|
||||
let incl = (char('['), term_val()).map(|(_, w)| UserInputBound::Inclusive(w));
|
||||
attempt(excl).or(incl)
|
||||
};
|
||||
let upper_bound = {
|
||||
let excl = (term_val(), char('}')).map(|(w, _)| UserInputBound::Exclusive(w));
|
||||
let incl = (term_val(), char(']')).map(|(w, _)| UserInputBound::Inclusive(w));
|
||||
attempt(excl).or(incl)
|
||||
};
|
||||
(
|
||||
optional((field(), char(':')).map(|x| x.0)),
|
||||
lower_bound,
|
||||
spaces(),
|
||||
string("TO"),
|
||||
spaces(),
|
||||
upper_bound,
|
||||
).map(|(field, lower, _, _, _, upper)| UserInputLeaf::Range {
|
||||
field,
|
||||
lower,
|
||||
upper
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
parser! {
|
||||
fn leaf[I]()(I) -> UserInputAST
|
||||
where [I: Stream<Item = char>] {
|
||||
(char('-'), leaf()).map(|(_, expr)| expr.unary(Occur::MustNot) )
|
||||
.or((char('+'), leaf()).map(|(_, expr)| expr.unary(Occur::Must) ))
|
||||
.or((char('('), parse_to_ast(), char(')')).map(|(_, expr, _)| expr))
|
||||
.or(char('*').map(|_| UserInputAST::from(UserInputLeaf::All) ))
|
||||
.or(attempt(
|
||||
(string("NOT"), spaces1(), leaf()).map(|(_, _, expr)| expr.unary(Occur::MustNot))
|
||||
)
|
||||
)
|
||||
.or(attempt(
|
||||
range().map(UserInputAST::from)
|
||||
)
|
||||
)
|
||||
.or(literal().map(|leaf| UserInputAST::Leaf(Box::new(leaf))))
|
||||
}
|
||||
}
|
||||
|
||||
enum BinaryOperand {
|
||||
Or,
|
||||
And,
|
||||
}
|
||||
|
||||
parser! {
|
||||
fn binary_operand[I]()(I) -> BinaryOperand
|
||||
where [I: Stream<Item = char>] {
|
||||
(spaces1(),
|
||||
(
|
||||
string("AND").map(|_| BinaryOperand::And)
|
||||
.or(string("OR").map(|_| BinaryOperand::Or))
|
||||
),
|
||||
spaces1()).map(|(_, op,_)| op)
|
||||
}
|
||||
}
|
||||
|
||||
enum Element {
|
||||
SingleEl(UserInputAST),
|
||||
NormalDisjunctive(Vec<Vec<UserInputAST>>),
|
||||
}
|
||||
|
||||
impl Element {
|
||||
pub fn into_dnf(self) -> Vec<Vec<UserInputAST>> {
|
||||
match self {
|
||||
Element::NormalDisjunctive(conjunctions) => conjunctions,
|
||||
Element::SingleEl(el) => vec![vec![el]],
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
parser! {
|
||||
pub fn parse_to_ast[I]()(I) -> UserInputAST
|
||||
where [I: Stream<Item = char>]
|
||||
{
|
||||
(
|
||||
attempt(
|
||||
chainl1(
|
||||
leaf().map(Element::SingleEl),
|
||||
binary_operand().map(|op: BinaryOperand|
|
||||
move |left: Element, right: Element| {
|
||||
let mut dnf = left.into_dnf();
|
||||
if let Element::SingleEl(el) = right {
|
||||
match op {
|
||||
BinaryOperand::And => {
|
||||
if let Some(last) = dnf.last_mut() {
|
||||
last.push(el);
|
||||
}
|
||||
}
|
||||
BinaryOperand::Or => {
|
||||
dnf.push(vec!(el));
|
||||
}
|
||||
}
|
||||
} else {
|
||||
unreachable!("Please report.")
|
||||
}
|
||||
Element::NormalDisjunctive(dnf)
|
||||
}
|
||||
)
|
||||
)
|
||||
.map(query_grammar::Element::into_dnf)
|
||||
.map(|fnd| {
|
||||
if fnd.len() == 1 {
|
||||
UserInputAST::and(fnd.into_iter().next().unwrap()) //< safe
|
||||
} else {
|
||||
let conjunctions = fnd
|
||||
.into_iter()
|
||||
.map(UserInputAST::and)
|
||||
.collect();
|
||||
UserInputAST::or(conjunctions)
|
||||
}
|
||||
})
|
||||
)
|
||||
.or(
|
||||
sep_by(leaf(), spaces())
|
||||
.map(|subqueries: Vec<UserInputAST>| {
|
||||
if subqueries.len() == 1 {
|
||||
subqueries.into_iter().next().unwrap()
|
||||
} else {
|
||||
UserInputAST::Clause(subqueries.into_iter().collect())
|
||||
}
|
||||
})
|
||||
)
|
||||
)
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod test {
|
||||
|
||||
use super::*;
|
||||
|
||||
fn test_parse_query_to_ast_helper(query: &str, expected: &str) {
|
||||
let query = parse_to_ast().parse(query).unwrap().0;
|
||||
let query_str = format!("{:?}", query);
|
||||
assert_eq!(query_str, expected);
|
||||
}
|
||||
|
||||
fn test_is_parse_err(query: &str) {
|
||||
assert!(parse_to_ast().parse(query).is_err());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_parse_query_to_ast_not_op() {
|
||||
assert_eq!(
|
||||
format!("{:?}", parse_to_ast().parse("NOT")),
|
||||
"Err(UnexpectedParse)"
|
||||
);
|
||||
test_parse_query_to_ast_helper("NOTa", "\"NOTa\"");
|
||||
test_parse_query_to_ast_helper("NOT a", "-(\"a\")");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_parse_query_to_ast_binary_op() {
|
||||
test_parse_query_to_ast_helper("a AND b", "(+(\"a\") +(\"b\"))");
|
||||
test_parse_query_to_ast_helper("a OR b", "(?(\"a\") ?(\"b\"))");
|
||||
test_parse_query_to_ast_helper("a OR b AND c", "(?(\"a\") ?((+(\"b\") +(\"c\"))))");
|
||||
test_parse_query_to_ast_helper("a AND b AND c", "(+(\"a\") +(\"b\") +(\"c\"))");
|
||||
assert_eq!(
|
||||
format!("{:?}", parse_to_ast().parse("a OR b aaa")),
|
||||
"Err(UnexpectedParse)"
|
||||
);
|
||||
assert_eq!(
|
||||
format!("{:?}", parse_to_ast().parse("a AND b aaa")),
|
||||
"Err(UnexpectedParse)"
|
||||
);
|
||||
assert_eq!(
|
||||
format!("{:?}", parse_to_ast().parse("aaa a OR b ")),
|
||||
"Err(UnexpectedParse)"
|
||||
);
|
||||
assert_eq!(
|
||||
format!("{:?}", parse_to_ast().parse("aaa ccc a OR b ")),
|
||||
"Err(UnexpectedParse)"
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_parse_query_to_ast() {
|
||||
test_parse_query_to_ast_helper("+(a b) +d", "(+((\"a\" \"b\")) +(\"d\"))");
|
||||
test_parse_query_to_ast_helper("(+a +b) d", "((+(\"a\") +(\"b\")) \"d\")");
|
||||
test_parse_query_to_ast_helper("(+a)", "+(\"a\")");
|
||||
test_parse_query_to_ast_helper("(+a +b)", "(+(\"a\") +(\"b\"))");
|
||||
test_parse_query_to_ast_helper("abc:toto", "abc:\"toto\"");
|
||||
test_parse_query_to_ast_helper("+abc:toto", "+(abc:\"toto\")");
|
||||
test_parse_query_to_ast_helper("(+abc:toto -titi)", "(+(abc:\"toto\") -(\"titi\"))");
|
||||
test_parse_query_to_ast_helper("-abc:toto", "-(abc:\"toto\")");
|
||||
test_parse_query_to_ast_helper("abc:a b", "(abc:\"a\" \"b\")");
|
||||
test_parse_query_to_ast_helper("abc:\"a b\"", "abc:\"a b\"");
|
||||
test_parse_query_to_ast_helper("foo:[1 TO 5]", "foo:[\"1\" TO \"5\"]");
|
||||
test_parse_query_to_ast_helper("[1 TO 5]", "[\"1\" TO \"5\"]");
|
||||
test_parse_query_to_ast_helper("foo:{a TO z}", "foo:{\"a\" TO \"z\"}");
|
||||
test_parse_query_to_ast_helper("foo:[1 TO toto}", "foo:[\"1\" TO \"toto\"}");
|
||||
test_parse_query_to_ast_helper("foo:[* TO toto}", "foo:[\"*\" TO \"toto\"}");
|
||||
test_parse_query_to_ast_helper("foo:[1 TO *}", "foo:[\"1\" TO \"*\"}");
|
||||
test_is_parse_err("abc + ");
|
||||
}
|
||||
}
|
||||
@@ -1,9 +1,5 @@
|
||||
use super::logical_ast::*;
|
||||
use super::query_grammar::parse_to_ast;
|
||||
use super::user_input_ast::*;
|
||||
use crate::core::Index;
|
||||
use crate::query::occur::compose_occur;
|
||||
use crate::query::query_parser::logical_ast::LogicalAST;
|
||||
use crate::query::AllQuery;
|
||||
use crate::query::BooleanQuery;
|
||||
use crate::query::EmptyQuery;
|
||||
@@ -16,41 +12,58 @@ use crate::schema::IndexRecordOption;
|
||||
use crate::schema::{Field, Schema};
|
||||
use crate::schema::{FieldType, Term};
|
||||
use crate::tokenizer::TokenizerManager;
|
||||
use combine::Parser;
|
||||
use std::borrow::Cow;
|
||||
use std::num::ParseIntError;
|
||||
use std::num::{ParseFloatError, ParseIntError};
|
||||
use std::ops::Bound;
|
||||
use std::str::FromStr;
|
||||
use tantivy_query_grammar::{UserInputAST, UserInputBound, UserInputLeaf};
|
||||
|
||||
/// Possible error that may happen when parsing a query.
|
||||
#[derive(Debug, PartialEq, Eq)]
|
||||
#[derive(Debug, PartialEq, Eq, Fail)]
|
||||
pub enum QueryParserError {
|
||||
/// Error in the query syntax
|
||||
#[fail(display = "Syntax Error")]
|
||||
SyntaxError,
|
||||
/// `FieldDoesNotExist(field_name: String)`
|
||||
/// The query references a field that is not in the schema
|
||||
#[fail(display = "File does not exists: '{:?}'", _0)]
|
||||
FieldDoesNotExist(String),
|
||||
/// The query contains a term for a `u64`-field, but the value
|
||||
/// is not a u64.
|
||||
/// The query contains a term for a `u64` or `i64`-field, but the value
|
||||
/// is neither.
|
||||
#[fail(display = "Expected a valid integer: '{:?}'", _0)]
|
||||
ExpectedInt(ParseIntError),
|
||||
/// The query contains a term for a `f64`-field, but the value
|
||||
/// is not a f64.
|
||||
#[fail(display = "Invalid query: Only excluding terms given")]
|
||||
ExpectedFloat(ParseFloatError),
|
||||
/// Queries that are only "excluding" (e.g. -title:pop) are forbidden.
|
||||
#[fail(display = "Invalid query: Only excluding terms given")]
|
||||
AllButQueryForbidden,
|
||||
/// If no default field is declared, running a query without any
|
||||
/// field specified is forbbidden.
|
||||
#[fail(display = "No default field declared and no field specified in query")]
|
||||
NoDefaultFieldDeclared,
|
||||
/// The field searched for is not declared
|
||||
/// as indexed in the schema.
|
||||
#[fail(display = "The field '{:?}' is not declared as indexed", _0)]
|
||||
FieldNotIndexed(String),
|
||||
/// A phrase query was requested for a field that does not
|
||||
/// have any positions indexed.
|
||||
#[fail(display = "The field '{:?}' does not have positions indexed", _0)]
|
||||
FieldDoesNotHavePositionsIndexed(String),
|
||||
/// The tokenizer for the given field is unknown
|
||||
/// The two argument strings are the name of the field, the name of the tokenizer
|
||||
#[fail(
|
||||
display = "The tokenizer '{:?}' for the field '{:?}' is unknown",
|
||||
_0, _1
|
||||
)]
|
||||
UnknownTokenizer(String, String),
|
||||
/// The query contains a range query with a phrase as one of the bounds.
|
||||
/// Only terms can be used as bounds.
|
||||
#[fail(display = "A range query cannot have a phrase as one of the bounds")]
|
||||
RangeMustNotHavePhrase,
|
||||
/// The format for the date field is not RFC 3339 compliant.
|
||||
#[fail(display = "The date field has an invalid format")]
|
||||
DateFormatError(chrono::ParseError),
|
||||
}
|
||||
|
||||
@@ -60,6 +73,12 @@ impl From<ParseIntError> for QueryParserError {
|
||||
}
|
||||
}
|
||||
|
||||
impl From<ParseFloatError> for QueryParserError {
|
||||
fn from(err: ParseFloatError) -> QueryParserError {
|
||||
QueryParserError::ExpectedFloat(err)
|
||||
}
|
||||
}
|
||||
|
||||
impl From<chrono::ParseError> for QueryParserError {
|
||||
fn from(err: chrono::ParseError) -> QueryParserError {
|
||||
QueryParserError::DateFormatError(err)
|
||||
@@ -199,9 +218,8 @@ impl QueryParser {
|
||||
|
||||
/// Parse the user query into an AST.
|
||||
fn parse_query_to_logical_ast(&self, query: &str) -> Result<LogicalAST, QueryParserError> {
|
||||
let (user_input_ast, _remaining) = parse_to_ast()
|
||||
.parse(query)
|
||||
.map_err(|_| QueryParserError::SyntaxError)?;
|
||||
let user_input_ast =
|
||||
tantivy_query_grammar::parse_query(query).map_err(|_| QueryParserError::SyntaxError)?;
|
||||
self.compute_logical_ast(user_input_ast)
|
||||
}
|
||||
|
||||
@@ -239,6 +257,11 @@ impl QueryParser {
|
||||
let term = Term::from_field_i64(field, val);
|
||||
Ok(vec![(0, term)])
|
||||
}
|
||||
FieldType::F64(_) => {
|
||||
let val: f64 = f64::from_str(phrase)?;
|
||||
let term = Term::from_field_f64(field, val);
|
||||
Ok(vec![(0, term)])
|
||||
}
|
||||
FieldType::Date(_) => match chrono::DateTime::parse_from_rfc3339(phrase) {
|
||||
Ok(x) => Ok(vec![(
|
||||
0,
|
||||
@@ -341,6 +364,7 @@ impl QueryParser {
|
||||
match *bound {
|
||||
UserInputBound::Inclusive(_) => Ok(Bound::Included(term)),
|
||||
UserInputBound::Exclusive(_) => Ok(Bound::Excluded(term)),
|
||||
UserInputBound::Unbounded => Ok(Bound::Unbounded),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -370,7 +394,7 @@ impl QueryParser {
|
||||
let mut logical_sub_queries: Vec<(Occur, LogicalAST)> = Vec::new();
|
||||
for sub_query in sub_queries {
|
||||
let (occur, sub_ast) = self.compute_logical_ast_with_occur(sub_query)?;
|
||||
let new_occur = compose_occur(default_occur, occur);
|
||||
let new_occur = Occur::compose(default_occur, occur);
|
||||
logical_sub_queries.push((new_occur, sub_ast));
|
||||
}
|
||||
Ok((Occur::Should, LogicalAST::Clause(logical_sub_queries)))
|
||||
@@ -378,7 +402,7 @@ impl QueryParser {
|
||||
UserInputAST::Unary(left_occur, subquery) => {
|
||||
let (right_occur, logical_sub_queries) =
|
||||
self.compute_logical_ast_with_occur(*subquery)?;
|
||||
Ok((compose_occur(left_occur, right_occur), logical_sub_queries))
|
||||
Ok((Occur::compose(left_occur, right_occur), logical_sub_queries))
|
||||
}
|
||||
UserInputAST::Leaf(leaf) => {
|
||||
let result_ast = self.compute_logical_ast_from_leaf(*leaf)?;
|
||||
@@ -529,6 +553,7 @@ mod test {
|
||||
schema_builder.add_text_field("nottokenized", STRING);
|
||||
schema_builder.add_text_field("with_stop_words", text_options);
|
||||
schema_builder.add_date_field("date", INDEXED);
|
||||
schema_builder.add_f64_field("float", INDEXED);
|
||||
let schema = schema_builder.build();
|
||||
let default_fields = vec![title, text];
|
||||
let tokenizer_manager = TokenizerManager::default();
|
||||
@@ -599,7 +624,7 @@ mod test {
|
||||
pub fn test_parse_query_untokenized() {
|
||||
test_parse_query_to_logical_ast_helper(
|
||||
"nottokenized:\"wordone wordtwo\"",
|
||||
"Term([0, 0, 0, 7, 119, 111, 114, 100, 111, 110, \
|
||||
"Term(field=7,bytes=[119, 111, 114, 100, 111, 110, \
|
||||
101, 32, 119, 111, 114, 100, 116, 119, 111])",
|
||||
false,
|
||||
);
|
||||
@@ -634,9 +659,16 @@ mod test {
|
||||
assert!(query_parser
|
||||
.parse_query("unsigned:\"18446744073709551615\"")
|
||||
.is_ok());
|
||||
assert!(query_parser.parse_query("float:\"3.1\"").is_ok());
|
||||
assert!(query_parser.parse_query("float:\"-2.4\"").is_ok());
|
||||
assert!(query_parser.parse_query("float:\"2.1.2\"").is_err());
|
||||
assert!(query_parser.parse_query("float:\"2.1a\"").is_err());
|
||||
assert!(query_parser
|
||||
.parse_query("float:\"18446744073709551615.0\"")
|
||||
.is_ok());
|
||||
test_parse_query_to_logical_ast_helper(
|
||||
"unsigned:2324",
|
||||
"Term([0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 9, 20])",
|
||||
"Term(field=3,bytes=[0, 0, 0, 0, 0, 0, 9, 20])",
|
||||
false,
|
||||
);
|
||||
|
||||
@@ -645,25 +677,31 @@ mod test {
|
||||
&format!("{:?}", Term::from_field_i64(Field(2u32), -2324)),
|
||||
false,
|
||||
);
|
||||
|
||||
test_parse_query_to_logical_ast_helper(
|
||||
"float:2.5",
|
||||
&format!("{:?}", Term::from_field_f64(Field(10u32), 2.5)),
|
||||
false,
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
pub fn test_parse_query_to_ast_disjunction() {
|
||||
pub fn test_parse_query_to_ast_single_term() {
|
||||
test_parse_query_to_logical_ast_helper(
|
||||
"title:toto",
|
||||
"Term([0, 0, 0, 0, 116, 111, 116, 111])",
|
||||
"Term(field=0,bytes=[116, 111, 116, 111])",
|
||||
false,
|
||||
);
|
||||
test_parse_query_to_logical_ast_helper(
|
||||
"+title:toto",
|
||||
"Term([0, 0, 0, 0, 116, 111, 116, 111])",
|
||||
"Term(field=0,bytes=[116, 111, 116, 111])",
|
||||
false,
|
||||
);
|
||||
test_parse_query_to_logical_ast_helper(
|
||||
"+title:toto -titi",
|
||||
"(+Term([0, 0, 0, 0, 116, 111, 116, 111]) \
|
||||
-(Term([0, 0, 0, 0, 116, 105, 116, 105]) \
|
||||
Term([0, 0, 0, 1, 116, 105, 116, 105])))",
|
||||
"(+Term(field=0,bytes=[116, 111, 116, 111]) \
|
||||
-(Term(field=0,bytes=[116, 105, 116, 105]) \
|
||||
Term(field=1,bytes=[116, 105, 116, 105])))",
|
||||
false,
|
||||
);
|
||||
assert_eq!(
|
||||
@@ -672,49 +710,67 @@ mod test {
|
||||
.unwrap(),
|
||||
QueryParserError::AllButQueryForbidden
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
pub fn test_parse_query_to_ast_two_terms() {
|
||||
test_parse_query_to_logical_ast_helper(
|
||||
"title:a b",
|
||||
"(Term([0, 0, 0, 0, 97]) (Term([0, 0, 0, 0, 98]) \
|
||||
Term([0, 0, 0, 1, 98])))",
|
||||
"(Term(field=0,bytes=[97]) (Term(field=0,bytes=[98]) Term(field=1,bytes=[98])))",
|
||||
false,
|
||||
);
|
||||
test_parse_query_to_logical_ast_helper(
|
||||
"title:\"a b\"",
|
||||
"\"[(0, Term([0, 0, 0, 0, 97])), \
|
||||
(1, Term([0, 0, 0, 0, 98]))]\"",
|
||||
"\"[(0, Term(field=0,bytes=[97])), \
|
||||
(1, Term(field=0,bytes=[98]))]\"",
|
||||
false,
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
pub fn test_parse_query_to_ast_ranges() {
|
||||
test_parse_query_to_logical_ast_helper(
|
||||
"title:[a TO b]",
|
||||
"(Included(Term([0, 0, 0, 0, 97])) TO \
|
||||
Included(Term([0, 0, 0, 0, 98])))",
|
||||
"(Included(Term(field=0,bytes=[97])) TO Included(Term(field=0,bytes=[98])))",
|
||||
false,
|
||||
);
|
||||
test_parse_query_to_logical_ast_helper(
|
||||
"[a TO b]",
|
||||
"((Included(Term([0, 0, 0, 0, 97])) TO \
|
||||
Included(Term([0, 0, 0, 0, 98]))) \
|
||||
(Included(Term([0, 0, 0, 1, 97])) TO \
|
||||
Included(Term([0, 0, 0, 1, 98]))))",
|
||||
"((Included(Term(field=0,bytes=[97])) TO \
|
||||
Included(Term(field=0,bytes=[98]))) \
|
||||
(Included(Term(field=1,bytes=[97])) TO \
|
||||
Included(Term(field=1,bytes=[98]))))",
|
||||
false,
|
||||
);
|
||||
test_parse_query_to_logical_ast_helper(
|
||||
"title:{titi TO toto}",
|
||||
"(Excluded(Term([0, 0, 0, 0, 116, 105, 116, 105])) TO \
|
||||
Excluded(Term([0, 0, 0, 0, 116, 111, 116, 111])))",
|
||||
"(Excluded(Term(field=0,bytes=[116, 105, 116, 105])) TO \
|
||||
Excluded(Term(field=0,bytes=[116, 111, 116, 111])))",
|
||||
false,
|
||||
);
|
||||
test_parse_query_to_logical_ast_helper(
|
||||
"title:{* TO toto}",
|
||||
"(Unbounded TO \
|
||||
Excluded(Term([0, 0, 0, 0, 116, 111, 116, 111])))",
|
||||
"(Unbounded TO Excluded(Term(field=0,bytes=[116, 111, 116, 111])))",
|
||||
false,
|
||||
);
|
||||
test_parse_query_to_logical_ast_helper(
|
||||
"title:{titi TO *}",
|
||||
"(Excluded(Term([0, 0, 0, 0, 116, 105, 116, 105])) TO Unbounded)",
|
||||
"(Excluded(Term(field=0,bytes=[116, 105, 116, 105])) TO Unbounded)",
|
||||
false,
|
||||
);
|
||||
test_parse_query_to_logical_ast_helper(
|
||||
"signed:{-5 TO 3}",
|
||||
"(Excluded(Term(field=2,bytes=[127, 255, 255, 255, 255, 255, 255, 251])) TO \
|
||||
Excluded(Term(field=2,bytes=[128, 0, 0, 0, 0, 0, 0, 3])))",
|
||||
false,
|
||||
);
|
||||
test_parse_query_to_logical_ast_helper(
|
||||
"float:{-1.5 TO 1.5}",
|
||||
"(Excluded(Term(field=10,bytes=[64, 7, 255, 255, 255, 255, 255, 255])) TO \
|
||||
Excluded(Term(field=10,bytes=[191, 248, 0, 0, 0, 0, 0, 0])))",
|
||||
false,
|
||||
);
|
||||
|
||||
test_parse_query_to_logical_ast_helper("*", "*", false);
|
||||
}
|
||||
|
||||
@@ -786,6 +842,11 @@ mod test {
|
||||
query_parser.parse_query("signed:18b"),
|
||||
Err(QueryParserError::ExpectedInt(_))
|
||||
);
|
||||
assert!(query_parser.parse_query("float:\"1.8\"").is_ok());
|
||||
assert_matches!(
|
||||
query_parser.parse_query("float:1.8a"),
|
||||
Err(QueryParserError::ExpectedFloat(_))
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
@@ -811,19 +872,19 @@ mod test {
|
||||
pub fn test_parse_query_to_ast_conjunction() {
|
||||
test_parse_query_to_logical_ast_helper(
|
||||
"title:toto",
|
||||
"Term([0, 0, 0, 0, 116, 111, 116, 111])",
|
||||
"Term(field=0,bytes=[116, 111, 116, 111])",
|
||||
true,
|
||||
);
|
||||
test_parse_query_to_logical_ast_helper(
|
||||
"+title:toto",
|
||||
"Term([0, 0, 0, 0, 116, 111, 116, 111])",
|
||||
"Term(field=0,bytes=[116, 111, 116, 111])",
|
||||
true,
|
||||
);
|
||||
test_parse_query_to_logical_ast_helper(
|
||||
"+title:toto -titi",
|
||||
"(+Term([0, 0, 0, 0, 116, 111, 116, 111]) \
|
||||
-(Term([0, 0, 0, 0, 116, 105, 116, 105]) \
|
||||
Term([0, 0, 0, 1, 116, 105, 116, 105])))",
|
||||
"(+Term(field=0,bytes=[116, 111, 116, 111]) \
|
||||
-(Term(field=0,bytes=[116, 105, 116, 105]) \
|
||||
Term(field=1,bytes=[116, 105, 116, 105])))",
|
||||
true,
|
||||
);
|
||||
assert_eq!(
|
||||
@@ -834,16 +895,25 @@ mod test {
|
||||
);
|
||||
test_parse_query_to_logical_ast_helper(
|
||||
"title:a b",
|
||||
"(+Term([0, 0, 0, 0, 97]) \
|
||||
+(Term([0, 0, 0, 0, 98]) \
|
||||
Term([0, 0, 0, 1, 98])))",
|
||||
"(+Term(field=0,bytes=[97]) \
|
||||
+(Term(field=0,bytes=[98]) \
|
||||
Term(field=1,bytes=[98])))",
|
||||
true,
|
||||
);
|
||||
test_parse_query_to_logical_ast_helper(
|
||||
"title:\"a b\"",
|
||||
"\"[(0, Term([0, 0, 0, 0, 97])), \
|
||||
(1, Term([0, 0, 0, 0, 98]))]\"",
|
||||
"\"[(0, Term(field=0,bytes=[97])), \
|
||||
(1, Term(field=0,bytes=[98]))]\"",
|
||||
true,
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
pub fn test_query_parser_hyphen() {
|
||||
test_parse_query_to_logical_ast_helper(
|
||||
"title:www-form-encoded",
|
||||
"\"[(0, Term(field=0,bytes=[119, 119, 119])), (1, Term(field=0,bytes=[102, 111, 114, 109])), (2, Term(field=0,bytes=[101, 110, 99, 111, 100, 101, 100]))]\"",
|
||||
false
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,44 +0,0 @@
|
||||
use std::sync::Arc;
|
||||
use stemmer;
|
||||
|
||||
|
||||
pub struct StemmerTokenStream<TailTokenStream>
|
||||
where TailTokenStream: TokenStream {
|
||||
tail: TailTokenStream,
|
||||
stemmer: Arc<stemmer::Stemmer>,
|
||||
}
|
||||
|
||||
impl<TailTokenStream> TokenStream for StemmerTokenStream<TailTokenStream>
|
||||
where TailTokenStream: TokenStream {
|
||||
|
||||
fn token(&self) -> &Token {
|
||||
self.tail.token()
|
||||
}
|
||||
|
||||
fn token_mut(&mut self) -> &mut Token {
|
||||
self.tail.token_mut()
|
||||
}
|
||||
|
||||
fn advance(&mut self) -> bool {
|
||||
if self.tail.advance() {
|
||||
// self.tail.token_mut().term.make_ascii_lowercase();
|
||||
let new_str = self.stemmer.stem_str(&self.token().term);
|
||||
true
|
||||
}
|
||||
else {
|
||||
false
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
impl<TailTokenStream> StemmerTokenStream<TailTokenStream>
|
||||
where TailTokenStream: TokenStream {
|
||||
|
||||
fn wrap(stemmer: Arc<stemmer::Stemmer>, tail: TailTokenStream) -> StemmerTokenStream<TailTokenStream> {
|
||||
StemmerTokenStream {
|
||||
tail,
|
||||
stemmer,
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -38,14 +38,10 @@ fn map_bound<TFrom, TTo, Transform: Fn(&TFrom) -> TTo>(
|
||||
/// # Example
|
||||
///
|
||||
/// ```rust
|
||||
///
|
||||
/// # #[macro_use]
|
||||
/// # extern crate tantivy;
|
||||
/// # use tantivy::Index;
|
||||
/// # use tantivy::schema::{Schema, INDEXED};
|
||||
/// # use tantivy::collector::Count;
|
||||
/// # use tantivy::Result;
|
||||
/// # use tantivy::query::RangeQuery;
|
||||
/// # use tantivy::schema::{Schema, INDEXED};
|
||||
/// # use tantivy::{doc, Index, Result};
|
||||
/// #
|
||||
/// # fn run() -> Result<()> {
|
||||
/// # let mut schema_builder = Schema::builder();
|
||||
@@ -142,6 +138,39 @@ impl RangeQuery {
|
||||
}
|
||||
}
|
||||
|
||||
/// Creates a new `RangeQuery` over a `f64` field.
|
||||
///
|
||||
/// If the field is not of the type `f64`, tantivy
|
||||
/// will panic when the `Weight` object is created.
|
||||
pub fn new_f64(field: Field, range: Range<f64>) -> RangeQuery {
|
||||
RangeQuery::new_f64_bounds(
|
||||
field,
|
||||
Bound::Included(range.start),
|
||||
Bound::Excluded(range.end),
|
||||
)
|
||||
}
|
||||
|
||||
/// Create a new `RangeQuery` over a `f64` field.
|
||||
///
|
||||
/// The two `Bound` arguments make it possible to create more complex
|
||||
/// ranges than a semi-inclusive range.
|
||||
///
|
||||
/// If the field is not of the type `f64`, tantivy
|
||||
/// will panic when the `Weight` object is created.
|
||||
pub fn new_f64_bounds(
|
||||
field: Field,
|
||||
left_bound: Bound<f64>,
|
||||
right_bound: Bound<f64>,
|
||||
) -> RangeQuery {
|
||||
let make_term_val = |val: &f64| Term::from_field_f64(field, *val).value_bytes().to_owned();
|
||||
RangeQuery {
|
||||
field,
|
||||
value_type: Type::F64,
|
||||
left_bound: map_bound(&left_bound, &make_term_val),
|
||||
right_bound: map_bound(&right_bound, &make_term_val),
|
||||
}
|
||||
}
|
||||
|
||||
/// Create a new `RangeQuery` over a `u64` field.
|
||||
///
|
||||
/// The two `Bound` arguments make it possible to create more complex
|
||||
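For orientation, here is a compact usage sketch of the two `f64` constructors introduced above, in the style of the `u64` doc example; the field name, heap size, and expected counts are illustrative and follow from the loop below, not from the diff:

```rust
use std::collections::Bound;
use tantivy::collector::Count;
use tantivy::query::RangeQuery;
use tantivy::schema::{Schema, INDEXED};
use tantivy::{doc, Index, Result};

fn main() -> Result<()> {
    let mut schema_builder = Schema::builder();
    let weight = schema_builder.add_f64_field("weight_kg", INDEXED);
    let index = Index::create_in_ram(schema_builder.build());

    let mut index_writer = index.writer_with_num_threads(1, 6_000_000)?;
    for i in 0..100u64 {
        index_writer.add_document(doc!(weight => i as f64));
    }
    index_writer.commit()?;

    let searcher = index.reader()?.searcher();
    // Semi-open range [10.0, 20.0): matches the ten documents 10.0..=19.0.
    let docs = RangeQuery::new_f64(weight, 10.0..20.0);
    assert_eq!(searcher.search(&docs, &Count)?, 10);
    // Fully inclusive bounds [10.0, 20.0]: one extra document (20.0).
    let docs = RangeQuery::new_f64_bounds(weight, Bound::Included(10.0), Bound::Included(20.0));
    assert_eq!(searcher.search(&docs, &Count)?, 11);
    Ok(())
}
```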
@@ -305,39 +334,33 @@ mod tests {
|
||||
use crate::collector::Count;
|
||||
use crate::schema::{Document, Field, Schema, INDEXED};
|
||||
use crate::Index;
|
||||
use crate::Result;
|
||||
use std::collections::Bound;
|
||||
|
||||
#[test]
|
||||
fn test_range_query_simple() {
|
||||
fn run() -> Result<()> {
|
||||
let mut schema_builder = Schema::builder();
|
||||
let year_field = schema_builder.add_u64_field("year", INDEXED);
|
||||
let schema = schema_builder.build();
|
||||
let mut schema_builder = Schema::builder();
|
||||
let year_field = schema_builder.add_u64_field("year", INDEXED);
|
||||
let schema = schema_builder.build();
|
||||
|
||||
let index = Index::create_in_ram(schema);
|
||||
{
|
||||
let mut index_writer = index.writer_with_num_threads(1, 6_000_000).unwrap();
|
||||
for year in 1950u64..2017u64 {
|
||||
let num_docs_within_year = 10 + (year - 1950) * (year - 1950);
|
||||
for _ in 0..num_docs_within_year {
|
||||
index_writer.add_document(doc!(year_field => year));
|
||||
}
|
||||
let index = Index::create_in_ram(schema);
|
||||
{
|
||||
let mut index_writer = index.writer_with_num_threads(1, 6_000_000).unwrap();
|
||||
for year in 1950u64..2017u64 {
|
||||
let num_docs_within_year = 10 + (year - 1950) * (year - 1950);
|
||||
for _ in 0..num_docs_within_year {
|
||||
index_writer.add_document(doc!(year_field => year));
|
||||
}
|
||||
index_writer.commit().unwrap();
|
||||
}
|
||||
let reader = index.reader().unwrap();
|
||||
let searcher = reader.searcher();
|
||||
|
||||
let docs_in_the_sixties = RangeQuery::new_u64(year_field, 1960u64..1970u64);
|
||||
|
||||
// ... or `1960..=1969` if inclusive range is enabled.
|
||||
let count = searcher.search(&docs_in_the_sixties, &Count)?;
|
||||
assert_eq!(count, 2285);
|
||||
Ok(())
|
||||
index_writer.commit().unwrap();
|
||||
}
|
||||
let reader = index.reader().unwrap();
|
||||
let searcher = reader.searcher();
|
||||
|
||||
run().unwrap();
|
||||
let docs_in_the_sixties = RangeQuery::new_u64(year_field, 1960u64..1970u64);
|
||||
|
||||
// ... or `1960..=1969` if inclusive range is enabled.
|
||||
let count = searcher.search(&docs_in_the_sixties, &Count).unwrap();
|
||||
assert_eq!(count, 2285);
|
||||
}
|
||||
|
||||
#[test]
|
||||
@@ -397,4 +420,64 @@ mod tests {
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_range_float() {
|
||||
let float_field: Field;
|
||||
let schema = {
|
||||
let mut schema_builder = Schema::builder();
|
||||
float_field = schema_builder.add_f64_field("floatfield", INDEXED);
|
||||
schema_builder.build()
|
||||
};
|
||||
|
||||
let index = Index::create_in_ram(schema);
|
||||
{
|
||||
let mut index_writer = index.writer_with_num_threads(2, 6_000_000).unwrap();
|
||||
|
||||
for i in 1..100 {
|
||||
let mut doc = Document::new();
|
||||
for j in 1..100 {
|
||||
if i % j == 0 {
|
||||
doc.add_f64(float_field, j as f64);
|
||||
}
|
||||
}
|
||||
index_writer.add_document(doc);
|
||||
}
|
||||
|
||||
index_writer.commit().unwrap();
|
||||
}
|
||||
let reader = index.reader().unwrap();
|
||||
let searcher = reader.searcher();
|
||||
let count_multiples =
|
||||
|range_query: RangeQuery| searcher.search(&range_query, &Count).unwrap();
|
||||
|
||||
assert_eq!(
|
||||
count_multiples(RangeQuery::new_f64(float_field, 10.0..11.0)),
|
||||
9
|
||||
);
|
||||
assert_eq!(
|
||||
count_multiples(RangeQuery::new_f64_bounds(
|
||||
float_field,
|
||||
Bound::Included(10.0),
|
||||
Bound::Included(11.0)
|
||||
)),
|
||||
18
|
||||
);
|
||||
assert_eq!(
|
||||
count_multiples(RangeQuery::new_f64_bounds(
|
||||
float_field,
|
||||
Bound::Excluded(9.0),
|
||||
Bound::Included(10.0)
|
||||
)),
|
||||
9
|
||||
);
|
||||
assert_eq!(
|
||||
count_multiples(RangeQuery::new_f64_bounds(
|
||||
float_field,
|
||||
Bound::Included(9.0),
|
||||
Bound::Unbounded
|
||||
)),
|
||||
91
|
||||
);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@@ -4,22 +4,18 @@ use crate::schema::Field;
|
||||
use crate::Result;
|
||||
use crate::Searcher;
|
||||
use std::clone::Clone;
|
||||
use std::sync::Arc;
|
||||
use tantivy_fst::Regex;
|
||||
|
||||
// A Regex Query matches all of the documents
|
||||
/// A Regex Query matches all of the documents
|
||||
/// containing a specific term that matches
|
||||
/// a regex pattern
|
||||
/// A Fuzzy Query matches all of the documents
|
||||
/// containing a specific term that is within
|
||||
/// Levenshtein distance
|
||||
/// a regex pattern.
|
||||
///
|
||||
/// ```rust
|
||||
/// #[macro_use]
|
||||
/// extern crate tantivy;
|
||||
/// use tantivy::schema::{Schema, TEXT};
|
||||
/// use tantivy::{Index, Result, Term};
|
||||
/// use tantivy::collector::Count;
|
||||
/// use tantivy::query::RegexQuery;
|
||||
/// use tantivy::schema::{Schema, TEXT};
|
||||
/// use tantivy::{doc, Index, Result, Term};
|
||||
///
|
||||
/// # fn main() { example().unwrap(); }
|
||||
/// fn example() -> Result<()> {
|
||||
@@ -48,7 +44,7 @@ use tantivy_fst::Regex;
|
||||
/// let searcher = reader.searcher();
|
||||
///
|
||||
/// let term = Term::from_field_text(title, "Diary");
|
||||
/// let query = RegexQuery::new("d[ai]{2}ry".to_string(), title);
|
||||
/// let query = RegexQuery::from_pattern("d[ai]{2}ry", title)?;
|
||||
/// let count = searcher.search(&query, &Count)?;
|
||||
/// assert_eq!(count, 3);
|
||||
/// Ok(())
|
||||
@@ -56,30 +52,34 @@ use tantivy_fst::Regex;
|
||||
/// ```
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct RegexQuery {
|
||||
regex_pattern: String,
|
||||
regex: Arc<Regex>,
|
||||
field: Field,
|
||||
}
|
||||
|
||||
impl RegexQuery {
|
||||
/// Creates a new Fuzzy Query
|
||||
pub fn new(regex_pattern: String, field: Field) -> RegexQuery {
|
||||
/// Creates a new RegexQuery from a given pattern
|
||||
pub fn from_pattern(regex_pattern: &str, field: Field) -> Result<Self> {
|
||||
let regex = Regex::new(®ex_pattern)
|
||||
.map_err(|_| TantivyError::InvalidArgument(regex_pattern.to_string()))?;
|
||||
Ok(RegexQuery::from_regex(regex, field))
|
||||
}
|
||||
|
||||
/// Creates a new RegexQuery from a fully built Regex
|
||||
pub fn from_regex<T: Into<Arc<Regex>>>(regex: T, field: Field) -> Self {
|
||||
RegexQuery {
|
||||
regex_pattern,
|
||||
regex: regex.into(),
|
||||
field,
|
||||
}
|
||||
}
|
||||
|
||||
fn specialized_weight(&self) -> Result<AutomatonWeight<Regex>> {
|
||||
let automaton = Regex::new(&self.regex_pattern)
|
||||
.map_err(|_| TantivyError::InvalidArgument(self.regex_pattern.clone()))?;
|
||||
|
||||
Ok(AutomatonWeight::new(self.field, automaton))
|
||||
fn specialized_weight(&self) -> AutomatonWeight<Regex> {
|
||||
AutomatonWeight::new(self.field, self.regex.clone())
|
||||
}
|
||||
}
|
||||
|
||||
impl Query for RegexQuery {
|
||||
fn weight(&self, _searcher: &Searcher, _scoring_enabled: bool) -> Result<Box<dyn Weight>> {
|
||||
Ok(Box::new(self.specialized_weight()?))
|
||||
Ok(Box::new(self.specialized_weight()))
|
||||
}
|
||||
}
|
||||
|
||||
@@ -87,13 +87,14 @@ impl Query for RegexQuery {
|
||||
mod test {
|
||||
use super::RegexQuery;
|
||||
use crate::collector::TopDocs;
|
||||
use crate::schema::Schema;
|
||||
use crate::schema::TEXT;
|
||||
use crate::schema::{Field, Schema};
|
||||
use crate::tests::assert_nearly_equals;
|
||||
use crate::Index;
|
||||
use crate::{Index, IndexReader};
|
||||
use std::sync::Arc;
|
||||
use tantivy_fst::Regex;
|
||||
|
||||
#[test]
|
||||
pub fn test_regex_query() {
|
||||
fn build_test_index() -> (IndexReader, Field) {
|
||||
let mut schema_builder = Schema::builder();
|
||||
let country_field = schema_builder.add_text_field("country", TEXT);
|
||||
let schema = schema_builder.build();
|
||||
@@ -109,20 +110,65 @@ mod test {
|
||||
index_writer.commit().unwrap();
|
||||
}
|
||||
let reader = index.reader().unwrap();
|
||||
|
||||
(reader, country_field)
|
||||
}
|
||||
|
||||
fn verify_regex_query(
|
||||
query_matching_one: RegexQuery,
|
||||
query_matching_zero: RegexQuery,
|
||||
reader: IndexReader,
|
||||
) {
|
||||
let searcher = reader.searcher();
|
||||
{
|
||||
let regex_query = RegexQuery::new("jap[ao]n".to_string(), country_field);
|
||||
let scored_docs = searcher
|
||||
.search(®ex_query, &TopDocs::with_limit(2))
|
||||
.search(&query_matching_one, &TopDocs::with_limit(2))
|
||||
.unwrap();
|
||||
assert_eq!(scored_docs.len(), 1, "Expected only 1 document");
|
||||
let (score, _) = scored_docs[0];
|
||||
assert_nearly_equals(1f32, score);
|
||||
}
|
||||
let regex_query = RegexQuery::new("jap[A-Z]n".to_string(), country_field);
|
||||
let top_docs = searcher
|
||||
.search(®ex_query, &TopDocs::with_limit(2))
|
||||
.search(&query_matching_zero, &TopDocs::with_limit(2))
|
||||
.unwrap();
|
||||
assert!(top_docs.is_empty(), "Expected ZERO document");
|
||||
}
|
||||
|
||||
#[test]
|
||||
pub fn test_regex_query() {
|
||||
let (reader, field) = build_test_index();
|
||||
|
||||
let matching_one = RegexQuery::from_pattern("jap[ao]n", field).unwrap();
|
||||
let matching_zero = RegexQuery::from_pattern("jap[A-Z]n", field).unwrap();
|
||||
|
||||
verify_regex_query(matching_one, matching_zero, reader);
|
||||
}
|
||||
|
||||
#[test]
|
||||
pub fn test_construct_from_regex() {
|
||||
let (reader, field) = build_test_index();
|
||||
|
||||
let matching_one = RegexQuery::from_regex(Regex::new("jap[ao]n").unwrap(), field);
|
||||
let matching_zero = RegexQuery::from_regex(Regex::new("jap[A-Z]n").unwrap(), field);
|
||||
|
||||
verify_regex_query(matching_one, matching_zero, reader);
|
||||
}
|
||||
|
||||
#[test]
|
||||
pub fn test_construct_from_reused_regex() {
|
||||
let r1 = Arc::new(Regex::new("jap[ao]n").unwrap());
|
||||
let r2 = Arc::new(Regex::new("jap[A-Z]n").unwrap());
|
||||
|
||||
let (reader, field) = build_test_index();
|
||||
|
||||
let matching_one = RegexQuery::from_regex(r1.clone(), field);
|
||||
let matching_zero = RegexQuery::from_regex(r2.clone(), field);
|
||||
|
||||
verify_regex_query(matching_one, matching_zero, reader.clone());
|
||||
|
||||
let matching_one = RegexQuery::from_regex(r1.clone(), field);
|
||||
let matching_zero = RegexQuery::from_regex(r2.clone(), field);
|
||||
|
||||
verify_regex_query(matching_one, matching_zero, reader.clone());
|
||||
}
|
||||
}
|
||||
|
||||
@@ -12,7 +12,7 @@ mod tests {
|
||||
use crate::collector::TopDocs;
|
||||
use crate::docset::DocSet;
|
||||
use crate::query::{Query, QueryParser, Scorer, TermQuery};
|
||||
use crate::schema::{IndexRecordOption, Schema, STRING, TEXT};
|
||||
use crate::schema::{Field, IndexRecordOption, Schema, STRING, TEXT};
|
||||
use crate::tests::assert_nearly_equals;
|
||||
use crate::Index;
|
||||
use crate::Term;
|
||||
@@ -114,4 +114,16 @@ mod tests {
|
||||
let reader = index.reader().unwrap();
|
||||
assert_eq!(term_query.count(&*reader.searcher()).unwrap(), 1);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_term_query_debug() {
|
||||
let term_query = TermQuery::new(
|
||||
Term::from_field_text(Field(1), "hello"),
|
||||
IndexRecordOption::WithFreqs,
|
||||
);
|
||||
assert_eq!(
|
||||
format!("{:?}", term_query),
|
||||
"TermQuery(Term(field=1,bytes=[104, 101, 108, 108, 111]))"
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -7,6 +7,7 @@ use crate::Result;
|
||||
use crate::Searcher;
|
||||
use crate::Term;
|
||||
use std::collections::BTreeSet;
|
||||
use std::fmt;
|
||||
|
||||
/// A Term query matches all of the documents
|
||||
/// containing a specific term.
|
||||
@@ -19,12 +20,10 @@ use std::collections::BTreeSet;
|
||||
/// * `field norm` - number of tokens in the field.
|
||||
///
|
||||
/// ```rust
|
||||
/// #[macro_use]
|
||||
/// extern crate tantivy;
|
||||
/// use tantivy::schema::{Schema, TEXT, IndexRecordOption};
|
||||
/// use tantivy::{Index, Result, Term};
|
||||
/// use tantivy::collector::{Count, TopDocs};
|
||||
/// use tantivy::query::TermQuery;
|
||||
/// use tantivy::schema::{Schema, TEXT, IndexRecordOption};
|
||||
/// use tantivy::{doc, Index, Result, Term};
|
||||
///
|
||||
/// # fn main() { example().unwrap(); }
|
||||
/// fn example() -> Result<()> {
|
||||
@@ -61,12 +60,18 @@ use std::collections::BTreeSet;
|
||||
/// Ok(())
|
||||
/// }
|
||||
/// ```
|
||||
#[derive(Clone, Debug)]
|
||||
#[derive(Clone)]
|
||||
pub struct TermQuery {
|
||||
term: Term,
|
||||
index_record_option: IndexRecordOption,
|
||||
}
|
||||
|
||||
impl fmt::Debug for TermQuery {
|
||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||
write!(f, "TermQuery({:?})", self.term)
|
||||
}
|
||||
}
|
||||
|
||||
impl TermQuery {
|
||||
/// Creates a new term query.
|
||||
pub fn new(term: Term, segment_postings_options: IndexRecordOption) -> TermQuery {
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
mod pool;
|
||||
|
||||
use self::pool::{LeasedItem, Pool};
|
||||
pub use self::pool::LeasedItem;
|
||||
use self::pool::Pool;
|
||||
use crate::core::Segment;
|
||||
use crate::directory::Directory;
|
||||
use crate::directory::WatchHandle;
|
||||
@@ -85,7 +86,10 @@ impl IndexReaderBuilder {
|
||||
);
|
||||
}
|
||||
};
|
||||
let watch_handle = inner_reader_arc.index.directory().watch(Box::new(callback));
|
||||
let watch_handle = inner_reader_arc
|
||||
.index
|
||||
.directory()
|
||||
.watch(Box::new(callback))?;
|
||||
watch_handle_opt = Some(watch_handle);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -123,6 +123,10 @@ impl<T> Pool<T> {
|
||||
}
|
||||
}
|
||||
|
||||
/// A LeasedItem holds an object borrowed from a Pool.
|
||||
///
|
||||
/// Upon drop, the object is automatically returned
|
||||
/// into the pool.
|
||||
pub struct LeasedItem<T> {
|
||||
gen_item: Option<GenerationItem<T>>,
|
||||
recycle_queue: Arc<Queue<GenerationItem<T>>>,
|
||||
|
||||
@@ -88,6 +88,11 @@ impl Document {
|
||||
self.add(FieldValue::new(field, Value::I64(value)));
|
||||
}
|
||||
|
||||
/// Add a f64 field
|
||||
pub fn add_f64(&mut self, field: Field, value: f64) {
|
||||
self.add(FieldValue::new(field, Value::F64(value)));
|
||||
}
|
||||
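A minimal sketch of the new `add_f64` helper in isolation; the schema, field name, and value are illustrative:

```rust
use tantivy::schema::{Document, Schema, STORED};

fn main() {
    let mut schema_builder = Schema::builder();
    let price = schema_builder.add_f64_field("price", STORED);
    let _schema = schema_builder.build();

    let mut doc = Document::new();
    doc.add_f64(price, 19.99);
    // `f64_value` is the accessor added further down in this diff.
    assert_eq!(doc.get_first(price).unwrap().f64_value(), 19.99);
}
```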
|
||||
/// Add a date field
|
||||
pub fn add_date(&mut self, field: Field, value: &DateTime) {
|
||||
self.add(FieldValue::new(field, Value::Date(*value)));
|
||||
|
||||
@@ -117,6 +117,11 @@ impl Facet {
|
||||
&& other_str.starts_with(self_str)
|
||||
&& other_str.as_bytes()[self_str.len()] == FACET_SEP_BYTE
|
||||
}
|
||||
|
||||
/// Extract path from the `Facet`.
|
||||
pub fn to_path(&self) -> Vec<&str> {
|
||||
self.encoded_str().split(|c| c == FACET_SEP_CHAR).collect()
|
||||
}
|
||||
}
|
||||
|
||||
impl Borrow<str> for Facet {
|
||||
@@ -254,4 +259,10 @@ mod tests {
|
||||
assert_eq!(format!("{:?}", facet), "Facet(/first/second/third)");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_to_path() {
|
||||
let v = ["first", "second", "third\\/not_fourth"];
|
||||
let facet = Facet::from_path(v.iter());
|
||||
assert_eq!(facet.to_path(), v);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -48,6 +48,15 @@ impl FieldEntry {
|
||||
}
|
||||
}
|
||||
|
||||
/// Creates a new f64 field entry in the schema, given
|
||||
/// a name, and some options.
|
||||
pub fn new_f64(field_name: String, field_type: IntOptions) -> FieldEntry {
|
||||
FieldEntry {
|
||||
name: field_name,
|
||||
field_type: FieldType::F64(field_type),
|
||||
}
|
||||
}
|
||||
|
||||
/// Creates a new date field entry in the schema, given
|
||||
/// a name, and some options.
|
||||
pub fn new_date(field_name: String, field_type: IntOptions) -> FieldEntry {
|
||||
@@ -89,6 +98,7 @@ impl FieldEntry {
|
||||
FieldType::Str(ref options) => options.get_indexing_options().is_some(),
|
||||
FieldType::U64(ref options)
|
||||
| FieldType::I64(ref options)
|
||||
| FieldType::F64(ref options)
|
||||
| FieldType::Date(ref options) => options.is_indexed(),
|
||||
FieldType::HierarchicalFacet => true,
|
||||
FieldType::Bytes => false,
|
||||
@@ -98,7 +108,9 @@ impl FieldEntry {
|
||||
/// Returns true iff the field is a int (signed or unsigned) fast field
|
||||
pub fn is_int_fast(&self) -> bool {
|
||||
match self.field_type {
|
||||
FieldType::U64(ref options) | FieldType::I64(ref options) => options.is_fast(),
|
||||
FieldType::U64(ref options)
|
||||
| FieldType::I64(ref options)
|
||||
| FieldType::F64(ref options) => options.is_fast(),
|
||||
_ => false,
|
||||
}
|
||||
}
|
||||
@@ -108,6 +120,7 @@ impl FieldEntry {
|
||||
match self.field_type {
|
||||
FieldType::U64(ref options)
|
||||
| FieldType::I64(ref options)
|
||||
| FieldType::F64(ref options)
|
||||
| FieldType::Date(ref options) => options.is_stored(),
|
||||
FieldType::Str(ref options) => options.is_stored(),
|
||||
// TODO make stored hierarchical facet optional
|
||||
@@ -138,6 +151,10 @@ impl Serialize for FieldEntry {
|
||||
s.serialize_field("type", "i64")?;
|
||||
s.serialize_field("options", options)?;
|
||||
}
|
||||
FieldType::F64(ref options) => {
|
||||
s.serialize_field("type", "f64")?;
|
||||
s.serialize_field("options", options)?;
|
||||
}
|
||||
FieldType::Date(ref options) => {
|
||||
s.serialize_field("type", "date")?;
|
||||
s.serialize_field("options", options)?;
|
||||
@@ -205,7 +222,7 @@ impl<'de> Deserialize<'de> for FieldEntry {
|
||||
"bytes" => {
|
||||
field_type = Some(FieldType::Bytes);
|
||||
}
|
||||
"text" | "u64" | "i64" | "date" => {
|
||||
"text" | "u64" | "i64" | "f64" | "date" => {
|
||||
// These types require additional options to create a field_type
|
||||
}
|
||||
_ => panic!("unhandled type"),
|
||||
@@ -222,6 +239,7 @@ impl<'de> Deserialize<'de> for FieldEntry {
|
||||
"text" => field_type = Some(FieldType::Str(map.next_value()?)),
|
||||
"u64" => field_type = Some(FieldType::U64(map.next_value()?)),
|
||||
"i64" => field_type = Some(FieldType::I64(map.next_value()?)),
|
||||
"f64" => field_type = Some(FieldType::F64(map.next_value()?)),
|
||||
"date" => field_type = Some(FieldType::Date(map.next_value()?)),
|
||||
_ => {
|
||||
let msg = format!("Unrecognised type {}", ty);
|
||||
|
||||
@@ -10,7 +10,7 @@ use serde_json::Value as JsonValue;
|
||||
|
||||
/// Possible error that may occur while parsing a field value
|
||||
/// At this point the JSON is known to be valid.
|
||||
#[derive(Debug)]
|
||||
#[derive(Debug, PartialEq)]
|
||||
pub enum ValueParsingError {
|
||||
/// Encountered a numerical value that overflows or underflows its integer type.
|
||||
OverflowError(String),
|
||||
@@ -35,6 +35,8 @@ pub enum Type {
|
||||
U64,
|
||||
/// `i64`
|
||||
I64,
|
||||
/// `f64`
|
||||
F64,
|
||||
/// `date(i64) timestamp`
|
||||
Date,
|
||||
/// `tantivy::schema::Facet`. Passed as a string in JSON.
|
||||
@@ -53,6 +55,8 @@ pub enum FieldType {
|
||||
U64(IntOptions),
|
||||
/// Signed 64-bits integers 64 field type configuration
|
||||
I64(IntOptions),
|
||||
/// 64-bits float 64 field type configuration
|
||||
F64(IntOptions),
|
||||
/// Signed 64-bits Date 64 field type configuration,
|
||||
Date(IntOptions),
|
||||
/// Hierarchical Facet
|
||||
@@ -68,6 +72,7 @@ impl FieldType {
|
||||
FieldType::Str(_) => Type::Str,
|
||||
FieldType::U64(_) => Type::U64,
|
||||
FieldType::I64(_) => Type::I64,
|
||||
FieldType::F64(_) => Type::F64,
|
||||
FieldType::Date(_) => Type::Date,
|
||||
FieldType::HierarchicalFacet => Type::HierarchicalFacet,
|
||||
FieldType::Bytes => Type::Bytes,
|
||||
@@ -78,9 +83,9 @@ impl FieldType {
|
||||
pub fn is_indexed(&self) -> bool {
|
||||
match *self {
|
||||
FieldType::Str(ref text_options) => text_options.get_indexing_options().is_some(),
|
||||
FieldType::U64(ref int_options) | FieldType::I64(ref int_options) => {
|
||||
int_options.is_indexed()
|
||||
}
|
||||
FieldType::U64(ref int_options)
|
||||
| FieldType::I64(ref int_options)
|
||||
| FieldType::F64(ref int_options) => int_options.is_indexed(),
|
||||
FieldType::Date(ref date_options) => date_options.is_indexed(),
|
||||
FieldType::HierarchicalFacet => true,
|
||||
FieldType::Bytes => false,
|
||||
@@ -98,6 +103,7 @@ impl FieldType {
|
||||
.map(TextFieldIndexing::index_option),
|
||||
FieldType::U64(ref int_options)
|
||||
| FieldType::I64(ref int_options)
|
||||
| FieldType::F64(ref int_options)
|
||||
| FieldType::Date(ref int_options) => {
|
||||
if int_options.is_indexed() {
|
||||
Some(IndexRecordOption::Basic)
|
||||
@@ -119,9 +125,12 @@ impl FieldType {
|
||||
match *json {
|
||||
JsonValue::String(ref field_text) => match *self {
|
||||
FieldType::Str(_) => Ok(Value::Str(field_text.clone())),
|
||||
FieldType::U64(_) | FieldType::I64(_) | FieldType::Date(_) => Err(
|
||||
ValueParsingError::TypeError(format!("Expected an integer, got {:?}", json)),
|
||||
),
|
||||
FieldType::U64(_) | FieldType::I64(_) | FieldType::F64(_) | FieldType::Date(_) => {
|
||||
Err(ValueParsingError::TypeError(format!(
|
||||
"Expected an integer, got {:?}",
|
||||
json
|
||||
)))
|
||||
}
|
||||
FieldType::HierarchicalFacet => Ok(Value::Facet(Facet::from(field_text))),
|
||||
FieldType::Bytes => decode(field_text).map(Value::Bytes).map_err(|_| {
|
||||
ValueParsingError::InvalidBase64(format!(
|
||||
@@ -147,6 +156,14 @@ impl FieldType {
|
||||
Err(ValueParsingError::OverflowError(msg))
|
||||
}
|
||||
}
|
||||
FieldType::F64(_) => {
|
||||
if let Some(field_val_f64) = field_val_num.as_f64() {
|
||||
Ok(Value::F64(field_val_f64))
|
||||
} else {
|
||||
let msg = format!("Expected a f64 int, got {:?}", json);
|
||||
Err(ValueParsingError::OverflowError(msg))
|
||||
}
|
||||
}
|
||||
FieldType::Str(_) | FieldType::HierarchicalFacet | FieldType::Bytes => {
|
||||
let msg = format!("Expected a string, got {:?}", json);
|
||||
Err(ValueParsingError::TypeError(msg))
|
||||
|
||||
@@ -22,7 +22,7 @@ pub const STORED: SchemaFlagList<StoredFlag, ()> = SchemaFlagList {
|
||||
pub struct IndexedFlag;
|
||||
/// Flag to mark the field as indexed.
|
||||
///
|
||||
/// The `INDEXED` flag can only be used when building `IntOptions` (`u64` and `i64` fields)
|
||||
/// The `INDEXED` flag can only be used when building `IntOptions` (`u64`, `i64` and `f64` fields)
|
||||
/// Of course, text fields can also be indexed... But this is expressed by using either the
|
||||
/// `STRING` (untokenized) or `TEXT` (tokenized with the english tokenizer) flags.
|
||||
pub const INDEXED: SchemaFlagList<IndexedFlag, ()> = SchemaFlagList {
|
||||
@@ -36,7 +36,7 @@ pub struct FastFlag;
|
||||
///
|
||||
/// Fast fields can be random-accessed rapidly. Fields useful for scoring, filtering
|
||||
/// or collection should be marked as fast fields.
|
||||
/// The `FAST` flag can only be used when building `IntOptions` (`u64` and `i64` fields)
|
||||
/// The `FAST` flag can only be used when building `IntOptions` (`u64`, `i64` and `f64` fields)
|
||||
pub const FAST: SchemaFlagList<FastFlag, ()> = SchemaFlagList {
|
||||
head: FastFlag,
|
||||
tail: (),
|
||||
|
||||
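A short sketch of how these flags are meant to combine for the new `f64` fields; that `INDEXED | STORED | FAST` converts into `IntOptions` for an `f64` field exactly as it does for `u64`/`i64` is an assumption here, not something shown in this diff:

```rust
use tantivy::schema::{Schema, FAST, INDEXED, STORED};

fn main() {
    let mut schema_builder = Schema::builder();
    // Indexed (range-queryable), stored, and usable as a fast field.
    // The `|` composition into IntOptions is assumed to behave as for u64/i64.
    let price = schema_builder.add_f64_field("price", INDEXED | STORED | FAST);
    let schema = schema_builder.build();
    assert!(schema.get_field_entry(price).is_indexed());
}
```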
@@ -54,7 +54,7 @@ On the other hand setting the field as stored or not determines whether the fiel
|
||||
when [`searcher.doc(doc_address)`](../struct.Searcher.html#method.doc) is called.
|
||||
|
||||
|
||||
## Setting a u64 or a i64 field
|
||||
## Setting a u64, a i64 or a f64 field
|
||||
|
||||
### Example
|
||||
|
||||
|
||||
@@ -82,6 +82,26 @@ impl SchemaBuilder {
|
||||
self.add_field(field_entry)
|
||||
}
|
||||
|
||||
/// Adds a new f64 field.
|
||||
/// Returns the associated field handle
|
||||
///
|
||||
/// # Caution
|
||||
///
|
||||
/// Appending two fields with the same name
|
||||
/// will result in the shadowing of the first
|
||||
/// by the second one.
|
||||
/// The first field will get a field id
|
||||
/// but only the second one will be indexed
|
||||
pub fn add_f64_field<T: Into<IntOptions>>(
|
||||
&mut self,
|
||||
field_name_str: &str,
|
||||
field_options: T,
|
||||
) -> Field {
|
||||
let field_name = String::from(field_name_str);
|
||||
let field_entry = FieldEntry::new_f64(field_name, field_options.into());
|
||||
self.add_field(field_entry)
|
||||
}
|
||||
|
||||
/// Adds a new date field.
|
||||
/// Returns the associated field handle
|
||||
/// Internally, Tantivy simply stores dates as i64 UTC timestamps,
|
||||
@@ -226,6 +246,25 @@ impl Schema {
|
||||
self.0.fields_map.get(field_name).cloned()
|
||||
}
|
||||
|
||||
/// Builds a `Document` from a `NamedFieldDocument`, resolving field names against the schema.
|
||||
pub fn convert_named_doc(
|
||||
&self,
|
||||
named_doc: NamedFieldDocument,
|
||||
) -> Result<Document, DocParsingError> {
|
||||
let mut document = Document::new();
|
||||
for (field_name, values) in named_doc.0 {
|
||||
if let Some(field) = self.get_field(&field_name) {
|
||||
for value in values {
|
||||
let field_value = FieldValue::new(field, value);
|
||||
document.add(field_value);
|
||||
}
|
||||
} else {
|
||||
return Err(DocParsingError::NoSuchFieldInSchema(field_name));
|
||||
}
|
||||
}
|
||||
Ok(document)
|
||||
}
|
||||
|
||||
/// Create a named document off the doc.
|
||||
pub fn to_named_doc(&self, doc: &Document) -> NamedFieldDocument {
|
||||
let mut field_map = BTreeMap::new();
|
||||
@@ -262,28 +301,26 @@ impl Schema {
|
||||
|
||||
let mut doc = Document::default();
|
||||
for (field_name, json_value) in json_obj.iter() {
|
||||
match self.get_field(field_name) {
|
||||
Some(field) => {
|
||||
let field_entry = self.get_field_entry(field);
|
||||
let field_type = field_entry.field_type();
|
||||
match *json_value {
|
||||
JsonValue::Array(ref json_items) => {
|
||||
for json_item in json_items {
|
||||
let value = field_type.value_from_json(json_item).map_err(|e| {
|
||||
DocParsingError::ValueError(field_name.clone(), e)
|
||||
})?;
|
||||
doc.add(FieldValue::new(field, value));
|
||||
}
|
||||
}
|
||||
_ => {
|
||||
let value = field_type
|
||||
.value_from_json(json_value)
|
||||
.map_err(|e| DocParsingError::ValueError(field_name.clone(), e))?;
|
||||
doc.add(FieldValue::new(field, value));
|
||||
}
|
||||
let field = self
|
||||
.get_field(field_name)
|
||||
.ok_or_else(|| DocParsingError::NoSuchFieldInSchema(field_name.clone()))?;
|
||||
let field_entry = self.get_field_entry(field);
|
||||
let field_type = field_entry.field_type();
|
||||
match *json_value {
|
||||
JsonValue::Array(ref json_items) => {
|
||||
for json_item in json_items {
|
||||
let value = field_type
|
||||
.value_from_json(json_item)
|
||||
.map_err(|e| DocParsingError::ValueError(field_name.clone(), e))?;
|
||||
doc.add(FieldValue::new(field, value));
|
||||
}
|
||||
}
|
||||
None => return Err(DocParsingError::NoSuchFieldInSchema(field_name.clone())),
|
||||
_ => {
|
||||
let value = field_type
|
||||
.value_from_json(json_value)
|
||||
.map_err(|e| DocParsingError::ValueError(field_name.clone(), e))?;
|
||||
doc.add(FieldValue::new(field, value));
|
||||
}
|
||||
}
|
||||
}
|
||||
Ok(doc)
|
||||
@@ -340,13 +377,19 @@ impl<'de> Deserialize<'de> for Schema {
|
||||
|
||||
/// Error that may happen when deserializing
|
||||
/// a document from JSON.
|
||||
#[derive(Debug)]
|
||||
#[derive(Debug, Fail, PartialEq)]
|
||||
pub enum DocParsingError {
|
||||
/// The payload given is not valid JSON.
|
||||
#[fail(display = "The provided string is not valid JSON")]
|
||||
NotJSON(String),
|
||||
/// One of the value nodes could not be parsed.
|
||||
#[fail(display = "The field '{:?}' could not be parsed: {:?}", _0, _1)]
|
||||
ValueError(String, ValueParsingError),
|
||||
/// The json-document contains a field that is not declared in the schema.
|
||||
#[fail(
|
||||
display = "The document contains a field that is not declared in the schema: {:?}",
|
||||
_0
|
||||
)]
|
||||
NoSuchFieldInSchema(String),
|
||||
}
|
||||
|
||||
@@ -358,6 +401,7 @@ mod tests {
|
||||
use crate::schema::*;
|
||||
use matches::{assert_matches, matches};
|
||||
use serde_json;
|
||||
use std::collections::BTreeMap;
|
||||
|
||||
#[test]
|
||||
pub fn is_indexed_test() {
|
||||
@@ -376,10 +420,14 @@ mod tests {
|
||||
let popularity_options = IntOptions::default()
|
||||
.set_stored()
|
||||
.set_fast(Cardinality::SingleValue);
|
||||
let score_options = IntOptions::default()
|
||||
.set_indexed()
|
||||
.set_fast(Cardinality::SingleValue);
|
||||
schema_builder.add_text_field("title", TEXT);
|
||||
schema_builder.add_text_field("author", STRING);
|
||||
schema_builder.add_u64_field("count", count_options);
|
||||
schema_builder.add_i64_field("popularity", popularity_options);
|
||||
schema_builder.add_f64_field("score", score_options);
|
||||
let schema = schema_builder.build();
|
||||
let schema_json = serde_json::to_string_pretty(&schema).unwrap();
|
||||
let expected = r#"[
|
||||
@@ -422,6 +470,15 @@ mod tests {
|
||||
"fast": "single",
|
||||
"stored": true
|
||||
}
|
||||
},
|
||||
{
|
||||
"name": "score",
|
||||
"type": "f64",
|
||||
"options": {
|
||||
"indexed": true,
|
||||
"fast": "single",
|
||||
"stored": false
|
||||
}
|
||||
}
|
||||
]"#;
|
||||
assert_eq!(schema_json, expected);
|
||||
@@ -434,6 +491,8 @@ mod tests {
|
||||
assert_eq!("author", fields.next().unwrap().name());
|
||||
assert_eq!("count", fields.next().unwrap().name());
|
||||
assert_eq!("popularity", fields.next().unwrap().name());
|
||||
assert_eq!("score", fields.next().unwrap().name());
|
||||
assert!(fields.next().is_none());
|
||||
}
|
||||
|
||||
#[test]
|
||||
@@ -457,6 +516,54 @@ mod tests {
|
||||
assert_eq!(doc, doc_serdeser);
|
||||
}
|
||||
|
||||
#[test]
|
||||
pub fn test_document_from_nameddoc() {
|
||||
let mut schema_builder = Schema::builder();
|
||||
let title = schema_builder.add_text_field("title", TEXT);
|
||||
let val = schema_builder.add_i64_field("val", INDEXED);
|
||||
let schema = schema_builder.build();
|
||||
let mut named_doc_map = BTreeMap::default();
|
||||
named_doc_map.insert(
|
||||
"title".to_string(),
|
||||
vec![Value::from("title1"), Value::from("title2")],
|
||||
);
|
||||
named_doc_map.insert(
|
||||
"val".to_string(),
|
||||
vec![Value::from(14u64), Value::from(-1i64)],
|
||||
);
|
||||
let doc = schema
|
||||
.convert_named_doc(NamedFieldDocument(named_doc_map))
|
||||
.unwrap();
|
||||
assert_eq!(
|
||||
doc.get_all(title),
|
||||
vec![
|
||||
&Value::from("title1".to_string()),
|
||||
&Value::from("title2".to_string())
|
||||
]
|
||||
);
|
||||
assert_eq!(
|
||||
doc.get_all(val),
|
||||
vec![&Value::from(14u64), &Value::from(-1i64)]
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
pub fn test_document_from_nameddoc_error() {
|
||||
let schema = Schema::builder().build();
|
||||
let mut named_doc_map = BTreeMap::default();
|
||||
named_doc_map.insert(
|
||||
"title".to_string(),
|
||||
vec![Value::from("title1"), Value::from("title2")],
|
||||
);
|
||||
let err = schema
|
||||
.convert_named_doc(NamedFieldDocument(named_doc_map))
|
||||
.unwrap_err();
|
||||
assert_eq!(
|
||||
err,
|
||||
DocParsingError::NoSuchFieldInSchema("title".to_string())
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
pub fn test_parse_document() {
|
||||
let mut schema_builder = Schema::builder();
|
||||
@@ -466,10 +573,14 @@ mod tests {
|
||||
let popularity_options = IntOptions::default()
|
||||
.set_stored()
|
||||
.set_fast(Cardinality::SingleValue);
|
||||
let score_options = IntOptions::default()
|
||||
.set_indexed()
|
||||
.set_fast(Cardinality::SingleValue);
|
||||
let title_field = schema_builder.add_text_field("title", TEXT);
|
||||
let author_field = schema_builder.add_text_field("author", STRING);
|
||||
let count_field = schema_builder.add_u64_field("count", count_options);
|
||||
let popularity_field = schema_builder.add_i64_field("popularity", popularity_options);
|
||||
let score_field = schema_builder.add_f64_field("score", score_options);
|
||||
let schema = schema_builder.build();
|
||||
{
|
||||
let doc = schema.parse_document("{}").unwrap();
|
||||
@@ -482,7 +593,8 @@ mod tests {
|
||||
"title": "my title",
|
||||
"author": "fulmicoton",
|
||||
"count": 4,
|
||||
"popularity": 10
|
||||
"popularity": 10,
|
||||
"score": 80.5
|
||||
}"#,
|
||||
)
|
||||
.unwrap();
|
||||
@@ -493,6 +605,7 @@ mod tests {
|
||||
);
|
||||
assert_eq!(doc.get_first(count_field).unwrap().u64_value(), 4);
|
||||
assert_eq!(doc.get_first(popularity_field).unwrap().i64_value(), 10);
|
||||
assert_eq!(doc.get_first(score_field).unwrap().f64_value(), 80.5);
|
||||
}
|
||||
{
|
||||
let json_err = schema.parse_document(
|
||||
@@ -501,6 +614,7 @@ mod tests {
|
||||
"author": "fulmicoton",
|
||||
"count": 4,
|
||||
"popularity": 10,
|
||||
"score": 80.5,
|
||||
"jambon": "bayonne"
|
||||
}"#,
|
||||
);
|
||||
@@ -513,6 +627,7 @@ mod tests {
|
||||
"author": "fulmicoton",
|
||||
"count": "5",
|
||||
"popularity": "10",
|
||||
"score": "80.5",
|
||||
"jambon": "bayonne"
|
||||
}"#,
|
||||
);
|
||||
@@ -527,7 +642,8 @@ mod tests {
|
||||
"title": "my title",
|
||||
"author": "fulmicoton",
|
||||
"count": -5,
|
||||
"popularity": 10
|
||||
"popularity": 10,
|
||||
"score": 80.5
|
||||
}"#,
|
||||
);
|
||||
assert_matches!(
|
||||
@@ -541,7 +657,8 @@ mod tests {
|
||||
"title": "my title",
|
||||
"author": "fulmicoton",
|
||||
"count": 9223372036854775808,
|
||||
"popularity": 10
|
||||
"popularity": 10,
|
||||
"score": 80.5
|
||||
}"#,
|
||||
);
|
||||
assert!(!matches!(
|
||||
@@ -555,7 +672,8 @@ mod tests {
|
||||
"title": "my title",
|
||||
"author": "fulmicoton",
|
||||
"count": 50,
|
||||
"popularity": 9223372036854775808
|
||||
"popularity": 9223372036854775808,
|
||||
"score": 80.5
|
||||
}"#,
|
||||
);
|
||||
assert_matches!(
|
||||
|
||||
@@ -19,9 +19,9 @@ where
|
||||
B: AsRef<[u8]>;
|
||||
|
||||
impl Term {
|
||||
/// Builds a term given a field, and a u64-value
|
||||
/// Builds a term given a field, and a i64-value
|
||||
///
|
||||
/// Assuming the term has a field id of 1, and a u64 value of 3234,
|
||||
/// Assuming the term has a field id of 1, and a i64 value of 3234,
|
||||
/// the Term will have 8 bytes.
|
||||
///
|
||||
/// The first four byte are dedicated to storing the field id as a u64.
|
||||
@@ -31,6 +31,18 @@ impl Term {
|
||||
Term::from_field_u64(field, val_u64)
|
||||
}
|
||||
|
||||
/// Builds a term given a field, and a f64-value
///
/// Assuming the term has a field id of 1, and a f64 value of 3234.0,
/// the Term will have 12 bytes.
///
/// The first four bytes are dedicated to storing the field id as a u32.
/// The 8 following bytes encode the f64 value, mapped to a u64 in an
/// order-preserving way.
|
||||
pub fn from_field_f64(field: Field, val: f64) -> Term {
|
||||
let val_u64: u64 = common::f64_to_u64(val);
|
||||
Term::from_field_u64(field, val_u64)
|
||||
}
|
||||
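A small round-trip sketch using only calls that appear in this diff (`from_field_f64`, `value_bytes`, `get_f64`); the field id 0 is arbitrary and would normally come from a schema builder:

```rust
use tantivy::schema::Field;
use tantivy::Term;

fn main() {
    let field = Field(0);
    let term = Term::from_field_f64(field, 3234.0);
    // The value part is the 8-byte, order-preserving u64 image of the f64.
    assert_eq!(term.value_bytes().len(), 8);
    assert_eq!(term.get_f64(), 3234.0);
}
```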
|
||||
/// Builds a term given a field, and a DateTime value
|
||||
///
|
||||
/// Assuming the term has a field id of 1, and a timestamp i64 value of 3234,
|
||||
@@ -112,6 +124,11 @@ impl Term {
|
||||
self.set_u64(common::i64_to_u64(val));
|
||||
}
|
||||
|
||||
/// Sets a `f64` value in the term.
|
||||
pub fn set_f64(&mut self, val: f64) {
|
||||
self.set_u64(common::f64_to_u64(val));
|
||||
}
|
||||
|
||||
fn set_bytes(&mut self, bytes: &[u8]) {
|
||||
self.0.resize(4, 0u8);
|
||||
self.0.extend(bytes);
|
||||
@@ -161,6 +178,15 @@ where
|
||||
common::u64_to_i64(BigEndian::read_u64(&self.0.as_ref()[4..]))
|
||||
}
|
||||
|
||||
/// Returns the `f64` value stored in a term.
|
||||
///
|
||||
/// # Panics
|
||||
/// ... or returns an invalid value
|
||||
/// if the term is not a `f64` field.
|
||||
pub fn get_f64(&self) -> f64 {
|
||||
common::u64_to_f64(BigEndian::read_u64(&self.0.as_ref()[4..]))
|
||||
}
|
||||
|
||||
/// Returns the text associated with the term.
|
||||
///
|
||||
/// # Panics
|
||||
@@ -198,7 +224,12 @@ where
|
||||
|
||||
impl fmt::Debug for Term {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
write!(f, "Term({:?})", &self.0[..])
|
||||
write!(
|
||||
f,
|
||||
"Term(field={},bytes={:?})",
|
||||
self.field().0,
|
||||
self.value_bytes()
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -2,11 +2,11 @@ use crate::schema::Facet;
|
||||
use crate::DateTime;
|
||||
use serde::de::Visitor;
|
||||
use serde::{Deserialize, Deserializer, Serialize, Serializer};
|
||||
use std::fmt;
|
||||
use std::{cmp::Ordering, fmt};
|
||||
|
||||
/// Value represents the value of any field.
/// It is an enum over all of the possible field types.
|
||||
#[derive(Debug, Clone, Eq, PartialEq, Ord, PartialOrd)]
|
||||
#[derive(Debug, Clone, PartialEq, PartialOrd)]
|
||||
pub enum Value {
|
||||
/// The str type is used for any text information.
|
||||
Str(String),
|
||||
@@ -14,6 +14,8 @@ pub enum Value {
|
||||
U64(u64),
|
||||
/// Signed 64-bits Integer `i64`
|
||||
I64(i64),
|
||||
/// 64-bits Float `f64`
|
||||
F64(f64),
|
||||
/// Signed 64-bits Date time stamp `date`
|
||||
Date(DateTime),
|
||||
/// Hierarchical Facet
|
||||
@@ -22,6 +24,40 @@ pub enum Value {
|
||||
Bytes(Vec<u8>),
|
||||
}
|
||||
|
||||
impl Eq for Value {}
|
||||
impl Ord for Value {
|
||||
fn cmp(&self, other: &Self) -> Ordering {
|
||||
match (self, other) {
|
||||
(Value::Str(l), Value::Str(r)) => l.cmp(r),
|
||||
(Value::U64(l), Value::U64(r)) => l.cmp(r),
|
||||
(Value::I64(l), Value::I64(r)) => l.cmp(r),
|
||||
(Value::Date(l), Value::Date(r)) => l.cmp(r),
|
||||
(Value::Facet(l), Value::Facet(r)) => l.cmp(r),
|
||||
(Value::Bytes(l), Value::Bytes(r)) => l.cmp(r),
|
||||
(Value::F64(l), Value::F64(r)) => {
|
||||
match (l.is_nan(), r.is_nan()) {
|
||||
(false, false) => l.partial_cmp(r).unwrap(), // only fail on NaN
|
||||
(true, true) => Ordering::Equal,
|
||||
(true, false) => Ordering::Less, // we define NaN as less than -∞
|
||||
(false, true) => Ordering::Greater,
|
||||
}
|
||||
}
|
||||
(Value::Str(_), _) => Ordering::Less,
|
||||
(_, Value::Str(_)) => Ordering::Greater,
|
||||
(Value::U64(_), _) => Ordering::Less,
|
||||
(_, Value::U64(_)) => Ordering::Greater,
|
||||
(Value::I64(_), _) => Ordering::Less,
|
||||
(_, Value::I64(_)) => Ordering::Greater,
|
||||
(Value::F64(_), _) => Ordering::Less,
|
||||
(_, Value::F64(_)) => Ordering::Greater,
|
||||
(Value::Date(_), _) => Ordering::Less,
|
||||
(_, Value::Date(_)) => Ordering::Greater,
|
||||
(Value::Facet(_), _) => Ordering::Less,
|
||||
(_, Value::Facet(_)) => Ordering::Greater,
|
||||
}
|
||||
}
|
||||
}
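A quick sketch of what the total order above implies in practice (NaN sorts below every other `f64`, and mixed variants order by the variant order of the enum); purely illustrative:

```rust
use tantivy::schema::Value;

fn main() {
    let mut vals = vec![
        Value::F64(2.0),
        Value::F64(std::f64::NAN),
        Value::F64(-1.0),
        Value::U64(7),
    ];
    // `sort` relies on the `Ord` impl introduced above.
    vals.sort();
    // U64 sorts before F64 (variant order), and NaN sorts below -1.0 and 2.0.
    assert_eq!(vals[0], Value::U64(7));
    match &vals[1] {
        Value::F64(f) => assert!(f.is_nan()),
        other => panic!("unexpected ordering: {:?}", other),
    }
}
```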
|
||||
|
||||
impl Serialize for Value {
|
||||
fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
|
||||
where
|
||||
@@ -31,6 +67,7 @@ impl Serialize for Value {
|
||||
Value::Str(ref v) => serializer.serialize_str(v),
|
||||
Value::U64(u) => serializer.serialize_u64(u),
|
||||
Value::I64(u) => serializer.serialize_i64(u),
|
||||
Value::F64(u) => serializer.serialize_f64(u),
|
||||
Value::Date(ref date) => serializer.serialize_i64(date.timestamp()),
|
||||
Value::Facet(ref facet) => facet.serialize(serializer),
|
||||
Value::Bytes(ref bytes) => serializer.serialize_bytes(bytes),
|
||||
@@ -60,6 +97,10 @@ impl<'de> Deserialize<'de> for Value {
|
||||
Ok(Value::I64(v))
|
||||
}
|
||||
|
||||
fn visit_f64<E>(self, v: f64) -> Result<Self::Value, E> {
|
||||
Ok(Value::F64(v))
|
||||
}
|
||||
|
||||
fn visit_str<E>(self, v: &str) -> Result<Self::Value, E> {
|
||||
Ok(Value::Str(v.to_owned()))
|
||||
}
|
||||
@@ -75,9 +116,7 @@ impl<'de> Deserialize<'de> for Value {
|
||||
|
||||
impl Value {
|
||||
/// Returns the text value, provided the value is of the `Str` type.
|
||||
///
|
||||
/// # Panics
|
||||
/// If the value is not of type `Str`
|
||||
/// (Returns None if the value is not of the `Str` type).
|
||||
pub fn text(&self) -> Option<&str> {
|
||||
match *self {
|
||||
Value::Str(ref text) => Some(text),
|
||||
@@ -92,7 +131,7 @@ impl Value {
|
||||
pub fn u64_value(&self) -> u64 {
|
||||
match *self {
|
||||
Value::U64(ref value) => *value,
|
||||
_ => panic!("This is not a text field."),
|
||||
_ => panic!("This is not a u64 field."),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -103,7 +142,18 @@ impl Value {
|
||||
pub fn i64_value(&self) -> i64 {
|
||||
match *self {
|
||||
Value::I64(ref value) => *value,
|
||||
_ => panic!("This is not a text field."),
|
||||
_ => panic!("This is not a i64 field."),
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns the f64-value, provided the value is of the `F64` type.
|
||||
///
|
||||
/// # Panics
|
||||
/// If the value is not of type `F64`
|
||||
pub fn f64_value(&self) -> f64 {
|
||||
match *self {
|
||||
Value::F64(ref value) => *value,
|
||||
_ => panic!("This is not a f64 field."),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -137,6 +187,12 @@ impl From<i64> for Value {
|
||||
}
|
||||
}
|
||||
|
||||
impl From<f64> for Value {
|
||||
fn from(v: f64) -> Value {
|
||||
Value::F64(v)
|
||||
}
|
||||
}
|
||||
|
||||
impl From<DateTime> for Value {
|
||||
fn from(date_time: DateTime) -> Value {
|
||||
Value::Date(date_time)
|
||||
@@ -163,7 +219,7 @@ impl From<Vec<u8>> for Value {
|
||||
|
||||
mod binary_serialize {
|
||||
use super::Value;
|
||||
use crate::common::BinarySerializable;
|
||||
use crate::common::{f64_to_u64, u64_to_f64, BinarySerializable};
|
||||
use crate::schema::Facet;
|
||||
use chrono::{TimeZone, Utc};
|
||||
use std::io::{self, Read, Write};
|
||||
@@ -174,6 +230,7 @@ mod binary_serialize {
|
||||
const HIERARCHICAL_FACET_CODE: u8 = 3;
|
||||
const BYTES_CODE: u8 = 4;
|
||||
const DATE_CODE: u8 = 5;
|
||||
const F64_CODE: u8 = 6;
|
||||
|
||||
impl BinarySerializable for Value {
|
||||
fn serialize<W: Write>(&self, writer: &mut W) -> io::Result<()> {
|
||||
@@ -190,6 +247,10 @@ mod binary_serialize {
|
||||
I64_CODE.serialize(writer)?;
|
||||
val.serialize(writer)
|
||||
}
|
||||
Value::F64(ref val) => {
|
||||
F64_CODE.serialize(writer)?;
|
||||
f64_to_u64(*val).serialize(writer)
|
||||
}
|
||||
Value::Date(ref val) => {
|
||||
DATE_CODE.serialize(writer)?;
|
||||
val.timestamp().serialize(writer)
|
||||
@@ -219,6 +280,10 @@ mod binary_serialize {
|
||||
let value = i64::deserialize(reader)?;
|
||||
Ok(Value::I64(value))
|
||||
}
|
||||
F64_CODE => {
|
||||
let value = u64_to_f64(u64::deserialize(reader)?);
|
||||
Ok(Value::F64(value))
|
||||
}
|
||||
DATE_CODE => {
|
||||
let timestamp = i64::deserialize(reader)?;
|
||||
Ok(Value::Date(Utc.timestamp(timestamp, 0)))
|
||||
|
||||
@@ -63,7 +63,7 @@ impl FragmentCandidate {
|
||||
fn try_add_token(&mut self, token: &Token, terms: &BTreeMap<String, f32>) {
|
||||
self.stop_offset = token.offset_to;
|
||||
|
||||
if let Some(score) = terms.get(&token.text.to_lowercase()) {
|
||||
if let Some(&score) = terms.get(&token.text.to_lowercase()) {
|
||||
self.score += score;
|
||||
self.highlighted
|
||||
.push(HighlightSection::new(token.offset_from, token.offset_to));
|
||||
@@ -142,7 +142,7 @@ impl Snippet {
|
||||
/// Fragments must be valid in the sense that `&text[fragment.start..fragment.stop]`\
|
||||
/// has to be a valid string.
|
||||
fn search_fragments<'a>(
|
||||
tokenizer: &dyn BoxedTokenizer,
|
||||
tokenizer: &BoxedTokenizer,
|
||||
text: &'a str,
|
||||
terms: &BTreeMap<String, f32>,
|
||||
max_num_chars: usize,
|
||||
@@ -150,7 +150,6 @@ fn search_fragments<'a>(
|
||||
let mut token_stream = tokenizer.token_stream(text);
|
||||
let mut fragment = FragmentCandidate::new(0);
|
||||
let mut fragments: Vec<FragmentCandidate> = vec![];
|
||||
|
||||
while let Some(next) = token_stream.next() {
|
||||
if (next.offset_to - fragment.start_offset) > max_num_chars {
|
||||
if fragment.score > 0.0 {
|
||||
@@ -214,11 +213,9 @@ fn select_best_fragment_combination(fragments: &[FragmentCandidate], text: &str)
|
||||
/// # Example
|
||||
///
|
||||
/// ```rust
|
||||
/// # #[macro_use]
|
||||
/// # extern crate tantivy;
|
||||
/// # use tantivy::Index;
|
||||
/// # use tantivy::schema::{Schema, TEXT};
|
||||
/// # use tantivy::query::QueryParser;
|
||||
/// # use tantivy::schema::{Schema, TEXT};
|
||||
/// # use tantivy::{doc, Index};
|
||||
/// use tantivy::SnippetGenerator;
|
||||
///
|
||||
/// # fn main() -> tantivy::Result<()> {
|
||||
@@ -254,7 +251,7 @@ fn select_best_fragment_combination(fragments: &[FragmentCandidate], text: &str)
|
||||
/// ```
|
||||
pub struct SnippetGenerator {
|
||||
terms_text: BTreeMap<String, f32>,
|
||||
tokenizer: Box<dyn BoxedTokenizer>,
|
||||
tokenizer: BoxedTokenizer,
|
||||
field: Field,
|
||||
max_num_chars: usize,
|
||||
}
|
||||
@@ -316,12 +313,8 @@ impl SnippetGenerator {
|
||||
|
||||
/// Generates a snippet for the given text.
|
||||
pub fn snippet(&self, text: &str) -> Snippet {
|
||||
let fragment_candidates = search_fragments(
|
||||
&*self.tokenizer,
|
||||
&text,
|
||||
&self.terms_text,
|
||||
self.max_num_chars,
|
||||
);
|
||||
let fragment_candidates =
|
||||
search_fragments(&self.tokenizer, &text, &self.terms_text, self.max_num_chars);
|
||||
select_best_fragment_combination(&fragment_candidates[..], &text)
|
||||
}
|
||||
}
|
||||
@@ -331,7 +324,7 @@ mod tests {
|
||||
use super::{search_fragments, select_best_fragment_combination};
|
||||
use crate::query::QueryParser;
|
||||
use crate::schema::{IndexRecordOption, Schema, TextFieldIndexing, TextOptions, TEXT};
|
||||
use crate::tokenizer::{box_tokenizer, SimpleTokenizer};
|
||||
use crate::tokenizer::SimpleTokenizer;
|
||||
use crate::Index;
|
||||
use crate::SnippetGenerator;
|
||||
use maplit::btreemap;
|
||||
@@ -355,12 +348,12 @@ Survey in 2016, 2017, and 2018."#;
|
||||
|
||||
#[test]
|
||||
fn test_snippet() {
|
||||
let boxed_tokenizer = box_tokenizer(SimpleTokenizer);
|
||||
let boxed_tokenizer = SimpleTokenizer.into();
|
||||
let terms = btreemap! {
|
||||
String::from("rust") => 1.0,
|
||||
String::from("language") => 0.9
|
||||
};
|
||||
let fragments = search_fragments(&*boxed_tokenizer, TEST_TEXT, &terms, 100);
|
||||
let fragments = search_fragments(&boxed_tokenizer, TEST_TEXT, &terms, 100);
|
||||
assert_eq!(fragments.len(), 7);
|
||||
{
|
||||
let first = &fragments[0];
|
||||
@@ -382,13 +375,13 @@ Survey in 2016, 2017, and 2018."#;
|
||||
|
||||
#[test]
|
||||
fn test_snippet_scored_fragment() {
|
||||
let boxed_tokenizer = box_tokenizer(SimpleTokenizer);
|
||||
let boxed_tokenizer = SimpleTokenizer.into();
|
||||
{
|
||||
let terms = btreemap! {
|
||||
String::from("rust") =>1.0f32,
|
||||
String::from("language") => 0.9f32
|
||||
};
|
||||
let fragments = search_fragments(&*boxed_tokenizer, TEST_TEXT, &terms, 20);
|
||||
let fragments = search_fragments(&boxed_tokenizer, TEST_TEXT, &terms, 20);
|
||||
{
|
||||
let first = &fragments[0];
|
||||
assert_eq!(first.score, 1.0);
|
||||
@@ -397,13 +390,13 @@ Survey in 2016, 2017, and 2018."#;
|
||||
let snippet = select_best_fragment_combination(&fragments[..], &TEST_TEXT);
|
||||
assert_eq!(snippet.to_html(), "<b>Rust</b> is a systems")
|
||||
}
|
||||
let boxed_tokenizer = box_tokenizer(SimpleTokenizer);
|
||||
let boxed_tokenizer = SimpleTokenizer.into();
|
||||
{
|
||||
let terms = btreemap! {
|
||||
String::from("rust") =>0.9f32,
|
||||
String::from("language") => 1.0f32
|
||||
};
|
||||
let fragments = search_fragments(&*boxed_tokenizer, TEST_TEXT, &terms, 20);
|
||||
let fragments = search_fragments(&boxed_tokenizer, TEST_TEXT, &terms, 20);
|
||||
//assert_eq!(fragments.len(), 7);
|
||||
{
|
||||
let first = &fragments[0];
|
||||
@@ -417,14 +410,14 @@ Survey in 2016, 2017, and 2018."#;
|
||||
|
||||
#[test]
|
||||
fn test_snippet_in_second_fragment() {
|
||||
let boxed_tokenizer = box_tokenizer(SimpleTokenizer);
|
||||
let boxed_tokenizer = SimpleTokenizer.into();
|
||||
|
||||
let text = "a b c d e f g";
|
||||
|
||||
let mut terms = BTreeMap::new();
|
||||
terms.insert(String::from("c"), 1.0);
|
||||
|
||||
let fragments = search_fragments(&*boxed_tokenizer, &text, &terms, 3);
|
||||
let fragments = search_fragments(&boxed_tokenizer, &text, &terms, 3);
|
||||
|
||||
assert_eq!(fragments.len(), 1);
|
||||
{
|
||||
@@ -441,14 +434,14 @@ Survey in 2016, 2017, and 2018."#;
|
||||
|
||||
#[test]
|
||||
fn test_snippet_with_term_at_the_end_of_fragment() {
|
||||
let boxed_tokenizer = box_tokenizer(SimpleTokenizer);
|
||||
let boxed_tokenizer = SimpleTokenizer.into();
|
||||
|
||||
let text = "a b c d e f f g";
|
||||
|
||||
let mut terms = BTreeMap::new();
|
||||
terms.insert(String::from("f"), 1.0);
|
||||
|
||||
let fragments = search_fragments(&*boxed_tokenizer, &text, &terms, 3);
|
||||
let fragments = search_fragments(&boxed_tokenizer, &text, &terms, 3);
|
||||
|
||||
assert_eq!(fragments.len(), 2);
|
||||
{
|
||||
@@ -465,7 +458,7 @@ Survey in 2016, 2017, and 2018."#;
|
||||
|
||||
#[test]
|
||||
fn test_snippet_with_second_fragment_has_the_highest_score() {
|
||||
let boxed_tokenizer = box_tokenizer(SimpleTokenizer);
|
||||
let boxed_tokenizer = SimpleTokenizer.into();
|
||||
|
||||
let text = "a b c d e f g";
|
||||
|
||||
@@ -473,7 +466,7 @@ Survey in 2016, 2017, and 2018."#;
|
||||
terms.insert(String::from("f"), 1.0);
|
||||
terms.insert(String::from("a"), 0.9);
|
||||
|
||||
let fragments = search_fragments(&*boxed_tokenizer, &text, &terms, 7);
|
||||
let fragments = search_fragments(&boxed_tokenizer, &text, &terms, 7);
|
||||
|
||||
assert_eq!(fragments.len(), 2);
|
||||
{
|
||||
@@ -490,14 +483,14 @@ Survey in 2016, 2017, and 2018."#;
|
||||
|
||||
#[test]
|
||||
fn test_snippet_with_term_not_in_text() {
|
||||
let boxed_tokenizer = box_tokenizer(SimpleTokenizer);
|
||||
let boxed_tokenizer = SimpleTokenizer.into();
|
||||
|
||||
let text = "a b c d";
|
||||
|
||||
let mut terms = BTreeMap::new();
|
||||
terms.insert(String::from("z"), 1.0);
|
||||
|
||||
let fragments = search_fragments(&*boxed_tokenizer, &text, &terms, 3);
|
||||
let fragments = search_fragments(&boxed_tokenizer, &text, &terms, 3);
|
||||
|
||||
assert_eq!(fragments.len(), 0);
|
||||
|
||||
@@ -508,12 +501,12 @@ Survey in 2016, 2017, and 2018."#;
|
||||
|
||||
#[test]
|
||||
fn test_snippet_with_no_terms() {
|
||||
let boxed_tokenizer = box_tokenizer(SimpleTokenizer);
|
||||
let boxed_tokenizer = SimpleTokenizer.into();
|
||||
|
||||
let text = "a b c d";
|
||||
|
||||
let terms = BTreeMap::new();
|
||||
let fragments = search_fragments(&*boxed_tokenizer, &text, &terms, 3);
|
||||
let fragments = search_fragments(&boxed_tokenizer, &text, &terms, 3);
|
||||
assert_eq!(fragments.len(), 0);
|
||||
|
||||
let snippet = select_best_fragment_combination(&fragments[..], &text);
|
||||
|
||||
@@ -95,7 +95,6 @@ impl StoreReader {
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg_attr(feature = "cargo-clippy", allow(clippy::needless_pass_by_value))]
|
||||
fn split_source(data: ReadOnlySource) -> (ReadOnlySource, ReadOnlySource, DocId) {
|
||||
let data_len = data.len();
|
||||
let footer_offset = data_len - size_of::<u64>() - size_of::<u32>();
|
||||
|
||||
@@ -81,19 +81,14 @@ impl<'a> TermMerger<'a> {
|
||||
/// Advance the term iterator to the next term.
|
||||
/// Returns true if there is indeed another term
|
||||
/// False if there is none.
|
||||
#[cfg_attr(feature = "cargo-clippy", allow(clippy::while_let_loop))]
|
||||
pub fn advance(&mut self) -> bool {
|
||||
self.advance_segments();
|
||||
if let Some(head) = self.heap.pop() {
|
||||
self.current_streamers.push(head);
|
||||
loop {
|
||||
if let Some(next_streamer) = self.heap.peek() {
|
||||
if self.current_streamers[0].streamer.key() != next_streamer.streamer.key() {
|
||||
break;
|
||||
}
|
||||
} else {
|
||||
while let Some(next_streamer) = self.heap.peek() {
|
||||
if self.current_streamers[0].streamer.key() != next_streamer.streamer.key() {
|
||||
break;
|
||||
} // no more streamer.
|
||||
}
|
||||
let next_heap_it = self.heap.pop().unwrap(); // safe : we peeked beforehand
|
||||
self.current_streamers.push(next_heap_it);
|
||||
}
|
||||
|
||||
@@ -14,6 +14,9 @@ lexicographical order matches the natural order of integers.
|
||||
`i64`-terms are transformed to `u64` using a continuous mapping `val ⟶ val - i64::min_value()`
|
||||
and then treated as a `u64`.
|
||||
|
||||
`f64`-terms are transformed to `u64` using a mapping that preserves order, and are then treated
|
||||
as `u64`.
|
||||
|
||||
A second datastructure makes it possible to access a [`TermInfo`](../postings/struct.TermInfo.html).
|
||||
*/
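For reference, a sketch of the kind of order-preserving `f64` → `u64` mapping described above. The crate's own helpers are `common::f64_to_u64` / `u64_to_f64` (used elsewhere in this diff), whose exact definition is not shown here, so treat this as an illustration of the standard technique rather than the crate's implementation:

```rust
/// Maps an f64 to a u64 so that the order of (non-NaN) floats matches
/// the unsigned order of their images.
fn f64_to_u64(value: f64) -> u64 {
    let bits = value.to_bits();
    if value.is_sign_negative() {
        // Negative floats: flipping every bit reverses their order and
        // places them below all positive values.
        !bits
    } else {
        // Positive floats (and +0.0): setting the sign bit keeps their
        // order and places them above all mapped negative values.
        bits ^ (1u64 << 63)
    }
}

fn u64_to_f64(value: u64) -> f64 {
    if value & (1u64 << 63) != 0 {
        f64::from_bits(value ^ (1u64 << 63))
    } else {
        f64::from_bits(!value)
    }
}

fn main() {
    let sorted = [-3.5f64, -0.0, 0.0, 1.25, 3234.0];
    let mapped: Vec<u64> = sorted.iter().map(|&x| f64_to_u64(x)).collect();
    // The images are sorted whenever the inputs are.
    assert!(mapped.windows(2).all(|w| w[0] <= w[1]));
    assert_eq!(u64_to_f64(f64_to_u64(1.25)), 1.25);
}
```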
|
||||
|
||||
@@ -28,14 +31,43 @@ mod termdict;
|
||||
pub use self::merger::TermMerger;
|
||||
pub use self::streamer::{TermStreamer, TermStreamerBuilder};
|
||||
pub use self::termdict::{TermDictionary, TermDictionaryBuilder};
|
||||
use levenshtein_automata::{Distance, DFA, SINK_STATE};
|
||||
use tantivy_fst::Automaton;
|
||||
|
||||
pub(crate) struct WrappedDFA<Cond> {
|
||||
pub dfa: DFA,
|
||||
pub condition: Cond,
|
||||
}
|
||||
|
||||
impl<Cond: Fn(Distance) -> bool> Automaton for WrappedDFA<Cond> {
|
||||
type State = u32;
|
||||
|
||||
fn start(&self) -> Self::State {
|
||||
self.dfa.initial_state()
|
||||
}
|
||||
|
||||
fn is_match(&self, state: &Self::State) -> bool {
|
||||
let distance = self.dfa.distance(*state);
|
||||
(self.condition)(distance)
|
||||
}
|
||||
|
||||
fn can_match(&self, state: &Self::State) -> bool {
|
||||
*state != SINK_STATE
|
||||
}
|
||||
|
||||
fn accept(&self, state: &Self::State, byte: u8) -> Self::State {
|
||||
self.dfa.transition(*state, byte)
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::{TermDictionary, TermDictionaryBuilder, TermStreamer};
|
||||
use super::{TermDictionary, TermDictionaryBuilder, TermStreamer, WrappedDFA};
|
||||
use crate::core::Index;
|
||||
use crate::directory::{Directory, RAMDirectory, ReadOnlySource};
|
||||
use crate::postings::TermInfo;
|
||||
use crate::schema::{Document, FieldType, Schema, TEXT};
|
||||
use levenshtein_automata::Distance;
|
||||
use std::path::PathBuf;
|
||||
use std::str;
|
||||
|
||||
@@ -420,9 +452,14 @@ mod tests {
|
||||
|
||||
// We can now build an entire dfa.
|
||||
let lev_automaton_builder = LevenshteinAutomatonBuilder::new(2, true);
|
||||
let automaton = lev_automaton_builder.build_dfa("Spaen");
|
||||
|
||||
let mut range = term_dict.search(automaton).into_stream();
|
||||
let wrapped_dfa = WrappedDFA {
|
||||
dfa: lev_automaton_builder.build_dfa("Spaen"),
|
||||
condition: |distance| match distance {
|
||||
Distance::Exact(_) => true,
|
||||
Distance::AtLeast(_) => false,
|
||||
},
|
||||
};
|
||||
let mut range = term_dict.search(wrapped_dfa).into_stream();
|
||||
|
||||
// get the first finding
|
||||
assert!(range.advance());
|
||||
|
||||
@@ -1,6 +1,5 @@
|
||||
//! # Example
|
||||
//! ```
|
||||
//! extern crate tantivy;
|
||||
//! ```rust
|
||||
//! use tantivy::tokenizer::*;
|
||||
//!
|
||||
//! # fn main() {
|
||||
|
||||
@@ -4,8 +4,7 @@
|
||||
//! You must define in your schema which tokenizer should be used for
|
||||
//! each of your fields :
|
||||
//!
|
||||
//! ```
|
||||
//! extern crate tantivy;
|
||||
//! ```rust
|
||||
//! use tantivy::schema::*;
|
||||
//!
|
||||
//! # fn main() {
|
||||
@@ -65,8 +64,6 @@
|
||||
//! For instance, the `en_stem` is defined as follows.
|
||||
//!
|
||||
//! ```rust
|
||||
//! # extern crate tantivy;
|
||||
//!
|
||||
//! use tantivy::tokenizer::*;
|
||||
//!
|
||||
//! # fn main() {
|
||||
@@ -80,8 +77,7 @@
|
||||
//! Once your tokenizer is defined, you need to
|
||||
//! register it with a name in your index's [`TokenizerManager`](./struct.TokenizerManager.html).
|
||||
//!
|
||||
//! ```
|
||||
//! # extern crate tantivy;
|
||||
//! ```rust
|
||||
//! # use tantivy::schema::Schema;
|
||||
//! # use tantivy::tokenizer::*;
|
||||
//! # use tantivy::Index;
|
||||
@@ -101,8 +97,7 @@
|
||||
//!
|
||||
//! # Example
|
||||
//!
|
||||
//! ```
|
||||
//! extern crate tantivy;
|
||||
//! ```rust
|
||||
//! use tantivy::schema::{Schema, IndexRecordOption, TextOptions, TextFieldIndexing};
|
||||
//! use tantivy::tokenizer::*;
|
||||
//! use tantivy::Index;
|
||||
@@ -155,7 +150,6 @@ pub use self::simple_tokenizer::SimpleTokenizer;
|
||||
pub use self::stemmer::{Language, Stemmer};
|
||||
pub use self::stop_word_filter::StopWordFilter;
|
||||
pub(crate) use self::token_stream_chain::TokenStreamChain;
|
||||
pub(crate) use self::tokenizer::box_tokenizer;
|
||||
pub use self::tokenizer::BoxedTokenizer;
|
||||
|
||||
pub use self::tokenizer::{Token, TokenFilter, TokenStream, Tokenizer};
|
||||
|
||||
@@ -29,8 +29,7 @@ use super::{Token, TokenStream, Tokenizer};
|
||||
///
|
||||
/// # Example
|
||||
///
|
||||
/// ```
|
||||
/// # extern crate tantivy;
|
||||
/// ```rust
|
||||
/// use tantivy::tokenizer::*;
|
||||
/// # fn main() {
|
||||
/// let tokenizer = NgramTokenizer::new(2, 3, false);
|
||||
|
||||
Some files were not shown because too many files have changed in this diff.