Mirror of https://github.com/quickwit-oss/tantivy.git, synced 2026-01-08 01:52:54 +00:00

Compare commits (17 commits)
Commits compared (SHA1):
88fd7f091a, 6e4fdfd4bf, 0519056bd8, 7305ad575e, 79f64ac2f4, 67bce6cbf2,
e5316a4388, 6a8a8557d2, 3a65dc84c8, ce42bbf5c9, 7b21b3f25a, 46caec1040,
1187a02a3e, f6c525b19e, 4a8f7712f3, 2f867aad17, 5c6580eb15
.github/FUNDING.yml (new file, vendored, 12 lines)
@@ -0,0 +1,12 @@
+# These are supported funding model platforms
+
+github: fulmicoton
+patreon: # Replace with a single Patreon username
+open_collective: # Replace with a single Open Collective username
+ko_fi: # Replace with a single Ko-fi username
+tidelift: # Replace with a single Tidelift platform-name/package-name e.g., npm/babel
+community_bridge: # Replace with a single Community Bridge project-name e.g., cloud-foundry
+liberapay: # Replace with a single Liberapay username
+issuehunt: # Replace with a single IssueHunt username
+otechie: # Replace with a single Otechie username
+custom: # Replace with up to 4 custom sponsorship URLs e.g., ['link1', 'link2']
@@ -9,6 +9,10 @@ Tantivy 0.11.0
 - API change around `Box<BoxableTokenizer>`. See detail in #629
 - Avoid rebuilding Regex automaton whenever a regex query is reused. #639 (@brainlock)
 - Add footer with some metadata to index files. #605 (@fdb-hiroshima)
+- TopDocs collector: ensure stable sorting on equal score. #671 (@brainlock)
+- Added handling of pre-tokenized text fields (#642), which will enable users to
+  load tokens created outside tantivy. See usage in examples/pre_tokenized_text. (@kkoziara)
+- Fix crash when committing multiple times with deleted documents. #681 (@brainlock)

 ## How to update?

Cargo.toml (16 lines changed)

@@ -13,7 +13,7 @@ keywords = ["search", "information", "retrieval"]
 edition = "2018"

 [dependencies]
-base64 = "0.10.0"
+base64 = "0.11.0"
 byteorder = "1.0"
 crc32fast = "1.2.0"
 once_cell = "1.0"
@@ -34,7 +34,7 @@ itertools = "0.8"
 levenshtein_automata = {version="0.1", features=["fst_automaton"]}
 notify = {version="4", optional=true}
 bit-set = "0.5"
-uuid = { version = "0.7.2", features = ["v4", "serde"] }
+uuid = { version = "0.8", features = ["v4", "serde"] }
 crossbeam = "0.7"
 futures = "0.1"
 futures-cpupool = "0.1"
@@ -50,10 +50,10 @@ owned-read = "0.4"
 failure = "0.1"
 htmlescape = "0.3.1"
 fail = "0.3"
-scoped-pool = "1.0"
 murmurhash32 = "0.2"
 chrono = "0.4"
-smallvec = "0.6"
+smallvec = "1.0"
+rayon = "1"

 [target.'cfg(windows)'.dependencies]
 winapi = "0.3"
@@ -64,6 +64,10 @@ maplit = "1"
 matches = "0.1.8"
 time = "0.1.42"

+[dev-dependencies.fail]
+version = "0.3"
+features = ["failpoints"]
+
 [profile.release]
 opt-level = 3
 debug = false
@@ -87,10 +91,6 @@ members = ["query-grammar"]
 [badges]
 travis-ci = { repository = "tantivy-search/tantivy" }

-[dev-dependencies.fail]
-version = "0.3"
-features = ["failpoints"]
-
 # Following the "fail" crate best practises, we isolate
 # tests that define specific behavior in fail check points
 # in a different binary.
README.md (57 lines changed)

@@ -21,9 +21,9 @@
 [](https://www.patreon.com/fulmicoton)


-**Tantivy** is a **full text search engine library** written in rust.
+**Tantivy** is a **full text search engine library** written in Rust.

-It is closer to [Apache Lucene](https://lucene.apache.org/) than to [Elasticsearch](https://www.elastic.co/products/elasticsearch) and [Apache Solr](https://lucene.apache.org/solr/) in the sense it is not
+It is closer to [Apache Lucene](https://lucene.apache.org/) than to [Elasticsearch](https://www.elastic.co/products/elasticsearch) or [Apache Solr](https://lucene.apache.org/solr/) in the sense it is not
 an off-the-shelf search engine server, but rather a crate that can be used
 to build such a search engine.

@@ -31,7 +31,7 @@ Tantivy is, in fact, strongly inspired by Lucene's design.

 # Benchmark

-Tantivy is typically faster than Lucene, but the results will depend on
+Tantivy is typically faster than Lucene, but the results depend on
 the nature of the queries in your workload.

 The following [benchmark](https://tantivy-search.github.io/bench/) break downs
@@ -40,19 +40,19 @@ performance for different type of queries / collection.
 # Features

 - Full-text search
-- Configurable tokenizer. (stemming available for 17 latin languages. Third party support for Chinese ([tantivy-jieba](https://crates.io/crates/tantivy-jieba) and [cang-jie](https://crates.io/crates/cang-jie)) and [Japanese](https://crates.io/crates/tantivy-tokenizer-tiny-segmenter)
+- Configurable tokenizer (stemming available for 17 Latin languages with third party support for Chinese ([tantivy-jieba](https://crates.io/crates/tantivy-jieba) and [cang-jie](https://crates.io/crates/cang-jie)) and [Japanese](https://crates.io/crates/tantivy-tokenizer-tiny-segmenter))
 - Fast (check out the :racehorse: :sparkles: [benchmark](https://tantivy-search.github.io/bench/) :sparkles: :racehorse:)
 - Tiny startup time (<10ms), perfect for command line tools
-- BM25 scoring (the same as lucene)
-- Natural query language `(michael AND jackson) OR "king of pop"`
-- Phrase queries search (`"michael jackson"`)
+- BM25 scoring (the same as Lucene)
+- Natural query language (e.g. `(michael AND jackson) OR "king of pop"`)
+- Phrase queries search (e.g. `"michael jackson"`)
 - Incremental indexing
 - Multithreaded indexing (indexing English Wikipedia takes < 3 minutes on my desktop)
 - Mmap directory
-- SIMD integer compression when the platform/CPU includes the SSE2 instruction set.
-- Single valued and multivalued u64, i64 and f64 fast fields (equivalent of doc values in Lucene)
+- SIMD integer compression when the platform/CPU includes the SSE2 instruction set
+- Single valued and multivalued u64, i64, and f64 fast fields (equivalent of doc values in Lucene)
 - `&[u8]` fast fields
-- Text, i64, u64, f64, dates and hierarchical facet fields
+- Text, i64, u64, f64, dates, and hierarchical facet fields
 - LZ4 compressed document store
 - Range queries
 - Faceted search
@@ -61,43 +61,42 @@ performance for different type of queries / collection.

 # Non-features

-- Distributed search is out of the scope of tantivy. That being said, tantivy is meant as a
+- Distributed search is out of the scope of Tantivy. That being said, Tantivy is a
 library upon which one could build a distributed search. Serializable/mergeable collector state for instance,
-are within the scope of tantivy.
+are within the scope of Tantivy.

 # Supported OS and compiler

-Tantivy works on stable rust (>= 1.27) and supports Linux, MacOS and Windows.
+Tantivy works on stable Rust (>= 1.27) and supports Linux, MacOS, and Windows.

 # Getting started

-- [tantivy's simple search example](https://tantivy-search.github.io/examples/basic_search.html)
-- [tantivy-cli and its tutorial](https://github.com/tantivy-search/tantivy-cli).
-`tantivy-cli` is an actual command line interface that makes it easy for you to create a search engine,
-index documents and search via the CLI or a small server with a REST API.
-It will walk you through getting a wikipedia search engine up and running in a few minutes.
-- [reference doc for the last released version](https://docs.rs/tantivy/)
+- [Tantivy's simple search example](https://tantivy-search.github.io/examples/basic_search.html)
+- [tantivy-cli and its tutorial](https://github.com/tantivy-search/tantivy-cli) - `tantivy-cli` is an actual command line interface that makes it easy for you to create a search engine,
+index documents, and search via the CLI or a small server with a REST API.
+It walks you through getting a wikipedia search engine up and running in a few minutes.
+- [Reference doc for the last released version](https://docs.rs/tantivy/)

 # How can I support this project?

 There are many ways to support this project.

-- Use tantivy and tell us about your experience on [gitter](https://gitter.im/tantivy-search/tantivy) or by email (paul.masurel@gmail.com)
+- Use Tantivy and tell us about your experience on [Gitter](https://gitter.im/tantivy-search/tantivy) or by email (paul.masurel@gmail.com)
 - Report bugs
 - Write a blog post
 - Help with documentation by asking questions or submitting PRs
-- Contribute code (you can join [our gitter](https://gitter.im/tantivy-search/tantivy) )
-- Talk about tantivy around you
+- Contribute code (you can join [our Gitter](https://gitter.im/tantivy-search/tantivy))
+- Talk about Tantivy around you
 - Drop a word on on [](https://saythanks.io/to/fulmicoton) or even [](https://www.patreon.com/fulmicoton)

 # Contributing code

-We use the GitHub Pull Request workflow - reference a GitHub ticket and/or include a comprehensive commit message when opening a PR.
+We use the GitHub Pull Request workflow: reference a GitHub ticket and/or include a comprehensive commit message when opening a PR.

 ## Clone and build locally

-Tantivy compiles on stable rust but requires `Rust >= 1.27`.
-To check out and run tests, you can simply run :
+Tantivy compiles on stable Rust but requires `Rust >= 1.27`.
+To check out and run tests, you can simply run:

 ```bash
 git clone https://github.com/tantivy-search/tantivy.git
@@ -108,7 +107,7 @@ To check out and run tests, you can simply run :
 ## Run tests

 Some tests will not run with just `cargo test` because of `fail-rs`.
-To run the tests exhaustively, run `./run-tests.sh`
+To run the tests exhaustively, run `./run-tests.sh`.

 ## Debug

@@ -116,13 +115,13 @@ You might find it useful to step through the programme with a debugger.

 ### A failing test

-Make sure you haven't run `cargo clean` after the most recent `cargo test` or `cargo build` to guarantee that `target/` dir exists. Use this bash script to find the most name of the most recent debug build of tantivy and run it under rust-gdb.
+Make sure you haven't run `cargo clean` after the most recent `cargo test` or `cargo build` to guarantee that the `target/` directory exists. Use this bash script to find the name of the most recent debug build of Tantivy and run it under `rust-gdb`:

 ```bash
 find target/debug/ -maxdepth 1 -executable -type f -name "tantivy*" -printf '%TY-%Tm-%Td %TT %p\n' | sort -r | cut -d " " -f 3 | xargs -I RECENT_DBG_TANTIVY rust-gdb RECENT_DBG_TANTIVY
 ```

-Now that you are in rust-gdb, you can set breakpoints on lines and methods that match your source-code and run the debug executable with flags that you normally pass to `cargo test` to like this
+Now that you are in `rust-gdb`, you can set breakpoints on lines and methods that match your source code and run the debug executable with flags that you normally pass to `cargo test` like this:

 ```bash
 $gdb run --test-threads 1 --test $NAME_OF_TEST
@@ -130,7 +129,7 @@ $gdb run --test-threads 1 --test $NAME_OF_TEST

 ### An example

-By default, rustc compiles everything in the `examples/` dir in debug mode. This makes it easy for you to make examples to reproduce bugs.
+By default, `rustc` compiles everything in the `examples/` directory in debug mode. This makes it easy for you to make examples to reproduce bugs:

 ```bash
 rust-gdb target/debug/examples/$EXAMPLE_NAME
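The "Getting started" links point at the basic search example. As a quick orientation, here is a minimal sketch of that flow, an in-RAM index, two documents, and one term query, written against the 0.11-era API used elsewhere in this changeset (a sketch for context, not part of the diff itself):

```rust
use tantivy::collector::TopDocs;
use tantivy::query::TermQuery;
use tantivy::schema::{IndexRecordOption, Schema, STORED, TEXT};
use tantivy::{doc, Index, Term};

fn main() -> tantivy::Result<()> {
    // Build a one-field schema and an index living entirely in RAM.
    let mut schema_builder = Schema::builder();
    let title = schema_builder.add_text_field("title", TEXT | STORED);
    let schema = schema_builder.build();

    let index = Index::create_in_ram(schema.clone());
    let mut index_writer = index.writer(50_000_000)?;
    index_writer.add_document(doc!(title => "The Old Man and the Sea"));
    index_writer.add_document(doc!(title => "Of Mice and Men"));
    index_writer.commit()?;

    let searcher = index.reader()?.searcher();
    // The default tokenizer lowercases, so we query the lowercased term.
    let query = TermQuery::new(
        Term::from_field_text(title, "sea"),
        IndexRecordOption::Basic,
    );
    let top_docs = searcher.search(&query, &TopDocs::with_limit(10))?;
    for (_score, doc_address) in top_docs {
        println!("{}", schema.to_json(&searcher.doc(doc_address)?));
    }
    Ok(())
}
```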
examples/pre_tokenized_text.rs (new file, 140 lines)
@@ -0,0 +1,140 @@

// # Pre-tokenized text example
//
// This example shows how to use pre-tokenized text. Sometimes you might
// want to index and search through text which is already split into
// tokens by some external tool.
//
// In this example we will:
// - use tantivy tokenizer to create tokens and load them directly into tantivy,
// - import tokenized text straight from json,
// - perform a search on documents with pre-tokenized text

use tantivy::tokenizer::{PreTokenizedString, SimpleTokenizer, Token, TokenStream, Tokenizer};

use tantivy::collector::{Count, TopDocs};
use tantivy::query::TermQuery;
use tantivy::schema::*;
use tantivy::{doc, Index, ReloadPolicy};
use tempfile::TempDir;

fn pre_tokenize_text(text: &str) -> Vec<Token> {
    let mut token_stream = SimpleTokenizer.token_stream(text);
    let mut tokens = vec![];
    while token_stream.advance() {
        tokens.push(token_stream.token().clone());
    }
    tokens
}

fn main() -> tantivy::Result<()> {
    let index_path = TempDir::new()?;

    let mut schema_builder = Schema::builder();

    schema_builder.add_text_field("title", TEXT | STORED);
    schema_builder.add_text_field("body", TEXT);

    let schema = schema_builder.build();

    let index = Index::create_in_dir(&index_path, schema.clone())?;

    let mut index_writer = index.writer(50_000_000)?;

    // We can create a document manually, by setting the fields
    // one by one in a Document object.
    let title = schema.get_field("title").unwrap();
    let body = schema.get_field("body").unwrap();

    let title_text = "The Old Man and the Sea";
    let body_text = "He was an old man who fished alone in a skiff in the Gulf Stream";

    // Content of our first document
    // We create `PreTokenizedString` which contains original text and vector of tokens
    let title_tok = PreTokenizedString {
        text: String::from(title_text),
        tokens: pre_tokenize_text(title_text),
    };

    println!(
        "Original text: \"{}\" and tokens: {:?}",
        title_tok.text, title_tok.tokens
    );

    let body_tok = PreTokenizedString {
        text: String::from(body_text),
        tokens: pre_tokenize_text(body_text),
    };

    // Now let's create a document and add our `PreTokenizedString` using
    // `add_pre_tokenized_text` method of `Document`
    let mut old_man_doc = Document::default();
    old_man_doc.add_pre_tokenized_text(title, &title_tok);
    old_man_doc.add_pre_tokenized_text(body, &body_tok);

    // ... now let's just add it to the IndexWriter
    index_writer.add_document(old_man_doc);

    // Pretokenized text can also be fed as JSON
    let short_man_json = r#"{
        "title":[{
            "text":"The Old Man",
            "tokens":[
                {"offset_from":0,"offset_to":3,"position":0,"text":"The","position_length":1},
                {"offset_from":4,"offset_to":7,"position":1,"text":"Old","position_length":1},
                {"offset_from":8,"offset_to":11,"position":2,"text":"Man","position_length":1}
            ]
        }]
    }"#;

    let short_man_doc = schema.parse_document(&short_man_json)?;

    index_writer.add_document(short_man_doc);

    // Let's commit changes
    index_writer.commit()?;

    // ... and now is the time to query our index

    let reader = index
        .reader_builder()
        .reload_policy(ReloadPolicy::OnCommit)
        .try_into()?;

    let searcher = reader.searcher();

    // We want to get documents with token "Man", we will use TermQuery to do it
    // Using PreTokenizedString means the tokens are stored as is avoiding stemming
    // and lowercasing, which preserves full words in their original form
    let query = TermQuery::new(
        Term::from_field_text(title, "Man"),
        IndexRecordOption::Basic,
    );

    let (top_docs, count) = searcher
        .search(&query, &(TopDocs::with_limit(2), Count))
        .unwrap();

    assert_eq!(count, 2);

    for (_score, doc_address) in top_docs {
        let retrieved_doc = searcher.doc(doc_address)?;
        println!("Document: {}", schema.to_json(&retrieved_doc));
    }

    // Contrary to the previous query, when we search for the "man" term we
    // should get no results, as it's not one of the indexed tokens. SimpleTokenizer
    // only splits text on whitespace / punctuation.

    let query = TermQuery::new(
        Term::from_field_text(title, "man"),
        IndexRecordOption::Basic,
    );

    let (_top_docs, count) = searcher
        .search(&query, &(TopDocs::with_limit(2), Count))
        .unwrap();

    assert_eq!(count, 0);

    Ok(())
}
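The JSON fed to `parse_document` above maps one-to-one onto the fields of `Token`, so the same pre-tokenized title can also be built programmatically. A hedged sketch, assuming `Token`'s public fields are exactly the ones serialized in the example:

```rust
use tantivy::tokenizer::{PreTokenizedString, Token};

// Build "The Old Man" as three pre-computed tokens, mirroring the JSON payload.
fn the_old_man_title() -> PreTokenizedString {
    PreTokenizedString {
        text: "The Old Man".to_string(),
        tokens: vec![
            Token { offset_from: 0, offset_to: 3, position: 0, text: "The".to_string(), position_length: 1 },
            Token { offset_from: 4, offset_to: 7, position: 1, text: "Old".to_string(), position_length: 1 },
            Token { offset_from: 8, offset_to: 11, position: 2, text: "Man".to_string(), position_length: 1 },
        ],
    }
}
```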
@@ -2,7 +2,7 @@ use std::fmt;
 use std::fmt::Write;

 /// Defines whether a term in a query must be present,
-/// should be present or must not be present.
+/// should be present or must be not present.
 #[derive(Debug, Clone, Hash, Copy, Eq, PartialEq)]
 pub enum Occur {
     /// For a given document to be considered for scoring,
@@ -515,7 +515,7 @@ mod tests {
     #[should_panic(expected = "Tried to add a facet which is a descendant of \
                                an already added facet.")]
     fn test_misused_facet_collector() {
-        let mut facet_collector = FacetCollector::for_field(Field(0));
+        let mut facet_collector = FacetCollector::for_field(Field::from_field_id(0));
         facet_collector.add_facet(Facet::from("/country"));
         facet_collector.add_facet(Facet::from("/country/europe"));
     }
@@ -546,7 +546,7 @@ mod tests {

     #[test]
     fn test_non_used_facet_collector() {
-        let mut facet_collector = FacetCollector::for_field(Field(0));
+        let mut facet_collector = FacetCollector::for_field(Field::from_field_id(0));
         facet_collector.add_facet(Facet::from("/country"));
         facet_collector.add_facet(Facet::from("/countryeurope"));
     }
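These tests switch from the `Field(0)` tuple-struct constructor to `Field::from_field_id(0)`, a change repeated across the rest of this changeset. A short sketch of the same call outside the test module, assuming the constructor is public as it is in later tantivy releases:

```rust
use tantivy::collector::FacetCollector;
use tantivy::schema::{Facet, Field};

// Collect counts under /country, addressing the facet field by its id,
// exactly as the updated tests do (0.11-era API: for_field takes a Field).
fn country_facet_collector() -> FacetCollector {
    let mut facet_collector = FacetCollector::for_field(Field::from_field_id(0));
    facet_collector.add_facet(Facet::from("/country"));
    facet_collector
}
```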
@@ -12,6 +12,9 @@ use std::collections::BinaryHeap;
 /// It has a custom implementation of `PartialOrd` that reverses the order. This is because the
 /// default Rust heap is a max heap, whereas a min heap is needed.
 ///
+/// Additionally, it guarantees stable sorting: in case of a tie on the feature, the document
+/// address is used.
+///
 /// WARNING: equality is not what you would expect here.
 /// Two elements are equal if their feature is equal, and regardless of whether `doc`
 /// is equal. This should be perfectly fine for this usage, but let's make sure this
@@ -21,29 +24,37 @@ struct ComparableDoc<T, D> {
     doc: D,
 }

-impl<T: PartialOrd, D> PartialOrd for ComparableDoc<T, D> {
+impl<T: PartialOrd, D: PartialOrd> PartialOrd for ComparableDoc<T, D> {
     fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
         Some(self.cmp(other))
     }
 }

-impl<T: PartialOrd, D> Ord for ComparableDoc<T, D> {
+impl<T: PartialOrd, D: PartialOrd> Ord for ComparableDoc<T, D> {
     #[inline]
     fn cmp(&self, other: &Self) -> Ordering {
-        other
+        // Reversed to make BinaryHeap work as a min-heap
+        let by_feature = other
             .feature
             .partial_cmp(&self.feature)
-            .unwrap_or_else(|| Ordering::Equal)
+            .unwrap_or(Ordering::Equal);
+
+        let lazy_by_doc_address = || self.doc.partial_cmp(&other.doc).unwrap_or(Ordering::Equal);
+
+        // In case of a tie on the feature, we sort by ascending
+        // `DocAddress` in order to ensure a stable sorting of the
+        // documents.
+        by_feature.then_with(lazy_by_doc_address)
     }
 }

-impl<T: PartialOrd, D> PartialEq for ComparableDoc<T, D> {
+impl<T: PartialOrd, D: PartialOrd> PartialEq for ComparableDoc<T, D> {
     fn eq(&self, other: &Self) -> bool {
         self.cmp(other) == Ordering::Equal
     }
 }

-impl<T: PartialOrd, D> Eq for ComparableDoc<T, D> {}
+impl<T: PartialOrd, D: PartialOrd> Eq for ComparableDoc<T, D> {}

 pub(crate) struct TopCollector<T> {
     limit: usize,
@@ -214,4 +225,94 @@ mod tests {
             ]
         );
     }
+
+    #[test]
+    fn test_top_segment_collector_stable_ordering_for_equal_feature() {
+        // given that the documents are collected in ascending doc id order,
+        // when harvesting we have to guarantee stable sorting in case of a tie
+        // on the score
+        let doc_ids_collection = [4, 5, 6];
+        let score = 3.14;
+
+        let mut top_collector_limit_2 = TopSegmentCollector::new(0, 2);
+        for id in &doc_ids_collection {
+            top_collector_limit_2.collect(*id, score);
+        }
+
+        let mut top_collector_limit_3 = TopSegmentCollector::new(0, 3);
+        for id in &doc_ids_collection {
+            top_collector_limit_3.collect(*id, score);
+        }
+
+        assert_eq!(
+            top_collector_limit_2.harvest(),
+            top_collector_limit_3.harvest()[..2].to_vec(),
+        );
+    }
+}
+
+#[cfg(all(test, feature = "unstable"))]
+mod bench {
+    use super::TopSegmentCollector;
+    use test::Bencher;
+
+    #[bench]
+    fn bench_top_segment_collector_collect_not_at_capacity(b: &mut Bencher) {
+        let mut top_collector = TopSegmentCollector::new(0, 400);
+
+        b.iter(|| {
+            for i in 0..100 {
+                top_collector.collect(i, 0.8);
+            }
+        });
+    }
+
+    #[bench]
+    fn bench_top_segment_collector_collect_at_capacity(b: &mut Bencher) {
+        let mut top_collector = TopSegmentCollector::new(0, 100);
+
+        for i in 0..100 {
+            top_collector.collect(i, 0.8);
+        }
+
+        b.iter(|| {
+            for i in 0..100 {
+                top_collector.collect(i, 0.8);
+            }
+        });
+    }
+
+    #[bench]
+    fn bench_top_segment_collector_collect_and_harvest_many_ties(b: &mut Bencher) {
+        b.iter(|| {
+            let mut top_collector = TopSegmentCollector::new(0, 100);
+
+            for i in 0..100 {
+                top_collector.collect(i, 0.8);
+            }
+
+            // it would be nice to be able to do the setup N times but still
+            // measure only harvest(). We can't since harvest() consumes
+            // the top_collector.
+            top_collector.harvest()
+        });
+    }
+
+    #[bench]
+    fn bench_top_segment_collector_collect_and_harvest_no_tie(b: &mut Bencher) {
+        b.iter(|| {
+            let mut top_collector = TopSegmentCollector::new(0, 100);
+            let mut score = 1.0;

+            for i in 0..100 {
+                score += 1.0;
+                top_collector.collect(i, score);
+            }
+
+            // it would be nice to be able to do the setup N times but still
+            // measure only harvest(). We can't since harvest() consumes
+            // the top_collector.
+            top_collector.harvest()
+        });
+    }
 }
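The tie-break added to `ComparableDoc::cmp` is the whole stable-sorting mechanism: the feature comparison is reversed so the standard max-heap acts as a min-heap of the current top `K`, and equal features fall back to the ascending document id. A standalone sketch of the same trick with toy types (not tantivy's):

```rust
use std::cmp::Ordering;
use std::collections::BinaryHeap;

#[derive(Debug, PartialEq)]
struct ComparableDoc {
    feature: f32,
    doc: u32,
}

impl Eq for ComparableDoc {}

impl PartialOrd for ComparableDoc {
    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
        Some(self.cmp(other))
    }
}

impl Ord for ComparableDoc {
    fn cmp(&self, other: &Self) -> Ordering {
        // Reversed on the feature: the "greatest" heap element is the
        // lowest-scoring candidate, i.e. the one to evict first.
        let by_feature = other
            .feature
            .partial_cmp(&self.feature)
            .unwrap_or(Ordering::Equal);
        // Tie-break on ascending doc id for a stable result.
        by_feature.then_with(|| self.doc.cmp(&other.doc))
    }
}

fn main() {
    let mut heap = BinaryHeap::new();
    for &(doc, feature) in [(4u32, 1.0f32), (5, 1.0), (6, 1.0)].iter() {
        heap.push(ComparableDoc { feature, doc });
    }
    // Equal features: the tie-break makes the highest doc id pop first,
    // so truncating a K-sized heap always evicts the same documents.
    assert_eq!(heap.pop().map(|d| d.doc), Some(6));
}
```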
@@ -15,13 +15,16 @@ use crate::SegmentLocalId;
 use crate::SegmentReader;
 use std::fmt;

-/// The Top Score Collector keeps track of the K documents
+/// The `TopDocs` collector keeps track of the top `K` documents
 /// sorted by their score.
 ///
 /// The implementation is based on a `BinaryHeap`.
 /// The theorical complexity for collecting the top `K` out of `n` documents
 /// is `O(n log K)`.
 ///
+/// This collector guarantees a stable sorting in case of a tie on the
+/// document score. As such, it is suitable to implement pagination.
+///
 /// ```rust
 /// use tantivy::collector::TopDocs;
 /// use tantivy::query::QueryParser;
@@ -428,12 +431,13 @@ impl SegmentCollector for TopScoreSegmentCollector {
 mod tests {
     use super::TopDocs;
     use crate::collector::Collector;
-    use crate::query::{Query, QueryParser};
+    use crate::query::{AllQuery, Query, QueryParser};
     use crate::schema::{Field, Schema, FAST, STORED, TEXT};
     use crate::DocAddress;
     use crate::Index;
     use crate::IndexWriter;
     use crate::Score;
+    use itertools::Itertools;

     fn make_index() -> Index {
         let mut schema_builder = Schema::builder();
@@ -494,6 +498,29 @@ mod tests {
         );
     }

+    #[test]
+    fn test_top_collector_stable_sorting() {
+        let index = make_index();
+
+        // using AllQuery to get a constant score
+        let searcher = index.reader().unwrap().searcher();
+
+        let page_1 = searcher.search(&AllQuery, &TopDocs::with_limit(2)).unwrap();
+
+        let page_2 = searcher.search(&AllQuery, &TopDocs::with_limit(3)).unwrap();
+
+        // precondition for the test to be meaningful: we did get documents
+        // with the same score
+        assert!(page_1.iter().map(|result| result.0).all_equal());
+        assert!(page_2.iter().map(|result| result.0).all_equal());
+
+        // sanity check since we're relying on make_index()
+        assert_eq!(page_1.len(), 2);
+        assert_eq!(page_2.len(), 3);
+
+        assert_eq!(page_1, &page_2[..page_1.len()]);
+    }
+
     #[test]
     #[should_panic]
     fn test_top_0() {
@@ -551,7 +578,7 @@ mod tests {
             ));
         });
         let searcher = index.reader().unwrap().searcher();
-        let top_collector = TopDocs::with_limit(4).order_by_u64_field(Field(2));
+        let top_collector = TopDocs::with_limit(4).order_by_u64_field(Field::from_field_id(2));
         let segment_reader = searcher.segment_reader(0u32);
         top_collector
             .for_segment(0, segment_reader)
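Because ties now resolve deterministically, requesting a larger limit with the same query returns the smaller result as a prefix, which is what makes limit-and-slice pagination sound. A hedged sketch building on the API used in the test above (`fetch_page` is a hypothetical helper, not a tantivy API):

```rust
use tantivy::collector::TopDocs;
use tantivy::query::AllQuery;
use tantivy::{DocAddress, Index, Score};

// Serve page `page` of `page_size` hits: ask for (page + 1) * page_size hits
// and slice off the earlier pages. Stable tie-breaking guarantees that the
// slice does not shuffle between requests.
fn fetch_page(
    index: &Index,
    page: usize,
    page_size: usize,
) -> tantivy::Result<Vec<(Score, DocAddress)>> {
    let searcher = index.reader()?.searcher();
    let hits = searcher.search(&AllQuery, &TopDocs::with_limit((page + 1) * page_size))?;
    Ok(hits.into_iter().skip(page * page_size).collect())
}
```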
@@ -199,13 +199,13 @@ mod test {
         let w = directory.open_write(path).unwrap();
         let mut composite_write = CompositeWrite::wrap(w);
         {
-            let mut write_0 = composite_write.for_field(Field(0u32));
+            let mut write_0 = composite_write.for_field(Field::from_field_id(0u32));
             VInt(32431123u64).serialize(&mut write_0).unwrap();
             write_0.flush().unwrap();
         }

         {
-            let mut write_4 = composite_write.for_field(Field(4u32));
+            let mut write_4 = composite_write.for_field(Field::from_field_id(4u32));
             VInt(2).serialize(&mut write_4).unwrap();
             write_4.flush().unwrap();
         }
@@ -215,14 +215,18 @@ mod test {
         let r = directory.open_read(path).unwrap();
         let composite_file = CompositeFile::open(&r).unwrap();
         {
-            let file0 = composite_file.open_read(Field(0u32)).unwrap();
+            let file0 = composite_file
+                .open_read(Field::from_field_id(0u32))
+                .unwrap();
             let mut file0_buf = file0.as_slice();
             let payload_0 = VInt::deserialize(&mut file0_buf).unwrap().0;
             assert_eq!(file0_buf.len(), 0);
             assert_eq!(payload_0, 32431123u64);
         }
         {
-            let file4 = composite_file.open_read(Field(4u32)).unwrap();
+            let file4 = composite_file
+                .open_read(Field::from_field_id(4u32))
+                .unwrap();
             let mut file4_buf = file4.as_slice();
             let payload_4 = VInt::deserialize(&mut file4_buf).unwrap().0;
             assert_eq!(file4_buf.len(), 0);
@@ -1,6 +1,6 @@
 use crate::Result;
 use crossbeam::channel;
-use scoped_pool::{Pool, ThreadConfig};
+use rayon::{ThreadPool, ThreadPoolBuilder};

 /// Search executor whether search request are single thread or multithread.
 ///
@@ -11,7 +11,7 @@ use scoped_pool::{Pool, ThreadConfig};
 /// used by the client. Second, we may stop using rayon in the future.
 pub enum Executor {
     SingleThread,
-    ThreadPool(Pool),
+    ThreadPool(ThreadPool),
 }

 impl Executor {
@@ -21,10 +21,12 @@ impl Executor {
     }

     // Creates an Executor that dispatches the tasks in a thread pool.
-    pub fn multi_thread(num_threads: usize, prefix: &'static str) -> Executor {
-        let thread_config = ThreadConfig::new().prefix(prefix);
-        let pool = Pool::with_thread_config(num_threads, thread_config);
-        Executor::ThreadPool(pool)
+    pub fn multi_thread(num_threads: usize, prefix: &'static str) -> Result<Executor> {
+        let pool = ThreadPoolBuilder::new()
+            .num_threads(num_threads)
+            .thread_name(move |num| format!("{}{}", prefix, num))
+            .build()?;
+        Ok(Executor::ThreadPool(pool))
     }

     // Perform a map in the thread pool.
@@ -48,9 +50,9 @@ impl Executor {
         let num_fruits = args_with_indices.len();
         let fruit_receiver = {
             let (fruit_sender, fruit_receiver) = channel::unbounded();
-            pool.scoped(|scope| {
+            pool.scope(|scope| {
                 for arg_with_idx in args_with_indices {
-                    scope.execute(|| {
+                    scope.spawn(|_| {
                         let (idx, arg) = arg_with_idx;
                         let fruit = f(arg);
                         if let Err(err) = fruit_sender.send((idx, fruit)) {
@@ -103,6 +105,7 @@ mod tests {
     #[should_panic] //< unfortunately the panic message is not propagated
     fn test_panic_propagates_multi_thread() {
         let _result: Vec<usize> = Executor::multi_thread(1, "search-test")
+            .unwrap()
             .map(
                 |_| {
                     panic!("panic should propagate");
@@ -126,6 +129,7 @@ mod tests {
     #[test]
     fn test_map_multithread() {
         let result: Vec<usize> = Executor::multi_thread(3, "search-test")
+            .unwrap()
             .map(|i| Ok(i * 2), 0..10)
             .unwrap();
         assert_eq!(result.len(), 10);
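The executor moves from `scoped-pool` to rayon: a named pool built once, then `scope`/`spawn` for borrowing tasks, with results funneled through a crossbeam channel just as in `map` above. A self-contained sketch of that pattern (not tantivy's code):

```rust
use crossbeam::channel;
use rayon::ThreadPoolBuilder;

fn main() -> Result<(), rayon::ThreadPoolBuildError> {
    // Build a pool with named worker threads, as Executor::multi_thread now does.
    let pool = ThreadPoolBuilder::new()
        .num_threads(3)
        .thread_name(|num| format!("thrd-tantivy-search-{}", num))
        .build()?;

    let (sender, receiver) = channel::unbounded();
    pool.scope(|scope| {
        for idx in 0..10u32 {
            // Each spawned task sends its (index, result) pair; order of arrival
            // is arbitrary, which is why the index is carried along.
            let sender = sender.clone();
            scope.spawn(move |_| {
                sender.send((idx, idx * 2)).unwrap();
            });
        }
    });
    drop(sender); // close the channel so the receiver iteration terminates

    let mut results: Vec<(u32, u32)> = receiver.iter().collect();
    results.sort();
    assert_eq!(results[3], (3, 6));
    Ok(())
}
```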
@@ -73,15 +73,16 @@ impl Index {

     /// Replace the default single thread search executor pool
     /// by a thread pool with a given number of threads.
-    pub fn set_multithread_executor(&mut self, num_threads: usize) {
-        self.executor = Arc::new(Executor::multi_thread(num_threads, "thrd-tantivy-search-"));
+    pub fn set_multithread_executor(&mut self, num_threads: usize) -> Result<()> {
+        self.executor = Arc::new(Executor::multi_thread(num_threads, "thrd-tantivy-search-")?);
+        Ok(())
     }

     /// Replace the default single thread search executor pool
     /// by a thread pool with a given number of threads.
-    pub fn set_default_multithread_executor(&mut self) {
+    pub fn set_default_multithread_executor(&mut self) -> Result<()> {
         let default_num_threads = num_cpus::get();
-        self.set_multithread_executor(default_num_threads);
+        self.set_multithread_executor(default_num_threads)
     }

     /// Creates a new index using the `RAMDirectory`.
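Since building the rayon pool can fail, both setters now return `Result<()>`, so callers propagate the error instead of silently ignoring it. A minimal usage sketch (hypothetical helper function, real `Index` method):

```rust
// Switch an existing index to a four-thread search executor, surfacing
// pool-construction failures to the caller.
fn use_four_search_threads(index: &mut tantivy::Index) -> tantivy::Result<()> {
    index.set_multithread_executor(4)?;
    Ok(())
}
```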
@@ -150,6 +150,21 @@ impl SegmentMeta {
         self.num_deleted_docs() > 0
     }

+    /// Updates the max_doc value from the `SegmentMeta`.
+    ///
+    /// This method is only used when updating `max_doc` from 0
+    /// as we finalize a fresh new segment.
+    pub(crate) fn with_max_doc(self, max_doc: u32) -> SegmentMeta {
+        assert_eq!(self.tracked.max_doc, 0);
+        assert!(self.tracked.deletes.is_none());
+        let tracked = self.tracked.map(move |inner_meta| InnerSegmentMeta {
+            segment_id: inner_meta.segment_id,
+            max_doc,
+            deletes: None,
+        });
+        SegmentMeta { tracked }
+    }
+
     #[doc(hidden)]
     pub fn with_delete_meta(self, num_deleted_docs: u32, opstamp: Opstamp) -> SegmentMeta {
         let delete_meta = DeleteMeta {
@@ -50,6 +50,17 @@ impl Segment {
         &self.meta
     }

+    /// Updates the max_doc value from the `SegmentMeta`.
+    ///
+    /// This method is only used when updating `max_doc` from 0
+    /// as we finalize a fresh new segment.
+    pub(crate) fn with_max_doc(self, max_doc: u32) -> Segment {
+        Segment {
+            index: self.index,
+            meta: self.meta.with_max_doc(max_doc),
+        }
+    }
+
     #[doc(hidden)]
     pub fn with_delete_meta(self, num_deleted_docs: u32, opstamp: Opstamp) -> Segment {
         Segment {
@@ -76,7 +76,7 @@ impl SegmentId {
 }

 /// Error type used when parsing a `SegmentId` from a string fails.
-pub struct SegmentIdParseError(uuid::parser::ParseError);
+pub struct SegmentIdParseError(uuid::Error);

 impl Error for SegmentIdParseError {}

@@ -327,8 +327,7 @@ mod tests_mmap_specific {
             .unwrap();
         assert!(managed_directory.exists(test_path1));
         assert!(managed_directory.exists(test_path2));
-        let living_files: HashSet<PathBuf> =
-            [test_path1.to_owned()].into_iter().cloned().collect();
+        let living_files: HashSet<PathBuf> = [test_path1.to_owned()].iter().cloned().collect();
         managed_directory.garbage_collect(|| living_files);
         assert!(managed_directory.exists(test_path1));
         assert!(!managed_directory.exists(test_path2));
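The likely motivation for the `.into_iter()` to `.iter()` switch (an assumption, the commit does not state its reasoning): on arrays, `into_iter()` historically auto-referenced through the slice and yielded references anyway, so the explicit form says what actually happens and sidesteps the corresponding lint. A small illustration:

```rust
use std::collections::HashSet;

fn main() {
    // `[x].into_iter()` on old editions resolves to the slice iterator and yields
    // `&String`, so `.iter().cloned()` is the explicit, future-proof spelling.
    let living_files: HashSet<String> = ["meta.json".to_string()].iter().cloned().collect();
    assert!(living_files.contains("meta.json"));
}
```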
@@ -170,3 +170,9 @@ impl From<serde_json::Error> for TantivyError {
         TantivyError::IOError(io_err.into())
     }
 }
+
+impl From<rayon::ThreadPoolBuildError> for TantivyError {
+    fn from(error: rayon::ThreadPoolBuildError) -> TantivyError {
+        TantivyError::SystemError(error.to_string())
+    }
+}
@@ -10,11 +10,14 @@ use std::io::Write;
 /// Write a delete `BitSet`
 ///
 /// where `delete_bitset` is the set of deleted `DocId`.
-pub fn write_delete_bitset(delete_bitset: &BitSet, writer: &mut WritePtr) -> io::Result<()> {
-    let max_doc = delete_bitset.capacity();
+pub fn write_delete_bitset(
+    delete_bitset: &BitSet,
+    max_doc: u32,
+    writer: &mut WritePtr,
+) -> io::Result<()> {
     let mut byte = 0u8;
     let mut shift = 0u8;
-    for doc in 0..max_doc {
+    for doc in 0..(max_doc as usize) {
         if delete_bitset.contains(doc) {
             byte |= 1 << shift;
         }
@@ -86,18 +89,17 @@ mod tests {
     use bit_set::BitSet;
     use std::path::PathBuf;

-    fn test_delete_bitset_helper(bitset: &BitSet) {
+    fn test_delete_bitset_helper(bitset: &BitSet, max_doc: u32) {
         let test_path = PathBuf::from("test");
         let mut directory = RAMDirectory::create();
         {
             let mut writer = directory.open_write(&*test_path).unwrap();
-            write_delete_bitset(bitset, &mut writer).unwrap();
+            write_delete_bitset(bitset, max_doc, &mut writer).unwrap();
         }
         {
             let source = directory.open_read(&test_path).unwrap();
             let delete_bitset = DeleteBitSet::open(source);
-            let n = bitset.capacity();
-            for doc in 0..n {
+            for doc in 0..max_doc as usize {
                 assert_eq!(bitset.contains(doc), delete_bitset.is_deleted(doc as DocId));
             }
             assert_eq!(delete_bitset.len(), bitset.len());
@@ -110,7 +112,7 @@ mod tests {
             let mut bitset = BitSet::with_capacity(10);
             bitset.insert(1);
             bitset.insert(9);
-            test_delete_bitset_helper(&bitset);
+            test_delete_bitset_helper(&bitset, 10);
         }
         {
             let mut bitset = BitSet::with_capacity(8);
@@ -119,7 +121,7 @@ mod tests {
             bitset.insert(3);
             bitset.insert(5);
             bitset.insert(7);
-            test_delete_bitset_helper(&bitset);
+            test_delete_bitset_helper(&bitset, 8);
         }
     }
 }
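The writer now receives `max_doc` explicitly instead of deriving it from `BitSet::capacity()`, which need not match the number of documents in the segment. A standalone sketch of the same one-bit-per-document packing (hypothetical helper, not tantivy's function):

```rust
fn pack_delete_bitset(deleted: &[u32], max_doc: u32) -> Vec<u8> {
    // One bit per document, least-significant bit first, sized from max_doc
    // (not from a bitset capacity), which is the point of the new parameter.
    let mut bytes = vec![0u8; (max_doc as usize + 7) / 8];
    for &doc in deleted {
        assert!(doc < max_doc);
        bytes[(doc / 8) as usize] |= 1 << (doc % 8);
    }
    bytes
}

fn main() {
    // 10 documents, docs 1 and 9 deleted -> 0b0000_0010 in each of the two bytes.
    assert_eq!(pack_delete_bitset(&[1, 9], 10), vec![0b0000_0010, 0b0000_0010]);
}
```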
@@ -59,8 +59,7 @@ impl FastFieldReaders {
             fast_bytes: Default::default(),
             fast_fields_composite: fast_fields_composite.clone(),
         };
-        for (field_id, field_entry) in schema.fields().iter().enumerate() {
-            let field = Field(field_id as u32);
+        for (field, field_entry) in schema.fields() {
             let field_type = field_entry.field_type();
             if field_type == &FieldType::Bytes {
                 let idx_reader = fast_fields_composite
@@ -24,8 +24,7 @@ impl FastFieldsWriter {
         let mut multi_values_writers = Vec::new();
         let mut bytes_value_writers = Vec::new();

-        for (field_id, field_entry) in schema.fields().iter().enumerate() {
-            let field = Field(field_id as u32);
+        for (field, field_entry) in schema.fields() {
             let default_value = match *field_entry.field_type() {
                 FieldType::I64(_) => common::i64_to_u64(0i64),
                 FieldType::F64(_) => common::f64_to_u64(0.0f64),
@@ -22,11 +22,14 @@ impl FieldNormsWriter {
     pub(crate) fn fields_with_fieldnorm(schema: &Schema) -> Vec<Field> {
         schema
             .fields()
-            .iter()
-            .enumerate()
-            .filter(|&(_, field_entry)| field_entry.is_indexed())
-            .map(|(field, _)| Field(field as u32))
-            .collect::<Vec<Field>>()
+            .filter_map(|(field, field_entry)| {
+                if field_entry.is_indexed() {
+                    Some(field)
+                } else {
+                    None
+                }
+            })
+            .collect::<Vec<_>>()
     }

     /// Initialize with state for tracking the field norm fields
@@ -35,7 +38,7 @@ impl FieldNormsWriter {
         let fields = FieldNormsWriter::fields_with_fieldnorm(schema);
         let max_field = fields
             .iter()
-            .map(|field| field.0)
+            .map(Field::field_id)
             .max()
             .map(|max_field_id| max_field_id as usize + 1)
             .unwrap_or(0);
@@ -50,8 +53,8 @@ impl FieldNormsWriter {
     ///
     /// Will extend with 0-bytes for documents that have not been seen.
     pub fn fill_up_to_max_doc(&mut self, max_doc: DocId) {
-        for &field in self.fields.iter() {
-            self.fieldnorms_buffer[field.0 as usize].resize(max_doc as usize, 0u8);
+        for field in self.fields.iter() {
+            self.fieldnorms_buffer[field.field_id() as usize].resize(max_doc as usize, 0u8);
         }
     }

@@ -64,7 +67,7 @@ impl FieldNormsWriter {
     /// * field - the field being set
     /// * fieldnorm - the number of terms present in document `doc` in field `field`
     pub fn record(&mut self, doc: DocId, field: Field, fieldnorm: u32) {
-        let fieldnorm_buffer: &mut Vec<u8> = &mut self.fieldnorms_buffer[field.0 as usize];
+        let fieldnorm_buffer: &mut Vec<u8> = &mut self.fieldnorms_buffer[field.field_id() as usize];
         assert!(
             fieldnorm_buffer.len() <= doc as usize,
             "Cannot register a given fieldnorm twice"
@@ -77,7 +80,7 @@ impl FieldNormsWriter {
     /// Serialize the seen fieldnorm values to the serializer for all fields.
     pub fn serialize(&self, fieldnorms_serializer: &mut FieldNormsSerializer) -> io::Result<()> {
         for &field in self.fields.iter() {
-            let fieldnorm_values: &[u8] = &self.fieldnorms_buffer[field.0 as usize][..];
+            let fieldnorm_values: &[u8] = &self.fieldnorms_buffer[field.field_id() as usize][..];
             fieldnorms_serializer.serialize_field(field, fieldnorm_values)?;
         }
         Ok(())
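Together with the `Field::field_id()` accessor, these hunks rely on `schema.fields()` yielding `(Field, &FieldEntry)` pairs, so call sites stop enumerating indices and rebuilding `Field(id as u32)` by hand. A toy sketch of that iterator shape (not tantivy's types):

```rust
#[derive(Clone, Copy, Debug, PartialEq)]
struct Field(u32);

struct FieldEntry {
    indexed: bool,
}

impl FieldEntry {
    fn is_indexed(&self) -> bool {
        self.indexed
    }
}

struct Schema {
    entries: Vec<FieldEntry>,
}

impl Schema {
    // Yield (Field, &FieldEntry) pairs directly, as the new schema API does.
    fn fields(&self) -> impl Iterator<Item = (Field, &FieldEntry)> + '_ {
        self.entries
            .iter()
            .enumerate()
            .map(|(id, entry)| (Field(id as u32), entry))
    }
}

fn main() {
    let schema = Schema {
        entries: vec![FieldEntry { indexed: true }, FieldEntry { indexed: false }],
    };
    // Same shape as fields_with_fieldnorm() after the change.
    let indexed: Vec<Field> = schema
        .fields()
        .filter_map(|(field, entry)| if entry.is_indexed() { Some(field) } else { None })
        .collect();
    assert_eq!(indexed, vec![Field(0)]);
}
```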
@@ -258,7 +258,7 @@ mod tests {
         let delete_queue = DeleteQueue::new();

         let make_op = |i: usize| {
-            let field = Field(1u32);
+            let field = Field::from_field_id(1u32);
             DeleteOperation {
                 opstamp: i as u64,
                 term: Term::from_field_u64(field, i as u64),
@@ -148,7 +148,6 @@ pub(crate) fn advance_deletes(
|
|||||||
};
|
};
|
||||||
|
|
||||||
let delete_cursor = segment_entry.delete_cursor();
|
let delete_cursor = segment_entry.delete_cursor();
|
||||||
|
|
||||||
compute_deleted_bitset(
|
compute_deleted_bitset(
|
||||||
&mut delete_bitset,
|
&mut delete_bitset,
|
||||||
&segment_reader,
|
&segment_reader,
|
||||||
@@ -168,7 +167,7 @@ pub(crate) fn advance_deletes(
|
|||||||
if num_deleted_docs > 0 {
|
         if num_deleted_docs > 0 {
             segment = segment.with_delete_meta(num_deleted_docs as u32, target_opstamp);
             let mut delete_file = segment.open_write(SegmentComponent::DELETE)?;
-            write_delete_bitset(&delete_bitset, &mut delete_file)?;
+            write_delete_bitset(&delete_bitset, max_doc, &mut delete_file)?;
             delete_file.terminate()?;
         }
     }
@@ -178,13 +177,13 @@ pub(crate) fn advance_deletes(

 fn index_documents(
     memory_budget: usize,
-    segment: &Segment,
+    segment: Segment,
     grouped_document_iterator: &mut dyn Iterator<Item = OperationGroup>,
     segment_updater: &mut SegmentUpdater,
     mut delete_cursor: DeleteCursor,
 ) -> Result<bool> {
     let schema = segment.schema();
-    let segment_id = segment.id();
     let mut segment_writer = SegmentWriter::for_segment(memory_budget, segment.clone(), &schema)?;
     for document_group in grouped_document_iterator {
         for doc in document_group {
@@ -204,22 +203,32 @@ fn index_documents(
             return Ok(false);
         }

-    let num_docs = segment_writer.max_doc();
+    let max_doc = segment_writer.max_doc();

     // this is ensured by the call to peek before starting
     // the worker thread.
-    assert!(num_docs > 0);
+    assert!(max_doc > 0);

     let doc_opstamps: Vec<Opstamp> = segment_writer.finalize()?;
-    let segment_meta = segment.index().new_segment_meta(segment_id, num_docs);
+
+    let segment_with_max_doc = segment.with_max_doc(max_doc);

     let last_docstamp: Opstamp = *(doc_opstamps.last().unwrap());

-    let delete_bitset_opt =
-        apply_deletes(&segment, &mut delete_cursor, &doc_opstamps, last_docstamp)?;
+    let delete_bitset_opt = apply_deletes(
+        &segment_with_max_doc,
+        &mut delete_cursor,
+        &doc_opstamps,
+        last_docstamp,
+    )?;

-    let segment_entry = SegmentEntry::new(segment_meta, delete_cursor, delete_bitset_opt);
-    Ok(segment_updater.add_segment(segment_entry))
+    let segment_entry = SegmentEntry::new(
+        segment_with_max_doc.meta().clone(),
+        delete_cursor,
+        delete_bitset_opt,
+    );
+    segment_updater.add_segment(segment_entry);
+    Ok(true)
 }

 fn apply_deletes(
@@ -235,7 +244,9 @@ fn apply_deletes(
     }
     let segment_reader = SegmentReader::open(segment)?;
     let doc_to_opstamps = DocToOpstampMapping::from(doc_opstamps);
-    let mut deleted_bitset = BitSet::with_capacity(segment_reader.max_doc() as usize);
+
+    let max_doc = segment.meta().max_doc();
+    let mut deleted_bitset = BitSet::with_capacity(max_doc as usize);
     let may_have_deletes = compute_deleted_bitset(
         &mut deleted_bitset,
         &segment_reader,
@@ -407,7 +418,7 @@ impl IndexWriter {
         let segment = index.new_segment();
         index_documents(
             mem_budget,
-            &segment,
+            segment,
             &mut document_iterator,
             &mut segment_updater,
             delete_cursor.clone(),
@@ -190,8 +190,7 @@ impl IndexMerger {
         fast_field_serializer: &mut FastFieldSerializer,
         mut term_ord_mappings: HashMap<Field, TermOrdinalMapping>,
     ) -> Result<()> {
-        for (field_id, field_entry) in self.schema.fields().iter().enumerate() {
-            let field = Field(field_id as u32);
+        for (field, field_entry) in self.schema.fields() {
             let field_type = field_entry.field_type();
             match *field_type {
                 FieldType::HierarchicalFacet => {
@@ -649,15 +648,12 @@ impl IndexMerger {
         serializer: &mut InvertedIndexSerializer,
     ) -> Result<HashMap<Field, TermOrdinalMapping>> {
         let mut term_ordinal_mappings = HashMap::new();
-        for (field_ord, field_entry) in self.schema.fields().iter().enumerate() {
+        for (field, field_entry) in self.schema.fields() {
             if field_entry.is_indexed() {
-                let indexed_field = Field(field_ord as u32);
-                if let Some(term_ordinal_mapping) = self.write_postings_for_field(
-                    indexed_field,
-                    field_entry.field_type(),
-                    serializer,
-                )? {
-                    term_ordinal_mappings.insert(indexed_field, term_ordinal_mapping);
+                if let Some(term_ordinal_mapping) =
+                    self.write_postings_for_field(field, field_entry.field_type(), serializer)?
+                {
+                    term_ordinal_mappings.insert(field, term_ordinal_mapping);
                 }
             }
         }
@@ -28,3 +28,25 @@ pub use self::segment_writer::SegmentWriter;

 /// Alias for the default merge policy, which is the `LogMergePolicy`.
 pub type DefaultMergePolicy = LogMergePolicy;
+
+#[cfg(test)]
+mod tests {
+    use crate::schema::{self, Schema};
+    use crate::{Index, Term};
+    #[test]
+    fn test_advance_delete_bug() {
+        let mut schema_builder = Schema::builder();
+        let text_field = schema_builder.add_text_field("text", schema::TEXT);
+        let index = Index::create_from_tempdir(schema_builder.build()).unwrap();
+        let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
+        // there must be one deleted document in the segment
+        index_writer.add_document(doc!(text_field=>"b"));
+        index_writer.delete_term(Term::from_field_text(text_field, "b"));
+        // we need enough data to trigger the bug (at least 32 documents)
+        for _ in 0..32 {
+            index_writer.add_document(doc!(text_field=>"c"));
+        }
+        index_writer.commit().unwrap();
+        index_writer.commit().unwrap();
+    }
+}
@@ -199,14 +199,12 @@ impl SegmentUpdater {
         self.0.pool.spawn_fn(move || Ok(f(me_clone)))
     }

-    pub fn add_segment(&self, segment_entry: SegmentEntry) -> bool {
+    pub fn add_segment(&self, segment_entry: SegmentEntry) {
         self.run_async(|segment_updater| {
             segment_updater.0.segment_manager.add_segment(segment_entry);
             segment_updater.consider_merge_options();
-            true
         })
         .forget();
-        true
     }

     /// Orders `SegmentManager` to remove all segments
@@ -6,14 +6,15 @@ use crate::fieldnorm::FieldNormsWriter;
 use crate::indexer::segment_serializer::SegmentSerializer;
 use crate::postings::compute_table_size;
 use crate::postings::MultiFieldPostingsWriter;
-use crate::schema::FieldEntry;
 use crate::schema::FieldType;
 use crate::schema::Schema;
 use crate::schema::Term;
 use crate::schema::Value;
+use crate::schema::{Field, FieldEntry};
 use crate::tokenizer::BoxedTokenizer;
 use crate::tokenizer::FacetTokenizer;
-use crate::tokenizer::{TokenStream, Tokenizer};
+use crate::tokenizer::PreTokenizedStream;
+use crate::tokenizer::{TokenStream, TokenStreamChain, Tokenizer};
 use crate::DocId;
 use crate::Opstamp;
 use crate::Result;
@@ -70,12 +71,10 @@ impl SegmentWriter {
         let table_num_bits = initial_table_size(memory_budget)?;
         let segment_serializer = SegmentSerializer::for_segment(&mut segment)?;
         let multifield_postings = MultiFieldPostingsWriter::new(schema, table_num_bits);
-        let tokenizers =
-            schema
-                .fields()
-                .iter()
-                .map(FieldEntry::field_type)
-                .map(|field_type| match *field_type {
+        let tokenizers = schema
+            .fields()
+            .map(
+                |(_, field_entry): (Field, &FieldEntry)| match field_entry.field_type() {
                     FieldType::Str(ref text_options) => text_options
                         .get_indexing_options()
                         .and_then(|text_index_option| {
@@ -83,8 +82,9 @@ impl SegmentWriter {
                             segment.index().tokenizers().get(tokenizer_name)
                         }),
                     _ => None,
-                })
-                .collect();
+                },
+            )
+            .collect();
         Ok(SegmentWriter {
             max_doc: 0,
             multifield_postings,
@@ -159,26 +159,44 @@ impl SegmentWriter {
                 }
             }
             FieldType::Str(_) => {
-                let num_tokens = if let Some(ref mut tokenizer) =
-                    self.tokenizers[field.0 as usize]
-                {
-                    let texts: Vec<&str> = field_values
-                        .iter()
-                        .flat_map(|field_value| match *field_value.value() {
-                            Value::Str(ref text) => Some(text.as_str()),
-                            _ => None,
-                        })
-                        .collect();
-                    if texts.is_empty() {
-                        0
-                    } else {
-                        let mut token_stream = tokenizer.token_stream_texts(&texts[..]);
-                        self.multifield_postings
-                            .index_text(doc_id, field, &mut token_stream)
+                let mut token_streams: Vec<Box<dyn TokenStream>> = vec![];
+                let mut offsets = vec![];
+                let mut total_offset = 0;
+
+                for field_value in field_values {
+                    match field_value.value() {
+                        Value::PreTokStr(tok_str) => {
+                            offsets.push(total_offset);
+                            if let Some(last_token) = tok_str.tokens.last() {
+                                total_offset += last_token.offset_to;
+                            }
+
+                            token_streams
+                                .push(Box::new(PreTokenizedStream::from(tok_str.clone())));
+                        }
+                        Value::Str(ref text) => {
+                            if let Some(ref mut tokenizer) =
+                                self.tokenizers[field.field_id() as usize]
+                            {
+                                offsets.push(total_offset);
+                                total_offset += text.len();
+
+                                token_streams.push(tokenizer.token_stream(text));
+                            }
+                        }
+                        _ => (),
                     }
-                } else {
+                }
+
+                let num_tokens = if token_streams.is_empty() {
                     0
+                } else {
+                    let mut token_stream: Box<dyn TokenStream> =
+                        Box::new(TokenStreamChain::new(offsets, token_streams));
+                    self.multifield_postings
+                        .index_text(doc_id, field, &mut token_stream)
                 };

                 self.fieldnorms_writer.record(doc_id, field, num_tokens);
             }
             FieldType::U64(ref int_option) => {
14  src/lib.rs
@@ -212,15 +212,13 @@ pub type Score = f32;
 pub type SegmentLocalId = u32;

 impl DocAddress {
-    /// Return the segment ordinal.
-    /// The segment ordinal is an id identifying the segment
-    /// hosting the document. It is only meaningful, in the context
-    /// of a searcher.
+    /// Return the segment ordinal id that identifies the segment
+    /// hosting the document in the `Searcher` it is called from.
     pub fn segment_ord(self) -> SegmentLocalId {
         self.0
     }

-    /// Return the segment local `DocId`
+    /// Return the segment-local `DocId`
     pub fn doc(self) -> DocId {
         self.1
     }
@@ -229,11 +227,11 @@ impl DocAddress {
 /// `DocAddress` contains all the necessary information
 /// to identify a document given a `Searcher` object.
 ///
-/// It consists in an id identifying its segment, and
-/// its segment-local `DocId`.
+/// It consists of an id identifying its segment, and
+/// a segment-local `DocId`.
 ///
 /// The id used for the segment is actually an ordinal
-/// in the list of segment hold by a `Searcher`.
+/// in the list of `Segment`s held by a `Searcher`.
 #[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
 pub struct DocAddress(pub SegmentLocalId, pub DocId);
@@ -356,9 +356,9 @@ pub mod tests {

     #[test]
     fn test_skip_next() {
-        let term_0 = Term::from_field_u64(Field(0), 0);
-        let term_1 = Term::from_field_u64(Field(0), 1);
-        let term_2 = Term::from_field_u64(Field(0), 2);
+        let term_0 = Term::from_field_u64(Field::from_field_id(0), 0);
+        let term_1 = Term::from_field_u64(Field::from_field_id(0), 1);
+        let term_2 = Term::from_field_u64(Field::from_field_id(0), 2);

         let num_docs = 300u32;

@@ -511,19 +511,19 @@ pub mod tests {
     }

     pub static TERM_A: Lazy<Term> = Lazy::new(|| {
-        let field = Field(0);
+        let field = Field::from_field_id(0);
         Term::from_field_text(field, "a")
     });
     pub static TERM_B: Lazy<Term> = Lazy::new(|| {
-        let field = Field(0);
+        let field = Field::from_field_id(0);
         Term::from_field_text(field, "b")
     });
     pub static TERM_C: Lazy<Term> = Lazy::new(|| {
-        let field = Field(0);
+        let field = Field::from_field_id(0);
         Term::from_field_text(field, "c")
     });
     pub static TERM_D: Lazy<Term> = Lazy::new(|| {
-        let field = Field(0);
+        let field = Field::from_field_id(0);
         Term::from_field_text(field, "d")
     });
@@ -61,12 +61,12 @@ fn make_field_partition(
         .iter()
         .map(|(key, _, _)| Term::wrap(key).field())
         .enumerate();
-    let mut prev_field = Field(u32::max_value());
+    let mut prev_field_opt = None;
     let mut fields = vec![];
     let mut offsets = vec![];
     for (offset, field) in term_offsets_it {
-        if field != prev_field {
-            prev_field = field;
+        if Some(field) != prev_field_opt {
+            prev_field_opt = Some(field);
             fields.push(field);
             offsets.push(offset);
         }
@@ -86,8 +86,7 @@ impl MultiFieldPostingsWriter {
         let term_index = TermHashMap::new(table_bits);
         let per_field_postings_writers: Vec<_> = schema
             .fields()
-            .iter()
-            .map(|field_entry| posting_from_field_entry(field_entry))
+            .map(|(_, field_entry)| posting_from_field_entry(field_entry))
             .collect();
         MultiFieldPostingsWriter {
             heap: MemoryArena::new(),
@@ -107,7 +106,8 @@ impl MultiFieldPostingsWriter {
         field: Field,
         token_stream: &mut dyn TokenStream,
     ) -> u32 {
-        let postings_writer = self.per_field_postings_writers[field.0 as usize].deref_mut();
+        let postings_writer =
+            self.per_field_postings_writers[field.field_id() as usize].deref_mut();
         postings_writer.index_text(
             &mut self.term_index,
             doc,
@@ -118,7 +118,8 @@ impl MultiFieldPostingsWriter {
     }

     pub fn subscribe(&mut self, doc: DocId, term: &Term) -> UnorderedTermId {
-        let postings_writer = self.per_field_postings_writers[term.field().0 as usize].deref_mut();
+        let postings_writer =
+            self.per_field_postings_writers[term.field().field_id() as usize].deref_mut();
         postings_writer.subscribe(&mut self.term_index, doc, 0u32, term, &mut self.heap)
     }

@@ -160,7 +161,7 @@ impl MultiFieldPostingsWriter {
             FieldType::Bytes => {}
         }

-        let postings_writer = &self.per_field_postings_writers[field.0 as usize];
+        let postings_writer = &self.per_field_postings_writers[field.field_id() as usize];
         let mut field_serializer =
             serializer.new_field(field, postings_writer.total_num_tokens())?;
         postings_writer.serialize(
@@ -9,7 +9,8 @@ use crate::Result;
 use crate::Searcher;
 use std::collections::BTreeSet;

-/// The boolean query combines a set of queries
+/// The boolean query returns a set of documents
+/// that matches the Boolean combination of constituent subqueries.
 ///
 /// The documents matched by the boolean query are
 /// those which
@@ -19,6 +20,113 @@ use std::collections::BTreeSet;
 /// `MustNot` occurence.
 /// * match at least one of the subqueries that is not
 /// a `MustNot` occurence.
+///
+/// You can combine other query types and their `Occur`ances into one `BooleanQuery`
+///
+/// ```rust
+///use tantivy::collector::Count;
+///use tantivy::doc;
+///use tantivy::query::{BooleanQuery, Occur, PhraseQuery, Query, TermQuery};
+///use tantivy::schema::{IndexRecordOption, Schema, TEXT};
+///use tantivy::Term;
+///use tantivy::{Index, Result};
+///
+///fn main() -> Result<()> {
+///    let mut schema_builder = Schema::builder();
+///    let title = schema_builder.add_text_field("title", TEXT);
+///    let body = schema_builder.add_text_field("body", TEXT);
+///    let schema = schema_builder.build();
+///    let index = Index::create_in_ram(schema);
+///    {
+///        let mut index_writer = index.writer(3_000_000)?;
+///        index_writer.add_document(doc!(
+///            title => "The Name of the Wind",
+///        ));
+///        index_writer.add_document(doc!(
+///            title => "The Diary of Muadib",
+///        ));
+///        index_writer.add_document(doc!(
+///            title => "A Dairy Cow",
+///            body => "hidden",
+///        ));
+///        index_writer.add_document(doc!(
+///            title => "A Dairy Cow",
+///            body => "found",
+///        ));
+///        index_writer.add_document(doc!(
+///            title => "The Diary of a Young Girl",
+///        ));
+///        index_writer.commit().unwrap();
+///    }
+///
+///    let reader = index.reader()?;
+///    let searcher = reader.searcher();
+///
+///    // Make TermQuery's for "girl" and "diary" in the title
+///    let girl_term_query: Box<dyn Query> = Box::new(TermQuery::new(
+///        Term::from_field_text(title, "girl"),
+///        IndexRecordOption::Basic,
+///    ));
+///    let diary_term_query: Box<dyn Query> = Box::new(TermQuery::new(
+///        Term::from_field_text(title, "diary"),
+///        IndexRecordOption::Basic,
+///    ));
+///    // A TermQuery with "found" in the body
+///    let body_term_query: Box<dyn Query> = Box::new(TermQuery::new(
+///        Term::from_field_text(body, "found"),
+///        IndexRecordOption::Basic,
+///    ));
+///    // TermQuery "diary" must and "girl" must not be present
+///    let queries_with_occurs1 = vec![
+///        (Occur::Must, diary_term_query.box_clone()),
+///        (Occur::MustNot, girl_term_query),
+///    ];
+///    // Make a BooleanQuery equivalent to
+///    // title:+diary title:-girl
+///    let diary_must_and_girl_mustnot = BooleanQuery::from(queries_with_occurs1);
+///    let count1 = searcher.search(&diary_must_and_girl_mustnot, &Count)?;
+///    assert_eq!(count1, 1);
+///
+///    // TermQuery for "cow" in the title
+///    let cow_term_query: Box<dyn Query> = Box::new(TermQuery::new(
+///        Term::from_field_text(title, "cow"),
+///        IndexRecordOption::Basic,
+///    ));
+///    // "title:diary OR title:cow"
+///    let title_diary_or_cow = BooleanQuery::from(vec![
+///        (Occur::Should, diary_term_query.box_clone()),
+///        (Occur::Should, cow_term_query),
+///    ]);
+///    let count2 = searcher.search(&title_diary_or_cow, &Count)?;
+///    assert_eq!(count2, 4);
+///
+///    // Make a `PhraseQuery` from a vector of `Term`s
+///    let phrase_query: Box<dyn Query> = Box::new(PhraseQuery::new(vec![
+///        Term::from_field_text(title, "dairy"),
+///        Term::from_field_text(title, "cow"),
+///    ]));
+///    // You can combine subqueries of different types into 1 BooleanQuery:
+///    // `TermQuery` and `PhraseQuery`
+///    // "title:diary OR "dairy cow"
+///    let term_of_phrase_query = BooleanQuery::from(vec![
+///        (Occur::Should, diary_term_query.box_clone()),
+///        (Occur::Should, phrase_query.box_clone()),
+///    ]);
+///    let count3 = searcher.search(&term_of_phrase_query, &Count)?;
+///    assert_eq!(count3, 4);
+///
+///    // You can nest one BooleanQuery inside another
+///    // body:found AND ("title:diary OR "dairy cow")
+///    let nested_query = BooleanQuery::from(vec![
+///        (Occur::Must, body_term_query),
+///        (Occur::Must, Box::new(term_of_phrase_query))
+///    ]);
+///    let count4 = searcher.search(&nested_query, &Count)?;
+///    assert_eq!(count4, 1);
+///    Ok(())
+///}
+/// ```
 #[derive(Debug)]
 pub struct BooleanQuery {
     subqueries: Vec<(Occur, Box<dyn Query>)>,
@@ -40,7 +40,7 @@ impl PhraseQuery {
         PhraseQuery::new_with_offset(terms_with_offset)
     }

-    /// Creates a new `PhraseQuery` given a list of terms and there offsets.
+    /// Creates a new `PhraseQuery` given a list of terms and their offsets.
     ///
     /// Can be used to provide custom offset for each term.
     pub fn new_with_offset(mut terms: Vec<(usize, Term)>) -> PhraseQuery {
@@ -674,13 +674,19 @@ mod test {

         test_parse_query_to_logical_ast_helper(
             "signed:-2324",
-            &format!("{:?}", Term::from_field_i64(Field(2u32), -2324)),
+            &format!(
+                "{:?}",
+                Term::from_field_i64(Field::from_field_id(2u32), -2324)
+            ),
             false,
         );

         test_parse_query_to_logical_ast_helper(
             "float:2.5",
-            &format!("{:?}", Term::from_field_f64(Field(10u32), 2.5)),
+            &format!(
+                "{:?}",
+                Term::from_field_f64(Field::from_field_id(10u32), 2.5)
+            ),
             false,
         );
     }
@@ -118,7 +118,7 @@ mod tests {
     #[test]
     fn test_term_query_debug() {
         let term_query = TermQuery::new(
-            Term::from_field_text(Field(1), "hello"),
+            Term::from_field_text(Field::from_field_id(1), "hello"),
             IndexRecordOption::WithFreqs,
         );
         assert_eq!(
@@ -1,6 +1,7 @@
 use super::*;
 use crate::common::BinarySerializable;
 use crate::common::VInt;
+use crate::tokenizer::PreTokenizedString;
 use crate::DateTime;
 use itertools::Itertools;
 use std::io::{self, Read, Write};
@@ -29,8 +30,8 @@ impl From<Vec<FieldValue>> for Document {
 impl PartialEq for Document {
     fn eq(&self, other: &Document) -> bool {
         // super slow, but only here for tests
-        let mut self_field_values = self.field_values.clone();
-        let mut other_field_values = other.field_values.clone();
+        let mut self_field_values: Vec<&_> = self.field_values.iter().collect();
+        let mut other_field_values: Vec<&_> = other.field_values.iter().collect();
         self_field_values.sort();
         other_field_values.sort();
         self_field_values.eq(&other_field_values)
@@ -78,6 +79,16 @@ impl Document {
         self.add(FieldValue::new(field, value));
     }

+    /// Add a pre-tokenized text field.
+    pub fn add_pre_tokenized_text(
+        &mut self,
+        field: Field,
+        pre_tokenized_text: &PreTokenizedString,
+    ) {
+        let value = Value::PreTokStr(pre_tokenized_text.clone());
+        self.add(FieldValue::new(field, value));
+    }
+
     /// Add a u64 field
     pub fn add_u64(&mut self, field: Field, value: u64) {
         self.add(FieldValue::new(field, Value::U64(value)));
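Note on the new `Document::add_pre_tokenized_text` method above: it takes a `PreTokenizedString`, which is introduced later in this diff (src/tokenizer/tokenized_string.rs). A minimal usage sketch, assuming the text field has already been registered in a schema (the field handle is rebuilt from a raw id here only for brevity):

    use tantivy::schema::{Document, Field};
    use tantivy::tokenizer::{PreTokenizedString, Token};

    // Field id 0 stands in for a handle normally obtained from the schema builder.
    let text_field = Field::from_field_id(0);
    let pre_tokenized = PreTokenizedString {
        text: String::from("The Old Man"),
        tokens: vec![
            Token { offset_from: 0, offset_to: 3, position: 0, text: String::from("The"), position_length: 1 },
            Token { offset_from: 4, offset_to: 7, position: 1, text: String::from("Old"), position_length: 1 },
            Token { offset_from: 8, offset_to: 11, position: 2, text: String::from("Man"), position_length: 1 },
        ],
    };
    let mut document = Document::default();
    // The supplied tokens are indexed as-is; no tokenizer runs on this field value.
    document.add_pre_tokenized_text(text_field, &pre_tokenized);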
@@ -3,14 +3,23 @@ use std::io;
 use std::io::Read;
 use std::io::Write;

-/// `Field` is actually a `u8` identifying a `Field`
-/// The schema is in charge of holding mapping between field names
-/// to `Field` objects.
-///
-/// Because the field id is a `u8`, tantivy can only have at most `255` fields.
-/// Value 255 is reserved.
+/// `Field` is represented by an unsigned 32-bit integer type
+/// The schema holds the mapping between field names and `Field` objects.
 #[derive(Copy, Clone, Debug, PartialEq, PartialOrd, Eq, Ord, Hash, Serialize, Deserialize)]
-pub struct Field(pub u32);
+pub struct Field(u32);
+
+impl Field {
+    /// Create a new field object for the given FieldId.
+    pub fn from_field_id(field_id: u32) -> Field {
+        Field(field_id)
+    }
+
+    /// Returns a u32 identifying uniquely a field within a schema.
+    #[allow(clippy::trivially_copy_pass_by_ref)]
+    pub fn field_id(&self) -> u32 {
+        self.0
+    }
+}

 impl BinarySerializable for Field {
     fn serialize<W: Write>(&self, writer: &mut W) -> io::Result<()> {
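Since `Field` no longer exposes its inner `u32` (the tuple field is now private), code that constructed fields or read the id directly needs the new constructor and accessor shown above. A small migration sketch (the id 3 is arbitrary):

    use tantivy::schema::Field;

    // before this change: let field = Field(3);  let id = field.0;
    let field = Field::from_field_id(3);
    let id: u32 = field.field_id();
    assert_eq!(id, 3);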
@@ -1,11 +1,11 @@
 use base64::decode;

-use crate::schema::{IntOptions, TextOptions};
-
 use crate::schema::Facet;
 use crate::schema::IndexRecordOption;
 use crate::schema::TextFieldIndexing;
 use crate::schema::Value;
+use crate::schema::{IntOptions, TextOptions};
+use crate::tokenizer::PreTokenizedString;
 use serde_json::Value as JsonValue;

 /// Possible error that may occur while parsing a field value
@@ -169,6 +169,28 @@ impl FieldType {
                     Err(ValueParsingError::TypeError(msg))
                 }
             },
+            JsonValue::Object(_) => match *self {
+                FieldType::Str(_) => {
+                    if let Ok(tok_str_val) =
+                        serde_json::from_value::<PreTokenizedString>(json.clone())
+                    {
+                        Ok(Value::PreTokStr(tok_str_val))
+                    } else {
+                        let msg = format!(
+                            "Json value {:?} cannot be translated to PreTokenizedString.",
+                            json
+                        );
+                        Err(ValueParsingError::TypeError(msg))
+                    }
+                }
+                _ => {
+                    let msg = format!(
+                        "Json value not supported error {:?}. Expected {:?}",
+                        json, self
+                    );
+                    Err(ValueParsingError::TypeError(msg))
+                }
+            },
             _ => {
                 let msg = format!(
                     "Json value not supported error {:?}. Expected {:?}",
@@ -184,7 +206,9 @@ impl FieldType {
 mod tests {
     use super::FieldType;
     use crate::schema::field_type::ValueParsingError;
+    use crate::schema::TextOptions;
     use crate::schema::Value;
+    use crate::tokenizer::{PreTokenizedString, Token};

     #[test]
     fn test_bytes_value_from_json() {
@@ -205,4 +229,71 @@ mod tests {
             _ => panic!("Expected parse failure for invalid base64"),
         }
     }
+
+    #[test]
+    fn test_pre_tok_str_value_from_json() {
+        let pre_tokenized_string_json = r#"{
+  "text": "The Old Man",
+  "tokens": [
+    {
+      "offset_from": 0,
+      "offset_to": 3,
+      "position": 0,
+      "text": "The",
+      "position_length": 1
+    },
+    {
+      "offset_from": 4,
+      "offset_to": 7,
+      "position": 1,
+      "text": "Old",
+      "position_length": 1
+    },
+    {
+      "offset_from": 8,
+      "offset_to": 11,
+      "position": 2,
+      "text": "Man",
+      "position_length": 1
+    }
+  ]
+}"#;
+
+        let expected_value = Value::PreTokStr(PreTokenizedString {
+            text: String::from("The Old Man"),
+            tokens: vec![
+                Token {
+                    offset_from: 0,
+                    offset_to: 3,
+                    position: 0,
+                    text: String::from("The"),
+                    position_length: 1,
+                },
+                Token {
+                    offset_from: 4,
+                    offset_to: 7,
+                    position: 1,
+                    text: String::from("Old"),
+                    position_length: 1,
+                },
+                Token {
+                    offset_from: 8,
+                    offset_to: 11,
+                    position: 2,
+                    text: String::from("Man"),
+                    position_length: 1,
+                },
+            ],
+        });
+
+        let deserialized_value = FieldType::Str(TextOptions::default())
+            .value_from_json(&serde_json::from_str(pre_tokenized_string_json).unwrap())
+            .unwrap();
+
+        assert_eq!(deserialized_value, expected_value);
+
+        let serialized_value_json = serde_json::to_string_pretty(&expected_value).unwrap();
+
+        assert_eq!(serialized_value_json, pre_tokenized_string_json);
+    }
 }
@@ -167,7 +167,7 @@ impl SchemaBuilder {

     /// Adds a field entry to the schema in build.
     fn add_field(&mut self, field_entry: FieldEntry) -> Field {
-        let field = Field(self.fields.len() as u32);
+        let field = Field::from_field_id(self.fields.len() as u32);
         let field_name = field_entry.name().to_string();
         self.fields.push(field_entry);
         self.fields_map.insert(field_name, field);
@@ -223,7 +223,7 @@ pub struct Schema(Arc<InnerSchema>);
 impl Schema {
     /// Return the `FieldEntry` associated to a `Field`.
     pub fn get_field_entry(&self, field: Field) -> &FieldEntry {
-        &self.0.fields[field.0 as usize]
+        &self.0.fields[field.field_id() as usize]
     }

     /// Return the field name for a given `Field`.
@@ -232,8 +232,12 @@ impl Schema {
     }

     /// Return the list of all the `Field`s.
-    pub fn fields(&self) -> &[FieldEntry] {
-        &self.0.fields
+    pub fn fields(&self) -> impl Iterator<Item = (Field, &FieldEntry)> {
+        self.0
+            .fields
+            .iter()
+            .enumerate()
+            .map(|(field_id, field_entry)| (Field::from_field_id(field_id as u32), field_entry))
     }

     /// Creates a new builder.
@@ -485,13 +489,32 @@ mod tests {

         let schema: Schema = serde_json::from_str(expected).unwrap();

-        let mut fields = schema.fields().iter();
-        assert_eq!("title", fields.next().unwrap().name());
-        assert_eq!("author", fields.next().unwrap().name());
-        assert_eq!("count", fields.next().unwrap().name());
-        assert_eq!("popularity", fields.next().unwrap().name());
-        assert_eq!("score", fields.next().unwrap().name());
+        let mut fields = schema.fields();
+        {
+            let (field, field_entry) = fields.next().unwrap();
+            assert_eq!("title", field_entry.name());
+            assert_eq!(0, field.field_id());
+        }
+        {
+            let (field, field_entry) = fields.next().unwrap();
+            assert_eq!("author", field_entry.name());
+            assert_eq!(1, field.field_id());
+        }
+        {
+            let (field, field_entry) = fields.next().unwrap();
+            assert_eq!("count", field_entry.name());
+            assert_eq!(2, field.field_id());
+        }
+        {
+            let (field, field_entry) = fields.next().unwrap();
+            assert_eq!("popularity", field_entry.name());
+            assert_eq!(3, field.field_id());
+        }
+        {
+            let (field, field_entry) = fields.next().unwrap();
+            assert_eq!("score", field_entry.name());
+            assert_eq!(4, field.field_id());
+        }
         assert!(fields.next().is_none());
     }
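With `Schema::fields()` now returning an iterator over `(Field, &FieldEntry)` pairs instead of a slice of entries, callers no longer need `iter().enumerate()` plus a manual `Field` construction. A short sketch of the new call pattern (the field names here are illustrative):

    use tantivy::schema::{Schema, TEXT};

    let mut schema_builder = Schema::builder();
    schema_builder.add_text_field("title", TEXT);
    schema_builder.add_text_field("body", TEXT);
    let schema = schema_builder.build();

    // Each item carries the field handle together with its entry.
    for (field, field_entry) in schema.fields() {
        println!("field {} -> {}", field.field_id(), field_entry.name());
    }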
@@ -105,7 +105,7 @@ impl Term {
         if self.0.len() < 4 {
             self.0.resize(4, 0u8);
         }
-        BigEndian::write_u32(&mut self.0[0..4], field.0);
+        BigEndian::write_u32(&mut self.0[0..4], field.field_id());
     }

     /// Sets a u64 value in the term.
@@ -157,7 +157,7 @@ where

     /// Returns the field.
     pub fn field(&self) -> Field {
-        Field(BigEndian::read_u32(&self.0.as_ref()[..4]))
+        Field::from_field_id(BigEndian::read_u32(&self.0.as_ref()[..4]))
     }

     /// Returns the `u64` value stored in a term.
@@ -227,7 +227,7 @@ impl fmt::Debug for Term {
         write!(
             f,
             "Term(field={},bytes={:?})",
-            self.field().0,
+            self.field().field_id(),
             self.value_bytes()
         )
     }
@@ -1,4 +1,5 @@
 use crate::schema::Facet;
+use crate::tokenizer::PreTokenizedString;
 use crate::DateTime;
 use serde::de::Visitor;
 use serde::{Deserialize, Deserializer, Serialize, Serializer};
@@ -10,6 +11,8 @@ use std::{cmp::Ordering, fmt};
 pub enum Value {
     /// The str type is used for any text information.
     Str(String),
+    /// Pre-tokenized str type,
+    PreTokStr(PreTokenizedString),
     /// Unsigned 64-bits Integer `u64`
     U64(u64),
     /// Signed 64-bits Integer `i64`
@@ -29,6 +32,7 @@ impl Ord for Value {
     fn cmp(&self, other: &Self) -> Ordering {
         match (self, other) {
             (Value::Str(l), Value::Str(r)) => l.cmp(r),
+            (Value::PreTokStr(l), Value::PreTokStr(r)) => l.cmp(r),
             (Value::U64(l), Value::U64(r)) => l.cmp(r),
             (Value::I64(l), Value::I64(r)) => l.cmp(r),
             (Value::Date(l), Value::Date(r)) => l.cmp(r),
@@ -44,6 +48,8 @@ impl Ord for Value {
             }
             (Value::Str(_), _) => Ordering::Less,
             (_, Value::Str(_)) => Ordering::Greater,
+            (Value::PreTokStr(_), _) => Ordering::Less,
+            (_, Value::PreTokStr(_)) => Ordering::Greater,
             (Value::U64(_), _) => Ordering::Less,
             (_, Value::U64(_)) => Ordering::Greater,
             (Value::I64(_), _) => Ordering::Less,
@@ -65,6 +71,7 @@ impl Serialize for Value {
     {
         match *self {
             Value::Str(ref v) => serializer.serialize_str(v),
+            Value::PreTokStr(ref v) => v.serialize(serializer),
             Value::U64(u) => serializer.serialize_u64(u),
             Value::I64(u) => serializer.serialize_i64(u),
             Value::F64(u) => serializer.serialize_f64(u),
@@ -124,6 +131,15 @@ impl Value {
         }
     }

+    /// Returns the tokenized text, provided the value is of the `PreTokStr` type.
+    /// (Returns None if the value is not of the `PreTokStr` type).
+    pub fn tokenized_text(&self) -> Option<&PreTokenizedString> {
+        match *self {
+            Value::PreTokStr(ref tok_text) => Some(tok_text),
+            _ => None,
+        }
+    }
+
     /// Returns the u64-value, provided the value is of the `U64` type.
     ///
     /// # Panics
@@ -221,6 +237,7 @@ mod binary_serialize {
     use super::Value;
     use crate::common::{f64_to_u64, u64_to_f64, BinarySerializable};
     use crate::schema::Facet;
+    use crate::tokenizer::PreTokenizedString;
     use chrono::{TimeZone, Utc};
     use std::io::{self, Read, Write};

@@ -231,6 +248,11 @@ mod binary_serialize {
     const BYTES_CODE: u8 = 4;
     const DATE_CODE: u8 = 5;
     const F64_CODE: u8 = 6;
+    const EXT_CODE: u8 = 7;
+
+    // extended types
+
+    const TOK_STR_CODE: u8 = 0;

     impl BinarySerializable for Value {
         fn serialize<W: Write>(&self, writer: &mut W) -> io::Result<()> {
@@ -239,6 +261,18 @@ mod binary_serialize {
                     TEXT_CODE.serialize(writer)?;
                     text.serialize(writer)
                 }
+                Value::PreTokStr(ref tok_str) => {
+                    EXT_CODE.serialize(writer)?;
+                    TOK_STR_CODE.serialize(writer)?;
+                    if let Ok(text) = serde_json::to_string(tok_str) {
+                        text.serialize(writer)
+                    } else {
+                        Err(io::Error::new(
+                            io::ErrorKind::Other,
+                            "Failed to dump Value::PreTokStr(_) to json.",
+                        ))
+                    }
+                }
                 Value::U64(ref val) => {
                     U64_CODE.serialize(writer)?;
                     val.serialize(writer)
@@ -290,6 +324,30 @@ mod binary_serialize {
                 }
                 HIERARCHICAL_FACET_CODE => Ok(Value::Facet(Facet::deserialize(reader)?)),
                 BYTES_CODE => Ok(Value::Bytes(Vec::<u8>::deserialize(reader)?)),
+                EXT_CODE => {
+                    let ext_type_code = u8::deserialize(reader)?;
+                    match ext_type_code {
+                        TOK_STR_CODE => {
+                            let str_val = String::deserialize(reader)?;
+                            if let Ok(value) = serde_json::from_str::<PreTokenizedString>(&str_val)
+                            {
+                                Ok(Value::PreTokStr(value))
+                            } else {
+                                Err(io::Error::new(
+                                    io::ErrorKind::Other,
+                                    "Failed to parse string data as Value::PreTokStr(_).",
+                                ))
+                            }
+                        }
+                        _ => Err(io::Error::new(
+                            io::ErrorKind::InvalidData,
+                            format!(
+                                "No extened field type is associated with code {:?}",
+                                ext_type_code
+                            ),
+                        )),
+                    }
+                }
                 _ => Err(io::Error::new(
                     io::ErrorKind::InvalidData,
                     format!("No field type is associated with code {:?}", type_code),
@@ -136,6 +136,7 @@ mod simple_tokenizer;
 mod stemmer;
 mod stop_word_filter;
 mod token_stream_chain;
+mod tokenized_string;
 mod tokenizer;
 mod tokenizer_manager;

@@ -152,7 +153,9 @@ pub use self::stop_word_filter::StopWordFilter;
 pub(crate) use self::token_stream_chain::TokenStreamChain;
 pub use self::tokenizer::BoxedTokenizer;
+
+pub use self::tokenized_string::{PreTokenizedStream, PreTokenizedString};
 pub use self::tokenizer::{Token, TokenFilter, TokenStream, Tokenizer};

 pub use self::tokenizer_manager::TokenizerManager;

 /// Maximum authorized len (in bytes) for a token.
189  src/tokenizer/tokenized_string.rs  Normal file
@@ -0,0 +1,189 @@
+use crate::tokenizer::{Token, TokenStream, TokenStreamChain};
+use std::cmp::Ordering;
+
+/// Struct representing pre-tokenized text
+#[derive(Debug, Clone, Serialize, Deserialize, Eq, PartialEq)]
+pub struct PreTokenizedString {
+    /// Original text
+    pub text: String,
+    /// Tokens derived from the text
+    pub tokens: Vec<Token>,
+}
+
+impl Ord for PreTokenizedString {
+    fn cmp(&self, other: &Self) -> Ordering {
+        self.text.cmp(&other.text)
+    }
+}
+
+impl PartialOrd for PreTokenizedString {
+    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
+        Some(self.cmp(other))
+    }
+}
+
+/// TokenStream implementation which wraps PreTokenizedString
+pub struct PreTokenizedStream {
+    tokenized_string: PreTokenizedString,
+    current_token: i64,
+}
+
+impl From<PreTokenizedString> for PreTokenizedStream {
+    fn from(s: PreTokenizedString) -> PreTokenizedStream {
+        PreTokenizedStream {
+            tokenized_string: s,
+            current_token: -1,
+        }
+    }
+}
+
+impl PreTokenizedStream {
+    /// Creates a TokenStream from PreTokenizedString array
+    pub fn chain_tokenized_strings<'a>(
+        tok_strings: &'a [&'a PreTokenizedString],
+    ) -> Box<dyn TokenStream + 'a> {
+        if tok_strings.len() == 1 {
+            Box::new(PreTokenizedStream::from((*tok_strings[0]).clone()))
+        } else {
+            let mut offsets = vec![];
+            let mut total_offset = 0;
+            for &tok_string in tok_strings {
+                offsets.push(total_offset);
+                if let Some(last_token) = tok_string.tokens.last() {
+                    total_offset += last_token.offset_to;
+                }
+            }
+            let token_streams: Vec<_> = tok_strings
+                .iter()
+                .map(|tok_string| PreTokenizedStream::from((*tok_string).clone()))
+                .collect();
+            Box::new(TokenStreamChain::new(offsets, token_streams))
+        }
+    }
+}
+
+impl TokenStream for PreTokenizedStream {
+    fn advance(&mut self) -> bool {
+        self.current_token += 1;
+        self.current_token < self.tokenized_string.tokens.len() as i64
+    }
+
+    fn token(&self) -> &Token {
+        assert!(
+            self.current_token >= 0,
+            "TokenStream not initialized. You should call advance() at least once."
+        );
+        &self.tokenized_string.tokens[self.current_token as usize]
+    }
+
+    fn token_mut(&mut self) -> &mut Token {
+        assert!(
+            self.current_token >= 0,
+            "TokenStream not initialized. You should call advance() at least once."
+        );
+        &mut self.tokenized_string.tokens[self.current_token as usize]
+    }
+}
+
+#[cfg(test)]
+mod tests {
+
+    use super::*;
+    use crate::tokenizer::Token;
+
+    #[test]
+    fn test_tokenized_stream() {
+        let tok_text = PreTokenizedString {
+            text: String::from("A a"),
+            tokens: vec![
+                Token {
+                    offset_from: 0,
+                    offset_to: 1,
+                    position: 0,
+                    text: String::from("A"),
+                    position_length: 1,
+                },
+                Token {
+                    offset_from: 2,
+                    offset_to: 3,
+                    position: 1,
+                    text: String::from("a"),
+                    position_length: 1,
+                },
+            ],
+        };
+
+        let mut token_stream = PreTokenizedStream::from(tok_text.clone());
+
+        for expected_token in tok_text.tokens {
+            assert!(token_stream.advance());
+            assert_eq!(token_stream.token(), &expected_token);
+        }
+        assert!(!token_stream.advance());
+    }
+
+    #[test]
+    fn test_chain_tokenized_strings() {
+        let tok_text = PreTokenizedString {
+            text: String::from("A a"),
+            tokens: vec![
+                Token {
+                    offset_from: 0,
+                    offset_to: 1,
+                    position: 0,
+                    text: String::from("A"),
+                    position_length: 1,
+                },
+                Token {
+                    offset_from: 2,
+                    offset_to: 3,
+                    position: 1,
+                    text: String::from("a"),
+                    position_length: 1,
+                },
+            ],
+        };
+
+        let chain_parts = vec![&tok_text, &tok_text];
+
+        let mut token_stream = PreTokenizedStream::chain_tokenized_strings(&chain_parts[..]);
+
+        let expected_tokens = vec![
+            Token {
+                offset_from: 0,
+                offset_to: 1,
+                position: 0,
+                text: String::from("A"),
+                position_length: 1,
+            },
+            Token {
+                offset_from: 2,
+                offset_to: 3,
+                position: 1,
+                text: String::from("a"),
+                position_length: 1,
+            },
+            Token {
+                offset_from: 3,
+                offset_to: 4,
+                position: 3,
+                text: String::from("A"),
+                position_length: 1,
+            },
+            Token {
+                offset_from: 5,
+                offset_to: 6,
+                position: 4,
+                text: String::from("a"),
+                position_length: 1,
+            },
+        ];
+
+        for expected_token in expected_tokens {
+            assert!(token_stream.advance());
+            assert_eq!(token_stream.token(), &expected_token);
+        }
+        assert!(!token_stream.advance());
+    }
+}
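A short sketch of driving the new `PreTokenizedStream` API by hand, reusing the "A a" fixture from the tests above. Chaining two pre-tokenized strings shifts the second part's byte offsets by the accumulated `offset_to` and leaves a position gap between the parts, as `test_chain_tokenized_strings` demonstrates:

    use tantivy::tokenizer::{PreTokenizedStream, PreTokenizedString, Token, TokenStream};

    let part = PreTokenizedString {
        text: String::from("A a"),
        tokens: vec![
            Token { offset_from: 0, offset_to: 1, position: 0, text: String::from("A"), position_length: 1 },
            Token { offset_from: 2, offset_to: 3, position: 1, text: String::from("a"), position_length: 1 },
        ],
    };

    let parts = vec![&part, &part];
    let mut stream = PreTokenizedStream::chain_tokenized_strings(&parts[..]);
    while stream.advance() {
        let token = stream.token();
        println!("{:?} [{}..{}] at position {}", token.text, token.offset_from, token.offset_to, token.position);
    }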
@@ -4,7 +4,7 @@ use crate::tokenizer::TokenStreamChain;
 use std::borrow::{Borrow, BorrowMut};

 /// Token
-#[derive(Debug, Clone)]
+#[derive(Debug, Clone, Serialize, Deserialize, Eq, PartialEq)]
 pub struct Token {
     /// Offset (byte index) of the first character of the token.
     /// Offsets shall not be modified by token filters.
@@ -1,5 +1,4 @@
 use fail;
-use std::io::Write;
 use std::path::Path;
 use tantivy::directory::{Directory, ManagedDirectory, RAMDirectory, TerminatingWrite};
 use tantivy::doc;