diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md
new file mode 100644
index 000000000..b1a5aece3
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/bug_report.md
@@ -0,0 +1,19 @@
+---
+name: Bug report
+about: Create a report to help us improve
+
+---
+
+**Describe the bug**
+- What did you do?
+- What happened?
+- What was expected?
+
+**Which version of tantivy are you using?**
+If "master", ideally give the specific sha1 revision.
+
+**To Reproduce**
+
+If your bug is deterministic, can you give minimal code that reproduces it?
+Some bugs are not deterministic. Can you describe precisely the context in which it happened?
+If this is possible, can you share your code?
diff --git a/.github/ISSUE_TEMPLATE/feature_request.md b/.github/ISSUE_TEMPLATE/feature_request.md
new file mode 100644
index 000000000..3affc3c24
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/feature_request.md
@@ -0,0 +1,14 @@
+---
+name: Feature request
+about: Suggest an idea for this project
+
+---
+
+**Is your feature request related to a problem? Please describe.**
+A clear and concise description of what the problem is. Ex. I'm always frustrated when [...]
+
+**Describe the solution you'd like**
+A clear and concise description of what you want to happen.
+
+**[Optional] describe alternatives you've considered**
+A clear and concise description of any alternative solutions or features you've considered.
diff --git a/.github/ISSUE_TEMPLATE/question.md b/.github/ISSUE_TEMPLATE/question.md
new file mode 100644
index 000000000..e00e9a1b0
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/question.md
@@ -0,0 +1,7 @@
+---
+name: Question
+about: Ask any question about tantivy's usage...
+
+---
+
+Try to be specific about your use case...
diff --git a/.gitignore b/.gitignore
index b6f5cc5b8..afaed5719 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,4 @@
+tantivy.iml
 *.swp
 target
 target/debug
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 0be47a0b7..1ad8de098 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,3 +1,13 @@
+
+Tantivy 0.7
+=====================
+- Added skip data for doc ids and positions (@fulmicoton),
+  greatly improving performance
+- Tantivy errors now rely on the failure crate (@drusellers)
+- Added support for `AND`, `OR`, `NOT` syntax in addition to the `+`,`-` syntax
+- Added a snippet generator with highlighting (@vigneshsarma, @fulmicoton)
+- Added a `TopFieldCollector` (@pentlander)
+
 Tantivy 0.6.1
 =========================
 - Bugfix #324.
GC was removing files that were still in use
diff --git a/Cargo.toml b/Cargo.toml
index d852f17a7..bb84e41b8 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "tantivy"
-version = "0.7.0-dev"
+version = "0.7.0"
 authors = ["Paul Masurel "]
 license = "MIT"
 categories = ["database-implementations", "data-structures"]
@@ -15,7 +15,6 @@ keywords = ["search", "information", "retrieval"]
 base64 = "0.9.1"
 byteorder = "1.0"
 lazy_static = "1"
-tinysegmenter = "0.1.0"
 regex = "1.0"
 fst = {version="0.3", default-features=false}
 fst-regex = { version="0.2" }
@@ -33,13 +32,12 @@ num_cpus = "1.2"
 itertools = "0.7"
 levenshtein_automata = {version="0.1", features=["fst_automaton"]}
 bit-set = "0.5"
-uuid = { version = "0.6", features = ["v4", "serde"] }
+uuid = { version = "0.7", features = ["v4", "serde"] }
 crossbeam = "0.4"
 crossbeam-channel = "0.2"
 futures = "0.1"
 futures-cpupool = "0.1"
-error-chain = "0.8"
-owning_ref = "0.3"
+owning_ref = "0.4"
 stable_deref_trait = "1.0.0"
 rust-stemmers = "1"
 downcast = { version="0.9" }
@@ -48,24 +46,34 @@ bitpacking = "0.5"
 census = "0.1"
 fnv = "1.0.6"
 owned-read = "0.4"
+failure = "0.1"
+htmlescape = "0.3.1"
+fail = "0.2"
 
 [target.'cfg(windows)'.dependencies]
 winapi = "0.2"
 
 [dev-dependencies]
 rand = "0.5"
+maplit = "1"
 
 [profile.release]
 opt-level = 3
 debug = false
-lto = true
 debug-assertions = false
 
+[profile.test]
+debug-assertions = true
+overflow-checks = true
+
 [features]
-default = ["mmap"]
+# `no_fail` (which compiles the `fail` crate's failpoints out) is enabled by default.
+# The test scripts disable the default features to turn failpoints back on.
+default = ["mmap", "no_fail"]
 mmap = ["fst/mmap", "atomicwrites"]
 lz4-compression = ["lz4"]
+no_fail = ["fail/no_fail"]
 
 [badges]
 travis-ci = { repository = "tantivy-search/tantivy" }
+
diff --git a/README.md b/README.md
index b3eede677..1bf169af1 100644
--- a/README.md
+++ b/README.md
@@ -4,6 +4,7 @@
 [![Join the chat at https://gitter.im/tantivy-search/tantivy](https://badges.gitter.im/tantivy-search/tantivy.svg)](https://gitter.im/tantivy-search/tantivy?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge)
 [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
 [![Build status](https://ci.appveyor.com/api/projects/status/r7nb13kj23u8m9pj/branch/master?svg=true)](https://ci.appveyor.com/project/fulmicoton/tantivy/branch/master)
+[![Say Thanks!](https://img.shields.io/badge/Say%20Thanks-!-1EAEDB.svg)](https://saythanks.io/to/fulmicoton)
 
 ![Tantivy](https://tantivy-search.github.io/logo/tantivy-logo.png)
 
@@ -20,7 +21,7 @@
 
 **Tantivy** is a **full text search engine library** written in rust.
 
-It is closer to Lucene than to Elastic Search and Solr in the sense it is not
+It is closer to [Apache Lucene](https://lucene.apache.org/) than to [Elastic Search](https://www.elastic.co/products/elasticsearch) and [Apache Solr](https://lucene.apache.org/solr/) in the sense that it is not
 an off-the-shelf search engine server, but rather a crate that can be used
 to build such a search engine.
 
@@ -29,10 +30,11 @@ Tantivy is, in fact, strongly inspired by Lucene's design.
# Features - Full-text search +- Fast (check out the :racehorse: :sparkles: [benchmark](https://tantivy-search.github.io/bench/) :sparkles: :racehorse:) - Tiny startup time (<10ms), perfect for command line tools - BM25 scoring (the same as lucene) -- Basic query language (`+michael +jackson`) -- Phrase queries search (\"michael jackson\"`) +- Natural query language `(michael AND jackson) OR "king of pop"` +- Phrase queries search (`"michael jackson"`) - Incremental indexing - Multithreaded indexing (indexing English Wikipedia takes < 3 minutes on my desktop) - Mmap directory @@ -42,12 +44,14 @@ Tantivy is, in fact, strongly inspired by Lucene's design. - LZ4 compressed document store - Range queries - Faceted search -- Configurable indexing (optional term frequency and position indexing +- Configurable indexing (optional term frequency and position indexing) - Cheesy logo with a horse # Non-features -- Distributed search and will not be in the scope of tantivy. +- Distributed search is out of the scope of tantivy. That being said, tantivy is meant as a +library upon which one could build a distributed search. Serializable/mergeable collector state for instance, +are within the scope of tantivy. # Supported OS and compiler @@ -76,6 +80,10 @@ To check out and run tests, you can simply run : cd tantivy cargo build +## Running tests + +Some tests will not run with just `cargo test` because of `fail-rs`. +To run the tests exhaustively, run `./run-tests.sh`. # Contribute diff --git a/appveyor.yml b/appveyor.yml index a3bd2ac04..685b04d3a 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -18,5 +18,5 @@ install: build: false test_script: - - REM SET RUST_LOG=tantivy,test & cargo test --verbose - - REM SET RUST_BACKTRACE=1 & cargo build --examples \ No newline at end of file + - REM SET RUST_LOG=tantivy,test & cargo test --verbose --no-default-features --features mmap -- --test-threads 1 + - REM SET RUST_BACKTRACE=1 & cargo build --examples diff --git a/ci/script.sh b/ci/script.sh index b56345753..9f3cf889d 100644 --- a/ci/script.sh +++ b/ci/script.sh @@ -11,12 +11,11 @@ main() { else echo "Build" cross build --target $TARGET - cross build --target $TARGET --release if [ ! -z $DISABLE_TESTS ]; then return fi echo "Test" - cross test --target $TARGET + cross test --target $TARGET --no-default-features --features mmap -- --test-threads 1 fi for example in $(ls examples/*.rs) do diff --git a/doc/.gitignore b/doc/.gitignore new file mode 100644 index 000000000..7585238ef --- /dev/null +++ b/doc/.gitignore @@ -0,0 +1 @@ +book diff --git a/doc/book.toml b/doc/book.toml new file mode 100644 index 000000000..a8c8ec91a --- /dev/null +++ b/doc/book.toml @@ -0,0 +1,5 @@ +[book] +authors = ["Paul Masurel"] +multilingual = false +src = "src" +title = "Tantivy, the user guide" diff --git a/doc/src/SUMMARY.md b/doc/src/SUMMARY.md new file mode 100644 index 000000000..a280d19b7 --- /dev/null +++ b/doc/src/SUMMARY.md @@ -0,0 +1,15 @@ +# Summary + + + +[Avant Propos](./avant-propos.md) + +- [Segments](./basis.md) +- [Defining your schema](./schema.md) +- [Facetting](./facetting.md) +- [Innerworkings](./innerworkings.md) + - [Inverted index](./inverted_index.md) +- [Best practise](./inverted_index.md) + +[Frequently Asked Questions](./faq.md) +[Examples](./examples.md) diff --git a/doc/src/avant-propos.md b/doc/src/avant-propos.md new file mode 100644 index 000000000..485afd178 --- /dev/null +++ b/doc/src/avant-propos.md @@ -0,0 +1,34 @@ +# Foreword, what is the scope of tantivy? 
+
+> Tantivy is a **search** engine **library** for Rust.
+
+If you are familiar with Lucene, it is an excellent approximation to consider tantivy as Lucene for Rust. tantivy is heavily inspired by Lucene's design, and
+they both have the same scope and targeted use cases.
+
+If you are not familiar with Lucene, let's break down our little tagline.
+
+- **Search** here means full-text search: fundamentally, tantivy is here to help you
+efficiently identify the documents matching a given query in your corpus.
+But modern search UIs are so much more: text processing, facetting, autocomplete, fuzzy search, good
+relevancy, collapsing, highlighting, spatial search.
+
+  While some of these features are not available in tantivy yet, all of these are relevant
+  feature requests. Tantivy's objective is to offer a solid toolbox to create the best search
+  experience. But keep in mind this is just a toolbox.
+  Which brings us to the second keyword...
+
+- **Library** means that you will have to write code. tantivy is not an *all-in-one* server solution like Elasticsearch, for instance.
+
+  Sometimes a feature will not be available in tantivy because it is too
+  specific to your use case. By design, tantivy should make it possible to extend
+  the available set of features using the existing rock-solid data structures.
+
+  Most frequently, this will mean writing your own `Collector`, your own `Scorer` or your own
+  `TokenFilter`... Some of your requirements may also be related to
+  something closer to architecture or operations. For instance, you may
+  want to build a large corpus on Hadoop, fine-tune the merge policy to keep your
+  index sharded in a time-wise fashion, or you may want to convert an existing
+  index from a different format.
+
+  Tantivy exposes a lot of low-level APIs to do all of these things.
+
diff --git a/doc/src/basis.md b/doc/src/basis.md
new file mode 100644
index 000000000..21dadb7fb
--- /dev/null
+++ b/doc/src/basis.md
@@ -0,0 +1,77 @@
+# Anatomy of an index
+
+## Straight from disk
+
+Tantivy accesses its data through an abstraction, a trait called `Directory`.
+In theory, one can override the data access logic. In practice, the
+trait somewhat assumes that your data can be mapped to memory, and tantivy
+is deeply married to using `mmap` for its IO [^1]; the only persistent
+directory shipped with tantivy is the `MmapDirectory`.
+
+While this design has some downsides, it greatly simplifies the source code of
+tantivy. Caching is also entirely delegated to the OS.
+
+`tantivy` works almost entirely by directly reading its data structures as they are laid out on disk. As a result, opening an index does not involve loading various data structures from disk into random access memory: starting a process, opening an index, and performing your first query can typically be done in a matter of milliseconds.
+
+This is an interesting property for a command line search engine, or for some multi-tenant log search engine: spawning a new process for each new query can be a perfectly sensible solution in some use cases.
+
+In later chapters, we will discuss tantivy's inverted index data layout.
+One key takeaway is that, to achieve great performance, search indexes are extremely compact.
+Of course, this is crucial to reduce IO and to ensure that as much of the index as possible can sit in RAM.
+
+Also, whenever possible, the data is accessed sequentially.
Of course, this is an amazing property when tantivy needs to access the data from a spinning hard disk, but it is also
+critical for performance if your data is read from an `SSD` or is even already in your page cache.
+
+
+## Segments, and the log method
+
+That kind of compact layout comes at a cost: it prevents our data structures from being dynamic.
+In fact, the `Directory` trait does not even allow you to modify part of a file.
+
+To allow the addition and deletion of documents, and to create the illusion that
+your index is dynamic, tantivy uses a common database trick sometimes referred to as the *log method*.
+
+Let's forget about deletes for a moment.
+
+As you add documents, they are processed and stored in a dedicated data structure, in a `RAM` buffer. This data structure is not ready for search, but it can receive your data and rearrange it very rapidly.
+
+As you keep adding documents, this buffer will reach its capacity; tantivy then transparently stops adding documents to it and starts converting the data structure to its final read-only format on disk. Once it has been written, a brand new empty buffer is available to resume adding documents.
+
+The resulting chunk of index obtained after this serialization is called a `Segment`.
+
+> A segment is a self-contained, atomic piece of index. It is identified by a UUID, and all of its files are identified using the naming scheme: `.*`.
+
+Which brings us to the nature of a tantivy `Index`.
+
+> A tantivy `Index` is a collection of `Segments`.
+
+Physically, this really just means an index is a bunch of segment files in a given `Directory`,
+linked together by a `meta.json` file. This transparency can become extremely handy
+to get tantivy to fit your use case:
+
+*Example 1* You could, for instance, use Hadoop to build a very large search index in a timely manner, copy all of the resulting segment files into the same directory, and edit the `meta.json` to get a functional index.[^2]
+
+*Example 2* You could also disable your merge policy and enforce daily segments. Removing data after one week can then be done very efficiently by just editing the `meta.json` and deleting the files associated with segment `D-7`.
+
+
+## Merging
+
+As you index more and more data, your index will accumulate more and more segments.
+Having a lot of small segments is not optimal: there is a bit of redundancy in keeping
+all of these term dictionaries, and when searching, we will need to do a term lookup in every segment, which can hurt search performance a bit.
+
+That's where merging, or compacting, comes into play. Tantivy will continuously consider merge
+opportunities and start merging segments in the background.
+
+
+## Indexing throughput, number of indexing threads
+
+
+
+[^1]: This may eventually change.
+
+[^2]: Be careful, however. By default, these files will not be considered *managed* by tantivy. This means they will never be garbage collected by tantivy, regardless of whether they become obsolete.
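To make the "an index is a collection of segments" idea above concrete, here is a minimal sketch (not part of the patch itself) built only from calls that appear in this diff's examples; it assumes the tantivy 0.7 API. After each commit and `load_searchers()`, every segment of the index is exposed as one `SegmentReader` on the searcher:

```rust
#[macro_use]
extern crate tantivy;
use tantivy::schema::{SchemaBuilder, STORED, TEXT};
use tantivy::Index;

fn main() -> tantivy::Result<()> {
    let mut schema_builder = SchemaBuilder::default();
    let title = schema_builder.add_text_field("title", TEXT | STORED);
    let schema = schema_builder.build();

    // A RAM index for the sake of the example; an index opened on an
    // `MmapDirectory` behaves the same way.
    let index = Index::create_in_ram(schema);
    let mut index_writer = index.writer_with_num_threads(1, 50_000_000)?;

    index_writer.add_document(doc!(title => "The Old Man and the Sea"));
    // Each commit turns the in-RAM buffer into one (or more) read-only segments.
    index_writer.commit()?;
    index_writer.add_document(doc!(title => "Of Mice and Men"));
    index_writer.commit()?;

    // Reload searchers so the newly written segments become visible.
    index.load_searchers()?;
    let searcher = index.searcher();

    // The searcher simply wraps one reader per segment of the index.
    println!("the index currently has {} segments", searcher.segment_readers().len());
    Ok(())
}
```

With the default merge policy, background merges may later compact these small segments into fewer, larger ones, as described in the "Merging" section above.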
diff --git a/doc/src/best_practise.md.rs b/doc/src/best_practise.md.rs new file mode 100644 index 000000000..e69de29bb diff --git a/doc/src/examples.md b/doc/src/examples.md new file mode 100644 index 000000000..6ba4a8a4d --- /dev/null +++ b/doc/src/examples.md @@ -0,0 +1,3 @@ +# Examples + +- [Basic search](/examples/basic_search.html) \ No newline at end of file diff --git a/doc/src/facetting.md b/doc/src/facetting.md new file mode 100644 index 000000000..a1d7dc061 --- /dev/null +++ b/doc/src/facetting.md @@ -0,0 +1,5 @@ +# Facetting + +wewew + +## weeewe diff --git a/doc/src/faq.md b/doc/src/faq.md new file mode 100644 index 000000000..e69de29bb diff --git a/doc/src/innerworkings.md b/doc/src/innerworkings.md new file mode 100644 index 000000000..f1de34898 --- /dev/null +++ b/doc/src/innerworkings.md @@ -0,0 +1 @@ +# Innerworkings diff --git a/doc/src/inverted_index.md b/doc/src/inverted_index.md new file mode 100644 index 000000000..f07f47e52 --- /dev/null +++ b/doc/src/inverted_index.md @@ -0,0 +1 @@ +# Inverted index diff --git a/doc/src/schema.md b/doc/src/schema.md new file mode 100644 index 000000000..eb661bd69 --- /dev/null +++ b/doc/src/schema.md @@ -0,0 +1 @@ +# Defining your schema diff --git a/examples/basic_search.rs b/examples/basic_search.rs index 35867b2f0..00576be51 100644 --- a/examples/basic_search.rs +++ b/examples/basic_search.rs @@ -10,7 +10,6 @@ // - search for the best document matchings "sea whale" // - retrieve the best document original content. - extern crate tempdir; // --- @@ -231,13 +230,11 @@ fn main() -> tantivy::Result<()> { // a title. for doc_address in doc_addresses { - let retrieved_doc = searcher.doc(&doc_address)?; + let retrieved_doc = searcher.doc(doc_address)?; println!("{}", schema.to_json(&retrieved_doc)); } - Ok(()) } - use tempdir::TempDir; diff --git a/examples/custom_tokenizer.rs b/examples/custom_tokenizer.rs index e44b82c57..08236c0e5 100644 --- a/examples/custom_tokenizer.rs +++ b/examples/custom_tokenizer.rs @@ -3,7 +3,6 @@ // In this example, we'll see how to define a tokenizer pipeline // by aligning a bunch of `TokenFilter`. - #[macro_use] extern crate tantivy; use tantivy::collector::TopCollector; @@ -12,7 +11,6 @@ use tantivy::schema::*; use tantivy::tokenizer::NgramTokenizer; use tantivy::Index; - fn main() -> tantivy::Result<()> { // # Defining the schema // @@ -111,7 +109,7 @@ fn main() -> tantivy::Result<()> { let doc_addresses = top_collector.docs(); for doc_address in doc_addresses { - let retrieved_doc = searcher.doc(&doc_address)?; + let retrieved_doc = searcher.doc(doc_address)?; println!("{}", schema.to_json(&retrieved_doc)); } diff --git a/examples/deleting_updating_documents.rs b/examples/deleting_updating_documents.rs index 9ddb38a59..afae85685 100644 --- a/examples/deleting_updating_documents.rs +++ b/examples/deleting_updating_documents.rs @@ -11,10 +11,9 @@ #[macro_use] extern crate tantivy; use tantivy::collector::TopCollector; +use tantivy::query::TermQuery; use tantivy::schema::*; use tantivy::Index; -use tantivy::query::TermQuery; - // A simple helper function to fetch a single document // given its id from our index. 
@@ -31,8 +30,8 @@ fn extract_doc_given_isbn(index: &Index, isbn_term: &Term) -> tantivy::Result tantivy::Result tantivy::Result<()> { - // # Defining the schema // // Check out the *basic_search* example if this makes @@ -126,7 +124,6 @@ fn main() -> tantivy::Result<()> { isbn => "978-9176370711", )); - // You are guaranteed that your clients will only observe your index in // the state it was in after a commit. // In this example, your search engine will at no point be missing the *Frankenstein* document. @@ -143,4 +140,4 @@ fn main() -> tantivy::Result<()> { ); Ok(()) -} \ No newline at end of file +} diff --git a/examples/faceted_search.rs b/examples/faceted_search.rs new file mode 100644 index 000000000..24fd536e8 --- /dev/null +++ b/examples/faceted_search.rs @@ -0,0 +1,81 @@ +// # Basic Example +// +// This example covers the basic functionalities of +// tantivy. +// +// We will : +// - define our schema +// = create an index in a directory +// - index few documents in our index +// - search for the best document matchings "sea whale" +// - retrieve the best document original content. + +extern crate tempdir; + +// --- +// Importing tantivy... +#[macro_use] +extern crate tantivy; +use tantivy::collector::FacetCollector; +use tantivy::query::AllQuery; +use tantivy::schema::*; +use tantivy::Index; + +fn main() -> tantivy::Result<()> { + // Let's create a temporary directory for the + // sake of this example + let index_path = TempDir::new("tantivy_facet_example_dir")?; + let mut schema_builder = SchemaBuilder::default(); + + schema_builder.add_text_field("name", TEXT | STORED); + + // this is our faceted field + schema_builder.add_facet_field("tags"); + + let schema = schema_builder.build(); + + let index = Index::create_in_dir(&index_path, schema.clone())?; + + let mut index_writer = index.writer(50_000_000)?; + + let name = schema.get_field("name").unwrap(); + let tags = schema.get_field("tags").unwrap(); + + // For convenience, tantivy also comes with a macro to + // reduce the boilerplate above. + index_writer.add_document(doc!( + name => "the ditch", + tags => Facet::from("/pools/north") + )); + + index_writer.add_document(doc!( + name => "little stacey", + tags => Facet::from("/pools/south") + )); + + index_writer.commit()?; + + index.load_searchers()?; + + let searcher = index.searcher(); + + let mut facet_collector = FacetCollector::for_field(tags); + facet_collector.add_facet("/pools"); + + searcher.search(&AllQuery, &mut facet_collector).unwrap(); + + let counts = facet_collector.harvest(); + // This lists all of the facet counts + let facets: Vec<(&Facet, u64)> = counts.get("/pools").collect(); + assert_eq!( + facets, + vec![ + (&Facet::from("/pools/north"), 1), + (&Facet::from("/pools/south"), 1), + ] + ); + + Ok(()) +} + +use tempdir::TempDir; diff --git a/examples/generate_html.sh b/examples/generate_html.sh deleted file mode 100755 index ec07322e2..000000000 --- a/examples/generate_html.sh +++ /dev/null @@ -1,6 +0,0 @@ -#!/bin/bash - -for example in $(ls *.rs) -do - docco $example -o html -done diff --git a/examples/iterating_docs_and_positions.rs b/examples/iterating_docs_and_positions.rs new file mode 100644 index 000000000..0434f58c8 --- /dev/null +++ b/examples/iterating_docs_and_positions.rs @@ -0,0 +1,133 @@ +// # Iterating docs and positioms. +// +// At its core of tantivy, relies on a data structure +// called an inverted index. 
+// +// This example shows how to manually iterate through +// the list of documents containing a term, getting +// its term frequency, and accessing its positions. + +// --- +// Importing tantivy... +#[macro_use] +extern crate tantivy; +use tantivy::schema::*; +use tantivy::Index; +use tantivy::{DocId, DocSet, Postings}; + +fn main() -> tantivy::Result<()> { + // We first create a schema for the sake of the + // example. Check the `basic_search` example for more information. + let mut schema_builder = SchemaBuilder::default(); + + // For this example, we need to make sure to index positions for our title + // field. `TEXT` precisely does this. + let title = schema_builder.add_text_field("title", TEXT | STORED); + let schema = schema_builder.build(); + + let index = Index::create_in_ram(schema.clone()); + + let mut index_writer = index.writer_with_num_threads(1, 50_000_000)?; + index_writer.add_document(doc!(title => "The Old Man and the Sea")); + index_writer.add_document(doc!(title => "Of Mice and Men")); + index_writer.add_document(doc!(title => "The modern Promotheus")); + index_writer.commit()?; + + index.load_searchers()?; + + let searcher = index.searcher(); + + // A tantivy index is actually a collection of segments. + // Similarly, a searcher just wraps a list `segment_reader`. + // + // (Because we indexed a very small number of documents over one thread + // there is actually only one segment here, but let's iterate through the list + // anyway) + for segment_reader in searcher.segment_readers() { + // A segment contains different data structure. + // Inverted index stands for the combination of + // - the term dictionary + // - the inverted lists associated to each terms and their positions + let inverted_index = segment_reader.inverted_index(title); + + // A `Term` is a text token associated with a field. + // Let's go through all docs containing the term `title:the` and access their position + let term_the = Term::from_field_text(title, "the"); + + // This segment posting object is like a cursor over the documents matching the term. + // The `IndexRecordOption` arguments tells tantivy we will be interested in both term frequencies + // and positions. + // + // If you don't need all this information, you may get better performance by decompressing less + // information. + if let Some(mut segment_postings) = + inverted_index.read_postings(&term_the, IndexRecordOption::WithFreqsAndPositions) + { + // this buffer will be used to request for positions + let mut positions: Vec = Vec::with_capacity(100); + while segment_postings.advance() { + // the number of time the term appears in the document. + let doc_id: DocId = segment_postings.doc(); //< do not try to access this before calling advance once. + + // This MAY contains deleted documents as well. + if segment_reader.is_deleted(doc_id) { + continue; + } + + // the number of time the term appears in the document. + let term_freq: u32 = segment_postings.term_freq(); + // accessing positions is slightly expensive and lazy, do not request + // for them if you don't need them for some documents. + segment_postings.positions(&mut positions); + + // By definition we should have `term_freq` positions. + assert_eq!(positions.len(), term_freq as usize); + + // This prints: + // ``` + // Doc 0: TermFreq 2: [0, 4] + // Doc 2: TermFreq 1: [0] + // ``` + println!("Doc {}: TermFreq {}: {:?}", doc_id, term_freq, positions); + } + } + } + + // A `Term` is a text token associated with a field. 
+ // Let's go through all docs containing the term `title:the` and access their position + let term_the = Term::from_field_text(title, "the"); + + // Some other powerful operations (especially `.skip_to`) may be useful to consume these + // posting lists rapidly. + // You can check for them in the [`DocSet`](https://docs.rs/tantivy/~0/tantivy/trait.DocSet.html) trait + // and the [`Postings`](https://docs.rs/tantivy/~0/tantivy/trait.Postings.html) trait + + // Also, for some VERY specific high performance use case like an OLAP analysis of logs, + // you can get better performance by accessing directly the blocks of doc ids. + for segment_reader in searcher.segment_readers() { + // A segment contains different data structure. + // Inverted index stands for the combination of + // - the term dictionary + // - the inverted lists associated to each terms and their positions + let inverted_index = segment_reader.inverted_index(title); + + // This segment posting object is like a cursor over the documents matching the term. + // The `IndexRecordOption` arguments tells tantivy we will be interested in both term frequencies + // and positions. + // + // If you don't need all this information, you may get better performance by decompressing less + // information. + if let Some(mut block_segment_postings) = + inverted_index.read_block_postings(&term_the, IndexRecordOption::Basic) + { + while block_segment_postings.advance() { + // Once again these docs MAY contains deleted documents as well. + let docs = block_segment_postings.docs(); + // Prints `Docs [0, 2].` + println!("Docs {:?}", docs); + } + } + } + + Ok(()) +} diff --git a/examples/snippet.rs b/examples/snippet.rs new file mode 100644 index 000000000..ecc9481ed --- /dev/null +++ b/examples/snippet.rs @@ -0,0 +1,71 @@ +// # Snippet example +// +// This example shows how to return a representative snippet of +// your hit result. +// Snippet are an extracted of a target document, and returned in HTML format. +// The keyword searched by the user are highlighted with a `` tag. +extern crate tempdir; + +// --- +// Importing tantivy... +#[macro_use] +extern crate tantivy; +use tantivy::collector::TopCollector; +use tantivy::query::QueryParser; +use tantivy::schema::*; +use tantivy::Index; +use tantivy::SnippetGenerator; +use tempdir::TempDir; + +fn main() -> tantivy::Result<()> { + // Let's create a temporary directory for the + // sake of this example + let index_path = TempDir::new("tantivy_example_dir")?; + + // # Defining the schema + let mut schema_builder = SchemaBuilder::default(); + let title = schema_builder.add_text_field("title", TEXT | STORED); + let body = schema_builder.add_text_field("body", TEXT | STORED); + let schema = schema_builder.build(); + + // # Indexing documents + let index = Index::create_in_dir(&index_path, schema.clone())?; + + let mut index_writer = index.writer(50_000_000)?; + + // we'll only need one doc for this example. + index_writer.add_document(doc!( + title => "Of Mice and Men", + body => "A few miles south of Soledad, the Salinas River drops in close to the hillside \ + bank and runs deep and green. The water is warm too, for it has slipped twinkling \ + over the yellow sands in the sunlight before reaching the narrow pool. 
On one \ + side of the river the golden foothill slopes curve up to the strong and rocky \ + Gabilan Mountains, but on the valley side the water is lined with trees—willows \ + fresh and green with every spring, carrying in their lower leaf junctures the \ + debris of the winter’s flooding; and sycamores with mottled, white, recumbent \ + limbs and branches that arch over the pool" + )); + // ... + index_writer.commit()?; + + index.load_searchers()?; + + let searcher = index.searcher(); + let query_parser = QueryParser::for_index(&index, vec![title, body]); + let query = query_parser.parse_query("sycamore spring")?; + + let mut top_collector = TopCollector::with_limit(10); + searcher.search(&*query, &mut top_collector)?; + + let snippet_generator = SnippetGenerator::new(&searcher, &*query, body)?; + + let doc_addresses = top_collector.docs(); + for doc_address in doc_addresses { + let doc = searcher.doc(doc_address)?; + let snippet = snippet_generator.snippet_from_doc(&doc); + println!("title: {}", doc.get_first(title).unwrap().text().unwrap()); + println!("snippet: {}", snippet.to_html()); + } + + Ok(()) +} diff --git a/examples/stop_words.rs b/examples/stop_words.rs index 950a42afd..80e78ece2 100644 --- a/examples/stop_words.rs +++ b/examples/stop_words.rs @@ -22,72 +22,71 @@ use tantivy::tokenizer::*; use tantivy::Index; fn main() -> tantivy::Result<()> { - // this example assumes you understand the content in `basic_search` - let index_path = TempDir::new("tantivy_stopwords_example_dir")?; - let mut schema_builder = SchemaBuilder::default(); + // this example assumes you understand the content in `basic_search` + let mut schema_builder = SchemaBuilder::default(); - // This configures your custom options for how tantivy will - // store and process your content in the index; The key - // to note is that we are setting the tokenizer to `stoppy` - // which will be defined and registered below. - let text_field_indexing = TextFieldIndexing::default() - .set_tokenizer("stoppy") - .set_index_option(IndexRecordOption::WithFreqsAndPositions); - let text_options = TextOptions::default() - .set_indexing_options(text_field_indexing) - .set_stored(); + // This configures your custom options for how tantivy will + // store and process your content in the index; The key + // to note is that we are setting the tokenizer to `stoppy` + // which will be defined and registered below. + let text_field_indexing = TextFieldIndexing::default() + .set_tokenizer("stoppy") + .set_index_option(IndexRecordOption::WithFreqsAndPositions); + let text_options = TextOptions::default() + .set_indexing_options(text_field_indexing) + .set_stored(); - // Our first field is title. - schema_builder.add_text_field("title", text_options); + // Our first field is title. + schema_builder.add_text_field("title", text_options); - // Our second field is body. - let text_field_indexing = TextFieldIndexing::default() - .set_tokenizer("stoppy") - .set_index_option(IndexRecordOption::WithFreqsAndPositions); - let text_options = TextOptions::default() - .set_indexing_options(text_field_indexing) - .set_stored(); - schema_builder.add_text_field("body", text_options); + // Our second field is body. 
+ let text_field_indexing = TextFieldIndexing::default() + .set_tokenizer("stoppy") + .set_index_option(IndexRecordOption::WithFreqsAndPositions); + let text_options = TextOptions::default() + .set_indexing_options(text_field_indexing) + .set_stored(); + schema_builder.add_text_field("body", text_options); - let schema = schema_builder.build(); + let schema = schema_builder.build(); - let index = Index::create_in_dir(&index_path, schema.clone())?; + let index = Index::create_in_ram(schema.clone()); - // This tokenizer lowers all of the text (to help with stop word matching) - // then removes all instances of `the` and `and` from the corpus - let tokenizer = SimpleTokenizer - .filter(LowerCaser) - .filter(StopWordFilter::remove(vec![ - "the".to_string(), - "and".to_string(), - ])); + // This tokenizer lowers all of the text (to help with stop word matching) + // then removes all instances of `the` and `and` from the corpus + let tokenizer = SimpleTokenizer + .filter(LowerCaser) + .filter(StopWordFilter::remove(vec![ + "the".to_string(), + "and".to_string(), + ])); - index.tokenizers().register("stoppy", tokenizer); + index.tokenizers().register("stoppy", tokenizer); - let mut index_writer = index.writer(50_000_000)?; + let mut index_writer = index.writer(50_000_000)?; - let title = schema.get_field("title").unwrap(); - let body = schema.get_field("body").unwrap(); + let title = schema.get_field("title").unwrap(); + let body = schema.get_field("body").unwrap(); - index_writer.add_document(doc!( + index_writer.add_document(doc!( title => "The Old Man and the Sea", body => "He was an old man who fished alone in a skiff in the Gulf Stream and \ he had gone eighty-four days now without taking a fish." )); - index_writer.add_document(doc!( - title => "Of Mice and Men", - body => "A few miles south of Soledad, the Salinas River drops in close to the hillside \ - bank and runs deep and green. The water is warm too, for it has slipped twinkling \ - over the yellow sands in the sunlight before reaching the narrow pool. On one \ - side of the river the golden foothill slopes curve up to the strong and rocky \ - Gabilan Mountains, but on the valley side the water is lined with trees—willows \ - fresh and green with every spring, carrying in their lower leaf junctures the \ - debris of the winter’s flooding; and sycamores with mottled, white, recumbent \ - limbs and branches that arch over the pool" - )); + index_writer.add_document(doc!( + title => "Of Mice and Men", + body => "A few miles south of Soledad, the Salinas River drops in close to the hillside \ + bank and runs deep and green. The water is warm too, for it has slipped twinkling \ + over the yellow sands in the sunlight before reaching the narrow pool. On one \ + side of the river the golden foothill slopes curve up to the strong and rocky \ + Gabilan Mountains, but on the valley side the water is lined with trees—willows \ + fresh and green with every spring, carrying in their lower leaf junctures the \ + debris of the winter’s flooding; and sycamores with mottled, white, recumbent \ + limbs and branches that arch over the pool" + )); - index_writer.add_document(doc!( + index_writer.add_document(doc!( title => "Frankenstein", body => "You will rejoice to hear that no disaster has accompanied the commencement of an \ enterprise which you have regarded with such evil forebodings. I arrived here \ @@ -95,35 +94,28 @@ fn main() -> tantivy::Result<()> { increasing confidence in the success of my undertaking." 
)); - index_writer.commit()?; + index_writer.commit()?; - index.load_searchers()?; + index.load_searchers()?; - let searcher = index.searcher(); + let searcher = index.searcher(); - let query_parser = QueryParser::for_index(&index, vec![title, body]); + let query_parser = QueryParser::for_index(&index, vec![title, body]); - // this will have NO hits because it was filtered out - // because the query is run through the analyzer you - // actually will get an error here because the query becomes - // empty - assert!(query_parser.parse_query("the").is_err()); + // stop words are applied on the query as well. + // The following will be equivalent to `title:frankenstein` + let query = query_parser.parse_query("title:\"the Frankenstein\"")?; - // this will have hits - let query = query_parser.parse_query("is")?; + let mut top_collector = TopCollector::with_limit(10); - let mut top_collector = TopCollector::with_limit(10); + searcher.search(&*query, &mut top_collector)?; - searcher.search(&*query, &mut top_collector)?; + let doc_addresses = top_collector.docs(); - let doc_addresses = top_collector.docs(); + for doc_address in doc_addresses { + let retrieved_doc = searcher.doc(doc_address)?; + println!("{}", schema.to_json(&retrieved_doc)); + } - for doc_address in doc_addresses { - let retrieved_doc = searcher.doc(&doc_address)?; - println!("{}", schema.to_json(&retrieved_doc)); - } - - Ok(()) + Ok(()) } - -use tempdir::TempDir; diff --git a/examples/working_with_json.rs b/examples/working_with_json.rs index 5de285df2..3c8e3c1ca 100644 --- a/examples/working_with_json.rs +++ b/examples/working_with_json.rs @@ -1,7 +1,6 @@ extern crate tantivy; use tantivy::schema::*; - // # Document from json // // For convenience, `Document` can be parsed directly from json. @@ -23,8 +22,8 @@ fn main() -> tantivy::Result<()> { }"#; // We can parse our document - let mice_and_men_doc = schema.parse_document(&mice_and_men_doc_json)?; - + let _mice_and_men_doc = schema.parse_document(&mice_and_men_doc_json)?; + // Multi-valued field are allowed, they are // expressed in JSON by an array. // The following document has two titles. @@ -32,8 +31,7 @@ fn main() -> tantivy::Result<()> { "title": ["Frankenstein", "The Modern Prometheus"], "year": 1818 }"#; - let frankenstein_doc = schema.parse_document(&frankenstein_json)?; - + let _frankenstein_doc = schema.parse_document(&frankenstein_json)?; // Note that the schema is saved in your index directory. 
// diff --git a/run-tests.sh b/run-tests.sh new file mode 100755 index 000000000..fc2944dd5 --- /dev/null +++ b/run-tests.sh @@ -0,0 +1,2 @@ +#!/bin/bash +cargo test --no-default-features --features mmap -- --test-threads 1 diff --git a/src/collector/facet_collector.rs b/src/collector/facet_collector.rs index 6c0bb647d..a092a8dae 100644 --- a/src/collector/facet_collector.rs +++ b/src/collector/facet_collector.rs @@ -342,16 +342,19 @@ impl FacetCollector { pub fn harvest(mut self) -> FacetCounts { self.finalize_segment(); - let collapsed_facet_ords: Vec<&[u64]> = self.segment_counters + let collapsed_facet_ords: Vec<&[u64]> = self + .segment_counters .iter() .map(|segment_counter| &segment_counter.facet_ords[..]) .collect(); - let collapsed_facet_counts: Vec<&[u64]> = self.segment_counters + let collapsed_facet_counts: Vec<&[u64]> = self + .segment_counters .iter() .map(|segment_counter| &segment_counter.facet_counts[..]) .collect(); - let facet_streams = self.segment_counters + let facet_streams = self + .segment_counters .iter() .map(|seg_counts| seg_counts.facet_reader.facet_dict().range().into_stream()) .collect::>(); @@ -374,10 +377,8 @@ impl FacetCollector { } else { collapsed_facet_counts[seg_ord][collapsed_term_id] } - }) - .unwrap_or(0) - }) - .sum(); + }).unwrap_or(0) + }).sum(); if count > 0u64 { let bytes: Vec = facet_merger.key().to_owned(); // may create an corrupted facet if the term dicitonary is corrupted @@ -402,7 +403,8 @@ impl Collector for FacetCollector { fn collect(&mut self, doc: DocId, _: Score) { let facet_reader: &mut FacetReader = unsafe { - &mut *self.ff_reader + &mut *self + .ff_reader .as_ref() .expect("collect() was called before set_segment. This should never happen.") .get() @@ -476,9 +478,8 @@ impl FacetCounts { heap.push(Hit { count, facet }); } - let mut lowest_count: u64 = heap.peek().map(|hit| hit.count) - .unwrap_or(u64::MIN); //< the `unwrap_or` case may be triggered but the value - // is never used in that case. + let mut lowest_count: u64 = heap.peek().map(|hit| hit.count).unwrap_or(u64::MIN); //< the `unwrap_or` case may be triggered but the value + // is never used in that case. 
for (facet, count) in it { if count > lowest_count { @@ -526,8 +527,7 @@ mod tests { n /= 4; let leaf = n % 5; Facet::from(&format!("/top{}/mid{}/leaf{}", top, mid, leaf)) - }) - .collect(); + }).collect(); for i in 0..num_facets * 10 { let mut doc = Document::new(); doc.add_facet(facet_field, facets[i % num_facets].clone()); @@ -554,7 +554,8 @@ mod tests { ("/top1/mid1", 50), ("/top1/mid2", 50), ("/top1/mid3", 50), - ].iter() + ] + .iter() .map(|&(facet_str, count)| (String::from(facet_str), count)) .collect::>() ); @@ -618,9 +619,13 @@ mod tests { let facet = Facet::from(&format!("/facet/{}", c)); let doc = doc!(facet_field => facet); iter::repeat(doc).take(count) - }) - .map(|mut doc| { doc.add_facet(facet_field, &format!("/facet/{}", thread_rng().sample(&uniform) )); doc}) - .collect(); + }).map(|mut doc| { + doc.add_facet( + facet_field, + &format!("/facet/{}", thread_rng().sample(&uniform)), + ); + doc + }).collect(); thread_rng().shuffle(&mut docs[..]); let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap(); diff --git a/src/collector/mod.rs b/src/collector/mod.rs index d29eb1c6f..99af0f286 100644 --- a/src/collector/mod.rs +++ b/src/collector/mod.rs @@ -15,7 +15,14 @@ mod multi_collector; pub use self::multi_collector::MultiCollector; mod top_collector; -pub use self::top_collector::TopCollector; + +mod top_score_collector; +pub use self::top_score_collector::TopScoreCollector; +#[deprecated] +pub use self::top_score_collector::TopScoreCollector as TopCollector; + +mod top_field_collector; +pub use self::top_field_collector::TopFieldCollector; mod facet_collector; pub use self::facet_collector::FacetCollector; diff --git a/src/collector/multi_collector.rs b/src/collector/multi_collector.rs index 568a843e8..14ff80788 100644 --- a/src/collector/multi_collector.rs +++ b/src/collector/multi_collector.rs @@ -100,11 +100,11 @@ impl<'a> Collector for MultiCollector<'a> { mod tests { use super::*; - use collector::{Collector, CountCollector, TopCollector}; + use collector::{Collector, CountCollector, TopScoreCollector}; #[test] fn test_multi_collector() { - let mut top_collector = TopCollector::with_limit(2); + let mut top_collector = TopScoreCollector::with_limit(2); let mut count_collector = CountCollector::default(); { let mut collectors = diff --git a/src/collector/top_collector.rs b/src/collector/top_collector.rs index 8d2829b73..265a6981a 100644 --- a/src/collector/top_collector.rs +++ b/src/collector/top_collector.rs @@ -1,115 +1,61 @@ -use super::Collector; use std::cmp::Ordering; use std::collections::BinaryHeap; use DocAddress; use DocId; -use Result; -use Score; use SegmentLocalId; -use SegmentReader; -// Rust heap is a max-heap and we need a min heap. +/// Contains a feature (field, score, etc.) of a document along with the document address. +/// +/// It has a custom implementation of `PartialOrd` that reverses the order. This is because the +/// default Rust heap is a max heap, whereas a min heap is needed. 
#[derive(Clone, Copy)] -struct GlobalScoredDoc { - score: Score, +pub struct ComparableDoc { + feature: T, doc_address: DocAddress, } -impl PartialOrd for GlobalScoredDoc { - fn partial_cmp(&self, other: &GlobalScoredDoc) -> Option { +impl PartialOrd for ComparableDoc { + fn partial_cmp(&self, other: &Self) -> Option { Some(self.cmp(other)) } } -impl Ord for GlobalScoredDoc { +impl Ord for ComparableDoc { #[inline] - fn cmp(&self, other: &GlobalScoredDoc) -> Ordering { + fn cmp(&self, other: &Self) -> Ordering { other - .score - .partial_cmp(&self.score) + .feature + .partial_cmp(&self.feature) .unwrap_or_else(|| other.doc_address.cmp(&self.doc_address)) } } -impl PartialEq for GlobalScoredDoc { - fn eq(&self, other: &GlobalScoredDoc) -> bool { +impl PartialEq for ComparableDoc { + fn eq(&self, other: &Self) -> bool { self.cmp(other) == Ordering::Equal } } -impl Eq for GlobalScoredDoc {} +impl Eq for ComparableDoc {} /// The Top Collector keeps track of the K documents -/// with the best scores. +/// sorted by type `T`. /// /// The implementation is based on a `BinaryHeap`. /// The theorical complexity for collecting the top `K` out of `n` documents /// is `O(n log K)`. -/// -/// ```rust -/// #[macro_use] -/// extern crate tantivy; -/// use tantivy::schema::{SchemaBuilder, TEXT}; -/// use tantivy::{Index, Result, DocId, Score}; -/// use tantivy::collector::TopCollector; -/// use tantivy::query::QueryParser; -/// -/// # fn main() { example().unwrap(); } -/// fn example() -> Result<()> { -/// let mut schema_builder = SchemaBuilder::new(); -/// let title = schema_builder.add_text_field("title", TEXT); -/// let schema = schema_builder.build(); -/// let index = Index::create_in_ram(schema); -/// { -/// let mut index_writer = index.writer_with_num_threads(1, 3_000_000)?; -/// index_writer.add_document(doc!( -/// title => "The Name of the Wind", -/// )); -/// index_writer.add_document(doc!( -/// title => "The Diary of Muadib", -/// )); -/// index_writer.add_document(doc!( -/// title => "A Dairy Cow", -/// )); -/// index_writer.add_document(doc!( -/// title => "The Diary of a Young Girl", -/// )); -/// index_writer.commit().unwrap(); -/// } -/// -/// index.load_searchers()?; -/// let searcher = index.searcher(); -/// -/// { -/// let mut top_collector = TopCollector::with_limit(2); -/// let query_parser = QueryParser::for_index(&index, vec![title]); -/// let query = query_parser.parse_query("diary")?; -/// searcher.search(&*query, &mut top_collector).unwrap(); -/// -/// let score_docs: Vec<(Score, DocId)> = top_collector -/// .score_docs() -/// .into_iter() -/// .map(|(score, doc_address)| (score, doc_address.doc())) -/// .collect(); -/// -/// assert_eq!(score_docs, vec![(0.7261542, 1), (0.6099695, 3)]); -/// } -/// -/// Ok(()) -/// } -/// ``` -pub struct TopCollector { +pub struct TopCollector { limit: usize, - heap: BinaryHeap, + heap: BinaryHeap>, segment_id: u32, } -impl TopCollector { +impl TopCollector { /// Creates a top collector, with a number of documents equal to "limit". /// /// # Panics /// The method panics if limit is 0 - pub fn with_limit(limit: usize) -> TopCollector { + pub fn with_limit(limit: usize) -> TopCollector { if limit < 1 { panic!("Limit must be strictly greater than 0."); } @@ -125,23 +71,27 @@ impl TopCollector { /// Calling this method triggers the sort. /// The result of the sort is not cached. 
pub fn docs(&self) -> Vec { - self.score_docs() + self.top_docs() .into_iter() - .map(|score_doc| score_doc.1) + .map(|(_feature, doc)| doc) .collect() } - /// Returns K best ScoredDocument sorted in decreasing order. + /// Returns K best FeatureDocuments sorted in decreasing order. /// /// Calling this method triggers the sort. /// The result of the sort is not cached. - pub fn score_docs(&self) -> Vec<(Score, DocAddress)> { - let mut scored_docs: Vec = self.heap.iter().cloned().collect(); - scored_docs.sort(); - scored_docs + pub fn top_docs(&self) -> Vec<(T, DocAddress)> { + let mut feature_docs: Vec> = self.heap.iter().cloned().collect(); + feature_docs.sort(); + feature_docs .into_iter() - .map(|GlobalScoredDoc { score, doc_address }| (score, doc_address)) - .collect() + .map( + |ComparableDoc { + feature, + doc_address, + }| (feature, doc_address), + ).collect() } /// Return true iff at least K documents have gone through @@ -150,46 +100,45 @@ impl TopCollector { pub fn at_capacity(&self) -> bool { self.heap.len() >= self.limit } -} -impl Collector for TopCollector { - fn set_segment(&mut self, segment_id: SegmentLocalId, _: &SegmentReader) -> Result<()> { + /// Sets the segment local ID for the collector + pub fn set_segment_id(&mut self, segment_id: SegmentLocalId) { self.segment_id = segment_id; - Ok(()) } - fn collect(&mut self, doc: DocId, score: Score) { + /// Collects a document scored by the given feature + /// + /// It collects documents until it has reached the max capacity. Once it reaches capacity, it + /// will compare the lowest scoring item with the given one and keep whichever is greater. + pub fn collect(&mut self, doc: DocId, feature: T) { if self.at_capacity() { // It's ok to unwrap as long as a limit of 0 is forbidden. - let limit_doc: GlobalScoredDoc = *self.heap + let limit_doc: ComparableDoc = self + .heap .peek() - .expect("Top collector with size 0 is forbidden"); - if limit_doc.score < score { - let mut mut_head = self.heap + .expect("Top collector with size 0 is forbidden") + .clone(); + if limit_doc.feature < feature { + let mut mut_head = self + .heap .peek_mut() .expect("Top collector with size 0 is forbidden"); - mut_head.score = score; + mut_head.feature = feature; mut_head.doc_address = DocAddress(self.segment_id, doc); } } else { - let wrapped_doc = GlobalScoredDoc { - score, + let wrapped_doc = ComparableDoc { + feature, doc_address: DocAddress(self.segment_id, doc), }; self.heap.push(wrapped_doc); } } - - fn requires_scoring(&self) -> bool { - true - } } #[cfg(test)] mod tests { - use super::*; - use collector::Collector; use DocId; use Score; @@ -201,7 +150,7 @@ mod tests { top_collector.collect(5, 0.3); assert!(!top_collector.at_capacity()); let score_docs: Vec<(Score, DocId)> = top_collector - .score_docs() + .top_docs() .into_iter() .map(|(score, doc_address)| (score, doc_address.doc())) .collect(); @@ -219,7 +168,7 @@ mod tests { assert!(top_collector.at_capacity()); { let score_docs: Vec<(Score, DocId)> = top_collector - .score_docs() + .top_docs() .into_iter() .map(|(score, doc_address)| (score, doc_address.doc())) .collect(); @@ -238,7 +187,7 @@ mod tests { #[test] #[should_panic] fn test_top_0() { - TopCollector::with_limit(0); + let _collector: TopCollector = TopCollector::with_limit(0); } } diff --git a/src/collector/top_field_collector.rs b/src/collector/top_field_collector.rs new file mode 100644 index 000000000..3fb95d21a --- /dev/null +++ b/src/collector/top_field_collector.rs @@ -0,0 +1,263 @@ +use super::Collector; +use 
collector::top_collector::TopCollector; +use fastfield::FastFieldReader; +use fastfield::FastValue; +use schema::Field; +use DocAddress; +use DocId; +use Result; +use Score; +use SegmentReader; + +/// The Top Field Collector keeps track of the K documents +/// sorted by a fast field in the index +/// +/// The implementation is based on a `BinaryHeap`. +/// The theorical complexity for collecting the top `K` out of `n` documents +/// is `O(n log K)`. +/// +/// ```rust +/// #[macro_use] +/// extern crate tantivy; +/// use tantivy::schema::{SchemaBuilder, TEXT, FAST}; +/// use tantivy::{Index, Result, DocId}; +/// use tantivy::collector::TopFieldCollector; +/// use tantivy::query::QueryParser; +/// +/// # fn main() { example().unwrap(); } +/// fn example() -> Result<()> { +/// let mut schema_builder = SchemaBuilder::new(); +/// let title = schema_builder.add_text_field("title", TEXT); +/// let rating = schema_builder.add_u64_field("rating", FAST); +/// let schema = schema_builder.build(); +/// let index = Index::create_in_ram(schema); +/// { +/// let mut index_writer = index.writer_with_num_threads(1, 3_000_000)?; +/// index_writer.add_document(doc!( +/// title => "The Name of the Wind", +/// rating => 92u64, +/// )); +/// index_writer.add_document(doc!( +/// title => "The Diary of Muadib", +/// rating => 97u64, +/// )); +/// index_writer.add_document(doc!( +/// title => "A Dairy Cow", +/// rating => 63u64, +/// )); +/// index_writer.add_document(doc!( +/// title => "The Diary of a Young Girl", +/// rating => 80u64, +/// )); +/// index_writer.commit().unwrap(); +/// } +/// +/// index.load_searchers()?; +/// let searcher = index.searcher(); +/// +/// { +/// let mut top_collector = TopFieldCollector::with_limit(rating, 2); +/// let query_parser = QueryParser::for_index(&index, vec![title]); +/// let query = query_parser.parse_query("diary")?; +/// searcher.search(&*query, &mut top_collector).unwrap(); +/// +/// let score_docs: Vec<(u64, DocId)> = top_collector +/// .top_docs() +/// .into_iter() +/// .map(|(field, doc_address)| (field, doc_address.doc())) +/// .collect(); +/// +/// assert_eq!(score_docs, vec![(97u64, 1), (80, 3)]); +/// } +/// +/// Ok(()) +/// } +/// ``` +pub struct TopFieldCollector { + field: Field, + collector: TopCollector, + fast_field: Option>, +} + +impl TopFieldCollector { + /// Creates a top field collector, with a number of documents equal to "limit". + /// + /// The given field name must be a fast field, otherwise the collector have an error while + /// collecting results. + /// + /// # Panics + /// The method panics if limit is 0 + pub fn with_limit(field: Field, limit: usize) -> Self { + TopFieldCollector { + field, + collector: TopCollector::with_limit(limit), + fast_field: None, + } + } + + /// Returns K best documents sorted the given field name in decreasing order. + /// + /// Calling this method triggers the sort. + /// The result of the sort is not cached. + pub fn docs(&self) -> Vec { + self.collector.docs() + } + + /// Returns K best FieldDocuments sorted in decreasing order. + /// + /// Calling this method triggers the sort. + /// The result of the sort is not cached. + pub fn top_docs(&self) -> Vec<(T, DocAddress)> { + self.collector.top_docs() + } + + /// Return true iff at least K documents have gone through + /// the collector. 
+ #[inline] + pub fn at_capacity(&self) -> bool { + self.collector.at_capacity() + } +} + +impl Collector for TopFieldCollector { + fn set_segment(&mut self, segment_id: u32, segment: &SegmentReader) -> Result<()> { + self.collector.set_segment_id(segment_id); + self.fast_field = Some(segment.fast_field_reader(self.field)?); + Ok(()) + } + + fn collect(&mut self, doc: DocId, _score: Score) { + let field_value = self + .fast_field + .as_ref() + .expect("collect() was called before set_segment. This should never happen.") + .get(doc); + self.collector.collect(doc, field_value); + } + + fn requires_scoring(&self) -> bool { + false + } +} + +#[cfg(test)] +mod tests { + use super::*; + use query::Query; + use query::QueryParser; + use schema::Field; + use schema::IntOptions; + use schema::Schema; + use schema::{SchemaBuilder, FAST, TEXT}; + use Index; + use IndexWriter; + use TantivyError; + + const TITLE: &str = "title"; + const SIZE: &str = "size"; + + #[test] + fn test_top_collector_not_at_capacity() { + let mut schema_builder = SchemaBuilder::new(); + let title = schema_builder.add_text_field(TITLE, TEXT); + let size = schema_builder.add_u64_field(SIZE, FAST); + let schema = schema_builder.build(); + let (index, query) = index("beer", title, schema, |index_writer| { + index_writer.add_document(doc!( + title => "bottle of beer", + size => 12u64, + )); + index_writer.add_document(doc!( + title => "growler of beer", + size => 64u64, + )); + index_writer.add_document(doc!( + title => "pint of beer", + size => 16u64, + )); + }); + let searcher = index.searcher(); + + let mut top_collector = TopFieldCollector::with_limit(size, 4); + searcher.search(&*query, &mut top_collector).unwrap(); + assert!(!top_collector.at_capacity()); + + let score_docs: Vec<(u64, DocId)> = top_collector + .top_docs() + .into_iter() + .map(|(field, doc_address)| (field, doc_address.doc())) + .collect(); + assert_eq!(score_docs, vec![(64, 1), (16, 2), (12, 0)]); + } + + #[test] + #[should_panic] + fn test_field_does_not_exist() { + let mut schema_builder = SchemaBuilder::new(); + let title = schema_builder.add_text_field(TITLE, TEXT); + let size = schema_builder.add_u64_field(SIZE, FAST); + let schema = schema_builder.build(); + let (index, _) = index("beer", title, schema, |index_writer| { + index_writer.add_document(doc!( + title => "bottle of beer", + size => 12u64, + )); + }); + let searcher = index.searcher(); + let segment = searcher.segment_reader(0); + let mut top_collector: TopFieldCollector = TopFieldCollector::with_limit(Field(2), 4); + let _ = top_collector.set_segment(0, segment); + } + + #[test] + fn test_field_not_fast_field() { + let mut schema_builder = SchemaBuilder::new(); + let title = schema_builder.add_text_field(TITLE, TEXT); + let size = schema_builder.add_u64_field(SIZE, IntOptions::default()); + let schema = schema_builder.build(); + let (index, _) = index("beer", title, schema, |index_writer| { + index_writer.add_document(doc!( + title => "bottle of beer", + size => 12u64, + )); + }); + let searcher = index.searcher(); + let segment = searcher.segment_reader(0); + let mut top_collector: TopFieldCollector = TopFieldCollector::with_limit(size, 4); + assert_matches!( + top_collector.set_segment(0, segment), + Err(TantivyError::FastFieldError(_)) + ); + } + + #[test] + #[should_panic] + fn test_collect_before_set_segment() { + let mut top_collector: TopFieldCollector = TopFieldCollector::with_limit(Field(0), 4); + top_collector.collect(0, 0f32); + } + + #[test] + #[should_panic] + fn test_top_0() { 
+ let _: TopFieldCollector = TopFieldCollector::with_limit(Field(0), 0); + } + + fn index( + query: &str, + query_field: Field, + schema: Schema, + mut doc_adder: impl FnMut(&mut IndexWriter) -> (), + ) -> (Index, Box) { + let index = Index::create_in_ram(schema); + + let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap(); + doc_adder(&mut index_writer); + index_writer.commit().unwrap(); + index.load_searchers().unwrap(); + + let query_parser = QueryParser::for_index(&index, vec![query_field]); + let query = query_parser.parse_query(query).unwrap(); + (index, query) + } +} diff --git a/src/collector/top_score_collector.rs b/src/collector/top_score_collector.rs new file mode 100644 index 000000000..68bf114f6 --- /dev/null +++ b/src/collector/top_score_collector.rs @@ -0,0 +1,187 @@ +use super::Collector; +use collector::top_collector::TopCollector; +use DocAddress; +use DocId; +use Result; +use Score; +use SegmentLocalId; +use SegmentReader; + +/// The Top Score Collector keeps track of the K documents +/// sorted by their score. +/// +/// The implementation is based on a `BinaryHeap`. +/// The theorical complexity for collecting the top `K` out of `n` documents +/// is `O(n log K)`. +/// +/// ```rust +/// #[macro_use] +/// extern crate tantivy; +/// use tantivy::schema::{SchemaBuilder, TEXT}; +/// use tantivy::{Index, Result, DocId, Score}; +/// use tantivy::collector::TopScoreCollector; +/// use tantivy::query::QueryParser; +/// +/// # fn main() { example().unwrap(); } +/// fn example() -> Result<()> { +/// let mut schema_builder = SchemaBuilder::new(); +/// let title = schema_builder.add_text_field("title", TEXT); +/// let schema = schema_builder.build(); +/// let index = Index::create_in_ram(schema); +/// { +/// let mut index_writer = index.writer_with_num_threads(1, 3_000_000)?; +/// index_writer.add_document(doc!( +/// title => "The Name of the Wind", +/// )); +/// index_writer.add_document(doc!( +/// title => "The Diary of Muadib", +/// )); +/// index_writer.add_document(doc!( +/// title => "A Dairy Cow", +/// )); +/// index_writer.add_document(doc!( +/// title => "The Diary of a Young Girl", +/// )); +/// index_writer.commit().unwrap(); +/// } +/// +/// index.load_searchers()?; +/// let searcher = index.searcher(); +/// +/// { +/// let mut top_collector = TopScoreCollector::with_limit(2); +/// let query_parser = QueryParser::for_index(&index, vec![title]); +/// let query = query_parser.parse_query("diary")?; +/// searcher.search(&*query, &mut top_collector).unwrap(); +/// +/// let score_docs: Vec<(Score, DocId)> = top_collector +/// .top_docs() +/// .into_iter() +/// .map(|(score, doc_address)| (score, doc_address.doc())) +/// .collect(); +/// +/// assert_eq!(score_docs, vec![(0.7261542, 1), (0.6099695, 3)]); +/// } +/// +/// Ok(()) +/// } +/// ``` +pub struct TopScoreCollector { + collector: TopCollector, +} + +impl TopScoreCollector { + /// Creates a top score collector, with a number of documents equal to "limit". + /// + /// # Panics + /// The method panics if limit is 0 + pub fn with_limit(limit: usize) -> TopScoreCollector { + TopScoreCollector { + collector: TopCollector::with_limit(limit), + } + } + + /// Returns K best scored documents sorted in decreasing order. + /// + /// Calling this method triggers the sort. + /// The result of the sort is not cached. + pub fn docs(&self) -> Vec { + self.collector.docs() + } + + /// Returns K best ScoredDocuments sorted in decreasing order. + /// + /// Calling this method triggers the sort. 
+ /// The result of the sort is not cached. + pub fn top_docs(&self) -> Vec<(Score, DocAddress)> { + self.collector.top_docs() + } + + /// Returns K best ScoredDocuments sorted in decreasing order. + /// + /// Calling this method triggers the sort. + /// The result of the sort is not cached. + #[deprecated] + pub fn score_docs(&self) -> Vec<(Score, DocAddress)> { + self.collector.top_docs() + } + + /// Return true iff at least K documents have gone through + /// the collector. + #[inline] + pub fn at_capacity(&self) -> bool { + self.collector.at_capacity() + } +} + +impl Collector for TopScoreCollector { + fn set_segment(&mut self, segment_id: SegmentLocalId, _: &SegmentReader) -> Result<()> { + self.collector.set_segment_id(segment_id); + Ok(()) + } + + fn collect(&mut self, doc: DocId, score: Score) { + self.collector.collect(doc, score); + } + + fn requires_scoring(&self) -> bool { + true + } +} + +#[cfg(test)] +mod tests { + use super::*; + use collector::Collector; + use DocId; + use Score; + + #[test] + fn test_top_collector_not_at_capacity() { + let mut top_collector = TopScoreCollector::with_limit(4); + top_collector.collect(1, 0.8); + top_collector.collect(3, 0.2); + top_collector.collect(5, 0.3); + assert!(!top_collector.at_capacity()); + let score_docs: Vec<(Score, DocId)> = top_collector + .top_docs() + .into_iter() + .map(|(score, doc_address)| (score, doc_address.doc())) + .collect(); + assert_eq!(score_docs, vec![(0.8, 1), (0.3, 5), (0.2, 3)]); + } + + #[test] + fn test_top_collector_at_capacity() { + let mut top_collector = TopScoreCollector::with_limit(4); + top_collector.collect(1, 0.8); + top_collector.collect(3, 0.2); + top_collector.collect(5, 0.3); + top_collector.collect(7, 0.9); + top_collector.collect(9, -0.2); + assert!(top_collector.at_capacity()); + { + let score_docs: Vec<(Score, DocId)> = top_collector + .top_docs() + .into_iter() + .map(|(score, doc_address)| (score, doc_address.doc())) + .collect(); + assert_eq!(score_docs, vec![(0.9, 7), (0.8, 1), (0.3, 5), (0.2, 3)]); + } + { + let docs: Vec = top_collector + .docs() + .into_iter() + .map(|doc_address| doc_address.doc()) + .collect(); + assert_eq!(docs, vec![7, 1, 5, 3]); + } + } + + #[test] + #[should_panic] + fn test_top_0() { + TopScoreCollector::with_limit(0); + } + +} diff --git a/src/common/bitpacker.rs b/src/common/bitpacker.rs index 4945796b0..593e36fb8 100644 --- a/src/common/bitpacker.rs +++ b/src/common/bitpacker.rs @@ -102,6 +102,7 @@ where addr + 8 <= data.len(), "The fast field field should have been padded with 7 bytes." ); + #[cfg_attr(feature = "cargo-clippy", allow(clippy::cast_ptr_alignment))] let val_unshifted_unmasked: u64 = u64::from_le(unsafe { ptr::read_unaligned(data[addr..].as_ptr() as *const u64) }); let val_shifted = (val_unshifted_unmasked >> bit_shift) as u64; @@ -125,6 +126,7 @@ where for output_val in output.iter_mut() { let addr = addr_in_bits >> 3; let bit_shift = addr_in_bits & 7; + #[cfg_attr(feature = "cargo-clippy", allow(clippy::cast_ptr_alignment))] let val_unshifted_unmasked: u64 = unsafe { ptr::read_unaligned(data[addr..].as_ptr() as *const u64) }; let val_shifted = (val_unshifted_unmasked >> bit_shift) as u64; diff --git a/src/common/bitset.rs b/src/common/bitset.rs index 73f03c4f5..a125f4cbc 100644 --- a/src/common/bitset.rs +++ b/src/common/bitset.rs @@ -34,17 +34,17 @@ impl TinySet { } /// Returns the complement of the set in `[0, 64[`. 
- fn complement(&self) -> TinySet { + fn complement(self) -> TinySet { TinySet(!self.0) } /// Returns true iff the `TinySet` contains the element `el`. - pub fn contains(&self, el: u32) -> bool { + pub fn contains(self, el: u32) -> bool { !self.intersect(TinySet::singleton(el)).is_empty() } /// Returns the intersection of `self` and `other` - pub fn intersect(&self, other: TinySet) -> TinySet { + pub fn intersect(self, other: TinySet) -> TinySet { TinySet(self.0 & other.0) } @@ -77,7 +77,7 @@ impl TinySet { /// Returns true iff the `TinySet` is empty. #[inline(always)] - pub fn is_empty(&self) -> bool { + pub fn is_empty(self) -> bool { self.0 == 0u64 } @@ -114,7 +114,7 @@ impl TinySet { self.0 = 0u64; } - pub fn len(&self) -> u32 { + pub fn len(self) -> u32 { self.0.count_ones() } } @@ -266,14 +266,14 @@ mod tests { #[test] fn test_bitset_large() { - let arr = generate_nonunique_unsorted(1_000_000, 50_000); + let arr = generate_nonunique_unsorted(100_000, 5_000); let mut btreeset: BTreeSet = BTreeSet::new(); - let mut bitset = BitSet::with_max_value(1_000_000); + let mut bitset = BitSet::with_max_value(100_000); for el in arr { btreeset.insert(el); bitset.insert(el); } - for i in 0..1_000_000 { + for i in 0..100_000 { assert_eq!(btreeset.contains(&i), bitset.contains(i)); } assert_eq!(btreeset.len(), bitset.len()); diff --git a/src/common/composite_file.rs b/src/common/composite_file.rs index 2f3f71a47..0cdfdff87 100644 --- a/src/common/composite_file.rs +++ b/src/common/composite_file.rs @@ -4,6 +4,8 @@ use common::VInt; use directory::ReadOnlySource; use directory::WritePtr; use schema::Field; +use space_usage::PerFieldSpaceUsage; +use space_usage::FieldUsage; use std::collections::HashMap; use std::io::Write; use std::io::{self, Read}; @@ -72,7 +74,8 @@ impl CompositeWrite { let footer_offset = self.write.written_bytes(); VInt(self.offsets.len() as u64).serialize(&mut self.write)?; - let mut offset_fields: Vec<_> = self.offsets + let mut offset_fields: Vec<_> = self + .offsets .iter() .map(|(file_addr, offset)| (*offset, *file_addr)) .collect(); @@ -165,6 +168,16 @@ impl CompositeFile { .get(&FileAddr { field, idx }) .map(|&(from, to)| self.data.slice(from, to)) } + + pub fn space_usage(&self) -> PerFieldSpaceUsage { + let mut fields = HashMap::new(); + for (&field_addr, &(start, end)) in self.offsets_index.iter() { + fields.entry(field_addr.field) + .or_insert_with(|| FieldUsage::empty(field_addr.field)) + .add_field_idx(field_addr.idx, end - start); + } + PerFieldSpaceUsage::new(fields) + } } #[cfg(test)] diff --git a/src/common/vint.rs b/src/common/vint.rs index 308aff1ca..7b782a946 100644 --- a/src/common/vint.rs +++ b/src/common/vint.rs @@ -10,8 +10,6 @@ pub struct VInt(pub u64); const STOP_BIT: u8 = 128; impl VInt { - - pub fn val(&self) -> u64 { self.0 } @@ -20,14 +18,13 @@ impl VInt { VInt::deserialize(reader).map(|vint| vint.0) } - pub fn serialize_into_vec(&self, output: &mut Vec){ + pub fn serialize_into_vec(&self, output: &mut Vec) { let mut buffer = [0u8; 10]; let num_bytes = self.serialize_into(&mut buffer); output.extend(&buffer[0..num_bytes]); } fn serialize_into(&self, buffer: &mut [u8; 10]) -> usize { - let mut remaining = self.0; for (i, b) in buffer.iter_mut().enumerate() { let next_byte: u8 = (remaining % 128u64) as u8; @@ -74,7 +71,6 @@ impl BinarySerializable for VInt { } } - #[cfg(test)] mod tests { @@ -89,10 +85,10 @@ mod tests { } assert!(num_bytes > 0); if num_bytes < 10 { - assert!(1u64 << (7*num_bytes) > val); + assert!(1u64 << (7 * num_bytes) > val); } if 
num_bytes > 1 { - assert!(1u64 << (7*(num_bytes-1)) <= val); + assert!(1u64 << (7 * (num_bytes - 1)) <= val); } let serdeser_val = VInt::deserialize(&mut &v[..]).unwrap(); assert_eq!(val, serdeser_val.0); @@ -105,11 +101,11 @@ mod tests { aux_test_vint(5); aux_test_vint(u64::max_value()); for i in 1..9 { - let power_of_128 = 1u64 << (7*i); + let power_of_128 = 1u64 << (7 * i); aux_test_vint(power_of_128 - 1u64); - aux_test_vint(power_of_128 ); + aux_test_vint(power_of_128); aux_test_vint(power_of_128 + 1u64); } aux_test_vint(10); } -} \ No newline at end of file +} diff --git a/src/core/index.rs b/src/core/index.rs index c88292ba1..3eafb90cc 100644 --- a/src/core/index.rs +++ b/src/core/index.rs @@ -1,39 +1,42 @@ -use core::SegmentId; -use error::{ErrorKind, ResultExt}; -use schema::Schema; -use serde_json; -use std::borrow::BorrowMut; -use std::fmt; -use std::sync::atomic::{AtomicUsize, Ordering}; -use std::sync::Arc; -use Result; - use super::pool::LeasedItem; use super::pool::Pool; use super::segment::create_segment; use super::segment::Segment; use core::searcher::Searcher; use core::IndexMeta; +use core::SegmentId; use core::SegmentMeta; use core::SegmentReader; use core::META_FILEPATH; -use directory::{ManagedDirectory, DirectoryClone}; +use directory::ManagedDirectory; #[cfg(feature = "mmap")] use directory::MmapDirectory; use directory::{Directory, RAMDirectory}; +use error::TantivyError; use indexer::index_writer::open_index_writer; use indexer::index_writer::HEAP_SIZE_MIN; use indexer::segment_updater::save_new_metas; -use indexer::DirectoryLock; +use indexer::LockType; use num_cpus; +use schema::Field; +use schema::FieldType; +use schema::Schema; +use serde_json; +use std::borrow::BorrowMut; +use std::fmt; use std::path::Path; +use std::sync::atomic::{AtomicUsize, Ordering}; +use std::sync::Arc; +use tokenizer::BoxedTokenizer; use tokenizer::TokenizerManager; use IndexWriter; +use Result; fn load_metas(directory: &Directory) -> Result { let meta_data = directory.atomic_read(&META_FILEPATH)?; let meta_string = String::from_utf8_lossy(&meta_data); - serde_json::from_str(&meta_string).chain_err(|| ErrorKind::CorruptedFile(META_FILEPATH.clone())) + serde_json::from_str(&meta_string) + .map_err(|_| TantivyError::CorruptedFile(META_FILEPATH.clone())) } /// Search Index @@ -112,6 +115,27 @@ impl Index { &self.tokenizers } + /// Helper to access the tokenizer associated to a specific field. + pub fn tokenizer_for_field(&self, field: Field) -> Result> { + let field_entry = self.schema.get_field_entry(field); + let field_type = field_entry.field_type(); + let tokenizer_manager: &TokenizerManager = self.tokenizers(); + let tokenizer_name_opt: Option> = match field_type { + FieldType::Str(text_options) => text_options + .get_indexing_options() + .map(|text_indexing_options| text_indexing_options.tokenizer().to_string()) + .and_then(|tokenizer_name| tokenizer_manager.get(&tokenizer_name)), + _ => None, + }; + match tokenizer_name_opt { + Some(tokenizer) => Ok(tokenizer), + None => Err(TantivyError::SchemaError(format!( + "{:?} is not a text field.", + field_entry.name() + ))), + } + } + /// Opens a new directory from an index path. 
#[cfg(feature = "mmap")] pub fn open_in_dir>(directory_path: P) -> Result { @@ -155,7 +179,7 @@ impl Index { num_threads: usize, overall_heap_size_in_bytes: usize, ) -> Result { - let directory_lock = DirectoryLock::lock(self.directory().box_clone())?; + let directory_lock = LockType::IndexWriterLock.acquire_lock(&self.directory)?; let heap_size_in_bytes_per_thread = overall_heap_size_in_bytes / num_threads; open_index_writer( self, @@ -193,7 +217,8 @@ impl Index { /// Returns the list of segments that are searchable pub fn searchable_segments(&self) -> Result> { - Ok(self.searchable_segment_metas()? + Ok(self + .searchable_segment_metas()? .into_iter() .map(|segment_meta| self.segment(segment_meta)) .collect()) @@ -228,7 +253,8 @@ impl Index { /// Returns the list of segment ids that are searchable. pub fn searchable_segment_ids(&self) -> Result> { - Ok(self.searchable_segment_metas()? + Ok(self + .searchable_segment_metas()? .iter() .map(|segment_meta| segment_meta.id()) .collect()) @@ -241,13 +267,18 @@ impl Index { self.num_searchers.store(num_searchers, Ordering::Release); } - /// Creates a new generation of searchers after - - /// a change of the set of searchable indexes. + /// Update searchers so that they reflect the state of the last + /// `.commit()`. /// - /// This needs to be called when a new segment has been - /// published or after a merge. + /// If indexing happens in the same process as searching, + /// you most likely want to call `.load_searchers()` right after each + /// successful call to `.commit()`. + /// + /// If indexing and searching happen in different processes, the way to + /// get the freshest `index` at all time, is to watch `meta.json` and + /// call `load_searchers` whenever a changes happen. pub fn load_searchers(&self) -> Result<()> { + let _meta_lock = LockType::MetaLock.acquire_lock(self.directory())?; let searchable_segments = self.searchable_segments()?; let segment_readers: Vec = searchable_segments .iter() @@ -256,7 +287,7 @@ impl Index { let schema = self.schema(); let num_searchers: usize = self.num_searchers.load(Ordering::Acquire); let searchers = (0..num_searchers) - .map(|_| Searcher::new(schema.clone(), segment_readers.clone())) + .map(|_| Searcher::new(schema.clone(), self.clone(), segment_readers.clone())) .collect(); self.searcher_pool.publish_new_generation(searchers); Ok(()) @@ -294,3 +325,24 @@ impl Clone for Index { } } } + +#[cfg(test)] +mod tests { + use schema::{SchemaBuilder, INT_INDEXED, TEXT}; + use Index; + + #[test] + fn test_indexer_for_field() { + let mut schema_builder = SchemaBuilder::default(); + let num_likes_field = schema_builder.add_u64_field("num_likes", INT_INDEXED); + let body_field = schema_builder.add_text_field("body", TEXT); + let schema = schema_builder.build(); + let index = Index::create_in_ram(schema); + assert!(index.tokenizer_for_field(body_field).is_ok()); + assert_eq!( + format!("{:?}", index.tokenizer_for_field(num_likes_field).err()), + "Some(SchemaError(\"\\\"num_likes\\\" is not a text field.\"))" + ); + } + +} diff --git a/src/core/index_meta.rs b/src/core/index_meta.rs index b5ed52427..ecef75d02 100644 --- a/src/core/index_meta.rs +++ b/src/core/index_meta.rs @@ -58,7 +58,7 @@ mod tests { }; let index_metas = IndexMeta { segments: Vec::new(), - schema: schema, + schema, opstamp: 0u64, payload: None, }; diff --git a/src/core/inverted_index_reader.rs b/src/core/inverted_index_reader.rs index 57716748a..ba9d77c70 100644 --- a/src/core/inverted_index_reader.rs +++ b/src/core/inverted_index_reader.rs 
@@ -1,13 +1,13 @@ use common::BinarySerializable;
 use directory::ReadOnlySource;
+use owned_read::OwnedRead;
+use positions::PositionReader;
 use postings::TermInfo;
 use postings::{BlockSegmentPostings, SegmentPostings};
 use schema::FieldType;
 use schema::IndexRecordOption;
 use schema::Term;
 use termdict::TermDictionary;
-use owned_read::OwnedRead;
-use positions::PositionReader;
 /// The inverted index reader is in charge of accessing
 /// the inverted index associated to a specific field.
@@ -32,6 +32,10 @@ pub struct InvertedIndexReader {
 }
 impl InvertedIndexReader {
+    #[cfg_attr(
+        feature = "cargo-clippy",
+        allow(clippy::needless_pass_by_value)
+    )] // for symmetry
+    pub(crate) fn new(
         termdict: TermDictionary,
         postings_source: ReadOnlySource,
@@ -54,12 +58,12 @@ impl InvertedIndexReader {
     /// Creates an empty `InvertedIndexReader` object, which
     /// contains no terms at all.
-    pub fn empty(field_type: FieldType) -> InvertedIndexReader {
+    pub fn empty(field_type: &FieldType) -> InvertedIndexReader {
         let record_option = field_type
             .get_index_record_option()
             .unwrap_or(IndexRecordOption::Basic);
         InvertedIndexReader {
-            termdict: TermDictionary::empty(field_type),
+            termdict: TermDictionary::empty(&field_type),
             postings_source: ReadOnlySource::empty(),
             positions_source: ReadOnlySource::empty(),
             positions_idx_source: ReadOnlySource::empty(),
@@ -100,6 +104,19 @@ impl InvertedIndexReader {
         block_postings.reset(term_info.doc_freq, postings_reader);
     }
+    /// Returns a block postings given a `Term`.
+    /// This method is for an advanced usage only.
+    ///
+    /// Most users should prefer using `read_postings` instead.
+    pub fn read_block_postings(
+        &self,
+        term: &Term,
+        option: IndexRecordOption,
+    ) -> Option<BlockSegmentPostings> {
+        self.get_term_info(term)
+            .map(move |term_info| self.read_block_postings_from_terminfo(&term_info, option))
+    }
+
     /// Returns a block postings given a `term_info`.
     /// This method is for an advanced usage only.
     ///
@@ -133,7 +150,8 @@ impl InvertedIndexReader {
         if option.has_positions() {
             let position_reader = self.positions_source.clone();
             let skip_reader = self.positions_idx_source.clone();
-            let position_reader = PositionReader::new(position_reader, skip_reader, term_info.positions_idx);
+            let position_reader =
+                PositionReader::new(position_reader, skip_reader, term_info.positions_idx);
             Some(position_reader)
         } else {
             None
@@ -159,8 +177,8 @@ impl InvertedIndexReader {
     /// `TextIndexingOptions` that does not index position will return a `SegmentPostings`
     /// with `DocId`s and frequencies.
     pub fn read_postings(&self, term: &Term, option: IndexRecordOption) -> Option<SegmentPostings> {
-        let term_info = get!(self.get_term_info(term));
-        Some(self.read_postings_from_terminfo(&term_info, option))
+        self.get_term_info(term)
+            .map(move |term_info| self.read_postings_from_terminfo(&term_info, option))
+    }
     pub(crate) fn read_postings_no_deletes(
@@ -168,8 +186,8 @@ impl InvertedIndexReader {
         term: &Term,
         option: IndexRecordOption,
     ) -> Option<SegmentPostings> {
-        let term_info = get!(self.get_term_info(term));
-        Some(self.read_postings_from_terminfo(&term_info, option))
+        self.get_term_info(term)
+            .map(|term_info| self.read_postings_from_terminfo(&term_info, option))
+    }
     /// Returns the number of documents containing the term.
diff --git a/src/core/mod.rs b/src/core/mod.rs
index 6d43685f8..062b537ee 100644
--- a/src/core/mod.rs
+++ b/src/core/mod.rs
@@ -33,10 +33,4 @@ lazy_static!
{ /// Removing this file is safe, but will prevent the garbage collection of all of the file that /// are currently in the directory pub static ref MANAGED_FILEPATH: PathBuf = PathBuf::from(".managed.json"); - - /// Only one process should be able to write tantivy's index at a time. - /// This file, when present, is in charge of preventing other processes to open an IndexWriter. - /// - /// If the process is killed and this file remains, it is safe to remove it manually. - pub static ref LOCKFILE_FILEPATH: PathBuf = PathBuf::from(".tantivy-indexer.lock"); } diff --git a/src/core/pool.rs b/src/core/pool.rs index 609848317..d8564e46d 100644 --- a/src/core/pool.rs +++ b/src/core/pool.rs @@ -87,7 +87,8 @@ impl Deref for LeasedItem { type Target = T; fn deref(&self) -> &T { - &self.gen_item + &self + .gen_item .as_ref() .expect("Unwrapping a leased item should never fail") .item // unwrap is safe here @@ -96,7 +97,8 @@ impl Deref for LeasedItem { impl DerefMut for LeasedItem { fn deref_mut(&mut self) -> &mut T { - &mut self.gen_item + &mut self + .gen_item .as_mut() .expect("Unwrapping a mut leased item should never fail") .item // unwrap is safe here diff --git a/src/core/searcher.rs b/src/core/searcher.rs index 8f36b58ea..826bf4501 100644 --- a/src/core/searcher.rs +++ b/src/core/searcher.rs @@ -5,10 +5,12 @@ use query::Query; use schema::Document; use schema::Schema; use schema::{Field, Term}; +use space_usage::SearcherSpaceUsage; use std::fmt; use std::sync::Arc; use termdict::TermMerger; use DocAddress; +use Index; use Result; /// Holds a list of `SegmentReader`s ready for search. @@ -18,23 +20,35 @@ use Result; /// pub struct Searcher { schema: Schema, + index: Index, segment_readers: Vec, } impl Searcher { /// Creates a new `Searcher` - pub(crate) fn new(schema: Schema, segment_readers: Vec) -> Searcher { + pub(crate) fn new( + schema: Schema, + index: Index, + segment_readers: Vec, + ) -> Searcher { Searcher { schema, + index, segment_readers, } } + + /// Returns the `Index` associated to the `Searcher` + pub fn index(&self) -> &Index { + &self.index + } + /// Fetches a document from tantivy's store given a `DocAddress`. /// /// The searcher uses the segment ordinal to route the /// the request to the right `Segment`. - pub fn doc(&self, doc_address: &DocAddress) -> Result { - let DocAddress(segment_local_id, doc_id) = *doc_address; + pub fn doc(&self, doc_address: DocAddress) -> Result { + let DocAddress(segment_local_id, doc_id) = doc_address; let segment_reader = &self.segment_readers[segment_local_id as usize]; segment_reader.doc(doc_id) } @@ -48,7 +62,7 @@ impl Searcher { pub fn num_docs(&self) -> u64 { self.segment_readers .iter() - .map(|segment_reader| segment_reader.num_docs() as u64) + .map(|segment_reader| u64::from(segment_reader.num_docs())) .sum::() } @@ -57,8 +71,9 @@ impl Searcher { pub fn doc_freq(&self, term: &Term) -> u64 { self.segment_readers .iter() - .map(|segment_reader| segment_reader.inverted_index(term.field()).doc_freq(term) as u64) - .sum::() + .map(|segment_reader| { + u64::from(segment_reader.inverted_index(term.field()).doc_freq(term)) + }).sum::() } /// Return the list of segment readers @@ -78,12 +93,22 @@ impl Searcher { /// Return the field searcher associated to a `Field`. 
pub fn field(&self, field: Field) -> FieldSearcher { - let inv_index_readers = self.segment_readers + let inv_index_readers = self + .segment_readers .iter() .map(|segment_reader| segment_reader.inverted_index(field)) .collect::>(); FieldSearcher::new(inv_index_readers) } + + /// Summarize total space usage of this searcher. + pub fn space_usage(&self) -> SearcherSpaceUsage { + let mut space_usage = SearcherSpaceUsage::new(); + for segment_reader in self.segment_readers.iter() { + space_usage.add_segment(segment_reader.space_usage()); + } + space_usage + } } pub struct FieldSearcher { @@ -98,7 +123,8 @@ impl FieldSearcher { /// Returns a Stream over all of the sorted unique terms of /// for the given field. pub fn terms(&self) -> TermMerger { - let term_streamers: Vec<_> = self.inv_index_readers + let term_streamers: Vec<_> = self + .inv_index_readers .iter() .map(|inverted_index| inverted_index.terms().stream()) .collect(); @@ -108,7 +134,8 @@ impl FieldSearcher { impl fmt::Debug for Searcher { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - let segment_ids = self.segment_readers + let segment_ids = self + .segment_readers .iter() .map(|segment_reader| segment_reader.segment_id()) .collect::>(); diff --git a/src/core/segment_id.rs b/src/core/segment_id.rs index 75e76089d..64c5fee38 100644 --- a/src/core/segment_id.rs +++ b/src/core/segment_id.rs @@ -52,12 +52,12 @@ impl SegmentId { /// Picking the first 8 chars is ok to identify /// segments in a display message. pub fn short_uuid_string(&self) -> String { - (&self.0.simple().to_string()[..8]).to_string() + (&self.0.to_simple_ref().to_string()[..8]).to_string() } /// Returns a segment uuid string. pub fn uuid_string(&self) -> String { - self.0.simple().to_string() + self.0.to_simple_ref().to_string() } } diff --git a/src/core/segment_meta.rs b/src/core/segment_meta.rs index f1e707bad..9478663ea 100644 --- a/src/core/segment_meta.rs +++ b/src/core/segment_meta.rs @@ -50,7 +50,7 @@ impl<'a> serde::Deserialize<'a> for SegmentMeta { { let inner = InnerSegmentMeta::deserialize(deserializer)?; let tracked = INVENTORY.track(inner); - Ok(SegmentMeta { tracked: tracked }) + Ok(SegmentMeta { tracked }) } } diff --git a/src/core/segment_reader.rs b/src/core/segment_reader.rs index f0edb86b3..54b465e77 100644 --- a/src/core/segment_reader.rs +++ b/src/core/segment_reader.rs @@ -4,8 +4,7 @@ use core::InvertedIndexReader; use core::Segment; use core::SegmentComponent; use core::SegmentId; -use core::SegmentMeta; -use error::ErrorKind; +use error::TantivyError; use fastfield::DeleteBitSet; use fastfield::FacetReader; use fastfield::FastFieldReader; @@ -17,6 +16,7 @@ use schema::Document; use schema::Field; use schema::FieldType; use schema::Schema; +use space_usage::SegmentSpaceUsage; use std::collections::HashMap; use std::fmt; use std::sync::Arc; @@ -44,7 +44,8 @@ pub struct SegmentReader { inv_idx_reader_cache: Arc>>>, segment_id: SegmentId, - segment_meta: SegmentMeta, + max_doc: DocId, + num_docs: DocId, termdict_composite: CompositeFile, postings_composite: CompositeFile, @@ -64,7 +65,7 @@ impl SegmentReader { /// Today, `tantivy` does not handle deletes, so it happens /// to also be the number of documents in the index. pub fn max_doc(&self) -> DocId { - self.segment_meta.max_doc() + self.max_doc } /// Returns the number of documents. @@ -73,7 +74,7 @@ impl SegmentReader { /// Today, `tantivy` does not handle deletes so max doc and /// num_docs are the same. 
pub fn num_docs(&self) -> DocId { - self.segment_meta.num_docs() + self.num_docs } /// Returns the schema of the index this segment belongs to. @@ -153,15 +154,17 @@ impl SegmentReader { /// Accessor to the `BytesFastFieldReader` associated to a given `Field`. pub fn bytes_fast_field_reader(&self, field: Field) -> fastfield::Result { let field_entry = self.schema.get_field_entry(field); - match field_entry.field_type() { - &FieldType::Bytes => {} + match *field_entry.field_type() { + FieldType::Bytes => {} _ => return Err(FastFieldNotAvailableError::new(field_entry)), } - let idx_reader = self.fast_fields_composite + let idx_reader = self + .fast_fields_composite .open_read_with_idx(field, 0) .ok_or_else(|| FastFieldNotAvailableError::new(field_entry)) .map(FastFieldReader::open)?; - let values = self.fast_fields_composite + let values = self + .fast_fields_composite .open_read_with_idx(field, 1) .ok_or_else(|| FastFieldNotAvailableError::new(field_entry))?; Ok(BytesFastFieldReader::open(idx_reader, values)) @@ -171,22 +174,22 @@ impl SegmentReader { pub fn facet_reader(&self, field: Field) -> Result { let field_entry = self.schema.get_field_entry(field); if field_entry.field_type() != &FieldType::HierarchicalFacet { - return Err(ErrorKind::InvalidArgument(format!( + return Err(TantivyError::InvalidArgument(format!( "The field {:?} is not a \ hierarchical facet.", field_entry - )).into()); + ))); } let term_ords_reader = self.multi_fast_field_reader(field)?; let termdict_source = self.termdict_composite.open_read(field).ok_or_else(|| { - ErrorKind::InvalidArgument(format!( + TantivyError::InvalidArgument(format!( "The field \"{}\" is a hierarchical \ but this segment does not seem to have the field term \ dictionary.", field_entry.name() )) })?; - let termdict = TermDictionary::from_source(termdict_source); + let termdict = TermDictionary::from_source(&termdict_source); let facet_reader = FacetReader::new(term_ords_reader, termdict); Ok(facet_reader) } @@ -225,6 +228,8 @@ impl SegmentReader { let store_source = segment.open_read(SegmentComponent::STORE)?; let store_reader = StoreReader::from_source(store_source); + fail_point!("SegmentReader::open#middle"); + let postings_source = segment.open_read(SegmentComponent::POSTINGS)?; let postings_composite = CompositeFile::open(&postings_source)?; @@ -260,7 +265,8 @@ impl SegmentReader { let schema = segment.schema(); Ok(SegmentReader { inv_idx_reader_cache: Arc::new(RwLock::new(HashMap::new())), - segment_meta: segment.meta().clone(), + max_doc: segment.meta().max_doc(), + num_docs: segment.meta().num_docs(), termdict_composite, postings_composite, fast_fields_composite, @@ -282,7 +288,8 @@ impl SegmentReader { /// term dictionary associated to a specific field, /// and opening the posting list associated to any term. pub fn inverted_index(&self, field: Field) -> Arc { - if let Some(inv_idx_reader) = self.inv_idx_reader_cache + if let Some(inv_idx_reader) = self + .inv_idx_reader_cache .read() .expect("Lock poisoned. This should never happen") .get(&field) @@ -306,25 +313,28 @@ impl SegmentReader { // As a result, no data is associated to the inverted index. // // Returns an empty inverted index. 
- return Arc::new(InvertedIndexReader::empty(field_type.clone())); + return Arc::new(InvertedIndexReader::empty(field_type)); } let postings_source = postings_source_opt.unwrap(); - let termdict_source = self.termdict_composite + let termdict_source = self + .termdict_composite .open_read(field) .expect("Failed to open field term dictionary in composite file. Is the field indexed"); - let positions_source = self.positions_composite + let positions_source = self + .positions_composite .open_read(field) .expect("Index corrupted. Failed to open field positions in composite file."); - let positions_idx_source = self.positions_idx_composite + let positions_idx_source = self + .positions_idx_composite .open_read(field) .expect("Index corrupted. Failed to open field positions in composite file."); let inv_idx_reader = Arc::new(InvertedIndexReader::new( - TermDictionary::from_source(termdict_source), + TermDictionary::from_source(&termdict_source), postings_source, positions_source, positions_idx_source, @@ -372,6 +382,21 @@ impl SegmentReader { pub fn doc_ids_alive(&self) -> SegmentReaderAliveDocsIterator { SegmentReaderAliveDocsIterator::new(&self) } + + /// Summarize total space usage of this segment. + pub fn space_usage(&self) -> SegmentSpaceUsage { + SegmentSpaceUsage::new( + self.num_docs(), + self.termdict_composite.space_usage(), + self.postings_composite.space_usage(), + self.positions_composite.space_usage(), + self.positions_idx_composite.space_usage(), + self.fast_fields_composite.space_usage(), + self.fieldnorms_composite.space_usage(), + self.store_reader.space_usage(), + self.delete_bitset_opt.as_ref().map(|x| x.space_usage()).unwrap_or(0), + ) + } } impl fmt::Debug for SegmentReader { @@ -391,7 +416,7 @@ pub struct SegmentReaderAliveDocsIterator<'a> { impl<'a> SegmentReaderAliveDocsIterator<'a> { pub fn new(reader: &'a SegmentReader) -> SegmentReaderAliveDocsIterator<'a> { SegmentReaderAliveDocsIterator { - reader: reader, + reader, max_doc: reader.max_doc(), current: 0, } @@ -462,9 +487,7 @@ mod test { index.load_searchers().unwrap(); let searcher = index.searcher(); - let docs: Vec = searcher.segment_reader(0) - .doc_ids_alive() - .collect(); + let docs: Vec = searcher.segment_reader(0).doc_ids_alive().collect(); assert_eq!(vec![0u32, 2u32], docs); } } diff --git a/src/directory/directory.rs b/src/directory/directory.rs index 596cdc492..0f99be74b 100644 --- a/src/directory/directory.rs +++ b/src/directory/directory.rs @@ -77,15 +77,15 @@ pub trait Directory: DirectoryClone + fmt::Debug + Send + Sync + 'static { /// DirectoryClone pub trait DirectoryClone { - /// Clones the directory and boxes the clone - fn box_clone(&self) -> Box; + /// Clones the directory and boxes the clone + fn box_clone(&self) -> Box; } impl DirectoryClone for T where - T: 'static + Directory + Clone, + T: 'static + Directory + Clone, { - fn box_clone(&self) -> Box { - Box::new(self.clone()) - } + fn box_clone(&self) -> Box { + Box::new(self.clone()) + } } diff --git a/src/directory/managed_directory.rs b/src/directory/managed_directory.rs index 2977337c6..34259c184 100644 --- a/src/directory/managed_directory.rs +++ b/src/directory/managed_directory.rs @@ -1,7 +1,8 @@ use core::MANAGED_FILEPATH; use directory::error::{DeleteError, IOError, OpenReadError, OpenWriteError}; use directory::{ReadOnlySource, WritePtr}; -use error::{ErrorKind, Result, ResultExt}; +use error::TantivyError; +use indexer::LockType; use serde_json; use std::collections::HashSet; use std::io; @@ -11,6 +12,18 @@ use std::result; use 
std::sync::RwLockWriteGuard;
 use std::sync::{Arc, RwLock};
 use Directory;
+use Result;
+
+/// Returns true iff the file is "managed".
+/// Non-managed files are not subject to garbage collection.
+///
+/// Filenames that start with a "." (typically locks)
+/// are not managed.
+fn is_managed(path: &Path) -> bool {
+    path.to_str()
+        .map(|p_str| !p_str.starts_with('.'))
+        .unwrap_or(true)
+}
 /// Wrapper of directories that keeps track of files created by Tantivy.
 ///
@@ -39,7 +52,7 @@ fn save_managed_paths(
     wlock: &RwLockWriteGuard,
 ) -> io::Result<()> {
     let mut w = serde_json::to_vec(&wlock.managed_paths)?;
-    write!(&mut w, "\n")?;
+    writeln!(&mut w)?;
     directory.atomic_write(&MANAGED_FILEPATH, &w[..])?;
     Ok(())
 }
@@ -51,7 +64,7 @@ impl ManagedDirectory {
             Ok(data) => {
                 let managed_files_json = String::from_utf8_lossy(&data);
                 let managed_files: HashSet = serde_json::from_str(&managed_files_json)
-                    .chain_err(|| ErrorKind::CorruptedFile(MANAGED_FILEPATH.clone()))?;
+                    .map_err(|_| TantivyError::CorruptedFile(MANAGED_FILEPATH.clone()))?;
                 Ok(ManagedDirectory {
                     directory: Box::new(directory),
                     meta_informations: Arc::new(RwLock::new(MetaInformation {
@@ -81,25 +94,35 @@ impl ManagedDirectory {
     pub fn garbage_collect HashSet>(&mut self, get_living_files: L) {
         info!("Garbage collect");
         let mut files_to_delete = vec![];
+
+        // It is crucial to get the living files after acquiring the
+        // read lock of meta informations. That way, we
+        // avoid the following scenario.
+        //
+        // 1) we get the list of living files.
+        // 2) someone creates a new file.
+        // 3) we start garbage collection and remove this file
+        // even though it is a living file.
+        //
+        // releasing the lock as .delete() will use it too.
         {
-            // releasing the lock as .delete() will use it too.
-            let meta_informations_rlock = self.meta_informations
+            let meta_informations_rlock = self
+                .meta_informations
                 .read()
                 .expect("Managed directory rlock poisoned in garbage collect.");
-            // It is crucial to get the living files after acquiring the
-            // read lock of meta informations. That way, we
-            // avoid the following scenario.
-            //
-            // 1) we get the list of living files.
-            // 2) someone creates a new file.
-            // 3) we start garbage collection and remove this file
-            // even though it is a living file.
-            let living_files = get_living_files();
-
-            for managed_path in &meta_informations_rlock.managed_paths {
-                if !living_files.contains(managed_path) {
-                    files_to_delete.push(managed_path.clone());
+            // The point of this second "file" lock is to enforce the following scenario:
+            // 1) process B tries to load a new set of searchers.
+            //    The list of segments is loaded.
+            // 2) the writer changes meta.json (for instance after a merge or a commit).
+            // 3) gc kicks in.
+            // 4) gc removes a file that was useful for process B, before process B opened it.
+            if let Ok(_meta_lock) = LockType::MetaLock.acquire_lock(self) {
+                let living_files = get_living_files();
+                for managed_path in &meta_informations_rlock.managed_paths {
+                    if !living_files.contains(managed_path) {
+                        files_to_delete.push(managed_path.clone());
+                    }
                 }
             }
         }
@@ -133,7 +156,8 @@ impl ManagedDirectory {
         if !deleted_files.is_empty() {
             // update the list of managed files by removing
            // the files that were removed.
- let mut meta_informations_wlock = self.meta_informations + let mut meta_informations_wlock = self + .meta_informations .write() .expect("Managed directory wlock poisoned (2)."); { @@ -155,8 +179,17 @@ impl ManagedDirectory { /// registering the filepath and creating the file /// will not lead to garbage files that will /// never get removed. + /// + /// File starting by "." are reserved to locks. + /// They are not managed and cannot be subjected + /// to garbage collection. fn register_file_as_managed(&mut self, filepath: &Path) -> io::Result<()> { - let mut meta_wlock = self.meta_informations + // Files starting by "." (e.g. lock files) are not managed. + if !is_managed(filepath) { + return Ok(()); + } + let mut meta_wlock = self + .meta_informations .write() .expect("Managed file lock poisoned"); let has_changed = meta_wlock.managed_paths.insert(filepath.to_owned()); diff --git a/src/directory/mmap_directory.rs b/src/directory/mmap_directory.rs index 05fa18793..619e0fd19 100644 --- a/src/directory/mmap_directory.rs +++ b/src/directory/mmap_directory.rs @@ -32,7 +32,8 @@ fn open_mmap(full_path: &Path) -> result::Result, OpenReadE } })?; - let meta_data = file.metadata() + let meta_data = file + .metadata() .map_err(|e| IOError::with_path(full_path.to_owned(), e))?; if meta_data.len() == 0 { // if the file size is 0, it will not be possible @@ -309,7 +310,8 @@ impl Directory for MmapDirectory { // when the last reference is gone. mmap_cache.cache.remove(&full_path); match fs::remove_file(&full_path) { - Ok(_) => self.sync_directory() + Ok(_) => self + .sync_directory() .map_err(|e| IOError::with_path(path.to_owned(), e).into()), Err(e) => { if e.kind() == io::ErrorKind::NotFound { diff --git a/src/directory/ram_directory.rs b/src/directory/ram_directory.rs index d1a671cd1..ad79319e7 100644 --- a/src/directory/ram_directory.rs +++ b/src/directory/ram_directory.rs @@ -100,8 +100,7 @@ impl InnerDirectory { ); let io_err = make_io_err(msg); OpenReadError::IOError(IOError::with_path(path.to_owned(), io_err)) - }) - .and_then(|readable_map| { + }).and_then(|readable_map| { readable_map .get(path) .ok_or_else(|| OpenReadError::FileDoesNotExist(PathBuf::from(path))) @@ -121,8 +120,7 @@ impl InnerDirectory { ); let io_err = make_io_err(msg); DeleteError::IOError(IOError::with_path(path.to_owned(), io_err)) - }) - .and_then(|mut writable_map| match writable_map.remove(path) { + }).and_then(|mut writable_map| match writable_map.remove(path) { Some(_) => Ok(()), None => Err(DeleteError::FileDoesNotExist(PathBuf::from(path))), }) @@ -170,10 +168,10 @@ impl Directory for RAMDirectory { let path_buf = PathBuf::from(path); let vec_writer = VecWriter::new(path_buf.clone(), self.fs.clone()); - let exists = self.fs + let exists = self + .fs .write(path_buf.clone(), &Vec::new()) .map_err(|err| IOError::with_path(path.to_owned(), err))?; - // force the creation of the file to mimic the MMap directory. 
if exists { Err(OpenWriteError::FileAlreadyExists(path_buf)) @@ -196,6 +194,10 @@ impl Directory for RAMDirectory { } fn atomic_write(&mut self, path: &Path, data: &[u8]) -> io::Result<()> { + fail_point!("RAMDirectory::atomic_write", |msg| Err(io::Error::new( + io::ErrorKind::Other, + msg.unwrap_or("Undefined".to_string()) + ))); let path_buf = PathBuf::from(path); let mut vec_writer = VecWriter::new(path_buf.clone(), self.fs.clone()); self.fs.write(path_buf, &Vec::new())?; diff --git a/src/directory/read_only_source.rs b/src/directory/read_only_source.rs index d2e9358d4..6ed2049e5 100644 --- a/src/directory/read_only_source.rs +++ b/src/directory/read_only_source.rs @@ -5,7 +5,6 @@ use fst::raw::MmapReadOnly; use stable_deref_trait::{CloneStableDeref, StableDeref}; use std::ops::Deref; - /// Read object that represents files in tantivy. /// /// These read objects are only in charge to deliver diff --git a/src/error.rs b/src/error.rs index 4ec4bfe25..a84befbc8 100644 --- a/src/error.rs +++ b/src/error.rs @@ -4,135 +4,125 @@ use std::io; use directory::error::{IOError, OpenDirectoryError, OpenReadError, OpenWriteError}; use fastfield::FastFieldNotAvailableError; +use indexer::LockType; use query; use schema; use serde_json; use std::path::PathBuf; use std::sync::PoisonError; -error_chain!( - errors { - /// Path does not exist. - PathDoesNotExist(buf: PathBuf) { - description("path does not exist") - display("path does not exist: '{:?}'", buf) - } - /// File already exists, this is a problem when we try to write into a new file. - FileAlreadyExists(buf: PathBuf) { - description("file already exists") - display("file already exists: '{:?}'", buf) - } - /// IO Error. - IOError(err: IOError) { - description("an IO error occurred") - display("an IO error occurred: '{}'", err) - } - /// The data within is corrupted. - /// - /// For instance, it contains invalid JSON. - CorruptedFile(buf: PathBuf) { - description("file contains corrupted data") - display("file contains corrupted data: '{:?}'", buf) - } - /// A thread holding the locked panicked and poisoned the lock. - Poisoned { - description("a thread holding the locked panicked and poisoned the lock") - } - /// Invalid argument was passed by the user. - InvalidArgument(arg: String) { - description("an invalid argument was passed") - display("an invalid argument was passed: '{}'", arg) - } - /// An Error happened in one of the thread. - ErrorInThread(err: String) { - description("an error occurred in a thread") - display("an error occurred in a thread: '{}'", err) - } - /// An Error appeared related to the schema. - SchemaError(message: String) { - description("the schema is not matching expectations.") - display("Schema error: '{}'", message) - } - /// Tried to access a fastfield reader for a field not configured accordingly. - FastFieldError(err: FastFieldNotAvailableError) { - description("fast field not available") - display("fast field not available: '{:?}'", err) - } - } -); +/// The library's failure based error enum +#[derive(Debug, Fail)] +pub enum TantivyError { + /// Path does not exist. + #[fail(display = "path does not exist: '{:?}'", _0)] + PathDoesNotExist(PathBuf), + /// File already exists, this is a problem when we try to write into a new file. + #[fail(display = "file already exists: '{:?}'", _0)] + FileAlreadyExists(PathBuf), + /// Failed to acquire file lock + #[fail( + display = "Failed to acquire Lockfile: {:?}. 
Possible causes: another IndexWriter instance or panic during previous lock drop.",
        _0
    )]
+    LockFailure(LockType),
+    /// IO Error.
+    #[fail(display = "an IO error occurred: '{}'", _0)]
+    IOError(#[cause] IOError),
+    /// The data within is corrupted.
+    ///
+    /// For instance, it contains invalid JSON.
+    #[fail(display = "file contains corrupted data: '{:?}'", _0)]
+    CorruptedFile(PathBuf),
+    /// A thread holding the lock panicked and poisoned the lock.
+    #[fail(display = "a thread holding the lock panicked and poisoned the lock")]
+    Poisoned,
+    /// An invalid argument was passed by the user.
+    #[fail(display = "an invalid argument was passed: '{}'", _0)]
+    InvalidArgument(String),
+    /// An error occurred in one of the threads.
+    #[fail(display = "an error occurred in a thread: '{}'", _0)]
+    ErrorInThread(String),
+    /// An error related to the schema.
+    #[fail(display = "Schema error: '{}'", _0)]
+    SchemaError(String),
+    /// Tried to access a fastfield reader for a field not configured accordingly.
+    #[fail(display = "fast field not available: '{:?}'", _0)]
+    FastFieldError(#[cause] FastFieldNotAvailableError),
+}
-impl From<FastFieldNotAvailableError> for Error {
-    fn from(fastfield_error: FastFieldNotAvailableError) -> Error {
-        ErrorKind::FastFieldError(fastfield_error).into()
+impl From<FastFieldNotAvailableError> for TantivyError {
+    fn from(fastfield_error: FastFieldNotAvailableError) -> TantivyError {
+        TantivyError::FastFieldError(fastfield_error)
     }
 }
-impl From<IOError> for Error {
-    fn from(io_error: IOError) -> Error {
-        ErrorKind::IOError(io_error).into()
+impl From<IOError> for TantivyError {
+    fn from(io_error: IOError) -> TantivyError {
+        TantivyError::IOError(io_error)
     }
 }
-impl From<io::Error> for Error {
-    fn from(io_error: io::Error) -> Error {
-        ErrorKind::IOError(io_error.into()).into()
+impl From<io::Error> for TantivyError {
-    fn from(io_error: io::Error) -> TantivyError {
+        TantivyError::IOError(io_error.into())
     }
 }
-impl From<query::QueryParserError> for Error {
-    fn from(parsing_error: query::QueryParserError) -> Error {
-        ErrorKind::InvalidArgument(format!("Query is invalid. {:?}", parsing_error)).into()
+impl From<query::QueryParserError> for TantivyError {
+    fn from(parsing_error: query::QueryParserError) -> TantivyError {
+        TantivyError::InvalidArgument(format!("Query is invalid.
{:?}", parsing_error)) } } -impl From> for Error { - fn from(_: PoisonError) -> Error { - ErrorKind::Poisoned.into() +impl From> for TantivyError { + fn from(_: PoisonError) -> TantivyError { + TantivyError::Poisoned } } -impl From for Error { - fn from(error: OpenReadError) -> Error { +impl From for TantivyError { + fn from(error: OpenReadError) -> TantivyError { match error { - OpenReadError::FileDoesNotExist(filepath) => { - ErrorKind::PathDoesNotExist(filepath).into() + OpenReadError::FileDoesNotExist(filepath) => TantivyError::PathDoesNotExist(filepath), + OpenReadError::IOError(io_error) => TantivyError::IOError(io_error), + } + } +} + +impl From for TantivyError { + fn from(error: schema::DocParsingError) -> TantivyError { + TantivyError::InvalidArgument(format!("Failed to parse document {:?}", error)) + } +} + +impl From for TantivyError { + fn from(error: OpenWriteError) -> TantivyError { + match error { + OpenWriteError::FileAlreadyExists(filepath) => { + TantivyError::FileAlreadyExists(filepath) } - OpenReadError::IOError(io_error) => ErrorKind::IOError(io_error).into(), + OpenWriteError::IOError(io_error) => TantivyError::IOError(io_error), } } } -impl From for Error { - fn from(error: schema::DocParsingError) -> Error { - ErrorKind::InvalidArgument(format!("Failed to parse document {:?}", error)).into() - } -} - -impl From for Error { - fn from(error: OpenWriteError) -> Error { - match error { - OpenWriteError::FileAlreadyExists(filepath) => ErrorKind::FileAlreadyExists(filepath), - OpenWriteError::IOError(io_error) => ErrorKind::IOError(io_error), - }.into() - } -} - -impl From for Error { - fn from(error: OpenDirectoryError) -> Error { +impl From for TantivyError { + fn from(error: OpenDirectoryError) -> TantivyError { match error { OpenDirectoryError::DoesNotExist(directory_path) => { - ErrorKind::PathDoesNotExist(directory_path).into() + TantivyError::PathDoesNotExist(directory_path) + } + OpenDirectoryError::NotADirectory(directory_path) => { + TantivyError::InvalidArgument(format!("{:?} is not a directory", directory_path)) } - OpenDirectoryError::NotADirectory(directory_path) => ErrorKind::InvalidArgument( - format!("{:?} is not a directory", directory_path), - ).into(), } } } -impl From for Error { - fn from(error: serde_json::Error) -> Error { +impl From for TantivyError { + fn from(error: serde_json::Error) -> TantivyError { let io_err = io::Error::from(error); - ErrorKind::IOError(io_err.into()).into() + TantivyError::IOError(io_err.into()) } } diff --git a/src/fastfield/bytes/writer.rs b/src/fastfield/bytes/writer.rs index 568a5421f..472e8d682 100644 --- a/src/fastfield/bytes/writer.rs +++ b/src/fastfield/bytes/writer.rs @@ -51,7 +51,7 @@ impl BytesFastFieldWriter { self.next_doc(); for field_value in doc.field_values() { if field_value.field() == self.field { - if let &Value::Bytes(ref bytes) = field_value.value() { + if let Value::Bytes(ref bytes) = *field_value.value() { self.vals.extend_from_slice(bytes); } else { panic!( diff --git a/src/fastfield/delete.rs b/src/fastfield/delete.rs index 3f8a0eb5b..76ff7e43b 100644 --- a/src/fastfield/delete.rs +++ b/src/fastfield/delete.rs @@ -2,6 +2,7 @@ use bit_set::BitSet; use common::HasLen; use directory::ReadOnlySource; use directory::WritePtr; +use space_usage::ByteCount; use std::io; use std::io::Write; use DocId; @@ -41,7 +42,8 @@ pub struct DeleteBitSet { impl DeleteBitSet { /// Opens a delete bitset given its data source. 
pub fn open(data: ReadOnlySource) -> DeleteBitSet { - let num_deleted: usize = data.as_slice() + let num_deleted: usize = data + .as_slice() .iter() .map(|b| b.count_ones() as usize) .sum(); @@ -62,6 +64,11 @@ impl DeleteBitSet { b & (1u8 << shift) != 0 } } + + /// Summarize total space usage of this bitset. + pub fn space_usage(&self) -> ByteCount { + self.data.len() + } } impl HasLen for DeleteBitSet { diff --git a/src/fastfield/error.rs b/src/fastfield/error.rs index a05ef2284..df6c2febe 100644 --- a/src/fastfield/error.rs +++ b/src/fastfield/error.rs @@ -4,7 +4,8 @@ use std::result; /// `FastFieldNotAvailableError` is returned when the /// user requested for a fast field reader, and the field was not /// defined in the schema as a fast field. -#[derive(Debug)] +#[derive(Debug, Fail)] +#[fail(display = "field not available: '{:?}'", field_name)] pub struct FastFieldNotAvailableError { field_name: String, } diff --git a/src/fastfield/facet_reader.rs b/src/fastfield/facet_reader.rs index 182b17989..92a917089 100644 --- a/src/fastfield/facet_reader.rs +++ b/src/fastfield/facet_reader.rs @@ -56,7 +56,8 @@ impl FacetReader { /// Given a term ordinal returns the term associated to it. pub fn facet_from_ord(&self, facet_ord: TermOrdinal, output: &mut Facet) { - let found_term = self.term_dict + let found_term = self + .term_dict .ord_to_term(facet_ord as u64, output.inner_buffer_mut()); assert!(found_term, "Term ordinal {} no found.", facet_ord); } diff --git a/src/fastfield/mod.rs b/src/fastfield/mod.rs index e3599bacf..fdb029432 100644 --- a/src/fastfield/mod.rs +++ b/src/fastfield/mod.rs @@ -370,7 +370,7 @@ mod tests { pub fn generate_permutation() -> Vec { let seed: [u8; 16] = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]; let mut rng = XorShiftRng::from_seed(seed); - let mut permutation: Vec = (0u64..1_000_000u64).collect(); + let mut permutation: Vec = (0u64..100_000u64).collect(); rng.shuffle(&mut permutation); permutation } diff --git a/src/fastfield/multivalued/writer.rs b/src/fastfield/multivalued/writer.rs index 9177ddcd9..e5fd45203 100644 --- a/src/fastfield/multivalued/writer.rs +++ b/src/fastfield/multivalued/writer.rs @@ -132,7 +132,8 @@ impl MultiValueIntFastFieldWriter { ); let mut doc_vals: Vec = Vec::with_capacity(100); - for (start, stop) in self.doc_index + for (start, stop) in self + .doc_index .windows(2) .map(|interval| (interval[0], interval[1])) .chain(Some(last_interval).into_iter()) @@ -148,7 +149,6 @@ impl MultiValueIntFastFieldWriter { value_serializer.add_val(val)?; } } - } None => { let val_min_max = self.vals.iter().cloned().minmax(); diff --git a/src/fastfield/reader.rs b/src/fastfield/reader.rs index f4b90ac8b..6df8e3775 100644 --- a/src/fastfield/reader.rs +++ b/src/fastfield/reader.rs @@ -11,7 +11,6 @@ use schema::SchemaBuilder; use schema::FAST; use std::collections::HashMap; use std::marker::PhantomData; -use std::mem; use std::path::Path; use DocId; @@ -80,7 +79,8 @@ impl FastFieldReader { // TODO change start to `u64`. 
// For multifastfield, start is an index in a second fastfield, not a `DocId` pub fn get_range(&self, start: u32, output: &mut [Item]) { - let output_u64: &mut [u64] = unsafe { mem::transmute(output) }; // ok: Item is either `u64` or `i64` + // ok: Item is either `u64` or `i64` + let output_u64: &mut [u64] = unsafe { &mut *(output as *mut [Item] as *mut [u64]) }; self.bit_unpacker.get_range(start, output_u64); for out in output_u64.iter_mut() { *out = Item::from_u64(*out + self.min_value_u64).as_u64(); diff --git a/src/fieldnorm/code.rs b/src/fieldnorm/code.rs index 71079bd02..3a62d18c2 100644 --- a/src/fieldnorm/code.rs +++ b/src/fieldnorm/code.rs @@ -10,27 +10,28 @@ pub fn fieldnorm_to_id(fieldnorm: u32) -> u8 { .unwrap_or_else(|idx| idx - 1) as u8 } +#[cfg_attr(feature = "cargo-clippy", allow(clippy::unreadable_literal))] pub const FIELD_NORMS_TABLE: [u32; 256] = [ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 42, 44, 46, 48, 50, 52, 54, 56, 60, 64, 68, 72, 76, 80, 84, 88, 96, 104, 112, 120, 128, 136, 144, 152, 168, 184, 200, 216, 232, - 248, 264, 280, 312, 344, 376, 408, 440, 472, 504, 536, 600, 664, 728, 792, 856, 920, 984, 1048, - 1176, 1304, 1432, 1560, 1688, 1816, 1944, 2072, 2328, 2584, 2840, 3096, 3352, 3608, 3864, 4120, - 4632, 5144, 5656, 6168, 6680, 7192, 7704, 8216, 9240, 10264, 11288, 12312, 13336, 14360, 15384, - 16408, 18456, 20504, 22552, 24600, 26648, 28696, 30744, 32792, 36888, 40984, 45080, 49176, - 53272, 57368, 61464, 65560, 73752, 81944, 90136, 98328, 106520, 114712, 122904, 131096, 147480, - 163864, 180248, 196632, 213016, 229400, 245784, 262168, 294936, 327704, 360472, 393240, 426008, - 458776, 491544, 524312, 589848, 655384, 720920, 786456, 851992, 917528, 983064, 1048600, - 1179672, 1310744, 1441816, 1572888, 1703960, 1835032, 1966104, 2097176, 2359320, 2621464, - 2883608, 3145752, 3407896, 3670040, 3932184, 4194328, 4718616, 5242904, 5767192, 6291480, - 6815768, 7340056, 7864344, 8388632, 9437208, 10485784, 11534360, 12582936, 13631512, 14680088, - 15728664, 16777240, 18874392, 20971544, 23068696, 25165848, 27263000, 29360152, 31457304, - 33554456, 37748760, 41943064, 46137368, 50331672, 54525976, 58720280, 62914584, 67108888, - 75497496, 83886104, 92274712, 100663320, 109051928, 117440536, 125829144, 134217752, 150994968, - 167772184, 184549400, 201326616, 218103832, 234881048, 251658264, 268435480, 301989912, - 335544344, 369098776, 402653208, 436207640, 469762072, 503316504, 536870936, 603979800, - 671088664, 738197528, 805306392, 872415256, 939524120, 1006632984, 1073741848, 1207959576, - 1342177304, 1476395032, 1610612760, 1744830488, 1879048216, 2013265944, + 248, 264, 280, 312, 344, 376, 408, 440, 472, 504, 536, 600, 664, 728, 792, 856, 920, 984, + 1_048, 1176, 1304, 1432, 1560, 1688, 1816, 1944, 2072, 2328, 2584, 2840, 3096, 3352, 3608, + 3864, 4120, 4632, 5144, 5656, 6168, 6680, 7192, 7704, 8216, 9240, 10264, 11288, 12312, 13336, + 14360, 15384, 16408, 18456, 20504, 22552, 24600, 26648, 28696, 30744, 32792, 36888, 40984, + 45080, 49176, 53272, 57368, 61464, 65560, 73752, 81944, 90136, 98328, 106520, 114712, 122904, + 131096, 147480, 163864, 180248, 196632, 213016, 229400, 245784, 262168, 294936, 327704, 360472, + 393240, 426008, 458776, 491544, 524312, 589848, 655384, 720920, 786456, 851992, 917528, 983064, + 1048600, 1179672, 1310744, 1441816, 1572888, 1703960, 1835032, 1966104, 2097176, 2359320, + 2621464, 2883608, 3145752, 3407896, 3670040, 
3932184, 4194328, 4718616, 5242904, 5767192, + 6291480, 6815768, 7340056, 7864344, 8388632, 9437208, 10485784, 11534360, 12582936, 13631512, + 14680088, 15728664, 16777240, 18874392, 20971544, 23068696, 25165848, 27263000, 29360152, + 31457304, 33554456, 37748760, 41943064, 46137368, 50331672, 54525976, 58720280, 62914584, + 67108888, 75497496, 83886104, 92274712, 100663320, 109051928, 117440536, 125829144, 134217752, + 150994968, 167772184, 184549400, 201326616, 218103832, 234881048, 251658264, 268435480, + 301989912, 335544344, 369098776, 402653208, 436207640, 469762072, 503316504, 536870936, + 603979800, 671088664, 738197528, 805306392, 872415256, 939524120, 1006632984, 1073741848, + 1207959576, 1342177304, 1476395032, 1610612760, 1744830488, 1879048216, 2013265944, ]; #[cfg(test)] diff --git a/src/functional_test.rs b/src/functional_test.rs index af7b1883a..9905f1d6e 100644 --- a/src/functional_test.rs +++ b/src/functional_test.rs @@ -1,8 +1,8 @@ use rand::thread_rng; use std::collections::HashSet; -use rand::Rng; use rand::distributions::Range; +use rand::Rng; use schema::*; use Index; use Searcher; diff --git a/src/indexer/delete_queue.rs b/src/indexer/delete_queue.rs index 4c2597fbb..842b7a2f3 100644 --- a/src/indexer/delete_queue.rs +++ b/src/indexer/delete_queue.rs @@ -52,7 +52,8 @@ impl DeleteQueue { // // Past delete operations are not accessible. pub fn cursor(&self) -> DeleteCursor { - let last_block = self.inner + let last_block = self + .inner .read() .expect("Read lock poisoned when opening delete queue cursor") .last_block @@ -92,7 +93,8 @@ impl DeleteQueue { // be some unflushed operations. // fn flush(&self) -> Option> { - let mut self_wlock = self.inner + let mut self_wlock = self + .inner .write() .expect("Failed to acquire write lock on delete queue writer"); @@ -132,7 +134,8 @@ impl From for NextBlock { impl NextBlock { fn next_block(&self) -> Option> { { - let next_read_lock = self.0 + let next_read_lock = self + .0 .read() .expect("Failed to acquire write lock in delete queue"); if let InnerNextBlock::Closed(ref block) = *next_read_lock { @@ -141,7 +144,8 @@ impl NextBlock { } let next_block; { - let mut next_write_lock = self.0 + let mut next_write_lock = self + .0 .write() .expect("Failed to acquire write lock in delete queue"); match *next_write_lock { @@ -182,19 +186,21 @@ impl DeleteCursor { /// `opstamp >= target_opstamp`. pub fn skip_to(&mut self, target_opstamp: u64) { // TODO Can be optimize as we work with block. - #[cfg_attr(feature = "cargo-clippy", allow(while_let_loop))] - loop { - if let Some(operation) = self.get() { - if operation.opstamp >= target_opstamp { - break; - } - } else { - break; - } + while self.is_behind_opstamp(target_opstamp) { self.advance(); } } + #[cfg_attr( + feature = "cargo-clippy", + allow(clippy::wrong_self_convention) + )] + fn is_behind_opstamp(&mut self, target_opstamp: u64) -> bool { + self.get() + .map(|operation| operation.opstamp < target_opstamp) + .unwrap_or(false) + } + /// If the current block has been entirely /// consumed, try to load the next one. 
/// diff --git a/src/indexer/directory_lock.rs index b152a3c58..172165bc2 100644 --- a/src/indexer/directory_lock.rs +++ b/src/indexer/directory_lock.rs @@ -1,26 +1,130 @@ -use core::LOCKFILE_FILEPATH; use directory::error::OpenWriteError; +use std::io::Write; +use std::path::{Path, PathBuf}; +use std::thread; +use std::time::Duration; use Directory; +use TantivyError; -/// The directory lock is a mechanism used to -/// prevent the creation of two [`IndexWriter`](struct.IndexWriter.html) -/// -/// Only one lock can exist at a time for a given directory. -/// The lock is release automatically on `Drop`. -pub struct DirectoryLock { - directory: Box, +#[derive(Debug, Clone, Copy)] +pub enum LockType { + /// Only one process should be able to write tantivy's index at a time. + /// This lock file, when present, is in charge of preventing other processes from opening an IndexWriter. + /// + /// If the process is killed and this file remains, it is safe to remove it manually. + /// + /// Failing to acquire this lock usually means a misuse of tantivy's API + /// (creating more than one instance of the `IndexWriter`), or a spurious + /// lock file remaining after a crash. In the latter case, removing the file after + /// checking that no process using tantivy is running is safe. + IndexWriterLock, + /// The meta lock file is here to protect the segment files being opened by + /// `.load_searchers()` from being garbage collected. + /// It makes it possible for another process to safely consume + /// our index while it is being written. Ideally, we may have preferred `RWLock` semantics + /// here, but it is difficult to achieve on Windows. + /// + /// Opening segment readers is a very fast process. + /// Right now, if the lock cannot be acquired on the first attempt, the logic + /// is very simplistic. We retry after `100ms` until we effectively + /// acquire the lock. + /// This lock should not have much contention in normal usage. + MetaLock, } -impl DirectoryLock { - pub fn lock(mut directory: Box) -> Result { - directory.open_write(&*LOCKFILE_FILEPATH)?; - Ok(DirectoryLock { directory }) +/// The retry logic for acquiring locks is pretty simple. +/// We just retry `n` times after a given `duration`, both +/// depending on the type of lock. +struct RetryPolicy { + num_retries: usize, + wait_in_ms: u64, +} + +impl RetryPolicy { + fn no_retry() -> RetryPolicy { + RetryPolicy { + num_retries: 0, + wait_in_ms: 0, + } } + + fn wait_and_retry(&mut self) -> bool { + if self.num_retries == 0 { + false + } else { + self.num_retries -= 1; + let wait_duration = Duration::from_millis(self.wait_in_ms); + thread::sleep(wait_duration); + true + } + } +} + +impl LockType { + fn retry_policy(self) -> RetryPolicy { + match self { + LockType::IndexWriterLock => RetryPolicy::no_retry(), + LockType::MetaLock => RetryPolicy { + num_retries: 100, + wait_in_ms: 100, + }, + } + } + + fn try_acquire_lock(self, directory: &mut Directory) -> Result { + let path = self.filename(); + let mut write = directory.open_write(path).map_err(|e| match e { + OpenWriteError::FileAlreadyExists(_) => TantivyError::LockFailure(self), + OpenWriteError::IOError(io_error) => TantivyError::IOError(io_error), + })?; + write.flush()?; + Ok(DirectoryLock { + directory: directory.box_clone(), + path: path.to_owned(), + }) + } + + /// Acquire a lock in the given directory.
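The retry behaviour described in the comments above is worth seeing in isolation: the writer lock gives up immediately, while the meta lock retries up to 100 times with a 100ms pause between attempts. Below is a minimal, self-contained sketch of that loop using only `std`; the `acquire_with_retry` helper and its closure are illustrative stand-ins, not tantivy's API, and unlike the real code it retries on any error rather than only on lock failures.

```rust
use std::thread;
use std::time::Duration;

/// How many times to retry and how long to wait between attempts.
struct RetryPolicy {
    num_retries: usize,
    wait_in_ms: u64,
}

impl RetryPolicy {
    /// Sleeps and returns `true` if another attempt should be made.
    fn wait_and_retry(&mut self) -> bool {
        if self.num_retries == 0 {
            return false;
        }
        self.num_retries -= 1;
        thread::sleep(Duration::from_millis(self.wait_in_ms));
        true
    }
}

/// Keeps calling `try_acquire` until it succeeds or the policy gives up.
fn acquire_with_retry<T, E>(
    mut policy: RetryPolicy,
    mut try_acquire: impl FnMut() -> Result<T, E>,
) -> Result<T, E> {
    loop {
        match try_acquire() {
            Ok(value) => return Ok(value),
            Err(err) => {
                if !policy.wait_and_retry() {
                    return Err(err);
                }
            }
        }
    }
}

fn main() {
    // Meta-lock-like policy: retry up to 100 times, waiting 100ms each time.
    let policy = RetryPolicy { num_retries: 100, wait_in_ms: 100 };
    let mut attempts = 0;
    let result: Result<&str, &str> = acquire_with_retry(policy, || {
        attempts += 1;
        if attempts < 3 { Err("lock held") } else { Ok("acquired") }
    });
    assert_eq!(result, Ok("acquired"));
}
```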
+ pub fn acquire_lock(self, directory: &Directory) -> Result { + let mut box_directory = directory.box_clone(); + let mut retry_policy = self.retry_policy(); + loop { + let lock_result = self.try_acquire_lock(&mut *box_directory); + match lock_result { + Ok(result) => { + return Ok(result); + } + Err(TantivyError::LockFailure(ref filepath)) => { + if !retry_policy.wait_and_retry() { + return Err(TantivyError::LockFailure(filepath.to_owned())); + } + } + Err(_) => {} + } + } + } + + fn filename(&self) -> &Path { + match *self { + LockType::MetaLock => Path::new(".tantivy-meta.lock"), + LockType::IndexWriterLock => Path::new(".tantivy-indexer.lock"), + } + } +} + +/// The `DirectoryLock` is an object that represents a file lock. +/// See [`LockType`](struct.LockType.html) +/// +/// It is transparently associated to a lock file, that gets deleted +/// on `Drop.` The lock is release automatically on `Drop`. +pub struct DirectoryLock { + directory: Box, + path: PathBuf, } impl Drop for DirectoryLock { fn drop(&mut self) { - if let Err(e) = self.directory.delete(&*LOCKFILE_FILEPATH) { + if let Err(e) = self.directory.delete(&*self.path) { error!("Failed to remove the lock file. {:?}", e); } } diff --git a/src/indexer/index_writer.rs b/src/indexer/index_writer.rs index 59cfb6661..66de84c16 100644 --- a/src/indexer/index_writer.rs +++ b/src/indexer/index_writer.rs @@ -2,15 +2,15 @@ use super::operation::AddOperation; use super::segment_updater::SegmentUpdater; use super::PreparedCommit; use bit_set::BitSet; -use crossbeam_channel as channel; use core::Index; use core::Segment; use core::SegmentComponent; use core::SegmentId; use core::SegmentMeta; use core::SegmentReader; +use crossbeam_channel as channel; use docset::DocSet; -use error::{Error, ErrorKind, Result, ResultExt}; +use error::TantivyError; use fastfield::write_delete_bitset; use futures::sync::oneshot::Receiver; use indexer::delete_queue::{DeleteCursor, DeleteQueue}; @@ -29,6 +29,7 @@ use std::mem; use std::mem::swap; use std::thread; use std::thread::JoinHandle; +use Result; // Size of the margin for the heap. A segment is closed when the remaining memory // in the heap goes below MARGIN_IN_BYTES. @@ -53,14 +54,14 @@ type DocumentReceiver = channel::Receiver; fn initial_table_size(per_thread_memory_budget: usize) -> usize { let table_size_limit: usize = per_thread_memory_budget / 3; (1..) - .into_iter() .take_while(|num_bits: &usize| compute_table_size(*num_bits) < table_size_limit) .last() - .expect(&format!( - "Per thread memory is too small: {}", - per_thread_memory_budget - )) - .min(19) // we cap it at 512K + .unwrap_or_else(|| { + panic!( + "Per thread memory is too small: {}", + per_thread_memory_budget + ) + }).min(19) // we cap it at 512K } /// `IndexWriter` is the user entry-point to add document to an index. 
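`DirectoryLock` above is an RAII guard: acquiring the lock means creating a file whose presence signals "locked", and dropping the guard deletes it. The sketch below shows the same idea with plain `std::fs` instead of tantivy's `Directory` abstraction; the `FileLock` type and the lock-file name are invented for illustration.

```rust
use std::fs::{self, OpenOptions};
use std::io;
use std::path::{Path, PathBuf};

/// RAII guard: the lock file exists exactly as long as this value is alive.
struct FileLock {
    path: PathBuf,
}

impl FileLock {
    /// Fails if the lock file already exists, i.e. someone else holds the lock.
    fn try_acquire(path: &Path) -> io::Result<FileLock> {
        OpenOptions::new()
            .write(true)
            .create_new(true) // errors with AlreadyExists if the file is present
            .open(path)?;
        Ok(FileLock { path: path.to_owned() })
    }
}

impl Drop for FileLock {
    fn drop(&mut self) {
        // Releasing the lock simply removes the file.
        if let Err(e) = fs::remove_file(&self.path) {
            eprintln!("Failed to remove the lock file: {:?}", e);
        }
    }
}

fn main() -> io::Result<()> {
    let path = Path::new(".example-indexer.lock");
    let _guard = FileLock::try_acquire(path)?;
    // A second acquisition fails while the first guard is alive.
    assert!(FileLock::try_acquire(path).is_err());
    Ok(())
    // `_guard` is dropped here and the lock file disappears.
}
```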
@@ -122,11 +123,11 @@ pub fn open_index_writer( "The heap size per thread needs to be at least {}.", HEAP_SIZE_MIN ); - bail!(ErrorKind::InvalidArgument(err_msg)); + return Err(TantivyError::InvalidArgument(err_msg)); } if heap_size_in_bytes_per_thread >= HEAP_SIZE_MAX { let err_msg = format!("The heap size per thread cannot exceed {}", HEAP_SIZE_MAX); - bail!(ErrorKind::InvalidArgument(err_msg)); + return Err(TantivyError::InvalidArgument(err_msg)); } let (document_sender, document_receiver): (DocumentSender, DocumentReceiver) = channel::bounded(PIPELINE_MAX_SIZE_IN_DOCS); @@ -176,7 +177,7 @@ pub fn compute_deleted_bitset( ) -> Result { let mut might_have_changed = false; - #[cfg_attr(feature = "cargo-clippy", allow(while_let_loop))] + #[cfg_attr(feature = "cargo-clippy", allow(clippy::while_let_loop))] loop { if let Some(delete_op) = delete_cursor.get() { if delete_op.opstamp > target_opstamp { @@ -300,25 +301,29 @@ fn index_documents( let last_docstamp: u64 = *(doc_opstamps.last().unwrap()); - let doc_to_opstamps = DocToOpstampMapping::from(doc_opstamps); - let segment_reader = SegmentReader::open(segment)?; - let mut deleted_bitset = BitSet::with_capacity(num_docs as usize); - let may_have_deletes = compute_deleted_bitset( - &mut deleted_bitset, - &segment_reader, - &mut delete_cursor, - &doc_to_opstamps, - last_docstamp, - )?; - - let segment_entry = SegmentEntry::new(segment_meta, delete_cursor, { - if may_have_deletes { - Some(deleted_bitset) - } else { - None - } - }); - + let segment_entry: SegmentEntry = if delete_cursor.get().is_some() { + let doc_to_opstamps = DocToOpstampMapping::from(doc_opstamps); + let segment_reader = SegmentReader::open(segment)?; + let mut deleted_bitset = BitSet::with_capacity(num_docs as usize); + let may_have_deletes = compute_deleted_bitset( + &mut deleted_bitset, + &segment_reader, + &mut delete_cursor, + &doc_to_opstamps, + last_docstamp, + )?; + SegmentEntry::new(segment_meta, delete_cursor, { + if may_have_deletes { + Some(deleted_bitset) + } else { + None + } + }) + } else { + // if there are no delete operation in the queue, no need + // to even open the segment. + SegmentEntry::new(segment_meta, delete_cursor, None) + }; Ok(segment_updater.add_segment(generation, segment_entry)) } @@ -334,13 +339,16 @@ impl IndexWriter { join_handle .join() .expect("Indexing Worker thread panicked") - .chain_err(|| ErrorKind::ErrorInThread("Error in indexing worker thread.".into()))?; + .map_err(|_| { + TantivyError::ErrorInThread("Error in indexing worker thread.".into()) + })?; } drop(self.workers_join_handle); - let result = self.segment_updater + let result = self + .segment_updater .wait_merging_thread() - .chain_err(|| ErrorKind::ErrorInThread("Failed to join merging thread.".into())); + .map_err(|_| TantivyError::ErrorInThread("Failed to join merging thread.".into())); if let Err(ref e) = result { error!("Some merging thread failed {:?}", e); @@ -382,11 +390,9 @@ impl IndexWriter { .name(format!( "indexing thread {} for gen {}", self.worker_id, generation - )) - .spawn(move || { + )).spawn(move || { loop { - let mut document_iterator = - document_receiver_clone.clone().into_iter().peekable(); + let mut document_iterator = document_receiver_clone.clone().peekable(); // the peeking here is to avoid // creating a new segment's files @@ -485,7 +491,8 @@ impl IndexWriter { let document_receiver = self.document_receiver.clone(); // take the directory lock to create a new index_writer. 
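With the switch from `error-chain` to `failure`, the `chain_err` calls in this `index_writer.rs` hunk become plain `map_err`s into `TantivyError` variants, for example when joining worker threads. A tiny sketch of that pattern, with a made-up `MyError` enum standing in for the real error type:

```rust
use std::thread;

/// Stand-in for the crate's error type; only the variant used here is sketched.
#[derive(Debug)]
enum MyError {
    ErrorInThread(String),
}

fn run_worker() -> Result<u32, MyError> {
    let handle = thread::spawn(|| {
        // ... indexing work would happen here ...
        42u32
    });
    // `join()` fails only if the thread panicked; map that into a domain error
    // instead of propagating the opaque panic payload.
    handle
        .join()
        .map_err(|_| MyError::ErrorInThread("Error in indexing worker thread.".into()))
}

fn main() {
    assert_eq!(run_worker().unwrap(), 42);
}
```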
- let directory_lock = self._directory_lock + let directory_lock = self + ._directory_lock .take() .expect("The IndexWriter does not have any lock. This is a bug, please report."); @@ -559,7 +566,7 @@ impl IndexWriter { for worker_handle in former_workers_join_handle { let indexing_worker_result = worker_handle .join() - .map_err(|e| Error::from_kind(ErrorKind::ErrorInThread(format!("{:?}", e))))?; + .map_err(|e| TantivyError::ErrorInThread(format!("{:?}", e)))?; indexing_worker_result?; // add a new worker for the next generation. @@ -654,11 +661,26 @@ mod tests { let index = Index::create_in_ram(schema_builder.build()); let _index_writer = index.writer(40_000_000).unwrap(); match index.writer(40_000_000) { - Err(Error(ErrorKind::FileAlreadyExists(_), _)) => {} + Err(TantivyError::LockFailure(_)) => {} _ => panic!("Expected FileAlreadyExists error"), } } + #[test] + fn test_lockfile_already_exists_error_msg() { + let schema_builder = schema::SchemaBuilder::default(); + let index = Index::create_in_ram(schema_builder.build()); + let _index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap(); + match index.writer_with_num_threads(1, 3_000_000) { + Err(err) => { + let err_msg = err.to_string(); + assert!(err_msg.contains("Lockfile")); + assert!(err_msg.contains("Possible causes:")) + } + _ => panic!("Expected LockfileAlreadyExists error"), + } + } + #[test] fn test_set_merge_policy() { let schema_builder = schema::SchemaBuilder::default(); @@ -840,4 +862,32 @@ mod tests { assert_eq!(initial_table_size(1_000_000_000), 19); } + #[cfg(not(feature = "no_fail"))] + #[test] + fn test_write_commit_fails() { + use fail; + let mut schema_builder = schema::SchemaBuilder::default(); + let text_field = schema_builder.add_text_field("text", schema::TEXT); + let index = Index::create_in_ram(schema_builder.build()); + + let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap(); + for _ in 0..100 { + index_writer.add_document(doc!(text_field => "a")); + } + index_writer.commit().unwrap(); + fail::cfg("RAMDirectory::atomic_write", "return(error_write_failed)").unwrap(); + for _ in 0..100 { + index_writer.add_document(doc!(text_field => "b")); + } + assert!(index_writer.commit().is_err()); + index.load_searchers().unwrap(); + let num_docs_containing = |s: &str| { + let searcher = index.searcher(); + let term_a = Term::from_field_text(text_field, s); + searcher.doc_freq(&term_a) + }; + assert_eq!(num_docs_containing("a"), 100); + assert_eq!(num_docs_containing("b"), 0); + fail::cfg("RAMDirectory::atomic_write", "off").unwrap(); + } } diff --git a/src/indexer/merge_policy.rs b/src/indexer/merge_policy.rs index 8df588ca3..407cb94bb 100644 --- a/src/indexer/merge_policy.rs +++ b/src/indexer/merge_policy.rs @@ -21,17 +21,17 @@ pub trait MergePolicy: MergePolicyClone + marker::Send + marker::Sync + Debug { /// MergePolicyClone pub trait MergePolicyClone { - /// Returns a boxed clone of the MergePolicy. - fn box_clone(&self) -> Box; + /// Returns a boxed clone of the MergePolicy. + fn box_clone(&self) -> Box; } impl MergePolicyClone for T where - T: 'static + MergePolicy + Clone, + T: 'static + MergePolicy + Clone, { - fn box_clone(&self) -> Box { - Box::new(self.clone()) - } + fn box_clone(&self) -> Box { + Box::new(self.clone()) + } } /// Never merge segments. 
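The reformatted `MergePolicyClone` trait above exists because a `Box<dyn MergePolicy>` cannot be cloned directly: an object-safe `box_clone` method plus a blanket impl gives every `Clone` policy a boxed clone for free. A stripped-down version of the pattern (shortened names, no `Send`/`Sync` bounds; not the real trait):

```rust
use std::fmt::Debug;

trait Policy: PolicyClone + Debug {
    fn max_merge_size(&self) -> usize;
}

/// Object-safe cloning for boxed trait objects.
trait PolicyClone {
    fn box_clone(&self) -> Box<dyn Policy>;
}

/// Blanket impl: any concrete `Policy` that is `Clone` gets `box_clone` for free.
impl<T> PolicyClone for T
where
    T: 'static + Policy + Clone,
{
    fn box_clone(&self) -> Box<dyn Policy> {
        Box::new(self.clone())
    }
}

impl Clone for Box<dyn Policy> {
    fn clone(&self) -> Box<dyn Policy> {
        self.box_clone()
    }
}

#[derive(Debug, Clone)]
struct NoMerge;

impl Policy for NoMerge {
    fn max_merge_size(&self) -> usize {
        0
    }
}

fn main() {
    let a: Box<dyn Policy> = Box::new(NoMerge);
    let b = a.clone(); // works thanks to box_clone
    assert_eq!(b.max_merge_size(), 0);
}
```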
diff --git a/src/indexer/merger.rs b/src/indexer/merger.rs index 1a5d4c026..a42ea6d44 100644 --- a/src/indexer/merger.rs +++ b/src/indexer/merger.rs @@ -2,7 +2,6 @@ use core::Segment; use core::SegmentReader; use core::SerializableSegment; use docset::DocSet; -use error::Result; use fastfield::DeleteBitSet; use fastfield::FastFieldReader; use fastfield::FastFieldSerializer; @@ -23,6 +22,7 @@ use store::StoreWriter; use termdict::TermMerger; use termdict::TermOrdinal; use DocId; +use Result; fn compute_total_num_tokens(readers: &[SegmentReader], field: Field) -> u64 { let mut total_tokens = 0u64; @@ -40,15 +40,13 @@ fn compute_total_num_tokens(readers: &[SegmentReader], field: Field) -> u64 { total_tokens += reader.inverted_index(field).total_num_tokens(); } } - total_tokens - + count - .iter() - .cloned() - .enumerate() - .map(|(fieldnorm_ord, count)| { - count as u64 * FieldNormReader::id_to_fieldnorm(fieldnorm_ord as u8) as u64 - }) - .sum::() + total_tokens + count + .iter() + .cloned() + .enumerate() + .map(|(fieldnorm_ord, count)| { + count as u64 * u64::from(FieldNormReader::id_to_fieldnorm(fieldnorm_ord as u8)) + }).sum::() } pub struct IndexMerger { @@ -111,7 +109,7 @@ impl TermOrdinalMapping { .iter() .flat_map(|term_ordinals| term_ordinals.iter().cloned().max()) .max() - .unwrap_or(TermOrdinal::default()) + .unwrap_or_else(TermOrdinal::default) } } @@ -190,7 +188,7 @@ impl IndexMerger { `term_ordinal_mapping`."); self.write_hierarchical_facet_field( field, - term_ordinal_mapping, + &term_ordinal_mapping, fast_field_serializer, )?; } @@ -314,7 +312,7 @@ impl IndexMerger { fn write_hierarchical_facet_field( &self, field: Field, - term_ordinal_mappings: TermOrdinalMapping, + term_ordinal_mappings: &TermOrdinalMapping, fast_field_serializer: &mut FastFieldSerializer, ) -> Result<()> { // Multifastfield consists in 2 fastfields. @@ -393,8 +391,8 @@ impl IndexMerger { // We can now initialize our serializer, and push it the different values { - let mut serialize_vals = - fast_field_serializer.new_u64_fast_field_with_idx(field, min_value, max_value, 1)?; + let mut serialize_vals = fast_field_serializer + .new_u64_fast_field_with_idx(field, min_value, max_value, 1)?; for reader in &self.readers { let ff_reader: MultiValueIntFastFieldReader = reader.multi_fast_field_reader(field)?; @@ -440,7 +438,8 @@ impl IndexMerger { ) -> Result> { let mut positions_buffer: Vec = Vec::with_capacity(1_000); let mut delta_computer = DeltaComputer::new(); - let field_readers = self.readers + let field_readers = self + .readers .iter() .map(|reader| reader.inverted_index(indexed_field)) .collect::>(); @@ -524,8 +523,7 @@ impl IndexMerger { } } None - }) - .collect(); + }).collect(); // At this point, `segment_postings` contains the posting list // of all of the segments containing the given term. 
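`compute_total_num_tokens` above multiplies per-bucket counts by `FieldNormReader::id_to_fieldnorm`, the decoding half of the `FIELD_NORMS_TABLE` quantization reformatted earlier: a field length is encoded as the index of the largest table entry not exceeding it, and decoded by a plain table lookup. A small round-trip sketch over a toy 8-entry table (the real table has 256 entries; the functions mimic the idea, not the exact code):

```rust
/// Toy stand-in for FIELD_NORMS_TABLE: values must be sorted ascending.
const TOY_TABLE: [u32; 8] = [0, 1, 2, 4, 8, 16, 32, 64];

/// Encode: index of the largest entry <= fieldnorm (same idea as fieldnorm_to_id).
fn fieldnorm_to_id(fieldnorm: u32) -> u8 {
    TOY_TABLE
        .binary_search(&fieldnorm)
        .unwrap_or_else(|insertion_idx| insertion_idx - 1) as u8
}

/// Decode: plain table lookup (same idea as id_to_fieldnorm).
fn id_to_fieldnorm(id: u8) -> u32 {
    TOY_TABLE[id as usize]
}

fn main() {
    // Exact table values round-trip exactly...
    assert_eq!(id_to_fieldnorm(fieldnorm_to_id(16)), 16);
    // ...while in-between values are rounded down to the previous bucket.
    assert_eq!(id_to_fieldnorm(fieldnorm_to_id(20)), 16);
    assert_eq!(id_to_fieldnorm(fieldnorm_to_id(63)), 32);
}
```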
@@ -666,8 +664,7 @@ mod tests { TextFieldIndexing::default() .set_tokenizer("default") .set_index_option(IndexRecordOption::WithFreqs), - ) - .set_stored(); + ).set_stored(); let text_field = schema_builder.add_text_field("text", text_fieldtype); let score_fieldtype = schema::IntOptions::default().set_fast(Cardinality::SingleValue); let score_field = schema_builder.add_u64_field("score", score_fieldtype); @@ -769,24 +766,24 @@ mod tests { ); } { - let doc = searcher.doc(&DocAddress(0, 0)).unwrap(); - assert_eq!(doc.get_first(text_field).unwrap().text(), "af b"); + let doc = searcher.doc(DocAddress(0, 0)).unwrap(); + assert_eq!(doc.get_first(text_field).unwrap().text(), Some("af b")); } { - let doc = searcher.doc(&DocAddress(0, 1)).unwrap(); - assert_eq!(doc.get_first(text_field).unwrap().text(), "a b c"); + let doc = searcher.doc(DocAddress(0, 1)).unwrap(); + assert_eq!(doc.get_first(text_field).unwrap().text(), Some("a b c")); } { - let doc = searcher.doc(&DocAddress(0, 2)).unwrap(); - assert_eq!(doc.get_first(text_field).unwrap().text(), "a b c d"); + let doc = searcher.doc(DocAddress(0, 2)).unwrap(); + assert_eq!(doc.get_first(text_field).unwrap().text(), Some("a b c d")); } { - let doc = searcher.doc(&DocAddress(0, 3)).unwrap(); - assert_eq!(doc.get_first(text_field).unwrap().text(), "af b"); + let doc = searcher.doc(DocAddress(0, 3)).unwrap(); + assert_eq!(doc.get_first(text_field).unwrap().text(), Some("af b")); } { - let doc = searcher.doc(&DocAddress(0, 4)).unwrap(); - assert_eq!(doc.get_first(text_field).unwrap().text(), "a b c g"); + let doc = searcher.doc(DocAddress(0, 4)).unwrap(); + assert_eq!(doc.get_first(text_field).unwrap().text(), Some("a b c g")); } { let get_fast_vals = |terms: Vec| { @@ -821,8 +818,7 @@ mod tests { let text_fieldtype = schema::TextOptions::default() .set_indexing_options( TextFieldIndexing::default().set_index_option(IndexRecordOption::WithFreqs), - ) - .set_stored(); + ).set_stored(); let text_field = schema_builder.add_text_field("text", text_fieldtype); let score_fieldtype = schema::IntOptions::default().set_fast(Cardinality::SingleValue); let score_field = schema_builder.add_u64_field("score", score_fieldtype); diff --git a/src/indexer/mod.rs b/src/indexer/mod.rs index 783e787c8..3d29b38c0 100644 --- a/src/indexer/mod.rs +++ b/src/indexer/mod.rs @@ -16,6 +16,8 @@ mod segment_writer; mod stamper; pub(crate) use self::directory_lock::DirectoryLock; +pub use self::directory_lock::LockType; + pub use self::index_writer::IndexWriter; pub use self::log_merge_policy::LogMergePolicy; pub use self::merge_policy::{MergeCandidate, MergePolicy, NoMergePolicy}; diff --git a/src/indexer/segment_entry.rs b/src/indexer/segment_entry.rs index c35406ad1..7e23940d5 100644 --- a/src/indexer/segment_entry.rs +++ b/src/indexer/segment_entry.rs @@ -11,8 +11,8 @@ pub enum SegmentState { } impl SegmentState { - pub fn letter_code(&self) -> char { - match *self { + pub fn letter_code(self) -> char { + match self { SegmentState::InMerge => 'M', SegmentState::Ready => 'R', } diff --git a/src/indexer/segment_manager.rs b/src/indexer/segment_manager.rs index b82af0823..0e67d3b15 100644 --- a/src/indexer/segment_manager.rs +++ b/src/indexer/segment_manager.rs @@ -1,9 +1,8 @@ use super::segment_register::SegmentRegister; use core::SegmentId; use core::SegmentMeta; -use core::{LOCKFILE_FILEPATH, META_FILEPATH}; -use error::ErrorKind; -use error::Result as TantivyResult; +use core::META_FILEPATH; +use error::TantivyError; use indexer::delete_queue::DeleteCursor; use 
indexer::SegmentEntry; use std::collections::hash_set::HashSet; @@ -11,6 +10,7 @@ use std::fmt::{self, Debug, Formatter}; use std::path::PathBuf; use std::sync::RwLock; use std::sync::{RwLockReadGuard, RwLockWriteGuard}; +use Result as TantivyResult; #[derive(Default)] struct SegmentRegisters { @@ -78,10 +78,13 @@ impl SegmentManager { registers_lock.committed.len() + registers_lock.uncommitted.len() } + /// List the files that are useful to the index. + /// + /// This does not include lock files, or files that are obsolete + /// but have not yet been deleted by the garbage collector. pub fn list_files(&self) -> HashSet { let mut files = HashSet::new(); files.insert(META_FILEPATH.clone()); - files.insert(LOCKFILE_FILEPATH.clone()); for segment_meta in SegmentMeta::all() { files.extend(segment_meta.list_files()); } @@ -141,7 +144,7 @@ impl SegmentManager { let error_msg = "Merge operation sent for segments that are not \ all uncommited or commited." .to_string(); - bail!(ErrorKind::InvalidArgument(error_msg)) + return Err(TantivyError::InvalidArgument(error_msg)); } Ok(segment_entries) } diff --git a/src/indexer/segment_register.rs b/src/indexer/segment_register.rs index c455d3091..c0c883e15 100644 --- a/src/indexer/segment_register.rs +++ b/src/indexer/segment_register.rs @@ -51,7 +51,8 @@ impl SegmentRegister { } pub fn segment_metas(&self) -> Vec { - let mut segment_ids: Vec = self.segment_states + let mut segment_ids: Vec = self + .segment_states .values() .map(|segment_entry| segment_entry.meta().clone()) .collect(); diff --git a/src/indexer/segment_updater.rs b/src/indexer/segment_updater.rs index faaef38c0..1b2cd7c85 100644 --- a/src/indexer/segment_updater.rs +++ b/src/indexer/segment_updater.rs @@ -7,7 +7,7 @@ use core::SegmentMeta; use core::SerializableSegment; use core::META_FILEPATH; use directory::{Directory, DirectoryClone}; -use error::{Error, ErrorKind, Result, ResultExt}; +use error::TantivyError; use futures::oneshot; use futures::sync::oneshot::Receiver; use futures::Future; @@ -34,6 +34,7 @@ use std::sync::Arc; use std::sync::RwLock; use std::thread; use std::thread::JoinHandle; +use Result; /// Save the index meta file. /// This operation is atomic : @@ -71,7 +72,7 @@ pub fn save_metas( payload, }; let mut buffer = serde_json::to_vec_pretty(&metas)?; - write!(&mut buffer, "\n")?; + writeln!(&mut buffer)?; directory.atomic_write(&META_FILEPATH, &buffer[..])?; debug!("Saved metas {:?}", serde_json::to_string_pretty(&metas)); Ok(()) @@ -114,12 +115,9 @@ fn perform_merge( // ... we just serialize this index merger in our new segment // to merge the two segments. 
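`save_metas` above builds the whole JSON payload in memory, appends a trailing newline (`writeln!` replacing the old `write!(&mut buffer, "\n")`), and hands it to `Directory::atomic_write`, which the surrounding doc comment describes as atomic, so readers never observe a half-written `meta.json`. Below is a rough illustration of one common way to get that guarantee on a plain filesystem (write to a temp file, then rename); it is a generic sketch, not a claim about how tantivy's `Directory` implementations do it, and the file names are invented.

```rust
use std::fs;
use std::io::{self, Write};
use std::path::Path;

/// Write `data` to `path` so that readers see either the old or the new
/// content, never a truncated file: write a sibling temp file, flush it,
/// then rename it over the target.
fn atomic_write(path: &Path, data: &[u8]) -> io::Result<()> {
    let tmp_path = path.with_extension("tmp");
    {
        let mut tmp = fs::File::create(&tmp_path)?;
        tmp.write_all(data)?;
        tmp.sync_all()?; // make sure the bytes hit the disk before the rename
    }
    fs::rename(&tmp_path, path) // rename is atomic on POSIX filesystems
}

fn main() -> io::Result<()> {
    // Build the payload in memory first, with a trailing newline,
    // mirroring the `writeln!(&mut buffer)?` in `save_metas`.
    let mut buffer = Vec::new();
    buffer.extend_from_slice(br#"{"segments": [], "opstamp": 0}"#);
    writeln!(&mut buffer)?;
    atomic_write(Path::new("meta.json"), &buffer)
}
```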
- let segment_serializer = SegmentSerializer::for_segment(&mut merged_segment) - .chain_err(|| "Creating index serializer failed")?; + let segment_serializer = SegmentSerializer::for_segment(&mut merged_segment)?; - let num_docs = merger - .write(segment_serializer) - .chain_err(|| "Serializing merged index failed")?; + let num_docs = merger.write(segment_serializer)?; let segment_meta = SegmentMeta::new(merged_segment.id(), num_docs); @@ -186,7 +184,7 @@ impl SegmentUpdater { fn run_async T>( &self, f: F, - ) -> CpuFuture { + ) -> CpuFuture { let me_clone = self.clone(); self.0.pool.spawn_fn(move || Ok(f(me_clone))) } @@ -338,8 +336,7 @@ impl SegmentUpdater { .unwrap() .remove(&merging_thread_id); Ok(()) - }) - .expect("Failed to spawn a thread."); + }).expect("Failed to spawn a thread."); self.0 .merging_threads .write() @@ -463,7 +460,7 @@ impl SegmentUpdater { merging_thread_handle .join() .map(|_| ()) - .map_err(|_| ErrorKind::ErrorInThread("Merging thread failed.".into()))?; + .map_err(|_| TantivyError::ErrorInThread("Merging thread failed.".into()))?; } // Our merging thread may have queued their completed self.run_async(move |_| {}).wait()?; diff --git a/src/indexer/segment_writer.rs b/src/indexer/segment_writer.rs index 9627d60ad..ce4b1eb68 100644 --- a/src/indexer/segment_writer.rs +++ b/src/indexer/segment_writer.rs @@ -49,20 +49,20 @@ impl SegmentWriter { ) -> Result { let segment_serializer = SegmentSerializer::for_segment(&mut segment)?; let multifield_postings = MultiFieldPostingsWriter::new(schema, table_bits); - let tokenizers = schema - .fields() - .iter() - .map(|field_entry| field_entry.field_type()) - .map(|field_type| match *field_type { - FieldType::Str(ref text_options) => text_options.get_indexing_options().and_then( - |text_index_option| { - let tokenizer_name = &text_index_option.tokenizer(); - segment.index().tokenizers().get(tokenizer_name) - }, - ), - _ => None, - }) - .collect(); + let tokenizers = + schema + .fields() + .iter() + .map(|field_entry| field_entry.field_type()) + .map(|field_type| match *field_type { + FieldType::Str(ref text_options) => text_options + .get_indexing_options() + .and_then(|text_index_option| { + let tokenizer_name = &text_index_option.tokenizer(); + segment.index().tokenizers().get(tokenizer_name) + }), + _ => None, + }).collect(); Ok(SegmentWriter { max_doc: 0, multifield_postings, @@ -117,8 +117,7 @@ impl SegmentWriter { _ => { panic!("Expected hierarchical facet"); } - }) - .collect(); + }).collect(); let mut term = Term::for_field(field); // we set the Term for facet_bytes in facets { let mut unordered_term_id_opt = None; @@ -146,8 +145,7 @@ impl SegmentWriter { .flat_map(|field_value| match *field_value.value() { Value::Str(ref text) => Some(text.as_str()), _ => None, - }) - .collect(); + }).collect(); if texts.is_empty() { 0 } else { diff --git a/src/lib.rs b/src/lib.rs old mode 100644 new mode 100755 index c01226c55..7aa8572ff --- a/src/lib.rs +++ b/src/lib.rs @@ -1,13 +1,10 @@ #![doc(html_logo_url = "http://fulmicoton.com/tantivy-logo/tantivy-logo.png")] -#![cfg_attr(feature = "cargo-clippy", allow(module_inception))] -#![cfg_attr(feature = "cargo-clippy", allow(inline_always))] #![cfg_attr(all(feature = "unstable", test), feature(test))] +#![cfg_attr(feature = "cargo-clippy", feature(tool_lints))] +#![cfg_attr(feature = "cargo-clippy", allow(clippy::module_inception))] #![doc(test(attr(allow(unused_variables), deny(warnings))))] -#![allow(unknown_lints)] -#![allow(new_without_default)] 
-#![allow(decimal_literal_representation)] #![warn(missing_docs)] -#![recursion_limit="80"] +#![recursion_limit = "80"] //! # `tantivy` //! @@ -96,7 +93,7 @@ //! // most relevant doc ids... //! let doc_addresses = top_collector.docs(); //! for doc_address in doc_addresses { -//! let retrieved_doc = searcher.doc(&doc_address)?; +//! let retrieved_doc = searcher.doc(doc_address)?; //! println!("{}", schema.to_json(&retrieved_doc)); //! } //! @@ -124,7 +121,7 @@ extern crate serde_json; extern crate log; #[macro_use] -extern crate error_chain; +extern crate failure; #[cfg(feature = "mmap")] extern crate atomicwrites; @@ -133,7 +130,6 @@ extern crate bit_set; extern crate bitpacking; extern crate byteorder; -#[macro_use] extern crate combine; extern crate crossbeam; @@ -143,6 +139,7 @@ extern crate fst; extern crate fst_regex; extern crate futures; extern crate futures_cpupool; +extern crate htmlescape; extern crate itertools; extern crate levenshtein_automata; extern crate num_cpus; @@ -155,6 +152,8 @@ extern crate tempdir; extern crate tempfile; extern crate uuid; + + #[cfg(test)] #[macro_use] extern crate matches; @@ -165,27 +164,38 @@ extern crate winapi; #[cfg(test)] extern crate rand; +#[cfg(test)] +#[macro_use] +extern crate maplit; + #[cfg(all(test, feature = "unstable"))] extern crate test; -extern crate tinysegmenter; - #[macro_use] extern crate downcast; +#[macro_use] +extern crate fail; + #[cfg(test)] mod functional_test; #[macro_use] mod macros; -pub use error::{Error, ErrorKind, ResultExt}; +pub use error::TantivyError; + +#[deprecated( + since = "0.7.0", + note = "please use `tantivy::TantivyError` instead" +)] +pub use error::TantivyError as Error; extern crate census; extern crate owned_read; /// Tantivy result. -pub type Result = std::result::Result; +pub type Result = std::result::Result; mod common; mod core; @@ -199,13 +209,17 @@ pub mod collector; pub mod directory; pub mod fastfield; pub mod fieldnorm; -pub mod postings; pub(crate) mod positions; +pub mod postings; pub mod query; pub mod schema; +pub mod space_usage; pub mod store; pub mod termdict; +mod snippet; +pub use self::snippet::SnippetGenerator; + mod docset; pub use self::docset::{DocSet, SkipResult}; @@ -258,12 +272,12 @@ impl DocAddress { /// The segment ordinal is an id identifying the segment /// hosting the document. It is only meaningful, in the context /// of a searcher. 
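`DocAddress` is a small copyable pair, so the 0.7 cleanup has its accessors (and `Searcher::doc`) take it by value instead of by reference. A minimal standalone illustration of that shape, with invented example values:

```rust
type SegmentLocalId = u32;
type DocId = u32;

/// (segment ordinal, doc id within that segment) — cheap to copy around.
#[derive(Clone, Copy, Debug)]
struct DocAddress(SegmentLocalId, DocId);

impl DocAddress {
    /// Ordinal of the segment hosting the document, within one searcher.
    fn segment_ord(self) -> SegmentLocalId {
        self.0
    }

    /// Doc id local to that segment.
    fn doc(self) -> DocId {
        self.1
    }
}

fn main() {
    let addr = DocAddress(0, 4);
    // Taking `self` by value is fine: the address is `Copy`,
    // so it can still be used after the calls.
    assert_eq!(addr.segment_ord(), 0);
    assert_eq!(addr.doc(), 4);
}
```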
- pub fn segment_ord(&self) -> SegmentLocalId { + pub fn segment_ord(self) -> SegmentLocalId { self.0 } /// Return the segment local `DocId` - pub fn doc(&self) -> DocId { + pub fn doc(self) -> DocId { self.1 } } @@ -286,13 +300,13 @@ mod tests { use core::SegmentReader; use docset::DocSet; use query::BooleanQuery; + use rand::distributions::Bernoulli; use rand::distributions::Range; use rand::{Rng, SeedableRng, XorShiftRng}; use schema::*; use Index; use IndexWriter; use Postings; - use rand::distributions::Bernoulli; pub fn assert_nearly_equals(expected: f32, val: f32) { assert!( @@ -321,13 +335,7 @@ mod tests { .sample_iter(&Bernoulli::new(ratio)) .take(n as usize) .enumerate() - .filter_map(|(val, keep)| { - if keep { - Some(val as u32) - } else { - None - } - }) + .filter_map(|(val, keep)| if keep { Some(val as u32) } else { None }) .collect() } @@ -895,11 +903,11 @@ mod tests { assert_eq!(document.len(), 3); let values = document.get_all(text_field); assert_eq!(values.len(), 2); - assert_eq!(values[0].text(), "tantivy"); - assert_eq!(values[1].text(), "some other value"); + assert_eq!(values[0].text(), Some("tantivy")); + assert_eq!(values[1].text(), Some("some other value")); let values = document.get_all(other_text_field); assert_eq!(values.len(), 1); - assert_eq!(values[0].text(), "short"); + assert_eq!(values[0].text(), Some("short")); } #[test] diff --git a/src/macros.rs b/src/macros.rs index 5e3d9b023..87d4d926e 100644 --- a/src/macros.rs +++ b/src/macros.rs @@ -1,7 +1,3 @@ -macro_rules! get( - ($e:expr) => (match $e { Some(e) => e, None => return None }) -); - /// `doc!` is a shortcut that helps building `Document` /// objects. /// diff --git a/src/positions/mod.rs b/src/positions/mod.rs index f867358f5..ab0375355 100644 --- a/src/positions/mod.rs +++ b/src/positions/mod.rs @@ -1,4 +1,3 @@ - /// Positions are stored in three parts and over two files. // /// The `SegmentComponent::POSITIONS` file contains all of the bitpacked positions delta, @@ -24,13 +23,12 @@ /// The long skip structure makes it possible to skip rapidly to the a checkpoint close to this /// value, and then skip normally. /// - mod reader; mod serializer; pub use self::reader::PositionReader; pub use self::serializer::PositionSerializer; -use bitpacking::{BitPacker4x, BitPacker}; +use bitpacking::{BitPacker, BitPacker4x}; const COMPRESSION_BLOCK_SIZE: usize = BitPacker4x::BLOCK_LEN; const LONG_SKIP_IN_BLOCKS: usize = 1_024; @@ -43,10 +41,10 @@ lazy_static! 
{ #[cfg(test)] pub mod tests { - use std::iter; - use super::{PositionSerializer, PositionReader}; + use super::{PositionReader, PositionSerializer}; use directory::ReadOnlySource; use positions::COMPRESSION_BLOCK_SIZE; + use std::iter; fn create_stream_buffer(vals: &[u32]) -> (ReadOnlySource, ReadOnlySource) { let mut skip_buffer = vec![]; @@ -59,7 +57,10 @@ pub mod tests { } serializer.close().unwrap(); } - (ReadOnlySource::from(stream_buffer), ReadOnlySource::from(skip_buffer)) + ( + ReadOnlySource::from(stream_buffer), + ReadOnlySource::from(skip_buffer), + ) } #[test] @@ -103,7 +104,7 @@ pub mod tests { assert_eq!(skip.len(), 12); assert_eq!(stream.len(), 1168); - let mut position_reader = PositionReader::new(stream,skip, 0u64); + let mut position_reader = PositionReader::new(stream, skip, 0u64); let mut buf = [0u32; 7]; let mut c = 0; for _ in 0..100 { @@ -125,7 +126,7 @@ pub mod tests { let (stream, skip) = create_stream_buffer(&v[..]); assert_eq!(skip.len(), 15_749); assert_eq!(stream.len(), 1_000_000); - let mut position_reader = PositionReader::new(stream,skip, 128 * 1024); + let mut position_reader = PositionReader::new(stream, skip, 128 * 1024); let mut buf = [0u32; 1]; position_reader.read(&mut buf); assert_eq!(buf[0], CONST_VAL); @@ -137,12 +138,17 @@ pub mod tests { let (stream, skip) = create_stream_buffer(&v[..]); assert_eq!(skip.len(), 15_749); assert_eq!(stream.len(), 4_987_872); - for &offset in &[10, 128 * 1024, 128 * 1024 - 1, 128 * 1024 + 7, 128 * 10 * 1024 + 10] { - let mut position_reader = PositionReader::new(stream.clone(),skip.clone(), offset); + for &offset in &[ + 10, + 128 * 1024, + 128 * 1024 - 1, + 128 * 1024 + 7, + 128 * 10 * 1024 + 10, + ] { + let mut position_reader = PositionReader::new(stream.clone(), skip.clone(), offset); let mut buf = [0u32; 1]; position_reader.read(&mut buf); assert_eq!(buf[0], offset as u32); } } } - diff --git a/src/positions/reader.rs b/src/positions/reader.rs index 8b5dd70bb..470abaaa2 100644 --- a/src/positions/reader.rs +++ b/src/positions/reader.rs @@ -1,12 +1,12 @@ -use bitpacking::{BitPacker4x, BitPacker}; -use owned_read::OwnedRead; -use common::{BinarySerializable, FixedSize}; -use postings::compression::compressed_block_size; -use directory::ReadOnlySource; -use positions::COMPRESSION_BLOCK_SIZE; -use positions::LONG_SKIP_IN_BLOCKS; -use positions::LONG_SKIP_INTERVAL; use super::BIT_PACKER; +use bitpacking::{BitPacker, BitPacker4x}; +use common::{BinarySerializable, FixedSize}; +use directory::ReadOnlySource; +use owned_read::OwnedRead; +use positions::COMPRESSION_BLOCK_SIZE; +use positions::LONG_SKIP_INTERVAL; +use positions::LONG_SKIP_IN_BLOCKS; +use postings::compression::compressed_block_size; pub struct PositionReader { skip_read: OwnedRead, @@ -18,7 +18,6 @@ pub struct PositionReader { // of the block of the next int to read. } - // `ahead` represents the offset of the block currently loaded // compared to the cursor of the actual stream. 
// @@ -32,7 +31,8 @@ fn read_impl( buffer: &mut [u32; 128], mut inner_offset: usize, num_bits: &[u8], - output: &mut [u32]) -> usize { + output: &mut [u32], +) -> usize { let mut output_start = 0; let mut output_len = output.len(); let mut ahead = 0; @@ -47,8 +47,7 @@ fn read_impl( output_start += available_len; inner_offset = 0; let num_bits = num_bits[ahead]; - BitPacker4x::new() - .decompress(position, &mut buffer[..], num_bits); + BitPacker4x::new().decompress(position, &mut buffer[..], num_bits); let block_len = compressed_block_size(num_bits); position = &position[block_len..]; ahead += 1; @@ -56,11 +55,12 @@ fn read_impl( } } - impl PositionReader { - pub fn new(position_source: ReadOnlySource, - skip_source: ReadOnlySource, - offset: u64) -> PositionReader { + pub fn new( + position_source: ReadOnlySource, + skip_source: ReadOnlySource, + offset: u64, + ) -> PositionReader { let skip_len = skip_source.len(); let (body, footer) = skip_source.split(skip_len - u32::SIZE_IN_BYTES); let num_long_skips = u32::deserialize(&mut footer.as_slice()).expect("Index corrupted"); @@ -70,7 +70,8 @@ impl PositionReader { let small_skip = (offset - (long_skip_id as u64) * (LONG_SKIP_INTERVAL as u64)) as usize; let offset_num_bytes: u64 = { if long_skip_id > 0 { - let mut long_skip_blocks: &[u8] = &long_skips.as_slice()[(long_skip_id - 1) * 8..][..8]; + let mut long_skip_blocks: &[u8] = + &long_skips.as_slice()[(long_skip_id - 1) * 8..][..8]; u64::deserialize(&mut long_skip_blocks).expect("Index corrupted") * 16 } else { 0 @@ -79,13 +80,13 @@ impl PositionReader { let mut position_read = OwnedRead::new(position_source); position_read.advance(offset_num_bytes as usize); let mut skip_read = OwnedRead::new(skip_body); - skip_read.advance(long_skip_id * LONG_SKIP_IN_BLOCKS); + skip_read.advance(long_skip_id * LONG_SKIP_IN_BLOCKS); let mut position_reader = PositionReader { skip_read, position_read, inner_offset: 0, buffer: Box::new([0u32; 128]), - ahead: None + ahead: None, }; position_reader.skip(small_skip); position_reader @@ -108,7 +109,8 @@ impl PositionReader { self.buffer.as_mut(), self.inner_offset, &skip_data[1..], - output)); + output, + )); } /// Skip the next `skip_len` integer. @@ -118,27 +120,25 @@ impl PositionReader { /// /// May panic if the end of the stream is reached. 
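`PositionReader::new` above resolves an absolute offset in two steps: a long-skip entry every `LONG_SKIP_IN_BLOCKS` (1,024) bitpacked blocks of `COMPRESSION_BLOCK_SIZE` (128) integers jumps close to the target, and the remaining `small_skip` is consumed block by block. The sketch below isolates just that arithmetic, assuming `LONG_SKIP_INTERVAL` is the product of the two constants, which is consistent with the offsets exercised in the tests above.

```rust
const COMPRESSION_BLOCK_SIZE: usize = 128; // ints per bitpacked block
const LONG_SKIP_IN_BLOCKS: usize = 1_024; // blocks per long-skip entry
const LONG_SKIP_INTERVAL: u64 = (COMPRESSION_BLOCK_SIZE * LONG_SKIP_IN_BLOCKS) as u64;

/// Split an absolute integer offset into (long_skip_id, small_skip):
/// which long-skip entry to jump to, and how many ints remain to skip
/// after that jump.
fn resolve_offset(offset: u64) -> (usize, usize) {
    let long_skip_id = (offset / LONG_SKIP_INTERVAL) as usize;
    let small_skip = (offset - long_skip_id as u64 * LONG_SKIP_INTERVAL) as usize;
    (long_skip_id, small_skip)
}

fn main() {
    // 128 * 1024 = 131_072 positions per long-skip entry.
    assert_eq!(resolve_offset(10), (0, 10));
    assert_eq!(resolve_offset(131_072), (1, 0));
    assert_eq!(resolve_offset(131_072 + 7), (1, 7));
    assert_eq!(resolve_offset(10 * 131_072 + 10), (10, 10));
}
```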
pub fn skip(&mut self, skip_len: usize) { - let skip_len_plus_inner_offset = skip_len + self.inner_offset; let num_blocks_to_advance = skip_len_plus_inner_offset / COMPRESSION_BLOCK_SIZE; self.inner_offset = skip_len_plus_inner_offset % COMPRESSION_BLOCK_SIZE; - self.ahead = self.ahead - .and_then(|num_blocks| { - if num_blocks >= num_blocks_to_advance { - Some(num_blocks_to_advance - num_blocks_to_advance) - } else { - None - } - }); + self.ahead = self.ahead.and_then(|num_blocks| { + if num_blocks >= num_blocks_to_advance { + Some(num_blocks - num_blocks_to_advance) + } else { + None + } + }); - let skip_len = self.skip_read - .as_ref()[..num_blocks_to_advance] + let skip_len = self.skip_read.as_ref()[..num_blocks_to_advance] .iter() .cloned() .map(|num_bit| num_bit as usize) - .sum::() * (COMPRESSION_BLOCK_SIZE / 8); + .sum::() + * (COMPRESSION_BLOCK_SIZE / 8); self.skip_read.advance(num_blocks_to_advance); self.position_read.advance(skip_len); diff --git a/src/positions/serializer.rs b/src/positions/serializer.rs index 598c26363..68c6885cd 100644 --- a/src/positions/serializer.rs +++ b/src/positions/serializer.rs @@ -1,8 +1,8 @@ -use std::io; -use bitpacking::BitPacker; -use positions::{COMPRESSION_BLOCK_SIZE, LONG_SKIP_INTERVAL}; -use common::BinarySerializable; use super::BIT_PACKER; +use bitpacking::BitPacker; +use common::BinarySerializable; +use positions::{COMPRESSION_BLOCK_SIZE, LONG_SKIP_INTERVAL}; +use std::io; pub struct PositionSerializer { write_stream: W, @@ -23,7 +23,7 @@ impl PositionSerializer { buffer: vec![0u8; 128 * 4], num_ints: 0u64, long_skips: Vec::new(), - cumulated_num_bits: 0u64 + cumulated_num_bits: 0u64, } } @@ -31,7 +31,6 @@ impl PositionSerializer { self.num_ints } - fn remaining_block_len(&self) -> usize { COMPRESSION_BLOCK_SIZE - self.block.len() } @@ -52,8 +51,8 @@ impl PositionSerializer { fn flush_block(&mut self) -> io::Result<()> { let num_bits = BIT_PACKER.num_bits(&self.block[..]); - self.cumulated_num_bits += num_bits as u64; - self.write_skiplist.write(&[num_bits])?; + self.cumulated_num_bits += u64::from(num_bits); + self.write_skiplist.write_all(&[num_bits])?; let written_len = BIT_PACKER.compress(&self.block[..], &mut self.buffer, num_bits); self.write_stream.write_all(&self.buffer[..written_len])?; self.block.clear(); diff --git a/src/postings/compression/mod.rs b/src/postings/compression/mod.rs index 6b05010c6..810cf28e7 100644 --- a/src/postings/compression/mod.rs +++ b/src/postings/compression/mod.rs @@ -28,14 +28,16 @@ impl BlockEncoder { pub fn compress_block_sorted(&mut self, block: &[u32], offset: u32) -> (u8, &[u8]) { let num_bits = self.bitpacker.num_bits_sorted(offset, block); - let written_size = self.bitpacker - .compress_sorted(offset, block, &mut self.output[..], num_bits); + let written_size = + self.bitpacker + .compress_sorted(offset, block, &mut self.output[..], num_bits); (num_bits, &self.output[..written_size]) } pub fn compress_block_unsorted(&mut self, block: &[u32]) -> (u8, &[u8]) { let num_bits = self.bitpacker.num_bits(block); - let written_size = self.bitpacker + let written_size = self + .bitpacker .compress(block, &mut self.output[..], num_bits); (num_bits, &self.output[..written_size]) } @@ -62,19 +64,21 @@ impl BlockDecoder { } } - pub fn uncompress_block_sorted(&mut self, compressed_data: &[u8], offset: u32, num_bits: u8) -> usize { + pub fn uncompress_block_sorted( + &mut self, + compressed_data: &[u8], + offset: u32, + num_bits: u8, + ) -> usize { self.output_len = COMPRESSION_BLOCK_SIZE; - 
self.bitpacker.decompress_sorted( - offset, - &compressed_data, - &mut self.output, - num_bits, - ) + self.bitpacker + .decompress_sorted(offset, &compressed_data, &mut self.output, num_bits) } pub fn uncompress_block_unsorted(&mut self, compressed_data: &[u8], num_bits: u8) -> usize { self.output_len = COMPRESSION_BLOCK_SIZE; - self.bitpacker.decompress(&compressed_data, &mut self.output, num_bits) + self.bitpacker + .decompress(&compressed_data, &mut self.output, num_bits) } #[inline] @@ -88,7 +92,6 @@ impl BlockDecoder { } } - pub trait VIntEncoder { /// Compresses an array of `u32` integers, /// using [delta-encoding](https://en.wikipedia.org/wiki/Delta_ encoding) diff --git a/src/postings/compression/vint.rs b/src/postings/compression/vint.rs index 515510c54..87a672e64 100644 --- a/src/postings/compression/vint.rs +++ b/src/postings/compression/vint.rs @@ -1,9 +1,5 @@ #[inline(always)] -pub fn compress_sorted<'a>( - input: &[u32], - output: &'a mut [u8], - mut offset: u32, -) -> &'a [u8] { +pub fn compress_sorted<'a>(input: &[u32], output: &'a mut [u8], mut offset: u32) -> &'a [u8] { let mut byte_written = 0; for &v in input { let mut to_encode: u32 = v - offset; @@ -46,47 +42,41 @@ pub(crate) fn compress_unsorted<'a>(input: &[u32], output: &'a mut [u8]) -> &'a } #[inline(always)] -pub fn uncompress_sorted<'a>( - compressed_data: &'a [u8], - output: &mut [u32], - offset: u32, -) -> usize { +pub fn uncompress_sorted<'a>(compressed_data: &'a [u8], output: &mut [u32], offset: u32) -> usize { let mut read_byte = 0; let mut result = offset; - let num_els = output.len(); - for i in 0..num_els { + for output_mut in output.iter_mut() { let mut shift = 0u32; loop { let cur_byte = compressed_data[read_byte]; read_byte += 1; - result += ((cur_byte % 128u8) as u32) << shift; + result += u32::from(cur_byte % 128u8) << shift; if cur_byte & 128u8 != 0u8 { break; } shift += 7; } - output[i] = result; + *output_mut = result; } read_byte } #[inline(always)] -pub(crate) fn uncompress_unsorted<'a>(compressed_data: &'a [u8], output: &mut [u32]) -> usize { +pub(crate) fn uncompress_unsorted(compressed_data: &[u8], output_arr: &mut [u32]) -> usize { let mut read_byte = 0; - let num_els = output.len(); - for i in 0..num_els { + for output_mut in output_arr.iter_mut() { let mut result = 0u32; let mut shift = 0u32; loop { let cur_byte = compressed_data[read_byte]; read_byte += 1; - result += ((cur_byte % 128u8) as u32) << shift; + result += u32::from(cur_byte % 128u8) << shift; if cur_byte & 128u8 != 0u8 { break; } shift += 7; } - output[i] = result; + *output_mut = result; } read_byte } diff --git a/src/postings/mod.rs b/src/postings/mod.rs index 85852ed22..ec0940ff0 100644 --- a/src/postings/mod.rs +++ b/src/postings/mod.rs @@ -2,6 +2,7 @@ Postings module (also called inverted index) */ +pub(crate) mod compression; /// Postings module /// /// Postings, also called inverted lists, is the key datastructure @@ -11,18 +12,17 @@ mod postings_writer; mod recorder; mod segment_postings; mod serializer; -pub(crate) mod compression; +mod skip; mod stacker; mod term_info; -mod skip; pub(crate) use self::postings_writer::MultiFieldPostingsWriter; pub use self::serializer::{FieldSerializer, InvertedIndexSerializer}; +use self::compression::COMPRESSION_BLOCK_SIZE; pub use self::postings::Postings; -pub use self::term_info::TermInfo; pub(crate) use self::skip::SkipReader; -use self::compression::{COMPRESSION_BLOCK_SIZE}; +pub use self::term_info::TermInfo; pub use self::segment_postings::{BlockSegmentPostings, 
SegmentPostings}; @@ -34,7 +34,7 @@ pub(crate) const USE_SKIP_INFO_LIMIT: u32 = COMPRESSION_BLOCK_SIZE as u32; pub(crate) type UnorderedTermId = u64; -#[allow(enum_variant_names)] +#[cfg_attr(feature = "cargo-clippy", allow(clippy::enum_variant_names))] #[derive(Debug, PartialEq, Clone, Copy, Eq)] pub(crate) enum FreqReadingOption { NoFreq, @@ -71,8 +71,7 @@ pub mod tests { let mut segment = index.new_segment(); let mut posting_serializer = InvertedIndexSerializer::open(&mut segment).unwrap(); { - let mut field_serializer = posting_serializer - .new_field(text_field, 120 * 4).unwrap(); + let mut field_serializer = posting_serializer.new_field(text_field, 120 * 4).unwrap(); field_serializer.new_term("abc".as_bytes()).unwrap(); for doc_id in 0u32..120u32 { let delta_positions = vec![1, 2, 3, 2]; @@ -512,13 +511,13 @@ pub mod tests { let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap(); for _ in 0..posting_list_size { let mut doc = Document::default(); - if rng.gen_bool(1f64/ 15f64) { + if rng.gen_bool(1f64 / 15f64) { doc.add_text(text_field, "a"); } - if rng.gen_bool(1f64/ 10f64) { + if rng.gen_bool(1f64 / 10f64) { doc.add_text(text_field, "b"); } - if rng.gen_bool(1f64/ 5f64) { + if rng.gen_bool(1f64 / 5f64) { doc.add_text(text_field, "c"); } doc.add_text(text_field, "d"); diff --git a/src/postings/postings_writer.rs b/src/postings/postings_writer.rs index fe56795e7..dd0f691ae 100644 --- a/src/postings/postings_writer.rs +++ b/src/postings/postings_writer.rs @@ -15,7 +15,7 @@ use tokenizer::TokenStream; use DocId; use Result; -fn posting_from_field_entry<'a>(field_entry: &FieldEntry) -> Box { +fn posting_from_field_entry(field_entry: &FieldEntry) -> Box { match *field_entry.field_type() { FieldType::Str(ref text_options) => text_options .get_indexing_options() @@ -29,8 +29,7 @@ fn posting_from_field_entry<'a>(field_entry: &FieldEntry) -> Box IndexRecordOption::WithFreqsAndPositions => { SpecializedPostingsWriter::::new_boxed() } - }) - .unwrap_or_else(|| SpecializedPostingsWriter::::new_boxed()), + }).unwrap_or_else(|| SpecializedPostingsWriter::::new_boxed()), FieldType::U64(_) | FieldType::I64(_) | FieldType::HierarchicalFacet => { SpecializedPostingsWriter::::new_boxed() } @@ -94,11 +93,12 @@ impl MultiFieldPostingsWriter { &self, serializer: &mut InvertedIndexSerializer, ) -> Result>> { - let mut term_offsets: Vec<(&[u8], Addr, UnorderedTermId)> = self.term_index + let mut term_offsets: Vec<(&[u8], Addr, UnorderedTermId)> = self + .term_index .iter() .map(|(term_bytes, addr, bucket_id)| (term_bytes, addr, bucket_id as UnorderedTermId)) .collect(); - term_offsets.sort_by_key(|&(k, _, _)| k); + term_offsets.sort_unstable_by_key(|&(k, _, _)| k); let mut offsets: Vec<(Field, usize)> = vec![]; let term_offsets_it = term_offsets @@ -127,8 +127,8 @@ impl MultiFieldPostingsWriter { let field_entry = self.schema.get_field_entry(field); - match field_entry.field_type() { - &FieldType::Str(_) | &FieldType::HierarchicalFacet => { + match *field_entry.field_type() { + FieldType::Str(_) | FieldType::HierarchicalFacet => { // populating the (unordered term ord) -> (ordered term ord) mapping // for the field. 
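The comment above refers to the fact that the in-memory hash table hands out term ids in insertion order, while serialization needs terms in lexicographic order; sorting the `(term, addr, unordered id)` triples and enumerating them yields the `unordered id -> ordered term ordinal` map. A reduced sketch of that step, using plain `&str` terms and a `HashMap` instead of the arena addresses in the diff:

```rust
use std::collections::HashMap;

type UnorderedTermId = u64;
type TermOrdinal = u64;

/// Sort terms lexicographically and record, for each unordered id,
/// the ordinal of its term in sorted order.
fn build_mapping(terms: &[(&str, UnorderedTermId)]) -> HashMap<UnorderedTermId, TermOrdinal> {
    let mut sorted: Vec<(&str, UnorderedTermId)> = terms.to_vec();
    sorted.sort_unstable_by_key(|&(term, _)| term);
    sorted
        .into_iter()
        .enumerate()
        .map(|(term_ord, (_, unordered_id))| (unordered_id, term_ord as TermOrdinal))
        .collect()
}

fn main() {
    // Terms in insertion order, with the ids the hash table handed out.
    let terms = [("zebra", 0), ("apple", 1), ("mango", 2)];
    let mapping = build_mapping(&terms);
    // "apple" sorts first, "mango" second, "zebra" last.
    assert_eq!(mapping[&1], 0);
    assert_eq!(mapping[&2], 1);
    assert_eq!(mapping[&0], 2);
}
```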
let mut unordered_term_ids = term_offsets[start..stop] @@ -138,12 +138,11 @@ impl MultiFieldPostingsWriter { .enumerate() .map(|(term_ord, unord_term_id)| { (unord_term_id as UnorderedTermId, term_ord as TermOrdinal) - }) - .collect(); + }).collect(); unordered_term_mappings.insert(field, mapping); } - &FieldType::U64(_) | &FieldType::I64(_) => {} - &FieldType::Bytes => {} + FieldType::U64(_) | FieldType::I64(_) => {} + FieldType::Bytes => {} } let postings_writer = &self.per_field_postings_writers[field.0 as usize]; @@ -202,14 +201,11 @@ pub trait PostingsWriter { heap: &mut MemoryArena, ) -> u32 { let mut term = Term::for_field(field); - let num_tokens = { - let mut sink = |token: &Token| { - term.set_text(token.text.as_str()); - self.subscribe(term_index, doc_id, token.position as u32, &term, heap); - }; - token_stream.process(&mut sink) + let mut sink = |token: &Token| { + term.set_text(token.text.as_str()); + self.subscribe(term_index, doc_id, token.position as u32, &term, heap); }; - num_tokens + token_stream.process(&mut sink) } fn total_num_tokens(&self) -> u64; diff --git a/src/postings/recorder.rs b/src/postings/recorder.rs index e787ba5e9..c355a78ba 100644 --- a/src/postings/recorder.rs +++ b/src/postings/recorder.rs @@ -107,7 +107,8 @@ impl Recorder for TermFrequencyRecorder { fn serialize(&self, serializer: &mut FieldSerializer, heap: &MemoryArena) -> io::Result<()> { // the last document has not been closed... // its term freq is self.current_tf. - let mut doc_iter = self.stack + let mut doc_iter = self + .stack .iter(heap) .chain(Some(self.current_tf).into_iter()); diff --git a/src/postings/segment_postings.rs b/src/postings/segment_postings.rs index 74fcbc199..776844f2a 100644 --- a/src/postings/segment_postings.rs +++ b/src/postings/segment_postings.rs @@ -1,20 +1,20 @@ -use postings::compression::{BlockDecoder, VIntDecoder, COMPRESSION_BLOCK_SIZE}; -use DocId; use common::BitSet; use common::HasLen; -use postings::compression::compressed_block_size; +use common::{BinarySerializable, VInt}; use docset::{DocSet, SkipResult}; use fst::Streamer; +use owned_read::OwnedRead; +use positions::PositionReader; +use postings::compression::compressed_block_size; +use postings::compression::{BlockDecoder, VIntDecoder, COMPRESSION_BLOCK_SIZE}; use postings::serializer::PostingsSerializer; use postings::FreqReadingOption; use postings::Postings; -use owned_read::OwnedRead; -use common::{VInt, BinarySerializable}; -use postings::USE_SKIP_INFO_LIMIT; use postings::SkipReader; +use postings::USE_SKIP_INFO_LIMIT; use schema::IndexRecordOption; -use positions::PositionReader; use std::cmp::Ordering; +use DocId; const EMPTY_ARR: [u8; 0] = []; @@ -98,7 +98,7 @@ impl SegmentPostings { docs.len() as u32, OwnedRead::new(buffer), IndexRecordOption::Basic, - IndexRecordOption::Basic + IndexRecordOption::Basic, ); SegmentPostings::from_block_postings(block_segment_postings, None) } @@ -151,7 +151,11 @@ fn exponential_search(target: u32, arr: &[u32]) -> (usize, usize) { /// The target is assumed smaller or equal to the last element. 
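`search_within_block` just below combines two classic steps: an exponential (galloping) probe to bracket the target inside the 128-entry block, then a binary search inside that bracket, with `unwrap_or_else(|e| e)` turning a miss into the index of the first element `>= target`. Here is a standalone sketch of the technique, checked against a linear scan; it illustrates the idea rather than reproducing the exact code in the diff.

```rust
/// Find a half-open range `[start, end)` of `arr` guaranteed to contain the
/// first element >= target, by doubling the probe distance (galloping).
fn exponential_bracket(arr: &[u32], target: u32) -> (usize, usize) {
    let mut prev = 0;
    let mut probe = 1;
    while probe < arr.len() && arr[probe] < target {
        prev = probe;
        probe *= 2;
    }
    (prev, probe.min(arr.len() - 1) + 1)
}

/// Index of the first element >= target.
/// Assumes `arr` is sorted and the target does not exceed the last element.
fn search_within_block(arr: &[u32], target: u32) -> usize {
    let (start, end) = exponential_bracket(arr, target);
    start + arr[start..end].binary_search(&target).unwrap_or_else(|e| e)
}

fn main() {
    let block: Vec<u32> = (0..128u32).map(|i| i * 2).collect(); // 0, 2, 4, ..., 254
    for target in 0..=254u32 {
        // Reference answer from a plain linear scan.
        let expected = block.iter().position(|&doc| doc >= target).unwrap();
        assert_eq!(search_within_block(&block, target), expected);
    }
}
```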
fn search_within_block(block_docs: &[u32], target: u32) -> usize { let (start, end) = exponential_search(target, block_docs); - start.wrapping_add(block_docs[start..end].binary_search(&target).unwrap_or_else(|e| e)) + start.wrapping_add( + block_docs[start..end] + .binary_search(&target) + .unwrap_or_else(|e| e), + ) } impl DocSet for SegmentPostings { @@ -179,21 +183,20 @@ impl DocSet for SegmentPostings { // check if we need to go to the next block let need_positions = self.position_computer.is_some(); let mut sum_freqs_skipped: u32 = 0; - if !self.block_cursor - .docs() - .last() - .map(|doc| *doc >= target) - .unwrap_or(false) // there should always be at least a document in the block - // since advance returned. + if !self + .block_cursor + .docs() + .last() + .map(|doc| *doc >= target) + .unwrap_or(false) + // there should always be at least a document in the block + // since advance returned. { // we are not in the right block. // // First compute all of the freqs skipped from the current block. if need_positions { - sum_freqs_skipped = self.block_cursor - .freqs()[self.cur..] - .iter() - .sum(); + sum_freqs_skipped = self.block_cursor.freqs()[self.cur..].iter().sum(); match self.block_cursor.skip_to(target) { BlockSegmentPostingsSkipResult::Success(block_skip_freqs) => { sum_freqs_skipped += block_skip_freqs; @@ -202,11 +205,11 @@ impl DocSet for SegmentPostings { return SkipResult::End; } } - } else { + } else if self.block_cursor.skip_to(target) + == BlockSegmentPostingsSkipResult::Terminated + { // no positions needed. no need to sum freqs. - if self.block_cursor.skip_to(target) == BlockSegmentPostingsSkipResult::Terminated { - return SkipResult::End; - } + return SkipResult::End; } self.cur = 0; } @@ -215,9 +218,13 @@ impl DocSet for SegmentPostings { let block_docs = self.block_cursor.docs(); debug_assert!(target >= self.doc()); - let new_cur = self.cur.wrapping_add(search_within_block(&block_docs[self.cur..], target)); + let new_cur = self + .cur + .wrapping_add(search_within_block(&block_docs[self.cur..], target)); if need_positions { - sum_freqs_skipped += self.block_cursor.freqs()[self.cur..new_cur].iter().sum::(); + sum_freqs_skipped += self.block_cursor.freqs()[self.cur..new_cur] + .iter() + .sum::(); self.position_computer .as_mut() .unwrap() @@ -229,9 +236,9 @@ impl DocSet for SegmentPostings { let doc = block_docs[new_cur]; debug_assert!(doc >= target); if doc == target { - return SkipResult::Reached; + SkipResult::Reached } else { - return SkipResult::OverStep; + SkipResult::OverStep } } @@ -330,7 +337,10 @@ pub struct BlockSegmentPostings { skip_reader: SkipReader, } -fn split_into_skips_and_postings(doc_freq: u32, mut data: OwnedRead) -> (Option, OwnedRead) { +fn split_into_skips_and_postings( + doc_freq: u32, + mut data: OwnedRead, +) -> (Option, OwnedRead) { if doc_freq >= USE_SKIP_INFO_LIMIT { let skip_len = VInt::deserialize(&mut data).expect("Data corrupted").0 as usize; let mut postings_data = data.clone(); @@ -345,7 +355,7 @@ fn split_into_skips_and_postings(doc_freq: u32, mut data: OwnedRead) -> (Option< #[derive(Debug, Eq, PartialEq)] pub enum BlockSegmentPostingsSkipResult { Terminated, - Success(u32) //< number of term freqs to skip + Success(u32), //< number of term freqs to skip } impl BlockSegmentPostings { @@ -353,7 +363,7 @@ impl BlockSegmentPostings { doc_freq: u32, data: OwnedRead, record_option: IndexRecordOption, - requested_option: IndexRecordOption + requested_option: IndexRecordOption, ) -> BlockSegmentPostings { let freq_reading_option = 
match (record_option, requested_option) { (IndexRecordOption::Basic, _) => FreqReadingOption::NoFreq, @@ -362,11 +372,10 @@ impl BlockSegmentPostings { }; let (skip_data_opt, postings_data) = split_into_skips_and_postings(doc_freq, data); - let skip_reader = - match skip_data_opt { - Some(skip_data) => SkipReader::new(skip_data, record_option), - None => SkipReader::new(OwnedRead::new(&EMPTY_ARR[..]), record_option) - }; + let skip_reader = match skip_data_opt { + Some(skip_data) => SkipReader::new(skip_data, record_option), + None => SkipReader::new(OwnedRead::new(&EMPTY_ARR[..]), record_option), + }; let doc_freq = doc_freq as usize; let num_vint_docs = doc_freq % COMPRESSION_BLOCK_SIZE; BlockSegmentPostings { @@ -450,7 +459,6 @@ impl BlockSegmentPostings { self.doc_decoder.output_len } - /// position on a block that may contains `doc_id`. /// Always advance the current block. /// @@ -461,9 +469,7 @@ impl BlockSegmentPostings { /// Returns false iff all of the document remaining are smaller than /// `doc_id`. In that case, all of these document are consumed. /// - pub fn skip_to(&mut self, - target_doc: DocId) -> BlockSegmentPostingsSkipResult { - + pub fn skip_to(&mut self, target_doc: DocId) -> BlockSegmentPostingsSkipResult { let mut skip_freqs = 0u32; while self.skip_reader.advance() { if self.skip_reader.doc() >= target_doc { @@ -472,11 +478,11 @@ impl BlockSegmentPostings { // // We found our block! let num_bits = self.skip_reader.doc_num_bits(); - let num_consumed_bytes = self.doc_decoder - .uncompress_block_sorted( - self.remaining_data.as_ref(), - self.doc_offset, - num_bits); + let num_consumed_bytes = self.doc_decoder.uncompress_block_sorted( + self.remaining_data.as_ref(), + self.doc_offset, + num_bits, + ); self.remaining_data.advance(num_consumed_bytes); let tf_num_bits = self.skip_reader.tf_num_bits(); match self.freq_reading_option { @@ -486,9 +492,9 @@ impl BlockSegmentPostings { self.remaining_data.advance(num_bytes_to_skip); } FreqReadingOption::ReadFreq => { - let num_consumed_bytes = self.freq_decoder - .uncompress_block_unsorted(self.remaining_data.as_ref(), - tf_num_bits); + let num_consumed_bytes = self + .freq_decoder + .uncompress_block_unsorted(self.remaining_data.as_ref(), tf_num_bits); self.remaining_data.advance(num_consumed_bytes); } } @@ -518,7 +524,8 @@ impl BlockSegmentPostings { } } self.num_vint_docs = 0; - return self.docs() + return self + .docs() .last() .map(|last_doc| { if *last_doc >= target_doc { @@ -526,8 +533,7 @@ impl BlockSegmentPostings { } else { BlockSegmentPostingsSkipResult::Terminated } - }) - .unwrap_or(BlockSegmentPostingsSkipResult::Terminated); + }).unwrap_or(BlockSegmentPostingsSkipResult::Terminated); } BlockSegmentPostingsSkipResult::Terminated } @@ -538,11 +544,11 @@ impl BlockSegmentPostings { pub fn advance(&mut self) -> bool { if self.skip_reader.advance() { let num_bits = self.skip_reader.doc_num_bits(); - let num_consumed_bytes = self.doc_decoder - .uncompress_block_sorted( - self.remaining_data.as_ref(), - self.doc_offset, - num_bits); + let num_consumed_bytes = self.doc_decoder.uncompress_block_sorted( + self.remaining_data.as_ref(), + self.doc_offset, + num_bits, + ); self.remaining_data.advance(num_consumed_bytes); let tf_num_bits = self.skip_reader.tf_num_bits(); match self.freq_reading_option { @@ -552,9 +558,9 @@ impl BlockSegmentPostings { self.remaining_data.advance(num_bytes_to_skip); } FreqReadingOption::ReadFreq => { - let num_consumed_bytes = self.freq_decoder - 
.uncompress_block_unsorted(self.remaining_data.as_ref(), - tf_num_bits); + let num_consumed_bytes = self + .freq_decoder + .uncompress_block_unsorted(self.remaining_data.as_ref(), tf_num_bits); self.remaining_data.advance(num_consumed_bytes); } } @@ -594,7 +600,6 @@ impl BlockSegmentPostings { doc_offset: 0, doc_freq: 0, - remaining_data: OwnedRead::new(vec![]), skip_reader: SkipReader::new(OwnedRead::new(vec![]), IndexRecordOption::Basic), } @@ -616,7 +621,9 @@ impl<'b> Streamer<'b> for BlockSegmentPostings { #[cfg(test)] mod tests { + use super::search_within_block; use super::BlockSegmentPostings; + use super::BlockSegmentPostingsSkipResult; use super::SegmentPostings; use common::HasLen; use core::Index; @@ -626,9 +633,7 @@ mod tests { use schema::SchemaBuilder; use schema::Term; use schema::INT_INDEXED; - use super::BlockSegmentPostingsSkipResult; use DocId; - use super::search_within_block; #[test] fn test_empty_segment_postings() { @@ -645,7 +650,6 @@ mod tests { assert_eq!(postings.doc_freq(), 0); } - fn search_within_block_trivial_but_slow(block: &[u32], target: u32) -> usize { block .iter() @@ -653,11 +657,15 @@ mod tests { .enumerate() .filter(|&(_, ref val)| *val >= target) .next() - .unwrap().0 + .unwrap() + .0 } fn util_test_search_within_block(block: &[u32], target: u32) { - assert_eq!(search_within_block(block, target), search_within_block_trivial_but_slow(block, target)); + assert_eq!( + search_within_block(block, target), + search_within_block_trivial_but_slow(block, target) + ); } fn util_test_search_within_block_all(block: &[u32]) { @@ -677,7 +685,7 @@ mod tests { #[test] fn test_search_within_block() { for len in 1u32..128u32 { - let v: Vec = (0..len).map(|i| i*2).collect(); + let v: Vec = (0..len).map(|i| i * 2).collect(); util_test_search_within_block_all(&v[..]); } } @@ -726,14 +734,22 @@ mod tests { fn test_block_segment_postings_skip() { for i in 0..4 { let mut block_postings = build_block_postings(vec![3]); - assert_eq!(block_postings.skip_to(i), BlockSegmentPostingsSkipResult::Success(0u32)); - assert_eq!(block_postings.skip_to(i), BlockSegmentPostingsSkipResult::Terminated); + assert_eq!( + block_postings.skip_to(i), + BlockSegmentPostingsSkipResult::Success(0u32) + ); + assert_eq!( + block_postings.skip_to(i), + BlockSegmentPostingsSkipResult::Terminated + ); } let mut block_postings = build_block_postings(vec![3]); - assert_eq!(block_postings.skip_to(4u32), BlockSegmentPostingsSkipResult::Terminated); + assert_eq!( + block_postings.skip_to(4u32), + BlockSegmentPostingsSkipResult::Terminated + ); } - #[test] fn test_block_segment_postings_skip2() { let mut docs = vec![0]; @@ -741,14 +757,23 @@ mod tests { docs.push((i * i / 100) + i); } let mut block_postings = build_block_postings(docs.clone()); - for i in vec![0, 424, 10000] { - assert_eq!(block_postings.skip_to(i), BlockSegmentPostingsSkipResult::Success(0u32)); + for i in vec![0, 424, 10000] { + assert_eq!( + block_postings.skip_to(i), + BlockSegmentPostingsSkipResult::Success(0u32) + ); let docs = block_postings.docs(); assert!(docs[0] <= i); assert!(docs.last().cloned().unwrap_or(0u32) >= i); } - assert_eq!(block_postings.skip_to(100_000), BlockSegmentPostingsSkipResult::Terminated); - assert_eq!(block_postings.skip_to(101_000), BlockSegmentPostingsSkipResult::Terminated); + assert_eq!( + block_postings.skip_to(100_000), + BlockSegmentPostingsSkipResult::Terminated + ); + assert_eq!( + block_postings.skip_to(101_000), + BlockSegmentPostingsSkipResult::Terminated + ); } #[test] diff --git 
a/src/postings/serializer.rs b/src/postings/serializer.rs index 521d467d2..f578f2caf 100644 --- a/src/postings/serializer.rs +++ b/src/postings/serializer.rs @@ -1,18 +1,18 @@ use super::TermInfo; -use common::{VInt, BinarySerializable}; +use common::{BinarySerializable, VInt}; use common::{CompositeWrite, CountingWriter}; -use postings::compression::{VIntEncoder, BlockEncoder, COMPRESSION_BLOCK_SIZE}; use core::Segment; use directory::WritePtr; +use positions::PositionSerializer; +use postings::compression::{BlockEncoder, VIntEncoder, COMPRESSION_BLOCK_SIZE}; +use postings::skip::SkipSerializer; +use postings::USE_SKIP_INFO_LIMIT; use schema::Schema; use schema::{Field, FieldEntry, FieldType}; use std::io::{self, Write}; use termdict::{TermDictionaryBuilder, TermOrdinal}; use DocId; use Result; -use postings::USE_SKIP_INFO_LIMIT; -use postings::skip::SkipSerializer; -use positions::PositionSerializer; /// `PostingsSerializer` is in charge of serializing /// postings on disk, in the @@ -100,11 +100,11 @@ impl InvertedIndexSerializer { let positionsidx_write = self.positionsidx_write.for_field(field); let field_type: FieldType = (*field_entry.field_type()).clone(); FieldSerializer::new( - field_type, + &field_type, term_dictionary_write, postings_write, positions_write, - positionsidx_write + positionsidx_write, ) } @@ -131,11 +131,11 @@ pub struct FieldSerializer<'a> { impl<'a> FieldSerializer<'a> { fn new( - field_type: FieldType, + field_type: &FieldType, term_dictionary_write: &'a mut CountingWriter, postings_write: &'a mut CountingWriter, positions_write: &'a mut CountingWriter, - positionsidx_write: &'a mut CountingWriter + positionsidx_write: &'a mut CountingWriter, ) -> io::Result> { let (term_freq_enabled, position_enabled): (bool, bool) = match field_type { FieldType::Str(ref text_options) => { @@ -152,8 +152,9 @@ impl<'a> FieldSerializer<'a> { _ => (false, false), }; let term_dictionary_builder = - TermDictionaryBuilder::new(term_dictionary_write, field_type)?; - let postings_serializer = PostingsSerializer::new(postings_write, term_freq_enabled, position_enabled); + TermDictionaryBuilder::new(term_dictionary_write, &field_type)?; + let postings_serializer = + PostingsSerializer::new(postings_write, term_freq_enabled, position_enabled); let positions_serializer_opt = if position_enabled { Some(PositionSerializer::new(positions_write, positionsidx_write)) } else { @@ -171,14 +172,15 @@ impl<'a> FieldSerializer<'a> { } fn current_term_info(&self) -> TermInfo { - let positions_idx = self.positions_serializer_opt + let positions_idx = self + .positions_serializer_opt .as_ref() .map(|positions_serializer| positions_serializer.positions_idx()) .unwrap_or(0u64); TermInfo { doc_freq: 0, postings_offset: self.postings_serializer.addr(), - positions_idx + positions_idx, } } @@ -253,7 +255,7 @@ impl<'a> FieldSerializer<'a> { struct Block { doc_ids: [DocId; COMPRESSION_BLOCK_SIZE], term_freqs: [u32; COMPRESSION_BLOCK_SIZE], - len: usize + len: usize, } impl Block { @@ -261,7 +263,7 @@ impl Block { Block { doc_ids: [0u32; COMPRESSION_BLOCK_SIZE], term_freqs: [0u32; COMPRESSION_BLOCK_SIZE], - len: 0 + len: 0, } } @@ -312,9 +314,12 @@ pub struct PostingsSerializer { termfreq_sum_enabled: bool, } - impl PostingsSerializer { - pub fn new(write: W, termfreq_enabled: bool, termfreq_sum_enabled: bool) -> PostingsSerializer { + pub fn new( + write: W, + termfreq_enabled: bool, + termfreq_sum_enabled: bool, + ) -> PostingsSerializer { PostingsSerializer { output_write: CountingWriter::wrap(write), @@ 
-337,14 +342,16 @@ impl PostingsSerializer { .block_encoder .compress_block_sorted(&self.block.doc_ids(), self.last_doc_id_encoded); self.last_doc_id_encoded = self.block.last_doc(); - self.skip_write.write_doc(self.last_doc_id_encoded, num_bits); + self.skip_write + .write_doc(self.last_doc_id_encoded, num_bits); // last el block 0, offset block 1, self.postings_write.extend(block_encoded); } if self.termfreq_enabled { // encode the term_freqs - let (num_bits, block_encoded): (u8, &[u8]) = - self.block_encoder.compress_block_unsorted(&self.block.term_freqs()); + let (num_bits, block_encoded): (u8, &[u8]) = self + .block_encoder + .compress_block_unsorted(&self.block.term_freqs()); self.postings_write.extend(block_encoded); self.skip_write.write_term_freq(num_bits); if self.termfreq_sum_enabled { @@ -375,13 +382,15 @@ impl PostingsSerializer { // In that case, the remaining part is encoded // using variable int encoding. { - let block_encoded = self.block_encoder + let block_encoded = self + .block_encoder .compress_vint_sorted(&self.block.doc_ids(), self.last_doc_id_encoded); self.postings_write.write_all(block_encoded)?; } // ... Idem for term frequencies if self.termfreq_enabled { - let block_encoded = self.block_encoder + let block_encoded = self + .block_encoder .compress_vint_unsorted(self.block.term_freqs()); self.postings_write.write_all(block_encoded)?; } @@ -392,7 +401,6 @@ impl PostingsSerializer { VInt(skip_data.len() as u64).serialize(&mut self.output_write)?; self.output_write.write_all(skip_data)?; self.output_write.write_all(&self.postings_write[..])?; - } else { self.output_write.write_all(&self.postings_write[..])?; } diff --git a/src/postings/skip.rs b/src/postings/skip.rs index e2d59e2c6..ab2dcb6c2 100644 --- a/src/postings/skip.rs +++ b/src/postings/skip.rs @@ -1,8 +1,8 @@ -use DocId; use common::BinarySerializable; use owned_read::OwnedRead; use postings::compression::COMPRESSION_BLOCK_SIZE; use schema::IndexRecordOption; +use DocId; pub struct SkipSerializer { buffer: Vec, @@ -18,8 +18,11 @@ impl SkipSerializer { } pub fn write_doc(&mut self, last_doc: DocId, doc_num_bits: u8) { - assert!(last_doc > self.prev_doc, "write_doc(...) called with non-increasing doc ids. \ - Did you forget to call clear maybe?"); + assert!( + last_doc > self.prev_doc, + "write_doc(...) called with non-increasing doc ids. \ + Did you forget to call clear maybe?" 
+ ); let delta_doc = last_doc - self.prev_doc; self.prev_doc = last_doc; delta_doc.serialize(&mut self.buffer).unwrap(); @@ -30,9 +33,10 @@ impl SkipSerializer { self.buffer.push(tf_num_bits); } - pub fn write_total_term_freq(&mut self, tf_sum: u32) { - tf_sum.serialize(&mut self.buffer).expect("Should never fail"); + tf_sum + .serialize(&mut self.buffer) + .expect("Should never fail"); } pub fn data(&self) -> &[u8] { @@ -103,33 +107,32 @@ impl SkipReader { } else { let doc_delta = u32::deserialize(&mut self.owned_read).expect("Skip data corrupted"); self.doc += doc_delta as DocId; - self.doc_num_bits = self.owned_read.get(0); + self.doc_num_bits = self.owned_read.get(0); match self.skip_info { IndexRecordOption::Basic => { self.owned_read.advance(1); } - IndexRecordOption::WithFreqs=> { + IndexRecordOption::WithFreqs => { self.tf_num_bits = self.owned_read.get(1); self.owned_read.advance(2); } IndexRecordOption::WithFreqsAndPositions => { self.tf_num_bits = self.owned_read.get(1); self.owned_read.advance(2); - self.tf_sum = u32::deserialize(&mut self.owned_read) - .expect("Failed reading tf_sum"); + self.tf_sum = + u32::deserialize(&mut self.owned_read).expect("Failed reading tf_sum"); } } true } - } } #[cfg(test)] mod tests { - use super::{SkipReader, SkipSerializer}; use super::IndexRecordOption; + use super::{SkipReader, SkipSerializer}; use owned_read::OwnedRead; #[test] @@ -171,4 +174,4 @@ mod tests { assert_eq!(skip_reader.doc_num_bits(), 5u8); assert!(!skip_reader.advance()); } -} \ No newline at end of file +} diff --git a/src/postings/stacker/memory_arena.rs b/src/postings/stacker/memory_arena.rs index b420fdb22..3b0f875d4 100644 --- a/src/postings/stacker/memory_arena.rs +++ b/src/postings/stacker/memory_arena.rs @@ -47,7 +47,7 @@ impl Addr { } /// Returns the `Addr` object for `addr + offset` - pub fn offset(&self, offset: u32) -> Addr { + pub fn offset(self, offset: u32) -> Addr { Addr(self.0.wrapping_add(offset)) } @@ -55,16 +55,16 @@ impl Addr { Addr((page_id << NUM_BITS_PAGE_ADDR | local_addr) as u32) } - fn page_id(&self) -> usize { + fn page_id(self) -> usize { (self.0 as usize) >> NUM_BITS_PAGE_ADDR } - fn page_local_addr(&self) -> usize { + fn page_local_addr(self) -> usize { (self.0 as usize) & (PAGE_SIZE - 1) } /// Returns true if and only if the `Addr` is null. 
- pub fn is_null(&self) -> bool { + pub fn is_null(self) -> bool { self.0 == u32::max_value() } } @@ -233,12 +233,12 @@ impl Page { #[inline(always)] pub(crate) unsafe fn get_ptr(&self, addr: usize) -> *const u8 { - self.data.as_ptr().offset(addr as isize) + self.data.as_ptr().add(addr) } #[inline(always)] pub(crate) unsafe fn get_mut_ptr(&mut self, addr: usize) -> *mut u8 { - self.data.as_mut_ptr().offset(addr as isize) + self.data.as_mut_ptr().add(addr) } } diff --git a/src/postings/stacker/murmurhash2.rs b/src/postings/stacker/murmurhash2.rs index 729819be8..9626dcb53 100644 --- a/src/postings/stacker/murmurhash2.rs +++ b/src/postings/stacker/murmurhash2.rs @@ -4,6 +4,7 @@ const M: u32 = 0x5bd1_e995; #[inline(always)] pub fn murmurhash2(key: &[u8]) -> u32 { + #[cfg_attr(feature = "cargo-clippy", allow(clippy::cast_ptr_alignment))] let mut key_ptr: *const u32 = key.as_ptr() as *const u32; let len = key.len() as u32; let mut h: u32 = SEED ^ len; diff --git a/src/postings/stacker/term_hashmap.rs b/src/postings/stacker/term_hashmap.rs index 6e3625d5d..47ee3d5c7 100644 --- a/src/postings/stacker/term_hashmap.rs +++ b/src/postings/stacker/term_hashmap.rs @@ -61,7 +61,7 @@ impl Default for KeyValue { } impl KeyValue { - fn is_empty(&self) -> bool { + fn is_empty(self) -> bool { self.key_value_addr.is_null() } } diff --git a/src/query/all_query.rs b/src/query/all_query.rs index bfc1fddbf..4f5490ab1 100644 --- a/src/query/all_query.rs +++ b/src/query/all_query.rs @@ -59,10 +59,10 @@ impl DocSet for AllScorer { } } if self.doc < self.max_doc { - return true; + true } else { self.state = State::Finished; - return false; + false } } diff --git a/src/query/bm25.rs b/src/query/bm25.rs index 1fc6087ed..eb2546725 100644 --- a/src/query/bm25.rs +++ b/src/query/bm25.rs @@ -17,9 +17,9 @@ fn cached_tf_component(fieldnorm: u32, average_fieldnorm: f32) -> f32 { fn compute_tf_cache(average_fieldnorm: f32) -> [f32; 256] { let mut cache = [0f32; 256]; - for fieldnorm_id in 0..256 { + for (fieldnorm_id, cache_mut) in cache.iter_mut().enumerate() { let fieldnorm = FieldNormReader::id_to_fieldnorm(fieldnorm_id as u8); - cache[fieldnorm_id] = cached_tf_component(fieldnorm, average_fieldnorm); + *cache_mut = cached_tf_component(fieldnorm, average_fieldnorm); } cache } @@ -54,7 +54,7 @@ impl BM25Weight { for segment_reader in searcher.segment_readers() { let inverted_index = segment_reader.inverted_index(field); total_num_tokens += inverted_index.total_num_tokens(); - total_num_docs += segment_reader.max_doc() as u64; + total_num_docs += u64::from(segment_reader.max_doc()); } let average_fieldnorm = total_num_tokens as f32 / total_num_docs as f32; @@ -63,8 +63,7 @@ impl BM25Weight { .map(|term| { let term_doc_freq = searcher.doc_freq(term); idf(term_doc_freq, total_num_docs) - }) - .sum::(); + }).sum::(); BM25Weight::new(idf, average_fieldnorm) } diff --git a/src/query/boolean_query/boolean_query.rs b/src/query/boolean_query/boolean_query.rs index 286d9f449..b530c6b0a 100644 --- a/src/query/boolean_query/boolean_query.rs +++ b/src/query/boolean_query/boolean_query.rs @@ -5,6 +5,7 @@ use query::TermQuery; use query::Weight; use schema::IndexRecordOption; use schema::Term; +use std::collections::BTreeSet; use Result; use Searcher; @@ -27,7 +28,7 @@ impl Clone for BooleanQuery { fn clone(&self) -> Self { self.subqueries .iter() - .map(|(x, y)| (x.clone(), y.box_clone())) + .map(|(occur, subquery)| (*occur, subquery.box_clone())) .collect::>() .into() } @@ -41,14 +42,20 @@ impl From)>> for BooleanQuery { impl Query for 
BooleanQuery { fn weight(&self, searcher: &Searcher, scoring_enabled: bool) -> Result> { - let sub_weights = self.subqueries + let sub_weights = self + .subqueries .iter() .map(|&(ref occur, ref subquery)| { Ok((*occur, subquery.weight(searcher, scoring_enabled)?)) - }) - .collect::>()?; + }).collect::>()?; Ok(Box::new(BooleanWeight::new(sub_weights, scoring_enabled))) } + + fn query_terms(&self, term_set: &mut BTreeSet) { + for (_occur, subquery) in &self.subqueries { + subquery.query_terms(term_set); + } + } } impl BooleanQuery { @@ -61,8 +68,7 @@ impl BooleanQuery { let term_query: Box = Box::new(TermQuery::new(term, IndexRecordOption::WithFreqs)); (Occur::Should, term_query) - }) - .collect(); + }).collect(); BooleanQuery::from(occur_term_queries) } diff --git a/src/query/boolean_query/boolean_weight.rs b/src/query/boolean_query/boolean_weight.rs index 575bc2991..edd8fecae 100644 --- a/src/query/boolean_query/boolean_weight.rs +++ b/src/query/boolean_query/boolean_weight.rs @@ -39,7 +39,7 @@ where } let scorer: Box = Box::new(Union::<_, TScoreCombiner>::from(scorers)); - return scorer; + scorer } pub struct BooleanWeight { diff --git a/src/query/boolean_query/mod.rs b/src/query/boolean_query/mod.rs index 5d72406a0..4276720ee 100644 --- a/src/query/boolean_query/mod.rs +++ b/src/query/boolean_query/mod.rs @@ -69,7 +69,7 @@ mod tests { let query_parser = QueryParser::for_index(&index, vec![text_field]); let query = query_parser.parse_query("+a").unwrap(); let searcher = index.searcher(); - let weight = query.weight(&*searcher, true).unwrap(); + let weight = query.weight(&searcher, true).unwrap(); let scorer = weight.scorer(searcher.segment_reader(0u32)).unwrap(); assert!(Downcast::::is_type(&*scorer)); } @@ -81,13 +81,13 @@ mod tests { let searcher = index.searcher(); { let query = query_parser.parse_query("+a +b +c").unwrap(); - let weight = query.weight(&*searcher, true).unwrap(); + let weight = query.weight(&searcher, true).unwrap(); let scorer = weight.scorer(searcher.segment_reader(0u32)).unwrap(); assert!(Downcast::>::is_type(&*scorer)); } { let query = query_parser.parse_query("+a +(b c)").unwrap(); - let weight = query.weight(&*searcher, true).unwrap(); + let weight = query.weight(&searcher, true).unwrap(); let scorer = weight.scorer(searcher.segment_reader(0u32)).unwrap(); assert!(Downcast::>>::is_type(&*scorer)); } @@ -100,7 +100,7 @@ mod tests { let searcher = index.searcher(); { let query = query_parser.parse_query("+a b").unwrap(); - let weight = query.weight(&*searcher, true).unwrap(); + let weight = query.weight(&searcher, true).unwrap(); let scorer = weight.scorer(searcher.segment_reader(0u32)).unwrap(); assert!(Downcast::< RequiredOptionalScorer, Box, SumWithCoordsCombiner>, @@ -108,7 +108,7 @@ mod tests { } { let query = query_parser.parse_query("+a b").unwrap(); - let weight = query.weight(&*searcher, false).unwrap(); + let weight = query.weight(&searcher, false).unwrap(); let scorer = weight.scorer(searcher.segment_reader(0u32)).unwrap(); println!("{:?}", scorer.type_name()); assert!(Downcast::::is_type(&*scorer)); diff --git a/src/query/empty_query.rs b/src/query/empty_query.rs new file mode 100644 index 000000000..6e64dca57 --- /dev/null +++ b/src/query/empty_query.rs @@ -0,0 +1,81 @@ +use super::Scorer; +use query::Query; +use query::Weight; +use DocId; +use DocSet; +use Result; +use Score; +use Searcher; +use SegmentReader; + +/// `EmptyQuery` is a dummy `Query` in which no document matches. +/// +/// It is useful for tests and handling edge cases. 
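A minimal usage sketch for the new `EmptyQuery`; the in-RAM index, the `body` field and the use of `CountCollector` are illustrative assumptions, not part of this change:

extern crate tantivy;
use tantivy::collector::CountCollector;
use tantivy::query::EmptyQuery;
use tantivy::schema::{SchemaBuilder, TEXT};
use tantivy::Index;

fn main() {
    // Build a tiny in-RAM index; the "body" field is only there so the schema is not empty.
    let mut schema_builder = SchemaBuilder::default();
    schema_builder.add_text_field("body", TEXT);
    let index = Index::create_in_ram(schema_builder.build());
    index.load_searchers().unwrap();
    let searcher = index.searcher();
    // EmptyQuery matches no document, so the count is always 0.
    let mut count_collector = CountCollector::default();
    searcher.search(&EmptyQuery, &mut count_collector).unwrap();
    assert_eq!(count_collector.count(), 0);
}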
+#[derive(Clone, Debug)] +pub struct EmptyQuery; + +impl Query for EmptyQuery { + fn weight(&self, _searcher: &Searcher, _scoring_enabled: bool) -> Result> { + Ok(Box::new(EmptyWeight)) + } + + fn count(&self, _searcher: &Searcher) -> Result { + Ok(0) + } +} + +/// `EmptyWeight` is a dummy `Weight` in which no document matches. +/// +/// It is useful for tests and handling edge cases. +pub struct EmptyWeight; +impl Weight for EmptyWeight { + fn scorer(&self, _reader: &SegmentReader) -> Result> { + Ok(Box::new(EmptyScorer)) + } +} + +/// `EmptyScorer` is a dummy `Scorer` in which no document matches. +/// +/// It is useful for tests and handling edge cases. +pub struct EmptyScorer; + +impl DocSet for EmptyScorer { + fn advance(&mut self) -> bool { + false + } + + fn doc(&self) -> DocId { + panic!( + "You may not call .doc() on a scorer \ + where the last call to advance() did not return true." + ); + } + + fn size_hint(&self) -> u32 { + 0 + } +} + +impl Scorer for EmptyScorer { + fn score(&mut self) -> Score { + 0f32 + } +} + +#[cfg(test)] +mod tests { + use query::EmptyScorer; + use DocSet; + + #[test] + fn test_empty_scorer() { + let mut empty_scorer = EmptyScorer; + assert!(!empty_scorer.advance()); + } + + #[test] + #[should_panic] + fn test_empty_scorer_panic_on_doc_call() { + EmptyScorer.doc(); + } +} diff --git a/src/query/fuzzy_query.rs b/src/query/fuzzy_query.rs index 6e0a16a67..5253fa80c 100644 --- a/src/query/fuzzy_query.rs +++ b/src/query/fuzzy_query.rs @@ -10,7 +10,7 @@ lazy_static! { let mut lev_builder_cache = HashMap::new(); // TODO make population lazy on a `(distance, val)` basis for distance in 0..3 { - for &transposition in [false, true].iter() { + for &transposition in &[false, true] { let lev_automaton_builder = LevenshteinAutomatonBuilder::new(distance, transposition); lev_builder_cache.insert((distance, transposition), lev_automaton_builder); } @@ -153,7 +153,7 @@ mod test { let fuzzy_query = FuzzyTermQuery::new(term, 1, true); searcher.search(&fuzzy_query, &mut collector).unwrap(); - let scored_docs = collector.score_docs(); + let scored_docs = collector.top_docs(); assert_eq!(scored_docs.len(), 1, "Expected only 1 document"); let (score, _) = scored_docs[0]; assert_nearly_equals(1f32, score); diff --git a/src/query/intersection.rs b/src/query/intersection.rs index 02c40e169..e38d32ec7 100644 --- a/src/query/intersection.rs +++ b/src/query/intersection.rs @@ -26,10 +26,11 @@ pub fn intersect_scorers(mut scorers: Vec>) -> Box { (Some(single_docset), None) => single_docset, (Some(left), Some(right)) => { { - if [&left, &right].into_iter().all(|scorer| { + let all_term_scorers = [&left, &right].into_iter().all(|scorer| { let scorer_ref: &Scorer = (*scorer).borrow(); Downcast::::is_type(scorer_ref) - }) { + }); + if all_term_scorers { let left = *Downcast::::downcast(left).unwrap(); let right = *Downcast::::downcast(right).unwrap(); return Box::new(Intersection { @@ -40,12 +41,12 @@ pub fn intersect_scorers(mut scorers: Vec>) -> Box { }); } } - return Box::new(Intersection { + Box::new(Intersection { left, right, others: scorers, num_docsets, - }); + }) } _ => { unreachable!(); @@ -99,7 +100,7 @@ impl Intersection } impl DocSet for Intersection { - #[allow(never_loop)] + #[cfg_attr(feature = "cargo-clippy", allow(clippy::never_loop))] fn advance(&mut self) -> bool { let (left, right) = (&mut self.left, &mut self.right); diff --git a/src/query/mod.rs b/src/query/mod.rs index 7b6368c00..b7136c232 100644 --- a/src/query/mod.rs +++ b/src/query/mod.rs @@ -7,6 +7,7 @@ mod 
automaton_weight; mod bitset; mod bm25; mod boolean_query; +mod empty_query; mod exclude; mod fuzzy_query; mod intersection; @@ -26,7 +27,6 @@ mod weight; mod vec_docset; pub(crate) mod score_combiner; - pub use self::intersection::Intersection; pub use self::union::Union; @@ -37,6 +37,7 @@ pub use self::all_query::{AllQuery, AllScorer, AllWeight}; pub use self::automaton_weight::AutomatonWeight; pub use self::bitset::BitSetDocSet; pub use self::boolean_query::BooleanQuery; +pub use self::empty_query::{EmptyQuery, EmptyScorer, EmptyWeight}; pub use self::exclude::Exclude; pub use self::fuzzy_query::FuzzyTermQuery; pub use self::intersection::intersect_scorers; @@ -49,7 +50,56 @@ pub use self::range_query::RangeQuery; pub use self::regex_query::RegexQuery; pub use self::reqopt_scorer::RequiredOptionalScorer; pub use self::scorer::ConstScorer; -pub use self::scorer::EmptyScorer; pub use self::scorer::Scorer; pub use self::term_query::TermQuery; pub use self::weight::Weight; + +#[cfg(test)] +mod tests { + use Index; + use schema::{SchemaBuilder, TEXT}; + use query::QueryParser; + use Term; + use std::collections::BTreeSet; + + #[test] + fn test_query_terms() { + let mut schema_builder = SchemaBuilder::default(); + let text_field = schema_builder.add_text_field("text", TEXT); + let schema = schema_builder.build(); + let index = Index::create_in_ram(schema); + let query_parser = QueryParser::for_index(&index, vec![text_field]); + let term_a = Term::from_field_text(text_field, "a"); + let term_b = Term::from_field_text(text_field, "b"); + { + let mut terms_set: BTreeSet = BTreeSet::new(); + query_parser.parse_query("a").unwrap().query_terms(&mut terms_set); + let terms: Vec<&Term> = terms_set.iter().collect(); + assert_eq!(vec![&term_a], terms); + } + { + let mut terms_set: BTreeSet = BTreeSet::new(); + query_parser.parse_query("a b").unwrap().query_terms(&mut terms_set); + let terms: Vec<&Term> = terms_set.iter().collect(); + assert_eq!(vec![&term_a, &term_b], terms); + } + { + let mut terms_set: BTreeSet = BTreeSet::new(); + query_parser.parse_query("\"a b\"").unwrap().query_terms(&mut terms_set); + let terms: Vec<&Term> = terms_set.iter().collect(); + assert_eq!(vec![&term_a, &term_b], terms); + } + { + let mut terms_set: BTreeSet = BTreeSet::new(); + query_parser.parse_query("a a a a a").unwrap().query_terms(&mut terms_set); + let terms: Vec<&Term> = terms_set.iter().collect(); + assert_eq!(vec![&term_a], terms); + } + { + let mut terms_set: BTreeSet = BTreeSet::new(); + query_parser.parse_query("a -b").unwrap().query_terms(&mut terms_set); + let terms: Vec<&Term> = terms_set.iter().collect(); + assert_eq!(vec![&term_a, &term_b], terms); + } + } +} \ No newline at end of file diff --git a/src/query/occur.rs b/src/query/occur.rs index 9bcf02bc2..96ff9018c 100644 --- a/src/query/occur.rs +++ b/src/query/occur.rs @@ -12,3 +12,38 @@ pub enum Occur { /// search. MustNot, } + +impl Occur { + /// Returns the one-char prefix symbol for this `Occur`. + /// - `Should` => '?', + /// - `Must` => '+' + /// - `Not` => '-' + pub fn to_char(self) -> char { + match self { + Occur::Should => '?', + Occur::Must => '+', + Occur::MustNot => '-', + } + } +} + +/// Compose two occur values. 
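The composition defined by the function below behaves like sign multiplication: `Should` is neutral and two `MustNot` cancel out. A hypothetical in-crate test could pin the full table down:

#[cfg(test)]
mod compose_occur_tests {
    use super::{compose_occur, Occur};

    #[test]
    fn test_compose_occur() {
        // `Should` on the left is neutral: the right-hand side wins.
        assert!(compose_occur(Occur::Should, Occur::Must) == Occur::Must);
        assert!(compose_occur(Occur::Should, Occur::MustNot) == Occur::MustNot);
        // `Must` stays `Must` unless the inner occur is `MustNot`.
        assert!(compose_occur(Occur::Must, Occur::Should) == Occur::Must);
        assert!(compose_occur(Occur::Must, Occur::MustNot) == Occur::MustNot);
        // Two negations cancel out: `-(-a)` behaves like `+a`.
        assert!(compose_occur(Occur::MustNot, Occur::MustNot) == Occur::Must);
        assert!(compose_occur(Occur::MustNot, Occur::Should) == Occur::MustNot);
    }
}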
+pub fn compose_occur(left: Occur, right: Occur) -> Occur { + match left { + Occur::Should => right, + Occur::Must => { + if right == Occur::MustNot { + Occur::MustNot + } else { + Occur::Must + } + } + Occur::MustNot => { + if right == Occur::MustNot { + Occur::Must + } else { + Occur::MustNot + } + } + } +} diff --git a/src/query/phrase_query/mod.rs b/src/query/phrase_query/mod.rs index f4f974388..303301b0d 100644 --- a/src/query/phrase_query/mod.rs +++ b/src/query/phrase_query/mod.rs @@ -12,7 +12,7 @@ mod tests { use super::*; use collector::tests::TestCollector; use core::Index; - use error::ErrorKind; + use error::TantivyError; use schema::{SchemaBuilder, Term, TEXT}; use tests::assert_nearly_equals; @@ -92,10 +92,9 @@ mod tests { Term::from_field_text(text_field, "b"), ]); let mut test_collector = TestCollector::default(); - if let &ErrorKind::SchemaError(ref msg) = searcher + if let TantivyError::SchemaError(ref msg) = searcher .search(&phrase_query, &mut test_collector) .unwrap_err() - .kind() { assert_eq!( "Applied phrase query on field \"text\", which does not have positions indexed", @@ -191,7 +190,7 @@ mod tests { let mut test_collector = TestCollector::default(); let terms: Vec<(usize, Term)> = texts .iter() - .map(|(offset, text)| (*offset, Term::from_field_text(text_field, text)) ) + .map(|(offset, text)| (*offset, Term::from_field_text(text_field, text))) .collect(); let phrase_query = PhraseQuery::new_with_offset(terms); searcher diff --git a/src/query/phrase_query/phrase_query.rs b/src/query/phrase_query/phrase_query.rs index 9cabe8cc4..959b17b0e 100644 --- a/src/query/phrase_query/phrase_query.rs +++ b/src/query/phrase_query/phrase_query.rs @@ -1,10 +1,11 @@ use super::PhraseWeight; use core::searcher::Searcher; -use error::ErrorKind; +use error::TantivyError; use query::bm25::BM25Weight; use query::Query; use query::Weight; use schema::{Field, Term}; +use std::collections::BTreeSet; use Result; /// `PhraseQuery` matches a specific sequence of words. @@ -38,11 +39,10 @@ impl PhraseQuery { PhraseQuery::new_with_offset(terms_with_offset) } - /// Creates a new `PhraseQuery` given a list of terms and there offsets. /// /// Can be used to provide custom offset for each term. - pub fn new_with_offset(mut terms: Vec<(usize, Term)>) ->PhraseQuery { + pub fn new_with_offset(mut terms: Vec<(usize, Term)>) -> PhraseQuery { assert!( terms.len() > 1, "A phrase query is required to have strictly more than one term." @@ -66,9 +66,11 @@ impl PhraseQuery { /// `Term`s in the phrase without the associated offsets. 
pub fn phrase_terms(&self) -> Vec { - self.phrase_terms.iter().map(|(_, term)| term.clone()).collect::>() - } - + self.phrase_terms + .iter() + .map(|(_, term)| term.clone()) + .collect::>() + } } impl Query for PhraseQuery { @@ -85,15 +87,19 @@ impl Query for PhraseQuery { .unwrap_or(false); if !has_positions { let field_name = field_entry.name(); - bail!(ErrorKind::SchemaError(format!( + return Err(TantivyError::SchemaError(format!( "Applied phrase query on field {:?}, which does not have positions indexed", field_name - ))) + ))); } if scoring_enabled { let terms = self.phrase_terms(); let bm25_weight = BM25Weight::for_terms(searcher, &terms); - Ok(Box::new(PhraseWeight::new(self.phrase_terms.clone(), bm25_weight, true))) + Ok(Box::new(PhraseWeight::new( + self.phrase_terms.clone(), + bm25_weight, + true, + ))) } else { Ok(Box::new(PhraseWeight::new( self.phrase_terms.clone(), @@ -102,4 +108,10 @@ impl Query for PhraseQuery { ))) } } + + fn query_terms(&self, term_set: &mut BTreeSet) { + for (_, query_term) in &self.phrase_terms { + term_set.insert(query_term.clone()); + } + } } diff --git a/src/query/phrase_query/phrase_scorer.rs b/src/query/phrase_query/phrase_scorer.rs index 6e04291c6..9b896a46a 100644 --- a/src/query/phrase_query/phrase_scorer.rs +++ b/src/query/phrase_query/phrase_scorer.rs @@ -124,7 +124,8 @@ impl PhraseScorer { fieldnorm_reader: FieldNormReader, score_needed: bool, ) -> PhraseScorer { - let max_offset = term_postings.iter() + let max_offset = term_postings + .iter() .map(|&(offset, _)| offset) .max() .unwrap_or(0); @@ -133,8 +134,7 @@ impl PhraseScorer { .into_iter() .map(|(offset, postings)| { PostingsWithOffset::new(postings, (max_offset - offset) as u32) - }) - .collect::>(); + }).collect::>(); PhraseScorer { intersection_docset: Intersection::new(postings_with_offsets), num_docsets, diff --git a/src/query/query.rs b/src/query/query.rs index 51e068b92..ca7de8ca6 100644 --- a/src/query/query.rs +++ b/src/query/query.rs @@ -2,9 +2,11 @@ use super::Weight; use collector::Collector; use core::searcher::Searcher; use downcast; +use std::collections::BTreeSet; use std::fmt; use Result; use SegmentLocalId; +use Term; /// The `Query` trait defines a set of documents and a scoring method /// for those documents. @@ -58,6 +60,10 @@ pub trait Query: QueryClone + downcast::Any + fmt::Debug { Ok(result) } + /// Extract all of the terms associated to the query and insert them in the + /// term set given in arguments. + fn query_terms(&self, _term_set: &mut BTreeSet) {} + /// Search works as follows : /// /// First the weight object associated to the query is created. diff --git a/src/query/query_parser/query_grammar.rs b/src/query/query_parser/query_grammar.rs index 352666e8a..3a3ac4256 100644 --- a/src/query/query_parser/query_grammar.rs +++ b/src/query/query_parser/query_grammar.rs @@ -1,6 +1,12 @@ +#![cfg_attr(feature = "cargo-clippy", allow(clippy::unneeded_field_pattern))] +#![cfg_attr(feature = "cargo-clippy", allow(clippy::toplevel_ref_arg))] + use super::user_input_ast::*; use combine::char::*; +use combine::error::StreamError; +use combine::stream::StreamErrorFor; use combine::*; +use query::occur::Occur; use query::query_parser::user_input_ast::UserInputBound; parser! { @@ -17,18 +23,25 @@ parser! 
{ fn word[I]()(I) -> String where [I: Stream] { many1(satisfy(|c: char| c.is_alphanumeric())) + .and_then(|s: String| { + match s.as_str() { + "OR" => Err(StreamErrorFor::::unexpected_static_message("OR")), + "AND" => Err(StreamErrorFor::::unexpected_static_message("AND")), + "NOT" => Err(StreamErrorFor::::unexpected_static_message("NOT")), + _ => Ok(s) + } + }) } } parser! { - fn literal[I]()(I) -> UserInputAST + fn literal[I]()(I) -> UserInputLeaf where [I: Stream] { let term_val = || { let phrase = (char('"'), many1(satisfy(|c| c != '"')), char('"')).map(|(_, s, _)| s); phrase.or(word()) }; - let term_val_with_field = negative_number().or(term_val()); let term_query = (field(), char(':'), term_val_with_field).map(|(field_name, _, phrase)| UserInputLiteral { @@ -41,7 +54,7 @@ parser! { }); try(term_query) .or(term_default_field) - .map(UserInputAST::from) + .map(UserInputLeaf::from) } } @@ -55,7 +68,14 @@ parser! { } parser! { - fn range[I]()(I) -> UserInputAST + fn spaces1[I]()(I) -> () + where [I: Stream] { + skip_many1(space()) + } +} + +parser! { + fn range[I]()(I) -> UserInputLeaf where [I: Stream] { let term_val = || { word().or(negative_number()).or(char('*').map(|_| "*".to_string())) @@ -77,7 +97,7 @@ parser! { string("TO"), spaces(), upper_bound, - ).map(|(field, lower, _, _, _, upper)| UserInputAST::Range { + ).map(|(field, lower, _, _, _, upper)| UserInputLeaf::Range { field, lower, upper @@ -88,13 +108,50 @@ parser! { parser! { fn leaf[I]()(I) -> UserInputAST where [I: Stream] { - (char('-'), leaf()) - .map(|(_, expr)| UserInputAST::Not(Box::new(expr))) - .or((char('+'), leaf()).map(|(_, expr)| UserInputAST::Must(Box::new(expr)))) + (char('-'), leaf()).map(|(_, expr)| expr.unary(Occur::MustNot) ) + .or((char('+'), leaf()).map(|(_, expr)| expr.unary(Occur::Must) )) .or((char('('), parse_to_ast(), char(')')).map(|(_, expr, _)| expr)) - .or(char('*').map(|_| UserInputAST::All)) - .or(try(range())) - .or(literal()) + .or(char('*').map(|_| UserInputAST::from(UserInputLeaf::All) )) + .or(try( + (string("NOT"), spaces1(), leaf()).map(|(_, _, expr)| expr.unary(Occur::MustNot)) + ) + ) + .or(try( + range().map(UserInputAST::from) + ) + ) + .or(literal().map(|leaf| UserInputAST::Leaf(Box::new(leaf)))) + } +} + +enum BinaryOperand { + Or, + And, +} + +parser! { + fn binary_operand[I]()(I) -> BinaryOperand + where [I: Stream] { + (spaces1(), + ( + string("AND").map(|_| BinaryOperand::And) + .or(string("OR").map(|_| BinaryOperand::Or)) + ), + spaces1()).map(|(_, op,_)| op) + } +} + +enum Element { + SingleEl(UserInputAST), + NormalDisjunctive(Vec>), +} + +impl Element { + pub fn into_dnf(self) -> Vec> { + match self { + Element::NormalDisjunctive(conjunctions) => conjunctions, + Element::SingleEl(el) => vec![vec![el]], + } } } @@ -102,14 +159,56 @@ parser! 
{ pub fn parse_to_ast[I]()(I) -> UserInputAST where [I: Stream] { - sep_by(leaf(), spaces()) - .map(|subqueries: Vec| { - if subqueries.len() == 1 { - subqueries.into_iter().next().unwrap() - } else { - UserInputAST::Clause(subqueries.into_iter().map(Box::new).collect()) - } - }) + ( + try( + chainl1( + leaf().map(Element::SingleEl), + binary_operand().map(|op: BinaryOperand| + move |left: Element, right: Element| { + let mut dnf = left.into_dnf(); + if let Element::SingleEl(el) = right { + match op { + BinaryOperand::And => { + if let Some(last) = dnf.last_mut() { + last.push(el); + } + } + BinaryOperand::Or => { + dnf.push(vec!(el)); + } + } + } else { + unreachable!("Please report.") + } + Element::NormalDisjunctive(dnf) + } + ) + ) + .map(|el| el.into_dnf()) + .map(|fnd| { + if fnd.len() == 1 { + UserInputAST::and(fnd.into_iter().next().unwrap()) //< safe + } else { + let conjunctions = fnd + .into_iter() + .map(UserInputAST::and) + .collect(); + UserInputAST::or(conjunctions) + } + }) + ) + .or( + sep_by(leaf(), spaces()) + .map(|subqueries: Vec| { + if subqueries.len() == 1 { + subqueries.into_iter().next().unwrap() + } else { + UserInputAST::Clause(subqueries.into_iter().collect()) + } + }) + ) + ) + } } @@ -128,6 +227,40 @@ mod test { assert!(parse_to_ast().parse(query).is_err()); } + #[test] + fn test_parse_query_to_ast_not_op() { + assert_eq!( + format!("{:?}", parse_to_ast().parse("NOT")), + "Err(UnexpectedParse)" + ); + test_parse_query_to_ast_helper("NOTa", "\"NOTa\""); + test_parse_query_to_ast_helper("NOT a", "-(\"a\")"); + } + + #[test] + fn test_parse_query_to_ast_binary_op() { + test_parse_query_to_ast_helper("a AND b", "(+(\"a\") +(\"b\"))"); + test_parse_query_to_ast_helper("a OR b", "(?(\"a\") ?(\"b\"))"); + test_parse_query_to_ast_helper("a OR b AND c", "(?(\"a\") ?((+(\"b\") +(\"c\"))))"); + test_parse_query_to_ast_helper("a AND b AND c", "(+(\"a\") +(\"b\") +(\"c\"))"); + assert_eq!( + format!("{:?}", parse_to_ast().parse("a OR b aaa")), + "Err(UnexpectedParse)" + ); + assert_eq!( + format!("{:?}", parse_to_ast().parse("a AND b aaa")), + "Err(UnexpectedParse)" + ); + assert_eq!( + format!("{:?}", parse_to_ast().parse("aaa a OR b ")), + "Err(UnexpectedParse)" + ); + assert_eq!( + format!("{:?}", parse_to_ast().parse("aaa ccc a OR b ")), + "Err(UnexpectedParse)" + ); + } + #[test] fn test_parse_query_to_ast() { test_parse_query_to_ast_helper("+(a b) +d", "(+((\"a\" \"b\")) +(\"d\"))"); diff --git a/src/query/query_parser/query_parser.rs b/src/query/query_parser/query_parser.rs index b31837f44..b74b35a14 100644 --- a/src/query/query_parser/query_parser.rs +++ b/src/query/query_parser/query_parser.rs @@ -3,8 +3,11 @@ use super::query_grammar::parse_to_ast; use super::user_input_ast::*; use combine::Parser; use core::Index; +use query::occur::compose_occur; +use query::query_parser::logical_ast::LogicalAST; use query::AllQuery; use query::BooleanQuery; +use query::EmptyQuery; use query::Occur; use query::PhraseQuery; use query::Query; @@ -55,6 +58,27 @@ impl From for QueryParserError { } } +/// Recursively remove empty clause from the AST +/// +/// Returns `None` iff the `logical_ast` ended up being empty. 
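A hypothetical test module for the trimming rules described above, relying on the crate-private `LogicalAST`, `LogicalLiteral` and `Occur` names already imported in this file being reachable through `use super::*`:

#[cfg(test)]
mod trim_ast_tests {
    use super::*;

    #[test]
    fn test_trim_ast_drops_empty_clauses() {
        // An empty clause disappears entirely.
        assert!(trim_ast(LogicalAST::Clause(Vec::new())).is_none());
        // A clause that only wraps empty clauses also trims to `None`.
        let nested = LogicalAST::Clause(vec![(Occur::Should, LogicalAST::Clause(Vec::new()))]);
        assert!(trim_ast(nested).is_none());
        // Anything else is passed through untouched.
        assert!(trim_ast(LogicalAST::Leaf(Box::new(LogicalLiteral::All))).is_some());
    }
}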
+fn trim_ast(logical_ast: LogicalAST) -> Option<LogicalAST> { + match logical_ast { + LogicalAST::Clause(children) => { + let trimmed_children = children + .into_iter() + .flat_map(|(occur, child)| { + trim_ast(child).map(|trimmed_child| (occur, trimmed_child)) + }).collect::<Vec<_>>(); + if trimmed_children.is_empty() { + None + } else { + Some(LogicalAST::Clause(trimmed_children)) + } + } + _ => Some(logical_ast), + } +} + /// Tantivy's Query parser /// /// The language covered by the current parser is extremely simple. @@ -77,12 +101,22 @@ impl From for QueryParserError { /// /// Switching to a default of `AND` can be done by calling `.set_conjunction_by_default()`. /// +/// +/// * boolean operators `AND`, `OR`. `AND` takes precedence over `OR`, so that `a AND b OR c` is interpreted +/// as `(a AND b) OR c`. +/// +/// * In addition to the boolean operators, the `-` and `+` prefix operators can be used. These operators +/// are sufficient to express all queries using boolean operators. For instance `x AND y OR z` can +/// be written (`(+x +y) z`). In addition, these operators can help define "required optional" +/// queries. `(+x y)` matches the same document set as simply `x`, but `y` will help refine the score. +/// /// * negative terms: By prepending a term by a `-`, a term can be excluded /// from the search. This is useful for disambiguating a query. /// e.g. `apple -fruit` /// /// * must terms: By prepending a term by a `+`, a term can be made required for the search. /// + /// /// * phrase terms: Quoted terms become phrase searches on fields that have positions indexed. /// e.g., `title:"Barack Obama"` will only find documents that have "barack" immediately followed /// by "obama". @@ -173,8 +207,8 @@ impl QueryParser { return convert_to_query(logical_ast); } - // we have no idea what you want, so here's everything - Box::new(AllQuery) + // we have no idea what you want, so here's nothing + Box::new(EmptyQuery) } /// Parse the user query into an AST. @@ -186,15 +220,11 @@ impl QueryParser { // query input // # Escape special characters: \\+-&|!(){}[]^~*?:\/ - let special_chars = vec![ - "\\", "+", "-", "&", "|", "!", "(", ")", "{", "}", "[", "]", "^", "~", "*", "?", ":", - "/", - ]; - - let mut scrubbed_query = query.to_string(); - for c in special_chars.iter() { - scrubbed_query = scrubbed_query.replace(c, &format!("{}", c)); - } + let special_chars = "\\+-&|!(){}[]^~*?:/"; + let mut scrubbed_query = query + .chars() + .filter(|c| !special_chars.contains(*c)) + .collect::<String>(); // AND, OR and NOT are used by tantivy as logical operators.
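A sketch of the new syntax exercised end to end; the schema, the `body` field and the query strings are made up for illustration:

extern crate tantivy;
use tantivy::query::QueryParser;
use tantivy::schema::{SchemaBuilder, TEXT};
use tantivy::Index;

fn main() {
    let mut schema_builder = SchemaBuilder::default();
    let body = schema_builder.add_text_field("body", TEXT);
    let index = Index::create_in_ram(schema_builder.build());
    let query_parser = QueryParser::for_index(&index, vec![body]);
    // `AND` binds tighter than `OR`: this parses as `(diesel AND engine) OR turbine`.
    let boolean = query_parser.parse_query("diesel AND engine OR turbine").unwrap();
    // The `+`/`-` prefixes still work: `engine` is required, `turbine` is excluded.
    let prefixed = query_parser.parse_query("+engine -turbine").unwrap();
    println!("{:?}\n{:?}", boolean, prefixed);
}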
We need // to escape them @@ -208,11 +238,10 @@ impl QueryParser { if quote_count % 2 == 1 { scrubbed_query = scrubbed_query.replace("\"", "\\\""); } - + let (user_input_ast, _remaining) = parse_to_ast() .parse(scrubbed_query.as_str()) .map_err(|_| QueryParserError::SyntaxError)?; - self.compute_logical_ast(user_input_ast) } @@ -373,23 +402,61 @@ impl QueryParser { let default_occur = self.default_occur(); let mut logical_sub_queries: Vec<(Occur, LogicalAST)> = Vec::new(); for sub_query in sub_queries { - let (occur, sub_ast) = self.compute_logical_ast_with_occur(*sub_query)?; + let (occur, sub_ast) = self.compute_logical_ast_with_occur(sub_query)?; let new_occur = compose_occur(default_occur, occur); logical_sub_queries.push((new_occur, sub_ast)); } Ok((Occur::Should, LogicalAST::Clause(logical_sub_queries))) } - UserInputAST::Not(subquery) => { - let (occur, logical_sub_queries) = + UserInputAST::Unary(left_occur, subquery) => { + let (right_occur, logical_sub_queries) = self.compute_logical_ast_with_occur(*subquery)?; - Ok((compose_occur(Occur::MustNot, occur), logical_sub_queries)) + Ok((compose_occur(left_occur, right_occur), logical_sub_queries)) } - UserInputAST::Must(subquery) => { - let (occur, logical_sub_queries) = - self.compute_logical_ast_with_occur(*subquery)?; - Ok((compose_occur(Occur::Must, occur), logical_sub_queries)) + UserInputAST::Leaf(leaf) => { + let result_ast = self.compute_logical_ast_from_leaf(*leaf)?; + Ok((Occur::Should, result_ast)) } - UserInputAST::Range { + } + } + + fn compute_logical_ast_from_leaf( + &self, + leaf: UserInputLeaf, + ) -> Result { + match leaf { + UserInputLeaf::Literal(literal) => { + let term_phrases: Vec<(Field, String)> = match literal.field_name { + Some(ref field_name) => { + let field = self.resolve_field_name(field_name)?; + vec![(field, literal.phrase.clone())] + } + None => { + if self.default_fields.is_empty() { + return Err(QueryParserError::NoDefaultFieldDeclared); + } else { + self.default_fields + .iter() + .map(|default_field| (*default_field, literal.phrase.clone())) + .collect::>() + } + } + }; + let mut asts: Vec = Vec::new(); + for (field, phrase) in term_phrases { + if let Some(ast) = self.compute_logical_ast_for_leaf(field, &phrase)? { + asts.push(LogicalAST::Leaf(Box::new(ast))); + } + } + let result_ast: LogicalAST = if asts.len() == 1 { + asts.into_iter().next().unwrap() + } else { + LogicalAST::Clause(asts.into_iter().map(|ast| (Occur::Should, ast)).collect()) + }; + Ok(result_ast) + } + UserInputLeaf::All => Ok(LogicalAST::Leaf(Box::new(LogicalLiteral::All))), + UserInputLeaf::Range { field, lower, upper, @@ -417,65 +484,7 @@ impl QueryParser { .collect(), ) }; - Ok((Occur::Should, result_ast)) - } - UserInputAST::All => Ok(( - Occur::Should, - LogicalAST::Leaf(Box::new(LogicalLiteral::All)), - )), - UserInputAST::Leaf(literal) => { - let term_phrases: Vec<(Field, String)> = match literal.field_name { - Some(ref field_name) => { - let field = self.resolve_field_name(field_name)?; - vec![(field, literal.phrase.clone())] - } - None => { - if self.default_fields.is_empty() { - return Err(QueryParserError::NoDefaultFieldDeclared); - } else { - self.default_fields - .iter() - .map(|default_field| (*default_field, literal.phrase.clone())) - .collect::>() - } - } - }; - let mut asts: Vec = Vec::new(); - for (field, phrase) in term_phrases { - if let Some(ast) = self.compute_logical_ast_for_leaf(field, &phrase)? 
{ - asts.push(LogicalAST::Leaf(Box::new(ast))); - } - } - let result_ast = if asts.is_empty() { - // this should never happen - return Err(QueryParserError::SyntaxError); - } else if asts.len() == 1 { - asts[0].clone() - } else { - LogicalAST::Clause(asts.into_iter().map(|ast| (Occur::Should, ast)).collect()) - }; - Ok((Occur::Should, result_ast)) - } - } - } -} - -/// Compose two occur values. -fn compose_occur(left: Occur, right: Occur) -> Occur { - match left { - Occur::Should => right, - Occur::Must => { - if right == Occur::MustNot { - Occur::MustNot - } else { - Occur::Must - } - } - Occur::MustNot => { - if right == Occur::MustNot { - Occur::Must - } else { - Occur::MustNot + Ok(result_ast) } } } @@ -492,21 +501,30 @@ fn convert_literal_to_query(logical_literal: LogicalLiteral) -> Box { value_type, lower, upper, - } => Box::new(RangeQuery::new_term_bounds(field, value_type, lower, upper)), + } => Box::new(RangeQuery::new_term_bounds( + field, value_type, &lower, &upper, + )), LogicalLiteral::All => Box::new(AllQuery), } } fn convert_to_query(logical_ast: LogicalAST) -> Box { - match logical_ast { - LogicalAST::Clause(clause) => { - let occur_subqueries = clause + match trim_ast(logical_ast) { + Some(LogicalAST::Clause(trimmed_clause)) => { + let occur_subqueries = trimmed_clause .into_iter() .map(|(occur, subquery)| (occur, convert_to_query(subquery))) .collect::>(); + assert!( + !occur_subqueries.is_empty(), + "Should not be empty after trimming" + ); Box::new(BooleanQuery::from(occur_subqueries)) } - LogicalAST::Leaf(logical_literal) => convert_literal_to_query(*logical_literal), + Some(LogicalAST::Leaf(trimmed_logical_literal)) => { + convert_literal_to_query(*trimmed_logical_literal) + } + None => Box::new(EmptyQuery), } } @@ -519,12 +537,17 @@ mod test { use schema::Field; use schema::{IndexRecordOption, TextFieldIndexing, TextOptions}; use schema::{SchemaBuilder, Term, INT_INDEXED, STORED, STRING, TEXT}; - use tokenizer::SimpleTokenizer; - use tokenizer::TokenizerManager; + use tokenizer::{LowerCaser, SimpleTokenizer, StopWordFilter, Tokenizer, TokenizerManager}; use Index; fn make_query_parser() -> QueryParser { let mut schema_builder = SchemaBuilder::default(); + let text_field_indexing = TextFieldIndexing::default() + .set_tokenizer("en_with_stop_words") + .set_index_option(IndexRecordOption::WithFreqsAndPositions); + let text_options = TextOptions::default() + .set_indexing_options(text_field_indexing) + .set_stored(); let title = schema_builder.add_text_field("title", TEXT); let text = schema_builder.add_text_field("text", TEXT); schema_builder.add_i64_field("signed", INT_INDEXED); @@ -533,9 +556,16 @@ mod test { schema_builder.add_text_field("notindexed_u64", STORED); schema_builder.add_text_field("notindexed_i64", STORED); schema_builder.add_text_field("nottokenized", STRING); + schema_builder.add_text_field("with_stop_words", text_options); let schema = schema_builder.build(); let default_fields = vec![title, text]; let tokenizer_manager = TokenizerManager::default(); + tokenizer_manager.register( + "en_with_stop_words", + SimpleTokenizer + .filter(LowerCaser) + .filter(StopWordFilter::remove(vec!["the".to_string()])), + ); QueryParser::new(schema, default_fields, tokenizer_manager) } @@ -567,7 +597,7 @@ mod test { } #[test] - pub fn test_parse_query_leient_no_panics() { + pub fn test_parse_query_lenient_no_panics() { let query_parser = make_query_parser(); query_parser.parse_query_lenient("toto"); @@ -576,7 +606,7 @@ mod test { } #[test] - pub fn 
test_parse_query_leient_escapes_bad_queries() { + pub fn test_parse_query_lenient_escapes_bad_queries() { let query_parser = make_query_parser(); let query = query_parser @@ -623,6 +653,16 @@ mod test { ); } + #[test] + pub fn test_parse_query_empty() { + test_parse_query_to_logical_ast_helper("", "", false); + test_parse_query_to_logical_ast_helper(" ", "", false); + let query_parser = make_query_parser(); + let query_result = query_parser.parse_query(""); + let query = query_result.unwrap(); + assert_eq!(format!("{:?}", query), "EmptyQuery"); + } + #[test] pub fn test_parse_query_ints() { let query_parser = make_query_parser(); @@ -802,6 +842,13 @@ mod test { ); } + #[test] + pub fn test_query_parser_not_empty_but_no_tokens() { + let query_parser = make_query_parser(); + assert!(query_parser.parse_query(" !, ").is_ok()); + assert!(query_parser.parse_query("with_stop_words:the").is_ok()); + } + #[test] pub fn test_parse_query_to_ast_conjunction() { test_parse_query_to_logical_ast_helper( diff --git a/src/query/query_parser/user_input_ast.rs b/src/query/query_parser/user_input_ast.rs index 96606915d..ee749ad38 100644 --- a/src/query/query_parser/user_input_ast.rs +++ b/src/query/query_parser/user_input_ast.rs @@ -1,4 +1,39 @@ use std::fmt; +use std::fmt::{Debug, Formatter}; + +use query::Occur; + +pub enum UserInputLeaf { + Literal(UserInputLiteral), + All, + Range { + field: Option, + lower: UserInputBound, + upper: UserInputBound, + }, +} + +impl Debug for UserInputLeaf { + fn fmt(&self, formatter: &mut Formatter) -> Result<(), fmt::Error> { + match self { + UserInputLeaf::Literal(literal) => literal.fmt(formatter), + UserInputLeaf::Range { + ref field, + ref lower, + ref upper, + } => { + if let Some(ref field) = field { + write!(formatter, "{}:", field)?; + } + lower.display_lower(formatter)?; + write!(formatter, " TO ")?; + upper.display_upper(formatter)?; + Ok(()) + } + UserInputLeaf::All => write!(formatter, "*"), + } + } +} pub struct UserInputLiteral { pub field_name: Option, @@ -43,28 +78,54 @@ impl UserInputBound { } pub enum UserInputAST { - Clause(Vec>), - Not(Box), - Must(Box), - Range { - field: Option, - lower: UserInputBound, - upper: UserInputBound, - }, - All, - Leaf(Box), + Clause(Vec), + Unary(Occur, Box), + Leaf(Box), } -impl From for UserInputAST { - fn from(literal: UserInputLiteral) -> UserInputAST { - UserInputAST::Leaf(Box::new(literal)) +impl UserInputAST { + pub fn unary(self, occur: Occur) -> UserInputAST { + UserInputAST::Unary(occur, Box::new(self)) + } + + fn compose(occur: Occur, asts: Vec) -> UserInputAST { + assert_ne!(occur, Occur::MustNot); + assert!(!asts.is_empty()); + if asts.len() == 1 { + asts.into_iter().next().unwrap() //< safe + } else { + UserInputAST::Clause( + asts.into_iter() + .map(|ast: UserInputAST| ast.unary(occur)) + .collect::>(), + ) + } + } + + pub fn and(asts: Vec) -> UserInputAST { + UserInputAST::compose(Occur::Must, asts) + } + + pub fn or(asts: Vec) -> UserInputAST { + UserInputAST::compose(Occur::Should, asts) + } +} + +impl From for UserInputLeaf { + fn from(literal: UserInputLiteral) -> UserInputLeaf { + UserInputLeaf::Literal(literal) + } +} + +impl From for UserInputAST { + fn from(leaf: UserInputLeaf) -> UserInputAST { + UserInputAST::Leaf(Box::new(leaf)) } } impl fmt::Debug for UserInputAST { fn fmt(&self, formatter: &mut fmt::Formatter) -> Result<(), fmt::Error> { match *self { - UserInputAST::Must(ref subquery) => write!(formatter, "+({:?})", subquery), UserInputAST::Clause(ref subqueries) => { if subqueries.is_empty() 
{ write!(formatter, "")?; @@ -78,21 +139,9 @@ impl fmt::Debug for UserInputAST { } Ok(()) } - UserInputAST::Not(ref subquery) => write!(formatter, "-({:?})", subquery), - UserInputAST::Range { - ref field, - ref lower, - ref upper, - } => { - if let &Some(ref field) = field { - write!(formatter, "{}:", field)?; - } - lower.display_lower(formatter)?; - write!(formatter, " TO ")?; - upper.display_upper(formatter)?; - Ok(()) + UserInputAST::Unary(ref occur, ref subquery) => { + write!(formatter, "{}({:?})", occur.to_char(), subquery) } - UserInputAST::All => write!(formatter, "*"), UserInputAST::Leaf(ref subquery) => write!(formatter, "{:?}", subquery), } } diff --git a/src/query/range_query.rs b/src/query/range_query.rs index 17d09657f..43da4bd8c 100644 --- a/src/query/range_query.rs +++ b/src/query/range_query.rs @@ -1,7 +1,7 @@ use common::BitSet; use core::Searcher; use core::SegmentReader; -use error::ErrorKind; +use error::TantivyError; use query::BitSetDocSet; use query::ConstScorer; use query::{Query, Scorer, Weight}; @@ -68,7 +68,7 @@ fn map_bound TTo>( /// let docs_in_the_sixties = RangeQuery::new_u64(year_field, 1960..1970); /// /// let mut count_collector = CountCollector::default(); -/// docs_in_the_sixties.search(&*searcher, &mut count_collector)?; +/// docs_in_the_sixties.search(&searcher, &mut count_collector)?; /// /// let num_60s_books = count_collector.count(); /// @@ -96,8 +96,8 @@ impl RangeQuery { pub fn new_term_bounds( field: Field, value_type: Type, - left_bound: Bound, - right_bound: Bound, + left_bound: &Bound, + right_bound: &Bound, ) -> RangeQuery { let verify_and_unwrap_term = |val: &Term| { assert_eq!(field, val.field()); @@ -184,11 +184,7 @@ impl RangeQuery { /// /// If the field is not of the type `Str`, tantivy /// will panic when the `Weight` object is created. - pub fn new_str_bounds<'b>( - field: Field, - left: Bound<&'b str>, - right: Bound<&'b str>, - ) -> RangeQuery { + pub fn new_str_bounds(field: Field, left: Bound<&str>, right: Bound<&str>) -> RangeQuery { let make_term_val = |val: &&str| val.as_bytes().to_vec(); RangeQuery { field, @@ -202,7 +198,7 @@ impl RangeQuery { /// /// If the field is not of the type `Str`, tantivy /// will panic when the `Weight` object is created. - pub fn new_str<'b>(field: Field, range: Range<&'b str>) -> RangeQuery { + pub fn new_str(field: Field, range: Range<&str>) -> RangeQuery { RangeQuery::new_str_bounds( field, Bound::Included(range.start), @@ -239,7 +235,7 @@ impl Query for RangeQuery { "Create a range query of the type {:?}, when the field given was of type {:?}", self.value_type, value_type ); - bail!(ErrorKind::SchemaError(err_msg)) + return Err(TantivyError::SchemaError(err_msg)); } Ok(Box::new(RangeWeight { field: self.field, @@ -332,7 +328,7 @@ mod tests { // ... or `1960..=1969` if inclusive range is enabled. 
let mut count_collector = CountCollector::default(); - docs_in_the_sixties.search(&*searcher, &mut count_collector)?; + docs_in_the_sixties.search(&searcher, &mut count_collector)?; assert_eq!(count_collector.count(), 2285); Ok(()) } @@ -369,9 +365,7 @@ mod tests { let searcher = index.searcher(); let count_multiples = |range_query: RangeQuery| { let mut count_collector = CountCollector::default(); - range_query - .search(&*searcher, &mut count_collector) - .unwrap(); + range_query.search(&searcher, &mut count_collector).unwrap(); count_collector.count() }; diff --git a/src/query/regex_query.rs b/src/query/regex_query.rs index 8b930212a..dcdd9bdff 100644 --- a/src/query/regex_query.rs +++ b/src/query/regex_query.rs @@ -1,4 +1,4 @@ -use error::ErrorKind; +use error::TantivyError; use fst_regex::Regex; use query::{AutomatonWeight, Query, Weight}; use schema::Field; @@ -80,9 +80,9 @@ impl RegexQuery { fn specialized_weight(&self) -> Result> { let automaton = Regex::new(&self.regex_pattern) - .map_err(|_| ErrorKind::InvalidArgument(self.regex_pattern.clone()))?; + .map_err(|_| TantivyError::InvalidArgument(self.regex_pattern.clone()))?; - Ok(AutomatonWeight::new(self.field.clone(), automaton)) + Ok(AutomatonWeight::new(self.field, automaton)) } } @@ -123,7 +123,7 @@ mod test { let mut collector = TopCollector::with_limit(2); let regex_query = RegexQuery::new("jap[ao]n".to_string(), country_field); searcher.search(®ex_query, &mut collector).unwrap(); - let scored_docs = collector.score_docs(); + let scored_docs = collector.top_docs(); assert_eq!(scored_docs.len(), 1, "Expected only 1 document"); let (score, _) = scored_docs[0]; assert_nearly_equals(1f32, score); @@ -132,7 +132,7 @@ mod test { let mut collector = TopCollector::with_limit(2); let regex_query = RegexQuery::new("jap[A-Z]n".to_string(), country_field); searcher.search(®ex_query, &mut collector).unwrap(); - let scored_docs = collector.score_docs(); + let scored_docs = collector.top_docs(); assert_eq!(scored_docs.len(), 0, "Expected ZERO document"); } } diff --git a/src/query/scorer.rs b/src/query/scorer.rs index a94b03a5b..2c2f0cd62 100644 --- a/src/query/scorer.rs +++ b/src/query/scorer.rs @@ -50,34 +50,6 @@ impl Scorer for Box { } } -/// `EmptyScorer` is a dummy `Scorer` in which no document matches. -/// -/// It is useful for tests and handling edge cases. -pub struct EmptyScorer; - -impl DocSet for EmptyScorer { - fn advance(&mut self) -> bool { - false - } - - fn doc(&self) -> DocId { - panic!( - "You may not call .doc() on a scorer \ - where the last call to advance() did not return true." - ); - } - - fn size_hint(&self) -> u32 { - 0 - } -} - -impl Scorer for EmptyScorer { - fn score(&mut self) -> Score { - 0f32 - } -} - /// Wraps a `DocSet` and simply returns a constant `Scorer`. /// The `ConstScorer` is useful if you have a `DocSet` where /// you needed a scorer. 
@@ -135,21 +107,3 @@ impl Scorer for ConstScorer { 1f32 } } - -#[cfg(test)] -mod tests { - use super::EmptyScorer; - use DocSet; - - #[test] - fn test_empty_scorer() { - let mut empty_scorer = EmptyScorer; - assert!(!empty_scorer.advance()); - } - - #[test] - #[should_panic] - fn test_empty_scorer_panic_on_doc_call() { - EmptyScorer.doc(); - } -} diff --git a/src/query/term_query/mod.rs b/src/query/term_query/mod.rs index e8a865e02..bf5171016 100644 --- a/src/query/term_query/mod.rs +++ b/src/query/term_query/mod.rs @@ -72,7 +72,7 @@ mod tests { let term = Term::from_field_text(left_field, "left2"); let term_query = TermQuery::new(term, IndexRecordOption::WithFreqs); searcher.search(&term_query, &mut collector).unwrap(); - let scored_docs = collector.score_docs(); + let scored_docs = collector.top_docs(); assert_eq!(scored_docs.len(), 1); let (score, _) = scored_docs[0]; assert_nearly_equals(0.77802235, score); @@ -82,7 +82,7 @@ mod tests { let term = Term::from_field_text(left_field, "left1"); let term_query = TermQuery::new(term, IndexRecordOption::WithFreqs); searcher.search(&term_query, &mut collector).unwrap(); - let scored_docs = collector.score_docs(); + let scored_docs = collector.top_docs(); assert_eq!(scored_docs.len(), 2); let (score1, _) = scored_docs[0]; assert_nearly_equals(0.27101856, score1); @@ -94,7 +94,7 @@ mod tests { let query = query_parser.parse_query("left:left2 left:left1").unwrap(); let mut collector = TopCollector::with_limit(2); searcher.search(&*query, &mut collector).unwrap(); - let scored_docs = collector.score_docs(); + let scored_docs = collector.top_docs(); assert_eq!(scored_docs.len(), 2); let (score1, _) = scored_docs[0]; assert_nearly_equals(0.9153879, score1); diff --git a/src/query/term_query/term_query.rs b/src/query/term_query/term_query.rs index 9ba10b307..267ca9ba7 100644 --- a/src/query/term_query/term_query.rs +++ b/src/query/term_query/term_query.rs @@ -3,6 +3,7 @@ use query::bm25::BM25Weight; use query::Query; use query::Weight; use schema::IndexRecordOption; +use std::collections::BTreeSet; use Result; use Searcher; use Term; @@ -110,4 +111,7 @@ impl Query for TermQuery { fn weight(&self, searcher: &Searcher, scoring_enabled: bool) -> Result> { Ok(Box::new(self.specialized_weight(searcher, scoring_enabled))) } + fn query_terms(&self, term_set: &mut BTreeSet) { + term_set.insert(self.term.clone()); + } } diff --git a/src/query/union.rs b/src/query/union.rs index 58d5de242..5bbe902a0 100644 --- a/src/query/union.rs +++ b/src/query/union.rs @@ -55,8 +55,7 @@ where None } }, - ) - .collect(); + ).collect(); Union { docsets: non_empty_docsets, bitsets: Box::new([TinySet::empty(); HORIZON_NUM_TINYBITSETS]), @@ -215,6 +214,10 @@ where // The target is outside of the buffered horizon. // advance all docsets to a doc >= to the target. 
+ #[cfg_attr( + feature = "cargo-clippy", + allow(clippy::collapsible_if) + )] unordered_drain_filter(&mut self.docsets, |docset| { if docset.doc() < target { if docset.skip_next(target) == SkipResult::End { diff --git a/src/schema/document.rs b/src/schema/document.rs index c6a508a94..7254c9660 100644 --- a/src/schema/document.rs +++ b/src/schema/document.rs @@ -113,7 +113,7 @@ impl Document { .into_iter() .group_by(|field_value| field_value.field()) .into_iter() - .map(|(key, group)| (key, group.into_iter().collect())) + .map(|(key, group)| (key, group.collect())) .collect::)>>() } diff --git a/src/schema/facet.rs b/src/schema/facet.rs index 429766c85..bb685c277 100644 --- a/src/schema/facet.rs +++ b/src/schema/facet.rs @@ -97,16 +97,12 @@ impl Facet { } /// Returns `true` iff other is a subfacet of `self`. - #[allow(collapsible_if)] pub fn is_prefix_of(&self, other: &Facet) -> bool { let self_bytes: &[u8] = self.encoded_bytes(); let other_bytes: &[u8] = other.encoded_bytes(); - if self_bytes.len() < other_bytes.len() { - if other_bytes.starts_with(self_bytes) { - return other_bytes[self_bytes.len()] == 0u8; - } - } - false + self_bytes.len() < other_bytes.len() + && other_bytes.starts_with(self_bytes) + && other_bytes[self_bytes.len()] == 0u8 } } diff --git a/src/schema/index_record_option.rs b/src/schema/index_record_option.rs index bd2deaaac..4a595a8f0 100644 --- a/src/schema/index_record_option.rs +++ b/src/schema/index_record_option.rs @@ -30,16 +30,16 @@ pub enum IndexRecordOption { impl IndexRecordOption { /// Returns true iff the term frequency will be encoded. - pub fn is_termfreq_enabled(&self) -> bool { - match *self { + pub fn is_termfreq_enabled(self) -> bool { + match self { IndexRecordOption::WithFreqsAndPositions | IndexRecordOption::WithFreqs => true, _ => false, } } /// Returns true iff the term positions within the document are stored as well. - pub fn is_position_enabled(&self) -> bool { - match *self { + pub fn is_position_enabled(self) -> bool { + match self { IndexRecordOption::WithFreqsAndPositions => true, _ => false, } @@ -47,8 +47,8 @@ impl IndexRecordOption { /// Returns true iff this option includes encoding /// term frequencies. - pub fn has_freq(&self) -> bool { - match *self { + pub fn has_freq(self) -> bool { + match self { IndexRecordOption::Basic => false, IndexRecordOption::WithFreqs | IndexRecordOption::WithFreqsAndPositions => true, } @@ -56,8 +56,8 @@ impl IndexRecordOption { /// Returns true iff this option include encoding /// term positions.
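A small illustrative assertion block for the three variants (doc ids only, doc ids plus term frequencies, doc ids plus frequencies and positions):

extern crate tantivy;
use tantivy::schema::IndexRecordOption;

fn main() {
    // Basic: doc ids only.
    assert!(!IndexRecordOption::Basic.has_freq());
    assert!(!IndexRecordOption::Basic.has_positions());
    // WithFreqs: doc ids + term frequencies.
    assert!(IndexRecordOption::WithFreqs.has_freq());
    assert!(!IndexRecordOption::WithFreqs.has_positions());
    // WithFreqsAndPositions: doc ids + term frequencies + positions.
    assert!(IndexRecordOption::WithFreqsAndPositions.has_freq());
    assert!(IndexRecordOption::WithFreqsAndPositions.has_positions());
}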
- pub fn has_positions(&self) -> bool { - match *self { + pub fn has_positions(self) -> bool { + match self { IndexRecordOption::Basic | IndexRecordOption::WithFreqs => false, IndexRecordOption::WithFreqsAndPositions => true, } diff --git a/src/schema/schema.rs b/src/schema/schema.rs index 6d4f6c949..85d8d14f3 100644 --- a/src/schema/schema.rs +++ b/src/schema/schema.rs @@ -441,10 +441,12 @@ mod tests { "count": 4, "popularity": 10 }"#, - ) - .unwrap(); - assert_eq!(doc.get_first(title_field).unwrap().text(), "my title"); - assert_eq!(doc.get_first(author_field).unwrap().text(), "fulmicoton"); + ).unwrap(); + assert_eq!(doc.get_first(title_field).unwrap().text(), Some("my title")); + assert_eq!( + doc.get_first(author_field).unwrap().text(), + Some("fulmicoton") + ); assert_eq!(doc.get_first(count_field).unwrap().u64_value(), 4); assert_eq!(doc.get_first(popularity_field).unwrap().i64_value(), 10); } diff --git a/src/schema/value.rs b/src/schema/value.rs index f5ce151f1..64b0dc795 100644 --- a/src/schema/value.rs +++ b/src/schema/value.rs @@ -74,10 +74,10 @@ impl Value { /// /// # Panics /// If the value is not of type `Str` - pub fn text(&self) -> &str { + pub fn text(&self) -> Option<&str> { match *self { - Value::Str(ref text) => text, - _ => panic!("This is not a text field."), + Value::Str(ref text) => Some(text), + _ => None, } } diff --git a/src/snippet/mod.rs b/src/snippet/mod.rs new file mode 100644 index 000000000..2c6d3d012 --- /dev/null +++ b/src/snippet/mod.rs @@ -0,0 +1,573 @@ +use htmlescape::encode_minimal; +use query::Query; +use schema::Field; +use std::cmp::Ordering; +use std::collections::BTreeMap; +use std::collections::BTreeSet; +use tokenizer::BoxedTokenizer; +use tokenizer::{Token, TokenStream}; +use Document; +use Result; +use Searcher; + +const DEFAULT_MAX_NUM_CHARS: usize = 150; + +#[derive(Debug)] +pub struct HighlightSection { + start: usize, + stop: usize, +} + +impl HighlightSection { + fn new(start: usize, stop: usize) -> HighlightSection { + HighlightSection { start, stop } + } +} + +#[derive(Debug)] +pub struct FragmentCandidate { + score: f32, + start_offset: usize, + stop_offset: usize, + num_chars: usize, + highlighted: Vec, +} + +impl FragmentCandidate { + /// Create a basic `FragmentCandidate` + /// + /// `score`, `num_chars` are set to 0 + /// and `highlighted` is set to empty vec + /// stop_offset is set to start_offset, which is taken as a param. + fn new(start_offset: usize) -> FragmentCandidate { + FragmentCandidate { + score: 0.0, + start_offset, + stop_offset: start_offset, + num_chars: 0, + highlighted: vec![], + } + } + + /// Updates `score` and `highlighted` fields of the objects. + /// + /// taking the token and terms, the token is added to the fragment. + /// if the token is one of the terms, the score + /// and highlighted fields are updated in the fragment. + fn try_add_token(&mut self, token: &Token, terms: &BTreeMap) { + self.stop_offset = token.offset_to; + + if let Some(score) = terms.get(&token.text.to_lowercase()) { + self.score += score; + self.highlighted + .push(HighlightSection::new(token.offset_from, token.offset_to)); + } + } +} + +#[derive(Debug)] +pub struct Snippet { + fragments: String, + highlighted: Vec, +} + +const HIGHLIGHTEN_PREFIX: &str = ""; +const HIGHLIGHTEN_POSTFIX: &str = ""; + +impl Snippet { + pub fn empty() -> Snippet { + Snippet { + fragments: String::new(), + highlighted: Vec::new(), + } + } + + /// Returns a hignlightned html from the `Snippet`. 
+ pub fn to_html(&self) -> String { + let mut html = String::new(); + let mut start_from: usize = 0; + + for item in self.highlighted.iter() { + html.push_str(&encode_minimal(&self.fragments[start_from..item.start])); + html.push_str(HIGHLIGHTEN_PREFIX); + html.push_str(&encode_minimal(&self.fragments[item.start..item.stop])); + html.push_str(HIGHLIGHTEN_POSTFIX); + start_from = item.stop; + } + html.push_str(&encode_minimal( + &self.fragments[start_from..self.fragments.len()], + )); + html + } +} + +/// Returns a list of "good" fragments. +/// +/// If no target term is within the text, then the function +/// should return an empty Vec. +/// +/// If a target term is within the text, then the returned +/// list is required to be non-empty. +/// +/// The returned list contains fewer +/// than 12 possibly overlapping fragments. +/// +/// All fragments should contain at least one target term +/// and have at most `max_num_chars` characters (not bytes). +/// +/// It is ok to emit overlapping fragments, for instance, +/// one short and one long containing the same keyword, in order +/// to leave optimization opportunity to the fragment selector +/// upstream. +/// +/// Fragments must be valid in the sense that `&text[fragment.start..fragment.stop]` +/// has to be a valid string. +fn search_fragments<'a>( + tokenizer: &BoxedTokenizer, + text: &'a str, + terms: &BTreeMap<String, f32>, + max_num_chars: usize, +) -> Vec<FragmentCandidate> { + let mut token_stream = tokenizer.token_stream(text); + let mut fragment = FragmentCandidate::new(0); + let mut fragments: Vec<FragmentCandidate> = vec![]; + + while let Some(next) = token_stream.next() { + if (next.offset_to - fragment.start_offset) > max_num_chars { + if fragment.score > 0.0 { + fragments.push(fragment) + }; + fragment = FragmentCandidate::new(next.offset_from); + } + fragment.try_add_token(next, &terms); + } + if fragment.score > 0.0 { + fragments.push(fragment) + } + + fragments +} + +/// Returns a Snippet +/// +/// Takes a vector of `FragmentCandidate`s and the text. +/// Figures out the best fragment from it and creates a snippet.
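Taken together, these two helpers form the snippet pipeline: `search_fragments` produces scored candidates and `select_best_fragment_combination` (defined next) picks the winner. A minimal, crate-internal sketch of that flow, mirroring the unit tests at the bottom of this file; `best_snippet_for` is a made-up name, and both helpers are private to this module, so this only compiles from inside it:

```rust
// Crate-internal sketch; mirrors the unit tests below.
use std::collections::BTreeMap;
use tokenizer::{box_tokenizer, SimpleTokenizer};

fn best_snippet_for(text: &str) -> Snippet {
    // Per-term weights; SnippetGenerator::new derives these from document frequencies.
    let mut terms: BTreeMap<String, f32> = BTreeMap::new();
    terms.insert("rust".to_string(), 1.0);
    terms.insert("language".to_string(), 0.9);

    let tokenizer = box_tokenizer(SimpleTokenizer);
    // Step 1: cut the text into candidate fragments of at most 100 chars,
    // scoring each candidate by the sum of the matched term weights.
    let fragments = search_fragments(&*tokenizer, text, &terms, 100);
    // Step 2: keep the best-scoring candidate and rebase its highlight offsets
    // onto the fragment itself.
    select_best_fragment_combination(&fragments[..], text)
}
```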
+fn select_best_fragment_combination(fragments: &[FragmentCandidate], text: &str) -> Snippet { + let best_fragment_opt = fragments.iter().max_by(|left, right| { + let cmp_score = left + .score + .partial_cmp(&right.score) + .unwrap_or(Ordering::Equal); + if cmp_score == Ordering::Equal { + (right.start_offset, right.stop_offset).cmp(&(left.start_offset, left.stop_offset)) + } else { + cmp_score + } + }); + if let Some(fragment) = best_fragment_opt { + let fragment_text = &text[fragment.start_offset..fragment.stop_offset]; + let highlighted = fragment + .highlighted + .iter() + .map(|item| { + HighlightSection::new( + item.start - fragment.start_offset, + item.stop - fragment.start_offset, + ) + }).collect(); + Snippet { + fragments: fragment_text.to_string(), + highlighted, + } + } else { + // when there no fragments to chose from, + // for now create a empty snippet + Snippet { + fragments: String::new(), + highlighted: vec![], + } + } +} + +/// `SnippetGenerator` +/// +/// # Example +/// +/// ```rust +/// # #[macro_use] +/// # extern crate tantivy; +/// # use tantivy::Index; +/// # use tantivy::schema::{SchemaBuilder, TEXT}; +/// # use tantivy::query::QueryParser; +/// use tantivy::SnippetGenerator; +/// +/// # fn main() -> tantivy::Result<()> { +/// # let mut schema_builder = SchemaBuilder::default(); +/// # let text_field = schema_builder.add_text_field("text", TEXT); +/// # let schema = schema_builder.build(); +/// # let index = Index::create_in_ram(schema); +/// # let mut index_writer = index.writer_with_num_threads(1, 30_000_000)?; +/// # let doc = doc!(text_field => r#"Comme je descendais des Fleuves impassibles, +/// # Je ne me sentis plus guidé par les haleurs : +/// # Des Peaux-Rouges criards les avaient pris pour cibles, +/// # Les ayant cloués nus aux poteaux de couleurs. +/// # +/// # J'étais insoucieux de tous les équipages, +/// # Porteur de blés flamands ou de cotons anglais. +/// # Quand avec mes haleurs ont fini ces tapages, +/// # Les Fleuves m'ont laissé descendre où je voulais. +/// # "#); +/// # index_writer.add_document(doc.clone()); +/// # index_writer.commit()?; +/// # let query_parser = QueryParser::for_index(&index, vec![text_field]); +/// // ... 
+/// let query = query_parser.parse_query("haleurs flamands").unwrap(); +/// # index.load_searchers()?; +/// # let searcher = index.searcher(); +/// let mut snippet_generator = SnippetGenerator::new(&searcher, &*query, text_field)?; +/// snippet_generator.set_max_num_chars(100); +/// let snippet = snippet_generator.snippet_from_doc(&doc); +/// let snippet_html: String = snippet.to_html(); +/// assert_eq!(snippet_html, "Comme je descendais des Fleuves impassibles,\n Je ne me sentis plus guidé par les haleurs :\n Des"); +/// # Ok(()) +/// # } +/// ``` +pub struct SnippetGenerator { + terms_text: BTreeMap, + tokenizer: Box, + field: Field, + max_num_chars: usize, +} + +impl SnippetGenerator { + /// Creates a new snippet generator + pub fn new(searcher: &Searcher, query: &Query, field: Field) -> Result { + let mut terms = BTreeSet::new(); + query.query_terms(&mut terms); + let terms_text: BTreeMap = terms + .into_iter() + .filter(|term| term.field() == field) + .flat_map(|term| { + let doc_freq = searcher.doc_freq(&term); + let score = 1f32 / (1f32 + doc_freq as f32); + if doc_freq > 0 { + Some((term.text().to_string(), score)) + } else { + None + } + }) + .collect(); + let tokenizer = searcher.index().tokenizer_for_field(field)?; + Ok(SnippetGenerator { + terms_text, + tokenizer, + field, + max_num_chars: DEFAULT_MAX_NUM_CHARS, + }) + } + + /// Sets a maximum number of chars. + pub fn set_max_num_chars(&mut self, max_num_chars: usize) { + self.max_num_chars = max_num_chars; + } + + #[cfg(test)] + pub fn terms_text(&self) -> &BTreeMap { + &self.terms_text + } + + /// Generates a snippet for the given `Document`. + /// + /// This method extract the text associated to the `SnippetGenerator`'s field + /// and computes a snippet. + pub fn snippet_from_doc(&self, doc: &Document) -> Snippet { + let text: String = doc + .get_all(self.field) + .into_iter() + .flat_map(|val| val.text()) + .collect::>() + .join(" "); + self.snippet(&text) + } + + /// Generates a snippet for the given text. + pub fn snippet(&self, text: &str) -> Snippet { + let fragment_candidates = search_fragments( + &*self.tokenizer, + &text, + &self.terms_text, + self.max_num_chars, + ); + select_best_fragment_combination(&fragment_candidates[..], &text) + } +} + +#[cfg(test)] +mod tests { + use super::{search_fragments, select_best_fragment_combination}; + use query::QueryParser; + use schema::{IndexRecordOption, SchemaBuilder, TextFieldIndexing, TextOptions, TEXT}; + use std::collections::BTreeMap; + use std::iter::Iterator; + use tokenizer::{box_tokenizer, SimpleTokenizer}; + use Index; + use SnippetGenerator; + + const TEST_TEXT: &'static str = + r#"Rust is a systems programming language sponsored by Mozilla which +describes it as a "safe, concurrent, practical language", supporting functional and +imperative-procedural paradigms. Rust is syntactically similar to C++[according to whom?], +but its designers intend it to provide better memory safety while still maintaining +performance. + +Rust is free and open-source software, released under an MIT License, or Apache License +2.0. Its designers have refined the language through the experiences of writing the Servo +web browser layout engine[14] and the Rust compiler. 
A large proportion of current commits +to the project are from community members.[15] + +Rust won first place for "most loved programming language" in the Stack Overflow Developer +Survey in 2016, 2017, and 2018."#; + + + + #[test] + fn test_snippet() { + let boxed_tokenizer = box_tokenizer(SimpleTokenizer); + let terms = btreemap! { + String::from("rust") => 1.0, + String::from("language") => 0.9 + }; + let fragments = search_fragments(&*boxed_tokenizer, TEST_TEXT, &terms, 100); + assert_eq!(fragments.len(), 7); + { + let first = &fragments[0]; + assert_eq!(first.score, 1.9); + assert_eq!(first.stop_offset, 89); + } + let snippet = select_best_fragment_combination(&fragments[..], &TEST_TEXT); + assert_eq!(snippet.fragments, "Rust is a systems programming language sponsored by \ + Mozilla which\ndescribes it as a \"safe"); + assert_eq!(snippet.to_html(), "Rust is a systems programming language \ + sponsored by Mozilla which\ndescribes it as a "safe") + } + + + #[test] + fn test_snippet_scored_fragment() { + let boxed_tokenizer = box_tokenizer(SimpleTokenizer); + { + let terms = btreemap! { + String::from("rust") =>1.0f32, + String::from("language") => 0.9f32 + }; + let fragments = search_fragments(&*boxed_tokenizer, TEST_TEXT, &terms, 20); + { + let first = &fragments[0]; + assert_eq!(first.score, 1.0); + assert_eq!(first.stop_offset, 17); + } + let snippet = select_best_fragment_combination(&fragments[..], &TEST_TEXT); + assert_eq!(snippet.to_html(), "Rust is a systems") + } + let boxed_tokenizer = box_tokenizer(SimpleTokenizer); + { + let terms = btreemap! { + String::from("rust") =>0.9f32, + String::from("language") => 1.0f32 + }; + let fragments = search_fragments(&*boxed_tokenizer, TEST_TEXT, &terms, 20); + //assert_eq!(fragments.len(), 7); + { + let first = &fragments[0]; + assert_eq!(first.score, 0.9); + assert_eq!(first.stop_offset, 17); + } + let snippet = select_best_fragment_combination(&fragments[..], &TEST_TEXT); + assert_eq!(snippet.to_html(), "programming language") + } + + } + + + #[test] + fn test_snippet_in_second_fragment() { + let boxed_tokenizer = box_tokenizer(SimpleTokenizer); + + let text = "a b c d e f g"; + + let mut terms = BTreeMap::new(); + terms.insert(String::from("c"), 1.0); + + let fragments = search_fragments(&*boxed_tokenizer, &text, &terms, 3); + + assert_eq!(fragments.len(), 1); + { + let first = fragments.iter().nth(0).unwrap(); + assert_eq!(first.score, 1.0); + assert_eq!(first.start_offset, 4); + assert_eq!(first.stop_offset, 7); + } + + let snippet = select_best_fragment_combination(&fragments[..], &text); + assert_eq!(snippet.fragments, "c d"); + assert_eq!(snippet.to_html(), "c d"); + } + + #[test] + fn test_snippet_with_term_at_the_end_of_fragment() { + let boxed_tokenizer = box_tokenizer(SimpleTokenizer); + + let text = "a b c d e f f g"; + + let mut terms = BTreeMap::new(); + terms.insert(String::from("f"), 1.0); + + let fragments = search_fragments(&*boxed_tokenizer, &text, &terms, 3); + + assert_eq!(fragments.len(), 2); + { + let first = fragments.iter().nth(0).unwrap(); + assert_eq!(first.score, 1.0); + assert_eq!(first.stop_offset, 11); + assert_eq!(first.start_offset, 8); + } + + let snippet = select_best_fragment_combination(&fragments[..], &text); + assert_eq!(snippet.fragments, "e f"); + assert_eq!(snippet.to_html(), "e f"); + } + + #[test] + fn test_snippet_with_second_fragment_has_the_highest_score() { + let boxed_tokenizer = box_tokenizer(SimpleTokenizer); + + let text = "a b c d e f g"; + + let mut terms = BTreeMap::new(); + 
terms.insert(String::from("f"), 1.0); + terms.insert(String::from("a"), 0.9); + + let fragments = search_fragments(&*boxed_tokenizer, &text, &terms, 7); + + assert_eq!(fragments.len(), 2); + { + let first = fragments.iter().nth(0).unwrap(); + assert_eq!(first.score, 0.9); + assert_eq!(first.stop_offset, 7); + assert_eq!(first.start_offset, 0); + } + + let snippet = select_best_fragment_combination(&fragments[..], &text); + assert_eq!(snippet.fragments, "e f g"); + assert_eq!(snippet.to_html(), "e f g"); + } + + #[test] + fn test_snippet_with_term_not_in_text() { + let boxed_tokenizer = box_tokenizer(SimpleTokenizer); + + let text = "a b c d"; + + let mut terms = BTreeMap::new(); + terms.insert(String::from("z"), 1.0); + + let fragments = search_fragments(&*boxed_tokenizer, &text, &terms, 3); + + assert_eq!(fragments.len(), 0); + + let snippet = select_best_fragment_combination(&fragments[..], &text); + assert_eq!(snippet.fragments, ""); + assert_eq!(snippet.to_html(), ""); + } + + #[test] + fn test_snippet_with_no_terms() { + let boxed_tokenizer = box_tokenizer(SimpleTokenizer); + + let text = "a b c d"; + + let terms = BTreeMap::new(); + let fragments = search_fragments(&*boxed_tokenizer, &text, &terms, 3); + assert_eq!(fragments.len(), 0); + + let snippet = select_best_fragment_combination(&fragments[..], &text); + assert_eq!(snippet.fragments, ""); + assert_eq!(snippet.to_html(), ""); + } + + + #[test] + fn test_snippet_generator_term_score() { + let mut schema_builder = SchemaBuilder::default(); + let text_field = schema_builder.add_text_field("text", TEXT); + let schema = schema_builder.build(); + let index = Index::create_in_ram(schema); + { + // writing the segment + let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap(); + index_writer.add_document(doc!(text_field => "a")); + index_writer.add_document(doc!(text_field => "a")); + index_writer.add_document(doc!(text_field => "a b")); + index_writer.commit().unwrap(); + index.load_searchers().unwrap(); + } + let searcher = index.searcher(); + let query_parser = QueryParser::for_index(&index, vec![text_field]); + { + let query = query_parser.parse_query("e").unwrap(); + let snippet_generator = SnippetGenerator::new(&searcher, &*query, text_field).unwrap(); + assert!(snippet_generator.terms_text().is_empty()); + } + { + let query = query_parser.parse_query("a").unwrap(); + let snippet_generator = SnippetGenerator::new(&searcher, &*query, text_field).unwrap(); + assert_eq!(&btreemap!("a".to_string() => 0.25f32), snippet_generator.terms_text()); + } + { + let query = query_parser.parse_query("a b").unwrap(); + let snippet_generator = SnippetGenerator::new(&searcher, &*query, text_field).unwrap(); + assert_eq!(&btreemap!("a".to_string() => 0.25f32, "b".to_string() => 0.5), snippet_generator.terms_text()); + } + { + let query = query_parser.parse_query("a b c").unwrap(); + let snippet_generator = SnippetGenerator::new(&searcher, &*query, text_field).unwrap(); + assert_eq!(&btreemap!("a".to_string() => 0.25f32, "b".to_string() => 0.5), snippet_generator.terms_text()); + } + } + + #[test] + fn test_snippet_generator() { + let mut schema_builder = SchemaBuilder::default(); + let text_options = TextOptions::default().set_indexing_options( + TextFieldIndexing::default() + .set_tokenizer("en_stem") + .set_index_option(IndexRecordOption::Basic), + ); + let text_field = schema_builder.add_text_field("text", text_options); + let schema = schema_builder.build(); + let index = Index::create_in_ram(schema); + { + // writing the 
segment + let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap(); + { + let doc = doc ! (text_field => TEST_TEXT); + index_writer.add_document(doc); + } + index_writer.commit().unwrap(); + } + index.load_searchers().unwrap(); + let searcher = index.searcher(); + let query_parser = QueryParser::for_index(&index, vec![text_field]); + let query = query_parser.parse_query("rust design").unwrap(); + let mut snippet_generator = SnippetGenerator::new(&searcher, &*query, text_field).unwrap(); + { + let snippet = snippet_generator.snippet(TEST_TEXT); + assert_eq!(snippet.to_html(), "imperative-procedural paradigms. Rust is syntactically similar to C++[according to whom?],\nbut its designers intend it to provide better memory safety"); + } + { + snippet_generator.set_max_num_chars(90); + let snippet = snippet_generator.snippet(TEST_TEXT); + assert_eq!(snippet.to_html(), "Rust is syntactically similar to C++[according to whom?],\nbut its designers intend it to"); + } + } +} diff --git a/src/space_usage/mod.rs b/src/space_usage/mod.rs new file mode 100644 index 000000000..9ffd8b849 --- /dev/null +++ b/src/space_usage/mod.rs @@ -0,0 +1,484 @@ +/*! +Representations for the space usage of various parts of a Tantivy index. + +This can be used programmatically, and will also be exposed in a human readable fashion in +tantivy-cli. + +One important caveat for all of this functionality is that none of it currently takes storage-level +details into consideration. For example, if your file system block size is 4096 bytes, we can +under-count actual resultant space usage by up to 4095 bytes per file. +*/ + +use schema::Field; +use std::collections::HashMap; +use SegmentComponent; + +/// Indicates space usage in bytes +pub type ByteCount = usize; + +/// Enum containing any of the possible space usage results for segment components. +pub enum ComponentSpaceUsage { + /// Data is stored per field in a uniform way + PerField(PerFieldSpaceUsage), + /// Data is stored in separate pieces in the store + Store(StoreSpaceUsage), + /// Some sort of raw byte count + Basic(ByteCount), +} + +/// Represents combined space usage of an entire searcher and its component segments. +#[derive(Clone, Debug, Serialize, Deserialize)] +pub struct SearcherSpaceUsage { + segments: Vec, + total: ByteCount, +} + +impl SearcherSpaceUsage { + pub(crate) fn new() -> SearcherSpaceUsage { + SearcherSpaceUsage { + segments: Vec::new(), + total: 0, + } + } + + /// Add a segment, to `self`. + /// Performs no deduplication or other intelligence. + pub(crate) fn add_segment(&mut self, segment: SegmentSpaceUsage) { + self.total += segment.total(); + self.segments.push(segment); + } + + /// Per segment space usage + pub fn segments(&self) -> &[SegmentSpaceUsage] { + &self.segments[..] + } + + /// Returns total byte usage of this searcher, including all large subcomponents. + /// Does not account for smaller things like `meta.json`. + pub fn total(&self) -> ByteCount { + self.total + } +} + +/// Represents combined space usage for all of the large components comprising a segment. 
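The space-usage structs introduced below are consumed through plain accessors. A hedged sketch of walking them for a built and committed index; the function name and the printed format are illustrative, not part of the crate:

```rust
use tantivy::{Index, Result};

// Sketch: per-segment, per-field space usage report.
fn print_space_usage(index: &Index) -> Result<()> {
    index.load_searchers()?;
    let searcher = index.searcher();
    let usage = searcher.space_usage();
    println!("{} bytes over {} segment(s)", usage.total(), usage.segments().len());
    for segment in usage.segments() {
        println!("segment: {} docs, {} bytes total", segment.num_docs(), segment.total());
        // Per-field breakdown of the term dictionary; postings(), positions(),
        // fast_fields() and fieldnorms() expose the same PerFieldSpaceUsage shape.
        for (field, field_usage) in segment.termdict().fields() {
            println!("  termdict {:?}: {} bytes", field, field_usage.total());
        }
        println!("  store: {} bytes, deletes: {} bytes", segment.store().total(), segment.deletes());
    }
    Ok(())
}
```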
+#[derive(Clone, Debug, Serialize, Deserialize)] +pub struct SegmentSpaceUsage { + num_docs: u32, + + termdict: PerFieldSpaceUsage, + postings: PerFieldSpaceUsage, + positions: PerFieldSpaceUsage, + positions_idx: PerFieldSpaceUsage, + fast_fields: PerFieldSpaceUsage, + fieldnorms: PerFieldSpaceUsage, + + store: StoreSpaceUsage, + + deletes: ByteCount, + + total: ByteCount, +} + +impl SegmentSpaceUsage { + pub(crate) fn new( + num_docs: u32, + termdict: PerFieldSpaceUsage, + postings: PerFieldSpaceUsage, + positions: PerFieldSpaceUsage, + positions_idx: PerFieldSpaceUsage, + fast_fields: PerFieldSpaceUsage, + fieldnorms: PerFieldSpaceUsage, + store: StoreSpaceUsage, + deletes: ByteCount, + ) -> SegmentSpaceUsage { + let total = termdict.total() + + postings.total() + + positions.total() + + fast_fields.total() + + fieldnorms.total() + + store.total() + + deletes; + SegmentSpaceUsage { + num_docs, + termdict, + postings, + positions, + positions_idx, + fast_fields, + fieldnorms, + store, + deletes, + total, + } + } + + /// Space usage for the given component + /// + /// Clones the underlying data. + /// Use the components directly if this is somehow in performance critical code. + pub fn component(&self, component: SegmentComponent) -> ComponentSpaceUsage { + use SegmentComponent::*; + use self::ComponentSpaceUsage::*; + match component { + POSTINGS => PerField(self.postings().clone()), + POSITIONS => PerField(self.positions().clone()), + POSITIONSSKIP => PerField(self.positions_skip_idx().clone()), + FASTFIELDS => PerField(self.fast_fields().clone()), + FIELDNORMS => PerField(self.fieldnorms().clone()), + TERMS => PerField(self.termdict().clone()), + STORE => Store(self.store().clone()), + DELETE => Basic(self.deletes()), + } + } + + /// Num docs in segment + pub fn num_docs(&self) -> u32 { + self.num_docs + } + + /// Space usage for term dictionary + pub fn termdict(&self) -> &PerFieldSpaceUsage { + &self.termdict + } + + /// Space usage for postings list + pub fn postings(&self) -> &PerFieldSpaceUsage { + &self.postings + } + + /// Space usage for positions + pub fn positions(&self) -> &PerFieldSpaceUsage { + &self.positions + } + + /// Space usage for positions skip idx + pub fn positions_skip_idx(&self) -> &PerFieldSpaceUsage { + &self.positions_idx + } + + /// Space usage for fast fields + pub fn fast_fields(&self) -> &PerFieldSpaceUsage { + &self.fast_fields + } + + /// Space usage for field norms + pub fn fieldnorms(&self) -> &PerFieldSpaceUsage { + &self.fieldnorms + } + + /// Space usage for stored documents + pub fn store(&self) -> &StoreSpaceUsage { + &self.store + } + + /// Space usage for document deletions + pub fn deletes(&self) -> ByteCount { + self.deletes + } + + /// Total space usage in bytes for this segment. + pub fn total(&self) -> ByteCount { + self.total + } +} + +/// Represents space usage for the Store for this segment. +/// +/// This is composed of two parts. +/// `data` represents the compressed data itself. 
+/// `offsets` represents a lookup to find the start of a block +#[derive(Clone, Debug, Serialize, Deserialize)] +pub struct StoreSpaceUsage { + data: ByteCount, + offsets: ByteCount, +} + +impl StoreSpaceUsage { + pub(crate) fn new(data: ByteCount, offsets: ByteCount) -> StoreSpaceUsage { + StoreSpaceUsage { data, offsets } + } + + /// Space usage for the data part of the store + pub fn data_usage(&self) -> ByteCount { + self.data + } + + /// Space usage for the offsets part of the store (doc ID -> offset) + pub fn offsets_usage(&self) -> ByteCount { + self.offsets + } + + /// Total space usage in bytes for this Store + pub fn total(&self) -> ByteCount { + self.data + self.offsets + } +} + +/// Represents space usage for all of the (field, index) pairs that appear in a CompositeFile. +/// +/// A field can appear with a single index (typically 0) or with multiple indexes. +/// Multiple indexes are used to handle variable length things, where +#[derive(Clone, Debug, Serialize, Deserialize)] +pub struct PerFieldSpaceUsage { + fields: HashMap, + total: ByteCount +} + +impl PerFieldSpaceUsage { + pub(crate) fn new(fields: HashMap) -> PerFieldSpaceUsage { + let total = fields.values().map(|x| x.total()).sum(); + PerFieldSpaceUsage { fields, total } + } + + /// Per field space usage + pub fn fields(&self) -> impl Iterator { + self.fields.iter() + } + + /// Bytes used by the represented file + pub fn total(&self) -> ByteCount { + self.total + } +} + +/// Represents space usage of a given field, breaking it down into the (field, index) pairs that +/// comprise it. +/// +/// See documentation for PerFieldSpaceUsage for slightly more information. +#[derive(Clone, Debug, Serialize, Deserialize)] +pub struct FieldUsage { + field: Field, + num_bytes: ByteCount, + /// A field can be composed of more than one piece. + /// These pieces are indexed by arbitrary numbers starting at zero. + /// `self.num_bytes` includes all of `self.sub_num_bytes`. + sub_num_bytes: Vec>, +} + +impl FieldUsage { + pub(crate) fn empty(field: Field) -> FieldUsage { + FieldUsage { + field, + num_bytes: 0, + sub_num_bytes: Vec::new(), + } + } + + pub(crate) fn add_field_idx(&mut self, idx: usize, size: ByteCount) { + if self.sub_num_bytes.len() < idx + 1{ + self.sub_num_bytes.resize(idx + 1, None); + } + assert!(self.sub_num_bytes[idx].is_none()); + self.sub_num_bytes[idx] = Some(size); + self.num_bytes += size + } + + /// Field + pub fn field(&self) -> Field { + self.field + } + + /// Space usage for each index + pub fn sub_num_bytes(&self) -> &[Option] { + &self.sub_num_bytes[..] 
+ } + + /// Total bytes used for this field in this context + pub fn total(&self) -> ByteCount { + self.num_bytes + } +} + +#[cfg(test)] +mod test { + use core::Index; + use schema::SchemaBuilder; + use schema::{FAST, INT_INDEXED, TEXT}; + use schema::Field; + use space_usage::ByteCount; + use space_usage::PerFieldSpaceUsage; + use schema::STORED; + use Term; + + #[test] + fn test_empty() { + let schema = SchemaBuilder::new().build(); + let index = Index::create_in_ram(schema.clone()); + + index.load_searchers().unwrap(); + let searcher = index.searcher(); + let searcher_space_usage = searcher.space_usage(); + assert_eq!(0, searcher_space_usage.total()); + } + + fn expect_single_field(field_space: &PerFieldSpaceUsage, field: &Field, min_size: ByteCount, max_size: ByteCount) { + assert!(field_space.total() >= min_size); + assert!(field_space.total() <= max_size); + assert_eq!( + vec![(field, field_space.total())], + field_space.fields().map(|(x,y)| (x, y.total())).collect::>() + ); + } + + #[test] + fn test_fast_indexed() { + let mut schema_builder = SchemaBuilder::new(); + let name = schema_builder.add_u64_field("name", FAST | INT_INDEXED); + let schema = schema_builder.build(); + let index = Index::create_in_ram(schema.clone()); + + { + let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap(); + index_writer.add_document(doc!(name => 1u64)); + index_writer.add_document(doc!(name => 2u64)); + index_writer.add_document(doc!(name => 10u64)); + index_writer.add_document(doc!(name => 20u64)); + index_writer.commit().unwrap(); + } + + index.load_searchers().unwrap(); + let searcher = index.searcher(); + let searcher_space_usage = searcher.space_usage(); + assert!(searcher_space_usage.total() > 0); + assert_eq!(1, searcher_space_usage.segments().len()); + + let segment = &searcher_space_usage.segments()[0]; + assert!(segment.total() > 0); + + assert_eq!(4, segment.num_docs()); + + expect_single_field(segment.termdict(), &name, 1, 512); + expect_single_field(segment.postings(), &name, 1, 512); + assert_eq!(0, segment.positions().total()); + assert_eq!(0, segment.positions_skip_idx().total()); + expect_single_field(segment.fast_fields(), &name, 1, 512); + expect_single_field(segment.fieldnorms(), &name, 1, 512); + // TODO: understand why the following fails +// assert_eq!(0, segment.store().total()); + assert_eq!(0, segment.deletes()); + } + + #[test] + fn test_text() { + let mut schema_builder = SchemaBuilder::new(); + let name = schema_builder.add_text_field("name", TEXT); + let schema = schema_builder.build(); + let index = Index::create_in_ram(schema.clone()); + + { + let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap(); + index_writer.add_document(doc!(name => "hi")); + index_writer.add_document(doc!(name => "this is a test")); + index_writer.add_document(doc!(name => "some more documents with some word overlap with the other test")); + index_writer.add_document(doc!(name => "hello hi goodbye")); + index_writer.commit().unwrap(); + } + + index.load_searchers().unwrap(); + let searcher = index.searcher(); + let searcher_space_usage = searcher.space_usage(); + assert!(searcher_space_usage.total() > 0); + assert_eq!(1, searcher_space_usage.segments().len()); + + let segment = &searcher_space_usage.segments()[0]; + assert!(segment.total() > 0); + + assert_eq!(4, segment.num_docs()); + + expect_single_field(segment.termdict(), &name, 1, 512); + expect_single_field(segment.postings(), &name, 1, 512); + expect_single_field(segment.positions(), &name, 1, 
512); + expect_single_field(segment.positions_skip_idx(), &name, 1, 512); + assert_eq!(0, segment.fast_fields().total()); + expect_single_field(segment.fieldnorms(), &name, 1, 512); + // TODO: understand why the following fails +// assert_eq!(0, segment.store().total()); + assert_eq!(0, segment.deletes()); + } + + #[test] + fn test_store() { + let mut schema_builder = SchemaBuilder::new(); + let name = schema_builder.add_text_field("name", STORED); + let schema = schema_builder.build(); + let index = Index::create_in_ram(schema.clone()); + + { + let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap(); + index_writer.add_document(doc!(name => "hi")); + index_writer.add_document(doc!(name => "this is a test")); + index_writer.add_document(doc!(name => "some more documents with some word overlap with the other test")); + index_writer.add_document(doc!(name => "hello hi goodbye")); + index_writer.commit().unwrap(); + } + + index.load_searchers().unwrap(); + let searcher = index.searcher(); + let searcher_space_usage = searcher.space_usage(); + assert!(searcher_space_usage.total() > 0); + assert_eq!(1, searcher_space_usage.segments().len()); + + let segment = &searcher_space_usage.segments()[0]; + assert!(segment.total() > 0); + + assert_eq!(4, segment.num_docs()); + + assert_eq!(0, segment.termdict().total()); + assert_eq!(0, segment.postings().total()); + assert_eq!(0, segment.positions().total()); + assert_eq!(0, segment.positions_skip_idx().total()); + assert_eq!(0, segment.fast_fields().total()); + assert_eq!(0, segment.fieldnorms().total()); + assert!(segment.store().total() > 0); + assert!(segment.store().total() < 512); + assert_eq!(0, segment.deletes()); + } + + #[test] + fn test_deletes() { + let mut schema_builder = SchemaBuilder::new(); + let name = schema_builder.add_u64_field("name", INT_INDEXED); + let schema = schema_builder.build(); + let index = Index::create_in_ram(schema.clone()); + + { + let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap(); + index_writer.add_document(doc!(name => 1u64)); + index_writer.add_document(doc!(name => 2u64)); + index_writer.add_document(doc!(name => 3u64)); + index_writer.add_document(doc!(name => 4u64)); + index_writer.commit().unwrap(); + } + + { + let mut index_writer2 = index.writer(50_000_000).unwrap(); + index_writer2.delete_term(Term::from_field_u64(name, 2u64)); + index_writer2.delete_term(Term::from_field_u64(name, 3u64)); + + // ok, now we should have a deleted doc + index_writer2.commit().unwrap(); + } + + index.load_searchers().unwrap(); + + let searcher = index.searcher(); + let searcher_space_usage = searcher.space_usage(); + assert!(searcher_space_usage.total() > 0); + assert_eq!(1, searcher_space_usage.segments().len()); + + let segment = &searcher_space_usage.segments()[0]; + assert!(segment.total() > 0); + + assert_eq!(2, segment.num_docs()); + + expect_single_field(segment.termdict(), &name, 1, 512); + expect_single_field(segment.postings(), &name, 1, 512); + assert_eq!(0, segment.positions().total()); + assert_eq!(0, segment.positions_skip_idx().total()); + assert_eq!(0, segment.fast_fields().total()); + expect_single_field(segment.fieldnorms(), &name, 1, 512); + // TODO: understand why the following fails +// assert_eq!(0, segment.store().total()); + assert!(segment.deletes() > 0); + } +} \ No newline at end of file diff --git a/src/store/mod.rs b/src/store/mod.rs index 5d71563e1..57930e8d8 100644 --- a/src/store/mod.rs +++ b/src/store/mod.rs @@ -109,7 +109,13 @@ pub mod tests { let 
store = StoreReader::from_source(store_source); for i in 0..1_000 { assert_eq!( - *store.get(i).unwrap().get_first(field_title).unwrap().text(), + *store + .get(i) + .unwrap() + .get_first(field_title) + .unwrap() + .text() + .unwrap(), format!("Doc {}", i) ); } diff --git a/src/store/reader.rs b/src/store/reader.rs index 5f02825e3..e94705bb3 100644 --- a/src/store/reader.rs +++ b/src/store/reader.rs @@ -6,6 +6,7 @@ use common::BinarySerializable; use common::VInt; use directory::ReadOnlySource; use schema::Document; +use space_usage::StoreSpaceUsage; use std::cell::RefCell; use std::io; use std::mem::size_of; @@ -87,9 +88,17 @@ impl StoreReader { cursor = &cursor[..doc_length]; Ok(Document::deserialize(&mut cursor)?) } + + /// Summarize total space usage of this store reader. + pub fn space_usage(&self) -> StoreSpaceUsage { + StoreSpaceUsage::new(self.data.len(), self.offset_index_source.len()) + } } -#[allow(needless_pass_by_value)] +#[cfg_attr( + feature = "cargo-clippy", + allow(clippy::needless_pass_by_value) +)] fn split_source(data: ReadOnlySource) -> (ReadOnlySource, ReadOnlySource, DocId) { let data_len = data.len(); let footer_offset = data_len - size_of::() - size_of::(); diff --git a/src/store/skiplist/skiplist_builder.rs b/src/store/skiplist/skiplist_builder.rs index 14ccd6dda..61f04bf34 100644 --- a/src/store/skiplist/skiplist_builder.rs +++ b/src/store/skiplist/skiplist_builder.rs @@ -72,7 +72,8 @@ impl SkipListBuilder { let mut skip_pointer = self.data_layer.insert(key, dest)?; loop { skip_pointer = match skip_pointer { - Some((skip_doc_id, skip_offset)) => self.get_skip_layer(layer_id) + Some((skip_doc_id, skip_offset)) => self + .get_skip_layer(layer_id) .insert(skip_doc_id, &skip_offset)?, None => { return Ok(()); diff --git a/src/store/writer.rs b/src/store/writer.rs index f1446ab8b..3fbdee074 100644 --- a/src/store/writer.rs +++ b/src/store/writer.rs @@ -51,7 +51,8 @@ impl StoreWriter { stored_document.serialize(&mut self.intermediary_buffer)?; let doc_num_bytes = self.intermediary_buffer.len(); VInt(doc_num_bytes as u64).serialize(&mut self.current_block)?; - self.current_block.write_all(&self.intermediary_buffer[..])?; + self.current_block + .write_all(&self.intermediary_buffer[..])?; self.doc += 1; if self.current_block.len() > BLOCK_SIZE { self.write_and_compress_block()?; diff --git a/src/termdict/merger.rs b/src/termdict/merger.rs index ef0959cf7..1d3844067 100644 --- a/src/termdict/merger.rs +++ b/src/termdict/merger.rs @@ -53,8 +53,7 @@ impl<'a> TermMerger<'a> { .map(|(ord, streamer)| HeapItem { streamer, segment_ord: ord, - }) - .collect(), + }).collect(), } } @@ -81,7 +80,7 @@ impl<'a> TermMerger<'a> { /// Advance the term iterator to the next term. /// Returns true if there is indeed another term /// False if there is none. 
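Throughout this change set, bare `#[allow(lint_name)]` attributes are replaced by the tool-scoped form gated on the `cargo-clippy` feature, as in the hunk that follows. A standalone toy illustrating the pattern; the function itself is not from the crate:

```rust
// Only emitted when building under clippy; the lint name is tool-scoped.
#[cfg_attr(feature = "cargo-clippy", allow(clippy::needless_pass_by_value))]
fn total(values: Vec<u64>) -> u64 {
    values.iter().sum()
}

fn main() {
    assert_eq!(total(vec![1, 2, 3]), 6);
}
```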
- #[allow(while_let_loop)] + #[cfg_attr(feature = "cargo-clippy", allow(clippy::while_let_loop))] pub fn advance(&mut self) -> bool { self.advance_segments(); if let Some(head) = self.heap.pop() { @@ -123,7 +122,10 @@ impl<'a> TermMerger<'a> { } /// Iterates through terms - #[allow(should_implement_trait)] + #[cfg_attr( + feature = "cargo-clippy", + allow(clippy::should_implement_trait) + )] pub fn next(&mut self) -> Option> { if self.advance() { Some(Term::wrap(self.current_streamers[0].streamer.key())) diff --git a/src/termdict/mod.rs b/src/termdict/mod.rs index d403314fa..54102a9f4 100644 --- a/src/termdict/mod.rs +++ b/src/termdict/mod.rs @@ -66,7 +66,7 @@ mod tests { let write = directory.open_write(&path).unwrap(); let field_type = FieldType::Str(TEXT); let mut term_dictionary_builder = - TermDictionaryBuilder::new(write, field_type).unwrap(); + TermDictionaryBuilder::new(write, &field_type).unwrap(); for term in COUNTRIES.iter() { term_dictionary_builder .insert(term.as_bytes(), &make_term_info(0u64)) @@ -75,7 +75,7 @@ mod tests { term_dictionary_builder.finish().unwrap(); } let source = directory.open_read(&path).unwrap(); - let term_dict: TermDictionary = TermDictionary::from_source(source); + let term_dict: TermDictionary = TermDictionary::from_source(&source); for (term_ord, term) in COUNTRIES.iter().enumerate() { assert_eq!(term_dict.term_ord(term).unwrap(), term_ord as u64); let mut bytes = vec![]; @@ -92,7 +92,7 @@ mod tests { let write = directory.open_write(&path).unwrap(); let field_type = FieldType::Str(TEXT); let mut term_dictionary_builder = - TermDictionaryBuilder::new(write, field_type).unwrap(); + TermDictionaryBuilder::new(write, &field_type).unwrap(); term_dictionary_builder .insert("abc".as_bytes(), &make_term_info(34u64)) .unwrap(); @@ -102,7 +102,7 @@ mod tests { term_dictionary_builder.finish().unwrap(); } let source = directory.open_read(&path).unwrap(); - let term_dict: TermDictionary = TermDictionary::from_source(source); + let term_dict: TermDictionary = TermDictionary::from_source(&source); assert_eq!(term_dict.get("abc").unwrap().doc_freq, 34u32); assert_eq!(term_dict.get("abcd").unwrap().doc_freq, 346u32); let mut stream = term_dict.stream(); @@ -180,7 +180,7 @@ mod tests { let field_type = FieldType::Str(TEXT); let buffer: Vec = { let mut term_dictionary_builder = - TermDictionaryBuilder::new(vec![], field_type).unwrap(); + TermDictionaryBuilder::new(vec![], &field_type).unwrap(); for &(ref id, ref i) in &ids { term_dictionary_builder .insert(id.as_bytes(), &make_term_info(*i as u64)) @@ -189,7 +189,7 @@ mod tests { term_dictionary_builder.finish().unwrap() }; let source = ReadOnlySource::from(buffer); - let term_dictionary: TermDictionary = TermDictionary::from_source(source); + let term_dictionary: TermDictionary = TermDictionary::from_source(&source); { let mut streamer = term_dictionary.stream(); let mut i = 0; @@ -210,7 +210,7 @@ mod tests { let field_type = FieldType::Str(TEXT); let buffer: Vec = { let mut term_dictionary_builder = - TermDictionaryBuilder::new(vec![], field_type).unwrap(); + TermDictionaryBuilder::new(vec![], &field_type).unwrap(); // term requires more than 16bits term_dictionary_builder .insert("abcdefghijklmnopqrstuvwxy", &make_term_info(1)) @@ -224,7 +224,7 @@ mod tests { term_dictionary_builder.finish().unwrap() }; let source = ReadOnlySource::from(buffer); - let term_dictionary: TermDictionary = TermDictionary::from_source(source); + let term_dictionary: TermDictionary = TermDictionary::from_source(&source); let mut 
kv_stream = term_dictionary.stream(); assert!(kv_stream.advance()); assert_eq!(kv_stream.key(), "abcdefghijklmnopqrstuvwxy".as_bytes()); @@ -245,7 +245,7 @@ mod tests { let field_type = FieldType::Str(TEXT); let buffer: Vec = { let mut term_dictionary_builder = - TermDictionaryBuilder::new(vec![], field_type).unwrap(); + TermDictionaryBuilder::new(vec![], &field_type).unwrap(); for &(ref id, ref i) in &ids { term_dictionary_builder .insert(id.as_bytes(), &make_term_info(*i as u64)) @@ -256,7 +256,7 @@ mod tests { let source = ReadOnlySource::from(buffer); - let term_dictionary: TermDictionary = TermDictionary::from_source(source); + let term_dictionary: TermDictionary = TermDictionary::from_source(&source); { for i in (0..20).chain(6000..8_000) { let &(ref target_key, _) = &ids[i]; @@ -314,7 +314,7 @@ mod tests { let field_type = FieldType::Str(TEXT); let buffer: Vec = { let mut term_dictionary_builder = - TermDictionaryBuilder::new(vec![], field_type).unwrap(); + TermDictionaryBuilder::new(vec![], &field_type).unwrap(); term_dictionary_builder .insert(&[], &make_term_info(1 as u64)) .unwrap(); @@ -324,7 +324,7 @@ mod tests { term_dictionary_builder.finish().unwrap() }; let source = ReadOnlySource::from(buffer); - let term_dictionary: TermDictionary = TermDictionary::from_source(source); + let term_dictionary: TermDictionary = TermDictionary::from_source(&source); let mut stream = term_dictionary.stream(); assert!(stream.advance()); assert!(stream.key().is_empty()); @@ -338,7 +338,7 @@ mod tests { let field_type = FieldType::Str(TEXT); let buffer: Vec = { let mut term_dictionary_builder = - TermDictionaryBuilder::new(vec![], field_type).unwrap(); + TermDictionaryBuilder::new(vec![], &field_type).unwrap(); for i in 0u8..10u8 { let number_arr = [i; 1]; term_dictionary_builder @@ -348,7 +348,7 @@ mod tests { term_dictionary_builder.finish().unwrap() }; let source = ReadOnlySource::from(buffer); - let term_dictionary: TermDictionary = TermDictionary::from_source(source); + let term_dictionary: TermDictionary = TermDictionary::from_source(&source); let value_list = |mut streamer: TermStreamer| { let mut res: Vec = vec![]; @@ -408,7 +408,7 @@ mod tests { let write = directory.open_write(&path).unwrap(); let field_type = FieldType::Str(TEXT); let mut term_dictionary_builder = - TermDictionaryBuilder::new(write, field_type).unwrap(); + TermDictionaryBuilder::new(write, &field_type).unwrap(); for term in COUNTRIES.iter() { term_dictionary_builder .insert(term.as_bytes(), &make_term_info(0u64)) @@ -417,7 +417,7 @@ mod tests { term_dictionary_builder.finish().unwrap(); } let source = directory.open_read(&path).unwrap(); - let term_dict: TermDictionary = TermDictionary::from_source(source); + let term_dict: TermDictionary = TermDictionary::from_source(&source); // We can now build an entire dfa. let lev_automaton_builder = LevenshteinAutomatonBuilder::new(2, true); diff --git a/src/termdict/streamer.rs b/src/termdict/streamer.rs index 48eb56c7d..98277f2ef 100644 --- a/src/termdict/streamer.rs +++ b/src/termdict/streamer.rs @@ -132,6 +132,10 @@ where } /// Return the next `(key, value)` pair. 
+ #[cfg_attr( + feature = "cargo-clippy", + allow(clippy::should_implement_trait) + )] pub fn next(&mut self) -> Option<(&[u8], &TermInfo)> { if self.advance() { Some((self.key(), self.value())) diff --git a/src/termdict/term_info_store.rs b/src/termdict/term_info_store.rs index 70ded6090..130b5d62f 100644 --- a/src/termdict/term_info_store.rs +++ b/src/termdict/term_info_store.rs @@ -59,7 +59,6 @@ impl TermInfoBlockMeta { } fn deserialize_term_info(&self, data: &[u8], inner_offset: usize) -> TermInfo { - let num_bits = self.num_bits() as usize; let mut cursor = num_bits * inner_offset; @@ -70,7 +69,6 @@ impl TermInfoBlockMeta { cursor += self.postings_offset_nbits as usize; let positions_idx = extract_bits(data, cursor, self.positions_idx_nbits); - self.positions_idx_nbits as usize; TermInfo { doc_freq, @@ -92,8 +90,10 @@ fn extract_bits(data: &[u8], addr_bits: usize, num_bits: u8) -> u64 { let bit_shift = (addr_bits % 8) as u64; assert!(data.len() >= addr_byte + 7); let val_unshifted_unmasked: u64 = unsafe { - // ok thanks to the 7 byte padding on `.close` - let addr = data.as_ptr().offset(addr_byte as isize) as *const u64; + // ok because the pointer is only accessed using `ptr::read_unaligned` + #[cfg_attr(feature = "cargo-clippy", allow(clippy::cast_ptr_alignment))] + let addr = data.as_ptr().add(addr_byte) as *const u64; + // ok thanks to the 7 byte padding ptr::read_unaligned(addr) }; let val_shifted_unmasked = val_unshifted_unmasked >> bit_shift; diff --git a/src/termdict/termdict.rs b/src/termdict/termdict.rs index f633211ef..0f8a28231 100644 --- a/src/termdict/termdict.rs +++ b/src/termdict/termdict.rs @@ -29,7 +29,7 @@ where W: Write, { /// Creates a new `TermDictionaryBuilder` - pub fn new(w: W, _field_type: FieldType) -> io::Result { + pub fn new(w: W, _field_type: &FieldType) -> io::Result { let fst_builder = fst::MapBuilder::new(w).map_err(convert_fst_error)?; Ok(TermDictionaryBuilder { fst_builder, @@ -77,7 +77,8 @@ where let mut file = self.fst_builder.into_inner().map_err(convert_fst_error)?; { let mut counting_writer = CountingWriter::wrap(&mut file); - self.term_info_store_writer.serialize(&mut counting_writer)?; + self.term_info_store_writer + .serialize(&mut counting_writer)?; let footer_size = counting_writer.written_bytes(); (footer_size as u64).serialize(&mut counting_writer)?; counting_writer.flush()?; @@ -112,7 +113,7 @@ pub struct TermDictionary { impl TermDictionary { /// Opens a `TermDictionary` given a data source. - pub fn from_source(source: ReadOnlySource) -> Self { + pub fn from_source(source: &ReadOnlySource) -> Self { let total_len = source.len(); let length_offset = total_len - 8; let mut split_len_buffer: &[u8] = &source.as_slice()[length_offset..]; @@ -129,14 +130,14 @@ impl TermDictionary { } /// Creates an empty term dictionary which contains no terms. - pub fn empty(field_type: FieldType) -> Self { + pub fn empty(field_type: &FieldType) -> Self { let term_dictionary_data: Vec = - TermDictionaryBuilder::new(Vec::::new(), field_type) + TermDictionaryBuilder::new(Vec::::new(), &field_type) .expect("Creating a TermDictionaryBuilder in a Vec should never fail") .finish() .expect("Writing in a Vec should never fail"); let source = ReadOnlySource::from(term_dictionary_data); - Self::from_source(source) + Self::from_source(&source) } /// Returns the number of terms in the dictionary. 
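The term-dictionary APIs now borrow their arguments: `TermDictionaryBuilder::new` takes `&FieldType` and `TermDictionary::from_source` takes `&ReadOnlySource`. A crate-internal sketch of building and reopening a dictionary with the new signatures, mirroring the tests above; `build_and_reopen` is a made-up name and `make_term_info` is the helper those tests already use:

```rust
// Crate-internal sketch of the borrowed signatures.
use directory::ReadOnlySource;
use schema::{FieldType, TEXT};

fn build_and_reopen(sorted_terms: &[&str]) -> TermDictionary {
    let field_type = FieldType::Str(TEXT);
    let mut builder = TermDictionaryBuilder::new(Vec::<u8>::new(), &field_type).unwrap();
    for term in sorted_terms {
        // Terms must be inserted in increasing byte order.
        builder
            .insert(term.as_bytes(), &make_term_info(0u64))
            .unwrap();
    }
    let buffer: Vec<u8> = builder.finish().unwrap();
    let source = ReadOnlySource::from(buffer);
    TermDictionary::from_source(&source)
}
```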
@@ -164,7 +165,8 @@ impl TermDictionary { let fst = self.fst_index.as_fst(); let mut node = fst.root(); while ord != 0 || !node.is_final() { - if let Some(transition) = node.transitions() + if let Some(transition) = node + .transitions() .take_while(|transition| transition.out.value() <= ord) .last() { @@ -192,12 +194,12 @@ impl TermDictionary { /// Returns a range builder, to stream all of the terms /// within an interval. - pub fn range<'a>(&'a self) -> TermStreamerBuilder<'a> { + pub fn range(&self) -> TermStreamerBuilder { TermStreamerBuilder::new(self, self.fst_index.range()) } /// A stream of all the sorted terms. [See also `.stream_field()`](#method.stream_field) - pub fn stream<'a>(&'a self) -> TermStreamer<'a> { + pub fn stream(&self) -> TermStreamer { self.range().into_stream() } diff --git a/src/tokenizer/japanese_tokenizer.rs b/src/tokenizer/japanese_tokenizer.rs deleted file mode 100644 index 5b072e380..000000000 --- a/src/tokenizer/japanese_tokenizer.rs +++ /dev/null @@ -1,94 +0,0 @@ -use super::{Token, TokenStream, Tokenizer}; -use tinysegmenter; - -/// Simple japanese tokenizer based on the `tinysegmenter` crate. -#[derive(Clone)] -pub struct JapaneseTokenizer; - -#[derive(Eq, PartialEq)] -enum Cursor { - HasNotStarted, - Cursor(usize), - Terminated, -} - -pub struct JapaneseTokenizerStream { - tokens: Vec, - cursor: Cursor, -} - -impl<'a> Tokenizer<'a> for JapaneseTokenizer { - type TokenStreamImpl = JapaneseTokenizerStream; - - fn token_stream(&self, text: &'a str) -> Self::TokenStreamImpl { - let mut tokens = vec![]; - let mut offset_from; - let mut offset_to = 0; - for (pos, term) in tinysegmenter::tokenize(text).into_iter().enumerate() { - offset_from = offset_to; - offset_to = offset_from + term.len(); - if term.chars().all(char::is_alphanumeric) { - tokens.push(Token { - offset_from, - offset_to, - position: pos, - text: term, - position_length: 1 - }); - } - } - JapaneseTokenizerStream { - tokens, - cursor: Cursor::HasNotStarted, - } - } -} - -impl<'a> TokenStream for JapaneseTokenizerStream { - fn advance(&mut self) -> bool { - let new_cursor = match self.cursor { - Cursor::HasNotStarted => { - if self.tokens.is_empty() { - Cursor::Terminated - } else { - Cursor::Cursor(0) - } - } - Cursor::Cursor(pos) => { - let new_pos = pos + 1; - if new_pos >= self.tokens.len() { - Cursor::Terminated - } else { - Cursor::Cursor(new_pos) - } - } - Cursor::Terminated => Cursor::Terminated, - }; - self.cursor = new_cursor; - self.cursor != Cursor::Terminated - } - - fn token(&self) -> &Token { - match self.cursor { - Cursor::Terminated => { - panic!("You called .token(), after the end of the token stream has been reached"); - } - Cursor::Cursor(i) => &self.tokens[i], - Cursor::HasNotStarted => { - panic!("You called .token(), before having called `.advance()`."); - } - } - } - - fn token_mut(&mut self) -> &mut Token { - match self.cursor { - Cursor::Terminated => { - panic!("You called .token(), after the end of the token stream has been reached"); - } - Cursor::Cursor(i) => &mut self.tokens[i], - Cursor::HasNotStarted => { - panic!("You called .token(), before having called `.advance()`."); - } - } - } -} diff --git a/src/tokenizer/lower_caser.rs b/src/tokenizer/lower_caser.rs index ebade3978..38fa782fc 100644 --- a/src/tokenizer/lower_caser.rs +++ b/src/tokenizer/lower_caser.rs @@ -1,4 +1,5 @@ use super::{Token, TokenFilter, TokenStream}; +use std::mem; /// Token filter that lowercase terms. 
#[derive(Clone)] @@ -15,13 +16,21 @@ where } } -pub struct LowerCaserTokenStream -where - TailTokenStream: TokenStream, -{ +pub struct LowerCaserTokenStream { + buffer: String, tail: TailTokenStream, } +// writes a lowercased version of text into output. +fn to_lowercase_unicode(text: &mut String, output: &mut String) { + output.clear(); + for c in text.chars() { + // Contrary to the std, we do not take care of sigma special case. + // This will have an normalizationo effect, which is ok for search. + output.extend(c.to_lowercase()); + } +} + impl TokenStream for LowerCaserTokenStream where TailTokenStream: TokenStream, @@ -36,7 +45,14 @@ where fn advance(&mut self) -> bool { if self.tail.advance() { - self.tail.token_mut().text.make_ascii_lowercase(); + if self.token_mut().text.is_ascii() { + // fast track for ascii. + self.token_mut().text.make_ascii_lowercase(); + } else { + to_lowercase_unicode(&mut self.tail.token_mut().text, &mut self.buffer); + + mem::swap(&mut self.tail.token_mut().text, &mut self.buffer); + } true } else { false @@ -49,6 +65,45 @@ where TailTokenStream: TokenStream, { fn wrap(tail: TailTokenStream) -> LowerCaserTokenStream { - LowerCaserTokenStream { tail } + LowerCaserTokenStream { + tail, + buffer: String::with_capacity(100), + } } } + +#[cfg(test)] +mod tests { + use tokenizer::LowerCaser; + use tokenizer::SimpleTokenizer; + use tokenizer::TokenStream; + use tokenizer::Tokenizer; + + #[test] + fn test_to_lower_case() { + assert_eq!( + lowercase_helper("Русский текст"), + vec!["русский".to_string(), "текст".to_string()] + ); + } + + fn lowercase_helper(text: &str) -> Vec { + let mut tokens = vec![]; + let mut token_stream = SimpleTokenizer.filter(LowerCaser).token_stream(text); + while token_stream.advance() { + let token_text = token_stream.token().text.clone(); + tokens.push(token_text); + } + tokens + } + + #[test] + fn test_lowercaser() { + assert_eq!(lowercase_helper("Tree"), vec!["tree".to_string()]); + assert_eq!( + lowercase_helper("Русский"), + vec!["русский".to_string()] + ); + } + +} diff --git a/src/tokenizer/mod.rs b/src/tokenizer/mod.rs index fd0bfbbde..dd8eb18dd 100644 --- a/src/tokenizer/mod.rs +++ b/src/tokenizer/mod.rs @@ -130,7 +130,6 @@ //! 
mod alphanum_only; mod facet_tokenizer; -mod japanese_tokenizer; mod lower_caser; mod ngram_tokenizer; mod raw_tokenizer; @@ -144,7 +143,6 @@ mod tokenizer_manager; pub use self::alphanum_only::AlphaNumOnlyFilter; pub use self::facet_tokenizer::FacetTokenizer; -pub use self::japanese_tokenizer::JapaneseTokenizer; pub use self::lower_caser::LowerCaser; pub use self::ngram_tokenizer::NgramTokenizer; pub use self::raw_tokenizer::RawTokenizer; @@ -153,7 +151,9 @@ pub use self::simple_tokenizer::SimpleTokenizer; pub use self::stemmer::Stemmer; pub use self::stop_word_filter::StopWordFilter; pub(crate) use self::token_stream_chain::TokenStreamChain; +pub(crate) use self::tokenizer::box_tokenizer; pub use self::tokenizer::BoxedTokenizer; + pub use self::tokenizer::{Token, TokenFilter, TokenStream, Tokenizer}; pub use self::tokenizer_manager::TokenizerManager; @@ -224,28 +224,6 @@ pub mod test { assert_token(&tokens[3], 3, "payer", 17, 22); } - #[test] - fn test_jp_tokenizer() { - let tokenizer_manager = TokenizerManager::default(); - let en_tokenizer = tokenizer_manager.get("ja").unwrap(); - - let mut tokens: Vec = vec![]; - { - let mut add_token = |token: &Token| { - tokens.push(token.clone()); - }; - en_tokenizer - .token_stream("野菜食べないとやばい!") - .process(&mut add_token); - } - assert_eq!(tokens.len(), 5); - assert_token(&tokens[0], 0, "野菜", 0, 6); - assert_token(&tokens[1], 1, "食べ", 6, 12); - assert_token(&tokens[2], 2, "ない", 12, 18); - assert_token(&tokens[3], 3, "と", 18, 21); - assert_token(&tokens[4], 4, "やばい", 21, 30); - } - #[test] fn test_ngram_tokenizer() { use super::{LowerCaser, NgramTokenizer}; diff --git a/src/tokenizer/raw_tokenizer.rs b/src/tokenizer/raw_tokenizer.rs index 12a5af82c..338109b88 100644 --- a/src/tokenizer/raw_tokenizer.rs +++ b/src/tokenizer/raw_tokenizer.rs @@ -18,7 +18,7 @@ impl<'a> Tokenizer<'a> for RawTokenizer { offset_to: text.len(), position: 0, text: text.to_string(), - position_length: 1 + position_length: 1, }; RawTokenStream { token, diff --git a/src/tokenizer/stemmer.rs b/src/tokenizer/stemmer.rs index 4c91bfb93..064662889 100644 --- a/src/tokenizer/stemmer.rs +++ b/src/tokenizer/stemmer.rs @@ -1,3 +1,5 @@ +#![cfg_attr(feature = "cargo-clippy", allow(clippy::new_without_default))] + use super::{Token, TokenFilter, TokenStream}; use rust_stemmers::{self, Algorithm}; use std::sync::Arc; diff --git a/src/tokenizer/stop_word_filter.rs b/src/tokenizer/stop_word_filter.rs index f94ec632f..45691d470 100644 --- a/src/tokenizer/stop_word_filter.rs +++ b/src/tokenizer/stop_word_filter.rs @@ -39,6 +39,16 @@ impl StopWordFilter { StopWordFilter { words: set } } + + fn english() -> StopWordFilter { + let words: [&'static str; 33] = [ + "a", "an", "and", "are", "as", "at", "be", "but", "by", "for", "if", "in", "into", + "is", "it", "no", "not", "of", "on", "or", "such", "that", "the", "their", "then", + "there", "these", "they", "this", "to", "was", "will", "with", + ]; + + StopWordFilter::remove(words.iter().map(|s| s.to_string()).collect()) + } } pub struct StopWordFilterStream @@ -98,3 +108,9 @@ where false } } + +impl Default for StopWordFilter { + fn default() -> StopWordFilter { + StopWordFilter::english() + } +} diff --git a/src/tokenizer/token_stream_chain.rs b/src/tokenizer/token_stream_chain.rs index 01d631e2b..224d7746c 100644 --- a/src/tokenizer/token_stream_chain.rs +++ b/src/tokenizer/token_stream_chain.rs @@ -71,13 +71,16 @@ where #[cfg(test)] mod tests { - use super::POSITION_GAP; + use super::super::{SimpleTokenizer, TokenStream, Tokenizer}; use 
super::TokenStreamChain; - use super::super::{Tokenizer, TokenStream, SimpleTokenizer}; + use super::POSITION_GAP; #[test] fn test_chain_first_emits_no_tokens() { - let token_streams = vec![SimpleTokenizer.token_stream(""), SimpleTokenizer.token_stream("hello world")]; + let token_streams = vec![ + SimpleTokenizer.token_stream(""), + SimpleTokenizer.token_stream("hello world"), + ]; let mut token_chain = TokenStreamChain::new(vec![0, 0], token_streams); assert!(token_chain.advance()); @@ -91,8 +94,8 @@ mod tests { assert_eq!(token_chain.token().offset_from, 6); assert_eq!(token_chain.token().offset_to, 11); assert_eq!(token_chain.token().position, POSITION_GAP); - + assert!(!token_chain.advance()); } -} \ No newline at end of file +} diff --git a/src/tokenizer/tokenizer.rs b/src/tokenizer/tokenizer.rs index fcdf8f21b..d73f84e93 100644 --- a/src/tokenizer/tokenizer.rs +++ b/src/tokenizer/tokenizer.rs @@ -276,7 +276,7 @@ mod test { offset_from: 2, offset_to: 3, text: "abc".to_string(), - position_length: 1 + position_length: 1, }; let t2 = t1.clone(); diff --git a/src/tokenizer/tokenizer_manager.rs b/src/tokenizer/tokenizer_manager.rs index cbb46af3b..001469f35 100644 --- a/src/tokenizer/tokenizer_manager.rs +++ b/src/tokenizer/tokenizer_manager.rs @@ -1,8 +1,7 @@ use std::collections::HashMap; use std::sync::{Arc, RwLock}; -use tokenizer::tokenizer::box_tokenizer; +use tokenizer::box_tokenizer; use tokenizer::BoxedTokenizer; -use tokenizer::JapaneseTokenizer; use tokenizer::LowerCaser; use tokenizer::RawTokenizer; use tokenizer::RemoveLongFilter; @@ -74,7 +73,6 @@ impl Default for TokenizerManager { .filter(LowerCaser) .filter(Stemmer::new()), ); - manager.register("ja", JapaneseTokenizer.filter(RemoveLongFilter::limit(40))); manager } }
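With the bundled `JapaneseTokenizer` and its default "ja" registration removed, projects that relied on it now register an analyzer of their own. A hedged sketch of registering a custom chain that also uses the new `StopWordFilter` default; the "custom_en" name is illustrative, and `Index::tokenizers()` is assumed to expose the `TokenizerManager`:

```rust
use tantivy::tokenizer::{LowerCaser, RemoveLongFilter, SimpleTokenizer, StopWordFilter, Tokenizer};
use tantivy::Index;

// Sketch: register a replacement analyzer under a custom name.
fn register_custom_analyzer(index: &Index) {
    index.tokenizers().register(
        "custom_en",
        SimpleTokenizer
            .filter(RemoveLongFilter::limit(40))
            .filter(LowerCaser)
            // New in 0.7: StopWordFilter::default() strips a small built-in
            // list of English stop words.
            .filter(StopWordFilter::default()),
    );
}
```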