Compare commits

..

1 Commits

Author SHA1 Message Date
Paul Masurel
5f07dc35d8 32bits platforms 2019-02-14 09:12:25 +09:00
145 changed files with 2794 additions and 9863 deletions

View File

@@ -61,9 +61,6 @@ before_script:
script:
- bash ci/script.sh
after_success:
- cargo doc-upload
before_deploy:
- sh ci/before_deploy.sh
@@ -71,11 +68,6 @@ cache: cargo
before_cache:
# Travis can't cache files that are not readable by "others"
- chmod -R a+r $HOME/.cargo
- find ./target/debug -type f -maxdepth 1 -delete
- rm -f ./target/.rustc_info.json
- rm -fr ./target/debug/{deps,.fingerprint}/tantivy*
- rm -r target/debug/examples/
- ls -1 examples/ | sed -e 's/\.rs$//' | xargs -I "{}" find target/* -name "*{}*" -type f -delete
#branches:
# only:
@@ -85,4 +77,4 @@ before_cache:
notifications:
email:
on_success: never
on_success: never

View File

@@ -1,104 +1,3 @@
Tantivy 0.10.0
=====================
*Tantivy 0.10.0 index format is compatible with the index format in 0.9.0.*
- Added an ASCII folding filter (@drusellers)
- Bugfix in `query.count` in presence of deletes (@pmasurel)
Minor
---------
- Small simplification of the code.
Calling .freq() or .doc() when .advance() has never been called
on segment postings should panic from now on.
- Tokens exceeding `u16::max_value() - 4` chars are discarded silently instead of panicking.
- Fast fields are now preloaded when the `SegmentReader` is created.
- `IndexMeta` is now public. (@hntd187)
- `IndexWriter` `add_document`, `delete_term`. `IndexWriter` is `Sync`, making it possible to use it with a `
Arc<RwLock<IndexWriter>>`. `add_document` and `delete_term` can
only require a read lock. (@pmasurel)
- Introducing `Opstamp` as an expressive type alias for `u64`. (@petr-tik)
- Stamper now relies on `AtomicU64` on all platforms (@petr-tik)
## How to update?
Your existing indexes are usable as is, but you may need some
trivial updates.
### Fast fields
Fast fields used to be accessed directly from the `SegmentReader`.
The API changed, you are now required to acquire your fast field reader via the
`segment_reader.fast_fields()`, and use one of the typed method:
- `.u64()`, `.i64()` if your field is single-valued ;
- `.u64s()`, `.i64s()` if your field is multi-valued ;
- `.bytes()` if your field is bytes fast field.
Tantivy 0.9.0
=====================
*0.9.0 index format is not compatible with the
previous index format.*
- MAJOR BUGFIX :
Some `Mmap` objects were being leaked, and would never get released. (@fulmicoton)
- Removed most unsafe (@fulmicoton)
- Indexer memory footprint improved. (VInt comp, inlining the first block. (@fulmicoton)
- Stemming in other language possible (@pentlander)
- Segments with no docs are deleted earlier (@barrotsteindev)
- Added grouped add and delete operations.
They are guaranteed to happen together (i.e. they cannot be split by a commit).
In addition, adds are guaranteed to happen on the same segment. (@elbow-jason)
- Removed `INT_STORED` and `INT_INDEXED`. It is now possible to use `STORED` and `INDEXED`
for int fields. (@fulmicoton)
- Added DateTime field (@barrotsteindev)
- Added IndexReader. By default, index is reloaded automatically upon new commits (@fulmicoton)
- SIMD linear search within blocks (@fulmicoton)
## How to update ?
tantivy 0.9 brought some API breaking change.
To update from tantivy 0.8, you will need to go through the following steps.
- `schema::INT_INDEXED` and `schema::INT_STORED` should be replaced by `schema::INDEXED` and `schema::INT_STORED`.
- The index now does not hold the pool of searcher anymore. You are required to create an intermediary object called
`IndexReader` for this.
```rust
// create the reader. You typically need to create 1 reader for the entire
// lifetime of you program.
let reader = index.reader()?;
// Acquire a searcher (previously `index.searcher()`) is now written:
let searcher = reader.searcher();
// With the default setting of the reader, you are not required to
// call `index.load_searchers()` anymore.
//
// The IndexReader will pick up that change automatically, regardless
// of whether the update was done in a different process or not.
// If this behavior is not wanted, you can create your reader with
// the `ReloadPolicy::Manual`, and manually decide when to reload the index
// by calling `reader.reload()?`.
```
Tantivy 0.8.2
=====================
Fixing build for x86_64 platforms. (#496)
No need to update from 0.8.1 if tantivy
is building on your platform.
Tantivy 0.8.1
=====================
Hotfix of #476.
Merge was reflecting deletes before commit was passed.
Thanks @barrotsteindev for reporting the bug.
Tantivy 0.8.0
=====================
*No change in the index format*

View File

@@ -1,6 +1,6 @@
[package]
name = "tantivy"
version = "0.10.0-dev"
version = "0.8.2"
authors = ["Paul Masurel <paul.masurel@gmail.com>"]
license = "MIT"
categories = ["database-implementations", "data-structures"]
@@ -16,34 +16,33 @@ base64 = "0.10.0"
byteorder = "1.0"
lazy_static = "1"
regex = "1.0"
tantivy-fst = "0.1"
memmap = {version = "0.7", optional=true}
fst = {version="0.3", default-features=false}
fst-regex = { version="0.2" }
lz4 = {version="1.20", optional=true}
snap = {version="0.2"}
atomicwrites = {version="0.2.2", optional=true}
tempfile = "3.0"
log = "0.4"
combine = ">=3.6.0,<4.0.0"
combine = "3"
tempdir = "0.3"
serde = "1.0"
serde_derive = "1.0"
serde_json = "1.0"
num_cpus = "1.2"
fs2={version="0.4", optional=true}
itertools = "0.8"
levenshtein_automata = {version="0.1", features=["fst_automaton"]}
notify = {version="4", optional=true}
bit-set = "0.5"
uuid = { version = "0.7.2", features = ["v4", "serde"] }
uuid = { version = "0.7", features = ["v4", "serde"] }
crossbeam = "0.5"
futures = "0.1"
futures-cpupool = "0.1"
owning_ref = "0.4"
stable_deref_trait = "1.0.0"
rust-stemmers = "1.1"
downcast-rs = { version="1.0" }
bitpacking = "0.6"
census = "0.2"
rust-stemmers = "1"
downcast = { version="0.9" }
matches = "0.1"
bitpacking = "0.5"
census = "0.1"
fnv = "1.0.6"
owned-read = "0.4"
failure = "0.1"
@@ -51,7 +50,6 @@ htmlescape = "0.3.1"
fail = "0.2"
scoped-pool = "1.0"
murmurhash32 = "0.2"
chrono = "0.4"
[target.'cfg(windows)'.dependencies]
winapi = "0.2"
@@ -59,8 +57,6 @@ winapi = "0.2"
[dev-dependencies]
rand = "0.6"
maplit = "1"
matches = "0.1.8"
time = "0.1.42"
[profile.release]
opt-level = 3
@@ -74,11 +70,12 @@ overflow-checks = true
[features]
# by default no-fail is disabled. We manually enable it when running test.
default = ["mmap", "no_fail"]
mmap = ["atomicwrites", "fs2", "memmap", "notify"]
mmap = ["fst/mmap", "atomicwrites"]
lz4-compression = ["lz4"]
no_fail = ["fail/no_fail"]
unstable = [] # useful for benches.
wasm-bindgen = ["uuid/wasm-bindgen"]
[badges]
travis-ci = { repository = "tantivy-search/tantivy" }

View File

@@ -17,29 +17,19 @@
[![](https://sourcerer.io/fame/fulmicoton/tantivy-search/tantivy/images/6)](https://sourcerer.io/fame/fulmicoton/tantivy-search/tantivy/links/6)
[![](https://sourcerer.io/fame/fulmicoton/tantivy-search/tantivy/images/7)](https://sourcerer.io/fame/fulmicoton/tantivy-search/tantivy/links/7)
[![Become a patron](https://c5.patreon.com/external/logo/become_a_patron_button.png)](https://www.patreon.com/fulmicoton)
**Tantivy** is a **full text search engine library** written in rust.
It is closer to [Apache Lucene](https://lucene.apache.org/) than to [Elasticsearch](https://www.elastic.co/products/elasticsearch) and [Apache Solr](https://lucene.apache.org/solr/) in the sense it is not
It is closer to [Apache Lucene](https://lucene.apache.org/) than to [Elastic Search](https://www.elastic.co/products/elasticsearch) and [Apache Solr](https://lucene.apache.org/solr/) in the sense it is not
an off-the-shelf search engine server, but rather a crate that can be used
to build such a search engine.
Tantivy is, in fact, strongly inspired by Lucene's design.
# Benchmark
Tantivy is typically faster than Lucene, but the results will depend on
the nature of the queries in your workload.
The following [benchmark](https://tantivy-search.github.io/bench/) break downs
performance for different type of queries / collection.
# Features
- Full-text search
- Configurable tokenizer. (stemming available for 17 latin languages. Third party support for Chinese ([tantivy-jieba](https://crates.io/crates/tantivy-jieba) and [cang-jie](https://crates.io/crates/cang-jie)) and [Japanese](https://crates.io/crates/tantivy-tokenizer-tiny-segmenter)
- Fast (check out the :racehorse: :sparkles: [benchmark](https://tantivy-search.github.io/bench/) :sparkles: :racehorse:)
- Tiny startup time (<10ms), perfect for command line tools
- BM25 scoring (the same as lucene)
@@ -51,7 +41,6 @@ performance for different type of queries / collection.
- SIMD integer compression when the platform/CPU includes the SSE2 instruction set.
- Single valued and multivalued u64 and i64 fast fields (equivalent of doc values in Lucene)
- `&[u8]` fast fields
- Text, i64, u64, dates and hierarchical facet fields
- LZ4 compressed document store
- Range queries
- Faceted search
@@ -87,7 +76,7 @@ It will walk you through getting a wikipedia search engine up and running in a f
Tantivy compiles on stable rust but requires `Rust >= 1.27`.
To check out and run tests, you can simply run :
git clone https://github.com/tantivy-search/tantivy.git
git clone git@github.com:tantivy-search/tantivy.git
cd tantivy
cargo build
@@ -96,14 +85,6 @@ To check out and run tests, you can simply run :
Some tests will not run with just `cargo test` because of `fail-rs`.
To run the tests exhaustively, run `./run-tests.sh`.
# How can I support this project ?
# Contribute
There are many ways to support this project.
- If you use tantivy, tell us about your experience on [gitter](https://gitter.im/tantivy-search/tantivy) or by email (paul.masurel@gmail.com)
- Report bugs
- Write a blog post
- Complete documentation
- Contribute code (you can join [our gitter](https://gitter.im/tantivy-search/tantivy) )
- Talk about tantivy around you
- Drop a word on on [![Say Thanks!](https://img.shields.io/badge/Say%20Thanks-!-1EAEDB.svg)](https://saythanks.io/to/fulmicoton) or even [![Become a patron](https://c5.patreon.com/external/logo/become_a_patron_button.png)](https://www.patreon.com/fulmicoton)
Send me an email (paul.masurel at gmail.com) if you want to contribute to tantivy.

View File

@@ -20,7 +20,6 @@ use tantivy::collector::TopDocs;
use tantivy::query::QueryParser;
use tantivy::schema::*;
use tantivy::Index;
use tantivy::ReloadPolicy;
use tempdir::TempDir;
fn main() -> tantivy::Result<()> {
@@ -171,33 +170,24 @@ fn main() -> tantivy::Result<()> {
//
// ### Searcher
//
// A reader is required to get search the index.
// It acts as a `Searcher` pool that reloads itself,
// depending on a `ReloadPolicy`.
//
// For a search server you will typically create one reader for the entire lifetime of your
// program, and acquire a new searcher for every single request.
//
// In the code below, we rely on the 'ON_COMMIT' policy: the reader
// will reload the index automatically after each commit.
let reader = index
.reader_builder()
.reload_policy(ReloadPolicy::OnCommit)
.try_into()?;
// Let's search our index. Start by reloading
// searchers in the index. This should be done
// after every `commit()`.
index.load_searchers()?;
// We now need to acquire a searcher.
//
// A searcher points to snapshotted, immutable version of the index.
//
// Some search experience might require more than
// one query. Using the same searcher ensures that all of these queries will run on the
// same version of the index.
// one query.
//
// The searcher ensure that we get to work
// with a consistent version of the index.
//
// Acquiring a `searcher` is very cheap.
//
// You should acquire a searcher every time you start processing a request and
// You should acquire a searcher every time you
// start processing a request and
// and release it right after your query is finished.
let searcher = reader.searcher();
let searcher = index.searcher();
// ### Query
@@ -234,6 +224,7 @@ fn main() -> tantivy::Result<()> {
// Since the body field was not configured as stored,
// the document returned will only contain
// a title.
for (_score, doc_address) in top_docs {
let retrieved_doc = searcher.doc(doc_address)?;
println!("{}", schema.to_json(&retrieved_doc));

View File

@@ -17,9 +17,9 @@ use tantivy::collector::{Collector, SegmentCollector};
use tantivy::fastfield::FastFieldReader;
use tantivy::query::QueryParser;
use tantivy::schema::Field;
use tantivy::schema::{Schema, FAST, INDEXED, TEXT};
use tantivy::schema::{Schema, FAST, INT_INDEXED, TEXT};
use tantivy::Index;
use tantivy::SegmentReader;
use tantivy::{Index, TantivyError};
#[derive(Default)]
struct Stats {
@@ -75,18 +75,9 @@ impl Collector for StatsCollector {
fn for_segment(
&self,
_segment_local_id: u32,
segment_reader: &SegmentReader,
segment: &SegmentReader,
) -> tantivy::Result<StatsSegmentCollector> {
let fast_field_reader = segment_reader
.fast_fields()
.u64(self.field)
.ok_or_else(|| {
let field_name = segment_reader.schema().get_field_name(self.field);
TantivyError::SchemaError(format!(
"Field {:?} is not a u64 fast field.",
field_name
))
})?;
let fast_field_reader = segment.fast_field_reader(self.field)?;
Ok(StatsSegmentCollector {
fast_field_reader,
stats: Stats::default(),
@@ -146,7 +137,7 @@ fn main() -> tantivy::Result<()> {
// products, and with a name, a description, and a price.
let product_name = schema_builder.add_text_field("name", TEXT);
let product_description = schema_builder.add_text_field("description", TEXT);
let price = schema_builder.add_u64_field("price", INDEXED | FAST);
let price = schema_builder.add_u64_field("price", INT_INDEXED | FAST);
let schema = schema_builder.build();
// # Indexing documents
@@ -179,9 +170,9 @@ fn main() -> tantivy::Result<()> {
price => 5_200u64
));
index_writer.commit()?;
index.load_searchers()?;
let reader = index.reader()?;
let searcher = reader.searcher();
let searcher = index.searcher();
let query_parser = QueryParser::for_index(&index, vec![product_name, product_description]);
// here we want to get a hit on the 'ken' in Frankenstein

View File

@@ -91,9 +91,9 @@ fn main() -> tantivy::Result<()> {
increasing confidence in the success of my undertaking."#
));
index_writer.commit()?;
index.load_searchers()?;
let reader = index.reader()?;
let searcher = reader.searcher();
let searcher = index.searcher();
// The query parser can interpret human queries.
// Here, if the user does not specify which

View File

@@ -14,16 +14,12 @@ use tantivy::collector::TopDocs;
use tantivy::query::TermQuery;
use tantivy::schema::*;
use tantivy::Index;
use tantivy::IndexReader;
// A simple helper function to fetch a single document
// given its id from our index.
// It will be helpful to check our work.
fn extract_doc_given_isbn(
reader: &IndexReader,
isbn_term: &Term,
) -> tantivy::Result<Option<Document>> {
let searcher = reader.searcher();
fn extract_doc_given_isbn(index: &Index, isbn_term: &Term) -> tantivy::Result<Option<Document>> {
let searcher = index.searcher();
// This is the simplest query you can think of.
// It matches all of the documents containing a specific term.
@@ -89,12 +85,12 @@ fn main() -> tantivy::Result<()> {
isbn => "978-9176370711",
));
index_writer.commit()?;
let reader = index.reader()?;
index.load_searchers()?;
let frankenstein_isbn = Term::from_field_text(isbn, "978-9176370711");
// Oops our frankenstein doc seems mispelled
let frankenstein_doc_misspelled = extract_doc_given_isbn(&reader, &frankenstein_isbn)?.unwrap();
let frankenstein_doc_misspelled = extract_doc_given_isbn(&index, &frankenstein_isbn)?.unwrap();
assert_eq!(
schema.to_json(&frankenstein_doc_misspelled),
r#"{"isbn":["978-9176370711"],"title":["Frankentein"]}"#,
@@ -133,10 +129,10 @@ fn main() -> tantivy::Result<()> {
// Everything happened as if the document was updated.
index_writer.commit()?;
// We reload our searcher to make our change available to clients.
reader.reload()?;
index.load_searchers()?;
// No more typo!
let frankenstein_new_doc = extract_doc_given_isbn(&reader, &frankenstein_isbn)?.unwrap();
let frankenstein_new_doc = extract_doc_given_isbn(&index, &frankenstein_isbn)?.unwrap();
assert_eq!(
schema.to_json(&frankenstein_new_doc),
r#"{"isbn":["978-9176370711"],"title":["Frankenstein"]}"#,

View File

@@ -55,9 +55,9 @@ fn main() -> tantivy::Result<()> {
index_writer.commit()?;
let reader = index.reader()?;
index.load_searchers()?;
let searcher = reader.searcher();
let searcher = index.searcher();
let mut facet_collector = FacetCollector::for_field(tags);
facet_collector.add_facet("/pools");

View File

@@ -1,43 +0,0 @@
// # Searching a range on an indexed int field.
//
// Below is an example of creating an indexed integer field in your schema
// You can use RangeQuery to get a Count of all occurrences in a given range.
#[macro_use]
extern crate tantivy;
use tantivy::collector::Count;
use tantivy::query::RangeQuery;
use tantivy::schema::{Schema, INDEXED};
use tantivy::Index;
use tantivy::Result;
fn run() -> Result<()> {
// For the sake of simplicity, this schema will only have 1 field
let mut schema_builder = Schema::builder();
// `INDEXED` is a short-hand to indicate that our field should be "searchable".
let year_field = schema_builder.add_u64_field("year", INDEXED);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let reader = index.reader()?;
{
let mut index_writer = index.writer_with_num_threads(1, 6_000_000)?;
for year in 1950u64..2019u64 {
index_writer.add_document(doc!(year_field => year));
}
index_writer.commit()?;
// The index will be a range of years
}
reader.reload()?;
let searcher = reader.searcher();
// The end is excluded i.e. here we are searching up to 1969
let docs_in_the_sixties = RangeQuery::new_u64(year_field, 1960..1970);
// Uses a Count collector to sum the total number of docs in the range
let num_60s_books = searcher.search(&docs_in_the_sixties, &Count)?;
assert_eq!(num_60s_books, 10);
Ok(())
}
fn main() {
run().unwrap()
}

View File

@@ -33,9 +33,9 @@ fn main() -> tantivy::Result<()> {
index_writer.add_document(doc!(title => "The modern Promotheus"));
index_writer.commit()?;
let reader = index.reader()?;
index.load_searchers()?;
let searcher = reader.searcher();
let searcher = index.searcher();
// A tantivy index is actually a collection of segments.
// Similarly, a searcher just wraps a list `segment_reader`.

View File

@@ -1,107 +0,0 @@
// # Indexing from different threads.
//
// It is fairly common to have to index from different threads.
// Tantivy forbids to create more than one `IndexWriter` at a time.
//
// This `IndexWriter` itself has its own multithreaded layer, so managing your own
// indexing threads will not help. However, it can still be useful for some applications.
//
// For instance, if preparing documents to send to tantivy before indexing is the bottleneck of
// your application, it is reasonable to have multiple threads.
//
// Another very common reason to want to index from multiple threads, is implementing a webserver
// with CRUD capabilities. The server framework will most likely handle request from
// different threads.
//
// The recommended way to address both of these use case is to wrap your `IndexWriter` into a
// `Arc<RwLock<IndexWriter>>`.
//
// While this is counterintuitive, adding and deleting documents do not require mutability
// over the `IndexWriter`, so several threads will be able to do this operation concurrently.
//
// The example below does not represent an actual real-life use case (who would spawn thread to
// index a single document?), but aims at demonstrating the mechanism that makes indexing
// from several threads possible.
extern crate tempdir;
// ---
// Importing tantivy...
#[macro_use]
extern crate tantivy;
use std::sync::{Arc, RwLock};
use std::thread;
use std::time::Duration;
use tantivy::schema::{Schema, STORED, TEXT};
use tantivy::Opstamp;
use tantivy::{Index, IndexWriter};
fn main() -> tantivy::Result<()> {
// # Defining the schema
let mut schema_builder = Schema::builder();
let title = schema_builder.add_text_field("title", TEXT | STORED);
let body = schema_builder.add_text_field("body", TEXT);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let index_writer: Arc<RwLock<IndexWriter>> = Arc::new(RwLock::new(index.writer(50_000_000)?));
// # First indexing thread.
let index_writer_clone_1 = index_writer.clone();
thread::spawn(move || {
// we index 100 times the document... for the sake of the example.
for i in 0..100 {
let opstamp = {
// A read lock is sufficient here.
let index_writer_rlock = index_writer_clone_1.read().unwrap();
index_writer_rlock.add_document(
doc!(
title => "Of Mice and Men",
body => "A few miles south of Soledad, the Salinas River drops in close to the hillside \
bank and runs deep and green. The water is warm too, for it has slipped twinkling \
over the yellow sands in the sunlight before reaching the narrow pool. On one \
side of the river the golden foothill slopes curve up to the strong and rocky \
Gabilan Mountains, but on the valley side the water is lined with trees—willows \
fresh and green with every spring, carrying in their lower leaf junctures the \
debris of the winters flooding; and sycamores with mottled, white, recumbent \
limbs and branches that arch over the pool"
))
};
println!("add doc {} from thread 1 - opstamp {}", i, opstamp);
thread::sleep(Duration::from_millis(20));
}
});
// # Second indexing thread.
let index_writer_clone_2 = index_writer.clone();
// For convenience, tantivy also comes with a macro to
// reduce the boilerplate above.
thread::spawn(move || {
// we index 100 times the document... for the sake of the example.
for i in 0..100 {
// A read lock is sufficient here.
let opstamp = {
let index_writer_rlock = index_writer_clone_2.read().unwrap();
index_writer_rlock.add_document(doc!(
title => "Manufacturing consent",
body => "Some great book description..."
))
};
println!("add doc {} from thread 2 - opstamp {}", i, opstamp);
thread::sleep(Duration::from_millis(10));
}
});
// # In the main thread, we commit 10 times, once every 500ms.
for _ in 0..10 {
let opstamp: Opstamp = {
// Committing or rollbacking on the other hand requires write lock. This will block other threads.
let mut index_writer_wlock = index_writer.write().unwrap();
index_writer_wlock.commit().unwrap()
};
println!("committed with opstamp {}", opstamp);
thread::sleep(Duration::from_millis(500));
}
Ok(())
}

View File

@@ -48,8 +48,9 @@ fn main() -> tantivy::Result<()> {
// ...
index_writer.commit()?;
let reader = index.reader()?;
let searcher = reader.searcher();
index.load_searchers()?;
let searcher = index.searcher();
let query_parser = QueryParser::for_index(&index, vec![title, body]);
let query = query_parser.parse_query("sycamore spring")?;

View File

@@ -96,9 +96,9 @@ fn main() -> tantivy::Result<()> {
index_writer.commit()?;
let reader = index.reader()?;
index.load_searchers()?;
let searcher = reader.searcher();
let searcher = index.searcher();
let query_parser = QueryParser::for_index(&index, vec![title, body]);

View File

@@ -12,7 +12,7 @@ fn main() -> tantivy::Result<()> {
let mut schema_builder = Schema::builder();
schema_builder.add_text_field("title", TEXT | STORED);
schema_builder.add_text_field("body", TEXT);
schema_builder.add_u64_field("year", INDEXED);
schema_builder.add_u64_field("year", INT_INDEXED);
let schema = schema_builder.build();
// Let's assume we have a json-serialized document.

View File

@@ -40,8 +40,8 @@ use SegmentReader;
/// index_writer.commit().unwrap();
/// }
///
/// let reader = index.reader()?;
/// let searcher = reader.searcher();
/// index.load_searchers()?;
/// let searcher = index.searcher();
///
/// {
/// let query_parser = QueryParser::for_index(&index, vec![title]);

View File

@@ -17,7 +17,6 @@ use Result;
use Score;
use SegmentLocalId;
use SegmentReader;
use TantivyError;
struct Hit<'a> {
count: u64,
@@ -123,16 +122,17 @@ fn facet_depth(facet_bytes: &[u8]) -> usize {
/// facet => Facet::from("/lang/en"),
/// facet => Facet::from("/category/biography")
/// ));
/// index_writer.commit()?;
/// index_writer.commit().unwrap();
/// }
/// let reader = index.reader()?;
/// let searcher = reader.searcher();
///
/// index.load_searchers()?;
/// let searcher = index.searcher();
///
/// {
/// let mut facet_collector = FacetCollector::for_field(facet);
/// facet_collector.add_facet("/lang");
/// facet_collector.add_facet("/category");
/// let facet_counts = searcher.search(&AllQuery, &facet_collector)?;
/// let facet_counts = searcher.search(&AllQuery, &facet_collector).unwrap();
///
/// // This lists all of the facet counts
/// let facets: Vec<(&Facet, u64)> = facet_counts
@@ -147,7 +147,7 @@ fn facet_depth(facet_bytes: &[u8]) -> usize {
/// {
/// let mut facet_collector = FacetCollector::for_field(facet);
/// facet_collector.add_facet("/category/fiction");
/// let facet_counts = searcher.search(&AllQuery, &facet_collector)?;
/// let facet_counts = searcher.search(&AllQuery, &facet_collector).unwrap();
///
/// // This lists all of the facet counts
/// let facets: Vec<(&Facet, u64)> = facet_counts
@@ -163,7 +163,7 @@ fn facet_depth(facet_bytes: &[u8]) -> usize {
/// {
/// let mut facet_collector = FacetCollector::for_field(facet);
/// facet_collector.add_facet("/category/fiction");
/// let facet_counts = searcher.search(&AllQuery, &facet_collector)?;
/// let facet_counts = searcher.search(&AllQuery, &facet_collector).unwrap();
///
/// // This lists all of the facet counts
/// let facets: Vec<(&Facet, u64)> = facet_counts.top_k("/category/fiction", 1);
@@ -265,10 +265,7 @@ impl Collector for FacetCollector {
_: SegmentLocalId,
reader: &SegmentReader,
) -> Result<FacetSegmentCollector> {
let field_name = reader.schema().get_field_name(self.field);
let facet_reader = reader.facet_reader(self.field).ok_or_else(|| {
TantivyError::SchemaError(format!("Field {:?} is not a facet field.", field_name))
})?;
let facet_reader = reader.facet_reader(self.field)?;
let mut collapse_mapping = Vec::new();
let mut counts = Vec::new();
@@ -486,8 +483,8 @@ mod tests {
index_writer.add_document(doc);
}
index_writer.commit().unwrap();
let reader = index.reader().unwrap();
let searcher = reader.searcher();
index.load_searchers().unwrap();
let searcher = index.searcher();
let mut facet_collector = FacetCollector::for_field(facet_field);
facet_collector.add_facet(Facet::from("/top1"));
let counts = searcher.search(&AllQuery, &facet_collector).unwrap();
@@ -535,8 +532,8 @@ mod tests {
facet_field => Facet::from_text(&"/subjects/B/b"),
));
index_writer.commit().unwrap();
let reader = index.reader().unwrap();
let searcher = reader.searcher();
index.load_searchers().unwrap();
let searcher = index.searcher();
assert_eq!(searcher.num_docs(), 1);
let mut facet_collector = FacetCollector::for_field(facet_field);
facet_collector.add_facet("/subjects");
@@ -582,7 +579,9 @@ mod tests {
index_writer.add_document(doc);
}
index_writer.commit().unwrap();
let searcher = index.reader().unwrap().searcher();
index.load_searchers().unwrap();
let searcher = index.searcher();
let mut facet_collector = FacetCollector::for_field(facet_field);
facet_collector.add_facet("/facet");
@@ -636,7 +635,8 @@ mod bench {
index_writer.add_document(doc);
}
index_writer.commit().unwrap();
let reader = index.reader().unwrap();
index.load_searchers().unwrap();
b.iter(|| {
let searcher = index.searcher();
let facet_collector = FacetCollector::for_field(facet_field);

View File

@@ -88,7 +88,7 @@ mod tests {
let index = Index::create_in_ram(schema.clone());
{
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
{
for i in 0u64..10u64 {
index_writer.add_document(doc!(
@@ -101,7 +101,8 @@ mod tests {
assert_eq!(index_writer.commit().unwrap(), 10u64);
}
let searcher = index.reader().searcher();
index.load_searchers().unwrap();
let searcher = index.searcher();
let mut ffvf_i64: IntFacetCollector<I64FastFieldReader> = IntFacetCollector::new(num_field_i64);
let mut ffvf_u64: IntFacetCollector<U64FastFieldReader> = IntFacetCollector::new(num_field_u64);

View File

@@ -53,9 +53,9 @@ use tantivy::collector::{Count, TopDocs};
# index_writer.add_document(doc!(
# title => "The Diary of Muadib",
# ));
# index_writer.commit()?;
# let reader = index.reader()?;
# let searcher = reader.searcher();
# index_writer.commit().unwrap();
# index.load_searchers()?;
# let searcher = index.searcher();
# let query_parser = QueryParser::for_index(&index, vec![title]);
# let query = query_parser.parse_query("diary")?;
let (doc_count, top_docs): (usize, Vec<(Score, DocAddress)>) =
@@ -85,7 +85,7 @@ See the `custom_collector` example.
*/
use downcast_rs;
use downcast;
use DocId;
use Result;
use Score;
@@ -111,9 +111,9 @@ pub use self::facet_collector::FacetCollector;
/// `Fruit` is the type for the result of our collection.
/// e.g. `usize` for the `Count` collector.
pub trait Fruit: Send + downcast_rs::Downcast {}
pub trait Fruit: Send + downcast::Any {}
impl<T> Fruit for T where T: Send + downcast_rs::Downcast {}
impl<T> Fruit for T where T: Send + downcast::Any {}
/// Collectors are in charge of collecting and retaining relevant
/// information from the document found and scored by the query.
@@ -358,7 +358,10 @@ where
}
}
impl_downcast!(Fruit);
#[allow(missing_docs)]
mod downcast_impl {
downcast!(super::Fruit);
}
#[cfg(test)]
pub mod tests;

View File

@@ -1,8 +1,8 @@
use super::Collector;
use super::SegmentCollector;
use collector::Fruit;
use downcast::Downcast;
use std::marker::PhantomData;
use std::ops::Deref;
use DocId;
use Result;
use Score;
@@ -37,11 +37,11 @@ impl<TCollector: Collector> Collector for CollectorWrapper<TCollector> {
let typed_fruit: Vec<TCollector::Fruit> = children
.into_iter()
.map(|untyped_fruit| {
untyped_fruit
.downcast::<TCollector::Fruit>()
Downcast::<TCollector::Fruit>::downcast(untyped_fruit)
.map(|boxed_but_typed| *boxed_but_typed)
.map_err(|_| {
TantivyError::InvalidArgument("Failed to cast child fruit.".to_string())
.map_err(|e| {
let err_msg = format!("Failed to cast child collector fruit. {:?}", e);
TantivyError::InvalidArgument(err_msg)
})
})
.collect::<Result<_>>()?;
@@ -89,20 +89,14 @@ pub struct FruitHandle<TFruit: Fruit> {
impl<TFruit: Fruit> FruitHandle<TFruit> {
pub fn extract(self, fruits: &mut MultiFruit) -> TFruit {
let boxed_fruit = fruits.sub_fruits[self.pos].take().expect("");
*boxed_fruit
.downcast::<TFruit>()
.map_err(|_| ())
.expect("Failed to downcast collector fruit.")
*Downcast::<TFruit>::downcast(boxed_fruit).expect("Failed")
}
}
/// Multicollector makes it possible to collect on more than one collector.
/// It should only be used for use cases where the Collector types is unknown
/// at compile time.
///
/// If the type of the collectors is known, you can just group yours collectors
/// in a tuple. See the
/// [Combining several collectors section of the collector documentation](./index.html#combining-several-collectors).
/// If the type of the collectors is known, you should prefer to use `ChainedCollector`.
///
/// ```rust
/// #[macro_use]
@@ -135,8 +129,8 @@ impl<TFruit: Fruit> FruitHandle<TFruit> {
/// index_writer.commit().unwrap();
/// }
///
/// let reader = index.reader()?;
/// let searcher = reader.searcher();
/// index.load_searchers()?;
/// let searcher = index.searcher();
///
/// let mut collectors = MultiCollector::new();
/// let top_docs_handle = collectors.add_collector(TopDocs::with_limit(2));
@@ -200,10 +194,7 @@ impl<'a> Collector for MultiCollector<'a> {
}
fn requires_scoring(&self) -> bool {
self.collector_wrappers
.iter()
.map(Deref::deref)
.any(Collector::requires_scoring)
self.collector_wrappers.iter().any(|c| c.requires_scoring())
}
fn merge_fruits(&self, segments_multifruits: Vec<MultiFruit>) -> Result<MultiFruit> {
@@ -282,7 +273,8 @@ mod tests {
index_writer.add_document(doc!(text=>"abc"));
index_writer.commit().unwrap();
}
let searcher = index.reader().unwrap().searcher();
index.load_searchers().unwrap();
let searcher = index.searcher();
let term = Term::from_field_text(text, "abc");
let query = TermQuery::new(term, IndexRecordOption::Basic);

View File

@@ -114,15 +114,11 @@ impl Collector for FastFieldTestCollector {
fn for_segment(
&self,
_: SegmentLocalId,
segment_reader: &SegmentReader,
reader: &SegmentReader,
) -> Result<FastFieldSegmentCollector> {
let reader = segment_reader
.fast_fields()
.u64(self.field)
.expect("Requested field is not a fast field.");
Ok(FastFieldSegmentCollector {
vals: Vec::new(),
reader,
reader: reader.fast_field_reader(self.field)?,
})
}
@@ -174,14 +170,11 @@ impl Collector for BytesFastFieldTestCollector {
fn for_segment(
&self,
_segment_local_id: u32,
segment_reader: &SegmentReader,
segment: &SegmentReader,
) -> Result<BytesFastFieldSegmentCollector> {
Ok(BytesFastFieldSegmentCollector {
vals: Vec::new(),
reader: segment_reader
.fast_fields()
.bytes(self.field)
.expect("Field is not a bytes fast field."),
reader: segment.bytes_fast_field_reader(self.field)?,
})
}
@@ -198,7 +191,7 @@ impl SegmentCollector for BytesFastFieldSegmentCollector {
type Fruit = Vec<u8>;
fn collect(&mut self, doc: u32, _score: f32) {
let data = self.reader.get_bytes(doc);
let data = self.reader.get_val(doc);
self.vals.extend(data);
}

View File

@@ -98,11 +98,11 @@ where
.collect())
}
pub(crate) fn for_segment<F: PartialOrd>(
pub(crate) fn for_segment(
&self,
segment_id: SegmentLocalId,
_: &SegmentReader,
) -> Result<TopSegmentCollector<F>> {
) -> Result<TopSegmentCollector<T>> {
Ok(TopSegmentCollector::new(segment_id, self.limit))
}
}

View File

@@ -5,12 +5,10 @@ use collector::SegmentCollector;
use fastfield::FastFieldReader;
use fastfield::FastValue;
use schema::Field;
use std::marker::PhantomData;
use DocAddress;
use Result;
use SegmentLocalId;
use SegmentReader;
use TantivyError;
/// The Top Field Collector keeps track of the K documents
/// sorted by a fast field in the index
@@ -25,16 +23,15 @@ use TantivyError;
/// # use tantivy::schema::{Schema, Field, FAST, TEXT};
/// # use tantivy::{Index, Result, DocAddress};
/// # use tantivy::query::{Query, QueryParser};
/// use tantivy::Searcher;
/// use tantivy::collector::TopDocs;
///
/// # fn main() -> tantivy::Result<()> {
/// # fn main() {
/// # let mut schema_builder = Schema::builder();
/// # let title = schema_builder.add_text_field("title", TEXT);
/// # let rating = schema_builder.add_u64_field("rating", FAST);
/// # let schema = schema_builder.build();
/// # let index = Index::create_in_ram(schema);
/// # let mut index_writer = index.writer_with_num_threads(1, 3_000_000)?;
/// # let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
/// # index_writer.add_document(doc!(
/// # title => "The Name of the Wind",
/// # rating => 92u64,
@@ -42,14 +39,13 @@ use TantivyError;
/// # index_writer.add_document(doc!(title => "The Diary of Muadib", rating => 97u64));
/// # index_writer.add_document(doc!(title => "A Dairy Cow", rating => 63u64));
/// # index_writer.add_document(doc!(title => "The Diary of a Young Girl", rating => 80u64));
/// # index_writer.commit()?;
/// # let reader = index.reader()?;
/// # let query = QueryParser::for_index(&index, vec![title]).parse_query("diary")?;
/// # let top_docs = docs_sorted_by_rating(&reader.searcher(), &query, rating)?;
/// # index_writer.commit().unwrap();
/// # index.load_searchers().unwrap();
/// # let query = QueryParser::for_index(&index, vec![title]).parse_query("diary").unwrap();
/// # let top_docs = docs_sorted_by_rating(&index, &query, rating).unwrap();
/// # assert_eq!(top_docs,
/// # vec![(97u64, DocAddress(0u32, 1)),
/// # (80u64, DocAddress(0u32, 3))]);
/// # Ok(())
/// # }
/// #
/// /// Searches the document matching the given query, and
@@ -57,9 +53,7 @@ use TantivyError;
/// /// given in argument.
/// ///
/// /// `field` is required to be a FAST field.
/// fn docs_sorted_by_rating(searcher: &Searcher,
/// query: &Query,
/// sort_by_field: Field)
/// fn docs_sorted_by_rating(index: &Index, query: &Query, sort_by_field: Field)
/// -> Result<Vec<(u64, DocAddress)>> {
///
/// // This is where we build our collector!
@@ -67,7 +61,8 @@ use TantivyError;
///
/// // ... and here is our documents. Not this is a simple vec.
/// // The `u64` in the pair is the value of our fast field for each documents.
/// searcher.search(query, &top_docs_by_rating)
/// index.searcher()
/// .search(query, &top_docs_by_rating)
/// }
/// ```
pub struct TopDocsByField<T> {
@@ -81,12 +76,6 @@ impl<T: FastValue + PartialOrd + Clone> TopDocsByField<T> {
/// The given field name must be a fast field, otherwise the collector have an error while
/// collecting results.
///
/// This constructor is crate-private. Client are supposed to call
/// build `TopDocsByField` object using the `TopDocs` API.
///
/// e.g.:
/// `TopDocs::with_limit(2).order_by_field(sort_by_field)`
///
/// # Panics
/// The method panics if limit is 0
pub(crate) fn new(field: Field, limit: usize) -> TopDocsByField<T> {
@@ -108,15 +97,8 @@ impl<T: FastValue + PartialOrd + Send + Sync + 'static> Collector for TopDocsByF
reader: &SegmentReader,
) -> Result<TopFieldSegmentCollector<T>> {
let collector = self.collector.for_segment(segment_local_id, reader)?;
let reader = reader.fast_fields().u64(self.field).ok_or_else(|| {
let field_name = reader.schema().get_field_name(self.field);
TantivyError::SchemaError(format!("Failed to find fast field reader {:?}", field_name))
})?;
Ok(TopFieldSegmentCollector {
collector,
reader,
_type: PhantomData,
})
let reader = reader.fast_field_reader(self.field)?;
Ok(TopFieldSegmentCollector { collector, reader })
}
fn requires_scoring(&self) -> bool {
@@ -131,10 +113,9 @@ impl<T: FastValue + PartialOrd + Send + Sync + 'static> Collector for TopDocsByF
}
}
pub struct TopFieldSegmentCollector<T> {
collector: TopSegmentCollector<u64>,
reader: FastFieldReader<u64>,
_type: PhantomData<T>,
pub struct TopFieldSegmentCollector<T: FastValue + PartialOrd> {
collector: TopSegmentCollector<T>,
reader: FastFieldReader<T>,
}
impl<T: FastValue + PartialOrd + Send + Sync + 'static> SegmentCollector
@@ -148,11 +129,7 @@ impl<T: FastValue + PartialOrd + Send + Sync + 'static> SegmentCollector
}
fn harvest(self) -> Vec<(T, DocAddress)> {
self.collector
.harvest()
.into_iter()
.map(|(val, doc_address)| (T::from_u64(val), doc_address))
.collect()
self.collector.harvest()
}
}
@@ -194,7 +171,7 @@ mod tests {
size => 16u64,
));
});
let searcher = index.reader().unwrap().searcher();
let searcher = index.searcher();
let top_collector = TopDocs::with_limit(4).order_by_field(size);
let top_docs: Vec<(u64, DocAddress)> = searcher.search(&query, &top_collector).unwrap();
@@ -221,7 +198,7 @@ mod tests {
size => 12u64,
));
});
let searcher = index.reader().unwrap().searcher();
let searcher = index.searcher();
let top_collector: TopDocsByField<u64> = TopDocs::with_limit(4).order_by_field(Field(2));
let segment_reader = searcher.segment_reader(0u32);
top_collector
@@ -241,7 +218,7 @@ mod tests {
size => 12u64,
));
});
let searcher = index.reader().unwrap().searcher();
let searcher = index.searcher();
let segment = searcher.segment_reader(0);
let top_collector: TopDocsByField<u64> = TopDocs::with_limit(4).order_by_field(size);
assert_matches!(
@@ -249,7 +226,7 @@ mod tests {
.for_segment(0, segment)
.map(|_| ())
.unwrap_err(),
TantivyError::SchemaError(_)
TantivyError::FastFieldError(_)
);
}
@@ -264,6 +241,8 @@ mod tests {
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
doc_adder(&mut index_writer);
index_writer.commit().unwrap();
index.load_searchers().unwrap();
let query_parser = QueryParser::for_index(&index, vec![query_field]);
let query = query_parser.parse_query(query).unwrap();
(index, query)

View File

@@ -51,8 +51,8 @@ use SegmentReader;
/// index_writer.commit().unwrap();
/// }
///
/// let reader = index.reader()?;
/// let searcher = reader.searcher();
/// index.load_searchers()?;
/// let searcher = index.searcher();
///
/// let query_parser = QueryParser::for_index(&index, vec![title]);
/// let query = query_parser.parse_query("diary")?;
@@ -142,12 +142,13 @@ mod tests {
let index = Index::create_in_ram(schema);
{
// writing the segment
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
index_writer.add_document(doc!(text_field=>"Hello happy tax payer."));
index_writer.add_document(doc!(text_field=>"Droopy says hello happy tax payer"));
index_writer.add_document(doc!(text_field=>"I like Droopy"));
assert!(index_writer.commit().is_ok());
}
index.load_searchers().unwrap();
index
}
@@ -158,8 +159,6 @@ mod tests {
let query_parser = QueryParser::for_index(&index, vec![field]);
let text_query = query_parser.parse_query("droopy tax").unwrap();
let score_docs: Vec<(Score, DocAddress)> = index
.reader()
.unwrap()
.searcher()
.search(&text_query, &TopDocs::with_limit(4))
.unwrap();
@@ -180,8 +179,6 @@ mod tests {
let query_parser = QueryParser::for_index(&index, vec![field]);
let text_query = query_parser.parse_query("droopy tax").unwrap();
let score_docs: Vec<(Score, DocAddress)> = index
.reader()
.unwrap()
.searcher()
.search(&text_query, &TopDocs::with_limit(2))
.unwrap();

View File

@@ -64,7 +64,7 @@ pub struct BitUnpacker<Data>
where
Data: Deref<Target = [u8]>,
{
num_bits: u64,
num_bits: usize,
mask: u64,
data: Data,
}
@@ -80,13 +80,13 @@ where
(1u64 << num_bits) - 1u64
};
BitUnpacker {
num_bits: u64::from(num_bits),
num_bits: num_bits as usize,
mask,
data,
}
}
pub fn get(&self, idx: u64) -> u64 {
pub fn get(&self, idx: usize) -> u64 {
if self.num_bits == 0 {
return 0u64;
}
@@ -97,13 +97,38 @@ where
let addr = addr_in_bits >> 3;
let bit_shift = addr_in_bits & 7;
debug_assert!(
addr + 8 <= data.len() as u64,
addr + 8 <= data.len(),
"The fast field field should have been padded with 7 bytes."
);
let val_unshifted_unmasked: u64 = LittleEndian::read_u64(&data[(addr as usize)..]);
let val_unshifted_unmasked: u64 = LittleEndian::read_u64(&data[addr..]);
let val_shifted = (val_unshifted_unmasked >> bit_shift) as u64;
val_shifted & mask
}
/// Reads a range of values from the fast field.
///
/// The range of values read is from
/// `[start..start + output.len()[`
pub fn get_range(&self, start: u32, output: &mut [u64]) {
if self.num_bits == 0 {
for val in output.iter_mut() {
*val = 0u64;
}
} else {
let data: &[u8] = &*self.data;
let num_bits = self.num_bits;
let mask = self.mask;
let mut addr_in_bits = (start as usize) * num_bits;
for output_val in output.iter_mut() {
let addr = addr_in_bits >> 3;
let bit_shift = addr_in_bits & 7;
let val_unshifted_unmasked: u64 = LittleEndian::read_u64(&data[addr..]);
let val_shifted = (val_unshifted_unmasked >> bit_shift) as u64;
*output_val = val_shifted & mask;
addr_in_bits += num_bits;
}
}
}
}
#[cfg(test)]
@@ -129,7 +154,7 @@ mod test {
fn test_bitpacker_util(len: usize, num_bits: u8) {
let (bitunpacker, vals) = create_fastfield_bitpacker(len, num_bits);
for (i, val) in vals.iter().enumerate() {
assert_eq!(bitunpacker.get(i as u64), *val);
assert_eq!(bitunpacker.get(i), *val);
}
}
@@ -141,4 +166,17 @@ mod test {
test_bitpacker_util(6, 14);
test_bitpacker_util(1000, 14);
}
#[test]
fn test_bitpacker_range() {
let (bitunpacker, vals) = create_fastfield_bitpacker(100_000, 12);
let buffer_len = 100;
let mut buffer = vec![0u64; buffer_len];
for start in vec![0, 10, 20, 100, 1_000] {
bitunpacker.get_range(start as u32, &mut buffer[..]);
for i in 0..buffer_len {
assert_eq!(buffer[i], vals[start + i]);
}
}
}
}

View File

@@ -39,7 +39,7 @@ impl BinarySerializable for FileAddr {
/// A `CompositeWrite` is used to write a `CompositeFile`.
pub struct CompositeWrite<W = WritePtr> {
write: CountingWriter<W>,
offsets: HashMap<FileAddr, u64>,
offsets: HashMap<FileAddr, usize>,
}
impl<W: Write> CompositeWrite<W> {

View File

@@ -3,7 +3,7 @@ use std::io::Write;
pub struct CountingWriter<W> {
underlying: W,
written_bytes: u64,
written_bytes: usize,
}
impl<W: Write> CountingWriter<W> {
@@ -14,11 +14,11 @@ impl<W: Write> CountingWriter<W> {
}
}
pub fn written_bytes(&self) -> u64 {
pub fn written_bytes(&self) -> usize {
self.written_bytes
}
pub fn finish(mut self) -> io::Result<(W, u64)> {
pub fn finish(mut self) -> io::Result<(W, usize)> {
self.flush()?;
Ok((self.underlying, self.written_bytes))
}
@@ -27,16 +27,10 @@ impl<W: Write> CountingWriter<W> {
impl<W: Write> Write for CountingWriter<W> {
fn write(&mut self, buf: &[u8]) -> io::Result<usize> {
let written_size = self.underlying.write(buf)?;
self.written_bytes += written_size as u64;
self.written_bytes += written_size;
Ok(written_size)
}
fn write_all(&mut self, buf: &[u8]) -> io::Result<()> {
self.underlying.write_all(buf)?;
self.written_bytes += buf.len() as u64;
Ok(())
}
fn flush(&mut self) -> io::Result<()> {
self.underlying.flush()
}
@@ -54,8 +48,8 @@ mod test {
let mut counting_writer = CountingWriter::wrap(buffer);
let bytes = (0u8..10u8).collect::<Vec<u8>>();
counting_writer.write_all(&bytes).unwrap();
let (w, len): (Vec<u8>, u64) = counting_writer.finish().unwrap();
assert_eq!(len, 10u64);
let (w, len): (Vec<u8>, usize) = counting_writer.finish().unwrap();
assert_eq!(len, 10);
assert_eq!(w.len(), 10);
}
}

View File

@@ -10,13 +10,10 @@ pub(crate) use self::bitset::TinySet;
pub(crate) use self::composite_file::{CompositeFile, CompositeWrite};
pub use self::counting_writer::CountingWriter;
pub use self::serialize::{BinarySerializable, FixedSize};
pub use self::vint::{read_u32_vint, serialize_vint_u32, write_u32_vint, VInt};
pub use self::vint::VInt;
pub use byteorder::LittleEndian as Endianness;
/// Segment's max doc must be `< MAX_DOC_LIMIT`.
///
/// We do not allow segments with more than
pub const MAX_DOC_LIMIT: u32 = 1 << 31;
use std::io;
/// Computes the number of bits that will be used for bitpacking.
///
@@ -55,6 +52,11 @@ pub(crate) fn is_power_of_2(n: usize) -> bool {
(n > 0) && (n & (n - 1) == 0)
}
/// Create a default io error given a string.
pub(crate) fn make_io_err(msg: String) -> io::Error {
io::Error::new(io::ErrorKind::Other, msg)
}
/// Has length trait
pub trait HasLen {
/// Return length
@@ -132,11 +134,4 @@ pub(crate) mod test {
assert_eq!(compute_num_bits(256), 9u8);
assert_eq!(compute_num_bits(5_000_000_000), 33u8);
}
#[test]
fn test_max_doc() {
// this is the first time I write a unit test for a constant.
assert!(((super::MAX_DOC_LIMIT - 1) as i32) >= 0);
assert!((super::MAX_DOC_LIMIT as i32) < 0);
}
}

View File

@@ -1,5 +1,4 @@
use super::BinarySerializable;
use byteorder::{ByteOrder, LittleEndian};
use std::io;
use std::io::Read;
use std::io::Write;
@@ -10,100 +9,6 @@ pub struct VInt(pub u64);
const STOP_BIT: u8 = 128;
pub fn serialize_vint_u32(val: u32) -> (u64, usize) {
const START_2: u64 = 1 << 7;
const START_3: u64 = 1 << 14;
const START_4: u64 = 1 << 21;
const START_5: u64 = 1 << 28;
const STOP_1: u64 = START_2 - 1;
const STOP_2: u64 = START_3 - 1;
const STOP_3: u64 = START_4 - 1;
const STOP_4: u64 = START_5 - 1;
const MASK_1: u64 = 127;
const MASK_2: u64 = MASK_1 << 7;
const MASK_3: u64 = MASK_2 << 7;
const MASK_4: u64 = MASK_3 << 7;
const MASK_5: u64 = MASK_4 << 7;
let val = u64::from(val);
const STOP_BIT: u64 = 128u64;
match val {
0...STOP_1 => (val | STOP_BIT, 1),
START_2...STOP_2 => (
(val & MASK_1) | ((val & MASK_2) << 1) | (STOP_BIT << (8)),
2,
),
START_3...STOP_3 => (
(val & MASK_1) | ((val & MASK_2) << 1) | ((val & MASK_3) << 2) | (STOP_BIT << (8 * 2)),
3,
),
START_4...STOP_4 => (
(val & MASK_1)
| ((val & MASK_2) << 1)
| ((val & MASK_3) << 2)
| ((val & MASK_4) << 3)
| (STOP_BIT << (8 * 3)),
4,
),
_ => (
(val & MASK_1)
| ((val & MASK_2) << 1)
| ((val & MASK_3) << 2)
| ((val & MASK_4) << 3)
| ((val & MASK_5) << 4)
| (STOP_BIT << (8 * 4)),
5,
),
}
}
/// Returns the number of bytes covered by a
/// serialized vint `u32`.
///
/// Expects a buffer data that starts
/// by the serialized `vint`, scans at most 5 bytes ahead until
/// it finds the vint final byte.
///
/// # May Panic
/// If the payload does not start by a valid `vint`
fn vint_len(data: &[u8]) -> usize {
for (i, &val) in data.iter().enumerate().take(5) {
if val >= STOP_BIT {
return i + 1;
}
}
panic!("Corrupted data. Invalid VInt 32");
}
/// Reads a vint `u32` from a buffer, and
/// consumes its payload data.
///
/// # Panics
///
/// If the buffer does not start by a valid
/// vint payload
pub fn read_u32_vint(data: &mut &[u8]) -> u32 {
let vlen = vint_len(*data);
let mut result = 0u32;
let mut shift = 0u64;
for &b in &data[..vlen] {
result |= u32::from(b & 127u8) << shift;
shift += 7;
}
*data = &data[vlen..];
result
}
/// Write a `u32` as a vint payload.
pub fn write_u32_vint<W: io::Write>(val: u32, writer: &mut W) -> io::Result<()> {
let (val, num_bytes) = serialize_vint_u32(val);
let mut buffer = [0u8; 8];
LittleEndian::write_u64(&mut buffer, val);
writer.write_all(&buffer[..num_bytes])
}
impl VInt {
pub fn val(&self) -> u64 {
self.0
@@ -119,7 +24,7 @@ impl VInt {
output.extend(&buffer[0..num_bytes]);
}
pub fn serialize_into(&self, buffer: &mut [u8; 10]) -> usize {
fn serialize_into(&self, buffer: &mut [u8; 10]) -> usize {
let mut remaining = self.0;
for (i, b) in buffer.iter_mut().enumerate() {
let next_byte: u8 = (remaining % 128u64) as u8;
@@ -159,7 +64,7 @@ impl BinarySerializable for VInt {
return Err(io::Error::new(
io::ErrorKind::InvalidData,
"Reach end of buffer while reading VInt",
));
))
}
}
}
@@ -169,9 +74,7 @@ impl BinarySerializable for VInt {
#[cfg(test)]
mod tests {
use super::serialize_vint_u32;
use super::VInt;
use byteorder::{ByteOrder, LittleEndian};
use common::BinarySerializable;
fn aux_test_vint(val: u64) {
@@ -205,28 +108,4 @@ mod tests {
}
aux_test_vint(10);
}
fn aux_test_serialize_vint_u32(val: u32) {
let mut buffer = [0u8; 10];
let mut buffer2 = [0u8; 10];
let len_vint = VInt(val as u64).serialize_into(&mut buffer);
let (vint, len) = serialize_vint_u32(val);
assert_eq!(len, len_vint, "len wrong for val {}", val);
LittleEndian::write_u64(&mut buffer2, vint);
assert_eq!(&buffer[..len], &buffer2[..len], "array wrong for {}", val);
}
#[test]
fn test_vint_u32() {
aux_test_serialize_vint_u32(0);
aux_test_serialize_vint_u32(1);
aux_test_serialize_vint_u32(5);
for i in 1..3 {
let power_of_128 = 1u32 << (7 * i);
aux_test_serialize_vint_u32(power_of_128 - 1u32);
aux_test_serialize_vint_u32(power_of_128);
aux_test_serialize_vint_u32(power_of_128 + 1u32);
}
aux_test_serialize_vint_u32(u32::max_value());
}
}

View File

@@ -64,7 +64,7 @@ impl Executor {
// This is important as it makes it possible for the fruit_receiver iteration to
// terminate.
};
// This is lame, but safe.
// This is lame, but it does not use unsafe code.
let mut results_with_position = Vec::with_capacity(num_fruits);
for (pos, fruit_res) in fruit_receiver {
let fruit = fruit_res?;
@@ -123,14 +123,15 @@ mod tests {
}
}
#[test]
fn test_map_multithread() {
let result: Vec<usize> = Executor::multi_thread(3, "search-test")
.map(|i| Ok(i * 2), 0..10)
.unwrap();
assert_eq!(result.len(), 10);
for i in 0..10 {
assert_eq!(result[i], i * 2);
}
}
#[test]
fn test_map_multithread() {
let result: Vec<usize> = Executor::multi_thread(3, "search-test")
.map(|i| Ok(i * 2), 0..10)
.unwrap();
assert_eq!(result.len(), 10);
for i in 0..10 {
assert_eq!(result[i], i * 2);
}
}

View File

@@ -1,31 +1,33 @@
use super::pool::LeasedItem;
use super::pool::Pool;
use super::segment::create_segment;
use super::segment::Segment;
use core::searcher::Searcher;
use core::Executor;
use core::IndexMeta;
use core::SegmentId;
use core::SegmentMeta;
use core::SegmentReader;
use core::META_FILEPATH;
use directory::ManagedDirectory;
#[cfg(feature = "mmap")]
use directory::MmapDirectory;
use directory::INDEX_WRITER_LOCK;
use directory::{Directory, RAMDirectory};
use error::DataCorruption;
use error::TantivyError;
use indexer::index_writer::open_index_writer;
use indexer::index_writer::HEAP_SIZE_MIN;
use indexer::segment_updater::save_new_metas;
use indexer::LockType;
use num_cpus;
use reader::IndexReader;
use reader::IndexReaderBuilder;
use schema::Field;
use schema::FieldType;
use schema::Schema;
use serde_json;
use std::borrow::BorrowMut;
use std::fmt;
#[cfg(feature = "mmap")]
use std::path::Path;
use std::sync::atomic::{AtomicUsize, Ordering};
use std::sync::Arc;
use tokenizer::BoxedTokenizer;
use tokenizer::TokenizerManager;
@@ -46,10 +48,11 @@ fn load_metas(directory: &Directory) -> Result<IndexMeta> {
}
/// Search Index
#[derive(Clone)]
pub struct Index {
directory: ManagedDirectory,
schema: Schema,
num_searchers: Arc<AtomicUsize>,
searcher_pool: Arc<Pool<Searcher>>,
executor: Arc<Executor>,
tokenizers: TokenizerManager,
}
@@ -107,6 +110,7 @@ impl Index {
}
/// Opens or creates a new index in the provided directory
#[cfg(feature = "mmap")]
pub fn open_or_create<Dir: Directory>(dir: Dir, schema: Schema) -> Result<Index> {
if Index::exists(&dir) {
let index = Index::open(dir)?;
@@ -146,7 +150,7 @@ impl Index {
///
/// This will overwrite existing meta.json
fn from_directory(mut directory: ManagedDirectory, schema: Schema) -> Result<Index> {
save_new_metas(schema.clone(), directory.borrow_mut())?;
save_new_metas(schema.clone(), directory.borrow_mut())?;
let metas = IndexMeta::with_schema(schema);
Index::create_from_metas(directory, &metas)
}
@@ -154,12 +158,16 @@ impl Index {
/// Creates a new index given a directory and an `IndexMeta`.
fn create_from_metas(directory: ManagedDirectory, metas: &IndexMeta) -> Result<Index> {
let schema = metas.schema.clone();
let n_cpus = num_cpus::get();
let index = Index {
directory,
schema,
num_searchers: Arc::new(AtomicUsize::new(n_cpus)),
searcher_pool: Arc::new(Pool::new()),
tokenizers: TokenizerManager::default(),
executor: Arc::new(Executor::single_thread()),
};
index.load_searchers()?;
Ok(index)
}
@@ -189,22 +197,6 @@ impl Index {
}
}
/// Create a default `IndexReader` for the given index.
///
/// See [`Index.reader_builder()`](#method.reader_builder).
pub fn reader(&self) -> Result<IndexReader> {
self.reader_builder().try_into()
}
/// Create a `IndexReader` for the given index.
///
/// Most project should create at most one reader for a given index.
/// This method is typically called only once per `Index` instance,
/// over the lifetime of most problem.
pub fn reader_builder(&self) -> IndexReaderBuilder {
IndexReaderBuilder::new(self.clone())
}
/// Opens a new directory from an index path.
#[cfg(feature = "mmap")]
pub fn open_in_dir<P: AsRef<Path>>(directory_path: P) -> Result<Index> {
@@ -240,8 +232,7 @@ impl Index {
/// Each thread will receive a budget of `overall_heap_size_in_bytes / num_threads`.
///
/// # Errors
/// If the lockfile already exists, returns `Error::DirectoryLockBusy` or an `Error::IOError`.
///
/// If the lockfile already exists, returns `Error::FileAlreadyExists`.
/// # Panics
/// If the heap size per thread is too small, panics.
pub fn writer_with_num_threads(
@@ -249,21 +240,7 @@ impl Index {
num_threads: usize,
overall_heap_size_in_bytes: usize,
) -> Result<IndexWriter> {
let directory_lock = self
.directory
.acquire_lock(&INDEX_WRITER_LOCK)
.map_err(|err| {
TantivyError::LockFailure(
err,
Some(
"Failed to acquire index lock. If you are using\
a regular directory, this means there is already an \
`IndexWriter` working on this `Directory`, in this process \
or in a different process."
.to_string(),
),
)
})?;
let directory_lock = LockType::IndexWriterLock.acquire_lock(&self.directory)?;
let heap_size_in_bytes_per_thread = overall_heap_size_in_bytes / num_threads;
open_index_writer(
self,
@@ -340,9 +317,56 @@ impl Index {
Ok(self
.searchable_segment_metas()?
.iter()
.map(SegmentMeta::id)
.map(|segment_meta| segment_meta.id())
.collect())
}
/// Sets the number of searchers to use
///
/// Only works after the next call to `load_searchers`
pub fn set_num_searchers(&mut self, num_searchers: usize) {
self.num_searchers.store(num_searchers, Ordering::Release);
}
/// Update searchers so that they reflect the state of the last
/// `.commit()`.
///
/// If indexing happens in the same process as searching,
/// you most likely want to call `.load_searchers()` right after each
/// successful call to `.commit()`.
///
/// If indexing and searching happen in different processes, the way to
/// get the freshest `index` at all time, is to watch `meta.json` and
/// call `load_searchers` whenever a changes happen.
pub fn load_searchers(&self) -> Result<()> {
let _meta_lock = LockType::MetaLock.acquire_lock(self.directory())?;
let searchable_segments = self.searchable_segments()?;
let segment_readers: Vec<SegmentReader> = searchable_segments
.iter()
.map(SegmentReader::open)
.collect::<Result<_>>()?;
let schema = self.schema();
let num_searchers: usize = self.num_searchers.load(Ordering::Acquire);
let searchers = (0..num_searchers)
.map(|_| Searcher::new(schema.clone(), self.clone(), segment_readers.clone()))
.collect();
self.searcher_pool.publish_new_generation(searchers);
Ok(())
}
/// Returns a searcher
///
/// This method should be called every single time a search
/// query is performed.
/// The searchers are taken from a pool of `num_searchers` searchers.
/// If no searcher is available
/// this may block.
///
/// The same searcher must be used for a given query, as it ensures
/// the use of a consistent segment set.
pub fn searcher(&self) -> LeasedItem<Searcher> {
self.searcher_pool.acquire()
}
}
impl fmt::Debug for Index {
@@ -351,22 +375,29 @@ impl fmt::Debug for Index {
}
}
impl Clone for Index {
fn clone(&self) -> Index {
Index {
directory: self.directory.clone(),
schema: self.schema.clone(),
num_searchers: Arc::clone(&self.num_searchers),
searcher_pool: Arc::clone(&self.searcher_pool),
tokenizers: self.tokenizers.clone(),
executor: self.executor.clone(),
}
}
}
#[cfg(test)]
mod tests {
use directory::RAMDirectory;
use schema::Field;
use schema::{Schema, INDEXED, TEXT};
use std::thread;
use std::time::Duration;
use schema::{Schema, INT_INDEXED, TEXT};
use Index;
use IndexReader;
use IndexWriter;
use ReloadPolicy;
#[test]
fn test_indexer_for_field() {
let mut schema_builder = Schema::builder();
let num_likes_field = schema_builder.add_u64_field("num_likes", INDEXED);
let num_likes_field = schema_builder.add_u64_field("num_likes", INT_INDEXED);
let body_field = schema_builder.add_text_field("body", TEXT);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
@@ -424,117 +455,7 @@ mod tests {
fn throw_away_schema() -> Schema {
let mut schema_builder = Schema::builder();
let _ = schema_builder.add_u64_field("num_likes", INDEXED);
let _ = schema_builder.add_u64_field("num_likes", INT_INDEXED);
schema_builder.build()
}
#[test]
fn test_index_on_commit_reload_policy() {
let schema = throw_away_schema();
let field = schema.get_field("num_likes").unwrap();
let index = Index::create_in_ram(schema);
let reader = index
.reader_builder()
.reload_policy(ReloadPolicy::OnCommit)
.try_into()
.unwrap();
assert_eq!(reader.searcher().num_docs(), 0);
let mut writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
test_index_on_commit_reload_policy_aux(field, &mut writer, &reader);
}
#[cfg(feature = "mmap")]
mod mmap_specific {
use super::*;
use std::path::PathBuf;
use tempdir::TempDir;
#[test]
fn test_index_on_commit_reload_policy_mmap() {
let schema = throw_away_schema();
let field = schema.get_field("num_likes").unwrap();
let tempdir = TempDir::new("index").unwrap();
let tempdir_path = PathBuf::from(tempdir.path());
let index = Index::create_in_dir(&tempdir_path, schema).unwrap();
let mut writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
writer.commit().unwrap();
let reader = index
.reader_builder()
.reload_policy(ReloadPolicy::OnCommit)
.try_into()
.unwrap();
assert_eq!(reader.searcher().num_docs(), 0);
test_index_on_commit_reload_policy_aux(field, &mut writer, &reader);
}
#[test]
fn test_index_manual_policy_mmap() {
let schema = throw_away_schema();
let field = schema.get_field("num_likes").unwrap();
let index = Index::create_from_tempdir(schema).unwrap();
let mut writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
writer.commit().unwrap();
let reader = index
.reader_builder()
.reload_policy(ReloadPolicy::Manual)
.try_into()
.unwrap();
assert_eq!(reader.searcher().num_docs(), 0);
writer.add_document(doc!(field=>1u64));
writer.commit().unwrap();
thread::sleep(Duration::from_millis(500));
assert_eq!(reader.searcher().num_docs(), 0);
reader.reload().unwrap();
assert_eq!(reader.searcher().num_docs(), 1);
}
#[test]
fn test_index_on_commit_reload_policy_different_directories() {
let schema = throw_away_schema();
let field = schema.get_field("num_likes").unwrap();
let tempdir = TempDir::new("index").unwrap();
let tempdir_path = PathBuf::from(tempdir.path());
let write_index = Index::create_in_dir(&tempdir_path, schema).unwrap();
let read_index = Index::open_in_dir(&tempdir_path).unwrap();
let reader = read_index
.reader_builder()
.reload_policy(ReloadPolicy::OnCommit)
.try_into()
.unwrap();
assert_eq!(reader.searcher().num_docs(), 0);
let mut writer = write_index.writer_with_num_threads(1, 3_000_000).unwrap();
test_index_on_commit_reload_policy_aux(field, &mut writer, &reader);
}
}
fn test_index_on_commit_reload_policy_aux(
field: Field,
writer: &mut IndexWriter,
reader: &IndexReader,
) {
assert_eq!(reader.searcher().num_docs(), 0);
writer.add_document(doc!(field=>1u64));
writer.commit().unwrap();
let mut count = 0;
for _ in 0..100 {
count = reader.searcher().num_docs();
if count > 0 {
break;
}
thread::sleep(Duration::from_millis(100));
}
assert_eq!(count, 1);
writer.add_document(doc!(field=>2u64));
writer.commit().unwrap();
let mut count = 0;
for _ in 0..10 {
count = reader.searcher().num_docs();
if count > 1 {
break;
}
thread::sleep(Duration::from_millis(100));
}
assert_eq!(count, 2);
}
}

View File

@@ -2,7 +2,6 @@ use core::SegmentMeta;
use schema::Schema;
use serde_json;
use std::fmt;
use Opstamp;
/// Meta information about the `Index`.
///
@@ -16,7 +15,7 @@ use Opstamp;
pub struct IndexMeta {
pub segments: Vec<SegmentMeta>,
pub schema: Schema,
pub opstamp: Opstamp,
pub opstamp: u64,
#[serde(skip_serializing_if = "Option::is_none")]
pub payload: Option<String>,
}

View File

@@ -2,6 +2,7 @@ mod executor;
pub mod index;
mod index_meta;
mod inverted_index_reader;
mod pool;
pub mod searcher;
mod segment;
mod segment_component;
@@ -24,7 +25,6 @@ pub use self::segment_reader::SegmentReader;
use std::path::PathBuf;
lazy_static! {
/// The meta file contains all the information about the list of segments and the schema
/// of the index.
pub static ref META_FILEPATH: PathBuf = PathBuf::from("meta.json");

View File

@@ -1,5 +1,5 @@
use crossbeam::crossbeam_channel::unbounded;
use crossbeam::{Receiver, RecvError, Sender};
use crossbeam::queue::MsQueue;
use std::mem;
use std::ops::{Deref, DerefMut};
use std::sync::atomic::AtomicUsize;
use std::sync::atomic::Ordering;
@@ -10,52 +10,15 @@ pub struct GenerationItem<T> {
item: T,
}
/// Queue implementation for the Object Pool below
/// Uses the unbounded Linked-List type queue from crossbeam-channel
/// Splits the Queue into sender and receiver
struct Queue<T> {
sender: Sender<T>,
receiver: Receiver<T>,
}
impl<T> Queue<T> {
fn new() -> Self {
let (s, r) = unbounded();
Queue {
sender: s,
receiver: r,
}
}
/// Sender trait returns a Result type, which is ignored.
/// The Result is not handled at the moment
fn push(&self, elem: T) {
self.sender
.send(elem)
.expect("Sending an item to crossbeam-queue shouldn't fail");
}
/// Relies on the underlying crossbeam-channel Receiver
/// to block on empty queue
fn pop(&self) -> Result<T, RecvError> {
self.receiver.recv()
}
}
/// An object pool
///
/// This is used in tantivy to create a pool of `Searcher`.
/// Object are wrapped in a `LeasedItem` wrapper and are
/// released automatically back into the pool on `Drop`.
pub struct Pool<T> {
queue: Arc<Queue<GenerationItem<T>>>,
queue: Arc<MsQueue<GenerationItem<T>>>,
freshest_generation: AtomicUsize,
next_generation: AtomicUsize,
}
impl<T> Pool<T> {
pub fn new() -> Pool<T> {
let queue = Arc::new(Queue::new());
let queue = Arc::new(MsQueue::new());
Pool {
queue,
freshest_generation: AtomicUsize::default(),
@@ -63,10 +26,6 @@ impl<T> Pool<T> {
}
}
/// Publishes a new generation of `Searcher`.
///
/// After publish, all new `Searcher` acquired will be
/// of the new generation.
pub fn publish_new_generation(&self, items: Vec<T>) {
let next_generation = self.next_generation.fetch_add(1, Ordering::SeqCst) + 1;
for item in items {
@@ -102,14 +61,10 @@ impl<T> Pool<T> {
self.freshest_generation.load(Ordering::Acquire)
}
/// Acquires a new searcher.
///
/// If no searcher is available, this methods block until
/// a searcher is released.
pub fn acquire(&self) -> LeasedItem<T> {
let generation = self.generation();
loop {
let gen_item = self.queue.pop().unwrap();
let gen_item = self.queue.pop();
if gen_item.generation >= generation {
return LeasedItem {
gen_item: Some(gen_item),
@@ -125,7 +80,7 @@ impl<T> Pool<T> {
pub struct LeasedItem<T> {
gen_item: Option<GenerationItem<T>>,
recycle_queue: Arc<Queue<GenerationItem<T>>>,
recycle_queue: Arc<MsQueue<GenerationItem<T>>>,
}
impl<T> Deref for LeasedItem<T> {
@@ -152,9 +107,9 @@ impl<T> DerefMut for LeasedItem<T> {
impl<T> Drop for LeasedItem<T> {
fn drop(&mut self) {
if let Some(gen_item) = self.gen_item.take() {
self.recycle_queue.push(gen_item);
}
let gen_item: GenerationItem<T> = mem::replace(&mut self.gen_item, None)
.expect("Unwrapping a leased item should never fail");
self.recycle_queue.push(gen_item);
}
}
@@ -162,7 +117,6 @@ impl<T> Drop for LeasedItem<T> {
mod tests {
use super::Pool;
use super::Queue;
use std::iter;
#[test]
@@ -179,47 +133,4 @@ mod tests {
assert_eq!(*pool.acquire(), 11);
}
}
#[test]
fn test_queue() {
let q = Queue::new();
let elem = 5;
q.push(elem);
let res = q.pop();
assert_eq!(res.unwrap(), elem);
}
#[test]
fn test_pool_dont_panic_on_empty_pop() {
// When the object pool is exhausted, it shouldn't panic on pop()
use std::sync::Arc;
use std::{thread, time};
// Wrap the pool in an Arc, same way as its used in `core/index.rs`
let pool = Arc::new(Pool::new());
// clone pools outside the move scope of each new thread
let pool1 = Arc::clone(&pool);
let pool2 = Arc::clone(&pool);
let elements_for_pool = vec![1, 2];
pool.publish_new_generation(elements_for_pool);
let mut threads = vec![];
let sleep_dur = time::Duration::from_millis(10);
// spawn one more thread than there are elements in the pool
threads.push(thread::spawn(move || {
// leasing to make sure it's not dropped before sleep is called
let _leased_searcher = &pool.acquire();
thread::sleep(sleep_dur);
}));
threads.push(thread::spawn(move || {
// leasing to make sure it's not dropped before sleep is called
let _leased_searcher = &pool1.acquire();
thread::sleep(sleep_dur);
}));
threads.push(thread::spawn(move || {
// leasing to make sure it's not dropped before sleep is called
let _leased_searcher = &pool2.acquire();
thread::sleep(sleep_dur);
}));
}
}

View File

@@ -59,7 +59,7 @@ impl Searcher {
) -> Searcher {
let store_readers = segment_readers
.iter()
.map(SegmentReader::get_store_reader)
.map(|segment_reader| segment_reader.get_store_reader())
.collect();
Searcher {
schema,
@@ -218,7 +218,7 @@ impl fmt::Debug for Searcher {
let segment_ids = self
.segment_readers
.iter()
.map(SegmentReader::segment_id)
.map(|segment_reader| segment_reader.segment_id())
.collect::<Vec<_>>();
write!(f, "Searcher({:?})", segment_ids)
}

View File

@@ -10,7 +10,6 @@ use schema::Schema;
use std::fmt;
use std::path::PathBuf;
use std::result;
use Opstamp;
use Result;
/// A segment is a piece of the index.
@@ -51,7 +50,7 @@ impl Segment {
}
#[doc(hidden)]
pub fn with_delete_meta(self, num_deleted_docs: u32, opstamp: Opstamp) -> Segment {
pub fn with_delete_meta(self, num_deleted_docs: u32, opstamp: u64) -> Segment {
Segment {
index: self.index,
meta: self.meta.with_delete_meta(num_deleted_docs, opstamp),

View File

@@ -41,6 +41,6 @@ impl SegmentComponent {
SegmentComponent::STORE,
SegmentComponent::DELETE,
];
SEGMENT_COMPONENTS.iter()
SEGMENT_COMPONENTS.into_iter()
}
}

View File

@@ -19,7 +19,7 @@ pub struct SegmentId(Uuid);
#[cfg(test)]
lazy_static! {
static ref AUTO_INC_COUNTER: atomic::AtomicUsize = atomic::AtomicUsize::default();
static ref ZERO_ARRAY: [u8; 8] = [0u8; 8];
static ref EMPTY_ARR: [u8; 8] = [0u8; 8];
}
// During tests, we generate the segment id in a autoincrement manner
@@ -30,7 +30,7 @@ lazy_static! {
#[cfg(test)]
fn create_uuid() -> Uuid {
let new_auto_inc_id = (*AUTO_INC_COUNTER).fetch_add(1, atomic::Ordering::SeqCst);
Uuid::from_fields(new_auto_inc_id as u32, 0, 0, &*ZERO_ARRAY).unwrap()
Uuid::from_fields(new_auto_inc_id as u32, 0, 0, &*EMPTY_ARR).unwrap()
}
#[cfg(not(test))]

View File

@@ -5,7 +5,6 @@ use serde;
use std::collections::HashSet;
use std::fmt;
use std::path::PathBuf;
use Opstamp;
lazy_static! {
static ref INVENTORY: Inventory<InnerSegmentMeta> = { Inventory::new() };
@@ -14,7 +13,7 @@ lazy_static! {
#[derive(Clone, Debug, Serialize, Deserialize)]
struct DeleteMeta {
num_deleted_docs: u32,
opstamp: Opstamp,
opstamp: u64,
}
/// `SegmentMeta` contains simple meta information about a segment.
@@ -137,9 +136,9 @@ impl SegmentMeta {
self.max_doc() - self.num_deleted_docs()
}
/// Returns the `Opstamp` of the last delete operation
/// Returns the opstamp of the last delete operation
/// taken in account in this segment.
pub fn delete_opstamp(&self) -> Option<Opstamp> {
pub fn delete_opstamp(&self) -> Option<u64> {
self.tracked
.deletes
.as_ref()
@@ -153,7 +152,7 @@ impl SegmentMeta {
}
#[doc(hidden)]
pub fn with_delete_meta(self, num_deleted_docs: u32, opstamp: Opstamp) -> SegmentMeta {
pub fn with_delete_meta(self, num_deleted_docs: u32, opstamp: u64) -> SegmentMeta {
let delete_meta = DeleteMeta {
num_deleted_docs,
opstamp,

View File

@@ -5,10 +5,14 @@ use core::Segment;
use core::SegmentComponent;
use core::SegmentId;
use directory::ReadOnlySource;
use error::TantivyError;
use fastfield::DeleteBitSet;
use fastfield::FacetReader;
use fastfield::FastFieldReaders;
use fastfield::FastFieldReader;
use fastfield::{self, FastFieldNotAvailableError};
use fastfield::{BytesFastFieldReader, FastValue, MultiValueIntFastFieldReader};
use fieldnorm::FieldNormReader;
use schema::Cardinality;
use schema::Field;
use schema::FieldType;
use schema::Schema;
@@ -47,7 +51,7 @@ pub struct SegmentReader {
postings_composite: CompositeFile,
positions_composite: CompositeFile,
positions_idx_composite: CompositeFile,
fast_fields_readers: Arc<FastFieldReaders>,
fast_fields_composite: CompositeFile,
fieldnorms_composite: CompositeFile,
store_source: ReadOnlySource,
@@ -101,21 +105,93 @@ impl SegmentReader {
///
/// # Panics
/// May panic if the index is corrupted.
pub fn fast_fields(&self) -> &FastFieldReaders {
&self.fast_fields_readers
pub fn fast_field_reader<Item: FastValue>(
&self,
field: Field,
) -> fastfield::Result<FastFieldReader<Item>> {
let field_entry = self.schema.get_field_entry(field);
if Item::fast_field_cardinality(field_entry.field_type()) == Some(Cardinality::SingleValue)
{
self.fast_fields_composite
.open_read(field)
.ok_or_else(|| FastFieldNotAvailableError::new(field_entry))
.map(FastFieldReader::open)
} else {
Err(FastFieldNotAvailableError::new(field_entry))
}
}
pub(crate) fn fast_field_reader_with_idx<Item: FastValue>(
&self,
field: Field,
idx: usize,
) -> fastfield::Result<FastFieldReader<Item>> {
if let Some(ff_source) = self.fast_fields_composite.open_read_with_idx(field, idx) {
Ok(FastFieldReader::open(ff_source))
} else {
let field_entry = self.schema.get_field_entry(field);
Err(FastFieldNotAvailableError::new(field_entry))
}
}
/// Accessor to the `MultiValueIntFastFieldReader` associated to a given `Field`.
/// May panick if the field is not a multivalued fastfield of the type `Item`.
pub fn multi_fast_field_reader<Item: FastValue>(
&self,
field: Field,
) -> fastfield::Result<MultiValueIntFastFieldReader<Item>> {
let field_entry = self.schema.get_field_entry(field);
if Item::fast_field_cardinality(field_entry.field_type()) == Some(Cardinality::MultiValues)
{
let idx_reader = self.fast_field_reader_with_idx(field, 0)?;
let vals_reader = self.fast_field_reader_with_idx(field, 1)?;
Ok(MultiValueIntFastFieldReader::open(idx_reader, vals_reader))
} else {
Err(FastFieldNotAvailableError::new(field_entry))
}
}
/// Accessor to the `BytesFastFieldReader` associated to a given `Field`.
pub fn bytes_fast_field_reader(&self, field: Field) -> fastfield::Result<BytesFastFieldReader> {
let field_entry = self.schema.get_field_entry(field);
match *field_entry.field_type() {
FieldType::Bytes => {}
_ => return Err(FastFieldNotAvailableError::new(field_entry)),
}
let idx_reader = self
.fast_fields_composite
.open_read_with_idx(field, 0)
.ok_or_else(|| FastFieldNotAvailableError::new(field_entry))
.map(FastFieldReader::open)?;
let values = self
.fast_fields_composite
.open_read_with_idx(field, 1)
.ok_or_else(|| FastFieldNotAvailableError::new(field_entry))?;
Ok(BytesFastFieldReader::open(idx_reader, values))
}
/// Accessor to the `FacetReader` associated to a given `Field`.
pub fn facet_reader(&self, field: Field) -> Option<FacetReader> {
pub fn facet_reader(&self, field: Field) -> Result<FacetReader> {
let field_entry = self.schema.get_field_entry(field);
if field_entry.field_type() != &FieldType::HierarchicalFacet {
return None;
return Err(TantivyError::InvalidArgument(format!(
"The field {:?} is not a \
hierarchical facet.",
field_entry
)));
}
let term_ords_reader = self.fast_fields().u64s(field)?;
let termdict_source = self.termdict_composite.open_read(field)?;
let term_ords_reader = self.multi_fast_field_reader(field)?;
let termdict_source = self.termdict_composite.open_read(field).ok_or_else(|| {
TantivyError::InvalidArgument(format!(
"The field \"{}\" is a hierarchical \
but this segment does not seem to have the field term \
dictionary.",
field_entry.name()
))
})?;
let termdict = TermDictionary::from_source(&termdict_source);
let facet_reader = FacetReader::new(term_ords_reader, termdict);
Some(facet_reader)
Ok(facet_reader)
}
/// Accessor to the segment's `Field norms`'s reader.
@@ -171,12 +247,8 @@ impl SegmentReader {
}
};
let schema = segment.schema();
let fast_fields_data = segment.open_read(SegmentComponent::FASTFIELDS)?;
let fast_fields_composite = CompositeFile::open(&fast_fields_data)?;
let fast_field_readers =
Arc::new(FastFieldReaders::load_all(&schema, &fast_fields_composite)?);
let fieldnorms_data = segment.open_read(SegmentComponent::FIELDNORMS)?;
let fieldnorms_composite = CompositeFile::open(&fieldnorms_data)?;
@@ -188,13 +260,14 @@ impl SegmentReader {
None
};
let schema = segment.schema();
Ok(SegmentReader {
inv_idx_reader_cache: Arc::new(RwLock::new(HashMap::new())),
max_doc: segment.meta().max_doc(),
num_docs: segment.meta().num_docs(),
termdict_composite,
postings_composite,
fast_fields_readers: fast_field_readers,
fast_fields_composite,
fieldnorms_composite,
segment_id: segment.id(),
store_source,
@@ -308,12 +381,12 @@ impl SegmentReader {
self.postings_composite.space_usage(),
self.positions_composite.space_usage(),
self.positions_idx_composite.space_usage(),
self.fast_fields_readers.space_usage(),
self.fast_fields_composite.space_usage(),
self.fieldnorms_composite.space_usage(),
self.get_store_reader().space_usage(),
self.delete_bitset_opt
.as_ref()
.map(DeleteBitSet::space_usage)
.map(|x| x.space_usage())
.unwrap_or(0),
)
}
@@ -404,7 +477,9 @@ mod test {
// ok, now we should have a deleted doc
index_writer2.commit().unwrap();
}
let searcher = index.reader().unwrap().searcher();
index.load_searchers().unwrap();
let searcher = index.searcher();
let docs: Vec<DocId> = searcher.segment_reader(0).doc_ids_alive().collect();
assert_eq!(vec![0u32, 2u32], docs);
}

View File

@@ -1,104 +1,11 @@
use directory::directory_lock::Lock;
use directory::error::LockError;
use directory::error::{DeleteError, OpenReadError, OpenWriteError};
use directory::WatchCallback;
use directory::WatchHandle;
use directory::{ReadOnlySource, WritePtr};
use std::fmt;
use std::io;
use std::io::Write;
use std::marker::Send;
use std::marker::Sync;
use std::path::Path;
use std::path::PathBuf;
use std::result;
use std::thread;
use std::time::Duration;
/// Retry the logic of acquiring locks is pretty simple.
/// We just retry `n` times after a given `duratio`, both
/// depending on the type of lock.
struct RetryPolicy {
num_retries: usize,
wait_in_ms: u64,
}
impl RetryPolicy {
fn no_retry() -> RetryPolicy {
RetryPolicy {
num_retries: 0,
wait_in_ms: 0,
}
}
fn wait_and_retry(&mut self) -> bool {
if self.num_retries == 0 {
false
} else {
self.num_retries -= 1;
let wait_duration = Duration::from_millis(self.wait_in_ms);
thread::sleep(wait_duration);
true
}
}
}
/// The `DirectoryLock` is an object that represents a file lock.
/// See [`LockType`](struct.LockType.html)
///
/// It is transparently associated to a lock file, that gets deleted
/// on `Drop.` The lock is released automatically on `Drop`.
pub struct DirectoryLock(Box<Drop + Send + Sync + 'static>);
struct DirectoryLockGuard {
directory: Box<Directory>,
path: PathBuf,
}
impl<T: Drop + Send + Sync + 'static> From<Box<T>> for DirectoryLock {
fn from(underlying: Box<T>) -> Self {
DirectoryLock(underlying)
}
}
impl Drop for DirectoryLockGuard {
fn drop(&mut self) {
if let Err(e) = self.directory.delete(&*self.path) {
error!("Failed to remove the lock file. {:?}", e);
}
}
}
enum TryAcquireLockError {
FileExists,
IOError(io::Error),
}
fn try_acquire_lock(
filepath: &Path,
directory: &mut Directory,
) -> Result<DirectoryLock, TryAcquireLockError> {
let mut write = directory.open_write(filepath).map_err(|e| match e {
OpenWriteError::FileAlreadyExists(_) => TryAcquireLockError::FileExists,
OpenWriteError::IOError(io_error) => TryAcquireLockError::IOError(io_error.into()),
})?;
write.flush().map_err(TryAcquireLockError::IOError)?;
Ok(DirectoryLock::from(Box::new(DirectoryLockGuard {
directory: directory.box_clone(),
path: filepath.to_owned(),
})))
}
fn retry_policy(is_blocking: bool) -> RetryPolicy {
if is_blocking {
RetryPolicy {
num_retries: 100,
wait_in_ms: 100,
}
} else {
RetryPolicy::no_retry()
}
}
/// Write-once read many (WORM) abstraction for where
/// tantivy's data should be stored.
@@ -166,45 +73,6 @@ pub trait Directory: DirectoryClone + fmt::Debug + Send + Sync + 'static {
///
/// The file may or may not previously exist.
fn atomic_write(&mut self, path: &Path, data: &[u8]) -> io::Result<()>;
/// Acquire a lock in the given directory.
///
/// The method is blocking or not depending on the `Lock` object.
fn acquire_lock(&self, lock: &Lock) -> Result<DirectoryLock, LockError> {
let mut box_directory = self.box_clone();
let mut retry_policy = retry_policy(lock.is_blocking);
loop {
match try_acquire_lock(&lock.filepath, &mut *box_directory) {
Ok(result) => {
return Ok(result);
}
Err(TryAcquireLockError::FileExists) => {
if !retry_policy.wait_and_retry() {
return Err(LockError::LockBusy);
}
}
Err(TryAcquireLockError::IOError(io_error)) => {
return Err(LockError::IOError(io_error));
}
}
}
}
/// Registers a callback that will be called whenever a change on the `meta.json`
/// using the `atomic_write` API is detected.
///
/// The behavior when using `.watch()` on a file using `.open_write(...)` is, on the other
/// hand, undefined.
///
/// The file will be watched for the lifetime of the returned `WatchHandle`. The caller is
/// required to keep it.
/// It does not override previous callbacks. When the file is modified, all callback that are
/// registered (and whose `WatchHandle` is still alive) are triggered.
///
/// Internally, tantivy only uses this API to detect new commits to implement the
/// `OnCommit` `ReloadPolicy`. Not implementing watch in a `Directory` only prevents the
/// `OnCommit` `ReloadPolicy` to work properly.
fn watch(&self, watch_callback: WatchCallback) -> WatchHandle;
}
/// DirectoryClone

View File

@@ -1,56 +0,0 @@
use std::path::PathBuf;
/// A directory lock.
///
/// A lock is associated to a specific path and some
/// [`LockParams`](./enum.LockParams.html).
/// Tantivy itself uses only two locks but client application
/// can use the directory facility to define their own locks.
/// - [INDEX_WRITER_LOCK](./struct.INDEX_WRITER_LOCK.html)
/// - [META_LOCK](./struct.META_LOCK.html)
///
/// Check out these locks documentation for more information.
///
#[derive(Debug)]
pub struct Lock {
/// The lock needs to be associated with its own file `path`.
/// Depending on the platform, the lock might rely on the creation
/// and deletion of this filepath.
pub filepath: PathBuf,
/// `lock_params` describes whether acquiring the lock is meant
/// to be a blocking operation or a non-blocking.
///
/// Acquiring a blocking lock blocks until the lock is
/// available.
/// Acquiring a blocking lock returns rapidly, either successfully
/// or with an error signifying that someone is already holding
/// the lock.
pub is_blocking: bool,
}
lazy_static! {
/// Only one process should be able to write tantivy's index at a time.
/// This lock file, when present, is in charge of preventing other processes to open an IndexWriter.
///
/// If the process is killed and this file remains, it is safe to remove it manually.
///
/// Failing to acquire this lock usually means a misuse of tantivy's API,
/// (creating more than one instance of the `IndexWriter`), are a spurious
/// lock file remaining after a crash. In the latter case, removing the file after
/// checking no process running tantivy is running is safe.
pub static ref INDEX_WRITER_LOCK: Lock = Lock {
filepath: PathBuf::from(".tantivy-writer.lock"),
is_blocking: false
};
/// The meta lock file is here to protect the segment files being opened by
/// `IndexReader::reload()` from being garbage collected.
/// It makes it possible for another process to safely consume
/// our index in-writing. Ideally, we may have prefered `RWLock` semantics
/// here, but it is difficult to achieve on Windows.
///
/// Opening segment readers is a very fast process.
pub static ref META_LOCK: Lock = Lock {
filepath: PathBuf::from(".tantivy-meta.lock"),
is_blocking: true
};
}

View File

@@ -3,22 +3,6 @@ use std::fmt;
use std::io;
use std::path::PathBuf;
/// Error while trying to acquire a directory lock.
#[derive(Debug, Fail)]
pub enum LockError {
/// Failed to acquired a lock as it is already hold by another
/// client.
/// - In the context of a blocking lock, this means the lock was not released within some `timeout` period.
/// - In the context of a non-blocking lock, this means the lock was busy at the moment of the call.
#[fail(
display = "Could not acquire lock as it is already held, possibly by a different process."
)]
LockBusy,
/// Trying to acquire a lock failed with an `IOError`
#[fail(display = "Failed to acquire the lock due to an io:Error.")]
IOError(io::Error),
}
/// General IO error with an optional path to the offending file.
#[derive(Debug)]
pub struct IOError {
@@ -26,12 +10,6 @@ pub struct IOError {
err: io::Error,
}
impl Into<io::Error> for IOError {
fn into(self) -> io::Error {
self.err
}
}
impl fmt::Display for IOError {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
match self.path {
@@ -73,14 +51,6 @@ pub enum OpenDirectoryError {
DoesNotExist(PathBuf),
/// The path exists but is not a directory.
NotADirectory(PathBuf),
/// IoError
IoError(io::Error),
}
impl From<io::Error> for OpenDirectoryError {
fn from(io_err: io::Error) -> Self {
OpenDirectoryError::IoError(io_err)
}
}
impl fmt::Display for OpenDirectoryError {
@@ -92,11 +62,6 @@ impl fmt::Display for OpenDirectoryError {
OpenDirectoryError::NotADirectory(ref path) => {
write!(f, "the path '{:?}' exists but is not a directory", path)
}
OpenDirectoryError::IoError(ref err) => write!(
f,
"IOError while trying to open/create the directory. {:?}",
err
),
}
}
}

View File

@@ -1,11 +1,8 @@
use core::MANAGED_FILEPATH;
use directory::error::{DeleteError, IOError, LockError, OpenReadError, OpenWriteError};
use directory::DirectoryLock;
use directory::Lock;
use directory::META_LOCK;
use directory::error::{DeleteError, IOError, OpenReadError, OpenWriteError};
use directory::{ReadOnlySource, WritePtr};
use directory::{WatchCallback, WatchHandle};
use error::DataCorruption;
use indexer::LockType;
use serde_json;
use std::collections::HashSet;
use std::io;
@@ -95,9 +92,6 @@ impl ManagedDirectory {
///
/// * `living_files` - List of files that are still used by the index.
///
/// The use a callback ensures that the list of living_files is computed
/// while we hold the lock on meta.
///
/// This method does not panick nor returns errors.
/// If a file cannot be deleted (for permission reasons for instance)
/// an error is simply logged, and the file remains in the list of managed
@@ -128,7 +122,7 @@ impl ManagedDirectory {
// 2) writer change meta.json (for instance after a merge or a commit)
// 3) gc kicks in.
// 4) gc removes a file that was useful for process B, before process B opened it.
if let Ok(_meta_lock) = self.acquire_lock(&META_LOCK) {
if let Ok(_meta_lock) = LockType::MetaLock.acquire_lock(self) {
let living_files = get_living_files();
for managed_path in &meta_informations_rlock.managed_paths {
if !living_files.contains(managed_path) {
@@ -238,14 +232,6 @@ impl Directory for ManagedDirectory {
fn exists(&self, path: &Path) -> bool {
self.directory.exists(path)
}
fn acquire_lock(&self, lock: &Lock) -> result::Result<DirectoryLock, LockError> {
self.directory.acquire_lock(lock)
}
fn watch(&self, watch_callback: WatchCallback) -> WatchHandle {
self.directory.watch(watch_callback)
}
}
impl Clone for ManagedDirectory {
@@ -260,98 +246,95 @@ impl Clone for ManagedDirectory {
#[cfg(test)]
mod tests {
use super::*;
#[cfg(feature = "mmap")]
mod mmap_specific {
use directory::MmapDirectory;
use std::io::Write;
use std::path::Path;
use tempdir::TempDir;
use super::super::*;
use std::path::Path;
use tempdir::TempDir;
lazy_static! {
static ref TEST_PATH1: &'static Path = Path::new("some_path_for_test");
static ref TEST_PATH2: &'static Path = Path::new("some_path_for_test2");
}
use directory::MmapDirectory;
use std::io::Write;
#[test]
fn test_managed_directory() {
let tempdir = TempDir::new("index").unwrap();
let tempdir_path = PathBuf::from(tempdir.path());
{
let mmap_directory = MmapDirectory::open(&tempdir_path).unwrap();
let mut managed_directory = ManagedDirectory::wrap(mmap_directory).unwrap();
{
let mut write_file = managed_directory.open_write(*TEST_PATH1).unwrap();
write_file.flush().unwrap();
}
{
managed_directory
.atomic_write(*TEST_PATH2, &vec![0u8, 1u8])
.unwrap();
}
{
assert!(managed_directory.exists(*TEST_PATH1));
assert!(managed_directory.exists(*TEST_PATH2));
}
{
let living_files: HashSet<PathBuf> =
[TEST_PATH1.to_owned()].into_iter().cloned().collect();
managed_directory.garbage_collect(|| living_files);
}
{
assert!(managed_directory.exists(*TEST_PATH1));
assert!(!managed_directory.exists(*TEST_PATH2));
}
}
{
let mmap_directory = MmapDirectory::open(&tempdir_path).unwrap();
let mut managed_directory = ManagedDirectory::wrap(mmap_directory).unwrap();
{
assert!(managed_directory.exists(*TEST_PATH1));
assert!(!managed_directory.exists(*TEST_PATH2));
}
{
let living_files: HashSet<PathBuf> = HashSet::new();
managed_directory.garbage_collect(|| living_files);
}
{
assert!(!managed_directory.exists(*TEST_PATH1));
assert!(!managed_directory.exists(*TEST_PATH2));
}
}
}
#[test]
fn test_managed_directory_gc_while_mmapped() {
let tempdir = TempDir::new("index").unwrap();
let tempdir_path = PathBuf::from(tempdir.path());
let living_files = HashSet::new();
lazy_static! {
static ref TEST_PATH1: &'static Path = Path::new("some_path_for_test");
static ref TEST_PATH2: &'static Path = Path::new("some_path_for_test2");
}
#[test]
#[cfg(feature = "mmap")]
fn test_managed_directory() {
let tempdir = TempDir::new("index").unwrap();
let tempdir_path = PathBuf::from(tempdir.path());
{
let mmap_directory = MmapDirectory::open(&tempdir_path).unwrap();
let mut managed_directory = ManagedDirectory::wrap(mmap_directory).unwrap();
managed_directory
.atomic_write(*TEST_PATH1, &vec![0u8, 1u8])
.unwrap();
assert!(managed_directory.exists(*TEST_PATH1));
let _mmap_read = managed_directory.open_read(*TEST_PATH1).unwrap();
managed_directory.garbage_collect(|| living_files.clone());
if cfg!(target_os = "windows") {
// On Windows, gc should try and fail the file as it is mmapped.
{
let mut write_file = managed_directory.open_write(*TEST_PATH1).unwrap();
write_file.flush().unwrap();
}
{
managed_directory
.atomic_write(*TEST_PATH2, &vec![0u8, 1u8])
.unwrap();
}
{
assert!(managed_directory.exists(*TEST_PATH1));
// unmap should happen here.
drop(_mmap_read);
// The file should still be in the list of managed file and
// eventually be deleted once mmap is released.
assert!(managed_directory.exists(*TEST_PATH2));
}
{
let living_files: HashSet<PathBuf> =
[TEST_PATH1.to_owned()].into_iter().cloned().collect();
managed_directory.garbage_collect(|| living_files);
assert!(!managed_directory.exists(*TEST_PATH1));
} else {
assert!(!managed_directory.exists(*TEST_PATH1));
}
{
assert!(managed_directory.exists(*TEST_PATH1));
assert!(!managed_directory.exists(*TEST_PATH2));
}
}
{
let mmap_directory = MmapDirectory::open(&tempdir_path).unwrap();
let mut managed_directory = ManagedDirectory::wrap(mmap_directory).unwrap();
{
assert!(managed_directory.exists(*TEST_PATH1));
assert!(!managed_directory.exists(*TEST_PATH2));
}
{
let living_files: HashSet<PathBuf> = HashSet::new();
managed_directory.garbage_collect(|| living_files);
}
{
assert!(!managed_directory.exists(*TEST_PATH1));
assert!(!managed_directory.exists(*TEST_PATH2));
}
}
}
#[test]
#[cfg(feature = "mmap ")]
fn test_managed_directory_gc_while_mmapped() {
let tempdir = TempDir::new("index").unwrap();
let tempdir_path = PathBuf::from(tempdir.path());
let living_files = HashSet::new();
let mmap_directory = MmapDirectory::open(&tempdir_path).unwrap();
let mut managed_directory = ManagedDirectory::wrap(mmap_directory).unwrap();
managed_directory
.atomic_write(*TEST_PATH1, &vec![0u8, 1u8])
.unwrap();
assert!(managed_directory.exists(*TEST_PATH1));
let _mmap_read = managed_directory.open_read(*TEST_PATH1).unwrap();
managed_directory.garbage_collect(|| living_files.clone());
if cfg!(target_os = "windows") {
// On Windows, gc should try and fail the file as it is mmapped.
assert!(managed_directory.exists(*TEST_PATH1));
// unmap should happen here.
drop(_mmap_read);
// The file should still be in the list of managed file and
// eventually be deleted once mmap is released.
managed_directory.garbage_collect(|| living_files);
assert!(!managed_directory.exists(*TEST_PATH1));
} else {
assert!(!managed_directory.exists(*TEST_PATH1));
}
}
}

View File

@@ -1,24 +1,12 @@
extern crate fs2;
extern crate notify;
use self::fs2::FileExt;
use self::notify::RawEvent;
use self::notify::RecursiveMode;
use self::notify::Watcher;
use atomicwrites;
use core::META_FILEPATH;
use directory::error::LockError;
use common::make_io_err;
use directory::error::{DeleteError, IOError, OpenDirectoryError, OpenReadError, OpenWriteError};
use directory::read_only_source::BoxedData;
use directory::shared_vec_slice::SharedVecSlice;
use directory::Directory;
use directory::DirectoryLock;
use directory::Lock;
use directory::ReadOnlySource;
use directory::WatchCallback;
use directory::WatchCallbackList;
use directory::WatchHandle;
use directory::WritePtr;
use memmap::Mmap;
use fst::raw::MmapReadOnly;
use std::collections::hash_map::Entry as HashMapEntry;
use std::collections::HashMap;
use std::convert::From;
use std::fmt;
@@ -28,22 +16,14 @@ use std::io::{self, Seek, SeekFrom};
use std::io::{BufWriter, Read, Write};
use std::path::{Path, PathBuf};
use std::result;
use std::sync::mpsc::{channel, Receiver, Sender};
use std::sync::Arc;
use std::sync::Mutex;
use std::sync::RwLock;
use std::sync::Weak;
use std::thread;
use tempdir::TempDir;
/// Create a default io error given a string.
pub(crate) fn make_io_err(msg: String) -> io::Error {
io::Error::new(io::ErrorKind::Other, msg)
}
/// Returns None iff the file exists, can be read, but is empty (and hence
/// cannot be mmapped)
fn open_mmap(full_path: &Path) -> result::Result<Option<Mmap>, OpenReadError> {
/// cannot be mmapped).
///
fn open_mmap(full_path: &Path) -> result::Result<Option<MmapReadOnly>, OpenReadError> {
let file = File::open(full_path).map_err(|e| {
if e.kind() == io::ErrorKind::NotFound {
OpenReadError::FileDoesNotExist(full_path.to_owned())
@@ -62,7 +42,7 @@ fn open_mmap(full_path: &Path) -> result::Result<Option<Mmap>, OpenReadError> {
return Ok(None);
}
unsafe {
memmap::Mmap::map(&file)
MmapReadOnly::open(&file)
.map(Some)
.map_err(|e| From::from(IOError::with_path(full_path.to_owned(), e)))
}
@@ -85,7 +65,7 @@ pub struct CacheInfo {
struct MmapCache {
counters: CacheCounters,
cache: HashMap<PathBuf, Weak<BoxedData>>,
cache: HashMap<PathBuf, MmapReadOnly>,
}
impl Default for MmapCache {
@@ -98,7 +78,12 @@ impl Default for MmapCache {
}
impl MmapCache {
fn get_info(&self) -> CacheInfo {
/// Removes a `MmapReadOnly` entry from the mmap cache.
fn discard_from_cache(&mut self, full_path: &Path) -> bool {
self.cache.remove(full_path).is_some()
}
fn get_info(&mut self) -> CacheInfo {
let paths: Vec<PathBuf> = self.cache.keys().cloned().collect();
CacheInfo {
counters: self.counters.clone(),
@@ -106,105 +91,23 @@ impl MmapCache {
}
}
fn remove_weak_ref(&mut self) {
let keys_to_remove: Vec<PathBuf> = self
.cache
.iter()
.filter(|(_, mmap_weakref)| mmap_weakref.upgrade().is_none())
.map(|(key, _)| key.clone())
.collect();
for key in keys_to_remove {
self.cache.remove(&key);
}
}
// Returns None if the file exists but as a len of 0 (and hence is not mmappable).
fn get_mmap(&mut self, full_path: &Path) -> Result<Option<Arc<BoxedData>>, OpenReadError> {
if let Some(mmap_weak) = self.cache.get(full_path) {
if let Some(mmap_arc) = mmap_weak.upgrade() {
fn get_mmap(&mut self, full_path: &Path) -> Result<Option<MmapReadOnly>, OpenReadError> {
Ok(match self.cache.entry(full_path.to_owned()) {
HashMapEntry::Occupied(occupied_entry) => {
let mmap = occupied_entry.get();
self.counters.hit += 1;
return Ok(Some(mmap_arc));
Some(mmap.clone())
}
}
self.cache.remove(full_path);
self.counters.miss += 1;
Ok(if let Some(mmap) = open_mmap(full_path)? {
let mmap_arc: Arc<BoxedData> = Arc::new(Box::new(mmap));
let mmap_weak = Arc::downgrade(&mmap_arc);
self.cache.insert(full_path.to_owned(), mmap_weak);
Some(mmap_arc)
} else {
None
})
}
}
struct InnerWatcherWrapper {
_watcher: Mutex<notify::RecommendedWatcher>,
watcher_router: WatchCallbackList,
}
impl InnerWatcherWrapper {
pub fn new(path: &Path) -> Result<(Self, Receiver<notify::RawEvent>), notify::Error> {
let (tx, watcher_recv): (Sender<RawEvent>, Receiver<RawEvent>) = channel();
// We need to initialize the
let mut watcher = notify::raw_watcher(tx)?;
watcher.watch(path, RecursiveMode::Recursive)?;
let inner = InnerWatcherWrapper {
_watcher: Mutex::new(watcher),
watcher_router: Default::default(),
};
Ok((inner, watcher_recv))
}
}
#[derive(Clone)]
pub(crate) struct WatcherWrapper {
inner: Arc<InnerWatcherWrapper>,
}
impl WatcherWrapper {
pub fn new(path: &Path) -> Result<Self, OpenDirectoryError> {
let (inner, watcher_recv) = InnerWatcherWrapper::new(path).map_err(|err| match err {
notify::Error::PathNotFound => OpenDirectoryError::DoesNotExist(path.to_owned()),
_ => {
panic!("Unknown error while starting watching directory {:?}", path);
}
})?;
let watcher_wrapper = WatcherWrapper {
inner: Arc::new(inner),
};
let watcher_wrapper_clone = watcher_wrapper.clone();
thread::Builder::new()
.name("meta-file-watch-thread".to_string())
.spawn(move || {
loop {
match watcher_recv.recv().map(|evt| evt.path) {
Ok(Some(changed_path)) => {
// ... Actually subject to false positive.
// We might want to be more accurate than this at one point.
if let Some(filename) = changed_path.file_name() {
if filename == *META_FILEPATH {
watcher_wrapper_clone.inner.watcher_router.broadcast();
}
}
}
Ok(None) => {
// not an event we are interested in.
}
Err(_e) => {
// the watch send channel was dropped
break;
}
}
HashMapEntry::Vacant(vacant_entry) => {
self.counters.miss += 1;
if let Some(mmap) = open_mmap(full_path)? {
vacant_entry.insert(mmap.clone());
Some(mmap)
} else {
None
}
})
.expect("Failed to spawn thread to watch meta.json");
Ok(watcher_wrapper)
}
pub fn watch(&mut self, watch_callback: WatchCallback) -> WatchHandle {
self.inner.watcher_router.subscribe(watch_callback)
}
})
}
}
@@ -212,72 +115,33 @@ impl WatcherWrapper {
///
/// The Mmap object are cached to limit the
/// system calls.
///
/// In the `MmapDirectory`, locks are implemented using the `fs2` crate definition of locks.
///
/// On MacOS & linux, it relies on `flock` (aka `BSD Lock`). These locks solve most of the
/// problems related to POSIX Locks, but may their contract may not be respected on `NFS`
/// depending on the implementation.
///
/// On Windows the semantics are again different.
#[derive(Clone)]
pub struct MmapDirectory {
inner: Arc<MmapDirectoryInner>,
}
struct MmapDirectoryInner {
root_path: PathBuf,
mmap_cache: RwLock<MmapCache>,
_temp_directory: Option<TempDir>,
watcher: RwLock<WatcherWrapper>,
}
impl MmapDirectoryInner {
fn new(
root_path: PathBuf,
temp_directory: Option<TempDir>,
) -> Result<MmapDirectoryInner, OpenDirectoryError> {
let watch_wrapper = WatcherWrapper::new(&root_path)?;
let mmap_directory_inner = MmapDirectoryInner {
root_path,
mmap_cache: Default::default(),
_temp_directory: temp_directory,
watcher: RwLock::new(watch_wrapper),
};
Ok(mmap_directory_inner)
}
fn watch(&self, watch_callback: WatchCallback) -> WatchHandle {
let mut wlock = self.watcher.write().unwrap();
wlock.watch(watch_callback)
}
mmap_cache: Arc<RwLock<MmapCache>>,
_temp_directory: Arc<Option<TempDir>>,
}
impl fmt::Debug for MmapDirectory {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
write!(f, "MmapDirectory({:?})", self.inner.root_path)
write!(f, "MmapDirectory({:?})", self.root_path)
}
}
impl MmapDirectory {
fn new(
root_path: PathBuf,
temp_directory: Option<TempDir>,
) -> Result<MmapDirectory, OpenDirectoryError> {
let inner = MmapDirectoryInner::new(root_path, temp_directory)?;
Ok(MmapDirectory {
inner: Arc::new(inner),
})
}
/// Creates a new MmapDirectory in a temporary directory.
///
/// This is mostly useful to test the MmapDirectory itself.
/// For your unit tests, prefer the RAMDirectory.
pub fn create_from_tempdir() -> Result<MmapDirectory, OpenDirectoryError> {
let tempdir = TempDir::new("index").map_err(OpenDirectoryError::IoError)?;
pub fn create_from_tempdir() -> io::Result<MmapDirectory> {
let tempdir = TempDir::new("index")?;
let tempdir_path = PathBuf::from(tempdir.path());
MmapDirectory::new(tempdir_path, Some(tempdir))
let directory = MmapDirectory {
root_path: tempdir_path,
mmap_cache: Arc::new(RwLock::new(MmapCache::default())),
_temp_directory: Arc::new(Some(tempdir)),
};
Ok(directory)
}
/// Opens a MmapDirectory in a directory.
@@ -295,14 +159,18 @@ impl MmapDirectory {
directory_path,
)))
} else {
Ok(MmapDirectory::new(PathBuf::from(directory_path), None)?)
Ok(MmapDirectory {
root_path: PathBuf::from(directory_path),
mmap_cache: Arc::new(RwLock::new(MmapCache::default())),
_temp_directory: Arc::new(None),
})
}
}
/// Joins a relative_path to the directory `root_path`
/// to create a proper complete `filepath`.
fn resolve_path(&self, relative_path: &Path) -> PathBuf {
self.inner.root_path.join(relative_path)
self.root_path.join(relative_path)
}
/// Sync the root directory.
@@ -327,7 +195,7 @@ impl MmapDirectory {
.custom_flags(winbase::FILE_FLAG_BACKUP_SEMANTICS);
}
let fd = open_opts.open(&self.inner.root_path)?;
let fd = open_opts.open(&self.root_path)?;
fd.sync_all()?;
Ok(())
}
@@ -337,35 +205,14 @@ impl MmapDirectory {
///
/// The `MmapDirectory` embeds a `MmapDirectory`
/// to avoid multiplying the `mmap` system calls.
pub fn get_cache_info(&self) -> CacheInfo {
self.inner
.mmap_cache
pub fn get_cache_info(&mut self) -> CacheInfo {
self.mmap_cache
.write()
.expect("mmap cache lock is poisoned")
.remove_weak_ref();
self.inner
.mmap_cache
.read()
.expect("Mmap cache lock is poisoned.")
.get_info()
}
}
/// We rely on fs2 for file locking. On Windows & MacOS this
/// uses BSD locks (`flock`). The lock is actually released when
/// the `File` object is dropped and its associated file descriptor
/// is closed.
struct ReleaseLockFile {
_file: File,
path: PathBuf,
}
impl Drop for ReleaseLockFile {
fn drop(&mut self) {
debug!("Releasing lock {:?}", self.path);
}
}
/// This Write wraps a File, but has the specificity of
/// call `sync_all` on flush.
struct SafeFileWriter(File);
@@ -398,7 +245,7 @@ impl Directory for MmapDirectory {
debug!("Open Read {:?}", path);
let full_path = self.resolve_path(path);
let mut mmap_cache = self.inner.mmap_cache.write().map_err(|_| {
let mut mmap_cache = self.mmap_cache.write().map_err(|_| {
let msg = format!(
"Failed to acquired write lock \
on mmap cache while reading {:?}",
@@ -406,34 +253,11 @@ impl Directory for MmapDirectory {
);
IOError::with_path(path.to_owned(), make_io_err(msg))
})?;
Ok(mmap_cache
.get_mmap(&full_path)?
.map(ReadOnlySource::from)
.unwrap_or_else(ReadOnlySource::empty))
}
/// Any entry associated to the path in the mmap will be
/// removed before the file is deleted.
fn delete(&self, path: &Path) -> result::Result<(), DeleteError> {
debug!("Deleting file {:?}", path);
let full_path = self.resolve_path(path);
match fs::remove_file(&full_path) {
Ok(_) => self
.sync_directory()
.map_err(|e| IOError::with_path(path.to_owned(), e).into()),
Err(e) => {
if e.kind() == io::ErrorKind::NotFound {
Err(DeleteError::FileDoesNotExist(path.to_owned()))
} else {
Err(IOError::with_path(path.to_owned(), e).into())
}
}
}
}
fn exists(&self, path: &Path) -> bool {
let full_path = self.resolve_path(path);
full_path.exists()
.map(ReadOnlySource::Mmap)
.unwrap_or_else(|| ReadOnlySource::Anonymous(SharedVecSlice::empty())))
}
fn open_write(&mut self, path: &Path) -> Result<WritePtr, OpenWriteError> {
@@ -466,6 +290,44 @@ impl Directory for MmapDirectory {
Ok(BufWriter::new(Box::new(writer)))
}
/// Any entry associated to the path in the mmap will be
/// removed before the file is deleted.
fn delete(&self, path: &Path) -> result::Result<(), DeleteError> {
debug!("Deleting file {:?}", path);
let full_path = self.resolve_path(path);
let mut mmap_cache = self.mmap_cache.write().map_err(|_| {
let msg = format!(
"Failed to acquired write lock \
on mmap cache while deleting {:?}",
path
);
IOError::with_path(path.to_owned(), make_io_err(msg))
})?;
mmap_cache.discard_from_cache(path);
// Removing the entry in the MMap cache.
// The munmap will appear on Drop,
// when the last reference is gone.
mmap_cache.cache.remove(&full_path);
match fs::remove_file(&full_path) {
Ok(_) => self
.sync_directory()
.map_err(|e| IOError::with_path(path.to_owned(), e).into()),
Err(e) => {
if e.kind() == io::ErrorKind::NotFound {
Err(DeleteError::FileDoesNotExist(path.to_owned()))
} else {
Err(IOError::with_path(path.to_owned(), e).into())
}
}
}
}
fn exists(&self, path: &Path) -> bool {
let full_path = self.resolve_path(path);
full_path.exists()
}
fn atomic_read(&self, path: &Path) -> Result<Vec<u8>, OpenReadError> {
let full_path = self.resolve_path(path);
let mut buffer = Vec::new();
@@ -492,30 +354,6 @@ impl Directory for MmapDirectory {
meta_file.write(|f| f.write_all(data))?;
Ok(())
}
fn acquire_lock(&self, lock: &Lock) -> Result<DirectoryLock, LockError> {
let full_path = self.resolve_path(&lock.filepath);
// We make sure that the file exists.
let file: File = OpenOptions::new()
.write(true)
.create(true) //< if the file does not exist yet, create it.
.open(&full_path)
.map_err(LockError::IOError)?;
if lock.is_blocking {
file.lock_exclusive().map_err(LockError::IOError)?;
} else {
file.try_lock_exclusive().map_err(|_| LockError::LockBusy)?
}
// dropping the file handle will release the lock.
Ok(DirectoryLock::from(Box::new(ReleaseLockFile {
path: lock.filepath.clone(),
_file: file,
})))
}
fn watch(&self, watch_callback: WatchCallback) -> WatchHandle {
self.inner.watch(watch_callback)
}
}
#[cfg(test)]
@@ -525,13 +363,6 @@ mod tests {
// The following tests are specific to the MmapDirectory
use super::*;
use schema::{Schema, SchemaBuilder, TEXT};
use std::fs;
use std::sync::atomic::{AtomicUsize, Ordering};
use std::thread;
use std::time::Duration;
use Index;
use ReloadPolicy;
#[test]
fn test_open_non_existant_path() {
@@ -556,7 +387,7 @@ mod tests {
#[test]
fn test_cache() {
let content = b"abc";
let content = "abc".as_bytes();
// here we test if the cache releases
// mmaps correctly.
@@ -572,104 +403,26 @@ mod tests {
w.flush().unwrap();
}
}
let mut keep = vec![];
for (i, path) in paths.iter().enumerate() {
keep.push(mmap_directory.open_read(path).unwrap());
assert_eq!(mmap_directory.get_cache_info().mmapped.len(), i + 1);
}
assert_eq!(mmap_directory.get_cache_info().counters.hit, 0);
assert_eq!(mmap_directory.get_cache_info().counters.miss, 10);
assert_eq!(mmap_directory.get_cache_info().mmapped.len(), 10);
for path in paths.iter() {
let _r = mmap_directory.open_read(path).unwrap();
assert_eq!(mmap_directory.get_cache_info().mmapped.len(), num_paths);
{
for (i, path) in paths.iter().enumerate() {
let _r = mmap_directory.open_read(path).unwrap();
assert_eq!(mmap_directory.get_cache_info().mmapped.len(), i + 1);
}
for path in paths.iter() {
let _r = mmap_directory.open_read(path).unwrap();
assert_eq!(mmap_directory.get_cache_info().mmapped.len(), num_paths);
}
for (i, path) in paths.iter().enumerate() {
mmap_directory.delete(path).unwrap();
assert_eq!(
mmap_directory.get_cache_info().mmapped.len(),
num_paths - i - 1
);
}
}
assert_eq!(mmap_directory.get_cache_info().counters.hit, 10);
assert_eq!(mmap_directory.get_cache_info().counters.miss, 10);
assert_eq!(mmap_directory.get_cache_info().mmapped.len(), 10);
for path in paths.iter() {
let _r = mmap_directory.open_read(path).unwrap();
assert_eq!(mmap_directory.get_cache_info().mmapped.len(), 10);
}
assert_eq!(mmap_directory.get_cache_info().counters.hit, 20);
assert_eq!(mmap_directory.get_cache_info().counters.miss, 10);
assert_eq!(mmap_directory.get_cache_info().mmapped.len(), 10);
drop(keep);
for path in paths.iter() {
let _r = mmap_directory.open_read(path).unwrap();
assert_eq!(mmap_directory.get_cache_info().mmapped.len(), 1);
}
assert_eq!(mmap_directory.get_cache_info().counters.hit, 20);
assert_eq!(mmap_directory.get_cache_info().counters.miss, 20);
assert_eq!(mmap_directory.get_cache_info().mmapped.len(), 0);
for path in &paths {
mmap_directory.delete(path).unwrap();
}
assert_eq!(mmap_directory.get_cache_info().counters.hit, 20);
assert_eq!(mmap_directory.get_cache_info().counters.miss, 20);
assert_eq!(mmap_directory.get_cache_info().mmapped.len(), 0);
for path in paths.iter() {
assert!(mmap_directory.open_read(path).is_err());
}
assert_eq!(mmap_directory.get_cache_info().counters.hit, 20);
assert_eq!(mmap_directory.get_cache_info().counters.miss, 30);
assert_eq!(mmap_directory.get_cache_info().mmapped.len(), 0);
}
#[test]
fn test_watch_wrapper() {
let counter: Arc<AtomicUsize> = Default::default();
let counter_clone = counter.clone();
let tmp_dir: TempDir = tempdir::TempDir::new("test_watch_wrapper").unwrap();
let tmp_dirpath = tmp_dir.path().to_owned();
let mut watch_wrapper = WatcherWrapper::new(&tmp_dirpath).unwrap();
let tmp_file = tmp_dirpath.join("coucou");
let _handle = watch_wrapper.watch(Box::new(move || {
counter_clone.fetch_add(1, Ordering::SeqCst);
}));
assert_eq!(counter.load(Ordering::SeqCst), 0);
fs::write(&tmp_file, b"whateverwilldo").unwrap();
thread::sleep(Duration::new(0, 1_000u32));
}
#[test]
fn test_mmap_released() {
let mmap_directory = MmapDirectory::create_from_tempdir().unwrap();
let mut schema_builder: SchemaBuilder = Schema::builder();
let text_field = schema_builder.add_text_field("text", TEXT);
let schema = schema_builder.build();
{
let index = Index::create(mmap_directory.clone(), schema).unwrap();
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
for _num_commits in 0..16 {
for _ in 0..10 {
index_writer.add_document(doc!(text_field=>"abc"));
}
index_writer.commit().unwrap();
}
let reader = index
.reader_builder()
.reload_policy(ReloadPolicy::Manual)
.try_into()
.unwrap();
for _ in 0..30 {
index_writer.add_document(doc!(text_field=>"abc"));
index_writer.commit().unwrap();
reader.reload().unwrap();
}
index_writer.wait_merging_threads().unwrap();
reader.reload().unwrap();
let num_segments = reader.searcher().segment_readers().len();
assert_eq!(num_segments, 4);
assert_eq!(
num_segments * 7,
mmap_directory.get_cache_info().mmapped.len()
);
}
assert_eq!(mmap_directory.get_cache_info().mmapped.len(), 0);
}
}

View File

@@ -8,23 +8,19 @@ WORM directory abstraction.
mod mmap_directory;
mod directory;
mod directory_lock;
mod managed_directory;
mod ram_directory;
mod read_only_source;
mod watch_event_router;
mod shared_vec_slice;
/// Errors specific to the directory module.
pub mod error;
pub use self::directory::DirectoryLock;
use std::io::{BufWriter, Seek, Write};
pub use self::directory::{Directory, DirectoryClone};
pub use self::directory_lock::{Lock, INDEX_WRITER_LOCK, META_LOCK};
pub use self::ram_directory::RAMDirectory;
pub use self::read_only_source::ReadOnlySource;
pub(crate) use self::watch_event_router::WatchCallbackList;
pub use self::watch_event_router::{WatchCallback, WatchHandle};
use std::io::{BufWriter, Seek, Write};
#[cfg(feature = "mmap")]
pub use self::mmap_directory::MmapDirectory;
@@ -42,4 +38,128 @@ impl<T: Seek + Write> SeekableWrite for T {}
pub type WritePtr = BufWriter<Box<SeekableWrite>>;
#[cfg(test)]
mod tests;
mod tests {
use super::*;
use std::io::{Seek, SeekFrom, Write};
use std::path::Path;
lazy_static! {
static ref TEST_PATH: &'static Path = Path::new("some_path_for_test");
}
#[test]
fn test_ram_directory() {
let mut ram_directory = RAMDirectory::create();
test_directory(&mut ram_directory);
}
#[test]
#[cfg(feature = "mmap")]
fn test_mmap_directory() {
let mut mmap_directory = MmapDirectory::create_from_tempdir().unwrap();
test_directory(&mut mmap_directory);
}
#[test]
#[should_panic]
fn ram_directory_panics_if_flush_forgotten() {
let mut ram_directory = RAMDirectory::create();
let mut write_file = ram_directory.open_write(*TEST_PATH).unwrap();
assert!(write_file.write_all(&[4]).is_ok());
}
fn test_simple(directory: &mut Directory) {
{
{
let mut write_file = directory.open_write(*TEST_PATH).unwrap();
assert!(directory.exists(*TEST_PATH));
write_file.write_all(&[4]).unwrap();
write_file.write_all(&[3]).unwrap();
write_file.write_all(&[7, 3, 5]).unwrap();
write_file.flush().unwrap();
}
let read_file = directory.open_read(*TEST_PATH).unwrap();
let data: &[u8] = &*read_file;
assert_eq!(data, &[4u8, 3u8, 7u8, 3u8, 5u8]);
}
assert!(directory.delete(*TEST_PATH).is_ok());
assert!(!directory.exists(*TEST_PATH));
}
fn test_seek(directory: &mut Directory) {
{
{
let mut write_file = directory.open_write(*TEST_PATH).unwrap();
write_file.write_all(&[4, 3, 7, 3, 5]).unwrap();
write_file.seek(SeekFrom::Start(0)).unwrap();
write_file.write_all(&[3, 1]).unwrap();
write_file.flush().unwrap();
}
let read_file = directory.open_read(*TEST_PATH).unwrap();
let data: &[u8] = &*read_file;
assert_eq!(data, &[3u8, 1u8, 7u8, 3u8, 5u8]);
}
assert!(directory.delete(*TEST_PATH).is_ok());
}
fn test_rewrite_forbidden(directory: &mut Directory) {
{
directory.open_write(*TEST_PATH).unwrap();
assert!(directory.exists(*TEST_PATH));
}
{
assert!(directory.open_write(*TEST_PATH).is_err());
}
assert!(directory.delete(*TEST_PATH).is_ok());
}
fn test_write_create_the_file(directory: &mut Directory) {
{
assert!(directory.open_read(*TEST_PATH).is_err());
let _w = directory.open_write(*TEST_PATH).unwrap();
assert!(directory.exists(*TEST_PATH));
assert!(directory.open_read(*TEST_PATH).is_ok());
assert!(directory.delete(*TEST_PATH).is_ok());
}
}
fn test_directory_delete(directory: &mut Directory) {
assert!(directory.open_read(*TEST_PATH).is_err());
let mut write_file = directory.open_write(*TEST_PATH).unwrap();
write_file.write_all(&[1, 2, 3, 4]).unwrap();
write_file.flush().unwrap();
{
let read_handle = directory.open_read(*TEST_PATH).unwrap();
{
assert_eq!(&*read_handle, &[1u8, 2u8, 3u8, 4u8]);
// Mapped files can't be deleted on Windows
if !cfg!(windows) {
assert!(directory.delete(*TEST_PATH).is_ok());
assert_eq!(&*read_handle, &[1u8, 2u8, 3u8, 4u8]);
}
assert!(directory.delete(Path::new("SomeOtherPath")).is_err());
}
}
if cfg!(windows) {
assert!(directory.delete(*TEST_PATH).is_ok());
}
assert!(directory.open_read(*TEST_PATH).is_err());
assert!(directory.delete(*TEST_PATH).is_err());
}
fn test_directory(directory: &mut Directory) {
test_simple(directory);
test_seek(directory);
test_rewrite_forbidden(directory);
test_write_create_the_file(directory);
test_directory_delete(directory);
}
}

View File

@@ -1,8 +1,8 @@
use core::META_FILEPATH;
use directory::error::{DeleteError, OpenReadError, OpenWriteError};
use directory::WatchCallbackList;
use super::shared_vec_slice::SharedVecSlice;
use common::make_io_err;
use directory::error::{DeleteError, IOError, OpenReadError, OpenWriteError};
use directory::WritePtr;
use directory::{Directory, ReadOnlySource, WatchCallback, WatchHandle};
use directory::{Directory, ReadOnlySource};
use std::collections::HashMap;
use std::fmt;
use std::io::{self, BufWriter, Cursor, Seek, SeekFrom, Write};
@@ -22,13 +22,13 @@ use std::sync::{Arc, RwLock};
///
struct VecWriter {
path: PathBuf,
shared_directory: RAMDirectory,
shared_directory: InnerDirectory,
data: Cursor<Vec<u8>>,
is_flushed: bool,
}
impl VecWriter {
fn new(path_buf: PathBuf, shared_directory: RAMDirectory) -> VecWriter {
fn new(path_buf: PathBuf, shared_directory: InnerDirectory) -> VecWriter {
VecWriter {
path: path_buf,
data: Cursor::new(Vec::new()),
@@ -64,44 +64,75 @@ impl Write for VecWriter {
fn flush(&mut self) -> io::Result<()> {
self.is_flushed = true;
let mut fs = self.shared_directory.fs.write().unwrap();
fs.write(self.path.clone(), self.data.get_ref());
self.shared_directory
.write(self.path.clone(), self.data.get_ref())?;
Ok(())
}
}
#[derive(Default)]
struct InnerDirectory {
fs: HashMap<PathBuf, ReadOnlySource>,
watch_router: WatchCallbackList,
}
#[derive(Clone)]
struct InnerDirectory(Arc<RwLock<HashMap<PathBuf, Arc<Vec<u8>>>>>);
impl InnerDirectory {
fn write(&mut self, path: PathBuf, data: &[u8]) -> bool {
let data = ReadOnlySource::new(Vec::from(data));
self.fs.insert(path, data).is_some()
fn new() -> InnerDirectory {
InnerDirectory(Arc::new(RwLock::new(HashMap::new())))
}
fn write(&self, path: PathBuf, data: &[u8]) -> io::Result<bool> {
let mut map = self.0.write().map_err(|_| {
make_io_err(format!(
"Failed to lock the directory, when trying to write {:?}",
path
))
})?;
let prev_value = map.insert(path, Arc::new(Vec::from(data)));
Ok(prev_value.is_some())
}
fn open_read(&self, path: &Path) -> Result<ReadOnlySource, OpenReadError> {
self.fs
.get(path)
.ok_or_else(|| OpenReadError::FileDoesNotExist(PathBuf::from(path)))
.map(Clone::clone)
self.0
.read()
.map_err(|_| {
let msg = format!(
"Failed to acquire read lock for the \
directory when trying to read {:?}",
path
);
let io_err = make_io_err(msg);
OpenReadError::IOError(IOError::with_path(path.to_owned(), io_err))
})
.and_then(|readable_map| {
readable_map
.get(path)
.ok_or_else(|| OpenReadError::FileDoesNotExist(PathBuf::from(path)))
.map(Arc::clone)
.map(|data| ReadOnlySource::Anonymous(SharedVecSlice::new(data)))
})
}
fn delete(&mut self, path: &Path) -> result::Result<(), DeleteError> {
match self.fs.remove(path) {
Some(_) => Ok(()),
None => Err(DeleteError::FileDoesNotExist(PathBuf::from(path))),
}
fn delete(&self, path: &Path) -> result::Result<(), DeleteError> {
self.0
.write()
.map_err(|_| {
let msg = format!(
"Failed to acquire write lock for the \
directory when trying to delete {:?}",
path
);
let io_err = make_io_err(msg);
DeleteError::IOError(IOError::with_path(path.to_owned(), io_err))
})
.and_then(|mut writable_map| match writable_map.remove(path) {
Some(_) => Ok(()),
None => Err(DeleteError::FileDoesNotExist(PathBuf::from(path))),
})
}
fn exists(&self, path: &Path) -> bool {
self.fs.contains_key(path)
}
fn watch(&mut self, watch_handle: WatchCallback) -> WatchHandle {
self.watch_router.subscribe(watch_handle)
self.0
.read()
.expect("Failed to get read lock directory.")
.contains_key(path)
}
}
@@ -116,36 +147,33 @@ impl fmt::Debug for RAMDirectory {
/// It is mainly meant for unit testing.
/// Writes are only made visible upon flushing.
///
#[derive(Clone, Default)]
#[derive(Clone)]
pub struct RAMDirectory {
fs: Arc<RwLock<InnerDirectory>>,
fs: InnerDirectory,
}
impl RAMDirectory {
/// Constructor
pub fn create() -> RAMDirectory {
Self::default()
RAMDirectory {
fs: InnerDirectory::new(),
}
}
}
impl Directory for RAMDirectory {
fn open_read(&self, path: &Path) -> result::Result<ReadOnlySource, OpenReadError> {
self.fs.read().unwrap().open_read(path)
}
fn delete(&self, path: &Path) -> result::Result<(), DeleteError> {
self.fs.write().unwrap().delete(path)
}
fn exists(&self, path: &Path) -> bool {
self.fs.read().unwrap().exists(path)
self.fs.open_read(path)
}
fn open_write(&mut self, path: &Path) -> Result<WritePtr, OpenWriteError> {
let mut fs = self.fs.write().unwrap();
let path_buf = PathBuf::from(path);
let vec_writer = VecWriter::new(path_buf.clone(), self.clone());
let exists = fs.write(path_buf.clone(), &[]);
let vec_writer = VecWriter::new(path_buf.clone(), self.fs.clone());
let exists = self
.fs
.write(path_buf.clone(), &Vec::new())
.map_err(|err| IOError::with_path(path.to_owned(), err))?;
// force the creation of the file to mimic the MMap directory.
if exists {
Err(OpenWriteError::FileAlreadyExists(path_buf))
@@ -154,8 +182,17 @@ impl Directory for RAMDirectory {
}
}
fn delete(&self, path: &Path) -> result::Result<(), DeleteError> {
self.fs.delete(path)
}
fn exists(&self, path: &Path) -> bool {
self.fs.exists(path)
}
fn atomic_read(&self, path: &Path) -> Result<Vec<u8>, OpenReadError> {
Ok(self.open_read(path)?.as_slice().to_owned())
let read = self.open_read(path)?;
Ok(read.as_slice().to_owned())
}
fn atomic_write(&mut self, path: &Path, data: &[u8]) -> io::Result<()> {
@@ -164,20 +201,10 @@ impl Directory for RAMDirectory {
msg.unwrap_or("Undefined".to_string())
)));
let path_buf = PathBuf::from(path);
// Reserve the path to prevent calls to .write() to succeed.
self.fs.write().unwrap().write(path_buf.clone(), &[]);
let mut vec_writer = VecWriter::new(path_buf.clone(), self.clone());
let mut vec_writer = VecWriter::new(path_buf.clone(), self.fs.clone());
self.fs.write(path_buf, &Vec::new())?;
vec_writer.write_all(data)?;
vec_writer.flush()?;
if path == Path::new(&*META_FILEPATH) {
self.fs.write().unwrap().watch_router.broadcast();
}
Ok(())
}
fn watch(&self, watch_callback: WatchCallback) -> WatchHandle {
self.fs.write().unwrap().watch(watch_callback)
}
}

View File

@@ -1,9 +1,9 @@
use super::shared_vec_slice::SharedVecSlice;
use common::HasLen;
#[cfg(feature = "mmap")]
use fst::raw::MmapReadOnly;
use stable_deref_trait::{CloneStableDeref, StableDeref};
use std::ops::Deref;
use std::sync::Arc;
pub type BoxedData = Box<Deref<Target = [u8]> + Send + Sync + 'static>;
/// Read object that represents files in tantivy.
///
@@ -11,10 +11,12 @@ pub type BoxedData = Box<Deref<Target = [u8]> + Send + Sync + 'static>;
/// the data in the form of a constant read-only `&[u8]`.
/// Whatever happens to the directory file, the data
/// hold by this object should never be altered or destroyed.
pub struct ReadOnlySource {
data: Arc<BoxedData>,
start: usize,
stop: usize,
pub enum ReadOnlySource {
/// Mmap source of data
#[cfg(feature = "mmap")]
Mmap(MmapReadOnly),
/// Wrapping a `Vec<u8>`
Anonymous(SharedVecSlice),
}
unsafe impl StableDeref for ReadOnlySource {}
@@ -28,38 +30,19 @@ impl Deref for ReadOnlySource {
}
}
impl From<Arc<BoxedData>> for ReadOnlySource {
fn from(data: Arc<BoxedData>) -> Self {
let len = data.len();
ReadOnlySource {
data,
start: 0,
stop: len,
}
}
}
impl ReadOnlySource {
pub(crate) fn new<D>(data: D) -> ReadOnlySource
where
D: Deref<Target = [u8]> + Send + Sync + 'static,
{
let len = data.len();
ReadOnlySource {
data: Arc::new(Box::new(data)),
start: 0,
stop: len,
}
}
/// Creates an empty ReadOnlySource
pub fn empty() -> ReadOnlySource {
ReadOnlySource::new(&[][..])
ReadOnlySource::Anonymous(SharedVecSlice::empty())
}
/// Returns the data underlying the ReadOnlySource object.
pub fn as_slice(&self) -> &[u8] {
&self.data[self.start..self.stop]
match *self {
#[cfg(feature = "mmap")]
ReadOnlySource::Mmap(ref mmap_read_only) => mmap_read_only.as_slice(),
ReadOnlySource::Anonymous(ref shared_vec) => shared_vec.as_slice(),
}
}
/// Splits into 2 `ReadOnlySource`, at the offset given
@@ -80,18 +63,22 @@ impl ReadOnlySource {
/// worth of data in anonymous memory, and only a
/// 1KB slice is remaining, the whole `500MBs`
/// are retained in memory.
pub fn slice(&self, start: usize, stop: usize) -> ReadOnlySource {
pub fn slice(&self, from_offset: usize, to_offset: usize) -> ReadOnlySource {
assert!(
start <= stop,
from_offset <= to_offset,
"Requested negative slice [{}..{}]",
start,
stop
from_offset,
to_offset
);
assert!(stop <= self.len());
ReadOnlySource {
data: self.data.clone(),
start: self.start + start,
stop: self.start + stop,
match *self {
#[cfg(feature = "mmap")]
ReadOnlySource::Mmap(ref mmap_read_only) => {
let sliced_mmap = mmap_read_only.range(from_offset, to_offset - from_offset);
ReadOnlySource::Mmap(sliced_mmap)
}
ReadOnlySource::Anonymous(ref shared_vec) => {
ReadOnlySource::Anonymous(shared_vec.slice(from_offset, to_offset))
}
}
}
@@ -100,7 +87,8 @@ impl ReadOnlySource {
///
/// Equivalent to `.slice(from_offset, self.len())`
pub fn slice_from(&self, from_offset: usize) -> ReadOnlySource {
self.slice(from_offset, self.len())
let len = self.len();
self.slice(from_offset, len)
}
/// Like `.slice(...)` but enforcing only the `to`
@@ -114,18 +102,19 @@ impl ReadOnlySource {
impl HasLen for ReadOnlySource {
fn len(&self) -> usize {
self.stop - self.start
self.as_slice().len()
}
}
impl Clone for ReadOnlySource {
fn clone(&self) -> Self {
self.slice_from(0)
self.slice(0, self.len())
}
}
impl From<Vec<u8>> for ReadOnlySource {
fn from(data: Vec<u8>) -> ReadOnlySource {
ReadOnlySource::new(data)
let shared_data = SharedVecSlice::from(data);
ReadOnlySource::Anonymous(shared_data)
}
}

View File

@@ -0,0 +1,41 @@
use std::sync::Arc;
#[derive(Clone)]
pub struct SharedVecSlice {
pub data: Arc<Vec<u8>>,
pub start: usize,
pub len: usize,
}
impl SharedVecSlice {
pub fn empty() -> SharedVecSlice {
SharedVecSlice::new(Arc::new(Vec::new()))
}
pub fn new(data: Arc<Vec<u8>>) -> SharedVecSlice {
let data_len = data.len();
SharedVecSlice {
data,
start: 0,
len: data_len,
}
}
pub fn as_slice(&self) -> &[u8] {
&self.data[self.start..self.start + self.len]
}
pub fn slice(&self, from_offset: usize, to_offset: usize) -> SharedVecSlice {
SharedVecSlice {
data: Arc::clone(&self.data),
start: self.start + from_offset,
len: to_offset - from_offset,
}
}
}
impl From<Vec<u8>> for SharedVecSlice {
fn from(data: Vec<u8>) -> SharedVecSlice {
SharedVecSlice::new(Arc::new(data))
}
}

View File

@@ -1,222 +0,0 @@
use super::*;
use std::io::{Seek, SeekFrom, Write};
use std::mem;
use std::path::{Path, PathBuf};
use std::sync::atomic::AtomicUsize;
use std::sync::atomic::Ordering;
use std::sync::Arc;
use std::thread;
use std::time;
use std::time::Duration;
lazy_static! {
static ref TEST_PATH: &'static Path = Path::new("some_path_for_test");
}
#[test]
fn test_ram_directory() {
let mut ram_directory = RAMDirectory::create();
test_directory(&mut ram_directory);
}
#[test]
#[cfg(feature = "mmap")]
fn test_mmap_directory() {
let mut mmap_directory = MmapDirectory::create_from_tempdir().unwrap();
test_directory(&mut mmap_directory);
}
#[test]
#[should_panic]
fn ram_directory_panics_if_flush_forgotten() {
let mut ram_directory = RAMDirectory::create();
let mut write_file = ram_directory.open_write(*TEST_PATH).unwrap();
assert!(write_file.write_all(&[4]).is_ok());
}
fn test_simple(directory: &mut Directory) {
{
let mut write_file = directory.open_write(*TEST_PATH).unwrap();
assert!(directory.exists(*TEST_PATH));
write_file.write_all(&[4]).unwrap();
write_file.write_all(&[3]).unwrap();
write_file.write_all(&[7, 3, 5]).unwrap();
write_file.flush().unwrap();
}
{
let read_file = directory.open_read(*TEST_PATH).unwrap();
let data: &[u8] = &*read_file;
assert_eq!(data, &[4u8, 3u8, 7u8, 3u8, 5u8]);
}
assert!(directory.delete(*TEST_PATH).is_ok());
assert!(!directory.exists(*TEST_PATH));
}
fn test_seek(directory: &mut Directory) {
{
{
let mut write_file = directory.open_write(*TEST_PATH).unwrap();
write_file.write_all(&[4, 3, 7, 3, 5]).unwrap();
write_file.seek(SeekFrom::Start(0)).unwrap();
write_file.write_all(&[3, 1]).unwrap();
write_file.flush().unwrap();
}
let read_file = directory.open_read(*TEST_PATH).unwrap();
let data: &[u8] = &*read_file;
assert_eq!(data, &[3u8, 1u8, 7u8, 3u8, 5u8]);
}
assert!(directory.delete(*TEST_PATH).is_ok());
}
fn test_rewrite_forbidden(directory: &mut Directory) {
{
directory.open_write(*TEST_PATH).unwrap();
assert!(directory.exists(*TEST_PATH));
}
{
assert!(directory.open_write(*TEST_PATH).is_err());
}
assert!(directory.delete(*TEST_PATH).is_ok());
}
fn test_write_create_the_file(directory: &mut Directory) {
{
assert!(directory.open_read(*TEST_PATH).is_err());
let _w = directory.open_write(*TEST_PATH).unwrap();
assert!(directory.exists(*TEST_PATH));
assert!(directory.open_read(*TEST_PATH).is_ok());
assert!(directory.delete(*TEST_PATH).is_ok());
}
}
fn test_directory_delete(directory: &mut Directory) {
assert!(directory.open_read(*TEST_PATH).is_err());
let mut write_file = directory.open_write(*TEST_PATH).unwrap();
write_file.write_all(&[1, 2, 3, 4]).unwrap();
write_file.flush().unwrap();
{
let read_handle = directory.open_read(*TEST_PATH).unwrap();
{
assert_eq!(&*read_handle, &[1u8, 2u8, 3u8, 4u8]);
// Mapped files can't be deleted on Windows
if !cfg!(windows) {
assert!(directory.delete(*TEST_PATH).is_ok());
assert_eq!(&*read_handle, &[1u8, 2u8, 3u8, 4u8]);
}
assert!(directory.delete(Path::new("SomeOtherPath")).is_err());
}
}
if cfg!(windows) {
assert!(directory.delete(*TEST_PATH).is_ok());
}
assert!(directory.open_read(*TEST_PATH).is_err());
assert!(directory.delete(*TEST_PATH).is_err());
}
fn test_directory(directory: &mut Directory) {
test_simple(directory);
test_seek(directory);
test_rewrite_forbidden(directory);
test_write_create_the_file(directory);
test_directory_delete(directory);
test_lock_non_blocking(directory);
test_lock_blocking(directory);
test_watch(directory);
}
fn test_watch(directory: &mut Directory) {
let counter: Arc<AtomicUsize> = Default::default();
let counter_clone = counter.clone();
let watch_callback = Box::new(move || {
counter_clone.fetch_add(1, Ordering::SeqCst);
});
assert!(directory
.atomic_write(Path::new("meta.json"), b"random_test_data")
.is_ok());
thread::sleep(Duration::new(0, 10_000));
assert_eq!(0, counter.load(Ordering::SeqCst));
let watch_handle = directory.watch(watch_callback);
for i in 0..10 {
assert_eq!(i, counter.load(Ordering::SeqCst));
assert!(directory
.atomic_write(Path::new("meta.json"), b"random_test_data_2")
.is_ok());
for _ in 0..100 {
if counter.load(Ordering::SeqCst) > i {
break;
}
thread::sleep(Duration::from_millis(10));
}
assert_eq!(i + 1, counter.load(Ordering::SeqCst));
}
mem::drop(watch_handle);
assert!(directory
.atomic_write(Path::new("meta.json"), b"random_test_data")
.is_ok());
thread::sleep(Duration::from_millis(200));
assert_eq!(10, counter.load(Ordering::SeqCst));
}
fn test_lock_non_blocking(directory: &mut Directory) {
{
let lock_a_res = directory.acquire_lock(&Lock {
filepath: PathBuf::from("a.lock"),
is_blocking: false,
});
assert!(lock_a_res.is_ok());
let lock_b_res = directory.acquire_lock(&Lock {
filepath: PathBuf::from("b.lock"),
is_blocking: false,
});
assert!(lock_b_res.is_ok());
let lock_a_res2 = directory.acquire_lock(&Lock {
filepath: PathBuf::from("a.lock"),
is_blocking: false,
});
assert!(lock_a_res2.is_err());
}
let lock_a_res = directory.acquire_lock(&Lock {
filepath: PathBuf::from("a.lock"),
is_blocking: false,
});
assert!(lock_a_res.is_ok());
}
fn test_lock_blocking(directory: &mut Directory) {
let lock_a_res = directory.acquire_lock(&Lock {
filepath: PathBuf::from("a.lock"),
is_blocking: true,
});
assert!(lock_a_res.is_ok());
std::thread::spawn(move || {
//< lock_a_res is sent to the thread.
std::thread::sleep(time::Duration::from_millis(10));
// explicitely droping lock_a_res. It would have been sufficient to just force it
// to be part of the move, but the intent seems clearer that way.
drop(lock_a_res);
});
{
// A non-blocking call should fail, as the thread is running and holding the lock.
let lock_a_res = directory.acquire_lock(&Lock {
filepath: PathBuf::from("a.lock"),
is_blocking: false,
});
assert!(lock_a_res.is_err());
}
{
// the blocking call should wait for at least 10ms.
let start = time::Instant::now();
let lock_a_res = directory.acquire_lock(&Lock {
filepath: PathBuf::from("a.lock"),
is_blocking: true,
});
assert!(lock_a_res.is_ok());
assert!(start.elapsed().subsec_millis() >= 10);
}
}

View File

@@ -1,156 +0,0 @@
use std::sync::Arc;
use std::sync::RwLock;
use std::sync::Weak;
/// Type alias for callbacks registered when watching files of a `Directory`.
pub type WatchCallback = Box<Fn() -> () + Sync + Send>;
/// Helper struct to implement the watch method in `Directory` implementations.
///
/// It registers callbacks (See `.subscribe(...)`) and
/// calls them upon calls to `.broadcast(...)`.
#[derive(Default)]
pub struct WatchCallbackList {
router: RwLock<Vec<Weak<WatchCallback>>>,
}
/// Controls how long a directory should watch for a file change.
///
/// After all the clones of `WatchHandle` are dropped, the associated will not be called when a
/// file change is detected.
#[must_use = "This `WatchHandle` controls the lifetime of the watch and should therefore be used."]
#[derive(Clone)]
pub struct WatchHandle(Arc<WatchCallback>);
impl WatchCallbackList {
/// Suscribes a new callback and returns a handle that controls the lifetime of the callback.
pub fn subscribe(&self, watch_callback: WatchCallback) -> WatchHandle {
let watch_callback_arc = Arc::new(watch_callback);
let watch_callback_weak = Arc::downgrade(&watch_callback_arc);
self.router.write().unwrap().push(watch_callback_weak);
WatchHandle(watch_callback_arc)
}
fn list_callback(&self) -> Vec<Arc<WatchCallback>> {
let mut callbacks = vec![];
let mut router_wlock = self.router.write().unwrap();
let mut i = 0;
while i < router_wlock.len() {
if let Some(watch) = router_wlock[i].upgrade() {
callbacks.push(watch);
i += 1;
} else {
router_wlock.swap_remove(i);
}
}
callbacks
}
/// Triggers all callbacks
pub fn broadcast(&self) {
let callbacks = self.list_callback();
let spawn_res = std::thread::Builder::new()
.name("watch-callbacks".to_string())
.spawn(move || {
for callback in callbacks {
callback();
}
});
if let Err(err) = spawn_res {
error!(
"Failed to spawn thread to call watch callbacks. Cause: {:?}",
err
);
}
}
}
#[cfg(test)]
mod tests {
use directory::WatchCallbackList;
use std::mem;
use std::sync::atomic::{AtomicUsize, Ordering};
use std::sync::Arc;
use std::thread;
use std::time::Duration;
const WAIT_TIME: u64 = 20;
#[test]
fn test_watch_event_router_simple() {
let watch_event_router = WatchCallbackList::default();
let counter: Arc<AtomicUsize> = Default::default();
let counter_clone = counter.clone();
let inc_callback = Box::new(move || {
counter_clone.fetch_add(1, Ordering::SeqCst);
});
watch_event_router.broadcast();
assert_eq!(0, counter.load(Ordering::SeqCst));
let handle_a = watch_event_router.subscribe(inc_callback);
thread::sleep(Duration::from_millis(WAIT_TIME));
assert_eq!(0, counter.load(Ordering::SeqCst));
watch_event_router.broadcast();
thread::sleep(Duration::from_millis(WAIT_TIME));
assert_eq!(1, counter.load(Ordering::SeqCst));
watch_event_router.broadcast();
watch_event_router.broadcast();
watch_event_router.broadcast();
thread::sleep(Duration::from_millis(WAIT_TIME));
assert_eq!(4, counter.load(Ordering::SeqCst));
mem::drop(handle_a);
watch_event_router.broadcast();
thread::sleep(Duration::from_millis(WAIT_TIME));
assert_eq!(4, counter.load(Ordering::SeqCst));
}
#[test]
fn test_watch_event_router_multiple_callback_same_key() {
let watch_event_router = WatchCallbackList::default();
let counter: Arc<AtomicUsize> = Default::default();
let inc_callback = |inc: usize| {
let counter_clone = counter.clone();
Box::new(move || {
counter_clone.fetch_add(inc, Ordering::SeqCst);
})
};
let handle_a = watch_event_router.subscribe(inc_callback(1));
let handle_a2 = watch_event_router.subscribe(inc_callback(10));
thread::sleep(Duration::from_millis(WAIT_TIME));
assert_eq!(0, counter.load(Ordering::SeqCst));
watch_event_router.broadcast();
watch_event_router.broadcast();
thread::sleep(Duration::from_millis(WAIT_TIME));
assert_eq!(22, counter.load(Ordering::SeqCst));
mem::drop(handle_a);
watch_event_router.broadcast();
thread::sleep(Duration::from_millis(WAIT_TIME));
assert_eq!(32, counter.load(Ordering::SeqCst));
mem::drop(handle_a2);
watch_event_router.broadcast();
watch_event_router.broadcast();
thread::sleep(Duration::from_millis(WAIT_TIME));
assert_eq!(32, counter.load(Ordering::SeqCst));
}
#[test]
fn test_watch_event_router_multiple_callback_different_key() {
let watch_event_router = WatchCallbackList::default();
let counter: Arc<AtomicUsize> = Default::default();
let counter_clone = counter.clone();
let inc_callback = Box::new(move || {
counter_clone.fetch_add(1, Ordering::SeqCst);
});
let handle_a = watch_event_router.subscribe(inc_callback);
assert_eq!(0, counter.load(Ordering::SeqCst));
watch_event_router.broadcast();
watch_event_router.broadcast();
thread::sleep(Duration::from_millis(WAIT_TIME));
assert_eq!(2, counter.load(Ordering::SeqCst));
thread::sleep(Duration::from_millis(WAIT_TIME));
mem::drop(handle_a);
watch_event_router.broadcast();
thread::sleep(Duration::from_millis(WAIT_TIME));
assert_eq!(2, counter.load(Ordering::SeqCst));
}
}

View File

@@ -1,5 +1,4 @@
use common::BitSet;
use fastfield::DeleteBitSet;
use std::borrow::Borrow;
use std::borrow::BorrowMut;
use std::cmp::Ordering;
@@ -96,23 +95,9 @@ pub trait DocSet {
}
/// Returns the number documents matching.
/// Calling this method consumes the `DocSet`.
fn count(&mut self, delete_bitset: &DeleteBitSet) -> u32 {
let mut count = 0u32;
while self.advance() {
if !delete_bitset.is_deleted(self.doc()) {
count += 1u32;
}
}
count
}
/// Returns the count of documents, deleted or not.
/// Calling this method consumes the `DocSet`.
///
/// Of course, the result is an upper bound of the result
/// given by `count()`.
fn count_including_deleted(&mut self) -> u32 {
/// Calling this method consumes the `DocSet`.
fn count(&mut self) -> u32 {
let mut count = 0u32;
while self.advance() {
count += 1u32;
@@ -142,14 +127,9 @@ impl<TDocSet: DocSet + ?Sized> DocSet for Box<TDocSet> {
unboxed.size_hint()
}
fn count(&mut self, delete_bitset: &DeleteBitSet) -> u32 {
fn count(&mut self) -> u32 {
let unboxed: &mut TDocSet = self.borrow_mut();
unboxed.count(delete_bitset)
}
fn count_including_deleted(&mut self) -> u32 {
let unboxed: &mut TDocSet = self.borrow_mut();
unboxed.count_including_deleted()
unboxed.count()
}
fn append_to_bitset(&mut self, bitset: &mut BitSet) {

View File

@@ -2,9 +2,9 @@
use std::io;
use directory::error::LockError;
use directory::error::{IOError, OpenDirectoryError, OpenReadError, OpenWriteError};
use fastfield::FastFieldNotAvailableError;
use indexer::LockType;
use query;
use schema;
use serde_json;
@@ -57,8 +57,11 @@ pub enum TantivyError {
#[fail(display = "Index already exists")]
IndexAlreadyExists,
/// Failed to acquire file lock
#[fail(display = "Failed to acquire Lockfile: {:?}. {:?}", _0, _1)]
LockFailure(LockError, Option<String>),
#[fail(
display = "Failed to acquire Lockfile: {:?}. Possible causes: another IndexWriter instance or panic during previous lock drop.",
_0
)]
LockFailure(LockType),
/// IO Error.
#[fail(display = "An IO error occurred: '{}'", _0)]
IOError(#[cause] IOError),
@@ -97,12 +100,6 @@ impl From<FastFieldNotAvailableError> for TantivyError {
}
}
impl From<LockError> for TantivyError {
fn from(lock_error: LockError) -> TantivyError {
TantivyError::LockFailure(lock_error, None)
}
}
impl From<IOError> for TantivyError {
fn from(io_error: IOError) -> TantivyError {
TantivyError::IOError(io_error)
@@ -162,7 +159,6 @@ impl From<OpenDirectoryError> for TantivyError {
OpenDirectoryError::NotADirectory(directory_path) => {
TantivyError::InvalidArgument(format!("{:?} is not a directory", directory_path))
}
OpenDirectoryError::IoError(err) => TantivyError::IOError(IOError::from(err)),
}
}
}

View File

@@ -22,15 +22,17 @@ mod tests {
index_writer.add_document(doc!(field=>vec![1u8, 3, 5, 7, 9]));
index_writer.add_document(doc!(field=>vec![0u8; 1000]));
assert!(index_writer.commit().is_ok());
let searcher = index.reader().unwrap().searcher();
let segment_reader = searcher.segment_reader(0);
let bytes_reader = segment_reader.fast_fields().bytes(field).unwrap();
assert_eq!(bytes_reader.get_bytes(0), &[0u8, 1, 2, 3]);
assert!(bytes_reader.get_bytes(1).is_empty());
assert_eq!(bytes_reader.get_bytes(2), &[255u8]);
assert_eq!(bytes_reader.get_bytes(3), &[1u8, 3, 5, 7, 9]);
index.load_searchers().unwrap();
let searcher = index.searcher();
let reader = searcher.segment_reader(0);
let bytes_reader = reader.bytes_fast_field_reader(field).unwrap();
assert_eq!(bytes_reader.get_val(0), &[0u8, 1, 2, 3]);
assert!(bytes_reader.get_val(1).is_empty());
assert_eq!(bytes_reader.get_val(2), &[255u8]);
assert_eq!(bytes_reader.get_val(3), &[1u8, 3, 5, 7, 9]);
let long = vec![0u8; 1000];
assert_eq!(bytes_reader.get_bytes(4), long.as_slice());
assert_eq!(bytes_reader.get_val(4), long.as_slice());
}
}

View File

@@ -14,7 +14,6 @@ use DocId;
///
/// Reading the value for a document is done by reading the start index for it,
/// and the start index for the next document, and keeping the bytes in between.
#[derive(Clone)]
pub struct BytesFastFieldReader {
idx_reader: FastFieldReader<u64>,
values: OwningRef<ReadOnlySource, [u8]>,
@@ -29,20 +28,10 @@ impl BytesFastFieldReader {
BytesFastFieldReader { idx_reader, values }
}
fn range(&self, doc: DocId) -> (usize, usize) {
/// Returns the bytes associated to the given `doc`
pub fn get_val(&self, doc: DocId) -> &[u8] {
let start = self.idx_reader.get(doc) as usize;
let stop = self.idx_reader.get(doc + 1) as usize;
(start, stop)
}
/// Returns the bytes associated to the given `doc`
pub fn get_bytes(&self, doc: DocId) -> &[u8] {
let (start, stop) = self.range(doc);
&self.values[start..stop]
}
/// Returns the overall number of bytes in this bytes fast field.
pub fn total_num_bytes(&self) -> usize {
self.values.len()
}
}

View File

@@ -53,18 +53,16 @@ impl DeleteBitSet {
}
}
/// Returns true iff the document is still "alive". In other words, if it has not been deleted.
pub fn is_alive(&self, doc: DocId) -> bool {
!self.is_deleted(doc)
}
/// Returns true iff the document has been marked as deleted.
#[inline(always)]
/// Returns whether the document has been marked as deleted.
pub fn is_deleted(&self, doc: DocId) -> bool {
let byte_offset = doc / 8u32;
let b: u8 = (*self.data)[byte_offset as usize];
let shift = (doc & 7u32) as u8;
b & (1u8 << shift) != 0
if self.len == 0 {
false
} else {
let byte_offset = doc / 8u32;
let b: u8 = (*self.data)[byte_offset as usize];
let shift = (doc & 7u32) as u8;
b & (1u8 << shift) != 0
}
}
/// Summarize total space usage of this bitset.

View File

@@ -30,7 +30,6 @@ pub use self::error::{FastFieldNotAvailableError, Result};
pub use self::facet_reader::FacetReader;
pub use self::multivalued::{MultiValueIntFastFieldReader, MultiValueIntFastFieldWriter};
pub use self::reader::FastFieldReader;
pub use self::readers::FastFieldReaders;
pub use self::serializer::FastFieldSerializer;
pub use self::writer::{FastFieldsWriter, IntFastFieldWriter};
use common;
@@ -44,7 +43,6 @@ mod error;
mod facet_reader;
mod multivalued;
mod reader;
mod readers;
mod serializer;
mod writer;
@@ -80,6 +78,10 @@ impl FastValue for u64 {
*self
}
fn as_u64(&self) -> u64 {
*self
}
fn fast_field_cardinality(field_type: &FieldType) -> Option<Cardinality> {
match *field_type {
FieldType::U64(ref integer_options) => integer_options.get_fastfield_cardinality(),
@@ -87,10 +89,6 @@ impl FastValue for u64 {
_ => None,
}
}
fn as_u64(&self) -> u64 {
*self
}
}
impl FastValue for i64 {

View File

@@ -7,13 +7,7 @@ pub use self::writer::MultiValueIntFastFieldWriter;
#[cfg(test)]
mod tests {
extern crate time;
use self::time::Duration;
use collector::TopDocs;
use query::QueryParser;
use schema::Cardinality;
use schema::Facet;
use schema::IntOptions;
use schema::Schema;
use Index;
@@ -34,10 +28,11 @@ mod tests {
index_writer.add_document(doc!(field=>5u64, field=>20u64,field=>1u64));
assert!(index_writer.commit().is_ok());
let searcher = index.reader().unwrap().searcher();
let segment_reader = searcher.segment_reader(0);
index.load_searchers().unwrap();
let searcher = index.searcher();
let reader = searcher.segment_reader(0);
let mut vals = Vec::new();
let multi_value_reader = segment_reader.fast_fields().u64s(field).unwrap();
let multi_value_reader = reader.multi_fast_field_reader::<u64>(field).unwrap();
{
multi_value_reader.get_vals(2, &mut vals);
assert_eq!(&vals, &[4u64]);
@@ -52,133 +47,6 @@ mod tests {
}
}
#[test]
fn test_multivalued_date() {
let mut schema_builder = Schema::builder();
let date_field = schema_builder.add_date_field(
"multi_date_field",
IntOptions::default()
.set_fast(Cardinality::MultiValues)
.set_indexed()
.set_stored(),
);
let time_i =
schema_builder.add_i64_field("time_stamp_i", IntOptions::default().set_stored());
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
let first_time_stamp = chrono::Utc::now();
index_writer.add_document(
doc!(date_field=>first_time_stamp, date_field=>first_time_stamp, time_i=>1i64),
);
index_writer.add_document(doc!(time_i=>0i64));
// add one second
index_writer
.add_document(doc!(date_field=>first_time_stamp + Duration::seconds(1), time_i=>2i64));
// add another second
let two_secs_ahead = first_time_stamp + Duration::seconds(2);
index_writer.add_document(doc!(date_field=>two_secs_ahead, date_field=>two_secs_ahead,date_field=>two_secs_ahead, time_i=>3i64));
assert!(index_writer.commit().is_ok());
let reader = index.reader().unwrap();
let searcher = reader.searcher();
let reader = searcher.segment_reader(0);
assert_eq!(reader.num_docs(), 4);
{
let parser = QueryParser::for_index(&index, vec![date_field]);
let query = parser
.parse_query(&format!("\"{}\"", first_time_stamp.to_rfc3339()).to_string())
.expect("could not parse query");
let results = searcher
.search(&query, &TopDocs::with_limit(5))
.expect("could not query index");
assert_eq!(results.len(), 1);
for (_score, doc_address) in results {
let retrieved_doc = searcher.doc(doc_address).expect("cannot fetch doc");
assert_eq!(
retrieved_doc
.get_first(date_field)
.expect("cannot find value")
.date_value()
.timestamp(),
first_time_stamp.timestamp()
);
assert_eq!(
retrieved_doc
.get_first(time_i)
.expect("cannot find value")
.i64_value(),
1i64
);
}
}
{
let parser = QueryParser::for_index(&index, vec![date_field]);
let query = parser
.parse_query(&format!("\"{}\"", two_secs_ahead.to_rfc3339()).to_string())
.expect("could not parse query");
let results = searcher
.search(&query, &TopDocs::with_limit(5))
.expect("could not query index");
assert_eq!(results.len(), 1);
for (_score, doc_address) in results {
let retrieved_doc = searcher.doc(doc_address).expect("cannot fetch doc");
assert_eq!(
retrieved_doc
.get_first(date_field)
.expect("cannot find value")
.date_value()
.timestamp(),
two_secs_ahead.timestamp()
);
assert_eq!(
retrieved_doc
.get_first(time_i)
.expect("cannot find value")
.i64_value(),
3i64
);
}
}
// TODO: support Date range queries
// {
// let parser = QueryParser::for_index(&index, vec![date_field]);
// let range_q = format!("\"{}\"..\"{}\"",
// (first_time_stamp + Duration::seconds(1)).to_rfc3339(),
// (first_time_stamp + Duration::seconds(3)).to_rfc3339()
// );
// let query = parser.parse_query(&range_q)
// .expect("could not parse query");
// let results = searcher.search(&query, &TopDocs::with_limit(5))
// .expect("could not query index");
//
//
// assert_eq!(results.len(), 2);
// for (i, doc_pair) in results.iter().enumerate() {
// let retrieved_doc = searcher.doc(doc_pair.1).expect("cannot fetch doc");
// let offset_sec = match i {
// 0 => 1,
// 1 => 3,
// _ => panic!("should not have more than 2 docs")
// };
// let time_i_val = match i {
// 0 => 2,
// 1 => 3,
// _ => panic!("should not have more than 2 docs")
// };
// assert_eq!(retrieved_doc.get_first(date_field).expect("cannot find value").date_value().timestamp(),
// (first_time_stamp + Duration::seconds(offset_sec)).timestamp());
// assert_eq!(retrieved_doc.get_first(time_i).expect("cannot find value").i64_value(), time_i_val);
// }
// }
}
#[test]
fn test_multivalued_i64() {
let mut schema_builder = Schema::builder();
@@ -195,10 +63,11 @@ mod tests {
index_writer.add_document(doc!(field=> -5i64, field => -20i64, field=>1i64));
assert!(index_writer.commit().is_ok());
let searcher = index.reader().unwrap().searcher();
let segment_reader = searcher.segment_reader(0);
index.load_searchers().unwrap();
let searcher = index.searcher();
let reader = searcher.segment_reader(0);
let mut vals = Vec::new();
let multi_value_reader = segment_reader.fast_fields().i64s(field).unwrap();
let multi_value_reader = reader.multi_fast_field_reader::<i64>(field).unwrap();
{
multi_value_reader.get_vals(2, &mut vals);
assert_eq!(&vals, &[-4i64]);
@@ -216,17 +85,4 @@ mod tests {
assert_eq!(&vals, &[-5i64, -20i64, 1i64]);
}
}
#[test]
#[ignore]
fn test_many_facets() {
let mut schema_builder = Schema::builder();
let field = schema_builder.add_facet_field("facetfield");
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
for i in 0..100_000 {
index_writer.add_document(doc!(field=> Facet::from(format!("/lang/{}", i).as_str())));
}
assert!(index_writer.commit().is_ok());
}
}

View File

@@ -26,13 +26,6 @@ impl<Item: FastValue> MultiValueIntFastFieldReader<Item> {
}
}
pub(crate) fn into_u64s_reader(self) -> MultiValueIntFastFieldReader<u64> {
MultiValueIntFastFieldReader {
idx_reader: self.idx_reader,
vals_reader: self.vals_reader.into_u64_reader(),
}
}
/// Returns `(start, stop)`, such that the values associated
/// to the given document are `start..stop`.
fn range(&self, doc: DocId) -> (u64, u64) {
@@ -46,18 +39,7 @@ impl<Item: FastValue> MultiValueIntFastFieldReader<Item> {
let (start, stop) = self.range(doc);
let len = (stop - start) as usize;
vals.resize(len, Item::default());
self.vals_reader.get_range_u64(start, &mut vals[..]);
}
/// Returns the number of values associated with the document `DocId`.
pub fn num_vals(&self, doc: DocId) -> usize {
let (start, stop) = self.range(doc);
(stop - start) as usize
}
/// Returns the overall number of values in this field .
pub fn total_num_vals(&self) -> u64 {
self.idx_reader.max_value()
self.vals_reader.get_range(start as u32, &mut vals[..]);
}
}
@@ -65,7 +47,7 @@ impl<Item: FastValue> MultiValueIntFastFieldReader<Item> {
mod tests {
use core::Index;
use schema::{Facet, Schema};
use schema::{Document, Facet, Schema};
#[test]
fn test_multifastfield_reader() {
@@ -76,14 +58,25 @@ mod tests {
let mut index_writer = index
.writer_with_num_threads(1, 30_000_000)
.expect("Failed to create index writer.");
index_writer.add_document(doc!(
facet_field => Facet::from("/category/cat2"),
facet_field => Facet::from("/category/cat1"),
));
index_writer.add_document(doc!(facet_field => Facet::from("/category/cat2")));
index_writer.add_document(doc!(facet_field => Facet::from("/category/cat3")));
{
let mut doc = Document::new();
doc.add_facet(facet_field, "/category/cat2");
doc.add_facet(facet_field, "/category/cat1");
index_writer.add_document(doc);
}
{
let mut doc = Document::new();
doc.add_facet(facet_field, "/category/cat2");
index_writer.add_document(doc);
}
{
let mut doc = Document::new();
doc.add_facet(facet_field, "/category/cat3");
index_writer.add_document(doc);
}
index_writer.commit().expect("Commit failed");
let searcher = index.reader().unwrap().searcher();
index.load_searchers().expect("Reloading searchers");
let searcher = index.searcher();
let segment_reader = searcher.segment_reader(0);
let mut facet_reader = segment_reader.facet_reader(facet_field).unwrap();

View File

@@ -32,7 +32,7 @@ use DocId;
/// term ids when the segment is getting serialized.
pub struct MultiValueIntFastFieldWriter {
field: Field,
vals: Vec<UnorderedTermId>,
vals: Vec<u64>,
doc_index: Vec<u64>,
is_facet: bool,
}

View File

@@ -50,15 +50,6 @@ impl<Item: FastValue> FastFieldReader<Item> {
}
}
pub(crate) fn into_u64_reader(self) -> FastFieldReader<u64> {
FastFieldReader {
bit_unpacker: self.bit_unpacker,
min_value_u64: self.min_value_u64,
max_value_u64: self.max_value_u64,
_phantom: PhantomData,
}
}
/// Return the value associated to the given document.
///
/// This accessor should return as fast as possible.
@@ -68,29 +59,7 @@ impl<Item: FastValue> FastFieldReader<Item> {
/// May panic if `doc` is greater than the segment
// `maxdoc`.
pub fn get(&self, doc: DocId) -> Item {
self.get_u64(u64::from(doc))
}
pub(crate) fn get_u64(&self, doc: u64) -> Item {
Item::from_u64(self.min_value_u64 + self.bit_unpacker.get(doc))
}
/// Internally `multivalued` also use SingleValue Fast fields.
/// It works as follows... A first column contains the list of start index
/// for each document, a second column contains the actual values.
///
/// The values associated to a given doc, are then
/// `second_column[first_column.get(doc)..first_column.get(doc+1)]`.
///
/// Which means single value fast field reader can be indexed internally with
/// something different from a `DocId`. For this use case, we want to use `u64`
/// values.
///
/// See `get_range` for an actual documentation about this method.
pub(crate) fn get_range_u64(&self, start: u64, output: &mut [Item]) {
for (i, out) in output.iter_mut().enumerate() {
*out = self.get_u64(start + (i as u64));
}
Item::from_u64(self.min_value_u64 + self.bit_unpacker.get(doc as usize))
}
/// Fills an output buffer with the fast field values
@@ -106,8 +75,16 @@ impl<Item: FastValue> FastFieldReader<Item> {
///
/// May panic if `start + output.len()` is greater than
/// the segment's `maxdoc`.
pub fn get_range(&self, start: DocId, output: &mut [Item]) {
self.get_range_u64(u64::from(start), output);
///
// TODO change start to `u64`.
// For multifastfield, start is an index in a second fastfield, not a `DocId`
pub fn get_range(&self, start: u32, output: &mut [Item]) {
// ok: Item is either `u64` or `i64`
let output_u64: &mut [u64] = unsafe { &mut *(output as *mut [Item] as *mut [u64]) };
self.bit_unpacker.get_range(start, output_u64);
for out in output_u64.iter_mut() {
*out = Item::from_u64(*out + self.min_value_u64).as_u64();
}
}
/// Returns the minimum value for this fast field.

View File

@@ -1,191 +0,0 @@
use common::CompositeFile;
use fastfield::BytesFastFieldReader;
use fastfield::MultiValueIntFastFieldReader;
use fastfield::{FastFieldNotAvailableError, FastFieldReader};
use schema::{Cardinality, Field, FieldType, Schema};
use space_usage::PerFieldSpaceUsage;
use std::collections::HashMap;
use Result;
/// Provides access to all of the FastFieldReader.
///
/// Internally, `FastFieldReaders` have preloaded fast field readers,
/// and just wraps several `HashMap`.
pub struct FastFieldReaders {
fast_field_i64: HashMap<Field, FastFieldReader<i64>>,
fast_field_u64: HashMap<Field, FastFieldReader<u64>>,
fast_field_i64s: HashMap<Field, MultiValueIntFastFieldReader<i64>>,
fast_field_u64s: HashMap<Field, MultiValueIntFastFieldReader<u64>>,
fast_bytes: HashMap<Field, BytesFastFieldReader>,
fast_fields_composite: CompositeFile,
}
enum FastType {
I64,
U64,
}
fn type_and_cardinality(field_type: &FieldType) -> Option<(FastType, Cardinality)> {
match field_type {
FieldType::U64(options) => options
.get_fastfield_cardinality()
.map(|cardinality| (FastType::U64, cardinality)),
FieldType::I64(options) => options
.get_fastfield_cardinality()
.map(|cardinality| (FastType::I64, cardinality)),
FieldType::HierarchicalFacet => Some((FastType::U64, Cardinality::MultiValues)),
_ => None,
}
}
impl FastFieldReaders {
pub(crate) fn load_all(
schema: &Schema,
fast_fields_composite: &CompositeFile,
) -> Result<FastFieldReaders> {
let mut fast_field_readers = FastFieldReaders {
fast_field_i64: Default::default(),
fast_field_u64: Default::default(),
fast_field_i64s: Default::default(),
fast_field_u64s: Default::default(),
fast_bytes: Default::default(),
fast_fields_composite: fast_fields_composite.clone(),
};
for (field_id, field_entry) in schema.fields().iter().enumerate() {
let field = Field(field_id as u32);
let field_type = field_entry.field_type();
if field_type == &FieldType::Bytes {
let idx_reader = fast_fields_composite
.open_read_with_idx(field, 0)
.ok_or_else(|| FastFieldNotAvailableError::new(field_entry))
.map(FastFieldReader::open)?;
let data = fast_fields_composite
.open_read_with_idx(field, 1)
.ok_or_else(|| FastFieldNotAvailableError::new(field_entry))?;
fast_field_readers
.fast_bytes
.insert(field, BytesFastFieldReader::open(idx_reader, data));
} else if let Some((fast_type, cardinality)) = type_and_cardinality(field_type) {
match cardinality {
Cardinality::SingleValue => {
if let Some(fast_field_data) = fast_fields_composite.open_read(field) {
match fast_type {
FastType::U64 => {
let fast_field_reader = FastFieldReader::open(fast_field_data);
fast_field_readers
.fast_field_u64
.insert(field, fast_field_reader);
}
FastType::I64 => {
fast_field_readers.fast_field_i64.insert(
field,
FastFieldReader::open(fast_field_data.clone()),
);
}
}
} else {
return Err(From::from(FastFieldNotAvailableError::new(field_entry)));
}
}
Cardinality::MultiValues => {
let idx_opt = fast_fields_composite.open_read_with_idx(field, 0);
let data_opt = fast_fields_composite.open_read_with_idx(field, 1);
if let (Some(fast_field_idx), Some(fast_field_data)) = (idx_opt, data_opt) {
let idx_reader = FastFieldReader::open(fast_field_idx);
match fast_type {
FastType::I64 => {
let vals_reader = FastFieldReader::open(fast_field_data);
let multivalued_int_fast_field =
MultiValueIntFastFieldReader::open(idx_reader, vals_reader);
fast_field_readers
.fast_field_i64s
.insert(field, multivalued_int_fast_field);
}
FastType::U64 => {
let vals_reader = FastFieldReader::open(fast_field_data);
let multivalued_int_fast_field =
MultiValueIntFastFieldReader::open(idx_reader, vals_reader);
fast_field_readers
.fast_field_u64s
.insert(field, multivalued_int_fast_field);
}
}
} else {
return Err(From::from(FastFieldNotAvailableError::new(field_entry)));
}
}
}
}
}
Ok(fast_field_readers)
}
pub(crate) fn space_usage(&self) -> PerFieldSpaceUsage {
self.fast_fields_composite.space_usage()
}
/// Returns the `u64` fast field reader reader associated to `field`.
///
/// If `field` is not a u64 fast field, this method returns `None`.
pub fn u64(&self, field: Field) -> Option<FastFieldReader<u64>> {
self.fast_field_u64.get(&field).cloned()
}
/// If the field is a u64-fast field return the associated reader.
/// If the field is a i64-fast field, return the associated u64 reader. Values are
/// mapped from i64 to u64 using a (well the, it is unique) monotonic mapping. ///
///
/// This method is useful when merging segment reader.
pub(crate) fn u64_lenient(&self, field: Field) -> Option<FastFieldReader<u64>> {
if let Some(u64_ff_reader) = self.u64(field) {
return Some(u64_ff_reader);
}
if let Some(i64_ff_reader) = self.i64(field) {
return Some(i64_ff_reader.into_u64_reader());
}
None
}
/// Returns the `i64` fast field reader reader associated to `field`.
///
/// If `field` is not a i64 fast field, this method returns `None`.
pub fn i64(&self, field: Field) -> Option<FastFieldReader<i64>> {
self.fast_field_i64.get(&field).cloned()
}
/// Returns a `u64s` multi-valued fast field reader reader associated to `field`.
///
/// If `field` is not a u64 multi-valued fast field, this method returns `None`.
pub fn u64s(&self, field: Field) -> Option<MultiValueIntFastFieldReader<u64>> {
self.fast_field_u64s.get(&field).cloned()
}
/// If the field is a u64s-fast field return the associated reader.
/// If the field is a i64s-fast field, return the associated u64s reader. Values are
/// mapped from i64 to u64 using a (well the, it is unique) monotonic mapping.
///
/// This method is useful when merging segment reader.
pub(crate) fn u64s_lenient(&self, field: Field) -> Option<MultiValueIntFastFieldReader<u64>> {
if let Some(u64s_ff_reader) = self.u64s(field) {
return Some(u64s_ff_reader);
}
if let Some(i64s_ff_reader) = self.i64s(field) {
return Some(i64s_ff_reader.into_u64s_reader());
}
None
}
/// Returns a `i64s` multi-valued fast field reader reader associated to `field`.
///
/// If `field` is not a i64 multi-valued fast field, this method returns `None`.
pub fn i64s(&self, field: Field) -> Option<MultiValueIntFastFieldReader<i64>> {
self.fast_field_i64s.get(&field).cloned()
}
/// Returns the `bytes` fast field reader associated to `field`.
///
/// If `field` is not a bytes fast field, returns `None`.
pub fn bytes(&self, field: Field) -> Option<BytesFastFieldReader> {
self.fast_bytes.get(&field).cloned()
}
}

View File

@@ -13,15 +13,15 @@ fn check_index_content(searcher: &Searcher, vals: &HashSet<u64>) {
#[test]
#[ignore]
#[cfg(feature = "mmap")]
fn test_indexing() {
let mut schema_builder = Schema::builder();
let id_field = schema_builder.add_u64_field("id", INDEXED);
let multiples_field = schema_builder.add_u64_field("multiples", INDEXED);
let id_field = schema_builder.add_u64_field("id", INT_INDEXED);
let multiples_field = schema_builder.add_u64_field("multiples", INT_INDEXED);
let schema = schema_builder.build();
let index = Index::create_from_tempdir(schema).unwrap();
let reader = index.reader().unwrap();
let mut rng = thread_rng();
@@ -36,8 +36,8 @@ fn test_indexing() {
index_writer.commit().expect("Commit failed");
committed_docs.extend(&uncommitted_docs);
uncommitted_docs.clear();
reader.reload().unwrap();
let searcher = reader.searcher();
index.load_searchers().unwrap();
let searcher = index.searcher();
// check that everything is correct.
check_index_content(&searcher, &committed_docs);
} else {

View File

@@ -2,7 +2,6 @@ use super::operation::DeleteOperation;
use std::mem;
use std::ops::DerefMut;
use std::sync::{Arc, RwLock};
use Opstamp;
// The DeleteQueue is similar in conceptually to a multiple
// consumer single producer broadcast channel.
@@ -185,7 +184,7 @@ impl DeleteCursor {
/// queue are consume and the next get will return None.
/// - the next get will return the first operation with an
/// `opstamp >= target_opstamp`.
pub fn skip_to(&mut self, target_opstamp: Opstamp) {
pub fn skip_to(&mut self, target_opstamp: u64) {
// TODO Can be optimize as we work with block.
while self.is_behind_opstamp(target_opstamp) {
self.advance();
@@ -193,7 +192,7 @@ impl DeleteCursor {
}
#[cfg_attr(feature = "cargo-clippy", allow(clippy::wrong_self_convention))]
fn is_behind_opstamp(&mut self, target_opstamp: Opstamp) -> bool {
fn is_behind_opstamp(&mut self, target_opstamp: u64) -> bool {
self.get()
.map(|operation| operation.opstamp < target_opstamp)
.unwrap_or(false)

View File

@@ -0,0 +1,131 @@
use directory::error::OpenWriteError;
use std::io::Write;
use std::path::{Path, PathBuf};
use std::thread;
use std::time::Duration;
use Directory;
use TantivyError;
#[derive(Debug, Clone, Copy)]
pub enum LockType {
/// Only one process should be able to write tantivy's index at a time.
/// This lock file, when present, is in charge of preventing other processes to open an IndexWriter.
///
/// If the process is killed and this file remains, it is safe to remove it manually.
///
/// Failing to acquire this lock usually means a misuse of tantivy's API,
/// (creating more than one instance of the `IndexWriter`), are a spurious
/// lock file remaining after a crash. In the latter case, removing the file after
/// checking no process running tantivy is running is safe.
IndexWriterLock,
/// The meta lock file is here to protect the segment files being opened by
/// `.load_searchers()` from being garbage collected.
/// It makes it possible for another process to safely consume
/// our index in-writing. Ideally, we may have prefered `RWLock` semantics
/// here, but it is difficult to achieve on Windows.
///
/// Opening segment readers is a very fast process.
/// Right now if the lock cannot be acquire on the first attempt, the logic
/// is very simplistic. We retry after `100ms` until we effectively
/// acquire the lock.
/// This lock should not have much contention in normal usage.
MetaLock,
}
/// Retry the logic of acquiring locks is pretty simple.
/// We just retry `n` times after a given `duratio`, both
/// depending on the type of lock.
struct RetryPolicy {
num_retries: usize,
wait_in_ms: u64,
}
impl RetryPolicy {
fn no_retry() -> RetryPolicy {
RetryPolicy {
num_retries: 0,
wait_in_ms: 0,
}
}
fn wait_and_retry(&mut self) -> bool {
if self.num_retries == 0 {
false
} else {
self.num_retries -= 1;
let wait_duration = Duration::from_millis(self.wait_in_ms);
thread::sleep(wait_duration);
true
}
}
}
impl LockType {
fn retry_policy(self) -> RetryPolicy {
match self {
LockType::IndexWriterLock => RetryPolicy::no_retry(),
LockType::MetaLock => RetryPolicy {
num_retries: 100,
wait_in_ms: 100,
},
}
}
fn try_acquire_lock(self, directory: &mut Directory) -> Result<DirectoryLock, TantivyError> {
let path = self.filename();
let mut write = directory.open_write(path).map_err(|e| match e {
OpenWriteError::FileAlreadyExists(_) => TantivyError::LockFailure(self),
OpenWriteError::IOError(io_error) => TantivyError::IOError(io_error),
})?;
write.flush()?;
Ok(DirectoryLock {
directory: directory.box_clone(),
path: path.to_owned(),
})
}
/// Acquire a lock in the given directory.
pub fn acquire_lock(self, directory: &Directory) -> Result<DirectoryLock, TantivyError> {
let mut box_directory = directory.box_clone();
let mut retry_policy = self.retry_policy();
loop {
let lock_result = self.try_acquire_lock(&mut *box_directory);
match lock_result {
Ok(result) => {
return Ok(result);
}
Err(TantivyError::LockFailure(ref filepath)) => {
if !retry_policy.wait_and_retry() {
return Err(TantivyError::LockFailure(filepath.to_owned()));
}
}
Err(_) => {}
}
}
}
fn filename(&self) -> &Path {
match *self {
LockType::MetaLock => Path::new(".tantivy-meta.lock"),
LockType::IndexWriterLock => Path::new(".tantivy-indexer.lock"),
}
}
}
/// The `DirectoryLock` is an object that represents a file lock.
/// See [`LockType`](struct.LockType.html)
///
/// It is transparently associated to a lock file, that gets deleted
/// on `Drop.` The lock is release automatically on `Drop`.
pub struct DirectoryLock {
directory: Box<Directory>,
path: PathBuf,
}
impl Drop for DirectoryLock {
fn drop(&mut self) {
if let Err(e) = self.directory.delete(&*self.path) {
error!("Failed to remove the lock file. {:?}", e);
}
}
}

View File

@@ -1,6 +1,5 @@
use std::sync::Arc;
use DocId;
use Opstamp;
// Doc to opstamp is used to identify which
// document should be deleted.
@@ -24,7 +23,7 @@ pub enum DocToOpstampMapping {
}
impl From<Vec<u64>> for DocToOpstampMapping {
fn from(opstamps: Vec<Opstamp>) -> DocToOpstampMapping {
fn from(opstamps: Vec<u64>) -> DocToOpstampMapping {
DocToOpstampMapping::WithMap(Arc::new(opstamps))
}
}
@@ -36,7 +35,7 @@ impl DocToOpstampMapping {
//
// The edge case opstamp = some doc opstamp is in practise
// never called.
pub fn compute_doc_limit(&self, target_opstamp: Opstamp) -> DocId {
pub fn compute_doc_limit(&self, target_opstamp: u64) -> DocId {
match *self {
DocToOpstampMapping::WithMap(ref doc_opstamps) => {
match doc_opstamps.binary_search(&target_opstamp) {

View File

@@ -1,4 +1,4 @@
use super::operation::{AddOperation, UserOperation};
use super::operation::AddOperation;
use super::segment_updater::SegmentUpdater;
use super::PreparedCommit;
use bit_set::BitSet;
@@ -9,15 +9,15 @@ use core::SegmentId;
use core::SegmentMeta;
use core::SegmentReader;
use crossbeam::channel;
use directory::DirectoryLock;
use docset::DocSet;
use error::TantivyError;
use fastfield::write_delete_bitset;
use futures::{Canceled, Future};
use futures::sync::oneshot::Receiver;
use indexer::delete_queue::{DeleteCursor, DeleteQueue};
use indexer::doc_opstamp_mapping::DocToOpstampMapping;
use indexer::operation::DeleteOperation;
use indexer::stamper::Stamper;
use indexer::DirectoryLock;
use indexer::MergePolicy;
use indexer::SegmentEntry;
use indexer::SegmentWriter;
@@ -26,11 +26,9 @@ use schema::Document;
use schema::IndexRecordOption;
use schema::Term;
use std::mem;
use std::ops::Range;
use std::sync::Arc;
use std::mem::swap;
use std::thread;
use std::thread::JoinHandle;
use Opstamp;
use Result;
// Size of the margin for the heap. A segment is closed when the remaining memory
@@ -45,8 +43,8 @@ pub const HEAP_SIZE_MAX: usize = u32::max_value() as usize - MARGIN_IN_BYTES;
// reaches `PIPELINE_MAX_SIZE_IN_DOCS`
const PIPELINE_MAX_SIZE_IN_DOCS: usize = 10_000;
type OperationSender = channel::Sender<Vec<AddOperation>>;
type OperationReceiver = channel::Receiver<Vec<AddOperation>>;
type DocumentSender = channel::Sender<AddOperation>;
type DocumentReceiver = channel::Receiver<AddOperation>;
/// Split the thread memory budget into
/// - the heap size
@@ -54,19 +52,17 @@ type OperationReceiver = channel::Receiver<Vec<AddOperation>>;
///
/// Returns (the heap size in bytes, the hash table size in number of bits)
fn initial_table_size(per_thread_memory_budget: usize) -> usize {
assert!(per_thread_memory_budget > 1_000);
let table_size_limit: usize = per_thread_memory_budget / 3;
if let Some(limit) = (1..)
(1..)
.take_while(|num_bits: &usize| compute_table_size(*num_bits) < table_size_limit)
.last()
{
limit.min(19) // we cap it at 2^19 = 512K.
} else {
unreachable!(
"Per thread memory is too small: {}",
per_thread_memory_budget
);
}
.unwrap_or_else(|| {
panic!(
"Per thread memory is too small: {}",
per_thread_memory_budget
)
})
.min(19) // we cap it at 512K
}
/// `IndexWriter` is the user entry-point to add document to an index.
@@ -86,8 +82,8 @@ pub struct IndexWriter {
workers_join_handle: Vec<JoinHandle<Result<()>>>,
operation_receiver: OperationReceiver,
operation_sender: OperationSender,
document_receiver: DocumentReceiver,
document_sender: DocumentSender,
segment_updater: SegmentUpdater,
@@ -100,7 +96,7 @@ pub struct IndexWriter {
delete_queue: DeleteQueue,
stamper: Stamper,
committed_opstamp: Opstamp,
committed_opstamp: u64,
}
/// Open a new index writer. Attempts to acquire a lockfile.
@@ -134,7 +130,7 @@ pub fn open_index_writer(
let err_msg = format!("The heap size per thread cannot exceed {}", HEAP_SIZE_MAX);
return Err(TantivyError::InvalidArgument(err_msg));
}
let (document_sender, document_receiver): (OperationSender, OperationReceiver) =
let (document_sender, document_receiver): (DocumentSender, DocumentReceiver) =
channel::bounded(PIPELINE_MAX_SIZE_IN_DOCS);
let delete_queue = DeleteQueue::new();
@@ -152,8 +148,8 @@ pub fn open_index_writer(
heap_size_in_bytes_per_thread,
index: index.clone(),
operation_receiver: document_receiver,
operation_sender: document_sender,
document_receiver,
document_sender,
segment_updater,
@@ -178,7 +174,7 @@ pub fn compute_deleted_bitset(
segment_reader: &SegmentReader,
delete_cursor: &mut DeleteCursor,
doc_opstamps: &DocToOpstampMapping,
target_opstamp: Opstamp,
target_opstamp: u64,
) -> Result<bool> {
let mut might_have_changed = false;
@@ -220,7 +216,7 @@ pub fn compute_deleted_bitset(
pub fn advance_deletes(
mut segment: Segment,
segment_entry: &mut SegmentEntry,
target_opstamp: Opstamp,
target_opstamp: u64,
) -> Result<()> {
{
if segment_entry.meta().delete_opstamp() == Some(target_opstamp) {
@@ -260,7 +256,7 @@ pub fn advance_deletes(
write_delete_bitset(&delete_bitset, &mut delete_file)?;
}
}
segment_entry.set_meta(segment.meta().clone());
segment_entry.set_meta((*segment.meta()).clone());
Ok(())
}
@@ -268,7 +264,7 @@ fn index_documents(
memory_budget: usize,
segment: &Segment,
generation: usize,
document_iterator: &mut Iterator<Item = Vec<AddOperation>>,
document_iterator: &mut Iterator<Item = AddOperation>,
segment_updater: &mut SegmentUpdater,
mut delete_cursor: DeleteCursor,
) -> Result<bool> {
@@ -276,11 +272,11 @@ fn index_documents(
let segment_id = segment.id();
let table_size = initial_table_size(memory_budget);
let mut segment_writer = SegmentWriter::for_segment(table_size, segment.clone(), &schema)?;
for documents in document_iterator {
for doc in documents {
segment_writer.add_document(doc, &schema)?;
}
for doc in document_iterator {
segment_writer.add_document(doc, &schema)?;
let mem_usage = segment_writer.mem_usage();
if mem_usage >= memory_budget - MARGIN_IN_BYTES {
info!(
"Buffer limit reached, flushing segment with maxdoc={}.",
@@ -300,13 +296,13 @@ fn index_documents(
// the worker thread.
assert!(num_docs > 0);
let doc_opstamps: Vec<Opstamp> = segment_writer.finalize()?;
let doc_opstamps: Vec<u64> = segment_writer.finalize()?;
let segment_meta = SegmentMeta::new(segment_id, num_docs);
let last_docstamp: Opstamp = *(doc_opstamps.last().unwrap());
let last_docstamp: u64 = *(doc_opstamps.last().unwrap());
let delete_bitset_opt = if delete_cursor.get().is_some() {
let segment_entry: SegmentEntry = if delete_cursor.get().is_some() {
let doc_to_opstamps = DocToOpstampMapping::from(doc_opstamps);
let segment_reader = SegmentReader::open(segment)?;
let mut deleted_bitset = BitSet::with_capacity(num_docs as usize);
@@ -317,17 +313,18 @@ fn index_documents(
&doc_to_opstamps,
last_docstamp,
)?;
if may_have_deletes {
Some(deleted_bitset)
} else {
None
}
SegmentEntry::new(segment_meta, delete_cursor, {
if may_have_deletes {
Some(deleted_bitset)
} else {
None
}
})
} else {
// if there are no delete operation in the queue, no need
// to even open the segment.
None
SegmentEntry::new(segment_meta, delete_cursor, None)
};
let segment_entry = SegmentEntry::new(segment_meta, delete_cursor, delete_bitset_opt);
Ok(segment_updater.add_segment(generation, segment_entry))
}
@@ -336,7 +333,7 @@ impl IndexWriter {
pub fn wait_merging_threads(mut self) -> Result<()> {
// this will stop the indexing thread,
// dropping the last reference to the segment_updater.
drop(self.operation_sender);
drop(self.document_sender);
let former_workers_handles = mem::replace(&mut self.workers_join_handle, vec![]);
for join_handle in former_workers_handles {
@@ -369,23 +366,20 @@ impl IndexWriter {
.add_segment(self.generation, segment_entry);
}
/// Creates a new segment.
/// *Experimental & Advanced API* Creates a new segment.
/// and marks it as currently in write.
///
/// This method is useful only for users trying to do complex
/// operations, like converting an index format to another.
///
/// It is safe to start writing file associated to the new `Segment`.
/// These will not be garbage collected as long as an instance object of
/// `SegmentMeta` object associated to the new `Segment` is "alive".
pub fn new_segment(&self) -> Segment {
self.index.new_segment()
self.segment_updater.new_segment()
}
/// Spawns a new worker thread for indexing.
/// The thread consumes documents from the pipeline.
///
fn add_indexing_worker(&mut self) -> Result<()> {
let document_receiver_clone = self.operation_receiver.clone();
let document_receiver_clone = self.document_receiver.clone();
let mut segment_updater = self.segment_updater.clone();
let generation = self.generation;
@@ -393,7 +387,6 @@ impl IndexWriter {
let mut delete_cursor = self.delete_queue.cursor();
let mem_budget = self.heap_size_in_bytes_per_thread;
let index = self.index.clone();
let join_handle: JoinHandle<Result<()>> = thread::Builder::new()
.name(format!(
"thrd-tantivy-index{}-gen{}",
@@ -411,19 +404,15 @@ impl IndexWriter {
// this is a valid guarantee as the
// peeked document now belongs to
// our local iterator.
if let Some(operations) = document_iterator.peek() {
if let Some(first) = operations.first() {
delete_cursor.skip_to(first.opstamp);
} else {
return Ok(());
}
if let Some(operation) = document_iterator.peek() {
delete_cursor.skip_to(operation.opstamp);
} else {
// No more documents.
// Happens when there is a commit, or if the `IndexWriter`
// was dropped.
return Ok(());
}
let segment = index.new_segment();
let segment = segment_updater.new_segment();
index_documents(
mem_budget,
&segment,
@@ -440,7 +429,7 @@ impl IndexWriter {
}
/// Accessor to the merge policy.
pub fn get_merge_policy(&self) -> Arc<Box<MergePolicy>> {
pub fn get_merge_policy(&self) -> Box<MergePolicy> {
self.segment_updater.get_merge_policy()
}
@@ -465,10 +454,7 @@ impl IndexWriter {
/// Merges a given list of segments
///
/// `segment_ids` is required to be non-empty.
pub fn merge(
&mut self,
segment_ids: &[SegmentId],
) -> Result<impl Future<Item = SegmentMeta, Error = Canceled>> {
pub fn merge(&mut self, segment_ids: &[SegmentId]) -> Result<Receiver<SegmentMeta>> {
self.segment_updater.start_merge(segment_ids)
}
@@ -480,11 +466,12 @@ impl IndexWriter {
/// when no documents are remaining.
///
/// Returns the former segment_ready channel.
fn recreate_document_channel(&mut self) -> OperationReceiver {
let (document_sender, document_receiver): (OperationSender, OperationReceiver) =
fn recreate_document_channel(&mut self) -> DocumentReceiver {
let (mut document_sender, mut document_receiver): (DocumentSender, DocumentReceiver) =
channel::bounded(PIPELINE_MAX_SIZE_IN_DOCS);
mem::replace(&mut self.operation_sender, document_sender);
mem::replace(&mut self.operation_receiver, document_receiver)
swap(&mut self.document_sender, &mut document_sender);
swap(&mut self.document_receiver, &mut document_receiver);
document_receiver
}
/// Rollback to the last commit
@@ -495,14 +482,14 @@ impl IndexWriter {
/// state as it was after the last commit.
///
/// The opstamp at the last commit is returned.
pub fn rollback(&mut self) -> Result<Opstamp> {
pub fn rollback(&mut self) -> Result<()> {
info!("Rolling back to opstamp {}", self.committed_opstamp);
// marks the segment updater as killed. From now on, all
// segment updates will be ignored.
self.segment_updater.kill();
let document_receiver = self.operation_receiver.clone();
let document_receiver = self.document_receiver.clone();
// take the directory lock to create a new index_writer.
let directory_lock = self
@@ -530,7 +517,7 @@ impl IndexWriter {
// was dropped with the index_writer.
for _ in document_receiver.clone() {}
Ok(self.committed_opstamp)
Ok(())
}
/// Prepares a commit.
@@ -568,15 +555,17 @@ impl IndexWriter {
info!("Preparing commit");
// this will drop the current document channel
// and recreate a new one.
// and recreate a new one channels.
self.recreate_document_channel();
let former_workers_join_handle = mem::replace(&mut self.workers_join_handle, Vec::new());
let former_workers_join_handle =
mem::replace(&mut self.workers_join_handle, Vec::new());
for worker_handle in former_workers_join_handle {
let indexing_worker_result = worker_handle
.join()
.map_err(|e| TantivyError::ErrorInThread(format!("{:?}", e)))?;
indexing_worker_result?;
// add a new worker for the next generation.
self.add_indexing_worker()?;
@@ -602,7 +591,7 @@ impl IndexWriter {
/// Commit returns the `opstamp` of the last document
/// that made it in the commit.
///
pub fn commit(&mut self) -> Result<Opstamp> {
pub fn commit(&mut self) -> Result<u64> {
self.prepare_commit()?.commit()
}
@@ -618,7 +607,7 @@ impl IndexWriter {
///
/// Like adds, the deletion itself will be visible
/// only after calling `commit()`.
pub fn delete_term(&self, term: Term) -> Opstamp {
pub fn delete_term(&mut self, term: Term) -> u64 {
let opstamp = self.stamper.stamp();
let delete_operation = DeleteOperation { opstamp, term };
self.delete_queue.push(delete_operation);
@@ -632,7 +621,7 @@ impl IndexWriter {
///
/// This is also the opstamp of the commit that is currently
/// available for searchers.
pub fn commit_opstamp(&self) -> Opstamp {
pub fn commit_opstamp(&self) -> u64 {
self.committed_opstamp
}
@@ -646,179 +635,35 @@ impl IndexWriter {
///
/// Currently it represents the number of documents that
/// have been added since the creation of the index.
pub fn add_document(&self, document: Document) -> Opstamp {
pub fn add_document(&mut self, document: Document) -> u64 {
let opstamp = self.stamper.stamp();
let add_operation = AddOperation { opstamp, document };
let send_result = self.operation_sender.send(vec![add_operation]);
let send_result = self.document_sender.send(add_operation);
if let Err(e) = send_result {
panic!("Failed to index document. Sending to indexing channel failed. This probably means all of the indexing threads have panicked. {:?}", e);
}
opstamp
}
/// Gets a range of stamps from the stamper and "pops" the last stamp
/// from the range returning a tuple of the last optstamp and the popped
/// range.
///
/// The total number of stamps generated by this method is `count + 1`;
/// each operation gets a stamp from the `stamps` iterator and `last_opstamp`
/// is for the batch itself.
fn get_batch_opstamps(&self, count: Opstamp) -> (Opstamp, Range<Opstamp>) {
let Range { start, end } = self.stamper.stamps(count + 1u64);
let last_opstamp = end - 1;
let stamps = Range {
start,
end: last_opstamp,
};
(last_opstamp, stamps)
}
/// Runs a group of document operations ensuring that the operations are
/// assigned contigous u64 opstamps and that add operations of the same
/// group are flushed into the same segment.
///
/// If the indexing pipeline is full, this call may block.
///
/// Each operation of the given `user_operations` will receive an in-order,
/// contiguous u64 opstamp. The entire batch itself is also given an
/// opstamp that is 1 greater than the last given operation. This
/// `batch_opstamp` is the return value of `run`. An empty group of
/// `user_operations`, an empty `Vec<UserOperation>`, still receives
/// a valid opstamp even though no changes were _actually_ made to the index.
///
/// Like adds and deletes (see `IndexWriter.add_document` and
/// `IndexWriter.delete_term`), the changes made by calling `run` will be
/// visible to readers only after calling `commit()`.
pub fn run(&self, user_operations: Vec<UserOperation>) -> Opstamp {
let count = user_operations.len() as u64;
if count == 0 {
return self.stamper.stamp();
}
let (batch_opstamp, stamps) = self.get_batch_opstamps(count);
let mut adds: Vec<AddOperation> = Vec::new();
for (user_op, opstamp) in user_operations.into_iter().zip(stamps) {
match user_op {
UserOperation::Delete(term) => {
let delete_operation = DeleteOperation { opstamp, term };
self.delete_queue.push(delete_operation);
}
UserOperation::Add(document) => {
let add_operation = AddOperation { opstamp, document };
adds.push(add_operation);
}
}
}
let send_result = self.operation_sender.send(adds);
if let Err(e) = send_result {
panic!("Failed to index document. Sending to indexing channel failed. This probably means all of the indexing threads have panicked. {:?}", e);
};
batch_opstamp
}
}
#[cfg(test)]
mod tests {
use super::super::operation::UserOperation;
use super::initial_table_size;
use collector::TopDocs;
use directory::error::LockError;
use error::*;
use indexer::NoMergePolicy;
use query::TermQuery;
use schema::{self, IndexRecordOption};
use schema::{self, Document};
use Index;
use ReloadPolicy;
use Term;
#[test]
fn test_operations_group() {
// an operations group with 2 items should cause 3 opstamps 0, 1, and 2.
let mut schema_builder = schema::Schema::builder();
let text_field = schema_builder.add_text_field("text", schema::TEXT);
let index = Index::create_in_ram(schema_builder.build());
let index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
let operations = vec![
UserOperation::Add(doc!(text_field=>"a")),
UserOperation::Add(doc!(text_field=>"b")),
];
let batch_opstamp1 = index_writer.run(operations);
assert_eq!(batch_opstamp1, 2u64);
}
#[test]
fn test_ordered_batched_operations() {
// * one delete for `doc!(field=>"a")`
// * one add for `doc!(field=>"a")`
// * one add for `doc!(field=>"b")`
// * one delete for `doc!(field=>"b")`
// after commit there is one doc with "a" and 0 doc with "b"
let mut schema_builder = schema::Schema::builder();
let text_field = schema_builder.add_text_field("text", schema::TEXT);
let index = Index::create_in_ram(schema_builder.build());
let reader = index
.reader_builder()
.reload_policy(ReloadPolicy::Manual)
.try_into()
.unwrap();
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
let a_term = Term::from_field_text(text_field, "a");
let b_term = Term::from_field_text(text_field, "b");
let operations = vec![
UserOperation::Delete(a_term),
UserOperation::Add(doc!(text_field=>"a")),
UserOperation::Add(doc!(text_field=>"b")),
UserOperation::Delete(b_term),
];
index_writer.run(operations);
index_writer.commit().expect("failed to commit");
reader.reload().expect("failed to load searchers");
let a_term = Term::from_field_text(text_field, "a");
let b_term = Term::from_field_text(text_field, "b");
let a_query = TermQuery::new(a_term, IndexRecordOption::Basic);
let b_query = TermQuery::new(b_term, IndexRecordOption::Basic);
let searcher = reader.searcher();
let a_docs = searcher
.search(&a_query, &TopDocs::with_limit(1))
.expect("search for a failed");
let b_docs = searcher
.search(&b_query, &TopDocs::with_limit(1))
.expect("search for b failed");
assert_eq!(a_docs.len(), 1);
assert_eq!(b_docs.len(), 0);
}
#[test]
fn test_empty_operations_group() {
let schema_builder = schema::Schema::builder();
let index = Index::create_in_ram(schema_builder.build());
let index_writer = index.writer(3_000_000).unwrap();
let operations1 = vec![];
let batch_opstamp1 = index_writer.run(operations1);
assert_eq!(batch_opstamp1, 0u64);
let operations2 = vec![];
let batch_opstamp2 = index_writer.run(operations2);
assert_eq!(batch_opstamp2, 1u64);
}
#[test]
fn test_lockfile_stops_duplicates() {
let schema_builder = schema::Schema::builder();
let index = Index::create_in_ram(schema_builder.build());
let _index_writer = index.writer(3_000_000).unwrap();
match index.writer(3_000_000) {
Err(TantivyError::LockFailure(LockError::LockBusy, _)) => {}
_ => panic!("Expected a `LockFailure` error"),
let _index_writer = index.writer(40_000_000).unwrap();
match index.writer(40_000_000) {
Err(TantivyError::LockFailure(_)) => {}
_ => panic!("Expected FileAlreadyExists error"),
}
}
@@ -830,7 +675,8 @@ mod tests {
match index.writer_with_num_threads(1, 3_000_000) {
Err(err) => {
let err_msg = err.to_string();
assert!(err_msg.contains("already an `IndexWriter`"));
assert!(err_msg.contains("Lockfile"));
assert!(err_msg.contains("Possible causes:"))
}
_ => panic!("Expected LockfileAlreadyExists error"),
}
@@ -840,7 +686,7 @@ mod tests {
fn test_set_merge_policy() {
let schema_builder = schema::Schema::builder();
let index = Index::create_in_ram(schema_builder.build());
let index_writer = index.writer(3_000_000).unwrap();
let index_writer = index.writer(40_000_000).unwrap();
assert_eq!(
format!("{:?}", index_writer.get_merge_policy()),
"LogMergePolicy { min_merge_size: 8, min_layer_size: 10000, \
@@ -859,11 +705,11 @@ mod tests {
let schema_builder = schema::Schema::builder();
let index = Index::create_in_ram(schema_builder.build());
{
let _index_writer = index.writer(3_000_000).unwrap();
let _index_writer = index.writer(40_000_000).unwrap();
// the lock should be released when the
// index_writer leaves the scope.
}
let _index_writer_two = index.writer(3_000_000).unwrap();
let _index_writer_two = index.writer(40_000_000).unwrap();
}
#[test]
@@ -871,13 +717,9 @@ mod tests {
let mut schema_builder = schema::Schema::builder();
let text_field = schema_builder.add_text_field("text", schema::TEXT);
let index = Index::create_in_ram(schema_builder.build());
let reader = index
.reader_builder()
.reload_policy(ReloadPolicy::Manual)
.try_into()
.unwrap();
let num_docs_containing = |s: &str| {
let searcher = reader.searcher();
let searcher = index.searcher();
let term = Term::from_field_text(text_field, s);
searcher.doc_freq(&term)
};
@@ -887,20 +729,21 @@ mod tests {
let mut index_writer = index.writer(3_000_000).unwrap();
index_writer.add_document(doc!(text_field=>"a"));
index_writer.rollback().unwrap();
assert_eq!(index_writer.commit_opstamp(), 0u64);
assert_eq!(num_docs_containing("a"), 0);
{
index_writer.add_document(doc!(text_field=>"b"));
index_writer.add_document(doc!(text_field=>"c"));
}
assert!(index_writer.commit().is_ok());
reader.reload().unwrap();
assert_eq!(index_writer.commit().unwrap(), 3u64);
index.load_searchers().unwrap();
assert_eq!(num_docs_containing("a"), 0);
assert_eq!(num_docs_containing("b"), 1);
assert_eq!(num_docs_containing("c"), 1);
}
reader.reload().unwrap();
reader.searcher();
index.load_searchers().unwrap();
index.searcher();
}
#[test]
@@ -908,33 +751,32 @@ mod tests {
let mut schema_builder = schema::Schema::builder();
let text_field = schema_builder.add_text_field("text", schema::TEXT);
let index = Index::create_in_ram(schema_builder.build());
let reader = index
.reader_builder()
.reload_policy(ReloadPolicy::Manual)
.try_into()
.unwrap();
let num_docs_containing = |s: &str| {
let searcher = index.searcher();
let term_a = Term::from_field_text(text_field, s);
reader.searcher().doc_freq(&term_a)
searcher.doc_freq(&term_a)
};
{
// writing the segment
let mut index_writer = index.writer(12_000_000).unwrap();
// create 8 segments with 100 tiny docs
for _doc in 0..100 {
index_writer.add_document(doc!(text_field=>"a"));
let mut doc = Document::default();
doc.add_text(text_field, "a");
index_writer.add_document(doc);
}
index_writer.commit().expect("commit failed");
for _doc in 0..100 {
index_writer.add_document(doc!(text_field=>"a"));
let mut doc = Document::default();
doc.add_text(text_field, "a");
index_writer.add_document(doc);
}
// this should create 8 segments and trigger a merge.
// this should create 8 segments and trigger a merge.
index_writer.commit().expect("commit failed");
index_writer
.wait_merging_threads()
.expect("waiting merging thread failed");
reader.reload().unwrap();
index.load_searchers().unwrap();
assert_eq!(num_docs_containing("a"), 200);
assert!(index.searchable_segments().unwrap().len() < 8);
@@ -1001,15 +843,11 @@ mod tests {
}
index_writer.commit().unwrap();
}
index.load_searchers().unwrap();
let num_docs_containing = |s: &str| {
let searcher = index.searcher();
let term_a = Term::from_field_text(text_field, s);
index
.reader_builder()
.reload_policy(ReloadPolicy::Manual)
.try_into()
.unwrap()
.searcher()
.doc_freq(&term_a)
searcher.doc_freq(&term_a)
};
assert_eq!(num_docs_containing("a"), 0);
assert_eq!(num_docs_containing("b"), 100);
@@ -1017,9 +855,9 @@ mod tests {
#[test]
fn test_hashmap_size() {
assert_eq!(initial_table_size(100_000), 11);
assert_eq!(initial_table_size(1_000_000), 14);
assert_eq!(initial_table_size(10_000_000), 17);
assert_eq!(initial_table_size(100_000), 12);
assert_eq!(initial_table_size(1_000_000), 15);
assert_eq!(initial_table_size(10_000_000), 18);
assert_eq!(initial_table_size(1_000_000_000), 19);
}
@@ -1041,9 +879,11 @@ mod tests {
index_writer.add_document(doc!(text_field => "b"));
}
assert!(index_writer.commit().is_err());
index.load_searchers().unwrap();
let num_docs_containing = |s: &str| {
let searcher = index.searcher();
let term_a = Term::from_field_text(text_field, s);
index.reader().unwrap().searcher().doc_freq(&term_a)
searcher.doc_freq(&term_a)
};
assert_eq!(num_docs_containing("a"), 100);
assert_eq!(num_docs_containing("b"), 0);

View File

@@ -52,7 +52,7 @@ impl MergePolicy for LogMergePolicy {
let mut size_sorted_tuples = segments
.iter()
.map(SegmentMeta::num_docs)
.map(|x| x.num_docs())
.enumerate()
.collect::<Vec<(usize, u32)>>();

View File

@@ -1,65 +0,0 @@
use census::{Inventory, TrackedObject};
use std::collections::HashSet;
use Opstamp;
use SegmentId;
#[derive(Default)]
pub struct MergeOperationInventory(Inventory<InnerMergeOperation>);
impl MergeOperationInventory {
pub fn segment_in_merge(&self) -> HashSet<SegmentId> {
let mut segment_in_merge = HashSet::default();
for merge_op in self.0.list() {
for &segment_id in &merge_op.segment_ids {
segment_in_merge.insert(segment_id);
}
}
segment_in_merge
}
}
/// A `MergeOperation` has two roles.
/// It carries all of the information required to describe a merge:
/// - `target_opstamp` is the opstamp up to which we want to consume the
/// delete queue and reflect their deletes.
/// - `segment_ids` is the list of segment to be merged.
///
/// The second role is to ensure keep track of the fact that these
/// segments are in merge and avoid starting a merge operation that
/// may conflict with this one.
///
/// This works by tracking merge operations. When considering computing
/// merge candidates, we simply list tracked merge operations and remove
/// their segments from possible merge candidates.
pub struct MergeOperation {
inner: TrackedObject<InnerMergeOperation>,
}
struct InnerMergeOperation {
target_opstamp: Opstamp,
segment_ids: Vec<SegmentId>,
}
impl MergeOperation {
pub fn new(
inventory: &MergeOperationInventory,
target_opstamp: Opstamp,
segment_ids: Vec<SegmentId>,
) -> MergeOperation {
let inner_merge_operation = InnerMergeOperation {
target_opstamp,
segment_ids,
};
MergeOperation {
inner: inventory.0.track(inner_merge_operation),
}
}
pub fn target_opstamp(&self) -> Opstamp {
self.inner.target_opstamp
}
pub fn segment_ids(&self) -> &[SegmentId] {
&self.inner.segment_ids[..]
}
}

View File

@@ -11,7 +11,7 @@ pub struct MergeCandidate(pub Vec<SegmentId>);
///
/// Every time a the list of segments changes, the segment updater
/// asks the merge policy if some segments should be merged.
pub trait MergePolicy: marker::Send + marker::Sync + Debug {
pub trait MergePolicy: MergePolicyClone + marker::Send + marker::Sync + Debug {
/// Given the list of segment metas, returns the list of merge candidates.
///
/// This call happens on the segment updater thread, and will block
@@ -19,6 +19,21 @@ pub trait MergePolicy: marker::Send + marker::Sync + Debug {
fn compute_merge_candidates(&self, segments: &[SegmentMeta]) -> Vec<MergeCandidate>;
}
/// MergePolicyClone
pub trait MergePolicyClone {
/// Returns a boxed clone of the MergePolicy.
fn box_clone(&self) -> Box<MergePolicy>;
}
impl<T> MergePolicyClone for T
where
T: 'static + MergePolicy + Clone,
{
fn box_clone(&self) -> Box<MergePolicy> {
Box::new(self.clone())
}
}
/// Never merge segments.
#[derive(Debug, Clone)]
pub struct NoMergePolicy;

View File

@@ -1,9 +1,7 @@
use common::MAX_DOC_LIMIT;
use core::Segment;
use core::SegmentReader;
use core::SerializableSegment;
use docset::DocSet;
use fastfield::BytesFastFieldReader;
use fastfield::DeleteBitSet;
use fastfield::FastFieldReader;
use fastfield::FastFieldSerializer;
@@ -25,7 +23,6 @@ use termdict::TermMerger;
use termdict::TermOrdinal;
use DocId;
use Result;
use TantivyError;
fn compute_total_num_tokens(readers: &[SegmentReader], field: Field) -> u64 {
let mut total_tokens = 0u64;
@@ -73,7 +70,7 @@ fn compute_min_max_val(
// some deleted documents,
// we need to recompute the max / min
(0..max_doc)
.filter(|doc_id| delete_bitset.is_alive(*doc_id))
.filter(|doc_id| !delete_bitset.is_deleted(*doc_id))
.map(|doc_id| u64_reader.get(doc_id))
.minmax()
.into_option()
@@ -153,14 +150,6 @@ impl IndexMerger {
readers.push(reader);
}
}
if max_doc >= MAX_DOC_LIMIT {
let err_msg = format!(
"The segment resulting from this merge would have {} docs,\
which exceeds the limit {}.",
max_doc, MAX_DOC_LIMIT
);
return Err(TantivyError::InvalidArgument(err_msg));
}
Ok(IndexMerger {
schema,
readers,
@@ -205,17 +194,17 @@ impl IndexMerger {
fast_field_serializer,
)?;
}
FieldType::U64(ref options)
| FieldType::I64(ref options)
| FieldType::Date(ref options) => match options.get_fastfield_cardinality() {
Some(Cardinality::SingleValue) => {
self.write_single_fast_field(field, fast_field_serializer)?;
FieldType::U64(ref options) | FieldType::I64(ref options) => {
match options.get_fastfield_cardinality() {
Some(Cardinality::SingleValue) => {
self.write_single_fast_field(field, fast_field_serializer)?;
}
Some(Cardinality::MultiValues) => {
self.write_multi_fast_field(field, fast_field_serializer)?;
}
None => {}
}
Some(Cardinality::MultiValues) => {
self.write_multi_fast_field(field, fast_field_serializer)?;
}
None => {}
},
}
FieldType::Str(_) => {
// We don't handle str fast field for the moment
// They can be implemented using what is done
@@ -240,10 +229,7 @@ impl IndexMerger {
let mut max_value = u64::min_value();
for reader in &self.readers {
let u64_reader: FastFieldReader<u64> = reader
.fast_fields()
.u64_lenient(field)
.expect("Failed to find a reader for single fast field. This is a tantivy bug and it should never happen.");
let u64_reader: FastFieldReader<u64> = reader.fast_field_reader(field)?;
if let Some((seg_min_val, seg_max_val)) =
compute_min_max_val(&u64_reader, reader.max_doc(), reader.delete_bitset())
{
@@ -286,28 +272,24 @@ impl IndexMerger {
fast_field_serializer: &mut FastFieldSerializer,
) -> Result<()> {
let mut total_num_vals = 0u64;
let mut u64s_readers: Vec<MultiValueIntFastFieldReader<u64>> = Vec::new();
// In the first pass, we compute the total number of vals.
//
// This is required by the bitpacker, as it needs to know
// what should be the bit length use for bitpacking.
for reader in &self.readers {
let u64s_reader = reader.fast_fields()
.u64s_lenient(field)
.expect("Failed to find index for multivalued field. This is a bug in tantivy, please report.");
let idx_reader = reader.fast_field_reader_with_idx::<u64>(field, 0)?;
if let Some(delete_bitset) = reader.delete_bitset() {
for doc in 0u32..reader.max_doc() {
if delete_bitset.is_alive(doc) {
let num_vals = u64s_reader.num_vals(doc) as u64;
total_num_vals += num_vals;
if !delete_bitset.is_deleted(doc) {
let start = idx_reader.get(doc);
let end = idx_reader.get(doc + 1);
total_num_vals += end - start;
}
}
} else {
total_num_vals += u64s_reader.total_num_vals();
total_num_vals += idx_reader.max_value();
}
u64s_readers.push(u64s_reader);
}
// We can now create our `idx` serializer, and in a second pass,
@@ -315,10 +297,13 @@ impl IndexMerger {
let mut serialize_idx =
fast_field_serializer.new_u64_fast_field_with_idx(field, 0, total_num_vals, 0)?;
let mut idx = 0;
for (segment_reader, u64s_reader) in self.readers.iter().zip(&u64s_readers) {
for doc in segment_reader.doc_ids_alive() {
for reader in &self.readers {
let idx_reader = reader.fast_field_reader_with_idx::<u64>(field, 0)?;
for doc in reader.doc_ids_alive() {
serialize_idx.add_val(idx)?;
idx += u64s_reader.num_vals(doc) as u64;
let start = idx_reader.get(doc);
let end = idx_reader.get(doc + 1);
idx += end - start;
}
}
serialize_idx.add_val(idx)?;
@@ -349,10 +334,8 @@ impl IndexMerger {
for (segment_ord, segment_reader) in self.readers.iter().enumerate() {
let term_ordinal_mapping: &[TermOrdinal] =
term_ordinal_mappings.get_segment(segment_ord);
let ff_reader: MultiValueIntFastFieldReader<u64> = segment_reader
.fast_fields()
.u64s(field)
.expect("Could not find multivalued u64 fast value reader.");
let ff_reader: MultiValueIntFastFieldReader<u64> =
segment_reader.multi_fast_field_reader(field)?;
// TODO optimize if no deletes
for doc in segment_reader.doc_ids_alive() {
ff_reader.get_vals(doc, &mut vals);
@@ -384,8 +367,6 @@ impl IndexMerger {
let mut vals = Vec::with_capacity(100);
let mut ff_readers = Vec::new();
// Our values are bitpacked and we need to know what should be
// our bitwidth and our minimum value before serializing any values.
//
@@ -394,10 +375,7 @@ impl IndexMerger {
// maximum value and initialize our Serializer.
for reader in &self.readers {
let ff_reader: MultiValueIntFastFieldReader<u64> =
reader.fast_fields().u64s_lenient(field).expect(
"Failed to find multivalued fast field reader. This is a bug in \
tantivy. Please report.",
);
reader.multi_fast_field_reader(field)?;
for doc in reader.doc_ids_alive() {
ff_reader.get_vals(doc, &mut vals);
for &val in &vals {
@@ -405,7 +383,6 @@ impl IndexMerger {
max_value = cmp::max(val, max_value);
}
}
ff_readers.push(ff_reader);
// TODO optimize when no deletes
}
@@ -418,7 +395,9 @@ impl IndexMerger {
{
let mut serialize_vals = fast_field_serializer
.new_u64_fast_field_with_idx(field, min_value, max_value, 1)?;
for (reader, ff_reader) in self.readers.iter().zip(ff_readers) {
for reader in &self.readers {
let ff_reader: MultiValueIntFastFieldReader<u64> =
reader.multi_fast_field_reader(field)?;
// TODO optimize if no deletes
for doc in reader.doc_ids_alive() {
ff_reader.get_vals(doc, &mut vals);
@@ -437,53 +416,19 @@ impl IndexMerger {
field: Field,
fast_field_serializer: &mut FastFieldSerializer,
) -> Result<()> {
let mut total_num_vals = 0u64;
let mut bytes_readers: Vec<BytesFastFieldReader> = Vec::new();
for reader in &self.readers {
let bytes_reader = reader.fast_fields().bytes(field).expect(
"Failed to find bytes fast field reader. This is a bug in tantivy, please report.",
);
if let Some(delete_bitset) = reader.delete_bitset() {
for doc in 0u32..reader.max_doc() {
if delete_bitset.is_alive(doc) {
let num_vals = bytes_reader.get_bytes(doc).len() as u64;
total_num_vals += num_vals;
}
}
} else {
total_num_vals += bytes_reader.total_num_bytes() as u64;
}
bytes_readers.push(bytes_reader);
}
{
// We can now create our `idx` serializer, and in a second pass,
// can effectively push the different indexes.
let mut serialize_idx =
fast_field_serializer.new_u64_fast_field_with_idx(field, 0, total_num_vals, 0)?;
let mut idx = 0;
for (segment_reader, bytes_reader) in self.readers.iter().zip(&bytes_readers) {
for doc in segment_reader.doc_ids_alive() {
serialize_idx.add_val(idx)?;
idx += bytes_reader.get_bytes(doc).len() as u64;
}
}
serialize_idx.add_val(idx)?;
serialize_idx.close_field()?;
}
self.write_fast_field_idx(field, fast_field_serializer)?;
let mut serialize_vals = fast_field_serializer.new_bytes_fast_field_with_idx(field, 1)?;
for segment_reader in &self.readers {
let bytes_reader = segment_reader.fast_fields().bytes(field)
.expect("Failed to find bytes field in fast field reader. This is a bug in tantivy. Please report.");
for reader in &self.readers {
let bytes_reader = reader.bytes_fast_field_reader(field)?;
// TODO: optimize if no deletes
for doc in segment_reader.doc_ids_alive() {
let val = bytes_reader.get_bytes(doc);
for doc in reader.doc_ids_alive() {
let val = bytes_reader.get_val(doc);
serialize_vals.write_all(val)?;
}
}
serialize_vals.flush()?;
Ok(())
}
@@ -709,7 +654,7 @@ mod tests {
use schema::IntOptions;
use schema::Term;
use schema::TextFieldIndexing;
use schema::INDEXED;
use schema::INT_INDEXED;
use std::io::Cursor;
use DocAddress;
use IndexWriter;
@@ -726,13 +671,11 @@ mod tests {
)
.set_stored();
let text_field = schema_builder.add_text_field("text", text_fieldtype);
let date_field = schema_builder.add_date_field("date", INDEXED);
let score_fieldtype = schema::IntOptions::default().set_fast(Cardinality::SingleValue);
let score_field = schema_builder.add_u64_field("score", score_fieldtype);
let bytes_score_field = schema_builder.add_bytes_field("score_bytes");
let index = Index::create_in_ram(schema_builder.build());
let reader = index.reader().unwrap();
let curr_time = chrono::Utc::now();
let add_score_bytes = |doc: &mut Document, score: u32| {
let mut bytes = Vec::new();
bytes
@@ -749,7 +692,6 @@ mod tests {
let mut doc = Document::default();
doc.add_text(text_field, "af b");
doc.add_u64(score_field, 3);
doc.add_date(date_field, &curr_time);
add_score_bytes(&mut doc, 3);
index_writer.add_document(doc);
}
@@ -775,7 +717,6 @@ mod tests {
{
let mut doc = Document::default();
doc.add_text(text_field, "af b");
doc.add_date(date_field, &curr_time);
doc.add_u64(score_field, 11);
add_score_bytes(&mut doc, 11);
index_writer.add_document(doc);
@@ -803,8 +744,8 @@ mod tests {
index_writer.wait_merging_threads().unwrap();
}
{
reader.reload().unwrap();
let searcher = reader.searcher();
index.load_searchers().unwrap();
let searcher = index.searcher();
let get_doc_ids = |terms: Vec<Term>| {
let query = BooleanQuery::new_multiterms_query(terms);
let top_docs = searcher.search(&query, &TestCollector).unwrap();
@@ -833,10 +774,6 @@ mod tests {
DocAddress(0, 4)
]
);
assert_eq!(
get_doc_ids(vec![Term::from_field_date(date_field, &curr_time)]),
vec![DocAddress(0, 0), DocAddress(0, 3)]
);
}
{
let doc = searcher.doc(DocAddress(0, 0)).unwrap();
@@ -899,8 +836,8 @@ mod tests {
let score_field = schema_builder.add_u64_field("score", score_fieldtype);
let bytes_score_field = schema_builder.add_bytes_field("score_bytes");
let index = Index::create_in_ram(schema_builder.build());
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
let reader = index.reader().unwrap();
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
let search_term = |searcher: &Searcher, term: Term| {
let collector = FastFieldTestCollector::for_field(score_field);
let bytes_collector = BytesFastFieldTestCollector::for_field(bytes_score_field);
@@ -937,8 +874,8 @@ mod tests {
bytes_score_field => vec![0u8, 0, 0, 3],
));
index_writer.commit().expect("committed");
reader.reload().unwrap();
let searcher = reader.searcher();
index.load_searchers().unwrap();
let ref searcher = *index.searcher();
assert_eq!(searcher.num_docs(), 2);
assert_eq!(searcher.segment_readers()[0].num_docs(), 2);
assert_eq!(searcher.segment_readers()[0].max_doc(), 3);
@@ -984,8 +921,8 @@ mod tests {
bytes_score_field => vec![0u8, 0, 27, 88],
));
index_writer.commit().expect("committed");
reader.reload().unwrap();
let searcher = reader.searcher();
index.load_searchers().unwrap();
let searcher = index.searcher();
assert_eq!(searcher.segment_readers().len(), 2);
assert_eq!(searcher.num_docs(), 3);
@@ -1024,16 +961,14 @@ mod tests {
let score_field_reader = searcher
.segment_reader(0)
.fast_fields()
.u64(score_field)
.fast_field_reader::<u64>(score_field)
.unwrap();
assert_eq!(score_field_reader.min_value(), 4000);
assert_eq!(score_field_reader.max_value(), 7000);
let score_field_reader = searcher
.segment_reader(1)
.fast_fields()
.u64(score_field)
.fast_field_reader::<u64>(score_field)
.unwrap();
assert_eq!(score_field_reader.min_value(), 1);
assert_eq!(score_field_reader.max_value(), 3);
@@ -1048,8 +983,8 @@ mod tests {
.expect("Failed to initiate merge")
.wait()
.expect("Merging failed");
reader.reload().unwrap();
let searcher = reader.searcher();
index.load_searchers().unwrap();
let searcher = index.searcher();
assert_eq!(searcher.segment_readers().len(), 1);
assert_eq!(searcher.num_docs(), 3);
assert_eq!(searcher.segment_readers()[0].num_docs(), 3);
@@ -1084,8 +1019,7 @@ mod tests {
);
let score_field_reader = searcher
.segment_reader(0)
.fast_fields()
.u64(score_field)
.fast_field_reader::<u64>(score_field)
.unwrap();
assert_eq!(score_field_reader.min_value(), 3);
assert_eq!(score_field_reader.max_value(), 7000);
@@ -1095,8 +1029,8 @@ mod tests {
index_writer.delete_term(Term::from_field_text(text_field, "c"));
index_writer.commit().unwrap();
reader.reload().unwrap();
let searcher = reader.searcher();
index.load_searchers().unwrap();
let searcher = index.searcher();
assert_eq!(searcher.segment_readers().len(), 1);
assert_eq!(searcher.num_docs(), 2);
assert_eq!(searcher.segment_readers()[0].num_docs(), 2);
@@ -1131,8 +1065,7 @@ mod tests {
);
let score_field_reader = searcher
.segment_reader(0)
.fast_fields()
.u64(score_field)
.fast_field_reader::<u64>(score_field)
.unwrap();
assert_eq!(score_field_reader.min_value(), 3);
assert_eq!(score_field_reader.max_value(), 7000);
@@ -1147,9 +1080,9 @@ mod tests {
.expect("Failed to initiate merge")
.wait()
.expect("Merging failed");
reader.reload().unwrap();
index.load_searchers().unwrap();
let searcher = reader.searcher();
let ref searcher = *index.searcher();
assert_eq!(searcher.segment_readers().len(), 1);
assert_eq!(searcher.num_docs(), 2);
assert_eq!(searcher.segment_readers()[0].num_docs(), 2);
@@ -1184,8 +1117,7 @@ mod tests {
);
let score_field_reader = searcher
.segment_reader(0)
.fast_fields()
.u64(score_field)
.fast_field_reader::<u64>(score_field)
.unwrap();
assert_eq!(score_field_reader.min_value(), 6000);
assert_eq!(score_field_reader.max_value(), 7000);
@@ -1198,11 +1130,15 @@ mod tests {
let segment_ids = index
.searchable_segment_ids()
.expect("Searchable segments failed.");
reader.reload().unwrap();
index_writer
.merge(&segment_ids)
.expect("Failed to initiate merge")
.wait()
.expect("Merging failed");
index.load_searchers().unwrap();
let searcher = reader.searcher();
assert!(segment_ids.is_empty());
assert!(searcher.segment_readers().is_empty());
let ref searcher = *index.searcher();
assert_eq!(searcher.segment_readers().len(), 1);
assert_eq!(searcher.num_docs(), 0);
}
}
@@ -1212,9 +1148,8 @@ mod tests {
let mut schema_builder = schema::Schema::builder();
let facet_field = schema_builder.add_facet_field("facet");
let index = Index::create_in_ram(schema_builder.build());
let reader = index.reader().unwrap();
{
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
let index_doc = |index_writer: &mut IndexWriter, doc_facets: &[&str]| {
let mut doc = Document::default();
for facet in doc_facets {
@@ -1242,9 +1177,9 @@ mod tests {
index_writer.commit().expect("committed");
}
reader.reload().unwrap();
index.load_searchers().unwrap();
let test_searcher = |expected_num_docs: usize, expected: &[(&str, u64)]| {
let searcher = reader.searcher();
let searcher = index.searcher();
let mut facet_collector = FacetCollector::for_field(facet_field);
facet_collector.add_facet(Facet::from("/top"));
let (count, facet_counts) = searcher
@@ -1279,14 +1214,14 @@ mod tests {
let segment_ids = index
.searchable_segment_ids()
.expect("Searchable segments failed.");
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
index_writer
.merge(&segment_ids)
.expect("Failed to initiate merge")
.wait()
.expect("Merging failed");
index_writer.wait_merging_threads().unwrap();
reader.reload().unwrap();
index.load_searchers().unwrap();
test_searcher(
11,
&[
@@ -1302,12 +1237,12 @@ mod tests {
// Deleting one term
{
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
let facet = Facet::from_path(vec!["top", "a", "firstdoc"]);
let facet_term = Term::from_facet(facet_field, &facet);
index_writer.delete_term(facet_term);
index_writer.commit().unwrap();
reader.reload().unwrap();
index.load_searchers().unwrap();
test_searcher(
9,
&[
@@ -1325,15 +1260,15 @@ mod tests {
#[test]
fn test_bug_merge() {
let mut schema_builder = schema::Schema::builder();
let int_field = schema_builder.add_u64_field("intvals", INDEXED);
let int_field = schema_builder.add_u64_field("intvals", INT_INDEXED);
let index = Index::create_in_ram(schema_builder.build());
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
index_writer.add_document(doc!(int_field => 1u64));
index_writer.commit().expect("commit failed");
index_writer.add_document(doc!(int_field => 1u64));
index_writer.commit().expect("commit failed");
let reader = index.reader().unwrap();
let searcher = reader.searcher();
index.load_searchers().unwrap();
let searcher = index.searcher();
assert_eq!(searcher.num_docs(), 2);
index_writer.delete_term(Term::from_field_u64(int_field, 1));
let segment_ids = index
@@ -1344,10 +1279,10 @@ mod tests {
.expect("Failed to initiate merge")
.wait()
.expect("Merging failed");
reader.reload().unwrap();
index.load_searchers().unwrap();
// commit has not been called yet. The document should still be
// there.
assert_eq!(reader.searcher().num_docs(), 2);
assert_eq!(index.searcher().num_docs(), 2);
}
#[test]
@@ -1358,9 +1293,9 @@ mod tests {
.set_indexed();
let int_field = schema_builder.add_u64_field("intvals", int_options);
let index = Index::create_in_ram(schema_builder.build());
let reader = index.reader().unwrap();
{
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
let mut doc = Document::default();
doc.add_u64(int_field, 1);
index_writer.add_document(doc.clone());
@@ -1368,33 +1303,31 @@ mod tests {
index_writer.add_document(doc);
index_writer.commit().expect("commit failed");
index_writer.delete_term(Term::from_field_u64(int_field, 1));
index_writer.commit().expect("commit failed");
}
index.load_searchers().unwrap();
let searcher = index.searcher();
assert_eq!(searcher.num_docs(), 0);
// Merging the segments
{
let segment_ids = index
.searchable_segment_ids()
.expect("Searchable segments failed.");
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
index_writer
.merge(&segment_ids)
.expect("Failed to initiate merge")
.wait()
.expect("Merging failed");
// assert delete has not been committed
reader.reload().expect("failed to load searcher 1");
let searcher = reader.searcher();
assert_eq!(searcher.num_docs(), 2);
index_writer.commit().unwrap();
index_writer.wait_merging_threads().unwrap();
}
reader.reload().unwrap();
let searcher = reader.searcher();
index.load_searchers().unwrap();
let searcher = index.searcher();
assert_eq!(searcher.num_docs(), 0);
}
#[test]
fn test_merge_multivalued_int_fields_simple() {
fn test_merge_multivalued_int_fields() {
let mut schema_builder = schema::Schema::builder();
let int_options = IntOptions::default()
.set_fast(Cardinality::MultiValues)
@@ -1403,7 +1336,7 @@ mod tests {
let index = Index::create_in_ram(schema_builder.build());
{
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
let index_doc = |index_writer: &mut IndexWriter, int_vals: &[u64]| {
let mut doc = Document::default();
for &val in int_vals {
@@ -1411,6 +1344,7 @@ mod tests {
}
index_writer.add_document(doc);
};
index_doc(&mut index_writer, &[1, 2]);
index_doc(&mut index_writer, &[1, 2, 3]);
index_doc(&mut index_writer, &[4, 5]);
@@ -1419,19 +1353,24 @@ mod tests {
index_doc(&mut index_writer, &[3]);
index_doc(&mut index_writer, &[17]);
index_writer.commit().expect("committed");
index_doc(&mut index_writer, &[20]);
index_writer.commit().expect("committed");
index_doc(&mut index_writer, &[28, 27]);
index_doc(&mut index_writer, &[1_000]);
index_writer.commit().expect("committed");
}
let reader = index.reader().unwrap();
let searcher = reader.searcher();
index.load_searchers().unwrap();
let searcher = index.searcher();
let mut vals: Vec<u64> = Vec::new();
{
let segment = searcher.segment_reader(0u32);
let ff_reader = segment.fast_fields().u64s(int_field).unwrap();
let ff_reader = segment.multi_fast_field_reader(int_field).unwrap();
ff_reader.get_vals(0, &mut vals);
assert_eq!(&vals, &[1, 2]);
@@ -1466,7 +1405,7 @@ mod tests {
{
let segment = searcher.segment_reader(1u32);
let ff_reader = segment.fast_fields().u64s(int_field).unwrap();
let ff_reader = segment.multi_fast_field_reader(int_field).unwrap();
ff_reader.get_vals(0, &mut vals);
assert_eq!(&vals, &[28, 27]);
@@ -1476,7 +1415,7 @@ mod tests {
{
let segment = searcher.segment_reader(2u32);
let ff_reader = segment.fast_fields().u64s(int_field).unwrap();
let ff_reader = segment.multi_fast_field_reader(int_field).unwrap();
ff_reader.get_vals(0, &mut vals);
assert_eq!(&vals, &[20]);
}
@@ -1486,20 +1425,19 @@ mod tests {
let segment_ids = index
.searchable_segment_ids()
.expect("Searchable segments failed.");
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
index_writer
.merge(&segment_ids)
.expect("Failed to initiate merge")
.wait()
.expect("Merging failed");
index_writer
.wait_merging_threads()
.expect("Wait for merging threads");
index_writer.wait_merging_threads().unwrap();
}
reader.reload().expect("Load searcher");
index.load_searchers().unwrap();
{
let searcher = reader.searcher();
let searcher = index.searcher();
println!(
"{:?}",
searcher
@@ -1509,7 +1447,7 @@ mod tests {
.collect::<Vec<_>>()
);
let segment = searcher.segment_reader(0u32);
let ff_reader = segment.fast_fields().u64s(int_field).unwrap();
let ff_reader = segment.multi_fast_field_reader(int_field).unwrap();
ff_reader.get_vals(0, &mut vals);
assert_eq!(&vals, &[1, 2]);

View File

@@ -1,9 +1,8 @@
pub mod delete_queue;
mod directory_lock;
mod doc_opstamp_mapping;
pub mod index_writer;
mod log_merge_policy;
mod merge_operation;
pub mod merge_policy;
pub mod merger;
pub mod operation;
@@ -16,12 +15,14 @@ pub mod segment_updater;
mod segment_writer;
mod stamper;
pub(crate) use self::directory_lock::DirectoryLock;
pub use self::directory_lock::LockType;
pub use self::index_writer::IndexWriter;
pub use self::log_merge_policy::LogMergePolicy;
pub use self::merge_operation::{MergeOperation, MergeOperationInventory};
pub use self::merge_policy::{MergeCandidate, MergePolicy, NoMergePolicy};
pub use self::prepared_commit::PreparedCommit;
pub use self::segment_entry::SegmentEntry;
pub use self::segment_entry::{SegmentEntry, SegmentState};
pub use self::segment_manager::SegmentManager;
pub use self::segment_serializer::SegmentSerializer;
pub use self::segment_writer::SegmentWriter;

View File

@@ -1,24 +1,16 @@
use schema::Document;
use schema::Term;
use Opstamp;
/// Timestamped Delete operation.
#[derive(Clone, Eq, PartialEq, Debug)]
pub struct DeleteOperation {
pub opstamp: Opstamp,
pub opstamp: u64,
pub term: Term,
}
/// Timestamped Add operation.
#[derive(Eq, PartialEq, Debug)]
pub struct AddOperation {
pub opstamp: Opstamp,
pub opstamp: u64,
pub document: Document,
}
/// UserOperation is an enum type that encapsulates other operation types.
#[derive(Eq, PartialEq, Debug)]
pub enum UserOperation {
Add(Document),
Delete(Term),
}

View File

@@ -1,16 +1,15 @@
use super::IndexWriter;
use Opstamp;
use Result;
/// A prepared commit
pub struct PreparedCommit<'a> {
index_writer: &'a mut IndexWriter,
payload: Option<String>,
opstamp: Opstamp,
opstamp: u64,
}
impl<'a> PreparedCommit<'a> {
pub(crate) fn new(index_writer: &'a mut IndexWriter, opstamp: Opstamp) -> PreparedCommit {
pub(crate) fn new(index_writer: &'a mut IndexWriter, opstamp: u64) -> PreparedCommit {
PreparedCommit {
index_writer,
payload: None,
@@ -18,7 +17,7 @@ impl<'a> PreparedCommit<'a> {
}
}
pub fn opstamp(&self) -> Opstamp {
pub fn opstamp(&self) -> u64 {
self.opstamp
}
@@ -26,11 +25,11 @@ impl<'a> PreparedCommit<'a> {
self.payload = Some(payload.to_string())
}
pub fn abort(self) -> Result<Opstamp> {
pub fn abort(self) -> Result<()> {
self.index_writer.rollback()
}
pub fn commit(self) -> Result<Opstamp> {
pub fn commit(self) -> Result<u64> {
info!("committing {}", self.opstamp);
self.index_writer
.segment_updater()

View File

@@ -4,6 +4,21 @@ use core::SegmentMeta;
use indexer::delete_queue::DeleteCursor;
use std::fmt;
#[derive(Clone, Copy, PartialEq, Eq, Debug)]
pub enum SegmentState {
Ready,
InMerge,
}
impl SegmentState {
pub fn letter_code(self) -> char {
match self {
SegmentState::InMerge => 'M',
SegmentState::Ready => 'R',
}
}
}
/// A segment entry describes the state of
/// a given segment, at a given instant.
///
@@ -20,6 +35,7 @@ use std::fmt;
#[derive(Clone)]
pub struct SegmentEntry {
meta: SegmentMeta,
state: SegmentState,
delete_bitset: Option<BitSet>,
delete_cursor: DeleteCursor,
}
@@ -33,6 +49,7 @@ impl SegmentEntry {
) -> SegmentEntry {
SegmentEntry {
meta: segment_meta,
state: SegmentState::Ready,
delete_bitset,
delete_cursor,
}
@@ -55,6 +72,14 @@ impl SegmentEntry {
&mut self.delete_cursor
}
/// Return the `SegmentEntry`.
///
/// The state describes whether the segment is available for
/// a merge or not.
pub fn state(&self) -> SegmentState {
self.state
}
/// Returns the segment id.
pub fn segment_id(&self) -> SegmentId {
self.meta.id()
@@ -64,10 +89,33 @@ impl SegmentEntry {
pub fn meta(&self) -> &SegmentMeta {
&self.meta
}
/// Mark the `SegmentEntry` as in merge.
///
/// Only segments that are not already
/// in a merge are elligible for future merge.
pub fn start_merge(&mut self) {
self.state = SegmentState::InMerge;
}
/// Cancel a merge
///
/// If a merge fails, it is important to switch
/// the segment back to a idle state, so that it
/// may be elligible for future merges.
pub fn cancel_merge(&mut self) {
self.state = SegmentState::Ready;
}
/// Returns true iff a segment should
/// be considered for a merge.
pub fn is_ready(&self) -> bool {
self.state == SegmentState::Ready
}
}
impl fmt::Debug for SegmentEntry {
fn fmt(&self, formatter: &mut fmt::Formatter) -> fmt::Result {
write!(formatter, "SegmentEntry({:?})", self.meta)
write!(formatter, "SegmentEntry({:?}, {:?})", self.meta, self.state)
}
}

View File

@@ -16,6 +16,7 @@ use Result as TantivyResult;
struct SegmentRegisters {
uncommitted: SegmentRegister,
committed: SegmentRegister,
writing: HashSet<SegmentId>,
}
/// The segment manager stores the list of segments
@@ -40,17 +41,12 @@ impl Debug for SegmentManager {
}
pub fn get_mergeable_segments(
in_merge_segment_ids: &HashSet<SegmentId>,
segment_manager: &SegmentManager,
) -> (Vec<SegmentMeta>, Vec<SegmentMeta>) {
let registers_lock = segment_manager.read();
(
registers_lock
.committed
.get_mergeable_segments(in_merge_segment_ids),
registers_lock
.uncommitted
.get_mergeable_segments(in_merge_segment_ids),
registers_lock.committed.get_mergeable_segments(),
registers_lock.uncommitted.get_mergeable_segments(),
)
}
@@ -63,6 +59,7 @@ impl SegmentManager {
registers: RwLock::new(SegmentRegisters {
uncommitted: SegmentRegister::default(),
committed: SegmentRegister::new(segment_metas, delete_cursor),
writing: HashSet::new(),
}),
}
}
@@ -75,6 +72,12 @@ impl SegmentManager {
segment_entries
}
/// Returns the overall number of segments in the `SegmentManager`
pub fn num_segments(&self) -> usize {
let registers_lock = self.read();
registers_lock.committed.len() + registers_lock.uncommitted.len()
}
/// List the files that are useful to the index.
///
/// This does not include lock files, or files that are obsolete
@@ -103,21 +106,6 @@ impl SegmentManager {
.expect("Failed to acquire write lock on SegmentManager.")
}
/// Deletes all empty segments
fn remove_empty_segments(&self) {
let mut registers_lock = self.write();
registers_lock
.committed
.segment_entries()
.iter()
.filter(|segment| segment.meta().num_docs() == 0)
.for_each(|segment| {
registers_lock
.committed
.remove_segment(&segment.segment_id())
});
}
pub fn commit(&self, segment_entries: Vec<SegmentEntry>) {
let mut registers_lock = self.write();
registers_lock.committed.clear();
@@ -133,22 +121,25 @@ impl SegmentManager {
/// the `segment_ids` are not either all committed or all
/// uncommitted.
pub fn start_merge(&self, segment_ids: &[SegmentId]) -> TantivyResult<Vec<SegmentEntry>> {
let registers_lock = self.read();
let mut registers_lock = self.write();
let mut segment_entries = vec![];
if registers_lock.uncommitted.contains_all(segment_ids) {
for segment_id in segment_ids {
let segment_entry = registers_lock.uncommitted
.get(segment_id)
.start_merge(segment_id)
.expect("Segment id not found {}. Should never happen because of the contains all if-block.");
segment_entries.push(segment_entry);
}
} else if registers_lock.committed.contains_all(segment_ids) {
for segment_id in segment_ids {
let segment_entry = registers_lock.committed
.get(segment_id)
.start_merge(segment_id)
.expect("Segment id not found {}. Should never happen because of the contains all if-block.");
segment_entries.push(segment_entry);
}
for segment_id in segment_ids {
registers_lock.committed.start_merge(segment_id);
}
} else {
let error_msg = "Merge operation sent for segments that are not \
all uncommited or commited."
@@ -158,8 +149,50 @@ impl SegmentManager {
Ok(segment_entries)
}
pub fn cancel_merge(
&self,
before_merge_segment_ids: &[SegmentId],
after_merge_segment_id: SegmentId,
) {
let mut registers_lock = self.write();
// we mark all segments are ready for merge.
{
let target_segment_register: &mut SegmentRegister;
target_segment_register = {
if registers_lock
.uncommitted
.contains_all(before_merge_segment_ids)
{
&mut registers_lock.uncommitted
} else if registers_lock
.committed
.contains_all(before_merge_segment_ids)
{
&mut registers_lock.committed
} else {
warn!("couldn't find segment in SegmentManager");
return;
}
};
for segment_id in before_merge_segment_ids {
target_segment_register.cancel_merge(segment_id);
}
}
// ... and we make sure the target segment entry
// can be garbage collected.
registers_lock.writing.remove(&after_merge_segment_id);
}
pub fn write_segment(&self, segment_id: SegmentId) {
let mut registers_lock = self.write();
registers_lock.writing.insert(segment_id);
}
pub fn add_segment(&self, segment_entry: SegmentEntry) {
let mut registers_lock = self.write();
registers_lock.writing.remove(&segment_entry.segment_id());
registers_lock.uncommitted.add_segment_entry(segment_entry);
}
@@ -169,6 +202,10 @@ impl SegmentManager {
after_merge_segment_entry: SegmentEntry,
) {
let mut registers_lock = self.write();
registers_lock
.writing
.remove(&after_merge_segment_entry.segment_id());
let target_register: &mut SegmentRegister = {
if registers_lock
.uncommitted
@@ -192,7 +229,6 @@ impl SegmentManager {
}
pub fn committed_segment_metas(&self) -> Vec<SegmentMeta> {
self.remove_empty_segments();
let registers_lock = self.read();
registers_lock.committed.segment_metas()
}

View File

@@ -3,7 +3,6 @@ use core::SegmentMeta;
use indexer::delete_queue::DeleteCursor;
use indexer::segment_entry::SegmentEntry;
use std::collections::HashMap;
use std::collections::HashSet;
use std::fmt::{self, Debug, Formatter};
/// The segment register keeps track
@@ -22,8 +21,8 @@ pub struct SegmentRegister {
impl Debug for SegmentRegister {
fn fmt(&self, f: &mut Formatter) -> Result<(), fmt::Error> {
write!(f, "SegmentRegister(")?;
for k in self.segment_states.keys() {
write!(f, "{}, ", k.short_uuid_string())?;
for (k, v) in &self.segment_states {
write!(f, "{}:{}, ", k.short_uuid_string(), v.state().letter_code())?;
}
write!(f, ")")?;
Ok(())
@@ -35,13 +34,14 @@ impl SegmentRegister {
self.segment_states.clear();
}
pub fn get_mergeable_segments(
&self,
in_merge_segment_ids: &HashSet<SegmentId>,
) -> Vec<SegmentMeta> {
pub fn len(&self) -> usize {
self.segment_states.len()
}
pub fn get_mergeable_segments(&self) -> Vec<SegmentMeta> {
self.segment_states
.values()
.filter(|segment_entry| !in_merge_segment_ids.contains(&segment_entry.segment_id()))
.filter(|segment_entry| segment_entry.is_ready())
.map(|segment_entry| segment_entry.meta().clone())
.collect()
}
@@ -56,11 +56,11 @@ impl SegmentRegister {
.values()
.map(|segment_entry| segment_entry.meta().clone())
.collect();
segment_ids.sort_by_key(SegmentMeta::id);
segment_ids.sort_by_key(|meta| meta.id());
segment_ids
}
pub fn contains_all(&self, segment_ids: &[SegmentId]) -> bool {
pub fn contains_all(&mut self, segment_ids: &[SegmentId]) -> bool {
segment_ids
.iter()
.all(|segment_id| self.segment_states.contains_key(segment_id))
@@ -75,8 +75,20 @@ impl SegmentRegister {
self.segment_states.remove(segment_id);
}
pub fn get(&self, segment_id: &SegmentId) -> Option<SegmentEntry> {
self.segment_states.get(segment_id).cloned()
pub fn cancel_merge(&mut self, segment_id: &SegmentId) {
self.segment_states
.get_mut(segment_id)
.expect("Received a merge notification for a segment that is not registered")
.cancel_merge();
}
pub fn start_merge(&mut self, segment_id: &SegmentId) -> Option<SegmentEntry> {
if let Some(segment_entry) = self.segment_states.get_mut(segment_id) {
segment_entry.start_merge();
Some(segment_entry.clone())
} else {
None
}
}
pub fn new(segment_metas: Vec<SegmentMeta>, delete_cursor: &DeleteCursor) -> SegmentRegister {
@@ -88,6 +100,11 @@ impl SegmentRegister {
}
SegmentRegister { segment_states }
}
#[cfg(test)]
pub fn segment_entry(&self, segment_id: &SegmentId) -> Option<SegmentEntry> {
self.segment_states.get(segment_id).cloned()
}
}
#[cfg(test)]
@@ -96,6 +113,7 @@ mod tests {
use core::SegmentId;
use core::SegmentMeta;
use indexer::delete_queue::*;
use indexer::SegmentState;
fn segment_ids(segment_register: &SegmentRegister) -> Vec<SegmentId> {
segment_register
@@ -119,12 +137,42 @@ mod tests {
let segment_entry = SegmentEntry::new(segment_meta, delete_queue.cursor(), None);
segment_register.add_segment_entry(segment_entry);
}
assert_eq!(
segment_register
.segment_entry(&segment_id_a)
.unwrap()
.state(),
SegmentState::Ready
);
assert_eq!(segment_ids(&segment_register), vec![segment_id_a]);
{
let segment_meta = SegmentMeta::new(segment_id_b, 0u32);
let segment_entry = SegmentEntry::new(segment_meta, delete_queue.cursor(), None);
segment_register.add_segment_entry(segment_entry);
}
assert_eq!(
segment_register
.segment_entry(&segment_id_b)
.unwrap()
.state(),
SegmentState::Ready
);
segment_register.start_merge(&segment_id_a);
segment_register.start_merge(&segment_id_b);
assert_eq!(
segment_register
.segment_entry(&segment_id_a)
.unwrap()
.state(),
SegmentState::InMerge
);
assert_eq!(
segment_register
.segment_entry(&segment_id_b)
.unwrap()
.state(),
SegmentState::InMerge
);
segment_register.remove_segment(&segment_id_a);
segment_register.remove_segment(&segment_id_b);
{

View File

@@ -16,10 +16,8 @@ use futures_cpupool::CpuFuture;
use futures_cpupool::CpuPool;
use indexer::delete_queue::DeleteCursor;
use indexer::index_writer::advance_deletes;
use indexer::merge_operation::MergeOperationInventory;
use indexer::merger::IndexMerger;
use indexer::stamper::Stamper;
use indexer::MergeOperation;
use indexer::SegmentEntry;
use indexer::SegmentSerializer;
use indexer::{DefaultMergePolicy, MergePolicy};
@@ -27,7 +25,6 @@ use schema::Schema;
use serde_json;
use std::borrow::BorrowMut;
use std::collections::HashMap;
use std::collections::HashSet;
use std::io::Write;
use std::mem;
use std::ops::DerefMut;
@@ -36,7 +33,6 @@ use std::sync::Arc;
use std::sync::RwLock;
use std::thread;
use std::thread::JoinHandle;
use Opstamp;
use Result;
/// Save the index meta file.
@@ -54,24 +50,31 @@ pub fn save_new_metas(schema: Schema, directory: &mut Directory) -> Result<()> {
segments: Vec::new(),
schema,
opstamp: 0u64,
payload: None,
payload: None
},
directory,
)
directory)
}
/// Save the index meta file.
/// This operation is atomic:
/// Either
/// - it fails, in which case an error is returned,
// - it fails, in which case an error is returned,
/// and the `meta.json` remains untouched,
/// - it success, and `meta.json` is written
/// and flushed.
///
/// This method is not part of tantivy's public API
fn save_metas(metas: &IndexMeta, directory: &mut Directory) -> Result<()> {
fn save_metas(
metas: &IndexMeta,
directory: &mut Directory,
) -> Result<()> {
// let metas = IndexMeta {
// segments: segment_metas,
// schema,
// opstamp,
// payload,
// };
let mut buffer = serde_json::to_vec_pretty(metas)?;
// Just adding a new line at the end of the buffer.
writeln!(&mut buffer)?;
directory.atomic_write(&META_FILEPATH, &buffer[..])?;
debug!("Saved metas {:?}", serde_json::to_string_pretty(&metas));
@@ -83,21 +86,21 @@ fn save_metas(metas: &IndexMeta, directory: &mut Directory) -> Result<()> {
//
// All this processing happens on a single thread
// consuming a common queue.
//
// We voluntarily pass a merge_operation ref to guarantee that
// the merge_operation is alive during the process
#[derive(Clone)]
pub struct SegmentUpdater(Arc<InnerSegmentUpdater>);
struct MergeOperation {
pub target_opstamp: u64,
pub segment_ids: Vec<SegmentId>,
}
fn perform_merge(
merge_operation: &MergeOperation,
index: &Index,
mut segment_entries: Vec<SegmentEntry>,
mut merged_segment: Segment,
target_opstamp: u64,
) -> Result<SegmentEntry> {
let target_opstamp = merge_operation.target_opstamp();
// first we need to apply deletes to our segment.
let mut merged_segment = index.new_segment();
// TODO add logging
let schema = index.schema();
@@ -141,13 +144,12 @@ struct InnerSegmentUpdater {
pool: CpuPool,
index: Index,
segment_manager: SegmentManager,
merge_policy: RwLock<Arc<Box<MergePolicy>>>,
merge_policy: RwLock<Box<MergePolicy>>,
merging_thread_id: AtomicUsize,
merging_threads: RwLock<HashMap<usize, JoinHandle<Result<()>>>>,
generation: AtomicUsize,
killed: AtomicBool,
stamper: Stamper,
merge_operations: MergeOperationInventory,
}
impl SegmentUpdater {
@@ -168,23 +170,28 @@ impl SegmentUpdater {
pool,
index,
segment_manager,
merge_policy: RwLock::new(Arc::new(Box::new(DefaultMergePolicy::default()))),
merge_policy: RwLock::new(Box::new(DefaultMergePolicy::default())),
merging_thread_id: AtomicUsize::default(),
merging_threads: RwLock::new(HashMap::new()),
generation: AtomicUsize::default(),
killed: AtomicBool::new(false),
stamper,
merge_operations: Default::default(),
})))
}
pub fn get_merge_policy(&self) -> Arc<Box<MergePolicy>> {
self.0.merge_policy.read().unwrap().clone()
pub fn new_segment(&self) -> Segment {
let new_segment = self.0.index.new_segment();
let segment_id = new_segment.id();
self.0.segment_manager.write_segment(segment_id);
new_segment
}
pub fn get_merge_policy(&self) -> Box<MergePolicy> {
self.0.merge_policy.read().unwrap().box_clone()
}
pub fn set_merge_policy(&self, merge_policy: Box<MergePolicy>) {
let arc_merge_policy = Arc::new(merge_policy);
*self.0.merge_policy.write().unwrap() = arc_merge_policy;
*self.0.merge_policy.write().unwrap() = merge_policy;
}
fn get_merging_thread_id(&self) -> usize {
@@ -225,7 +232,7 @@ impl SegmentUpdater {
///
/// Tne method returns copies of the segment entries,
/// updated with the delete information.
fn purge_deletes(&self, target_opstamp: Opstamp) -> Result<Vec<SegmentEntry>> {
fn purge_deletes(&self, target_opstamp: u64) -> Result<Vec<SegmentEntry>> {
let mut segment_entries = self.0.segment_manager.segment_entries();
for segment_entry in &mut segment_entries {
let segment = self.0.index.segment(segment_entry.meta().clone());
@@ -234,7 +241,7 @@ impl SegmentUpdater {
Ok(segment_entries)
}
pub fn save_metas(&self, opstamp: Opstamp, commit_message: Option<String>) {
pub fn save_metas(&self, opstamp: u64, commit_message: Option<String>) {
if self.is_alive() {
let index = &self.0.index;
let directory = index.directory();
@@ -258,10 +265,13 @@ impl SegmentUpdater {
segments: commited_segment_metas,
schema: index.schema(),
opstamp,
payload: commit_message,
payload: commit_message
};
save_metas(&index_meta, directory.box_clone().borrow_mut())
.expect("Could not save metas.");
save_metas(
&index_meta,
directory.box_clone().borrow_mut(),
)
.expect("Could not save metas.");
self.store_meta(&index_meta);
}
}
@@ -281,7 +291,7 @@ impl SegmentUpdater {
.garbage_collect(|| self.0.segment_manager.list_files());
}
pub fn commit(&self, opstamp: Opstamp, payload: Option<String>) -> Result<()> {
pub fn commit(&self, opstamp: u64, payload: Option<String>) -> Result<()> {
self.run_async(move |segment_updater| {
if segment_updater.is_alive() {
let segment_entries = segment_updater
@@ -297,14 +307,12 @@ impl SegmentUpdater {
}
pub fn start_merge(&self, segment_ids: &[SegmentId]) -> Result<Receiver<SegmentMeta>> {
let segment_ids_vec = segment_ids.to_vec();
let commit_opstamp = self.load_metas().opstamp;
let merge_operation = MergeOperation::new(
&self.0.merge_operations,
commit_opstamp,
segment_ids.to_vec(),
);
self.run_async(move |segment_updater| segment_updater.start_merge_impl(merge_operation))
.wait()?
self.run_async(move |segment_updater| {
segment_updater.start_merge_impl(&segment_ids_vec[..], commit_opstamp)
})
.wait()?
}
fn store_meta(&self, index_meta: &IndexMeta) {
@@ -315,25 +323,22 @@ impl SegmentUpdater {
}
// `segment_ids` is required to be non-empty.
fn start_merge_impl(&self, merge_operation: MergeOperation) -> Result<Receiver<SegmentMeta>> {
assert!(
!merge_operation.segment_ids().is_empty(),
"Segment_ids cannot be empty."
);
fn start_merge_impl(
&self,
segment_ids: &[SegmentId],
target_opstamp: u64,
) -> Result<Receiver<SegmentMeta>> {
assert!(!segment_ids.is_empty(), "Segment_ids cannot be empty.");
let segment_updater_clone = self.clone();
let segment_entries: Vec<SegmentEntry> = self
.0
.segment_manager
.start_merge(merge_operation.segment_ids())?;
let segment_entries: Vec<SegmentEntry> = self.0.segment_manager.start_merge(segment_ids)?;
// let segment_ids_vec = merge_operation.segment_ids.to_vec();
let segment_ids_vec = segment_ids.to_vec();
let merging_thread_id = self.get_merging_thread_id();
info!(
"Starting merge thread #{} - {:?}",
merging_thread_id,
merge_operation.segment_ids()
merging_thread_id, segment_ids
);
let (merging_future_send, merging_future_recv) = oneshot();
@@ -342,17 +347,20 @@ impl SegmentUpdater {
.name(format!("mergingthread-{}", merging_thread_id))
.spawn(move || {
// first we need to apply deletes to our segment.
let merged_segment = segment_updater_clone.new_segment();
let merged_segment_id = merged_segment.id();
let merge_result = perform_merge(
&merge_operation,
&segment_updater_clone.0.index,
segment_entries,
merged_segment,
target_opstamp,
);
match merge_result {
Ok(after_merge_segment_entry) => {
let merged_segment_meta = after_merge_segment_entry.meta().clone();
segment_updater_clone
.end_merge(merge_operation, after_merge_segment_entry)
.end_merge(segment_ids_vec, after_merge_segment_entry)
.expect("Segment updater thread is corrupted.");
// the future may fail if the listener of the oneshot future
@@ -363,18 +371,13 @@ impl SegmentUpdater {
let _merging_future_res = merging_future_send.send(merged_segment_meta);
}
Err(e) => {
warn!(
"Merge of {:?} was cancelled: {:?}",
merge_operation.segment_ids(),
e
);
warn!("Merge of {:?} was cancelled: {:?}", segment_ids_vec, e);
// ... cancel merge
if cfg!(test) {
panic!("Merge failed.");
}
// As `merge_operation` will be dropped, the segment in merge state will
// be available for merge again.
// `merging_future_send` will be dropped, sending an error to the future.
segment_updater_clone.cancel_merge(&segment_ids_vec, merged_segment_id);
// merging_future_send will be dropped, sending an error to the future.
}
}
segment_updater_clone
@@ -395,35 +398,37 @@ impl SegmentUpdater {
}
fn consider_merge_options(&self) {
let merge_segment_ids: HashSet<SegmentId> = self.0.merge_operations.segment_in_merge();
let (committed_segments, uncommitted_segments) =
get_mergeable_segments(&merge_segment_ids, &self.0.segment_manager);
get_mergeable_segments(&self.0.segment_manager);
// Committed segments cannot be merged with uncommitted_segments.
// We therefore consider merges using these two sets of segments independently.
let merge_policy = self.get_merge_policy();
let current_opstamp = self.0.stamper.stamp();
let mut merge_candidates: Vec<MergeOperation> = merge_policy
let mut merge_candidates = merge_policy
.compute_merge_candidates(&uncommitted_segments)
.into_iter()
.map(|merge_candidate| {
MergeOperation::new(&self.0.merge_operations, current_opstamp, merge_candidate.0)
.map(|merge_candidate| MergeOperation {
target_opstamp: current_opstamp,
segment_ids: merge_candidate.0,
})
.collect();
.collect::<Vec<_>>();
let commit_opstamp = self.load_metas().opstamp;
let committed_merge_candidates = merge_policy
.compute_merge_candidates(&committed_segments)
.into_iter()
.map(|merge_candidate| {
MergeOperation::new(&self.0.merge_operations, commit_opstamp, merge_candidate.0)
.map(|merge_candidate| MergeOperation {
target_opstamp: commit_opstamp,
segment_ids: merge_candidate.0,
})
.collect::<Vec<_>>();
merge_candidates.extend(committed_merge_candidates.into_iter());
for merge_operation in merge_candidates {
match self.start_merge_impl(merge_operation) {
for MergeOperation {
target_opstamp,
segment_ids,
} in merge_candidates
{
match self.start_merge_impl(&segment_ids, target_opstamp) {
Ok(merge_future) => {
if let Err(e) = merge_future.fuse().poll() {
error!("The merge task failed quickly after starting: {:?}", e);
@@ -439,9 +444,19 @@ impl SegmentUpdater {
}
}
fn cancel_merge(
&self,
before_merge_segment_ids: &[SegmentId],
after_merge_segment_entry: SegmentId,
) {
self.0
.segment_manager
.cancel_merge(before_merge_segment_ids, after_merge_segment_entry);
}
fn end_merge(
&self,
merge_operation: MergeOperation,
before_merge_segment_ids: Vec<SegmentId>,
mut after_merge_segment_entry: SegmentEntry,
) -> Result<()> {
self.run_async(move |segment_updater| {
@@ -457,15 +472,16 @@ impl SegmentUpdater {
{
error!(
"Merge of {:?} was cancelled (advancing deletes failed): {:?}",
merge_operation.segment_ids(),
e
before_merge_segment_ids, e
);
// ... cancel merge
if cfg!(test) {
panic!("Merge failed.");
}
// ... cancel merge
// `merge_operations` are tracked. As it is dropped, the
// the segment_ids will be available again for merge.
segment_updater.cancel_merge(
&before_merge_segment_ids,
after_merge_segment_entry.segment_id(),
);
return;
}
}
@@ -473,7 +489,7 @@ impl SegmentUpdater {
segment_updater
.0
.segment_manager
.end_merge(merge_operation.segment_ids(), after_merge_segment_entry);
.end_merge(&before_merge_segment_ids, after_merge_segment_entry);
segment_updater.consider_merge_options();
info!("save metas");
let previous_metas = segment_updater.load_metas();
@@ -499,25 +515,32 @@ impl SegmentUpdater {
/// Obsolete files will eventually be cleaned up
/// by the directory garbage collector.
pub fn wait_merging_thread(&self) -> Result<()> {
let mut num_segments: usize;
loop {
let merging_threads: HashMap<usize, JoinHandle<Result<()>>> = {
num_segments = self.0.segment_manager.num_segments();
let mut new_merging_threads = HashMap::new();
{
let mut merging_threads = self.0.merging_threads.write().unwrap();
mem::replace(merging_threads.deref_mut(), HashMap::new())
};
if merging_threads.is_empty() {
return Ok(());
mem::swap(&mut new_merging_threads, merging_threads.deref_mut());
}
debug!("wait merging thread {}", merging_threads.len());
for (_, merging_thread_handle) in merging_threads {
debug!("wait merging thread {}", new_merging_threads.len());
for (_, merging_thread_handle) in new_merging_threads {
merging_thread_handle
.join()
.map(|_| ())
.map_err(|_| TantivyError::ErrorInThread("Merging thread failed.".into()))?;
}
// Our merging thread may have queued their completed merged segment.
// Let's wait for that too.
// Our merging thread may have queued their completed
self.run_async(move |_| {}).wait()?;
let new_num_segments = self.0.segment_manager.num_segments();
if new_num_segments >= num_segments {
break;
}
}
Ok(())
}
}
@@ -537,7 +560,7 @@ mod tests {
let index = Index::create_in_ram(schema);
// writing the segment
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
index_writer.set_merge_policy(Box::new(MergeWheneverPossible));
{
@@ -567,8 +590,9 @@ mod tests {
index_writer.delete_term(term);
assert!(index_writer.commit().is_ok());
}
let reader = index.reader().unwrap();
assert_eq!(reader.searcher().num_docs(), 302);
index.load_searchers().unwrap();
assert_eq!(index.searcher().num_docs(), 302);
{
index_writer
@@ -576,79 +600,8 @@ mod tests {
.expect("waiting for merging threads");
}
reader.reload().unwrap();
assert_eq!(reader.searcher().segment_readers().len(), 1);
assert_eq!(reader.searcher().num_docs(), 302);
}
#[test]
fn delete_all_docs() {
let mut schema_builder = Schema::builder();
let text_field = schema_builder.add_text_field("text", TEXT);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
// writing the segment
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
{
for _ in 0..100 {
index_writer.add_document(doc!(text_field=>"a"));
index_writer.add_document(doc!(text_field=>"b"));
}
assert!(index_writer.commit().is_ok());
}
{
for _ in 0..100 {
index_writer.add_document(doc!(text_field=>"c"));
index_writer.add_document(doc!(text_field=>"d"));
}
assert!(index_writer.commit().is_ok());
}
{
index_writer.add_document(doc!(text_field=>"e"));
index_writer.add_document(doc!(text_field=>"f"));
assert!(index_writer.commit().is_ok());
}
{
let seg_ids = index
.searchable_segment_ids()
.expect("Searchable segments failed.");
// docs exist, should have at least 1 segment
assert!(seg_ids.len() > 0);
}
{
let term_vals = vec!["a", "b", "c", "d", "e", "f"];
for term_val in term_vals {
let term = Term::from_field_text(text_field, term_val);
index_writer.delete_term(term);
assert!(index_writer.commit().is_ok());
}
}
{
index_writer
.wait_merging_threads()
.expect("waiting for merging threads");
}
let reader = index.reader().unwrap();
assert_eq!(reader.searcher().num_docs(), 0);
let seg_ids = index
.searchable_segment_ids()
.expect("Searchable segments failed.");
assert!(seg_ids.is_empty());
reader.reload().unwrap();
assert_eq!(reader.searcher().num_docs(), 0);
// empty segments should be erased
assert!(index.searchable_segment_metas().unwrap().is_empty());
assert!(reader.searcher().segment_readers().is_empty());
index.load_searchers().unwrap();
assert_eq!(index.searcher().segment_readers().len(), 1);
assert_eq!(index.searcher().num_docs(), 302);
}
}

View File

@@ -5,7 +5,6 @@ use fastfield::FastFieldsWriter;
use fieldnorm::FieldNormsWriter;
use indexer::segment_serializer::SegmentSerializer;
use postings::MultiFieldPostingsWriter;
use schema::FieldEntry;
use schema::FieldType;
use schema::Schema;
use schema::Term;
@@ -16,7 +15,6 @@ use tokenizer::BoxedTokenizer;
use tokenizer::FacetTokenizer;
use tokenizer::{TokenStream, Tokenizer};
use DocId;
use Opstamp;
use Result;
/// A `SegmentWriter` is in charge of creating segment index from a
@@ -30,7 +28,7 @@ pub struct SegmentWriter {
segment_serializer: SegmentSerializer,
fast_field_writers: FastFieldsWriter,
fieldnorms_writer: FieldNormsWriter,
doc_opstamps: Vec<Opstamp>,
doc_opstamps: Vec<u64>,
tokenizers: Vec<Option<Box<BoxedTokenizer>>>,
}
@@ -55,7 +53,7 @@ impl SegmentWriter {
schema
.fields()
.iter()
.map(FieldEntry::field_type)
.map(|field_entry| field_entry.field_type())
.map(|field_type| match *field_type {
FieldType::Str(ref text_options) => text_options
.get_indexing_options()
@@ -173,17 +171,6 @@ impl SegmentWriter {
}
}
}
FieldType::Date(ref int_option) => {
if int_option.is_indexed() {
for field_value in field_values {
let term = Term::from_field_i64(
field_value.field(),
field_value.value().date_value().timestamp(),
);
self.multifield_postings.subscribe(doc_id, &term);
}
}
}
FieldType::I64(ref int_option) => {
if int_option.is_indexed() {
for field_value in field_values {

View File

@@ -1,35 +1,69 @@
use std::ops::Range;
use std::sync::atomic::{AtomicU64, Ordering};
use std::sync::Arc;
use Opstamp;
use std::sync::atomic::Ordering;
/// Stamper provides Opstamps, which is just an auto-increment id to label
/// an operation.
///
/// Cloning does not "fork" the stamp generation. The stamper actually wraps an `Arc`.
#[derive(Clone, Default)]
pub struct Stamper(Arc<AtomicU64>);
impl Stamper {
pub fn new(first_opstamp: Opstamp) -> Stamper {
Stamper(Arc::new(AtomicU64::new(first_opstamp)))
}
// AtomicU64 have not landed in stable.
// For the moment let's just use AtomicUsize on
// x86/64 bit platform, and a mutex on other platform.
#[cfg(target_arch = "x86_64")]
mod archicture_impl {
pub fn stamp(&self) -> Opstamp {
self.0.fetch_add(1u64, Ordering::SeqCst) as u64
}
use std::sync::atomic::{AtomicUsize, Ordering};
/// Given a desired count `n`, `stamps` returns an iterator that
/// will supply `n` number of u64 stamps.
pub fn stamps(&self, n: u64) -> Range<Opstamp> {
let start = self.0.fetch_add(n, Ordering::SeqCst);
Range {
start,
end: start + n,
#[derive(Default)]
pub struct AtomicU64Ersatz(AtomicUsize);
impl AtomicU64Ersatz {
pub fn new(first_opstamp: u64) -> AtomicU64Ersatz {
AtomicU64Ersatz(AtomicUsize::new(first_opstamp as usize))
}
pub fn fetch_add(&self, val: u64, order: Ordering) -> u64 {
self.0.fetch_add(val as usize, order) as u64
}
}
}
#[cfg(not(target_arch = "x86_64"))]
mod archicture_impl {
use std::sync::atomic::Ordering;
/// Under other architecture, we rely on a mutex.
use std::sync::RwLock;
#[derive(Default)]
pub struct AtomicU64Ersatz(RwLock<u64>);
impl AtomicU64Ersatz {
pub fn new(first_opstamp: u64) -> AtomicU64Ersatz {
AtomicU64Ersatz(RwLock::new(first_opstamp))
}
pub fn fetch_add(&self, incr: u64, _order: Ordering) -> u64 {
let mut lock = self.0.write().unwrap();
let previous_val = *lock;
*lock = previous_val + incr;
previous_val
}
}
}
use self::archicture_impl::AtomicU64Ersatz;
#[derive(Clone, Default)]
pub struct Stamper(Arc<AtomicU64Ersatz>);
impl Stamper {
pub fn new(first_opstamp: u64) -> Stamper {
Stamper(Arc::new(AtomicU64Ersatz::new(first_opstamp)))
}
pub fn stamp(&self) -> u64 {
self.0.fetch_add(1u64, Ordering::SeqCst) as u64
}
}
#[cfg(test)]
mod test {
@@ -46,8 +80,5 @@ mod test {
assert_eq!(stamper.stamp(), 10u64);
assert_eq!(stamper_clone.stamp(), 11u64);
assert_eq!(stamper.stamps(3u64), (12..15));
assert_eq!(stamper.stamp(), 15u64);
}
}

View File

@@ -75,9 +75,9 @@
//!
//! // # Searching
//!
//! let reader = index.reader()?;
//! index.load_searchers()?;
//!
//! let searcher = reader.searcher();
//! let searcher = index.searcher();
//!
//! let query_parser = QueryParser::for_index(&index, vec![title, body]);
//!
@@ -129,24 +129,25 @@ extern crate base64;
extern crate bit_set;
extern crate bitpacking;
extern crate byteorder;
extern crate scoped_pool;
extern crate combine;
extern crate crossbeam;
extern crate fnv;
extern crate fst;
extern crate fst_regex;
extern crate futures;
extern crate futures_cpupool;
extern crate htmlescape;
extern crate itertools;
extern crate levenshtein_automata;
#[cfg(feature = "mmap")]
extern crate memmap;
extern crate num_cpus;
extern crate owning_ref;
extern crate regex;
extern crate rust_stemmers;
extern crate scoped_pool;
extern crate serde;
extern crate stable_deref_trait;
extern crate tantivy_fst;
extern crate tempdir;
extern crate tempfile;
extern crate uuid;
@@ -169,12 +170,11 @@ extern crate maplit;
extern crate test;
#[macro_use]
extern crate downcast_rs;
extern crate downcast;
#[macro_use]
extern crate fail;
#[cfg(feature = "mmap")]
#[cfg(test)]
mod functional_test;
@@ -187,15 +187,11 @@ pub use error::TantivyError;
pub use error::TantivyError as Error;
extern crate census;
pub extern crate chrono;
extern crate owned_read;
/// Tantivy result.
pub type Result<T> = std::result::Result<T, error::TantivyError>;
/// Tantivy DateTime
pub type DateTime = chrono::DateTime<chrono::Utc>;
mod common;
mod core;
mod indexer;
@@ -216,9 +212,6 @@ pub mod space_usage;
pub mod store;
pub mod termdict;
mod reader;
pub use self::reader::{IndexReader, IndexReaderBuilder, ReloadPolicy};
mod snippet;
pub use self::snippet::{Snippet, SnippetGenerator};
@@ -226,7 +219,7 @@ mod docset;
pub use self::docset::{DocSet, SkipResult};
pub use core::SegmentComponent;
pub use core::{Index, Searcher, Segment, SegmentId, SegmentMeta, IndexMeta};
pub use core::{Index, Searcher, Segment, SegmentId, SegmentMeta};
pub use core::{InvertedIndexReader, SegmentReader};
pub use directory::Directory;
pub use indexer::IndexWriter;
@@ -238,7 +231,11 @@ pub use common::{i64_to_u64, u64_to_i64};
/// Expose the current version of tantivy, as well
/// whether it was compiled with the simd compression.
pub fn version() -> &'static str {
env!("CARGO_PKG_VERSION")
if cfg!(feature = "simdcompression") {
concat!(env!("CARGO_PKG_VERSION"), "-simd")
} else {
concat!(env!("CARGO_PKG_VERSION"), "-nosimd")
}
}
/// Defines tantivy's merging strategy
@@ -254,16 +251,6 @@ pub mod merge_policy {
/// as they are added in the segment.
pub type DocId = u32;
/// A u64 assigned to every operation incrementally
///
/// All operations modifying the index receives an monotonic Opstamp.
/// The resulting state of the index is consistent with the opstamp ordering.
///
/// For instance, a commit with opstamp `32_423` will reflect all Add and Delete operations
/// with an opstamp `<= 32_423`. A delete operation with opstamp n will no affect a document added
/// with opstamp `n+1`.
pub type Opstamp = u64;
/// A f32 that represents the relevance of the document to the query
///
/// This is modelled internally as a `f32`. The
@@ -317,7 +304,6 @@ mod tests {
use Index;
use IndexWriter;
use Postings;
use ReloadPolicy;
pub fn assert_nearly_equals(expected: f32, val: f32) {
assert!(
@@ -362,7 +348,7 @@ mod tests {
let index = Index::create_from_tempdir(schema).unwrap();
{
// writing the segment
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
{
let doc = doc!(text_field=>"af b");
index_writer.add_document(doc);
@@ -384,7 +370,7 @@ mod tests {
let mut schema_builder = Schema::builder();
let text_field = schema_builder.add_text_field("text", TEXT);
let index = Index::create_in_ram(schema_builder.build());
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
{
index_writer.add_document(doc!(text_field=>"a b c"));
index_writer.commit().unwrap();
@@ -406,8 +392,8 @@ mod tests {
index_writer.commit().unwrap();
}
{
let reader = index.reader().unwrap();
let searcher = reader.searcher();
index.load_searchers().unwrap();
let searcher = index.searcher();
let term_a = Term::from_field_text(text_field, "a");
assert_eq!(searcher.doc_freq(&term_a), 3);
let term_b = Term::from_field_text(text_field, "b");
@@ -426,7 +412,7 @@ mod tests {
let text_field = schema_builder.add_text_field("text", TEXT);
let index = Index::create_in_ram(schema_builder.build());
{
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
{
let doc = doc!(text_field=>"a b c");
index_writer.add_document(doc);
@@ -434,8 +420,8 @@ mod tests {
index_writer.commit().unwrap();
}
{
let index_reader = index.reader().unwrap();
let searcher = index_reader.searcher();
index.load_searchers().unwrap();
let searcher = index.searcher();
let reader = searcher.segment_reader(0);
{
let fieldnorm_reader = reader.get_fieldnorms_reader(text_field);
@@ -454,7 +440,7 @@ mod tests {
let text_field = schema_builder.add_text_field("text", TEXT);
let index = Index::create_in_ram(schema_builder.build());
{
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
{
let doc = doc!(text_field=>"a b c");
index_writer.add_document(doc);
@@ -470,8 +456,8 @@ mod tests {
index_writer.commit().unwrap();
}
{
let reader = index.reader().unwrap();
let searcher = reader.searcher();
index.load_searchers().unwrap();
let searcher = index.searcher();
let segment_reader: &SegmentReader = searcher.segment_reader(0);
let fieldnorms_reader = segment_reader.get_fieldnorms_reader(text_field);
assert_eq!(fieldnorms_reader.fieldnorm(0), 3);
@@ -499,14 +485,9 @@ mod tests {
let term_c = Term::from_field_text(text_field, "c");
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let reader = index
.reader_builder()
.reload_policy(ReloadPolicy::Manual)
.try_into()
.unwrap();
{
// writing the segment
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
// 0
index_writer.add_document(doc!(text_field=>"a b"));
// 1
@@ -525,10 +506,10 @@ mod tests {
index_writer.commit().unwrap();
}
{
reader.reload().unwrap();
let searcher = reader.searcher();
let segment_reader = searcher.segment_reader(0);
let inverted_index = segment_reader.inverted_index(text_field);
index.load_searchers().unwrap();
let searcher = index.searcher();
let reader = searcher.segment_reader(0);
let inverted_index = reader.inverted_index(text_field);
assert!(inverted_index
.read_postings(&term_abcd, IndexRecordOption::WithFreqsAndPositions)
.is_none());
@@ -536,24 +517,24 @@ mod tests {
let mut postings = inverted_index
.read_postings(&term_a, IndexRecordOption::WithFreqsAndPositions)
.unwrap();
assert!(advance_undeleted(&mut postings, segment_reader));
assert!(advance_undeleted(&mut postings, reader));
assert_eq!(postings.doc(), 5);
assert!(!advance_undeleted(&mut postings, segment_reader));
assert!(!advance_undeleted(&mut postings, reader));
}
{
let mut postings = inverted_index
.read_postings(&term_b, IndexRecordOption::WithFreqsAndPositions)
.unwrap();
assert!(advance_undeleted(&mut postings, segment_reader));
assert!(advance_undeleted(&mut postings, reader));
assert_eq!(postings.doc(), 3);
assert!(advance_undeleted(&mut postings, segment_reader));
assert!(advance_undeleted(&mut postings, reader));
assert_eq!(postings.doc(), 4);
assert!(!advance_undeleted(&mut postings, segment_reader));
assert!(!advance_undeleted(&mut postings, reader));
}
}
{
// writing the segment
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
// 0
index_writer.add_document(doc!(text_field=>"a b"));
// 1
@@ -561,10 +542,10 @@ mod tests {
index_writer.rollback().unwrap();
}
{
reader.reload().unwrap();
let searcher = reader.searcher();
let seg_reader = searcher.segment_reader(0);
let inverted_index = seg_reader.inverted_index(term_abcd.field());
index.load_searchers().unwrap();
let searcher = index.searcher();
let reader = searcher.segment_reader(0);
let inverted_index = reader.inverted_index(term_abcd.field());
assert!(inverted_index
.read_postings(&term_abcd, IndexRecordOption::WithFreqsAndPositions)
@@ -573,24 +554,24 @@ mod tests {
let mut postings = inverted_index
.read_postings(&term_a, IndexRecordOption::WithFreqsAndPositions)
.unwrap();
assert!(advance_undeleted(&mut postings, seg_reader));
assert!(advance_undeleted(&mut postings, reader));
assert_eq!(postings.doc(), 5);
assert!(!advance_undeleted(&mut postings, seg_reader));
assert!(!advance_undeleted(&mut postings, reader));
}
{
let mut postings = inverted_index
.read_postings(&term_b, IndexRecordOption::WithFreqsAndPositions)
.unwrap();
assert!(advance_undeleted(&mut postings, seg_reader));
assert!(advance_undeleted(&mut postings, reader));
assert_eq!(postings.doc(), 3);
assert!(advance_undeleted(&mut postings, seg_reader));
assert!(advance_undeleted(&mut postings, reader));
assert_eq!(postings.doc(), 4);
assert!(!advance_undeleted(&mut postings, seg_reader));
assert!(!advance_undeleted(&mut postings, reader));
}
}
{
// writing the segment
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
index_writer.add_document(doc!(text_field=>"a b"));
index_writer.delete_term(Term::from_field_text(text_field, "c"));
index_writer.rollback().unwrap();
@@ -598,10 +579,10 @@ mod tests {
index_writer.commit().unwrap();
}
{
reader.reload().unwrap();
let searcher = reader.searcher();
let segment_reader = searcher.segment_reader(0);
let inverted_index = segment_reader.inverted_index(term_abcd.field());
index.load_searchers().unwrap();
let searcher = index.searcher();
let reader = searcher.segment_reader(0);
let inverted_index = reader.inverted_index(term_abcd.field());
assert!(inverted_index
.read_postings(&term_abcd, IndexRecordOption::WithFreqsAndPositions)
.is_none());
@@ -609,25 +590,25 @@ mod tests {
let mut postings = inverted_index
.read_postings(&term_a, IndexRecordOption::WithFreqsAndPositions)
.unwrap();
assert!(!advance_undeleted(&mut postings, segment_reader));
assert!(!advance_undeleted(&mut postings, reader));
}
{
let mut postings = inverted_index
.read_postings(&term_b, IndexRecordOption::WithFreqsAndPositions)
.unwrap();
assert!(advance_undeleted(&mut postings, segment_reader));
assert!(advance_undeleted(&mut postings, reader));
assert_eq!(postings.doc(), 3);
assert!(advance_undeleted(&mut postings, segment_reader));
assert!(advance_undeleted(&mut postings, reader));
assert_eq!(postings.doc(), 4);
assert!(!advance_undeleted(&mut postings, segment_reader));
assert!(!advance_undeleted(&mut postings, reader));
}
{
let mut postings = inverted_index
.read_postings(&term_c, IndexRecordOption::WithFreqsAndPositions)
.unwrap();
assert!(advance_undeleted(&mut postings, segment_reader));
assert!(advance_undeleted(&mut postings, reader));
assert_eq!(postings.doc(), 4);
assert!(!advance_undeleted(&mut postings, segment_reader));
assert!(!advance_undeleted(&mut postings, reader));
}
}
}
@@ -635,15 +616,15 @@ mod tests {
#[test]
fn test_indexed_u64() {
let mut schema_builder = Schema::builder();
let field = schema_builder.add_u64_field("value", INDEXED);
let field = schema_builder.add_u64_field("value", INT_INDEXED);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
index_writer.add_document(doc!(field=>1u64));
index_writer.commit().unwrap();
let reader = index.reader().unwrap();
let searcher = reader.searcher();
index.load_searchers().unwrap();
let searcher = index.searcher();
let term = Term::from_field_u64(field, 1u64);
let mut postings = searcher
.segment_reader(0)
@@ -658,16 +639,16 @@ mod tests {
#[test]
fn test_indexed_i64() {
let mut schema_builder = Schema::builder();
let value_field = schema_builder.add_i64_field("value", INDEXED);
let value_field = schema_builder.add_i64_field("value", INT_INDEXED);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
let negative_val = -1i64;
index_writer.add_document(doc!(value_field => negative_val));
index_writer.commit().unwrap();
let reader = index.reader().unwrap();
let searcher = reader.searcher();
index.load_searchers().unwrap();
let searcher = index.searcher();
let term = Term::from_field_i64(value_field, negative_val);
let mut postings = searcher
.segment_reader(0)
@@ -686,11 +667,11 @@ mod tests {
let absent_field = schema_builder.add_text_field("text", TEXT);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_with_num_threads(2, 6_000_000).unwrap();
let mut index_writer = index.writer_with_num_threads(2, 40_000_000).unwrap();
index_writer.add_document(doc!(text_field=>"a"));
assert!(index_writer.commit().is_ok());
let reader = index.reader().unwrap();
let searcher = reader.searcher();
assert!(index.load_searchers().is_ok());
let searcher = index.searcher();
let segment_reader = searcher.segment_reader(0);
segment_reader.inverted_index(absent_field); //< should not panic
}
@@ -701,14 +682,9 @@ mod tests {
let text_field = schema_builder.add_text_field("text", TEXT);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let reader = index
.reader_builder()
.reload_policy(ReloadPolicy::Manual)
.try_into()
.unwrap();
// writing the segment
let mut index_writer = index.writer_with_num_threads(2, 6_000_000).unwrap();
let mut index_writer = index.writer_with_num_threads(2, 40_000_000).unwrap();
let add_document = |index_writer: &mut IndexWriter, val: &'static str| {
let doc = doc!(text_field=>val);
@@ -731,8 +707,8 @@ mod tests {
remove_document(&mut index_writer, "38");
remove_document(&mut index_writer, "34");
index_writer.commit().unwrap();
reader.reload().unwrap();
let searcher = reader.searcher();
index.load_searchers().unwrap();
let searcher = index.searcher();
assert_eq!(searcher.num_docs(), 6);
}
@@ -744,7 +720,7 @@ mod tests {
let index = Index::create_in_ram(schema);
{
// writing the segment
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
{
let doc = doc!(text_field=>"af af af bc bc");
index_writer.add_document(doc);
@@ -752,8 +728,8 @@ mod tests {
index_writer.commit().unwrap();
}
{
let index_reader = index.reader().unwrap();
let searcher = index_reader.searcher();
index.load_searchers().unwrap();
let searcher = index.searcher();
let reader = searcher.segment_reader(0);
let inverted_index = reader.inverted_index(text_field);
let term_abcd = Term::from_field_text(text_field, "abcd");
@@ -777,18 +753,18 @@ mod tests {
let text_field = schema_builder.add_text_field("text", TEXT);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let reader = index.reader().unwrap();
{
// writing the segment
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
index_writer.add_document(doc!(text_field=>"af af af b"));
index_writer.add_document(doc!(text_field=>"a b c"));
index_writer.add_document(doc!(text_field=>"a b c d"));
index_writer.commit().unwrap();
}
{
reader.reload().unwrap();
let searcher = reader.searcher();
index.load_searchers().unwrap();
let searcher = index.searcher();
let get_doc_ids = |terms: Vec<Term>| {
let query = BooleanQuery::new_multiterms_query(terms);
let topdocs = searcher.search(&query, &TestCollector).unwrap();
@@ -830,22 +806,25 @@ mod tests {
let text_field = schema_builder.add_text_field("text", TEXT);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let reader = index
.reader_builder()
.reload_policy(ReloadPolicy::Manual)
.try_into()
.unwrap();
assert_eq!(reader.searcher().num_docs(), 0u64);
{
// writing the segment
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
index_writer.add_document(doc!(text_field=>"af b"));
index_writer.add_document(doc!(text_field=>"a b c"));
index_writer.add_document(doc!(text_field=>"a b c d"));
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
{
let doc = doc!(text_field=>"af b");
index_writer.add_document(doc);
}
{
let doc = doc!(text_field=>"a b c");
index_writer.add_document(doc);
}
{
let doc = doc!(text_field=>"a b c d");
index_writer.add_document(doc);
}
index_writer.commit().unwrap();
}
reader.reload().unwrap();
assert_eq!(reader.searcher().num_docs(), 3u64);
index.searcher();
}
#[test]
@@ -872,7 +851,7 @@ mod tests {
let fast_field_unsigned = schema_builder.add_u64_field("unsigned", FAST);
let fast_field_signed = schema_builder.add_i64_field("signed", FAST);
let text_field = schema_builder.add_text_field("text", TEXT);
let stored_int_field = schema_builder.add_u64_field("text", STORED);
let stored_int_field = schema_builder.add_u64_field("text", INT_STORED);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
@@ -882,32 +861,33 @@ mod tests {
index_writer.add_document(document);
index_writer.commit().unwrap();
}
let reader = index.reader().unwrap();
let searcher = reader.searcher();
index.load_searchers().unwrap();
let searcher = index.searcher();
let segment_reader: &SegmentReader = searcher.segment_reader(0);
{
let fast_field_reader_opt = segment_reader.fast_fields().u64(text_field);
assert!(fast_field_reader_opt.is_none());
let fast_field_reader_res = segment_reader.fast_field_reader::<u64>(text_field);
assert!(fast_field_reader_res.is_err());
}
{
let fast_field_reader_opt = segment_reader.fast_fields().u64(stored_int_field);
assert!(fast_field_reader_opt.is_none());
let fast_field_reader_res = segment_reader.fast_field_reader::<u64>(stored_int_field);
assert!(fast_field_reader_res.is_err());
}
{
let fast_field_reader_opt = segment_reader.fast_fields().u64(fast_field_signed);
assert!(fast_field_reader_opt.is_none());
let fast_field_reader_res = segment_reader.fast_field_reader::<u64>(fast_field_signed);
assert!(fast_field_reader_res.is_err());
}
{
let fast_field_reader_opt = segment_reader.fast_fields().i64(fast_field_signed);
assert!(fast_field_reader_opt.is_some());
let fast_field_reader = fast_field_reader_opt.unwrap();
let fast_field_reader_res = segment_reader.fast_field_reader::<i64>(fast_field_signed);
assert!(fast_field_reader_res.is_ok());
let fast_field_reader = fast_field_reader_res.unwrap();
assert_eq!(fast_field_reader.get(0), 4i64)
}
{
let fast_field_reader_opt = segment_reader.fast_fields().i64(fast_field_signed);
assert!(fast_field_reader_opt.is_some());
let fast_field_reader = fast_field_reader_opt.unwrap();
let fast_field_reader_res = segment_reader.fast_field_reader::<i64>(fast_field_signed);
assert!(fast_field_reader_res.is_ok());
let fast_field_reader = fast_field_reader_res.unwrap();
assert_eq!(fast_field_reader.get(0), 4i64)
}
}

View File

@@ -61,7 +61,7 @@ macro_rules! doc(
};
// if there is a trailing comma retry with the trailing comma stripped.
($($field:expr => $value:expr),+ ,) => {
doc!( $( $field => $value ), *)
doc!( $( $field => $value ), *);
};
);

View File

@@ -34,6 +34,10 @@ const COMPRESSION_BLOCK_SIZE: usize = BitPacker4x::BLOCK_LEN;
const LONG_SKIP_IN_BLOCKS: usize = 1_024;
const LONG_SKIP_INTERVAL: u64 = (LONG_SKIP_IN_BLOCKS * COMPRESSION_BLOCK_SIZE) as u64;
lazy_static! {
static ref BIT_PACKER: BitPacker4x = BitPacker4x::new();
}
#[cfg(test)]
pub mod tests {

View File

@@ -1,23 +1,4 @@
/// Positions works as a long sequence of compressed block.
/// All terms are chained one after the other.
///
/// When accessing the position of a term, we get a positions_idx from the `Terminfo`.
/// This means we need to skip to the `nth` positions efficiently.
///
/// This is done thanks to two levels of skiping that we refer to in the code
/// as `long_skip` and `short_skip`.
///
/// The `long_skip` makes it possible to skip every 1_024 compression blocks (= 131_072 positions).
/// Skipping offset are simply stored one after as an offset stored over 8 bytes.
///
/// We find the number of long skips, as `n / long_skip`.
///
/// Blocks are compressed using bitpacking, so `skip_read` contains the number of bytes
/// (values can go from 0bit to 32 bits) required to decompressed every block.
///
/// A given block obviously takes `(128 x num_bit_for_the_block / num_bits_in_a_byte)`,
/// so skipping a block without decompressing it is just a matter of advancing that many
/// bytes.
use super::BIT_PACKER;
use bitpacking::{BitPacker, BitPacker4x};
use common::{BinarySerializable, FixedSize};
use directory::ReadOnlySource;
@@ -27,65 +8,9 @@ use positions::LONG_SKIP_INTERVAL;
use positions::LONG_SKIP_IN_BLOCKS;
use postings::compression::compressed_block_size;
struct Positions {
bit_packer: BitPacker4x,
skip_source: ReadOnlySource,
position_source: ReadOnlySource,
long_skip_source: ReadOnlySource,
}
impl Positions {
pub fn new(position_source: ReadOnlySource, skip_source: ReadOnlySource) -> Positions {
let skip_len = skip_source.len();
let (body, footer) = skip_source.split(skip_len - u32::SIZE_IN_BYTES);
let num_long_skips = u32::deserialize(&mut footer.as_slice()).expect("Index corrupted");
let body_split = body.len() - u64::SIZE_IN_BYTES * (num_long_skips as usize);
let (skip_source, long_skip_source) = body.split(body_split);
Positions {
bit_packer: BitPacker4x::new(),
skip_source,
long_skip_source,
position_source,
}
}
/// Returns the offset of the block associated to the given `long_skip_id`.
///
/// One `long_skip_id` means `LONG_SKIP_IN_BLOCKS` blocks.
fn long_skip(&self, long_skip_id: usize) -> u64 {
if long_skip_id == 0 {
return 0;
}
let long_skip_slice = self.long_skip_source.as_slice();
let mut long_skip_blocks: &[u8] = &long_skip_slice[(long_skip_id - 1) * 8..][..8];
u64::deserialize(&mut long_skip_blocks).expect("Index corrupted")
}
fn reader(&self, offset: u64) -> PositionReader {
let long_skip_id = (offset / LONG_SKIP_INTERVAL) as usize;
let small_skip = (offset % LONG_SKIP_INTERVAL) as usize;
let offset_num_bytes: u64 = self.long_skip(long_skip_id);
let mut position_read = OwnedRead::new(self.position_source.clone());
position_read.advance(offset_num_bytes as usize);
let mut skip_read = OwnedRead::new(self.skip_source.clone());
skip_read.advance(long_skip_id * LONG_SKIP_IN_BLOCKS);
let mut position_reader = PositionReader {
bit_packer: self.bit_packer,
skip_read,
position_read,
inner_offset: 0,
buffer: Box::new([0u32; 128]),
ahead: None,
};
position_reader.skip(small_skip);
position_reader
}
}
pub struct PositionReader {
skip_read: OwnedRead,
position_read: OwnedRead,
bit_packer: BitPacker4x,
inner_offset: usize,
buffer: Box<[u32; 128]>,
ahead: Option<usize>, // if None, no block is loaded.
@@ -102,7 +27,6 @@ pub struct PositionReader {
// If the requested number of els ends exactly at a given block, the next
// block is not decompressed.
fn read_impl(
bit_packer: BitPacker4x,
mut position: &[u8],
buffer: &mut [u32; 128],
mut inner_offset: usize,
@@ -113,23 +37,21 @@ fn read_impl(
let mut output_len = output.len();
let mut ahead = 0;
loop {
let available_len = COMPRESSION_BLOCK_SIZE - inner_offset;
// We have enough elements in the current block.
// Let's copy the requested elements in the output buffer,
// and return.
let available_len = 128 - inner_offset;
if output_len <= available_len {
output[output_start..].copy_from_slice(&buffer[inner_offset..][..output_len]);
return ahead;
} else {
output[output_start..][..available_len].copy_from_slice(&buffer[inner_offset..]);
output_len -= available_len;
output_start += available_len;
inner_offset = 0;
let num_bits = num_bits[ahead];
BitPacker4x::new().decompress(position, &mut buffer[..], num_bits);
let block_len = compressed_block_size(num_bits);
position = &position[block_len..];
ahead += 1;
}
output[output_start..][..available_len].copy_from_slice(&buffer[inner_offset..]);
output_len -= available_len;
output_start += available_len;
inner_offset = 0;
let num_bits = num_bits[ahead];
bit_packer.decompress(position, &mut buffer[..], num_bits);
let block_len = compressed_block_size(num_bits);
position = &position[block_len..];
ahead += 1;
}
}
@@ -139,7 +61,35 @@ impl PositionReader {
skip_source: ReadOnlySource,
offset: u64,
) -> PositionReader {
Positions::new(position_source, skip_source).reader(offset)
let skip_len = skip_source.len();
let (body, footer) = skip_source.split(skip_len - u32::SIZE_IN_BYTES);
let num_long_skips = u32::deserialize(&mut footer.as_slice()).expect("Index corrupted");
let body_split = body.len() - u64::SIZE_IN_BYTES * (num_long_skips as usize);
let (skip_body, long_skips) = body.split(body_split);
let long_skip_id = (offset / LONG_SKIP_INTERVAL) as usize;
let small_skip = (offset - (long_skip_id as u64) * (LONG_SKIP_INTERVAL as u64)) as usize;
let offset_num_bytes: u64 = {
if long_skip_id > 0 {
let mut long_skip_blocks: &[u8] =
&long_skips.as_slice()[(long_skip_id - 1) * 8..][..8];
u64::deserialize(&mut long_skip_blocks).expect("Index corrupted") * 16
} else {
0
}
};
let mut position_read = OwnedRead::new(position_source);
position_read.advance(offset_num_bytes as usize);
let mut skip_read = OwnedRead::new(skip_body);
skip_read.advance(long_skip_id * LONG_SKIP_IN_BLOCKS);
let mut position_reader = PositionReader {
skip_read,
position_read,
inner_offset: 0,
buffer: Box::new([0u32; 128]),
ahead: None,
};
position_reader.skip(small_skip);
position_reader
}
/// Fills a buffer with the next `output.len()` integers.
@@ -151,13 +101,10 @@ impl PositionReader {
if self.ahead != Some(0) {
// the block currently available is not the block
// for the current position
self.bit_packer
.decompress(position_data, self.buffer.as_mut(), num_bits);
self.ahead = Some(0);
BIT_PACKER.decompress(position_data, self.buffer.as_mut(), num_bits);
}
let block_len = compressed_block_size(num_bits);
self.ahead = Some(read_impl(
self.bit_packer,
&position_data[block_len..],
self.buffer.as_mut(),
self.inner_offset,
@@ -186,13 +133,14 @@ impl PositionReader {
}
});
let skip_len_in_bits = self.skip_read.as_ref()[..num_blocks_to_advance]
let skip_len = self.skip_read.as_ref()[..num_blocks_to_advance]
.iter()
.map(|num_bits| *num_bits as usize)
.cloned()
.map(|num_bit| num_bit as usize)
.sum::<usize>()
* COMPRESSION_BLOCK_SIZE;
let skip_len_in_bytes = skip_len_in_bits / 8;
* (COMPRESSION_BLOCK_SIZE / 8);
self.skip_read.advance(num_blocks_to_advance);
self.position_read.advance(skip_len_in_bytes);
self.position_read.advance(skip_len);
}
}

View File

@@ -1,30 +1,29 @@
use super::BIT_PACKER;
use bitpacking::BitPacker;
use bitpacking::BitPacker4x;
use common::BinarySerializable;
use common::CountingWriter;
use positions::{COMPRESSION_BLOCK_SIZE, LONG_SKIP_INTERVAL};
use std::io::{self, Write};
use std::io;
pub struct PositionSerializer<W: io::Write> {
bit_packer: BitPacker4x,
write_stream: CountingWriter<W>,
write_stream: W,
write_skiplist: W,
block: Vec<u32>,
buffer: Vec<u8>,
num_ints: u64,
long_skips: Vec<u64>,
cumulated_num_bits: u64,
}
impl<W: io::Write> PositionSerializer<W> {
pub fn new(write_stream: W, write_skiplist: W) -> PositionSerializer<W> {
PositionSerializer {
bit_packer: BitPacker4x::new(),
write_stream: CountingWriter::wrap(write_stream),
write_stream,
write_skiplist,
block: Vec::with_capacity(128),
buffer: vec![0u8; 128 * 4],
num_ints: 0u64,
long_skips: Vec::new(),
cumulated_num_bits: 0u64,
}
}
@@ -51,15 +50,14 @@ impl<W: io::Write> PositionSerializer<W> {
}
fn flush_block(&mut self) -> io::Result<()> {
let num_bits = self.bit_packer.num_bits(&self.block[..]);
let num_bits = BIT_PACKER.num_bits(&self.block[..]);
self.cumulated_num_bits += u64::from(num_bits);
self.write_skiplist.write_all(&[num_bits])?;
let written_len = self
.bit_packer
.compress(&self.block[..], &mut self.buffer, num_bits);
let written_len = BIT_PACKER.compress(&self.block[..], &mut self.buffer, num_bits);
self.write_stream.write_all(&self.buffer[..written_len])?;
self.block.clear();
if (self.num_ints % LONG_SKIP_INTERVAL) == 0u64 {
self.long_skips.push(self.write_stream.written_bytes());
self.long_skips.push(self.cumulated_num_bits);
}
Ok(())
}

View File

@@ -1,249 +0,0 @@
use postings::compression::AlignedBuffer;
/// This modules define the logic used to search for a doc in a given
/// block. (at most 128 docs)
///
/// Searching within a block is a hotspot when running intersection.
/// so it was worth defining it in its own module.
#[cfg(target_arch = "x86_64")]
mod sse2 {
use postings::compression::{AlignedBuffer, COMPRESSION_BLOCK_SIZE};
use std::arch::x86_64::__m128i as DataType;
use std::arch::x86_64::_mm_add_epi32 as op_add;
use std::arch::x86_64::_mm_cmplt_epi32 as op_lt;
use std::arch::x86_64::_mm_load_si128 as op_load; // requires 128-bits alignment
use std::arch::x86_64::_mm_set1_epi32 as set1;
use std::arch::x86_64::_mm_setzero_si128 as set0;
use std::arch::x86_64::_mm_sub_epi32 as op_sub;
use std::arch::x86_64::{_mm_cvtsi128_si32, _mm_shuffle_epi32};
const MASK1: i32 = 78;
const MASK2: i32 = 177;
/// Performs an exhaustive linear search over the
///
/// There is no early exit here. We simply count the
/// number of elements that are `< target`.
pub(crate) fn linear_search_sse2_128(arr: &AlignedBuffer, target: u32) -> usize {
unsafe {
let ptr = arr as *const AlignedBuffer as *const DataType;
let vkey = set1(target as i32);
let mut cnt = set0();
// We work over 4 `__m128i` at a time.
// A single `__m128i` actual contains 4 `u32`.
for i in 0..(COMPRESSION_BLOCK_SIZE as isize) / (4 * 4) {
let cmp1 = op_lt(op_load(ptr.offset(i * 4)), vkey);
let cmp2 = op_lt(op_load(ptr.offset(i * 4 + 1)), vkey);
let cmp3 = op_lt(op_load(ptr.offset(i * 4 + 2)), vkey);
let cmp4 = op_lt(op_load(ptr.offset(i * 4 + 3)), vkey);
let sum = op_add(op_add(cmp1, cmp2), op_add(cmp3, cmp4));
cnt = op_sub(cnt, sum);
}
cnt = op_add(cnt, _mm_shuffle_epi32(cnt, MASK1));
cnt = op_add(cnt, _mm_shuffle_epi32(cnt, MASK2));
_mm_cvtsi128_si32(cnt) as usize
}
}
#[cfg(test)]
mod test {
use super::linear_search_sse2_128;
use postings::compression::{AlignedBuffer, COMPRESSION_BLOCK_SIZE};
#[test]
fn test_linear_search_sse2_128_u32() {
let mut block = [0u32; COMPRESSION_BLOCK_SIZE];
for el in 0u32..128u32 {
block[el as usize] = el * 2 + 1 << 18;
}
let target = block[64] + 1;
assert_eq!(linear_search_sse2_128(&AlignedBuffer(block), target), 65);
}
}
}
/// This `linear search` browser exhaustively through the array.
/// but the early exit is very difficult to predict.
///
/// Coupled with `exponential search` this function is likely
/// to be called with the same `len`
fn linear_search(arr: &[u32], target: u32) -> usize {
arr.iter().map(|&el| if el < target { 1 } else { 0 }).sum()
}
fn exponential_search(arr: &[u32], target: u32) -> (usize, usize) {
let end = arr.len();
let mut begin = 0;
for &pivot in &[1, 3, 7, 15, 31, 63] {
if pivot >= end {
break;
}
if arr[pivot] > target {
return (begin, pivot);
}
begin = pivot;
}
(begin, end)
}
fn galloping(block_docs: &[u32], target: u32) -> usize {
let (start, end) = exponential_search(&block_docs, target);
start + linear_search(&block_docs[start..end], target)
}
/// Tantivy may rely on SIMD instructions to search for a specific document within
/// a given block.
#[derive(Clone, Copy, PartialEq)]
pub enum BlockSearcher {
#[cfg(target_arch = "x86_64")]
SSE2,
Scalar,
}
impl BlockSearcher {
/// Search the first index containing an element greater or equal to
/// the target.
///
/// The results should be equivalent to
/// ```ignore
/// block[..]
// .iter()
// .take_while(|&&val| val < target)
// .count()
/// ```
///
/// The `start` argument is just used to hint that the response is
/// greater than beyond `start`. The implementation may or may not use
/// it for optimization.
///
/// # Assumption
///
/// The array len is > start.
/// The block is sorted
/// The target is assumed greater or equal to the `arr[start]`.
/// The target is assumed smaller or equal to the last element of the block.
///
/// Currently the scalar implementation starts by an exponential search, and
/// then operates a linear search in the result subarray.
///
/// If SSE2 instructions are available in the `(platform, running CPU)`,
/// then we use a different implementation that does an exhaustive linear search over
/// the full block whenever the block is full (`len == 128`). It is surprisingly faster, most likely because of the lack
/// of branch.
pub(crate) fn search_in_block(
self,
block_docs: &AlignedBuffer,
len: usize,
start: usize,
target: u32,
) -> usize {
#[cfg(target_arch = "x86_64")]
{
use postings::compression::COMPRESSION_BLOCK_SIZE;
if self == BlockSearcher::SSE2 && len == COMPRESSION_BLOCK_SIZE {
return sse2::linear_search_sse2_128(block_docs, target);
}
}
start + galloping(&block_docs.0[start..len], target)
}
}
impl Default for BlockSearcher {
fn default() -> BlockSearcher {
#[cfg(target_arch = "x86_64")]
{
if is_x86_feature_detected!("sse2") {
return BlockSearcher::SSE2;
}
}
BlockSearcher::Scalar
}
}
#[cfg(test)]
mod tests {
use super::exponential_search;
use super::linear_search;
use super::BlockSearcher;
use postings::compression::{AlignedBuffer, COMPRESSION_BLOCK_SIZE};
#[test]
fn test_linear_search() {
let len: usize = 50;
let arr: Vec<u32> = (0..len).map(|el| 1u32 + (el as u32) * 2).collect();
for target in 1..*arr.last().unwrap() {
let res = linear_search(&arr[..], target);
if res > 0 {
assert!(arr[res - 1] < target);
}
if res < len {
assert!(arr[res] >= target);
}
}
}
#[test]
fn test_exponentiel_search() {
assert_eq!(exponential_search(&[1, 2], 0), (0, 1));
assert_eq!(exponential_search(&[1, 2], 1), (0, 1));
assert_eq!(
exponential_search(&[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11], 7),
(3, 7)
);
}
fn util_test_search_in_block(block_searcher: BlockSearcher, block: &[u32], target: u32) {
let cursor = search_in_block_trivial_but_slow(block, target);
assert!(block.len() < COMPRESSION_BLOCK_SIZE);
let mut output_buffer = [u32::max_value(); COMPRESSION_BLOCK_SIZE];
output_buffer[..block.len()].copy_from_slice(block);
for i in 0..cursor {
assert_eq!(
block_searcher.search_in_block(
&AlignedBuffer(output_buffer),
block.len(),
i,
target
),
cursor
);
}
}
fn util_test_search_in_block_all(block_searcher: BlockSearcher, block: &[u32]) {
use std::collections::HashSet;
let mut targets = HashSet::new();
for (i, val) in block.iter().cloned().enumerate() {
if i > 0 {
targets.insert(val - 1);
}
targets.insert(val);
}
for target in targets {
util_test_search_in_block(block_searcher, block, target);
}
}
fn search_in_block_trivial_but_slow(block: &[u32], target: u32) -> usize {
block.iter().take_while(|&&val| val < target).count()
}
fn test_search_in_block_util(block_searcher: BlockSearcher) {
for len in 1u32..128u32 {
let v: Vec<u32> = (0..len).map(|i| i * 2).collect();
util_test_search_in_block_all(block_searcher, &v[..]);
}
}
#[test]
fn test_search_in_block_scalar() {
test_search_in_block_util(BlockSearcher::Scalar);
}
#[cfg(target_arch = "x86_64")]
#[test]
fn test_search_in_block_sse2() {
test_search_in_block_util(BlockSearcher::SSE2);
}
}

View File

@@ -43,14 +43,9 @@ impl BlockEncoder {
}
}
/// We ensure that the OutputBuffer is align on 128 bits
/// in order to run SSE2 linear search on it.
#[repr(align(128))]
pub(crate) struct AlignedBuffer(pub [u32; COMPRESSION_BLOCK_SIZE]);
pub struct BlockDecoder {
bitpacker: BitPacker4x,
output: AlignedBuffer,
pub output: [u32; COMPRESSION_BLOCK_SIZE + 1],
pub output_len: usize,
}
@@ -60,9 +55,11 @@ impl BlockDecoder {
}
pub fn with_val(val: u32) -> BlockDecoder {
let mut output = [val; COMPRESSION_BLOCK_SIZE + 1];
output[COMPRESSION_BLOCK_SIZE] = 0u32;
BlockDecoder {
bitpacker: BitPacker4x::new(),
output: AlignedBuffer([val; COMPRESSION_BLOCK_SIZE]),
output,
output_len: 0,
}
}
@@ -75,28 +72,23 @@ impl BlockDecoder {
) -> usize {
self.output_len = COMPRESSION_BLOCK_SIZE;
self.bitpacker
.decompress_sorted(offset, &compressed_data, &mut self.output.0, num_bits)
.decompress_sorted(offset, &compressed_data, &mut self.output, num_bits)
}
pub fn uncompress_block_unsorted(&mut self, compressed_data: &[u8], num_bits: u8) -> usize {
self.output_len = COMPRESSION_BLOCK_SIZE;
self.bitpacker
.decompress(&compressed_data, &mut self.output.0, num_bits)
.decompress(&compressed_data, &mut self.output, num_bits)
}
#[inline]
pub fn output_array(&self) -> &[u32] {
&self.output.0[..self.output_len]
}
#[inline]
pub(crate) fn output_aligned(&self) -> (&AlignedBuffer, usize) {
(&self.output, self.output_len)
&self.output[..self.output_len]
}
#[inline]
pub fn output(&self, idx: usize) -> u32 {
self.output.0[idx]
self.output[idx]
}
}
@@ -167,12 +159,12 @@ impl VIntDecoder for BlockDecoder {
num_els: usize,
) -> usize {
self.output_len = num_els;
vint::uncompress_sorted(compressed_data, &mut self.output.0[..num_els], offset)
vint::uncompress_sorted(compressed_data, &mut self.output[..num_els], offset)
}
fn uncompress_vint_unsorted<'a>(&mut self, compressed_data: &'a [u8], num_els: usize) -> usize {
self.output_len = num_els;
vint::uncompress_unsorted(compressed_data, &mut self.output.0[..num_els])
vint::uncompress_unsorted(compressed_data, &mut self.output[..num_els])
}
}

View File

@@ -2,7 +2,6 @@
Postings module (also called inverted index)
*/
mod block_search;
pub(crate) mod compression;
/// Postings module
///
@@ -17,8 +16,6 @@ mod skip;
mod stacker;
mod term_info;
pub(crate) use self::block_search::BlockSearcher;
pub(crate) use self::postings_writer::MultiFieldPostingsWriter;
pub use self::serializer::{FieldSerializer, InvertedIndexSerializer};
@@ -34,6 +31,7 @@ pub(crate) use self::stacker::compute_table_size;
pub use common::HasLen;
pub(crate) const USE_SKIP_INFO_LIMIT: u32 = COMPRESSION_BLOCK_SIZE as u32;
pub(crate) type UnorderedTermId = u64;
#[cfg_attr(feature = "cargo-clippy", allow(clippy::enum_variant_names))]
@@ -55,15 +53,13 @@ pub mod tests {
use fieldnorm::FieldNormReader;
use indexer::operation::AddOperation;
use indexer::SegmentWriter;
use merge_policy::NoMergePolicy;
use query::Scorer;
use rand::rngs::StdRng;
use rand::{Rng, SeedableRng};
use schema::{Document, Schema, Term, INDEXED, STRING, TEXT};
use schema::{Field, TextOptions};
use schema::{IndexRecordOption, TextFieldIndexing};
use schema::Field;
use schema::IndexRecordOption;
use schema::{Document, Schema, Term, INT_INDEXED, STRING, TEXT};
use std::iter;
use tokenizer::{SimpleTokenizer, MAX_TOKEN_LEN};
use DocId;
use Score;
@@ -105,11 +101,14 @@ pub mod tests {
}
index_writer.add_document(doc!(title => r#"abc be be be be abc"#));
index_writer.commit().unwrap();
index.load_searchers().unwrap();
let searcher = index.reader().unwrap().searcher();
let searcher = index.searcher();
let inverted_index = searcher.segment_reader(0u32).inverted_index(title);
let term = Term::from_field_text(title, "abc");
let mut positions = Vec::new();
{
let mut postings = inverted_index
.read_postings(&term, IndexRecordOption::WithFreqsAndPositions)
@@ -162,52 +161,6 @@ pub mod tests {
}
}
#[test]
pub fn test_drop_token_that_are_too_long() {
let ok_token_text: String = iter::repeat('A').take(MAX_TOKEN_LEN).collect();
let mut exceeding_token_text: String = iter::repeat('A').take(MAX_TOKEN_LEN + 1).collect();
exceeding_token_text.push_str(" hello");
let mut schema_builder = Schema::builder();
let text_options = TextOptions::default().set_indexing_options(
TextFieldIndexing::default()
.set_index_option(IndexRecordOption::WithFreqsAndPositions)
.set_tokenizer("simple_no_truncation"),
);
let text_field = schema_builder.add_text_field("text", text_options);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema.clone());
index
.tokenizers()
.register("simple_no_truncation", SimpleTokenizer);
let reader = index.reader().unwrap();
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
index_writer.set_merge_policy(Box::new(NoMergePolicy));
{
index_writer.add_document(doc!(text_field=>exceeding_token_text));
index_writer.commit().unwrap();
reader.reload().unwrap();
let searcher = reader.searcher();
let segment_reader = searcher.segment_reader(0u32);
let inverted_index = segment_reader.inverted_index(text_field);
assert_eq!(inverted_index.terms().num_terms(), 1);
let mut bytes = vec![];
assert!(inverted_index.terms().ord_to_term(0, &mut bytes));
assert_eq!(&bytes, b"hello");
}
{
index_writer.add_document(doc!(text_field=>ok_token_text.clone()));
index_writer.commit().unwrap();
reader.reload().unwrap();
let searcher = reader.searcher();
let segment_reader = searcher.segment_reader(1u32);
let inverted_index = segment_reader.inverted_index(text_field);
assert_eq!(inverted_index.terms().num_terms(), 1);
let mut bytes = vec![];
assert!(inverted_index.terms().ord_to_term(0, &mut bytes));
assert_eq!(&bytes[..], ok_token_text.as_bytes());
}
}
#[test]
pub fn test_position_and_fieldnorm1() {
let mut positions = Vec::new();
@@ -327,7 +280,7 @@ pub mod tests {
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
{
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
{
let mut doc = Document::default();
doc.add_text(text_field, "g b b d c g c");
@@ -340,8 +293,9 @@ pub mod tests {
}
assert!(index_writer.commit().is_ok());
}
index.load_searchers().unwrap();
let term_a = Term::from_field_text(text_field, "a");
let searcher = index.reader().unwrap().searcher();
let searcher = index.searcher();
let segment_reader = searcher.segment_reader(0);
let mut postings = segment_reader
.inverted_index(text_field)
@@ -363,12 +317,12 @@ pub mod tests {
let index = {
let mut schema_builder = Schema::builder();
let value_field = schema_builder.add_u64_field("value", INDEXED);
let value_field = schema_builder.add_u64_field("value", INT_INDEXED);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
{
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
for i in 0..num_docs {
let mut doc = Document::default();
doc.add_u64(value_field, 2);
@@ -378,9 +332,10 @@ pub mod tests {
}
assert!(index_writer.commit().is_ok());
}
index.load_searchers().unwrap();
index
};
let searcher = index.reader().unwrap().searcher();
let searcher = index.searcher();
let segment_reader = searcher.segment_reader(0);
// check that the basic usage works
@@ -444,11 +399,12 @@ pub mod tests {
// delete some of the documents
{
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
index_writer.delete_term(term_0);
assert!(index_writer.commit().is_ok());
}
let searcher = index.reader().unwrap().searcher();
index.load_searchers().unwrap();
let searcher = index.searcher();
let segment_reader = searcher.segment_reader(0);
// make sure seeking still works
@@ -493,19 +449,33 @@ pub mod tests {
// delete everything else
{
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
index_writer.delete_term(term_1);
assert!(index_writer.commit().is_ok());
}
let searcher = index.reader().unwrap().searcher();
index.load_searchers().unwrap();
let searcher = index.searcher();
let segment_reader = searcher.segment_reader(0);
// finally, check that it's empty
{
let searchable_segment_ids = index
.searchable_segment_ids()
.expect("could not get index segment ids");
assert!(searchable_segment_ids.is_empty());
assert_eq!(searcher.num_docs(), 0);
let mut segment_postings = segment_reader
.inverted_index(term_2.field())
.read_postings(&term_2, IndexRecordOption::Basic)
.unwrap();
assert_eq!(segment_postings.skip_next(0), SkipResult::Reached);
assert_eq!(segment_postings.doc(), 0);
assert!(segment_reader.is_deleted(0));
let mut segment_postings = segment_reader
.inverted_index(term_2.field())
.read_postings(&term_2, IndexRecordOption::Basic)
.unwrap();
assert_eq!(segment_postings.skip_next(num_docs), SkipResult::End);
}
}
@@ -536,7 +506,7 @@ pub mod tests {
let index = Index::create_in_ram(schema);
let posting_list_size = 1_000_000;
{
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
for _ in 0..posting_list_size {
let mut doc = Document::default();
if rng.gen_bool(1f64 / 15f64) {
@@ -553,6 +523,7 @@ pub mod tests {
}
assert!(index_writer.commit().is_ok());
}
index.load_searchers().unwrap();
index
};
}

View File

@@ -1,8 +1,6 @@
use super::stacker::{Addr, MemoryArena, TermHashMap};
use postings::recorder::{
BufferLender, NothingRecorder, Recorder, TFAndPositionRecorder, TermFrequencyRecorder,
};
use postings::recorder::{NothingRecorder, Recorder, TFAndPositionRecorder, TermFrequencyRecorder};
use postings::UnorderedTermId;
use postings::{FieldSerializer, InvertedIndexSerializer};
use schema::IndexRecordOption;
@@ -12,8 +10,8 @@ use std::io;
use std::marker::PhantomData;
use std::ops::DerefMut;
use termdict::TermOrdinal;
use tokenizer::Token;
use tokenizer::TokenStream;
use tokenizer::{Token, MAX_TOKEN_LEN};
use DocId;
use Result;
@@ -33,10 +31,9 @@ fn posting_from_field_entry(field_entry: &FieldEntry) -> Box<PostingsWriter> {
}
})
.unwrap_or_else(|| SpecializedPostingsWriter::<NothingRecorder>::new_boxed()),
FieldType::U64(_)
| FieldType::I64(_)
| FieldType::Date(_)
| FieldType::HierarchicalFacet => SpecializedPostingsWriter::<NothingRecorder>::new_boxed(),
FieldType::U64(_) | FieldType::I64(_) | FieldType::HierarchicalFacet => {
SpecializedPostingsWriter::<NothingRecorder>::new_boxed()
}
FieldType::Bytes => {
// FieldType::Bytes cannot actually be indexed.
// TODO fix during the indexer refactoring described in #276
@@ -52,31 +49,6 @@ pub struct MultiFieldPostingsWriter {
per_field_postings_writers: Vec<Box<PostingsWriter>>,
}
fn make_field_partition(
term_offsets: &[(&[u8], Addr, UnorderedTermId)],
) -> Vec<(Field, usize, usize)> {
let term_offsets_it = term_offsets
.iter()
.map(|(key, _, _)| Term::wrap(key).field())
.enumerate();
let mut prev_field = Field(u32::max_value());
let mut fields = vec![];
let mut offsets = vec![];
for (offset, field) in term_offsets_it {
if field != prev_field {
prev_field = field;
fields.push(field);
offsets.push(offset);
}
}
offsets.push(term_offsets.len());
let mut field_offsets = vec![];
for i in 0..fields.len() {
field_offsets.push((fields[i], offsets[i], offsets[i + 1]));
}
field_offsets
}
impl MultiFieldPostingsWriter {
/// Create a new `MultiFieldPostingsWriter` given
/// a schema and a heap.
@@ -122,16 +94,36 @@ impl MultiFieldPostingsWriter {
&self,
serializer: &mut InvertedIndexSerializer,
) -> Result<HashMap<Field, HashMap<UnorderedTermId, TermOrdinal>>> {
let mut term_offsets: Vec<(&[u8], Addr, UnorderedTermId)> =
self.term_index.iter().collect();
let mut term_offsets: Vec<(&[u8], Addr, UnorderedTermId)> = self
.term_index
.iter()
.map(|(term_bytes, addr, bucket_id)| (term_bytes, addr, bucket_id as UnorderedTermId))
.collect();
term_offsets.sort_unstable_by_key(|&(k, _, _)| k);
let mut offsets: Vec<(Field, usize)> = vec![];
let term_offsets_it = term_offsets
.iter()
.cloned()
.map(|(key, _, _)| Term::wrap(key).field())
.enumerate();
let mut unordered_term_mappings: HashMap<Field, HashMap<UnorderedTermId, TermOrdinal>> =
HashMap::new();
let field_offsets = make_field_partition(&term_offsets);
let mut prev_field = Field(u32::max_value());
for (offset, field) in term_offsets_it {
if field != prev_field {
offsets.push((field, offset));
prev_field = field;
}
}
offsets.push((Field(0), term_offsets.len()));
for i in 0..(offsets.len() - 1) {
let (field, start) = offsets[i];
let (_, stop) = offsets[i + 1];
for (field, start, stop) in field_offsets {
let field_entry = self.schema.get_field_entry(field);
match *field_entry.field_type() {
@@ -149,7 +141,7 @@ impl MultiFieldPostingsWriter {
.collect();
unordered_term_mappings.insert(field, mapping);
}
FieldType::U64(_) | FieldType::I64(_) | FieldType::Date(_) => {}
FieldType::U64(_) | FieldType::I64(_) => {}
FieldType::Bytes => {}
}
@@ -210,18 +202,8 @@ pub trait PostingsWriter {
) -> u32 {
let mut term = Term::for_field(field);
let mut sink = |token: &Token| {
// We skip all tokens with a len greater than u16.
if token.text.len() <= MAX_TOKEN_LEN {
term.set_text(token.text.as_str());
self.subscribe(term_index, doc_id, token.position as u32, &term, heap);
} else {
info!(
"A token exceeding MAX_TOKEN_LEN ({}>{}) was dropped. Search for \
MAX_TOKEN_LEN in the documentation for more information.",
token.text.len(),
MAX_TOKEN_LEN
);
}
term.set_text(token.text.as_str());
self.subscribe(term_index, doc_id, token.position as u32, &term, heap);
};
token_stream.process(&mut sink)
}
@@ -231,7 +213,7 @@ pub trait PostingsWriter {
/// The `SpecializedPostingsWriter` is just here to remove dynamic
/// dispatch to the recorder information.
pub(crate) struct SpecializedPostingsWriter<Rec: Recorder + 'static> {
pub struct SpecializedPostingsWriter<Rec: Recorder + 'static> {
total_num_tokens: u64,
_recorder_type: PhantomData<Rec>,
}
@@ -263,7 +245,8 @@ impl<Rec: Recorder + 'static> PostingsWriter for SpecializedPostingsWriter<Rec>
debug_assert!(term.as_slice().len() >= 4);
self.total_num_tokens += 1;
term_index.mutate_or_create(term, |opt_recorder: Option<Rec>| {
if let Some(mut recorder) = opt_recorder {
if opt_recorder.is_some() {
let mut recorder = opt_recorder.unwrap();
let current_doc = recorder.current_doc();
if current_doc != doc {
recorder.close_doc(heap);
@@ -272,7 +255,7 @@ impl<Rec: Recorder + 'static> PostingsWriter for SpecializedPostingsWriter<Rec>
recorder.record_position(position, heap);
recorder
} else {
let mut recorder = Rec::new();
let mut recorder = Rec::new(heap);
recorder.new_doc(doc, heap);
recorder.record_position(position, heap);
recorder
@@ -287,11 +270,10 @@ impl<Rec: Recorder + 'static> PostingsWriter for SpecializedPostingsWriter<Rec>
termdict_heap: &MemoryArena,
heap: &MemoryArena,
) -> io::Result<()> {
let mut buffer_lender = BufferLender::default();
for &(term_bytes, addr, _) in term_addrs {
let recorder: Rec = termdict_heap.read(addr);
let recorder: Rec = unsafe { termdict_heap.read(addr) };
serializer.new_term(&term_bytes[4..])?;
recorder.serialize(&mut buffer_lender, serializer, heap)?;
recorder.serialize(serializer, heap)?;
serializer.close_term()?;
}
Ok(())

View File

@@ -1,50 +1,10 @@
use super::stacker::{ExpUnrolledLinkedList, MemoryArena};
use common::{read_u32_vint, write_u32_vint};
use postings::FieldSerializer;
use std::io;
use std::{self, io};
use DocId;
const POSITION_END: u32 = 0;
#[derive(Default)]
pub(crate) struct BufferLender {
buffer_u8: Vec<u8>,
buffer_u32: Vec<u32>,
}
impl BufferLender {
pub fn lend_u8(&mut self) -> &mut Vec<u8> {
self.buffer_u8.clear();
&mut self.buffer_u8
}
pub fn lend_all(&mut self) -> (&mut Vec<u8>, &mut Vec<u32>) {
self.buffer_u8.clear();
self.buffer_u32.clear();
(&mut self.buffer_u8, &mut self.buffer_u32)
}
}
pub struct VInt32Reader<'a> {
data: &'a [u8],
}
impl<'a> VInt32Reader<'a> {
fn new(data: &'a [u8]) -> VInt32Reader<'a> {
VInt32Reader { data }
}
}
impl<'a> Iterator for VInt32Reader<'a> {
type Item = u32;
fn next(&mut self) -> Option<u32> {
if self.data.is_empty() {
None
} else {
Some(read_u32_vint(&mut self.data))
}
}
}
const EMPTY_ARRAY: [u32; 0] = [0u32; 0];
const POSITION_END: u32 = std::u32::MAX;
/// Recorder is in charge of recording relevant information about
/// the presence of a term in a document.
@@ -55,9 +15,9 @@ impl<'a> Iterator for VInt32Reader<'a> {
/// * the document id
/// * the term frequency
/// * the term positions
pub(crate) trait Recorder: Copy + 'static {
pub trait Recorder: Copy {
///
fn new() -> Self;
fn new(heap: &mut MemoryArena) -> Self;
/// Returns the current document
fn current_doc(&self) -> u32;
/// Starts recording information about a new document
@@ -69,12 +29,7 @@ pub(crate) trait Recorder: Copy + 'static {
/// Close the document. It will help record the term frequency.
fn close_doc(&mut self, heap: &mut MemoryArena);
/// Pushes the postings information to the serializer.
fn serialize(
&self,
buffer_lender: &mut BufferLender,
serializer: &mut FieldSerializer,
heap: &MemoryArena,
) -> io::Result<()>;
fn serialize(&self, serializer: &mut FieldSerializer, heap: &MemoryArena) -> io::Result<()>;
}
/// Only records the doc ids
@@ -85,9 +40,9 @@ pub struct NothingRecorder {
}
impl Recorder for NothingRecorder {
fn new() -> Self {
fn new(heap: &mut MemoryArena) -> Self {
NothingRecorder {
stack: ExpUnrolledLinkedList::new(),
stack: ExpUnrolledLinkedList::new(heap),
current_doc: u32::max_value(),
}
}
@@ -98,23 +53,16 @@ impl Recorder for NothingRecorder {
fn new_doc(&mut self, doc: DocId, heap: &mut MemoryArena) {
self.current_doc = doc;
let _ = write_u32_vint(doc, &mut self.stack.writer(heap));
self.stack.push(doc, heap);
}
fn record_position(&mut self, _position: u32, _heap: &mut MemoryArena) {}
fn close_doc(&mut self, _heap: &mut MemoryArena) {}
fn serialize(
&self,
buffer_lender: &mut BufferLender,
serializer: &mut FieldSerializer,
heap: &MemoryArena,
) -> io::Result<()> {
let buffer = buffer_lender.lend_u8();
self.stack.read_to_end(heap, buffer);
for doc in VInt32Reader::new(&buffer[..]) {
serializer.write_doc(doc as u32, 0u32, &[][..])?;
fn serialize(&self, serializer: &mut FieldSerializer, heap: &MemoryArena) -> io::Result<()> {
for doc in self.stack.iter(heap) {
serializer.write_doc(doc, 0u32, &EMPTY_ARRAY)?;
}
Ok(())
}
@@ -129,9 +77,9 @@ pub struct TermFrequencyRecorder {
}
impl Recorder for TermFrequencyRecorder {
fn new() -> Self {
fn new(heap: &mut MemoryArena) -> Self {
TermFrequencyRecorder {
stack: ExpUnrolledLinkedList::new(),
stack: ExpUnrolledLinkedList::new(heap),
current_doc: u32::max_value(),
current_tf: 0u32,
}
@@ -143,7 +91,7 @@ impl Recorder for TermFrequencyRecorder {
fn new_doc(&mut self, doc: DocId, heap: &mut MemoryArena) {
self.current_doc = doc;
let _ = write_u32_vint(doc, &mut self.stack.writer(heap));
self.stack.push(doc, heap);
}
fn record_position(&mut self, _position: u32, _heap: &mut MemoryArena) {
@@ -152,24 +100,24 @@ impl Recorder for TermFrequencyRecorder {
fn close_doc(&mut self, heap: &mut MemoryArena) {
debug_assert!(self.current_tf > 0);
let _ = write_u32_vint(self.current_tf, &mut self.stack.writer(heap));
self.stack.push(self.current_tf, heap);
self.current_tf = 0;
}
fn serialize(
&self,
buffer_lender: &mut BufferLender,
serializer: &mut FieldSerializer,
heap: &MemoryArena,
) -> io::Result<()> {
let buffer = buffer_lender.lend_u8();
self.stack.read_to_end(heap, buffer);
let mut u32_it = VInt32Reader::new(&buffer[..]);
while let Some(doc) = u32_it.next() {
let term_freq = u32_it.next().unwrap_or(self.current_tf);
serializer.write_doc(doc as u32, term_freq, &[][..])?;
}
fn serialize(&self, serializer: &mut FieldSerializer, heap: &MemoryArena) -> io::Result<()> {
// the last document has not been closed...
// its term freq is self.current_tf.
let mut doc_iter = self
.stack
.iter(heap)
.chain(Some(self.current_tf).into_iter());
while let Some(doc) = doc_iter.next() {
let term_freq = doc_iter
.next()
.expect("The IndexWriter recorded a doc without a term freq.");
serializer.write_doc(doc, term_freq, &EMPTY_ARRAY)?;
}
Ok(())
}
}
@@ -180,10 +128,11 @@ pub struct TFAndPositionRecorder {
stack: ExpUnrolledLinkedList,
current_doc: DocId,
}
impl Recorder for TFAndPositionRecorder {
fn new() -> Self {
fn new(heap: &mut MemoryArena) -> Self {
TFAndPositionRecorder {
stack: ExpUnrolledLinkedList::new(),
stack: ExpUnrolledLinkedList::new(heap),
current_doc: u32::max_value(),
}
}
@@ -194,88 +143,33 @@ impl Recorder for TFAndPositionRecorder {
fn new_doc(&mut self, doc: DocId, heap: &mut MemoryArena) {
self.current_doc = doc;
let _ = write_u32_vint(doc, &mut self.stack.writer(heap));
self.stack.push(doc, heap);
}
fn record_position(&mut self, position: u32, heap: &mut MemoryArena) {
let _ = write_u32_vint(position + 1u32, &mut self.stack.writer(heap));
self.stack.push(position, heap);
}
fn close_doc(&mut self, heap: &mut MemoryArena) {
let _ = write_u32_vint(POSITION_END, &mut self.stack.writer(heap));
self.stack.push(POSITION_END, heap);
}
fn serialize(
&self,
buffer_lender: &mut BufferLender,
serializer: &mut FieldSerializer,
heap: &MemoryArena,
) -> io::Result<()> {
let (buffer_u8, buffer_positions) = buffer_lender.lend_all();
self.stack.read_to_end(heap, buffer_u8);
let mut u32_it = VInt32Reader::new(&buffer_u8[..]);
while let Some(doc) = u32_it.next() {
let mut prev_position_plus_one = 1u32;
buffer_positions.clear();
loop {
match u32_it.next() {
Some(POSITION_END) | None => {
break;
}
Some(position_plus_one) => {
let delta_position = position_plus_one - prev_position_plus_one;
buffer_positions.push(delta_position);
prev_position_plus_one = position_plus_one;
}
fn serialize(&self, serializer: &mut FieldSerializer, heap: &MemoryArena) -> io::Result<()> {
let mut doc_positions = Vec::with_capacity(100);
let mut positions_iter = self.stack.iter(heap);
while let Some(doc) = positions_iter.next() {
let mut prev_position = 0;
doc_positions.clear();
for position in &mut positions_iter {
if position == POSITION_END {
break;
} else {
doc_positions.push(position - prev_position);
prev_position = position;
}
}
serializer.write_doc(doc, buffer_positions.len() as u32, &buffer_positions)?;
serializer.write_doc(doc, doc_positions.len() as u32, &doc_positions)?;
}
Ok(())
}
}
#[cfg(test)]
mod tests {
use super::write_u32_vint;
use super::BufferLender;
use super::VInt32Reader;
#[test]
fn test_buffer_lender() {
let mut buffer_lender = BufferLender::default();
{
let buf = buffer_lender.lend_u8();
assert!(buf.is_empty());
buf.push(1u8);
}
{
let buf = buffer_lender.lend_u8();
assert!(buf.is_empty());
buf.push(1u8);
}
{
let (_, buf) = buffer_lender.lend_all();
assert!(buf.is_empty());
buf.push(1u32);
}
{
let (_, buf) = buffer_lender.lend_all();
assert!(buf.is_empty());
buf.push(1u32);
}
}
#[test]
fn test_vint_u32() {
let mut buffer = vec![];
let vals = [0, 1, 324_234_234, u32::max_value()];
for &i in &vals {
assert!(write_u32_vint(i, &mut buffer).is_ok());
}
assert_eq!(buffer.len(), 1 + 1 + 5 + 5);
let res: Vec<u32> = VInt32Reader::new(&buffer[..]).collect();
assert_eq!(&res[..], &vals[..]);
}
}

View File

@@ -2,21 +2,22 @@ use common::BitSet;
use common::HasLen;
use common::{BinarySerializable, VInt};
use docset::{DocSet, SkipResult};
use fst::Streamer;
use owned_read::OwnedRead;
use positions::PositionReader;
use postings::compression::{compressed_block_size, AlignedBuffer};
use postings::compression::compressed_block_size;
use postings::compression::{BlockDecoder, VIntDecoder, COMPRESSION_BLOCK_SIZE};
use postings::serializer::PostingsSerializer;
use postings::BlockSearcher;
use postings::FreqReadingOption;
use postings::Postings;
use postings::SkipReader;
use postings::USE_SKIP_INFO_LIMIT;
use schema::IndexRecordOption;
use std::cmp::Ordering;
use tantivy_fst::Streamer;
use DocId;
const EMPTY_ARR: [u8; 0] = [];
struct PositionComputer {
// store the amount of position int
// before reading positions.
@@ -61,7 +62,6 @@ pub struct SegmentPostings {
block_cursor: BlockSegmentPostings,
cur: usize,
position_computer: Option<PositionComputer>,
block_searcher: BlockSearcher,
}
impl SegmentPostings {
@@ -72,7 +72,6 @@ impl SegmentPostings {
block_cursor: empty_block_cursor,
cur: COMPRESSION_BLOCK_SIZE,
position_computer: None,
block_searcher: BlockSearcher::default(),
}
}
@@ -120,33 +119,45 @@ impl SegmentPostings {
block_cursor: segment_block_postings,
cur: COMPRESSION_BLOCK_SIZE, // cursor within the block
position_computer: positions_stream_opt.map(PositionComputer::new),
block_searcher: BlockSearcher::default(),
}
}
}
impl DocSet for SegmentPostings {
// goes to the next element.
// next needs to be called a first time to point to the correct element.
#[inline]
fn advance(&mut self) -> bool {
if self.position_computer.is_some() && self.cur < COMPRESSION_BLOCK_SIZE {
let term_freq = self.term_freq() as usize;
if let Some(position_computer) = self.position_computer.as_mut() {
position_computer.add_skip(term_freq);
}
fn exponential_search(target: u32, arr: &[u32]) -> (usize, usize) {
let mut start = 0;
let end = arr.len();
debug_assert!(target <= arr[end - 1]);
let mut jump = 1;
loop {
let new = start + jump;
if new >= end {
return (start, end);
}
self.cur += 1;
if self.cur >= self.block_cursor.block_len() {
self.cur = 0;
if !self.block_cursor.advance() {
self.cur = COMPRESSION_BLOCK_SIZE;
return false;
}
if arr[new] > target {
return (start, new);
}
true
start = new;
jump *= 2;
}
}
/// Search the first index containing an element greater or equal to the target.
///
/// # Assumption
///
/// The array is assumed non empty.
/// The target is assumed greater or equal to the first element.
/// The target is assumed smaller or equal to the last element.
fn search_within_block(block_docs: &[u32], target: u32) -> usize {
let (start, end) = exponential_search(target, block_docs);
start.wrapping_add(
block_docs[start..end]
.binary_search(&target)
.unwrap_or_else(|e| e),
)
}
impl DocSet for SegmentPostings {
fn skip_next(&mut self, target: DocId) -> SkipResult {
if !self.advance() {
return SkipResult::End;
@@ -169,6 +180,7 @@ impl DocSet for SegmentPostings {
// skip blocks until one that might contain the target
// check if we need to go to the next block
let need_positions = self.position_computer.is_some();
let mut sum_freqs_skipped: u32 = 0;
if !self
.block_cursor
@@ -182,7 +194,7 @@ impl DocSet for SegmentPostings {
// we are not in the right block.
//
// First compute all of the freqs skipped from the current block.
if self.position_computer.is_some() {
if need_positions {
sum_freqs_skipped = self.block_cursor.freqs()[self.cur..].iter().sum();
match self.block_cursor.skip_to(target) {
BlockSegmentPostingsSkipResult::Success(block_skip_freqs) => {
@@ -201,21 +213,25 @@ impl DocSet for SegmentPostings {
self.cur = 0;
}
let cur = self.cur;
// we're in the right block now, start with an exponential search
let (output, len) = self.block_cursor.docs_aligned();
let block_docs = self.block_cursor.docs();
let new_cur = self
.block_searcher
.search_in_block(&output, len, cur, target);
if let Some(position_computer) = self.position_computer.as_mut() {
sum_freqs_skipped += self.block_cursor.freqs()[cur..new_cur].iter().sum::<u32>();
position_computer.add_skip(sum_freqs_skipped as usize);
.cur
.wrapping_add(search_within_block(&block_docs[self.cur..], target));
if need_positions {
sum_freqs_skipped += self.block_cursor.freqs()[self.cur..new_cur]
.iter()
.sum::<u32>();
self.position_computer
.as_mut()
.unwrap()
.add_skip(sum_freqs_skipped as usize);
}
self.cur = new_cur;
// `doc` is now the first element >= `target`
let doc = output.0[new_cur];
let doc = block_docs[new_cur];
debug_assert!(doc >= target);
if doc == target {
SkipResult::Reached
@@ -224,25 +240,40 @@ impl DocSet for SegmentPostings {
}
}
// goes to the next element.
// next needs to be called a first time to point to the correct element.
#[inline]
fn advance(&mut self) -> bool {
if self.position_computer.is_some() {
let term_freq = self.term_freq() as usize;
self.position_computer.as_mut().unwrap().add_skip(term_freq);
}
self.cur += 1;
if self.cur >= self.block_cursor.block_len() {
self.cur = 0;
if !self.block_cursor.advance() {
self.cur = COMPRESSION_BLOCK_SIZE;
return false;
}
}
true
}
fn size_hint(&self) -> u32 {
self.len() as u32
}
/// Return the current document's `DocId`.
///
/// # Panics
///
/// Will panics if called without having called advance before.
#[inline]
fn doc(&self) -> DocId {
let docs = self.block_cursor.docs();
debug_assert!(
self.cur < docs.len(),
"Have you forgotten to call `.advance()` at least once before calling `.doc()` ."
"Have you forgotten to call `.advance()` at least once before calling .doc()."
);
docs[self.cur]
}
fn size_hint(&self) -> u32 {
self.len() as u32
}
fn append_to_bitset(&mut self, bitset: &mut BitSet) {
// finish the current block
if self.advance() {
@@ -266,33 +297,17 @@ impl HasLen for SegmentPostings {
}
impl Postings for SegmentPostings {
/// Returns the frequency associated to the current document.
/// If the schema is set up so that no frequency have been encoded,
/// this method should always return 1.
///
/// # Panics
///
/// Will panics if called without having called advance before.
fn term_freq(&self) -> u32 {
debug_assert!(
// Here we do not use the len of `freqs()`
// because it is actually ok to request for the freq of doc
// even if no frequency were encoded for the field.
//
// In that case we hit the block just as if the frequency had been
// decoded. The block is simply prefilled by the value 1.
self.cur < COMPRESSION_BLOCK_SIZE,
"Have you forgotten to call `.advance()` at least once before calling \
`.term_freq()`."
);
self.block_cursor.freq(self.cur)
}
fn positions_with_offset(&mut self, offset: u32, output: &mut Vec<u32>) {
let term_freq = self.term_freq() as usize;
if let Some(position_comp) = self.position_computer.as_mut() {
output.resize(term_freq, 0u32);
position_comp.positions_with_offset(offset, &mut output[..]);
if self.position_computer.is_some() {
output.resize(self.term_freq() as usize, 0u32);
self.position_computer
.as_mut()
.unwrap()
.positions_with_offset(offset, &mut output[..])
} else {
output.clear();
}
@@ -357,7 +372,7 @@ impl BlockSegmentPostings {
let (skip_data_opt, postings_data) = split_into_skips_and_postings(doc_freq, data);
let skip_reader = match skip_data_opt {
Some(skip_data) => SkipReader::new(skip_data, record_option),
None => SkipReader::new(OwnedRead::new(&[][..]), record_option),
None => SkipReader::new(OwnedRead::new(&EMPTY_ARR[..]), record_option),
};
let doc_freq = doc_freq as usize;
let num_vint_docs = doc_freq % COMPRESSION_BLOCK_SIZE;
@@ -391,7 +406,7 @@ impl BlockSegmentPostings {
if let Some(skip_data) = skip_data_opt {
self.skip_reader.reset(skip_data);
} else {
self.skip_reader.reset(OwnedRead::new(&[][..]))
self.skip_reader.reset(OwnedRead::new(&EMPTY_ARR[..]))
}
self.doc_offset = 0;
self.doc_freq = doc_freq as usize;
@@ -414,10 +429,6 @@ impl BlockSegmentPostings {
self.doc_decoder.output_array()
}
pub(crate) fn docs_aligned(&self) -> (&AlignedBuffer, usize) {
self.doc_decoder.output_aligned()
}
/// Return the document at index `idx` of the block.
#[inline]
pub fn doc(&self, idx: usize) -> u32 {
@@ -608,18 +619,20 @@ impl<'b> Streamer<'b> for BlockSegmentPostings {
#[cfg(test)]
mod tests {
use super::exponential_search;
use super::search_within_block;
use super::BlockSegmentPostings;
use super::BlockSegmentPostingsSkipResult;
use super::SegmentPostings;
use common::HasLen;
use core::Index;
use docset::DocSet;
use postings::postings::Postings;
use fst::Streamer;
use schema::IndexRecordOption;
use schema::Schema;
use schema::Term;
use schema::INDEXED;
use tantivy_fst::Streamer;
use schema::INT_INDEXED;
use DocId;
use SkipResult;
@@ -631,18 +644,6 @@ mod tests {
assert_eq!(postings.len(), 0);
}
#[test]
#[should_panic(expected = "Have you forgotten to call `.advance()`")]
fn test_panic_if_doc_called_before_advance() {
SegmentPostings::empty().doc();
}
#[test]
#[should_panic(expected = "Have you forgotten to call `.advance()`")]
fn test_panic_if_freq_called_before_advance() {
SegmentPostings::empty().term_freq();
}
#[test]
fn test_empty_block_segment_postings() {
let mut postings = BlockSegmentPostings::empty();
@@ -650,6 +651,56 @@ mod tests {
assert_eq!(postings.doc_freq(), 0);
}
fn search_within_block_trivial_but_slow(block: &[u32], target: u32) -> usize {
block
.iter()
.cloned()
.enumerate()
.filter(|&(_, ref val)| *val >= target)
.next()
.unwrap()
.0
}
#[test]
fn test_exponentiel_search() {
assert_eq!(exponential_search(0, &[1, 2]), (0, 1));
assert_eq!(exponential_search(1, &[1, 2]), (0, 1));
assert_eq!(
exponential_search(7, &[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]),
(3, 7)
);
}
fn util_test_search_within_block(block: &[u32], target: u32) {
assert_eq!(
search_within_block(block, target),
search_within_block_trivial_but_slow(block, target)
);
}
fn util_test_search_within_block_all(block: &[u32]) {
use std::collections::HashSet;
let mut targets = HashSet::new();
for (i, val) in block.iter().cloned().enumerate() {
if i > 0 {
targets.insert(val - 1);
}
targets.insert(val);
}
for target in targets {
util_test_search_within_block(block, target);
}
}
#[test]
fn test_search_within_block() {
for len in 1u32..128u32 {
let v: Vec<u32> = (0..len).map(|i| i * 2).collect();
util_test_search_within_block_all(&v[..]);
}
}
#[test]
fn test_block_segment_postings() {
let mut block_segments = build_block_postings(&(0..100_000).collect::<Vec<u32>>());
@@ -698,10 +749,10 @@ mod tests {
fn build_block_postings(docs: &[DocId]) -> BlockSegmentPostings {
let mut schema_builder = Schema::builder();
let int_field = schema_builder.add_u64_field("id", INDEXED);
let int_field = schema_builder.add_u64_field("id", INT_INDEXED);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
let mut last_doc = 0u32;
for &doc in docs {
for _ in last_doc..doc {
@@ -711,7 +762,8 @@ mod tests {
last_doc = doc + 1;
}
index_writer.commit().unwrap();
let searcher = index.reader().unwrap().searcher();
index.load_searchers().unwrap();
let searcher = index.searcher();
let segment_reader = searcher.segment_reader(0);
let inverted_index = segment_reader.inverted_index(int_field);
let term = Term::from_field_u64(int_field, 0u64);
@@ -768,10 +820,10 @@ mod tests {
#[test]
fn test_reset_block_segment_postings() {
let mut schema_builder = Schema::builder();
let int_field = schema_builder.add_u64_field("id", INDEXED);
let int_field = schema_builder.add_u64_field("id", INT_INDEXED);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
// create two postings list, one containg even number,
// the other containing odd numbers.
for i in 0..6 {
@@ -779,7 +831,8 @@ mod tests {
index_writer.add_document(doc);
}
index_writer.commit().unwrap();
let searcher = index.reader().unwrap().searcher();
index.load_searchers().unwrap();
let searcher = index.searcher();
let segment_reader = searcher.segment_reader(0);
let mut block_segments;

View File

@@ -14,7 +14,7 @@ use termdict::{TermDictionaryBuilder, TermOrdinal};
use DocId;
use Result;
/// `InvertedIndexSerializer` is in charge of serializing
/// `PostingsSerializer` is in charge of serializing
/// postings on disk, in the
/// * `.idx` (inverted index)
/// * `.pos` (positions file)
@@ -54,7 +54,7 @@ pub struct InvertedIndexSerializer {
}
impl InvertedIndexSerializer {
/// Open a new `InvertedIndexSerializer` for the given segment
/// Open a new `PostingsSerializer` for the given segment
fn create(
terms_write: CompositeWrite<WritePtr>,
postings_write: CompositeWrite<WritePtr>,
@@ -175,7 +175,7 @@ impl<'a> FieldSerializer<'a> {
let positions_idx = self
.positions_serializer_opt
.as_ref()
.map(PositionSerializer::positions_idx)
.map(|positions_serializer| positions_serializer.positions_idx())
.unwrap_or(0u64);
TermInfo {
doc_freq: 0,

View File

@@ -1,37 +1,28 @@
use super::{Addr, MemoryArena};
use postings::stacker::memory_arena::load;
use postings::stacker::memory_arena::store;
use std::io;
use common::is_power_of_2;
use std::mem;
const MAX_BLOCK_LEN: u32 = 1u32 << 15;
const FIRST_BLOCK: usize = 16;
const INLINED_BLOCK_LEN: usize = FIRST_BLOCK + mem::size_of::<Addr>();
enum CapacityResult {
Available(u32),
NeedAlloc(u32),
}
const FIRST_BLOCK: u32 = 4u32;
fn len_to_capacity(len: u32) -> CapacityResult {
#[inline]
pub fn jump_needed(len: u32) -> Option<usize> {
match len {
0...15 => CapacityResult::Available(FIRST_BLOCK as u32 - len),
16...MAX_BLOCK_LEN => {
let cap = 1 << (32u32 - (len - 1u32).leading_zeros());
let available = cap - len;
if available == 0 {
CapacityResult::NeedAlloc(len)
0...3 => None,
4...MAX_BLOCK_LEN => {
if is_power_of_2(len as usize) {
Some(len as usize)
} else {
CapacityResult::Available(available)
None
}
}
n => {
let available = n % MAX_BLOCK_LEN;
if available == 0 {
CapacityResult::NeedAlloc(MAX_BLOCK_LEN)
if n % MAX_BLOCK_LEN == 0 {
Some(MAX_BLOCK_LEN as usize)
} else {
CapacityResult::Available(MAX_BLOCK_LEN - available)
None
}
}
}
@@ -61,119 +52,82 @@ fn len_to_capacity(len: u32) -> CapacityResult {
#[derive(Debug, Clone, Copy)]
pub struct ExpUnrolledLinkedList {
len: u32,
head: Addr,
tail: Addr,
inlined_data: [u8; INLINED_BLOCK_LEN as usize],
}
pub struct ExpUnrolledLinkedListWriter<'a> {
eull: &'a mut ExpUnrolledLinkedList,
heap: &'a mut MemoryArena,
}
fn ensure_capacity<'a>(
eull: &'a mut ExpUnrolledLinkedList,
heap: &'a mut MemoryArena,
) -> &'a mut [u8] {
if eull.len <= FIRST_BLOCK as u32 {
// We are still hitting the inline block.
if eull.len < FIRST_BLOCK as u32 {
return &mut eull.inlined_data[eull.len as usize..FIRST_BLOCK];
}
// We need to allocate a new block!
let new_block_addr: Addr = heap.allocate_space(FIRST_BLOCK + mem::size_of::<Addr>());
store(&mut eull.inlined_data[FIRST_BLOCK..], new_block_addr);
eull.tail = new_block_addr;
return heap.slice_mut(eull.tail, FIRST_BLOCK);
}
let len = match len_to_capacity(eull.len) {
CapacityResult::NeedAlloc(new_block_len) => {
let new_block_addr: Addr =
heap.allocate_space(new_block_len as usize + mem::size_of::<Addr>());
heap.write_at(eull.tail, new_block_addr);
eull.tail = new_block_addr;
new_block_len
}
CapacityResult::Available(available) => available,
};
heap.slice_mut(eull.tail, len as usize)
}
impl<'a> ExpUnrolledLinkedListWriter<'a> {
pub fn extend_from_slice(&mut self, mut buf: &[u8]) {
if buf.is_empty() {
// we need to cut early, because `ensure_capacity`
// allocates if there is no capacity at all right now.
return;
}
while !buf.is_empty() {
let add_len: usize;
{
let output_buf = ensure_capacity(self.eull, self.heap);
add_len = buf.len().min(output_buf.len());
output_buf[..add_len].copy_from_slice(&buf[..add_len]);
}
self.eull.len += add_len as u32;
self.eull.tail = self.eull.tail.offset(add_len as u32);
buf = &buf[add_len..];
}
}
}
impl<'a> io::Write for ExpUnrolledLinkedListWriter<'a> {
fn write(&mut self, buf: &[u8]) -> io::Result<usize> {
// There is no use case to only write the capacity.
// This is not IO after all, so we write the whole
// buffer even if the contract of `.write` is looser.
self.extend_from_slice(buf);
Ok(buf.len())
}
fn write_all(&mut self, buf: &[u8]) -> io::Result<()> {
self.extend_from_slice(buf);
Ok(())
}
fn flush(&mut self) -> io::Result<()> {
Ok(())
}
}
impl ExpUnrolledLinkedList {
pub fn new() -> ExpUnrolledLinkedList {
pub fn new(heap: &mut MemoryArena) -> ExpUnrolledLinkedList {
let addr = heap.allocate_space((FIRST_BLOCK as usize) * mem::size_of::<u32>());
ExpUnrolledLinkedList {
len: 0u32,
tail: Addr::null_pointer(),
inlined_data: [0u8; INLINED_BLOCK_LEN as usize],
head: addr,
tail: addr,
}
}
#[inline(always)]
pub fn writer<'a>(&'a mut self, heap: &'a mut MemoryArena) -> ExpUnrolledLinkedListWriter<'a> {
ExpUnrolledLinkedListWriter { eull: self, heap }
pub fn iter<'a>(&self, heap: &'a MemoryArena) -> ExpUnrolledLinkedListIterator<'a> {
ExpUnrolledLinkedListIterator {
heap,
addr: self.head,
len: self.len,
consumed: 0,
}
}
pub fn read_to_end(&self, heap: &MemoryArena, output: &mut Vec<u8>) {
let len = self.len as usize;
if len <= FIRST_BLOCK {
output.extend_from_slice(&self.inlined_data[..len]);
return;
/// Appends a new element to the current stack.
///
/// If the current block end is reached, a new block is allocated.
pub fn push(&mut self, val: u32, heap: &mut MemoryArena) {
self.len += 1;
if let Some(new_block_len) = jump_needed(self.len) {
// We need to allocate another block.
// We also allocate an extra `u32` to store the pointer
// to the future next block.
let new_block_size: usize = (new_block_len + 1) * mem::size_of::<u32>();
let new_block_addr: Addr = heap.allocate_space(new_block_size);
unsafe {
// logic
heap.write(self.tail, new_block_addr)
};
self.tail = new_block_addr;
}
output.extend_from_slice(&self.inlined_data[..FIRST_BLOCK]);
let mut cur = FIRST_BLOCK;
let mut addr = load(&self.inlined_data[FIRST_BLOCK..]);
loop {
let cap = match len_to_capacity(cur as u32) {
CapacityResult::Available(capacity) => capacity,
CapacityResult::NeedAlloc(capacity) => capacity,
} as usize;
let data = heap.slice(addr, cap);
if cur + cap >= len {
output.extend_from_slice(&data[..(len - cur)]);
return;
}
output.extend_from_slice(data);
cur += cap;
addr = heap.read(addr.offset(cap as u32));
unsafe {
// logic
heap.write(self.tail, val);
self.tail = self.tail.offset(mem::size_of::<u32>() as u32);
}
}
}
pub struct ExpUnrolledLinkedListIterator<'a> {
heap: &'a MemoryArena,
addr: Addr,
len: u32,
consumed: u32,
}
impl<'a> Iterator for ExpUnrolledLinkedListIterator<'a> {
type Item = u32;
fn next(&mut self) -> Option<u32> {
if self.consumed == self.len {
None
} else {
self.consumed += 1;
let addr: Addr = if jump_needed(self.consumed).is_some() {
unsafe {
// logic
self.heap.read(self.addr)
}
} else {
self.addr
};
self.addr = addr.offset(mem::size_of::<u32>() as u32);
Some(unsafe {
// logic
self.heap.read(addr)
})
}
}
}
@@ -182,134 +136,46 @@ impl ExpUnrolledLinkedList {
mod tests {
use super::super::MemoryArena;
use super::len_to_capacity;
use super::jump_needed;
use super::*;
use byteorder::{ByteOrder, LittleEndian, WriteBytesExt};
#[test]
#[test]
fn test_stack() {
let mut heap = MemoryArena::new();
let mut stack = ExpUnrolledLinkedList::new();
stack.writer(&mut heap).extend_from_slice(&[1u8]);
stack.writer(&mut heap).extend_from_slice(&[2u8]);
stack.writer(&mut heap).extend_from_slice(&[3u8, 4u8]);
stack.writer(&mut heap).extend_from_slice(&[5u8]);
let mut stack = ExpUnrolledLinkedList::new(&mut heap);
stack.push(1u32, &mut heap);
stack.push(2u32, &mut heap);
stack.push(4u32, &mut heap);
stack.push(8u32, &mut heap);
{
let mut buffer = Vec::new();
stack.read_to_end(&heap, &mut buffer);
assert_eq!(&buffer[..], &[1u8, 2u8, 3u8, 4u8, 5u8]);
let mut it = stack.iter(&heap);
assert_eq!(it.next().unwrap(), 1u32);
assert_eq!(it.next().unwrap(), 2u32);
assert_eq!(it.next().unwrap(), 4u32);
assert_eq!(it.next().unwrap(), 8u32);
assert!(it.next().is_none());
}
}
#[test]
fn test_stack_long() {
let mut heap = MemoryArena::new();
let mut stack = ExpUnrolledLinkedList::new();
let source: Vec<u32> = (0..100).collect();
for &el in &source {
assert!(stack
.writer(&mut heap)
.write_u32::<LittleEndian>(el)
.is_ok());
}
let mut buffer = Vec::new();
stack.read_to_end(&heap, &mut buffer);
let mut result = vec![];
let mut remaining = &buffer[..];
while !remaining.is_empty() {
result.push(LittleEndian::read_u32(&remaining[..4]));
remaining = &remaining[4..];
}
assert_eq!(&result[..], &source[..]);
}
#[test]
fn test_stack_interlaced() {
let mut heap = MemoryArena::new();
let mut stack = ExpUnrolledLinkedList::new();
let mut stack2 = ExpUnrolledLinkedList::new();
let mut vec1: Vec<u8> = vec![];
let mut vec2: Vec<u8> = vec![];
for i in 0..9 {
assert!(stack.writer(&mut heap).write_u32::<LittleEndian>(i).is_ok());
assert!(vec1.write_u32::<LittleEndian>(i).is_ok());
if i % 2 == 0 {
assert!(stack2
.writer(&mut heap)
.write_u32::<LittleEndian>(i)
.is_ok());
assert!(vec2.write_u32::<LittleEndian>(i).is_ok());
}
}
let mut res1 = vec![];
let mut res2 = vec![];
stack.read_to_end(&heap, &mut res1);
stack2.read_to_end(&heap, &mut res2);
assert_eq!(&vec1[..], &res1[..]);
assert_eq!(&vec2[..], &res2[..]);
}
#[test]
fn test_jump_if_needed() {
let mut available = 16u32;
for i in 0..10_000_000 {
match len_to_capacity(i) {
CapacityResult::NeedAlloc(cap) => {
assert_eq!(available, 0, "Failed len={}: Expected 0 got {}", i, cap);
available = cap;
}
CapacityResult::Available(cap) => {
assert_eq!(
available, cap,
"Failed len={}: Expected {} Got {}",
i, available, cap
);
}
}
available -= 1;
let mut block_len = 4u32;
let mut i = 0;
while i < 10_000_000 {
assert!(jump_needed(i + block_len - 1).is_none());
assert!(jump_needed(i + block_len + 1).is_none());
assert!(jump_needed(i + block_len).is_some());
let new_block_len = jump_needed(i + block_len).unwrap();
i += block_len;
block_len = new_block_len as u32;
}
}
#[test]
fn test_jump_if_needed_progression() {
let mut v = vec![];
for i in 0.. {
if v.len() >= 10 {
break;
}
match len_to_capacity(i) {
CapacityResult::NeedAlloc(cap) => {
v.push((i, cap));
}
_ => {}
}
}
assert_eq!(
&v[..],
&[
(16, 16),
(32, 32),
(64, 64),
(128, 128),
(256, 256),
(512, 512),
(1024, 1024),
(2048, 2048),
(4096, 4096),
(8192, 8192)
]
);
}
}
#[cfg(all(test, feature = "unstable"))]
mod bench {
use super::super::MemoryArena;
use super::ExpUnrolledLinkedList;
use byteorder::{NativeEndian, WriteBytesExt};
use test::Bencher;
const NUM_STACK: usize = 10_000;
@@ -337,13 +203,13 @@ mod bench {
let mut heap = MemoryArena::new();
let mut stacks = Vec::with_capacity(100);
for _ in 0..NUM_STACK {
let mut stack = ExpUnrolledLinkedList::new();
let mut stack = ExpUnrolledLinkedList::new(&mut heap);
stacks.push(stack);
}
for s in 0..NUM_STACK {
for i in 0u32..STACK_SIZE {
let t = s * 392017 % NUM_STACK;
let _ = stacks[t].writer(&mut heap).write_u32::<NativeEndian>(i);
stacks[t].push(i, &mut heap);
}
}
});

View File

@@ -37,7 +37,7 @@ const PAGE_SIZE: usize = 1 << NUM_BITS_PAGE_ADDR; // pages are 1 MB large
/// page of memory.
///
/// The last 20 bits are an address within this page of memory.
#[derive(Copy, Clone, Debug)]
#[derive(Clone, Copy, Debug)]
pub struct Addr(u32);
impl Addr {
@@ -69,16 +69,32 @@ impl Addr {
}
}
pub fn store<Item: Copy + 'static>(dest: &mut [u8], val: Item) {
assert_eq!(dest.len(), std::mem::size_of::<Item>());
unsafe {
ptr::write_unaligned(dest.as_mut_ptr() as *mut Item, val);
}
/// Trait required for an object to be `storable`.
///
/// # Warning
///
/// Most of the time you should not implement this trait,
/// and only use the `MemoryArena` with object implementing `Copy`.
///
/// `ArenaStorable` is used in `tantivy` to force
/// a `Copy` object and a `slice` of data to be stored contiguously.
pub trait ArenaStorable {
fn num_bytes(&self) -> usize;
unsafe fn write_into(self, arena: &mut MemoryArena, addr: Addr);
}
pub fn load<Item: Copy + 'static>(data: &[u8]) -> Item {
assert_eq!(data.len(), std::mem::size_of::<Item>());
unsafe { ptr::read_unaligned(data.as_ptr() as *const Item) }
impl<V> ArenaStorable for V
where
V: Copy,
{
fn num_bytes(&self) -> usize {
mem::size_of::<V>()
}
unsafe fn write_into(self, arena: &mut MemoryArena, addr: Addr) {
let dst_ptr = arena.get_mut_ptr(addr) as *mut V;
ptr::write_unaligned(dst_ptr, self);
}
}
/// The `MemoryArena`
@@ -110,9 +126,47 @@ impl MemoryArena {
self.pages.len() * PAGE_SIZE
}
pub fn write_at<Item: Copy + 'static>(&mut self, addr: Addr, val: Item) {
let dest = self.slice_mut(addr, std::mem::size_of::<Item>());
store(dest, val);
/// Writes a slice at the given address, assuming the
/// memory was allocated beforehands.
///
/// # Panics
///
/// May panic or corrupt the heap if he space was not
/// properly allocated beforehands.
pub fn write_bytes<B: AsRef<[u8]>>(&mut self, addr: Addr, data: B) {
let bytes = data.as_ref();
self.pages[addr.page_id()]
.get_mut_slice(addr.page_local_addr(), bytes.len())
.copy_from_slice(bytes);
}
/// Returns the `len` bytes starting at `addr`
///
/// # Panics
///
/// Panics if the memory has not been allocated beforehands.
pub fn read_slice(&self, addr: Addr, len: usize) -> &[u8] {
self.pages[addr.page_id()].get_slice(addr.page_local_addr(), len)
}
unsafe fn get_mut_ptr(&mut self, addr: Addr) -> *mut u8 {
self.pages[addr.page_id()].get_mut_ptr(addr.page_local_addr())
}
/// Stores an item's data in the heap
///
/// It allocates the `Item` beforehands.
pub fn store<Item: ArenaStorable>(&mut self, val: Item) -> Addr {
let num_bytes = val.num_bytes();
let addr = self.allocate_space(num_bytes);
unsafe {
self.write(addr, val);
};
addr
}
pub unsafe fn write<Item: ArenaStorable>(&mut self, addr: Addr, val: Item) {
val.write_into(self, addr)
}
/// Read an item in the heap at the given `address`.
@@ -120,21 +174,9 @@ impl MemoryArena {
/// # Panics
///
/// If the address is erroneous
pub fn read<Item: Copy + 'static>(&self, addr: Addr) -> Item {
load(self.slice(addr, mem::size_of::<Item>()))
}
pub fn slice(&self, addr: Addr, len: usize) -> &[u8] {
self.pages[addr.page_id()].slice(addr.page_local_addr(), len)
}
pub fn slice_from(&self, addr: Addr) -> &[u8] {
self.pages[addr.page_id()].slice_from(addr.page_local_addr())
}
#[inline(always)]
pub fn slice_mut(&mut self, addr: Addr, len: usize) -> &mut [u8] {
self.pages[addr.page_id()].slice_mut(addr.page_local_addr(), len)
pub unsafe fn read<Item: Copy>(&self, addr: Addr) -> Item {
let ptr = self.pages[addr.page_id()].get_ptr(addr.page_local_addr());
ptr::read_unaligned(ptr as *const Item)
}
/// Allocates `len` bytes and returns the allocated address.
@@ -155,10 +197,14 @@ struct Page {
impl Page {
fn new(page_id: usize) -> Page {
let mut data: Vec<u8> = Vec::with_capacity(PAGE_SIZE);
unsafe {
data.set_len(PAGE_SIZE);
} // avoid initializing page
Page {
page_id,
len: 0,
data: vec![0u8; PAGE_SIZE].into_boxed_slice(),
data: data.into_boxed_slice(),
}
}
@@ -167,18 +213,14 @@ impl Page {
len + self.len <= PAGE_SIZE
}
fn slice(&self, local_addr: usize, len: usize) -> &[u8] {
&self.slice_from(local_addr)[..len]
}
fn slice_from(&self, local_addr: usize) -> &[u8] {
&self.data[local_addr..]
}
fn slice_mut(&mut self, local_addr: usize, len: usize) -> &mut [u8] {
fn get_mut_slice(&mut self, local_addr: usize, len: usize) -> &mut [u8] {
&mut self.data[local_addr..][..len]
}
fn get_slice(&self, local_addr: usize, len: usize) -> &[u8] {
&self.data[local_addr..][..len]
}
fn allocate_space(&mut self, len: usize) -> Option<Addr> {
if self.is_available(len) {
let addr = Addr::new(self.page_id, self.len);
@@ -188,6 +230,16 @@ impl Page {
None
}
}
#[inline(always)]
pub(crate) unsafe fn get_ptr(&self, addr: usize) -> *const u8 {
self.data.as_ptr().add(addr)
}
#[inline(always)]
pub(crate) unsafe fn get_mut_ptr(&mut self, addr: usize) -> *mut u8 {
self.data.as_mut_ptr().add(addr)
}
}
#[cfg(test)]
@@ -202,13 +254,13 @@ mod tests {
let b = b"happy tax payer";
let addr_a = arena.allocate_space(a.len());
arena.slice_mut(addr_a, a.len()).copy_from_slice(a);
arena.write_bytes(addr_a, a);
let addr_b = arena.allocate_space(b.len());
arena.slice_mut(addr_b, b.len()).copy_from_slice(b);
arena.write_bytes(addr_b, b);
assert_eq!(arena.slice(addr_a, a.len()), a);
assert_eq!(arena.slice(addr_b, b.len()), b);
assert_eq!(arena.read_slice(addr_a, a.len()), a);
assert_eq!(arena.read_slice(addr_b, b.len()), b);
}
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
@@ -231,15 +283,9 @@ mod tests {
b: 221,
c: 12,
};
let num_bytes = std::mem::size_of::<MyTest>();
let addr_a = arena.allocate_space(num_bytes);
arena.write_at(addr_a, a);
let addr_b = arena.allocate_space(num_bytes);
arena.write_at(addr_b, b);
assert_eq!(arena.read::<MyTest>(addr_a), a);
assert_eq!(arena.read::<MyTest>(addr_b), b);
let addr_a = arena.store(a);
let addr_b = arena.store(b);
assert_eq!(unsafe { arena.read::<MyTest>(addr_a) }, a);
assert_eq!(unsafe { arena.read::<MyTest>(addr_b) }, b);
}
}

View File

@@ -3,5 +3,5 @@ mod memory_arena;
mod term_hashmap;
pub use self::expull::ExpUnrolledLinkedList;
pub use self::memory_arena::{Addr, MemoryArena};
pub use self::memory_arena::{Addr, ArenaStorable, MemoryArena};
pub use self::term_hashmap::{compute_table_size, TermHashMap};

View File

@@ -2,14 +2,39 @@ extern crate murmurhash32;
use self::murmurhash32::murmurhash2;
use super::{Addr, MemoryArena};
use byteorder::{ByteOrder, NativeEndian};
use postings::stacker::memory_arena::store;
use postings::UnorderedTermId;
use super::{Addr, ArenaStorable, MemoryArena};
use std::iter;
use std::mem;
use std::slice;
pub type BucketId = usize;
struct KeyBytesValue<'a, V> {
key: &'a [u8],
value: V,
}
impl<'a, V> KeyBytesValue<'a, V> {
fn new(key: &'a [u8], value: V) -> KeyBytesValue<'a, V> {
KeyBytesValue { key, value }
}
}
impl<'a, V> ArenaStorable for KeyBytesValue<'a, V>
where
V: ArenaStorable,
{
fn num_bytes(&self) -> usize {
0u16.num_bytes() + self.key.len() + self.value.num_bytes()
}
unsafe fn write_into(self, arena: &mut MemoryArena, addr: Addr) {
arena.write(addr, self.key.len() as u16);
arena.write_bytes(addr.offset(2), self.key);
arena.write(addr.offset(2 + self.key.len() as u32), self.value);
}
}
/// Returns the actual memory size in bytes
/// required to create a table of size $2^num_bits$.
pub fn compute_table_size(num_bits: usize) -> usize {
@@ -27,7 +52,6 @@ pub fn compute_table_size(num_bits: usize) -> usize {
struct KeyValue {
key_value_addr: Addr,
hash: u32,
unordered_term_id: UnorderedTermId,
}
impl Default for KeyValue {
@@ -35,7 +59,6 @@ impl Default for KeyValue {
KeyValue {
key_value_addr: Addr::null_pointer(),
hash: 0u32,
unordered_term_id: UnorderedTermId::default(),
}
}
}
@@ -60,7 +83,6 @@ pub struct TermHashMap {
pub heap: MemoryArena,
mask: usize,
occupied: Vec<usize>,
len: usize,
}
struct QuadraticProbing {
@@ -87,13 +109,14 @@ pub struct Iter<'a> {
}
impl<'a> Iterator for Iter<'a> {
type Item = (&'a [u8], Addr, UnorderedTermId);
type Item = (&'a [u8], Addr, BucketId);
fn next(&mut self) -> Option<Self::Item> {
self.inner.next().cloned().map(move |bucket: usize| {
let kv = self.hashmap.table[bucket];
let (key, offset): (&'a [u8], Addr) = self.hashmap.get_key_value(kv.key_value_addr);
(key, offset, kv.unordered_term_id)
let (key, offset): (&'a [u8], Addr) =
unsafe { self.hashmap.get_key_value(kv.key_value_addr) };
(key, offset, bucket as BucketId)
})
}
}
@@ -108,7 +131,6 @@ impl TermHashMap {
heap,
mask: table_size - 1,
occupied: Vec::with_capacity(table_size / 2),
len: 0,
}
}
@@ -124,34 +146,20 @@ impl TermHashMap {
self.table.len() < self.occupied.len() * 3
}
#[inline(always)]
fn get_key_value(&self, addr: Addr) -> (&[u8], Addr) {
let data = self.heap.slice_from(addr);
let key_bytes_len = NativeEndian::read_u16(data) as usize;
let key_bytes: &[u8] = &data[2..][..key_bytes_len];
(key_bytes, addr.offset(2u32 + key_bytes_len as u32))
unsafe fn get_key_value(&self, addr: Addr) -> (&[u8], Addr) {
let key_bytes_len = self.heap.read::<u16>(addr) as usize;
let key_addr = addr.offset(2u32);
let key_bytes: &[u8] = self.heap.read_slice(key_addr, key_bytes_len);
let val_addr: Addr = key_addr.offset(key_bytes.len() as u32);
(key_bytes, val_addr)
}
#[inline(always)]
fn get_value_addr_if_key_match(&self, target_key: &[u8], addr: Addr) -> Option<Addr> {
let (stored_key, value_addr) = self.get_key_value(addr);
if stored_key == target_key {
Some(value_addr)
} else {
None
}
}
fn set_bucket(&mut self, hash: u32, key_value_addr: Addr, bucket: usize) -> UnorderedTermId {
pub fn set_bucket(&mut self, hash: u32, key_value_addr: Addr, bucket: usize) {
self.occupied.push(bucket);
let unordered_term_id = self.len as UnorderedTermId;
self.len += 1;
self.table[bucket] = KeyValue {
key_value_addr,
hash,
unordered_term_id,
};
unordered_term_id
}
pub fn iter(&self) -> Iter {
@@ -191,14 +199,10 @@ impl TermHashMap {
/// will be in charge of returning a default value.
/// If the key already as an associated value, then it will be passed
/// `Some(previous_value)`.
pub fn mutate_or_create<S, V, TMutator>(
&mut self,
key: S,
mut updater: TMutator,
) -> UnorderedTermId
pub fn mutate_or_create<S, V, TMutator>(&mut self, key: S, mut updater: TMutator) -> BucketId
where
S: AsRef<[u8]>,
V: Copy + 'static,
V: Copy,
TMutator: FnMut(Option<V>) -> V,
{
if self.is_saturated() {
@@ -211,33 +215,48 @@ impl TermHashMap {
let bucket = probe.next_probe();
let kv: KeyValue = self.table[bucket];
if kv.is_empty() {
// The key does not exists yet.
let val = updater(None);
let num_bytes =
std::mem::size_of::<u16>() + key_bytes.len() + std::mem::size_of::<V>();
let key_addr = self.heap.allocate_space(num_bytes);
{
let data = self.heap.slice_mut(key_addr, num_bytes);
NativeEndian::write_u16(data, key_bytes.len() as u16);
let stop = 2 + key_bytes.len();
data[2..stop].copy_from_slice(key_bytes);
store(&mut data[stop..], val);
}
return self.set_bucket(hash, key_addr, bucket);
let key_addr = self.heap.store(KeyBytesValue::new(key_bytes, val));
self.set_bucket(hash, key_addr, bucket);
return bucket as BucketId;
} else if kv.hash == hash {
if let Some(val_addr) =
self.get_value_addr_if_key_match(key_bytes, kv.key_value_addr)
{
let v = self.heap.read(val_addr);
let new_v = updater(Some(v));
self.heap.write_at(val_addr, new_v);
return kv.unordered_term_id;
let (key_matches, val_addr) = {
let (stored_key, val_addr): (&[u8], Addr) =
unsafe { self.get_key_value(kv.key_value_addr) };
(stored_key == key_bytes, val_addr)
};
if key_matches {
unsafe {
// logic
let v = self.heap.read(val_addr);
let new_v = updater(Some(v));
self.heap.write(val_addr, new_v);
};
return bucket as BucketId;
}
}
}
}
}
#[cfg(all(test, feature = "unstable"))]
mod bench {
use super::murmurhash2::murmurhash2;
use test::Bencher;
#[bench]
fn bench_murmurhash2(b: &mut Bencher) {
let keys: [&'static str; 3] = ["wer qwe qwe qwe ", "werbq weqweqwe2 ", "weraq weqweqwe3 "];
b.iter(|| {
let mut s = 0;
for &key in &keys {
s ^= murmurhash2(key.as_bytes());
}
s
});
}
}
#[cfg(test)]
mod tests {
@@ -269,7 +288,10 @@ mod tests {
let mut vanilla_hash_map = HashMap::new();
let mut iter_values = hash_map.iter();
while let Some((key, addr, _)) = iter_values.next() {
let val: u32 = hash_map.heap.read(addr);
let val: u32 = unsafe {
// test
hash_map.heap.read(addr)
};
vanilla_hash_map.insert(key.to_owned(), val);
}
assert_eq!(vanilla_hash_map.len(), 2);

View File

@@ -101,9 +101,8 @@ mod tests {
index_writer.commit().unwrap();
index_writer.add_document(doc!(field=>"ccc"));
index_writer.commit().unwrap();
let reader = index.reader().unwrap();
reader.reload().unwrap();
let searcher = reader.searcher();
index.load_searchers().unwrap();
let searcher = index.searcher();
let weight = AllQuery.weight(&searcher, false).unwrap();
{
let reader = searcher.segment_reader(0);

View File

@@ -1,10 +1,10 @@
use common::BitSet;
use core::SegmentReader;
use fst::Automaton;
use query::BitSetDocSet;
use query::ConstScorer;
use query::{Scorer, Weight};
use schema::{Field, IndexRecordOption};
use tantivy_fst::Automaton;
use termdict::{TermDictionary, TermStreamer};
use Result;

View File

@@ -1,4 +1,5 @@
use core::SegmentReader;
use downcast::Downcast;
use query::intersect_scorers;
use query::score_combiner::{DoNothingCombiner, ScoreCombiner, SumWithCoordsCombiner};
use query::term_query::TermScorer;
@@ -9,6 +10,7 @@ use query::RequiredOptionalScorer;
use query::Scorer;
use query::Union;
use query::Weight;
use std::borrow::Borrow;
use std::collections::HashMap;
use Result;
@@ -22,11 +24,14 @@ where
}
{
let is_all_term_queries = scorers.iter().all(|scorer| scorer.is::<TermScorer>());
let is_all_term_queries = scorers.iter().all(|scorer| {
let scorer_ref: &Scorer = scorer.borrow();
Downcast::<TermScorer>::is_type(scorer_ref)
});
if is_all_term_queries {
let scorers: Vec<TermScorer> = scorers
.into_iter()
.map(|scorer| *(scorer.downcast::<TermScorer>().map_err(|_| ()).unwrap()))
.map(|scorer| *Downcast::<TermScorer>::downcast(scorer).unwrap())
.collect();
let scorer: Box<Scorer> = Box::new(Union::<TermScorer, TScoreCombiner>::from(scorers));
return scorer;

View File

@@ -8,6 +8,7 @@ mod tests {
use super::*;
use collector::tests::TestCollector;
use downcast::Downcast;
use query::score_combiner::SumWithCoordsCombiner;
use query::term_query::TermScorer;
use query::Intersection;
@@ -28,7 +29,7 @@ mod tests {
let index = Index::create_in_ram(schema);
{
// writing the segment
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
{
let doc = doc!(text_field => "a b c");
index_writer.add_document(doc);
@@ -51,6 +52,7 @@ mod tests {
}
assert!(index_writer.commit().is_ok());
}
index.load_searchers().unwrap();
(index, text_field)
}
@@ -59,8 +61,7 @@ mod tests {
let (index, text_field) = aux_test_helper();
let query_parser = QueryParser::for_index(&index, vec![text_field]);
let query = query_parser.parse_query("(+a +b) d").unwrap();
let searcher = index.reader().unwrap().searcher();
assert_eq!(query.count(&searcher).unwrap(), 3);
assert_eq!(query.count(&*index.searcher()).unwrap(), 3);
}
#[test]
@@ -68,28 +69,28 @@ mod tests {
let (index, text_field) = aux_test_helper();
let query_parser = QueryParser::for_index(&index, vec![text_field]);
let query = query_parser.parse_query("+a").unwrap();
let searcher = index.reader().unwrap().searcher();
let searcher = index.searcher();
let weight = query.weight(&searcher, true).unwrap();
let scorer = weight.scorer(searcher.segment_reader(0u32)).unwrap();
assert!(scorer.is::<TermScorer>());
assert!(Downcast::<TermScorer>::is_type(&*scorer));
}
#[test]
pub fn test_boolean_termonly_intersection() {
let (index, text_field) = aux_test_helper();
let query_parser = QueryParser::for_index(&index, vec![text_field]);
let searcher = index.reader().unwrap().searcher();
let searcher = index.searcher();
{
let query = query_parser.parse_query("+a +b +c").unwrap();
let weight = query.weight(&searcher, true).unwrap();
let scorer = weight.scorer(searcher.segment_reader(0u32)).unwrap();
assert!(scorer.is::<Intersection<TermScorer>>());
assert!(Downcast::<Intersection<TermScorer>>::is_type(&*scorer));
}
{
let query = query_parser.parse_query("+a +(b c)").unwrap();
let weight = query.weight(&searcher, true).unwrap();
let scorer = weight.scorer(searcher.segment_reader(0u32)).unwrap();
assert!(scorer.is::<Intersection<Box<Scorer>>>());
assert!(Downcast::<Intersection<Box<Scorer>>>::is_type(&*scorer));
}
}
@@ -97,19 +98,21 @@ mod tests {
pub fn test_boolean_reqopt() {
let (index, text_field) = aux_test_helper();
let query_parser = QueryParser::for_index(&index, vec![text_field]);
let searcher = index.reader().unwrap().searcher();
let searcher = index.searcher();
{
let query = query_parser.parse_query("+a b").unwrap();
let weight = query.weight(&searcher, true).unwrap();
let scorer = weight.scorer(searcher.segment_reader(0u32)).unwrap();
assert!(scorer
.is::<RequiredOptionalScorer<Box<Scorer>, Box<Scorer>, SumWithCoordsCombiner>>());
assert!(Downcast::<
RequiredOptionalScorer<Box<Scorer>, Box<Scorer>, SumWithCoordsCombiner>,
>::is_type(&*scorer));
}
{
let query = query_parser.parse_query("+a b").unwrap();
let weight = query.weight(&searcher, false).unwrap();
let scorer = weight.scorer(searcher.segment_reader(0u32)).unwrap();
assert!(scorer.is::<TermScorer>());
println!("{:?}", scorer.type_name());
assert!(Downcast::<TermScorer>::is_type(&*scorer));
}
}
@@ -126,13 +129,10 @@ mod tests {
query
};
let reader = index.reader().unwrap();
let matching_docs = |boolean_query: &Query| {
reader
.searcher()
.search(boolean_query, &TestCollector)
.unwrap()
let searcher = index.searcher();
let test_docs = searcher.search(boolean_query, &TestCollector).unwrap();
test_docs
.docs()
.iter()
.cloned()
@@ -188,12 +188,10 @@ mod tests {
let query: Box<Query> = Box::new(term_query);
query
};
let reader = index.reader().unwrap();
let score_docs = |boolean_query: &Query| {
let fruit = reader
.searcher()
.search(boolean_query, &TestCollector)
.unwrap();
let searcher = index.searcher();
let fruit = searcher.search(boolean_query, &TestCollector).unwrap();
fruit.scores().to_vec()
};
@@ -205,332 +203,4 @@ mod tests {
assert_eq!(score_docs(&boolean_query), vec![0.977973, 0.84699446]);
}
}
/*
DoC 0
{
"_index": "test",
"_type": "_doc",
"_id": "0",
"matched": true,
"explanation": {
"value": 6.2610235,
"description": "max of:",
"details": [{
"value": 6.1969156,
"description": "sum of:",
"details": [{
"value": 6.1969156,
"description": "weight(text:оксана in 561) [PerFieldSimilarity], result of:",
"details": [{
"value": 6.1969156,
"description": "score(freq=1.0), product of:",
"details": [{
"value": 2.2,
"description": "boost",
"details": []
}, {
"value": 5.65998,
"description": "idf, computed as log(1 + (N - n + 0.5) / (n + 0.5)) from:",
"details": [{
"value": 3,
"description": "n, number of documents containing term",
"details": []
}, {
"value": 1004,
"description": "N, total number of documents with field",
"details": []
}]
}, {
"value": 0.49766606,
"description": "tf, computed as freq / (freq + k1 * (1 - b + b * dl / avgdl)) from:",
"details": [{
"value": 1.0,
"description": "freq, occurrences of term within document",
"details": []
}, {
"value": 1.2,
"description": "k1, term saturation parameter",
"details": []
}, {
"value": 0.75,
"description": "b, length normalization parameter",
"details": []
}, {
"value": 19.0,
"description": "dl, length of field",
"details": []
}, {
"value": 24.105577,
"description": "avgdl, average length of field",
"details": []
}]
}]
}]
}]
}, {
"value": 6.2610235,
"description": "sum of:",
"details": [{
"value": 6.2610235,
"description": "weight(title:оксана in 561) [PerFieldSimilarity], result of:",
"details": [{
"value": 6.2610235,
"description": "score(freq=1.0), product of:",
"details": [{
"value": 2.2,
"description": "boost",
"details": []
}, {
"value": 5.4086657,
"description": "idf, computed as log(1 + (N - n + 0.5) / (n + 0.5)) from:",
"details": [{
"value": 4,
"description": "n, number of documents containing term",
"details": []
}, {
"value": 1004,
"description": "N, total number of documents with field",
"details": []
}]
}, {
"value": 0.52617776,
"description": "tf, computed as freq / (freq + k1 * (1 - b + b * dl / avgdl)) from:",
"details": [{
"value": 1.0,
"description": "freq, occurrences of term within document",
"details": []
}, {
"value": 1.2,
"description": "k1, term saturation parameter",
"details": []
}, {
"value": 0.75,
"description": "b, length normalization parameter",
"details": []
}, {
"value": 4.0,
"description": "dl, length of field",
"details": []
}, {
"value": 5.99502,
"description": "avgdl, average length of field",
"details": []
}]
}]
}]
}]
}]
}
}
doc 2
{
"_index": "test",
"_type": "_doc",
"_id": "2",
"matched": true,
"explanation": {
"value": 11.911896,
"description": "max of:",
"details": [{
"value": 11.911896,
"description": "sum of:",
"details": [{
"value": 5.4068284,
"description": "weight(title:оксана in 0) [PerFieldSimilarity], result of:",
"details": [{
"value": 5.4068284,
"description": "score(freq=1.0), product of:",
"details": [{
"value": 2.2,
"description": "boost",
"details": []
}, {
"value": 5.4086657,
"description": "idf, computed as log(1 + (N - n + 0.5) / (n + 0.5)) from:",
"details": [{
"value": 4,
"description": "n, number of documents containing term",
"details": []
}, {
"value": 1004,
"description": "N, total number of documents with field",
"details": []
}]
}, {
"value": 0.45439103,
"description": "tf, computed as freq / (freq + k1 * (1 - b + b * dl / avgdl)) from:",
"details": [{
"value": 1.0,
"description": "freq, occurrences of term within document",
"details": []
}, {
"value": 1.2,
"description": "k1, term saturation parameter",
"details": []
}, {
"value": 0.75,
"description": "b, length normalization parameter",
"details": []
}, {
"value": 6.0,
"description": "dl, length of field",
"details": []
}, {
"value": 5.99502,
"description": "avgdl, average length of field",
"details": []
}]
}]
}]
}, {
"value": 6.505067,
"description": "weight(title:лифенко in 0) [PerFieldSimilarity], result of:",
"details": [{
"value": 6.505067,
"description": "score(freq=1.0), product of:",
"details": [{
"value": 2.2,
"description": "boost",
"details": []
}, {
"value": 6.5072775,
"description": "idf, computed as log(1 + (N - n + 0.5) / (n + 0.5)) from:",
"details": [{
"value": 1,
"description": "n, number of documents containing term",
"details": []
}, {
"value": 1004,
"description": "N, total number of documents with field",
"details": []
}]
}, {
"value": 0.45439103,
"description": "tf, computed as freq / (freq + k1 * (1 - b + b * dl / avgdl)) from:",
"details": [{
"value": 1.0,
"description": "freq, occurrences of term within document",
"details": []
}, {
"value": 1.2,
"description": "k1, term saturation parameter",
"details": []
}, {
"value": 0.75,
"description": "b, length normalization parameter",
"details": []
}, {
"value": 6.0,
"description": "dl, length of field",
"details": []
}, {
"value": 5.99502,
"description": "avgdl, average length of field",
"details": []
}]
}]
}]
}]
}]
}
}
*/
// motivated by #554
#[test]
fn test_bm25_several_fields() {
let mut schema_builder = Schema::builder();
let title = schema_builder.add_text_field("title", TEXT);
let text = schema_builder.add_text_field("text", TEXT);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
index_writer.add_document(doc!(
// tf = 1 0
title => "Законы притяжения Оксана Кулакова",
// tf = 1 0
text => "Законы притяжения Оксана Кулакова] \n\nТема: Сексуальное искусство, Женственность\nТип товара: Запись вебинара (аудио)\nПродолжительность: 1,5 часа\n\nСсылка на вебинар:\n ",
));
index_writer.add_document(doc!(
// tf = 1 0
title => "Любимые русские пироги (Оксана Путан)",
// tf = 2 0
text => "http://i95.fastpic.ru/big/2017/0628/9a/615b9c8504d94a3893d7f496ac53539a.jpg \n\nОт издателя\nОксана Путан профессиональный повар, автор кулинарных книг и известный кулинарный блогер. Ее рецепты отличаются практичностью, доступностью и пользуются огромной популярностью в русскоязычном интернете. Это третья книга автора о самом вкусном и ароматном настоящих русских пирогах и выпечке!\nДаже новички на кухне легко готовят по ее рецептам. Оксана описывает процесс приготовления настолько подробно и понятно, что вам остается только наслаждаться готовкой и не тратить время на лишние усилия. Готовьте легко и просто!\n\nhttps://www.ozon.ru/context/detail/id/139872462/"
));
index_writer.add_document(doc!(
// tf = 1 1
title => "PDF Мастер Класс \"Морячок\" (Оксана Лифенко)",
// tf = 0 0
text => "https://i.ibb.co/pzvHrDN/I3d U T6 Gg TM.jpg\nhttps://i.ibb.co/NFrb6v6/N0ls Z9nwjb U.jpg\nВ описание входит штаны, кофта, берет, матросский воротник. Описание продается в формате PDF, состоит из 12 страниц формата А4 и может быть напечатано на любом принтере.\nОписание предназначено для кукол BJD RealPuki от FairyLand, но может подойти и другим подобным куклам. Также вы можете вязать этот наряд из обычной пряжи, и он подойдет для куколок побольше.\nhttps://vk.com/market 95724412?w=product 95724412_2212"
));
for _ in 0..1_000 {
index_writer.add_document(doc!(
title => "a b d e f g",
text => "maitre corbeau sur un arbre perche tenait dans son bec un fromage Maitre rnard par lodeur alleche lui tint a peu pres ce langage."
));
}
index_writer.commit().unwrap();
let reader = index.reader().unwrap();
let searcher = reader.searcher();
let query_parser = QueryParser::for_index(&index, vec![title, text]);
let query = query_parser
.parse_query("Оксана Лифенко")
.unwrap();
let weight = query.weight(&searcher, true).unwrap();
let mut scorer = weight.scorer(searcher.segment_reader(0u32)).unwrap();
// let mut scores = vec![];
// while
println!("=====|");
scorer.advance();
dbg!("scorer.score()");
assert!(false);
// scores.push(scorer.score());
// assert_eq!(scores, &[0.8017307, 0.72233325, 1.0300813]);
}
// motivated by #554
#[test]
fn test_bm25_several_fields_bbb() {
let mut schema_builder = Schema::builder();
let text = schema_builder.add_text_field("text", TEXT);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
index_writer.add_document(doc!(
text => "Законы притяжения Оксана Кулакова] \n\nТема: Сексуальное искусство, Женственность\nТип товара: Запись вебинара (аудио)\nПродолжительность: 1,5 часа\n\nСсылка на вебинар:\n ",
));
index_writer.add_document(doc!(
text => "http://i95.fastpic.ru/big/2017/0628/9a/615b9c8504d94a3893d7f496ac53539a.jpg \n\nОт издателя\nОксана Путан профессиональный повар, автор кулинарных книг и известный кулинарный блогер. Ее рецепты отличаются практичностью, доступностью и пользуются огромной популярностью в русскоязычном интернете. Это третья книга автора о самом вкусном и ароматном настоящих русских пирогах и выпечке!\nДаже новички на кухне легко готовят по ее рецептам. Оксана описывает процесс приготовления настолько подробно и понятно, что вам остается только наслаждаться готовкой и не тратить время на лишние усилия. Готовьте легко и просто!\n\nhttps://www.ozon.ru/context/detail/id/139872462/"
));
index_writer.add_document(doc!(
text => "https://i.ibb.co/pzvHrDN/I3d U T6 Gg TM.jpg\nhttps://i.ibb.co/NFrb6v6/N0ls Z9nwjb U.jpg\nВ описание входит штаны, кофта, берет, матросский воротник. Описание продается в формате PDF, состоит из 12 страниц формата А4 и может быть напечатано на любом принтере.\nОписание предназначено для кукол BJD RealPuki от FairyLand, но может подойти и другим подобным куклам. Также вы можете вязать этот наряд из обычной пряжи, и он подойдет для куколок побольше.\nhttps://vk.com/market 95724412?w=product 95724412_2212"
));
for _ in 0..100 {
index_writer.add_document(doc!(
text => "maitre corbeau sur un arbre perche tenait dans son bec un fromage Maitre rnard par lodeur alleche lui tint a peu pres ce langage."
));
}
index_writer.commit().unwrap();
let reader = index.reader().unwrap();
let searcher = reader.searcher();
let query_parser = QueryParser::for_index(&index, vec![text]);
let query = query_parser
.parse_query("Оксана Лифенко")
.unwrap();
let weight = query.weight(&searcher, true).unwrap();
let mut scorer = weight.scorer(searcher.segment_reader(0u32)).unwrap();
let mut scores = vec![];
while scorer.advance() {
scores.push(scorer.score());
}
assert_eq!(scores, &[0.8017307, 0.72233325, 1.0300813]);
index_writer.commit().unwrap();
}
}

Some files were not shown because too many files have changed in this diff Show More