Compare commits

..

4 Commits

Author SHA1 Message Date
Paul Masurel
f3099a83eb Blop 2018-12-24 11:41:18 +09:00
Paul Masurel
f745bb9d2a blop 2018-12-24 11:28:08 +09:00
Paul Masurel
d9417acbc6 done 2018-12-11 09:01:45 +09:00
Paul Masurel
38540c3826 small step 2018-12-09 15:26:19 +09:00
137 changed files with 3306 additions and 5619 deletions

View File

@@ -29,7 +29,7 @@ addons:
matrix:
include:
# Android
- env: TARGET=aarch64-linux-android DISABLE_TESTS
- env: TARGET=aarch64-linux-android DISABLE_TESTS=1
#- env: TARGET=arm-linux-androideabi DISABLE_TESTS=1
#- env: TARGET=armv7-linux-androideabi DISABLE_TESTS=1
#- env: TARGET=i686-linux-android DISABLE_TESTS=1
@@ -77,4 +77,4 @@ before_cache:
notifications:
email:
on_success: never
on_success: never

View File

@@ -1,39 +1,5 @@
Tantivy 0.9.0
=====================
*0.9.0 index format is not compatible with the
previous index format.*
- MAJOR BUGFIX :
Some `Mmap` objects were being leaked, and would never get released. (@fulmicoton)
- Removed most unsafe (@fulmicoton)
- Indexer memory footprint improved (VInt comp, inlining the first block). (@fulmicoton)
- Stemming in other languages possible (@pentlander)
- Segments with no docs are deleted earlier (@barrotsteindev)
- Added grouped add and delete operations.
They are guaranteed to happen together (i.e. they cannot be split by a commit).
In addition, adds are guaranteed to happen on the same segment. (@elbow-jason)
- Removed `INT_STORED` and `INT_INDEXED`. It is now possible to use `STORED` and `INDEXED`
for int fields. (@fulmicoton)
- Added DateTime field (@barrotsteindev)
- Added IndexReader. By default, index is reloaded automatically upon new commits (@fulmicoton)
- SIMD linear search within blocks (@fulmicoton)
Tantivy 0.8.2
=====================
Fixing build for x86_64 platforms. (#496)
No need to update from 0.8.1 if tantivy
is building on your platform.
Tantivy 0.8.1
=====================
Hotfix of #476.
Merge was reflecting deletes before commit was passed.
Thanks @barrotsteindev for reporting the bug.
Tantivy 0.8.0
=====================
*No change in the index format*
- API Breaking change in the collector API. (@jwolfe, @fulmicoton)
- Multithreaded search (@jwolfe, @fulmicoton)

View File

@@ -1,6 +1,6 @@
[package]
name = "tantivy"
version = "0.9.0"
version = "0.8.0-dev"
authors = ["Paul Masurel <paul.masurel@gmail.com>"]
license = "MIT"
categories = ["database-implementations", "data-structures"]
@@ -16,8 +16,8 @@ base64 = "0.10.0"
byteorder = "1.0"
lazy_static = "1"
regex = "1.0"
tantivy-fst = "0.1"
memmap = {version = "0.7", optional=true}
fst = {version="0.3", default-features=false}
fst-regex = { version="0.2" }
lz4 = {version="1.20", optional=true}
snap = {version="0.2"}
atomicwrites = {version="0.2.2", optional=true}
@@ -29,29 +29,27 @@ serde = "1.0"
serde_derive = "1.0"
serde_json = "1.0"
num_cpus = "1.2"
fs2={version="0.4", optional=true}
itertools = "0.8"
itertools = "0.7"
levenshtein_automata = {version="0.1", features=["fst_automaton"]}
notify = {version="4", optional=true}
bit-set = "0.5"
uuid = { version = "0.7.2", features = ["v4", "serde"] }
uuid = { version = "0.7", features = ["v4", "serde"] }
crossbeam = "0.5"
futures = "0.1"
futures-cpupool = "0.1"
owning_ref = "0.4"
stable_deref_trait = "1.0.0"
rust-stemmers = "1.1"
downcast-rs = { version="1.0" }
bitpacking = "0.6"
census = "0.2"
rust-stemmers = "1"
downcast = { version="0.9" }
matches = "0.1"
bitpacking = "0.5"
census = "0.1"
fnv = "1.0.6"
owned-read = "0.4"
failure = "0.1"
htmlescape = "0.3.1"
fail = "0.2"
scoped-pool = "1.0"
murmurhash32 = "0.2"
chrono = "0.4"
aho-corasick = "0.6"
[target.'cfg(windows)'.dependencies]
winapi = "0.2"
@@ -59,8 +57,6 @@ winapi = "0.2"
[dev-dependencies]
rand = "0.6"
maplit = "1"
matches = "0.1.8"
time = "0.1.42"
[profile.release]
opt-level = 3
@@ -74,11 +70,12 @@ overflow-checks = true
[features]
# by default no-fail is disabled. We manually enable it when running test.
default = ["mmap", "no_fail"]
mmap = ["atomicwrites", "fs2", "memmap", "notify"]
mmap = ["fst/mmap", "atomicwrites"]
lz4-compression = ["lz4"]
no_fail = ["fail/no_fail"]
unstable = [] # useful for benches.
wasm-bindgen = ["uuid/wasm-bindgen"]
[badges]
travis-ci = { repository = "tantivy-search/tantivy" }

View File

@@ -17,29 +17,19 @@
[![](https://sourcerer.io/fame/fulmicoton/tantivy-search/tantivy/images/6)](https://sourcerer.io/fame/fulmicoton/tantivy-search/tantivy/links/6)
[![](https://sourcerer.io/fame/fulmicoton/tantivy-search/tantivy/images/7)](https://sourcerer.io/fame/fulmicoton/tantivy-search/tantivy/links/7)
[![Become a patron](https://c5.patreon.com/external/logo/become_a_patron_button.png)](https://www.patreon.com/fulmicoton)
**Tantivy** is a **full text search engine library** written in rust.
It is closer to [Apache Lucene](https://lucene.apache.org/) than to [Elasticsearch](https://www.elastic.co/products/elasticsearch) and [Apache Solr](https://lucene.apache.org/solr/) in the sense it is not
It is closer to [Apache Lucene](https://lucene.apache.org/) than to [Elastic Search](https://www.elastic.co/products/elasticsearch) and [Apache Solr](https://lucene.apache.org/solr/) in the sense it is not
an off-the-shelf search engine server, but rather a crate that can be used
to build such a search engine.
Tantivy is, in fact, strongly inspired by Lucene's design.
# Benchmark
Tantivy is typically faster than Lucene, but the results will depend on
the nature of the queries in your workload.
The following [benchmark](https://tantivy-search.github.io/bench/) breaks down
performance for different types of queries / collections.
# Features
- Full-text search
- Configurable tokenizer (stemming available for 17 Latin languages; third-party support for Chinese ([tantivy-jieba](https://crates.io/crates/tantivy-jieba) and [cang-jie](https://crates.io/crates/cang-jie)) and [Japanese](https://crates.io/crates/tantivy-tokenizer-tiny-segmenter))
- Fast (check out the :racehorse: :sparkles: [benchmark](https://tantivy-search.github.io/bench/) :sparkles: :racehorse:)
- Tiny startup time (<10ms), perfect for command line tools
- BM25 scoring (the same as lucene)
@@ -51,7 +41,6 @@ performance for different type of queries / collection.
- SIMD integer compression when the platform/CPU includes the SSE2 instruction set.
- Single valued and multivalued u64 and i64 fast fields (equivalent of doc values in Lucene)
- `&[u8]` fast fields
- Text, i64, u64, dates and hierarchical facet fields
- LZ4 compressed document store
- Range queries
- Faceted search
@@ -87,7 +76,7 @@ It will walk you through getting a wikipedia search engine up and running in a f
Tantivy compiles on stable rust but requires `Rust >= 1.27`.
To check out and run tests, you can simply run :
git clone https://github.com/tantivy-search/tantivy.git
git clone git@github.com:tantivy-search/tantivy.git
cd tantivy
cargo build
@@ -96,14 +85,6 @@ To check out and run tests, you can simply run :
Some tests will not run with just `cargo test` because of `fail-rs`.
To run the tests exhaustively, run `./run-tests.sh`.
# How can I support this project ?
# Contribute
There are many ways to support this project.
- If you use tantivy, tell us about your experience on [gitter](https://gitter.im/tantivy-search/tantivy) or by email (paul.masurel@gmail.com)
- Report bugs
- Write a blog post
- Complete documentation
- Contribute code (you can join [our gitter](https://gitter.im/tantivy-search/tantivy) )
- Talk about tantivy around you
- Drop a word on [![Say Thanks!](https://img.shields.io/badge/Say%20Thanks-!-1EAEDB.svg)](https://saythanks.io/to/fulmicoton) or even [![Become a patron](https://c5.patreon.com/external/logo/become_a_patron_button.png)](https://www.patreon.com/fulmicoton)
Send me an email (paul.masurel at gmail.com) if you want to contribute to tantivy.

View File

@@ -20,7 +20,6 @@ use tantivy::collector::TopDocs;
use tantivy::query::QueryParser;
use tantivy::schema::*;
use tantivy::Index;
use tantivy::ReloadPolicy;
use tempdir::TempDir;
fn main() -> tantivy::Result<()> {
@@ -107,37 +106,37 @@ fn main() -> tantivy::Result<()> {
// For convenience, tantivy also comes with a macro to
// reduce the boilerplate above.
index_writer.add_document(doc!(
title => "Of Mice and Men",
body => "A few miles south of Soledad, the Salinas River drops in close to the hillside \
bank and runs deep and green. The water is warm too, for it has slipped twinkling \
over the yellow sands in the sunlight before reaching the narrow pool. On one \
side of the river the golden foothill slopes curve up to the strong and rocky \
Gabilan Mountains, but on the valley side the water is lined with trees—willows \
fresh and green with every spring, carrying in their lower leaf junctures the \
debris of the winters flooding; and sycamores with mottled, white, recumbent \
limbs and branches that arch over the pool"
title => "Of Mice and Men",
body => "A few miles south of Soledad, the Salinas River drops in close to the hillside \
bank and runs deep and green. The water is warm too, for it has slipped twinkling \
over the yellow sands in the sunlight before reaching the narrow pool. On one \
side of the river the golden foothill slopes curve up to the strong and rocky \
Gabilan Mountains, but on the valley side the water is lined with trees—willows \
fresh and green with every spring, carrying in their lower leaf junctures the \
debris of the winters flooding; and sycamores with mottled, white, recumbent \
limbs and branches that arch over the pool"
));
index_writer.add_document(doc!(
title => "Of Mice and Men",
body => "A few miles south of Soledad, the Salinas River drops in close to the hillside \
bank and runs deep and green. The water is warm too, for it has slipped twinkling \
over the yellow sands in the sunlight before reaching the narrow pool. On one \
side of the river the golden foothill slopes curve up to the strong and rocky \
Gabilan Mountains, but on the valley side the water is lined with trees—willows \
fresh and green with every spring, carrying in their lower leaf junctures the \
debris of the winters flooding; and sycamores with mottled, white, recumbent \
limbs and branches that arch over the pool"
title => "Of Mice and Men",
body => "A few miles south of Soledad, the Salinas River drops in close to the hillside \
bank and runs deep and green. The water is warm too, for it has slipped twinkling \
over the yellow sands in the sunlight before reaching the narrow pool. On one \
side of the river the golden foothill slopes curve up to the strong and rocky \
Gabilan Mountains, but on the valley side the water is lined with trees—willows \
fresh and green with every spring, carrying in their lower leaf junctures the \
debris of the winters flooding; and sycamores with mottled, white, recumbent \
limbs and branches that arch over the pool"
));
// Multivalued fields just need to be repeated.
index_writer.add_document(doc!(
title => "Frankenstein",
title => "The Modern Prometheus",
body => "You will rejoice to hear that no disaster has accompanied the commencement of an \
enterprise which you have regarded with such evil forebodings. I arrived here \
yesterday, and my first task is to assure my dear sister of my welfare and \
increasing confidence in the success of my undertaking."
title => "Frankenstein",
title => "The Modern Prometheus",
body => "You will rejoice to hear that no disaster has accompanied the commencement of an \
enterprise which you have regarded with such evil forebodings. I arrived here \
yesterday, and my first task is to assure my dear sister of my welfare and \
increasing confidence in the success of my undertaking."
));
// This is an example, so we will only index 3 documents
@@ -171,33 +170,24 @@ fn main() -> tantivy::Result<()> {
//
// ### Searcher
//
// A reader is required to search the index.
// It acts as a `Searcher` pool that reloads itself,
// depending on a `ReloadPolicy`.
//
// For a search server you will typically create one reader for the entire lifetime of your
// program, and acquire a new searcher for every single request.
//
// In the code below, we rely on the 'ON_COMMIT' policy: the reader
// will reload the index automatically after each commit.
let reader = index
.reader_builder()
.reload_policy(ReloadPolicy::OnCommit)
.try_into()?;
// Let's search our index. Start by reloading
// searchers in the index. This should be done
// after every `commit()`.
index.load_searchers()?;
// We now need to acquire a searcher.
//
// A searcher points to a snapshotted, immutable version of the index.
//
// Some search experience might require more than
// one query. Using the same searcher ensures that all of these queries will run on the
// same version of the index.
// one query.
//
// The searcher ensures that we get to work
// with a consistent version of the index.
//
// Acquiring a `searcher` is very cheap.
//
// You should acquire a searcher every time you start processing a request and
// You should acquire a searcher every time you
// start processing a request and
// release it right after your query is finished.
let searcher = reader.searcher();
let searcher = index.searcher();
// ### Query
@@ -234,6 +224,7 @@ fn main() -> tantivy::Result<()> {
// Since the body field was not configured as stored,
// the document returned will only contain
// a title.
for (_score, doc_address) in top_docs {
let retrieved_doc = searcher.doc(doc_address)?;
println!("{}", schema.to_json(&retrieved_doc));

View File

@@ -17,7 +17,7 @@ use tantivy::collector::{Collector, SegmentCollector};
use tantivy::fastfield::FastFieldReader;
use tantivy::query::QueryParser;
use tantivy::schema::Field;
use tantivy::schema::{Schema, FAST, INDEXED, TEXT};
use tantivy::schema::{Schema, FAST, INT_INDEXED, TEXT};
use tantivy::Index;
use tantivy::SegmentReader;
@@ -137,7 +137,7 @@ fn main() -> tantivy::Result<()> {
// products, and with a name, a description, and a price.
let product_name = schema_builder.add_text_field("name", TEXT);
let product_description = schema_builder.add_text_field("description", TEXT);
let price = schema_builder.add_u64_field("price", INDEXED | FAST);
let price = schema_builder.add_u64_field("price", INT_INDEXED | FAST);
let schema = schema_builder.build();
// # Indexing documents
@@ -170,9 +170,9 @@ fn main() -> tantivy::Result<()> {
price => 5_200u64
));
index_writer.commit()?;
index.load_searchers()?;
let reader = index.reader()?;
let searcher = reader.searcher();
let searcher = index.searcher();
let query_parser = QueryParser::for_index(&index, vec![product_name, product_description]);
// here we want to get a hit on the 'ken' in Frankenstein

View File

@@ -68,12 +68,12 @@ fn main() -> tantivy::Result<()> {
// heap for the indexer can increase its throughput.
let mut index_writer = index.writer(50_000_000)?;
index_writer.add_document(doc!(
title => "The Old Man and the Sea",
body => "He was an old man who fished alone in a skiff in the Gulf Stream and \
he had gone eighty-four days now without taking a fish."
title => "The Old Man and the Sea",
body => "He was an old man who fished alone in a skiff in the Gulf Stream and \
he had gone eighty-four days now without taking a fish."
));
index_writer.add_document(doc!(
title => "Of Mice and Men",
title => "Of Mice and Men",
body => r#"A few miles south of Soledad, the Salinas River drops in close to the hillside
bank and runs deep and green. The water is warm too, for it has slipped twinkling
over the yellow sands in the sunlight before reaching the narrow pool. On one
@@ -84,16 +84,16 @@ fn main() -> tantivy::Result<()> {
limbs and branches that arch over the pool"#
));
index_writer.add_document(doc!(
title => "Frankenstein",
title => "Frankenstein",
body => r#"You will rejoice to hear that no disaster has accompanied the commencement of an
enterprise which you have regarded with such evil forebodings. I arrived here
yesterday, and my first task is to assure my dear sister of my welfare and
increasing confidence in the success of my undertaking."#
));
index_writer.commit()?;
index.load_searchers()?;
let reader = index.reader()?;
let searcher = reader.searcher();
let searcher = index.searcher();
// The query parser can interpret human queries.
// Here, if the user does not specify which

View File

@@ -14,16 +14,12 @@ use tantivy::collector::TopDocs;
use tantivy::query::TermQuery;
use tantivy::schema::*;
use tantivy::Index;
use tantivy::IndexReader;
// A simple helper function to fetch a single document
// given its id from our index.
// It will be helpful to check our work.
fn extract_doc_given_isbn(
reader: &IndexReader,
isbn_term: &Term,
) -> tantivy::Result<Option<Document>> {
let searcher = reader.searcher();
fn extract_doc_given_isbn(index: &Index, isbn_term: &Term) -> tantivy::Result<Option<Document>> {
let searcher = index.searcher();
// This is the simplest query you can think of.
// It matches all of the documents containing a specific term.
@@ -89,12 +85,12 @@ fn main() -> tantivy::Result<()> {
isbn => "978-9176370711",
));
index_writer.commit()?;
let reader = index.reader()?;
index.load_searchers()?;
let frankenstein_isbn = Term::from_field_text(isbn, "978-9176370711");
// Oops, our frankenstein doc seems misspelled
let frankenstein_doc_misspelled = extract_doc_given_isbn(&reader, &frankenstein_isbn)?.unwrap();
let frankenstein_doc_misspelled = extract_doc_given_isbn(&index, &frankenstein_isbn)?.unwrap();
assert_eq!(
schema.to_json(&frankenstein_doc_misspelled),
r#"{"isbn":["978-9176370711"],"title":["Frankentein"]}"#,
@@ -133,10 +129,10 @@ fn main() -> tantivy::Result<()> {
// Everything happened as if the document was updated.
index_writer.commit()?;
// We reload our searcher to make our change available to clients.
reader.reload()?;
index.load_searchers()?;
// No more typo!
let frankenstein_new_doc = extract_doc_given_isbn(&reader, &frankenstein_isbn)?.unwrap();
let frankenstein_new_doc = extract_doc_given_isbn(&index, &frankenstein_isbn)?.unwrap();
assert_eq!(
schema.to_json(&frankenstein_new_doc),
r#"{"isbn":["978-9176370711"],"title":["Frankenstein"]}"#,

View File

@@ -55,9 +55,9 @@ fn main() -> tantivy::Result<()> {
index_writer.commit()?;
let reader = index.reader()?;
index.load_searchers()?;
let searcher = reader.searcher();
let searcher = index.searcher();
let mut facet_collector = FacetCollector::for_field(tags);
facet_collector.add_facet("/pools");

View File

@@ -1,43 +0,0 @@
// # Searching a range on an indexed int field.
//
// Below is an example of creating an indexed integer field in your schema
// You can use RangeQuery to get a Count of all occurrences in a given range.
#[macro_use]
extern crate tantivy;
use tantivy::collector::Count;
use tantivy::query::RangeQuery;
use tantivy::schema::{Schema, INDEXED};
use tantivy::Index;
use tantivy::Result;
/// Indexes one document per year and counts the ones falling in the sixties.
///
/// Builds an in-RAM index whose schema has a single indexed `u64` field named
/// `year`, adds one document for every year in `1950..2019`, commits, and then
/// runs a `RangeQuery` over `1960..1970` with the `Count` collector. The count
/// is asserted to be 10.
///
/// Returns an error if any of the fallible index, writer, reader or search
/// calls fails.
fn run() -> Result<()> {
    // A single field keeps the example simple.
    // `INDEXED` is a short-hand to indicate that the field should be "searchable".
    let mut builder = Schema::builder();
    let year_field = builder.add_u64_field("year", INDEXED);
    let index = Index::create_in_ram(builder.build());
    let reader = index.reader()?;

    // The writer is confined to its own scope; it is done once the commit returns.
    {
        let mut writer = index.writer_with_num_threads(1, 6_000_000)?;
        // One document per year: the index covers the range 1950..2019.
        for year in 1950u64..2019u64 {
            writer.add_document(doc!(year_field => year));
        }
        writer.commit()?;
    }

    // Reload the reader after the commit, then take a searcher from it.
    reader.reload()?;
    let searcher = reader.searcher();

    // The end of the range is excluded, i.e. this searches up to 1969.
    let sixties = RangeQuery::new_u64(year_field, 1960..1970);
    // The `Count` collector yields the total number of docs in the range.
    let matches = searcher.search(&sixties, &Count)?;
    assert_eq!(matches, 10);
    Ok(())
}
/// Entry point of the example: executes `run` and panics if it returned an error.
fn main() {
    let outcome = run();
    outcome.unwrap()
}

View File

@@ -33,9 +33,9 @@ fn main() -> tantivy::Result<()> {
index_writer.add_document(doc!(title => "The modern Promotheus"));
index_writer.commit()?;
let reader = index.reader()?;
index.load_searchers()?;
let searcher = reader.searcher();
let searcher = index.searcher();
// A tantivy index is actually a collection of segments.
// Similarly, a searcher just wraps a list `segment_reader`.

View File

@@ -35,27 +35,28 @@ fn main() -> tantivy::Result<()> {
// we'll only need one doc for this example.
index_writer.add_document(doc!(
title => "Of Mice and Men",
body => "A few miles south of Soledad, the Salinas River drops in close to the hillside \
bank and runs deep and green. The water is warm too, for it has slipped twinkling \
over the yellow sands in the sunlight before reaching the narrow pool. On one \
side of the river the golden foothill slopes curve up to the strong and rocky \
Gabilan Mountains, but on the valley side the water is lined with trees—willows \
fresh and green with every spring, carrying in their lower leaf junctures the \
debris of the winters flooding; and sycamores with mottled, white, recumbent \
limbs and branches that arch over the pool"
title => "Of Mice and Men",
body => "A few miles south of Soledad, the Salinas River drops in close to the hillside \
bank and runs deep and green. The water is warm too, for it has slipped twinkling \
over the yellow sands in the sunlight before reaching the narrow pool. On one \
side of the river the golden foothill slopes curve up to the strong and rocky \
Gabilan Mountains, but on the valley side the water is lined with trees—willows \
fresh and green with every spring, carrying in their lower leaf junctures the \
debris of the winters flooding; and sycamores with mottled, white, recumbent \
limbs and branches that arch over the pool"
));
// ...
index_writer.commit()?;
let reader = index.reader()?;
let searcher = reader.searcher();
index.load_searchers()?;
let searcher = index.searcher();
let query_parser = QueryParser::for_index(&index, vec![title, body]);
let query = query_parser.parse_query("sycamore spring")?;
let top_docs = searcher.search(&query, &TopDocs::with_limit(10))?;
let snippet_generator = SnippetGenerator::create(&searcher, &*query, body)?;
let snippet_generator = SnippetGenerator::new(&searcher, &*query, body)?;
for (score, doc_address) in top_docs {
let doc = searcher.doc(doc_address)?;

View File

@@ -72,33 +72,33 @@ fn main() -> tantivy::Result<()> {
title => "The Old Man and the Sea",
body => "He was an old man who fished alone in a skiff in the Gulf Stream and \
he had gone eighty-four days now without taking a fish."
));
));
index_writer.add_document(doc!(
title => "Of Mice and Men",
body => "A few miles south of Soledad, the Salinas River drops in close to the hillside \
bank and runs deep and green. The water is warm too, for it has slipped twinkling \
over the yellow sands in the sunlight before reaching the narrow pool. On one \
side of the river the golden foothill slopes curve up to the strong and rocky \
Gabilan Mountains, but on the valley side the water is lined with trees—willows \
fresh and green with every spring, carrying in their lower leaf junctures the \
debris of the winters flooding; and sycamores with mottled, white, recumbent \
limbs and branches that arch over the pool"
));
title => "Of Mice and Men",
body => "A few miles south of Soledad, the Salinas River drops in close to the hillside \
bank and runs deep and green. The water is warm too, for it has slipped twinkling \
over the yellow sands in the sunlight before reaching the narrow pool. On one \
side of the river the golden foothill slopes curve up to the strong and rocky \
Gabilan Mountains, but on the valley side the water is lined with trees—willows \
fresh and green with every spring, carrying in their lower leaf junctures the \
debris of the winters flooding; and sycamores with mottled, white, recumbent \
limbs and branches that arch over the pool"
));
index_writer.add_document(doc!(
title => "Frankenstein",
body => "You will rejoice to hear that no disaster has accompanied the commencement of an \
enterprise which you have regarded with such evil forebodings. I arrived here \
yesterday, and my first task is to assure my dear sister of my welfare and \
increasing confidence in the success of my undertaking."
title => "Frankenstein",
body => "You will rejoice to hear that no disaster has accompanied the commencement of an \
enterprise which you have regarded with such evil forebodings. I arrived here \
yesterday, and my first task is to assure my dear sister of my welfare and \
increasing confidence in the success of my undertaking."
));
index_writer.commit()?;
let reader = index.reader()?;
index.load_searchers()?;
let searcher = reader.searcher();
let searcher = index.searcher();
let query_parser = QueryParser::for_index(&index, vec![title, body]);

View File

@@ -12,7 +12,7 @@ fn main() -> tantivy::Result<()> {
let mut schema_builder = Schema::builder();
schema_builder.add_text_field("title", TEXT | STORED);
schema_builder.add_text_field("body", TEXT);
schema_builder.add_u64_field("year", INDEXED);
schema_builder.add_u64_field("year", INT_INDEXED);
let schema = schema_builder.build();
// Let's assume we have a json-serialized document.

View File

@@ -40,8 +40,8 @@ use SegmentReader;
/// index_writer.commit().unwrap();
/// }
///
/// let reader = index.reader()?;
/// let searcher = reader.searcher();
/// index.load_searchers()?;
/// let searcher = index.searcher();
///
/// {
/// let query_parser = QueryParser::for_index(&index, vec![title]);

View File

@@ -122,16 +122,17 @@ fn facet_depth(facet_bytes: &[u8]) -> usize {
/// facet => Facet::from("/lang/en"),
/// facet => Facet::from("/category/biography")
/// ));
/// index_writer.commit()?;
/// index_writer.commit().unwrap();
/// }
/// let reader = index.reader()?;
/// let searcher = reader.searcher();
///
/// index.load_searchers()?;
/// let searcher = index.searcher();
///
/// {
/// let mut facet_collector = FacetCollector::for_field(facet);
/// facet_collector.add_facet("/lang");
/// facet_collector.add_facet("/category");
/// let facet_counts = searcher.search(&AllQuery, &facet_collector)?;
/// let facet_counts = searcher.search(&AllQuery, &facet_collector).unwrap();
///
/// // This lists all of the facet counts
/// let facets: Vec<(&Facet, u64)> = facet_counts
@@ -146,7 +147,7 @@ fn facet_depth(facet_bytes: &[u8]) -> usize {
/// {
/// let mut facet_collector = FacetCollector::for_field(facet);
/// facet_collector.add_facet("/category/fiction");
/// let facet_counts = searcher.search(&AllQuery, &facet_collector)?;
/// let facet_counts = searcher.search(&AllQuery, &facet_collector).unwrap();
///
/// // This lists all of the facet counts
/// let facets: Vec<(&Facet, u64)> = facet_counts
@@ -162,7 +163,7 @@ fn facet_depth(facet_bytes: &[u8]) -> usize {
/// {
/// let mut facet_collector = FacetCollector::for_field(facet);
/// facet_collector.add_facet("/category/fiction");
/// let facet_counts = searcher.search(&AllQuery, &facet_collector)?;
/// let facet_counts = searcher.search(&AllQuery, &facet_collector).unwrap();
///
/// // This lists all of the facet counts
/// let facets: Vec<(&Facet, u64)> = facet_counts.top_k("/category/fiction", 1);
@@ -196,7 +197,7 @@ fn skip<'a, I: Iterator<Item = &'a Facet>>(
) -> SkipResult {
loop {
match collapse_it.peek() {
Some(facet_bytes) => match facet_bytes.encoded_str().as_bytes().cmp(target) {
Some(facet_bytes) => match facet_bytes.encoded_bytes().cmp(target) {
Ordering::Less => {}
Ordering::Greater => {
return SkipResult::OverStep;
@@ -368,8 +369,7 @@ impl SegmentCollector for FacetSegmentCollector {
let mut facet = vec![];
let facet_ord = self.collapse_facet_ords[collapsed_facet_ord];
facet_dict.ord_to_term(facet_ord as u64, &mut facet);
// TODO
facet_counts.insert(Facet::from_encoded(facet).unwrap(), count);
facet_counts.insert(unsafe { Facet::from_encoded(facet) }, count);
}
FacetCounts { facet_counts }
}
@@ -403,9 +403,9 @@ impl FacetCounts {
let right_bound = if facet.is_root() {
Bound::Unbounded
} else {
let mut facet_after_bytes: String = facet.encoded_str().to_owned();
facet_after_bytes.push('\u{1}');
let facet_after = Facet::from_encoded_string(facet_after_bytes);
let mut facet_after_bytes: Vec<u8> = facet.encoded_bytes().to_owned();
facet_after_bytes.push(1u8);
let facet_after = unsafe { Facet::from_encoded(facet_after_bytes) }; // ok logic
Bound::Excluded(facet_after)
};
let underlying: btree_map::Range<_, _> = self.facet_counts.range((left_bound, right_bound));
@@ -474,16 +474,15 @@ mod tests {
n /= 4;
let leaf = n % 5;
Facet::from(&format!("/top{}/mid{}/leaf{}", top, mid, leaf))
})
.collect();
}).collect();
for i in 0..num_facets * 10 {
let mut doc = Document::new();
doc.add_facet(facet_field, facets[i % num_facets].clone());
index_writer.add_document(doc);
}
index_writer.commit().unwrap();
let reader = index.reader().unwrap();
let searcher = reader.searcher();
index.load_searchers().unwrap();
let searcher = index.searcher();
let mut facet_collector = FacetCollector::for_field(facet_field);
facet_collector.add_facet(Facet::from("/top1"));
let counts = searcher.search(&AllQuery, &facet_collector).unwrap();
@@ -501,16 +500,18 @@ mod tests {
("/top1/mid2", 50),
("/top1/mid3", 50),
]
.iter()
.map(|&(facet_str, count)| (String::from(facet_str), count))
.collect::<Vec<_>>()
.iter()
.map(|&(facet_str, count)| (String::from(facet_str), count))
.collect::<Vec<_>>()
);
}
}
#[test]
#[should_panic(expected = "Tried to add a facet which is a descendant of \
an already added facet.")]
#[should_panic(
expected = "Tried to add a facet which is a descendant of \
an already added facet."
)]
fn test_misused_facet_collector() {
let mut facet_collector = FacetCollector::for_field(Field(0));
facet_collector.add_facet(Facet::from("/country"));
@@ -531,8 +532,8 @@ mod tests {
facet_field => Facet::from_text(&"/subjects/B/b"),
));
index_writer.commit().unwrap();
let reader = index.reader().unwrap();
let searcher = reader.searcher();
index.load_searchers().unwrap();
let searcher = index.searcher();
assert_eq!(searcher.num_docs(), 1);
let mut facet_collector = FacetCollector::for_field(facet_field);
facet_collector.add_facet("/subjects");
@@ -562,15 +563,13 @@ mod tests {
let facet = Facet::from(&format!("/facet/{}", c));
let doc = doc!(facet_field => facet);
iter::repeat(doc).take(count)
})
.map(|mut doc| {
}).map(|mut doc| {
doc.add_facet(
facet_field,
&format!("/facet/{}", thread_rng().sample(&uniform)),
);
doc
})
.collect();
}).collect();
docs[..].shuffle(&mut thread_rng());
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
@@ -578,7 +577,9 @@ mod tests {
index_writer.add_document(doc);
}
index_writer.commit().unwrap();
let searcher = index.reader().unwrap().searcher();
index.load_searchers().unwrap();
let searcher = index.searcher();
let mut facet_collector = FacetCollector::for_field(facet_field);
facet_collector.add_facet("/facet");
@@ -632,7 +633,8 @@ mod bench {
index_writer.add_document(doc);
}
index_writer.commit().unwrap();
let reader = index.reader().unwrap();
index.load_searchers().unwrap();
b.iter(|| {
let searcher = index.searcher();
let facet_collector = FacetCollector::for_field(facet_field);

View File

@@ -88,7 +88,7 @@ mod tests {
let index = Index::create_in_ram(schema.clone());
{
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
{
for i in 0u64..10u64 {
index_writer.add_document(doc!(
@@ -101,7 +101,8 @@ mod tests {
assert_eq!(index_writer.commit().unwrap(), 10u64);
}
let searcher = index.reader().searcher();
index.load_searchers().unwrap();
let searcher = index.searcher();
let mut ffvf_i64: IntFacetCollector<I64FastFieldReader> = IntFacetCollector::new(num_field_i64);
let mut ffvf_u64: IntFacetCollector<U64FastFieldReader> = IntFacetCollector::new(num_field_u64);

View File

@@ -53,9 +53,9 @@ use tantivy::collector::{Count, TopDocs};
# index_writer.add_document(doc!(
# title => "The Diary of Muadib",
# ));
# index_writer.commit()?;
# let reader = index.reader()?;
# let searcher = reader.searcher();
# index_writer.commit().unwrap();
# index.load_searchers()?;
# let searcher = index.searcher();
# let query_parser = QueryParser::for_index(&index, vec![title]);
# let query = query_parser.parse_query("diary")?;
let (doc_count, top_docs): (usize, Vec<(Score, DocAddress)>) =
@@ -85,7 +85,7 @@ See the `custom_collector` example.
*/
use downcast_rs;
use downcast;
use DocId;
use Result;
use Score;
@@ -111,9 +111,9 @@ pub use self::facet_collector::FacetCollector;
/// `Fruit` is the type for the result of our collection.
/// e.g. `usize` for the `Count` collector.
pub trait Fruit: Send + downcast_rs::Downcast {}
pub trait Fruit: Send + downcast::Any {}
impl<T> Fruit for T where T: Send + downcast_rs::Downcast {}
impl<T> Fruit for T where T: Send + downcast::Any {}
/// Collectors are in charge of collecting and retaining relevant
/// information from the document found and scored by the query.
@@ -358,7 +358,10 @@ where
}
}
impl_downcast!(Fruit);
#[allow(missing_docs)]
mod downcast_impl {
downcast!(super::Fruit);
}
#[cfg(test)]
pub mod tests;

View File

@@ -1,6 +1,7 @@
use super::Collector;
use super::SegmentCollector;
use collector::Fruit;
use downcast::Downcast;
use std::marker::PhantomData;
use DocId;
use Result;
@@ -36,14 +37,13 @@ impl<TCollector: Collector> Collector for CollectorWrapper<TCollector> {
let typed_fruit: Vec<TCollector::Fruit> = children
.into_iter()
.map(|untyped_fruit| {
untyped_fruit
.downcast::<TCollector::Fruit>()
Downcast::<TCollector::Fruit>::downcast(untyped_fruit)
.map(|boxed_but_typed| *boxed_but_typed)
.map_err(|_| {
TantivyError::InvalidArgument("Failed to cast child fruit.".to_string())
.map_err(|e| {
let err_msg = format!("Failed to cast child collector fruit. {:?}", e);
TantivyError::InvalidArgument(err_msg)
})
})
.collect::<Result<_>>()?;
}).collect::<Result<_>>()?;
let merged_fruit = self.0.merge_fruits(typed_fruit)?;
Ok(Box::new(merged_fruit))
}
@@ -88,20 +88,14 @@ pub struct FruitHandle<TFruit: Fruit> {
impl<TFruit: Fruit> FruitHandle<TFruit> {
pub fn extract(self, fruits: &mut MultiFruit) -> TFruit {
let boxed_fruit = fruits.sub_fruits[self.pos].take().expect("");
*boxed_fruit
.downcast::<TFruit>()
.map_err(|_| ())
.expect("Failed to downcast collector fruit.")
*Downcast::<TFruit>::downcast(boxed_fruit).expect("Failed")
}
}
/// Multicollector makes it possible to collect on more than one collector.
/// It should only be used for use cases where the `Collector` types are unknown
/// at compile time.
///
/// If the type of the collectors is known, you can just group yours collectors
/// in a tuple. See the
/// [Combining several collectors section of the collector documentation](./index.html#combining-several-collectors).
/// If the type of the collectors is known, you should prefer to use `ChainedCollector`.
///
/// ```rust
/// #[macro_use]
@@ -134,8 +128,8 @@ impl<TFruit: Fruit> FruitHandle<TFruit> {
/// index_writer.commit().unwrap();
/// }
///
/// let reader = index.reader()?;
/// let searcher = reader.searcher();
/// index.load_searchers()?;
/// let searcher = index.searcher();
///
/// let mut collectors = MultiCollector::new();
/// let top_docs_handle = collectors.add_collector(TopDocs::with_limit(2));
@@ -153,8 +147,6 @@ impl<TFruit: Fruit> FruitHandle<TFruit> {
/// Ok(())
/// }
/// ```
#[allow(clippy::type_complexity)]
#[derive(Default)]
pub struct MultiCollector<'a> {
collector_wrappers:
Vec<Box<Collector<Child = Box<BoxableSegmentCollector>, Fruit = Box<Fruit>> + 'a>>,
@@ -162,8 +154,10 @@ pub struct MultiCollector<'a> {
impl<'a> MultiCollector<'a> {
/// Create a new `MultiCollector`
pub fn new() -> Self {
Default::default()
pub fn new() -> MultiCollector<'a> {
MultiCollector {
collector_wrappers: Vec::new(),
}
}
/// Add a new collector to our `MultiCollector`.
@@ -219,8 +213,7 @@ impl<'a> Collector for MultiCollector<'a> {
.zip(segment_fruits_list)
.map(|(child_collector, segment_fruits)| {
Ok(Some(child_collector.merge_fruits(segment_fruits)?))
})
.collect::<Result<_>>()?;
}).collect::<Result<_>>()?;
Ok(MultiFruit { sub_fruits })
}
}
@@ -278,7 +271,8 @@ mod tests {
index_writer.add_document(doc!(text=>"abc"));
index_writer.commit().unwrap();
}
let searcher = index.reader().unwrap().searcher();
index.load_searchers().unwrap();
let searcher = index.searcher();
let term = Term::from_field_text(text, "abc");
let query = TermQuery::new(term, IndexRecordOption::Basic);

View File

@@ -84,9 +84,11 @@ where
for (feature, doc) in child_fruit {
if top_collector.len() < self.limit {
top_collector.push(ComparableDoc { feature, doc });
} else if let Some(mut head) = top_collector.peek_mut() {
if head.feature < feature {
*head = ComparableDoc { feature, doc };
} else {
if let Some(mut head) = top_collector.peek_mut() {
if head.feature < feature {
*head = ComparableDoc { feature, doc };
}
}
}
}
@@ -140,8 +142,7 @@ impl<T: PartialOrd + Clone> TopSegmentCollector<T> {
comparable_doc.feature,
DocAddress(segment_id, comparable_doc.doc),
)
})
.collect()
}).collect()
}
/// Return true iff at least K documents have gone through

View File

@@ -23,16 +23,15 @@ use SegmentReader;
/// # use tantivy::schema::{Schema, Field, FAST, TEXT};
/// # use tantivy::{Index, Result, DocAddress};
/// # use tantivy::query::{Query, QueryParser};
/// use tantivy::Searcher;
/// use tantivy::collector::TopDocs;
///
/// # fn main() -> tantivy::Result<()> {
/// # fn main() {
/// # let mut schema_builder = Schema::builder();
/// # let title = schema_builder.add_text_field("title", TEXT);
/// # let rating = schema_builder.add_u64_field("rating", FAST);
/// # let schema = schema_builder.build();
/// # let index = Index::create_in_ram(schema);
/// # let mut index_writer = index.writer_with_num_threads(1, 3_000_000)?;
/// # let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
/// # index_writer.add_document(doc!(
/// # title => "The Name of the Wind",
/// # rating => 92u64,
@@ -40,14 +39,13 @@ use SegmentReader;
/// # index_writer.add_document(doc!(title => "The Diary of Muadib", rating => 97u64));
/// # index_writer.add_document(doc!(title => "A Dairy Cow", rating => 63u64));
/// # index_writer.add_document(doc!(title => "The Diary of a Young Girl", rating => 80u64));
/// # index_writer.commit()?;
/// # let reader = index.reader()?;
/// # let query = QueryParser::for_index(&index, vec![title]).parse_query("diary")?;
/// # let top_docs = docs_sorted_by_rating(&reader.searcher(), &query, rating)?;
/// # index_writer.commit().unwrap();
/// # index.load_searchers().unwrap();
/// # let query = QueryParser::for_index(&index, vec![title]).parse_query("diary").unwrap();
/// # let top_docs = docs_sorted_by_rating(&index, &query, rating).unwrap();
/// # assert_eq!(top_docs,
/// # vec![(97u64, DocAddress(0u32, 1)),
/// # (80u64, DocAddress(0u32, 3))]);
/// # Ok(())
/// # }
/// #
/// /// Searches the document matching the given query, and
@@ -55,9 +53,7 @@ use SegmentReader;
/// /// given in argument.
/// ///
/// /// `field` is required to be a FAST field.
/// fn docs_sorted_by_rating(searcher: &Searcher,
/// query: &Query,
/// sort_by_field: Field)
/// fn docs_sorted_by_rating(index: &Index, query: &Query, sort_by_field: Field)
/// -> Result<Vec<(u64, DocAddress)>> {
///
/// // This is where we build our collector!
@@ -65,7 +61,8 @@ use SegmentReader;
///
/// // ... and here are our documents. Note that this is a simple vec.
/// // The `u64` in the pair is the value of our fast field for each document.
/// searcher.search(query, &top_docs_by_rating)
/// index.searcher()
/// .search(query, &top_docs_by_rating)
/// }
/// ```
pub struct TopDocsByField<T> {
@@ -79,12 +76,6 @@ impl<T: FastValue + PartialOrd + Clone> TopDocsByField<T> {
/// The given field name must be a fast field, otherwise the collector will produce an error while
/// collecting results.
///
/// This constructor is crate-private. Client are supposed to call
/// build `TopDocsByField` object using the `TopDocs` API.
///
/// e.g.:
/// `TopDocs::with_limit(2).order_by_field(sort_by_field)`
///
/// # Panics
/// The method panics if limit is 0
pub(crate) fn new(field: Field, limit: usize) -> TopDocsByField<T> {
@@ -180,7 +171,7 @@ mod tests {
size => 16u64,
));
});
let searcher = index.reader().unwrap().searcher();
let searcher = index.searcher();
let top_collector = TopDocs::with_limit(4).order_by_field(size);
let top_docs: Vec<(u64, DocAddress)> = searcher.search(&query, &top_collector).unwrap();
@@ -207,7 +198,7 @@ mod tests {
size => 12u64,
));
});
let searcher = index.reader().unwrap().searcher();
let searcher = index.searcher();
let top_collector: TopDocsByField<u64> = TopDocs::with_limit(4).order_by_field(Field(2));
let segment_reader = searcher.segment_reader(0u32);
top_collector
@@ -227,7 +218,7 @@ mod tests {
size => 12u64,
));
});
let searcher = index.reader().unwrap().searcher();
let searcher = index.searcher();
let segment = searcher.segment_reader(0);
let top_collector: TopDocsByField<u64> = TopDocs::with_limit(4).order_by_field(size);
assert_matches!(
@@ -250,6 +241,8 @@ mod tests {
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
doc_adder(&mut index_writer);
index_writer.commit().unwrap();
index.load_searchers().unwrap();
let query_parser = QueryParser::for_index(&index, vec![query_field]);
let query = query_parser.parse_query(query).unwrap();
(index, query)

View File

@@ -51,8 +51,8 @@ use SegmentReader;
/// index_writer.commit().unwrap();
/// }
///
/// let reader = index.reader()?;
/// let searcher = reader.searcher();
/// index.load_searchers()?;
/// let searcher = index.searcher();
///
/// let query_parser = QueryParser::for_index(&index, vec![title]);
/// let query = query_parser.parse_query("diary")?;
@@ -142,12 +142,13 @@ mod tests {
let index = Index::create_in_ram(schema);
{
// writing the segment
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
index_writer.add_document(doc!(text_field=>"Hello happy tax payer."));
index_writer.add_document(doc!(text_field=>"Droopy says hello happy tax payer"));
index_writer.add_document(doc!(text_field=>"I like Droopy"));
assert!(index_writer.commit().is_ok());
}
index.load_searchers().unwrap();
index
}
@@ -158,8 +159,6 @@ mod tests {
let query_parser = QueryParser::for_index(&index, vec![field]);
let text_query = query_parser.parse_query("droopy tax").unwrap();
let score_docs: Vec<(Score, DocAddress)> = index
.reader()
.unwrap()
.searcher()
.search(&text_query, &TopDocs::with_limit(4))
.unwrap();
@@ -180,8 +179,6 @@ mod tests {
let query_parser = QueryParser::for_index(&index, vec![field]);
let text_query = query_parser.parse_query("droopy tax").unwrap();
let score_docs: Vec<(Score, DocAddress)> = index
.reader()
.unwrap()
.searcher()
.search(&text_query, &TopDocs::with_limit(2))
.unwrap();

View File

@@ -1,6 +1,9 @@
use byteorder::{ByteOrder, LittleEndian, WriteBytesExt};
use common::serialize::BinarySerializable;
use std::io;
use std::io::Write;
use std::mem;
use std::ops::Deref;
use std::ptr;
pub(crate) struct BitPacker {
mini_buffer: u64,
@@ -15,7 +18,7 @@ impl BitPacker {
}
}
pub fn write<TWrite: io::Write>(
pub fn write<TWrite: Write>(
&mut self,
val: u64,
num_bits: u8,
@@ -25,14 +28,14 @@ impl BitPacker {
let num_bits = num_bits as usize;
if self.mini_buffer_written + num_bits > 64 {
self.mini_buffer |= val_u64.wrapping_shl(self.mini_buffer_written as u32);
output.write_u64::<LittleEndian>(self.mini_buffer)?;
self.mini_buffer.serialize(output)?;
self.mini_buffer = val_u64.wrapping_shr((64 - self.mini_buffer_written) as u32);
self.mini_buffer_written = self.mini_buffer_written + num_bits - 64;
} else {
self.mini_buffer |= val_u64 << self.mini_buffer_written;
self.mini_buffer_written += num_bits;
if self.mini_buffer_written == 64 {
output.write_u64::<LittleEndian>(self.mini_buffer)?;
self.mini_buffer.serialize(output)?;
self.mini_buffer_written = 0;
self.mini_buffer = 0u64;
}
@@ -40,18 +43,17 @@ impl BitPacker {
Ok(())
}
pub fn flush<TWrite: io::Write>(&mut self, output: &mut TWrite) -> io::Result<()> {
pub fn flush<TWrite: Write>(&mut self, output: &mut TWrite) -> io::Result<()> {
if self.mini_buffer_written > 0 {
let num_bytes = (self.mini_buffer_written + 7) / 8;
let mut arr: [u8; 8] = [0u8; 8];
LittleEndian::write_u64(&mut arr, self.mini_buffer);
let arr: [u8; 8] = unsafe { mem::transmute::<u64, [u8; 8]>(self.mini_buffer.to_le()) };
output.write_all(&arr[..num_bytes])?;
self.mini_buffer_written = 0;
}
Ok(())
}
pub fn close<TWrite: io::Write>(&mut self, output: &mut TWrite) -> io::Result<()> {
pub fn close<TWrite: Write>(&mut self, output: &mut TWrite) -> io::Result<()> {
self.flush(output)?;
// Padding the write file to simplify reads.
output.write_all(&[0u8; 7])?;
@@ -64,7 +66,7 @@ pub struct BitUnpacker<Data>
where
Data: Deref<Target = [u8]>,
{
num_bits: u64,
num_bits: usize,
mask: u64,
data: Data,
}
@@ -80,13 +82,13 @@ where
(1u64 << num_bits) - 1u64
};
BitUnpacker {
num_bits: u64::from(num_bits),
num_bits: num_bits as usize,
mask,
data,
}
}
pub fn get(&self, idx: u64) -> u64 {
pub fn get(&self, idx: usize) -> u64 {
if self.num_bits == 0 {
return 0u64;
}
@@ -97,13 +99,42 @@ where
let addr = addr_in_bits >> 3;
let bit_shift = addr_in_bits & 7;
debug_assert!(
addr + 8 <= data.len() as u64,
addr + 8 <= data.len(),
"The fast field field should have been padded with 7 bytes."
);
let val_unshifted_unmasked: u64 = LittleEndian::read_u64(&data[(addr as usize)..]);
#[cfg_attr(feature = "cargo-clippy", allow(clippy::cast_ptr_alignment))]
let val_unshifted_unmasked: u64 =
u64::from_le(unsafe { ptr::read_unaligned(data[addr..].as_ptr() as *const u64) });
let val_shifted = (val_unshifted_unmasked >> bit_shift) as u64;
val_shifted & mask
}
/// Reads a range of values from the fast field.
///
/// The values read are those at indices
/// `[start..start + output.len()[`, written in order into `output`.
///
/// # Panics
///
/// Panics if the last requested value cannot be read as a full 8-byte
/// word from the underlying data, i.e. if the range reaches past what
/// the (padded) buffer covers.
pub fn get_range(&self, start: u32, output: &mut [u64]) {
    if output.is_empty() {
        return;
    }
    if self.num_bits == 0 {
        // Zero-width values carry no payload: every value is 0.
        for val in output.iter_mut() {
            *val = 0u64;
        }
        return;
    }
    let data: &[u8] = &*self.data;
    let num_bits = self.num_bits;
    let mask = self.mask;
    let mut addr_in_bits = (start as usize) * num_bits;
    // Addresses only grow while iterating, so validating the last read
    // once up front covers every read performed in the loop.
    let last_addr = (addr_in_bits + (output.len() - 1) * num_bits) >> 3;
    assert!(
        last_addr + 8 <= data.len(),
        "The fast field field should have been padded with 7 bytes."
    );
    for output_val in output.iter_mut() {
        let addr = addr_in_bits >> 3;
        let bit_shift = addr_in_bits & 7;
        // SAFETY: `addr <= last_addr` and `last_addr + 8 <= data.len()` was
        // asserted above, so the 8 bytes starting at `addr` are in bounds.
        // `read_unaligned` has no alignment requirement.
        #[cfg_attr(feature = "cargo-clippy", allow(clippy::cast_ptr_alignment))]
        let val_unshifted_unmasked: u64 =
            u64::from_le(unsafe { ptr::read_unaligned(data[addr..].as_ptr() as *const u64) });
        // The data is stored little-endian (same decoding as `get`).
        *output_val = (val_unshifted_unmasked >> bit_shift) & mask;
        addr_in_bits += num_bits;
    }
}
}
#[cfg(test)]
@@ -129,7 +160,7 @@ mod test {
fn test_bitpacker_util(len: usize, num_bits: u8) {
let (bitunpacker, vals) = create_fastfield_bitpacker(len, num_bits);
for (i, val) in vals.iter().enumerate() {
assert_eq!(bitunpacker.get(i as u64), *val);
assert_eq!(bitunpacker.get(i), *val);
}
}
@@ -141,4 +172,17 @@ mod test {
test_bitpacker_util(6, 14);
test_bitpacker_util(1000, 14);
}
/// Checks that `get_range` returns the same values that were packed,
/// for several starting offsets.
#[test]
fn test_bitpacker_range() {
    let (bitunpacker, vals) = create_fastfield_bitpacker(100_000, 12);
    let mut buffer = vec![0u64; 100];
    for &start in &[0usize, 10, 20, 100, 1_000] {
        bitunpacker.get_range(start as u32, &mut buffer[..]);
        for (offset, &unpacked) in buffer.iter().enumerate() {
            assert_eq!(unpacked, vals[start + offset]);
        }
    }
}
}

View File

@@ -39,7 +39,7 @@ impl BinarySerializable for FileAddr {
/// A `CompositeWrite` is used to write a `CompositeFile`.
pub struct CompositeWrite<W = WritePtr> {
write: CountingWriter<W>,
offsets: HashMap<FileAddr, u64>,
offsets: HashMap<FileAddr, usize>,
}
impl<W: Write> CompositeWrite<W> {

View File

@@ -3,7 +3,7 @@ use std::io::Write;
pub struct CountingWriter<W> {
underlying: W,
written_bytes: u64,
written_bytes: usize,
}
impl<W: Write> CountingWriter<W> {
@@ -14,11 +14,11 @@ impl<W: Write> CountingWriter<W> {
}
}
pub fn written_bytes(&self) -> u64 {
pub fn written_bytes(&self) -> usize {
self.written_bytes
}
pub fn finish(mut self) -> io::Result<(W, u64)> {
pub fn finish(mut self) -> io::Result<(W, usize)> {
self.flush()?;
Ok((self.underlying, self.written_bytes))
}
@@ -27,16 +27,10 @@ impl<W: Write> CountingWriter<W> {
impl<W: Write> Write for CountingWriter<W> {
fn write(&mut self, buf: &[u8]) -> io::Result<usize> {
let written_size = self.underlying.write(buf)?;
self.written_bytes += written_size as u64;
self.written_bytes += written_size;
Ok(written_size)
}
fn write_all(&mut self, buf: &[u8]) -> io::Result<()> {
self.underlying.write_all(buf)?;
self.written_bytes += buf.len() as u64;
Ok(())
}
fn flush(&mut self) -> io::Result<()> {
self.underlying.flush()
}
@@ -54,8 +48,8 @@ mod test {
let mut counting_writer = CountingWriter::wrap(buffer);
let bytes = (0u8..10u8).collect::<Vec<u8>>();
counting_writer.write_all(&bytes).unwrap();
let (w, len): (Vec<u8>, u64) = counting_writer.finish().unwrap();
assert_eq!(len, 10u64);
let (w, len): (Vec<u8>, usize) = counting_writer.finish().unwrap();
assert_eq!(len, 10);
assert_eq!(w.len(), 10);
}
}

View File

@@ -10,14 +10,10 @@ pub(crate) use self::bitset::TinySet;
pub(crate) use self::composite_file::{CompositeFile, CompositeWrite};
pub use self::counting_writer::CountingWriter;
pub use self::serialize::{BinarySerializable, FixedSize};
pub use self::vint::{read_u32_vint, serialize_vint_u32, write_u32_vint, VInt};
pub use self::vint::VInt;
pub use byteorder::LittleEndian as Endianness;
/// Segment's max doc must be `< MAX_DOC_LIMIT`.
///
/// We do not allow segments with more than
pub const MAX_DOC_LIMIT: u32 = 1 << 31;
use std::io;
/// Computes the number of bits that will be used for bitpacking.
///
@@ -56,6 +52,11 @@ pub(crate) fn is_power_of_2(n: usize) -> bool {
(n > 0) && (n & (n - 1) == 0)
}
/// Create a default io error given a string.
///
/// The returned error has kind `io::ErrorKind::Other` and carries `msg`
/// as its payload.
pub(crate) fn make_io_err(msg: String) -> io::Error {
io::Error::new(io::ErrorKind::Other, msg)
}
/// Has length trait
pub trait HasLen {
/// Return length
@@ -133,11 +134,4 @@ pub(crate) mod test {
assert_eq!(compute_num_bits(256), 9u8);
assert_eq!(compute_num_bits(5_000_000_000), 33u8);
}
#[test]
fn test_max_doc() {
// this is the first time I write a unit test for a constant.
assert!(((super::MAX_DOC_LIMIT - 1) as i32) >= 0);
assert!((super::MAX_DOC_LIMIT as i32) < 0);
}
}

View File

@@ -1,5 +1,4 @@
use super::BinarySerializable;
use byteorder::{ByteOrder, LittleEndian};
use std::io;
use std::io::Read;
use std::io::Write;
@@ -10,100 +9,6 @@ pub struct VInt(pub u64);
const STOP_BIT: u8 = 128;
/// Encodes a `u32` as a vint and returns `(payload, num_bytes)`.
///
/// Each encoded byte carries 7 bits of `val`, least significant group
/// first; the last byte has its high bit (the stop bit, `128`) set.
/// The encoded bytes are packed into the returned `u64` with the first
/// byte in the lowest 8 bits. `num_bytes` (between 1 and 5) is the number
/// of meaningful bytes in that payload.
pub fn serialize_vint_u32(val: u32) -> (u64, usize) {
    const STOP_BIT: u64 = 128u64;

    // Smallest value needing 2, 3, 4 and 5 bytes respectively.
    const START_2: u64 = 1 << 7;
    const START_3: u64 = 1 << 14;
    const START_4: u64 = 1 << 21;
    const START_5: u64 = 1 << 28;

    // Largest value fitting in 1, 2, 3 and 4 bytes respectively.
    const STOP_1: u64 = START_2 - 1;
    const STOP_2: u64 = START_3 - 1;
    const STOP_3: u64 = START_4 - 1;
    const STOP_4: u64 = START_5 - 1;

    // `MASK_k` selects the k-th group of 7 bits of the value.
    const MASK_1: u64 = 127;
    const MASK_2: u64 = MASK_1 << 7;
    const MASK_3: u64 = MASK_2 << 7;
    const MASK_4: u64 = MASK_3 << 7;
    const MASK_5: u64 = MASK_4 << 7;

    let val = u64::from(val);
    // The k-th 7-bit group is shifted left by `k - 1` extra bits so that it
    // lands at the bottom of the k-th output byte.
    match val {
        0..=STOP_1 => (val | STOP_BIT, 1),
        START_2..=STOP_2 => (
            (val & MASK_1) | ((val & MASK_2) << 1) | (STOP_BIT << (8)),
            2,
        ),
        START_3..=STOP_3 => (
            (val & MASK_1) | ((val & MASK_2) << 1) | ((val & MASK_3) << 2) | (STOP_BIT << (8 * 2)),
            3,
        ),
        START_4..=STOP_4 => (
            (val & MASK_1)
                | ((val & MASK_2) << 1)
                | ((val & MASK_3) << 2)
                | ((val & MASK_4) << 3)
                | (STOP_BIT << (8 * 3)),
            4,
        ),
        _ => (
            (val & MASK_1)
                | ((val & MASK_2) << 1)
                | ((val & MASK_3) << 2)
                | ((val & MASK_4) << 3)
                | ((val & MASK_5) << 4)
                | (STOP_BIT << (8 * 4)),
            5,
        ),
    }
}
/// Length in bytes of the serialized vint `u32` found at the start of `data`.
///
/// Looks at no more than the first 5 bytes and stops at the first byte
/// that carries the stop bit, which is the final byte of the vint.
///
/// # Panics
/// If none of the inspected bytes carries the stop bit, i.e. `data`
/// does not start with a valid `vint`.
fn vint_len(data: &[u8]) -> usize {
    let final_byte_pos = data.iter().take(5).position(|&byte| byte >= STOP_BIT);
    match final_byte_pos {
        Some(pos) => pos + 1,
        None => panic!("Corrupted data. Invalid VInt 32"),
    }
}
/// Decodes the vint `u32` at the start of `data` and advances `data`
/// past the bytes that were consumed.
///
/// # Panics
///
/// If `data` does not start with a valid vint payload.
pub fn read_u32_vint(data: &mut &[u8]) -> u32 {
    let bytes: &[u8] = *data;
    let (payload, rest) = bytes.split_at(vint_len(bytes));
    // Every byte contributes its 7 low bits, least significant group first.
    let value = payload
        .iter()
        .enumerate()
        .fold(0u32, |acc, (i, &byte)| {
            acc | (u32::from(byte & 127u8) << (7 * i))
        });
    *data = rest;
    value
}
/// Write a `u32` as a vint payload.
///
/// The value is encoded with `serialize_vint_u32`, laid out as a
/// little-endian `u64` in a scratch buffer, and only the first
/// `num_bytes` bytes of that buffer are written to `writer`.
///
/// # Errors
/// Returns the error reported by `writer.write_all`, if any.
pub fn write_u32_vint<W: io::Write>(val: u32, writer: &mut W) -> io::Result<()> {
let (val, num_bytes) = serialize_vint_u32(val);
let mut buffer = [0u8; 8];
LittleEndian::write_u64(&mut buffer, val);
writer.write_all(&buffer[..num_bytes])
}
impl VInt {
pub fn val(&self) -> u64 {
self.0
@@ -119,7 +24,7 @@ impl VInt {
output.extend(&buffer[0..num_bytes]);
}
pub fn serialize_into(&self, buffer: &mut [u8; 10]) -> usize {
fn serialize_into(&self, buffer: &mut [u8; 10]) -> usize {
let mut remaining = self.0;
for (i, b) in buffer.iter_mut().enumerate() {
let next_byte: u8 = (remaining % 128u64) as u8;
@@ -159,7 +64,7 @@ impl BinarySerializable for VInt {
return Err(io::Error::new(
io::ErrorKind::InvalidData,
"Reach end of buffer while reading VInt",
));
))
}
}
}
@@ -169,9 +74,7 @@ impl BinarySerializable for VInt {
#[cfg(test)]
mod tests {
use super::serialize_vint_u32;
use super::VInt;
use byteorder::{ByteOrder, LittleEndian};
use common::BinarySerializable;
fn aux_test_vint(val: u64) {
@@ -205,28 +108,4 @@ mod tests {
}
aux_test_vint(10);
}
/// Asserts that `serialize_vint_u32` produces the same bytes (and the same
/// length) as `VInt::serialize_into` for `val`.
fn aux_test_serialize_vint_u32(val: u32) {
    let mut expected = [0u8; 10];
    let expected_len = VInt(u64::from(val)).serialize_into(&mut expected);

    let (packed, packed_len) = serialize_vint_u32(val);
    assert_eq!(packed_len, expected_len, "len wrong for val {}", val);

    let mut actual = [0u8; 10];
    LittleEndian::write_u64(&mut actual, packed);
    assert_eq!(
        &expected[..packed_len],
        &actual[..packed_len],
        "array wrong for {}",
        val
    );
}
/// Compares `serialize_vint_u32` against `VInt` on small values and
/// around every point where the encoded length changes.
#[test]
fn test_vint_u32() {
    aux_test_serialize_vint_u32(0);
    aux_test_serialize_vint_u32(1);
    aux_test_serialize_vint_u32(5);
    // 128^1 .. 128^4 are the thresholds at which the encoding grows from
    // 1 to 2, 2 to 3, 3 to 4 and 4 to 5 bytes; test each side of all of them.
    for i in 1..5 {
        let power_of_128 = 1u32 << (7 * i);
        aux_test_serialize_vint_u32(power_of_128 - 1u32);
        aux_test_serialize_vint_u32(power_of_128);
        aux_test_serialize_vint_u32(power_of_128 + 1u32);
    }
    aux_test_serialize_vint_u32(u32::max_value());
}
}

View File

@@ -64,18 +64,17 @@ impl Executor {
// This is important as it makes it possible for the fruit_receiver iteration to
// terminate.
};
// This is lame, but safe.
let mut results_with_position = Vec::with_capacity(num_fruits);
let mut results = Vec::with_capacity(num_fruits);
unsafe { results.set_len(num_fruits) };
let mut num_items = 0;
for (pos, fruit_res) in fruit_receiver {
let fruit = fruit_res?;
results_with_position.push((pos, fruit));
results[pos] = fruit_res?;
num_items += 1;
}
results_with_position.sort_by_key(|(pos, _)| *pos);
assert_eq!(results_with_position.len(), num_fruits);
Ok(results_with_position
.into_iter()
.map(|(_, fruit)| fruit)
.collect::<Vec<_>>())
// This check ensures that we filled all of this
// uninitialized memory.
assert_eq!(num_items, results.len());
Ok(results)
}
}
}
@@ -95,8 +94,7 @@ mod tests {
panic!("panic should propagate");
},
vec![0].into_iter(),
)
.unwrap();
).unwrap();
}
#[test]
@@ -108,8 +106,7 @@ mod tests {
panic!("panic should propagate");
},
vec![0].into_iter(),
)
.unwrap();
).unwrap();
}
#[test]
@@ -123,14 +120,15 @@ mod tests {
}
}
#[test]
fn test_map_multithread() {
let result: Vec<usize> = Executor::multi_thread(3, "search-test")
.map(|i| Ok(i * 2), 0..10)
.unwrap();
assert_eq!(result.len(), 10);
for i in 0..10 {
assert_eq!(result[i], i * 2);
}
}
#[test]
fn test_map_multithread() {
let result: Vec<usize> = Executor::multi_thread(3, "search-test")
.map(|i| Ok(i * 2), 0..10)
.unwrap();
assert_eq!(result.len(), 10);
for i in 0..10 {
assert_eq!(result[i], i * 2);
}
}

View File

@@ -1,31 +1,32 @@
use super::pool::LeasedItem;
use super::pool::Pool;
use super::segment::create_segment;
use super::segment::Segment;
use core::searcher::Searcher;
use core::Executor;
use core::IndexMeta;
use core::SegmentId;
use core::SegmentMeta;
use core::SegmentReader;
use core::META_FILEPATH;
use directory::ManagedDirectory;
#[cfg(feature = "mmap")]
use directory::MmapDirectory;
use directory::INDEX_WRITER_LOCK;
use directory::{Directory, RAMDirectory};
use error::DataCorruption;
use error::TantivyError;
use indexer::index_writer::open_index_writer;
use indexer::index_writer::HEAP_SIZE_MIN;
use indexer::segment_updater::save_new_metas;
use indexer::LockType;
use num_cpus;
use reader::IndexReader;
use reader::IndexReaderBuilder;
use schema::Field;
use schema::FieldType;
use schema::Schema;
use serde_json;
use std::borrow::BorrowMut;
use std::fmt;
#[cfg(feature = "mmap")]
use std::path::Path;
use std::sync::atomic::{AtomicUsize, Ordering};
use std::sync::Arc;
use tokenizer::BoxedTokenizer;
use tokenizer::TokenizerManager;
@@ -36,20 +37,15 @@ fn load_metas(directory: &Directory) -> Result<IndexMeta> {
let meta_data = directory.atomic_read(&META_FILEPATH)?;
let meta_string = String::from_utf8_lossy(&meta_data);
serde_json::from_str(&meta_string)
.map_err(|e| {
DataCorruption::new(
META_FILEPATH.clone(),
format!("Meta file cannot be deserialized. {:?}.", e),
)
})
.map_err(From::from)
.map_err(|_| TantivyError::CorruptedFile(META_FILEPATH.clone()))
}
/// Search Index
#[derive(Clone)]
pub struct Index {
directory: ManagedDirectory,
schema: Schema,
num_searchers: Arc<AtomicUsize>,
searcher_pool: Arc<Pool<Searcher>>,
executor: Arc<Executor>,
tokenizers: TokenizerManager,
}
@@ -107,6 +103,7 @@ impl Index {
}
/// Opens or creates a new index in the provided directory
#[cfg(feature = "mmap")]
pub fn open_or_create<Dir: Directory>(dir: Dir, schema: Schema) -> Result<Index> {
if Index::exists(&dir) {
let index = Index::open(dir)?;
@@ -138,7 +135,7 @@ impl Index {
/// Creates a new index given an implementation of the trait `Directory`
pub fn create<Dir: Directory>(dir: Dir, schema: Schema) -> Result<Index> {
let directory = ManagedDirectory::wrap(dir)?;
let directory = ManagedDirectory::new(dir)?;
Index::from_directory(directory, schema)
}
@@ -146,7 +143,7 @@ impl Index {
///
/// This will overwrite existing meta.json
fn from_directory(mut directory: ManagedDirectory, schema: Schema) -> Result<Index> {
save_new_metas(schema.clone(), directory.borrow_mut())?;
save_new_metas(schema.clone(), 0, directory.borrow_mut())?;
let metas = IndexMeta::with_schema(schema);
Index::create_from_metas(directory, &metas)
}
@@ -154,12 +151,16 @@ impl Index {
/// Creates a new index given a directory and an `IndexMeta`.
fn create_from_metas(directory: ManagedDirectory, metas: &IndexMeta) -> Result<Index> {
let schema = metas.schema.clone();
let n_cpus = num_cpus::get();
let index = Index {
directory,
schema,
num_searchers: Arc::new(AtomicUsize::new(n_cpus)),
searcher_pool: Arc::new(Pool::new()),
tokenizers: TokenizerManager::default(),
executor: Arc::new(Executor::single_thread()),
};
index.load_searchers()?;
Ok(index)
}
@@ -189,22 +190,6 @@ impl Index {
}
}
/// Create a default `IndexReader` for the given index.
///
/// This is a shortcut for `self.reader_builder().try_into()`, i.e. a
/// reader built without customizing the builder.
///
/// See [`Index.reader_builder()`](#method.reader_builder).
pub fn reader(&self) -> Result<IndexReader> {
self.reader_builder().try_into()
}
/// Create an `IndexReaderBuilder` for the given index.
///
/// Most projects should create at most one reader for a given index.
/// This method is typically called only once per `Index` instance,
/// over the lifetime of most programs.
///
/// The builder is handed a clone of this `Index`.
pub fn reader_builder(&self) -> IndexReaderBuilder {
IndexReaderBuilder::new(self.clone())
}
/// Opens a new directory from an index path.
#[cfg(feature = "mmap")]
pub fn open_in_dir<P: AsRef<Path>>(directory_path: P) -> Result<Index> {
@@ -214,7 +199,7 @@ impl Index {
/// Open the index using the provided directory
pub fn open<D: Directory>(directory: D) -> Result<Index> {
let directory = ManagedDirectory::wrap(directory)?;
let directory = ManagedDirectory::new(directory)?;
let metas = load_metas(&directory)?;
Index::create_from_metas(directory, &metas)
}
@@ -240,8 +225,7 @@ impl Index {
/// Each thread will receive a budget of `overall_heap_size_in_bytes / num_threads`.
///
/// # Errors
/// If the lockfile already exists, returns `Error::DirectoryLockBusy` or an `Error::IOError`.
///
/// If the lockfile already exists, returns `Error::FileAlreadyExists`.
/// # Panics
/// If the heap size per thread is too small, panics.
pub fn writer_with_num_threads(
@@ -249,21 +233,7 @@ impl Index {
num_threads: usize,
overall_heap_size_in_bytes: usize,
) -> Result<IndexWriter> {
let directory_lock = self
.directory
.acquire_lock(&INDEX_WRITER_LOCK)
.map_err(|err| {
TantivyError::LockFailure(
err,
Some(
"Failed to acquire index lock. If you are using\
a regular directory, this means there is already an \
`IndexWriter` working on this `Directory`, in this process \
or in a different process."
.to_string(),
),
)
})?;
let directory_lock = LockType::IndexWriterLock.acquire_lock(&self.directory)?;
let heap_size_in_bytes_per_thread = overall_heap_size_in_bytes / num_threads;
open_index_writer(
self,
@@ -343,6 +313,53 @@ impl Index {
.map(|segment_meta| segment_meta.id())
.collect())
}
/// Sets the number of searchers to use
///
/// The value is only stored here; it is read by `load_searchers`, so it
/// only takes effect after the next call to `load_searchers`.
pub fn set_num_searchers(&mut self, num_searchers: usize) {
self.num_searchers.store(num_searchers, Ordering::Release);
}
/// Refreshes the searcher pool so that searchers reflect the state of
/// the last `.commit()`.
///
/// If indexing happens in the same process as searching,
/// you most likely want to call `.load_searchers()` right after each
/// successful call to `.commit()`.
///
/// If indexing and searching happen in different processes, the way to
/// get the freshest `index` at all times is to watch `meta.json` and
/// call `load_searchers` whenever a change happens.
///
/// # Errors
/// Fails if the meta lock cannot be acquired, if the searchable segments
/// cannot be listed, or if a segment reader cannot be opened.
pub fn load_searchers(&self) -> Result<()> {
    // The guard is kept alive until the new generation is published.
    let _meta_lock = LockType::MetaLock.acquire_lock(self.directory())?;

    let segments = self.searchable_segments()?;
    let mut segment_readers: Vec<SegmentReader> = Vec::new();
    for segment in segments.iter() {
        segment_readers.push(SegmentReader::open(segment)?);
    }

    let schema = self.schema();
    let pool_size: usize = self.num_searchers.load(Ordering::Acquire);
    // Every searcher of the new generation gets its own copy of the
    // schema, the index handle and the segment readers.
    let new_generation = (0..pool_size)
        .map(|_| Searcher::new(schema.clone(), self.clone(), segment_readers.clone()))
        .collect();
    self.searcher_pool.publish_new_generation(new_generation);
    Ok(())
}
/// Returns a searcher
///
/// This method should be called every single time a search
/// query is performed.
/// The searchers are taken from a pool of `num_searchers` searchers.
/// If no searcher is available, this may block.
///
/// The same searcher must be used for a given query, as it ensures
/// the use of a consistent segment set.
///
/// The searcher is handed out as a `LeasedItem` acquired from the
/// searcher pool.
pub fn searcher(&self) -> LeasedItem<Searcher> {
self.searcher_pool.acquire()
}
}
impl fmt::Debug for Index {
@@ -351,22 +368,29 @@ impl fmt::Debug for Index {
}
}
impl Clone for Index {
    /// Builds a new `Index` handle from clones of every field.
    ///
    /// `num_searchers` and `searcher_pool` are `Arc`s, so the clone shares
    /// them with the original handle.
    fn clone(&self) -> Index {
        let directory = self.directory.clone();
        let schema = self.schema.clone();
        let num_searchers = Arc::clone(&self.num_searchers);
        let searcher_pool = Arc::clone(&self.searcher_pool);
        let tokenizers = self.tokenizers.clone();
        let executor = self.executor.clone();
        Index {
            directory,
            schema,
            num_searchers,
            searcher_pool,
            tokenizers,
            executor,
        }
    }
}
#[cfg(test)]
mod tests {
use directory::RAMDirectory;
use schema::Field;
use schema::{Schema, INDEXED, TEXT};
use std::thread;
use std::time::Duration;
use schema::{Schema, INT_INDEXED, TEXT};
use Index;
use IndexReader;
use IndexWriter;
use ReloadPolicy;
#[test]
fn test_indexer_for_field() {
let mut schema_builder = Schema::builder();
let num_likes_field = schema_builder.add_u64_field("num_likes", INDEXED);
let num_likes_field = schema_builder.add_u64_field("num_likes", INT_INDEXED);
let body_field = schema_builder.add_text_field("body", TEXT);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
@@ -424,117 +448,7 @@ mod tests {
fn throw_away_schema() -> Schema {
let mut schema_builder = Schema::builder();
let _ = schema_builder.add_u64_field("num_likes", INDEXED);
let _ = schema_builder.add_u64_field("num_likes", INT_INDEXED);
schema_builder.build()
}
// Runs the shared `OnCommit` reload scenario against an index created in RAM.
#[test]
fn test_index_on_commit_reload_policy() {
let schema = throw_away_schema();
let field = schema.get_field("num_likes").unwrap();
let index = Index::create_in_ram(schema);
// The reader is built before any document is written, so it must start out empty.
let reader = index
.reader_builder()
.reload_policy(ReloadPolicy::OnCommit)
.try_into()
.unwrap();
assert_eq!(reader.searcher().num_docs(), 0);
let mut writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
test_index_on_commit_reload_policy_aux(field, &mut writer, &reader);
}
// Reload-policy tests that need an on-disk index; only compiled when the
// `mmap` feature is enabled.
#[cfg(feature = "mmap")]
mod mmap_specific {
use super::*;
use std::path::PathBuf;
use tempdir::TempDir;
// `OnCommit` policy on an index created in a temporary directory.
#[test]
fn test_index_on_commit_reload_policy_mmap() {
let schema = throw_away_schema();
let field = schema.get_field("num_likes").unwrap();
let tempdir = TempDir::new("index").unwrap();
let tempdir_path = PathBuf::from(tempdir.path());
let index = Index::create_in_dir(&tempdir_path, schema).unwrap();
let mut writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
// An initial (empty) commit is made before the reader is created.
writer.commit().unwrap();
let reader = index
.reader_builder()
.reload_policy(ReloadPolicy::OnCommit)
.try_into()
.unwrap();
assert_eq!(reader.searcher().num_docs(), 0);
test_index_on_commit_reload_policy_aux(field, &mut writer, &reader);
}
// `Manual` policy: a commit must not become visible until `reload()` is called.
#[test]
fn test_index_manual_policy_mmap() {
let schema = throw_away_schema();
let field = schema.get_field("num_likes").unwrap();
let index = Index::create_from_tempdir(schema).unwrap();
let mut writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
writer.commit().unwrap();
let reader = index
.reader_builder()
.reload_policy(ReloadPolicy::Manual)
.try_into()
.unwrap();
assert_eq!(reader.searcher().num_docs(), 0);
writer.add_document(doc!(field=>1u64));
writer.commit().unwrap();
// Give any (unwanted) automatic reload a chance to happen before checking
// that the reader still sees the old state.
thread::sleep(Duration::from_millis(500));
assert_eq!(reader.searcher().num_docs(), 0);
// Only the explicit reload makes the new document visible.
reader.reload().unwrap();
assert_eq!(reader.searcher().num_docs(), 1);
}
// `OnCommit` policy where the writer and the reader use two distinct `Index`
// objects opened on the same directory.
#[test]
fn test_index_on_commit_reload_policy_different_directories() {
let schema = throw_away_schema();
let field = schema.get_field("num_likes").unwrap();
let tempdir = TempDir::new("index").unwrap();
let tempdir_path = PathBuf::from(tempdir.path());
let write_index = Index::create_in_dir(&tempdir_path, schema).unwrap();
let read_index = Index::open_in_dir(&tempdir_path).unwrap();
let reader = read_index
.reader_builder()
.reload_policy(ReloadPolicy::OnCommit)
.try_into()
.unwrap();
assert_eq!(reader.searcher().num_docs(), 0);
let mut writer = write_index.writer_with_num_threads(1, 3_000_000).unwrap();
test_index_on_commit_reload_policy_aux(field, &mut writer, &reader);
}
}
// Shared scenario for the `OnCommit` reload-policy tests.
//
// Commits one document, then a second one, and after each commit polls
// `reader.searcher().num_docs()` until the expected document count shows up
// (or the polling budget is exhausted, in which case the final assert fails).
fn test_index_on_commit_reload_policy_aux(
field: Field,
writer: &mut IndexWriter,
reader: &IndexReader,
) {
assert_eq!(reader.searcher().num_docs(), 0);
writer.add_document(doc!(field=>1u64));
writer.commit().unwrap();
// Poll up to 100 times, 100ms apart, for the first document to become visible.
let mut count = 0;
for _ in 0..100 {
count = reader.searcher().num_docs();
if count > 0 {
break;
}
thread::sleep(Duration::from_millis(100));
}
assert_eq!(count, 1);
writer.add_document(doc!(field=>2u64));
writer.commit().unwrap();
// NOTE(review): this second wait polls only 10 times (vs. 100 above), i.e. a
// much shorter timeout — confirm the asymmetry is intentional.
let mut count = 0;
for _ in 0..10 {
count = reader.searcher().num_docs();
if count > 1 {
break;
}
thread::sleep(Duration::from_millis(100));
}
assert_eq!(count, 2);
}
}

View File

@@ -32,7 +32,10 @@ pub struct InvertedIndexReader {
}
impl InvertedIndexReader {
#[cfg_attr(feature = "cargo-clippy", allow(clippy::needless_pass_by_value))] // for symmetry
#[cfg_attr(
feature = "cargo-clippy",
allow(clippy::needless_pass_by_value)
)] // for symmetry
pub(crate) fn new(
termdict: TermDictionary,
postings_source: ReadOnlySource,

View File

@@ -2,6 +2,7 @@ mod executor;
pub mod index;
mod index_meta;
mod inverted_index_reader;
mod pool;
pub mod searcher;
mod segment;
mod segment_component;
@@ -24,7 +25,6 @@ pub use self::segment_reader::SegmentReader;
use std::path::PathBuf;
lazy_static! {
/// The meta file contains all the information about the list of segments and the schema
/// of the index.
pub static ref META_FILEPATH: PathBuf = PathBuf::from("meta.json");

View File

@@ -1,5 +1,5 @@
use crossbeam::crossbeam_channel::unbounded;
use crossbeam::{Receiver, RecvError, Sender};
use crossbeam::queue::MsQueue;
use std::mem;
use std::ops::{Deref, DerefMut};
use std::sync::atomic::AtomicUsize;
use std::sync::atomic::Ordering;
@@ -10,52 +10,15 @@ pub struct GenerationItem<T> {
item: T,
}
/// Queue implementation for the object pool below.
///
/// Backed by an unbounded crossbeam channel: the queue keeps both the sending
/// and the receiving end of that channel.
struct Queue<T> {
sender: Sender<T>,
receiver: Receiver<T>,
}
impl<T> Queue<T> {
/// Creates an empty queue on top of a fresh unbounded channel.
fn new() -> Self {
let (s, r) = unbounded();
Queue {
sender: s,
receiver: r,
}
}
/// Appends `elem` to the queue.
///
/// # Panics
///
/// `send` returns a `Result`; it is not propagated to the caller. If sending
/// fails, this method panics through `expect`.
fn push(&self, elem: T) {
self.sender
.send(elem)
.expect("Sending an item to crossbeam-queue shouldn't fail");
}
/// Removes and returns the next element.
///
/// Relies on the underlying crossbeam-channel `Receiver::recv`
/// to block while the queue is empty. The `RecvError` of `recv` is
/// returned to the caller as is.
fn pop(&self) -> Result<T, RecvError> {
self.receiver.recv()
}
}
/// An object pool
///
/// This is used in tantivy to create a pool of `Searcher`.
/// Object are wrapped in a `LeasedItem` wrapper and are
/// released automatically back into the pool on `Drop`.
pub struct Pool<T> {
queue: Arc<Queue<GenerationItem<T>>>,
queue: Arc<MsQueue<GenerationItem<T>>>,
freshest_generation: AtomicUsize,
next_generation: AtomicUsize,
}
impl<T> Pool<T> {
pub fn new() -> Pool<T> {
let queue = Arc::new(Queue::new());
let queue = Arc::new(MsQueue::new());
Pool {
queue,
freshest_generation: AtomicUsize::default(),
@@ -63,10 +26,6 @@ impl<T> Pool<T> {
}
}
/// Publishes a new generation of `Searcher`.
///
/// After publish, all new `Searcher` acquired will be
/// of the new generation.
pub fn publish_new_generation(&self, items: Vec<T>) {
let next_generation = self.next_generation.fetch_add(1, Ordering::SeqCst) + 1;
for item in items {
@@ -102,14 +61,10 @@ impl<T> Pool<T> {
self.freshest_generation.load(Ordering::Acquire)
}
/// Acquires a new searcher.
///
/// If no searcher is available, this methods block until
/// a searcher is released.
pub fn acquire(&self) -> LeasedItem<T> {
let generation = self.generation();
loop {
let gen_item = self.queue.pop().unwrap();
let gen_item = self.queue.pop();
if gen_item.generation >= generation {
return LeasedItem {
gen_item: Some(gen_item),
@@ -125,7 +80,7 @@ impl<T> Pool<T> {
pub struct LeasedItem<T> {
gen_item: Option<GenerationItem<T>>,
recycle_queue: Arc<Queue<GenerationItem<T>>>,
recycle_queue: Arc<MsQueue<GenerationItem<T>>>,
}
impl<T> Deref for LeasedItem<T> {
@@ -152,9 +107,9 @@ impl<T> DerefMut for LeasedItem<T> {
impl<T> Drop for LeasedItem<T> {
fn drop(&mut self) {
if let Some(gen_item) = self.gen_item.take() {
self.recycle_queue.push(gen_item);
}
let gen_item: GenerationItem<T> = mem::replace(&mut self.gen_item, None)
.expect("Unwrapping a leased item should never fail");
self.recycle_queue.push(gen_item);
}
}
@@ -162,7 +117,6 @@ impl<T> Drop for LeasedItem<T> {
mod tests {
use super::Pool;
use super::Queue;
use std::iter;
#[test]
@@ -179,47 +133,4 @@ mod tests {
assert_eq!(*pool.acquire(), 11);
}
}
#[test]
fn test_queue() {
    // A value pushed into the queue must come back out unchanged.
    let queue = Queue::new();
    let expected = 5;
    queue.push(expected);
    let popped = queue.pop().unwrap();
    assert_eq!(popped, expected);
}
#[test]
fn test_pool_dont_panic_on_empty_pop() {
    // When the object pool is exhausted, `acquire()` must wait for an item to
    // be released instead of panicking.
    use std::sync::Arc;
    use std::{thread, time};
    // Wrap the pool in an Arc, the same way it is used in `core/index.rs`.
    let pool = Arc::new(Pool::new());
    pool.publish_new_generation(vec![1, 2]);
    let sleep_dur = time::Duration::from_millis(10);
    // Spawn one more thread than there are elements in the pool, so that at
    // least one of them has to wait on an empty pool.
    let threads: Vec<_> = (0..3)
        .map(|_| {
            // Clone the handle outside of the `move` closure of the new thread.
            let pool = Arc::clone(&pool);
            thread::spawn(move || {
                // Keep the lease alive across the sleep so the item is not
                // returned to the pool right away.
                let _leased_item = pool.acquire();
                thread::sleep(sleep_dur);
            })
        })
        .collect();
    // Join every worker. Without this the test returns before the workers have
    // run, and a panic inside a worker would never fail the test.
    for handle in threads {
        handle
            .join()
            .expect("a worker panicked while acquiring from an exhausted pool");
    }
}
}

View File

@@ -104,8 +104,7 @@ impl Searcher {
.iter()
.map(|segment_reader| {
u64::from(segment_reader.inverted_index(term.field()).doc_freq(term))
})
.sum::<u64>()
}).sum::<u64>()
}
/// Return the list of segment readers

View File

@@ -41,6 +41,6 @@ impl SegmentComponent {
SegmentComponent::STORE,
SegmentComponent::DELETE,
];
SEGMENT_COMPONENTS.iter()
SEGMENT_COMPONENTS.into_iter()
}
}

View File

@@ -19,7 +19,7 @@ pub struct SegmentId(Uuid);
#[cfg(test)]
lazy_static! {
static ref AUTO_INC_COUNTER: atomic::AtomicUsize = atomic::AtomicUsize::default();
static ref ZERO_ARRAY: [u8; 8] = [0u8; 8];
static ref EMPTY_ARR: [u8; 8] = [0u8; 8];
}
// During tests, we generate the segment id in a autoincrement manner
@@ -30,7 +30,7 @@ lazy_static! {
#[cfg(test)]
fn create_uuid() -> Uuid {
let new_auto_inc_id = (*AUTO_INC_COUNTER).fetch_add(1, atomic::Ordering::SeqCst);
Uuid::from_fields(new_auto_inc_id as u32, 0, 0, &*ZERO_ARRAY).unwrap()
Uuid::from_fields(new_auto_inc_id as u32, 0, 0, &*EMPTY_ARR).unwrap()
}
#[cfg(not(test))]

View File

@@ -477,7 +477,9 @@ mod test {
// ok, now we should have a deleted doc
index_writer2.commit().unwrap();
}
let searcher = index.reader().unwrap().searcher();
index.load_searchers().unwrap();
let searcher = index.searcher();
let docs: Vec<DocId> = searcher.segment_reader(0).doc_ids_alive().collect();
assert_eq!(vec![0u32, 2u32], docs);
}

View File

@@ -1,104 +1,11 @@
use directory::directory_lock::Lock;
use directory::error::LockError;
use directory::error::{DeleteError, OpenReadError, OpenWriteError};
use directory::WatchCallback;
use directory::WatchHandle;
use directory::{ReadOnlySource, WritePtr};
use std::fmt;
use std::io;
use std::io::Write;
use std::marker::Send;
use std::marker::Sync;
use std::path::Path;
use std::path::PathBuf;
use std::result;
use std::thread;
use std::time::Duration;
/// Retry policy used when acquiring locks.
///
/// The logic is deliberately simple: retry up to `num_retries` times, sleeping
/// `wait_in_ms` milliseconds before each retry. Both values depend on the type
/// of lock.
struct RetryPolicy {
    num_retries: usize,
    wait_in_ms: u64,
}

impl RetryPolicy {
    /// A policy that never retries.
    fn no_retry() -> RetryPolicy {
        RetryPolicy {
            num_retries: 0,
            wait_in_ms: 0,
        }
    }

    /// Consumes one retry, if any is left.
    ///
    /// Returns `false` immediately when the retry budget is exhausted.
    /// Otherwise decrements the budget, sleeps for `wait_in_ms` milliseconds
    /// and returns `true`.
    fn wait_and_retry(&mut self) -> bool {
        if self.num_retries == 0 {
            return false;
        }
        self.num_retries -= 1;
        thread::sleep(Duration::from_millis(self.wait_in_ms));
        true
    }
}
/// The `DirectoryLock` is an object that represents a file lock.
/// See [`LockType`](struct.LockType.html)
///
/// It is transparently associated to a lock file, that gets deleted
/// on `Drop`. The lock is released automatically on `Drop`.
pub struct DirectoryLock(Box<Drop + Send + 'static>);
// Guard stored inside a `DirectoryLock`: remembers which file in which
// directory materializes the lock.
struct DirectoryLockGuard {
directory: Box<Directory>,
path: PathBuf,
}
// Any boxed `Drop + Send + 'static` value can be wrapped into a `DirectoryLock`.
impl<T: Drop + Send + 'static> From<Box<T>> for DirectoryLock {
fn from(underlying: Box<T>) -> Self {
DirectoryLock(underlying)
}
}
impl Drop for DirectoryLockGuard {
    /// Deletes the lock file from the directory.
    ///
    /// A failed deletion is only logged: `drop` cannot report an error.
    fn drop(&mut self) {
        let deletion = self.directory.delete(&*self.path);
        if let Err(e) = deletion {
            error!("Failed to remove the lock file. {:?}", e);
        }
    }
}
// Outcome of a single failed attempt at creating the lock file.
enum TryAcquireLockError {
// The lock file already exists.
FileExists,
// Creating or flushing the lock file failed with an io error.
IOError(io::Error),
}
/// Makes a single attempt at acquiring the lock backed by `filepath`.
///
/// The lock is taken by creating the file through `Directory::open_write`.
/// If the file already exists, `TryAcquireLockError::FileExists` is returned;
/// any io failure while creating or flushing the file is returned as
/// `TryAcquireLockError::IOError`.
fn try_acquire_lock(
    filepath: &Path,
    directory: &mut Directory,
) -> Result<DirectoryLock, TryAcquireLockError> {
    let mut lock_file = match directory.open_write(filepath) {
        Ok(lock_file) => lock_file,
        Err(OpenWriteError::FileAlreadyExists(_)) => {
            return Err(TryAcquireLockError::FileExists);
        }
        Err(OpenWriteError::IOError(io_error)) => {
            return Err(TryAcquireLockError::IOError(io_error.into()));
        }
    };
    lock_file.flush().map_err(TryAcquireLockError::IOError)?;
    // The guard deletes the lock file when the returned `DirectoryLock` is dropped.
    let guard = DirectoryLockGuard {
        directory: directory.box_clone(),
        path: filepath.to_owned(),
    };
    Ok(DirectoryLock::from(Box::new(guard)))
}
/// Picks the retry policy matching the kind of lock.
///
/// Non-blocking locks never retry; blocking locks retry up to 100 times with
/// a 100ms wait between attempts.
fn retry_policy(is_blocking: bool) -> RetryPolicy {
    if !is_blocking {
        return RetryPolicy::no_retry();
    }
    RetryPolicy {
        num_retries: 100,
        wait_in_ms: 100,
    }
}
/// Write-once read many (WORM) abstraction for where
/// tantivy's data should be stored.
@@ -166,55 +73,6 @@ pub trait Directory: DirectoryClone + fmt::Debug + Send + Sync + 'static {
///
/// The file may or may not previously exist.
fn atomic_write(&mut self, path: &Path, data: &[u8]) -> io::Result<()>;
/// Acquires a lock in the given directory.
///
/// Whether the call blocks depends on `lock.is_blocking`: while the lock
/// file already exists, the attempt is repeated according to the retry
/// policy, and `LockError::LockBusy` is returned once no retry is left.
/// An io error aborts immediately with `LockError::IOError`.
fn acquire_lock(&self, lock: &Lock) -> Result<DirectoryLock, LockError> {
    let mut directory = self.box_clone();
    let mut policy = retry_policy(lock.is_blocking);
    loop {
        match try_acquire_lock(&lock.filepath, &mut *directory) {
            Ok(directory_lock) => return Ok(directory_lock),
            Err(TryAcquireLockError::IOError(io_error)) => {
                return Err(LockError::IOError(io_error));
            }
            // The lock is held by someone else: wait and try again if allowed.
            Err(TryAcquireLockError::FileExists) if policy.wait_and_retry() => {}
            Err(TryAcquireLockError::FileExists) => return Err(LockError::LockBusy),
        }
    }
}
/// Registers a callback that will be called whenever a change to `meta.json`
/// made through the `atomic_write` API is detected.
///
/// The behavior when using `.watch()` on a file written with `.open_write(...)` is, on the
/// other hand, undefined.
///
/// The file will be watched for the lifetime of the returned `WatchHandle`. The caller is
/// required to keep it.
/// It does not override previous callbacks. When the file is modified, all callbacks that are
/// registered (and whose `WatchHandle` is still alive) are triggered.
///
/// Internally, tantivy only uses this API to detect new commits to implement the
/// `OnCommit` `ReloadPolicy`. Not implementing watch in a `Directory` only prevents the
/// `OnCommit` `ReloadPolicy` from working properly.
fn watch(&self, watch_callback: WatchCallback) -> WatchHandle;
/// Ensures that all volatile files are persisted (in directories where that makes sense).
///
/// In order to make Near Real Time efficient, tantivy introduced the notion of soft_commit vs
/// commit. Commit will call `.flush()`, while soft_commit won't.
///
/// `meta.json` should be the last file to be flushed.
///
/// The default implementation does nothing and returns `Ok(())`.
fn flush(&self) -> io::Result<()> {
Ok(())
}
}
/// DirectoryClone

View File

@@ -1,56 +0,0 @@
use std::path::PathBuf;
/// A directory lock.
///
/// A lock is associated to a specific path and to a blocking mode
/// (see `is_blocking`).
/// Tantivy itself uses only two locks but client application
/// can use the directory facility to define their own locks.
/// - [INDEX_WRITER_LOCK](./struct.INDEX_WRITER_LOCK.html)
/// - [META_LOCK](./struct.META_LOCK.html)
///
/// Check out these locks documentation for more information.
///
#[derive(Debug)]
pub struct Lock {
/// The lock needs to be associated with its own file `path`.
/// Depending on the platform, the lock might rely on the creation
/// and deletion of this filepath.
pub filepath: PathBuf,
/// `is_blocking` describes whether acquiring the lock is meant
/// to be a blocking operation or a non-blocking one.
///
/// Acquiring a blocking lock blocks until the lock is
/// available.
/// Acquiring a non-blocking lock returns rapidly, either successfully
/// or with an error signifying that someone is already holding
/// the lock.
pub is_blocking: bool,
}
lazy_static! {
/// Only one process should be able to write tantivy's index at a time.
/// This lock file, when present, is in charge of preventing other processes from opening an IndexWriter.
///
/// If the process is killed and this file remains, it is safe to remove it manually.
///
/// Failing to acquire this lock usually means a misuse of tantivy's API
/// (creating more than one instance of the `IndexWriter`), or a spurious
/// lock file remaining after a crash. In the latter case, removing the file after
/// checking that no process using tantivy is running is safe.
pub static ref INDEX_WRITER_LOCK: Lock = Lock {
filepath: PathBuf::from(".tantivy-writer.lock"),
is_blocking: false
};
/// The meta lock file is here to protect the segment files being opened by
/// `IndexReader::reload()` from being garbage collected.
/// It makes it possible for another process to safely consume
/// our index in-writing. Ideally, we may have preferred `RWLock` semantics
/// here, but it is difficult to achieve on Windows.
///
/// Opening segment readers is a very fast process.
pub static ref META_LOCK: Lock = Lock {
filepath: PathBuf::from(".tantivy-meta.lock"),
is_blocking: true
};
}

View File

@@ -3,22 +3,6 @@ use std::fmt;
use std::io;
use std::path::PathBuf;
/// Error while trying to acquire a directory lock.
#[derive(Debug, Fail)]
pub enum LockError {
/// Failed to acquire a lock as it is already held by another
/// client.
/// - In the context of a blocking lock, this means the lock was not released within some `timeout` period.
/// - In the context of a non-blocking lock, this means the lock was busy at the moment of the call.
#[fail(
display = "Could not acquire lock as it is already held, possibly by a different process."
)]
LockBusy,
/// Trying to acquire a lock failed with an `io::Error`.
#[fail(display = "Failed to acquire the lock due to an io:Error.")]
IOError(io::Error),
}
/// General IO error with an optional path to the offending file.
#[derive(Debug)]
pub struct IOError {
@@ -26,12 +10,6 @@ pub struct IOError {
err: io::Error,
}
impl Into<io::Error> for IOError {
fn into(self) -> io::Error {
self.err
}
}
impl fmt::Display for IOError {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
match self.path {
@@ -73,14 +51,6 @@ pub enum OpenDirectoryError {
DoesNotExist(PathBuf),
/// The path exists but is not a directory.
NotADirectory(PathBuf),
/// IoError
IoError(io::Error),
}
// Wraps a raw `io::Error` into the `IoError` variant, which lets `?` convert
// io errors into `OpenDirectoryError`.
impl From<io::Error> for OpenDirectoryError {
fn from(io_err: io::Error) -> Self {
OpenDirectoryError::IoError(io_err)
}
}
impl fmt::Display for OpenDirectoryError {
@@ -92,11 +62,6 @@ impl fmt::Display for OpenDirectoryError {
OpenDirectoryError::NotADirectory(ref path) => {
write!(f, "the path '{:?}' exists but is not a directory", path)
}
OpenDirectoryError::IoError(ref err) => write!(
f,
"IOError while trying to open/create the directory. {:?}",
err
),
}
}
}

View File

@@ -1,11 +1,8 @@
use core::MANAGED_FILEPATH;
use directory::error::{DeleteError, IOError, LockError, OpenReadError, OpenWriteError};
use directory::DirectoryLock;
use directory::Lock;
use directory::META_LOCK;
use directory::error::{DeleteError, IOError, OpenReadError, OpenWriteError};
use directory::{ReadOnlySource, WritePtr};
use directory::{WatchCallback, WatchHandle};
use error::DataCorruption;
use error::TantivyError;
use indexer::LockType;
use serde_json;
use std::collections::HashSet;
use std::io;
@@ -62,17 +59,12 @@ fn save_managed_paths(
impl ManagedDirectory {
/// Wraps a directory as managed directory.
pub fn wrap<Dir: Directory>(directory: Dir) -> Result<ManagedDirectory> {
pub fn new<Dir: Directory>(directory: Dir) -> Result<ManagedDirectory> {
match directory.atomic_read(&MANAGED_FILEPATH) {
Ok(data) => {
let managed_files_json = String::from_utf8_lossy(&data);
let managed_files: HashSet<PathBuf> = serde_json::from_str(&managed_files_json)
.map_err(|e| {
DataCorruption::new(
MANAGED_FILEPATH.clone(),
format!("Managed file cannot be deserialized: {:?}. ", e),
)
})?;
.map_err(|_| TantivyError::CorruptedFile(MANAGED_FILEPATH.clone()))?;
Ok(ManagedDirectory {
directory: Box::new(directory),
meta_informations: Arc::new(RwLock::new(MetaInformation {
@@ -95,9 +87,6 @@ impl ManagedDirectory {
///
/// * `living_files` - List of files that are still used by the index.
///
/// The use a callback ensures that the list of living_files is computed
/// while we hold the lock on meta.
///
/// This method does not panick nor returns errors.
/// If a file cannot be deleted (for permission reasons for instance)
/// an error is simply logged, and the file remains in the list of managed
@@ -128,7 +117,7 @@ impl ManagedDirectory {
// 2) writer change meta.json (for instance after a merge or a commit)
// 3) gc kicks in.
// 4) gc removes a file that was useful for process B, before process B opened it.
if let Ok(_meta_lock) = self.acquire_lock(&META_LOCK) {
if let Ok(_meta_lock) = LockType::MetaLock.acquire_lock(self) {
let living_files = get_living_files();
for managed_path in &meta_informations_rlock.managed_paths {
if !living_files.contains(managed_path) {
@@ -238,14 +227,6 @@ impl Directory for ManagedDirectory {
fn exists(&self, path: &Path) -> bool {
self.directory.exists(path)
}
fn acquire_lock(&self, lock: &Lock) -> result::Result<DirectoryLock, LockError> {
self.directory.acquire_lock(lock)
}
fn watch(&self, watch_callback: WatchCallback) -> WatchHandle {
self.directory.watch(watch_callback)
}
}
impl Clone for ManagedDirectory {
@@ -260,98 +241,95 @@ impl Clone for ManagedDirectory {
#[cfg(test)]
mod tests {
use super::*;
#[cfg(feature = "mmap")]
mod mmap_specific {
use directory::MmapDirectory;
use std::io::Write;
use std::path::Path;
use tempdir::TempDir;
use super::super::*;
use std::path::Path;
use tempdir::TempDir;
lazy_static! {
static ref TEST_PATH1: &'static Path = Path::new("some_path_for_test");
static ref TEST_PATH2: &'static Path = Path::new("some_path_for_test2");
}
use directory::MmapDirectory;
use std::io::Write;
#[test]
fn test_managed_directory() {
let tempdir = TempDir::new("index").unwrap();
let tempdir_path = PathBuf::from(tempdir.path());
{
let mmap_directory = MmapDirectory::open(&tempdir_path).unwrap();
let mut managed_directory = ManagedDirectory::wrap(mmap_directory).unwrap();
{
let mut write_file = managed_directory.open_write(*TEST_PATH1).unwrap();
write_file.flush().unwrap();
}
{
managed_directory
.atomic_write(*TEST_PATH2, &vec![0u8, 1u8])
.unwrap();
}
{
assert!(managed_directory.exists(*TEST_PATH1));
assert!(managed_directory.exists(*TEST_PATH2));
}
{
let living_files: HashSet<PathBuf> =
[TEST_PATH1.to_owned()].into_iter().cloned().collect();
managed_directory.garbage_collect(|| living_files);
}
{
assert!(managed_directory.exists(*TEST_PATH1));
assert!(!managed_directory.exists(*TEST_PATH2));
}
}
{
let mmap_directory = MmapDirectory::open(&tempdir_path).unwrap();
let mut managed_directory = ManagedDirectory::wrap(mmap_directory).unwrap();
{
assert!(managed_directory.exists(*TEST_PATH1));
assert!(!managed_directory.exists(*TEST_PATH2));
}
{
let living_files: HashSet<PathBuf> = HashSet::new();
managed_directory.garbage_collect(|| living_files);
}
{
assert!(!managed_directory.exists(*TEST_PATH1));
assert!(!managed_directory.exists(*TEST_PATH2));
}
}
}
#[test]
fn test_managed_directory_gc_while_mmapped() {
let tempdir = TempDir::new("index").unwrap();
let tempdir_path = PathBuf::from(tempdir.path());
let living_files = HashSet::new();
lazy_static! {
static ref TEST_PATH1: &'static Path = Path::new("some_path_for_test");
static ref TEST_PATH2: &'static Path = Path::new("some_path_for_test2");
}
#[test]
#[cfg(feature = "mmap")]
fn test_managed_directory() {
let tempdir = TempDir::new("index").unwrap();
let tempdir_path = PathBuf::from(tempdir.path());
{
let mmap_directory = MmapDirectory::open(&tempdir_path).unwrap();
let mut managed_directory = ManagedDirectory::wrap(mmap_directory).unwrap();
managed_directory
.atomic_write(*TEST_PATH1, &vec![0u8, 1u8])
.unwrap();
assert!(managed_directory.exists(*TEST_PATH1));
let _mmap_read = managed_directory.open_read(*TEST_PATH1).unwrap();
managed_directory.garbage_collect(|| living_files.clone());
if cfg!(target_os = "windows") {
// On Windows, gc should try and fail the file as it is mmapped.
let mut managed_directory = ManagedDirectory::new(mmap_directory).unwrap();
{
let mut write_file = managed_directory.open_write(*TEST_PATH1).unwrap();
write_file.flush().unwrap();
}
{
managed_directory
.atomic_write(*TEST_PATH2, &vec![0u8, 1u8])
.unwrap();
}
{
assert!(managed_directory.exists(*TEST_PATH1));
// unmap should happen here.
drop(_mmap_read);
// The file should still be in the list of managed file and
// eventually be deleted once mmap is released.
assert!(managed_directory.exists(*TEST_PATH2));
}
{
let living_files: HashSet<PathBuf> =
[TEST_PATH1.to_owned()].into_iter().cloned().collect();
managed_directory.garbage_collect(|| living_files);
assert!(!managed_directory.exists(*TEST_PATH1));
} else {
assert!(!managed_directory.exists(*TEST_PATH1));
}
{
assert!(managed_directory.exists(*TEST_PATH1));
assert!(!managed_directory.exists(*TEST_PATH2));
}
}
{
let mmap_directory = MmapDirectory::open(&tempdir_path).unwrap();
let mut managed_directory = ManagedDirectory::new(mmap_directory).unwrap();
{
assert!(managed_directory.exists(*TEST_PATH1));
assert!(!managed_directory.exists(*TEST_PATH2));
}
{
let living_files: HashSet<PathBuf> = HashSet::new();
managed_directory.garbage_collect(|| living_files);
}
{
assert!(!managed_directory.exists(*TEST_PATH1));
assert!(!managed_directory.exists(*TEST_PATH2));
}
}
}
// Garbage collecting a file while it is still mmapped.
//
// The feature name must be exactly `mmap`: with a trailing space inside the
// string the `cfg` never matches and the test is silently compiled out.
#[test]
#[cfg(feature = "mmap")]
fn test_managed_directory_gc_while_mmapped() {
    let tempdir = TempDir::new("index").unwrap();
    let tempdir_path = PathBuf::from(tempdir.path());
    // No file is considered alive, so gc is allowed to remove everything.
    let living_files = HashSet::new();

    let mmap_directory = MmapDirectory::open(&tempdir_path).unwrap();
    let mut managed_directory = ManagedDirectory::new(mmap_directory).unwrap();
    managed_directory
        .atomic_write(*TEST_PATH1, &[0u8, 1u8])
        .unwrap();
    assert!(managed_directory.exists(*TEST_PATH1));

    // Keep the file mmapped while the garbage collection runs.
    let _mmap_read = managed_directory.open_read(*TEST_PATH1).unwrap();
    managed_directory.garbage_collect(|| living_files.clone());
    if cfg!(target_os = "windows") {
        // On Windows, gc should try and fail the file as it is mmapped.
        assert!(managed_directory.exists(*TEST_PATH1));
        // unmap should happen here.
        drop(_mmap_read);
        // The file should still be in the list of managed file and
        // eventually be deleted once mmap is released.
        managed_directory.garbage_collect(|| living_files);
        assert!(!managed_directory.exists(*TEST_PATH1));
    } else {
        assert!(!managed_directory.exists(*TEST_PATH1));
    }
}
}

View File

@@ -1,24 +1,12 @@
extern crate fs2;
extern crate notify;
use self::fs2::FileExt;
use self::notify::RawEvent;
use self::notify::RecursiveMode;
use self::notify::Watcher;
use atomicwrites;
use core::META_FILEPATH;
use directory::error::LockError;
use common::make_io_err;
use directory::error::{DeleteError, IOError, OpenDirectoryError, OpenReadError, OpenWriteError};
use directory::read_only_source::BoxedData;
use directory::shared_vec_slice::SharedVecSlice;
use directory::Directory;
use directory::DirectoryLock;
use directory::Lock;
use directory::ReadOnlySource;
use directory::WatchCallback;
use directory::WatchCallbackList;
use directory::WatchHandle;
use directory::WritePtr;
use memmap::Mmap;
use fst::raw::MmapReadOnly;
use std::collections::hash_map::Entry as HashMapEntry;
use std::collections::HashMap;
use std::convert::From;
use std::fmt;
@@ -28,22 +16,14 @@ use std::io::{self, Seek, SeekFrom};
use std::io::{BufWriter, Read, Write};
use std::path::{Path, PathBuf};
use std::result;
use std::sync::mpsc::{channel, Receiver, Sender};
use std::sync::Arc;
use std::sync::Mutex;
use std::sync::RwLock;
use std::sync::Weak;
use std::thread;
use tempdir::TempDir;
/// Builds an `io::Error` of kind `ErrorKind::Other` that carries `msg` as its payload.
pub(crate) fn make_io_err(msg: String) -> io::Error {
    let kind = io::ErrorKind::Other;
    io::Error::new(kind, msg)
}
/// Returns None iff the file exists, can be read, but is empty (and hence
/// cannot be mmapped)
fn open_mmap(full_path: &Path) -> result::Result<Option<Mmap>, OpenReadError> {
/// cannot be mmapped).
///
fn open_mmap(full_path: &Path) -> result::Result<Option<MmapReadOnly>, OpenReadError> {
let file = File::open(full_path).map_err(|e| {
if e.kind() == io::ErrorKind::NotFound {
OpenReadError::FileDoesNotExist(full_path.to_owned())
@@ -62,7 +42,7 @@ fn open_mmap(full_path: &Path) -> result::Result<Option<Mmap>, OpenReadError> {
return Ok(None);
}
unsafe {
memmap::Mmap::map(&file)
MmapReadOnly::open(&file)
.map(Some)
.map_err(|e| From::from(IOError::with_path(full_path.to_owned(), e)))
}
@@ -85,7 +65,7 @@ pub struct CacheInfo {
struct MmapCache {
counters: CacheCounters,
cache: HashMap<PathBuf, Weak<BoxedData>>,
cache: HashMap<PathBuf, MmapReadOnly>,
}
impl Default for MmapCache {
@@ -98,7 +78,12 @@ impl Default for MmapCache {
}
impl MmapCache {
fn get_info(&self) -> CacheInfo {
/// Removes a `MmapReadOnly` entry from the mmap cache.
///
/// Returns `true` if an entry was cached for `full_path`, `false` otherwise.
fn discard_from_cache(&mut self, full_path: &Path) -> bool {
self.cache.remove(full_path).is_some()
}
fn get_info(&mut self) -> CacheInfo {
let paths: Vec<PathBuf> = self.cache.keys().cloned().collect();
CacheInfo {
counters: self.counters.clone(),
@@ -106,105 +91,23 @@ impl MmapCache {
}
}
fn remove_weak_ref(&mut self) {
let keys_to_remove: Vec<PathBuf> = self
.cache
.iter()
.filter(|(_, mmap_weakref)| mmap_weakref.upgrade().is_none())
.map(|(key, _)| key.clone())
.collect();
for key in keys_to_remove {
self.cache.remove(&key);
}
}
// Returns None if the file exists but has a len of 0 (and hence is not mmappable).
fn get_mmap(&mut self, full_path: &Path) -> Result<Option<Arc<BoxedData>>, OpenReadError> {
if let Some(mmap_weak) = self.cache.get(full_path) {
if let Some(mmap_arc) = mmap_weak.upgrade() {
fn get_mmap(&mut self, full_path: &Path) -> Result<Option<MmapReadOnly>, OpenReadError> {
Ok(match self.cache.entry(full_path.to_owned()) {
HashMapEntry::Occupied(occupied_entry) => {
let mmap = occupied_entry.get();
self.counters.hit += 1;
return Ok(Some(mmap_arc));
Some(mmap.clone())
}
}
self.cache.remove(full_path);
self.counters.miss += 1;
Ok(if let Some(mmap) = open_mmap(full_path)? {
let mmap_arc: Arc<BoxedData> = Arc::new(Box::new(mmap));
let mmap_weak = Arc::downgrade(&mmap_arc);
self.cache.insert(full_path.to_owned(), mmap_weak);
Some(mmap_arc)
} else {
None
})
}
}
// Holds the file-system watcher together with the list of callbacks to notify.
struct InnerWatcherWrapper {
// Never read after construction (hence the leading underscore); stored so
// that it lives as long as the wrapper.
_watcher: Mutex<notify::RecommendedWatcher>,
watcher_router: WatchCallbackList,
}
impl InnerWatcherWrapper {
// Starts watching `path` recursively.
//
// Returns the wrapper along with the receiving end of the channel on which
// the watcher sends its raw events.
pub fn new(path: &Path) -> Result<(Self, Receiver<notify::RawEvent>), notify::Error> {
let (tx, watcher_recv): (Sender<RawEvent>, Receiver<RawEvent>) = channel();
// The raw watcher has to be created first, before `path` can be registered on it.
let mut watcher = notify::raw_watcher(tx)?;
watcher.watch(path, RecursiveMode::Recursive)?;
let inner = InnerWatcherWrapper {
_watcher: Mutex::new(watcher),
watcher_router: Default::default(),
};
Ok((inner, watcher_recv))
}
}
#[derive(Clone)]
pub(crate) struct WatcherWrapper {
inner: Arc<InnerWatcherWrapper>,
}
impl WatcherWrapper {
pub fn new(path: &Path) -> Result<Self, OpenDirectoryError> {
let (inner, watcher_recv) = InnerWatcherWrapper::new(path).map_err(|err| match err {
notify::Error::PathNotFound => OpenDirectoryError::DoesNotExist(path.to_owned()),
_ => {
panic!("Unknown error while starting watching directory {:?}", path);
}
})?;
let watcher_wrapper = WatcherWrapper {
inner: Arc::new(inner),
};
let watcher_wrapper_clone = watcher_wrapper.clone();
thread::Builder::new()
.name("meta-file-watch-thread".to_string())
.spawn(move || {
loop {
match watcher_recv.recv().map(|evt| evt.path) {
Ok(Some(changed_path)) => {
// ... Actually subject to false positive.
// We might want to be more accurate than this at one point.
if let Some(filename) = changed_path.file_name() {
if filename == *META_FILEPATH {
watcher_wrapper_clone.inner.watcher_router.broadcast();
}
}
}
Ok(None) => {
// not an event we are interested in.
}
Err(_e) => {
// the watch send channel was dropped
break;
}
}
HashMapEntry::Vacant(vacant_entry) => {
self.counters.miss += 1;
if let Some(mmap) = open_mmap(full_path)? {
vacant_entry.insert(mmap.clone());
Some(mmap)
} else {
None
}
})
.expect("Failed to spawn thread to watch meta.json");
Ok(watcher_wrapper)
}
pub fn watch(&mut self, watch_callback: WatchCallback) -> WatchHandle {
self.inner.watcher_router.subscribe(watch_callback)
}
})
}
}
@@ -212,72 +115,33 @@ impl WatcherWrapper {
///
/// The Mmap object are cached to limit the
/// system calls.
///
/// In the `MmapDirectory`, locks are implemented using the `fs2` crate definition of locks.
///
/// On MacOS & linux, it relies on `flock` (aka `BSD Lock`). These locks solve most of the
/// problems related to POSIX Locks, but their contract may not be respected on `NFS`
/// depending on the implementation.
///
/// On Windows the semantics are again different.
#[derive(Clone)]
pub struct MmapDirectory {
inner: Arc<MmapDirectoryInner>,
}
struct MmapDirectoryInner {
root_path: PathBuf,
mmap_cache: RwLock<MmapCache>,
_temp_directory: Option<TempDir>,
watcher: RwLock<WatcherWrapper>,
}
impl MmapDirectoryInner {
/// Builds the shared state of a `MmapDirectory` rooted at `root_path`.
///
/// A watcher is started on `root_path` first; if that fails, the error is
/// returned and nothing else is created. `temp_directory` is only stored.
fn new(
    root_path: PathBuf,
    temp_directory: Option<TempDir>,
) -> Result<MmapDirectoryInner, OpenDirectoryError> {
    let watcher = RwLock::new(WatcherWrapper::new(&root_path)?);
    Ok(MmapDirectoryInner {
        root_path,
        mmap_cache: Default::default(),
        _temp_directory: temp_directory,
        watcher,
    })
}
/// Registers `watch_callback` on the watcher and returns the resulting handle.
///
/// Panics if the watcher lock is poisoned.
fn watch(&self, watch_callback: WatchCallback) -> WatchHandle {
    self.watcher.write().unwrap().watch(watch_callback)
}
mmap_cache: Arc<RwLock<MmapCache>>,
_temp_directory: Arc<Option<TempDir>>,
}
impl fmt::Debug for MmapDirectory {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
write!(f, "MmapDirectory({:?})", self.inner.root_path)
write!(f, "MmapDirectory({:?})", self.root_path)
}
}
impl MmapDirectory {
/// Creates a `MmapDirectory` whose shared state is built by
/// `MmapDirectoryInner::new` and wrapped in an `Arc`.
fn new(
    root_path: PathBuf,
    temp_directory: Option<TempDir>,
) -> Result<MmapDirectory, OpenDirectoryError> {
    MmapDirectoryInner::new(root_path, temp_directory).map(|inner| MmapDirectory {
        inner: Arc::new(inner),
    })
}
/// Creates a new MmapDirectory in a temporary directory.
///
/// This is mostly useful to test the MmapDirectory itself.
/// For your unit tests, prefer the RAMDirectory.
pub fn create_from_tempdir() -> Result<MmapDirectory, OpenDirectoryError> {
let tempdir = TempDir::new("index").map_err(OpenDirectoryError::IoError)?;
pub fn create_from_tempdir() -> io::Result<MmapDirectory> {
let tempdir = TempDir::new("index")?;
let tempdir_path = PathBuf::from(tempdir.path());
MmapDirectory::new(tempdir_path, Some(tempdir))
let directory = MmapDirectory {
root_path: tempdir_path,
mmap_cache: Arc::new(RwLock::new(MmapCache::default())),
_temp_directory: Arc::new(Some(tempdir)),
};
Ok(directory)
}
/// Opens a MmapDirectory in a directory.
@@ -295,14 +159,18 @@ impl MmapDirectory {
directory_path,
)))
} else {
Ok(MmapDirectory::new(PathBuf::from(directory_path), None)?)
Ok(MmapDirectory {
root_path: PathBuf::from(directory_path),
mmap_cache: Arc::new(RwLock::new(MmapCache::default())),
_temp_directory: Arc::new(None),
})
}
}
/// Joins a relative_path to the directory `root_path`
/// to create a proper complete `filepath`.
fn resolve_path(&self, relative_path: &Path) -> PathBuf {
self.inner.root_path.join(relative_path)
self.root_path.join(relative_path)
}
/// Sync the root directory.
@@ -327,7 +195,7 @@ impl MmapDirectory {
.custom_flags(winbase::FILE_FLAG_BACKUP_SEMANTICS);
}
let fd = open_opts.open(&self.inner.root_path)?;
let fd = open_opts.open(&self.root_path)?;
fd.sync_all()?;
Ok(())
}
@@ -337,38 +205,17 @@ impl MmapDirectory {
///
/// The `MmapDirectory` embeds a `MmapCache`
/// to avoid multiplying the `mmap` system calls.
pub fn get_cache_info(&self) -> CacheInfo {
self.inner
.mmap_cache
pub fn get_cache_info(&mut self) -> CacheInfo {
self.mmap_cache
.write()
.expect("mmap cache lock is poisoned")
.remove_weak_ref();
self.inner
.mmap_cache
.read()
.expect("Mmap cache lock is poisoned.")
.get_info()
}
}
/// We rely on fs2 for file locking. On Linux & MacOS this
/// uses BSD locks (`flock`). The lock is actually released when
/// the `File` object is dropped and its associated file descriptor
/// is closed.
struct ReleaseLockFile {
// Open handle on the lock file. It is never read: it is only kept so that
// the file (and therefore the lock) lives as long as this value.
_file: File,
// Path of the lock, used for the debug log emitted on drop.
path: PathBuf,
}
impl Drop for ReleaseLockFile {
// Only logs the release. The lock itself is released when the `_file`
// field is dropped, which happens once this method has returned.
fn drop(&mut self) {
debug!("Releasing lock {:?}", self.path);
}
}
/// This Write wraps a File, but has the specificity of
/// calling `sync_all` on flush.
pub struct SafeFileWriter(File);
struct SafeFileWriter(File);
impl SafeFileWriter {
fn new(file: File) -> SafeFileWriter {
@@ -398,7 +245,7 @@ impl Directory for MmapDirectory {
debug!("Open Read {:?}", path);
let full_path = self.resolve_path(path);
let mut mmap_cache = self.inner.mmap_cache.write().map_err(|_| {
let mut mmap_cache = self.mmap_cache.write().map_err(|_| {
let msg = format!(
"Failed to acquired write lock \
on mmap cache while reading {:?}",
@@ -406,34 +253,11 @@ impl Directory for MmapDirectory {
);
IOError::with_path(path.to_owned(), make_io_err(msg))
})?;
Ok(mmap_cache
.get_mmap(&full_path)?
.map(ReadOnlySource::from)
.unwrap_or_else(ReadOnlySource::empty))
}
/// Removes the file at `path` (relative to the directory root), then
/// syncs the root directory.
///
/// Returns `DeleteError::FileDoesNotExist` when the file is missing. Any
/// other failure, including a failed directory sync, is reported as an
/// `IOError` carrying `path`.
fn delete(&self, path: &Path) -> result::Result<(), DeleteError> {
    debug!("Deleting file {:?}", path);
    let full_path = self.resolve_path(path);
    if let Err(remove_err) = fs::remove_file(&full_path) {
        return if remove_err.kind() == io::ErrorKind::NotFound {
            Err(DeleteError::FileDoesNotExist(path.to_owned()))
        } else {
            Err(IOError::with_path(path.to_owned(), remove_err).into())
        };
    }
    self.sync_directory()
        .map_err(|sync_err| IOError::with_path(path.to_owned(), sync_err).into())
}
fn exists(&self, path: &Path) -> bool {
let full_path = self.resolve_path(path);
full_path.exists()
.map(ReadOnlySource::Mmap)
.unwrap_or_else(|| ReadOnlySource::Anonymous(SharedVecSlice::empty())))
}
fn open_write(&mut self, path: &Path) -> Result<WritePtr, OpenWriteError> {
@@ -466,6 +290,44 @@ impl Directory for MmapDirectory {
Ok(BufWriter::new(Box::new(writer)))
}
/// Evicts `path` from the mmap cache, removes the file, then syncs the
/// root directory.
///
/// The cache write lock is taken first and held until the function
/// returns. Dropping the cache entry does not unmap anything by itself:
/// the munmap happens on `Drop`, when the last reference is gone.
///
/// Returns `DeleteError::FileDoesNotExist` when the file is missing, and an
/// `IOError` carrying `path` for a poisoned lock or any other I/O failure.
fn delete(&self, path: &Path) -> result::Result<(), DeleteError> {
    debug!("Deleting file {:?}", path);
    let full_path = self.resolve_path(path);
    let mut mmap_cache = match self.mmap_cache.write() {
        Ok(cache_guard) => cache_guard,
        Err(_) => {
            let msg = format!(
                "Failed to acquired write lock \
                 on mmap cache while deleting {:?}",
                path
            );
            return Err(IOError::with_path(path.to_owned(), make_io_err(msg)).into());
        }
    };
    mmap_cache.discard_from_cache(path);
    mmap_cache.cache.remove(&full_path);
    fs::remove_file(&full_path).map_err(|remove_err| -> DeleteError {
        if remove_err.kind() == io::ErrorKind::NotFound {
            DeleteError::FileDoesNotExist(path.to_owned())
        } else {
            IOError::with_path(path.to_owned(), remove_err).into()
        }
    })?;
    self.sync_directory()
        .map_err(|sync_err| IOError::with_path(path.to_owned(), sync_err).into())
}
/// Tells whether `path`, resolved against the directory root, exists on disk.
fn exists(&self, path: &Path) -> bool {
    self.resolve_path(path).exists()
}
fn atomic_read(&self, path: &Path) -> Result<Vec<u8>, OpenReadError> {
let full_path = self.resolve_path(path);
let mut buffer = Vec::new();
@@ -492,30 +354,6 @@ impl Directory for MmapDirectory {
meta_file.write(|f| f.write_all(data))?;
Ok(())
}
/// Acquires an exclusive file lock on `lock.filepath` (resolved against the
/// directory root).
///
/// The lock file is created if it does not exist yet. A blocking lock waits
/// for the lock and reports failures as `LockError::IOError`; a non-blocking
/// lock reports any failure as `LockError::LockBusy`. The returned
/// `DirectoryLock` owns the file handle, and dropping that handle releases
/// the lock.
fn acquire_lock(&self, lock: &Lock) -> Result<DirectoryLock, LockError> {
    let lock_path = self.resolve_path(&lock.filepath);
    let lock_file: File = OpenOptions::new()
        .write(true)
        .create(true) //< if the file does not exist yet, create it.
        .open(&lock_path)
        .map_err(LockError::IOError)?;
    let lock_outcome = if lock.is_blocking {
        lock_file.lock_exclusive().map_err(LockError::IOError)
    } else {
        lock_file
            .try_lock_exclusive()
            .map_err(|_| LockError::LockBusy)
    };
    lock_outcome?;
    let release_guard = ReleaseLockFile {
        path: lock.filepath.clone(),
        _file: lock_file,
    };
    Ok(DirectoryLock::from(Box::new(release_guard)))
}
/// Registers `watch_callback` by delegating to the shared inner state,
/// and returns the handle it produces.
fn watch(&self, watch_callback: WatchCallback) -> WatchHandle {
self.inner.watch(watch_callback)
}
}
#[cfg(test)]
@@ -525,13 +363,6 @@ mod tests {
// The following tests are specific to the MmapDirectory
use super::*;
use schema::{Schema, SchemaBuilder, TEXT};
use std::fs;
use std::sync::atomic::{AtomicUsize, Ordering};
use std::thread;
use std::time::Duration;
use Index;
use ReloadPolicy;
#[test]
fn test_open_non_existant_path() {
@@ -556,7 +387,7 @@ mod tests {
#[test]
fn test_cache() {
let content = b"abc";
let content = "abc".as_bytes();
// here we test if the cache releases
// mmaps correctly.
@@ -572,104 +403,26 @@ mod tests {
w.flush().unwrap();
}
}
let mut keep = vec![];
for (i, path) in paths.iter().enumerate() {
keep.push(mmap_directory.open_read(path).unwrap());
assert_eq!(mmap_directory.get_cache_info().mmapped.len(), i + 1);
}
assert_eq!(mmap_directory.get_cache_info().counters.hit, 0);
assert_eq!(mmap_directory.get_cache_info().counters.miss, 10);
assert_eq!(mmap_directory.get_cache_info().mmapped.len(), 10);
for path in paths.iter() {
let _r = mmap_directory.open_read(path).unwrap();
assert_eq!(mmap_directory.get_cache_info().mmapped.len(), num_paths);
{
for (i, path) in paths.iter().enumerate() {
let _r = mmap_directory.open_read(path).unwrap();
assert_eq!(mmap_directory.get_cache_info().mmapped.len(), i + 1);
}
for path in paths.iter() {
let _r = mmap_directory.open_read(path).unwrap();
assert_eq!(mmap_directory.get_cache_info().mmapped.len(), num_paths);
}
for (i, path) in paths.iter().enumerate() {
mmap_directory.delete(path).unwrap();
assert_eq!(
mmap_directory.get_cache_info().mmapped.len(),
num_paths - i - 1
);
}
}
assert_eq!(mmap_directory.get_cache_info().counters.hit, 10);
assert_eq!(mmap_directory.get_cache_info().counters.miss, 10);
assert_eq!(mmap_directory.get_cache_info().mmapped.len(), 10);
for path in paths.iter() {
let _r = mmap_directory.open_read(path).unwrap();
assert_eq!(mmap_directory.get_cache_info().mmapped.len(), 10);
}
assert_eq!(mmap_directory.get_cache_info().counters.hit, 20);
assert_eq!(mmap_directory.get_cache_info().counters.miss, 10);
assert_eq!(mmap_directory.get_cache_info().mmapped.len(), 10);
drop(keep);
for path in paths.iter() {
let _r = mmap_directory.open_read(path).unwrap();
assert_eq!(mmap_directory.get_cache_info().mmapped.len(), 1);
}
assert_eq!(mmap_directory.get_cache_info().counters.hit, 20);
assert_eq!(mmap_directory.get_cache_info().counters.miss, 20);
assert_eq!(mmap_directory.get_cache_info().mmapped.len(), 0);
for path in &paths {
mmap_directory.delete(path).unwrap();
}
assert_eq!(mmap_directory.get_cache_info().counters.hit, 20);
assert_eq!(mmap_directory.get_cache_info().counters.miss, 20);
assert_eq!(mmap_directory.get_cache_info().mmapped.len(), 0);
for path in paths.iter() {
assert!(mmap_directory.open_read(path).is_err());
}
assert_eq!(mmap_directory.get_cache_info().counters.hit, 20);
assert_eq!(mmap_directory.get_cache_info().counters.miss, 30);
assert_eq!(mmap_directory.get_cache_info().mmapped.len(), 0);
}
// Subscribes a counting callback on a watcher over a temporary directory,
// then writes a file named "coucou" into that directory.
//
// NOTE(review): the counter is only asserted to be 0 before the write; nothing
// is asserted after the write and the sleep. Confirm whether a final
// assertion was intended.
#[test]
fn test_watch_wrapper() {
let counter: Arc<AtomicUsize> = Default::default();
let counter_clone = counter.clone();
let tmp_dir: TempDir = tempdir::TempDir::new("test_watch_wrapper").unwrap();
let tmp_dirpath = tmp_dir.path().to_owned();
let mut watch_wrapper = WatcherWrapper::new(&tmp_dirpath).unwrap();
let tmp_file = tmp_dirpath.join("coucou");
// The handle is bound to a local so that it lives until the end of the test.
let _handle = watch_wrapper.watch(Box::new(move || {
counter_clone.fetch_add(1, Ordering::SeqCst);
}));
assert_eq!(counter.load(Ordering::SeqCst), 0);
fs::write(&tmp_file, b"whateverwilldo").unwrap();
// `Duration::new(0, 1_000)` is 0 seconds plus 1_000 nanoseconds.
thread::sleep(Duration::new(0, 1_000u32));
}
// Checks that the mmap cache of a `MmapDirectory` is empty once the index,
// its writer and its reader have all gone out of scope.
#[test]
fn test_mmap_released() {
let mmap_directory = MmapDirectory::create_from_tempdir().unwrap();
let mut schema_builder: SchemaBuilder = Schema::builder();
let text_field = schema_builder.add_text_field("text", TEXT);
let schema = schema_builder.build();
{
// Everything created in this scope is dropped before the final assertion.
let index = Index::create(mmap_directory.clone(), schema).unwrap();
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
// 16 commits of 10 documents each.
for _num_commits in 0..16 {
for _ in 0..10 {
index_writer.add_document(doc!(text_field=>"abc"));
}
index_writer.commit().unwrap();
}
// Manual reload policy: the reader only changes on explicit `reload()` calls.
let reader = index
.reader_builder()
.reload_policy(ReloadPolicy::Manual)
.try_into()
.unwrap();
// 30 more single-document commits, reloading the reader after each one.
for _ in 0..30 {
index_writer.add_document(doc!(text_field=>"abc"));
index_writer.commit().unwrap();
reader.reload().unwrap();
}
index_writer.wait_merging_threads().unwrap();
reader.reload().unwrap();
let num_segments = reader.searcher().segment_readers().len();
assert_eq!(num_segments, 4);
// NOTE(review): the factor 7 is presumably the number of mmapped files
// per segment; confirm against the segment file layout.
assert_eq!(
num_segments * 7,
mmap_directory.get_cache_info().mmapped.len()
);
}
assert_eq!(mmap_directory.get_cache_info().mmapped.len(), 0);
}
}

View File

@@ -8,24 +8,19 @@ WORM directory abstraction.
mod mmap_directory;
mod directory;
mod directory_lock;
mod managed_directory;
mod ram_directory;
mod read_only_source;
mod watch_event_router;
mod nrt_directory;
mod shared_vec_slice;
/// Errors specific to the directory module.
pub mod error;
pub use self::directory::DirectoryLock;
use std::io::{BufWriter, Seek, Write};
pub use self::directory::{Directory, DirectoryClone};
pub use self::directory_lock::{Lock, INDEX_WRITER_LOCK, META_LOCK};
pub use self::ram_directory::RAMDirectory;
pub use self::read_only_source::ReadOnlySource;
pub(crate) use self::watch_event_router::WatchCallbackList;
pub use self::watch_event_router::{WatchCallback, WatchHandle};
use std::io::{BufWriter, Seek, Write};
#[cfg(feature = "mmap")]
pub use self::mmap_directory::MmapDirectory;
@@ -43,4 +38,128 @@ impl<T: Seek + Write> SeekableWrite for T {}
pub type WritePtr = BufWriter<Box<SeekableWrite>>;
#[cfg(test)]
mod tests;
// Contract tests shared by the `Directory` implementations.
//
// Every helper works on the same `TEST_PATH` and deletes it before returning
// (or asserts that it is gone), so the helpers are run one after the other by
// `test_directory`.
mod tests {
use super::*;
use std::io::{Seek, SeekFrom, Write};
use std::path::Path;
lazy_static! {
static ref TEST_PATH: &'static Path = Path::new("some_path_for_test");
}
// Runs the shared checks against a `RAMDirectory`.
#[test]
fn test_ram_directory() {
let mut ram_directory = RAMDirectory::create();
test_directory(&mut ram_directory);
}
// Runs the shared checks against a `MmapDirectory` created in a temporary directory.
#[test]
#[cfg(feature = "mmap")]
fn test_mmap_directory() {
let mut mmap_directory = MmapDirectory::create_from_tempdir().unwrap();
test_directory(&mut mmap_directory);
}
// Expected to panic: the writer goes out of scope without `flush()` having been called.
#[test]
#[should_panic]
fn ram_directory_panics_if_flush_forgotten() {
let mut ram_directory = RAMDirectory::create();
let mut write_file = ram_directory.open_write(*TEST_PATH).unwrap();
assert!(write_file.write_all(&[4]).is_ok());
}
// Writes a few chunks, flushes, reads the file back, then deletes it.
fn test_simple(directory: &mut Directory) {
{
{
let mut write_file = directory.open_write(*TEST_PATH).unwrap();
// The file must exist as soon as it has been opened for writing.
assert!(directory.exists(*TEST_PATH));
write_file.write_all(&[4]).unwrap();
write_file.write_all(&[3]).unwrap();
write_file.write_all(&[7, 3, 5]).unwrap();
write_file.flush().unwrap();
}
let read_file = directory.open_read(*TEST_PATH).unwrap();
let data: &[u8] = &*read_file;
assert_eq!(data, &[4u8, 3u8, 7u8, 3u8, 5u8]);
}
assert!(directory.delete(*TEST_PATH).is_ok());
assert!(!directory.exists(*TEST_PATH));
}
// Seeks back to the start and overwrites the first two bytes before flushing.
fn test_seek(directory: &mut Directory) {
{
{
let mut write_file = directory.open_write(*TEST_PATH).unwrap();
write_file.write_all(&[4, 3, 7, 3, 5]).unwrap();
write_file.seek(SeekFrom::Start(0)).unwrap();
write_file.write_all(&[3, 1]).unwrap();
write_file.flush().unwrap();
}
let read_file = directory.open_read(*TEST_PATH).unwrap();
let data: &[u8] = &*read_file;
assert_eq!(data, &[3u8, 1u8, 7u8, 3u8, 5u8]);
}
assert!(directory.delete(*TEST_PATH).is_ok());
}
// Opening an already existing path for writing must fail.
fn test_rewrite_forbidden(directory: &mut Directory) {
{
directory.open_write(*TEST_PATH).unwrap();
assert!(directory.exists(*TEST_PATH));
}
{
assert!(directory.open_write(*TEST_PATH).is_err());
}
assert!(directory.delete(*TEST_PATH).is_ok());
}
// `open_write` alone makes the file exist and readable, before any write or flush.
fn test_write_create_the_file(directory: &mut Directory) {
{
assert!(directory.open_read(*TEST_PATH).is_err());
let _w = directory.open_write(*TEST_PATH).unwrap();
assert!(directory.exists(*TEST_PATH));
assert!(directory.open_read(*TEST_PATH).is_ok());
assert!(directory.delete(*TEST_PATH).is_ok());
}
}
// Checks deletion: a read handle opened before the delete stays readable
// (outside of Windows), and deleting a missing file is an error.
fn test_directory_delete(directory: &mut Directory) {
assert!(directory.open_read(*TEST_PATH).is_err());
let mut write_file = directory.open_write(*TEST_PATH).unwrap();
write_file.write_all(&[1, 2, 3, 4]).unwrap();
write_file.flush().unwrap();
{
let read_handle = directory.open_read(*TEST_PATH).unwrap();
{
assert_eq!(&*read_handle, &[1u8, 2u8, 3u8, 4u8]);
// Mapped files can't be deleted on Windows
if !cfg!(windows) {
assert!(directory.delete(*TEST_PATH).is_ok());
assert_eq!(&*read_handle, &[1u8, 2u8, 3u8, 4u8]);
}
assert!(directory.delete(Path::new("SomeOtherPath")).is_err());
}
}
// On Windows the delete is only attempted here, once the read handle is gone.
if cfg!(windows) {
assert!(directory.delete(*TEST_PATH).is_ok());
}
assert!(directory.open_read(*TEST_PATH).is_err());
assert!(directory.delete(*TEST_PATH).is_err());
}
// Runs every shared check, in order, against `directory`.
fn test_directory(directory: &mut Directory) {
test_simple(directory);
test_seek(directory);
test_rewrite_forbidden(directory);
test_write_create_the_file(directory);
test_directory_delete(directory);
}
}

View File

@@ -1,195 +0,0 @@
use directory::Directory;
use std::path::{PathBuf, Path};
use directory::ReadOnlySource;
use directory::error::OpenReadError;
use directory::error::DeleteError;
use std::io::{BufWriter, Cursor};
use directory::SeekableWrite;
use directory::error::OpenWriteError;
use directory::WatchHandle;
use directory::ram_directory::InnerRamDirectory;
use std::sync::RwLock;
use std::sync::Arc;
use directory::WatchCallback;
use std::fmt;
use std::io;
use std::io::{Seek, Write};
use directory::DirectoryClone;
/// Number of bytes a `NRTWriter` accepts in RAM before it switches to a
/// writer opened on the directory.
const BUFFER_LEN: usize = 1_000_000;

/// Writer that starts out buffering in RAM and may later switch to a
/// regular directory writer.
pub enum NRTWriter {
    /// Data is accumulated in `buffer`; `path` and `nrt_directory` identify
    /// where the data is published on flush.
    InRam {
        buffer: Cursor<Vec<u8>>,
        path: PathBuf,
        nrt_directory: NRTDirectory,
    },
    /// Data is forwarded to a writer obtained from a directory.
    UnderlyingFile(BufWriter<Box<SeekableWrite>>),
}

impl NRTWriter {
    /// Creates an in-RAM writer for `path`, with `BUFFER_LEN` bytes
    /// preallocated.
    pub fn new(path: PathBuf, nrt_directory: NRTDirectory) -> NRTWriter {
        let buffer = Cursor::new(Vec::with_capacity(BUFFER_LEN));
        NRTWriter::InRam {
            buffer,
            path,
            nrt_directory,
        }
    }
}
impl io::Seek for NRTWriter {
    /// Forwards the seek to whichever backend is currently active: the
    /// in-RAM cursor or the directory writer.
    fn seek(&mut self, pos: io::SeekFrom) -> io::Result<u64> {
        match self {
            NRTWriter::InRam { buffer, .. } => buffer.seek(pos),
            NRTWriter::UnderlyingFile(file) => file.seek(pos),
        }
    }
}
impl io::Write for NRTWriter {
    /// Writes the whole of `buf` (see `write_all`) and reports its length.
    fn write(&mut self, buf: &[u8]) -> io::Result<usize> {
        self.write_all(buf)?;
        Ok(buf.len())
    }

    /// In RAM: publishes the current buffer content in the cache of the
    /// `NRTDirectory` under `path`. On file: flushes the underlying writer.
    ///
    /// Panics if the cache lock is poisoned.
    fn flush(&mut self) -> io::Result<()> {
        match self {
            NRTWriter::InRam {
                buffer,
                path,
                nrt_directory,
            } => {
                let mut cache_wlock = nrt_directory.cache.write().unwrap();
                cache_wlock.write(path.clone(), buffer.get_ref());
                Ok(())
            }
            NRTWriter::UnderlyingFile(file) => file.flush(),
        }
    }

    /// Appends `buf` to the active backend.
    ///
    /// When the in-RAM buffer would grow past `BUFFER_LEN`, the writer first
    /// moves to the underlying directory: the bytes buffered so far are
    /// copied to the new file, the write position is restored, and the cache
    /// entry for `path` is dropped.
    fn write_all(&mut self, buf: &[u8]) -> io::Result<()> {
        // Working around the borrow checker: the replacement writer is built
        // while `self` is borrowed, and only installed afterwards.
        let mut underlying_write_opt: Option<BufWriter<Box<SeekableWrite>>> = None;
        if let NRTWriter::InRam {
            buffer,
            path,
            nrt_directory,
        } = self
        {
            if buffer.get_ref().len() + buf.len() > BUFFER_LEN {
                // We can't keep this in RAM. Let's move it to the underlying directory.
                //
                // The file must be opened on the *underlying* directory:
                // `NRTDirectory::open_write` would reject the path, since the
                // cache already holds an entry for it.
                let mut underlying_write = nrt_directory
                    .underlying
                    .open_write(path.as_path())
                    .map_err(|open_err| io::Error::new(io::ErrorKind::Other, open_err))?;
                // Carry over what was buffered so far, and keep writing from
                // the position the cursor was at (it may have been moved by a seek).
                underlying_write.write_all(buffer.get_ref())?;
                underlying_write.seek(io::SeekFrom::Start(buffer.position()))?;
                // The file now lives in the underlying directory: drop the
                // cache entry so that `exists` / `delete` reach the real file.
                let _ = nrt_directory.cache.write().unwrap().delete(path.as_path());
                underlying_write_opt = Some(underlying_write);
            }
        }
        if let Some(underlying_write) = underlying_write_opt {
            *self = NRTWriter::UnderlyingFile(underlying_write);
        }
        match self {
            NRTWriter::InRam { buffer, .. } => {
                assert!(buffer.get_ref().len() + buf.len() <= BUFFER_LEN);
                buffer.write_all(buf)
            }
            NRTWriter::UnderlyingFile(file) => file.write_all(buf),
        }
    }
}
pub struct NRTDirectory {
underlying: Box<Directory>,
cache: Arc<RwLock<InnerRamDirectory>>,
}
impl Clone for NRTDirectory {
fn clone(&self) -> Self {
NRTDirectory {
underlying: self.underlying.box_clone(),
cache: self.cache.clone()
}
}
}
impl NRTDirectory {
fn wrap(underlying: Box<Directory>) -> NRTDirectory {
NRTDirectory {
underlying,
cache: Default::default()
}
}
}
impl fmt::Debug for NRTDirectory {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
write!(f, "NRTDirectory({:?})", self.underlying)
}
}
impl Directory for NRTDirectory {
    /// Opens `path` for reading, looking in the cache first and falling back
    /// to the underlying directory.
    fn open_read(&self, path: &Path) -> Result<ReadOnlySource, OpenReadError> {
        // We explicitly release the lock, to prevent a panic on the underlying directory
        // to poison the lock.
        //
        // File can only go from cache to underlying so the result does not lead to
        // any inconsistency.
        {
            let rlock_cache = self.cache.read().unwrap();
            if rlock_cache.exists(path) {
                return rlock_cache.open_read(path);
            }
        }
        self.underlying.open_read(path)
    }

    /// Deletes `path` from the cache if it is there, otherwise from the
    /// underlying directory.
    fn delete(&self, path: &Path) -> Result<(), DeleteError> {
        // We explicitly release the lock, to prevent a panic on the underlying directory
        // to poison the lock.
        //
        // File can only go from cache to underlying so the result does not lead to
        // any inconsistency.
        {
            let mut cache_wlock = self.cache.write().unwrap();
            if cache_wlock.exists(path) {
                return cache_wlock.delete(path);
            }
        }
        self.underlying.delete(path)
    }

    /// Returns true if `path` exists in the cache or in the underlying directory.
    fn exists(&self, path: &Path) -> bool {
        // We explicitly release the lock, to prevent a panic on the underlying directory
        // to poison the lock.
        //
        // File can only go from cache to underlying so the result does not lead to
        // any inconsistency.
        {
            let rlock_cache = self.cache.read().unwrap();
            if rlock_cache.exists(path) {
                return true;
            }
        }
        self.underlying.exists(path)
    }

    /// Creates `path` as an empty in-RAM file and returns a writer on it.
    ///
    /// Fails with `FileAlreadyExists` if the path is already known to the
    /// underlying directory or to the cache.
    fn open_write(&mut self, path: &Path) -> Result<BufWriter<Box<SeekableWrite>>, OpenWriteError> {
        let mut cache_wlock = self.cache.write().unwrap();
        // TODO might poison our lock. I don't have a sound solution yet.
        let path_buf = path.to_owned();
        if self.underlying.exists(path) {
            return Err(OpenWriteError::FileAlreadyExists(path_buf));
        }
        let exists = cache_wlock.write(path_buf.clone(), &[]);
        // force the creation of the file to mimic the MMap directory.
        if exists {
            Err(OpenWriteError::FileAlreadyExists(path_buf))
        } else {
            let vec_writer = NRTWriter::new(path_buf.clone(), self.clone());
            Ok(BufWriter::new(Box::new(vec_writer)))
        }
    }

    /// Atomic reads bypass the cache and go straight to the underlying directory.
    fn atomic_read(&self, path: &Path) -> Result<Vec<u8>, OpenReadError> {
        self.underlying.atomic_read(path)
    }

    /// Atomic writes bypass the cache and go straight to the underlying directory.
    fn atomic_write(&mut self, path: &Path, data: &[u8]) -> io::Result<()> {
        self.underlying.atomic_write(path, data)
    }

    /// Watch callbacks are registered on the underlying directory.
    fn watch(&self, watch_callback: WatchCallback) -> WatchHandle {
        self.underlying.watch(watch_callback)
    }
}

View File

@@ -1,8 +1,8 @@
use core::META_FILEPATH;
use directory::error::{DeleteError, OpenReadError, OpenWriteError};
use directory::WatchCallbackList;
use super::shared_vec_slice::SharedVecSlice;
use common::make_io_err;
use directory::error::{DeleteError, IOError, OpenReadError, OpenWriteError};
use directory::WritePtr;
use directory::{Directory, ReadOnlySource, WatchCallback, WatchHandle};
use directory::{Directory, ReadOnlySource};
use std::collections::HashMap;
use std::fmt;
use std::io::{self, BufWriter, Cursor, Seek, SeekFrom, Write};
@@ -22,13 +22,13 @@ use std::sync::{Arc, RwLock};
///
struct VecWriter {
path: PathBuf,
shared_directory: RAMDirectory,
shared_directory: InnerDirectory,
data: Cursor<Vec<u8>>,
is_flushed: bool,
}
impl VecWriter {
fn new(path_buf: PathBuf, shared_directory: RAMDirectory) -> VecWriter {
fn new(path_buf: PathBuf, shared_directory: InnerDirectory) -> VecWriter {
VecWriter {
path: path_buf,
data: Cursor::new(Vec::new()),
@@ -64,44 +64,73 @@ impl Write for VecWriter {
fn flush(&mut self) -> io::Result<()> {
self.is_flushed = true;
let mut fs = self.shared_directory.fs.write().unwrap();
fs.write(self.path.clone(), self.data.get_ref());
self.shared_directory
.write(self.path.clone(), self.data.get_ref())?;
Ok(())
}
}
#[derive(Default)]
pub(crate) struct InnerRamDirectory {
fs: HashMap<PathBuf, ReadOnlySource>,
watch_router: WatchCallbackList,
}
#[derive(Clone)]
struct InnerDirectory(Arc<RwLock<HashMap<PathBuf, Arc<Vec<u8>>>>>);
impl InnerRamDirectory {
pub fn write(&mut self, path: PathBuf, data: &[u8]) -> bool {
let data = ReadOnlySource::new(Vec::from(data));
self.fs.insert(path, data).is_some()
impl InnerDirectory {
fn new() -> InnerDirectory {
InnerDirectory(Arc::new(RwLock::new(HashMap::new())))
}
pub fn open_read(&self, path: &Path) -> Result<ReadOnlySource, OpenReadError> {
self.fs
.get(path)
.ok_or_else(|| OpenReadError::FileDoesNotExist(PathBuf::from(path)))
.map(|el| el.clone())
fn write(&self, path: PathBuf, data: &[u8]) -> io::Result<bool> {
let mut map = self.0.write().map_err(|_| {
make_io_err(format!(
"Failed to lock the directory, when trying to write {:?}",
path
))
})?;
let prev_value = map.insert(path, Arc::new(Vec::from(data)));
Ok(prev_value.is_some())
}
pub fn delete(&mut self, path: &Path) -> result::Result<(), DeleteError> {
match self.fs.remove(path) {
Some(_) => Ok(()),
None => Err(DeleteError::FileDoesNotExist(PathBuf::from(path))),
}
fn open_read(&self, path: &Path) -> Result<ReadOnlySource, OpenReadError> {
self.0
.read()
.map_err(|_| {
let msg = format!(
"Failed to acquire read lock for the \
directory when trying to read {:?}",
path
);
let io_err = make_io_err(msg);
OpenReadError::IOError(IOError::with_path(path.to_owned(), io_err))
}).and_then(|readable_map| {
readable_map
.get(path)
.ok_or_else(|| OpenReadError::FileDoesNotExist(PathBuf::from(path)))
.map(Arc::clone)
.map(|data| ReadOnlySource::Anonymous(SharedVecSlice::new(data)))
})
}
pub fn exists(&self, path: &Path) -> bool {
self.fs.contains_key(path)
fn delete(&self, path: &Path) -> result::Result<(), DeleteError> {
self.0
.write()
.map_err(|_| {
let msg = format!(
"Failed to acquire write lock for the \
directory when trying to delete {:?}",
path
);
let io_err = make_io_err(msg);
DeleteError::IOError(IOError::with_path(path.to_owned(), io_err))
}).and_then(|mut writable_map| match writable_map.remove(path) {
Some(_) => Ok(()),
None => Err(DeleteError::FileDoesNotExist(PathBuf::from(path))),
})
}
pub fn watch(&mut self, watch_handle: WatchCallback) -> WatchHandle {
self.watch_router.subscribe(watch_handle)
fn exists(&self, path: &Path) -> bool {
self.0
.read()
.expect("Failed to get read lock directory.")
.contains_key(path)
}
}
@@ -116,36 +145,33 @@ impl fmt::Debug for RAMDirectory {
/// It is mainly meant for unit testing.
/// Writes are only made visible upon flushing.
///
#[derive(Clone, Default)]
#[derive(Clone)]
pub struct RAMDirectory {
fs: Arc<RwLock<InnerRamDirectory>>,
fs: InnerDirectory,
}
impl RAMDirectory {
/// Constructor
pub fn create() -> RAMDirectory {
Self::default()
RAMDirectory {
fs: InnerDirectory::new(),
}
}
}
impl Directory for RAMDirectory {
fn open_read(&self, path: &Path) -> result::Result<ReadOnlySource, OpenReadError> {
self.fs.read().unwrap().open_read(path)
}
fn delete(&self, path: &Path) -> result::Result<(), DeleteError> {
self.fs.write().unwrap().delete(path)
}
fn exists(&self, path: &Path) -> bool {
self.fs.read().unwrap().exists(path)
self.fs.open_read(path)
}
fn open_write(&mut self, path: &Path) -> Result<WritePtr, OpenWriteError> {
let mut fs = self.fs.write().unwrap();
let path_buf = PathBuf::from(path);
let vec_writer = VecWriter::new(path_buf.clone(), self.clone());
let exists = fs.write(path_buf.clone(), &[]);
let vec_writer = VecWriter::new(path_buf.clone(), self.fs.clone());
let exists = self
.fs
.write(path_buf.clone(), &Vec::new())
.map_err(|err| IOError::with_path(path.to_owned(), err))?;
// force the creation of the file to mimic the MMap directory.
if exists {
Err(OpenWriteError::FileAlreadyExists(path_buf))
@@ -154,8 +180,17 @@ impl Directory for RAMDirectory {
}
}
fn delete(&self, path: &Path) -> result::Result<(), DeleteError> {
self.fs.delete(path)
}
fn exists(&self, path: &Path) -> bool {
self.fs.exists(path)
}
fn atomic_read(&self, path: &Path) -> Result<Vec<u8>, OpenReadError> {
Ok(self.open_read(path)?.as_slice().to_owned())
let read = self.open_read(path)?;
Ok(read.as_slice().to_owned())
}
fn atomic_write(&mut self, path: &Path, data: &[u8]) -> io::Result<()> {
@@ -164,20 +199,10 @@ impl Directory for RAMDirectory {
msg.unwrap_or("Undefined".to_string())
)));
let path_buf = PathBuf::from(path);
// Reserve the path to prevent calls to .write() to succeed.
self.fs.write().unwrap().write(path_buf.clone(), &[]);
let mut vec_writer = VecWriter::new(path_buf.clone(), self.clone());
let mut vec_writer = VecWriter::new(path_buf.clone(), self.fs.clone());
self.fs.write(path_buf, &Vec::new())?;
vec_writer.write_all(data)?;
vec_writer.flush()?;
if path == Path::new(&*META_FILEPATH) {
self.fs.write().unwrap().watch_router.broadcast();
}
Ok(())
}
fn watch(&self, watch_callback: WatchCallback) -> WatchHandle {
self.fs.write().unwrap().watch(watch_callback)
}
}

View File

@@ -1,9 +1,9 @@
use super::shared_vec_slice::SharedVecSlice;
use common::HasLen;
#[cfg(feature = "mmap")]
use fst::raw::MmapReadOnly;
use stable_deref_trait::{CloneStableDeref, StableDeref};
use std::ops::Deref;
use std::sync::Arc;
pub type BoxedData = Box<Deref<Target = [u8]> + Send + Sync + 'static>;
/// Read object that represents files in tantivy.
///
@@ -11,10 +11,12 @@ pub type BoxedData = Box<Deref<Target = [u8]> + Send + Sync + 'static>;
/// the data in the form of a constant read-only `&[u8]`.
/// Whatever happens to the directory file, the data
/// held by this object should never be altered or destroyed.
pub struct ReadOnlySource {
data: Arc<BoxedData>,
start: usize,
stop: usize,
pub enum ReadOnlySource {
/// Mmap source of data
#[cfg(feature = "mmap")]
Mmap(MmapReadOnly),
/// Wrapping a `Vec<u8>`
Anonymous(SharedVecSlice),
}
unsafe impl StableDeref for ReadOnlySource {}
@@ -28,38 +30,19 @@ impl Deref for ReadOnlySource {
}
}
impl From<Arc<BoxedData>> for ReadOnlySource {
    /// Wraps already shared data into a source spanning all of its bytes.
    fn from(data: Arc<BoxedData>) -> Self {
        let stop = data.len();
        ReadOnlySource {
            data,
            start: 0,
            stop,
        }
    }
}
impl ReadOnlySource {
pub(crate) fn new<D>(data: D) -> ReadOnlySource
where
D: Deref<Target = [u8]> + Send + Sync + 'static,
{
let len = data.len();
ReadOnlySource {
data: Arc::new(Box::new(data)),
start: 0,
stop: len,
}
}
/// Creates an empty ReadOnlySource
pub fn empty() -> ReadOnlySource {
ReadOnlySource::new(&[][..])
ReadOnlySource::Anonymous(SharedVecSlice::empty())
}
/// Returns the data underlying the ReadOnlySource object.
pub fn as_slice(&self) -> &[u8] {
&self.data[self.start..self.stop]
match *self {
#[cfg(feature = "mmap")]
ReadOnlySource::Mmap(ref mmap_read_only) => mmap_read_only.as_slice(),
ReadOnlySource::Anonymous(ref shared_vec) => shared_vec.as_slice(),
}
}
/// Splits into 2 `ReadOnlySource`, at the offset given
@@ -80,18 +63,22 @@ impl ReadOnlySource {
/// worth of data in anonymous memory, and only a
/// 1KB slice is remaining, the whole `500MBs`
/// are retained in memory.
pub fn slice(&self, start: usize, stop: usize) -> ReadOnlySource {
pub fn slice(&self, from_offset: usize, to_offset: usize) -> ReadOnlySource {
assert!(
start <= stop,
from_offset <= to_offset,
"Requested negative slice [{}..{}]",
start,
stop
from_offset,
to_offset
);
assert!(stop <= self.len());
ReadOnlySource {
data: self.data.clone(),
start: self.start + start,
stop: self.start + stop,
match *self {
#[cfg(feature = "mmap")]
ReadOnlySource::Mmap(ref mmap_read_only) => {
let sliced_mmap = mmap_read_only.range(from_offset, to_offset - from_offset);
ReadOnlySource::Mmap(sliced_mmap)
}
ReadOnlySource::Anonymous(ref shared_vec) => {
ReadOnlySource::Anonymous(shared_vec.slice(from_offset, to_offset))
}
}
}
@@ -100,7 +87,8 @@ impl ReadOnlySource {
///
/// Equivalent to `.slice(from_offset, self.len())`
pub fn slice_from(&self, from_offset: usize) -> ReadOnlySource {
self.slice(from_offset, self.len())
let len = self.len();
self.slice(from_offset, len)
}
/// Like `.slice(...)` but enforcing only the `to`
@@ -114,18 +102,19 @@ impl ReadOnlySource {
impl HasLen for ReadOnlySource {
fn len(&self) -> usize {
self.stop - self.start
self.as_slice().len()
}
}
impl Clone for ReadOnlySource {
fn clone(&self) -> Self {
self.slice_from(0)
self.slice(0, self.len())
}
}
impl From<Vec<u8>> for ReadOnlySource {
fn from(data: Vec<u8>) -> ReadOnlySource {
ReadOnlySource::new(data)
let shared_data = SharedVecSlice::from(data);
ReadOnlySource::Anonymous(shared_data)
}
}

View File

@@ -0,0 +1,41 @@
use std::sync::Arc;
/// A view over a window of a reference-counted `Vec<u8>`.
///
/// Cloning or slicing never copies the bytes: it only bumps the reference
/// count of the shared vector and adjusts the window.
#[derive(Clone)]
pub struct SharedVecSlice {
    /// The shared bytes.
    pub data: Arc<Vec<u8>>,
    /// Offset in `data` of the first byte of the view.
    pub start: usize,
    /// Number of bytes in the view.
    pub len: usize,
}

impl SharedVecSlice {
    /// Creates a view over no bytes at all.
    pub fn empty() -> SharedVecSlice {
        SharedVecSlice::new(Arc::new(Vec::new()))
    }

    /// Creates a view spanning the whole of `data`.
    pub fn new(data: Arc<Vec<u8>>) -> SharedVecSlice {
        let data_len = data.len();
        SharedVecSlice {
            data,
            start: 0,
            len: data_len,
        }
    }

    /// Returns the bytes covered by the view.
    pub fn as_slice(&self) -> &[u8] {
        &self.data[self.start..self.start + self.len]
    }

    /// Returns the sub-view `[from_offset..to_offset]`, with offsets relative
    /// to the start of this view.
    ///
    /// # Panics
    ///
    /// Panics if `from_offset > to_offset`, or if `to_offset` is past the end
    /// of this view. Without the second check, a sub-view could expose bytes
    /// of the shared vector that lie outside its parent view.
    pub fn slice(&self, from_offset: usize, to_offset: usize) -> SharedVecSlice {
        assert!(
            from_offset <= to_offset,
            "Requested negative slice [{}..{}]",
            from_offset,
            to_offset
        );
        assert!(
            to_offset <= self.len,
            "Requested slice [{}..{}] goes past the end of a view of length {}",
            from_offset,
            to_offset,
            self.len
        );
        SharedVecSlice {
            data: Arc::clone(&self.data),
            start: self.start + from_offset,
            len: to_offset - from_offset,
        }
    }
}

impl From<Vec<u8>> for SharedVecSlice {
    /// Takes ownership of `data` and returns a view over all of it.
    fn from(data: Vec<u8>) -> SharedVecSlice {
        SharedVecSlice::new(Arc::new(data))
    }
}

View File

@@ -1,222 +0,0 @@
use super::*;
use std::io::{Seek, SeekFrom, Write};
use std::mem;
use std::path::{Path, PathBuf};
use std::sync::atomic::AtomicUsize;
use std::sync::atomic::Ordering;
use std::sync::Arc;
use std::thread;
use std::time;
use std::time::Duration;
// File path reused by the directory tests in this file.
lazy_static! {
static ref TEST_PATH: &'static Path = Path::new("some_path_for_test");
}
// Runs the shared `test_directory` suite against a freshly created `RAMDirectory`.
#[test]
fn test_ram_directory() {
let mut ram_directory = RAMDirectory::create();
test_directory(&mut ram_directory);
}
// Runs the shared `test_directory` suite against an `MmapDirectory` created from a
// temporary directory. Only compiled when the `mmap` feature is enabled.
#[test]
#[cfg(feature = "mmap")]
fn test_mmap_directory() {
let mut mmap_directory = MmapDirectory::create_from_tempdir().unwrap();
test_directory(&mut mmap_directory);
}
// Writes one byte to a `RAMDirectory` file and never calls `flush()`.
// The test is expected to panic; presumably the panic is raised when the
// unflushed writer is dropped at the end of the function (TODO confirm).
#[test]
#[should_panic]
fn ram_directory_panics_if_flush_forgotten() {
let mut ram_directory = RAMDirectory::create();
let mut write_file = ram_directory.open_write(*TEST_PATH).unwrap();
assert!(write_file.write_all(&[4]).is_ok());
}
/// Write / read / delete round trip on `TEST_PATH`.
///
/// The payload is written in three separate `write_all` calls and must be
/// read back as one contiguous byte sequence. The file must exist as soon as
/// it has been opened for writing, and must be gone after `delete`.
fn test_simple(directory: &mut Directory) {
    {
        let mut writer = directory.open_write(*TEST_PATH).unwrap();
        assert!(directory.exists(*TEST_PATH));
        let chunks: [&[u8]; 3] = [&[4], &[3], &[7, 3, 5]];
        for chunk in chunks.iter() {
            writer.write_all(chunk).unwrap();
        }
        writer.flush().unwrap();
    }
    {
        let reader = directory.open_read(*TEST_PATH).unwrap();
        let bytes: &[u8] = &*reader;
        assert_eq!(bytes, &[4u8, 3u8, 7u8, 3u8, 5u8]);
    }
    let deletion = directory.delete(*TEST_PATH);
    assert!(deletion.is_ok());
    assert!(!directory.exists(*TEST_PATH));
}
// Checks that a write handle supports seeking: after writing five bytes,
// seeking back to the start and writing two bytes, only the first two bytes
// of the file are overwritten.
fn test_seek(directory: &mut Directory) {
{
{
let mut write_file = directory.open_write(*TEST_PATH).unwrap();
write_file.write_all(&[4, 3, 7, 3, 5]).unwrap();
// Rewind and overwrite the first two bytes in place.
write_file.seek(SeekFrom::Start(0)).unwrap();
write_file.write_all(&[3, 1]).unwrap();
write_file.flush().unwrap();
}
let read_file = directory.open_read(*TEST_PATH).unwrap();
let data: &[u8] = &*read_file;
assert_eq!(data, &[3u8, 1u8, 7u8, 3u8, 5u8]);
}
assert!(directory.delete(*TEST_PATH).is_ok());
}
// Checks that `open_write` fails on a path that already exists.
fn test_rewrite_forbidden(directory: &mut Directory) {
{
// First open creates the file; the write handle is dropped right away.
directory.open_write(*TEST_PATH).unwrap();
assert!(directory.exists(*TEST_PATH));
}
{
// A second `open_write` on the same path must be rejected.
assert!(directory.open_write(*TEST_PATH).is_err());
}
assert!(directory.delete(*TEST_PATH).is_ok());
}
// Checks that `open_write` alone (without writing anything) creates the file:
// it cannot be read before, and it exists and can be read right after.
fn test_write_create_the_file(directory: &mut Directory) {
{
assert!(directory.open_read(*TEST_PATH).is_err());
let _w = directory.open_write(*TEST_PATH).unwrap();
assert!(directory.exists(*TEST_PATH));
assert!(directory.open_read(*TEST_PATH).is_ok());
assert!(directory.delete(*TEST_PATH).is_ok());
}
}
// Checks the behavior of `delete`:
// - on non-Windows platforms a file can be deleted while a read handle is
//   alive, and the handle keeps exposing the original bytes;
// - deleting an unknown path is an error;
// - once deleted, the file can be neither read nor deleted again.
fn test_directory_delete(directory: &mut Directory) {
assert!(directory.open_read(*TEST_PATH).is_err());
let mut write_file = directory.open_write(*TEST_PATH).unwrap();
write_file.write_all(&[1, 2, 3, 4]).unwrap();
write_file.flush().unwrap();
{
let read_handle = directory.open_read(*TEST_PATH).unwrap();
{
assert_eq!(&*read_handle, &[1u8, 2u8, 3u8, 4u8]);
// Mapped files can't be deleted on Windows
if !cfg!(windows) {
assert!(directory.delete(*TEST_PATH).is_ok());
// The read handle obtained before the delete must still be usable.
assert_eq!(&*read_handle, &[1u8, 2u8, 3u8, 4u8]);
}
assert!(directory.delete(Path::new("SomeOtherPath")).is_err());
}
}
// On Windows the delete is deferred until the read handle has been dropped.
if cfg!(windows) {
assert!(directory.delete(*TEST_PATH).is_ok());
}
assert!(directory.open_read(*TEST_PATH).is_err());
assert!(directory.delete(*TEST_PATH).is_err());
}
// Shared test suite: runs every directory check in this file, in sequence,
// against the given `Directory` implementation. The checks reuse the same
// `TEST_PATH` and lock file names.
fn test_directory(directory: &mut Directory) {
test_simple(directory);
test_seek(directory);
test_rewrite_forbidden(directory);
test_write_create_the_file(directory);
test_directory_delete(directory);
test_lock_non_blocking(directory);
test_lock_blocking(directory);
test_watch(directory);
}
// Checks the `watch` mechanism using a callback that increments a counter:
// - writes to `meta.json` made before the watch is registered are not reported;
// - each `atomic_write` on `meta.json` made while the handle is alive
//   triggers the callback exactly once;
// - after the handle is dropped, further writes are not reported.
fn test_watch(directory: &mut Directory) {
let counter: Arc<AtomicUsize> = Default::default();
let counter_clone = counter.clone();
let watch_callback = Box::new(move || {
counter_clone.fetch_add(1, Ordering::SeqCst);
});
assert!(directory
.atomic_write(Path::new("meta.json"), b"random_test_data")
.is_ok());
// Note: `Duration::new(0, 10_000)` is 10 microseconds, not 10 milliseconds.
thread::sleep(Duration::new(0, 10_000));
assert_eq!(0, counter.load(Ordering::SeqCst));
let watch_handle = directory.watch(watch_callback);
for i in 0..10 {
assert_eq!(i, counter.load(Ordering::SeqCst));
assert!(directory
.atomic_write(Path::new("meta.json"), b"random_test_data_2")
.is_ok());
// Poll for up to 100 x 10ms for the callback to be invoked.
for _ in 0..100 {
if counter.load(Ordering::SeqCst) > i {
break;
}
thread::sleep(Duration::from_millis(10));
}
assert_eq!(i + 1, counter.load(Ordering::SeqCst));
}
// Dropping the handle unregisters the callback.
mem::drop(watch_handle);
assert!(directory
.atomic_write(Path::new("meta.json"), b"random_test_data")
.is_ok());
thread::sleep(Duration::from_millis(200));
assert_eq!(10, counter.load(Ordering::SeqCst));
}
/// Non-blocking lock semantics:
/// - two different lock files can be held at the same time;
/// - acquiring a lock that is already held fails immediately;
/// - once the guards are dropped, the lock can be acquired again.
fn test_lock_non_blocking(directory: &mut Directory) {
    // Builds the descriptor of a non-blocking lock backed by `name`.
    let non_blocking = |name: &str| Lock {
        filepath: PathBuf::from(name),
        is_blocking: false,
    };
    {
        let first_a = directory.acquire_lock(&non_blocking("a.lock"));
        assert!(first_a.is_ok());
        let first_b = directory.acquire_lock(&non_blocking("b.lock"));
        assert!(first_b.is_ok());
        // `a.lock` is still held by `first_a`.
        let second_a = directory.acquire_lock(&non_blocking("a.lock"));
        assert!(second_a.is_err());
    }
    // Every guard above has been dropped: `a.lock` is free again.
    let reacquired_a = directory.acquire_lock(&non_blocking("a.lock"));
    assert!(reacquired_a.is_ok());
}
/// Blocking lock semantics: while another thread holds `a.lock`, a
/// non-blocking acquisition fails and a blocking acquisition waits until the
/// holder releases the lock.
fn test_lock_blocking(directory: &mut Directory) {
    let lock_a_res = directory.acquire_lock(&Lock {
        filepath: PathBuf::from("a.lock"),
        is_blocking: true,
    });
    assert!(lock_a_res.is_ok());
    // The clock is started before the holder thread is spawned: the holder
    // sleeps for at least 10ms before releasing the lock, so any successful
    // acquisition must happen at least 10ms after this instant, regardless of
    // how long the intermediate non-blocking attempt takes.
    let start = time::Instant::now();
    std::thread::spawn(move || {
        //< lock_a_res is sent to the thread.
        std::thread::sleep(time::Duration::from_millis(10));
        // Explicitly dropping lock_a_res. It would have been sufficient to just force it
        // to be part of the move, but the intent seems clearer that way.
        drop(lock_a_res);
    });
    {
        // A non-blocking call should fail, as the thread is running and holding the lock.
        let lock_a_res = directory.acquire_lock(&Lock {
            filepath: PathBuf::from("a.lock"),
            is_blocking: false,
        });
        assert!(lock_a_res.is_err());
    }
    {
        // The blocking call can only succeed once the holder has released the lock.
        let lock_a_res = directory.acquire_lock(&Lock {
            filepath: PathBuf::from("a.lock"),
            is_blocking: true,
        });
        assert!(lock_a_res.is_ok());
        // Compare whole `Duration`s: `subsec_millis()` only reports the
        // fractional part of the elapsed time and wraps at each full second.
        assert!(start.elapsed() >= time::Duration::from_millis(10));
    }
}

View File

@@ -1,156 +0,0 @@
use std::sync::Arc;
use std::sync::RwLock;
use std::sync::Weak;
/// Type alias for callbacks registered when watching files of a `Directory`.
///
/// Callbacks take no argument, return nothing, and must be `Sync + Send`.
pub type WatchCallback = Box<Fn() -> () + Sync + Send>;
/// Helper struct to implement the watch method in `Directory` implementations.
///
/// It registers callbacks (See `.subscribe(...)`) and
/// calls them upon calls to `.broadcast(...)`.
#[derive(Default)]
pub struct WatchCallbackList {
// Weak references to the registered callbacks. The strong references are
// owned by the `WatchHandle`s returned by `.subscribe(...)`.
router: RwLock<Vec<Weak<WatchCallback>>>,
}
/// Controls how long a directory should watch for a file change.
///
/// After all the clones of `WatchHandle` are dropped, the associated callback
/// will no longer be called when a file change is detected.
#[must_use = "This `WatchHandle` controls the lifetime of the watch and should therefore be used."]
#[derive(Clone)]
pub struct WatchHandle(Arc<WatchCallback>);
impl WatchCallbackList {
    /// Subscribes a new callback and returns the handle that keeps it alive.
    ///
    /// Only a weak reference is stored in the list; the callback stops being
    /// called once every clone of the returned `WatchHandle` has been dropped.
    pub fn subscribe(&self, watch_callback: WatchCallback) -> WatchHandle {
        let strong_callback = Arc::new(watch_callback);
        let mut router = self.router.write().unwrap();
        router.push(Arc::downgrade(&strong_callback));
        WatchHandle(strong_callback)
    }

    /// Returns strong references to the callbacks that are still alive, and
    /// removes from the list the ones whose handles have all been dropped.
    fn list_callback(&self) -> Vec<Arc<WatchCallback>> {
        let mut live_callbacks = vec![];
        let mut router = self.router.write().unwrap();
        let mut position = 0;
        while position < router.len() {
            match router[position].upgrade() {
                Some(callback) => {
                    live_callbacks.push(callback);
                    position += 1;
                }
                None => {
                    // The last entry is swapped into `position`, so the same
                    // index has to be examined again on the next iteration.
                    router.swap_remove(position);
                }
            }
        }
        live_callbacks
    }

    /// Triggers all callbacks.
    ///
    /// The callbacks are run one after the other on a newly spawned thread
    /// named `watch-callbacks`. A failure to spawn that thread is logged and
    /// otherwise ignored.
    pub fn broadcast(&self) {
        let callbacks = self.list_callback();
        let thread_builder = std::thread::Builder::new().name("watch-callbacks".to_string());
        let spawn_res = thread_builder.spawn(move || {
            for callback in callbacks {
                callback();
            }
        });
        match spawn_res {
            Ok(_) => {}
            Err(err) => {
                error!(
                    "Failed to spawn thread to call watch callbacks. Cause: {:?}",
                    err
                );
            }
        }
    }
}
// Tests for `WatchCallbackList`. Callbacks run on a separate thread, so each
// test sleeps for `WAIT_TIME` milliseconds after broadcasting before it
// inspects the counter.
#[cfg(test)]
mod tests {
use directory::WatchCallbackList;
use std::mem;
use std::sync::atomic::{AtomicUsize, Ordering};
use std::sync::Arc;
use std::thread;
use std::time::Duration;
// Time, in milliseconds, given to the callback thread to run.
const WAIT_TIME: u64 = 20;
// A single callback: it is not called before being subscribed, it is called
// once per broadcast while its handle is alive, and never after the handle
// has been dropped.
#[test]
fn test_watch_event_router_simple() {
let watch_event_router = WatchCallbackList::default();
let counter: Arc<AtomicUsize> = Default::default();
let counter_clone = counter.clone();
let inc_callback = Box::new(move || {
counter_clone.fetch_add(1, Ordering::SeqCst);
});
// Broadcasting with no subscriber must not do anything.
watch_event_router.broadcast();
assert_eq!(0, counter.load(Ordering::SeqCst));
let handle_a = watch_event_router.subscribe(inc_callback);
thread::sleep(Duration::from_millis(WAIT_TIME));
assert_eq!(0, counter.load(Ordering::SeqCst));
watch_event_router.broadcast();
thread::sleep(Duration::from_millis(WAIT_TIME));
assert_eq!(1, counter.load(Ordering::SeqCst));
watch_event_router.broadcast();
watch_event_router.broadcast();
watch_event_router.broadcast();
thread::sleep(Duration::from_millis(WAIT_TIME));
assert_eq!(4, counter.load(Ordering::SeqCst));
mem::drop(handle_a);
watch_event_router.broadcast();
thread::sleep(Duration::from_millis(WAIT_TIME));
assert_eq!(4, counter.load(Ordering::SeqCst));
}
// Two callbacks (+1 and +10) subscribed to the same list: each broadcast
// calls every callback whose handle is still alive.
#[test]
fn test_watch_event_router_multiple_callback_same_key() {
let watch_event_router = WatchCallbackList::default();
let counter: Arc<AtomicUsize> = Default::default();
let inc_callback = |inc: usize| {
let counter_clone = counter.clone();
Box::new(move || {
counter_clone.fetch_add(inc, Ordering::SeqCst);
})
};
let handle_a = watch_event_router.subscribe(inc_callback(1));
let handle_a2 = watch_event_router.subscribe(inc_callback(10));
thread::sleep(Duration::from_millis(WAIT_TIME));
assert_eq!(0, counter.load(Ordering::SeqCst));
// Two broadcasts x (1 + 10) = 22.
watch_event_router.broadcast();
watch_event_router.broadcast();
thread::sleep(Duration::from_millis(WAIT_TIME));
assert_eq!(22, counter.load(Ordering::SeqCst));
// Only the +10 callback remains.
mem::drop(handle_a);
watch_event_router.broadcast();
thread::sleep(Duration::from_millis(WAIT_TIME));
assert_eq!(32, counter.load(Ordering::SeqCst));
mem::drop(handle_a2);
watch_event_router.broadcast();
watch_event_router.broadcast();
thread::sleep(Duration::from_millis(WAIT_TIME));
assert_eq!(32, counter.load(Ordering::SeqCst));
}
// NOTE(review): despite its name, this test registers a single callback and
// involves no key; it checks that two back-to-back broadcasts are both
// delivered, and that nothing is delivered after the handle is dropped.
#[test]
fn test_watch_event_router_multiple_callback_different_key() {
let watch_event_router = WatchCallbackList::default();
let counter: Arc<AtomicUsize> = Default::default();
let counter_clone = counter.clone();
let inc_callback = Box::new(move || {
counter_clone.fetch_add(1, Ordering::SeqCst);
});
let handle_a = watch_event_router.subscribe(inc_callback);
assert_eq!(0, counter.load(Ordering::SeqCst));
watch_event_router.broadcast();
watch_event_router.broadcast();
thread::sleep(Duration::from_millis(WAIT_TIME));
assert_eq!(2, counter.load(Ordering::SeqCst));
thread::sleep(Duration::from_millis(WAIT_TIME));
mem::drop(handle_a);
watch_event_router.broadcast();
thread::sleep(Duration::from_millis(WAIT_TIME));
assert_eq!(2, counter.load(Ordering::SeqCst));
}
}

View File

@@ -2,48 +2,15 @@
use std::io;
use directory::error::LockError;
use directory::error::{IOError, OpenDirectoryError, OpenReadError, OpenWriteError};
use fastfield::FastFieldNotAvailableError;
use indexer::LockType;
use query;
use schema;
use serde_json;
use std::fmt;
use std::path::PathBuf;
use std::sync::PoisonError;
/// Error describing corrupted data, optionally tied to the file in which the
/// corruption was detected.
pub struct DataCorruption {
    /// File in which the corruption was detected, if known.
    filepath: Option<PathBuf>,
    /// Human-readable description of the problem.
    comment: String,
}

impl DataCorruption {
    /// Creates a `DataCorruption` error for the file located at `filepath`.
    pub fn new(filepath: PathBuf, comment: String) -> DataCorruption {
        DataCorruption {
            filepath: Some(filepath),
            comment,
        }
    }

    /// Creates a `DataCorruption` error that is not tied to any file.
    pub fn comment_only(comment: String) -> DataCorruption {
        DataCorruption {
            filepath: None,
            comment,
        }
    }
}

impl fmt::Debug for DataCorruption {
    /// Formats the error as `Data corruption (in file `<path>`): <comment>.`,
    /// or `Data corruption: <comment>.` when no file path is attached.
    fn fmt(&self, f: &mut fmt::Formatter) -> Result<(), fmt::Error> {
        // The separating colon is emitted only once, after the optional file
        // part, so that the path-less form does not read
        // "Data corruption: : <comment>.".
        write!(f, "Data corruption")?;
        if let Some(filepath) = &self.filepath {
            write!(f, " (in file `{:?}`)", filepath)?;
        }
        write!(f, ": {}.", self.comment)?;
        Ok(())
    }
}
/// The library's failure based error enum
#[derive(Debug, Fail)]
pub enum TantivyError {
@@ -57,14 +24,17 @@ pub enum TantivyError {
#[fail(display = "Index already exists")]
IndexAlreadyExists,
/// Failed to acquire file lock
#[fail(display = "Failed to acquire Lockfile: {:?}. {:?}", _0, _1)]
LockFailure(LockError, Option<String>),
#[fail(
display = "Failed to acquire Lockfile: {:?}. Possible causes: another IndexWriter instance or panic during previous lock drop.",
_0
)]
LockFailure(LockType),
/// IO Error.
#[fail(display = "An IO error occurred: '{}'", _0)]
IOError(#[cause] IOError),
/// Data corruption.
#[fail(display = "{:?}", _0)]
DataCorruption(DataCorruption),
#[fail(display = "File contains corrupted data: '{:?}'", _0)]
CorruptedFile(PathBuf),
/// A thread holding the locked panicked and poisoned the lock.
#[fail(display = "A thread holding the locked panicked and poisoned the lock")]
Poisoned,
@@ -85,24 +55,12 @@ pub enum TantivyError {
SystemError(String),
}
// Wraps a `DataCorruption` into `TantivyError::DataCorruption`.
impl From<DataCorruption> for TantivyError {
fn from(data_corruption: DataCorruption) -> TantivyError {
TantivyError::DataCorruption(data_corruption)
}
}
// Wraps a `FastFieldNotAvailableError` into `TantivyError::FastFieldError`.
impl From<FastFieldNotAvailableError> for TantivyError {
fn from(fastfield_error: FastFieldNotAvailableError) -> TantivyError {
TantivyError::FastFieldError(fastfield_error)
}
}
// Wraps a `LockError` into `TantivyError::LockFailure`, with no additional
// message attached (second field set to `None`).
impl From<LockError> for TantivyError {
fn from(lock_error: LockError) -> TantivyError {
TantivyError::LockFailure(lock_error, None)
}
}
impl From<IOError> for TantivyError {
fn from(io_error: IOError) -> TantivyError {
TantivyError::IOError(io_error)
@@ -162,7 +120,6 @@ impl From<OpenDirectoryError> for TantivyError {
OpenDirectoryError::NotADirectory(directory_path) => {
TantivyError::InvalidArgument(format!("{:?} is not a directory", directory_path))
}
OpenDirectoryError::IoError(err) => TantivyError::IOError(IOError::from(err)),
}
}
}

View File

@@ -22,7 +22,9 @@ mod tests {
index_writer.add_document(doc!(field=>vec![1u8, 3, 5, 7, 9]));
index_writer.add_document(doc!(field=>vec![0u8; 1000]));
assert!(index_writer.commit().is_ok());
let searcher = index.reader().unwrap().searcher();
index.load_searchers().unwrap();
let searcher = index.searcher();
let reader = searcher.segment_reader(0);
let bytes_reader = reader.bytes_fast_field_reader(field).unwrap();

View File

@@ -1,6 +1,5 @@
use super::MultiValueIntFastFieldReader;
use schema::Facet;
use std::str;
use termdict::TermDictionary;
use termdict::TermOrdinal;
use DocId;
@@ -21,7 +20,6 @@ use DocId;
pub struct FacetReader {
term_ords: MultiValueIntFastFieldReader<u64>,
term_dict: TermDictionary,
buffer: Vec<u8>,
}
impl FacetReader {
@@ -39,7 +37,6 @@ impl FacetReader {
FacetReader {
term_ords,
term_dict,
buffer: vec![],
}
}
@@ -58,18 +55,11 @@ impl FacetReader {
}
/// Given a term ordinal returns the term associated to it.
pub fn facet_from_ord(
&mut self,
facet_ord: TermOrdinal,
output: &mut Facet,
) -> Result<(), str::Utf8Error> {
pub fn facet_from_ord(&self, facet_ord: TermOrdinal, output: &mut Facet) {
let found_term = self
.term_dict
.ord_to_term(facet_ord as u64, &mut self.buffer);
.ord_to_term(facet_ord as u64, output.inner_buffer_mut());
assert!(found_term, "Term ordinal {} no found.", facet_ord);
let facet_str = str::from_utf8(&self.buffer[..])?;
output.set_facet_str(facet_str);
Ok(())
}
/// Return the list of facet ordinals associated to a document.

View File

@@ -7,13 +7,7 @@ pub use self::writer::MultiValueIntFastFieldWriter;
#[cfg(test)]
mod tests {
extern crate time;
use self::time::Duration;
use collector::TopDocs;
use query::QueryParser;
use schema::Cardinality;
use schema::Facet;
use schema::IntOptions;
use schema::Schema;
use Index;
@@ -34,12 +28,11 @@ mod tests {
index_writer.add_document(doc!(field=>5u64, field=>20u64,field=>1u64));
assert!(index_writer.commit().is_ok());
let searcher = index.reader().unwrap().searcher();
let segment_reader = searcher.segment_reader(0);
index.load_searchers().unwrap();
let searcher = index.searcher();
let reader = searcher.segment_reader(0);
let mut vals = Vec::new();
let multi_value_reader = segment_reader
.multi_fast_field_reader::<u64>(field)
.unwrap();
let multi_value_reader = reader.multi_fast_field_reader::<u64>(field).unwrap();
{
multi_value_reader.get_vals(2, &mut vals);
assert_eq!(&vals, &[4u64]);
@@ -54,133 +47,6 @@ mod tests {
}
}
#[test]
fn test_multivalued_date() {
let mut schema_builder = Schema::builder();
let date_field = schema_builder.add_date_field(
"multi_date_field",
IntOptions::default()
.set_fast(Cardinality::MultiValues)
.set_indexed()
.set_stored(),
);
let time_i =
schema_builder.add_i64_field("time_stamp_i", IntOptions::default().set_stored());
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
let first_time_stamp = chrono::Utc::now();
index_writer.add_document(
doc!(date_field=>first_time_stamp, date_field=>first_time_stamp, time_i=>1i64),
);
index_writer.add_document(doc!(time_i=>0i64));
// add one second
index_writer
.add_document(doc!(date_field=>first_time_stamp + Duration::seconds(1), time_i=>2i64));
// add another second
let two_secs_ahead = first_time_stamp + Duration::seconds(2);
index_writer.add_document(doc!(date_field=>two_secs_ahead, date_field=>two_secs_ahead,date_field=>two_secs_ahead, time_i=>3i64));
assert!(index_writer.commit().is_ok());
let reader = index.reader().unwrap();
let searcher = reader.searcher();
let reader = searcher.segment_reader(0);
assert_eq!(reader.num_docs(), 4);
{
let parser = QueryParser::for_index(&index, vec![date_field]);
let query = parser
.parse_query(&format!("\"{}\"", first_time_stamp.to_rfc3339()).to_string())
.expect("could not parse query");
let results = searcher
.search(&query, &TopDocs::with_limit(5))
.expect("could not query index");
assert_eq!(results.len(), 1);
for (_score, doc_address) in results {
let retrieved_doc = searcher.doc(doc_address).expect("cannot fetch doc");
assert_eq!(
retrieved_doc
.get_first(date_field)
.expect("cannot find value")
.date_value()
.timestamp(),
first_time_stamp.timestamp()
);
assert_eq!(
retrieved_doc
.get_first(time_i)
.expect("cannot find value")
.i64_value(),
1i64
);
}
}
{
let parser = QueryParser::for_index(&index, vec![date_field]);
let query = parser
.parse_query(&format!("\"{}\"", two_secs_ahead.to_rfc3339()).to_string())
.expect("could not parse query");
let results = searcher
.search(&query, &TopDocs::with_limit(5))
.expect("could not query index");
assert_eq!(results.len(), 1);
for (_score, doc_address) in results {
let retrieved_doc = searcher.doc(doc_address).expect("cannot fetch doc");
assert_eq!(
retrieved_doc
.get_first(date_field)
.expect("cannot find value")
.date_value()
.timestamp(),
two_secs_ahead.timestamp()
);
assert_eq!(
retrieved_doc
.get_first(time_i)
.expect("cannot find value")
.i64_value(),
3i64
);
}
}
// TODO: support Date range queries
// {
// let parser = QueryParser::for_index(&index, vec![date_field]);
// let range_q = format!("\"{}\"..\"{}\"",
// (first_time_stamp + Duration::seconds(1)).to_rfc3339(),
// (first_time_stamp + Duration::seconds(3)).to_rfc3339()
// );
// let query = parser.parse_query(&range_q)
// .expect("could not parse query");
// let results = searcher.search(&query, &TopDocs::with_limit(5))
// .expect("could not query index");
//
//
// assert_eq!(results.len(), 2);
// for (i, doc_pair) in results.iter().enumerate() {
// let retrieved_doc = searcher.doc(doc_pair.1).expect("cannot fetch doc");
// let offset_sec = match i {
// 0 => 1,
// 1 => 3,
// _ => panic!("should not have more than 2 docs")
// };
// let time_i_val = match i {
// 0 => 2,
// 1 => 3,
// _ => panic!("should not have more than 2 docs")
// };
// assert_eq!(retrieved_doc.get_first(date_field).expect("cannot find value").date_value().timestamp(),
// (first_time_stamp + Duration::seconds(offset_sec)).timestamp());
// assert_eq!(retrieved_doc.get_first(time_i).expect("cannot find value").i64_value(), time_i_val);
// }
// }
}
#[test]
fn test_multivalued_i64() {
let mut schema_builder = Schema::builder();
@@ -197,7 +63,8 @@ mod tests {
index_writer.add_document(doc!(field=> -5i64, field => -20i64, field=>1i64));
assert!(index_writer.commit().is_ok());
let searcher = index.reader().unwrap().searcher();
index.load_searchers().unwrap();
let searcher = index.searcher();
let reader = searcher.segment_reader(0);
let mut vals = Vec::new();
let multi_value_reader = reader.multi_fast_field_reader::<i64>(field).unwrap();
@@ -218,17 +85,4 @@ mod tests {
assert_eq!(&vals, &[-5i64, -20i64, 1i64]);
}
}
#[test]
#[ignore]
fn test_many_facets() {
let mut schema_builder = Schema::builder();
let field = schema_builder.add_facet_field("facetfield");
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
for i in 0..100_000 {
index_writer.add_document(doc!(field=> Facet::from(format!("/lang/{}", i).as_str())));
}
assert!(index_writer.commit().is_ok());
}
}

View File

@@ -39,7 +39,7 @@ impl<Item: FastValue> MultiValueIntFastFieldReader<Item> {
let (start, stop) = self.range(doc);
let len = (stop - start) as usize;
vals.resize(len, Item::default());
self.vals_reader.get_range_u64(start, &mut vals[..]);
self.vals_reader.get_range(start as u32, &mut vals[..]);
}
}
@@ -75,26 +75,27 @@ mod tests {
index_writer.add_document(doc);
}
index_writer.commit().expect("Commit failed");
let searcher = index.reader().unwrap().searcher();
index.load_searchers().expect("Reloading searchers");
let searcher = index.searcher();
let segment_reader = searcher.segment_reader(0);
let mut facet_reader = segment_reader.facet_reader(facet_field).unwrap();
let mut facet = Facet::root();
{
facet_reader.facet_from_ord(1, &mut facet).unwrap();
facet_reader.facet_from_ord(1, &mut facet);
assert_eq!(facet, Facet::from("/category"));
}
{
facet_reader.facet_from_ord(2, &mut facet).unwrap();
facet_reader.facet_from_ord(2, &mut facet);
assert_eq!(facet, Facet::from("/category/cat1"));
}
{
facet_reader.facet_from_ord(3, &mut facet).unwrap();
facet_reader.facet_from_ord(3, &mut facet);
assert_eq!(format!("{}", facet), "/category/cat2");
assert_eq!(facet, Facet::from("/category/cat2"));
}
{
facet_reader.facet_from_ord(4, &mut facet).unwrap();
facet_reader.facet_from_ord(4, &mut facet);
assert_eq!(facet, Facet::from("/category/cat3"));
}

View File

@@ -32,7 +32,7 @@ use DocId;
/// term ids when the segment is getting serialized.
pub struct MultiValueIntFastFieldWriter {
field: Field,
vals: Vec<UnorderedTermId>,
vals: Vec<u64>,
doc_index: Vec<u64>,
is_facet: bool,
}

View File

@@ -59,29 +59,7 @@ impl<Item: FastValue> FastFieldReader<Item> {
/// May panic if `doc` is greater than the segment
// `maxdoc`.
pub fn get(&self, doc: DocId) -> Item {
self.get_u64(u64::from(doc))
}
pub(crate) fn get_u64(&self, doc: u64) -> Item {
Item::from_u64(self.min_value_u64 + self.bit_unpacker.get(doc))
}
/// Internally `multivalued` also use SingleValue Fast fields.
/// It works as follows... A first column contains the list of start index
/// for each document, a second column contains the actual values.
///
/// The values associated to a given doc, are then
/// `second_column[first_column.get(doc)..first_column.get(doc+1)]`.
///
/// Which means single value fast field reader can be indexed internally with
/// something different from a `DocId`. For this use case, we want to use `u64`
/// values.
///
/// See `get_range` for an actual documentation about this method.
pub(crate) fn get_range_u64(&self, start: u64, output: &mut [Item]) {
for (i, out) in output.iter_mut().enumerate() {
*out = self.get_u64(start + (i as u64));
}
Item::from_u64(self.min_value_u64 + self.bit_unpacker.get(doc as usize))
}
/// Fills an output buffer with the fast field values
@@ -97,8 +75,16 @@ impl<Item: FastValue> FastFieldReader<Item> {
///
/// May panic if `start + output.len()` is greater than
/// the segment's `maxdoc`.
pub fn get_range(&self, start: DocId, output: &mut [Item]) {
self.get_range_u64(u64::from(start), output);
///
// TODO change start to `u64`.
// For multifastfield, start is an index in a second fastfield, not a `DocId`
pub fn get_range(&self, start: u32, output: &mut [Item]) {
// ok: Item is either `u64` or `i64`
let output_u64: &mut [u64] = unsafe { &mut *(output as *mut [Item] as *mut [u64]) };
self.bit_unpacker.get_range(start, output_u64);
for out in output_u64.iter_mut() {
*out = Item::from_u64(*out + self.min_value_u64).as_u64();
}
}
/// Returns the minimum value for this fast field.

View File

@@ -13,15 +13,15 @@ fn check_index_content(searcher: &Searcher, vals: &HashSet<u64>) {
#[test]
#[ignore]
#[cfg(feature = "mmap")]
fn test_indexing() {
let mut schema_builder = Schema::builder();
let id_field = schema_builder.add_u64_field("id", INDEXED);
let multiples_field = schema_builder.add_u64_field("multiples", INDEXED);
let id_field = schema_builder.add_u64_field("id", INT_INDEXED);
let multiples_field = schema_builder.add_u64_field("multiples", INT_INDEXED);
let schema = schema_builder.build();
let index = Index::create_from_tempdir(schema).unwrap();
let reader = index.reader().unwrap();
let mut rng = thread_rng();
@@ -36,8 +36,8 @@ fn test_indexing() {
index_writer.commit().expect("Commit failed");
committed_docs.extend(&uncommitted_docs);
uncommitted_docs.clear();
reader.reload().unwrap();
let searcher = reader.searcher();
index.load_searchers().unwrap();
let searcher = index.searcher();
// check that everything is correct.
check_index_content(&searcher, &committed_docs);
} else {

View File

@@ -179,11 +179,6 @@ pub struct DeleteCursor {
}
impl DeleteCursor {
pub fn empty() -> DeleteCursor {
DeleteQueue::new().cursor()
}
/// Skips operations and position it so that
/// - either all of the delete operation currently in the
/// queue are consume and the next get will return None.
@@ -196,7 +191,10 @@ impl DeleteCursor {
}
}
#[cfg_attr(feature = "cargo-clippy", allow(clippy::wrong_self_convention))]
#[cfg_attr(
feature = "cargo-clippy",
allow(clippy::wrong_self_convention)
)]
fn is_behind_opstamp(&mut self, target_opstamp: u64) -> bool {
self.get()
.map(|operation| operation.opstamp < target_opstamp)

View File

@@ -0,0 +1,131 @@
use directory::error::OpenWriteError;
use std::io::Write;
use std::path::{Path, PathBuf};
use std::thread;
use std::time::Duration;
use Directory;
use TantivyError;
/// The kinds of lock files that can be taken on a directory.
#[derive(Debug, Clone, Copy)]
pub enum LockType {
/// Only one process should be able to write tantivy's index at a time.
/// This lock file, when present, is in charge of preventing other processes from opening an IndexWriter.
///
/// If the process is killed and this file remains, it is safe to remove it manually.
///
/// Failing to acquire this lock usually means a misuse of tantivy's API
/// (creating more than one instance of the `IndexWriter`), or a spurious
/// lock file remaining after a crash. In the latter case, removing the file after
/// checking that no process using tantivy is running is safe.
///
/// Acquisition of this lock is never retried.
IndexWriterLock,
/// The meta lock file is here to protect the segment files being opened by
/// `.load_searchers()` from being garbage collected.
/// It makes it possible for another process to safely consume
/// our index in-writing. Ideally, we may have preferred `RWLock` semantics
/// here, but it is difficult to achieve on Windows.
///
/// Opening segment readers is a very fast process.
/// Right now if the lock cannot be acquired on the first attempt, the logic
/// is very simplistic. We retry every `100ms`, up to 100 times, until we
/// effectively acquire the lock.
/// This lock should not have much contention in normal usage.
MetaLock,
}
/// Describes how the acquisition of a lock is retried.
///
/// The logic is deliberately simple: an attempt is retried at most
/// `num_retries` times, waiting `wait_in_ms` milliseconds before each retry.
/// Both values depend on the type of lock.
struct RetryPolicy {
    num_retries: usize,
    wait_in_ms: u64,
}

impl RetryPolicy {
    /// Policy that gives up right after the first failed attempt.
    fn no_retry() -> RetryPolicy {
        RetryPolicy {
            num_retries: 0,
            wait_in_ms: 0,
        }
    }

    /// Consumes one retry.
    ///
    /// Returns `false` immediately if no retry is left. Otherwise decrements
    /// the number of remaining retries, sleeps for `wait_in_ms` milliseconds
    /// and returns `true`.
    fn wait_and_retry(&mut self) -> bool {
        match self.num_retries.checked_sub(1) {
            None => false,
            Some(remaining_retries) => {
                self.num_retries = remaining_retries;
                thread::sleep(Duration::from_millis(self.wait_in_ms));
                true
            }
        }
    }
}
impl LockType {
    /// Returns the retry policy associated to this type of lock.
    ///
    /// - `IndexWriterLock` is never retried.
    /// - `MetaLock` is retried up to 100 times, waiting 100ms before each retry.
    fn retry_policy(self) -> RetryPolicy {
        match self {
            LockType::IndexWriterLock => RetryPolicy::no_retry(),
            LockType::MetaLock => RetryPolicy {
                num_retries: 100,
                wait_in_ms: 100,
            },
        }
    }

    /// Makes a single attempt at acquiring the lock by creating its lock file.
    ///
    /// Returns `TantivyError::LockFailure` if the lock file already exists,
    /// and `TantivyError::IOError` if the file could not be created for
    /// another reason.
    fn try_acquire_lock(self, directory: &mut Directory) -> Result<DirectoryLock, TantivyError> {
        let path = self.filename();
        let mut write = directory.open_write(path).map_err(|e| match e {
            OpenWriteError::FileAlreadyExists(_) => TantivyError::LockFailure(self),
            OpenWriteError::IOError(io_error) => TantivyError::IOError(io_error),
        })?;
        write.flush()?;
        Ok(DirectoryLock {
            directory: directory.box_clone(),
            path: path.to_owned(),
        })
    }

    /// Acquire a lock in the given directory.
    ///
    /// If the lock is already taken, the acquisition is retried according to
    /// the retry policy of this lock type.
    ///
    /// # Errors
    ///
    /// - `TantivyError::LockFailure` if the lock is still taken once all
    ///   retries have been used up.
    /// - Any other error met while creating the lock file is returned as is,
    ///   without being retried.
    pub fn acquire_lock(self, directory: &Directory) -> Result<DirectoryLock, TantivyError> {
        let mut box_directory = directory.box_clone();
        let mut retry_policy = self.retry_policy();
        loop {
            let lock_result = self.try_acquire_lock(&mut *box_directory);
            match lock_result {
                Ok(lock) => {
                    return Ok(lock);
                }
                Err(TantivyError::LockFailure(lock_type)) => {
                    if !retry_policy.wait_and_retry() {
                        return Err(TantivyError::LockFailure(lock_type));
                    }
                }
                // Errors that are not caused by lock contention (e.g. an IO
                // error) must be reported to the caller. Ignoring them here
                // would retry immediately, without any wait nor bound, and
                // spin forever if the error is persistent.
                Err(other_error) => {
                    return Err(other_error);
                }
            }
        }
    }

    /// Returns the name of the lock file backing this type of lock.
    fn filename(&self) -> &Path {
        match *self {
            LockType::MetaLock => Path::new(".tantivy-meta.lock"),
            LockType::IndexWriterLock => Path::new(".tantivy-indexer.lock"),
        }
    }
}
/// The `DirectoryLock` is an object that represents a file lock.
/// See [`LockType`](enum.LockType.html)
///
/// It is transparently associated to a lock file, that gets deleted
/// on `Drop`. The lock is released automatically on `Drop`.
pub struct DirectoryLock {
// Directory handle used to delete the lock file on drop.
directory: Box<Directory>,
// Path of the lock file within `directory`.
path: PathBuf,
}
impl Drop for DirectoryLock {
    /// Releases the lock by deleting its lock file.
    ///
    /// A failure to delete the file is logged and otherwise ignored.
    fn drop(&mut self) {
        let delete_result = self.directory.delete(&*self.path);
        if let Err(e) = delete_result {
            error!("Failed to remove the lock file. {:?}", e);
        }
    }
}

View File

@@ -1,4 +1,4 @@
use super::operation::{AddOperation, UserOperation};
use super::operation::AddOperation;
use super::segment_updater::SegmentUpdater;
use super::PreparedCommit;
use bit_set::BitSet;
@@ -9,15 +9,15 @@ use core::SegmentId;
use core::SegmentMeta;
use core::SegmentReader;
use crossbeam::channel;
use directory::DirectoryLock;
use docset::DocSet;
use error::TantivyError;
use fastfield::write_delete_bitset;
use futures::{Canceled, Future};
use futures::sync::oneshot::Receiver;
use indexer::delete_queue::{DeleteCursor, DeleteQueue};
use indexer::doc_opstamp_mapping::DocToOpstampMapping;
use indexer::operation::DeleteOperation;
use indexer::stamper::Stamper;
use indexer::DirectoryLock;
use indexer::MergePolicy;
use indexer::SegmentEntry;
use indexer::SegmentWriter;
@@ -26,8 +26,7 @@ use schema::Document;
use schema::IndexRecordOption;
use schema::Term;
use std::mem;
use std::ops::Range;
use std::sync::Arc;
use std::mem::swap;
use std::thread;
use std::thread::JoinHandle;
use Result;
@@ -44,8 +43,8 @@ pub const HEAP_SIZE_MAX: usize = u32::max_value() as usize - MARGIN_IN_BYTES;
// reaches `PIPELINE_MAX_SIZE_IN_DOCS`
const PIPELINE_MAX_SIZE_IN_DOCS: usize = 10_000;
type OperationSender = channel::Sender<Vec<AddOperation>>;
type OperationReceiver = channel::Receiver<Vec<AddOperation>>;
type DocumentSender = channel::Sender<AddOperation>;
type DocumentReceiver = channel::Receiver<AddOperation>;
/// Split the thread memory budget into
/// - the heap size
@@ -53,19 +52,16 @@ type OperationReceiver = channel::Receiver<Vec<AddOperation>>;
///
/// Returns (the heap size in bytes, the hash table size in number of bits)
fn initial_table_size(per_thread_memory_budget: usize) -> usize {
assert!(per_thread_memory_budget > 1_000);
let table_size_limit: usize = per_thread_memory_budget / 3;
if let Some(limit) = (1..)
(1..)
.take_while(|num_bits: &usize| compute_table_size(*num_bits) < table_size_limit)
.last()
{
limit.min(19) // we cap it at 2^19 = 512K.
} else {
unreachable!(
"Per thread memory is too small: {}",
per_thread_memory_budget
);
}
.unwrap_or_else(|| {
panic!(
"Per thread memory is too small: {}",
per_thread_memory_budget
)
}).min(19) // we cap it at 512K
}
/// `IndexWriter` is the user entry-point to add document to an index.
@@ -85,8 +81,8 @@ pub struct IndexWriter {
workers_join_handle: Vec<JoinHandle<Result<()>>>,
operation_receiver: OperationReceiver,
operation_sender: OperationSender,
document_receiver: DocumentReceiver,
document_sender: DocumentSender,
segment_updater: SegmentUpdater,
@@ -133,7 +129,7 @@ pub fn open_index_writer(
let err_msg = format!("The heap size per thread cannot exceed {}", HEAP_SIZE_MAX);
return Err(TantivyError::InvalidArgument(err_msg));
}
let (document_sender, document_receiver): (OperationSender, OperationReceiver) =
let (document_sender, document_receiver): (DocumentSender, DocumentReceiver) =
channel::bounded(PIPELINE_MAX_SIZE_IN_DOCS);
let delete_queue = DeleteQueue::new();
@@ -143,7 +139,7 @@ pub fn open_index_writer(
let stamper = Stamper::new(current_opstamp);
let segment_updater =
SegmentUpdater::create(index.clone(), stamper.clone(), &delete_queue.cursor())?;
SegmentUpdater::new(index.clone(), stamper.clone(), &delete_queue.cursor())?;
let mut index_writer = IndexWriter {
_directory_lock: Some(directory_lock),
@@ -151,8 +147,8 @@ pub fn open_index_writer(
heap_size_in_bytes_per_thread,
index: index.clone(),
operation_receiver: document_receiver,
operation_sender: document_sender,
document_receiver,
document_sender,
segment_updater,
@@ -259,7 +255,7 @@ pub fn advance_deletes(
write_delete_bitset(&delete_bitset, &mut delete_file)?;
}
}
segment_entry.set_meta(target_opstamp, segment.meta().clone());
segment_entry.set_meta((*segment.meta()).clone());
Ok(())
}
@@ -267,7 +263,7 @@ fn index_documents(
memory_budget: usize,
segment: &Segment,
generation: usize,
document_iterator: &mut Iterator<Item = Vec<AddOperation>>,
document_iterator: &mut Iterator<Item = AddOperation>,
segment_updater: &mut SegmentUpdater,
mut delete_cursor: DeleteCursor,
) -> Result<bool> {
@@ -275,11 +271,11 @@ fn index_documents(
let segment_id = segment.id();
let table_size = initial_table_size(memory_budget);
let mut segment_writer = SegmentWriter::for_segment(table_size, segment.clone(), &schema)?;
for documents in document_iterator {
for doc in documents {
segment_writer.add_document(doc, &schema)?;
}
for doc in document_iterator {
segment_writer.add_document(doc, &schema)?;
let mem_usage = segment_writer.mem_usage();
if mem_usage >= memory_budget - MARGIN_IN_BYTES {
info!(
"Buffer limit reached, flushing segment with maxdoc={}.",
@@ -305,7 +301,7 @@ fn index_documents(
let last_docstamp: u64 = *(doc_opstamps.last().unwrap());
let delete_bitset_opt = if delete_cursor.get().is_some() {
let segment_entry: SegmentEntry = if delete_cursor.get().is_some() {
let doc_to_opstamps = DocToOpstampMapping::from(doc_opstamps);
let segment_reader = SegmentReader::open(segment)?;
let mut deleted_bitset = BitSet::with_capacity(num_docs as usize);
@@ -316,22 +312,18 @@ fn index_documents(
&doc_to_opstamps,
last_docstamp,
)?;
if may_have_deletes {
Some(deleted_bitset)
} else {
None
}
SegmentEntry::new(segment_meta, delete_cursor, {
if may_have_deletes {
Some(deleted_bitset)
} else {
None
}
})
} else {
// if there are no delete operation in the queue, no need
// to even open the segment.
None
SegmentEntry::new(segment_meta, delete_cursor, None)
};
let segment_entry = SegmentEntry::new(
segment_meta,
delete_cursor,
delete_bitset_opt,
last_docstamp,
);
Ok(segment_updater.add_segment(generation, segment_entry))
}
@@ -340,7 +332,7 @@ impl IndexWriter {
pub fn wait_merging_threads(mut self) -> Result<()> {
// this will stop the indexing thread,
// dropping the last reference to the segment_updater.
drop(self.operation_sender);
drop(self.document_sender);
let former_workers_handles = mem::replace(&mut self.workers_join_handle, vec![]);
for join_handle in former_workers_handles {
@@ -366,30 +358,27 @@ impl IndexWriter {
}
#[doc(hidden)]
pub fn add_segment(&mut self, segment_meta: SegmentMeta, opstamp: u64) {
pub fn add_segment(&mut self, segment_meta: SegmentMeta) {
let delete_cursor = self.delete_queue.cursor();
let segment_entry = SegmentEntry::new(segment_meta, delete_cursor, None, opstamp);
let segment_entry = SegmentEntry::new(segment_meta, delete_cursor, None);
self.segment_updater
.add_segment(self.generation, segment_entry);
}
/// Creates a new segment.
/// *Experimental & Advanced API* Creates a new segment.
/// and marks it as currently in write.
///
/// This method is useful only for users trying to do complex
/// operations, like converting an index format to another.
///
/// It is safe to start writing files associated with the new `Segment`.
/// These will not be garbage collected as long as an instance of the
/// `SegmentMeta` object associated with the new `Segment` is "alive".
pub fn new_segment(&self) -> Segment {
self.index.new_segment()
self.segment_updater.new_segment()
}
/// Spawns a new worker thread for indexing.
/// The thread consumes documents from the pipeline.
///
fn add_indexing_worker(&mut self) -> Result<()> {
let document_receiver_clone = self.operation_receiver.clone();
let document_receiver_clone = self.document_receiver.clone();
let mut segment_updater = self.segment_updater.clone();
let generation = self.generation;
@@ -397,13 +386,11 @@ impl IndexWriter {
let mut delete_cursor = self.delete_queue.cursor();
let mem_budget = self.heap_size_in_bytes_per_thread;
let index = self.index.clone();
let join_handle: JoinHandle<Result<()>> = thread::Builder::new()
.name(format!(
"thrd-tantivy-index{}-gen{}",
self.worker_id, generation
))
.spawn(move || {
)).spawn(move || {
loop {
let mut document_iterator =
document_receiver_clone.clone().into_iter().peekable();
@@ -415,19 +402,15 @@ impl IndexWriter {
// this is a valid guarantee as the
// peeked document now belongs to
// our local iterator.
if let Some(operations) = document_iterator.peek() {
if let Some(first) = operations.first() {
delete_cursor.skip_to(first.opstamp);
} else {
return Ok(());
}
if let Some(operation) = document_iterator.peek() {
delete_cursor.skip_to(operation.opstamp);
} else {
// No more documents.
// Happens when there is a commit, or if the `IndexWriter`
// was dropped.
return Ok(());
}
let segment = index.new_segment();
let segment = segment_updater.new_segment();
index_documents(
mem_budget,
&segment,
@@ -444,7 +427,7 @@ impl IndexWriter {
}
/// Accessor to the merge policy.
pub fn get_merge_policy(&self) -> Arc<Box<MergePolicy>> {
pub fn get_merge_policy(&self) -> Box<MergePolicy> {
self.segment_updater.get_merge_policy()
}
@@ -469,10 +452,7 @@ impl IndexWriter {
/// Merges a given list of segments
///
/// `segment_ids` is required to be non-empty.
pub fn merge(
&mut self,
segment_ids: &[SegmentId],
) -> Result<impl Future<Item = SegmentMeta, Error = Canceled>> {
pub fn merge(&mut self, segment_ids: &[SegmentId]) -> Result<Receiver<SegmentMeta>> {
self.segment_updater.start_merge(segment_ids)
}
@@ -484,11 +464,14 @@ impl IndexWriter {
/// when no documents are remaining.
///
/// Returns the former segment_ready channel.
fn recreate_document_channel(&mut self) -> OperationReceiver {
let (document_sender, document_receiver): (OperationSender, OperationReceiver) =
channel::bounded(PIPELINE_MAX_SIZE_IN_DOCS);
mem::replace(&mut self.operation_sender, document_sender);
mem::replace(&mut self.operation_receiver, document_receiver)
fn recreate_document_channel(&mut self) -> DocumentReceiver {
let (mut document_sender, mut document_receiver): (
DocumentSender,
DocumentReceiver,
) = channel::bounded(PIPELINE_MAX_SIZE_IN_DOCS);
swap(&mut self.document_sender, &mut document_sender);
swap(&mut self.document_receiver, &mut document_receiver);
document_receiver
}
/// Rollback to the last commit
@@ -506,7 +489,7 @@ impl IndexWriter {
// segment updates will be ignored.
self.segment_updater.kill();
let document_receiver = self.operation_receiver.clone();
let document_receiver = self.document_receiver.clone();
// take the directory lock to create a new index_writer.
let directory_lock = self
@@ -532,7 +515,7 @@ impl IndexWriter {
//
// This will reach an end as the only document_sender
// was dropped with the index_writer.
for _ in document_receiver.iter() {}
for _ in document_receiver.clone() {}
Ok(())
}
@@ -559,16 +542,6 @@ impl IndexWriter {
/// using this API.
/// See [`PreparedCommit::set_payload()`](PreparedCommit.html)
pub fn prepare_commit(&mut self) -> Result<PreparedCommit> {
info!("Preparing commit");
self.prepare_commit_internal(false)
}
pub fn prepare_commit_soft(&mut self) -> Result<PreparedCommit> {
info!("Preparing soft commit");
self.prepare_commit_internal(true)
}
pub(crate) fn prepare_commit_internal(&mut self, soft: bool) -> Result<PreparedCommit> {
// Here, because we join all of the worker threads,
// all of the segment update for this commit have been
// sent.
@@ -585,19 +558,24 @@ impl IndexWriter {
// and recreate a new one channels.
self.recreate_document_channel();
let former_workers_join_handle = mem::replace(&mut self.workers_join_handle, Vec::new());
let mut former_workers_join_handle = Vec::new();
swap(
&mut former_workers_join_handle,
&mut self.workers_join_handle,
);
for worker_handle in former_workers_join_handle {
let indexing_worker_result = worker_handle
.join()
.map_err(|e| TantivyError::ErrorInThread(format!("{:?}", e)))?;
// add a new worker for the next generation, whether the worker failed or not.
self.add_indexing_worker()?;
indexing_worker_result?;
// add a new worker for the next generation.
self.add_indexing_worker()?;
}
let commit_opstamp = self.stamper.stamp();
let prepared_commit = PreparedCommit::new(self, commit_opstamp, soft);
let prepared_commit = PreparedCommit::new(self, commit_opstamp);
info!("Prepared commit {}", commit_opstamp);
Ok(prepared_commit)
}
@@ -620,11 +598,6 @@ impl IndexWriter {
self.prepare_commit()?.commit()
}
pub fn soft_commit(&mut self) -> Result<u64> {
self.prepare_commit_soft()?.commit()
}
pub(crate) fn segment_updater(&self) -> &SegmentUpdater {
&self.segment_updater
}
@@ -668,177 +641,32 @@ impl IndexWriter {
pub fn add_document(&mut self, document: Document) -> u64 {
let opstamp = self.stamper.stamp();
let add_operation = AddOperation { opstamp, document };
let send_result = self.operation_sender.send(vec![add_operation]);
let send_result = self.document_sender.send(add_operation);
if let Err(e) = send_result {
panic!("Failed to index document. Sending to indexing channel failed. This probably means all of the indexing threads have panicked. {:?}", e);
}
opstamp
}
/// Reserves `count + 1` consecutive opstamps from the stamper for a batch.
///
/// The first `count` stamps are handed back as a range (one stamp per
/// operation of the batch); the very last reserved stamp identifies the
/// batch itself.
///
/// Returns `(batch_opstamp, operation_opstamps)`.
fn get_batch_opstamps(&mut self, count: u64) -> (u64, Range<u64>) {
    let reserved = self.stamper.stamps(count + 1u64);
    // `end` is exclusive: the last stamp actually reserved is `end - 1`,
    // and that one is kept for the batch.
    let batch_opstamp = reserved.end - 1;
    (batch_opstamp, reserved.start..batch_opstamp)
}
/// Executes a batch of add/delete operations as a single group.
///
/// Every operation of `user_operations` is assigned its own opstamp, in
/// order and contiguously, and the batch as a whole is assigned the opstamp
/// that comes right after the one of its last operation. That batch opstamp
/// is the value returned by this method.
///
/// Deletes are pushed to the delete queue, while all of the adds of the
/// group are sent to the indexing pipeline as one single message, so that
/// they get flushed into the same segment.
///
/// An empty `Vec<UserOperation>` does not change the index, but it still
/// consumes (and returns) one opstamp.
///
/// If the indexing pipeline is full, this call may block.
///
/// As for `IndexWriter.add_document` and `IndexWriter.delete_term`, the
/// changes only become visible to readers after a call to `commit()`.
///
/// # Panics
///
/// Panics if sending the add operations to the indexing channel fails.
pub fn run(&mut self, user_operations: Vec<UserOperation>) -> u64 {
    if user_operations.is_empty() {
        // Nothing to do, but the (empty) batch still gets its own opstamp.
        return self.stamper.stamp();
    }
    let num_operations = user_operations.len() as u64;
    let (batch_opstamp, opstamps) = self.get_batch_opstamps(num_operations);
    let mut add_operations: Vec<AddOperation> = Vec::new();
    for (user_operation, opstamp) in user_operations.into_iter().zip(opstamps) {
        match user_operation {
            UserOperation::Add(document) => {
                add_operations.push(AddOperation { opstamp, document });
            }
            UserOperation::Delete(term) => {
                self.delete_queue.push(DeleteOperation { opstamp, term });
            }
        }
    }
    // All of the adds of the group travel together through the channel.
    if let Err(e) = self.operation_sender.send(add_operations) {
        panic!("Failed to index document. Sending to indexing channel failed. This probably means all of the indexing threads have panicked. {:?}", e);
    }
    batch_opstamp
}
}
#[cfg(test)]
mod tests {
use super::super::operation::UserOperation;
use super::initial_table_size;
use collector::TopDocs;
use directory::error::LockError;
use error::*;
use indexer::NoMergePolicy;
use query::TermQuery;
use schema::{self, IndexRecordOption};
use schema::{self, Document};
use Index;
use ReloadPolicy;
use Term;
use IndexReader;
#[test]
fn test_operations_group() {
// an operations group with 2 items should cause 3 opstamps 0, 1, and 2.
let mut schema_builder = schema::Schema::builder();
let text_field = schema_builder.add_text_field("text", schema::TEXT);
let index = Index::create_in_ram(schema_builder.build());
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
let operations = vec![
UserOperation::Add(doc!(text_field=>"a")),
UserOperation::Add(doc!(text_field=>"b")),
];
let batch_opstamp1 = index_writer.run(operations);
assert_eq!(batch_opstamp1, 2u64);
}
#[test]
fn test_ordered_batched_operations() {
// * one delete for `doc!(field=>"a")`
// * one add for `doc!(field=>"a")`
// * one add for `doc!(field=>"b")`
// * one delete for `doc!(field=>"b")`
// after commit there is one doc with "a" and 0 doc with "b"
let mut schema_builder = schema::Schema::builder();
let text_field = schema_builder.add_text_field("text", schema::TEXT);
let index = Index::create_in_ram(schema_builder.build());
let reader = index
.reader_builder()
.reload_policy(ReloadPolicy::Manual)
.try_into()
.unwrap();
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
let a_term = Term::from_field_text(text_field, "a");
let b_term = Term::from_field_text(text_field, "b");
let operations = vec![
UserOperation::Delete(a_term),
UserOperation::Add(doc!(text_field=>"a")),
UserOperation::Add(doc!(text_field=>"b")),
UserOperation::Delete(b_term),
];
index_writer.run(operations);
index_writer.commit().expect("failed to commit");
reader.reload().expect("failed to load searchers");
let a_term = Term::from_field_text(text_field, "a");
let b_term = Term::from_field_text(text_field, "b");
let a_query = TermQuery::new(a_term, IndexRecordOption::Basic);
let b_query = TermQuery::new(b_term, IndexRecordOption::Basic);
let searcher = reader.searcher();
let a_docs = searcher
.search(&a_query, &TopDocs::with_limit(1))
.expect("search for a failed");
let b_docs = searcher
.search(&b_query, &TopDocs::with_limit(1))
.expect("search for b failed");
assert_eq!(a_docs.len(), 1);
assert_eq!(b_docs.len(), 0);
}
#[test]
fn test_empty_operations_group() {
let schema_builder = schema::Schema::builder();
let index = Index::create_in_ram(schema_builder.build());
let mut index_writer = index.writer(3_000_000).unwrap();
let operations1 = vec![];
let batch_opstamp1 = index_writer.run(operations1);
assert_eq!(batch_opstamp1, 0u64);
let operations2 = vec![];
let batch_opstamp2 = index_writer.run(operations2);
assert_eq!(batch_opstamp2, 1u64);
}
#[test]
fn test_lockfile_stops_duplicates() {
let schema_builder = schema::Schema::builder();
let index = Index::create_in_ram(schema_builder.build());
let _index_writer = index.writer(3_000_000).unwrap();
match index.writer(3_000_000) {
Err(TantivyError::LockFailure(LockError::LockBusy, _)) => {}
_ => panic!("Expected a `LockFailure` error"),
let _index_writer = index.writer(40_000_000).unwrap();
match index.writer(40_000_000) {
Err(TantivyError::LockFailure(_)) => {}
_ => panic!("Expected FileAlreadyExists error"),
}
}
@@ -850,7 +678,8 @@ mod tests {
match index.writer_with_num_threads(1, 3_000_000) {
Err(err) => {
let err_msg = err.to_string();
assert!(err_msg.contains("already an `IndexWriter`"));
assert!(err_msg.contains("Lockfile"));
assert!(err_msg.contains("Possible causes:"))
}
_ => panic!("Expected LockfileAlreadyExists error"),
}
@@ -860,7 +689,7 @@ mod tests {
fn test_set_merge_policy() {
let schema_builder = schema::Schema::builder();
let index = Index::create_in_ram(schema_builder.build());
let index_writer = index.writer(3_000_000).unwrap();
let index_writer = index.writer(40_000_000).unwrap();
assert_eq!(
format!("{:?}", index_writer.get_merge_policy()),
"LogMergePolicy { min_merge_size: 8, min_layer_size: 10000, \
@@ -879,18 +708,11 @@ mod tests {
let schema_builder = schema::Schema::builder();
let index = Index::create_in_ram(schema_builder.build());
{
let _index_writer = index.writer(3_000_000).unwrap();
let _index_writer = index.writer(40_000_000).unwrap();
// the lock should be released when the
// index_writer leaves the scope.
}
let _index_writer_two = index.writer(3_000_000).unwrap();
}
fn num_docs_containing_text(reader: &IndexReader, term: &str) -> u64 {
let searcher = reader.searcher();
let text_field = reader.schema().get_field("text").unwrap();
let term = Term::from_field_text(text_field, term);
searcher.doc_freq(&term)
let _index_writer_two = index.writer(40_000_000).unwrap();
}
#[test]
@@ -898,68 +720,33 @@ mod tests {
let mut schema_builder = schema::Schema::builder();
let text_field = schema_builder.add_text_field("text", schema::TEXT);
let index = Index::create_in_ram(schema_builder.build());
let reader = index
.reader_builder()
.reload_policy(ReloadPolicy::Manual)
.try_into()
.unwrap();
let num_docs_containing = |s: &str| {
let searcher = reader.searcher();
let searcher = index.searcher();
let term = Term::from_field_text(text_field, s);
searcher.doc_freq(&term)
};
let mut index_writer = index.writer(3_000_000).unwrap();
assert_eq!(index_writer.commit_opstamp(), 0u64);
assert_eq!(num_docs_containing_text(&reader, "a"), 0);
{
// writing the segment
let mut index_writer = index.writer(3_000_000).unwrap();
index_writer.add_document(doc!(text_field=>"a"));
index_writer.rollback().unwrap();
assert_eq!(index_writer.commit_opstamp(), 0u64);
assert_eq!(num_docs_containing("a"), 0);
{
index_writer.add_document(doc!(text_field=>"b"));
index_writer.add_document(doc!(text_field=>"c"));
}
assert!(index_writer.commit().is_ok());
reader.reload().unwrap();
assert_eq!(index_writer.commit().unwrap(), 2u64);
index.load_searchers().unwrap();
assert_eq!(num_docs_containing("a"), 0);
assert_eq!(num_docs_containing("b"), 1);
assert_eq!(num_docs_containing("c"), 1);
}
reader.reload().unwrap();
reader.searcher();
}
#[test]
fn test_softcommit_and_rollback() {
let mut schema_builder = schema::Schema::builder();
let text_field = schema_builder.add_text_field("text", schema::TEXT);
let index = Index::create_in_ram(schema_builder.build());
let reader = index.reader().unwrap();
// writing the segment
let mut index_writer = index.writer(3_000_000).unwrap();
index_writer.add_document(doc!(text_field=>"a"));
index_writer.rollback().unwrap();
assert_eq!(index_writer.commit_opstamp(), 0u64);
assert_eq!(num_docs_containing_text(&reader, "a"), 0u64);
{
index_writer.add_document(doc!(text_field=>"b"));
index_writer.add_document(doc!(text_field=>"c"));
}
assert!(index_writer.soft_commit().is_ok());
reader.reload().unwrap(); // we need to load soft committed stuff.
assert_eq!(num_docs_containing_text(&reader, "a"), 0u64);
assert_eq!(num_docs_containing_text(&reader, "b"), 1u64);
assert_eq!(num_docs_containing_text(&reader, "c"), 1u64);
index_writer.rollback().unwrap();
reader.reload().unwrap();
assert_eq!(num_docs_containing_text(&reader, "a"), 0u64);
assert_eq!(num_docs_containing_text(&reader, "b"), 0u64);
assert_eq!(num_docs_containing_text(&reader, "c"), 0u64);
index.load_searchers().unwrap();
index.searcher();
}
#[test]
@@ -967,35 +754,34 @@ mod tests {
let mut schema_builder = schema::Schema::builder();
let text_field = schema_builder.add_text_field("text", schema::TEXT);
let index = Index::create_in_ram(schema_builder.build());
let reader = index
.reader_builder()
.reload_policy(ReloadPolicy::Manual)
.try_into()
.unwrap();
let num_docs_containing = |s: &str| {
let searcher = index.searcher();
let term_a = Term::from_field_text(text_field, s);
reader.searcher().doc_freq(&term_a)
searcher.doc_freq(&term_a)
};
{
// writing the segment
let mut index_writer = index.writer(12_000_000).unwrap();
// create 8 segments with 100 tiny docs
for _doc in 0..100 {
index_writer.add_document(doc!(text_field=>"a"));
let mut doc = Document::default();
doc.add_text(text_field, "a");
index_writer.add_document(doc);
}
index_writer.commit().expect("commit failed");
for _doc in 0..100 {
index_writer.add_document(doc!(text_field=>"a"));
let mut doc = Document::default();
doc.add_text(text_field, "a");
index_writer.add_document(doc);
}
// this should create 8 segments and trigger a merge.
// this should create 8 segments and trigger a merge.
index_writer.commit().expect("commit failed");
index_writer
.wait_merging_threads()
.expect("waiting merging thread failed");
index.load_searchers().unwrap();
reader.reload().unwrap();
assert_eq!(num_docs_containing_text(&reader, "a"), 200);
assert_eq!(num_docs_containing("a"), 200);
assert!(index.searchable_segments().unwrap().len() < 8);
}
}
@@ -1016,6 +802,7 @@ mod tests {
{
let mut prepared_commit = index_writer.prepare_commit().expect("commit failed");
prepared_commit.set_payload("first commit");
assert_eq!(prepared_commit.opstamp(), 100);
prepared_commit.commit().expect("commit failed");
}
{
@@ -1038,7 +825,7 @@ mod tests {
let mut schema_builder = schema::Schema::builder();
let text_field = schema_builder.add_text_field("text", schema::TEXT);
let index = Index::create_in_ram(schema_builder.build());
let reader = index.reader();
{
// writing the segment
let mut index_writer = index.writer_with_num_threads(4, 12_000_000).unwrap();
@@ -1049,6 +836,7 @@ mod tests {
{
let mut prepared_commit = index_writer.prepare_commit().expect("commit failed");
prepared_commit.set_payload("first commit");
assert_eq!(prepared_commit.opstamp(), 100);
prepared_commit.abort().expect("commit failed");
}
{
@@ -1060,15 +848,11 @@ mod tests {
}
index_writer.commit().unwrap();
}
index.load_searchers().unwrap();
let num_docs_containing = |s: &str| {
let searcher = index.searcher();
let term_a = Term::from_field_text(text_field, s);
index
.reader_builder()
.reload_policy(ReloadPolicy::Manual)
.try_into()
.unwrap()
.searcher()
.doc_freq(&term_a)
searcher.doc_freq(&term_a)
};
assert_eq!(num_docs_containing("a"), 0);
assert_eq!(num_docs_containing("b"), 100);
@@ -1076,9 +860,9 @@ mod tests {
#[test]
fn test_hashmap_size() {
assert_eq!(initial_table_size(100_000), 11);
assert_eq!(initial_table_size(1_000_000), 14);
assert_eq!(initial_table_size(10_000_000), 17);
assert_eq!(initial_table_size(100_000), 12);
assert_eq!(initial_table_size(1_000_000), 15);
assert_eq!(initial_table_size(10_000_000), 18);
assert_eq!(initial_table_size(1_000_000_000), 19);
}
@@ -1100,9 +884,11 @@ mod tests {
index_writer.add_document(doc!(text_field => "b"));
}
assert!(index_writer.commit().is_err());
index.load_searchers().unwrap();
let num_docs_containing = |s: &str| {
let searcher = index.searcher();
let term_a = Term::from_field_text(text_field, s);
index.reader().unwrap().searcher().doc_freq(&term_a)
searcher.doc_freq(&term_a)
};
assert_eq!(num_docs_containing("a"), 100);
assert_eq!(num_docs_containing("b"), 0);

View File

@@ -1,64 +0,0 @@
use census::{Inventory, TrackedObject};
use std::collections::HashSet;
use SegmentId;
/// Inventory in which merge operations are tracked.
///
/// `MergeOperation::new` registers each new merge operation in it.
#[derive(Default)]
pub struct MergeOperationInventory(Inventory<InnerMergeOperation>);
impl MergeOperationInventory {
    /// Returns the set of ids of all the segments referenced by the merge
    /// operations listed by the inventory.
    pub fn segment_in_merge(&self) -> HashSet<SegmentId> {
        let mut in_merge = HashSet::default();
        for tracked_op in self.0.list() {
            in_merge.extend(tracked_op.segment_ids.iter().cloned());
        }
        in_merge
    }
}
/// A `MergeOperation` has two roles.
///
/// First, it carries all of the information required to describe a merge:
/// - `target_opstamp` is the opstamp up to which we want to consume the
/// delete queue and reflect their deletes.
/// - `segment_ids` is the list of segments to be merged.
///
/// Second, it keeps track of the fact that these segments are in merge,
/// in order to avoid starting a merge operation that may conflict with
/// this one.
///
/// This works by tracking merge operations. When computing merge
/// candidates, we simply list the tracked merge operations and remove
/// their segments from the possible merge candidates.
pub struct MergeOperation {
inner: TrackedObject<InnerMergeOperation>,
}
/// Payload of a `MergeOperation`: the data that gets tracked in the
/// `MergeOperationInventory`.
struct InnerMergeOperation {
// Opstamp exposed through `MergeOperation::target_opstamp`.
target_opstamp: u64,
// Ids of the segments involved in the merge.
segment_ids: Vec<SegmentId>,
}
impl MergeOperation {
    /// Creates a merge operation over `segment_ids` with the given
    /// `target_opstamp`, and registers it in `inventory`.
    pub fn new(
        inventory: &MergeOperationInventory,
        target_opstamp: u64,
        segment_ids: Vec<SegmentId>,
    ) -> MergeOperation {
        let tracked = inventory.0.track(InnerMergeOperation {
            target_opstamp,
            segment_ids,
        });
        MergeOperation { inner: tracked }
    }

    /// Returns the target opstamp this operation was created with.
    pub fn target_opstamp(&self) -> u64 {
        self.inner.target_opstamp
    }

    /// Returns the ids of the segments this operation was created with.
    pub fn segment_ids(&self) -> &[SegmentId] {
        self.inner.segment_ids.as_slice()
    }
}

View File

@@ -11,7 +11,7 @@ pub struct MergeCandidate(pub Vec<SegmentId>);
///
/// Every time the list of segments changes, the segment updater
/// asks the merge policy if some segments should be merged.
pub trait MergePolicy: marker::Send + marker::Sync + Debug {
pub trait MergePolicy: MergePolicyClone + marker::Send + marker::Sync + Debug {
/// Given the list of segment metas, returns the list of merge candidates.
///
/// This call happens on the segment updater thread, and will block
@@ -19,6 +19,21 @@ pub trait MergePolicy: marker::Send + marker::Sync + Debug {
fn compute_merge_candidates(&self, segments: &[SegmentMeta]) -> Vec<MergeCandidate>;
}
/// Helper trait making it possible to clone a boxed `MergePolicy`.
pub trait MergePolicyClone {
/// Returns a boxed clone of the MergePolicy.
fn box_clone(&self) -> Box<MergePolicy>;
}
// Blanket implementation: any `'static` merge policy that is `Clone`
// gets `box_clone` for free.
impl<T: 'static + MergePolicy + Clone> MergePolicyClone for T {
    fn box_clone(&self) -> Box<MergePolicy> {
        let cloned: T = self.clone();
        Box::new(cloned)
    }
}
/// Never merge segments.
#[derive(Debug, Clone)]
pub struct NoMergePolicy;

View File

@@ -1,4 +1,3 @@
use common::MAX_DOC_LIMIT;
use core::Segment;
use core::SegmentReader;
use core::SerializableSegment;
@@ -24,7 +23,6 @@ use termdict::TermMerger;
use termdict::TermOrdinal;
use DocId;
use Result;
use TantivyError;
fn compute_total_num_tokens(readers: &[SegmentReader], field: Field) -> u64 {
let mut total_tokens = 0u64;
@@ -42,15 +40,13 @@ fn compute_total_num_tokens(readers: &[SegmentReader], field: Field) -> u64 {
total_tokens += reader.inverted_index(field).total_num_tokens();
}
}
total_tokens
+ count
.iter()
.cloned()
.enumerate()
.map(|(fieldnorm_ord, count)| {
count as u64 * u64::from(FieldNormReader::id_to_fieldnorm(fieldnorm_ord as u8))
})
.sum::<u64>()
total_tokens + count
.iter()
.cloned()
.enumerate()
.map(|(fieldnorm_ord, count)| {
count as u64 * u64::from(FieldNormReader::id_to_fieldnorm(fieldnorm_ord as u8))
}).sum::<u64>()
}
pub struct IndexMerger {
@@ -152,14 +148,6 @@ impl IndexMerger {
readers.push(reader);
}
}
if max_doc >= MAX_DOC_LIMIT {
let err_msg = format!(
"The segment resulting from this merge would have {} docs,\
which exceeds the limit {}.",
max_doc, MAX_DOC_LIMIT
);
return Err(TantivyError::InvalidArgument(err_msg));
}
Ok(IndexMerger {
schema,
readers,
@@ -204,17 +192,17 @@ impl IndexMerger {
fast_field_serializer,
)?;
}
FieldType::U64(ref options)
| FieldType::I64(ref options)
| FieldType::Date(ref options) => match options.get_fastfield_cardinality() {
Some(Cardinality::SingleValue) => {
self.write_single_fast_field(field, fast_field_serializer)?;
FieldType::U64(ref options) | FieldType::I64(ref options) => {
match options.get_fastfield_cardinality() {
Some(Cardinality::SingleValue) => {
self.write_single_fast_field(field, fast_field_serializer)?;
}
Some(Cardinality::MultiValues) => {
self.write_multi_fast_field(field, fast_field_serializer)?;
}
None => {}
}
Some(Cardinality::MultiValues) => {
self.write_multi_fast_field(field, fast_field_serializer)?;
}
None => {}
},
}
FieldType::Str(_) => {
// We don't handle str fast field for the moment
// They can be implemented using what is done
@@ -535,8 +523,7 @@ impl IndexMerger {
}
}
None
})
.collect();
}).collect();
// At this point, `segment_postings` contains the posting list
// of all of the segments containing the given term.
@@ -664,7 +651,6 @@ mod tests {
use schema::IntOptions;
use schema::Term;
use schema::TextFieldIndexing;
use schema::INDEXED;
use std::io::Cursor;
use DocAddress;
use IndexWriter;
@@ -678,16 +664,13 @@ mod tests {
TextFieldIndexing::default()
.set_tokenizer("default")
.set_index_option(IndexRecordOption::WithFreqs),
)
.set_stored();
).set_stored();
let text_field = schema_builder.add_text_field("text", text_fieldtype);
let date_field = schema_builder.add_date_field("date", INDEXED);
let score_fieldtype = schema::IntOptions::default().set_fast(Cardinality::SingleValue);
let score_field = schema_builder.add_u64_field("score", score_fieldtype);
let bytes_score_field = schema_builder.add_bytes_field("score_bytes");
let index = Index::create_in_ram(schema_builder.build());
let reader = index.reader().unwrap();
let curr_time = chrono::Utc::now();
let add_score_bytes = |doc: &mut Document, score: u32| {
let mut bytes = Vec::new();
bytes
@@ -704,7 +687,6 @@ mod tests {
let mut doc = Document::default();
doc.add_text(text_field, "af b");
doc.add_u64(score_field, 3);
doc.add_date(date_field, &curr_time);
add_score_bytes(&mut doc, 3);
index_writer.add_document(doc);
}
@@ -730,7 +712,6 @@ mod tests {
{
let mut doc = Document::default();
doc.add_text(text_field, "af b");
doc.add_date(date_field, &curr_time);
doc.add_u64(score_field, 11);
add_score_bytes(&mut doc, 11);
index_writer.add_document(doc);
@@ -758,8 +739,8 @@ mod tests {
index_writer.wait_merging_threads().unwrap();
}
{
reader.reload().unwrap();
let searcher = reader.searcher();
index.load_searchers().unwrap();
let searcher = index.searcher();
let get_doc_ids = |terms: Vec<Term>| {
let query = BooleanQuery::new_multiterms_query(terms);
let top_docs = searcher.search(&query, &TestCollector).unwrap();
@@ -788,10 +769,6 @@ mod tests {
DocAddress(0, 4)
]
);
assert_eq!(
get_doc_ids(vec![Term::from_field_date(date_field, &curr_time)]),
vec![DocAddress(0, 0), DocAddress(0, 3)]
);
}
{
let doc = searcher.doc(DocAddress(0, 0)).unwrap();
@@ -826,8 +803,7 @@ mod tests {
.search(
&query,
&BytesFastFieldTestCollector::for_field(bytes_score_field),
)
.expect("failed to search")
).expect("failed to search")
};
assert_eq!(
get_fast_vals(vec![Term::from_field_text(text_field, "a")]),
@@ -847,15 +823,14 @@ mod tests {
let text_fieldtype = schema::TextOptions::default()
.set_indexing_options(
TextFieldIndexing::default().set_index_option(IndexRecordOption::WithFreqs),
)
.set_stored();
).set_stored();
let text_field = schema_builder.add_text_field("text", text_fieldtype);
let score_fieldtype = schema::IntOptions::default().set_fast(Cardinality::SingleValue);
let score_field = schema_builder.add_u64_field("score", score_fieldtype);
let bytes_score_field = schema_builder.add_bytes_field("score_bytes");
let index = Index::create_in_ram(schema_builder.build());
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
let reader = index.reader().unwrap();
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
let search_term = |searcher: &Searcher, term: Term| {
let collector = FastFieldTestCollector::for_field(score_field);
let bytes_collector = BytesFastFieldTestCollector::for_field(bytes_score_field);
@@ -876,24 +851,24 @@ mod tests {
{
// a first commit
index_writer.add_document(doc!(
text_field => "a b d",
score_field => 1u64,
bytes_score_field => vec![0u8, 0, 0, 1],
));
text_field => "a b d",
score_field => 1u64,
bytes_score_field => vec![0u8, 0, 0, 1],
));
index_writer.add_document(doc!(
text_field => "b c",
score_field => 2u64,
bytes_score_field => vec![0u8, 0, 0, 2],
));
text_field => "b c",
score_field => 2u64,
bytes_score_field => vec![0u8, 0, 0, 2],
));
index_writer.delete_term(Term::from_field_text(text_field, "c"));
index_writer.add_document(doc!(
text_field => "c d",
score_field => 3u64,
bytes_score_field => vec![0u8, 0, 0, 3],
));
text_field => "c d",
score_field => 3u64,
bytes_score_field => vec![0u8, 0, 0, 3],
));
index_writer.commit().expect("committed");
reader.reload().unwrap();
let searcher = reader.searcher();
index.load_searchers().unwrap();
let ref searcher = *index.searcher();
assert_eq!(searcher.num_docs(), 2);
assert_eq!(searcher.segment_readers()[0].num_docs(), 2);
assert_eq!(searcher.segment_readers()[0].max_doc(), 3);
@@ -917,30 +892,30 @@ mod tests {
{
// a second commit
index_writer.add_document(doc!(
text_field => "a d e",
score_field => 4_000u64,
bytes_score_field => vec![0u8, 0, 0, 4],
));
text_field => "a d e",
score_field => 4_000u64,
bytes_score_field => vec![0u8, 0, 0, 4],
));
index_writer.add_document(doc!(
text_field => "e f",
score_field => 5_000u64,
bytes_score_field => vec![0u8, 0, 0, 5],
));
text_field => "e f",
score_field => 5_000u64,
bytes_score_field => vec![0u8, 0, 0, 5],
));
index_writer.delete_term(Term::from_field_text(text_field, "a"));
index_writer.delete_term(Term::from_field_text(text_field, "f"));
index_writer.add_document(doc!(
text_field => "f g",
score_field => 6_000u64,
bytes_score_field => vec![0u8, 0, 23, 112],
));
text_field => "f g",
score_field => 6_000u64,
bytes_score_field => vec![0u8, 0, 23, 112],
));
index_writer.add_document(doc!(
text_field => "g h",
score_field => 7_000u64,
bytes_score_field => vec![0u8, 0, 27, 88],
));
text_field => "g h",
score_field => 7_000u64,
bytes_score_field => vec![0u8, 0, 27, 88],
));
index_writer.commit().expect("committed");
reader.reload().unwrap();
let searcher = reader.searcher();
index.load_searchers().unwrap();
let searcher = index.searcher();
assert_eq!(searcher.segment_readers().len(), 2);
assert_eq!(searcher.num_docs(), 3);
@@ -1001,8 +976,8 @@ mod tests {
.expect("Failed to initiate merge")
.wait()
.expect("Merging failed");
reader.reload().unwrap();
let searcher = reader.searcher();
index.load_searchers().unwrap();
let ref searcher = *index.searcher();
assert_eq!(searcher.segment_readers().len(), 1);
assert_eq!(searcher.num_docs(), 3);
assert_eq!(searcher.segment_readers()[0].num_docs(), 3);
@@ -1047,8 +1022,8 @@ mod tests {
index_writer.delete_term(Term::from_field_text(text_field, "c"));
index_writer.commit().unwrap();
reader.reload().unwrap();
let searcher = reader.searcher();
index.load_searchers().unwrap();
let ref searcher = *index.searcher();
assert_eq!(searcher.segment_readers().len(), 1);
assert_eq!(searcher.num_docs(), 2);
assert_eq!(searcher.segment_readers()[0].num_docs(), 2);
@@ -1098,9 +1073,9 @@ mod tests {
.expect("Failed to initiate merge")
.wait()
.expect("Merging failed");
reader.reload().unwrap();
index.load_searchers().unwrap();
let searcher = reader.searcher();
let ref searcher = *index.searcher();
assert_eq!(searcher.segment_readers().len(), 1);
assert_eq!(searcher.num_docs(), 2);
assert_eq!(searcher.segment_readers()[0].num_docs(), 2);
@@ -1144,15 +1119,18 @@ mod tests {
{
// Test removing all docs
index_writer.delete_term(Term::from_field_text(text_field, "g"));
index_writer.commit().unwrap();
let segment_ids = index
.searchable_segment_ids()
.expect("Searchable segments failed.");
reader.reload().unwrap();
index_writer
.merge(&segment_ids)
.expect("Failed to initiate merge")
.wait()
.expect("Merging failed");
index.load_searchers().unwrap();
let searcher = reader.searcher();
assert!(segment_ids.is_empty());
assert!(searcher.segment_readers().is_empty());
let ref searcher = *index.searcher();
assert_eq!(searcher.segment_readers().len(), 1);
assert_eq!(searcher.num_docs(), 0);
}
}
@@ -1162,9 +1140,8 @@ mod tests {
let mut schema_builder = schema::Schema::builder();
let facet_field = schema_builder.add_facet_field("facet");
let index = Index::create_in_ram(schema_builder.build());
let reader = index.reader().unwrap();
{
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
let index_doc = |index_writer: &mut IndexWriter, doc_facets: &[&str]| {
let mut doc = Document::default();
for facet in doc_facets {
@@ -1192,9 +1169,9 @@ mod tests {
index_writer.commit().expect("committed");
}
reader.reload().unwrap();
index.load_searchers().unwrap();
let test_searcher = |expected_num_docs: usize, expected: &[(&str, u64)]| {
let searcher = reader.searcher();
let searcher = index.searcher();
let mut facet_collector = FacetCollector::for_field(facet_field);
facet_collector.add_facet(Facet::from("/top"));
let (count, facet_counts) = searcher
@@ -1229,14 +1206,14 @@ mod tests {
let segment_ids = index
.searchable_segment_ids()
.expect("Searchable segments failed.");
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
index_writer
.merge(&segment_ids)
.expect("Failed to initiate merge")
.wait()
.expect("Merging failed");
index_writer.wait_merging_threads().unwrap();
reader.reload().unwrap();
index.load_searchers().unwrap();
test_searcher(
11,
&[
@@ -1252,12 +1229,12 @@ mod tests {
// Deleting one term
{
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
let facet = Facet::from_path(vec!["top", "a", "firstdoc"]);
let facet_term = Term::from_facet(facet_field, &facet);
index_writer.delete_term(facet_term);
index_writer.commit().unwrap();
reader.reload().unwrap();
index.load_searchers().unwrap();
test_searcher(
9,
&[
@@ -1272,34 +1249,6 @@ mod tests {
}
}
/// Regression test: merging segments must not make an uncommitted delete visible.
///
/// Two documents are added in two separate commits, a delete matching both of
/// them is queued *without* committing, and the searchable segments are merged.
/// After reloading the reader, both documents must still be counted.
#[test]
fn test_bug_merge() {
let mut schema_builder = schema::Schema::builder();
let int_field = schema_builder.add_u64_field("intvals", INDEXED);
let index = Index::create_in_ram(schema_builder.build());
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
// One commit per document.
index_writer.add_document(doc!(int_field => 1u64));
index_writer.commit().expect("commit failed");
index_writer.add_document(doc!(int_field => 1u64));
index_writer.commit().expect("commit failed");
let reader = index.reader().unwrap();
let searcher = reader.searcher();
assert_eq!(searcher.num_docs(), 2);
// The delete targets both documents but is deliberately left uncommitted.
index_writer.delete_term(Term::from_field_u64(int_field, 1));
let segment_ids = index
.searchable_segment_ids()
.expect("Searchable segments failed.");
index_writer
.merge(&segment_ids)
.expect("Failed to initiate merge")
.wait()
.expect("Merging failed");
reader.reload().unwrap();
// commit has not been called yet. The document should still be
// there.
assert_eq!(reader.searcher().num_docs(), 2);
}
#[test]
fn test_merge_multivalued_int_fields_all_deleted() {
let mut schema_builder = schema::Schema::builder();
@@ -1308,9 +1257,9 @@ mod tests {
.set_indexed();
let int_field = schema_builder.add_u64_field("intvals", int_options);
let index = Index::create_in_ram(schema_builder.build());
let reader = index.reader().unwrap();
{
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
let mut doc = Document::default();
doc.add_u64(int_field, 1);
index_writer.add_document(doc.clone());
@@ -1318,33 +1267,31 @@ mod tests {
index_writer.add_document(doc);
index_writer.commit().expect("commit failed");
index_writer.delete_term(Term::from_field_u64(int_field, 1));
index_writer.commit().expect("commit failed");
}
index.load_searchers().unwrap();
let searcher = index.searcher();
assert_eq!(searcher.num_docs(), 0);
// Merging the segments
{
let segment_ids = index
.searchable_segment_ids()
.expect("Searchable segments failed.");
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
index_writer
.merge(&segment_ids)
.expect("Failed to initiate merge")
.wait()
.expect("Merging failed");
// assert delete has not been committed
reader.reload().expect("failed to load searcher 1");
let searcher = reader.searcher();
assert_eq!(searcher.num_docs(), 2);
index_writer.commit().unwrap();
index_writer.wait_merging_threads().unwrap();
}
reader.reload().unwrap();
let searcher = reader.searcher();
index.load_searchers().unwrap();
let searcher = index.searcher();
assert_eq!(searcher.num_docs(), 0);
}
#[test]
fn test_merge_multivalued_int_fields_simple() {
fn test_merge_multivalued_int_fields() {
let mut schema_builder = schema::Schema::builder();
let int_options = IntOptions::default()
.set_fast(Cardinality::MultiValues)
@@ -1353,7 +1300,7 @@ mod tests {
let index = Index::create_in_ram(schema_builder.build());
{
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
let index_doc = |index_writer: &mut IndexWriter, int_vals: &[u64]| {
let mut doc = Document::default();
for &val in int_vals {
@@ -1361,6 +1308,7 @@ mod tests {
}
index_writer.add_document(doc);
};
index_doc(&mut index_writer, &[1, 2]);
index_doc(&mut index_writer, &[1, 2, 3]);
index_doc(&mut index_writer, &[4, 5]);
@@ -1369,14 +1317,19 @@ mod tests {
index_doc(&mut index_writer, &[3]);
index_doc(&mut index_writer, &[17]);
index_writer.commit().expect("committed");
index_doc(&mut index_writer, &[20]);
index_writer.commit().expect("committed");
index_doc(&mut index_writer, &[28, 27]);
index_doc(&mut index_writer, &[1_000]);
index_writer.commit().expect("committed");
}
let reader = index.reader().unwrap();
let searcher = reader.searcher();
index.load_searchers().unwrap();
let searcher = index.searcher();
let mut vals: Vec<u64> = Vec::new();
{
@@ -1436,20 +1389,19 @@ mod tests {
let segment_ids = index
.searchable_segment_ids()
.expect("Searchable segments failed.");
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
index_writer
.merge(&segment_ids)
.expect("Failed to initiate merge")
.wait()
.expect("Merging failed");
index_writer
.wait_merging_threads()
.expect("Wait for merging threads");
index_writer.wait_merging_threads().unwrap();
}
reader.reload().expect("Load searcher");
index.load_searchers().unwrap();
{
let searcher = reader.searcher();
let searcher = index.searcher();
println!(
"{:?}",
searcher

View File

@@ -1,9 +1,8 @@
pub mod delete_queue;
mod directory_lock;
mod doc_opstamp_mapping;
pub mod index_writer;
mod log_merge_policy;
mod merge_operation;
pub mod merge_policy;
pub mod merger;
pub mod operation;
@@ -16,12 +15,14 @@ pub mod segment_updater;
mod segment_writer;
mod stamper;
pub(crate) use self::directory_lock::DirectoryLock;
pub use self::directory_lock::LockType;
pub use self::index_writer::IndexWriter;
pub use self::log_merge_policy::LogMergePolicy;
pub use self::merge_operation::{MergeOperation, MergeOperationInventory};
pub use self::merge_policy::{MergeCandidate, MergePolicy, NoMergePolicy};
pub use self::prepared_commit::PreparedCommit;
pub use self::segment_entry::SegmentEntry;
pub use self::segment_entry::{SegmentEntry, SegmentState};
pub use self::segment_manager::SegmentManager;
pub use self::segment_serializer::SegmentSerializer;
pub use self::segment_writer::SegmentWriter;

View File

@@ -14,10 +14,3 @@ pub struct AddOperation {
pub opstamp: u64,
pub document: Document,
}
/// UserOperation is an enum type that encapsulates other operation types.
#[derive(Eq, PartialEq, Debug)]
pub enum UserOperation {
Add(Document),
Delete(Term),
}

View File

@@ -6,20 +6,14 @@ pub struct PreparedCommit<'a> {
index_writer: &'a mut IndexWriter,
payload: Option<String>,
opstamp: u64,
soft: bool,
}
impl<'a> PreparedCommit<'a> {
pub(crate) fn new(
index_writer: &'a mut IndexWriter,
opstamp: u64,
soft: bool,
) -> PreparedCommit {
pub(crate) fn new(index_writer: &'a mut IndexWriter, opstamp: u64) -> PreparedCommit {
PreparedCommit {
index_writer,
payload: None,
opstamp,
soft,
}
}
@@ -39,7 +33,7 @@ impl<'a> PreparedCommit<'a> {
info!("committing {}", self.opstamp);
self.index_writer
.segment_updater()
.commit(self.opstamp, self.payload, self.soft)?;
.commit(self.opstamp, self.payload)?;
Ok(self.opstamp)
}
}

View File

@@ -4,6 +4,21 @@ use core::SegmentMeta;
use indexer::delete_queue::DeleteCursor;
use std::fmt;
/// Merge-related state of a segment.
#[derive(Clone, Copy, PartialEq, Eq, Debug)]
pub enum SegmentState {
    /// The segment is not part of any merge.
    Ready,
    /// The segment is currently part of a merge.
    InMerge,
}

impl SegmentState {
    /// Returns a one-character code for the state:
    /// `'R'` for `Ready`, `'M'` for `InMerge`.
    pub fn letter_code(self) -> char {
        match self {
            SegmentState::Ready => 'R',
            SegmentState::InMerge => 'M',
        }
    }
}
/// A segment entry describes the state of
/// a given segment, at a given instant.
///
@@ -20,9 +35,9 @@ use std::fmt;
#[derive(Clone)]
pub struct SegmentEntry {
meta: SegmentMeta,
state: SegmentState,
delete_bitset: Option<BitSet>,
delete_cursor: DeleteCursor,
opstamp: u64,
}
impl SegmentEntry {
@@ -31,20 +46,15 @@ impl SegmentEntry {
segment_meta: SegmentMeta,
delete_cursor: DeleteCursor,
delete_bitset: Option<BitSet>,
opstamp: u64,
) -> SegmentEntry {
SegmentEntry {
meta: segment_meta,
state: SegmentState::Ready,
delete_bitset,
delete_cursor,
opstamp,
}
}
pub fn opstamp(&self) -> u64 {
self.opstamp
}
/// Return a reference to the segment entry deleted bitset.
///
/// `DocId` in this bitset are flagged as deleted.
@@ -53,8 +63,7 @@ impl SegmentEntry {
}
/// Set the `SegmentMeta` for this segment.
pub fn set_meta(&mut self, opstamp: u64, segment_meta: SegmentMeta) {
self.opstamp = opstamp;
pub fn set_meta(&mut self, segment_meta: SegmentMeta) {
self.meta = segment_meta;
}
@@ -63,6 +72,14 @@ impl SegmentEntry {
&mut self.delete_cursor
}
/// Returns the `SegmentState` of this segment entry.
///
/// The state describes whether the segment is available for
/// a merge or not.
pub fn state(&self) -> SegmentState {
self.state
}
/// Returns the segment id.
pub fn segment_id(&self) -> SegmentId {
self.meta.id()
@@ -72,10 +89,33 @@ impl SegmentEntry {
pub fn meta(&self) -> &SegmentMeta {
&self.meta
}
/// Marks the `SegmentEntry` as in merge.
///
/// Only segments that are not already
/// in a merge are eligible for future merges.
pub fn start_merge(&mut self) {
self.state = SegmentState::InMerge;
}
/// Cancels a merge.
///
/// If a merge fails, it is important to switch
/// the segment back to the `Ready` state, so that it
/// may be eligible for future merges.
pub fn cancel_merge(&mut self) {
self.state = SegmentState::Ready;
}
/// Tells whether this segment may be considered for a merge,
/// i.e. whether its state is `SegmentState::Ready`.
pub fn is_ready(&self) -> bool {
    match self.state {
        SegmentState::Ready => true,
        SegmentState::InMerge => false,
    }
}
}
impl fmt::Debug for SegmentEntry {
fn fmt(&self, formatter: &mut fmt::Formatter) -> fmt::Result {
write!(formatter, "SegmentEntry({:?})", self.meta)
write!(formatter, "SegmentEntry({:?}, {:?})", self.meta, self.state)
}
}

View File

@@ -11,47 +11,12 @@ use std::path::PathBuf;
use std::sync::RwLock;
use std::sync::{RwLockReadGuard, RwLockWriteGuard};
use Result as TantivyResult;
use std::sync::Arc;
use std::collections::HashMap;
/// Provides a read-only view of the available segments.
#[derive(Clone)]
pub struct AvailableSegments {
registers: Arc<RwLock<SegmentRegisters>>,
}
impl AvailableSegments {
/// Returns the `SegmentMeta`s held by the `committed` register.
///
/// # Panics
///
/// Panics if the lock protecting the registers is poisoned.
pub fn committed(&self) -> Vec<SegmentMeta> {
self.registers
.read()
.unwrap()
.committed
.segment_metas()
}
/// Returns the `SegmentMeta`s held by the `soft_committed` register.
///
/// # Panics
///
/// Panics if the lock protecting the registers is poisoned.
pub fn soft_committed(&self) -> Vec<SegmentMeta> {
self.registers
.read()
.unwrap()
.soft_committed
.segment_metas()
}
}
#[derive(Default)]
struct SegmentRegisters {
uncommitted: HashMap<SegmentId, SegmentEntry>,
uncommitted: SegmentRegister,
committed: SegmentRegister,
/// soft commits can advance committed segment to a future delete
/// opstamp.
///
/// In that case the same `SegmentId` can appear in both `committed`
/// and in `committed_in_the_future`.
///
/// We do not consider these segments for merges.
soft_committed: SegmentRegister,
/// `DeleteCursor`, positioned on the soft commit.
delete_cursor: DeleteCursor,
writing: HashSet<SegmentId>,
}
/// The segment manager stores the list of segments
@@ -59,8 +24,9 @@ struct SegmentRegisters {
///
/// It guarantees the atomicity of the
/// changes (merges especially)
#[derive(Default)]
pub struct SegmentManager {
registers: Arc<RwLock<SegmentRegisters>>
registers: RwLock<SegmentRegisters>,
}
impl Debug for SegmentManager {
@@ -75,23 +41,12 @@ impl Debug for SegmentManager {
}
pub fn get_mergeable_segments(
in_merge_segment_ids: &HashSet<SegmentId>,
segment_manager: &SegmentManager,
) -> (Vec<SegmentMeta>, Vec<SegmentMeta>) {
let registers_lock = segment_manager.read();
(
registers_lock
.soft_committed
.get_mergeable_segments(in_merge_segment_ids),
registers_lock
.uncommitted
.values()
.map(|segment_entry| segment_entry.meta())
.filter(|segment_meta| {
!in_merge_segment_ids.contains(&segment_meta.id())
})
.cloned()
.collect::<Vec<_>>()
registers_lock.committed.get_mergeable_segments(),
registers_lock.uncommitted.get_mergeable_segments(),
)
}
@@ -99,22 +54,28 @@ impl SegmentManager {
pub fn from_segments(
segment_metas: Vec<SegmentMeta>,
delete_cursor: &DeleteCursor,
opstamp: u64,
) -> SegmentManager {
SegmentManager {
registers: Arc::new(RwLock::new(SegmentRegisters {
uncommitted: HashMap::default(),
committed: SegmentRegister::new(segment_metas.clone(), opstamp),
soft_committed: SegmentRegister::new(segment_metas, opstamp),
delete_cursor: delete_cursor.clone(),
}))
registers: RwLock::new(SegmentRegisters {
uncommitted: SegmentRegister::default(),
committed: SegmentRegister::new(segment_metas, delete_cursor),
writing: HashSet::new(),
}),
}
}
pub fn available_segments_view(&self) -> AvailableSegments {
AvailableSegments {
registers: self.registers.clone()
}
/// Returns all of the segment entries (committed or uncommitted)
pub fn segment_entries(&self) -> Vec<SegmentEntry> {
let registers_lock = self.read();
let mut segment_entries = registers_lock.uncommitted.segment_entries();
segment_entries.extend(registers_lock.committed.segment_entries());
segment_entries
}
/// Returns the overall number of segments in the `SegmentManager`
pub fn num_segments(&self) -> usize {
let registers_lock = self.read();
registers_lock.committed.len() + registers_lock.uncommitted.len()
}
/// List the files that are useful to the index.
@@ -145,84 +106,40 @@ impl SegmentManager {
.expect("Failed to acquire write lock on SegmentManager.")
}
/// Deletes all empty segments.
///
/// A segment is empty when its `SegmentMeta` reports zero documents.
/// Empty segments are dropped from both the `committed` and the
/// `soft_committed` registers, each register being cleaned according to
/// its own content.
fn remove_empty_segments(&self) {
    let mut registers_lock = self.write();
    registers_lock
        .committed
        .segment_metas()
        .iter()
        .filter(|segment_meta| segment_meta.num_docs() == 0)
        .for_each(|segment_meta| {
            registers_lock
                .committed
                .remove_segment(&segment_meta.id())
        });
    // The segments found empty in `soft_committed` have to be removed from
    // `soft_committed` itself; removing them from `committed` (as was done
    // previously) left the empty segments in the soft-committed register.
    registers_lock
        .soft_committed
        .segment_metas()
        .iter()
        .filter(|segment_meta| segment_meta.num_docs() == 0)
        .for_each(|segment_meta| {
            registers_lock
                .soft_committed
                .remove_segment(&segment_meta.id())
        });
}
/// Returns all of the segment entries (soft committed or uncommitted)
pub fn segment_entries(&self) -> Vec<SegmentEntry> {
let registers_lock = self.read();
let mut segment_entries: Vec<SegmentEntry > = registers_lock.uncommitted.values().cloned().collect();
segment_entries.extend(registers_lock.soft_committed.segment_entries(&registers_lock.delete_cursor).into_iter());
segment_entries
}
pub fn commit(&self, opstamp: u64, segment_entries: Vec<SegmentEntry>) {
pub fn commit(&self, segment_entries: Vec<SegmentEntry>) {
let mut registers_lock = self.write();
registers_lock.committed.clear();
registers_lock.uncommitted.clear();
registers_lock
.committed
.set_commit(opstamp, segment_entries.clone());
registers_lock
.soft_committed
.set_commit(opstamp, segment_entries);
registers_lock.delete_cursor.skip_to(opstamp);
for segment_entry in segment_entries {
registers_lock.committed.add_segment_entry(segment_entry);
}
}
pub fn soft_commit(&self, opstamp: u64, segment_entries: Vec<SegmentEntry>) {
let mut registers_lock = self.write();
registers_lock.uncommitted.clear();
registers_lock
.soft_committed
.set_commit(opstamp, segment_entries);
registers_lock.delete_cursor.skip_to(opstamp);
}
/// Gets the list of segment_entries associated to a list of `segment_ids`.
/// This method is used when starting a merge operations.
/// Marks a list of segments as in merge.
///
/// Returns an error if some segments are missing, or if
/// the `segment_ids` are not either all soft_committed or all
/// the `segment_ids` are not either all committed or all
/// uncommitted.
pub fn start_merge(&self, segment_ids: &[SegmentId]) -> TantivyResult<Vec<SegmentEntry>> {
let registers_lock = self.read();
let mut registers_lock = self.write();
let mut segment_entries = vec![];
if segment_ids.iter().all(|segment_id| registers_lock.uncommitted.contains_key(segment_id)) {
if registers_lock.uncommitted.contains_all(segment_ids) {
for segment_id in segment_ids {
let segment_entry = registers_lock.uncommitted
.get(segment_id)
.expect("Segment id not found {}. Should never happen because of the contains all if-block.");
segment_entries.push(segment_entry.clone());
}
} else if registers_lock.soft_committed.contains_all(segment_ids) {
for segment_id in segment_ids {
let segment_entry = registers_lock.soft_committed
.get(segment_id, &registers_lock.delete_cursor)
.start_merge(segment_id)
.expect("Segment id not found {}. Should never happen because of the contains all if-block.");
segment_entries.push(segment_entry);
}
} else if registers_lock.committed.contains_all(segment_ids) {
for segment_id in segment_ids {
let segment_entry = registers_lock.committed
.start_merge(segment_id)
.expect("Segment id not found {}. Should never happen because of the contains all if-block.");
segment_entries.push(segment_entry);
}
for segment_id in segment_ids {
registers_lock.committed.start_merge(segment_id);
}
} else {
let error_msg = "Merge operation sent for segments that are not \
all uncommited or commited."
@@ -232,38 +149,86 @@ impl SegmentManager {
Ok(segment_entries)
}
/// Cancels a merge previously started on `before_merge_segment_ids`.
///
/// The segments are looked up in the `uncommitted` register first, then in
/// the `committed` register; the first register that contains all of them
/// gets `cancel_merge` called for each segment id. If neither register
/// contains all of the ids, a warning is logged and the method returns
/// without doing anything else.
///
/// Otherwise, `after_merge_segment_id` is finally removed from the
/// `writing` set.
pub fn cancel_merge(
&self,
before_merge_segment_ids: &[SegmentId],
after_merge_segment_id: SegmentId,
) {
let mut registers_lock = self.write();
// we mark all segments as ready for merge.
{
// NOTE(review): the register is selected inside its own block, presumably
// so that this mutable borrow ends before `writing` is touched below.
let target_segment_register: &mut SegmentRegister;
target_segment_register = {
if registers_lock
.uncommitted
.contains_all(before_merge_segment_ids)
{
&mut registers_lock.uncommitted
} else if registers_lock
.committed
.contains_all(before_merge_segment_ids)
{
&mut registers_lock.committed
} else {
warn!("couldn't find segment in SegmentManager");
return;
}
};
for segment_id in before_merge_segment_ids {
target_segment_register.cancel_merge(segment_id);
}
}
// ... and we make sure the target segment entry
// can be garbage collected.
registers_lock.writing.remove(&after_merge_segment_id);
}
/// Records `segment_id` in the `writing` set.
///
/// NOTE(review): presumably this protects the files of a segment that is
/// still being written from garbage collection — confirm against the
/// code that reads `writing`.
pub fn write_segment(&self, segment_id: SegmentId) {
let mut registers_lock = self.write();
registers_lock.writing.insert(segment_id);
}
pub fn add_segment(&self, segment_entry: SegmentEntry) {
let mut registers_lock = self.write();
registers_lock
.uncommitted
.insert(segment_entry.segment_id(), segment_entry);
registers_lock.writing.remove(&segment_entry.segment_id());
registers_lock.uncommitted.add_segment_entry(segment_entry);
}
pub fn end_merge(
&self,
before_merge_segment_ids: &[SegmentId],
after_merge_segment_entry: SegmentEntry
after_merge_segment_entry: SegmentEntry,
) {
let mut registers_lock = self.write();
registers_lock
.writing
.remove(&after_merge_segment_entry.segment_id());
if before_merge_segment_ids.iter().all(|seg_id|
registers_lock
let target_register: &mut SegmentRegister = {
if registers_lock
.uncommitted
.contains_key(seg_id))
{
for segment_id in before_merge_segment_ids {
registers_lock.uncommitted.remove(&segment_id);
.contains_all(before_merge_segment_ids)
{
&mut registers_lock.uncommitted
} else if registers_lock
.committed
.contains_all(before_merge_segment_ids)
{
&mut registers_lock.committed
} else {
warn!("couldn't find segment in SegmentManager");
return;
}
registers_lock.uncommitted.insert(after_merge_segment_entry.segment_id(),
after_merge_segment_entry);
} else {
registers_lock.committed.receive_merge(&before_merge_segment_ids, &after_merge_segment_entry);
registers_lock.soft_committed.receive_merge(&before_merge_segment_ids, &after_merge_segment_entry)
};
for segment_id in before_merge_segment_ids {
target_register.remove_segment(segment_id);
}
target_register.add_segment_entry(after_merge_segment_entry);
}
pub fn committed_segment_metas(&self) -> Vec<SegmentMeta> {
self.remove_empty_segments();
let registers_lock = self.read();
registers_lock.committed.segment_metas()
}

View File

@@ -3,7 +3,6 @@ use core::SegmentMeta;
use indexer::delete_queue::DeleteCursor;
use indexer::segment_entry::SegmentEntry;
use std::collections::HashMap;
use std::collections::HashSet;
use std::fmt::{self, Debug, Formatter};
/// The segment register keeps track
@@ -16,15 +15,14 @@ use std::fmt::{self, Debug, Formatter};
/// merge candidates.
#[derive(Default)]
pub struct SegmentRegister {
segment_states: HashMap<SegmentId, SegmentMeta>,
opstamp_constraint: u64,
segment_states: HashMap<SegmentId, SegmentEntry>,
}
impl Debug for SegmentRegister {
fn fmt(&self, f: &mut Formatter) -> Result<(), fmt::Error> {
write!(f, "SegmentRegister(")?;
for k in self.segment_states.keys() {
write!(f, "{}, ", k.short_uuid_string())?;
for (k, v) in &self.segment_states {
write!(f, "{}:{}, ", k.short_uuid_string(), v.state().letter_code())?;
}
write!(f, ")")?;
Ok(())
@@ -36,113 +34,76 @@ impl SegmentRegister {
self.segment_states.clear();
}
pub fn get_mergeable_segments(
&self,
in_merge_segment_ids: &HashSet<SegmentId>,
) -> Vec<SegmentMeta> {
/// Returns the number of segments held by this register.
pub fn len(&self) -> usize {
self.segment_states.len()
}
pub fn get_mergeable_segments(&self) -> Vec<SegmentMeta> {
self.segment_states
.values()
.filter(|segment_meta| !in_merge_segment_ids.contains(&segment_meta.id()))
.cloned()
.filter(|segment_entry| segment_entry.is_ready())
.map(|segment_entry| segment_entry.meta().clone())
.collect()
}
/// Returns a clone of every `SegmentEntry` held by this register.
///
/// The entries come out of a `HashMap`, so their order is unspecified.
pub fn segment_entries(&self) -> Vec<SegmentEntry> {
self.segment_states.values().cloned().collect()
}
pub fn segment_metas(&self) -> Vec<SegmentMeta> {
let mut segment_metas: Vec<SegmentMeta> = self
let mut segment_ids: Vec<SegmentMeta> = self
.segment_states
.values()
.cloned()
.map(|segment_entry| segment_entry.meta().clone())
.collect();
segment_metas.sort_by_key(|meta| meta.id());
segment_metas
segment_ids.sort_by_key(|meta| meta.id());
segment_ids
}
pub fn segment_entries(&self, delete_cursor: &DeleteCursor) -> Vec<SegmentEntry> {
self.segment_states
.values()
.map(|segment_meta| {
SegmentEntry::new(segment_meta.clone(), delete_cursor.clone(), None, self.opstamp_constraint)
})
.collect()
}
pub fn contains_all(&self, segment_ids: &[SegmentId]) -> bool {
pub fn contains_all(&mut self, segment_ids: &[SegmentId]) -> bool {
segment_ids
.iter()
.all(|segment_id| self.segment_states.contains_key(segment_id))
}
pub fn receive_merge(&mut self,
before_merge_segment_ids: &[SegmentId],
after_merge_segment_entry: &SegmentEntry) {
if after_merge_segment_entry.opstamp() != self.opstamp_constraint {
return;
}
if !self.contains_all(before_merge_segment_ids) {
return;
}
for segment_id in before_merge_segment_ids {
self.segment_states.remove(segment_id);
}
self.register_segment_entry(after_merge_segment_entry.clone());
}
/// Registers a `SegmentEntry`.
///
/// If a segment entry associated to this `SegmentId` is already there,
/// override it with the new `SegmentEntry`.
pub fn register_segment_entry(&mut self, segment_entry: SegmentEntry) {
if self.opstamp_constraint != segment_entry.opstamp() {
panic!(format!(
"Invalid segment. Expect opstamp {}, got {}.",
self.opstamp_constraint,
segment_entry.opstamp()
));
}
if segment_entry.meta().num_docs() == 0 {
return;
}
pub fn add_segment_entry(&mut self, segment_entry: SegmentEntry) {
let segment_id = segment_entry.segment_id();
// Check that we are ok with deletes.
self.segment_states.insert(segment_id, segment_entry.meta().clone());
}
pub fn set_commit(&mut self, opstamp: u64, segment_entries: Vec<SegmentEntry>) {
self.segment_states.clear();
self.opstamp_constraint = opstamp;
for segment_entry in segment_entries {
self.register_segment_entry(segment_entry);
}
self.segment_states.insert(segment_id, segment_entry);
}
pub fn remove_segment(&mut self, segment_id: &SegmentId) {
self.segment_states.remove(&segment_id);
self.segment_states.remove(segment_id);
}
pub fn get(&self, segment_id: &SegmentId, delete_cursor: &DeleteCursor) -> Option<SegmentEntry> {
pub fn cancel_merge(&mut self, segment_id: &SegmentId) {
self.segment_states
.get(&segment_id)
.map(|segment_meta|
SegmentEntry::new(
segment_meta.clone(),
delete_cursor.clone(),
None,
self.opstamp_constraint
))
.get_mut(segment_id)
.expect("Received a merge notification for a segment that is not registered")
.cancel_merge();
}
pub fn new(
segment_metas: Vec<SegmentMeta>,
opstamp: u64,
) -> SegmentRegister {
/// Flags the segment identified by `segment_id` as being in a merge.
///
/// Returns a clone of the entry, taken after the flag has been set, or
/// `None` when the register holds no segment with that id.
pub fn start_merge(&mut self, segment_id: &SegmentId) -> Option<SegmentEntry> {
    self.segment_states.get_mut(segment_id).map(|entry| {
        entry.start_merge();
        entry.clone()
    })
}
pub fn new(segment_metas: Vec<SegmentMeta>, delete_cursor: &DeleteCursor) -> SegmentRegister {
let mut segment_states = HashMap::new();
for segment_meta in segment_metas {
segment_states.insert(segment_meta.id(), segment_meta);
}
SegmentRegister {
segment_states,
opstamp_constraint: opstamp,
let segment_id = segment_meta.id();
let segment_entry = SegmentEntry::new(segment_meta, delete_cursor.clone(), None);
segment_states.insert(segment_id, segment_entry);
}
SegmentRegister { segment_states }
}
/// Test-only accessor: returns a clone of the entry registered under
/// `segment_id`, or `None` if there is none.
#[cfg(test)]
pub fn segment_entry(&self, segment_id: &SegmentId) -> Option<SegmentEntry> {
self.segment_states.get(segment_id).cloned()
}
}
@@ -152,6 +113,7 @@ mod tests {
use core::SegmentId;
use core::SegmentMeta;
use indexer::delete_queue::*;
use indexer::SegmentState;
fn segment_ids(segment_register: &SegmentRegister) -> Vec<SegmentId> {
segment_register
@@ -171,22 +133,52 @@ mod tests {
let segment_id_merged = SegmentId::generate_random();
{
let segment_meta = SegmentMeta::new(segment_id_a, 1u32);
let segment_entry = SegmentEntry::new(segment_meta, delete_queue.cursor(), None, 0u64);
segment_register.register_segment_entry(segment_entry);
let segment_meta = SegmentMeta::new(segment_id_a, 0u32);
let segment_entry = SegmentEntry::new(segment_meta, delete_queue.cursor(), None);
segment_register.add_segment_entry(segment_entry);
}
assert_eq!(
segment_register
.segment_entry(&segment_id_a)
.unwrap()
.state(),
SegmentState::Ready
);
assert_eq!(segment_ids(&segment_register), vec![segment_id_a]);
{
let segment_meta = SegmentMeta::new(segment_id_b, 2u32);
let segment_entry = SegmentEntry::new(segment_meta, delete_queue.cursor(), None, 0u64);
segment_register.register_segment_entry(segment_entry);
let segment_meta = SegmentMeta::new(segment_id_b, 0u32);
let segment_entry = SegmentEntry::new(segment_meta, delete_queue.cursor(), None);
segment_register.add_segment_entry(segment_entry);
}
assert_eq!(
segment_register
.segment_entry(&segment_id_b)
.unwrap()
.state(),
SegmentState::Ready
);
segment_register.start_merge(&segment_id_a);
segment_register.start_merge(&segment_id_b);
assert_eq!(
segment_register
.segment_entry(&segment_id_a)
.unwrap()
.state(),
SegmentState::InMerge
);
assert_eq!(
segment_register
.segment_entry(&segment_id_b)
.unwrap()
.state(),
SegmentState::InMerge
);
segment_register.remove_segment(&segment_id_a);
segment_register.remove_segment(&segment_id_b);
{
let segment_meta_merged = SegmentMeta::new(segment_id_merged, 3u32);
let segment_entry =
SegmentEntry::new(segment_meta_merged, delete_queue.cursor(), None, 0u64);
segment_register.receive_merge(&[segment_id_a, segment_id_b], &segment_entry);
segment_register.register_segment_entry(segment_entry);
let segment_meta_merged = SegmentMeta::new(segment_id_merged, 0u32);
let segment_entry = SegmentEntry::new(segment_meta_merged, delete_queue.cursor(), None);
segment_register.add_segment_entry(segment_entry);
}
assert_eq!(segment_ids(&segment_register), vec![segment_id_merged]);
}

View File

@@ -16,10 +16,9 @@ use futures_cpupool::CpuFuture;
use futures_cpupool::CpuPool;
use indexer::delete_queue::DeleteCursor;
use indexer::index_writer::advance_deletes;
use indexer::merge_operation::MergeOperationInventory;
use indexer::merger::IndexMerger;
use indexer::stamper::Stamper;
use indexer::MergeOperation;
use indexer::MergeCandidate;
use indexer::SegmentEntry;
use indexer::SegmentSerializer;
use indexer::{DefaultMergePolicy, MergePolicy};
@@ -27,7 +26,6 @@ use schema::Schema;
use serde_json;
use std::borrow::BorrowMut;
use std::collections::HashMap;
use std::collections::HashSet;
use std::io::Write;
use std::mem;
use std::ops::DerefMut;
@@ -47,30 +45,33 @@ use Result;
/// and flushed.
///
/// This method is not part of tantivy's public API
pub fn save_new_metas(schema: Schema, directory: &mut Directory) -> Result<()> {
save_metas(
&IndexMeta {
segments: Vec::new(),
schema,
opstamp: 0u64,
payload: None,
},
directory,
)
pub fn save_new_metas(schema: Schema, opstamp: u64, directory: &mut Directory) -> Result<()> {
save_metas(vec![], schema, opstamp, None, directory)
}
/// Save the index meta file.
/// This operation is atomic:
/// Either
/// - it fails, in which case an error is returned,
/// - it fails, in which case an error is returned,
/// and the `meta.json` remains untouched,
/// - it succeeds, and `meta.json` is written
/// and flushed.
///
/// This method is not part of tantivy's public API
fn save_metas(metas: &IndexMeta, directory: &mut Directory) -> Result<()> {
let mut buffer = serde_json::to_vec_pretty(metas)?;
// Just adding a new line at the end of the buffer.
pub fn save_metas(
segment_metas: Vec<SegmentMeta>,
schema: Schema,
opstamp: u64,
payload: Option<String>,
directory: &mut Directory,
) -> Result<()> {
let metas = IndexMeta {
segments: segment_metas,
schema,
opstamp,
payload,
};
let mut buffer = serde_json::to_vec_pretty(&metas)?;
writeln!(&mut buffer)?;
directory.atomic_write(&META_FILEPATH, &buffer[..])?;
debug!("Saved metas {:?}", serde_json::to_string_pretty(&metas));
@@ -82,21 +83,16 @@ fn save_metas(metas: &IndexMeta, directory: &mut Directory) -> Result<()> {
//
// All this processing happens on a single thread
// consuming a common queue.
//
// We voluntarily pass a merge_operation ref to guarantee that
// the merge_operation is alive during the process
#[derive(Clone)]
pub struct SegmentUpdater(Arc<InnerSegmentUpdater>);
fn perform_merge(
merge_operation: &MergeOperation,
index: &Index,
mut segment_entries: Vec<SegmentEntry>,
mut merged_segment: Segment,
target_opstamp: u64,
) -> Result<SegmentEntry> {
let target_opstamp = merge_operation.target_opstamp();
// first we need to apply deletes to our segment.
let mut merged_segment = index.new_segment();
// TODO add logging
let schema = index.schema();
@@ -125,68 +121,60 @@ fn perform_merge(
let segment_meta = SegmentMeta::new(merged_segment.id(), num_docs);
let after_merge_segment_entry = SegmentEntry::new(segment_meta.clone(), delete_cursor, None, target_opstamp);
let after_merge_segment_entry = SegmentEntry::new(segment_meta.clone(), delete_cursor, None);
Ok(after_merge_segment_entry)
}
struct InnerSegmentUpdater {
// We keep a copy of the current active IndexMeta to
// avoid loading the file every time we need it in the
// `SegmentUpdater`.
//
// This should be up to date, as all updates happen through
// the unique active `SegmentUpdater`.
active_metas: RwLock<Arc<IndexMeta>>,
pool: CpuPool,
index: Index,
segment_manager: SegmentManager,
merge_policy: RwLock<Arc<Box<MergePolicy>>>,
merge_policy: RwLock<Box<MergePolicy>>,
merging_thread_id: AtomicUsize,
merging_threads: RwLock<HashMap<usize, JoinHandle<Result<()>>>>,
generation: AtomicUsize,
killed: AtomicBool,
stamper: Stamper,
merge_operations: MergeOperationInventory,
}
impl SegmentUpdater {
pub fn create(
pub fn new(
index: Index,
stamper: Stamper,
delete_cursor: &DeleteCursor,
) -> Result<SegmentUpdater> {
let index_meta = index.load_metas()?;
let segments = index.searchable_segment_metas()?;
let opstamp = index_meta.opstamp;
let segment_manager = SegmentManager::from_segments(segments, delete_cursor, opstamp);
let segment_manager = SegmentManager::from_segments(segments, delete_cursor);
let pool = CpuPoolBuilder::new()
.name_prefix("segment_updater")
.pool_size(1)
.create();
let index_meta = index.load_metas()?;
Ok(SegmentUpdater(Arc::new(InnerSegmentUpdater {
active_metas: RwLock::new(Arc::new(index_meta)),
pool,
index,
segment_manager,
merge_policy: RwLock::new(Arc::new(Box::new(DefaultMergePolicy::default()))),
merge_policy: RwLock::new(Box::new(DefaultMergePolicy::default())),
merging_thread_id: AtomicUsize::default(),
merging_threads: RwLock::new(HashMap::new()),
generation: AtomicUsize::default(),
killed: AtomicBool::new(false),
stamper,
merge_operations: Default::default(),
})))
}
pub fn get_merge_policy(&self) -> Arc<Box<MergePolicy>> {
self.0.merge_policy.read().unwrap().clone()
/// Creates a new segment through the index and passes its id to the
/// segment manager's `write_segment` before returning the segment
/// to the caller.
pub fn new_segment(&self) -> Segment {
    let inner = &self.0;
    let segment = inner.index.new_segment();
    // The segment manager is told about the id before the caller gets the segment.
    inner.segment_manager.write_segment(segment.id());
    segment
}
pub fn get_merge_policy(&self) -> Box<MergePolicy> {
self.0.merge_policy.read().unwrap().box_clone()
}
pub fn set_merge_policy(&self, merge_policy: Box<MergePolicy>) {
let arc_merge_policy = Arc::new(merge_policy);
*self.0.merge_policy.write().unwrap() = arc_merge_policy;
*self.0.merge_policy.write().unwrap() = merge_policy;
}
fn get_merging_thread_id(&self) -> usize {
@@ -207,8 +195,7 @@ impl SegmentUpdater {
segment_updater.0.segment_manager.add_segment(segment_entry);
segment_updater.consider_merge_options();
true
})
.forget();
}).forget();
true
} else {
false
@@ -256,23 +243,20 @@ impl SegmentUpdater {
//
// Segment 1 from disk 1, Segment 1 from disk 2, etc.
commited_segment_metas.sort_by_key(|segment_meta| -(segment_meta.max_doc() as i32));
let index_meta = IndexMeta {
segments: commited_segment_metas,
schema: index.schema(),
save_metas(
commited_segment_metas,
index.schema(),
opstamp,
payload: commit_message,
};
save_metas(&index_meta, directory.box_clone().borrow_mut())
.expect("Could not save metas.");
self.store_meta(&index_meta);
commit_message,
directory.box_clone().borrow_mut(),
).expect("Could not save metas.");
}
}
pub fn garbage_collect_files(&self) -> Result<()> {
self.run_async(move |segment_updater| {
segment_updater.garbage_collect_files_exec();
})
.wait()
}).wait()
}
fn garbage_collect_files_exec(&self) {
@@ -283,94 +267,65 @@ impl SegmentUpdater {
.garbage_collect(|| self.0.segment_manager.list_files());
}
pub fn commit(&self, opstamp: u64, payload: Option<String>, soft: bool) -> Result<()> {
pub fn commit(&self, opstamp: u64, payload: Option<String>) -> Result<()> {
self.run_async(move |segment_updater| {
if segment_updater.is_alive() {
let segment_entries = segment_updater
.purge_deletes(opstamp)
.expect("Failed purge deletes");
if soft {
// Soft commit.
//
// The list `segment_entries` above is what we might want to use as searchable
// segment. However, we do not want to mark them as committed, and we want
// to keep the current set of committed segment.
segment_updater.0.segment_manager.soft_commit(opstamp, segment_entries);
// ... We do not save the meta file.
} else {
// Hard_commit. We register the new segment entries as committed.
segment_updater
.0
.segment_manager
.commit(opstamp, segment_entries);
// TODO error handling.
segment_updater.save_metas(opstamp, payload);
segment_updater.0.index.directory().flush().unwrap();
}
segment_updater.0.segment_manager.commit(segment_entries);
segment_updater.save_metas(opstamp, payload);
segment_updater.garbage_collect_files_exec();
segment_updater.consider_merge_options();
}
})
.wait()
}).wait()
}
pub fn start_merge(&self, segment_ids: &[SegmentId]) -> Result<Receiver<SegmentMeta>> {
let commit_opstamp = self.load_metas().opstamp;
let merge_operation = MergeOperation::new(
&self.0.merge_operations,
commit_opstamp,
segment_ids.to_vec(),
);
self.run_async(move |segment_updater| segment_updater.start_merge_impl(merge_operation))
.wait()?
}
/// Replaces the cached `active_metas` with a fresh `Arc` holding a clone
/// of `index_meta`.
///
/// Panics if the `active_metas` lock is poisoned.
fn store_meta(&self, index_meta: &IndexMeta) {
    // Build the new value first, then take the write lock only for the swap.
    let fresh_metas = Arc::new(index_meta.clone());
    let mut active_metas = self.0.active_metas.write().unwrap();
    *active_metas = fresh_metas;
}
fn load_metas(&self) -> Arc<IndexMeta> {
self.0.active_metas.read().unwrap().clone()
//let future_merged_segment = */
let segment_ids_vec = segment_ids.to_vec();
self.run_async(move |segment_updater| {
segment_updater.start_merge_impl(&segment_ids_vec[..])
}).wait()?
}
// `segment_ids` is required to be non-empty.
fn start_merge_impl(&self, merge_operation: MergeOperation) -> Result<Receiver<SegmentMeta>> {
assert!(
!merge_operation.segment_ids().is_empty(),
"Segment_ids cannot be empty."
);
fn start_merge_impl(&self, segment_ids: &[SegmentId]) -> Result<Receiver<SegmentMeta>> {
assert!(!segment_ids.is_empty(), "Segment_ids cannot be empty.");
let segment_updater_clone = self.clone();
let segment_entries: Vec<SegmentEntry> = self
.0
.segment_manager
.start_merge(merge_operation.segment_ids())?;
let segment_entries: Vec<SegmentEntry> = self.0.segment_manager.start_merge(segment_ids)?;
// let segment_ids_vec = merge_operation.segment_ids.to_vec();
let segment_ids_vec = segment_ids.to_vec();
let merging_thread_id = self.get_merging_thread_id();
info!(
"Starting merge thread #{} - {:?}",
merging_thread_id,
merge_operation.segment_ids()
merging_thread_id, segment_ids
);
let (merging_future_send, merging_future_recv) = oneshot();
let target_opstamp = self.0.stamper.stamp();
// first we need to apply deletes to our segment.
let merging_join_handle = thread::Builder::new()
.name(format!("mergingthread-{}", merging_thread_id))
.spawn(move || {
// first we need to apply deletes to our segment.
let merged_segment = segment_updater_clone.new_segment();
let merged_segment_id = merged_segment.id();
let merge_result = perform_merge(
&merge_operation,
&segment_updater_clone.0.index,
segment_entries,
merged_segment,
target_opstamp,
);
match merge_result {
Ok(after_merge_segment_entry) => {
let merged_segment_meta = after_merge_segment_entry.meta().clone();
segment_updater_clone
.end_merge(merge_operation, after_merge_segment_entry)
.end_merge(segment_ids_vec, after_merge_segment_entry)
.expect("Segment updater thread is corrupted.");
// the future may fail if the listener of the oneshot future
@@ -381,18 +336,13 @@ impl SegmentUpdater {
let _merging_future_res = merging_future_send.send(merged_segment_meta);
}
Err(e) => {
warn!(
"Merge of {:?} was cancelled: {:?}",
merge_operation.segment_ids(),
e
);
warn!("Merge of {:?} was cancelled: {:?}", segment_ids_vec, e);
// ... cancel merge
if cfg!(test) {
panic!("Merge failed.");
}
// As `merge_operation` will be dropped, the segment in merge state will
// be available for merge again.
// `merging_future_send` will be dropped, sending an error to the future.
segment_updater_clone.cancel_merge(&segment_ids_vec, merged_segment_id);
// merging_future_send will be dropped, sending an error to the future.
}
}
segment_updater_clone
@@ -402,8 +352,7 @@ impl SegmentUpdater {
.unwrap()
.remove(&merging_thread_id);
Ok(())
})
.expect("Failed to spawn a thread.");
}).expect("Failed to spawn a thread.");
self.0
.merging_threads
.write()
@@ -413,35 +362,16 @@ impl SegmentUpdater {
}
fn consider_merge_options(&self) {
let merge_segment_ids: HashSet<SegmentId> = self.0.merge_operations.segment_in_merge();
let (committed_segments, uncommitted_segments) =
get_mergeable_segments(&merge_segment_ids, &self.0.segment_manager);
get_mergeable_segments(&self.0.segment_manager);
// Committed segments cannot be merged with uncommitted_segments.
// We therefore consider merges using these two sets of segments independently.
let merge_policy = self.get_merge_policy();
let current_opstamp = self.0.stamper.stamp();
let mut merge_candidates: Vec<MergeOperation> = merge_policy
.compute_merge_candidates(&uncommitted_segments)
.into_iter()
.map(|merge_candidate| {
MergeOperation::new(&self.0.merge_operations, current_opstamp, merge_candidate.0)
})
.collect();
let commit_opstamp = self.load_metas().opstamp;
let committed_merge_candidates = merge_policy
.compute_merge_candidates(&committed_segments)
.into_iter()
.map(|merge_candidate| {
MergeOperation::new(&self.0.merge_operations, commit_opstamp, merge_candidate.0)
})
.collect::<Vec<_>>();
merge_candidates.extend(committed_merge_candidates.into_iter());
for merge_operation in merge_candidates {
match self.start_merge_impl(merge_operation) {
let mut merge_candidates = merge_policy.compute_merge_candidates(&uncommitted_segments);
let committed_merge_candidates = merge_policy.compute_merge_candidates(&committed_segments);
merge_candidates.extend_from_slice(&committed_merge_candidates[..]);
for MergeCandidate(segment_metas) in merge_candidates {
match self.start_merge_impl(&segment_metas) {
Ok(merge_future) => {
if let Err(e) = merge_future.fuse().poll() {
error!("The merge task failed quickly after starting: {:?}", e);
@@ -457,16 +387,31 @@ impl SegmentUpdater {
}
}
/// Forwards a merge cancellation to the segment manager.
///
/// `before_merge_segment_ids` are the ids of the segments that were being
/// merged; `after_merge_segment_entry` is (despite its name) the id of the
/// segment the merge was producing.
fn cancel_merge(
    &self,
    before_merge_segment_ids: &[SegmentId],
    after_merge_segment_entry: SegmentId,
) {
    let segment_manager = &self.0.segment_manager;
    segment_manager.cancel_merge(before_merge_segment_ids, after_merge_segment_entry);
}
fn end_merge(
&self,
merge_operation: MergeOperation,
before_merge_segment_ids: Vec<SegmentId>,
mut after_merge_segment_entry: SegmentEntry,
) -> Result<()> {
self.run_async(move |segment_updater| {
info!("End merge {:?}", after_merge_segment_entry.meta());
let mut delete_cursor = after_merge_segment_entry.delete_cursor().clone();
if let Some(delete_operation) = delete_cursor.get() {
let committed_opstamp = segment_updater.load_metas().opstamp;
let committed_opstamp = segment_updater
.0
.index
.load_metas()
.expect("Failed to read opstamp")
.opstamp;
if delete_operation.opstamp < committed_opstamp {
let index = &segment_updater.0.index;
let segment = index.segment(after_merge_segment_entry.meta().clone());
@@ -475,15 +420,16 @@ impl SegmentUpdater {
{
error!(
"Merge of {:?} was cancelled (advancing deletes failed): {:?}",
merge_operation.segment_ids(),
e
before_merge_segment_ids, e
);
// ... cancel merge
if cfg!(test) {
panic!("Merge failed.");
}
// ... cancel merge
// `merge_operations` are tracked. As it is dropped,
// the segment_ids will be available again for merge.
segment_updater.cancel_merge(
&before_merge_segment_ids,
after_merge_segment_entry.segment_id(),
);
return;
}
}
@@ -491,14 +437,13 @@ impl SegmentUpdater {
segment_updater
.0
.segment_manager
.end_merge(merge_operation.segment_ids(), after_merge_segment_entry);
.end_merge(&before_merge_segment_ids, after_merge_segment_entry);
segment_updater.consider_merge_options();
info!("save metas");
let previous_metas = segment_updater.load_metas();
segment_updater.save_metas(previous_metas.opstamp, previous_metas.payload.clone());
let previous_metas = segment_updater.0.index.load_metas().unwrap();
segment_updater.save_metas(previous_metas.opstamp, previous_metas.payload);
segment_updater.garbage_collect_files_exec();
})
.wait()
}).wait()
}
/// Wait for current merging threads.
@@ -517,25 +462,32 @@ impl SegmentUpdater {
/// Obsolete files will eventually be cleaned up
/// by the directory garbage collector.
pub fn wait_merging_thread(&self) -> Result<()> {
let mut num_segments: usize;
loop {
let merging_threads: HashMap<usize, JoinHandle<Result<()>>> = {
num_segments = self.0.segment_manager.num_segments();
let mut new_merging_threads = HashMap::new();
{
let mut merging_threads = self.0.merging_threads.write().unwrap();
mem::replace(merging_threads.deref_mut(), HashMap::new())
};
if merging_threads.is_empty() {
return Ok(());
mem::swap(&mut new_merging_threads, merging_threads.deref_mut());
}
debug!("wait merging thread {}", merging_threads.len());
for (_, merging_thread_handle) in merging_threads {
debug!("wait merging thread {}", new_merging_threads.len());
for (_, merging_thread_handle) in new_merging_threads {
merging_thread_handle
.join()
.map(|_| ())
.map_err(|_| TantivyError::ErrorInThread("Merging thread failed.".into()))?;
}
// Our merging thread may have queued their completed merged segment.
// Let's wait for that too.
// Our merging threads may have queued their completed merged segments; wait for those too.
self.run_async(move |_| {}).wait()?;
let new_num_segments = self.0.segment_manager.num_segments();
if new_num_segments >= num_segments {
break;
}
}
Ok(())
}
}
@@ -555,7 +507,7 @@ mod tests {
let index = Index::create_in_ram(schema);
// writing the segment
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
index_writer.set_merge_policy(Box::new(MergeWheneverPossible));
{
@@ -585,8 +537,9 @@ mod tests {
index_writer.delete_term(term);
assert!(index_writer.commit().is_ok());
}
let reader = index.reader().unwrap();
assert_eq!(reader.searcher().num_docs(), 302);
index.load_searchers().unwrap();
assert_eq!(index.searcher().num_docs(), 302);
{
index_writer
@@ -594,79 +547,8 @@ mod tests {
.expect("waiting for merging threads");
}
reader.reload().unwrap();
assert_eq!(reader.searcher().segment_readers().len(), 1);
assert_eq!(reader.searcher().num_docs(), 302);
}
#[test]
fn delete_all_docs() {
let mut schema_builder = Schema::builder();
let text_field = schema_builder.add_text_field("text", TEXT);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
// writing the segment
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
{
for _ in 0..100 {
index_writer.add_document(doc!(text_field=>"a"));
index_writer.add_document(doc!(text_field=>"b"));
}
assert!(index_writer.commit().is_ok());
}
{
for _ in 0..100 {
index_writer.add_document(doc!(text_field=>"c"));
index_writer.add_document(doc!(text_field=>"d"));
}
assert!(index_writer.commit().is_ok());
}
{
index_writer.add_document(doc!(text_field=>"e"));
index_writer.add_document(doc!(text_field=>"f"));
assert!(index_writer.commit().is_ok());
}
{
let seg_ids = index
.searchable_segment_ids()
.expect("Searchable segments failed.");
// docs exist, should have at least 1 segment
assert!(seg_ids.len() > 0);
}
{
let term_vals = vec!["a", "b", "c", "d", "e", "f"];
for term_val in term_vals {
let term = Term::from_field_text(text_field, term_val);
index_writer.delete_term(term);
assert!(index_writer.commit().is_ok());
}
}
{
index_writer
.wait_merging_threads()
.expect("waiting for merging threads");
}
let reader = index.reader().unwrap();
assert_eq!(reader.searcher().num_docs(), 0);
let seg_ids = index
.searchable_segment_ids()
.expect("Searchable segments failed.");
assert!(seg_ids.is_empty());
reader.reload().unwrap();
assert_eq!(reader.searcher().num_docs(), 0);
// empty segments should be erased
assert!(index.searchable_segment_metas().unwrap().is_empty());
assert!(reader.searcher().segment_readers().is_empty());
index.load_searchers().unwrap();
assert_eq!(index.searcher().segment_readers().len(), 1);
assert_eq!(index.searcher().num_docs(), 302);
}
}

View File

@@ -62,8 +62,7 @@ impl SegmentWriter {
segment.index().tokenizers().get(tokenizer_name)
}),
_ => None,
})
.collect();
}).collect();
Ok(SegmentWriter {
max_doc: 0,
multifield_postings,
@@ -111,18 +110,18 @@ impl SegmentWriter {
}
match *field_options.field_type() {
FieldType::HierarchicalFacet => {
let facets: Vec<&str> = field_values
let facets: Vec<&[u8]> = field_values
.iter()
.flat_map(|field_value| match *field_value.value() {
Value::Facet(ref facet) => Some(facet.encoded_str()),
Value::Facet(ref facet) => Some(facet.encoded_bytes()),
_ => {
panic!("Expected hierarchical facet");
}
})
.collect();
}).collect();
let mut term = Term::for_field(field); // we set the Term
for fake_str in facets {
for facet_bytes in facets {
let mut unordered_term_id_opt = None;
let fake_str = unsafe { str::from_utf8_unchecked(facet_bytes) };
FacetTokenizer.token_stream(fake_str).process(&mut |token| {
term.set_text(&token.text);
let unordered_term_id =
@@ -146,8 +145,7 @@ impl SegmentWriter {
.flat_map(|field_value| match *field_value.value() {
Value::Str(ref text) => Some(text.as_str()),
_ => None,
})
.collect();
}).collect();
if texts.is_empty() {
0
} else {
@@ -171,17 +169,6 @@ impl SegmentWriter {
}
}
}
FieldType::Date(ref int_option) => {
if int_option.is_indexed() {
for field_value in field_values {
let term = Term::from_field_i64(
field_value.field(),
field_value.value().date_value().timestamp(),
);
self.multifield_postings.subscribe(doc_id, &term);
}
}
}
FieldType::I64(ref int_option) => {
if int_option.is_indexed() {
for field_value in field_values {

View File

@@ -1,77 +1,50 @@
use std::ops::Range;
use std::sync::atomic::Ordering;
use std::sync::Arc;
// AtomicU64 has not landed in stable.
// For the moment let's just use AtomicUsize on
// x86_64 platforms, and a mutex on other platforms.
#[cfg(target_arch = "x86_64")]
#[cfg(target = "x86_64")]
mod archicture_impl {
use std::sync::atomic::{AtomicUsize, Ordering};
use std::sync::Arc;
#[derive(Default)]
pub struct AtomicU64Ersatz(AtomicUsize);
#[derive(Clone, Default)]
pub struct Stamper(Arc<AtomicU64>);
impl AtomicU64Ersatz {
pub fn new(first_opstamp: u64) -> AtomicU64Ersatz {
AtomicU64Ersatz(AtomicUsize::new(first_opstamp as usize))
impl Stamper {
pub fn new(first_opstamp: u64) -> Stamper {
Stamper(Arc::new(AtomicU64::new(first_opstamp)))
}
pub fn fetch_add(&self, val: u64, order: Ordering) -> u64 {
self.0.fetch_add(val as usize, order) as u64
pub fn stamp(&self) -> u64 {
self.0.fetch_add(1u64, Ordering::SeqCst) as u64
}
}
}
#[cfg(not(target_arch = "x86_64"))]
#[cfg(not(target = "x86_64"))]
mod archicture_impl {
use std::sync::atomic::Ordering;
/// Under other architectures, we rely on a mutex.
use std::sync::RwLock;
use std::sync::{Arc, Mutex};
#[derive(Default)]
pub struct AtomicU64Ersatz(RwLock<u64>);
#[derive(Clone, Default)]
pub struct Stamper(Arc<Mutex<u64>>);
impl AtomicU64Ersatz {
pub fn new(first_opstamp: u64) -> AtomicU64Ersatz {
AtomicU64Ersatz(RwLock::new(first_opstamp))
impl Stamper {
pub fn new(first_opstamp: u64) -> Stamper {
Stamper(Arc::new(Mutex::new(first_opstamp)))
}
pub fn fetch_add(&self, incr: u64, _order: Ordering) -> u64 {
let mut lock = self.0.write().unwrap();
let previous_val = *lock;
*lock = previous_val + incr;
pub fn stamp(&self) -> u64 {
let mut guard = self.0.lock().expect("Failed to lock the stamper");
let previous_val = *guard;
*guard = previous_val + 1;
previous_val
}
}
}
use self::archicture_impl::AtomicU64Ersatz;
#[derive(Clone, Default)]
pub struct Stamper(Arc<AtomicU64Ersatz>);
impl Stamper {
pub fn new(first_opstamp: u64) -> Stamper {
Stamper(Arc::new(AtomicU64Ersatz::new(first_opstamp)))
}
pub fn stamp(&self) -> u64 {
self.0.fetch_add(1u64, Ordering::SeqCst) as u64
}
/// Given a desired count `n`, `stamps` returns an iterator that
/// will supply `n` number of u64 stamps.
pub fn stamps(&self, n: u64) -> Range<u64> {
let start = self.0.fetch_add(n, Ordering::SeqCst);
Range {
start,
end: start + n,
}
}
}
pub use self::archicture_impl::Stamper;
#[cfg(test)]
mod test {
@@ -89,7 +62,5 @@ mod test {
assert_eq!(stamper.stamp(), 10u64);
assert_eq!(stamper_clone.stamp(), 11u64);
assert_eq!(stamper.stamps(3u64), (12..15));
assert_eq!(stamper.stamp(), 15u64);
}
}

View File

@@ -1,5 +1,6 @@
#![doc(html_logo_url = "http://fulmicoton.com/tantivy-logo/tantivy-logo.png")]
#![cfg_attr(all(feature = "unstable", test), feature(test))]
#![cfg_attr(feature = "cargo-clippy", feature(tool_lints))]
#![cfg_attr(feature = "cargo-clippy", allow(clippy::module_inception))]
#![doc(test(attr(allow(unused_variables), deny(warnings))))]
#![warn(missing_docs)]
@@ -75,9 +76,9 @@
//!
//! // # Searching
//!
//! let reader = index.reader()?;
//! index.load_searchers()?;
//!
//! let searcher = reader.searcher();
//! let searcher = index.searcher();
//!
//! let query_parser = QueryParser::for_index(&index, vec![title, body]);
//!
@@ -129,24 +130,25 @@ extern crate base64;
extern crate bit_set;
extern crate bitpacking;
extern crate byteorder;
extern crate scoped_pool;
extern crate combine;
extern crate crossbeam;
extern crate fnv;
extern crate fst;
extern crate fst_regex;
extern crate futures;
extern crate futures_cpupool;
extern crate htmlescape;
extern crate itertools;
extern crate levenshtein_automata;
#[cfg(feature = "mmap")]
extern crate memmap;
extern crate num_cpus;
extern crate owning_ref;
extern crate regex;
extern crate rust_stemmers;
extern crate scoped_pool;
extern crate serde;
extern crate stable_deref_trait;
extern crate tantivy_fst;
extern crate tempdir;
extern crate tempfile;
extern crate uuid;
@@ -169,12 +171,11 @@ extern crate maplit;
extern crate test;
#[macro_use]
extern crate downcast_rs;
extern crate downcast;
#[macro_use]
extern crate fail;
#[cfg(feature = "mmap")]
#[cfg(test)]
mod functional_test;
@@ -183,19 +184,18 @@ mod macros;
pub use error::TantivyError;
#[deprecated(since = "0.7.0", note = "please use `tantivy::TantivyError` instead")]
#[deprecated(
since = "0.7.0",
note = "please use `tantivy::TantivyError` instead"
)]
pub use error::TantivyError as Error;
extern crate census;
pub extern crate chrono;
extern crate owned_read;
/// Tantivy result.
pub type Result<T> = std::result::Result<T, error::TantivyError>;
/// Tantivy DateTime
pub type DateTime = chrono::DateTime<chrono::Utc>;
mod common;
mod core;
mod indexer;
@@ -216,11 +216,8 @@ pub mod space_usage;
pub mod store;
pub mod termdict;
mod reader;
pub use self::reader::{IndexReader, IndexReaderBuilder, ReloadPolicy};
mod snippet;
pub use self::snippet::{Snippet, SnippetGenerator};
pub use self::snippet::{SnippetGenerator, Snippet};
mod docset;
pub use self::docset::{DocSet, SkipResult};
@@ -238,7 +235,11 @@ pub use common::{i64_to_u64, u64_to_i64};
/// Expose the current version of tantivy, as well as
/// whether it was compiled with the simd compression.
pub fn version() -> &'static str {
env!("CARGO_PKG_VERSION")
if cfg!(feature = "simdcompression") {
concat!(env!("CARGO_PKG_VERSION"), "-simd")
} else {
concat!(env!("CARGO_PKG_VERSION"), "-nosimd")
}
}
/// Defines tantivy's merging strategy
@@ -307,7 +308,6 @@ mod tests {
use Index;
use IndexWriter;
use Postings;
use ReloadPolicy;
pub fn assert_nearly_equals(expected: f32, val: f32) {
assert!(
@@ -352,7 +352,7 @@ mod tests {
let index = Index::create_from_tempdir(schema).unwrap();
{
// writing the segment
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
{
let doc = doc!(text_field=>"af b");
index_writer.add_document(doc);
@@ -374,7 +374,7 @@ mod tests {
let mut schema_builder = Schema::builder();
let text_field = schema_builder.add_text_field("text", TEXT);
let index = Index::create_in_ram(schema_builder.build());
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
{
index_writer.add_document(doc!(text_field=>"a b c"));
index_writer.commit().unwrap();
@@ -396,8 +396,8 @@ mod tests {
index_writer.commit().unwrap();
}
{
let reader = index.reader().unwrap();
let searcher = reader.searcher();
index.load_searchers().unwrap();
let searcher = index.searcher();
let term_a = Term::from_field_text(text_field, "a");
assert_eq!(searcher.doc_freq(&term_a), 3);
let term_b = Term::from_field_text(text_field, "b");
@@ -416,7 +416,7 @@ mod tests {
let text_field = schema_builder.add_text_field("text", TEXT);
let index = Index::create_in_ram(schema_builder.build());
{
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
{
let doc = doc!(text_field=>"a b c");
index_writer.add_document(doc);
@@ -424,8 +424,8 @@ mod tests {
index_writer.commit().unwrap();
}
{
let index_reader = index.reader().unwrap();
let searcher = index_reader.searcher();
index.load_searchers().unwrap();
let searcher = index.searcher();
let reader = searcher.segment_reader(0);
{
let fieldnorm_reader = reader.get_fieldnorms_reader(text_field);
@@ -444,7 +444,7 @@ mod tests {
let text_field = schema_builder.add_text_field("text", TEXT);
let index = Index::create_in_ram(schema_builder.build());
{
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
{
let doc = doc!(text_field=>"a b c");
index_writer.add_document(doc);
@@ -460,8 +460,8 @@ mod tests {
index_writer.commit().unwrap();
}
{
let reader = index.reader().unwrap();
let searcher = reader.searcher();
index.load_searchers().unwrap();
let searcher = index.searcher();
let segment_reader: &SegmentReader = searcher.segment_reader(0);
let fieldnorms_reader = segment_reader.get_fieldnorms_reader(text_field);
assert_eq!(fieldnorms_reader.fieldnorm(0), 3);
@@ -489,14 +489,9 @@ mod tests {
let term_c = Term::from_field_text(text_field, "c");
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let reader = index
.reader_builder()
.reload_policy(ReloadPolicy::Manual)
.try_into()
.unwrap();
{
// writing the segment
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
// 0
index_writer.add_document(doc!(text_field=>"a b"));
// 1
@@ -515,35 +510,37 @@ mod tests {
index_writer.commit().unwrap();
}
{
reader.reload().unwrap();
let searcher = reader.searcher();
let segment_reader = searcher.segment_reader(0);
let inverted_index = segment_reader.inverted_index(text_field);
assert!(inverted_index
.read_postings(&term_abcd, IndexRecordOption::WithFreqsAndPositions)
.is_none());
index.load_searchers().unwrap();
let searcher = index.searcher();
let reader = searcher.segment_reader(0);
let inverted_index = reader.inverted_index(text_field);
assert!(
inverted_index
.read_postings(&term_abcd, IndexRecordOption::WithFreqsAndPositions)
.is_none()
);
{
let mut postings = inverted_index
.read_postings(&term_a, IndexRecordOption::WithFreqsAndPositions)
.unwrap();
assert!(advance_undeleted(&mut postings, segment_reader));
assert!(advance_undeleted(&mut postings, reader));
assert_eq!(postings.doc(), 5);
assert!(!advance_undeleted(&mut postings, segment_reader));
assert!(!advance_undeleted(&mut postings, reader));
}
{
let mut postings = inverted_index
.read_postings(&term_b, IndexRecordOption::WithFreqsAndPositions)
.unwrap();
assert!(advance_undeleted(&mut postings, segment_reader));
assert!(advance_undeleted(&mut postings, reader));
assert_eq!(postings.doc(), 3);
assert!(advance_undeleted(&mut postings, segment_reader));
assert!(advance_undeleted(&mut postings, reader));
assert_eq!(postings.doc(), 4);
assert!(!advance_undeleted(&mut postings, segment_reader));
assert!(!advance_undeleted(&mut postings, reader));
}
}
{
// writing the segment
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
// 0
index_writer.add_document(doc!(text_field=>"a b"));
// 1
@@ -551,36 +548,38 @@ mod tests {
index_writer.rollback().unwrap();
}
{
reader.reload().unwrap();
let searcher = reader.searcher();
let seg_reader = searcher.segment_reader(0);
let inverted_index = seg_reader.inverted_index(term_abcd.field());
index.load_searchers().unwrap();
let searcher = index.searcher();
let reader = searcher.segment_reader(0);
let inverted_index = reader.inverted_index(term_abcd.field());
assert!(inverted_index
.read_postings(&term_abcd, IndexRecordOption::WithFreqsAndPositions)
.is_none());
assert!(
inverted_index
.read_postings(&term_abcd, IndexRecordOption::WithFreqsAndPositions)
.is_none()
);
{
let mut postings = inverted_index
.read_postings(&term_a, IndexRecordOption::WithFreqsAndPositions)
.unwrap();
assert!(advance_undeleted(&mut postings, seg_reader));
assert!(advance_undeleted(&mut postings, reader));
assert_eq!(postings.doc(), 5);
assert!(!advance_undeleted(&mut postings, seg_reader));
assert!(!advance_undeleted(&mut postings, reader));
}
{
let mut postings = inverted_index
.read_postings(&term_b, IndexRecordOption::WithFreqsAndPositions)
.unwrap();
assert!(advance_undeleted(&mut postings, seg_reader));
assert!(advance_undeleted(&mut postings, reader));
assert_eq!(postings.doc(), 3);
assert!(advance_undeleted(&mut postings, seg_reader));
assert!(advance_undeleted(&mut postings, reader));
assert_eq!(postings.doc(), 4);
assert!(!advance_undeleted(&mut postings, seg_reader));
assert!(!advance_undeleted(&mut postings, reader));
}
}
{
// writing the segment
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
index_writer.add_document(doc!(text_field=>"a b"));
index_writer.delete_term(Term::from_field_text(text_field, "c"));
index_writer.rollback().unwrap();
@@ -588,36 +587,38 @@ mod tests {
index_writer.commit().unwrap();
}
{
reader.reload().unwrap();
let searcher = reader.searcher();
let segment_reader = searcher.segment_reader(0);
let inverted_index = segment_reader.inverted_index(term_abcd.field());
assert!(inverted_index
.read_postings(&term_abcd, IndexRecordOption::WithFreqsAndPositions)
.is_none());
index.load_searchers().unwrap();
let searcher = index.searcher();
let reader = searcher.segment_reader(0);
let inverted_index = reader.inverted_index(term_abcd.field());
assert!(
inverted_index
.read_postings(&term_abcd, IndexRecordOption::WithFreqsAndPositions)
.is_none()
);
{
let mut postings = inverted_index
.read_postings(&term_a, IndexRecordOption::WithFreqsAndPositions)
.unwrap();
assert!(!advance_undeleted(&mut postings, segment_reader));
assert!(!advance_undeleted(&mut postings, reader));
}
{
let mut postings = inverted_index
.read_postings(&term_b, IndexRecordOption::WithFreqsAndPositions)
.unwrap();
assert!(advance_undeleted(&mut postings, segment_reader));
assert!(advance_undeleted(&mut postings, reader));
assert_eq!(postings.doc(), 3);
assert!(advance_undeleted(&mut postings, segment_reader));
assert!(advance_undeleted(&mut postings, reader));
assert_eq!(postings.doc(), 4);
assert!(!advance_undeleted(&mut postings, segment_reader));
assert!(!advance_undeleted(&mut postings, reader));
}
{
let mut postings = inverted_index
.read_postings(&term_c, IndexRecordOption::WithFreqsAndPositions)
.unwrap();
assert!(advance_undeleted(&mut postings, segment_reader));
assert!(advance_undeleted(&mut postings, reader));
assert_eq!(postings.doc(), 4);
assert!(!advance_undeleted(&mut postings, segment_reader));
assert!(!advance_undeleted(&mut postings, reader));
}
}
}
@@ -625,15 +626,15 @@ mod tests {
#[test]
fn test_indexed_u64() {
let mut schema_builder = Schema::builder();
let field = schema_builder.add_u64_field("value", INDEXED);
let field = schema_builder.add_u64_field("value", INT_INDEXED);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
index_writer.add_document(doc!(field=>1u64));
index_writer.commit().unwrap();
let reader = index.reader().unwrap();
let searcher = reader.searcher();
index.load_searchers().unwrap();
let searcher = index.searcher();
let term = Term::from_field_u64(field, 1u64);
let mut postings = searcher
.segment_reader(0)
@@ -648,16 +649,16 @@ mod tests {
#[test]
fn test_indexed_i64() {
let mut schema_builder = Schema::builder();
let value_field = schema_builder.add_i64_field("value", INDEXED);
let value_field = schema_builder.add_i64_field("value", INT_INDEXED);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
let negative_val = -1i64;
index_writer.add_document(doc!(value_field => negative_val));
index_writer.commit().unwrap();
let reader = index.reader().unwrap();
let searcher = reader.searcher();
index.load_searchers().unwrap();
let searcher = index.searcher();
let term = Term::from_field_i64(value_field, negative_val);
let mut postings = searcher
.segment_reader(0)
@@ -676,11 +677,11 @@ mod tests {
let absent_field = schema_builder.add_text_field("text", TEXT);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_with_num_threads(2, 6_000_000).unwrap();
let mut index_writer = index.writer_with_num_threads(2, 40_000_000).unwrap();
index_writer.add_document(doc!(text_field=>"a"));
assert!(index_writer.commit().is_ok());
let reader = index.reader().unwrap();
let searcher = reader.searcher();
assert!(index.load_searchers().is_ok());
let searcher = index.searcher();
let segment_reader = searcher.segment_reader(0);
segment_reader.inverted_index(absent_field); //< should not panic
}
@@ -691,14 +692,9 @@ mod tests {
let text_field = schema_builder.add_text_field("text", TEXT);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let reader = index
.reader_builder()
.reload_policy(ReloadPolicy::Manual)
.try_into()
.unwrap();
// writing the segment
let mut index_writer = index.writer_with_num_threads(2, 6_000_000).unwrap();
let mut index_writer = index.writer_with_num_threads(2, 40_000_000).unwrap();
let add_document = |index_writer: &mut IndexWriter, val: &'static str| {
let doc = doc!(text_field=>val);
@@ -721,8 +717,8 @@ mod tests {
remove_document(&mut index_writer, "38");
remove_document(&mut index_writer, "34");
index_writer.commit().unwrap();
reader.reload().unwrap();
let searcher = reader.searcher();
index.load_searchers().unwrap();
let searcher = index.searcher();
assert_eq!(searcher.num_docs(), 6);
}
@@ -734,7 +730,7 @@ mod tests {
let index = Index::create_in_ram(schema);
{
// writing the segment
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
{
let doc = doc!(text_field=>"af af af bc bc");
index_writer.add_document(doc);
@@ -742,14 +738,16 @@ mod tests {
index_writer.commit().unwrap();
}
{
let index_reader = index.reader().unwrap();
let searcher = index_reader.searcher();
index.load_searchers().unwrap();
let searcher = index.searcher();
let reader = searcher.segment_reader(0);
let inverted_index = reader.inverted_index(text_field);
let term_abcd = Term::from_field_text(text_field, "abcd");
assert!(inverted_index
.read_postings(&term_abcd, IndexRecordOption::WithFreqsAndPositions)
.is_none());
assert!(
inverted_index
.read_postings(&term_abcd, IndexRecordOption::WithFreqsAndPositions)
.is_none()
);
let term_af = Term::from_field_text(text_field, "af");
let mut postings = inverted_index
.read_postings(&term_af, IndexRecordOption::WithFreqsAndPositions)
@@ -767,18 +765,18 @@ mod tests {
let text_field = schema_builder.add_text_field("text", TEXT);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let reader = index.reader().unwrap();
{
// writing the segment
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
index_writer.add_document(doc!(text_field=>"af af af b"));
index_writer.add_document(doc!(text_field=>"a b c"));
index_writer.add_document(doc!(text_field=>"a b c d"));
index_writer.commit().unwrap();
}
{
reader.reload().unwrap();
let searcher = reader.searcher();
index.load_searchers().unwrap();
let searcher = index.searcher();
let get_doc_ids = |terms: Vec<Term>| {
let query = BooleanQuery::new_multiterms_query(terms);
let topdocs = searcher.search(&query, &TestCollector).unwrap();
@@ -820,22 +818,25 @@ mod tests {
let text_field = schema_builder.add_text_field("text", TEXT);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let reader = index
.reader_builder()
.reload_policy(ReloadPolicy::Manual)
.try_into()
.unwrap();
assert_eq!(reader.searcher().num_docs(), 0u64);
{
// writing the segment
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
index_writer.add_document(doc!(text_field=>"af b"));
index_writer.add_document(doc!(text_field=>"a b c"));
index_writer.add_document(doc!(text_field=>"a b c d"));
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
{
let doc = doc!(text_field=>"af b");
index_writer.add_document(doc);
}
{
let doc = doc!(text_field=>"a b c");
index_writer.add_document(doc);
}
{
let doc = doc!(text_field=>"a b c d");
index_writer.add_document(doc);
}
index_writer.commit().unwrap();
}
reader.reload().unwrap();
assert_eq!(reader.searcher().num_docs(), 3u64);
index.searcher();
}
#[test]
@@ -862,7 +863,7 @@ mod tests {
let fast_field_unsigned = schema_builder.add_u64_field("unsigned", FAST);
let fast_field_signed = schema_builder.add_i64_field("signed", FAST);
let text_field = schema_builder.add_text_field("text", TEXT);
let stored_int_field = schema_builder.add_u64_field("text", STORED);
let stored_int_field = schema_builder.add_u64_field("text", INT_STORED);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
@@ -872,8 +873,9 @@ mod tests {
index_writer.add_document(document);
index_writer.commit().unwrap();
}
let reader = index.reader().unwrap();
let searcher = reader.searcher();
index.load_searchers().unwrap();
let searcher = index.searcher();
let segment_reader: &SegmentReader = searcher.segment_reader(0);
{
let fast_field_reader_res = segment_reader.fast_field_reader::<u64>(text_field);

View File

@@ -61,7 +61,7 @@ macro_rules! doc(
};
// if there is a trailing comma retry with the trailing comma stripped.
($($field:expr => $value:expr),+ ,) => {
doc!( $( $field => $value ), *)
doc!( $( $field => $value ), *);
};
);
@@ -77,10 +77,10 @@ mod test {
let likes = schema_builder.add_u64_field("num_u64", FAST);
let _schema = schema_builder.build();
let _doc = doc!(
title => "Life Aquatic",
author => "Wes Anderson",
likes => 4u64
);
title => "Life Aquatic",
author => "Wes Anderson",
likes => 4u64
);
}
#[test]
@@ -91,9 +91,9 @@ mod test {
let likes = schema_builder.add_u64_field("num_u64", FAST);
let _schema = schema_builder.build();
let _doc = doc!(
title => "Life Aquatic",
author => "Wes Anderson",
likes => 4u64,
);
title => "Life Aquatic",
author => "Wes Anderson",
likes => 4u64,
);
}
}

View File

@@ -34,6 +34,10 @@ const COMPRESSION_BLOCK_SIZE: usize = BitPacker4x::BLOCK_LEN;
const LONG_SKIP_IN_BLOCKS: usize = 1_024;
const LONG_SKIP_INTERVAL: u64 = (LONG_SKIP_IN_BLOCKS * COMPRESSION_BLOCK_SIZE) as u64;
lazy_static! {
static ref BIT_PACKER: BitPacker4x = BitPacker4x::new();
}
#[cfg(test)]
pub mod tests {

View File

@@ -1,23 +1,4 @@
/// Positions works as a long sequence of compressed block.
/// All terms are chained one after the other.
///
/// When accessing the position of a term, we get a positions_idx from the `Terminfo`.
/// This means we need to skip to the `nth` positions efficiently.
///
/// This is done thanks to two levels of skipping that we refer to in the code
/// as `long_skip` and `short_skip`.
///
/// The `long_skip` makes it possible to skip every 1_024 compression blocks (= 131_072 positions).
/// Skipping offsets are simply stored one after the other, each as an offset encoded over 8 bytes.
///
/// We find the number of long skips, as `n / long_skip`.
///
/// Blocks are compressed using bitpacking, so `skip_read` contains, for every block, the number
/// of bits (values can go from 0 bits to 32 bits) required to decompress that block.
///
/// A given block obviously takes `(128 x num_bit_for_the_block / num_bits_in_a_byte)`,
/// so skipping a block without decompressing it is just a matter of advancing that many
/// bytes.
use super::BIT_PACKER;
use bitpacking::{BitPacker, BitPacker4x};
use common::{BinarySerializable, FixedSize};
use directory::ReadOnlySource;
@@ -27,65 +8,9 @@ use positions::LONG_SKIP_INTERVAL;
use positions::LONG_SKIP_IN_BLOCKS;
use postings::compression::compressed_block_size;
/// Holds the data sources from which a `PositionReader` can be created
/// for an arbitrary position offset.
struct Positions {
/// Bitpacker handed over to every `PositionReader` that gets created.
bit_packer: BitPacker4x,
/// Short skip data: the reader advances one byte in it per compressed block.
skip_source: ReadOnlySource,
/// Bitpacked position blocks.
position_source: ReadOnlySource,
/// Long skip table: one `u64` (8 bytes) per long skip.
long_skip_source: ReadOnlySource,
}
impl Positions {
    /// Builds a `Positions` from the position data and the associated skip data.
    ///
    /// The last `u32::SIZE_IN_BYTES` bytes of `skip_source` are read as the number of
    /// long skips. The `u64::SIZE_IN_BYTES * num_long_skips` bytes right before this
    /// footer form the long skip table, and everything before it is the short skip data.
    ///
    /// # Panics
    ///
    /// Panics with `"Index corrupted"` if the footer cannot be deserialized.
    pub fn new(position_source: ReadOnlySource, skip_source: ReadOnlySource) -> Positions {
        let total_len = skip_source.len();
        let footer_start = total_len - u32::SIZE_IN_BYTES;
        let (body, footer) = skip_source.split(footer_start);
        let mut footer_bytes: &[u8] = footer.as_slice();
        let num_long_skips = u32::deserialize(&mut footer_bytes).expect("Index corrupted");
        let long_skip_start = body.len() - u64::SIZE_IN_BYTES * (num_long_skips as usize);
        let (short_skips, long_skips) = body.split(long_skip_start);
        Positions {
            bit_packer: BitPacker4x::new(),
            skip_source: short_skips,
            long_skip_source: long_skips,
            position_source,
        }
    }

    /// Returns the offset of the block associated to the given `long_skip_id`.
    ///
    /// One `long_skip_id` means `LONG_SKIP_IN_BLOCKS` blocks.
    /// The id `0` is not stored in the table: its offset is always `0`.
    fn long_skip(&self, long_skip_id: usize) -> u64 {
        match long_skip_id {
            0 => 0,
            id => {
                // Entry `id` lives at index `id - 1`, each entry being 8 bytes long.
                let entry_start = (id - 1) * 8;
                let mut entry: &[u8] = &self.long_skip_source.as_slice()[entry_start..][..8];
                u64::deserialize(&mut entry).expect("Index corrupted")
            }
        }
    }

    /// Creates a `PositionReader` positioned on the `offset`-th position.
    ///
    /// The long skip table is used to jump close to the target, and the
    /// remainder is covered by `PositionReader::skip`.
    fn reader(&self, offset: u64) -> PositionReader {
        let long_skip_id = (offset / LONG_SKIP_INTERVAL) as usize;
        let remaining_positions = (offset % LONG_SKIP_INTERVAL) as usize;
        let position_start = self.long_skip(long_skip_id) as usize;

        let mut position_read = OwnedRead::new(self.position_source.clone());
        position_read.advance(position_start);

        // The short skip data holds one byte per block.
        let mut skip_read = OwnedRead::new(self.skip_source.clone());
        skip_read.advance(long_skip_id * LONG_SKIP_IN_BLOCKS);

        let mut position_reader = PositionReader {
            bit_packer: self.bit_packer,
            skip_read,
            position_read,
            inner_offset: 0,
            buffer: Box::new([0u32; 128]),
            ahead: None,
        };
        position_reader.skip(remaining_positions);
        position_reader
    }
}
pub struct PositionReader {
skip_read: OwnedRead,
position_read: OwnedRead,
bit_packer: BitPacker4x,
inner_offset: usize,
buffer: Box<[u32; 128]>,
ahead: Option<usize>, // if None, no block is loaded.
@@ -102,7 +27,6 @@ pub struct PositionReader {
// If the requested number of els ends exactly at a given block, the next
// block is not decompressed.
fn read_impl(
bit_packer: BitPacker4x,
mut position: &[u8],
buffer: &mut [u32; 128],
mut inner_offset: usize,
@@ -113,23 +37,21 @@ fn read_impl(
let mut output_len = output.len();
let mut ahead = 0;
loop {
let available_len = COMPRESSION_BLOCK_SIZE - inner_offset;
// We have enough elements in the current block.
// Let's copy the requested elements in the output buffer,
// and return.
let available_len = 128 - inner_offset;
if output_len <= available_len {
output[output_start..].copy_from_slice(&buffer[inner_offset..][..output_len]);
return ahead;
} else {
output[output_start..][..available_len].copy_from_slice(&buffer[inner_offset..]);
output_len -= available_len;
output_start += available_len;
inner_offset = 0;
let num_bits = num_bits[ahead];
BitPacker4x::new().decompress(position, &mut buffer[..], num_bits);
let block_len = compressed_block_size(num_bits);
position = &position[block_len..];
ahead += 1;
}
output[output_start..][..available_len].copy_from_slice(&buffer[inner_offset..]);
output_len -= available_len;
output_start += available_len;
inner_offset = 0;
let num_bits = num_bits[ahead];
bit_packer.decompress(position, &mut buffer[..], num_bits);
let block_len = compressed_block_size(num_bits);
position = &position[block_len..];
ahead += 1;
}
}
@@ -139,7 +61,35 @@ impl PositionReader {
skip_source: ReadOnlySource,
offset: u64,
) -> PositionReader {
Positions::new(position_source, skip_source).reader(offset)
let skip_len = skip_source.len();
let (body, footer) = skip_source.split(skip_len - u32::SIZE_IN_BYTES);
let num_long_skips = u32::deserialize(&mut footer.as_slice()).expect("Index corrupted");
let body_split = body.len() - u64::SIZE_IN_BYTES * (num_long_skips as usize);
let (skip_body, long_skips) = body.split(body_split);
let long_skip_id = (offset / LONG_SKIP_INTERVAL) as usize;
let small_skip = (offset - (long_skip_id as u64) * (LONG_SKIP_INTERVAL as u64)) as usize;
let offset_num_bytes: u64 = {
if long_skip_id > 0 {
let mut long_skip_blocks: &[u8] =
&long_skips.as_slice()[(long_skip_id - 1) * 8..][..8];
u64::deserialize(&mut long_skip_blocks).expect("Index corrupted") * 16
} else {
0
}
};
let mut position_read = OwnedRead::new(position_source);
position_read.advance(offset_num_bytes as usize);
let mut skip_read = OwnedRead::new(skip_body);
skip_read.advance(long_skip_id * LONG_SKIP_IN_BLOCKS);
let mut position_reader = PositionReader {
skip_read,
position_read,
inner_offset: 0,
buffer: Box::new([0u32; 128]),
ahead: None,
};
position_reader.skip(small_skip);
position_reader
}
/// Fills a buffer with the next `output.len()` integers.
@@ -151,13 +101,10 @@ impl PositionReader {
if self.ahead != Some(0) {
// the block currently available is not the block
// for the current position
self.bit_packer
.decompress(position_data, self.buffer.as_mut(), num_bits);
self.ahead = Some(0);
BIT_PACKER.decompress(position_data, self.buffer.as_mut(), num_bits);
}
let block_len = compressed_block_size(num_bits);
self.ahead = Some(read_impl(
self.bit_packer,
&position_data[block_len..],
self.buffer.as_mut(),
self.inner_offset,
@@ -186,13 +133,14 @@ impl PositionReader {
}
});
let skip_len_in_bits = self.skip_read.as_ref()[..num_blocks_to_advance]
let skip_len = self.skip_read.as_ref()[..num_blocks_to_advance]
.iter()
.map(|num_bits| *num_bits as usize)
.cloned()
.map(|num_bit| num_bit as usize)
.sum::<usize>()
* COMPRESSION_BLOCK_SIZE;
let skip_len_in_bytes = skip_len_in_bits / 8;
* (COMPRESSION_BLOCK_SIZE / 8);
self.skip_read.advance(num_blocks_to_advance);
self.position_read.advance(skip_len_in_bytes);
self.position_read.advance(skip_len);
}
}

View File

@@ -1,30 +1,29 @@
use super::BIT_PACKER;
use bitpacking::BitPacker;
use bitpacking::BitPacker4x;
use common::BinarySerializable;
use common::CountingWriter;
use positions::{COMPRESSION_BLOCK_SIZE, LONG_SKIP_INTERVAL};
use std::io::{self, Write};
use std::io;
pub struct PositionSerializer<W: io::Write> {
bit_packer: BitPacker4x,
write_stream: CountingWriter<W>,
write_stream: W,
write_skiplist: W,
block: Vec<u32>,
buffer: Vec<u8>,
num_ints: u64,
long_skips: Vec<u64>,
cumulated_num_bits: u64,
}
impl<W: io::Write> PositionSerializer<W> {
pub fn new(write_stream: W, write_skiplist: W) -> PositionSerializer<W> {
PositionSerializer {
bit_packer: BitPacker4x::new(),
write_stream: CountingWriter::wrap(write_stream),
write_stream,
write_skiplist,
block: Vec::with_capacity(128),
buffer: vec![0u8; 128 * 4],
num_ints: 0u64,
long_skips: Vec::new(),
cumulated_num_bits: 0u64,
}
}
@@ -51,15 +50,14 @@ impl<W: io::Write> PositionSerializer<W> {
}
fn flush_block(&mut self) -> io::Result<()> {
let num_bits = self.bit_packer.num_bits(&self.block[..]);
let num_bits = BIT_PACKER.num_bits(&self.block[..]);
self.cumulated_num_bits += u64::from(num_bits);
self.write_skiplist.write_all(&[num_bits])?;
let written_len = self
.bit_packer
.compress(&self.block[..], &mut self.buffer, num_bits);
let written_len = BIT_PACKER.compress(&self.block[..], &mut self.buffer, num_bits);
self.write_stream.write_all(&self.buffer[..written_len])?;
self.block.clear();
if (self.num_ints % LONG_SKIP_INTERVAL) == 0u64 {
self.long_skips.push(self.write_stream.written_bytes());
self.long_skips.push(self.cumulated_num_bits);
}
Ok(())
}

View File

@@ -1,229 +0,0 @@
/// This module defines the logic used to search for a doc in a given
/// block (at most 128 docs).
///
/// Searching within a block is a hotspot when running intersections,
/// so it was worth defining it in its own module.
#[cfg(target_arch = "x86_64")]
mod sse2 {
    use postings::compression::COMPRESSION_BLOCK_SIZE;
    use std::arch::x86_64::__m128i as DataType;
    use std::arch::x86_64::_mm_add_epi32 as op_add;
    use std::arch::x86_64::_mm_cmplt_epi32 as op_lt;
    use std::arch::x86_64::_mm_loadu_si128 as op_load; // no alignment requirement
    use std::arch::x86_64::_mm_set1_epi32 as set1;
    use std::arch::x86_64::_mm_setzero_si128 as set0;
    use std::arch::x86_64::_mm_sub_epi32 as op_sub;
    use std::arch::x86_64::_mm_xor_si128 as op_xor;
    use std::arch::x86_64::{_mm_cvtsi128_si32, _mm_shuffle_epi32};

    // Shuffle masks used for the final horizontal sum of the 4 lanes.
    // 78 = 0b01_00_11_10 swaps the two 64-bit halves,
    // 177 = 0b10_11_00_01 swaps the lanes within each half.
    const MASK1: i32 = 78;
    const MASK2: i32 = 177;

    /// Performs an exhaustive linear search over a full block.
    ///
    /// There is no early exit here. We simply count the
    /// number of elements that are `< target` (unsigned comparison).
    ///
    /// # Panics
    ///
    /// Panics if `arr` does not contain exactly `COMPRESSION_BLOCK_SIZE` elements.
    pub fn linear_search_sse2_128(arr: &[u32], target: u32) -> usize {
        // The loop below reads `COMPRESSION_BLOCK_SIZE` elements unconditionally.
        // Checking the length here is what makes this safe function sound.
        assert_eq!(
            arr.len(),
            COMPRESSION_BLOCK_SIZE,
            "the SSE2 linear search only works on full blocks"
        );
        let ptr = arr.as_ptr() as *const DataType;
        // SAFETY:
        // - SSE2 is part of the x86_64 baseline, so the intrinsics are available.
        // - The assert above guarantees that the `COMPRESSION_BLOCK_SIZE / 4`
        //   vectors read below are all within `arr`.
        // - Unaligned loads are used, so `arr` has no alignment requirement.
        unsafe {
            // SSE2 only offers a signed comparison. Flipping the sign bit of both
            // operands maps the unsigned order onto the signed order.
            let sign_flip = set1(i32::min_value());
            // `as i32` only reinterprets the bits of `target`.
            let vkey = op_xor(set1(target as i32), sign_flip);
            let mut cnt = set0();
            // We work over 4 `__m128i` at a time.
            // A single `__m128i` actually contains 4 `u32`.
            for i in 0..COMPRESSION_BLOCK_SIZE / (4 * 4) {
                let chunk = ptr.add(i * 4);
                let cmp1 = op_lt(op_xor(op_load(chunk), sign_flip), vkey);
                let cmp2 = op_lt(op_xor(op_load(chunk.add(1)), sign_flip), vkey);
                let cmp3 = op_lt(op_xor(op_load(chunk.add(2)), sign_flip), vkey);
                let cmp4 = op_lt(op_xor(op_load(chunk.add(3)), sign_flip), vkey);
                // A lane that matches is set to `-1`, hence the subtraction.
                let sum = op_add(op_add(cmp1, cmp2), op_add(cmp3, cmp4));
                cnt = op_sub(cnt, sum);
            }
            // Horizontal sum of the 4 lanes.
            cnt = op_add(cnt, _mm_shuffle_epi32(cnt, MASK1));
            cnt = op_add(cnt, _mm_shuffle_epi32(cnt, MASK2));
            _mm_cvtsi128_si32(cnt) as usize
        }
    }

    #[cfg(test)]
    mod test {
        use super::linear_search_sse2_128;

        #[test]
        fn test_linear_search_sse2_128_u32() {
            // Exercises the search over values of different magnitudes.
            for shift in 0..23 {
                let arr: Vec<u32> = (0..128).map(|el| (el * 2 + 1) << shift).collect();
                assert_eq!(linear_search_sse2_128(&arr, arr[64] + 1), 65);
            }
        }

        #[test]
        fn test_linear_search_sse2_128_values_above_i32_max() {
            let base = 1u32 << 31;
            let arr: Vec<u32> = (0..128).map(|el| base + el * 2).collect();
            assert_eq!(linear_search_sse2_128(&arr, arr[0]), 0);
            assert_eq!(linear_search_sse2_128(&arr, arr[64] + 1), 65);
            assert_eq!(linear_search_sse2_128(&arr, arr[127]), 127);
        }
    }
}
/// Counts the elements of `arr` that are strictly smaller than `target`.
///
/// The whole slice is always visited: there is no early exit, as such an
/// exit is very difficult to predict for the CPU.
///
/// Coupled with `exponential_search`, this function is likely
/// to be called with the same `len`.
fn linear_search(arr: &[u32], target: u32) -> usize {
    arr.iter().filter(|&&el| el < target).count()
}
/// Narrows down the range of `arr` in which `target` has to be searched.
///
/// The values at the indices `1, 3, 7, 15, 31, 63` are probed in that order.
/// The function returns `(begin, end)`, where `end` is the first probed index
/// holding a value strictly greater than `target` (or `arr.len()` if there is none)
/// and `begin` is the last probed index before it (or `0` if there is none).
fn exponential_search(arr: &[u32], target: u32) -> (usize, usize) {
    const PIVOTS: [usize; 6] = [1, 3, 7, 15, 31, 63];
    let len = arr.len();
    let mut lower = 0;
    for pivot in PIVOTS.iter().cloned().take_while(|&pivot| pivot < len) {
        if arr[pivot] > target {
            return (lower, pivot);
        }
        lower = pivot;
    }
    (lower, len)
}
/// Counts the elements of `block_docs` smaller than `target`, by first
/// bounding the search with `exponential_search` and then running
/// `linear_search` on the resulting range only.
fn galloping(block_docs: &[u32], target: u32) -> usize {
    let (lower, upper) = exponential_search(block_docs, target);
    let candidates = &block_docs[lower..upper];
    lower + linear_search(candidates, target)
}
/// Tantivy may rely on SIMD instructions to search for a specific document within
/// a given block.
///
/// The variant selects which implementation `search_in_block` relies on.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum BlockSearcher {
    /// SSE2 implementation. This variant only exists on `x86_64`.
    #[cfg(target_arch = "x86_64")]
    SSE2,
    /// Portable implementation, available on every platform.
    Scalar,
}
impl BlockSearcher {
    /// Searches for the first index holding an element greater or equal to
    /// the target.
    ///
    /// The result should be equivalent to
    /// ```ignore
    /// block[..]
    ///     .iter()
    ///     .take_while(|&&val| val < target)
    ///     .count()
    /// ```
    ///
    /// The `start` argument is only a hint that the result is
    /// greater or equal to `start`. The implementation may or may not use
    /// it for optimization.
    ///
    /// # Assumption
    ///
    /// - The array len is > `start`.
    /// - The block is sorted.
    /// - The target is assumed greater or equal to `arr[start]`.
    /// - The target is assumed smaller or equal to the last element of the block.
    ///
    /// Currently the scalar implementation starts with an exponential search, and
    /// then operates a linear search in the resulting subarray.
    ///
    /// If SSE2 instructions are available in the `(platform, running CPU)`,
    /// then we use a different implementation that does an exhaustive linear search over
    /// the full block whenever the block is full (`len == 128`). It is surprisingly faster,
    /// most likely because of the lack of branch.
    pub fn search_in_block(&self, block_docs: &[u32], start: usize, target: u32) -> usize {
        #[cfg(target_arch = "x86_64")]
        {
            use postings::compression::COMPRESSION_BLOCK_SIZE;
            // The SSE2 implementation only handles full blocks, and ignores `start`.
            let is_full_block = block_docs.len() == COMPRESSION_BLOCK_SIZE;
            if is_full_block && *self == BlockSearcher::SSE2 {
                return sse2::linear_search_sse2_128(block_docs, target);
            }
        }
        let remaining_docs = &block_docs[start..];
        start + galloping(remaining_docs, target)
    }
}
impl Default for BlockSearcher {
    /// Picks the SSE2 implementation when the running CPU is detected to support
    /// it (`x86_64` only), and the scalar implementation otherwise.
    fn default() -> BlockSearcher {
        #[cfg(target_arch = "x86_64")]
        {
            // Runtime detection of the feature.
            let has_sse2 = is_x86_feature_detected!("sse2");
            if has_sse2 {
                return BlockSearcher::SSE2;
            }
        }
        BlockSearcher::Scalar
    }
}
#[cfg(test)]
mod tests {
    use super::exponential_search;
    use super::linear_search;
    use super::BlockSearcher;
    use postings::compression::COMPRESSION_BLOCK_SIZE;

    /// A full block, aligned on 16 bytes so that it is a valid input
    /// for SIMD implementations relying on aligned loads.
    #[repr(align(16))]
    struct AlignedBlock([u32; COMPRESSION_BLOCK_SIZE]);

    #[test]
    fn test_linear_search() {
        let len: usize = 50;
        let arr: Vec<u32> = (0..len).map(|el| 1u32 + (el as u32) * 2).collect();
        for target in 1..*arr.last().unwrap() {
            let res = linear_search(&arr[..], target);
            if res > 0 {
                assert!(arr[res - 1] < target);
            }
            if res < len {
                assert!(arr[res] >= target);
            }
        }
    }

    #[test]
    fn test_exponential_search() {
        assert_eq!(exponential_search(&[1, 2], 0), (0, 1));
        assert_eq!(exponential_search(&[1, 2], 1), (0, 1));
        assert_eq!(
            exponential_search(&[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11], 7),
            (3, 7)
        );
    }

    /// Checks `search_in_block` against the reference implementation,
    /// for every valid `start` hint.
    fn util_test_search_in_block(block_searcher: BlockSearcher, block: &[u32], target: u32) {
        let cursor = search_in_block_trivial_but_slow(block, target);
        for i in 0..cursor {
            assert_eq!(block_searcher.search_in_block(block, i, target), cursor);
        }
    }

    /// Runs `util_test_search_in_block` for every value of the block,
    /// as well as for the value right before it.
    fn util_test_search_in_block_all(block_searcher: BlockSearcher, block: &[u32]) {
        use std::collections::HashSet;
        let mut targets = HashSet::new();
        for (i, val) in block.iter().cloned().enumerate() {
            if i > 0 {
                targets.insert(val - 1);
            }
            targets.insert(val);
        }
        for target in targets {
            util_test_search_in_block(block_searcher, block, target);
        }
    }

    /// Reference implementation.
    fn search_in_block_trivial_but_slow(block: &[u32], target: u32) -> usize {
        block.iter().take_while(|&&val| val < target).count()
    }

    fn test_search_in_block_util(block_searcher: BlockSearcher) {
        // Partial blocks.
        for len in 1u32..128u32 {
            let v: Vec<u32> = (0..len).map(|i| i * 2).collect();
            util_test_search_in_block_all(block_searcher, &v[..]);
        }
        // Full block: this is the only length for which the SIMD
        // implementation is actually used.
        let mut full_block = AlignedBlock([0u32; COMPRESSION_BLOCK_SIZE]);
        for (i, val) in full_block.0.iter_mut().enumerate() {
            *val = (i as u32) * 2;
        }
        util_test_search_in_block_all(block_searcher, &full_block.0[..]);
    }

    #[test]
    fn test_search_in_block_scalar() {
        test_search_in_block_util(BlockSearcher::Scalar);
    }

    #[cfg(target_arch = "x86_64")]
    #[test]
    fn test_search_in_block_sse2() {
        test_search_in_block_util(BlockSearcher::SSE2);
    }
}

View File

@@ -43,14 +43,9 @@ impl BlockEncoder {
}
}
/// We ensure that the OutputBuffer is align on 128 bits
/// in order to run SSE2 linear search on it.
#[repr(align(128))]
struct OutputBuffer([u32; COMPRESSION_BLOCK_SIZE + 1]);
pub struct BlockDecoder {
bitpacker: BitPacker4x,
output: OutputBuffer,
pub output: [u32; COMPRESSION_BLOCK_SIZE + 1],
pub output_len: usize,
}
@@ -64,7 +59,7 @@ impl BlockDecoder {
output[COMPRESSION_BLOCK_SIZE] = 0u32;
BlockDecoder {
bitpacker: BitPacker4x::new(),
output: OutputBuffer(output),
output,
output_len: 0,
}
}
@@ -77,23 +72,23 @@ impl BlockDecoder {
) -> usize {
self.output_len = COMPRESSION_BLOCK_SIZE;
self.bitpacker
.decompress_sorted(offset, &compressed_data, &mut self.output.0, num_bits)
.decompress_sorted(offset, &compressed_data, &mut self.output, num_bits)
}
pub fn uncompress_block_unsorted(&mut self, compressed_data: &[u8], num_bits: u8) -> usize {
self.output_len = COMPRESSION_BLOCK_SIZE;
self.bitpacker
.decompress(&compressed_data, &mut self.output.0, num_bits)
.decompress(&compressed_data, &mut self.output, num_bits)
}
#[inline]
pub fn output_array(&self) -> &[u32] {
&self.output.0[..self.output_len]
&self.output[..self.output_len]
}
#[inline]
pub fn output(&self, idx: usize) -> u32 {
self.output.0[idx]
self.output[idx]
}
}
@@ -164,12 +159,12 @@ impl VIntDecoder for BlockDecoder {
num_els: usize,
) -> usize {
self.output_len = num_els;
vint::uncompress_sorted(compressed_data, &mut self.output.0[..num_els], offset)
vint::uncompress_sorted(compressed_data, &mut self.output[..num_els], offset)
}
fn uncompress_vint_unsorted<'a>(&mut self, compressed_data: &'a [u8], num_els: usize) -> usize {
self.output_len = num_els;
vint::uncompress_unsorted(compressed_data, &mut self.output.0[..num_els])
vint::uncompress_unsorted(compressed_data, &mut self.output[..num_els])
}
}

View File

@@ -2,7 +2,6 @@
Postings module (also called inverted index)
*/
mod block_search;
pub(crate) mod compression;
/// Postings module
///
@@ -17,8 +16,6 @@ mod skip;
mod stacker;
mod term_info;
pub(crate) use self::block_search::BlockSearcher;
pub(crate) use self::postings_writer::MultiFieldPostingsWriter;
pub use self::serializer::{FieldSerializer, InvertedIndexSerializer};
@@ -34,6 +31,7 @@ pub(crate) use self::stacker::compute_table_size;
pub use common::HasLen;
pub(crate) const USE_SKIP_INFO_LIMIT: u32 = COMPRESSION_BLOCK_SIZE as u32;
pub(crate) type UnorderedTermId = u64;
#[cfg_attr(feature = "cargo-clippy", allow(clippy::enum_variant_names))]
@@ -60,7 +58,7 @@ pub mod tests {
use rand::{Rng, SeedableRng};
use schema::Field;
use schema::IndexRecordOption;
use schema::{Document, Schema, Term, INDEXED, STRING, TEXT};
use schema::{Document, Schema, Term, INT_INDEXED, STRING, TEXT};
use std::iter;
use DocId;
use Score;
@@ -103,11 +101,14 @@ pub mod tests {
}
index_writer.add_document(doc!(title => r#"abc be be be be abc"#));
index_writer.commit().unwrap();
index.load_searchers().unwrap();
let searcher = index.reader().unwrap().searcher();
let searcher = index.searcher();
let inverted_index = searcher.segment_reader(0u32).inverted_index(title);
let term = Term::from_field_text(title, "abc");
let mut positions = Vec::new();
{
let mut postings = inverted_index
.read_postings(&term, IndexRecordOption::WithFreqsAndPositions)
@@ -220,10 +221,12 @@ pub mod tests {
}
{
let term_a = Term::from_field_text(text_field, "abcdef");
assert!(segment_reader
.inverted_index(term_a.field())
.read_postings(&term_a, IndexRecordOption::WithFreqsAndPositions)
.is_none());
assert!(
segment_reader
.inverted_index(term_a.field())
.read_postings(&term_a, IndexRecordOption::WithFreqsAndPositions)
.is_none()
);
}
{
let term_a = Term::from_field_text(text_field, "a");
@@ -279,7 +282,7 @@ pub mod tests {
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
{
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
{
let mut doc = Document::default();
doc.add_text(text_field, "g b b d c g c");
@@ -292,8 +295,9 @@ pub mod tests {
}
assert!(index_writer.commit().is_ok());
}
index.load_searchers().unwrap();
let term_a = Term::from_field_text(text_field, "a");
let searcher = index.reader().unwrap().searcher();
let searcher = index.searcher();
let segment_reader = searcher.segment_reader(0);
let mut postings = segment_reader
.inverted_index(text_field)
@@ -315,12 +319,12 @@ pub mod tests {
let index = {
let mut schema_builder = Schema::builder();
let value_field = schema_builder.add_u64_field("value", INDEXED);
let value_field = schema_builder.add_u64_field("value", INT_INDEXED);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
{
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
for i in 0..num_docs {
let mut doc = Document::default();
doc.add_u64(value_field, 2);
@@ -330,9 +334,10 @@ pub mod tests {
}
assert!(index_writer.commit().is_ok());
}
index.load_searchers().unwrap();
index
};
let searcher = index.reader().unwrap().searcher();
let searcher = index.searcher();
let segment_reader = searcher.segment_reader(0);
// check that the basic usage works
@@ -396,11 +401,12 @@ pub mod tests {
// delete some of the documents
{
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
index_writer.delete_term(term_0);
assert!(index_writer.commit().is_ok());
}
let searcher = index.reader().unwrap().searcher();
index.load_searchers().unwrap();
let searcher = index.searcher();
let segment_reader = searcher.segment_reader(0);
// make sure seeking still works
@@ -445,19 +451,33 @@ pub mod tests {
// delete everything else
{
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
index_writer.delete_term(term_1);
assert!(index_writer.commit().is_ok());
}
let searcher = index.reader().unwrap().searcher();
index.load_searchers().unwrap();
let searcher = index.searcher();
let segment_reader = searcher.segment_reader(0);
// finally, check that it's empty
{
let searchable_segment_ids = index
.searchable_segment_ids()
.expect("could not get index segment ids");
assert!(searchable_segment_ids.is_empty());
assert_eq!(searcher.num_docs(), 0);
let mut segment_postings = segment_reader
.inverted_index(term_2.field())
.read_postings(&term_2, IndexRecordOption::Basic)
.unwrap();
assert_eq!(segment_postings.skip_next(0), SkipResult::Reached);
assert_eq!(segment_postings.doc(), 0);
assert!(segment_reader.is_deleted(0));
let mut segment_postings = segment_reader
.inverted_index(term_2.field())
.read_postings(&term_2, IndexRecordOption::Basic)
.unwrap();
assert_eq!(segment_postings.skip_next(num_docs), SkipResult::End);
}
}
@@ -488,7 +508,7 @@ pub mod tests {
let index = Index::create_in_ram(schema);
let posting_list_size = 1_000_000;
{
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
for _ in 0..posting_list_size {
let mut doc = Document::default();
if rng.gen_bool(1f64 / 15f64) {
@@ -505,6 +525,7 @@ pub mod tests {
}
assert!(index_writer.commit().is_ok());
}
index.load_searchers().unwrap();
index
};
}

View File

@@ -1,8 +1,6 @@
use super::stacker::{Addr, MemoryArena, TermHashMap};
use postings::recorder::{
BufferLender, NothingRecorder, Recorder, TFAndPositionRecorder, TermFrequencyRecorder,
};
use postings::recorder::{NothingRecorder, Recorder, TFAndPositionRecorder, TermFrequencyRecorder};
use postings::UnorderedTermId;
use postings::{FieldSerializer, InvertedIndexSerializer};
use schema::IndexRecordOption;
@@ -31,12 +29,10 @@ fn posting_from_field_entry(field_entry: &FieldEntry) -> Box<PostingsWriter> {
IndexRecordOption::WithFreqsAndPositions => {
SpecializedPostingsWriter::<TFAndPositionRecorder>::new_boxed()
}
})
.unwrap_or_else(|| SpecializedPostingsWriter::<NothingRecorder>::new_boxed()),
FieldType::U64(_)
| FieldType::I64(_)
| FieldType::Date(_)
| FieldType::HierarchicalFacet => SpecializedPostingsWriter::<NothingRecorder>::new_boxed(),
}).unwrap_or_else(|| SpecializedPostingsWriter::<NothingRecorder>::new_boxed()),
FieldType::U64(_) | FieldType::I64(_) | FieldType::HierarchicalFacet => {
SpecializedPostingsWriter::<NothingRecorder>::new_boxed()
}
FieldType::Bytes => {
// FieldType::Bytes cannot actually be indexed.
// TODO fix during the indexer refactoring described in #276
@@ -52,31 +48,6 @@ pub struct MultiFieldPostingsWriter {
per_field_postings_writers: Vec<Box<PostingsWriter>>,
}
/// Groups a list of terms into contiguous per-field ranges.
///
/// Walks `term_offsets` in order, reads the field of every term key, and
/// opens a new range each time the field differs from the one of the
/// previous entry. Returns one `(field, start, stop)` triple per range, where
/// `start..stop` indexes into `term_offsets`; the last range stops at
/// `term_offsets.len()`. An empty input yields an empty partition.
fn make_field_partition(
    term_offsets: &[(&[u8], Addr, UnorderedTermId)],
) -> Vec<(Field, usize, usize)> {
    let total = term_offsets.len();
    let mut partition: Vec<(Field, usize, usize)> = Vec::new();
    // Sentinel standing for "no field seen yet".
    let mut current_field = Field(u32::max_value());
    let fields_by_offset = term_offsets
        .iter()
        .map(|(key, _, _)| Term::wrap(key).field())
        .enumerate();
    for (offset, field) in fields_by_offset {
        if field == current_field {
            continue;
        }
        // The previous range, if any, ends exactly where this one begins.
        if let Some(previous) = partition.last_mut() {
            previous.2 = offset;
        }
        // Provisionally close the new range at the end of the list; this is
        // corrected above if another field shows up later.
        partition.push((field, offset, total));
        current_field = field;
    }
    partition
}
impl MultiFieldPostingsWriter {
/// Create a new `MultiFieldPostingsWriter` given
/// a schema and a heap.
@@ -122,16 +93,38 @@ impl MultiFieldPostingsWriter {
&self,
serializer: &mut InvertedIndexSerializer,
) -> Result<HashMap<Field, HashMap<UnorderedTermId, TermOrdinal>>> {
let mut term_offsets: Vec<(&[u8], Addr, UnorderedTermId)> =
self.term_index.iter().collect();
let mut term_offsets: Vec<(&[u8], Addr, UnorderedTermId)> = self
.term_index
.iter()
.map(|(term_bytes, addr, bucket_id)| (term_bytes, addr, bucket_id as UnorderedTermId))
.collect();
term_offsets.sort_unstable_by_key(|&(k, _, _)| k);
let mut unordered_term_mappings: HashMap<Field, HashMap<UnorderedTermId, TermOrdinal>> =
HashMap::new();
let mut offsets: Vec<(Field, usize)> = vec![];
let term_offsets_it = term_offsets
.iter()
.cloned()
.map(|(key, _, _)| Term::wrap(key).field())
.enumerate();
let field_offsets = make_field_partition(&term_offsets);
let mut unordered_term_mappings: HashMap<
Field,
HashMap<UnorderedTermId, TermOrdinal>,
> = HashMap::new();
let mut prev_field = Field(u32::max_value());
for (offset, field) in term_offsets_it {
if field != prev_field {
offsets.push((field, offset));
prev_field = field;
}
}
offsets.push((Field(0), term_offsets.len()));
for i in 0..(offsets.len() - 1) {
let (field, start) = offsets[i];
let (_, stop) = offsets[i + 1];
for (field, start, stop) in field_offsets {
let field_entry = self.schema.get_field_entry(field);
match *field_entry.field_type() {
@@ -145,11 +138,10 @@ impl MultiFieldPostingsWriter {
.enumerate()
.map(|(term_ord, unord_term_id)| {
(unord_term_id as UnorderedTermId, term_ord as TermOrdinal)
})
.collect();
}).collect();
unordered_term_mappings.insert(field, mapping);
}
FieldType::U64(_) | FieldType::I64(_) | FieldType::Date(_) => {}
FieldType::U64(_) | FieldType::I64(_) => {}
FieldType::Bytes => {}
}
@@ -221,7 +213,7 @@ pub trait PostingsWriter {
/// The `SpecializedPostingsWriter` is just here to remove dynamic
/// dispatch to the recorder information.
pub(crate) struct SpecializedPostingsWriter<Rec: Recorder + 'static> {
pub struct SpecializedPostingsWriter<Rec: Recorder + 'static> {
total_num_tokens: u64,
_recorder_type: PhantomData<Rec>,
}
@@ -253,7 +245,8 @@ impl<Rec: Recorder + 'static> PostingsWriter for SpecializedPostingsWriter<Rec>
debug_assert!(term.as_slice().len() >= 4);
self.total_num_tokens += 1;
term_index.mutate_or_create(term, |opt_recorder: Option<Rec>| {
if let Some(mut recorder) = opt_recorder {
if opt_recorder.is_some() {
let mut recorder = opt_recorder.unwrap();
let current_doc = recorder.current_doc();
if current_doc != doc {
recorder.close_doc(heap);
@@ -262,7 +255,7 @@ impl<Rec: Recorder + 'static> PostingsWriter for SpecializedPostingsWriter<Rec>
recorder.record_position(position, heap);
recorder
} else {
let mut recorder = Rec::new();
let mut recorder = Rec::new(heap);
recorder.new_doc(doc, heap);
recorder.record_position(position, heap);
recorder
@@ -277,11 +270,10 @@ impl<Rec: Recorder + 'static> PostingsWriter for SpecializedPostingsWriter<Rec>
termdict_heap: &MemoryArena,
heap: &MemoryArena,
) -> io::Result<()> {
let mut buffer_lender = BufferLender::default();
for &(term_bytes, addr, _) in term_addrs {
let recorder: Rec = termdict_heap.read(addr);
let recorder: Rec = unsafe { termdict_heap.read(addr) };
serializer.new_term(&term_bytes[4..])?;
recorder.serialize(&mut buffer_lender, serializer, heap)?;
recorder.serialize(serializer, heap)?;
serializer.close_term()?;
}
Ok(())

View File

@@ -1,50 +1,10 @@
use super::stacker::{ExpUnrolledLinkedList, MemoryArena};
use common::{read_u32_vint, write_u32_vint};
use postings::FieldSerializer;
use std::io;
use std::{self, io};
use DocId;
const POSITION_END: u32 = 0;
/// A pair of scratch buffers that are handed out cleared.
///
/// Each `lend_*` call clears the buffer(s) before returning them, so callers
/// always start from an empty `Vec` while the `BufferLender` keeps owning
/// the vectors between calls.
#[derive(Default)]
pub(crate) struct BufferLender {
    buffer_u8: Vec<u8>,
    buffer_u32: Vec<u32>,
}

impl BufferLender {
    /// Clears the byte buffer and returns a mutable reference to it.
    pub fn lend_u8(&mut self) -> &mut Vec<u8> {
        let bytes = &mut self.buffer_u8;
        bytes.clear();
        bytes
    }

    /// Clears both buffers and returns them as a `(bytes, u32s)` pair.
    pub fn lend_all(&mut self) -> (&mut Vec<u8>, &mut Vec<u32>) {
        let bytes = &mut self.buffer_u8;
        let ints = &mut self.buffer_u32;
        bytes.clear();
        ints.clear();
        (bytes, ints)
    }
}
/// Iterator yielding the `u32` values decoded by `read_u32_vint` from a
/// byte slice, until the slice is exhausted.
pub struct VInt32Reader<'a> {
    data: &'a [u8],
}

impl<'a> VInt32Reader<'a> {
    /// Wraps `data`; nothing is decoded until the iterator is advanced.
    fn new(data: &'a [u8]) -> VInt32Reader<'a> {
        VInt32Reader { data }
    }
}

impl<'a> Iterator for VInt32Reader<'a> {
    type Item = u32;

    /// Returns `None` once no bytes are left, otherwise the next decoded value.
    fn next(&mut self) -> Option<u32> {
        if self.data.is_empty() {
            return None;
        }
        // NOTE(review): `read_u32_vint` receives `&mut self.data` and is
        // presumably what advances the slice past the bytes it consumed.
        let value = read_u32_vint(&mut self.data);
        Some(value)
    }
}
const EMPTY_ARRAY: [u32; 0] = [0u32; 0];
const POSITION_END: u32 = std::u32::MAX;
/// Recorder is in charge of recording relevant information about
/// the presence of a term in a document.
@@ -55,9 +15,9 @@ impl<'a> Iterator for VInt32Reader<'a> {
/// * the document id
/// * the term frequency
/// * the term positions
pub(crate) trait Recorder: Copy + 'static {
pub trait Recorder: Copy {
///
fn new() -> Self;
fn new(heap: &mut MemoryArena) -> Self;
/// Returns the current document
fn current_doc(&self) -> u32;
/// Starts recording information about a new document
@@ -69,12 +29,7 @@ pub(crate) trait Recorder: Copy + 'static {
/// Close the document. It will help record the term frequency.
fn close_doc(&mut self, heap: &mut MemoryArena);
/// Pushes the postings information to the serializer.
fn serialize(
&self,
buffer_lender: &mut BufferLender,
serializer: &mut FieldSerializer,
heap: &MemoryArena,
) -> io::Result<()>;
fn serialize(&self, serializer: &mut FieldSerializer, heap: &MemoryArena) -> io::Result<()>;
}
/// Only records the doc ids
@@ -85,9 +40,9 @@ pub struct NothingRecorder {
}
impl Recorder for NothingRecorder {
fn new() -> Self {
fn new(heap: &mut MemoryArena) -> Self {
NothingRecorder {
stack: ExpUnrolledLinkedList::new(),
stack: ExpUnrolledLinkedList::new(heap),
current_doc: u32::max_value(),
}
}
@@ -98,23 +53,16 @@ impl Recorder for NothingRecorder {
fn new_doc(&mut self, doc: DocId, heap: &mut MemoryArena) {
self.current_doc = doc;
let _ = write_u32_vint(doc, &mut self.stack.writer(heap));
self.stack.push(doc, heap);
}
/// No-op: the position and the arena are both ignored.
fn record_position(&mut self, _position: u32, _heap: &mut MemoryArena) {}
/// No-op: nothing is written when a document is closed.
fn close_doc(&mut self, _heap: &mut MemoryArena) {}
fn serialize(
&self,
buffer_lender: &mut BufferLender,
serializer: &mut FieldSerializer,
heap: &MemoryArena,
) -> io::Result<()> {
let buffer = buffer_lender.lend_u8();
self.stack.read_to_end(heap, buffer);
for doc in VInt32Reader::new(&buffer[..]) {
serializer.write_doc(doc as u32, 0u32, &[][..])?;
fn serialize(&self, serializer: &mut FieldSerializer, heap: &MemoryArena) -> io::Result<()> {
for doc in self.stack.iter(heap) {
serializer.write_doc(doc, 0u32, &EMPTY_ARRAY)?;
}
Ok(())
}
@@ -129,9 +77,9 @@ pub struct TermFrequencyRecorder {
}
impl Recorder for TermFrequencyRecorder {
fn new() -> Self {
fn new(heap: &mut MemoryArena) -> Self {
TermFrequencyRecorder {
stack: ExpUnrolledLinkedList::new(),
stack: ExpUnrolledLinkedList::new(heap),
current_doc: u32::max_value(),
current_tf: 0u32,
}
@@ -143,7 +91,7 @@ impl Recorder for TermFrequencyRecorder {
fn new_doc(&mut self, doc: DocId, heap: &mut MemoryArena) {
self.current_doc = doc;
let _ = write_u32_vint(doc, &mut self.stack.writer(heap));
self.stack.push(doc, heap);
}
fn record_position(&mut self, _position: u32, _heap: &mut MemoryArena) {
@@ -152,24 +100,24 @@ impl Recorder for TermFrequencyRecorder {
fn close_doc(&mut self, heap: &mut MemoryArena) {
debug_assert!(self.current_tf > 0);
let _ = write_u32_vint(self.current_tf, &mut self.stack.writer(heap));
self.stack.push(self.current_tf, heap);
self.current_tf = 0;
}
fn serialize(
&self,
buffer_lender: &mut BufferLender,
serializer: &mut FieldSerializer,
heap: &MemoryArena,
) -> io::Result<()> {
let buffer = buffer_lender.lend_u8();
self.stack.read_to_end(heap, buffer);
let mut u32_it = VInt32Reader::new(&buffer[..]);
while let Some(doc) = u32_it.next() {
let term_freq = u32_it.next().unwrap_or(self.current_tf);
serializer.write_doc(doc as u32, term_freq, &[][..])?;
}
fn serialize(&self, serializer: &mut FieldSerializer, heap: &MemoryArena) -> io::Result<()> {
// the last document has not been closed...
// its term freq is self.current_tf.
let mut doc_iter = self
.stack
.iter(heap)
.chain(Some(self.current_tf).into_iter());
while let Some(doc) = doc_iter.next() {
let term_freq = doc_iter
.next()
.expect("The IndexWriter recorded a doc without a term freq.");
serializer.write_doc(doc, term_freq, &EMPTY_ARRAY)?;
}
Ok(())
}
}
@@ -180,10 +128,11 @@ pub struct TFAndPositionRecorder {
stack: ExpUnrolledLinkedList,
current_doc: DocId,
}
impl Recorder for TFAndPositionRecorder {
fn new() -> Self {
fn new(heap: &mut MemoryArena) -> Self {
TFAndPositionRecorder {
stack: ExpUnrolledLinkedList::new(),
stack: ExpUnrolledLinkedList::new(heap),
current_doc: u32::max_value(),
}
}
@@ -194,88 +143,33 @@ impl Recorder for TFAndPositionRecorder {
fn new_doc(&mut self, doc: DocId, heap: &mut MemoryArena) {
self.current_doc = doc;
let _ = write_u32_vint(doc, &mut self.stack.writer(heap));
self.stack.push(doc, heap);
}
fn record_position(&mut self, position: u32, heap: &mut MemoryArena) {
let _ = write_u32_vint(position + 1u32, &mut self.stack.writer(heap));
self.stack.push(position, heap);
}
fn close_doc(&mut self, heap: &mut MemoryArena) {
let _ = write_u32_vint(POSITION_END, &mut self.stack.writer(heap));
self.stack.push(POSITION_END, heap);
}
fn serialize(
&self,
buffer_lender: &mut BufferLender,
serializer: &mut FieldSerializer,
heap: &MemoryArena,
) -> io::Result<()> {
let (buffer_u8, buffer_positions) = buffer_lender.lend_all();
self.stack.read_to_end(heap, buffer_u8);
let mut u32_it = VInt32Reader::new(&buffer_u8[..]);
while let Some(doc) = u32_it.next() {
let mut prev_position_plus_one = 1u32;
buffer_positions.clear();
loop {
match u32_it.next() {
Some(POSITION_END) | None => {
break;
}
Some(position_plus_one) => {
let delta_position = position_plus_one - prev_position_plus_one;
buffer_positions.push(delta_position);
prev_position_plus_one = position_plus_one;
}
fn serialize(&self, serializer: &mut FieldSerializer, heap: &MemoryArena) -> io::Result<()> {
let mut doc_positions = Vec::with_capacity(100);
let mut positions_iter = self.stack.iter(heap);
while let Some(doc) = positions_iter.next() {
let mut prev_position = 0;
doc_positions.clear();
for position in &mut positions_iter {
if position == POSITION_END {
break;
} else {
doc_positions.push(position - prev_position);
prev_position = position;
}
}
serializer.write_doc(doc, buffer_positions.len() as u32, &buffer_positions)?;
serializer.write_doc(doc, doc_positions.len() as u32, &doc_positions)?;
}
Ok(())
}
}
#[cfg(test)]
mod tests {
    use super::write_u32_vint;
    use super::BufferLender;
    use super::VInt32Reader;

    /// Every lend must hand back an empty buffer, whatever was pushed into
    /// it during the previous lend.
    #[test]
    fn test_buffer_lender() {
        let mut buffer_lender = BufferLender::default();
        for _ in 0..2 {
            let buf = buffer_lender.lend_u8();
            assert!(buf.is_empty());
            buf.push(1u8);
        }
        for _ in 0..2 {
            let (_, buf) = buffer_lender.lend_all();
            assert!(buf.is_empty());
            buf.push(1u32);
        }
    }

    /// Values written with `write_u32_vint` must be read back unchanged by
    /// `VInt32Reader`.
    #[test]
    fn test_vint_u32() {
        let vals = [0, 1, 324_234_234, u32::max_value()];
        let mut buffer = vec![];
        for &val in vals.iter() {
            assert!(write_u32_vint(val, &mut buffer).is_ok());
        }
        // One byte each for 0 and 1, five bytes each for the two large values.
        assert_eq!(buffer.len(), 1 + 1 + 5 + 5);
        let decoded: Vec<u32> = VInt32Reader::new(&buffer[..]).collect();
        assert_eq!(&decoded[..], &vals[..]);
    }
}

View File

@@ -2,21 +2,22 @@ use common::BitSet;
use common::HasLen;
use common::{BinarySerializable, VInt};
use docset::{DocSet, SkipResult};
use fst::Streamer;
use owned_read::OwnedRead;
use positions::PositionReader;
use postings::compression::compressed_block_size;
use postings::compression::{BlockDecoder, VIntDecoder, COMPRESSION_BLOCK_SIZE};
use postings::serializer::PostingsSerializer;
use postings::BlockSearcher;
use postings::FreqReadingOption;
use postings::Postings;
use postings::SkipReader;
use postings::USE_SKIP_INFO_LIMIT;
use schema::IndexRecordOption;
use std::cmp::Ordering;
use tantivy_fst::Streamer;
use DocId;
const EMPTY_ARR: [u8; 0] = [];
struct PositionComputer {
// store the amount of position int
// before reading positions.
@@ -61,7 +62,6 @@ pub struct SegmentPostings {
block_cursor: BlockSegmentPostings,
cur: usize,
position_computer: Option<PositionComputer>,
block_searcher: BlockSearcher,
}
impl SegmentPostings {
@@ -72,7 +72,6 @@ impl SegmentPostings {
block_cursor: empty_block_cursor,
cur: COMPRESSION_BLOCK_SIZE,
position_computer: None,
block_searcher: BlockSearcher::default(),
}
}
@@ -120,31 +119,46 @@ impl SegmentPostings {
block_cursor: segment_block_postings,
cur: COMPRESSION_BLOCK_SIZE, // cursor within the block
position_computer: positions_stream_opt.map(PositionComputer::new),
block_searcher: BlockSearcher::default(),
}
}
}
impl DocSet for SegmentPostings {
// goes to the next element.
// next needs to be called a first time to point to the correct element.
#[inline]
fn advance(&mut self) -> bool {
if self.position_computer.is_some() {
let term_freq = self.term_freq() as usize;
self.position_computer.as_mut().unwrap().add_skip(term_freq);
fn exponential_search(target: u32, arr: &[u32]) -> (usize, usize) {
let mut start = 0;
let end = arr.len();
debug_assert!(target >= arr[start]);
debug_assert!(target <= arr[end - 1]);
let mut jump = 1;
loop {
let new = start + jump;
if new >= end {
return (start, end);
}
self.cur += 1;
if self.cur >= self.block_cursor.block_len() {
self.cur = 0;
if !self.block_cursor.advance() {
self.cur = COMPRESSION_BLOCK_SIZE;
return false;
}
if arr[new] > target {
return (start, new);
}
true
start = new;
jump *= 2;
}
}
/// Returns the index of the first element of `block_docs` that is greater
/// than or equal to `target`.
///
/// # Assumption
///
/// * `block_docs` is not empty.
/// * `target` is greater than or equal to the first element.
/// * `target` is smaller than or equal to the last element.
fn search_within_block(block_docs: &[u32], target: u32) -> usize {
    // Narrow the candidate window first, then bisect inside that window only.
    let (start, end) = exponential_search(target, block_docs);
    let offset_in_window = match block_docs[start..end].binary_search(&target) {
        // Exact hit: position of the target itself.
        Ok(found) => found,
        // Miss: the insertion point is the first element above the target.
        Err(insertion_point) => insertion_point,
    };
    start.wrapping_add(offset_in_window)
}
impl DocSet for SegmentPostings {
fn skip_next(&mut self, target: DocId) -> SkipResult {
if !self.advance() {
return SkipResult::End;
@@ -202,9 +216,11 @@ impl DocSet for SegmentPostings {
// we're in the right block now, start with an exponential search
let block_docs = self.block_cursor.docs();
debug_assert!(target >= self.doc());
let new_cur = self
.block_searcher
.search_in_block(&block_docs, self.cur, target);
.cur
.wrapping_add(search_within_block(&block_docs[self.cur..], target));
if need_positions {
sum_freqs_skipped += self.block_cursor.freqs()[self.cur..new_cur]
.iter()
@@ -226,6 +242,29 @@ impl DocSet for SegmentPostings {
}
}
// Moves the cursor to the next element.
// `advance` has to be called once before the cursor points to the first
// element.
//
// Returns `false` once the block cursor cannot advance any further.
#[inline]
fn advance(&mut self) -> bool {
    // When positions are tracked, the positions of the document being left
    // behind have to be accounted for as skipped.
    if self.position_computer.is_some() {
        let skipped = self.term_freq() as usize;
        if let Some(position_computer) = self.position_computer.as_mut() {
            position_computer.add_skip(skipped);
        }
    }
    self.cur += 1;
    if self.cur < self.block_cursor.block_len() {
        // Still inside the current block.
        return true;
    }
    // The current block is exhausted: restart at the head of the next one.
    self.cur = 0;
    if self.block_cursor.advance() {
        return true;
    }
    // No block left: park the cursor at `COMPRESSION_BLOCK_SIZE`.
    self.cur = COMPRESSION_BLOCK_SIZE;
    false
}
/// Returns `self.len()` converted to `u32`.
fn size_hint(&self) -> u32 {
    let num_docs = self.len();
    num_docs as u32
}
/// Return the current document's `DocId`.
#[inline]
fn doc(&self) -> DocId {
@@ -237,10 +276,6 @@ impl DocSet for SegmentPostings {
docs[self.cur]
}
/// Returns `self.len()` converted to `u32`.
fn size_hint(&self) -> u32 {
    let num_docs = self.len();
    num_docs as u32
}
fn append_to_bitset(&mut self, bitset: &mut BitSet) {
// finish the current block
if self.advance() {
@@ -339,7 +374,7 @@ impl BlockSegmentPostings {
let (skip_data_opt, postings_data) = split_into_skips_and_postings(doc_freq, data);
let skip_reader = match skip_data_opt {
Some(skip_data) => SkipReader::new(skip_data, record_option),
None => SkipReader::new(OwnedRead::new(&[][..]), record_option),
None => SkipReader::new(OwnedRead::new(&EMPTY_ARR[..]), record_option),
};
let doc_freq = doc_freq as usize;
let num_vint_docs = doc_freq % COMPRESSION_BLOCK_SIZE;
@@ -373,7 +408,7 @@ impl BlockSegmentPostings {
if let Some(skip_data) = skip_data_opt {
self.skip_reader.reset(skip_data);
} else {
self.skip_reader.reset(OwnedRead::new(&[][..]))
self.skip_reader.reset(OwnedRead::new(&EMPTY_ARR[..]))
}
self.doc_offset = 0;
self.doc_freq = doc_freq as usize;
@@ -498,8 +533,7 @@ impl BlockSegmentPostings {
} else {
BlockSegmentPostingsSkipResult::Terminated
}
})
.unwrap_or(BlockSegmentPostingsSkipResult::Terminated);
}).unwrap_or(BlockSegmentPostingsSkipResult::Terminated);
}
BlockSegmentPostingsSkipResult::Terminated
}
@@ -586,19 +620,20 @@ impl<'b> Streamer<'b> for BlockSegmentPostings {
#[cfg(test)]
mod tests {
use super::search_within_block;
use super::BlockSegmentPostings;
use super::BlockSegmentPostingsSkipResult;
use super::SegmentPostings;
use common::HasLen;
use core::Index;
use docset::DocSet;
use fst::Streamer;
use schema::IndexRecordOption;
use schema::Schema;
use schema::Term;
use schema::INDEXED;
use tantivy_fst::Streamer;
use schema::INT_INDEXED;
use DocId;
use SkipResult;
#[test]
fn test_empty_segment_postings() {
@@ -615,9 +650,49 @@ mod tests {
assert_eq!(postings.doc_freq(), 0);
}
/// Reference implementation: linear scan for the index of the first element
/// of `block` that is greater than or equal to `target`.
///
/// Panics if no such element exists.
fn search_within_block_trivial_but_slow(block: &[u32], target: u32) -> usize {
    block.iter().position(|&val| val >= target).unwrap()
}
/// Asserts that `search_within_block` agrees with the linear-scan reference
/// implementation for the given `block` and `target`.
fn util_test_search_within_block(block: &[u32], target: u32) {
    let actual = search_within_block(block, target);
    let expected = search_within_block_trivial_but_slow(block, target);
    assert_eq!(actual, expected);
}
/// Checks `search_within_block` against the reference implementation for
/// every value of `block`, plus the value just below each element other than
/// the first one.
fn util_test_search_within_block_all(block: &[u32]) {
    use std::collections::HashSet;
    // Every element is a target...
    let mut targets: HashSet<u32> = block.iter().cloned().collect();
    // ... and so is its predecessor, except for the first element, since a
    // target below it would break the precondition of the search.
    targets.extend(block.iter().skip(1).map(|&val| val - 1));
    for target in targets {
        util_test_search_within_block(block, target);
    }
}
/// Exercises the search on blocks of even numbers `[0, 2, 4, ...]` for every
/// length from 1 to 127.
#[test]
fn test_search_within_block() {
    let mut evens: Vec<u32> = Vec::new();
    for len in 1u32..128u32 {
        // Grow the block by one element per round: it now holds `len` values.
        evens.push((len - 1) * 2);
        util_test_search_within_block_all(&evens[..]);
    }
}
#[test]
fn test_block_segment_postings() {
let mut block_segments = build_block_postings(&(0..100_000).collect::<Vec<u32>>());
let mut block_segments = build_block_postings((0..100_000).collect::<Vec<u32>>());
let mut offset: u32 = 0u32;
// checking that the block before calling advance is empty
assert!(block_segments.docs().is_empty());
@@ -631,44 +706,14 @@ mod tests {
}
}
/// Skips to targets located right after the first 128 documents.
/// NOTE(review): 128 presumably matches `COMPRESSION_BLOCK_SIZE`, so that the
/// targets fall right at the start of a new block; confirm.
#[test]
fn test_skip_right_at_new_block() {
    // Docs 0 to 127, then 129 and 130: doc 128 is deliberately absent.
    let doc_ids: Vec<u32> = (0..128).chain(129..131).collect();
    let new_docset =
        || SegmentPostings::from_block_postings(build_block_postings(&doc_ids), None);

    // Skipping to the missing doc lands on the next existing one.
    let mut docset = new_docset();
    assert_eq!(docset.skip_next(128), SkipResult::OverStep);
    assert_eq!(docset.doc(), 129);
    assert!(docset.advance());
    assert_eq!(docset.doc(), 130);
    assert!(!docset.advance());

    // Skipping to an existing doc reaches it exactly.
    let mut docset = new_docset();
    assert_eq!(docset.skip_next(129), SkipResult::Reached);
    assert_eq!(docset.doc(), 129);
    assert!(docset.advance());
    assert_eq!(docset.doc(), 130);
    assert!(!docset.advance());

    // Skipping past the last doc exhausts the postings.
    let mut docset = new_docset();
    assert_eq!(docset.skip_next(131), SkipResult::End);
}
fn build_block_postings(docs: &[DocId]) -> BlockSegmentPostings {
fn build_block_postings(docs: Vec<DocId>) -> BlockSegmentPostings {
let mut schema_builder = Schema::builder();
let int_field = schema_builder.add_u64_field("id", INDEXED);
let int_field = schema_builder.add_u64_field("id", INT_INDEXED);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
let mut last_doc = 0u32;
for &doc in docs {
for doc in docs {
for _ in last_doc..doc {
index_writer.add_document(doc!(int_field=>1u64));
}
@@ -676,7 +721,8 @@ mod tests {
last_doc = doc + 1;
}
index_writer.commit().unwrap();
let searcher = index.reader().unwrap().searcher();
index.load_searchers().unwrap();
let searcher = index.searcher();
let segment_reader = searcher.segment_reader(0);
let inverted_index = segment_reader.inverted_index(int_field);
let term = Term::from_field_u64(int_field, 0u64);
@@ -687,7 +733,7 @@ mod tests {
#[test]
fn test_block_segment_postings_skip() {
for i in 0..4 {
let mut block_postings = build_block_postings(&[3]);
let mut block_postings = build_block_postings(vec![3]);
assert_eq!(
block_postings.skip_to(i),
BlockSegmentPostingsSkipResult::Success(0u32)
@@ -697,7 +743,7 @@ mod tests {
BlockSegmentPostingsSkipResult::Terminated
);
}
let mut block_postings = build_block_postings(&[3]);
let mut block_postings = build_block_postings(vec![3]);
assert_eq!(
block_postings.skip_to(4u32),
BlockSegmentPostingsSkipResult::Terminated
@@ -710,7 +756,7 @@ mod tests {
for i in 0..1300 {
docs.push((i * i / 100) + i);
}
let mut block_postings = build_block_postings(&docs[..]);
let mut block_postings = build_block_postings(docs.clone());
for i in vec![0, 424, 10000] {
assert_eq!(
block_postings.skip_to(i),
@@ -733,10 +779,10 @@ mod tests {
#[test]
fn test_reset_block_segment_postings() {
let mut schema_builder = Schema::builder();
let int_field = schema_builder.add_u64_field("id", INDEXED);
let int_field = schema_builder.add_u64_field("id", INT_INDEXED);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
// create two postings lists, one containing even numbers,
// the other containing odd numbers.
for i in 0..6 {
@@ -744,7 +790,8 @@ mod tests {
index_writer.add_document(doc);
}
index_writer.commit().unwrap();
let searcher = index.reader().unwrap().searcher();
index.load_searchers().unwrap();
let searcher = index.searcher();
let segment_reader = searcher.segment_reader(0);
let mut block_segments;

View File

@@ -55,7 +55,7 @@ pub struct InvertedIndexSerializer {
impl InvertedIndexSerializer {
/// Open a new `PostingsSerializer` for the given segment
fn create(
fn new(
terms_write: CompositeWrite<WritePtr>,
postings_write: CompositeWrite<WritePtr>,
positions_write: CompositeWrite<WritePtr>,
@@ -74,7 +74,7 @@ impl InvertedIndexSerializer {
/// Open a new `PostingsSerializer` for the given segment
pub fn open(segment: &mut Segment) -> Result<InvertedIndexSerializer> {
use SegmentComponent::{POSITIONS, POSITIONSSKIP, POSTINGS, TERMS};
InvertedIndexSerializer::create(
InvertedIndexSerializer::new(
CompositeWrite::wrap(segment.open_write(TERMS)?),
CompositeWrite::wrap(segment.open_write(POSTINGS)?),
CompositeWrite::wrap(segment.open_write(POSITIONS)?),
@@ -99,7 +99,7 @@ impl InvertedIndexSerializer {
let positions_write = self.positions_write.for_field(field);
let positionsidx_write = self.positionsidx_write.for_field(field);
let field_type: FieldType = (*field_entry.field_type()).clone();
FieldSerializer::create(
FieldSerializer::new(
&field_type,
term_dictionary_write,
postings_write,
@@ -130,7 +130,7 @@ pub struct FieldSerializer<'a> {
}
impl<'a> FieldSerializer<'a> {
fn create(
fn new(
field_type: &FieldType,
term_dictionary_write: &'a mut CountingWriter<WritePtr>,
postings_write: &'a mut CountingWriter<WritePtr>,
@@ -152,7 +152,7 @@ impl<'a> FieldSerializer<'a> {
_ => (false, false),
};
let term_dictionary_builder =
TermDictionaryBuilder::create(term_dictionary_write, &field_type)?;
TermDictionaryBuilder::new(term_dictionary_write, &field_type)?;
let postings_serializer =
PostingsSerializer::new(postings_write, term_freq_enabled, position_enabled);
let positions_serializer_opt = if position_enabled {

View File

@@ -1,37 +1,28 @@
use super::{Addr, MemoryArena};
use postings::stacker::memory_arena::load;
use postings::stacker::memory_arena::store;
use std::io;
use common::is_power_of_2;
use std::mem;
const MAX_BLOCK_LEN: u32 = 1u32 << 15;
const FIRST_BLOCK: usize = 16;
const INLINED_BLOCK_LEN: usize = FIRST_BLOCK + mem::size_of::<Addr>();
/// Result of `len_to_capacity`: whether the current block still has room.
enum CapacityResult {
// Number of bytes still free in the current block.
Available(u32),
// The current block is full; the payload is the length, in bytes,
// of the block to allocate next.
NeedAlloc(u32),
}
const FIRST_BLOCK: u32 = 4u32;
fn len_to_capacity(len: u32) -> CapacityResult {
#[inline]
pub fn jump_needed(len: u32) -> Option<usize> {
match len {
0...15 => CapacityResult::Available(FIRST_BLOCK as u32 - len),
16...MAX_BLOCK_LEN => {
let cap = 1 << (32u32 - (len - 1u32).leading_zeros());
let available = cap - len;
if available == 0 {
CapacityResult::NeedAlloc(len)
0...3 => None,
4...MAX_BLOCK_LEN => {
if is_power_of_2(len as usize) {
Some(len as usize)
} else {
CapacityResult::Available(available)
None
}
}
n => {
let available = n % MAX_BLOCK_LEN;
if available == 0 {
CapacityResult::NeedAlloc(MAX_BLOCK_LEN)
if n % MAX_BLOCK_LEN == 0 {
Some(MAX_BLOCK_LEN as usize)
} else {
CapacityResult::Available(MAX_BLOCK_LEN - available)
None
}
}
}
@@ -61,119 +52,82 @@ fn len_to_capacity(len: u32) -> CapacityResult {
#[derive(Debug, Clone, Copy)]
pub struct ExpUnrolledLinkedList {
len: u32,
head: Addr,
tail: Addr,
inlined_data: [u8; INLINED_BLOCK_LEN as usize],
}
/// Write handle bundling an `ExpUnrolledLinkedList` with the `MemoryArena`
/// it writes into.
pub struct ExpUnrolledLinkedListWriter<'a> {
// The list being appended to.
eull: &'a mut ExpUnrolledLinkedList,
// The arena the list is borrowed together with.
heap: &'a mut MemoryArena,
}
/// Returns a writable slice starting at the current write position of
/// `eull`, allocating a new block in `heap` first when needed.
///
/// The slice covers only the space left in the block being written to
/// (the inline block, or the block `eull.tail` points into). This function
/// updates neither `eull.len` nor the write position after a write; it only
/// moves `eull.tail` when it allocates a new block.
fn ensure_capacity<'a>(
eull: &'a mut ExpUnrolledLinkedList,
heap: &'a mut MemoryArena,
) -> &'a mut [u8] {
if eull.len <= FIRST_BLOCK as u32 {
// We are still hitting the inline block.
if eull.len < FIRST_BLOCK as u32 {
// Room left inline: hand out the unused part of `inlined_data`.
return &mut eull.inlined_data[eull.len as usize..FIRST_BLOCK];
}
// We need to allocate a new block!
// The inline block is exactly full. The new block holds `FIRST_BLOCK`
// bytes plus room for one `Addr`, and its address is stored in the bytes
// of `inlined_data` that follow the first `FIRST_BLOCK` bytes.
let new_block_addr: Addr = heap.allocate_space(FIRST_BLOCK + mem::size_of::<Addr>());
store(&mut eull.inlined_data[FIRST_BLOCK..], new_block_addr);
eull.tail = new_block_addr;
return heap.slice_mut(eull.tail, FIRST_BLOCK);
}
let len = match len_to_capacity(eull.len) {
CapacityResult::NeedAlloc(new_block_len) => {
// Current block full: allocate the next one, with room for one extra `Addr`.
let new_block_addr: Addr =
heap.allocate_space(new_block_len as usize + mem::size_of::<Addr>());
// The address of the new block is written at the current tail.
// NOTE(review): presumably the tail then points at the `Addr` slot
// reserved at the end of the full block; confirm.
heap.write_at(eull.tail, new_block_addr);
eull.tail = new_block_addr;
new_block_len
}
CapacityResult::Available(available) => available,
};
heap.slice_mut(eull.tail, len as usize)
}
impl<'a> ExpUnrolledLinkedListWriter<'a> {
    /// Appends all of `buf` to the list, one chunk per available block.
    ///
    /// Each round copies as many bytes as fit in the slice returned by
    /// `ensure_capacity`, then moves the list length and tail forward by
    /// that amount.
    pub fn extend_from_slice(&mut self, mut buf: &[u8]) {
        // The loop guard doubles as the empty-input check. `ensure_capacity`
        // must not run for an empty `buf`, because it allocates when there
        // is no capacity at all right now.
        while !buf.is_empty() {
            let written = {
                let dest = ensure_capacity(self.eull, self.heap);
                let chunk_len = buf.len().min(dest.len());
                dest[..chunk_len].copy_from_slice(&buf[..chunk_len]);
                chunk_len
            };
            self.eull.len += written as u32;
            self.eull.tail = self.eull.tail.offset(written as u32);
            buf = &buf[written..];
        }
    }
}
impl<'a> io::Write for ExpUnrolledLinkedListWriter<'a> {
    /// Writes the whole buffer and reports its full length as written.
    fn write(&mut self, buf: &[u8]) -> io::Result<usize> {
        // Nothing here is actual IO, and no caller benefits from a partial
        // write, so the entire buffer is always consumed even though the
        // contract of `.write` would allow writing less.
        let num_bytes = buf.len();
        self.extend_from_slice(buf);
        Ok(num_bytes)
    }

    /// Same as `write`, without the byte count. Always succeeds.
    fn write_all(&mut self, buf: &[u8]) -> io::Result<()> {
        self.extend_from_slice(buf);
        Ok(())
    }

    /// Does nothing and always succeeds.
    fn flush(&mut self) -> io::Result<()> {
        Ok(())
    }
}
impl ExpUnrolledLinkedList {
pub fn new() -> ExpUnrolledLinkedList {
pub fn new(heap: &mut MemoryArena) -> ExpUnrolledLinkedList {
let addr = heap.allocate_space((FIRST_BLOCK as usize) * mem::size_of::<u32>());
ExpUnrolledLinkedList {
len: 0u32,
tail: Addr::null_pointer(),
inlined_data: [0u8; INLINED_BLOCK_LEN as usize],
head: addr,
tail: addr,
}
}
#[inline(always)]
pub fn writer<'a>(&'a mut self, heap: &'a mut MemoryArena) -> ExpUnrolledLinkedListWriter<'a> {
ExpUnrolledLinkedListWriter { eull: self, heap }
pub fn iter<'a>(&self, heap: &'a MemoryArena) -> ExpUnrolledLinkedListIterator<'a> {
ExpUnrolledLinkedListIterator {
heap,
addr: self.head,
len: self.len,
consumed: 0,
}
}
pub fn read_to_end(&self, heap: &MemoryArena, output: &mut Vec<u8>) {
let len = self.len as usize;
if len <= FIRST_BLOCK {
output.extend_from_slice(&self.inlined_data[..len]);
return;
/// Appends a new element to the current stack.
///
/// If the current block end is reached, a new block is allocated.
pub fn push(&mut self, val: u32, heap: &mut MemoryArena) {
self.len += 1;
if let Some(new_block_len) = jump_needed(self.len) {
// We need to allocate another block.
// We also allocate an extra `u32` to store the pointer
// to the future next block.
let new_block_size: usize = (new_block_len + 1) * mem::size_of::<u32>();
let new_block_addr: Addr = heap.allocate_space(new_block_size);
unsafe {
// logic
heap.write(self.tail, new_block_addr)
};
self.tail = new_block_addr;
}
output.extend_from_slice(&self.inlined_data[..FIRST_BLOCK]);
let mut cur = FIRST_BLOCK;
let mut addr = load(&self.inlined_data[FIRST_BLOCK..]);
loop {
let cap = match len_to_capacity(cur as u32) {
CapacityResult::Available(capacity) => capacity,
CapacityResult::NeedAlloc(capacity) => capacity,
} as usize;
let data = heap.slice(addr, cap);
if cur + cap >= len {
output.extend_from_slice(&data[..(len - cur)]);
return;
}
output.extend_from_slice(data);
cur += cap;
addr = heap.read(addr.offset(cap as u32));
unsafe {
// logic
heap.write(self.tail, val);
self.tail = self.tail.offset(mem::size_of::<u32>() as u32);
}
}
}
/// Iterator over the `u32` values pushed into an `ExpUnrolledLinkedList`.
///
/// It is created by `ExpUnrolledLinkedList::iter`, which starts it at the
/// list head with nothing consumed yet.
pub struct ExpUnrolledLinkedListIterator<'a> {
/// Arena the values (and the links between blocks) are read from.
heap: &'a MemoryArena,
/// Address of the next slot to read.
addr: Addr,
/// Total number of values the iterator yields.
len: u32,
/// Number of values yielded so far.
consumed: u32,
}
impl<'a> Iterator for ExpUnrolledLinkedListIterator<'a> {
    type Item = u32;

    /// Yields the next stored `u32`, or `None` once `len` values have been
    /// returned.
    fn next(&mut self) -> Option<u32> {
        if self.consumed == self.len {
            return None;
        }
        self.consumed += 1;
        // When `jump_needed` reports a block boundary, the slot at
        // `self.addr` holds the address of the next block rather than a
        // value, so that link is followed first.
        let value_addr: Addr = match jump_needed(self.consumed) {
            Some(_) => unsafe {
                // logic
                self.heap.read(self.addr)
            },
            None => self.addr,
        };
        // Advance past the `u32` that is about to be returned.
        self.addr = value_addr.offset(mem::size_of::<u32>() as u32);
        let value: u32 = unsafe {
            // logic
            self.heap.read(value_addr)
        };
        Some(value)
    }
}
@@ -182,134 +136,46 @@ impl ExpUnrolledLinkedList {
mod tests {
use super::super::MemoryArena;
use super::len_to_capacity;
use super::jump_needed;
use super::*;
use byteorder::{ByteOrder, LittleEndian, WriteBytesExt};
#[test]
#[test]
fn test_stack() {
let mut heap = MemoryArena::new();
let mut stack = ExpUnrolledLinkedList::new();
stack.writer(&mut heap).extend_from_slice(&[1u8]);
stack.writer(&mut heap).extend_from_slice(&[2u8]);
stack.writer(&mut heap).extend_from_slice(&[3u8, 4u8]);
stack.writer(&mut heap).extend_from_slice(&[5u8]);
let mut stack = ExpUnrolledLinkedList::new(&mut heap);
stack.push(1u32, &mut heap);
stack.push(2u32, &mut heap);
stack.push(4u32, &mut heap);
stack.push(8u32, &mut heap);
{
let mut buffer = Vec::new();
stack.read_to_end(&heap, &mut buffer);
assert_eq!(&buffer[..], &[1u8, 2u8, 3u8, 4u8, 5u8]);
let mut it = stack.iter(&heap);
assert_eq!(it.next().unwrap(), 1u32);
assert_eq!(it.next().unwrap(), 2u32);
assert_eq!(it.next().unwrap(), 4u32);
assert_eq!(it.next().unwrap(), 8u32);
assert!(it.next().is_none());
}
}
#[test]
fn test_stack_long() {
let mut heap = MemoryArena::new();
let mut stack = ExpUnrolledLinkedList::new();
let source: Vec<u32> = (0..100).collect();
for &el in &source {
assert!(stack
.writer(&mut heap)
.write_u32::<LittleEndian>(el)
.is_ok());
}
let mut buffer = Vec::new();
stack.read_to_end(&heap, &mut buffer);
let mut result = vec![];
let mut remaining = &buffer[..];
while !remaining.is_empty() {
result.push(LittleEndian::read_u32(&remaining[..4]));
remaining = &remaining[4..];
}
assert_eq!(&result[..], &source[..]);
}
#[test]
fn test_stack_interlaced() {
let mut heap = MemoryArena::new();
let mut stack = ExpUnrolledLinkedList::new();
let mut stack2 = ExpUnrolledLinkedList::new();
let mut vec1: Vec<u8> = vec![];
let mut vec2: Vec<u8> = vec![];
for i in 0..9 {
assert!(stack.writer(&mut heap).write_u32::<LittleEndian>(i).is_ok());
assert!(vec1.write_u32::<LittleEndian>(i).is_ok());
if i % 2 == 0 {
assert!(stack2
.writer(&mut heap)
.write_u32::<LittleEndian>(i)
.is_ok());
assert!(vec2.write_u32::<LittleEndian>(i).is_ok());
}
}
let mut res1 = vec![];
let mut res2 = vec![];
stack.read_to_end(&heap, &mut res1);
stack2.read_to_end(&heap, &mut res2);
assert_eq!(&vec1[..], &res1[..]);
assert_eq!(&vec2[..], &res2[..]);
}
#[test]
fn test_jump_if_needed() {
let mut available = 16u32;
for i in 0..10_000_000 {
match len_to_capacity(i) {
CapacityResult::NeedAlloc(cap) => {
assert_eq!(available, 0, "Failed len={}: Expected 0 got {}", i, cap);
available = cap;
}
CapacityResult::Available(cap) => {
assert_eq!(
available, cap,
"Failed len={}: Expected {} Got {}",
i, available, cap
);
}
}
available -= 1;
let mut block_len = 4u32;
let mut i = 0;
while i < 10_000_000 {
assert!(jump_needed(i + block_len - 1).is_none());
assert!(jump_needed(i + block_len + 1).is_none());
assert!(jump_needed(i + block_len).is_some());
let new_block_len = jump_needed(i + block_len).unwrap();
i += block_len;
block_len = new_block_len as u32;
}
}
#[test]
fn test_jump_if_needed_progression() {
let mut v = vec![];
for i in 0.. {
if v.len() >= 10 {
break;
}
match len_to_capacity(i) {
CapacityResult::NeedAlloc(cap) => {
v.push((i, cap));
}
_ => {}
}
}
assert_eq!(
&v[..],
&[
(16, 16),
(32, 32),
(64, 64),
(128, 128),
(256, 256),
(512, 512),
(1024, 1024),
(2048, 2048),
(4096, 4096),
(8192, 8192)
]
);
}
}
#[cfg(all(test, feature = "unstable"))]
mod bench {
use super::super::MemoryArena;
use super::ExpUnrolledLinkedList;
use byteorder::{NativeEndian, WriteBytesExt};
use test::Bencher;
const NUM_STACK: usize = 10_000;
@@ -337,13 +203,13 @@ mod bench {
let mut heap = MemoryArena::new();
let mut stacks = Vec::with_capacity(100);
for _ in 0..NUM_STACK {
let mut stack = ExpUnrolledLinkedList::new();
let mut stack = ExpUnrolledLinkedList::new(&mut heap);
stacks.push(stack);
}
for s in 0..NUM_STACK {
for i in 0u32..STACK_SIZE {
let t = s * 392017 % NUM_STACK;
let _ = stacks[t].writer(&mut heap).write_u32::<NativeEndian>(i);
stacks[t].push(i, &mut heap);
}
}
});

View File

@@ -37,7 +37,7 @@ const PAGE_SIZE: usize = 1 << NUM_BITS_PAGE_ADDR; // pages are 1 MB large
/// page of memory.
///
/// The last 20 bits are an address within this page of memory.
#[derive(Copy, Clone, Debug)]
#[derive(Clone, Copy, Debug)]
pub struct Addr(u32);
impl Addr {
@@ -69,16 +69,32 @@ impl Addr {
}
}
pub fn store<Item: Copy + 'static>(dest: &mut [u8], val: Item) {
assert_eq!(dest.len(), std::mem::size_of::<Item>());
unsafe {
ptr::write_unaligned(dest.as_mut_ptr() as *mut Item, val);
}
/// Trait required for an object to be `storable`.
///
/// # Warning
///
/// Most of the time you should not implement this trait,
/// and only use the `MemoryArena` with objects implementing `Copy`.
///
/// `ArenaStorable` is used in `tantivy` to force
/// a `Copy` object and a `slice` of data to be stored contiguously.
pub trait ArenaStorable {
/// Number of bytes the value occupies once written into the arena.
fn num_bytes(&self) -> usize;
/// Writes `self` into `arena`, starting at `addr`.
///
/// # Safety
///
/// NOTE(review): presumably `addr` must designate at least `num_bytes()`
/// bytes that were allocated in `arena` beforehand; confirm against the
/// implementations and callers.
unsafe fn write_into(self, arena: &mut MemoryArena, addr: Addr);
}
pub fn load<Item: Copy + 'static>(data: &[u8]) -> Item {
assert_eq!(data.len(), std::mem::size_of::<Item>());
unsafe { ptr::read_unaligned(data.as_ptr() as *const Item) }
impl<V: Copy> ArenaStorable for V {
    /// Size of `V` in bytes, as reported by `mem::size_of`.
    fn num_bytes(&self) -> usize {
        mem::size_of::<V>()
    }

    /// Copies `self` to `addr` with an unaligned write, so `addr` does not
    /// need to respect the alignment of `V`.
    unsafe fn write_into(self, arena: &mut MemoryArena, addr: Addr) {
        ptr::write_unaligned(arena.get_mut_ptr(addr) as *mut V, self);
    }
}
/// The `MemoryArena`
@@ -110,9 +126,47 @@ impl MemoryArena {
self.pages.len() * PAGE_SIZE
}
pub fn write_at<Item: Copy + 'static>(&mut self, addr: Addr, val: Item) {
let dest = self.slice_mut(addr, std::mem::size_of::<Item>());
store(dest, val);
/// Copies `data` into the arena at `addr`, assuming the
/// memory was allocated beforehand.
///
/// # Panics
///
/// May panic or corrupt the heap if the space was not
/// properly allocated beforehand.
pub fn write_bytes<B: AsRef<[u8]>>(&mut self, addr: Addr, data: B) {
    let src: &[u8] = data.as_ref();
    let page = &mut self.pages[addr.page_id()];
    let dest = page.get_mut_slice(addr.page_local_addr(), src.len());
    dest.copy_from_slice(src);
}
/// Borrows the `len` bytes that start at `addr`.
///
/// # Panics
///
/// Panics if the memory has not been allocated beforehand.
pub fn read_slice(&self, addr: Addr, len: usize) -> &[u8] {
    let page = &self.pages[addr.page_id()];
    page.get_slice(addr.page_local_addr(), len)
}
/// Returns a raw mutable pointer to the byte designated by `addr`.
///
/// The page is looked up by index from `addr.page_id()`; the pointer inside
/// the page is then obtained from `addr.page_local_addr()`.
///
/// # Safety
///
/// NOTE(review): presumably `addr` must come from a previous allocation in
/// this arena so that the page-local offset is in range; confirm.
unsafe fn get_mut_ptr(&mut self, addr: Addr) -> *mut u8 {
self.pages[addr.page_id()].get_mut_ptr(addr.page_local_addr())
}
/// Allocates room for `val` in the heap, writes it there,
/// and returns the address it was written to.
pub fn store<Item: ArenaStorable>(&mut self, val: Item) -> Addr {
    let addr = self.allocate_space(val.num_bytes());
    // `addr` was just allocated with exactly `val.num_bytes()` bytes,
    // which is the space `write` is about to fill.
    unsafe { self.write(addr, val) };
    addr
}
/// Writes `val` at `addr` by delegating to `ArenaStorable::write_into`.
///
/// # Safety
///
/// NOTE(review): presumably the caller must have allocated at least
/// `val.num_bytes()` bytes at `addr` beforehand; confirm.
pub unsafe fn write<Item: ArenaStorable>(&mut self, addr: Addr, val: Item) {
val.write_into(self, addr)
}
/// Read an item in the heap at the given `address`.
@@ -120,21 +174,9 @@ impl MemoryArena {
/// # Panics
///
/// If the address is erroneous
pub fn read<Item: Copy + 'static>(&self, addr: Addr) -> Item {
load(self.slice(addr, mem::size_of::<Item>()))
}
pub fn slice(&self, addr: Addr, len: usize) -> &[u8] {
self.pages[addr.page_id()].slice(addr.page_local_addr(), len)
}
pub fn slice_from(&self, addr: Addr) -> &[u8] {
self.pages[addr.page_id()].slice_from(addr.page_local_addr())
}
#[inline(always)]
pub fn slice_mut(&mut self, addr: Addr, len: usize) -> &mut [u8] {
self.pages[addr.page_id()].slice_mut(addr.page_local_addr(), len)
pub unsafe fn read<Item: Copy>(&self, addr: Addr) -> Item {
let ptr = self.pages[addr.page_id()].get_ptr(addr.page_local_addr());
ptr::read_unaligned(ptr as *const Item)
}
/// Allocates `len` bytes and returns the allocated address.
@@ -155,10 +197,14 @@ struct Page {
impl Page {
/// Creates an empty page identified by `page_id`.
///
/// The page owns `PAGE_SIZE` bytes of zero-initialized storage and starts
/// with `len` set to 0.
fn new(page_id: usize) -> Page {
    // The buffer is zero-filled with `vec!` instead of growing an empty
    // `Vec` through `set_len`: `set_len` requires the exposed elements to be
    // initialized, and handing out uninitialized bytes as `u8` is undefined
    // behavior once they are read. A zeroed allocation is usually cheap.
    Page {
        page_id,
        len: 0,
        data: vec![0u8; PAGE_SIZE].into_boxed_slice(),
    }
}
@@ -167,18 +213,14 @@ impl Page {
len + self.len <= PAGE_SIZE
}
fn slice(&self, local_addr: usize, len: usize) -> &[u8] {
&self.slice_from(local_addr)[..len]
}
fn slice_from(&self, local_addr: usize) -> &[u8] {
&self.data[local_addr..]
}
fn slice_mut(&mut self, local_addr: usize, len: usize) -> &mut [u8] {
/// Mutably borrows the `len` bytes of this page starting at `local_addr`.
///
/// The range is taken with slice indexing, so a range that does not fit
/// inside the page data panics.
fn get_mut_slice(&mut self, local_addr: usize, len: usize) -> &mut [u8] {
&mut self.data[local_addr..][..len]
}
/// Borrows the `len` bytes of this page starting at `local_addr`.
///
/// The range is taken with slice indexing, so a range that does not fit
/// inside the page data panics.
fn get_slice(&self, local_addr: usize, len: usize) -> &[u8] {
&self.data[local_addr..][..len]
}
fn allocate_space(&mut self, len: usize) -> Option<Addr> {
if self.is_available(len) {
let addr = Addr::new(self.page_id, self.len);
@@ -188,6 +230,16 @@ impl Page {
None
}
}
/// Returns a raw pointer `addr` bytes past the start of the page data.
///
/// # Safety
///
/// The offset is applied with pointer `add`, without any bounds check.
/// NOTE(review): presumably `addr` must stay within the page; confirm.
#[inline(always)]
pub(crate) unsafe fn get_ptr(&self, addr: usize) -> *const u8 {
self.data.as_ptr().add(addr)
}
/// Returns a raw mutable pointer `addr` bytes past the start of the page data.
///
/// # Safety
///
/// The offset is applied with pointer `add`, without any bounds check.
/// NOTE(review): presumably `addr` must stay within the page; confirm.
#[inline(always)]
pub(crate) unsafe fn get_mut_ptr(&mut self, addr: usize) -> *mut u8 {
self.data.as_mut_ptr().add(addr)
}
}
#[cfg(test)]
@@ -202,13 +254,13 @@ mod tests {
let b = b"happy tax payer";
let addr_a = arena.allocate_space(a.len());
arena.slice_mut(addr_a, a.len()).copy_from_slice(a);
arena.write_bytes(addr_a, a);
let addr_b = arena.allocate_space(b.len());
arena.slice_mut(addr_b, b.len()).copy_from_slice(b);
arena.write_bytes(addr_b, b);
assert_eq!(arena.slice(addr_a, a.len()), a);
assert_eq!(arena.slice(addr_b, b.len()), b);
assert_eq!(arena.read_slice(addr_a, a.len()), a);
assert_eq!(arena.read_slice(addr_b, b.len()), b);
}
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
@@ -231,15 +283,9 @@ mod tests {
b: 221,
c: 12,
};
let num_bytes = std::mem::size_of::<MyTest>();
let addr_a = arena.allocate_space(num_bytes);
arena.write_at(addr_a, a);
let addr_b = arena.allocate_space(num_bytes);
arena.write_at(addr_b, b);
assert_eq!(arena.read::<MyTest>(addr_a), a);
assert_eq!(arena.read::<MyTest>(addr_b), b);
let addr_a = arena.store(a);
let addr_b = arena.store(b);
assert_eq!(unsafe { arena.read::<MyTest>(addr_a) }, a);
assert_eq!(unsafe { arena.read::<MyTest>(addr_b) }, b);
}
}

View File

@@ -1,7 +1,9 @@
mod expull;
mod memory_arena;
mod murmurhash2;
mod term_hashmap;
pub use self::expull::ExpUnrolledLinkedList;
pub use self::memory_arena::{Addr, MemoryArena};
pub use self::memory_arena::{Addr, ArenaStorable, MemoryArena};
use self::murmurhash2::murmurhash2;
pub use self::term_hashmap::{compute_table_size, TermHashMap};

View File

@@ -0,0 +1,87 @@
use std::ptr;
/// Seed mixed into the initial hash state.
const SEED: u32 = 3_242_157_231u32;
/// Multiplier used by every mixing step.
const M: u32 = 0x5bd1_e995;

/// Computes a 32-bit MurmurHash2 of `key`, seeded with `SEED`.
///
/// Complete 4-byte words are read in the platform's native byte order;
/// the up to three trailing bytes are folded in afterwards.
#[inline(always)]
pub fn murmurhash2(key: &[u8]) -> u32 {
    // The length is narrowed to 32 bits before it seeds the state and
    // determines how many whole words get mixed.
    let len = key.len() as u32;
    let num_blocks = (len >> 2) as usize;
    let mut hash: u32 = SEED ^ len;

    // Body: mix every complete 4-byte word.
    for block in key[..num_blocks * 4].chunks(4) {
        #[cfg_attr(feature = "cargo-clippy", allow(clippy::cast_ptr_alignment))]
        let word_ptr = block.as_ptr() as *const u32;
        // `block` is exactly 4 bytes long because the slice above is a
        // multiple of 4, and the read makes no alignment assumption.
        let mut word: u32 = unsafe { ptr::read_unaligned(word_ptr) };
        word = word.wrapping_mul(M);
        word ^= word >> 24;
        word = word.wrapping_mul(M);
        hash = hash.wrapping_mul(M) ^ word;
    }

    // Tail: byte `i` of the remainder lands in bits `8 * i..8 * i + 8`.
    // The extra multiplication only happens when there is a remainder.
    let tail: &[u8] = &key[key.len() & !3..];
    if !tail.is_empty() {
        for (idx, &byte) in tail.iter().enumerate() {
            hash ^= u32::from(byte) << (8 * idx);
        }
        hash = hash.wrapping_mul(M);
    }

    // Final avalanche.
    hash ^= hash >> 13;
    hash = hash.wrapping_mul(M);
    hash ^ (hash >> 15)
}
#[cfg(test)]
mod test {

    use super::murmurhash2;
    use std::collections::HashSet;

    /// The two strings differ only in their last byte, which every tested
    /// slice excludes: the hashes must agree, i.e. hashing must not look
    /// past the end of the slice it was given.
    #[test]
    fn test_murmur() {
        let s1 = "abcdef";
        let s2 = "abcdeg";
        for start in 0..5 {
            let left = s1[start..5].as_bytes();
            let right = s2[start..5].as_bytes();
            assert_eq!(murmurhash2(left), murmurhash2(right));
        }
    }

    /// Pins the hash of a few inputs of increasing length to known values.
    #[test]
    fn test_murmur_against_reference_impl() {
        let expected: [(&str, u32); 7] = [
            ("", 3632506080),
            ("a", 455683869),
            ("ab", 2448092234),
            ("abc", 2066295634),
            ("abcd", 2588571162),
            ("abcde", 2988696942),
            ("abcdefghijklmnop", 2350868870),
        ];
        for &(input, hash) in expected.iter() {
            assert_eq!(murmurhash2(input.as_bytes()), hash);
        }
    }

    /// 10 000 distinct short keys must produce 10 000 distinct hashes.
    #[test]
    fn test_murmur_collisions() {
        let hashes: HashSet<u32> = (0..10_000)
            .map(|i| murmurhash2(format!("hash{}", i).as_bytes()))
            .collect();
        assert_eq!(hashes.len(), 10_000);
    }
}

View File

@@ -1,15 +1,37 @@
extern crate murmurhash32;
use self::murmurhash32::murmurhash2;
use super::{Addr, MemoryArena};
use byteorder::{ByteOrder, NativeEndian};
use postings::stacker::memory_arena::store;
use postings::UnorderedTermId;
use super::murmurhash2;
use super::{Addr, ArenaStorable, MemoryArena};
use std::iter;
use std::mem;
use std::slice;
pub type BucketId = usize;
/// A borrowed key and its value, bundled so that both can be written
/// into the arena as a single storable item.
struct KeyBytesValue<'a, V> {
/// Raw bytes of the key.
key: &'a [u8],
/// Value associated with the key.
value: V,
}
impl<'a, V> KeyBytesValue<'a, V> {
/// Bundles `key` and `value` without copying the key bytes.
fn new(key: &'a [u8], value: V) -> KeyBytesValue<'a, V> {
KeyBytesValue { key, value }
}
}
impl<'a, V: ArenaStorable> ArenaStorable for KeyBytesValue<'a, V> {
    /// Total stored size: a `u16` length prefix, the key bytes, then the value.
    fn num_bytes(&self) -> usize {
        let len_prefix = 0u16.num_bytes();
        len_prefix + self.key.len() + self.value.num_bytes()
    }

    /// Writes, back to back from `addr`: the key length as a `u16`,
    /// the key bytes, and finally the value.
    unsafe fn write_into(self, arena: &mut MemoryArena, addr: Addr) {
        let key_len = self.key.len();
        // NOTE(review): the length prefix is narrowed to `u16`, so a key
        // longer than 65_535 bytes would record a truncated length.
        // Presumably callers cap the key length; confirm.
        arena.write(addr, key_len as u16);
        let key_addr = addr.offset(2);
        arena.write_bytes(key_addr, self.key);
        arena.write(addr.offset(2 + key_len as u32), self.value);
    }
}
/// Returns the actual memory size in bytes
/// required to create a table of size $2^num_bits$.
pub fn compute_table_size(num_bits: usize) -> usize {
@@ -27,7 +49,6 @@ pub fn compute_table_size(num_bits: usize) -> usize {
struct KeyValue {
key_value_addr: Addr,
hash: u32,
unordered_term_id: UnorderedTermId,
}
impl Default for KeyValue {
@@ -35,7 +56,6 @@ impl Default for KeyValue {
KeyValue {
key_value_addr: Addr::null_pointer(),
hash: 0u32,
unordered_term_id: UnorderedTermId::default(),
}
}
}
@@ -60,7 +80,6 @@ pub struct TermHashMap {
pub heap: MemoryArena,
mask: usize,
occupied: Vec<usize>,
len: usize,
}
struct QuadraticProbing {
@@ -87,13 +106,14 @@ pub struct Iter<'a> {
}
impl<'a> Iterator for Iter<'a> {
type Item = (&'a [u8], Addr, UnorderedTermId);
type Item = (&'a [u8], Addr, BucketId);
fn next(&mut self) -> Option<Self::Item> {
self.inner.next().cloned().map(move |bucket: usize| {
let kv = self.hashmap.table[bucket];
let (key, offset): (&'a [u8], Addr) = self.hashmap.get_key_value(kv.key_value_addr);
(key, offset, kv.unordered_term_id)
let (key, offset): (&'a [u8], Addr) =
unsafe { self.hashmap.get_key_value(kv.key_value_addr) };
(key, offset, bucket as BucketId)
})
}
}
@@ -108,7 +128,6 @@ impl TermHashMap {
heap,
mask: table_size - 1,
occupied: Vec::with_capacity(table_size / 2),
len: 0,
}
}
@@ -124,34 +143,20 @@ impl TermHashMap {
self.table.len() < self.occupied.len() * 3
}
#[inline(always)]
fn get_key_value(&self, addr: Addr) -> (&[u8], Addr) {
let data = self.heap.slice_from(addr);
let key_bytes_len = NativeEndian::read_u16(data) as usize;
let key_bytes: &[u8] = &data[2..][..key_bytes_len];
(key_bytes, addr.offset(2u32 + key_bytes_len as u32))
unsafe fn get_key_value(&self, addr: Addr) -> (&[u8], Addr) {
let key_bytes_len = self.heap.read::<u16>(addr) as usize;
let key_addr = addr.offset(2u32);
let key_bytes: &[u8] = self.heap.read_slice(key_addr, key_bytes_len);
let val_addr: Addr = key_addr.offset(key_bytes.len() as u32);
(key_bytes, val_addr)
}
/// Returns the address of the value stored at `addr` when the key stored
/// there equals `target_key`, and `None` otherwise.
#[inline(always)]
fn get_value_addr_if_key_match(&self, target_key: &[u8], addr: Addr) -> Option<Addr> {
    match self.get_key_value(addr) {
        (stored_key, value_addr) if stored_key == target_key => Some(value_addr),
        _ => None,
    }
}
fn set_bucket(&mut self, hash: u32, key_value_addr: Addr, bucket: usize) -> UnorderedTermId {
pub fn set_bucket(&mut self, hash: u32, key_value_addr: Addr, bucket: usize) {
self.occupied.push(bucket);
let unordered_term_id = self.len as UnorderedTermId;
self.len += 1;
self.table[bucket] = KeyValue {
key_value_addr,
hash,
unordered_term_id,
};
unordered_term_id
}
pub fn iter(&self) -> Iter {
@@ -191,53 +196,64 @@ impl TermHashMap {
/// will be in charge of returning a default value.
/// If the key already as an associated value, then it will be passed
/// `Some(previous_value)`.
pub fn mutate_or_create<S, V, TMutator>(
&mut self,
key: S,
mut updater: TMutator,
) -> UnorderedTermId
pub fn mutate_or_create<S, V, TMutator>(&mut self, key: S, mut updater: TMutator) -> BucketId
where
S: AsRef<[u8]>,
V: Copy + 'static,
V: Copy,
TMutator: FnMut(Option<V>) -> V,
{
if self.is_saturated() {
self.resize();
}
let key_bytes: &[u8] = key.as_ref();
let hash = murmurhash2(key.as_ref());
let hash = murmurhash2::murmurhash2(key.as_ref());
let mut probe = self.probe(hash);
loop {
let bucket = probe.next_probe();
let kv: KeyValue = self.table[bucket];
if kv.is_empty() {
// The key does not exists yet.
let val = updater(None);
let num_bytes =
std::mem::size_of::<u16>() + key_bytes.len() + std::mem::size_of::<V>();
let key_addr = self.heap.allocate_space(num_bytes);
{
let data = self.heap.slice_mut(key_addr, num_bytes);
NativeEndian::write_u16(data, key_bytes.len() as u16);
let stop = 2 + key_bytes.len();
data[2..stop].copy_from_slice(key_bytes);
store(&mut data[stop..], val);
}
return self.set_bucket(hash, key_addr, bucket);
let key_addr = self.heap.store(KeyBytesValue::new(key_bytes, val));
self.set_bucket(hash, key_addr, bucket);
return bucket as BucketId;
} else if kv.hash == hash {
if let Some(val_addr) =
self.get_value_addr_if_key_match(key_bytes, kv.key_value_addr)
{
let v = self.heap.read(val_addr);
let new_v = updater(Some(v));
self.heap.write_at(val_addr, new_v);
return kv.unordered_term_id;
let (key_matches, val_addr) = {
let (stored_key, val_addr): (&[u8], Addr) =
unsafe { self.get_key_value(kv.key_value_addr) };
(stored_key == key_bytes, val_addr)
};
if key_matches {
unsafe {
// logic
let v = self.heap.read(val_addr);
let new_v = updater(Some(v));
self.heap.write(val_addr, new_v);
};
return bucket as BucketId;
}
}
}
}
}
#[cfg(all(test, feature = "unstable"))]
mod bench {
    use super::murmurhash2::murmurhash2;
    use test::Bencher;

    /// Hashes three short string keys per iteration and XORs the results
    /// together, so each iteration returns a value derived from every hash.
    #[bench]
    fn bench_murmurhash2(b: &mut Bencher) {
        let keys: [&'static str; 3] = ["wer qwe qwe qwe ", "werbq weqweqwe2 ", "weraq weqweqwe3 "];
        b.iter(|| {
            keys.iter()
                .fold(0u32, |acc, key| acc ^ murmurhash2(key.as_bytes()))
        });
    }
}
#[cfg(test)]
mod tests {
@@ -269,7 +285,10 @@ mod tests {
let mut vanilla_hash_map = HashMap::new();
let mut iter_values = hash_map.iter();
while let Some((key, addr, _)) = iter_values.next() {
let val: u32 = hash_map.heap.read(addr);
let val: u32 = unsafe {
// test
hash_map.heap.read(addr)
};
vanilla_hash_map.insert(key.to_owned(), val);
}
assert_eq!(vanilla_hash_map.len(), 2);

View File

@@ -101,9 +101,8 @@ mod tests {
index_writer.commit().unwrap();
index_writer.add_document(doc!(field=>"ccc"));
index_writer.commit().unwrap();
let reader = index.reader().unwrap();
reader.reload().unwrap();
let searcher = reader.searcher();
index.load_searchers().unwrap();
let searcher = index.searcher();
let weight = AllQuery.weight(&searcher, false).unwrap();
{
let reader = searcher.segment_reader(0);

View File

@@ -1,10 +1,10 @@
use common::BitSet;
use core::SegmentReader;
use fst::Automaton;
use query::BitSetDocSet;
use query::ConstScorer;
use query::{Scorer, Weight};
use schema::{Field, IndexRecordOption};
use tantivy_fst::Automaton;
use termdict::{TermDictionary, TermStreamer};
use Result;

View File

@@ -63,8 +63,7 @@ impl BM25Weight {
.map(|term| {
let term_doc_freq = searcher.doc_freq(term);
idf(term_doc_freq, total_num_docs)
})
.sum::<f32>();
}).sum::<f32>();
BM25Weight::new(idf, average_fieldnorm)
}

View File

@@ -47,8 +47,7 @@ impl Query for BooleanQuery {
.iter()
.map(|&(ref occur, ref subquery)| {
Ok((*occur, subquery.weight(searcher, scoring_enabled)?))
})
.collect::<Result<_>>()?;
}).collect::<Result<_>>()?;
Ok(Box::new(BooleanWeight::new(sub_weights, scoring_enabled)))
}
@@ -69,8 +68,7 @@ impl BooleanQuery {
let term_query: Box<Query> =
Box::new(TermQuery::new(term, IndexRecordOption::WithFreqs));
(Occur::Should, term_query)
})
.collect();
}).collect();
BooleanQuery::from(occur_term_queries)
}

View File

@@ -1,4 +1,5 @@
use core::SegmentReader;
use downcast::Downcast;
use query::intersect_scorers;
use query::score_combiner::{DoNothingCombiner, ScoreCombiner, SumWithCoordsCombiner};
use query::term_query::TermScorer;
@@ -9,6 +10,7 @@ use query::RequiredOptionalScorer;
use query::Scorer;
use query::Union;
use query::Weight;
use std::borrow::Borrow;
use std::collections::HashMap;
use Result;
@@ -22,11 +24,14 @@ where
}
{
let is_all_term_queries = scorers.iter().all(|scorer| scorer.is::<TermScorer>());
let is_all_term_queries = scorers.iter().all(|scorer| {
let scorer_ref: &Scorer = scorer.borrow();
Downcast::<TermScorer>::is_type(scorer_ref)
});
if is_all_term_queries {
let scorers: Vec<TermScorer> = scorers
.into_iter()
.map(|scorer| *(scorer.downcast::<TermScorer>().map_err(|_| ()).unwrap()))
.map(|scorer| *Downcast::<TermScorer>::downcast(scorer).unwrap())
.collect();
let scorer: Box<Scorer> = Box::new(Union::<TermScorer, TScoreCombiner>::from(scorers));
return scorer;

View File

@@ -8,6 +8,7 @@ mod tests {
use super::*;
use collector::tests::TestCollector;
use downcast::Downcast;
use query::score_combiner::SumWithCoordsCombiner;
use query::term_query::TermScorer;
use query::Intersection;
@@ -28,7 +29,7 @@ mod tests {
let index = Index::create_in_ram(schema);
{
// writing the segment
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
{
let doc = doc!(text_field => "a b c");
index_writer.add_document(doc);
@@ -51,6 +52,7 @@ mod tests {
}
assert!(index_writer.commit().is_ok());
}
index.load_searchers().unwrap();
(index, text_field)
}
@@ -59,8 +61,7 @@ mod tests {
let (index, text_field) = aux_test_helper();
let query_parser = QueryParser::for_index(&index, vec![text_field]);
let query = query_parser.parse_query("(+a +b) d").unwrap();
let searcher = index.reader().unwrap().searcher();
assert_eq!(query.count(&searcher).unwrap(), 3);
assert_eq!(query.count(&*index.searcher()).unwrap(), 3);
}
#[test]
@@ -68,28 +69,28 @@ mod tests {
let (index, text_field) = aux_test_helper();
let query_parser = QueryParser::for_index(&index, vec![text_field]);
let query = query_parser.parse_query("+a").unwrap();
let searcher = index.reader().unwrap().searcher();
let searcher = index.searcher();
let weight = query.weight(&searcher, true).unwrap();
let scorer = weight.scorer(searcher.segment_reader(0u32)).unwrap();
assert!(scorer.is::<TermScorer>());
assert!(Downcast::<TermScorer>::is_type(&*scorer));
}
#[test]
pub fn test_boolean_termonly_intersection() {
let (index, text_field) = aux_test_helper();
let query_parser = QueryParser::for_index(&index, vec![text_field]);
let searcher = index.reader().unwrap().searcher();
let searcher = index.searcher();
{
let query = query_parser.parse_query("+a +b +c").unwrap();
let weight = query.weight(&searcher, true).unwrap();
let scorer = weight.scorer(searcher.segment_reader(0u32)).unwrap();
assert!(scorer.is::<Intersection<TermScorer>>());
assert!(Downcast::<Intersection<TermScorer>>::is_type(&*scorer));
}
{
let query = query_parser.parse_query("+a +(b c)").unwrap();
let weight = query.weight(&searcher, true).unwrap();
let scorer = weight.scorer(searcher.segment_reader(0u32)).unwrap();
assert!(scorer.is::<Intersection<Box<Scorer>>>());
assert!(Downcast::<Intersection<Box<Scorer>>>::is_type(&*scorer));
}
}
@@ -97,19 +98,21 @@ mod tests {
pub fn test_boolean_reqopt() {
let (index, text_field) = aux_test_helper();
let query_parser = QueryParser::for_index(&index, vec![text_field]);
let searcher = index.reader().unwrap().searcher();
let searcher = index.searcher();
{
let query = query_parser.parse_query("+a b").unwrap();
let weight = query.weight(&searcher, true).unwrap();
let scorer = weight.scorer(searcher.segment_reader(0u32)).unwrap();
assert!(scorer
.is::<RequiredOptionalScorer<Box<Scorer>, Box<Scorer>, SumWithCoordsCombiner>>());
assert!(Downcast::<
RequiredOptionalScorer<Box<Scorer>, Box<Scorer>, SumWithCoordsCombiner>,
>::is_type(&*scorer));
}
{
let query = query_parser.parse_query("+a b").unwrap();
let weight = query.weight(&searcher, false).unwrap();
let scorer = weight.scorer(searcher.segment_reader(0u32)).unwrap();
assert!(scorer.is::<TermScorer>());
println!("{:?}", scorer.type_name());
assert!(Downcast::<TermScorer>::is_type(&*scorer));
}
}
@@ -126,13 +129,10 @@ mod tests {
query
};
let reader = index.reader().unwrap();
let matching_docs = |boolean_query: &Query| {
reader
.searcher()
.search(boolean_query, &TestCollector)
.unwrap()
let searcher = index.searcher();
let test_docs = searcher.search(boolean_query, &TestCollector).unwrap();
test_docs
.docs()
.iter()
.cloned()
@@ -188,12 +188,10 @@ mod tests {
let query: Box<Query> = Box::new(term_query);
query
};
let reader = index.reader().unwrap();
let score_docs = |boolean_query: &Query| {
let fruit = reader
.searcher()
.search(boolean_query, &TestCollector)
.unwrap();
let searcher = index.searcher();
let fruit = searcher.search(boolean_query, &TestCollector).unwrap();
fruit.scores().to_vec()
};

View File

@@ -52,8 +52,9 @@ lazy_static! {
/// ));
/// index_writer.commit().unwrap();
/// }
/// let reader = index.reader()?;
/// let searcher = reader.searcher();
///
/// index.load_searchers()?;
/// let searcher = index.searcher();
///
/// {
///
@@ -140,8 +141,8 @@ mod test {
));
index_writer.commit().unwrap();
}
let reader = index.reader().unwrap();
let searcher = reader.searcher();
index.load_searchers().unwrap();
let searcher = index.searcher();
{
let term = Term::from_field_text(country_field, "japon");

View File

@@ -1,7 +1,9 @@
use docset::{DocSet, SkipResult};
use downcast::Downcast;
use query::term_query::TermScorer;
use query::EmptyScorer;
use query::Scorer;
use std::borrow::Borrow;
use DocId;
use Score;
@@ -14,35 +16,42 @@ use Score;
/// specialized implementation if the two
/// shortest scorers are `TermScorer`s.
pub fn intersect_scorers(mut scorers: Vec<Box<Scorer>>) -> Box<Scorer> {
if scorers.is_empty() {
return Box::new(EmptyScorer);
}
if scorers.len() == 1 {
return scorers.pop().unwrap();
}
// We know that we have at least 2 elements.
let num_docsets = scorers.len();
scorers.sort_by(|left, right| right.size_hint().cmp(&left.size_hint()));
let left = scorers.pop().unwrap();
let right = scorers.pop().unwrap();
let rarest_opt = scorers.pop();
let second_rarest_opt = scorers.pop();
scorers.reverse();
let all_term_scorers = [&left, &right]
.iter()
.all(|&scorer| scorer.is::<TermScorer>());
if all_term_scorers {
return Box::new(Intersection {
left: *(left.downcast::<TermScorer>().map_err(|_| ()).unwrap()),
right: *(right.downcast::<TermScorer>().map_err(|_| ()).unwrap()),
others: scorers,
num_docsets,
});
match (rarest_opt, second_rarest_opt) {
(None, None) => Box::new(EmptyScorer),
(Some(single_docset), None) => single_docset,
(Some(left), Some(right)) => {
{
let all_term_scorers = [&left, &right].into_iter().all(|scorer| {
let scorer_ref: &Scorer = (*scorer).borrow();
Downcast::<TermScorer>::is_type(scorer_ref)
});
if all_term_scorers {
let left = *Downcast::<TermScorer>::downcast(left).unwrap();
let right = *Downcast::<TermScorer>::downcast(right).unwrap();
return Box::new(Intersection {
left,
right,
others: scorers,
num_docsets,
});
}
}
Box::new(Intersection {
left,
right,
others: scorers,
num_docsets,
})
}
_ => {
unreachable!();
}
}
Box::new(Intersection {
left,
right,
others: scorers,
num_docsets,
})
}
/// Creates a `DocSet` that iterator through the intersection of two `DocSet`s.
@@ -118,6 +127,7 @@ impl<TDocSet: DocSet, TOtherDocSet: DocSet> DocSet for Intersection<TDocSet, TOt
return false;
}
}
match left.skip_next(candidate) {
SkipResult::Reached => {
break;
@@ -133,36 +143,35 @@ impl<TDocSet: DocSet, TOtherDocSet: DocSet> DocSet for Intersection<TDocSet, TOt
}
// test the remaining scorers;
for (ord, docset) in self.others.iter_mut().enumerate() {
if ord == other_candidate_ord {
continue;
}
// `candidate_ord` is already at the
// right position.
//
// Calling `skip_next` would advance this docset
// and miss it.
match docset.skip_next(candidate) {
SkipResult::Reached => {}
SkipResult::OverStep => {
// this is not in the intersection,
// let's update our candidate.
candidate = docset.doc();
match left.skip_next(candidate) {
SkipResult::Reached => {
other_candidate_ord = ord;
}
SkipResult::OverStep => {
candidate = left.doc();
other_candidate_ord = usize::max_value();
}
SkipResult::End => {
return false;
if ord != other_candidate_ord {
// `candidate_ord` is already at the
// right position.
//
// Calling `skip_next` would advance this docset
// and miss it.
match docset.skip_next(candidate) {
SkipResult::Reached => {}
SkipResult::OverStep => {
// this is not in the intersection,
// let's update our candidate.
candidate = docset.doc();
match left.skip_next(candidate) {
SkipResult::Reached => {
other_candidate_ord = ord;
}
SkipResult::OverStep => {
candidate = left.doc();
other_candidate_ord = usize::max_value();
}
SkipResult::End => {
return false;
}
}
continue 'outer;
}
SkipResult::End => {
return false;
}
continue 'outer;
}
SkipResult::End => {
return false;
}
}
}

View File

@@ -24,13 +24,14 @@ mod tests {
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
{
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
for &text in texts {
let doc = doc!(text_field=>text);
index_writer.add_document(doc);
}
assert!(index_writer.commit().is_ok());
}
index.load_searchers().unwrap();
index
}
@@ -45,7 +46,8 @@ mod tests {
]);
let schema = index.schema();
let text_field = schema.get_field("text").unwrap();
let searcher = index.reader().unwrap().searcher();
index.load_searchers().unwrap();
let searcher = index.searcher();
let test_query = |texts: Vec<&str>| {
let terms: Vec<Term> = texts
.iter()
@@ -84,11 +86,12 @@ mod tests {
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
{
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
index_writer.add_document(doc!(text_field=>"a b c"));
assert!(index_writer.commit().is_ok());
}
let searcher = index.reader().unwrap().searcher();
index.load_searchers().unwrap();
let searcher = index.searcher();
let phrase_query = PhraseQuery::new(vec![
Term::from_field_text(text_field, "a"),
Term::from_field_text(text_field, "b"),
@@ -112,7 +115,8 @@ mod tests {
let index = create_index(&["a b c", "a b c a b"]);
let schema = index.schema();
let text_field = schema.get_field("text").unwrap();
let searcher = index.reader().unwrap().searcher();
index.load_searchers().unwrap();
let searcher = index.searcher();
let test_query = |texts: Vec<&str>| {
let terms: Vec<Term> = texts
.iter()
@@ -137,14 +141,15 @@ mod tests {
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
{
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
index_writer.add_document(doc!(text_field=>"b"));
index_writer.add_document(doc!(text_field=>"a b"));
index_writer.add_document(doc!(text_field=>"b a"));
assert!(index_writer.commit().is_ok());
}
let searcher = index.reader().unwrap().searcher();
index.load_searchers().unwrap();
let searcher = index.searcher();
let test_query = |texts: Vec<&str>| {
let terms: Vec<Term> = texts
.iter()
@@ -168,11 +173,12 @@ mod tests {
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
{
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
index_writer.add_document(doc!(text_field=>"a b c d e f g h"));
assert!(index_writer.commit().is_ok());
}
let searcher = index.reader().unwrap().searcher();
index.load_searchers().unwrap();
let searcher = index.searcher();
let test_query = |texts: Vec<(usize, &str)>| {
let terms: Vec<(usize, Term)> = texts
.iter()

View File

@@ -43,7 +43,7 @@ impl<TPostings: Postings> DocSet for PostingsWithOffset<TPostings> {
pub struct PhraseScorer<TPostings: Postings> {
intersection_docset: Intersection<PostingsWithOffset<TPostings>, PostingsWithOffset<TPostings>>,
num_terms: usize,
num_docsets: usize,
left: Vec<u32>,
right: Vec<u32>,
phrase_count: u32,
@@ -134,11 +134,10 @@ impl<TPostings: Postings> PhraseScorer<TPostings> {
.into_iter()
.map(|(offset, postings)| {
PostingsWithOffset::new(postings, (max_offset - offset) as u32)
})
.collect::<Vec<_>>();
}).collect::<Vec<_>>();
PhraseScorer {
intersection_docset: Intersection::new(postings_with_offsets),
num_terms: num_docsets,
num_docsets,
left: Vec::with_capacity(100),
right: Vec::with_capacity(100),
phrase_count: 0u32,
@@ -165,7 +164,7 @@ impl<TPostings: Postings> PhraseScorer<TPostings> {
.positions(&mut self.left);
}
let mut intersection_len = self.left.len();
for i in 1..self.num_terms - 1 {
for i in 1..self.num_docsets - 1 {
{
self.intersection_docset
.docset_mut_specialized(i)
@@ -178,7 +177,7 @@ impl<TPostings: Postings> PhraseScorer<TPostings> {
}
self.intersection_docset
.docset_mut_specialized(self.num_terms - 1)
.docset_mut_specialized(self.num_docsets - 1)
.positions(&mut self.right);
intersection_exists(&self.left[..intersection_len], &self.right[..])
}
@@ -190,7 +189,7 @@ impl<TPostings: Postings> PhraseScorer<TPostings> {
.positions(&mut self.left);
}
let mut intersection_len = self.left.len();
for i in 1..self.num_terms - 1 {
for i in 1..self.num_docsets - 1 {
{
self.intersection_docset
.docset_mut_specialized(i)
@@ -203,7 +202,7 @@ impl<TPostings: Postings> PhraseScorer<TPostings> {
}
self.intersection_docset
.docset_mut_specialized(self.num_terms - 1)
.docset_mut_specialized(self.num_docsets - 1)
.positions(&mut self.right);
intersection_count(&self.left[..intersection_len], &self.right[..]) as u32
}

View File

@@ -1,6 +1,6 @@
use super::Weight;
use core::searcher::Searcher;
use downcast_rs;
use downcast;
use std::collections::BTreeSet;
use std::fmt;
use Result;
@@ -39,7 +39,7 @@ use Term;
///
/// When implementing a new type of `Query`, it is normal to implement a
/// dedicated `Query`, `Weight` and `Scorer`.
pub trait Query: QueryClone + downcast_rs::Downcast + fmt::Debug {
pub trait Query: QueryClone + downcast::Any + fmt::Debug {
/// Create the weight associated to a query.
///
/// If scoring is not required, setting `scoring_enabled` to `false`
@@ -96,4 +96,7 @@ impl QueryClone for Box<Query> {
}
}
impl_downcast!(Query);
#[allow(missing_docs)]
mod downcast_impl {
downcast!(super::Query);
}

View File

@@ -52,7 +52,7 @@ parser! {
field_name: None,
phrase,
});
attempt(term_query)
try(term_query)
.or(term_default_field)
.map(UserInputLeaf::from)
}
@@ -83,12 +83,12 @@ parser! {
let lower_bound = {
let excl = (char('{'), term_val()).map(|(_, w)| UserInputBound::Exclusive(w));
let incl = (char('['), term_val()).map(|(_, w)| UserInputBound::Inclusive(w));
attempt(excl).or(incl)
try(excl).or(incl)
};
let upper_bound = {
let excl = (term_val(), char('}')).map(|(w, _)| UserInputBound::Exclusive(w));
let incl = (term_val(), char(']')).map(|(w, _)| UserInputBound::Inclusive(w));
attempt(excl).or(incl)
try(excl).or(incl)
};
(
optional((field(), char(':')).map(|x| x.0)),
@@ -112,11 +112,11 @@ parser! {
.or((char('+'), leaf()).map(|(_, expr)| expr.unary(Occur::Must) ))
.or((char('('), parse_to_ast(), char(')')).map(|(_, expr, _)| expr))
.or(char('*').map(|_| UserInputAST::from(UserInputLeaf::All) ))
.or(attempt(
.or(try(
(string("NOT"), spaces1(), leaf()).map(|(_, _, expr)| expr.unary(Occur::MustNot))
)
)
.or(attempt(
.or(try(
range().map(UserInputAST::from)
)
)
@@ -160,7 +160,7 @@ parser! {
where [I: Stream<Item = char>]
{
(
attempt(
try(
chainl1(
leaf().map(Element::SingleEl),
binary_operand().map(|op: BinaryOperand|

Some files were not shown because too many files have changed in this diff Show More