Compare commits


46 Commits

Author SHA1 Message Date
Jason Wolfe
c9d8031664 Committing for shared discussion 2018-05-19 14:03:38 +09:00
jwolfe
9628413386 Merge branch 'issue/query-ergonomics-3' into staged_collector_with_multi 2018-05-18 17:05:14 +09:00
jwolfe
b2ce65f52d Expose parameters of RangeQuery for external usage 2018-05-18 17:04:26 +09:00
Jason Wolfe
0cea706f10 Add docs to new Query methods (#307) 2018-05-18 13:53:29 +09:00
jwolfe
ad81e131ec Merge branch 'master' into staged_collector_with_multi 2018-05-18 12:22:44 +09:00
Paul Masurel
71d41ca209 Added Google to the license 2018-05-18 10:13:23 +09:00
Paul Masurel
bc69dab822 cargo fmt 2018-05-18 10:08:05 +09:00
Jason Wolfe
72acad0921 Add box_clone() and downcast::Any to Query (#303) 2018-05-18 09:53:11 +09:00
Paul Masurel
c9459f74e8 Update docs about TermDict. 2018-05-18 09:20:39 +09:00
jwolfe
327ca2ab02 Make Weight Send+Sync for parallelization purposes 2018-05-16 10:49:38 +09:00
jwolfe
16ca6a0e5c Fix test 2018-05-14 16:40:00 +09:00
jwolfe
8c07ae653d Merge branch 'master' into staged_collector_with_multi 2018-05-14 15:35:23 +09:00
jwolfe
3d483f8711 Fix chained collector 2018-05-14 14:20:07 +09:00
Paul Masurel
56b2e9731f working. Chained collector is broken though 2018-05-12 16:00:58 -07:00
Dru Sellers
08d2cc6c7b Make it possible to stream the terms matching an Automaton (#297)
* rustfmt and some English grammar

* sort cargo.toml crates

* WIP: something to show

* Remove example for now

* Implement desired method

* Resolving Generic Type Arguments

* Resolve Generic Types

* Banging around on the tests

* DANGER! Change unsafe usage based on compiler warnings

* Unscrew up my rebase

* Clean Up Type Spam

Default Types FTW

* typo

* better variable names

* Remove Duplicate Levenshtein crate
2018-05-11 12:41:14 -07:00
Jason Wolfe
c85668cabe Attempt to add MultiCollector back 2018-05-11 22:18:43 +09:00
Dru Sellers
82d87416c2 Implement StopWords Filter (#292)
* Implement StopWords Filter

- added example doctest for alphanum_only.rs so that I could
drive my own test of the stopword filter

* Style Cop

* Switch HashSet Hasher to FNV for speed

* Update Change Log

* fix missed location renaming
2018-05-09 18:40:41 -07:00
Paul Masurel
96b2c2971e Testing actual doc ids in unit test 2018-05-09 09:14:22 -07:00
Dru Sellers
162afd73f6 Alive docs iterator (#293)
* Add non-deleted DocId iterator to SegmentReader

Closes #287

* Add Todo

* Add Unit Test

* Improving test based on feedback

- found bug and fixed it. :)

* Reestablish changes post rebase for clean merge
2018-05-09 09:03:27 -07:00
Paul Masurel
ddfd87fa59 Merge branch 'master' of github.com:tantivy-search/tantivy 2018-05-08 00:08:17 -07:00
Paul Masurel
24050d0eb5 Remove some unsafe stuff, justified some of it. 2018-05-07 23:57:53 -07:00
Jason Wolfe
89eb209ece #294: Make fieldnorm module public, add documentation (#295) 2018-05-07 20:20:38 -07:00
Paul Masurel
9a0b7f9855 Rustfmt 2018-05-07 19:50:35 -07:00
Jason Wolfe
8e343b1ca3 Add fast field for associating arbitrary bytes to a document (#275)
* Add fast field for associating arbitrary bytes to a document

* Fix unused macro_use warning

* Improvements from code review

* Make BytesFastFieldWriter public

* Fix json parsing validation failure

* Add bytes fast field to CHANGELOG.md

* Fix compile errors from merge

* Support merging

* Address misc code review comments

* Fix comments from CR
2018-05-07 19:30:31 -07:00
Paul Masurel
99c0b84036 Integrating #274, #280, #289 into master (#290)
* Integrating bugfixes into master

Closes #274
Closes #280
Closes #289

* Next version will be 0.6
2018-05-06 09:48:25 -07:00
Dru Sellers
ca74c14647 Simple Implementation of NGram Tokenizer (#278)
* Simple Implementation of NGram Tokenizer

It does not yet support edges
It could probably be better in many "rusty" ways
But the test is passing, so I'll call this a good stopping point for
the day.

* Remove Ngram from manager. Too many variations

* Basic configuration model

Should the extensive tests exist here?

* Add Sample to provide an End to End testing

* Basic Edgegram support

* cleanup

* code feedback

* More code review feedback processed
2018-05-06 09:47:49 -07:00
Dru Sellers
68ee18e4e8 Add Index::open_directory function (#285)
* Add Index::open_directory function

* dry
2018-05-03 00:07:46 -07:00
Jason Wolfe
8a33ddaca7 Split Collector into an overall Collector and a per-segment SegmentCollector. Precursor to cross-segment parallelism, and as a side benefit cleans up any per-segment fields from being Option<T> to just T. 2018-05-02 14:08:52 +09:00
Paul Masurel
5637657c2f Removed ptr dereference for explicit ptr::read_unaligned 2018-04-25 19:15:32 +09:00
Paul Masurel
2e3c9a8878 Bugfix in murmurhash. 2018-04-25 19:06:31 +09:00
Paul Masurel
78673172d0 Cargo fmt 2018-04-21 20:05:36 +09:00
Paul Masurel
175b76f119 Removed streamdict
Closes #271
2018-04-21 19:55:41 +09:00
Paul Masurel
9b79e21bd7 Returning error when schema is not valid for a given query. 2018-04-19 13:02:30 +09:00
Paul Masurel
5e38ae336f Bump tantivy version and readded win deps 2018-04-17 18:27:57 +09:00
Paul Masurel
8604351f59 Hide some of the API
Added some doc.
2018-04-17 13:31:22 +09:00
Paul Masurel
6a48953d8a Closes #266 (#268)
PhraseQuery panics with a nice error message when the underlying field does not have any positions.
The `QueryParser` fails as well with a dedicated error.
2018-04-17 10:03:15 +09:00
pmasurel
0804b42afa Checking the type of range queries 2018-04-16 14:01:10 +09:00
Paul Masurel
8083bc6eef bench working 2018-04-15 12:25:38 +09:00
Paul Masurel
0156f88265 Compiles in stable rust 2018-04-15 11:03:44 +09:00
Paul Masurel
a1c07bf457 Added iterator for facet collector 2018-04-14 20:22:02 +09:00
Paul Masurel
9de74b68d1 Remove range argument 2018-04-13 18:34:23 +09:00
Paul Masurel
57c7073867 Removed 2018-04-13 09:43:36 +09:00
Paul Masurel
121374b89b Removed the need for AtomicU64 2018-04-12 22:08:15 +09:00
Paul Masurel
e44782bf14 No more 2018-04-12 13:01:11 +09:00
Paul Masurel
dfafb24fa6 Bumped bitpacker's version 2018-04-10 21:21:47 +09:00
jason-wolfe
4c6f9541e9 #263: Make MultiValueIntFastFieldWriter public, expose via FastFieldsWriter (#264) 2018-04-10 12:27:34 +09:00
158 changed files with 4996 additions and 3392 deletions

View File

@@ -1,9 +1,21 @@
Tantivy 0.5.2
Tantivy 0.6
==========================
- Removed C code. Tantivy is now pure Rust.
- BM25
- Approximate field norms encoded over 1 byte.
- Compiles on stable rust
- Add &[u8] fastfield for associating arbitrary bytes to each document (@jason-wolfe) (#270)
- Completely uncompressed
- Internally: One u64 fast field for indexes, one fast field for the bytes themselves.
- Add NGram token support (@drusellers)
- Add Stopword Filter support (@drusellers)
Tantivy 0.5.2
===========================
- bugfix #274
- bugfix #280
- bugfix #289
Tantivy 0.5.1
==========================
@@ -81,7 +93,7 @@ Tantivy 0.3
Special thanks to @Kodraus @lnicola @Ameobea @manuel-woelker @celaus
for their contribution to this release.
Thanks also to everyone in tantivy gitter chat
for their advice and company :)
https://gitter.im/tantivy-search/tantivy
@@ -89,9 +101,9 @@ https://gitter.im/tantivy-search/tantivy
Warning:
Tantivy 0.3 is NOT backward compatible with tantivy 0.2
code and index format.
You should not expect backward compatibility before
tantivy 1.0.
@@ -117,7 +129,7 @@ Thanks to @KodrAus ! (#108)
the natural ordering.
- Building binary targets for tantivy-cli (Thanks to @KodrAus)
- Misc invisible bug fixes, and code cleanup.
- Use

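The bytes fast-field entry above resurfaces later in this compare as `bytes_fast_field_reader` and `get_val`. A hedged sketch of the read path those names suggest; the schema-side setup is not shown in this compare, so `field` is taken as a given here:

use tantivy::schema::Field;
use tantivy::SegmentReader;

// Sketch only: method names are taken from this diff, not from a released API.
fn dump_bytes_field(segment: &SegmentReader, field: Field) -> tantivy::Result<()> {
    let reader = segment.bytes_fast_field_reader(field)?;
    for doc in 0..segment.max_doc() {
        // Each document maps to an arbitrary, uncompressed byte slice
        // (internally one u64 index fast field plus the raw bytes).
        let payload = reader.get_val(doc);
        println!("doc {}: {} bytes", doc, payload.len());
    }
    Ok(())
}
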
View File

@@ -1,6 +1,6 @@
[package]
name = "tantivy"
version = "0.5.1"
version = "0.6.0-dev"
authors = ["Paul Masurel <paul.masurel@gmail.com>"]
license = "MIT"
categories = ["database-implementations", "data-structures"]
@@ -12,12 +12,14 @@ readme = "README.md"
keywords = ["search", "information", "retrieval"]
[dependencies]
base64 = "0.9.1"
byteorder = "1.0"
lazy_static = "0.2.1"
tinysegmenter = "0.1.0"
regex = "0.2"
fst = {version="0.2", default-features=false}
fst = {version="0.3", default-features=false}
atomicwrites = {version="0.1", optional=true}
tempfile = "2.1"
log = "0.3.6"
combine = "2.2"
tempdir = "0.3"
@@ -26,6 +28,8 @@ serde_derive = "1.0"
serde_json = "1.0"
num_cpus = "1.2"
itertools = "0.5.9"
levenshtein_automata = {version="0.1", features=["fst_automaton"]}
lz4 = "1.20"
bit-set = "0.4.0"
uuid = { version = "0.6", features = ["v4", "serde"] }
chan = "0.1"
@@ -36,17 +40,16 @@ error-chain = "0.8"
owning_ref = "0.3"
stable_deref_trait = "1.0.0"
rust-stemmers = "0.1.0"
downcast = { version="0.9", features = ["nightly"]}
downcast = { version="0.9" }
matches = "0.1"
snap = "0.2"
bitpacking = {path = "../bitpacking"}
bitpacking = "0.4"
fnv = "1.0.6"
[target.'cfg(windows)'.dependencies]
winapi = "0.2"
[dev-dependencies]
rand = "0.3"
tempfile = "2.1"
env_logger = "0.4"
[profile.release]
@@ -55,12 +58,11 @@ debug = false
lto = true
debug-assertions = false
[features]
default = ["mmap"]
streamdict = []
simd = ["bitpacking/simd"]
mmap = ["fst/mmap", "atomicwrites"]
unstable = ["simd"]
[badges]
travis-ci = { repository = "tantivy-search/tantivy" }
@@ -69,11 +71,5 @@ travis-ci = { repository = "tantivy-search/tantivy" }
name = "simple_search"
required-features = ["mmap"]
[[bin]]
name = "convert_to_static"
path = "./bin/convert_to_static.rs"
[[bin]]
name = "test_static_dir"
path = "./bin/test_static_dir.rs"
[[example]]
name = "custom_tokenizer"

View File

@@ -1,4 +1,4 @@
Copyright (c) 2016 Paul Masurel
Copyright (c) 2018 by Paul Masurel, Google LLC
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:

View File

@@ -43,15 +43,12 @@ It will walk you through getting a wikipedia search engine up and running in a f
## Development
Tantivy requires Rust Nightly because it uses the features [`box_syntax`](https://doc.rust-lang.org/stable/unstable-book/language-features/box-syntax.html), [`optin_builtin_traits`](https://github.com/rust-lang/rfcs/blob/master/text/0019-opt-in-builtin-traits.md), [`conservative_impl_trait`](https://github.com/rust-lang/rfcs/blob/master/text/1522-conservative-impl-trait.md),
and [simd](https://github.com/rust-lang/rust/issues/27731).
Tantivy now compiles on stable rust.
To check out and run test, you can simply run :
git clone git@github.com:tantivy-search/tantivy.git
cd tantivy
cargo +nightly build
cargo build
## Note on release build and performance

View File

@@ -1,20 +0,0 @@
use std::env;
use std::path::PathBuf;
use std::fs::File;
use std::io::Write;
extern crate tantivy;
use tantivy::directory::write_static_from_directory;
fn main() {
// Prints each argument on a separate line
let mut args = env::args();
args.next().unwrap();
let directory_path= args.next().expect("Expect 2 args.<directory_path> <outputfile>");
let output_path = args.next().expect("Expect 2 args.<directory_path> <outputfile>");
println!("{} => {}", directory_path, output_path);
let buffer = write_static_from_directory(&PathBuf::from(directory_path)).unwrap();
println!("Read all");
let mut output = File::create(output_path).unwrap();
output.write_all(&buffer[..]).unwrap();
output.flush().unwrap();
}

View File

@@ -1,51 +0,0 @@
use std::env;
use std::path::PathBuf;
use std::fs::File;
use std::io::Write;
extern crate tantivy;
use tantivy::directory::{StaticDirectory, write_static_from_directory};
use tantivy::Index;
use tantivy::query::QueryParser;
use tantivy::collector::TopCollector;
static DATA: &'static [u8] = include_bytes!("output.bin");
fn run() -> tantivy::Result<()> {
// Prints each argument on a separate line
let directory = StaticDirectory::open(DATA).unwrap();
let index = Index::open_directory(directory).unwrap();
index.load_searchers().unwrap();
let searcher = index.searcher();
let schema = index.schema();
let title = schema.get_field("title").unwrap();
let body = schema.get_field("body").unwrap();
let query_parser = QueryParser::for_index(&index, vec![title, body]);
let query = query_parser.parse_query("sea whale")?;
let mut top_collector = TopCollector::with_limit(10);
searcher.search(&*query, &mut top_collector)?;
let doc_addresses = top_collector.docs();
// The actual documents still need to be
// retrieved from Tantivy's store.
//
// Since the body field was not configured as stored,
// the document returned will only contain
// a title.
for doc_address in doc_addresses {
let retrieved_doc = searcher.doc(&doc_address)?;
println!("{}", schema.to_json(&retrieved_doc));
}
Ok(())
}
fn main() {
run().unwrap();
}

View File

@@ -0,0 +1,226 @@
extern crate tantivy;
extern crate tempdir;
#[macro_use]
extern crate serde_json;
use std::path::Path;
use tantivy::collector::TopCollector;
use tantivy::query::QueryParser;
use tantivy::schema::*;
use tantivy::tokenizer::NgramTokenizer;
use tantivy::Index;
use tempdir::TempDir;
fn main() {
// Let's create a temporary directory for the
// sake of this example
if let Ok(dir) = TempDir::new("tantivy_token_example_dir") {
run_example(dir.path()).unwrap();
dir.close().unwrap();
}
}
fn run_example(index_path: &Path) -> tantivy::Result<()> {
// # Defining the schema
//
// The Tantivy index requires a very strict schema.
// The schema declares which fields are in the index,
// and for each field, its type and "the way it should
// be indexed".
// first we need to define a schema ...
let mut schema_builder = SchemaBuilder::default();
// Our first field is title.
// In this example we want to use NGram searching:
// we will set it to 3 characters, so any three-character
// sequence in the title should be findable.
let text_field_indexing = TextFieldIndexing::default()
.set_tokenizer("ngram3")
.set_index_option(IndexRecordOption::WithFreqsAndPositions);
let text_options = TextOptions::default()
.set_indexing_options(text_field_indexing)
.set_stored();
schema_builder.add_text_field("title", text_options);
// Our second field is body.
// We want full-text search for it, but we do not
// need to be able to retrieve it
// for our application.
//
// We can make our index lighter
// by omitting the `STORED` flag.
schema_builder.add_text_field("body", TEXT);
let schema = schema_builder.build();
// # Indexing documents
//
// Let's create a brand new index.
//
// This will actually just save a meta.json
// with our schema in the directory.
let index = Index::create(index_path, schema.clone())?;
// here we are registering our custom tokenizer
// this will store tokens of 3 characters each
index
.tokenizers()
.register("ngram3", NgramTokenizer::new(3, 3, false));
// To insert document we need an index writer.
// There must be only one writer at a time.
// This single `IndexWriter` is already
// multithreaded.
//
// Here we use a buffer of 50MB per thread. Using a bigger
// heap for the indexer can increase its throughput.
let mut index_writer = index.writer(50_000_000)?;
// Let's index our documents!
// We first need a handle on the title and the body field.
// ### Create a document "manually".
//
// We can create a document manually, by setting the fields
// one by one in a Document object.
let title = schema.get_field("title").unwrap();
let body = schema.get_field("body").unwrap();
let mut old_man_doc = Document::default();
old_man_doc.add_text(title, "The Old Man and the Sea");
old_man_doc.add_text(
body,
"He was an old man who fished alone in a skiff in the Gulf Stream and \
he had gone eighty-four days now without taking a fish.",
);
// ... and add it to the `IndexWriter`.
index_writer.add_document(old_man_doc);
// ### Create a document directly from json.
//
// Alternatively, we can use our schema to parse a
// document object directly from json.
// The document is a string, but we use the `json` macro
// from `serde_json` for the convenience of multi-line support.
let json = json!({
"title": "Of Mice and Men",
"body": "A few miles south of Soledad, the Salinas River drops in close to the hillside \
bank and runs deep and green. The water is warm too, for it has slipped twinkling \
over the yellow sands in the sunlight before reaching the narrow pool. On one \
side of the river the golden foothill slopes curve up to the strong and rocky \
Gabilan Mountains, but on the valley side the water is lined with trees—willows \
fresh and green with every spring, carrying in their lower leaf junctures the \
debris of the winters flooding; and sycamores with mottled, white, recumbent \
limbs and branches that arch over the pool"
});
let mice_and_men_doc = schema.parse_document(&json.to_string())?;
index_writer.add_document(mice_and_men_doc);
// Multi-valued fields are allowed; they are
// expressed in JSON by an array.
// The following document has two titles.
let json = json!({
"title": ["Frankenstein", "The Modern Prometheus"],
"body": "You will rejoice to hear that no disaster has accompanied the commencement of an \
enterprise which you have regarded with such evil forebodings. I arrived here \
yesterday, and my first task is to assure my dear sister of my welfare and \
increasing confidence in the success of my undertaking."
});
let frankenstein_doc = schema.parse_document(&json.to_string())?;
index_writer.add_document(frankenstein_doc);
// This is an example, so we will only index 3 documents
// here. You can check out tantivy's tutorial to index
// the English wikipedia. Tantivy's indexing is rather fast.
// Indexing 5 million articles of the English wikipedia takes
// around 4 minutes on my computer!
// ### Committing
//
// At this point our documents are not searchable.
//
//
// We need to call .commit() explicitly to force the
// index_writer to finish processing the documents in the queue,
// flush the current index to the disk, and advertise
// the existence of new documents.
//
// This call is blocking.
index_writer.commit()?;
// If `.commit()` returns correctly, then all of the
// documents that have been added are guaranteed to be
// persistently indexed.
//
// In the scenario of a crash or a power failure,
// tantivy behaves as if it had rolled back to its last
// commit.
// # Searching
//
// Let's search our index. Start by reloading
// searchers in the index. This should be done
// after every commit().
index.load_searchers()?;
// Afterwards create one (or more) searchers.
//
// You should create a searcher
// every time you start a "search query".
let searcher = index.searcher();
// The query parser can interpret human queries.
// Here, if the user does not specify which
// field they want to search, tantivy will search
// in both title and body.
let query_parser = QueryParser::for_index(&index, vec![title, body]);
// here we want to get a hit on the 'ken' in Frankenstein
let query = query_parser.parse_query("ken")?;
// A query defines a set of documents, as
// well as the way they should be scored.
//
// A query created by the query parser is scored according
// to a metric called Tf-Idf, and will consider
// any document matching at least one of our terms.
// ### Collectors
//
// We are not interested in all of the documents but
// only in the top 10. Keeping track of our top 10 best documents
// is the role of the TopCollector.
let mut top_collector = TopCollector::with_limit(10);
// We can now perform our query.
searcher.search(&*query, &mut top_collector)?;
// Our top collector now contains the 10
// most relevant doc ids...
let doc_addresses = top_collector.docs();
// The actual documents still need to be
// retrieved from Tantivy's store.
//
// Since the body field was not configured as stored,
// the document returned will only contain
// a title.
for doc_address in doc_addresses {
let retrieved_doc = searcher.doc(&doc_address)?;
println!("{}", schema.to_json(&retrieved_doc));
}
// Wait for indexing and merging threads to shut down.
// Usually this isn't needed, but in `main` we try to
// delete the temporary directory and that fails on
// Windows if the files are still open.
index_writer.wait_merging_threads()?;
Ok(())
}

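Why the example's `ken` query hits "Frankenstein": a (3, 3) ngram tokenizer emits every 3-character window of the title. A self-contained illustration in plain Rust; `char_ngrams` is a hypothetical helper, not the tantivy tokenizer API:

fn char_ngrams(text: &str, n: usize) -> Vec<String> {
    let chars: Vec<char> = text.chars().collect();
    if chars.len() < n {
        return Vec::new();
    }
    // One n-character window per starting position.
    (0..=chars.len() - n)
        .map(|i| chars[i..i + n].iter().collect())
        .collect()
}

fn main() {
    let grams = char_ngrams("Frankenstein", 3);
    // ["Fra", "ran", "ank", "nke", "ken", ...]
    assert!(grams.contains(&"ken".to_string()));
}
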
View File

@@ -5,11 +5,11 @@ extern crate tempdir;
extern crate serde_json;
use std::path::Path;
use tempdir::TempDir;
use tantivy::Index;
use tantivy::schema::*;
use tantivy::collector::TopCollector;
use tantivy::query::QueryParser;
use tantivy::schema::*;
use tantivy::Index;
use tempdir::TempDir;
fn main() {
// Let's create a temporary directory for the

View File

@@ -1,27 +1,39 @@
use Result;
use collector::Collector;
use DocId;
use Result;
use Score;
use SegmentLocalId;
use SegmentReader;
use DocId;
use Score;
use collector::SegmentCollector;
use collector::CollectorWrapper;
/// Collector that does nothing.
/// This is used in the chain Collector and will hopefully
/// be optimized away by the compiler.
pub struct DoNothingCollector;
impl Collector for DoNothingCollector {
type Child = DoNothingCollector;
#[inline]
fn set_segment(&mut self, _: SegmentLocalId, _: &SegmentReader) -> Result<()> {
Ok(())
fn for_segment(&mut self, _: SegmentLocalId, _: &SegmentReader) -> Result<DoNothingCollector> {
Ok(DoNothingCollector)
}
#[inline]
fn collect(&mut self, _doc: DocId, _score: Score) {}
#[inline]
fn requires_scoring(&self) -> bool {
false
}
}
impl SegmentCollector for DoNothingCollector {
type CollectionResult = ();
#[inline]
fn collect(&mut self, _doc: DocId, _score: Score) {}
fn finalize(self) -> () {
()
}
}
/// Zero-cost abstraction used to collect on multiple collectors.
/// This contraption is only usable if the types of your collectors
/// are known at compile time.
@@ -30,34 +42,49 @@ pub struct ChainedCollector<Left: Collector, Right: Collector> {
right: Right,
}
pub struct ChainedSegmentCollector<Left: SegmentCollector, Right: SegmentCollector> {
left: Left,
right: Right,
}
impl<Left: Collector, Right: Collector> ChainedCollector<Left, Right> {
/// Adds a collector
pub fn push<C: Collector>(self, new_collector: &mut C) -> ChainedCollector<Self, &mut C> {
pub fn push<'a, C: Collector>(self, new_collector: &'a mut C) -> ChainedCollector<Self, CollectorWrapper<'a, C>> {
ChainedCollector {
left: self,
right: new_collector,
right: CollectorWrapper::new(new_collector),
}
}
}
impl<Left: Collector, Right: Collector> Collector for ChainedCollector<Left, Right> {
fn set_segment(
type Child = ChainedSegmentCollector<Left::Child, Right::Child>;
fn for_segment(
&mut self,
segment_local_id: SegmentLocalId,
segment: &SegmentReader,
) -> Result<()> {
self.left.set_segment(segment_local_id, segment)?;
self.right.set_segment(segment_local_id, segment)?;
Ok(())
) -> Result<Self::Child> {
Ok(ChainedSegmentCollector {
left: self.left.for_segment(segment_local_id, segment)?,
right: self.right.for_segment(segment_local_id, segment)?,
})
}
fn requires_scoring(&self) -> bool {
self.left.requires_scoring() || self.right.requires_scoring()
}
}
impl<Left: SegmentCollector, Right: SegmentCollector> SegmentCollector for ChainedSegmentCollector<Left, Right> {
type CollectionResult = (Left::CollectionResult, Right::CollectionResult);
fn collect(&mut self, doc: DocId, score: Score) {
self.left.collect(doc, score);
self.right.collect(doc, score);
}
fn requires_scoring(&self) -> bool {
self.left.requires_scoring() || self.right.requires_scoring()
fn finalize(self) -> Self::CollectionResult {
(self.left.finalize(), self.right.finalize())
}
}
@@ -71,19 +98,35 @@ pub fn chain() -> ChainedCollector<DoNothingCollector, DoNothingCollector> {
#[cfg(test)]
mod tests {
use super::*;
use collector::{Collector, CountCollector, TopCollector};
use collector::{CountCollector, SegmentCollector, TopCollector};
use schema::SchemaBuilder;
use Index;
use Document;
#[test]
fn test_chained_collector() {
let schema_builder = SchemaBuilder::new();
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer(3_000_000).unwrap();
let doc = Document::new();
index_writer.add_document(doc);
index_writer.commit().unwrap();
index.load_searchers().unwrap();
let searcher = index.searcher();
let segment_readers = searcher.segment_readers();
let mut top_collector = TopCollector::with_limit(2);
let mut count_collector = CountCollector::default();
{
let mut collectors = chain().push(&mut top_collector).push(&mut count_collector);
collectors.collect(1, 0.2);
collectors.collect(2, 0.1);
collectors.collect(3, 0.5);
let mut segment_collector = collectors.for_segment(0, &segment_readers[0]).unwrap();
segment_collector.collect(1, 0.2);
segment_collector.collect(2, 0.1);
segment_collector.collect(3, 0.5);
collectors.merge_children(vec![segment_collector]);
}
assert_eq!(count_collector.count(), 3);
assert!(top_collector.at_capacity());

View File

@@ -1,9 +1,11 @@
use super::Collector;
use DocId;
use Score;
use Result;
use SegmentReader;
use Score;
use SegmentLocalId;
use SegmentReader;
use collector::SegmentCollector;
use collector::Combinable;
/// `CountCollector` collector only counts how many
/// documents match the query.
@@ -21,12 +23,10 @@ impl CountCollector {
}
impl Collector for CountCollector {
fn set_segment(&mut self, _: SegmentLocalId, _: &SegmentReader) -> Result<()> {
Ok(())
}
type Child = CountCollector;
fn collect(&mut self, _: DocId, _: Score) {
self.count += 1;
fn for_segment(&mut self, _: SegmentLocalId, _: &SegmentReader) -> Result<CountCollector> {
Ok(CountCollector::default())
}
fn requires_scoring(&self) -> bool {
@@ -34,10 +34,28 @@ impl Collector for CountCollector {
}
}
impl Combinable for CountCollector {
fn combine_into(&mut self, other: Self) {
self.count += other.count;
}
}
impl SegmentCollector for CountCollector {
type CollectionResult = CountCollector;
fn collect(&mut self, _: DocId, _: Score) {
self.count += 1;
}
fn finalize(self) -> CountCollector {
self
}
}
#[cfg(test)]
mod tests {
use collector::{Collector, CountCollector};
use collector::{Collector, CountCollector, SegmentCollector};
#[test]
fn test_count_collector() {

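The `Combinable` impl above is what lets one count per segment become a single total. A self-contained sketch of that fold, with toy types standing in for tantivy's:

trait Combinable {
    fn combine_into(&mut self, other: Self);
}

struct Count(usize);

impl Combinable for Count {
    fn combine_into(&mut self, other: Self) {
        // Mirrors CountCollector::combine_into above.
        self.0 += other.0;
    }
}

fn main() {
    // One partial count per segment, folded into a single total,
    // as the new Collector::search default method does with fold1.
    let per_segment = vec![Count(3), Count(2)];
    let total = per_segment.into_iter().fold(None::<Count>, |acc, c| match acc {
        None => Some(c),
        Some(mut a) => {
            a.combine_into(c);
            Some(a)
        }
    });
    assert_eq!(total.map(|c| c.0), Some(5));
}
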
View File

@@ -1,27 +1,24 @@
use std::mem;
use collector::Collector;
use docset::SkipResult;
use fastfield::FacetReader;
use schema::Field;
use std::cell::UnsafeCell;
use schema::Facet;
use schema::Field;
use std::collections::btree_map;
use std::collections::BTreeMap;
use std::collections::BTreeSet;
use std::collections::BinaryHeap;
use std::collections::Bound;
use termdict::TermDictionary;
use termdict::TermStreamer;
use termdict::TermStreamerBuilder;
use std::collections::BTreeSet;
use termdict::TermMerger;
use docset::SkipResult;
use std::{usize, u64};
use std::iter::Peekable;
use std::{u64, usize};
use termdict::TermMerger;
use std::cmp::Ordering;
use DocId;
use Result;
use Score;
use SegmentReader;
use SegmentLocalId;
use std::cmp::Ordering;
use SegmentReader;
use collector::SegmentCollector;
struct Hit<'a> {
count: u64,
@@ -196,19 +193,22 @@ fn facet_depth(facet_bytes: &[u8]) -> usize {
/// }
/// ```
pub struct FacetCollector {
facet_ords: Vec<u64>,
field: Field,
ff_reader: Option<UnsafeCell<FacetReader>>,
segment_counters: Vec<SegmentFacetCounter>,
facets: BTreeSet<Facet>,
}
pub struct FacetSegmentCollector {
reader: FacetReader,
facet_ords_buf: Vec<u64>,
// facet_ord -> collapse facet_id
current_segment_collapse_mapping: Vec<usize>,
collapse_mapping: Vec<usize>,
// collapse facet_id -> count
current_segment_counts: Vec<u64>,
counts: Vec<u64>,
// collapse facet_id -> facet_ord
current_collapse_facet_ords: Vec<u64>,
facets: BTreeSet<Facet>,
collapse_facet_ords: Vec<u64>,
}
fn skip<'a, I: Iterator<Item = &'a Facet>>(
@@ -242,15 +242,9 @@ impl FacetCollector {
/// is of the proper type.
pub fn for_field(field: Field) -> FacetCollector {
FacetCollector {
facet_ords: Vec::with_capacity(255),
segment_counters: Vec::new(),
field,
ff_reader: None,
facets: BTreeSet::new(),
current_segment_collapse_mapping: Vec::new(),
current_collapse_facet_ords: Vec::new(),
current_segment_counts: Vec::new(),
}
}
@@ -281,69 +275,11 @@ impl FacetCollector {
self.facets.insert(facet);
}
fn set_collapse_mapping(&mut self, facet_reader: &FacetReader) {
self.current_segment_collapse_mapping.clear();
self.current_collapse_facet_ords.clear();
self.current_segment_counts.clear();
let mut collapse_facet_it = self.facets.iter().peekable();
self.current_collapse_facet_ords.push(0);
let mut facet_streamer = facet_reader.facet_dict().range().into_stream();
if !facet_streamer.advance() {
return;
}
'outer: loop {
// at the beginning of this loop, facet_streamer
// is positioned on a term that has not been processed yet.
let skip_result = skip(facet_streamer.key(), &mut collapse_facet_it);
match skip_result {
SkipResult::Reached => {
// we reach a facet we decided to collapse.
let collapse_depth = facet_depth(facet_streamer.key());
let mut collapsed_id = 0;
self.current_segment_collapse_mapping.push(0);
while facet_streamer.advance() {
let depth = facet_depth(facet_streamer.key());
if depth <= collapse_depth {
continue 'outer;
}
if depth == collapse_depth + 1 {
collapsed_id = self.current_collapse_facet_ords.len();
self.current_collapse_facet_ords
.push(facet_streamer.term_ord());
self.current_segment_collapse_mapping.push(collapsed_id);
} else {
self.current_segment_collapse_mapping.push(collapsed_id);
}
}
break;
}
SkipResult::End | SkipResult::OverStep => {
self.current_segment_collapse_mapping.push(0);
if !facet_streamer.advance() {
break;
}
}
}
}
}
fn finalize_segment(&mut self) {
if self.ff_reader.is_some() {
self.segment_counters.push(SegmentFacetCounter {
facet_reader: self.ff_reader.take().unwrap().into_inner(),
facet_ords: mem::replace(&mut self.current_collapse_facet_ords, Vec::new()),
facet_counts: mem::replace(&mut self.current_segment_counts, Vec::new()),
});
}
}
/// Returns the results of the collection.
///
/// This method does not just return the counters,
/// it also translates the facet ordinals of the last segment.
pub fn harvest(mut self) -> FacetCounts {
self.finalize_segment();
pub fn harvest(self) -> FacetCounts {
let collapsed_facet_ords: Vec<&[u64]> = self.segment_counters
.iter()
.map(|segment_counter| &segment_counter.facet_ords[..])
@@ -381,38 +317,102 @@ impl FacetCollector {
})
.sum();
if count > 0u64 {
let bytes = facet_merger.key().to_owned();
facet_counts.insert(Facet::from_encoded(bytes), count);
let bytes: Vec<u8> = facet_merger.key().to_owned();
// may create a corrupted facet if the term dictionary is corrupted
let facet = unsafe { Facet::from_encoded(bytes) };
facet_counts.insert(facet, count);
}
}
FacetCounts { facet_counts }
}
}
impl FacetSegmentCollector {
fn into_segment_facet_counter(self) -> SegmentFacetCounter {
SegmentFacetCounter {
facet_reader: self.reader,
facet_ords: self.collapse_facet_ords,
facet_counts: self.counts,
}
}
}
impl Collector for FacetCollector {
fn set_segment(&mut self, _: SegmentLocalId, reader: &SegmentReader) -> Result<()> {
self.finalize_segment();
type Child = FacetSegmentCollector;
fn for_segment(&mut self, _: SegmentLocalId, reader: &SegmentReader) -> Result<FacetSegmentCollector> {
let facet_reader = reader.facet_reader(self.field)?;
self.set_collapse_mapping(&facet_reader);
self.current_segment_counts
.resize(self.current_collapse_facet_ords.len(), 0);
self.ff_reader = Some(UnsafeCell::new(facet_reader));
Ok(())
let mut collapse_mapping = Vec::new();
let mut counts = Vec::new();
let mut collapse_facet_ords = Vec::new();
let mut collapse_facet_it = self.facets.iter().peekable();
collapse_facet_ords.push(0);
{
let mut facet_streamer = facet_reader.facet_dict().range().into_stream();
if facet_streamer.advance() {
'outer: loop {
// at the beginning of this loop, facet_streamer
// is positioned on a term that has not been processed yet.
let skip_result = skip(facet_streamer.key(), &mut collapse_facet_it);
match skip_result {
SkipResult::Reached => {
// we reach a facet we decided to collapse.
let collapse_depth = facet_depth(facet_streamer.key());
let mut collapsed_id = 0;
collapse_mapping.push(0);
while facet_streamer.advance() {
let depth = facet_depth(facet_streamer.key());
if depth <= collapse_depth {
continue 'outer;
}
if depth == collapse_depth + 1 {
collapsed_id = collapse_facet_ords.len();
collapse_facet_ords.push(facet_streamer.term_ord());
collapse_mapping.push(collapsed_id);
} else {
collapse_mapping.push(collapsed_id);
}
}
break;
}
SkipResult::End | SkipResult::OverStep => {
collapse_mapping.push(0);
if !facet_streamer.advance() {
break;
}
}
}
}
}
}
counts.resize(collapse_facet_ords.len(), 0);
Ok(FacetSegmentCollector {
reader: facet_reader,
facet_ords_buf: Vec::with_capacity(255),
collapse_mapping,
counts,
collapse_facet_ords,
})
}
fn requires_scoring(&self) -> bool {
false
}
}
impl SegmentCollector for FacetSegmentCollector {
type CollectionResult = Vec<SegmentFacetCounter>;
fn collect(&mut self, doc: DocId, _: Score) {
let facet_reader: &mut FacetReader = unsafe {
&mut *self.ff_reader
.as_ref()
.expect("collect() was called before set_segment. This should never happen.")
.get()
};
facet_reader.facet_ords(doc, &mut self.facet_ords);
self.reader.facet_ords(doc, &mut self.facet_ords_buf);
let mut previous_collapsed_ord: usize = usize::MAX;
for &facet_ord in &self.facet_ords {
let collapsed_ord = self.current_segment_collapse_mapping[facet_ord as usize];
self.current_segment_counts[collapsed_ord] += if collapsed_ord == previous_collapsed_ord
{
for &facet_ord in &self.facet_ords_buf {
let collapsed_ord = self.collapse_mapping[facet_ord as usize];
self.counts[collapsed_ord] += if collapsed_ord == previous_collapsed_ord {
0
} else {
1
@@ -421,8 +421,8 @@ impl Collector for FacetCollector {
}
}
fn requires_scoring(&self) -> bool {
false
fn finalize(self) -> Vec<SegmentFacetCounter> {
vec![self.into_segment_facet_counter()]
}
}
@@ -432,9 +432,20 @@ pub struct FacetCounts {
facet_counts: BTreeMap<Facet, u64>,
}
pub struct FacetChildIterator<'a> {
underlying: btree_map::Range<'a, Facet, u64>,
}
impl<'a> Iterator for FacetChildIterator<'a> {
type Item = (&'a Facet, u64);
fn next(&mut self) -> Option<Self::Item> {
self.underlying.next().map(|(facet, count)| (facet, *count))
}
}
impl FacetCounts {
#[allow(needless_lifetimes)] //< compiler fails if we remove the lifetime
pub fn get<'a, T>(&'a self, facet_from: T) -> impl Iterator<Item = (&'a Facet, u64)>
pub fn get<T>(&self, facet_from: T) -> FacetChildIterator
where
Facet: From<T>,
{
@@ -443,15 +454,13 @@ impl FacetCounts {
let right_bound = if facet.is_root() {
Bound::Unbounded
} else {
let mut facet_after_bytes = facet.encoded_bytes().to_owned();
let mut facet_after_bytes: Vec<u8> = facet.encoded_bytes().to_owned();
facet_after_bytes.push(1u8);
let facet_after = Facet::from_encoded(facet_after_bytes);
let facet_after = unsafe { Facet::from_encoded(facet_after_bytes) }; // ok logic
Bound::Excluded(facet_after)
};
self.facet_counts
.range((left_bound, right_bound))
.map(|(facet, count)| (facet, *count))
let underlying: btree_map::Range<_, _> = self.facet_counts.range((left_bound, right_bound));
FacetChildIterator { underlying }
}
pub fn top_k<T>(&self, facet: T, k: usize) -> Vec<(&Facet, u64)>
@@ -483,14 +492,13 @@ impl FacetCounts {
#[cfg(test)]
mod tests {
use test::Bencher;
use core::Index;
use schema::{Document, Facet, SchemaBuilder};
use query::AllQuery;
use super::{FacetCollector, FacetCounts};
use std::iter;
use schema::Field;
use core::Index;
use query::AllQuery;
use rand::{thread_rng, Rng};
use schema::Field;
use schema::{Document, Facet, SchemaBuilder};
use std::iter;
#[test]
fn test_facet_collector_drilldown() {
@@ -545,8 +553,10 @@ mod tests {
}
#[test]
#[should_panic(expected = "Tried to add a facet which is a descendant of \
an already added facet.")]
#[should_panic(
expected = "Tried to add a facet which is a descendant of \
an already added facet."
)]
fn test_misused_facet_collector() {
let mut facet_collector = FacetCollector::for_field(Field(0));
facet_collector.add_facet(Facet::from("/country"));
@@ -604,6 +614,19 @@ mod tests {
}
}
}
#[cfg(all(test, feature = "unstable"))]
mod bench {
use collector::FacetCollector;
use query::AllQuery;
use rand::{thread_rng, Rng};
use schema::Facet;
use schema::SchemaBuilder;
use test::Bencher;
use Index;
#[bench]
fn bench_facet_collector(b: &mut Bencher) {
let mut schema_builder = SchemaBuilder::new();

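The collapse-mapping loop above boils down to this per-segment idea: every facet below the requested root is counted under its ancestor one level beneath that root. A standalone illustration with plain strings instead of tantivy's encoded facet ordinals; `collapse` is a hypothetical helper:

use std::collections::BTreeMap;

fn collapse(facet: &str, root: &str) -> Option<String> {
    let rest = facet.strip_prefix(root)?.strip_prefix('/')?;
    let child = rest.split('/').next()?;
    Some(format!("{}/{}", root, child))
}

fn main() {
    let docs = ["/lang/rust/tokio", "/lang/rust/serde", "/lang/go/net"];
    let mut counts: BTreeMap<String, u64> = BTreeMap::new();
    for facet in &docs {
        if let Some(child) = collapse(facet, "/lang") {
            *counts.entry(child).or_insert(0) += 1;
        }
    }
    // {"/lang/go": 1, "/lang/rust": 2}
    println!("{:?}", counts);
}
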
View File

@@ -2,17 +2,20 @@
Defines how the documents matching a search query should be processed.
*/
use SegmentReader;
use SegmentLocalId;
use DocId;
use Score;
use Result;
use Score;
use SegmentLocalId;
use SegmentReader;
use query::Query;
use Searcher;
use downcast;
mod count_collector;
pub use self::count_collector::CountCollector;
mod multi_collector;
pub use self::multi_collector::MultiCollector;
//mod multi_collector;
//pub use self::multi_collector::MultiCollector;
mod top_collector;
pub use self::top_collector::TopCollector;
@@ -53,31 +56,90 @@ pub use self::chained_collector::chain;
///
/// Segments are not guaranteed to be visited in any specific order.
pub trait Collector {
type Child : SegmentCollector + 'static;
/// `set_segment` is called before beginning to enumerate
/// on this segment.
fn set_segment(
fn for_segment(
&mut self,
segment_local_id: SegmentLocalId,
segment: &SegmentReader,
) -> Result<()>;
/// The query pushes the scored document to the collector via this method.
fn collect(&mut self, doc: DocId, score: Score);
) -> Result<Self::Child>;
/// Returns true iff the collector requires document scores to be computed.
fn requires_scoring(&self) -> bool;
/// Search works as follows :
///
/// First the weight object associated to the query is created.
///
/// Then, the query loops over the segments and for each segment :
/// - setup the collector and informs it that the segment being processed has changed.
/// - creates a SegmentCollector for collecting documents associated to the segment
/// - creates a `Scorer` object associated for this segment
/// - iterate through the matched documents and push them to the segment collector.
/// - turn the segment collector into a Combinable segment result
///
/// Combining all of the segment results gives a single Child::CollectionResult, which is returned.
///
/// The result is `Ok(None)` when the searcher has no segments.
fn search(&mut self, searcher: &Searcher, query: &Query) -> Result<Option<<Self::Child as SegmentCollector>::CollectionResult>> {
let scoring_enabled = self.requires_scoring();
let weight = query.weight(searcher, scoring_enabled)?;
let mut results = Vec::new();
for (segment_ord, segment_reader) in searcher.segment_readers().iter().enumerate() {
let mut child: Self::Child = self.for_segment(segment_ord as SegmentLocalId, segment_reader)?;
let mut scorer = weight.scorer(segment_reader)?;
scorer.collect(&mut child, segment_reader.delete_bitset());
results.push(child.finalize());
}
Ok(results.into_iter().fold1(|mut x, y| {
x.combine_into(y);
x
}))
}
}
pub trait Combinable {
fn combine_into(&mut self, other: Self);
}
impl Combinable for () {
fn combine_into(&mut self, _other: Self) {}
}
impl<T> Combinable for Vec<T> {
fn combine_into(&mut self, other: Self) {
self.extend(other.into_iter());
}
}
impl<L: Combinable, R: Combinable> Combinable for (L, R) {
fn combine_into(&mut self, other: Self) {
self.0.combine_into(other.0);
self.1.combine_into(other.1);
}
}
pub trait SegmentCollector: downcast::Any + 'static {
type CollectionResult: Combinable + downcast::Any + 'static;
/// The query pushes the scored document to the collector via this method.
fn collect(&mut self, doc: DocId, score: Score);
/// Turn into the final result
fn finalize(self) -> Self::CollectionResult;
}
impl<'a, C: Collector> Collector for &'a mut C {
fn set_segment(
&mut self,
type Child = C::Child;
fn for_segment(
&mut self, // TODO Ask Jason : why &mut self here!?
segment_local_id: SegmentLocalId,
segment: &SegmentReader,
) -> Result<()> {
(*self).set_segment(segment_local_id, segment)
}
/// The query pushes the scored document to the collector via this method.
fn collect(&mut self, doc: DocId, score: Score) {
C::collect(self, doc, score)
) -> Result<C::Child> {
(*self).for_segment(segment_local_id, segment)
}
fn requires_scoring(&self) -> bool {
@@ -85,25 +147,85 @@ impl<'a, C: Collector> Collector for &'a mut C {
}
}
pub struct CollectorWrapper<'a, TCollector: 'a + Collector>(&'a mut TCollector);
impl<'a, T: 'a + Collector> CollectorWrapper<'a, T> {
pub fn new(collector: &'a mut T) -> CollectorWrapper<'a, T> {
CollectorWrapper(collector)
}
}
impl<'a, T: 'a + Collector> Collector for CollectorWrapper<'a, T> {
type Child = T::Child;
fn for_segment(&mut self, segment_local_id: u32, segment: &SegmentReader) -> Result<T::Child> {
self.0.for_segment(segment_local_id, segment)
}
fn requires_scoring(&self) -> bool {
self.0.requires_scoring()
}
}
trait UntypedCollector {
fn for_segment(&mut self, segment_local_id: u32, segment: &SegmentReader) -> Result<Box<UntypedSegmentCollector>>;
}
impl<'a, TCollector:'a + Collector> UntypedCollector for CollectorWrapper<'a, TCollector> {
fn for_segment(&mut self, segment_local_id: u32, segment: &SegmentReader) -> Result<Box<UntypedSegmentCollector>> {
let segment_collector = self.0.for_segment(segment_local_id, segment)?;
Ok(Box::new(segment_collector))
}
}
trait UntypedSegmentCollector {
fn finalize(self) -> Box<UntypedCombinable>;
}
trait UntypedCombinable {
fn combine_into(&mut self, other: Box<UntypedCombinable>);
}
pub struct CombinableWrapper<'a, T: 'a + Combinable>(&'a mut T);
impl<'a, T: 'a + Combinable> CombinableWrapper<'a, T> {
pub fn new(combinable: &'a mut T) -> CombinableWrapper<'a, T> {
CombinableWrapper(combinable)
}
}
impl<'a, T: 'a + Combinable> Combinable for CombinableWrapper<'a, T> {
fn combine_into(&mut self, other: Self) {
self.0.combine_into(*::downcast::Downcast::<T>::downcast(other).unwrap())
}
}
#[cfg(test)]
pub mod tests {
use super::*;
use test::Bencher;
use DocId;
use Score;
use core::SegmentReader;
use SegmentLocalId;
use fastfield::BytesFastFieldReader;
use fastfield::FastFieldReader;
use schema::Field;
use DocId;
use Score;
use SegmentLocalId;
/// Stores all of the doc ids.
/// This collector is only used for tests.
/// It is unusable in practice, as it does not store
/// the segment ordinals
pub struct TestCollector {
next_offset: DocId,
docs: Vec<DocId>,
scores: Vec<Score>,
}
pub struct TestSegmentCollector {
offset: DocId,
segment_max_doc: DocId,
docs: Vec<DocId>,
scores: Vec<Score>,
}
@@ -122,8 +244,7 @@ pub mod tests {
impl Default for TestCollector {
fn default() -> TestCollector {
TestCollector {
offset: 0,
segment_max_doc: 0,
next_offset: 0,
docs: Vec::new(),
scores: Vec::new(),
}
@@ -131,19 +252,33 @@ pub mod tests {
}
impl Collector for TestCollector {
fn set_segment(&mut self, _: SegmentLocalId, reader: &SegmentReader) -> Result<()> {
self.offset += self.segment_max_doc;
self.segment_max_doc = reader.max_doc();
Ok(())
type Child = TestSegmentCollector;
fn for_segment(&mut self, _: SegmentLocalId, reader: &SegmentReader) -> Result<TestSegmentCollector> {
let offset = self.next_offset;
self.next_offset += reader.max_doc();
Ok(TestSegmentCollector {
offset,
docs: Vec::new(),
scores: Vec::new(),
})
}
fn requires_scoring(&self) -> bool {
true
}
}
impl SegmentCollector for TestSegmentCollector {
type CollectionResult = Vec<TestSegmentCollector>;
fn collect(&mut self, doc: DocId, score: Score) {
self.docs.push(doc + self.offset);
self.scores.push(score);
}
fn requires_scoring(&self) -> bool {
true
fn finalize(self) -> Vec<TestSegmentCollector> {
vec![self]
}
}
@@ -152,17 +287,26 @@ pub mod tests {
///
/// This collector is mainly useful for tests.
pub struct FastFieldTestCollector {
vals: Vec<u64>,
next_counter: usize,
field: Field,
ff_reader: Option<FastFieldReader<u64>>,
}
#[derive(Default)]
pub struct FastFieldSegmentCollectorState {
counter: usize,
vals: Vec<u64>,
}
pub struct FastFieldSegmentCollector {
state: FastFieldSegmentCollectorState,
reader: FastFieldReader<u64>,
}
impl FastFieldTestCollector {
pub fn for_field(field: Field) -> FastFieldTestCollector {
FastFieldTestCollector {
vals: Vec::new(),
next_counter: 0,
field,
ff_reader: None,
}
}
@@ -172,20 +316,96 @@ pub mod tests {
}
impl Collector for FastFieldTestCollector {
fn set_segment(&mut self, _: SegmentLocalId, reader: &SegmentReader) -> Result<()> {
self.ff_reader = Some(reader.fast_field_reader(self.field)?);
Ok(())
type Child = FastFieldSegmentCollector;
fn for_segment(&mut self, _: SegmentLocalId, reader: &SegmentReader) -> Result<FastFieldSegmentCollector> {
let counter = self.next_counter;
self.next_counter += 1;
Ok(FastFieldSegmentCollector {
state: FastFieldSegmentCollectorState::default(),
reader: reader.fast_field_reader(self.field)?,
})
}
fn collect(&mut self, doc: DocId, _score: Score) {
let val = self.ff_reader.as_ref().unwrap().get(doc);
self.vals.push(val);
}
fn requires_scoring(&self) -> bool {
false
}
}
impl SegmentCollector for FastFieldSegmentCollector {
type CollectionResult = Vec<FastFieldSegmentCollectorState>;
fn collect(&mut self, doc: DocId, _score: Score) {
let val = self.reader.get(doc);
self.state.vals.push(val);
}
fn finalize(self) -> Vec<FastFieldSegmentCollectorState> {
vec![self.state]
}
}
/// Collects in order all of the fast field bytes for all of the
/// docs in the `DocSet`
///
/// This collector is mainly useful for tests.
pub struct BytesFastFieldTestCollector {
vals: Vec<u8>,
field: Field,
}
pub struct BytesFastFieldSegmentCollector {
vals: Vec<u8>,
reader: BytesFastFieldReader,
}
impl BytesFastFieldTestCollector {
pub fn for_field(field: Field) -> BytesFastFieldTestCollector {
BytesFastFieldTestCollector {
vals: Vec::new(),
field,
}
}
pub fn vals(self) -> Vec<u8> {
self.vals
}
}
impl Collector for BytesFastFieldTestCollector {
type Child = BytesFastFieldSegmentCollector;
fn for_segment(&mut self, _segment_local_id: u32, segment: &SegmentReader) -> Result<BytesFastFieldSegmentCollector> {
Ok(BytesFastFieldSegmentCollector {
vals: Vec::new(),
reader: segment.bytes_fast_field_reader(self.field)?,
})
}
fn requires_scoring(&self) -> bool {
false
}
}
impl SegmentCollector for BytesFastFieldSegmentCollector {
type CollectionResult = Vec<Vec<u8>>;
fn collect(&mut self, doc: u32, _score: f32) {
let val = self.reader.get_val(doc);
self.vals.extend(val);
}
fn finalize(self) -> Vec<Vec<u8>> {
vec![self.vals]
}
}
}
#[cfg(all(test, feature = "unstable"))]
mod bench {
use collector::{Collector, CountCollector};
use test::Bencher;
#[bench]
fn build_collector(b: &mut Bencher) {
b.iter(|| {

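To make the new contract concrete: a hedged sketch of a user-defined collector written against the traits in this diff (`for_segment`, `collect`, `finalize`, and the blanket `impl<T> Combinable for Vec<T>`). This branch is explicitly work in progress, so the released 0.6 API may differ:

use collector::{Collector, SegmentCollector};
use {DocId, Result, Score, SegmentLocalId, SegmentReader};

/// Sums scores per segment; the per-segment sums are concatenated
/// by Vec's Combinable impl and can be totaled by the caller.
struct ScoreSum;

struct ScoreSumSegment {
    sum: Score,
}

impl Collector for ScoreSum {
    type Child = ScoreSumSegment;

    fn for_segment(&mut self, _: SegmentLocalId, _: &SegmentReader) -> Result<ScoreSumSegment> {
        Ok(ScoreSumSegment { sum: 0.0 })
    }

    fn requires_scoring(&self) -> bool {
        true
    }
}

impl SegmentCollector for ScoreSumSegment {
    type CollectionResult = Vec<Score>;

    fn collect(&mut self, _doc: DocId, score: Score) {
        self.sum += score;
    }

    fn finalize(self) -> Vec<Score> {
        vec![self.sum]
    }
}
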
View File

@@ -1,67 +1,122 @@
use super::Collector;
use super::SegmentCollector;
use DocId;
use Score;
use Result;
use SegmentReader;
use SegmentLocalId;
use SegmentReader;
use downcast::Downcast;
/// Multicollector makes it possible to collect on more than one collector.
/// It should only be used in cases where the collectors' types are unknown
/// at compile time.
/// If the type of the collectors is known, you should prefer to use `ChainedCollector`.
pub struct MultiCollector<'a> {
collectors: Vec<&'a mut Collector>,
collector_wrappers: Vec<Box<UntypedCollector + 'a>>
}
impl<'a> MultiCollector<'a> {
/// Constructor
pub fn from(collectors: Vec<&'a mut Collector>) -> MultiCollector {
MultiCollector { collectors }
pub fn new() -> MultiCollector<'a> {
MultiCollector {
collector_wrappers: Vec::new()
}
}
pub fn add_collector<TCollector: 'a + Collector>(&mut self, collector: &'a mut TCollector) {
let collector_wrapper = CollectorWrapper::new(collector);
self.collector_wrappers.push(Box::new(collector_wrapper));
}
}
impl<'a> Collector for MultiCollector<'a> {
fn set_segment(
&mut self,
segment_local_id: SegmentLocalId,
segment: &SegmentReader,
) -> Result<()> {
for collector in &mut self.collectors {
collector.set_segment(segment_local_id, segment)?;
}
Ok(())
type Child = MultiCollectorChild;
fn for_segment(&mut self, segment_local_id: SegmentLocalId, segment: &SegmentReader) -> Result<MultiCollectorChild> {
let children = self.collector_wrappers
.iter_mut()
.map(|collector_wrapper| {
collector_wrapper.for_segment(segment_local_id, segment)
})
.collect::<Result<Vec<_>>>()?;
Ok(MultiCollectorChild {
children
})
}
fn collect(&mut self, doc: DocId, score: Score) {
for collector in &mut self.collectors {
collector.collect(doc, score);
fn requires_scoring(&self) -> bool {
self.collector_wrappers
.iter()
.any(|c| c.requires_scoring())
}
fn merge_children(&mut self, children: Vec<MultiCollectorChild>) {
let mut per_collector_children: Vec<Vec<Box<SegmentCollector>>> =
(0..self.collector_wrappers.len())
.map(|_| Vec::with_capacity(children.len()))
.collect::<Vec<_>>();
for child in children {
for (idx, segment_collector) in child.children.into_iter().enumerate() {
per_collector_children[idx].push(segment_collector);
}
}
for (collector, children) in self.collector_wrappers.iter_mut().zip(per_collector_children) {
collector.merge_children_anys(children);
}
}
fn requires_scoring(&self) -> bool {
self.collectors
.iter()
.any(|collector| collector.requires_scoring())
}
pub struct MultiCollectorChild {
children: Vec<Box<SegmentCollector>>
}
impl SegmentCollector for MultiCollectorChild {
fn collect(&mut self, doc: DocId, score: Score) {
for child in &mut self.children {
child.collect(doc, score);
}
}
}
#[cfg(test)]
mod tests {
use super::*;
use collector::{Collector, CountCollector, TopCollector};
use schema::{TEXT, SchemaBuilder};
use query::TermQuery;
use Index;
use Term;
use schema::IndexRecordOption;
#[test]
fn test_multi_collector() {
let mut schema_builder = SchemaBuilder::new();
let text = schema_builder.add_text_field("text", TEXT);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
{
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
index_writer.add_document(doc!(text=>"abc"));
index_writer.add_document(doc!(text=>"abc abc abc"));
index_writer.add_document(doc!(text=>"abc abc"));
index_writer.commit().unwrap();
index_writer.add_document(doc!(text=>""));
index_writer.add_document(doc!(text=>"abc abc abc abc"));
index_writer.add_document(doc!(text=>"abc"));
index_writer.commit().unwrap();
}
index.load_searchers().unwrap();
let searcher = index.searcher();
let term = Term::from_field_text(text, "abc");
let query = TermQuery::new(term, IndexRecordOption::Basic);
let mut top_collector = TopCollector::with_limit(2);
let mut count_collector = CountCollector::default();
{
let mut collectors =
MultiCollector::from(vec![&mut top_collector, &mut count_collector]);
collectors.collect(1, 0.2);
collectors.collect(2, 0.1);
collectors.collect(3, 0.5);
let mut collectors = MultiCollector::new();
collectors.add_collector(&mut top_collector);
collectors.add_collector(&mut count_collector);
collectors.search(&*searcher, &query).unwrap();
}
assert_eq!(count_collector.count(), 3);
assert!(top_collector.at_capacity());
assert_eq!(count_collector.count(), 5);
}
}

View File

@@ -1,12 +1,14 @@
use super::Collector;
use SegmentReader;
use SegmentLocalId;
use DocAddress;
use Result;
use std::collections::BinaryHeap;
use std::cmp::Ordering;
use std::collections::BinaryHeap;
use DocAddress;
use DocId;
use Result;
use Score;
use SegmentLocalId;
use SegmentReader;
use collector::SegmentCollector;
use collector::Combinable;
// Rust heap is a max-heap and we need a min heap.
#[derive(Clone, Copy)]
@@ -99,11 +101,34 @@ impl TopCollector {
}
impl Collector for TopCollector {
fn set_segment(&mut self, segment_id: SegmentLocalId, _: &SegmentReader) -> Result<()> {
self.segment_id = segment_id;
Ok(())
type Child = TopCollector;
fn for_segment(&mut self, segment_id: SegmentLocalId, _: &SegmentReader) -> Result<TopCollector> {
Ok(TopCollector {
limit: self.limit,
heap: BinaryHeap::new(),
segment_id,
})
}
fn requires_scoring(&self) -> bool {
true
}
}
impl Combinable for TopCollector {
// TODO: I think this could be a bit better
fn combine_into(&mut self, mut other: Self) {
self.segment_id = other.segment_id;
while let Some(doc) = other.heap.pop() {
self.collect(doc.doc_address.doc(), doc.score);
}
}
}
impl SegmentCollector for TopCollector {
type CollectionResult = TopCollector;
fn collect(&mut self, doc: DocId, score: Score) {
if self.at_capacity() {
// It's ok to unwrap as long as a limit of 0 is forbidden.
@@ -126,8 +151,8 @@ impl Collector for TopCollector {
}
}
fn requires_scoring(&self) -> bool {
true
fn finalize(self) -> TopCollector {
self
}
}
@@ -137,7 +162,6 @@ mod tests {
use super::*;
use DocId;
use Score;
use collector::Collector;
#[test]
fn test_top_collector_not_at_capacity() {

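On the comment above ("Rust heap is a max-heap and we need a min heap"): the diff inverts the ordering of its heap entries; std's `Reverse` wrapper shows the same trick in isolation. The smallest element of the current top-k sits at the root, so it is the first evicted when a better document arrives:

use std::cmp::Reverse;
use std::collections::BinaryHeap;

fn main() {
    // BinaryHeap is a max-heap; Reverse flips the ordering into a min-heap.
    let mut heap = BinaryHeap::new();
    for score in [3, 1, 4, 1, 5] {
        heap.push(Reverse(score));
    }
    // The smallest score comes out first.
    assert_eq!(heap.pop(), Some(Reverse(1)));
}
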
View File

@@ -1,6 +1,6 @@
use std::io::Write;
use std::io;
use common::serialize::BinarySerializable;
use std::io;
use std::io::Write;
use std::mem;
use std::ops::Deref;
use std::ptr;
@@ -106,7 +106,8 @@ where
addr + 8 <= data.len(),
"The fast field field should have been padded with 7 bytes."
);
let val_unshifted_unmasked: u64 = unsafe { ptr::read_unaligned(data[addr..].as_ptr() as *const u64) };
let val_unshifted_unmasked: u64 =
unsafe { ptr::read_unaligned(data[addr..].as_ptr() as *const u64) };
let val_shifted = (val_unshifted_unmasked >> bit_shift) as u64;
val_shifted & mask
} else {
@@ -141,7 +142,8 @@ where
for output_val in output.iter_mut() {
let addr = addr_in_bits >> 3;
let bit_shift = addr_in_bits & 7;
let val_unshifted_unmasked: u64 = unsafe { ptr::read_unaligned(data[addr..].as_ptr() as *const u64) };
let val_unshifted_unmasked: u64 =
unsafe { ptr::read_unaligned(data[addr..].as_ptr() as *const u64) };
let val_shifted = (val_unshifted_unmasked >> bit_shift) as u64;
*output_val = val_shifted & mask;
addr_in_bits += num_bits;

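A standalone sketch of the read pattern above: the 7 padding bytes guarantee the 8-byte unaligned load stays in bounds, and a shift plus mask extracts the packed value. Assumes a little-endian target, as the surrounding code effectively does:

use std::ptr;

fn get_packed(data: &[u8], idx: usize, num_bits: usize) -> u64 {
    let addr_in_bits = idx * num_bits;
    let addr = addr_in_bits >> 3; // whole bytes
    let bit_shift = addr_in_bits & 7; // leftover bits within that byte
    let mask = if num_bits == 64 { u64::MAX } else { (1u64 << num_bits) - 1 };
    assert!(addr + 8 <= data.len(), "buffer must be padded with 7 bytes");
    let word: u64 = unsafe { ptr::read_unaligned(data[addr..].as_ptr() as *const u64) };
    (word >> bit_shift) & mask
}

fn main() {
    // Two 5-bit values packed back to back: 21 (0b10101) then 10 (0b01010),
    // followed by the padding bytes the assert demands.
    let mut data = vec![0u8; 9];
    data[0] = 0b0101_0101;
    data[1] = 0b0000_0001;
    assert_eq!(get_packed(&data, 0, 5), 21);
    assert_eq!(get_packed(&data, 1, 5), 10);
}
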
View File

@@ -202,15 +202,14 @@ impl BitSet {
#[cfg(test)]
mod tests {
extern crate test;
use tests;
use std::collections::HashSet;
use super::BitSet;
use super::TinySet;
use tests::generate_nonunique_unsorted;
use std::collections::BTreeSet;
use query::BitSetDocSet;
use docset::DocSet;
use query::BitSetDocSet;
use std::collections::BTreeSet;
use std::collections::HashSet;
use tests;
use tests::generate_nonunique_unsorted;
#[test]
fn test_tiny_set() {
@@ -353,6 +352,14 @@ mod tests {
assert!(!bitset.contains(el));
}
}
}
#[cfg(all(test, feature = "unstable"))]
mod bench {
use super::BitSet;
use super::TinySet;
use test;
#[bench]
fn bench_tinyset_pop(b: &mut test::Bencher) {
@@ -385,5 +392,4 @@ mod tests {
fn bench_bitset_initialize(b: &mut test::Bencher) {
b.iter(|| BitSet::with_max_value(1_000_000));
}
}

View File

@@ -1,12 +1,12 @@
use std::io::Write;
use common::CountingWriter;
use std::collections::HashMap;
use schema::Field;
use common::VInt;
use directory::WritePtr;
use std::io::{self, Read};
use directory::ReadOnlySource;
use common::BinarySerializable;
use common::CountingWriter;
use common::VInt;
use directory::ReadOnlySource;
use directory::WritePtr;
use schema::Field;
use std::collections::HashMap;
use std::io::Write;
use std::io::{self, Read};
#[derive(Eq, PartialEq, Hash, Copy, Ord, PartialOrd, Clone, Debug)]
pub struct FileAddr {
@@ -30,10 +30,7 @@ impl BinarySerializable for FileAddr {
fn deserialize<R: Read>(reader: &mut R) -> io::Result<Self> {
let field = Field::deserialize(reader)?;
let idx = VInt::deserialize(reader)?.0 as usize;
Ok(FileAddr {
field,
idx,
})
Ok(FileAddr { field, idx })
}
}
@@ -166,7 +163,7 @@ impl CompositeFile {
/// to a given `Field` and stored in a `CompositeFile`.
pub fn open_read_with_idx(&self, field: Field, idx: usize) -> Option<ReadOnlySource> {
self.offsets_index
.get(&FileAddr { field, idx, })
.get(&FileAddr { field, idx })
.map(|&(from, to)| self.data.slice(from, to))
}
}
@@ -174,12 +171,12 @@ impl CompositeFile {
#[cfg(test)]
mod test {
use std::io::Write;
use super::{CompositeFile, CompositeWrite};
use common::BinarySerializable;
use common::VInt;
use directory::{Directory, RAMDirectory};
use schema::Field;
use common::VInt;
use common::BinarySerializable;
use std::io::Write;
use std::path::Path;
#[test]

View File

@@ -1,5 +1,5 @@
use std::io::Write;
use std::io;
use std::io::Write;
pub struct CountingWriter<W> {
underlying: W,

View File

@@ -1,16 +1,16 @@
mod serialize;
mod vint;
mod counting_writer;
mod composite_file;
pub mod bitpacker;
mod bitset;
mod composite_file;
mod counting_writer;
mod serialize;
mod vint;
pub(crate) use self::composite_file::{CompositeFile, CompositeWrite};
pub use self::serialize::{BinarySerializable, FixedSize};
pub use self::vint::VInt;
pub use self::counting_writer::CountingWriter;
pub use self::bitset::BitSet;
pub(crate) use self::bitset::TinySet;
pub(crate) use self::composite_file::{CompositeFile, CompositeWrite};
pub use self::counting_writer::CountingWriter;
pub use self::serialize::{BinarySerializable, FixedSize};
pub use self::vint::VInt;
pub use byteorder::LittleEndian as Endianness;
use std::io;
@@ -104,8 +104,8 @@ pub fn u64_to_i64(val: u64) -> i64 {
#[cfg(test)]
pub(crate) mod test {
use super::{compute_num_bits, i64_to_u64, u64_to_i64};
pub use super::serialize::test::fixed_size_test;
use super::{compute_num_bits, i64_to_u64, u64_to_i64};
fn test_i64_converter_helper(val: i64) {
assert_eq!(u64_to_i64(i64_to_u64(val)), val);
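The converters exercised by this round-trip test implement the usual order-preserving bijection between i64 and u64. A sketch of that mapping, assuming the standard sign-bit flip (which is consistent with the test above):

const HIGHEST_BIT: u64 = 1 << 63;

// Flipping the sign bit maps i64::MIN..=i64::MAX monotonically onto 0..=u64::MAX,
// so comparisons on the u64 side preserve the original i64 ordering.
fn i64_to_u64_sketch(val: i64) -> u64 {
    (val as u64) ^ HIGHEST_BIT
}

fn u64_to_i64_sketch(val: u64) -> i64 {
    (val ^ HIGHEST_BIT) as i64
}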

View File

@@ -1,10 +1,10 @@
use byteorder::{ReadBytesExt, WriteBytesExt};
use common::Endianness;
use std::fmt;
use std::io::Write;
use std::io::Read;
use std::io;
use common::VInt;
use std::fmt;
use std::io;
use std::io::Read;
use std::io::Write;
/// Trait for a simple binary serialization.
pub trait BinarySerializable: fmt::Debug + Sized {
@@ -135,8 +135,8 @@ impl BinarySerializable for String {
#[cfg(test)]
pub mod test {
use common::VInt;
use super::*;
use common::VInt;
pub fn fixed_size_test<O: BinarySerializable + FixedSize + Default>() {
let mut buffer = Vec::new();

View File

@@ -1,7 +1,7 @@
use super::BinarySerializable;
use std::io;
use std::io::Write;
use std::io::Read;
use std::io::Write;
/// Wrapper over a `u64` that serializes as a variable int.
#[derive(Debug, Eq, PartialEq)]

View File

@@ -8,10 +8,8 @@ const COMPRESSED_BLOCK_MAX_SIZE: usize = COMPRESSION_BLOCK_SIZE * 4 + 1;
pub use self::stream::CompressedIntStream;
use bitpacking::{BitPacker, BitPacker4x};
/// Returns the size in bytes of a compressed block, given `num_bits`.
pub fn compressed_block_size(num_bits: u8) -> usize {
1 + (num_bits as usize) * COMPRESSION_BLOCK_SIZE / 8
@@ -35,19 +33,21 @@ impl BlockEncoder {
pub fn compress_block_sorted(&mut self, block: &[u32], offset: u32) -> &[u8] {
let num_bits = self.bitpacker.num_bits_sorted(offset, block);
self.output[0] = num_bits;
let written_size = 1 + self.bitpacker.compress_sorted(offset, block, &mut self.output[1..], num_bits);
let written_size =
1 + self.bitpacker
.compress_sorted(offset, block, &mut self.output[1..], num_bits);
&self.output[..written_size]
}
pub fn compress_block_unsorted(&mut self, block: &[u32]) -> &[u8] {
let num_bits = self.bitpacker.num_bits(block);
self.output[0] = num_bits;
let written_size = 1 + self.bitpacker.compress(block, &mut self.output[1..], num_bits);
let written_size = 1 + self.bitpacker
.compress(block, &mut self.output[1..], num_bits);
&self.output[..written_size]
}
}
pub struct BlockDecoder {
bitpacker: BitPacker4x,
pub output: [u32; COMPRESSION_BLOCK_SIZE + 1],
@@ -68,17 +68,23 @@ impl BlockDecoder {
output_len: 0,
}
}
pub fn uncompress_block_sorted(&mut self, compressed_data: &[u8], offset: u32) -> usize {
let num_bits = compressed_data[0];
self.output_len = COMPRESSION_BLOCK_SIZE;
1 + self.bitpacker.decompress_sorted(offset, &compressed_data[1..], &mut self.output, num_bits)
1 + self.bitpacker.decompress_sorted(
offset,
&compressed_data[1..],
&mut self.output,
num_bits,
)
}
pub fn uncompress_block_unsorted<'a>(&mut self, compressed_data: &'a [u8]) -> usize {
let num_bits = compressed_data[0];
self.output_len = COMPRESSION_BLOCK_SIZE;
1 + self.bitpacker.decompress(&compressed_data[1..], &mut self.output, num_bits)
1 + self.bitpacker
.decompress(&compressed_data[1..], &mut self.output, num_bits)
}
#[inline]
@@ -174,8 +180,6 @@ impl VIntDecoder for BlockDecoder {
pub mod tests {
use super::*;
use tests;
use test::Bencher;
#[test]
fn test_encode_sorted_block() {
@@ -264,11 +268,34 @@ pub mod tests {
}
}
}
}
#[cfg(all(test, feature = "unstable"))]
mod bench {
use super::*;
use rand::Rng;
use rand::SeedableRng;
use rand::XorShiftRng;
use test::Bencher;
fn generate_array_with_seed(n: usize, ratio: f32, seed_val: u32) -> Vec<u32> {
let seed: &[u32; 4] = &[1, 2, 3, seed_val];
let mut rng: XorShiftRng = XorShiftRng::from_seed(*seed);
(0..u32::max_value())
.filter(|_| rng.next_f32() < ratio)
.take(n)
.collect()
}
pub fn generate_array(n: usize, ratio: f32) -> Vec<u32> {
generate_array_with_seed(n, ratio, 4)
}
#[bench]
fn bench_compress(b: &mut Bencher) {
let mut encoder = BlockEncoder::new();
let data = tests::generate_array(COMPRESSION_BLOCK_SIZE, 0.1);
let data = generate_array(COMPRESSION_BLOCK_SIZE, 0.1);
b.iter(|| {
encoder.compress_block_sorted(&data, 0u32);
});
@@ -277,7 +304,7 @@ pub mod tests {
#[bench]
fn bench_uncompress(b: &mut Bencher) {
let mut encoder = BlockEncoder::new();
let data = tests::generate_array(COMPRESSION_BLOCK_SIZE, 0.1);
let data = generate_array(COMPRESSION_BLOCK_SIZE, 0.1);
let compressed = encoder.compress_block_sorted(&data, 0u32);
let mut decoder = BlockDecoder::new();
b.iter(|| {
@@ -304,7 +331,7 @@ pub mod tests {
#[bench]
fn bench_compress_vint(b: &mut Bencher) {
let mut encoder = BlockEncoder::new();
let data = tests::generate_array(NUM_INTS_BENCH_VINT, 0.001);
let data = generate_array(NUM_INTS_BENCH_VINT, 0.001);
b.iter(|| {
encoder.compress_vint_sorted(&data, 0u32);
});
@@ -313,12 +340,11 @@ pub mod tests {
#[bench]
fn bench_uncompress_vint(b: &mut Bencher) {
let mut encoder = BlockEncoder::new();
let data = tests::generate_array(NUM_INTS_BENCH_VINT, 0.001);
let data = generate_array(NUM_INTS_BENCH_VINT, 0.001);
let compressed = encoder.compress_vint_sorted(&data, 0u32);
let mut decoder = BlockDecoder::new();
b.iter(|| {
decoder.uncompress_vint_sorted(compressed, 0u32, NUM_INTS_BENCH_VINT);
});
}
}
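As a sanity check on compressed_block_size above: a block is one header byte holding num_bits, followed by the bit-packed payload. With COMPRESSION_BLOCK_SIZE = 128 (the BitPacker4x block length):

// num_bits = 5: payload = 5 * 128 / 8 = 80 bytes, plus the header byte.
assert_eq!(compressed_block_size(5), 81);
// num_bits = 32 (incompressible): 1 + 32 * 128 / 8 = 513 bytes.
assert_eq!(compressed_block_size(32), 513);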

View File

@@ -1,6 +1,6 @@
use compression::compressed_block_size;
use compression::BlockDecoder;
use compression::COMPRESSION_BLOCK_SIZE;
use compression::compressed_block_size;
use directory::{ReadOnlySource, SourceRead};
/// Reads a stream of compressed ints.
@@ -13,7 +13,7 @@ pub struct CompressedIntStream {
buffer: SourceRead,
block_decoder: BlockDecoder,
cached_addr: usize, // address of the currently decoded block
cached_addr: usize, // address of the currently decoded block
cached_next_addr: usize, // address following the currently decoded block
addr: usize, // address of the block associated to the current position
@@ -42,7 +42,8 @@ impl CompressedIntStream {
// no need to read.
self.cached_next_addr
} else {
let next_addr = addr + self.block_decoder.uncompress_block_unsorted(self.buffer.slice_from(addr));
let next_addr = addr + self.block_decoder
.uncompress_block_unsorted(self.buffer.slice_from(addr));
self.cached_addr = addr;
self.cached_next_addr = next_addr;
next_addr
@@ -101,8 +102,8 @@ pub mod tests {
use super::CompressedIntStream;
use compression::compressed_block_size;
use compression::COMPRESSION_BLOCK_SIZE;
use compression::BlockEncoder;
use compression::COMPRESSION_BLOCK_SIZE;
use directory::ReadOnlySource;
fn create_stream_buffer() -> ReadOnlySource {

View File

@@ -1,34 +1,32 @@
use Result;
use core::SegmentId;
use error::{ErrorKind, ResultExt};
use serde_json;
use schema::Schema;
use std::sync::Arc;
use serde_json;
use std::borrow::BorrowMut;
use std::fmt;
use core::SegmentId;
use std::sync::Arc;
use Result;
#[cfg(feature="mmap")]
use super::pool::LeasedItem;
use super::pool::Pool;
use super::segment::create_segment;
use super::segment::Segment;
use core::searcher::Searcher;
use core::IndexMeta;
use core::SegmentMeta;
use core::SegmentReader;
use core::META_FILEPATH;
use directory::ManagedDirectory;
#[cfg(feature = "mmap")]
use directory::MmapDirectory;
use directory::{Directory, RAMDirectory};
use indexer::index_writer::open_index_writer;
use core::searcher::Searcher;
use std::convert::From;
use num_cpus;
use super::segment::Segment;
use core::SegmentReader;
use super::pool::Pool;
use core::SegmentMeta;
use super::pool::LeasedItem;
use std::path::Path;
use core::IndexMeta;
use indexer::DirectoryLock;
use IndexWriter;
use directory::ManagedDirectory;
use core::META_FILEPATH;
use super::segment::create_segment;
use indexer::segment_updater::save_new_metas;
use indexer::DirectoryLock;
use num_cpus;
use std::path::Path;
use tokenizer::TokenizerManager;
use IndexWriter;
const NUM_SEARCHERS: usize = 12;
@@ -65,7 +63,7 @@ impl Index {
/// The index will use the `MMapDirectory`.
///
/// If a previous index was in this directory, then its meta file will be destroyed.
#[cfg(feature="mmap")]
#[cfg(feature = "mmap")]
pub fn create<P: AsRef<Path>>(directory_path: P, schema: Schema) -> Result<Index> {
let mmap_directory = MmapDirectory::open(directory_path)?;
let directory = ManagedDirectory::new(mmap_directory)?;
@@ -85,8 +83,7 @@ impl Index {
///
/// The temp directory is only used for testing the `MmapDirectory`.
/// For other unit tests, prefer the `RAMDirectory`, see: `create_in_ram`.
#[cfg(feature="mmap")]
#[cfg(test)]
#[cfg(feature = "mmap")]
pub fn create_from_tempdir(schema: Schema) -> Result<Index> {
let mmap_directory = MmapDirectory::create_from_tempdir()?;
let directory = ManagedDirectory::new(mmap_directory)?;
@@ -106,6 +103,20 @@ impl Index {
Ok(index)
}
/// Open the index using the provided directory
pub fn open_directory<D: Directory>(directory: D) -> Result<Index> {
let directory = ManagedDirectory::new(directory)?;
let metas = load_metas(&directory)?;
Index::create_from_metas(directory, &metas)
}
/// Opens an index from the given directory path.
#[cfg(feature = "mmap")]
pub fn open<P: AsRef<Path>>(directory_path: P) -> Result<Index> {
let mmap_directory = MmapDirectory::open(directory_path)?;
Index::open_directory(mmap_directory)
}
/// Create a new index from a directory.
pub fn from_directory(mut directory: ManagedDirectory, schema: Schema) -> Result<Index> {
save_new_metas(schema.clone(), 0, directory.borrow_mut())?;
@@ -113,22 +124,6 @@ impl Index {
Index::create_from_metas(directory, &metas)
}
/// Opens an index from the given directory path.
#[cfg(feature="mmap")]
pub fn open<P: AsRef<Path>>(directory_path: P) -> Result<Index> {
let mmap_directory = MmapDirectory::open(directory_path)?;
let directory = ManagedDirectory::new(mmap_directory)?;
let metas = load_metas(&directory)?;
Index::create_from_metas(directory, &metas)
}
pub fn open_directory<TDirectory: Directory>(directory: TDirectory) -> Result<Index> {
let directory = ManagedDirectory::new(directory)?;
let metas = load_metas(&directory)?;
Index::create_from_metas(directory, &metas)
}
/// Reads the index meta file from the directory.
pub fn load_metas(&self) -> Result<IndexMeta> {
load_metas(self.directory())
@@ -231,8 +226,9 @@ impl Index {
.iter()
.map(SegmentReader::open)
.collect::<Result<_>>()?;
let schema = self.schema();
let searchers = (0..NUM_SEARCHERS)
.map(|_| Searcher::from(segment_readers.clone()))
.map(|_| Searcher::new(schema.clone(), segment_readers.clone()))
.collect();
self.searcher_pool.publish_new_generation(searchers);
Ok(())
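A hypothetical use of the open_directory entry point added above, assuming the directory already holds a valid index (a meta file written by an earlier create):

// Any Directory implementation works, not just MmapDirectory:
let dir = RAMDirectory::create(); // assume an index was previously built into it
let index = Index::open_directory(dir)?;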

View File

@@ -1,7 +1,7 @@
use schema::Schema;
use core::SegmentMeta;
use std::fmt;
use schema::Schema;
use serde_json;
use std::fmt;
/// Meta information about the `Index`.
///
@@ -45,9 +45,9 @@ impl fmt::Debug for IndexMeta {
#[cfg(test)]
mod tests {
use serde_json;
use super::IndexMeta;
use schema::{SchemaBuilder, TEXT};
use serde_json;
#[test]
fn test_serialize_metas() {

View File

@@ -1,13 +1,13 @@
use common::BinarySerializable;
use compression::CompressedIntStream;
use directory::{ReadOnlySource, SourceRead};
use termdict::{TermDictionary, TermDictionaryImpl};
use postings::{BlockSegmentPostings, SegmentPostings};
use postings::FreqReadingOption;
use postings::TermInfo;
use postings::{BlockSegmentPostings, SegmentPostings};
use schema::FieldType;
use schema::IndexRecordOption;
use schema::Term;
use compression::CompressedIntStream;
use postings::FreqReadingOption;
use common::BinarySerializable;
use schema::FieldType;
use termdict::TermDictionary;
/// The inverted index reader is in charge of accessing
/// the inverted index associated to a specific field.
@@ -23,16 +23,16 @@ use schema::FieldType;
/// `InvertedIndexReader` are created by calling
/// the `SegmentReader`'s [`.inverted_index(...)`] method
pub struct InvertedIndexReader {
termdict: TermDictionaryImpl,
termdict: TermDictionary,
postings_source: ReadOnlySource,
positions_source: ReadOnlySource,
record_option: IndexRecordOption,
total_num_tokens: u64
total_num_tokens: u64,
}
impl InvertedIndexReader {
pub(crate) fn new(
termdict: TermDictionaryImpl,
termdict: TermDictionary,
postings_source: ReadOnlySource,
positions_source: ReadOnlySource,
record_option: IndexRecordOption,
@@ -45,7 +45,7 @@ impl InvertedIndexReader {
postings_source: postings_source.slice_from(8),
positions_source,
record_option,
total_num_tokens
total_num_tokens,
}
}
@@ -56,11 +56,11 @@ impl InvertedIndexReader {
.get_index_record_option()
.unwrap_or(IndexRecordOption::Basic);
InvertedIndexReader {
termdict: TermDictionaryImpl::empty(field_type),
termdict: TermDictionary::empty(field_type),
postings_source: ReadOnlySource::empty(),
positions_source: ReadOnlySource::empty(),
record_option,
total_num_tokens: 0u64
total_num_tokens: 0u64,
}
}
@@ -70,7 +70,7 @@ impl InvertedIndexReader {
}
/// Return the term dictionary datastructure.
pub fn terms(&self) -> &TermDictionaryImpl {
pub fn terms(&self) -> &TermDictionary {
&self.termdict
}
@@ -149,8 +149,6 @@ impl InvertedIndexReader {
self.total_num_tokens
}
/// Returns the segment postings associated with the term, and with the given option,
/// or `None` if the term has never been encountered and indexed.
///
@@ -166,12 +164,15 @@ impl InvertedIndexReader {
Some(self.read_postings_from_terminfo(&term_info, option))
}
pub(crate) fn read_postings_no_deletes(&self, term: &Term, option: IndexRecordOption) -> Option<SegmentPostings> {
pub(crate) fn read_postings_no_deletes(
&self,
term: &Term,
option: IndexRecordOption,
) -> Option<SegmentPostings> {
let term_info = get!(self.get_term_info(term));
Some(self.read_postings_from_terminfo(&term_info, option))
}
/// Returns the number of documents containing the term.
pub fn doc_freq(&self, term: &Term) -> u32 {
self.get_term_info(term)
@@ -179,6 +180,3 @@ impl InvertedIndexReader {
.unwrap_or(0u32)
}
}

View File

@@ -1,24 +1,24 @@
pub mod searcher;
pub mod index;
mod segment_reader;
mod segment_id;
mod segment_component;
mod segment;
mod index_meta;
mod pool;
mod segment_meta;
mod inverted_index_reader;
mod pool;
pub mod searcher;
mod segment;
mod segment_component;
mod segment_id;
mod segment_meta;
mod segment_reader;
pub use self::index::Index;
pub use self::index_meta::IndexMeta;
pub use self::inverted_index_reader::InvertedIndexReader;
pub use self::searcher::Searcher;
pub use self::segment_component::SegmentComponent;
pub use self::segment_id::SegmentId;
pub use self::segment_reader::SegmentReader;
pub use self::segment::Segment;
pub use self::segment::SerializableSegment;
pub use self::index::Index;
pub use self::segment_component::SegmentComponent;
pub use self::segment_id::SegmentId;
pub use self::segment_meta::SegmentMeta;
pub use self::index_meta::IndexMeta;
pub use self::segment_reader::SegmentReader;
use std::path::PathBuf;

View File

@@ -1,8 +1,8 @@
use std::sync::atomic::AtomicUsize;
use std::sync::atomic::Ordering;
use crossbeam::sync::MsQueue;
use std::mem;
use std::ops::{Deref, DerefMut};
use crossbeam::sync::MsQueue;
use std::sync::atomic::AtomicUsize;
use std::sync::atomic::Ordering;
use std::sync::Arc;
pub struct GenerationItem<T> {
@@ -114,8 +114,8 @@ impl<T> Drop for LeasedItem<T> {
#[cfg(test)]
mod tests {
use std::iter;
use super::Pool;
use std::iter;
#[test]
fn test_pool() {

View File

@@ -1,14 +1,15 @@
use Result;
use core::SegmentReader;
use schema::Document;
use collector::Collector;
use query::Query;
use DocAddress;
use schema::{Field, Term};
use termdict::{TermDictionary, TermMerger};
use std::sync::Arc;
use std::fmt;
use core::InvertedIndexReader;
use core::SegmentReader;
use query::Query;
use schema::Document;
use schema::Schema;
use schema::{Field, Term};
use std::fmt;
use std::sync::Arc;
use termdict::TermMerger;
use DocAddress;
use Result;
/// Holds a list of `SegmentReader`s ready for search.
///
@@ -16,10 +17,18 @@ use core::InvertedIndexReader;
/// the destruction of the `Searcher`.
///
pub struct Searcher {
schema: Schema,
segment_readers: Vec<SegmentReader>,
}
impl Searcher {
/// Creates a new `Searcher`
pub(crate) fn new(schema: Schema, segment_readers: Vec<SegmentReader>) -> Searcher {
Searcher {
schema,
segment_readers,
}
}
/// Fetches a document from tantivy's store given a `DocAddress`.
///
/// The searcher uses the segment ordinal to route the
@@ -30,6 +39,11 @@ impl Searcher {
segment_reader.doc(doc_id)
}
/// Access the schema associated to the index of this searcher.
pub fn schema(&self) -> &Schema {
&self.schema
}
/// Returns the overall number of documents in the index.
pub fn num_docs(&self) -> u64 {
self.segment_readers
@@ -59,7 +73,7 @@ impl Searcher {
/// Runs a query on the segment readers wrapped by the searcher
pub fn search<C: Collector>(&self, query: &Query, collector: &mut C) -> Result<()> {
query.search(self, collector)
collector.search(self, query)
}
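The change above routes search through the collector instead of the query. A hypothetical call site; CountCollector stands in for any Collector, and names outside this diff are assumptions:

let searcher = index.searcher();
let mut count_collector = CountCollector::default();
// Dispatches to collector.search(searcher, query) under the hood:
searcher.search(&query, &mut count_collector)?;
println!("matched {} docs", count_collector.count());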
/// Return the field searcher associated to a `Field`.
@@ -92,12 +106,6 @@ impl FieldSearcher {
}
}
impl From<Vec<SegmentReader>> for Searcher {
fn from(segment_readers: Vec<SegmentReader>) -> Searcher {
Searcher { segment_readers }
}
}
impl fmt::Debug for Searcher {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
let segment_ids = self.segment_readers

View File

@@ -1,16 +1,16 @@
use Result;
use std::path::PathBuf;
use schema::Schema;
use std::fmt;
use core::SegmentId;
use directory::{FileProtection, ReadOnlySource, WritePtr};
use indexer::segment_serializer::SegmentSerializer;
use super::SegmentComponent;
use core::Index;
use std::result;
use directory::Directory;
use core::SegmentId;
use core::SegmentMeta;
use directory::error::{OpenReadError, OpenWriteError};
use directory::Directory;
use directory::{FileProtection, ReadOnlySource, WritePtr};
use indexer::segment_serializer::SegmentSerializer;
use schema::Schema;
use std::fmt;
use std::path::PathBuf;
use std::result;
use Result;
/// A segment is a piece of the index.
#[derive(Clone)]
@@ -111,8 +111,8 @@ mod tests {
use core::SegmentComponent;
use directory::Directory;
use std::collections::HashSet;
use schema::SchemaBuilder;
use std::collections::HashSet;
use Index;
#[test]

View File

@@ -1,3 +1,5 @@
use std::slice;
/// Enum describing each component of a tantivy segment.
/// Each component is stored in its own file,
/// using the pattern `segment_uuid`.`component_extension`,
@@ -26,7 +28,7 @@ pub enum SegmentComponent {
impl SegmentComponent {
/// Iterates through the components.
pub fn iterator() -> impl Iterator<Item = &'static SegmentComponent> {
pub fn iterator() -> slice::Iter<'static, SegmentComponent> {
static SEGMENT_COMPONENTS: [SegmentComponent; 7] = [
SegmentComponent::POSTINGS,
SegmentComponent::POSITIONS,

View File

@@ -1,6 +1,6 @@
use uuid::Uuid;
use std::fmt;
use std::cmp::{Ord, Ordering};
use std::fmt;
use uuid::Uuid;
#[cfg(test)]
use std::sync::atomic;

View File

@@ -1,7 +1,7 @@
use core::SegmentId;
use super::SegmentComponent;
use std::path::PathBuf;
use core::SegmentId;
use std::collections::HashSet;
use std::path::PathBuf;
#[derive(Clone, Debug, Serialize, Deserialize)]
struct DeleteMeta {

View File

@@ -1,31 +1,30 @@
use Result;
use core::Segment;
use core::SegmentId;
use core::SegmentComponent;
use std::sync::RwLock;
use common::HasLen;
use core::SegmentMeta;
use fastfield::{self, FastFieldNotAvailableError};
use fastfield::DeleteBitSet;
use store::StoreReader;
use schema::Document;
use DocId;
use std::sync::Arc;
use std::collections::HashMap;
use common::CompositeFile;
use std::fmt;
use common::HasLen;
use core::InvertedIndexReader;
use schema::Field;
use schema::FieldType;
use core::Segment;
use core::SegmentComponent;
use core::SegmentId;
use core::SegmentMeta;
use error::ErrorKind;
use termdict::TermDictionaryImpl;
use fastfield::DeleteBitSet;
use fastfield::FacetReader;
use fastfield::FastFieldReader;
use schema::Schema;
use termdict::TermDictionary;
use fastfield::{FastValue, MultiValueIntFastFieldReader};
use schema::Cardinality;
use fastfield::{self, FastFieldNotAvailableError};
use fastfield::{BytesFastFieldReader, FastValue, MultiValueIntFastFieldReader};
use fieldnorm::FieldNormReader;
use schema::Cardinality;
use schema::Document;
use schema::Field;
use schema::FieldType;
use schema::Schema;
use std::collections::HashMap;
use std::fmt;
use std::sync::Arc;
use std::sync::RwLock;
use store::StoreReader;
use termdict::TermDictionary;
use DocId;
use Result;
/// Entry point to access all of the datastructures of the `Segment`
///
@@ -76,6 +75,11 @@ impl SegmentReader {
self.segment_meta.num_docs()
}
/// Returns the schema of the index this segment belongs to.
pub fn schema(&self) -> &Schema {
&self.schema
}
/// Return the number of documents that have been
/// deleted in the segment.
pub fn num_deleted_docs(&self) -> DocId {
@@ -105,12 +109,25 @@ impl SegmentReader {
) -> fastfield::Result<FastFieldReader<Item>> {
let field_entry = self.schema.get_field_entry(field);
if Item::fast_field_cardinality(field_entry.field_type()) == Some(Cardinality::SingleValue)
{
self.fast_fields_composite
.open_read(field)
.ok_or_else(|| FastFieldNotAvailableError::new(field_entry))
.map(FastFieldReader::open)
} else {
{
self.fast_fields_composite
.open_read(field)
.ok_or_else(|| FastFieldNotAvailableError::new(field_entry))
.map(FastFieldReader::open)
} else {
Err(FastFieldNotAvailableError::new(field_entry))
}
}
pub(crate) fn fast_field_reader_with_idx<Item: FastValue>(
&self,
field: Field,
idx: usize,
) -> fastfield::Result<FastFieldReader<Item>> {
if let Some(ff_source) = self.fast_fields_composite.open_read_with_idx(field, idx) {
Ok(FastFieldReader::open(ff_source))
} else {
let field_entry = self.schema.get_field_entry(field);
Err(FastFieldNotAvailableError::new(field_entry))
}
}
@@ -123,21 +140,32 @@ impl SegmentReader {
) -> fastfield::Result<MultiValueIntFastFieldReader<Item>> {
let field_entry = self.schema.get_field_entry(field);
if Item::fast_field_cardinality(field_entry.field_type()) == Some(Cardinality::MultiValues)
{
let idx_reader = self.fast_fields_composite
.open_read_with_idx(field, 0)
.ok_or_else(|| FastFieldNotAvailableError::new(field_entry))
.map(FastFieldReader::open)?;
let vals_reader = self.fast_fields_composite
.open_read_with_idx(field, 1)
.ok_or_else(|| FastFieldNotAvailableError::new(field_entry))
.map(FastFieldReader::open)?;
Ok(MultiValueIntFastFieldReader::open(idx_reader, vals_reader))
} else {
{
let idx_reader = self.fast_field_reader_with_idx(field, 0)?;
let vals_reader = self.fast_field_reader_with_idx(field, 1)?;
Ok(MultiValueIntFastFieldReader::open(idx_reader, vals_reader))
} else {
Err(FastFieldNotAvailableError::new(field_entry))
}
}
/// Accessor to the `BytesFastFieldReader` associated to a given `Field`.
pub fn bytes_fast_field_reader(&self, field: Field) -> fastfield::Result<BytesFastFieldReader> {
let field_entry = self.schema.get_field_entry(field);
match field_entry.field_type() {
&FieldType::Bytes => {}
_ => return Err(FastFieldNotAvailableError::new(field_entry)),
}
let idx_reader = self.fast_fields_composite
.open_read_with_idx(field, 0)
.ok_or_else(|| FastFieldNotAvailableError::new(field_entry))
.map(FastFieldReader::open)?;
let values = self.fast_fields_composite
.open_read_with_idx(field, 1)
.ok_or_else(|| FastFieldNotAvailableError::new(field_entry))?;
Ok(BytesFastFieldReader::open(idx_reader, values))
}
/// Accessor to the `FacetReader` associated to a given `Field`.
pub fn facet_reader(&self, field: Field) -> Result<FacetReader> {
let field_entry = self.schema.get_field_entry(field);
@@ -157,7 +185,7 @@ impl SegmentReader {
field_entry.name()
))
})?;
let termdict = TermDictionaryImpl::from_source(termdict_source);
let termdict = TermDictionary::from_source(termdict_source);
let facet_reader = FacetReader::new(term_ords_reader, termdict);
Ok(facet_reader)
}
@@ -171,12 +199,14 @@ impl SegmentReader {
/// They are simply stored as a fast field, serialized in
/// the `.fieldnorm` file of the segment.
pub fn get_fieldnorms_reader(&self, field: Field) -> FieldNormReader {
if let Some(fieldnorm_source) = self.fieldnorms_composite
.open_read(field) {
if let Some(fieldnorm_source) = self.fieldnorms_composite.open_read(field) {
FieldNormReader::open(fieldnorm_source)
} else {
let field_name = self.schema.get_field_name(field);
let err_msg= format!("Field norm not found for field {:?}. Was it marked as indexed during indexing?", field_name);
let err_msg = format!(
"Field norm not found for field {:?}. Was it marked as indexed during indexing?",
field_name
);
panic!(err_msg);
}
}
@@ -211,13 +241,12 @@ impl SegmentReader {
let fieldnorms_data = segment.open_read(SegmentComponent::FIELDNORMS)?;
let fieldnorms_composite = CompositeFile::open(&fieldnorms_data)?;
let delete_bitset_opt =
if segment.meta().has_deletes() {
let delete_data = segment.open_read(SegmentComponent::DELETE)?;
Some(DeleteBitSet::open(delete_data))
} else {
None
};
let delete_bitset_opt = if segment.meta().has_deletes() {
let delete_data = segment.open_read(SegmentComponent::DELETE)?;
Some(DeleteBitSet::open(delete_data))
} else {
None
};
let schema = segment.schema();
Ok(SegmentReader {
@@ -281,7 +310,7 @@ impl SegmentReader {
.expect("Index corrupted. Failed to open field positions in composite file.");
let inv_idx_reader = Arc::new(InvertedIndexReader::new(
TermDictionaryImpl::from_source(termdict_source),
TermDictionary::from_source(termdict_source),
postings_source,
positions_source,
record_option,
@@ -323,6 +352,11 @@ impl SegmentReader {
.map(|delete_set| delete_set.is_deleted(doc))
.unwrap_or(false)
}
/// Returns an iterator that will iterate over the alive document ids
pub fn doc_ids_alive(&self) -> SegmentReaderAliveDocsIterator {
SegmentReaderAliveDocsIterator::new(&self)
}
}
impl fmt::Debug for SegmentReader {
@@ -330,3 +364,90 @@ impl fmt::Debug for SegmentReader {
write!(f, "SegmentReader({:?})", self.segment_id)
}
}
/// Implements the iterator trait to allow easy iteration
/// over non-deleted ("alive") DocIds in a SegmentReader
pub struct SegmentReaderAliveDocsIterator<'a> {
reader: &'a SegmentReader,
max_doc: DocId,
current: DocId,
}
impl<'a> SegmentReaderAliveDocsIterator<'a> {
pub fn new(reader: &'a SegmentReader) -> SegmentReaderAliveDocsIterator<'a> {
SegmentReaderAliveDocsIterator {
reader,
max_doc: reader.max_doc(),
current: 0,
}
}
}
impl<'a> Iterator for SegmentReaderAliveDocsIterator<'a> {
type Item = DocId;
fn next(&mut self) -> Option<Self::Item> {
// TODO: Use TinySet (like in BitSetDocSet) to speed this process up
if self.current >= self.max_doc {
return None;
}
// find the next alive doc id
while self.reader.is_deleted(self.current) {
self.current += 1;
if self.current >= self.max_doc {
return None;
}
}
// capture the current alive DocId
let result = Some(self.current);
// move down the chain
self.current += 1;
result
}
}
#[cfg(test)]
mod test {
use core::Index;
use schema::{SchemaBuilder, Term, STORED, TEXT};
use DocId;
#[test]
fn test_alive_docs_iterator() {
let mut schema_builder = SchemaBuilder::new();
schema_builder.add_text_field("name", TEXT | STORED);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema.clone());
let name = schema.get_field("name").unwrap();
{
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
index_writer.add_document(doc!(name => "tantivy"));
index_writer.add_document(doc!(name => "horse"));
index_writer.add_document(doc!(name => "jockey"));
index_writer.add_document(doc!(name => "cap"));
// we should now have one segment with four docs
index_writer.commit().unwrap();
}
{
let mut index_writer2 = index.writer(50_000_000).unwrap();
index_writer2.delete_term(Term::from_field_text(name, "horse"));
index_writer2.delete_term(Term::from_field_text(name, "cap"));
// ok, now we should have two deleted docs
index_writer2.commit().unwrap();
}
index.load_searchers().unwrap();
let searcher = index.searcher();
let docs: Vec<DocId> = searcher.segment_reader(0).doc_ids_alive().collect();
assert_eq!(vec![0u32, 2u32], docs);
}
}

View File

@@ -1,10 +1,10 @@
#![allow(dead_code)]
mod skiplist_builder;
mod skiplist;
mod skiplist_builder;
pub use self::skiplist_builder::SkipListBuilder;
pub use self::skiplist::SkipList;
pub use self::skiplist_builder::SkipListBuilder;
#[cfg(test)]
mod tests {

View File

@@ -1,6 +1,6 @@
use common::{BinarySerializable, VInt};
use std::marker::PhantomData;
use std::cmp::max;
use std::marker::PhantomData;
static EMPTY: [u8; 0] = [];

View File

@@ -1,7 +1,7 @@
use std::io::Write;
use common::{BinarySerializable, VInt, is_power_of_2};
use std::marker::PhantomData;
use common::{is_power_of_2, BinarySerializable, VInt};
use std::io;
use std::io::Write;
use std::marker::PhantomData;
struct LayerBuilder<T: BinarySerializable> {
period_mask: usize,

View File

@@ -1,5 +1,5 @@
use std::mem;
use super::heap::{Heap, HeapAllocable};
use std::mem;
#[inline]
pub fn is_power_of_2(val: u32) -> bool {
@@ -99,12 +99,8 @@ impl<'a> Iterator for ExpUnrolledLinkedListIterator<'a> {
#[cfg(test)]
mod tests {
use super::*;
use super::super::heap::Heap;
use test::Bencher;
const NUM_STACK: usize = 10_000;
const STACK_SIZE: u32 = 1000;
use super::*;
#[test]
fn test_stack() {
@@ -124,6 +120,17 @@ mod tests {
}
}
}
#[cfg(all(test, feature = "unstable"))]
mod bench {
use super::ExpUnrolledLinkedList;
use super::Heap;
use test::Bencher;
const NUM_STACK: usize = 10_000;
const STACK_SIZE: u32 = 1000;
#[bench]
fn bench_push_vec(bench: &mut Bencher) {
bench.iter(|| {

View File

@@ -1,54 +1,53 @@
use super::heap::{BytesRef, Heap, HeapAllocable};
use postings::UnorderedTermId;
use std::iter;
use std::mem;
use postings::UnorderedTermId;
use super::heap::{BytesRef, Heap, HeapAllocable};
use std::slice;
mod murmurhash2 {
const SEED: u32 = 3_242_157_231u32;
const M: u32 = 0x5bd1_e995;
#[inline(always)]
pub fn murmurhash2(key: &[u8]) -> u32 {
let mut key_ptr: *const u32 = key.as_ptr() as *const u32;
let m: u32 = 0x5bd1_e995;
let r = 24;
let len = key.len() as u32;
let mut h: u32 = SEED ^ len;
let num_blocks = len >> 2;
for _ in 0..num_blocks {
let mut k: u32 = unsafe { *key_ptr };
k = k.wrapping_mul(m);
k ^= k >> r;
k = k.wrapping_mul(m);
h = h.wrapping_mul(m);
let mut k: u32 = unsafe { *key_ptr }; // ok because of num_blocks definition
k = k.wrapping_mul(M);
k ^= k >> 24;
k = k.wrapping_mul(M);
h = h.wrapping_mul(M);
h ^= k;
key_ptr = key_ptr.wrapping_offset(1);
}
// Handle the last few bytes of the input array
let remaining = len & 3;
let key_ptr_u8: *const u8 = key_ptr as *const u8;
match remaining {
let remaining: &[u8] = &key[key.len() & !3..];
match remaining.len() {
3 => {
h ^= unsafe { u32::from(*key_ptr_u8.wrapping_offset(2)) } << 16;
h ^= unsafe { u32::from(*key_ptr_u8.wrapping_offset(1)) } << 8;
h ^= unsafe { u32::from(*key_ptr_u8) };
h = h.wrapping_mul(m);
h ^= u32::from(remaining[2]) << 16;
h ^= u32::from(remaining[1]) << 8;
h ^= u32::from(remaining[0]);
h = h.wrapping_mul(M);
}
2 => {
h ^= unsafe { u32::from(*key_ptr_u8.wrapping_offset(1)) } << 8;
h ^= unsafe { u32::from(*key_ptr_u8) };
h = h.wrapping_mul(m);
h ^= u32::from(remaining[1]) << 8;
h ^= u32::from(remaining[0]);
h = h.wrapping_mul(M);
}
1 => {
h ^= unsafe { u32::from(*key_ptr_u8) };
h = h.wrapping_mul(m);
h ^= u32::from(remaining[0]);
h = h.wrapping_mul(M);
}
_ => {}
}
h ^= h >> 13;
h = h.wrapping_mul(m);
h = h.wrapping_mul(M);
h ^ (h >> 15)
}
}
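A quick check that the slice-based tail is equivalent to the old pointer arithmetic:

// For an 11-byte key: num_blocks = 11 >> 2 = 2, so the block loop consumes bytes 0..8.
// key.len() & !3 rounds 11 down to 8, so `remaining` is &key[8..] of length 3
// (== 11 & 3), exactly the bytes the old key_ptr_u8 offsets addressed.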
@@ -117,11 +116,7 @@ struct QuadraticProbing {
impl QuadraticProbing {
fn compute(hash: usize, mask: usize) -> QuadraticProbing {
QuadraticProbing {
hash,
i: 0,
mask,
}
QuadraticProbing { hash, i: 0, mask }
}
#[inline]
@@ -131,6 +126,23 @@ impl QuadraticProbing {
}
}
pub struct Iter<'a: 'b, 'b> {
hashmap: &'b TermHashMap<'a>,
inner: slice::Iter<'a, usize>,
}
impl<'a, 'b> Iterator for Iter<'a, 'b> {
type Item = (&'b [u8], u32, UnorderedTermId);
fn next(&mut self) -> Option<Self::Item> {
self.inner.next().cloned().map(move |bucket: usize| {
let kv = self.hashmap.table[bucket];
let (key, offset): (&'b [u8], u32) = self.hashmap.get_key_value(kv.key_value_addr);
(key, offset, bucket as UnorderedTermId)
})
}
}
impl<'a> TermHashMap<'a> {
pub fn new(num_bucket_power_of_2: usize, heap: &'a Heap) -> TermHashMap<'a> {
let table_size = 1 << num_bucket_power_of_2;
@@ -161,16 +173,16 @@ impl<'a> TermHashMap<'a> {
pub fn set_bucket(&mut self, hash: u32, key_value_addr: BytesRef, bucket: usize) {
self.occupied.push(bucket);
self.table[bucket] = KeyValue {
key_value_addr, hash
key_value_addr,
hash,
};
}
pub fn iter<'b: 'a>(&'b self) -> impl Iterator<Item = (&'a [u8], u32, UnorderedTermId)> + 'b {
self.occupied.iter().cloned().map(move |bucket: usize| {
let kv = self.table[bucket];
let (key, offset) = self.get_key_value(kv.key_value_addr);
(key, offset, bucket as UnorderedTermId)
})
pub fn iter<'b: 'a>(&'b self) -> Iter<'a, 'b> {
Iter {
inner: self.occupied.iter(),
hashmap: &self,
}
}
pub fn get_or_create<S: AsRef<[u8]>, V: HeapAllocable>(
@@ -202,15 +214,32 @@ impl<'a> TermHashMap<'a> {
}
}
#[cfg(all(test, feature = "unstable"))]
mod bench {
use super::murmurhash2::murmurhash2;
use test::Bencher;
#[bench]
fn bench_murmurhash2(b: &mut Bencher) {
let keys: [&'static str; 3] = ["wer qwe qwe qwe ", "werbq weqweqwe2 ", "weraq weqweqwe3 "];
b.iter(|| {
let mut s = 0;
for &key in &keys {
s ^= murmurhash2(key.as_bytes());
}
s
});
}
}
#[cfg(test)]
mod tests {
use super::*;
use super::super::heap::{Heap, HeapAllocable};
use super::murmurhash2::murmurhash2;
use test::Bencher;
use std::collections::HashSet;
use super::split_memory;
use super::*;
use std::collections::HashSet;
struct TestValue {
val: u32,
@@ -281,6 +310,17 @@ mod tests {
}
}
#[test]
fn test_murmur_against_reference_impl() {
assert_eq!(murmurhash2("".as_bytes()), 3632506080);
assert_eq!(murmurhash2("a".as_bytes()), 455683869);
assert_eq!(murmurhash2("ab".as_bytes()), 2448092234);
assert_eq!(murmurhash2("abc".as_bytes()), 2066295634);
assert_eq!(murmurhash2("abcd".as_bytes()), 2588571162);
assert_eq!(murmurhash2("abcde".as_bytes()), 2988696942);
assert_eq!(murmurhash2("abcdefghijklmnop".as_bytes()), 2350868870);
}
#[test]
fn test_murmur_collisions() {
let mut set: HashSet<u32> = HashSet::default();
@@ -292,18 +332,4 @@ mod tests {
assert_eq!(set.len(), 10_000);
}
#[bench]
fn bench_murmurhash_2(b: &mut Bencher) {
let keys: Vec<&'static str> =
vec!["wer qwe qwe qwe ", "werbq weqweqwe2 ", "weraq weqweqwe3 "];
b.iter(|| {
keys.iter()
.map(|&s| s.as_bytes())
.map(murmurhash2::murmurhash2)
.map(|h| h as u64)
.last()
.unwrap()
});
}
}

View File

@@ -1,7 +1,7 @@
use byteorder::{ByteOrder, NativeEndian};
use std::cell::UnsafeCell;
use std::mem;
use std::ptr;
use byteorder::{ByteOrder, NativeEndian};
/// `BytesRef` refers to a slice in tantivy's custom `Heap`.
///

View File

@@ -1,10 +1,10 @@
mod expull;
pub(crate) mod hashmap;
mod heap;
mod expull;
pub use self::heap::{Heap, HeapAllocable};
pub use self::expull::ExpUnrolledLinkedList;
pub use self::hashmap::TermHashMap;
pub use self::heap::{Heap, HeapAllocable};
#[test]
fn test_unrolled_linked_list() {

View File

@@ -1,11 +1,11 @@
use std::marker::Send;
use std::fmt;
use std::path::Path;
use directory::error::{DeleteError, OpenReadError, OpenWriteError};
use directory::{ReadOnlySource, WritePtr};
use std::result;
use std::fmt;
use std::io;
use std::marker::Send;
use std::marker::Sync;
use std::path::Path;
use std::result;
/// Write-once read many (WORM) abstraction for where
/// tantivy's data should be stored.

View File

@@ -1,7 +1,7 @@
use std::error::Error as StdError;
use std::path::PathBuf;
use std::io;
use std::fmt;
use std::io;
use std::path::PathBuf;
/// General IO error with an optional path to the offending file.
#[derive(Debug)]

View File

@@ -1,18 +1,18 @@
use std::path::{Path, PathBuf};
use serde_json;
use core::MANAGED_FILEPATH;
use directory::error::{DeleteError, IOError, OpenReadError, OpenWriteError};
use directory::{ReadOnlySource, WritePtr};
use std::result;
use std::io;
use Directory;
use std::sync::{Arc, RwLock};
use std::collections::HashSet;
use std::sync::RwLockWriteGuard;
use std::io::Write;
use core::MANAGED_FILEPATH;
use std::collections::HashMap;
use std::fmt;
use error::{ErrorKind, Result, ResultExt};
use serde_json;
use std::collections::HashMap;
use std::collections::HashSet;
use std::fmt;
use std::io;
use std::io::Write;
use std::path::{Path, PathBuf};
use std::result;
use std::sync::RwLockWriteGuard;
use std::sync::{Arc, RwLock};
use Directory;
/// Wrapper of directories that keeps track of files created by Tantivy.
///
@@ -86,7 +86,7 @@ impl ManagedDirectory {
let managed_files: HashSet<PathBuf> = serde_json::from_str(&managed_files_json)
.chain_err(|| ErrorKind::CorruptedFile(MANAGED_FILEPATH.clone()))?;
Ok(ManagedDirectory {
directory: box directory,
directory: Box::new(directory),
meta_informations: Arc::new(RwLock::new(MetaInformation {
managed_paths: managed_files,
protected_files: HashMap::default(),
@@ -94,7 +94,7 @@ impl ManagedDirectory {
})
}
Err(OpenReadError::FileDoesNotExist(_)) => Ok(ManagedDirectory {
directory: box directory,
directory: Box::new(directory),
meta_informations: Arc::default(),
}),
Err(OpenReadError::IOError(e)) => Err(From::from(e)),
@@ -265,7 +265,7 @@ impl Directory for ManagedDirectory {
}
fn box_clone(&self) -> Box<Directory> {
box self.clone()
Box::new(self.clone())
}
}
@@ -282,10 +282,10 @@ impl Clone for ManagedDirectory {
mod tests {
use super::*;
#[cfg(feature="mmap")]
#[cfg(feature = "mmap")]
use directory::MmapDirectory;
use std::path::Path;
use std::io::Write;
use std::path::Path;
use tempdir::TempDir;
lazy_static! {
@@ -294,7 +294,7 @@ mod tests {
}
#[test]
#[cfg(feature="mmap")]
#[cfg(feature = "mmap")]
fn test_managed_directory() {
let tempdir = TempDir::new("index").unwrap();
let tempdir_path = PathBuf::from(tempdir.path());
@@ -343,7 +343,7 @@ mod tests {
}
#[test]
#[cfg(feature="mmap")]
#[cfg(feature = "mmap")]
fn test_managed_directory_gc_while_mmapped() {
let tempdir = TempDir::new("index").unwrap();
let tempdir_path = PathBuf::from(tempdir.path());
@@ -373,7 +373,7 @@ mod tests {
}
#[test]
#[cfg(feature="mmap")]
#[cfg(feature = "mmap")]
fn test_managed_directory_protect() {
let tempdir = TempDir::new("index").unwrap();
let tempdir_path = PathBuf::from(tempdir.path());

View File

@@ -1,17 +1,17 @@
use atomicwrites;
use common::make_io_err;
use directory::Directory;
use directory::error::{DeleteError, IOError, OpenDirectoryError, OpenReadError, OpenWriteError};
use directory::ReadOnlySource;
use directory::shared_vec_slice::SharedVecSlice;
use directory::Directory;
use directory::ReadOnlySource;
use directory::WritePtr;
use fst::raw::MmapReadOnly;
use std::collections::hash_map::Entry as HashMapEntry;
use std::collections::HashMap;
use std::convert::From;
use std::fmt;
use std::fs::{self, File};
use std::fs::OpenOptions;
use std::fs::{self, File};
use std::io::{self, Seek, SeekFrom};
use std::io::{BufWriter, Read, Write};
use std::path::{Path, PathBuf};
@@ -40,9 +40,11 @@ fn open_mmap(full_path: &Path) -> result::Result<Option<MmapReadOnly>, OpenReadE
// instead.
return Ok(None);
}
MmapReadOnly::open(&file)
.map(Some)
.map_err(|e| From::from(IOError::with_path(full_path.to_owned(), e)))
unsafe {
MmapReadOnly::open(&file)
.map(Some)
.map_err(|e| From::from(IOError::with_path(full_path.to_owned(), e)))
}
}
#[derive(Default, Clone, Debug, Serialize, Deserialize)]

View File

@@ -4,32 +4,29 @@ WORM directory abstraction.
*/
#[cfg(feature="mmap")]
#[cfg(feature = "mmap")]
mod mmap_directory;
mod ram_directory;
mod directory;
mod managed_directory;
mod ram_directory;
mod read_only_source;
mod shared_vec_slice;
mod managed_directory;
mod static_directory;
/// Errors specific to the directory module.
pub mod error;
use std::io::{BufWriter, Seek, Write};
pub use self::static_directory::StaticDirectory;
pub use self::static_directory::write_static_from_directory;
pub use self::read_only_source::ReadOnlySource;
pub use self::directory::Directory;
pub use self::ram_directory::RAMDirectory;
pub use self::read_only_source::ReadOnlySource;
#[cfg(feature="mmap")]
#[cfg(feature = "mmap")]
pub use self::mmap_directory::MmapDirectory;
pub(crate) use self::read_only_source::SourceRead;
pub(crate) use self::managed_directory::{FileProtection, ManagedDirectory};
pub(crate) use self::read_only_source::SourceRead;
/// Synonym of Seek + Write
pub trait SeekableWrite: Seek + Write {}
@@ -45,8 +42,8 @@ pub type WritePtr = BufWriter<Box<SeekableWrite>>;
mod tests {
use super::*;
use std::path::Path;
use std::io::{Seek, SeekFrom, Write};
use std::path::Path;
lazy_static! {
static ref TEST_PATH: &'static Path = Path::new("some_path_for_test");
@@ -59,7 +56,7 @@ mod tests {
}
#[test]
#[cfg(feature="mmap")]
#[cfg(feature = "mmap")]
fn test_mmap_directory() {
let mut mmap_directory = MmapDirectory::create_from_tempdir().unwrap();
test_directory(&mut mmap_directory);

View File

@@ -1,14 +1,14 @@
use super::shared_vec_slice::SharedVecSlice;
use common::make_io_err;
use directory::error::{DeleteError, IOError, OpenReadError, OpenWriteError};
use directory::WritePtr;
use directory::{Directory, ReadOnlySource};
use std::collections::HashMap;
use std::fmt;
use std::io::{self, BufWriter, Cursor, Seek, SeekFrom, Write};
use std::path::{Path, PathBuf};
use std::result;
use std::sync::{Arc, RwLock};
use common::make_io_err;
use directory::{Directory, ReadOnlySource};
use directory::error::{DeleteError, IOError, OpenReadError, OpenWriteError};
use directory::WritePtr;
use super::shared_vec_slice::SharedVecSlice;
/// Writer associated with the `RAMDirectory`
///

View File

@@ -1,13 +1,11 @@
#[cfg(feature="mmap")]
use fst::raw::MmapReadOnly;
use std::ops::Deref;
use super::shared_vec_slice::SharedVecSlice;
use common::HasLen;
use std::slice;
use std::io::{self, Read};
#[cfg(feature = "mmap")]
use fst::raw::MmapReadOnly;
use stable_deref_trait::{CloneStableDeref, StableDeref};
const EMPTY_SLICE: [u8; 0] = [];
use std::io::{self, Read};
use std::ops::Deref;
use std::slice;
/// Read object that represents files in tantivy.
///
@@ -17,12 +15,10 @@ const EMPTY_SLICE: [u8; 0] = [];
/// hold by this object should never be altered or destroyed.
pub enum ReadOnlySource {
/// Mmap source of data
#[cfg(feature="mmap")]
#[cfg(feature = "mmap")]
Mmap(MmapReadOnly),
/// Wrapping a `Vec<u8>`
Anonymous(SharedVecSlice),
/// Wrapping a static slice
Static(&'static [u8])
}
unsafe impl StableDeref for ReadOnlySource {}
@@ -39,16 +35,15 @@ impl Deref for ReadOnlySource {
impl ReadOnlySource {
/// Creates an empty ReadOnlySource
pub fn empty() -> ReadOnlySource {
ReadOnlySource::Static(&EMPTY_SLICE)
ReadOnlySource::Anonymous(SharedVecSlice::empty())
}
/// Returns the data underlying the ReadOnlySource object.
pub fn as_slice(&self) -> &[u8] {
match *self {
#[cfg(feature="mmap")]
ReadOnlySource::Mmap(ref mmap_read_only) => unsafe { mmap_read_only.as_slice() },
#[cfg(feature = "mmap")]
ReadOnlySource::Mmap(ref mmap_read_only) => mmap_read_only.as_slice(),
ReadOnlySource::Anonymous(ref shared_vec) => shared_vec.as_slice(),
ReadOnlySource::Static(data) => data,
}
}
@@ -71,9 +66,14 @@ impl ReadOnlySource {
/// 1KB slice is remaining, the whole `500MBs`
/// are retained in memory.
pub fn slice(&self, from_offset: usize, to_offset: usize) -> ReadOnlySource {
assert!(from_offset <= to_offset, "Requested negative slice [{}..{}]", from_offset, to_offset);
assert!(
from_offset <= to_offset,
"Requested negative slice [{}..{}]",
from_offset,
to_offset
);
match *self {
#[cfg(feature="mmap")]
#[cfg(feature = "mmap")]
ReadOnlySource::Mmap(ref mmap_read_only) => {
let sliced_mmap = mmap_read_only.range(from_offset, to_offset - from_offset);
ReadOnlySource::Mmap(sliced_mmap)
@@ -81,9 +81,6 @@ impl ReadOnlySource {
ReadOnlySource::Anonymous(ref shared_vec) => {
ReadOnlySource::Anonymous(shared_vec.slice(from_offset, to_offset))
}
ReadOnlySource::Static(data) => {
ReadOnlySource::Static(&data[from_offset..to_offset])
}
}
}
@@ -124,12 +121,6 @@ impl From<Vec<u8>> for ReadOnlySource {
}
}
impl From<&'static [u8]> for ReadOnlySource {
fn from(data: &'static [u8]) -> ReadOnlySource {
ReadOnlySource::Static(data)
}
}
/// Acts as a owning cursor over the data backed up by a `ReadOnlySource`
pub(crate) struct SourceRead {
_data_owner: ReadOnlySource,
@@ -144,13 +135,11 @@ impl SourceRead {
pub fn slice_from(&self, start: usize) -> &[u8] {
&self.cursor[start..]
}
pub fn get(&self, idx: usize) -> u8 {
self.cursor[idx]
}
}
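A minimal sketch of the slicing semantics documented above, on the Anonymous (Vec-backed) variant; as the doc comment warns, the Mmap variant keeps the whole mapping alive however small the slice:

let source = ReadOnlySource::from(vec![0u8, 1, 2, 3, 4]);
let sub = source.slice(1, 4); // half-open byte range [1, 4)
assert_eq!(sub.as_slice(), &[1u8, 2, 3]);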
impl AsRef<[u8]> for SourceRead {

View File

@@ -1,123 +0,0 @@
use std::collections::HashMap;
use Directory;
use std::path::PathBuf;
use directory::ReadOnlySource;
use std::io::BufWriter;
use directory::error::{DeleteError, OpenReadError, OpenWriteError};
use std::path::Path;
use std::fmt::{Formatter, Debug, self};
use Result as TantivyResult;
use directory::SeekableWrite;
use std::io;
use std::fs;
use common::Endianness;
use common::BinarySerializable;
use common::VInt;
use byteorder::ByteOrder;
use std::str;
use std::fs::File;
use std::io::{Read, Write};
use std::ffi::OsString;
#[derive(Clone)]
pub struct StaticDirectory {
files: HashMap<PathBuf, &'static [u8]>,
}
impl Debug for StaticDirectory {
fn fmt(&self, f: &mut Formatter) -> Result<(), fmt::Error> {
write!(f, "StaticDirectory[{} files]", self.files.len())?;
Ok(())
}
}
impl StaticDirectory {
pub fn open(mut data: &'static [u8]) -> TantivyResult<StaticDirectory> {
assert!(data.len() > 8);
let footer_len_offset = data.len() - 8;
let body_len = Endianness::read_u64(&data[footer_len_offset..]) as usize;
let mut body = &data[..body_len];
let mut footer = &data[body_len..footer_len_offset];
let num_files = VInt::deserialize(&mut footer)?.0 as usize;
let mut files = HashMap::new();
for _ in 0..num_files {
let filename_len = VInt::deserialize(&mut footer)?.0 as usize;
let filename = &footer[..filename_len];
footer = &footer[filename_len..];
let data_len = VInt::deserialize(&mut footer)?.0 as usize;
let file_data = &body[..data_len];
body = &body[data_len..];
let filename_str = str::from_utf8(filename).expect("Invalid UTF8");
let filename = PathBuf::from(filename_str);
println!("{:?} {:?}", filename, data_len);
files.insert(filename, file_data);
}
Ok(StaticDirectory {
files
})
}
}
impl Directory for StaticDirectory {
fn open_read(&self, path: &Path) -> Result<ReadOnlySource, OpenReadError> {
if let Some(static_data) = self.files.get(path) {
Ok(ReadOnlySource::from(*static_data))
} else {
Err(OpenReadError::FileDoesNotExist(path.to_owned()))
}
}
fn delete(&self, path: &Path) -> Result<(), DeleteError> {
unimplemented!("Static directory is read-only!")
}
fn exists(&self, path: &Path) -> bool {
self.files.contains_key(path)
}
fn open_write(&mut self, path: &Path) -> Result<BufWriter<Box<SeekableWrite>>, OpenWriteError> {
unimplemented!("Static directory is read-only!")
}
fn atomic_read(&self, path: &Path) -> Result<Vec<u8>, OpenReadError> {
if let Some(static_data) = self.files.get(path) {
Ok(static_data.to_vec())
} else {
Err(OpenReadError::FileDoesNotExist(path.to_owned()))
}
}
fn atomic_write(&mut self, path: &Path, data: &[u8]) -> io::Result<()> {
unimplemented!("Static directory is read-only!")
}
fn box_clone(&self) -> Box<Directory> {
box self.clone()
}
}
pub fn write_static_from_directory(directory_path: &Path) -> TantivyResult<Vec<u8>> {
assert!(directory_path.is_dir());
let mut file_data: Vec<(OsString, usize)> = Vec::new();
let mut write: Vec<u8> = Vec::new();
for entry in fs::read_dir(directory_path)? {
let entry = entry?;
let path = entry.path();
if path.is_file() {
info!("Appending {}", path.to_string_lossy());
let mut open_file = File::open(&path)?;
let file_len = open_file.read_to_end(&mut write)?;
file_data.push((entry.file_name(), file_len));
}
}
// write footer
let body_len = write.len();
VInt(file_data.len() as u64).serialize(&mut write)?;
for (filename, filelen) in file_data {
VInt(filename.len() as u64).serialize(&mut write)?;
write.write_all(filename.to_string_lossy().as_bytes())?;
VInt(filelen as u64).serialize(&mut write)?;
}
(body_len as u64).serialize(&mut write)?;
Ok(write)
}
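For reference, the on-disk layout the removed write_static_from_directory produced, reconstructed from the code above:

// [ body    ] file_0 bytes | file_1 bytes | ...   (directory read order)
// [ footer  ] VInt(num_files), then per file:
//             VInt(name_len) | name (UTF-8) | VInt(file_len)
// [ trailer ] body length as a little-endian u64, in the final 8 bytes;
//             StaticDirectory::open reads it from data.len() - 8.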

View File

@@ -1,8 +1,8 @@
use DocId;
use common::BitSet;
use std::borrow::Borrow;
use std::borrow::BorrowMut;
use std::cmp::Ordering;
use common::BitSet;
use DocId;
/// Expresses the outcome of a call to `DocSet`'s `.skip_next(...)`.
#[derive(PartialEq, Eq, Debug)]

View File

@@ -2,13 +2,13 @@
use std::io;
use std::path::PathBuf;
use std::sync::PoisonError;
use directory::error::{IOError, OpenDirectoryError, OpenReadError, OpenWriteError};
use fastfield::FastFieldNotAvailableError;
use query;
use schema;
use fastfield::FastFieldNotAvailableError;
use serde_json;
use std::path::PathBuf;
use std::sync::PoisonError;
error_chain!(
errors {
@@ -48,10 +48,10 @@ error_chain!(
description("an error occurred in a thread")
display("an error occurred in a thread: '{}'", err)
}
/// An Error appeared related to the lack of a field.
SchemaError(field: String) {
description("a schema field is missing")
display("a schema field is missing: '{}'", field)
/// An Error appeared related to the schema.
SchemaError(message: String) {
description("the schema is not matching expectations.")
display("Schema error: '{}'", message)
}
/// Tried to access a fastfield reader for a field not configured accordingly.
FastFieldError(err: FastFieldNotAvailableError) {

View File

@@ -0,0 +1,38 @@
mod reader;
mod writer;
pub use self::reader::BytesFastFieldReader;
pub use self::writer::BytesFastFieldWriter;
#[cfg(test)]
mod tests {
use schema::SchemaBuilder;
use Index;
#[test]
fn test_bytes() {
let mut schema_builder = SchemaBuilder::default();
let field = schema_builder.add_bytes_field("bytesfield");
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
index_writer.add_document(doc!(field=>vec![0u8, 1, 2, 3]));
index_writer.add_document(doc!(field=>vec![]));
index_writer.add_document(doc!(field=>vec![255u8]));
index_writer.add_document(doc!(field=>vec![1u8, 3, 5, 7, 9]));
index_writer.add_document(doc!(field=>vec![0u8; 1000]));
assert!(index_writer.commit().is_ok());
index.load_searchers().unwrap();
let searcher = index.searcher();
let reader = searcher.segment_reader(0);
let bytes_reader = reader.bytes_fast_field_reader(field).unwrap();
assert_eq!(bytes_reader.get_val(0), &[0u8, 1, 2, 3]);
assert!(bytes_reader.get_val(1).is_empty());
assert_eq!(bytes_reader.get_val(2), &[255u8]);
assert_eq!(bytes_reader.get_val(3), &[1u8, 3, 5, 7, 9]);
let long = vec![0u8; 1000];
assert_eq!(bytes_reader.get_val(4), long.as_slice());
}
}

View File

@@ -0,0 +1,37 @@
use owning_ref::OwningRef;
use directory::ReadOnlySource;
use fastfield::FastFieldReader;
use DocId;
/// Reader for byte array fast fields
///
/// The reader is implemented as a `u64` fast field and a separate collection of bytes.
///
/// The `vals_reader` will access the concatenated list of all values for all documents.
///
/// The `idx_reader` associates, for each document, the index of its first value.
///
/// Reading the value for a document is done by reading the start index for it,
/// and the start index for the next document, and keeping the bytes in between.
pub struct BytesFastFieldReader {
idx_reader: FastFieldReader<u64>,
values: OwningRef<ReadOnlySource, [u8]>,
}
impl BytesFastFieldReader {
pub(crate) fn open(
idx_reader: FastFieldReader<u64>,
values_source: ReadOnlySource,
) -> BytesFastFieldReader {
let values = OwningRef::new(values_source).map(|source| &source[..]);
BytesFastFieldReader { idx_reader, values }
}
/// Returns the bytes associated to the given `doc`
pub fn get_val(&self, doc: DocId) -> &[u8] {
let start = self.idx_reader.get(doc) as usize;
let stop = self.idx_reader.get(doc + 1) as usize;
&self.values[start..stop]
}
}

View File

@@ -0,0 +1,96 @@
use std::io;
use fastfield::serializer::FastFieldSerializer;
use schema::{Document, Field, Value};
use DocId;
/// Writer for byte array (as in, any number of bytes per document) fast fields
///
/// This `BytesFastFieldWriter` is only useful for advanced users.
/// The normal way to get your associated bytes in your index
/// is to
/// - declare your field with fast set to `Cardinality::SingleValue`
/// in your schema
/// - add your document simply by calling `.add_document(...)` with the bytes associated to the field.
///
/// The `BytesFastFieldWriter` can be acquired from the
/// fast field writer by calling
/// [`.get_bytes_writer(...)`](./struct.FastFieldsWriter.html#method.get_bytes_writer).
///
/// Once acquired, writing is done by calling `.add_document_val(&[u8])`
/// once per document, even if there are no bytes associated to it.
pub struct BytesFastFieldWriter {
field: Field,
vals: Vec<u8>,
doc_index: Vec<u64>,
}
impl BytesFastFieldWriter {
/// Creates a new `BytesFastFieldWriter`
pub fn new(field: Field) -> Self {
BytesFastFieldWriter {
field,
vals: Vec::new(),
doc_index: Vec::new(),
}
}
/// Access the field associated to the `BytesFastFieldWriter`
pub fn field(&self) -> Field {
self.field
}
/// Finalize the current document.
pub(crate) fn next_doc(&mut self) {
self.doc_index.push(self.vals.len() as u64);
}
/// Shift to the next document and add all of the
/// matching field values present in the document.
pub fn add_document(&mut self, doc: &Document) {
self.next_doc();
for field_value in doc.field_values() {
if field_value.field() == self.field {
if let &Value::Bytes(ref bytes) = field_value.value() {
self.vals.extend_from_slice(bytes);
} else {
panic!(
"Bytes field contained non-Bytes Value! Field {:?} = {:?}",
self.field, field_value
);
}
}
}
}
/// Register the bytes associated to a document.
///
/// The method returns the `DocId` of the document that was
/// just written.
pub fn add_document_val(&mut self, val: &[u8]) -> DocId {
let doc = self.doc_index.len() as DocId;
self.next_doc();
self.vals.extend_from_slice(val);
doc
}
/// Serializes the fast field values by pushing them to the `FastFieldSerializer`.
pub fn serialize(&self, serializer: &mut FastFieldSerializer) -> io::Result<()> {
{
// writing the offset index
let mut doc_index_serializer =
serializer.new_u64_fast_field_with_idx(self.field, 0, self.vals.len() as u64, 0)?;
for &offset in &self.doc_index {
doc_index_serializer.add_val(offset)?;
}
doc_index_serializer.add_val(self.vals.len() as u64)?;
doc_index_serializer.close_field()?;
}
{
// writing the values themselves
let mut value_serializer = serializer.new_bytes_fast_field_with_idx(self.field, 1)?;
value_serializer.write_all(&self.vals)?;
}
Ok(())
}
}
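A minimal sketch of the advanced path described in the doc comment above; field is assumed to be a bytes field obtained from the schema:

let mut bytes_writer = BytesFastFieldWriter::new(field);
let doc0 = bytes_writer.add_document_val(b"hello"); // returns DocId 0
let doc1 = bytes_writer.add_document_val(b"");      // empty payloads are fine
assert_eq!((doc0, doc1), (0, 1));
// serialize(...) then writes the offset index as sub-file idx 0 and the
// concatenated values as sub-file idx 1, as shown above.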

View File

@@ -1,10 +1,10 @@
use bit_set::BitSet;
use directory::WritePtr;
use std::io::Write;
use std::io;
use directory::ReadOnlySource;
use DocId;
use common::HasLen;
use directory::ReadOnlySource;
use directory::WritePtr;
use std::io;
use std::io::Write;
use DocId;
/// Write a delete `BitSet`
///
@@ -62,10 +62,8 @@ impl DeleteBitSet {
b & (1u8 << shift) != 0
}
}
}
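The bit addressing used by is_deleted above, worked through for one document:

// For doc 10: byte = 10 / 8 = 1, shift = 10 % 8 = 2,
// so the doc is deleted iff data[1] & (1u8 << 2) != 0.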
impl HasLen for DeleteBitSet {
fn len(&self) -> usize {
self.len
@@ -74,10 +72,10 @@ impl HasLen for DeleteBitSet {
#[cfg(test)]
mod tests {
use std::path::PathBuf;
use super::*;
use bit_set::BitSet;
use directory::*;
use super::*;
use std::path::PathBuf;
fn test_delete_bitset_helper(bitset: &BitSet) {
let test_path = PathBuf::from("test");

View File

@@ -1,5 +1,5 @@
use std::result;
use schema::FieldEntry;
use std::result;
/// `FastFieldNotAvailableError` is returned when the
/// user requested for a fast field reader, and the field was not

View File

@@ -1,8 +1,8 @@
use super::MultiValueIntFastFieldReader;
use DocId;
use termdict::TermOrdinal;
use schema::Facet;
use termdict::{TermDictionary, TermDictionaryImpl};
use termdict::TermDictionary;
use termdict::TermOrdinal;
use DocId;
/// The facet reader makes it possible to access the list of
/// facets associated to a given document in a specific
@@ -19,7 +19,7 @@ use termdict::{TermDictionary, TermDictionaryImpl};
/// only makes sense for a given segment.
pub struct FacetReader {
term_ords: MultiValueIntFastFieldReader<u64>,
term_dict: TermDictionaryImpl,
term_dict: TermDictionary,
}
impl FacetReader {
@@ -28,11 +28,11 @@ impl FacetReader {
/// A facet reader just wraps :
/// - a `MultiValueIntFastFieldReader` that makes it possible to
/// access the list of facet ords for a given document.
/// - a `TermDictionaryImpl` that helps associating a facet to
/// - a `TermDictionary` that helps associating a facet to
/// an ordinal and vice versa.
pub fn new(
term_ords: MultiValueIntFastFieldReader<u64>,
term_dict: TermDictionaryImpl,
term_dict: TermDictionary,
) -> FacetReader {
FacetReader {
term_ords,
@@ -50,7 +50,7 @@ impl FacetReader {
}
/// Accessor for the facet term dictionary.
pub fn facet_dict(&self) -> &TermDictionaryImpl {
pub fn facet_dict(&self) -> &TermDictionary {
&self.term_dict
}

View File

@@ -23,26 +23,28 @@ values stored.
Read access performance is comparable to that of an array lookup.
*/
pub use self::bytes::{BytesFastFieldReader, BytesFastFieldWriter};
pub use self::delete::write_delete_bitset;
pub use self::delete::DeleteBitSet;
pub use self::error::{FastFieldNotAvailableError, Result};
pub use self::facet_reader::FacetReader;
pub use self::multivalued::{MultiValueIntFastFieldReader, MultiValueIntFastFieldWriter};
pub use self::reader::FastFieldReader;
pub use self::serializer::FastFieldSerializer;
pub use self::writer::{FastFieldsWriter, IntFastFieldWriter};
use common;
use schema::Cardinality;
use schema::FieldType;
use schema::Value;
pub use self::delete::DeleteBitSet;
pub use self::delete::write_delete_bitset;
pub use self::error::{FastFieldNotAvailableError, Result};
pub use self::facet_reader::FacetReader;
pub use self::multivalued::MultiValueIntFastFieldReader;
pub use self::reader::FastFieldReader;
pub use self::serializer::FastFieldSerializer;
pub use self::writer::{FastFieldsWriter, IntFastFieldWriter};
mod reader;
mod writer;
mod serializer;
mod error;
mod bytes;
mod delete;
mod error;
mod facet_reader;
mod multivalued;
mod reader;
mod serializer;
mod writer;
/// Trait for types that are allowed for fast fields (u64 or i64).
pub trait FastValue: Default + Clone + Copy {
@@ -121,31 +123,27 @@ fn value_to_u64(value: &Value) -> u64 {
#[cfg(test)]
mod tests {
use super::*;
use common::CompositeFile;
use directory::{Directory, RAMDirectory, WritePtr};
use fastfield::FastFieldReader;
use rand::Rng;
use rand::SeedableRng;
use rand::XorShiftRng;
use schema::{Schema, SchemaBuilder};
use schema::Document;
use schema::FAST;
use schema::Field;
use schema::FAST;
use schema::{Schema, SchemaBuilder};
use std::collections::HashMap;
use std::path::Path;
use super::*;
use test;
use test::Bencher;
lazy_static! {
static ref SCHEMA: Schema = {
pub static ref SCHEMA: Schema = {
let mut schema_builder = SchemaBuilder::default();
schema_builder.add_u64_field("field", FAST);
schema_builder.build()
};
static ref FIELD: Field = {
SCHEMA.get_field("field").unwrap()
};
pub static ref FIELD: Field = { SCHEMA.get_field("field").unwrap() };
}
#[test]
@@ -369,7 +367,7 @@ mod tests {
}
}
fn generate_permutation() -> Vec<u64> {
pub fn generate_permutation() -> Vec<u64> {
let seed: &[u32; 4] = &[1, 2, 3, 4];
let mut rng = XorShiftRng::from_seed(*seed);
let mut permutation: Vec<u64> = (0u64..1_000_000u64).collect();
@@ -409,13 +407,27 @@ mod tests {
}
}
}
#[cfg(all(test, feature = "unstable"))]
mod bench {
use super::tests::FIELD;
use super::tests::{generate_permutation, SCHEMA};
use super::*;
use common::CompositeFile;
use directory::{Directory, RAMDirectory, WritePtr};
use fastfield::FastFieldReader;
use std::collections::HashMap;
use std::path::Path;
use test::{self, Bencher};
#[bench]
fn bench_intfastfield_linear_veclookup(b: &mut Bencher) {
let permutation = generate_permutation();
b.iter(|| {
let n = test::black_box(7000u32);
let mut a = 0u64;
for i in Iterator::step_by(0u32..n, 7) {
for i in (0u32..n / 7).map(|v| v * 7) {
a ^= permutation[i as usize];
}
a
@@ -461,7 +473,7 @@ mod tests {
b.iter(|| {
let n = test::black_box(7000u32);
let mut a = 0u64;
for i in Iterator::step_by(0u32..n, 7) {
for i in (0u32..n / 7).map(|val| val * 7) {
a ^= fast_field_reader.get(i);
}
a
@@ -502,4 +514,5 @@ mod tests {
});
}
}
}

View File

@@ -1,15 +1,15 @@
mod writer;
mod reader;
mod writer;
pub use self::writer::MultiValueIntFastFieldWriter;
pub use self::reader::MultiValueIntFastFieldReader;
pub use self::writer::MultiValueIntFastFieldWriter;
#[cfg(test)]
mod tests {
use schema::SchemaBuilder;
use schema::Cardinality;
use schema::IntOptions;
use schema::SchemaBuilder;
use Index;
#[test]

View File

@@ -1,5 +1,5 @@
use DocId;
use fastfield::{FastFieldReader, FastValue};
use DocId;
/// Reader for a multivalued `u64` fast field.
///
@@ -26,13 +26,20 @@ impl<Item: FastValue> MultiValueIntFastFieldReader<Item> {
}
}
/// Returns `(start, stop)`, such that the values associated
/// to the given document are `start..stop`.
fn range(&self, doc: DocId) -> (u64, u64) {
let start = self.idx_reader.get(doc);
let stop = self.idx_reader.get(doc + 1);
(start, stop)
}
/// Returns the array of values associated to the given `doc`.
pub fn get_vals(&self, doc: DocId, vals: &mut Vec<Item>) {
let start = self.idx_reader.get(doc) as u32;
let stop = self.idx_reader.get(doc + 1) as u32;
let (start, stop) = self.range(doc);
let len = (stop - start) as usize;
vals.resize(len, Item::default());
self.vals_reader.get_range(start, &mut vals[..]);
self.vals_reader.get_range(start as u32, &mut vals[..]);
}
}
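The reader is backed by the same offset-index trick as the writers above: `idx_reader` holds one offset per document plus a trailing sentinel, and `vals_reader` holds the flattened values. A toy sketch of the lookup, with plain slices standing in for the two fast field readers:

// `idx` plays the role of idx_reader, `vals` of vals_reader.
fn get_vals(idx: &[u64], vals: &[u64], doc: usize, out: &mut Vec<u64>) {
    let (start, stop) = (idx[doc] as usize, idx[doc + 1] as usize); // cf. range(doc)
    out.clear();
    out.extend_from_slice(&vals[start..stop]);
}

fn main() {
    let idx = [0u64, 2, 2, 5]; // doc 0 has 2 values, doc 1 has 0, doc 2 has 3
    let vals = [10u64, 11, 20, 21, 22];
    let mut buf = Vec::new(); // reusable buffer, mirroring the `&mut Vec<Item>` API
    get_vals(&idx, &vals, 2, &mut buf);
    assert_eq!(buf, vec![20, 21, 22]);
}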

View File

@@ -1,12 +1,35 @@
use fastfield::FastFieldSerializer;
use fastfield::serializer::FastSingleFieldSerializer;
use fastfield::value_to_u64;
use std::collections::HashMap;
use fastfield::FastFieldSerializer;
use itertools::Itertools;
use postings::UnorderedTermId;
use schema::{Document, Field};
use std::collections::HashMap;
use std::io;
use itertools::Itertools;
use termdict::TermOrdinal;
use DocId;
/// Writer for multi-valued (as in, more than one value per document)
/// int fast field.
///
/// This `Writer` is only useful for advanced users.
/// The normal way to get your multivalued int into your index
/// is to
/// - declare your field with fast set to `Cardinality::MultiValues`
/// in your schema
/// - add your document simply by calling `.add_document(...)`.
///
/// The `MultiValueIntFastFieldWriter` can be acquired from the
/// fastfield writer, by calling [`.get_multivalue_writer(...)`](./struct.FastFieldsWriter.html#method.get_multivalue_writer).
///
/// Once acquired, writing is done by calling
/// `.add_document_vals(&[u64])` once per document.
///
/// The serializer makes it possible to remap all of the values
/// that were pushed to the writer using a mapping.
/// This makes it possible to push unordered term ids
/// during indexing and remap them to their respective
/// term ordinals when the segment is serialized.
pub struct MultiValueIntFastFieldWriter {
field: Field,
vals: Vec<u64>,
@@ -16,7 +39,7 @@ pub struct MultiValueIntFastFieldWriter {
impl MultiValueIntFastFieldWriter {
/// Creates a new `IntFastFieldWriter`
pub fn new(field: Field, is_facet: bool) -> Self {
pub(crate) fn new(field: Field, is_facet: bool) -> Self {
MultiValueIntFastFieldWriter {
field,
vals: Vec::new(),
@@ -25,24 +48,26 @@ impl MultiValueIntFastFieldWriter {
}
}
/// Access the field associated to the `MultiValueIntFastFieldWriter`
pub fn field(&self) -> Field {
self.field
}
pub fn next_doc(&mut self) {
/// Finalize the current document.
pub(crate) fn next_doc(&mut self) {
self.doc_index.push(self.vals.len() as u64);
}
/// Records a new value.
///
/// The n-th value being recorded is implicitly
/// associated to the document with the `DocId` n.
/// (Well, `n-1` actually because of 0-indexing)
pub fn add_val(&mut self, val: UnorderedTermId) {
/// Pushes a new value to the current document.
pub(crate) fn add_val(&mut self, val: UnorderedTermId) {
self.vals.push(val);
}
/// Shifts to the next document and adds
/// all of the matching field values present in the document.
pub fn add_document(&mut self, doc: &Document) {
self.next_doc();
// facets are indexed in the `SegmentWriter` as we encode their unordered id.
if !self.is_facet {
for field_value in doc.field_values() {
if field_value.field() == self.field {
@@ -52,6 +77,17 @@ impl MultiValueIntFastFieldWriter {
}
}
/// Register all of the values associated to a document.
///
/// The method returns the `DocId` of the document that was
/// just written.
pub fn add_document_vals(&mut self, vals: &[UnorderedTermId]) -> DocId {
let doc = self.doc_index.len() as DocId;
self.next_doc();
self.vals.extend_from_slice(vals);
doc
}
/// Serializes fast field values by pushing them to the `FastFieldSerializer`.
///
/// HashMap makes it possible to remap them before serializing.
@@ -66,7 +102,7 @@ impl MultiValueIntFastFieldWriter {
pub fn serialize(
&self,
serializer: &mut FastFieldSerializer,
mapping_opt: Option<&HashMap<UnorderedTermId, usize>>,
mapping_opt: Option<&HashMap<UnorderedTermId, TermOrdinal>>,
) -> io::Result<()> {
{
// writing the offset index
@@ -90,13 +126,13 @@ impl MultiValueIntFastFieldWriter {
1,
)?;
for val in &self.vals {
let remapped_val = *mapping.get(val).expect("Missing term ordinal") as u64;
let remapped_val = *mapping.get(val).expect("Missing term ordinal");
value_serializer.add_val(remapped_val)?;
}
}
None => {
let val_min_max = self.vals.iter().cloned().minmax();
let (val_min, val_max) = val_min_max.into_option().unwrap_or((0u64, 0));
let (val_min, val_max) = val_min_max.into_option().unwrap_or((0u64, 0u64));
value_serializer =
serializer.new_u64_fast_field_with_idx(self.field, val_min, val_max, 1)?;
for &val in &self.vals {

View File

@@ -1,19 +1,19 @@
use common::BinarySerializable;
use super::FastValue;
use common::bitpacker::BitUnpacker;
use common::CompositeFile;
use common::compute_num_bits;
use directory::{Directory, RAMDirectory, WritePtr};
use common::BinarySerializable;
use common::CompositeFile;
use directory::ReadOnlySource;
use DocId;
use directory::{Directory, RAMDirectory, WritePtr};
use fastfield::{FastFieldSerializer, FastFieldsWriter};
use owning_ref::OwningRef;
use schema::FAST;
use schema::SchemaBuilder;
use schema::FAST;
use std::collections::HashMap;
use std::marker::PhantomData;
use std::mem;
use std::path::Path;
use super::FastValue;
use DocId;
/// Trait for accessing a fastfield.
///
@@ -67,12 +67,20 @@ impl<Item: FastValue> FastFieldReader<Item> {
/// associated with the `DocId` going from
/// `start` to `start + output.len()`.
///
/// Regardless of the type of `Item`, this method works by
/// - transmuting the output array
/// - extracting the `Item`s as if they were `u64`
/// - possibly converting the `u64` value to the right type.
///
/// # Panics
///
/// May panic if `start + output.len()` is greater than
/// the segment's `maxdoc`.
///
// TODO change start to `u64`.
// For multifastfield, start is an index in a second fastfield, not a `DocId`
pub fn get_range(&self, start: u32, output: &mut [Item]) {
let output_u64: &mut [u64] = unsafe { mem::transmute(output) };
let output_u64: &mut [u64] = unsafe { mem::transmute(output) }; // ok: Item is either `u64` or `i64`
self.bit_unpacker.get_range(start, output_u64);
for out in output_u64.iter_mut() {
*out = Item::from_u64(*out + self.min_value_u64).as_u64();
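Decoding mirrors the `(val - min_value)` encoding done at serialization time: the bit-unpacked deltas get `min_value` added back, and `Item::from_u64` reinterprets the result (`i64` fields ride on the crate's `i64_to_u64` mapping). A sketch of that decode step, with a plain slice standing in for the bit unpacker:

// `packed_deltas` stands in for the bitpacked data behind BitUnpacker.
fn get_range(packed_deltas: &[u64], min_value: u64, start: usize, output: &mut [u64]) {
    for (i, out) in output.iter_mut().enumerate() {
        *out = packed_deltas[start + i] + min_value; // cf. Item::from_u64(delta + min)
    }
}

fn main() {
    let packed = [0u64, 3, 1]; // stored deltas relative to min_value
    let mut out = [0u64; 3];
    get_range(&packed, 100, 0, &mut out);
    assert_eq!(out, [100, 103, 101]);
}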

View File

@@ -1,10 +1,10 @@
use common::BinarySerializable;
use directory::WritePtr;
use schema::Field;
use common::bitpacker::BitPacker;
use common::compute_num_bits;
use common::CountingWriter;
use common::BinarySerializable;
use common::CompositeWrite;
use common::CountingWriter;
use directory::WritePtr;
use schema::Field;
use std::io::{self, Write};
/// `FastFieldSerializer` is in charge of serializing
@@ -61,6 +61,16 @@ impl FastFieldSerializer {
FastSingleFieldSerializer::open(field_write, min_value, max_value)
}
/// Start serializing a new [u8] fast field
pub fn new_bytes_fast_field_with_idx(
&mut self,
field: Field,
idx: usize,
) -> io::Result<FastBytesFieldSerializer<CountingWriter<WritePtr>>> {
let field_write = self.composite_write.for_field_with_idx(field, idx);
FastBytesFieldSerializer::open(field_write)
}
/// Closes the serializer
///
/// After this call, the data must be persistently saved on disk.
@@ -77,11 +87,20 @@ pub struct FastSingleFieldSerializer<'a, W: Write + 'a> {
}
impl<'a, W: Write> FastSingleFieldSerializer<'a, W> {
/// Creates a new fast field serializer.
///
/// The serializer in fact encodes the values by bitpacking
/// `(val - min_value)`.
///
/// It requires a `min_value` and a `max_value` to
/// compute the minimum number of bits required to encode
/// the values.
fn open(
write: &'a mut W,
min_value: u64,
max_value: u64,
) -> io::Result<FastSingleFieldSerializer<'a, W>> {
assert!(min_value <= max_value);
min_value.serialize(write)?;
let amplitude = max_value - min_value;
amplitude.serialize(write)?;
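The header written here (`min_value`, then the amplitude `max_value - min_value`) is all a reader needs to size its bit unpacker: each value is stored as `val - min_value` in just enough bits to cover the amplitude. The sketch below models that bit-width computation; tantivy has its own `compute_num_bits` in `common`, so this body is only a plausible stand-in.

fn compute_num_bits(amplitude: u64) -> u8 {
    (64 - amplitude.leading_zeros()) as u8 // bits needed to represent `amplitude`
}

fn main() {
    let (min_value, max_value) = (100u64, 107u64);
    let amplitude = max_value - min_value;
    assert_eq!(compute_num_bits(amplitude), 3); // values packed in 3 bits each
    assert_eq!(compute_num_bits(0), 0);         // a constant column costs 0 bits per value
}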
@@ -107,3 +126,21 @@ impl<'a, W: Write> FastSingleFieldSerializer<'a, W> {
self.bit_packer.close(&mut self.write)
}
}
pub struct FastBytesFieldSerializer<'a, W: Write + 'a> {
write: &'a mut W,
}
impl<'a, W: Write> FastBytesFieldSerializer<'a, W> {
fn open(write: &'a mut W) -> io::Result<FastBytesFieldSerializer<'a, W>> {
Ok(FastBytesFieldSerializer { write })
}
pub fn write_all(&mut self, vals: &[u8]) -> io::Result<()> {
self.write.write_all(vals)
}
pub fn flush(&mut self) -> io::Result<()> {
self.write.flush()
}
}

View File

@@ -1,18 +1,19 @@
use schema::{Cardinality, Document, Field, Schema};
use fastfield::FastFieldSerializer;
use std::io;
use schema::FieldType;
use common;
use common::VInt;
use std::collections::HashMap;
use postings::UnorderedTermId;
use super::multivalued::MultiValueIntFastFieldWriter;
use common;
use common::BinarySerializable;
use common::VInt;
use fastfield::{BytesFastFieldWriter, FastFieldSerializer};
use postings::UnorderedTermId;
use schema::{Cardinality, Document, Field, FieldType, Schema};
use std::collections::HashMap;
use std::io;
use termdict::TermOrdinal;
/// The `FastFieldsWriter` groups together all of the fast field writers.
pub struct FastFieldsWriter {
single_value_writers: Vec<IntFastFieldWriter>,
multi_values_writers: Vec<MultiValueIntFastFieldWriter>,
bytes_value_writers: Vec<BytesFastFieldWriter>,
}
impl FastFieldsWriter {
@@ -20,6 +21,7 @@ impl FastFieldsWriter {
pub fn from_schema(schema: &Schema) -> FastFieldsWriter {
let mut single_value_writers = Vec::new();
let mut multi_values_writers = Vec::new();
let mut bytes_value_writers = Vec::new();
for (field_id, field_entry) in schema.fields().iter().enumerate() {
let field = Field(field_id as u32);
@@ -47,12 +49,17 @@ impl FastFieldsWriter {
let fast_field_writer = MultiValueIntFastFieldWriter::new(field, true);
multi_values_writers.push(fast_field_writer);
}
FieldType::Bytes => {
let fast_field_writer = BytesFastFieldWriter::new(field);
bytes_value_writers.push(fast_field_writer);
}
_ => {}
}
}
FastFieldsWriter {
single_value_writers,
multi_values_writers,
bytes_value_writers,
}
}
@@ -68,24 +75,36 @@ impl FastFieldsWriter {
///
/// Returns None if the field does not exist, or is not
/// configured as a multivalued fastfield in the schema.
pub(crate) fn get_multivalue_writer(
pub fn get_multivalue_writer(
&mut self,
field: Field,
) -> Option<&mut MultiValueIntFastFieldWriter> {
// TODO optimize
// TODO expose for users
self.multi_values_writers
.iter_mut()
.find(|multivalue_writer| multivalue_writer.field() == field)
}
/// Returns the bytes fast field writer for the given field.
///
/// Returns None if the field does not exist, or is not
/// configured as a bytes fastfield in the schema.
pub fn get_bytes_writer(&mut self, field: Field) -> Option<&mut BytesFastFieldWriter> {
// TODO optimize
self.bytes_value_writers
.iter_mut()
.find(|field_writer| field_writer.field() == field)
}
/// Indexes all of the fastfields of a new document.
pub fn add_document(&mut self, doc: &Document) {
for field_writer in &mut self.single_value_writers {
field_writer.add_document(doc);
}
for field_writer in &mut self.multi_values_writers {
field_writer.next_doc();
field_writer.add_document(doc);
}
for field_writer in &mut self.bytes_value_writers {
field_writer.add_document(doc);
}
}
@@ -95,7 +114,7 @@ impl FastFieldsWriter {
pub fn serialize(
&self,
serializer: &mut FastFieldSerializer,
mapping: &HashMap<Field, HashMap<UnorderedTermId, usize>>,
mapping: &HashMap<Field, HashMap<UnorderedTermId, TermOrdinal>>,
) -> io::Result<()> {
for field_writer in &self.single_value_writers {
field_writer.serialize(serializer)?;
@@ -104,6 +123,9 @@ impl FastFieldsWriter {
let field = field_writer.field();
field_writer.serialize(serializer, mapping.get(&field))?;
}
for field_writer in &self.bytes_value_writers {
field_writer.serialize(serializer)?;
}
Ok(())
}
}

View File

@@ -1,10 +1,8 @@
#[inline(always)]
pub fn id_to_fieldnorm(id: u8) -> u32 {
FIELD_NORMS_TABLE[id as usize]
}
#[inline(always)]
pub fn fieldnorm_to_id(fieldnorm: u32) -> u8 {
FIELD_NORMS_TABLE
@@ -12,45 +10,34 @@ pub fn fieldnorm_to_id(fieldnorm: u32) -> u8 {
.unwrap_or_else(|idx| idx - 1) as u8
}
pub const FIELD_NORMS_TABLE: [u32; 256] = [
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
32, 33, 34, 35, 36, 37, 38, 39, 40, 42, 44, 46, 48, 50, 52, 54,
56, 60, 64, 68, 72, 76, 80, 84, 88, 96, 104, 112, 120, 128, 136, 144,
152, 168, 184, 200, 216, 232, 248, 264, 280, 312, 344, 376, 408, 440, 472, 504,
536, 600, 664, 728, 792, 856, 920, 984,
1048, 1176, 1304, 1432, 1560, 1688, 1816, 1944,
2072, 2328, 2584, 2840, 3096, 3352, 3608, 3864, 4120,
4632, 5144, 5656, 6168, 6680, 7192, 7704, 8216, 9240,
10264, 11288, 12312, 13336, 14360, 15384,
16408, 18456, 20504, 22552, 24600, 26648, 28696, 30744,
32792, 36888, 40984, 45080, 49176, 53272, 57368, 61464,
65560, 73752, 81944, 90136, 98328, 106520, 114712, 122904, 131096, 147480,
163864, 180248, 196632, 213016, 229400, 245784, 262168,
294936, 327704, 360472, 393240, 426008, 458776,
491544, 524312, 589848, 655384, 720920, 786456, 851992, 917528,
983064, 1048600, 1179672, 1310744, 1441816, 1572888, 1703960, 1835032,
1966104, 2097176, 2359320, 2621464, 2883608, 3145752, 3407896, 3670040, 3932184,
4194328, 4718616, 5242904, 5767192, 6291480, 6815768, 7340056, 7864344, 8388632, 9437208,
10485784, 11534360, 12582936, 13631512, 14680088, 15728664, 16777240, 18874392, 20971544,
23068696, 25165848, 27263000, 29360152, 31457304, 33554456, 37748760, 41943064,
46137368, 50331672, 54525976, 58720280, 62914584, 67108888, 75497496, 83886104,
92274712, 100663320, 109051928, 117440536, 125829144, 134217752, 150994968, 167772184,
184549400, 201326616, 218103832, 234881048, 251658264, 268435480, 301989912, 335544344,
369098776, 402653208, 436207640, 469762072, 503316504, 536870936, 603979800, 671088664,
738197528, 805306392, 872415256, 939524120, 1006632984, 1073741848, 1207959576, 1342177304,
1476395032, 1610612760, 1744830488, 1879048216, 2013265944
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25,
26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 42, 44, 46, 48, 50, 52, 54, 56, 60,
64, 68, 72, 76, 80, 84, 88, 96, 104, 112, 120, 128, 136, 144, 152, 168, 184, 200, 216, 232,
248, 264, 280, 312, 344, 376, 408, 440, 472, 504, 536, 600, 664, 728, 792, 856, 920, 984, 1048,
1176, 1304, 1432, 1560, 1688, 1816, 1944, 2072, 2328, 2584, 2840, 3096, 3352, 3608, 3864, 4120,
4632, 5144, 5656, 6168, 6680, 7192, 7704, 8216, 9240, 10264, 11288, 12312, 13336, 14360, 15384,
16408, 18456, 20504, 22552, 24600, 26648, 28696, 30744, 32792, 36888, 40984, 45080, 49176,
53272, 57368, 61464, 65560, 73752, 81944, 90136, 98328, 106520, 114712, 122904, 131096, 147480,
163864, 180248, 196632, 213016, 229400, 245784, 262168, 294936, 327704, 360472, 393240, 426008,
458776, 491544, 524312, 589848, 655384, 720920, 786456, 851992, 917528, 983064, 1048600,
1179672, 1310744, 1441816, 1572888, 1703960, 1835032, 1966104, 2097176, 2359320, 2621464,
2883608, 3145752, 3407896, 3670040, 3932184, 4194328, 4718616, 5242904, 5767192, 6291480,
6815768, 7340056, 7864344, 8388632, 9437208, 10485784, 11534360, 12582936, 13631512, 14680088,
15728664, 16777240, 18874392, 20971544, 23068696, 25165848, 27263000, 29360152, 31457304,
33554456, 37748760, 41943064, 46137368, 50331672, 54525976, 58720280, 62914584, 67108888,
75497496, 83886104, 92274712, 100663320, 109051928, 117440536, 125829144, 134217752, 150994968,
167772184, 184549400, 201326616, 218103832, 234881048, 251658264, 268435480, 301989912,
335544344, 369098776, 402653208, 436207640, 469762072, 503316504, 536870936, 603979800,
671088664, 738197528, 805306392, 872415256, 939524120, 1006632984, 1073741848, 1207959576,
1342177304, 1476395032, 1610612760, 1744830488, 1879048216, 2013265944,
];
#[cfg(test)]
mod tests {
use super::{fieldnorm_to_id, id_to_fieldnorm, FIELD_NORMS_TABLE};
#[test]
fn test_decode_code() {
assert_eq!(fieldnorm_to_id(0), 0);
@@ -103,4 +90,4 @@ mod tests {
assert_eq!(FIELD_NORMS_TABLE[i], decode_fieldnorm_byte(i as u8));
}
}
}
}
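This codec is deliberately lossy: decoding is a straight table lookup, while encoding binary-searches the sorted table and, on a miss, rounds down to the nearest representable fieldnorm via `unwrap_or_else(|idx| idx - 1)`. A miniature version with an 8-entry stand-in table shows the round-trip behavior (the real table has 256 entries):

const TABLE: [u32; 8] = [0, 1, 2, 3, 4, 6, 8, 12]; // tiny stand-in for FIELD_NORMS_TABLE

fn id_to_fieldnorm(id: u8) -> u32 {
    TABLE[id as usize]
}

fn fieldnorm_to_id(fieldnorm: u32) -> u8 {
    TABLE
        .binary_search(&fieldnorm)
        .unwrap_or_else(|idx| idx - 1) as u8
}

fn main() {
    assert_eq!(fieldnorm_to_id(4), 4);                  // exact hit
    assert_eq!(fieldnorm_to_id(5), 4);                  // miss: rounds down to 4
    assert_eq!(id_to_fieldnorm(fieldnorm_to_id(7)), 6); // round-trip loses precision
}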

View File

@@ -17,13 +17,12 @@
//!
//! This trick is used by the [BM25 similarity]().
mod code;
mod reader;
mod serializer;
mod writer;
mod reader;
pub use self::reader::FieldNormReader;
pub use self::writer::FieldNormsWriter;
pub use self::serializer::FieldNormsSerializer;
pub use self::writer::FieldNormsWriter;
use self::code::{fieldnorm_to_id, id_to_fieldnorm};

View File

@@ -1,8 +1,7 @@
use super::{id_to_fieldnorm, fieldnorm_to_id};
use super::{fieldnorm_to_id, id_to_fieldnorm};
use directory::ReadOnlySource;
use DocId;
/// Reads the fieldnorm associated to a document.
/// The fieldnorm represents the length associated to
/// a given Field of a given document.
@@ -21,16 +20,13 @@ use DocId;
/// precompute computationally expensive functions of the fieldnorm
/// in a very short array.
pub struct FieldNormReader {
data: ReadOnlySource
data: ReadOnlySource,
}
impl FieldNormReader {
/// Opens a field norm reader given its data source.
pub fn open(data: ReadOnlySource) -> Self {
FieldNormReader {
data
}
FieldNormReader { data }
}
/// Returns the `fieldnorm` associated to a doc id.
@@ -71,12 +67,13 @@ impl FieldNormReader {
#[cfg(test)]
impl From<Vec<u32>> for FieldNormReader {
fn from(field_norms: Vec<u32>) -> FieldNormReader {
let field_norms_id = field_norms.into_iter()
let field_norms_id = field_norms
.into_iter()
.map(FieldNormReader::fieldnorm_to_id)
.collect::<Vec<u8>>();
let field_norms_data = ReadOnlySource::from(field_norms_id);
FieldNormReader {
data: field_norms_data
data: field_norms_data,
}
}
}
}

View File

@@ -1,26 +1,24 @@
use directory::WritePtr;
use std::io;
use common::CompositeWrite;
use directory::WritePtr;
use schema::Field;
use std::io;
use std::io::Write;
/// The fieldnorms serializer is in charge of
/// the serialization of field norms for all fields.
pub struct FieldNormsSerializer {
composite_write: CompositeWrite,
}
impl FieldNormsSerializer {
/// Constructor
pub fn from_write(write: WritePtr) -> io::Result<FieldNormsSerializer> {
// just making room for the pointer to the header.
let composite_write = CompositeWrite::wrap(write);
Ok(FieldNormsSerializer {
composite_write
})
Ok(FieldNormsSerializer { composite_write })
}
/// Serialize the given field
pub fn serialize_field(&mut self, field: Field, fieldnorms_data: &[u8]) -> io::Result<()> {
let write = self.composite_write.for_field(field);
write.write_all(fieldnorms_data)?;
@@ -28,10 +26,9 @@ impl FieldNormsSerializer {
Ok(())
}
/// Clean up / flush / close
pub fn close(self) -> io::Result<()> {
self.composite_write.close()?;
Ok(())
}
}

View File

@@ -1,30 +1,36 @@
use DocId;
use schema::Field;
use super::FieldNormsSerializer;
use std::io;
use schema::Schema;
use super::fieldnorm_to_id;
use super::FieldNormsSerializer;
use schema::Field;
use schema::Schema;
use std::io;
/// The `FieldNormsWriter` is in charge of tracking the fieldnorm byte
/// of each document for each field with field norms.
///
/// `FieldNormsWriter` stores a Vec<u8> for each tracked field, using a
/// byte per document per field.
pub struct FieldNormsWriter {
fields: Vec<Field>,
fieldnorms_buffer: Vec<Vec<u8>>
fieldnorms_buffer: Vec<Vec<u8>>,
}
impl FieldNormsWriter {
pub fn fields_with_fieldnorm(schema: &Schema) -> Vec<Field> {
/// Returns the fields that should have field norms computed
/// according to the given schema.
pub(crate) fn fields_with_fieldnorm(schema: &Schema) -> Vec<Field> {
schema
.fields()
.iter()
.enumerate()
.filter(|&(_, field_entry)| {
field_entry.is_indexed()
})
.filter(|&(_, field_entry)| field_entry.is_indexed())
.map(|(field, _)| Field(field as u32))
.collect::<Vec<Field>>()
}
/// Initialize with state for tracking the field norm fields
/// specified in the schema.
pub fn for_schema(schema: &Schema) -> FieldNormsWriter {
let fields = FieldNormsWriter::fields_with_fieldnorm(schema);
let max_field = fields
@@ -35,26 +41,40 @@ impl FieldNormsWriter {
.unwrap_or(0);
FieldNormsWriter {
fields,
fieldnorms_buffer: (0..max_field)
.map(|_| Vec::new())
.collect::<Vec<_>>()
fieldnorms_buffer: (0..max_field).map(|_| Vec::new()).collect::<Vec<_>>(),
}
}
/// Ensure that all documents in 0..max_doc have a byte associated with them
/// in each of the fieldnorm vectors.
///
/// Will extend with 0-bytes for documents that have not been seen.
pub fn fill_up_to_max_doc(&mut self, max_doc: DocId) {
for &field in self.fields.iter() {
self.fieldnorms_buffer[field.0 as usize].resize(max_doc as usize, 0u8);
}
}
/// Set the fieldnorm byte for the given document for the given field.
///
/// Will internally convert the u32 `fieldnorm` value to the appropriate byte
/// to approximate the field norm in less space.
///
/// * doc - the document id
/// * field - the field being set
/// * fieldnorm - the number of terms present in document `doc` in field `field`
pub fn record(&mut self, doc: DocId, field: Field, fieldnorm: u32) {
let fieldnorm_buffer: &mut Vec<u8> = &mut self.fieldnorms_buffer[field.0 as usize];
assert!(fieldnorm_buffer.len() <= doc as usize, "Cannot register a given fieldnorm twice");
assert!(
fieldnorm_buffer.len() <= doc as usize,
"Cannot register a given fieldnorm twice"
);
// we fill intermediate `DocId`s as having a fieldnorm of 0.
fieldnorm_buffer.resize(doc as usize + 1, 0u8);
fieldnorm_buffer[doc as usize] = fieldnorm_to_id(fieldnorm);
}
/// Serialize the seen fieldnorm values to the serializer for all fields.
pub fn serialize(&self, fieldnorms_serializer: &mut FieldNormsSerializer) -> io::Result<()> {
for &field in self.fields.iter() {
let fieldnorm_values: &[u8] = &self.fieldnorms_buffer[field.0 as usize][..];
@@ -62,4 +82,4 @@ impl FieldNormsWriter {
}
Ok(())
}
}
}

View File

@@ -1,10 +1,10 @@
use std::collections::HashSet;
use rand::thread_rng;
use std::collections::HashSet;
use rand::distributions::{IndependentSample, Range};
use schema::*;
use Index;
use Searcher;
use rand::distributions::{IndependentSample, Range};
fn check_index_content(searcher: &Searcher, vals: &HashSet<u64>) {
assert!(searcher.segment_readers().len() < 20);
@@ -13,7 +13,7 @@ fn check_index_content(searcher: &Searcher, vals: &HashSet<u64>) {
#[test]
#[ignore]
#[cfg(feature="mmap")]
#[cfg(feature = "mmap")]
fn test_indexing() {
let mut schema_builder = SchemaBuilder::default();

View File

@@ -1,7 +1,7 @@
use super::operation::DeleteOperation;
use std::sync::{Arc, RwLock};
use std::mem;
use std::ops::DerefMut;
use std::sync::{Arc, RwLock};
// The DeleteQueue is conceptually similar to a
// multiple-consumer, single-producer broadcast channel.

View File

@@ -1,6 +1,6 @@
use Directory;
use directory::error::OpenWriteError;
use core::LOCKFILE_FILEPATH;
use directory::error::OpenWriteError;
use Directory;
/// The directory lock is a mechanism used to
/// prevent the creation of two [`IndexWriter`](struct.IndexWriter.html)

View File

@@ -1,3 +1,6 @@
use super::operation::AddOperation;
use super::segment_updater::SegmentUpdater;
use super::PreparedCommit;
use bit_set::BitSet;
use chan;
use core::Index;
@@ -6,32 +9,28 @@ use core::SegmentComponent;
use core::SegmentId;
use core::SegmentMeta;
use core::SegmentReader;
use indexer::stamper::Stamper;
use datastruct::stacker::hashmap::split_memory;
use datastruct::stacker::Heap;
use directory::FileProtection;
use docset::DocSet;
use error::{Error, ErrorKind, Result, ResultExt};
use fastfield::write_delete_bitset;
use futures::sync::oneshot::Receiver;
use indexer::delete_queue::{DeleteCursor, DeleteQueue};
use futures::Canceled;
use datastruct::stacker::hashmap::split_memory;
use futures::Future;
use indexer::doc_opstamp_mapping::DocToOpstampMapping;
use indexer::MergePolicy;
use indexer::operation::DeleteOperation;
use indexer::stamper::Stamper;
use indexer::DirectoryLock;
use indexer::MergePolicy;
use indexer::SegmentEntry;
use indexer::SegmentWriter;
use docset::DocSet;
use schema::IndexRecordOption;
use schema::Document;
use schema::IndexRecordOption;
use schema::Term;
use std::mem;
use std::mem::swap;
use std::thread::JoinHandle;
use indexer::DirectoryLock;
use super::operation::AddOperation;
use super::segment_updater::SegmentUpdater;
use super::PreparedCommit;
use std::thread;
use std::thread::JoinHandle;
// Size of the margin for the heap. A segment is closed when the remaining memory
// in the heap goes below MARGIN_IN_BYTES.
@@ -81,10 +80,6 @@ pub struct IndexWriter {
committed_opstamp: u64,
}
// IndexWriter cannot be sent to another thread.
impl !Send for IndexWriter {}
impl !Sync for IndexWriter {}
/// Open a new index writer. Attempts to acquire a lockfile.
///
/// The lockfile should be deleted on drop, but it is possible
@@ -200,7 +195,6 @@ pub fn advance_deletes(
target_opstamp: u64,
) -> Result<Option<FileProtection>> {
let mut file_protect: Option<FileProtection> = None;
{
if let Some(previous_opstamp) = segment_entry.meta().delete_opstamp() {
// We are already up-to-date here.
@@ -241,7 +235,6 @@ pub fn advance_deletes(
}
}
segment_entry.set_meta(segment.meta().clone());
Ok(file_protect)
}
@@ -448,10 +441,7 @@ impl IndexWriter {
}
/// Merges a given list of segments
pub fn merge(
&mut self,
segment_ids: &[SegmentId],
) -> impl Future<Item = SegmentMeta, Error = Canceled> {
pub fn merge(&mut self, segment_ids: &[SegmentId]) -> Receiver<SegmentMeta> {
self.segment_updater.start_merge(segment_ids)
}
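Returning the concrete `Receiver<SegmentMeta>` rather than `impl Future` (which required a nightly feature, dropped from lib.rs later in this changeset) costs callers nothing, since `Receiver` implements `Future` in futures 0.1. A hedged call-site sketch, assuming only the `merge` signature shown above:

extern crate futures;
extern crate tantivy;

use futures::Future; // futures 0.1: `.wait()` comes from the Future trait
use tantivy::{IndexWriter, SegmentId, SegmentMeta};

// Block until the merge completes; Err(Canceled) means the
// merging thread dropped the sending half.
fn merge_and_wait(writer: &mut IndexWriter, ids: &[SegmentId]) -> Option<SegmentMeta> {
    writer.merge(ids).wait().ok()
}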
@@ -647,12 +637,12 @@ impl IndexWriter {
#[cfg(test)]
mod tests {
use env_logger;
use error::*;
use indexer::NoMergePolicy;
use schema::{self, Document};
use Index;
use Term;
use error::*;
use env_logger;
#[test]
fn test_lockfile_stops_duplicates() {
@@ -675,7 +665,7 @@ mod tests {
"LogMergePolicy { min_merge_size: 8, min_layer_size: 10000, \
level_log_size: 0.75 }"
);
let merge_policy = box NoMergePolicy::default();
let merge_policy = Box::new(NoMergePolicy::default());
index_writer.set_merge_policy(merge_policy);
assert_eq!(
format!("{:?}", index_writer.get_merge_policy()),

View File

@@ -82,7 +82,7 @@ impl MergePolicy for LogMergePolicy {
}
fn box_clone(&self) -> Box<MergePolicy> {
box self.clone()
Box::new(self.clone())
}
}
@@ -99,8 +99,8 @@ impl Default for LogMergePolicy {
#[cfg(test)]
mod tests {
use super::*;
use indexer::merge_policy::MergePolicy;
use core::{SegmentId, SegmentMeta};
use indexer::merge_policy::MergePolicy;
fn test_merge_policy() -> LogMergePolicy {
let mut log_merge_policy = LogMergePolicy::default();

View File

@@ -1,7 +1,7 @@
use core::SegmentId;
use core::SegmentMeta;
use std::marker;
use std::fmt::Debug;
use std::marker;
/// Set of segment suggested for a merge.
#[derive(Debug, Clone)]
@@ -37,7 +37,7 @@ impl MergePolicy for NoMergePolicy {
}
fn box_clone(&self) -> Box<MergePolicy> {
box NoMergePolicy
Box::new(NoMergePolicy)
}
}
@@ -69,7 +69,7 @@ pub mod tests {
}
fn box_clone(&self) -> Box<MergePolicy> {
box MergeWheneverPossible
Box::new(MergeWheneverPossible)
}
}
}

File diff suppressed because it is too large

View File

@@ -1,29 +1,29 @@
pub mod index_writer;
pub mod segment_serializer;
pub mod merger;
pub mod merge_policy;
mod log_merge_policy;
mod segment_register;
mod segment_writer;
mod segment_manager;
pub mod delete_queue;
pub mod segment_updater;
mod directory_lock;
mod segment_entry;
mod doc_opstamp_mapping;
pub mod index_writer;
mod log_merge_policy;
pub mod merge_policy;
pub mod merger;
pub mod operation;
mod stamper;
mod prepared_commit;
mod segment_entry;
mod segment_manager;
mod segment_register;
pub mod segment_serializer;
pub mod segment_updater;
mod segment_writer;
mod stamper;
pub use self::prepared_commit::PreparedCommit;
pub use self::segment_entry::{SegmentEntry, SegmentState};
pub use self::segment_serializer::SegmentSerializer;
pub use self::segment_writer::SegmentWriter;
pub(crate) use self::directory_lock::DirectoryLock;
pub use self::index_writer::IndexWriter;
pub use self::log_merge_policy::LogMergePolicy;
pub use self::merge_policy::{MergeCandidate, MergePolicy, NoMergePolicy};
pub use self::prepared_commit::PreparedCommit;
pub use self::segment_entry::{SegmentEntry, SegmentState};
pub use self::segment_manager::SegmentManager;
pub(crate) use self::directory_lock::DirectoryLock;
pub use self::segment_serializer::SegmentSerializer;
pub use self::segment_writer::SegmentWriter;
/// Alias for the default merge policy, which is the `LogMergePolicy`.
pub type DefaultMergePolicy = LogMergePolicy;

View File

@@ -1,5 +1,5 @@
use Result;
use super::IndexWriter;
use Result;
/// A prepared commit
pub struct PreparedCommit<'a> {
@@ -13,7 +13,7 @@ impl<'a> PreparedCommit<'a> {
PreparedCommit {
index_writer,
payload: None,
opstamp
opstamp,
}
}

View File

@@ -1,7 +1,7 @@
use core::SegmentMeta;
use bit_set::BitSet;
use indexer::delete_queue::DeleteCursor;
use core::SegmentId;
use core::SegmentMeta;
use indexer::delete_queue::DeleteCursor;
use std::fmt;
#[derive(Clone, Copy, PartialEq, Eq, Debug)]

View File

@@ -1,14 +1,14 @@
use super::segment_register::SegmentRegister;
use std::sync::RwLock;
use core::SegmentId;
use core::SegmentMeta;
use core::{LOCKFILE_FILEPATH, META_FILEPATH};
use core::SegmentId;
use indexer::SegmentEntry;
use std::path::PathBuf;
use std::collections::hash_set::HashSet;
use std::sync::{RwLockReadGuard, RwLockWriteGuard};
use std::fmt::{self, Debug, Formatter};
use indexer::delete_queue::DeleteCursor;
use indexer::SegmentEntry;
use std::collections::hash_set::HashSet;
use std::fmt::{self, Debug, Formatter};
use std::path::PathBuf;
use std::sync::RwLock;
use std::sync::{RwLockReadGuard, RwLockWriteGuard};
#[derive(Default)]
struct SegmentRegisters {

View File

@@ -1,10 +1,10 @@
use core::SegmentId;
use std::collections::HashMap;
use core::SegmentMeta;
use indexer::delete_queue::DeleteCursor;
use indexer::segment_entry::SegmentEntry;
use std::collections::HashMap;
use std::fmt;
use std::fmt::{Debug, Formatter};
use indexer::segment_entry::SegmentEntry;
use indexer::delete_queue::DeleteCursor;
/// The segment register keeps track
/// of the list of segment, their size as well
@@ -113,11 +113,11 @@ impl SegmentRegister {
#[cfg(test)]
mod tests {
use indexer::SegmentState;
use super::*;
use core::SegmentId;
use core::SegmentMeta;
use indexer::delete_queue::*;
use super::*;
use indexer::SegmentState;
fn segment_ids(segment_register: &SegmentRegister) -> Vec<SegmentId> {
segment_register

View File

@@ -3,9 +3,9 @@ use Result;
use core::Segment;
use core::SegmentComponent;
use fastfield::FastFieldSerializer;
use store::StoreWriter;
use fieldnorm::FieldNormsSerializer;
use postings::InvertedIndexSerializer;
use store::StoreWriter;
/// Segment serializer is in charge of laying out on disk
/// the data accumulated and sorted by the `SegmentWriter`.
@@ -47,7 +47,7 @@ impl SegmentSerializer {
}
/// Accessor to the field norm serializer.
pub fn get_fieldnorms_serializer(&mut self) -> &mut FieldNormsSerializer {
pub fn get_fieldnorms_serializer(&mut self) -> &mut FieldNormsSerializer {
&mut self.fieldnorms_serializer
}

View File

@@ -1,40 +1,40 @@
use super::segment_manager::{get_mergeable_segments, SegmentManager};
use core::Index;
use core::IndexMeta;
use core::META_FILEPATH;
use core::Segment;
use core::SegmentId;
use core::SegmentMeta;
use core::SerializableSegment;
use core::META_FILEPATH;
use directory::Directory;
use indexer::stamper::Stamper;
use error::{Error, ErrorKind, Result};
use futures_cpupool::CpuPool;
use futures::Future;
use futures::Canceled;
use futures::oneshot;
use directory::FileProtection;
use indexer::{DefaultMergePolicy, MergePolicy};
use error::{Error, ErrorKind, Result};
use futures::oneshot;
use futures::sync::oneshot::Receiver;
use futures::Future;
use futures_cpupool::CpuFuture;
use futures_cpupool::CpuPool;
use indexer::delete_queue::DeleteCursor;
use indexer::index_writer::advance_deletes;
use indexer::MergeCandidate;
use indexer::merger::IndexMerger;
use indexer::stamper::Stamper;
use indexer::MergeCandidate;
use indexer::SegmentEntry;
use indexer::SegmentSerializer;
use futures_cpupool::CpuFuture;
use serde_json;
use indexer::delete_queue::DeleteCursor;
use indexer::{DefaultMergePolicy, MergePolicy};
use schema::Schema;
use serde_json;
use std::borrow::BorrowMut;
use std::collections::HashMap;
use std::io::Write;
use std::mem;
use std::ops::DerefMut;
use std::sync::Arc;
use std::sync::atomic::{AtomicBool, AtomicUsize};
use std::sync::atomic::Ordering;
use std::sync::atomic::{AtomicBool, AtomicUsize};
use std::sync::Arc;
use std::sync::RwLock;
use std::thread;
use std::thread::JoinHandle;
use super::segment_manager::{get_mergeable_segments, SegmentManager};
/// Save the index meta file.
/// This operation is atomic:
@@ -171,7 +171,7 @@ impl SegmentUpdater {
pool: CpuPool::new(1),
index,
segment_manager,
merge_policy: RwLock::new(box DefaultMergePolicy::default()),
merge_policy: RwLock::new(Box::new(DefaultMergePolicy::default())),
merging_thread_id: AtomicUsize::default(),
merging_threads: RwLock::new(HashMap::new()),
generation: AtomicUsize::default(),
@@ -283,10 +283,7 @@ impl SegmentUpdater {
}).wait()
}
pub fn start_merge(
&self,
segment_ids: &[SegmentId],
) -> impl Future<Item = SegmentMeta, Error = Canceled> {
pub fn start_merge(&self, segment_ids: &[SegmentId]) -> Receiver<SegmentMeta> {
self.0.segment_manager.start_merge(segment_ids);
let segment_updater_clone = self.clone();
@@ -361,7 +358,9 @@ impl SegmentUpdater {
let committed_merge_candidates = merge_policy.compute_merge_candidates(&committed_segments);
merge_candidates.extend_from_slice(&committed_merge_candidates[..]);
for MergeCandidate(segment_metas) in merge_candidates {
self.start_merge(&segment_metas);
if let Err(e) = self.start_merge(&segment_metas).fuse().poll() {
error!("The merge task failed quickly after starting: {:?}", e);
}
}
}
@@ -480,9 +479,9 @@ impl SegmentUpdater {
#[cfg(test)]
mod tests {
use Index;
use schema::*;
use indexer::merge_policy::tests::MergeWheneverPossible;
use schema::*;
use Index;
#[test]
fn test_delete_during_merge() {
@@ -494,7 +493,7 @@ mod tests {
// writing the segment
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
index_writer.set_merge_policy(box MergeWheneverPossible);
index_writer.set_merge_policy(Box::new(MergeWheneverPossible));
{
for _ in 0..100 {

View File

@@ -1,23 +1,23 @@
use Result;
use DocId;
use std::io;
use std::str;
use schema::Schema;
use schema::Term;
use super::operation::AddOperation;
use core::Segment;
use core::SerializableSegment;
use fastfield::FastFieldsWriter;
use schema::FieldType;
use indexer::segment_serializer::SegmentSerializer;
use datastruct::stacker::Heap;
use fastfield::FastFieldsWriter;
use fieldnorm::FieldNormsWriter;
use indexer::index_writer::MARGIN_IN_BYTES;
use super::operation::AddOperation;
use indexer::segment_serializer::SegmentSerializer;
use postings::MultiFieldPostingsWriter;
use schema::FieldType;
use schema::Schema;
use schema::Term;
use schema::Value;
use std::io;
use std::str;
use tokenizer::BoxedTokenizer;
use tokenizer::FacetTokenizer;
use tokenizer::{TokenStream, Tokenizer};
use schema::Value;
use fieldnorm::FieldNormsWriter;
use DocId;
use Result;
/// A `SegmentWriter` is in charge of creating a segment index from
/// documents.
@@ -35,7 +35,6 @@ pub struct SegmentWriter<'a> {
tokenizers: Vec<Option<Box<BoxedTokenizer>>>,
}
impl<'a> SegmentWriter<'a> {
/// Creates a new `SegmentWriter`
///
@@ -139,8 +138,7 @@ impl<'a> SegmentWriter<'a> {
}
})
.collect();
let mut term = unsafe { Term::with_capacity(100) };
term.set_field(field);
let mut term = Term::for_field(field); // we set the Term
for facet_bytes in facets {
let mut unordered_term_id_opt = None;
let fake_str = unsafe { str::from_utf8_unchecked(facet_bytes) };
@@ -179,8 +177,7 @@ impl<'a> SegmentWriter<'a> {
} else {
0
};
self.fieldnorms_writer
.record(doc_id, field, num_tokens);
self.fieldnorms_writer.record(doc_id, field, num_tokens);
}
FieldType::U64(ref int_option) => {
if int_option.is_indexed() {
@@ -204,6 +201,9 @@ impl<'a> SegmentWriter<'a> {
}
}
}
FieldType::Bytes => {
// Do nothing. Bytes only supports fast fields.
}
}
}
doc.filter_fields(|field| schema.get_field_entry(field).is_stored());

View File

@@ -1,15 +1,66 @@
use std::sync::atomic::{AtomicUsize, Ordering};
use std::sync::Arc;
#[derive(Clone, Default)]
pub struct Stamper(Arc<AtomicUsize>);
impl Stamper {
pub fn new(first_opstamp: u64) -> Stamper {
Stamper(Arc::new(AtomicUsize::new(first_opstamp as usize)))
}
pub fn stamp(&self) -> u64 {
self.0.fetch_add(1, Ordering::SeqCst) as u64
}
}
// AtomicU64 has not landed in stable.
// For the moment, let's just use AtomicUsize on
// x86/64-bit platforms, and a mutex on other platforms.
#[cfg(target = "x86_64")]
mod archicture_impl {
use std::sync::atomic::{AtomicUsize, Ordering};
use std::sync::Arc;
#[derive(Clone, Default)]
pub struct Stamper(Arc<AtomicU64>);
impl Stamper {
pub fn new(first_opstamp: u64) -> Stamper {
Stamper(Arc::new(AtomicU64::new(first_opstamp)))
}
pub fn stamp(&self) -> u64 {
self.0.fetch_add(1u64, Ordering::SeqCst) as u64
}
}
}
#[cfg(not(target = "x86_64"))]
mod archicture_impl {
use std::sync::{Arc, Mutex};
#[derive(Clone, Default)]
pub struct Stamper(Arc<Mutex<u64>>);
impl Stamper {
pub fn new(first_opstamp: u64) -> Stamper {
Stamper(Arc::new(Mutex::new(first_opstamp)))
}
pub fn stamp(&self) -> u64 {
let mut guard = self.0.lock().expect("Failed to lock the stamper");
let previous_val = *guard;
*guard = previous_val + 1;
previous_val
}
}
}
pub use self::archicture_impl::Stamper;
#[cfg(test)]
mod test {
use super::Stamper;
#[test]
fn test_stamper() {
let stamper = Stamper::new(7u64);
assert_eq!(stamper.stamp(), 7u64);
assert_eq!(stamper.stamp(), 8u64);
let stamper_clone = stamper.clone();
assert_eq!(stamper.stamp(), 9u64);
assert_eq!(stamper.stamp(), 10u64);
assert_eq!(stamper_clone.stamp(), 11u64);
}
}

View File

@@ -1,14 +1,7 @@
#![doc(html_logo_url = "http://fulmicoton.com/tantivy-logo/tantivy-logo.png")]
#![cfg_attr(feature = "cargo-clippy", allow(module_inception))]
#![cfg_attr(feature = "cargo-clippy", allow(inline_always))]
#![feature(box_syntax)]
#![feature(optin_builtin_traits)]
#![feature(conservative_impl_trait)]
#![feature(collections_range)]
#![feature(integer_atomics)]
#![feature(drain_filter)]
#![cfg_attr(test, feature(test))]
#![cfg_attr(test, feature(iterator_step_by))]
#![cfg_attr(all(feature = "unstable", test), feature(test))]
#![doc(test(attr(allow(unused_variables), deny(warnings))))]
#![allow(unknown_lints)]
#![allow(new_without_default)]
@@ -123,36 +116,40 @@ extern crate lazy_static;
#[macro_use]
extern crate serde_derive;
#[cfg_attr(test, macro_use)]
extern crate serde_json;
#[macro_use]
extern crate log;
#[macro_use]
extern crate error_chain;
#[cfg(feature="mmap")]
#[cfg(feature = "mmap")]
extern crate atomicwrites;
extern crate base64;
extern crate bit_set;
extern crate bitpacking;
extern crate byteorder;
extern crate chan;
extern crate combine;
extern crate crossbeam;
extern crate fnv;
extern crate fst;
extern crate futures;
extern crate futures_cpupool;
extern crate itertools;
extern crate snap;
extern crate levenshtein_automata;
extern crate lz4;
extern crate num_cpus;
extern crate owning_ref;
extern crate regex;
extern crate rust_stemmers;
extern crate serde;
extern crate serde_json;
extern crate stable_deref_trait;
extern crate tempdir;
#[cfg(test)]
extern crate tempfile;
extern crate uuid;
extern crate bitpacking;
#[cfg(test)]
#[macro_use]
@@ -166,7 +163,8 @@ extern crate winapi;
#[cfg(test)]
extern crate rand;
#[cfg(test)]
#[cfg(all(test, feature = "unstable"))]
extern crate test;
extern crate tinysegmenter;
@@ -185,36 +183,36 @@ pub use error::{Error, ErrorKind, ResultExt};
/// Tantivy result.
pub type Result<T> = std::result::Result<T, Error>;
mod core;
mod compression;
mod indexer;
mod common;
mod compression;
mod core;
mod indexer;
mod datastruct;
#[allow(unused_doc_comment)]
mod error;
pub mod tokenizer;
mod datastruct;
pub mod termdict;
pub mod store;
pub mod query;
pub mod directory;
pub mod collector;
pub mod postings;
pub mod schema;
pub mod directory;
pub mod fastfield;
pub mod fieldnorm;
pub mod postings;
pub mod query;
pub mod schema;
pub mod store;
pub mod termdict;
mod docset;
pub use self::docset::{DocSet, SkipResult};
pub use directory::Directory;
pub use core::{Index, Searcher, Segment, SegmentId, SegmentMeta};
pub use indexer::IndexWriter;
pub use schema::{Document, Term};
pub use core::{InvertedIndexReader, SegmentReader};
pub use postings::Postings;
pub use core::SegmentComponent;
pub use core::{Index, Searcher, Segment, SegmentId, SegmentMeta};
pub use core::{InvertedIndexReader, SegmentReader};
pub use directory::Directory;
pub use indexer::IndexWriter;
pub use postings::Postings;
pub use schema::{Document, Term};
pub use common::{i64_to_u64, u64_to_i64};
@@ -230,10 +228,10 @@ pub fn version() -> &'static str {
/// Defines tantivy's merging strategy
pub mod merge_policy {
pub use indexer::MergePolicy;
pub use indexer::LogMergePolicy;
pub use indexer::NoMergePolicy;
pub use indexer::DefaultMergePolicy;
pub use indexer::LogMergePolicy;
pub use indexer::MergePolicy;
pub use indexer::NoMergePolicy;
}
/// A `u32` identifying a document within a segment.
@@ -282,33 +280,29 @@ pub struct DocAddress(pub SegmentLocalId, pub DocId);
mod tests {
use collector::tests::TestCollector;
use Index;
use core::SegmentReader;
use query::BooleanQuery;
use schema::*;
use docset::DocSet;
use query::BooleanQuery;
use rand::distributions::{IndependentSample, Range};
use rand::{Rng, SeedableRng, XorShiftRng};
use schema::*;
use Index;
use IndexWriter;
use Postings;
use rand::{Rng, SeedableRng, XorShiftRng};
use rand::distributions::{IndependentSample, Range};
pub fn assert_nearly_equals(expected: f32, val: f32) {
assert!(nearly_equals(val, expected), "Got {}, expected {}.", val, expected);
assert!(
nearly_equals(val, expected),
"Got {}, expected {}.",
val,
expected
);
}
pub fn nearly_equals(a: f32, b: f32) -> bool {
(a - b).abs() < 0.0005 * (a + b).abs()
}
fn generate_array_with_seed(n: usize, ratio: f32, seed_val: u32) -> Vec<u32> {
let seed: &[u32; 4] = &[1, 2, 3, seed_val];
let mut rng: XorShiftRng = XorShiftRng::from_seed(*seed);
(0..u32::max_value())
.filter(|_| rng.next_f32() < ratio)
.take(n)
.collect()
}
pub fn generate_nonunique_unsorted(max_value: u32, n_elems: usize) -> Vec<u32> {
let seed: &[u32; 4] = &[1, 2, 3, 4];
let mut rng: XorShiftRng = XorShiftRng::from_seed(*seed);
@@ -318,10 +312,6 @@ mod tests {
.collect::<Vec<u32>>()
}
pub fn generate_array(n: usize, ratio: f32) -> Vec<u32> {
generate_array_with_seed(n, ratio, 4)
}
pub fn sample_with_seed(n: u32, ratio: f32, seed_val: u32) -> Vec<u32> {
let seed: &[u32; 4] = &[1, 2, 3, seed_val];
let mut rng: XorShiftRng = XorShiftRng::from_seed(*seed);
@@ -333,7 +323,7 @@ mod tests {
}
#[test]
#[cfg(feature="mmap")]
#[cfg(feature = "mmap")]
fn test_indexing() {
let mut schema_builder = SchemaBuilder::default();
let text_field = schema_builder.add_text_field("text", TEXT);
@@ -459,7 +449,6 @@ mod tests {
}
}
fn advance_undeleted(docset: &mut DocSet, reader: &SegmentReader) -> bool {
while docset.advance() {
if !reader.is_deleted(docset.doc()) {

View File

@@ -6,20 +6,18 @@ Postings module (also called inverted index)
///
/// Postings, also called inverted lists, are the key data structure
/// for full-text search.
mod postings;
mod recorder;
mod serializer;
mod postings_writer;
mod term_info;
mod recorder;
mod segment_postings;
mod serializer;
mod term_info;
use self::recorder::{NothingRecorder, Recorder, TFAndPositionRecorder, TermFrequencyRecorder};
pub use self::serializer::{FieldSerializer, InvertedIndexSerializer};
pub(crate) use self::postings_writer::MultiFieldPostingsWriter;
pub use self::serializer::{FieldSerializer, InvertedIndexSerializer};
pub use self::term_info::TermInfo;
pub use self::postings::Postings;
pub use self::term_info::TermInfo;
pub use self::segment_postings::{BlockSegmentPostings, SegmentPostings};
@@ -38,25 +36,22 @@ pub(crate) enum FreqReadingOption {
pub mod tests {
use super::*;
use core::Index;
use core::SegmentComponent;
use core::SegmentReader;
use datastruct::stacker::Heap;
use docset::{DocSet, SkipResult};
use fieldnorm::FieldNormReader;
use indexer::operation::AddOperation;
use indexer::SegmentWriter;
use query::Scorer;
use rand::{Rng, SeedableRng, XorShiftRng};
use schema::Field;
use schema::IndexRecordOption;
use schema::{Document, SchemaBuilder, Term, INT_INDEXED, STRING, TEXT};
use std::iter;
use DocId;
use Score;
use query::Intersection;
use query::Scorer;
use schema::{Document, SchemaBuilder, Term, INT_INDEXED, STRING, TEXT};
use core::SegmentComponent;
use indexer::SegmentWriter;
use core::SegmentReader;
use core::Index;
use schema::IndexRecordOption;
use std::iter;
use datastruct::stacker::Heap;
use schema::Field;
use test::{self, Bencher};
use indexer::operation::AddOperation;
use tests;
use rand::{Rng, SeedableRng, XorShiftRng};
use fieldnorm::FieldNormReader;
#[test]
pub fn test_position_write() {
@@ -127,7 +122,6 @@ pub mod tests {
assert_eq!(&[0, 5], &positions[..]);
}
{
let mut postings = inverted_index
.read_postings(&term, IndexRecordOption::WithFreqsAndPositions)
.unwrap();
@@ -206,13 +200,14 @@ pub mod tests {
{
let segment_reader = SegmentReader::open(&segment).unwrap();
{
let fieldnorm_reader = segment_reader.get_fieldnorms_reader(text_field) ;
let fieldnorm_reader = segment_reader.get_fieldnorms_reader(text_field);
assert_eq!(fieldnorm_reader.fieldnorm(0), 8 + 5);
assert_eq!(fieldnorm_reader.fieldnorm(1), 2);
for i in 2..1000 {
assert_eq!(
fieldnorm_reader.fieldnorm_id(i),
FieldNormReader::fieldnorm_to_id(i + 1) );
FieldNormReader::fieldnorm_to_id(i + 1)
);
}
}
{
@@ -449,7 +444,7 @@ pub mod tests {
// delete everything else
{
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
index_writer.delete_term(term_1);
index_writer.delete_term(term_1);
assert!(index_writer.commit().is_ok());
}
@@ -479,23 +474,23 @@ pub mod tests {
}
lazy_static! {
static ref TERM_A: Term = {
pub static ref TERM_A: Term = {
let field = Field(0);
Term::from_field_text(field, "a")
};
static ref TERM_B: Term = {
pub static ref TERM_B: Term = {
let field = Field(0);
Term::from_field_text(field, "b")
};
static ref TERM_C: Term = {
pub static ref TERM_C: Term = {
let field = Field(0);
Term::from_field_text(field, "c")
};
static ref TERM_D: Term = {
pub static ref TERM_D: Term = {
let field = Field(0);
Term::from_field_text(field, "d")
};
static ref INDEX: Index = {
pub static ref INDEX: Index = {
let mut schema_builder = SchemaBuilder::default();
let text_field = schema_builder.add_text_field("text", STRING);
let schema = schema_builder.build();
@@ -507,7 +502,7 @@ pub mod tests {
let posting_list_size = 1_000_000;
{
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
for _ in 0 .. posting_list_size {
for _ in 0..posting_list_size {
let mut doc = Document::default();
if rng.gen_weighted_bool(15) {
doc.add_text(text_field, "a");
@@ -530,6 +525,85 @@ pub mod tests {
};
}
/// Wraps a given docset, and forwards all calls but the
/// `.skip_next(...)`. This is useful to test that a specialized
/// implementation of `.skip_next(...)` is consistent
/// with the default implementation.
pub(crate) struct UnoptimizedDocSet<TDocSet: DocSet>(TDocSet);
impl<TDocSet: DocSet> UnoptimizedDocSet<TDocSet> {
pub fn wrap(docset: TDocSet) -> UnoptimizedDocSet<TDocSet> {
UnoptimizedDocSet(docset)
}
}
impl<TDocSet: DocSet> DocSet for UnoptimizedDocSet<TDocSet> {
fn advance(&mut self) -> bool {
self.0.advance()
}
fn doc(&self) -> DocId {
self.0.doc()
}
fn size_hint(&self) -> u32 {
self.0.size_hint()
}
}
impl<TScorer: Scorer> Scorer for UnoptimizedDocSet<TScorer> {
fn score(&mut self) -> Score {
self.0.score()
}
}
pub fn test_skip_against_unoptimized<F: Fn() -> Box<DocSet>>(
postings_factory: F,
targets: Vec<u32>,
) {
for target in targets {
let mut postings_opt = postings_factory();
let mut postings_unopt = UnoptimizedDocSet::wrap(postings_factory());
let skip_result_opt = postings_opt.skip_next(target);
let skip_result_unopt = postings_unopt.skip_next(target);
assert_eq!(
skip_result_unopt, skip_result_opt,
"Failed while skipping to {}",
target
);
match skip_result_opt {
SkipResult::Reached => assert_eq!(postings_opt.doc(), target),
SkipResult::OverStep => assert!(postings_opt.doc() > target),
SkipResult::End => {
return;
}
}
while postings_opt.advance() {
assert!(postings_unopt.advance());
assert_eq!(
postings_opt.doc(),
postings_unopt.doc(),
"Failed while skipping to {}",
target
);
}
assert!(!postings_unopt.advance());
}
}
}
#[cfg(all(test, feature = "unstable"))]
mod bench {
use super::tests::*;
use docset::SkipResult;
use query::Intersection;
use schema::IndexRecordOption;
use test::{self, Bencher};
use tests;
use DocSet;
#[bench]
fn bench_segment_postings(b: &mut Bencher) {
let searcher = INDEX.searcher();
@@ -646,71 +720,4 @@ pub mod tests {
s
});
}
/// Wraps a given docset, and forwards all calls but the
/// `.skip_next(...)`. This is useful to test that a specialized
/// implementation of `.skip_next(...)` is consistent
/// with the default implementation.
pub(crate) struct UnoptimizedDocSet<TDocSet: DocSet>(TDocSet);
impl<TDocSet: DocSet> UnoptimizedDocSet<TDocSet> {
pub fn wrap(docset: TDocSet) -> UnoptimizedDocSet<TDocSet> {
UnoptimizedDocSet(docset)
}
}
impl<TDocSet: DocSet> DocSet for UnoptimizedDocSet<TDocSet> {
fn advance(&mut self) -> bool {
self.0.advance()
}
fn doc(&self) -> DocId {
self.0.doc()
}
fn size_hint(&self) -> u32 {
self.0.size_hint()
}
}
impl<TScorer: Scorer> Scorer for UnoptimizedDocSet<TScorer> {
fn score(&mut self) -> Score {
self.0.score()
}
}
pub fn test_skip_against_unoptimized<F: Fn() -> Box<DocSet>>(
postings_factory: F,
targets: Vec<u32>,
) {
for target in targets {
let mut postings_opt = postings_factory();
let mut postings_unopt = UnoptimizedDocSet::wrap(postings_factory());
let skip_result_opt = postings_opt.skip_next(target);
let skip_result_unopt = postings_unopt.skip_next(target);
assert_eq!(
skip_result_unopt, skip_result_opt,
"Failed while skipping to {}",
target
);
match skip_result_opt {
SkipResult::Reached => assert_eq!(postings_opt.doc(), target),
SkipResult::OverStep => assert!(postings_opt.doc() > target),
SkipResult::End => {
return;
}
}
while postings_opt.advance() {
assert!(postings_unopt.advance());
assert_eq!(
postings_opt.doc(),
postings_unopt.doc(),
"Failed while skipping to {}",
target
);
}
assert!(!postings_unopt.advance());
}
}
}

View File

@@ -1,21 +1,18 @@
use DocId;
use schema::Term;
use datastruct::stacker::{Heap, TermHashMap};
use postings::recorder::{NothingRecorder, Recorder, TFAndPositionRecorder, TermFrequencyRecorder};
use postings::UnorderedTermId;
use postings::{FieldSerializer, InvertedIndexSerializer};
use std::io;
use schema::IndexRecordOption;
use schema::{Field, FieldEntry, FieldType, Schema, Term};
use std::collections::HashMap;
use postings::Recorder;
use Result;
use schema::{Field, Schema};
use std::io;
use std::marker::PhantomData;
use std::ops::DerefMut;
use datastruct::stacker::{Heap, TermHashMap};
use postings::{NothingRecorder, TFAndPositionRecorder, TermFrequencyRecorder};
use schema::FieldEntry;
use schema::FieldType;
use termdict::TermOrdinal;
use tokenizer::Token;
use tokenizer::TokenStream;
use schema::IndexRecordOption;
use postings::UnorderedTermId;
use DocId;
use Result;
fn posting_from_field_entry<'a>(
field_entry: &FieldEntry,
@@ -39,11 +36,17 @@ fn posting_from_field_entry<'a>(
FieldType::U64(_) | FieldType::I64(_) | FieldType::HierarchicalFacet => {
SpecializedPostingsWriter::<NothingRecorder>::new_boxed(heap)
}
FieldType::Bytes => {
// FieldType::Bytes cannot actually be indexed.
// TODO fix during the indexer refactoring described in #276
SpecializedPostingsWriter::<NothingRecorder>::new_boxed(heap)
}
}
}
pub struct MultiFieldPostingsWriter<'a> {
heap: &'a Heap,
schema: Schema,
term_index: TermHashMap<'a>,
per_field_postings_writers: Vec<Box<PostingsWriter + 'a>>,
}
@@ -58,8 +61,8 @@ impl<'a> MultiFieldPostingsWriter<'a> {
.iter()
.map(|field_entry| posting_from_field_entry(field_entry, heap))
.collect();
MultiFieldPostingsWriter {
schema: schema.clone(),
heap,
term_index,
per_field_postings_writers,
@@ -83,7 +86,7 @@ impl<'a> MultiFieldPostingsWriter<'a> {
pub fn serialize(
&self,
serializer: &mut InvertedIndexSerializer,
) -> Result<HashMap<Field, HashMap<UnorderedTermId, usize>>> {
) -> Result<HashMap<Field, HashMap<UnorderedTermId, TermOrdinal>>> {
let mut term_offsets: Vec<(&[u8], u32, UnorderedTermId)> = self.term_index.iter().collect();
term_offsets.sort_by_key(|&(k, _, _)| k);
@@ -94,8 +97,10 @@ impl<'a> MultiFieldPostingsWriter<'a> {
.map(|(key, _, _)| Term::wrap(key).field())
.enumerate();
let mut unordered_term_mappings: HashMap<Field, HashMap<UnorderedTermId, usize>> =
HashMap::new();
let mut unordered_term_mappings: HashMap<
Field,
HashMap<UnorderedTermId, TermOrdinal>,
> = HashMap::new();
let mut prev_field = Field(u32::max_value());
for (offset, field) in term_offsets_it {
@@ -110,20 +115,30 @@ impl<'a> MultiFieldPostingsWriter<'a> {
let (field, start) = offsets[i];
let (_, stop) = offsets[i + 1];
// populating the unordered term ord -> ordered term ord mapping
// for the field.
let mut mapping = HashMap::new();
for (term_ord, term_unord_id) in term_offsets[start..stop]
.iter()
.map(|&(_, _, bucket)| bucket)
.enumerate()
{
mapping.insert(term_unord_id, term_ord);
let field_entry = self.schema.get_field_entry(field);
match field_entry.field_type() {
&FieldType::Str(_) | &FieldType::HierarchicalFacet => {
// populating the (unordered term ord) -> (ordered term ord) mapping
// for the field.
let mut unordered_term_ids = term_offsets[start..stop]
.iter()
.map(|&(_, _, bucket)| bucket);
let mut mapping: HashMap<UnorderedTermId, TermOrdinal> = unordered_term_ids
.enumerate()
.map(|(term_ord, unord_term_id)| {
(unord_term_id as UnorderedTermId, term_ord as TermOrdinal)
})
.collect();
unordered_term_mappings.insert(field, mapping);
}
&FieldType::U64(_) | &FieldType::I64(_) => {}
&FieldType::Bytes => {}
}
unordered_term_mappings.insert(field, mapping);
let postings_writer = &self.per_field_postings_writers[field.0 as usize];
let mut field_serializer = serializer.new_field(field, postings_writer.total_num_tokens())?;
let mut field_serializer =
serializer.new_field(field, postings_writer.total_num_tokens())?;
postings_writer.serialize(
&term_offsets[start..stop],
&mut field_serializer,
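A toy illustration of the mapping built in this hunk, with hypothetical term bytes and ids: unordered ids reflect insertion order in the term hash map, while ordinals follow the lexicographic order of the sorted `term_offsets`.

use std::collections::HashMap;

fn main() {
    // (term bytes, unordered term id assigned at insertion time)
    let mut term_offsets: Vec<(&[u8], u64)> =
        vec![(&b"cherry"[..], 0), (&b"apple"[..], 1), (&b"banana"[..], 2)];
    term_offsets.sort_by_key(|&(k, _)| k);
    let mapping: HashMap<u64, u64> = term_offsets
        .iter()
        .enumerate()
        .map(|(ord, &(_, unord))| (unord, ord as u64))
        .collect();
    assert_eq!(mapping[&1], 0); // "apple" was inserted second but sorts first
}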
@@ -179,8 +194,7 @@ pub trait PostingsWriter {
token_stream: &mut TokenStream,
heap: &Heap,
) -> u32 {
let mut term = unsafe { Term::with_capacity(100) };
term.set_field(field);
let mut term = Term::for_field(field);
let num_tokens = {
let mut sink = |token: &Token| {
term.set_text(token.text.as_str());

View File

@@ -1,7 +1,7 @@
use DocId;
use std::{self, io};
use postings::FieldSerializer;
use datastruct::stacker::{ExpUnrolledLinkedList, Heap, HeapAllocable};
use postings::FieldSerializer;
use std::{self, io};
use DocId;
const EMPTY_ARRAY: [u32; 0] = [0u32; 0];
const POSITION_END: u32 = std::u32::MAX;

View File

@@ -2,15 +2,15 @@ use compression::{BlockDecoder, CompressedIntStream, VIntDecoder, COMPRESSION_BL
use DocId;
use common::BitSet;
use common::CountingWriter;
use common::HasLen;
use postings::Postings;
use docset::{DocSet, SkipResult};
use fst::Streamer;
use compression::compressed_block_size;
use directory::{ReadOnlySource, SourceRead};
use postings::FreqReadingOption;
use docset::{DocSet, SkipResult};
use fst::Streamer;
use postings::serializer::PostingsSerializer;
use common::CountingWriter;
use postings::FreqReadingOption;
use postings::Postings;
struct PositionComputer {
// store the amount of position int
@@ -84,9 +84,13 @@ impl SegmentPostings {
for &doc in docs {
postings_serializer.write_doc(doc, 1u32).unwrap();
}
postings_serializer.close_term().expect("In memory Serialization should never fail.");
postings_serializer
.close_term()
.expect("In memory Serialization should never fail.");
}
let (buffer , _) = counting_writer.finish().expect("Serializing in a buffer should never fail.");
let (buffer, _) = counting_writer
.finish()
.expect("Serializing in a buffer should never fail.");
let data = ReadOnlySource::from(buffer);
let block_segment_postings = BlockSegmentPostings::from_data(
docs.len(),
@@ -98,7 +102,6 @@ impl SegmentPostings {
}
impl SegmentPostings {
/// Reads a segment's postings from an `&[u8]`.
///
/// * `len` - number of documents in the posting list.
@@ -125,7 +128,7 @@ fn exponential_search(target: u32, mut start: usize, arr: &[u32]) -> (usize, usi
loop {
let new = start + jump;
if new >= end {
return (start, end)
return (start, end);
}
if arr[new] > target {
return (start, new);
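The hunk only shows the loop body; for clarity, here is a self-contained version of the bracketing step, assuming `jump` starts at 1 and doubles each probe (the usual galloping scheme, consistent with the loop shown). A binary search over the returned window then finishes the lookup.

fn exponential_bracket(target: u32, mut start: usize, arr: &[u32]) -> (usize, usize) {
    let end = arr.len();
    let mut jump = 1;
    loop {
        let new = start + jump;
        if new >= end {
            return (start, end);
        }
        if arr[new] > target {
            return (start, new);
        }
        start = new;
        jump *= 2; // galloping: O(log n) probes to bracket the target
    }
}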
@@ -163,7 +166,8 @@ impl DocSet for SegmentPostings {
if self.position_computer.is_some() {
let freqs_skipped = &self.block_cursor.freqs()[self.cur..];
let sum_freq: u32 = freqs_skipped.iter().sum();
self.position_computer.as_mut()
self.position_computer
.as_mut()
.unwrap()
.add_skip(sum_freq as usize);
}
@@ -198,7 +202,8 @@ impl DocSet for SegmentPostings {
if self.position_computer.is_some() {
let freqs_skipped = &self.block_cursor.freqs()[self.cur..start];
let sum_freqs: u32 = freqs_skipped.iter().sum();
self.position_computer.as_mut()
self.position_computer
.as_mut()
.unwrap()
.add_skip(sum_freqs as usize);
}
@@ -211,7 +216,6 @@ impl DocSet for SegmentPostings {
}
}
// goes to the next element.
// next needs to be called once first to point to the correct element.
#[inline]
@@ -262,7 +266,6 @@ impl DocSet for SegmentPostings {
}
}
impl HasLen for SegmentPostings {
fn len(&self) -> usize {
self.block_cursor.doc_freq()
@@ -276,16 +279,11 @@ impl Postings for SegmentPostings {
fn positions_with_offset(&mut self, offset: u32, output: &mut Vec<u32>) {
if self.position_computer.is_some() {
let prev_capacity = output.capacity();
let term_freq = self.term_freq() as usize;
if term_freq > prev_capacity {
let additional_len = term_freq - output.len();
output.reserve(additional_len);
}
unsafe {
output.set_len(term_freq);
self.position_computer.as_mut().unwrap().positions_with_offset(offset, &mut output[..])
}
output.resize(self.term_freq() as usize, 0u32);
self.position_computer
.as_mut()
.unwrap()
.positions_with_offset(offset, &mut output[..])
} else {
output.clear();
}
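The change above swaps an unsafe `set_len` for `resize`. A small sketch of why the safe pattern suffices here: `resize` both grows (zero-filling new slots before the position computer overwrites them) and shrinks, so no uninitialized memory is ever exposed.

fn main() {
    let mut output: Vec<u32> = vec![7, 7, 7];
    output.resize(5, 0u32); // grow: new slots are zero-initialized
    assert_eq!(output, [7, 7, 7, 0, 0]);
    output.resize(2, 0u32); // shrink: simply truncates
    assert_eq!(output, [7, 7]);
}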
@@ -473,16 +471,16 @@ impl<'b> Streamer<'b> for BlockSegmentPostings {
#[cfg(test)]
mod tests {
use docset::DocSet;
use super::BlockSegmentPostings;
use super::SegmentPostings;
use schema::SchemaBuilder;
use common::HasLen;
use core::Index;
use schema::INT_INDEXED;
use schema::Term;
use docset::DocSet;
use fst::Streamer;
use schema::IndexRecordOption;
use common::HasLen;
use super::BlockSegmentPostings;
use schema::SchemaBuilder;
use schema::Term;
use schema::INT_INDEXED;
#[test]
fn test_empty_segment_postings() {
@@ -570,4 +568,3 @@ mod tests {
assert_eq!(block_segments.docs(), &[1, 3, 5]);
}
}

View File

@@ -1,20 +1,16 @@
use Result;
use termdict::TermDictionaryBuilderImpl;
use super::TermInfo;
use schema::Field;
use schema::FieldEntry;
use schema::FieldType;
use schema::Schema;
use directory::WritePtr;
use compression::{BlockEncoder, COMPRESSION_BLOCK_SIZE};
use DocId;
use core::Segment;
use std::io::{self, Write};
use compression::VIntEncoder;
use common::BinarySerializable;
use common::CountingWriter;
use common::CompositeWrite;
use termdict::TermDictionaryBuilder;
use common::{CompositeWrite, CountingWriter};
use compression::VIntEncoder;
use compression::{BlockEncoder, COMPRESSION_BLOCK_SIZE};
use core::Segment;
use directory::WritePtr;
use schema::Schema;
use schema::{Field, FieldEntry, FieldType};
use std::io::{self, Write};
use termdict::{TermDictionaryBuilder, TermOrdinal};
use DocId;
use Result;
/// `PostingsSerializer` is in charge of serializing
/// postings on disk, in the
@@ -85,7 +81,11 @@ impl InvertedIndexSerializer {
/// a given field.
///
/// Loads the indexing options for the given field.
pub fn new_field(&mut self, field: Field, total_num_tokens: u64) -> io::Result<FieldSerializer> {
pub fn new_field(
&mut self,
field: Field,
total_num_tokens: u64,
) -> io::Result<FieldSerializer> {
let field_entry: &FieldEntry = self.schema.get_field_entry(field);
let term_dictionary_write = self.terms_write.for_field(field);
let postings_write = self.postings_write.for_field(field);
@@ -111,11 +111,12 @@ impl InvertedIndexSerializer {
/// The field serializer is in charge of
/// the serialization of a specific field.
pub struct FieldSerializer<'a> {
term_dictionary_builder: TermDictionaryBuilderImpl<&'a mut CountingWriter<WritePtr>>,
term_dictionary_builder: TermDictionaryBuilder<&'a mut CountingWriter<WritePtr>>,
postings_serializer: PostingsSerializer<&'a mut CountingWriter<WritePtr>>,
positions_serializer_opt: Option<PositionSerializer<&'a mut CountingWriter<WritePtr>>>,
current_term_info: TermInfo,
term_open: bool,
num_terms: TermOrdinal,
}
impl<'a> FieldSerializer<'a> {
@@ -125,7 +126,6 @@ impl<'a> FieldSerializer<'a> {
postings_write: &'a mut CountingWriter<WritePtr>,
positions_write: &'a mut CountingWriter<WritePtr>,
) -> io::Result<FieldSerializer<'a>> {
let (term_freq_enabled, position_enabled): (bool, bool) = match field_type {
FieldType::Str(ref text_options) => {
if let Some(text_indexing_options) = text_options.get_indexing_options() {
@@ -141,7 +141,7 @@ impl<'a> FieldSerializer<'a> {
_ => (false, false),
};
let term_dictionary_builder =
TermDictionaryBuilderImpl::new(term_dictionary_write, field_type)?;
TermDictionaryBuilder::new(term_dictionary_write, field_type)?;
let postings_serializer = PostingsSerializer::new(postings_write, term_freq_enabled);
let positions_serializer_opt = if position_enabled {
Some(PositionSerializer::new(positions_write))
@@ -155,6 +155,7 @@ impl<'a> FieldSerializer<'a> {
positions_serializer_opt,
current_term_info: TermInfo::default(),
term_open: false,
num_terms: TermOrdinal::default(),
})
}
@@ -175,7 +176,7 @@ impl<'a> FieldSerializer<'a> {
/// * term - the term. It needs to come after the previous term in
/// lexicographical order.
/// * doc_freq - the number of documents containing the term.
pub fn new_term(&mut self, term: &[u8]) -> io::Result<()> {
pub fn new_term(&mut self, term: &[u8]) -> io::Result<TermOrdinal> {
assert!(
!self.term_open,
"Called new_term, while the previous term was not closed."
@@ -183,7 +184,10 @@ impl<'a> FieldSerializer<'a> {
self.term_open = true;
self.postings_serializer.clear();
self.current_term_info = self.current_term_info();
self.term_dictionary_builder.insert_key(term)
self.term_dictionary_builder.insert_key(term)?;
let term_ordinal = self.num_terms;
self.num_terms += 1;
Ok(term_ordinal)
}
/// Serialize the information that a document contains the current term,

View File

@@ -1,34 +1,25 @@
use common::{BinarySerializable, FixedSize};
use std::io;
/// `TermInfo` contains all of the information
/// associated to terms in the `.term` file.
///
/// It consists of
/// * `doc_freq` : the number of documents in the segment
/// containing this term. It is also the length of the
/// posting list associated to this term
/// * `postings_offset` : an offset in the `.idx` file
/// addressing the start of the posting list associated
/// to this term.
/// `TermInfo` wraps the metadata associated to a Term.
/// It is segment-local.
#[derive(Debug, Default, Ord, PartialOrd, Eq, PartialEq, Clone)]
pub struct TermInfo {
/// Number of documents in the segment containing the term
pub doc_freq: u32,
/// Offset within the postings (`.idx`) file.
/// Start offset within the postings (`.idx`) file.
pub postings_offset: u64,
/// Offset within the position (`.pos`) file.
/// Start offset of the first block within the position (`.pos`) file.
pub positions_offset: u64,
/// Offset within the position block.
/// Start offset within this position block.
pub positions_inner_offset: u8,
}
impl FixedSize for TermInfo {
/// Size required for the binary serialization of `TermInfo`.
/// This is large, but in practice, all `TermInfo`s but the first one
/// of the block are bitpacked.
///
/// See `TermInfoStore`.
/// Size required for the binary serialization of a `TermInfo` object.
/// This is large, but in practice, `TermInfo`s are encoded in blocks and
/// only the first `TermInfo` of a block is serialized uncompressed.
/// The subsequent `TermInfo`s are delta-encoded and bitpacked.
const SIZE_IN_BYTES: usize = u32::SIZE_IN_BYTES + 2 * u64::SIZE_IN_BYTES + u8::SIZE_IN_BYTES;
}
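A quick check of the constant, using the field layout above and the standard Rust primitive widths:

// doc_freq: u32               -> 4 bytes
// postings_offset: u64        -> 8 bytes
// positions_offset: u64       -> 8 bytes
// positions_inner_offset: u8  -> 1 byte
const TERM_INFO_SIZE: usize = 4 + 2 * 8 + 1; // = 21 bytes per uncompressed TermInfo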

View File

@@ -1,22 +1,20 @@
use query::Query;
use query::Weight;
use query::Scorer;
use core::Searcher;
use core::SegmentReader;
use docset::DocSet;
use query::{Query, Scorer, Weight};
use DocId;
use Result;
use Score;
use DocId;
use core::Searcher;
/// Query that matches all of the documents.
///
/// All of the documents get the score 1f32.
#[derive(Debug)]
#[derive(Clone, Debug)]
pub struct AllQuery;
impl Query for AllQuery {
fn weight(&self, _: &Searcher, _: bool) -> Result<Box<Weight>> {
Ok(box AllWeight)
Ok(Box::new(AllWeight))
}
}
@@ -25,29 +23,47 @@ pub struct AllWeight;
impl Weight for AllWeight {
fn scorer(&self, reader: &SegmentReader) -> Result<Box<Scorer>> {
Ok(box AllScorer {
started: false,
Ok(Box::new(AllScorer {
state: State::NotStarted,
doc: 0u32,
max_doc: reader.max_doc(),
})
}))
}
}
enum State {
NotStarted,
Started,
Finished,
}
/// Scorer associated to the `AllQuery` query.
pub struct AllScorer {
started: bool,
state: State,
doc: DocId,
max_doc: DocId,
}
impl DocSet for AllScorer {
fn advance(&mut self) -> bool {
if self.started {
self.doc += 1u32;
} else {
self.started = true;
match self.state {
State::NotStarted => {
self.state = State::Started;
self.doc = 0;
}
State::Started => {
self.doc += 1u32;
}
State::Finished => {
return false;
}
}
if self.doc < self.max_doc {
return true;
} else {
self.state = State::Finished;
return false;
}
self.doc < self.max_doc
}
fn doc(&self) -> DocId {
@@ -64,3 +80,46 @@ impl Scorer for AllScorer {
1f32
}
}
#[cfg(test)]
mod tests {
use super::AllQuery;
use query::Query;
use schema::{SchemaBuilder, TEXT};
use Index;
#[test]
fn test_all_query() {
let mut schema_builder = SchemaBuilder::default();
let field = schema_builder.add_text_field("text", TEXT);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_with_num_threads(1, 10_000_000).unwrap();
index_writer.add_document(doc!(field=>"aaa"));
index_writer.add_document(doc!(field=>"bbb"));
index_writer.commit().unwrap();
index_writer.add_document(doc!(field=>"ccc"));
index_writer.commit().unwrap();
index.load_searchers().unwrap();
let searcher = index.searcher();
let weight = AllQuery.weight(&searcher, false).unwrap();
{
let reader = searcher.segment_reader(0);
let mut scorer = weight.scorer(reader).unwrap();
assert!(scorer.advance());
assert_eq!(scorer.doc(), 0u32);
assert!(scorer.advance());
assert_eq!(scorer.doc(), 1u32);
assert!(!scorer.advance());
}
{
let reader = searcher.segment_reader(1);
let mut scorer = weight.scorer(reader).unwrap();
assert!(scorer.advance());
assert_eq!(scorer.doc(), 0u32);
assert!(!scorer.advance());
}
}
}
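The `Finished` state makes exhaustion cheap and idempotent: the old boolean version kept incrementing `doc` past the end on every extra `advance` call, risking wraparound in a long-running loop, while the new version parks the cursor. A hypothetical property check, not part of the diff:

use docset::DocSet;

fn assert_fused<D: DocSet>(docset: &mut D) {
    while docset.advance() {}
    // after exhaustion, further calls must keep returning false
    assert!(!docset.advance());
    assert!(!docset.advance());
}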

View File

@@ -1,7 +1,7 @@
use common::{BitSet, TinySet};
use DocId;
use docset::{DocSet, SkipResult};
use std::cmp::Ordering;
use DocId;
/// A `BitSetDocSet` makes it possible to iterate through a bitset as if it were a `DocSet`.
///
@@ -120,12 +120,10 @@ impl DocSet for BitSetDocSet {
#[cfg(test)]
mod tests {
use DocId;
use super::BitSetDocSet;
use common::BitSet;
use docset::{DocSet, SkipResult};
use super::BitSetDocSet;
extern crate test;
use tests;
use DocId;
fn create_docbitset(docs: &[DocId], max_doc: DocId) -> BitSetDocSet {
let mut docset = BitSet::with_max_value(max_doc);
@@ -219,6 +217,17 @@ mod tests {
}
}
}
#[cfg(all(test, feature = "unstable"))]
mod bench {
use super::BitSet;
use super::BitSetDocSet;
use test;
use tests;
use DocSet;
#[bench]
fn bench_bitset_1pct_insert(b: &mut test::Bencher) {
use tests;

View File

@@ -1,7 +1,7 @@
use fieldnorm::FieldNormReader;
use Term;
use Searcher;
use Score;
use Searcher;
use Term;
const K1: f32 = 1.2;
const B: f32 = 0.75;
@@ -11,7 +11,6 @@ fn idf(doc_freq: u64, doc_count: u64) -> f32 {
(1f32 + x).ln()
}
fn cached_tf_component(fieldnorm: u32, average_fieldnorm: f32) -> f32 {
K1 * (1f32 - B + B * fieldnorm as f32 / average_fieldnorm)
}
@@ -32,11 +31,10 @@ pub struct BM25Weight {
}
impl BM25Weight {
pub fn null() -> BM25Weight {
BM25Weight {
weight: 0f32,
cache: [1f32; 256]
cache: [1f32; 256],
}
}
@@ -44,7 +42,11 @@ impl BM25Weight {
assert!(!terms.is_empty(), "BM25 requires at least one term");
let field = terms[0].field();
for term in &terms[1..] {
assert_eq!(term.field(), field, "All terms must belong to the same field.");
assert_eq!(
term.field(),
field,
"All terms must belong to the same field."
);
}
let mut total_num_tokens = 0u64;
@@ -56,7 +58,8 @@ impl BM25Weight {
}
let average_fieldnorm = total_num_tokens as f32 / total_num_docs as f32;
let idf = terms.iter()
let idf = terms
.iter()
.map(|term| {
let term_doc_freq = searcher.doc_freq(term);
idf(term_doc_freq, total_num_docs)
@@ -83,12 +86,12 @@ impl BM25Weight {
#[cfg(test)]
mod tests {
use tests::assert_nearly_equals;
use super::idf;
use tests::assert_nearly_equals;
#[test]
fn test_idf() {
assert_nearly_equals(idf(1, 2), 0.6931472);
assert_nearly_equals(idf(1, 2), 0.6931472);
}
}
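The test value is consistent with the standard BM25 idf, assuming the elided `x` above is the usual document-frequency ratio (the hunk truncates its definition):

$$\mathrm{idf}(\mathit{df}, N) = \ln\!\Bigl(1 + \frac{N - \mathit{df} + 0.5}{\mathit{df} + 0.5}\Bigr), \qquad \mathrm{idf}(1, 2) = \ln\!\Bigl(1 + \frac{1.5}{1.5}\Bigr) = \ln 2 \approx 0.6931472$$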

View File

@@ -1,12 +1,12 @@
use Result;
use super::boolean_weight::BooleanWeight;
use query::Weight;
use Searcher;
use query::Query;
use schema::Term;
use query::TermQuery;
use schema::IndexRecordOption;
use query::Occur;
use query::Query;
use query::TermQuery;
use query::Weight;
use schema::IndexRecordOption;
use schema::Term;
use Result;
use Searcher;
/// The boolean query combines a set of queries
///
@@ -23,6 +23,16 @@ pub struct BooleanQuery {
subqueries: Vec<(Occur, Box<Query>)>,
}
impl Clone for BooleanQuery {
fn clone(&self) -> Self {
self.subqueries
.iter()
.map(|(x, y)| (x.clone(), y.box_clone()))
.collect::<Vec<_>>()
.into()
}
}
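`#[derive(Clone)]` cannot work here because `Box<Query>` is a trait object with no `Clone` impl; `box_clone` routes the clone through the vtable instead. A simplified standalone model of the pattern (hypothetical types, not the actual tantivy trait):

trait Query {
    fn box_clone(&self) -> Box<dyn Query>;
}

#[derive(Clone)]
struct TermQueryModel(String);

impl Query for TermQueryModel {
    fn box_clone(&self) -> Box<dyn Query> {
        Box::new(self.clone()) // concrete Clone, erased back into a Box
    }
}

fn main() {
    let q: Box<dyn Query> = Box::new(TermQueryModel("title:foo".to_string()));
    let _copy: Box<dyn Query> = q.box_clone();
}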
impl From<Vec<(Occur, Box<Query>)>> for BooleanQuery {
fn from(subqueries: Vec<(Occur, Box<Query>)>) -> BooleanQuery {
BooleanQuery { subqueries }
@@ -37,7 +47,7 @@ impl Query for BooleanQuery {
Ok((*occur, subquery.weight(searcher, scoring_enabled)?))
})
.collect::<Result<_>>()?;
Ok(box BooleanWeight::new(sub_weights, scoring_enabled))
Ok(Box::new(BooleanWeight::new(sub_weights, scoring_enabled)))
}
}
@@ -48,10 +58,16 @@ impl BooleanQuery {
let occur_term_queries: Vec<(Occur, Box<Query>)> = terms
.into_iter()
.map(|term| {
let term_query: Box<Query> = box TermQuery::new(term, IndexRecordOption::WithFreqs);
let term_query: Box<Query> =
Box::new(TermQuery::new(term, IndexRecordOption::WithFreqs));
(Occur::Should, term_query)
})
.collect();
BooleanQuery::from(occur_term_queries)
}
/// Deconstructed view of the clauses making up this query.
pub fn clauses(&self) -> &[(Occur, Box<Query>)] {
&self.subqueries[..]
}
}

View File

@@ -1,19 +1,18 @@
use query::Weight;
use core::SegmentReader;
use query::Union;
use std::collections::HashMap;
use query::EmptyScorer;
use query::Scorer;
use downcast::Downcast;
use std::borrow::Borrow;
use query::intersect_scorers;
use query::score_combiner::{DoNothingCombiner, ScoreCombiner, SumWithCoordsCombiner};
use query::term_query::TermScorer;
use query::EmptyScorer;
use query::Exclude;
use query::Occur;
use query::RequiredOptionalScorer;
use query::score_combiner::{DoNothingCombiner, ScoreCombiner, SumWithCoordsCombiner};
use query::Scorer;
use query::Union;
use query::Weight;
use std::borrow::Borrow;
use std::collections::HashMap;
use Result;
use query::intersect_scorers;
use query::term_query::TermScorer;
fn scorer_union<TScoreCombiner>(scorers: Vec<Box<Scorer>>) -> Box<Scorer>
where
@@ -34,14 +33,13 @@ where
.into_iter()
.map(|scorer| *Downcast::<TermScorer>::downcast(scorer).unwrap())
.collect();
let scorer: Box<Scorer> = box Union::<TermScorer, TScoreCombiner>::from(scorers);
let scorer: Box<Scorer> = Box::new(Union::<TermScorer, TScoreCombiner>::from(scorers));
return scorer;
}
}
let scorer: Box<Scorer> = box Union::<_, TScoreCombiner>::from(scorers);
let scorer: Box<Scorer> = Box::new(Union::<_, TScoreCombiner>::from(scorers));
return scorer;
}
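The downcast dance above is a specialization check: if every boxed scorer is really a `TermScorer`, unbox them all and build a monomorphized `Union<TermScorer, _>` that avoids virtual dispatch in the hot loop. A toy model of the same pattern using `std::any::Any`, with `u32` standing in for `TermScorer`:

use std::any::Any;

fn specialize(items: Vec<Box<dyn Any>>) -> Result<Vec<u32>, Vec<Box<dyn Any>>> {
    if items.iter().all(|item| item.is::<u32>()) {
        // every element is the concrete type: unbox them all
        Ok(items
            .into_iter()
            .map(|item| *item.downcast::<u32>().unwrap())
            .collect())
    } else {
        Err(items) // fall back to the dynamic representation
    }
}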
pub struct BooleanWeight {
@@ -78,17 +76,17 @@ impl BooleanWeight {
.remove(&Occur::MustNot)
.map(scorer_union::<TScoreCombiner>);
let must_scorer_opt: Option<Box<Scorer>> =
per_occur_scorers.remove(&Occur::Must)
.map(intersect_scorers);
let must_scorer_opt: Option<Box<Scorer>> = per_occur_scorers
.remove(&Occur::Must)
.map(intersect_scorers);
let positive_scorer: Box<Scorer> = match (should_scorer_opt, must_scorer_opt) {
(Some(should_scorer), Some(must_scorer)) => {
if self.scoring_enabled {
box RequiredOptionalScorer::<_, _, TScoreCombiner>::new(
Box::new(RequiredOptionalScorer::<_, _, TScoreCombiner>::new(
must_scorer,
should_scorer,
)
))
} else {
must_scorer
}
@@ -96,12 +94,12 @@ impl BooleanWeight {
(None, Some(must_scorer)) => must_scorer,
(Some(should_scorer), None) => should_scorer,
(None, None) => {
return Ok(box EmptyScorer);
return Ok(Box::new(EmptyScorer));
}
};
if let Some(exclude_scorer) = exclude_scorer_opt {
Ok(box Exclude::new(positive_scorer, exclude_scorer))
Ok(Box::new(Exclude::new(positive_scorer, exclude_scorer)))
} else {
Ok(positive_scorer)
}
@@ -111,11 +109,11 @@ impl BooleanWeight {
impl Weight for BooleanWeight {
fn scorer(&self, reader: &SegmentReader) -> Result<Box<Scorer>> {
if self.weights.is_empty() {
Ok(box EmptyScorer)
Ok(Box::new(EmptyScorer))
} else if self.weights.len() == 1 {
let &(occur, ref weight) = &self.weights[0];
if occur == Occur::MustNot {
Ok(box EmptyScorer)
Ok(Box::new(EmptyScorer))
} else {
weight.scorer(reader)
}

View File

@@ -7,19 +7,19 @@ pub use self::boolean_query::BooleanQuery;
mod tests {
use super::*;
use query::Occur;
use query::Query;
use query::TermQuery;
use query::Intersection;
use query::Scorer;
use collector::tests::TestCollector;
use Index;
use downcast::Downcast;
use schema::*;
use query::QueryParser;
use query::RequiredOptionalScorer;
use query::score_combiner::SumWithCoordsCombiner;
use query::term_query::TermScorer;
use query::Intersection;
use query::Occur;
use query::Query;
use query::QueryParser;
use query::RequiredOptionalScorer;
use query::Scorer;
use query::TermQuery;
use schema::*;
use Index;
fn aux_test_helper() -> (Index, Field) {
let mut schema_builder = SchemaBuilder::default();
@@ -110,6 +110,7 @@ mod tests {
let query = query_parser.parse_query("+a b").unwrap();
let weight = query.weight(&*searcher, false).unwrap();
let scorer = weight.scorer(searcher.segment_reader(0u32)).unwrap();
println!("{:?}", scorer.type_name());
assert!(Downcast::<TermScorer>::is_type(&*scorer));
}
}
@@ -123,7 +124,7 @@ mod tests {
Term::from_field_text(text_field, text),
IndexRecordOption::Basic,
);
let query: Box<Query> = box term_query;
let query: Box<Query> = Box::new(term_query);
query
};
@@ -170,7 +171,6 @@ mod tests {
}
}
#[test]
pub fn test_intersection_score() {
let (index, text_field) = aux_test_helper();
@@ -180,7 +180,7 @@ mod tests {
Term::from_field_text(text_field, text),
IndexRecordOption::Basic,
);
let query: Box<Query> = box term_query;
let query: Box<Query> = Box::new(term_query);
query
};
@@ -192,7 +192,10 @@ mod tests {
};
{
let boolean_query = BooleanQuery::from(vec![(Occur::Must, make_term_query("a")), (Occur::Must, make_term_query("b"))]);
let boolean_query = BooleanQuery::from(vec![
(Occur::Must, make_term_query("a")),
(Occur::Must, make_term_query("b")),
]);
assert_eq!(score_docs(&boolean_query), vec![0.977973, 0.84699446]);
}
}

View File

@@ -1,7 +1,7 @@
use query::Scorer;
use docset::{DocSet, SkipResult};
use Score;
use query::Scorer;
use DocId;
use Score;
#[derive(Clone, Copy, Debug)]
enum State {
@@ -129,10 +129,10 @@ where
#[cfg(test)]
mod tests {
use tests::sample_with_seed;
use postings::tests::test_skip_against_unoptimized;
use super::*;
use postings::tests::test_skip_against_unoptimized;
use query::VecDocSet;
use tests::sample_with_seed;
#[test]
fn test_exclude() {
@@ -151,10 +151,10 @@ mod tests {
fn test_exclude_skip() {
test_skip_against_unoptimized(
|| {
box Exclude::new(
Box::new(Exclude::new(
VecDocSet::from(vec![1, 2, 5, 8, 10, 15, 24]),
VecDocSet::from(vec![1, 2, 3, 10, 16, 24]),
)
))
},
vec![1, 2, 5, 8, 10, 15, 24],
);
@@ -167,10 +167,10 @@ mod tests {
let sample_skip = sample_with_seed(10_000, 0.005, 3);
test_skip_against_unoptimized(
|| {
box Exclude::new(
Box::new(Exclude::new(
VecDocSet::from(sample_include.clone()),
VecDocSet::from(sample_exclude.clone()),
)
))
},
sample_skip,
);

View File

@@ -1,11 +1,11 @@
use docset::{DocSet, SkipResult};
use query::Scorer;
use query::EmptyScorer;
use DocId;
use downcast::Downcast;
use std::borrow::Borrow;
use Score;
use query::term_query::TermScorer;
use query::EmptyScorer;
use query::Scorer;
use std::borrow::Borrow;
use DocId;
use Score;
/// Returns the intersection scorer.
///
@@ -22,7 +22,7 @@ pub fn intersect_scorers(mut scorers: Vec<Box<Scorer>>) -> Box<Scorer> {
let second_rarest_opt = scorers.pop();
scorers.reverse();
match (rarest_opt, second_rarest_opt) {
(None, None) => box EmptyScorer,
(None, None) => Box::new(EmptyScorer),
(Some(single_docset), None) => single_docset,
(Some(left), Some(right)) => {
{
@@ -32,31 +32,33 @@ pub fn intersect_scorers(mut scorers: Vec<Box<Scorer>>) -> Box<Scorer> {
}) {
let left = *Downcast::<TermScorer>::downcast(left).unwrap();
let right = *Downcast::<TermScorer>::downcast(right).unwrap();
return box Intersection {
return Box::new(Intersection {
left,
right,
others: scorers,
num_docsets
}
num_docsets,
});
}
}
return box Intersection {
return Box::new(Intersection {
left,
right,
others: scorers,
num_docsets
}
num_docsets,
});
}
_ => {
unreachable!();
}
_ => { unreachable!(); }
}
}
/// Creates a `DocSet` that iterates through the intersection of two `DocSet`s.
pub struct Intersection<TDocSet: DocSet, TOtherDocSet: DocSet=Box<Scorer>> {
pub struct Intersection<TDocSet: DocSet, TOtherDocSet: DocSet = Box<Scorer>> {
left: TDocSet,
right: TDocSet,
others: Vec<TOtherDocSet>,
num_docsets: usize
num_docsets: usize,
}
impl<TDocSet: DocSet> Intersection<TDocSet, TDocSet> {
@@ -71,18 +73,17 @@ impl<TDocSet: DocSet> Intersection<TDocSet, TDocSet> {
left,
right,
others: docsets,
num_docsets
num_docsets,
}
}
}
impl<TDocSet: DocSet> Intersection<TDocSet, TDocSet> {
pub(crate) fn docset_mut_specialized(&mut self, ord: usize) -> &mut TDocSet {
match ord {
0 => &mut self.left,
1 => &mut self.right,
n => &mut self.others[n - 2]
n => &mut self.others[n - 2],
}
}
}
@@ -92,7 +93,7 @@ impl<TDocSet: DocSet, TOtherDocSet: DocSet> Intersection<TDocSet, TOtherDocSet>
match ord {
0 => &mut self.left,
1 => &mut self.right,
n => &mut self.others[n - 2]
n => &mut self.others[n - 2],
}
}
}
@@ -114,23 +115,30 @@ impl<TDocSet: DocSet, TOtherDocSet: DocSet> DocSet for Intersection<TDocSet, TOt
// of the two rarest `DocSet` in the intersection.
loop {
match right.skip_next(candidate) {
SkipResult::Reached => { break; }
SkipResult::Reached => {
break;
}
SkipResult::OverStep => {
candidate = right.doc();
other_candidate_ord = usize::max_value();
}
SkipResult::End => { return false; }
SkipResult::End => {
return false;
}
}
match left.skip_next(candidate) {
SkipResult::Reached => { break; }
SkipResult::Reached => {
break;
}
SkipResult::OverStep => {
candidate = left.doc();
other_candidate_ord = usize::max_value();
}
SkipResult::End => { return false; }
SkipResult::End => {
return false;
}
}
}
// test the remaining scorers;
for (ord, docset) in self.others.iter_mut().enumerate() {
@@ -147,16 +155,22 @@ impl<TDocSet: DocSet, TOtherDocSet: DocSet> DocSet for Intersection<TDocSet, TOt
// let's update our candidate.
candidate = docset.doc();
match left.skip_next(candidate) {
SkipResult::Reached => { other_candidate_ord = ord; }
SkipResult::Reached => {
other_candidate_ord = ord;
}
SkipResult::OverStep => {
candidate = left.doc();
other_candidate_ord = usize::max_value();
}
SkipResult::End => { return false; }
SkipResult::End => {
return false;
}
}
continue 'outer;
}
SkipResult::End => { return false; }
SkipResult::End => {
return false;
}
}
}
}
@@ -164,9 +178,7 @@ impl<TDocSet: DocSet, TOtherDocSet: DocSet> DocSet for Intersection<TDocSet, TOt
}
}
fn skip_next(&mut self, target: DocId) -> SkipResult {
// We optimize skipping by skipping every single member
// of the intersection to target.
let mut current_target: DocId = target;
@@ -211,18 +223,22 @@ impl<TDocSet: DocSet, TOtherDocSet: DocSet> DocSet for Intersection<TDocSet, TOt
}
impl<TScorer, TOtherScorer> Scorer for Intersection<TScorer, TOtherScorer>
where TScorer: Scorer, TOtherScorer: Scorer {
where
TScorer: Scorer,
TOtherScorer: Scorer,
{
fn score(&mut self) -> Score {
self.left.score() + self.right.score() + self.others.iter_mut().map(Scorer::score).sum::<Score>()
self.left.score() + self.right.score()
+ self.others.iter_mut().map(Scorer::score).sum::<Score>()
}
}
#[cfg(test)]
mod tests {
use docset::{DocSet, SkipResult};
use super::Intersection;
use query::VecDocSet;
use docset::{DocSet, SkipResult};
use postings::tests::test_skip_against_unoptimized;
use query::VecDocSet;
#[test]
fn test_intersection() {
@@ -271,7 +287,7 @@ mod tests {
|| {
let left = VecDocSet::from(vec![4]);
let right = VecDocSet::from(vec![2, 5]);
box Intersection::new(vec![left, right])
Box::new(Intersection::new(vec![left, right]))
},
vec![0, 2, 4, 5, 6],
);
@@ -281,19 +297,19 @@ mod tests {
let mut right = VecDocSet::from(vec![2, 5, 10]);
left.advance();
right.advance();
box Intersection::new(vec![left, right])
Box::new(Intersection::new(vec![left, right]))
},
vec![0, 1, 2, 3, 4, 5, 6, 7, 10, 11],
);
test_skip_against_unoptimized(
|| {
box Intersection::new(vec![
Box::new(Intersection::new(vec![
VecDocSet::from(vec![1, 4, 5, 6]),
VecDocSet::from(vec![1, 2, 5, 6]),
VecDocSet::from(vec![1, 4, 5, 6]),
VecDocSet::from(vec![1, 5, 6]),
VecDocSet::from(vec![2, 4, 5, 7, 8]),
])
]))
},
vec![0, 1, 2, 3, 4, 5, 6, 7, 10, 11],
);
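As a reference point for the skip logic above, here is a miniature intersection over plain sorted slices (a standalone sketch, not the `DocSet` API): each side leapfrogs toward the other's current candidate until they agree.

fn intersect_two(a: &[u32], b: &[u32]) -> Vec<u32> {
    let (mut i, mut j) = (0, 0);
    let mut out = Vec::new();
    while i < a.len() && j < b.len() {
        if a[i] == b[j] {
            out.push(a[i]);
            i += 1;
            j += 1;
        } else if a[i] < b[j] {
            i += 1; // advance the lagging side toward the candidate
        } else {
            j += 1;
        }
    }
    out
}

fn main() {
    assert_eq!(intersect_two(&[1, 4, 5, 6], &[1, 2, 5, 6]), vec![1, 5, 6]);
}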

Some files were not shown because too many files have changed in this diff.